未验证 提交 7e671c07 编写于 作者: W wuhuanzhou 提交者: GitHub

optimize unity build (#30195)

* optimize unity build, test=develop

* fix code style error, test=develop

* fix code style error and test /MP settings, test=develop
上级 e5b0d9e1
...@@ -84,6 +84,8 @@ if(WIN32) ...@@ -84,6 +84,8 @@ if(WIN32)
endforeach(flag_var) endforeach(flag_var)
endif() endif()
# NOTE(Avin0323): A lower parallel job count results in faster compilation.
math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3")
# windows build turn off warnings, use parallel compiling. # windows build turn off warnings, use parallel compiling.
foreach(flag_var foreach(flag_var
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
...@@ -91,13 +93,7 @@ if(WIN32) ...@@ -91,13 +93,7 @@ if(WIN32)
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}") string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
# NOTE(Avin0323): Less parallel count result in faster compilation with set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}")
# Unity Build on GPU.
if(WITH_UNITY_BUILD AND WITH_GPU)
set(${flag_var} "${${flag_var}} /MP8")
else()
set(${flag_var} "${${flag_var}} /MP")
endif()
endforeach(flag_var) endforeach(flag_var)
foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS) foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
set(${flag_var} "${${flag_var}} /w") set(${flag_var} "${${flag_var}} /w")
......
...@@ -23,10 +23,6 @@ namespace operators { ...@@ -23,10 +23,6 @@ namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename DeviceContext> template <typename DeviceContext>
void GetAccumulators(const framework::ExecutionContext& ctx, void GetAccumulators(const framework::ExecutionContext& ctx,
int64_t* num_updates, int64_t* num_accumulates, int64_t* num_updates, int64_t* num_accumulates,
...@@ -67,18 +63,18 @@ class AverageAccumulatesKernel : public framework::OpKernel<T> { ...@@ -67,18 +63,18 @@ class AverageAccumulatesKernel : public framework::OpKernel<T> {
auto* in_sum_1 = ctx.Input<Tensor>("in_sum_1"); auto* in_sum_1 = ctx.Input<Tensor>("in_sum_1");
auto* in_sum_2 = ctx.Input<Tensor>("in_sum_2"); auto* in_sum_2 = ctx.Input<Tensor>("in_sum_2");
auto* in_sum_3 = ctx.Input<Tensor>("in_sum_3"); auto* in_sum_3 = ctx.Input<Tensor>("in_sum_3");
auto param_tensor = EigenVector<T>::Flatten(*param); auto param_tensor = framework::EigenVector<T>::Flatten(*param);
auto in_sum_1_tensor = EigenVector<T>::Flatten(*in_sum_1); auto in_sum_1_tensor = framework::EigenVector<T>::Flatten(*in_sum_1);
auto in_sum_2_tensor = EigenVector<T>::Flatten(*in_sum_2); auto in_sum_2_tensor = framework::EigenVector<T>::Flatten(*in_sum_2);
auto in_sum_3_tensor = EigenVector<T>::Flatten(*in_sum_3); auto in_sum_3_tensor = framework::EigenVector<T>::Flatten(*in_sum_3);
// Get outputs // Get outputs
auto* out_sum_1 = ctx.Output<Tensor>("out_sum_1"); auto* out_sum_1 = ctx.Output<Tensor>("out_sum_1");
auto* out_sum_2 = ctx.Output<Tensor>("out_sum_2"); auto* out_sum_2 = ctx.Output<Tensor>("out_sum_2");
auto* out_sum_3 = ctx.Output<Tensor>("out_sum_3"); auto* out_sum_3 = ctx.Output<Tensor>("out_sum_3");
auto out_sum_1_tensor = EigenVector<T>::Flatten(*out_sum_1); auto out_sum_1_tensor = framework::EigenVector<T>::Flatten(*out_sum_1);
auto out_sum_2_tensor = EigenVector<T>::Flatten(*out_sum_2); auto out_sum_2_tensor = framework::EigenVector<T>::Flatten(*out_sum_2);
auto out_sum_3_tensor = EigenVector<T>::Flatten(*out_sum_3); auto out_sum_3_tensor = framework::EigenVector<T>::Flatten(*out_sum_3);
// Compute // Compute
auto& place = *ctx.template device_context<DeviceContext>().eigen_device(); auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
......
...@@ -25,7 +25,6 @@ namespace paddle { ...@@ -25,7 +25,6 @@ namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
using DataLayout = platform::DataLayout;
template <typename T, int D> template <typename T, int D>
static void DataTranspose(const framework::ExecutionContext& ctx, static void DataTranspose(const framework::ExecutionContext& ctx,
...@@ -67,14 +66,15 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> { ...@@ -67,14 +66,15 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
int groups = ctx.Attr<int>("groups"); int groups = ctx.Attr<int>("groups");
const T* filter_data = filter->data<T>(); const T* filter_data = filter->data<T>();
const std::string data_layout_str = ctx.Attr<std::string>("data_format"); const std::string data_layout_str = ctx.Attr<std::string>("data_format");
const paddle::operators::DataLayout data_layout = const paddle::platform::DataLayout data_layout =
(data_layout_str != "NHWC" ? DataLayout::kNCHW : DataLayout::kNHWC); (data_layout_str != "NHWC" ? platform::DataLayout::kNCHW
: platform::DataLayout::kNHWC);
// if channel_last, transpose to channel_first // if channel_last, transpose to channel_first
Tensor input_transpose; Tensor input_transpose;
std::vector<int> input_vec = framework::vectorize<int>(input->dims()); std::vector<int> input_vec = framework::vectorize<int>(input->dims());
std::vector<int> output_vec = framework::vectorize<int>(output->dims()); std::vector<int> output_vec = framework::vectorize<int>(output->dims());
if (data_layout == DataLayout::kNHWC) { if (data_layout == platform::DataLayout::kNHWC) {
if (strides.size() == 2U) { if (strides.size() == 2U) {
std::vector<int> axis = {0, 3, 1, 2}; std::vector<int> axis = {0, 3, 1, 2};
for (size_t i = 0; i < axis.size(); ++i) { for (size_t i = 0; i < axis.size(); ++i) {
...@@ -195,7 +195,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> { ...@@ -195,7 +195,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
} }
T* transformed_output_data = transformed_output.data<T>(); T* transformed_output_data = transformed_output.data<T>();
DataLayout layout; platform::DataLayout layout;
int iwo_groups = groups; int iwo_groups = groups;
int c_groups = 1; int c_groups = 1;
...@@ -206,9 +206,9 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> { ...@@ -206,9 +206,9 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
#endif #endif
if (strides.size() == 2U) { if (strides.size() == 2U) {
layout = DataLayout::kNCHW; layout = platform::DataLayout::kNCHW;
} else { } else {
layout = DataLayout::kNCDHW; layout = platform::DataLayout::kNCDHW;
} }
size_t workspace_size = 0; size_t workspace_size = 0;
...@@ -269,7 +269,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> { ...@@ -269,7 +269,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
ctx, &transformed_output, output, starts, ends, axes); ctx, &transformed_output, output, starts, ends, axes);
} }
if (data_layout == DataLayout::kNHWC) { if (data_layout == platform::DataLayout::kNHWC) {
Tensor output_transpose; Tensor output_transpose;
Tensor output_nchw; Tensor output_nchw;
output_nchw.ShareDataWith(*output); output_nchw.ShareDataWith(*output);
...@@ -309,8 +309,9 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> { ...@@ -309,8 +309,9 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm"); std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
int user_workspace_size = ctx.Attr<int>("workspace_size_MB"); int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
const std::string data_layout_str = ctx.Attr<std::string>("data_format"); const std::string data_layout_str = ctx.Attr<std::string>("data_format");
const paddle::operators::DataLayout data_layout = const paddle::platform::DataLayout data_layout =
(data_layout_str != "NHWC" ? DataLayout::kNCHW : DataLayout::kNHWC); (data_layout_str != "NHWC" ? platform::DataLayout::kNCHW
: platform::DataLayout::kNHWC);
// if channel_last, transpose to channel_first // if channel_last, transpose to channel_first
Tensor input_transpose; Tensor input_transpose;
...@@ -318,7 +319,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> { ...@@ -318,7 +319,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
std::vector<int> input_vec = framework::vectorize<int>(input->dims()); std::vector<int> input_vec = framework::vectorize<int>(input->dims());
std::vector<int> output_vec = std::vector<int> output_vec =
framework::vectorize<int>(output_grad->dims()); framework::vectorize<int>(output_grad->dims());
if (data_layout == DataLayout::kNHWC) { if (data_layout == platform::DataLayout::kNHWC) {
if (strides.size() == 2U) { if (strides.size() == 2U) {
std::vector<int> axis = {0, 3, 1, 2}; std::vector<int> axis = {0, 3, 1, 2};
for (size_t i = 0; i < axis.size(); ++i) { for (size_t i = 0; i < axis.size(); ++i) {
...@@ -416,12 +417,12 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> { ...@@ -416,12 +417,12 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
output_vec = framework::vectorize<int>(transformed_output_grad.dims()); output_vec = framework::vectorize<int>(transformed_output_grad.dims());
// ------------------- cudnn descriptors --------------------- // ------------------- cudnn descriptors ---------------------
DataLayout layout; platform::DataLayout layout;
if (strides.size() == 2U) { if (strides.size() == 2U) {
layout = DataLayout::kNCHW; layout = platform::DataLayout::kNCHW;
} else { } else {
layout = DataLayout::kNCDHW; layout = platform::DataLayout::kNCDHW;
} }
int iwo_groups = groups; int iwo_groups = groups;
...@@ -515,7 +516,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> { ...@@ -515,7 +516,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
workspace_handle.RunFunc(cudnn_func, workspace_size); workspace_handle.RunFunc(cudnn_func, workspace_size);
} }
if (data_layout == DataLayout::kNHWC) { if (data_layout == platform::DataLayout::kNHWC) {
Tensor input_grad_transpose; Tensor input_grad_transpose;
Tensor input_grad_nchw; Tensor input_grad_nchw;
input_grad_nchw.ShareDataWith(*input_grad); input_grad_nchw.ShareDataWith(*input_grad);
...@@ -849,7 +850,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> { ...@@ -849,7 +850,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
cudnnConvolutionBwdFilterAlgo_t filter_algo = cudnnConvolutionBwdFilterAlgo_t filter_algo =
static_cast<cudnnConvolutionBwdFilterAlgo_t>(0); static_cast<cudnnConvolutionBwdFilterAlgo_t>(0);
auto layout = GetCudnnTensorFormat(DataLayout::kNCHW); auto layout = GetCudnnTensorFormat(platform::DataLayout::kNCHW);
// ddo = conv(ddI, W) + conv(I, ddW) // ddo = conv(ddI, W) + conv(I, ddW)
size_t workspace_size = 0; size_t workspace_size = 0;
...@@ -916,12 +917,12 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> { ...@@ -916,12 +917,12 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel<T> {
} }
int i_n, i_c, i_d, i_h, i_w; int i_n, i_c, i_d, i_h, i_w;
GetNCDHW(transformed_X.dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h, GetNCDHW(transformed_X.dims(), platform::DataLayout::kNCHW, &i_n, &i_c,
&i_w); &i_d, &i_h, &i_w);
int o_n, o_c, o_d, o_h, o_w; int o_n, o_c, o_d, o_h, o_w;
GetNCDHW(transformed_dO.dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d, &o_h, GetNCDHW(transformed_dO.dims(), platform::DataLayout::kNCHW, &o_n, &o_c,
&o_w); &o_d, &o_h, &o_w);
int group_offset_in = int group_offset_in =
transformed_X.numel() / transformed_X.dims()[0] / groups; transformed_X.numel() / transformed_X.dims()[0] / groups;
......
...@@ -23,7 +23,6 @@ namespace operators { ...@@ -23,7 +23,6 @@ namespace operators {
#define FULL_MASK 0xffffffff #define FULL_MASK 0xffffffff
using framework::Tensor; using framework::Tensor;
using DataLayout = framework::DataLayout;
template <typename T> template <typename T>
__forceinline__ __device__ T warpReduceSum(T val) { __forceinline__ __device__ T warpReduceSum(T val) {
......
...@@ -26,10 +26,6 @@ using Tensor = framework::Tensor; ...@@ -26,10 +26,6 @@ using Tensor = framework::Tensor;
using complex64 = platform::complex64; using complex64 = platform::complex64;
using complex128 = platform::complex128; using complex128 = platform::complex128;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T, typename R> template <typename T, typename R>
struct P { struct P {
void operator()(T a, R b); void operator()(T a, R b);
...@@ -85,11 +81,11 @@ struct DotGradFunction<DeviceContext, T, math::EnableComplex<T>> { ...@@ -85,11 +81,11 @@ struct DotGradFunction<DeviceContext, T, math::EnableComplex<T>> {
dy.device(dev) = dy * dout.broadcast(size); dy.device(dev) = dy * dout.broadcast(size);
} }
} else { } else {
auto dout = EigenMatrix<T>::From(*tensor_dout); auto dout = framework::EigenMatrix<T>::From(*tensor_dout);
if (tensor_dx) { if (tensor_dx) {
tensor_dx->mutable_data<T>(ctx.GetPlace()); tensor_dx->mutable_data<T>(ctx.GetPlace());
auto y = EigenMatrix<T>::From(*tensor_y); auto y = framework::EigenMatrix<T>::From(*tensor_y);
auto& dev_raw = ctx.template device_context<DeviceContext>(); auto& dev_raw = ctx.template device_context<DeviceContext>();
auto& dev = *dev_raw.eigen_device(); auto& dev = *dev_raw.eigen_device();
Eigen::DSizes<int, 2> size(1, tensor_dx->dims()[1]); Eigen::DSizes<int, 2> size(1, tensor_dx->dims()[1]);
...@@ -99,14 +95,14 @@ struct DotGradFunction<DeviceContext, T, math::EnableComplex<T>> { ...@@ -99,14 +95,14 @@ struct DotGradFunction<DeviceContext, T, math::EnableComplex<T>> {
math::ConjFunctor<T> functor(tensor_y->data<T>(), tensor_y->numel(), math::ConjFunctor<T> functor(tensor_y->data<T>(), tensor_y->numel(),
tensor_dx->data<T>()); tensor_dx->data<T>());
for_range(functor); for_range(functor);
auto dx = EigenMatrix<T>::From(*tensor_dx); auto dx = framework::EigenMatrix<T>::From(*tensor_dx);
dx.device(dev) = dx * dout.broadcast(size); dx.device(dev) = dx * dout.broadcast(size);
} }
if (tensor_dy) { if (tensor_dy) {
tensor_dy->mutable_data<T>(ctx.GetPlace()); tensor_dy->mutable_data<T>(ctx.GetPlace());
auto x = EigenMatrix<T>::From(*tensor_x); auto x = framework::EigenMatrix<T>::From(*tensor_x);
auto& dev_raw = ctx.template device_context<DeviceContext>(); auto& dev_raw = ctx.template device_context<DeviceContext>();
auto& dev = *dev_raw.eigen_device(); auto& dev = *dev_raw.eigen_device();
Eigen::DSizes<int, 2> size(1, tensor_dy->dims()[1]); Eigen::DSizes<int, 2> size(1, tensor_dy->dims()[1]);
...@@ -117,7 +113,7 @@ struct DotGradFunction<DeviceContext, T, math::EnableComplex<T>> { ...@@ -117,7 +113,7 @@ struct DotGradFunction<DeviceContext, T, math::EnableComplex<T>> {
tensor_dy->data<T>()); tensor_dy->data<T>());
for_range(functor); for_range(functor);
auto dy = EigenMatrix<T>::From(*tensor_dy); auto dy = framework::EigenMatrix<T>::From(*tensor_dy);
dy.device(dev) = dy * dout.broadcast(size); dy.device(dev) = dy * dout.broadcast(size);
} }
...@@ -186,12 +182,12 @@ struct DotGradFunction<DeviceContext, T, math::DisableComplex<T>> { ...@@ -186,12 +182,12 @@ struct DotGradFunction<DeviceContext, T, math::DisableComplex<T>> {
dy.device(dev) = x * dout.broadcast(size); dy.device(dev) = x * dout.broadcast(size);
} }
} else { } else {
auto dout = EigenMatrix<T>::From(*tensor_dout); auto dout = framework::EigenMatrix<T>::From(*tensor_dout);
if (tensor_dx) { if (tensor_dx) {
tensor_dx->mutable_data<T>(ctx.GetPlace()); tensor_dx->mutable_data<T>(ctx.GetPlace());
auto y = EigenMatrix<T>::From(*tensor_y); auto y = framework::EigenMatrix<T>::From(*tensor_y);
auto dx = EigenMatrix<T>::From(*tensor_dx); auto dx = framework::EigenMatrix<T>::From(*tensor_dx);
auto& dev = auto& dev =
*ctx.template device_context<DeviceContext>().eigen_device(); *ctx.template device_context<DeviceContext>().eigen_device();
Eigen::DSizes<int, 2> size(1, tensor_dx->dims()[1]); Eigen::DSizes<int, 2> size(1, tensor_dx->dims()[1]);
...@@ -200,8 +196,8 @@ struct DotGradFunction<DeviceContext, T, math::DisableComplex<T>> { ...@@ -200,8 +196,8 @@ struct DotGradFunction<DeviceContext, T, math::DisableComplex<T>> {
if (tensor_dy) { if (tensor_dy) {
tensor_dy->mutable_data<T>(ctx.GetPlace()); tensor_dy->mutable_data<T>(ctx.GetPlace());
auto x = EigenMatrix<T>::From(*tensor_x); auto x = framework::EigenMatrix<T>::From(*tensor_x);
auto dy = EigenMatrix<T>::From(*tensor_dy); auto dy = framework::EigenMatrix<T>::From(*tensor_dy);
auto& dev = auto& dev =
*ctx.template device_context<DeviceContext>().eigen_device(); *ctx.template device_context<DeviceContext>().eigen_device();
Eigen::DSizes<int, 2> size(1, tensor_dy->dims()[1]); Eigen::DSizes<int, 2> size(1, tensor_dy->dims()[1]);
...@@ -262,9 +258,9 @@ class DotKernel : public framework::OpKernel<T> { ...@@ -262,9 +258,9 @@ class DotKernel : public framework::OpKernel<T> {
auto& dev = *ctx.template device_context<DeviceContext>().eigen_device(); auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
out.device(dev) = (x * y).sum(); out.device(dev) = (x * y).sum();
} else { } else {
auto out = EigenMatrix<T>::From(*tensor_out); auto out = framework::EigenMatrix<T>::From(*tensor_out);
auto x = EigenMatrix<T>::From(*tensor_x); auto x = framework::EigenMatrix<T>::From(*tensor_x);
auto y = EigenMatrix<T>::From(*tensor_y); auto y = framework::EigenMatrix<T>::From(*tensor_y);
auto& dev = *ctx.template device_context<DeviceContext>().eigen_device(); auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
out.device(dev) = (x * y).sum(Eigen::DSizes<int, 1>(1)); out.device(dev) = (x * y).sum(Eigen::DSizes<int, 1>(1));
......
...@@ -50,16 +50,6 @@ ...@@ -50,16 +50,6 @@
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename T, size_t D, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class MeshgridKernel : public framework::OpKernel<T> { class MeshgridKernel : public framework::OpKernel<T> {
public: public:
...@@ -120,9 +110,9 @@ class MeshgridKernel : public framework::OpKernel<T> { ...@@ -120,9 +110,9 @@ class MeshgridKernel : public framework::OpKernel<T> {
bcast_dims[i] = 1; bcast_dims[i] = 1;
outs[i]->Resize(out_dims); outs[i]->Resize(out_dims);
auto x = EigenTensor<T, Rank>::From(reshape_ins_tensor); auto x = framework::EigenTensor<T, Rank>::From(reshape_ins_tensor);
outs[i]->mutable_data<T>(context.GetPlace()); outs[i]->mutable_data<T>(context.GetPlace());
auto y = EigenTensor<T, Rank>::From(*outs[i]); auto y = framework::EigenTensor<T, Rank>::From(*outs[i]);
auto& place = auto& place =
*context.template device_context<DeviceContext>().eigen_device(); *context.template device_context<DeviceContext>().eigen_device();
y.device(place) = x.broadcast(bcast_dims); y.device(place) = x.broadcast(bcast_dims);
...@@ -159,8 +149,8 @@ class MeshgridGradKernel : public framework::OpKernel<T> { ...@@ -159,8 +149,8 @@ class MeshgridGradKernel : public framework::OpKernel<T> {
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
outs[i]->mutable_data<T>(context.GetPlace()); outs[i]->mutable_data<T>(context.GetPlace());
auto out_grad_tmp = EigenVector<T>::Flatten(*out_grad[i]); auto out_grad_tmp = framework::EigenVector<T>::Flatten(*out_grad[i]);
auto in_grad = EigenVector<T>::Flatten(*outs[i]); auto in_grad = framework::EigenVector<T>::Flatten(*outs[i]);
std::vector<int> reduce_dims_vec; std::vector<int> reduce_dims_vec;
std::vector<int> reshape_dims_vec; std::vector<int> reshape_dims_vec;
......
...@@ -37,7 +37,7 @@ class RankLossKernel : public framework::OpKernel<T> { ...@@ -37,7 +37,7 @@ class RankLossKernel : public framework::OpKernel<T> {
auto& dev = *ctx.template device_context<DeviceContext>().eigen_device(); auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
out.device(dev) = out.device(dev) =
(1. + (left - right).exp()).log() - label * (left - right); (1.0f + (left - right).exp()).log() - label * (left - right);
} }
}; };
...@@ -65,14 +65,15 @@ class RankLossGradKernel : public framework::OpKernel<T> { ...@@ -65,14 +65,15 @@ class RankLossGradKernel : public framework::OpKernel<T> {
if (d_left_t) { if (d_left_t) {
d_left_t->mutable_data<T>(ctx.GetPlace()); d_left_t->mutable_data<T>(ctx.GetPlace());
auto d_left = framework::EigenVector<T>::Flatten(*d_left_t); auto d_left = framework::EigenVector<T>::Flatten(*d_left_t);
d_left.device(dev) = d_out * (1. / (1. + (right - left).exp()) - label); d_left.device(dev) =
d_out * (1.0f / (1.0f + (right - left).exp()) - label);
} }
// compute d_right // compute d_right
if (d_right_t) { if (d_right_t) {
d_right_t->mutable_data<T>(ctx.GetPlace()); d_right_t->mutable_data<T>(ctx.GetPlace());
auto d_right = framework::EigenVector<T>::Flatten(*d_right_t); auto d_right = framework::EigenVector<T>::Flatten(*d_right_t);
d_right.device(dev) = d_right.device(dev) =
-d_out * (1.0 / (1. + (right - left).exp()) - label); -d_out * (1.0f / (1.0f + (right - left).exp()) - label);
} }
} }
}; };
......
...@@ -23,9 +23,6 @@ namespace paddle { ...@@ -23,9 +23,6 @@ namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T> template <typename T>
class SoftmaxWithCrossEntropyKernel : public framework::OpKernel<T> { class SoftmaxWithCrossEntropyKernel : public framework::OpKernel<T> {
...@@ -95,12 +92,12 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> { ...@@ -95,12 +92,12 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> {
labels_2d.ShareDataWith(*labels).Resize({n, labels->numel() / n}); labels_2d.ShareDataWith(*labels).Resize({n, labels->numel() / n});
out_grad_2d.ShareDataWith(*out_grad).Resize({n, d / axis_dim}); out_grad_2d.ShareDataWith(*out_grad).Resize({n, d / axis_dim});
auto out_grad_mat = EigenMatrix<T>::From(out_grad_2d); auto out_grad_mat = framework::EigenMatrix<T>::From(out_grad_2d);
auto logit_grad_mat = EigenMatrix<T>::From(logit_grad_2d); auto logit_grad_mat = framework::EigenMatrix<T>::From(logit_grad_2d);
auto& place = *context.template device_context<platform::CPUDeviceContext>() auto& place = *context.template device_context<platform::CPUDeviceContext>()
.eigen_device(); .eigen_device();
if (soft_label) { if (soft_label) {
auto lbl_mat = EigenMatrix<T>::From(labels_2d); auto lbl_mat = framework::EigenMatrix<T>::From(labels_2d);
logit_grad_mat.device(place) = logit_grad_mat.device(place) =
out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, axis_dim)) * out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, axis_dim)) *
(logit_grad_mat - lbl_mat); (logit_grad_mat - lbl_mat);
......
...@@ -20,12 +20,6 @@ namespace paddle { ...@@ -20,12 +20,6 @@ namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class SquaredL2DistanceKernel : public framework::OpKernel<T> { class SquaredL2DistanceKernel : public framework::OpKernel<T> {
...@@ -41,15 +35,15 @@ class SquaredL2DistanceKernel : public framework::OpKernel<T> { ...@@ -41,15 +35,15 @@ class SquaredL2DistanceKernel : public framework::OpKernel<T> {
int cols = in0->numel() / in0_dims[0]; int cols = in0->numel() / in0_dims[0];
// reduce dimensions except the first // reduce dimensions except the first
auto x = auto x = framework::EigenMatrix<T>::From(
EigenMatrix<T>::From(*in0, framework::make_ddim({in0_dims[0], cols})); *in0, framework::make_ddim({in0_dims[0], cols}));
auto y = auto y = framework::EigenMatrix<T>::From(
EigenMatrix<T>::From(*in1, framework::make_ddim({in1_dims[0], cols})); *in1, framework::make_ddim({in1_dims[0], cols}));
out0->mutable_data<T>(context.GetPlace()); out0->mutable_data<T>(context.GetPlace());
out1->mutable_data<T>(context.GetPlace()); out1->mutable_data<T>(context.GetPlace());
auto sub_result = EigenMatrix<T>::From(*out0); auto sub_result = framework::EigenMatrix<T>::From(*out0);
auto z = EigenVector<T>::Flatten(*out1); auto z = framework::EigenVector<T>::Flatten(*out1);
auto& place = auto& place =
*context.template device_context<DeviceContext>().eigen_device(); *context.template device_context<DeviceContext>().eigen_device();
...@@ -88,8 +82,8 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel<T> { ...@@ -88,8 +82,8 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel<T> {
"in scope for operator 'squared_l2_distance_grad'.", "in scope for operator 'squared_l2_distance_grad'.",
framework::GradVarName("Y"))); framework::GradVarName("Y")));
auto sub_result = EigenMatrix<T>::From(*in0); auto sub_result = framework::EigenMatrix<T>::From(*in0);
auto out_grad = EigenMatrix<T>::From(*in1); auto out_grad = framework::EigenMatrix<T>::From(*in1);
auto x_dims = x_g->dims(); auto x_dims = x_g->dims();
auto y_dims = y_g->dims(); auto y_dims = y_g->dims();
...@@ -106,8 +100,8 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel<T> { ...@@ -106,8 +100,8 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel<T> {
x_g->mutable_data<T>(context.GetPlace()); x_g->mutable_data<T>(context.GetPlace());
// eigen matrix // eigen matrix
auto x_grad = auto x_grad = framework::EigenMatrix<T>::From(
EigenMatrix<T>::From(*x_g, framework::make_ddim({x_dims[0], cols})); *x_g, framework::make_ddim({x_dims[0], cols}));
// dimensions are same with subResult // dimensions are same with subResult
x_grad.device(eigen_place) = grad_mat; x_grad.device(eigen_place) = grad_mat;
...@@ -121,12 +115,12 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel<T> { ...@@ -121,12 +115,12 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel<T> {
sub_result.dimensions()[0], y_dims[0])); sub_result.dimensions()[0], y_dims[0]));
if (sub_result.dimensions()[0] == y_dims[0]) { if (sub_result.dimensions()[0] == y_dims[0]) {
auto y_grad = auto y_grad = framework::EigenMatrix<T>::From(
EigenMatrix<T>::From(*y_g, framework::make_ddim({y_dims[0], cols})); *y_g, framework::make_ddim({y_dims[0], cols}));
y_grad.device(eigen_place) = -1 * grad_mat; y_grad.device(eigen_place) = -1 * grad_mat;
} else { } else {
auto col_sum_res = -1 * (grad_mat.sum(Eigen::array<int, 1>({{0}}))); auto col_sum_res = -1 * (grad_mat.sum(Eigen::array<int, 1>({{0}})));
auto y_grad = EigenVector<T>::Flatten(*y_g); auto y_grad = framework::EigenVector<T>::Flatten(*y_g);
y_grad.device(eigen_place) = col_sum_res; y_grad.device(eigen_place) = col_sum_res;
} }
} }
......
...@@ -307,32 +307,36 @@ register_unity_group(cc ...@@ -307,32 +307,36 @@ register_unity_group(cc
spp_op.cu.cc spp_op.cu.cc
squeeze_op.cu.cc squeeze_op.cu.cc
unbind_op.cu.cc unbind_op.cu.cc
unique_op.cu
unpool_op.cu.cc unpool_op.cu.cc
unsqueeze_op.cu.cc) unsqueeze_op.cu.cc)
register_unity_group(cu register_unity_group(cu
addmm_op.cu addmm_op.cu
affine_channel_op.cu affine_channel_op.cu
allclose_op.cu allclose_op.cu
argsort_op.cu
assign_value_op.cu assign_value_op.cu
bce_loss_op.cu bce_loss_op.cu
bernoulli_op.cu bernoulli_op.cu
bilateral_slice_op.cu) bilateral_slice_op.cu
batch_norm_op.cu)
register_unity_group(cu register_unity_group(cu
bilinear_tensor_product_op.cu bilinear_tensor_product_op.cu
bmm_op.cu bmm_op.cu
cast_op.cu cast_op.cu
cholesky_op.cu cholesky_op.cu
clip_by_norm_op.cu clip_by_norm_op.cu
clip_op.cu) clip_op.cu
conv_cudnn_op.cu
affine_grid_op.cu)
register_unity_group(cu register_unity_group(cu
center_loss_op.cu center_loss_op.cu
conv_op.cu conv_op.cu
conv_transpose_cudnn_op.cu conv_transpose_cudnn_op.cu
conv_transpose_op.cu conv_transpose_op.cu
cos_sim_op.cu cos_sim_op.cu
crop_op.cu) crop_op.cu
average_accumulates_op.cu
conj_op.cu
correlation_op.cu)
register_unity_group(cu register_unity_group(cu
cross_entropy_op.cu cross_entropy_op.cu
cross_op.cu cross_op.cu
...@@ -349,7 +353,9 @@ register_unity_group(cu ...@@ -349,7 +353,9 @@ register_unity_group(cu
diag_op.cu diag_op.cu
diag_v2_op.cu diag_v2_op.cu
edit_distance_op.cu edit_distance_op.cu
erf_op.cu) erf_op.cu
meshgrid_op.cu
imag_op.cu)
register_unity_group(cu register_unity_group(cu
expand_v2_op.cu expand_v2_op.cu
fake_dequantize_op.cu fake_dequantize_op.cu
...@@ -377,10 +383,8 @@ register_unity_group(cu ...@@ -377,10 +383,8 @@ register_unity_group(cu
inplace_abn_op.cu inplace_abn_op.cu
interpolate_v2_op.cu interpolate_v2_op.cu
isfinite_op.cu isfinite_op.cu
kron_op.cu
l1_norm_op.cu l1_norm_op.cu
label_smooth_op.cu label_smooth_op.cu
layer_norm_op.cu
linspace_op.cu linspace_op.cu
load_combine_op.cu load_combine_op.cu
load_op.cu) load_op.cu)
...@@ -388,20 +392,30 @@ register_unity_group(cu ...@@ -388,20 +392,30 @@ register_unity_group(cu
lod_reset_op.cu lod_reset_op.cu
log_softmax_op.cu log_softmax_op.cu
lrn_op.cu lrn_op.cu
lstm_unit_op.cu) lstm_unit_op.cu
dot_op.cu
psroi_pool_op.cu
rank_loss_op.cu
real_op.cu)
register_unity_group(cu register_unity_group(cu
log_loss_op.cu log_loss_op.cu
lookup_table_v2_op.cu lookup_table_v2_op.cu
margin_rank_loss_op.cu margin_rank_loss_op.cu
masked_select_op.cu masked_select_op.cu
merge_selected_rows_op.cu) merge_selected_rows_op.cu
lstmp_op.cu
shuffle_channel_op.cu
softmax_cudnn_op.cu
squared_l2_distance_op.cu)
register_unity_group(cu register_unity_group(cu
conv_shift_op.cu conv_shift_op.cu
dequantize_log_op.cu dequantize_log_op.cu
dropout_op.cu dropout_op.cu
fake_quantize_op.cu fake_quantize_op.cu
gelu_op.cu gelu_op.cu
lookup_table_op.cu) lookup_table_op.cu
sigmoid_cross_entropy_with_logits_op.cu
softmax_with_cross_entropy_op.cu)
register_unity_group(cu register_unity_group(cu
mean_iou_op.cu mean_iou_op.cu
mean_op.cu mean_op.cu
...@@ -430,7 +444,10 @@ register_unity_group(cu ...@@ -430,7 +444,10 @@ register_unity_group(cu
random_crop_op.cu random_crop_op.cu
randperm_op.cu randperm_op.cu
range_op.cu range_op.cu
reverse_op.cu) reverse_op.cu
partial_concat_op.cu
kldiv_loss_op.cu
instance_norm_op.cu)
register_unity_group(cu register_unity_group(cu
roi_align_op.cu roi_align_op.cu
roll_op.cu roll_op.cu
...@@ -457,40 +474,42 @@ register_unity_group(cu ...@@ -457,40 +474,42 @@ register_unity_group(cu
split_op.cu split_op.cu
split_selected_rows_op.cu split_selected_rows_op.cu
squared_l2_norm_op.cu squared_l2_norm_op.cu
stack_op.cu
strided_slice_op.cu
sum_op.cu sum_op.cu
temporal_shift_op.cu) temporal_shift_op.cu
arg_max_op.cu)
register_unity_group(cu register_unity_group(cu
row_conv_op.cu row_conv_op.cu
tile_op.cu
trace_op.cu
transpose_op.cu
tree_conv_op.cu tree_conv_op.cu
tril_triu_op.cu tril_triu_op.cu
truncated_gaussian_random_op.cu truncated_gaussian_random_op.cu
unfold_op.cu) unfold_op.cu
arg_min_op.cu
crop_tensor_op.cu)
register_unity_group(cu register_unity_group(cu
smooth_l1_loss_op.cu smooth_l1_loss_op.cu
uniform_random_op.cu uniform_random_op.cu
unique_op.cu
unstack_op.cu unstack_op.cu
where_index_op.cu where_index_op.cu
where_op.cu) where_op.cu
layer_norm_op.cu)
register_unity_group(cu
expand_as_op.cu
stack_op.cu)
# The following groups make better use of `/MP`, which is MSVC's parallel # The following groups make better use of `/MP`, which is MSVC's parallel
# compilation option, when compiling in Unity Build. # compilation option, when compiling in Unity Build.
register_unity_group(cu activation_op.cu) register_unity_group(cu activation_op.cu)
register_unity_group(cu arg_max_op.cu)
register_unity_group(cu arg_min_op.cu)
register_unity_group(cu batch_norm_op.cu)
register_unity_group(cu crop_tensor_op.cu)
register_unity_group(cu dist_op.cu) register_unity_group(cu dist_op.cu)
register_unity_group(cu expand_as_op.cu)
register_unity_group(cu expand_as_v2_op.cu) register_unity_group(cu expand_as_v2_op.cu)
register_unity_group(cu gru_unit_op.cu) register_unity_group(cu gru_unit_op.cu)
register_unity_group(cu instance_norm_op.cu)
register_unity_group(cu kldiv_loss_op.cu)
register_unity_group(cu partial_concat_op.cu)
register_unity_group(cu softmax_with_cross_entropy_op.cu)
register_unity_group(cu squared_l2_distance_op.cu)
register_unity_group(cu top_k_op.cu) register_unity_group(cu top_k_op.cu)
register_unity_group(cu argsort_op.cu)
register_unity_group(cu kron_op.cu)
register_unity_group(cu unique_op.cu)
register_unity_group(cu tile_op.cu)
register_unity_group(cu trace_op.cu)
register_unity_group(cu transpose_op.cu)
register_unity_group(cu strided_slice_op.cu)
register_unity_group(cu expand_op.cu)
register_unity_group(cu matmul_v2_op.cu)
register_unity_group(cu top_k_v2_op.cu)
register_unity_group(cu set_value_op.cu)
...@@ -262,7 +262,7 @@ echo ======================================== ...@@ -262,7 +262,7 @@ echo ========================================
echo Step 2. Buile Paddle ... echo Step 2. Buile Paddle ...
echo ======================================== echo ========================================
for /F %%# in ('wmic cpu get NumberOfLogicalProcessors^|findstr [0-9]') do set /a PARALLEL_PROJECT_COUNT=%%#*9/10 for /F %%# in ('wmic cpu get NumberOfLogicalProcessors^|findstr [0-9]') do set /a PARALLEL_PROJECT_COUNT=%%#*2/3
set build_times=1 set build_times=1
:build_tp :build_tp
echo Build third_party the %build_times% time: echo Build third_party the %build_times% time:
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册