提交 26736576 编写于 作者: D dangqingqing

Move RowwiseAdd functor to math_funcion and Add ColwiseSum functor.

上级 48947b51
......@@ -205,14 +205,8 @@ class GRUGradKernel : public framework::OpKernel<T> {
}
if (bias_grad) {
bias_grad->mutable_data<T>(context.GetPlace());
int m = static_cast<int>(batch_gate_grad.dims()[0]);
int n = static_cast<int>(batch_gate_grad.dims()[1]);
Tensor ones;
ones.mutable_data<T>({m}, context.GetPlace());
math::SetConstant<Place, T> set;
set(dev_ctx, &ones, static_cast<T>(1));
math::gemv<Place, T>(dev_ctx, true, m, n, 1., batch_gate_grad.data<T>(),
ones.data<T>(), 0., bias_grad->data<T>());
math::ColwiseSum<Place, T> col_sum;
col_sum(dev_ctx, batch_gate_grad, bias_grad);
}
}
......
......@@ -341,16 +341,11 @@ class LSTMGradKernel : public framework::OpKernel<T> {
}
if (bias && bias_g) {
/* backward bias */
int m = static_cast<int>(batch_gate_g.dims()[0]);
int n = static_cast<int>(batch_gate_g.dims()[1]);
Tensor ones;
ones.mutable_data<T>({m}, ctx.GetPlace());
math::SetConstant<Place, T> set;
set(device_ctx, &ones, static_cast<T>(1.0));
math::gemv<Place, T>(device_ctx, true, m, n, 1., batch_gate_g.data<T>(),
ones.data<T>(), 0., bias_g->data<T>());
Tensor b_g = *bias_g;
b_g.Resize({bias_g->numel(), 1});
Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size);
math::ColwiseSum<Place, T> col_sum;
col_sum(device_ctx, batch_gate_g, &gate_bias_g);
}
if (h0 && h0_g) {
......
......@@ -308,6 +308,11 @@ void set_constant(const platform::DeviceContext& context,
#endif
}
template struct RowwiseAdd<platform::CPUPlace, float>;
template struct RowwiseAdd<platform::CPUPlace, double>;
template struct ColwiseSum<platform::CPUPlace, float>;
template struct ColwiseSum<platform::CPUPlace, double>;
} // namespace math
} // namespace operators
} // namespace paddle
......@@ -292,6 +292,11 @@ void set_constant_with_place<platform::GPUPlace>(
TensorSetConstantGPU(context, tensor, value));
}
template struct RowwiseAdd<platform::GPUPlace, float>;
template struct RowwiseAdd<platform::GPUPlace, double>;
template struct ColwiseSum<platform::GPUPlace, float>;
template struct ColwiseSum<platform::GPUPlace, double>;
} // namespace math
} // namespace operators
} // namespace paddle
......@@ -117,6 +117,19 @@ void set_constant_with_place(const platform::DeviceContext& context,
void set_constant(const platform::DeviceContext& context,
framework::Tensor* tensor, float value);
template <typename Place, typename T>
struct RowwiseAdd {
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, const framework::Tensor& vec,
framework::Tensor* output);
};
template <typename Place, typename T>
struct ColwiseSum {
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor* vec);
};
} // namespace math
} // namespace operators
} // namespace paddle
......@@ -43,6 +43,41 @@ void Transpose<Place, T, Rank>::operator()(
auto* dev = context.GetEigenDevice<Place>();
eigen_out.device(*dev) = eigen_in.shuffle(permute);
}
template <typename Place, typename T>
void RowwiseAdd<Place, T>::operator()(const platform::DeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& vector,
framework::Tensor* output) {
auto in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector.numel(), size);
PADDLE_ENFORCE_EQ(output->dims(), in_dims);
auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenMatrix<T>::From(vector);
auto out = framework::EigenMatrix<T>::From(*output);
Eigen::array<int, 2> shape({{1, static_cast<int>(size)}});
Eigen::array<int, 2> bcast({{static_cast<int>(in_dims[0]), 1}});
out.device(*context.GetEigenDevice<Place>()) =
in + vec.reshape(shape).broadcast(bcast);
}
template <typename Place, typename T>
void ColwiseSum<Place, T>::operator()(const platform::DeviceContext& context,
const framework::Tensor& input,
framework::Tensor* vector) {
auto in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(vector->numel(), size);
auto vec = framework::EigenMatrix<T>::From(*vector);
auto in = framework::EigenMatrix<T>::From(input);
Eigen::array<int, 2> shape({{1, static_cast<int>(size)}});
vec.reshape(shape).device(*context.GetEigenDevice<Place>()) =
in.sum(Eigen::array<int, 1>({{0}})).reshape(shape);
}
}
} // namespace math
} // namespace operators
} // namespace paddle
......@@ -56,29 +56,6 @@ template class LoDTensor2BatchFunctor<platform::CPUPlace, double>;
template class Batch2LoDTensorFunctor<platform::CPUPlace, float>;
template class Batch2LoDTensorFunctor<platform::CPUPlace, double>;
template <typename T>
struct RowwiseAdd<platform::CPUPlace, T> {
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, const framework::Tensor& bias,
framework::Tensor* output) {
auto in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(bias.numel(), size);
PADDLE_ENFORCE_EQ(output->dims(), in_dims);
auto in = EigenMatrix<T>::From(input);
auto b = EigenMatrix<T>::From(bias);
auto out = EigenMatrix<T>::From(*output);
Eigen::array<int, 2> bshape({{1, static_cast<int>(size)}});
Eigen::array<int, 2> bcast({{static_cast<int>(in_dims[0]), 1}});
out.device(*context.GetEigenDevice<platform::CPUPlace>()) =
in + b.reshape(bshape).broadcast(bcast);
}
};
template struct RowwiseAdd<platform::CPUPlace, float>;
template struct RowwiseAdd<platform::CPUPlace, double>;
} // namespace math
} // namespace operators
} // namespace paddle
......@@ -74,37 +74,6 @@ template class LoDTensor2BatchFunctor<platform::GPUPlace, double>;
template class Batch2LoDTensorFunctor<platform::GPUPlace, float>;
template class Batch2LoDTensorFunctor<platform::GPUPlace, double>;
template <typename T>
__global__ void RowwiseAddKernel(const T* src, const T* b, T* dst,
int64_t height, int64_t width) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < height * width;
i += blockDim.x * gridDim.x) {
int64_t h = i / width;
int64_t w = i % width;
dst[h * width + w] = src[h * width + w] + b[w];
}
}
template <typename T>
struct RowwiseAdd<platform::GPUPlace, T> {
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, const framework::Tensor& bias,
framework::Tensor* output) {
auto in_dims = input.dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(bias.numel(), size);
PADDLE_ENFORCE_EQ(output->dims(), in_dims);
int block = 512;
int grid = (input.numel() + block - 1) / block;
auto stream =
reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
RowwiseAddKernel<T><<<grid, block, 0, stream>>>(
input.data<T>(), bias.data<T>(), output->data<T>(), in_dims[0], size);
}
};
template struct RowwiseAdd<platform::GPUPlace, float>;
template struct RowwiseAdd<platform::GPUPlace, double>;
} // namespace math
} // namespace operators
} // namespace paddle
......@@ -164,13 +164,6 @@ class Batch2LoDTensorFunctor {
}
};
template <typename Place, typename T>
struct RowwiseAdd {
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, const framework::Tensor& bias,
framework::Tensor* output);
};
} // namespace math
} // namespace operators
} // namespace paddle
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册