Commit 86fa8c05 authored by xutianbing

Wei Xu's comments, set up right inouts.

Parent 1dd972f9
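The change below moves the gradient buffers of the backward functions out of `inputs`/`outputs` and into `inouts`: `calc()` now reads the input sequence and the output gradient from `inputs` and accumulates into the gradient buffers passed through `inouts`, while `outputs` stays empty. The sketch here illustrates that calling convention with hypothetical, simplified stand-ins for `Tensor` and `Arguments`; it is not Paddle's real API, and the shape check only mirrors the one asserted in the diff.

// Minimal, self-contained sketch (not Paddle's real API): simplified stand-ins
// for the Arguments/Tensor types, showing the inputs / outputs / inouts calling
// convention that this commit sets up for the backward function.
#include <cassert>
#include <cstddef>
#include <vector>

using real = float;

// Hypothetical simplified tensor: a raw buffer plus its shape.
struct Tensor {
  real* data = nullptr;
  std::vector<std::size_t> dims;
};
using Arguments = std::vector<Tensor>;

// Backward pass, following the argument layout documented in the diff:
//   inputs[0] - input sequence (read-only)
//   inputs[1] - output grad    (read-only)
//   inouts[0] - input grad     (accumulated in place)
//   inouts[1] - weight grad    (accumulated in place)
void contextProjectionBackwardSketch(const Arguments& inputs,
                                     const Arguments& outputs,
                                     const Arguments& inouts,
                                     std::size_t context_length) {
  assert(inputs.size() == 2);
  assert(outputs.empty());  // nothing is produced through `outputs`
  assert(inouts.size() == 2);
  // dim of output grad == dim of input grad * context_length
  assert(inputs[1].dims[1] == inouts[0].dims[1] * context_length);
  // A real implementation would dispatch to the data/weight backward kernels here.
}

int main() {
  const std::size_t batch = 4, dim = 8, context_length = 3, pad = 2;
  std::vector<real> in_grad(batch * dim, 0), w_grad(pad * dim, 0),
      out_grad(batch * dim * context_length, 1);
  std::vector<int> seq = {0, 2, 4};  // sequence start offsets
  Arguments inputs = {
      {reinterpret_cast<real*>(seq.data()), {seq.size()}},
      {out_grad.data(), {batch, dim * context_length}}};
  Arguments outputs;  // intentionally empty
  Arguments inouts = {{in_grad.data(), {batch, dim}},
                      {w_grad.data(), {pad, dim}}};
  contextProjectionBackwardSketch(inputs, outputs, inouts, context_length);
  return 0;
}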
@@ -70,10 +70,11 @@ void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
}
/**
+ * \param outputs[0] output value.
+ *
 * \param inputs[0] input value.
 * \param inputs[1] input weight.
 * \param inputs[2] input sequence.
- * \param outputs[0] output value.
 */
template <DeviceType Device>
class ContextProjectionForwardFunc : public FunctionBase {
@@ -123,7 +124,8 @@ private:
};
template <>
- void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix& out_grad_mat,
+ <<<<<<< HEAD
+ void ContextProjectionBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad_mat,
CpuMatrix& in_grad_mat,
CpuMatrix& w_grad_mat,
const CpuIVector& seq_vec,
@@ -176,10 +178,10 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix& out_grad_mat,
}
/**
- * \param inputs[0] input grad.
- * \param inputs[1] weight grad.
- * \param inputs[2] input sequence.
- * \param outputs[0] output value.
+ * \param inputs[0] input sequence.
+ * \param inputs[1] output grad.
+ * \param inouts[0] input grad.
+ * \param inouts[1] weight grad.
 */
template <DeviceType Device>
class ContextProjectionBackwardFunc : public FunctionBase {
@@ -192,6 +194,7 @@ public:
total_pad_ = config.get<size_t>("total_pad");
}
+ <<<<<<< HEAD
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
CHECK_EQ((size_t)3, inputs.size());
CHECK_EQ((size_t)1, outputs.size());
@@ -210,6 +213,42 @@ public:
CHECK_EQ(outputs[0].shape()[1], inputs[0].shape()[1] * context_length_);
CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+ =======
+ void calc(const Arguments& inputs,
+ const Arguments& outputs,
+ const Arguments& inouts) override {
+ CHECK_EQ(2, inputs.size());
+ CHECK_EQ(0, outputs.size());
+ CHECK_EQ(2, inouts.size());
+ CHECK(inputs[0].getData() && inputs[1].getData());
+ CHECK_EQ(inputs[0].dims_.size(), 1);
+ CHECK_EQ(inputs[1].dims_.size(), 2);
+ CHECK_EQ(inouts[0].dims_.size(), 2);
+ CHECK_EQ(inouts[1].dims_.size(), 2);
+ /// dim of input grad == dim of weight grad
+ CHECK_EQ(inouts[0].dims_[1], inouts[1].dims_[1]);
+ /// input grad and output grad have the same batch_size
+ CHECK_EQ(inouts[0].dims_[0], inputs[1].dims_[0]);
+ /// dim of output = dim of input * context_length
+ CHECK_EQ(inputs[1].dims_[1], inputs[0].dims_[1] * context_length_);
+ typename SequenceT<Device>::type seq_vec(
+ inputs[0].dims_[0], reinterpret_cast<int*>(inputs[0].getData()));
+ const auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
+ inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
+ auto in_grad_mat =
+ !inouts[0].getData()
+ ? nullptr
+ : std::make_shared<typename MatrixT<Device>::type>(
+ inouts[0].getData(), inouts[0].dims_[0], inouts[0].dims_[1]);
+ auto w_grad_mat =
+ !inouts[1].getData()
+ ? nullptr
+ : std::make_shared<typename MatrixT<Device>::type>(
+ inouts[1].getData(), inouts[1].dims_[0], inouts[1].dims_[1]);
+ >>>>>>> Wei Xu's comments, set up right inouts.
auto out_grad_mat = outputs[0].matrix<Device>();
auto in_grad_mat =
@@ -240,9 +279,9 @@ private:
#if 0
/**
- * \param inputs[0] input grad.
- * \param inputs[1] input sequence.
- * \param outputs[0] output grad.
+ * \param inouts[0] input grad.
+ * \param inputs[0] input sequence.
+ * \param inputs[1] output grad.
 */
template <DeviceType Device>
class ContextProjectionBackwardDataFunc : public FunctionBase {
@@ -255,23 +294,24 @@ public:
void calc(const Arguments& inputs,
const Arguments& outputs,
const Arguments& inouts) override {
- CHECK_EQ(2, static_cast<int>(inputs.size()));
- CHECK_EQ(1, static_cast<int>(outputs.size()));
- CHECK_EQ(0, static_cast<int>(inouts.size()));
- CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData());
- CHECK_EQ(static_cast<int>(outputs[0].dims_.size()), 2);
- CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 2);
- CHECK_EQ(static_cast<int>(inputs[1].dims_.size()), 1);
- CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
- /// input and output has the same batch_size
- CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
- auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
- outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
- const auto in_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
- inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
+ CHECK_EQ(2, inputs.size());
+ CHECK_EQ(0, outputs.size());
+ CHECK_EQ(1, inouts.size());
+ CHECK(inouts[0].getData() && inputs[0].getData() && inputs[1].getData());
+ CHECK_EQ(inputs[0].dims_.size(), 1);
+ CHECK_EQ(inputs[1].dims_.size(), 2);
+ CHECK_EQ(inouts[0].dims_.size(), 2);
+ CHECK_EQ(inputs[1].dims_[1], inouts[0].dims_[1] * context_length_);
+ /// input and output grad have the same batch_size
+ CHECK_EQ(inouts[0].dims_[0], inputs[1].dims_[0]);
typename SequenceT<Device>::type seq_vec(
- inputs[1].dims_[0], reinterpret_cast<int*>(inputs[1].getData()));
+ inputs[0].dims_[0], reinterpret_cast<int*>(inputs[0].getData()));
+ const auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
+ inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
+ auto in_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
+ inouts[0].getData(), inouts[0].dims_[0], inouts[0].dims_[1]);
ContextProjectionBackwardData<Device>(out_grad_mat.get(),
in_grad_mat.get(),
@@ -286,9 +326,9 @@ private:
};
/**
- * \param inputs[0] weight grad.
- * \param inputs[1] input sequence.
- * \param outputs[0] output grad.
+ * \param inouts[0] weight grad.
+ * \param inputs[0] input sequence.
+ * \param inputs[1] output grad.
 */
template <DeviceType Device>
class ContextProjectionBackwardWeightFunc : public FunctionBase {
@@ -303,22 +343,22 @@ public:
void calc(const Arguments& inputs,
const Arguments& outputs,
const Arguments& inouts) override {
- CHECK_EQ(2, static_cast<int>(inputs.size()));
- CHECK_EQ(1, static_cast<int>(outputs.size()));
- CHECK_EQ(0, static_cast<int>(inouts.size()));
- CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData());
- CHECK_EQ(static_cast<int>(outputs[0].dims_.size()), 2);
- CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 2);
- CHECK_EQ(static_cast<int>(inputs[1].dims_.size()), 1);
- CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
- auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
- outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
- auto w_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
- inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
+ CHECK_EQ(2, inputs.size());
+ CHECK_EQ(0, outputs.size());
+ CHECK_EQ(1, inouts.size());
+ CHECK(inouts[0].getData() && inputs[0].getData() && inputs[1].getData());
+ CHECK_EQ(inputs[0].dims_.size(), 1);
+ CHECK_EQ(inputs[1].dims_.size(), 2);
+ CHECK_EQ(inouts[0].dims_.size(), 2);
+ CHECK_EQ(inputs[1].dims_[1], inouts[0].dims_[1] * context_length_);
typename SequenceT<Device>::type seq_vec(
- inputs[1].dims_[0], reinterpret_cast<int*>(inputs[1].getData()));
+ inputs[0].dims_[0], reinterpret_cast<int*>(inputs[0].getData()));
+ const auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
+ inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
+ auto w_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
+ inouts[0].getData(), inouts[0].dims_[0], inouts[0].dims_[1]);
ContextProjectionBackwardWeight<Device>(out_grad_mat.get(),
w_grad_mat.get(),
...
@@ -21,14 +21,14 @@ namespace paddle {
/**
 * \brief Context Projection Forward.
 *
- * \param[out] outputs output data.
+ * \param[in/out] outputs output data.
 * \param[in] input input data.
 * \param[in] weight input weight.
 * \param[in] sequence input data.
 * \param[in] context_length consecutive rows for concatenation.
 * \param[in] context_start context start position.
 * \param[in] begin_pad begining pad position.
 * \param[in] is_padding whether padding 0 or not.
 *
 */
template <DeviceType DType>
@@ -68,7 +68,7 @@ void ContextProjectionBackward(
template <DeviceType DType>
void ContextProjectionBackwardData(
- typename Tensor<real, DType>::Matrix& out_grad,
+ const typename Tensor<real, DType>::Matrix& out_grad,
typename Tensor<real, DType>::Matrix& in_grad,
const typename Tensor<int, DType>::Vector& sequence,
size_t context_length,
@@ -76,7 +76,7 @@ void ContextProjectionBackwardData(
template <DeviceType DType>
void ContextProjectionBackwardWeight(
- typename Tensor<real, DType>::Matrix& out_grad,
+ const typename Tensor<real, DType>::Matrix& out_grad,
typename Tensor<real, DType>::Matrix& w_grad,
const typename Tensor<int, DType>::Vector& seq_vec,
size_t context_length,
...
@@ -138,10 +138,10 @@ void ContextProjectionForward<DEVICE_TYPE_GPU>(GpuMatrix& output,
begin_pad);
}
- __global__ void KeContextProjectionBackwardData(real* out_grad,
+ __global__ void KeContextProjectionBackwardData(const real* out_grad,
const int* sequence,
real* in_grad,
- int input_dim,
+ size_t input_dim,
int context_length,
int context_start) {
int idx = threadIdx.x;
@@ -152,7 +152,8 @@ __global__ void KeContextProjectionBackwardData(real* out_grad,
real value = 0;
int instances = seq_end - seq_start + context_length - 1;
- out_grad += seq_start * input_dim * context_length;
+ auto out = const_cast<real*>(out_grad);
+ out += seq_start * input_dim * context_length;
in_grad += seq_start * input_dim;
for (int k = 0; k <= input_dim / block_size; k++) {
if (idx < input_dim) {
@@ -169,7 +170,7 @@ __global__ void KeContextProjectionBackwardData(real* out_grad,
int outx = (i - context_length) < 0 ? i : (context_length - 1);
int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
real* output_r =
- out_grad + outy * input_dim * context_length + outx * input_dim;
+ out + outy * input_dim * context_length + outx * input_dim;
for (int j = outy; j < seq_end - seq_start; j++) {
value += output_r[idx];
if (j - outy == outx) break;
@@ -194,7 +195,7 @@ __global__ void KeContextProjectionBackwardData(real* out_grad,
 * @param[in] context_start context start.
 *
 */
- void hl_context_projection_backward_data(real* out_grad,
+ void hl_context_projection_backward_data(const real* out_grad,
const int* sequence,
real* input_grad,
size_t num_sequences,
@@ -216,7 +217,8 @@ void hl_context_projection_backward_data(real* out_grad,
}
template <>
- void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(GpuMatrix& out_grad,
+ <<<<<<< HEAD
+ void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
GpuMatrix& in_grad,
const GpuIVector& sequence,
size_t context_length,
@@ -231,7 +233,7 @@ void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(GpuMatrix& out_grad,
}
template<int THREADS_X, int THREADS_Y>
- __global__ void KeContextProjectionBackwardWeight(real* out_grad,
+ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
const int* sequence,
real* w_grad,
int num_sequences,
@@ -254,7 +256,8 @@ __global__ void KeContextProjectionBackwardWeight(real* out_grad,
for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) {
int seq_start = sequence[seqId];
int seq_end = sequence[seqId+1];
- output_r = out_grad + seq_start * w_dim * context_length;
+ output_r = const_cast<real*>(out_grad)
+ + seq_start * w_dim * context_length;
if (context_start < 0) {
if (padId + context_start < 0) {
@@ -318,7 +321,7 @@ __global__ void KeContextProjectionBackwardWeight(real* out_grad,
 * beginning.
 *
 */
- void hl_context_projection_backward_weight(real* out_grad,
+ void hl_context_projection_backward_weight(const real* out_grad,
const int* sequence,
real* w_grad,
size_t num_sequences,
@@ -346,7 +349,7 @@ void hl_context_projection_backward_weight(real* out_grad,
template <>
void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
- GpuMatrix& out_grad,
+ const GpuMatrix& out_grad,
GpuMatrix& w_grad,
const GpuIVector& seq_vec,
size_t context_length,
@@ -365,7 +368,7 @@ void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
}
template <>
- void ContextProjectionBackward<DEVICE_TYPE_GPU>(GpuMatrix& out_grad,
+ void ContextProjectionBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
GpuMatrix& in_grad,
GpuMatrix& w_grad,
const GpuIVector& sequence,
...
@@ -62,16 +62,18 @@ void testMatrixProjectionForward(int context_start,
Dims{pad, input_dim}),
Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
Dims{cpu_seq->getSize()})},
- {Tensor(cpu_out.getData(), Dims{batch_size, input_dim * context_length})},
- {});
+ {},
+ {Tensor(cpu_out.getData(),
+ Dims{batch_size, input_dim * context_length})});
compare.getGpuFunction()->calc(
{Tensor(gpu_in.getData(), Dims{batch_size, input_dim}),
Tensor(gpu_weight ? gpu_weight->getData() : nullptr,
Dims{pad, input_dim}),
Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
Dims{gpu_seq->getSize()})},
- {Tensor(gpu_out.getData(), Dims{batch_size, input_dim * context_length})},
- {});
+ {},
+ {Tensor(gpu_out.getData(),
+ Dims{batch_size, input_dim * context_length})});
autotest::TensorCheckEqual(cpu_out, gpu_out);
}
@@ -118,24 +120,24 @@ void testMatrixProjectionBackward(int context_start,
}
compare.getCpuFunction()->calc(
+ {Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
+ Dims{cpu_seq->getSize()}),
+ Tensor(cpu_out_grad.getData(),
+ Dims{batch_size, input_dim * context_length})},
+ {},
{Tensor(cpu_in_grad.getData(), Dims{batch_size, input_dim}),
Tensor(cpu_w_grad ? cpu_w_grad->getData() : nullptr,
- Dims{pad, input_dim}),
- Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
- Dims{cpu_seq->getSize()})},
- {Tensor(cpu_out_grad.getData(),
- Dims{batch_size, input_dim * context_length})},
- {});
+ Dims{pad, input_dim})});
compare.getGpuFunction()->calc(
+ {Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
+ Dims{gpu_seq->getSize()}),
+ Tensor(gpu_out_grad.getData(),
+ Dims{batch_size, input_dim * context_length})},
+ {},
{Tensor(gpu_in_grad.getData(), Dims{batch_size, input_dim}),
Tensor(gpu_w_grad ? gpu_w_grad->getData() : nullptr,
- Dims{pad, input_dim}),
- Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
- Dims{gpu_seq->getSize()})},
- {Tensor(gpu_out_grad.getData(),
- Dims{batch_size, input_dim * context_length})},
- {});
+ Dims{pad, input_dim})});
autotest::TensorCheckErr(cpu_in_grad, gpu_in_grad);
if (is_padding) {
...