Commit be3e2764 authored by T tianbingsz, committed by GitHub

Merge pull request #1009 from tianbingsz/paddle_func_mat

add paddle functions for Matrix ContextProjection APIs
......@@ -48,78 +48,6 @@ extern void hl_max_sequence_forward(real* input,
extern void hl_max_sequence_backward(
real* outputGrad, int* index, real* inputGrad, int numSequences, int dim);
/**
* @brief Context projection forward.
*
* @param[in] input input sequence.
* @param[in] sequence sequence index.
* @param[in] weightData padding data.
* @param[out] output output sequence.
* @param[in] numSequences number of sequences.
* @param[in] inputDim input sequence dimension.
* @param[in] contextLength context length.
* @param[in] contextStart context start.
* @param[in] beginPad number of extra timesteps added at the
* beginning.
* @param[in] isPadding whether to use trainable padding.
*
*/
extern void hl_context_projection_forward(real* input,
const int* sequence,
real* weightData,
real* output,
int numSequences,
int inputDim,
int contextLength,
int contextStart,
int beginPad,
bool isPadding);
/**
* @brief Context projection backward data.
*
* @param[in] outputGrad output gradient.
* @param[in] sequence sequence index.
* @param[out] inputGrad input gradient.
* @param[in] numSequences number of sequences.
* @param[in] inputDim input sequence dimension.
* @param[in] contextLength context length.
* @param[in] contextStart context start.
*
*/
extern void hl_context_projection_backward_data(real* outputGrad,
const int* sequence,
real* inputGrad,
int numSequences,
int inputDim,
int contextLength,
int contextStart);
/**
* @brief Context projection backward weight.
*
* @param[in] outputGrad output gradient.
* @param[in] sequence sequence index.
* @param[out] weightGrad weight gradient.
* @param[in] numSequences number of sequences.
* @param[in] weightDim weight dimension (equals the input dimension).
* @param[in] totalPad number of extra timesteps.
* @param[in] contextLength context length.
* @param[in] contextStart context start.
* @param[in] beginPad number of extra timesteps added at the
* beginning.
*
*/
extern void hl_context_projection_backward_weight(real* outputGrad,
const int* sequence,
real* weightGrad,
int numSequences,
int weightDim,
int totalPad,
int contextLength,
int contextStart,
int beginPad);
/**
* @brief Memory copy from sequence to batch.
*
......
......@@ -27,35 +27,6 @@ inline void hl_max_sequence_forward(real* input,
inline void hl_max_sequence_backward(
real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {}
inline void hl_context_projection_forward(real* input,
const int* sequence,
real* weightData,
real* output,
int numSequences,
int inputDim,
int contextLength,
int contextStart,
int beginPad,
bool isPadding) {}
inline void hl_context_projection_backward_data(real* outputGrad,
const int* sequence,
real* inputGrad,
int numSequences,
int inputDim,
int contextLength,
int contextStart) {}
inline void hl_context_projection_backward_weight(real* outputGrad,
const int* sequence,
real* weightGrad,
int numSequences,
int weightDim,
int totalPad,
int contextLength,
int contextStart,
int beginPad) {}
inline void hl_sequence2batch_copy(real* batch,
real* sequence,
const int* batchIndex,
......
......@@ -90,258 +90,6 @@ void hl_max_sequence_backward(real* outputGrad,
CHECK_SYNC("hl_max_sequence_backward failed");
}
template <bool padding>
__global__ void KeContextProjectionForward(real* input,
const int* sequence,
real* weightData,
real* output,
int inputDim,
int contextLength,
int contextStart,
int beginPad) {
int idx = threadIdx.x;
int blockSize = blockDim.x;
int sequenceId = blockIdx.x;
int seqStart = sequence[sequenceId];
int seqEnd = sequence[sequenceId+1];
real value = 0;
int instances = seqEnd - seqStart + contextLength - 1;
output += seqStart * inputDim * contextLength;
input += seqStart * inputDim;
for (int k = 0; k <= inputDim / blockSize; k++) {
if (idx < inputDim) {
for (int i = 0; i < instances; i++) {
// i + contextStart;
if ((i + contextStart) < 0) {
if (padding) {
value = weightData[i * inputDim + idx];
} else {
continue;
}
} else if ((i + contextStart) >= (seqEnd - seqStart)) {
if (padding) {
value =
weightData[(beginPad + i + contextStart - (seqEnd - seqStart)) *
inputDim + idx];
} else {
continue;
}
} else {
value = input[(i + contextStart) * inputDim + idx];
}
int outx = (i - contextLength) < 0 ? i : (contextLength - 1);
int outy = (i - contextLength) < 0 ? 0 : (i - (contextLength - 1));
real* output_r =
output + outy * inputDim * contextLength + outx * inputDim;
for (int j = outy; j < seqEnd - seqStart; j++) {
output_r[idx] += value;
if (j - outy == outx) break;
output_r += (contextLength - 1) * inputDim;
}
}
}
idx += blockSize;
}
}
void hl_context_projection_forward(real* input,
const int* sequence,
real* weightData,
real* output,
int numSequences,
int inputDim,
int contextLength,
int contextStart,
int beginPad,
bool isPadding) {
CHECK_NOTNULL(input);
CHECK_NOTNULL(sequence);
CHECK_NOTNULL(output);
CHECK(!isPadding || weightData);
int blockSize = 128;
int blocksX = numSequences;
int blocksY = 1;
dim3 threads(blockSize, 1);
dim3 grid(blocksX, blocksY);
if (isPadding) {
KeContextProjectionForward<true><<< grid, threads, 0, STREAM_DEFAULT >>>
(input, sequence, weightData, output, inputDim,
contextLength, contextStart, beginPad);
} else {
KeContextProjectionForward<false><<< grid, threads, 0, STREAM_DEFAULT >>>
(input, sequence, weightData, output, inputDim,
contextLength, contextStart, beginPad);
}
CHECK_SYNC("hl_context_projection_forward failed");
}
__global__ void KeContextProjectionBackwardData(real* outputGrad,
const int* sequence,
real* inputGrad,
int inputDim,
int contextLength,
int contextStart) {
int idx = threadIdx.x;
int blockSize = blockDim.x;
int sequenceId = blockIdx.x;
int seqStart = sequence[sequenceId];
int seqEnd = sequence[sequenceId+1];
real value = 0;
int instances = seqEnd - seqStart + contextLength - 1;
outputGrad += seqStart * inputDim * contextLength;
inputGrad += seqStart * inputDim;
for (int k = 0; k <= inputDim / blockSize; k++) {
if (idx < inputDim) {
for (int i = 0; i < instances; i++) {
if ((i + contextStart) < 0) {
continue;
} else if ((i + contextStart) >= (seqEnd - seqStart)) {
continue;
} else {
// value = 0;
value = inputGrad[(i + contextStart) * inputDim + idx];
}
int outx = (i - contextLength) < 0 ? i : (contextLength - 1);
int outy = (i - contextLength) < 0 ? 0 : (i - (contextLength - 1));
real* output_r =
outputGrad + outy * inputDim * contextLength + outx * inputDim;
for (int j = outy; j < seqEnd - seqStart; j++) {
value += output_r[idx];
if (j - outy == outx) break;
output_r += (contextLength - 1) * inputDim;
}
inputGrad[(i + contextStart) * inputDim + idx] = value;
}
}
idx += blockSize;
}
}
void hl_context_projection_backward_data(real* outputGrad,
const int* sequence,
real* inputGrad,
int numSequences,
int inputDim,
int contextLength,
int contextStart) {
CHECK_NOTNULL(outputGrad);
CHECK_NOTNULL(sequence);
CHECK_NOTNULL(inputGrad);
int blockSize = 128;
int blocksX = numSequences;
int blocksY = 1;
dim3 threads(blockSize, 1);
dim3 grid(blocksX, blocksY);
KeContextProjectionBackwardData<<< grid, threads, 0, STREAM_DEFAULT >>>
(outputGrad, sequence, inputGrad, inputDim, contextLength, contextStart);
CHECK_SYNC("hl_context_projection_backward_data failed");
}
template<int THREADS_X, int THREADS_Y>
__global__ void KeContextProjectionBackwardWeight(real* outputGrad,
const int* sequence,
real* weightGrad,
int numSequences,
int weightDim,
int contextLength,
int contextStart,
int beginPad) {
__shared__ real sum_s[THREADS_Y][THREADS_X];
int padOfBlock = (weightDim + THREADS_X - 1) / THREADS_X;
const int idx = threadIdx.x;
const int idy = threadIdx.y;
int padId = blockIdx.x / padOfBlock;
int weightIdx = idx + THREADS_X * (blockIdx.x % padOfBlock);
int instanceId;
real value = 0;
real* output_r;
sum_s[idy][idx] = 0.0f;
if (weightIdx < weightDim) {
for (int seqId = idy; seqId < numSequences; seqId += THREADS_Y) {
int seqStart = sequence[seqId];
int seqEnd = sequence[seqId+1];
output_r = outputGrad + seqStart * weightDim * contextLength;
if (contextStart < 0) {
if (padId + contextStart < 0) {
instanceId = padId;
} else {
// beginPad > 0;
instanceId = (padId - beginPad) + (seqEnd - seqStart) - contextStart;
}
} else {
if (padId + (seqEnd - seqStart) < contextStart) {
continue;
} else {
// beginPad == 0;
instanceId = padId + (seqEnd - seqStart) - contextStart;
}
}
int outx = (instanceId - contextLength) < 0 ?
instanceId : (contextLength - 1);
int outy = (instanceId - contextLength) < 0 ?
0 : (instanceId - (contextLength - 1));
output_r += outy * weightDim * contextLength + outx * weightDim;
for (int j = outy; j < seqEnd - seqStart; j++) {
value += output_r[weightIdx];
if (j - outy == outx) break;
output_r += (contextLength - 1) * weightDim;
}
}
sum_s[idy][idx] = value;
}
__syncthreads();
for (int stride = THREADS_Y/2; stride > 0; stride = stride/2) {
if (idy < stride) {
sum_s[idy][idx] += sum_s[idy + stride][idx];
}
__syncthreads();
}
__syncthreads();
if (weightIdx < weightDim) {
if (idy == 0) {
weightGrad[padId * weightDim + weightIdx] += sum_s[0][idx];
}
}
}
void hl_context_projection_backward_weight(real* outputGrad,
const int* sequence,
real* weightGrad,
int numSequences,
int weightDim,
int totalPad,
int contextLength,
int contextStart,
int beginPad) {
CHECK_NOTNULL(outputGrad);
CHECK_NOTNULL(sequence);
CHECK_NOTNULL(weightGrad);
int threadsX = 32;
int threadsY = 32;
int blocksX = totalPad * ((weightDim + threadsX - 1) / threadsX);
dim3 threads(threadsX, threadsY);
dim3 grid(blocksX, 1);
KeContextProjectionBackwardWeight<32, 32>
<<< grid, threads, 0, STREAM_DEFAULT >>>
(outputGrad, sequence, weightGrad, numSequences, weightDim,
contextLength, contextStart, beginPad);
CHECK_SYNC("hl_context_projection_backward_weight failed");
}
template<int blockDimX, int blockDimY, int gridDimX, bool AddRow>
__global__ void KeMatrixAddRows(real* output,
real* table,
......
......@@ -17,6 +17,9 @@ if(WITH_TESTING)
# file(GLOB test_files . *OpTest.cpp)
# add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files})
add_simple_unittest(CrossMapNormalOpTest)
add_unittest(ContextProjectionOpTest
ContextProjectionOpTest.cpp
../gserver/tests/TestUtil.cpp)
endif()
endif()
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "ContextProjectionOp.h"
#include "paddle/math/Matrix.h"
#include "paddle/math/Vector.h"
namespace paddle {
template <>
void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix* out_mat,
const CpuMatrix* input_mat,
const CpuMatrix* weight_mat,
const CpuIVector& seq_vec,
size_t context_length,
int context_start,
size_t begin_pad) {
const int* starts = seq_vec.getData();
const size_t num_sequences = seq_vec.getSize() - 1;
auto w_mat = const_cast<CpuMatrix*>(weight_mat);
auto in_mat = const_cast<CpuMatrix*>(input_mat);
for (size_t i = 0; i < num_sequences; ++i) {
for (size_t j = 0; j < context_length; ++j) {
int begin = starts[i] + context_start + j;
int end = starts[i + 1] + context_start + j;
int dst_begin = starts[i];
int dst_end = starts[i + 1];
if (begin < starts[i]) {
int64_t pad_size =
std::min(starts[i] - begin, starts[i + 1] - starts[i]);
MatrixPtr mat = out_mat->subMatrix(starts[i], pad_size);
if (w_mat) {
MatrixPtr sub = w_mat->subMatrix(j, pad_size);
mat->addAtOffset(*sub, j * in_mat->getWidth());
}
dst_begin = starts[i] + pad_size;
begin = starts[i];
}
if (end > starts[i + 1]) {
int64_t pad_size =
std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
MatrixPtr mat = out_mat->subMatrix(starts[i + 1] - pad_size, pad_size);
if (w_mat) {
MatrixPtr sub = w_mat->subMatrix(
begin_pad + context_start + j - pad_size, pad_size);
mat->addAtOffset(*sub, j * in_mat->getWidth());
}
dst_end = starts[i + 1] - pad_size;
end = starts[i + 1];
}
if (end <= begin) continue;
MatrixPtr src = in_mat->subMatrix(begin, end - begin);
MatrixPtr dst = out_mat->subMatrix(dst_begin, dst_end - dst_begin);
dst->addAtOffset(*src, j * in_mat->getWidth());
}
}
}
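// Worked example of the trimming above (annotation added for clarity,
// not in the original patch): for a sequence [starts[i], starts[i+1]) =
// [0, 4) with context_start = -1 and j = 0, begin = -1 < 0, so
// pad_size = 1: output row 0's column block 0 is filled from weight
// row j = 0, and output rows 1..3 receive input rows 0..2 via
// addAtOffset at column offset j * input_dim.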
/**
* \param inputs[0] input value.
* \param inputs[1] input weight.
* \param inputs[2] input sequence.
* \param outputs[0] output value.
*/
template <DeviceType Device>
class ContextProjectionForwardFunc : public FunctionBase {
public:
void init(const FuncConfig& config) override {
context_length_ = config.get<size_t>("context_length");
context_start_ = config.get<int>("context_start");
begin_pad_ = config.get<size_t>("begin_pad");
}
void calc(const Arguments& inputs,
const Arguments& outputs,
const Arguments& inouts) override {
CHECK_EQ(3, inputs.size());
CHECK_EQ(1, outputs.size());
CHECK_EQ(0, inouts.size());
CHECK(outputs[0].getData() && inputs[0].getData() && inputs[2].getData());
CHECK_EQ(outputs[0].dims_.size(), 2);
CHECK_EQ(inputs[0].dims_.size(), 2);
CHECK_EQ(inputs[1].dims_.size(), 2);
CHECK_EQ(inputs[2].dims_.size(), 1);
/// dim of output = dim of input * context_length
CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
/// dim of input == dim of weight
CHECK_EQ(inputs[0].dims_[1], inputs[1].dims_[1]);
/// input and output have the same batch_size
CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
auto out_mat = std::make_shared<typename MatrixT<Device>::type>(
outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
const auto in_mat = std::make_shared<typename MatrixT<Device>::type>(
inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
const auto w_mat =
!inputs[1].getData()
? nullptr
: std::make_shared<typename MatrixT<Device>::type>(
inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
typename SequenceT<Device>::type seq_vec(
inputs[2].dims_[0], reinterpret_cast<int*>(inputs[2].getData()));
ContextProjectionForward<Device>(out_mat.get(),
in_mat.get(),
w_mat.get(),
seq_vec,
context_length_,
context_start_,
begin_pad_);
}
private:
size_t context_length_;
int context_start_;
size_t begin_pad_;
};
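// Usage sketch (illustrative; names are placeholders, but the registry
// naming follows Projection::createFunction below, which appends
// "-CPU"/"-GPU" to the registered type):
//   auto func = FunctionBase::funcRegistrar_.createByType(
//       "ContextProjectionForward-CPU");
//   func->init(FuncConfig()
//                  .set("context_length", (size_t)3)
//                  .set("context_start", -1)
//                  .set("begin_pad", (size_t)1));
//   func->calc({in, weight, seq}, {out}, {});  // Arguments of Tensors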
template <>
void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix* out_grad_mat,
CpuMatrix* in_grad_mat,
CpuMatrix* w_grad_mat,
const CpuIVector& seq_vec,
size_t context_length,
int context_start,
size_t begin_pad,
bool is_padding,
size_t total_pad) {
CHECK(out_grad_mat);
size_t input_dim = in_grad_mat ? in_grad_mat->getWidth()
: w_grad_mat ? w_grad_mat->getWidth() : 0;
const int* starts = seq_vec.getData();
size_t num_sequences = seq_vec.getSize() - 1;
for (size_t i = 0; i < num_sequences; ++i) {
for (size_t j = 0; j < context_length; ++j) {
int begin = starts[i] + context_start + j;
int end = starts[i + 1] + context_start + j;
int dst_begin = starts[i];
int dst_end = starts[i + 1];
if (begin < starts[i]) {
int64_t pad_size =
std::min(starts[i] - begin, starts[i + 1] - starts[i]);
if (is_padding && w_grad_mat) {
MatrixPtr mat = out_grad_mat->subMatrix(starts[i], pad_size);
MatrixPtr sub = w_grad_mat->subMatrix(j, pad_size);
sub->addAtOffset(*mat, j * input_dim);
}
dst_begin = starts[i] + pad_size;
begin = starts[i];
}
if (end > starts[i + 1]) {
int64_t pad_size =
std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
if (is_padding && w_grad_mat) {
MatrixPtr mat =
out_grad_mat->subMatrix(starts[i + 1] - pad_size, pad_size);
MatrixPtr sub = w_grad_mat->subMatrix(
begin_pad + context_start + j - pad_size, pad_size);
sub->addAtOffset(*mat, j * input_dim);
}
dst_end = starts[i + 1] - pad_size;
end = starts[i + 1];
}
if (end <= begin) continue;
if (!in_grad_mat) continue;
MatrixPtr src = in_grad_mat->subMatrix(begin, end - begin);
MatrixPtr dst = out_grad_mat->subMatrix(dst_begin, dst_end - dst_begin);
src->addAtOffset(*dst, j * input_dim);
}
}
}
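// Note (annotation added for clarity): the backward pass mirrors the
// forward trimming exactly but reverses the accumulation direction.
// Forward adds input blocks into the output via
// dst->addAtOffset(*src, ...); backward gathers output-gradient blocks
// back into the input gradient via src->addAtOffset(*dst, ...) and
// accumulates into the weight gradient via sub->addAtOffset(*mat, ...).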
/**
* \param inputs[0] input grad.
* \param inputs[1] weight grad.
* \param inputs[2] input sequence.
* \param outputs[0] output grad.
*/
template <DeviceType Device>
class ContextProjectionBackwardFunc : public FunctionBase {
public:
void init(const FuncConfig& config) override {
context_length_ = config.get<size_t>("context_length");
context_start_ = config.get<int>("context_start");
begin_pad_ = config.get<size_t>("begin_pad");
is_padding_ = config.get<bool>("is_padding");
total_pad_ = config.get<size_t>("total_pad");
}
void calc(const Arguments& inputs,
const Arguments& outputs,
const Arguments& inouts) override {
CHECK_EQ(3, inputs.size());
CHECK_EQ(1, outputs.size());
CHECK_EQ(0, inouts.size());
CHECK(outputs[0].getData() && inputs[2].getData());
CHECK_EQ(outputs[0].dims_.size(), 2);
CHECK_EQ(inputs[0].dims_.size(), 2);
CHECK_EQ(inputs[1].dims_.size(), 2);
CHECK_EQ(inputs[2].dims_.size(), 1);
/// dim of input == dim of weight
CHECK_EQ(inputs[0].dims_[1], inputs[1].dims_[1]);
/// input and output have the same batch_size
CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
/// dim of output = dim of input * context_length
CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
auto in_grad_mat =
!inputs[0].getData()
? nullptr
: std::make_shared<typename MatrixT<Device>::type>(
inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
auto w_grad_mat =
!inputs[1].getData()
? nullptr
: std::make_shared<typename MatrixT<Device>::type>(
inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
typename SequenceT<Device>::type seq_vec(
inputs[2].dims_[0], reinterpret_cast<int*>(inputs[2].getData()));
ContextProjectionBackward<Device>(out_grad_mat.get(),
in_grad_mat ? in_grad_mat.get() : nullptr,
w_grad_mat ? w_grad_mat.get() : nullptr,
seq_vec,
context_length_,
context_start_,
begin_pad_,
is_padding_,
total_pad_);
}
private:
size_t context_length_;
int context_start_;
size_t begin_pad_;
bool is_padding_;
size_t total_pad_;
};
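// Config sketch for the backward function (mirrors the settings used in
// ContextProjectionOpTest below; the concrete values are illustrative,
// e.g. context_start = -1, context_length = 3 gives begin_pad = 1 and
// total_pad = 2):
//   FuncConfig()
//       .set("context_length", (size_t)3)
//       .set("context_start", -1)
//       .set("begin_pad", (size_t)1)
//       .set("is_padding", true)
//       .set("total_pad", (size_t)2);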
/**
* \param inputs[0] input grad.
* \param inputs[1] input sequence.
* \param outputs[0] output grad.
*/
template <DeviceType Device>
class ContextProjectionBackwardDataFunc : public FunctionBase {
public:
void init(const FuncConfig& config) override {
context_length_ = config.get<size_t>("context_length");
context_start_ = config.get<int>("context_start");
}
void calc(const Arguments& inputs,
const Arguments& outputs,
const Arguments& inouts) override {
CHECK_EQ(2, inputs.size());
CHECK_EQ(1, outputs.size());
CHECK_EQ(0, inouts.size());
CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData());
CHECK_EQ(outputs[0].dims_.size(), 2);
CHECK_EQ(inputs[0].dims_.size(), 2);
CHECK_EQ(inputs[1].dims_.size(), 1);
CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
/// input and output have the same batch_size
CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
const auto in_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
typename SequenceT<Device>::type seq_vec(
inputs[1].dims_[0], reinterpret_cast<int*>(inputs[1].getData()));
ContextProjectionBackwardData<Device>(out_grad_mat.get(),
in_grad_mat.get(),
seq_vec,
context_length_,
context_start_);
}
private:
size_t context_length_;
int context_start_;
};
/**
* \param inputs[0] weight grad.
* \param inputs[1] input sequence.
* \param outputs[0] output grad.
*/
template <DeviceType Device>
class ContextProjectionBackwardWeightFunc : public FunctionBase {
public:
void init(const FuncConfig& config) override {
context_length_ = config.get<size_t>("context_length");
context_start_ = config.get<int>("context_start");
begin_pad_ = config.get<size_t>("begin_pad");
total_pad_ = config.get<size_t>("total_pad");
}
void calc(const Arguments& inputs,
const Arguments& outputs,
const Arguments& inouts) override {
CHECK_EQ(2, inputs.size());
CHECK_EQ(1, outputs.size());
CHECK_EQ(0, inouts.size());
CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData());
CHECK_EQ(outputs[0].dims_.size(), 2);
CHECK_EQ(inputs[0].dims_.size(), 2);
CHECK_EQ(inputs[1].dims_.size(), 1);
CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
auto w_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
typename SequenceT<Device>::type seq_vec(
inputs[1].dims_[0], reinterpret_cast<int*>(inputs[1].getData()));
ContextProjectionBackwardWeight<Device>(out_grad_mat.get(),
w_grad_mat.get(),
seq_vec,
context_length_,
context_start_,
total_pad_,
begin_pad_);
}
private:
size_t context_length_;
int context_start_;
size_t begin_pad_;
size_t total_pad_;
};
REGISTER_TYPED_FUNC(ContextProjectionForward,
CPU,
ContextProjectionForwardFunc);
REGISTER_TYPED_FUNC(ContextProjectionBackward,
CPU,
ContextProjectionBackwardFunc);
#ifndef PADDLE_ONLY_CPU
REGISTER_TYPED_FUNC(ContextProjectionForward,
GPU,
ContextProjectionForwardFunc);
REGISTER_TYPED_FUNC(ContextProjectionBackward,
GPU,
ContextProjectionBackwardFunc);
REGISTER_TYPED_FUNC(ContextProjectionBackwardData,
GPU,
ContextProjectionBackwardDataFunc);
REGISTER_TYPED_FUNC(ContextProjectionBackwardWeight,
GPU,
ContextProjectionBackwardWeightFunc);
#endif
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "Function.h"
namespace paddle {
/**
* \brief Context Projection Forward.
*
* \param[out] output output data.
* \param[in] input input data.
* \param[in] weight input weight.
* \param[in] sequence sequence start positions.
* \param[in] context_length consecutive rows for concatenation.
* \param[in] context_start context start position.
* \param[in] begin_pad beginning pad position.
*
*/
template <DeviceType Device>
void ContextProjectionForward(typename MatrixT<Device>::type* output,
const typename MatrixT<Device>::type* input,
const typename MatrixT<Device>::type* weight,
const typename SequenceT<Device>::type& sequence,
size_t context_length,
int context_start,
size_t begin_pad);
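// Illustrative example (annotation added for clarity, not part of the
// original API docs): with context_start = -1 and context_length = 3,
// output row t of a sequence is the concatenation
// [x_{t-1}, x_t, x_{t+1}], so the output width is 3 * input_dim. Rows
// that fall outside the sequence come from the padding weight when one
// is supplied:
//
//   ContextProjectionForward<DEVICE_TYPE_CPU>(
//       &out, &in, &weight, seq,
//       /*context_length=*/3, /*context_start=*/-1, /*begin_pad=*/1);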
/**
* \brief Context Projection Backward.
*
* \param[in] out_grad output gradient.
* \param[out] in_grad input gradient.
* \param[out] w_grad weight gradient.
* \param[in] seq_vec sequence start positions.
* \param[in] context_length consecutive rows for concatenation.
* \param[in] context_start context start position.
* \param[in] begin_pad beginning pad position.
* \param[in] is_padding whether the padding is trainable.
* \param[in] total_pad total number of padding rows.
*
*/
template <DeviceType Device>
void ContextProjectionBackward(typename MatrixT<Device>::type* out_grad,
typename MatrixT<Device>::type* in_grad,
typename MatrixT<Device>::type* w_grad,
const typename SequenceT<Device>::type& seq_vec,
size_t context_length,
int context_start,
size_t begin_pad,
bool is_padding,
size_t total_pad);
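// Padding-size bookkeeping (a sketch matching the arithmetic used in
// ContextProjectionOpTest): begin_pad = max(0, -context_start) and
// total_pad = begin_pad + max(0, context_start + context_length - 1).
// E.g. context_start = -2, context_length = 5 gives begin_pad = 2 and
// total_pad = 2 + 2 = 4 rows in the padding-weight matrix.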
template <DeviceType Device>
void ContextProjectionBackwardData(
typename MatrixT<Device>::type* out_grad,
typename MatrixT<Device>::type* in_grad,
const typename SequenceT<Device>::type& sequence,
size_t context_length,
int context_start);
template <DeviceType Device>
void ContextProjectionBackwardWeight(
typename MatrixT<Device>::type* out_grad,
typename MatrixT<Device>::type* w_grad,
const typename SequenceT<Device>::type& seq_vec,
size_t context_length,
int context_start,
size_t total_pad,
size_t begin_pad);
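// Note (added for clarity): these two backward kernels are registered
// for the GPU only. The GPU ContextProjectionBackward dispatches to
// ContextProjectionBackwardData and ContextProjectionBackwardWeight
// separately, while the CPU version computes the input and weight
// gradients in a single pass.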
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "hl_base.h"
#include "ContextProjectionOp.h"
namespace paddle {
template <bool padding>
__global__ void KeContextProjectionForward(const real* input,
const int* sequence,
const real* weight,
real* output,
int input_dim,
int context_length,
int context_start,
int begin_pad) {
int idx = threadIdx.x;
int block_size = blockDim.x;
int sequenceId = blockIdx.x;
int seq_start = sequence[sequenceId];
int seq_end = sequence[sequenceId+1];
real value = 0;
int instances = seq_end - seq_start + context_length - 1;
output += seq_start * input_dim * context_length;
input += seq_start * input_dim;
for (int k = 0; k <= input_dim / block_size; k++) {
if (idx < input_dim) {
for (int i = 0; i < instances; i++) {
// relative input row for this instance: i + context_start
if ((i + context_start) < 0) {
if (padding) {
value = weight[i * input_dim + idx];
} else {
continue;
}
} else if ((i + context_start) >= (seq_end - seq_start)) {
if (padding) {
value =
weight[(begin_pad + i + context_start - (seq_end - seq_start)) *
input_dim + idx];
} else {
continue;
}
} else {
value = input[(i + context_start) * input_dim + idx];
}
int outx = (i - context_length) < 0 ? i : (context_length - 1);
int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
real* output_r =
output + outy * input_dim * context_length + outx * input_dim;
for (int j = outy; j < seq_end - seq_start; j++) {
output_r[idx] += value;
if (j - outy == outx) break;
output_r += (context_length - 1) * input_dim;
}
}
}
idx += block_size;
}
}
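// Reading guide (annotation added for clarity): instance i corresponds
// to relative input row r = i + context_start; its value lands in
// output row (i - j), column block j, for each valid j in
// [0, context_length). The inner loop walks that anti-diagonal: adding
// (context_length - 1) * input_dim to output_r advances one output row
// down and one column block to the left per step.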
/**
* @brief Context projection forward.
*
* @param[in] input input sequence.
* @param[in] sequence sequence index.
* @param[in] weight padding data.
* @param[out] output output sequence.
* @param[in] num_sequences number of sequences.
* @param[in] input_dim input sequence dimension.
* @param[in] context_length context length.
* @param[in] context_start context start.
* @param[in] begin_pad number of extra timesteps added at the
* beginning.
*
*/
void hl_context_projection_forward(const real* input,
const int* sequence,
const real* weight,
real* output,
size_t num_sequences,
size_t input_dim,
size_t context_length,
int context_start,
size_t begin_pad) {
CHECK_NOTNULL(input);
CHECK_NOTNULL(sequence);
CHECK_NOTNULL(output);
int block_size = 128;
int blocks_x = num_sequences;
int blocks_y = 1;
dim3 threads(block_size, 1);
dim3 grid(blocks_x, blocks_y);
if (weight) {
KeContextProjectionForward<true><<< grid, threads, 0, STREAM_DEFAULT >>>
(input, sequence, weight, output, input_dim,
context_length, context_start, begin_pad);
} else {
KeContextProjectionForward<false><<< grid, threads, 0, STREAM_DEFAULT >>>
(input, sequence, weight, output, input_dim,
context_length, context_start, begin_pad);
}
CHECK_SYNC("hl_context_projection_forward failed");
}
template <>
void ContextProjectionForward<DEVICE_TYPE_GPU>(GpuMatrix* output,
const GpuMatrix* input,
const GpuMatrix* weight,
const GpuIVector& sequence,
size_t context_length,
int context_start,
size_t begin_pad) {
CHECK(input && output);
hl_context_projection_forward(input->getData(),
sequence.getData(),
weight ? weight->getData() : nullptr,
output->getData(),
sequence.getSize() - 1,
input->getWidth(),
context_length,
context_start,
begin_pad);
}
__global__ void KeContextProjectionBackwardData(real* out_grad,
const int* sequence,
real* in_grad,
int input_dim,
int context_length,
int context_start) {
int idx = threadIdx.x;
int block_size = blockDim.x;
int sequenceId = blockIdx.x;
int seq_start = sequence[sequenceId];
int seq_end = sequence[sequenceId+1];
real value = 0;
int instances = seq_end - seq_start + context_length - 1;
out_grad += seq_start * input_dim * context_length;
in_grad += seq_start * input_dim;
for (int k = 0; k <= input_dim / block_size; k++) {
if (idx < input_dim) {
for (int i = 0; i < instances; i++) {
if ((i + context_start) < 0) {
continue;
} else if ((i + context_start) >= (seq_end - seq_start)) {
continue;
} else {
// accumulate on top of the existing input gradient
value = in_grad[(i + context_start) * input_dim + idx];
}
int outx = (i - context_length) < 0 ? i : (context_length - 1);
int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
real* output_r =
out_grad + outy * input_dim * context_length + outx * input_dim;
for (int j = outy; j < seq_end - seq_start; j++) {
value += output_r[idx];
if (j - outy == outx) break;
output_r += (context_length - 1) * input_dim;
}
in_grad[(i + context_start) * input_dim + idx] = value;
}
}
idx += block_size;
}
}
/**
* @brief Context projection backward data.
*
* @param[in] out_grad output gradient.
* @param[in] sequence sequence index.
* @param[out] input_grad input gradient.
* @param[in] num_sequences number of sequences.
* @param[in] input_dim input sequence dimension.
* @param[in] context_length context length.
* @param[in] context_start context start.
*
*/
void hl_context_projection_backward_data(real* out_grad,
const int* sequence,
real* input_grad,
size_t num_sequences,
size_t input_dim,
size_t context_length,
int context_start) {
CHECK_NOTNULL(out_grad);
CHECK_NOTNULL(sequence);
CHECK_NOTNULL(input_grad);
int block_size = 128;
int blocks_x = num_sequences;
int blocks_y = 1;
dim3 threads(block_size, 1);
dim3 grid(blocks_x, blocks_y);
KeContextProjectionBackwardData<<< grid, threads, 0, STREAM_DEFAULT >>>
(out_grad, sequence, input_grad, input_dim, context_length, context_start);
CHECK_SYNC("hl_context_projection_backward_data failed");
}
template <>
void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(GpuMatrix* out_grad,
GpuMatrix* in_grad,
const GpuIVector& sequence,
size_t context_length,
int context_start) {
CHECK(in_grad && out_grad);
hl_context_projection_backward_data(out_grad->getData(),
sequence.getData(),
in_grad->getData(),
sequence.getSize() - 1,
in_grad->getWidth(),
context_length,
context_start);
}
template<int THREADS_X, int THREADS_Y>
__global__ void KeContextProjectionBackwardWeight(real* out_grad,
const int* sequence,
real* w_grad,
int num_sequences,
int w_dim,
int context_length,
int context_start,
int begin_pad) {
__shared__ real sum_s[THREADS_Y][THREADS_X];
int pad_of_block = (w_dim + THREADS_X - 1) / THREADS_X;
const int idx = threadIdx.x;
const int idy = threadIdx.y;
int padId = blockIdx.x / pad_of_block;
int weight_idx = idx + THREADS_X * (blockIdx.x % pad_of_block);
int instanceId;
real value = 0;
real* output_r;
sum_s[idy][idx] = 0.0f;
if (weight_idx < w_dim) {
for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) {
int seq_start = sequence[seqId];
int seq_end = sequence[seqId+1];
output_r = out_grad + seq_start * w_dim * context_length;
if (context_start < 0) {
if (padId + context_start < 0) {
instanceId = padId;
} else {
// begin_pad > 0;
instanceId = (padId - begin_pad) +
(seq_end - seq_start) - context_start;
}
} else {
if (padId + (seq_end - seq_start) < context_start) {
continue;
} else {
// begin_pad == 0;
instanceId = padId + (seq_end - seq_start) - context_start;
}
}
int outx = (instanceId - context_length) < 0 ?
instanceId : (context_length - 1);
int outy = (instanceId - context_length) < 0 ?
0 : (instanceId - (context_length - 1));
output_r += outy * w_dim * context_length + outx * w_dim;
for (int j = outy; j < seq_end - seq_start; j++) {
value += output_r[weight_idx];
if (j - outy == outx) break;
output_r += (context_length - 1) * w_dim;
}
}
sum_s[idy][idx] = value;
}
__syncthreads();
for (int stride = THREADS_Y/2; stride > 0; stride = stride/2) {
if (idy < stride) {
sum_s[idy][idx] += sum_s[idy + stride][idx];
}
__syncthreads();
}
__syncthreads();
if (weight_idx < w_dim) {
if (idy == 0) {
w_grad[padId * w_dim + weight_idx] += sum_s[0][idx];
}
}
}
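// Reading guide (annotation added for clarity): each thread first
// accumulates a partial sum for one (padId, weight column) pair over
// the sequences assigned to its idy (seqId strides by THREADS_Y); the
// shared-memory loop then tree-reduces the THREADS_Y partial sums by
// halving the stride, and the idy == 0 row writes the result into
// w_grad.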
/**
* @brief Context projection backward weight.
*
* @param[in] out_grad output gradient.
* @param[in] sequence sequence index.
* @param[out] w_grad weight gradient.
* @param[in] num_sequences number of sequences.
* @param[in] w_dim weight dimension (equals the input dimension).
* @param[in] total_pad number of extra timesteps.
* @param[in] context_length context length.
* @param[in] context_start context start.
* @param[in] begin_pad number of extra timesteps added at the
* beginning.
*
*/
void hl_context_projection_backward_weight(real* out_grad,
const int* sequence,
real* w_grad,
size_t num_sequences,
size_t w_dim,
size_t total_pad,
size_t context_length,
int context_start,
size_t begin_pad) {
CHECK_NOTNULL(out_grad);
CHECK_NOTNULL(sequence);
CHECK_NOTNULL(w_grad);
int threads_x = 32;
int threads_y = 32;
int blocks_x = total_pad * ((w_dim + threads_x - 1) / threads_x);
dim3 threads(threads_x, threads_y);
dim3 grid(blocks_x, 1);
KeContextProjectionBackwardWeight<32, 32>
<<< grid, threads, 0, STREAM_DEFAULT >>>
(out_grad, sequence, w_grad, num_sequences, w_dim,
context_length, context_start, begin_pad);
CHECK_SYNC("hl_context_projection_backward_weight failed");
}
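// Grid-size example (illustrative numbers): with w_dim = 200 and
// THREADS_X = 32, each padding row needs ceil(200 / 32) = 7 blocks;
// with total_pad = 4 the launch uses blocks_x = 4 * 7 = 28 blocks, each
// covering one (padId, 32-column slice) of the weight gradient.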
template <>
void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
GpuMatrix* out_grad,
GpuMatrix* w_grad,
const GpuIVector& seq_vec,
size_t context_length,
int context_start,
size_t total_pad,
size_t begin_pad) {
CHECK(out_grad && w_grad);
hl_context_projection_backward_weight(out_grad->getData(),
seq_vec.getData(),
w_grad->getData(),
seq_vec.getSize() - 1,
w_grad->getWidth(),
total_pad,
context_length,
context_start,
begin_pad);
}
template <>
void ContextProjectionBackward<DEVICE_TYPE_GPU>(GpuMatrix* out_grad,
GpuMatrix* in_grad,
GpuMatrix* w_grad,
const GpuIVector& sequence,
size_t context_length,
int context_start,
size_t begin_pad,
bool is_padding,
size_t total_pad) {
CHECK(out_grad);
if (in_grad) {
ContextProjectionBackwardData<DEVICE_TYPE_GPU>(
out_grad,
in_grad,
sequence,
context_length,
context_start);
}
if (is_padding && w_grad) {
ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
out_grad,
w_grad,
sequence,
context_length,
context_start,
total_pad,
begin_pad);
}
}
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "FunctionTest.h"
#include "paddle/gserver/tests/TestUtil.h"
#include "paddle/math/Matrix.h"
using namespace paddle; // NOLINT
void testMatrixProjectionForward(int context_start,
size_t context_length,
bool is_padding,
size_t batch_size,
size_t input_dim) {
size_t pad = std::max(0, -context_start) +
std::max(0, (int)(context_start + context_length - 1));
if (pad == 0) is_padding = false;
FunctionCompare compare("ContextProjectionForward",
FuncConfig()
.set("context_length", context_length)
.set("context_start", context_start)
.set("begin_pad", std::max(0, -context_start)));
CpuMatrix cpu_in(batch_size, input_dim);
cpu_in.randomizeUniform();
GpuMatrix gpu_in(batch_size, input_dim);
gpu_in.copyFrom(cpu_in);
auto cpu_weight =
is_padding ? std::make_shared<CpuMatrix>(pad, input_dim) : nullptr;
auto gpu_weight =
is_padding ? std::make_shared<GpuMatrix>(pad, input_dim) : nullptr;
if (is_padding) {
cpu_weight->randomizeUniform();
gpu_weight->copyFrom(*cpu_weight);
}
IVectorPtr cpu_seq;
generateSequenceStartPositions(batch_size, cpu_seq);
IVectorPtr gpu_seq = IVector::create(cpu_seq->getSize(), true);
gpu_seq->copyFrom(*cpu_seq);
CpuMatrix cpu_out(batch_size, input_dim * context_length);
GpuMatrix gpu_out(batch_size, input_dim * context_length);
cpu_out.randomizeUniform();
gpu_out.copyFrom(cpu_out);
compare.getCpuFunction()->calc(
{Tensor(cpu_in.getData(), Dims{batch_size, input_dim}),
Tensor(cpu_weight ? cpu_weight->getData() : nullptr,
Dims{pad, input_dim}),
Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
Dims{cpu_seq->getSize()})},
{Tensor(cpu_out.getData(), Dims{batch_size, input_dim * context_length})},
{});
compare.getGpuFunction()->calc(
{Tensor(gpu_in.getData(), Dims{batch_size, input_dim}),
Tensor(gpu_weight ? gpu_weight->getData() : nullptr,
Dims{pad, input_dim}),
Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
Dims{gpu_seq->getSize()})},
{Tensor(gpu_out.getData(), Dims{batch_size, input_dim * context_length})},
{});
autotest::TensorCheckEqual(cpu_out, gpu_out);
}
void testMatrixProjectionBackward(int context_start,
int context_length,
bool is_padding,
size_t batch_size,
size_t input_dim) {
size_t pad = std::max(0, -context_start) +
std::max(0, (int)(context_start + context_length - 1));
if (pad == 0) is_padding = false;
FunctionCompare compare("ContextProjectionBackward",
FuncConfig()
.set("context_length", context_length)
.set("context_start", context_start)
.set("begin_pad", std::max(0, -context_start))
.set("is_padding", is_padding)
.set("total_pad", pad));
CpuMatrix cpu_in_grad(batch_size, input_dim);
cpu_in_grad.randomizeUniform();
GpuMatrix gpu_in_grad(batch_size, input_dim);
gpu_in_grad.copyFrom(cpu_in_grad);
CpuMatrix cpu_out_grad(batch_size, input_dim * context_length);
cpu_out_grad.randomizeUniform();
GpuMatrix gpu_out_grad(batch_size, input_dim * context_length);
gpu_out_grad.copyFrom(cpu_out_grad);
IVectorPtr cpu_seq;
generateSequenceStartPositions(batch_size, cpu_seq);
IVectorPtr gpu_seq = IVector::create(cpu_seq->getSize(), true);
gpu_seq->copyFrom(*cpu_seq);
auto cpu_w_grad =
is_padding ? std::make_shared<CpuMatrix>(pad, input_dim) : nullptr;
auto gpu_w_grad =
is_padding ? std::make_shared<GpuMatrix>(pad, input_dim) : nullptr;
if (is_padding) {
cpu_w_grad->randomizeUniform();
gpu_w_grad->copyFrom(*cpu_w_grad);
}
compare.getCpuFunction()->calc(
{Tensor(cpu_in_grad.getData(), Dims{batch_size, input_dim}),
Tensor(cpu_w_grad ? cpu_w_grad->getData() : nullptr,
Dims{pad, input_dim}),
Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
Dims{cpu_seq->getSize()})},
{Tensor(cpu_out_grad.getData(),
Dims{batch_size, input_dim * context_length})},
{});
compare.getGpuFunction()->calc(
{Tensor(gpu_in_grad.getData(), Dims{batch_size, input_dim}),
Tensor(gpu_w_grad ? gpu_w_grad->getData() : nullptr,
Dims{pad, input_dim}),
Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
Dims{gpu_seq->getSize()})},
{Tensor(gpu_out_grad.getData(),
Dims{batch_size, input_dim * context_length})},
{});
autotest::TensorCheckErr(cpu_in_grad, gpu_in_grad);
if (is_padding) {
autotest::TensorCheckErr(*cpu_w_grad, *gpu_w_grad);
}
}
TEST(ContextProjection, projection) {
for (auto context_start : {-5, -3, -1, 0, 3}) {
for (auto context_length : {1, 2, 5, 7}) {
for (auto trainable_padding : {false, true}) {
for (auto batch_size : {1, 2, 5, 20, 100}) {
for (auto input_dim : {15, 32, 63, 128, 200}) {
VLOG(3) << " context_start=" << context_start
<< " context_length=" << context_length
<< " trainable_padding=" << trainable_padding
<< " batch_size=" << batch_size
<< " input_dim=" << input_dim;
testMatrixProjectionForward(context_start,
context_length,
trainable_padding,
batch_size,
input_dim);
testMatrixProjectionBackward(context_start,
context_length,
trainable_padding,
batch_size,
input_dim);
}
}
}
}
}
}
......@@ -30,6 +30,20 @@ real FuncConfig::get<real>(const std::string& key) const {
return it->second.r;
}
template <>
int FuncConfig::get<int>(const std::string& key) const {
auto it = valueMap_.find(key);
CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
return it->second.i;
}
template <>
bool FuncConfig::get<bool>(const std::string& key) const {
auto it = valueMap_.find(key);
CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
return it->second.b;
}
template <>
FuncConfig& FuncConfig::set<size_t>(const std::string& key, size_t v) {
CHECK_EQ(valueMap_.count(key), 0) << "Duplicated value: " << key;
......@@ -44,6 +58,20 @@ FuncConfig& FuncConfig::set<real>(const std::string& key, real v) {
return *this;
}
template <>
FuncConfig& FuncConfig::set<int>(const std::string& key, int v) {
CHECK_EQ(valueMap_.count(key), 0) << "Duplicated value: " << key;
valueMap_[key].i = v;
return *this;
}
template <>
FuncConfig& FuncConfig::set<bool>(const std::string& key, bool v) {
CHECK_EQ(valueMap_.count(key), 0) << "Duplicated value: " << key;
valueMap_[key].b = v;
return *this;
}
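// Example of the chained setters these specializations enable (an
// illustrative sketch; the keys follow ContextProjection, and the
// comments note which union field each type maps to):
//   FuncConfig()
//       .set("context_length", (size_t)3)  // size_t -> value.s
//       .set("context_start", -1)          // int    -> value.i
//       .set("is_padding", true);          // bool   -> value.b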
ClassRegistrar<FunctionBase> FunctionBase::funcRegistrar_;
} // namespace paddle
......@@ -40,6 +40,19 @@ struct MatrixT<DEVICE_TYPE_GPU> {
using type = GpuMatrix;
};
template <DeviceType Device>
struct SequenceT;
template <>
struct SequenceT<DEVICE_TYPE_CPU> {
using type = CpuIVector;
};
template <>
struct SequenceT<DEVICE_TYPE_GPU> {
using type = GpuIVector;
};
typedef std::vector<size_t> Dims;
class Tensor {
......@@ -59,6 +72,8 @@ public:
union value {
size_t s;
real r;
int i;
bool b;
};
template <typename T>
......
......@@ -33,25 +33,33 @@ public:
// init cpu and gpu arguments
auto initArgs = [=](
Arguments& cpuArgs, Arguments& gpuArgs, const Arguments& inArgs) {
for (auto arg : inArgs) {
for (const auto arg : inArgs) {
size_t size = sizeof(real);
for (auto dim : arg.dims_) {
for (const auto dim : arg.dims_) {
size *= dim;
}
cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
cpuArgs.emplace_back(
Tensor((real*)cpuMemory.back()->getBuf(), arg.dims_));
gpuArgs.emplace_back(
Tensor((real*)gpuMemory.back()->getBuf(), arg.dims_));
// TODO: refactor this code behind an API.
CpuVector cpuVector(size / sizeof(real),
(real*)cpuArgs.back().getData());
GpuVector gpuVector(size / sizeof(real),
(real*)gpuArgs.back().getData());
cpuVector.uniform(0.001, 1);
gpuVector.copyFrom(cpuVector);
if (arg.getData()) {
// TODO(tianbing): this wastes memory unnecessarily.
cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
cpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_));
gpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_));
// data already initialized by the caller
} else {
cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
cpuArgs.emplace_back(
Tensor((real*)cpuMemory.back()->getBuf(), arg.dims_));
gpuArgs.emplace_back(
Tensor((real*)gpuMemory.back()->getBuf(), arg.dims_));
// TODO: refactor this code behind an API.
CpuVector cpuVector(size / sizeof(real),
(real*)cpuArgs.back().getData());
GpuVector gpuVector(size / sizeof(real),
(real*)gpuArgs.back().getData());
cpuVector.uniform(0.001, 1);
gpuVector.copyFrom(cpuVector);
}
}
};
initArgs(cpuInputs, gpuInputs, inputs);
......@@ -81,6 +89,10 @@ public:
checkArgs(cpuInouts, gpuInouts);
}
std::shared_ptr<FunctionBase> getCpuFunction() const { return cpu; }
std::shared_ptr<FunctionBase> getGpuFunction() const { return gpu; }
protected:
std::shared_ptr<FunctionBase> cpu;
std::shared_ptr<FunctionBase> gpu;
......
......@@ -38,6 +38,32 @@ ContextProjection::ContextProjection(const ProjectionConfig& config,
CHECK_EQ(inputDim * totalPad, parameter->getSize());
weight_.reset(new Weight(totalPad, inputDim, parameter));
}
// init forward_ and backward_ functions
init();
}
bool ContextProjection::init() {
size_t context_length = config_.context_length();
int context_start = config_.context_start();
bool is_padding = config_.trainable_padding();
size_t total_pad = is_padding ? beginPad_ + endPad_ : 0;
createFunction(forward_,
"ContextProjectionForward",
FuncConfig()
.set("context_length", context_length)
.set("context_start", context_start)
.set("begin_pad", beginPad_));
createFunction(backward_,
"ContextProjectionBackward",
FuncConfig()
.set("context_length", context_length)
.set("context_start", context_start)
.set("begin_pad", beginPad_)
.set("is_padding", is_padding)
.set("total_pad", total_pad));
return true;
}
void ContextProjection::resetState() {
......@@ -78,25 +104,29 @@ LayerStatePtr ContextProjection::getState() {
}
void ContextProjection::forward() {
CHECK(in_->value);
CHECK(in_->value && out_->value);
CHECK(in_->sequenceStartPositions);
auto startPositions = in_->sequenceStartPositions->getVector(useGpu_);
int64_t inputDim = in_->value->getWidth();
int64_t dim = out_->value->getWidth();
CHECK_EQ(dim, inputDim * config_.context_length());
size_t input_dim = in_->value->getWidth();
size_t dim = out_->value->getWidth();
CHECK_EQ(dim, input_dim * config_.context_length());
size_t batch_size = in_->value->getHeight();
CHECK_EQ(forward_.size(), 1) << "Only one forward function here";
REGISTER_TIMER_INFO("ContextProjectionForward", getName().c_str());
bool isPadding = config_.trainable_padding();
out_->value->contextProjectionForward(
*(in_->value),
state_ ? state_.get() : isPadding ? weight_->getW().get() : nullptr,
*startPositions,
config_.context_length(),
config_.context_start(),
beginPad_,
state_ ? true : isPadding);
bool is_padding = config_.trainable_padding();
/// use state_ first if set; otherwise use weight_ (no trainable padding means a null weight)
auto w_ptr =
state_ ? state_.get() : is_padding ? weight_->getW().get() : nullptr;
auto start_pos = in_->sequenceStartPositions;
forward_[0]->calc({Tensor(in_->value->getData(), Dims{batch_size, input_dim}),
Tensor(w_ptr ? w_ptr->getData() : nullptr,
Dims{w_ptr ? w_ptr->getHeight() : 0, input_dim}),
Tensor(reinterpret_cast<real*>(
const_cast<int*>(start_pos->getData(useGpu_))),
Dims{start_pos->getSize()})},
{Tensor(out_->value->getData(), Dims{batch_size, dim})},
{});
if (state_ && config_.context_start() < 0) {
CHECK_EQ(1, in_->getNumSequences());
......@@ -118,41 +148,27 @@ void ContextProjection::forward() {
}
void ContextProjection::backward(const UpdateCallback& callback) {
CHECK(in_->value);
int64_t inputDim = in_->value->getWidth();
int64_t dim = out_->value->getWidth();
CHECK_EQ(dim, inputDim * config_.context_length());
auto startPositions = in_->sequenceStartPositions->getVector(useGpu_);
CHECK(in_->value && out_->value && out_->grad);
size_t input_dim = in_->value->getWidth();
size_t dim = out_->value->getWidth();
CHECK_EQ(dim, input_dim * config_.context_length());
size_t batch_size = in_->value->getHeight();
CHECK_EQ(batch_size, out_->value->getHeight());
CHECK_EQ(backward_.size(), 1) << "Only one backward function here";
REGISTER_TIMER_INFO("ContextProjectionBackward", getName().c_str());
bool isPadding = config_.trainable_padding();
if (!out_->grad->useGpu()) {
out_->grad->contextProjectionBackward(
in_->grad.get(),
isPadding ? weight_->getWGrad().get() : nullptr,
*startPositions,
config_.context_length(),
config_.context_start(),
beginPad_,
isPadding);
} else {
if (in_->grad) {
out_->grad->contextProjectionBackwardData(*(in_->grad),
*startPositions,
config_.context_length(),
config_.context_start());
}
if (isPadding && weight_->getWGrad()) {
out_->grad->contextProjectionBackwardWeight(
*(weight_->getWGrad()),
*startPositions,
config_.context_length(),
config_.context_start(),
weight_->getWGrad()->getHeight(),
beginPad_);
}
}
bool is_padding = config_.trainable_padding();
auto start_pos = in_->sequenceStartPositions;
auto w_ptr = is_padding ? weight_->getWGrad() : nullptr;
backward_[0]->calc({Tensor(in_->grad ? in_->grad->getData() : nullptr,
Dims{batch_size, input_dim}),
Tensor(w_ptr ? w_ptr->getData() : nullptr,
Dims{w_ptr ? w_ptr->getHeight() : 0, input_dim}),
Tensor(reinterpret_cast<real*>(
const_cast<int*>(start_pos->getData(useGpu_))),
Dims{start_pos->getSize()})},
{Tensor(out_->grad->getData(), Dims{batch_size, dim})},
{});
if (config_.trainable_padding()) {
weight_->getParameterPtr()->incUpdate(callback);
......
......@@ -61,6 +61,8 @@ public:
virtual LayerStatePtr getState();
virtual bool init();
protected:
std::unique_ptr<Weight> weight_;
/// number of extra timesteps added at the beginning
......
......@@ -88,11 +88,37 @@ public:
*/
virtual LayerStatePtr getState() { return nullptr; }
/**
* Initialize the forward_ and backward_ functions.
*/
virtual bool init() { return true; }
/**
* Get output size of projection.
*/
size_t getOutputSize() const { return config_.output_size(); }
protected:
/**
* Create the projection function; it is called in forward or backward.
* \param function Projection::forward_ or Projection::backward_
* \param name function name
* \param config initialization configuration for the function
*/
void createFunction(std::vector<std::shared_ptr<FunctionBase>>& function,
const std::string& name,
const FuncConfig& config) {
if (useGpu_) {
function.emplace_back(
FunctionBase::funcRegistrar_.createByType(name + "-GPU"));
} else {
function.emplace_back(
FunctionBase::funcRegistrar_.createByType(name + "-CPU"));
}
auto& func = function.back();
func->init(config);
}
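// Example (as used by ContextProjection::init in this patch):
//   createFunction(forward_,
//                  "ContextProjectionForward",
//                  FuncConfig()
//                      .set("context_length", context_length)
//                      .set("context_start", context_start)
//                      .set("begin_pad", beginPad_));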
protected:
/// Config of projection
ProjectionConfig config_;
......@@ -106,5 +132,9 @@ protected:
const Argument* out_;
/// Store `passType` passed to forward()
PassType passType_;
/// Layer forward function
std::vector<std::shared_ptr<FunctionBase>> forward_;
/// Layer backward function
std::vector<std::shared_ptr<FunctionBase>> backward_;
};
} // namespace paddle
......@@ -1304,68 +1304,6 @@ void GpuMatrix::maxSequenceBackward(Matrix& outputGrad,
hl_max_sequence_backward(outGrad, maxIndex, inputGrad, numSequences, dim);
}
void GpuMatrix::contextProjectionForward(Matrix& input,
Matrix* weight,
const IVector& sequence,
int contextLength,
int contextStart,
size_t beginPad,
bool isPadding) {
CHECK(dynamic_cast<GpuMatrix*>(&input));
CHECK(dynamic_cast<const GpuIVector*>(&sequence));
if (weight) CHECK(dynamic_cast<GpuMatrix*>(weight));
CHECK_EQ(getWidth(), input.getWidth() * contextLength);
hl_context_projection_forward(input.getData(),
sequence.getData(),
isPadding ? weight->getData() : NULL,
getData(),
sequence.getSize() - 1,
input.getWidth(),
contextLength,
contextStart,
beginPad,
isPadding);
}
void GpuMatrix::contextProjectionBackwardData(Matrix& inputGrad,
const IVector& sequence,
int contextLength,
int contextStart) {
CHECK(dynamic_cast<GpuMatrix*>(&inputGrad));
CHECK(dynamic_cast<const GpuIVector*>(&sequence));
CHECK_EQ(getWidth(), inputGrad.getWidth() * contextLength);
hl_context_projection_backward_data(getData(),
sequence.getData(),
inputGrad.getData(),
sequence.getSize() - 1,
inputGrad.getWidth(),
contextLength,
contextStart);
}
void GpuMatrix::contextProjectionBackwardWeight(Matrix& weightGrad,
const IVector& sequence,
int contextLength,
int contextStart,
int totalPad,
size_t beginPad) {
CHECK(dynamic_cast<GpuMatrix*>(&weightGrad));
CHECK(dynamic_cast<const GpuIVector*>(&sequence));
CHECK_EQ(getWidth(), weightGrad.getWidth() * contextLength);
hl_context_projection_backward_weight(getData(),
sequence.getData(),
weightGrad.getData(),
sequence.getSize() - 1,
weightGrad.getWidth(),
totalPad,
contextLength,
contextStart,
beginPad);
}
void GpuMatrix::paramReluForward(Matrix& data, Matrix& W) {
CHECK(data.useGpu_ == true && W.useGpu_ == true)
<< "Matrix type are not equal";
......@@ -2203,113 +2141,6 @@ void CpuMatrix::maxSequenceBackward(Matrix& outputGrad,
}
}
void CpuMatrix::contextProjectionForward(Matrix& input,
Matrix* weight,
const IVector& sequence,
int contextLength,
int contextStart,
size_t beginPad,
bool isPadding) {
auto input_ptr = dynamic_cast<CpuMatrix*>(&input);
auto seq_ptr = dynamic_cast<const CpuIVector*>(&sequence);
CHECK(input_ptr && seq_ptr);
if (weight) CHECK(dynamic_cast<CpuMatrix*>(weight));
CHECK_EQ(getWidth(), input_ptr->getWidth() * contextLength);
const int* starts = seq_ptr->getData();
size_t numSequences = seq_ptr->getSize() - 1;
for (size_t i = 0; i < numSequences; ++i) {
for (int j = 0; j < contextLength; ++j) {
int begin = starts[i] + contextStart + j;
int end = starts[i + 1] + contextStart + j;
int dstBegin = starts[i];
int dstEnd = starts[i + 1];
if (begin < starts[i]) {
int64_t padSize =
std::min(starts[i] - begin, starts[i + 1] - starts[i]);
MatrixPtr mat = this->subMatrix(starts[i], padSize);
if (isPadding) {
MatrixPtr sub = weight->subMatrix(j, padSize);
mat->addAtOffset(*sub, j * input_ptr->getWidth());
}
dstBegin = starts[i] + padSize;
begin = starts[i];
}
if (end > starts[i + 1]) {
int64_t padSize =
std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
MatrixPtr mat = this->subMatrix(starts[i + 1] - padSize, padSize);
if (isPadding) {
MatrixPtr sub =
weight->subMatrix(beginPad + contextStart + j - padSize, padSize);
mat->addAtOffset(*sub, j * input_ptr->getWidth());
}
dstEnd = starts[i + 1] - padSize;
end = starts[i + 1];
}
if (end <= begin) continue;
MatrixPtr src = input_ptr->subMatrix(begin, end - begin);
MatrixPtr dst = this->subMatrix(dstBegin, dstEnd - dstBegin);
dst->addAtOffset(*src, j * input_ptr->getWidth());
}
}
}
void CpuMatrix::contextProjectionBackward(Matrix* inputGrad,
Matrix* weightGrad,
const IVector& sequence,
int contextLength,
int contextStart,
size_t beginPad,
bool isPadding) {
if (inputGrad) CHECK(dynamic_cast<CpuMatrix*>(inputGrad));
if (weightGrad) CHECK(dynamic_cast<CpuMatrix*>(weightGrad));
CHECK(dynamic_cast<const CpuIVector*>(&sequence));
int64_t inputDim = inputGrad ? inputGrad->getWidth()
: weightGrad ? weightGrad->getWidth() : 0;
CHECK_EQ(getWidth(), inputDim * contextLength);
const int* starts = sequence.getData();
size_t numSequences = sequence.getSize() - 1;
for (size_t i = 0; i < numSequences; ++i) {
for (int j = 0; j < contextLength; ++j) {
int begin = starts[i] + contextStart + j;
int end = starts[i + 1] + contextStart + j;
int dstBegin = starts[i];
int dstEnd = starts[i + 1];
if (begin < starts[i]) {
int64_t padSize =
std::min(starts[i] - begin, starts[i + 1] - starts[i]);
if (isPadding && weightGrad) {
MatrixPtr mat = this->subMatrix(starts[i], padSize);
MatrixPtr sub = weightGrad->subMatrix(j, padSize);
sub->addAtOffset(*mat, j * inputDim);
}
dstBegin = starts[i] + padSize;
begin = starts[i];
}
if (end > starts[i + 1]) {
int64_t padSize =
std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
if (isPadding && weightGrad) {
MatrixPtr mat = this->subMatrix(starts[i + 1] - padSize, padSize);
MatrixPtr sub = weightGrad->subMatrix(
beginPad + contextStart + j - padSize, padSize);
sub->addAtOffset(*mat, j * inputDim);
}
dstEnd = starts[i + 1] - padSize;
end = starts[i + 1];
}
if (end <= begin) continue;
if (!inputGrad) continue;
MatrixPtr src = inputGrad->subMatrix(begin, end - begin);
MatrixPtr dst = this->subMatrix(dstBegin, dstEnd - dstBegin);
src->addAtOffset(*dst, j * inputDim);
}
}
}
inline void vecAddTo(real* a, const real* b, size_t len) {
for (unsigned int i = 0; i < len; ++i) {
a[i] += b[i];
......
......@@ -972,42 +972,6 @@ public:
LOG(FATAL) << "Not implemented";
}
virtual void contextProjectionForward(Matrix& input,
Matrix* weight,
const IVector& sequence,
int contextLength,
int contextStart,
size_t beginPad,
bool isPadding) {
LOG(FATAL) << "Not implemented";
}
virtual void contextProjectionBackward(Matrix* inputGrad,
Matrix* weightGrad,
const IVector& sequence,
int contextLength,
int contextStart,
size_t beginPad,
bool isPadding) {
LOG(FATAL) << "Not implemented";
}
virtual void contextProjectionBackwardData(Matrix& inputGrad,
const IVector& sequence,
int contextLength,
int contextStart) {
LOG(FATAL) << "Not implemented";
}
virtual void contextProjectionBackwardWeight(Matrix& weightGrad,
const IVector& sequence,
int contextLength,
int contextStart,
int totalPad,
size_t beginPad) {
LOG(FATAL) << "Not implemented";
}
/**
* @code
* this.row[i] += table.row[ids[i]]
......@@ -1442,26 +1406,6 @@ public:
const IVector& sequence,
IVector& index);
void contextProjectionForward(Matrix& input,
Matrix* weight,
const IVector& sequence,
int contextLength,
int contextStart,
size_t beginPad,
bool isPadding);
void contextProjectionBackwardData(Matrix& inputGrad,
const IVector& sequence,
int contextLength,
int contextStart);
void contextProjectionBackwardWeight(Matrix& weightGrad,
const IVector& sequence,
int contextLength,
int contextStart,
int totalPad,
size_t beginPad);
void bilinearForward(const Matrix& in,
const size_t inImgH,
const size_t inImgW,
......@@ -1648,22 +1592,6 @@ public:
const IVector& sequence,
IVector& index);
void contextProjectionForward(Matrix& input,
Matrix* weight,
const IVector& sequence,
int contextLength,
int contextStart,
size_t beginPad,
bool isPadding);
void contextProjectionBackward(Matrix* inputGrad,
Matrix* weightGrad,
const IVector& sequence,
int contextLength,
int contextStart,
size_t beginPad,
bool isPadding);
real* getRow(size_t row) { return BaseMatrix::rowBuf(row); }
virtual real* getRowBuf(size_t row) { return getRow(row); }
......
......@@ -29,148 +29,6 @@ using namespace std; // NOLINT
using autotest::TensorCheckEqual;
using autotest::TensorCheckErr;
void testMatrixProjectionForward(int contextStart,
int contextLength,
bool padding,
int batchSize,
int inputDim) {
MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
cpuInput->randomizeUniform();
gpuInput->copyFrom(*cpuInput);
int pad = std::max(0, -contextStart) +
std::max(0, contextStart + contextLength - 1);
if (pad == 0) padding = false;
MatrixPtr cpuWeight = nullptr;
MatrixPtr gpuWeight = nullptr;
if (padding) {
cpuWeight = std::make_shared<CpuMatrix>(pad, inputDim);
gpuWeight = std::make_shared<GpuMatrix>(pad, inputDim);
cpuWeight->randomizeUniform();
gpuWeight->copyFrom(*cpuWeight);
}
IVectorPtr cpuSequence;
generateSequenceStartPositions(batchSize, cpuSequence);
IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
gpuSequence->copyFrom(*cpuSequence);
MatrixPtr cpuOutput =
std::make_shared<CpuMatrix>(batchSize, inputDim * contextLength);
MatrixPtr gpuOutput =
std::make_shared<GpuMatrix>(batchSize, inputDim * contextLength);
cpuOutput->randomizeUniform();
gpuOutput->copyFrom(*cpuOutput);
// calculate
int beginPad = std::max(0, -contextStart);
cpuOutput->contextProjectionForward(*cpuInput,
cpuWeight.get(),
*cpuSequence,
contextLength,
contextStart,
beginPad,
padding);
gpuOutput->contextProjectionForward(*gpuInput,
gpuWeight.get(),
*gpuSequence,
contextLength,
contextStart,
beginPad,
padding);
TensorCheckEqual(*cpuOutput, *gpuOutput);
}
void testMatrixProjectionBackward(int contextStart,
int contextLength,
bool padding,
int batchSize,
int inputDim) {
MatrixPtr cpuOutputGrad =
std::make_shared<CpuMatrix>(batchSize, inputDim * contextLength);
MatrixPtr gpuOutputGrad =
std::make_shared<GpuMatrix>(batchSize, inputDim * contextLength);
cpuOutputGrad->randomizeUniform();
gpuOutputGrad->copyFrom(*cpuOutputGrad);
IVectorPtr cpuSequence;
generateSequenceStartPositions(batchSize, cpuSequence);
IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
gpuSequence->copyFrom(*cpuSequence);
MatrixPtr cpuInputGrad = std::make_shared<CpuMatrix>(batchSize, inputDim);
MatrixPtr gpuInputGrad = std::make_shared<GpuMatrix>(batchSize, inputDim);
cpuInputGrad->randomizeUniform();
gpuInputGrad->copyFrom(*cpuInputGrad);
int pad = std::max(0, -contextStart) +
std::max(0, contextStart + contextLength - 1);
if (pad == 0) padding = false;
MatrixPtr cpuWeightGrad = nullptr;
MatrixPtr gpuWeightGrad = nullptr;
if (padding) {
cpuWeightGrad = std::make_shared<CpuMatrix>(pad, inputDim);
gpuWeightGrad = std::make_shared<GpuMatrix>(pad, inputDim);
cpuWeightGrad->randomizeUniform();
gpuWeightGrad->copyFrom(*cpuWeightGrad);
}
// calculate
int beginPad = std::max(0, -contextStart);
cpuOutputGrad->contextProjectionBackward(cpuInputGrad.get(),
cpuWeightGrad.get(),
*cpuSequence,
contextLength,
contextStart,
beginPad,
padding);
gpuOutputGrad->contextProjectionBackwardData(
*gpuInputGrad, *gpuSequence, contextLength, contextStart);
if (padding) {
gpuOutputGrad->contextProjectionBackwardWeight(*gpuWeightGrad,
*gpuSequence,
contextLength,
contextStart,
pad,
beginPad);
}
TensorCheckErr(*cpuInputGrad, *gpuInputGrad);
if (padding) {
TensorCheckErr(*cpuWeightGrad, *gpuWeightGrad);
}
}
TEST(Matrix, projection) {
for (auto contextStart : {-5, -3, -1, 0, 3}) {
for (auto contextLength : {1, 2, 5, 7}) {
for (auto trainablePadding : {false, true}) {
for (auto batchSize : {1, 2, 5, 20, 100}) {
for (auto inputDim : {15, 32, 63, 128, 200}) {
VLOG(3) << " contextStart=" << contextStart
<< " contextLength=" << contextLength
<< " trainablePadding=" << trainablePadding
<< " batchSize=" << batchSize << " inputDim=" << inputDim;
testMatrixProjectionForward(contextStart,
contextLength,
trainablePadding,
batchSize,
inputDim);
testMatrixProjectionBackward(contextStart,
contextLength,
trainablePadding,
batchSize,
inputDim);
}
}
}
}
}
}
void testMatrixMaxSequence(int batchSize, int inputDim) {
// forward
MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
......