Commit 45c81a41 authored by qingqing01, committed by GitHub

Add job=time in trainer, refine cudnn_conv to reduce GPU memory and speed up training. (#218)

* Add benchmarks for PaddlePaddle, TensorFlow and Caffe

* Add ConvProjection to reduce memory for GoogLeNet

* Add unit tests for ConvProjection.
1. Unit test in test_LayerGrad.
2. Compare ConvProjection with CudnnConvLayer; also compare concat_layer + img_conv_layer with concat_layer + conv_projection.

* Reduce cudnn_conv memory and add a benchmark document.
1. Use TmpMatrix as the workspace in cudnn_conv to reduce GPU memory; this saves a large amount of memory.
2. Add a benchmark document.
3. Fix smallnet_mnist_cifar.py in Paddle.

* Add job=time and refine cudnn_conv to reduce GPU memory and speed up training

* Refine cudnn_conv and the shared-bias operation in concat_layer and mixed_layer.

* Address review comments

* Address review comments

* Use unique_ptr to prevent memory leaks in CudnnConvLayer.
Parent 12945b2c
......@@ -183,7 +183,7 @@ It looks like there are a lot of arguments. However, most of them are for develo
</tr>
<tr>
<td class="left" rowspan = "5">GPU</td><td class="left">gpu_id</td>
<td class="left" rowspan = "6">GPU</td><td class="left">gpu_id</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
......@@ -207,6 +207,11 @@ It looks like there are a lot of arguments. However, most of them are for develo
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">cudnn_conv_workspace_limit_in_mb</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left" rowspan = "4">RNN</td>
<td class="left">beam_size</td>
......
......@@ -163,6 +163,10 @@
- Specify the path from which to dynamically load the NVIDIA CUDA library, for instance, /usr/local/cuda/lib64. [Default]: LD_LIBRARY_PATH
- type: string (default: "", null)
* `--cudnn_conv_workspace_limit_in_mb`
- Specify the maximum cuDNN convolution workspace size in MB. The default is 4096 MB (4 GB).
- type: int32 (default: 4096)
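- Example (illustrative value): `--cudnn_conv_workspace_limit_in_mb=1024` caps the cuDNN convolution workspace at 1 GB, which lowers GPU memory usage at a possible cost in convolution speed.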
## NLP: RNN/LSTM/GRU
* `--rnn_use_batch`
- Whether to use batch method for calculation in simple RecurrentLayer.
......
......@@ -48,5 +48,24 @@ inline __device__ double paddleAtomicAdd(double* address, double val) {
}
} // namespace paddle
/**
 * @brief sum reduction
 *
 * @param[in,out] smem     input data; should normally reside in __shared__ memory.
 * @param[in]     tid      thread index.
 * @param[in]     threads  the total number of threads used for the reduction,
 *                         e.g. blockDim.x; should be a power of two.
 *
 * @return smem[0]: the sum of all elements in smem.
 */
__device__ __forceinline__
void simpleReduce(real* smem, int tid, int threads) {
for (unsigned int s = threads / 2; s > 0; s >>= 1) {
if (tid < s) {
smem[tid] += smem[tid + s];
}
__syncthreads();
}
}
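// Illustrative sketch (not part of this header): a hypothetical single-block
// kernel, KeSumExample, that uses simpleReduce to sum n values into out[0].
// The kernel name and block size are assumptions for the example only.
//
//   template <int kThreads>  // kThreads must be a power of two, e.g. 128
//   __global__ void KeSumExample(const real* in, real* out, int n) {
//     __shared__ real smem[kThreads];
//     int tid = threadIdx.x;
//     real v = 0;
//     for (int i = tid; i < n; i += kThreads) v += in[i];  // per-thread partial sum
//     smem[tid] = v;
//     __syncthreads();                   // all partials written before reducing
//     simpleReduce(smem, tid, kThreads);
//     if (tid == 0) out[0] = smem[0];    // block-wide sum
//   }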
#endif /* HL_DEVICE_FUNCTIONS_CUH_ */
......@@ -229,4 +229,40 @@ extern void hl_cossim_derivative(real* grad,
int input2_height,
real scale);
/**
 * @brief Matrix addition: A_d[i][j] += scale * B_d[j / (dimN / channel)],
 *        i.e. each element of B_d is shared by dimN / channel consecutive
 *        columns of A_d.
 *
 * @param[in,out]  A_d     input/output matrix (M x N).
* @param[in] B_d input matrix (1 x channel).
* @param[in] channel width of B.
* @param[in] dimM height of A.
* @param[in] dimN width of A.
* @param[in] scale scalar used for addition.
*
*/
extern void hl_matrix_add_shared_bias(real* A_d,
real* B_d,
const int channel,
const int dimM,
const int dimN,
real scale);
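/*
 * Illustrative example (assumed values): with dimN = 6, channel = 2 and
 * scale = 1, each row [a0 a1 a2 a3 a4 a5] of A_d becomes
 * [a0+b0 a1+b0 a2+b0 a3+b1 a4+b1 a5+b1], since dimN / channel = 3 columns
 * share each bias element.
 */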
/**
 * @brief Collect bias: B_d[c] += scale * (sum over all dimM rows of the c-th
 *        column block of A_d), where each column block consists of
 *        dimN / channel consecutive columns.
 *
 * @param[in,out]  B_d     output bias (1 x channel).
 * @param[in]      A_d     input matrix (M x N).
* @param[in] channel width of B.
* @param[in] dimM height of A.
* @param[in] dimN width of A.
* @param[in] scale scalar used for addition.
*
*/
extern void hl_matrix_collect_shared_bias(real* B_d,
real* A_d,
const int channel,
const int dimM,
const int dimN,
real scale);
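/*
 * Illustrative example (assumed values): with dimM = 1, dimN = 6, channel = 2
 * and scale = 1, a row [a0 a1 a2 a3 a4 a5] of A_d contributes
 * B_d[0] += a0+a1+a2 and B_d[1] += a3+a4+a5.
 */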
#endif /* HL_MATRIX_H_ */
......@@ -101,4 +101,17 @@ inline void hl_cossim_derivative(real* grad,
int input2_height,
real scale) {}
inline void hl_matrix_add_shared_bias(real* A_d,
real* B_d,
const int channel,
const int dimM,
const int dimN,
real scale) {}
inline void hl_matrix_collect_shared_bias(real* B_d,
real* A_d,
const int channel,
const int dimM,
const int dimN,
real scale) {}
#endif // HL_MATRIX_STUB_H_
......@@ -20,6 +20,11 @@ limitations under the License. */
#include "hl_thread.ph"
#include "hl_dso_loader.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/CommandLineParser.h"
P_DEFINE_int32(cudnn_conv_workspace_limit_in_mb, 4096,
"Specify cuDNN max workspace limit, in units MB, "
"4096MB=4GB by default.");
namespace dynload {
......@@ -242,7 +247,7 @@ void hl_conv_workspace(hl_tensor_descriptor input,
CHECK_NOTNULL(conv);
// Specify workspace limit directly
size_t memoryLimitBytes = 8 * 1024 * 1024;
size_t memoryLimitBytes = (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
// cudnn convolution forward configuration
cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input);
......
......@@ -20,6 +20,7 @@ limitations under the License. */
#include "hl_sequence.h"
#include "paddle/utils/Logging.h"
#include "hl_device_functions.cuh"
#include "hl_gpu_matrix_kernel.cuh"
DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1*a + p2*b);
......@@ -673,3 +674,89 @@ void hl_cossim_derivative(real* grad,
input1_height, input2_height, scale);
CHECK_SYNC("hl_cossim_derivate failed");
}
__global__ void KeMatrixAddSharedBias(real* A,
real* B,
const int channel,
const int M,
const int N,
real scale) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int dim = N / channel;
if (index < M * N) {
int i = index % N;
i = i / dim;
A[index] += scale * B[i];
}
}
void hl_matrix_add_shared_bias(real* A_d,
real* B_d,
const int channel,
const int dimM,
const int dimN,
real scale) {
const int blocks = 512;
const int grids = DIVUP(dimM * dimN, blocks);
KeMatrixAddSharedBias<<<grids, blocks, 0, STREAM_DEFAULT>>>
(A_d, B_d, channel, dimM, dimN, scale);
CHECK_SYNC("hl_matrix_add_shared_bias failed");
}
template <int blockSize>
__global__ void KeMatrixCollectSharedBias(real *B,
real *A,
const int channel,
const int M,
const int N,
const int dim,
const int limit,
real scale) {
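// Two strategies: if each per-channel block is small (dim < limit), one
// thread accumulates an entire channel; otherwise each thread block reduces
// one channel using a shared-memory tree reduction (simpleReduce).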
if (dim < limit) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < channel) {
real sum = 0.0;
for (int i = 0; i < M; ++i) {
for (int j = 0; j < dim; ++j) {
sum += A[i * N + index * dim + j];
}
}
B[index] += scale * sum;
}
} else {
const int tid = threadIdx.x;
const int bid = blockIdx.x;
__shared__ real smem[blockSize];
real sum = 0.0;
for (int j = 0; j < ((dim * M + blockSize - 1) / blockSize); ++j) {
int n = j * blockSize + tid;
int m = n / dim;
int w = n % dim;
smem[tid] = (m < M && w < dim) ? A[m * N + bid * dim + w] : 0.0;
__syncthreads();
simpleReduce(smem, tid, blockSize);
sum += smem[0];
}
if (tid == 0) {
B[bid] += scale * sum;
}
}
}
void hl_matrix_collect_shared_bias(real* B_d,
real* A_d,
const int channel,
const int dimM,
const int dimN,
real scale) {
const int dim = dimN / channel;
const int blocks = 256;
const int limit = 64;
int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel;
KeMatrixCollectSharedBias<blocks>
<<< grids, blocks, 0, STREAM_DEFAULT>>>
(B_d, A_d, channel, dimM, dimN, dim, limit, scale);
CHECK_SYNC("hl_matrix_collect_shared_bias failed");
}
......@@ -908,24 +908,6 @@ int findIndex(int* indice, int num, int index) {
return (end - 1);
}
/**
* @brief sum reduction
*
* @param[in,out] smem input data, better to use __shared__ memory.
* @param[in] tid local thread index.
* @param[in] blockDimX the size of blockDim.x.
*
* note: return smem[0]: the sum of each elements of smem.
*/
__device__ __forceinline__
void reduce(real* smem, int tid, int blockDimX) {
for (unsigned int s = blockDimX / 2; s > 0; s >>= 1) {
if (tid < s) {
smem[tid] += smem[tid + s];
}
__syncthreads();
}
}
/**
* @brief sum columns of csr sparse matrix (csr_val), then add to a_val.
......
......@@ -97,7 +97,8 @@ void ConcatenateLayer::backward(const UpdateCallback& callback) {
*/
class ConcatenateLayer2 : public Layer {
public:
explicit ConcatenateLayer2(const LayerConfig& config) : Layer(config) {}
explicit ConcatenateLayer2(const LayerConfig& config) :
Layer(config) {}
~ConcatenateLayer2() {}
......@@ -110,6 +111,8 @@ protected:
std::vector<std::unique_ptr<Projection>> projections_;
std::vector<Argument> projOutput_;
std::vector<std::pair<size_t, size_t>> projCol_;
bool sharedBias_;
std::unique_ptr<Weight> biases_;
};
REGISTER_LAYER(concat2, ConcatenateLayer2);
......@@ -119,7 +122,6 @@ bool ConcatenateLayer2::init(const LayerMap& layerMap,
/* Initialize the basic parent class */
if (!Layer::init(layerMap, parameterMap)) return false;
CHECK(!biasParameter_);
CHECK_EQ(inputLayers_.size(), parameters_.size());
projections_.reserve(inputLayers_.size());
projCol_.reserve(inputLayers_.size());
......@@ -137,6 +139,13 @@ bool ConcatenateLayer2::init(const LayerMap& layerMap,
}
CHECK_EQ(getSize(), endCol);
/* initialize biases_ */
if (biasParameter_.get() != NULL) {
sharedBias_ = config_.shared_biases();
size_t psize = config_.bias_size();
biases_ = std::unique_ptr<Weight>(new Weight(1, psize, biasParameter_));
}
return true;
}
......@@ -154,8 +163,17 @@ void ConcatenateLayer2::forward(PassType passType) {
projOutput_[i].grad = output_.grad->subColMatrix(startCol, endCol);
}
for (size_t i = 0; i != inputLayers_.size(); ++i) {
projections_[i]->forward(&getInput(i), &projOutput_[i], passType);
{
AsyncGpuBlock block;
for (size_t i = 0; i != inputLayers_.size(); ++i) {
projections_[i]->forward(&getInput(i), &projOutput_[i], passType);
}
}
/* add the bias-vector */
if (biases_) {
REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
output_.value->addBias(*(biases_->getW()), 1, sharedBias_);
}
/* activation */ {
......@@ -170,6 +188,13 @@ void ConcatenateLayer2::backward(const UpdateCallback& callback) {
backwardActivation();
}
AsyncGpuBlock block;
if (biases_ && biases_->getWGrad()) {
REGISTER_TIMER_INFO("Concat2BpBiasTimer", getName().c_str());
biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBias_);
biases_->getParameterPtr()->incUpdate(callback);
}
for (size_t i = 0; i != inputLayers_.size(); ++i) {
if (projections_[i]) {
projections_[i]->backward(callback);
......
......@@ -35,25 +35,12 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
filterSizeY_.push_back(conf.filter_size_y());
filterPixels_.push_back(filterSize_.back() * filterSizeY_.back());
channels_.push_back(conf.channels());
imgSize_.push_back(conf.img_size());
imgPixels_.push_back(imgSize_.back() * imgSize_.back());
imgSizeH_.push_back(conf.img_size());
imgSizeW_.push_back(conf.img_size());
groups_.push_back(conf.groups());
filterChannels_.push_back(conf.filter_channels());
outputX_.push_back(conf.output_x());
outputs_.push_back(outputX_.back() * outputX_.back());
}
/* initialize the weightList */
CHECK(inputLayers_.size() == parameters_.size());
for (size_t i = 0; i < inputLayers_.size(); i++) {
size_t height, width;
height = filterPixels_[i] * filterChannels_[i];
width = numFilters_;
// create a new weight
CHECK_EQ(parameters_[i]->getSize(), width * height);
Weight* w = new Weight(height, width, parameters_[i]);
weights_.emplace_back(w);
outputH_.push_back(conf.output_x());
outputW_.push_back(conf.output_x());
}
/* initialize the biases_ */
......@@ -74,4 +61,34 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
return true;
}
size_t ConvBaseLayer::calOutputSize() {
auto clearAndReserve = [this](IntV* vec) {
vec->clear();
vec->reserve(this->inputLayers_.size());
};
clearAndReserve(&imgSizeH_);
clearAndReserve(&imgSizeW_);
clearAndReserve(&outputH_);
clearAndReserve(&outputW_);
size_t layerSize = 0;
for (size_t i = 0; i < inputLayers_.size(); i++) {
imgSizeH_.push_back(inputLayers_[i]->getOutput().getFrameHeight());
imgSizeW_.push_back(inputLayers_[i]->getOutput().getFrameWidth());
if (imgSizeH_[i] == 0)
imgSizeH_[i] = config_.inputs(i).conv_conf().img_size();
if (imgSizeW_[i] == 0)
imgSizeW_[i] = config_.inputs(i).conv_conf().img_size();
outputH_.push_back(
outputSize(imgSizeH_[i], filterSizeY_[i], paddingY_[i], strideY_[i]));
outputW_.push_back(
outputSize(imgSizeW_[i], filterSize_[i], padding_[i], stride_[i]));
CHECK_EQ(outputH_[i], outputH_[0]);
CHECK_EQ(outputW_[i], outputW_[0]);
}
getOutput().setFrameHeight(outputH_[0]);
getOutput().setFrameWidth(outputW_[0]);
layerSize = outputH_[0] * outputW_[0] * size_t(numFilters_);
return layerSize;
}
} // namespace paddle
......@@ -43,19 +43,18 @@ protected:
IntV filterSizeY_;
/// The number of channels of the convolution input.
IntV channels_;
/// The spatial dimensions of input feature map.
IntV imgSize_;
/// The total pixel size of input feature map.
/// imgPixels_ = imgSizeX_ * imgSizeY_.
IntV imgPixels_;
/// The height of each input feature map.
IntV imgSizeH_;
/// The width of each input feature map.
IntV imgSizeW_;
/// filterPixels_ = filterSizeX_ * filterSizeY_.
IntV filterPixels_;
/// filterChannels_ = channels_/groups_.
IntV filterChannels_;
/// The spatial dimensions of output feature map.
IntV outputX_;
/// The spatial dimensions of output feature map.
IntV outputs_;
/// The height of each output feature map.
IntV outputH_;
/// The width of each output feature map.
IntV outputW_;
/// Group size, refer to grouped convolution in
/// Alex Krizhevsky's paper: when group=2, the first half of the
/// filters are only connected to the first half of the input channels,
......@@ -80,6 +79,13 @@ public:
virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
/**
 * Set imgSizeH_ and imgSizeW_ according to the input layers, then compute
 * outputH_ and outputW_ and store them in the output argument. Returns the
 * output size of the layer (outputH_ * outputW_ * numFilters_).
 */
virtual size_t calOutputSize();
Weight& getWeight(int idx) { return *weights_[idx]; }
/**
......
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/utils/Stat.h"
#include "ConvProjection.h"
namespace paddle {
REGISTER_PROJECTION(conv, ConvProjection);
ThreadLocalD<std::vector<MemoryHandle*>> ConvProjection::convMem_;
ConvProjection::ConvProjection(const ProjectionConfig& config,
ParameterPtr parameter, bool useGpu)
: Projection(config, parameter, useGpu) {
CHECK(useGpu); // only support GPU
getConvParams();
initCudnn();
size_t height = filterH_ * filterW_ * channels_ / groups_;
size_t width = numFilters_;
weight_.reset(new Weight(height, width, parameter));
weightOffset_ = height * width / groups_;
}
void ConvProjection::getConvParams() {
const ConvConfig &conf = config_.conv_conf();
paddingH_ = conf.padding_y();
paddingW_ = conf.padding();
strideH_ = conf.stride_y();
strideW_ = conf.stride();
filterH_ = conf.filter_size_y();
filterW_ = conf.filter_size();
configImgH_ = conf.img_size();
configImgW_ = conf.img_size();
channels_ = conf.channels();
numFilters_ = config_.num_filters();
groups_ = conf.groups();
CHECK_EQ(channels_ % groups_, 0);
CHECK_EQ(numFilters_ % groups_, 0);
}
void ConvProjection::initCudnn() {
hl_create_filter_descriptor(&filterDesc_, channels_, numFilters_,
filterH_, filterW_);
hl_create_tensor_descriptor(&inputDesc_);
hl_create_tensor_descriptor(&outputDesc_);
hl_create_convolution_descriptor(&convDesc_, inputDesc_, filterDesc_,
paddingH_, paddingW_, strideH_, strideW_);
// initialize all to default algorithms
fwdAlgo_ = 0;
bwdFilterAlgo_ = 0;
bwdDataAlgo_ = 0;
fwdLimitBytes_ = 0;
bwdDataLimitBytes_ = 0;
bwdFilterLimitBytes_ = 0;
workSpaceInBytes_ = 0;
batchNum_ = 0;
isSelectAlgo_ = false;
}
void ConvProjection::reshapeTensorDesc(int batchSize) {
hl_tensor_reshape(inputDesc_, batchSize, channels_, imageH_, imageW_,
channels_ * imageH_ * imageW_, imageH_ * imageW_,
imageW_, 1);
hl_reset_convolution_descriptor(convDesc_, inputDesc_, filterDesc_,
paddingH_, paddingW_, strideH_, strideW_);
// The row stride between two consecutive samples in the output of
// ConvProjection may be larger than this projection's own output size.
// For example, when a ConcatenateLayer2 contains two ConvProjections,
// the stride equals the output size of the ConcatenateLayer2. Hence the
// nStride calculation differs from CudnnConvLayer; in fact,
// "nStride = out_->value->getStride()" alone would be sufficient.
size_t nStride = numFilters_ * outputH_ * outputW_;
if (out_->value->isContiguous()) {
CHECK_EQ(nStride, out_->value->getWidth());
} else {
nStride = out_->value->getStride();
}
hl_tensor_reshape(outputDesc_, batchSize, numFilters_, outputH_, outputW_,
nStride, outputH_ * outputW_, outputW_, 1);
}
void ConvProjection::reshape(int batchSize) {
size_t width = calOutputSize();
CHECK_EQ(width, out_->value->getWidth());
isSelectAlgo_ = (batchSize == batchNum_);
batchNum_ = batchSize;
if (!isSelectAlgo_) {
reshapeTensorDesc(batchSize);
hl_conv_workspace(inputDesc_, outputDesc_, filterDesc_,
convDesc_, &fwdAlgo_, &fwdLimitBytes_,
&bwdDataAlgo_, &bwdDataLimitBytes_,
&bwdFilterAlgo_, &bwdFilterLimitBytes_);
size_t maxWorkSpace = 0;
maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_);
workSpaceInBytes_ = maxWorkSpace;
VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_
<< " / " << bwdDataAlgo_
<< " / " << bwdFilterAlgo_;
}
isSelectAlgo_ = true;
}
void ConvProjection::forward() {
int batchSize = in_->value->getHeight();
reshape(batchSize);
void* workSpace = NULL;
if (workSpaceInBytes_ > 0) {
workSpace = getSpaceBytes(workSpaceInBytes_);
}
for (int g = 0; g < groups_; ++g) {
REGISTER_TIMER_INFO("CudnnConvFwTimer", getName().c_str());
real *inputData = in_->value->getData() + g * inputOffset_;
real *wgtData = weight_->getW()->getData() + g * weightOffset_;
real *outData = out_->value->getData() + g * outputOffset_;
hl_convolution_forward(inputDesc_, inputData, outputDesc_,
outData, filterDesc_, wgtData,
convDesc_, workSpace,
fwdLimitBytes_, fwdAlgo_);
}
}
void ConvProjection::backward(const UpdateCallback& callback) {
REGISTER_TIMER_INFO("CudnnConvBpTimer", getName().c_str());
void* workSpace = NULL;
if (workSpaceInBytes_ > 0) {
workSpace = getSpaceBytes(workSpaceInBytes_);
}
for (int g = 0; g < groups_; ++g) {
real *outGrad = out_->grad->getData() + g * outputOffset_;
if (weight_->getWGrad()) {
real *inputData = in_->value->getData() + g * inputOffset_;
real *weightGrad = weight_->getWGrad()->getData() + g * weightOffset_;
hl_convolution_backward_filter(
inputDesc_, inputData, outputDesc_, outGrad, filterDesc_,
weightGrad, convDesc_, workSpace, bwdFilterLimitBytes_,
bwdFilterAlgo_);
}
MatrixPtr preGrad = in_->grad;
if (NULL != preGrad) {
real *inputGrad = preGrad->getData() + g * inputOffset_;
real *wgtData = weight_->getW()->getData() + g* weightOffset_;
hl_convolution_backward_data(
inputDesc_, inputGrad, outputDesc_, outGrad, filterDesc_,
wgtData, convDesc_, workSpace, bwdDataLimitBytes_,
bwdDataAlgo_);
}
}
weight_->getParameterPtr()->incUpdate(callback);
}
void* ConvProjection::getSpaceBytes(size_t size) {
std::vector<MemoryHandle*>& convMem = *convMem_;
if (convMem.empty()) {
int numDevices = hl_get_device_count();
convMem.resize(numDevices);
}
int devId = hl_get_device();
MemoryHandle** localMem = &(convMem[devId]);
if (NULL == *localMem || size > (*localMem)->getAllocSize()) {
*localMem = new GpuMemoryHandle(size);
}
return (*localMem)->getBuf();
}
ConvProjection::~ConvProjection() {
hl_destroy_tensor_descriptor(inputDesc_);
hl_destroy_tensor_descriptor(outputDesc_);
hl_destroy_filter_descriptor(filterDesc_);
hl_destroy_convolution_descriptor(convDesc_);
}
} // namespace paddle
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "Projection.h"
namespace paddle {
/**
 * @brief Convolution projection that performs the same calculation as CudnnConvLayer.
*/
class ConvProjection : public Projection {
public:
/**
* Constructor.
*/
ConvProjection(const ProjectionConfig& config, ParameterPtr parameter,
bool useGpu);
~ConvProjection();
virtual void forward();
virtual void backward(const UpdateCallback& callback);
protected:
void getConvParams();
void initCudnn();
void reshapeTensorDesc(int batchSize);
void reshape(int batchSize);
int outputSize(int imageSize, int filterSize, int padding, int stride) {
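    // Example with assumed values: imageSize = 16, filterSize = 2,
    // padding = 0, stride = 2  ->  (16 - 2 + 0) / 2 + 1 = 8.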
return (imageSize - filterSize + 2 * padding) / stride + 1;
}
size_t calOutputSize() {
imageH_ = in_->getFrameHeight();
imageW_ = in_->getFrameWidth();
if (imageH_ == 0) imageH_ = configImgH_;
if (imageW_ == 0) imageW_ = configImgW_;
outputH_ = outputSize(imageH_, filterH_, paddingH_, strideH_);
outputW_ = outputSize(imageW_, filterW_, paddingW_, strideW_);
const_cast<Argument*>(out_)->setFrameHeight(outputH_);
const_cast<Argument*>(out_)->setFrameWidth(outputW_);
inputOffset_ = (channels_ / groups_) * imageH_ * imageW_;
outputOffset_ = (numFilters_ / groups_) * outputH_ * outputW_;
return outputH_ * outputW_ * numFilters_;
}
static void* getSpaceBytes(size_t size);
/// imageH_ and imageW_ are calculated from the input layer.
int imageH_, imageW_;
/// configImgH_ and configImgW_ are obtained from the config.
int configImgH_, configImgW_;
int outputH_, outputW_;
int channels_, numFilters_;
int paddingH_, paddingW_;
int strideH_, strideW_;
int filterH_, filterW_;
/// One group offset of input data.
int inputOffset_;
/// One group offset of output data.
int outputOffset_;
/// One group offset of weight.
int weightOffset_;
int groups_;
/// Cudnn tensor descriptor for input.
hl_tensor_descriptor inputDesc_;
/// Cudnn tensor descriptor for output.
hl_tensor_descriptor outputDesc_;
/// Cudnn tensor descriptor for filter.
hl_filter_descriptor filterDesc_;
/// Cudnn tensor descriptor for a convolution operation.
hl_convolution_descriptor convDesc_;
/// Record the algorithm for forward convolution, which is obtained from the
/// cuDNN API that searches for the best-suited algorithm.
int fwdAlgo_;
/// Record the algorithm for computing convolution gradient with respect to
/// filter coefficients.
int bwdFilterAlgo_;
/// Record the algorithm for computing convolution gradient with respect to
/// the input data.
int bwdDataAlgo_;
/// Amount of GPU memory needed as workspace to be able to execute a
/// forward convolution with the specified algo.
size_t fwdLimitBytes_;
/// Amount of GPU memory needed as workspace to be able to execute a
/// backwardData with the specified algo.
size_t bwdDataLimitBytes_;
/// Amount of GPU memory needed as workspace to be able to execute a
/// backwardFilter with the specified algo.
size_t bwdFilterLimitBytes_;
/// Size of total work space.
size_t workSpaceInBytes_;
/// Whether to call cuDNN api to choose conv algorithm.
bool isSelectAlgo_;
/// batchNum_ records the batch size. If the batch size changes, the
/// algorithm selection will be run again.
int batchNum_;
bool bias_;
std::unique_ptr<Weight> weight_;
static ThreadLocalD<std::vector<MemoryHandle*>> convMem_;
};
} // namespace paddle
......@@ -22,215 +22,64 @@ REGISTER_LAYER(cudnn_conv, CudnnConvLayer);
bool CudnnConvLayer::init(const LayerMap &layerMap,
const ParameterMap &parameterMap) {
ConvBaseLayer::init(layerMap, parameterMap);
if (!ConvBaseLayer::init(layerMap, parameterMap)) return false;
CHECK(useGpu_) << "CudnnConvLayer only support gpu";
maxGroups_ = 0;
for (size_t i = 0; i < inputLayers_.size(); i++) {
CHECK_EQ(channels_[i] % groups_[i], 0);
CHECK_EQ(numFilters_ % groups_[i], 0);
hl_filter_descriptor filter;
hl_create_filter_descriptor(&filter, channels_[i] / groups_[i],
numFilters_ / groups_[i], filterSizeY_[i],
filterSize_[i]);
filterDesc_.push_back(filter);
hl_tensor_descriptor input;
hl_create_tensor_descriptor(&input);
inputDesc_.push_back(input);
hl_tensor_descriptor output;
int outputX =
outputSize(imgSize_[i], filterSize_[i], padding_[i], stride_[i]);
CHECK_EQ(outputX, outputX_[i]);
hl_create_tensor_descriptor(&output);
outputDesc_.push_back(output);
CHECK_EQ(inputLayers_.size(), parameters_.size());
projections_.reserve(inputLayers_.size());
projConf_.reserve(inputLayers_.size());
hl_convolution_descriptor conv;
hl_create_convolution_descriptor(&conv, input, filter, paddingY_[i],
padding_[i], strideY_[i], stride_[i]);
convDesc_.push_back(conv);
weightOffset_.push_back((numFilters_ / groups_[i]) *
(channels_[i] / groups_[i]) * filterPixels_[i]);
inputOffset_.push_back((channels_[i] / groups_[i]) * imgSize_[i] *
imgSize_[i]);
outputOffset_.push_back((numFilters_ / groups_[i]) * outputX_[i] *
outputX_[i]);
// initialize all to default algorithms
fwdAlgo_.push_back(0);
bwdFilterAlgo_.push_back(0);
bwdDataAlgo_.push_back(0);
fwdLimitBytes_.push_back(0);
bwdFilterLimitBytes_.push_back(0);
bwdDataLimitBytes_.push_back(0);
// cudnn streams per group equal to 1
if (groups_[i] > maxGroups_) {
maxGroups_ = groups_[i];
}
}
workSpaceInBytes_ = 0;
workSpaceData_ = NULL;
for (int i = 0; i < maxGroups_; ++i) {
workSpace_.push_back(NULL);
numFilters_ = config_.num_filters();
CHECK(config_.shared_biases());
for (size_t i = 0; i < inputLayers_.size(); i++) {
ProjectionConfig* conf = new ProjectionConfig();
conf->set_type("conv");
conf->set_num_filters(numFilters_);
conf->set_allocated_conv_conf(
config_.mutable_inputs(i)->mutable_conv_conf());
conf->set_input_size(getPrev(i)->getSize());
conf->set_output_size(getSize());
projConf_.emplace_back(conf);
projections_.emplace_back(Projection::create(*projConf_[i],
parameters_[i], useGpu_));
}
if (biases_.get() && sharedBiases_) {
hl_create_tensor_descriptor(&biasDesc_);
hl_create_tensor_descriptor(&outputDesc_);
hl_tensor_reshape(biasDesc_, 1, numFilters_ / groups_[0], 1, 1);
biasOffset_ = numFilters_ / groups_[0];
}
batchNum_ = 0;
isSelectAlgo_ = false;
return true;
}
void CudnnConvLayer::allocConvWorkSpace(size_t maxWorkSpace) {
size_t totalWorkSpace = maxWorkSpace * maxGroups_;
if (totalWorkSpace > workSpaceInBytes_) {
if (workSpaceInBytes_ != 0) {
hl_free_mem_device(workSpaceData_);
}
// total amount of storage needed over all groups
workSpaceData_ = hl_malloc_device(totalWorkSpace);
// update work space address for each group
for (int i = 0; i < maxGroups_; ++i) {
workSpace_[i] = reinterpret_cast<char *>(workSpaceData_)
+ i * maxWorkSpace;
}
workSpaceInBytes_ = totalWorkSpace;
}
}
void CudnnConvLayer::reshape(int batchSize) {
CHECK_NE(inputLayers_.size(), 0UL);
imageH_ = inputLayers_[0]->getOutput().getFrameHeight();
imageW_ = inputLayers_[0]->getOutput().getFrameWidth();
if (imageH_ == 0) imageH_ = imgSize_[0];
if (imageW_ == 0) imageW_ = imgSize_[0];
for (size_t i = 1; i < inputLayers_.size(); i++) {
int imageH = inputLayers_[i]->getOutput().getFrameHeight();
int imageW = inputLayers_[i]->getOutput().getFrameWidth();
if (imageH) {
CHECK_EQ(imageH_, imageH) << "Inputs must have same height.";
}
if (imageW) {
CHECK_EQ(imageW_, imageW) << "Inputs must have same width.";
}
}
outputH_ = outputSize(imageH_, filterSizeY_[0], paddingY_[0], strideY_[0]);
outputW_ = outputSize(imageW_, filterSize_[0], padding_[0], stride_[0]);
// check outputH & outputW
getOutput().setFrameHeight(outputH_);
getOutput().setFrameWidth(outputW_);
// if the batchSize remains the same, set isSelectAlgo_ true.
// Otherwise, set isSelectAlgo_ false and select algo again.
isSelectAlgo_ = (batchSize == batchNum_);
batchNum_ = batchSize;
size_t maxWorkSpace = 0;
for (size_t i = 0; i < inputLayers_.size(); i++) {
CHECK_EQ(inputLayers_[i]->getOutput().value->getWidth(),
(size_t)(channels_[i] * imageH_ * imageW_));
hl_tensor_reshape(inputDesc_[i], batchSize, channels_[i] / groups_[i],
imageH_, imageW_, channels_[i] * imageH_ * imageW_,
imageH_ * imageW_, imageW_, 1);
hl_tensor_reshape(outputDesc_[i], batchSize, numFilters_ / groups_[i],
outputH_, outputW_, numFilters_ * outputH_ * outputW_,
outputH_ * outputW_, outputW_, 1);
hl_reset_convolution_descriptor(convDesc_[i], inputDesc_[i],
filterDesc_[i], paddingY_[i],
padding_[i], strideY_[i], stride_[i]);
inputOffset_[i] = (channels_[i] / groups_[i]) * imageH_ * imageW_;
outputOffset_[i] = (numFilters_ / groups_[i]) * outputH_ * outputW_;
if (!isSelectAlgo_) {
hl_conv_workspace(inputDesc_[i], outputDesc_[i], filterDesc_[i],
convDesc_[i], &fwdAlgo_[i], &fwdLimitBytes_[i],
&bwdDataAlgo_[i], &bwdDataLimitBytes_[i],
&bwdFilterAlgo_[i], &bwdFilterLimitBytes_[i]);
maxWorkSpace = std::max(fwdLimitBytes_[i], bwdDataLimitBytes_[i]);
maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_[i]);
VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_[i]
<< " / " << bwdDataAlgo_[i]
<< " / " << bwdFilterAlgo_[i];
}
}
if (!isSelectAlgo_) {
allocConvWorkSpace(maxWorkSpace);
}
isSelectAlgo_ = true;
}
void CudnnConvLayer::forward(PassType passType) {
Layer::forward(passType);
int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
reshape(batchSize);
resetOutput(batchSize, outputH_ * outputW_ * numFilters_);
int batchSize = getInput(0).getBatchSize();
resetOutput(batchSize, calOutputSize());
for (size_t i = 0; i != inputLayers_.size(); ++i) {
REGISTER_TIMER_INFO("CudnnConvFwTimer", getName().c_str());
for (int g = 0; g < groups_[i]; ++g) {
real *inputData = getInputValue(i)->getData() + inputOffset_[i] * g;
real *wgtData = weights_[i]->getW()->getData() + weightOffset_[i] * g;
real *outData = getOutputValue()->getData() + outputOffset_[i] * g;
hl_convolution_forward(inputDesc_[i], inputData, outputDesc_[i],
outData, filterDesc_[i], wgtData,
convDesc_[i], workSpace_[g],
fwdLimitBytes_[i], fwdAlgo_[i]);
}
projections_[i]->forward(&getInput(i), &getOutput(), passType);
}
if (biases_) {
REGISTER_TIMER_INFO("CudnnConvBiasTimer", getName().c_str());
addBiases();
}
forwardActivation();
}
void CudnnConvLayer::addBiases() {
if (sharedBiases_) {
int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
hl_tensor_reshape(outputDesc_, batchSize, numFilters_ / groups_[0],
outputH_[0], outputW_[0], numFilters_ * outputH_[0] * outputW_[0],
outputH_[0] * outputW_[0], outputW_[0], 1);
outputOffset_ = getOutputValue()->getWidth() / groups_[0];
for (int g = 0; g < groups_[0]; ++g) {
real *biasData = biases_->getW()->getData() + biasOffset_ * g;
real *outData = getOutputValue()->getData() + outputOffset_[0] * g;
real *outData = getOutputValue()->getData() + outputOffset_ * g;
hl_convolution_forward_add_bias(biasDesc_, biasData,
outputDesc_[0], outData);
outputDesc_, outData);
}
} else {
LOG(FATAL) << "Not supported";
}
}
void CudnnConvLayer::bpropBiases() {
if (sharedBiases_) {
for (int g = 0; g < groups_[0]; ++g) {
real *biasGrad = biases_->getWGrad()->getData() + biasOffset_ * g;
real *outGrad = getOutputGrad()->getData() + outputOffset_[0] * g;
hl_convolution_backward_bias(biasDesc_, biasGrad,
outputDesc_[0], outGrad);
}
} else {
LOG(FATAL) << "Not supported";
}
forwardActivation();
}
void CudnnConvLayer::backward(const UpdateCallback &callback) {
......@@ -238,52 +87,23 @@ void CudnnConvLayer::backward(const UpdateCallback &callback) {
if (biases_ && biases_->getWGrad()) {
REGISTER_TIMER_INFO("CudnnConvBpBiasTimer", getName().c_str());
bpropBiases();
for (int g = 0; g < groups_[0]; ++g) {
real *biasGrad = biases_->getWGrad()->getData() + biasOffset_ * g;
real *outGrad = getOutputGrad()->getData() + outputOffset_ * g;
hl_convolution_backward_bias(biasDesc_, biasGrad, outputDesc_, outGrad);
}
biases_->getParameterPtr()->incUpdate(callback);
}
for (size_t i = 0; i != inputLayers_.size(); ++i) {
REGISTER_TIMER_INFO("CudnnConvBpTimer", getName().c_str());
for (int g = 0; g < groups_[i]; ++g) {
real *outGrad = getOutputGrad()->getData() + outputOffset_[i] * g;
if (weights_[i]->getWGrad()) {
real *inputData = getInputValue(i)->getData() + inputOffset_[i] * g;
real *weightGrad =
weights_[i]->getWGrad()->getData() + weightOffset_[i] * g;
hl_convolution_backward_filter(
inputDesc_[i], inputData, outputDesc_[i], outGrad, filterDesc_[i],
weightGrad, convDesc_[i], workSpace_[g], bwdFilterLimitBytes_[i],
bwdFilterAlgo_[i]);
}
MatrixPtr preGrad = getInputGrad(i);
if (NULL != preGrad) {
real *inputGrad = preGrad->getData() + inputOffset_[i] * g;
real *wgtData = weights_[i]->getW()->getData() + weightOffset_[i] * g;
hl_convolution_backward_data(
inputDesc_[i], inputGrad, outputDesc_[i], outGrad, filterDesc_[i],
wgtData, convDesc_[i], workSpace_[g], bwdDataLimitBytes_[i],
bwdDataAlgo_[i]);
}
}
weights_[i]->getParameterPtr()->incUpdate(callback);
projections_[i]->backward(callback);
}
}
CudnnConvLayer::~CudnnConvLayer() {
if (biasDesc_) {
if (biases_) {
hl_destroy_tensor_descriptor(biasDesc_);
}
for (size_t i = 0; i < inputDesc_.size(); i++) {
hl_destroy_tensor_descriptor(inputDesc_[i]);
hl_destroy_tensor_descriptor(outputDesc_[i]);
hl_destroy_filter_descriptor(filterDesc_[i]);
hl_destroy_convolution_descriptor(convDesc_[i]);
}
if (workSpaceInBytes_ != 0) {
hl_free_mem_device(workSpaceData_);
workSpaceInBytes_ = 0;
hl_destroy_tensor_descriptor(outputDesc_);
}
}
......
......@@ -17,12 +17,13 @@ limitations under the License. */
#include "ConvBaseLayer.h"
#include "paddle/math/Matrix.h"
#include "Projection.h"
#include <vector>
namespace paddle {
/**
* @brief A subclass of ConvBaseLayer by cuDNN implementation. It only
 * @brief A 2-dimensional conv layer implemented by cuDNN. It only
 * supports GPU mode. CudnnConvLayer is automatically selected for GPU
 * mode and ExpandConvLayer for CPU mode when the layer type is "conv".
 * Users can also explicitly specify the type as "exconv" or "cudnn_conv" for
......@@ -31,81 +32,21 @@ namespace paddle {
* The config file api is img_conv_layer.
*/
class CudnnConvLayer : public ConvBaseLayer {
private:
/// resize Cudnn workspace size
void allocConvWorkSpace(size_t maxWorkSpace);
protected:
int imageH_, imageW_, outputH_, outputW_;
/// Cudnn tensor descriptor for bias.
std::vector<std::unique_ptr<ProjectionConfig>> projConf_;
std::vector<std::unique_ptr<Projection>> projections_;
hl_tensor_descriptor biasDesc_;
/// Cudnn tensor descriptor for input.
std::vector<hl_tensor_descriptor> inputDesc_;
/// Cudnn tensor descriptor for output.
std::vector<hl_tensor_descriptor> outputDesc_;
/// Cudnn tensor descriptor for filter.
std::vector<hl_filter_descriptor> filterDesc_;
/// Cudnn tensor descriptor for a convolution operation.
std::vector<hl_convolution_descriptor> convDesc_;
/// One sample offset of input data.
IntV inputOffset_;
/// One sample offset of output data.
IntV outputOffset_;
/// One group offset of weight.
IntV weightOffset_;
/// One group offset of bias.
hl_tensor_descriptor outputDesc_;
int biasOffset_;
/// Save the algorithm for forward convolution, which is obtained by cudnn
/// api to search the best suited algorithm.
std::vector<int> fwdAlgo_;
/// Save the algorithm for computing convolution gradient with respect to
/// filter coefficients.
std::vector<int> bwdFilterAlgo_;
/// Save the algorithm for computing convolution gradient with respect to
/// the output.
std::vector<int> bwdDataAlgo_;
/// Amount of GPU memory needed as workspace to be able to execute a
/// forward convolution with the specified algo.
std::vector<size_t> fwdLimitBytes_;
/// Amount of GPU memory needed as workspace to be able to execute a
/// backwardFilter with the specified algo.
std::vector<size_t> bwdFilterLimitBytes_;
/// Amount of GPU memory needed as workspace to be able to execute a
/// backwardData with the specified algo.
std::vector<size_t> bwdDataLimitBytes_;
/// Device work space address for each group.
std::vector<void*> workSpace_;
/// Max number of groups.
int maxGroups_;
/// Total work space address in device for all groups.
void* workSpaceData_;
/// Size of total work space.
size_t workSpaceInBytes_;
/// Is or not select conv algorihtm.
bool isSelectAlgo_;
/// batchNum is used to record batch size. If the batch size is changed,
/// the selection algorithm will be called.
int batchNum_;
int outputOffset_;
public:
explicit CudnnConvLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
~CudnnConvLayer();
/**
 * Initialization. Initialize member variables and create tensor descriptors.
*/
bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
/**
* Reshape is done each forward. Reshape tensor decriptor
* inputDesc_, outputDesc_, convDesc_. And search the faster algo
* or the fastest algo within a given memeory limit.
*/
void reshape(int batchSize);
void forward(PassType passType);
void backward(const UpdateCallback& callback);
void addBiases();
......
......@@ -37,32 +37,29 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
caffeMode_ = conf.caffe_mode();
}
/* initialize the weightList */
CHECK(inputLayers_.size() == parameters_.size());
for (size_t i = 0; i < inputLayers_.size(); i++) {
size_t height, width;
height = filterPixels_[i] * filterChannels_[i];
width = numFilters_;
// create a new weight
CHECK_EQ(parameters_[i]->getSize(), width * height);
Weight* w = new Weight(height, width, parameters_[i]);
weights_.emplace_back(w);
}
return true;
}
size_t ExpandConvLayer::getSize() {
size_t ExpandConvLayer::getOutputSize() {
CHECK_NE(inputLayers_.size(), 0UL);
imgSizeH_.clear();
imgSizeW_.clear();
outputH_.clear();
outputW_.clear();
size_t layerSize = ConvBaseLayer::calOutputSize();
subN_.clear();
size_t layerSize = 0;
for (size_t i = 0; i < inputLayers_.size(); i++) {
imgSizeH_.push_back(inputLayers_[i]->getOutput().getFrameHeight());
imgSizeW_.push_back(inputLayers_[i]->getOutput().getFrameWidth());
if (imgSizeH_[i] == 0) imgSizeH_[i] = imgSize_[i];
if (imgSizeW_[i] == 0) imgSizeW_[i] = imgSize_[i];
outputH_.push_back(
outputSize(imgSizeH_[i], filterSize_[i], padding_[i], stride_[i]));
outputW_.push_back(
outputSize(imgSizeW_[i], filterSize_[i], padding_[i], stride_[i]));
subN_.push_back(outputH_[i] * outputW_[i]);
CHECK(layerSize == 0 || subN_[i] * size_t(numFilters_) == layerSize);
layerSize = subN_[i] * numFilters_;
}
getOutput().setFrameHeight(outputH_[0]);
getOutput().setFrameWidth(outputW_[0]);
return layerSize;
}
......@@ -119,7 +116,7 @@ void ExpandConvLayer::expandFwdOnce(MatrixPtr image, int inIdx, int startIdx) {
}
void ExpandConvLayer::addSharedBias() {
size_t mapW = getSize() / numFilters_;
size_t mapW = getOutputValue()->getWidth() / numFilters_;
size_t mapH = getOutputValue()->getElementCnt() / mapW;
MatrixPtr out =
Matrix::create(getOutputValue()->getData(), mapH, mapW, false, useGpu_);
......@@ -158,7 +155,7 @@ void ExpandConvLayer::forward(PassType passType) {
* transOutValue correspond sample to one row */
int batchSize = inputLayers_[0]->getOutputValue()->getWidth();
batchSize = inputLayers_[0]->getOutputValue()->getHeight();
resetOutput(batchSize, getSize());
resetOutput(batchSize, getOutputSize());
MatrixPtr image = nullptr;
for (size_t i = 0; i != inputLayers_.size(); ++i) {
......@@ -183,7 +180,7 @@ void ExpandConvLayer::forward(PassType passType) {
}
void ExpandConvLayer::bpropSharedBias(MatrixPtr biases, MatrixPtr v) {
size_t mapW = getSize() / numFilters_;
size_t mapW = v->getWidth() / numFilters_;
size_t mapH = v->getElementCnt() / mapW;
MatrixPtr vTmp = Matrix::create(v->getData(), mapH, mapW, false, useGpu_);
......
......@@ -37,14 +37,6 @@ protected:
IntV subN_;
/// subK_ = channels_ * filterPixels_ * groups_.
IntV subK_;
/// The spatial dimensions of height of input feature map.
IntV imgSizeH_;
/// The spatial dimensions of width of input feature map.
IntV imgSizeW_;
/// The spatial dimensions of height of output feature map.
IntV outputH_;
/// The spatial dimensions of width of output feature map.
IntV outputW_;
/// Expand one sample at a time. shape:
/// (numChannels * filterPixels_, outputSizeH * outputSizeW)
MatrixPtr expandInput_;
......@@ -58,7 +50,7 @@ public:
bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
size_t getSize();
size_t getOutputSize();
/**
* Create or resize expandInput_.
......
......@@ -41,9 +41,13 @@ bool MixedLayer::init(const LayerMap& layerMap,
}
operators_.emplace_back(Operator::create(operator_conf, useGpu_));
}
/* initialize biases_ */
if (biasParameter_.get() != NULL) {
biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
sharedBias_ = config_.shared_biases();
size_t psize = config_.bias_size();
biases_ = std::unique_ptr<Weight>(
new Weight(1, psize, biasParameter_));
}
return true;
......@@ -119,12 +123,6 @@ void MixedLayer::forward(PassType passType) {
MatrixPtr outV = getOutputValue();
/* add the bias-vector */
if (biases_.get() != NULL) {
REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
outV->addBias(*(biases_->getW()), 1);
}
for (size_t i = 0; i != inputLayers_.size(); ++i) {
if (projections_[i]) {
projections_[i]->forward(&getInput(i), &output_, passType);
......@@ -140,6 +138,12 @@ void MixedLayer::forward(PassType passType) {
op->forward(ins, &output_, passType);
}
/* add the bias-vector */
if (biases_.get() != NULL) {
REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
outV->addBias(*(biases_->getW()), 1, sharedBias_);
}
/* activation */ {
REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
forwardActivation();
......@@ -154,7 +158,7 @@ void MixedLayer::backward(const UpdateCallback& callback) {
if (biases_ && biases_->getWGrad()) {
REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str());
biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBias_);
/* Increasing the number of gradient */
biases_->getParameterPtr()->incUpdate(callback);
......
......@@ -58,5 +58,6 @@ protected:
/// the matrix size of projection state
std::vector<int> projectionStateMatrixSize_;
std::unique_ptr<Weight> biases_;
bool sharedBias_;
};
} // namespace paddle
......@@ -669,12 +669,14 @@ void testLayerGrad(TestConfig testConf, string testLayerName, size_t batchSize,
void testProjectionGrad(ProjectionConfig conf, InputType inputType,
size_t parameterSize, size_t batchSize, bool useGpu,
bool testState) {
bool testState, int biasSize, bool sharedBias) {
TestConfig config;
conf.set_name(conf.type());
config.layerConfig.set_type("mixed");
config.layerConfig.set_size(conf.output_size());
config.biasSize = config.layerConfig.size();
config.biasSize = biasSize == 0 ? config.layerConfig.size() : biasSize;
config.layerConfig.set_bias_size(config.biasSize);
config.layerConfig.set_shared_biases(sharedBias);
config.inputDefs.push_back(
{inputType, "layer_0", conf.input_size(), parameterSize});
*config.layerConfig.add_inputs()->mutable_proj_conf() = conf;
......
......@@ -217,7 +217,8 @@ void testLayerGrad(TestConfig testConf, string testLayerName, size_t batchSize,
void testProjectionGrad(ProjectionConfig conf, InputType inputType,
size_t parameterSize, size_t batchSize, bool useGpu,
bool testState = false);
bool testState = false, int biasSize = 0,
bool sharedBias = false);
void testOperatorGrad(TestConfig& config, OperatorConfig& operatorConf,
size_t batchSize, bool useGpu, bool testState = false);
......
#edit-mode: -*- python -*-
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
settings(batch_size=10)
data = data_layer(name ="input", size=8*16*16)
conv1 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
num_channels=8,
num_filters=16, stride=1,
bias_attr=False,
act=ReluActivation())
conv2 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
num_channels=8,
num_filters=16, stride=1,
bias_attr=False,
act=ReluActivation())
concat = concat_layer(input=[conv1, conv2])
conv = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
num_channels=8,
num_filters=16, stride=1,
bias_attr=True,
act=LinearActivation())
outputs(concat, conv)
#edit-mode: -*- python -*-
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
settings(batch_size=10)
data = data_layer(name ="input", size=8*16*16)
proj1 = conv_projection(input=data, filter_size=1, filter_size_y=1,
num_channels=8, num_filters=16, stride=1)
proj2 = conv_projection(input=data, filter_size=1, filter_size_y=1,
num_channels=8, num_filters=16, stride=1)
concat = concat_layer(input=[proj1, proj2], bias_attr=False, act=ReluActivation())
proj = conv_projection(input=data, filter_size=1, filter_size_y=1,
num_channels=8, num_filters=16, stride=1)
with mixed_layer(bias_attr=True, act=LinearActivation()) as conv:
conv += proj
outputs(concat, conv)
......@@ -134,6 +134,45 @@ TEST(Projection, identity) {
}
}
#ifndef PADDLE_ONLY_CPU
TEST(Projection, conv) {
const int NUM_FILTERS = 16;
const int FILTER_SIZE = 2;
const int FILTER_SIZE_Y = 3;
const int CHANNELS = 3;
const int IMAGE_SIZE = 16;
ProjectionConfig conf;
conf.set_type("conv");
conf.set_num_filters(NUM_FILTERS);
ConvConfig* conv = conf.mutable_conv_conf();
conv->set_filter_size(FILTER_SIZE);
conv->set_filter_size_y(FILTER_SIZE_Y);
conv->set_channels(CHANNELS);
conv->set_padding(0);
conv->set_padding_y(1);
conv->set_stride(2);
conv->set_stride_y(2);
conv->set_groups(1);
conv->set_filter_channels(conv->channels() / conv->groups());
conv->set_img_size(IMAGE_SIZE);
int outputSize = (2 * conv->padding() + conv->img_size() -
conv->filter_size()) / conv->stride() + 1;
int outputSizeY = (2 * conv->padding_y() + conv->img_size() -
conv->filter_size_y()) / conv->stride_y() + 1;
conv->set_output_x(outputSize);
conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS);
conf.set_output_size(outputSize * outputSizeY * NUM_FILTERS);
testProjectionGrad(conf, INPUT_DATA,
/* parameterSize */ NUM_FILTERS * CHANNELS * FILTER_SIZE * FILTER_SIZE_Y,
/* batchSize */ 100, true, false, NUM_FILTERS, true);
}
#endif
TEST(Layer, concat) {
TestConfig config;
config.biasSize = 0;
......
......@@ -236,6 +236,15 @@ TEST(Compare, img_pool) {
compareNetwork(config_file_a, config_file_b);
FLAGS_use_gpu = useGpu;
}
TEST(Compare, img_conv) {
std::string config_file_a = "./gserver/tests/img_conv_a.conf";
std::string config_file_b = "./gserver/tests/img_conv_b.conf";
bool useGpu = FLAGS_use_gpu;
FLAGS_use_gpu = true;
compareNetwork(config_file_a, config_file_b);
FLAGS_use_gpu = useGpu;
}
#endif
......
......@@ -340,6 +340,15 @@ void GpuMatrix::addBias(Matrix& b, real scale) {
BaseMatrix::addBias(b, scale);
}
void GpuMatrix::addSharedBias(Matrix& b, real scale) {
CHECK(b.getHeight() == 1) << "the Bias should be a vector";
CHECK_LE(b.getWidth(), getWidth());
CHECK_EQ(getWidth() % b.getWidth(), 0UL);
hl_matrix_add_shared_bias(getData(), b.getData(), b.getWidth(),
getHeight(), getWidth(), scale);
}
void GpuMatrix::collectBias(Matrix& a, real scale) {
CHECK_EQ(getHeight(), (size_t)1);
CHECK_EQ(width_, a.getWidth());
......@@ -354,6 +363,14 @@ void GpuMatrix::collectBias(Matrix& a, real scale) {
}
}
void GpuMatrix::collectSharedBias(Matrix& a, real scale) {
CHECK_EQ(getHeight(), (size_t)1);
CHECK_EQ(a.getWidth() % getWidth(), 0UL);
hl_matrix_collect_shared_bias(getData(), a.getData(), getWidth(),
a.getHeight(), a.getWidth(), scale);
}
void GpuMatrix::sequenceAvgForward(Matrix& a,
const IVector& startsPos,
int mode) {
......@@ -1983,6 +2000,24 @@ void CpuMatrix::addBias(Matrix& b, real scale) {
}
}
void CpuMatrix::addSharedBias(Matrix& b, real scale) {
CHECK_EQ(b.getHeight(), (size_t)1);
real* aData = getData();
real* bData = b.getData();
size_t numSamples = getHeight();
size_t channel = b.getWidth();
CHECK_EQ(getWidth() % channel, 0UL);
size_t dim = getWidth() / channel;
for (size_t i = 0; i < numSamples; i++) {
for (size_t c = 0; c < channel; c++) {
for (size_t j = 0; j < dim; j++) {
aData[i * getStride() + c * dim + j] += scale * bData[c];
}
}
}
}
void CpuMatrix::collectBias(Matrix& a, real scale) {
CHECK_EQ(getHeight(), (size_t)1);
CHECK_EQ(width_, a.getWidth());
......@@ -2000,6 +2035,23 @@ void CpuMatrix::collectBias(Matrix& a, real scale) {
}
}
void CpuMatrix::collectSharedBias(Matrix& a, real scale) {
CHECK_EQ(getHeight(), (size_t)1);
real* B = getData();
real* A = a.getData();
size_t numSamples = a.getHeight();
size_t channel = getWidth();
CHECK_EQ(a.getWidth() % channel, 0UL);
size_t dim = a.getWidth() / channel;
for (size_t i = 0; i < numSamples; i++) {
for (size_t c = 0; c < channel; c++) {
for (size_t j = 0; j < dim; j++) {
B[c] += scale * A[i * channel * dim + c * dim + j];
}
}
}
}
void CpuMatrix::sequenceAvgForward(Matrix& a,
const IVector& startsPos,
int mode) {
......
......@@ -343,11 +343,35 @@ public:
LOG(FATAL) << "Not implemented";
}
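/// Add bias b (1 x channel) to this matrix; each element of b is shared by
/// getWidth() / b.getWidth() consecutive columns of every row.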
virtual void addSharedBias(Matrix& b, real scale) {
LOG(FATAL) << "Not implemented";
}
virtual void addBias(Matrix& b, real scale, bool sharedBias) {
if (!sharedBias) {
addBias(b, scale);
} else {
addSharedBias(b, scale);
}
}
/// add each sample from a to this.
virtual void collectBias(Matrix& a, real scale) {
LOG(FATAL) << "Not implemented";
}
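/// Collect a shared bias into this (1 x channel) matrix: each element
/// accumulates scale times the sum, over all rows of a, of the corresponding
/// block of a.getWidth() / getWidth() consecutive columns.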
virtual void collectSharedBias(Matrix& a, real scale) {
LOG(FATAL) << "Not implemented";
}
virtual void collectBias(Matrix& a, real scale, bool sharedBias) {
if (!sharedBias) {
collectBias(a, scale);
} else {
collectSharedBias(a, scale);
}
}
virtual void sequenceAvgForward(Matrix& a, const IVector& startsPos,
int mode) {
LOG(FATAL) << "Not implemented";
......@@ -1021,6 +1045,7 @@ public:
/// add b to each sample of this.
void addBias(Matrix& b, real scale);
void addSharedBias(Matrix& b, real scale);
/**
* @code
......@@ -1028,6 +1053,7 @@ public:
* @endcode
*/
void collectBias(Matrix& a, real scale);
void collectSharedBias(Matrix& a, real scale);
void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode);
......@@ -1341,9 +1367,11 @@ public:
public:
/// add b to each sample of this.
void addBias(Matrix& b, real scale);
void addSharedBias(Matrix& b, real scale);
/// add each sample of a to this.
void collectBias(Matrix& a, real scale);
void collectSharedBias(Matrix& a, real scale);
void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode);
......
......@@ -21,6 +21,8 @@ limitations under the License. */
#include "paddle/math/SparseMatrix.h"
#include <gtest/gtest.h>
#include "paddle/gserver/tests/TestUtil.h"
#include "paddle/utils/Stat.h"
using namespace paddle; // NOLINT
using namespace std; // NOLINT
......@@ -2071,6 +2073,60 @@ TEST(Matrix, MaxOutFwdBwd) {
}
}
void testAddSharedBias(int numSamples, int dim, int channel) {
MatrixPtr cpuData = std::make_shared<CpuMatrix>(numSamples, dim);
MatrixPtr gpuData = std::make_shared<GpuMatrix>(numSamples, dim);
MatrixPtr cpuBias = std::make_shared<CpuMatrix>(1, channel);
MatrixPtr gpuBias = std::make_shared<GpuMatrix>(1, channel);
cpuData->randomizeUniform();
gpuData->copyFrom(*cpuData);
cpuBias->randomizeUniform();
gpuBias->copyFrom(*cpuBias);
cpuData->addSharedBias(*cpuBias, 1.0);
gpuData->addSharedBias(*gpuBias, 1.0);
MatrixPtr check = std::make_shared<CpuMatrix>(numSamples, dim);
check->copyFrom(*gpuData);
MatrixCheckErr(*cpuData, *check);
}
void testCollectSharedBias(int numSamples, int dim, int channel) {
MatrixPtr cpuData = std::make_shared<CpuMatrix>(numSamples, dim);
MatrixPtr gpuData = std::make_shared<GpuMatrix>(numSamples, dim);
MatrixPtr cpuBias = std::make_shared<CpuMatrix>(1, channel);
MatrixPtr gpuBias = std::make_shared<GpuMatrix>(1, channel);
cpuData->randomizeUniform();
gpuData->copyFrom(*cpuData);
cpuBias->randomizeUniform();
gpuBias->copyFrom(*cpuBias);
cpuBias->collectSharedBias(*cpuData, 1.0);
gpuBias->collectSharedBias(*gpuData, 1.0);
MatrixPtr check = std::make_shared<CpuMatrix>(1, channel);
check->copyFrom(*gpuBias);
MatrixCheckErr(*cpuBias, *check);
}
TEST(Matrix, sharedBias) {
for (auto numSamples : {1, 100, 520}) {
for (auto dim : {100 * 16, 100 * 32}) {
for (auto channel : {8, 16}) {
VLOG(3) << " numSamples=" << numSamples << " dim=" << dim
<< " channel=" << channel;
testAddSharedBias(numSamples, dim, channel);
testCollectSharedBias(numSamples, dim, channel);
}
}
}
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
......
......@@ -7,6 +7,7 @@ set(TRAINER_SOURCES
Tester.cpp
Trainer.cpp
TrainerInternal.cpp
TrainerBenchmark.cpp
ThreadParameterUpdater.cpp
TrainerInternalConfig.cpp
TrainerConfigHelper.cpp)
......
......@@ -99,6 +99,7 @@ public:
void startTrainPass();
void finishTrainPass();
void trainOneDataBatch(DataBatch& dataBatch);
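/// Benchmark mode (--job=time): run a few warm-up batches, then time the
/// forward/backward computation over FLAGS_test_period batches and print
/// the accumulated timer statistics.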
void time();
/**
* given a dataBatch and the current parameter value
......
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#undef PADDLE_DISABLE_TIMER
#include "Trainer.h"
#include "paddle/utils/Stat.h"
#include "paddle/utils/Util.h"
P_DECLARE_int32(test_period);
P_DEFINE_bool(feed_data, false, "Whether to read data from DataProvider.");
namespace paddle {
void Trainer::time() {
startTrain();
trainerInternal_.getParameterUpdater()->startPass();
evaluator_->start();
DataBatch dataBatch;
int32_t batchSize = config_->getOptConfig().batch_size();
int32_t num = dataProvider_->getNextBatch(batchSize, &dataBatch);
CHECK_EQ(num, batchSize) << "The sample number does not equal the batch size: "
                         << num << " != " << batchSize;
CHECK(dataBatch.getSize()) << "No data from data provider";
std::vector<paddle::Argument> outputs;
// burn-in: run a few untimed batches to warm up before measuring
LOG(INFO) << "Burning time...";
for (int n = 0; n < 10; ++n) {
trainerInternal_.trainOneBatch(n, dataBatch, &outputs);
}
LOG(INFO) << "Burning time end.";
for (int n = 0; n < FLAGS_test_period; n++) {
if (FLAGS_feed_data) {
REGISTER_TIMER("GetData");
num = dataProvider_->getNextBatch(batchSize, &dataBatch);
}
if (num != batchSize) {
break;
}
{
REGISTER_TIMER("FwdBwd");
trainerInternal_.trainOneBatch(n, dataBatch, &outputs);
}
}
globalStat.setThreadInfo(true);
globalStat.printSegTimerStatus();
globalStat.reset();
finishTrain();
}
} // namespace paddle
......@@ -103,6 +103,8 @@ int main(int argc, char** argv) {
trainer.checkGradient();
} else if (FLAGS_job == "test") {
trainer.test();
} else if (FLAGS_job == "time") {
trainer.time();
} else {
LOG(FATAL) << "Unknown job type: " << FLAGS_job;
}
......
......@@ -255,7 +255,7 @@ sinclude(`ModelConfigLayer.proto.m4')
// (which is how convnets are usually trained). Setting this to
// false will untie the biases, yielding a separate bias for
// every location at which the filter is applied.
optional bool shared_biases = 8;
optional bool shared_biases = 8 [default = false];
// Valid values are ones that divide the area of the output
// grid in this convolutional layer. For example if this layer
......@@ -379,6 +379,9 @@ sinclude(`ModelConfigLayer.proto.m4')
// use to compute moving mean and variance.
optional real moving_average_fraction = 47 [default = 0.9];
// bias size
optional uint32 bias_size = 48 [default = 0];
}
message EvaluatorConfig {
......
......@@ -632,6 +632,44 @@ class ContextProjection(Projection):
_total_pad = 0
@config_class
class ConvProjection(Projection):
type = 'conv'
def __init__(
self,
input_layer_name,
num_filters=None,
conv_conf=None,
**xargs):
super(ConvProjection, self).__init__(input_layer_name, **xargs)
if num_filters is not None:
self.proj_conf.num_filters = num_filters
parse_conv(conv_conf,
input_layer_name,
self.proj_conf.conv_conf)
# TODO: support rectangle input
self.proj_conf.output_size = (self.proj_conf.conv_conf.output_x ** 2) * num_filters
def calc_output_size(self, input_layer_config):
return self.proj_conf.output_size
def calc_parameter_size(self, input_size, output_size):
co = self.proj_conf.num_filters
ci = self.proj_conf.conv_conf.channels
fh = self.proj_conf.conv_conf.filter_size
fw = self.proj_conf.conv_conf.filter_size_y
return co * ci * fh * fw
def calc_bias_size(self):
return self.proj_conf.num_filters
def calc_parameter_dims(self, input_size, output_size):
return None
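For illustration, a worked example of the sizes computed by the methods above (all numbers are hypothetical):

```python
# A 3x3 ConvProjection over a 3-channel input producing 32 feature maps
# on a 28x28 output grid.
co, ci, fh, fw = 32, 3, 3, 3          # num_filters, channels, filter_size, filter_size_y
output_x = 28                         # conv_conf.output_x after parse_conv

parameter_size = co * ci * fh * fw    # 864 weights (calc_parameter_size)
bias_size = co                        # 32, one shared bias per filter (calc_bias_size)
output_size = output_x ** 2 * co      # 25088, matches proj_conf.output_size
```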
# Define an operator for mixed layer
@config_class
class Operator(Cfg):
......@@ -2528,8 +2566,15 @@ class MixedLayer(LayerBase):
record_operator_conf = self.config.operator_confs.add()
record_operator_conf.CopyFrom(operator_conf)
psize = self.config.size
if isinstance(self.inputs[0], ConvProjection):
self.config.shared_biases = True
psize = 0
for input in self.inputs:
psize += input.calc_bias_size()
self.create_bias_parameter(bias, self.config.size)
self.config.bias_size = psize
self.create_bias_parameter(bias, psize)
if error_clipping_threshold is not None:
self.config.error_clipping_threshold = error_clipping_threshold
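To make the shared-bias bookkeeping above concrete, here is a small sketch with hypothetical filter counts: when every input is a ConvProjection, the bias is shared across spatial locations, so the bias parameter size is the sum of the inputs' filter counts rather than the layer's full output size.

```python
# Two hypothetical ConvProjection inputs feeding one mixed_layer.
num_filters = [32, 64]       # calc_bias_size() of each ConvProjection input
output_x = 28                # hypothetical conv output grid (28x28)

layer_size = sum(nf * output_x ** 2 for nf in num_filters)  # 75264 output values
shared_bias_size = sum(num_filters)                         # 96, one bias per channel
```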
......@@ -2547,8 +2592,10 @@ class ConcatenateLayer(LayerBase):
self,
name,
inputs,
bias=False,
**xargs):
config_assert(inputs, 'inputs cannot be empty')
config_assert(not bias, 'ConcatenateLayer does not support bias.')
super(ConcatenateLayer, self).__init__(
name, 'concat', 0, inputs=inputs, **xargs)
size = 0
......@@ -2567,10 +2614,19 @@ class ConcatenateLayer2(LayerBase):
self,
name,
inputs,
bias=False,
**xargs):
config_assert(inputs, 'inputs cannot be empty')
super(ConcatenateLayer2, self).__init__(
name, 'concat2', 0, inputs=inputs, **xargs)
if isinstance(self.inputs[0], ConvProjection):
for input_index in xrange(len(self.inputs) - 1):
input = self.inputs[input_index + 1]
config_assert(isinstance(input, ConvProjection),
"The first input of ConcatenateLayer2 is ConvProjection, "
"the other inputs should also be ConvProjection.")
size = 0
for input_index in xrange(len(self.inputs)):
input_layer = self.get_input_layer(input_index)
......@@ -2596,6 +2652,16 @@ class ConcatenateLayer2(LayerBase):
input.proj_conf.output_size)
self.create_input_parameter(input_index, psize, dims)
psize = self.config.size
if isinstance(self.inputs[0], ConvProjection):
self.config.shared_biases = True
psize = 0
for input in self.inputs:
psize += input.calc_bias_size()
self.config.bias_size = psize
self.create_bias_parameter(bias, psize)
@config_layer('recurrent')
class RecurrentLayer(LayerBase):
def __init__(
......
......@@ -34,7 +34,7 @@ __all__ = ["full_matrix_projection", "AggregateLevel", "ExpandLevel",
"table_projection", "mixed_layer", "data_layer",
"embedding_layer", "fc_layer", "grumemory",
"pooling_layer", "lstmemory", "last_seq", "first_seq",
"cos_sim", "hsigmoid",
"cos_sim", "hsigmoid", "conv_projection",
"regression_cost", 'classification_cost', "LayerOutput",
'img_conv_layer', 'img_pool_layer', 'batch_norm_layer',
'img_cmrnorm_layer', 'addto_layer',
......@@ -1984,7 +1984,7 @@ def addto_layer(input, act=None, name=None, bias_attr=None,
@wrap_act_default(act=IdentityActivation())
@wrap_name_default("concat")
@layer_support()
def concat_layer(input, act=None, name=None, layer_attr=None):
def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None):
"""
Concat all input vectors into one huge vector.
Inputs can be a list of LayerOutput or a list of projections.
......@@ -2043,10 +2043,14 @@ def concat_layer(input, act=None, name=None, layer_attr=None):
layer_type = (LayerType.CONCAT_LAYER if is_concat_layer
else LayerType.CONCAT_PROJ_LAYER)
if layer_type == LayerType.CONCAT_LAYER:
assert not bias_attr
Layer(
name=name, type=layer_type,
inputs=[x.name for x in input] if is_concat_layer else input,
active_type=act.name,
bias=ParamAttr.to_bias(bias_attr),
**ExtraLayerAttribute.to_kwargs(layer_attr)
)
......@@ -2950,6 +2954,103 @@ def conv_operator(img, filter, filter_size, num_filters,
op.origin = [img, filter]
return op
@wrap_param_attr_default()
def conv_projection(input, filter_size, num_filters,
num_channels=None, stride=1, padding=0,
filter_size_y=None, stride_y=None, padding_y=None,
groups=1, param_attr=None):
"""
ConvProjection with a layer as input.
It performs a convolution on the input with a set of learned filters.
Different from img_conv_layer and conv_op, conv_projection is a Projection,
which can be used in mixed_layer and concat_layer. It uses cuDNN to implement
the convolution and only supports GPU mode.
The example usage is:
.. code-block:: python
proj = conv_projection(input=input1,
filter_size=3,
num_filters=64,
num_channels=64)
:param input: input layer
:type input: LayerOutput
:param filter_size: The x dimension of a filter kernel.
:type filter_size: int
:param filter_size_y: The y dimension of a filter kernel. Since
PaddlePaddle now supports rectangular filters,
the filter's shape can be (filter_size, filter_size_y).
:type filter_size_y: int
:param num_filters: the number of filters (output channels).
:type num_filters: int
:param num_channels: the number of input channels.
:type num_channels: int
:param stride: The x dimension of the stride.
:type stride: int
:param stride_y: The y dimension of the stride.
:type stride_y: int
:param padding: The x dimension of padding.
:type padding: int
:param padding_y: The y dimension of padding.
:type padding_y: int
:param groups: The group number.
:type groups: int
:param param_attr: Convolution param attribute. None means default attribute
:type param_attr: ParameterAttribute
:return: A ConvProjection Object.
:rtype: ConvProjection
"""
if num_channels is None:
assert input.num_filters is not None
num_channels = input.num_filters
if filter_size_y is None:
if isinstance(filter_size, collections.Sequence):
assert len(filter_size) == 2
filter_size, filter_size_y = filter_size
else:
filter_size_y = filter_size
if stride_y is None:
if isinstance(stride, collections.Sequence):
assert len(stride) == 2
stride, stride_y = stride
else:
stride_y = stride
if padding_y is None:
if isinstance(padding, collections.Sequence):
assert len(padding) == 2
padding, padding_y = padding
else:
padding_y = padding
if param_attr.attr.get('initial_smart'):
# special initial for conv layers.
init_w = (2.0 / (filter_size ** 2 * num_channels)) ** 0.5
param_attr.attr["initial_mean"] = 0.0
param_attr.attr["initial_std"] = init_w
param_attr.attr["initial_strategy"] = 0
param_attr.attr["initial_smart"] = False
proj = ConvProjection(input_layer_name=input.name,
num_filters=num_filters,
conv_conf=Conv(filter_size=filter_size,
padding=padding,
stride=stride,
channels=num_channels,
filter_size_y=filter_size_y,
padding_y=padding_y,
stride_y=stride_y,
groups=groups),
**param_attr.attr)
proj.origin = input
return proj
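A minimal usage sketch of the new conv_projection together with the bias support added to concat_layer (layer names, shapes and the bias parameter name are assumptions, not part of this change):

```python
# Hypothetical config: two conv projections with different filter sizes are
# concatenated by concat_layer; the bias is shared per output channel
# (32 + 32 values in total).
from paddle.trainer_config_helpers import *

img = data_layer(name='image', size=3 * 32 * 32)   # assumed 3x32x32 input

proj_1x1 = conv_projection(input=img, filter_size=1, num_filters=32,
                           num_channels=3)
proj_3x3 = conv_projection(input=img, filter_size=3, num_filters=32,
                           num_channels=3, padding=1)

feat = concat_layer(input=[proj_1x1, proj_3x3],
                    bias_attr=ParamAttr(name='_concat.wbias'),
                    act=ReluActivation())
```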
@wrap_name_default()
@layer_support()
......
......@@ -29,7 +29,7 @@ __all__ = ['sequence_conv_pool', 'simple_lstm', "simple_img_conv_pool",
"img_conv_bn_pool", 'dropout_layer', 'lstmemory_group',
'lstmemory_unit', 'small_vgg', 'img_conv_group', 'vgg_16_network',
'gru_unit', 'gru_group', 'simple_gru', 'simple_attention',
'text_conv_pool',
'simple_gru2', 'bidirectional_gru', 'text_conv_pool',
'bidirectional_lstm', 'inputs', 'outputs']
......@@ -811,22 +811,37 @@ def simple_gru(input,
gru_layer_attr=None
):
"""
simple_gru is also a recurrent layer group version of the Gated Recurrent Unit,
like gru_group. The difference only lies in implementation details.
You may see gru_step_layer, grumemory in layers.py, gru_unit, gru_group,
simple_gru in network.py. The reason why there are so many interfaces is
that we have two ways to implement recurrent neural networks. One way is to
use one complete layer to implement rnn (including simple rnn, gru and lstm)
with multiple time steps, such as recurrent_layer, lstmemory, grumemory. But
the multiplication operation :math:`W x_t` is not computed in these layers.
See details in their interfaces in layers.py.
The other implementation is to use a recurrent group which can assemble a
series of layers to compute rnn step by step. This way is flexible for
attention mechanisms or other complex connections.
- gru_step_layer: only computes rnn by one step. It needs a memory as input
and can be used in recurrent group.
- gru_unit: a wrapper of gru_step_layer with memory.
- gru_group: a GRU cell implemented by a combination of multiple layers in
recurrent group.
But :math:`W x_t` is not done in group.
- grumemory: a GRU cell implemented by one layer, which does the same calculation
as gru_group and is faster than gru_group.
- simple_gru: a complete GRU implementation including :math:`W x_t` and
gru_group. :math:`W` contains :math:`W_r`, :math:`W_z` and :math:`W`, see
the formula in grumemory.
In terms of computational speed, grumemory is faster than gru_group,
and gru_group is faster than simple_gru.
simple_gru does exactly the same calculation as the grumemory layer does.
Please see grumemory in layers.py for more detail about the maths.
The example usage is:
.. code-block:: python
gru = gur_group(input=[layer1],
size=256,
act=TanhActivation(),
gate_act=SigmoidActivation())
gru = simple_gru(input=[layer1], size=256)
:param input: input layer name.
:type input: LayerOutput
......@@ -863,6 +878,132 @@ def simple_gru(input,
gru_layer_attr=gru_layer_attr)
@wrap_name_default('simple_gru2')
def simple_gru2(input,
size,
name=None,
reverse=False,
mixed_param_attr=None,
mixed_bias_attr=None,
gru_param_attr=None,
gru_bias_attr=None,
act=None,
gate_act=None,
mixed_layer_attr=None,
gru_cell_attr=None
):
"""
simple_gru2 is the same as simple_gru, but it uses grumemory instead of
gru_group, so simple_gru2 is faster than simple_gru.
Please see grumemory in layers.py for more detail about the maths.
The example usage is:
.. code-block:: python
gru = simple_gru2(input=[layer1], size=256)
:param input: input layer.
:type input: LayerOutput
:param name: name of the gru group.
:type name: basestring
:param size: hidden size of the gru.
:type size: int
:param reverse: whether to process the input data in a reverse order
:type reverse: bool
:param act: type of the activation
:type act: BaseActivation
:param gate_act: type of the gate activation
:type gate_act: BaseActivation
:param gru_bias_attr: bias. False means no bias, None means default bias.
:type gru_bias_attr: ParameterAttribute|False
:param gru_cell_attr: Extra parameter attribute of the gru layer.
:type gru_cell_attr: ParameterAttribute|False
:return: the gru group.
:rtype: LayerOutput
"""
with mixed_layer(name='%s_transform' % name,
size=size * 3,
bias_attr=mixed_bias_attr,
layer_attr=mixed_layer_attr) as m:
m += full_matrix_projection(input=input, param_attr=mixed_param_attr)
return grumemory(name=name,
size=size,
input=m,
reverse=reverse,
bias_attr=gru_bias_attr,
param_attr=gru_param_attr,
act=act,
gate_act=gate_act,
layer_attr=gru_cell_attr)
@wrap_name_default("bidirectional_gru")
def bidirectional_gru(input, size, name=None, return_seq=False,
fwd_mixed_param_attr=None, fwd_mixed_bias_attr=None,
fwd_gru_param_attr=None, fwd_gru_bias_attr=None,
fwd_act=None, fwd_gate_act=None,
fwd_mixed_layer_attr=None, fwd_gru_cell_attr=None,
bwd_mixed_param_attr=None, bwd_mixed_bias_attr=None,
bwd_gru_param_attr=None, bwd_gru_bias_attr=None,
bwd_act=None, bwd_gate_act=None,
bwd_mixed_layer_attr=None, bwd_gru_cell_attr=None,
last_seq_attr=None, first_seq_attr=None,
concat_attr=None, concat_act=None):
"""
A bidirectional_gru is a recurrent unit that iterates over the input
sequence both in forward and backward orders, and then concatenates the two
outputs to form a final output. However, concatenation of the two outputs
is not the only way to form the final output; you can also, for example,
just add them together.
The example usage is:
.. code-block:: python
bi_gru = bidirectional_gru(input=[input1], size=512)
:param name: bidirectional gru layer name.
:type name: basestring
:param input: input layer.
:type input: LayerOutput
:param size: gru layer size.
:type size: int
:param return_seq: If set False, outputs of the last time step are
concatenated and returned.
If set True, the entire output sequences that are
processed in forward and backward directions are
concatenated and returned.
:type return_seq: bool
:return: LayerOutput object.
:rtype: LayerOutput
"""
args = locals()
fw = simple_gru2(name='%s_fw' % name, input=input, size=size,
**dict((k[len('fwd_'):], v) for k, v in args.iteritems()
if k.startswith('fwd_')))
bw = simple_gru2(name="%s_bw" % name, input=input, size=size,
reverse=True,
**dict((k[len('bwd_'):], v) for k, v in args.iteritems()
if k.startswith('bwd_')))
if return_seq:
return concat_layer(name=name, input=[fw, bw], layer_attr=concat_attr,
act=concat_act)
else:
fw_seq = last_seq(name="%s_fw_last" % name, input=fw,
layer_attr=last_seq_attr)
bw_seq = first_seq(name="%s_bw_last" % name, input=bw,
layer_attr=first_seq_attr)
return concat_layer(name=name, input=[fw_seq, bw_seq],
layer_attr=concat_attr, act=concat_act)
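A short sketch of the two return_seq modes of bidirectional_gru (the data layer and sizes are assumptions):

```python
from paddle.trainer_config_helpers import *

# Hypothetical sequence input feeding the bidirectional GRU defined above.
emb = data_layer(name='word_vectors', size=128)

# return_seq=False: the forward GRU's last step and the backward GRU's
# "last" step (its first output element, hence first_seq) are concatenated
# into one vector per sequence.
sentence_vec = bidirectional_gru(input=emb, size=256, return_seq=False)

# return_seq=True: the full forward and backward output sequences are
# concatenated step by step, one vector per time step.
token_feat = bidirectional_gru(input=emb, size=256, return_seq=True)
```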
@wrap_name_default("bidirectional_lstm")
def bidirectional_lstm(input, size, name=None, return_seq=False,
fwd_mat_param_attr=None, fwd_bias_param_attr=None,
......@@ -893,7 +1034,7 @@ def bidirectional_lstm(input, size, name=None, return_seq=False,
.. code-block:: python
lstm_step = bidirectional_lstm(input=[input1], size=512)
bi_lstm = bidirectional_lstm(input=[input1], size=512)
:param name: bidirectional lstm layer name.
:type name: basestring
......@@ -907,7 +1048,7 @@ def bidirectional_lstm(input, size, name=None, return_seq=False,
processed in forward and backward directions are
concatenated and returned.
:type return_seq: bool
:return: lstm layer name.
:return: LayerOutput object according to the return_seq.
:rtype: LayerOutput
"""
args = locals()
......
86c0815275a9d5eb902e23c6a592f58a img_layers.protostr
a5d9259ff1fd7ca23d0ef090052cb1f2 last_first_seq.protostr
9c038249ec8ff719753a746cdb04c026 layer_activations.protostr
5913f87b39cee3b2701fa158270aca26 projections.protostr
34e04043cbb12931c47fa44ec50eeffc projections.protostr
7334ba0a4544f0623231330fc51d390d shared_fc.protostr
8b8b6bb128a7dfcc937be86145f53e2f shared_lstm.protostr
bb8e233b05b8e07f9ed386b7aee4f2c6 shared_lstm.protostr
6b39e34beea8dfb782bee9bd3dea9eb5 simple_rnn_layers.protostr
f98e79e1630d5eb827c300e64836d269 test_bi_grumemory.protostr
0fc1409600f1a3301da994ab9d28b0bf test_cost_layers.protostr
6cd5f28a3416344f20120698470e0a4c test_cost_layers_with_weight.protostr
144bc6d3a509de74115fa623741797ed test_expand_layer.protostr
......@@ -15,7 +16,7 @@ d350bd91a0dc13e854b1364c3d9339c6 test_lstmemory_layer.protostr
5433ed33d4e7414eaf658f2a55946186 test_maxout.protostr
251a948ba41c1071afcd3d9cf9c233f7 test_ntm_layers.protostr
e6ff04e70aea27c7b06d808cc49c9497 test_print_layer.protostr
2a75dd33b640c49a8821c2da6e574577 test_rnn_group.protostr
fded24727338fb8ce44d9951ed8aea08 test_rnn_group.protostr
67d6fde3afb54f389d0ce4ff14726fe1 test_sequence_pooling.protostr
f586a548ef4350ba1ed47a81859a64cb unused_layers.protostr
8122477f4f65244580cec09edc590041 util_layers.protostr
f937a5a6e7e8864b4d8cf56b0f7c7f44 util_layers.protostr
......@@ -9,7 +9,7 @@ test_sequence_pooling test_lstmemory_layer test_grumemory_layer
last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
img_layers util_layers simple_rnn_layers unused_layers test_cost_layers
test_rnn_group shared_fc shared_lstm test_cost_layers_with_weight
test_maxout)
test_maxout test_bi_grumemory)
for conf in ${configs[*]}
......
from paddle.trainer_config_helpers import *
settings(
batch_size=1000,
learning_rate=1e-4
)
din = data_layer(name='data', size=120)
outputs(bidirectional_gru(input=din, size=40, return_seq=True))