diff --git a/Dockerfile b/Dockerfile index b6f99ca539d077164c71d797a5ccda7b1b5c44ba..39af60966b6cab7d8b9e644f4ea658613f8ba518 100644 --- a/Dockerfile +++ b/Dockerfile @@ -30,7 +30,8 @@ RUN apt-get update && \ python-numpy python-matplotlib gcc g++ \ automake locales clang-format-3.8 swig doxygen cmake \ liblapack-dev liblapacke-dev libboost-dev \ - clang-3.8 llvm-3.8 libclang-3.8-dev && \ + clang-3.8 llvm-3.8 libclang-3.8-dev \ + net-tools && \ apt-get clean -y # Install Go diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index 1efa74ecda4170332d96603ca2253c68468474f9..c7b017bc07b25bc606fd838a5fb9d3715f4faecb 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -59,6 +59,11 @@ context_projection .. autoclass:: paddle.v2.layer.context_projection :noindex: +row_conv +-------- +.. autoclass:: paddle.v2.layer.row_conv + :noindex: + Image Pooling Layer =================== @@ -130,7 +135,7 @@ recurrent_group --------------- .. autoclass:: paddle.v2.layer.recurrent_group :noindex: - + lstm_step --------- .. autoclass:: paddle.v2.layer.lstm_step @@ -145,12 +150,12 @@ beam_search ------------ .. autoclass:: paddle.v2.layer.beam_search :noindex: - + get_output ---------- .. autoclass:: paddle.v2.layer.get_output :noindex: - + Mixed Layer =========== @@ -203,7 +208,7 @@ trans_full_matrix_projection ---------------------------- .. autoclass:: paddle.v2.layer.trans_full_matrix_projection :noindex: - + Aggregate Layers ================ @@ -346,6 +351,12 @@ sampling_id .. autoclass:: paddle.v2.layer.sampling_id :noindex: +multiplex +--------- +.. autoclass:: paddle.v2.layer.multiplex + :noindex: + + Slicing and Joining Layers ========================== @@ -434,10 +445,26 @@ smooth_l1_cost .. autoclass:: paddle.v2.layer.smooth_l1_cost :noindex: -Check Layer +Check Layer ============ eos --- .. autoclass:: paddle.v2.layer.eos :noindex: + +Miscs +===== + +dropout +-------------- +.. autoclass:: paddle.v2.layer.dropout + :noindex: + +Activation with learnable parameter +=================================== + +prelu +-------- +.. autoclass:: paddle.v2.layer.prelu + :noindex: diff --git a/doc/api/v2/config/networks.rst b/doc/api/v2/config/networks.rst index b2a617fff134035c04eeabbbaf6d9cbe2a525f1c..6e813ab1a820d068ea3e54cad6178f1cf928eadc 100644 --- a/doc/api/v2/config/networks.rst +++ b/doc/api/v2/config/networks.rst @@ -125,11 +125,3 @@ simple_attention :members: simple_attention :noindex: -Miscs -===== - -dropout_layer --------------- -.. automodule:: paddle.v2.networks - :members: dropout_layer - :noindex: diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index 9898dc083ebb1783a0e2ddd12afaa9c3d5a79e98..47ca1833967ee705d6558b1dad06a6335b30f03a 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -8,6 +8,7 @@ add_subdirectory(gserver) add_subdirectory(pserver) add_subdirectory(trainer) add_subdirectory(scripts) +add_subdirectory(strings) # Do not build go directory until go cmake is working smoothly. 
# if(CMAKE_Go_COMPILER) diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt index e147659566dba6cfbfd677e3b616bdaa4a73485c..071bc36c2ded51ba977350aeae15f6d244cea5be 100644 --- a/paddle/api/CMakeLists.txt +++ b/paddle/api/CMakeLists.txt @@ -41,6 +41,7 @@ SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS paddle_network paddle_proto ${external_project_dependencies} + ${RDMA_LIBS} ) IF(APPLE) @@ -73,6 +74,7 @@ SWIG_LINK_LIBRARIES(swig_paddle ${CMAKE_DL_LIBS} ${EXTERNAL_LIBS} ${CMAKE_THREAD_LIBS_INIT} + ${RDMA_LD_FLAGS} ${START_END} ) diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt index 233a53709a80f06dd2a06995b159c1aef10e2788..1f54ac1231c6ac2e19b25bb336292194c63c11e9 100644 --- a/paddle/function/CMakeLists.txt +++ b/paddle/function/CMakeLists.txt @@ -28,6 +28,7 @@ if(WITH_TESTING) add_simple_unittest(PadOpTest) add_simple_unittest(MulOpTest) add_simple_unittest(CosSimOpTest) + add_simple_unittest(RowConvOpTest) endif() endif() diff --git a/paddle/function/RowConvOp.cpp b/paddle/function/RowConvOp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b6501e8f4db7fd33891cd80e07a6f36dd0b34532 --- /dev/null +++ b/paddle/function/RowConvOp.cpp @@ -0,0 +1,225 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "RowConvOp.h" +#include +#include "paddle/math/Vector.h" + +namespace paddle { + +template <> +void RowConv(CpuMatrix& out, + const CpuMatrix& in, + const CpuMatrix& filter, + const CpuIVector& seq) { + const int* starts = seq.getData(); + const size_t numSeq = seq.getSize() - 1; + const size_t contextLength = filter.getHeight(); + for (size_t i = 0; i < numSeq; ++i) { + size_t begin = starts[i]; + size_t end = starts[i + 1]; + for (size_t j = begin; j < end; ++j) { + MatrixPtr x; + MatrixPtr w; + if ((j + contextLength) < end) { + x = (const_cast(in)).subMatrix(j, contextLength); + w = (const_cast(filter)).subMatrix(0, contextLength); + } else { + x = (const_cast(in)).subMatrix(j, end - j); + w = (const_cast(filter)).subMatrix(0, end - j); + } + MatrixPtr y = out.subMatrix(j, 1); + y->addDotMulVMM(*x, *w); + } + } +} + +template <> +void RowConvGrad(const CpuMatrix& outG, + const CpuMatrix& in, + const CpuMatrix& filter, + CpuMatrix& inG, + CpuMatrix& filterG, + const CpuIVector& seq) { + // gradient w.r.t filter + const int* starts = seq.getData(); + const size_t numSeq = seq.getSize() - 1; + const size_t contextLength = filter.getHeight(); + if (filterG) { + for (size_t i = 0; i < numSeq; ++i) { + size_t begin = starts[i]; + size_t end = starts[i + 1]; + size_t steps = end - begin; + for (size_t j = 0; j < contextLength && (begin + j) < end; ++j) { + MatrixPtr x = + (const_cast(in)).subMatrix(begin + j, steps - j); + MatrixPtr dy = + (const_cast(outG)).subMatrix(begin, steps - j); + MatrixPtr dw = filterG.subMatrix(j, 1); + dw->addDotMulVMM(*dy, *x); + } + } + } + + // gradient w.r.t input feature + if (inG) { + for (size_t i = 0; i < numSeq; ++i) { + size_t begin = starts[i]; + size_t end = starts[i + 1]; + size_t steps = end - begin; + for (size_t j = 0; j < steps; ++j) { + MatrixPtr dx = inG.subMatrix(begin + j, 1); + for (size_t t = 0; t < contextLength; ++t) { + if (int(j - t) >= 0) { + MatrixPtr dy = + (const_cast(outG)).subMatrix(begin + j - t, 1); + MatrixPtr w = (const_cast(filter)).subMatrix(t, 1); + dx->addDotMul(*dy, *w, 1.0, 1.0); + } + } + } + } + } +} + +/** + * \brief The row convolution is called lookahead convolution. It is firstly + * introduced in deep-speech2 system. The bidirectional RNN that learns + * representation for a sequence by performing a forward and a backward pass + * through the entire sequence. However, unlike unidirectional RNNs, + * bidirectional RNNs are challenging to deploy in an online and low-latency + * setting. The lookahead convolution incorporates information from future + * subsequences in a computationally efficient manner to improve unidirectional + * recurrent neural networks. + * + * The connection of row convolution is different form the 1D sequence + * convolution. Assumed that, the future context-length is k, that is to say, + * it can get the output at timestep t by using the the input feature from t-th + * timestep to (t+k)-th timestep. Assumed that the hidden dim of input + * activations are d, the activations r_t for the new layer at time-step t are: + * + * + * -- k + 1 + * r(t,i) = > W(i,j) * h(t+j-1, i), for (1 <= i <= d) + * -- j = 1 + * + * + * The weight shape is: (k + 1) x d + * Function Arguments: + * + * \param inputs[0] The input activations. + * \param inputs[0] The filter (or weight) and shape is (k+1) x d. + * \param outputs[1] The output activations. + * + * [1] Dario Amodei, etc. Deep Speech 2 : End-to-End Speech Recognition in + * English + * and Mandarin. 
https://arxiv.org/abs/1512.02595 + */ + +template +class RowConvFunc : public FunctionBase { +public: + void init(const FuncConfig& config) override {} + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + // check + CHECK_EQ(2UL, inputs.size()); + CHECK_EQ(1UL, outputs.size()); + // TODO(qingqing): support ASSIGN_TO. + CHECK_EQ(outputs[0].getArgType(), ADD_TO); + CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg()) + << "SequenceArg required here."; + const auto in = dynamic_cast(inputs[0]); + auto out = dynamic_cast(outputs[0]); + auto w = inputs[1]; + CHECK(in.data() && out.data() && in.getSequenceId().data()); + CHECK_EQ(in.shape().ndims(), 2UL); + CHECK(in.shape() == out.shape()); + CHECK_EQ(w.shape()[1], in.shape()[1]); + + auto outMat = out.matrix(); + const auto inMat = in.matrix(); + const auto wMat = w.matrix(); + const auto seqId = in.getSequenceId().vector(); + + RowConv(outMat, inMat, wMat, seqId); + } +}; + +/** + * \brief The backward of row convolution function. This function calculated + * the gradient w.r.t filter and the gradient w.r.t input activations(or data). + * + * Argument in this Function: + * + * \param inputs[0] The gradient w.r.t output activations. + * \param inputs[1] The input activations. + * \param inputs[2] The filter (or weight) and shape is (k+1) x d. + * \param outputs[0] The gradient w.r.t input activations. + * \param outputs[1] The gradient w.r.r filter. + * + * Abbreviation: + * w.r.t: with respect to. + */ + +template +class RowConvGradFunc : public FunctionBase { + // TODO(qingqing): split into RowConvDataFunc and RowConvWeightFunc +public: + void init(const FuncConfig& config) override {} + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + // check + CHECK_EQ(3UL, inputs.size()); + CHECK_EQ(2UL, outputs.size()); + CHECK_EQ(outputs[0].getArgType(), ADD_TO); + CHECK_EQ(outputs[1].getArgType(), ADD_TO); + CHECK(inputs[0].isSequenceArg() && inputs[1].isSequenceArg() && + outputs[0].isSequenceArg()) + << "SequenceArg required here."; + + const auto outGrad = dynamic_cast(inputs[0]); + const auto in = dynamic_cast(inputs[1]); + const auto w = inputs[2]; + auto inGrad = dynamic_cast(outputs[0]); + auto wGrad = outputs[1]; + + CHECK_EQ(in.shape().ndims(), 2UL); + CHECK(in.shape() == inGrad.shape()); + CHECK(in.shape() == outGrad.shape()); + CHECK_EQ(wGrad.shape()[1], in.shape()[1]); + + const auto outGMat = outGrad.matrix(); + const auto inMat = in.matrix(); + const auto wMat = w.matrix(); + auto inGMat = inGrad.data() + ? inGrad.matrix() + : typename Tensor::Matrix(nullptr, 0, 0); + auto wGMat = wGrad.data() + ? wGrad.matrix() + : typename Tensor::Matrix(nullptr, 0, 0); + const auto seqId = in.getSequenceId().vector(); + + RowConvGrad(outGMat, inMat, wMat, inGMat, wGMat, seqId); + } +}; + +REGISTER_TYPED_FUNC(RowConv, CPU, RowConvFunc); +REGISTER_TYPED_FUNC(RowConvGrad, CPU, RowConvGradFunc); +#ifndef PADDLE_ONLY_CPU +REGISTER_TYPED_FUNC(RowConv, GPU, RowConvFunc); +REGISTER_TYPED_FUNC(RowConvGrad, GPU, RowConvGradFunc); +#endif + +} // namespace paddle diff --git a/paddle/function/RowConvOp.h b/paddle/function/RowConvOp.h new file mode 100644 index 0000000000000000000000000000000000000000..2c5de6151aa2ed73ace1569d880b1c3677c195c4 --- /dev/null +++ b/paddle/function/RowConvOp.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
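A minimal NumPy sketch of the forward pass computed by the CPU RowConv above; the function and variable names are illustrative only (not part of the patch), and the shapes follow the comment in RowConvOp.cpp: the input is h x d, the filter is (k + 1) x d, and seq holds the start position of each sequence.

    import numpy as np

    def row_conv_forward(x, w, starts):
        # x: (h, d) input rows for all sequences, stacked along the time axis
        # w: (k + 1, d) lookahead filter: the current step plus k future steps
        # starts: sequence start offsets, e.g. [0, 5, 12] for lengths 5 and 7
        out = np.zeros_like(x)
        context = w.shape[0]
        for begin, end in zip(starts[:-1], starts[1:]):
            for t in range(begin, end):
                steps = min(context, end - t)  # truncate near the sequence end
                # out[t, i] = sum_j w[j, i] * x[t + j, i], for 0 <= j < steps
                out[t] = (w[:steps] * x[t:t + steps]).sum(axis=0)
        return out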
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Function.h" + +namespace paddle { + +/** + * \brief The forward of row convolution. + * + * \param[out] out The output data and shape is h x d. h is the sum of + * time steps of all samples in one mini-batch. + * \param[in] in The input data and shape is h x d. + * \param[in] filter The filter and shape is k x d. The lookahead step + * number plus one equals k. + * \param[in] seq The sequence start positions. + * + */ +template +void RowConv(typename Tensor::Matrix& out, + const typename Tensor::Matrix& in, + const typename Tensor::Matrix& filter, + const typename Tensor::Vector& seq); + +/** + * \brief The backward of row convolution. + * + * \param[in] outG The gradient w.r.t output data. + * \param[in] in The input data. + * \param[in] filter The filter. + * \param[out] inG The gradient w.r.t input data. + * \param[out] filterG The gradient w.r.t filter. + * \param[in] seq The sequence start positions. + * + */ +template +void RowConvGrad(const typename Tensor::Matrix& outG, + const typename Tensor::Matrix& in, + const typename Tensor::Matrix& filter, + typename Tensor::Matrix& inG, + typename Tensor::Matrix& filterG, + const typename Tensor::Vector& seq); +} // namespace paddle diff --git a/paddle/function/RowConvOpGpu.cu b/paddle/function/RowConvOpGpu.cu new file mode 100644 index 0000000000000000000000000000000000000000..c0b947e224313abaf4fadfb8293dc78ca085ff84 --- /dev/null +++ b/paddle/function/RowConvOpGpu.cu @@ -0,0 +1,344 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "hl_base.h" +#include "RowConvOp.h" + +namespace paddle { + +template +__global__ void KeRowConv(real* y, const real* x, const real* w, + const int* starts, const int height, const int width, + const int numSeq, const int context) { + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int blky = blockDim.y; + const int gidx = blockIdx.x * blockDim.x; + + __shared__ real sw[BLOCK_H][BLOCK_W]; + + for (int i = tidy; i < context; i += blky) { + sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0; + } + + __syncthreads(); + + for (int i = 0; i < numSeq; ++i) { + const int start = starts[i]; + const int end = starts[i + 1]; + const int steps = end - start; + for (int j = tidy; j < steps; j += blky) { + real sum = 0; + int off = (start + j) * width; + for (int t = 0; t < context; ++t) { + if ((start + j + t) < end) { + int xoff = off + t * width; + real xVal = gidx + tidx < width ? 
x[xoff + gidx + tidx] : 0.0; + sum += sw[t][tidx] * xVal; + } + } + if (gidx + tidx < width) { + y[off + gidx + tidx] += sum; + } + } + } +} + +__global__ void KeRowConv2(real* y, const real* x, const real* w, + const int* starts, const int height, const int width, + const int numSeq, const int context) { + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int blky = blockDim.y; + const int gidx = blockIdx.x * blockDim.x; + + for (int i = 0; i < numSeq; ++i) { + const int start = starts[i]; + const int end = starts[i + 1]; + const int steps = end - start; + for (int j = tidy; j < steps; j += blky) { + int off = (start + j) * width; + real sum = 0; + for (int t = 0; t < context && (start + j + t) < end; ++t) { + int xoff = off + t * width; + real xd = gidx + tidx < width ? x[xoff + gidx + tidx] : 0.0; + real wd = gidx + tidx < width ? w[t * width + gidx + tidx] : 0.0; + sum += wd * xd; + } + if (gidx + tidx < width) { + y[off + gidx + tidx] += sum; + } + } + } +} + + + +template <> +void RowConv(GpuMatrix& out, + const GpuMatrix& in, + const GpuMatrix& filter, + const GpuIVector& seq) { + const size_t numSeq = seq.getSize() - 1; + const size_t contextLength = filter.getHeight(); + const size_t height = in.getHeight(); + const size_t width = in.getWidth(); + + real* y = out.getData(); + const real* x = in.getData(); + const real* w = filter.getData(); + const int* starts = seq.getData(); + + dim3 dimBlock(32, 32); + dim3 dimGrid(DIVUP(width, dimBlock.x), 1); + + if (contextLength <= 32) { + KeRowConv<32, 32><<>> + (y, x, w, starts, height, width, numSeq, contextLength); + } else { + KeRowConv2<<>> + (y, x, w, starts, height, width, numSeq, contextLength); + } + CHECK_SYNC("RowConv"); +} + + +template +__global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy, + const int* starts, const int height, const int width, const int numSeq, + const int context) { + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int blky = blockDim.y; + const int gidx = blockIdx.x * blockDim.x; + + __shared__ real sh_x[BLOCK_W][BLOCK_H]; + __shared__ real sh_dy[BLOCK_W][BLOCK_H + CONTEXT - 1]; + __shared__ real sh_dw[CONTEXT][BLOCK_W]; + + if (tidy < context) { + sh_dw[tidy][tidx] = 0.0; + } + __syncthreads(); + + for (int i = 0; i < numSeq; ++i) { + const int start = starts[i]; + const int end = starts[i + 1]; + const int steps = end - start; + const int size = ((steps + BLOCK_H - 1)/BLOCK_H) * BLOCK_H; + for (int j = tidy; j < size; j += BLOCK_H) { + int xoff = gidx + tidx; + int yoff = start + j; + + // transpose + sh_x[tidx][tidy] = (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0; + sh_dy[tidx][tidy + context - 1] = (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0; + __syncthreads(); + if (tidy < (context - 1)) { + yoff = yoff - context + 1; + sh_dy[tidx][tidy] = (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0; + } + __syncthreads(); + + for (int t = 0; t < context; t++) { + real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx + context - 1 - t]; + __syncthreads(); + // warp size and blockDim.x is 32. 
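+          // __shfl_down(val, n) reads val from the lane n positions above in the
+          // same warp, so the five halving steps below sum the 32 per-lane
+          // products over this time tile; lane 0 (tidx == 0) then holds the
+          // partial sum that is accumulated into sh_dw[t][tidy].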
+ val += __shfl_down(val, 16); + val += __shfl_down(val, 8); + val += __shfl_down(val, 4); + val += __shfl_down(val, 2); + val += __shfl_down(val, 1); + __syncthreads(); + if (tidx == 0) { + sh_dw[t][tidy] += val; + } + __syncthreads(); + } + } + } + + for (int t = tidy; (t < context) && ((gidx + tidx) < width); t += blky) { + dw[t * width + gidx + tidx] += sh_dw[t][tidx]; + } +} + +template +__global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy, + const int* starts, const int height, const int width, const int numSeq, + const int context) { + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int gidx = blockIdx.x * blockDim.x; + + __shared__ real sh_x[BLOCK_H][BLOCK_W]; + __shared__ real sh_dy[BLOCK_H][BLOCK_W]; + + for (int i = 0; i < numSeq; ++i) { + const int start = starts[i]; + const int end = starts[i + 1]; + const int steps = end - start; + + const int size = ((steps + BLOCK_H - 1)/BLOCK_H) * BLOCK_H; + for (int j = tidy; j < size; j += BLOCK_H) { + int xoff = gidx + tidx; + int yoff = start + j; + + // transpose + sh_x[tidx][tidy] = (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0; + __syncthreads(); + + for (int t = 0; t < context; t++) { + sh_dy[tidx][tidy] = (xoff < width && (yoff - t) >= start && yoff - t < end) ? dy[(yoff - t) * width + xoff] : 0.0; + __syncthreads(); + + real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx]; + __syncthreads(); + // warp size and blockDim.x is 32. + val += __shfl_down(val, 16); + val += __shfl_down(val, 8); + val += __shfl_down(val, 4); + val += __shfl_down(val, 2); + val += __shfl_down(val, 1); + __syncthreads(); + + if (tidx == 0 && (gidx + tidy) < width) { + dw[t*width + gidx + tidy] += val; + } + } + } + } +} + +template +__global__ void KeRowConvBwData(real* dx, const real* w, const real* dy, + const int* starts, const int height, const int width, const int numSeq, + const int context) { + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int blky = blockDim.y; + const int gidx = blockIdx.x * blockDim.x; + + __shared__ real sw[BLOCK_H][BLOCK_W]; + + for (int i = tidy; i < context; i += blky) { + sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0; + } + + __syncthreads(); + + for (int i = 0; i < numSeq; ++i) { + const int start = starts[i]; + const int end = starts[i + 1]; + const int steps = end - start; + for (int j = tidy; j < steps; j += blky) { + real sum = 0; + int off = (start + j) * width; + for (int t = 0; t < context && (j - t) >= 0; ++t) { + int dyOff = off - t * width; + real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0; + sum += sw[t][tidx] * dyVal; + } + if (gidx + tidx < width) { + dx[off + gidx + tidx] += sum; + } + } + } +} + +__global__ void KeRowConvBwData2(real* dx, const real* w, const real* dy, + const int* starts, const int height, const int width, const int numSeq, + const int context) { + + const int tidx = threadIdx.x; + const int tidy = threadIdx.y; + const int blky = blockDim.y; + const int gidx = blockIdx.x * blockDim.x; + + for (int i = 0; i < numSeq; ++i) { + const int start = starts[i]; + const int end = starts[i + 1]; + const int steps = end - start; + for (int j = tidy; j < steps; j += blky) { + real sum = 0; + int off = (start + j) * width; + for (int t = 0; t < context && (j - t) >= 0; ++t) { + int dyOff = off - t * width; + real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0; + real wVal = gidx + tidx < width ? 
w[t * width + gidx + tidx] : 0.0; + sum += wVal * dyVal; + } + if (gidx + tidx < width) { + dx[off + gidx + tidx] += sum; + } + } + } +} + + +template <> +void RowConvGrad(const GpuMatrix& outG, + const GpuMatrix& in, + const GpuMatrix& filter, + GpuMatrix& inG, + GpuMatrix& filterG, + const GpuIVector& seq) { + const size_t numSeq = seq.getSize() - 1; + const size_t contextLength = filter.getHeight(); + const size_t height = in.getHeight(); + const size_t width = in.getWidth(); + + const real* dy = outG.getData(); + const real* x = in.getData(); + const real* w = filter.getData(); + const int* starts = seq.getData(); + + if (filterG) { + dim3 dimBlock(32, 32); + dim3 dimGrid(DIVUP(width, dimBlock.x), 1); + real* dw = filterG.getData(); + if (contextLength <= 32) { + KeRowConvBwWeight<32, 32, 32> + <<>> + (dw, x, dy, starts, height, width, numSeq, contextLength); + } else { + KeRowConvBwWeight2<32, 32> + <<>> + (dw, x, dy, starts, height, width, numSeq, contextLength); + } + } + + if (inG) { + real* dx = inG.getData(); + dim3 dimBlock2(32, 32); + dim3 dimGrid2(DIVUP(width, dimBlock2.x), 1); + if (contextLength <= 64) { + KeRowConvBwData<32, 64> + <<>> + (dx, w, dy, starts, height, width, numSeq, contextLength); + } else { + KeRowConvBwData2 + <<>> + (dx, w, dy, starts, height, width, numSeq, contextLength); + } + } + + CHECK_SYNC("RowConvGrad"); +} + +} // namespace paddle diff --git a/paddle/function/RowConvOpTest.cpp b/paddle/function/RowConvOpTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1c95d3ff2cccbf33f4c5f91f6daf340871a8f7b0 --- /dev/null +++ b/paddle/function/RowConvOpTest.cpp @@ -0,0 +1,62 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "FunctionTest.h" + +namespace paddle { + +void testRowConvFw(size_t batchSize, size_t dim, size_t contextLength) { + FunctionCompare test("RowConv", FuncConfig()); + + test.addSequence(SequenceIdArg(TensorShape{batchSize})); + test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim})); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{contextLength, dim})); + + test.addOutputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}), + ADD_TO); + + test.run(); +} + +void testRowConvBw(size_t batchSize, size_t dim, size_t contextLength) { + FunctionCompare test("RowConvGrad", FuncConfig()); + + test.addSequence(SequenceIdArg(TensorShape{batchSize})); + test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim})); + test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim})); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{contextLength, dim})); + + test.addOutputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}), + ADD_TO); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{contextLength, dim}), + ADD_TO); + + test.run(); +} + +TEST(RowConv, real) { + for (size_t numSamples : {17, 129, 2020}) { + for (size_t dim : {16, 512, 2560}) { + for (size_t context : {3, 19, 65}) { + VLOG(3) << " numSamples=" << numSamples << " dim=" << dim + << " context length=" << context; + testRowConvFw(numSamples, dim, context); + testRowConvBw(numSamples, dim, context); + } + } + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/RowConvLayer.cpp b/paddle/gserver/layers/RowConvLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..54d77999ad5b30a8d9f4feaa02d81417957544a7 --- /dev/null +++ b/paddle/gserver/layers/RowConvLayer.cpp @@ -0,0 +1,106 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "RowConvLayer.h" +#include "paddle/utils/Stat.h" + +namespace paddle { + +REGISTER_LAYER(row_conv, RowConvLayer); + +bool RowConvLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + contexLength_ = config_.inputs(0).row_conv_conf().context_length(); + + CHECK_EQ(inputLayers_.size(), 1UL); + weight_.reset(new Weight(contexLength_, getSize(), parameters_[0])); + createFunction(forward_, "RowConv", FuncConfig()); + createFunction(backward_, "RowConvGrad", FuncConfig()); + + return true; +} + +void RowConvLayer::forward(PassType passType) { + Layer::forward(passType); + MatrixPtr input = getInputValue(0); + size_t height = input->getHeight(); + size_t width = input->getWidth(); + CHECK_EQ(width, getSize()); + resetOutput(height, width); + + const auto startPos = getInput(0).sequenceStartPositions->getVector(useGpu_); + MatrixPtr w = weight_->getW(); + wDims_ = TensorShape({w->getHeight(), w->getWidth()}); + + MatrixPtr outV = getOutputValue(); + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getInputValue(0), *startPos); + inputs.addArg(*w, wDims_); + outputs.addArg(*getOutputValue(), *startPos, ADD_TO); + + { + REGISTER_TIMER_INFO("RowConvForward", getName().c_str()); + forward_[0]->calc(inputs, outputs); + } + + /* activation */ { + REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); + forwardActivation(); + } +} + +void RowConvLayer::backward(const UpdateCallback& callback) { + /* Do derivation */ { + REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str()); + backwardActivation(); + } + + const auto startPos = getInput(0).sequenceStartPositions->getVector(useGpu_); + + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getOutputGrad(), *startPos); + inputs.addArg(*getInputValue(0), *startPos); + inputs.addArg(*weight_->getW(), wDims_); + + MatrixPtr inGrad = getInputGrad(0); + MatrixPtr wGrad = weight_->getWGrad(); + size_t h = getInputValue(0)->getHeight(); + size_t w = getInputValue(0)->getWidth(); + outputs.addArg( + inGrad ? (*inGrad) : *(Matrix::create(nullptr, h, w, false, useGpu_)), + *startPos, + ADD_TO); + outputs.addArg( + wGrad ? (*wGrad) + : *(Matrix::create(nullptr, contexLength_, w, false, useGpu_)), + wDims_, + ADD_TO); + + { + REGISTER_TIMER_INFO("RowConvBackward", getName().c_str()); + backward_[0]->calc(inputs, outputs); + } + + { + REGISTER_TIMER_INFO("WeightUpdate", getName().c_str()); + weight_->getParameterPtr()->incUpdate(callback); + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/RowConvLayer.h b/paddle/gserver/layers/RowConvLayer.h new file mode 100644 index 0000000000000000000000000000000000000000..b3bdda2f3542b312eda531695c975ff2dfc29c93 --- /dev/null +++ b/paddle/gserver/layers/RowConvLayer.h @@ -0,0 +1,44 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" + +namespace paddle { + +/** + * \brief Row Convolution Layer. 
+ */ +class RowConvLayer : public Layer { +public: + explicit RowConvLayer(const LayerConfig& config) : Layer(config) {} + + ~RowConvLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; + +protected: + // Row convolution weight, context_lenght_ * fan_out. + // fan_out is the size of output feature. + std::unique_ptr weight_; + + // The step number to look ahead plus one equals contexLength_. + size_t contexLength_; + TensorShape wDims_; +}; +} // namespace paddle diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index e1e8e7fae7ca4c96206d60703db1f35aa1196875..6adffcf53b7966bd6f3d02970e5f07cc9802f469 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -1705,6 +1705,26 @@ TEST(Layer, TransLayer) { } } +TEST(Layer, RowConvLayer) { + const int context = 3; + const int size = 512; + + TestConfig config; + config.layerConfig.set_type("row_conv"); + config.layerConfig.set_size(size); + config.layerConfig.set_active_type("sigmoid"); + + config.inputDefs.push_back( + {INPUT_SEQUENCE_DATA, "layer_0", size, context * size}); + LayerInputConfig* input = config.layerConfig.add_inputs(); + RowConvConfig* conv = input->mutable_row_conv_conf(); + conv->set_context_length(context); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "row_conv", 100, false, useGpu, false); + } +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); initMain(argc, argv); diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp index 6d9365af2d14673146d9e427138bf6dd5f5b41b6..5beced3bb5a1050078f88dfd4350a2df71d27f35 100644 --- a/paddle/parameter/Argument.cpp +++ b/paddle/parameter/Argument.cpp @@ -632,7 +632,7 @@ void Argument::printValueString(std::ostream& stream, const std::string& prefix) const { std::unordered_map out; getValueString(&out); - for (auto field : {"value", "id", "sequence pos", "sub-sequence pos"}) { + for (auto field : {"value", "ids", "sequence pos", "sub-sequence pos"}) { auto it = out.find(field); if (it != out.end()) { stream << prefix << field << ":\n" << it->second; diff --git a/paddle/pserver/LightNetwork.cpp b/paddle/pserver/LightNetwork.cpp index 8c8ba0a2e51b85bde0544c6780b07130336a6bdd..922f25734dee0a6db7fbcfcef3d29d2bad5b7858 100644 --- a/paddle/pserver/LightNetwork.cpp +++ b/paddle/pserver/LightNetwork.cpp @@ -383,20 +383,23 @@ void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) { setOption(sockfd); /// Now connect to the server - int retry_second = 0; - int error = 0; + int retry_count = 0; do { - error = connect(sockfd, (sockaddr *)&serv_addr, sizeof(serv_addr)); - if (error == ECONNREFUSED) { + if (connect(sockfd, (sockaddr *)&serv_addr, sizeof(serv_addr)) == 0) { + break; + } + + if (errno == ECONNREFUSED) { LOG(WARNING) << "connection refused by pserver, try again!"; - if (retry_second++ >= 7) { + if (retry_count++ >= 7) { LOG(FATAL) << "connection refused by pserver, maybe pserver failed!"; } std::this_thread::sleep_for(std::chrono::seconds(1)); } else { - PCHECK(error >= 0) << "ERROR connecting to " << serverAddr; + PCHECK(errno != 0) << "ERROR connecting to " << serverAddr << ":" + << serverPort << "errorno: " << errno; } - } while (error == ECONNREFUSED); + } while (errno == ECONNREFUSED); channel_.reset(new SocketChannel(sockfd, serverAddr)); tcpRdma_ = F_TCP; diff 
--git a/paddle/strings/CMakeLists.txt b/paddle/strings/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..4e55eecd484c0e218ecd51bbd19b3eb4f6f92a25 --- /dev/null +++ b/paddle/strings/CMakeLists.txt @@ -0,0 +1,2 @@ +cc_library(stringpiece SRCS stringpiece.cc) +cc_test(stringpiece_test SRCS stringpiece_test.cc DEPS stringpiece glog gflags) diff --git a/paddle/strings/stringpiece.cc b/paddle/strings/stringpiece.cc new file mode 100644 index 0000000000000000000000000000000000000000..415b3558d5dfffde26275bcb16ea3922424ca9f3 --- /dev/null +++ b/paddle/strings/stringpiece.cc @@ -0,0 +1,141 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "paddle/strings/stringpiece.h" + +#include + +#include +#include +#include + +namespace paddle { + +StringPiece::StringPiece() : data_(NULL), size_(0) {} + +StringPiece::StringPiece(const char* d, size_t n) : data_(d), size_(n) { + if (d == NULL && n != 0) + throw std::invalid_argument( + "StringPiece requires len to be 0 for NULL data"); +} + +StringPiece::StringPiece(const char* s) : data_(s) { + size_ = (s == NULL) ? 0 : strlen(s); +} + +StringPiece::StringPiece(const std::string& s) + : data_(s.data()), size_(s.size()) {} + +char StringPiece::operator[](size_t n) const { + if (n >= len()) + throw std::invalid_argument("index out of StringPiece length"); + return data_[n]; +} + +int Compare(StringPiece a, StringPiece b) { + const size_t min_len = (a.len() < b.len()) ? a.len() : b.len(); + int r = memcmp(a.data(), b.data(), min_len); + if (r == 0) { + if (a.len() < b.len()) + return -1; + else if (a.len() > b.len()) + return 1; + } + return r; +} + +bool operator==(StringPiece x, StringPiece y) { + return ((x.len() == y.len()) && + (x.data() == y.data() || memcmp(x.data(), y.data(), x.len()) == 0)); +} + +bool operator!=(StringPiece x, StringPiece y) { return !(x == y); } + +bool operator<(StringPiece x, StringPiece y) { return Compare(x, y) < 0; } +bool operator>(StringPiece x, StringPiece y) { return Compare(x, y) > 0; } + +bool operator<=(StringPiece x, StringPiece y) { return Compare(x, y) <= 0; } +bool operator>=(StringPiece x, StringPiece y) { return Compare(x, y) >= 0; } + +bool HasPrefix(StringPiece s, StringPiece x) { + return ((s.len() >= x.len()) && (memcmp(s.data(), x.data(), x.len()) == 0)); +} + +bool HasSuffix(StringPiece s, StringPiece x) { + return ((s.len() >= x.len()) && + (memcmp(s.data() + (s.len() - x.len()), x.data(), x.len()) == 0)); +} + +StringPiece SkipPrefix(StringPiece s, size_t n) { + if (n > s.len()) + throw std::invalid_argument("Skip distance larger than StringPiece length"); + return StringPiece(s.data() + n, s.len() - n); +} + +StringPiece SkipSuffix(StringPiece s, size_t n) { + if (n > s.len()) + throw std::invalid_argument("Skip distance larger than StringPiece length"); + return StringPiece(s.data(), s.len() - n); +} + +StringPiece TrimPrefix(StringPiece s, StringPiece x) { + return HasPrefix(s, x) ? 
SkipPrefix(s, x.len()) : s; +} + +StringPiece TrimSuffix(StringPiece s, StringPiece x) { + return HasSuffix(s, x) ? SkipSuffix(s, x.len()) : s; +} + +bool Contains(StringPiece s, StringPiece sub) { + return std::search(s.begin(), s.end(), sub.begin(), sub.end()) != s.end(); +} + +size_t Index(StringPiece s, StringPiece sub) { + auto e = std::search(s.begin(), s.end(), sub.begin(), sub.end()); + return e != s.end() ? e - s.data() : StringPiece::npos; +} + +size_t Find(StringPiece s, char c, size_t pos) { + if (pos >= s.len()) { + return StringPiece::npos; + } + const char* result = + reinterpret_cast(memchr(s.data() + pos, c, s.len() - pos)); + return result != nullptr ? result - s.data() : StringPiece::npos; +} + +size_t RFind(StringPiece s, char c, size_t pos) { + if (s.len() == 0) return StringPiece::npos; + for (const char* p = s.data() + std::min(pos, s.len() - 1); p >= s.data(); + p--) { + if (*p == c) { + return p - s.data(); + } + } + return StringPiece::npos; +} + +StringPiece SubStr(StringPiece s, size_t pos, size_t n) { + if (pos > s.len()) pos = s.len(); + if (n > s.len() - pos) n = s.len() - pos; + return StringPiece(s.data() + pos, n); +} + +std::ostream& operator<<(std::ostream& o, StringPiece piece) { + return o << piece.ToString(); +} + +} // namespace paddle diff --git a/paddle/strings/stringpiece.h b/paddle/strings/stringpiece.h new file mode 100644 index 0000000000000000000000000000000000000000..adff713e86f49349b8f189c1d24584bfc1bb8aa7 --- /dev/null +++ b/paddle/strings/stringpiece.h @@ -0,0 +1,105 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#pragma once + +#include +#include + +namespace paddle { + +// StringPiece points into a std::string object but doesn't own the +// string. It is for efficient access to strings. Like Go's string +// type. Not that StringPiece doesn't mutate the underlying string, +// so it is thread-safe given that the underlying string doesn't +// change. Because StringPiece contains a little data members, and +// its syntax is simple as it doesn't own/manage the string, it is +// cheap to construct StringPieces and pass them around. +class StringPiece { +public: + static const size_t npos = static_cast(-1); + + // We provide non-explicit singleton constructors so users can + // pass in a "const char*" or a "string" wherever a "StringPiece" + // is expected. These contructors ensure that if data_ is NULL, + // size_ is 0. + StringPiece(); + StringPiece(const char* d, size_t n); + StringPiece(const char* d); + StringPiece(const std::string& s); + + const char* data() const { return data_; } + size_t len() const { return size_; } + + char operator[](size_t n) const; + + // StringPiece doesn't own the string, so both iterator and const + // iterator are const char* indeed. 
+ typedef const char* const_iterator; + typedef const char* iterator; + iterator begin() const { return data_; } + iterator end() const { return data_ + size_; } + + // Return a string that contains the copy of the referenced data. + std::string ToString() const { return std::string(data_, size_); } + +private: + const char* data_; + size_t size_; + + // Intentionally copyable +}; + +int Compare(StringPiece a, StringPiece b); + +bool operator==(StringPiece x, StringPiece y); +bool operator!=(StringPiece x, StringPiece y); +bool operator<(StringPiece x, StringPiece y); +bool operator>(StringPiece x, StringPiece y); +bool operator<=(StringPiece x, StringPiece y); +bool operator>=(StringPiece x, StringPiece y); + +bool HasPrefix(StringPiece s, StringPiece prefix); +bool HasSuffix(StringPiece s, StringPiece suffix); + +StringPiece SkipPrefix(StringPiece s, size_t n); +StringPiece SkipSuffix(StringPiece s, size_t n); + +// Skip the prefix (or suffix) if it matches with the string. +StringPiece TrimPrefix(StringPiece s, StringPiece prefix); +StringPiece TrimSuffix(StringPiece s, StringPiece suffix); + +// Returns if s contains sub. Any s except for empty s contains an +// empty sub. +bool Contains(StringPiece s, StringPiece sub); + +// Return the first occurrence of sub in s, or npos. If both s and +// sub is empty, it returns npos; otherwise, if only sub is empty, it +// returns 0. +size_t Index(StringPiece s, StringPiece sub); + +// Return the first occurrence of c in s[pos:end], or npos. +size_t Find(StringPiece s, char c, size_t pos); + +// Search range is [0..pos] inclusive. If pos == npos, search everything. +size_t RFind(StringPiece s, char c, size_t pos); + +StringPiece SubStr(StringPiece s, size_t pos, size_t n); + +// allow StringPiece to be logged +std::ostream& operator<<(std::ostream& o, StringPiece piece); + +} // namespace paddle diff --git a/paddle/strings/stringpiece_test.cc b/paddle/strings/stringpiece_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..2ba66a04f641c3457efa713383484491a213668f --- /dev/null +++ b/paddle/strings/stringpiece_test.cc @@ -0,0 +1,293 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "paddle/strings/stringpiece.h" + +#include + +#include "gtest/gtest.h" + +TEST(StringPiece, Construct) { + { + paddle::StringPiece s; + EXPECT_EQ(NULL, s.data()); + EXPECT_EQ(0U, s.len()); + } + { EXPECT_THROW(paddle::StringPiece s(NULL, 10000U), std::invalid_argument); } + { + paddle::StringPiece s(NULL); + EXPECT_EQ(0U, s.len()); + } + { + std::string a; + EXPECT_EQ(0U, a.size()); + paddle::StringPiece s(a); + EXPECT_EQ(0U, s.len()); + } +} + +TEST(StringPiece, CopyAndAssign) { + paddle::StringPiece empty; + EXPECT_EQ(0U, empty.len()); + + paddle::StringPiece a("hello"); + paddle::StringPiece b = a; + EXPECT_EQ(b.len(), strlen("hello")); + EXPECT_EQ(a, b); + + std::string storage("hello"); + paddle::StringPiece c(storage); + EXPECT_EQ(a, c); + EXPECT_NE(a.data(), c.data()); +} + +TEST(StringPiece, Compare) { + { + paddle::StringPiece a("hello"); + paddle::StringPiece b("world"); + EXPECT_TRUE(a != b); + EXPECT_FALSE(a == b); + EXPECT_TRUE(a < b); + EXPECT_TRUE(a <= b); + EXPECT_FALSE(a > b); + EXPECT_FALSE(a >= b); + EXPECT_LT(Compare(a, b), 0); + EXPECT_GT(Compare(b, a), 0); + } + { + paddle::StringPiece a, b; + EXPECT_TRUE(a == b); + EXPECT_FALSE(a != b); + EXPECT_FALSE(a < b); + EXPECT_FALSE(a > b); + EXPECT_TRUE(a <= b); + EXPECT_TRUE(a >= b); + EXPECT_EQ(0, Compare(a, b)); + EXPECT_EQ(0, Compare(b, a)); + } +} + +TEST(StringPiece, ToString) { + { + paddle::StringPiece s; + EXPECT_EQ(std::string(""), s.ToString()); + } + { + paddle::StringPiece s(NULL); + EXPECT_EQ(std::string(""), s.ToString()); + } + { + paddle::StringPiece s("hello"); + EXPECT_EQ(std::string("hello"), s.ToString()); + } +} + +TEST(StringPiece, HasPrefixSuffix) { + using paddle::HasPrefix; + using paddle::HasSuffix; + { + paddle::StringPiece s; + EXPECT_FALSE(HasPrefix(s, "something")); + EXPECT_TRUE(HasPrefix(s, "")); + EXPECT_FALSE(HasSuffix(s, "something")); + EXPECT_TRUE(HasSuffix(s, "")); + } + { + paddle::StringPiece s("app"); + EXPECT_TRUE(HasPrefix(s, "")); + EXPECT_TRUE(HasPrefix(s, "a")); + EXPECT_TRUE(HasPrefix(s, "ap")); + EXPECT_TRUE(HasPrefix(s, "app")); + + EXPECT_TRUE(HasSuffix(s, "")); + EXPECT_TRUE(HasSuffix(s, "p")); + EXPECT_TRUE(HasSuffix(s, "pp")); + EXPECT_TRUE(HasSuffix(s, "app")); + } +} + +TEST(StringPiece, SkipPrefixSuffix) { + using paddle::SkipPrefix; + using paddle::SkipSuffix; + { + paddle::StringPiece s; + EXPECT_EQ("", SkipPrefix(s, 0)); + EXPECT_THROW(SkipPrefix(s, 1), std::invalid_argument); + + EXPECT_EQ("", SkipSuffix(s, 0)); + EXPECT_THROW(SkipSuffix(s, 1), std::invalid_argument); + } + { + paddle::StringPiece s("app"); + EXPECT_EQ("app", SkipPrefix(s, 0)); + EXPECT_EQ("pp", SkipPrefix(s, 1)); + EXPECT_EQ("p", SkipPrefix(s, 2)); + EXPECT_EQ("", SkipPrefix(s, 3)); + EXPECT_THROW(SkipPrefix(s, 4), std::invalid_argument); + + EXPECT_EQ("app", SkipSuffix(s, 0)); + EXPECT_EQ("ap", SkipSuffix(s, 1)); + EXPECT_EQ("a", SkipSuffix(s, 2)); + EXPECT_EQ("", SkipSuffix(s, 3)); + EXPECT_THROW(SkipSuffix(s, 4), std::invalid_argument); + } +} + +TEST(StringPiece, TrimPrefixSuffix) { + using paddle::TrimPrefix; + using paddle::TrimSuffix; + { + paddle::StringPiece s; + EXPECT_EQ("", TrimPrefix(s, "")); + EXPECT_EQ("", TrimPrefix(s, "something")); + + EXPECT_EQ("", TrimSuffix(s, "")); + EXPECT_EQ("", TrimSuffix(s, "something")); + } + { + paddle::StringPiece s("app"); + EXPECT_EQ("app", TrimPrefix(s, "")); + EXPECT_EQ("pp", TrimPrefix(s, "a")); + EXPECT_EQ("p", TrimPrefix(s, "ap")); + EXPECT_EQ("", TrimPrefix(s, "app")); + EXPECT_EQ("app", TrimPrefix(s, "something")); + + 
EXPECT_EQ("app", TrimSuffix(s, "")); + EXPECT_EQ("ap", TrimSuffix(s, "p")); + EXPECT_EQ("a", TrimSuffix(s, "pp")); + EXPECT_EQ("", TrimSuffix(s, "app")); + EXPECT_EQ("app", TrimSuffix(s, "something")); + } +} + +TEST(StringPiece, Contains) { + using paddle::Contains; + { + paddle::StringPiece s; + EXPECT_FALSE(Contains(s, "")); + EXPECT_FALSE(Contains(s, "something")); + } + { + paddle::StringPiece s("app"); + EXPECT_TRUE(Contains(s, "")); + EXPECT_TRUE(Contains(s, "a")); + EXPECT_TRUE(Contains(s, "p")); + EXPECT_TRUE(Contains(s, "ap")); + EXPECT_TRUE(Contains(s, "pp")); + EXPECT_TRUE(Contains(s, "app")); + EXPECT_FALSE(Contains(s, "something")); + } +} + +TEST(StringPiece, Index) { + using paddle::Index; + auto npos = paddle::StringPiece::npos; + { + paddle::StringPiece s; + EXPECT_EQ(npos, Index(s, "")); + EXPECT_EQ(npos, Index(s, "something")); + } + { + paddle::StringPiece s("app"); + EXPECT_EQ(0U, Index(s, "")); + EXPECT_EQ(0U, Index(s, "a")); + EXPECT_EQ(1U, Index(s, "p")); + EXPECT_EQ(0U, Index(s, "ap")); + EXPECT_EQ(1U, Index(s, "pp")); + EXPECT_EQ(0U, Index(s, "app")); + EXPECT_EQ(npos, Index(s, "something")); + } +} + +TEST(StringPiece, Find) { + using paddle::Find; + auto npos = paddle::StringPiece::npos; + { + paddle::StringPiece s; + EXPECT_EQ(npos, Find(s, 'a', 0U)); + } + { + paddle::StringPiece s("app"); + EXPECT_EQ(0U, Find(s, 'a', 0U)); + EXPECT_EQ(1U, Find(s, 'p', 0U)); + EXPECT_EQ(1U, Find(s, 'p', 1U)); + EXPECT_EQ(2U, Find(s, 'p', 2U)); + EXPECT_EQ(npos, Find(s, 'z', 2U)); + } +} + +TEST(StringPiece, RFind) { + using paddle::RFind; + auto npos = paddle::StringPiece::npos; + { + paddle::StringPiece s; + EXPECT_EQ(npos, RFind(s, 'a', 0U)); + } + { + paddle::StringPiece s("app"); + EXPECT_EQ(2U, RFind(s, 'p', 2U)); + EXPECT_EQ(0U, RFind(s, 'a', 2U)); + EXPECT_EQ(1U, RFind(s, 'p', 1U)); + EXPECT_EQ(0U, RFind(s, 'a', 0)); + EXPECT_EQ(npos, RFind(s, 'z', 2U)); + } +} + +TEST(StringPiece, SubStr) { + using paddle::SubStr; + { + paddle::StringPiece s; + EXPECT_EQ("", SubStr(s, 0, 0)); + EXPECT_EQ("", SubStr(s, 0, 1)); + EXPECT_EQ("", SubStr(s, 1, 0)); + } + { + paddle::StringPiece s("app"); + EXPECT_EQ("", SubStr(s, 0, 0)); + EXPECT_EQ("", SubStr(s, 1, 0)); + EXPECT_EQ("", SubStr(s, 2, 0)); + EXPECT_EQ("", SubStr(s, 3, 0)); + + EXPECT_EQ("a", SubStr(s, 0, 1)); + EXPECT_EQ("p", SubStr(s, 1, 1)); + EXPECT_EQ("p", SubStr(s, 2, 1)); + EXPECT_EQ("", SubStr(s, 3, 1)); + + EXPECT_EQ("ap", SubStr(s, 0, 2)); + EXPECT_EQ("pp", SubStr(s, 1, 2)); + EXPECT_EQ("p", SubStr(s, 2, 2)); + EXPECT_EQ("", SubStr(s, 3, 2)); + + EXPECT_EQ("app", SubStr(s, 0, 3)); + EXPECT_EQ("pp", SubStr(s, 1, 3)); + EXPECT_EQ("p", SubStr(s, 2, 3)); + EXPECT_EQ("", SubStr(s, 3, 3)); + } +} + +TEST(StringPiece, StreamOutput) { + using paddle::StringPiece; + + std::stringstream o; + o << StringPiece(); + EXPECT_EQ("", o.str()); + + o << StringPiece("hello"); + EXPECT_EQ("hello", o.str()); + + o << StringPiece(); + EXPECT_EQ("hello", o.str()); +} diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto index 4f9b53d6f6553e55406dd000029a598a92fd2fb6..29270829bbc3af6990aaf03a5228ef7f6a892a5c 100644 --- a/proto/ModelConfig.proto +++ b/proto/ModelConfig.proto @@ -194,6 +194,10 @@ message MaxOutConfig { required uint32 groups = 2; } +message RowConvConfig { + required uint32 context_length = 1; +} + message ProjectionConfig { required string type = 1; required string name = 2; @@ -279,6 +283,7 @@ message LayerInputConfig { optional SppConfig spp_conf = 12; optional PriorBoxConfig priorbox_conf = 13; optional PadConfig 
pad_conf = 14; + optional RowConvConfig row_conv_conf = 15; } message LayerConfig { diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 5d540664a7f56b4fc27ecd5dc46bf36b0268eb98..0792e2d40b43f5fb2de8d6bb43a62cfa23f77082 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -73,7 +73,6 @@ To use this from paddle_trainer, paddle_trainer should be called with --config_args=extension_module_name=[MODULE_NAME] ''' - import copy import logging import os @@ -1731,9 +1730,10 @@ class ParameterReluLayer(LayerBase): def __init__(self, name, inputs, partial_sum=1, **args): super(ParameterReluLayer, self).__init__( name, self.layer_type, 0, inputs=inputs, **args) - config_assert(len(self.inputs) == 1) - config_assert(self.input_layer.size % partial_sum == 0) input_layer = self.get_input_layer(0) + config_assert(len(self.inputs) == 1, "prelu layer has only one input.") + config_assert(input_layer.size % partial_sum == 0, + "a wrong setting for partial_sum") self.set_layer_size(input_layer.size) self.create_input_parameter(0, input_layer.size / partial_sum) @@ -2081,6 +2081,23 @@ class MaxOutLayer(LayerBase): g_layer_map[input_layer.name].width, out_channels) +@config_layer('row_conv') +class RowConvLayer(LayerBase): + def __init__(self, name, inputs, context_length, **xargs): + super(RowConvLayer, self).__init__( + name, 'maxout', 0, inputs=inputs, **xargs) + config_assert( + len(self.inputs) == 1, + 'TransLayer must have one and only one input') + input_layer = self.get_input_layer(0) + row_conv_conf = self.config.inputs[0].row_conv_conf + row_conv_conf.context_length = context_length + self.set_layer_size(input_layer.size) + psize = context_length * input_layer.size + dims = [context_length, input_layer.size] + self.create_input_parameter(0, psize, dims) + + # key: cost type # value: cost class g_cost_map = {} @@ -3546,11 +3563,7 @@ def update_g_config(): return g_config -def begin_parse(config_arg_str=''): - ''' - @param config_arg_str: a string of the form var1=val1,var2=val2. It will be - passed to config script as a dictionary CONFIG_ARGS - ''' +def begin_parse(): init_config_environment() for hook in _parse_config_hooks: hook() @@ -3568,8 +3581,12 @@ def begin_parse(config_arg_str=''): def parse_config(trainer_config, config_arg_str): - begin_parse(config_arg_str) + ''' + @param config_arg_str: a string of the form var1=val1,var2=val2. 
It will be + passed to config script as a dictionary CONFIG_ARGS + ''' + begin_parse() config_args = {} if config_arg_str: diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 5667e5ff2bccd38f2da00a3b17ea8bc8e3a6fb8e..2d8ddbb9007b241eb1986887d8ea6c2de8235c29 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -31,31 +31,31 @@ except ImportError: import copy __all__ = [ - "full_matrix_projection", - "AggregateLevel", - "ExpandLevel", - "identity_projection", - "dotmul_projection", - "dotmul_operator", - "repeat_layer", - "seq_reshape_layer", - "table_projection", - "mixed_layer", - "data_layer", - "embedding_layer", - "fc_layer", - "grumemory", - "pooling_layer", - "lstmemory", - "last_seq", - "first_seq", - "cos_sim", - "hsigmoid", - "conv_projection", - "mse_cost", - "regression_cost", + 'full_matrix_projection', + 'AggregateLevel', + 'ExpandLevel', + 'identity_projection', + 'dotmul_projection', + 'dotmul_operator', + 'repeat_layer', + 'seq_reshape_layer', + 'table_projection', + 'mixed_layer', + 'data_layer', + 'embedding_layer', + 'fc_layer', + 'grumemory', + 'pooling_layer', + 'lstmemory', + 'last_seq', + 'first_seq', + 'cos_sim', + 'hsigmoid', + 'conv_projection', + 'mse_cost', + 'regression_cost', 'classification_cost', - "LayerOutput", + 'LayerOutput', 'img_conv_layer', 'img_pool_layer', 'batch_norm_layer', @@ -121,6 +121,9 @@ __all__ = [ 'smooth_l1_cost', 'layer_support', 'multiplex_layer', + 'row_conv_layer', + 'dropout_layer', + 'prelu_layer', ] @@ -129,26 +132,26 @@ class LayerType(object): Layer type enumerations. """ - DATA = "data" - MIXED_LAYER = "mixed" - LSTMEMORY = "lstmemory" - GRUMEMORY = "gated_recurrent" - SEQUENCE_LAST_INSTANCE = "seqlastins" - SEQUENCE_FIRST_INSTANCE = "seqfirstins" - SEQUENCE_RESHAPE = "seqreshape" - POOLING_MAX = "max" + DATA = 'data' + MIXED_LAYER = 'mixed' + LSTMEMORY = 'lstmemory' + GRUMEMORY = 'gated_recurrent' + SEQUENCE_LAST_INSTANCE = 'seqlastins' + SEQUENCE_FIRST_INSTANCE = 'seqfirstins' + SEQUENCE_RESHAPE = 'seqreshape' + POOLING_MAX = 'max' POOLING_AVG = 'average' - FC_LAYER = "fc" + FC_LAYER = 'fc' COST = 'cost' COSINE_SIM_VEC = 'cos_vm' COSINE_SIM = 'cos' HSIGMOID = 'hsigmoid' - CONV_LAYER = "conv" - CONVTRANS_LAYER = "convt" - EXCONV_LAYER = "exconv" - EXCONVTRANS_LAYER = "exconvt" - CUDNNCONV_LAYER = "cudnn_conv" - POOL_LAYER = "pool" + CONV_LAYER = 'conv' + CONVTRANS_LAYER = 'convt' + EXCONV_LAYER = 'exconv' + EXCONVTRANS_LAYER = 'exconvt' + CUDNNCONV_LAYER = 'cudnn_conv' + POOL_LAYER = 'pool' BATCH_NORM_LAYER = 'batch_norm' NORM_LAYER = 'norm' SUM_TO_ONE_NORM_LAYER = 'sum_to_one_norm' @@ -188,25 +191,28 @@ class LayerType(object): SPP_LAYER = "spp" PAD_LAYER = "pad" MULTIPLEX_LAYER = "multiplex" + ROW_CONV_LAYER = "row_conv" - PRINT_LAYER = "print" - PRIORBOX_LAYER = "priorbox" + PRINT_LAYER = 'print' + PRIORBOX_LAYER = 'priorbox' - CTC_LAYER = "ctc" - WARP_CTC_LAYER = "warp_ctc" - CRF_LAYER = "crf" - CRF_DECODING_LAYER = "crf_decoding" + CTC_LAYER = 'ctc' + WARP_CTC_LAYER = 'warp_ctc' + CRF_LAYER = 'crf' + CRF_DECODING_LAYER = 'crf_decoding' NCE_LAYER = 'nce' - RANK_COST = "rank-cost" - LAMBDA_COST = "lambda_cost" - HUBER = "huber" - CROSS_ENTROPY = "multi-class-cross-entropy" - CROSS_ENTROPY_WITH_SELFNORM = "multi_class_cross_entropy_with_selfnorm" - SOFT_BIN_CLASS_CROSS_ENTROPY = "soft_binary_class_cross_entropy" - MULTI_BIN_LABEL_CROSS_ENTROPY = "multi_binary_label_cross_entropy" - SUM_COST = "sum_cost" 
- SMOOTH_L1 = "smooth_l1" + RANK_COST = 'rank-cost' + LAMBDA_COST = 'lambda_cost' + HUBER = 'huber' + CROSS_ENTROPY = 'multi-class-cross-entropy' + CROSS_ENTROPY_WITH_SELFNORM = 'multi_class_cross_entropy_with_selfnorm' + SOFT_BIN_CLASS_CROSS_ENTROPY = 'soft_binary_class_cross_entropy' + MULTI_BIN_LABEL_CROSS_ENTROPY = 'multi_binary_label_cross_entropy' + SUM_COST = 'sum_cost' + SMOOTH_L1 = 'smooth_l1' + + PRELU = 'prelu' @staticmethod def is_layer_type(type_name): @@ -3768,7 +3774,6 @@ def beam_search(step, assert generated_input_index != -1 gipt = input[generated_input_index] - assert isinstance(gipt, BaseGeneratedInput) gipt.bos_id = bos_id gipt.eos_id = eos_id @@ -3788,7 +3793,6 @@ def beam_search(step, predict = gipt.after_real_step(step(*args)) eos_layer(input=predict, eos_id=eos_id, name=eos_name) - return predict tmp = recurrent_group( @@ -3860,7 +3864,6 @@ def classification_cost(input, label, weight=None, name=None, - top_k=None, evaluator=classification_error_evaluator, layer_attr=None): """ @@ -3875,8 +3878,6 @@ def classification_cost(input, :param weight: The weight affects the cost, namely the scale of cost. It is an optional argument. :type weight: LayerOutput - :param top_k: number k in top-k error rate - :type top_k: int :param evaluator: Evaluator method. :param layer_attr: layer's extra attribute. :type layer_attr: ExtraLayerAttribute @@ -3904,7 +3905,7 @@ def classification_cost(input, assert isinstance(e.for_classification, bool) assert e.for_classification - e(name=e.__name__, input=input, label=label, weight=weight, top_k=top_k) + e(name=e.__name__, input=input, label=label, weight=weight) if not isinstance(evaluator, collections.Sequence): evaluator = [evaluator] @@ -4725,7 +4726,7 @@ def ctc_layer(input, fc_layer with softmax activation, should be num_classes + 1. The size of ctc_layer should also be num_classes + 1. - The simple usage: + The example usage is: .. code-block:: python @@ -4812,7 +4813,7 @@ def warp_ctc_layer(input, - As a native 'softmax' activation is interated to the warp-ctc library, 'linear' activation is expected instead in the 'input' layer. - The simple usage: + The example usage is: .. code-block:: python @@ -4873,7 +4874,7 @@ def crf_layer(input, A layer for calculating the cost of sequential conditional random field model. - The simple usage: + The example usage is: .. code-block:: python @@ -4947,7 +4948,7 @@ def crf_decoding_layer(input, this layer will also calculate error. output.value[i] is 1 for incorrect decoding or 0 for correct decoding. - The simple usage: + The example usage is: .. code-block:: python @@ -5140,7 +5141,7 @@ def rank_cost(left, - :math:`o_i` and :math:`o_j`: the left output and right output. Their dimension is one. - The simple usage: + The example usage is: .. code-block:: python @@ -5197,7 +5198,7 @@ def lambda_cost(input, """ lambdaCost for lambdaRank LTR approach. - The simple usage: + The example usage is: .. code-block:: python @@ -5255,6 +5256,8 @@ def cross_entropy(input, """ A loss layer for multi class entropy. + The example usage is: + .. code-block:: python cost = cross_entropy(input=input_layer, @@ -5301,6 +5304,8 @@ def cross_entropy_with_selfnorm(input, A loss layer for multi class entropy with selfnorm. Input should be a vector of positive numbers, without normalization. + The example usage is: + .. 
code-block:: python cost = cross_entropy_with_selfnorm(input=input_layer,
@@ -5342,6 +5347,8 @@ def sum_cost(input, name=None, layer_attr=None): """ A loss layer which calculate the sum of the input as loss + The example usage is: + .. code-block:: python cost = sum_cost(input=input_layer)
@@ -5371,6 +5378,8 @@ def huber_cost(input, label, name=None, coeff=1.0, layer_attr=None): """ A loss layer for huber loss. + The example usage is: + .. code-block:: python cost = huber_cost(input=input_layer,
@@ -5411,6 +5420,8 @@ def multi_binary_label_cross_entropy(input, """ A loss layer for multi binary label cross entropy. + The example usage is: + .. code-block:: python cost = multi_binary_label_cross_entropy(input=input_layer,
@@ -5470,6 +5481,8 @@ def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None): More details can be found by referring to `Fast R-CNN `_ + The example usage is: + .. code-block:: python cost = smooth_l1_cost(input=input_layer,
@@ -5519,6 +5532,8 @@ def multiplex_layer(input, name=None, layer_attr=None): where, y is output. :math:`x_{k}` is the k-th input layer and :math:`k = x_{0}[i] + 1`. + The example usage is: + .. code-block:: python maxid = multiplex_layer(input=layers)
@@ -5551,3 +5566,155 @@ def multiplex_layer(input, name=None, layer_attr=None): layer_type=LayerType.MULTIPLEX_LAYER, parents=input, size=l.config.size) + + +@wrap_name_default("dropout") +def dropout_layer(input, dropout_rate, name=None): + """ + The dropout layer randomly zeroes each element of the input with probability dropout_rate. It is implemented as an addto_layer with linear activation, no bias, and drop_rate set in its layer attribute. + + :param name: The name of this layer. It is optional. + :type name: basestring + :param input: The input layer. + :type input: LayerOutput + :param dropout_rate: The probability of dropping an element. + :type dropout_rate: float + :return: LayerOutput object. + :rtype: LayerOutput + """ + return addto_layer( + name=name, + input=input, + act=LinearActivation(), + bias_attr=False, + layer_attr=ExtraAttr(drop_rate=dropout_rate)) + + +@wrap_name_default() +@wrap_act_default(act=LinearActivation()) +@wrap_param_attr_default() +@layer_support(DROPOUT) +def row_conv_layer(input, + context_len, + act=None, + name=None, + param_attr=None, + layer_attr=None): + """ + + The row convolution is also called lookahead convolution. It was first + introduced in the paper `Deep Speech 2: End-to-End Speech Recognition + in English and Mandarin `_ . + + A bidirectional RNN learns a representation for a sequence by + performing a forward and a backward pass over the entire sequence. + However, unlike unidirectional RNNs, bidirectional RNNs are challenging + to deploy in an online, low-latency setting. The lookahead convolution + incorporates information from future subsequences in a computationally + efficient manner to improve unidirectional recurrent neural networks. + + The connection pattern of row convolution differs from the 1D sequence + convolution. Assume the future context length is k, that is to say, + the output at timestep t is computed from the input features from the t-th + timestep to the (t+k)-th timestep. Assume the hidden dimension of the input + activations is d; the activations r_t of the new layer at timestep t are: + + .. math:: + + r_{t,i} = \sum_{j=1}^{k + 1} {w_{i,j}h_{t+j-1, i}} + \quad \text{for} \quad (1 \leq i \leq d) + + Note: + The `context_len` is `k + 1`. That is to say, the lookahead step + number plus one equals context_len. + + The example usage is: + + .. code-block:: python + + row_conv = row_conv_layer(input=input_layer, context_len=3) + + + :param input: The input layer. + :type input: LayerOutput + :param context_len: The context length equals the lookahead step number + plus one. + :type context_len: int + :param act: Activation Type. Default is linear activation.
+ :type act: BaseActivation + :param param_attr: The Parameter Attribute. If None, the parameter will be + initialized smartly. It is better to set it yourself. + :type param_attr: ParameterAttribute + :param layer_attr: Extra Layer config. + :type layer_attr: ExtraLayerAttribute|None + :return: LayerOutput object. + :rtype: LayerOutput + + """ + assert isinstance(input, LayerOutput) + assert context_len > 0, "the context_len must be greater than 0." + + Layer( + inputs=[Input(input.name, **param_attr.attr)], + name=name, + context_length=context_len, + type=LayerType.ROW_CONV_LAYER, + active_type=act.name, + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput( + name, LayerType.ROW_CONV_LAYER, input, activation=act, size=input.size)
+ + +@layer_support() +@wrap_name_default() +@wrap_param_attr_default() +def prelu_layer(input, + name=None, + partial_sum=1, + param_attr=None, + layer_attr=None): + """ + The Parametric ReLU (PReLU) activation activates the output with a learnable weight. + + Reference: + Delving Deep into Rectifiers: Surpassing Human-Level Performance on + ImageNet Classification http://arxiv.org/pdf/1502.01852v1.pdf + + .. math:: + z_i &\\quad if \\quad z_i > 0 \\\\ + a_i * z_i &\\quad \\mathrm{otherwise} + + The example usage is: + + .. code-block:: python + + prelu = prelu_layer(input=layers, partial_sum=1) + + :param name: Name of this layer. + :type name: basestring + :param input: The input layer. + :type input: LayerOutput + :param partial_sum: This parameter makes a group of inputs share the same weight. + + - partial_sum = 1, indicates the element-wise activation: each element has a weight. + - partial_sum = number of elements in one channel, indicates the channel-wise activation, elements in a channel share the same weight. + - partial_sum = number of outputs, indicates that all elements share the same weight. + + :type partial_sum: int + :param param_attr: The parameter attribute. See ParameterAttribute for details. + :type param_attr: ParameterAttribute|None + :param layer_attr: Extra layer configurations. Default is None. + :type layer_attr: ExtraLayerAttribute|None + :return: LayerOutput object.
+ :rtype: LayerOutput + """ + + assert isinstance(input, LayerOutput), 'prelu_layer only accepts one input' + assert isinstance(param_attr, ParameterAttribute) + + l = Layer( + name=name, + type=LayerType.PRELU, + inputs=Input(input.name, **param_attr.attr), + partial_sum=partial_sum, + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput( + name=name, + layer_type=LayerType.PRELU, + parents=input, + size=l.config.size) diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index fb533a47e0b0585be6f0e019086993f8b3aa7f38..1bf59ed4840ae69afc5bce49c86a08b60e9603ee 100755 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -26,10 +26,10 @@ from paddle.trainer.config_parser import * __all__ = [ 'sequence_conv_pool', 'simple_lstm', "simple_img_conv_pool", - "img_conv_bn_pool", 'dropout_layer', 'lstmemory_group', 'lstmemory_unit', - 'small_vgg', 'img_conv_group', 'vgg_16_network', 'gru_unit', 'gru_group', - 'simple_gru', 'simple_attention', 'simple_gru2', 'bidirectional_gru', - 'text_conv_pool', 'bidirectional_lstm', 'inputs', 'outputs' + "img_conv_bn_pool", 'lstmemory_group', 'lstmemory_unit', 'small_vgg', + 'img_conv_group', 'vgg_16_network', 'gru_unit', 'gru_group', 'simple_gru', + 'simple_attention', 'simple_gru2', 'bidirectional_gru', 'text_conv_pool', + 'bidirectional_lstm', 'inputs', 'outputs' ] ###################################################### @@ -1366,29 +1366,6 @@ def simple_attention(encoded_sequence, input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name) -############################################################################ -# Miscs # -############################################################################ - - -@wrap_name_default("dropout") -def dropout_layer(input, dropout_rate, name=None): - """ - @TODO(yuyang18): Add comments. - - :param name: - :param input: - :param dropout_rate: - :return: - """ - return addto_layer( - name=name, - input=input, - act=LinearActivation(), - bias_attr=False, - layer_attr=ExtraAttr(drop_rate=dropout_rate)) - - def inputs(layers, *args): """ Declare the inputs of network. 
The order of input should be as same as diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh index 981ccbf248391b5db4339570d918404df6033f3d..c24102255f5bbed0f551b2dbfec20be7daf5f5b4 100755 --- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh +++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh @@ -5,6 +5,7 @@ last_first_seq test_expand_layer test_ntm_layers test_hsigmoid img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops -test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer) +test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer +test_prelu_layer test_row_conv) export whole_configs=(test_split_datasource) diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr new file mode 100644 index 0000000000000000000000000000000000000000..64d227565f2b21ff43d4391c682ca90c0f47908e --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr @@ -0,0 +1,36 @@ +type: "nn" +layers { + name: "input" + type: "data" + size: 300 + active_type: "" +} +layers { + name: "__prelu_layer_0__" + type: "prelu" + size: 300 + active_type: "" + inputs { + input_layer_name: "input" + input_parameter_name: "___prelu_layer_0__.w0" + } +} +parameters { + name: "___prelu_layer_0__.w0" + size: 300 + initial_mean: 0.0 + initial_std: 0.057735026919 + initial_strategy: 0 + initial_smart: true +} +input_layer_names: "input" +output_layer_names: "__prelu_layer_0__" +sub_models { + name: "root" + layer_names: "input" + layer_names: "__prelu_layer_0__" + input_layer_names: "input" + output_layer_names: "__prelu_layer_0__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_conv.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_conv.protostr new file mode 100644 index 0000000000000000000000000000000000000000..9ec15d2a19ec50a1729f9eeaa6dce8b1153c776b --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_conv.protostr @@ -0,0 +1,41 @@ +type: "nn" +layers { + name: "data" + type: "data" + size: 2560 + active_type: "" +} +layers { + name: "__row_conv_layer_0__" + type: "maxout" + size: 2560 + active_type: "relu" + inputs { + input_layer_name: "data" + input_parameter_name: "___row_conv_layer_0__.w0" + row_conv_conf { + context_length: 19 + } + } +} +parameters { + name: "___row_conv_layer_0__.w0" + size: 48640 + initial_mean: 0.0 + initial_std: 0.229415733871 + dims: 19 + dims: 2560 + initial_strategy: 0 + initial_smart: true +} +input_layer_names: "data" +output_layer_names: "__row_conv_layer_0__" +sub_models { + name: "root" + layer_names: "data" + layer_names: "__row_conv_layer_0__" + input_layer_names: "data" + output_layer_names: "__row_conv_layer_0__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..2e3057f323db22ffc3911cce30ec2e8bb95e3dbe --- /dev/null +++ 
b/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py @@ -0,0 +1,6 @@ +from paddle.trainer_config_helpers import * + +data = data_layer(name='input', size=300) +prelu = prelu_layer(input=data) + +outputs(prelu) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py b/python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py new file mode 100644 index 0000000000000000000000000000000000000000..ab33c496b0663d8472ce4b272be6c5cecbcfc978 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py @@ -0,0 +1,9 @@ +from paddle.trainer_config_helpers import * + +settings(batch_size=1000, learning_rate=1e-5) + +data = data_layer(name='data', size=2560) + +row_conv = row_conv_layer(input=data, context_len=19, act=ReluActivation()) + +outputs(row_conv) diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py index 815635f5dd4654fe3a31a9244e6e4473c397dd2f..aeed9ebd7d4d64efa5d0bf1638742a485c0fa44a 100644 --- a/python/paddle/v2/layer.py +++ b/python/paddle/v2/layer.py @@ -13,7 +13,7 @@ # limitations under the License. """ `paddle.v2.layer` is a part of model config packages in paddle.v2. In API v2, -we want to make Paddle a plain Python package. The model config package defined +we want to make Paddle a plain Python package. The model config package defines the way how to configure a neural network topology in Paddle Python code. The primary usage shows below. @@ -30,7 +30,6 @@ The primary usage shows below. # use prediction instance where needed. parameters = paddle.parameters.create(cost) """ - import collections import copy import re @@ -44,9 +43,10 @@ __all__ = ['data', 'parse_network'] def __need_to_keep__(name): - if name in ['StaticInput', 'LayerType', 'layer_support']: - return False - return True + return name in [ + 'StaticInput', 'SubsequenceInput', 'GeneratedInput', 'LayerType', + 'layer_support' + ] def __need_to_wrap__(name): @@ -54,6 +54,8 @@ def __need_to_wrap__(name): def __convert_name__(inname): + if __need_to_keep__(inname): + return inname if inname == 'maxid_layer': return 'max_id' elif inname.endswith('memory') or inname.endswith( @@ -74,8 +76,6 @@ def __convert_name__(inname): for name in v1_layers.__all__: obj = getattr(v1_layers, name) - if not __need_to_keep__(name): - continue new_name = __convert_name__(name) if callable(obj) and __need_to_wrap__(name): globals()[new_name] = __convert_to_v2__(obj, new_name, __name__) @@ -107,7 +107,7 @@ __data_layer__.__doc__ = __map_data_docstr__(v1_layers.data_layer.__doc__) data = __convert_to_v2__(__data_layer__, 'name', __name__) -def __get_used_layers__(output_layers, extra_layers=None): +def __get_used_layers__(output_layers): layer_names = set() parents = {} @@ -132,6 +132,13 @@ def __get_used_layers__(output_layers, extra_layers=None): add_parent(mem.layer_name, mem.boot_layer_name) add_parent(mem.link_name, mem.layer_name) + if sub_model.HasField('generator'): + # according to the implementation of text generation + # in recurrent layer group, the generated word must be + # the first out link + add_parent(sub_model.out_links[0].layer_name, + sub_model.generator.eos_layer_name) + def dfs_travel(layer_name): if layer_name in layer_names: return @@ -247,9 +254,9 @@ def __trim_submodel__(old_submodel, layer_names, input_layer_names, def parse_network(output_layers, extra_layers=None): if not isinstance(output_layers, collections.Sequence): output_layers = [output_layers] - if extra_layers is not None and not isinstance(extra_layers, - 
collections.Sequence): - extra_layers = [extra_layers] + if extra_layers is not None: + if not isinstance(extra_layers, collections.Sequence): + extra_layers = [extra_layers] else: extra_layers = []
@@ -262,18 +269,29 @@ def parse_network(output_layers, extra_layers=None): model_config = ModelConfig() model_config.type = cp.g_config.model_config.type + + for layer in output_layers: + model_config.output_layer_names.append(layer.full_name) + output_layer_names.add(layer.full_name) + for l in cp.g_config.model_config.layers: if l.name not in layer_names: continue model_config.layers.extend([l]) if l.type == 'data': + if l.name in model_config.output_layer_names: + """ + In text generation, the outlink that saves the generated word + indices is a data_layer defined inside recurrent_group. This + data_layer is always an output of the network in the text + generation task, so this statement excludes such a special + data_layer from being an input of the network; otherwise an error + would occur during data feeding. + """ + continue model_config.input_layer_names.append(l.name) input_layer_names.add(l.name) - for layer in output_layers: - model_config.output_layer_names.append(layer.full_name) - output_layer_names.add(layer.full_name) - for e in cp.g_config.model_config.evaluators: if e.name in evaluator_names: model_config.evaluators.extend([e])
diff --git a/python/paddle/v2/topology.py b/python/paddle/v2/topology.py index f3bb4d5f10dd6c5b220161e32dfc3a94642ac7a2..a20e878d0817d0a75e9c47a44f8765deca99225c 100644 --- a/python/paddle/v2/topology.py +++ b/python/paddle/v2/topology.py
@@ -31,7 +31,6 @@ class Topology(object): def __init__(self, layers, extra_layers=None): def __check__(layers): if not isinstance(layers, collections.Sequence): - __check_layer_type__(layers) layers = [layers] for layer in layers: __check_layer_type__(layer)
@@ -91,6 +90,7 @@ class Topology(object): [('image', dense_vector(768)), ('label', integer_value(10))] """ data_layers = self.data_layers() + return [(nm, data_layers[nm].data_type) for nm in self.proto().input_layer_names]
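
Editor's note on usage: the two test configs added in this patch (test_prelu_layer.py and test_row_conv.py) exercise the new layers in isolation. The sketch below combines the three new trainer_config_helpers entry points (row_conv_layer, prelu_layer and the relocated dropout_layer) in a single config. The layer sizes, learning rate and dropout rate are illustrative values chosen for the sketch, not taken from the patch.

.. code-block:: python

    from paddle.trainer_config_helpers import *

    settings(batch_size=128, learning_rate=1e-4)

    data = data_layer(name='data', size=2560)

    # Lookahead (row) convolution: context_len = lookahead step number + 1.
    row_conv = row_conv_layer(input=data, context_len=19, act=ReluActivation())

    # A linear projection followed by an element-wise learnable activation
    # (partial_sum=1 means one weight per element).
    fc = fc_layer(input=row_conv, size=512, act=LinearActivation())
    prelu = prelu_layer(input=fc, partial_sum=1)

    # dropout_layer is now defined in layers.py rather than networks.py.
    dropped = dropout_layer(input=prelu, dropout_rate=0.5)

    outputs(dropped)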
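To make the lookahead formula in the row_conv_layer docstring concrete, here is a plain NumPy reference of the same computation. This is only an illustration of the math, not PaddlePaddle's RowConvOp kernel; the function name row_conv_ref, the (T, d) input layout and the zero handling of timesteps past the end of the sequence are assumptions made for the sketch.

.. code-block:: python

    import numpy as np

    def row_conv_ref(x, w):
        """Reference lookahead (row) convolution -- illustrative only.

        x: input activations for one sequence, shape (T, d).
        w: filter, shape (context_len, d), where context_len = k + 1.
        Returns r with shape (T, d), where
            r[t, i] = sum_{j=0}^{k} w[j, i] * x[t + j, i]
        Timesteps beyond the end of the sequence contribute zero (assumption).
        """
        T, d = x.shape
        context_len = w.shape[0]
        r = np.zeros_like(x)
        for t in range(T):
            for j in range(context_len):
                if t + j < T:
                    r[t] += w[j] * x[t + j]
        return r

    # Toy check: T=5 timesteps, d=3 hidden units, lookahead k=2 (context_len=3).
    x = np.random.randn(5, 3)
    w = np.random.randn(3, 3)
    print(row_conv_ref(x, w).shape)  # (5, 3)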
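Similarly, the prelu_layer semantics (a learnable negative slope, with partial_sum controlling how many elements share one slope) can be illustrated with a small NumPy reference. The helper name prelu_ref and the grouping of consecutive elements are assumptions for illustration, not the layer's actual implementation.

.. code-block:: python

    import numpy as np

    def prelu_ref(x, a, partial_sum=1):
        """Reference PReLU: y = x if x > 0 else a * x  (illustrative only).

        x: input vector, shape (n,).
        a: learnable slopes, one per group of `partial_sum` consecutive
           elements, shape (n // partial_sum,).
        """
        n = x.shape[0]
        assert n % partial_sum == 0
        # Expand each group weight over the elements that share it.
        slopes = np.repeat(a, partial_sum)
        return np.where(x > 0, x, slopes * x)

    x = np.array([-2.0, -1.0, 0.5, 3.0])
    # partial_sum=1: element-wise, one weight per element.
    print(prelu_ref(x, np.array([0.1, 0.2, 0.3, 0.4]), partial_sum=1))
    # partial_sum=4: all elements share a single weight.
    print(prelu_ref(x, np.array([0.25]), partial_sum=4))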
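Finally, the parse_network changes in python/paddle/v2/layer.py are easiest to see from the caller's side. A minimal sketch follows; the v2 layer, activation and data_type names used here follow the usual paddle.v2 conventions and are not part of this patch, so treat them as assumptions.

.. code-block:: python

    import paddle.v2 as paddle

    paddle.init(use_gpu=False, trainer_count=1)

    # A tiny topology; sizes are arbitrary for illustration.
    image = paddle.layer.data(name='image', type=paddle.data_type.dense_vector(784))
    label = paddle.layer.data(name='label', type=paddle.data_type.integer_value(10))
    hidden = paddle.layer.fc(input=image, size=128, act=paddle.activation.Relu())
    predict = paddle.layer.fc(input=hidden, size=10, act=paddle.activation.Softmax())
    cost = paddle.layer.classification_cost(input=predict, label=label)

    # parse_network accepts a single layer or a sequence; extra_layers, when
    # given, is normalized the same way before the used-layer traversal.
    model_proto = paddle.layer.parse_network(cost)

    # Output layers are recorded first; a data_layer that is also an output
    # (the text-generation out-link case) is excluded from input_layer_names.
    print(model_proto.input_layer_names)
    print(model_proto.output_layer_names)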