提交 3ef776ef 编写于 作者: W wanghaox

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into my_maxout_op

...@@ -21,11 +21,10 @@ third_party/ ...@@ -21,11 +21,10 @@ third_party/
cmake-build-* cmake-build-*
# generated while compiling # generated while compiling
python/paddle/v2/framework/core.so python/paddle/v2/fluid/core.so
paddle/pybind/pybind.h paddle/pybind/pybind.h
CMakeFiles CMakeFiles
cmake_install.cmake cmake_install.cmake
paddle/.timestamp paddle/.timestamp
python/paddlepaddle.egg-info/ python/paddlepaddle.egg-info/
paddle/pybind/pybind.h paddle/pybind/pybind.h
python/paddle/v2/framework/tests/tmp/*
...@@ -98,7 +98,7 @@ IF(NOT ${CBLAS_FOUND}) ...@@ -98,7 +98,7 @@ IF(NOT ${CBLAS_FOUND})
ENDIF() ENDIF()
INSTALL(CODE "execute_process( INSTALL(CODE "execute_process(
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CBLAS_INSTALL_DIR}/lib COMMAND ${CMAKE_COMMAND} -E copy_directory ${CBLAS_INSTALL_DIR}/lib
destination ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR} ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}
)" )"
) )
INSTALL(CODE "MESSAGE(STATUS \"Installing: \" INSTALL(CODE "MESSAGE(STATUS \"Installing: \"
......
## Evaluator Design
### The Problem
During training or serving, we provide the evaluation function to measure the model performance, e.g., accuracy, precision. In the operator based framework design, the data go through the network pipeline batch by batch. As a result, inside the operator, we only can calculate one minibatch metrics. We need to provide a mechanism to calculate the metrics for each N pass/batch the user wanted.
### Evaluator Design
Currently, every operation is expressed in the graph. we divide the evaluator process into three steps.
1. Initialize the metric state and add it into the block.
2. Calculate the statistic of the metric state in every mini-batch. The single operator is only responsible for calculating necessary statistics for one mini-batch. For example, accuracy operator only calculate a minibatch data if run once.
3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. When it comes to distributed training/Multi-GPU training, aggregate the value from different devices.
### Implementation
This design is shown in python API.
Each metric operator need to caculate the metric statistic and return the batch aware states, Python side responsible for accumulate the states for each pass.
```python
class Evaluator(object):
"""
Evaluator Base class.
"""
def __init__(self, name, **kwargs):
"""
Different evaluator may has different metric states. E.g, Accuracy need two variables, total and right sample counts.
Auc need four variables, `true_positives`,
`true_negatives`, `false_positives` and `false_negatives`. So every evaluator should create its needed variables and append to main_program
The initialization of Evaluator should be responsible for:
create metric states and append to the main_program
"""
pass
def _update_ops(self, input, label, **kwargs)
"""
Add mini-batch evaluator caculate operators to the main_program.
Add increment operator to accumulate the metric states.
"""
def reset(self, executor, reset_program=None):
"""
Reset metric states at the begin of each pass/user specified batch number.
Execute the reset_program to reset the states.
"""
def eval(self, executor, eval_program=None):
"""
Merge the mini-batch statistics to form the evaluation result for multiple mini-batches.
Execute the eval_program and return the result.
"""
return eval_result
```
...@@ -121,6 +121,7 @@ paddle_error paddle_matrix_get_shape(paddle_matrix mat, ...@@ -121,6 +121,7 @@ paddle_error paddle_matrix_get_shape(paddle_matrix mat,
paddle_matrix paddle_matrix_create_sparse( paddle_matrix paddle_matrix_create_sparse(
uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu) { uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu) {
#ifndef PADDLE_MOBILE_INFERENCE
auto ptr = new paddle::capi::CMatrix(); auto ptr = new paddle::capi::CMatrix();
ptr->mat = paddle::Matrix::createSparseMatrix( ptr->mat = paddle::Matrix::createSparseMatrix(
height, height,
...@@ -131,6 +132,9 @@ paddle_matrix paddle_matrix_create_sparse( ...@@ -131,6 +132,9 @@ paddle_matrix paddle_matrix_create_sparse(
false, false,
useGpu); useGpu);
return ptr; return ptr;
#else
return nullptr;
#endif
} }
paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat, paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
...@@ -140,6 +144,7 @@ paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat, ...@@ -140,6 +144,7 @@ paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
uint64_t colSize, uint64_t colSize,
float* valueArray, float* valueArray,
uint64_t valueSize) { uint64_t valueSize) {
#ifndef PADDLE_MOBILE_INFERENCE
if (mat == nullptr) return kPD_NULLPTR; if (mat == nullptr) return kPD_NULLPTR;
auto ptr = cast(mat); auto ptr = cast(mat);
if (rowArray == nullptr || colArray == nullptr || if (rowArray == nullptr || colArray == nullptr ||
...@@ -160,4 +165,7 @@ paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat, ...@@ -160,4 +165,7 @@ paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
} else { } else {
return kPD_NOT_SUPPORTED; return kPD_NOT_SUPPORTED;
} }
#else
return kPD_NOT_SUPPORTED;
#endif
} }
...@@ -48,6 +48,7 @@ PD_API paddle_matrix paddle_matrix_create(uint64_t height, ...@@ -48,6 +48,7 @@ PD_API paddle_matrix paddle_matrix_create(uint64_t height,
* @param isBinary is binary (either 1 or 0 in matrix) or not. * @param isBinary is binary (either 1 or 0 in matrix) or not.
* @param useGpu is using GPU or not. * @param useGpu is using GPU or not.
* @return paddle_matrix. * @return paddle_matrix.
* @note Mobile inference does not support this interface.
*/ */
PD_API paddle_matrix paddle_matrix_create_sparse( PD_API paddle_matrix paddle_matrix_create_sparse(
uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu); uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu);
...@@ -129,6 +130,7 @@ PD_API paddle_error paddle_matrix_get_shape(paddle_matrix mat, ...@@ -129,6 +130,7 @@ PD_API paddle_error paddle_matrix_get_shape(paddle_matrix mat,
* NULL if the matrix is binary. * NULL if the matrix is binary.
* @param [in] valueSize length of value array. Zero if the matrix is binary. * @param [in] valueSize length of value array. Zero if the matrix is binary.
* @return paddle_error * @return paddle_error
* @note Mobile inference does not support this interface.
*/ */
PD_API paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat, PD_API paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
int* rowArray, int* rowArray,
......
...@@ -27,7 +27,9 @@ if(WITH_GPU) ...@@ -27,7 +27,9 @@ if(WITH_GPU)
set_source_files_properties(${CUDA_CXX_SOURCES} set_source_files_properties(${CUDA_CXX_SOURCES}
PROPERTIES COMPILE_FLAGS "-D__NVCC__") PROPERTIES COMPILE_FLAGS "-D__NVCC__")
else() else()
if (NOT MOBILE_INFERENCE)
set(CUDA_CXX_SOURCES src/hl_warpctc_wrap.cc) set(CUDA_CXX_SOURCES src/hl_warpctc_wrap.cc)
endif()
endif() endif()
set(CUDA_CU_SOURCES set(CUDA_CU_SOURCES
......
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
#include "hl_base.h" #include "hl_base.h"
/** /**
* @brief Maximum pool forward. * @brief Maximum pool forward with Mask output.
* *
* @param[in] frameCnt batch size of input image. * @param[in] frameCnt batch size of input image.
* @param[in] inputData input data. * @param[in] inputData input data.
...@@ -35,7 +35,7 @@ limitations under the License. */ ...@@ -35,7 +35,7 @@ limitations under the License. */
* @param[in] paddingW padding width. * @param[in] paddingW padding width.
* @param[out] tgtData output data. * @param[out] tgtData output data.
* @param[in] tgtStride stride between output data samples. * @param[in] tgtStride stride between output data samples.
* * @param[out] maskData the location indices of select max data.
*/ */
extern void hl_maxpool_forward(const int frameCnt, extern void hl_maxpool_forward(const int frameCnt,
const real* inputData, const real* inputData,
...@@ -51,7 +51,8 @@ extern void hl_maxpool_forward(const int frameCnt, ...@@ -51,7 +51,8 @@ extern void hl_maxpool_forward(const int frameCnt,
const int paddingH, const int paddingH,
const int paddingW, const int paddingW,
real* tgtData, real* tgtData,
const int tgtStride); const int tgtStride,
real* maskData = NULL);
/** /**
* @brief Maximum pool backward. * @brief Maximum pool backward.
......
...@@ -31,7 +31,8 @@ inline void hl_maxpool_forward(const int frameCnt, ...@@ -31,7 +31,8 @@ inline void hl_maxpool_forward(const int frameCnt,
const int paddingH, const int paddingH,
const int paddingW, const int paddingW,
real* tgtData, real* tgtData,
const int tgtStride) {} const int tgtStride,
real* MaskData) {}
inline void hl_maxpool_backward(const int frameCnt, inline void hl_maxpool_backward(const int frameCnt,
const real* inputData, const real* inputData,
......
...@@ -31,7 +31,8 @@ __global__ void KeMaxPoolForward(const int nthreads, ...@@ -31,7 +31,8 @@ __global__ void KeMaxPoolForward(const int nthreads,
const int offsetH, const int offsetH,
const int offsetW, const int offsetW,
real* tgtData, real* tgtData,
const int tgtStride) { const int tgtStride,
real* maskData) {
int index = blockIdx.x * blockDim.x + threadIdx.x; int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < nthreads) { if (index < nthreads) {
int pw = index % pooledW; int pw = index % pooledW;
...@@ -45,16 +46,22 @@ __global__ void KeMaxPoolForward(const int nthreads, ...@@ -45,16 +46,22 @@ __global__ void KeMaxPoolForward(const int nthreads,
hstart = max(hstart, 0); hstart = max(hstart, 0);
wstart = max(wstart, 0); wstart = max(wstart, 0);
real maxval = -FLT_MAX; real maxval = -FLT_MAX;
int max_index = -1;
inputData += (frameNum * channels + c) * height * width; inputData += (frameNum * channels + c) * height * width;
for (int h = hstart; h < hend; ++h) { for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) { for (int w = wstart; w < wend; ++w) {
if (maxval < inputData[h * width + w]) if (maxval < inputData[h * width + w]) {
maxval = inputData[h * width + w]; max_index = h * width + w;
maxval = inputData[max_index];
}
} }
} }
int tgtIndex = int tgtIndex =
index % (pooledW * pooledH * channels) + frameNum * tgtStride; index % (pooledW * pooledH * channels) + frameNum * tgtStride;
tgtData[tgtIndex] = maxval; tgtData[tgtIndex] = maxval;
if (maskData != NULL) {
maskData[tgtIndex] = max_index;
}
} }
} }
...@@ -72,7 +79,8 @@ void hl_maxpool_forward(const int frameCnt, ...@@ -72,7 +79,8 @@ void hl_maxpool_forward(const int frameCnt,
const int paddingH, const int paddingH,
const int paddingW, const int paddingW,
real* tgtData, real* tgtData,
const int tgtStride) { const int tgtStride,
real* maskData) {
int num_kernels = pooledH * pooledW * channels * frameCnt; int num_kernels = pooledH * pooledW * channels * frameCnt;
int blocks = (num_kernels + 1024 - 1) / 1024; int blocks = (num_kernels + 1024 - 1) / 1024;
dim3 threads(1024, 1); dim3 threads(1024, 1);
...@@ -92,7 +100,8 @@ void hl_maxpool_forward(const int frameCnt, ...@@ -92,7 +100,8 @@ void hl_maxpool_forward(const int frameCnt,
paddingH, paddingH,
paddingW, paddingW,
tgtData, tgtData,
tgtStride); tgtStride,
maskData);
CHECK_SYNC("hl_maxpool_forward failed"); CHECK_SYNC("hl_maxpool_forward failed");
} }
......
...@@ -38,9 +38,9 @@ py_proto_compile(framework_py_proto SRCS framework.proto) ...@@ -38,9 +38,9 @@ py_proto_compile(framework_py_proto SRCS framework.proto)
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(framework_py_proto framework_py_proto_init) add_dependencies(framework_py_proto framework_py_proto_init)
add_custom_command(TARGET framework_py_proto POST_BUILD add_custom_command(TARGET framework_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/proto COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/proto
COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/proto/ COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/proto/
COMMENT "Copy generated python proto into directory paddle/v2/framework/proto." COMMENT "Copy generated python proto into directory paddle/v2/fluid/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
cc_library(backward SRCS backward.cc DEPS net_op) cc_library(backward SRCS backward.cc DEPS net_op)
......
...@@ -61,6 +61,7 @@ public: ...@@ -61,6 +61,7 @@ public:
// function arguments // function arguments
strides_ = config.get<std::vector<size_t>>("strides"); strides_ = config.get<std::vector<size_t>>("strides");
paddings_ = config.get<std::vector<size_t>>("paddings"); paddings_ = config.get<std::vector<size_t>>("paddings");
dilations_ = config.get<std::vector<size_t>>("dilations");
groups_ = config.get<size_t>("groups"); groups_ = config.get<size_t>("groups");
// number of inputs and outputs // number of inputs and outputs
...@@ -118,6 +119,7 @@ protected: ...@@ -118,6 +119,7 @@ protected:
std::vector<size_t> strides_; std::vector<size_t> strides_;
std::vector<size_t> paddings_; std::vector<size_t> paddings_;
std::vector<size_t> dilations_;
/// Group size, refer to grouped convolution in /// Group size, refer to grouped convolution in
/// Alex Krizhevsky's paper: when group=2, the first half of the /// Alex Krizhevsky's paper: when group=2, the first half of the
...@@ -133,6 +135,10 @@ protected: ...@@ -133,6 +135,10 @@ protected:
inline int paddingW() const { return paddings_[1]; } inline int paddingW() const { return paddings_[1]; }
inline int dilationH() const { return dilations_[0]; }
inline int dilationW() const { return dilations_[1]; }
// A temporary memory in convolution calculation. // A temporary memory in convolution calculation.
MemoryHandlePtr memory_; MemoryHandlePtr memory_;
......
...@@ -79,45 +79,59 @@ void Convolution(const std::string& conv1, ...@@ -79,45 +79,59 @@ void Convolution(const std::string& conv1,
if (outputChannels < inputChannels) continue; if (outputChannels < inputChannels) continue;
for (size_t stride : {1, 2}) { for (size_t stride : {1, 2}) {
for (size_t padding : {0, 1}) { for (size_t padding : {0, 1}) {
if (padding >= filterSize) break; for (size_t dilation : {1, 3}) {
if (padding >= filterSize) break;
size_t filterS = (filterSize - 1) * dilation + 1;
// NNPACK only supports stride = 1 if batchSize > 1 if (inputSize + 2 * padding < filterS) break;
if ((conv1 == "NNPACKConv-CPU" || conv2 == "NNPACKConv-CPU") &&
batchSize > 1 && stride > 1)
break;
size_t outputSize = if ((conv1 == "NaiveConv-CPU" || conv2 == "NaiveConv-CPU" ||
(inputSize - filterSize + 2 * padding + stride) / stride; conv1 == "NNPACKConv-CPU" ||
VLOG(3) << " batchSize=" << batchSize conv2 == "NNPACKConv-CPU") &&
<< " inputChannels=" << inputChannels dilation > 1)
<< " inputHeight=" << inputSize break;
<< " inputWidth=" << inputSize
<< " outputChannels=" << outputChannels
<< " filterHeight=" << filterSize
<< " filterWidth=" << filterSize
<< " outputHeight=" << outputSize
<< " outputWidth=" << outputSize << " stride=" << stride
<< " padding=" << padding;
std::vector<size_t> paddings = {padding, padding}; // NNPACK only supports stride = 1 if batchSize > 1
std::vector<size_t> strides = {stride, stride}; if ((conv1 == "NNPACKConv-CPU" ||
Compare2Function<DType1, DType2> test( conv2 == "NNPACKConv-CPU") &&
conv1, batchSize > 1 && stride > 1)
conv2, break;
FuncConfig()
.set("paddings", paddings)
.set("strides", strides)
.set("groups", (size_t)1)
.set("algo", (std::string) "auto"));
TensorShape input{ size_t outputSize =
batchSize, inputChannels, inputSize, inputSize}; (inputSize - filterS + 2 * padding + stride) / stride;
TensorShape filter{ VLOG(3) << " batchSize=" << batchSize
outputChannels, inputChannels, filterSize, filterSize}; << " inputChannels=" << inputChannels
TensorShape output{ << " inputHeight=" << inputSize
batchSize, outputChannels, outputSize, outputSize}; << " inputWidth=" << inputSize
<< " outputChannels=" << outputChannels
<< " filterHeight=" << filterSize
<< " filterWidth=" << filterSize
<< " outputHeight=" << outputSize
<< " outputWidth=" << outputSize
<< " stride=" << stride << " padding=" << padding;
function(test, input, filter, output); std::vector<size_t> paddings = {padding, padding};
std::vector<size_t> strides = {stride, stride};
std::vector<size_t> dilations = {dilation, dilation};
Compare2Function<DType1, DType2> test(
conv1,
conv2,
FuncConfig()
.set("paddings", paddings)
.set("strides", strides)
.set("dilations", dilations)
.set("groups", (size_t)1)
.set("algo", (std::string) "auto"));
TensorShape input{
batchSize, inputChannels, inputSize, inputSize};
TensorShape filter{
outputChannels, inputChannels, filterSize, filterSize};
TensorShape output{
batchSize, outputChannels, outputSize, outputSize};
function(test, input, filter, output);
}
} }
} }
} }
...@@ -144,6 +158,7 @@ void Convolution2(const std::string& conv1, ...@@ -144,6 +158,7 @@ void Convolution2(const std::string& conv1,
for (size_t outputChannels : {7}) { for (size_t outputChannels : {7}) {
size_t stride = 1; size_t stride = 1;
size_t padding = 0; size_t padding = 0;
size_t dilation = 1;
size_t outputHeight = size_t outputHeight =
(inputHeight - filterHeight + 2 * padding + stride) / (inputHeight - filterHeight + 2 * padding + stride) /
stride; stride;
...@@ -162,6 +177,7 @@ void Convolution2(const std::string& conv1, ...@@ -162,6 +177,7 @@ void Convolution2(const std::string& conv1,
std::vector<size_t> paddings = {padding, padding}; std::vector<size_t> paddings = {padding, padding};
std::vector<size_t> strides = {stride, stride}; std::vector<size_t> strides = {stride, stride};
std::vector<size_t> dilations = {dilation, dilation};
Compare2Function<DType1, DType2> test( Compare2Function<DType1, DType2> test(
conv1, conv1,
conv2, conv2,
...@@ -169,6 +185,7 @@ void Convolution2(const std::string& conv1, ...@@ -169,6 +185,7 @@ void Convolution2(const std::string& conv1,
.set("paddings", paddings) .set("paddings", paddings)
.set("strides", strides) .set("strides", strides)
.set("groups", (size_t)1) .set("groups", (size_t)1)
.set("dilations", dilations)
.set("algo", (std::string) "auto")); .set("algo", (std::string) "auto"));
TensorShape input{ TensorShape input{
...@@ -223,6 +240,7 @@ void DepthwiseConvolution(const std::string& conv1, ...@@ -223,6 +240,7 @@ void DepthwiseConvolution(const std::string& conv1,
std::vector<size_t> paddings = {padding, padding}; std::vector<size_t> paddings = {padding, padding};
std::vector<size_t> strides = {stride, stride}; std::vector<size_t> strides = {stride, stride};
std::vector<size_t> dilations = {1, 1};
size_t groups = inputChannels; size_t groups = inputChannels;
Compare2Function<DType1, DType2> test( Compare2Function<DType1, DType2> test(
conv1, conv1,
...@@ -231,6 +249,7 @@ void DepthwiseConvolution(const std::string& conv1, ...@@ -231,6 +249,7 @@ void DepthwiseConvolution(const std::string& conv1,
.set("paddings", paddings) .set("paddings", paddings)
.set("strides", strides) .set("strides", strides)
.set("groups", groups) .set("groups", groups)
.set("dilations", dilations)
.set("algo", (std::string) "auto")); .set("algo", (std::string) "auto"));
TensorShape input{ TensorShape input{
......
...@@ -100,7 +100,9 @@ public: ...@@ -100,7 +100,9 @@ public:
strideH(), strideH(),
strideW(), strideW(),
paddingH(), paddingH(),
paddingW()); paddingW(),
dilationH(),
dilationW());
} else { } else {
colData = inputData + g * inputOffset; colData = inputData + g * inputOffset;
} }
...@@ -223,7 +225,9 @@ public: ...@@ -223,7 +225,9 @@ public:
strideH(), strideH(),
strideW(), strideW(),
paddingH(), paddingH(),
paddingW()); paddingW(),
dilationH(),
dilationW());
} }
} }
inputGrad += inputChannels * inputHeight * inputWidth; inputGrad += inputChannels * inputHeight * inputWidth;
...@@ -310,7 +314,9 @@ public: ...@@ -310,7 +314,9 @@ public:
strideH(), strideH(),
strideW(), strideW(),
paddingH(), paddingH(),
paddingW()); paddingW(),
dilationH(),
dilationW());
} else { } else {
colData = inputData + g * inputOffset; colData = inputData + g * inputOffset;
} }
......
...@@ -78,7 +78,9 @@ public: ...@@ -78,7 +78,9 @@ public:
int strideHeight, int strideHeight,
int strideWidth, int strideWidth,
int paddingHeight, int paddingHeight,
int paddingWidth); int paddingWidth,
int dilationHeight = 1,
int dilationWidth = 1);
}; };
template <ColFormat Format, DeviceType Device, class T> template <ColFormat Format, DeviceType Device, class T>
...@@ -91,7 +93,9 @@ public: ...@@ -91,7 +93,9 @@ public:
int strideHeight, int strideHeight,
int strideWidth, int strideWidth,
int paddingHeight, int paddingHeight,
int paddingWidth); int paddingWidth,
int dilationHeight = 1,
int dilationWidth = 1);
}; };
} // namespace paddle } // namespace paddle
...@@ -31,7 +31,9 @@ public: ...@@ -31,7 +31,9 @@ public:
int strideHeight, int strideHeight,
int strideWidth, int strideWidth,
int paddingHeight, int paddingHeight,
int paddingWidth) { int paddingWidth,
int dilationHeight,
int dilationWidth) {
int inputChannels = imShape[0]; int inputChannels = imShape[0];
int inputHeight = imShape[1]; int inputHeight = imShape[1];
int inputWidth = imShape[2]; int inputWidth = imShape[2];
...@@ -47,8 +49,8 @@ public: ...@@ -47,8 +49,8 @@ public:
int c_im = c / filterWidth / filterHeight; int c_im = c / filterWidth / filterHeight;
for (int h = 0; h < outputHeight; ++h) { for (int h = 0; h < outputHeight; ++h) {
for (int w = 0; w < outputWidth; ++w) { for (int w = 0; w < outputWidth; ++w) {
int imRowIdx = h * strideHeight + hOffset; int imRowIdx = h * strideHeight + hOffset * dilationHeight;
int imColIdx = w * strideWidth + wOffset; int imColIdx = w * strideWidth + wOffset * dilationWidth;
if ((imRowIdx - paddingHeight) < 0 || if ((imRowIdx - paddingHeight) < 0 ||
(imRowIdx - paddingHeight) >= inputHeight || (imRowIdx - paddingHeight) >= inputHeight ||
(imColIdx - paddingWidth) < 0 || (imColIdx - paddingWidth) < 0 ||
...@@ -81,7 +83,9 @@ public: ...@@ -81,7 +83,9 @@ public:
int strideHeight, int strideHeight,
int strideWidth, int strideWidth,
int paddingHeight, int paddingHeight,
int paddingWidth) { int paddingWidth,
int dilationHeight,
int dilationWidth) {
int inputChannels = imShape[0]; int inputChannels = imShape[0];
int inputHeight = imShape[1]; int inputHeight = imShape[1];
int inputWidth = imShape[2]; int inputWidth = imShape[2];
...@@ -97,8 +101,8 @@ public: ...@@ -97,8 +101,8 @@ public:
int c_im = c / filterWidth / filterHeight; int c_im = c / filterWidth / filterHeight;
for (int h = 0; h < outputHeight; ++h) { for (int h = 0; h < outputHeight; ++h) {
for (int w = 0; w < outputWidth; ++w) { for (int w = 0; w < outputWidth; ++w) {
int imRowIdx = h * strideHeight + hOffset; int imRowIdx = h * strideHeight + hOffset * dilationHeight;
int imColIdx = w * strideWidth + wOffset; int imColIdx = w * strideWidth + wOffset * dilationWidth;
if ((imRowIdx - paddingHeight) >= 0 && if ((imRowIdx - paddingHeight) >= 0 &&
(imRowIdx - paddingHeight) < inputHeight && (imRowIdx - paddingHeight) < inputHeight &&
(imColIdx - paddingWidth) >= 0 && (imColIdx - paddingWidth) >= 0 &&
...@@ -134,7 +138,9 @@ public: ...@@ -134,7 +138,9 @@ public:
int strideHeight, int strideHeight,
int strideWidth, int strideWidth,
int paddingHeight, int paddingHeight,
int paddingWidth) { int paddingWidth,
int dilationHeight = 1,
int dilationWidth = 1) {
int inputChannels = imShape[0]; int inputChannels = imShape[0];
int inputHeight = imShape[1]; int inputHeight = imShape[1];
int inputWidth = imShape[2]; int inputWidth = imShape[2];
...@@ -147,9 +153,10 @@ public: ...@@ -147,9 +153,10 @@ public:
for (int channel = 0; channel < inputChannels; ++channel) { for (int channel = 0; channel < inputChannels; ++channel) {
for (int filterH = 0; filterH < filterHeight; ++filterH) { for (int filterH = 0; filterH < filterHeight; ++filterH) {
for (int filterW = 0; filterW < filterWidth; ++filterW) { for (int filterW = 0; filterW < filterWidth; ++filterW) {
int imRowOffset = int imRowOffset = outputH * strideHeight +
outputH * strideHeight + filterH - paddingHeight; filterH * dilationHeight - paddingHeight;
int imColOffset = outputW * strideWidth + filterW - paddingWidth; int imColOffset = outputW * strideWidth +
filterW * dilationWidth - paddingWidth;
int colDataOffset = int colDataOffset =
(((outputH * outputWidth + outputW) * inputChannels + (((outputH * outputWidth + outputW) * inputChannels +
channel) * channel) *
...@@ -189,7 +196,9 @@ public: ...@@ -189,7 +196,9 @@ public:
int strideHeight, int strideHeight,
int strideWidth, int strideWidth,
int paddingHeight, int paddingHeight,
int paddingWidth) { int paddingWidth,
int dilationHeight = 1,
int dilationWidth = 1) {
int inputChannels = imShape[0]; int inputChannels = imShape[0];
int inputHeight = imShape[1]; int inputHeight = imShape[1];
int inputWidth = imShape[2]; int inputWidth = imShape[2];
...@@ -202,9 +211,10 @@ public: ...@@ -202,9 +211,10 @@ public:
for (int channel = 0; channel < inputChannels; ++channel) { for (int channel = 0; channel < inputChannels; ++channel) {
for (int filterH = 0; filterH < filterHeight; ++filterH) { for (int filterH = 0; filterH < filterHeight; ++filterH) {
for (int filterW = 0; filterW < filterWidth; ++filterW) { for (int filterW = 0; filterW < filterWidth; ++filterW) {
int imRowOffset = int imRowOffset = outputH * strideHeight +
outputH * strideHeight + filterH - paddingHeight; filterH * dilationHeight - paddingHeight;
int imColOffset = outputW * strideWidth + filterW - paddingWidth; int imColOffset = outputW * strideWidth +
filterW * dilationWidth - paddingWidth;
int colDataOffset = int colDataOffset =
(((outputH * outputWidth + outputW) * inputChannels + (((outputH * outputWidth + outputW) * inputChannels +
channel) * channel) *
......
...@@ -28,6 +28,8 @@ __global__ void im2col(const T* data_im, ...@@ -28,6 +28,8 @@ __global__ void im2col(const T* data_im,
int strideW, int strideW,
int paddingH, int paddingH,
int paddingW, int paddingW,
int dilationH,
int dilationW,
int height_col, int height_col,
int width_col, int width_col,
T* data_col) { T* data_col) {
...@@ -44,8 +46,8 @@ __global__ void im2col(const T* data_im, ...@@ -44,8 +46,8 @@ __global__ void im2col(const T* data_im,
data_col += (channel_out * height_col + h_out) * width_col + w_out; data_col += (channel_out * height_col + h_out) * width_col + w_out;
for (int i = 0; i < blockH; ++i) { for (int i = 0; i < blockH; ++i) {
for (int j = 0; j < blockW; ++j) { for (int j = 0; j < blockW; ++j) {
int rIdx = int(h_in + i); int rIdx = int(h_in + i * dilationH);
int cIdx = int(w_in + j); int cIdx = int(w_in + j * dilationW);
if ((rIdx - (int)paddingH) >= (int)height || if ((rIdx - (int)paddingH) >= (int)height ||
(rIdx - (int)paddingH) < 0 || (rIdx - (int)paddingH) < 0 ||
(cIdx - (int)paddingW) >= (int)width || (cIdx - (int)paddingW) >= (int)width ||
...@@ -77,7 +79,9 @@ public: ...@@ -77,7 +79,9 @@ public:
int strideHeight, int strideHeight,
int strideWidth, int strideWidth,
int paddingHeight, int paddingHeight,
int paddingWidth) { int paddingWidth,
int dilationHeight,
int dilationWidth) {
int inputChannels = imShape[0]; int inputChannels = imShape[0];
int inputHeight = imShape[1]; int inputHeight = imShape[1];
int inputWidth = imShape[2]; int inputWidth = imShape[2];
...@@ -102,6 +106,8 @@ public: ...@@ -102,6 +106,8 @@ public:
strideWidth, strideWidth,
paddingHeight, paddingHeight,
paddingWidth, paddingWidth,
dilationHeight,
dilationWidth,
outputHeight, outputHeight,
outputWidth, outputWidth,
colData); colData);
...@@ -121,6 +127,8 @@ __global__ void col2im(size_t n, ...@@ -121,6 +127,8 @@ __global__ void col2im(size_t n,
size_t strideW, size_t strideW,
size_t paddingH, size_t paddingH,
size_t paddingW, size_t paddingW,
size_t dilationH,
size_t dilationW,
size_t height_col, size_t height_col,
size_t width_col, size_t width_col,
T* data_im) { T* data_im) {
...@@ -131,23 +139,34 @@ __global__ void col2im(size_t n, ...@@ -131,23 +139,34 @@ __global__ void col2im(size_t n,
int w = int(index % width); int w = int(index % width);
int h = int((index / width) % height); int h = int((index / width) % height);
int c = int(index / (width * height)); int c = int(index / (width * height));
int filterH = (blockH - 1) * dilationH + 1;
int filterW = (blockW - 1) * dilationW + 1;
if ((w - (int)paddingW) >= 0 && if ((w - (int)paddingW) >= 0 &&
(w - (int)paddingW) < (width - 2 * paddingW) && (w - (int)paddingW) < (width - 2 * paddingW) &&
(h - (int)paddingH) >= 0 && (h - paddingH) < (height - 2 * paddingH)) { (h - (int)paddingH) >= 0 && (h - paddingH) < (height - 2 * paddingH)) {
// compute the start and end of the output // compute the start and end of the output
int w_col_start = int w_col_start =
(w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1; (w < (int)filterW) ? 0 : (w - int(filterW)) / (int)strideW + 1;
int w_col_end = min((int)(w / (int)strideW + 1), (int)(width_col)); int w_col_end = min((int)(w / (int)strideW + 1), (int)(width_col));
int h_col_start = int h_col_start =
(h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1; (h < (int)filterH) ? 0 : (h - (int)filterH) / (int)strideH + 1;
int h_col_end = min(int(h / strideH + 1), int(height_col)); int h_col_end = min(int(h / strideH + 1), int(height_col));
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
// the col location: [c * width * height + h_out, w_out] // the col location: [c * width * height + h_out, w_out]
int c_col = int(c * blockH * blockW) + int h_k = (h - h_col * strideH);
(h - h_col * (int)strideH) * (int)blockW + int w_k = (w - w_col * strideW);
(w - w_col * (int)strideW); if (h_k % dilationH == 0 && w_k % dilationW == 0) {
val += data_col[(c_col * height_col + h_col) * width_col + w_col]; h_k /= dilationH;
w_k /= dilationW;
int c_col =
(((c * blockH + h_k) * blockW + w_k) * height_col + h_col) *
width_col +
w_col;
val += data_col[c_col];
}
} }
} }
h -= paddingH; h -= paddingH;
...@@ -173,7 +192,9 @@ public: ...@@ -173,7 +192,9 @@ public:
int strideHeight, int strideHeight,
int strideWidth, int strideWidth,
int paddingHeight, int paddingHeight,
int paddingWidth) { int paddingWidth,
int dilationHeight,
int dilationWidth) {
int inputChannels = imShape[0]; int inputChannels = imShape[0];
int inputHeight = imShape[1]; int inputHeight = imShape[1];
int inputWidth = imShape[2]; int inputWidth = imShape[2];
...@@ -205,6 +226,8 @@ public: ...@@ -205,6 +226,8 @@ public:
strideWidth, strideWidth,
paddingHeight, paddingHeight,
paddingWidth, paddingWidth,
dilationHeight,
dilationWidth,
outputHeight, outputHeight,
outputWidth, outputWidth,
imData); imData);
...@@ -229,6 +252,8 @@ __global__ void im2colOCF(const T* imData, ...@@ -229,6 +252,8 @@ __global__ void im2colOCF(const T* imData,
int strideWidth, int strideWidth,
int paddingHeight, int paddingHeight,
int paddingWidth, int paddingWidth,
int dilationHeight,
int dilationWidth,
int outputHeight, int outputHeight,
int outputWidth) { int outputWidth) {
int swId = blockIdx.x; int swId = blockIdx.x;
...@@ -237,8 +262,10 @@ __global__ void im2colOCF(const T* imData, ...@@ -237,8 +262,10 @@ __global__ void im2colOCF(const T* imData,
channelId += blockDim.z) { channelId += blockDim.z) {
for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) { for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) { for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
int widthOffset = idx + swId * strideWidth - paddingWidth; int widthOffset =
int heightOffset = idy + shId * strideHeight - paddingHeight; idx * dilationHeight + swId * strideWidth - paddingWidth;
int heightOffset =
idy * dilationWidth + shId * strideHeight - paddingHeight;
int imOffset = widthOffset + heightOffset * inputWidth + int imOffset = widthOffset + heightOffset * inputWidth +
channelId * inputHeight * inputWidth; channelId * inputHeight * inputWidth;
...@@ -273,7 +300,9 @@ public: ...@@ -273,7 +300,9 @@ public:
int strideHeight, int strideHeight,
int strideWidth, int strideWidth,
int paddingHeight, int paddingHeight,
int paddingWidth) { int paddingWidth,
int dilationHeight,
int dilationWidth) {
int inputChannels = imShape[0]; int inputChannels = imShape[0];
int inputHeight = imShape[1]; int inputHeight = imShape[1];
int inputWidth = imShape[2]; int inputWidth = imShape[2];
...@@ -312,6 +341,8 @@ public: ...@@ -312,6 +341,8 @@ public:
strideWidth, strideWidth,
paddingHeight, paddingHeight,
paddingWidth, paddingWidth,
dilationHeight,
dilationWidth,
outputHeight, outputHeight,
outputWidth); outputWidth);
CHECK_SYNC("Im2ColFunctor GPU failed"); CHECK_SYNC("Im2ColFunctor GPU failed");
...@@ -330,6 +361,8 @@ __global__ void col2imOCF(T* imData, ...@@ -330,6 +361,8 @@ __global__ void col2imOCF(T* imData,
int strideWidth, int strideWidth,
int paddingHeight, int paddingHeight,
int paddingWidth, int paddingWidth,
int dilationHeight,
int dilationWidth,
int outputHeight, int outputHeight,
int outputWidth) { int outputWidth) {
int swId = blockIdx.x; int swId = blockIdx.x;
...@@ -338,8 +371,10 @@ __global__ void col2imOCF(T* imData, ...@@ -338,8 +371,10 @@ __global__ void col2imOCF(T* imData,
channelId += blockDim.z) { channelId += blockDim.z) {
for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) { for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) { for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
int widthOffset = idx + swId * strideWidth - paddingWidth; int widthOffset =
int heightOffset = idy + shId * strideHeight - paddingHeight; idx * dilationWidth + swId * strideWidth - paddingWidth;
int heightOffset =
idy * dilationHeight + shId * strideHeight - paddingHeight;
int imOffset = widthOffset + heightOffset * inputWidth + int imOffset = widthOffset + heightOffset * inputWidth +
channelId * inputHeight * inputWidth; channelId * inputHeight * inputWidth;
...@@ -372,7 +407,9 @@ public: ...@@ -372,7 +407,9 @@ public:
int strideHeight, int strideHeight,
int strideWidth, int strideWidth,
int paddingHeight, int paddingHeight,
int paddingWidth) { int paddingWidth,
int dilationHeight,
int dilationWidth) {
int inputChannels = imShape[0]; int inputChannels = imShape[0];
int inputHeight = imShape[1]; int inputHeight = imShape[1];
int inputWidth = imShape[2]; int inputWidth = imShape[2];
...@@ -411,6 +448,8 @@ public: ...@@ -411,6 +448,8 @@ public:
strideWidth, strideWidth,
paddingHeight, paddingHeight,
paddingWidth, paddingWidth,
dilationHeight,
dilationWidth,
outputHeight, outputHeight,
outputWidth); outputWidth);
CHECK_SYNC("Col2ImFunctor GPU failed"); CHECK_SYNC("Col2ImFunctor GPU failed");
......
...@@ -29,82 +29,98 @@ void TestIm2ColFunctor() { ...@@ -29,82 +29,98 @@ void TestIm2ColFunctor() {
for (size_t filterWidth : {3, 7}) { for (size_t filterWidth : {3, 7}) {
for (size_t stride : {1, 2}) { for (size_t stride : {1, 2}) {
for (size_t padding : {0, 1}) { for (size_t padding : {0, 1}) {
if (inputHeight <= filterHeight || inputWidth <= filterWidth) for (size_t dilation : {1, 3}) {
break; size_t filterSizeH = (filterHeight - 1) * dilation + 1;
if (padding >= filterHeight || padding >= filterWidth) break; size_t filterSizeW = (filterWidth - 1) * dilation + 1;
size_t outputHeight = if (inputHeight + 2 * padding < filterSizeH ||
(inputHeight - filterHeight + 2 * padding + stride) / inputWidth + 2 * padding < filterSizeW)
stride; break;
size_t outputWidth = if (padding >= filterSizeH || padding >= filterSizeW) break;
(inputWidth - filterWidth + 2 * padding + stride) / stride; size_t outputHeight =
(inputHeight - filterSizeH + 2 * padding) / stride + 1;
TensorShape imShape = size_t outputWidth =
TensorShape({channels, inputHeight, inputWidth}); (inputWidth - filterSizeW + 2 * padding) / stride + 1;
TensorShape colShape1 = TensorShape({channels,
filterHeight, TensorShape imShape =
filterWidth, TensorShape({channels, inputHeight, inputWidth});
outputHeight, TensorShape colShape1 = TensorShape({channels,
outputWidth}); filterHeight,
TensorShape colShape2 = TensorShape({outputHeight, filterWidth,
outputWidth, outputHeight,
channels, outputWidth});
filterHeight, TensorShape colShape2 = TensorShape({outputHeight,
filterWidth}); outputWidth,
channels,
size_t height = channels * filterHeight * filterWidth; filterHeight,
size_t width = outputHeight * outputWidth; filterWidth});
VectorPtr input1 = Vector::create(imShape.getElements(), false);
VectorPtr input2 = Vector::create(imShape.getElements(), false); size_t height = channels * filterHeight * filterWidth;
MatrixPtr output1 = Matrix::create(height, width, false, false); size_t width = outputHeight * outputWidth;
MatrixPtr output2 = Matrix::create(width, height, false, false); VectorPtr input1 =
input1->uniform(0.001, 1); Vector::create(imShape.getElements(), false);
input2->copyFrom(*input1); VectorPtr input2 =
Vector::create(imShape.getElements(), false);
Im2ColFunctor<kCFO, Device, T> im2Col1; MatrixPtr output1 =
Im2ColFunctor<kOCF, Device, T> im2Col2; Matrix::create(height, width, false, false);
im2Col1(input1->getData(), MatrixPtr output2 =
imShape, Matrix::create(width, height, false, false);
output1->getData(), input1->uniform(0.001, 1);
colShape1, input2->copyFrom(*input1);
stride,
stride, Im2ColFunctor<kCFO, Device, T> im2Col1;
padding, Im2ColFunctor<kOCF, Device, T> im2Col2;
padding); im2Col1(input1->getData(),
im2Col2(input2->getData(), imShape,
imShape, output1->getData(),
output2->getData(), colShape1,
colShape2, stride,
stride, stride,
stride, padding,
padding, padding,
padding); dilation,
dilation);
// The transposition of the result of ColFormat == kCFO im2Col2(input2->getData(),
// is equal to the result of ColFormat == kOCF. imShape,
MatrixPtr test; output2->getData(),
output2->transpose(test, true); colShape2,
autotest::TensorCheckErr(*output1, *test); stride,
stride,
Col2ImFunctor<kCFO, Device, T> col2Im1; padding,
Col2ImFunctor<kOCF, Device, T> col2Im2; padding,
col2Im1(input1->getData(), dilation,
imShape, dilation);
output1->getData(),
colShape1, // The transposition of the result of ColFormat == kCFO
stride, // is equal to the result of ColFormat == kOCF.
stride, MatrixPtr test;
padding, output2->transpose(test, true);
padding); autotest::TensorCheckErr(*output1, *test);
col2Im2(input2->getData(),
imShape, Col2ImFunctor<kCFO, Device, T> col2Im1;
output2->getData(), Col2ImFunctor<kOCF, Device, T> col2Im2;
colShape2,
stride, col2Im1(input1->getData(),
stride, imShape,
padding, output1->getData(),
padding); colShape1,
stride,
autotest::TensorCheckErr(*input1, *input2); stride,
padding,
padding,
dilation,
dilation);
col2Im2(input2->getData(),
imShape,
output2->getData(),
colShape2,
stride,
stride,
padding,
padding,
dilation,
dilation);
autotest::TensorCheckErr(*input1, *input2);
}
} }
} }
} }
......
...@@ -85,9 +85,49 @@ if(MOBILE_INFERENCE) ...@@ -85,9 +85,49 @@ if(MOBILE_INFERENCE)
gradientmachines/GradientMachineMode.cpp gradientmachines/GradientMachineMode.cpp
gradientmachines/MultiGradientMachine.cpp) gradientmachines/MultiGradientMachine.cpp)
# Remove useless layers # Remove layers that used in training
list(REMOVE_ITEM GSERVER_SOURCES list(REMOVE_ITEM GSERVER_SOURCES
layers/RecurrentLayerGroup.cpp) layers/RecurrentLayerGroup.cpp
layers/CostLayer.cpp
layers/MultiBoxLossLayer.cpp
layers/WarpCTCLayer.cpp
layers/CTCLayer.cpp
layers/LinearChainCTC.cpp
layers/PrintLayer.cpp)
list(REMOVE_ITEM GSERVER_SOURCES
layers/OuterProdLayer.cpp
layers/SumToOneNormLayer.cpp
layers/ConvShiftLayer.cpp
layers/InterpolationLayer.cpp
layers/AgentLayer.cpp
layers/DotMulOperator.cpp
layers/GruStepLayer.cpp
layers/LstmStepLayer.cpp
layers/ConvexCombinationLayer.cpp
layers/Conv3DLayer.cpp
layers/DeConv3DLayer.cpp
layers/CropLayer.cpp
layers/CrossEntropyOverBeam.cpp
layers/DataNormLayer.cpp
layers/FeatureMapExpandLayer.cpp
layers/HierarchicalSigmoidLayer.cpp
layers/MultinomialSampler.cpp
layers/NCELayer.cpp
layers/KmaxSeqScoreLayer.cpp
layers/MDLstmLayer.cpp
layers/MultiplexLayer.cpp
layers/PadLayer.cpp
layers/Pool3DLayer.cpp
layers/ResizeLayer.cpp
layers/RotateLayer.cpp
layers/RowConvLayer.cpp
layers/RowL2NormLayer.cpp
layers/SamplingIdLayer.cpp
layers/ScaleShiftLayer.cpp
layers/SelectiveFullyConnectedLayer.cpp
layers/SpatialPyramidPoolLayer.cpp
layers/BilinearInterpLayer.cpp
layers/ClipLayer.cpp)
endif() endif()
if(WITH_GPU) if(WITH_GPU)
......
...@@ -16,7 +16,6 @@ limitations under the License. */ ...@@ -16,7 +16,6 @@ limitations under the License. */
#include "NeuralNetwork.h" #include "NeuralNetwork.h"
#include "hl_gpu.h" #include "hl_gpu.h"
#include "paddle/gserver/layers/AgentLayer.h"
#include "paddle/utils/CustomStackTrace.h" #include "paddle/utils/CustomStackTrace.h"
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h" #include "paddle/utils/Stat.h"
...@@ -28,6 +27,7 @@ limitations under the License. */ ...@@ -28,6 +27,7 @@ limitations under the License. */
#ifndef PADDLE_MOBILE_INFERENCE #ifndef PADDLE_MOBILE_INFERENCE
#include "MultiNetwork.h" #include "MultiNetwork.h"
#include "RecurrentGradientMachine.h" #include "RecurrentGradientMachine.h"
#include "paddle/gserver/layers/AgentLayer.h"
#endif #endif
namespace paddle { namespace paddle {
...@@ -192,9 +192,11 @@ void NeuralNetwork::init(const ModelConfig& config, ...@@ -192,9 +192,11 @@ void NeuralNetwork::init(const ModelConfig& config,
void NeuralNetwork::connect(LayerPtr agentLayer, void NeuralNetwork::connect(LayerPtr agentLayer,
LayerPtr realLayer, LayerPtr realLayer,
int height) { int height) {
#ifndef PADDLE_MOBILE_INFERENCE
AgentLayer* agent = dynamic_cast<AgentLayer*>(agentLayer.get()); AgentLayer* agent = dynamic_cast<AgentLayer*>(agentLayer.get());
CHECK_NOTNULL(agent); CHECK_NOTNULL(agent);
agent->setRealLayer(realLayer, height); agent->setRealLayer(realLayer, height);
#endif
} }
void NeuralNetwork::connect(std::string agentLayerName, void NeuralNetwork::connect(std::string agentLayerName,
......
...@@ -79,6 +79,10 @@ bool ExpandConvLayer::init(const LayerMap &layerMap, ...@@ -79,6 +79,10 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
for (int i = 0; i < config_.inputs_size(); i++) { for (int i = 0; i < config_.inputs_size(); i++) {
std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]}; std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]};
std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]}; std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]};
std::vector<size_t> dilations = {(size_t)dilationY_[i],
(size_t)dilation_[i]};
bool useDilation = ((size_t)dilationY_[i] > 1 || (size_t)dilation_[i] > 1);
// Convolution Layer uses the GemmConv function by default. // Convolution Layer uses the GemmConv function by default.
convType = "GemmConv"; convType = "GemmConv";
...@@ -97,13 +101,14 @@ bool ExpandConvLayer::init(const LayerMap &layerMap, ...@@ -97,13 +101,14 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
#if defined(__ARM_NEON__) || defined(__ARM_NEON) #if defined(__ARM_NEON__) || defined(__ARM_NEON)
if ((filterSize_[i] == filterSizeY_[i]) && if ((filterSize_[i] == filterSizeY_[i]) &&
(filterSize_[i] == 3 || filterSize_[i] == 4) && (filterSize_[i] == 3 || filterSize_[i] == 4) &&
(stride_[i] == strideY_[i]) && (stride_[i] == 1 || stride_[i] == 2)) { (stride_[i] == strideY_[i]) && (stride_[i] == 1 || stride_[i] == 2) &&
!useDilation) {
convType = "NeonDepthwiseConv"; convType = "NeonDepthwiseConv";
} }
#endif #endif
} }
if (FLAGS_use_nnpack && !isDeconv_) { if (FLAGS_use_nnpack && !isDeconv_ && !useDilation) {
createFunction(forward_, createFunction(forward_,
"NNPACKConv", "NNPACKConv",
FuncConfig() FuncConfig()
...@@ -117,6 +122,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap, ...@@ -117,6 +122,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
FuncConfig() FuncConfig()
.set("paddings", paddings) .set("paddings", paddings)
.set("strides", strides) .set("strides", strides)
.set("dilations", dilations)
.set("groups", (size_t)groups_[i])); .set("groups", (size_t)groups_[i]));
createFunction(backward_, createFunction(backward_,
...@@ -124,6 +130,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap, ...@@ -124,6 +130,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
FuncConfig() FuncConfig()
.set("paddings", paddings) .set("paddings", paddings)
.set("strides", strides) .set("strides", strides)
.set("dilations", dilations)
.set("groups", (size_t)groups_[i])); .set("groups", (size_t)groups_[i]));
createFunction(backward_, createFunction(backward_,
...@@ -131,6 +138,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap, ...@@ -131,6 +138,7 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
FuncConfig() FuncConfig()
.set("paddings", paddings) .set("paddings", paddings)
.set("strides", strides) .set("strides", strides)
.set("dilations", dilations)
.set("groups", (size_t)groups_[i])); .set("groups", (size_t)groups_[i]));
} }
} }
......
...@@ -98,6 +98,7 @@ ClassRegistrar<Layer, LayerConfig> Layer::registrar_; ...@@ -98,6 +98,7 @@ ClassRegistrar<Layer, LayerConfig> Layer::registrar_;
LayerPtr Layer::create(const LayerConfig& config) { LayerPtr Layer::create(const LayerConfig& config) {
std::string type = config.type(); std::string type = config.type();
#ifndef PADDLE_MOBILE_INFERENCE
// NOTE: As following types have illegal character '-', // NOTE: As following types have illegal character '-',
// they can not use REGISTER_LAYER to registrar. // they can not use REGISTER_LAYER to registrar.
// Besides, to fit with old training models, // Besides, to fit with old training models,
...@@ -106,7 +107,6 @@ LayerPtr Layer::create(const LayerConfig& config) { ...@@ -106,7 +107,6 @@ LayerPtr Layer::create(const LayerConfig& config) {
return LayerPtr(new MultiClassCrossEntropy(config)); return LayerPtr(new MultiClassCrossEntropy(config));
else if (type == "rank-cost") else if (type == "rank-cost")
return LayerPtr(new RankingCost(config)); return LayerPtr(new RankingCost(config));
#ifndef PADDLE_MOBILE_INFERENCE
else if (type == "auc-validation") else if (type == "auc-validation")
return LayerPtr(new AucValidation(config)); return LayerPtr(new AucValidation(config));
else if (type == "pnpair-validation") else if (type == "pnpair-validation")
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "MaxPoolWithMaskLayer.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"
namespace paddle {
bool MaxPoolWithMaskLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
PoolLayer::init(layerMap, parameterMap);
setOutput("mask", &mask_);
return true;
}
size_t MaxPoolWithMaskLayer::getSize() {
CHECK_EQ(inputLayers_.size(), 1UL);
size_t layerSize = 0;
outputY_ = outputSize(imgSizeY_,
sizeY_,
confPaddingY_,
strideY_,
/* caffeMode */ false);
outputX_ = outputSize(imgSize_,
sizeX_,
confPadding_,
stride_,
/* caffeMode */ false);
layerSize = outputX_ * outputY_ * channels_;
getOutput().setFrameHeight(outputY_);
getOutput().setFrameWidth(outputX_);
return layerSize;
}
void MaxPoolWithMaskLayer::forward(PassType passType) {
size_t size = getSize();
MatrixPtr inputV = inputLayers_[0]->getOutputValue();
int batchSize = inputV->getHeight();
resetOutput(batchSize, size);
MatrixPtr outV = getOutputValue();
CHECK_EQ(size, outV->getWidth());
resetSpecifyOutput(mask_,
batchSize,
size,
/* isValueClean */ false,
/* isGradClean */ true);
MatrixPtr maskV = mask_.value;
outV->maxPoolForward(*inputV,
imgSizeY_,
imgSize_,
channels_,
sizeX_,
sizeY_,
strideY_,
stride_,
outputY_,
outputX_,
confPaddingY_,
confPadding_,
maskV);
}
void MaxPoolWithMaskLayer::backward(const UpdateCallback& callback) {
(void)callback;
if (NULL == getInputGrad(0)) {
return;
}
MatrixPtr outGrad = getOutputGrad();
MatrixPtr inputV = inputLayers_[0]->getOutputValue();
MatrixPtr outV = getOutputValue();
MatrixPtr inputGrad = inputLayers_[0]->getOutputGrad();
inputGrad->maxPoolBackward(*inputV,
imgSizeY_,
imgSize_,
*outGrad,
*outV,
sizeX_,
sizeY_,
strideY_,
stride_,
outputY_,
outputX_,
1,
1,
confPaddingY_,
confPadding_);
}
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "PoolLayer.h"
#include "paddle/math/Matrix.h"
namespace paddle {
/**
* @brief Basic parent layer of different kinds of pooling
*/
class MaxPoolWithMaskLayer : public PoolLayer {
protected:
Argument mask_;
public:
explicit MaxPoolWithMaskLayer(const LayerConfig& config)
: PoolLayer(config) {}
size_t getSize();
void forward(PassType passType) override;
void backward(const UpdateCallback& callback = nullptr) override;
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
};
} // namespace paddle
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "PoolLayer.h" #include "PoolLayer.h"
#include "MaxPoolWithMaskLayer.h"
#include "PoolProjectionLayer.h" #include "PoolProjectionLayer.h"
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -44,7 +45,6 @@ bool PoolLayer::init(const LayerMap& layerMap, ...@@ -44,7 +45,6 @@ bool PoolLayer::init(const LayerMap& layerMap,
strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride(); strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride();
confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding(); confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding();
outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x(); outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
return true; return true;
} }
...@@ -57,6 +57,8 @@ Layer* PoolLayer::create(const LayerConfig& config) { ...@@ -57,6 +57,8 @@ Layer* PoolLayer::create(const LayerConfig& config) {
} else if (CudnnPoolLayer::typeCheck(pool)) { } else if (CudnnPoolLayer::typeCheck(pool)) {
return new CudnnPoolLayer(config); return new CudnnPoolLayer(config);
#endif #endif
} else if (pool == "max-pool-with-mask") {
return new MaxPoolWithMaskLayer(config);
} else { } else {
LOG(FATAL) << "Unknown pool type: " << pool; LOG(FATAL) << "Unknown pool type: " << pool;
return nullptr; return nullptr;
......
...@@ -100,8 +100,9 @@ void ROIPoolLayer::forward(PassType passType) { ...@@ -100,8 +100,9 @@ void ROIPoolLayer::forward(PassType passType) {
size_t roiEndH = round(bottomROIs[4] * spatialScale_); size_t roiEndH = round(bottomROIs[4] * spatialScale_);
CHECK_GE(roiBatchIdx, 0UL); CHECK_GE(roiBatchIdx, 0UL);
CHECK_LT(roiBatchIdx, batchSize); CHECK_LT(roiBatchIdx, batchSize);
size_t roiHeight = std::max(roiEndH - roiStartH + 1, 1UL); size_t roiHeight =
size_t roiWidth = std::max(roiEndW - roiStartW + 1, 1UL); std::max(roiEndH - roiStartH + 1, static_cast<size_t>(1));
size_t roiWidth = std::max(roiEndW - roiStartW + 1, static_cast<size_t>(1));
real binSizeH = real binSizeH =
static_cast<real>(roiHeight) / static_cast<real>(pooledHeight_); static_cast<real>(roiHeight) / static_cast<real>(pooledHeight_);
real binSizeW = real binSizeW =
...@@ -114,10 +115,14 @@ void ROIPoolLayer::forward(PassType passType) { ...@@ -114,10 +115,14 @@ void ROIPoolLayer::forward(PassType passType) {
size_t wstart = static_cast<size_t>(std::floor(pw * binSizeW)); size_t wstart = static_cast<size_t>(std::floor(pw * binSizeW));
size_t hend = static_cast<size_t>(std::ceil((ph + 1) * binSizeH)); size_t hend = static_cast<size_t>(std::ceil((ph + 1) * binSizeH));
size_t wend = static_cast<size_t>(std::ceil((pw + 1) * binSizeW)); size_t wend = static_cast<size_t>(std::ceil((pw + 1) * binSizeW));
hstart = std::min(std::max(hstart + roiStartH, 0UL), height_); hstart = std::min(
wstart = std::min(std::max(wstart + roiStartW, 0UL), width_); std::max(hstart + roiStartH, static_cast<size_t>(0)), height_);
hend = std::min(std::max(hend + roiStartH, 0UL), height_); wstart = std::min(
wend = std::min(std::max(wend + roiStartW, 0UL), width_); std::max(wstart + roiStartW, static_cast<size_t>(0)), width_);
hend = std::min(std::max(hend + roiStartH, static_cast<size_t>(0)),
height_);
wend = std::min(std::max(wend + roiStartW, static_cast<size_t>(0)),
width_);
bool isEmpty = (hend <= hstart) || (wend <= wstart); bool isEmpty = (hend <= hstart) || (wend <= wstart);
size_t poolIndex = ph * pooledWidth_ + pw; size_t poolIndex = ph * pooledWidth_ + pw;
......
# gserver pacakge unittests # gserver pacakge unittests
add_simple_unittest(test_LinearChainCRF) add_simple_unittest(test_LinearChainCRF)
add_simple_unittest(test_MultinomialSampler)
add_simple_unittest(test_RecurrentLayer) add_simple_unittest(test_RecurrentLayer)
if(NOT MOBILE_INFERENCE)
add_simple_unittest(test_MultinomialSampler)
endif()
function(gserver_test TARGET) function(gserver_test TARGET)
add_unittest_without_exec(${TARGET} add_unittest_without_exec(${TARGET}
${TARGET}.cpp ${TARGET}.cpp
...@@ -24,6 +27,7 @@ gserver_test(test_ConvUnify) ...@@ -24,6 +27,7 @@ gserver_test(test_ConvUnify)
gserver_test(test_BatchNorm) gserver_test(test_BatchNorm)
gserver_test(test_KmaxSeqScore) gserver_test(test_KmaxSeqScore)
gserver_test(test_Expand) gserver_test(test_Expand)
gserver_test(test_MaxPoolingWithMaskOutput)
########## test_Mkldnn layers and activations ########## ########## test_Mkldnn layers and activations ##########
if(WITH_MKLDNN) if(WITH_MKLDNN)
...@@ -48,7 +52,7 @@ if(WITH_PYTHON) ...@@ -48,7 +52,7 @@ if(WITH_PYTHON)
endif() endif()
############### test_WarpCTCLayer ####################### ############### test_WarpCTCLayer #######################
if(NOT WITH_DOUBLE) if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
add_unittest_without_exec(test_WarpCTCLayer add_unittest_without_exec(test_WarpCTCLayer
test_WarpCTCLayer.cpp) test_WarpCTCLayer.cpp)
......
...@@ -434,7 +434,7 @@ void testConvLayer(const string& type, bool trans, bool useGpu) { ...@@ -434,7 +434,7 @@ void testConvLayer(const string& type, bool trans, bool useGpu) {
config.layerConfig.set_partial_sum(1); config.layerConfig.set_partial_sum(1);
config.layerConfig.set_shared_biases(true); config.layerConfig.set_shared_biases(true);
int dilation = 1; int dilation = 2;
if (type == "cudnn_conv") { if (type == "cudnn_conv") {
#if CUDNN_VERSION >= 6000 #if CUDNN_VERSION >= 6000
dilation = 2; dilation = 2;
...@@ -1234,6 +1234,7 @@ void testPoolLayer2(const string& poolType, bool trans, bool useGpu) { ...@@ -1234,6 +1234,7 @@ void testPoolLayer2(const string& poolType, bool trans, bool useGpu) {
TEST(Layer, PoolLayer) { TEST(Layer, PoolLayer) {
testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false); testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false);
testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false); testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false);
testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ false);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true); testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true);
...@@ -1242,6 +1243,7 @@ TEST(Layer, PoolLayer) { ...@@ -1242,6 +1243,7 @@ TEST(Layer, PoolLayer) {
testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true); testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true); testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true); testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ true);
#endif #endif
} }
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <string>
#include <vector>
#include "LayerGradUtil.h"
#include "paddle/math/MathUtils.h"
#include "paddle/testing/TestUtil.h"
using namespace paddle;
void setPoolConfig(TestConfig* config,
PoolConfig* pool,
const string& poolType) {
(*config).biasSize = 0;
(*config).layerConfig.set_type("pool");
(*config).layerConfig.set_num_filters(1);
int kw = 3, kh = 3;
int pw = 0, ph = 0;
int sw = 2, sh = 2;
pool->set_pool_type(poolType);
pool->set_channels(1);
pool->set_size_x(kw);
pool->set_size_y(kh);
pool->set_start(0);
pool->set_padding(pw);
pool->set_padding_y(ph);
pool->set_stride(sw);
pool->set_stride_y(sh);
int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
pool->set_output_x(ow);
pool->set_output_y(oh);
}
void doOneMaxPoolingWithMaskOutputTest(MatrixPtr& inputMat,
const string& poolType,
bool use_gpu,
MatrixPtr& maskMat) {
TestConfig config;
config.inputDefs.push_back({INPUT_DATA, "layer_0", 25, 0});
LayerInputConfig* input = config.layerConfig.add_inputs();
PoolConfig* pool = input->mutable_pool_conf();
pool->set_img_size(5);
pool->set_img_size_y(5);
setPoolConfig(&config, pool, poolType);
config.layerConfig.set_size(pool->output_x() * pool->output_y() *
pool->channels());
config.layerConfig.set_name("MaxPoolWithMask");
std::vector<DataLayerPtr> dataLayers;
LayerMap layerMap;
vector<Argument> datas;
initDataLayer(config,
&dataLayers,
&datas,
&layerMap,
"MaxPoolWithMask",
1,
false,
use_gpu);
dataLayers[0]->getOutputValue()->copyFrom(*inputMat);
FLAGS_use_gpu = use_gpu;
std::vector<ParameterPtr> parameters;
LayerPtr maxPoolingWithMaskOutputLayer;
initTestLayer(config, &layerMap, &parameters, &maxPoolingWithMaskOutputLayer);
maxPoolingWithMaskOutputLayer->forward(PASS_GC);
checkMatrixEqual(maxPoolingWithMaskOutputLayer->getOutput("mask").value,
maskMat);
}
TEST(Layer, maxPoolingWithMaskOutputLayerFwd) {
bool useGpu = false;
MatrixPtr inputMat;
MatrixPtr maskMat;
real inputData[] = {0.1, 0.1, 0.5, 0.5, 1.1, 0.2, 0.2, 0.6, 0.1,
0.1, 0.3, 0.3, 0.7, 0.1, 0.1, 0.4, 0.4, 0.8,
0.8, 0.1, 1.0, 2.0, 3.0, 0.0, 9.0};
real maskData[] = {12, 4, 22, 24};
inputMat = Matrix::create(1, 25, false, useGpu);
maskMat = Matrix::create(1, 4, false, useGpu);
inputMat->setData(inputData);
maskMat->setData(maskData);
doOneMaxPoolingWithMaskOutputTest(
inputMat, "max-pool-with-mask", useGpu, maskMat);
#ifdef PADDLE_WITH_CUDA
useGpu = true;
inputMat = Matrix::create(1, 25, false, useGpu);
maskMat = Matrix::create(1, 4, false, useGpu);
inputMat->copyFrom(inputData, 25);
maskMat->copyFrom(maskData, 4);
doOneMaxPoolingWithMaskOutputTest(
inputMat, "max-pool-with-mask", useGpu, maskMat);
#endif
}
...@@ -1902,5 +1902,52 @@ void BaseMatrixT<real>::sumOfProducts(BaseMatrixT& b, ...@@ -1902,5 +1902,52 @@ void BaseMatrixT<real>::sumOfProducts(BaseMatrixT& b,
} }
template class BaseMatrixT<real>; template class BaseMatrixT<real>;
#ifndef PADDLE_MOBILE_INFERENCE
template class BaseMatrixT<int>; template class BaseMatrixT<int>;
#else
template <>
void BaseMatrixT<int>::zero() {
applyUnary(unary::Zero<int>());
}
template <>
void BaseMatrixT<int>::assign(int p) {
applyUnary(unary::Assign<int>(p));
}
template <>
void BaseMatrixT<int>::isEqualTo(BaseMatrixT& b, int value) {
applyBinary(binary::IsEqual<int>(value), b);
}
template <>
void BaseMatrixT<int>::neg() {
applyUnary(unary::Neg<int>());
}
template <>
void BaseMatrixT<int>::abs2() {
applyUnary(unary::Abs<int>());
}
template <>
void BaseMatrixT<int>::add(int p) {
applyUnary(unary::Add<int>(p));
}
template <>
void BaseMatrixT<int>::add(int p1, int p2) {
applyUnary(unary::Add2<int>(p1, p2));
}
template <>
void BaseMatrixT<int>::applyL1(int learningRate, int decayRate) {
applyUnary(unary::ApplyL1<int>(learningRate * decayRate));
}
#endif
} // namespace paddle } // namespace paddle
...@@ -25,6 +25,19 @@ else() ...@@ -25,6 +25,19 @@ else()
message(STATUS "Compile with MKLDNNMatrix") message(STATUS "Compile with MKLDNNMatrix")
endif() endif()
if(MOBILE_INFERENCE)
list(REMOVE_ITEM MATH_SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/SIMDFunctions.cpp)
# Remove sparse
list(REMOVE_ITEM MATH_HEADERS
${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.h
${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.h
${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.h)
list(REMOVE_ITEM MATH_SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.cpp
${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.cpp
${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.cpp)
endif()
set(MATH_SOURCES set(MATH_SOURCES
"${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu" "${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu"
"${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu" "${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu"
......
...@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#ifndef PADDLE_MOBILE_INFERENCE
#include <cstddef> #include <cstddef>
#include "Matrix.h" #include "Matrix.h"
...@@ -309,3 +312,57 @@ private: ...@@ -309,3 +312,57 @@ private:
using Matrix::subMatrix; using Matrix::subMatrix;
}; };
} // namespace paddle } // namespace paddle
#else
#include "Matrix.h"
namespace paddle {
class CpuSparseMatrix : public Matrix {
public:
CpuSparseMatrix(size_t height,
size_t width,
size_t nnz, /* used to allocate space */
SparseValueType valueType = FLOAT_VALUE,
SparseFormat format = SPARSE_CSR,
bool trans = false)
: Matrix(NULL, height, width, trans, false) {}
CpuSparseMatrix(real* data,
int* rows,
int* cols,
size_t height,
size_t width,
size_t nnz,
SparseValueType valueType,
SparseFormat format,
bool trans)
: Matrix(NULL, height, width, trans, false) {}
real* getValue() const { return nullptr; }
size_t getColStartIdx(size_t i) const { return 0; }
size_t getRowStartIdx(size_t i) const { return 0; }
size_t getColNum(size_t i) const { return 0; }
int* getRowCols(size_t i) const { return nullptr; }
CpuSparseMatrixPtr getTmpSparseMatrix(size_t height, size_t width) {
return nullptr;
}
void resize(size_t newHeight,
size_t newWidth,
size_t newNnz, /* used to allocate space */
SparseValueType valueType,
SparseFormat format) {}
void resize(size_t newHeight, size_t newWidth) {}
MatrixPtr getTranspose() { return nullptr; }
void setRow(size_t row,
size_t colNum,
const unsigned int* cols,
const real* values) {}
};
} // namespace paddle
#endif
...@@ -451,6 +451,7 @@ void GpuMatrix::addSharedBias(Matrix& b, real scale) { ...@@ -451,6 +451,7 @@ void GpuMatrix::addSharedBias(Matrix& b, real scale) {
} }
void GpuMatrix::collectBias(Matrix& a, real scale) { void GpuMatrix::collectBias(Matrix& a, real scale) {
#ifdef PADDLE_WITH_CUDA
CHECK_EQ(getHeight(), (size_t)1); CHECK_EQ(getHeight(), (size_t)1);
CHECK_EQ(width_, a.getWidth()); CHECK_EQ(width_, a.getWidth());
GpuSparseMatrix* sMatPtr = dynamic_cast<GpuSparseMatrix*>(&a); GpuSparseMatrix* sMatPtr = dynamic_cast<GpuSparseMatrix*>(&a);
...@@ -461,6 +462,7 @@ void GpuMatrix::collectBias(Matrix& a, real scale) { ...@@ -461,6 +462,7 @@ void GpuMatrix::collectBias(Matrix& a, real scale) {
hl_sparse_matrix_s A_d = sMatPtr->sMatrix_.get(); hl_sparse_matrix_s A_d = sMatPtr->sMatrix_.get();
hl_sparse_matrix_column_sum(data, A_d, sMatPtr->getHeight(), width_, scale); hl_sparse_matrix_column_sum(data, A_d, sMatPtr->getHeight(), width_, scale);
} }
#endif
} }
void GpuMatrix::collectSharedBias(Matrix& a, real scale) { void GpuMatrix::collectSharedBias(Matrix& a, real scale) {
...@@ -552,6 +554,7 @@ void GpuMatrix::mul(const GpuSparseMatrix& a, ...@@ -552,6 +554,7 @@ void GpuMatrix::mul(const GpuSparseMatrix& a,
const GpuMatrix& b, const GpuMatrix& b,
real scaleAB, real scaleAB,
real scaleT) { real scaleT) {
#ifdef PADDLE_WITH_CUDA
CHECK(isContiguous()); CHECK(isContiguous());
CHECK(b.isContiguous()); CHECK(b.isContiguous());
CHECK(b.useGpu_ == true) << "Matrix type are not equal"; CHECK(b.useGpu_ == true) << "Matrix type are not equal";
...@@ -578,12 +581,14 @@ void GpuMatrix::mul(const GpuSparseMatrix& a, ...@@ -578,12 +581,14 @@ void GpuMatrix::mul(const GpuSparseMatrix& a,
b.height_, b.height_,
scaleAB, scaleAB,
scaleT); scaleT);
#endif
} }
void GpuMatrix::mul(const GpuMatrix& a, void GpuMatrix::mul(const GpuMatrix& a,
const GpuSparseMatrix& b, const GpuSparseMatrix& b,
real scaleAB, real scaleAB,
real scaleT) { real scaleT) {
#ifdef PADDLE_WITH_CUDA
CHECK(isContiguous()); CHECK(isContiguous());
CHECK(a.isContiguous()); CHECK(a.isContiguous());
CHECK(a.useGpu_ == true) << "Matrix type are not equal"; CHECK(a.useGpu_ == true) << "Matrix type are not equal";
...@@ -622,6 +627,7 @@ void GpuMatrix::mul(const GpuMatrix& a, ...@@ -622,6 +627,7 @@ void GpuMatrix::mul(const GpuMatrix& a,
scaleAB, scaleAB,
scaleT); scaleT);
} }
#endif
} }
/* this = a*b */ /* this = a*b */
...@@ -1028,15 +1034,23 @@ void GpuMatrix::maxPoolForward(Matrix& inputMat, ...@@ -1028,15 +1034,23 @@ void GpuMatrix::maxPoolForward(Matrix& inputMat,
size_t outputH, size_t outputH,
size_t outputW, size_t outputW,
size_t paddingH, size_t paddingH,
size_t paddingW) { size_t paddingW,
MatrixPtr maskMatP) {
CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal"; CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal";
real* inputData = inputMat.getData(); real* inputData = inputMat.getData();
real* maskData = NULL;
size_t frameNum = inputMat.getHeight(); size_t frameNum = inputMat.getHeight();
CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth()); CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth());
CHECK(height_ == inputMat.getHeight()); CHECK(height_ == inputMat.getHeight());
CHECK(width_ == outputH * outputW * channels); CHECK(width_ == outputH * outputW * channels);
if (maskMatP != NULL) {
CHECK(maskMatP->useGpu_ == true) << "Matrix type are not equal";
CHECK(outputH * outputW * channels == maskMatP->getWidth());
maskData = maskMatP->getData();
}
hl_maxpool_forward(frameNum, hl_maxpool_forward(frameNum,
inputData, inputData,
channels, channels,
...@@ -1051,7 +1065,8 @@ void GpuMatrix::maxPoolForward(Matrix& inputMat, ...@@ -1051,7 +1065,8 @@ void GpuMatrix::maxPoolForward(Matrix& inputMat,
paddingH, paddingH,
paddingW, paddingW,
data_, data_,
getStride()); getStride(),
maskData);
} }
void GpuMatrix::maxPoolBackward(Matrix& inputMat, void GpuMatrix::maxPoolBackward(Matrix& inputMat,
...@@ -1548,6 +1563,7 @@ void GpuMatrix::bilinearBackward(const Matrix& out, ...@@ -1548,6 +1563,7 @@ void GpuMatrix::bilinearBackward(const Matrix& out,
} }
void GpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) { void GpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) {
#ifdef PADDLE_WITH_CUDA
GpuMatrix* outputPtr = dynamic_cast<GpuMatrix*>(&output); GpuMatrix* outputPtr = dynamic_cast<GpuMatrix*>(&output);
auto labelPtr = dynamic_cast<GpuSparseMatrix*>(&label); auto labelPtr = dynamic_cast<GpuSparseMatrix*>(&label);
...@@ -1563,9 +1579,11 @@ void GpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) { ...@@ -1563,9 +1579,11 @@ void GpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) {
hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get(); hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get();
hl_matrix_multi_binary_cross_entropy( hl_matrix_multi_binary_cross_entropy(
output_d, entropy_d, mat_d, height_, outputPtr->width_); output_d, entropy_d, mat_d, height_, outputPtr->width_);
#endif
} }
void GpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) { void GpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) {
#ifdef PADDLE_WITH_CUDA
GpuMatrix* outputPtr = dynamic_cast<GpuMatrix*>(&output); GpuMatrix* outputPtr = dynamic_cast<GpuMatrix*>(&output);
auto labelPtr = dynamic_cast<GpuSparseMatrix*>(&label); auto labelPtr = dynamic_cast<GpuSparseMatrix*>(&label);
...@@ -1581,6 +1599,7 @@ void GpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) { ...@@ -1581,6 +1599,7 @@ void GpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) {
hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get(); hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get();
hl_matrix_multi_binary_cross_entropy_bp( hl_matrix_multi_binary_cross_entropy_bp(
output_d, grad_d, mat_d, height_, width_); output_d, grad_d, mat_d, height_, width_);
#endif
} }
void GpuMatrix::vol2Col(real* dataSrc, void GpuMatrix::vol2Col(real* dataSrc,
...@@ -1973,9 +1992,11 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat, ...@@ -1973,9 +1992,11 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
size_t outputH, size_t outputH,
size_t outputW, size_t outputW,
size_t paddingH, size_t paddingH,
size_t paddingW) { size_t paddingW,
MatrixPtr maskMatP) {
real* inputData = inputMat.getData(); real* inputData = inputMat.getData();
real* outData = data_; real* outData = data_;
real* maskData = NULL;
size_t num = inputMat.getHeight(); size_t num = inputMat.getHeight();
size_t inLength = imgSizeH * imgSizeW; size_t inLength = imgSizeH * imgSizeW;
size_t outLength = outputH * outputW; size_t outLength = outputH * outputW;
...@@ -1984,6 +2005,11 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat, ...@@ -1984,6 +2005,11 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
CHECK_EQ(channels * outLength, this->getWidth()); CHECK_EQ(channels * outLength, this->getWidth());
size_t outStride = getStride(); size_t outStride = getStride();
if (maskMatP != NULL) {
maskData = maskMatP->getData();
CHECK_EQ(channels * outLength, maskMatP->getWidth());
}
/* initialize the data_ */ /* initialize the data_ */
for (size_t i = 0; i < height_; i++) { for (size_t i = 0; i < height_; i++) {
for (size_t j = 0; j < width_; j++) { for (size_t j = 0; j < width_; j++) {
...@@ -2005,10 +2031,21 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat, ...@@ -2005,10 +2031,21 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
int wstart = pw * strideW - paddingW; int wstart = pw * strideW - paddingW;
int wend = std::min(wstart + sizeX, imgSizeW); int wend = std::min(wstart + sizeX, imgSizeW);
wstart = std::max(wstart, 0); wstart = std::max(wstart, 0);
for (int h = hstart; h < hend; ++h) { if (maskData == NULL) {
for (int w = wstart; w < wend; ++w) { for (int h = hstart; h < hend; ++h) {
outData[ph * outputW + pw] = std::max( for (int w = wstart; w < wend; ++w) {
outData[ph * outputW + pw], inputData[h * imgSizeW + w]); outData[ph * outputW + pw] = std::max(
outData[ph * outputW + pw], inputData[h * imgSizeW + w]);
}
}
} else {
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
if (outData[ph * outputW + pw] < inputData[h * imgSizeW + w]) {
outData[ph * outputW + pw] = inputData[h * imgSizeW + w];
maskData[ph * outputW + pw] = h * imgSizeW + w;
}
}
} }
} }
} }
...@@ -2016,6 +2053,8 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat, ...@@ -2016,6 +2053,8 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
// compute offset // compute offset
inputData += inLength; inputData += inLength;
outData += outLength; outData += outLength;
if (maskData != NULL) maskData += outLength;
} }
} }
} }
...@@ -3226,6 +3265,7 @@ template void CpuMatrix::mul<CpuMatrix, CacheRowCpuMatrix>(CpuSparseMatrix* a, ...@@ -3226,6 +3265,7 @@ template void CpuMatrix::mul<CpuMatrix, CacheRowCpuMatrix>(CpuSparseMatrix* a,
real scaleAB, real scaleAB,
real scaleT); real scaleT);
#ifndef PADDLE_MOBILE_INFERENCE
void SharedCpuMatrix::mul(CpuSparseMatrix* a, void SharedCpuMatrix::mul(CpuSparseMatrix* a,
CpuMatrix* b, CpuMatrix* b,
real scaleAB, real scaleAB,
...@@ -3354,6 +3394,7 @@ void SharedCpuMatrix::initBlock(int blockNum) { ...@@ -3354,6 +3394,7 @@ void SharedCpuMatrix::initBlock(int blockNum) {
} }
} }
#endif
/* Add a (column) vector b to matrix a, column by column */ /* Add a (column) vector b to matrix a, column by column */
void CpuMatrix::addColumnVector(const Matrix& b) { void CpuMatrix::addColumnVector(const Matrix& b) {
BaseMatrix::addColVector(const_cast<Matrix&>(b)); BaseMatrix::addColVector(const_cast<Matrix&>(b));
......
...@@ -861,7 +861,8 @@ public: ...@@ -861,7 +861,8 @@ public:
/** /**
* Pooling forward operation, pick out the largest element * Pooling forward operation, pick out the largest element
* in the sizeX of value * in the sizeX of value, if the maskMatP is not NULL, it will
* also caculate the location indices.
*/ */
virtual void maxPoolForward(Matrix& inputMat, virtual void maxPoolForward(Matrix& inputMat,
size_t imgSizeH, size_t imgSizeH,
...@@ -874,7 +875,8 @@ public: ...@@ -874,7 +875,8 @@ public:
size_t outputH, size_t outputH,
size_t outputW, size_t outputW,
size_t paddingH, size_t paddingH,
size_t paddingW) { size_t paddingW,
MatrixPtr maskMatP = NULL) {
LOG(FATAL) << "Not implemeted"; LOG(FATAL) << "Not implemeted";
} }
...@@ -1426,7 +1428,8 @@ public: ...@@ -1426,7 +1428,8 @@ public:
size_t outputH, size_t outputH,
size_t outputW, size_t outputW,
size_t paddingH, size_t paddingH,
size_t paddingW); size_t paddingW,
MatrixPtr maskMatP);
void maxPoolBackward(Matrix& image, void maxPoolBackward(Matrix& image,
size_t imgSizeH, size_t imgSizeH,
...@@ -1697,7 +1700,8 @@ public: ...@@ -1697,7 +1700,8 @@ public:
size_t outputH, size_t outputH,
size_t outputW, size_t outputW,
size_t paddingH, size_t paddingH,
size_t paddingW); size_t paddingW,
MatrixPtr maskMatP);
void maxPoolBackward(Matrix& image, void maxPoolBackward(Matrix& image,
size_t imgSizeH, size_t imgSizeH,
...@@ -2066,6 +2070,7 @@ public: ...@@ -2066,6 +2070,7 @@ public:
class SharedCpuMatrix : public CpuMatrix { class SharedCpuMatrix : public CpuMatrix {
public: public:
#ifndef PADDLE_MOBILE_INFERENCE
/* blockNum is number of partitions of the matrix */ /* blockNum is number of partitions of the matrix */
SharedCpuMatrix(int blockNum, size_t height, size_t width, bool trans = false) SharedCpuMatrix(int blockNum, size_t height, size_t width, bool trans = false)
: CpuMatrix(height, width, trans) { : CpuMatrix(height, width, trans) {
...@@ -2111,6 +2116,7 @@ private: ...@@ -2111,6 +2116,7 @@ private:
ThreadLocal<CpuMatrixPtr> localBuf_; ThreadLocal<CpuMatrixPtr> localBuf_;
ThreadLocal<std::vector<int>> localBufRows_; ThreadLocal<std::vector<int>> localBufRows_;
ThreadLocal<std::vector<int>> blockSeq_; ThreadLocal<std::vector<int>> blockSeq_;
#endif
}; };
typedef struct { unsigned int col; } sparse_non_value_t; typedef struct { unsigned int col; } sparse_non_value_t;
......
...@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#ifndef PADDLE_MOBILE_INFERENCE
#include <cstddef> #include <cstddef>
#include "CpuSparseMatrix.h" #include "CpuSparseMatrix.h"
#include "Matrix.h" #include "Matrix.h"
...@@ -237,3 +240,47 @@ private: ...@@ -237,3 +240,47 @@ private:
}; };
} // namespace paddle } // namespace paddle
#else
#include "CpuSparseMatrix.h"
namespace paddle {
class GpuSparseMatrix : public Matrix {
public:
GpuSparseMatrix(size_t height,
size_t width,
size_t nnz, /* used to allocate space */
SparseValueType valueType = FLOAT_VALUE,
SparseFormat format_ = SPARSE_CSR,
bool trans = false)
: Matrix(NULL, height, width, trans, false) {}
GpuSparseMatrix(real* value,
int* rows,
int* cols,
size_t height,
size_t width,
size_t nnz,
SparseValueType valueType,
SparseFormat format,
bool trans)
: Matrix(NULL, height, width, trans, true) {}
void resize(size_t newHeight,
size_t newWidth,
size_t newNnz, /* used to allocate space */
SparseValueType valueType,
SparseFormat format) {}
void resize(size_t newHeight, size_t newWidth) {}
MatrixPtr getTranspose() { return nullptr; }
void setRow(size_t row,
size_t colNum,
const unsigned int* cols,
const real* values) {}
};
} // namespace paddle
#endif
...@@ -14,6 +14,8 @@ limitations under the License. */ ...@@ -14,6 +14,8 @@ limitations under the License. */
#pragma once #pragma once
#ifndef PADDLE_MOBILE_INFERENCE
#include <gflags/gflags.h> #include <gflags/gflags.h>
#include <string.h> #include <string.h>
#include <algorithm> #include <algorithm>
...@@ -313,3 +315,27 @@ private: ...@@ -313,3 +315,27 @@ private:
}; };
} // namespace paddle } // namespace paddle
#else
namespace paddle {
class SparseRowCpuMatrix : public CpuMatrix {
public:
void reserveStore() {}
void clearIndices() {}
};
class SparsePrefetchRowCpuMatrix : public SparseRowCpuMatrix {
public:
void setupIndices() {}
void addRows(MatrixPtr input) {}
void addRows(IVectorPtr ids) {}
};
class SparseAutoGrowRowCpuMatrix : public SparseRowCpuMatrix {};
class CacheRowCpuMatrix : public SparseAutoGrowRowCpuMatrix {};
class SparseRowIdsCpuMatrix : public CpuMatrix {};
} // namespace paddle
#endif
...@@ -3,8 +3,10 @@ ...@@ -3,8 +3,10 @@
add_simple_unittest(test_ExecViaCpu) add_simple_unittest(test_ExecViaCpu)
add_simple_unittest(test_SIMDFunctions) add_simple_unittest(test_SIMDFunctions)
add_simple_unittest(test_TrainingAlgorithm) add_simple_unittest(test_TrainingAlgorithm)
add_simple_unittest(test_SparseMatrix)
add_simple_unittest(test_RowBuffer) add_simple_unittest(test_RowBuffer)
if(NOT MOBILE_INFERENCE)
add_simple_unittest(test_SparseMatrix)
endif()
# TODO(yuyang18): Refactor TestUtil.cpp. Remove this cross module reference. # TODO(yuyang18): Refactor TestUtil.cpp. Remove this cross module reference.
add_unittest(test_matrixCompare add_unittest(test_matrixCompare
......
...@@ -30,6 +30,10 @@ class AccuracyOp : public framework::OperatorWithKernel { ...@@ -30,6 +30,10 @@ class AccuracyOp : public framework::OperatorWithKernel {
"Input (Label) of accuracy op should not be null."); "Input (Label) of accuracy op should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Accuracy"), PADDLE_ENFORCE(ctx->HasOutput("Accuracy"),
"Output (Accuracy) of AccuracyOp should not be null."); "Output (Accuracy) of AccuracyOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Correct"),
"Output (Correct) of AccuracyOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Total"),
"Output (Total) of AccuracyOp should not be null.");
auto inference_dim = ctx->GetInputDim("Out"); auto inference_dim = ctx->GetInputDim("Out");
auto label_dim = ctx->GetInputDim("Label"); auto label_dim = ctx->GetInputDim("Label");
...@@ -43,6 +47,8 @@ class AccuracyOp : public framework::OperatorWithKernel { ...@@ -43,6 +47,8 @@ class AccuracyOp : public framework::OperatorWithKernel {
" the same as label."); " the same as label.");
ctx->SetOutputDim("Accuracy", {1}); ctx->SetOutputDim("Accuracy", {1});
ctx->SetOutputDim("Correct", {1});
ctx->SetOutputDim("Total", {1});
ctx->ShareLoD("Out", /*->*/ "Accuracy"); ctx->ShareLoD("Out", /*->*/ "Accuracy");
} }
...@@ -66,6 +72,8 @@ class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -66,6 +72,8 @@ class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Label", "Label of the training data"); AddInput("Label", "Label of the training data");
// TODO(typhoonzero): AddInput("Weight", ... // TODO(typhoonzero): AddInput("Weight", ...
AddOutput("Accuracy", "The accuracy of current batch"); AddOutput("Accuracy", "The accuracy of current batch");
AddOutput("Correct", "The correct samples count of current batch");
AddOutput("Total", "The samples count of current batch");
AddComment(R"DOC( AddComment(R"DOC(
Accuracy Operator. Accuracy Operator.
......
...@@ -24,7 +24,8 @@ using platform::PADDLE_CUDA_NUM_THREADS; ...@@ -24,7 +24,8 @@ using platform::PADDLE_CUDA_NUM_THREADS;
template <int BlockSize> template <int BlockSize>
__global__ void AccuracyCudaKernel(const int N, const int D, __global__ void AccuracyCudaKernel(const int N, const int D,
const int64_t* Xdata, const int64_t* Xdata,
const int64_t* labeldata, float* accuracy) { const int64_t* labeldata, int* correct_data,
float* accuracy) {
int count = 0; int count = 0;
__shared__ int total[BlockSize]; __shared__ int total[BlockSize];
...@@ -43,6 +44,7 @@ __global__ void AccuracyCudaKernel(const int N, const int D, ...@@ -43,6 +44,7 @@ __global__ void AccuracyCudaKernel(const int N, const int D,
// reduce the count with init value 0, and output accuracy. // reduce the count with init value 0, and output accuracy.
int result = thrust::reduce(thrust::device, total, total + BlockSize, 0); int result = thrust::reduce(thrust::device, total, total + BlockSize, 0);
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
*correct_data = result;
*accuracy = static_cast<float>(result) / static_cast<float>(N); *accuracy = static_cast<float>(result) / static_cast<float>(N);
} }
} }
...@@ -56,31 +58,48 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> { ...@@ -56,31 +58,48 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
auto* inference = ctx.Input<Tensor>("Out"); auto* inference = ctx.Input<Tensor>("Out");
auto* indices = ctx.Input<Tensor>("Indices"); auto* indices = ctx.Input<Tensor>("Indices");
auto* label = ctx.Input<Tensor>("Label"); auto* label = ctx.Input<Tensor>("Label");
auto* accuracy = ctx.Output<Tensor>("Accuracy"); auto* accuracy = ctx.Output<Tensor>("Accuracy");
auto* correct = ctx.Output<Tensor>("Correct");
auto* total = ctx.Output<Tensor>("Total");
// FIXME(typhoonzero): only support indices currently // FIXME(typhoonzero): only support indices currently
// if add support for output values, how to detect the data type? // if add support for output values, how to detect the data type?
const int64_t* indices_data = indices->data<int64_t>(); const int64_t* indices_data = indices->data<int64_t>();
const int64_t* label_data = label->data<int64_t>(); const int64_t* label_data = label->data<int64_t>();
int* correct_data = correct->mutable_data<int>(ctx.GetPlace());
int* total_data = total->mutable_data<int>(ctx.GetPlace());
float* accuracy_data = accuracy->mutable_data<float>(ctx.GetPlace()); float* accuracy_data = accuracy->mutable_data<float>(ctx.GetPlace());
size_t num_samples = inference->dims()[0]; int num_samples = static_cast<int>(inference->dims()[0]);
size_t infer_width = inference->dims()[1]; size_t infer_width = inference->dims()[1];
PADDLE_ENFORCE(cudaMemset(accuracy_data, 0, sizeof(float))); PADDLE_ENFORCE(cudaMemset(accuracy_data, 0, sizeof(float)));
// cudaMemset((void**)&correct_data, 0, sizeof(float));
if (num_samples == 0) { if (num_samples == 0) {
return; return;
} }
cudaMemcpy(total_data, &num_samples, sizeof(int), cudaMemcpyHostToDevice);
AccuracyCudaKernel<PADDLE_CUDA_NUM_THREADS><<< AccuracyCudaKernel<PADDLE_CUDA_NUM_THREADS><<<
1, PADDLE_CUDA_NUM_THREADS, 0, ctx.cuda_device_context().stream()>>>( 1, PADDLE_CUDA_NUM_THREADS, 0, ctx.cuda_device_context().stream()>>>(
num_samples, infer_width, indices_data, label_data, accuracy_data); num_samples, infer_width, indices_data, label_data, correct_data,
accuracy_data);
int d_num_samples, d_num_correct;
float d_accuracy;
cudaMemcpy(&d_num_correct, correct_data, sizeof(int),
cudaMemcpyDeviceToHost);
cudaMemcpy(&d_num_samples, total_data, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&d_accuracy, accuracy_data, sizeof(float),
cudaMemcpyDeviceToHost);
} }
}; };
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
// FIXME(typhoonzero): types of T is for infernece data. // FIXME(typhoonzero): types of T is for inference data.
// label data is always int // label data is always int64
REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel<float>, REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel<float>,
paddle::operators::AccuracyOpCUDAKernel<double>); paddle::operators::AccuracyOpCUDAKernel<double>);
...@@ -29,7 +29,11 @@ class AccuracyKernel : public framework::OpKernel<T> { ...@@ -29,7 +29,11 @@ class AccuracyKernel : public framework::OpKernel<T> {
auto* indices = ctx.Input<Tensor>("Indices"); auto* indices = ctx.Input<Tensor>("Indices");
auto* label = ctx.Input<Tensor>("Label"); auto* label = ctx.Input<Tensor>("Label");
auto* accuracy = ctx.Output<Tensor>("Accuracy"); auto* accuracy = ctx.Output<Tensor>("Accuracy");
auto* correct = ctx.Output<Tensor>("Correct");
auto* total = ctx.Output<Tensor>("Total");
int* correct_data = correct->mutable_data<int>(ctx.GetPlace());
int* total_data = total->mutable_data<int>(ctx.GetPlace());
float* accuracy_data = accuracy->mutable_data<float>(ctx.GetPlace()); float* accuracy_data = accuracy->mutable_data<float>(ctx.GetPlace());
const int64_t* indices_data = indices->data<int64_t>(); const int64_t* indices_data = indices->data<int64_t>();
...@@ -55,7 +59,8 @@ class AccuracyKernel : public framework::OpKernel<T> { ...@@ -55,7 +59,8 @@ class AccuracyKernel : public framework::OpKernel<T> {
} }
} }
// FIXME(typhoonzero): we don't accumulate the accuracy for now. *correct_data = num_correct;
*total_data = num_samples;
*accuracy_data = *accuracy_data =
static_cast<float>(num_correct) / static_cast<float>(num_samples); static_cast<float>(num_correct) / static_cast<float>(num_samples);
} }
......
...@@ -94,5 +94,13 @@ class CompareOp : public framework::OperatorWithKernel { ...@@ -94,5 +94,13 @@ class CompareOp : public framework::OperatorWithKernel {
REGISTER_LOGICAL_OP(less_than, "Out = X < Y"); REGISTER_LOGICAL_OP(less_than, "Out = X < Y");
REGISTER_LOGICAL_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor); REGISTER_LOGICAL_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor);
REGISTER_LOGICAL_OP(less_equal, "Out = X <= Y");
REGISTER_LOGICAL_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor);
REGISTER_LOGICAL_OP(greater_than, "Out = X > Y");
REGISTER_LOGICAL_KERNEL(greater_than, CPU,
paddle::operators::GreaterThanFunctor);
REGISTER_LOGICAL_OP(greater_equal, "Out = X >= Y");
REGISTER_LOGICAL_KERNEL(greater_equal, CPU,
paddle::operators::GreaterEqualFunctor);
REGISTER_LOGICAL_OP(equal, "Out = X == Y"); REGISTER_LOGICAL_OP(equal, "Out = X == Y");
REGISTER_LOGICAL_KERNEL(equal, CPU, paddle::operators::EqualFunctor); REGISTER_LOGICAL_KERNEL(equal, CPU, paddle::operators::EqualFunctor);
...@@ -15,4 +15,9 @@ ...@@ -15,4 +15,9 @@
#include "paddle/operators/compare_op.h" #include "paddle/operators/compare_op.h"
REGISTER_LOGICAL_KERNEL(less_than, GPU, paddle::operators::LessThanFunctor); REGISTER_LOGICAL_KERNEL(less_than, GPU, paddle::operators::LessThanFunctor);
REGISTER_LOGICAL_KERNEL(less_equal, GPU, paddle::operators::LessEqualFunctor);
REGISTER_LOGICAL_KERNEL(greater_than, GPU,
paddle::operators::GreaterThanFunctor);
REGISTER_LOGICAL_KERNEL(greater_equal, GPU,
paddle::operators::GreaterEqualFunctor);
REGISTER_LOGICAL_KERNEL(equal, GPU, paddle::operators::EqualFunctor); REGISTER_LOGICAL_KERNEL(equal, GPU, paddle::operators::EqualFunctor);
...@@ -27,6 +27,24 @@ struct LessThanFunctor { ...@@ -27,6 +27,24 @@ struct LessThanFunctor {
HOSTDEVICE bool operator()(const T& a, const T& b) const { return a < b; } HOSTDEVICE bool operator()(const T& a, const T& b) const { return a < b; }
}; };
template <typename T>
struct LessEqualFunctor {
using ELEM_TYPE = T;
HOSTDEVICE bool operator()(const T& a, const T& b) const { return a <= b; }
};
template <typename T>
struct GreaterThanFunctor {
using ELEM_TYPE = T;
HOSTDEVICE bool operator()(const T& a, const T& b) const { return a > b; }
};
template <typename T>
struct GreaterEqualFunctor {
using ELEM_TYPE = T;
HOSTDEVICE bool operator()(const T& a, const T& b) const { return a >= b; }
};
template <typename T> template <typename T>
struct EqualFunctor { struct EqualFunctor {
using ELEM_TYPE = T; using ELEM_TYPE = T;
......
...@@ -34,7 +34,13 @@ REGISTER_OP(elementwise_add, ops::ElementwiseOp, ops::ElementwiseAddOpMaker, ...@@ -34,7 +34,13 @@ REGISTER_OP(elementwise_add, ops::ElementwiseOp, ops::ElementwiseAddOpMaker,
elementwise_add_grad, ops::ElementwiseOpGrad); elementwise_add_grad, ops::ElementwiseOpGrad);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
elementwise_add, elementwise_add,
ops::ElementwiseAddKernel<paddle::platform::CPUPlace, float>); ops::ElementwiseAddKernel<paddle::platform::CPUPlace, float>,
ops::ElementwiseAddKernel<paddle::platform::CPUPlace, double>,
ops::ElementwiseAddKernel<paddle::platform::CPUPlace, int>,
ops::ElementwiseAddKernel<paddle::platform::CPUPlace, int64_t>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
elementwise_add_grad, elementwise_add_grad,
ops::ElementwiseAddGradKernel<paddle::platform::CPUPlace, float>); ops::ElementwiseAddGradKernel<paddle::platform::CPUPlace, float>,
ops::ElementwiseAddGradKernel<paddle::platform::CPUPlace, double>,
ops::ElementwiseAddGradKernel<paddle::platform::CPUPlace, int>,
ops::ElementwiseAddGradKernel<paddle::platform::CPUPlace, int64_t>);
...@@ -19,7 +19,13 @@ namespace ops = paddle::operators; ...@@ -19,7 +19,13 @@ namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL( REGISTER_OP_GPU_KERNEL(
elementwise_add, elementwise_add,
ops::ElementwiseAddKernel<paddle::platform::GPUPlace, float>); ops::ElementwiseAddKernel<paddle::platform::GPUPlace, float>,
ops::ElementwiseAddKernel<paddle::platform::GPUPlace, double>,
ops::ElementwiseAddKernel<paddle::platform::GPUPlace, int>,
ops::ElementwiseAddKernel<paddle::platform::GPUPlace, int64_t>);
REGISTER_OP_GPU_KERNEL( REGISTER_OP_GPU_KERNEL(
elementwise_add_grad, elementwise_add_grad,
ops::ElementwiseAddGradKernel<paddle::platform::GPUPlace, float>); ops::ElementwiseAddGradKernel<paddle::platform::GPUPlace, float>,
ops::ElementwiseAddGradKernel<paddle::platform::GPUPlace, double>,
ops::ElementwiseAddGradKernel<paddle::platform::GPUPlace, int>,
ops::ElementwiseAddGradKernel<paddle::platform::GPUPlace, int64_t>);
...@@ -35,7 +35,13 @@ REGISTER_OP(elementwise_div, ops::ElementwiseOp, ops::ElementwiseDivOpMaker, ...@@ -35,7 +35,13 @@ REGISTER_OP(elementwise_div, ops::ElementwiseOp, ops::ElementwiseDivOpMaker,
elementwise_div_grad, ops::ElementwiseOpGrad); elementwise_div_grad, ops::ElementwiseOpGrad);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
elementwise_div, elementwise_div,
ops::ElementwiseDivKernel<paddle::platform::CPUPlace, float>); ops::ElementwiseDivKernel<paddle::platform::CPUPlace, float>,
ops::ElementwiseDivKernel<paddle::platform::CPUPlace, double>,
ops::ElementwiseDivKernel<paddle::platform::CPUPlace, int>,
ops::ElementwiseDivKernel<paddle::platform::CPUPlace, int64_t>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
elementwise_div_grad, elementwise_div_grad,
ops::ElementwiseDivGradKernel<paddle::platform::CPUPlace, float>); ops::ElementwiseDivGradKernel<paddle::platform::CPUPlace, float>,
ops::ElementwiseDivGradKernel<paddle::platform::CPUPlace, double>,
ops::ElementwiseDivGradKernel<paddle::platform::CPUPlace, int>,
ops::ElementwiseDivGradKernel<paddle::platform::CPUPlace, int64_t>);
...@@ -19,7 +19,13 @@ namespace ops = paddle::operators; ...@@ -19,7 +19,13 @@ namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL( REGISTER_OP_GPU_KERNEL(
elementwise_div, elementwise_div,
ops::ElementwiseDivKernel<paddle::platform::GPUPlace, float>); ops::ElementwiseDivKernel<paddle::platform::GPUPlace, float>,
ops::ElementwiseDivKernel<paddle::platform::GPUPlace, double>,
ops::ElementwiseDivKernel<paddle::platform::GPUPlace, int>,
ops::ElementwiseDivKernel<paddle::platform::GPUPlace, int64_t>);
REGISTER_OP_GPU_KERNEL( REGISTER_OP_GPU_KERNEL(
elementwise_div_grad, elementwise_div_grad,
ops::ElementwiseDivGradKernel<paddle::platform::GPUPlace, float>); ops::ElementwiseDivGradKernel<paddle::platform::GPUPlace, float>,
ops::ElementwiseDivGradKernel<paddle::platform::GPUPlace, double>,
ops::ElementwiseDivGradKernel<paddle::platform::GPUPlace, int>,
ops::ElementwiseDivGradKernel<paddle::platform::GPUPlace, int64_t>);
...@@ -37,8 +37,12 @@ REGISTER_OP(elementwise_mul, ops::ElementwiseOp, ops::ElementwiseMulOpMaker, ...@@ -37,8 +37,12 @@ REGISTER_OP(elementwise_mul, ops::ElementwiseOp, ops::ElementwiseMulOpMaker,
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
elementwise_mul, elementwise_mul,
ops::ElementwiseMulKernel<paddle::platform::CPUPlace, float>, ops::ElementwiseMulKernel<paddle::platform::CPUPlace, float>,
ops::ElementwiseMulKernel<paddle::platform::CPUPlace, double>); ops::ElementwiseMulKernel<paddle::platform::CPUPlace, double>,
ops::ElementwiseMulKernel<paddle::platform::CPUPlace, int>,
ops::ElementwiseMulKernel<paddle::platform::CPUPlace, int64_t>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
elementwise_mul_grad, elementwise_mul_grad,
ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, float>, ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, float>,
ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, double>); ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, double>,
ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, int>,
ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, int64_t>);
...@@ -20,8 +20,12 @@ namespace ops = paddle::operators; ...@@ -20,8 +20,12 @@ namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL( REGISTER_OP_GPU_KERNEL(
elementwise_mul, elementwise_mul,
ops::ElementwiseMulKernel<paddle::platform::GPUPlace, float>, ops::ElementwiseMulKernel<paddle::platform::GPUPlace, float>,
ops::ElementwiseMulKernel<paddle::platform::GPUPlace, double>); ops::ElementwiseMulKernel<paddle::platform::GPUPlace, double>,
ops::ElementwiseMulKernel<paddle::platform::GPUPlace, int>,
ops::ElementwiseMulKernel<paddle::platform::GPUPlace, int64_t>);
REGISTER_OP_GPU_KERNEL( REGISTER_OP_GPU_KERNEL(
elementwise_mul_grad, elementwise_mul_grad,
ops::ElementwiseMulGradKernel<paddle::platform::GPUPlace, float>, ops::ElementwiseMulGradKernel<paddle::platform::GPUPlace, float>,
ops::ElementwiseMulGradKernel<paddle::platform::GPUPlace, double>); ops::ElementwiseMulGradKernel<paddle::platform::GPUPlace, double>,
ops::ElementwiseMulGradKernel<paddle::platform::GPUPlace, int>,
ops::ElementwiseMulGradKernel<paddle::platform::GPUPlace, int64_t>);
...@@ -34,7 +34,13 @@ REGISTER_OP(elementwise_sub, ops::ElementwiseOp, ops::ElementwiseSubOpMaker, ...@@ -34,7 +34,13 @@ REGISTER_OP(elementwise_sub, ops::ElementwiseOp, ops::ElementwiseSubOpMaker,
elementwise_sub_grad, ops::ElementwiseOpGrad); elementwise_sub_grad, ops::ElementwiseOpGrad);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
elementwise_sub, elementwise_sub,
ops::ElementwiseSubKernel<paddle::platform::CPUPlace, float>); ops::ElementwiseSubKernel<paddle::platform::CPUPlace, float>,
ops::ElementwiseSubKernel<paddle::platform::CPUPlace, double>,
ops::ElementwiseSubKernel<paddle::platform::CPUPlace, int>,
ops::ElementwiseSubKernel<paddle::platform::CPUPlace, int64_t>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
elementwise_sub_grad, elementwise_sub_grad,
ops::ElementwiseSubGradKernel<paddle::platform::CPUPlace, float>); ops::ElementwiseSubGradKernel<paddle::platform::CPUPlace, float>,
ops::ElementwiseSubGradKernel<paddle::platform::CPUPlace, double>,
ops::ElementwiseSubGradKernel<paddle::platform::CPUPlace, int>,
ops::ElementwiseSubGradKernel<paddle::platform::CPUPlace, int64_t>);
...@@ -19,7 +19,13 @@ namespace ops = paddle::operators; ...@@ -19,7 +19,13 @@ namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL( REGISTER_OP_GPU_KERNEL(
elementwise_sub, elementwise_sub,
ops::ElementwiseSubKernel<paddle::platform::GPUPlace, float>); ops::ElementwiseSubKernel<paddle::platform::GPUPlace, float>,
ops::ElementwiseSubKernel<paddle::platform::GPUPlace, double>,
ops::ElementwiseSubKernel<paddle::platform::GPUPlace, int>,
ops::ElementwiseSubKernel<paddle::platform::GPUPlace, int64_t>);
REGISTER_OP_GPU_KERNEL( REGISTER_OP_GPU_KERNEL(
elementwise_sub_grad, elementwise_sub_grad,
ops::ElementwiseSubGradKernel<paddle::platform::GPUPlace, float>); ops::ElementwiseSubGradKernel<paddle::platform::GPUPlace, float>,
ops::ElementwiseSubGradKernel<paddle::platform::GPUPlace, double>,
ops::ElementwiseSubGradKernel<paddle::platform::GPUPlace, int>,
ops::ElementwiseSubGradKernel<paddle::platform::GPUPlace, int64_t>);
...@@ -27,15 +27,15 @@ template <typename PoolProcess, typename T> ...@@ -27,15 +27,15 @@ template <typename PoolProcess, typename T>
class Pool2dFunctor<platform::CPUPlace, PoolProcess, T> { class Pool2dFunctor<platform::CPUPlace, PoolProcess, T> {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& output, const framework::Tensor& input, std::vector<int>& ksize,
std::vector<int>& ksize, std::vector<int>& strides, std::vector<int>& strides, std::vector<int>& paddings,
std::vector<int>& paddings, PoolProcess pool_process) { PoolProcess pool_process, framework::Tensor* output) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_height = input.dims()[2]; const int input_height = input.dims()[2];
const int input_width = input.dims()[3]; const int input_width = input.dims()[3];
const int output_channels = output.dims()[1]; const int output_channels = output->dims()[1];
const int output_height = output.dims()[2]; const int output_height = output->dims()[2];
const int output_width = output.dims()[3]; const int output_width = output->dims()[3];
const int ksize_height = ksize[0]; const int ksize_height = ksize[0];
const int ksize_width = ksize[1]; const int ksize_width = ksize[1];
const int stride_height = strides[0]; const int stride_height = strides[0];
...@@ -47,7 +47,7 @@ class Pool2dFunctor<platform::CPUPlace, PoolProcess, T> { ...@@ -47,7 +47,7 @@ class Pool2dFunctor<platform::CPUPlace, PoolProcess, T> {
const int output_stride = output_height * output_width; const int output_stride = output_height * output_width;
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
T* output_data = output.mutable_data<T>(context.GetPlace()); T* output_data = output->mutable_data<T>(context.GetPlace());
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) { for (int c = 0; c < output_channels; ++c) {
...@@ -87,11 +87,12 @@ template <typename PoolProcess, class T> ...@@ -87,11 +87,12 @@ template <typename PoolProcess, class T>
class Pool2dGradFunctor<platform::CPUPlace, PoolProcess, T> { class Pool2dGradFunctor<platform::CPUPlace, PoolProcess, T> {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& input_grad, const framework::Tensor& input,
const framework::Tensor& output, const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize, const framework::Tensor& output_grad, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings, std::vector<int>& strides, std::vector<int>& paddings,
PoolProcess pool_grad_process) { PoolProcess pool_grad_process,
framework::Tensor* input_grad) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_height = input.dims()[2]; const int input_height = input.dims()[2];
const int input_width = input.dims()[3]; const int input_width = input.dims()[3];
...@@ -110,7 +111,7 @@ class Pool2dGradFunctor<platform::CPUPlace, PoolProcess, T> { ...@@ -110,7 +111,7 @@ class Pool2dGradFunctor<platform::CPUPlace, PoolProcess, T> {
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* output_data = output.data<T>(); const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace()); T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) { for (int c = 0; c < output_channels; ++c) {
...@@ -154,10 +155,11 @@ template <class T> ...@@ -154,10 +155,11 @@ template <class T>
class MaxPool2dGradFunctor<platform::CPUPlace, T> { class MaxPool2dGradFunctor<platform::CPUPlace, T> {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& input_grad, const framework::Tensor& input,
const framework::Tensor& output, const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize, const framework::Tensor& output_grad, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings) { std::vector<int>& strides, std::vector<int>& paddings,
framework::Tensor* input_grad) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_height = input.dims()[2]; const int input_height = input.dims()[2];
const int input_width = input.dims()[3]; const int input_width = input.dims()[3];
...@@ -176,7 +178,7 @@ class MaxPool2dGradFunctor<platform::CPUPlace, T> { ...@@ -176,7 +178,7 @@ class MaxPool2dGradFunctor<platform::CPUPlace, T> {
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* output_data = output.data<T>(); const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace()); T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) { for (int c = 0; c < output_channels; ++c) {
...@@ -240,17 +242,17 @@ template <typename PoolProcess, class T> ...@@ -240,17 +242,17 @@ template <typename PoolProcess, class T>
class Pool3dFunctor<platform::CPUPlace, PoolProcess, T> { class Pool3dFunctor<platform::CPUPlace, PoolProcess, T> {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& output, const framework::Tensor& input, std::vector<int>& ksize,
std::vector<int>& ksize, std::vector<int>& strides, std::vector<int>& strides, std::vector<int>& paddings,
std::vector<int>& paddings, PoolProcess pool_process) { PoolProcess pool_process, framework::Tensor* output) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_depth = input.dims()[2]; const int input_depth = input.dims()[2];
const int input_height = input.dims()[3]; const int input_height = input.dims()[3];
const int input_width = input.dims()[4]; const int input_width = input.dims()[4];
const int output_channels = output.dims()[1]; const int output_channels = output->dims()[1];
const int output_depth = output.dims()[2]; const int output_depth = output->dims()[2];
const int output_height = output.dims()[3]; const int output_height = output->dims()[3];
const int output_width = output.dims()[4]; const int output_width = output->dims()[4];
const int ksize_depth = ksize[0]; const int ksize_depth = ksize[0];
const int ksize_height = ksize[1]; const int ksize_height = ksize[1];
const int ksize_width = ksize[2]; const int ksize_width = ksize[2];
...@@ -265,7 +267,7 @@ class Pool3dFunctor<platform::CPUPlace, PoolProcess, T> { ...@@ -265,7 +267,7 @@ class Pool3dFunctor<platform::CPUPlace, PoolProcess, T> {
const int output_stride = output_depth * output_height * output_width; const int output_stride = output_depth * output_height * output_width;
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
T* output_data = output.mutable_data<T>(context.GetPlace()); T* output_data = output->mutable_data<T>(context.GetPlace());
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) { for (int c = 0; c < output_channels; ++c) {
...@@ -315,11 +317,12 @@ template <typename PoolProcess, class T> ...@@ -315,11 +317,12 @@ template <typename PoolProcess, class T>
class Pool3dGradFunctor<platform::CPUPlace, PoolProcess, T> { class Pool3dGradFunctor<platform::CPUPlace, PoolProcess, T> {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& input_grad, const framework::Tensor& input,
const framework::Tensor& output, const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize, const framework::Tensor& output_grad, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings, std::vector<int>& strides, std::vector<int>& paddings,
PoolProcess pool_grad_process) { PoolProcess pool_grad_process,
framework::Tensor* input_grad) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_depth = input.dims()[2]; const int input_depth = input.dims()[2];
const int input_height = input.dims()[3]; const int input_height = input.dims()[3];
...@@ -343,7 +346,7 @@ class Pool3dGradFunctor<platform::CPUPlace, PoolProcess, T> { ...@@ -343,7 +346,7 @@ class Pool3dGradFunctor<platform::CPUPlace, PoolProcess, T> {
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* output_data = output.data<T>(); const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace()); T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) { for (int c = 0; c < output_channels; ++c) {
...@@ -398,10 +401,11 @@ template <class T> ...@@ -398,10 +401,11 @@ template <class T>
class MaxPool3dGradFunctor<platform::CPUPlace, T> { class MaxPool3dGradFunctor<platform::CPUPlace, T> {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& input_grad, const framework::Tensor& input,
const framework::Tensor& output, const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize, const framework::Tensor& output_grad, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings) { std::vector<int>& strides, std::vector<int>& paddings,
framework::Tensor* input_grad) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_depth = input.dims()[2]; const int input_depth = input.dims()[2];
const int input_height = input.dims()[3]; const int input_height = input.dims()[3];
...@@ -425,7 +429,7 @@ class MaxPool3dGradFunctor<platform::CPUPlace, T> { ...@@ -425,7 +429,7 @@ class MaxPool3dGradFunctor<platform::CPUPlace, T> {
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
const T* output_data = output.data<T>(); const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace()); T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) { for (int c = 0; c < output_channels; ++c) {
...@@ -498,15 +502,15 @@ template <typename T> ...@@ -498,15 +502,15 @@ template <typename T>
class MaxPool2dWithIndexFunctor<platform::CPUPlace, T> { class MaxPool2dWithIndexFunctor<platform::CPUPlace, T> {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& output, const framework::Tensor& input, std::vector<int>& ksize,
framework::Tensor& mask, std::vector<int>& ksize, std::vector<int>& strides, std::vector<int>& paddings,
std::vector<int>& strides, std::vector<int>& paddings) { framework::Tensor* output, framework::Tensor* mask) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_height = input.dims()[2]; const int input_height = input.dims()[2];
const int input_width = input.dims()[3]; const int input_width = input.dims()[3];
const int output_channels = output.dims()[1]; const int output_channels = output->dims()[1];
const int output_height = output.dims()[2]; const int output_height = output->dims()[2];
const int output_width = output.dims()[3]; const int output_width = output->dims()[3];
const int ksize_height = ksize[0]; const int ksize_height = ksize[0];
const int ksize_width = ksize[1]; const int ksize_width = ksize[1];
const int stride_height = strides[0]; const int stride_height = strides[0];
...@@ -517,8 +521,8 @@ class MaxPool2dWithIndexFunctor<platform::CPUPlace, T> { ...@@ -517,8 +521,8 @@ class MaxPool2dWithIndexFunctor<platform::CPUPlace, T> {
const int output_stride = output_height * output_width; const int output_stride = output_height * output_width;
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
T* output_data = output.mutable_data<T>(context.GetPlace()); T* output_data = output->mutable_data<T>(context.GetPlace());
T* mask_data = mask.mutable_data<T>(context.GetPlace()); T* mask_data = mask->mutable_data<T>(context.GetPlace());
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) { for (int c = 0; c < output_channels; ++c) {
...@@ -563,13 +567,13 @@ template <typename T> ...@@ -563,13 +567,13 @@ template <typename T>
class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, T> { class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, T> {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
framework::Tensor& input_grad,
const framework::Tensor& output_grad, const framework::Tensor& output_grad,
const framework::Tensor& mask, std::vector<int>& ksize, const framework::Tensor& mask, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings) { std::vector<int>& strides, std::vector<int>& paddings,
const int batch_size = input_grad.dims()[0]; framework::Tensor* input_grad) {
const int input_height = input_grad.dims()[2]; const int batch_size = input_grad->dims()[0];
const int input_width = input_grad.dims()[3]; const int input_height = input_grad->dims()[2];
const int input_width = input_grad->dims()[3];
const int output_channels = output_grad.dims()[1]; const int output_channels = output_grad.dims()[1];
const int output_height = output_grad.dims()[2]; const int output_height = output_grad.dims()[2];
const int output_width = output_grad.dims()[3]; const int output_width = output_grad.dims()[3];
...@@ -578,7 +582,7 @@ class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, T> { ...@@ -578,7 +582,7 @@ class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, T> {
const T* mask_data = mask.data<T>(); const T* mask_data = mask.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace()); T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
for (int n = 0; n < batch_size; ++n) { for (int n = 0; n < batch_size; ++n) {
for (int c = 0; c < output_channels; ++c) { for (int c = 0; c < output_channels; ++c) {
...@@ -612,17 +616,17 @@ template <typename T> ...@@ -612,17 +616,17 @@ template <typename T>
class MaxPool3dWithIndexFunctor<platform::CPUPlace, T> { class MaxPool3dWithIndexFunctor<platform::CPUPlace, T> {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& output, const framework::Tensor& input, std::vector<int>& ksize,
framework::Tensor& mask, std::vector<int>& ksize, std::vector<int>& strides, std::vector<int>& paddings,
std::vector<int>& strides, std::vector<int>& paddings) { framework::Tensor* output, framework::Tensor* mask) {
const int batch_size = input.dims()[0]; const int batch_size = input.dims()[0];
const int input_depth = input.dims()[2]; const int input_depth = input.dims()[2];
const int input_height = input.dims()[3]; const int input_height = input.dims()[3];
const int input_width = input.dims()[4]; const int input_width = input.dims()[4];
const int output_channels = output.dims()[1]; const int output_channels = output->dims()[1];
const int output_depth = output.dims()[2]; const int output_depth = output->dims()[2];
const int output_height = output.dims()[3]; const int output_height = output->dims()[3];
const int output_width = output.dims()[4]; const int output_width = output->dims()[4];
const int ksize_depth = ksize[0]; const int ksize_depth = ksize[0];
const int ksize_height = ksize[1]; const int ksize_height = ksize[1];
const int ksize_width = ksize[2]; const int ksize_width = ksize[2];
...@@ -636,8 +640,8 @@ class MaxPool3dWithIndexFunctor<platform::CPUPlace, T> { ...@@ -636,8 +640,8 @@ class MaxPool3dWithIndexFunctor<platform::CPUPlace, T> {
const int output_stride = output_depth * output_height * output_width; const int output_stride = output_depth * output_height * output_width;
const T* input_data = input.data<T>(); const T* input_data = input.data<T>();
T* output_data = output.mutable_data<T>(context.GetPlace()); T* output_data = output->mutable_data<T>(context.GetPlace());
T* mask_data = mask.mutable_data<T>(context.GetPlace()); T* mask_data = mask->mutable_data<T>(context.GetPlace());
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) { for (int c = 0; c < output_channels; ++c) {
...@@ -691,14 +695,14 @@ template <typename T> ...@@ -691,14 +695,14 @@ template <typename T>
class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, T> { class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, T> {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
framework::Tensor& input_grad,
const framework::Tensor& output_grad, const framework::Tensor& output_grad,
const framework::Tensor& mask, std::vector<int>& ksize, const framework::Tensor& mask, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings) { std::vector<int>& strides, std::vector<int>& paddings,
const int batch_size = input_grad.dims()[0]; framework::Tensor* input_grad) {
const int input_depth = input_grad.dims()[2]; const int batch_size = input_grad->dims()[0];
const int input_height = input_grad.dims()[3]; const int input_depth = input_grad->dims()[2];
const int input_width = input_grad.dims()[4]; const int input_height = input_grad->dims()[3];
const int input_width = input_grad->dims()[4];
const int output_channels = output_grad.dims()[1]; const int output_channels = output_grad.dims()[1];
const int output_depth = output_grad.dims()[2]; const int output_depth = output_grad.dims()[2];
const int output_height = output_grad.dims()[3]; const int output_height = output_grad.dims()[3];
...@@ -708,7 +712,7 @@ class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, T> { ...@@ -708,7 +712,7 @@ class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, T> {
const T* mask_data = mask.data<T>(); const T* mask_data = mask.data<T>();
const T* output_grad_data = output_grad.data<T>(); const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace()); T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
for (int n = 0; n < batch_size; ++n) { for (int n = 0; n < batch_size; ++n) {
for (int c = 0; c < output_channels; ++c) { for (int c = 0; c < output_channels; ++c) {
......
此差异已折叠。
...@@ -88,60 +88,62 @@ template <typename Place, typename PoolProcess, typename T> ...@@ -88,60 +88,62 @@ template <typename Place, typename PoolProcess, typename T>
class Pool2dFunctor { class Pool2dFunctor {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& output, const framework::Tensor& input, std::vector<int>& ksize,
std::vector<int>& ksize, std::vector<int>& strides, std::vector<int>& strides, std::vector<int>& paddings,
std::vector<int>& paddings, PoolProcess pool_compute); PoolProcess pool_compute, framework::Tensor* output);
}; };
template <typename Place, typename PoolProcess, typename T> template <typename Place, typename PoolProcess, typename T>
class Pool2dGradFunctor { class Pool2dGradFunctor {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& input_grad, const framework::Tensor& input,
const framework::Tensor& output, const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize, const framework::Tensor& output_grad, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings, std::vector<int>& strides, std::vector<int>& paddings,
PoolProcess pool_compute); PoolProcess pool_compute, framework::Tensor* input_grad);
}; };
template <typename Place, class T> template <typename Place, class T>
class MaxPool2dGradFunctor { class MaxPool2dGradFunctor {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& input_grad, const framework::Tensor& input,
const framework::Tensor& output, const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize, const framework::Tensor& output_grad, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings); std::vector<int>& strides, std::vector<int>& paddings,
framework::Tensor* input_grad);
}; };
template <typename Place, typename PoolProcess, typename T> template <typename Place, typename PoolProcess, typename T>
class Pool3dFunctor { class Pool3dFunctor {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& output, const framework::Tensor& input, std::vector<int>& ksize,
std::vector<int>& ksize, std::vector<int>& strides, std::vector<int>& strides, std::vector<int>& paddings,
std::vector<int>& paddings, PoolProcess pool_compute); PoolProcess pool_compute, framework::Tensor* output);
}; };
template <typename Place, typename PoolProcess, typename T> template <typename Place, typename PoolProcess, typename T>
class Pool3dGradFunctor { class Pool3dGradFunctor {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& input_grad, const framework::Tensor& input,
const framework::Tensor& output, const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize, const framework::Tensor& output_grad, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings, std::vector<int>& strides, std::vector<int>& paddings,
PoolProcess pool_compute); PoolProcess pool_compute, framework::Tensor* input_grad);
}; };
template <typename Place, class T> template <typename Place, class T>
class MaxPool3dGradFunctor { class MaxPool3dGradFunctor {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& input_grad, const framework::Tensor& input,
const framework::Tensor& output, const framework::Tensor& output,
const framework::Tensor& output_grad, std::vector<int>& ksize, const framework::Tensor& output_grad, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings); std::vector<int>& strides, std::vector<int>& paddings,
framework::Tensor* input_grad);
}; };
/* /*
...@@ -155,38 +157,38 @@ template <typename Place, typename T> ...@@ -155,38 +157,38 @@ template <typename Place, typename T>
class MaxPool2dWithIndexFunctor { class MaxPool2dWithIndexFunctor {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& output, const framework::Tensor& input, std::vector<int>& ksize,
framework::Tensor& mask, std::vector<int>& ksize, std::vector<int>& strides, std::vector<int>& paddings,
std::vector<int>& strides, std::vector<int>& paddings); framework::Tensor* output, framework::Tensor* mask);
}; };
template <typename Place, typename T> template <typename Place, typename T>
class MaxPool2dWithIndexGradFunctor { class MaxPool2dWithIndexGradFunctor {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
framework::Tensor& input_grad,
const framework::Tensor& output_grad, const framework::Tensor& output_grad,
const framework::Tensor& mask, std::vector<int>& ksize, const framework::Tensor& mask, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings); std::vector<int>& strides, std::vector<int>& paddings,
framework::Tensor* input_grad);
}; };
template <typename Place, typename T> template <typename Place, typename T>
class MaxPool3dWithIndexFunctor { class MaxPool3dWithIndexFunctor {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor& output, const framework::Tensor& input, std::vector<int>& ksize,
framework::Tensor& mask, std::vector<int>& ksize, std::vector<int>& strides, std::vector<int>& paddings,
std::vector<int>& strides, std::vector<int>& paddings); framework::Tensor* output, framework::Tensor* mask);
}; };
template <typename Place, typename T> template <typename Place, typename T>
class MaxPool3dWithIndexGradFunctor { class MaxPool3dWithIndexGradFunctor {
public: public:
void operator()(const platform::DeviceContext& context, void operator()(const platform::DeviceContext& context,
framework::Tensor& input_grad,
const framework::Tensor& output_grad, const framework::Tensor& output_grad,
const framework::Tensor& mask, std::vector<int>& ksize, const framework::Tensor& mask, std::vector<int>& ksize,
std::vector<int>& strides, std::vector<int>& paddings); std::vector<int>& strides, std::vector<int>& paddings,
framework::Tensor* input_grad);
}; };
} // namespace math } // namespace math
......
...@@ -75,16 +75,16 @@ class PoolKernel : public framework::OpKernel<T> { ...@@ -75,16 +75,16 @@ class PoolKernel : public framework::OpKernel<T> {
Place, paddle::operators::math::MaxPool<T>, T> Place, paddle::operators::math::MaxPool<T>, T>
pool2d_forward; pool2d_forward;
paddle::operators::math::MaxPool<T> pool_process; paddle::operators::math::MaxPool<T> pool_process;
pool2d_forward(context.device_context(), *in_x, *out, ksize, strides, pool2d_forward(context.device_context(), *in_x, ksize, strides,
paddings, pool_process); paddings, pool_process, out);
} else if (pooling_type == "avg") { } else if (pooling_type == "avg") {
paddle::operators::math::Pool2dFunctor< paddle::operators::math::Pool2dFunctor<
Place, paddle::operators::math::AvgPool<T>, T> Place, paddle::operators::math::AvgPool<T>, T>
pool2d_forward; pool2d_forward;
paddle::operators::math::AvgPool<T> pool_process; paddle::operators::math::AvgPool<T> pool_process;
pool2d_forward(context.device_context(), *in_x, *out, ksize, strides, pool2d_forward(context.device_context(), *in_x, ksize, strides,
paddings, pool_process); paddings, pool_process, out);
} }
} break; } break;
case 3: { case 3: {
...@@ -93,15 +93,15 @@ class PoolKernel : public framework::OpKernel<T> { ...@@ -93,15 +93,15 @@ class PoolKernel : public framework::OpKernel<T> {
Place, paddle::operators::math::MaxPool<T>, T> Place, paddle::operators::math::MaxPool<T>, T>
pool3d_forward; pool3d_forward;
paddle::operators::math::MaxPool<T> pool_process; paddle::operators::math::MaxPool<T> pool_process;
pool3d_forward(context.device_context(), *in_x, *out, ksize, strides, pool3d_forward(context.device_context(), *in_x, ksize, strides,
paddings, pool_process); paddings, pool_process, out);
} else if (pooling_type == "avg") { } else if (pooling_type == "avg") {
paddle::operators::math::Pool3dFunctor< paddle::operators::math::Pool3dFunctor<
Place, paddle::operators::math::AvgPool<T>, T> Place, paddle::operators::math::AvgPool<T>, T>
pool3d_forward; pool3d_forward;
paddle::operators::math::AvgPool<T> pool_process; paddle::operators::math::AvgPool<T> pool_process;
pool3d_forward(context.device_context(), *in_x, *out, ksize, strides, pool3d_forward(context.device_context(), *in_x, ksize, strides,
paddings, pool_process); paddings, pool_process, out);
} }
} break; } break;
default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
...@@ -142,30 +142,30 @@ class PoolGradKernel : public framework::OpKernel<T> { ...@@ -142,30 +142,30 @@ class PoolGradKernel : public framework::OpKernel<T> {
if (pooling_type == "max") { if (pooling_type == "max") {
paddle::operators::math::MaxPool2dGradFunctor<Place, T> paddle::operators::math::MaxPool2dGradFunctor<Place, T>
pool2d_backward; pool2d_backward;
pool2d_backward(context.device_context(), *in_x, *in_x_grad, *out, pool2d_backward(context.device_context(), *in_x, *out, *out_grad,
*out_grad, ksize, strides, paddings); ksize, strides, paddings, in_x_grad);
} else if (pooling_type == "avg") { } else if (pooling_type == "avg") {
paddle::operators::math::Pool2dGradFunctor< paddle::operators::math::Pool2dGradFunctor<
Place, paddle::operators::math::AvgPoolGrad<T>, T> Place, paddle::operators::math::AvgPoolGrad<T>, T>
pool2d_backward; pool2d_backward;
paddle::operators::math::AvgPoolGrad<T> pool_process; paddle::operators::math::AvgPoolGrad<T> pool_process;
pool2d_backward(context.device_context(), *in_x, *in_x_grad, *out, pool2d_backward(context.device_context(), *in_x, *out, *out_grad,
*out_grad, ksize, strides, paddings, pool_process); ksize, strides, paddings, pool_process, in_x_grad);
} }
} break; } break;
case 3: { case 3: {
if (pooling_type == "max") { if (pooling_type == "max") {
paddle::operators::math::MaxPool3dGradFunctor<Place, T> paddle::operators::math::MaxPool3dGradFunctor<Place, T>
pool3d_backward; pool3d_backward;
pool3d_backward(context.device_context(), *in_x, *in_x_grad, *out, pool3d_backward(context.device_context(), *in_x, *out, *out_grad,
*out_grad, ksize, strides, paddings); ksize, strides, paddings, in_x_grad);
} else if (pooling_type == "avg") { } else if (pooling_type == "avg") {
paddle::operators::math::Pool3dGradFunctor< paddle::operators::math::Pool3dGradFunctor<
Place, paddle::operators::math::AvgPoolGrad<T>, T> Place, paddle::operators::math::AvgPoolGrad<T>, T>
pool3d_backward; pool3d_backward;
paddle::operators::math::AvgPoolGrad<T> pool_process; paddle::operators::math::AvgPoolGrad<T> pool_process;
pool3d_backward(context.device_context(), *in_x, *in_x_grad, *out, pool3d_backward(context.device_context(), *in_x, *out, *out_grad,
*out_grad, ksize, strides, paddings, pool_process); ksize, strides, paddings, pool_process, in_x_grad);
} }
} break; } break;
default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
......
...@@ -46,14 +46,14 @@ class MaxPoolWithIndexKernel : public framework::OpKernel<T> { ...@@ -46,14 +46,14 @@ class MaxPoolWithIndexKernel : public framework::OpKernel<T> {
case 2: { case 2: {
paddle::operators::math::MaxPool2dWithIndexFunctor<Place, T> paddle::operators::math::MaxPool2dWithIndexFunctor<Place, T>
pool2d_forward; pool2d_forward;
pool2d_forward(context.device_context(), *in_x, *out, *mask, ksize, pool2d_forward(context.device_context(), *in_x, ksize, strides,
strides, paddings); paddings, out, mask);
} break; } break;
case 3: { case 3: {
paddle::operators::math::MaxPool3dWithIndexFunctor<Place, T> paddle::operators::math::MaxPool3dWithIndexFunctor<Place, T>
pool3d_forward; pool3d_forward;
pool3d_forward(context.device_context(), *in_x, *out, *mask, ksize, pool3d_forward(context.device_context(), *in_x, ksize, strides,
strides, paddings); paddings, out, mask);
} break; } break;
default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
} }
...@@ -89,14 +89,14 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel<T> { ...@@ -89,14 +89,14 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel<T> {
case 2: { case 2: {
paddle::operators::math::MaxPool2dWithIndexGradFunctor<Place, T> paddle::operators::math::MaxPool2dWithIndexGradFunctor<Place, T>
pool2d_backward; pool2d_backward;
pool2d_backward(context.device_context(), *in_x_grad, *out_grad, pool2d_backward(context.device_context(), *out_grad, *mask, ksize,
*mask, ksize, strides, paddings); strides, paddings, in_x_grad);
} break; } break;
case 3: { case 3: {
paddle::operators::math::MaxPool3dWithIndexGradFunctor<Place, T> paddle::operators::math::MaxPool3dWithIndexGradFunctor<Place, T>
pool3d_backward; pool3d_backward;
pool3d_backward(context.device_context(), *in_x_grad, *out_grad, pool3d_backward(context.device_context(), *out_grad, *mask, ksize,
*mask, ksize, strides, paddings); strides, paddings, in_x_grad);
} break; } break;
default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
} }
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#pragma once #pragma once
#include "glog/logging.h"
#include "paddle/framework/eigen.h" #include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
...@@ -26,6 +27,10 @@ template <typename T, size_t D, int MajorType = Eigen::RowMajor, ...@@ -26,6 +27,10 @@ template <typename T, size_t D, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex> typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>; using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
struct SumFunctor { struct SumFunctor {
template <typename Place, typename X, typename Y, typename Dim> template <typename Place, typename X, typename Y, typename Dim>
void operator()(const Place& place, X& x, Y& y, const Dim& dim) { void operator()(const Place& place, X& x, Y& y, const Dim& dim) {
...@@ -133,10 +138,17 @@ class ReduceKernel : public framework::OpKernel<T> { ...@@ -133,10 +138,17 @@ class ReduceKernel : public framework::OpKernel<T> {
dims_vector.erase(dims_vector.begin() + dim); dims_vector.erase(dims_vector.begin() + dim);
dims = framework::make_ddim(dims_vector); dims = framework::make_ddim(dims_vector);
} }
auto out = EigenTensor < T, D == 1 ? 1 : (D - 1) > ::From(*output, dims);
auto& place = context.GetEigenDevice<Place>(); auto& place = context.GetEigenDevice<Place>();
Functor functor; Functor functor;
functor(place, x, out, reduce_dim);
if (D == 1) {
auto out = EigenScalar<T>::From(*output);
functor(place, x, out, reduce_dim);
} else {
auto out = EigenTensor<T, (D - 1)>::From(*output, dims);
functor(place, x, out, reduce_dim);
}
} }
}; };
...@@ -186,13 +198,13 @@ class ReduceGradKernel : public framework::OpKernel<T> { ...@@ -186,13 +198,13 @@ class ReduceGradKernel : public framework::OpKernel<T> {
auto x_reduce = EigenTensor<T, D>::From(*input1, dims); auto x_reduce = EigenTensor<T, D>::From(*input1, dims);
auto x_reduce_grad = EigenTensor<T, D>::From(*input2, dims); auto x_reduce_grad = EigenTensor<T, D>::From(*input2, dims);
Eigen::array<int, D> braodcast_dim; Eigen::array<int, D> broadcast_dim;
for (size_t i = 0; i < D; ++i) braodcast_dim[i] = 1; for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1;
braodcast_dim[dim] = input0->dims()[dim]; broadcast_dim[dim] = input0->dims()[dim];
auto& place = context.GetEigenDevice<Place>(); auto& place = context.GetEigenDevice<Place>();
Functor functor; Functor functor;
functor(place, x, x_reduce, x_grad, x_reduce_grad, braodcast_dim, functor(place, x, x_reduce, x_grad, x_reduce_grad, broadcast_dim,
braodcast_dim[dim]); broadcast_dim[dim]);
} }
}; };
......
...@@ -200,7 +200,10 @@ void Parameter::setMat(ParameterType pType, int matType) { ...@@ -200,7 +200,10 @@ void Parameter::setMat(ParameterType pType, int matType) {
false, false,
useGpu_); useGpu_);
} }
} else if (matType == MAT_NORMAL_SHARED) { }
#ifndef PADDLE_MOBILE_INFERENCE
// NOLINTNEXTLINE
else if (matType == MAT_NORMAL_SHARED) {
CHECK_EQ(height * width, bufs_[pType]->getSize()); CHECK_EQ(height * width, bufs_[pType]->getSize());
size_t blockNum = 0; size_t blockNum = 0;
CHECK(isGradShared(&blockNum)); CHECK(isGradShared(&blockNum));
...@@ -259,7 +262,10 @@ void Parameter::setMat(ParameterType pType, int matType) { ...@@ -259,7 +262,10 @@ void Parameter::setMat(ParameterType pType, int matType) {
} else if (matType == MAT_SPARSE_ROW_AUTO_GROW) { } else if (matType == MAT_SPARSE_ROW_AUTO_GROW) {
CHECK(isGradSparseUpdate()); CHECK(isGradSparseUpdate());
mats_[pType] = std::make_shared<SparseAutoGrowRowCpuMatrix>(height, width); mats_[pType] = std::make_shared<SparseAutoGrowRowCpuMatrix>(height, width);
} else { }
#endif
// NOLINTNEXTLINE
else {
LOG(FATAL) << "Unsupported mat type" << matType; LOG(FATAL) << "Unsupported mat type" << matType;
} }
} }
......
#!/bin/bash
set -e
echo "Post install paddle debian package."
echo "Install some python package used for paddle. You can run "
echo " pip install /usr/opt/paddle/share/wheels/*.whl to install them."
find /usr/ -name '*paddle*.whl' | xargs pip install
...@@ -2,178 +2,198 @@ ...@@ -2,178 +2,198 @@
## Goals ## Goals
We want the building procedure generates Docker images so that we can run PaddlePaddle applications on Kubernetes clusters. We want to make the building procedures:
We want to build .deb packages so that enterprise users can run PaddlePaddle applications without Docker. 1. Static, can reproduce easily.
1. Generate python `whl` packages that can be widely use cross many distributions.
1. Build different binaries per release to satisfy different environments:
- Binaries for different CUDA and CUDNN versions, like CUDA 7.5, 8.0, 9.0
- Binaries containing only capi
- Binaries for python with wide unicode support or not.
1. Build docker images with PaddlePaddle pre-installed, so that we can run
PaddlePaddle applications directly in docker or on Kubernetes clusters.
We want to minimize the size of generated Docker images and .deb packages so to reduce the download time. To achieve this, we created a repo: https://github.com/PaddlePaddle/buildtools
which gives several docker images that are `manylinux1` sufficient. Then we
can build PaddlePaddle using these images to generate corresponding `whl`
binaries.
We want to encapsulate building tools and dependencies in a *development* Docker image so to ease the tools installation for developers. ## Run The Build
Developers use various editors (emacs, vim, Eclipse, Jupyter Notebook), so the development Docker image contains only building tools, not editing tools, and developers are supposed to git clone source code into their development computers and map the code into the development container. ### Build Evironments
We want the procedure and tools also work with testing, continuous integration, and releasing. The pre-built build environment images are:
| Image | Tag |
| ----- | --- |
| paddlepaddle/paddle_manylinux_devel | cuda7.5_cudnn5 |
| paddlepaddle/paddle_manylinux_devel | cuda8.0_cudnn5 |
| paddlepaddle/paddle_manylinux_devel | cuda7.5_cudnn7 |
| paddlepaddle/paddle_manylinux_devel | cuda9.0_cudnn7 |
## Docker Images ### Start Build
So we need two Docker images for each version of PaddlePaddle:
1. `paddle:<version>-dev`
This a development image contains only the development tools and standardizes the building procedure. Users include:
- developers -- no longer need to install development tools on the host, and can build their current work on the host (development computer). Choose one docker image that suit your environment and run the following
- release engineers -- use this to build the official release from certain branch/tag on Github.com. command to start a build:
- document writers / Website developers -- Our documents are in the source repo in the form of .md/.rst files and comments in source code. We need tools to extract the information, typeset, and generate Web pages.
Of course, developers can install building tools on their development computers. But different versions of PaddlePaddle might require different set or version of building tools. Also, it makes collaborative debugging easier if all developers use a unified development environment. ```bash
git clone https://github.com/PaddlePaddle/Paddle.git
The development image should include the following tools: cd Paddle
docker run --rm -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=OFF" -e "RUN_TEST=OFF" -e "PYTHON_ABI=cp27-cp27mu" paddlepaddle/paddle_manylinux_devel /paddle/paddle/scripts/docker/build.sh
- gcc/clang ```
- nvcc
- Python
- sphinx
- woboq
- sshd
Many developers work on a remote computer with GPU; they could ssh into the computer and `docker exec` into the development container. However, running `sshd` in the container allows developers to ssh into the container directly. After the build finishes, you can get output `whl` package under
`build/python/dist`.
1. `paddle:<version>` This command mounts the source directory on the host into `/paddle` in the container, then run the build script `/paddle/paddle/scripts/docker/build.sh`
in the container. When it writes to `/paddle/build` in the container, it writes to `$PWD/build` on the host indeed.
This is the production image, generated using the development image. This image might have multiple variants: ### Build Options
- GPU/AVX `paddle:<version>-gpu` Users can specify the following Docker build arguments with either "ON" or "OFF" value:
- GPU/no-AVX `paddle:<version>-gpu-noavx`
- no-GPU/AVX `paddle:<version>`
- no-GPU/no-AVX `paddle:<version>-noavx`
We allow users to choose between GPU and no-GPU because the GPU version image is much larger than then the no-GPU version. | Option | Default | Description |
| ------ | -------- | ----------- |
| `WITH_GPU` | OFF | Generates NVIDIA CUDA GPU code and relies on CUDA libraries. |
| `WITH_AVX` | OFF | Set to "ON" to enable AVX support. |
| `WITH_TESTING` | ON | Build unit tests binaries. |
| `WITH_MKLDNN` | ON | Build with [Intel® MKL DNN](https://github.com/01org/mkl-dnn) support. |
| `WITH_MKLML` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) support. |
| `WITH_GOLANG` | ON | Build fault-tolerant parameter server written in go. |
| `WITH_SWIG_PY` | ON | Build with SWIG python API support. |
| `WITH_C_API` | OFF | Build capi libraries for inference. |
| `WITH_PYTHON` | ON | Build with python support. Turn this off if build is only for capi. |
| `WITH_STYLE_CHECK` | ON | Check the code style when building. |
| `PYTHON_ABI` | "" | Build for different python ABI support, can be cp27-cp27m or cp27-cp27mu |
| `RUN_TEST` | OFF | Run unit test immediently after the build. |
| `WITH_DOC` | OFF | Build docs after build binaries. |
| `WOBOQ` | OFF | Generate WOBOQ code viewer under `build/woboq_out` |
We allow users the choice between AVX and no-AVX, because some cloud providers don't provide AVX-enabled VMs.
## Docker Images
## Development Environment You can get the latest PaddlePaddle docker images by
`docker pull paddlepaddle/paddle:<version>` or build one by yourself.
Here we describe how to use above two images. We start from considering our daily development environment. ### Official Docker Releases
Developers work on a computer, which is usually a laptop or desktop: Official docker images at
[here](https://hub.docker.com/r/paddlepaddle/paddle/tags/),
you can choose either latest or images with a release tag like `0.10.0`,
Currently available tags are:
<img src="doc/paddle-development-environment.png" width=500 /> | Tag | Description |
| ------ | --------------------- |
| latest | latest CPU only image |
| latest-gpu | latest binary with GPU support |
| 0.10.0 | release 0.10.0 CPU only binary image |
| 0.10.0-gpu | release 0.10.0 with GPU support |
or, they might rely on a more sophisticated box (like with GPUs): ### Build Your Own Image
<img src="doc/paddle-development-environment-gpu.png" width=500 /> Build PaddlePaddle docker images are quite simple since PaddlePaddle can
be installed by just running `pip install`. A sample `Dockerfile` is:
A principle here is that source code lies on the development computer (host) so that editors like Eclipse can parse the source code to support auto-completion. ```dockerfile
FROM nvidia/cuda:7.5-cudnn5-runtime-centos6
RUN yum install -y centos-release-SCL
RUN yum install -y python27
# This whl package is generated by previous build steps.
ADD python/dist/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl /
RUN pip install /paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl && rm -f /*.whl
```
Then build the image by running `docker build -t [REPO]/paddle:[TAG] .` under
the directory containing your own `Dockerfile`.
## Usages - NOTE: note that you can choose different base images for your environment, you can find all the versions [here](https://hub.docker.com/r/nvidia/cuda/).
### Build the Development Docker Image ### Use Docker Images
The following commands check out the source code to the host and build the development image `paddle:dev`: Suppose that you have written an application program `train.py` using
PaddlePaddle, we can test and run it using docker:
```bash ```bash
git clone https://github.com/PaddlePaddle/Paddle paddle docker run --rm -it -v $PWD:/work paddlepaddle/paddle /work/a.py
cd paddle
docker build -t paddle:dev .
``` ```
The `docker build` command assumes that `Dockerfile` is in the root source tree. Note that in this design, this `Dockerfile` is this only one in our repo. But this works only if all dependencies of `train.py` are in the production image. If this is not the case, we need to build a new Docker image from the production image and with more dependencies installs.
Users can specify a Ubuntu mirror server for faster downloading:
```bash
docker build -t paddle:dev --build-arg UBUNTU_MIRROR=mirror://mirrors.ubuntu.com/mirrors.txt .
```
### Build PaddlePaddle from Source Code ### Run PaddlePaddle Book In Docker
Given the development image `paddle:dev`, the following command builds PaddlePaddle from the source tree on the development computer (host): Our [book repo](https://github.com/paddlepaddle/book) also provide a docker
image to start a jupiter notebook inside docker so that you can run this book
using docker:
```bash ```bash
docker run --rm -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=OFF" -e "RUN_TEST=OFF" paddle:dev docker run -d -p 8888:8888 paddlepaddle/book
``` ```
This command mounts the source directory on the host into `/paddle` in the container, so the default entry point of `paddle:dev`, `build.sh`, could build the source code with possible local changes. When it writes to `/paddle/build` in the container, it writes to `$PWD/build` on the host indeed. Please refer to https://github.com/paddlepaddle/book if you want to build this
docker image by your self.
`build.sh` builds the following:
- PaddlePaddle binaries,
- `$PWD/build/paddle-<version>.deb` for production installation, and
- `$PWD/build/Dockerfile`, which builds the production Docker image.
Users can specify the following Docker build arguments with either "ON" or "OFF" value: ### Run Distributed Applications
- `WITH_GPU`: ***Required***. Generates NVIDIA CUDA GPU code and relies on CUDA libraries.
- `WITH_AVX`: ***Required***. Set to "OFF" prevents from generating AVX instructions. If you don't know what is AVX, you might want to set "ON".
- `WITH_TEST`: ***Optional, default OFF***. Build unit tests binaries. Once you've built the unit tests, you can run these test manually by the following command:
```bash
docker run --rm -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" paddle:dev sh -c "cd /paddle/build; make coverall"
```
- `RUN_TEST`: ***Optional, default OFF***. Run unit tests after building. You can't run unit tests without building it.
### Build the Production Docker Image In our [API design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/api.md#distributed-training), we proposed an API that starts a distributed training job on a cluster. This API need to build a PaddlePaddle application into a Docker image as above and calls kubectl to run it on the cluster. This API might need to generate a Dockerfile look like above and call `docker build`.
The following command builds the production image: Of course, we can manually build an application image and launch the job using the kubectl tool:
```bash ```bash
docker build -t paddle -f build/Dockerfile ./build docker build -f some/Dockerfile -t myapp .
docker tag myapp me/myapp
docker push
kubectl ...
``` ```
This production image is minimal -- it includes binary `paddle`, the shared library `libpaddle.so`, and Python runtime. ## Docker Images for Developers
### Run PaddlePaddle Applications We have a special docker image for developers:
`paddlepaddle/paddle:<version>-dev`. This image is also generated from
https://github.com/PaddlePaddle/buildtools
Again the development happens on the host. Suppose that we have a simple application program in `a.py`, we can test and run it using the production image: This a development image contains only the
development tools and standardizes the building procedure. Users include:
```bash - developers -- no longer need to install development tools on the host, and can build their current work on the host (development computer).
docker run --rm -it -v $PWD:/work paddle /work/a.py - release engineers -- use this to build the official release from certain branch/tag on Github.com.
``` - document writers / Website developers -- Our documents are in the source repo in the form of .md/.rst files and comments in source code. We need tools to extract the information, typeset, and generate Web pages.
But this works only if all dependencies of `a.py` are in the production image. If this is not the case, we need to build a new Docker image from the production image and with more dependencies installs. Of course, developers can install building tools on their development computers. But different versions of PaddlePaddle might require different set or version of building tools. Also, it makes collaborative debugging easier if all developers use a unified development environment.
### Build and Run PaddlePaddle Applications The development image contains the following tools:
We need a Dockerfile in https://github.com/paddlepaddle/book that builds Docker image `paddlepaddle/book:<version>`, basing on the PaddlePaddle production image: - gcc/clang
- nvcc
- Python
- sphinx
- woboq
- sshd
``` Many developers work on a remote computer with GPU; they could ssh into the computer and `docker exec` into the development container. However, running `sshd` in the container allows developers to ssh into the container directly.
FROM paddlepaddle/paddle:<version>
RUN pip install -U matplotlib jupyter ...
COPY . /book
EXPOSE 8080
CMD ["jupyter"]
```
The book image is an example of PaddlePaddle application image. We can build it
```bash ### Development Workflow
git clone https://github.com/paddlepaddle/book
cd book
docker build -t book .
```
### Build and Run Distributed Applications Here we describe how the workflow goes on. We start from considering our daily development environment.
In our [API design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/api.md#distributed-training), we proposed an API that starts a distributed training job on a cluster. This API need to build a PaddlePaddle application into a Docker image as above and calls kubectl to run it on the cluster. This API might need to generate a Dockerfile look like above and call `docker build`. Developers work on a computer, which is usually a laptop or desktop:
Of course, we can manually build an application image and launch the job using the kubectl tool: <img src="doc/paddle-development-environment.png" width=500 />
```bash or, they might rely on a more sophisticated box (like with GPUs):
docker build -f some/Dockerfile -t myapp .
docker tag myapp me/myapp <img src="doc/paddle-development-environment-gpu.png" width=500 />
docker push
kubectl ... A principle here is that source code lies on the development computer (host) so that editors like Eclipse can parse the source code to support auto-completion.
```
### Reading source code with woboq codebrowser ### Reading source code with woboq codebrowser
For developers who are interested in the C++ source code, please use -e "WOBOQ=ON" to enable the building of C++ source code into HTML pages using [Woboq codebrowser](https://github.com/woboq/woboq_codebrowser). For developers who are interested in the C++ source code, please use -e "WOBOQ=ON" to enable the building of C++ source code into HTML pages using [Woboq codebrowser](https://github.com/woboq/woboq_codebrowser).
- The following command builds PaddlePaddle, generates HTML pages from C++ source code, and writes HTML pages into `$HOME/woboq_out` on the host: - The following command builds PaddlePaddle, generates HTML pages from C++ source code, and writes HTML pages into `$HOME/woboq_out` on the host:
```bash ```bash
docker run -v $PWD:/paddle -v $HOME/woboq_out:/woboq_out -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" -e "WOBOQ=ON" paddle:dev docker run -v $PWD:/paddle -v $HOME/woboq_out:/woboq_out -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" -e "WOBOQ=ON" paddlepaddle/paddle:latest-dev
``` ```
- You can open the generated HTML files in your Web browser. Or, if you want to run a Nginx container to serve them for a wider audience, you can run: - You can open the generated HTML files in your Web browser. Or, if you want to run a Nginx container to serve them for a wider audience, you can run:
......
#!/bin/bash #!/bin/bash
set -xe
function cmake_gen() { function cmake_gen() {
# Set BASE_IMAGE according to env variables
if [[ ${WITH_GPU} == "ON" ]]; then
BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04"
else
BASE_IMAGE="ubuntu:16.04"
fi
DOCKERFILE_GPU_ENV=""
DOCKERFILE_CUDNN_DSO=""
if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}"
DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.5 /usr/lib/x86_64-linux-gnu/libcudnn.so"
fi
mkdir -p /paddle/build mkdir -p /paddle/build
cd /paddle/build cd /paddle/build
...@@ -26,10 +9,29 @@ function cmake_gen() { ...@@ -26,10 +9,29 @@ function cmake_gen() {
# delete previous built whl packages # delete previous built whl packages
rm -rf /paddle/paddle/dist 2>/dev/null || true rm -rf /paddle/paddle/dist 2>/dev/null || true
# Support build for all python versions, currently
# including cp27-cp27m and cp27-cp27mu.
PYTHON_FLAGS=""
if [ "$1" != "" ]; then
echo "using python abi: $1"
if [ "$1" == "cp27-cp27m" ]; then
export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:}
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python
-DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so"
elif [ "$1" == "cp27-cp27mu" ]; then
export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:}
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python
-DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7
-DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so"
fi
fi
cat <<EOF cat <<EOF
======================================== ========================================
Configuring cmake in /paddle/build ... Configuring cmake in /paddle/build ...
-DCMAKE_BUILD_TYPE=Release -DCMAKE_BUILD_TYPE=Release
${PYTHON_FLAGS}
-DWITH_DOC=OFF -DWITH_DOC=OFF
-DWITH_GPU=${WITH_GPU:-OFF} -DWITH_GPU=${WITH_GPU:-OFF}
-DWITH_MKLDNN=${WITH_MKLDNN:-ON} -DWITH_MKLDNN=${WITH_MKLDNN:-ON}
...@@ -46,12 +48,12 @@ function cmake_gen() { ...@@ -46,12 +48,12 @@ function cmake_gen() {
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
======================================== ========================================
EOF EOF
# Disable UNITTEST_USE_VIRTUALENV in docker because # Disable UNITTEST_USE_VIRTUALENV in docker because
# docker environment is fully controlled by this script. # docker environment is fully controlled by this script.
# See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option. # See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option.
cmake .. \ cmake .. \
-DCMAKE_BUILD_TYPE=Release \ -DCMAKE_BUILD_TYPE=Release \
${PYTHON_FLAGS} \
-DWITH_DOC=OFF \ -DWITH_DOC=OFF \
-DWITH_GPU=${WITH_GPU:-OFF} \ -DWITH_GPU=${WITH_GPU:-OFF} \
-DWITH_MKLDNN=${WITH_MKLDNN:-ON} \ -DWITH_MKLDNN=${WITH_MKLDNN:-ON} \
...@@ -134,6 +136,19 @@ EOF ...@@ -134,6 +136,19 @@ EOF
function gen_dockerfile() { function gen_dockerfile() {
# Set BASE_IMAGE according to env variables
if [[ ${WITH_GPU} == "ON" ]]; then
BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04"
else
BASE_IMAGE="ubuntu:16.04"
fi
DOCKERFILE_GPU_ENV=""
DOCKERFILE_CUDNN_DSO=""
if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}"
DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.5 /usr/lib/x86_64-linux-gnu/libcudnn.so"
fi
cat <<EOF cat <<EOF
======================================== ========================================
...@@ -168,13 +183,14 @@ EOF ...@@ -168,13 +183,14 @@ EOF
${DOCKERFILE_GPU_ENV} ${DOCKERFILE_GPU_ENV}
ADD go/cmd/pserver/pserver /usr/bin/ ADD go/cmd/pserver/pserver /usr/bin/
ADD go/cmd/master/master /usr/bin/ ADD go/cmd/master/master /usr/bin/
ADD paddle/pybind/print_operators_doc /usr/bin/
# default command shows the paddle version and exit # default command shows the paddle version and exit
CMD ["paddle", "version"] CMD ["paddle", "version"]
EOF EOF
} }
cmake_gen set -xe
cmake_gen ${PYTHON_ABI:-""}
run_build run_build
run_test run_test
gen_docs gen_docs
......
...@@ -44,7 +44,7 @@ if [ $ANDROID_ABI == "armeabi-v7a" ]; then ...@@ -44,7 +44,7 @@ if [ $ANDROID_ABI == "armeabi-v7a" ]; then
-DHOST_C_COMPILER=/usr/bin/gcc \ -DHOST_C_COMPILER=/usr/bin/gcc \
-DHOST_CXX_COMPILER=/usr/bin/g++ \ -DHOST_CXX_COMPILER=/usr/bin/g++ \
-DCMAKE_INSTALL_PREFIX=$DEST_ROOT \ -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-DCMAKE_BUILD_TYPE=Release \ -DCMAKE_BUILD_TYPE=MinSizeRel \
-DUSE_EIGEN_FOR_BLAS=ON \ -DUSE_EIGEN_FOR_BLAS=ON \
-DWITH_C_API=ON \ -DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \ -DWITH_SWIG_PY=OFF \
...@@ -58,7 +58,7 @@ elif [ $ANDROID_ABI == "arm64-v8a" ]; then ...@@ -58,7 +58,7 @@ elif [ $ANDROID_ABI == "arm64-v8a" ]; then
-DHOST_C_COMPILER=/usr/bin/gcc \ -DHOST_C_COMPILER=/usr/bin/gcc \
-DHOST_CXX_COMPILER=/usr/bin/g++ \ -DHOST_CXX_COMPILER=/usr/bin/g++ \
-DCMAKE_INSTALL_PREFIX=$DEST_ROOT \ -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-DCMAKE_BUILD_TYPE=Release \ -DCMAKE_BUILD_TYPE=MinSizeRel \
-DUSE_EIGEN_FOR_BLAS=OFF \ -DUSE_EIGEN_FOR_BLAS=OFF \
-DWITH_C_API=ON \ -DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \ -DWITH_SWIG_PY=OFF \
...@@ -72,7 +72,7 @@ elif [ $ANDROID_ABI == "armeabi" ]; then ...@@ -72,7 +72,7 @@ elif [ $ANDROID_ABI == "armeabi" ]; then
-DHOST_C_COMPILER=/usr/bin/gcc \ -DHOST_C_COMPILER=/usr/bin/gcc \
-DHOST_CXX_COMPILER=/usr/bin/g++ \ -DHOST_CXX_COMPILER=/usr/bin/g++ \
-DCMAKE_INSTALL_PREFIX=$DEST_ROOT \ -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
-DCMAKE_BUILD_TYPE=Release \ -DCMAKE_BUILD_TYPE=MinSizeRel \
-DWITH_C_API=ON \ -DWITH_C_API=ON \
-DWITH_SWIG_PY=OFF \ -DWITH_SWIG_PY=OFF \
-DWITH_STYLE_CHECK=OFF \ -DWITH_STYLE_CHECK=OFF \
......
...@@ -33,6 +33,7 @@ MatrixPtr makeRandomSparseMatrix(size_t height, ...@@ -33,6 +33,7 @@ MatrixPtr makeRandomSparseMatrix(size_t height,
bool withValue, bool withValue,
bool useGpu, bool useGpu,
bool equalNnzPerSample) { bool equalNnzPerSample) {
#ifndef PADDLE_MOBILE_INFERENCE
std::vector<int64_t> ids(height); std::vector<int64_t> ids(height);
std::vector<int64_t> indices(height + 1); std::vector<int64_t> indices(height + 1);
indices[0] = 0; indices[0] = 0;
...@@ -84,6 +85,8 @@ MatrixPtr makeRandomSparseMatrix(size_t height, ...@@ -84,6 +85,8 @@ MatrixPtr makeRandomSparseMatrix(size_t height,
} }
return mat; return mat;
} }
#endif
return nullptr;
} }
void generateSequenceStartPositions(size_t batchSize, void generateSequenceStartPositions(size_t batchSize,
......
...@@ -27,6 +27,9 @@ using namespace paddle; // NOLINT ...@@ -27,6 +27,9 @@ using namespace paddle; // NOLINT
using namespace std; // NOLINT using namespace std; // NOLINT
int main(int argc, char** argv) { int main(int argc, char** argv) {
initMain(argc, argv);
initPython(argc, argv);
if (FLAGS_model_dir.empty() || FLAGS_config_file.empty() || if (FLAGS_model_dir.empty() || FLAGS_config_file.empty() ||
FLAGS_model_file.empty()) { FLAGS_model_file.empty()) {
LOG(INFO) << "Usage: ./paddle_merge_model --model_dir=pass-00000 " LOG(INFO) << "Usage: ./paddle_merge_model --model_dir=pass-00000 "
...@@ -34,9 +37,6 @@ int main(int argc, char** argv) { ...@@ -34,9 +37,6 @@ int main(int argc, char** argv) {
return 0; return 0;
} }
initMain(argc, argv);
initPython(argc, argv);
string confFile = FLAGS_config_file; string confFile = FLAGS_config_file;
#ifndef PADDLE_WITH_CUDA #ifndef PADDLE_WITH_CUDA
FLAGS_use_gpu = false; FLAGS_use_gpu = false;
......
...@@ -37,10 +37,10 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in ...@@ -37,10 +37,10 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py) ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/core.so add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/core.so
COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/core.so COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/core.so
DEPENDS paddle_pybind) DEPENDS paddle_pybind)
add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/core.so) add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/core.so)
add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
...@@ -66,7 +66,7 @@ if (WITH_TESTING) ...@@ -66,7 +66,7 @@ if (WITH_TESTING)
add_subdirectory(paddle/v2/tests) add_subdirectory(paddle/v2/tests)
add_subdirectory(paddle/v2/reader/tests) add_subdirectory(paddle/v2/reader/tests)
add_subdirectory(paddle/v2/plot/tests) add_subdirectory(paddle/v2/plot/tests)
add_subdirectory(paddle/v2/framework/tests) add_subdirectory(paddle/v2/fluid/tests)
endif() endif()
endif() endif()
install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR} install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
......
...@@ -1200,8 +1200,14 @@ def TestData(data_config, async_load_data=None): ...@@ -1200,8 +1200,14 @@ def TestData(data_config, async_load_data=None):
#caffe_mode: compute the output size using floor instead of ceil, #caffe_mode: compute the output size using floor instead of ceil,
# which is consistent of caffe and CuDNN's convention. # which is consistent of caffe and CuDNN's convention.
def cnn_output_size(img_size, filter_size, padding, stride, caffe_mode): def cnn_output_size(img_size,
output = (2 * padding + img_size - filter_size) / float(stride) filter_size,
padding,
stride,
caffe_mode,
dilation=1):
filter_s = (filter_size - 1) * dilation + 1
output = (2 * padding + img_size - filter_s) / float(stride)
if caffe_mode: if caffe_mode:
return 1 + int(math.floor(output)) return 1 + int(math.floor(output))
else: else:
...@@ -1210,8 +1216,14 @@ def cnn_output_size(img_size, filter_size, padding, stride, caffe_mode): ...@@ -1210,8 +1216,14 @@ def cnn_output_size(img_size, filter_size, padding, stride, caffe_mode):
#calcualte image_size based on output_size for de-convolution (ConvTransLayer). #calcualte image_size based on output_size for de-convolution (ConvTransLayer).
#It is the reverse function of cnn_output_size #It is the reverse function of cnn_output_size
def cnn_image_size(output_size, filter_size, padding, stride, caffe_mode): def cnn_image_size(output_size,
img_size = (output_size - 1) * stride + filter_size - 2 * padding filter_size,
padding,
stride,
caffe_mode,
dilation=1):
filter_s = (filter_size - 1) * dilation + 1
img_size = (output_size - 1) * stride + filter_s - 2 * padding
if not caffe_mode: if not caffe_mode:
img_size = img_size + 1 img_size = img_size + 1
return img_size return img_size
...@@ -1253,9 +1265,9 @@ def parse_bilinear(bilinear, input_layer_name, bilinear_conf): ...@@ -1253,9 +1265,9 @@ def parse_bilinear(bilinear, input_layer_name, bilinear_conf):
def parse_pool(pool, input_layer_name, pool_conf, ceil_mode): def parse_pool(pool, input_layer_name, pool_conf, ceil_mode):
pool_conf.pool_type = pool.pool_type pool_conf.pool_type = pool.pool_type
config_assert(pool.pool_type in [ config_assert(pool.pool_type in [
'max-projection', 'avg-projection', 'cudnn-max-pool', 'cudnn-avg-pool' 'max-projection', 'avg-projection', 'max-pool-with-mask', 'cudnn-max-pool', 'cudnn-avg-pool'
], "pool-type %s is not in " ], "pool-type %s is not in " \
"['max-projection', 'avg-projection', " "['max-projection', 'avg-projection', 'max-pool-with-mask'," \
"'cudnn-max-pool', 'cudnn-avg-pool']" % pool.pool_type) "'cudnn-max-pool', 'cudnn-avg-pool']" % pool.pool_type)
pool_conf.channels = pool.channels pool_conf.channels = pool.channels
...@@ -1376,6 +1388,12 @@ def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False): ...@@ -1376,6 +1388,12 @@ def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False):
conv_conf.stride_y = conv.stride_y conv_conf.stride_y = conv.stride_y
conv_conf.groups = conv.groups conv_conf.groups = conv.groups
conv_conf.caffe_mode = conv.caffe_mode conv_conf.caffe_mode = conv.caffe_mode
if not conv.dilation:
conv.dilation = 1
conv.dilation_y = 1
else:
conv_conf.dilation = conv.dilation
conv_conf.dilation_y = conv.dilation_y
if not trans: if not trans:
conv_conf.filter_channels = conv.channels / conv.groups conv_conf.filter_channels = conv.channels / conv.groups
...@@ -1383,20 +1401,20 @@ def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False): ...@@ -1383,20 +1401,20 @@ def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False):
get_img_size(input_layer_name, conv.channels) get_img_size(input_layer_name, conv.channels)
conv_conf.output_x = cnn_output_size( conv_conf.output_x = cnn_output_size(
conv_conf.img_size, conv_conf.filter_size, conv_conf.padding, conv_conf.img_size, conv_conf.filter_size, conv_conf.padding,
conv_conf.stride, conv_conf.caffe_mode) conv_conf.stride, conv_conf.caffe_mode, conv.dilation)
conv_conf.output_y = cnn_output_size( conv_conf.output_y = cnn_output_size(
conv_conf.img_size_y, conv_conf.filter_size_y, conv_conf.padding_y, conv_conf.img_size_y, conv_conf.filter_size_y, conv_conf.padding_y,
conv_conf.stride_y, conv_conf.caffe_mode) conv_conf.stride_y, conv_conf.caffe_mode, conv.dilation_y)
else: else:
conv_conf.filter_channels = num_filters / conv.groups conv_conf.filter_channels = num_filters / conv.groups
conv_conf.output_x, conv_conf.output_y = \ conv_conf.output_x, conv_conf.output_y = \
get_img_size(input_layer_name, conv.channels) get_img_size(input_layer_name, conv.channels)
conv_conf.img_size = cnn_image_size( conv_conf.img_size = cnn_image_size(
conv_conf.output_x, conv_conf.filter_size, conv_conf.padding, conv_conf.output_x, conv_conf.filter_size, conv_conf.padding,
conv_conf.stride, conv_conf.caffe_mode) conv_conf.stride, conv_conf.caffe_mode, conv.dilation)
conv_conf.img_size_y = cnn_image_size( conv_conf.img_size_y = cnn_image_size(
conv_conf.output_y, conv_conf.filter_size_y, conv_conf.padding_y, conv_conf.output_y, conv_conf.filter_size_y, conv_conf.padding_y,
conv_conf.stride_y, conv_conf.caffe_mode) conv_conf.stride_y, conv_conf.caffe_mode, conv.dilation_y)
#caffe_mode: compute the output size using floor instead of ceil, #caffe_mode: compute the output size using floor instead of ceil,
......
...@@ -15,8 +15,8 @@ ...@@ -15,8 +15,8 @@
""" """
__all__ = [ __all__ = [
"BasePoolingType", "MaxPooling", "AvgPooling", "CudnnMaxPooling", "BasePoolingType", "MaxPooling", "AvgPooling", "MaxWithMaskPooling",
"CudnnAvgPooling", "SumPooling", "SquareRootNPooling" "CudnnMaxPooling", "CudnnAvgPooling", "SumPooling", "SquareRootNPooling"
] ]
...@@ -55,6 +55,19 @@ class MaxPooling(BasePoolingType): ...@@ -55,6 +55,19 @@ class MaxPooling(BasePoolingType):
self.output_max_index = output_max_index self.output_max_index = output_max_index
class MaxWithMaskPooling(BasePoolingType):
"""
MaxWithMask pooling.
Not only return the very large values for each dimension in sequence or time steps,
but also the location indices of found maxinum values.
"""
def __init__(self):
BasePoolingType.__init__(self, "max-pool-with-mask")
class CudnnMaxPooling(BasePoolingType): class CudnnMaxPooling(BasePoolingType):
""" """
Cudnn max pooling only support GPU. Return the maxinum value in the Cudnn max pooling only support GPU. Return the maxinum value in the
......
...@@ -28,6 +28,8 @@ layers { ...@@ -28,6 +28,8 @@ layers {
stride_y: 1 stride_y: 1
output_y: 227 output_y: 227
img_size_y: 256 img_size_y: 256
dilation: 1
dilation_y: 1
} }
} }
bias_parameter_name: "___conv_0__.wbias" bias_parameter_name: "___conv_0__.wbias"
......
...@@ -28,6 +28,8 @@ layers { ...@@ -28,6 +28,8 @@ layers {
stride_y: 1 stride_y: 1
output_y: 227 output_y: 227
img_size_y: 256 img_size_y: 256
dilation: 1
dilation_y: 1
} }
} }
bias_parameter_name: "___conv_0__.wbias" bias_parameter_name: "___conv_0__.wbias"
......
...@@ -28,6 +28,8 @@ layers { ...@@ -28,6 +28,8 @@ layers {
stride_y: 1 stride_y: 1
output_y: 48 output_y: 48
img_size_y: 48 img_size_y: 48
dilation: 1
dilation_y: 1
} }
} }
bias_parameter_name: "___conv_0__.wbias" bias_parameter_name: "___conv_0__.wbias"
......
...@@ -30,6 +30,8 @@ layers { ...@@ -30,6 +30,8 @@ layers {
stride_y: 1 stride_y: 1
output_y: 48 output_y: 48
img_size_y: 48 img_size_y: 48
dilation: 1
dilation_y: 1
} }
} }
bias_parameter_name: "___conv_0__.wbias" bias_parameter_name: "___conv_0__.wbias"
...@@ -105,6 +107,8 @@ layers { ...@@ -105,6 +107,8 @@ layers {
stride_y: 1 stride_y: 1
output_y: 24 output_y: 24
img_size_y: 24 img_size_y: 24
dilation: 1
dilation_y: 1
} }
} }
bias_parameter_name: "___conv_1__.wbias" bias_parameter_name: "___conv_1__.wbias"
......
...@@ -30,6 +30,8 @@ layers { ...@@ -30,6 +30,8 @@ layers {
stride_y: 1 stride_y: 1
output_y: 48 output_y: 48
img_size_y: 48 img_size_y: 48
dilation: 1
dilation_y: 1
} }
} }
bias_parameter_name: "___conv_0__.wbias" bias_parameter_name: "___conv_0__.wbias"
......
...@@ -36,6 +36,8 @@ layers { ...@@ -36,6 +36,8 @@ layers {
stride_y: 1 stride_y: 1
output_y: 14 output_y: 14
img_size_y: 14 img_size_y: 14
dilation: 1
dilation_y: 1
} }
} }
bias_parameter_name: "___conv_0__.wbias" bias_parameter_name: "___conv_0__.wbias"
......
...@@ -37,6 +37,8 @@ import model ...@@ -37,6 +37,8 @@ import model
import paddle.trainer.config_parser as cp import paddle.trainer.config_parser as cp
__all__ = [ __all__ = [
'default_startup_program',
'default_main_program',
'optimizer', 'optimizer',
'layer', 'layer',
'activation', 'activation',
......
from paddle.v2.framework import framework as framework from paddle.v2.fluid import framework as framework
__all__ = ['append_backward_ops'] __all__ = ['append_backward_ops']
......
...@@ -13,7 +13,7 @@ A `scoped_function` will take a `function` as input. That function will be ...@@ -13,7 +13,7 @@ A `scoped_function` will take a `function` as input. That function will be
invoked in a new local scope. invoked in a new local scope.
""" """
import paddle.v2.framework.core import paddle.v2.fluid.core
import threading import threading
__tl_scope__ = threading.local() __tl_scope__ = threading.local()
...@@ -27,13 +27,13 @@ __all__ = [ ...@@ -27,13 +27,13 @@ __all__ = [
def get_cur_scope(): def get_cur_scope():
""" """
Get current scope. Get current scope.
:rtype: paddle.v2.framework.core.Scope :rtype: paddle.v2.fluid.core.Scope
""" """
cur_scope_stack = getattr(__tl_scope__, 'cur_scope', None) cur_scope_stack = getattr(__tl_scope__, 'cur_scope', None)
if cur_scope_stack is None: if cur_scope_stack is None:
__tl_scope__.cur_scope = list() __tl_scope__.cur_scope = list()
if len(__tl_scope__.cur_scope) == 0: if len(__tl_scope__.cur_scope) == 0:
__tl_scope__.cur_scope.append(paddle.v2.framework.core.Scope()) __tl_scope__.cur_scope.append(paddle.v2.fluid.core.Scope())
return __tl_scope__.cur_scope[-1] return __tl_scope__.cur_scope[-1]
......
import numpy as np
from paddle.v2.fluid.framework import Program, g_main_program, unique_name, Variable
import paddle.v2.fluid.core as core
def _clone_var_in_block_(block, var):
assert isinstance(var, Variable)
return block.create_var(
name=var.name,
shape=var.shape,
dtype=var.data_type,
type=var.type,
lod_level=var.lod_level,
persistable=True)
class Evaluator(object):
"""
Evalutor Base class.
create metric states
add mini-batch evaluator caculate operator
add increment operator to accumulate the metric states
"""
def __init__(self, name, **kwargs):
"""
init the global states
"""
self._states = {}
if kwargs.has_key("main_program"):
self._main_program = kwargs.get("main_program")
else:
self._main_program = g_main_program
def _update_ops(self, *args, **kwargs):
"""
append update ops to the global states
"""
raise NotImplementedError()
def reset(self, executor, reset_program=None):
"""
Clear metric states at the begin of each pass/user specified batch
"""
if reset_program == None:
reset_program = Program()
else:
reset_program = program
block = reset_program.global_block()
for k, var in self._states.iteritems():
g_var = _clone_var_in_block_(block, var)
zeros = block.create_var(dtype="float32", persistable=True)
block.append_op(
type="fill_constant",
outputs={"Out": [zeros]},
attrs={
"shape": g_var.shape,
"value": .0,
"data_type": 5,
})
block.append_op(
type="scale", inputs={"X": zeros}, outputs={"Out": g_var})
executor.run(reset_program, fetch_list=self._states.values())
def eval(self, executor, eval_program=None):
"""
Merge the mini-batch statistics to form the evaluation result for multiple mini-batches.
"""
raise NotImplementedError()
class Accuracy(Evaluator):
"""
Accuracy need two state variable Total, Correct
"""
def __init__(self, *args, **kwargs):
super(Accuracy, self).__init__("accuracy", **kwargs)
block = self._main_program.global_block()
g_total = block.create_var(
name=unique_name("Total"),
persistable=True,
dtype="int64",
shape=[1])
g_correct = block.create_var(
name=unique_name("Correct"),
persistable=True,
dtype="int64",
shape=[1])
self._states["Total"] = g_total
self._states["Correct"] = g_correct
def _update_ops(self, input, label, k=1, **kwargs):
block = self._main_program.global_block()
topk_out = block.create_var(dtype=input.data_type)
topk_indices = block.create_var(dtype="int64")
block.append_op(
type="top_k",
inputs={"X": [input]},
outputs={"Out": [topk_out],
"Indices": [topk_indices]},
attrs={"k": k})
acc_out = block.create_var(dtype=kwargs.get("out_dtype", "float32"))
correct = block.create_var(dtype="int64", persistable=True)
total = block.create_var(dtype="int64", persistable=True)
block.append_op(
type="accuracy",
inputs={
"Out": [topk_out],
"Indices": [topk_indices],
"Label": [label]
},
outputs={
"Accuracy": [acc_out],
"Correct": [correct],
"Total": [total],
})
block.append_op(
type="cast",
inputs={"X": [self._states["Total"]]},
outputs={"Out": [self._states["Total"]]},
attrs={
"in_data_type": 5, # float32
"out_data_type": 2, #int32
})
block.append_op(
type="cast",
inputs={"X": [self._states["Correct"]]},
outputs={"Out": [self._states["Correct"]]},
attrs={
"in_data_type": 5,
"out_data_type": 2,
})
block.append_op(
type="elementwise_add",
inputs={"X": [self._states["Total"]],
"Y": [total]},
outputs={"Out": [self._states["Total"]]})
block.append_op(
type="elementwise_add",
inputs={"X": [self._states["Correct"]],
"Y": [correct]},
outputs={"Out": [self._states["Correct"]]})
return acc_out
def eval(self, executor, eval_program=None):
if eval_program != None:
eval_program = eval_program
else:
eval_program = Program()
block = eval_program.global_block()
eval_out = block.create_var(dtype=self._states["Total"].data_type)
e_total = _clone_var_in_block_(block, self._states["Total"])
e_correct = _clone_var_in_block_(block, self._states["Correct"])
block.append_op(
type="cast",
inputs={"X": [e_total]},
outputs={"Out": [e_total]},
attrs={
"in_data_type": 2, #int32
"out_data_type": 5, #float32
})
block.append_op(
type="cast",
inputs={"X": [e_correct]},
outputs={"Out": [e_correct]},
attrs={
"in_data_type": 2,
"out_data_type": 5,
})
block.append_op(
type="elementwise_div",
inputs={"X": e_correct,
"Y": e_total},
outputs={"Out": eval_out})
out = executor.run(eval_program, fetch_list=[eval_out])
return np.array(out[0])
def accuracy(*args, **kwargs):
cls = Accuracy(*args, **kwargs)
out = cls._update_ops(*args, **kwargs)
return cls, out
import paddle.v2.framework.core as core import paddle.v2.fluid.core as core
from paddle.v2.framework.framework import Block, Program, g_main_program from paddle.v2.fluid.framework import Block, Program, g_main_program
g_scope = core.Scope() g_scope = core.Scope()
......
import paddle.v2.framework.core as core import paddle.v2.fluid.core as core
import paddle.v2.framework.proto.framework_pb2 as framework_pb2 import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
import collections import collections
import numpy as np import numpy as np
import copy import copy
__all__ = ['Block', 'Variable', 'Program', 'Operator'] __all__ = ['Block', 'Variable', 'Program', 'Operator', 'default_startup_program', 'default_main_program']
def unique_name(prefix): def unique_name(prefix):
...@@ -562,3 +562,9 @@ class Parameter(Variable): ...@@ -562,3 +562,9 @@ class Parameter(Variable):
# program is a global instance. # program is a global instance.
g_main_program = Program() g_main_program = Program()
g_startup_program = Program() g_startup_program = Program()
def default_startup_program():
return g_startup_program
def default_main_program():
return g_main_program
import paddle.v2.framework.framework as framework import paddle.v2.fluid.framework as framework
import numpy as np import numpy as np
__all__ = [ __all__ = [
......
import os import os
import cPickle as pickle import cPickle as pickle
from paddle.v2.framework.framework import Program, Parameter, g_main_program, \ from paddle.v2.fluid.framework import Program, Parameter, g_main_program, \
Variable Variable
__all__ = [ __all__ = [
......
import copy import copy
import itertools import itertools
from paddle.v2.framework.framework import Variable, g_main_program, \ from paddle.v2.fluid.framework import Variable, g_main_program, \
g_startup_program, unique_name, Program g_startup_program, unique_name, Program
from paddle.v2.framework.initializer import ConstantInitializer, \ from paddle.v2.fluid.initializer import ConstantInitializer, \
UniformInitializer, XavierInitializer UniformInitializer, XavierInitializer
......
import paddle.v2.framework.core as core import paddle.v2.fluid.core as core
import paddle.v2.framework.proto.framework_pb2 as framework_pb2 import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
from paddle.v2.framework.framework import OpProtoHolder, Variable, Program, \ from paddle.v2.fluid.framework import OpProtoHolder, Variable, Program, \
Operator Operator
from paddle.v2.framework.initializer import ConstantInitializer, \ from paddle.v2.fluid.initializer import ConstantInitializer, \
NormalInitializer NormalInitializer
from paddle.v2.framework.layer_helper import LayerHelper, unique_name from paddle.v2.fluid.layer_helper import LayerHelper, unique_name
import re import re
import cStringIO import cStringIO
...@@ -574,7 +574,9 @@ def accuracy(input, label, k=1, **kwargs): ...@@ -574,7 +574,9 @@ def accuracy(input, label, k=1, **kwargs):
"Indices": [topk_indices]}, "Indices": [topk_indices]},
attrs={"k": k}) attrs={"k": k})
acc_out_dtype = kwargs.get("out_dtype", "float32") acc_out_dtype = kwargs.get("out_dtype", "float32")
acc_out = helper.create_tmp_variable(dtype=acc_out_dtype) acc_out = helper.create_tmp_variable(dtype="float32")
correct = helper.create_tmp_variable(dtype="int64")
total = helper.create_tmp_variable(dtype="int64")
helper.append_op( helper.append_op(
type="accuracy", type="accuracy",
inputs={ inputs={
...@@ -582,7 +584,11 @@ def accuracy(input, label, k=1, **kwargs): ...@@ -582,7 +584,11 @@ def accuracy(input, label, k=1, **kwargs):
"Indices": [topk_indices], "Indices": [topk_indices],
"Label": [label] "Label": [label]
}, },
outputs={"Accuracy": [acc_out]}) outputs={
"Accuracy": [acc_out],
"Correct": [correct],
"Total": [total],
})
return acc_out return acc_out
......
...@@ -3,8 +3,8 @@ import json ...@@ -3,8 +3,8 @@ import json
import logging import logging
from collections import defaultdict from collections import defaultdict
import paddle.v2.framework.core as core import paddle.v2.fluid.core as core
import paddle.v2.framework.proto.framework_pb2 as framework_pb2 import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
......
import paddle.v2.framework.layers as layers import paddle.v2.fluid.layers as layers
__all__ = ["simple_img_conv_pool", "sequence_conv_pool"] __all__ = ["simple_img_conv_pool", "sequence_conv_pool"]
......
import paddle.v2.framework.core as core import paddle.v2.fluid.core as core
import paddle.v2.framework.proto.framework_pb2 as framework_pb2 import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
def get_all_op_protos(): def get_all_op_protos():
......
from collections import defaultdict from collections import defaultdict
import paddle.v2.framework.framework as framework import paddle.v2.fluid.framework as framework
from paddle.v2.framework.framework import unique_name, Program from paddle.v2.fluid.framework import unique_name, Program
from paddle.v2.framework.backward import append_backward_ops from paddle.v2.fluid.backward import append_backward_ops
from paddle.v2.framework.initializer import ConstantInitializer from paddle.v2.fluid.initializer import ConstantInitializer
from paddle.v2.framework.regularizer import append_regularization_ops from paddle.v2.fluid.regularizer import append_regularization_ops
from paddle.v2.framework.layer_helper import LayerHelper from paddle.v2.fluid.layer_helper import LayerHelper
__all__ = [ __all__ = [
'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer', 'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',
'AdamaxOptimizer' 'AdamaxOptimizer', 'DecayedAdagradOptimizer'
] ]
...@@ -85,7 +85,7 @@ class Optimizer(object): ...@@ -85,7 +85,7 @@ class Optimizer(object):
""" """
if (name in self._accumulators and if (name in self._accumulators and
param.name in self._accumulators[name]): param.name in self._accumulators[name]):
raise Exception("Accumulator {} already exists for parmeter {}". raise Exception("Accumulator {} already exists for parameter {}".
format(name, param.name)) format(name, param.name))
assert isinstance(self.helper, LayerHelper) assert isinstance(self.helper, LayerHelper)
...@@ -307,7 +307,7 @@ class AdagradOptimizer(Optimizer): ...@@ -307,7 +307,7 @@ class AdagradOptimizer(Optimizer):
moment_acc = self._get_accumulator(self._moment_acc_str, moment_acc = self._get_accumulator(self._moment_acc_str,
param_and_grad[0]) param_and_grad[0])
# create the adagrad optimizer op # Create the adagrad optimizer op
adagrad_op = block.append_op( adagrad_op = block.append_op(
type=self.type, type=self.type,
inputs={ inputs={
...@@ -510,3 +510,51 @@ class AdamaxOptimizer(Optimizer): ...@@ -510,3 +510,51 @@ class AdamaxOptimizer(Optimizer):
attrs={"scale": self._beta1}) attrs={"scale": self._beta1})
return [scale_beta1] return [scale_beta1]
class DecayedAdagradOptimizer(Optimizer):
"""Simple Decayed Adagrad optimizer with moment state
"""
_moment_acc_str = "moment"
def __init__(self,
learning_rate,
decay=0.95,
epsilon=1.0e-6,
global_step=None):
assert learning_rate is not None
assert decay is not None
assert epsilon is not None
super(DecayedAdagradOptimizer, self).__init__(global_step)
self.type = "decayed_adagrad"
self._learning_rate = learning_rate
self._decay = decay
self._epsilon = epsilon
def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block)
for p in parameters:
self._add_accumulator(self._moment_acc_str, p)
def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block)
moment_acc = self._get_accumulator(self._moment_acc_str,
param_and_grad[0])
# Create the decayed adagrad optimizer op
decayed_adagrad_op = block.append_op(
type=self.type,
inputs={
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"Moment": moment_acc,
"LearningRate": self._create_param_lr(param_and_grad)
},
outputs={"ParamOut": param_and_grad[0],
"MomentOut": moment_acc},
attrs={"epsilon": self._epsilon})
return decayed_adagrad_op
import paddle.v2.framework.framework as framework import paddle.v2.fluid.framework as framework
__all__ = [ __all__ = [
'append_regularization_ops', 'L2DecayRegularizer', 'L1DecayRegularizer' 'append_regularization_ops', 'L2DecayRegularizer', 'L1DecayRegularizer'
......
import paddle.v2 as paddle import paddle.v2 as paddle
import paddle.v2.framework.layers as layers import paddle.v2.fluid.layers as layers
import paddle.v2.framework.core as core import paddle.v2.fluid.core as core
import paddle.v2.framework.optimizer as optimizer import paddle.v2.fluid.optimizer as optimizer
import paddle.v2.fluid.framework as framework
from paddle.v2.framework.framework import Program from paddle.v2.fluid.io import save_persistables, load_persistables
from paddle.v2.framework.io import save_persistables, load_persistables from paddle.v2.fluid.executor import Executor
from paddle.v2.framework.executor import Executor
import numpy as np import numpy as np
startup_program = Program()
main_program = Program()
x = layers.data( x = layers.data(
name='x', name='x',
shape=[13], shape=[13],
data_type='float32', data_type='float32')
main_program=main_program,
startup_program=startup_program)
y_predict = layers.fc(input=x, y_predict = layers.fc(input=x,
size=1, size=1,
act=None, act=None)
main_program=main_program,
startup_program=startup_program)
y = layers.data( y = layers.data(
name='y', name='y',
shape=[1], shape=[1],
data_type='float32', data_type='float32')
main_program=main_program,
startup_program=startup_program)
cost = layers.square_error_cost( cost = layers.square_error_cost(
input=y_predict, input=y_predict,
label=y, label=y)
main_program=main_program, avg_cost = layers.mean(x=cost)
startup_program=startup_program)
avg_cost = layers.mean(
x=cost, main_program=main_program, startup_program=startup_program)
sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
opts = sgd_optimizer.minimize(avg_cost, startup_program) opts = sgd_optimizer.minimize(avg_cost)
BATCH_SIZE = 20 BATCH_SIZE = 20
...@@ -52,12 +40,12 @@ train_reader = paddle.batch( ...@@ -52,12 +40,12 @@ train_reader = paddle.batch(
place = core.CPUPlace() place = core.CPUPlace()
exe = Executor(place) exe = Executor(place)
exe.run(startup_program, feed={}, fetch_list=[]) exe.run(framework.default_startup_program())
PASS_NUM = 100 PASS_NUM = 100
for pass_id in range(PASS_NUM): for pass_id in range(PASS_NUM):
save_persistables(exe, "./fit_a_line.model/", main_program=main_program) save_persistables(exe, "./fit_a_line.model/")
load_persistables(exe, "./fit_a_line.model/", main_program=main_program) load_persistables(exe, "./fit_a_line.model/")
for data in train_reader(): for data in train_reader():
x_data = np.array(map(lambda x: x[0], data)).astype("float32") x_data = np.array(map(lambda x: x[0], data)).astype("float32")
y_data = np.array(map(lambda x: x[1], data)).astype("float32") y_data = np.array(map(lambda x: x[1], data)).astype("float32")
...@@ -69,7 +57,7 @@ for pass_id in range(PASS_NUM): ...@@ -69,7 +57,7 @@ for pass_id in range(PASS_NUM):
tensor_y = core.LoDTensor() tensor_y = core.LoDTensor()
tensor_y.set(y_data, place) tensor_y.set(y_data, place)
# print tensor_y.get_dims() # print tensor_y.get_dims()
outs = exe.run(main_program, outs = exe.run(framework.default_main_program(),
feed={'x': tensor_x, feed={'x': tensor_x,
'y': tensor_y}, 'y': tensor_y},
fetch_list=[avg_cost]) fetch_list=[avg_cost])
......
import numpy as np import numpy as np
import paddle.v2 as paddle import paddle.v2 as paddle
import paddle.v2.framework.core as core import paddle.v2.fluid.core as core
import paddle.v2.framework.layers as layers import paddle.v2.fluid.layers as layers
import paddle.v2.framework.nets as nets import paddle.v2.fluid.nets as nets
import paddle.v2.framework.optimizer as optimizer import paddle.v2.fluid.optimizer as optimizer
from paddle.v2.framework.executor import Executor from paddle.v2.fluid.executor import Executor
from paddle.v2.framework.framework import g_startup_program, g_main_program import paddle.v2.fluid.framework as framework
from paddle.v2.framework.initializer import XavierInitializer from paddle.v2.fluid.initializer import XavierInitializer
def resnet_cifar10(input, depth=32, main_program=None, startup_program=None): def resnet_cifar10(input, depth=32):
def conv_bn_layer(input, def conv_bn_layer(input,
ch_out, ch_out,
filter_size, filter_size,
stride, stride,
padding, padding,
act='relu', act='relu'):
main_program=None,
startup_program=None):
tmp = layers.conv2d( tmp = layers.conv2d(
input=input, input=input,
filter_size=filter_size, filter_size=filter_size,
...@@ -25,14 +23,10 @@ def resnet_cifar10(input, depth=32, main_program=None, startup_program=None): ...@@ -25,14 +23,10 @@ def resnet_cifar10(input, depth=32, main_program=None, startup_program=None):
stride=stride, stride=stride,
padding=padding, padding=padding,
act=None, act=None,
bias_attr=False, bias_attr=False)
main_program=main_program,
startup_program=startup_program)
return layers.batch_norm( return layers.batch_norm(
input=tmp, input=tmp,
act=act, act=act)
main_program=main_program,
startup_program=startup_program)
def shortcut(input, ch_in, ch_out, stride, program, init_program): def shortcut(input, ch_in, ch_out, stride, program, init_program):
if ch_in != ch_out: if ch_in != ch_out:
...@@ -44,40 +38,30 @@ def resnet_cifar10(input, depth=32, main_program=None, startup_program=None): ...@@ -44,40 +38,30 @@ def resnet_cifar10(input, depth=32, main_program=None, startup_program=None):
def basicblock(input, def basicblock(input,
ch_in, ch_in,
ch_out, ch_out,
stride, stride):
main_program=main_program,
startup_program=startup_program):
tmp = conv_bn_layer( tmp = conv_bn_layer(
input, input,
ch_out, ch_out,
3, 3,
stride, stride,
1, 1)
main_program=main_program,
startup_program=startup_program)
tmp = conv_bn_layer( tmp = conv_bn_layer(
tmp, tmp,
ch_out, ch_out,
3, 3,
1, 1,
1, 1,
act=None, act=None)
main_program=main_program, short = shortcut(input, ch_in, ch_out, stride)
startup_program=startup_program)
short = shortcut(input, ch_in, ch_out, stride, main_program,
startup_program)
return layers.elementwise_add( return layers.elementwise_add(
x=tmp, x=tmp,
y=short, y=short,
act='relu', act='relu')
main_program=main_program,
startup_program=startup_program)
def layer_warp(block_func, input, ch_in, ch_out, count, stride, program, def layer_warp(block_func, input, ch_in, ch_out, count, stride):
startup_program): tmp = block_func(input, ch_in, ch_out, stride)
tmp = block_func(input, ch_in, ch_out, stride, program, startup_program)
for i in range(1, count): for i in range(1, count):
tmp = block_func(tmp, ch_out, ch_out, 1, program, startup_program) tmp = block_func(tmp, ch_out, ch_out, 1)
return tmp return tmp
assert (depth - 2) % 6 == 0 assert (depth - 2) % 6 == 0
...@@ -87,53 +71,41 @@ def resnet_cifar10(input, depth=32, main_program=None, startup_program=None): ...@@ -87,53 +71,41 @@ def resnet_cifar10(input, depth=32, main_program=None, startup_program=None):
ch_out=16, ch_out=16,
filter_size=3, filter_size=3,
stride=1, stride=1,
padding=1, padding=1)
main_program=main_program,
startup_program=startup_program)
res1 = layer_warp( res1 = layer_warp(
basicblock, basicblock,
conv1, conv1,
16, 16,
16, 16,
n, n,
1, 1)
main_program=main_program,
startup_program=startup_program)
res2 = layer_warp( res2 = layer_warp(
basicblock, basicblock,
res1, res1,
16, 16,
32, 32,
n, n,
2, 2)
main_program=main_program,
startup_program=startup_program)
res3 = layer_warp( res3 = layer_warp(
basicblock, basicblock,
res2, res2,
32, 32,
64, 64,
n, n,
2, 2)
main_program=main_program,
startup_program=startup_program)
pool = layers.pool2d( pool = layers.pool2d(
input=res3, input=res3,
pool_size=8, pool_size=8,
pool_type='avg', pool_type='avg',
pool_stride=1, pool_stride=1)
main_program=main_program,
startup_program=startup_program)
return pool return pool
def vgg16_bn_drop(input, main_program=None, startup_program=None): def vgg16_bn_drop(input):
def conv_block(input, def conv_block(input,
num_filter, num_filter,
groups, groups,
dropouts, dropouts):
main_program=None,
startup_program=None):
return nets.img_conv_group( return nets.img_conv_group(
input=input, input=input,
pool_size=2, pool_size=2,
...@@ -143,51 +115,34 @@ def vgg16_bn_drop(input, main_program=None, startup_program=None): ...@@ -143,51 +115,34 @@ def vgg16_bn_drop(input, main_program=None, startup_program=None):
conv_act='relu', conv_act='relu',
conv_with_batchnorm=True, conv_with_batchnorm=True,
conv_batchnorm_drop_rate=dropouts, conv_batchnorm_drop_rate=dropouts,
pool_type='max', pool_type='max')
main_program=main_program,
startup_program=startup_program)
conv1 = conv_block(input, 64, 2, [0.3, 0], main_program, startup_program) conv1 = conv_block(input, 64, 2, [0.3, 0])
conv2 = conv_block(conv1, 128, 2, [0.4, 0], main_program, startup_program) conv2 = conv_block(conv1, 128, 2, [0.4, 0])
conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0], main_program, conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
startup_program) conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0], main_program, conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
startup_program)
conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0], main_program,
startup_program)
drop = layers.dropout( drop = layers.dropout(
x=conv5, x=conv5,
dropout_prob=0.5, dropout_prob=0.5)
main_program=main_program,
startup_program=startup_program)
fc1 = layers.fc(input=drop, fc1 = layers.fc(input=drop,
size=512, size=512,
act=None, act=None,
param_attr={"initializer": XavierInitializer()}, param_attr={"initializer": XavierInitializer()})
main_program=main_program,
startup_program=startup_program)
reshape1 = layers.reshape( reshape1 = layers.reshape(
x=fc1, x=fc1,
shape=list(fc1.shape + (1, 1)), shape=list(fc1.shape + (1, 1)))
main_program=main_program,
startup_program=startup_program)
bn = layers.batch_norm( bn = layers.batch_norm(
input=reshape1, input=reshape1,
act='relu', act='relu')
main_program=main_program,
startup_program=startup_program)
drop2 = layers.dropout( drop2 = layers.dropout(
x=bn, x=bn,
dropout_prob=0.5, dropout_prob=0.5)
main_program=main_program,
startup_program=startup_program)
fc2 = layers.fc(input=drop2, fc2 = layers.fc(input=drop2,
size=512, size=512,
act=None, act=None,
param_attr={"initializer": XavierInitializer()}, param_attr={"initializer": XavierInitializer()})
main_program=main_program,
startup_program=startup_program)
return fc2 return fc2
...@@ -225,7 +180,7 @@ train_reader = paddle.batch( ...@@ -225,7 +180,7 @@ train_reader = paddle.batch(
place = core.CPUPlace() place = core.CPUPlace()
exe = Executor(place) exe = Executor(place)
exe.run(g_startup_program, feed={}, fetch_list=[]) exe.run(framework.default_startup_program())
for pass_id in range(PASS_NUM): for pass_id in range(PASS_NUM):
batch_id = 0 batch_id = 0
...@@ -243,7 +198,7 @@ for pass_id in range(PASS_NUM): ...@@ -243,7 +198,7 @@ for pass_id in range(PASS_NUM):
tensor_img.set(img_data, place) tensor_img.set(img_data, place)
tensor_y.set(y_data, place) tensor_y.set(y_data, place)
outs = exe.run(g_main_program, outs = exe.run(framework.default_main_program(),
feed={"pixel": tensor_img, feed={"pixel": tensor_img,
"label": tensor_y}, "label": tensor_y},
fetch_list=[avg_cost, accuracy]) fetch_list=[avg_cost, accuracy])
......
import paddle.v2 as paddle import paddle.v2 as paddle
import paddle.v2.framework.layers as layers import paddle.v2.fluid.layers as layers
import paddle.v2.framework.nets as nets import paddle.v2.fluid.nets as nets
import paddle.v2.framework.core as core import paddle.v2.fluid.core as core
import paddle.v2.framework.optimizer as optimizer import paddle.v2.fluid.optimizer as optimizer
import paddle.v2.fluid.evaluator as evaluator
from paddle.v2.framework.framework import Program import paddle.v2.fluid.framework as framework
from paddle.v2.framework.executor import Executor from paddle.v2.fluid.executor import Executor
import numpy as np import numpy as np
startup_program = Program()
main_program = Program()
images = layers.data( images = layers.data(
name='pixel', name='pixel',
shape=[1, 28, 28], shape=[1, 28, 28],
data_type='float32', data_type='float32')
main_program=main_program,
startup_program=startup_program)
label = layers.data( label = layers.data(
name='label', name='label',
shape=[1], shape=[1],
data_type='int64', data_type='int64')
main_program=main_program,
startup_program=startup_program)
conv_pool_1 = nets.simple_img_conv_pool( conv_pool_1 = nets.simple_img_conv_pool(
input=images, input=images,
filter_size=5, filter_size=5,
num_filters=20, num_filters=20,
pool_size=2, pool_size=2,
pool_stride=2, pool_stride=2,
act="relu", act="relu")
main_program=main_program,
startup_program=startup_program)
conv_pool_2 = nets.simple_img_conv_pool( conv_pool_2 = nets.simple_img_conv_pool(
input=conv_pool_1, input=conv_pool_1,
filter_size=5, filter_size=5,
num_filters=50, num_filters=50,
pool_size=2, pool_size=2,
pool_stride=2, pool_stride=2,
act="relu", act="relu")
main_program=main_program,
startup_program=startup_program)
predict = layers.fc(input=conv_pool_2, predict = layers.fc(input=conv_pool_2,
size=10, size=10,
act="softmax", act="softmax")
main_program=main_program, cost = layers.cross_entropy(input=predict, label=label)
startup_program=startup_program) avg_cost = layers.mean(x=cost)
cost = layers.cross_entropy(
input=predict,
label=label,
main_program=main_program,
startup_program=startup_program)
avg_cost = layers.mean(x=cost, main_program=main_program)
accuracy = layers.accuracy(
input=predict,
label=label,
main_program=main_program,
startup_program=startup_program)
# optimizer = optimizer.MomentumOptimizer(learning_rate=0.1 / 128.0,
# momentum=0.9)
optimizer = optimizer.AdamOptimizer(learning_rate=0.01, beta1=0.9, beta2=0.999) optimizer = optimizer.AdamOptimizer(learning_rate=0.01, beta1=0.9, beta2=0.999)
opts = optimizer.minimize(avg_cost, startup_program) opts = optimizer.minimize(avg_cost)
accuracy, acc_out = evaluator.accuracy(
input=predict,
label=label)
BATCH_SIZE = 50 BATCH_SIZE = 50
PASS_NUM = 3 PASS_NUM = 3
...@@ -75,10 +54,11 @@ train_reader = paddle.batch( ...@@ -75,10 +54,11 @@ train_reader = paddle.batch(
place = core.CPUPlace() place = core.CPUPlace()
exe = Executor(place) exe = Executor(place)
exe.run(startup_program, feed={}, fetch_list=[]) exe.run(framework.default_startup_program())
for pass_id in range(PASS_NUM): for pass_id in range(PASS_NUM):
count = 0 count = 0
accuracy.reset(exe)
for data in train_reader(): for data in train_reader():
img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]), img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]),
data)).astype("float32") data)).astype("float32")
...@@ -90,14 +70,20 @@ for pass_id in range(PASS_NUM): ...@@ -90,14 +70,20 @@ for pass_id in range(PASS_NUM):
tensor_img.set(img_data, place) tensor_img.set(img_data, place)
tensor_y.set(y_data, place) tensor_y.set(y_data, place)
outs = exe.run(main_program, outs = exe.run(framework.default_main_program(),
feed={"pixel": tensor_img, feed={"pixel": tensor_img,
"label": tensor_y}, "label": tensor_y},
fetch_list=[avg_cost, accuracy]) fetch_list=[avg_cost, acc_out])
loss = np.array(outs[0]) loss = np.array(outs[0])
acc = np.array(outs[1]) acc = np.array(outs[1])
pass_acc = accuracy.eval(exe)
print "pass id : ", pass_id, pass_acc
# print loss, acc
if loss < 10.0 and acc > 0.9: if loss < 10.0 and acc > 0.9:
# if avg cost less than 10.0 and accuracy is larger than 0.9, we think our code is good. # if avg cost less than 10.0 and accuracy is larger than 0.9, we think our code is good.
exit(0) exit(0)
pass_acc = accuracy.eval(exe)
print "pass id : ", pass_id, pass_acc
exit(1) exit(1)
import paddle.v2 as paddle import paddle.v2 as paddle
import paddle.v2.framework.layers as layers import paddle.v2.fluid.layers as layers
import paddle.v2.framework.core as core import paddle.v2.fluid.core as core
import paddle.v2.framework.optimizer as optimizer import paddle.v2.fluid.optimizer as optimizer
import paddle.v2.fluid.framework as framework
from paddle.v2.framework.framework import Program from paddle.v2.fluid.executor import Executor
from paddle.v2.framework.executor import Executor from paddle.v2.fluid.regularizer import L2DecayRegularizer
from paddle.v2.framework.regularizer import L2DecayRegularizer from paddle.v2.fluid.initializer import UniformInitializer
from paddle.v2.framework.initializer import UniformInitializer
import numpy as np import numpy as np
BATCH_SIZE = 128 BATCH_SIZE = 128
startup_program = Program()
main_program = Program()
image = layers.data( image = layers.data(
name='x', name='x',
shape=[784], shape=[784],
data_type='float32', data_type='float32')
main_program=main_program,
startup_program=startup_program)
param_attr = { param_attr = {
'name': None, 'name': None,
...@@ -30,45 +25,30 @@ param_attr = { ...@@ -30,45 +25,30 @@ param_attr = {
hidden1 = layers.fc(input=image, hidden1 = layers.fc(input=image,
size=128, size=128,
act='relu', act='relu',
main_program=main_program,
startup_program=startup_program,
param_attr=param_attr) param_attr=param_attr)
hidden2 = layers.fc(input=hidden1, hidden2 = layers.fc(input=hidden1,
size=64, size=64,
act='relu', act='relu',
main_program=main_program,
startup_program=startup_program,
param_attr=param_attr) param_attr=param_attr)
predict = layers.fc(input=hidden2, predict = layers.fc(input=hidden2,
size=10, size=10,
act='softmax', act='softmax',
main_program=main_program,
startup_program=startup_program,
param_attr=param_attr) param_attr=param_attr)
label = layers.data( label = layers.data(
name='y', name='y',
shape=[1], shape=[1],
data_type='int64', data_type='int64')
main_program=main_program,
startup_program=startup_program)
cost = layers.cross_entropy( cost = layers.cross_entropy(input=predict, label=label)
input=predict, avg_cost = layers.mean(x=cost)
label=label,
main_program=main_program,
startup_program=startup_program)
avg_cost = layers.mean(
x=cost, main_program=main_program, startup_program=startup_program)
accuracy = layers.accuracy( accuracy = layers.accuracy(
input=predict, input=predict,
label=label, label=label)
main_program=main_program,
startup_program=startup_program)
optimizer = optimizer.MomentumOptimizer(learning_rate=0.001, momentum=0.9) optimizer = optimizer.MomentumOptimizer(learning_rate=0.001, momentum=0.9)
opts = optimizer.minimize(avg_cost, startup_program) opts = optimizer.minimize(avg_cost)
train_reader = paddle.batch( train_reader = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
...@@ -78,7 +58,7 @@ train_reader = paddle.batch( ...@@ -78,7 +58,7 @@ train_reader = paddle.batch(
place = core.CPUPlace() place = core.CPUPlace()
exe = Executor(place) exe = Executor(place)
exe.run(startup_program, feed={}, fetch_list=[]) exe.run(framework.default_startup_program())
PASS_NUM = 100 PASS_NUM = 100
for pass_id in range(PASS_NUM): for pass_id in range(PASS_NUM):
...@@ -93,7 +73,7 @@ for pass_id in range(PASS_NUM): ...@@ -93,7 +73,7 @@ for pass_id in range(PASS_NUM):
tensor_y = core.LoDTensor() tensor_y = core.LoDTensor()
tensor_y.set(y_data, place) tensor_y.set(y_data, place)
outs = exe.run(main_program, outs = exe.run(framework.default_main_program(),
feed={'x': tensor_x, feed={'x': tensor_x,
'y': tensor_y}, 'y': tensor_y},
fetch_list=[avg_cost, accuracy]) fetch_list=[avg_cost, accuracy])
......
import paddle.v2 as paddle import paddle.v2 as paddle
import paddle.v2.framework.layers as layers import paddle.v2.fluid.layers as layers
import paddle.v2.framework.nets as nets import paddle.v2.fluid.nets as nets
import paddle.v2.framework.core as core import paddle.v2.fluid.core as core
import paddle.v2.framework.optimizer as optimizer import paddle.v2.fluid.optimizer as optimizer
import paddle.v2.fluid.framework as framework
from paddle.v2.framework.framework import Program, g_main_program, g_startup_program from paddle.v2.fluid.executor import Executor
from paddle.v2.framework.executor import Executor
import numpy as np import numpy as np
...@@ -70,7 +69,7 @@ def main(): ...@@ -70,7 +69,7 @@ def main():
place = core.CPUPlace() place = core.CPUPlace()
exe = Executor(place) exe = Executor(place)
exe.run(g_startup_program) exe.run(framework.default_startup_program())
for pass_id in xrange(PASS_NUM): for pass_id in xrange(PASS_NUM):
for data in train_data(): for data in train_data():
...@@ -82,7 +81,7 @@ def main(): ...@@ -82,7 +81,7 @@ def main():
tensor_label = core.LoDTensor() tensor_label = core.LoDTensor()
tensor_label.set(label, place) tensor_label.set(label, place)
outs = exe.run(g_main_program, outs = exe.run(framework.default_main_program(),
feed={"words": tensor_words, feed={"words": tensor_words,
"label": tensor_label}, "label": tensor_label},
fetch_list=[cost, acc]) fetch_list=[cost, acc])
......
import paddle.v2 as paddle import paddle.v2 as paddle
import paddle.v2.framework.layers as layers import paddle.v2.fluid.layers as layers
import paddle.v2.framework.nets as nets import paddle.v2.fluid.nets as nets
import paddle.v2.framework.core as core import paddle.v2.fluid.core as core
import paddle.v2.framework.optimizer as optimizer import paddle.v2.fluid.optimizer as optimizer
import paddle.v2.fluid.framework as framework
from paddle.v2.framework.framework import Program, g_main_program, g_startup_program from paddle.v2.fluid.executor import Executor
from paddle.v2.framework.executor import Executor
import numpy as np import numpy as np
...@@ -81,7 +80,7 @@ def main(): ...@@ -81,7 +80,7 @@ def main():
place = core.CPUPlace() place = core.CPUPlace()
exe = Executor(place) exe = Executor(place)
exe.run(g_startup_program) exe.run(framework.default_startup_program())
for pass_id in xrange(PASS_NUM): for pass_id in xrange(PASS_NUM):
for data in train_data(): for data in train_data():
...@@ -93,7 +92,7 @@ def main(): ...@@ -93,7 +92,7 @@ def main():
tensor_label = core.LoDTensor() tensor_label = core.LoDTensor()
tensor_label.set(label, place) tensor_label.set(label, place)
outs = exe.run(g_main_program, outs = exe.run(framework.default_main_program(),
feed={"words": tensor_words, feed={"words": tensor_words,
"label": tensor_label}, "label": tensor_label},
fetch_list=[cost, acc]) fetch_list=[cost, acc])
......
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册