diff --git a/.gitignore b/.gitignore
index 4f21fefda9f64a0392881971a715b97c234030e3..351b8204100dfd71e94cb3efa2e946b44b9e4285 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,3 +27,4 @@ CMakeFiles
 cmake_install.cmake
 paddle/.timestamp
 python/paddlepaddle.egg-info/
+paddle/pybind/pybind.h
diff --git a/doc/design/refactorization.md b/doc/design/refactorization.md
new file mode 100644
index 0000000000000000000000000000000000000000..e105861e926411a269b0b52dd4688744912c9ab3
--- /dev/null
+++ b/doc/design/refactorization.md
@@ -0,0 +1,253 @@
+# Design Doc: Refactorization Overview
+
+The goals of refactorization include:
+
+1. Make it easy for external contributors to write new elementary computation operations.
+1. Make the codebase clean and readable.
+1. Introduce a new design of computation representation -- a computation graph of operators and variables.
+1. The graph representation helps implement auto-scalable and auto-fault-recoverable distributed computing.
+
+## Computation Graphs
+
+1. PaddlePaddle represents the computation, i.e., the training and inference of DL models, by computation graphs.
+
+   1. Please dig into [computation graphs](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/graph.md) for a solid example.
+
+1. Users write Python programs to describe the graphs and run them (locally or remotely).
+
+1. A graph is composed of *variables* and *operators*.
+
+1. The description of graphs must be serializable/deserializable, so that it
+
+   1. can be sent to the cloud for distributed execution, and
+   1. can be sent to clients for mobile or enterprise deployment.
+
+1. The Python program does two things:
+
+   1. *compilation*: run a Python program to generate a protobuf message representation of the graph and send it to
+      1. the C++ library `libpaddle.so` for local execution,
+      1. the master process of a distributed training job for training, or
+      1. the server process of a Kubernetes serving job for distributed serving.
+   1. *execution*: according to the protobuf message, construct instances of class `Variable` and `OperatorBase`, and run them.
+
+## Description and Realization
+
+At compile time, the Python program generates a protobuf message representation of the graph, i.e., the description of the graph.
+
+At runtime, the C++ program realizes the graph and runs it.
+
+| | Representation (protobuf messages) | Realization (C++ class objects) |
+|---|---|---|
+|Data|[VarDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L107)|[Variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24)|
+|Operation|[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35)|[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64)|
+|Block|BlockDesc|Block|
+
+The word *graph* is interchangeable with *block* in this document. A graph represents computation steps and local variables, like a block in a C++/Java program, i.e., a pair of `{` and `}`.
+
+## Compilation and Execution
+
+1. Run an application Python program to describe the graph. In particular,
+
+   1. create VarDesc to represent local/intermediate variables,
+   1. create operators and set attributes,
+   1. validate attribute values,
+   1. infer the type and the shape of variables,
+   1. plan memory reuse for variables,
+   1. generate the backward and optimization parts of the graph,
+   1. possibly split the graph for distributed training.
+
+1. The invocation of `train` or `infer` in the application Python program:
+
+   1. create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md) for each run of a block,
+      1. realize local variables defined in the BlockDesc message in the new scope,
+      1. a scope is similar to the stack frame in programming languages,
+
+   1. create an instance of class `Block`, in which,
+      1. realize operators in the BlockDesc message,
+
+   1. run the Block by calling
+      1. `Block::Eval(vector<Variable>* targets)` for forward and backward computations, or
+      1. `Block::Eval(vector<Variable>* targets)` for optimization.
+
+
+## Intermediate Representation (IR)
+
+```text
+Compile Time -> IR -> Runtime
+```
+
+### Benefits
+
+- Optimization
+  ```text
+  Compile Time -> IR -> Optimized IR -> Runtime
+  ```
+- Send automatically partitioned IR to different nodes.
+  - Automatic data parallelism
+    ```text
+    Compile Time
+    |-> Single GPU IR
+    |-> [trainer-IR-0, trainer-IR-1, pserver-IR]
+    |-> Node-0 (runs trainer-IR-0)
+    |-> Node-1 (runs trainer-IR-1)
+    |-> Node-2 (runs pserver-IR)
+    ```
+  - Automatic model parallelism (planned for the future)
+
+---
+
+# Operator/OpWithKernel/OpKernel
+
+![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/49caf1fb70820fb4a6c217634317c9306f361f36/op_op_with_kern_class_diagram.dot)
+
+---
+
+# Operator
+![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/dd598e8f1976f5759f58af5e5ef94738a6b2e661/op.dot)
+
+* `Operator` is the fundamental building block of the user interface.
+  * Operator stores input/output variable names and attributes.
+  * The `InferShape` interface is used to infer the shapes of the output variables from the shapes of its inputs.
+  * Use `Run` to compute the `output variables` from the `input variables`.
+
+---
+
+# OpWithKernel/Kernel
+
+![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/9d7f4eba185cf41c8e2fbfb40ae21890dbddcd39/op_with_kernel.dot)
+
+* `OpWithKernel` inherits `Operator`.
+* `OpWithKernel` contains a Kernel map.
+  * `OpWithKernel::Run` gets the device's kernel and invokes `OpKernel::Compute`.
+  * `OpKernelKey` is the map key. It currently contains only the device place, but may contain the data type later.
+
+---
+
+# Why separate Kernel and Operator
+
+* Separate GPU and CPU code.
+  * Make Paddle able to run without a GPU.
+* Allow one operator (the user interface) to contain many implementations.
+  * The same `mul` op can have different FP16 and FP32 kernels, and different MKL and Eigen kernels.
+---
+
+# Libraries for Kernel development
+
+* `Eigen::Tensor` contains basic math and element-wise functions.
+  * Note that `Eigen::Tensor` has a broadcast implementation.
+  * Limit the number of `tensor.device(dev) =` expressions in your code.
+* `thrust::transform` and `std::transform`.
+  * `thrust` has the same API as the C++ standard library. Using `transform`, one can quickly implement a customized element-wise kernel.
+  * `thrust` also has more complex APIs, like `scan`, `reduce`, and `reduce_by_key`.
+* Hand-written `GPUKernel` and `CPU` code
+  * Do not write code in `.h` files. CPU kernels should be in `.cc` files; GPU kernels should be in `.cu` files. (`GCC` cannot compile GPU code.)
+---
+# Operator Registration
+
+## Why is registration necessary?
+We need a method to build mappings between Op type names and Op classes.
+
+## How is registration done?
+
+Maintain a map whose key is the type name and whose value is the corresponding Op constructor.
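+
+A minimal sketch of the idea (the `OpRegistry` and `OpCreator` names below are illustrative only, not Paddle's actual registry code): a singleton map from the type name to a creator function.
+
+```cpp
+// Hypothetical sketch of a type-name -> constructor registry.
+#include <functional>
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+class OperatorBase {
+ public:
+  virtual ~OperatorBase() = default;
+};
+
+using OpCreator = std::function<std::unique_ptr<OperatorBase>()>;
+
+class OpRegistry {
+ public:
+  static OpRegistry& Instance() {
+    static OpRegistry g_registry;  // constructed once, on first use
+    return g_registry;
+  }
+  void Register(const std::string& op_type, OpCreator creator) {
+    creators_[op_type] = std::move(creator);
+  }
+  std::unique_ptr<OperatorBase> Create(const std::string& op_type) const {
+    return creators_.at(op_type)();  // throws if op_type was never registered
+  }
+
+ private:
+  std::unordered_map<std::string, OpCreator> creators_;
+};
+```
+
+A registration macro such as `REGISTER_OP` can then expand to a file-scope static object whose constructor calls `Register`; because that static object lives in the Op's object file, the `USE` macros described below are needed to force the linker to keep it.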
+
+---
+# The Registry Map
+
+### `OpInfoMap`
+
+`op_type(string)` -> `OpInfo`
+
+`OpInfo`:
+
+- **`creator`**: The Op constructor.
+- **`grad_op_type`**: The type of the gradient Op.
+- **`proto`**: The Op's Protobuf message, including inputs, outputs, and required attributes.
+- **`checker`**: Used to check attributes.
+
+---
+# Related Concepts
+
+### Op_Maker
+Its constructor takes `proto` and `checker`, which are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37))
+
+### Registration Macros
+```cpp
+REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, grad_op_class)
+REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
+```
+
+### `USE` Macros
+Make sure the registration process is executed and linked.
+
+---
+# Registration Process
+1. Write an Op class, as well as its gradient Op class if there is one.
+2. Write an Op maker class. In the constructor, describe the Op's inputs, outputs, and attributes.
+3. Invoke the macro `REGISTER_OP`. The macro will
+   1. call the maker class to complete `proto` and `checker`, and
+   2. with the completed `proto` and `checker`, insert a new key-value pair into the `OpInfoMap`.
+
+4. Invoke the `USE` macro where the Op is used, to make sure it is linked.
+
+---
+# Backward Module (1/2)
+### Create Backward Operator
+- Mapping from forward Op to backward Op
+![backward](https://gist.githubusercontent.com/dzhwinter/a6fbd4623ee76c459f7f94591fd1abf0/raw/61026ab6e518e66bde66a889bc42557a1fccff33/backward.png)
+
+---
+# Backward Module (2/2)
+### Build Backward Network
+- **Input**: a graph of forward operators
+- **Output**: a graph of backward operators
+- **Corner cases in construction**
+  - shared variable => insert an `Add` operator
+  - no gradient => insert a `fill_zero_grad` operator
+  - recursive NetOp => call `Backward` recursively
+  - RNN Op => recursively call `Backward` on its stepnet
+
+
+---
+# Scope, Variable, Tensor
+
+* `Tensor` is an n-dimensional array with a type.
+  * Only dims and data pointers are stored in `Tensor`.
+  * All operations on `Tensor` are written in `Operator` or global functions.
+  * For the variable-length Tensor design, see [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md).
+* `Variable` holds the inputs and outputs of an operator; it is not limited to `Tensor`.
+  * `step_scopes` in RNN is a variable but not a tensor.
+* `Scope` is where variables are stored.
+  * `map<string /* var name */, Variable>`
+  * `Scope` has a hierarchical structure. A local scope can get variables from its parent scope.
+
+---
+# Block (in design)
+## The difference from the original RNNOp
+- a block as an operator is more intuitive than `RNNOp`,
+- it offers a new interface `Eval(targets)` to deduce the minimal block to `Run`,
+- it fits the compile-time/runtime separation design:
+  - during compilation, `SymbolTable` stores `VarDesc`s and `OpDesc`s and serializes them to a `BlockDesc`;
+  - when the graph executes, a Block constructed with the `BlockDesc` creates `Op`s and `Var`s, then `Run`s them.
+
+---
+# Milestone
+- take Paddle/books as the main line; the requirements of the models motivate framework refactoring,
+- model migration
+  - framework development gives **priority support** to model migration; for example,
+    - the MNIST demo needs a Python interface,
+    - the RNN models require the framework to support `LoDTensor`.
+  - determine some timelines:
+    - heavily relied-upon Ops need to be migrated first,
+    - different models can be migrated in parallel.
+- improve the framework at the same time,
+- accept imperfection; concentrate on solving the specific problem at the right price.
+
+---
+# Control the migration quality
+- compare the performance of migrated models with the old ones,
+- follow the Google C++ style guide,
+- build an automatic workflow for generating Python/C++ documentation:
+  - the documentation of layers and ops should be written inside the code,
+  - take documentation quality into account when reviewing PRs,
+  - preview the documentation, and read and improve it from the users' perspective.
diff --git a/paddle/cuda/include/hl_cuda_cudnn.h b/paddle/cuda/include/hl_cuda_cudnn.h
index 3f68c62de6d9b3aaadc9180d86159089dc728ea9..b44b071bd1b3b6e9e5539d5dc0c2b155c524fd57 100644
--- a/paddle/cuda/include/hl_cuda_cudnn.h
+++ b/paddle/cuda/include/hl_cuda_cudnn.h
@@ -22,10 +22,10 @@ limitations under the License. */
 */
 typedef enum {
   HL_POOLING_MAX = 0,
-  // average includes padded values
-  HL_POOLING_AVERAGE = 1,
   // average does not include padded values
-  HL_POOLING_AVERAGE_EXCLUDE_PADDING = 2,
+  HL_POOLING_AVERAGE = 1,
+  // average includes padded values
+  HL_POOLING_AVERAGE_INCLUDE_PADDING = 2,
   HL_POOLING_END
 } hl_pooling_mode_t;
diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu
index 9ba3d142617537c0160f6dccb86ddca43ada15a5..58674febdc4a094c95ff03701e4586c32729847d 100644
--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
@@ -211,13 +211,11 @@ __global__ void KeAvgPoolForward(const int nthreads,
     int hstart = ph * strideH - padH;
     int wstart = pw * strideW - padW;
-    int hend = min(hstart + sizeY, height + padH);
-    int wend = min(wstart + sizeX, width + padW);
-    int pool_size = (hend - hstart) * (wend - wstart);
+    int hend = min(hstart + sizeY, height);
+    int wend = min(wstart + sizeX, width);
     hstart = max(hstart, 0);
     wstart = max(wstart, 0);
-    hend = min(hend, height);
-    wend = min(wend, width);
+    int pool_size = (hend - hstart) * (wend - wstart);
     real aveval = 0;
     inputData += (frameNum * channels + c) * height * width;
@@ -299,12 +297,14 @@ __global__ void KeAvgPoolBackward(const int nthreads,
     outGrad += (frameNum * outStride + offsetC * pooledH * pooledW);
     for (int ph = phstart; ph < phend; ++ph) {
+      int hstart = ph * strideH - padH;
+      int hend = min(hstart + sizeY, height);
+      hstart = max(hstart, 0);
       for (int pw = pwstart; pw < pwend; ++pw) {
         // figure out the pooling size
-        int hstart = ph * strideH - padH;
         int wstart = pw * strideW - padW;
-        int hend = min(hstart + sizeY, height + padH);
-        int wend = min(wstart + sizeX, width + padW);
+        int wend = min(wstart + sizeX, width);
+        wstart = max(wstart, 0);
         int poolsize = (hend - hstart) * (wend - wstart);
         gradient += outGrad[ph * pooledW + pw] / poolsize;
       }
@@ -600,16 +600,13 @@ __global__ void KeAvgPool3DForward(const int nthreads,
     int dstart = pd * strideD - padD;
     int hstart = ph * strideH - padH;
     int wstart = pw * strideW - padW;
-    int dend = min(dstart + sizeZ, depth + padD);
-    int hend = min(hstart + sizeY, height + padH);
-    int wend = min(wstart + sizeX, width + padW);
-    int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+    int dend = min(dstart + sizeZ, depth);
+    int hend = min(hstart + sizeY, height);
+    int wend = min(wstart + sizeX, width);
     dstart = max(dstart, 0);
     hstart = max(hstart, 0);
     wstart = max(wstart, 0);
-    dend = min(dend, depth);
-    hend = min(hend, height);
-    wend = min(wend, width);
+    int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
     real aveval = 0;
     inputData += (frameNum * channels + c) * depth
* height * width; @@ -712,15 +709,18 @@ __global__ void KeAvgPool3DBackward(const int nthreads, outGrad += (frameNum * channels + offsetC) * pooledD * pooledH * pooledW; for (int pd = pdstart; pd < pdend; ++pd) { + int dstart = pd * strideD - padD; + int dend = min(dstart + sizeZ, depth); + dstart = max(dstart, 0); for (int ph = phstart; ph < phend; ++ph) { + int hstart = ph * strideH - padH; + int hend = min(hstart + sizeY, height); + hstart = max(hstart, 0); for (int pw = pwstart; pw < pwend; ++pw) { // figure out the pooling size - int dstart = pd * strideD - padD; - int hstart = ph * strideH - padH; int wstart = pw * strideW - padW; - int dend = min(dstart + sizeZ, depth + padD); - int hend = min(hstart + sizeY, height + padH); - int wend = min(wstart + sizeX, width + padW); + int wend = min(wstart + sizeX, width); + wstart = max(wstart, 0); int poolsize = (dend - dstart) * (hend - hstart) * (wend - wstart); gradient += outGrad[(pd * pooledH + ph) * pooledW + pw] / poolsize; } diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc index f38ef692558b908ed65d2c84821bbb7c3b439742..b8caf48f9c06094e85765f7aa5a3f4195d0ca931 100644 --- a/paddle/cuda/src/hl_cuda_cudnn.cc +++ b/paddle/cuda/src/hl_cuda_cudnn.cc @@ -432,11 +432,11 @@ void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc, cudnn_mode = CUDNN_POOLING_MAX; break; case HL_POOLING_AVERAGE: - cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; - break; - case HL_POOLING_AVERAGE_EXCLUDE_PADDING: cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; break; + case HL_POOLING_AVERAGE_INCLUDE_PADDING: + cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; + break; default: LOG(FATAL) << "parameter mode error"; } diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index c57537be4bf67a8db6a49669ab8d2ed1b1324bdc..f8a64a786611ef872dbbfced10919e00c4d46715 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -22,14 +22,14 @@ namespace framework { template <> Eigen::DefaultDevice& ExecutionContext::GetEigenDevice< platform::CPUPlace, Eigen::DefaultDevice>() const { - return *device_context_->get_eigen_device(); + return *device_context_.get_eigen_device(); } #ifndef PADDLE_ONLY_CPU template <> Eigen::GpuDevice& ExecutionContext::GetEigenDevice() const { - return *device_context_->get_eigen_device(); + return *device_context_.get_eigen_device(); } #endif diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index adae7bfc3d7d31b1ed0373f01db4ef80343a08f7..b7c9c39402d57daf0aec97d98535ac8a8d9c0150 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -366,7 +366,7 @@ struct EigenDeviceConverter { class ExecutionContext : public InferShapeContext { public: ExecutionContext(const OperatorBase& op, const Scope& scope, - const platform::DeviceContext* device_context) + const platform::DeviceContext& device_context) : InferShapeContext(op, scope), device_context_(device_context) {} template ::EigenDeviceType> DeviceType& GetEigenDevice() const; - platform::Place GetPlace() const { return device_context_->GetPlace(); } + platform::Place GetPlace() const { return device_context_.GetPlace(); } - const platform::DeviceContext* device_context() const { + const platform::DeviceContext& device_context() const { return device_context_; } @@ -401,7 +401,8 @@ class ExecutionContext : public InferShapeContext { return res; } - const platform::DeviceContext* device_context_; + private: + const platform::DeviceContext& 
device_context_; }; template <> @@ -461,7 +462,7 @@ class OperatorWithKernel : public OperatorBase { void Run(const Scope& scope, const platform::DeviceContext& dev_ctx) const final { auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx)); - opKernel->Compute(ExecutionContext(*this, scope, &dev_ctx)); + opKernel->Compute(ExecutionContext(*this, scope, dev_ctx)); } static std::unordered_map& diff --git a/paddle/gserver/layers/CudnnPoolLayer.cpp b/paddle/gserver/layers/CudnnPoolLayer.cpp index 4adb2d4709e585a6fec052435c33714d6e3a3f0e..810a1af2d09c63c3787a1ac225c2c7de4238d609 100644 --- a/paddle/gserver/layers/CudnnPoolLayer.cpp +++ b/paddle/gserver/layers/CudnnPoolLayer.cpp @@ -29,9 +29,9 @@ bool CudnnPoolLayer::typeCheck(const std::string &poolType, if (mode) { *mode = HL_POOLING_AVERAGE; } - } else if (poolType == "cudnn-avg-excl-pad-pool") { + } else if (poolType == "cudnn-avg-incl-pad-pool") { if (mode) { - *mode = HL_POOLING_AVERAGE_EXCLUDE_PADDING; + *mode = HL_POOLING_AVERAGE_INCLUDE_PADDING; } } else { return false; diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.cpp b/paddle/gserver/layers/MKLDNNPoolLayer.cpp new file mode 100644 index 0000000000000000000000000000000000000000..48b2f5a4cb37f6a9c4b1fdc6178c914b46c76e63 --- /dev/null +++ b/paddle/gserver/layers/MKLDNNPoolLayer.cpp @@ -0,0 +1,277 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include "MKLDNNPoolLayer.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/utils/Logging.h"
+
+using namespace mkldnn;  // NOLINT
+typedef memory::format format;
+
+namespace paddle {
+
+REGISTER_LAYER(mkldnn_pool, MKLDNNPoolLayer);
+
+bool MKLDNNPoolLayer::init(const LayerMap& layerMap,
+                           const ParameterMap& parameterMap) {
+  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
+    return false;
+  }
+
+  /* the size of inputs for pool-layer is 1 */
+  CHECK_EQ(config_.inputs_size(), 1);
+  const PoolConfig& conf = config_.inputs(0).pool_conf();
+  ic_ = conf.channels();
+  ih_ = conf.img_size_y();
+  iw_ = conf.img_size();
+  oc_ = ic_;
+  oh_ = conf.output_y();
+  ow_ = conf.output_x();
+  fh_ = conf.size_y();
+  fw_ = conf.size_x();
+  ph_ = conf.padding_y();
+  pw_ = conf.padding();
+  sh_ = conf.stride_y();
+  sw_ = conf.stride();
+
+  const std::string& type = conf.pool_type();
+  if (type == "max-projection") {
+    poolAlgo_ = algorithm::pooling_max;
+  } else if (type == "avg-projection") {
+    // Paddle only uses exclude_padding
+    poolAlgo_ = algorithm::pooling_avg_exclude_padding;
+  } else {
+    LOG(FATAL) << "unknown pooling type!";
+  }
+  return true;
+}
+
+void MKLDNNPoolLayer::reshape(
+    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
+  reshapeInput(bs, ih, iw);
+  // ic_ and oc cannot be changed
+  CHECK_EQ(inputElemenCnt_ / bs / ih / iw, (size_t)ic)
+      << "Input channel can not be changed";
+
+  // calculate output sizes
+  // Paddle uses caffeMode = false for pooling
+  oh = outputSize(ih, fh_, ph_, sh_, false);
+  ow = outputSize(iw, fw_, pw_, sw_, false);
+  reshapeOutput(oh, ow);
+
+  resizeOutput(bs, oc * oh * ow);
+
+  printSizeInfo();
+}
+
+void MKLDNNPoolLayer::resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                               MKLDNNMatrixPtr& in,
+                               MKLDNNMatrixPtr& wgt,
+                               MKLDNNMatrixPtr& bias,
+                               MKLDNNMatrixPtr& out) {
+  resetFwdBuffers(in, out);
+
+  resetFwdPD(fwdPD_, in, out);
+
+  resetFwdPipeline(pipeline, fwdPD_, in, out);
+
+  printValueFormatFlow();
+}
+
+void MKLDNNPoolLayer::resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                               MKLDNNMatrixPtr& in,
+                               MKLDNNMatrixPtr& wgt,
+                               MKLDNNMatrixPtr& bias,
+                               MKLDNNMatrixPtr& out) {
+  std::shared_ptr<pool_bwd::primitive_desc> pd;
+
+  resetBwdBuffers(in, out);
+
+  resetBwdPD(pd, in, out);
+
+  resetBwdPipeline(pipeline, pd, in, out);
+
+  printGradFormatFlow();
+}
+
+void MKLDNNPoolLayer::updateInputData() {
+  inVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
+}
+
+void MKLDNNPoolLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
+                                      MKLDNNMatrixPtr& out) {
+  resetInValue(in);
+
+  resetOutValue(out);
+}
+
+void MKLDNNPoolLayer::resetInValue(MKLDNNMatrixPtr& in) {
+  if (inputIsOnlyMKLDNN()) {
+    const MatrixPtr& dnnIn = getInputValue(0);
+    in = std::dynamic_pointer_cast<MKLDNNMatrix>(dnnIn);
+    CHECK(in) << "Input should be MKLDNNMatrix";
+  } else {
+    CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
+    const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE);
+    in = MKLDNNMatrix::create(
+        cpuIn, {bs_, ic_, ih_, iw_}, format::nchw, engine_);
+  }
+}
+
+void MKLDNNPoolLayer::resetOutValue(MKLDNNMatrixPtr& out) {
+  CHECK(inVal_) << "Should reset input value first";
+  memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
+  out = MKLDNNMatrix::create(
+      output_.value, outDims, inVal_->getFormat(), engine_);
+  output_.value = std::dynamic_pointer_cast<Matrix>(out);
+
+  // create a reorder if the output value is on the CPU device and the
+  // primitive descs do not match
+  cpuOutVal_ = nullptr;
+  cvtOutVal_ = nullptr;
+  if (!outputIsOnlyMKLDNN()) {
+    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value;
+    cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims,
format::nchw, engine_);
+    if (cpuOutVal_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
+      cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
+      CHECK(cvtOutVal_) << "should not be empty";
+    } else {
+      // the CPU output shares the data of the MKLDNN output
+      cpuOut->setData(out->getData());
+      cpuOutVal_ = out;
+    }
+  }
+}
+
+void MKLDNNPoolLayer::resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
+                                 MKLDNNMatrixPtr in,
+                                 MKLDNNMatrixPtr out) {
+  memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_};
+  memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
+  memory::dims kernels = memory::dims{fh_, fw_};
+  memory::dims strides = memory::dims{sh_, sw_};
+  memory::dims padL = memory::dims{ph_, pw_};
+  memory::dims padR = getPaddingR();
+  padding_kind padKind = padding_kind::zero;
+  prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring
+                                        : prop_kind::forward_training;
+  auto fwdDesc = pool_fwd::desc(pk,
+                                poolAlgo_,
+                                in->getMemoryDesc(),
+                                out->getMemoryDesc(),
+                                strides,
+                                kernels,
+                                padL,
+                                padR,
+                                padKind);
+  pd.reset(new pool_fwd::primitive_desc(fwdDesc, engine_));
+
+  // prepare the workspace if necessary
+  workspace_ =
+      (passType_ != PASS_TEST && poolAlgo_ == algorithm::pooling_max)
+          ? std::make_shared<memory>(memory(pd->workspace_primitive_desc()))
+          : nullptr;
+}
+
+void MKLDNNPoolLayer::resetFwdPipeline(
+    std::vector<mkldnn::primitive>& pipeline,
+    std::shared_ptr<pool_fwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& out) {
+  pipeline.clear();
+  fwd_ = workspace_
+             ? std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out, *workspace_))
+             : std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out));
+  pipeline.push_back(*fwd_);
+
+  if (cvtOutVal_) {
+    pipeline.push_back(*cvtOutVal_);
+  }
+}
+
+void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
+                                      MKLDNNMatrixPtr& out) {
+  resetOutGrad(out);
+
+  resetInGrad(in);
+}
+void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
+  CHECK(outVal_) << "Should have output value";
+  out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc());
+
+  // create a reorder if the output gradient is on the CPU device and the
+  // primitive descs do not match
+  cpuOutGrad_ = nullptr;
+  cvtOutGrad_ = nullptr;
+  if (!outputIsOnlyMKLDNN()) {
+    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
+    cpuOutGrad_ = MKLDNNMatrix::create(
+        cpuOut, memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_);
+    if (cpuOutGrad_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
+      cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
+      CHECK(cvtOutGrad_) << "should not be empty";
+    } else {
+      // share the data of the CPU output
+      output_.grad->setData(cpuOut->getData());
+      out = cpuOutGrad_;
+    }
+  }
+}
+
+void MKLDNNPoolLayer::resetInGrad(MKLDNNMatrixPtr& in) {
+  in = nullptr;
+  const MatrixPtr& inGrad = inputLayers_[0]->getOutput().grad;
+  if (inGrad == nullptr) {
+    return;
+  }
+  CHECK(inVal_);
+  in = MKLDNNMatrix::create(inGrad, inVal_->getPrimitiveDesc());
+}
+
+void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
+                                 MKLDNNMatrixPtr& in,
+                                 MKLDNNMatrixPtr& out) {
+  memory::dims kernels = memory::dims{fh_, fw_};
+  memory::dims strides = memory::dims{sh_, sw_};
+  memory::dims padL = memory::dims{ph_, pw_};
+  memory::dims padR = getPaddingR();
+  CHECK(in);
+  CHECK(out);
+  auto bwdDesc = pool_bwd::desc(poolAlgo_,
+                                in->getMemoryDesc(),
+                                out->getMemoryDesc(),
+                                strides,
+                                kernels,
+                                padL,
+                                padR,
+                                padding_kind::zero);
+  pd.reset(new pool_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
+}
+
+void MKLDNNPoolLayer::resetBwdPipeline(
+    std::vector<mkldnn::primitive>& pipeline,
+    std::shared_ptr<pool_bwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& out) {
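+  // The backward pipeline runs the output-gradient reorder first (when the
+  // CPU-side gradient must be converted into the MKL-DNN format), and then
+  // the pooling backward-data primitive.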
+  pipeline.clear();
+  if (cvtOutGrad_) {
+    pipeline.push_back(*cvtOutGrad_);
+  }
+
+  bwdData_ =
+      workspace_
+          ? std::make_shared<pool_bwd>(pool_bwd(*pd, *out, *workspace_, *in))
+          : std::make_shared<pool_bwd>(pool_bwd(*pd, *out, *in));
+  pipeline.push_back(*bwdData_);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.h b/paddle/gserver/layers/MKLDNNPoolLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..891e15a7efcdd2e54f61352efc1ba7345b91c76b
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.h
@@ -0,0 +1,138 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+
+namespace paddle {
+typedef mkldnn::pooling_forward pool_fwd;
+typedef mkldnn::pooling_backward pool_bwd;
+
+/**
+ * @brief A subclass of MKLDNNLayer for the pool layer.
+ *
+ * The config file api is mkldnn_pool
+ */
+class MKLDNNPoolLayer : public MKLDNNLayer {
+protected:
+  // padding height and width
+  int ph_, pw_;
+  // stride height and width
+  int sh_, sw_;
+  // filter (kernel) height and width
+  int fh_, fw_;
+
+  // pooling_avg or pooling_max
+  mkldnn::algorithm poolAlgo_;
+
+  // MKLDNNMatrixPtr which should be created from the CPU device
+  MKLDNNMatrixPtr cpuOutVal_;
+  MKLDNNMatrixPtr cpuOutGrad_;
+  // conversion handles between the CPU device and the MKLDNN device
+  std::shared_ptr<mkldnn::reorder> cvtOutVal_;
+  std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
+
+  // save the forward primitive_desc, which can be used in the backward pass
+  std::shared_ptr<pool_fwd::primitive_desc> fwdPD_;
+  // according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
+  // test_pooling_forward.cpp, pooling needs a workspace for backward
+  std::shared_ptr<mkldnn::memory> workspace_;
+
+public:
+  explicit MKLDNNPoolLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
+
+  ~MKLDNNPoolLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void reshape(
+      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
+
+  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                MKLDNNMatrixPtr& in,
+                MKLDNNMatrixPtr& wgt,
+                MKLDNNMatrixPtr& bias,
+                MKLDNNMatrixPtr& out) override;
+
+  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                MKLDNNMatrixPtr& in,
+                MKLDNNMatrixPtr& wgt,
+                MKLDNNMatrixPtr& bias,
+                MKLDNNMatrixPtr& out) override;
+
+  void updateInputData() override;
+
+  void printSizeInfo() override {
+    MKLDNNLayer::printSizeInfo();
+    VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_
+                       << ": ph: " << ph_ << ", pw: " << pw_ << ", sh: " << sh_
+                       << ", sw: " << sw_;
+  }
+
+protected:
+  /**
+   * Forward functions: reset buffers (input, output),
+   *                    reset the primitive descriptor,
+   *                    reset the pipeline.
+ */ + void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out); + void resetInValue(MKLDNNMatrixPtr& in); + void resetOutValue(MKLDNNMatrixPtr& out); + void resetFwdPD(std::shared_ptr& pd, + MKLDNNMatrixPtr in, + MKLDNNMatrixPtr out); + void resetFwdPipeline(std::vector& pipeline, + std::shared_ptr& pd, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out); + + /** + * Backward functions: reset buffers(input, output), + * reset primitive descriptor, + * reset pipeline. + */ + void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out); + void resetOutGrad(MKLDNNMatrixPtr& out); + void resetInGrad(MKLDNNMatrixPtr& in); + void resetBwdPD(std::shared_ptr& pd, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out); + void resetBwdPipeline(std::vector& pipeline, + std::shared_ptr& pd, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out); + + /** + * get padding_r according to + * https://github.com/01org/mkl-dnn/blob/master/tests/gtests/ + * test_pooling_forward.cpp + */ + mkldnn::memory::dims getPaddingR() const { + mkldnn::memory::dims padR = {ph_, pw_}; + for (int i = 0; i < 2; ++i) { + if ((ih_ + ph_ + padR[0] - fh_) / sh_ + 1 < oh_) { + ++padR[0]; + } + if ((iw_ + pw_ + padR[1] - fw_) / sw_ + 1 < ow_) { + ++padR[1]; + } + } + return padR; + } +}; + +} // namespace paddle diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp index e70802881e3f22160a87b7a4babda07ffbcf9d6f..b593f65fe49ef2271ad7cd0f609c9b828be03037 100644 --- a/paddle/gserver/tests/test_MKLDNN.cpp +++ b/paddle/gserver/tests/test_MKLDNN.cpp @@ -141,6 +141,68 @@ TEST(MKLDNNLayer, ConvLayer) { testConvLayer({4, 4, 16, 3, 3, 16, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1}); } +struct testPoolDesc { + int bs, ch; // input channel and output channel are the same + int ih, iw; + int oh, ow; + int fh, fw; + int ph, pw; + int sh, sw; +}; + +void testPoolLayer(const testPoolDesc& pm) { + const std::string compareTypes[] = {"mkldnn_pool", "pool"}; + TestConfig cfg; + cfg.layerConfig.set_type(compareTypes[0]); + cfg.layerConfig.set_size(pm.ch * pm.oh * pm.ow); + cfg.inputDefs.push_back( + {INPUT_DATA, + "layer_0", + /* size of input layer= */ size_t(pm.ch * pm.ih * pm.iw), + 0}); + LayerInputConfig* input = cfg.layerConfig.add_inputs(); + PoolConfig* pool = input->mutable_pool_conf(); + // pool->set_pool_type(poolType); + pool->set_channels(pm.ch); + pool->set_img_size(pm.iw); + pool->set_img_size_y(pm.ih); + pool->set_output_x(pm.ow); + pool->set_output_y(pm.oh); + pool->set_size_x(pm.fw); + pool->set_size_y(pm.fh); + pool->set_padding(pm.pw); + pool->set_padding_y(pm.ph); + pool->set_stride(pm.sw); + pool->set_stride_y(pm.sh); + + int oh = outputSize(pm.ih, pm.fh, pm.ph, pm.sh, false); + int ow = outputSize(pm.iw, pm.fw, pm.pw, pm.sw, false); + CHECK_EQ(ow, pm.ow) << "output size check failed"; + CHECK_EQ(oh, pm.oh) << "output size check failed"; + + MKLDNNTester tester; + for (auto type : {"max-projection", "avg-projection"}) { + pool->set_pool_type(type); + TestConfig ref = cfg; + ref.layerConfig.set_type(compareTypes[1]); + for (auto bs : {pm.bs, 1}) { + tester.run(cfg, ref, bs, pm.ih, pm.iw); + } + } +} + +TEST(MkldnnLayer, PoolLayer) { + /* bs, ch, ih, iw, oh, ow, fh, fw, ph, pw, sh, sw*/ + testPoolLayer({2, 1, 4, 4, 2, 2, 3, 3, 0, 0, 2, 2}); + testPoolLayer({10, 8, 16, 16, 8, 8, 2, 2, 0, 0, 2, 2}); + testPoolLayer({4, 2, 5, 5, 3, 3, 3, 3, 1, 1, 2, 2}); + testPoolLayer({8, 16, 56, 56, 28, 28, 3, 3, 0, 0, 2, 2}); + testPoolLayer({8, 16, 14, 14, 7, 7, 3, 3, 0, 0, 2, 2}); + testPoolLayer({4, 16, 7, 7, 1, 1, 7, 7, 0, 0, 1, 
1}); + testPoolLayer({4, 2, 5, 5, 3, 3, 5, 5, 1, 1, 1, 1}); + testPoolLayer({2, 8, 56, 56, 29, 29, 3, 3, 1, 1, 2, 2}); +} + // TODO(TJ): add branch test int main(int argc, char** argv) { diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 4a2132c8d1bfa329ced575f9b78052bdbfe3e4d5..0023b4d0f5da500f380ecb836b7c54e050b13d67 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -1033,17 +1033,15 @@ void GpuMatrix::maxPoolForward(Matrix& inputMat, real* inputData = inputMat.getData(); size_t frameNum = inputMat.getHeight(); - size_t width = imgSizeW; - size_t height = imgSizeH; - CHECK(height * width * channels == inputMat.getWidth()); + CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth()); CHECK(height_ == inputMat.getHeight()); CHECK(width_ == outputH * outputW * channels); hl_maxpool_forward(frameNum, inputData, channels, - height, - width, + imgSizeH, + imgSizeW, outputH, outputW, sizeX, @@ -1080,11 +1078,8 @@ void GpuMatrix::maxPoolBackward(Matrix& inputMat, real* outDiff = outGrad.getData(); size_t frameNum = inputMat.getHeight(); size_t channels = outV.getWidth() / outputH / outputW; - size_t width = imgSizeW; - size_t height = imgSizeH; - CHECK(height * width * channels == inputMat.getWidth()); + CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth()); CHECK(height_ == inputMat.getHeight()); - CHECK(width_ == width * height * channels); CHECK(outGrad.getHeight() == outV.getHeight() && outGrad.getWidth() == outV.getWidth()); @@ -1093,8 +1088,8 @@ void GpuMatrix::maxPoolBackward(Matrix& inputMat, outData, outDiff, channels, - height, - width, + imgSizeH, + imgSizeW, outputH, outputW, sizeX, @@ -1125,17 +1120,15 @@ void GpuMatrix::avgPoolForward(Matrix& inputMat, real* inputData = inputMat.getData(); size_t frameNum = inputMat.getHeight(); - size_t height = imgSizeH; - size_t width = imgSizeW; - CHECK(height * width * channels == inputMat.getWidth()); + CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth()); CHECK(height_ == inputMat.getHeight()); CHECK(width_ == outputH * outputW * channels); hl_avgpool_forward(frameNum, inputData, channels, - height, - width, + imgSizeH, + imgSizeW, outputH, outputW, sizeX, @@ -1166,17 +1159,15 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad, real* outDiff = outGrad.getData(); size_t frameNum = outGrad.getHeight(); size_t channels = outGrad.getWidth() / outputH / outputW; - size_t height = imgSizeH; - size_t width = imgSizeW; - CHECK(height * width * channels == width_); + CHECK(imgSizeH * imgSizeW * channels == width_); CHECK(height_ == outGrad.getHeight()); CHECK(outGrad.getWidth() == outputH * outputW * channels); hl_avgpool_backward(frameNum, outDiff, channels, - height, - width, + imgSizeH, + imgSizeW, outputH, outputW, sizeX, @@ -1214,19 +1205,16 @@ void GpuMatrix::maxPool3DForward(Matrix& inputMat, real* inputData = inputMat.getData(); real* maxPoolIdxData = maxPoolIdx.getData(); size_t num = inputMat.getHeight(); - size_t width = imgSizeW; - size_t height = imgSizeH; - size_t depth = imgSizeD; - CHECK(depth * height * width * channels == inputMat.getWidth()); + CHECK(imgSizeD * imgSizeH * imgSizeW * channels == inputMat.getWidth()); CHECK(height_ == inputMat.getHeight()); CHECK(width_ == outputD * outputH * outputW * channels); hl_maxpool3D_forward(num, inputData, channels, - depth, - height, - width, + imgSizeD, + imgSizeH, + imgSizeW, outputD, outputH, outputW, @@ -1269,20 +1257,16 @@ void GpuMatrix::maxPool3DBackward(Matrix& outGrad, real* maxPoolIdxData = maxPoolIdx.getData(); size_t frameNum 
= getHeight(); size_t channels = outGrad.getWidth() / outputD / outputH / outputW; - size_t width = imgSizeW; - size_t height = imgSizeH; - size_t depth = imgSizeD; - CHECK(depth * height * width * channels == getWidth()); - CHECK(width_ == depth * width * height * channels); + CHECK(imgSizeD * imgSizeH * imgSizeW * channels == getWidth()); CHECK(outGrad.getHeight() == maxPoolIdx.getHeight() && outGrad.getWidth() == maxPoolIdx.getWidth()); hl_maxpool3D_backward(frameNum, outDiff, channels, - depth, - height, - width, + imgSizeD, + imgSizeH, + imgSizeW, outputD, outputH, outputW, @@ -1323,19 +1307,16 @@ void GpuMatrix::avgPool3DForward(Matrix& inputMat, real* inputData = inputMat.getData(); size_t frameNum = inputMat.getHeight(); - size_t height = imgSizeH; - size_t width = imgSizeW; - size_t depth = imgSizeD; - CHECK(depth * height * width * channels == inputMat.getWidth()); + CHECK(imgSizeD * imgSizeH * imgSizeW * channels == inputMat.getWidth()); CHECK(height_ == inputMat.getHeight()); CHECK(width_ == outputD * outputH * outputW * channels); hl_avgpool3D_forward(frameNum, inputData, channels, - depth, - height, - width, + imgSizeD, + imgSizeH, + imgSizeW, outputD, outputH, outputW, @@ -1375,19 +1356,16 @@ void GpuMatrix::avgPool3DBackward(Matrix& outGrad, real* outDiff = outGrad.getData(); size_t frameNum = outGrad.getHeight(); size_t channels = outGrad.getWidth() / outputD / outputH / outputW; - size_t height = imgSizeH; - size_t width = imgSizeW; - size_t depth = imgSizeD; - CHECK(depth * height * width * channels == width_); + CHECK(imgSizeD * imgSizeH * imgSizeW * channels == width_); CHECK(height_ == outGrad.getHeight()); CHECK(outGrad.getWidth() == outputD * outputH * outputW * channels); hl_avgpool3D_backward(frameNum, outDiff, channels, - depth, - height, - width, + imgSizeD, + imgSizeH, + imgSizeW, outputD, outputH, outputW, @@ -1999,11 +1977,11 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat, real* inputData = inputMat.getData(); real* outData = data_; size_t num = inputMat.getHeight(); - size_t inWidth = imgSizeW; - size_t inHeight = imgSizeH; - CHECK(inHeight * inWidth == inputMat.getWidth() / channels); + size_t inLength = imgSizeH * imgSizeW; + size_t outLength = outputH * outputW; + CHECK(inLength == inputMat.getWidth() / channels); CHECK_EQ(num, this->getHeight()); - CHECK_EQ(channels * outputH * outputW, this->getWidth()); + CHECK_EQ(channels * outLength, this->getWidth()); size_t outStride = getStride(); /* initialize the data_ */ @@ -2020,24 +1998,24 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat, } for (size_t c = 0; c < channels; ++c) { // channel by channel for (size_t ph = 0; ph < outputH; ++ph) { + int hstart = ph * strideH - paddingH; + int hend = std::min(hstart + sizeY, imgSizeH); + hstart = std::max(hstart, 0); for (size_t pw = 0; pw < outputW; ++pw) { - int hstart = ph * strideH - paddingH; int wstart = pw * strideW - paddingW; - int hend = std::min(hstart + sizeY, inHeight); - int wend = std::min(wstart + sizeX, inWidth); - hstart = std::max(hstart, 0); + int wend = std::min(wstart + sizeX, imgSizeW); wstart = std::max(wstart, 0); for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - outData[ph * outputW + pw] = std::max(outData[ph * outputW + pw], - inputData[h * inWidth + w]); + outData[ph * outputW + pw] = std::max( + outData[ph * outputW + pw], inputData[h * imgSizeW + w]); } } } } // compute offset - inputData += inHeight * inWidth; - outData += outputH * outputW; + inputData += inLength; + outData += outLength; } } } @@ 
-2058,8 +2036,10 @@ void CpuMatrix::maxPoolBackward(Matrix& image, size_t paddingH, size_t paddingW) { size_t num = image.getHeight(); - size_t channels = size_t(width_ / imgSizeH / imgSizeW); - CHECK(image.getWidth() == imgSizeH * imgSizeW * channels); + size_t inLength = imgSizeH * imgSizeW; + size_t outLength = outputH * outputW; + size_t channels = size_t(width_ / inLength); + CHECK(image.getWidth() == inLength * channels); CHECK(image.getHeight() == height_ && image.getWidth() == width_); CHECK(outV.getHeight() == outGrad.getHeight() && outV.getWidth() == outGrad.getWidth()); @@ -2080,12 +2060,12 @@ void CpuMatrix::maxPoolBackward(Matrix& image, } for (size_t c = 0; c < channels; ++c) { for (size_t ph = 0; ph < outputH; ++ph) { + int hstart = ph * strideH - paddingH; + int hend = std::min(hstart + sizeY, imgSizeH); + hstart = std::max(hstart, 0); for (size_t pw = 0; pw < outputW; ++pw) { - int hstart = ph * strideH - paddingH; int wstart = pw * strideW - paddingW; - int hend = std::min(hstart + sizeY, imgSizeH); int wend = std::min(wstart + sizeX, imgSizeW); - hstart = std::max(hstart, 0); wstart = std::max(wstart, 0); for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { @@ -2098,10 +2078,10 @@ void CpuMatrix::maxPoolBackward(Matrix& image, } } // offset - inData += imgSizeH * imgSizeW; - tgtGrad += imgSizeH * imgSizeW; - otData += outputH * outputW; - otGrad += outputH * outputW; + inData += inLength; + tgtGrad += inLength; + otData += outLength; + otGrad += outLength; } } } @@ -2120,10 +2100,10 @@ void CpuMatrix::avgPoolForward(Matrix& input, size_t paddingW) { // The main loop size_t num = input.getHeight(); - size_t inHeight = imgSizeH; - size_t inWidth = imgSizeW; - CHECK(inHeight * inWidth * channels == input.getWidth()); - CHECK(outputH * outputW * channels * num == height_ * width_); + size_t inLength = imgSizeH * imgSizeW; + size_t outLength = outputH * outputW; + CHECK(inLength * channels == input.getWidth()); + CHECK(outLength * channels * num == height_ * width_); real* tgtData = data_; real* inData = input.getData(); @@ -2133,30 +2113,27 @@ void CpuMatrix::avgPoolForward(Matrix& input, } for (size_t c = 0; c < channels; ++c) { for (size_t ph = 0; ph < outputH; ++ph) { + int hstart = ph * strideH - paddingH; + int hend = std::min(hstart + sizeY, imgSizeH); + hstart = std::max(hstart, 0); for (size_t pw = 0; pw < outputW; ++pw) { - int hstart = ph * strideH - paddingH; int wstart = pw * strideW - paddingW; - int hend = std::min(hstart + sizeY, inHeight + paddingH); - int wend = std::min(wstart + sizeX, inWidth + paddingW); - int poolSize = (hend - hstart) * (wend - wstart); - hstart = std::max(hstart, 0); + int wend = std::min(wstart + sizeX, imgSizeW); wstart = std::max(wstart, 0); - hend = std::min(hend, static_cast(inHeight)); - wend = std::min(wend, static_cast(inWidth)); - - CHECK(poolSize); tgtData[ph * outputW + pw] = 0; // clear for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { - tgtData[ph * outputW + pw] += inData[h * inWidth + w]; + tgtData[ph * outputW + pw] += inData[h * imgSizeW + w]; } } + int poolSize = (hend - hstart) * (wend - wstart); + CHECK(poolSize); tgtData[ph * outputW + pw] /= poolSize; } } // compute offset - inData += inHeight * inWidth; - tgtData += outputH * outputW; + inData += inLength; + tgtData += outLength; } } } @@ -2176,7 +2153,9 @@ void CpuMatrix::avgPoolBackward(Matrix& input, size_t paddingW) { size_t num = input.getHeight(); size_t channels = input.getWidth() / outputH / outputW; - 
CHECK(imgSizeH * imgSizeW * channels == getWidth()); + size_t inLength = imgSizeH * imgSizeW; + size_t outLength = outputH * outputW; + CHECK(inLength * channels == getWidth()); real* inData = input.getData(); real* outData = getData(); @@ -2186,16 +2165,14 @@ void CpuMatrix::avgPoolBackward(Matrix& input, } for (size_t c = 0; c < channels; ++c) { for (size_t ph = 0; ph < outputH; ++ph) { + int hstart = ph * strideH - paddingH; + int hend = std::min(hstart + sizeY, imgSizeH); + hstart = std::max(hstart, 0); for (size_t pw = 0; pw < outputW; ++pw) { - int hstart = ph * strideH - paddingH; int wstart = pw * strideW - paddingW; - int hend = std::min(hstart + sizeY, imgSizeH + paddingH); - int wend = std::min(wstart + sizeX, imgSizeW + paddingW); - int poolSize = (hend - hstart) * (wend - wstart); - hstart = std::max(hstart, 0); + int wend = std::min(wstart + sizeX, imgSizeW); wstart = std::max(wstart, 0); - hend = std::min(hend, static_cast(imgSizeH)); - wend = std::min(wend, static_cast(imgSizeW)); + int poolSize = (hend - hstart) * (wend - wstart); CHECK(poolSize); for (int h = hstart; h < hend; ++h) { @@ -2206,8 +2183,8 @@ void CpuMatrix::avgPoolBackward(Matrix& input, } } // offset - outData += imgSizeH * imgSizeW; - inData += outputH * outputW; + outData += inLength; + inData += outLength; } } } @@ -2234,12 +2211,11 @@ void CpuMatrix::maxPool3DForward(Matrix& inputMat, real* outData = getData(); real* maxPoolIdxData = maxPoolIdx.getData(); size_t num = inputMat.getHeight(); - size_t inWidth = imgSizeW; - size_t inHeight = imgSizeH; - size_t inDepth = imgSizeD; - CHECK(inHeight * inWidth * inDepth == inputMat.getWidth() / channels); + size_t inLength = imgSizeH * imgSizeW * imgSizeD; + size_t outLength = outputH * outputW * outputD; + CHECK(inLength == inputMat.getWidth() / channels); CHECK_EQ(num, this->getHeight()); - CHECK_EQ(channels * outputH * outputW * outputD, this->getWidth()); + CHECK_EQ(channels * outLength, this->getWidth()); size_t outStride = getStride(); /* initialize the data_ */ @@ -2258,16 +2234,16 @@ void CpuMatrix::maxPool3DForward(Matrix& inputMat, } for (size_t c = 0; c < channels; ++c) { // channel by channel for (size_t pd = 0; pd < outputD; ++pd) { + int dstart = pd * strideD - paddingD; + int dend = std::min(dstart + sizeZ, imgSizeD); + dstart = std::max(dstart, 0); for (size_t ph = 0; ph < outputH; ++ph) { + int hstart = ph * strideH - paddingH; + int hend = std::min(hstart + sizeY, imgSizeH); + hstart = std::max(hstart, 0); for (size_t pw = 0; pw < outputW; ++pw) { - int dstart = pd * strideD - paddingD; - int hstart = ph * strideH - paddingH; int wstart = pw * strideW - paddingW; - int dend = std::min(dstart + sizeZ, inDepth); - int hend = std::min(hstart + sizeY, inHeight); - int wend = std::min(wstart + sizeX, inWidth); - dstart = std::max(dstart, 0); - hstart = std::max(hstart, 0); + int wend = std::min(wstart + sizeX, imgSizeW); wstart = std::max(wstart, 0); int maxIdx = -1; real maxOutData = outData[(pd * outputH + ph) * outputW + pw]; @@ -2275,9 +2251,9 @@ void CpuMatrix::maxPool3DForward(Matrix& inputMat, for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { if (maxOutData < - inputData[(d * inHeight + h) * inWidth + w]) { - maxOutData = inputData[(d * inHeight + h) * inWidth + w]; - maxIdx = (d * inHeight + h) * inWidth + w; + inputData[(d * imgSizeH + h) * imgSizeW + w]) { + maxOutData = inputData[(d * imgSizeH + h) * imgSizeW + w]; + maxIdx = (d * imgSizeH + h) * imgSizeW + w; } } } @@ -2288,9 +2264,9 @@ void 
CpuMatrix::maxPool3DForward(Matrix& inputMat, } } // compute offset - inputData += inDepth * inHeight * inWidth; - outData += outputD * outputH * outputW; - maxPoolIdxData += outputD * outputH * outputW; + inputData += inLength; + outData += outLength; + maxPoolIdxData += outLength; } } } @@ -2315,7 +2291,9 @@ void CpuMatrix::maxPool3DBackward(Matrix& outGrad, real scaleTargets, real scaleOutput) { size_t num = getHeight(); - size_t channels = size_t(width_ / imgSizeD / imgSizeH / imgSizeW); + size_t inLength = imgSizeH * imgSizeW * imgSizeD; + size_t outLength = outputH * outputW * outputD; + size_t channels = size_t(width_ / inLength); CHECK(maxPoolIdx.getHeight() == outGrad.getHeight() && maxPoolIdx.getWidth() == outGrad.getWidth()); @@ -2341,9 +2319,9 @@ void CpuMatrix::maxPool3DBackward(Matrix& outGrad, } } // offset - tgtGrad += imgSizeD * imgSizeH * imgSizeW; - otGrad += outputD * outputH * outputW; - maxPoolIdxData += outputD * outputH * outputW; + tgtGrad += inLength; + otGrad += outLength; + maxPoolIdxData += outLength; } } } @@ -2367,11 +2345,10 @@ void CpuMatrix::avgPool3DForward(Matrix& input, size_t paddingW) { // The main loop size_t num = input.getHeight(); - size_t inDepth = imgSizeD; - size_t inHeight = imgSizeH; - size_t inWidth = imgSizeW; - CHECK(inDepth * inHeight * inWidth * channels == input.getWidth()); - CHECK(outputD * outputH * outputW * channels * num == height_ * width_); + size_t inLength = imgSizeH * imgSizeW * imgSizeD; + size_t outLength = outputH * outputW * outputD; + CHECK(inLength * channels == input.getWidth()); + CHECK(outLength * channels * num == height_ * width_); real* tgtData = getData(); real* inData = input.getData(); @@ -2381,39 +2358,36 @@ void CpuMatrix::avgPool3DForward(Matrix& input, } for (size_t c = 0; c < channels; ++c) { for (size_t pd = 0; pd < outputD; ++pd) { + int dstart = pd * strideD - paddingD; + int dend = std::min(dstart + sizeZ, imgSizeD); + dstart = std::max(dstart, 0); for (size_t ph = 0; ph < outputH; ++ph) { + int hstart = ph * strideH - paddingH; + int hend = std::min(hstart + sizeY, imgSizeH); + hstart = std::max(hstart, 0); for (size_t pw = 0; pw < outputW; ++pw) { - int dstart = pd * strideD - paddingD; - int hstart = ph * strideH - paddingH; int wstart = pw * strideW - paddingW; - int dend = std::min(dstart + sizeZ, inDepth + paddingD); - int hend = std::min(hstart + sizeY, inHeight + paddingH); - int wend = std::min(wstart + sizeX, inWidth + paddingW); - int poolSize = (dend - dstart) * (hend - hstart) * (wend - wstart); - dstart = std::max(dstart, 0); - hstart = std::max(hstart, 0); + int wend = std::min(wstart + sizeX, imgSizeW); wstart = std::max(wstart, 0); - dend = std::min(dend, static_cast(inDepth)); - hend = std::min(hend, static_cast(inHeight)); - wend = std::min(wend, static_cast(inWidth)); - CHECK(poolSize); tgtData[(pd * outputH + ph) * outputW + pw] = 0; // clear for (int d = dstart; d < dend; ++d) { for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { tgtData[(pd * outputH + ph) * outputW + pw] += - inData[(d * inHeight + h) * inWidth + w]; + inData[(d * imgSizeH + h) * imgSizeW + w]; } } } + int poolSize = (dend - dstart) * (hend - hstart) * (wend - wstart); + CHECK(poolSize); tgtData[(pd * outputH + ph) * outputW + pw] /= poolSize; } } } // compute offset - inData += inDepth * inHeight * inWidth; - tgtData += outputD * outputH * outputW; + inData += inLength; + tgtData += outLength; } } } @@ -2437,8 +2411,10 @@ void CpuMatrix::avgPool3DBackward(Matrix& input, real 
scaleTargets, real scaleOutput) { size_t num = input.getHeight(); - size_t channels = input.getWidth() / outputD / outputH / outputW; - CHECK(imgSizeD * imgSizeH * imgSizeW * channels == getWidth()); + size_t inLength = imgSizeH * imgSizeW * imgSizeD; + size_t outLength = outputH * outputW * outputD; + size_t channels = input.getWidth() / outLength; + CHECK(inLength * channels == getWidth()); real* inData = input.getData(); real* outData = getData(); @@ -2448,21 +2424,18 @@ void CpuMatrix::avgPool3DBackward(Matrix& input, } for (size_t c = 0; c < channels; ++c) { for (size_t pd = 0; pd < outputD; ++pd) { + int dstart = pd * strideD - paddingD; + int dend = std::min(dstart + sizeZ, imgSizeD); + dstart = std::max(dstart, 0); for (size_t ph = 0; ph < outputH; ++ph) { + int hstart = ph * strideH - paddingH; + int hend = std::min(hstart + sizeY, imgSizeH); + hstart = std::max(hstart, 0); for (size_t pw = 0; pw < outputW; ++pw) { - int dstart = pd * strideD - paddingD; - int hstart = ph * strideH - paddingH; int wstart = pw * strideW - paddingW; - int dend = std::min(dstart + sizeZ, imgSizeD + paddingD); - int hend = std::min(hstart + sizeY, imgSizeH + paddingH); - int wend = std::min(wstart + sizeX, imgSizeW + paddingW); - int poolSize = (dend - dstart) * (hend - hstart) * (wend - wstart); - dstart = std::max(dstart, 0); - hstart = std::max(hstart, 0); + int wend = std::min(wstart + sizeX, imgSizeW); wstart = std::max(wstart, 0); - dend = std::min(dend, static_cast(imgSizeD)); - hend = std::min(hend, static_cast(imgSizeH)); - wend = std::min(wend, static_cast(imgSizeW)); + int poolSize = (dend - dstart) * (hend - hstart) * (wend - wstart); CHECK(poolSize); for (int d = dstart; d < dend; ++d) { for (int h = hstart; h < hend; ++h) { @@ -2476,8 +2449,8 @@ void CpuMatrix::avgPool3DBackward(Matrix& input, } } // offset - outData += imgSizeD * imgSizeH * imgSizeW; - inData += outputD * outputH * outputW; + outData += inLength; + inData += outLength; } } } diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp index 103f06acc57d7a23f019f5e713f6cacf2179e9e0..061fb22e3fd744d9d9895fd1008089e4a6ce6a0f 100644 --- a/paddle/math/tests/test_matrixCompare.cpp +++ b/paddle/math/tests/test_matrixCompare.cpp @@ -825,9 +825,8 @@ void testMaxPoolFwdBwd(int numSamples, int strideW, int padH, int padW) { - int outH = 0, outW = 0; - outH = (imgSizeH - ksizeH + 2 * padH + strideH - 1) / strideH + 1; - outW = (imgSizeW - ksizeW + 2 * padW + strideW - 1) / strideW + 1; + int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true); + int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true); int inWidth = imgSizeH * imgSizeW * channels; MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); @@ -927,9 +926,8 @@ void testAvgPoolFwdBwd(int numSamples, int strideW, int padH, int padW) { - int outH = 0, outW = 0; - outH = (imgSizeH - ksizeH + 2 * padH + strideH - 1) / strideH + 1; - outW = (imgSizeW - ksizeW + 2 * padW + strideW - 1) / strideW + 1; + int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true); + int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true); int inWidth = imgSizeH * imgSizeW * channels; MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false); diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc index 1e86fc3d166077265e0f433a6712b0665ea5a152..def4b01da098fc960ce7c0e497732fbcc2579945 100644 --- a/paddle/operators/math/math_function.cc +++ b/paddle/operators/math/math_function.cc 
@@ -19,12 +19,13 @@ namespace operators { namespace math { template <> -void gemm(const CBLAS_TRANSPOSE transA, +void gemm(const platform::DeviceContext& context, + const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const float alpha, const float* A, - const float* B, const float beta, float* C, - platform::DeviceContext* context) { + const float* B, const float beta, + float* C) { int lda = (transA == CblasNoTrans) ? K : M; int ldb = (transB == CblasNoTrans) ? N : K; int ldc = N; @@ -33,13 +34,13 @@ void gemm(const CBLAS_TRANSPOSE transA, } template <> -void gemm(const CBLAS_TRANSPOSE transA, +void gemm(const platform::DeviceContext& context, + const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const double alpha, const double* A, const double* B, const double beta, - double* C, - platform::DeviceContext* context) { + double* C) { int lda = (transA == CblasNoTrans) ? K : M; int ldb = (transB == CblasNoTrans) ? N : K; int ldc = N; @@ -48,13 +49,10 @@ void gemm(const CBLAS_TRANSPOSE transA, } template <> -void matmul(const framework::Tensor& matrix_a, - bool trans_a, - const framework::Tensor& matrix_b, - bool trans_b, float alpha, - framework::Tensor* matrix_out, - float beta, - platform::DeviceContext* context) { +void matmul( + const platform::DeviceContext& context, const framework::Tensor& matrix_a, + bool trans_a, const framework::Tensor& matrix_b, bool trans_b, float alpha, + framework::Tensor* matrix_out, float beta) { auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); auto dim_out = matrix_out->dims(); @@ -74,18 +72,15 @@ void matmul(const framework::Tensor& matrix_a, CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; gemm( - transA, transB, M, N, K, alpha, matrix_a.data(), - matrix_b.data(), beta, matrix_out->data(), context); + context, transA, transB, M, N, K, alpha, matrix_a.data(), + matrix_b.data(), beta, matrix_out->data()); } template <> -void matmul(const framework::Tensor& matrix_a, - bool trans_a, - const framework::Tensor& matrix_b, - bool trans_b, double alpha, - framework::Tensor* matrix_out, - double beta, - platform::DeviceContext* context) { +void matmul( + const platform::DeviceContext& context, const framework::Tensor& matrix_a, + bool trans_a, const framework::Tensor& matrix_b, bool trans_b, double alpha, + framework::Tensor* matrix_out, double beta) { auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); auto dim_out = matrix_out->dims(); @@ -105,8 +100,8 @@ void matmul(const framework::Tensor& matrix_a, CBLAS_TRANSPOSE transB = (trans_b == false) ? 
   CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
 
   gemm<platform::CPUPlace, double>(
-      transA, transB, M, N, K, alpha, matrix_a.data<double>(),
-      matrix_b.data<double>(), beta, matrix_out->data<double>(), context);
+      context, transA, transB, M, N, K, alpha, matrix_a.data<double>(),
+      matrix_b.data<double>(), beta, matrix_out->data<double>());
 }
 
 }  // namespace math
diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu
index da40b27c948918e4997f4a046d2145552296158b..71563b77b4b262c3f1e17ae7c4381da56ba780a3 100644
--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
@@ -19,12 +19,13 @@ namespace operators {
 namespace math {
 
 template <>
-void gemm<platform::GPUPlace, float>(const CBLAS_TRANSPOSE transA,
+void gemm<platform::GPUPlace, float>(const platform::DeviceContext& context,
+                                     const CBLAS_TRANSPOSE transA,
                                      const CBLAS_TRANSPOSE transB, const int M,
                                      const int N, const int K,
                                      const float alpha, const float* A,
-                                     const float* B, const float beta, float* C,
-                                     platform::DeviceContext* context) {
+                                     const float* B, const float beta,
+                                     float* C) {
   // Note that cublas follows fortran order, so the order is different from
   // the cblas convention.
   int lda = (transA == CblasNoTrans) ? K : M;
@@ -35,18 +36,19 @@ void gemm<platform::GPUPlace, float>(const CBLAS_TRANSPOSE transA,
       (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
 
   PADDLE_ENFORCE(platform::dynload::cublasSgemm(
-      reinterpret_cast<platform::CUDADeviceContext*>(context)->cublas_handle(),
+      reinterpret_cast<const platform::CUDADeviceContext&>(context)
+          .cublas_handle(),
       cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
 }
 
 template <>
-void gemm<platform::GPUPlace, double>(const CBLAS_TRANSPOSE transA,
+void gemm<platform::GPUPlace, double>(const platform::DeviceContext& context,
+                                      const CBLAS_TRANSPOSE transA,
                                       const CBLAS_TRANSPOSE transB, const int M,
                                       const int N, const int K,
                                       const double alpha, const double* A,
                                       const double* B, const double beta,
-                                      double* C,
-                                      platform::DeviceContext* context) {
+                                      double* C) {
   // Note that cublas follows fortran order, so the order is different from
   // the cblas convention.
   int lda = (transA == CblasNoTrans) ? K : M;
@@ -56,18 +58,16 @@ void gemm<platform::GPUPlace, double>(const CBLAS_TRANSPOSE transA,
   cublasOperation_t cuTransB =
       (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
   PADDLE_ENFORCE(platform::dynload::cublasDgemm(
-      reinterpret_cast<platform::CUDADeviceContext*>(context)->cublas_handle(),
+      reinterpret_cast<const platform::CUDADeviceContext&>(context)
+          .cublas_handle(),
       cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N));
 }
 
 template <>
-void matmul<platform::GPUPlace, float>(const framework::Tensor& matrix_a,
-                                       bool trans_a,
-                                       const framework::Tensor& matrix_b,
-                                       bool trans_b, float alpha,
-                                       framework::Tensor* matrix_out,
-                                       float beta,
-                                       platform::DeviceContext* context) {
+void matmul<platform::GPUPlace, float>(
+    const platform::DeviceContext& context, const framework::Tensor& matrix_a,
+    bool trans_a, const framework::Tensor& matrix_b, bool trans_b, float alpha,
+    framework::Tensor* matrix_out, float beta) {
   auto dim_a = matrix_a.dims();
   auto dim_b = matrix_b.dims();
   auto dim_out = matrix_out->dims();
@@ -87,18 +87,15 @@ void matmul<platform::GPUPlace, float>(const framework::Tensor& matrix_a,
   CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
 
   gemm<platform::GPUPlace, float>(
-      transA, transB, M, N, K, alpha, matrix_a.data<float>(),
-      matrix_b.data<float>(), beta, matrix_out->data<float>(), context);
+      context, transA, transB, M, N, K, alpha, matrix_a.data<float>(),
+      matrix_b.data<float>(), beta, matrix_out->data<float>());
 }
 
 template <>
-void matmul<platform::GPUPlace, double>(const framework::Tensor& matrix_a,
-                                        bool trans_a,
-                                        const framework::Tensor& matrix_b,
-                                        bool trans_b, double alpha,
-                                        framework::Tensor* matrix_out,
-                                        double beta,
-                                        platform::DeviceContext* context) {
+void matmul<platform::GPUPlace, double>(
+    const platform::DeviceContext& context, const framework::Tensor& matrix_a,
+    bool trans_a, const framework::Tensor& matrix_b, bool trans_b, double alpha,
+    framework::Tensor* matrix_out, double beta) {
   auto dim_a = matrix_a.dims();
   auto dim_b = matrix_b.dims();
   auto dim_out = matrix_out->dims();
@@ -118,8 +115,8 @@ void matmul<platform::GPUPlace, double>(const framework::Tensor& matrix_a,
   CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
 
   gemm<platform::GPUPlace, double>(
-      transA, transB, M, N, K, alpha, matrix_a.data<double>(),
-      matrix_b.data<double>(), beta, matrix_out->data<double>(), context);
+      context, transA, transB, M, N, K, alpha, matrix_a.data<double>(),
+      matrix_b.data<double>(), beta, matrix_out->data<double>());
 }
 
 }  // namespace math
diff --git a/paddle/operators/math/math_function.h b/paddle/operators/math/math_function.h
index 155589fadb3ed9f59160a750d546dd8093a56cbe..d8518e77fa7b4abdbcf08b7983013c24806e14ca 100644
--- a/paddle/operators/math/math_function.h
+++ b/paddle/operators/math/math_function.h
@@ -66,16 +66,16 @@ namespace math {
 // For more detailed info, please refer to
 // http://www.netlib.org/lapack/explore-html/d4/de2/sgemm_8f.html
 template <typename Place, typename T>
-void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
-          const int M, const int N, const int K, const T alpha, const T* A,
-          const T* B, const T beta, T* C, platform::DeviceContext* context);
+void gemm(const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA,
+          const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
+          const T alpha, const T* A, const T* B, const T beta, T* C);
 
 // matrix multiply with continuous memory
 template <typename Place, typename T>
-void matmul(const framework::Tensor& matrix_a, bool trans_a,
+void matmul(const platform::DeviceContext& context,
+            const framework::Tensor& matrix_a, bool trans_a,
             const framework::Tensor& matrix_b, bool trans_b, T alpha,
-            framework::Tensor* matrix_out, T beta,
-            platform::DeviceContext* context);
+            framework::Tensor* matrix_out, T beta);
 
 }  // namespace math
 }  // namespace operators
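Note: after this signature change, every call site passes the `DeviceContext` first and by `const` reference, which is what removes the `const_cast` workaround deleted from `mul_op.h` below. A minimal sketch of a CPU call site; `MatMulF32` is a hypothetical wrapper for illustration, not part of the patch:

```cpp
#include "paddle/operators/math/math_function.h"

namespace math = paddle::operators::math;
namespace plat = paddle::platform;

// out = a * b in single precision on the CPU, using the context-first,
// const-reference signature introduced above.
void MatMulF32(const plat::CPUDeviceContext& context,
               const paddle::framework::Tensor& a,
               const paddle::framework::Tensor& b,
               paddle::framework::Tensor* out) {
  math::matmul<plat::CPUPlace, float>(context, a, /*trans_a=*/false, b,
                                      /*trans_b=*/false, 1.0f, out, 0.0f);
}
```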
diff --git a/paddle/operators/math/math_function_test.cc b/paddle/operators/math/math_function_test.cc
index 6c020c4ff7285b43bc5836d80c173d3a068e72b3..7e339457f7f08ff16162f399064a4b4dca594d7f 100644
--- a/paddle/operators/math/math_function_test.cc
+++ b/paddle/operators/math/math_function_test.cc
@@ -15,8 +15,7 @@ TEST(math_function, notrans_mul_trans) {
   memcpy(input1_ptr, arr, 6 * sizeof(float));
 
   auto* gpu_place = new paddle::platform::GPUPlace(0);
-  paddle::platform::DeviceContext* context =
-      new paddle::platform::CUDADeviceContext(*gpu_place);
+  paddle::platform::CUDADeviceContext context(*gpu_place);
 
   input1_gpu.CopyFrom<float>(input1, *gpu_place);
   input2_gpu.CopyFrom<float>(input1, *gpu_place);
@@ -24,7 +23,7 @@ TEST(math_function, notrans_mul_trans) {
   out_gpu.mutable_data<float>({2, 2}, *gpu_place);
 
   paddle::operators::math::matmul<paddle::platform::GPUPlace, float>(
-      input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0, context);
+      context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
 
   out.CopyFrom<float>(out_gpu, *cpu_place);
 
@@ -33,6 +32,7 @@ TEST(math_function, notrans_mul_trans) {
   EXPECT_EQ(out_ptr[1], 14);
   EXPECT_EQ(out_ptr[2], 14);
   EXPECT_EQ(out_ptr[3], 50);
+  delete gpu_place;
 }
 
 TEST(math_function, trans_mul_notrans) {
@@ -48,8 +48,7 @@ TEST(math_function, trans_mul_notrans) {
   memcpy(input1_ptr, arr, 6 * sizeof(float));
 
   auto* gpu_place = new paddle::platform::GPUPlace(0);
-  paddle::platform::DeviceContext* context =
-      new paddle::platform::CUDADeviceContext(*gpu_place);
+  paddle::platform::CUDADeviceContext context(*gpu_place);
 
   input1_gpu.CopyFrom<float>(input1, *gpu_place);
   input2_gpu.CopyFrom<float>(input1, *gpu_place);
@@ -57,7 +56,7 @@ TEST(math_function, trans_mul_notrans) {
   out_gpu.mutable_data<float>({3, 3}, *gpu_place);
 
   paddle::operators::math::matmul<paddle::platform::GPUPlace, float>(
-      input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0, context);
+      context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
 
   out.CopyFrom<float>(out_gpu, *cpu_place);
 
@@ -71,5 +70,6 @@ TEST(math_function, trans_mul_notrans) {
   EXPECT_EQ(out_ptr[6], 15);
   EXPECT_EQ(out_ptr[7], 22);
   EXPECT_EQ(out_ptr[8], 29);
+  delete gpu_place;
 }
 #endif
diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h
index 3c01f868bda8cba488b3403df456d63d6b082fa6..ac7136a76933d1f3ead86518c65d589747227631 100644
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
@@ -46,10 +46,8 @@ class MulKernel : public framework::OpKernel {
                            : *y;
 
     z->mutable_data<T>(context.GetPlace());
-    auto* device_context =
-        const_cast<platform::DeviceContext*>(context.device_context_);
-    math::matmul<Place, T>(x_matrix, false, y_matrix, false, 1, z, 0,
-                           device_context);
+    math::matmul<Place, T>(context.device_context(), x_matrix, false, y_matrix,
+                           false, 1, z, 0);
   }
 };
 
@@ -71,16 +69,14 @@ class MulGradKernel : public framework::OpKernel {
     Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     Tensor* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
 
-    auto* device_context =
-        const_cast<platform::DeviceContext*>(ctx.device_context_);
     if (dx) {
       dx->mutable_data<T>(ctx.GetPlace());
       Tensor dx_matrix = dx->dims().size() > 2 ? framework::ReshapeToMatrix<T>(
                                                      *dx, x_num_col_dims)
                                                : *dx;
       // dx = dout * y'. dx: M x K, dout : M x N, y : K x N
-      math::matmul<Place, T>(*dout, false, y_matrix, true, 1, &dx_matrix, 0,
-                             device_context);
+      math::matmul<Place, T>(ctx.device_context(), *dout, false, y_matrix, true,
+                             1, &dx_matrix, 0);
     }
     if (dy) {
      dy->mutable_data<T>(ctx.GetPlace());
@@ -88,8 +84,8 @@ class MulGradKernel : public framework::OpKernel {
                                                      *dy, y_num_col_dims)
                                                : *dy;
      // dy = x' * dout. dy : K x N, dout : M x N, x : M x K
-      math::matmul<Place, T>(x_matrix, true, *dout, false, 1, &dy_matrix, 0,
-                             device_context);
+      math::matmul<Place, T>(ctx.device_context(), x_matrix, true, *dout, false,
+                             1, &dy_matrix, 0);
    }
  }
};
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index 8b605e51c3f4ea38fc358ce054bb36fcc82063c4..daf519b91d623d4369774dc4e37dcb7b1733666b 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -24,4 +24,4 @@ cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator
 nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info)
 
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
-nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place)
+nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc
index ad212c5b2c47312743362db4926c80bf056e100d..93b472b41c8a4c3a2bfada9d4fbf0e9e1b0cc736 100644
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -101,19 +101,17 @@ CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) {
   eigen_stream_.reset(new EigenCudaStreamDevice());
   eigen_stream_->Reinitialize(&stream_, place);
   eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
+  PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_));
+  PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_));
+  PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
+  PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_));
 }
 
 CUDADeviceContext::~CUDADeviceContext() {
   SetDeviceId(place_.device);
   Wait();
-  if (cublas_handle_) {
-    PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_));
-  }
-
-  if (cudnn_handle_) {
-    PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
-  }
-
+  PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_));
+  PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
   eigen_stream_.reset();
   eigen_device_.reset();
   PADDLE_ENFORCE(cudaStreamDestroy(stream_));
@@ -129,25 +127,13 @@ Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
   return eigen_device_.get();
 }
 
-cublasHandle_t CUDADeviceContext::cublas_handle() {
-  if (!cublas_handle_) {
-    SetDeviceId(place_.device);
-    PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_));
-    PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_));
-  }
+cublasHandle_t CUDADeviceContext::cublas_handle() const {
   return cublas_handle_;
 }
 
-cudnnHandle_t CUDADeviceContext::cudnn_handle() {
-  if (!cudnn_handle_) {
-    SetDeviceId(place_.device);
-    PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
-    PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_));
-  }
-  return cudnn_handle_;
-}
+cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; }
 
-cudaStream_t CUDADeviceContext::stream() { return stream_; }
+cudaStream_t CUDADeviceContext::stream() const { return stream_; }
 
 #endif  // PADDLE_ONLY_CPU
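Note: creating the cuBLAS/cuDNN handles eagerly in the constructor is what lets the accessors become `const` (and lets callers hold a `const CUDADeviceContext&`). The pattern in isolation, as a sketch; `StreamBoundCublas` is illustrative only, with error handling and the cuDNN half elided:

```cpp
#include <cublas_v2.h>

// Acquire the handle once, bind it to the context's stream, and expose it
// through a const accessor, so no lazy-initialization branch is needed.
class StreamBoundCublas {
 public:
  explicit StreamBoundCublas(cudaStream_t stream) {
    cublasCreate(&handle_);
    cublasSetStream(handle_, stream);
  }
  ~StreamBoundCublas() { cublasDestroy(handle_); }
  cublasHandle_t handle() const { return handle_; }

 private:
  cublasHandle_t handle_;
};
```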
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index 11528e1194e4516891034fa8febdac3ba6eed204..a106592e454e21c46cd2f87f1bbf6694955d6e23 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -67,16 +67,14 @@ class CUDADeviceContext : public DeviceContext {
   /*! \brief Return eigen device in the device context. */
   Eigen::GpuDevice* eigen_device() const;
 
-  // clang-format off
   /*! \brief Return cublas handle in the device context. */
-  cublasHandle_t cublas_handle();
+  cublasHandle_t cublas_handle() const;
 
   /*! \brief Return cudnn handle in the device context. */
-  cudnnHandle_t cudnn_handle();
+  cudnnHandle_t cudnn_handle() const;
 
   /*! \brief Return cuda stream in the device context. */
-  cudaStream_t stream();
-  // clang-format on
+  cudaStream_t stream() const;
 
  private:
   GPUPlace place_;
@@ -84,11 +82,9 @@ class CUDADeviceContext : public DeviceContext {
   std::unique_ptr<Eigen::GpuDevice> eigen_device_;
   std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
 
-  // clang-format off
-  cudaStream_t stream_{nullptr};
-  cudnnHandle_t cudnn_handle_{nullptr};
-  cublasHandle_t cublas_handle_{nullptr};
-  // clang-format on
+  cudaStream_t stream_;
+  cudnnHandle_t cudnn_handle_;
+  cublasHandle_t cublas_handle_;
 };
 
 #endif
diff --git a/paddle/platform/transform.h b/paddle/platform/transform.h
index 3ee4acd29660f201d318ce6d39baa6f3999ae274..8eaab047fd4daa386f5ebdbb99a4caeed5fe2fbf 100644
--- a/paddle/platform/transform.h
+++ b/paddle/platform/transform.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include "paddle/platform/device_context.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/hostdevice.h"
 #include "paddle/platform/place.h"
@@ -21,6 +22,7 @@
 #include <algorithm>
 #include <type_traits>
 #ifdef __NVCC__
+#include <thrust/execution_policy.h>
 #include <thrust/transform.h>
 #include "paddle/platform/details/device_ptr_cast.h"
 #endif
@@ -28,34 +30,39 @@ namespace paddle {
 namespace platform {
 
 // Transform on host or device. It provides the same API in std library.
-template <typename Place, typename InputIter, typename OutputIter,
-          typename UnaryOperation>
-void Transform(Place place, InputIter first, InputIter last, OutputIter result,
-               UnaryOperation op) {
+template <typename InputIter, typename OutputIter, typename UnaryOperation>
+void Transform(const DeviceContext& context, InputIter first, InputIter last,
+               OutputIter result, UnaryOperation op) {
+  auto place = context.GetPlace();
   if (is_cpu_place(place)) {
     std::transform(first, last, result, op);
   } else {
 #ifdef __NVCC__
+    auto& ctx = reinterpret_cast<const CUDADeviceContext&>(context);
     using namespace details;
-    thrust::transform(DevPtrCast(first), DevPtrCast(last), DevPtrCast(result),
-                      op);
+    thrust::transform(thrust::cuda::par.on(ctx.stream()), DevPtrCast(first),
+                      DevPtrCast(last), DevPtrCast(result), op);
 #else
     PADDLE_THROW("Do not invoke `Transform` in .cc file");
 #endif
   }
 }
 
-template <typename Place, typename InputIter1, typename InputIter2,
-          typename OutputIter, typename BinaryOperation>
-void Transform(Place place, InputIter1 first1, InputIter1 last1,
-               InputIter2 first2, OutputIter result, BinaryOperation op) {
+template <typename InputIter1, typename InputIter2, typename OutputIter,
+          typename BinaryOperation>
+void Transform(const DeviceContext& context, InputIter1 first1,
+               InputIter1 last1, InputIter2 first2, OutputIter result,
+               BinaryOperation op) {
+  auto place = context.GetPlace();
   if (is_cpu_place(place)) {
     std::transform(first1, last1, first2, result, op);
   } else {
 #ifdef __NVCC__
+    auto& ctx = reinterpret_cast<const CUDADeviceContext&>(context);
     using namespace details;
-    thrust::transform(DevPtrCast(first1), DevPtrCast(last1), DevPtrCast(first2),
-                      DevPtrCast(result), op);
+    thrust::transform(thrust::cuda::par.on(ctx.stream()), DevPtrCast(first1),
+                      DevPtrCast(last1), DevPtrCast(first2), DevPtrCast(result),
+                      op);
 #else
     PADDLE_THROW("Do not invoke `Transform` in .cc file");
 #endif
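Note: callers now hand `Transform` a `DeviceContext` instead of a bare `Place`; on the GPU path the thrust call is enqueued on that context's stream, which is why the tests below add `ctx.Wait()` before copying results back. A minimal CPU-side usage sketch (the `Scale` functor mirrors the one in the test file):

```cpp
#include "paddle/platform/transform.h"

struct Scale {
  float s;
  explicit Scale(float s) : s(s) {}
  HOSTDEVICE float operator()(float x) const { return x * s; }
};

// Scales buf in place; a CPUDeviceContext reports a CPU place, so this
// dispatches to std::transform rather than thrust.
void ScaleInPlace(float* buf, int n, float s) {
  paddle::platform::CPUDeviceContext ctx;
  paddle::platform::Transform(ctx, buf, buf + n, buf, Scale(s));
}
```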
diff --git a/paddle/platform/transform_test.cu b/paddle/platform/transform_test.cu
index 600fed8f45077a6fee91f295aa854153c9cf9c01..b8a6200bb03c9a40b67be8d113012856e2a407e9 100644
--- a/paddle/platform/transform_test.cu
+++ b/paddle/platform/transform_test.cu
@@ -36,8 +36,9 @@ class Multiply {
 
 TEST(Transform, CPUUnary) {
   using namespace paddle::platform;
+  CPUDeviceContext ctx;
   float buf[4] = {0.1, 0.2, 0.3, 0.4};
-  Transform(CPUPlace(), buf, buf + 4, buf, Scale<float>(10));
+  Transform(ctx, buf, buf + 4, buf, Scale<float>(10));
   for (int i = 0; i < 4; ++i) {
     ASSERT_NEAR(buf[i], static_cast<float>(i + 1), 1e-5);
   }
@@ -47,10 +48,12 @@ TEST(Transform, GPUUnary) {
   using namespace paddle::platform;
   using namespace paddle::memory;
   GPUPlace gpu0(0);
+  CUDADeviceContext ctx(gpu0);
   float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
   float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4));
   Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf));
-  Transform(gpu0, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
+  Transform(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
+  ctx.Wait();
   Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf));
   Free(gpu0, gpu_buf);
   for (int i = 0; i < 4; ++i) {
@@ -62,7 +65,7 @@ TEST(Transform, CPUBinary) {
   using namespace paddle::platform;
   using namespace paddle::memory;
   int buf[4] = {1, 2, 3, 4};
-  Transform(CPUPlace(), buf, buf + 4, buf, buf, Multiply<int>());
+  Transform(CPUDeviceContext(), buf, buf + 4, buf, buf, Multiply<int>());
   for (int i = 0; i < 4; ++i) {
     ASSERT_EQ((i + 1) * (i + 1), buf[i]);
   }
@@ -73,9 +76,11 @@ TEST(Transform, GPUBinary) {
   using namespace paddle::memory;
   int buf[4] = {1, 2, 3, 4};
   GPUPlace gpu0(0);
+  CUDADeviceContext ctx(gpu0);
   int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf)));
   Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf));
-  Transform(gpu0, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
+  Transform(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
+  ctx.Wait();
   Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf));
   Free(gpu0, gpu_buf);
   for (int i = 0; i < 4; ++i) {
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index a9e1d6d2e06d56f837690ec95fa8f8d41a90725f..7c32eb0069f4075d72cd4c3654c83e3d5c98fb1c 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -2286,8 +2286,15 @@ class NormLayer(LayerBase):
 
 @config_layer('pool')
 class PoolLayer(LayerBase):
+    layer_type = 'pool'
+
     def __init__(self, name, inputs, ceil_mode=True, **xargs):
-        super(PoolLayer, self).__init__(name, 'pool', 0, inputs=inputs, **xargs)
+        use_mkldnn = int(g_command_config_args.get("use_mkldnn", 0))
+        if self.layer_type == "mkldnn_pool":
+            config_assert(use_mkldnn, "mkldnn_pool only support MKLDNN")
+        self.layer_type = 'mkldnn_pool' if use_mkldnn else 'pool'
+        super(PoolLayer, self).__init__(
+            name, self.layer_type, 0, inputs=inputs, **xargs)
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
             pool_conf = self.config.inputs[input_index].pool_conf
@@ -2297,6 +2304,11 @@ class PoolLayer(LayerBase):
                       pool_conf.channels)
 
 
+@config_layer('mkldnn_pool')
+class MKLDNNPoolLayer(PoolLayer):
+    layer_type = 'mkldnn_pool'
+
+
 @config_layer('pool3d')
 class Pool3DLayer(LayerBase):
     def __init__(self, name, inputs, ceil_mode=True, **xargs):
diff --git a/python/paddle/v2/framework/tests/op_test.py b/python/paddle/v2/framework/tests/op_test.py
index 6bbea22c5f147c8314c5d607f8e6953b470b5bd1..0a5673868c547d9e184e8ce05346c3ebabe06892 100644
--- a/python/paddle/v2/framework/tests/op_test.py
+++ b/python/paddle/v2/framework/tests/op_test.py
@@ -192,6 +192,9 @@ class OpTest(unittest.TestCase):
         self.op.run(self.scope, ctx)
 
         for out_name, out_dup in Operator.get_op_outputs(self.op.type()):
+            if out_name not in self.outputs:
+                continue
+
             if out_dup:
                 sub_out = self.outputs[out_name]
                 if not isinstance(sub_out, list):
@@ -206,14 +209,12 @@ class OpTest(unittest.TestCase):
                         actual, expect, atol=1e-05),
                     "output name: " + out_name + " has diff")
             else:
-                var = self.scope.find_var(out_name)
-                if var is not None:
-                    actual = np.array(var.get_tensor())
-                    expect = self.outputs[out_name]
-                    self.assertTrue(
-                        np.allclose(
-                            actual, expect, atol=1e-05),
-                        "output name: " + out_name + " has diff")
+                actual = np.array(self.scope.find_var(out_name).get_tensor())
+                expect = self.outputs[out_name]
+                self.assertTrue(
+                    np.allclose(
+                        actual, expect, atol=1e-05),
+                    "output name: " + out_name + " has diff")
 
     def check_output(self):
         places = [core.CPUPlace()]