diff --git a/.copyright.hook b/.copyright.hook index dc1b096a0ad28db732b794fa856efed71917c5e8..09afff2072df3384a429d01d06188218ae6e85d1 100644 --- a/.copyright.hook +++ b/.copyright.hook @@ -9,7 +9,7 @@ import subprocess import platform COPYRIGHT = ''' - Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/CMakeLists.txt b/CMakeLists.txt index f4e7d5c20db5fb95dfd5de05f8209608707b772c..7c7eb260aea8478f4833cb79253f4481e10b8685 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -31,9 +31,6 @@ if(NOT CMAKE_CROSSCOMPILING) endif(NOT CMAKE_CROSSCOMPILING) find_package(Git REQUIRED) find_package(Threads REQUIRED) -if(NOT ANDROID AND NOT IOS) - find_package(Boost QUIET) -endif() include(simd) @@ -140,6 +137,7 @@ include(external/openblas) # download, build, install openblas include(external/mkldnn) # download, build, install mkldnn include(external/swig) # download, build, install swig include(external/warpctc) # download, build, install warpctc +include(external/boost) # download, build, install boost include(external/any) # download libn::any include(external/eigen) # download eigen3 include(external/pybind11) # download pybind11 @@ -164,7 +162,6 @@ include_directories("${PADDLE_SOURCE_DIR}") include_directories("${PADDLE_SOURCE_DIR}/paddle/cuda/include") include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto") include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c") -include_directories(${Boost_INCLUDE_DIRS}) set(EXTERNAL_LIBS ${GFLAGS_LIBRARIES} diff --git a/Dockerfile b/Dockerfile index 857d3f3e5f64791146741ffb29feabfcb2ecbb84..6ac9901ac6cea12e97047efdfb6272c957f166ae 100644 --- a/Dockerfile +++ b/Dockerfile @@ -27,7 +27,7 @@ RUN apt-get update && \ curl sed grep graphviz libjpeg-dev zlib1g-dev \ python-matplotlib gcc-4.8 g++-4.8 \ automake locales clang-format swig doxygen cmake \ - liblapack-dev liblapacke-dev libboost-dev \ + liblapack-dev liblapacke-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \ net-tools libtool && \ apt-get clean -y diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake new file mode 100644 index 0000000000000000000000000000000000000000..c70d83b3f4bb24740ed67b4e2f98a3ced26d1648 --- /dev/null +++ b/cmake/external/boost.cmake @@ -0,0 +1,51 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +include(ExternalProject) + +set(BOOST_PROJECT "extern_boost") +set(BOOST_VER "1.41.0") +set(BOOST_TAR "boost_1_41_0") +set(BOOST_URL "http://sourceforge.net/projects/boost/files/boost/${BOOST_VER}/${BOOST_TAR}.tar.gz") +set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost) +set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}") +set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE) + +include_directories(${BOOST_INCLUDE_DIR}) + +ExternalProject_Add( + ${BOOST_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + DOWNLOAD_DIR ${BOOST_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate ${BOOST_URL} -c -q -O ${BOOST_TAR}.tar.gz + && tar zxf ${BOOST_TAR}.tar.gz + DOWNLOAD_NO_PROGRESS 1 + PREFIX ${BOOST_SOURCES_DIR} + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + UPDATE_COMMAND "" +) + +if (${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c) + file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";") + add_library(boost STATIC ${dummyfile}) +else() + add_library(boost INTERFACE) +endif() + +add_dependencies(boost ${BOOST_PROJECT}) +list(APPEND external_project_dependencies boost) +set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR}) diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst index 550b0e5b82609750ccd318eee889313cb2d7925a..f738bf15641d9fca0bfb0c10821de778ceee0d79 100644 --- a/doc/api/v2/fluid/layers.rst +++ b/doc/api/v2/fluid/layers.rst @@ -18,6 +18,11 @@ dynamic_lstm .. autofunction:: paddle.v2.fluid.layers.dynamic_lstm :noindex: +dynamic_gru +----------- +.. autofunction:: paddle.v2.fluid.layers.dynamic_gru + :noindex: + data ---- .. autofunction:: paddle.v2.fluid.layers.data @@ -500,6 +505,11 @@ swish .. autofunction:: paddle.v2.fluid.layers.swish :noindex: +im2sequence +------ +.. autofunction:: paddle.v2.fluid.layers.im2sequence + :noindex: + edit_distance --------------- .. autofunction:: paddle.v2.fluid.layers.edit_distance_error @@ -519,3 +529,13 @@ sequence_reshape ---------------- .. autofunction:: paddle.v2.fluid.layers.sequence_reshape :noindex: + +row_conv +-------- +.. autofunction:: paddle.v2.fluid.layers.row_conv + :noindex: + +multiplex +--------- +.. autofunction:: paddle.v2.fluid.layers.multiplex + :noindex: diff --git a/doc/api/v2/fluid/nets.rst b/doc/api/v2/fluid/nets.rst index f6b1cb4ba10659fb336899f08376c265c67290f1..500019bc507f859c4c91de5d322a82eb1e78e2de 100644 --- a/doc/api/v2/fluid/nets.rst +++ b/doc/api/v2/fluid/nets.rst @@ -26,8 +26,8 @@ glu :noindex: -dot_product_attention ---------------------- -.. autofunction:: paddle.v2.fluid.nets.dot_product_attention +scaled_dot_product_attention +---------------------------- +.. autofunction:: paddle.v2.fluid.nets.scaled_dot_product_attention :noindex: diff --git a/doc/design/csp.md b/doc/design/csp.md new file mode 100644 index 0000000000000000000000000000000000000000..ba9cacfdea7dcf7c6499b562dfc58400d082f2c8 --- /dev/null +++ b/doc/design/csp.md @@ -0,0 +1,96 @@ +# Design Doc: CSP in PaddlePaddle Fluid + +## Motivation + +Concurrent programming is important for deep learning. Few example applications are: + +1. The main thread keeps reading the next mini-batch while another thread uses the GPU for computing. +2. The main thread performs the computation while another thread uploads the local gradients from each trainer to the parameter server. + +Most DL systems, including TensorFlow, Caffe2, and MxNet, can asynchronously execute operators in a graph. However, Fluid doesn't have the concept of a graph at all, as the design goal of Fluid is that of a programming language. + +## Concurrent Programming Models + +There were many concurrent programming models, implemented in various forms: + +| concurrent programming model | implementation | +|-----|-----| +| mutex | types and functions in standard libraries | +| semaphore | types and functions in standard libraries | +| communicating sequential processes (CSP) | Go programming language | +| actor model | Erlang programming language | +| message passing | MPI | +| bulk synchronous parallel (BSP) | Pregel distributed programming framework | + +Since Fluid was designed to be a programming language, we would like to implement CSP in Fluid. + +### CSP v.s. Actor Model + +A well-known implementation of Actor Model is the Erlang programming language. In Actor Model, *processes* could send messages to another process and receive messages from another process given the process IDs. We can find the three ingredients, process with ID, send, and recv, in MPI too. Indeed, we can rewrite Erlang programs in Python + MPI with possibly fewer lines of code. Our concern with Actor Model is that it doesn't seem reasonable to implement process management in a programming language's runtime library; instead, it should be the operating systems' responsibility to manage processes and libraries like MPI for send/recv. + +## CSP in Fluid + +Fluid has two fundamental control-flows: *if-else* and *while*. If we are to implement CSP, we need the following: + +1. a new data type: *channel* and operators *send* and *recv*, +1. *goroutine* or thread, and +1. a new control-flow: select. + +We also need Python wrappers for the above components. + +The type *channel* is conceptually the blocking queue. In Go, its implemented is a [blocking circular queue](https://github.com/golang/go/blob/68ce117cf17b8debf5754bfd476345779b5b6616/src/runtime/chan.go#L31-L50), which supports send and recv. + +The `select` operation has been in OS kernels long before Go language. All Unix kernels implement system calls *poll* and *select*. They monitor multiple file descriptors to see if I/O is possible on any of them. This takes O(N) time. Since Linux 2.6, a new system call, *epoll*, can do the same in O(1) time. In BSD systems, there is a similar system call *kqueue*. Go's Linux implementation uses epoll. + +It might be a good idea to implement Fluid's select using epoll too. In this design doc, we start from the O(N) way, so we could focus on Python binding and the syntax. + +### Type Channel + +Fluid supports many data types: + +1. Tensor, +1. Row-sparse Tensor +1. LoD Tensor, +1. Tensor array, etc + +Each data type is registered in the [`framework.proto`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L117-L127) as an enum value. To add a new type channel, we need to add a new type enum. + +To expose a C++ type to Python, we need to edit the [`pybind.cc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc) file. [Here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc#L120-L164) is an example how we expose C++ class LoDTensor. + +## Syntax Design + +### Create Channel + +In Go, we create a channel by specifying the element type and buffer size: + +```go +ch := make(chan int) // a channel without buffer +ch1 := make(chan int, 100) // a channel that can buffer 100 ints. +``` + +In Fluid, we should be able to do the same: + +```python +ch = fluid.make_chan(dtype=INT) +ch1 = fluid.make_chan(dtype=INT, 100) +``` + +In addition to that, we want channels that can hold more complex element types, e.g., Tensors of float16: + +```python +ch = fluid.make_chan(dtype=Tensor, etype=float16) +``` + +or Tensors of Tensors of float16 etc. + +The point here is that we need a consistent way to compose types, like in C++ we can have `Tensor...> >`. + +### Send and Recv + +### Select + +## Example Programs + +### 1. RPC between Trainers and Parameter Servers + +### 2. Concurrent Minibatch Loading diff --git a/doc/design/dist_refactor/distributed_architecture.md b/doc/design/dist_refactor/distributed_architecture.md index 3a741f95866fb6c301ca9097af7916281f2278cf..9368c5780dc922953f38bf0f86d9f797a4a8a6fe 100644 --- a/doc/design/dist_refactor/distributed_architecture.md +++ b/doc/design/dist_refactor/distributed_architecture.md @@ -152,12 +152,12 @@ for data in train_reader(): `JobDesc` object describe the distributed job resource specification to run on Cluster environment. - + `RemoteExecutor.run` sends the `ProgramDesc` and [TrainingJob](https://github.com/PaddlePaddle/cloud/blob/develop/doc/autoscale/README.md#training-job-resource) to a server in the cluster which executes `RemoteExecutor.listen`. This server is responsible -to start the final Kubernetes Jobs to run the different role of `ProgramDesc`. +to start the final Kubernetes Jobs to run the different role of `ProgramDesc` from `ConfigMap`. ### Placement Algorithm diff --git a/doc/design/dist_refactor/parameter_server.md b/doc/design/dist_refactor/parameter_server.md index 1094f06d461275a9ad4034d5e48b39856d967b71..805dd13048d41b995d2a01cda52b2ea33e4bbe1d 100644 --- a/doc/design/dist_refactor/parameter_server.md +++ b/doc/design/dist_refactor/parameter_server.md @@ -9,16 +9,16 @@ different purposes. ## Background -The previous implementations of the parameter server does not run a +The previous implementations of the parameter server do not run a fluid sub-program. Parameter initialization, optimizer computation, network communication and checkpointing are implemented twice on both the -trainer and the parameter server. +trainer as well as the parameter server. -It would be great if we can write code once and use them on both the -trainer and the parameter server: reduces code duplication and -improves extensibility. Given that after the current refactor, we are -representing everything as a computing graph on the -trainer. Representing everything as a computing graph on the parameter +It would be great if we can write code once and use them on both: the +trainer and the parameter server, since this reduces code duplication and +improves extensibility. Given that after the current refactoring, we are +representing everything as a computation graph on the +trainer. Representing everything as a computation graph on the parameter server becomes a natural extension. ## Design @@ -30,9 +30,9 @@ into sub-programs to be scheduled on different nodes with the following steps: 1. OP placement: the OPs will be placed on different nodes according - to heuristic that minimizes estimated total computation + to a heuristic that minimizes the estimated total computation time. Currently we will use a simple heuristic that puts parameter - varable on parameter server workers and everything else on trainer + variable on parameter server workers and everything else on trainer workers. 1. Add communication OPs to enable the communication between nodes. @@ -47,22 +47,22 @@ After converting: -1. The parameter variable W and it's optimizer program are placed on the parameter server. +1. The parameter variable W and its optimizer program are placed on the parameter server. 1. Operators are added to the program. - *Send* sends data to the connected *Recv* operator. The scheduler on the receive node will only schedule *Recv* operator to run when the *Send* operator has ran (the *Send* OP will mark the *Recv* OP runnable automatically). - - *Enueue* enqueues the input variable, it can block until space + - *Enqueue* enqueues the input variable, it can block until space become available in the queue. - *Dequeue* outputs configurable numbers of tensors from the - queue. It will block until the queue have the required number of + queue. It will block until the queue has the required number of tensors. ### Benefits -- Model parallelism become easier to implement: it's an extension to +- Model parallelism becomes easier to implement: it is an extension to the trainer - parameter server approach. We can have several "Transpilers" to achieve different goals. - User-defined optimizer is easier to add - user can now express it as @@ -72,22 +72,22 @@ After converting: ### Challenges -- It's important to balance the parameter shards of on multiple - parameter server. If a single parameter is very big (some +- It is important to balance the parameter shards on multiple + parameter servers. If a single parameter is very big (for example: some word-embedding, fully connected, softmax layer), we need to automatically partition the single parameter onto different parameter servers when possible (only element-wise optimizer depends on the parameter variable). -- In the "Aync SGD" figure, the "W" variable on the parameter server - could be read and wrote concurrently. See +- In the "Async SGD" figure, the "W" variable on the parameter server + could be read and written concurrently. See [here](https://github.com/PaddlePaddle/Paddle/pull/6394) for more - details about concurrent program in fluid. + details about concurrent program in Fluid. ### Discussion - Can the Enqueue OP be implemented under our current tensor design - (puts the input tensor into the queue tensor)? -- *Dequeue* OP will have variable numbers of output (depends on the + (put the input tensor into the queue tensor)? +- *Dequeue* OP will have variable numbers of output (depending on the `min_count` attribute), does our current design support it? (similar question for the *Add* OP) diff --git a/doc/design/dist_refactor/src/remote_executor.graffle b/doc/design/dist_refactor/src/remote_executor.graffle index ce2c18fee5687732053c48af9c8c290a994a8090..41b2067311694b56d211a4f32d1b76884eeffd2d 100644 Binary files a/doc/design/dist_refactor/src/remote_executor.graffle and b/doc/design/dist_refactor/src/remote_executor.graffle differ diff --git a/doc/design/dist_refactor/src/remote_executor.png b/doc/design/dist_refactor/src/remote_executor.png index 6be4b1841b99efdb59557975485d0387f422308c..744e2fb2e0f1bbe058e991ba7b2a09000965ee79 100644 Binary files a/doc/design/dist_refactor/src/remote_executor.png and b/doc/design/dist_refactor/src/remote_executor.png differ diff --git a/doc/design/ops/sequence_decoder.md b/doc/design/ops/sequence_decoder.md index 9db5fb8e9a9f89b004bf71ddc064cd976c0d0bee..c4a9bbeeefca0e05c335dd60233691e8bac33015 100644 --- a/doc/design/ops/sequence_decoder.md +++ b/doc/design/ops/sequence_decoder.md @@ -22,7 +22,7 @@ The current `LoDTensor` is designed to store levels of variable-length sequences The integers in each level represent the begin and end (not inclusive) offset of a sequence **in the underlying tensor**, let's call this format the **absolute-offset LoD** for clarity. -The relative-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows +The absolute-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows ```python [[0, 3, 9] [0, 2, 3, 3, 3, 9]] @@ -119,7 +119,7 @@ def generate(): encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word) decoder_input = pd.fc( act=pd.activation.Linear(), - input=[target_word, encoder_ctx], + input=[target_word, encoder_ctx_expanded], size=3 * decoder_dim) gru_out, cur_mem = pd.gru_step( decoder_input, mem=decoder_mem, size=decoder_dim) diff --git a/doc/design/support_new_device.md b/doc/design/support_new_device.md index 4c5f10e2ecb9ec09b78926ca27552741d02d7cc9..b154f01d32c5952367f075a732e2f7729d3c8626 100644 --- a/doc/design/support_new_device.md +++ b/doc/design/support_new_device.md @@ -2,9 +2,9 @@ ## Background -Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries flexibly and efficiently. +Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries in a flexible and efficient manner. -On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example,Intel CPUs support Eigen and MKL computing libraries while Nvidia GPUs support Eigen and cuDNN computing libraries. We have to implement operator specific kernels for each computing library. +On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example, Intel CPUs support Eigen and MKL computing libraries while Nvidia GPUs support Eigen and cuDNN computing libraries. We have to implement operator specific kernels for each computing library. On the other hand, users usually do not want to care about the low-level hardware and computing libraries when writing a neural network configuration. In Fluid, `Layer` is exposed in `Python`, and `Operator` is exposed in `C++`. Both `Layer` and `Operator` are hardware independent. @@ -17,7 +17,7 @@ For a general overview of fluid, please refer to the [overview doc](https://gith There are mainly three parts that we have to consider while integrating a new device/library: -- Place and DeviceContext: indicates the device id and manages hardware resources +- Place and DeviceContext: indicate the device id and manage hardware resources - Memory and Tensor: malloc/free data on certain device @@ -25,10 +25,10 @@ There are mainly three parts that we have to consider while integrating a new de ### Place and DeviceContext -Please remind that device and computing library are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices. +Please note that device and computing library are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices. #### Place -Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add corresponding `DevicePlace`. +Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add the corresponding `DevicePlace`. ``` | CPUPlace @@ -144,7 +144,7 @@ class Tensor { }; ``` -`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configure its shape, and then call `mutuable_data` to allocate the actual memory. +`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configurate its shape, and then call `mutuable_data` to allocate the actual memory. ```cpp paddle::framework::Tensor t; @@ -163,7 +163,7 @@ Fluid implements computing units based on different DeviceContexts. Some computi Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/math/maxouting.h#L27) as an example: -The interface is defined in header file. +The interface is defined in the header file. ``` template @@ -203,7 +203,7 @@ class MaxOutFunctor { ``` -We get computing handle from a concrete DeviceContext, and make compution on tensors. +We first obtain the computing handle from a concrete DeviceContext, and then compute on tensors. The implemention of `OpKernel` is similar to math functors, the extra thing we need to do is to register the OpKernel in a global map. @@ -231,7 +231,7 @@ REGISTER_OP_CUDA_KERNEL( ## Advanced topics: How to switch between different Device/Library -Generally, we will impelement OpKernel for all Device/Library of an Operator. We can easily train a Convolutional Neural Network in GPU. However, some OpKernel is not sutibale on a specific Device. For example, crf operator can only run on CPU, whereas most other operators can run at GPU. To achieve high performance in such circumstance, we have to switch between different Device/Library. +Generally, we will implement OpKernel for all Device/Library of an Operator. We can easily train a Convolutional Neural Network in GPU. However, some OpKernel is not suitable on a specific Device. For example, crf operator can only run on CPU, whereas most other operators can run on GPU. To achieve high performance in such circumstance, we have to switch between different Device/Library. For more details, please refer to following docs: diff --git a/doc/getstarted/build_and_install/pip_install_cn.rst b/doc/getstarted/build_and_install/pip_install_cn.rst index 0c741e936b46eda5e7165e4ee54b545b14a28a19..8e4165da6b8135d083766c650f1092158f9d01c2 100644 --- a/doc/getstarted/build_and_install/pip_install_cn.rst +++ b/doc/getstarted/build_and_install/pip_install_cn.rst @@ -39,6 +39,7 @@ PaddlePaddle可以使用常用的Python包管理工具 "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "暂无" + "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "暂无" "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" diff --git a/doc/getstarted/build_and_install/pip_install_en.rst b/doc/getstarted/build_and_install/pip_install_en.rst index 285ed09805b09790beaef014f6813c227aff33ac..c1e806c0fe5f03139c0dff985f9ae0856eaa2e98 100644 --- a/doc/getstarted/build_and_install/pip_install_en.rst +++ b/doc/getstarted/build_and_install/pip_install_en.rst @@ -42,6 +42,7 @@ If the links below shows up the login form, just click "Log in as guest" to star "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "Not Available" + "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "Not Available" "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" diff --git a/doc/howto/optimization/cpu_profiling.md b/doc/howto/optimization/cpu_profiling.md index 1775374cf6e518586c28bbd8e04946c74df7e4c5..368af40cc7308cf6f4c609361078fe3ba02213ed 100644 --- a/doc/howto/optimization/cpu_profiling.md +++ b/doc/howto/optimization/cpu_profiling.md @@ -60,8 +60,7 @@ each column is as follows: | column | meaning | | --- | --- | | ncalls | the number of calls into a function | -| tottime | the total execution time of the function, not including the - execution time of other functions called by the function | +| tottime | the total execution time of the function, not including the execution time of other functions called by the function | | percall | tottime divided by ncalls | | cumtime | the total execution time of the function, including the execution time of other functions being called | | percall | cumtime divided by ncalls | diff --git a/doc/howto/usage/cluster/fluid_cluster_train_en.md b/doc/howto/usage/cluster/fluid_cluster_train_en.md index 11904a6f71bb6ce37417aeffb8e408ec65961b12..ae825d9a517c7e9005d4e32f8f34b3f6a79be0c9 100644 --- a/doc/howto/usage/cluster/fluid_cluster_train_en.md +++ b/doc/howto/usage/cluster/fluid_cluster_train_en.md @@ -16,6 +16,12 @@ PaddlePaddle must be installed on all nodes. If you have GPU cards on your nodes PaddlePaddle build and installation guide can be found [here](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html). +In addition to above, the `cmake` command should be run with the option `WITH_DISTRIBUTE` set to on. An example bare minimum `cmake` command would look as follows: + +``` bash +cmake .. -DWITH_DOC=OFF -DWITH_GPU=OFF -DWITH_DISTRIBUTE=ON -DWITH_SWIG_PY=ON -DWITH_PYTHON=ON +``` + ### Update the training script #### Non-cluster training script @@ -119,7 +125,14 @@ for pass_id in range(100): ### E2E demo -Please find the complete demo from [here](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py). In parameter server node run the following in the command line: +Please find the complete demo from [here](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py). +First `cd` into the folder that contains the `python` files. In this case: + +```bash +cd /paddle/python/paddle/v2/fluid/tests/book_distribute +``` + +In parameter server node run the following in the command line: ``` bash PSERVERS=192.168.1.2:6174 SERVER_ENDPOINT=192.168.1.2:6174 TRAINING_ROLE=PSERVER python notest_dist_fit_a_line.py diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index 4a98ede278fad85ff2beef3c8e7dd158912f693a..3f9c132ef6ae03c7614e10484715676c8019821e 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -18,7 +18,7 @@ else() add_subdirectory(capi) endif() - if(Boost_FOUND) + if(NOT ANDROID AND NOT IOS) add_subdirectory(memory) add_subdirectory(platform) add_subdirectory(framework) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index a912d8492fc6c23f88dd675694b805d0eda88335..280496984251919a8b4b6c52684f950a80b78356 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -1,7 +1,7 @@ # ddim lib proto_library(framework_proto SRCS framework.proto) -cc_library(ddim SRCS ddim.cc DEPS eigen3) +cc_library(ddim SRCS ddim.cc DEPS eigen3 boost) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) @@ -45,7 +45,7 @@ cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_ cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor framework_proto selected_rows data_device_transform data_type_transform data_layout_transform) -cc_library(attribute SRCS attribute.cc DEPS framework_proto) +cc_library(attribute SRCS attribute.cc DEPS framework_proto boost) cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc device_context) cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute) @@ -74,7 +74,10 @@ cc_library(backward SRCS backward.cc DEPS net_op) cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op) cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) -cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward glog lod_rank_table) +cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) + +cc_library(executor SRCS executor.cc DEPS op_registry device_context scope +framework_proto backward glog lod_rank_table profiler feed_fetch_method) cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) diff --git a/paddle/framework/attribute.cc b/paddle/framework/attribute.cc index b0fd4d2750eb2529706d871947332d39494505cd..5074e8f5a05ed4e824b3db7e506b30eb1b70c3fd 100644 --- a/paddle/framework/attribute.cc +++ b/paddle/framework/attribute.cc @@ -61,6 +61,9 @@ Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc) { } return val; } + case proto::AttrType::LONG: { + return attr_desc.l(); + } default: PADDLE_THROW("Unsupport attr type %d", attr_desc.type()); } diff --git a/paddle/framework/attribute.h b/paddle/framework/attribute.h index c1c63d9cb13acb195b3bc3b30088f5fa7daf2a3d..bcff9bc4c48f8f233b7f811640c2789f9618a972 100644 --- a/paddle/framework/attribute.h +++ b/paddle/framework/attribute.h @@ -168,6 +168,32 @@ struct ExtractAttribute { const std::string& attr_name_; }; +template <> +struct ExtractAttribute { + explicit ExtractAttribute(const std::string& attr_name) + : attr_name_(attr_name) {} + + int64_t* operator()(Attribute& attr) const { + if (attr.type() == typeid(int)) { // NOLINT + int val = boost::get(attr); + attr = static_cast(val); + } else if (attr.type() == typeid(float)) { // NOLINT + int val = boost::get(attr); + attr = static_cast(val); + } + int64_t* attr_value = nullptr; + try { + attr_value = &boost::get(attr); + } catch (boost::bad_get& bad_get) { + PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s", + attr_name_, attr.type().name()); + } + return attr_value; + } + + const std::string& attr_name_; +}; + // check whether a certain attribute fit its limits // an attribute can have more than one limits template diff --git a/paddle/framework/block_desc.cc b/paddle/framework/block_desc.cc index 54498e175dacfa0a220e3d839f4feb02502b2c03..dd2ed87252102aee6d384f37365d19305f19b281 100644 --- a/paddle/framework/block_desc.cc +++ b/paddle/framework/block_desc.cc @@ -75,7 +75,7 @@ std::vector BlockDesc::AllVars() const { OpDesc *BlockDesc::AppendOp() { need_update_ = true; - ops_.emplace_back(new OpDesc()); + ops_.emplace_back(new OpDesc(this)); return ops_.back().get(); } @@ -86,7 +86,7 @@ void BlockDesc::AppendAllocatedOp(std::unique_ptr &&op_desc) { OpDesc *BlockDesc::PrependOp() { need_update_ = true; - ops_.emplace_front(new OpDesc()); + ops_.emplace_front(new OpDesc(this)); return ops_.front().get(); } @@ -153,7 +153,7 @@ BlockDesc::BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc) vars_[var_desc.name()].reset(new VarDesc(var_desc)); } for (const proto::OpDesc &op_desc : desc_->ops()) { - ops_.emplace_back(new OpDesc(op_desc, prog)); + ops_.emplace_back(new OpDesc(op_desc, prog, this)); } } @@ -162,7 +162,7 @@ BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, : prog_(prog), desc_(desc) { need_update_ = true; for (auto &op : other.ops_) { - ops_.emplace_back(new OpDesc(*op)); + ops_.emplace_back(new OpDesc(*op, this)); } for (auto &it : other.vars_) { diff --git a/paddle/framework/channel.h b/paddle/framework/channel.h new file mode 100644 index 0000000000000000000000000000000000000000..9ba0fc5c558a85b41deb01ad57842d9c4c054e0e --- /dev/null +++ b/paddle/framework/channel.h @@ -0,0 +1,87 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include + +namespace paddle { +namespace framework { + +template +class Channel { + public: + explicit Channel(std::size_t capacity) : capacity_(capacity) {} + + void Send(T* channel_element) { + std::unique_lock lock(mu_); + + if (IsBounded()) { + full_cond_var_.wait(lock, [this]() { + bool capacity_valid = capacity_ > 0 ? !IsCapacityFull() : true; + return capacity_valid; + }); + } + channel_.push_back(std::move(*channel_element)); + + lock.unlock(); + empty_cond_var_.notify_one(); + } + + T* Receive() { + std::unique_lock lock(mu_); + empty_cond_var_.wait(lock, [this]() { return !channel_.empty(); }); + + T* channel_element = std::move(channel_.front()); + channel_.pop_front(); + + NotifyAllSenders(&lock); + return channel_element; + } + + size_t Size() { + std::unique_lock lock(mu_); + return channel_.size(); + } + + void Clear() { + std::unique_lock lock(mu_); + channel_.clear(); + + NotifyAllSenders(&lock); + } + + private: + std::size_t capacity_; + std::mutex mu_; + std::condition_variable empty_cond_var_; + std::condition_variable full_cond_var_; + std::deque channel_; + + private: + void NotifyAllSenders(std::unique_lock* lock) { + if (IsBounded()) { + lock->unlock(); + full_cond_var_.notify_one(); + } + } + + bool IsBounded() const { return capacity_ > 0; } + + bool IsCapacityFull() const { return channel_.size() >= capacity_; } +}; + +} // namespace operator +} // namespace paddle diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 1382bfca19a674a404916a5c709276ce41219d2f..cbf3ec75265fa74aaacffee684b7b7d5f73b7c02 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -17,11 +17,13 @@ limitations under the License. */ #include #include "gflags/gflags.h" +#include "paddle/framework/feed_fetch_method.h" #include "paddle/framework/feed_fetch_type.h" #include "paddle/framework/lod_rank_table.h" #include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/op_registry.h" #include "paddle/platform/place.h" +#include "paddle/platform/profiler.h" DECLARE_bool(do_memory_benchmark); DEFINE_bool(check_nan_inf, false, @@ -116,8 +118,13 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, for (auto& op_desc : block.AllOps()) { auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); - VLOG(3) << op->DebugStringEx(local_scope); + VLOG(4) << op->DebugStringEx(local_scope); + + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::RecordEvent record_event(op->Type(), pool.Get(place_)); + op->Run(*local_scope, place_); + VLOG(3) << op->DebugStringEx(local_scope); if (FLAGS_do_memory_benchmark) { VLOG(2) << "Memory used after operator " + op->Type() + " running: " << memory::memory_usage(place_); @@ -143,5 +150,164 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, } } +// Check whether the block already has feed operators and feed_holder. +// Return false if the block does not have any feed operators. +// If some feed operators have been prepended to the block, check that +// the info contained in these feed operators matches the feed_targets +// and feed_holder_name. Raise exception when any mismatch is found. +// Return true if the block has feed operators and holder of matching info. +static bool has_feed_operators( + BlockDesc* block, std::map& feed_targets, + const std::string& feed_holder_name) { + size_t feed_count = 0; + for (auto* op : block->AllOps()) { + if (op->Type() == kFeedOpType) { + feed_count++; + PADDLE_ENFORCE_EQ(op->Input("X")[0], feed_holder_name, + "Input to feed op should be '%s'", feed_holder_name); + std::string feed_target_name = op->Output("Out")[0]; + PADDLE_ENFORCE( + feed_targets.find(feed_target_name) != feed_targets.end(), + "Feed operator output name '%s' cannot be found in 'feed_targets'", + feed_target_name); + } + } + + if (feed_count > 0) { + PADDLE_ENFORCE_EQ( + feed_count, feed_targets.size(), + "The number of feed operators should match 'feed_targets'"); + + // When feed operator are present, so should be feed_holder + auto var = block->FindVar(feed_holder_name); + PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable", + feed_holder_name); + PADDLE_ENFORCE_EQ(var->GetType(), proto::VarDesc::FEED_MINIBATCH, + "'%s' variable should be 'FEED_MINIBATCH' type", + feed_holder_name); + } + + return feed_count > 0; +} + +// Check whether the block already has fetch operators and fetch_holder. +// Return false if the block does not have any fetch operators. +// If some fetch operators have been appended to the block, check that +// the info contained in these fetch operators matches the fetch_targets +// and fetch_holder_name. Raise exception when any mismatch is found. +// Return true if the block has fetch operators and holder of matching info. +static bool has_fetch_operators( + BlockDesc* block, std::map& fetch_targets, + const std::string& fetch_holder_name) { + size_t fetch_count = 0; + for (auto* op : block->AllOps()) { + if (op->Type() == kFetchOpType) { + fetch_count++; + PADDLE_ENFORCE_EQ(op->Output("Out")[0], fetch_holder_name, + "Output of fetch op should be '%s'", fetch_holder_name); + std::string fetch_target_name = op->Input("X")[0]; + PADDLE_ENFORCE( + fetch_targets.find(fetch_target_name) != fetch_targets.end(), + "Fetch operator input name '%s' cannot be found in 'fetch_targets'", + fetch_target_name); + } + } + + if (fetch_count > 0) { + PADDLE_ENFORCE_EQ( + fetch_count, fetch_targets.size(), + "The number of fetch operators should match 'fetch_targets'"); + + // When fetch operator are present, so should be fetch_holder + auto var = block->FindVar(fetch_holder_name); + PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable", + fetch_holder_name); + PADDLE_ENFORCE_EQ(var->GetType(), proto::VarDesc::FETCH_LIST, + "'%s' variable should be 'FETCH_LIST' type", + fetch_holder_name); + } + + return fetch_count > 0; +} + +void Executor::Run(const ProgramDesc& program, Scope* scope, + std::map& feed_targets, + std::map& fetch_targets, + const std::string& feed_holder_name, + const std::string& fetch_holder_name) { + auto* copy_program = new ProgramDesc(program); + auto* global_block = copy_program->MutableBlock(0); + + if (!has_feed_operators(global_block, feed_targets, feed_holder_name)) { + // create feed_holder variable + auto* feed_holder = global_block->Var(feed_holder_name); + feed_holder->SetType(proto::VarDesc::FEED_MINIBATCH); + feed_holder->SetPersistable(true); + + int i = 0; + for (auto& feed_target : feed_targets) { + std::string var_name = feed_target.first; + VLOG(3) << "feed target's name: " << var_name; + + // prepend feed op + auto* op = global_block->PrependOp(); + op->SetType(kFeedOpType); + op->SetInput("X", {feed_holder_name}); + op->SetOutput("Out", {var_name}); + op->SetAttr("col", {static_cast(i)}); + op->CheckAttrs(); + + i++; + } + } + + // map the data of feed_targets to feed_holder + for (auto* op : global_block->AllOps()) { + if (op->Type() == kFeedOpType) { + std::string feed_target_name = op->Output("Out")[0]; + int idx = boost::get(op->GetAttr("col")); + SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name, + idx); + } + } + + if (!has_fetch_operators(global_block, fetch_targets, fetch_holder_name)) { + // create fetch_holder variable + auto* fetch_holder = global_block->Var(fetch_holder_name); + fetch_holder->SetType(proto::VarDesc::FETCH_LIST); + fetch_holder->SetPersistable(true); + + int i = 0; + for (auto& fetch_target : fetch_targets) { + std::string var_name = fetch_target.first; + VLOG(3) << "fetch target's name: " << var_name; + + // append fetch op + auto* op = global_block->AppendOp(); + op->SetType(kFetchOpType); + op->SetInput("X", {var_name}); + op->SetOutput("Out", {fetch_holder_name}); + op->SetAttr("col", {static_cast(i)}); + op->CheckAttrs(); + + i++; + } + } + + Run(*copy_program, scope, 0, true, true); + + // obtain the data of fetch_targets from fetch_holder + for (auto* op : global_block->AllOps()) { + if (op->Type() == kFetchOpType) { + std::string fetch_target_name = op->Input("X")[0]; + int idx = boost::get(op->GetAttr("col")); + *fetch_targets[fetch_target_name] = + GetFetchVariable(*scope, fetch_holder_name, idx); + } + } + + delete copy_program; +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h index d869e18901b82959a40cc296aa0844c20ea63ac1..035ff48a52bd2fc4b1a46b48b1fbf1fbcb2ac70b 100644 --- a/paddle/framework/executor.h +++ b/paddle/framework/executor.h @@ -41,6 +41,12 @@ class Executor { void Run(const ProgramDesc&, Scope*, int, bool create_local_scope = true, bool create_vars = true); + void Run(const ProgramDesc& program, Scope* scope, + std::map& feed_targets, + std::map& fetch_targets, + const std::string& feed_holder_name = "feed", + const std::string& fetch_holder_name = "fetch"); + private: const platform::Place place_; }; diff --git a/paddle/framework/feed_fetch_method.cc b/paddle/framework/feed_fetch_method.cc new file mode 100644 index 0000000000000000000000000000000000000000..21201b675519e34b11e9f1f3a6f2a135c06d63a7 --- /dev/null +++ b/paddle/framework/feed_fetch_method.cc @@ -0,0 +1,56 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/framework/feed_fetch_method.h" +#include "glog/logging.h" +#include "paddle/framework/variable.h" + +namespace paddle { +namespace framework { + +void SetFeedVariable(Scope* scope, const LoDTensor& input, + const std::string& var_name, size_t index) { + // If var_name Variable is not found in GlobalScope, a new variable will + // be created. + VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index; + Variable* g_feed_value = scope->Var(var_name); + auto& feed_inputs = + *(g_feed_value->GetMutable>()); + if (index >= feed_inputs.size()) { + feed_inputs.resize(index + 1); + } + // shared data with input tensor + feed_inputs[index].ShareDataWith(input); + // set lod + feed_inputs[index].set_lod(input.lod()); +} + +LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, + size_t index) { + // Since we want to fetch LodTensor from a variable, the variable must + // be created alreadly. + Variable* g_fetch_value = scope.FindVar(var_name); + PADDLE_ENFORCE(g_fetch_value->IsType(), + "Only %s can be invoked by GetFetchVariable", + typeid(FeedFetchList).name()); + auto& fetch_outputs = *g_fetch_value->GetMutable(); + auto& tensor = fetch_outputs[index]; + VLOG(3) << "Fetch " << var_name << " with index " << index + << " shape= " << tensor.dims(); + PADDLE_ENFORCE_LT(index, fetch_outputs.size()); + return tensor; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/feed_fetch_method.h b/paddle/framework/feed_fetch_method.h index 7feacb1e24708411e7fbb610f9909447cba9e291..b71945fcc8834d2e5fe21151e1e88788b4acd5c1 100644 --- a/paddle/framework/feed_fetch_method.h +++ b/paddle/framework/feed_fetch_method.h @@ -13,46 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "glog/logging.h" + #include "paddle/framework/feed_fetch_type.h" #include "paddle/framework/scope.h" -#include "paddle/framework/variable.h" namespace paddle { namespace framework { void SetFeedVariable(Scope* scope, const LoDTensor& input, - const std::string& var_name, size_t index) { - // If var_name Variable is not found in GlobalScope, a new variable will - // be created. - VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index; - Variable* g_feed_value = scope->Var(var_name); - auto& feed_inputs = - *(g_feed_value->GetMutable>()); - if (index >= feed_inputs.size()) { - feed_inputs.resize(index + 1); - } - // shared data with input tensor - feed_inputs[index].ShareDataWith(input); - // set lod - feed_inputs[index].set_lod(input.lod()); -} + const std::string& var_name, size_t index); LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, - size_t index) { - // Since we want to fetch LodTensor from a variable, the variable must - // be created alreadly. - Variable* g_fetch_value = scope.FindVar(var_name); - PADDLE_ENFORCE(g_fetch_value->IsType(), - "Only %s can be invoked by GetFetchVariable", - typeid(FeedFetchList).name()); - auto& fetch_outputs = *g_fetch_value->GetMutable(); - auto& tensor = fetch_outputs[index]; - VLOG(3) << "Fetch " << var_name << " with index " << index - << " shape= " << tensor.dims(); - PADDLE_ENFORCE_LT(index, fetch_outputs.size()); - return tensor; -} + size_t index); } // namespace framework } // namespace paddle diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto index ea69b87e2ac7dc587333b623c310182bb39eb452..5b6ef03f610926578d2c02dcf06f399f106a30a1 100644 --- a/paddle/framework/framework.proto +++ b/paddle/framework/framework.proto @@ -26,6 +26,7 @@ enum AttrType { BOOLEAN = 6; BOOLEANS = 7; BLOCK = 8; + LONG = 9; } // OpDesc describes an instance of a C++ framework::OperatorBase @@ -44,6 +45,7 @@ message OpDesc { optional bool b = 10; repeated bool bools = 11; optional int32 block_idx = 12; + optional int64 l = 13; }; message Var { diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc index b29f528f3f749efa3463125c774c2f4d4ebcbc7c..53b0d0fe083579da4f0bb600f292765aa2aa0d8a 100644 --- a/paddle/framework/lod_tensor.cc +++ b/paddle/framework/lod_tensor.cc @@ -107,9 +107,10 @@ LoD ToAbsOffset(const LoD &in) { // the lowest level stores relative offsets if (in.empty() || in.size() == 1) return in; LoD result = in; - for (int level = result.size() - 2; level >= 0; level--) { - for (auto &ele : result[level]) { - ele = result[level + 1][ele]; + for (auto level = static_cast(in.size() - 2); level >= 0; level--) { + for (size_t i = 0; i < in[level].size(); ++i) { + size_t index = in[level][i]; + result[level][i] = result[level + 1][index]; } } return result; diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc index 1c0372bb16c04e155a68a0411939e4887322107a..f8df2cf97ad532f06cb1393b1a24cd789f8bde29 100644 --- a/paddle/framework/op_desc.cc +++ b/paddle/framework/op_desc.cc @@ -97,7 +97,7 @@ void OpDesc::CopyFrom(const OpDesc &op_desc) { need_update_ = true; } -OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog) +OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block) : desc_(desc), need_update_(false) { // restore inputs_ int input_size = desc_.inputs_size(); @@ -131,6 +131,7 @@ OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog) attrs_[attr_name] = prog->MutableBlock(bid); } } + this->block_ = block; } proto::OpDesc *OpDesc::Proto() { @@ -282,6 +283,7 @@ struct SetAttrDescVisitor : public boost::static_visitor { VectorToRepeated(v, attr_->mutable_bools()); } void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); } + void operator()(int64_t v) const { attr_->set_l(v); } void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); } }; diff --git a/paddle/framework/op_desc.h b/paddle/framework/op_desc.h index a5ffb162928bfd355d35d3f9b63aab59a88dd061..13695cff59f0bfd79c48eb28670ecc67a0309332 100644 --- a/paddle/framework/op_desc.h +++ b/paddle/framework/op_desc.h @@ -25,7 +25,6 @@ namespace framework { class BlockDesc; class ProgramDesc; - class OpDesc { public: OpDesc() {} @@ -33,7 +32,14 @@ class OpDesc { OpDesc(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs); - OpDesc(const proto::OpDesc &desc, ProgramDesc *prog); + OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block); + + explicit OpDesc(BlockDesc *block) : block_(block) {} + + OpDesc(const OpDesc &other, BlockDesc *block) { + *this = other; + block_ = block; + } void CopyFrom(const OpDesc &op_desc); @@ -117,6 +123,10 @@ class OpDesc { void Flush(); + BlockDesc *Block() { return this->block_; } + + void SetBlock(BlockDesc *block) { this->block_ = block; } + private: template static std::vector MapKeys(const MapType &map) { @@ -129,6 +139,7 @@ class OpDesc { } proto::OpDesc desc_; + BlockDesc *block_; // not_own // input arg name => input variable names VariableNameMap inputs_; // output arg name => output variable names diff --git a/paddle/framework/threadpool.cc b/paddle/framework/threadpool.cc index 109a7e7dc440d91e8223f2c0924f489f54a06f64..b2f5ae4a96593fde1623dd10d3b63c984ae228db 100644 --- a/paddle/framework/threadpool.cc +++ b/paddle/framework/threadpool.cc @@ -1,24 +1,93 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ #include "paddle/framework/threadpool.h" namespace paddle { namespace framework { -std::unique_ptr ThreadPool::threadpool(nullptr); -std::once_flag ThreadPool::init_flag; +std::unique_ptr ThreadPool::threadpool_(nullptr); +std::once_flag ThreadPool::init_flag_; + +ThreadPool* ThreadPool::GetInstance() { + std::call_once(init_flag_, &ThreadPool::Init); + return threadpool_.get(); +} + +void ThreadPool::Init() { + if (threadpool_.get() == nullptr) { + // TODO(Yancey1989): specify the max threads number + int num_threads = std::thread::hardware_concurrency(); + PADDLE_ENFORCE_GT(num_threads, 0); + threadpool_.reset(new ThreadPool(num_threads)); + } +} + +ThreadPool::ThreadPool(int num_threads) + : total_threads_(num_threads), idle_threads_(num_threads), running_(true) { + threads_.resize(num_threads); + for (auto& thread : threads_) { + // TODO(Yancey1989): binding the thread on the specify CPU number + thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this))); + } +} + +ThreadPool::~ThreadPool() { + { + // notify all threads to stop running + running_ = false; + scheduled_.notify_all(); + } + + for (auto& t : threads_) { + t->join(); + t.reset(nullptr); + } +} + +void ThreadPool::Wait() { + std::unique_lock lock(mutex_); + completed_.wait(lock, [=] { return Done() == true; }); +} + +void ThreadPool::TaskLoop() { + while (running_) { + std::unique_lock lock(mutex_); + scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; }); + + if (!running_) { + break; + } + // pop a task from the task queue + auto task = std::move(tasks_.front()); + tasks_.pop(); + + --idle_threads_; + lock.unlock(); + + // run the task + task(); + + { + std::unique_lock lock(mutex_); + ++idle_threads_; + if (Done()) { + completed_.notify_all(); + } + } + } +} } // namespace framework } // namespace paddle diff --git a/paddle/framework/threadpool.h b/paddle/framework/threadpool.h index 3ac345851c38557f82698786dd3bc8e1202a4256..8912b1a43a26f9df662d3b5ddf68bfb2b87f4a20 100644 --- a/paddle/framework/threadpool.h +++ b/paddle/framework/threadpool.h @@ -20,52 +20,36 @@ limitations under the License. */ #include #include #include +#include #include "paddle/platform/enforce.h" namespace paddle { namespace framework { +// ThreadPool maintains a queue of tasks, and runs them using a fixed +// number of threads. class ThreadPool { public: typedef std::packaged_task Task; - /** - * @brief Get a instance of threadpool, the thread number will - * be specified as the number of hardware thread contexts - */ - static ThreadPool* GetInstance() { - std::call_once(init_flag, &ThreadPool::Init); - return threadpool.get(); - } + // Returns the singleton of ThreadPool. + static ThreadPool* GetInstance(); - ~ThreadPool() { - { - // notify all threads to stop running - running_ = false; - scheduled_.notify_all(); - } - - for (auto& t : threads_) { - t->join(); - t.reset(nullptr); - } - } + ~ThreadPool(); - int GetNumThreads() const { return num_threads_; } + // Returns the number of threads created by the constructor. + size_t Threads() const { return total_threads_; } - int GetAvailable() { + // Returns the number of currently idle threads. + size_t IdleThreads() { std::unique_lock lock(mutex_); - return available_; + return idle_threads_; } - /** - * @brief Push a function to the queue, and will be scheduled and - * executed if a thread is available. - * @param[in] Task, will be pushed to the task queue. - * @return std::future, we could wait for the task finished by - * f.wait(). - */ + // Run pushes a function to the task queue and returns a std::future + // object. To wait for the completion of the task, call + // std::future::wait(). template std::future Run(Callback fn) { std::unique_lock lock(mutex_); @@ -77,84 +61,40 @@ class ThreadPool { return f; } - /** - * @brief Wait until all the tasks are completed. - */ - void Wait() { - std::unique_lock lock(mutex_); - completed_.wait(lock, [=] { return Done() == true; }); - } + // Wait until all the tasks are completed. + void Wait(); private: DISABLE_COPY_AND_ASSIGN(ThreadPool); - explicit ThreadPool(int num_threads) - : num_threads_(num_threads), available_(num_threads), running_(true) { - threads_.resize(num_threads); - for (auto& thread : threads_) { - // TODO(Yancey1989): binding the thread on the specify CPU number - thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this))); - } - } + explicit ThreadPool(int num_threads); - /** - * @brief If the task queue is empty and avaialbe - * is equal to the number of threads, means that - * all tasks are completed. - * - * Note: this function is not thread-safe. - * - * @return true if all tasks are completed. - */ - bool Done() { return tasks_.empty() && available_ == num_threads_; } - - void TaskLoop() { - while (running_) { - std::unique_lock lock(mutex_); - scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; }); - - if (!running_) { - break; - } - // pop a task from the task queue - auto task = std::move(tasks_.front()); - tasks_.pop(); - - --available_; - lock.unlock(); - - // run the task - task(); - - { - std::unique_lock lock(mutex_); - ++available_; - if (Done()) { - completed_.notify_all(); - } - } - } - } + // If the task queue is empty and avaialbe is equal to the number of + // threads, means that all tasks are completed. Note: this function + // is not thread-safe. Returns true if all tasks are completed. + // Note: don't delete the data member total_threads_ and use + // threads_.size() instead; because you'd need to lock the mutex + // before accessing threads_. + bool Done() { return tasks_.empty() && idle_threads_ == total_threads_; } - static void Init() { - if (threadpool.get() == nullptr) { - // TODO(Yancey1989): specify the max threads number - int num_threads = std::thread::hardware_concurrency(); - PADDLE_ENFORCE_GT(num_threads, 0); - threadpool.reset(new ThreadPool(num_threads)); - } - } + // The constructor starts threads to run TaskLoop, which retrieves + // and runs tasks from the queue. + void TaskLoop(); + + // Init is called by GetInstance. + static void Init(); private: - static std::unique_ptr threadpool; - static std::once_flag init_flag; + static std::unique_ptr threadpool_; + static std::once_flag init_flag_; - int num_threads_; - int available_; - bool running_; - std::queue tasks_; std::vector> threads_; + const size_t total_threads_; + size_t idle_threads_; + + std::queue tasks_; std::mutex mutex_; + bool running_; std::condition_variable scheduled_; std::condition_variable completed_; }; diff --git a/paddle/framework/threadpool_test.cc b/paddle/framework/threadpool_test.cc index 50b6238cd8786be9d8cf2d5f821daadea12bd208..3fbfe7efc867144dbd0dd2613c824c6a3c41b7d8 100644 --- a/paddle/framework/threadpool_test.cc +++ b/paddle/framework/threadpool_test.cc @@ -22,11 +22,7 @@ namespace framework = paddle::framework; void do_sum(framework::ThreadPool* pool, std::atomic& sum, int cnt) { std::vector> fs; for (int i = 0; i < cnt; ++i) { - auto f = pool->Run([&sum]() { sum.fetch_add(1); }); - fs.push_back(std::move(f)); - } - for (auto& f : fs) { - f.wait(); + fs.push_back(framework::Async([&sum]() { sum.fetch_add(1); })); } } diff --git a/paddle/framework/type_defs.h b/paddle/framework/type_defs.h index d834d343759fa279a1444c6337956ffce1b9061a..1eedbbc419ab660f5ce00aa891ef80ca245bc0a8 100644 --- a/paddle/framework/type_defs.h +++ b/paddle/framework/type_defs.h @@ -35,7 +35,7 @@ using VariableNameMap = std::map>; using Attribute = boost::variant, std::vector, std::vector, bool, - std::vector, BlockDesc*>; + std::vector, BlockDesc*, int64_t>; using AttributeMap = std::unordered_map; diff --git a/paddle/framework/var_desc.h b/paddle/framework/var_desc.h index fc482c467404a6b9dfed64c43871d91d3d10c766..9316b14bb695c185efd6db4296d422ef0c476d57 100644 --- a/paddle/framework/var_desc.h +++ b/paddle/framework/var_desc.h @@ -66,6 +66,8 @@ class VarDesc { std::string Name() const { return desc_.name(); } + void SetName(std::string name) { desc_.set_name(name); } + void SetShape(const std::vector &dims); void SetDataType(proto::DataType data_type); diff --git a/paddle/gserver/layers/PriorBox.cpp b/paddle/gserver/layers/PriorBox.cpp index 337b9ba7bc0fc4e4bb80ee7b248d934f111379d5..8faf032f550836579522016b4fff3db7e94746e3 100644 --- a/paddle/gserver/layers/PriorBox.cpp +++ b/paddle/gserver/layers/PriorBox.cpp @@ -69,7 +69,7 @@ bool PriorBoxLayer::init(const LayerMap& layerMap, if (maxSize_.size() > 0) CHECK_EQ(minSize_.size(), maxSize_.size()); // flip aspect ratios - for (int index = 0; index < tmp.size(); index++) { + for (unsigned index = 0; index < tmp.size(); index++) { real ar = tmp[index]; if (fabs(ar - 1.) < 1e-6) continue; aspectRatio_.push_back(ar); diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index ba83667ebc9a89c37f77a7f71e6df90b54723cc0..aab02f16849582db4b41087046b810463a855e1a 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -991,8 +991,10 @@ TEST(Layer, SequenceLastInstanceLayer) { "seqlastins", "non-seq", -1); // hasSubseq seqlastins to non-seq - testDegradeLayer( - true, "seqlastins", "seq", -1); // hasSubseq seqlastins to seq + testDegradeLayer(true, + "seqlastins", + "seq", + -1); // hasSubseq seqlastins to seq } TEST(Layer, AverageLayer) { @@ -1001,8 +1003,10 @@ TEST(Layer, AverageLayer) { "average", "non-seq", 5); // seq average to a shorten seq, stride window = 5 - testDegradeLayer( - true, "average", "non-seq", -1); // hasSubseq average to non-seq + testDegradeLayer(true, + "average", + "non-seq", + -1); // hasSubseq average to non-seq testDegradeLayer(true, "average", "seq", -1); // hasSubseq average to seq } @@ -1287,8 +1291,9 @@ TEST(Layer, PoolLayer) { testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true); testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true); testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true); - testPoolLayer2( - "cudnn-avg-incl-pad-pool", /* trans= */ false, /* useGpu= */ true); + testPoolLayer2("cudnn-avg-incl-pad-pool", + /* trans= */ false, + /* useGpu= */ true); testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ true); #endif } @@ -2431,18 +2436,21 @@ TEST(Layer, test3DDeConvLayer) { } TEST(Layer, ScaleShiftLayer) { - const size_t batchSize = 16; - const size_t size = 32; - TestConfig config; - config.layerConfig.set_type("scale_shift"); - config.layerConfig.set_size(size); - config.biasSize = 1; - config.inputDefs.push_back( - {INPUT_DATA, "input", /* dim= */ size, /* paraSize= */ 1}); - config.layerConfig.add_inputs(); - for (auto useGpu : {false, true}) { - testLayerGrad(config, "scale_shift", batchSize, false, useGpu, false); - } + // FIXME: Disable ScaleShiftLayer because it is not stable. + // https://github.com/PaddlePaddle/Paddle/issues/7781 + return; + // const size_t batchSize = 16; + // const size_t size = 32; + // TestConfig config; + // config.layerConfig.set_type("scale_shift"); + // config.layerConfig.set_size(size); + // config.biasSize = 1; + // config.inputDefs.push_back( + // {INPUT_DATA, "input", /* dim= */ size, /* paraSize= */ 1}); + // config.layerConfig.add_inputs(); + // for (auto useGpu : {false, true}) { + // testLayerGrad(config, "scale_shift", batchSize, false, useGpu, false); + // } } TEST(Layer, ScaleSubRegionLayer) { diff --git a/paddle/inference/inference.cc b/paddle/inference/inference.cc index 49001778808173b82865a4b6632a6b175ef96242..b43c359ed1787143403336e8c1cb4c7f85b1d7a2 100644 --- a/paddle/inference/inference.cc +++ b/paddle/inference/inference.cc @@ -15,18 +15,13 @@ limitations under the License. */ #include "inference.h" #include #include "paddle/framework/executor.h" -#include "paddle/framework/feed_fetch_method.h" #include "paddle/framework/init.h" #include "paddle/framework/scope.h" -#ifdef PADDLE_USE_PTOOLS -#include "chooseser.h" -#endif - namespace paddle { void InferenceEngine::LoadInferenceModel(const std::string& dirname) { - std::string model_filename = dirname + "/__model__.dat"; + std::string model_filename = dirname + "/__model__"; LOG(INFO) << "loading model from " << model_filename; std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary); std::string program_desc_str; @@ -52,39 +47,15 @@ void InferenceEngine::LoadInferenceModel(const std::string& dirname) { } } -void InferenceEngine::LoadInferenceModel( - const std::string& dirname, - const std::vector& feed_var_names, - const std::vector& fetch_var_names) { - std::string model_filename = dirname + "/__model__.dat"; - LOG(INFO) << "loading model from " << model_filename; - std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary); - std::string program_desc_str; - inputfs.seekg(0, std::ios::end); - program_desc_str.resize(inputfs.tellg()); - inputfs.seekg(0, std::ios::beg); - LOG(INFO) << "program_desc_str's size: " << program_desc_str.size(); - inputfs.read(&program_desc_str[0], program_desc_str.size()); - inputfs.close(); - - program_ = new framework::ProgramDesc(program_desc_str); - GenerateLoadProgram(dirname); - - if (feed_var_names.empty() || fetch_var_names.empty()) { - LOG(FATAL) << "Please specify the feed_var_names and fetch_var_names."; - } - feed_var_names_ = feed_var_names; - fetch_var_names_ = fetch_var_names; - PrependFeedOp(); - AppendFetchOp(); -} - bool InferenceEngine::IsParameter(const framework::VarDesc* var) { - if (var->Persistable() && var->Name() != "feed" && var->Name() != "fetch") { + if (var->Persistable()) { // There are many unreachable variables in the program for (size_t i = 0; i < program_->Size(); ++i) { const framework::BlockDesc& block = program_->Block(i); for (auto* op : block.AllOps()) { + if (op->Type() == "feed") { + continue; + } for (auto input_argument_name : op->InputArgumentNames()) { if (input_argument_name == var->Name()) { return true; @@ -182,7 +153,7 @@ void InferenceEngine::Execute(const std::vector& feeds, LOG(FATAL) << "Please initialize the program_ and load_program_ first."; } - if (feeds.size() < feed_var_names_.size()) { + if (feeds.size() != feed_var_names_.size()) { LOG(FATAL) << "Please feed " << feed_var_names_.size() << " input Tensors."; } @@ -193,19 +164,22 @@ void InferenceEngine::Execute(const std::vector& feeds, executor->Run(*load_program_, scope, 0, true, true); + std::map feed_targets; + std::map fetch_targets; + // set_feed_variable for (size_t i = 0; i < feed_var_names_.size(); ++i) { - framework::SetFeedVariable(scope, feeds[i], "feed", i); + feed_targets[feed_var_names_[i]] = &feeds[i]; } - executor->Run(*program_, scope, 0, true, true); - // get_fetch_variable fetchs.resize(fetch_var_names_.size()); for (size_t i = 0; i < fetch_var_names_.size(); ++i) { - fetchs[i] = framework::GetFetchVariable(*scope, "fetch", i); + fetch_targets[fetch_var_names_[i]] = &fetchs[i]; } + executor->Run(*program_, scope, feed_targets, fetch_targets); + delete place; delete scope; delete executor; diff --git a/paddle/inference/inference.h b/paddle/inference/inference.h index 7fc09cb9e539a65a8cd3cceb1543bc7d111c22b3..26f259824b945e260b370ced9d065842264075d5 100644 --- a/paddle/inference/inference.h +++ b/paddle/inference/inference.h @@ -29,9 +29,6 @@ public: } void LoadInferenceModel(const std::string& dirname); - void LoadInferenceModel(const std::string& dirname, - const std::vector& feed_var_names, - const std::vector& fetch_var_names); void Execute(const std::vector& feeds, std::vector& fetchs); diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt index 061ee1a4d4c97842efe6e64b89f09cfe5c65cd47..496098f80423854be62dc99b8601209ff6a6b182 100644 --- a/paddle/memory/CMakeLists.txt +++ b/paddle/memory/CMakeLists.txt @@ -1,7 +1,7 @@ add_subdirectory(detail) cc_library(memory SRCS memory.cc DEPS place enforce) -cc_library(memcpy SRCS memcpy.cc) +cc_library(memcpy SRCS memcpy.cc DEPS place) cc_library(paddle_memory DEPS diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h index 88c3d1c597a853abdee7753a5110be4a1726e905..c0809abc05104c1e8c1f42331c0530724dd1472f 100644 --- a/paddle/operators/activation_op.h +++ b/paddle/operators/activation_op.h @@ -323,7 +323,7 @@ template struct FloorFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out) const { - out.device(d) = x.ceil(); + out.device(d) = x.floor(); } }; diff --git a/paddle/operators/beam_search_op.cc b/paddle/operators/beam_search_op.cc index 4c71d66d22899d2cf6418935bf9358a0f73cec27..844ade40eb2a7ae239b079daa609f03b9e7a06df 100644 --- a/paddle/operators/beam_search_op.cc +++ b/paddle/operators/beam_search_op.cc @@ -24,8 +24,18 @@ namespace operators { void BeamSearch::operator()(const framework::LoDTensor &pre_ids, framework::LoDTensor *selected_ids, framework::LoDTensor *selected_scores) { + auto abs_lod = framework::ToAbsOffset(ids_->lod()); + auto &high_level = abs_lod[lod_level_]; + auto items = SelectTopBeamSizeItems(); - auto selected_items = ToMap(items); + auto selected_items = ToMap(items, high_level.back()); + VLOG(3) << "selected_items:"; + for (size_t i = 0; i < selected_items.size(); ++i) { + VLOG(3) << "offset:" << i; + for (auto &item : selected_items[i]) { + VLOG(3) << ItemToString(item); + } + } PruneEndidCandidates(pre_ids, &selected_items); // calculate the output tensor's height size_t num_instances = std::accumulate( @@ -63,11 +73,12 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids, low_level.push_back(low_offset); // fill lod - auto abs_lod = framework::ToAbsOffset(ids_->lod()); - auto &high_level = abs_lod[lod_level_]; framework::LoD lod(2); lod[0].assign(high_level.begin(), high_level.end()); lod[1].assign(low_level.begin(), low_level.end()); + if (!framework::CheckLoD(lod)) { + PADDLE_THROW("lod %s is not right", framework::LoDToString(lod)); + } selected_ids->set_lod(lod); selected_scores->set_lod(lod); } @@ -90,13 +101,11 @@ int BeamSearch::PruneEndidCandidates(const framework::LoDTensor &pre_ids, } std::vector> BeamSearch::ToMap( - const std::vector> &items) { + const std::vector> &items, size_t element_num) { std::vector> result; + result.resize(element_num); for (auto &entries : items) { for (const auto &item : entries) { - if (item.offset >= result.size()) { - result.resize(item.offset + 1); - } result[item.offset].push_back(item); } } @@ -122,6 +131,14 @@ BeamSearch::SelectTopBeamSizeItems() { } result.emplace_back(items); } + VLOG(3) << "SelectTopBeamSizeItems result size " << result.size(); + for (auto &items : result) { + VLOG(3) << "item set:"; + for (auto &item : items) { + VLOG(3) << ItemToString(item); + } + } + return result; } @@ -159,6 +176,22 @@ bool BeamSearch::NextItemSet(std::vector *items) { return true; } +std::ostream &operator<<(std::ostream &os, const BeamSearch::Item &item) { + os << "{"; + os << "offset: " << item.offset << ", "; + os << "id: " << item.id << ", "; + os << "score: " << item.score << ""; + os << "}"; + + return os; +} + +std::string ItemToString(const BeamSearch::Item &item) { + std::ostringstream stream; + stream << item; + return stream.str(); +} + class BeamSearchProtoAndCheckerMaker : public framework::OpProtoAndCheckerMaker { public: @@ -186,8 +219,40 @@ class BeamSearchProtoAndCheckerMaker } }; +class BeamSearchInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + for (const std::string &arg : + std::vector({"pre_ids", "ids", "scores"})) { + PADDLE_ENFORCE(context->HasInput(arg), + "BeamSearch need input argument '%s'", arg); + } + for (const std::string &arg : + std::vector({"selected_ids", "selected_scores"})) { + PADDLE_ENFORCE(context->HasOutput(arg), + "BeamSearch need output argument '%s'", arg); + } + } +}; + +class BeamSearchInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + for (auto &o : op_desc.Output("selected_ids")) { + block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR); + } + for (auto &o : op_desc.Output("selected_scores")) { + block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR); + } + } +}; + } // namespace operators } // namespace paddle -REGISTER_OP_WITHOUT_GRADIENT(beam_search, paddle::operators::BeamSearchOp, - paddle::operators::BeamSearchProtoAndCheckerMaker); +REGISTER_OPERATOR(beam_search, paddle::operators::BeamSearchOp, + paddle::operators::BeamSearchProtoAndCheckerMaker, + paddle::operators::BeamSearchInferShape, + paddle::operators::BeamSearchInferVarType, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/operators/beam_search_op.h b/paddle/operators/beam_search_op.h index 45d14d68fe8d1c4a84aa826e68e76692444765a8..7ad85874fcbd6ea48d688b32f2cc982d6b76d3c4 100644 --- a/paddle/operators/beam_search_op.h +++ b/paddle/operators/beam_search_op.h @@ -136,8 +136,6 @@ class BeamSearch { void operator()(const framework::LoDTensor& pre_ids, framework::LoDTensor* selected_ids, framework::LoDTensor* selected_scores); - - protected: /* * The basic items help to sort. */ @@ -155,6 +153,7 @@ class BeamSearch { score_t score; }; + protected: /* * Delete all the records that follows the end token. */ @@ -166,7 +165,7 @@ class BeamSearch { * NOTE low performance */ std::vector> ToMap( - const std::vector>& inputs); + const std::vector>& inputs, size_t element_num); /* * For each source, select top beam_size records. @@ -187,6 +186,10 @@ class BeamSearch { int end_id_{0}; }; +std::ostream& operator<<(std::ostream& os, const BeamSearch::Item& item); + +std::string ItemToString(const BeamSearch::Item& item); + class BeamSearchOp : public framework::OperatorBase { public: BeamSearchOp(const std::string& type, @@ -203,7 +206,6 @@ class BeamSearchOp : public framework::OperatorBase { void Run(const framework::Scope& scope, const platform::Place& dev_place) const override { - LOG(INFO) << "run beam search op"; auto ids_var = scope.FindVar(Input("ids")); auto scores_var = scope.FindVar(Input("scores")); auto pre_ids_var = scope.FindVar(Input("pre_ids")); @@ -217,10 +219,8 @@ class BeamSearchOp : public framework::OperatorBase { size_t level = Attr("level"); size_t beam_size = Attr("beam_size"); int end_id = Attr("end_id"); - LOG(INFO) << "init beam search"; BeamSearch alg(ids, scores, level, beam_size, end_id); - LOG(INFO) << "after beam search"; auto selected_ids_var = scope.FindVar(Output("selected_ids")); auto selected_scores_var = scope.FindVar(Output("selected_scores")); PADDLE_ENFORCE_NOT_NULL(selected_ids_var); @@ -229,9 +229,7 @@ class BeamSearchOp : public framework::OperatorBase { *selected_ids_var->GetMutable(); auto& selected_scores_tensor = *selected_scores_var->GetMutable(); - LOG(INFO) << "run beam search"; alg(pre_ids, &selected_ids_tensor, &selected_scores_tensor); - LOG(INFO) << "finish beam search"; } }; diff --git a/paddle/operators/ctc_align_op.h b/paddle/operators/ctc_align_op.h index 589413feb3dcbb7fea1f0a878b35d4bf714b5318..fed89aa1e899a2450b315f352b9695056ed13aec 100644 --- a/paddle/operators/ctc_align_op.h +++ b/paddle/operators/ctc_align_op.h @@ -51,7 +51,7 @@ class CTCAlignKernel : public framework::OpKernel { T prev_token = -1; for (size_t i = input_lod[level][seq_idx]; i < input_lod[level][seq_idx + 1]; ++i) { - if (input_data[i] != blank && + if ((unsigned)input_data[i] != blank && !(merge_repeated && input_data[i] == prev_token)) { output_data[output_idx] = input_data[i]; ++output_idx; diff --git a/paddle/operators/detail/grpc_client.cc b/paddle/operators/detail/grpc_client.cc index 1e41587c418fb0ce4e452d5c6735c54e2d42f798..c43394554263c761d010494f692a4050cae7e8cb 100644 --- a/paddle/operators/detail/grpc_client.cc +++ b/paddle/operators/detail/grpc_client.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "grpc_client.h" +#include "paddle/framework/threadpool.h" namespace paddle { namespace operators { namespace detail { @@ -22,25 +23,32 @@ bool RPCClient::AsyncSendVariable(const std::string& ep, const framework::Scope& scope, const std::string& var_name, int64_t time_out) { - sendrecv::VariableMessage req; - auto* var = scope.FindVar(var_name); - SerializeToMessage(var_name, var, ctx, &req); - - // varhandle - VarHandle var_h; - var_h.ep = ep; - var_h.scope = &scope; - var_h.name = var_name; - var_h.ctx = &ctx; - - // stub context - auto ch = GetChannel(ep); - SendProcessor* s = new SendProcessor(ch); - s->Prepare(var_h, time_out); - s->response_call_back_ = NULL; - - auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, (void*)s); + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string var_name_val = var_name; + const framework::Scope* p_scope = &scope; + const auto ch = GetChannel(ep_val); + + framework::Async([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, this] { + auto* var = p_scope->FindVar(var_name_val); + sendrecv::VariableMessage req; + SerializeToMessage(var_name_val, var, *p_ctx, &req); + + // varhandle + VarHandle var_h; + var_h.ep = ep_val; + var_h.scope = p_scope; + var_h.name = var_name_val; + var_h.ctx = p_ctx; + + // stub context + SendProcessor* s = new SendProcessor(ch); + s->Prepare(var_h, time_out); + s->response_call_back_ = NULL; + + auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, (void*)s); + }); req_count_++; @@ -50,8 +58,6 @@ bool RPCClient::AsyncSendVariable(const std::string& ep, void ProcGetResponse(const VarHandle& var_h, const sendrecv::VariableMessage& ret_msg) { auto* outvar = var_h.scope->FindVar(var_h.name); - - std::istringstream iss(ret_msg.serialized()); DeserializeFromMessage(ret_msg, *var_h.ctx, outvar); } @@ -60,24 +66,31 @@ bool RPCClient::AsyncGetVariable(const std::string& ep, const framework::Scope& scope, const std::string& var_name, int64_t time_out) { - sendrecv::VariableMessage req; - req.set_varname(var_name); - - // varhandle - VarHandle var_h; - var_h.ep = ep; - var_h.scope = &scope; - var_h.name = var_name; - var_h.ctx = &ctx; - - // stub context - auto ch = GetChannel(ep); - GetProcessor* s = new GetProcessor(ch); - s->Prepare(var_h, time_out); - s->response_call_back_ = ProcGetResponse; - - auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, (void*)s); + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string var_name_val = var_name; + const framework::Scope* p_scope = &scope; + const auto ch = GetChannel(ep_val); + + framework::Async([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] { + sendrecv::VariableMessage req; + req.set_varname(var_name_val); + + // varhandle + VarHandle var_h; + var_h.ep = ep_val; + var_h.scope = p_scope; + var_h.name = var_name_val; + var_h.ctx = p_ctx; + + // stub context + GetProcessor* s = new GetProcessor(ch); + s->Prepare(var_h, time_out); + s->response_call_back_ = ProcGetResponse; + + auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, (void*)s); + }); req_count_++; @@ -85,19 +98,31 @@ bool RPCClient::AsyncGetVariable(const std::string& ep, } bool RPCClient::Wait() { - bool ok = true; + if (req_count_ <= 0) { + return true; + } + const size_t kReqCnt = req_count_; + bool a[kReqCnt]; + std::vector> waits(req_count_); - while (true) { - if (req_count_ <= 0) { - break; - } + for (int i = 0; i < req_count_; i++) { + waits[i] = framework::Async([i, &a, this] { a[i] = Proceed(); }); + } + + for (int i = 0; i < req_count_; i++) { + waits[i].wait(); + } - if (!Proceed()) { + int last_req_count = req_count_; + req_count_ = 0; + + for (int i = 0; i < last_req_count; i++) { + if (!a[i]) { return false; } } - return ok; + return true; } bool RPCClient::Proceed() { @@ -124,7 +149,6 @@ bool RPCClient::Proceed() { c->Process(); delete c; - req_count_--; return true; } diff --git a/paddle/operators/gru_op.cc b/paddle/operators/gru_op.cc index 76f2adefede3b4bc4035f86f8f8663eed29343ae..fb901b639492a179925ff852f9030fc6674d1f63 100644 --- a/paddle/operators/gru_op.cc +++ b/paddle/operators/gru_op.cc @@ -135,14 +135,14 @@ class GRUOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( GRU Operator implements part calculations of the complete GRU as following: -\f[ -update \ gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\ -reset \ gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r) \\ -output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\ +$$ +update\_gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\ +reset\_gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r) \\ +output\_candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\ output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t) -\f] +$$ -@note To implement the complete GRU, fully-connected operator must be used +@note To implement the complete GRU, fully-connected operator must be used before to feed xu, xr and xc as the Input of GRU operator. )DOC"); } diff --git a/paddle/operators/im2sequence_op.h b/paddle/operators/im2sequence_op.h index aeb810015134babc132909b3e820fa8391233b1c..f33aec71a92a65ec0e4114530d70e36c9dc1be04 100644 --- a/paddle/operators/im2sequence_op.h +++ b/paddle/operators/im2sequence_op.h @@ -79,7 +79,7 @@ class Im2SequenceKernel : public framework::OpKernel { framework::LoD lod(1); lod[0].reserve(batch_size + 1); for (int i = 0, offset = 0; i < batch_size + 1; ++i) { - lod[0][i] = offset; + lod[0].push_back(offset); offset += output_height * output_width; } out->set_lod(lod); diff --git a/paddle/operators/iou_similarity_op.cc b/paddle/operators/iou_similarity_op.cc new file mode 100755 index 0000000000000000000000000000000000000000..c520b28b83e66dbf53d2e19985370be4a2f69e23 --- /dev/null +++ b/paddle/operators/iou_similarity_op.cc @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/iou_similarity_op.h" + +namespace paddle { +namespace operators { + +class IOUSimilarityOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of IOUSimilarityOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + "Input(Y) of IOUSimilarityOp should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "The rank of Input(X) must be 2."); + PADDLE_ENFORCE_EQ(x_dims[1], 4UL, "The shape of X is [N, 4]"); + PADDLE_ENFORCE_EQ(y_dims.size(), 2UL, "The rank of Input(Y) must be 2."); + PADDLE_ENFORCE_EQ(y_dims[1], 4UL, "The shape of Y is [M, 4]"); + + ctx->ShareLoD("X", /*->*/ "Out"); + ctx->SetOutputDim("Out", framework::make_ddim({x_dims[0], y_dims[0]})); + } +}; + +class IOUSimilarityOpMaker : public framework::OpProtoAndCheckerMaker { + public: + IOUSimilarityOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor, default LoDTensor) " + "Box list X is a 2-D LoDTensor with shape [N, 4] holds N boxes, " + "each box is represented as [xmin, ymin, xmax, ymax], " + "the shape of X is [N, 4]. [xmin, ymin] is the left top " + "coordinate of the box if the input is image feature map, they " + "are close to the origin of the coordinate system. " + "[xmax, ymax] is the right bottom coordinate of the box. " + "This tensor can contain LoD information to represent a batch " + "of inputs. One instance of this batch can contain different " + "numbers of entities."); + AddInput("Y", + "(Tensor, default Tensor) " + "Box list Y holds M boxes, each box is represented as " + "[xmin, ymin, xmax, ymax], the shape of X is [N, 4]. " + "[xmin, ymin] is the left top coordinate of the box if the " + "input is image feature map, and [xmax, ymax] is the right " + "bottom coordinate of the box."); + + AddOutput("Out", + "(LoDTensor, the lod is same as input X) The output of " + "iou_similarity op, a tensor with shape [N, M] " + "representing pairwise iou scores."); + + AddComment(R"DOC( +IOU Similarity Operator. +Computes intersection-over-union (IOU) between two box lists. + Box list 'X' should be a LoDTensor and 'Y' is a common Tensor, + boxes in 'Y' are shared by all instance of the batched inputs of X. + Given two boxes A and B, the calculation of IOU is as follows: + +$$ +IOU(A, B) = +\frac{area(A\cap B)}{area(A)+area(B)-area(A\cap B)} +$$ + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(iou_similarity, ops::IOUSimilarityOp, + ops::IOUSimilarityOpMaker); + +REGISTER_OP_CPU_KERNEL( + iou_similarity, + ops::IOUSimilarityKernel, + ops::IOUSimilarityKernel); diff --git a/paddle/operators/iou_similarity_op.cu b/paddle/operators/iou_similarity_op.cu new file mode 100755 index 0000000000000000000000000000000000000000..fa5052624618c35875b241419946f69b776c81d4 --- /dev/null +++ b/paddle/operators/iou_similarity_op.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/iou_similarity_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + iou_similarity, + ops::IOUSimilarityKernel, + ops::IOUSimilarityKernel); diff --git a/paddle/operators/iou_similarity_op.h b/paddle/operators/iou_similarity_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e36177069d7b18ea23759f99c4679218fbfd32b8 --- /dev/null +++ b/paddle/operators/iou_similarity_op.h @@ -0,0 +1,90 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/framework/op_registry.h" +#include "paddle/platform/for_range.h" + +template +inline HOSTDEVICE T IOUSimilarity(T xmin1, T ymin1, T xmax1, T ymax1, T xmin2, + T ymin2, T xmax2, T ymax2) { + constexpr T zero = static_cast(0); + T area1 = (ymax1 - ymin1) * (xmax1 - xmin1); + T area2 = (ymax2 - ymin2) * (xmax2 - xmin2); + T inter_xmax = xmax1 > xmax2 ? xmax2 : xmax1; + T inter_ymax = ymax1 > ymax2 ? ymax2 : ymax1; + T inter_xmin = xmin1 > xmin2 ? xmin1 : xmin2; + T inter_ymin = ymin1 > ymin2 ? ymin1 : ymin2; + T inter_height = inter_ymax - inter_ymin; + T inter_width = inter_xmax - inter_xmin; + inter_height = inter_height > zero ? inter_height : zero; + inter_width = inter_width > zero ? inter_width : zero; + T inter_area = inter_width * inter_height; + T union_area = area1 + area2 - inter_area; + T sim_score = inter_area / union_area; + return sim_score; +} + +template +struct IOUSimilarityFunctor { + IOUSimilarityFunctor(const T* x, const T* y, T* z, int cols) + : x_(x), y_(y), z_(z), cols_(static_cast(cols)) {} + + inline HOSTDEVICE void operator()(size_t row_id) const { + T x_min1 = x_[row_id * 4]; + T y_min1 = x_[row_id * 4 + 1]; + T x_max1 = x_[row_id * 4 + 2]; + T y_max1 = x_[row_id * 4 + 3]; + for (size_t i = 0; i < cols_; ++i) { + T x_min2 = y_[i * 4]; + T y_min2 = y_[i * 4 + 1]; + T x_max2 = y_[i * 4 + 2]; + T y_max2 = y_[i * 4 + 3]; + + T sim = IOUSimilarity(x_min1, y_min1, x_max1, y_max1, x_min2, y_min2, + x_max2, y_max2); + + z_[row_id * cols_ + i] = sim; + } + } + const T* x_; + const T* y_; + T* z_; + const size_t cols_; +}; + +namespace paddle { +namespace operators { + +template +class IOUSimilarityKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const framework::LoDTensor* in_x = ctx.Input("X"); + const framework::Tensor* in_y = ctx.Input("Y"); + framework::LoDTensor* out = ctx.Output("Out"); + + int x_n = in_x->dims()[0]; + int y_n = in_y->dims()[0]; + IOUSimilarityFunctor functor(in_x->data(), in_y->data(), + out->mutable_data(ctx.GetPlace()), y_n); + + platform::ForRange for_range( + static_cast(ctx.device_context()), x_n); + for_range(functor); + } +}; // namespace operators + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/lookup_table_op.cc b/paddle/operators/lookup_table_op.cc index bb03def4391da80c6219f7863d300fd3c8d8c7ac..2405852f53d46356a474897d3a111d1c94eed081 100644 --- a/paddle/operators/lookup_table_op.cc +++ b/paddle/operators/lookup_table_op.cc @@ -66,6 +66,12 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { "(boolean, default false) " "Sparse update") .SetDefault(false); + AddAttr("padding_idx", + "(int64, default -1) " + "If the value is -1, it makes no effect to lookup. " + "Otherwise the given value indicates padding the output " + "with zeros whenever lookup encounters it in Ids.") + .SetDefault(-1); AddComment(R"DOC( Lookup Table Operator. diff --git a/paddle/operators/lookup_table_op.cu b/paddle/operators/lookup_table_op.cu index 261a28da694bf551d8d9e630139680aebc4be51a..d97390fa1c53fa0bdf16ab34cb209b994621f83c 100644 --- a/paddle/operators/lookup_table_op.cu +++ b/paddle/operators/lookup_table_op.cu @@ -21,9 +21,11 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template __global__ void LookupTable(T* output, const T* table, const int64_t* ids, - const int64_t N, const int64_t K, const int64_t D) { + const int64_t N, const int64_t K, const int64_t D, + const int64_t padding_idx) { int idx = threadIdx.x; int idy = blockIdx.x + threadIdx.y * GridDimX; @@ -34,7 +36,14 @@ __global__ void LookupTable(T* output, const T* table, const int64_t* ids, T* out = output + idy * D; const T* tab = table + id * D; for (int i = idx; i < D; i += BlockDimX) { - out[i] = tab[i]; + if (PaddingFlag) { + if (id == padding_idx) + out[i] = static_cast(0); + else + out[i] = tab[i]; + } else { + out[i] = tab[i]; + } } idy += BlockDimY * GridDimX; } @@ -67,6 +76,7 @@ class LookupTableCUDAKernel : public framework::OpKernel { auto* table_t = context.Input("W"); auto* ids_t = context.Input("Ids"); auto* output_t = context.Output("Out"); + int64_t padding_idx = context.Attr("padding_idx"); size_t N = table_t->dims()[0]; size_t D = table_t->dims()[1]; @@ -77,10 +87,17 @@ class LookupTableCUDAKernel : public framework::OpKernel { dim3 threads(128, 8); dim3 grids(8, 1); - LookupTable< - T, 128, 8, - 8><<>>( - output, table, ids, N, K, D); + + if (padding_idx == -1) + LookupTable< + T, 128, 8, 8, + false><<>>( + output, table, ids, N, K, D, padding_idx); + else + LookupTable< + T, 128, 8, 8, + true><<>>( + output, table, ids, N, K, D, padding_idx); } }; @@ -91,6 +108,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { auto& dev_ctx = context.template device_context(); bool is_sparse = context.Attr("is_sparse"); + // Since paddings are not trainable and fixed in forward, the gradient of + // paddings makes no sense and we don't deal with it in backward. if (is_sparse) { auto* ids = context.Input("Ids"); auto* table = context.Input("W"); diff --git a/paddle/operators/lookup_table_op.h b/paddle/operators/lookup_table_op.h index 2fd3335868406455ec01f9ded6bacc7bda5e2a67..0842c422f7bfd3cad9b36dfdbab930f3cc4a8728 100644 --- a/paddle/operators/lookup_table_op.h +++ b/paddle/operators/lookup_table_op.h @@ -32,16 +32,30 @@ class LookupTableKernel : public framework::OpKernel { auto* table_t = context.Input("W"); // float tensor auto* ids_t = context.Input("Ids"); // int tensor auto* output_t = context.Output("Out"); // float tensor + int64_t padding_idx = context.Attr("padding_idx"); int N = table_t->dims()[0]; int D = table_t->dims()[1]; auto* ids = ids_t->data(); auto* table = table_t->data(); auto* output = output_t->mutable_data(context.GetPlace()); - for (int64_t i = 0; i < ids_t->numel(); ++i) { - PADDLE_ENFORCE_LT(ids[i], N); - PADDLE_ENFORCE_GE(ids[i], 0); - memcpy(output + i * D, table + ids[i] * D, D * sizeof(T)); + + if (padding_idx == -1) { + for (int64_t i = 0; i < ids_t->numel(); ++i) { + PADDLE_ENFORCE_LT(ids[i], N); + PADDLE_ENFORCE_GE(ids[i], 0); + memcpy(output + i * D, table + ids[i] * D, D * sizeof(T)); + } + } else { + for (int64_t i = 0; i < ids_t->numel(); ++i) { + if (ids[i] == padding_idx) { + memset(output + i * D, 0, D * sizeof(T)); + } else { + PADDLE_ENFORCE_LT(ids[i], N); + PADDLE_ENFORCE_GE(ids[i], 0); + memcpy(output + i * D, table + ids[i] * D, D * sizeof(T)); + } + } } } }; @@ -51,6 +65,8 @@ class LookupTableGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { bool is_sparse = context.Attr("is_sparse"); + // Since paddings are not trainable and fixed in forward, the gradient of + // paddings makes no sense and we don't deal with it in backward. if (is_sparse) { auto* ids = context.Input("Ids"); auto* table = context.Input("W"); diff --git a/paddle/operators/lstmp_op.cc b/paddle/operators/lstmp_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c96b30ba353fabc48630258ea8f88f741b8c415e --- /dev/null +++ b/paddle/operators/lstmp_op.cc @@ -0,0 +1,331 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/lstmp_op.h" + +namespace paddle { +namespace operators { + +class LSTMPOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(Weight) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("ProjWeight"), + "Input(ProjWeight) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Bias"), + "Input(Bias) of LSTMP operator should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("Projection"), + "Output(Projection) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Cell"), + "Output(Cell) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchGate"), + "Output(BatchGate) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"), + "Output(BatchCellPreAct) of LSTMP operator should not be " + "null."); + PADDLE_ENFORCE(ctx->HasOutput("BatchHidden"), + "Output(BatchHidden) of LSTMP operator should not be null."); + + auto in_dims = ctx->GetInputDim("Input"); + PADDLE_ENFORCE_EQ(in_dims.size(), 2, + "Input(X)'s rank of LSTMP operator must be 2."); + + int frame_size = in_dims[1] / 4; + auto w_dims = ctx->GetInputDim("Weight"); + auto proj_dims = ctx->GetInputDim("ProjWeight"); + PADDLE_ENFORCE_EQ(w_dims.size(), 2, + "The rank of Input(Weight) should be 2."); + PADDLE_ENFORCE_EQ(w_dims[0], proj_dims[1], + "The first dimension of Input(Weight) " + "should be %d.", + proj_dims[1]); + PADDLE_ENFORCE_EQ(w_dims[1], 4 * frame_size, + "The second dimension of Input(Weight) " + "should be 4 * %d.", + frame_size); + + PADDLE_ENFORCE_EQ(proj_dims.size(), 2, + "The rank of Input(ProjWeight) should be 2."); + PADDLE_ENFORCE_EQ(proj_dims[0], frame_size, + "The first dimension of Input(ProjWeight) " + "should be %d.", + frame_size); + + if (ctx->HasInput("H0")) { + PADDLE_ENFORCE(ctx->HasInput("C0"), + "Input(C0) of LSTMP operator should not be null after " + "Input(H0) provided."); + auto h_dims = ctx->GetInputDim("H0"); + auto c_dims = ctx->GetInputDim("C0"); + PADDLE_ENFORCE(h_dims == c_dims, + "The dimension of Input(H0) and Input(C0) " + "should be the same."); + ctx->SetOutputDim("OrderedP0", {h_dims[0], proj_dims[1]}); + } + + auto b_dims = ctx->GetInputDim("Bias"); + PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2."); + PADDLE_ENFORCE_EQ(b_dims[0], 1, + "The first dimension of Input(Bias) should be 1."); + + if (ctx->Attrs().Get("use_peepholes")) { + PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size, + "The second dimension of Input(Bias) should be " + "7 * %d if enable peepholes connection", + frame_size); + } else { + PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size, + "The second dimension of Input(Bias) should be " + "4 * %d if disable peepholes connection", + frame_size); + } + + framework::DDim out_dims({in_dims[0], frame_size}); + framework::DDim proj_out_dims({in_dims[0], proj_dims[1]}); + ctx->SetOutputDim("Projection", proj_out_dims); + ctx->SetOutputDim("Cell", out_dims); + ctx->SetOutputDim("BatchGate", in_dims); + ctx->SetOutputDim("BatchCellPreAct", out_dims); + ctx->SetOutputDim("BatchHidden", out_dims); + ctx->ShareLoD("Input", "Projection"); + ctx->ShareLoD("Input", "Cell"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), + ctx.device_context()); + } +}; + +class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LSTMPOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Input", + "(LoDTensor) the input for sequence data, which supports " + "variable-time length input sequence. The underlying tensor in " + "this LoDTensor is a matrix with shape (T X 4D), where T is the " + "total time steps in this mini-batch, D is the hidden size."); + AddInput("H0", + "(Tensor, optional) the initial hidden state is an optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size and D is the hidden size.") + .AsDispensable(); + AddInput("C0", + "(Tensor, optional) the initial cell state is an optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size. `C0` should not be null if `H0` provided.") + .AsDispensable(); + AddInput("Weight", + "(Tensor) the learnable hidden-hidden weights." + " - The shape is (P x 4D), where P is the projection layer size " + "and D is the hidden size." + " - Weight = {W_cr, W_ir, W_fr, W_or}"); + AddInput("ProjWeight", + "(Tensor) the learnable weight of the projection layer." + " - The shape is (D x P), where P is the recurrent projection " + "layer size and D is the hidden size." + " - ProjWeight = {W_rh}"); + AddInput("Bias", + "(Tensor) the learnable biases, which contains two parts: " + "input-hidden biases and peephole connections weights if " + "setting `use_peepholes` to `True`. " + "1. `use_peepholes = False` " + " - The shape is (1 x 4D). " + " - Bias = {b_c, b_i, b_f, b_o}." + "2. `use_peepholes = True` " + " - The shape is (1 x 7D). " + " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}."); + AddOutput("Projection", + "(LoDTensor) the projection of the hidden state of LSTMP " + "operator. The shape is (T x P), and LoD is the same with the " + "`Input`."); + AddOutput("Cell", + "(LoDTensor) the cell state of LSTMP operator. " + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("BatchGate", + "(LoDTensor) This LoDTensor contains input gate, forget gate " + "and output gate after the activations. This LoDTensor has the " + "same shape as the reorganized input, which is also be called " + "batch input. The LoD size is 2. The first-level LoD is the " + "batch offsets and the second contains the indices, which " + "denotes the position of reorganized sequence in the raw input.") + .AsIntermediate(); + AddOutput("BatchCellPreAct", + "(LoDTensor) the pre-activation cell state reorganized in batch. " + "This LoDTensor is obtained in the forward and used in the " + "backward.") + .AsIntermediate(); + AddOutput("BatchHidden", + "(LoDTensor) the hidden state reorganized in batch. " + "This LoDTensor is obtained in the forward and used in the " + "backward.") + .AsIntermediate(); + AddOutput("OrderedP0", + "(Tensor) the projection of the initial hidden state " + "H0. This is a tensor with shape (N x P), where N is the " + "batch size and P is the hidden size.") + .AsIntermediate(); + AddAttr("use_peepholes", + "(bool, defalut: True) " + "whether to enable diagonal/peephole connections.") + .SetDefault(true); + AddAttr("is_reverse", + "(bool, defalut: False) " + "whether to compute reversed LSTMP.") + .SetDefault(false); + AddAttr( + "gate_activation", + "(string, default: sigmoid)" + "The activation for input gate, forget gate and output " + "gate, `sigmoid` by default.") + .SetDefault("sigmoid") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("cell_activation", + "(string, default: tanh)" + "The activation for cell output, `tanh` by defalut.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("candidate_activation", + "(string, default: tanh)" + "The activation for candidate hidden state, " + "`tanh` by default.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("proj_activation", + "(string, default: tanh)" + "The activation for projection output, " + "`tanh` by defalut.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddComment(R"DOC( +Long-Short Term Memory with recurrent Projection layer (LSTMP) Operator. + +LSTMP has a separate projection layer after the LSTM layer, projecting the +original hidden state to a lower-dimensional one, which is proposed to reduce +the number of total parameters and furthermore computational complexity for +the LSTM, espeacially for the case that the size of output units is relative +large (https://research.google.com/pubs/archive/43905.pdf). + +The formula is as follows: + +$$ +i_t = \sigma(W_{ix}x_{t} + W_{ir}r_{t-1} + W_{ic}c_{t-1} + b_i) \\ + +f_t = \sigma(W_{fx}x_{t} + W_{fr}r_{t-1} + W_{fc}c_{t-1} + b_f) \\ + +\tilde{c_t} = act_g(W_{cx}x_t + W_{cr}r_{t-1} + b_c) \\ + +o_t = \sigma(W_{ox}x_{t} + W_{or}r_{t-1} + W_{oc}c_t + b_o) \\ + +c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\ + +h_t = o_t \odot act_h(c_t) \\ + +r_t = \overline{act_h}(W_{rh}h_t) +$$ + +where the W terms denote weight matrices (e.g. $W_{xi}$ is the matrix +of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$ +are diagonal weight matrices for peephole connections. In our implementation, +we use vectors to reprenset these diagonal weight matrices. The b terms +denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$ +is the activation, such as logistic sigmoid function, and +$i, f, o$ and $c$ are the input gate, forget gate, output gate, +and cell activation vectors, respectively, all of which have the same size as +the cell output activation vector $h$. Here $h$ is usually called the hidden +state and $r$ denotes its recurrent projection. And $\tilde{c_t}$ is also +called the candidate hidden state, whose computation is based on the current +input and previous hidden state. + +The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$ +are the cell input and cell output activation functions and `tanh` is usually +used for them. $\overline{act_h}$ is the activation function for the +projection output, usually using `identity` or same as $act_h$. + +Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$ +operations on the input $x_{t}$ are NOT included in this operator. +Users can choose to use fully-connected operator before LSTMP operator. + +)DOC"); + } +}; + +class LSTMPGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Projection"), + "Input(Projection) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Cell"), + "Input(Cell) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(Weight) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("ProjWeight"), + "Input(ProjWeight) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Bias"), + "Input(Bias) of LSTMP operator should not be null."); + + PADDLE_ENFORCE(ctx->HasInput("BatchGate"), + "Input(BatchGate) of LSTMP operator should not be null."); + PADDLE_ENFORCE(ctx->HasInput("BatchCellPreAct"), + "Input(BatchGate) of LSTMP operator should not be null."); + + auto SetOutGradDim = [&ctx](const std::string& name) { + auto g_name = framework::GradVarName(name); + if (ctx->HasOutput(g_name)) + ctx->SetOutputDim(g_name, ctx->GetInputDim(name)); + }; + + SetOutGradDim("Input"); + SetOutGradDim("Weight"); + SetOutGradDim("ProjWeight"); + SetOutGradDim("Bias"); + SetOutGradDim("H0"); + SetOutGradDim("C0"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(lstmp, ops::LSTMPOp, ops::LSTMPOpMaker, lstmp_grad, + ops::LSTMPGradOp); +REGISTER_OP_CPU_KERNEL( + lstmp, ops::LSTMPKernel, + ops::LSTMPKernel); +REGISTER_OP_CPU_KERNEL( + lstmp_grad, ops::LSTMPGradKernel, + ops::LSTMPGradKernel); diff --git a/paddle/operators/lstmp_op.cu b/paddle/operators/lstmp_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..7fcbcfecc871976fdfbfffbbb4e0243b91351a29 --- /dev/null +++ b/paddle/operators/lstmp_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/lstmp_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + lstmp, ops::LSTMPKernel, + ops::LSTMPKernel); +REGISTER_OP_CUDA_KERNEL( + lstmp_grad, + ops::LSTMPGradKernel, + ops::LSTMPGradKernel); diff --git a/paddle/operators/lstmp_op.h b/paddle/operators/lstmp_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ee82d5c10a5421b181e525f49a263d4808ede62f --- /dev/null +++ b/paddle/operators/lstmp_op.h @@ -0,0 +1,491 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/operators/activation_op.h" +#include "paddle/operators/math/detail/activation_functions.h" +#include "paddle/operators/math/lstm_compute.h" +#include "paddle/operators/math/math_function.h" +#include "paddle/operators/math/sequence2batch.h" + +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +template +using EigenMatrix = framework::EigenMatrix; + +template +inline void ReorderInitState(const DeviceContext& ctx, + const framework::Tensor& src, const size_t* index, + framework::Tensor* dst, bool indexed_src) { + math::CopyMatrixRowsFunctor row_shuffle; + dst->mutable_data(src.dims(), ctx.GetPlace()); + row_shuffle(ctx, src, index, *dst, indexed_src); +} + +template +class LSTMPKernel : public framework::OpKernel { + public: + template + void ActCompute(const math::detail::ActivationType act_type, const Device& d, + X x, Y y) const { + if (act_type == math::detail::ActivationType::kIdentity) + y.device(d) = x; + else if (act_type == math::detail::ActivationType::kSigmoid) + SigmoidFunctor()(d, x, y); + else if (act_type == math::detail::ActivationType::kTanh) + TanhFunctor()(d, x, y); + else if (act_type == math::detail::ActivationType::kReLU) + ReluFunctor()(d, x, y); + else + PADDLE_THROW("unsupported activation type"); + } + + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* weight = ctx.Input("Weight"); + auto* proj_weight = ctx.Input("ProjWeight"); + auto* bias = ctx.Input("Bias"); + + auto* hidden_t0 = ctx.Input("H0"); + auto* ordered_proj0 = ctx.Output("OrderedP0"); + auto* cell_t0 = ctx.Input("C0"); + + auto* batch_gate = ctx.Output("BatchGate"); + batch_gate->mutable_data(ctx.GetPlace()); + auto* proj_out = ctx.Output("Projection"); + proj_out->mutable_data(ctx.GetPlace()); + auto* cell_out = ctx.Output("Cell"); + cell_out->mutable_data(ctx.GetPlace()); + + bool is_reverse = ctx.Attr("is_reverse"); + math::LoDTensor2BatchFunctor to_batch; + auto& device_ctx = ctx.template device_context(); + to_batch(device_ctx, *input, *batch_gate, true, is_reverse); + + auto in_dims = input->dims(); + int frame_size = static_cast(in_dims[1] / 4); + framework::DDim dims({in_dims[0], frame_size}); + framework::DDim proj_dims({in_dims[0], proj_weight->dims()[1]}); + + if (bias) { + Tensor b = *bias; + b.Resize({bias->numel(), 1}); + Tensor gate_bias = b.Slice(0, 4 * frame_size); + math::RowwiseAdd add_bias; + add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); + } + + math::LstmMetaValue lstmp_value; + if (bias && ctx.Attr("use_peepholes")) { + T* bias_data = const_cast(bias->data()); + // the code style in LstmpMetaValue will be updated later. + + lstmp_value.check_ig = bias_data + 4 * frame_size; + lstmp_value.check_fg = lstmp_value.check_ig + frame_size; + lstmp_value.check_og = lstmp_value.check_fg + frame_size; + } else { + lstmp_value.check_ig = nullptr; + lstmp_value.check_fg = nullptr; + lstmp_value.check_og = nullptr; + } + lstmp_value.prev_state_value = nullptr; + Tensor ordered_c0; + const size_t* order = batch_gate->lod()[2].data(); + if (cell_t0) { + // Since the batch computing for LSTMP reorders the input sequence + // according to their length. The initialized cell state also needs + // to reorder. + ReorderInitState(device_ctx, *cell_t0, order, + &ordered_c0, true); + lstmp_value.prev_state_value = ordered_c0.data(); + } + + // Use the local variable as here. + LoDTensor batch_proj, batch_cell; + auto* batch_cell_pre_act = ctx.Output("BatchCellPreAct"); + batch_cell_pre_act->mutable_data(dims, ctx.GetPlace()); + auto* batch_hidden = ctx.Output("BatchHidden"); + batch_hidden->mutable_data(dims, ctx.GetPlace()); // T x D + batch_proj.mutable_data(proj_dims, ctx.GetPlace()); // T x P + batch_cell.mutable_data(dims, ctx.GetPlace()); // T x D + + auto batch_starts = batch_gate->lod()[0]; + size_t num_batch = batch_starts.size() - 1; + auto gate_act = math::detail::GetActivationType( + ctx.Attr("gate_activation")); + auto cell_act = math::detail::GetActivationType( + ctx.Attr("cell_activation")); + auto cand_act = math::detail::GetActivationType( + ctx.Attr("candidate_activation")); + auto proj_act = math::detail::GetActivationType( + ctx.Attr("proj_activation")); + auto& place = *ctx.template device_context().eigen_device(); + + for (size_t n = 0; n < num_batch; n++) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + + Tensor gate_t = batch_gate->Slice(bstart, bend); + Tensor hidden_t = batch_hidden->Slice(bstart, bend); + Tensor proj_t = batch_proj.Slice(bstart, bend); + Tensor cell_t = batch_cell.Slice(bstart, bend); + Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend); + + int cur_batch_size = bend - bstart; + + if (n > 0) { + int pre_h_start = static_cast(batch_starts[n - 1]); + int pre_h_end = pre_h_start + cur_batch_size; + auto pre_proj_t = batch_proj.Slice(pre_h_start, pre_h_end); + math::matmul(device_ctx, pre_proj_t, false, *weight, + false, static_cast(1.0), &gate_t, + static_cast(1.0)); + } else if (hidden_t0) { + // If n == 0 and there is no initialized hidden state, that is to say + // the H0 is zeros, the calculation W_h * H0 will be skiped. + // If n == 0 and there is initialized hidden state, calculate W_h * H0. + + // Since the batch computing for LSTMP reorders the input sequence + // according to their length. The initialized hidden state also needs + // to reorder. + + Tensor ordered_h0; + ordered_proj0->mutable_data(ctx.GetPlace()); + ReorderInitState(device_ctx, *hidden_t0, order, + &ordered_h0, true); + math::matmul(device_ctx, ordered_h0, false, + *proj_weight, false, static_cast(1.0), + ordered_proj0, static_cast(0.0)); + if (proj_act != math::detail::ActivationType::kIdentity) { + auto proj0_dev = EigenMatrix::From(*ordered_proj0); + ActCompute(cell_act, place, proj0_dev, proj0_dev); + } + math::matmul(device_ctx, *ordered_proj0, false, + *weight, false, static_cast(1.0), + &gate_t, static_cast(1.0)); + } + + lstmp_value.gate_value = gate_t.data(); + lstmp_value.output_value = hidden_t.data(); + lstmp_value.state_value = cell_t.data(); + lstmp_value.state_active_value = cell_pre_act_t.data(); + math::LstmUnitFunctor::compute( + device_ctx, lstmp_value, frame_size, cur_batch_size, gate_act, + cell_act, cand_act); + lstmp_value.prev_state_value = lstmp_value.state_value; + math::matmul(device_ctx, hidden_t, false, *proj_weight, + false, static_cast(1.0), &proj_t, + static_cast(0.0)); + if (proj_act != math::detail::ActivationType::kIdentity) { + auto proj_t_dev = EigenMatrix::From(proj_t); + ActCompute(cell_act, place, proj_t_dev, proj_t_dev); + } + } + + math::Batch2LoDTensorFunctor to_seq; + batch_proj.set_lod(batch_gate->lod()); + // restore the output hidden in LoDTensor from the batch hidden + to_seq(device_ctx, batch_proj, *proj_out); + + batch_cell.set_lod(batch_gate->lod()); + // restore the output cell state in LoDTensor from the batch cell + to_seq(device_ctx, batch_cell, *cell_out); + } +}; + +template +class LSTMPGradKernel : public framework::OpKernel { + public: + template + void ActGradCompute(const math::detail::ActivationType act_type, + const Device& d, X x, Y y, DX dx, DY dy) const { + // x is dummy and won't be used even in Relu(use y instead) + if (act_type == math::detail::ActivationType::kIdentity) + dx.device(d) = dy; + else if (act_type == math::detail::ActivationType::kSigmoid) + SigmoidGradFunctor()(d, x, y, dy, dx); + else if (act_type == math::detail::ActivationType::kTanh) + TanhGradFunctor()(d, x, y, dy, dx); + else if (act_type == math::detail::ActivationType::kReLU) + ReluGradFunctor()(d, x, y, dy, dx); + else + PADDLE_THROW("unsupported activation type"); + } + + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* weight = ctx.Input("Weight"); + auto* proj_weight = ctx.Input("ProjWeight"); + auto* bias = ctx.Input("Bias"); + + auto* proj_out = ctx.Input("Projection"); + auto* cell_out = ctx.Input("Cell"); + + auto* batch_gate = ctx.Input("BatchGate"); + auto* batch_cell_pre_act = ctx.Input("BatchCellPreAct"); + auto* batch_hidden = ctx.Input("BatchHidden"); + + auto* projection_g = + ctx.Input(framework::GradVarName("Projection")); + + auto* in_g = ctx.Output(framework::GradVarName("Input")); + auto* weight_g = ctx.Output(framework::GradVarName("Weight")); + auto* proj_weight_g = + ctx.Output(framework::GradVarName("ProjWeight")); + auto* bias_g = ctx.Output(framework::GradVarName("Bias")); + + auto* h0 = ctx.Input("H0"); + auto* ordered_proj0 = ctx.Input("OrderedP0"); + auto* c0 = ctx.Input("C0"); + + auto* h0_g = ctx.Output(framework::GradVarName("H0")); + auto* c0_g = ctx.Output(framework::GradVarName("C0")); + + auto& device_ctx = ctx.template device_context(); + math::SetConstant zero; + if (weight_g) { + weight_g->mutable_data(ctx.GetPlace()); + zero(device_ctx, weight_g, static_cast(0.0)); + } + if (proj_weight_g) { + proj_weight_g->mutable_data(ctx.GetPlace()); + zero(device_ctx, proj_weight_g, static_cast(0.0)); + } + + // ordered_h0/c0 is the reordered hidden/cell initialization. + // ordered_h0_g/c0_g is the reordered gradient of hidden/cell + // initialization. + Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; + const size_t* order = batch_gate->lod()[2].data(); + if (c0) { + ReorderInitState(device_ctx, *c0, order, &ordered_c0, + true); + } + if (c0 && c0_g) { + ordered_c0_g.mutable_data(c0_g->dims(), ctx.GetPlace()); + } + + auto in_dims = input->dims(); + auto out_dims = cell_out->dims(); + framework::DDim proj_dims({in_dims[0], proj_weight->dims()[1]}); + int frame_size = static_cast(in_dims[1] / 4); + PADDLE_ENFORCE_EQ(frame_size, out_dims[1]); + + math::LstmMetaValue lstmp_value; + if (bias && ctx.Attr("use_peepholes")) { + T* bias_data = const_cast(bias->data()); + lstmp_value.check_ig = bias_data + 4 * frame_size; + lstmp_value.check_fg = lstmp_value.check_ig + frame_size; + lstmp_value.check_og = lstmp_value.check_fg + frame_size; + } else { + lstmp_value.check_ig = nullptr; + lstmp_value.check_fg = nullptr; + lstmp_value.check_og = nullptr; + } + + math::LstmMetaGrad lstmp_grad; + + if (bias && bias_g) { + bias_g->mutable_data(ctx.GetPlace()); + zero(device_ctx, bias_g, static_cast(0.0)); + } + if (bias && bias_g && ctx.Attr("use_peepholes")) { + T* bias_g_data = bias_g->data(); + lstmp_grad.check_ig_grad = bias_g_data + 4 * frame_size; + lstmp_grad.check_fg_grad = lstmp_grad.check_ig_grad + frame_size; + lstmp_grad.check_og_grad = lstmp_grad.check_fg_grad + frame_size; + } else { + lstmp_grad.check_ig_grad = nullptr; + lstmp_grad.check_fg_grad = nullptr; + lstmp_grad.check_og_grad = nullptr; + } + + math::LoDTensor2BatchFunctor to_batch; + + auto ToBatch = [&batch_gate, &to_batch]( + const DeviceContext& ctx, const framework::LoDTensor& src, + const framework::DDim& dims, framework::LoDTensor& dst) { + dst.mutable_data(dims, ctx.GetPlace()); + dst.set_lod(batch_gate->lod()); + to_batch(ctx, src, dst, false); + }; + + LoDTensor batch_hidden_g, batch_proj, batch_proj_g, batch_cell; + batch_hidden_g.mutable_data(out_dims, ctx.GetPlace()); + ToBatch(device_ctx, *proj_out, proj_dims, batch_proj); // T x P + ToBatch(device_ctx, *projection_g, proj_dims, batch_proj_g); // T x P + ToBatch(device_ctx, *cell_out, out_dims, batch_cell); // T x D + + LoDTensor batch_cell_g, batch_gate_g; + batch_cell_g.mutable_data(out_dims, ctx.GetPlace()); + // TODO(qingqing) support the case output cell has gradient. + // to_batch(device_ctx, *cell_g, batch_cell_g, false); + zero(device_ctx, &batch_cell_g, static_cast(0.0)); + batch_gate_g.mutable_data(batch_gate->dims(), ctx.GetPlace()); + batch_gate_g.set_lod(batch_gate->lod()); + + auto gate_act = math::detail::GetActivationType( + ctx.Attr("gate_activation")); + auto cell_act = math::detail::GetActivationType( + ctx.Attr("cell_activation")); + auto cand_act = math::detail::GetActivationType( + ctx.Attr("candidate_activation")); + auto proj_act = math::detail::GetActivationType( + ctx.Attr("proj_activation")); + auto& place = *ctx.template device_context().eigen_device(); + + auto batch_starts = batch_gate->lod()[0]; + size_t num_batch = batch_starts.size() - 1; + for (int n = static_cast(num_batch) - 1; n >= 0; n--) { + int bstart = static_cast(batch_starts[n]); + int bend = static_cast(batch_starts[n + 1]); + + Tensor cur_proj = batch_proj.Slice(bstart, bend); + Tensor proj_g = batch_proj_g.Slice(bstart, bend); + if (proj_act != math::detail::ActivationType::kIdentity) { + auto cur_proj_dev = EigenMatrix::From(cur_proj); + auto proj_g_dev = EigenMatrix::From(proj_g); + ActGradCompute(cell_act, place, cur_proj_dev, cur_proj_dev, proj_g_dev, + proj_g_dev); + } + /* hidden state backwarad */ + Tensor out_g = batch_hidden_g.Slice(bstart, bend); + math::matmul(device_ctx, proj_g, false, *proj_weight, + true, static_cast(1.0), &out_g, + static_cast(0.0)); + /* projection weight backward*/ + if (proj_weight_g) { + Tensor hidden_t = batch_hidden->Slice(bstart, bend); + math::matmul(device_ctx, hidden_t, true, proj_g, + false, static_cast(1.0), + proj_weight_g, static_cast(1.0)); + } + + Tensor gate = batch_gate->Slice(bstart, bend); + Tensor cell = batch_cell.Slice(bstart, bend); + Tensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend); + lstmp_value.gate_value = gate.data(); + lstmp_value.state_value = cell.data(); + lstmp_value.state_active_value = cell_pre_act.data(); + + Tensor gate_g = batch_gate_g.Slice(bstart, bend); + Tensor cell_g = batch_cell_g.Slice(bstart, bend); + lstmp_grad.state_grad = cell_g.data(); + lstmp_grad.gate_grad = gate_g.data(); + lstmp_grad.output_grad = out_g.data(); + + if (n > 0) { + int bstart_pre = static_cast(batch_starts[n - 1]); + Tensor cell_pre = batch_cell.Slice(bstart_pre, bstart); + Tensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart); + lstmp_value.prev_state_value = cell_pre.data(); + lstmp_grad.prev_state_grad = cell_pre_g.data(); + } else { + lstmp_value.prev_state_value = c0 ? ordered_c0.data() : nullptr; + lstmp_grad.prev_state_grad = c0_g ? ordered_c0_g.data() : nullptr; + } + + int cur_batch_size = bend - bstart; + math::LstmUnitGradFunctor::compute( + device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size, + gate_act, cell_act, cand_act); + + if (n > 0) { + int pre_h_start = static_cast(batch_starts[n - 1]); + int pre_h_end = pre_h_start + cur_batch_size; + auto pre_proj_g = batch_proj_g.Slice(pre_h_start, pre_h_end); + math::matmul(device_ctx, gate_g, false, *weight, true, + static_cast(1.0), &pre_proj_g, + static_cast(1.0)); + if (weight_g) { + /* weight backward*/ + auto pre_proj = batch_proj.Slice(pre_h_start, pre_h_end); + math::matmul(device_ctx, pre_proj, true, gate_g, + false, static_cast(1.0), weight_g, + static_cast(1.0)); + } + } else { + if (h0 && weight_g) { + ReorderInitState(device_ctx, *h0, order, + &ordered_h0, true); + if (weight_g) { + math::matmul(device_ctx, *ordered_proj0, true, + gate_g, false, static_cast(1.0), + weight_g, static_cast(1.0)); + } + } + if (h0 && (h0_g || proj_weight_g)) { + ordered_h0_g.mutable_data(h0_g->dims(), ctx.GetPlace()); + Tensor proj0_g; + proj0_g.Resize({in_dims[0], proj_weight->dims()[1]}); + proj0_g.mutable_data(ctx.GetPlace()); + math::matmul(device_ctx, gate_g, false, *weight, + true, static_cast(1.0), &proj0_g, + static_cast(0.0)); + if (proj_act != math::detail::ActivationType::kIdentity) { + auto proj0_dev = EigenMatrix::From(*ordered_proj0); + auto proj0_g_dev = EigenMatrix::From(proj0_g); + ActGradCompute(cell_act, place, proj0_dev, proj0_dev, proj0_g_dev, + proj0_g_dev); + } + if (h0_g) { + math::matmul( + device_ctx, proj0_g, false, *proj_weight, true, + static_cast(1.0), &ordered_h0_g, static_cast(0.0)); + } + if (proj_weight_g) { + math::matmul(device_ctx, ordered_h0, true, + proj0_g, false, static_cast(1.0), + proj_weight_g, static_cast(1.0)); + } + } + } + } + + math::Batch2LoDTensorFunctor to_seq; + if (in_g) { + /* backward data */ + in_g->mutable_data(ctx.GetPlace()); + to_seq(device_ctx, batch_gate_g, *in_g); + } + if (bias && bias_g) { + /* backward bias */ + Tensor b_g = *bias_g; + b_g.Resize({bias_g->numel(), 1}); + Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size); + math::ColwiseSum col_sum; + col_sum(device_ctx, batch_gate_g, &gate_bias_g); + } + + if (h0 && h0_g) { + ReorderInitState(device_ctx, ordered_h0_g, order, h0_g, + false); + } + if (c0 && c0_g) { + ReorderInitState(device_ctx, ordered_c0_g, order, c0_g, + false); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt index c607704efac86982c8c22e462381aaab488a9b69..28c5aec1996ad04a6cb551ac68c14b613d16858e 100644 --- a/paddle/operators/math/CMakeLists.txt +++ b/paddle/operators/math/CMakeLists.txt @@ -11,7 +11,7 @@ if(WITH_GPU) nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function) nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor) nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function) - nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context tensor) + nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context tensor math_function) nv_library(sequence_padding SRCS sequence_padding.cc sequence_padding.cu DEPS lod_tensor device_context) nv_library(sequence_scale SRCS sequence_scale.cc sequence_scale.cu DEPS lod_tensor device_context) nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions) @@ -28,7 +28,7 @@ else() cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function) cc_library(vol2col SRCS vol2col.cc DEPS device_context tensor) cc_library(context_project SRCS context_project.cc DEPS device_context math_function) - cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context tensor) + cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context tensor math_function) cc_library(sequence_padding SRCS sequence_padding.cc DEPS lod_tensor device_context) cc_library(sequence_scale SRCS sequence_scale.cc DEPS lod_tensor device_context) cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions) diff --git a/paddle/operators/multiplex_op.cc b/paddle/operators/multiplex_op.cc index 78263da2fbf843f6a5af2ba95aa0b219a7523b52..d275fa5cbbfbf4a949d7bb16c3acc598543ba000 100644 --- a/paddle/operators/multiplex_op.cc +++ b/paddle/operators/multiplex_op.cc @@ -119,7 +119,13 @@ REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp); REGISTER_OP_CPU_KERNEL( multiplex, - ops::MultiplexCPUKernel); + ops::MultiplexCPUKernel, + ops::MultiplexCPUKernel, + ops::MultiplexCPUKernel, + ops::MultiplexCPUKernel); REGISTER_OP_CPU_KERNEL( multiplex_grad, - ops::MultiplexGradCPUKernel); + ops::MultiplexGradCPUKernel, + ops::MultiplexGradCPUKernel, + ops::MultiplexGradCPUKernel, + ops::MultiplexGradCPUKernel); diff --git a/paddle/operators/multiplex_op.cu b/paddle/operators/multiplex_op.cu index 4372dc2c65ec7c0f28e46cd070ea471701ce8304..546e6e7a24d3653e9904706eac51c1b833f51463 100644 --- a/paddle/operators/multiplex_op.cu +++ b/paddle/operators/multiplex_op.cu @@ -90,7 +90,13 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( multiplex, - ops::MultiplexGPUKernel); + ops::MultiplexGPUKernel, + ops::MultiplexGPUKernel, + ops::MultiplexGPUKernel, + ops::MultiplexGPUKernel); REGISTER_OP_CUDA_KERNEL( multiplex_grad, - ops::MultiplexGradGPUKernel); + ops::MultiplexGradGPUKernel, + ops::MultiplexGradGPUKernel, + ops::MultiplexGradGPUKernel, + ops::MultiplexGradGPUKernel); diff --git a/paddle/operators/nccl_op_test.cu.cc b/paddle/operators/nccl_op_test.cu.cc index 6546096069d4c3fbc4908a16c2dba2ac6d7e6421..072e4eb2eff1f6f3d8745ac8e16709b8e1a69725 100644 --- a/paddle/operators/nccl_op_test.cu.cc +++ b/paddle/operators/nccl_op_test.cu.cc @@ -241,7 +241,7 @@ TEST_F(NCCLTester, ncclReduceOp) { // ncclBcastOp with desc TEST_F(NCCLTester, ncclBcastOp) { std::unique_ptr op2(new f::OpDesc); - const int kRoot = 5; + const int kRoot = 0; op2->SetType("ncclBcast"); op2->SetInput("X", {"st"}); op2->SetInput("Communicator", {"comm"}); diff --git a/paddle/operators/nce_op.cc b/paddle/operators/nce_op.cc index 84ba3ead2b52547b989a4541f31ea31ffcce6c63..994ddf717e7a5b883d8071c6a47da0b4b4074f2e 100644 --- a/paddle/operators/nce_op.cc +++ b/paddle/operators/nce_op.cc @@ -124,7 +124,8 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker { "This attribute only be used in unitest. Classes " "in this list wiil be used as negative classes " "for every samples. Under normal conditions, " - "user should avoid setting this attribute."); + "user should avoid setting this attribute.") + .SetDefault({}); AddComment(R"DOC( Compute and return the noise-contrastive estimation training loss. See [Noise-contrastive estimation: A new estimation principle for unnormalized statistical models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf). diff --git a/paddle/operators/nce_op.h b/paddle/operators/nce_op.h index e6b496f7896dcb412be8ff096fdccb2f0b682369..86fa13a649ce7fdcaad64e2609ceea2fb4d7e072 100644 --- a/paddle/operators/nce_op.h +++ b/paddle/operators/nce_op.h @@ -197,7 +197,8 @@ class NCEGradKernel : public framework::OpKernel { // get d_x auto d_x = context.Output(framework::GradVarName("Input")); if (d_x != nullptr) { - d_x->mutable_data(context.GetPlace()); + auto* d_x_data = d_x->mutable_data(context.GetPlace()); + std::fill(d_x_data, d_x_data + d_x->numel(), 0.0); auto d_x_matrix = EigenMatrix::From(*d_x); auto w_matrix = EigenMatrix::From(*(context.Input("Weight"))); for (int64_t i = 0; i < sample_labels->numel(); ++i) { diff --git a/paddle/operators/one_hot_op.cc b/paddle/operators/one_hot_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e78b7468de4ea5f29378c2dc5905fdd36fb0ae2f --- /dev/null +++ b/paddle/operators/one_hot_op.cc @@ -0,0 +1,95 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/operators/one_hot_op.h" +#include "paddle/framework/framework.pb.h" + +namespace paddle { +namespace operators { + +class OneHotOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of OneHotOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of OneHotOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_GE(x_dims.size(), 2, + "Rank of Input(X) should be at least 2."); + PADDLE_ENFORCE_GE(x_dims[x_dims.size() - 1], 1U, + "Last dimension of Input(X) should be 1."); + + int depth = ctx->Attrs().Get("depth"); + + PADDLE_ENFORCE_GT(depth, 0, "Should provide a positive depth (%d).", depth); + + framework::DDim out_dims(x_dims); + out_dims[out_dims.size() - 1] = depth; + ctx->SetOutputDim("Out", out_dims); + ctx->ShareLoD("X", /* --> */ "Out"); + } +}; + +class OneHotOpMaker : public framework::OpProtoAndCheckerMaker { + public: + OneHotOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor, LoDTensor) Input variable with rank at least 2. " + "The last dimension of X should be 1. Each value of X is an index " + "to indicate the position."); + AddOutput("Out", + "(Tensor, Tensor) Output tensor with same rank as X. " + "The tensor consists of one-hot representations of values in X."); + AddAttr("depth", + "A positive integer to specify the length of one-hot vector."); + AddAttr("dtype", + "An integer to specify the data type of one-hot " + "vector. The default value is FP32.") + .SetDefault(paddle::framework::proto::DataType::FP32); + AddComment(R"DOC( +One Hot Operator. This operator creates the one-hot representations for input +index values. The following example will help to explain the function of this +operator: + +X is a LoDTensor: + X.lod = [[0, 1, 4]] + X.shape = [4, 1] + X.data = [[1], [1], [3], [0]] + +set depth = 4 + +Out is a LoDTensor: + Out.lod = [[0, 1, 4]] + Out.shape = [4, 4] + Out.data = [[0., 1., 0., 0.], + [0., 1., 0., 0.], + [0., 0., 0., 1.], + [1., 0., 0., 0.]] +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(one_hot, ops::OneHotOp, ops::OneHotOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + one_hot, ops::OneHotKernel, + ops::OneHotKernel); diff --git a/paddle/operators/one_hot_op.cu b/paddle/operators/one_hot_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..16f6d9433eabd7be157ed57362a0d55d86c6ee92 --- /dev/null +++ b/paddle/operators/one_hot_op.cu @@ -0,0 +1,80 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/operators/one_hot_op.h" +#include "paddle/platform/cuda_helper.h" +#include "paddle/platform/gpu_info.h" + +namespace paddle { +namespace operators { +using platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void FillOutputKernel(const InT* p_in_data, OutT* p_out_data, + const int64_t numel, const int depth) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < numel) { + *(p_out_data + (idx * depth) + p_in_data[idx]) = 1.0; + } +} + +template +struct OneHotOpCUDAFunctor { + const framework::LoDTensor* in_; + framework::LoDTensor* out_; + const DeviceContext& ctx_; + int depth_; + + OneHotOpCUDAFunctor(const framework::LoDTensor* in, framework::LoDTensor* out, + int depth, const DeviceContext& ctx) + : in_(in), out_(out), depth_(depth), ctx_(ctx) {} + + template + void operator()() const { + auto* p_in_data = in_->data(); + auto numel = in_->numel(); + auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); + auto stream = ctx_.stream(); + math::set_constant(ctx_, out_, 0.0); + + FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / + PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + p_in_data, p_out_data, numel, depth_); + } +}; + +using LoDTensor = framework::LoDTensor; +template +class OneHotCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + int depth = context.Attr("depth"); + + framework::VisitDataType( + static_cast(context.Attr("dtype")), + OneHotOpCUDAFunctor( + in, out, depth, context.template device_context())); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + one_hot, ops::OneHotCUDAKernel, + ops::OneHotCUDAKernel); diff --git a/paddle/operators/one_hot_op.h b/paddle/operators/one_hot_op.h new file mode 100644 index 0000000000000000000000000000000000000000..12031ede2c3cd042a3d25003b714652b4d0d4453 --- /dev/null +++ b/paddle/operators/one_hot_op.h @@ -0,0 +1,68 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/framework/op_registry.h" +#include "paddle/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +struct OneHotOpFunctor { + const framework::LoDTensor* in_; + framework::LoDTensor* out_; + int depth_; + const DeviceContext& ctx_; + + OneHotOpFunctor(const framework::LoDTensor* in, framework::LoDTensor* out, + int depth, const DeviceContext& ctx) + : in_(in), out_(out), depth_(depth), ctx_(ctx) {} + + template + void operator()() const { + auto* p_in_data = in_->data(); + auto numel = in_->numel(); + auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); + math::set_constant(ctx_, out_, 0.0); + + for (int i = 0; i < numel; ++i) { + PADDLE_ENFORCE_GE(p_in_data[i], 0, + "Illegal index value, should be at least 0."); + PADDLE_ENFORCE_LT(p_in_data[i], depth_, + "Illegal index value, should be less than depth (%d).", + depth_); + *(p_out_data + i * depth_ + p_in_data[i]) = 1.0; + } + } +}; + +using LoDTensor = framework::LoDTensor; +template +class OneHotKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + int depth = context.Attr("depth"); + + framework::VisitDataType( + static_cast(context.Attr("dtype")), + OneHotOpFunctor( + in, out, depth, context.template device_context())); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/parallel_do_op.cc b/paddle/operators/parallel_do_op.cc index a00458ea068dd703d2c7f362511ed08bc212d2a8..67f9854c02fa92d0141463088915e720733306fb 100644 --- a/paddle/operators/parallel_do_op.cc +++ b/paddle/operators/parallel_do_op.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/framework/executor.h" #include "paddle/framework/op_registry.h" #include "paddle/framework/threadpool.h" +#include "paddle/operators/detail/safe_ref.h" namespace paddle { namespace operators { @@ -31,6 +32,7 @@ static constexpr char kParallelScopes[] = "parallel_scopes"; static constexpr char kParallelBlock[] = "sub_block"; using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; static void SplitTensorAndMoveTensorToScopes( const framework::Scope &scope, std::vector *sub_scopes, @@ -38,8 +40,10 @@ static void SplitTensorAndMoveTensorToScopes( const std::vector &names) { size_t num_sub_scopes = 0; for (auto &argu : names) { - auto *var = scope.FindVar(argu); - const auto &tensor = var->Get(); + const auto &tensor = + detail::Ref(scope.FindVar(argu), + "Cannot find variable %s in the parent scope", argu) + .Get(); auto lod_tensors = tensor.SplitLoDTensor(places); for (auto &lod : lod_tensors) { @@ -59,11 +63,37 @@ static void SplitTensorAndMoveTensorToScopes( } for (size_t i = 0; i < lod_tensors.size(); ++i) { - *(*sub_scopes)[i]->Var(argu)->GetMutable() = lod_tensors[i]; + *detail::Ref(sub_scopes->at(i)->Var(argu), + "Cannot find variable in the sub-scope", argu) + .GetMutable() = lod_tensors[i]; } } } +inline void CopyOrShare(const framework::Variable &src, + const platform::Place &dst_place, + framework::Variable *dst) { + if (src.IsType()) { + if (src.Get().place() == dst_place) { + dst->GetMutable()->ShareDataWith(src.Get()); + } else { + Copy(src.Get(), dst_place, dst->GetMutable()); + } + } else if (src.IsType()) { + auto &src_sr = src.Get(); + auto *dst_sr = dst->GetMutable(); + dst_sr->set_rows(src_sr.rows()); + dst_sr->set_height(src_sr.height()); + if (src_sr.value().place() == dst_place) { + dst_sr->mutable_value()->ShareDataWith(src_sr.value()); + } else { + Copy(src_sr.value(), dst_place, dst_sr->mutable_value()); + } + } else { + PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name()); + } +} + void WaitOnPlace(const platform::Place place) { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); @@ -210,30 +240,30 @@ class ParallelDoGradOp : public framework::OperatorBase { } WaitOnPlaces(places); - // merge grad + AccumulateGrad(scope, place, sub_scopes, places); + } + + void AccumulateGrad(const framework::Scope &scope, + const platform::Place &place, + const std::vector &sub_scopes, + const platform::PlaceList &places) const { for (auto &s : Outputs(framework::GradVarName(kParameters))) { - auto &result = sub_scopes[0]->FindVar(s)->Get(); std::string tmp_name; - auto *tmp = sub_scopes[0]->Var(&tmp_name)->GetMutable(); + auto *tmp = sub_scopes[0]->Var(&tmp_name); for (size_t i = 1; i < sub_scopes.size(); ++i) { - auto &tensor_to_merge = sub_scopes[i]->FindVar(s)->Get(); - if (!(places[i] == places[0])) { - framework::Copy(tensor_to_merge, places[0], tmp); - WaitOnPlace(places[0]); - } else { - tmp->ShareDataWith(tensor_to_merge); - } + CopyOrShare(*sub_scopes[i]->FindVar(s), places[0], tmp); + WaitOnPlace(places[0]); auto sum_op = framework::OpRegistry::CreateOp( "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}}, framework::AttributeMap{}); + VLOG(3) << sum_op->DebugStringEx(sub_scopes[0]); sum_op->Run(*sub_scopes[0], places[0]); WaitOnPlace(places[0]); } - VLOG(3) << result; - framework::Copy(result, place, scope.FindVar(s)->GetMutable()); + CopyOrShare(*sub_scopes[0]->FindVar(s), place, scope.FindVar(s)); } WaitOnPlaces(places); } @@ -262,6 +292,17 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker { this->InputGrad(input_param, false)); } } + auto *g_block = this->grad_block_[0]; + + // All variable name that needed by gradient operators + std::unordered_set all_inputs_in_grad_blocks; + + for (size_t i = 0; i < g_block->OpSize(); ++i) { + auto *op = g_block->Op(i); + for (auto &var_name : op->InputArgumentNames()) { + all_inputs_in_grad_blocks.insert(var_name); + } + } for (auto &output_param : this->OutputNames()) { if (output_param == kParallelScopes) { @@ -270,8 +311,17 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker { this->Output(output_param)); } else { grad->SetInput(output_param, this->Output(output_param)); - grad->SetInput(framework::GradVarName(output_param), - this->OutputGrad(output_param)); + std::vector og_names; + for (auto &og_name : this->OutputGrad(output_param)) { + if (all_inputs_in_grad_blocks.count(og_name) != 0) { + // there are some gradient operators who need the OG. So make this + // OG as an input of parallel.do + og_names.push_back(og_name); + } + // else, there is no operator who need the OG. Do not use this OG as + // an input + } + grad->SetInput(framework::GradVarName(output_param), og_names); } } grad->SetAttrMap(this->Attrs()); @@ -289,7 +339,7 @@ class ParallelDoGradOpShapeInference : public framework::InferShapeBase { PADDLE_ENFORCE(ctx->HasInputs(kParameters)); PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters))); - PADDLE_ENFORCE(ctx->HasInput(kInputs)); + PADDLE_ENFORCE(ctx->HasInputs(kInputs)); for (auto &s : output) { PADDLE_ENFORCE(ctx->HasInputs(s)); diff --git a/paddle/operators/pool_op.h b/paddle/operators/pool_op.h index c3d82ecbdeb412f0234fcddc27361d79b58c7122..d6ba5e298a4939e31fde71bf5bf8484640a7ceaf 100644 --- a/paddle/operators/pool_op.h +++ b/paddle/operators/pool_op.h @@ -139,10 +139,8 @@ class PoolGradKernel : public framework::OpKernel { auto& dev_ctx = context.template device_context(); if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); - auto temp = framework::EigenVector::Flatten(*in_x_grad); - temp.device( - *context.template device_context().eigen_device()) = - temp.constant(static_cast(0)); + paddle::operators::math::SetConstant set_constant; + set_constant(dev_ctx, in_x_grad, 0.0); switch (ksize.size()) { case 2: { diff --git a/paddle/operators/prior_box_op.cc b/paddle/operators/prior_box_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..105ff4ac3e3ba889aad880f4204af15829c6da47 --- /dev/null +++ b/paddle/operators/prior_box_op.cc @@ -0,0 +1,154 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/prior_box_op.h" + +namespace paddle { +namespace operators { + +class PriorBoxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of PriorBoxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Image"), + "Input(Image) of PriorBoxOp should not be null."); + + auto image_dims = ctx->GetInputDim("Image"); + auto input_dims = ctx->GetInputDim("Input"); + PADDLE_ENFORCE(image_dims.size() == 4, "The layout of image is NCHW."); + PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); + + PADDLE_ENFORCE_LT(input_dims[2], image_dims[2], + "The height of input must smaller than image."); + + PADDLE_ENFORCE_LT(input_dims[3], image_dims[3], + "The width of input must smaller than image."); + + auto min_sizes = ctx->Attrs().Get>("min_sizes"); + auto max_sizes = ctx->Attrs().Get>("max_sizes"); + auto variances = ctx->Attrs().Get>("variances"); + auto aspect_ratios = ctx->Attrs().Get>("aspect_ratios"); + bool flip = ctx->Attrs().Get("flip"); + + PADDLE_ENFORCE_GT(min_sizes.size(), 0, + "Size of min_sizes must be at least 1."); + for (size_t i = 0; i < min_sizes.size(); ++i) { + PADDLE_ENFORCE_GT(min_sizes[i], 0, "min_sizes[%d] must be positive.", i); + } + + std::vector aspect_ratios_vec; + ExpandAspectRatios(aspect_ratios, flip, aspect_ratios_vec); + + int num_priors = aspect_ratios_vec.size() * min_sizes.size(); + if (max_sizes.size() > 0) { + PADDLE_ENFORCE_EQ(max_sizes.size(), min_sizes.size(), + "The number of min_size and max_size must be equal."); + for (size_t i = 0; i < min_sizes.size(); ++i) { + PADDLE_ENFORCE_GT(max_sizes[i], min_sizes[i], + "max_size[%d] must be greater than min_size[%d].", i, + i); + num_priors += 1; + } + } + + PADDLE_ENFORCE_EQ(variances.size(), 4, "Must and only provide 4 variance."); + for (size_t i = 0; i < variances.size(); ++i) { + PADDLE_ENFORCE_GT(variances[i], 0.0, + "variance[%d] must be greater than 0.", i); + } + + const float step_h = ctx->Attrs().Get("step_h"); + PADDLE_ENFORCE_GT(step_h, 0.0, "step_h should be larger than 0."); + const float step_w = ctx->Attrs().Get("step_w"); + PADDLE_ENFORCE_GT(step_w, 0.0, "step_w should be larger than 0."); + + std::vector dim_vec(4); + dim_vec[0] = input_dims[2]; + dim_vec[1] = input_dims[3]; + dim_vec[2] = num_priors; + dim_vec[3] = 4; + ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec)); + ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec)); + } +}; + +class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker { + public: + PriorBoxOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Input", + "(Tensor, default Tensor), " + "the input feature data of PriorBoxOp, The layout is NCHW."); + AddInput("Image", + "(Tensor, default Tensor), " + "the input image data of PriorBoxOp, The layout is NCHW."); + AddOutput("Boxes", + "(Tensor, default Tensor), the output prior boxes of " + "PriorBoxOp. The layout is [H, W, num_priors, 4]. " + "H is the height of input, W is the width of input, num_priors " + "is the box count of each position."); + AddOutput("Variances", + "(Tensor, default Tensor), the expanded variances of " + "PriorBoxOp. The layout is [H, W, num_priors, 4]. " + "H is the height of input, W is the width of input, num_priors " + "is the box count of each position."); + AddAttr>("min_sizes", "(vector) ", + "List of min sizes of generated prior boxes."); + AddAttr>("max_sizes", "(vector) ", + "List of max sizes of generated prior boxes."); + AddAttr>( + "aspect_ratios", "(vector) ", + "List of aspect ratios of generated prior boxes."); + AddAttr>( + "variances", "(vector) ", + "List of variances to be encoded in prior boxes."); + AddAttr("flip", "(bool) ", "Whether to flip aspect ratios.") + .SetDefault(true); + AddAttr("clip", "(bool) ", "Whether to clip out-of-boundary boxes.") + .SetDefault(true); + AddAttr("step_w", + "Prior boxes step across width, 0 for auto calculation.") + .SetDefault(0.0); + AddAttr("step_h", + "Prior boxes step across height, 0 for auto calculation.") + .SetDefault(0.0); + AddAttr("offset", + "(float) " + "Prior boxes center offset.") + .SetDefault(0.5); + AddComment(R"DOC( +Prior box operator +Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm. +Each position of the input produce N prior boxes, N is determined by + the count of min_sizes, max_sizes and aspect_ratios, The size of the + box is in range(min_size, max_size) interval, which is generated in + sequence according to the aspect_ratios. + +Please get more information from the following papers: +https://arxiv.org/abs/1512.02325. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(prior_box, ops::PriorBoxOp, ops::PriorBoxOpMaker); +REGISTER_OP_CPU_KERNEL( + prior_box, ops::PriorBoxOpKernel, + ops::PriorBoxOpKernel); diff --git a/paddle/operators/prior_box_op.h b/paddle/operators/prior_box_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e0a663ace8f38c2d08fd4714c1247d3313ffae3e --- /dev/null +++ b/paddle/operators/prior_box_op.h @@ -0,0 +1,188 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/framework/op_registry.h" +#include "paddle/operators/math/math_function.h" +#include "paddle/platform/transform.h" + +namespace paddle { +namespace operators { + +inline void ExpandAspectRatios(const std::vector& input_aspect_ratior, + bool flip, + std::vector& output_aspect_ratior) { + constexpr float epsilon = 1e-6; + output_aspect_ratior.clear(); + output_aspect_ratior.push_back(1.); + for (size_t i = 0; i < input_aspect_ratior.size(); ++i) { + float ar = input_aspect_ratior[i]; + bool already_exist = false; + for (size_t j = 0; j < output_aspect_ratior.size(); ++j) { + if (fabs(ar - output_aspect_ratior[j]) < epsilon) { + already_exist = true; + break; + } + } + if (!already_exist) { + output_aspect_ratior.push_back(ar); + if (flip) { + output_aspect_ratior.push_back(1. / ar); + } + } + } +} + +template +struct ClipFunctor { + HOSTDEVICE T operator()(T in) const { + return std::min(std::max(in, 0.), 1.); + } +}; + +template +class PriorBoxOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* image = ctx.Input("Image"); + auto* boxes = ctx.Output("Boxes"); + auto* vars = ctx.Output("Variances"); + + auto min_sizes = ctx.Attr>("min_sizes"); + auto max_sizes = ctx.Attr>("max_sizes"); + auto input_aspect_ratio = ctx.Attr>("aspect_ratios"); + auto variances = ctx.Attr>("variances"); + auto flip = ctx.Attr("flip"); + auto clip = ctx.Attr("clip"); + + std::vector aspect_ratios; + ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios); + + T step_w = static_cast(ctx.Attr("step_w")); + T step_h = static_cast(ctx.Attr("step_h")); + T offset = static_cast(ctx.Attr("offset")); + + auto img_width = image->dims()[3]; + auto img_height = image->dims()[2]; + + auto feature_width = input->dims()[3]; + auto feature_height = input->dims()[2]; + + T step_width, step_height; + if (step_w == 0 || step_h == 0) { + step_width = static_cast(img_width) / feature_width; + step_height = static_cast(img_height) / feature_height; + } else { + step_width = step_w; + step_height = step_h; + } + + int num_priors = aspect_ratios.size() * min_sizes.size(); + if (max_sizes.size() > 0) { + num_priors += max_sizes.size(); + } + + boxes->mutable_data(ctx.GetPlace()); + vars->mutable_data(ctx.GetPlace()); + + auto e_boxes = framework::EigenTensor::From(*boxes); + for (int h = 0; h < feature_height; ++h) { + for (int w = 0; w < feature_width; ++w) { + T center_x = (w + offset) * step_width; + T center_y = (h + offset) * step_height; + T box_width, box_height; + int idx = 0; + for (size_t s = 0; s < min_sizes.size(); ++s) { + int min_size = min_sizes[s]; + // first prior: aspect_ratio = 1, size = min_size + box_width = box_height = min_size; + // xmin + e_boxes(h, w, idx, 0) = (center_x - box_width / 2.) / img_width; + // ymin + e_boxes(h, w, idx, 1) = (center_y - box_height / 2.) / img_height; + // xmax + e_boxes(h, w, idx, 2) = (center_x + box_width / 2.) / img_width; + // ymax + e_boxes(h, w, idx, 3) = (center_y + box_height / 2.) / img_height; + + idx++; + if (max_sizes.size() > 0) { + int max_size = max_sizes[s]; + // second prior: aspect_ratio = 1, + // size = sqrt(min_size * max_size) + box_width = box_height = sqrt(min_size * max_size); + // xmin + e_boxes(h, w, idx, 0) = (center_x - box_width / 2.) / img_width; + // ymin + e_boxes(h, w, idx, 1) = (center_y - box_height / 2.) / img_height; + // xmax + e_boxes(h, w, idx, 2) = (center_x + box_width / 2.) / img_width; + // ymax + e_boxes(h, w, idx, 3) = (center_y + box_height / 2.) / img_height; + idx++; + } + + // rest of priors + for (size_t r = 0; r < aspect_ratios.size(); ++r) { + float ar = aspect_ratios[r]; + if (fabs(ar - 1.) < 1e-6) { + continue; + } + box_width = min_size * sqrt(ar); + box_height = min_size / sqrt(ar); + // xmin + e_boxes(h, w, idx, 0) = (center_x - box_width / 2.) / img_width; + // ymin + e_boxes(h, w, idx, 1) = (center_y - box_height / 2.) / img_height; + // xmax + e_boxes(h, w, idx, 2) = (center_x + box_width / 2.) / img_width; + // ymax + e_boxes(h, w, idx, 3) = (center_y + box_height / 2.) / img_height; + idx++; + } + } + } + } + + if (clip) { + platform::Transform trans; + ClipFunctor clip_func; + trans(ctx.template device_context(), + boxes->data(), boxes->data() + boxes->numel(), + boxes->data(), clip_func); + } + + framework::Tensor var_t; + var_t.mutable_data( + framework::make_ddim({1, static_cast(variances.size())}), + ctx.GetPlace()); + auto var_et = framework::EigenTensor::From(var_t); + for (size_t i = 0; i < variances.size(); ++i) { + var_et(0, i) = variances[i]; + } + + int box_num = feature_height * feature_width * num_priors; + auto var_dim = vars->dims(); + vars->Resize({box_num, static_cast(variances.size())}); + + auto e_vars = framework::EigenMatrix::From(*vars); + e_vars = var_et.broadcast(Eigen::DSizes(box_num, 1)); + + vars->Resize(var_dim); + } +}; // namespace operators + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/reshape_op.cc b/paddle/operators/reshape_op.cc index 58e8fd6124d8c076337ae9bb2f5103e7a3cb7ff0..b9743a5df1092917d13a50aa20ea7e7c52b8d151 100644 --- a/paddle/operators/reshape_op.cc +++ b/paddle/operators/reshape_op.cc @@ -90,14 +90,10 @@ Reshape Operator. Reshape Input(X) into the shape specified by Attr(shape). An example: -Given a 2-D tensor X with 2 rows and 2 columns - - [[1, 2], [3, 4]] +Given a 2-D tensor X with 2 rows and 2 columns : [[1, 2], [3, 4]] and target shape = [1, 4], the reshape operator will transform -the tensor X into a 2-D tensor: - - [[1, 2, 3, 4]] +the tensor X into a 2-D tensor: [[1, 2, 3, 4]] One dimension in the target shape can be set -1, representing that its size is unknown. In this case, the real dimension will be infered from diff --git a/paddle/operators/sequence_expand_op.h b/paddle/operators/sequence_expand_op.h index 2ba628e9c37278025e31779ab0468db46f2ff40a..6021526eee8e0a1f58885f6de38b14048787a828 100644 --- a/paddle/operators/sequence_expand_op.h +++ b/paddle/operators/sequence_expand_op.h @@ -32,6 +32,7 @@ class SequenceExpandKernel : public framework::OpKernel { const T* x_data = x->data(); auto x_dims = x->dims(); auto* y = context.Input("Y"); + PADDLE_ENFORCE(!y->lod().empty(), "y should have lod"); PADDLE_ENFORCE_EQ(static_cast(x_dims[0]), y->lod().back().size() - 1, "The size of last lod level in Input(Y)" diff --git a/paddle/operators/sequence_reshape_op.h b/paddle/operators/sequence_reshape_op.h index c6f528ab8a73294bb8ee91425f34e44c66f1932c..aaae7ab29281b72848515b80cc60931c13a294c9 100644 --- a/paddle/operators/sequence_reshape_op.h +++ b/paddle/operators/sequence_reshape_op.h @@ -35,7 +35,7 @@ class SequenceReshapeKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(in_lod.size(), 1UL, "Only support one level sequence now."); PADDLE_ENFORCE_EQ( - in_dims[0], in_lod[0].back(), + (uint64_t)in_dims[0], in_lod[0].back(), "Inconsistent size between X.shape[0] and X.lod()[0].back()."); auto in_lod_l0 = in_lod[0]; diff --git a/paddle/operators/top_k_op.h b/paddle/operators/top_k_op.h index e9cd9bbd4d964c28f305fb4ab4c4733ed27ebfff..bf42e15e6b234125d9ec24e8500367b9915213ab 100644 --- a/paddle/operators/top_k_op.h +++ b/paddle/operators/top_k_op.h @@ -22,6 +22,7 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; template @@ -33,9 +34,9 @@ class TopkKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { // Get the top k elements of each row of input tensor // FIXME: only deal with matrix(2d tensor). - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* indices = ctx.Output("Indices"); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + auto* indices = ctx.Output("Indices"); // k is determined by Attr const size_t k = static_cast(ctx.Attr("k")); diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index 3742594a504ed728019ac9665c022503748bea01..d68caea99719b37816391f9bddcc5cac051025b2 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -10,7 +10,7 @@ cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce) -cc_library(place SRCS place.cc DEPS enforce) +cc_library(place SRCS place.cc DEPS enforce boost) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) add_subdirectory(dynload) diff --git a/paddle/platform/call_once.h b/paddle/platform/call_once.h index 00337a7f051758559a0f8012d8c78dbe8e3457a6..44a4d38f679ddf6c317e52132b6cf3eb2f0a0649 100644 --- a/paddle/platform/call_once.h +++ b/paddle/platform/call_once.h @@ -29,20 +29,25 @@ namespace platform { */ template inline void call_once(std::once_flag& flag, Callable&& f, Args&&... args) { - bool good = false; + bool good = true; std::exception ex; - std::call_once(flag, - [&](Args&&... args) { - try { - f(args...); - good = true; - } catch (const std::exception& e) { - ex = e; - } catch (...) { - ex = std::runtime_error("excption caught in call_once"); - } - }, - args...); + try { + std::call_once(flag, + [&](Args&&... args) { + try { + f(args...); + } catch (const std::exception& e) { + ex = e; + good = false; + } catch (...) { + ex = std::runtime_error("excption caught in call_once"); + good = false; + } + }, + args...); + } catch (std::system_error& x) { + throw std::runtime_error("call once failed"); + } if (!good) { throw std::exception(ex); } diff --git a/paddle/platform/profiler.cc b/paddle/platform/profiler.cc index 7e2e2d968ef877f6aa8b87ab8f044e89574dffa9..2a8afc940393baaaa939471f50f2d5c63edd6a84 100644 --- a/paddle/platform/profiler.cc +++ b/paddle/platform/profiler.cc @@ -47,16 +47,16 @@ inline uint64_t GetTimeInNsec() { } Event::Event(EventKind kind, std::string name, uint32_t thread_id, - DeviceContext* dev_ctx) + const DeviceContext* dev_ctx) : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) { #ifdef PADDLE_WITH_CUDA - auto* cuda_dev_ctx = static_cast(dev_ctx); - if (cuda_dev_ctx) { + has_cuda_ = dev_ctx ? platform::is_gpu_place(dev_ctx->GetPlace()) : false; + if (has_cuda_) { + auto* cuda_dev_ctx = static_cast(dev_ctx); PADDLE_ENFORCE(cudaGetDevice(&device_)); PADDLE_ENFORCE(cudaEventCreate(&event_)); auto stream = cuda_dev_ctx->stream(); PADDLE_ENFORCE(cudaEventRecord(event_, stream)); - has_cuda_ = true; } #endif cpu_ns_ = GetTimeInNsec(); @@ -114,19 +114,20 @@ inline EventList& GetEventList() { return *g_event_list; } -void Mark(const std::string& name, DeviceContext* dev_ctx) { +void Mark(const std::string& name, const DeviceContext* dev_ctx) { GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx); } -void PushEvent(const std::string& name, DeviceContext* dev_ctx) { +void PushEvent(const std::string& name, const DeviceContext* dev_ctx) { GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx); } -void PopEvent(const std::string& name, DeviceContext* dev_ctx) { +void PopEvent(const std::string& name, const DeviceContext* dev_ctx) { GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx); } -RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx) { +RecordEvent::RecordEvent(const std::string& name, + const DeviceContext* dev_ctx) { if (g_state == ProfilerState::kDisabled) return; dev_ctx_ = dev_ctx; name_ = name; @@ -155,6 +156,7 @@ void EnableProfiler(ProfilerState state) { DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d)); Mark("_cuda_startup_", dev_ctx); dev_ctx->Wait(); + delete dev_ctx; }); } } @@ -163,14 +165,17 @@ void EnableProfiler(ProfilerState state) { Mark("_start_profiler_", nullptr); } -std::vector> DisableProfiler() { - PADDLE_ENFORCE(g_state != ProfilerState::kDisabled, - "Can't disable profiling, since it's not starting."); - // Mark the profiling stop. - Mark("_stop_profiler_", nullptr); - g_state = ProfilerState::kDisabled; - std::vector> result; +void ResetProfiler() { std::lock_guard guard(g_all_event_lists_mutex); + for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end(); + ++it) { + (*it)->Clear(); + } +} + +std::vector> GetAllEvents() { + std::lock_guard guard(g_all_event_lists_mutex); + std::vector> result; for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end(); ++it) { result.emplace_back((*it)->Reduce()); @@ -178,6 +183,18 @@ std::vector> DisableProfiler() { return result; } +void DisableProfiler(EventSortingKey sorted_key) { + PADDLE_ENFORCE(g_state != ProfilerState::kDisabled, + "Can't disable profiling, since it's not starting."); + // Mark the profiling stop. + Mark("_stop_profiler_", nullptr); + g_state = ProfilerState::kDisabled; + + std::vector> all_events = GetAllEvents(); + ParseEvents(all_events, sorted_key); + ResetProfiler(); +} + void ParseEvents(std::vector>& events, EventSortingKey sorted_by) { if (g_profiler_place == "") return; @@ -291,12 +308,12 @@ void ParseEvents(std::vector>& events, } // Print report - PrintProfilingReport(events_table, sorted_domain, max_name_width + 4, 12); + PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12); } -void PrintProfilingReport(std::vector>& events_table, - std::string& sorted_domain, const size_t name_width, - const size_t data_width) { +void PrintProfiler(std::vector>& events_table, + std::string& sorted_domain, const size_t name_width, + const size_t data_width) { // Output header information std::cout << "\n------------------------->" << " Profiling Report " diff --git a/paddle/platform/profiler.h b/paddle/platform/profiler.h index 6df48ef8806e865f473b4317ac0283863c3c6f64..8de1e6ad296d1e15c1659ccf431f1d5013eb608c 100644 --- a/paddle/platform/profiler.h +++ b/paddle/platform/profiler.h @@ -29,7 +29,7 @@ class Event { // The DeviceContext is used to get the cuda stream. // If CPU profiling mode, can pass nullptr. Event(EventKind kind, std::string name, uint32_t thread_id, - DeviceContext* dev_ctx); + const DeviceContext* dev_ctx); std::string kind() const; std::string name() const { return name_; } @@ -84,6 +84,8 @@ struct EventList { return result; } + void Clear() { event_blocks.clear(); } + std::forward_list> event_blocks; }; @@ -93,29 +95,26 @@ enum ProfilerState { kCUDA, // GPU profiling state }; -void Mark(const std::string& name, DeviceContext* dev_ctx); +void Mark(const std::string& name, const DeviceContext* dev_ctx); -void PushEvent(const std::string& name, DeviceContext* dev_ctx); +void PushEvent(const std::string& name, const DeviceContext* dev_ctx); -void PopEvent(const std::string& name, DeviceContext* dev_ctx); +void PopEvent(const std::string& name, const DeviceContext* dev_ctx); struct RecordEvent { - explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx); + explicit RecordEvent(const std::string& name, const DeviceContext* dev_ctx); ~RecordEvent(); // The device context is used by Event to get the current cuda stream. - DeviceContext* dev_ctx_; + const DeviceContext* dev_ctx_; // Event name std::string name_; }; -// Enable the profiling function. -void EnableProfiler(ProfilerState state); - // Return the event list of all threads. Asummed the returned value calls // event_lists, event_lists[i][j] represents the j-th Event of i-th thread. -std::vector> DisableProfiler(); +std::vector> GetAllEvents(); // The information of each event given in the profiling report struct EventItem { @@ -130,13 +129,22 @@ struct EventItem { // Candidate keys to sort the profiling report enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve }; +// Enable the profiling function. +void EnableProfiler(ProfilerState state); + +// Clear the g_all_event_lists, which is total event lists of all threads. +void ResetProfiler(); + +void DisableProfiler(EventSortingKey sorted_key); + // Parse the event list and output the profiling report void ParseEvents(std::vector>&, EventSortingKey sorted_by = EventSortingKey::kDefault); // Print results -void PrintProfilingReport(std::vector>& events_table, - std::string& sorted_domain, const size_t name_width, - const size_t data_width); +void PrintProfiler(std::vector>& events_table, + std::string& sorted_domain, const size_t name_width, + const size_t data_width); + } // namespace platform } // namespace paddle diff --git a/paddle/platform/profiler_test.cc b/paddle/platform/profiler_test.cc index 13dea713c71e147ed5dd8d090e92d86c96256c09..81f10c91342f76910cc780b0ebd0c0df04e9d7bf 100644 --- a/paddle/platform/profiler_test.cc +++ b/paddle/platform/profiler_test.cc @@ -103,18 +103,14 @@ TEST(RecordEvent, RecordEvent) { // Bad Usage: PushEvent("event_without_pop", dev_ctx); PopEvent("event_without_push", dev_ctx); - std::vector> events = paddle::platform::DisableProfiler(); - // Will remove parsing-related code from test later - ParseEvents(events, EventSortingKey::kTotal); + std::vector> events = paddle::platform::GetAllEvents(); int cuda_startup_count = 0; int start_profiler_count = 0; - int stop_profiler_count = 0; for (size_t i = 0; i < events.size(); ++i) { for (size_t j = 0; j < events[i].size(); ++j) { if (events[i][j].name() == "_cuda_startup_") ++cuda_startup_count; if (events[i][j].name() == "_start_profiler_") ++start_profiler_count; - if (events[i][j].name() == "_stop_profiler_") ++stop_profiler_count; if (events[i][j].name() == "push") { EXPECT_EQ(events[i][j + 1].name(), "pop"); #ifdef PADDLE_WITH_CUDA @@ -127,5 +123,7 @@ TEST(RecordEvent, RecordEvent) { } EXPECT_EQ(cuda_startup_count % 5, 0); EXPECT_EQ(start_profiler_count, 1); - EXPECT_EQ(stop_profiler_count, 1); + + // Will remove parsing-related code from test later + DisableProfiler(EventSortingKey::kTotal); } diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt index 7b374307071d2da91a677361b404448f1a3816b0..de53fea0dd692167d61fcca552cc834a7916e209 100644 --- a/paddle/pybind/CMakeLists.txt +++ b/paddle/pybind/CMakeLists.txt @@ -1,7 +1,7 @@ if(WITH_PYTHON) cc_library(paddle_pybind SHARED SRCS pybind.cc exception.cc protobuf.cc const_value.cc - DEPS pybind python backward proto_desc paddle_memory executor prune init + DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method ${GLOB_OP_LIB}) if(NOT APPLE AND NOT ANDROID) target_link_libraries(paddle_pybind rt) diff --git a/paddle/pybind/print_operators_doc.cc b/paddle/pybind/print_operators_doc.cc index 99694fa592059d979297b72748125d02b2dd70a3..b55ddee17616ced4de659be8e55acd5e072c66b7 100644 --- a/paddle/pybind/print_operators_doc.cc +++ b/paddle/pybind/print_operators_doc.cc @@ -64,6 +64,8 @@ std::string AttrType(paddle::framework::proto::AttrType at) { return "bool array"; case paddle::framework::proto::BLOCK: return "block id"; + case paddle::framework::proto::LONG: + return "long"; } return "UNKNOWN"; // not possible } diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc index 4f959481537d29c089be24f9ae306f860c196c0f..371d6119d4ab73e683821d0dc5db5194f44a64ce 100644 --- a/paddle/pybind/protobuf.cc +++ b/paddle/pybind/protobuf.cc @@ -212,6 +212,7 @@ void BindVarDsec(py::module &m) { return name; }, py::return_value_policy::reference) + .def("set_name", &VarDesc::SetName) .def("set_shape", &VarDesc::SetShape) .def("set_dtype", &VarDesc::SetDataType) .def("shape", &VarDesc::Shape, py::return_value_policy::reference) @@ -280,7 +281,8 @@ void BindOpDesc(py::module &m) { .def("check_attrs", &OpDesc::CheckAttrs) .def("infer_shape", &OpDesc::InferShape) .def("infer_var_type", &OpDesc::InferVarType) - .def("serialize_to_string", SerializeMessage); + .def("serialize_to_string", SerializeMessage) + .def("block", &OpDesc::Block, py::return_value_policy::reference); } } // namespace pybind diff --git a/paddle/pybind/protobuf.h b/paddle/pybind/protobuf.h index 089183accc08c3c486a7ae78ccfe060853ec54f5..9e747e9ea60fd95c74937daa283bc7a9eb9368c0 100644 --- a/paddle/pybind/protobuf.h +++ b/paddle/pybind/protobuf.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include "paddle/platform/variant.h" #include "pybind11/numpy.h" #include "pybind11/pybind11.h" #include "pybind11/stl.h" diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index c5d70bc9f91bc92b28a546cc79b08a9fda150050..490397afdd4de0cc1aafde746d31b1d800eded3b 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -30,6 +30,7 @@ limitations under the License. */ #include "paddle/operators/net_op.h" #include "paddle/platform/enforce.h" #include "paddle/platform/place.h" +#include "paddle/platform/profiler.h" #include "paddle/pybind/const_value.h" #include "paddle/pybind/exception.h" #include "paddle/pybind/pybind.h" @@ -52,7 +53,7 @@ static size_t UniqueIntegerGenerator(const std::string &prefix) { return generators[prefix].fetch_add(1); } -bool IsCompileGPU() { +bool IsCompiledWithCUDA() { #ifndef PADDLE_WITH_CUDA return false; #else @@ -423,14 +424,16 @@ All parameter, weight, gradient are variables in Paddle. py::class_(m, "Executor") .def(py::init()) - .def("run", &Executor::Run); + .def("run", + (void (Executor::*)(const ProgramDesc &, Scope *, int, bool, bool)) & + Executor::Run); m.def("unique_integer", UniqueIntegerGenerator); m.def("init_gflags", framework::InitGflags); m.def("init_glog", framework::InitGLOG); m.def("init_devices", &framework::InitDevices); - m.def("is_compile_gpu", IsCompileGPU); + m.def("is_compiled_with_cuda", IsCompiledWithCUDA); m.def("set_feed_variable", framework::SetFeedVariable); m.def("get_fetch_variable", framework::GetFetchVariable); @@ -476,6 +479,24 @@ All parameter, weight, gradient are variables in Paddle. m.def("nvprof_stop", platform::CudaProfilerStop); #endif + py::enum_(m, "ProfilerState", py::arithmetic()) + .value("kDisabled", platform::ProfilerState::kDisabled) + .value("kCPU", platform::ProfilerState::kCPU) + .value("kCUDA", platform::ProfilerState::kCUDA) + .export_values(); + + py::enum_(m, "EventSortingKey", py::arithmetic()) + .value("kDefault", platform::EventSortingKey::kDefault) + .value("kCalls", platform::EventSortingKey::kCalls) + .value("kTotal", platform::EventSortingKey::kTotal) + .value("kMin", platform::EventSortingKey::kMin) + .value("kMax", platform::EventSortingKey::kMax) + .value("kAve", platform::EventSortingKey::kAve) + .export_values(); + + m.def("enable_profiler", platform::EnableProfiler); + m.def("disable_profiler", platform::DisableProfiler); + m.def("reset_profiler", platform::ResetProfiler); return m.ptr(); } } // namespace pybind diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py index 1f041c74597637a7b74e9690a60b6cd8fdd21cf8..787416aed1acf81138df06110317614dfe77fb48 100644 --- a/python/paddle/v2/fluid/__init__.py +++ b/python/paddle/v2/fluid/__init__.py @@ -89,7 +89,7 @@ def __bootstrap__(): read_env_flags = [ 'use_pinned_memory', 'check_nan_inf', 'do_memory_benchmark' ] - if core.is_compile_gpu(): + if core.is_compiled_with_cuda(): read_env_flags += ['fraction_of_gpu_memory_to_use', 'op_sync'] core.init_gflags([sys.argv[0]] + ["--tryfromenv=" + ",".join(read_env_flags)]) diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index ae81d68bafd22db5d9f7ab0f9cc0dcdb204493e1..29243c90e872ca4a7d1ce6f84f6297b865655da1 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -178,7 +178,7 @@ def _remove_no_grad_branch_(op_descs, no_grad_set): if _all_in_set_( filter(lambda name: name.find(core.grad_var_suffix()) != -1, op_desc.input_arg_names()), no_grad_set): - no_grad_set.union(out_arg_names) + no_grad_set.update(out_arg_names) return True return False diff --git a/python/paddle/v2/fluid/distribute_transpiler.py b/python/paddle/v2/fluid/distribute_transpiler.py index abcad899bfac9ba3eff20cde825e136d867a4485..908810c8be1d262c459825a49a61c3b3d826c1d0 100644 --- a/python/paddle/v2/fluid/distribute_transpiler.py +++ b/python/paddle/v2/fluid/distribute_transpiler.py @@ -33,6 +33,10 @@ class VarBlock: return "%s:%d:%d" % (self.varname, self.offset, self.size) +def same_or_split_var(p_name, var_name): + return p_name == var_name or p_name.startswith(var_name + ".block") + + def split_dense_variable(var_list, pserver_count, min_block_size=1024, @@ -221,7 +225,7 @@ class DistributeTranspiler: if len(splited_vars) <= 1: continue orig_var = program.global_block().vars[varname] - if orig_var == core.VarDesc.VarType.SELECTED_ROWS: + if orig_var.type == core.VarDesc.VarType.SELECTED_ROWS: height_sections = [] for v in splited_vars: height_sections.append(v.shape[0]) @@ -230,7 +234,7 @@ class DistributeTranspiler: inputs={"X": orig_var}, outputs={"Out": splited_vars}, attrs={"height_sections": height_sections}) - elif orig_var == core.VarDesc.VarType.LOD_TENSOR: + elif orig_var.type == core.VarDesc.VarType.LOD_TENSOR: sections = [] for v in splited_vars: sections.append(v.shape[0]) @@ -303,8 +307,8 @@ class DistributeTranspiler: return True else: for n in param_names: - if n.startswith(op.inputs["Param"].name+".block") and \ - n != op.inputs["Param"].name: + if same_or_split_var(n, op.inputs[ + "Param"].name) and n != op.inputs["Param"].name: return True return False else: @@ -335,7 +339,7 @@ class DistributeTranspiler: if key == "Grad": grad_block = None for g in self.param_grad_ep_mapping[endpoint]["grads"]: - if g.name.startswith(var.name): + if same_or_split_var(g.name, var.name): grad_block = g break if not grad_block: @@ -365,7 +369,7 @@ class DistributeTranspiler: # param is already created on global program param_block = None for p in self.param_grad_ep_mapping[endpoint]["params"]: - if p.name.startswith(var.name): + if same_or_split_var(p.name, var.name): param_block = p break if not param_block: @@ -502,7 +506,7 @@ class DistributeTranspiler: def _get_splited_name_and_shape(varname): for idx, splited_param in enumerate(params): pname = splited_param.name - if pname.startswith(varname) and varname != pname: + if same_or_split_var(pname, varname) and varname != pname: return pname, splited_param.shape return "", [] diff --git a/python/paddle/v2/fluid/executor.py b/python/paddle/v2/fluid/executor.py index 9d5ed9571a2fa0a871a25e43b23b1a3c3a6102db..9f48815b8b84426c7d539af4e7d45ea47e69d4d9 100644 --- a/python/paddle/v2/fluid/executor.py +++ b/python/paddle/v2/fluid/executor.py @@ -68,6 +68,84 @@ def as_numpy(tensor): return ans +def has_feed_operators(block, feed_targets, feed_holder_name): + """ Check whether the block already has feed operators. + + Return false if the block does not have any feed operators. + If some feed operators have been prepended to the block, check that + the info contained in these feed operators matches the feed_targets + and feed_holder_name. Raise exception when any mismatch is found. + Return true when the block has feed operators with matching info. + + Args: + block: a block instance (typically global block of a program) + feed_targets: a dictionary of {feed_target_name: feed_target_data} + feed_holder_name: the name of the variable that holds the data of + all feed targets. The type of this feed_holder variable is + FEED_MINIBATCH, which is essentially vector. + + Returns: + A boolean value that indicates whether a block has feed operators + that match the info contained in feed_targets and feed_holder_name. + """ + + feed_count = 0 + for op in block.ops: + if op.desc.type() == 'feed': + feed_count += 1 + assert op.desc.input('X')[0] == feed_holder_name + feed_target_name = op.desc.output('Out')[0] + if feed_target_name not in feed_targets: + raise Exception("'feed_targets' does not have {} variable". + format(feed_target_name)) + else: + break + if feed_count > 0 and feed_count != len(feed_targets): + raise Exception( + "Feed operators in program desc do not match 'feed_targets'") + return feed_count > 0 + + +def has_fetch_operators(block, fetch_targets, fetch_holder_name): + """ Check whether the block already has fetch operators. + + Return false if the block does not have any fetch operators. + If some fetch operators have been appended to the block, check that + the info contained in these fetch operators matches the fetch_targets + and fetch_holder_name. Raise exception when any mismatch is found. + Return true when the block has fetch operators with matching info. + + Args: + block: a block instance (typically global block of a program) + fetch_targets: a dictionary of {fetch_target_name: fetch_target_data} + fetch_holder_name: the name of the variable that holds the data of + all fetch targets. The type of this fetch_holder variable is + FETCH_LIST, which is essentially vector. + + Return: + A boolean value that indicates whether a block has fetch operators + that match the info contained in fetch_targets and fetch_holder_name. + """ + + fetch_count = 0 + for op in block.ops: + if op.desc.type() == 'fetch': + fetch_count += 1 + assert op.desc.output('Out')[0] == fetch_holder_name + fetch_target_name = op.desc.input('X')[0] + if fetch_target_name not in [ + var.desc.name() for var in fetch_targets + ]: + raise Exception("'fetch_targets' does not have {} variable". + format(fetch_target_name)) + idx = op.desc.attr('col') + assert fetch_target_name == fetch_targets[idx].desc.name() + if fetch_count > 0 and fetch_count != len(fetch_targets): + raise Exception( + "Fetch operators in program desc do not match 'fetch_targets'") + return fetch_count > 0 + + class Executor(object): def __init__(self, places): if not isinstance(places, list) and not isinstance(places, tuple): @@ -147,33 +225,50 @@ class Executor(object): program = program.clone() global_block = program.global_block() - feed_var = global_block.create_var( - name=feed_var_name, - type=core.VarDesc.VarType.FEED_MINIBATCH, - persistable=True) - - for i, name in enumerate(feed): - out = global_block.var(name) - global_block.prepend_op( - 'feed', - inputs={'X': [feed_var]}, - outputs={'Out': [out]}, - attrs={'col': i}) - cur_feed = feed[name] - if not isinstance(cur_feed, core.LoDTensor): - cur_feed = self.aslodtensor(cur_feed) - core.set_feed_variable(scope, cur_feed, feed_var.name, i) - - fetch_var = global_block.create_var( - name=fetch_var_name, - type=core.VarDesc.VarType.FETCH_LIST, - persistable=True) - for i, var in enumerate(fetch_list): - global_block.append_op( - type='fetch', - inputs={'X': [var]}, - outputs={'Out': [fetch_var]}, - attrs={'col': i}) + + if feed_var_name in global_block.vars: + feed_var = global_block.var(feed_var_name) + else: + feed_var = global_block.create_var( + name=feed_var_name, + type=core.VarDesc.VarType.FEED_MINIBATCH, + persistable=True) + + if fetch_var_name in global_block.vars: + fetch_var = global_block.var(fetch_var_name) + else: + fetch_var = global_block.create_var( + name=fetch_var_name, + type=core.VarDesc.VarType.FETCH_LIST, + persistable=True) + + if not has_feed_operators(global_block, feed, feed_var_name): + for i, name in enumerate(feed): + out = global_block.var(name) + global_block.prepend_op( + type='feed', + inputs={'X': [feed_var]}, + outputs={'Out': [out]}, + attrs={'col': i}) + + for op in global_block.ops: + if op.desc.type() == 'feed': + feed_target_name = op.desc.output('Out')[0] + cur_feed = feed[feed_target_name] + if not isinstance(cur_feed, core.LoDTensor): + cur_feed = self.aslodtensor(cur_feed) + idx = op.desc.attr('col') + core.set_feed_variable(scope, cur_feed, feed_var_name, idx) + else: + break + + if not has_fetch_operators(global_block, fetch_list, fetch_var_name): + for i, var in enumerate(fetch_list): + global_block.append_op( + type='fetch', + inputs={'X': [var]}, + outputs={'Out': [fetch_var]}, + attrs={'col': i}) self.executor.run(program.desc, scope, 0, True, True) outs = [ diff --git a/python/paddle/v2/fluid/io.py b/python/paddle/v2/fluid/io.py index 376d6013a38923014fa35e964e58d7f56bf80546..d56ec45c538b580f5520bc060b4b339bb1be0539 100644 --- a/python/paddle/v2/fluid/io.py +++ b/python/paddle/v2/fluid/io.py @@ -13,8 +13,8 @@ # limitations under the License. import os -import cPickle as pickle +from paddle.v2.fluid.evaluator import Evaluator from paddle.v2.fluid.framework import Program, Parameter, default_main_program, Variable from . import core @@ -187,18 +187,28 @@ def get_inference_program(target_vars, main_program=None): main_program = default_main_program() if not isinstance(target_vars, list): target_vars = [target_vars] - - pruned_program = main_program.prune(targets=target_vars) + vars = [] + for var in target_vars: + if isinstance(var, Evaluator): + vars.extend(var.states) + vars.extend(var.metrics) + else: + vars.append(var) + pruned_program = main_program.prune(targets=vars) inference_program = pruned_program.inference_optimize() return inference_program -def prepend_feed_ops(inference_program, feeded_var_names): +def prepend_feed_ops(inference_program, + feed_target_names, + feed_holder_name='feed'): global_block = inference_program.global_block() feed_var = global_block.create_var( - name='feed', type=core.VarDesc.VarType.FEED_MINIBATCH, persistable=True) + name=feed_holder_name, + type=core.VarDesc.VarType.FEED_MINIBATCH, + persistable=True) - for i, name in enumerate(feeded_var_names): + for i, name in enumerate(feed_target_names): out = global_block.var(name) global_block.prepend_op( type='feed', @@ -207,12 +217,16 @@ def prepend_feed_ops(inference_program, feeded_var_names): attrs={'col': i}) -def append_fetch_ops(inference_program, fetch_var_names): +def append_fetch_ops(inference_program, + fetch_target_names, + fetch_holder_name='fetch'): global_block = inference_program.global_block() fetch_var = global_block.create_var( - name='fetch', type=core.VarDesc.VarType.FETCH_LIST, persistable=True) + name=fetch_holder_name, + type=core.VarDesc.VarType.FETCH_LIST, + persistable=True) - for i, name in enumerate(fetch_var_names): + for i, name in enumerate(fetch_target_names): global_block.append_op( type='fetch', inputs={'X': [name]}, @@ -262,21 +276,12 @@ def save_inference_model(dirname, inference_program = pruned_program.inference_optimize() fetch_var_names = [v.name for v in target_vars] - model_file_name = dirname + "/__model__" - with open(model_file_name, "w") as f: - pickle.dump({ - "program_desc_str": inference_program.desc.serialize_to_string(), - "feed_var_names": feeded_var_names, - "fetch_var_names": fetch_var_names - }, f, -1) - prepend_feed_ops(inference_program, feeded_var_names) append_fetch_ops(inference_program, fetch_var_names) - # Save only programDesc of inference_program in binary format - # in another file: __model__.dat - with open(model_file_name + ".dat", "wb") as fp: - fp.write(inference_program.desc.serialize_to_string()) + model_file_name = dirname + "/__model__" + with open(model_file_name, "wb") as f: + f.write(inference_program.desc.serialize_to_string()) save_params(executor, dirname, main_program) @@ -299,6 +304,24 @@ def load_persistables_if_exist(executor, dirname, main_program=None): predicate=_is_presistable_and_exist_) +def get_feed_targets_names(program): + feed_targets_names = [] + global_block = program.global_block() + for op in global_block.ops: + if op.desc.type() == 'feed': + feed_targets_names.insert(0, op.desc.output('Out')[0]) + return feed_targets_names + + +def get_fetch_targets_names(program): + fetch_targets_names = [] + global_block = program.global_block() + for op in global_block.ops: + if op.desc.type() == 'fetch': + fetch_targets_names.append(op.desc.input('X')[0]) + return fetch_targets_names + + def load_inference_model(dirname, executor): """ Load inference model from a directory @@ -306,24 +329,28 @@ def load_inference_model(dirname, executor): :param dirname: directory path :param executor: executor that load inference model - :return: [program, feed_var_names, fetch_var_names] + :return: [program, feed_target_names, fetch_targets] program: program especially for inference. - feeded_var_names: Names of variables that need to feed data - fetch_vars: Variables from which we can get inference results. + feed_target_names: Names of variables that need to feed data + fetch_targets: Variables from which we can get inference results. """ if not os.path.isdir(dirname): raise ValueError("There is no directory named '%s'", dirname) model_file_name = dirname + "/__model__" - model = pickle.load(open(model_file_name, "r")) - program_desc_str = model["program_desc_str"] - feed_var_names = model["feed_var_names"] - fetch_var_names = model["fetch_var_names"] + with open(model_file_name, "rb") as f: + program_desc_str = f.read() + program = Program.parse_from_string(program_desc_str) load_persistables_if_exist(executor, dirname, program) - fetch_vars = [program.global_block().var(name) for name in fetch_var_names] - return [program, feed_var_names, fetch_vars] + feed_target_names = get_feed_targets_names(program) + fetch_target_names = get_fetch_targets_names(program) + fetch_targets = [ + program.global_block().var(name) for name in fetch_target_names + ] + + return [program, feed_target_names, fetch_targets] def get_parameter_value(para, executor): diff --git a/python/paddle/v2/fluid/layer_helper.py b/python/paddle/v2/fluid/layer_helper.py index 368cf8ed2e8bae46cb35738e7d5351cfea910968..cc4822735aa996f3caf81fefa78da43a416d4579 100644 --- a/python/paddle/v2/fluid/layer_helper.py +++ b/python/paddle/v2/fluid/layer_helper.py @@ -100,7 +100,8 @@ class LayerHelper(object): if dtype is None: dtype = each.dtype elif dtype != each.dtype: - raise ValueError("Data Type mismatch") + raise ValueError("Data Type mismatch: %d to %d" % + (dtype, each.dtype)) return dtype def _create_weight_normalize(self, attr, shape, dtype): @@ -281,6 +282,7 @@ class LayerHelper(object): is_bias=False, default_initializer=None): # Deepcopy the attr so that parameters can be shared in program + attr = copy.deepcopy(attr) assert isinstance(attr, ParamAttr) suffix = 'b' if is_bias else 'w' if attr.name is None: diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py index 5f01fdb076d3bf7d060a805d1431f4973993a843..0fcbfe0e2f2f9686366139e84b7fdcc158bf0aa7 100644 --- a/python/paddle/v2/fluid/layers/control_flow.py +++ b/python/paddle/v2/fluid/layers/control_flow.py @@ -289,6 +289,7 @@ class ParallelDo(object): for in_var_name in op.input(iname): if in_var_name not in local_inputs: params.append(in_var_name) + params = list(set(params)) return [parent_block.var(name) for name in params] @@ -769,7 +770,7 @@ def topk(input, k): array = fluid.layers.topk(x, k) """ helper = LayerHelper('topk', **locals()) - topk_out = helper.create_tmp_variable(dtype=input.data_type) + topk_out = helper.create_tmp_variable(dtype=input.dtype) topk_indices = helper.create_tmp_variable(dtype='int64') helper.append_op( type='top_k', diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 072119881644c650c3430c70bdab42f8d17df7ba..bae33f6a156b094d6e9cccc9cabb42880f157ac8 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -19,12 +19,14 @@ from ..layer_helper import LayerHelper from ..initializer import Normal, Constant from ..framework import Variable from ..param_attr import ParamAttr +from layer_function_generator import autodoc from tensor import concat __all__ = [ 'fc', 'embedding', 'dynamic_lstm', + 'dynamic_gru', 'gru_unit', 'linear_chain_crf', 'crf_decoding', @@ -57,6 +59,11 @@ __all__ = [ 'warpctc', 'sequence_reshape', 'transpose', + 'im2sequence', + 'nce', + 'beam_search', + 'row_conv', + 'multiplex', ] @@ -104,16 +111,17 @@ def fc(input, into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input tensor is flattened: the first `num_flatten_dims` - dimensions will be flatten to form the first - dimension of the final matrix (height of the - matrix), and the rest `rank(X) - num_flatten_dims` - dimensions are flattened to form the second - dimension of the final matrix (width of the matrix). - For example, suppose `X` is a 6-dimensional tensor - with a shape [2, 3, 4, 5, 6], and - `num_flatten_dims` = 3. Then, the flattened matrix - will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. - By default, `num_flatten_dims` is set to 1. + (inclusive, index starts from 1) dimensions will + be flatten to form the first dimension of the + final matrix (height of the matrix), and the rest + `rank(X) - num_flatten_dims` dimensions are + flattened to form the second dimension of the + final matrix (width of the matrix). For example, + suppose `X` is a 6-dimensional tensor with a shape + [2, 3, 4, 5, 6], and `num_flatten_dims` = 3. Then, + the flattened matrix will have a shape + [2 x 3 x 4, 5 x 6] = [24, 30]. By default, + `num_flatten_dims` is set to 1. param_attr(ParamAttr|list): The parameter attribute for learnable parameters/weights of the fully connected layer. @@ -154,15 +162,14 @@ def fc(input, param_shape = [ reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1) ] + [size] + w = helper.create_parameter( attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False) tmp = helper.create_tmp_variable(dtype) helper.append_op( type="mul", - inputs={ - "X": input_var, - "Y": w, - }, + inputs={"X": input_var, + "Y": w}, outputs={"Out": tmp}, attrs={"x_num_col_dims": num_flatten_dims, "y_num_col_dims": 1}) @@ -181,22 +188,35 @@ def fc(input, return helper.append_activation(pre_activation) -def embedding(input, size, is_sparse=False, param_attr=None, dtype='float32'): +def embedding(input, + size, + is_sparse=False, + padding_idx=None, + param_attr=None, + dtype='float32'): """ **Embedding Layer** - This layer is used to lookup a vector of IDs, provided by *input*, in a lookup table. - The result of this lookup is the embedding of each ID in the *input*. + This layer is used to lookup embeddings of IDs, provided by :attr:`input`, in + a lookup table. The result of this lookup is the embedding of each ID in the + :attr:`input`. All the input variables are passed in as local variables to the LayerHelper constructor. Args: - input(Variable): Input to the function - size(tuple|list|None): Shape of the look up table parameter - is_sparse(bool): Boolean flag that specifying whether the input is sparse - param_attr(ParamAttr): Parameters for this layer - dtype(np.dtype|core.DataType|str): The type of data : float32, float_16, int etc + input(Variable): The tensor variable containing the IDs. + size(tuple|list): The shape of the look up table parameter. It should + have two elements which indicate the size of the dictionary of + embeddings and the size of each embedding vector respectively. + is_sparse(bool): The flag indicating whether to use sparse update. + padding_idx(int|long|None): If :attr:`None`, it makes no effect to lookup. + Otherwise the given :attr:`padding_idx` indicates padding the output + with zeros whenever lookup encounters it in :attr:`input`. If + :math:`padding_idx < 0`, the padding_idx to use in lookup is + :math:`size[0] + dim`. + param_attr(ParamAttr): Parameters for this layer + dtype(np.dtype|core.DataType|str): The type of data : float32, float_16, int etc Returns: Variable: The tensor variable storing the embeddings of the \ @@ -214,12 +234,15 @@ def embedding(input, size, is_sparse=False, param_attr=None, dtype='float32'): w = helper.create_parameter( attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False) tmp = helper.create_tmp_variable(dtype) + padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else ( + size[0] + padding_idx) helper.append_op( type='lookup_table', inputs={'Ids': input, 'W': w}, outputs={'Out': tmp}, - attrs={'is_sparse': is_sparse}) + attrs={'is_sparse': is_sparse, + 'padding_idx': padding_idx}) return tmp @@ -366,6 +389,113 @@ def dynamic_lstm(input, return hidden, cell +def dynamic_gru(input, + size, + param_attr=None, + bias_attr=None, + is_reverse=False, + gate_activation='sigmoid', + candidate_activation='tanh', + h_0=None): + """ + **Dynamic GRU Layer** + + Refer to `Empirical Evaluation of Gated Recurrent Neural Networks on + Sequence Modeling `_ + + The formula is as follows: + + .. math:: + + u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u) + + r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r) + + \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c) + + h_t & = (1-u_t) \odot h_{t-1} + u_t \odot \\tilde{h_t} + + The :math:`\odot` is the element-wise product of the vectors. :math:`act_g` + is the update gate and reset gate activation function and :math:`sigmoid` + is usually used for it. :math:`act_c` is the activation function for + candidate hidden state and :math:`tanh` is usually used for it. + + Note that these :math:`W_{ux}x_{t}, W_{rx}x_{t}, W_{cx}x_{t}` operations on + the input :math:`x_{t}` are NOT included in this operator. Users can choose + to use fully-connect layer before GRU layer. + + Args: + input(Variable): The input of dynamic_gru layer, which supports + variable-time length input sequence. The underlying tensor in this + Variable is a matrix with shape :math:`(T \\times 3D)`, where + :math:`T` is the total time steps in this mini-batch, :math:`D` + is the hidden size. + size(int): The dimension of the gru cell. + param_attr(ParamAttr|None): The parameter attribute for the learnable + hidden-hidden weight matrix. Note: + + - The shape of the weight matrix is :math:`(T \\times 3D)`, where + :math:`D` is the hidden size. + - All elements in the weight matrix can be divided into two parts. + The first part are weights of the update gate and reset gate with + shape :math:`(D \\times 2D)`, and the second part are weights for + candidate hidden state with shape :math:`(D \\times D)`. + bias_attr(ParamAttr): The parameter attribute for learnable the + hidden-hidden bias. + is_reverse(bool): Whether to compute reversed GRU, default + :attr:`False`. + gate_activation(str): The activation for update gate and reset gate. + Choices = ["sigmoid", "tanh", "relu", "identity"], default "sigmoid". + activation(str): The activation for candidate hidden state. + Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh". + + Returns: + Variable: The hidden state of GRU. The shape is (T \\times D), and lod \ + is the same with the input. + + Examples: + .. code-block:: python + + hidden_dim = 512 + x = fluid.layers.fc(input=data, size=hidden_dim * 3) + hidden = fluid.layers.dynamic_gru(input=x, dim=hidden_dim) + """ + + helper = LayerHelper('gru', **locals()) + dtype = helper.input_dtype() + + weight = helper.create_parameter( + attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype) + bias = helper.create_parameter( + attr=helper.bias_attr, shape=[1, 3 * size], dtype=dtype, is_bias=True) + inputs = {'Input': input, 'Weight': weight, 'Bias': bias} + if h_0 != None: + assert h_0.shape == ( + size, size), 'The shape of h0 should be(%d, %d)' % (size, size) + inputs['h0'] = h_0 + + hidden = helper.create_tmp_variable(dtype) + batch_gate = helper.create_tmp_variable(dtype) + batch_reset_hidden_prev = helper.create_tmp_variable(dtype) + batch_hidden = helper.create_tmp_variable(dtype) + + helper.append_op( + type='gru', + inputs=inputs, + outputs={ + 'Hidden': hidden, + 'BatchGate': batch_gate, + 'BatchResetHiddenPrev': batch_reset_hidden_prev, + 'BatchHidden': batch_hidden + }, + attrs={ + 'is_reverse': is_reverse, + 'gate_activation': gate_activation, + 'activation': candidate_activation + }) + return hidden + + def gru_unit(input, hidden, size, @@ -403,8 +533,10 @@ def gru_unit(input, size (integer): The input dimension value. weight (ParamAttr): The weight parameters for gru unit. Default: None bias (ParamAttr): The bias parameters for gru unit. Default: None - activation (string): The activation type for cell (actNode). Default: 'tanh' - gate_activation (string): The activation type for gates (actGate). Default: 'sigmoid' + activation (string): The activation type for cell (actNode). + Default: 'tanh' + gate_activation (string): The activation type for gates (actGate). + Default: 'sigmoid' Returns: tuple: The hidden value, reset-hidden value and gate values. @@ -543,8 +675,9 @@ def cross_entropy(input, label, **kwargs): """ **Cross Entropy Layer** - This layer computes the cross entropy between `input` and `label`. It supports - both standard cross-entropy and soft-label cross-entropy loss computation. + This layer computes the cross entropy between `input` and `label`. It + supports both standard cross-entropy and soft-label cross-entropy loss + computation. 1) One-hot cross-entropy: `soft_label = False`, `Label[i, 0]` indicates the class index for sample i: @@ -571,23 +704,28 @@ def cross_entropy(input, label, **kwargs): Args: input (Variable|list): a 2-D tensor with shape [N x D], where N is the - batch size and D is the number of classes. This input is a probability - computed by the previous operator, which is almost always the result - of a softmax operator. + batch size and D is the number of classes. This + input is a probability computed by the previous + operator, which is almost always the result of + a softmax operator. label (Variable|list): the ground truth which is a 2-D tensor. When - `soft_label` is set to `False`, `label` is a tensor with shape - [N x 1]. When `soft_label` is set to `True`, `label` is a - tensor with shape [N x D]. - soft_label (bool, via `**kwargs`): a flag indicating whether to interpretate - the given labels as soft labels, default `False`. + `soft_label` is set to `False`, `label` is a + tensor with shape [N x 1]. When + `soft_label` is set to `True`, `label` is a + tensor with shape [N x D]. + soft_label (bool, via `**kwargs`): a flag indicating whether to + interpretate the given labels as soft + labels, default `False`. Returns: A 2-D tensor with shape [N x 1], the cross entropy loss. Raises: - `ValueError`: 1) the 1st dimension of `input` and `label` are not equal; 2) when \ - `soft_label == True`, and the 2nd dimension of `input` and `label` are not \ - equal; 3) when `soft_label == False`, and the 2nd dimension of `label` is not 1. + `ValueError`: 1) the 1st dimension of `input` and `label` are not equal. + 2) when `soft_label == True`, and the 2nd dimension of + `input` and `label` are not equal. + 3) when `soft_label == False`, and the 2nd dimension of + `label` is not 1. Examples: .. code-block:: python @@ -610,7 +748,9 @@ def square_error_cost(input, label, **kwargs): """ **Square error cost layer** - This layer accepts input predictions and target label and returns the squared error cost. + This layer accepts input predictions and target label and returns the + squared error cost. + For predictions, :math:`X`, and target labels, :math:`Y`, the equation is: .. math:: @@ -628,8 +768,8 @@ def square_error_cost(input, label, **kwargs): label(Variable): Label tensor, has target labels. Returns: - Variable: The tensor variable storing the element-wise squared error difference \ - of input and label. + Variable: The tensor variable storing the element-wise squared error + difference of input and label. Examples: .. code-block:: python @@ -725,7 +865,8 @@ def chunk_eval(input, "chunk_scheme": chunk_scheme, "excluded_chunk_types": excluded_chunk_types or [] }) - return precision, recall, f1_score, num_infer_chunks, num_label_chunks, num_correct_chunks + return (precision, recall, f1_score, num_infer_chunks, num_label_chunks, + num_correct_chunks) def sequence_conv(input, @@ -783,13 +924,14 @@ def conv2d(input, **Convlution2D Layer** The convolution2D layer calculates the output based on the input, filter - and strides, paddings, dilations, groups parameters. Input(Input) and Output(Output) - are in NCHW format. Where N is batch size, C is the number of channels, H is the height - of the feature, and W is the width of the feature. + and strides, paddings, dilations, groups parameters. Input(Input) and + Output(Output) are in NCHW format. Where N is batch size, C is the number of + channels, H is the height of the feature, and W is the width of the feature. The details of convolution layer, please refer UFLDL's `convolution, `_ . - If bias attribution and activation type are provided, bias is added to the output of the convolution, - and the corresponding activation function is applied to the final result. + If bias attribution and activation type are provided, bias is added to the + output of the convolution, and the corresponding activation function is + applied to the final result. For each input :math:`X`, the equation is: @@ -804,7 +946,8 @@ def conv2d(input, * :math:`\\ast`: Convolution operation. * :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. * :math:`\\sigma`: Activation function. - * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be + different. Example: @@ -849,17 +992,20 @@ def conv2d(input, act(str): Activation type. Default: None Returns: - Variable: The tensor variable storing the convolution and \ + Variable: The tensor variable storing the convolution and non-linearity activation result. Raises: - ValueError: If the shapes of input, filter_size, stride, padding and groups mismatch. + ValueError: If the shapes of input, filter_size, stride, padding and + groups mismatch. Examples: .. code-block:: python - data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32') - conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu") + data = fluid.layers.data( + name='data', shape=[3, 32, 32], dtype='float32') + conv2d = fluid.layers.conv2d( + input=data, num_filters=2, filter_size=3, act="relu") """ if stride is None: stride = [1, 1] @@ -1222,7 +1368,8 @@ def conv2d_transpose(input, H is the height of the feature, and W is the width of the feature. Parameters(dilations, strides, paddings) are two elements. These two elements represent height and width, respectively. The details of convolution transpose - layer, please refer to the following explanation and references `therein `_. + layer, please refer to the following explanation and references + `therein `_. For each input :math:`X`, the equation is: @@ -1235,7 +1382,8 @@ def conv2d_transpose(input, * :math:`X`: Input value, a tensor with NCHW format. * :math:`W`: Filter value, a tensor with MCHW format. * :math:`\\ast` : Convolution transpose operation. - * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. + * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be + different. Example: @@ -1276,7 +1424,8 @@ def conv2d_transpose(input, dilation(int|tuple): The dilation size. If dilation is a tuple, it must contain two integers, (dilation_H, dilation_W). Otherwise, the dilation_H = dilation_W = dilation. Default: dilation = 1. - param_attr(ParamAttr): The parameters to the Conv2d_transpose Layer. Default: None + param_attr(ParamAttr): The parameters to the Conv2d_transpose Layer. + Default: None use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn library is installed. Default: True name(str|None): A name for this layer(optional). If set None, the layer @@ -1286,13 +1435,16 @@ def conv2d_transpose(input, Variable: The tensor variable storing the convolution transpose result. Raises: - ValueError: If the shapes of input, filter_size, stride, padding and groups mismatch. + ValueError: If the shapes of input, filter_size, stride, padding and + groups mismatch. Examples: .. code-block:: python - data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32') - conv2d_transpose = fluid.layers.conv2d_transpose(input=data, num_filters=2, filter_size=3) + data = fluid.layers.data( + name='data', shape=[3, 32, 32], dtype='float32') + conv2d_transpose = fluid.layers.conv2d_transpose( + input=data, num_filters=2, filter_size=3) """ helper = LayerHelper("conv2d_transpose", **locals()) if not isinstance(input, Variable): @@ -1424,6 +1576,38 @@ def sequence_expand(x, y, name=None): return tmp +def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0): + ''' + This function implements the beam search algorithm. + ''' + helper = LayerHelper('beam_search', **locals()) + score_type = scores.dtype + id_type = ids.dtype + + selected_scores = helper.create_tmp_variable(dtype=score_type) + selected_ids = helper.create_tmp_variable(dtype=id_type) + + helper.append_op( + type='beam_search', + inputs={ + 'pre_ids': pre_ids, + 'ids': ids, + 'scores': scores, + }, + outputs={ + 'selected_ids': selected_ids, + 'selected_scores': selected_scores, + }, + attrs={ + # TODO(ChunweiYan) to assure other value support + 'level': level, + 'beam_size': beam_size, + 'end_id': end_id, + }) + + return selected_ids, selected_scores + + def lstm_unit(x_t, hidden_t_prev, cell_t_prev, @@ -1484,10 +1668,10 @@ def lstm_unit(x_t, tuple: The hidden value and cell value of lstm unit. Raises: - ValueError: The ranks of **x_t**, **hidden_t_prev** and **cell_t_prev**\ - not be 2 or the 1st dimensions of **x_t**, **hidden_t_prev** \ - and **cell_t_prev** not be the same or the 2nd dimensions of \ - **hidden_t_prev** and **cell_t_prev** not be the same. + ValueError: The ranks of **x_t**, **hidden_t_prev** and **cell_t_prev** + not be 2 or the 1st dimensions of **x_t**, **hidden_t_prev** + and **cell_t_prev** not be the same or the 2nd dimensions of + **hidden_t_prev** and **cell_t_prev** not be the same. Examples: @@ -1819,7 +2003,7 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None): data = fluid.layers.data(name="data", shape=(3, 17, 13), dtype="float32") - fc = fluid.layers.l2_normalize(x=data, axis=1) + normed = fluid.layers.l2_normalize(x=data, axis=1) """ if len(x.shape) == 1: axis = 0 @@ -1871,9 +2055,10 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None): def matmul(x, y, transpose_x=False, transpose_y=False, name=None): """ - Applies matrix multiplication to two tensors. Currently, the input - tensors' rank can be any, but when the rank of anyone inputs is - bigger than 3, this two inputs' rank should be equal. + Applies matrix multiplication to two tensors. + + Currently, the input tensors' rank can be any, but when the rank of any + inputs is bigger than 3, this two inputs' rank should be equal. The actual behavior depends on the shapes of :math:`x`, :math:`y` and the flag values of :attr:`transpose_x`, :attr:`transpose_y`. Specifically: @@ -1914,25 +2099,56 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None): # Examples to clarify shapes of the inputs and output # x: [B, ..., M, K], y: [B, ..., K, N] fluid.layers.matmul(x, y) # out: [B, ..., M, N] + # x: [B, M, K], y: [B, K, N] fluid.layers.matmul(x, y) # out: [B, M, N] + # x: [B, M, K], y: [K, N] fluid.layers.matmul(x, y) # out: [B, M, N] - # x: [B, M, K], y: [K] - fluid.layers.matmul(x, y) # out: [B, M] + # x: [M, K], y: [K, N] fluid.layers.matmul(x, y) # out: [M, N] + + # x: [B, M, K], y: [K] + fluid.layers.matmul(x, y) # out: [B, M] + # x: [K], y: [K] fluid.layers.matmul(x, y) # out: [1] - # x: [M], y: [N] + # x: [M], y: [N] fluid.layers.matmul(x, y, True, True) # out: [M, N] """ + + def __check_input(x, y): + if len(y.shape) > len(x.shape): + raise ValueError( + "Invalid inputs for matmul. " + "x's rank should be always greater than or equal to y'rank.") + + x_shape = list(x.shape) + y_shape = list(y.shape) + if len(x_shape) == 1: + x_shape = [1] + x_shape + if len(y_shape) == 1: + y_shape = y_shape + [1] + + # check the inner 2 dimensions + if transpose_x: + x_shape[-2], x_shape[-1] = x_shape[-1], x_shape[-2] + if transpose_y: + y_shape[-2], y_shape[-1] = y_shape[-1], y_shape[-2] + if x_shape[-1] != y_shape[-2]: + raise ValueError("Invalid inputs for matmul.") + + if len(y_shape) > 2: + for i, dim_x in enumerate(x_shape[:-2]): + if dim_x != y_shape[i]: + raise ValueError("Invalid inputs for matmul.") + + __check_input(x, y) + helper = LayerHelper('matmul', **locals()) - assert max(len(x.shape), len(y.shape)) <= 3 or len(x.shape) == len( - y. - shape), 'Inputs\' rank should be equal or their rank should be less 4.' - out = helper.create_tmp_variable(dtype=helper.input_dtype()) + out = helper.create_tmp_variable(dtype=x.dtype) helper.append_op( type='matmul', inputs={'X': x, @@ -1949,13 +2165,26 @@ def edit_distance(input, ignored_tokens=None, name=None): """ - EditDistance operator computes the edit distances between a batch of hypothesis strings and their references. Edit distance, also called Levenshtein distance, measures how dissimilar two strings are by counting the minimum number of operations to transform one string into anthor. Here the operations include insertion, deletion, and substitution. For example, given hypothesis string A = "kitten" and reference B = "sitting", the edit distance is 3 for A will be transformed into B at least after two substitutions and one insertion: + EditDistance operator computes the edit distances between a batch of + hypothesis strings and their references. Edit distance, also called + Levenshtein distance, measures how dissimilar two strings are by counting + the minimum number of operations to transform one string into anthor. + Here the operations include insertion, deletion, and substitution. + + For example, given hypothesis string A = "kitten" and reference + B = "sitting", the edit distance is 3 for A will be transformed into B + at least after two substitutions and one insertion: - "kitten" -> "sitten" -> "sittin" -> "sitting" + "kitten" -> "sitten" -> "sittin" -> "sitting" - Input(Hyps) is a LoDTensor consisting of all the hypothesis strings with the total number denoted by `batch_size`, and the separation is specified by the LoD information. And the `batch_size` reference strings are arranged in order in the same way in the LoDTensor Input(Refs). + Input(Hyps) is a LoDTensor consisting of all the hypothesis strings with + the total number denoted by `batch_size`, and the separation is specified + by the LoD information. And the `batch_size` reference strings are arranged + in order in the same way in the LoDTensor Input(Refs). - Output(Out) contains the `batch_size` results and each stands for the edit stance for a pair of strings respectively. If Attr(normalized) is true, the edit distance will be divided by the length of reference string. + Output(Out) contains the `batch_size` results and each stands for the edit + distance for a pair of strings respectively. If Attr(normalized) is true, + the edit distance will be divided by the length of reference string. Args: @@ -1963,9 +2192,11 @@ def edit_distance(input, label(Variable): The indices for reference strings. - normalized(bool): Indicated whether to normalize the edit distance by the length of reference string. + normalized(bool): Indicated whether to normalize the edit distance by + the length of reference string. - ignored_tokens(list of int): Tokens that should be removed before calculating edit distance. + ignored_tokens(list of int): Tokens that should be removed before + calculating edit distance. Returns: Variable: sequence-to-sequence edit distance in shape [batch_size, 1]. @@ -2016,8 +2247,10 @@ def edit_distance(input, def ctc_greedy_decoder(input, blank, name=None): """ This op is used to decode sequences by greedy policy by below steps: - 1. Get the indexes of max value for each row in input. a.k.a. numpy.argmax(input, axis=0). - 2. For each sequence in result of step1, merge repeated tokens between two blanks and delete all blanks. + 1. Get the indexes of max value for each row in input. a.k.a. + numpy.argmax(input, axis=0). + 2. For each sequence in result of step1, merge repeated tokens between two + blanks and delete all blanks. A simple example as below: @@ -2047,9 +2280,16 @@ def ctc_greedy_decoder(input, blank, name=None): Args: - input(Variable): (LoDTensor), the probabilities of variable-length sequences, which is a 2-D Tensor with LoD information. It's shape is [Lp, num_classes + 1], where Lp is the sum of all input sequences' length and num_classes is the true number of classes. (not including the blank label). + input(Variable): (LoDTensor), the probabilities of + variable-length sequences, which is a 2-D Tensor with + LoD information. It's shape is [Lp, num_classes + 1], + where Lp is the sum of all input sequences' length and + num_classes is the true number of classes. (not + including the blank label). - blank(int): the blank label index of Connectionist Temporal Classification (CTC) loss, which is in thehalf-opened interval [0, num_classes + 1). + blank(int): the blank label index of Connectionist Temporal + Classification (CTC) loss, which is in thehalf-opened + interval [0, num_classes + 1). Returns: Variable: CTC greedy decode result. @@ -2117,8 +2357,10 @@ def warpctc(input, label, blank=0, norm_by_times=False, **kwargs): Examples: .. code-block:: python - y = layers.data(name='y', shape=[11, 8], dtype='float32', lod_level=1) - y_predict = layers.data(name='y_predict', shape=[11, 1], dtype='float32') + y = layers.data( + name='y', shape=[11, 8], dtype='float32', lod_level=1) + y_predict = layers.data( + name='y_predict', shape=[11, 1], dtype='float32') cost = layers.warpctc(input=y_predict, label=y) """ @@ -2190,6 +2432,61 @@ def sequence_reshape(input, new_dim): return out +@autodoc() +def nce(input, + label, + num_total_classes, + sample_weight=None, + param_attr=None, + bias_attr=None, + num_neg_samples=None): + helper = LayerHelper('nce', **locals()) + assert isinstance(input, Variable) + dim = input.shape[1] + assert isinstance(label, Variable) + num_true_class = label.shape[1] + w = helper.create_parameter( + attr=helper.param_attr, + shape=[num_total_classes, dim], + is_bias=False, + dtype=input.dtype) + b = helper.create_parameter( + attr=helper.bias_attr, + shape=[num_total_classes, 1], + is_bias=True, + dtype=input.dtype) + cost = helper.create_tmp_variable(dtype=input.dtype) + sample_logits = helper.create_tmp_variable(dtype=input.dtype) + sample_labels = helper.create_tmp_variable(dtype=label.dtype) + + if num_neg_samples is None: + num_neg_samples = 10 + else: + num_neg_samples = int(num_neg_samples) + + attrs = { + 'num_total_classes': int(num_total_classes), + 'num_neg_samples': num_neg_samples + } + + helper.append_op( + type='nce', + inputs={ + 'Input': input, + 'Label': label, + 'Weight': w, + 'Bias': b, + 'SampleWeight': sample_weight if sample_weight is not None else [] + }, + outputs={ + 'Cost': cost, + 'SampleLogits': sample_logits, + 'SampleLabels': sample_labels + }, + attrs=attrs) + return cost / (num_neg_samples + 1) + + def transpose(x, perm, name=None): """ **transpose Layer** @@ -2217,6 +2514,12 @@ def transpose(x, perm, name=None): raise ValueError( "Input(perm) is the permutation of dimensions of Input(input). " "It's length shoud be equal to Input(input)'s rank.") + for idx, dim in enumerate(perm): + if dim >= len(x.shape): + raise ValueError( + "Each element in perm should be less than x's rank. " + "%d-th element in perm is %d which accesses x's rank %d." % + (idx, perm[idx], len(x.shape))) helper = LayerHelper('transpose', **locals()) out = helper.create_tmp_variable(x.dtype) @@ -2226,3 +2529,234 @@ def transpose(x, perm, name=None): outputs={'Out': [out]}, attrs={'axis': perm}) return out + + +def im2sequence(input, filter_size=1, stride=1, padding=0, name=None): + """ + Extracts image patches from the input tensor to form a tensor of shape + {input.batch_size * output_height * output_width, filter_size_H * + filter_size_W * input.channels} which is similar with im2col. + This op use filter / kernel to scan images and convert these images to + sequences. After expanding, the number of time step are + output_height * output_width for an image, in which output_height and + output_width are calculated by below equation: + + .. math:: + + output\_size = 1 + \ + (2 * padding + img\_size - block\_size + stride - 1) / stride + + And the dimension of each time step is block_y * block_x * input.channels. + + Args: + input (Variable): The input should be a tensor in NCHW format. + + filter_size(int|tuple|None): The filter size. If filter_size is a tuple, + it must contain two integers, (filter_size_H, filter_size_W). + Otherwise, the filter will be a square. + + stride(int|tuple): The stride size. If stride is a tuple, it must + contain two integers, (stride_H, stride_W). Otherwise, the + stride_H = stride_W = stride. Default: stride = 1. + + padding(int|tuple): The padding size. If padding is a tuple, it can + contain two integers like (padding_H, padding_W) which means + padding_up = padding_down = padding_H and + padding_left = padding_right = padding_W. Or it can use + (padding_up, padding_left, padding_down, padding_right) to indicate + paddings of four direction. Otherwise, a scalar padding means + padding_up = padding_down = padding_left = padding_right = padding + Default: padding = 0. + + name (int): The name of this layer. It is optional. + + Returns: + output: The output is a LoDTensor with shape + {input.batch_size * output_height * output_width, + filter_size_H * filter_size_W * input.channels}. + If we regard output as a matrix, each row of this matrix is + a step of a sequence. + + Examples: + + As an example: + + .. code-block:: text + + Given: + + x = [[[[ 6. 2. 1.] + [ 8. 3. 5.] + [ 0. 2. 6.]] + + [[ 2. 4. 4.] + [ 6. 3. 0.] + [ 6. 4. 7.]]] + + [[[ 6. 7. 1.] + [ 5. 7. 9.] + [ 2. 4. 8.]] + + [[ 1. 2. 1.] + [ 1. 3. 5.] + [ 9. 0. 8.]]]] + + x.dims = {2, 2, 3, 3} + + And: + + filter = [2, 2] + stride = [1, 1] + padding = [0, 0] + + Then: + + output.data = [[ 6. 2. 8. 3. 2. 4. 6. 3.] + [ 2. 1. 3. 5. 4. 4. 3. 0.] + [ 8. 3. 0. 2. 6. 3. 6. 4.] + [ 3. 5. 2. 6. 3. 0. 4. 7.] + [ 6. 7. 5. 7. 1. 2. 1. 3.] + [ 7. 1. 7. 9. 2. 1. 3. 5.] + [ 5. 7. 2. 4. 1. 3. 9. 0.] + [ 7. 9. 4. 8. 3. 5. 0. 8.]] + + output.dims = {8, 9} + + output.lod = [[0, 4, 8]] + + The simple usage is: + + .. code-block:: python + + output = fluid.layers.im2sequence( + input=layer, stride=[1, 1], filter_size=[2, 2]) + + """ + + if isinstance(filter_size, int): + filter_size = [filter_size, filter_size] + if isinstance(stride, int): + stride = [stride, stride] + if isinstance(padding, int): + padding = [padding, padding] + if len(padding) == 2: + padding.append(padding[0]) + padding.append(padding[1]) + + helper = LayerHelper('im2sequence', **locals()) + out = helper.create_tmp_variable(dtype=helper.input_dtype()) + helper.append_op( + type='im2sequence', + inputs={'X': input}, + outputs={'Out': out}, + attrs={ + 'kernels': filter_size, + 'strides': stride, + 'paddings': padding, + }) + return out + + +def row_conv(input, future_context_size, param_attr=None, act=None): + """Row Conv Operator. This layer will apply lookahead convolution to + **input**. The input variable should be a 2D LoDTensor with shape [T, D]. + Parameters with shape [future_context_size + 1, D] will be created. The math + equation of row convolution is as follows: + + .. math:: + Out_{i} = \sum_{j = i} ^ {i + \\tau} X_{j} \odot W_{i - j} + + In the above equation: + + * :math:`Out_{i}`: The i-th row of output variable with shape [1, D]. + * :math:`\\tau`: Future context size. + * :math:`X_{j}`: The j-th row of input variable with shape [1, D]. + * :math:`W_{i-j}`: The (i-j)-th row of parameters with shape [1, D]. + + More details about row_conv please refer to the paper \ + (http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf) and + the design document \ + (https://github.com/PaddlePaddle/Paddle/issues/2228#issuecomment-303903645). + + Args: + input (Variable): Input variable, a 2D LoDTensor with shape [T, D]. + future_context_size (int): Future context size. Please note, the shape + of convolution kernel is [future_context_size + 1, D]. + param_attr (ParamAttr): Attributes of parameters, including + name, initializer etc. + act (str): Non-linear activation to be applied to output variable. + + Returns: + Variable: The output tensor with same shape as input tensor. + + Examples: + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[16], + dtype='float32', lod_level=1) + out = fluid.layers.row_conv(input=x, future_context_size=2) + """ + helper = LayerHelper('row_conv', **locals()) + dtype = helper.input_dtype() + filter_shape = [future_context_size + 1, input.shape[1]] + filter_param = helper.create_parameter( + attr=helper.param_attr, shape=filter_shape, dtype=dtype) + out = helper.create_tmp_variable(dtype) + helper.append_op( + type='row_conv', + inputs={'X': [input], + 'Filter': [filter_param]}, + outputs={'Out': [out]}) + return helper.append_activation(out) + + +def multiplex(inputs, index): + """ + **Multiplex Layer** + + Referring to the given index variable, this layer selects rows from the + input variables to construct a multiplex variable. Assuming that there are + :math:`m` input variables and :math:`I_i` represents the i-th input + variable and :math:`i` is in [0, :math:`m`). All input variables are + tensors with same shape [:math:`d_0`, :math:`d_1`, ..., :math:`d_R`]. + Please note that rank of the input tensor should be at least 2. Each input + variable will be treated as a 2-D matrix with shape [:math:`M`, :math:`N`] + where :math:`M` for :math:`d_0` and :math:`N` for :math:`d_1` * :math:`d_2` + * ... * :math:`d_R`. Let :math:`I_i[j]` be the j-th row of the i-th input + variable. The given index variable should be a 2-D tensor with shape + [:math:`M`, 1]. Let `ID[i]` be the i-th index value of the index variable. + Then the output variable will be a tensor with shape [:math:`d_0`, + :math:`d_1`, ..., :math:`d_R`]. If we treat the output tensor as a 2-D + matrix with shape [:math:`M`, :math:`N`] and let :math:`O[i]` be the i-th + row of the matrix, then `O[i]` is equal to :math:`I_{ID[i]}[i]`. + + Args: + inputs (list): A list of variables to gather from. All variables have the + same shape and the rank is at least 2. + index (Variable): Tensor, index variable which is a 2-D tensor + with shape [M, 1] where M is the batch size. + + Returns: + Variable: Multiplex variable gathered from input variables. + + Examples: + .. code-block:: python + + x1 = fluid.layers.data(name='x1', shape=[4], dtype='float32') + x2 = fluid.layers.data(name='x2', shape=[4], dtype='float32') + index = fluid.layers.data(name='index', shape=[1], dtype='int32') + out = fluid.layers.multiplex(inputs=[x1, x2], index=index) + """ + helper = LayerHelper('multiplex', **locals()) + + if not isinstance(inputs, list) and len(inputs) < 2: + raise ValueError("inputs should be a list object and contains at least " + "2 elements.") + + out = helper.create_tmp_variable(inputs[0].dtype) + helper.append_op( + type='multiplex', + inputs={'X': inputs, + 'Ids': index}, + outputs={'Out': [out]}) + return out diff --git a/python/paddle/v2/fluid/memory_optimization_transpiler.py b/python/paddle/v2/fluid/memory_optimization_transpiler.py index 1b4b64755963b5edc3d07d861c2a9b6cc3f23587..956c5b66da28fd8e74d4fd12f249688daa72d8ac 100644 --- a/python/paddle/v2/fluid/memory_optimization_transpiler.py +++ b/python/paddle/v2/fluid/memory_optimization_transpiler.py @@ -31,10 +31,12 @@ dtype_to_size = { class ControlFlowGraph(object): - def __init__(self, Program): + def __init__(self, Program, ops, forward_num): self._program = Program - self._succesors = defaultdict(set) - self._presucessors = defaultdict(set) + self._ops = ops + self._forward_num = forward_num + self._successors = defaultdict(set) + self._presuccessors = defaultdict(set) self._uses = defaultdict(set) self._defs = defaultdict(set) self._live_in = defaultdict(set) @@ -45,25 +47,16 @@ class ControlFlowGraph(object): self._add(node1, node2) def _add(self, node1, node2): - self._succesors[node1].add(node2) - self._presucessors[node2].add(node1) + self._successors[node1].add(node2) + self._presuccessors[node2].add(node1) def _build_graph(self): - program_desc = self._program.get_desc() - block_size = program_desc.num_blocks() - - # TODO(qijun) handle Program with if/while operators - self.global_block_desc = program_desc.block(0) - self.op_size = self.global_block_desc.op_size() - + self.op_size = len(self._ops) op_node_connections = [(i, i + 1) for i in range(self.op_size - 1)] self._add_connections(op_node_connections) - - self.ops = [self.global_block_desc.op(i) for i in range(self.op_size)] - for i in range(self.op_size): - self._uses[i].update(self.ops[i].input_arg_names()) - self._defs[i].update(self.ops[i].output_arg_names()) + self._uses[i].update(self._ops[i].input_arg_names()) + self._defs[i].update(self._ops[i].output_arg_names()) def _update_graph(self, old_name, new_name, begin_idx=0): for i in range(begin_idx, self.op_size): @@ -103,7 +96,7 @@ class ControlFlowGraph(object): live_out[i] = set(self._live_out[i]) self._live_in[i] = self._uses[i] | ( self._live_out[i] - self._defs[i]) - for s in self._succesors[i]: + for s in self._successors[i]: self._live_out[i] |= self._live_in[s] if self._reach_fixed_point(live_in, live_out): @@ -113,39 +106,76 @@ class ControlFlowGraph(object): u = a & b return a - u, b - u + def _has_var(self, block_desc, var_name, is_forward): + if is_forward: + return block_desc.has_var(str(var_name)) + else: + return block_desc.has_var_recursive(str(var_name)) + + def _find_var(self, block_desc, var_name, is_forward): + if is_forward: + return block_desc.find_var(str(var_name)) + else: + return block_desc.find_var_recursive(str(var_name)) + def memory_optimize(self): + def check_var_validity(block_desc, x, is_forward): + if str(x) == "@EMPTY@": + return False + if not self._has_var(block_desc, x, is_forward): + return False + if self._find_var(block_desc, x, is_forward).persistable(): + return False + if self._find_var( + block_desc, x, + is_forward).type() != core.VarDesc.VarType.LOD_TENSOR: + return False + return True + self._build_graph() self._dataflow_analyze() self.pool = [] for i in range(self.op_size): + op = self._ops[i] + if op.type() == "while" or op.type() == "while_grad": + continue + block_desc = op.block() + is_forward = i < self._forward_num if self.pool: - out_pair = [(x, self.global_block_desc.var(str(x)).shape()) - for x in self._defs[i]] + defs_can_optimize = filter( + lambda x: check_var_validity(block_desc, x, is_forward), + self._defs[i]) + out_pair = [ + (x, self._find_var(block_desc, x, is_forward).shape()) + for x in defs_can_optimize + ] for x, x_shape in out_pair: - if not self.global_block_desc.var(str(x)).persistable(): - for index, cache_pair in enumerate(self.pool): - cache_var = cache_pair[0] - cache_shape = cache_pair[1] - if x_shape == cache_shape: - x_dtype = self.global_block_desc.var(str( - x)).dtype() - cache_dtype = self.global_block_desc.var( - str(cache_var)).dtype() + for index, cache_pair in enumerate(self.pool): + cache_var = cache_pair[0] + cache_shape = cache_pair[1] + if x_shape == cache_shape: + if self._has_var(block_desc, cache_var, is_forward): + x_dtype = self._find_var(block_desc, x, + is_forward).dtype() + cache_dtype = self._find_var( + block_desc, cache_var, is_forward).dtype() # TODO(qijun): actually, we should compare dtype_to_size[x_dtype] # and dtype_to_size[cache_dtype] if x_dtype == cache_dtype: - print( - ("Hit Cache !!!! cache pool index " - "is %d, var name is %s, " - "cached var name is %s, " - "var shape is %s ") % - (index, x, cache_var, str(cache_shape))) + print(("Hit Cache !!!! cache pool index " + "is %d, var name is %s, " + "cached var name is %s, " + "var shape is %s ") % + (index, x, cache_var, + str(cache_shape))) self.pool.pop(index) + if x == cache_var: + break _rename_arg_( - self.ops, x, cache_var, begin_idx=i) - self._program.current_block().var(str( - x)).desc = self.global_block_desc.var( - str(cache_var)) + self._ops, x, cache_var, begin_idx=i) + self._program.block(block_desc.id).var( + str(x)).desc = self._find_var( + block_desc, cache_var, is_forward) self._update_graph( x, cache_var, begin_idx=i) break @@ -153,20 +183,70 @@ class ControlFlowGraph(object): in_diff, out_diff = self._get_diff(self._live_in[i], self._live_out[i]) can_optimize = filter( - lambda x: not self.global_block_desc.var(str(x)).persistable(), + lambda x: check_var_validity(block_desc, x, is_forward), in_diff) if can_optimize: for var_name in can_optimize: - self.pool.append( - (var_name, - self.global_block_desc.var(str(var_name)).shape())) - - def get_program(self): - return self._program + self.pool.append((var_name, self._find_var( + block_desc, var_name, is_forward).shape())) + + +def get_cfgs(input_program): + ops_list = [] + pdesc = input_program.get_desc() + block_desc = pdesc.block(0) + op_size = block_desc.op_size() + # Get global block ops + ops_list.append(([block_desc.op(i) for i in range(op_size)], op_size)) + + while_sub_block_ids = [] + while_grad_sub_block_ids = [] + while_pair = [] + + for i in range(op_size): + op = block_desc.op(i) + if op.type() == "while": + while_sub_block_ids.append(op.attr("sub_block").id) + elif op.type() == "while_grad": + while_grad_sub_block_ids.append(op.attr("sub_block").id) + + # Find while/while_grad block pair + for grad_id in while_grad_sub_block_ids: + parent_id = pdesc.block(grad_id).parent + if parent_id in while_sub_block_ids: + while_pair.append((parent_id, grad_id)) + while_sub_block_ids.remove(parent_id) + + # Get while/while_grad block ops + for parent_id, grad_id in while_pair: + while_block_ops = [] + while_block = pdesc.block(parent_id) + while_block_op_size = while_block.op_size() + for i in range(while_block_op_size): + while_block_ops.append(while_block.op(i)) + + while_grad_block = pdesc.block(grad_id) + while_grad_block_op_size = while_grad_block.op_size() + for i in range(while_grad_block_op_size): + while_block_ops.append(while_grad_block.op(i)) + + ops_list.append((while_block_ops, while_block_op_size)) + + # Process rest while block ops + for parent_id in while_sub_block_ids: + while_block_ops = [] + while_block = pdesc.block(parent_id) + while_block_op_size = while_block.op_size() + for i in range(while_block_op_size): + while_block_ops.append(while_block.op(i)) + + ops_list.append((while_block_ops, while_block_op_size)) + + cfgs = [ControlFlowGraph(input_program, i, j) for i, j in ops_list] + return cfgs def memory_optimize(input_program): - graph = ControlFlowGraph(input_program) - graph.memory_optimize() - result_program = graph.get_program() - return result_program + cfgs = get_cfgs(input_program) + for cfg in cfgs: + cfg.memory_optimize() diff --git a/python/paddle/v2/fluid/nets.py b/python/paddle/v2/fluid/nets.py index a30e646d8cbccb397d11c1f6164946e748f40c5e..cb63d43709e23ae04c4d23457bbb79e6f7f0ce3c 100644 --- a/python/paddle/v2/fluid/nets.py +++ b/python/paddle/v2/fluid/nets.py @@ -11,14 +11,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import layers __all__ = [ "simple_img_conv_pool", "sequence_conv_pool", "glu", - "dot_product_attention", + "scaled_dot_product_attention", ] @@ -56,7 +55,7 @@ def img_conv_group(input, conv_act=None, param_attr=None, conv_with_batchnorm=False, - conv_batchnorm_drop_rate=None, + conv_batchnorm_drop_rate=0.0, pool_stride=1, pool_type=None, use_cudnn=True): @@ -127,21 +126,21 @@ def sequence_conv_pool(input, def glu(input, dim=-1): """ - The gated linear unit composed by split, sigmoid activation and elementwise - multiplication. Specifically, Split the input into two equal sized parts - :math:`a` and :math:`b` along the given dimension and then compute as + The gated linear unit composed by split, sigmoid activation and elementwise + multiplication. Specifically, Split the input into two equal sized parts + :math:`a` and :math:`b` along the given dimension and then compute as following: .. math:: {GLU}(a, b)= a \otimes \sigma(b) - Refer to `Language Modeling with Gated Convolutional Networks + Refer to `Language Modeling with Gated Convolutional Networks `_. - + Args: input (Variable): The input variable which is a Tensor or LoDTensor. - dim (int): The dimension along which to split. If :math:`dim < 0`, the + dim (int): The dimension along which to split. If :math:`dim < 0`, the dimension to split along is :math:`rank(input) + dim`. Returns: @@ -160,53 +159,180 @@ def glu(input, dim=-1): return out -def dot_product_attention(querys, keys, values): +def scaled_dot_product_attention(queries, + keys, + values, + num_heads=1, + dropout_rate=0.): """ The dot-product attention. - Attention mechanism can be seen as mapping a query and a set of key-value - pairs to an output. The output is computed as a weighted sum of the values, - where the weight assigned to each value is computed by a compatibility + Attention mechanism can be seen as mapping a query and a set of key-value + pairs to an output. The output is computed as a weighted sum of the values, + where the weight assigned to each value is computed by a compatibility function (dot-product here) of the query with the corresponding key. - - The dot-product attention can be implemented through (batch) matrix + + The dot-product attention can be implemented through (batch) matrix multipication as follows: .. math:: - Attention(Q, K, V)= softmax(QK^\mathrm{T})V + Attention(Q, K, V)= softmax(QK^\mathrm{T})V - Refer to `Attention Is All You Need + Refer to `Attention Is All You Need `_. - Note that batch data containing sequences with different lengths is not - supported by this because of the (batch) matrix multipication. - Args: - query (Variable): The input variable which is a Tensor or LoDTensor. - key (Variable): The input variable which is a Tensor or LoDTensor. - value (Variable): The input variable which is a Tensor or LoDTensor. + + queries (Variable): The input variable which should be a 3-D Tensor. + keys (Variable): The input variable which should be a 3-D Tensor. + values (Variable): The input variable which should be a 3-D Tensor. + num_heads (int): Head number to compute the scaled dot product + attention. Default value is 1. + dropout_rate (float): The dropout rate to drop the attention weight. + Default value is 0. Returns: - tuple: The Tensor variables representing the output and attention scores. + + Variable: A 3-D Tensor computed by multi-head scaled dot product + attention. + + Raises: + + ValueError: If input queries, keys, values are not 3-D Tensors. + + NOTE: + 1. When num_heads > 1, three linear projections are learned respectively + to map input queries, keys and values into queries', keys' and values'. + queries', keys' and values' have the same shapes with queries, keys + and values. + + 1. When num_heads == 1, scaled_dot_product_attention has no learnable + parameters. Examples: .. code-block:: python - # Suppose q, k, v are tensor variables with the following shape: + # Suppose q, k, v are Tensors with the following shape: # q: [3, 5, 9], k: [3, 6, 9], v: [3, 6, 10] - out, attn_scores = fluid.nets.dot_product_attention(q, k, v) - out.shape # [3, 5, 10] - attn_scores.shape # [3, 5, 6] + + contexts = fluid.nets.scaled_dot_product_attention(q, k, v) + contexts.shape # [3, 5, 10] """ - assert keys.shape[-2] == values.shape[ - -2], 'The shapes of keys and values mismatch.' - assert querys.shape[-1] == keys.shape[ - -1], 'The shapes of querys and keys mismatch.' - product = layers.matmul(x=querys, y=keys, transpose_y=True) - attn_scores = layers.reshape( + if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): + raise ValueError( + "Inputs quries, keys and values should all be 3-D tensors.") + + if queries.shape[-1] != keys.shape[-1]: + raise ValueError( + "The hidden size of queries and keys should be the same.") + if keys.shape[-2] != values.shape[-2]: + raise ValueError( + "The max sequence length in query batch and in key batch " + "should be the same.") + if keys.shape[-1] % num_heads != 0: + raise ValueError("The hidden size of keys (%d) must be divisible " + "by the number of attention heads (%d)." % + (keys.shape[-1], num_heads)) + if values.shape[-1] % num_heads != 0: + raise ValueError("The hidden size of values (%d) must be divisible " + "by the number of attention heads (%d)." % + (values.shape[-1], num_heads)) + + def __compute_qkv(queries, keys, values, num_heads): + """ + Add linear projection to queries, keys, and values. + + Args: + queries(Tensor): a 3-D input Tensor. + keys(Tensor): a 3-D input Tensor. + values(Tensor): a 3-D input Tensor. + num_heads(int): The number of heads. Linearly project the inputs + ONLY when num_heads > 1. + + Returns: + Tensor: linearly projected output Tensors: queries', keys' and + values'. They have the same shapes with queries, keys and + values. + """ + + if num_heads == 1: + return queries, keys, values + + q = layers.fc(input=queries, size=queries.shape[-1], num_flatten_dims=2) + k = layers.fc(input=keys, size=keys.shape[-1], num_flatten_dims=2) + v = layers.fc(input=values, size=values.shape[-1], num_flatten_dims=2) + return q, k, v + + def __split_heads(x, num_heads): + """ + Reshape the last dimension of inpunt tensor x so that it becomes two + dimensions. + + Args: + x(Tensor): a 3-D input Tensor. + num_heads(int): The number of heads. + + Returns: + Tensor: a Tensor with shape [..., n, m/num_heads], where m is size + of the last dimension of x. + """ + if num_heads == 1: + return x + + hidden_size = x.shape[-1] + # reshape the 3-D input: [batch_size, max_sequence_length, hidden_dim] + # into a 4-D output: + # [batch_size, max_sequence_length, num_heads, hidden_size_per_head]. + reshaped = layers.reshape( + x=x, + shape=list(x.shape[:-1]) + [num_heads, hidden_size // num_heads]) + + # permuate the dimensions into: + # [batch_size, num_heads, max_sequence_len, hidden_size_per_head] + return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) + + def __combine_heads(x): + """ + Reshape the last two dimensions of inpunt tensor x so that it becomes + one dimension. + + Args: + x(Tensor): a 4-D input Tensor with shape + [bs, num_heads, max_sequence_length, hidden_dim]. + + Returns: + Tensor: a Tensor with shape + [bs, max_sequence_length, num_heads * hidden_dim]. + """ + + if len(x.shape) == 3: return x + if len(x.shape) != 4: + raise ValueError("Input(x) should be a 4-D Tensor.") + + trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) + return layers.reshape( + x=trans_x, + shape=map(int, [ + trans_x.shape[0], trans_x.shape[1], + trans_x.shape[2] * trans_x.shape[3] + ])) + + q, k, v = __compute_qkv(queries, keys, values, num_heads) + + q = __split_heads(q, num_heads) + k = __split_heads(k, num_heads) + v = __split_heads(v, num_heads) + + key_dim_per_head = keys.shape[-1] // num_heads + scaled_q = layers.scale(x=q, scale=key_dim_per_head**-0.5) + product = layers.matmul(x=k, y=scaled_q, transpose_y=True) + + weights = layers.reshape( x=layers.reshape( - x=product, shape=[-1, product.shape[-1]], act='softmax'), + x=product, shape=[-1, product.shape[-1]], act="softmax"), shape=product.shape) - out = layers.matmul(attn_scores, values) - return out, attn_scores + if dropout_rate: + weights = layers.dropout(x, dropout_prob=dropout_rate, is_test=False) + ctx_multiheads = layers.matmul(weights, v) + return __combine_heads(ctx_multiheads) diff --git a/python/paddle/v2/fluid/profiler.py b/python/paddle/v2/fluid/profiler.py index 29e0d54a3ac9622e5505c8e5de38616d9c636e67..51c1c8aa705513825b46fb936c6c99090c50fb7d 100644 --- a/python/paddle/v2/fluid/profiler.py +++ b/python/paddle/v2/fluid/profiler.py @@ -63,3 +63,58 @@ def cuda_profiler(output_file, output_mode=None, config=None): # Disables profiler collection. core.nvprof_stop() os.remove(config_file) + + +def reset_profiler(): + """The profiler clear interface. + reset_profiler will clear the previous time record. + """ + core.reset_profiler() + + +@contextmanager +def profiler(state, sorted_key=None): + """The profiler interface. + Different from cuda_profiler, this profiler can be used to profile both CPU + and GPU program. By defalut, it records the CPU and GPU operator kernels, + if you want to profile other program, you can refer the profiling tutorial + to add more records. + + Args: + state (string) : The profiling state, which should be 'CPU' or 'GPU', + telling the profiler to use CPU timer or GPU timer for profiling. + Although users may have already specified the execution place + (CPUPlace/CUDAPlace) in the begining, for flexibility the profiler + would not inherit this place. + sorted_key (string) : If None, the profiling results will be printed + in the order of first end time of events. Otherwise, the profiling + results will be sorted by the this flag. This flag should be one + of 'calls', 'total', 'max', 'min' or 'ave'. + The `calls` means sorting by the number of calls. + The `total` means sorting by the total execution time. + The `max` means sorting by the maximum execution time. + The `min` means sorting by the minimum execution time. + The `ave` means sorting by the average execution time. + """ + + if state not in ['CPU', 'GPU']: + raise ValueError("The state must be 'CPU' or 'GPU'.") + prof_state = core.ProfilerState.kCUDA if state == "GPU" else core.ProfilerState.kCPU + core.enable_profiler(prof_state) + yield + + if sorted_key not in ['calls', 'total', 'max', 'min', 'ave']: + raise ValueError("The state must be in 'calls', 'total', " + "'max', 'min', 'ave'") + sorted_key = 'default' if sorted_key is None else sorted_key + key_map = { + 'default': core.EventSortingKey.kDefault, + 'calls': core.EventSortingKey.kCalls, + 'total': core.EventSortingKey.kTotal, + 'max': core.EventSortingKey.kMax, + 'min': core.EventSortingKey.kMin, + 'ave': core.EventSortingKey.kAve, + } + # TODO(qingqing) : redirect C++ ostream to Python stream. + # with core.ostream_redirect(stdout=True, stderr=True): + core.disable_profiler(key_map[sorted_key]) diff --git a/python/paddle/v2/fluid/tests/book/CMakeLists.txt b/python/paddle/v2/fluid/tests/book/CMakeLists.txt index a35abe3e0c436be4eaed01c9b9183344c6d3b275..dda02c03fd531445c1b33b39a6ded10921991d9c 100644 --- a/python/paddle/v2/fluid/tests/book/CMakeLists.txt +++ b/python/paddle/v2/fluid/tests/book/CMakeLists.txt @@ -1,9 +1,33 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") -list(REMOVE_ITEM TEST_OPS test_image_classification_train) +list(REMOVE_ITEM TEST_OPS test_image_classification_train test_recognize_digits) py_test(test_image_classification_train_resnet SRCS test_image_classification_train.py ARGS resnet) py_test(test_image_classification_train_vgg SRCS test_image_classification_train.py ARGS vgg) +py_test(test_recognize_digits_mlp_cpu + SRCS test_recognize_digits.py + ARGS mlp) +py_test(test_recognize_digits_mlp_cuda + SRCS test_recognize_digits.py + ARGS mlp --use_cuda) +py_test(test_recognize_digits_conv_cpu + SRCS test_recognize_digits.py + ARGS conv) +py_test(test_recognize_digits_conv_cuda + SRCS test_recognize_digits.py + ARGS conv --use_cuda) +py_test(test_recognize_digits_mlp_cpu_parallel + SRCS test_recognize_digits.py + ARGS mlp --parallel) +py_test(test_recognize_digits_mlp_cuda_parallel + SRCS test_recognize_digits.py + ARGS mlp --use_cuda --parallel) +py_test(test_recognize_digits_conv_cpu_parallel + SRCS test_recognize_digits.py + ARGS conv --parallel) +py_test(test_recognize_digits_conv_cuda_parallel + SRCS test_recognize_digits.py + ARGS conv --use_cuda --parallel) # default test foreach(src ${TEST_OPS}) diff --git a/python/paddle/v2/fluid/tests/book/__init__.py b/python/paddle/v2/fluid/tests/book/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b94a21a7e406b833797f8f521c62a2351c2bc30a --- /dev/null +++ b/python/paddle/v2/fluid/tests/book/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py index 462669c262f285a7c6d36cf60f2f3f952c83f6b3..0b954c60b6bc2d721c0373243e747056f8f572cf 100644 --- a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py +++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py @@ -49,7 +49,7 @@ for pass_id in range(PASS_NUM): avg_loss_value, = exe.run(fluid.default_main_program(), feed=feeder.feed(data), fetch_list=[avg_cost]) - + print(avg_loss_value) if avg_loss_value[0] < 10.0: exit(0) # if avg cost less than 10.0, we think our code is good. exit(1) diff --git a/python/paddle/v2/fluid/tests/book/test_machine_translation.py b/python/paddle/v2/fluid/tests/book/test_machine_translation.py index 53ae200a2387712c63ab67f44d4e9da03ebbe4b2..82b760d693560dae1ab1fa39afdc186f60423e65 100644 --- a/python/paddle/v2/fluid/tests/book/test_machine_translation.py +++ b/python/paddle/v2/fluid/tests/book/test_machine_translation.py @@ -17,7 +17,7 @@ import paddle.v2 as paddle import paddle.v2.fluid as fluid import paddle.v2.fluid.core as core import paddle.v2.fluid.framework as framework -import paddle.v2.fluid.layers as layers +import paddle.v2.fluid.layers as pd from paddle.v2.fluid.executor import Executor dict_size = 30000 @@ -26,53 +26,136 @@ src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) hidden_dim = 32 word_dim = 16 IS_SPARSE = True -batch_size = 10 -max_length = 50 +batch_size = 2 +max_length = 8 topk_size = 50 trg_dic_size = 10000 +beam_size = 2 decoder_size = hidden_dim +place = core.CPUPlace() -def encoder_decoder(): + +def encoder(): # encoder - src_word_id = layers.data( + src_word_id = pd.data( name="src_word_id", shape=[1], dtype='int64', lod_level=1) - src_embedding = layers.embedding( + src_embedding = pd.embedding( input=src_word_id, size=[dict_size, word_dim], dtype='float32', is_sparse=IS_SPARSE, param_attr=fluid.ParamAttr(name='vemb')) - fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') - lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4) - encoder_out = layers.sequence_last_step(input=lstm_hidden0) + fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') + lstm_hidden0, lstm_0 = pd.dynamic_lstm(input=fc1, size=hidden_dim * 4) + encoder_out = pd.sequence_last_step(input=lstm_hidden0) + return encoder_out + +def decoder_train(context): # decoder - trg_language_word = layers.data( + trg_language_word = pd.data( name="target_language_word", shape=[1], dtype='int64', lod_level=1) - trg_embedding = layers.embedding( + trg_embedding = pd.embedding( input=trg_language_word, size=[dict_size, word_dim], dtype='float32', is_sparse=IS_SPARSE, param_attr=fluid.ParamAttr(name='vemb')) - rnn = fluid.layers.DynamicRNN() + rnn = pd.DynamicRNN() with rnn.block(): current_word = rnn.step_input(trg_embedding) - mem = rnn.memory(init=encoder_out) - fc1 = fluid.layers.fc(input=[current_word, mem], + pre_state = rnn.memory(init=context) + current_state = pd.fc(input=[current_word, pre_state], size=decoder_size, act='tanh') - out = fluid.layers.fc(input=fc1, size=target_dict_dim, act='softmax') - rnn.update_memory(mem, fc1) - rnn.output(out) + + current_score = pd.fc(input=current_state, + size=target_dict_dim, + act='softmax') + rnn.update_memory(pre_state, current_state) + rnn.output(current_score) return rnn() +def decoder_decode(context): + init_state = context + array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length) + counter = pd.zeros(shape=[1], dtype='int64') + + # fill the first element with init_state + state_array = pd.create_array('float32') + pd.array_write(init_state, array=state_array, i=counter) + + # ids, scores as memory + ids_array = pd.create_array('int64') + scores_array = pd.create_array('float32') + + init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2) + init_scores = pd.data( + name="init_scores", shape=[1], dtype="float32", lod_level=2) + + pd.array_write(init_ids, array=ids_array, i=counter) + pd.array_write(init_scores, array=scores_array, i=counter) + + cond = pd.less_than(x=counter, y=array_len) + + while_op = pd.While(cond=cond) + with while_op.block(): + pre_ids = pd.array_read(array=ids_array, i=counter) + pre_state = pd.array_read(array=state_array, i=counter) + pre_score = pd.array_read(array=scores_array, i=counter) + + # expand the lod of pre_state to be the same with pre_score + pre_state_expanded = pd.sequence_expand(pre_state, pre_score) + + pre_ids_emb = pd.embedding( + input=pre_ids, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=IS_SPARSE) + + # use rnn unit to update rnn + current_state = pd.fc(input=[pre_ids_emb, pre_state_expanded], + size=decoder_size, + act='tanh') + + # use score to do beam search + current_score = pd.fc(input=current_state, + size=target_dict_dim, + act='softmax') + topk_scores, topk_indices = pd.topk(current_score, k=50) + selected_ids, selected_scores = pd.beam_search( + pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0) + + pd.increment(x=counter, value=1, in_place=True) + + # update the memories + pd.array_write(current_state, array=state_array, i=counter) + pd.array_write(selected_ids, array=ids_array, i=counter) + pd.array_write(selected_scores, array=scores_array, i=counter) + + pd.less_than(x=counter, y=array_len, cond=cond) + + translation_ids, translation_scores = pd.beam_search_decode( + ids=ids_array, scores=scores_array) + + # return init_ids, init_scores + + return translation_ids, translation_scores + + +def set_init_lod(data, lod, place): + res = core.LoDTensor() + res.set(data, place) + res.set_lod(lod) + return res + + def to_lodtensor(data, place): seq_lens = [len(seq) for seq in data] cur_len = 0 @@ -88,12 +171,13 @@ def to_lodtensor(data, place): return res -def main(): - rnn_out = encoder_decoder() - label = layers.data( +def train_main(): + context = encoder() + rnn_out = decoder_train(context) + label = pd.data( name="target_language_next_word", shape=[1], dtype='int64', lod_level=1) - cost = layers.cross_entropy(input=rnn_out, label=label) - avg_cost = fluid.layers.mean(x=cost) + cost = pd.cross_entropy(input=rnn_out, label=label) + avg_cost = pd.mean(x=cost) optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4) optimizer.minimize(avg_cost) @@ -103,13 +187,12 @@ def main(): paddle.dataset.wmt14.train(dict_size), buf_size=1000), batch_size=batch_size) - place = core.CPUPlace() exe = Executor(place) exe.run(framework.default_startup_program()) batch_id = 0 - for pass_id in xrange(2): + for pass_id in xrange(1): for data in train_data(): word_data = to_lodtensor(map(lambda x: x[0], data), place) trg_word = to_lodtensor(map(lambda x: x[1], data), place) @@ -125,9 +208,48 @@ def main(): print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) + " avg_cost=" + str(avg_cost_val)) if batch_id > 3: - exit(0) + break batch_id += 1 +def decode_main(): + context = encoder() + translation_ids, translation_scores = decoder_decode(context) + + exe = Executor(place) + exe.run(framework.default_startup_program()) + + init_ids_data = np.array([1 for _ in range(batch_size)], dtype='int64') + init_scores_data = np.array( + [1. for _ in range(batch_size)], dtype='float32') + init_ids_data = init_ids_data.reshape((batch_size, 1)) + init_scores_data = init_scores_data.reshape((batch_size, 1)) + init_lod = [i for i in range(batch_size)] + [batch_size] + init_lod = [init_lod, init_lod] + + train_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(dict_size), buf_size=1000), + batch_size=batch_size) + for _, data in enumerate(train_data()): + init_ids = set_init_lod(init_ids_data, init_lod, place) + init_scores = set_init_lod(init_scores_data, init_lod, place) + + src_word_data = to_lodtensor(map(lambda x: x[0], data), place) + + result_ids, result_scores = exe.run( + framework.default_main_program(), + feed={ + 'src_word_id': src_word_data, + 'init_ids': init_ids, + 'init_scores': init_scores + }, + fetch_list=[translation_ids, translation_scores], + return_numpy=False) + print result_ids.lod() + break + + if __name__ == '__main__': - main() + # train_main() + decode_main() diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py new file mode 100644 index 0000000000000000000000000000000000000000..ac7ef4046f9ff55c2cbfc28b50784b9bffb80d53 --- /dev/null +++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py @@ -0,0 +1,149 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import print_function +import argparse +import paddle.v2.fluid as fluid +import paddle.v2 as paddle +import sys +import numpy + + +def parse_arg(): + parser = argparse.ArgumentParser() + parser.add_argument( + "nn_type", + help="The neural network type, in ['mlp', 'conv']", + type=str, + choices=['mlp', 'conv']) + parser.add_argument( + "--parallel", + help='Run in parallel or not', + default=False, + action="store_true") + parser.add_argument( + "--use_cuda", + help="Run the program by using CUDA", + default=False, + action="store_true") + return parser.parse_args() + + +BATCH_SIZE = 64 + + +def loss_net(hidden, label): + prediction = fluid.layers.fc(input=hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + return fluid.layers.mean(x=loss), fluid.layers.accuracy( + input=prediction, label=label) + + +def mlp(img, label): + hidden = fluid.layers.fc(input=img, size=200, act='tanh') + hidden = fluid.layers.fc(input=hidden, size=200, act='tanh') + return loss_net(hidden, label) + + +def conv_net(img, label): + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=img, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + return loss_net(conv_pool_2, label) + + +def main(): + args = parse_arg() + print("recognize digits with args: {0}".format(" ".join(sys.argv[1:]))) + + img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + if args.nn_type == 'mlp': + net_conf = mlp + else: + net_conf = conv_net + + if args.parallel: + places = fluid.layers.get_places() + pd = fluid.layers.ParallelDo(places) + with pd.do(): + img_ = pd.read_input(img) + label_ = pd.read_input(label) + for o in net_conf(img_, label_): + pd.write_output(o) + + avg_loss, acc = pd() + # get mean loss and acc through every devices. + avg_loss = fluid.layers.mean(x=avg_loss) + acc = fluid.layers.mean(x=acc) + else: + avg_loss, acc = net_conf(img, label) + + test_program = fluid.default_main_program().clone() + + optimizer = fluid.optimizer.Adam(learning_rate=0.001) + optimizer.minimize(avg_loss) + + place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace() + + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=500), + batch_size=BATCH_SIZE) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=BATCH_SIZE) + feeder = fluid.DataFeeder(feed_list=[img, label], place=place) + + PASS_NUM = 100 + for pass_id in range(PASS_NUM): + for batch_id, data in enumerate(train_reader()): + # train a mini-batch, fetch nothing + exe.run(feed=feeder.feed(data)) + if (batch_id + 1) % 10 == 0: + acc_set = [] + avg_loss_set = [] + for test_data in test_reader(): + acc_np, avg_loss_np = exe.run(program=test_program, + feed=feeder.feed(test_data), + fetch_list=[acc, avg_loss]) + acc_set.append(float(acc_np)) + avg_loss_set.append(float(avg_loss_np)) + # get test acc and loss + acc_val = numpy.array(acc_set).mean() + avg_loss_val = numpy.array(avg_loss_set).mean() + if float(acc_val) > 0.85: # test acc > 85% + exit(0) + else: + print( + 'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'. + format(pass_id, batch_id + 1, + float(avg_loss_val), float(acc_val))) + + +if __name__ == '__main__': + main() diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py deleted file mode 100644 index 4710d16c24e95a11108801a014f94687558fd91e..0000000000000000000000000000000000000000 --- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function -import numpy as np -import paddle.v2 as paddle -import paddle.v2.fluid as fluid - -images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32') -label = fluid.layers.data(name='label', shape=[1], dtype='int64') -conv_pool_1 = fluid.nets.simple_img_conv_pool( - input=images, - filter_size=5, - num_filters=20, - pool_size=2, - pool_stride=2, - act="relu") -conv_pool_2 = fluid.nets.simple_img_conv_pool( - input=conv_pool_1, - filter_size=5, - num_filters=50, - pool_size=2, - pool_stride=2, - act="relu") - -predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax") -cost = fluid.layers.cross_entropy(input=predict, label=label) -avg_cost = fluid.layers.mean(x=cost) -optimizer = fluid.optimizer.Adam(learning_rate=0.01) -optimizer.minimize(avg_cost) - -accuracy = fluid.evaluator.Accuracy(input=predict, label=label) - -BATCH_SIZE = 50 -PASS_NUM = 3 -train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.mnist.train(), buf_size=500), - batch_size=BATCH_SIZE) - -place = fluid.CPUPlace() -exe = fluid.Executor(place) -feeder = fluid.DataFeeder(feed_list=[images, label], place=place) -exe.run(fluid.default_startup_program()) - -for pass_id in range(PASS_NUM): - accuracy.reset(exe) - for data in train_reader(): - loss, acc = exe.run(fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[avg_cost] + accuracy.metrics) - pass_acc = accuracy.eval(exe) - print("pass_id=" + str(pass_id) + " acc=" + str(acc) + " pass_acc=" + - str(pass_acc)) - # print loss, acc - if loss < 10.0 and pass_acc > 0.9: - # if avg cost less than 10.0 and accuracy is larger than 0.9, we think our code is good. - exit(0) - - pass_acc = accuracy.eval(exe) - print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc)) - -exit(1) diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py deleted file mode 100644 index 8776a65bf804e93dfeb295ecca34fac0840b0a90..0000000000000000000000000000000000000000 --- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function -import numpy as np -import paddle.v2 as paddle -import paddle.v2.fluid as fluid - -BATCH_SIZE = 128 -image = fluid.layers.data(name='x', shape=[784], dtype='float32') - -regularizer = fluid.regularizer.L2Decay(0.0005 * BATCH_SIZE) - -hidden1 = fluid.layers.fc(input=image, - size=128, - act='relu', - param_attr=fluid.ParamAttr( - regularizer=regularizer, - gradient_clip=fluid.clip.ClipByValue(10))) - -hidden2 = fluid.layers.fc(input=hidden1, - size=64, - act='relu', - param_attr=regularizer) - -predict = fluid.layers.fc(input=hidden2, - size=10, - act='softmax', - param_attr=regularizer) - -label = fluid.layers.data(name='y', shape=[1], dtype='int64') - -cost = fluid.layers.cross_entropy(input=predict, label=label) -avg_cost = fluid.layers.mean(x=cost) - -optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9) -opts = optimizer.minimize(avg_cost) - -accuracy = fluid.evaluator.Accuracy(input=predict, label=label) - -inference_program = fluid.default_main_program().clone() -with fluid.program_guard(inference_program): - test_accuracy = fluid.evaluator.Accuracy(input=predict, label=label) - test_target = [avg_cost] + test_accuracy.metrics + test_accuracy.states - inference_program = fluid.io.get_inference_program(test_target) - -train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.mnist.train(), buf_size=8192), - batch_size=BATCH_SIZE) - -test_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128) - -place = fluid.CPUPlace() -exe = fluid.Executor(place) -feeder = fluid.DataFeeder(feed_list=[image, label], place=place) -exe.run(fluid.default_startup_program()) - -PASS_NUM = 100 -for pass_id in range(PASS_NUM): - accuracy.reset(exe) - for data in train_reader(): - out, acc = exe.run(fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[avg_cost] + accuracy.metrics) - pass_acc = accuracy.eval(exe) - - test_accuracy.reset(exe) - for data in test_reader(): - out, acc = exe.run(inference_program, - feed=feeder.feed(data), - fetch_list=[avg_cost] + test_accuracy.metrics) - - test_pass_acc = test_accuracy.eval(exe) - print("pass_id=" + str(pass_id) + " train_cost=" + str( - out) + " train_acc=" + str(acc) + " train_pass_acc=" + str(pass_acc) - + " test_acc=" + str(test_pass_acc)) - - if test_pass_acc > 0.7: - fluid.io.save_inference_model( - "./recognize_digits_mlp.inference.model/", ["x"], [predict], - exe) - exit(0) - -exit(1) diff --git a/python/paddle/v2/fluid/tests/book/test_rnn_encoder_decoder.py b/python/paddle/v2/fluid/tests/book/test_rnn_encoder_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..fdc60861760163d2ebad3b050e551929321baafd --- /dev/null +++ b/python/paddle/v2/fluid/tests/book/test_rnn_encoder_decoder.py @@ -0,0 +1,204 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle.v2 as paddle +import paddle.v2.fluid as fluid +import paddle.v2.fluid.core as core +import paddle.v2.fluid.framework as framework +import paddle.v2.fluid.layers as layers +from paddle.v2.fluid.executor import Executor + +dict_size = 30000 +source_dict_dim = target_dict_dim = dict_size +src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) +hidden_dim = 32 +embedding_dim = 16 +batch_size = 10 +max_length = 50 +topk_size = 50 +encoder_size = decoder_size = hidden_dim +IS_SPARSE = True +USE_PEEPHOLES = False + + +def bi_lstm_encoder(input_seq, hidden_size): + input_forward_proj = fluid.layers.fc(input=input_seq, + size=hidden_size * 4, + bias_attr=True) + forward, _ = fluid.layers.dynamic_lstm( + input=input_forward_proj, + size=hidden_size * 4, + use_peepholes=USE_PEEPHOLES) + input_backward_proj = fluid.layers.fc(input=input_seq, + size=hidden_size * 4, + bias_attr=True) + backward, _ = fluid.layers.dynamic_lstm( + input=input_backward_proj, + size=hidden_size * 4, + is_reverse=True, + use_peepholes=USE_PEEPHOLES) + + forward_last = fluid.layers.sequence_last_step(input=forward) + backward_first = fluid.layers.sequence_first_step(input=backward) + + return forward_last, backward_first + + +# FIXME(peterzhang2029): Replace this function with the lstm_unit_op. +def lstm_step(x_t, hidden_t_prev, cell_t_prev, size): + def linear(inputs): + return fluid.layers.fc(input=inputs, size=size, bias_attr=True) + + forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t])) + cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t])) + + cell_t = fluid.layers.sums(input=[ + fluid.layers.elementwise_mul( + x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul( + x=input_gate, y=cell_tilde) + ]) + + hidden_t = fluid.layers.elementwise_mul( + x=output_gate, y=fluid.layers.tanh(x=cell_t)) + + return hidden_t, cell_t + + +def lstm_decoder_without_attention(target_embedding, decoder_boot, context, + decoder_size): + rnn = fluid.layers.DynamicRNN() + + cell_init = fluid.layers.fill_constant_batch_size_like( + input=decoder_boot, + value=0.0, + shape=[-1, decoder_size], + dtype='float32') + cell_init.stop_gradient = False + + with rnn.block(): + current_word = rnn.step_input(target_embedding) + context = rnn.static_input(context) + + hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True) + cell_mem = rnn.memory(init=cell_init) + decoder_inputs = fluid.layers.concat( + input=[context, current_word], axis=1) + h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, decoder_size) + rnn.update_memory(hidden_mem, h) + rnn.update_memory(cell_mem, c) + out = fluid.layers.fc(input=h, + size=target_dict_dim, + bias_attr=True, + act='softmax') + rnn.output(out) + return rnn() + + +def seq_to_seq_net(): + """Construct a seq2seq network.""" + + src_word_idx = fluid.layers.data( + name='source_sequence', shape=[1], dtype='int64', lod_level=1) + + src_embedding = fluid.layers.embedding( + input=src_word_idx, + size=[source_dict_dim, embedding_dim], + dtype='float32') + + src_forward_last, src_backward_first = bi_lstm_encoder( + input_seq=src_embedding, hidden_size=encoder_size) + + encoded_vector = fluid.layers.concat( + input=[src_forward_last, src_backward_first], axis=1) + + decoder_boot = fluid.layers.fc(input=src_backward_first, + size=decoder_size, + bias_attr=False, + act='tanh') + + trg_word_idx = fluid.layers.data( + name='target_sequence', shape=[1], dtype='int64', lod_level=1) + + trg_embedding = fluid.layers.embedding( + input=trg_word_idx, + size=[target_dict_dim, embedding_dim], + dtype='float32') + + prediction = lstm_decoder_without_attention(trg_embedding, decoder_boot, + encoded_vector, decoder_size) + label = fluid.layers.data( + name='label_sequence', shape=[1], dtype='int64', lod_level=1) + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + + return avg_cost + + +def to_lodtensor(data, place): + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + res = core.LoDTensor() + res.set(flattened_data, place) + res.set_lod([lod]) + return res + + +def main(): + avg_cost = seq_to_seq_net() + + optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4) + optimizer.minimize(avg_cost) + + train_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(dict_size), buf_size=1000), + batch_size=batch_size) + + place = core.CPUPlace() + exe = Executor(place) + + exe.run(framework.default_startup_program()) + + batch_id = 0 + for pass_id in xrange(2): + for data in train_data(): + word_data = to_lodtensor(map(lambda x: x[0], data), place) + trg_word = to_lodtensor(map(lambda x: x[1], data), place) + trg_word_next = to_lodtensor(map(lambda x: x[2], data), place) + outs = exe.run(framework.default_main_program(), + feed={ + 'source_sequence': word_data, + 'target_sequence': trg_word, + 'label_sequence': trg_word_next + }, + fetch_list=[avg_cost]) + avg_cost_val = np.array(outs[0]) + print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) + + " avg_cost=" + str(avg_cost_val)) + if batch_id > 3: + exit(0) + batch_id += 1 + + +if __name__ == '__main__': + main() diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py index 52c7ecdeb3646fdce36937b84ba8956947371d87..9774edebfb1de0ae73970d582c620f8a984a4ebf 100644 --- a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py +++ b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py @@ -68,10 +68,10 @@ else: fluid.io.save_persistables(exe, "./fit_a_line.model/") fluid.io.load_persistables(exe, "./fit_a_line.model/") for data in train_reader(): - avg_loss_value, = exe.run(trainer_prog, - feed=feeder.feed(data), - fetch_list=[avg_cost]) - + avg_loss_value = exe.run(trainer_prog, + feed=feeder.feed(data), + fetch_list=[avg_cost]) + print("loss:" + str(avg_loss_value)) if avg_loss_value[0] < 10.0: exit(0) exit(1) diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_image_classification.py b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_image_classification.py index 218dea31e10757d901c5524567f13501b64dbea5..0c51ccf306367d9a458968a7a4a8aa53bea663c3 100644 --- a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_image_classification.py +++ b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_image_classification.py @@ -14,8 +14,6 @@ from __future__ import print_function -import sys - import paddle.v2 as paddle import paddle.v2.fluid as fluid import os @@ -106,10 +104,10 @@ if len(sys.argv) >= 2: net_type = sys.argv[1] if net_type == "vgg": - print("train vgg net") + print("training vgg net") net = vgg16_bn_drop(images) elif net_type == "resnet": - print("train resnet") + print("training resnet") net = resnet_cifar10(images, 32) else: raise ValueError("%s network is not supported" % net_type) @@ -129,6 +127,7 @@ train_reader = paddle.batch( batch_size=BATCH_SIZE) place = fluid.CPUPlace() +feeder = fluid.DataFeeder(place=place, feed_list=[images, label]) exe = fluid.Executor(place) t = fluid.DistributeTranspiler() @@ -146,17 +145,14 @@ if training_role == "PSERVER": if not current_endpoint: print("need env SERVER_ENDPOINT") exit(1) - print("start pserver at:", current_endpoint) pserver_prog = t.get_pserver_program(current_endpoint) pserver_startup = t.get_startup_program(current_endpoint, pserver_prog) exe.run(pserver_startup) exe.run(pserver_prog) - print("pserver run end") elif training_role == "TRAINER": - print("start trainer") trainer_prog = t.get_trainer_program() - feeder = fluid.DataFeeder(place=place, feed_list=[images, label]) exe.run(fluid.default_startup_program()) + for pass_id in range(PASS_NUM): accuracy.reset(exe) for data in train_reader(): @@ -164,9 +160,10 @@ elif training_role == "TRAINER": feed=feeder.feed(data), fetch_list=[avg_cost] + accuracy.metrics) pass_acc = accuracy.eval(exe) - print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str( - pass_acc)) - # this model is slow, so if we can train two mini batch, we think it works properly. + print("pass_id:" + str(pass_id) + "loss:" + str(loss) + " pass_acc:" + + str(pass_acc)) + # this model is slow, so if we can train two mini batches, + # we think it works properly. print("trainer run end") else: print("environment var TRAINER_ROLE should be TRAINER os PSERVER") diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_machine_translation.py b/python/paddle/v2/fluid/tests/book_distribute/notest_machine_translation.py new file mode 100644 index 0000000000000000000000000000000000000000..adeacd4adf2150e0302965d80457e26d07c6b96d --- /dev/null +++ b/python/paddle/v2/fluid/tests/book_distribute/notest_machine_translation.py @@ -0,0 +1,157 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle.v2 as paddle +import paddle.v2.fluid as fluid +import paddle.v2.fluid.core as core +import paddle.v2.fluid.framework as framework +import paddle.v2.fluid.layers as layers +from paddle.v2.fluid.executor import Executor +import os + +dict_size = 30000 +source_dict_dim = target_dict_dim = dict_size +src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) +hidden_dim = 32 +word_dim = 16 +IS_SPARSE = True +batch_size = 10 +max_length = 50 +topk_size = 50 +trg_dic_size = 10000 + +decoder_size = hidden_dim + + +def encoder_decoder(): + # encoder + src_word_id = layers.data( + name="src_word_id", shape=[1], dtype='int64', lod_level=1) + src_embedding = layers.embedding( + input=src_word_id, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr=fluid.ParamAttr(name='vemb')) + + fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') + lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4) + encoder_out = layers.sequence_last_step(input=lstm_hidden0) + + # decoder + trg_language_word = layers.data( + name="target_language_word", shape=[1], dtype='int64', lod_level=1) + trg_embedding = layers.embedding( + input=trg_language_word, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr=fluid.ParamAttr(name='vemb')) + + rnn = fluid.layers.DynamicRNN() + with rnn.block(): + current_word = rnn.step_input(trg_embedding) + mem = rnn.memory(init=encoder_out) + fc1 = fluid.layers.fc(input=[current_word, mem], + size=decoder_size, + act='tanh') + out = fluid.layers.fc(input=fc1, size=target_dict_dim, act='softmax') + rnn.update_memory(mem, fc1) + rnn.output(out) + + return rnn() + + +def to_lodtensor(data, place): + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + res = core.LoDTensor() + res.set(flattened_data, place) + res.set_lod([lod]) + return res + + +def main(): + rnn_out = encoder_decoder() + label = layers.data( + name="target_language_next_word", shape=[1], dtype='int64', lod_level=1) + cost = layers.cross_entropy(input=rnn_out, label=label) + avg_cost = fluid.layers.mean(x=cost) + + optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4) + optimize_ops, params_grads = optimizer.minimize(avg_cost) + + train_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(dict_size), buf_size=1000), + batch_size=batch_size) + + place = core.CPUPlace() + exe = Executor(place) + + t = fluid.DistributeTranspiler() + # all parameter server endpoints list for spliting parameters + pserver_endpoints = os.getenv("PSERVERS") + # server endpoint for current node + current_endpoint = os.getenv("SERVER_ENDPOINT") + # run as trainer or parameter server + training_role = os.getenv( + "TRAINING_ROLE", "TRAINER") # get the training role: trainer/pserver + t.transpile( + optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) + + if training_role == "PSERVER": + if not current_endpoint: + print("need env SERVER_ENDPOINT") + exit(1) + pserver_prog = t.get_pserver_program(current_endpoint) + pserver_startup = t.get_startup_program(current_endpoint, pserver_prog) + exe.run(pserver_startup) + exe.run(pserver_prog) + elif training_role == "TRAINER": + trainer_prog = t.get_trainer_program() + exe.run(framework.default_startup_program()) + + batch_id = 0 + for pass_id in xrange(2): + for data in train_data(): + word_data = to_lodtensor(map(lambda x: x[0], data), place) + trg_word = to_lodtensor(map(lambda x: x[1], data), place) + trg_word_next = to_lodtensor(map(lambda x: x[2], data), place) + outs = exe.run(trainer_prog, + feed={ + 'src_word_id': word_data, + 'target_language_word': trg_word, + 'target_language_next_word': trg_word_next + }, + fetch_list=[avg_cost]) + avg_cost_val = np.array(outs[0]) + print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) + + " avg_cost=" + str(avg_cost_val)) + if batch_id > 3: + exit(0) + batch_id += 1 + else: + print("environment var TRAINER_ROLE should be TRAINER os PSERVER") + + +if __name__ == '__main__': + main() diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_recommender_system_dist.py b/python/paddle/v2/fluid/tests/book_distribute/notest_recommender_system_dist.py new file mode 100644 index 0000000000000000000000000000000000000000..2d8885e377b0a10d8b5bad4e8fcecb9cc6fc8b64 --- /dev/null +++ b/python/paddle/v2/fluid/tests/book_distribute/notest_recommender_system_dist.py @@ -0,0 +1,216 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import os +import paddle.v2 as paddle +import paddle.v2.fluid as fluid +import paddle.v2.fluid.core as core +import paddle.v2.fluid.layers as layers +import paddle.v2.fluid.nets as nets +from paddle.v2.fluid.optimizer import SGDOptimizer + +IS_SPARSE = True +BATCH_SIZE = 256 +PASS_NUM = 100 + + +def get_usr_combined_features(): + USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1 + uid = layers.data(name='user_id', shape=[1], dtype='int64') + usr_emb = layers.embedding( + input=uid, + dtype='float32', + size=[USR_DICT_SIZE, 32], + param_attr='user_table', + is_sparse=IS_SPARSE) + usr_fc = layers.fc(input=usr_emb, size=32) + USR_GENDER_DICT_SIZE = 2 + + usr_gender_id = layers.data(name='gender_id', shape=[1], dtype='int64') + usr_gender_emb = layers.embedding( + input=usr_gender_id, + size=[USR_GENDER_DICT_SIZE, 16], + param_attr='gender_table', + is_sparse=IS_SPARSE) + usr_gender_fc = layers.fc(input=usr_gender_emb, size=16) + + USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table) + usr_age_id = layers.data(name='age_id', shape=[1], dtype="int64") + usr_age_emb = layers.embedding( + input=usr_age_id, + size=[USR_AGE_DICT_SIZE, 16], + is_sparse=IS_SPARSE, + param_attr='age_table') + usr_age_fc = layers.fc(input=usr_age_emb, size=16) + + USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1 + usr_job_id = layers.data(name='job_id', shape=[1], dtype="int64") + usr_job_emb = layers.embedding( + input=usr_job_id, + size=[USR_JOB_DICT_SIZE, 16], + param_attr='job_table', + is_sparse=IS_SPARSE) + usr_job_fc = layers.fc(input=usr_job_emb, size=16) + + concat_embed = layers.concat( + input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], axis=1) + + usr_combined_features = layers.fc(input=concat_embed, size=200, act="tanh") + return usr_combined_features + + +def get_mov_combined_features(): + MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1 + mov_id = layers.data(name='movie_id', shape=[1], dtype='int64') + mov_emb = layers.embedding( + input=mov_id, + dtype='float32', + size=[MOV_DICT_SIZE, 32], + param_attr='movie_table', + is_sparse=IS_SPARSE) + mov_fc = layers.fc(input=mov_emb, size=32) + + CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories()) + category_id = layers.data(name='category_id', shape=[1], dtype='int64') + mov_categories_emb = layers.embedding( + input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE) + mov_categories_hidden = layers.sequence_pool( + input=mov_categories_emb, pool_type="sum") + + MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict()) + mov_title_id = layers.data(name='movie_title', shape=[1], dtype='int64') + mov_title_emb = layers.embedding( + input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE) + mov_title_conv = nets.sequence_conv_pool( + input=mov_title_emb, + num_filters=32, + filter_size=3, + act="tanh", + pool_type="sum") + + concat_embed = layers.concat( + input=[mov_fc, mov_categories_hidden, mov_title_conv], axis=1) + + mov_combined_features = layers.fc(input=concat_embed, size=200, act="tanh") + return mov_combined_features + + +def model(): + usr_combined_features = get_usr_combined_features() + mov_combined_features = get_mov_combined_features() + + # need cos sim + inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features) + scale_infer = layers.scale(x=inference, scale=5.0) + + label = layers.data(name='score', shape=[1], dtype='float32') + square_cost = layers.square_error_cost(input=scale_infer, label=label) + avg_cost = layers.mean(x=square_cost) + + return avg_cost + + +def func_feed(feeding, data, place): + feed_tensors = {} + for (key, idx) in feeding.iteritems(): + tensor = core.LoDTensor() + if key != "category_id" and key != "movie_title": + if key == "score": + numpy_data = np.array(map(lambda x: x[idx], data)).astype( + "float32") + else: + numpy_data = np.array(map(lambda x: x[idx], data)).astype( + "int64") + else: + numpy_data = map(lambda x: np.array(x[idx]).astype("int64"), data) + lod_info = [len(item) for item in numpy_data] + offset = 0 + lod = [offset] + for item in lod_info: + offset += item + lod.append(offset) + numpy_data = np.concatenate(numpy_data, axis=0) + tensor.set_lod([lod]) + + numpy_data = numpy_data.reshape([numpy_data.shape[0], 1]) + tensor.set(numpy_data, place) + feed_tensors[key] = tensor + return feed_tensors + + +def main(): + cost = model() + optimizer = SGDOptimizer(learning_rate=0.2) + optimize_ops, params_grads = optimizer.minimize(cost) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.movielens.train(), buf_size=8192), + batch_size=BATCH_SIZE) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + + t = fluid.DistributeTranspiler() + + # all parameter server endpoints list for spliting parameters + pserver_endpoints = os.getenv("PSERVERS") + # server endpoint for current node + current_endpoint = os.getenv("SERVER_ENDPOINT") + # run as trainer or parameter server + training_role = os.getenv("TRAINING_ROLE", "TRAINER") + t.transpile( + optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) + + if training_role == "PSERVER": + if not current_endpoint: + print("need env SERVER_ENDPOINT") + exit(1) + pserver_prog = t.get_pserver_program(current_endpoint) + pserver_startup = t.get_startup_program(current_endpoint, pserver_prog) + exe.run(pserver_startup) + exe.run(pserver_prog) + elif training_role == "TRAINER": + exe.run(fluid.default_startup_program()) + trainer_prog = t.get_trainer_program() + + feeding = { + 'user_id': 0, + 'gender_id': 1, + 'age_id': 2, + 'job_id': 3, + 'movie_id': 4, + 'category_id': 5, + 'movie_title': 6, + 'score': 7 + } + + for pass_id in range(PASS_NUM): + for data in train_reader(): + outs = exe.run(trainer_prog, + feed=func_feed(feeding, data, place), + fetch_list=[cost]) + out = np.array(outs[0]) + print("cost=" + str(out[0])) + if out[0] < 6.0: + print("Training complete. Average cost is less than 6.0.") + # if avg cost less than 6.0, we think our code is good. + exit(0) + else: + print("environment var TRAINER_ROLE should be TRAINER os PSERVER") + + +if __name__ == '__main__': + main() diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_understand_sentiment_dynamic_lstm.py b/python/paddle/v2/fluid/tests/book_distribute/notest_understand_sentiment_dynamic_lstm.py new file mode 100644 index 0000000000000000000000000000000000000000..bff376a0e2ee0fbb0d869e0dddf4460ed5dc4ac6 --- /dev/null +++ b/python/paddle/v2/fluid/tests/book_distribute/notest_understand_sentiment_dynamic_lstm.py @@ -0,0 +1,135 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import os +import paddle.v2 as paddle +import paddle.v2.fluid as fluid + + +def stacked_lstm_net(data, + label, + input_dim, + class_dim=2, + emb_dim=128, + hid_dim=512, + stacked_num=3): + assert stacked_num % 2 == 1 + + emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim]) + # add bias attr + + # TODO(qijun) linear act + fc1 = fluid.layers.fc(input=emb, size=hid_dim) + lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim) + + inputs = [fc1, lstm1] + + for i in range(2, stacked_num + 1): + fc = fluid.layers.fc(input=inputs, size=hid_dim) + lstm, cell = fluid.layers.dynamic_lstm( + input=fc, size=hid_dim, is_reverse=(i % 2) == 0) + inputs = [fc, lstm] + + fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max') + lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max') + + prediction = fluid.layers.fc(input=[fc_last, lstm_last], + size=class_dim, + act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002) + optimize_ops, params_grads = adam_optimizer.minimize(avg_cost) + accuracy = fluid.evaluator.Accuracy(input=prediction, label=label) + return avg_cost, accuracy, accuracy.metrics[0], optimize_ops, params_grads + + +def to_lodtensor(data, place): + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + res = fluid.LoDTensor() + res.set(flattened_data, place) + res.set_lod([lod]) + return res + + +def main(): + BATCH_SIZE = 100 + PASS_NUM = 5 + + word_dict = paddle.dataset.imdb.word_dict() + print "loaded word dict successfully" + dict_dim = len(word_dict) + class_dim = 2 + + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + cost, accuracy, acc_out, optimize_ops, params_grads = stacked_lstm_net( + data, label, input_dim=dict_dim, class_dim=class_dim) + + train_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.imdb.train(word_dict), buf_size=1000), + batch_size=BATCH_SIZE) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=[data, label], place=place) + + t = fluid.DistributeTranspiler() + # all parameter server endpoints list for spliting parameters + pserver_endpoints = os.getenv("PSERVERS") + # server endpoint for current node + current_endpoint = os.getenv("SERVER_ENDPOINT") + # run as trainer or parameter server + training_role = os.getenv( + "TRAINING_ROLE", "TRAINER") # get the training role: trainer/pserver + t.transpile( + optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) + + if training_role == "PSERVER": + if not current_endpoint: + print("need env SERVER_ENDPOINT") + exit(1) + pserver_prog = t.get_pserver_program(current_endpoint) + pserver_startup = t.get_startup_program(current_endpoint, pserver_prog) + exe.run(pserver_startup) + exe.run(pserver_prog) + elif training_role == "TRAINER": + exe.run(fluid.default_startup_program()) + trainer_prog = t.get_trainer_program() + for pass_id in xrange(PASS_NUM): + accuracy.reset(exe) + for data in train_data(): + cost_val, acc_val = exe.run(trainer_prog, + feed=feeder.feed(data), + fetch_list=[cost, acc_out]) + pass_acc = accuracy.eval(exe) + print("cost=" + str(cost_val) + " acc=" + str(acc_val) + + " pass_acc=" + str(pass_acc)) + if cost_val < 1.0 and acc_val > 0.8: + exit(0) + else: + print("environment var TRAINER_ROLE should be TRAINER os PSERVER") + + +if __name__ == '__main__': + main() diff --git a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py index cf054bb0fe778d34add4ac456f672a8b47483e84..7ad5e2c594f24999e298533b6c05ba688a935f0b 100644 --- a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py +++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py @@ -16,6 +16,11 @@ import numpy as np import paddle.v2 as paddle import paddle.v2.fluid as fluid +# need to fix random seed and training data to compare the loss +# value accurately calculated by the default and the memory optimization +# version. +fluid.default_startup_program().random_seed = 111 + x = fluid.layers.data(name='x', shape=[13], dtype='float32') y_predict = fluid.layers.fc(input=x, size=1, act=None) @@ -28,15 +33,18 @@ avg_cost = fluid.layers.mean(x=cost) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1) sgd_optimizer.minimize(avg_cost) -# memopt_program = fluid.default_main_program() -memopt_program = fluid.memory_optimize(fluid.default_main_program()) +fluid.memory_optimize(fluid.default_main_program()) BATCH_SIZE = 200 +# fix the order of training data train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.uci_housing.train(), buf_size=500), - batch_size=BATCH_SIZE) + paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE) + +# train_reader = paddle.batch( +# paddle.reader.shuffle( +# paddle.dataset.uci_housing.train(), buf_size=500), +# batch_size=BATCH_SIZE) place = fluid.CPUPlace() feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) @@ -49,7 +57,7 @@ for pass_id in range(PASS_NUM): fluid.io.save_persistables(exe, "./fit_a_line.model/") fluid.io.load_persistables(exe, "./fit_a_line.model/") for data in train_reader(): - avg_loss_value, = exe.run(memopt_program, + avg_loss_value, = exe.run(fluid.default_main_program(), feed=feeder.feed(data), fetch_list=[avg_cost]) diff --git a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py index 42b3cb81ce67d38494677f3ecbfb1e07f7c0c3ad..26673afd83c48328c3f354e82bfa3725aa4805b5 100644 --- a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py +++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py @@ -19,6 +19,11 @@ import sys import paddle.v2 as paddle import paddle.v2.fluid as fluid +# need to fix random seed and training data to compare the loss +# value accurately calculated by the default and the memory optimization +# version. +fluid.default_startup_program().random_seed = 111 + def resnet_cifar10(input, depth=32): def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'): @@ -117,31 +122,37 @@ opts = optimizer.minimize(avg_cost) accuracy = fluid.evaluator.Accuracy(input=predict, label=label) -# memopt_program = fluid.default_main_program() -memopt_program = fluid.memory_optimize(fluid.default_main_program()) +fluid.memory_optimize(fluid.default_main_program()) BATCH_SIZE = 128 PASS_NUM = 1 +# fix the order of training data train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.cifar.train10(), buf_size=128 * 10), - batch_size=BATCH_SIZE) + paddle.dataset.cifar.train10(), batch_size=BATCH_SIZE) + +# train_reader = paddle.batch( +# paddle.reader.shuffle( +# paddle.dataset.cifar.train10(), buf_size=128 * 10), +# batch_size=BATCH_SIZE) place = fluid.CPUPlace() exe = fluid.Executor(place) feeder = fluid.DataFeeder(place=place, feed_list=[images, label]) exe.run(fluid.default_startup_program()) +i = 0 for pass_id in range(PASS_NUM): accuracy.reset(exe) for data in train_reader(): - loss, acc = exe.run(memopt_program, + loss, acc = exe.run(fluid.default_main_program(), feed=feeder.feed(data), fetch_list=[avg_cost] + accuracy.metrics) pass_acc = accuracy.eval(exe) print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str( pass_acc)) # this model is slow, so if we can train two mini batch, we think it works properly. - exit(0) + if i > 2: + exit(0) + i += 1 exit(1) diff --git a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py new file mode 100644 index 0000000000000000000000000000000000000000..ffd53e7a78142162317a677de49c1821635a65b5 --- /dev/null +++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py @@ -0,0 +1,144 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle.v2 as paddle +import paddle.v2.fluid as fluid +import paddle.v2.fluid.core as core +import paddle.v2.fluid.framework as framework +import paddle.v2.fluid.layers as layers +from paddle.v2.fluid.executor import Executor + +dict_size = 30000 +source_dict_dim = target_dict_dim = dict_size +src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) +hidden_dim = 32 +word_dim = 16 +IS_SPARSE = True +batch_size = 10 +max_length = 50 +topk_size = 50 +trg_dic_size = 10000 + +decoder_size = hidden_dim + +# need to fix random seed and training data to compare the loss +# value accurately calculated by the default and the memory optimization +# version. +fluid.default_startup_program().random_seed = 111 + + +def encoder_decoder(): + # encoder + src_word_id = layers.data( + name="src_word_id", shape=[1], dtype='int64', lod_level=1) + src_embedding = layers.embedding( + input=src_word_id, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr=fluid.ParamAttr(name='vemb')) + + fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') + lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4) + encoder_out = layers.sequence_last_step(input=lstm_hidden0) + + # decoder + trg_language_word = layers.data( + name="target_language_word", shape=[1], dtype='int64', lod_level=1) + trg_embedding = layers.embedding( + input=trg_language_word, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr=fluid.ParamAttr(name='vemb')) + + rnn = fluid.layers.DynamicRNN() + with rnn.block(): + current_word = rnn.step_input(trg_embedding) + mem = rnn.memory(init=encoder_out) + fc1 = fluid.layers.fc(input=[current_word, mem], + size=decoder_size, + act='tanh') + out = fluid.layers.fc(input=fc1, size=target_dict_dim, act='softmax') + rnn.update_memory(mem, fc1) + rnn.output(out) + + return rnn() + + +def to_lodtensor(data, place): + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + res = core.LoDTensor() + res.set(flattened_data, place) + res.set_lod([lod]) + return res + + +def main(): + rnn_out = encoder_decoder() + label = layers.data( + name="target_language_next_word", shape=[1], dtype='int64', lod_level=1) + cost = layers.cross_entropy(input=rnn_out, label=label) + avg_cost = fluid.layers.mean(x=cost) + + optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4) + optimizer.minimize(avg_cost) + + fluid.memory_optimize(fluid.default_main_program()) + + # fix the order of training data + train_data = paddle.batch( + paddle.dataset.wmt14.train(dict_size), batch_size=batch_size) + + # train_data = paddle.batch( + # paddle.reader.shuffle( + # paddle.dataset.wmt14.train(dict_size), buf_size=1000), + # batch_size=batch_size) + + place = core.CPUPlace() + exe = Executor(place) + + exe.run(framework.default_startup_program()) + + batch_id = 0 + for pass_id in xrange(10): + for data in train_data(): + word_data = to_lodtensor(map(lambda x: x[0], data), place) + trg_word = to_lodtensor(map(lambda x: x[1], data), place) + trg_word_next = to_lodtensor(map(lambda x: x[2], data), place) + outs = exe.run(fluid.default_main_program(), + feed={ + 'src_word_id': word_data, + 'target_language_word': trg_word, + 'target_language_next_word': trg_word_next + }, + fetch_list=[avg_cost]) + avg_cost_val = np.array(outs[0]) + print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) + + " avg_cost=" + str(avg_cost_val)) + if batch_id > 2: + exit(0) + batch_id += 1 + + +if __name__ == '__main__': + main() diff --git a/python/paddle/v2/fluid/tests/op_test.py b/python/paddle/v2/fluid/tests/op_test.py index 56f54de86f680653fbd97a7ce1d3f547d1657587..3f6d7070c2987d0557c60db84a2c679cd2cfe36b 100644 --- a/python/paddle/v2/fluid/tests/op_test.py +++ b/python/paddle/v2/fluid/tests/op_test.py @@ -334,7 +334,7 @@ class OpTest(unittest.TestCase): def check_output(self, atol=1e-5): places = [core.CPUPlace()] - if core.is_compile_gpu() and core.op_support_gpu(self.op_type): + if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): places.append(core.CUDAPlace(0)) for place in places: self.check_output_with_place(place, atol) @@ -367,7 +367,7 @@ class OpTest(unittest.TestCase): max_relative_error=0.005, user_defined_grads=None): places = [core.CPUPlace()] - if core.is_compile_gpu() and core.op_support_gpu(self.op_type): + if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): places.append(core.CUDAPlace(0)) for place in places: self.check_grad_with_place(place, inputs_to_check, output_names, diff --git a/python/paddle/v2/fluid/tests/test_activation_op.py b/python/paddle/v2/fluid/tests/test_activation_op.py index 18605e60652a1614571a91918a012f0c08c8f1b3..1de5d446b8eaf57d3718dde7540c929996ee3432 100644 --- a/python/paddle/v2/fluid/tests/test_activation_op.py +++ b/python/paddle/v2/fluid/tests/test_activation_op.py @@ -186,8 +186,7 @@ class TestFloor(OpTest): self.op_type = "floor" x = np.random.uniform(-1, 1, [4, 4]).astype("float32") self.inputs = {'X': x} - # numpy floor need +1 - self.outputs = {'Out': np.floor(self.inputs['X']) + 1.0} + self.outputs = {'Out': np.floor(self.inputs['X'])} def test_check_output(self): self.check_output() diff --git a/python/paddle/v2/fluid/tests/test_adagrad_op.py b/python/paddle/v2/fluid/tests/test_adagrad_op.py index 86b0567ce123b00bace639fb8fe76cf3894abd6d..3556bcf8ba0d7f16b1d9bf50e46aebde83de2e25 100644 --- a/python/paddle/v2/fluid/tests/test_adagrad_op.py +++ b/python/paddle/v2/fluid/tests/test_adagrad_op.py @@ -180,7 +180,7 @@ class TestSparseAdagradOp(unittest.TestCase): def test_sparse_adagrad(self): places = [core.CPUPlace()] - if core.is_compile_gpu(): + if core.is_compiled_with_cuda(): places.append(core.CUDAPlace(0)) for place in places: self.check_with_place(place) diff --git a/python/paddle/v2/fluid/tests/test_adam_op.py b/python/paddle/v2/fluid/tests/test_adam_op.py index 10580adca714beeb7571312b8fdc4235ecaaccfe..df1fa8983c1984a9bb9f204aded148c17d3d609d 100644 --- a/python/paddle/v2/fluid/tests/test_adam_op.py +++ b/python/paddle/v2/fluid/tests/test_adam_op.py @@ -305,7 +305,7 @@ class TestSparseAdamOp(unittest.TestCase): def test_sparse_sgd(self): places = [core.CPUPlace()] - if core.is_compile_gpu(): + if core.is_compiled_with_cuda(): places.append(core.CUDAPlace(0)) for place in places: self.check_with_place(place) diff --git a/python/paddle/v2/fluid/tests/test_batch_norm_op.py b/python/paddle/v2/fluid/tests/test_batch_norm_op.py index 371bd426781b457582e74c33c80c46b5d56946fa..cf13166f255c782bdcec622d58d073a0943c8e1e 100644 --- a/python/paddle/v2/fluid/tests/test_batch_norm_op.py +++ b/python/paddle/v2/fluid/tests/test_batch_norm_op.py @@ -352,7 +352,7 @@ class TestBatchNormOp(OpTest): print "op test backward passed: ", str(place), data_layout places = [core.CPUPlace()] - if core.is_compile_gpu() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): places.append(core.CUDAPlace(0)) for place in places: diff --git a/python/paddle/v2/fluid/tests/test_detection_output_op.py b/python/paddle/v2/fluid/tests/test_detection_output_op.py index 4a9cd474b81a419bfb42c202327df04c0d2e5bd9..8a5e06b38f5ed5336ef02bac7876610758b44258 100644 --- a/python/paddle/v2/fluid/tests/test_detection_output_op.py +++ b/python/paddle/v2/fluid/tests/test_detection_output_op.py @@ -68,4 +68,6 @@ class TestUnpoolOp(OpTest): if __name__ == '__main__': - unittest.main() + # FIXME: detection_output_op will be rewritten. This unittest should be + # enabled after rewriting. + exit(0) # temporary disable this unittest diff --git a/python/paddle/v2/fluid/tests/test_gaussian_random_op.py b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py index 82842534d4ac7ad8b0a8e0d877c6a638fb53cadc..79beb8b1fcef610bc2f3e8d18da4345baa9b99c3 100644 --- a/python/paddle/v2/fluid/tests/test_gaussian_random_op.py +++ b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py @@ -33,7 +33,7 @@ class TestGaussianRandomOp(unittest.TestCase): self.gaussian_random_test(place=fluid.CPUPlace()) def test_gpu(self): - if core.is_compile_gpu(): + if core.is_compiled_with_cuda(): self.gaussian_random_test(place=fluid.CUDAPlace(0)) def gaussian_random_test(self, place): diff --git a/python/paddle/v2/fluid/tests/test_iou_similarity_op.py b/python/paddle/v2/fluid/tests/test_iou_similarity_op.py new file mode 100644 index 0000000000000000000000000000000000000000..128f2e4977195a563efcd26364cc6261da2dd685 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_iou_similarity_op.py @@ -0,0 +1,55 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import sys +import math +from op_test import OpTest + + +class TestIOUSimilarityOp(OpTest): + def test_check_output(self): + self.check_output() + + def setUp(self): + self.op_type = "iou_similarity" + self.boxes1 = np.array( + [[4.0, 3.0, 7.0, 5.0], [5.0, 6.0, 10.0, 7.0]]).astype('float32') + self.boxes2 = np.array([[3.0, 4.0, 6.0, 8.0], [14.0, 14.0, 15.0, 15.0], + [0.0, 0.0, 20.0, 20.0]]).astype('float32') + self.output = np.array( + [[2.0 / 16.0, 0, 6.0 / 400.0], + [1.0 / 16.0, 0.0, 5.0 / 400.0]]).astype('float32') + + self.inputs = {'X': self.boxes1, 'Y': self.boxes2} + + self.outputs = {'Out': self.output} + + +class TestIOUSimilarityOpWithLoD(TestIOUSimilarityOp): + def test_check_output(self): + self.check_output() + + def setUp(self): + super(TestIOUSimilarityOpWithLoD, self).setUp() + self.boxes1_lod = [[0, 1, 2]] + self.output_lod = [[0, 1, 2]] + + self.inputs = {'X': (self.boxes1, self.boxes1_lod), 'Y': self.boxes2} + self.outputs = {'Out': (self.output, self.output_lod)} + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py index 709abd6c6a4e0c2aa1b38a135d7424cd6886c966..4e863625422c93c77ad4fb65be35580943d1cf54 100644 --- a/python/paddle/v2/fluid/tests/test_layers.py +++ b/python/paddle/v2/fluid/tests/test_layers.py @@ -17,8 +17,9 @@ import unittest import paddle.v2.fluid.layers as layers import paddle.v2.fluid.nets as nets -from paddle.v2.fluid.framework import Program, program_guard +from paddle.v2.fluid.framework import Program, program_guard, default_main_program from paddle.v2.fluid.param_attr import ParamAttr +import decorators class TestBook(unittest.TestCase): @@ -225,6 +226,69 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(out) print(str(program)) + def test_im2sequence(self): + print("test_im2sequence") + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[3, 128, 128], dtype='float32') + output = layers.im2sequence( + input=x, stride=[1, 1], filter_size=[2, 2]) + self.assertIsNotNone(output) + print(str(program)) + + @decorators.prog_scope() + def test_nce(self): + window_size = 5 + words = [] + for i in xrange(window_size): + words.append( + layers.data( + name='word_{0}'.format(i), shape=[1], dtype='int64')) + + dict_size = 10000 + label_word = int(window_size / 2) + 1 + + embs = [] + for i in xrange(window_size): + if i == label_word: + continue + + emb = layers.embedding( + input=words[i], + size=[dict_size, 32], + param_attr='emb.w', + is_sparse=True) + + embs.append(emb) + + embs = layers.concat(input=embs, axis=1) + loss = layers.nce(input=embs, + label=words[label_word], + num_total_classes=dict_size, + param_attr='nce.w', + bias_attr='nce.b') + avg_loss = layers.mean(x=loss) + self.assertIsNotNone(avg_loss) + print(str(default_main_program())) + + def test_row_conv(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[16], dtype='float32', lod_level=1) + out = layers.row_conv(input=x, future_context_size=2) + self.assertIsNotNone(out) + print(str(program)) + + def test_multiplex(self): + program = Program() + with program_guard(program): + x1 = layers.data(name='x1', shape=[4], dtype='float32') + x2 = layers.data(name='x2', shape=[4], dtype='float32') + index = layers.data(name='index', shape=[1], dtype='int32') + out = layers.multiplex(inputs=[x1, x2], index=index) + self.assertIsNotNone(out) + print(str(program)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_lookup_table_op.py b/python/paddle/v2/fluid/tests/test_lookup_table_op.py index d5255ba31f7c9e45cf29f412546146234f822026..0c566c76c91dce8dcfc882eed998f492ae3cde76 100644 --- a/python/paddle/v2/fluid/tests/test_lookup_table_op.py +++ b/python/paddle/v2/fluid/tests/test_lookup_table_op.py @@ -33,5 +33,19 @@ class TestLookupTableOp(OpTest): self.check_grad(['W'], 'Out', no_grad_set=set('Ids')) +class TestLookupTableOpWithPadding(TestLookupTableOp): + def test_check_output(self): + ids = np.squeeze(self.inputs['Ids']) + padding_idx = np.random.choice(ids, 1)[0] + self.outputs['Out'][ids == padding_idx] = np.zeros(31) + self.attrs = {'padding_idx': long(padding_idx)} + self.check_output() + + def test_check_grad(self): + # Since paddings are not trainable and fixed in forward, the gradient of + # paddings makes no sense and we don't test the gradient here. + pass + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_lstm_op.py b/python/paddle/v2/fluid/tests/test_lstm_op.py index d9fa01e247ae613fb2a7ed523a447e31a5bd5994..3e79f9d8e157bc744f14ecfa7c9a6d7de4eae1f9 100644 --- a/python/paddle/v2/fluid/tests/test_lstm_op.py +++ b/python/paddle/v2/fluid/tests/test_lstm_op.py @@ -42,7 +42,7 @@ def relu(x): return np.maximum(x, 0) -ACTVATION = { +ACTIVATION = { 'identity': identity, 'sigmoid': sigmoid, 'tanh': tanh, @@ -158,8 +158,8 @@ class TestLstmOp(OpTest): w_b = b[:, 0:4 * self.D] w_c = b[:, 4 * self.D:] if self.use_peepholes else None h, c = lstm(x, self.lod, h0, c0, w, w_b, w_c, self.is_reverse, - ACTVATION[self.act_gate], ACTVATION[self.act_cell], - ACTVATION[self.act_cand]) + ACTIVATION[self.act_gate], ACTIVATION[self.act_cell], + ACTIVATION[self.act_cand]) self.inputs = {'Input': (x, self.lod), 'Weight': w} diff --git a/python/paddle/v2/fluid/tests/test_lstmp_op.py b/python/paddle/v2/fluid/tests/test_lstmp_op.py new file mode 100644 index 0000000000000000000000000000000000000000..92a954a9aa5574c3016cf9744e1765fff9e9c091 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_lstmp_op.py @@ -0,0 +1,286 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. +import unittest +import numpy as np +import test_lstm_op as LstmTest + +ACTIVATION = { + 'identity': LstmTest.identity, + 'sigmoid': LstmTest.sigmoid, + 'tanh': LstmTest.tanh, + 'relu': LstmTest.relu +} + + +# LSTM with recurrent projection Layer +def lstmp( + input, # T x 4D + lod, # 1 x N + h0=None, # N x D + c0=None, # N x D + w_r=None, # P x 4D + w_rh=None, # D x P + w_b=None, # 1 x 4D + w_c=None, # 1 x 3D + is_reverse=False, + act_gate=None, + act_cell=None, + act_cand=None, + act_proj=None): + def _step(x, w_r, w_rh, w_c, r_pre, c_pre, act_gate, act_cell, act_cand, + act_proj): + g = np.dot(r_pre, w_r) # 1 x 4D + g = g + x + g = np.reshape(g, (1, g.size)) + c, g_i, g_f, g_o = np.split(g, 4, axis=1) + if w_c is None: + g_i = act_gate(g_i) # 1 x D + g_f = act_gate(g_f) # 1 x D + else: + w_ic, w_fc, _ = np.split(w_c, 3, axis=1) + g_i = act_gate(g_i + w_ic * c_pre) # 1 x D + g_f = act_gate(g_f + w_fc * c_pre) # 1 x D + c = g_f * c_pre + g_i * act_cand(c) # 1 x D + + if w_c is None: + g_o = act_gate(g_o) # 1 x D + else: + _, _, w_oc = np.split(w_c, 3, axis=1) + g_o = act_gate(g_o + w_oc * c) # 1 x D + h = g_o * act_cell(c) + # projection + r = np.dot(h, w_rh) + r = act_proj(r) + return r, c + + def _reverse(x, lod): + y = np.zeros_like(x) + for i in range(len(lod) - 1): + b, e = lod[i], lod[i + 1] + y[b:e, :] = np.flip(x[b:e, :], 0) + return y + + offset = lod[0] + batch_size = len(offset) - 1 + # recurrent projection state + projection = [] + cell = [] + input = _reverse(input, offset) if is_reverse else input + if w_b is not None: + input = input + np.tile(w_b, (offset[-1], 1)) + for i in range(batch_size): + # compute one sequence + seq_len = offset[i + 1] - offset[i] + x = input[offset[i]:offset[i + 1], :] + r_pre = np.dot(h0[i], w_rh) # 1 x P + r_pre = act_proj(r_pre) + c_pre = c0[i] # 1 x D + for j in range(seq_len): + # compute one step + r_pre, c_pre = _step(x[j], w_r, w_rh, w_c, r_pre, c_pre, act_gate, + act_cell, act_cand, act_proj) + projection.append(r_pre.flatten()) + cell.append(c_pre.flatten()) + + projection = np.array(projection).astype('float64') + cell = np.array(cell).astype('float64') + + projection = _reverse(projection, offset) if is_reverse else projection + cell = _reverse(cell, offset) if is_reverse else cell + + assert projection.shape == (input.shape[0], w_r.shape[0]) # T x P + assert cell.shape == (input.shape[0], input.shape[1] / 4) # T x D + return projection, cell + + +class TestLstmpOp(LstmTest.TestLstmOp): + def reset_argument(self): + pass + + def setUp(self): + self.set_argument() + # projection size + self.P = 10 + self.act_proj = self.act_cell + + self.reset_argument() + self.op_type = 'lstmp' + + T = self.lod[0][-1] + N = len(self.lod[0]) - 1 + + x = np.random.normal(size=(T, 4 * self.D)).astype('float64') + if self.has_initial_state: + h0 = np.random.normal(size=(N, self.D)).astype('float64') + c0 = np.random.normal(size=(N, self.D)).astype('float64') + else: + h0 = np.zeros((N, self.D)).astype('float64') + c0 = np.zeros((N, self.D)).astype('float64') + w = np.random.normal(size=(self.P, 4 * self.D)).astype('float64') + if self.use_peepholes: + b = np.random.normal(size=(1, 7 * self.D)).astype('float64') + else: + b = np.random.normal(size=(1, 4 * self.D)).astype('float64') + + w_b = b[:, 0:4 * self.D] + w_c = b[:, 4 * self.D:] if self.use_peepholes else None + w_rh = np.random.normal(size=(self.D, self.P)).astype('float64') + r, c = lstmp(x, self.lod, h0, c0, w, w_rh, w_b, w_c, self.is_reverse, + ACTIVATION[self.act_gate], ACTIVATION[self.act_cell], + ACTIVATION[self.act_cand], ACTIVATION[self.act_proj]) + + self.inputs = {'Input': (x, self.lod), 'Weight': w, 'ProjWeight': w_rh} + + self.inputs['Bias'] = b + + if self.has_initial_state: + self.inputs['H0'] = h0 + self.inputs['C0'] = c0 + + self.outputs = { + 'Projection': (r, self.lod), + 'Cell': (c, self.lod), + } + self.attrs = { + 'use_peepholes': self.use_peepholes, + 'is_reverse': self.is_reverse, + 'gate_activation': self.act_gate, + 'cell_activation': self.act_cell, + 'candidate_activation': self.act_cand, + 'proj_activation': self.act_proj + } + + def test_check_output(self): + self.check_output(atol=1e-8) + + def test_check_grad(self): + # TODO(qingqing) remove folowing lines after the check_grad is refined. + N = len(self.lod[0]) - 1 + self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') + self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') + self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') + self.outputs['BatchCellPreAct'] = np.zeros( + (N, self.D)).astype('float64') + self.check_grad( + ['Input', 'Weight', 'ProjWeight', 'Bias'], ['Projection'], + max_relative_error=1e-2) + + +class TestLstmpOpHasInitial(TestLstmpOp): + def reset_argument(self): + self.has_initial_state = True + + def test_check_grad(self): + # TODO(qingqing) remove folowing lines after the check_grad is refined. + N = len(self.lod[0]) - 1 + self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') + self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') + self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') + self.outputs['BatchCellPreAct'] = np.zeros( + (N, self.D)).astype('float64') + self.check_grad( + ['Input', 'Weight', 'ProjWeight', 'Bias', 'H0', 'C0'], + ['Projection'], + max_relative_error=1e-2) + + def test_check_grad_ingore_bias(self): + N = len(self.lod[0]) - 1 + self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') + self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') + self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') + self.outputs['BatchCellPreAct'] = np.zeros( + (N, self.D)).astype('float64') + self.check_grad( + ['Input', 'ProjWeight', 'Weight'], ['Projection'], + max_relative_error=1e-2, + no_grad_set=set('Bias')) + + def test_check_grad_ingore_weight(self): + N = len(self.lod[0]) - 1 + self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') + self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') + self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') + self.outputs['BatchCellPreAct'] = np.zeros( + (N, self.D)).astype('float64') + self.check_grad( + ['Input', 'ProjWeight', 'Bias'], ['Projection'], + max_relative_error=1e-2, + no_grad_set=set('Weight')) + + def test_check_grad_ingore_proj_weight(self): + N = len(self.lod[0]) - 1 + self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') + self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') + self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') + self.outputs['BatchCellPreAct'] = np.zeros( + (N, self.D)).astype('float64') + self.check_grad( + ['Input', 'Weight', 'Bias'], ['Projection'], + max_relative_error=1e-2, + no_grad_set=set('ProjWeight')) + + def test_check_grad_ingore_input(self): + N = len(self.lod[0]) - 1 + self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') + self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') + self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') + self.outputs['BatchCellPreAct'] = np.zeros( + (N, self.D)).astype('float64') + self.check_grad( + ['Weight', 'ProjWeight', 'Bias'], ['Projection'], + max_relative_error=1e-2, + no_grad_set=set('Input')) + + def test_check_grad_ingore_h0(self): + N = len(self.lod[0]) - 1 + self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') + self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') + self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') + self.outputs['BatchCellPreAct'] = np.zeros( + (N, self.D)).astype('float64') + self.check_grad( + ['Input', 'Weight', 'ProjWeight', 'Bias', 'C0'], ['Projection'], + max_relative_error=1e-2, + no_grad_set=set('H0')) + + def test_check_grad_ingore_c0(self): + N = len(self.lod[0]) - 1 + self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64') + self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64') + self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64') + self.outputs['BatchCellPreAct'] = np.zeros( + (N, self.D)).astype('float64') + self.check_grad( + ['Input', 'Weight', 'ProjWeight', 'Bias', 'H0'], ['Projection'], + max_relative_error=1e-2, + no_grad_set=set('C0')) + + +class TestLstmpOpRerverse(TestLstmpOp): + def reset_argument(self): + self.is_reverse = True + + +class TestLstmpOpNotUsePeepholes(TestLstmpOp): + def reset_argument(self): + self.use_peepholes = False + + +class TestLstmpOpLinearProjection(TestLstmpOp): + def reset_argument(self): + self.act_proj = 'identity' + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_multihead_attention.py b/python/paddle/v2/fluid/tests/test_multihead_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..a2b300a645fe21931cc12a4e7bb8ebe9b85707c9 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_multihead_attention.py @@ -0,0 +1,98 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle.v2.fluid as fluid +import paddle.v2.fluid.core as core +import numpy as np + + +class TestMultiheadAttention(unittest.TestCase): + def gen_random_input(self): + """Generate random input data. + """ + # batch_size, max_sequence_length, hidden dimension + self.input_shape = (3, 13, 16) + self.queries = np.random.random(size=self.input_shape).astype("float32") + self.keys = np.random.random(size=self.input_shape).astype("float32") + + def set_program(self): + """Build the test program. + """ + queries = fluid.layers.data( + name="queries", + shape=self.input_shape, + dtype="float32", + append_batch_size=False) + queries.stop_gradient = False + keys = fluid.layers.data( + name="keys", + shape=self.input_shape, + dtype="float32", + append_batch_size=False) + keys.stop_gradient = False + + contexts = fluid.nets.scaled_dot_product_attention( + queries=queries, + keys=keys, + values=keys, + num_heads=8, + dropout_rate=0.) + out = fluid.layers.reduce_sum(contexts, dim=None) + fluid.backward.append_backward(loss=out) + + self.fetch_list = [contexts] + + def run_program(self): + """Run the test program. + """ + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + + for place in places: + self.set_inputs(place) + exe = fluid.Executor(place) + + exe.run(fluid.default_startup_program()) + output = exe.run(fluid.default_main_program(), + feed=self.inputs, + fetch_list=self.fetch_list, + return_numpy=True) + self.op_output = output + + def set_inputs(self, place): + """Set the randomly generated data to the test program. + """ + self.inputs = {} + queries = fluid.Tensor() + queries.set(self.queries, place) + + keys = fluid.Tensor() + keys.set(self.keys, place) + + self.inputs["keys"] = keys + self.inputs["queries"] = queries + + def test_multihead_attention(self): + self.gen_random_input() + + self.set_program() + self.run_program() + + #fixme(caoying) add more meaningfull unittest. + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_nce.py b/python/paddle/v2/fluid/tests/test_nce.py index 3ae727a573855b3cb618a8fab70404adf3d92f51..9a51c1f612a0d5363d36e6642ed3b409970025b1 100644 --- a/python/paddle/v2/fluid/tests/test_nce.py +++ b/python/paddle/v2/fluid/tests/test_nce.py @@ -109,4 +109,6 @@ class TestNCECase1(TestNCE): if __name__ == '__main__': + # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/7778 + exit(0) unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_normalization_wrapper.py b/python/paddle/v2/fluid/tests/test_normalization_wrapper.py index 57f14f6b9cc9c7cf9ae93274cf3d7763350e6e10..6b71f2a923f0cf0744d6b2190aa35830dcf15f24 100644 --- a/python/paddle/v2/fluid/tests/test_normalization_wrapper.py +++ b/python/paddle/v2/fluid/tests/test_normalization_wrapper.py @@ -46,7 +46,7 @@ class TestNormalization(unittest.TestCase): """Run the test program. """ places = [core.CPUPlace()] - if core.is_compile_gpu(): + if core.is_compiled_with_cuda(): places.append(core.CUDAPlace(0)) for place in places: diff --git a/python/paddle/v2/fluid/tests/test_one_hot_op.py b/python/paddle/v2/fluid/tests/test_one_hot_op.py new file mode 100644 index 0000000000000000000000000000000000000000..e51ea27d14d0637021f8902fa935beb318658018 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_one_hot_op.py @@ -0,0 +1,110 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import math +from op_test import OpTest +import paddle.v2.fluid as fluid +import paddle.v2.fluid.core as core +import paddle.v2.fluid.framework as framework +from paddle.v2.fluid.framework import Program, program_guard + + +class TestOneHotOp(OpTest): + def setUp(self): + self.op_type = 'one_hot' + depth = 10 + dimension = 12 + x_lod = [[0, 4, 5, 8, 11]] + x = [np.random.randint(0, depth - 1) for i in xrange(x_lod[0][-1])] + x = np.array(x).astype('int').reshape([x_lod[0][-1], 1]) + + out = np.zeros(shape=(np.product(x.shape[:-1]), + depth)).astype('float32') + + for i in xrange(np.product(x.shape)): + out[i, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod)} + self.attrs = {'depth': depth, 'dtype': int(core.DataType.FP32)} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + self.check_output() + + +class TestOneHotOp_default_dtype(OpTest): + def setUp(self): + self.op_type = 'one_hot' + depth = 10 + dimension = 12 + x_lod = [[0, 4, 5, 8, 11]] + x = [np.random.randint(0, depth - 1) for i in xrange(x_lod[0][-1])] + x = np.array(x).astype('int').reshape([x_lod[0][-1], 1]) + + out = np.zeros(shape=(np.product(x.shape[:-1]), + depth)).astype('float32') + + for i in xrange(np.product(x.shape)): + out[i, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod)} + self.attrs = {'depth': depth} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + self.check_output() + + +class TestOneHotOp_exception(OpTest): + def setUp(self): + self.op_type = 'one_hot' + self.depth = 10 + self.place = core.CPUPlace() + self.dimension = 12 + self.x = core.LoDTensor() + x_lod = [[0, 4, 5, 8, 11]] + data = [np.random.randint(11, 20) for i in xrange(x_lod[0][-1])] + data = np.array(data).astype('int').reshape([x_lod[0][-1], 1]) + self.x.set(data, self.place) + self.x.set_lod(x_lod) + + def test_check_output(self): + program = Program() + with program_guard(program): + x = fluid.layers.data( + name='x', shape=[self.dimension], dtype='float32', lod_level=1) + block = program.current_block() + one_hot_out = block.create_var( + name="one_hot_out", + type=core.VarDesc.VarType.LOD_TENSOR, + dtype='float32') + block.append_op( + type='one_hot', + inputs={'X': x}, + attrs={'depth': self.depth}, + outputs={'Out': one_hot_out}) + exe = fluid.Executor(self.place) + + def run(): + exe.run(feed={'x': self.x}, + fetch_list=[one_hot_out], + return_numpy=False) + + self.assertRaises(core.EnforceNotMet, run) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_op_support_gpu.py b/python/paddle/v2/fluid/tests/test_op_support_gpu.py index 34939818126b1d747fb76861bbd691894fb3759b..7de02a8fda22a3db82a2e0b5e6fa9c9f2718fa12 100644 --- a/python/paddle/v2/fluid/tests/test_op_support_gpu.py +++ b/python/paddle/v2/fluid/tests/test_op_support_gpu.py @@ -18,7 +18,8 @@ import paddle.v2.fluid.core as core class TestOpSupportGPU(unittest.TestCase): def test_case(self): - self.assertEqual(core.is_compile_gpu(), core.op_support_gpu("sum")) + self.assertEqual(core.is_compiled_with_cuda(), + core.op_support_gpu("sum")) if __name__ == '__main__': diff --git a/python/paddle/v2/fluid/tests/test_parallel_op.py b/python/paddle/v2/fluid/tests/test_parallel_op.py index dfde492c7cd930615c030bb0c8e5a2cf36ff59a8..367cc8b1aaf0aff24c685031f33d35becb9eb7ef 100644 --- a/python/paddle/v2/fluid/tests/test_parallel_op.py +++ b/python/paddle/v2/fluid/tests/test_parallel_op.py @@ -53,7 +53,7 @@ class BaseParallelForTest(unittest.TestCase): fetch=fetch, place=cpu, use_parallel=True) - if fluid.core.is_compile_gpu(): + if fluid.core.is_compiled_with_cuda(): gpu = fluid.CUDAPlace(0) result_gpu = self._run_test_impl_( callback=callback, @@ -159,7 +159,7 @@ class ParallelOpTest(BaseParallelForTest): def test_simple_fc(self): self.run_test( - callback=ParallelOpTest.__network__, + callback=self.__network__, feed={ 'img': numpy.random.random(size=(51, 784)).astype('float32') }, @@ -167,10 +167,35 @@ class ParallelOpTest(BaseParallelForTest): def test_fc_with_tiny_data(self): self.run_test( - callback=ParallelOpTest.__network__, + callback=self.__network__, feed={'img': numpy.random.random(size=(1, 784)).astype('float32')}, fetch=['fc1.w@GRAD']) +class ParallelOpTestMultipleInput(BaseParallelForTest): + @staticmethod + def __network__(): + x = fluid.layers.data( + shape=[784], dtype='float32', name='img1', stop_gradient=False) + y = fluid.layers.data( + shape=[784], dtype='float32', name='img2', stop_gradient=False) + yield [x, y] + x = x + y + hidden1 = fluid.layers.fc(input=x, size=200, param_attr='fc1.w') + hidden2 = fluid.layers.fc(input=hidden1, size=200, param_attr='fc2.w') + hidden3 = fluid.layers.fc(input=hidden2, size=200, param_attr='fc3.w') + loss = fluid.layers.mean(x=hidden3) + yield loss + + def test_simple_fc(self): + self.run_test( + callback=self.__network__, + feed={ + 'img1': numpy.random.random(size=(51, 784)).astype('float32'), + 'img2': numpy.random.random(size=(51, 784)).astype('float32') + }, + fetch=['fc1.w@GRAD', 'fc2.w@GRAD', 'fc3.w@GRAD']) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_prior_box_op.py b/python/paddle/v2/fluid/tests/test_prior_box_op.py new file mode 100644 index 0000000000000000000000000000000000000000..ca8d2bca74ce2d4be8160c8851e393489691ae56 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_prior_box_op.py @@ -0,0 +1,148 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import sys +import math +from op_test import OpTest + + +class TestPriorBoxOp(OpTest): + def set_data(self): + self.init_test_params() + self.init_test_input() + self.init_test_output() + self.inputs = {'Input': self.input, 'Image': self.image} + + self.attrs = { + 'min_sizes': self.min_sizes, + 'max_sizes': self.max_sizes, + 'aspect_ratios': self.aspect_ratios, + 'variances': self.variances, + 'flip': self.flip, + 'clip': self.clip, + 'step_w': self.step_w, + 'step_h': self.step_h, + 'offset': self.offset + } + + self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + return + + def setUp(self): + self.op_type = "prior_box" + self.set_data() + + def init_test_params(self): + self.layer_w = 4 + self.layer_h = 4 + + self.image_w = 20 + self.image_h = 20 + + self.step_w = float(self.image_w) / float(self.layer_w) + self.step_h = float(self.image_h) / float(self.layer_h) + + self.input_channels = 2 + self.image_channels = 3 + self.batch_size = 10 + + self.min_sizes = [2, 4] + self.min_sizes = np.array(self.min_sizes).astype('int64') + self.max_sizes = [5, 10] + self.max_sizes = np.array(self.max_sizes).astype('int64') + self.aspect_ratios = [2.0, 3.0] + self.flip = True + self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0] + self.aspect_ratios = np.array( + self.aspect_ratios, dtype=np.float).flatten() + self.variances = [0.1, 0.1, 0.2, 0.2] + self.variances = np.array(self.variances, dtype=np.float).flatten() + + self.clip = True + + self.num_priors = len(self.real_aspect_ratios) * len(self.min_sizes) + if len(self.max_sizes) > 1: + self.num_priors += len(self.max_sizes) + self.offset = 0.5 + + def init_test_input(self): + self.image = np.random.random( + (self.batch_size, self.image_channels, self.image_w, + self.image_h)).astype('float32') + + self.input = np.random.random( + (self.batch_size, self.input_channels, self.layer_w, + self.layer_h)).astype('float32') + + def init_test_output(self): + out_dim = (self.layer_h, self.layer_w, self.num_priors, 4) + out_boxes = np.zeros(out_dim).astype('float32') + out_var = np.zeros(out_dim).astype('float32') + + idx = 0 + for h in range(self.layer_h): + for w in range(self.layer_w): + c_x = (w + self.offset) * self.step_w + c_y = (h + self.offset) * self.step_h + idx = 0 + for s in range(len(self.min_sizes)): + min_size = self.min_sizes[s] + c_w = c_h = min_size / 2. + out_boxes[h, w, idx, :] = [ + (c_x - c_w) / self.image_w, (c_y - c_h) / self.image_h, + (c_x + c_w) / self.image_w, (c_y + c_h) / self.image_h + ] + idx += 1 + + if len(self.max_sizes) > 0: + max_size = self.max_sizes[s] + # second prior: aspect_ratio = 1, + c_w = c_h = math.sqrt(min_size * max_size) / 2 + out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w, + (c_y - c_h) / self.image_h, + (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h] + idx += 1 + + # rest of priors + for r in range(len(self.real_aspect_ratios)): + ar = self.real_aspect_ratios[r] + if math.fabs(ar - 1.) < 1e-6: + continue + c_w = min_size * math.sqrt(ar) / 2 + c_h = (min_size / math.sqrt(ar)) / 2 + out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w, + (c_y - c_h) / self.image_h, + (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h] + idx += 1 + # clip the prior's coordidate such that it is within[0, 1] + if self.clip: + out_boxes = np.clip(out_boxes, 0.0, 1.0) + # set the variance. + out_var = np.tile(self.variances, (self.layer_h, self.layer_w, + self.num_priors, 1)) + self.out_boxes = out_boxes.astype('float32') + self.out_var = out_var.astype('float32') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_profiler.py b/python/paddle/v2/fluid/tests/test_profiler.py index abf8881b6786416f56f93e498761a4791b35d7c3..09b2d08401878448b4b3f3c6c03193e255e9ffeb 100644 --- a/python/paddle/v2/fluid/tests/test_profiler.py +++ b/python/paddle/v2/fluid/tests/test_profiler.py @@ -13,16 +13,17 @@ # limitations under the License. import unittest +import os import numpy as np import paddle.v2.fluid as fluid import paddle.v2.fluid.profiler as profiler import paddle.v2.fluid.layers as layers -import os +import paddle.v2.fluid.core as core class TestProfiler(unittest.TestCase): def test_nvprof(self): - if not fluid.core.is_compile_gpu(): + if not fluid.core.is_compiled_with_cuda(): return epoc = 8 dshape = [4, 3, 28, 28] @@ -40,6 +41,50 @@ class TestProfiler(unittest.TestCase): exe.run(fluid.default_main_program(), feed={'data': input}) os.remove(output_file) + def net_profiler(self, state): + if state == 'GPU' and not core.is_compiled_with_cuda(): + return + startup_program = fluid.Program() + main_program = fluid.Program() + + with fluid.program_guard(main_program, startup_program): + image = fluid.layers.data(name='x', shape=[784], dtype='float32') + hidden1 = fluid.layers.fc(input=image, size=128, act='relu') + hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu') + predict = fluid.layers.fc(input=hidden2, size=10, act='softmax') + label = fluid.layers.data(name='y', shape=[1], dtype='int64') + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + accuracy = fluid.evaluator.Accuracy(input=predict, label=label) + + optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9) + opts = optimizer.minimize(avg_cost, startup_program=startup_program) + + place = fluid.CPUPlace() if state == 'CPU' else fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(startup_program) + + accuracy.reset(exe) + with profiler.profiler(state, 'total') as prof: + for iter in range(10): + if iter == 2: + profiler.reset_profiler() + x = np.random.random((32, 784)).astype("float32") + y = np.random.randint(0, 10, (32, 1)).astype("int64") + + outs = exe.run(main_program, + feed={'x': x, + 'y': y}, + fetch_list=[avg_cost] + accuracy.metrics) + acc = np.array(outs[1]) + pass_acc = accuracy.eval(exe) + + def test_cpu_profiler(self): + self.net_profiler('CPU') + + def test_cuda_profiler(self): + self.net_profiler('GPU') + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py b/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py index 74cd6de9e6fde70c001bb2189c4976cdd8e34633..0a223bac0ce8fd626881cef983c7cd960f2c5ba8 100644 --- a/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py +++ b/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py @@ -45,7 +45,7 @@ class TestReorderLoDTensor(unittest.TestCase): outputs = [] input_grads = [] places = [core.CPUPlace()] - if core.is_compile_gpu(): + if core.is_compiled_with_cuda(): places.append(core.CUDAPlace(0)) for place in places: self.set_inputs(place) diff --git a/python/paddle/v2/fluid/tests/test_sgd_op.py b/python/paddle/v2/fluid/tests/test_sgd_op.py index f87927968b0fdb00ec207ff1d52be9e0d81af139..ba2ca1683f9f6d72bbd1550df89c7424d223a1d9 100644 --- a/python/paddle/v2/fluid/tests/test_sgd_op.py +++ b/python/paddle/v2/fluid/tests/test_sgd_op.py @@ -91,7 +91,7 @@ class TestSparseSGDOp(unittest.TestCase): def test_sparse_sgd(self): places = [core.CPUPlace()] - if core.is_compile_gpu(): + if core.is_compiled_with_cuda(): places.append(core.CUDAPlace(0)) for place in places: self.check_with_place(place) diff --git a/python/paddle/v2/fluid/tests/test_split_selected_rows_op.py b/python/paddle/v2/fluid/tests/test_split_selected_rows_op.py index 37c6587c4151a89563f93cab35d63b2419ef88ab..343aa20066146ae08462a92f1efaa20c4d4b5ed8 100644 --- a/python/paddle/v2/fluid/tests/test_split_selected_rows_op.py +++ b/python/paddle/v2/fluid/tests/test_split_selected_rows_op.py @@ -21,7 +21,7 @@ from paddle.v2.fluid.op import Operator class TestSpliteSelectedRows(unittest.TestCase): def get_places(self): places = [core.CPUPlace()] - if core.is_compile_gpu(): + if core.is_compiled_with_cuda(): places.append(core.CUDAPlace(0)) return places diff --git a/python/paddle/v2/fluid/tests/test_uniform_random_op.py b/python/paddle/v2/fluid/tests/test_uniform_random_op.py index b2a39f975eb461292dc2e7be332a26931684bf90..94cf416fad8f02cdea8017ae1350fa264ce644b1 100644 --- a/python/paddle/v2/fluid/tests/test_uniform_random_op.py +++ b/python/paddle/v2/fluid/tests/test_uniform_random_op.py @@ -36,7 +36,7 @@ class TestUniformRandomOp(unittest.TestCase): self.uniform_random_test(place=core.CPUPlace()) def test_gpu(self): - if core.is_compile_gpu(): + if core.is_compiled_with_cuda(): self.uniform_random_test(place=core.CUDAPlace(0)) def uniform_random_test(self, place): diff --git a/python/paddle/v2/image.py b/python/paddle/v2/image.py index 1429d6b1e08fe4ab2d1c5a0f19f1cedbcbc85abd..e5000e440cc8d822dbd38dce3978d2722d32ebe4 100644 --- a/python/paddle/v2/image.py +++ b/python/paddle/v2/image.py @@ -176,7 +176,6 @@ def resize_short(im, size): :param size: the shorter edge size of image after resizing. :type size: int """ - assert im.shape[-1] == 1 or im.shape[-1] == 3 h, w = im.shape[:2] h_new, w_new = size, size if h > w: @@ -267,7 +266,7 @@ def random_crop(im, size, is_color=True): return im -def left_right_flip(im): +def left_right_flip(im, is_color=True): """ Flip an image along the horizontal direction. Return the flipped image. @@ -278,13 +277,15 @@ def left_right_flip(im): im = left_right_flip(im) - :paam im: input image with HWC layout + :param im: input image with HWC layout or HW layout for gray image :type im: ndarray + :param is_color: whether input image is color or not + :type is_color: bool """ - if len(im.shape) == 3: + if len(im.shape) == 3 and is_color: return im[:, ::-1, :] else: - return im[:, ::-1, :] + return im[:, ::-1] def simple_transform(im, @@ -319,11 +320,12 @@ def simple_transform(im, """ im = resize_short(im, resize_size) if is_train: - im = random_crop(im, crop_size) + im = random_crop(im, crop_size, is_color=is_color) if np.random.randint(2) == 0: - im = left_right_flip(im) + im = left_right_flip(im, is_color) else: - im = center_crop(im, crop_size) + im = center_crop(im, crop_size, is_color) + im = center_crop(im, crop_size, is_color=is_color) if len(im.shape) == 3: im = to_chw(im) @@ -331,8 +333,10 @@ def simple_transform(im, if mean is not None: mean = np.array(mean, dtype=np.float32) # mean value, may be one value per channel - if mean.ndim == 1: + if mean.ndim == 1 and is_color: mean = mean[:, np.newaxis, np.newaxis] + elif mean.ndim == 1: + mean = mean else: # elementwise mean assert len(mean.shape) == len(im) @@ -372,6 +376,6 @@ def load_and_transform(filename, mean values per channel. :type mean: numpy array | list """ - im = load_image(filename) + im = load_image(filename, is_color) im = simple_transform(im, resize_size, crop_size, is_train, is_color, mean) return im diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64 index 2c6ba650a5d7996bef212e88a16f2a159ca377e7..0f1b8331309248aaaf0ed32cf14c583a4cdb7437 100644 --- a/tools/manylinux1/Dockerfile.x64 +++ b/tools/manylinux1/Dockerfile.x64 @@ -35,7 +35,7 @@ RUN cd /opt && wget -q --no-check-certificate https://github.com/google/protobuf cd protobuf-3.1.0 && ./configure && make -j4 && make install && cd .. && rm -f protobuf-cpp-3.1.0.tar.gz -RUN yum install -y sqlite-devel zlib-devel openssl-devel boost boost-devel pcre-devel vim tk-devel tkinter libtool +RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool RUN wget -O /root/requirements.txt https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt