提交 e163fd30 编写于 作者: T typhoonzero

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into recv_op_python_with_guard

...@@ -31,9 +31,6 @@ if(NOT CMAKE_CROSSCOMPILING) ...@@ -31,9 +31,6 @@ if(NOT CMAKE_CROSSCOMPILING)
endif(NOT CMAKE_CROSSCOMPILING) endif(NOT CMAKE_CROSSCOMPILING)
find_package(Git REQUIRED) find_package(Git REQUIRED)
find_package(Threads REQUIRED) find_package(Threads REQUIRED)
if(NOT ANDROID AND NOT IOS)
find_package(Boost QUIET)
endif()
include(simd) include(simd)
...@@ -140,6 +137,7 @@ include(external/openblas) # download, build, install openblas ...@@ -140,6 +137,7 @@ include(external/openblas) # download, build, install openblas
include(external/mkldnn) # download, build, install mkldnn include(external/mkldnn) # download, build, install mkldnn
include(external/swig) # download, build, install swig include(external/swig) # download, build, install swig
include(external/warpctc) # download, build, install warpctc include(external/warpctc) # download, build, install warpctc
include(external/boost) # download, build, install boost
include(external/any) # download libn::any include(external/any) # download libn::any
include(external/eigen) # download eigen3 include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11 include(external/pybind11) # download pybind11
...@@ -164,7 +162,6 @@ include_directories("${PADDLE_SOURCE_DIR}") ...@@ -164,7 +162,6 @@ include_directories("${PADDLE_SOURCE_DIR}")
include_directories("${PADDLE_SOURCE_DIR}/paddle/cuda/include") include_directories("${PADDLE_SOURCE_DIR}/paddle/cuda/include")
include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto") include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c") include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c")
include_directories(${Boost_INCLUDE_DIRS})
set(EXTERNAL_LIBS set(EXTERNAL_LIBS
${GFLAGS_LIBRARIES} ${GFLAGS_LIBRARIES}
......
...@@ -27,7 +27,7 @@ RUN apt-get update && \ ...@@ -27,7 +27,7 @@ RUN apt-get update && \
curl sed grep graphviz libjpeg-dev zlib1g-dev \ curl sed grep graphviz libjpeg-dev zlib1g-dev \
python-matplotlib gcc-4.8 g++-4.8 \ python-matplotlib gcc-4.8 g++-4.8 \
automake locales clang-format swig doxygen cmake \ automake locales clang-format swig doxygen cmake \
liblapack-dev liblapacke-dev libboost-dev \ liblapack-dev liblapacke-dev \
clang-3.8 llvm-3.8 libclang-3.8-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \
net-tools libtool && \ net-tools libtool && \
apt-get clean -y apt-get clean -y
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include(ExternalProject)
set(BOOST_PROJECT "extern_boost")
set(BOOST_VER "1.66.0")
set(BOOST_TAR "boost_1_66_0")
set(BOOST_URL "https://dl.bintray.com/boostorg/release/${BOOST_VER}/source/${BOOST_TAR}.tar.gz")
set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
include_directories(${BOOST_INCLUDE_DIR})
ExternalProject_Add(
${BOOST_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
DOWNLOAD_DIR ${BOOST_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate ${BOOST_URL} -c -q -O ${BOOST_TAR}.tar.gz
&& tar zxf ${BOOST_TAR}.tar.gz
DOWNLOAD_NO_PROGRESS 1
PREFIX ${BOOST_SOURCES_DIR}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
UPDATE_COMMAND ""
)
if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c)
file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
add_library(boost STATIC ${dummyfile})
else()
add_library(boost INTERFACE)
endif()
add_dependencies(boost ${BOOST_PROJECT})
list(APPEND external_project_dependencies boost)
set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR})
...@@ -505,6 +505,11 @@ swish ...@@ -505,6 +505,11 @@ swish
.. autofunction:: paddle.v2.fluid.layers.swish .. autofunction:: paddle.v2.fluid.layers.swish
:noindex: :noindex:
im2sequence
------
.. autofunction:: paddle.v2.fluid.layers.im2sequence
:noindex:
edit_distance edit_distance
--------------- ---------------
.. autofunction:: paddle.v2.fluid.layers.edit_distance_error .. autofunction:: paddle.v2.fluid.layers.edit_distance_error
...@@ -524,3 +529,8 @@ sequence_reshape ...@@ -524,3 +529,8 @@ sequence_reshape
---------------- ----------------
.. autofunction:: paddle.v2.fluid.layers.sequence_reshape .. autofunction:: paddle.v2.fluid.layers.sequence_reshape
:noindex: :noindex:
row_conv
--------
.. autofunction:: paddle.v2.fluid.layers.row_conv
:noindex:
# Design Doc: CSP in PaddlePaddle Fluid
## Motivation
Concurrent programming is important for deep learning. Few example applications are:
1. The main thread keeps reading the next mini-batch while another thread uses the GPU for computing.
2. The main thread performs the computation while another thread uploads the local gradients from each trainer to the parameter server.
Most DL systems, including TensorFlow, Caffe2, and MxNet, can asynchronously execute operators in a graph. However, Fluid doesn't have the concept of a graph at all, as the design goal of Fluid is that of a programming language.
## Concurrent Programming Models
There were many concurrent programming models, implemented in various forms:
| concurrent programming model | implementation |
|-----|-----|
| mutex | types and functions in standard libraries |
| semaphore | types and functions in standard libraries |
| communicating sequential processes (CSP) | Go programming language |
| actor model | Erlang programming language |
| message passing | MPI |
| bulk synchronous parallel (BSP) | Pregel distributed programming framework |
Since Fluid was designed to be a programming language, we would like to implement CSP in Fluid.
### CSP v.s. Actor Model
A well-known implementation of Actor Model is the Erlang programming language. In Actor Model, *processes* could send messages to another process and receive messages from another process given the process IDs. We can find the three ingredients, process with ID, send, and recv, in MPI too. Indeed, we can rewrite Erlang programs in Python + MPI with possibly fewer lines of code. Our concern with Actor Model is that it doesn't seem reasonable to implement process management in a programming language's runtime library; instead, it should be the operating systems' responsibility to manage processes and libraries like MPI for send/recv.
## CSP in Fluid
Fluid has two fundamental control-flows: *if-else* and *while*. If we are to implement CSP, we need the following:
1. a new data type: *channel* and operators *send* and *recv*,
1. *goroutine* or thread, and
1. a new control-flow: select.
We also need Python wrappers for the above components.
The type *channel* is conceptually the blocking queue. In Go, its implemented is a [blocking circular queue](https://github.com/golang/go/blob/68ce117cf17b8debf5754bfd476345779b5b6616/src/runtime/chan.go#L31-L50), which supports send and recv.
The `select` operation has been in OS kernels long before Go language. All Unix kernels implement system calls *poll* and *select*. They monitor multiple file descriptors to see if I/O is possible on any of them. This takes O(N) time. Since Linux 2.6, a new system call, *epoll*, can do the same in O(1) time. In BSD systems, there is a similar system call *kqueue*. Go's Linux implementation uses epoll.
It might be a good idea to implement Fluid's select using epoll too. In this design doc, we start from the O(N) way, so we could focus on Python binding and the syntax.
### Type Channel
Fluid supports many data types:
1. Tensor,
1. Row-sparse Tensor
1. LoD Tensor,
1. Tensor array, etc
Each data type is registered in the [`framework.proto`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L117-L127) as an enum value. To add a new type channel, we need to add a new type enum.
To expose a C++ type to Python, we need to edit the [`pybind.cc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc) file. [Here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc#L120-L164) is an example how we expose C++ class LoDTensor.
## Syntax Design
### Create Channel
In Go, we create a channel by specifying the element type and buffer size:
```go
ch := make(chan int) // a channel without buffer
ch1 := make(chan int, 100) // a channel that can buffer 100 ints.
```
In Fluid, we should be able to do the same:
```python
ch = fluid.make_chan(dtype=INT)
ch1 = fluid.make_chan(dtype=INT, 100)
```
In addition to that, we want channels that can hold more complex element types, e.g., Tensors of float16:
```python
ch = fluid.make_chan(dtype=Tensor, etype=float16)
```
or Tensors of Tensors of float16 etc.
The point here is that we need a consistent way to compose types, like in C++ we can have `Tensor<Tensor<...<float16>...> >`.
### Send and Recv
### Select
## Example Programs
### 1. RPC between Trainers and Parameter Servers
### 2. Concurrent Minibatch Loading
...@@ -9,16 +9,16 @@ different purposes. ...@@ -9,16 +9,16 @@ different purposes.
## Background ## Background
The previous implementations of the parameter server does not run a The previous implementations of the parameter server do not run a
fluid sub-program. Parameter initialization, optimizer computation, network fluid sub-program. Parameter initialization, optimizer computation, network
communication and checkpointing are implemented twice on both the communication and checkpointing are implemented twice on both the
trainer and the parameter server. trainer as well as the parameter server.
It would be great if we can write code once and use them on both the It would be great if we can write code once and use them on both: the
trainer and the parameter server: reduces code duplication and trainer and the parameter server, since this reduces code duplication and
improves extensibility. Given that after the current refactor, we are improves extensibility. Given that after the current refactoring, we are
representing everything as a computing graph on the representing everything as a computation graph on the
trainer. Representing everything as a computing graph on the parameter trainer. Representing everything as a computation graph on the parameter
server becomes a natural extension. server becomes a natural extension.
## Design ## Design
...@@ -30,9 +30,9 @@ into sub-programs to be scheduled on different nodes with the following ...@@ -30,9 +30,9 @@ into sub-programs to be scheduled on different nodes with the following
steps: steps:
1. OP placement: the OPs will be placed on different nodes according 1. OP placement: the OPs will be placed on different nodes according
to heuristic that minimizes estimated total computation to a heuristic that minimizes the estimated total computation
time. Currently we will use a simple heuristic that puts parameter time. Currently we will use a simple heuristic that puts parameter
varable on parameter server workers and everything else on trainer variable on parameter server workers and everything else on trainer
workers. workers.
1. Add communication OPs to enable the communication between nodes. 1. Add communication OPs to enable the communication between nodes.
...@@ -47,22 +47,22 @@ After converting: ...@@ -47,22 +47,22 @@ After converting:
<img src="src/dist-graph.png" width="700"/> <img src="src/dist-graph.png" width="700"/>
1. The parameter variable W and it's optimizer program are placed on the parameter server. 1. The parameter variable W and its optimizer program are placed on the parameter server.
1. Operators are added to the program. 1. Operators are added to the program.
- *Send* sends data to the connected *Recv* operator. The - *Send* sends data to the connected *Recv* operator. The
scheduler on the receive node will only schedule *Recv* operator scheduler on the receive node will only schedule *Recv* operator
to run when the *Send* operator has ran (the *Send* OP will mark to run when the *Send* operator has ran (the *Send* OP will mark
the *Recv* OP runnable automatically). the *Recv* OP runnable automatically).
- *Enueue* enqueues the input variable, it can block until space - *Enqueue* enqueues the input variable, it can block until space
become available in the queue. become available in the queue.
- *Dequeue* outputs configurable numbers of tensors from the - *Dequeue* outputs configurable numbers of tensors from the
queue. It will block until the queue have the required number of queue. It will block until the queue has the required number of
tensors. tensors.
### Benefits ### Benefits
- Model parallelism become easier to implement: it's an extension to - Model parallelism becomes easier to implement: it is an extension to
the trainer - parameter server approach. We can have several "Transpilers" the trainer - parameter server approach. We can have several "Transpilers"
to achieve different goals. to achieve different goals.
- User-defined optimizer is easier to add - user can now express it as - User-defined optimizer is easier to add - user can now express it as
...@@ -72,22 +72,22 @@ After converting: ...@@ -72,22 +72,22 @@ After converting:
### Challenges ### Challenges
- It's important to balance the parameter shards of on multiple - It is important to balance the parameter shards on multiple
parameter server. If a single parameter is very big (some parameter servers. If a single parameter is very big (for example: some
word-embedding, fully connected, softmax layer), we need to word-embedding, fully connected, softmax layer), we need to
automatically partition the single parameter onto different automatically partition the single parameter onto different
parameter servers when possible (only element-wise optimizer depends parameter servers when possible (only element-wise optimizer depends
on the parameter variable). on the parameter variable).
- In the "Aync SGD" figure, the "W" variable on the parameter server - In the "Async SGD" figure, the "W" variable on the parameter server
could be read and wrote concurrently. See could be read and written concurrently. See
[here](https://github.com/PaddlePaddle/Paddle/pull/6394) for more [here](https://github.com/PaddlePaddle/Paddle/pull/6394) for more
details about concurrent program in fluid. details about concurrent program in Fluid.
### Discussion ### Discussion
- Can the Enqueue OP be implemented under our current tensor design - Can the Enqueue OP be implemented under our current tensor design
(puts the input tensor into the queue tensor)? (put the input tensor into the queue tensor)?
- *Dequeue* OP will have variable numbers of output (depends on the - *Dequeue* OP will have variable numbers of output (depending on the
`min_count` attribute), does our current design support it? (similar `min_count` attribute), does our current design support it? (similar
question for the *Add* OP) question for the *Add* OP)
......
...@@ -22,7 +22,7 @@ The current `LoDTensor` is designed to store levels of variable-length sequences ...@@ -22,7 +22,7 @@ The current `LoDTensor` is designed to store levels of variable-length sequences
The integers in each level represent the begin and end (not inclusive) offset of a sequence **in the underlying tensor**, The integers in each level represent the begin and end (not inclusive) offset of a sequence **in the underlying tensor**,
let's call this format the **absolute-offset LoD** for clarity. let's call this format the **absolute-offset LoD** for clarity.
The relative-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows The absolute-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows
```python ```python
[[0, 3, 9] [[0, 3, 9]
[0, 2, 3, 3, 3, 9]] [0, 2, 3, 3, 3, 9]]
...@@ -119,7 +119,7 @@ def generate(): ...@@ -119,7 +119,7 @@ def generate():
encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word) encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word)
decoder_input = pd.fc( decoder_input = pd.fc(
act=pd.activation.Linear(), act=pd.activation.Linear(),
input=[target_word, encoder_ctx], input=[target_word, encoder_ctx_expanded],
size=3 * decoder_dim) size=3 * decoder_dim)
gru_out, cur_mem = pd.gru_step( gru_out, cur_mem = pd.gru_step(
decoder_input, mem=decoder_mem, size=decoder_dim) decoder_input, mem=decoder_mem, size=decoder_dim)
......
...@@ -60,8 +60,7 @@ each column is as follows: ...@@ -60,8 +60,7 @@ each column is as follows:
| column | meaning | | column | meaning |
| --- | --- | | --- | --- |
| ncalls | the number of calls into a function | | ncalls | the number of calls into a function |
| tottime | the total execution time of the function, not including the | tottime | the total execution time of the function, not including the execution time of other functions called by the function |
execution time of other functions called by the function |
| percall | tottime divided by ncalls | | percall | tottime divided by ncalls |
| cumtime | the total execution time of the function, including the execution time of other functions being called | | cumtime | the total execution time of the function, including the execution time of other functions being called |
| percall | cumtime divided by ncalls | | percall | cumtime divided by ncalls |
......
...@@ -18,7 +18,7 @@ else() ...@@ -18,7 +18,7 @@ else()
add_subdirectory(capi) add_subdirectory(capi)
endif() endif()
if(Boost_FOUND) if(NOT ANDROID AND NOT IOS)
add_subdirectory(memory) add_subdirectory(memory)
add_subdirectory(platform) add_subdirectory(platform)
add_subdirectory(framework) add_subdirectory(framework)
......
# ddim lib # ddim lib
proto_library(framework_proto SRCS framework.proto) proto_library(framework_proto SRCS framework.proto)
cc_library(ddim SRCS ddim.cc DEPS eigen3) cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
nv_test(dim_test SRCS dim_test.cu DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim)
...@@ -45,7 +45,7 @@ cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_ ...@@ -45,7 +45,7 @@ cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_
cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor
framework_proto selected_rows data_device_transform data_type_transform data_layout_transform) framework_proto selected_rows data_device_transform data_type_transform data_layout_transform)
cc_library(attribute SRCS attribute.cc DEPS framework_proto) cc_library(attribute SRCS attribute.cc DEPS framework_proto boost)
cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc
device_context) device_context)
cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute) cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute)
...@@ -74,7 +74,8 @@ cc_library(backward SRCS backward.cc DEPS net_op) ...@@ -74,7 +74,8 @@ cc_library(backward SRCS backward.cc DEPS net_op)
cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op) cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op)
cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward glog lod_rank_table) cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
framework_proto backward glog lod_rank_table profiler)
cc_library(prune SRCS prune.cc DEPS framework_proto) cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
......
...@@ -61,6 +61,9 @@ Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc) { ...@@ -61,6 +61,9 @@ Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc) {
} }
return val; return val;
} }
case proto::AttrType::LONG: {
return attr_desc.l();
}
default: default:
PADDLE_THROW("Unsupport attr type %d", attr_desc.type()); PADDLE_THROW("Unsupport attr type %d", attr_desc.type());
} }
......
...@@ -168,6 +168,32 @@ struct ExtractAttribute<bool> { ...@@ -168,6 +168,32 @@ struct ExtractAttribute<bool> {
const std::string& attr_name_; const std::string& attr_name_;
}; };
template <>
struct ExtractAttribute<int64_t> {
explicit ExtractAttribute(const std::string& attr_name)
: attr_name_(attr_name) {}
int64_t* operator()(Attribute& attr) const {
if (attr.type() == typeid(int)) { // NOLINT
int val = boost::get<int>(attr);
attr = static_cast<int64_t>(val);
} else if (attr.type() == typeid(float)) { // NOLINT
int val = boost::get<float>(attr);
attr = static_cast<int64_t>(val);
}
int64_t* attr_value = nullptr;
try {
attr_value = &boost::get<int64_t>(attr);
} catch (boost::bad_get& bad_get) {
PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s",
attr_name_, attr.type().name());
}
return attr_value;
}
const std::string& attr_name_;
};
// check whether a certain attribute fit its limits // check whether a certain attribute fit its limits
// an attribute can have more than one limits // an attribute can have more than one limits
template <typename T> template <typename T>
......
...@@ -22,6 +22,7 @@ limitations under the License. */ ...@@ -22,6 +22,7 @@ limitations under the License. */
#include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/platform/place.h" #include "paddle/platform/place.h"
#include "paddle/platform/profiler.h"
DECLARE_bool(do_memory_benchmark); DECLARE_bool(do_memory_benchmark);
DEFINE_bool(check_nan_inf, false, DEFINE_bool(check_nan_inf, false,
...@@ -116,8 +117,13 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, ...@@ -116,8 +117,13 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
for (auto& op_desc : block.AllOps()) { for (auto& op_desc : block.AllOps()) {
auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
VLOG(3) << op->DebugStringEx(local_scope); VLOG(4) << op->DebugStringEx(local_scope);
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(op->Type(), pool.Get(place_));
op->Run(*local_scope, place_); op->Run(*local_scope, place_);
VLOG(3) << op->DebugStringEx(local_scope);
if (FLAGS_do_memory_benchmark) { if (FLAGS_do_memory_benchmark) {
VLOG(2) << "Memory used after operator " + op->Type() + " running: " VLOG(2) << "Memory used after operator " + op->Type() + " running: "
<< memory::memory_usage(place_); << memory::memory_usage(place_);
......
...@@ -26,6 +26,7 @@ enum AttrType { ...@@ -26,6 +26,7 @@ enum AttrType {
BOOLEAN = 6; BOOLEAN = 6;
BOOLEANS = 7; BOOLEANS = 7;
BLOCK = 8; BLOCK = 8;
LONG = 9;
} }
// OpDesc describes an instance of a C++ framework::OperatorBase // OpDesc describes an instance of a C++ framework::OperatorBase
...@@ -44,6 +45,7 @@ message OpDesc { ...@@ -44,6 +45,7 @@ message OpDesc {
optional bool b = 10; optional bool b = 10;
repeated bool bools = 11; repeated bool bools = 11;
optional int32 block_idx = 12; optional int32 block_idx = 12;
optional int64 l = 13;
}; };
message Var { message Var {
......
...@@ -107,9 +107,10 @@ LoD ToAbsOffset(const LoD &in) { ...@@ -107,9 +107,10 @@ LoD ToAbsOffset(const LoD &in) {
// the lowest level stores relative offsets // the lowest level stores relative offsets
if (in.empty() || in.size() == 1) return in; if (in.empty() || in.size() == 1) return in;
LoD result = in; LoD result = in;
for (int level = result.size() - 2; level >= 0; level--) { for (auto level = static_cast<int>(in.size() - 2); level >= 0; level--) {
for (auto &ele : result[level]) { for (size_t i = 0; i < in[level].size(); ++i) {
ele = result[level + 1][ele]; size_t index = in[level][i];
result[level][i] = result[level + 1][index];
} }
} }
return result; return result;
......
...@@ -283,6 +283,7 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> { ...@@ -283,6 +283,7 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {
VectorToRepeated(v, attr_->mutable_bools()); VectorToRepeated(v, attr_->mutable_bools());
} }
void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); } void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); }
void operator()(int64_t v) const { attr_->set_l(v); }
void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); } void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
}; };
......
...@@ -35,7 +35,7 @@ using VariableNameMap = std::map<std::string, std::vector<std::string>>; ...@@ -35,7 +35,7 @@ using VariableNameMap = std::map<std::string, std::vector<std::string>>;
using Attribute = using Attribute =
boost::variant<boost::blank, int, float, std::string, std::vector<int>, boost::variant<boost::blank, int, float, std::string, std::vector<int>,
std::vector<float>, std::vector<std::string>, bool, std::vector<float>, std::vector<std::string>, bool,
std::vector<bool>, BlockDesc*>; std::vector<bool>, BlockDesc*, int64_t>;
using AttributeMap = std::unordered_map<std::string, Attribute>; using AttributeMap = std::unordered_map<std::string, Attribute>;
......
...@@ -69,7 +69,7 @@ bool PriorBoxLayer::init(const LayerMap& layerMap, ...@@ -69,7 +69,7 @@ bool PriorBoxLayer::init(const LayerMap& layerMap,
if (maxSize_.size() > 0) CHECK_EQ(minSize_.size(), maxSize_.size()); if (maxSize_.size() > 0) CHECK_EQ(minSize_.size(), maxSize_.size());
// flip aspect ratios // flip aspect ratios
for (int index = 0; index < tmp.size(); index++) { for (unsigned index = 0; index < tmp.size(); index++) {
real ar = tmp[index]; real ar = tmp[index];
if (fabs(ar - 1.) < 1e-6) continue; if (fabs(ar - 1.) < 1e-6) continue;
aspectRatio_.push_back(ar); aspectRatio_.push_back(ar);
......
...@@ -991,8 +991,10 @@ TEST(Layer, SequenceLastInstanceLayer) { ...@@ -991,8 +991,10 @@ TEST(Layer, SequenceLastInstanceLayer) {
"seqlastins", "seqlastins",
"non-seq", "non-seq",
-1); // hasSubseq seqlastins to non-seq -1); // hasSubseq seqlastins to non-seq
testDegradeLayer( testDegradeLayer(true,
true, "seqlastins", "seq", -1); // hasSubseq seqlastins to seq "seqlastins",
"seq",
-1); // hasSubseq seqlastins to seq
} }
TEST(Layer, AverageLayer) { TEST(Layer, AverageLayer) {
...@@ -1001,8 +1003,10 @@ TEST(Layer, AverageLayer) { ...@@ -1001,8 +1003,10 @@ TEST(Layer, AverageLayer) {
"average", "average",
"non-seq", "non-seq",
5); // seq average to a shorten seq, stride window = 5 5); // seq average to a shorten seq, stride window = 5
testDegradeLayer( testDegradeLayer(true,
true, "average", "non-seq", -1); // hasSubseq average to non-seq "average",
"non-seq",
-1); // hasSubseq average to non-seq
testDegradeLayer(true, "average", "seq", -1); // hasSubseq average to seq testDegradeLayer(true, "average", "seq", -1); // hasSubseq average to seq
} }
...@@ -1287,8 +1291,9 @@ TEST(Layer, PoolLayer) { ...@@ -1287,8 +1291,9 @@ TEST(Layer, PoolLayer) {
testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true); testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true); testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true); testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
testPoolLayer2( testPoolLayer2("cudnn-avg-incl-pad-pool",
"cudnn-avg-incl-pad-pool", /* trans= */ false, /* useGpu= */ true); /* trans= */ false,
/* useGpu= */ true);
testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ true); testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ true);
#endif #endif
} }
...@@ -2431,18 +2436,21 @@ TEST(Layer, test3DDeConvLayer) { ...@@ -2431,18 +2436,21 @@ TEST(Layer, test3DDeConvLayer) {
} }
TEST(Layer, ScaleShiftLayer) { TEST(Layer, ScaleShiftLayer) {
const size_t batchSize = 16; // FIXME: Disable ScaleShiftLayer because it is not stable.
const size_t size = 32; // https://github.com/PaddlePaddle/Paddle/issues/7781
TestConfig config; return;
config.layerConfig.set_type("scale_shift"); // const size_t batchSize = 16;
config.layerConfig.set_size(size); // const size_t size = 32;
config.biasSize = 1; // TestConfig config;
config.inputDefs.push_back( // config.layerConfig.set_type("scale_shift");
{INPUT_DATA, "input", /* dim= */ size, /* paraSize= */ 1}); // config.layerConfig.set_size(size);
config.layerConfig.add_inputs(); // config.biasSize = 1;
for (auto useGpu : {false, true}) { // config.inputDefs.push_back(
testLayerGrad(config, "scale_shift", batchSize, false, useGpu, false); // {INPUT_DATA, "input", /* dim= */ size, /* paraSize= */ 1});
} // config.layerConfig.add_inputs();
// for (auto useGpu : {false, true}) {
// testLayerGrad(config, "scale_shift", batchSize, false, useGpu, false);
// }
} }
TEST(Layer, ScaleSubRegionLayer) { TEST(Layer, ScaleSubRegionLayer) {
......
add_subdirectory(detail) add_subdirectory(detail)
cc_library(memory SRCS memory.cc DEPS place enforce) cc_library(memory SRCS memory.cc DEPS place enforce)
cc_library(memcpy SRCS memcpy.cc) cc_library(memcpy SRCS memcpy.cc DEPS place)
cc_library(paddle_memory cc_library(paddle_memory
DEPS DEPS
......
...@@ -24,8 +24,18 @@ namespace operators { ...@@ -24,8 +24,18 @@ namespace operators {
void BeamSearch::operator()(const framework::LoDTensor &pre_ids, void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
framework::LoDTensor *selected_ids, framework::LoDTensor *selected_ids,
framework::LoDTensor *selected_scores) { framework::LoDTensor *selected_scores) {
auto abs_lod = framework::ToAbsOffset(ids_->lod());
auto &high_level = abs_lod[lod_level_];
auto items = SelectTopBeamSizeItems(); auto items = SelectTopBeamSizeItems();
auto selected_items = ToMap(items); auto selected_items = ToMap(items, high_level.back());
VLOG(3) << "selected_items:";
for (size_t i = 0; i < selected_items.size(); ++i) {
VLOG(3) << "offset:" << i;
for (auto &item : selected_items[i]) {
VLOG(3) << ItemToString(item);
}
}
PruneEndidCandidates(pre_ids, &selected_items); PruneEndidCandidates(pre_ids, &selected_items);
// calculate the output tensor's height // calculate the output tensor's height
size_t num_instances = std::accumulate( size_t num_instances = std::accumulate(
...@@ -63,11 +73,12 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids, ...@@ -63,11 +73,12 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
low_level.push_back(low_offset); low_level.push_back(low_offset);
// fill lod // fill lod
auto abs_lod = framework::ToAbsOffset(ids_->lod());
auto &high_level = abs_lod[lod_level_];
framework::LoD lod(2); framework::LoD lod(2);
lod[0].assign(high_level.begin(), high_level.end()); lod[0].assign(high_level.begin(), high_level.end());
lod[1].assign(low_level.begin(), low_level.end()); lod[1].assign(low_level.begin(), low_level.end());
if (!framework::CheckLoD(lod)) {
PADDLE_THROW("lod %s is not right", framework::LoDToString(lod));
}
selected_ids->set_lod(lod); selected_ids->set_lod(lod);
selected_scores->set_lod(lod); selected_scores->set_lod(lod);
} }
...@@ -90,13 +101,11 @@ int BeamSearch::PruneEndidCandidates(const framework::LoDTensor &pre_ids, ...@@ -90,13 +101,11 @@ int BeamSearch::PruneEndidCandidates(const framework::LoDTensor &pre_ids,
} }
std::vector<std::vector<BeamSearch::Item>> BeamSearch::ToMap( std::vector<std::vector<BeamSearch::Item>> BeamSearch::ToMap(
const std::vector<std::vector<Item>> &items) { const std::vector<std::vector<Item>> &items, size_t element_num) {
std::vector<std::vector<Item>> result; std::vector<std::vector<Item>> result;
result.resize(element_num);
for (auto &entries : items) { for (auto &entries : items) {
for (const auto &item : entries) { for (const auto &item : entries) {
if (item.offset >= result.size()) {
result.resize(item.offset + 1);
}
result[item.offset].push_back(item); result[item.offset].push_back(item);
} }
} }
...@@ -122,6 +131,14 @@ BeamSearch::SelectTopBeamSizeItems() { ...@@ -122,6 +131,14 @@ BeamSearch::SelectTopBeamSizeItems() {
} }
result.emplace_back(items); result.emplace_back(items);
} }
VLOG(3) << "SelectTopBeamSizeItems result size " << result.size();
for (auto &items : result) {
VLOG(3) << "item set:";
for (auto &item : items) {
VLOG(3) << ItemToString(item);
}
}
return result; return result;
} }
...@@ -159,6 +176,22 @@ bool BeamSearch::NextItemSet(std::vector<BeamSearch::Item> *items) { ...@@ -159,6 +176,22 @@ bool BeamSearch::NextItemSet(std::vector<BeamSearch::Item> *items) {
return true; return true;
} }
std::ostream &operator<<(std::ostream &os, const BeamSearch::Item &item) {
os << "{";
os << "offset: " << item.offset << ", ";
os << "id: " << item.id << ", ";
os << "score: " << item.score << "";
os << "}";
return os;
}
std::string ItemToString(const BeamSearch::Item &item) {
std::ostringstream stream;
stream << item;
return stream.str();
}
class BeamSearchProtoAndCheckerMaker class BeamSearchProtoAndCheckerMaker
: public framework::OpProtoAndCheckerMaker { : public framework::OpProtoAndCheckerMaker {
public: public:
...@@ -186,8 +219,40 @@ class BeamSearchProtoAndCheckerMaker ...@@ -186,8 +219,40 @@ class BeamSearchProtoAndCheckerMaker
} }
}; };
class BeamSearchInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
for (const std::string &arg :
std::vector<std::string>({"pre_ids", "ids", "scores"})) {
PADDLE_ENFORCE(context->HasInput(arg),
"BeamSearch need input argument '%s'", arg);
}
for (const std::string &arg :
std::vector<std::string>({"selected_ids", "selected_scores"})) {
PADDLE_ENFORCE(context->HasOutput(arg),
"BeamSearch need output argument '%s'", arg);
}
}
};
class BeamSearchInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
for (auto &o : op_desc.Output("selected_ids")) {
block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR);
}
for (auto &o : op_desc.Output("selected_scores")) {
block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR);
}
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_OP_WITHOUT_GRADIENT(beam_search, paddle::operators::BeamSearchOp, REGISTER_OPERATOR(beam_search, paddle::operators::BeamSearchOp,
paddle::operators::BeamSearchProtoAndCheckerMaker); paddle::operators::BeamSearchProtoAndCheckerMaker,
paddle::operators::BeamSearchInferShape,
paddle::operators::BeamSearchInferVarType,
paddle::framework::EmptyGradOpMaker);
...@@ -136,8 +136,6 @@ class BeamSearch { ...@@ -136,8 +136,6 @@ class BeamSearch {
void operator()(const framework::LoDTensor& pre_ids, void operator()(const framework::LoDTensor& pre_ids,
framework::LoDTensor* selected_ids, framework::LoDTensor* selected_ids,
framework::LoDTensor* selected_scores); framework::LoDTensor* selected_scores);
protected:
/* /*
* The basic items help to sort. * The basic items help to sort.
*/ */
...@@ -155,6 +153,7 @@ class BeamSearch { ...@@ -155,6 +153,7 @@ class BeamSearch {
score_t score; score_t score;
}; };
protected:
/* /*
* Delete all the records that follows the end token. * Delete all the records that follows the end token.
*/ */
...@@ -166,7 +165,7 @@ class BeamSearch { ...@@ -166,7 +165,7 @@ class BeamSearch {
* NOTE low performance * NOTE low performance
*/ */
std::vector<std::vector<Item>> ToMap( std::vector<std::vector<Item>> ToMap(
const std::vector<std::vector<Item>>& inputs); const std::vector<std::vector<Item>>& inputs, size_t element_num);
/* /*
* For each source, select top beam_size records. * For each source, select top beam_size records.
...@@ -187,6 +186,10 @@ class BeamSearch { ...@@ -187,6 +186,10 @@ class BeamSearch {
int end_id_{0}; int end_id_{0};
}; };
std::ostream& operator<<(std::ostream& os, const BeamSearch::Item& item);
std::string ItemToString(const BeamSearch::Item& item);
class BeamSearchOp : public framework::OperatorBase { class BeamSearchOp : public framework::OperatorBase {
public: public:
BeamSearchOp(const std::string& type, BeamSearchOp(const std::string& type,
...@@ -203,7 +206,6 @@ class BeamSearchOp : public framework::OperatorBase { ...@@ -203,7 +206,6 @@ class BeamSearchOp : public framework::OperatorBase {
void Run(const framework::Scope& scope, void Run(const framework::Scope& scope,
const platform::Place& dev_place) const override { const platform::Place& dev_place) const override {
LOG(INFO) << "run beam search op";
auto ids_var = scope.FindVar(Input("ids")); auto ids_var = scope.FindVar(Input("ids"));
auto scores_var = scope.FindVar(Input("scores")); auto scores_var = scope.FindVar(Input("scores"));
auto pre_ids_var = scope.FindVar(Input("pre_ids")); auto pre_ids_var = scope.FindVar(Input("pre_ids"));
...@@ -217,10 +219,8 @@ class BeamSearchOp : public framework::OperatorBase { ...@@ -217,10 +219,8 @@ class BeamSearchOp : public framework::OperatorBase {
size_t level = Attr<int>("level"); size_t level = Attr<int>("level");
size_t beam_size = Attr<int>("beam_size"); size_t beam_size = Attr<int>("beam_size");
int end_id = Attr<int>("end_id"); int end_id = Attr<int>("end_id");
LOG(INFO) << "init beam search";
BeamSearch alg(ids, scores, level, beam_size, end_id); BeamSearch alg(ids, scores, level, beam_size, end_id);
LOG(INFO) << "after beam search";
auto selected_ids_var = scope.FindVar(Output("selected_ids")); auto selected_ids_var = scope.FindVar(Output("selected_ids"));
auto selected_scores_var = scope.FindVar(Output("selected_scores")); auto selected_scores_var = scope.FindVar(Output("selected_scores"));
PADDLE_ENFORCE_NOT_NULL(selected_ids_var); PADDLE_ENFORCE_NOT_NULL(selected_ids_var);
...@@ -229,9 +229,7 @@ class BeamSearchOp : public framework::OperatorBase { ...@@ -229,9 +229,7 @@ class BeamSearchOp : public framework::OperatorBase {
*selected_ids_var->GetMutable<framework::LoDTensor>(); *selected_ids_var->GetMutable<framework::LoDTensor>();
auto& selected_scores_tensor = auto& selected_scores_tensor =
*selected_scores_var->GetMutable<framework::LoDTensor>(); *selected_scores_var->GetMutable<framework::LoDTensor>();
LOG(INFO) << "run beam search";
alg(pre_ids, &selected_ids_tensor, &selected_scores_tensor); alg(pre_ids, &selected_ids_tensor, &selected_scores_tensor);
LOG(INFO) << "finish beam search";
} }
}; };
......
...@@ -51,7 +51,7 @@ class CTCAlignKernel : public framework::OpKernel<T> { ...@@ -51,7 +51,7 @@ class CTCAlignKernel : public framework::OpKernel<T> {
T prev_token = -1; T prev_token = -1;
for (size_t i = input_lod[level][seq_idx]; for (size_t i = input_lod[level][seq_idx];
i < input_lod[level][seq_idx + 1]; ++i) { i < input_lod[level][seq_idx + 1]; ++i) {
if (input_data[i] != blank && if ((unsigned)input_data[i] != blank &&
!(merge_repeated && input_data[i] == prev_token)) { !(merge_repeated && input_data[i] == prev_token)) {
output_data[output_idx] = input_data[i]; output_data[output_idx] = input_data[i];
++output_idx; ++output_idx;
......
...@@ -79,7 +79,7 @@ class Im2SequenceKernel : public framework::OpKernel<T> { ...@@ -79,7 +79,7 @@ class Im2SequenceKernel : public framework::OpKernel<T> {
framework::LoD lod(1); framework::LoD lod(1);
lod[0].reserve(batch_size + 1); lod[0].reserve(batch_size + 1);
for (int i = 0, offset = 0; i < batch_size + 1; ++i) { for (int i = 0, offset = 0; i < batch_size + 1; ++i) {
lod[0][i] = offset; lod[0].push_back(offset);
offset += output_height * output_width; offset += output_height * output_width;
} }
out->set_lod(lod); out->set_lod(lod);
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/iou_similarity_op.h"
namespace paddle {
namespace operators {
class IOUSimilarityOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of IOUSimilarityOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Y"),
"Input(Y) of IOUSimilarityOp should not be null.");
auto x_dims = ctx->GetInputDim("X");
auto y_dims = ctx->GetInputDim("Y");
PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "The rank of Input(X) must be 2.");
PADDLE_ENFORCE_EQ(x_dims[1], 4UL, "The shape of X is [N, 4]");
PADDLE_ENFORCE_EQ(y_dims.size(), 2UL, "The rank of Input(Y) must be 2.");
PADDLE_ENFORCE_EQ(y_dims[1], 4UL, "The shape of Y is [M, 4]");
ctx->ShareLoD("X", /*->*/ "Out");
ctx->SetOutputDim("Out", framework::make_ddim({x_dims[0], y_dims[0]}));
}
};
class IOUSimilarityOpMaker : public framework::OpProtoAndCheckerMaker {
public:
IOUSimilarityOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X",
"(LoDTensor, default LoDTensor<float>) "
"Box list X is a 2-D LoDTensor with shape [N, 4] holds N boxes, "
"each box is represented as [xmin, ymin, xmax, ymax], "
"the shape of X is [N, 4]. [xmin, ymin] is the left top "
"coordinate of the box if the input is image feature map, they "
"are close to the origin of the coordinate system. "
"[xmax, ymax] is the right bottom coordinate of the box. "
"This tensor can contain LoD information to represent a batch "
"of inputs. One instance of this batch can contain different "
"numbers of entities.");
AddInput("Y",
"(Tensor, default Tensor<float>) "
"Box list Y holds M boxes, each box is represented as "
"[xmin, ymin, xmax, ymax], the shape of X is [N, 4]. "
"[xmin, ymin] is the left top coordinate of the box if the "
"input is image feature map, and [xmax, ymax] is the right "
"bottom coordinate of the box.");
AddOutput("Out",
"(LoDTensor, the lod is same as input X) The output of "
"iou_similarity op, a tensor with shape [N, M] "
"representing pairwise iou scores.");
AddComment(R"DOC(
IOU Similarity Operator.
Computes intersection-over-union (IOU) between two box lists.
Box list 'X' should be a LoDTensor and 'Y' is a common Tensor,
boxes in 'Y' are shared by all instance of the batched inputs of X.
Given two boxes A and B, the calculation of IOU is as follows:
$$
IOU(A, B) =
\frac{area(A\cap B)}{area(A)+area(B)-area(A\cap B)}
$$
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(iou_similarity, ops::IOUSimilarityOp,
ops::IOUSimilarityOpMaker);
REGISTER_OP_CPU_KERNEL(
iou_similarity,
ops::IOUSimilarityKernel<paddle::platform::CPUDeviceContext, float>,
ops::IOUSimilarityKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/iou_similarity_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
iou_similarity,
ops::IOUSimilarityKernel<paddle::platform::CUDADeviceContext, float>,
ops::IOUSimilarityKernel<paddle::platform::CUDADeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/op_registry.h"
#include "paddle/platform/for_range.h"
template <typename T>
inline HOSTDEVICE T IOUSimilarity(T xmin1, T ymin1, T xmax1, T ymax1, T xmin2,
T ymin2, T xmax2, T ymax2) {
constexpr T zero = static_cast<T>(0);
T area1 = (ymax1 - ymin1) * (xmax1 - xmin1);
T area2 = (ymax2 - ymin2) * (xmax2 - xmin2);
T inter_xmax = xmax1 > xmax2 ? xmax2 : xmax1;
T inter_ymax = ymax1 > ymax2 ? ymax2 : ymax1;
T inter_xmin = xmin1 > xmin2 ? xmin1 : xmin2;
T inter_ymin = ymin1 > ymin2 ? ymin1 : ymin2;
T inter_height = inter_ymax - inter_ymin;
T inter_width = inter_xmax - inter_xmin;
inter_height = inter_height > zero ? inter_height : zero;
inter_width = inter_width > zero ? inter_width : zero;
T inter_area = inter_width * inter_height;
T union_area = area1 + area2 - inter_area;
T sim_score = inter_area / union_area;
return sim_score;
}
template <typename T>
struct IOUSimilarityFunctor {
IOUSimilarityFunctor(const T* x, const T* y, T* z, int cols)
: x_(x), y_(y), z_(z), cols_(static_cast<size_t>(cols)) {}
inline HOSTDEVICE void operator()(size_t row_id) const {
T x_min1 = x_[row_id * 4];
T y_min1 = x_[row_id * 4 + 1];
T x_max1 = x_[row_id * 4 + 2];
T y_max1 = x_[row_id * 4 + 3];
for (size_t i = 0; i < cols_; ++i) {
T x_min2 = y_[i * 4];
T y_min2 = y_[i * 4 + 1];
T x_max2 = y_[i * 4 + 2];
T y_max2 = y_[i * 4 + 3];
T sim = IOUSimilarity(x_min1, y_min1, x_max1, y_max1, x_min2, y_min2,
x_max2, y_max2);
z_[row_id * cols_ + i] = sim;
}
}
const T* x_;
const T* y_;
T* z_;
const size_t cols_;
};
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class IOUSimilarityKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const framework::LoDTensor* in_x = ctx.Input<framework::LoDTensor>("X");
const framework::Tensor* in_y = ctx.Input<framework::Tensor>("Y");
framework::LoDTensor* out = ctx.Output<framework::LoDTensor>("Out");
int x_n = in_x->dims()[0];
int y_n = in_y->dims()[0];
IOUSimilarityFunctor<T> functor(in_x->data<T>(), in_y->data<T>(),
out->mutable_data<T>(ctx.GetPlace()), y_n);
platform::ForRange<DeviceContext> for_range(
static_cast<const DeviceContext&>(ctx.device_context()), x_n);
for_range(functor);
}
}; // namespace operators
} // namespace operators
} // namespace paddle
...@@ -66,6 +66,12 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -66,6 +66,12 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
"(boolean, default false) " "(boolean, default false) "
"Sparse update") "Sparse update")
.SetDefault(false); .SetDefault(false);
AddAttr<int64_t>("padding_idx",
"(int64, default -1) "
"If the value is -1, it makes no effect to lookup. "
"Otherwise the given value indicates padding the output "
"with zeros whenever lookup encounters it in Ids.")
.SetDefault(-1);
AddComment(R"DOC( AddComment(R"DOC(
Lookup Table Operator. Lookup Table Operator.
......
...@@ -21,9 +21,11 @@ limitations under the License. */ ...@@ -21,9 +21,11 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename T, int BlockDimX, int BlockDimY, int GridDimX> template <typename T, int BlockDimX, int BlockDimY, int GridDimX,
bool PaddingFlag>
__global__ void LookupTable(T* output, const T* table, const int64_t* ids, __global__ void LookupTable(T* output, const T* table, const int64_t* ids,
const int64_t N, const int64_t K, const int64_t D) { const int64_t N, const int64_t K, const int64_t D,
const int64_t padding_idx) {
int idx = threadIdx.x; int idx = threadIdx.x;
int idy = blockIdx.x + threadIdx.y * GridDimX; int idy = blockIdx.x + threadIdx.y * GridDimX;
...@@ -34,7 +36,14 @@ __global__ void LookupTable(T* output, const T* table, const int64_t* ids, ...@@ -34,7 +36,14 @@ __global__ void LookupTable(T* output, const T* table, const int64_t* ids,
T* out = output + idy * D; T* out = output + idy * D;
const T* tab = table + id * D; const T* tab = table + id * D;
for (int i = idx; i < D; i += BlockDimX) { for (int i = idx; i < D; i += BlockDimX) {
out[i] = tab[i]; if (PaddingFlag) {
if (id == padding_idx)
out[i] = static_cast<T>(0);
else
out[i] = tab[i];
} else {
out[i] = tab[i];
}
} }
idy += BlockDimY * GridDimX; idy += BlockDimY * GridDimX;
} }
...@@ -67,6 +76,7 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> { ...@@ -67,6 +76,7 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
auto* table_t = context.Input<LoDTensor>("W"); auto* table_t = context.Input<LoDTensor>("W");
auto* ids_t = context.Input<LoDTensor>("Ids"); auto* ids_t = context.Input<LoDTensor>("Ids");
auto* output_t = context.Output<LoDTensor>("Out"); auto* output_t = context.Output<LoDTensor>("Out");
int64_t padding_idx = context.Attr<int64_t>("padding_idx");
size_t N = table_t->dims()[0]; size_t N = table_t->dims()[0];
size_t D = table_t->dims()[1]; size_t D = table_t->dims()[1];
...@@ -77,10 +87,17 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> { ...@@ -77,10 +87,17 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
dim3 threads(128, 8); dim3 threads(128, 8);
dim3 grids(8, 1); dim3 grids(8, 1);
LookupTable<
T, 128, 8, if (padding_idx == -1)
8><<<grids, threads, 0, context.cuda_device_context().stream()>>>( LookupTable<
output, table, ids, N, K, D); T, 128, 8, 8,
false><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
output, table, ids, N, K, D, padding_idx);
else
LookupTable<
T, 128, 8, 8,
true><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
output, table, ids, N, K, D, padding_idx);
} }
}; };
...@@ -91,6 +108,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> { ...@@ -91,6 +108,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
auto& dev_ctx = auto& dev_ctx =
context.template device_context<platform::CUDADeviceContext>(); context.template device_context<platform::CUDADeviceContext>();
bool is_sparse = context.Attr<bool>("is_sparse"); bool is_sparse = context.Attr<bool>("is_sparse");
// Since paddings are not trainable and fixed in forward, the gradient of
// paddings makes no sense and we don't deal with it in backward.
if (is_sparse) { if (is_sparse) {
auto* ids = context.Input<LoDTensor>("Ids"); auto* ids = context.Input<LoDTensor>("Ids");
auto* table = context.Input<LoDTensor>("W"); auto* table = context.Input<LoDTensor>("W");
......
...@@ -32,16 +32,30 @@ class LookupTableKernel : public framework::OpKernel<T> { ...@@ -32,16 +32,30 @@ class LookupTableKernel : public framework::OpKernel<T> {
auto* table_t = context.Input<LoDTensor>("W"); // float tensor auto* table_t = context.Input<LoDTensor>("W"); // float tensor
auto* ids_t = context.Input<LoDTensor>("Ids"); // int tensor auto* ids_t = context.Input<LoDTensor>("Ids"); // int tensor
auto* output_t = context.Output<LoDTensor>("Out"); // float tensor auto* output_t = context.Output<LoDTensor>("Out"); // float tensor
int64_t padding_idx = context.Attr<int64_t>("padding_idx");
int N = table_t->dims()[0]; int N = table_t->dims()[0];
int D = table_t->dims()[1]; int D = table_t->dims()[1];
auto* ids = ids_t->data<int64_t>(); auto* ids = ids_t->data<int64_t>();
auto* table = table_t->data<T>(); auto* table = table_t->data<T>();
auto* output = output_t->mutable_data<T>(context.GetPlace()); auto* output = output_t->mutable_data<T>(context.GetPlace());
for (int64_t i = 0; i < ids_t->numel(); ++i) {
PADDLE_ENFORCE_LT(ids[i], N); if (padding_idx == -1) {
PADDLE_ENFORCE_GE(ids[i], 0); for (int64_t i = 0; i < ids_t->numel(); ++i) {
memcpy(output + i * D, table + ids[i] * D, D * sizeof(T)); PADDLE_ENFORCE_LT(ids[i], N);
PADDLE_ENFORCE_GE(ids[i], 0);
memcpy(output + i * D, table + ids[i] * D, D * sizeof(T));
}
} else {
for (int64_t i = 0; i < ids_t->numel(); ++i) {
if (ids[i] == padding_idx) {
memset(output + i * D, 0, D * sizeof(T));
} else {
PADDLE_ENFORCE_LT(ids[i], N);
PADDLE_ENFORCE_GE(ids[i], 0);
memcpy(output + i * D, table + ids[i] * D, D * sizeof(T));
}
}
} }
} }
}; };
...@@ -51,6 +65,8 @@ class LookupTableGradKernel : public framework::OpKernel<T> { ...@@ -51,6 +65,8 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
bool is_sparse = context.Attr<bool>("is_sparse"); bool is_sparse = context.Attr<bool>("is_sparse");
// Since paddings are not trainable and fixed in forward, the gradient of
// paddings makes no sense and we don't deal with it in backward.
if (is_sparse) { if (is_sparse) {
auto* ids = context.Input<LoDTensor>("Ids"); auto* ids = context.Input<LoDTensor>("Ids");
auto* table = context.Input<LoDTensor>("W"); auto* table = context.Input<LoDTensor>("W");
......
...@@ -31,6 +31,7 @@ static constexpr char kParallelScopes[] = "parallel_scopes"; ...@@ -31,6 +31,7 @@ static constexpr char kParallelScopes[] = "parallel_scopes";
static constexpr char kParallelBlock[] = "sub_block"; static constexpr char kParallelBlock[] = "sub_block";
using LoDTensor = framework::LoDTensor; using LoDTensor = framework::LoDTensor;
using SelectedRows = framework::SelectedRows;
static void SplitTensorAndMoveTensorToScopes( static void SplitTensorAndMoveTensorToScopes(
const framework::Scope &scope, std::vector<framework::Scope *> *sub_scopes, const framework::Scope &scope, std::vector<framework::Scope *> *sub_scopes,
...@@ -64,6 +65,30 @@ static void SplitTensorAndMoveTensorToScopes( ...@@ -64,6 +65,30 @@ static void SplitTensorAndMoveTensorToScopes(
} }
} }
inline void CopyOrShare(const framework::Variable &src,
const platform::Place &dst_place,
framework::Variable *dst) {
if (src.IsType<LoDTensor>()) {
if (src.Get<LoDTensor>().place() == dst_place) {
dst->GetMutable<LoDTensor>()->ShareDataWith(src.Get<LoDTensor>());
} else {
Copy(src.Get<LoDTensor>(), dst_place, dst->GetMutable<LoDTensor>());
}
} else if (src.IsType<SelectedRows>()) {
auto &src_sr = src.Get<SelectedRows>();
auto *dst_sr = dst->GetMutable<SelectedRows>();
dst_sr->set_rows(src_sr.rows());
dst_sr->set_height(src_sr.height());
if (src_sr.value().place() == dst_place) {
dst_sr->mutable_value()->ShareDataWith(src_sr.value());
} else {
Copy(src_sr.value(), dst_place, dst_sr->mutable_value());
}
} else {
PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name());
}
}
void WaitOnPlace(const platform::Place place) { void WaitOnPlace(const platform::Place place) {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place); auto &dev_ctx = *pool.Get(place);
...@@ -210,30 +235,30 @@ class ParallelDoGradOp : public framework::OperatorBase { ...@@ -210,30 +235,30 @@ class ParallelDoGradOp : public framework::OperatorBase {
} }
WaitOnPlaces(places); WaitOnPlaces(places);
// merge grad AccumulateGrad(scope, place, sub_scopes, places);
}
void AccumulateGrad(const framework::Scope &scope,
const platform::Place &place,
const std::vector<framework::Scope *> &sub_scopes,
const platform::PlaceList &places) const {
for (auto &s : Outputs(framework::GradVarName(kParameters))) { for (auto &s : Outputs(framework::GradVarName(kParameters))) {
auto &result = sub_scopes[0]->FindVar(s)->Get<LoDTensor>();
std::string tmp_name; std::string tmp_name;
auto *tmp = sub_scopes[0]->Var(&tmp_name)->GetMutable<LoDTensor>(); auto *tmp = sub_scopes[0]->Var(&tmp_name);
for (size_t i = 1; i < sub_scopes.size(); ++i) { for (size_t i = 1; i < sub_scopes.size(); ++i) {
auto &tensor_to_merge = sub_scopes[i]->FindVar(s)->Get<LoDTensor>(); CopyOrShare(*sub_scopes[i]->FindVar(s), places[0], tmp);
if (!(places[i] == places[0])) { WaitOnPlace(places[0]);
framework::Copy(tensor_to_merge, places[0], tmp);
WaitOnPlace(places[0]);
} else {
tmp->ShareDataWith(tensor_to_merge);
}
auto sum_op = framework::OpRegistry::CreateOp( auto sum_op = framework::OpRegistry::CreateOp(
"sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}}, "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}},
framework::AttributeMap{}); framework::AttributeMap{});
VLOG(3) << sum_op->DebugStringEx(sub_scopes[0]);
sum_op->Run(*sub_scopes[0], places[0]); sum_op->Run(*sub_scopes[0], places[0]);
WaitOnPlace(places[0]); WaitOnPlace(places[0]);
} }
VLOG(3) << result; CopyOrShare(*sub_scopes[0]->FindVar(s), place, scope.FindVar(s));
framework::Copy(result, place, scope.FindVar(s)->GetMutable<LoDTensor>());
} }
WaitOnPlaces(places); WaitOnPlaces(places);
} }
...@@ -289,7 +314,7 @@ class ParallelDoGradOpShapeInference : public framework::InferShapeBase { ...@@ -289,7 +314,7 @@ class ParallelDoGradOpShapeInference : public framework::InferShapeBase {
PADDLE_ENFORCE(ctx->HasInputs(kParameters)); PADDLE_ENFORCE(ctx->HasInputs(kParameters));
PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters))); PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters)));
PADDLE_ENFORCE(ctx->HasInput(kInputs)); PADDLE_ENFORCE(ctx->HasInputs(kInputs));
for (auto &s : output) { for (auto &s : output) {
PADDLE_ENFORCE(ctx->HasInputs(s)); PADDLE_ENFORCE(ctx->HasInputs(s));
......
...@@ -32,6 +32,7 @@ class SequenceExpandKernel : public framework::OpKernel<T> { ...@@ -32,6 +32,7 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
const T* x_data = x->data<T>(); const T* x_data = x->data<T>();
auto x_dims = x->dims(); auto x_dims = x->dims();
auto* y = context.Input<LoDTensor>("Y"); auto* y = context.Input<LoDTensor>("Y");
PADDLE_ENFORCE(!y->lod().empty(), "y should have lod");
PADDLE_ENFORCE_EQ(static_cast<size_t>(x_dims[0]), PADDLE_ENFORCE_EQ(static_cast<size_t>(x_dims[0]),
y->lod().back().size() - 1, y->lod().back().size() - 1,
"The size of last lod level in Input(Y)" "The size of last lod level in Input(Y)"
......
...@@ -35,7 +35,7 @@ class SequenceReshapeKernel : public framework::OpKernel<T> { ...@@ -35,7 +35,7 @@ class SequenceReshapeKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ(in_lod.size(), 1UL, PADDLE_ENFORCE_EQ(in_lod.size(), 1UL,
"Only support one level sequence now."); "Only support one level sequence now.");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
in_dims[0], in_lod[0].back(), (uint64_t)in_dims[0], in_lod[0].back(),
"Inconsistent size between X.shape[0] and X.lod()[0].back()."); "Inconsistent size between X.shape[0] and X.lod()[0].back().");
auto in_lod_l0 = in_lod[0]; auto in_lod_l0 = in_lod[0];
......
...@@ -22,6 +22,7 @@ namespace paddle { ...@@ -22,6 +22,7 @@ namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
template <typename T, int MajorType = Eigen::RowMajor, template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex> typename IndexType = Eigen::DenseIndex>
...@@ -33,9 +34,9 @@ class TopkKernel : public framework::OpKernel<T> { ...@@ -33,9 +34,9 @@ class TopkKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
// Get the top k elements of each row of input tensor // Get the top k elements of each row of input tensor
// FIXME: only deal with matrix(2d tensor). // FIXME: only deal with matrix(2d tensor).
auto* input = ctx.Input<Tensor>("X"); auto* input = ctx.Input<LoDTensor>("X");
auto* output = ctx.Output<Tensor>("Out"); auto* output = ctx.Output<LoDTensor>("Out");
auto* indices = ctx.Output<Tensor>("Indices"); auto* indices = ctx.Output<LoDTensor>("Indices");
// k is determined by Attr // k is determined by Attr
const size_t k = static_cast<int>(ctx.Attr<int>("k")); const size_t k = static_cast<int>(ctx.Attr<int>("k"));
......
...@@ -10,7 +10,7 @@ cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) ...@@ -10,7 +10,7 @@ cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce) nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce)
cc_library(place SRCS place.cc DEPS enforce) cc_library(place SRCS place.cc DEPS enforce boost)
cc_test(place_test SRCS place_test.cc DEPS place glog gflags) cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
add_subdirectory(dynload) add_subdirectory(dynload)
......
...@@ -47,16 +47,16 @@ inline uint64_t GetTimeInNsec() { ...@@ -47,16 +47,16 @@ inline uint64_t GetTimeInNsec() {
} }
Event::Event(EventKind kind, std::string name, uint32_t thread_id, Event::Event(EventKind kind, std::string name, uint32_t thread_id,
DeviceContext* dev_ctx) const DeviceContext* dev_ctx)
: kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) { : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx); has_cuda_ = dev_ctx ? platform::is_gpu_place(dev_ctx->GetPlace()) : false;
if (cuda_dev_ctx) { if (has_cuda_) {
auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
PADDLE_ENFORCE(cudaGetDevice(&device_)); PADDLE_ENFORCE(cudaGetDevice(&device_));
PADDLE_ENFORCE(cudaEventCreate(&event_)); PADDLE_ENFORCE(cudaEventCreate(&event_));
auto stream = cuda_dev_ctx->stream(); auto stream = cuda_dev_ctx->stream();
PADDLE_ENFORCE(cudaEventRecord(event_, stream)); PADDLE_ENFORCE(cudaEventRecord(event_, stream));
has_cuda_ = true;
} }
#endif #endif
cpu_ns_ = GetTimeInNsec(); cpu_ns_ = GetTimeInNsec();
...@@ -114,19 +114,20 @@ inline EventList& GetEventList() { ...@@ -114,19 +114,20 @@ inline EventList& GetEventList() {
return *g_event_list; return *g_event_list;
} }
void Mark(const std::string& name, DeviceContext* dev_ctx) { void Mark(const std::string& name, const DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx); GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx);
} }
void PushEvent(const std::string& name, DeviceContext* dev_ctx) { void PushEvent(const std::string& name, const DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx); GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx);
} }
void PopEvent(const std::string& name, DeviceContext* dev_ctx) { void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx); GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx);
} }
RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx) { RecordEvent::RecordEvent(const std::string& name,
const DeviceContext* dev_ctx) {
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
dev_ctx_ = dev_ctx; dev_ctx_ = dev_ctx;
name_ = name; name_ = name;
...@@ -155,6 +156,7 @@ void EnableProfiler(ProfilerState state) { ...@@ -155,6 +156,7 @@ void EnableProfiler(ProfilerState state) {
DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d)); DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
Mark("_cuda_startup_", dev_ctx); Mark("_cuda_startup_", dev_ctx);
dev_ctx->Wait(); dev_ctx->Wait();
delete dev_ctx;
}); });
} }
} }
...@@ -163,14 +165,17 @@ void EnableProfiler(ProfilerState state) { ...@@ -163,14 +165,17 @@ void EnableProfiler(ProfilerState state) {
Mark("_start_profiler_", nullptr); Mark("_start_profiler_", nullptr);
} }
std::vector<std::vector<Event>> DisableProfiler() { void ResetProfiler() {
PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
"Can't disable profiling, since it's not starting.");
// Mark the profiling stop.
Mark("_stop_profiler_", nullptr);
g_state = ProfilerState::kDisabled;
std::vector<std::vector<Event>> result;
std::lock_guard<std::mutex> guard(g_all_event_lists_mutex); std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
++it) {
(*it)->Clear();
}
}
std::vector<std::vector<Event>> GetAllEvents() {
std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
std::vector<std::vector<Event>> result;
for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end(); for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
++it) { ++it) {
result.emplace_back((*it)->Reduce()); result.emplace_back((*it)->Reduce());
...@@ -178,6 +183,18 @@ std::vector<std::vector<Event>> DisableProfiler() { ...@@ -178,6 +183,18 @@ std::vector<std::vector<Event>> DisableProfiler() {
return result; return result;
} }
void DisableProfiler(EventSortingKey sorted_key) {
PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
"Can't disable profiling, since it's not starting.");
// Mark the profiling stop.
Mark("_stop_profiler_", nullptr);
g_state = ProfilerState::kDisabled;
std::vector<std::vector<Event>> all_events = GetAllEvents();
ParseEvents(all_events, sorted_key);
ResetProfiler();
}
void ParseEvents(std::vector<std::vector<Event>>& events, void ParseEvents(std::vector<std::vector<Event>>& events,
EventSortingKey sorted_by) { EventSortingKey sorted_by) {
if (g_profiler_place == "") return; if (g_profiler_place == "") return;
...@@ -291,12 +308,12 @@ void ParseEvents(std::vector<std::vector<Event>>& events, ...@@ -291,12 +308,12 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
} }
// Print report // Print report
PrintProfilingReport(events_table, sorted_domain, max_name_width + 4, 12); PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12);
} }
void PrintProfilingReport(std::vector<std::vector<EventItem>>& events_table, void PrintProfiler(std::vector<std::vector<EventItem>>& events_table,
std::string& sorted_domain, const size_t name_width, std::string& sorted_domain, const size_t name_width,
const size_t data_width) { const size_t data_width) {
// Output header information // Output header information
std::cout << "\n------------------------->" std::cout << "\n------------------------->"
<< " Profiling Report " << " Profiling Report "
......
...@@ -29,7 +29,7 @@ class Event { ...@@ -29,7 +29,7 @@ class Event {
// The DeviceContext is used to get the cuda stream. // The DeviceContext is used to get the cuda stream.
// If CPU profiling mode, can pass nullptr. // If CPU profiling mode, can pass nullptr.
Event(EventKind kind, std::string name, uint32_t thread_id, Event(EventKind kind, std::string name, uint32_t thread_id,
DeviceContext* dev_ctx); const DeviceContext* dev_ctx);
std::string kind() const; std::string kind() const;
std::string name() const { return name_; } std::string name() const { return name_; }
...@@ -84,6 +84,8 @@ struct EventList { ...@@ -84,6 +84,8 @@ struct EventList {
return result; return result;
} }
void Clear() { event_blocks.clear(); }
std::forward_list<std::vector<Event>> event_blocks; std::forward_list<std::vector<Event>> event_blocks;
}; };
...@@ -93,29 +95,26 @@ enum ProfilerState { ...@@ -93,29 +95,26 @@ enum ProfilerState {
kCUDA, // GPU profiling state kCUDA, // GPU profiling state
}; };
void Mark(const std::string& name, DeviceContext* dev_ctx); void Mark(const std::string& name, const DeviceContext* dev_ctx);
void PushEvent(const std::string& name, DeviceContext* dev_ctx); void PushEvent(const std::string& name, const DeviceContext* dev_ctx);
void PopEvent(const std::string& name, DeviceContext* dev_ctx); void PopEvent(const std::string& name, const DeviceContext* dev_ctx);
struct RecordEvent { struct RecordEvent {
explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx); explicit RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
~RecordEvent(); ~RecordEvent();
// The device context is used by Event to get the current cuda stream. // The device context is used by Event to get the current cuda stream.
DeviceContext* dev_ctx_; const DeviceContext* dev_ctx_;
// Event name // Event name
std::string name_; std::string name_;
}; };
// Enable the profiling function.
void EnableProfiler(ProfilerState state);
// Return the event list of all threads. Asummed the returned value calls // Return the event list of all threads. Asummed the returned value calls
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread. // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
std::vector<std::vector<Event>> DisableProfiler(); std::vector<std::vector<Event>> GetAllEvents();
// The information of each event given in the profiling report // The information of each event given in the profiling report
struct EventItem { struct EventItem {
...@@ -130,13 +129,22 @@ struct EventItem { ...@@ -130,13 +129,22 @@ struct EventItem {
// Candidate keys to sort the profiling report // Candidate keys to sort the profiling report
enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve }; enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve };
// Enable the profiling function.
void EnableProfiler(ProfilerState state);
// Clear the g_all_event_lists, which is total event lists of all threads.
void ResetProfiler();
void DisableProfiler(EventSortingKey sorted_key);
// Parse the event list and output the profiling report // Parse the event list and output the profiling report
void ParseEvents(std::vector<std::vector<Event>>&, void ParseEvents(std::vector<std::vector<Event>>&,
EventSortingKey sorted_by = EventSortingKey::kDefault); EventSortingKey sorted_by = EventSortingKey::kDefault);
// Print results // Print results
void PrintProfilingReport(std::vector<std::vector<EventItem>>& events_table, void PrintProfiler(std::vector<std::vector<EventItem>>& events_table,
std::string& sorted_domain, const size_t name_width, std::string& sorted_domain, const size_t name_width,
const size_t data_width); const size_t data_width);
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -103,18 +103,14 @@ TEST(RecordEvent, RecordEvent) { ...@@ -103,18 +103,14 @@ TEST(RecordEvent, RecordEvent) {
// Bad Usage: // Bad Usage:
PushEvent("event_without_pop", dev_ctx); PushEvent("event_without_pop", dev_ctx);
PopEvent("event_without_push", dev_ctx); PopEvent("event_without_push", dev_ctx);
std::vector<std::vector<Event>> events = paddle::platform::DisableProfiler(); std::vector<std::vector<Event>> events = paddle::platform::GetAllEvents();
// Will remove parsing-related code from test later
ParseEvents(events, EventSortingKey::kTotal);
int cuda_startup_count = 0; int cuda_startup_count = 0;
int start_profiler_count = 0; int start_profiler_count = 0;
int stop_profiler_count = 0;
for (size_t i = 0; i < events.size(); ++i) { for (size_t i = 0; i < events.size(); ++i) {
for (size_t j = 0; j < events[i].size(); ++j) { for (size_t j = 0; j < events[i].size(); ++j) {
if (events[i][j].name() == "_cuda_startup_") ++cuda_startup_count; if (events[i][j].name() == "_cuda_startup_") ++cuda_startup_count;
if (events[i][j].name() == "_start_profiler_") ++start_profiler_count; if (events[i][j].name() == "_start_profiler_") ++start_profiler_count;
if (events[i][j].name() == "_stop_profiler_") ++stop_profiler_count;
if (events[i][j].name() == "push") { if (events[i][j].name() == "push") {
EXPECT_EQ(events[i][j + 1].name(), "pop"); EXPECT_EQ(events[i][j + 1].name(), "pop");
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -127,5 +123,7 @@ TEST(RecordEvent, RecordEvent) { ...@@ -127,5 +123,7 @@ TEST(RecordEvent, RecordEvent) {
} }
EXPECT_EQ(cuda_startup_count % 5, 0); EXPECT_EQ(cuda_startup_count % 5, 0);
EXPECT_EQ(start_profiler_count, 1); EXPECT_EQ(start_profiler_count, 1);
EXPECT_EQ(stop_profiler_count, 1);
// Will remove parsing-related code from test later
DisableProfiler(EventSortingKey::kTotal);
} }
if(WITH_PYTHON) if(WITH_PYTHON)
cc_library(paddle_pybind SHARED cc_library(paddle_pybind SHARED
SRCS pybind.cc exception.cc protobuf.cc const_value.cc SRCS pybind.cc exception.cc protobuf.cc const_value.cc
DEPS pybind python backward proto_desc paddle_memory executor prune init DEPS pybind python backward proto_desc paddle_memory executor prune init profiler
${GLOB_OP_LIB}) ${GLOB_OP_LIB})
if(NOT APPLE AND NOT ANDROID) if(NOT APPLE AND NOT ANDROID)
target_link_libraries(paddle_pybind rt) target_link_libraries(paddle_pybind rt)
......
...@@ -64,6 +64,8 @@ std::string AttrType(paddle::framework::proto::AttrType at) { ...@@ -64,6 +64,8 @@ std::string AttrType(paddle::framework::proto::AttrType at) {
return "bool array"; return "bool array";
case paddle::framework::proto::BLOCK: case paddle::framework::proto::BLOCK:
return "block id"; return "block id";
case paddle::framework::proto::LONG:
return "long";
} }
return "UNKNOWN"; // not possible return "UNKNOWN"; // not possible
} }
......
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include <Python.h> #include <Python.h>
#include <fstream> #include <fstream>
#include <vector> #include <vector>
#include "paddle/platform/variant.h"
#include "pybind11/numpy.h" #include "pybind11/numpy.h"
#include "pybind11/pybind11.h" #include "pybind11/pybind11.h"
#include "pybind11/stl.h" #include "pybind11/stl.h"
......
...@@ -30,6 +30,7 @@ limitations under the License. */ ...@@ -30,6 +30,7 @@ limitations under the License. */
#include "paddle/operators/net_op.h" #include "paddle/operators/net_op.h"
#include "paddle/platform/enforce.h" #include "paddle/platform/enforce.h"
#include "paddle/platform/place.h" #include "paddle/platform/place.h"
#include "paddle/platform/profiler.h"
#include "paddle/pybind/const_value.h" #include "paddle/pybind/const_value.h"
#include "paddle/pybind/exception.h" #include "paddle/pybind/exception.h"
#include "paddle/pybind/pybind.h" #include "paddle/pybind/pybind.h"
...@@ -476,6 +477,24 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -476,6 +477,24 @@ All parameter, weight, gradient are variables in Paddle.
m.def("nvprof_stop", platform::CudaProfilerStop); m.def("nvprof_stop", platform::CudaProfilerStop);
#endif #endif
py::enum_<platform::ProfilerState>(m, "ProfilerState", py::arithmetic())
.value("kDisabled", platform::ProfilerState::kDisabled)
.value("kCPU", platform::ProfilerState::kCPU)
.value("kCUDA", platform::ProfilerState::kCUDA)
.export_values();
py::enum_<platform::EventSortingKey>(m, "EventSortingKey", py::arithmetic())
.value("kDefault", platform::EventSortingKey::kDefault)
.value("kCalls", platform::EventSortingKey::kCalls)
.value("kTotal", platform::EventSortingKey::kTotal)
.value("kMin", platform::EventSortingKey::kMin)
.value("kMax", platform::EventSortingKey::kMax)
.value("kAve", platform::EventSortingKey::kAve)
.export_values();
m.def("enable_profiler", platform::EnableProfiler);
m.def("disable_profiler", platform::DisableProfiler);
m.def("reset_profiler", platform::ResetProfiler);
return m.ptr(); return m.ptr();
} }
} // namespace pybind } // namespace pybind
......
...@@ -100,7 +100,8 @@ class LayerHelper(object): ...@@ -100,7 +100,8 @@ class LayerHelper(object):
if dtype is None: if dtype is None:
dtype = each.dtype dtype = each.dtype
elif dtype != each.dtype: elif dtype != each.dtype:
raise ValueError("Data Type mismatch") raise ValueError("Data Type mismatch: %d to %d" %
(dtype, each.dtype))
return dtype return dtype
def create_parameter(self, def create_parameter(self,
...@@ -110,6 +111,7 @@ class LayerHelper(object): ...@@ -110,6 +111,7 @@ class LayerHelper(object):
is_bias=False, is_bias=False,
default_initializer=None): default_initializer=None):
# Deepcopy the attr so that parameters can be shared in program # Deepcopy the attr so that parameters can be shared in program
attr = copy.deepcopy(attr)
assert isinstance(attr, ParamAttr) assert isinstance(attr, ParamAttr)
suffix = 'b' if is_bias else 'w' suffix = 'b' if is_bias else 'w'
......
...@@ -289,6 +289,7 @@ class ParallelDo(object): ...@@ -289,6 +289,7 @@ class ParallelDo(object):
for in_var_name in op.input(iname): for in_var_name in op.input(iname):
if in_var_name not in local_inputs: if in_var_name not in local_inputs:
params.append(in_var_name) params.append(in_var_name)
params = list(set(params))
return [parent_block.var(name) for name in params] return [parent_block.var(name) for name in params]
...@@ -769,7 +770,7 @@ def topk(input, k): ...@@ -769,7 +770,7 @@ def topk(input, k):
array = fluid.layers.topk(x, k) array = fluid.layers.topk(x, k)
""" """
helper = LayerHelper('topk', **locals()) helper = LayerHelper('topk', **locals())
topk_out = helper.create_tmp_variable(dtype=input.data_type) topk_out = helper.create_tmp_variable(dtype=input.dtype)
topk_indices = helper.create_tmp_variable(dtype='int64') topk_indices = helper.create_tmp_variable(dtype='int64')
helper.append_op( helper.append_op(
type='top_k', type='top_k',
......
...@@ -59,7 +59,10 @@ __all__ = [ ...@@ -59,7 +59,10 @@ __all__ = [
'warpctc', 'warpctc',
'sequence_reshape', 'sequence_reshape',
'transpose', 'transpose',
'im2sequence',
'nce', 'nce',
'beam_search',
'row_conv',
] ]
...@@ -162,10 +165,8 @@ def fc(input, ...@@ -162,10 +165,8 @@ def fc(input,
tmp = helper.create_tmp_variable(dtype) tmp = helper.create_tmp_variable(dtype)
helper.append_op( helper.append_op(
type="mul", type="mul",
inputs={ inputs={"X": input_var,
"X": input_var, "Y": w},
"Y": w,
},
outputs={"Out": tmp}, outputs={"Out": tmp},
attrs={"x_num_col_dims": num_flatten_dims, attrs={"x_num_col_dims": num_flatten_dims,
"y_num_col_dims": 1}) "y_num_col_dims": 1})
...@@ -184,22 +185,35 @@ def fc(input, ...@@ -184,22 +185,35 @@ def fc(input,
return helper.append_activation(pre_activation) return helper.append_activation(pre_activation)
def embedding(input, size, is_sparse=False, param_attr=None, dtype='float32'): def embedding(input,
size,
is_sparse=False,
padding_idx=None,
param_attr=None,
dtype='float32'):
""" """
**Embedding Layer** **Embedding Layer**
This layer is used to lookup a vector of IDs, provided by *input*, in a lookup table. This layer is used to lookup embeddings of IDs, provided by :attr:`input`, in
The result of this lookup is the embedding of each ID in the *input*. a lookup table. The result of this lookup is the embedding of each ID in the
:attr:`input`.
All the input variables are passed in as local variables to the LayerHelper All the input variables are passed in as local variables to the LayerHelper
constructor. constructor.
Args: Args:
input(Variable): Input to the function input(Variable): The tensor variable containing the IDs.
size(tuple|list|None): Shape of the look up table parameter size(tuple|list): The shape of the look up table parameter. It should
is_sparse(bool): Boolean flag that specifying whether the input is sparse have two elements which indicate the size of the dictionary of
param_attr(ParamAttr): Parameters for this layer embeddings and the size of each embedding vector respectively.
dtype(np.dtype|core.DataType|str): The type of data : float32, float_16, int etc is_sparse(bool): The flag indicating whether to use sparse update.
padding_idx(int|long|None): If :attr:`None`, it makes no effect to lookup.
Otherwise the given :attr:`padding_idx` indicates padding the output
with zeros whenever lookup encounters it in :attr:`input`. If
:math:`padding_idx < 0`, the padding_idx to use in lookup is
:math:`size[0] + dim`.
param_attr(ParamAttr): Parameters for this layer
dtype(np.dtype|core.DataType|str): The type of data : float32, float_16, int etc
Returns: Returns:
Variable: The tensor variable storing the embeddings of the \ Variable: The tensor variable storing the embeddings of the \
...@@ -217,12 +231,15 @@ def embedding(input, size, is_sparse=False, param_attr=None, dtype='float32'): ...@@ -217,12 +231,15 @@ def embedding(input, size, is_sparse=False, param_attr=None, dtype='float32'):
w = helper.create_parameter( w = helper.create_parameter(
attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False) attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False)
tmp = helper.create_tmp_variable(dtype) tmp = helper.create_tmp_variable(dtype)
padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else (
size[0] + padding_idx)
helper.append_op( helper.append_op(
type='lookup_table', type='lookup_table',
inputs={'Ids': input, inputs={'Ids': input,
'W': w}, 'W': w},
outputs={'Out': tmp}, outputs={'Out': tmp},
attrs={'is_sparse': is_sparse}) attrs={'is_sparse': is_sparse,
'padding_idx': padding_idx})
return tmp return tmp
...@@ -380,9 +397,9 @@ def dynamic_gru(input, ...@@ -380,9 +397,9 @@ def dynamic_gru(input,
""" """
**Dynamic GRU Layer** **Dynamic GRU Layer**
Refer to `Empirical Evaluation of Gated Recurrent Neural Networks on Refer to `Empirical Evaluation of Gated Recurrent Neural Networks on
Sequence Modeling <https://arxiv.org/abs/1412.3555>`_ Sequence Modeling <https://arxiv.org/abs/1412.3555>`_
The formula is as follows: The formula is as follows:
.. math:: .. math::
...@@ -392,47 +409,47 @@ def dynamic_gru(input, ...@@ -392,47 +409,47 @@ def dynamic_gru(input,
r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r) r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
\\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c) \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
h_t & = (1-u_t) \odot h_{t-1} + u_t \odot \\tilde{h_t} h_t & = (1-u_t) \odot h_{t-1} + u_t \odot \\tilde{h_t}
The :math:`\odot` is the element-wise product of the vectors. :math:`act_g` The :math:`\odot` is the element-wise product of the vectors. :math:`act_g`
is the update gate and reset gate activation function and :math:`sigmoid` is the update gate and reset gate activation function and :math:`sigmoid`
is usually used for it. :math:`act_c` is the activation function for is usually used for it. :math:`act_c` is the activation function for
candidate hidden state and :math:`tanh` is usually used for it. candidate hidden state and :math:`tanh` is usually used for it.
Note that these :math:`W_{ux}x_{t}, W_{rx}x_{t}, W_{cx}x_{t}` operations on Note that these :math:`W_{ux}x_{t}, W_{rx}x_{t}, W_{cx}x_{t}` operations on
the input :math:`x_{t}` are NOT included in this operator. Users can choose the input :math:`x_{t}` are NOT included in this operator. Users can choose
to use fully-connect layer before GRU layer. to use fully-connect layer before GRU layer.
Args: Args:
input(Variable): The input of dynamic_gru layer, which supports input(Variable): The input of dynamic_gru layer, which supports
variable-time length input sequence. The underlying tensor in this variable-time length input sequence. The underlying tensor in this
Variable is a matrix with shape :math:`(T \\times 3D)`, where Variable is a matrix with shape :math:`(T \\times 3D)`, where
:math:`T` is the total time steps in this mini-batch, :math:`D` :math:`T` is the total time steps in this mini-batch, :math:`D`
is the hidden size. is the hidden size.
size(int): The dimension of the gru cell. size(int): The dimension of the gru cell.
param_attr(ParamAttr|None): The parameter attribute for the learnable param_attr(ParamAttr|None): The parameter attribute for the learnable
hidden-hidden weight matrix. Note: hidden-hidden weight matrix. Note:
- The shape of the weight matrix is :math:`(T \\times 3D)`, where - The shape of the weight matrix is :math:`(T \\times 3D)`, where
:math:`D` is the hidden size. :math:`D` is the hidden size.
- All elements in the weight matrix can be divided into two parts. - All elements in the weight matrix can be divided into two parts.
The first part are weights of the update gate and reset gate with The first part are weights of the update gate and reset gate with
shape :math:`(D \\times 2D)`, and the second part are weights for shape :math:`(D \\times 2D)`, and the second part are weights for
candidate hidden state with shape :math:`(D \\times D)`. candidate hidden state with shape :math:`(D \\times D)`.
bias_attr(ParamAttr): The parameter attribute for learnable the bias_attr(ParamAttr): The parameter attribute for learnable the
hidden-hidden bias. hidden-hidden bias.
is_reverse(bool): Whether to compute reversed GRU, default is_reverse(bool): Whether to compute reversed GRU, default
:attr:`False`. :attr:`False`.
gate_activation(str): The activation for update gate and reset gate. gate_activation(str): The activation for update gate and reset gate.
Choices = ["sigmoid", "tanh", "relu", "identity"], default "sigmoid". Choices = ["sigmoid", "tanh", "relu", "identity"], default "sigmoid".
activation(str): The activation for candidate hidden state. activation(str): The activation for candidate hidden state.
Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh". Choices = ["sigmoid", "tanh", "relu", "identity"], default "tanh".
Returns: Returns:
Variable: The hidden state of GRU. The shape is (T \\times D), and lod \ Variable: The hidden state of GRU. The shape is (T \\times D), and lod \
is the same with the input. is the same with the input.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -1534,6 +1551,38 @@ def sequence_expand(x, y, name=None): ...@@ -1534,6 +1551,38 @@ def sequence_expand(x, y, name=None):
return tmp return tmp
def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0):
'''
This function implements the beam search algorithm.
'''
helper = LayerHelper('beam_search', **locals())
score_type = scores.dtype
id_type = ids.dtype
selected_scores = helper.create_tmp_variable(dtype=score_type)
selected_ids = helper.create_tmp_variable(dtype=id_type)
helper.append_op(
type='beam_search',
inputs={
'pre_ids': pre_ids,
'ids': ids,
'scores': scores,
},
outputs={
'selected_ids': selected_ids,
'selected_scores': selected_scores,
},
attrs={
# TODO(ChunweiYan) to assure other value support
'level': level,
'beam_size': beam_size,
'end_id': end_id,
})
return selected_ids, selected_scores
def lstm_unit(x_t, def lstm_unit(x_t,
hidden_t_prev, hidden_t_prev,
cell_t_prev, cell_t_prev,
...@@ -2391,3 +2440,181 @@ def transpose(x, perm, name=None): ...@@ -2391,3 +2440,181 @@ def transpose(x, perm, name=None):
outputs={'Out': [out]}, outputs={'Out': [out]},
attrs={'axis': perm}) attrs={'axis': perm})
return out return out
def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
"""
Extracts image patches from the input tensor to form a tensor of shape
{input.batch_size * output_height * output_width, filter_size_H *
filter_size_W * input.channels} which is similar with im2col.
This op use filter / kernel to scan images and convert these images to
sequences. After expanding, the number of time step are
output_height * output_width for an image, in which output_height and
output_width are calculated by below equation:
.. math::
output\_size = 1 + \
(2 * padding + img\_size - block\_size + stride - 1) / stride
And the dimension of each time step is block_y * block_x * input.channels.
Args:
input (Variable): The input should be a tensor in NCHW format.
filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
it must contain two integers, (filter_size_H, filter_size_W).
Otherwise, the filter will be a square.
stride(int|tuple): The stride size. If stride is a tuple, it must
contain two integers, (stride_H, stride_W). Otherwise, the
stride_H = stride_W = stride. Default: stride = 1.
padding(int|tuple): The padding size. If padding is a tuple, it can
contain two integers like (padding_H, padding_W) which means
padding_up = padding_down = padding_H and
padding_left = padding_right = padding_W. Or it can use
(padding_up, padding_left, padding_down, padding_right) to indicate
paddings of four direction. Otherwise, a scalar padding means
padding_up = padding_down = padding_left = padding_right = padding
Default: padding = 0.
name (int): The name of this layer. It is optional.
Returns:
output: The output is a LoDTensor with shape
{input.batch_size * output_height * output_width,
filter_size_H * filter_size_W * input.channels}.
If we regard output as a matrix, each row of this matrix is
a step of a sequence.
Examples:
As an example:
.. code-block:: text
Given:
x = [[[[ 6. 2. 1.]
[ 8. 3. 5.]
[ 0. 2. 6.]]
[[ 2. 4. 4.]
[ 6. 3. 0.]
[ 6. 4. 7.]]]
[[[ 6. 7. 1.]
[ 5. 7. 9.]
[ 2. 4. 8.]]
[[ 1. 2. 1.]
[ 1. 3. 5.]
[ 9. 0. 8.]]]]
x.dims = {2, 2, 3, 3}
And:
filter = [2, 2]
stride = [1, 1]
padding = [0, 0]
Then:
output.data = [[ 6. 2. 8. 3. 2. 4. 6. 3.]
[ 2. 1. 3. 5. 4. 4. 3. 0.]
[ 8. 3. 0. 2. 6. 3. 6. 4.]
[ 3. 5. 2. 6. 3. 0. 4. 7.]
[ 6. 7. 5. 7. 1. 2. 1. 3.]
[ 7. 1. 7. 9. 2. 1. 3. 5.]
[ 5. 7. 2. 4. 1. 3. 9. 0.]
[ 7. 9. 4. 8. 3. 5. 0. 8.]]
output.dims = {8, 9}
output.lod = [[0, 4, 8]]
The simple usage is:
.. code-block:: python
output = fluid.layers.im2sequence(input=layer, stride=[1, 1], filter_size=[2, 2])
"""
if isinstance(filter_size, int):
filter_size = [filter_size, filter_size]
if isinstance(stride, int):
stride = [stride, stride]
if isinstance(padding, int):
padding = [padding, padding]
if len(padding) == 2:
padding.append(padding[0])
padding.append(padding[1])
helper = LayerHelper('im2sequence', **locals())
out = helper.create_tmp_variable(dtype=helper.input_dtype())
helper.append_op(
type='im2sequence',
inputs={'X': input},
outputs={'Out': out},
attrs={
'kernels': filter_size,
'strides': stride,
'paddings': padding,
})
return out
def row_conv(input, future_context_size, param_attr=None, act=None):
"""Row Conv Operator. This layer will apply lookahead convolution to
**input**. The input variable should be a 2D LoDTensor with shape [T, D].
Parameters with shape [future_context_size + 1, D] will be created. The math
equation of row convolution is as follows:
.. math::
Out_{i} = \sum_{j = i} ^ {i + \\tau} X_{j} \odot W_{i - j}
In the above equation:
* :math:`Out_{i}`: The i-th row of output variable with shape [1, D].
* :math:`\\tau`: Future context size.
* :math:`X_{j}`: The j-th row of input variable with shape [1, D].
* :math:`W_{i-j}`: The (i-j)-th row of parameters with shape [1, D].
More details about row_conv please refer to the paper \
(http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf) and
the design document \
(https://github.com/PaddlePaddle/Paddle/issues/2228#issuecomment-303903645).
Args:
input (Variable): Input variable, a 2D LoDTensor with shape [T, D].
future_context_size (int): Future context size. Please note, the shape
of convolution kernel is [future_context_size + 1, D].
param_attr (ParamAttr): Attributes of parameters, including
name, initializer etc.
act (str): Non-linear activation to be applied to output variable.
Returns:
Variable: The output tensor with same shape as input tensor.
Examples:
.. code-block:: python
x = fluid.layers.data(name='x', shape=[16],
dtype='float32', lod_level=1)
out = fluid.layers.row_conv(input=x, future_context_size=2)
"""
helper = LayerHelper('row_conv', **locals())
dtype = helper.input_dtype()
filter_shape = [future_context_size + 1, input.shape[1]]
filter_param = helper.create_parameter(
attr=helper.param_attr, shape=filter_shape, dtype=dtype)
out = helper.create_tmp_variable(dtype)
helper.append_op(
type='row_conv',
inputs={'X': [input],
'Filter': [filter_param]},
outputs={'Out': [out]})
return helper.append_activation(out)
...@@ -56,7 +56,7 @@ def img_conv_group(input, ...@@ -56,7 +56,7 @@ def img_conv_group(input,
conv_act=None, conv_act=None,
param_attr=None, param_attr=None,
conv_with_batchnorm=False, conv_with_batchnorm=False,
conv_batchnorm_drop_rate=None, conv_batchnorm_drop_rate=0.0,
pool_stride=1, pool_stride=1,
pool_type=None, pool_type=None,
use_cudnn=True): use_cudnn=True):
...@@ -127,21 +127,21 @@ def sequence_conv_pool(input, ...@@ -127,21 +127,21 @@ def sequence_conv_pool(input,
def glu(input, dim=-1): def glu(input, dim=-1):
""" """
The gated linear unit composed by split, sigmoid activation and elementwise The gated linear unit composed by split, sigmoid activation and elementwise
multiplication. Specifically, Split the input into two equal sized parts multiplication. Specifically, Split the input into two equal sized parts
:math:`a` and :math:`b` along the given dimension and then compute as :math:`a` and :math:`b` along the given dimension and then compute as
following: following:
.. math:: .. math::
{GLU}(a, b)= a \otimes \sigma(b) {GLU}(a, b)= a \otimes \sigma(b)
Refer to `Language Modeling with Gated Convolutional Networks Refer to `Language Modeling with Gated Convolutional Networks
<https://arxiv.org/pdf/1612.08083.pdf>`_. <https://arxiv.org/pdf/1612.08083.pdf>`_.
Args: Args:
input (Variable): The input variable which is a Tensor or LoDTensor. input (Variable): The input variable which is a Tensor or LoDTensor.
dim (int): The dimension along which to split. If :math:`dim < 0`, the dim (int): The dimension along which to split. If :math:`dim < 0`, the
dimension to split along is :math:`rank(input) + dim`. dimension to split along is :math:`rank(input) + dim`.
Returns: Returns:
...@@ -164,24 +164,24 @@ def dot_product_attention(querys, keys, values): ...@@ -164,24 +164,24 @@ def dot_product_attention(querys, keys, values):
""" """
The dot-product attention. The dot-product attention.
Attention mechanism can be seen as mapping a query and a set of key-value Attention mechanism can be seen as mapping a query and a set of key-value
pairs to an output. The output is computed as a weighted sum of the values, pairs to an output. The output is computed as a weighted sum of the values,
where the weight assigned to each value is computed by a compatibility where the weight assigned to each value is computed by a compatibility
function (dot-product here) of the query with the corresponding key. function (dot-product here) of the query with the corresponding key.
The dot-product attention can be implemented through (batch) matrix The dot-product attention can be implemented through (batch) matrix
multipication as follows: multipication as follows:
.. math:: .. math::
Attention(Q, K, V)= softmax(QK^\mathrm{T})V Attention(Q, K, V)= softmax(QK^\mathrm{T})V
Refer to `Attention Is All You Need Refer to `Attention Is All You Need
<https://arxiv.org/pdf/1706.03762.pdf>`_. <https://arxiv.org/pdf/1706.03762.pdf>`_.
Note that batch data containing sequences with different lengths is not Note that batch data containing sequences with different lengths is not
supported by this because of the (batch) matrix multipication. supported by this because of the (batch) matrix multipication.
Args: Args:
query (Variable): The input variable which is a Tensor or LoDTensor. query (Variable): The input variable which is a Tensor or LoDTensor.
key (Variable): The input variable which is a Tensor or LoDTensor. key (Variable): The input variable which is a Tensor or LoDTensor.
......
...@@ -63,3 +63,58 @@ def cuda_profiler(output_file, output_mode=None, config=None): ...@@ -63,3 +63,58 @@ def cuda_profiler(output_file, output_mode=None, config=None):
# Disables profiler collection. # Disables profiler collection.
core.nvprof_stop() core.nvprof_stop()
os.remove(config_file) os.remove(config_file)
def reset_profiler():
"""The profiler clear interface.
reset_profiler will clear the previous time record.
"""
core.reset_profiler()
@contextmanager
def profiler(state, sorted_key=None):
"""The profiler interface.
Different from cuda_profiler, this profiler can be used to profile both CPU
and GPU program. By defalut, it records the CPU and GPU operator kernels,
if you want to profile other program, you can refer the profiling tutorial
to add more records.
Args:
state (string) : The profiling state, which should be 'CPU' or 'GPU',
telling the profiler to use CPU timer or GPU timer for profiling.
Although users may have already specified the execution place
(CPUPlace/CUDAPlace) in the begining, for flexibility the profiler
would not inherit this place.
sorted_key (string) : If None, the profiling results will be printed
in the order of first end time of events. Otherwise, the profiling
results will be sorted by the this flag. This flag should be one
of 'calls', 'total', 'max', 'min' or 'ave'.
The `calls` means sorting by the number of calls.
The `total` means sorting by the total execution time.
The `max` means sorting by the maximum execution time.
The `min` means sorting by the minimum execution time.
The `ave` means sorting by the average execution time.
"""
if state not in ['CPU', 'GPU']:
raise ValueError("The state must be 'CPU' or 'GPU'.")
prof_state = core.ProfilerState.kCUDA if state == "GPU" else core.ProfilerState.kCPU
core.enable_profiler(prof_state)
yield
if sorted_key not in ['calls', 'total', 'max', 'min', 'ave']:
raise ValueError("The state must be in 'calls', 'total', "
"'max', 'min', 'ave'")
sorted_key = 'default' if sorted_key is None else sorted_key
key_map = {
'default': core.EventSortingKey.kDefault,
'calls': core.EventSortingKey.kCalls,
'total': core.EventSortingKey.kTotal,
'max': core.EventSortingKey.kMax,
'min': core.EventSortingKey.kMin,
'ave': core.EventSortingKey.kAve,
}
# TODO(qingqing) : redirect C++ ostream to Python stream.
# with core.ostream_redirect(stdout=True, stderr=True):
core.disable_profiler(key_map[sorted_key])
...@@ -49,7 +49,7 @@ for pass_id in range(PASS_NUM): ...@@ -49,7 +49,7 @@ for pass_id in range(PASS_NUM):
avg_loss_value, = exe.run(fluid.default_main_program(), avg_loss_value, = exe.run(fluid.default_main_program(),
feed=feeder.feed(data), feed=feeder.feed(data),
fetch_list=[avg_cost]) fetch_list=[avg_cost])
print(avg_loss_value)
if avg_loss_value[0] < 10.0: if avg_loss_value[0] < 10.0:
exit(0) # if avg cost less than 10.0, we think our code is good. exit(0) # if avg cost less than 10.0, we think our code is good.
exit(1) exit(1)
...@@ -17,7 +17,7 @@ import paddle.v2 as paddle ...@@ -17,7 +17,7 @@ import paddle.v2 as paddle
import paddle.v2.fluid as fluid import paddle.v2.fluid as fluid
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
import paddle.v2.fluid.framework as framework import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as pd
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
dict_size = 30000 dict_size = 30000
...@@ -26,53 +26,136 @@ src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) ...@@ -26,53 +26,136 @@ src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
hidden_dim = 32 hidden_dim = 32
word_dim = 16 word_dim = 16
IS_SPARSE = True IS_SPARSE = True
batch_size = 10 batch_size = 2
max_length = 50 max_length = 8
topk_size = 50 topk_size = 50
trg_dic_size = 10000 trg_dic_size = 10000
beam_size = 2
decoder_size = hidden_dim decoder_size = hidden_dim
place = core.CPUPlace()
def encoder_decoder():
def encoder():
# encoder # encoder
src_word_id = layers.data( src_word_id = pd.data(
name="src_word_id", shape=[1], dtype='int64', lod_level=1) name="src_word_id", shape=[1], dtype='int64', lod_level=1)
src_embedding = layers.embedding( src_embedding = pd.embedding(
input=src_word_id, input=src_word_id,
size=[dict_size, word_dim], size=[dict_size, word_dim],
dtype='float32', dtype='float32',
is_sparse=IS_SPARSE, is_sparse=IS_SPARSE,
param_attr=fluid.ParamAttr(name='vemb')) param_attr=fluid.ParamAttr(name='vemb'))
fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4) lstm_hidden0, lstm_0 = pd.dynamic_lstm(input=fc1, size=hidden_dim * 4)
encoder_out = layers.sequence_last_step(input=lstm_hidden0) encoder_out = pd.sequence_last_step(input=lstm_hidden0)
return encoder_out
def decoder_train(context):
# decoder # decoder
trg_language_word = layers.data( trg_language_word = pd.data(
name="target_language_word", shape=[1], dtype='int64', lod_level=1) name="target_language_word", shape=[1], dtype='int64', lod_level=1)
trg_embedding = layers.embedding( trg_embedding = pd.embedding(
input=trg_language_word, input=trg_language_word,
size=[dict_size, word_dim], size=[dict_size, word_dim],
dtype='float32', dtype='float32',
is_sparse=IS_SPARSE, is_sparse=IS_SPARSE,
param_attr=fluid.ParamAttr(name='vemb')) param_attr=fluid.ParamAttr(name='vemb'))
rnn = fluid.layers.DynamicRNN() rnn = pd.DynamicRNN()
with rnn.block(): with rnn.block():
current_word = rnn.step_input(trg_embedding) current_word = rnn.step_input(trg_embedding)
mem = rnn.memory(init=encoder_out) pre_state = rnn.memory(init=context)
fc1 = fluid.layers.fc(input=[current_word, mem], current_state = pd.fc(input=[current_word, pre_state],
size=decoder_size, size=decoder_size,
act='tanh') act='tanh')
out = fluid.layers.fc(input=fc1, size=target_dict_dim, act='softmax')
rnn.update_memory(mem, fc1) current_score = pd.fc(input=current_state,
rnn.output(out) size=target_dict_dim,
act='softmax')
rnn.update_memory(pre_state, current_state)
rnn.output(current_score)
return rnn() return rnn()
def decoder_decode(context):
init_state = context
array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
counter = pd.zeros(shape=[1], dtype='int64')
# fill the first element with init_state
state_array = pd.create_array('float32')
pd.array_write(init_state, array=state_array, i=counter)
# ids, scores as memory
ids_array = pd.create_array('int64')
scores_array = pd.create_array('float32')
init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
init_scores = pd.data(
name="init_scores", shape=[1], dtype="float32", lod_level=2)
pd.array_write(init_ids, array=ids_array, i=counter)
pd.array_write(init_scores, array=scores_array, i=counter)
cond = pd.less_than(x=counter, y=array_len)
while_op = pd.While(cond=cond)
with while_op.block():
pre_ids = pd.array_read(array=ids_array, i=counter)
pre_state = pd.array_read(array=state_array, i=counter)
pre_score = pd.array_read(array=scores_array, i=counter)
# expand the lod of pre_state to be the same with pre_score
pre_state_expanded = pd.sequence_expand(pre_state, pre_score)
pre_ids_emb = pd.embedding(
input=pre_ids,
size=[dict_size, word_dim],
dtype='float32',
is_sparse=IS_SPARSE)
# use rnn unit to update rnn
current_state = pd.fc(input=[pre_ids_emb, pre_state_expanded],
size=decoder_size,
act='tanh')
# use score to do beam search
current_score = pd.fc(input=current_state,
size=target_dict_dim,
act='softmax')
topk_scores, topk_indices = pd.topk(current_score, k=50)
selected_ids, selected_scores = pd.beam_search(
pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0)
pd.increment(x=counter, value=1, in_place=True)
# update the memories
pd.array_write(current_state, array=state_array, i=counter)
pd.array_write(selected_ids, array=ids_array, i=counter)
pd.array_write(selected_scores, array=scores_array, i=counter)
pd.less_than(x=counter, y=array_len, cond=cond)
translation_ids, translation_scores = pd.beam_search_decode(
ids=ids_array, scores=scores_array)
# return init_ids, init_scores
return translation_ids, translation_scores
def set_init_lod(data, lod, place):
res = core.LoDTensor()
res.set(data, place)
res.set_lod(lod)
return res
def to_lodtensor(data, place): def to_lodtensor(data, place):
seq_lens = [len(seq) for seq in data] seq_lens = [len(seq) for seq in data]
cur_len = 0 cur_len = 0
...@@ -88,12 +171,13 @@ def to_lodtensor(data, place): ...@@ -88,12 +171,13 @@ def to_lodtensor(data, place):
return res return res
def main(): def train_main():
rnn_out = encoder_decoder() context = encoder()
label = layers.data( rnn_out = decoder_train(context)
label = pd.data(
name="target_language_next_word", shape=[1], dtype='int64', lod_level=1) name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
cost = layers.cross_entropy(input=rnn_out, label=label) cost = pd.cross_entropy(input=rnn_out, label=label)
avg_cost = fluid.layers.mean(x=cost) avg_cost = pd.mean(x=cost)
optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4) optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
optimizer.minimize(avg_cost) optimizer.minimize(avg_cost)
...@@ -103,13 +187,12 @@ def main(): ...@@ -103,13 +187,12 @@ def main():
paddle.dataset.wmt14.train(dict_size), buf_size=1000), paddle.dataset.wmt14.train(dict_size), buf_size=1000),
batch_size=batch_size) batch_size=batch_size)
place = core.CPUPlace()
exe = Executor(place) exe = Executor(place)
exe.run(framework.default_startup_program()) exe.run(framework.default_startup_program())
batch_id = 0 batch_id = 0
for pass_id in xrange(2): for pass_id in xrange(1):
for data in train_data(): for data in train_data():
word_data = to_lodtensor(map(lambda x: x[0], data), place) word_data = to_lodtensor(map(lambda x: x[0], data), place)
trg_word = to_lodtensor(map(lambda x: x[1], data), place) trg_word = to_lodtensor(map(lambda x: x[1], data), place)
...@@ -125,9 +208,48 @@ def main(): ...@@ -125,9 +208,48 @@ def main():
print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) + print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
" avg_cost=" + str(avg_cost_val)) " avg_cost=" + str(avg_cost_val))
if batch_id > 3: if batch_id > 3:
exit(0) break
batch_id += 1 batch_id += 1
def decode_main():
context = encoder()
translation_ids, translation_scores = decoder_decode(context)
exe = Executor(place)
exe.run(framework.default_startup_program())
init_ids_data = np.array([1 for _ in range(batch_size)], dtype='int64')
init_scores_data = np.array(
[1. for _ in range(batch_size)], dtype='float32')
init_ids_data = init_ids_data.reshape((batch_size, 1))
init_scores_data = init_scores_data.reshape((batch_size, 1))
init_lod = [i for i in range(batch_size)] + [batch_size]
init_lod = [init_lod, init_lod]
train_data = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(dict_size), buf_size=1000),
batch_size=batch_size)
for _, data in enumerate(train_data()):
init_ids = set_init_lod(init_ids_data, init_lod, place)
init_scores = set_init_lod(init_scores_data, init_lod, place)
src_word_data = to_lodtensor(map(lambda x: x[0], data), place)
result_ids, result_scores = exe.run(
framework.default_main_program(),
feed={
'src_word_id': src_word_data,
'init_ids': init_ids,
'init_scores': init_scores
},
fetch_list=[translation_ids, translation_scores],
return_numpy=False)
print result_ids.lod()
break
if __name__ == '__main__': if __name__ == '__main__':
main() # train_main()
decode_main()
...@@ -68,10 +68,10 @@ else: ...@@ -68,10 +68,10 @@ else:
fluid.io.save_persistables(exe, "./fit_a_line.model/") fluid.io.save_persistables(exe, "./fit_a_line.model/")
fluid.io.load_persistables(exe, "./fit_a_line.model/") fluid.io.load_persistables(exe, "./fit_a_line.model/")
for data in train_reader(): for data in train_reader():
avg_loss_value, = exe.run(trainer_prog, avg_loss_value = exe.run(trainer_prog,
feed=feeder.feed(data), feed=feeder.feed(data),
fetch_list=[avg_cost]) fetch_list=[avg_cost])
print("loss:" + str(avg_loss_value))
if avg_loss_value[0] < 10.0: if avg_loss_value[0] < 10.0:
exit(0) exit(0)
exit(1) exit(1)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
import paddle.v2.fluid.core as core
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.executor import Executor
import os
dict_size = 30000
source_dict_dim = target_dict_dim = dict_size
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
hidden_dim = 32
word_dim = 16
IS_SPARSE = True
batch_size = 10
max_length = 50
topk_size = 50
trg_dic_size = 10000
decoder_size = hidden_dim
def encoder_decoder():
# encoder
src_word_id = layers.data(
name="src_word_id", shape=[1], dtype='int64', lod_level=1)
src_embedding = layers.embedding(
input=src_word_id,
size=[dict_size, word_dim],
dtype='float32',
is_sparse=IS_SPARSE,
param_attr=fluid.ParamAttr(name='vemb'))
fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4)
encoder_out = layers.sequence_last_step(input=lstm_hidden0)
# decoder
trg_language_word = layers.data(
name="target_language_word", shape=[1], dtype='int64', lod_level=1)
trg_embedding = layers.embedding(
input=trg_language_word,
size=[dict_size, word_dim],
dtype='float32',
is_sparse=IS_SPARSE,
param_attr=fluid.ParamAttr(name='vemb'))
rnn = fluid.layers.DynamicRNN()
with rnn.block():
current_word = rnn.step_input(trg_embedding)
mem = rnn.memory(init=encoder_out)
fc1 = fluid.layers.fc(input=[current_word, mem],
size=decoder_size,
act='tanh')
out = fluid.layers.fc(input=fc1, size=target_dict_dim, act='softmax')
rnn.update_memory(mem, fc1)
rnn.output(out)
return rnn()
def to_lodtensor(data, place):
seq_lens = [len(seq) for seq in data]
cur_len = 0
lod = [cur_len]
for l in seq_lens:
cur_len += l
lod.append(cur_len)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
res = core.LoDTensor()
res.set(flattened_data, place)
res.set_lod([lod])
return res
def main():
rnn_out = encoder_decoder()
label = layers.data(
name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
cost = layers.cross_entropy(input=rnn_out, label=label)
avg_cost = fluid.layers.mean(x=cost)
optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
optimize_ops, params_grads = optimizer.minimize(avg_cost)
train_data = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(dict_size), buf_size=1000),
batch_size=batch_size)
place = core.CPUPlace()
exe = Executor(place)
t = fluid.DistributeTranspiler()
# all parameter server endpoints list for spliting parameters
pserver_endpoints = os.getenv("PSERVERS")
# server endpoint for current node
current_endpoint = os.getenv("SERVER_ENDPOINT")
# run as trainer or parameter server
training_role = os.getenv(
"TRAINING_ROLE", "TRAINER") # get the training role: trainer/pserver
t.transpile(
optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
if training_role == "PSERVER":
if not current_endpoint:
print("need env SERVER_ENDPOINT")
exit(1)
pserver_prog = t.get_pserver_program(current_endpoint)
pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
exe.run(pserver_startup)
exe.run(pserver_prog)
elif training_role == "TRAINER":
trainer_prog = t.get_trainer_program()
exe.run(framework.default_startup_program())
batch_id = 0
for pass_id in xrange(2):
for data in train_data():
word_data = to_lodtensor(map(lambda x: x[0], data), place)
trg_word = to_lodtensor(map(lambda x: x[1], data), place)
trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
outs = exe.run(trainer_prog,
feed={
'src_word_id': word_data,
'target_language_word': trg_word,
'target_language_next_word': trg_word_next
},
fetch_list=[avg_cost])
avg_cost_val = np.array(outs[0])
print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
" avg_cost=" + str(avg_cost_val))
if batch_id > 3:
exit(0)
batch_id += 1
else:
print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
if __name__ == '__main__':
main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import sys
import math
from op_test import OpTest
class TestIOUSimilarityOp(OpTest):
def test_check_output(self):
self.check_output()
def setUp(self):
self.op_type = "iou_similarity"
self.boxes1 = np.array(
[[4.0, 3.0, 7.0, 5.0], [5.0, 6.0, 10.0, 7.0]]).astype('float32')
self.boxes2 = np.array([[3.0, 4.0, 6.0, 8.0], [14.0, 14.0, 15.0, 15.0],
[0.0, 0.0, 20.0, 20.0]]).astype('float32')
self.output = np.array(
[[2.0 / 16.0, 0, 6.0 / 400.0],
[1.0 / 16.0, 0.0, 5.0 / 400.0]]).astype('float32')
self.inputs = {'X': self.boxes1, 'Y': self.boxes2}
self.outputs = {'Out': self.output}
class TestIOUSimilarityOpWithLoD(TestIOUSimilarityOp):
def test_check_output(self):
self.check_output()
def setUp(self):
super(TestIOUSimilarityOpWithLoD, self).setUp()
self.boxes1_lod = [[0, 1, 2]]
self.output_lod = [[0, 1, 2]]
self.inputs = {'X': (self.boxes1, self.boxes1_lod), 'Y': self.boxes2}
self.outputs = {'Out': (self.output, self.output_lod)}
if __name__ == '__main__':
unittest.main()
...@@ -226,6 +226,16 @@ class TestBook(unittest.TestCase): ...@@ -226,6 +226,16 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(out) self.assertIsNotNone(out)
print(str(program)) print(str(program))
def test_im2sequence(self):
print("test_im2sequence")
program = Program()
with program_guard(program):
x = layers.data(name='x', shape=[3, 128, 128], dtype='float32')
output = layers.im2sequence(
input=x, stride=[1, 1], filter_size=[2, 2])
self.assertIsNotNone(output)
print(str(program))
@decorators.prog_scope() @decorators.prog_scope()
def test_nce(self): def test_nce(self):
window_size = 5 window_size = 5
...@@ -261,6 +271,14 @@ class TestBook(unittest.TestCase): ...@@ -261,6 +271,14 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(avg_loss) self.assertIsNotNone(avg_loss)
print(str(default_main_program())) print(str(default_main_program()))
def test_row_conv(self):
program = Program()
with program_guard(program):
x = layers.data(name='x', shape=[16], dtype='float32', lod_level=1)
out = layers.row_conv(input=x, future_context_size=2)
self.assertIsNotNone(out)
print(str(program))
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -33,5 +33,19 @@ class TestLookupTableOp(OpTest): ...@@ -33,5 +33,19 @@ class TestLookupTableOp(OpTest):
self.check_grad(['W'], 'Out', no_grad_set=set('Ids')) self.check_grad(['W'], 'Out', no_grad_set=set('Ids'))
class TestLookupTableOpWithPadding(TestLookupTableOp):
def test_check_output(self):
ids = np.squeeze(self.inputs['Ids'])
padding_idx = np.random.choice(ids, 1)[0]
self.outputs['Out'][ids == padding_idx] = np.zeros(31)
self.attrs = {'padding_idx': long(padding_idx)}
self.check_output()
def test_check_grad(self):
# Since paddings are not trainable and fixed in forward, the gradient of
# paddings makes no sense and we don't test the gradient here.
pass
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -159,7 +159,7 @@ class ParallelOpTest(BaseParallelForTest): ...@@ -159,7 +159,7 @@ class ParallelOpTest(BaseParallelForTest):
def test_simple_fc(self): def test_simple_fc(self):
self.run_test( self.run_test(
callback=ParallelOpTest.__network__, callback=self.__network__,
feed={ feed={
'img': numpy.random.random(size=(51, 784)).astype('float32') 'img': numpy.random.random(size=(51, 784)).astype('float32')
}, },
...@@ -167,10 +167,35 @@ class ParallelOpTest(BaseParallelForTest): ...@@ -167,10 +167,35 @@ class ParallelOpTest(BaseParallelForTest):
def test_fc_with_tiny_data(self): def test_fc_with_tiny_data(self):
self.run_test( self.run_test(
callback=ParallelOpTest.__network__, callback=self.__network__,
feed={'img': numpy.random.random(size=(1, 784)).astype('float32')}, feed={'img': numpy.random.random(size=(1, 784)).astype('float32')},
fetch=['fc1.w@GRAD']) fetch=['fc1.w@GRAD'])
class ParallelOpTestMultipleInput(BaseParallelForTest):
@staticmethod
def __network__():
x = fluid.layers.data(
shape=[784], dtype='float32', name='img1', stop_gradient=False)
y = fluid.layers.data(
shape=[784], dtype='float32', name='img2', stop_gradient=False)
yield [x, y]
x = x + y
hidden1 = fluid.layers.fc(input=x, size=200, param_attr='fc1.w')
hidden2 = fluid.layers.fc(input=hidden1, size=200, param_attr='fc2.w')
hidden3 = fluid.layers.fc(input=hidden2, size=200, param_attr='fc3.w')
loss = fluid.layers.mean(x=hidden3)
yield loss
def test_simple_fc(self):
self.run_test(
callback=self.__network__,
feed={
'img1': numpy.random.random(size=(51, 784)).astype('float32'),
'img2': numpy.random.random(size=(51, 784)).astype('float32')
},
fetch=['fc1.w@GRAD', 'fc2.w@GRAD', 'fc3.w@GRAD'])
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -13,11 +13,12 @@ ...@@ -13,11 +13,12 @@
# limitations under the License. # limitations under the License.
import unittest import unittest
import os
import numpy as np import numpy as np
import paddle.v2.fluid as fluid import paddle.v2.fluid as fluid
import paddle.v2.fluid.profiler as profiler import paddle.v2.fluid.profiler as profiler
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
import os import paddle.v2.fluid.core as core
class TestProfiler(unittest.TestCase): class TestProfiler(unittest.TestCase):
...@@ -40,6 +41,50 @@ class TestProfiler(unittest.TestCase): ...@@ -40,6 +41,50 @@ class TestProfiler(unittest.TestCase):
exe.run(fluid.default_main_program(), feed={'data': input}) exe.run(fluid.default_main_program(), feed={'data': input})
os.remove(output_file) os.remove(output_file)
def net_profiler(self, state):
if state == 'GPU' and not core.is_compile_gpu():
return
startup_program = fluid.Program()
main_program = fluid.Program()
with fluid.program_guard(main_program, startup_program):
image = fluid.layers.data(name='x', shape=[784], dtype='float32')
hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
label = fluid.layers.data(name='y', shape=[1], dtype='int64')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
opts = optimizer.minimize(avg_cost, startup_program=startup_program)
place = fluid.CPUPlace() if state == 'CPU' else fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(startup_program)
accuracy.reset(exe)
with profiler.profiler(state, 'total') as prof:
for iter in range(10):
if iter == 2:
profiler.reset_profiler()
x = np.random.random((32, 784)).astype("float32")
y = np.random.randint(0, 10, (32, 1)).astype("int64")
outs = exe.run(main_program,
feed={'x': x,
'y': y},
fetch_list=[avg_cost] + accuracy.metrics)
acc = np.array(outs[1])
pass_acc = accuracy.eval(exe)
def test_cpu_profiler(self):
self.net_profiler('CPU')
def test_cuda_profiler(self):
self.net_profiler('GPU')
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -176,7 +176,6 @@ def resize_short(im, size): ...@@ -176,7 +176,6 @@ def resize_short(im, size):
:param size: the shorter edge size of image after resizing. :param size: the shorter edge size of image after resizing.
:type size: int :type size: int
""" """
assert im.shape[-1] == 1 or im.shape[-1] == 3
h, w = im.shape[:2] h, w = im.shape[:2]
h_new, w_new = size, size h_new, w_new = size, size
if h > w: if h > w:
...@@ -267,7 +266,7 @@ def random_crop(im, size, is_color=True): ...@@ -267,7 +266,7 @@ def random_crop(im, size, is_color=True):
return im return im
def left_right_flip(im): def left_right_flip(im, is_color=True):
""" """
Flip an image along the horizontal direction. Flip an image along the horizontal direction.
Return the flipped image. Return the flipped image.
...@@ -278,13 +277,15 @@ def left_right_flip(im): ...@@ -278,13 +277,15 @@ def left_right_flip(im):
im = left_right_flip(im) im = left_right_flip(im)
:paam im: input image with HWC layout :param im: input image with HWC layout or HW layout for gray image
:type im: ndarray :type im: ndarray
:param is_color: whether input image is color or not
:type is_color: bool
""" """
if len(im.shape) == 3: if len(im.shape) == 3 and is_color:
return im[:, ::-1, :] return im[:, ::-1, :]
else: else:
return im[:, ::-1, :] return im[:, ::-1]
def simple_transform(im, def simple_transform(im,
...@@ -319,11 +320,12 @@ def simple_transform(im, ...@@ -319,11 +320,12 @@ def simple_transform(im,
""" """
im = resize_short(im, resize_size) im = resize_short(im, resize_size)
if is_train: if is_train:
im = random_crop(im, crop_size) im = random_crop(im, crop_size, is_color=is_color)
if np.random.randint(2) == 0: if np.random.randint(2) == 0:
im = left_right_flip(im) im = left_right_flip(im, is_color)
else: else:
im = center_crop(im, crop_size) im = center_crop(im, crop_size, is_color)
im = center_crop(im, crop_size, is_color=is_color)
if len(im.shape) == 3: if len(im.shape) == 3:
im = to_chw(im) im = to_chw(im)
...@@ -331,8 +333,10 @@ def simple_transform(im, ...@@ -331,8 +333,10 @@ def simple_transform(im,
if mean is not None: if mean is not None:
mean = np.array(mean, dtype=np.float32) mean = np.array(mean, dtype=np.float32)
# mean value, may be one value per channel # mean value, may be one value per channel
if mean.ndim == 1: if mean.ndim == 1 and is_color:
mean = mean[:, np.newaxis, np.newaxis] mean = mean[:, np.newaxis, np.newaxis]
elif mean.ndim == 1:
mean = mean
else: else:
# elementwise mean # elementwise mean
assert len(mean.shape) == len(im) assert len(mean.shape) == len(im)
...@@ -372,6 +376,6 @@ def load_and_transform(filename, ...@@ -372,6 +376,6 @@ def load_and_transform(filename,
mean values per channel. mean values per channel.
:type mean: numpy array | list :type mean: numpy array | list
""" """
im = load_image(filename) im = load_image(filename, is_color)
im = simple_transform(im, resize_size, crop_size, is_train, is_color, mean) im = simple_transform(im, resize_size, crop_size, is_train, is_color, mean)
return im return im
...@@ -35,7 +35,7 @@ RUN cd /opt && wget -q --no-check-certificate https://github.com/google/protobuf ...@@ -35,7 +35,7 @@ RUN cd /opt && wget -q --no-check-certificate https://github.com/google/protobuf
cd protobuf-3.1.0 && ./configure && make -j4 && make install && cd .. && rm -f protobuf-cpp-3.1.0.tar.gz cd protobuf-3.1.0 && ./configure && make -j4 && make install && cd .. && rm -f protobuf-cpp-3.1.0.tar.gz
RUN yum install -y sqlite-devel zlib-devel openssl-devel boost boost-devel pcre-devel vim tk-devel tkinter libtool RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool
RUN wget -O /root/requirements.txt https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt RUN wget -O /root/requirements.txt https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册