提交 f42ea489 编写于 作者: N nhzlx

deal with conflict

......@@ -136,6 +136,12 @@ else()
set(THIRD_PARTY_BUILD_TYPE Release)
endif()
# MKL_SPLIT_GEMM is only offered as a user option when MKL support is enabled.
if(WITH_MKL)
  option(MKL_SPLIT_GEMM "PaddlePaddle MKL gemm would split to small ones" OFF)
endif()
# Define the PADDLE_MKL_SPLIT_GEMM preprocessor macro only when MKL is enabled
# and the option above is switched on.
if(WITH_MKL AND MKL_SPLIT_GEMM)
  add_definitions(-DPADDLE_MKL_SPLIT_GEMM)
endif()
set(WITH_MKLML ${WITH_MKL})
if (NOT DEFINED WITH_MKLDNN)
if (WITH_MKL AND AVX2_FOUND)
......
......@@ -18,7 +18,21 @@ learning to many products at Baidu.
Our vision is to enable deep learning for everyone via PaddlePaddle.
Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
### Lastest PaddlePaddle Version: [Fluid](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid)
### Latest PaddlePaddle Release: [Fluid 0.14.0](https://github.com/PaddlePaddle/Paddle/tree/v0.14.0)
### Install Latest Stable Release:
```
# Linux CPU
pip install paddlepaddle
# Linux GPU cuda9cudnn7
pip install paddlepaddle-gpu
# Linux GPU cuda8cudnn7
pip install paddlepaddle-gpu==0.14.0.post87
# Linux GPU cuda8cudnn5
pip install paddlepaddle-gpu==0.14.0.post85
# For installation on other platforms, refer to http://paddlepaddle.org/
```
## Features
......
#!/bin/bash
set -e
function train() {
......
#!/bin/bash
set -e
function clock_to_seconds() {
......
#!/bin/bash
set -e
function train() {
......
#!/bin/bash
set -e
function clock_to_seconds() {
......
#!/bin/bash
set -e
function train() {
......
#!/bin/bash
set -e
function test() {
......
#!/bin/bash
set -e
function test() {
......
#!/bin/bash
set -e
function test() {
......
#!/bin/bash
set -e
function test() {
......
......@@ -4,25 +4,42 @@ set(tmp_version "HEAD")
set(TAG_VERSION_REGEX "[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?")
set(COMMIT_VERSION_REGEX "[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+")
while ("${PADDLE_VERSION}" STREQUAL "")
# Check current branch name
execute_process(
COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 --always ${tmp_version}
COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref ${tmp_version}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
OUTPUT_VARIABLE GIT_TAG_NAME
RESULT_VARIABLE GIT_RESULT
OUTPUT_VARIABLE GIT_BRANCH_NAME
RESULT_VARIABLE GIT_BRANCH_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if (NOT ${GIT_RESULT})
# Check the tag is a correct version
if (${GIT_TAG_NAME} MATCHES "${COMMIT_VERSION_REGEX}")
# if no tag was found, set PADDLE_VERSION to latest
set(PADDLE_VERSION "latest")
elseif (${GIT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
string(REPLACE "v" "" PADDLE_VERSION ${GIT_TAG_NAME})
else() # otherwise, get the previous git tag name.
set(tmp_version "${GIT_TAG_NAME}~1")
if (NOT ${GIT_BRANCH_RESULT})
execute_process(
COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 --always ${tmp_version}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
OUTPUT_VARIABLE GIT_TAG_NAME
RESULT_VARIABLE GIT_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if (NOT ${GIT_RESULT})
# Check if current branch is release branch
# Derive PADDLE_VERSION from the branch name and tag name reported by git.
# Only release branches ("release/<x.y.z>") take their version from a tag;
# every other branch is versioned as "latest".
# The variable expansions are quoted so that an empty value, or a value
# containing ';', stays a single argument instead of breaking the command.
if ("${GIT_BRANCH_NAME}" MATCHES "release/${TAG_VERSION_REGEX}")
  # Check the tag is a correct version
  if ("${GIT_TAG_NAME}" MATCHES "${COMMIT_VERSION_REGEX}")
    # if no tag was found, set PADDLE_VERSION to latest
    set(PADDLE_VERSION "latest")
  elseif ("${GIT_TAG_NAME}" MATCHES "v${TAG_VERSION_REGEX}")
    # Drop the "v" from the tag name to obtain the bare version number.
    string(REPLACE "v" "" PADDLE_VERSION "${GIT_TAG_NAME}")
  else()
    # Not a version tag: look at the commit preceding this tag next.
    set(tmp_version "${GIT_TAG_NAME}~1")
  endif()
else()
  # otherwise, we always set PADDLE_VERSION to latest
  set(PADDLE_VERSION "latest")
endif()
else()
set(PADDLE_VERSION "0.0.0")
message(WARNING "Cannot add paddle version from git tag")
endif()
else()
set(PADDLE_VERSION "0.0.0")
message(WARNING "Cannot add paddle version from git tag")
message(WARNING "Cannot add paddle version for wrong git branch result")
endif()
endwhile()
......
## Motivation
There is a ```gap``` between the ```Program``` defined by
the user and the ```Executable``` that can be scheduled
efficiently on heterogeneous hardware, either locally
or in a distributed setting.
Usually, the ```gap``` is bridged by
* A series of transformations with a defined order.
* These transformations usually involve
```insert, delete, clustering, split, dependency analysis```.
* Has a simple way to verify and debug each transformation.
* Flexible to add, remove or customize transformations to fit
the requirements of various algorithms (models) and hardware scenarios.
Some other events also push us to a better unified pattern.
* The deep learning framework is built around the concepts of graphs.
To leverage tools such as compilation (e.g. TVM and nGraph) or
cross-framework conversion (e.g. ONNX), we also need an intermediate
representation that can be connected to the rest of the ecosystem.
We need a unified pattern to naturally support the requirements
described above. The pattern should fit training, inference,
and other offline serialized model transformations.
Learning from LLVM and other deep learning frameworks, we draft the
design below.
## Design
### Major Concepts
#### Node
```Node``` represents an operation that performs some computation or
a variable that is input or output of operation.
```Node```s are connected to other ```Node```s via inputs and outputs.
Other properties (maybe device placement information) can be added
to ```Node``` in the future if it's a
common requirement of many other ```Pass```es. Otherwise, it should live
in a ```Node``` wrapper class that is private to some ```Pass``` or be
a local member of a ```Pass```.
#### Graph
```Graph``` contains a list of ```Node```s, which are connected to
each other via inputs and outputs.
TODO: Better definitions for the graph.
```Graph``` can also contain ```Attribute```s. ```Attribute```s
can be ``any`` thing. For example, it can be a list of "wrapper"
nodes. The ```wrapper``` nodes compose ```Node```s and provide
helper methods for execution or transformation. ```Attribute```
can also contain other things that describe some properties of
the ```Graph``` or ```Graph``` nodes. ```Attribute``` can be passed
across ```Pass```es. However, it should be used with care.
#### Pass
```Pass``` represents a transformation of ```Graph```. Its input
is a ```Graph``` and its output is also a ```Graph```. For example,
a ```Pass``` can simply print out the ```Graph```. A ```Pass```
can also fuse some ```Graph```'s ```Node```s.
#### Optimize
```Optimize``` contains a series of ```Pass```es with a defined order.
```Optimize``` transforms a ```Graph``` that only contains raw
modeling logic to a ```Graph``` that can be run efficiently while
maintaining the original modeling logic.
### Optimize Process
* Program is first converted to Graph.
* Graph goes through a series of Passes.
* Graph is transformed from raw model logic to a
form that is efficient to execute.
Program->ProgramToGraph->Graph->Pass1->Graph->Pass2->Graph->Pass3->Graph->Executor
......@@ -4,7 +4,6 @@ API
.. toctree::
:maxdepth: 1
overview.rst
model_configs.rst
data.rst
run_logic.rst
#################
Parameter Setting
#################
##################
Parameter Settings
##################
TBD
.. contents::
1. How to Choose the Learning Rate of SGD Algorithm
-------------------------------------------------------
An important issue when training with :code:`sgd/async_sgd` is to choose the correct value for :code:`learning_rate`. If it is too large, the training may not converge. If too small, the convergence may be slow, resulting in a long training time.
Usually, we start with a relatively large learning rate. If the training does not converge, then we need to reduce the learning rate continuously by a factor of 10 until the training converges. We examine the convergence of the training by estimating the minimum cost at a constant output of the model.
If the cost during training is significantly higher than this minimum cost, we conclude that the training has not converged. For example, suppose we have a three-class problem that uses multi-class-cross-entropy as the cost, and the proportions of classes 0, 1, and 2 in the data are :code:`0.2, 0.5, 0.3`. The minimum cost of a constant output is then :code:`-(0.2*log(0.2)+0.5*log(0.5)+0.3*log(0.3))=1.03`. If the cost is still greater than this number after training for one pass (or even earlier), then the training has probably not converged and the learning rate should be reduced.
2. How to Implement Learning Rate Annealing
------------------------------------------------
We use the Adam algorithm as an example. Set the parameters of :code:`learning_rate_schedule` in the corresponding optimization algorithm as follows:
.. code-block:: python
    optimizer = paddle.optimizer.Adam(
        learning_rate=1e-3,
        learning_rate_decay_a=0.5,
        learning_rate_decay_b=0.75,
        learning_rate_schedule="poly",)
PaddlePaddle currently supports 8 learning rate schedules. The 8 learning rate schedules and their corresponding learning rates are calculated as follows:
* "constant"
  
  Lr = learning_rate
* "poly"
  Lr = learning_rate * pow(1 + learning_rate_decay_a * num_samples_processed, -learning_rate_decay_b)
  Variable :code:`num_samples_processed` is the number of trained samples.
* "caffe_poly"
  Lr = learning_rate * pow(1.0 - num_samples_processed / learning_rate_decay_a, learning_rate_decay_b)
* "exp"
  Lr = learning_rate * pow(learning_rate_decay_a, num_samples_processed / learning_rate_decay_b)
* "discexp"
  Lr = learning_rate * pow(learning_rate_decay_a, floor(num_samples_processed / learning_rate_decay_b))
* "linear"
  Lr = max(learning_rate - learning_rate_decay_a * num_samples_processed, learning_rate_decay_b)
* "manual"
  This is a learning rate annealing method that is segmented by the number of trained samples. When using this learning rate schedule, we modify the learning rate attenuation factor piecewise function by changing the parameter :code:`learning_rate_args`. The current learning rate is the product of :code:`learning_rate` and the current attenuation factor. Take the Adam algorithm as an example:
  .. code-block:: python
      optimizer = paddle.optimizer.Adam(
          learning_rate=1e-3,
          learning_rate_schedule="manual",
          learning_rate_args="1000:1.0,2000:0.9,3000:0.8",)
  In this example, when the number of trained samples is less than or equal to 1000, the learning rate is :code:`1e-3*1.0`; when the number of trained samples is greater than 1000 and less than or equal to 2000, the learning rate is :code:`1e-3*0.9`; when the number of trained samples is greater than 2000, the learning rate is :code:`1e-3*0.8`.
* "pass_manual"
  This is a learning rate annealing method that picks values piecewise according to the number of trained passes. When using this learning rate schedule, we set the piecewise function of the learning rate attenuation factor through the parameter :code:`learning_rate_args`. The current learning rate is the product of :code:`learning_rate` and the current attenuation factor. Take the Adam algorithm as an example:
  .. code-block:: python
      optimizer = paddle.optimizer.Adam(
          learning_rate=1e-3,
          learning_rate_schedule="pass_manual",
          learning_rate_args="1:1.0,2:0.9,3:0.8",)
  In this example, when the number of trained passes is less than or equal to 1, the learning rate is :code:`1e-3*1.0`; when the number of trained passes is greater than 1 and less than or equal to 2, the learning rate is :code:`1e-3*0.9`; when the number of trained passes is greater than 2, the learning rate is :code:`1e-3*0.8`.
3. How to Initialize Parameters
-----------------------------------
By default, PaddlePaddle initializes parameters with a mean of 0 and a standard deviation of :math:`\frac{1}{\sqrt{d}}`, where :math:`d` is the width of the parameter matrix. This initialization method does not produce bad results under normal circumstances. If users want to customize the initialization method, PaddlePaddle provides two ways to initialize the parameters:
* Gaussian distribution. Set :code:`param_attr` to :code:`param_attr=ParamAttr(initial_mean=0.0, initial_std=1.0)`
* Uniform distribution. Set :code:`param_attr` to :code:`param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0)`
For example, to set the parameter initialization mode and the bias initialization mode of a fully connected layer, you can use the following code:
.. code-block:: python
    hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0),
                      bias_attr=ParamAttr(initial_mean=1.0, initial_std=0.0))
The above code initializes the bias to 1.0 and initializes the parameters from a uniform distribution over :code:`[-1.0, 1.0]`.
4. How to Share Parameters
------------------------------
PaddlePaddle's parameters use :code:`name` as the ID. Parameters with the same name are shared. We can set the name of a parameter using :code:`ParamAttr(name="YOUR_PARAM_NAME")`. More conveniently, we can make the layers that share parameters use the same :code:`ParamAttr` object.
A simple fully connected network has its configuration of parameter sharing as follows:
.. literalinclude:: ../../python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
Here :code:`hidden_a` and :code:`hidden_b` have the same parameter and bias. The two inputs of the softmax layer also use the same parameter :code:`softmax_param`.
5. How to Load Pre-training Parameters
------------------------------------------
* For layers that load pre-training parameters, set :code:`is_static = True` so that the parameters of that layer remain unchanged during the training process. Take the embedding layer as an example, the code is as follows:
.. code-block:: python
    emb_para = paddle.attr.Param(name='emb', is_static=True)
    paddle.layer.embedding(size=word_dim, input=x, param_attr=emb_para)
* Load pre-training parameters from the model file into :code:`numpy.array`. After creating the parameters, load the pre-training parameters using :code:`parameters.set()`. The first 16 bytes of the model parameter file saved by PaddlePaddle are the header information. The user must load the data into a :code:`numpy.array` starting from the 17th byte. Take the embedding layer as an example, the code is as follows:
.. code-block:: python
    def load_parameter(file_name, h, w):
        with open(file_name, 'rb') as f:
            f.read(16) # skip header.
            return np.fromfile(f, dtype=np.float32).reshape(h, w)
    parameters = paddle.parameters.create(my_cost)
    parameters.set('emb', load_parameter(emb_param_file, 30000, 256))
6. Format of the Stored Parameter and How to Convert the File to Plain Text
--------------------------------------------------------------------------------
The model parameter file saved by PaddlePaddle consists of 16 bytes of header information and network parameters. In the header information, the first four bytes show PaddlePaddle's version information. The user should fill in with 0s. The next four bytes represent the number of bytes occupied by each parameter. If the saved network parameter is a float type, the number is four; if it is a double, the number is eight. The third group of four bytes represents the total number of saved parameters.
When restoring the model parameters saved by PaddlePaddle back to plain text, we use a :code:`numpy.array` of the corresponding data type to load the network parameters, skipping the header information of the PaddlePaddle model parameter file. Unless PaddlePaddle was explicitly compiled with double precision, the parameters are calculated and stored as floats. In this case, when using :code:`numpy.array`, we generally set :code:`dtype=float32`. An example is as follows:
.. code-block:: python
    def read_parameter(fname, width):
        s = open(fname).read()
        # skip header
        vec = np.fromstring(s[16:], dtype=np.float32)
        # width is the size of the corresponding layer
        np.savetxt(fname + ".csv", vec.reshape(width, -1),
                fmt="%.6f", delimiter=",")
When the plaintext parameters are converted into PaddlePaddle loadable model parameters, the header information is constructed first, then the network parameters are written. The following code converts the randomly generated matrix into model parameters that can be loaded by PaddlePaddle:
.. code-block:: python
    def gen_rand_param(param_file, width, height, need_trans):
        np.random.seed()
        header = struct.pack("iil", 0, 4, height * width)
        param = np.float32(np.random.rand(height, width))
        with open(param_file, "w") as fparam:
            fparam.write(header + param.tostring())
7. A Protocol Message Rejected Because of its Large Size
------------------------------------------------------------
If you are training NLP related models, and the following error occurs:
.. code-block:: bash
    [libprotobuf ERROR google/protobuf/io/coded_stream.cc:171] A protocol message was rejected because it was too big (more than 67108864 bytes). To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit( ) in google/protobuf/io/coded_stream.h.
    F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr)
The possible reason is that one of the args passed to the dataprovider is too large, which is usually caused by directly passing a large dictionary. An incorrect use of `define_py_data_sources2` looks like this:
.. code-block:: python
     src_dict = dict()
     for line_count, line in enumerate(open(src_dict_path, "r")):
        src_dict[line.strip()] = line_count
     define_py_data_sources2(
        train_list,
        test_list,
        module="dataprovider",
        obj="process",
        args={"src_dict": src_dict})
The solution is to pass the path of the dictionary as args to the dataprovider, and then load the dictionary from that path inside the dataprovider. Change `define_py_data_sources2` to:
.. code-block:: python
     define_py_data_sources2(
        train_list,
        test_list,
        module="dataprovider",
        obj="process",
        args={"src_dict_path": src_dict_path})
The full source code can be found in the `sequence_recurrent <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequence_recurrent.py>`_ example.
Layers supporting hierarchical sequence as input
================================================
TBD
###################################################
Layers that Support Hierarchical Sequences as Input
###################################################
 
.. contents::
 
Overview
========
 
A sequence is a common data type in natural language processing tasks. An independent word can be regarded as a non-sequential input or a 0-level sequence. A sentence made up of words is a single-level sequence; a number of sentences make up a paragraph, which is a double-level sequence.
 
A double-level sequence is a nested sequence where each element is a single-level sequence. This is a very flexible way of organizing data that helps us construct some complex input information.
 
We can define non-sequences, single-level sequences, and double-level sequences at the following levels.
 
+ 0-level sequence: an independent element. Its type can be any input data type supported by PaddlePaddle;
+ Single-level sequence: multiple elements arranged in a row; each element is a 0-level sequence. The order of elements is an important input information;
+ Double-level sequence: multiple elements arranged in a row; each element is a single-level sequence called a subseq of a double-level sequence, and each element of the subseq is a 0-level sequence.
 
In PaddlePaddle, the following layers accept double-level sequences as input and perform corresponding calculations.
 
`pooling`
=========
 
The use of pooling is as follows:
 
.. code-block:: bash
 
        seq_pool = pooling(input=layer,
                           pooling_type=pooling.Max(),
                           agg_level=AggregateLevel.TO_SEQUENCE)
        
- `pooling_type` currently supports two types: pooling.Max() and pooling.Avg().
 
- When `agg_level=AggregateLevel.TO_NO_SEQUENCE` (default):
 
  - Effect: a double-level sequence input will be converted into a 0-level sequence, and a single-level sequence will be converted into a 0-level sequence 
  - Input: a double-level sequence or a single-level sequence
  - Output: a 0-level sequence which is the average (or maximum) of the entire input sequence (single or double)
 
- When `agg_level=AggregateLevel.TO_SEQUENCE`:
 
  - Effect: a double-level sequence will be transformed into a single-level sequence
  - Input: a double-level sequence
  - Output: a single-level sequence where each element of the sequence is the average (or maximum) value of each subseq element of the original double-level sequence.
 
`last_seq` and `first_seq`
==========================
 
An example of using `last_seq` is as follows (usage of `first_seq` is similar).
 
.. code-block:: bash
 
        last = last_seq(input=layer,
                        agg_level=AggregateLevel.TO_SEQUENCE)
        
- When `agg_level=AggregateLevel.TO_NO_SEQUENCE` (default):
 
  - Effect: a double-level sequence input will be converted into a 0-level sequence, and a single-level sequence will be converted into a 0-level sequence
  - Input: a double-level sequence or a single-level sequence
  - Output: a 0-level sequence, which is the last or the first element of the input sequence (double or single level).
 
- When `agg_level=AggregateLevel.TO_SEQUENCE`:
  - Effect: a double-level sequence will be transformed into a single-level sequence
  - Input: a double-level sequence
  - Output: a single-level sequence in which each element is the last (or first) element of each subseq in a double-level sequence.
 
`expand`
========
 
The use of expand is as follows.
 
.. code-block:: bash
 
        ex = expand(input=layer1,
                    expand_as=layer2,
                    expand_level=ExpandLevel.FROM_NO_SEQUENCE)
        
- When `expand_level=ExpandLevel.FROM_NO_SEQUENCE` (default):
 
  - Effect: a 0-level sequence is extended to a single-level sequence or a double-level sequence
  - Input: layer1 must be a 0-level sequence to be extended; layer2 can be a single-level sequence or a double-level sequence that provides the extended length information
  - Output: a single-level sequence or a double-level sequence; the type of the output sequence and the number of elements contained in the sequence are the same as layer2. If the output is a single-level sequence, each element of the single-level sequence will be a copy of the layer1 element. If the output is a double-level sequence, each element in the double-level sequence will be a copy of the layer1 element
 
- When `expand_level=ExpandLevel.FROM_SEQUENCE`:
 
  - Effect: a single-level sequence is extended to a double-level sequence
  - Input: layer1 must be a single-level sequence to be extended; layer2 must be a double-level sequence providing extended length information
  - Output: a double-level sequence with the same number of elements as that of layer2. It is required that the number of elements in the single-level sequence be the same as the number of subseq in the double-level sequences. The i-th element of the single-level sequence (the 0-level sequence) is expanded into a single-level sequence that constitutes the i-th subseq of the output, the double-level sequence.
......@@ -179,26 +179,17 @@ paddle.fluid.layers.relu ArgSpec(args=['x'], varargs=None, keywords=None, defaul
paddle.fluid.layers.log ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.BlockGuardServ.__init__ ArgSpec(args=['self', 'server'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.ListenAndServ.__init__ ArgSpec(args=['self', 'endpoint', 'inputs', 'fan_in', 'optimizer_mode'], varargs=None, keywords=None, defaults=(1, True))
paddle.fluid.layers.ListenAndServ.complete_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.ListenAndServ.do ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.ListenAndServ.get_params_and_grads ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.ListenAndServ.parent_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.Send ArgSpec(args=['endpoints', 'send_vars', 'sync'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.layers.Recv ArgSpec(args=['endpoints', 'get_vars', 'sync'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, None, 1, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.shuffle ArgSpec(args=['reader', 'buffer_size'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.batch ArgSpec(args=['reader', 'batch_size'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.double_buffer ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.layers.random_data_generator ArgSpec(args=['low', 'high', 'shapes', 'lod_levels', 'for_parallel'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.layers.py_reader ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.py_reader ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True))
paddle.fluid.layers.Preprocessor.__init__ ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.Preprocessor.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.layers.Preprocessor.inputs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.Preprocessor.is_completed ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.Preprocessor.outputs ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None)
paddle.fluid.layers.load ArgSpec(args=['out', 'file_path', 'load_as_fp16'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.create_tensor ArgSpec(args=['dtype', 'name', 'persistable'], varargs=None, keywords=None, defaults=(None, False))
......@@ -218,9 +209,6 @@ paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=
paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.split_lod_tensor ArgSpec(args=['input', 'mask', 'level'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.layers.merge_lod_tensor ArgSpec(args=['in_true', 'in_false', 'x', 'mask', 'level'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.layers.BlockGuard.__init__ ArgSpec(args=['self', 'main_program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.BlockGuardWithCompletion.__init__ ArgSpec(args=['self', 'rnn'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.WhileGuard.__init__ ArgSpec(args=['self', 'while_op'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.While.complete ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
......@@ -350,6 +338,26 @@ paddle.fluid.layers.polynomial_decay ArgSpec(args=['learning_rate', 'decay_steps
paddle.fluid.layers.piecewise_decay ArgSpec(args=['boundaries', 'values'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.noam_decay ArgSpec(args=['d_model', 'warmup_steps'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.append_LARS ArgSpec(args=['params_grads', 'learning_rate', 'weight_decay'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.InitState.__init__ ArgSpec(args=['self', 'init', 'shape', 'value', 'init_boot', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, None, False, 'float32'))
paddle.fluid.contrib.StateCell.__init__ ArgSpec(args=['self', 'inputs', 'states', 'out_state', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.StateCell.compute_state ArgSpec(args=['self', 'inputs'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.StateCell.get_input ArgSpec(args=['self', 'input_name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.StateCell.get_state ArgSpec(args=['self', 'state_name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.StateCell.out_state ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.StateCell.set_state ArgSpec(args=['self', 'state_name', 'state_value'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.StateCell.state_updater ArgSpec(args=['self', 'updater'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.StateCell.update_states ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.TrainingDecoder.__init__ ArgSpec(args=['self', 'state_cell', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.contrib.TrainingDecoder.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.contrib.TrainingDecoder.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None)
paddle.fluid.contrib.TrainingDecoder.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.TrainingDecoder.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.BeamSearchDecoder.__init__ ArgSpec(args=['self', 'state_cell', 'init_ids', 'init_scores', 'target_dict_dim', 'word_dim', 'input_var_dict', 'topk_size', 'sparse_emb', 'max_len', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=({}, 50, True, 100, 1, 1, None))
paddle.fluid.contrib.BeamSearchDecoder.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.contrib.BeamSearchDecoder.decode ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False))
paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
......
add_subdirectory(details)
add_subdirectory(ir)
# ddim lib
proto_library(framework_proto SRCS framework.proto)
......@@ -93,7 +94,7 @@ else()
endif()
cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph)
cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
......
cc_library(var_handle SRCS var_handle.cc DEPS place)
cc_library(var_handle SRCS var_handle.cc DEPS place framework_proto)
cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor)
cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS graph)
cc_library(ssa_graph_printer SRCS ssa_graph_printer.cc DEPS ssa_graph_builder)
cc_library(ssa_graph_checker SRCS ssa_graph_checker.cc DEPS ssa_graph_builder)
......@@ -35,7 +34,7 @@ cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS
cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker)
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto)
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto)
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
simple_threadpool device_context)
......
......@@ -23,10 +23,14 @@ namespace framework {
namespace details {
#ifdef PADDLE_WITH_CUDA
AllReduceOpHandle::AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::NCCLContextMap *ctxs)
: local_scopes_(local_scopes), places_(places), nccl_ctxs_(ctxs) {
: OpHandleBase(node),
local_scopes_(local_scopes),
places_(places),
nccl_ctxs_(ctxs) {
if (nccl_ctxs_) {
for (auto &p : places_) {
this->dev_ctxes_[p] = nccl_ctxs_->DevCtx(p);
......@@ -34,9 +38,10 @@ AllReduceOpHandle::AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
}
}
#else
AllReduceOpHandle::AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places)
: local_scopes_(local_scopes), places_(places) {}
: OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
#endif
void AllReduceOpHandle::RunImpl() {
......
......@@ -30,11 +30,11 @@ namespace details {
struct AllReduceOpHandle : public OpHandleBase {
#ifdef PADDLE_WITH_CUDA
AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::NCCLContextMap *ctxs);
#else
AllReduceOpHandle(const std::vector<Scope *> &local_scopes,
AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places);
#endif
std::string Name() const override;
......
......@@ -35,10 +35,13 @@ namespace details {
struct BroadcastOpHandle : public OpHandleBase {
public:
#ifdef PADDLE_WITH_CUDA
BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
BroadcastOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::NCCLContextMap *nccl_ctxs)
: local_scopes_(local_scopes), places_(places), nccl_ctxs_(nccl_ctxs) {
: OpHandleBase(node),
local_scopes_(local_scopes),
places_(places),
nccl_ctxs_(nccl_ctxs) {
if (nccl_ctxs_) {
for (auto &p_ctx : nccl_ctxs_->contexts_) {
dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
......@@ -46,9 +49,9 @@ struct BroadcastOpHandle : public OpHandleBase {
}
}
#else
BroadcastOpHandle(const std::vector<Scope *> &local_scopes,
BroadcastOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places)
: local_scopes_(local_scopes), places_(places) {}
: OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
#endif
std::string Name() const override;
......
......@@ -96,48 +96,61 @@ struct TestBroadcastOpHandle {
}
param_scopes_[input_scope_idx]->Var("input");
std::unique_ptr<ir::Node> n(
new ir::Node("node0", ir::Node::Type::kOperation));
if (use_gpu_) {
#ifdef PADDLE_WITH_CUDA
op_handle_.reset(
new BroadcastOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_,
nccl_ctxs_.get()));
#else
PADDLE_THROW("CUDA is not support.");
#endif
} else {
#ifdef PADDLE_WITH_CUDA
op_handle_.reset(
new BroadcastOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_,
nccl_ctxs_.get()));
#else
op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_));
op_handle_.reset(
new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_));
#endif
}
auto* in_var_handle =
new VarHandle(1, input_scope_idx, "input", gpu_list_[input_scope_idx]);
std::unique_ptr<ir::Node> v(
new ir::Node("node1", ir::Node::Type::kVariable));
auto* in_var_handle = new VarHandle(v.get(), 1, input_scope_idx, "input",
gpu_list_[input_scope_idx]);
vars_.emplace_back(in_var_handle);
op_handle_->AddInput(in_var_handle);
// add dummy var
vars_.emplace_back(new DummyVarHandle());
std::unique_ptr<ir::Node> v2(
new ir::Node("node2", ir::Node::Type::kVariable));
vars_.emplace_back(new DummyVarHandle(v2.get()));
DummyVarHandle* dummy_var_handle =
static_cast<DummyVarHandle*>(vars_.back().get());
dummy_var_handle->generated_op_ = nullptr;
dummy_var_handle->ClearGeneratedOp();
op_handle_->AddInput(dummy_var_handle);
for (size_t j = 0; j < gpu_list_.size(); ++j) {
if (!use_gpu_) {
op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
}
VarHandle* out_var_handle = new VarHandle(2, j, "out", gpu_list_[j]);
std::unique_ptr<ir::Node> v3(
new ir::Node("node3", ir::Node::Type::kVariable));
VarHandle* out_var_handle =
new VarHandle(v3.get(), 2, j, "out", gpu_list_[j]);
vars_.emplace_back(out_var_handle);
op_handle_->AddOutput(out_var_handle);
}
// add dummy var
vars_.emplace_back(new DummyVarHandle());
std::unique_ptr<ir::Node> v4(
new ir::Node("node4", ir::Node::Type::kVariable));
vars_.emplace_back(new DummyVarHandle(v4.get()));
DummyVarHandle* out_dummy_var_handle =
static_cast<DummyVarHandle*>(vars_.back().get());
out_dummy_var_handle->generated_op_ = nullptr;
out_dummy_var_handle->ClearGeneratedOp();
op_handle_->AddOutput(out_dummy_var_handle);
}
......
......@@ -19,9 +19,10 @@
namespace paddle {
namespace framework {
namespace details {
ComputationOpHandle::ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope,
platform::Place place)
: op_(framework::OpRegistry::CreateOp(op_desc)),
: OpHandleBase(node),
op_(framework::OpRegistry::CreateOp(*node->Op())),
scope_(scope),
place_(place) {}
......@@ -35,8 +36,8 @@ void ComputationOpHandle::RunImpl() {
bool ComputationOpHandle::NeedWait(VarHandleBase *in_var) {
bool need_wait =
in_var && in_var->generated_op_ &&
in_var->generated_op_->DeviceContext(place_) != dev_ctxes_[place_];
in_var && in_var->GeneratedOp() &&
in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_[place_];
return need_wait;
}
......
......@@ -28,8 +28,7 @@ namespace framework {
namespace details {
struct ComputationOpHandle : public OpHandleBase {
public:
ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
platform::Place place);
ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place);
std::string Name() const override;
......
......@@ -22,10 +22,10 @@ namespace details {
#ifdef PADDLE_WITH_CUDA
DataBalanceOpHandle::DataBalanceOpHandle(
const std::vector<Scope *> &local_scopes,
ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::NCCLContextMap *ctxs)
: local_scopes_(local_scopes), places_(places) {
: OpHandleBase(node), local_scopes_(local_scopes), places_(places) {
if (ctxs) {
for (auto &p : places_) {
this->dev_ctxes_[p] = ctxs->DevCtx(p);
......@@ -34,9 +34,9 @@ DataBalanceOpHandle::DataBalanceOpHandle(
}
#else
DataBalanceOpHandle::DataBalanceOpHandle(
const std::vector<Scope *> &local_scopes,
ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places)
: local_scopes_(local_scopes), places_(places) {}
: OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
#endif
std::string DataBalanceOpHandle::Name() const { return "data balance"; }
......
......@@ -30,11 +30,11 @@ namespace details {
struct DataBalanceOpHandle : public OpHandleBase {
public:
#ifdef PADDLE_WITH_CUDA
DataBalanceOpHandle(const std::vector<Scope *> &local_scopes,
DataBalanceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::NCCLContextMap *ctxs);
#else
DataBalanceOpHandle(const std::vector<Scope *> &local_scopes,
DataBalanceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places);
#endif
......
......@@ -21,13 +21,16 @@ namespace paddle {
namespace framework {
namespace details {
FetchOpHandle::FetchOpHandle(FeedFetchList *data, size_t offset,
FetchOpHandle::FetchOpHandle(ir::Node *node, FeedFetchList *data, size_t offset,
std::vector<Scope *> *local_scopes)
: data_(data), offset_(offset), local_scopes_(local_scopes) {}
: OpHandleBase(node),
data_(data),
offset_(offset),
local_scopes_(local_scopes) {}
FetchOpHandle::~FetchOpHandle() {
for (auto *input_var : inputs_) {
input_var->pending_ops_.erase(this);
input_var->RemoveOutput(this, this->Node());
}
}
......@@ -77,8 +80,8 @@ void FetchOpHandle::RunImpl() {
void FetchOpHandle::WaitInputVarGenerated(const platform::Place &place) {
auto cpu_ctx = platform::DeviceContextPool::Instance().Get(place);
for (auto *input : inputs_) {
if (input->generated_op_) {
input->generated_op_->RecordWaitEventOnCtx(cpu_ctx);
if (input->GeneratedOp()) {
input->GeneratedOp()->RecordWaitEventOnCtx(cpu_ctx);
}
}
}
......
......@@ -28,7 +28,7 @@ namespace details {
struct FetchOpHandle : public OpHandleBase {
public:
FetchOpHandle(FeedFetchList *data, size_t offset,
FetchOpHandle(ir::Node *node, FeedFetchList *data, size_t offset,
std::vector<Scope *> *local_scopes);
~FetchOpHandle();
......
......@@ -30,10 +30,12 @@ namespace details {
struct FuseVarsOpHandle : public OpHandleBase {
public:
FuseVarsOpHandle(Scope *local_scope, const platform::Place &place,
FuseVarsOpHandle(ir::Node *node, Scope *local_scope,
const platform::Place &place,
const std::unordered_map<std::string, int64_t> &inputs_numel,
const std::type_index &var_type)
: local_scope_(local_scope),
: OpHandleBase(node),
local_scope_(local_scope),
place_(place),
inputs_numel_(inputs_numel),
type_(var_type) {
......
......@@ -20,9 +20,10 @@ namespace paddle {
namespace framework {
namespace details {
GatherOpHandle::GatherOpHandle(const std::vector<Scope *> &local_scopes,
GatherOpHandle::GatherOpHandle(ir::Node *node,
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places)
: local_scopes_(local_scopes), places_(places) {}
: OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
void GatherOpHandle::RunImpl() {
if (places_.size() == 1) return;
......
......@@ -30,7 +30,7 @@ namespace details {
struct GatherOpHandle : public OpHandleBase {
public:
GatherOpHandle(const std::vector<Scope *> &local_scopes,
GatherOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places);
std::string Name() const override;
......
......@@ -70,6 +70,7 @@ struct TestGatherOpHandle {
}
void InitGatherOp(size_t input_scope_idx) {
std::vector<std::unique_ptr<ir::Node>> nodes;
for (size_t j = 0; j < gpu_list_.size(); ++j) {
local_scopes_.push_back(&(g_scope_.NewScope()));
Scope& local_scope = local_scopes_.back()->NewScope();
......@@ -81,30 +82,37 @@ struct TestGatherOpHandle {
}
param_scopes_[input_scope_idx]->Var("out");
op_handle_.reset(new GatherOpHandle(local_scopes_, gpu_list_));
nodes.emplace_back(new ir::Node("node", ir::Node::Type::kOperation));
op_handle_.reset(
new GatherOpHandle(nodes.back().get(), local_scopes_, gpu_list_));
// add input
for (size_t j = 0; j < gpu_list_.size(); ++j) {
op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
auto* in_var_handle = new VarHandle(1, j, "input", gpu_list_[j]);
nodes.emplace_back(new ir::Node("node1", ir::Node::Type::kVariable));
auto* in_var_handle =
new VarHandle(nodes.back().get(), 1, j, "input", gpu_list_[j]);
vars_.emplace_back(in_var_handle);
op_handle_->AddInput(in_var_handle);
}
// add dummy var
vars_.emplace_back(new DummyVarHandle());
nodes.emplace_back(new ir::Node("node2", ir::Node::Type::kVariable));
vars_.emplace_back(new DummyVarHandle(nodes.back().get()));
DummyVarHandle* in_dummy_var_handle =
static_cast<DummyVarHandle*>(vars_.back().get());
in_dummy_var_handle->generated_op_ = nullptr;
in_dummy_var_handle->ClearGeneratedOp();
op_handle_->AddInput(in_dummy_var_handle);
// add output
auto* out_var_handle =
new VarHandle(2, input_scope_idx, "out", gpu_list_[input_scope_idx]);
nodes.emplace_back(new ir::Node("node3", ir::Node::Type::kVariable));
auto* out_var_handle = new VarHandle(nodes.back().get(), 2, input_scope_idx,
"out", gpu_list_[input_scope_idx]);
vars_.emplace_back(out_var_handle);
op_handle_->AddOutput(out_var_handle);
// add dummy var
vars_.emplace_back(new DummyVarHandle());
nodes.emplace_back(new ir::Node("node4", ir::Node::Type::kVariable));
vars_.emplace_back(new DummyVarHandle(nodes.back().get()));
DummyVarHandle* dummy_var_handle =
static_cast<DummyVarHandle*>(vars_.back().get());
op_handle_->AddOutput(dummy_var_handle);
......
......@@ -25,6 +25,7 @@
#include "paddle/fluid/framework/details/reduce_op_handle.h"
#include "paddle/fluid/framework/details/rpc_op_handle.h"
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/scope.h"
......@@ -66,31 +67,38 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
}
}
void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result,
const OpDesc &op,
void MultiDevSSAGraphBuilder::CreateOpHandleIOs(Graph *result, ir::Node *node,
size_t place_id) const {
auto p = places_[place_id];
auto *op_handle = result->ops_.back().get();
auto *op_handle = result->Get<GraphOps>("ops").back().get();
op_handle->SetDeviceContext(p,
platform::DeviceContextPool::Instance().Get(p));
for (auto &each_var_name : op.InputArgumentNames()) {
VarHandle *var =
CreateOrGetLatestVarHandle(result, each_var_name, p, place_id);
for (ir::Node *input : node->inputs) {
VarHandle *var = CreateOrGetLatestVarHandle(result, input, p, place_id);
op_handle->AddInput(var);
}
for (auto &each_var_name : op.OutputArgumentNames()) {
CreateOpOutput(result, op_handle, each_var_name, p, place_id);
for (ir::Node *output : node->outputs) {
ir::Node *new_node = nullptr;
if (output->Var()) {
new_node = result->CreateVarNode(output->Var());
} else {
new_node =
result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable);
}
CreateOpOutput(result, op_handle, new_node, p, place_id);
}
}
std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainSendVars(
const ProgramDesc &program) const {
const std::vector<std::unique_ptr<ir::Node>> &nodes) const {
std::vector<std::string> send_vars;
// since parameters are all in block 0,
// it's enough to only scan send ops in block 0
for (auto *op : program.Block(0).AllOps()) {
for (auto &node : nodes) {
if (node->NodeType() != ir::Node::Type::kOperation) continue;
OpDesc *op = node->Op();
// TODO(Yancey1989): use a graceful method to find send op,
// instead of the hard-coded string
if (op->Type() == "send") {
......@@ -104,9 +112,11 @@ std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainSendVars(
}
std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainRecvVars(
const ProgramDesc &program) const {
const std::vector<std::unique_ptr<ir::Node>> &nodes) const {
std::vector<std::string> recv_vars;
for (auto *op : program.Block(0).AllOps()) {
for (auto &node : nodes) {
if (node->NodeType() != ir::Node::Type::kOperation) continue;
OpDesc *op = node->Op();
// TODO(Yancey1989): use a graceful method to find recv op,
// instead of the hard-coded string
if (op->Type() == "recv") {
......@@ -120,7 +130,7 @@ std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainRecvVars(
}
bool MultiDevSSAGraphBuilder::IsDistTrainOp(
const OpDesc &op, const std::vector<std::string> &send_vars,
ir::Node *node, const std::vector<std::string> &send_vars,
const std::vector<std::string> &recv_vars) const {
if (send_vars.size() == 0 || recv_vars.size() == 0) {
return false;
......@@ -143,8 +153,17 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp(
return false;
};
return checker(op.OutputArgumentNames(), send_vars) ||
checker(op.InputArgumentNames(), recv_vars);
std::vector<std::string> input_var_names;
std::vector<std::string> output_var_names;
for (ir::Node *input : node->inputs) {
input_var_names.push_back(input->Name());
}
for (ir::Node *output : node->outputs) {
output_var_names.push_back(output->Name());
}
return checker(output_var_names, send_vars) ||
checker(input_var_names, recv_vars);
}
size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID(
......@@ -167,25 +186,30 @@ size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID(
return dev_id;
}
std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
const ProgramDesc &program) const {
for (auto *var : program.Block(0).AllVars()) {
all_vars_.emplace(var->Name(), var);
std::unique_ptr<Graph> MultiDevSSAGraphBuilder::Apply(
std::unique_ptr<Graph> graph) const {
// Rebuild the graph structure.
auto nodes = std::move(graph->nodes);
graph->nodes.clear();
for (auto &node : nodes) {
if (node->NodeType() == ir::Node::Type::kVariable) {
all_vars_.emplace(node->Name(), node->Var());
}
}
auto graph = new SSAGraph();
SSAGraph &result = *graph;
Graph &result = *graph;
std::unordered_set<std::string> og_has_been_broadcast;
// We cannot invoke resize. It is a bug of GCC 4.8
result.vars_ = std::vector<
std::unordered_map<std::string, std::vector<std::unique_ptr<VarHandle>>>>(
places_.size());
result.Set("vars", new GraphVars(places_.size()));
result.Set("dep_vars", new GraphDepVars);
result.Set("ops", new GraphOps);
// Find send/recv vars so that we can place the distributed training
// related ops on place 0.
auto send_vars = FindDistTrainSendVars(program);
auto recv_vars = FindDistTrainRecvVars(program);
auto send_vars = FindDistTrainSendVars(nodes);
auto recv_vars = FindDistTrainRecvVars(nodes);
std::vector<std::unordered_set<std::string>> bcast_var_name_set;
bcast_var_name_set.resize(places_.size());
......@@ -193,14 +217,19 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
size_t cur_device_id = 0;
bool is_forwarding = true;
for (auto *op : program.Block(0).AllOps()) {
// NOTE: Currently, passes before SSAGraphBuilder cannot reorder
// forward and backward nodes. E.g. you can't append a forward node
// at the end of the node list.
// TODO(panyx0718): FIXME: Needs to sort by forward->backward order.
for (auto &node : nodes) {
if (node->NodeType() != ir::Node::Type::kOperation) continue;
if (boost::get<int>(
op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
static_cast<int>(OpRole::kRPC)) {
CreateRPCOp(&result, *op);
} else if (IsDistTrainOp(*op, send_vars, recv_vars)) {
CreateDistTrainOp(&result, *op);
} else if (IsScaleLossOp(*op)) {
CreateRPCOp(&result, node.get());
} else if (IsDistTrainOp(node.get(), send_vars, recv_vars)) {
CreateDistTrainOp(&result, node.get());
} else if (IsScaleLossOp(node.get())) {
// user can customize loss@grad if not use_default_grad_scale_
if (strategy_.gradient_scale_ !=
BuildStrategy::GradientScaleStrategy::kCustomized) {
......@@ -212,33 +241,35 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
// the block.
is_forwarding = false;
} else {
int op_dev_id = GetOpDeviceID(*op);
int op_dev_id = GetOpDeviceID(node.get());
if (op_dev_id != -1) { // This op only runs on one specific device.
CreateComputationalOp(&result, *op, op_dev_id);
for (auto &var_name : op->OutputArgumentNames()) {
var_name_on_devices_.emplace(var_name, op_dev_id);
CreateComputationalOp(&result, node.get(), op_dev_id);
for (ir::Node *n : node->outputs) {
var_name_on_devices_.emplace(n->Name(), op_dev_id);
}
} else {
// This op runs on all devices, and its outputs may include parameters'
// gradients.
if (op->Type() == "read" && strategy_.enable_data_balance_) {
op->SetAttr("throw_eof_exp", false);
CreateComputationalOps(&result, *op, places_.size());
const auto &data_var_names = op->Output("Out");
if (node->Op()->Type() == "read" && strategy_.enable_data_balance_) {
node->Op()->SetAttr("throw_eof_exp", false);
CreateComputationalOps(&result, node.get(), places_.size());
// TODO(paddle-dev): builder shouldn't depend on the out logic of
// a specific op.
const auto &data_var_names = node->Op()->Output("Out");
InsertDataBalanceOp(&result, data_var_names);
} else {
CreateComputationalOps(&result, *op, places_.size());
CreateComputationalOps(&result, node.get(), places_.size());
}
if (!is_forwarding && places_.size() > 1) {
// Currently, we assume that once a gradient is generated, it can be
// broadcast, and each gradient is only broadcast once.
if (static_cast<bool>(boost::get<int>(op->GetAttr(
if (static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
OpProtoAndCheckerMaker::OpRoleAttrName())) &
static_cast<int>(OpRole::kBackward))) {
try {
auto backward_vars =
boost::get<std::vector<std::string>>(op->GetNullableAttr(
auto backward_vars = boost::get<std::vector<std::string>>(
node->Op()->GetNullableAttr(
OpProtoAndCheckerMaker::OpRoleVarAttrName()));
PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0);
......@@ -302,8 +333,7 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
* Only variables should be the leaves of graph.
*/
AddOutputToLeafOps(&result);
return std::unique_ptr<SSAGraph>(graph);
return graph;
}
bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const {
......@@ -327,78 +357,96 @@ void MultiDevSSAGraphBuilder::SetCommunicationContext(
#endif
}
void MultiDevSSAGraphBuilder::CreateBroadcastOp(SSAGraph *result,
void MultiDevSSAGraphBuilder::CreateBroadcastOp(Graph *result,
const std::string &p_name,
size_t src_dev_id) const {
#ifdef PADDLE_WITH_CUDA
auto *op_handle = new BroadcastOpHandle(local_scopes_, places_, nccl_ctxs_);
auto *op_handle = new BroadcastOpHandle(
result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation),
local_scopes_, places_, nccl_ctxs_);
#else
auto *op_handle = new BroadcastOpHandle(local_scopes_, places_);
auto *op_handle = new BroadcastOpHandle(
result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation),
local_scopes_, places_);
#endif
result->Get<GraphOps>("ops").emplace_back(op_handle);
result->ops_.emplace_back(op_handle);
auto *in = result->vars_.at(src_dev_id).at(p_name).back().get();
auto *in =
result->Get<GraphVars>("vars").at(src_dev_id).at(p_name).back().get();
op_handle->AddInput(in);
for (size_t i = 0; i < places_.size(); ++i) {
auto &p = places_[i];
SetCommunicationContext(op_handle, p);
auto &vars = result->vars_.at(i).at(p_name);
auto *out_var = new VarHandle(vars.size(), i, p_name, p);
auto &vars = result->Get<GraphVars>("vars").at(i).at(p_name);
auto *out_var = new VarHandle(
result->CreateEmptyNode(p_name, ir::Node::Type::kVariable), vars.size(),
i, p_name, p);
vars.emplace_back(out_var);
op_handle->AddOutput(out_var);
}
}
void MultiDevSSAGraphBuilder::CreateComputationalOp(SSAGraph *result,
const OpDesc &op,
void MultiDevSSAGraphBuilder::CreateComputationalOp(Graph *result,
ir::Node *node,
int dev_id) const {
result->ops_.emplace_back(
new ComputationOpHandle(op, local_scopes_[dev_id], places_[dev_id]));
CreateOpHandleIOs(result, op, dev_id);
result->Get<GraphOps>("ops").emplace_back(
new ComputationOpHandle(result->CreateOpNode(node->Op()),
local_scopes_[dev_id], places_[dev_id]));
CreateOpHandleIOs(result, node, dev_id);
}
void MultiDevSSAGraphBuilder::InsertAllReduceOp(SSAGraph *result,
void MultiDevSSAGraphBuilder::InsertAllReduceOp(Graph *result,
const std::string &og) const {
#ifdef PADDLE_WITH_CUDA
result->ops_.emplace_back(
new AllReduceOpHandle(local_scopes_, places_, nccl_ctxs_));
result->Get<GraphOps>("ops").emplace_back(new AllReduceOpHandle(
result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
local_scopes_, places_, nccl_ctxs_));
#else
result->ops_.emplace_back(new AllReduceOpHandle(local_scopes_, places_));
result->Get<GraphOps>("ops").emplace_back(new AllReduceOpHandle(
result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
local_scopes_, places_));
#endif
auto *op_handle = result->ops_.back().get();
auto *op_handle = result->Get<GraphOps>("ops").back().get();
for (size_t i = 0; i < places_.size(); ++i) {
auto &p = places_[i];
SetCommunicationContext(op_handle, p);
auto &vars = result->vars_[i][og];
auto &vars = result->Get<GraphVars>("vars")[i][og];
PADDLE_ENFORCE(!vars.empty());
auto &prev_grad = vars.back();
op_handle->AddInput(prev_grad.get());
auto var = new VarHandle(vars.size(), i, og, p);
auto var =
new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable),
vars.size(), i, og, p);
vars.emplace_back(var);
op_handle->AddOutput(var);
}
}
void MultiDevSSAGraphBuilder::InsertDataBalanceOp(
SSAGraph *result, const std::vector<std::string> &datas) const {
Graph *result, const std::vector<std::string> &datas) const {
#ifdef PADDLE_WITH_CUDA
result->ops_.emplace_back(
new DataBalanceOpHandle(local_scopes_, places_, nccl_ctxs_));
result->Get<GraphOps>("ops").emplace_back(new DataBalanceOpHandle(
result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation),
local_scopes_, places_, nccl_ctxs_));
#else
result->ops_.emplace_back(new DataBalanceOpHandle(local_scopes_, places_));
result->Get<GraphOps>("ops").emplace_back(new DataBalanceOpHandle(
result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation),
local_scopes_, places_));
#endif
auto *op_handle = result->ops_.back().get();
auto *op_handle = result->Get<GraphOps>("ops").back().get();
for (size_t i = 0; i < places_.size(); ++i) {
auto &p = places_[i];
SetCommunicationContext(op_handle, p);
for (const std::string &d_name : datas) {
auto &vars = result->vars_[i][d_name];
auto &vars = result->Get<GraphVars>("vars")[i][d_name];
PADDLE_ENFORCE(!vars.empty());
op_handle->AddInput(vars.back().get());
auto var = new VarHandle(vars.size(), i, d_name, p);
auto var = new VarHandle(
result->CreateEmptyNode(d_name, ir::Node::Type::kVariable),
vars.size(), i, d_name, p);
vars.emplace_back(var);
op_handle->AddOutput(var);
}
......@@ -417,22 +465,22 @@ bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
return is_pg_once;
}
int MultiDevSSAGraphBuilder::GetOpDeviceID(const OpDesc &op) const {
int MultiDevSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const {
if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) {
return -1;
}
int op_role = boost::get<int>(
op.GetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName()));
node->Op()->GetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName()));
if (op_role != static_cast<int>(framework::OpRole::kOptimize)) {
return -1;
}
auto param_grad = boost::get<std::vector<std::string>>(
op.GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
PADDLE_ENFORCE_EQ(param_grad.size(), 2U);
int dev_id = GetVarDeviceID(param_grad[1]);
PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s]", op.Type(),
param_grad[0]);
PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s]",
node->Op()->Type(), param_grad[0]);
return dev_id;
}
......@@ -441,7 +489,7 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID(const std::string &varname) const {
return got == var_name_on_devices_.end() ? -1 : got->second;
}
void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const {
void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(Graph *result) const {
for (size_t i = 0; i < places_.size(); ++i) {
// Insert ScaleCost OpHandle
#ifdef PADDLE_WITH_CUDA
......@@ -452,11 +500,11 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const {
auto *communication_dev_ctx =
platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
#endif
auto *op_handle =
new ScaleLossGradOpHandle(local_scopes_.size(), local_scopes_[i],
places_[i], communication_dev_ctx);
result->ops_.emplace_back(op_handle);
auto *op_handle = new ScaleLossGradOpHandle(
result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation),
local_scopes_.size(), local_scopes_[i], places_[i],
communication_dev_ctx);
result->Get<GraphOps>("ops").emplace_back(op_handle);
// FIXME: Currently ScaleLossGradOp only uses device_count as the scale
// factor, so it does not depend on any other operators.
......@@ -464,43 +512,51 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const {
// loss->pending_ops_.emplace_back(op_handle);
// op_handle->inputs_.emplace_back(loss);
CreateOpOutput(result, op_handle, GradVarName(loss_var_name_), places_[i],
i);
CreateOpOutput(result, op_handle,
result->CreateEmptyNode(GradVarName(loss_var_name_),
ir::Node::Type::kVariable),
places_[i], i);
}
}
void MultiDevSSAGraphBuilder::CreateComputationalOps(SSAGraph *result,
const OpDesc &op,
void MultiDevSSAGraphBuilder::CreateComputationalOps(Graph *result,
ir::Node *node,
size_t num_places) const {
for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) {
auto p = places_[scope_idx];
auto s = local_scopes_[scope_idx];
result->ops_.emplace_back(new ComputationOpHandle(op, s, p));
CreateOpHandleIOs(result, op, scope_idx);
result->Get<GraphOps>("ops").emplace_back(
new ComputationOpHandle(result->CreateOpNode(node->Op()), s, p));
CreateOpHandleIOs(result, node, scope_idx);
}
}
VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result,
VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(Graph *result,
const std::string &og,
int dst_dev_id) const {
#ifdef PADDLE_WITH_CUDA
result->ops_.emplace_back(
new ReduceOpHandle(local_scopes_, places_, nccl_ctxs_));
result->Get<GraphOps>("ops").emplace_back(new ReduceOpHandle(
result->CreateEmptyNode("reduce", ir::Node::Type::kOperation),
local_scopes_, places_, nccl_ctxs_));
#else
result->ops_.emplace_back(new ReduceOpHandle(local_scopes_, places_));
result->Get<GraphOps>("ops").emplace_back(new ReduceOpHandle(
result->CreateEmptyNode("reduce", ir::Node::Type::kOperation),
local_scopes_, places_));
#endif
auto *op_handle = result->ops_.back().get();
auto *op_handle = result->Get<GraphOps>("ops").back().get();
for (size_t i = 0; i < places_.size(); ++i) {
auto &p = places_[i];
SetCommunicationContext(op_handle, p);
auto &vars = result->vars_[i][og];
auto &vars = result->Get<GraphVars>("vars")[i][og];
PADDLE_ENFORCE(!vars.empty());
auto &prev_grad = vars.back();
op_handle->AddInput(prev_grad.get());
}
auto &vars = result->vars_[dst_dev_id][og];
auto var = new VarHandle(vars.size(), dst_dev_id, og, places_[dst_dev_id]);
auto &vars = result->Get<GraphVars>("vars")[dst_dev_id][og];
auto var =
new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable),
vars.size(), dst_dev_id, og, places_[dst_dev_id]);
vars.emplace_back(var);
op_handle->AddOutput(var);
return var;
......@@ -508,35 +564,46 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result,
// Find the first occurrence of `prev_op_name` and make the current `op`
// depend on it.
void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op,
void MultiDevSSAGraphBuilder::ConnectOp(Graph *result, OpHandleBase *op,
const std::string &prev_op_name) const {
for (auto &prev_op : result->ops_) {
for (auto &prev_op : result->Get<GraphOps>("ops")) {
if (prev_op->Name() == prev_op_name) {
auto *dep_var = new DummyVarHandle();
auto *dep_var = new DummyVarHandle(
result->CreateEmptyNode("dummy", ir::Node::Type::kVariable));
prev_op->AddOutput(dep_var);
result->dep_vars_.emplace(dep_var);
result->Get<GraphDepVars>("dep_vars").emplace(dep_var);
op->AddInput(dep_var);
}
}
}
void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result,
const OpDesc &op) const {
void MultiDevSSAGraphBuilder::CreateDistTrainOp(Graph *result,
ir::Node *node) const {
int op_dev_id = -1;
if (op.Type() == "split_byref" || op.Type() == "split_selected_rows") {
op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
std::vector<std::string> input_var_names;
std::vector<std::string> output_var_names;
for (ir::Node *input : node->inputs) {
input_var_names.push_back(input->Name());
}
for (ir::Node *output : node->outputs) {
output_var_names.push_back(output->Name());
}
if (node->Op()->Type() == "split_byref" ||
node->Op()->Type() == "split_selected_rows") {
op_dev_id = GetVarDeviceID(input_var_names[0]);
if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames());
for (auto &varname : op.InputArgumentNames()) {
op_dev_id = GetAppropriateDeviceID(input_var_names);
for (auto &varname : input_var_names) {
var_name_on_devices_.emplace(varname, op_dev_id);
}
}
for (auto &varname : op.OutputArgumentNames()) {
for (auto &varname : output_var_names) {
var_name_on_devices_.emplace(varname, op_dev_id);
}
} else if (op.Type() == "concat") {
op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
for (auto &varname : op.OutputArgumentNames()) {
} else if (node->Op()->Type() == "concat") {
op_dev_id = GetVarDeviceID(input_var_names[0]);
for (auto &varname : output_var_names) {
var_name_on_devices_.emplace(varname, op_dev_id);
}
} else {
......@@ -546,34 +613,43 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result,
}
PADDLE_ENFORCE(op_dev_id != -1,
"can not find right place for distributed op: %s", op.Type());
"can not find right place for distributed op: %s",
node->Op()->Type());
CreateComputationalOp(result, op, op_dev_id);
if (op.Type() == "concat") {
ConnectOp(result, result->ops_.back().get(), "fetch_barrier");
CreateComputationalOp(result, node, op_dev_id);
if (node->Op()->Type() == "concat") {
ConnectOp(result, result->Get<GraphOps>("ops").back().get(),
"fetch_barrier");
}
}
// Create RPC related op handles that connects its in ops and out ops.
void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result,
const OpDesc &op) const {
void MultiDevSSAGraphBuilder::CreateRPCOp(Graph *result, ir::Node *node) const {
int op_dev_id = -1;
if (op.Type() == "send") {
op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]);
if (node->Op()->Type() == "send") {
op_dev_id = GetVarDeviceID(node->inputs[0]->Name());
// A variable name that contains ".block" means the variable was split by
// the split_byref op,
// so that we can balance the variable blocks across all the pserver
// instances.
if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce &&
op.InputArgumentNames()[0].find(".block") == std::string::npos) {
op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames());
for (auto &varname : op.InputArgumentNames()) {
node->inputs[0]->Name().find(".block") == std::string::npos) {
std::vector<std::string> input_var_names;
for (ir::Node *n : node->inputs) {
input_var_names.push_back(n->Name());
}
op_dev_id = GetAppropriateDeviceID(input_var_names);
for (auto &varname : input_var_names) {
var_name_on_devices_.emplace(varname, op_dev_id);
}
}
} else if (op.Type() == "recv") {
op_dev_id = GetAppropriateDeviceID(op.OutputArgumentNames());
for (auto &varname : op.OutputArgumentNames()) {
} else if (node->Op()->Type() == "recv") {
std::vector<std::string> output_var_names;
for (ir::Node *n : node->outputs) {
output_var_names.push_back(n->Name());
}
op_dev_id = GetAppropriateDeviceID(output_var_names);
for (auto &varname : output_var_names) {
var_name_on_devices_.emplace(varname, op_dev_id);
}
} else {
......@@ -582,18 +658,20 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result,
}
PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s",
op.Type());
result->ops_.emplace_back(new RPCOpHandle(op, local_scopes_[op_dev_id],
op.Type(), places_[op_dev_id]));
if (op.Type() == "send_barrier") {
ConnectOp(result, result->ops_.back().get(), "send");
} else if (op.Type() == "recv") {
ConnectOp(result, result->ops_.back().get(), "send_barrier");
} else if (op.Type() == "fetch_barrier") {
ConnectOp(result, result->ops_.back().get(), "recv");
} else if (op.Type() == "send") {
node->Op()->Type());
result->Get<GraphOps>("ops").emplace_back(new RPCOpHandle(
result->CreateOpNode(node->Op()), *node->Op(), local_scopes_[op_dev_id],
node->Op()->Type(), places_[op_dev_id]));
if (node->Op()->Type() == "send_barrier") {
ConnectOp(result, result->Get<GraphOps>("ops").back().get(), "send");
} else if (node->Op()->Type() == "recv") {
ConnectOp(result, result->Get<GraphOps>("ops").back().get(),
"send_barrier");
} else if (node->Op()->Type() == "fetch_barrier") {
ConnectOp(result, result->Get<GraphOps>("ops").back().get(), "recv");
} else if (node->Op()->Type() == "send") {
// do nothing
} else {
PADDLE_THROW(
......@@ -601,12 +679,12 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result,
"send, send_barrier. recv, fetch_barrier]");
}
CreateOpHandleIOs(result, op, op_dev_id);
CreateOpHandleIOs(result, node, op_dev_id);
}
bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const {
bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const {
return boost::get<int>(
op.GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) ==
(static_cast<int>(OpRole::kBackward) |
static_cast<int>(OpRole::kLoss)) &&
!loss_var_name_.empty(); // If loss_var is empty. This is test mode
......
......@@ -19,6 +19,7 @@
#include "paddle/fluid/framework/details/build_strategy.h"
#include "paddle/fluid/framework/details/ssa_graph_builder.h"
#include "paddle/fluid/framework/ir/graph.h"
namespace paddle {
namespace platform {
......@@ -45,13 +46,11 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
const std::vector<Scope *> &local_scopes,
const BuildStrategy &strategy);
#endif
std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const override;
int GetVarDeviceID(const std::string &varname) const override;
private:
void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op,
size_t device_id) const;
void CreateOpHandleIOs(Graph *result, ir::Node *node, size_t device_id) const;
private:
std::string loss_var_name_;
......@@ -63,48 +62,46 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
platform::NCCLContextMap *nccl_ctxs_;
#endif
bool IsScaleLossOp(const OpDesc &op) const;
bool IsScaleLossOp(ir::Node *node) const;
void CreateRPCOp(SSAGraph *result, const OpDesc &op) const;
void CreateDistTrainOp(SSAGraph *result, const OpDesc &op) const;
void CreateRPCOp(Graph *result, ir::Node *node) const;
void CreateDistTrainOp(Graph *result, ir::Node *node) const;
/**
* Whether this operator is the end-point operator before/after a send operator.
*/
bool IsDistTrainOp(const OpDesc &op,
const std::vector<std::string> &send_vars,
bool IsDistTrainOp(ir::Node *node, const std::vector<std::string> &send_vars,
const std::vector<std::string> &recv_vars) const;
std::vector<std::string> FindDistTrainSendVars(
const ProgramDesc &program) const;
const std::vector<std::unique_ptr<ir::Node>> &nodes) const;
std::vector<std::string> FindDistTrainRecvVars(
const ProgramDesc &program) const;
const std::vector<std::unique_ptr<ir::Node>> &nodes) const;
void ConnectOp(SSAGraph *result, OpHandleBase *op,
void ConnectOp(Graph *result, OpHandleBase *op,
const std::string &prev_op_name) const;
void CreateComputationalOps(SSAGraph *result, const OpDesc &op,
void CreateComputationalOps(Graph *result, ir::Node *node,
size_t num_places) const;
void CreateScaleLossGradOp(SSAGraph *result) const;
VarHandle *CreateReduceOp(SSAGraph *result, const std::string &og,
void CreateScaleLossGradOp(Graph *result) const;
VarHandle *CreateReduceOp(Graph *result, const std::string &og,
int dst_dev_id) const;
void CreateComputationalOp(SSAGraph *result, const OpDesc &op,
int dev_id) const;
void CreateComputationalOp(Graph *result, ir::Node *node, int dev_id) const;
bool IsParameterGradientOnce(
const std::string &og,
std::unordered_set<std::string> *og_has_been_broadcast) const;
int GetOpDeviceID(const OpDesc &op) const;
int GetOpDeviceID(ir::Node *node) const;
void InsertAllReduceOp(SSAGraph *result, const std::string &og) const;
void InsertAllReduceOp(Graph *result, const std::string &og) const;
void InsertDataBalanceOp(SSAGraph *result,
void InsertDataBalanceOp(Graph *result,
const std::vector<std::string> &datas) const;
void CreateBroadcastOp(SSAGraph *result, const std::string &p_name,
void CreateBroadcastOp(Graph *result, const std::string &p_name,
size_t src_dev_id) const;
bool IsSparseGradient(const std::string &og) const;
......
......@@ -80,19 +80,21 @@ void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) {
// Register `in` as an input of this op handle.
//
// Besides appending to inputs_, the edge is recorded on the variable side
// (this op becomes one of the variable's consumers) and on the underlying
// ir::Node (in->Node() is appended to node_->inputs).
void OpHandleBase::AddInput(VarHandleBase *in) {
this->inputs_.emplace_back(in);
in->pending_ops_.insert(this);
node_->inputs.push_back(in->Node());
// Presumably the variable-side counterpart of the line above; the
// definition of VarHandleBase::AddOutput is not visible here -- confirm.
in->AddOutput(this, this->Node());
}
// Register `out` as an output of this op handle.
//
// Besides appending to outputs_, this op is recorded as the producer of the
// variable, and out->Node() is appended to node_->outputs on the underlying
// ir::Node.
void OpHandleBase::AddOutput(VarHandleBase *out) {
outputs_.emplace_back(out);
out->generated_op_ = this;
node_->outputs.push_back(out->Node());
// Presumably the variable-side counterpart of the line above; the
// definition of VarHandleBase::AddInput is not visible here -- confirm.
out->AddInput(this, this->Node());
}
void OpHandleBase::WaitInputVarGenerated() {
for (auto in_var : inputs_) {
if (NeedWait(in_var)) {
for (auto &pair : dev_ctxes_) {
in_var->generated_op_->RecordWaitEventOnCtx(pair.second);
in_var->GeneratedOp()->RecordWaitEventOnCtx(pair.second);
}
}
}
......@@ -101,7 +103,7 @@ void OpHandleBase::WaitInputVarGenerated() {
// For every input that has a generating op (see NeedWait), ask that op to
// record a wait event on the device context registered for `place`.
//
// NOTE(review): dev_ctxes_ is a std::map, so dev_ctxes_[place] inserts a
// null context when `place` has no entry -- confirm callers only pass places
// that were registered beforehand.
void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) {
for (auto *in : inputs_) {
if (NeedWait(in)) {
in->generated_op_->RecordWaitEventOnCtx(dev_ctxes_[place]);
in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_[place]);
}
}
}
......@@ -117,7 +119,7 @@ size_t OpHandleBase::NoDummyInputSize() const {
}
// Returns true when `in_var` is non-null and has a generating op, i.e. there
// is a producer whose completion can be waited on.
bool OpHandleBase::NeedWait(VarHandleBase *in_var) {
return in_var && in_var->generated_op_;
return in_var && in_var->GeneratedOp();
}
void OpHandleBase::RunAndRecordEvent(const std::function<void()> &callback) {
......
......@@ -17,6 +17,7 @@
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/var_handle.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/macros.h"
......@@ -26,9 +27,11 @@ namespace details {
constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@";
// Wraps ir::Node and provides helper utilities.
// It's responsible for populating the necessary fields of ir::Node.
class OpHandleBase {
public:
OpHandleBase() {}
explicit OpHandleBase(ir::Node *node) : node_(node) {}
virtual ~OpHandleBase();
......@@ -82,6 +85,8 @@ class OpHandleBase {
size_t NoDummyInputSize() const;
ir::Node *Node() { return node_; }
protected:
void RunAndRecordEvent(const std::function<void()> &callback);
......@@ -90,6 +95,7 @@ class OpHandleBase {
virtual void RunImpl() = 0;
ir::Node *node_;
std::vector<VarHandleBase *> inputs_;
std::vector<VarHandleBase *> outputs_;
std::map<platform::Place, platform::DeviceContext *> dev_ctxes_;
......
......@@ -35,14 +35,16 @@ struct ReduceLoDTensor {
PADDLE_ENFORCE(!src_tensors_.empty());
auto &t0 = *src_tensors_[0];
PADDLE_ENFORCE_NE(t0.numel(), 0);
dst_tensor_.Resize(t0.dims());
T *dst = dst_tensor_.mutable_data<T>(platform::CPUPlace());
if (dst != t0.data<T>()) {
std::copy(t0.data<T>(), t0.data<T>() + t0.numel(), dst);
}
for (size_t i = 1; i < src_tensors_.size(); ++i) {
for (size_t i = 0; i < src_tensors_.size(); ++i) {
auto &t = *src_tensors_[i];
if (dst == t.data<T>()) {
continue;
}
PADDLE_ENFORCE_EQ(t.dims(), t0.dims());
PADDLE_ENFORCE_EQ(t.type(), t0.type());
std::transform(t.data<T>(), t.data<T>() + t.numel(), dst, dst,
......
......@@ -37,10 +37,13 @@ struct ReduceOpHandle : public OpHandleBase {
#ifdef PADDLE_WITH_CUDA
const platform::NCCLContextMap *nccl_ctxs_;
ReduceOpHandle(const std::vector<Scope *> &local_scopes,
ReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
const platform::NCCLContextMap *nccl_ctxs)
: local_scopes_(local_scopes), places_(places), nccl_ctxs_(nccl_ctxs) {
: OpHandleBase(node),
local_scopes_(local_scopes),
places_(places),
nccl_ctxs_(nccl_ctxs) {
if (nccl_ctxs_) {
for (auto &p_ctx : nccl_ctxs_->contexts_) {
dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get();
......@@ -48,9 +51,9 @@ struct ReduceOpHandle : public OpHandleBase {
}
}
#else
ReduceOpHandle(const std::vector<Scope *> &local_scopes,
ReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places)
: local_scopes_(local_scopes), places_(places) {}
: OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
#endif
std::string Name() const override;
......
......@@ -84,6 +84,7 @@ struct TestReduceOpHandle {
}
void InitReduceOp(size_t out_scope_idx) {
std::vector<std::unique_ptr<ir::Node>> nodes;
// init scope
for (size_t j = 0; j < gpu_list_.size(); ++j) {
local_scopes_.push_back(&(g_scope_.NewScope()));
......@@ -96,19 +97,21 @@ struct TestReduceOpHandle {
}
param_scopes_[out_scope_idx]->Var("out");
nodes.emplace_back(new ir::Node("node"));
if (use_gpu_) {
#ifdef PADDLE_WITH_CUDA
op_handle_.reset(
new ReduceOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
gpu_list_, nccl_ctxs_.get()));
#else
PADDLE_THROW("CUDA is not support.");
#endif
} else {
#ifdef PADDLE_WITH_CUDA
op_handle_.reset(
new ReduceOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get()));
op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_,
gpu_list_, nccl_ctxs_.get()));
#else
op_handle_.reset(new ReduceOpHandle(local_scopes_, gpu_list_));
op_handle_.reset(
new ReduceOpHandle(nodes.back().get(), local_scopes_, gpu_list_));
#endif
}
......@@ -118,8 +121,10 @@ struct TestReduceOpHandle {
if (!use_gpu_) {
op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get());
}
auto *in_var_handle = new VarHandle(1, j, "input", gpu_list_[j]);
in_var_handle->generated_op_ = nullptr;
nodes.emplace_back(new ir::Node("node1"));
auto *in_var_handle =
new VarHandle(nodes.back().get(), 1, j, "input", gpu_list_[j]);
in_var_handle->ClearGeneratedOp();
vars_.emplace_back(in_var_handle);
op_handle_->AddInput(in_var_handle);
}
......@@ -128,12 +133,13 @@ struct TestReduceOpHandle {
vars_.emplace_back(new DummyVarHandle());
DummyVarHandle *in_dummy_var_handle =
static_cast<DummyVarHandle *>(vars_.back().get());
in_dummy_var_handle->generated_op_ = nullptr;
in_dummy_var_handle->ClearGeneratedOp();
op_handle_->AddInput(in_dummy_var_handle);
// add output
auto *out_var_handle =
new VarHandle(2, out_scope_idx, "out", gpu_list_[out_scope_idx]);
nodes.emplace_back(new ir::Node("node2"));
auto *out_var_handle = new VarHandle(nodes.back().get(), 2, out_scope_idx,
"out", gpu_list_[out_scope_idx]);
vars_.emplace_back(out_var_handle);
op_handle_->AddOutput(out_var_handle);
......
......@@ -18,10 +18,11 @@ namespace paddle {
namespace framework {
namespace details {
// Construct an RPC op handle.
//
// The operator instance is created from `op_desc` through
// framework::OpRegistry::CreateOp; the scope pointer, name and place are
// stored as given.
//
// node:        graph node handed to the OpHandleBase constructor.
// op_desc:     description used to instantiate the operator.
// local_scope: scope pointer stored in local_scope_ (not owned as far as this
//              constructor shows).
// name:        copied into name_.
// place:       copied into place_.
RPCOpHandle::RPCOpHandle(const framework::OpDesc &op_desc,
RPCOpHandle::RPCOpHandle(ir::Node *node, const framework::OpDesc &op_desc,
const Scope *local_scope, const std::string &name,
const platform::Place &place)
: op_(framework::OpRegistry::CreateOp(op_desc)),
: OpHandleBase(node),
op_(framework::OpRegistry::CreateOp(op_desc)),
local_scope_(local_scope),
name_(name),
place_(place) {}
......@@ -35,8 +36,8 @@ void RPCOpHandle::RunImpl() {
if (in->DebugString() == "dummy") { // HACK
continue;
}
if (in->generated_op_) {
in->generated_op_->RecordWaitEventOnCtx(dev_ctxes_[p]);
if (in->GeneratedOp()) {
in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_[p]);
}
}
auto &tmp_scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
......
......@@ -28,8 +28,9 @@ namespace framework {
namespace details {
struct RPCOpHandle : public OpHandleBase {
RPCOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope,
const std::string& name, const platform::Place& place);
RPCOpHandle(ir::Node* node, const framework::OpDesc& op_desc,
const Scope* local_scope, const std::string& name,
const platform::Place& place);
std::string Name() const override;
......
......@@ -19,10 +19,14 @@
namespace paddle {
namespace framework {
namespace details {
// Construct a scale-loss-grad op handle.
//
// coeff_ is set to 1 / num_dev (computed in double, then narrowed to float),
// and `dev_ctx` is registered as the device context for `place`.
//
// NOTE(review): num_dev == 0 would divide by zero here -- confirm callers
// always pass at least one device.
//
// node:    graph node handed to the OpHandleBase constructor.
// num_dev: number of devices; only used to derive coeff_.
// scope:   stored in scope_.
// place:   stored in place_ and used as the key into dev_ctxes_.
// dev_ctx: device context associated with `place`.
ScaleLossGradOpHandle::ScaleLossGradOpHandle(size_t num_dev, Scope *scope,
ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev,
Scope *scope,
platform::Place place,
platform::DeviceContext *dev_ctx)
: coeff_(static_cast<float>(1.0 / num_dev)), scope_(scope), place_(place) {
: OpHandleBase(node),
coeff_(static_cast<float>(1.0 / num_dev)),
scope_(scope),
place_(place) {
dev_ctxes_[place_] = dev_ctx;
}
......
......@@ -25,7 +25,8 @@ namespace framework {
namespace details {
struct ScaleLossGradOpHandle : public OpHandleBase {
ScaleLossGradOpHandle(size_t num_dev, Scope *scope, platform::Place place,
ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, Scope *scope,
platform::Place place,
platform::DeviceContext *context);
~ScaleLossGradOpHandle() final;
......
......@@ -17,6 +17,9 @@
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/var_handle.h"
#include "paddle/fluid/framework/details/execution_strategy.h"
#include "paddle/fluid/framework/details/ssa_graph_executor.h"
#include "paddle/fluid/framework/scope.h"
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/ssa_graph.h"
......@@ -17,8 +17,8 @@
namespace paddle {
namespace framework {
namespace details {
void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) {
for (auto &var_map : graph->vars_) {
void SSAGraphBuilder::PolishGraphToSupportDataHazards(Graph *graph) {
for (auto &var_map : graph->Get<GraphVars>("vars")) {
for (auto &name_pair : var_map) {
if (name_pair.second.size() <= 1) {
continue;
......@@ -27,8 +27,8 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) {
auto it_old = name_pair.second.rbegin();
++it_old;
for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) {
auto *write_op = (*it_new)->generated_op_;
auto &read_ops = (*it_old)->pending_ops_;
OpHandleBase *write_op = (*it_new)->GeneratedOp();
const auto &read_ops = (*it_old)->PendingOps();
for (auto *read_op : read_ops) {
// Manually add a dependency var from read_op to write_op;
......@@ -37,10 +37,11 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) {
continue;
}
auto *dep_var = new DummyVarHandle();
auto *dep_var = new DummyVarHandle(
graph->CreateEmptyNode("dummy", ir::Node::Type::kVariable));
read_op->AddOutput(dep_var);
write_op->AddInput(dep_var);
graph->dep_vars_.emplace(dep_var);
graph->Get<GraphDepVars>("dep_vars").emplace(dep_var);
}
}
}
......@@ -48,13 +49,20 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) {
}
VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle(
SSAGraph *graph, const std::string &each_var_name,
const platform::Place &place, size_t place_offset) {
auto &var_holders = graph->vars_[place_offset];
auto &var_holder = var_holders[each_var_name];
Graph *graph, ir::Node *node, const platform::Place &place,
size_t place_offset) {
auto &var_holders = graph->Get<GraphVars>("vars")[place_offset];
auto &var_holder = var_holders[node->Name()];
VarHandle *var = nullptr;
if (var_holder.empty()) {
var = new VarHandle(0, place_offset, each_var_name, place);
if (node->Var()) {
var = new VarHandle(graph->CreateVarNode(node->Var()), 0, place_offset,
node->Name(), place);
} else {
var = new VarHandle(
graph->CreateEmptyNode(node->Name(), ir::Node::Type::kVariable), 0,
place_offset, node->Name(), place);
}
var_holder.emplace_back(var);
} else {
var = var_holder.rbegin()->get();
......@@ -62,24 +70,26 @@ VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle(
return var;
}
// Append a new version of a variable on device `place_offset` and register it
// as an output of `op_handle`.
//
// graph:        graph whose per-device variable table is extended.
// op_handle:    op that produces the new variable version.
// place:        place passed through to the new VarHandle.
// place_offset: index of the device in the per-device variable table.
void SSAGraphBuilder::CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
const std::string &each_var_name,
void SSAGraphBuilder::CreateOpOutput(Graph *graph, OpHandleBase *op_handle,
ir::Node *new_node,
const platform::Place &place,
size_t place_offset) {
// operator[] on the name map creates an empty version list the first time a
// name is seen.
auto &vars = graph->vars_[place_offset][each_var_name];
auto &vars = graph->Get<GraphVars>("vars")[place_offset][new_node->Name()];
// The new handle's version equals its index in the version list.
size_t version = vars.size();
auto var = new VarHandle(version, place_offset, each_var_name, place);
auto var =
new VarHandle(new_node, version, place_offset, new_node->Name(), place);
vars.emplace_back(var);
op_handle->AddOutput(var);
}
// Give every op that currently has no outputs a single dummy output variable.
//
// Ops that already have at least one output are left untouched. The dummy
// handle is stored in the graph's dependency-variable set and attached to the
// op via AddOutput.
// NOTE(review): presumably this lets executors observe completion of leaf ops
// through a variable -- confirm against the executor code.
void SSAGraphBuilder::AddOutputToLeafOps(SSAGraph *graph) {
for (auto &op : graph->ops_) {
void SSAGraphBuilder::AddOutputToLeafOps(Graph *graph) {
for (auto &op : graph->Get<GraphOps>("ops")) {
// Not a leaf: it already produces something.
if (!op->Outputs().empty()) {
continue;
}
auto *dummy_leaf = new DummyVarHandle();
graph->dep_vars_.emplace(dummy_leaf);
auto *dummy_leaf = new DummyVarHandle(
graph->CreateEmptyNode("dummy", ir::Node::Type::kVariable));
graph->Get<GraphDepVars>("dep_vars").emplace(dummy_leaf);
op->AddOutput(dummy_leaf);
}
}
......
......@@ -16,20 +16,42 @@
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/var_handle.h"
#include "paddle/fluid/framework/details/ssa_graph.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace details {
class SSAGraphBuilder {
// All variables on each device.
// The outer vector is indexed by device. Each element of this vector is a
// map from variable name to the handles of that variable. Variables that
// share the same name have different versions. The offset in the
// `std::vector<std::unique_ptr<VarHandle>>` is the version of the variable.
typedef std::vector<
std::unordered_map<std::string, std::vector<std::unique_ptr<VarHandle>>>>
GraphVars;
// Auxiliary variables that represent dependencies. Useful for resolving data
// hazards.
typedef std::unordered_set<std::unique_ptr<VarHandleBase>> GraphDepVars;
// All operators. NOTE that even though we use a vector here, the operators
// are unordered.
typedef std::vector<std::unique_ptr<OpHandleBase>> GraphOps;
class SSAGraphBuilder : public ir::Pass {
public:
SSAGraphBuilder() {}
virtual ~SSAGraphBuilder() {}
virtual std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const = 0;
virtual int GetVarDeviceID(const std::string &var_name) const = 0;
DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder);
......@@ -42,20 +64,19 @@ class SSAGraphBuilder {
*
* https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR)
*/
static void PolishGraphToSupportDataHazards(SSAGraph *graph);
static void PolishGraphToSupportDataHazards(Graph *graph);
static VarHandle *CreateOrGetLatestVarHandle(SSAGraph *graph,
const std::string &each_var_name,
static VarHandle *CreateOrGetLatestVarHandle(Graph *graph, ir::Node *node,
const platform::Place &place,
size_t place_offset);
// Add an output variable (each_var_name, place, place_offset) to op_handle,
// which belongs to graph
static void CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle,
const std::string &each_var_name,
const platform::Place &place, size_t place_offset);
static void CreateOpOutput(Graph *graph, OpHandleBase *op_handle,
ir::Node *new_node, const platform::Place &place,
size_t place_offset);
static void AddOutputToLeafOps(SSAGraph *graph);
static void AddOutputToLeafOps(Graph *graph);
};
} // namespace details
} // namespace framework
......
......@@ -12,15 +12,15 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/ssa_graph.h"
#include <string>
#include "paddle/fluid/framework/details/ssa_graph_checker.h"
#include <string>
#include "paddle/fluid/framework/ir/graph.h"
namespace paddle {
namespace framework {
namespace details {
bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const {
bool SSAGraghBuilderWithChecker::IsValidGraph(const Graph *graph) const {
std::unordered_map<OpHandleBase *, size_t> pending_ops;
std::unordered_set<VarHandleBase *> pending_vars;
std::unordered_set<VarHandleBase *> ready_vars;
......@@ -28,12 +28,12 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const {
auto insert_pending_var = [&](VarHandleBase *var) {
pending_vars.insert(var);
if (var->generated_op_ == nullptr) {
if (var->GeneratedOp() == nullptr) {
ready_vars.emplace(var);
}
};
for (auto &var_map : graph->vars_) {
for (auto &var_map : graph->Get<GraphVars>("vars")) {
for (auto &name_pair : var_map) {
for (auto &version_pair : name_pair.second) {
insert_pending_var(version_pair.get());
......@@ -41,11 +41,11 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const {
}
}
for (auto &var : graph->dep_vars_) {
for (auto &var : graph->Get<GraphDepVars>("dep_vars")) {
insert_pending_var(var.get());
}
for (auto &op : graph->ops_) {
for (auto &op : graph->Get<GraphOps>("ops")) {
if (op->Inputs().empty()) {
ready_ops.insert(op.get());
} else {
......@@ -71,7 +71,7 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const {
for (auto ready_var : ready_vars) {
pending_vars.erase(ready_var);
for (auto *op : ready_var->pending_ops_) {
for (auto *op : ready_var->PendingOps()) {
auto &deps = --pending_ops[op];
if (deps == 0) {
ready_ops.insert(op);
......
......@@ -21,7 +21,6 @@
namespace paddle {
namespace framework {
namespace details {
struct SSAGraph;
class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
public:
......@@ -29,17 +28,17 @@ class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
std::unique_ptr<SSAGraphBuilder>&& builder)
: builder_(std::move(builder)) {}
// Delegate graph construction to the wrapped builder, then enforce that the
// resulting graph passes IsValidGraph before handing it back to the caller.
// PADDLE_ENFORCE fails when the check returns false.
std::unique_ptr<SSAGraph> Build(const ProgramDesc& program) const override {
auto graph = builder_->Build(program);
PADDLE_ENFORCE(IsValidGraph(graph.get()));
return graph;
std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const override {
auto new_graph = builder_->Apply(std::move(graph));
PADDLE_ENFORCE(IsValidGraph(new_graph.get()));
return new_graph;
}
int GetVarDeviceID(const std::string& var_name) const override {
return builder_->GetVarDeviceID(var_name);
}
bool IsValidGraph(const SSAGraph* graph) const;
bool IsValidGraph(const Graph* graph) const;
private:
std::unique_ptr<SSAGraphBuilder> builder_;
......
......@@ -18,8 +18,8 @@
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/ssa_graph.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/ir/graph.h"
namespace paddle {
namespace framework {
......
......@@ -14,15 +14,15 @@
#include "paddle/fluid/framework/details/ssa_graph_printer.h"
#include <string>
#include "paddle/fluid/framework/details/ssa_graph.h"
#include "paddle/fluid/framework/ir/graph.h"
namespace paddle {
namespace framework {
namespace details {
template <typename Callback>
static inline void IterAllVar(const SSAGraph &graph, Callback callback) {
for (auto &each : graph.vars_) {
static inline void IterAllVar(const Graph &graph, Callback callback) {
for (auto &each : graph.Get<GraphVars>("vars")) {
for (auto &pair1 : each) {
for (auto &pair2 : pair1.second) {
callback(*pair2);
......@@ -30,12 +30,12 @@ static inline void IterAllVar(const SSAGraph &graph, Callback callback) {
}
}
for (auto &var : graph.dep_vars_) {
for (auto &var : graph.Get<GraphDepVars>("dep_vars")) {
callback(*var);
}
}
void GraphvizSSAGraphPrinter::Print(const SSAGraph &graph,
void GraphvizSSAGraphPrinter::Print(const Graph &graph,
std::ostream &sout) const {
size_t var_id = 0;
std::unordered_map<const VarHandleBase *, size_t> vars;
......@@ -61,7 +61,7 @@ void GraphvizSSAGraphPrinter::Print(const SSAGraph &graph,
});
size_t op_id = 0;
for (auto &op : graph.ops_) {
for (auto &op : graph.Get<GraphOps>("ops")) {
std::string op_name = "op_" + std::to_string(op_id++);
sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]"
<< std::endl;
......
......@@ -21,16 +21,16 @@
namespace paddle {
namespace framework {
namespace details {
struct SSAGraph;
// Abstract interface for writing a textual representation of a graph to an
// output stream.
class SSAGraphPrinter {
public:
virtual ~SSAGraphPrinter() {}
// Write `graph` to `sout`; the output format is defined by the subclass.
virtual void Print(const SSAGraph& graph, std::ostream& sout) const = 0;
virtual void Print(const Graph& graph, std::ostream& sout) const = 0;
};
// SSAGraphPrinter implementation; judging by the name and the
// `[label=..., shape=rect]` output in its Print definition, it emits
// Graphviz text.
class GraphvizSSAGraphPrinter : public SSAGraphPrinter {
public:
void Print(const SSAGraph& graph, std::ostream& sout) const override;
void Print(const Graph& graph, std::ostream& sout) const override;
};
class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
......@@ -50,10 +50,10 @@ class SSAGraghBuilderWithPrinter : public SSAGraphBuilder {
stream_ptr_(std::move(sout)),
stream_ref_(*stream_ptr_) {}
// Delegate graph construction to the wrapped builder, print the resulting
// graph to the configured stream via printer_, and return the graph.
std::unique_ptr<SSAGraph> Build(const ProgramDesc& program) const override {
auto graph = builder_->Build(program);
printer_->Print(*graph, stream_ref_);
return graph;
std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const override {
auto new_graph = builder_->Apply(std::move(graph));
printer_->Print(*new_graph, stream_ref_);
return new_graph;
}
int GetVarDeviceID(const std::string& var_name) const override {
......
......@@ -14,13 +14,14 @@
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/ssa_graph_builder.h"
namespace paddle {
namespace framework {
namespace details {
ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
std::unique_ptr<SSAGraph> &&graph)
const std::vector<platform::Place> &places, std::unique_ptr<Graph> &&graph)
: graph_(std::move(graph)),
pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
: nullptr),
......@@ -43,18 +44,18 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
std::unordered_set<OpHandleBase *> delayed_ops;
// Transform SSAGraph to pending_ops & pending_vars
for (auto &var_map : graph_->vars_) {
for (auto &var_map : graph_->Get<details::GraphVars>("vars")) {
for (auto &name_pair : var_map) {
for (auto &version_pair : name_pair.second) {
InsertPendingVar(&pending_vars, &ready_vars, version_pair.get());
}
}
}
for (auto &var : graph_->dep_vars_) {
for (auto &var : graph_->Get<details::GraphDepVars>("dep_vars")) {
InsertPendingVar(&pending_vars, &ready_vars, var.get());
}
for (auto &op : graph_->ops_) {
for (auto &op : graph_->Get<details::GraphOps>("ops")) {
if (op->Inputs().empty()) { // Special case, Op has no input.
ready_ops.insert(op.get());
} else {
......@@ -64,11 +65,12 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
// Step 2. Insert FetchOps
std::vector<std::unique_ptr<FetchOpHandle>> fetch_ops;
std::vector<std::unique_ptr<ir::Node>> tmp_nodes;
std::unordered_set<std::unique_ptr<VarHandleBase>> fetch_dependencies;
FeedFetchList fetch_data(fetch_tensors.size());
InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &pending_ops,
&pending_vars, &ready_vars, &fetch_data);
InsertFetchOps(fetch_tensors, &fetch_ops, &tmp_nodes, &fetch_dependencies,
&pending_ops, &pending_vars, &ready_vars, &fetch_data);
auto run_all_ops = [&](std::unordered_set<OpHandleBase *> &set) {
for (auto *op : set) {
......@@ -125,7 +127,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
// Find the ready_ops after the ready_var.
for (auto ready_var : cur_ready_vars) {
pending_vars.erase(ready_var);
for (auto *op : ready_var->pending_ops_) {
for (auto *op : ready_var->PendingOps()) {
auto &deps = pending_ops[op];
--deps;
if (deps == 0) {
......@@ -151,6 +153,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
void ThreadedSSAGraphExecutor::InsertFetchOps(
const std::vector<std::string> &fetch_tensors,
std::vector<std::unique_ptr<FetchOpHandle>> *fetch_ops,
std::vector<std::unique_ptr<ir::Node>> *temp_nodes,
std::unordered_set<std::unique_ptr<VarHandleBase>> *fetch_dependencies,
std::unordered_map<OpHandleBase *, size_t> *pending_ops,
std::unordered_set<VarHandleBase *> *pending_vars,
......@@ -158,7 +161,7 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
for (auto &fetch_var_name : fetch_tensors) {
for (auto &var_map : graph_->vars_) {
for (auto &var_map : graph_->Get<details::GraphVars>("vars")) {
auto it = var_map.find(fetch_var_name);
if (it != var_map.end()) {
fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get());
......@@ -168,8 +171,16 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
for (size_t i = 0; i < fetch_tensors.size(); ++i) {
auto &var_name = fetch_tensors[i];
auto &vars = fetched_vars.at(var_name);
auto *op = new FetchOpHandle(fetch_data, i, &local_scopes_);
auto fetched_var_it = fetched_vars.find(var_name);
PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(),
"Cannot find fetched variable.(Perhaps the main_program "
"is not set to ParallelExecutor)");
auto &vars = fetched_var_it->second;
temp_nodes->emplace_back(new ir::Node("fetch", ir::Node::Type::kOperation));
auto *op = new FetchOpHandle(temp_nodes->back().get(), fetch_data, i,
&local_scopes_);
fetch_ops->emplace_back(op);
for (auto &p : places_) {
......@@ -180,7 +191,8 @@ void ThreadedSSAGraphExecutor::InsertFetchOps(
op->AddInput(var);
}
auto *fetch_dummy = new DummyVarHandle();
temp_nodes->emplace_back(new ir::Node("fetch", ir::Node::Type::kOperation));
auto *fetch_dummy = new DummyVarHandle(temp_nodes->back().get());
op->AddOutput(fetch_dummy);
fetch_dependencies->emplace(fetch_dummy);
this->InsertPendingVar(pending_vars, ready_vars, fetch_dummy);
......@@ -198,7 +210,7 @@ void ThreadedSSAGraphExecutor::InsertPendingVar(
std::unordered_set<VarHandleBase *> *pending_vars,
BlockingQueue<VarHandleBase *> *ready_vars, VarHandleBase *var) const {
pending_vars->insert(var);
if (var->generated_op_ == nullptr) {
if (var->GeneratedOp() == nullptr) {
ready_vars->Push(var);
}
}
......
......@@ -27,6 +27,7 @@
#include "paddle/fluid/framework/details/execution_strategy.h"
#include "paddle/fluid/framework/details/fetch_op_handle.h"
#include "paddle/fluid/framework/details/ssa_graph_executor.h"
#include "paddle/fluid/framework/ir/graph.h"
namespace paddle {
namespace framework {
......@@ -39,7 +40,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
std::unique_ptr<SSAGraph> &&graph);
std::unique_ptr<Graph> &&graph);
// Run a SSAGraph by a thread pool
// Use topological sort algorithm
......@@ -52,7 +53,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
details::OpHandleBase *op);
private:
std::unique_ptr<SSAGraph> graph_;
std::unique_ptr<Graph> graph_;
std::unique_ptr<::ThreadPool> pool_;
std::vector<Scope *> local_scopes_;
std::vector<platform::Place> places_;
......@@ -71,6 +72,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
void InsertFetchOps(
const std::vector<std::string> &fetch_tensors,
std::vector<std::unique_ptr<FetchOpHandle>> *fetch_ops,
std::vector<std::unique_ptr<ir::Node>> *temp_nodes,
std::unordered_set<std::unique_ptr<VarHandleBase>> *fetch_dependencies,
std::unordered_map<OpHandleBase *, size_t> *pending_ops,
std::unordered_set<VarHandleBase *> *pending_vars,
......
......@@ -13,11 +13,14 @@
// limitations under the License.
#pragma once
#include <algorithm>
#include <sstream>
#include <string>
#include <unordered_set>
#include <utility>
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
......@@ -25,19 +28,60 @@ namespace framework {
namespace details {
class OpHandleBase;
// Wraps an ir::Node and provides helper utilities on top of it.
// It is responsible for populating the necessary fields of the ir::Node.
//
// VarHandleBase is the variable node in the dependency graph.
// A variable can only be generated by a single operator, i.e. this is a
// single-assignment graph.
struct VarHandleBase {
  explicit VarHandleBase(ir::Node* node) : node_(node) {}

  virtual ~VarHandleBase();

  virtual std::string DebugString() const = 0;

  // Records `in` as the operator that generates this variable and makes
  // `node` the sole entry of the wrapped node's input list.
  void AddInput(OpHandleBase* in, ir::Node* node) {
    auto& node_inputs = node_->inputs;
    node_inputs.clear();
    node_inputs.push_back(node);
    generated_op_ = in;
  }

  // Registers `out` as an operator waiting on this variable. `node` is
  // appended to the wrapped node's outputs only the first time `out` is seen.
  void AddOutput(OpHandleBase* out, ir::Node* node) {
    const bool newly_added = pending_ops_.insert(out).second;
    if (newly_added) {
      node_->outputs.push_back(node);
    }
  }

  // Unregisters `out` and removes every occurrence of `node` from the
  // wrapped node's outputs.
  void RemoveOutput(OpHandleBase* out, ir::Node* node) {
    pending_ops_.erase(out);
    auto& node_outputs = node_->outputs;
    auto new_end = std::remove(node_outputs.begin(), node_outputs.end(), node);
    node_outputs.erase(new_end, node_outputs.end());
  }

  // Forgets the generating operator and empties the wrapped node's inputs.
  void ClearGeneratedOp() {
    node_->inputs.clear();
    generated_op_ = nullptr;
  }

  OpHandleBase* GeneratedOp() { return generated_op_; }

  const std::unordered_set<OpHandleBase*>& PendingOps() const {
    return pending_ops_;
  }

  ir::Node* Node() { return node_; }

 protected:
  // The operator that generates this variable; nullptr if the variable
  // is a root node.
  OpHandleBase* generated_op_{nullptr};

  // Operators that wait for this variable to become ready.
  std::unordered_set<OpHandleBase*> pending_ops_;

  // The wrapped graph node (stored as a raw pointer; never deleted here).
  ir::Node* node_;
};
// VarHandle is actually a single version of Runtime Variable.
......@@ -46,11 +90,14 @@ struct VarHandleBase {
//
// NOTE: runtime variables have place.
struct VarHandle : public VarHandleBase {
explicit VarHandle(ir::Node* node) : VarHandleBase(node) {}
std::string DebugString() const override;
VarHandle(size_t version, size_t scope_index, std::string name,
platform::Place place)
: version_(version),
VarHandle(ir::Node* node, size_t version, size_t scope_index,
std::string name, platform::Place place)
: VarHandleBase(node),
version_(version),
scope_idx_(scope_index),
name_(std::move(name)),
place_(std::move(place)) {}
......@@ -70,6 +117,8 @@ struct VarHandle : public VarHandleBase {
// Dummy variable. It carries no data of its own in this struct; it is used
// to represent a pure dependency between operators.
struct DummyVarHandle : public VarHandleBase {
  // Forwards the graph node to VarHandleBase.
  explicit DummyVarHandle(ir::Node* node) : VarHandleBase(node) {}
  // Human-readable description of this handle (defined out of line).
  std::string DebugString() const override;
};
......
# IR building blocks: `node` is the base target, `graph` depends on `node`,
# and `pass` depends on both `graph` and `node`.
cc_library(node SRCS node.cc DEPS proto_desc)
cc_library(graph SRCS graph.cc DEPS node)
cc_library(pass SRCS pass.cc DEPS graph node)
# Unit test for the graph target.
cc_test(graph_test SRCS graph_test.cc DEPS graph proto_desc op_registry)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/var_desc.h"
namespace paddle {
namespace framework {
// NOTE(paddle-dev): This graph may contain cycles.
//
// Builds the node graph from block 0 of `program`: one operation node per op
// (in program order) and one variable node per distinct variable name that
// the ops reference. Each op node is linked to its input and output variable
// nodes in both directions.
Graph::Graph(const ProgramDesc &program) : program_(program) {
  VLOG(3) << "block in program:" << program_.Size();
  std::unordered_map<std::string, VarDesc *> all_vars;
  for (auto *var : program.Block(0).AllVars()) {
    all_vars.emplace(var->Name(), var);
  }

  std::map<std::string, ir::Node *> var_nodes;

  // Returns the node for `var_name`, creating it on first use. A variable
  // that has no VarDesc in the block gets an empty placeholder node instead
  // of aborting graph construction; `role` ("input"/"output") is only used
  // for the log message.
  auto get_or_create_var_node = [&](const std::string &var_name,
                                    const std::string &op_type,
                                    const char *role) -> ir::Node * {
    auto node_it = var_nodes.find(var_name);
    if (node_it != var_nodes.end()) {
      return node_it->second;
    }
    ir::Node *var = nullptr;
    auto desc_it = all_vars.find(var_name);
    if (desc_it != all_vars.end()) {
      var = CreateVarNode(desc_it->second);
    } else {
      // TODO(paddle-dev): Seems some assumption doesn't hold?
      VLOG(3) << op_type << " " << role
              << " var not in all_var list: " << var_name;
      var = CreateEmptyNode(var_name, ir::Node::Type::kVariable);
    }
    var_nodes[var_name] = var;
    return var;
  };

  for (auto *op : program.Block(0).AllOps()) {
    // The op node is created before its variables so that node order follows
    // program order.
    ir::Node *node = CreateOpNode(op);

    for (auto &each_var_name : op->InputArgumentNames()) {
      ir::Node *var = get_or_create_var_node(each_var_name, op->Type(), "input");
      node->inputs.push_back(var);
      var->outputs.push_back(node);
    }

    for (auto &each_var_name : op->OutputArgumentNames()) {
      // Previously an output missing from the block's variable list threw
      // std::out_of_range; it is now handled the same way as inputs.
      ir::Node *var =
          get_or_create_var_node(each_var_name, op->Type(), "output");
      node->outputs.push_back(var);
      var->inputs.push_back(node);
    }
  }
}
} // namespace framework
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/variant.h"
namespace paddle {
namespace framework {
class Graph {
public:
explicit Graph(const ProgramDesc& program);
virtual ~Graph() {
for (auto& attr : attrs_) {
attr_dels_[attr.first]();
}
attrs_.clear();
attr_dels_.clear();
}
template <typename AttrType>
AttrType& Get(const std::string& attr_name) const {
return *boost::any_cast<AttrType*>(attrs_.at(attr_name));
}
template <typename AttrType>
void Set(const std::string& attr_name, AttrType* attr) {
PADDLE_ENFORCE(attrs_.count(attr_name) == 0);
attrs_[attr_name] = attr;
attr_dels_[attr_name] = [attr, attr_name]() {
VLOG(3) << "deleting " << attr_name;
delete attr;
};
}
ir::Node* CreateVarNode(VarDesc* var_desc) {
nodes.emplace_back(new ir::Node(var_desc));
return nodes.back().get();
}
ir::Node* CreateOpNode(OpDesc* op_desc) {
nodes.emplace_back(new ir::Node(op_desc));
return nodes.back().get();
}
ir::Node* CreateEmptyNode(const std::string& name, ir::Node::Type type) {
nodes.emplace_back(new ir::Node(name, type));
return nodes.back().get();
}
std::vector<std::unique_ptr<ir::Node>> nodes;
private:
// NOTE: program_ shouldn't be exposed to user.
const ProgramDesc& program_;
std::map<std::string, boost::any> attrs_;
std::map<std::string, std::function<void(void)>> attr_dels_;
};
} // namespace framework
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/graph.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
namespace paddle {
namespace framework {
class NOP : public OperatorBase {
public:
NOP(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
private:
void RunImpl(const Scope &scope,
const platform::Place &place) const override {}
};
// Proto maker for the test "sum" operators: one duplicable input "X", one
// output "Out", and empty descriptions/comment.
class SumOpMaker : public OpProtoAndCheckerMaker {
 public:
  void Make() {
    // "X" may bind several variables (the test passes three).
    AddInput("X", "").AsDuplicable();
    AddOutput("Out", "");
    AddComment("");
  }
};
// Infers the type of the first "Out" variable from the "X" inputs:
// LOD_TENSOR if any input is a LOD_TENSOR, otherwise SELECTED_ROWS.
class SumOpVarTypeInference : public VarTypeInference {
 public:
  void operator()(const OpDesc &op_desc, BlockDesc *block) const override {
    auto &input_names = op_desc.Input("X");

    auto is_lod_tensor = [block](const std::string &name) {
      return block->Var(name)->GetType() == proto::VarType::LOD_TENSOR;
    };
    const bool has_lod_tensor_input =
        std::any_of(input_names.begin(), input_names.end(), is_lod_tensor);

    auto out_var_type = proto::VarType::SELECTED_ROWS;
    if (has_lod_tensor_input) {
      out_var_type = proto::VarType::LOD_TENSOR;
    }

    auto out_var_name = op_desc.Output("Out").front();
    block->Var(out_var_name)->SetType(out_var_type);
  }
};
} // namespace framework
} // namespace paddle
REGISTER_OPERATOR(sum, paddle::framework::NOP, paddle::framework::SumOpMaker,
paddle::framework::SumOpVarTypeInference);
REGISTER_OPERATOR(sum_without_infer_var_type, paddle::framework::NOP,
paddle::framework::SumOpMaker);
namespace paddle {
namespace framework {
// Builds a one-op program (sum of three inputs), checks var-type inference,
// then checks the node graph that Graph constructs from the program.
TEST(GraphTest, Basic) {
  ProgramDesc prog;
  auto *op = prog.MutableBlock(0)->AppendOp();
  op->SetType("sum");
  op->SetInput("X", {"test_a", "test_b", "test_c"});
  op->SetOutput("Out", {"test_out"});

  prog.MutableBlock(0)->Var("test_a")->SetType(proto::VarType::SELECTED_ROWS);
  prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarType::SELECTED_ROWS);
  prog.MutableBlock(0)->Var("test_c")->SetType(proto::VarType::SELECTED_ROWS);
  prog.MutableBlock(0)->Var("test_out");

  // All inputs are SELECTED_ROWS, so the output is too.
  op->InferVarType(prog.MutableBlock(0));
  ASSERT_EQ(proto::VarType::SELECTED_ROWS,
            prog.MutableBlock(0)->Var("test_out")->GetType());

  // A single LOD_TENSOR input switches the output to LOD_TENSOR.
  prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarType::LOD_TENSOR);
  op->InferVarType(prog.MutableBlock(0));
  ASSERT_EQ(proto::VarType::LOD_TENSOR,
            prog.MutableBlock(0)->Var("test_out")->GetType());

  std::unique_ptr<Graph> g(new Graph(prog));
  // Check the node count first so the indexed accesses below can never read
  // out of bounds; the unsigned literal avoids a signed/unsigned comparison.
  ASSERT_EQ(g->nodes.size(), 5UL);

  // nodes[0] is the op; it is followed by its inputs and then its output.
  ASSERT_EQ(g->nodes[0]->Name(), "sum");
  ASSERT_EQ(g->nodes[0]->inputs[0]->Name(), "test_a");
  ASSERT_EQ(g->nodes[0]->inputs[1]->Name(), "test_b");
  ASSERT_EQ(g->nodes[0]->inputs[2]->Name(), "test_c");
  ASSERT_EQ(g->nodes[0]->outputs[0]->Name(), "test_out");
  ASSERT_EQ(g->nodes[1]->Name(), "test_a");
  ASSERT_EQ(g->nodes[1]->outputs[0]->Name(), "sum");
  ASSERT_EQ(g->nodes[2]->Name(), "test_b");
  ASSERT_EQ(g->nodes[2]->outputs[0]->Name(), "sum");
  ASSERT_EQ(g->nodes[3]->Name(), "test_c");
  ASSERT_EQ(g->nodes[3]->outputs[0]->Name(), "sum");
  ASSERT_EQ(g->nodes[4]->Name(), "test_out");
  ASSERT_EQ(g->nodes[4]->inputs[0]->Name(), "sum");
}
} // namespace framework
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/node.h"
namespace paddle {
namespace framework {} // namespace framework
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/platform/macros.h"
namespace paddle {
namespace framework {
namespace ir {
// A graph node. It is either an operation or a variable and may carry the
// matching OpDesc / VarDesc pointer; edges are stored as raw pointers in the
// public `inputs` / `outputs` vectors.
class Node {
 public:
  enum class Type { kOperation, kVariable };

  // Node with a name and a type only; both descriptor pointers are null.
  explicit Node(const std::string& name, Type type)
      : name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {}

  // Variable node named after `var_desc`.
  explicit Node(VarDesc* var_desc) : Node(var_desc->Name(), Type::kVariable) {
    var_desc_ = var_desc;
  }

  // Operation node named after the type of `op_desc`.
  explicit Node(OpDesc* op_desc) : Node(op_desc->Type(), Type::kOperation) {
    op_desc_ = op_desc;
  }

  Type NodeType() const { return type_; }

  std::string Name() const { return name_; }

  // Variable descriptor; enforces that this is a variable node.
  VarDesc* Var() {
    PADDLE_ENFORCE(type_ == Type::kVariable);
    return var_desc_;
  }

  // Operation descriptor; enforces that this is an operation node.
  OpDesc* Op() {
    PADDLE_ENFORCE(type_ == Type::kOperation);
    return op_desc_;
  }

  // Incoming and outgoing edges.
  std::vector<Node*> inputs;
  std::vector<Node*> outputs;

 protected:
  const std::string name_;
  VarDesc* var_desc_;
  OpDesc* op_desc_;
  Type type_;

 private:
  DISABLE_COPY_AND_ASSIGN(Node);
};
} // namespace ir
} // namespace framework
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {} // namespace framework
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/program_desc.h"
namespace paddle {
namespace framework {
namespace ir {
// Abstract interface of a graph transformation: Apply() takes ownership of a
// graph and returns the (possibly different) resulting graph.
class Pass {
 public:
  Pass() = default;
  virtual ~Pass() = default;

  virtual std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const = 0;
};
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -312,19 +312,22 @@ void WriteToRecordIO(recordio::Writer *writer,
writer->Write(buffer.str());
}
std::vector<LoDTensor> ReadFromRecordIO(
recordio::Scanner *scanner, const platform::DeviceContext &dev_ctx) {
std::vector<LoDTensor> result;
if (scanner->HasNext()) {
std::istringstream sin(scanner->Next());
uint32_t sz;
sin.read(reinterpret_cast<char *>(&sz), sizeof(uint32_t));
result.resize(sz);
for (uint32_t i = 0; i < sz; ++i) {
DeserializeFromStream(sin, &result[i], dev_ctx);
}
bool ReadFromRecordIO(recordio::Scanner *scanner,
const platform::DeviceContext &dev_ctx,
std::vector<LoDTensor> *result_ptr) {
if (!scanner->HasNext()) {
return false;
}
return result;
std::istringstream sin(scanner->Next());
uint32_t sz;
sin.read(reinterpret_cast<char *>(&sz), sizeof(uint32_t));
auto &result = *result_ptr;
result.resize(sz);
for (uint32_t i = 0; i < sz; ++i) {
DeserializeFromStream(sin, &result[i], dev_ctx);
}
return true;
}
std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
......
......@@ -223,8 +223,9 @@ extern void WriteToRecordIO(recordio::Writer* writer,
const std::vector<LoDTensor>& tensor,
const platform::DeviceContext& dev_ctx);
extern std::vector<LoDTensor> ReadFromRecordIO(
recordio::Scanner* scanner, const platform::DeviceContext& dev_ctx);
extern bool ReadFromRecordIO(recordio::Scanner* scanner,
const platform::DeviceContext& dev_ctx,
std::vector<LoDTensor>* result_ptr);
/*
* Convert between length-based LoD and offset-based LoD.
......
......@@ -301,11 +301,12 @@ static void TestRecordIO() {
{
std::unique_ptr<std::istream> stream_ptr(stream);
recordio::Scanner scanner(std::move(stream_ptr));
auto tensors = ReadFromRecordIO(&scanner, ctx);
std::vector<framework::LoDTensor> tensors;
ASSERT_TRUE(ReadFromRecordIO(&scanner, ctx, &tensors));
ASSERT_EQ(tensors.size(), static_cast<size_t>(2));
assert_tensor_ok(tensors[0]);
assert_tensor_ok(tensors[1]);
tensors = ReadFromRecordIO(&scanner, ctx);
ASSERT_TRUE(ReadFromRecordIO(&scanner, ctx, &tensors));
ASSERT_EQ(tensors.size(), static_cast<size_t>(2));
assert_tensor_ok(tensors[0]);
assert_tensor_ok(tensors[1]);
......
......@@ -26,6 +26,7 @@
namespace paddle {
namespace framework {
#if defined(PADDLE_WITH_CUDA)
// Vector<T> implements the std::vector interface, and can get Data or
// MutableData from any place. The data will be synced implicitly inside.
template <typename T>
......@@ -37,11 +38,11 @@ class Vector {
Vector() { InitEmpty(); }
// Fill vector with value. The vector size is `count`.
explicit Vector(size_t count, const T& value = T()) {
explicit Vector(size_t count, const T &value = T()) {
InitEmpty();
if (count != 0) {
resize(count);
T* ptr = begin();
T *ptr = begin();
for (size_t i = 0; i < count; ++i) {
ptr[i] = value;
}
......@@ -59,7 +60,7 @@ class Vector {
// implicit cast from std::vector.
template <typename U>
Vector(const std::vector<U>& dat) { // NOLINT
Vector(const std::vector<U> &dat) { // NOLINT
if (dat.size() == 0) {
InitEmpty();
} else {
......@@ -68,10 +69,10 @@ class Vector {
}
// Copy ctor
Vector(const Vector<T>& other) { this->operator=(other); }
Vector(const Vector<T> &other) { this->operator=(other); }
// Copy operator
Vector<T>& operator=(const Vector<T>& other) {
Vector<T> &operator=(const Vector<T> &other) {
if (other.size() != 0) {
this->InitByIter(other.size(), other.begin(), other.end());
} else {
......@@ -81,7 +82,7 @@ class Vector {
}
// Move ctor
Vector(Vector<T>&& other) {
Vector(Vector<T> &&other) {
this->size_ = other.size_;
this->flag_ = other.flag_;
if (other.cuda_vec_.memory_size()) {
......@@ -93,13 +94,13 @@ class Vector {
}
// CPU data access method. Mutable.
T& operator[](size_t i) {
T &operator[](size_t i) {
MutableCPU();
return const_cast<T*>(cpu_vec_.data<T>())[i];
return const_cast<T *>(cpu_vec_.data<T>())[i];
}
// CPU data access method. Immutable.
const T& operator[](size_t i) const {
const T &operator[](size_t i) const {
ImmutableCPU();
return cpu_vec_.data<T>()[i];
}
......@@ -107,43 +108,43 @@ class Vector {
// std::vector iterator methods. Based on CPU data access method
size_t size() const { return size_; }
T* begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }
T *begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }
T* end() {
T *end() {
return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
}
T& front() { return *begin(); }
T &front() { return *begin(); }
T& back() {
T &back() {
auto it = end();
--it;
return *it;
}
const T* begin() const {
const T *begin() const {
return capacity() == 0 ? &EmptyDummy() : &this->operator[](0);
}
const T* end() const {
const T *end() const {
return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
}
const T* cbegin() const { return begin(); }
const T *cbegin() const { return begin(); }
const T* cend() const { return end(); }
const T *cend() const { return end(); }
const T& back() const {
const T &back() const {
auto it = end();
--it;
return *it;
}
T* data() { return begin(); }
T *data() { return begin(); }
const T* data() const { return begin(); }
const T *data() const { return begin(); }
const T& front() const { return *begin(); }
const T &front() const { return *begin(); }
// end of std::vector iterator methods
// assign this from iterator.
......@@ -169,7 +170,7 @@ class Vector {
void Extend(It begin, It end) {
size_t pre_size = size_;
resize(pre_size + (end - begin));
T* ptr = this->begin() + pre_size;
T *ptr = this->begin() + pre_size;
for (; begin < end; ++begin, ++ptr) {
*ptr = *begin;
}
......@@ -183,9 +184,9 @@ class Vector {
MutableCPU();
Tensor cpu_tensor;
platform::Place cpu = platform::CPUPlace();
T* ptr = cpu_tensor.mutable_data<T>(
T *ptr = cpu_tensor.mutable_data<T>(
framework::make_ddim({static_cast<int64_t>(size)}), cpu);
const T* old_ptr =
const T *old_ptr =
cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data<T>();
if (old_ptr != nullptr) {
std::copy(old_ptr, old_ptr + size_, ptr);
......@@ -196,7 +197,7 @@ class Vector {
}
// get cuda ptr. immutable
const T* CUDAData(platform::Place place) const {
const T *CUDAData(platform::Place place) const {
PADDLE_ENFORCE(platform::is_gpu_place(place),
"CUDA Data must on CUDA place");
ImmutableCUDA(place);
......@@ -204,10 +205,10 @@ class Vector {
}
// get cuda ptr. mutable
T* CUDAMutableData(platform::Place place) {
const T* ptr = CUDAData(place);
T *CUDAMutableData(platform::Place place) {
const T *ptr = CUDAData(place);
flag_ = kDirty | kDataInCUDA;
return const_cast<T*>(ptr);
return const_cast<T *>(ptr);
}
// clear
......@@ -228,7 +229,7 @@ class Vector {
}
// the unify method to access CPU or CUDA data. immutable.
const T* Data(platform::Place place) const {
const T *Data(platform::Place place) const {
if (platform::is_gpu_place(place)) {
return CUDAData(place);
} else {
......@@ -237,7 +238,7 @@ class Vector {
}
// the unify method to access CPU or CUDA data. mutable.
T* MutableData(platform::Place place) {
T *MutableData(platform::Place place) {
if (platform::is_gpu_place(place)) {
return CUDAMutableData(place);
} else {
......@@ -253,7 +254,7 @@ class Vector {
return result;
}
bool operator==(const Vector<T>& other) const {
bool operator==(const Vector<T> &other) const {
if (size() != other.size()) return false;
auto it1 = cbegin();
auto it2 = other.cbegin();
......@@ -274,7 +275,7 @@ class Vector {
template <typename Iter>
void InitByIter(size_t size, Iter begin, Iter end) {
platform::Place cpu = platform::CPUPlace();
T* ptr = this->cpu_vec_.template mutable_data<T>(
T *ptr = this->cpu_vec_.template mutable_data<T>(
framework::make_ddim({static_cast<int64_t>(size)}), cpu);
for (size_t i = 0; i < size; ++i) {
*ptr++ = *begin++;
......@@ -368,7 +369,7 @@ class Vector {
}
}
static T& EmptyDummy() {
static T &EmptyDummy() {
static T dummy = T();
return dummy;
}
......@@ -379,5 +380,53 @@ class Vector {
size_t size_;
};
} // namespace framework
#else // PADDLE_WITH_CUDA
// CPU-only stand-in for framework::Vector: a std::vector<T> with the extra
// helpers (Extend, stream output) and bounds-checked operator[].
template <typename T>
class CPUVector : public std::vector<T, std::allocator<T>> {
 public:
  CPUVector() : std::vector<T>() {}
  CPUVector(size_t count, const T &value = T())
      : std::vector<T>(count, value) {}
  CPUVector(std::initializer_list<T> init) : std::vector<T>(init) {}
  CPUVector(const std::vector<T> &other) : std::vector<T>(other) {}
  explicit CPUVector(const CPUVector<T> &other) : std::vector<T>(other) {}
  CPUVector(CPUVector<T> &&other) : std::vector<T>(std::move(other)) {}
  CPUVector(std::vector<T> &&other) : std::vector<T>(std::move(other)) {}

  // Copy-assigns the elements of `other`. Self-assignment is a no-op:
  // assign() must not be given iterators into the vector being assigned.
  CPUVector &operator=(const CPUVector &other) {
    if (this != &other) {
      this->assign(other.begin(), other.end());
    }
    return *this;
  }

  // Same as above for a plain std::vector (which may alias our own base).
  CPUVector &operator=(const std::vector<T> &other) {
    if (static_cast<const std::vector<T> *>(this) != &other) {
      this->assign(other.begin(), other.end());
    }
    return *this;
  }

  // Writes every element followed by a single space.
  friend std::ostream &operator<<(std::ostream &os, const CPUVector<T> &other) {
    for (const auto &v : other) {
      os << v << " ";
    }
    return os;
  }

  // Resizes to `size` elements. The call is qualified so that it reaches
  // std::vector::resize; an unqualified this->resize() would call this
  // function again and recurse forever.
  void resize(size_t size) { std::vector<T>::resize(size); }

  // Bounds-checked element access (throws std::out_of_range via at()).
  T &operator[](size_t id) { return this->at(id); }
  const T &operator[](size_t id) const { return this->at(id); }

  // Appends the range [begin, end); the iterators must support `end - begin`.
  template <typename D>
  void Extend(const D &begin, const D &end) {
    this->reserve(this->size() + size_t(end - begin));
    this->insert(this->end(), begin, end);
  }
};
template <typename T>
using Vector = CPUVector<T>;
#endif // PADDLE_WITH_CUDA
}; // namespace framework
} // namespace paddle
......@@ -18,6 +18,8 @@ limitations under the License. */
#include <tuple>
#include <vector>
#include "paddle/fluid/framework/ir/graph.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/nccl_helper.h"
#endif
......@@ -129,12 +131,11 @@ ParallelExecutor::ParallelExecutor(
PADDLE_THROW("Not compiled with CUDA.");
#endif
}
builder_ = builder_factory.Create();
std::unique_ptr<Graph> graph(new Graph(main_program));
graph = builder_->Apply(std::move(graph));
member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, places,
builder_->Build(main_program)));
exec_strategy, member_->local_scopes_, places, std::move(graph)));
member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, std::move(var_infos),
member_->places_, std::move(member_->executor_)));
......
......@@ -67,7 +67,8 @@ void ReaderBase::Start() {
}
}
ReaderBase::~ReaderBase() { Shutdown(); }
ReaderBase::~ReaderBase() {}
DecoratedReader::~DecoratedReader() { reader_->Shutdown(); }
} // namespace framework
} // namespace paddle
......@@ -25,8 +25,6 @@
namespace paddle {
namespace framework {
enum ReaderStatus { kRunning, kStopped };
class ReaderBase {
public:
virtual void ReadNext(std::vector<LoDTensor>* out);
......@@ -48,6 +46,8 @@ class ReaderBase {
virtual void StartImpl() {}
enum ReaderStatus { kRunning, kStopped };
ReaderStatus status_{kRunning};
mutable std::mutex mu_;
......@@ -74,6 +74,8 @@ class DecoratedReader : public ReaderBase,
reader_->InsertDecoratedReader(shared_from_this());
}
~DecoratedReader();
protected:
void ShutdownImpl() override { reader_->Shutdown(); }
......
......@@ -15,6 +15,7 @@
#include <algorithm>
#include <limits>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
namespace paddle {
namespace framework {
......@@ -261,7 +262,8 @@ void TensorToStream(std::ostream& os, const Tensor& tensor,
os.write(out.data(), size);
}
{ // the 3rd field, tensor data
uint64_t size = tensor.memory_size();
uint64_t size = tensor.numel() * framework::SizeOfType(tensor.type());
auto* data_ptr = tensor.data<void>();
PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
"Index overflow when writing tensor");
......@@ -331,6 +333,9 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
tensor->Resize(framework::make_ddim(dims));
void* buf;
auto ctx = platform::CPUDeviceContext();
size_t size =
tensor->numel() *
framework::SizeOfType(framework::ToTypeIndex(desc.data_type()));
if (platform::is_gpu_place(dev_ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
Tensor cpu_tensor;
......@@ -338,7 +343,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
framework::VisitDataType(
desc.data_type(),
DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace()));
is.read(static_cast<char*>(buf), cpu_tensor.memory_size());
is.read(static_cast<char*>(buf), size);
auto dst_place = dev_ctx.GetPlace();
framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor);
#else
......@@ -348,7 +353,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
framework::VisitDataType(
desc.data_type(),
DeserializedDataFunctor(&buf, tensor, ctx.GetPlace()));
is.read(static_cast<char*>(buf), tensor->memory_size());
is.read(static_cast<char*>(buf), size);
}
}
}
......
......@@ -38,4 +38,6 @@ if(WITH_TESTING)
# both tests/book and analysis depends the models that generated by python/paddle/fluid/tests/book
add_subdirectory(tests/book)
endif()
add_subdirectory(api)
if(NOT APPLE)
add_subdirectory(api)
endif()
......@@ -19,10 +19,14 @@ function (inference_analysis_test TARGET)
set(multiValueArgs SRCS)
cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(mem_opt "")
if(WITH_GPU)
set(mem_opt "--fraction_of_gpu_memory_to_use=0.5")
endif()
cc_test(${TARGET}
SRCS "${analysis_test_SRCS}"
DEPS analysis
ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model --fraction_of_gpu_memory_to_use=0.5)
ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt})
set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec)
endif(WITH_TESTING)
endfunction(inference_analysis_test)
......
......@@ -22,8 +22,6 @@
#include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"
namespace paddle {
namespace inference {
namespace analysis {
DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
"Enable subgraph to TensorRT engine for acceleration");
......@@ -31,6 +29,9 @@ DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
DEFINE_string(inference_analysis_graphviz_log_root, "./",
"Graphviz debuger for data flow graphs.");
namespace inference {
namespace analysis {
class DfgPassManagerImpl final : public DfgPassManager {
public:
DfgPassManagerImpl() {
......
......@@ -45,14 +45,15 @@ limitations under the License. */
#include "paddle/fluid/inference/analysis/pass_manager.h"
namespace paddle {
namespace inference {
namespace analysis {
// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
// flag if not available.
DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine);
DECLARE_string(inference_analysis_graphviz_log_root);
namespace inference {
namespace analysis {
class Analyzer : public OrderedRegistry<PassManager> {
public:
// Register all the pass-managers.
......
......@@ -13,13 +13,21 @@
// limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <google/protobuf/text_format.h>
#include "paddle/fluid/inference/analysis/ut_helper.h"
namespace paddle {
namespace inference {
namespace analysis {
TEST_F(DFG_Tester, main) {
TEST_F(DFG_Tester, analysis_without_tensorrt) {
FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = false;
Analyzer analyser;
analyser.Run(&argument);
}
// Runs the Analyzer over the fixture's argument with the TensorRT sub-graph
// engine flag switched on.
TEST_F(DFG_Tester, analysis_with_tensorrt) {
FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = true;
Analyzer analyser;
analyser.Run(&argument);
}
......
......@@ -222,10 +222,19 @@ Node *GraphTraits<DataFlowGraph>::NodesDFSIterator::operator->() {
return stack_.top();
}
// Returns true when `node` has exactly `n` incoming links.
inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) {
  const size_t indegree = node.inlinks.size();
  return indegree == n;
}
GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
const std::vector<Node *> &source) {
PADDLE_ENFORCE(!source.empty(),
"Start points of topological sorting should not be empty!");
// CHECK all the inputs' in-degree is 0
for (auto *node : source) {
PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0));
}
std::unordered_set<Node *> visited;
std::unordered_set<Node *> to_visit{source.begin(), source.end()};
......@@ -233,6 +242,11 @@ GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
while (!to_visit.empty()) {
std::vector<Node *> queue(to_visit.begin(), to_visit.end());
for (auto *p : queue) {
if (p->deleted()) {
visited.insert(p);
to_visit.erase(p);
continue;
}
inlink_visited.clear();
std::copy_if(p->inlinks.begin(), p->inlinks.end(),
......@@ -292,6 +306,37 @@ Node *GraphTraits<DataFlowGraph>::NodesTSIterator::operator->() {
return sorted_[cursor_];
}
// Collects the Value nodes that sit on the boundary of a sub-graph.
//
// graph: the nodes that make up the sub-graph. It is taken by non-const
//        reference only to keep the existing signature; it is not modified.
//
// Returns a pair {inputs, outputs}:
//   - inputs:  Value nodes outside the sub-graph that are inlinks of a member
//              node and have no inlink coming from a member node;
//   - outputs: Value nodes outside the sub-graph that are outlinks of a
//              member node.
// Each node appears at most once. Both lists follow the order in which the
// nodes are first met while walking `graph`, so the result is reproducible
// from run to run instead of depending on how the pointers happen to hash.
std::pair<std::vector<Node *>, std::vector<Node *>>
ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {  // NOLINT
  const std::unordered_set<Node *> members(graph.begin(), graph.end());
  // The sets only de-duplicate; the vectors keep the first-seen order.
  std::unordered_set<Node *> seen_inputs;
  std::unordered_set<Node *> seen_outputs;
  std::vector<Node *> inputs;
  std::vector<Node *> outputs;

  // True when at least one inlink of `value` is a member of the sub-graph.
  auto written_by_member = [&members](Node *value) {
    for (auto *producer : value->inlinks) {
      if (members.count(producer)) return true;
    }
    return false;
  };

  for (auto *node : graph) {
    for (auto *in : node->inlinks) {
      if (members.count(in) || in->type() != Node::Type::kValue) continue;
      // The Value that is written by nodes inside a sub-graph shouldn't be
      // the input of the sub-graph.
      if (written_by_member(in)) continue;
      if (seen_inputs.insert(in).second) inputs.push_back(in);
    }
    for (auto *out : node->outlinks) {
      if (members.count(out) || out->type() != Node::Type::kValue) continue;
      if (seen_outputs.insert(out).second) outputs.push_back(out);
    }
  }
  return std::make_pair(std::move(inputs), std::move(outputs));
}
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -133,7 +133,7 @@ struct GraphTraits<DataFlowGraph> {
private:
std::vector<Node *> sorted_;
int cursor_{0};
size_t cursor_{0};
};
explicit GraphTraits(DataFlowGraph *graph) : graph_(graph) {}
......@@ -173,36 +173,8 @@ struct GraphTraits<DataFlowGraph> {
// Extract the inputs and outputs of a graph. The inputs and outputs of a
// sub-graph are the input nodes and output nodes that are not inside the
// sub-graph.
static std::pair<std::vector<Node *>, std::vector<Node *>>
ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) { // NOLINT
std::unordered_set<Node *> nodes(graph.begin(), graph.end());
std::unordered_set<Node *> inputs;
std::unordered_set<Node *> outputs;
// Input a Value, check whether its inlink is in the subgraph.
auto inlink_in_subgraph = [&](Node *n) {
for (auto *in : n->inlinks) {
if (nodes.count(in)) return true;
}
return false;
};
for (auto &node : graph) {
for (auto *in : node->inlinks) {
// The Value that is written by nodes inside a sub-graph shouldn't be the
// input of the sub-graph.
if (!nodes.count(in) && in->type() == Node::Type::kValue &&
!inlink_in_subgraph(in)) {
inputs.insert(in);
}
}
for (auto *out : node->outlinks) {
if (!nodes.count(out) && out->type() == Node::Type::kValue) {
outputs.insert(out);
}
}
}
return std::make_pair(std::vector<Node *>(inputs.begin(), inputs.end()),
std::vector<Node *>(outputs.begin(), outputs.end()));
}
std::pair<std::vector<Node *>, std::vector<Node *>>
ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph);
} // namespace analysis
} // namespace inference
......
......@@ -22,14 +22,18 @@
namespace paddle {
namespace inference {
DEFINE_int32(tensorrt_max_batchsize, 300, "TensorRT maximum batch size");
DEFINE_int32(tensorrt_workspace_size, 2048, "TensorRT workspace size");
namespace analysis {
using framework::proto::ProgramDesc;
std::vector<std::string> ExtractParameters(
const std::vector<std::unique_ptr<Node>>& nodes);
const std::vector<std::unique_ptr<Node>> &nodes);
bool DataFlowGraphToFluidPass::Initialize(Argument* argument) {
bool DataFlowGraphToFluidPass::Initialize(Argument *argument) {
ANALYSIS_ARGUMENT_CHECK_FIELD(argument)
ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc)
PADDLE_ENFORCE(!argument->transformed_program_desc);
......@@ -47,76 +51,77 @@ bool DataFlowGraphToFluidPass::Initialize(Argument* argument) {
bool DataFlowGraphToFluidPass::Finalize() { return true; }
void DataFlowGraphToFluidPass::Run(DataFlowGraph* graph) {
auto traits = GraphTraits<DataFlowGraph>(graph);
for (auto it = traits.nodes().begin(); it != traits.nodes().end(); ++it) {
if (it->deleted()) continue;
void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) {
LOG(INFO) << "graph.inputs " << graph->inputs.size();
for (auto &node : GraphTraits<DataFlowGraph>(graph).nodes_in_TS()) {
if (node.deleted()) continue;
switch (it->type()) {
switch (node.type()) {
case Node::Type::kFunction: {
LOG(INFO) << "add function " << it->repr();
AddFluidOp(&(*it));
LOG(INFO) << "add function " << node.repr();
AddFluidOp(&node);
} break;
case Node::Type::kFunctionBlock: {
LOG(INFO) << "add engine op " << it->repr() << " , "
<< static_cast<FunctionBlock*>(&(*it))->subgraph.size();
AddEngineOp(&(*it));
LOG(INFO) << "add engine op " << node.repr() << " , "
<< static_cast<FunctionBlock *>(&node)->subgraph.size();
AddEngineOp(&node);
} break;
default:
continue;
}
}
PADDLE_ENFORCE(argument_->transformed_program_desc.get());
}
void DataFlowGraphToFluidPass::AddFluidOp(Node* node) {
auto* ori_op = static_cast<framework::proto::OpDesc*>(node->pb_desc());
void DataFlowGraphToFluidPass::AddFluidOp(Node *node) {
auto *ori_op = static_cast<framework::proto::OpDesc *>(node->pb_desc());
// currently only the main block is analyzed.
auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
auto* op = main_block->add_ops();
auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
auto *op = main_block->add_ops();
*op = *ori_op; // copy the attributes, by default, these will not be changed
// by analysis phrase.
// by analysis phrase.
// The inputs and outputs of the existing ops are not changed by tensorrt
// subgraph pass.
// NOTE It might be changed by other passes in the long run.
}
void CreateTrtEngineOp(Node* node, const DataFlowGraph& graph,
const framework::proto::BlockDesc& block) {
void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
const framework::proto::BlockDesc &block) {
static int counter{0};
PADDLE_ENFORCE(node->IsFunctionBlock());
framework::OpDesc desc;
auto* func = static_cast<FunctionBlock*>(node);
auto *func = static_cast<FunctionBlock *>(node);
// collect inputs
std::vector<std::string> io;
for (auto* x : func->inlinks) {
for (auto *x : func->inlinks) {
io.push_back(x->name());
}
desc.SetInput("Xs", io);
// collect outputs
io.clear();
for (auto* x : func->outlinks) {
for (auto *x : func->outlinks) {
io.push_back(x->name());
}
desc.SetOutput("Ys", io);
desc.SetType("tensorrt_engine");
PADDLE_ENFORCE(!block.vars().empty(), "the block has no var-desc");
// Set attrs
SetAttr(desc.Proto(), "subgraph", block.SerializeAsString());
SetAttr(desc.Proto(), "engine_unique_key",
"trt-" + std::to_string(counter++));
SetAttr(desc.Proto(), "max_batch", 100); // TODO(Superjomn) add config latter
SetAttr(desc.Proto(), "max_workspace",
1024); // TODO(Superjomn) add config latter
SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++));
SetAttr(desc.Proto(), "max_batch", FLAGS_tensorrt_max_batchsize);
SetAttr(desc.Proto(), "max_workspace", FLAGS_tensorrt_workspace_size);
SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
node->SetPbMsg(desc.Proto()->SerializeAsString());
}
std::vector<std::string> ExtractParameters(
const std::vector<std::unique_ptr<Node>>& nodes) {
const std::vector<std::unique_ptr<Node>> &nodes) {
std::vector<std::string> parameters;
for (const auto& node : nodes) {
for (const auto &node : nodes) {
if (!node->IsValue()) continue;
PADDLE_ENFORCE(!node->pb_msg().empty(), "pb_msg should be set first");
framework::proto::VarDesc var;
......@@ -128,21 +133,30 @@ std::vector<std::string> ExtractParameters(
return parameters;
}
void DataFlowGraphToFluidPass::AddEngineOp(Node* node) {
void DataFlowGraphToFluidPass::AddEngineOp(Node *node) {
// TODO(Superjomn) Here need to expose some arguments for default setting.
PADDLE_ENFORCE(node->IsFunctionBlock());
auto* block_node = static_cast<FunctionBlock*>(node);
auto *block_node = static_cast<FunctionBlock *>(node);
framework::proto::BlockDesc proto;
framework::BlockDesc block_desc(nullptr, &proto);
block_desc.Proto()->set_parent_idx(-1);
block_desc.Proto()->set_idx(0);
LOG(INFO) << "origin variable size: "
<< argument_->origin_program_desc->blocks(0).vars().size();
LOG(INFO) << "transformed variable size: "
<< block_desc.Proto()->vars().size();
// copy ops.
for (auto* node : block_node->subgraph) {
auto* op = block_desc.AppendOp();
for (auto *node : block_node->subgraph) {
auto *op = block_desc.AppendOp();
PADDLE_ENFORCE(!node->pb_msg().empty());
op->Proto()->ParseFromString(node->pb_msg());
}
*block_desc.Proto()->mutable_vars() =
argument_->origin_program_desc->blocks(0).vars();
PADDLE_ENFORCE(!block_desc.Proto()->vars().empty());
CreateTrtEngineOp(node, *argument_->main_dfg, *block_desc.Proto());
auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
auto* op = main_block->add_ops();
auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
auto *op = main_block->add_ops();
PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block");
op->ParseFromString(node->pb_msg());
}
......@@ -151,7 +165,7 @@ namespace {
class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
public:
using Config = DFG_GraphvizDrawPass::Config;
explicit DFG_DebuggerPass(const Config& config)
explicit DFG_DebuggerPass(const Config &config)
: DFG_GraphvizDrawPass(config) {}
std::string repr() const override { return "dfg-to-fluid-debuger-pass"; }
......@@ -160,7 +174,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
};
} // namespace
Pass* DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
Pass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
FLAGS_inference_analysis_graphviz_log_root,
"data_flow_graph_to_fluid_graphviz_debugger"));
......
......@@ -26,6 +26,10 @@
namespace paddle {
namespace inference {
DECLARE_int32(tensorrt_max_batchsize);
DECLARE_int32(tensorrt_workspace_size);
namespace analysis {
class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
public:
......
......@@ -40,7 +40,7 @@ TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) {
no++;
}
// DFG is sensitive to ProgramDesc, be careful to change the existing models.
ASSERT_EQ(no, 82);
ASSERT_EQ(no, 83);
}
} // namespace analysis
......
......@@ -28,7 +28,6 @@ bool FluidToDataFlowGraphPass::Initialize(Argument *argument) {
ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc);
PADDLE_ENFORCE(argument);
if (!argument->main_dfg) {
LOG(INFO) << "Init DFG";
argument->main_dfg.reset(new DataFlowGraph);
}
desc_ = argument->origin_program_desc.get();
......@@ -51,6 +50,7 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
v->SetPbMsg(var.SerializeAsString());
var2id[var.name()] = v->id();
}
for (int i = 0; i < main_block.ops_size(); i++) {
const auto &op = main_block.ops(i);
auto *o = graph->nodes.Create(Node::Type::kFunction);
......@@ -62,19 +62,31 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
o->SetPbMsg(op.SerializeAsString());
// set inputs and outputs
// TODO(Superjomn) make sure the InputNames is the real variable name.
std::unordered_set<Node *> inlinks;
for (int j = 0; j < op.inputs_size(); j++) {
auto &in_var = op.inputs(j);
for (int k = 0; k < in_var.arguments_size(); k++) {
auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k)));
in->outlinks.push_back(o);
o->inlinks.push_back(in);
inlinks.insert(in);
}
}
for (int j = 0; j < op.outputs_size(); j++) {
auto &out_var = op.outputs(j);
for (int k = 0; k < out_var.arguments_size(); k++) {
auto *out = graph->nodes.GetMutable(var2id[out_var.arguments(k)]);
if (inlinks.count(out)) {
// Loop found, for example, a = op(a), use SSA, change to a1 = op(a).
auto *out_alias = graph->nodes.Create(Node::Type::kValue);
out_alias->SetName(out->name());
out_alias->SetPbDesc(out->pb_desc());
out_alias->SetPbMsg(out->pb_msg());
var2id[out_alias->name()] = out_alias->id(); // update a -> a0
LOG(INFO) << "loop found in graph, create SSA alias node ["
<< out_alias->repr() << "] for [" << out->repr() << "]";
out = out_alias;
}
out->inlinks.push_back(o);
o->outlinks.push_back(out);
}
......
......@@ -24,12 +24,12 @@ namespace analysis {
TEST_F(DFG_Tester, Init) {
FluidToDataFlowGraphPass pass;
pass.Initialize(&argument);
DataFlowGraph graph;
pass.Run(&graph);
pass.Run(argument.main_dfg.get());
// Analysis is sensitive to ProgramDesc, careful to change the original model.
ASSERT_EQ(graph.nodes.size(), 37UL);
ASSERT_EQ(argument.main_dfg->nodes.size(), 38UL);
pass.Finalize();
LOG(INFO) << '\n' << graph.DotString();
ASSERT_FALSE(argument.main_dfg->DotString().empty());
EXPECT_FALSE(argument.main_dfg->inputs.empty());
}
} // namespace analysis
......
......@@ -25,6 +25,9 @@ TensorRTSubGraphPass::TensorRTSubGraphPass(
void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
SubGraphFuse(graph, node_inside_subgraph_teller_)();
VLOG(4) << "debug info "
<< graph->HumanReadableInfo(false /*show_values*/,
true /*show_functions*/);
}
} // namespace analysis
......
......@@ -26,13 +26,13 @@ endif()
function(inference_api_test TARGET_NAME)
if (WITH_TESTING)
set(options "")
set(oneValueArgs "")
set(oneValueArgs SRC)
set(multiValueArgs ARGS)
cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests)
cc_test(${TARGET_NAME}
SRCS ${TARGET_NAME}.cc
SRCS ${inference_test_SRC}
DEPS "${inference_deps}"
ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
if(inference_test_ARGS)
......@@ -73,24 +73,24 @@ if(NOT APPLE)
endif()
cc_test(test_paddle_inference_api
SRCS test_api.cc
SRCS api_tester.cc
DEPS paddle_inference_api)
inference_api_test(test_api_impl
inference_api_test(test_api_impl SRC api_impl_tester.cc
ARGS test_word2vec test_image_classification)
if(WITH_GPU AND TENSORRT_FOUND)
cc_library(paddle_inference_tensorrt_subgraph_engine
SRCS api_tensorrt_subgraph_engine.cc
DEPS paddle_inference_api analysis tensorrt_engine paddle_fluid_api)
DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter)
inference_api_test(test_api_tensorrt_subgraph_engine ARGS test_word2vec)
inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec)
endif()
if (WITH_ANAKIN) # only needed in CI
# Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's,
# so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to
# compile the libinference_anakin_api.a and compile with anakin.so.
# compile the libinference_anakin_api.a and anakin.so.
nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc)
nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc)
target_compile_options(inference_anakin_api BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
......
......@@ -39,7 +39,7 @@ bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) {
bool PaddleInferenceAnakinPredictor::Run(
const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data) {
std::vector<PaddleTensor> *output_data, int batch_size) {
for (const auto &input : inputs) {
if (input.dtype != PaddleDType::FLOAT32) {
LOG(ERROR) << "Only support float type inputs. " << input.name
......
......@@ -37,7 +37,8 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor {
// NOTE Unlike the native engine, the buffers of anakin engine's output_data
// should be allocated first.
bool Run(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data) override;
std::vector<PaddleTensor>* output_data,
int batch_size = -1) override;
std::unique_ptr<PaddlePredictor> Clone() override;
......
......@@ -66,6 +66,7 @@ bool NativePaddlePredictor::Init(
if (parent_scope) {
scope_ = parent_scope;
sub_scope_ = &(parent_scope->NewScope());
PADDLE_ENFORCE_NOT_NULL(sub_scope_, "create sub scope fail");
} else {
paddle::framework::InitDevices(false);
scope_.reset(new paddle::framework::Scope());
......@@ -102,13 +103,13 @@ bool NativePaddlePredictor::Init(
NativePaddlePredictor::~NativePaddlePredictor() {
if (sub_scope_) {
PADDLE_ENFORCE_NOT_NULL(scope_, "Should have parent scope!");
scope_->DeleteScope(sub_scope_);
}
}
bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data) {
std::vector<PaddleTensor> *output_data,
int batch_size) {
VLOG(3) << "Predictor::predict";
Timer timer;
timer.tic();
......
......@@ -38,7 +38,8 @@ class NativePaddlePredictor : public PaddlePredictor {
bool Init(std::shared_ptr<framework::Scope> parent_scope);
bool Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data) override;
std::vector<PaddleTensor> *output_data,
int batch_size = -1) override;
std::unique_ptr<PaddlePredictor> Clone() override;
......
......@@ -16,6 +16,7 @@
#include "paddle/fluid/inference/api/api_impl.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/operators/tensorrt_engine_op.h"
namespace paddle {
......@@ -64,16 +65,7 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
return false;
}
// Analyze inference_program
Argument argument;
argument.origin_program_desc.reset(
new ProgramDesc(*inference_program_->Proto()));
Singleton<Analyzer>::Global().Run(&argument);
CHECK(argument.transformed_program_desc);
VLOG(5) << "transformed program:\n"
<< argument.transformed_program_desc->SerializeAsString();
VLOG(5) << "to prepare executor";
*inference_program_->Proto() = *argument.transformed_program_desc;
OptimizeInferenceProgram();
ctx_ = executor_->Prepare(*inference_program_, 0);
VLOG(5) << "to create variables";
......@@ -86,6 +78,29 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
return true;
}
// Runs prediction by delegating to NativePaddlePredictor::Run.
// batch_size must be greater than 0: the declared default of -1 is rejected
// by the PADDLE_ENFORCE_GT check below.
bool Run(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data,
int batch_size = -1) override {
PADDLE_ENFORCE_GT(batch_size, 0,
"TensorRT engine needs the argument batch_size set");
// Store the batch size in the flag before delegating.
// NOTE(review): presumably read by the tensorrt_engine op at run time, and
// being a flag it is shared by every predictor in the process - confirm.
FLAGS_tensorrt_engine_batch_size = batch_size;
return NativePaddlePredictor::Run(inputs, output_data, batch_size);
}
// Runs the global Analyzer on a copy of the current inference program and
// replaces inference_program_ with the transformed program it produces.
void OptimizeInferenceProgram() {
// Analyze inference_program
Argument argument;
// Hand the analyzer its own copy of the program proto.
argument.origin_program_desc.reset(
new ProgramDesc(*inference_program_->Proto()));
Singleton<Analyzer>::Global().Run(&argument);
// The analyzer is expected to have filled in the transformed program.
CHECK(argument.transformed_program_desc);
VLOG(5) << "transformed program:\n"
<< argument.transformed_program_desc->SerializeAsString();
VLOG(5) << "to prepare executor";
// Swap in a new program description built from the transformed proto.
inference_program_.reset(
new framework::ProgramDesc(*argument.transformed_program_desc));
}
private:
TensorRTConfig config_;
};
......
......@@ -15,50 +15,79 @@
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
namespace paddle {
DEFINE_string(dirname, "", "Directory of the inference model.");
void Main(bool use_gpu) {
void CompareTensorRTWithFluid(bool enable_tensorrt) {
FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = enable_tensorrt;
//# 1. Create PaddlePredictor with a config.
TensorRTConfig config;
config.model_dir = FLAGS_dirname + "word2vec.inference.model";
config.use_gpu = use_gpu;
config.fraction_of_gpu_memory = 0.15;
config.device = 0;
auto predictor =
NativeConfig config0;
config0.model_dir = FLAGS_dirname + "word2vec.inference.model";
config0.use_gpu = true;
config0.fraction_of_gpu_memory = 0.3;
config0.device = 0;
TensorRTConfig config1;
config1.model_dir = FLAGS_dirname + "word2vec.inference.model";
config1.use_gpu = true;
config1.fraction_of_gpu_memory = 0.3;
config1.device = 0;
auto predictor0 =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0);
auto predictor1 =
CreatePaddlePredictor<TensorRTConfig,
PaddleEngineKind::kAutoMixedTensorRT>(config);
PaddleEngineKind::kAutoMixedTensorRT>(config1);
for (int batch_id = 0; batch_id < 3; batch_id++) {
for (int batch_id = 0; batch_id < 1; batch_id++) {
//# 2. Prepare input.
int64_t data[4] = {1, 2, 3, 4};
std::vector<int64_t> data(20);
for (int i = 0; i < 20; i++) data[i] = i;
PaddleTensor tensor{.name = "",
.shape = std::vector<int>({4, 1}),
.data = PaddleBuf(data, sizeof(data)),
.dtype = PaddleDType::INT64};
PaddleTensor tensor{
.name = "",
.shape = std::vector<int>({10, 1}),
.data = PaddleBuf(data.data(), data.size() * sizeof(int64_t)),
.dtype = PaddleDType::INT64};
// For simplicity, we set all the slots with the same data.
std::vector<PaddleTensor> slots(4, tensor);
//# 3. Run
std::vector<PaddleTensor> outputs;
CHECK(predictor->Run(slots, &outputs));
std::vector<PaddleTensor> outputs0;
std::vector<PaddleTensor> outputs1;
CHECK(predictor0->Run(slots, &outputs0));
CHECK(predictor1->Run(slots, &outputs1, 10));
//# 4. Get output.
ASSERT_EQ(outputs.size(), 1UL);
LOG(INFO) << "output buffer size: " << outputs.front().data.length();
const size_t num_elements = outputs.front().data.length() / sizeof(float);
// The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
ASSERT_EQ(outputs0.size(), 1UL);
ASSERT_EQ(outputs1.size(), 1UL);
const size_t num_elements = outputs0.front().data.length() / sizeof(float);
const size_t num_elements1 = outputs1.front().data.length() / sizeof(float);
EXPECT_EQ(num_elements, num_elements1);
auto *data0 = static_cast<float *>(outputs0.front().data.data());
auto *data1 = static_cast<float *>(outputs1.front().data.data());
ASSERT_GT(num_elements, 0UL);
for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) {
EXPECT_NEAR(data0[i], data1[i], 1e-3);
}
}
}
TEST(paddle_inference_api_tensorrt_subgraph_engine, main) { Main(true); }
// Runs the comparison helper with its enable_tensorrt argument set to false.
TEST(paddle_inference_api_tensorrt_subgraph_engine, without_tensorrt) {
CompareTensorRTWithFluid(false);
}
// Runs the comparison helper with its enable_tensorrt argument set to true.
TEST(paddle_inference_api_tensorrt_subgraph_engine, with_tensorrt) {
CompareTensorRTWithFluid(true);
}
} // namespace paddle
......@@ -35,7 +35,8 @@ class DemoPredictor : public PaddlePredictor {
LOG(INFO) << "I get other_config " << config.other_config;
}
bool Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data) override {
std::vector<PaddleTensor> *output_data,
int batch_size = 0) override {
LOG(INFO) << "Run";
return false;
}
......
......@@ -57,4 +57,4 @@ By specifying the engine kind and config, one can get a specific implementation.
## Reference
- [paddle_inference_api.h](./paddle_inference_api.h)
- [some demos](./demo)
- [some demos](./demo_ci)
......@@ -83,5 +83,5 @@ CHECK(predictor->Run(slots, &outputs));
## 详细代码参考
- [inference demos](./demo)
- [复杂单线程/多线程例子](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/contrib/inference/test_paddle_inference_api_impl.cc)
- [inference demos](./demo_ci)
- [复杂单线程/多线程例子](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/api/test_api_impl.cc)
......@@ -98,7 +98,8 @@ class PaddlePredictor {
// responsible for the output tensor's buffer, either allocated or passed from
// outside.
virtual bool Run(const std::vector<PaddleTensor>& inputs,
std::vector<PaddleTensor>* output_data) = 0;
std::vector<PaddleTensor>* output_data,
int batch_size = -1) = 0;
// Clone a predictor that shares the model weights; the cloned predictor should
// be thread-safe.
......
......@@ -93,6 +93,10 @@ class OpConverter {
framework::Scope* scope_{nullptr};
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
#define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__) \
struct trt_##op_type__##_converter : public ::paddle::framework::Registrar { \
trt_##op_type__##_converter() { \
......@@ -111,7 +115,3 @@ class OpConverter {
extern int TouchConverterRegister_##op_type__(); \
static int use_op_converter_trt_##op_type__ __attribute__((unused)) = \
TouchConverterRegister_##op_type__();
} // namespace tensorrt
} // namespace inference
} // namespace paddle
......@@ -28,18 +28,20 @@ namespace tensorrt {
int TensorRTEngine::runtime_batch_ = 1;
void TensorRTEngine::Build(const DescType& paddle_model) {
void TensorRTEngine::Build(const DescType &paddle_model) {
PADDLE_ENFORCE(false, "not implemented");
}
void TensorRTEngine::Execute(int batch_size) {
std::vector<void*> buffers;
for (auto& buf : buffers_) {
batch_size_ = batch_size;
std::vector<void *> buffers;
for (auto &buf : buffers_) {
PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated");
PADDLE_ENFORCE_GT(buf.max_size, 0);
PADDLE_ENFORCE(buf.device == DeviceType::GPU);
buffers.push_back(buf.buffer);
}
PADDLE_ENFORCE_NOT_NULL(stream_);
infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr);
cudaStreamSynchronize(*stream_);
SetRuntimeBatch(batch_size);
......@@ -48,7 +50,7 @@ void TensorRTEngine::Execute(int batch_size) {
TensorRTEngine::~TensorRTEngine() {
cudaStreamSynchronize(*stream_);
// clean buffer
for (auto& buf : buffers_) {
for (auto &buf : buffers_) {
if (buf.device == DeviceType::GPU && buf.buffer != nullptr) {
PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer));
buf.buffer = nullptr;
......@@ -73,33 +75,37 @@ void TensorRTEngine::FreezeNetwork() {
// allocate GPU buffers.
buffers_.resize(buffer_sizes_.size());
for (auto& item : buffer_sizes_) {
for (auto &item : buffer_sizes_) {
// The output buffers are not set in the network building phase, so their
// sizes need to be inferred from the TensorRT network.
if (item.second == 0) {
auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str());
auto dims = infer_engine_->getBindingDimensions(slot_offset);
item.second = kDataTypeSize[static_cast<int>(
infer_engine_->getBindingDataType(slot_offset))] *
analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
PADDLE_ENFORCE_GT(item.second, 0);
}
auto& buf = buffer(item.first);
auto &buf = buffer(item.first);
buf.max_size = item.second * max_batch_;
CHECK(buf.buffer == nullptr); // buffer should be allocated only once.
PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second * max_batch_));
VLOG(4) << "buffer malloc " << item.first << " " << item.second << " "
<< buf.buffer;
buf.size = item.second;
buf.max_size = item.second * max_batch_;
buf.size = 0;
PADDLE_ENFORCE_LE(buf.max_size, 1 << 30); // 1G
buf.device = DeviceType::GPU;
}
}
nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name,
nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
nvinfer1::DataType dtype,
const nvinfer1::Dims& dims) {
const nvinfer1::Dims &dims) {
PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate input name %s",
name);
PADDLE_ENFORCE(infer_network_ != nullptr, "should initnetwork first");
auto* input = infer_network_->addInput(name.c_str(), dtype, dims);
auto *input = infer_network_->addInput(name.c_str(), dtype, dims);
PADDLE_ENFORCE(input, "infer network add input %s failed", name);
buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] *
analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
......@@ -108,12 +114,12 @@ nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name,
return input;
}
void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset,
const std::string& name) {
void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset,
const std::string &name) {
PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
name);
auto* output = layer->getOutput(offset);
auto *output = layer->getOutput(offset);
SetITensor(name, output);
PADDLE_ENFORCE(output != nullptr);
output->setName(name.c_str());
......@@ -125,11 +131,11 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset,
buffer_sizes_[name] = 0;
}
void TensorRTEngine::DeclareOutput(const std::string& name) {
void TensorRTEngine::DeclareOutput(const std::string &name) {
PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
name);
auto* output = TensorRTEngine::GetITensor(name);
auto *output = TensorRTEngine::GetITensor(name);
PADDLE_ENFORCE(output != nullptr);
output->setName(name.c_str());
PADDLE_ENFORCE(!output->isNetworkInput());
......@@ -139,13 +145,13 @@ void TensorRTEngine::DeclareOutput(const std::string& name) {
buffer_sizes_[name] = 0;
}
void* TensorRTEngine::GetOutputInGPU(const std::string& name) {
void *TensorRTEngine::GetOutputInGPU(const std::string &name) {
return buffer(name).buffer;
}
void TensorRTEngine::GetOutputInGPU(const std::string& name, void* dst) {
void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst) {
// determine data size
auto* output = TensorRTEngine::GetITensor(name);
auto *output = TensorRTEngine::GetITensor(name);
nvinfer1::Dims dims = output->getDimensions();
auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
size_t dst_size = dim_size * runtime_batch_ *
......@@ -155,17 +161,17 @@ void TensorRTEngine::GetOutputInGPU(const std::string& name, void* dst) {
PADDLE_ENFORCE(it != buffer_sizes_.end());
PADDLE_ENFORCE_GT(it->second, 0);
PADDLE_ENFORCE_LE(dst_size, it->second);
auto& buf = buffer(name);
auto &buf = buffer(name);
PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size,
cudaMemcpyDeviceToDevice, *stream_),
0);
}
void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst) {
void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst) {
// determine data size
auto* output = TensorRTEngine::GetITensor(name);
auto *output = TensorRTEngine::GetITensor(name);
nvinfer1::Dims dims = output->getDimensions();
auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
size_t dst_size = dim_size * runtime_batch_ *
......@@ -174,13 +180,13 @@ void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst) {
PADDLE_ENFORCE(it != buffer_sizes_.end());
PADDLE_ENFORCE_GT(it->second, 0);
PADDLE_ENFORCE_LE(dst_size, it->second);
auto& buf = buffer(name);
auto &buf = buffer(name);
PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size,
cudaMemcpyDeviceToHost, *stream_));
}
Buffer& TensorRTEngine::buffer(const std::string& name) {
Buffer &TensorRTEngine::buffer(const std::string &name) {
PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
auto it = buffer_sizes_.find(name);
PADDLE_ENFORCE(it != buffer_sizes_.end());
......@@ -188,19 +194,23 @@ Buffer& TensorRTEngine::buffer(const std::string& name) {
return buffers_[slot_offset];
}
void TensorRTEngine::SetInputFromCPU(const std::string& name, const void* data,
void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data,
size_t size) {
auto& buf = buffer(name);
auto &buf = buffer(name);
PADDLE_ENFORCE_NOT_NULL(buf.buffer);
PADDLE_ENFORCE_NOT_NULL(data);
PADDLE_ENFORCE_NOT_NULL(stream_);
PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
PADDLE_ENFORCE(buf.device == DeviceType::GPU);
buf.size = size;
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
cudaMemcpyHostToDevice, *stream_));
}
void TensorRTEngine::SetInputFromGPU(const std::string& name, const void* data,
void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data,
size_t size) {
auto& buf = buffer(name);
auto &buf = buffer(name);
buf.size = size;
PADDLE_ENFORCE_NOT_NULL(buf.buffer);
PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
PADDLE_ENFORCE(buf.device == DeviceType::GPU);
......@@ -208,15 +218,15 @@ void TensorRTEngine::SetInputFromGPU(const std::string& name, const void* data,
cudaMemcpyDeviceToDevice, *stream_));
}
void TensorRTEngine::SetITensor(const std::string& name,
nvinfer1::ITensor* tensor) {
void TensorRTEngine::SetITensor(const std::string &name,
nvinfer1::ITensor *tensor) {
PADDLE_ENFORCE(tensor != nullptr);
PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate ITensor name %s",
name);
itensor_map_[name] = tensor;
}
nvinfer1::ITensor* TensorRTEngine::GetITensor(const std::string& name) {
nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) {
PADDLE_ENFORCE(itensor_map_.count(name), "no ITensor %s", name);
return itensor_map_[name];
}
......
......@@ -57,7 +57,9 @@ class TensorRTEngine : public EngineBase {
: max_batch_(max_batch),
max_workspace_(max_workspace),
stream_(stream ? stream : &default_stream_),
logger_(logger) {}
logger_(logger) {
cudaStreamCreate(&default_stream_);
}
virtual ~TensorRTEngine();
......@@ -125,6 +127,9 @@ class TensorRTEngine : public EngineBase {
static int runtime_batch_;
// the max memory size the engine uses
int max_workspace_;
// Batch size of the current data; updated on each execution.
int batch_size_{-1};
cudaStream_t* stream_;
// If stream_ is not set from outside, hold its own stream.
cudaStream_t default_stream_;
......
......@@ -103,6 +103,11 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
LOG(INFO) << "to get output";
float y_cpu[2] = {-1., -1.};
auto dims = engine_->GetITensor("y")->getDimensions();
ASSERT_EQ(dims.nbDims, 3);
ASSERT_EQ(dims.d[0], 2);
ASSERT_EQ(dims.d[1], 1);
engine_->GetOutputInCPU("y", &y_cpu[0]);
ASSERT_EQ(y_cpu[0], 4.5);
ASSERT_EQ(y_cpu[1], 14.5);
......
......@@ -168,6 +168,8 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP(relu);\n")
elseif(${TARGET} STREQUAL "fake_dequantize")
file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
elseif(${TARGET} STREQUAL "tensorrt_engine_op")
message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference")
else()
file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
endif()
......@@ -237,9 +239,9 @@ op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
op_library(softmax_op DEPS softmax)
op_library(sequence_softmax_op DEPS softmax)
if (WITH_GPU AND TENSORRT_FOUND)
op_library(tensorrt_engine_op DEPS tensorrt_engine)
op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter)
nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
DEPS tensorrt_engine_op tensorrt_engine tensorrt_converter
DEPS tensorrt_engine_op
analysis)
else()
set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op)
......
......@@ -293,11 +293,18 @@ class AdamOpKernel : public framework::OpKernel<T> {
auto& grad_tensor = grad_merge.value();
const T* grad_data = grad_tensor.template data<T>();
int64_t* rows = nullptr;
// When compiled without CUDA, the CUDAMutableData() interface should not be
// provided.
#if defined(PADDLE_WITH_CUDA)
if (platform::is_gpu_place(ctx.GetPlace())) {
rows = grad_merge.mutable_rows()->CUDAMutableData(ctx.GetPlace());
} else {
#endif
rows = grad_merge.mutable_rows()->data();
#if defined(PADDLE_WITH_CUDA)
}
#endif
auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
SparseAdamFunctor<T> functor(
......
......@@ -24,15 +24,16 @@ class AucOp : public framework::OperatorWithKernel {
protected:
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Out"), "Input of Out should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Indices"),
"Input of Indices should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Predict"),
"Input of Out should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Label"),
"Input of Label should not be null.");
auto inference_height = ctx->GetInputDim("Out")[0];
auto predict_width = ctx->GetInputDim("Predict")[1];
PADDLE_ENFORCE_EQ(predict_width, 2, "Only support binary classification");
auto predict_height = ctx->GetInputDim("Predict")[0];
auto label_height = ctx->GetInputDim("Label")[0];
PADDLE_ENFORCE_EQ(inference_height, label_height,
PADDLE_ENFORCE_EQ(predict_height, label_height,
"Out and Label should have same height.");
int num_thres = ctx->Attrs().Get<int>("num_thresholds");
......@@ -43,14 +44,14 @@ class AucOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("FPOut", {num_thres});
ctx->SetOutputDim("FNOut", {num_thres});
ctx->ShareLoD("Out", /*->*/ "AUC");
ctx->ShareLoD("Predict", /*->*/ "AUC");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Out")->type()),
framework::ToDataType(ctx.Input<Tensor>("Predict")->type()),
ctx.device_context());
}
};
......@@ -58,18 +59,13 @@ class AucOp : public framework::OperatorWithKernel {
class AucOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Out",
"A floating point 2D tensor, values are in the range [0, 1]."
"Each row is sorted in descending order. This input should be the"
"output of topk."
AddInput("Predict",
"A floating point 2D tensor with shape [batch_size, 2], values "
"are in the range [0, 1]."
"Typically, this tensor indicates the probability of each label");
AddInput("Indices",
"An int 2D tensor, indicating the indices of original"
"tensor before sorting. Typically, this tensor indicates which "
"label the probability stands for.");
AddInput("Label",
"A 2D int tensor indicating the label of the training data."
"The height is batch size and width is always 1.");
"A 2D int tensor indicating the label of the training data. "
"shape: [batch_size, 1]");
AddInput("TP", "True-Positive value.");
AddInput("FP", "False-Positive value.");
AddInput("TN", "True-Negative value.");
......
......@@ -31,7 +31,7 @@ template <typename DeviceContext, typename T>
class AucKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* inference = ctx.Input<Tensor>("Out");
auto* predict = ctx.Input<Tensor>("Predict");
auto* label = ctx.Input<Tensor>("Label");
auto* auc = ctx.Output<Tensor>("AUC");
// Only use output var for now, make sure it's persistable and
......@@ -41,24 +41,24 @@ class AucKernel : public framework::OpKernel<T> {
auto* true_negative = ctx.Output<Tensor>("TNOut");
auto* false_negative = ctx.Output<Tensor>("FNOut");
float* auc_data = auc->mutable_data<float>(ctx.GetPlace());
auto* auc_data = auc->mutable_data<double>(ctx.GetPlace());
std::string curve = ctx.Attr<std::string>("curve");
int num_thresholds = ctx.Attr<int>("num_thresholds");
std::vector<float> thresholds_list;
std::vector<double> thresholds_list;
thresholds_list.reserve(num_thresholds);
for (int i = 1; i < num_thresholds - 1; i++) {
thresholds_list[i] = static_cast<float>(i) / (num_thresholds - 1);
thresholds_list[i] = static_cast<double>(i) / (num_thresholds - 1);
}
const float kEpsilon = 1e-7;
const double kEpsilon = 1e-7;
thresholds_list[0] = 0.0f - kEpsilon;
thresholds_list[num_thresholds - 1] = 1.0f + kEpsilon;
size_t batch_size = inference->dims()[0];
size_t inference_width = inference->dims()[1];
size_t batch_size = predict->dims()[0];
size_t inference_width = predict->dims()[1];
const T* inference_data = inference->data<T>();
const int64_t* label_data = label->data<int64_t>();
const T* inference_data = predict->data<T>();
const auto* label_data = label->data<int64_t>();
auto* tp_data = true_positive->mutable_data<int64_t>(ctx.GetPlace());
auto* fn_data = false_negative->mutable_data<int64_t>(ctx.GetPlace());
......@@ -66,20 +66,19 @@ class AucKernel : public framework::OpKernel<T> {
auto* fp_data = false_positive->mutable_data<int64_t>(ctx.GetPlace());
for (int idx_thresh = 0; idx_thresh < num_thresholds; idx_thresh++) {
// caculate TP, FN, TN, FP for current thresh
// calculate TP, FN, TN, FP for current thresh
int64_t tp = 0, fn = 0, tn = 0, fp = 0;
for (size_t i = 0; i < batch_size; i++) {
// NOTE: label_data used as bool, labels >0 will be treated as true.
// NOTE: label_data used as bool, labels > 0 will be treated as true.
if (label_data[i]) {
// use first(max) data in each row
if (inference_data[i * inference_width] >=
if (inference_data[i * inference_width + 1] >=
(thresholds_list[idx_thresh])) {
tp++;
} else {
fn++;
}
} else {
if (inference_data[i * inference_width] >=
if (inference_data[i * inference_width + 1] >=
(thresholds_list[idx_thresh])) {
fp++;
} else {
......@@ -94,21 +93,21 @@ class AucKernel : public framework::OpKernel<T> {
fp_data[idx_thresh] += fp;
}
// epsilon to avoid divide by zero.
float epsilon = 1e-6;
double epsilon = 1e-6;
// Riemann sum to calculate AUC.
Tensor tp_rate, fp_rate, rec_rate;
tp_rate.Resize({num_thresholds});
fp_rate.Resize({num_thresholds});
rec_rate.Resize({num_thresholds});
float* tp_rate_data = tp_rate.mutable_data<float>(ctx.GetPlace());
float* fp_rate_data = fp_rate.mutable_data<float>(ctx.GetPlace());
float* rec_rate_data = rec_rate.mutable_data<float>(ctx.GetPlace());
auto* tp_rate_data = tp_rate.mutable_data<double>(ctx.GetPlace());
auto* fp_rate_data = fp_rate.mutable_data<double>(ctx.GetPlace());
auto* rec_rate_data = rec_rate.mutable_data<double>(ctx.GetPlace());
for (int i = 0; i < num_thresholds; i++) {
tp_rate_data[i] = (static_cast<float>(tp_data[i]) + epsilon) /
tp_rate_data[i] = (static_cast<double>(tp_data[i]) + epsilon) /
(tp_data[i] + fn_data[i] + epsilon);
fp_rate_data[i] =
static_cast<float>(fp_data[i]) / (fp_data[i] + tn_data[i] + epsilon);
rec_rate_data[i] = (static_cast<float>(tp_data[i]) + epsilon) /
static_cast<double>(fp_data[i]) / (fp_data[i] + tn_data[i] + epsilon);
rec_rate_data[i] = (static_cast<double>(tp_data[i]) + epsilon) /
(tp_data[i] + fp_data[i] + epsilon);
}
*auc_data = 0.0f;
......
......@@ -106,7 +106,11 @@ class TargetAssignKernel : public framework::OpKernel<T> {
int64_t k = x->dims()[2];
auto x_lod = x->lod().back();
#if defined(PADDLE_WITH_CUDA)
size_t* x_lod_data = x_lod.MutableData(ctx.GetPlace());
#else
size_t* x_lod_data = x_lod.data();
#endif
TargetAssignFunctor<T, WT> functor(x_data, match_idx_data, x_lod_data,
mismatch_value, n, m, p, k, out_data,
......@@ -121,7 +125,11 @@ class TargetAssignKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ(neg_indices->lod().size(), 1UL);
const int* neg_idx_data = neg_indices->data<int>();
auto neg_lod = neg_indices->lod().back();
#if defined(PADDLE_WITH_CUDA)
size_t* neg_lod_data = neg_lod.MutableData(ctx.GetPlace());
#else
size_t* neg_lod_data = neg_lod.data();
#endif
NegTargetAssignFunctor<DeviceContext, T, WT> neg_trg_functor;
neg_trg_functor(device_ctx, neg_idx_data, neg_lod_data, n, m, k,
mismatch_value, out_data, out_wt_data);
......
# Skip this directory entirely unless distributed support is enabled.
if(NOT WITH_DISTRIBUTE)
return()
endif()
# Choose the value of cc_generic_services by RPC backend: "false" when gRPC is
# used, "true" otherwise.
# NOTE(review): presumably consumed as @cc_generic_services@ by
# send_recv.proto.in -- confirm against the template.
if(WITH_GRPC)
set(cc_generic_services "false")
else()
set(cc_generic_services "true")
endif()
# Generate send_recv.proto from its template; @ONLY restricts substitution to
# @VAR@ references.
# NOTE(review): the output is written into the source directory rather than
# the binary directory.
configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY)
if(WITH_GRPC)
grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor
selected_rows memory)
grpc_library(sendrecvop_grpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc
PROTO send_recv.proto
DEPS lod_tensor selected_rows memory)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(serde_test SRCS grpc_serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr
cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
cc_test(grpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_grpc
grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor
proto_desc lookup_table_op SERIAL)
cc_test(grpc_serde_test SRCS grpc_serde_test.cc
DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
cc_test(grpc_server_test SRCS rpc_server_test.cc
DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_table_op SERIAL)
return()
endif()
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc
set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc
brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc
brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc
PROTO send_recv.proto
DEPS lod_tensor selected_rows memory)
find_library(OPENSSL_CRYPTO_LIBRARY_STATIC NAMES libcrypto.so)
ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY_STATIC})
set(brpc_test_depends sendrecvop_brpc brpc ssl crypto protobuf leveldb gflags glog executor proto_desc lookup_table_op snappystream snappy)
find_library(OPENSSL_SSL_LIBRARY_STATIC NAMES libssl.so)
ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${OPENSSL_SSL_LIBRARY_STATIC})
cc_test(brpc_server_test SRCS rpc_server_test.cc
DEPS ${brpc_test_depends} SERIAL)
cc_test(brpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_brpc
brpc protobuf leveldb gflags glog
protobuf executor proto_desc lookup_table_op snappystream snappy ssl crypto SERIAL)
cc_test(brpc_serde_test SRCS brpc_serde_test.cc
DEPS ${brpc_test_depends} SERIAL)
......@@ -17,7 +17,7 @@ limitations under the License. */
// file and did some modifications so that we can send gRPC
// requests without too much copying of the tensor data.
#include "paddle/fluid/operators/distributed/bytebuffer_stream.h"
#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h"
namespace paddle {
namespace operators {
......
......@@ -24,6 +24,7 @@ limitations under the License. */
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h"
#include "grpc++/grpc++.h"
#include "paddle/fluid/operators/distributed/variable_response.h"
namespace grpc {
// A ZeroCopyInputStream that reads from grpc_byte_buffer
......@@ -107,25 +108,6 @@ class GrpcBufferReader final
namespace paddle {
namespace operators {
namespace distributed {
// Source provides a way for a particular RPC implementation to provide
// received data to ParseFrom.
class Source {
public:
virtual ~Source() {}
// Return the stream that contains the data to be parsed.
// Note that this method might be invoked more than once if
// ParseFrom needs to fall back to a more expensive parsing method.
// Every call must return a stream pointing at the beginning of
// the serialized RecvTensorResponse.
//
// Note that a subsequent call to contents() invalidates previous
// results of contents().
//
// Ownership of the returned stream is retained by the Source and
// should not be deleted by the caller.
virtual ::google::protobuf::io::ZeroCopyInputStream* contents() = 0;
};
// A ZeroCopyInputStream that reads from a grpc::ByteBuffer.
class GrpcByteBufferSource
......
......@@ -20,6 +20,7 @@ limitations under the License. */
#include "glog/logging.h" // For VLOG
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/distributed/grpc_serde.h"
#include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/platform/profiler.h"
......
......@@ -38,7 +38,10 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/distributed/request_handler.h"
#include "paddle/fluid/operators/distributed/rpc_client.h"
#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
......@@ -46,23 +49,6 @@ namespace paddle {
namespace operators {
namespace distributed {
struct VarHandle {
// RPC endpoint.
std::string ep;
const platform::DeviceContext* ctx;
const framework::Scope* scope;
// Variable name.
std::string name;
// RPC method name.
std::string method;
std::string String() const {
std::ostringstream s;
s << method << " name:[" << name << "], ep:[" << ep << "]";
return s.str();
}
};
void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg);
class BaseProcessor {
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_CUDA
#include <nccl.h>
#endif
#include <sys/time.h>
#include <thread> // NOLINT
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h"
#include "paddle/fluid/operators/distributed/grpc_serde.h"
#include "paddle/fluid/operators/distributed/grpc_variable_response.h"
#include "paddle/fluid/operators/distributed/proto_encoder_helper.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace operators {
namespace distributed {
// Serializes the variable `var` (sent under `name`) into the gRPC ByteBuffer
// `msg`.
//
// The resulting buffer is built from slices:
//   [0] encoded VarMsg header followed by the "serialized" field tag/length,
//   [1] the tensor payload (referenced, not copied into the message),
//   [2] the "rows" field tag/length   (SelectedRows only),
//   [3] the rows data                 (SelectedRows only, referenced).
// An ncclUniqueId is small, so it is copied directly into a single slice.
//
// Args:
//   name:     variable name recorded in the message.
//   var:      variable to serialize; must hold a LoDTensor, SelectedRows or
//             (CUDA builds only) an ncclUniqueId, otherwise this throws.
//   ctx:      device context used to fetch the payload.
//   msg:      output; its previous content is swapped out.
//   out_name: optional output variable name; recorded only when non-empty.
void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
                           const platform::DeviceContext& ctx,
                           ::grpc::ByteBuffer* msg,
                           const std::string& out_name) {
  // Default DestroyCallback does nothing. When using GPU
  // the CPU buffer needs to be freed.
  DestroyCallback destroy_callback = [](void* backing) {};
  VarMsg request;
  void* payload = nullptr;
  // Initialised so that no path can ever read an indeterminate size.
  size_t payload_size = 0;

  request.set_varname(name);
  // Note: normally the profiler is enabled in 1 trainer, hence only
  // 1 trainer returns true for ShouldSendProfileState(). It tells PS
  // servers the trainer's profiling state so that PS can follow the
  // trainer.
  if (platform::ShouldSendProfileState()) {
    if (platform::IsProfileEnabled()) {
      request.set_profile(platform::kEnableProfiler);
    } else {
      request.set_profile(platform::kDisableProfiler);
    }
  }
  if (!out_name.empty()) {
    request.set_out_varname(out_name);
  }
  if (var->IsType<framework::LoDTensor>()) {
    request.set_type(::sendrecv::LOD_TENSOR);
    GetTensorPayload(var, ctx, &request, &payload, &payload_size);
  } else if (var->IsType<framework::SelectedRows>()) {
    request.set_type(::sendrecv::SELECTED_ROWS);
    GetSelectedRowsPayload(var, ctx, &request, &payload, &payload_size);
#ifdef PADDLE_WITH_CUDA
  } else if (var->IsType<ncclUniqueId>()) {
    request.set_type(::sendrecv::NCCL_ID);
#endif
  } else {
    // Report the type held by the variable. Applying typeid() to the value
    // returned by Type() would name that wrapper object's type instead.
    PADDLE_THROW("Serialize does not support type: %s", var->Type().name());
  }

  if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
    // GPU data is copied to CPU buffer when sending,
    // free the buffer when possible.
    destroy_callback = [](void* backing) {
      platform::CUDAPinnedPlace cuda_pinned;
      memory::Free(cuda_pinned, backing);
    };
#endif
  }

  std::string header;
  request.AppendToString(&header);
  // Scratch space for the encoded header and field prefixes.
  auto buffer = std::unique_ptr<char[]>(new char[1024]);
  void* buf = buffer.get();
  ProtoEncodeHelper e(static_cast<char*>(buf), 1024);
  e.WriteRawBytes(header);

// NCCLID is copied directly to the message, return bytebuffer
// with only one slice if serializing NCCLID.
#ifdef PADDLE_WITH_CUDA
  if (var->IsType<ncclUniqueId>()) {
    e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber,
                              NCCL_UNIQUE_ID_BYTES);
    const ncclUniqueId& uid = var->Get<ncclUniqueId>();
    e.WriteRawBytes(std::string(uid.internal, NCCL_UNIQUE_ID_BYTES));

    // for serialize NCCL_ID
    ::grpc::Slice slices(e.size());
    memcpy(const_cast<uint8_t*>(slices.begin()), e.data(), e.size());
    ::grpc::ByteBuffer tmp(&slices, 1);
    msg->Swap(&tmp);
    return;
  }
#endif

  e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
  // steal reference of tensor data
  ::grpc::Slice slices[4];  // metadata, tensor, rows meta, rows
  int num_slices = 2;       // only SelectedRows have rows buffer
  slices[0] = ::grpc::Slice(e.size());
  memcpy(const_cast<uint8_t*>(slices[0].begin()), e.data(), e.size());
  // The payload is handed to gRPC without copying; destroy_callback is
  // supplied to gRPC as the slice's destroy function.
  slices[1] = ::grpc::Slice(
      grpc_slice_new_with_user_data(payload, payload_size, destroy_callback,
                                    static_cast<char*>(payload)),
      ::grpc::Slice::STEAL_REF);

  if (var->IsType<framework::SelectedRows>()) {
    auto* slr = var->GetMutable<framework::SelectedRows>();
    // Reuses the scratch buffer: the header bytes were already copied into
    // slices[0] above, so overwriting them here is safe.
    ProtoEncodeHelper e2(static_cast<char*>(buf), 128);
    size_t rows_memory_size =
        slr->rows().size() * framework::SizeOfType(typeid(int64_t));
    e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size);
    slices[2] = ::grpc::Slice(e2.size());
    memcpy(const_cast<uint8_t*>(slices[2].begin()), e2.data(), e2.size());

    // The rows buffer is referenced with a no-op destroy callback.
    // NOTE(review): this presumably requires the SelectedRows to outlive the
    // message -- confirm with callers.
    slices[3] = ::grpc::Slice(
        grpc_slice_new_with_user_data(
            const_cast<void*>(
                reinterpret_cast<const void*>(slr->rows().data())),
            rows_memory_size, [](void* backing) {},
            const_cast<char*>(
                reinterpret_cast<const char*>(slr->rows().data()))),
        ::grpc::Slice::STEAL_REF);
    num_slices = 4;
  }

  ::grpc::ByteBuffer tmp(&slices[0], num_slices);
  msg->Swap(&tmp);
}
// Parses the gRPC ByteBuffer `msg` with a GRPCVariableResponse bound to
// `scope` and `ctx`, and stores the variable it reports in `*var`.
// Fails the enforce check when parsing returns a non-zero status.
void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
                               const platform::DeviceContext& ctx,
                               const framework::Scope* scope,
                               framework::Variable** var) {
  GRPCVariableResponse response(scope, &ctx);
  const int parse_status = response.Parse(msg);
  PADDLE_ENFORCE(parse_status == 0, "parse bytebuffer to tensor error!");
  *var = response.GetVar();
}
} // namespace distributed
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <sys/time.h>
#include <iostream>
#include <string>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
namespace paddle {
namespace operators {
namespace distributed {
// Plain function pointer taking the address of a backing buffer.
// NOTE(review): intended as the release hook for serialized payload buffers
// -- confirm against the serializer implementation.
typedef void (*DestroyCallback)(void*);
// Serializes variable `var`, identified by `name`, into the gRPC byte buffer
// `msg` using device context `ctx`.
// `out_varname` is optional and defaults to an empty string.
void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
const platform::DeviceContext& ctx,
::grpc::ByteBuffer* msg,
const std::string& out_varname = std::string());
// Deserializes the gRPC byte buffer `msg` using `ctx` and `scope`; the
// resulting variable pointer is returned through `var`.
void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx,
const framework::Scope* scope,
framework::Variable** var);
} // namespace distributed
} // namespace operators
} // namespace paddle
......@@ -21,8 +21,10 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/distributed/grpc_serde.h"
#include "paddle/fluid/operators/distributed/grpc_variable_response.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/variable_response.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/printf.h"
......@@ -84,7 +86,7 @@ void RunSerdeTestSelectedRows(platform::Place place) {
// operators::distributed::DeserializeFromByteBuffer(msg, ctx, &var2);
framework::Scope scope;
scope.Var("myvar");
operators::distributed::VariableResponse resp(&scope, &ctx);
operators::distributed::GRPCVariableResponse resp(&scope, &ctx);
EXPECT_EQ(resp.Parse(msg), 0);
framework::Variable* var2 = resp.GetVar();
......@@ -171,7 +173,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) {
// deserialize zero-copy
framework::Scope scope;
scope.Var("myvar");
operators::distributed::VariableResponse resp(&scope, &ctx);
operators::distributed::GRPCVariableResponse resp(&scope, &ctx);
if (from_type == 0) {
EXPECT_EQ(resp.Parse(msg), 0);
} else {
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#include <limits>
#include <string>
#include "paddle/fluid/operators/distributed/grpc_serde.h"
#include "paddle/fluid/operators/distributed/grpc_server.h"
using ::grpc::ServerAsyncResponseWriter;
......@@ -84,9 +85,9 @@ class RequestSend final : public RequestBase {
::grpc::ServerCompletionQueue* cq,
RequestHandler* request_handler, int req_id)
: RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
request_.reset(new VariableResponse(request_handler->scope(),
request_handler->dev_ctx(),
!request_handler->sync_mode()));
request_.reset(new GRPCVariableResponse(request_handler->scope(),
request_handler->dev_ctx(),
!request_handler->sync_mode()));
int method_id = static_cast<int>(distributed::GrpcMethod::kSendVariable);
service_->RequestAsyncUnary(
method_id, &ctx_, request_.get(), &responder_, cq_, cq_,
......@@ -109,7 +110,7 @@ class RequestSend final : public RequestBase {
protected:
sendrecv::VoidMessage reply_;
std::shared_ptr<VariableResponse> request_;
std::shared_ptr<GRPCVariableResponse> request_;
ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
};
......@@ -161,8 +162,8 @@ class RequestPrefetch final : public RequestBase {
: RequestBase(service, cq, request_handler, req_id),
responder_(&ctx_),
local_scope_(nullptr) {
request_.reset(new VariableResponse(request_handler->scope(),
request_handler->dev_ctx(), true));
request_.reset(new GRPCVariableResponse(request_handler->scope(),
request_handler->dev_ctx(), true));
int method_id =
static_cast<int>(distributed::GrpcMethod::kPrefetchVariable);
service_->RequestAsyncUnary(
......@@ -194,7 +195,7 @@ class RequestPrefetch final : public RequestBase {
}
protected:
std::shared_ptr<VariableResponse> request_;
std::shared_ptr<GRPCVariableResponse> request_;
::grpc::ByteBuffer reply_;
ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
framework::Scope* local_scope_;
......@@ -206,8 +207,8 @@ class RequestCheckpointNotify final : public RequestBase {
::grpc::ServerCompletionQueue* cq,
RequestHandler* request_handler, int req_id)
: RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) {
request_.reset(new VariableResponse(request_handler->scope(),
request_handler->dev_ctx()));
request_.reset(new GRPCVariableResponse(request_handler->scope(),
request_handler->dev_ctx()));
int method_id =
static_cast<int>(distributed::GrpcMethod::kCheckpointNotify);
service_->RequestAsyncUnary(
......@@ -234,7 +235,7 @@ class RequestCheckpointNotify final : public RequestBase {
}
protected:
std::shared_ptr<VariableResponse> request_;
std::shared_ptr<GRPCVariableResponse> request_;
sendrecv::VoidMessage reply_;
ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
};
......
......@@ -23,8 +23,7 @@
#include <grpc++/impl/codegen/stub_options.h>
#include <grpc++/impl/codegen/sync_stream.h>
#include <grpc++/support/byte_buffer.h>
#include "paddle/fluid/operators/distributed/variable_response.h"
#include "paddle/fluid/operators/distributed/grpc_variable_response.h"
#include "paddle/fluid/platform/profiler.h"
// NOTE: This method was originally created by tensorflow
......@@ -42,17 +41,18 @@ class ServerContext;
// Support parsing/unparsing of tensorflow::VariableResponse.
// Wire-format is identical to RecvVariableResponse.
template <>
class SerializationTraits<paddle::operators::distributed::VariableResponse> {
class SerializationTraits<
paddle::operators::distributed::GRPCVariableResponse> {
public:
static Status Serialize(
const paddle::operators::distributed::VariableResponse& msg,
const paddle::operators::distributed::GRPCVariableResponse& msg,
grpc_byte_buffer** bp, bool* own_buffer) {
PADDLE_ENFORCE(false, "SerializationTraits::Serialize not implemented!");
return Status();
}
static Status Deserialize(
grpc_byte_buffer* buffer,
paddle::operators::distributed::VariableResponse* msg,
paddle::operators::distributed::GRPCVariableResponse* msg,
int max_message_size = INT_MAX) {
if (buffer == nullptr) {
return Status(StatusCode::INTERNAL, "No payload");
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include <utility>
#include <vector>
#ifdef PADDLE_WITH_CUDA
#include <nccl.h>
#endif
#include "paddle/fluid/operators/distributed/grpc_variable_response.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace operators {
namespace distributed {
// Protobuf wire types handled by this hand-written parser. The numeric value
// of each enumerator is what appears in the low three bits of a field tag.
enum WireType {
  WIRETYPE_VARINT = 0,
  WIRETYPE_LENGTH_DELIMITED = 2,
};

// Extracts the field number, i.e. everything above the three wire-type bits.
inline int GetTagFieldNumber(uint32_t tag) {
  const uint32_t field_number = tag >> 3;
  return field_number;
}

// Extracts the wire type stored in the low three bits of `tag`.
inline WireType GetTagWireType(uint32_t tag) {
  const uint32_t wire_bits = tag & 0x7;
  return static_cast<WireType>(wire_bits);
}
// Reads one varint from `input` and stores it in `*result` as an int.
// Returns false, leaving `*result` untouched, when the varint cannot be read
// or when its value is larger than INT_MAX.
bool ReadVarintSizeAsInt(::google::protobuf::io::CodedInputStream* input,
                         int* result) {
  uint64_t raw_value;
  if (!input->ReadVarint64(&raw_value)) {
    return false;
  }
  if (raw_value > static_cast<uint64_t>(INT_MAX)) {
    return false;
  }
  *result = static_cast<int>(raw_value);
  return true;
}
// Convenience overload: wraps a gRPC byte buffer in a Source and delegates to
// Parse(Source*), returning that call's status code unchanged.
int GRPCVariableResponse::Parse(const ::grpc::ByteBuffer& byte_buffer) {
  GrpcByteBufferSource buffer_source;
  buffer_source.Init(byte_buffer);

  GrpcByteBufferSourceWrapper wrapped_source(&buffer_source);
  return Parse(&wrapped_source);
}
// Parses one serialized VariableMessage_LodData sub-message from `input` and
// appends every decoded value to `*lod`.
//
// The lod_data field is accepted both in unpacked form (one varint per tag)
// and in packed form (a length-delimited run of varints).
//
// Returns true when the input ends cleanly (tag 0 at the end of the pushed
// limit). Returns false on any read failure, on an unexpected field number,
// or on an unexpected wire type.
bool ParseLodData(::google::protobuf::io::CodedInputStream* input,
                  std::vector<int64_t>* lod) {
  while (true) {
    auto p = input->ReadTagWithCutoff(127);
    int tag = GetTagFieldNumber(p.first);
    WireType wt = GetTagWireType(p.first);

    if (!p.second) {
      // No further tag could be read: this is only a clean end if the
      // stream reported tag 0.
      return (tag == 0);
    }

    switch (tag) {
      case sendrecv::VariableMessage_LodData::kLodDataFieldNumber: {
        // not packed
        if (wt == WIRETYPE_VARINT) {
          uint64_t v;
          if (!input->ReadVarint64(&v)) {
            return false;
          }
          lod->push_back(v);
          break;
        }

        // packed
        if (wt == WIRETYPE_LENGTH_DELIMITED) {
          int num_bytes = 0;
          if (!input->ReadVarintSizeAsInt(&num_bytes)) {
            // This function returns bool; returning the (non-zero) field
            // number here would be converted to `true` and hide the failure.
            return false;
          }
          int start_pos = input->CurrentPosition();
          while (input->CurrentPosition() - start_pos < num_bytes) {
            uint64_t v;
            if (!input->ReadVarint64(&v)) {
              return false;
            }
            lod->push_back(v);
          }
          break;
        }

        // Any other wire type is invalid for this field.
        return false;
      }
      default: { return false; }
    }
  }
}
// Decodes a serialized sendrecv::VariableMessage from `source` field by
// field, filling `meta_` and copying tensor / rows payloads through the
// VariableResponse helpers.
//
// Returns:
//    0      the whole message was consumed successfully.
//   -1      unknown error (unknown field number or malformed trailing tag).
//   other   the field number of the field that failed to parse.
int GRPCVariableResponse::Parse(Source* source) {
  ::google::protobuf::io::ZeroCopyInputStream* input_stream =
      source->contents();
  ::google::protobuf::io::CodedInputStream input(input_stream);
  input.SetTotalBytesLimit(INT_MAX, INT_MAX);

  while (true) {
    auto p = input.ReadTagWithCutoff(127);
    int tag = GetTagFieldNumber(p.first);
    WireType wt = GetTagWireType(p.first);

    if (!p.second) {
      // No further tag could be read; a non-zero tag here is an error.
      if (tag != 0) {
        return -1;
      }
      return 0;
    }

    switch (tag) {
      case sendrecv::VariableMessage::kVarnameFieldNumber: {
        uint32_t length;
        if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
          return tag;
        }

        std::string temp;
        if (!input.ReadString(&temp, length)) {
          return tag;
        }

        meta_.set_varname(temp);
        break;
      }
      case sendrecv::VariableMessage::kTypeFieldNumber: {
        uint32_t v;
        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) {
          return tag;
        }

        meta_.set_type(static_cast<::sendrecv::VarType>(v));
        break;
      }
      case sendrecv::VariableMessage::kDataTypeFieldNumber: {
        uint32_t v = 0;
        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) {
          return tag;
        }

        meta_.set_data_type(static_cast<::sendrecv::VariableMessage_Type>(v));
        break;
      }
      case sendrecv::VariableMessage::kDimsFieldNumber: {
        // not packed
        if (wt == WIRETYPE_VARINT) {
          uint64_t v;
          if (!input.ReadVarint64(&v)) {
            return tag;
          }
          meta_.add_dims(v);
          break;
        }

        // packed
        if (wt == WIRETYPE_LENGTH_DELIMITED) {
          int num_bytes = 0;
          if (!input.ReadVarintSizeAsInt(&num_bytes)) {
            return tag;
          }
          int start_pos = input.CurrentPosition();
          while (input.CurrentPosition() - start_pos < num_bytes) {
            uint64_t v;
            if (!input.ReadVarint64(&v)) {
              return tag;
            }
            meta_.add_dims(v);
          }
          break;
        }
        return tag;
      }
      case sendrecv::VariableMessage::kLodLevelFieldNumber: {
        uint64_t v = 0;
        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
          return tag;
        }
        meta_.set_lod_level(static_cast<int64_t>(v));
        break;
      }
      case sendrecv::VariableMessage::kLodFieldNumber: {
        int length = 0;
        if (wt != WIRETYPE_LENGTH_DELIMITED ||
            !ReadVarintSizeAsInt(&input, &length)) {
          return tag;
        }

        // Restrict the stream to the nested LodData message while it is
        // parsed, then restore the previous limit.
        std::pair<::google::protobuf::io::CodedInputStream::Limit, int>
            lod_limit = input.IncrementRecursionDepthAndPushLimit(length);

        std::vector<int64_t> lod_data;
        if (lod_limit.second < 0 || !ParseLodData(&input, &lod_data)) {
          return tag;
        }

        if (!input.DecrementRecursionDepthAndPopLimit(lod_limit.first)) {
          return tag;
        }

        // An empty level is skipped rather than recorded.
        if (lod_data.size() == 0) {
          break;
        }

        auto lod = meta_.add_lod();
        for (uint32_t i = 0; i < lod_data.size(); i++) {
          lod->add_lod_data(lod_data[i]);
        }
        break;
      }
      case sendrecv::VariableMessage::kSlrHeightFieldNumber: {
        uint64_t v = 0;
        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
          return tag;
        }
        meta_.set_slr_height(static_cast<int64_t>(v));
        break;
      }
      case sendrecv::VariableMessage::kSerializedFieldNumber: {
        int num_bytes = 0;
        if (wt != WIRETYPE_LENGTH_DELIMITED ||
            !ReadVarintSizeAsInt(&input, &num_bytes)) {
          return tag;
        }

        if (!ProcSerializedField(tag, &input, num_bytes)) {
          return tag;
        }

        break;
      }
      case sendrecv::VariableMessage::kRowsFieldNumber: {
        PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
                        meta_.type() == sendrecv::LOD_TENSOR) &&
                           meta_.varname() != "",
                       "meta info should be got first!");

        int num_bytes = 0;
        if (wt != WIRETYPE_LENGTH_DELIMITED ||
            !ReadVarintSizeAsInt(&input, &num_bytes)) {
          return tag;
        }

        if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) {
          return tag;
        }
        break;
      }
      case sendrecv::VariableMessage::kOutVarnameFieldNumber: {
        uint32_t length;
        if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
          return tag;
        }

        std::string temp;
        if (!input.ReadString(&temp, length)) {
          return tag;
        }

        meta_.set_out_varname(temp);
        break;
      }
      case sendrecv::VariableMessage::kProfileFieldNumber: {
        uint64_t profiling = 0;
        // Check the wire type like every other varint field does, so that a
        // wrongly typed field is rejected instead of being decoded as a
        // varint and used to switch the profiler.
        if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&profiling)) {
          return tag;
        }
        meta_.set_profile(profiling);
        int64_t listener_id = platform::ListenerId();
        if (listener_id <= 0) {
          break;
        }
        if (profiling == platform::kEnableProfiler &&
            !platform::IsProfileEnabled()) {
          platform::EnableProfiler(platform::ProfilerState::kCPU);
        } else if (profiling == platform::kDisableProfiler &&
                   platform::IsProfileEnabled()) {
          // TODO(panyx0718): Should we allow to customize file dir.
          platform::DisableProfiler(
              platform::EventSortingKey::kDefault,
              string::Sprintf("/tmp/profile_ps_%lld", listener_id));
        }
        break;
      }
      default: {
        // Unknown tag, return unknown error.
        return -1;
      }
    }
  }
}
}; // namespace distributed
}; // namespace operators
}; // namespace paddle
......@@ -14,36 +14,45 @@
#pragma once
#include <map>
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/details/var_handle.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/distributed/grpc_bytebuffer_stream.h"
#include "paddle/fluid/operators/distributed/variable_response.h"
namespace paddle {
namespace framework {
namespace details {
// A SSA graph used by parallel executor.
struct SSAGraph {
// all variable in each devices.
// The outside vector is the device vector. Each element of this vector is a
// map from variable name to variables. The variables, who have the same name,
// will have a different version. The offset in the
// `std::vector<std::unique_ptr<VarHandle>>` is the version of the variable.
std::vector<
std::unordered_map<std::string, std::vector<std::unique_ptr<VarHandle>>>>
vars_;
// aux variables to represent dependency. Useful to resolve data hazard.
std::unordered_set<std::unique_ptr<VarHandleBase>> dep_vars_;
// all operators. NOTE that even we use a vector here, the operators is
// unordered.
std::vector<std::unique_ptr<OpHandleBase>> ops_;
namespace operators {
namespace distributed {
// VariableResponse specialisation that parses messages received over gRPC.
class GRPCVariableResponse : public VariableResponse {
 public:
  // Forwards all arguments unchanged to the VariableResponse constructor.
  GRPCVariableResponse(const framework::Scope* scope,
                       const platform::DeviceContext* dev_ctx,
                       bool create_scope = false)
      : VariableResponse(scope, dev_ctx, create_scope) {}

  virtual ~GRPCVariableResponse() {}

  // Overrides the base-class parsing entry point.
  int Parse(Source* source) override;

  // Parses a message held in a gRPC byte buffer.
  // return:
  // 0: ok.
  // -1: unknown error.
  // other: number of error field.
  int Parse(const ::grpc::ByteBuffer& byte_buffer);
};
} // namespace details
} // namespace framework
} // namespace paddle
}; // namespace distributed
}; // namespace operators
}; // namespace paddle
......@@ -51,6 +51,23 @@ constexpr char kRequestPassBarrier[] = "RequestPassBarrier";
class RPCServer;
// Plain record describing one RPC request on a variable.
struct VarHandle {
  // RPC endpoint.
  std::string ep;
  const platform::DeviceContext* ctx;
  const framework::Scope* scope;
  // Variable name.
  std::string name;
  // RPC method name.
  std::string method;

  // Renders the handle as "<method> name:[<name>], ep:[<ep>]".
  std::string String() const {
    std::string text = method;
    text += " name:[";
    text += name;
    text += "], ep:[";
    text += ep;
    text += "]";
    return text;
  }
};
class RequestHandler {
public:
explicit RequestHandler(bool sync_mode)
......
......@@ -53,7 +53,7 @@ bool RequestSendHandler::Handle(const std::string& varname,
// Sync
if (varname == BATCH_BARRIER_MESSAGE) {
VLOG(3) << "sync: recv batch barrier message";
VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE";
rpc_server_->IncreaseBatchBarrier(kRequestSend);
} else if (varname == BEGIN_PASS_MESSAGE) {
VLOG(3) << "sync: recv begin pass message";
......@@ -65,8 +65,7 @@ bool RequestSendHandler::Handle(const std::string& varname,
VLOG(3) << "sync: processing received var: " << varname;
if (invar == nullptr) {
LOG(ERROR) << "sync: Can not find server side var: " << varname;
PADDLE_THROW("sync: Can not find server side var");
LOG(FATAL) << "sync: Can not find server side var: " << varname;
return false;
}
if (invar->IsType<framework::SelectedRows>()) {
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under
the Apache License, Version 2.0 (the "License"); you may not use this file
except in compliance with the License.
......@@ -14,7 +15,7 @@ limitations under the License. */
syntax = "proto3";
package sendrecv;
// option cc_generic_services = true;
option cc_generic_services = @cc_generic_services@;
service SendRecvService {
// For parameter server round-robin like hashing, do not split tensors.
......
......@@ -12,21 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#ifdef PADDLE_WITH_CUDA
#include <nccl.h>
#endif
#include <sys/time.h>
#include <thread> // NOLINT
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/distributed/bytebuffer_stream.h"
#include "paddle/fluid/operators/distributed/proto_encoder_helper.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
#include "paddle/fluid/operators/distributed/variable_response.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
namespace operators {
......@@ -34,6 +28,13 @@ namespace distributed {
using VarMsg = sendrecv::VariableMessage;
#ifdef PADDLE_WITH_CUDA
void* GetVarPayLoad(const std::string varname, int64_t size) {
platform::CUDAPinnedPlace cuda_pinned;
return memory::Alloc(cuda_pinned, size);
}
#endif
void GetTensorPayload(framework::Variable* var,
const platform::DeviceContext& ctx, VarMsg* request,
void** payload, size_t* payload_size) {
......@@ -58,15 +59,17 @@ void GetTensorPayload(framework::Variable* var,
if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE(platform::is_gpu_place(tensor.place()));
platform::CUDAPinnedPlace cuda_pinned;
// platform::CUDAPinnedPlace cuda_pinned;
auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type());
*payload = memory::Alloc(cuda_pinned, copy_size);
*payload = GetVarPayLoad(request->varname(), copy_size);
platform::CUDAPinnedPlace cuda_pinned;
memory::Copy(cuda_pinned, *payload,
boost::get<platform::CUDAPlace>(tensor.place()),
reinterpret_cast<const void*>(tensor.data<void>()), copy_size,
gpu_dev_ctx.stream());
ctx.Wait();
#endif
} else {
......@@ -91,10 +94,11 @@ void GetSelectedRowsPayload(framework::Variable* var,
auto* tensor = slr->mutable_value();
if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
platform::CUDAPinnedPlace cuda_pinned;
auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
auto copy_size = tensor->numel() * framework::SizeOfType(tensor->type());
*payload = memory::Alloc(cuda_pinned, copy_size);
*payload = GetVarPayLoad(request->varname(), copy_size);
platform::CUDAPinnedPlace cuda_pinned;
memory::Copy(cuda_pinned, *payload,
boost::get<platform::CUDAPlace>(tensor->place()),
reinterpret_cast<const void*>(tensor->data<void>()), copy_size,
......@@ -107,126 +111,6 @@ void GetSelectedRowsPayload(framework::Variable* var,
*payload_size = tensor->numel() * framework::SizeOfType(tensor->type());
}
// Serializes variable `var` (sent under the name `name`) into the gRPC byte
// buffer `msg`.
//
// The message is assembled from slices: an encoded VariableMessage header,
// the tensor payload (referenced, not copied), and - for SelectedRows only -
// a rows header plus the rows buffer. An NCCL unique id is copied into a
// single slice instead.
//
// `out_name`, when non-empty, is recorded as the message's out_varname.
// Throws (PADDLE_THROW) when `var` holds an unsupported type.
void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
                           const platform::DeviceContext& ctx,
                           ::grpc::ByteBuffer* msg,
                           const std::string& out_name) {
  // Default DestroyCallback does nothing, When using GPU
  // the CPU buffer need to be freed.
  DestroyCallback destroy_callback = [](void* backing) {};
  VarMsg request;
  void* payload = nullptr;
  // Initialised so the value is well defined on every path below.
  size_t payload_size = 0;

  request.set_varname(name);
  // Note: normally the profiler is enabled in 1 trainer, hence only
  // 1 trainer returns true for ShouldSendProfileState(). It tells PS
  // servers the trainer's profiling state so that PS can follow the
  // trainer.
  if (platform::ShouldSendProfileState()) {
    if (platform::IsProfileEnabled()) {
      request.set_profile(platform::kEnableProfiler);
    } else {
      request.set_profile(platform::kDisableProfiler);
    }
  }
  if (!out_name.empty()) {
    request.set_out_varname(out_name);
  }
  if (var->IsType<framework::LoDTensor>()) {
    request.set_type(::sendrecv::LOD_TENSOR);
    GetTensorPayload(var, ctx, &request, &payload, &payload_size);
  } else if (var->IsType<framework::SelectedRows>()) {
    request.set_type(::sendrecv::SELECTED_ROWS);
    GetSelectedRowsPayload(var, ctx, &request, &payload, &payload_size);
#ifdef PADDLE_WITH_CUDA
  } else if (var->IsType<ncclUniqueId>()) {
    request.set_type(::sendrecv::NCCL_ID);
#endif
  } else {
    // Report the name of the type held by the variable. Wrapping the result
    // of Type() in typeid() would name the type-index wrapper instead.
    // NOTE(review): assumes Variable::Type() returns std::type_index -
    // confirm against framework/variable.h.
    PADDLE_THROW("Serialize does not support type: %s", var->Type().name());
  }

  if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
    // GPU data is copied to CPU buffer when sending,
    // free the buffer when possible.
    destroy_callback = [](void* backing) {
      platform::CUDAPinnedPlace cuda_pinned;
      memory::Free(cuda_pinned, backing);
    };
#endif
  }

  std::string header;
  request.AppendToString(&header);
  // Scratch space for the encoded header; reused below for the rows header
  // once its contents have been copied into slices[0].
  auto buffer = std::unique_ptr<char[]>(new char[1024]);
  void* buf = buffer.get();
  ProtoEncodeHelper e(static_cast<char*>(buf), 1024);
  e.WriteRawBytes(std::string(header.data(), header.size()));
// NCCLID is copied directly to the message, return bytebuffer
// with only one slice if serializing NCCLID.
#ifdef PADDLE_WITH_CUDA
  if (var->IsType<ncclUniqueId>()) {
    e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber,
                              NCCL_UNIQUE_ID_BYTES);
    const ncclUniqueId& uid = var->Get<ncclUniqueId>();
    e.WriteRawBytes(std::string(uid.internal, NCCL_UNIQUE_ID_BYTES));

    // for serialize NCCL_ID
    ::grpc::Slice slices(e.size());
    memcpy(const_cast<uint8_t*>(slices.begin()), e.data(), e.size());
    ::grpc::ByteBuffer tmp(&slices, 1);
    msg->Swap(&tmp);
    return;
  }
#endif

  e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
  // steal reference of tensor data
  ::grpc::Slice slices[4];  // metadata, tensor, rows meta, rows
  int num_slices = 2;       // only SelectedRows have rows buffer
  slices[0] = ::grpc::Slice(e.size());
  memcpy(const_cast<uint8_t*>(slices[0].begin()), e.data(), e.size());
  slices[1] = ::grpc::Slice(
      grpc_slice_new_with_user_data(payload, payload_size, destroy_callback,
                                    static_cast<char*>(payload)),
      ::grpc::Slice::STEAL_REF);

  if (var->IsType<framework::SelectedRows>()) {
    auto* slr = var->GetMutable<framework::SelectedRows>();
    ProtoEncodeHelper e2(static_cast<char*>(buf), 128);
    size_t rows_memory_size =
        slr->rows().size() * framework::SizeOfType(typeid(int64_t));
    e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size);
    slices[2] = ::grpc::Slice(e2.size());
    memcpy(const_cast<uint8_t*>(slices[2].begin()), e2.data(), e2.size());

    // The rows buffer is referenced with a no-op destroy callback.
    slices[3] = ::grpc::Slice(
        grpc_slice_new_with_user_data(
            const_cast<void*>(
                reinterpret_cast<const void*>(slr->rows().data())),
            rows_memory_size, [](void* backing) {},
            const_cast<char*>(
                reinterpret_cast<const char*>(slr->rows().data()))),
        ::grpc::Slice::STEAL_REF);
    num_slices = 4;
  }

  ::grpc::ByteBuffer tmp(&slices[0], num_slices);
  msg->Swap(&tmp);
}
// Parses `msg` into a variable resolved against `scope` and stores the
// resulting variable pointer in `*var`. Enforces that parsing succeeds.
void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
                               const platform::DeviceContext& ctx,
                               const framework::Scope* scope,
                               framework::Variable** var) {
  operators::distributed::VariableResponse response(scope, &ctx);
  const int parse_status = response.Parse(msg);
  PADDLE_ENFORCE(parse_status == 0, "parse bytebuffer to tensor error!");
  *var = response.GetVar();
}
} // namespace distributed
} // namespace operators
} // namespace paddle
......@@ -25,24 +25,21 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
namespace paddle {
namespace operators {
namespace distributed {
typedef void (*DestroyCallback)(void*);
using VarMsg = sendrecv::VariableMessage;
void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
const platform::DeviceContext& ctx,
::grpc::ByteBuffer* msg,
const std::string& out_varname = std::string());
void GetTensorPayload(framework::Variable* var,
const platform::DeviceContext& ctx, VarMsg* request,
void** payload, size_t* payload_size);
void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx,
const framework::Scope* scope,
framework::Variable** var);
void GetSelectedRowsPayload(framework::Variable* var,
const platform::DeviceContext& ctx, VarMsg* request,
void** payload, size_t* payload_size);
inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) {
switch (type) {
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -13,50 +13,20 @@
// limitations under the License.
#include "paddle/fluid/operators/distributed/variable_response.h"
#include <string>
#include <utility>
#include <vector>
#ifdef PADDLE_WITH_CUDA
#include <nccl.h>
#endif
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
namespace paddle {
namespace operators {
namespace distributed {
// Protobuf wire types handled by this hand-written parser. The numeric value
// of each enumerator is what appears in the low three bits of a field tag.
enum WireType {
  WIRETYPE_VARINT = 0,
  WIRETYPE_LENGTH_DELIMITED = 2,
};

// Extracts the field number, i.e. everything above the three wire-type bits.
inline int GetTagFieldNumber(uint32_t tag) {
  const uint32_t field_number = tag >> 3;
  return field_number;
}

// Extracts the wire type stored in the low three bits of `tag`.
inline WireType GetTagWireType(uint32_t tag) {
  const uint32_t wire_bits = tag & 0x7;
  return static_cast<WireType>(wire_bits);
}
// Reads one varint from `input` and stores it in `*result` as an int.
// Returns false, leaving `*result` untouched, when the varint cannot be read
// or when its value is larger than INT_MAX.
bool ReadVarintSizeAsInt(::google::protobuf::io::CodedInputStream* input,
                         int* result) {
  uint64_t raw_value;
  if (!input->ReadVarint64(&raw_value)) {
    return false;
  }
  if (raw_value > static_cast<uint64_t>(INT_MAX)) {
    return false;
  }
  *result = static_cast<int>(raw_value);
  return true;
}
bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
const platform::DeviceContext& dev_ctx, platform::Place place,
void* dest, int size) {
bool VariableResponse::ReadRaw(::google::protobuf::io::CodedInputStream* input,
const platform::DeviceContext& dev_ctx,
platform::Place place, void* dest,
int64_t size) {
const void* data = NULL;
int size_to_write = 0;
int length = size;
int64_t length = size;
int total_written = 0;
if (platform::is_gpu_place(place)) {
......@@ -194,294 +164,49 @@ bool VariableResponse::CopySelectRowsData(
return true;
}
bool ParseLodData(::google::protobuf::io::CodedInputStream* input,
std::vector<int64_t>* lod) {
while (true) {
auto p = input->ReadTagWithCutoff(127);
int tag = GetTagFieldNumber(p.first);
WireType wt = GetTagWireType(p.first);
if (!p.second) {
return (tag == 0);
}
switch (tag) {
case sendrecv::VariableMessage_LodData::kLodDataFieldNumber: {
uint64_t v;
if (wt == WIRETYPE_VARINT) {
if (!input->ReadVarint64(&v)) {
return false;
}
lod->push_back(v);
break;
}
if (wt == WIRETYPE_LENGTH_DELIMITED) {
int num_bytes = 0;
if (!input->ReadVarintSizeAsInt(&num_bytes)) {
return tag;
}
int start_pos = input->CurrentPosition();
while (input->CurrentPosition() - start_pos < num_bytes) {
uint64_t v;
if (!input->ReadVarint64(&v)) {
return tag;
}
lod->push_back(v);
}
break;
}
bool VariableResponse::ProcSerializedField(
int tag, ::google::protobuf::io::CodedInputStream* input,
int64_t num_bytes) {
PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
meta_.type() == sendrecv::LOD_TENSOR ||
meta_.type() == sendrecv::NCCL_ID) &&
meta_.varname() != "",
"meta info should be got first!");
if (meta_.type() == sendrecv::NCCL_ID) {
#ifdef PADDLE_WITH_CUDA
auto* var = scope_->FindVar(meta_.varname());
if (var != nullptr) {
ncclUniqueId* id = var->GetMutable<ncclUniqueId>();
if (!ReadRaw(input, *dev_ctx_, platform::CPUPlace(), id->internal,
num_bytes)) {
return false;
}
default: { return false; }
}
}
return true;
}
int VariableResponse::Parse(const ::grpc::ByteBuffer& byte_buffer) {
GrpcByteBufferSource source;
source.Init(byte_buffer);
GrpcByteBufferSourceWrapper r(&source);
return Parse(&r);
}
int VariableResponse::Parse(Source* source) {
::google::protobuf::io::ZeroCopyInputStream* input_stream =
source->contents();
::google::protobuf::io::CodedInputStream input(input_stream);
input.SetTotalBytesLimit(INT_MAX, INT_MAX);
while (true) {
auto p = input.ReadTagWithCutoff(127);
int tag = GetTagFieldNumber(p.first);
WireType wt = GetTagWireType(p.first);
if (!p.second) {
if (tag != 0) {
return -1;
}
return 0;
}
switch (tag) {
case sendrecv::VariableMessage::kVarnameFieldNumber: {
uint32_t length;
if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
return tag;
}
std::string temp;
if (!input.ReadString(&temp, length)) {
return tag;
}
meta_.set_varname(temp);
break;
}
case sendrecv::VariableMessage::kTypeFieldNumber: {
uint32_t v;
if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) {
return tag;
}
meta_.set_type(static_cast<::sendrecv::VarType>(v));
break;
}
case sendrecv::VariableMessage::kDataTypeFieldNumber: {
uint32_t v = 0;
if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) {
return tag;
}
meta_.set_data_type(static_cast<::sendrecv::VariableMessage_Type>(v));
break;
}
case sendrecv::VariableMessage::kDimsFieldNumber: {
// not packed
if (wt == WIRETYPE_VARINT) {
uint64_t v;
if (!input.ReadVarint64(&v)) {
return tag;
}
meta_.add_dims(v);
break;
}
// packed
if (wt == WIRETYPE_LENGTH_DELIMITED) {
int num_bytes = 0;
if (!input.ReadVarintSizeAsInt(&num_bytes)) {
return tag;
}
int start_pos = input.CurrentPosition();
while (input.CurrentPosition() - start_pos < num_bytes) {
uint64_t v;
if (!input.ReadVarint64(&v)) {
return tag;
}
meta_.add_dims(v);
}
break;
}
return tag;
}
case sendrecv::VariableMessage::kLodLevelFieldNumber: {
uint64_t v = 0;
if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
return tag;
}
meta_.set_lod_level(static_cast<int64_t>(v));
break;
}
case sendrecv::VariableMessage::kLodFieldNumber: {
int length = 0;
if (wt != WIRETYPE_LENGTH_DELIMITED ||
!ReadVarintSizeAsInt(&input, &length)) {
return tag;
}
std::pair<::google::protobuf::io::CodedInputStream::Limit, int> p =
input.IncrementRecursionDepthAndPushLimit(length);
std::vector<int64_t> lod_data;
if (p.second < 0 || !ParseLodData(&input, &lod_data)) {
return tag;
}
if (!input.DecrementRecursionDepthAndPopLimit(p.first)) {
return false;
}
if (lod_data.size() == 0) {
break;
}
auto lod = meta_.add_lod();
for (uint32_t i = 0; i < lod_data.size(); i++) {
lod->add_lod_data(lod_data[i]);
}
break;
}
case sendrecv::VariableMessage::kSlrHeightFieldNumber: {
uint64_t v = 0;
if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) {
return tag;
}
meta_.set_slr_height(static_cast<int64_t>(v));
break;
}
case sendrecv::VariableMessage::kSerializedFieldNumber: {
PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
meta_.type() == sendrecv::LOD_TENSOR ||
meta_.type() == sendrecv::NCCL_ID) &&
meta_.varname() != "",
"meta info should be got first!");
int num_bytes = 0;
if (wt != WIRETYPE_LENGTH_DELIMITED ||
!ReadVarintSizeAsInt(&input, &num_bytes)) {
return tag;
}
if (meta_.type() == sendrecv::NCCL_ID) {
#ifdef PADDLE_WITH_CUDA
auto* var = scope_->FindVar(meta_.varname());
if (var != nullptr) {
ncclUniqueId* id = var->GetMutable<ncclUniqueId>();
if (!ReadRaw(&input, *dev_ctx_, platform::CPUPlace(), id->internal,
num_bytes)) {
return tag;
}
}
break;
return true;
#else
PADDLE_THROW("Not compiled with CUDA!");
PADDLE_THROW("Not compiled with CUDA!");
return false;
#endif
}
framework::DDim dims = GetDims(meta_.dims());
if (meta_.type() == sendrecv::LOD_TENSOR) {
PADDLE_ENFORCE(meta_.lod_size() >= 0,
"lod info should be got first!");
if (!CopyLodTensorData(&input, *dev_ctx_, dims, num_bytes)) {
return tag;
}
break;
}
if (meta_.type() == sendrecv::SELECTED_ROWS) {
if (!CopySelectRowsTensorData(&input, *dev_ctx_, dims, num_bytes)) {
return tag;
}
break;
}
return tag;
}
case sendrecv::VariableMessage::kRowsFieldNumber: {
PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS ||
meta_.type() == sendrecv::LOD_TENSOR) &&
meta_.varname() != "",
"meta info should be got first!");
int num_bytes = 0;
if (wt != WIRETYPE_LENGTH_DELIMITED ||
!ReadVarintSizeAsInt(&input, &num_bytes)) {
return tag;
}
if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) {
return tag;
}
break;
}
case sendrecv::VariableMessage::kOutVarnameFieldNumber: {
uint32_t length;
if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
return tag;
}
}
std::string temp;
if (!input.ReadString(&temp, length)) {
return tag;
}
framework::DDim dims = GetDims(meta_.dims());
if (meta_.type() == sendrecv::LOD_TENSOR) {
PADDLE_ENFORCE(meta_.lod_size() >= 0, "lod info should be got first!");
if (!CopyLodTensorData(input, *dev_ctx_, dims, num_bytes)) {
return false;
}
return true;
}
meta_.set_out_varname(temp);
break;
}
case sendrecv::VariableMessage::kProfileFieldNumber: {
uint64_t profiling = 0;
if (!input.ReadVarint64(&profiling)) {
return tag;
}
meta_.set_profile(profiling);
int64_t listener_id = platform::ListenerId();
if (listener_id <= 0) {
break;
}
if (profiling == platform::kEnableProfiler &&
!platform::IsProfileEnabled()) {
platform::EnableProfiler(platform::ProfilerState::kCPU);
} else if (profiling == platform::kDisableProfiler &&
platform::IsProfileEnabled()) {
// TODO(panyx0718): Should we allow to customize file dir.
platform::DisableProfiler(
platform::EventSortingKey::kDefault,
string::Sprintf("/tmp/profile_ps_%lld", listener_id));
}
break;
}
default: {
// Unknown tag, return unknown error.
return -1;
}
if (meta_.type() == sendrecv::SELECTED_ROWS) {
if (!CopySelectRowsTensorData(input, *dev_ctx_, dims, num_bytes)) {
return false;
}
return true;
}
return 0;
return true;
}
}; // namespace distributed
......
......@@ -22,18 +22,35 @@
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/distributed/bytebuffer_stream.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h"
namespace paddle {
namespace operators {
namespace distributed {
// Source provides a way for a particular RPC implementation to provide
// received data to ParseFrom.
// Abstract hand-off point through which an RPC implementation exposes the
// bytes it received to the response parser.
class Source {
 public:
  virtual ~Source() = default;

  // Returns the stream holding the data to parse.
  //
  // This may be called more than once when the parser has to fall back to a
  // more expensive parsing method, so each call must yield a stream
  // positioned at the start of the serialized RecvTensorResponse. Calling
  // contents() again invalidates whatever an earlier call returned.
  //
  // The Source keeps ownership of the returned stream; callers must not
  // delete it.
  virtual ::google::protobuf::io::ZeroCopyInputStream* contents() = 0;
};
class VariableResponse {
public:
VariableResponse(const framework::Scope* scope,
......@@ -51,22 +68,19 @@ class VariableResponse {
}
}
// return:
// 0:ok.
// -1: unkown error.
// other: number of error field.
int Parse(Source* source);
int Parse(Source* source, const sendrecv::VariableMessage& meta) {
meta_ = meta;
return Parse(source);
}
// return:
// 0:ok.
// -1: unkown error.
// other: number of error field.
int Parse(const ::grpc::ByteBuffer& byte_buffer);
const framework::Scope& GetLocalScope() const { return *local_scope_; }
framework::Scope* GetMutableLocalScope() const { return local_scope_; }
virtual int Parse(Source* source) = 0;
inline const framework::Scope& GetLocalScope() const { return *local_scope_; }
inline framework::Scope* GetMutableLocalScope() const { return local_scope_; }
inline std::string Varname() const { return meta_.varname(); }
inline std::string OutVarname() const { return meta_.out_varname(); }
......@@ -78,7 +92,11 @@ class VariableResponse {
return scope_->FindVar(meta_.varname());
}
private:
protected:
bool ReadRaw(::google::protobuf::io::CodedInputStream* input,
const platform::DeviceContext& dev_ctx, platform::Place place,
void* dest, int64_t size);
bool CopySelectRowsTensorData(::google::protobuf::io::CodedInputStream* input,
const platform::DeviceContext& ctx,
const framework::DDim& dims, int length);
......@@ -90,12 +108,16 @@ class VariableResponse {
const platform::DeviceContext& ctx,
const framework::DDim& dims, int length);
private:
bool ProcSerializedField(int tag,
::google::protobuf::io::CodedInputStream* input,
int64_t num_bytes);
protected:
const framework::Scope* scope_;
const platform::DeviceContext* dev_ctx_;
bool create_scope_ = false;
framework::Scope* local_scope_ = nullptr;
// only Skeleton
sendrecv::VariableMessage meta_;
};
......
......@@ -37,6 +37,7 @@ struct CBlas<float> {
libxsmm_sgemm(args...);
}
#endif
template <typename... ARGS>
static void AXPY(ARGS... args) {
platform::dynload::cblas_saxpy(args...);
......@@ -76,6 +77,7 @@ struct CBlas<double> {
libxsmm_dgemm(args...);
}
#endif
template <typename... ARGS>
static void AXPY(ARGS... args) {
platform::dynload::cblas_daxpy(args...);
......@@ -150,6 +152,7 @@ struct CBlas<double> {
}
};
#endif
template <>
struct CBlas<platform::float16> {
static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); }
......@@ -190,30 +193,48 @@ inline bool UseXSMM<platform::float16>(const int &m, const int &n, const int &k,
return false;
}
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M,
int N, int K, T alpha, const T *A,
const T *B, T beta, T *C) const {
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
inline void GEMM_WARP(CBLAS_ORDER order, CBLAS_TRANSPOSE transA,
CBLAS_TRANSPOSE transB, int M, int N, int K, T alpha,
const T *A, int lda, const T *B, int ldb, T beta, T *C,
int ldc) {
#ifdef PADDLE_WITH_LIBXSMM
if (UseXSMM(M, N, K, transA != CblasNoTrans, transB != CblasNoTrans, alpha,
beta)) {
if (UseXSMM<T>(M, N, K, transA != CblasNoTrans, transB != CblasNoTrans, alpha,
beta)) {
// Note: SMM use ColMajor
const char transa = 'N';
const char transb = 'N';
CBlas<T>::SMM_GEMM(&transa, &transb, &N, &M, &K, &alpha, B, &ldb, A, &lda,
&beta, C, &ldc);
} else {
return;
}
#endif
CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B,
ldb, beta, C, ldc);
#ifdef PADDLE_WITH_LIBXSMM
#ifdef PADDLE_MKL_SPLIT_GEMM
  // Optional path (enabled by the MKL_SPLIT_GEMM build option): issue the
  // product as a sequence of `bs`-row GEMMs instead of one large call.
  // Only taken when M divides evenly into `bs`-row slabs and neither operand
  // is transposed, so every slab of A and C is a contiguous run of rows.
  constexpr int bs = 2;
  if (M % bs == 0 && transA == CblasNoTrans && transB == CblasNoTrans) {
    for (int off = 0; off < M; off += bs) {
      // Row `off` of row-major A starts at A + off * lda, and row `off` of
      // row-major C starts at C + off * ldc. The C offset must use ldc (the
      // stride of C), not ldb: the two only coincide when B and C happen to
      // share a leading dimension.
      CBlas<T>::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, bs, N, K, alpha,
                     A + off * lda, lda, B, ldb, beta, C + off * ldc, ldc);
    }
    return;
  }
#endif
CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
beta, C, ldc);
}
// Row-major GEMM on densely packed matrices: C = alpha * op(A) * op(B) + beta * C.
// The leading dimensions are derived from the logical shapes (no padding) and
// the call is forwarded to GEMM_WARP.
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
                                            CBLAS_TRANSPOSE transB, int M,
                                            int N, int K, T alpha, const T *A,
                                            const T *B, T beta, T *C) const {
  // A is stored as M x K when not transposed, otherwise as K x M.
  int lda = M;
  if (transA == CblasNoTrans) {
    lda = K;
  }
  // B is stored as K x N when not transposed, otherwise as N x K.
  int ldb = K;
  if (transB == CblasNoTrans) {
    ldb = N;
  }
  // C is always M x N.
  const int ldc = N;
  GEMM_WARP<T>(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
               beta, C, ldc);
}
template <>
......@@ -222,9 +243,9 @@ void Blas<platform::CPUDeviceContext>::GEMM(bool transA, bool transB, int M,
int N, int K, T alpha, const T *A,
int lda, const T *B, int ldb,
T beta, T *C, int ldc) const {
CBlas<T>::GEMM(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
lda, B, ldb, beta, C, ldc);
GEMM_WARP<T>(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
lda, B, ldb, beta, C, ldc);
}
template <typename DeviceContext>
......
......@@ -228,3 +228,57 @@ TEST(math_funciton, set_constant) {
}
delete ctx;
}
// Verifies that the Blas wrapper (which routes through GEMM_WARP) produces the
// same result as a direct CBlas<T>::GEMM call.
//
// m, n, k     : A is m x k, B is k x n, C is m x n (all row-major, dense).
// alpha, beta : scalars of C = alpha * A * B + beta * C.
template <typename T>
void GemmWarpTest(int m, int n, int k, T alpha, T beta) {
  paddle::framework::Tensor mat_a;
  paddle::framework::Tensor mat_b;
  paddle::framework::Tensor mat_c_ref;
  paddle::framework::Tensor mat_c_mkl;
  // Stack-allocated place: nothing to delete when the test returns (the
  // previous heap allocation was never freed).
  paddle::platform::CPUPlace cpu_place;

  T* A = mat_a.mutable_data<T>({m, k}, cpu_place);
  T* B = mat_b.mutable_data<T>({k, n}, cpu_place);
  T* CREF = mat_c_ref.mutable_data<T>({m, n}, cpu_place);
  T* CMKL = mat_c_mkl.mutable_data<T>({m, n}, cpu_place);

  ASSERT_EQ(mat_c_mkl.numel(), mat_c_ref.numel());
  // Deterministic, distinct fill patterns for A, B and the initial C.
  for (int i = 0; i < mat_a.numel(); ++i) {
    A[i] = static_cast<T>(i);
  }
  for (int i = 0; i < mat_b.numel(); ++i) {
    B[i] = static_cast<T>(i + 1);
  }
  // Both outputs start from the same C so that beta != 0 is comparable.
  for (int i = 0; i < mat_c_ref.numel(); ++i) {
    CREF[i] = static_cast<T>(i + 2);
    CMKL[i] = CREF[i];
  }

  // this would call gemm_warp
  paddle::platform::CPUDeviceContext context(cpu_place);
  GetBlas<T>(context).GEMM(CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, B,
                           beta, CREF);

  // lda,ldb,ldc follow RowMajor
  int lda = k;
  int ldb = n;
  int ldc = n;
  paddle::operators::math::CBlas<T>::GEMM(CblasRowMajor, CblasNoTrans,
                                          CblasNoTrans, m, n, k, alpha, A, lda,
                                          B, ldb, beta, CMKL, ldc);

  for (int i = 0; i < mat_c_mkl.numel(); ++i) {
    EXPECT_FLOAT_EQ(CREF[i], CMKL[i]);
  }
}
// Runs GemmWarpTest for float and then double on two problem sizes, one with
// an odd and one with an even row count, each with (alpha, beta) = (1, 0)
// and (2, 1).
TEST(math_function, gemm_warp) {
  const int shapes[2][3] = {{3, 2, 5}, {8, 5, 6}};  // {m, n, k}

  for (const auto& s : shapes) {
    GemmWarpTest<float>(s[0], s[1], s[2], 1.f, 0.f);
    GemmWarpTest<float>(s[0], s[1], s[2], 2.f, 1.f);
  }

  for (const auto& s : shapes) {
    GemmWarpTest<double>(s[0], s[1], s[2], 1.0, 0.0);
    GemmWarpTest<double>(s[0], s[1], s[2], 2.0, 1.0);
  }
}
......@@ -78,7 +78,7 @@ class LoDTensor2BatchFunctor {
auto lods = lod_tensor.lod();
PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now.");
auto lod = lods[0];
const auto& lod = lods[0];
std::vector<SeqInfo> seq_info;
for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) {
......
......@@ -98,7 +98,7 @@ The update equations are as follows:
$$
velocity = mu * velocity + gradient \\
if (use\_nesterov): \\
param = param - gradient * learning\_rate + mu * velocity * learning\_rate \\
param = param - (gradient + mu * velocity) * learning\_rate \\
else: \\
param = param - learning\_rate * velocity. \\
$$
......
......@@ -30,7 +30,7 @@ __global__ void MomentumKernel(const T* p, const T* g, const T* v,
T g_val = g[i];
T v_new = v[i] * mu + g_val;
v_out[i] = v_new;
p_out[i] = p[i] - (g_val - v_new * mu) * lr;
p_out[i] = p[i] - (g_val + v_new * mu) * lr;
}
} else {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
......
......@@ -46,7 +46,7 @@ class MomentumOpKernel : public framework::OpKernel<T> {
v_out = v * mu + g;
if (use_nesterov) {
p_out = p - (g - v_out * mu) * lr[0];
p_out = p - (g + v_out * mu) * lr[0];
} else {
p_out = p - lr[0] * v_out;
}
......
......@@ -15,12 +15,13 @@ function(reader_library TARGET_NAME)
PARENT_SCOPE)
endfunction()
reader_library(open_files_op SRCS open_files_op.cc)
cc_library(buffered_reader SRCS buffered_reader.cc DEPS reader simple_threadpool)
reader_library(open_files_op SRCS open_files_op.cc DEPS buffered_reader)
reader_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc)
reader_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc)
reader_library(create_batch_reader_op SRCS create_batch_reader_op.cc)
reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_op.cc)
reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc)
reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc DEPS buffered_reader)
reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc)
reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc)
reader_library(create_py_reader_op SRCS create_py_reader_op.cc)
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reader/buffered_reader.h"
#include <vector>
namespace paddle {
namespace operators {
namespace reader {
// Shuts down the underlying reader, then waits for every read that is still
// queued so that no task is left touching this object's buffers.
BufferedReader::~BufferedReader() {
  reader_->Shutdown();
  for (; !position_.empty(); position_.pop()) {
    position_.front().wait();
  }
}
// Wraps `reader`, allocates `buffer_size` slots in both the CPU and the GPU
// buffer, and immediately schedules one asynchronous read per slot on a
// single-worker thread pool.
BufferedReader::BufferedReader(
    const std::shared_ptr<framework::ReaderBase> &reader,
    const platform::Place &place, size_t buffer_size)
    : framework::DecoratedReader(reader),
      thread_pool_(1),
      place_(place),
      buffer_size_(buffer_size) {
  // One slot per in-flight read, for each buffer.
  gpu_buffer_.resize(buffer_size);
  cpu_buffer_.resize(buffer_size);
  ReadTillBufferFullAsync();
}
// Schedules one asynchronous read for every buffer slot. Must only be called
// while no read is outstanding (enforced below).
void BufferedReader::ReadTillBufferFullAsync() {
  PADDLE_ENFORCE_EQ(position_.size(), 0U);
  size_t slot = 0;
  while (slot < buffer_size_) {
    ReadAsync(slot);
    ++slot;
  }
}
void BufferedReader::ReadAsync(size_t i) {
position_.emplace(thread_pool_.enqueue([this, i]() -> size_t {
TensorVec &cpu = cpu_buffer_[i];
reader_->ReadNext(&cpu);
if (cpu.empty()) {
return -1UL;
}
if (platform::is_gpu_place(place_)) {
TensorVec &gpu = gpu_buffer_[i];
gpu.resize(cpu.size());
for (size_t i = 0; i < cpu.size(); ++i) {
framework::TensorCopySync(cpu[i], place_, &gpu[i]);
gpu[i].set_lod(cpu[i].lod());
}
}
return i;
}));
}
// Shuts down the underlying reader, discards every pending future without
// waiting on it, and forgets the previously served slot.
void BufferedReader::ShutdownImpl() {
  reader_->Shutdown();
  // Swap with an empty queue to drop all queued futures at once.
  std::queue<std::future<size_t>>().swap(position_);
  prev_pos_ = -1UL;
}
// Restarts the underlying reader and refills the buffer by scheduling one
// asynchronous read per slot.
void BufferedReader::StartImpl() {
  reader_->Start();  // the source must be running before reads are queued
  ReadTillBufferFullAsync();
}
// Hands out the next buffered batch. `out` is cleared when no read is
// pending; reads that produced no data (-1UL) are skipped.
void BufferedReader::ReadNextImpl(std::vector<framework::LoDTensor> *out) {
  size_t slot = -1UL;
  // Consume futures until one yields a real slot or none are left.
  while (slot == -1UL) {
    if (position_.empty()) {
      out->clear();
      return;
    }
    slot = position_.front().get();
    position_.pop();
  }

  if (platform::is_gpu_place(place_)) {
    *out = gpu_buffer_[slot];
  } else {
    *out = cpu_buffer_[slot];
  }

  // Do not refill the slot that was just handed out: fluid computation is
  // asynchronous, so overwriting it now could corrupt data still in use.
  // Refill the slot served on the previous call instead.
  if (prev_pos_ != -1UL) {
    ReadAsync(prev_pos_);
  }
  prev_pos_ = slot;
}
} // namespace reader
} // namespace operators
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <list>
#include <queue>
#include <vector>
#include "ThreadPool.h"
#include "paddle/fluid/framework/reader.h"
namespace paddle {
namespace operators {
namespace reader {
class BufferedReader : public framework::DecoratedReader {
using TensorVec = std::vector<framework::LoDTensor>;
using VecFuture = std::future<TensorVec>;
public:
BufferedReader(const std::shared_ptr<framework::ReaderBase>& reader,
const platform::Place& place, size_t buffer_size);
~BufferedReader() override;
private:
void ReadTillBufferFullAsync();
void ReadAsync(size_t i);
protected:
void ShutdownImpl() override;
void StartImpl() override;
void ReadNextImpl(std::vector<framework::LoDTensor>* out) override;
private:
ThreadPool thread_pool_;
platform::Place place_;
const size_t buffer_size_;
std::queue<std::future<size_t>> position_;
// The buffer for reading data.
// NOTE: the simplest way to implement buffered reader is do not use any
// buffer, just read async and create futures as buffer size. However, to
// malloc tensors every time is extremely slow. Here we store all data in
// buffers and prevent alloc every time.
std::vector<TensorVec> cpu_buffer_;
std::vector<TensorVec> gpu_buffer_;
size_t prev_pos_{-1UL};
};
} // namespace reader
} // namespace operators
} // namespace paddle
......@@ -12,83 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <thread> // NOLINT
#include "paddle/fluid/operators/reader/blocking_queue.h"
#include "paddle/fluid/operators/reader/buffered_reader.h"
#include "paddle/fluid/operators/reader/reader_op_registry.h"
namespace paddle {
namespace operators {
namespace reader {
// 'Double buffer' means we shall maintain two batches of input data at the same
// time. So the kCacheSize should be at least 2.
static constexpr size_t kCacheSize = 3;
// There will be two batches out of the channel during training:
// 1. the one waiting to be sent to the channel
// 2. the one just received from the channel, which is also being used by
// subsequent operators.
// So the channel size should be kCacheSize - 2
static constexpr size_t kChannelSize = 1; // kCacheSize - 2
// Decorated reader that prefetches batches from the underlying reader on a
// background thread. Batches are staged in a fixed ring of kCacheSize slots
// and their slot indices are handed to the consumer through `channel_`.
class DoubleBufferReader : public framework::DecoratedReader {
 public:
  // reader       : the reader being decorated.
  // target_place : where returned tensors should live; defaults to CPU.
  // Starts the prefetch thread immediately.
  explicit DoubleBufferReader(
      const std::shared_ptr<ReaderBase>& reader,
      platform::Place target_place = platform::CPUPlace())
      : DecoratedReader(reader), place_(target_place) {
    cpu_tensor_cache_.resize(kCacheSize);
    gpu_tensor_cache_.resize(kCacheSize);
#ifdef PADDLE_WITH_CUDA
    if (platform::is_gpu_place(place_)) {
      // One device context per cache slot.
      for (size_t i = 0; i < kCacheSize; ++i) {
        ctxs_.emplace_back(new platform::CUDADeviceContext(
            boost::get<platform::CUDAPlace>(place_)));
      }
    }
#endif
    StartPrefetcher();
  }

  void ReadNextImpl(std::vector<framework::LoDTensor>* out) override;

  ~DoubleBufferReader() { EndPrefetcher(); }

 private:
  void ShutdownImpl() override {
    EndPrefetcher();
    reader_->Shutdown();
  }

  void StartImpl() override {
    reader_->Start();
    StartPrefetcher();
  }

  // Creates a fresh channel and launches the prefetch thread.
  void StartPrefetcher() {
    channel_ = new reader::BlockingQueue<size_t>(kChannelSize);
    prefetcher_ = std::thread([this] { PrefetchThreadFunc(); });
  }

  // Closes the channel, joins the prefetch thread and releases the channel.
  // Safe to call when the prefetcher has already been ended (for example
  // ShutdownImpl() followed by the destructor): `channel_` is nullptr then
  // and must not be dereferenced.
  void EndPrefetcher() {
    if (channel_ != nullptr) {
      channel_->Close();
    }
    if (prefetcher_.joinable()) {
      prefetcher_.join();
    }
    delete channel_;  // deleting nullptr is a no-op
    channel_ = nullptr;
  }

  void PrefetchThreadFunc();

  std::thread prefetcher_;
  // Carries the indices of filled cache slots; nullptr while no prefetcher
  // is running.
  reader::BlockingQueue<size_t>* channel_ = nullptr;
  platform::Place place_;
  std::vector<std::vector<framework::LoDTensor>> cpu_tensor_cache_;
  std::vector<std::vector<framework::LoDTensor>> gpu_tensor_cache_;
  std::vector<std::unique_ptr<platform::DeviceContext>> ctxs_;
};
class CreateDoubleBufferReaderOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
......@@ -118,8 +47,8 @@ class CreateDoubleBufferReaderOp : public framework::OperatorBase {
place = platform::CUDAPlace(static_cast<int>(num));
}
out->Reset(framework::MakeDecoratedReader<DoubleBufferReader>(
underlying_reader, place));
out->Reset(framework::MakeDecoratedReader<BufferedReader>(underlying_reader,
place, 2));
}
};
......@@ -146,51 +75,6 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase {
}
};
// Receives the index of the next filled cache slot from the channel and
// copies that batch into `out`. `out` is cleared when the receive fails.
void DoubleBufferReader::ReadNextImpl(std::vector<framework::LoDTensor>* out) {
  size_t slot;
  if (!channel_->Receive(&slot)) {
    out->clear();
    return;
  }
  // Pick the cache that matches the target place.
  const auto& cache =
      platform::is_gpu_place(place_) ? gpu_tensor_cache_ : cpu_tensor_cache_;
  *out = cache[slot];
}
// Body of the prefetch thread: repeatedly reads a batch into the next cache
// slot (copying it to the GPU cache when the target place is a GPU) and sends
// the slot index through the channel. Stops when the underlying reader returns
// an empty batch or the send fails, then closes the channel.
void DoubleBufferReader::PrefetchThreadFunc() {
  VLOG(5) << "A new prefetch thread starts.";
  // Slots are used round-robin over the kCacheSize entries.
  for (size_t slot = 0;; slot = (slot + 1) % kCacheSize) {
    auto& host_batch = cpu_tensor_cache_[slot];
    reader_->ReadNext(&host_batch);
    if (host_batch.empty()) {
      // The underlying reader has no more data.
      break;
    }

    if (platform::is_gpu_place(place_)) {
      auto& device_batch = gpu_tensor_cache_[slot];
      device_batch.resize(host_batch.size());
      // TODO(fengjiayi): Use asynchronous TensorCopy instead
      for (size_t t = 0; t < host_batch.size(); ++t) {
        framework::TensorCopySync(host_batch[t], place_, &device_batch[t]);
        device_batch[t].set_lod(host_batch[t].lod());
      }
    }

    const bool sent = channel_->Send(slot);
    if (!sent) {
      VLOG(5) << "WARNING: The double buffer channel has been closed. The "
                 "prefetch thread will terminate.";
      break;
    }
  }
  channel_->Close();
  VLOG(5) << "Prefetch thread terminates.";
}
} // namespace reader
} // namespace operators
} // namespace paddle
......
......@@ -33,6 +33,8 @@ class PyReader : public framework::FileReader {
if (!success) out->clear();
}
~PyReader() { queue_->Close(); }
void Shutdown() override { queue_->Close(); }
void Start() override { queue_->ReOpen(); }
......
......@@ -33,11 +33,14 @@ class RecordIOFileReader : public framework::FileReader {
protected:
void ReadNextImpl(std::vector<framework::LoDTensor>* out) override {
std::unique_ptr<std::lock_guard<std::mutex>> guard;
if (ThreadSafe) {
std::lock_guard<std::mutex> guard(*mutex_);
*out = framework::ReadFromRecordIO(&scanner_, dev_ctx_);
} else {
*out = framework::ReadFromRecordIO(&scanner_, dev_ctx_);
guard.reset(new std::lock_guard<std::mutex>(*mutex_));
}
bool ok = framework::ReadFromRecordIO(&scanner_, dev_ctx_, out);
if (!ok) {
out->clear();
}
}
......
......@@ -48,9 +48,9 @@ class ShuffleReader : public framework::DecoratedReader {
private:
void ShutdownImpl() override {
reader_->Shutdown();
buffer_.clear();
iteration_pos_ = 0;
reader_->Shutdown();
}
void StartImpl() override {
......
......@@ -12,150 +12,200 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cmath>
#include <stdexcept>
#include <thread> // NOLINT
#include "ThreadPool.h"
#include "paddle/fluid/framework/blocking_queue.h"
#include "paddle/fluid/operators/reader/blocking_queue.h"
#include "paddle/fluid/operators/reader/buffered_reader.h"
#include "paddle/fluid/operators/reader/reader_op_registry.h"
namespace paddle {
namespace operators {
namespace reader {
class MultiFileReader : public framework::ReaderBase {
class IReaderContainer {
public:
MultiFileReader(const std::vector<std::string>& file_names, size_t thread_num,
size_t buffer_size)
: buffer_size_(buffer_size) {
readers_.reserve(file_names.size());
for (const std::string& f_name : file_names) {
readers_.emplace_back(CreateReaderByFileName(f_name));
virtual ~IReaderContainer() {}
virtual void AppendReader(
std::unique_ptr<framework::ReaderBase>&& readers) = 0;
virtual void Stop() = 0;
virtual void Start() = 0;
virtual void ReadNext(std::vector<framework::LoDTensor>* out) = 0;
};
class OrderedReaderContainer : public IReaderContainer {
public:
void AppendReader(std::unique_ptr<framework::ReaderBase>&& reader) override {
pending_.emplace(std::move(reader));
}
void Stop() override {
while (!pending_.empty()) {
MoveFrontPendingToDone();
}
prefetchers_.resize(thread_num);
StartNewScheduler();
}
void ReadNextImpl(std::vector<framework::LoDTensor>* out) override;
void Start() override { std::swap(done_, pending_); }
~MultiFileReader() { EndScheduler(); }
// Reads from the first pending reader. A reader that yields an empty batch is
// retired to `done_` and the next one is tried. `out` is cleared once no
// pending reader is left.
void ReadNext(std::vector<framework::LoDTensor>* out) override {
  while (!pending_.empty()) {
    pending_.front()->ReadNext(out);
    if (!out->empty()) {
      return;
    }
    // Front reader is exhausted; move on to the next one.
    MoveFrontPendingToDone();
  }
  out->clear();
}
private:
void ShutdownImpl() override { EndScheduler(); }
void StartImpl() override { StartNewScheduler(); }
void StartNewScheduler();
void EndScheduler();
void ScheduleThreadFunc();
void PrefetchThreadFunc(size_t reader_idx, size_t thread_idx);
std::vector<std::unique_ptr<framework::ReaderBase>> readers_;
std::thread scheduler_;
std::vector<std::thread> prefetchers_;
size_t buffer_size_;
reader::BlockingQueue<size_t>* waiting_reader_idx_;
reader::BlockingQueue<size_t>* available_thread_idx_;
reader::BlockingQueue<std::vector<framework::LoDTensor>>* buffer_;
// Restarts the reader at the front of `pending_` (Shutdown followed by
// Start) and transfers its ownership to the back of `done_`.
void MoveFrontPendingToDone() {
  pending_.front()->Shutdown();
  pending_.front()->Start();
  // Qualify std::move explicitly instead of relying on argument-dependent
  // lookup to find it.
  done_.emplace(std::move(pending_.front()));
  pending_.pop();
}
std::queue<std::unique_ptr<framework::ReaderBase>> pending_;
std::queue<std::unique_ptr<framework::ReaderBase>> done_;
};
void MultiFileReader::ReadNextImpl(std::vector<framework::LoDTensor>* out) {
if (!buffer_->Receive(out)) {
out->clear();
}
}
class PreemptiveReaderContainer : public IReaderContainer {
using ReaderList = std::list<std::unique_ptr<framework::ReaderBase>>;
void MultiFileReader::StartNewScheduler() {
size_t thread_num = prefetchers_.size();
waiting_reader_idx_ = new reader::BlockingQueue<size_t>(readers_.size());
available_thread_idx_ = new reader::BlockingQueue<size_t>(thread_num);
buffer_ = new reader::BlockingQueue<std::vector<framework::LoDTensor>>(
buffer_size_);
struct FutureItem {
std::vector<framework::LoDTensor> data_;
ReaderList::iterator reader_it_;
std::exception_ptr exception_;
};
for (size_t i = 0; i < readers_.size(); ++i) {
waiting_reader_idx_->Send(i);
}
waiting_reader_idx_->Close();
for (size_t i = 0; i < thread_num; ++i) {
available_thread_idx_->Send(i);
}
using FutureList = std::list<std::future<FutureItem>>;
scheduler_ = std::thread([this] { ScheduleThreadFunc(); });
}
public:
explicit PreemptiveReaderContainer(size_t thread_num) : pool_(thread_num) {}
void MultiFileReader::EndScheduler() {
available_thread_idx_->Close();
buffer_->Close();
waiting_reader_idx_->Close();
if (scheduler_.joinable()) {
scheduler_.join();
}
delete buffer_;
delete available_thread_idx_;
delete waiting_reader_idx_;
}
void MultiFileReader::ScheduleThreadFunc() {
VLOG(5) << "MultiFileReader schedule thread starts.";
size_t completed_thread_num = 0;
size_t thread_idx;
while (available_thread_idx_->Receive(&thread_idx)) {
std::thread& prefetcher = prefetchers_[thread_idx];
if (prefetcher.joinable()) {
prefetcher.join();
}
size_t reader_idx;
if (waiting_reader_idx_->Receive(&reader_idx)) {
// Still have files to read. Start a new prefetch thread.
prefetcher = std::thread([this, reader_idx, thread_idx] {
PrefetchThreadFunc(reader_idx, thread_idx);
});
} else {
// No more file to read.
++completed_thread_num;
if (completed_thread_num == prefetchers_.size()) {
buffer_->Close();
break;
// Stops all pending readers: shuts them down, waits for every outstanding
// asynchronous read, restarts each reader and moves it to `done_`, then
// drains the completion queue. Does nothing when no reader is pending.
void Stop() override {
  if (pending_.empty()) {
    return;
  }

  for (auto it = pending_.begin(); it != pending_.end(); ++it) {
    (*it)->Shutdown();
  }
  // Let every in-flight read finish before the futures are dropped.
  for (auto it = futures_.begin(); it != futures_.end(); ++it) {
    it->wait();
  }
  futures_.clear();

  for (auto it = pending_.begin(); it != pending_.end(); ++it) {
    (*it)->Start();
    done_.emplace_back(std::move(*it));
  }
  pending_.clear();

  // NOTE(review): PopAll presumably sets `timeout` when draining does not
  // finish within the given limit (1000) -- confirm against BlockingQueue.
  bool timeout;
  complete_queue_.PopAll(1000, &timeout);
  PADDLE_ENFORCE(!timeout);
}
// If users invoke Shutdown() when scheduler is running, it will close the
// 'avaiable_thread_idx_' and prefecther threads have no way to tell scheduler
// to release their resource. So a check is needed before scheduler ends.
for (auto& p : prefetchers_) {
if (p.joinable()) {
p.join();
// Re-registers every reader parked in `done_` through AppendReader (which
// also schedules its first asynchronous read), then empties `done_`.
void Start() override {
  for (auto it = done_.begin(); it != done_.end(); ++it) {
    AppendReader(std::move(*it));
  }
  done_.clear();
}
VLOG(5) << "MultiFileReader schedule thread terminates.";
}
void MultiFileReader::PrefetchThreadFunc(size_t reader_idx, size_t thread_idx) {
VLOG(5) << "The prefetch thread of file idx '" << reader_idx << "' starts.";
std::unique_ptr<framework::ReaderBase>& reader = readers_[reader_idx];
while (true) {
std::vector<framework::LoDTensor> ins;
reader->ReadNext(&ins);
if (ins.empty()) {
reader->Shutdown();
reader->Start();
break;
void ReadNext(std::vector<framework::LoDTensor>* out) override {
if (!pending_.empty()) {
auto future_it = complete_queue_.Pop();
FutureItem item = future_it->get();
if (item.exception_) {
for (auto it = futures_.begin(); it != futures_.end(); ++it) {
if (it != future_it) {
it->wait(); // Wait all other threads complete.
}
}
std::rethrow_exception(item.exception_);
} else if (item.data_.empty()) { // reader done.
done_.emplace_back(std::move(*item.reader_it_));
pending_.erase(item.reader_it_);
futures_.erase(future_it);
ReadNext(out);
} else {
*out = item.data_;
// continue read async
ReadAsync(item.reader_it_, &future_it);
}
} else {
out->clear();
}
try {
buffer_->Send(std::move(ins));
} catch (paddle::platform::EnforceNotMet e) {
VLOG(5) << "WARNING: The buffer channel has been closed. The prefetch "
"thread of file idx '"
<< reader_idx << "' will terminate.";
break;
}
private:
// Takes ownership of `reader`, appends it to `pending_`, creates an empty
// future slot for it and schedules its first asynchronous read.
void AppendReader(std::unique_ptr<framework::ReaderBase>&& reader) override {
  pending_.emplace_back(std::move(reader));
  // Iterator to the reader that was just appended.
  auto reader_it = --pending_.end();

  futures_.emplace_back();
  // Iterator to the matching (still empty) future slot.
  auto future_it = --futures_.end();

  ReadAsync(reader_it, &future_it);
}
// Schedules one asynchronous read of `*reader_it` on the thread pool and
// stores the resulting future in the list slot `*future_it_ptr` refers to.
// The task returns a FutureItem holding either the batch that was read
// (empty when the reader is exhausted) or the exception that was caught.
void ReadAsync(const ReaderList::iterator& reader_it,
FutureList::iterator* future_it_ptr) {
auto& future_it = *future_it_ptr;
// Both iterators are captured by value, so the task keeps its own copies.
*future_it = pool_.enqueue([reader_it, future_it, this] {
try {
FutureItem item;
item.reader_it_ = reader_it;
(*reader_it)->ReadNext(&item.data_);
if (item.data_.empty()) {
// Exhausted reader: restart it (Shutdown followed by Start).
(*reader_it)->Shutdown();
(*reader_it)->Start();
}
// Announce completion by pushing this future's iterator to the queue.
complete_queue_.Push(future_it);
return item;
} catch (...) {
// Hand the failure over through the item instead of letting it escape
// the task; only `exception_` is set on this fresh item.
FutureItem item;
item.exception_ = std::current_exception();
complete_queue_.Push(future_it);
return item;
}
});
}
FutureList futures_;
ThreadPool pool_;
framework::BlockingQueue<FutureList::iterator> complete_queue_;
std::list<std::unique_ptr<framework::ReaderBase>> pending_;
std::list<std::unique_ptr<framework::ReaderBase>> done_;
};
class MultiFileReader : public framework::ReaderBase {
public:
MultiFileReader(const std::vector<std::string>& file_names,
std::unique_ptr<IReaderContainer>&& container)
: container_(std::move(container)) {
for (auto& fn : file_names) {
container_->AppendReader(CreateReaderByFileName(fn));
}
}
if (!available_thread_idx_->Send(thread_idx)) {
VLOG(5) << "WARNING: The available_thread_idx_ channel has been closed. "
"Fail to send thread_idx.";
~MultiFileReader() { container_->Stop(); }
protected:
void ReadNextImpl(std::vector<framework::LoDTensor>* out) override {
container_->ReadNext(out);
}
VLOG(5) << "The prefetch thread of file idx '" << reader_idx
<< "' terminates.";
}
void ShutdownImpl() override { container_->Stop(); }
void StartImpl() override { container_->Start(); }
private:
std::unique_ptr<IReaderContainer> container_;
};
class OpenFilesOp : public framework::OperatorBase {
public:
......@@ -173,13 +223,27 @@ class OpenFilesOp : public framework::OperatorBase {
"shape concat's length.");
const auto& file_names = Attr<std::vector<std::string>>("file_names");
PADDLE_ENFORCE(!file_names.empty(), "No file to be read!");
const size_t thread_num = Attr<int>("thread_num");
const size_t buffer_size = Attr<int>("buffer_size");
bool is_test = Attr<bool>("is_test");
auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>();
out->Reset(
std::make_shared<MultiFileReader>(file_names, thread_num, buffer_size));
std::unique_ptr<IReaderContainer> container;
if (is_test) {
container.reset(new OrderedReaderContainer());
} else {
container.reset(new PreemptiveReaderContainer(
static_cast<size_t>(Attr<int>("thread_num"))));
}
std::shared_ptr<framework::ReaderBase> reader(
new MultiFileReader(file_names, std::move(container)));
auto buffer_size = Attr<int>("buffer_size");
if (buffer_size > 1) {
reader = framework::MakeDecoratedReader<BufferedReader>(
reader, platform::CPUPlace(), buffer_size);
}
out->Reset(reader);
}
};
......@@ -187,9 +251,7 @@ class OpenFilesOpMaker : public FileReaderMakerBase {
protected:
void Apply() override {
AddAttr<std::vector<std::string>>("file_names", "Files to be read.");
AddAttr<int>("thread_num", "The maximal concurrent prefetch thread number.")
.GreaterThan(0);
AddAttr<int>("buffer_size", "The size of prefetch buffer.").GreaterThan(0);
AddAttr<bool>("is_test", "Used for testing data.").SetDefault(false);
AddComment(R"DOC(
OpenFiles Operator
......@@ -197,6 +259,11 @@ class OpenFilesOpMaker : public FileReaderMakerBase {
An OpenFilesOp creates a MultiFileReader, which is able to
read data multi-threaded from multiple files.
)DOC");
AddAttr<int>("thread_num",
"The maximal concurrent prefetch thread number. Used only "
"when is_test = False");
AddAttr<int>("buffer_size", "The reading buffer of these files.")
.GreaterThan(0);
}
};
......
......@@ -23,12 +23,13 @@ REGISTER_OP_CPU_KERNEL(
ops::ReduceKernel<paddle::platform::CPUDeviceContext, int, ops::SumFunctor>,
ops::ReduceKernel<paddle::platform::CPUDeviceContext, int64_t,
ops::SumFunctor>);
REGISTER_OP_CPU_KERNEL(reduce_sum_grad,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
float, ops::SumGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
double, ops::SumGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
int, ops::SumGradFunctor>,
ops::ReduceGradKernel<paddle::platform::CPUDeviceContext,
int64_t, ops::SumGradFunctor>);
REGISTER_OP_CPU_KERNEL(
reduce_sum_grad,
ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, float,
ops::SumGradFunctor>,
ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, double,
ops::SumGradFunctor>,
ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, int,
ops::SumGradFunctor>,
ops::ReduceSumGradKernel<paddle::platform::CPUDeviceContext, int64_t,
ops::SumGradFunctor>);
......@@ -14,11 +14,69 @@
#pragma once
#include <vector>
#include "paddle/fluid/operators/reduce_op.h"
namespace paddle {
namespace operators {
// Gradient kernel for reduce_sum.
// On CPU, when reducing over a single dimension, plain for loops are used
// instead of Eigen broadcast (reported to be about 4 times faster than
// broadcast). Every other case falls back to the generic ReduceGradKernel.
template <typename DeviceContext, typename T, typename Functor>
class ReduceSumGradKernel : public framework::OpKernel<T> {
 public:
  // Writes dX (GradVarName("X")) by broadcasting dOut (GradVarName("Out"))
  // along the reduced dimension(s) of X.
  void Compute(const framework::ExecutionContext& context) const override {
    auto dims = context.Attr<std::vector<int>>("dim");
    if (context.GetPlace().type() == typeid(platform::CPUPlace) &&
        dims.size() == 1) {
      auto* input0 = context.Input<Tensor>("X");
      auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
      auto* output = context.Output<Tensor>(framework::GradVarName("X"));
      output->mutable_data<T>(context.GetPlace());
      const auto* input2_d = input2->data<T>();
      auto* output_d = output->data<T>();

      // handle reduce_all
      // dOut holds exactly one element whenever X was reduced to a scalar,
      // whatever dOut's rank is (shape [1], or [1, 1, ...] when the reduced
      // dimensions are kept). Checking only for shape [1] sent the kept-dims
      // case to the loop below, which would then index past the single
      // element of dOut. Filling dX with that one value is also exactly what
      // the loop below computes when every non-reduced dimension has size 1.
      if (framework::product(input2->dims()) == 1) {
        const int64_t total = framework::product(input0->dims());
        const T grad = input2_d[0];
        for (int64_t i = 0; i < total; ++i) {
          output_d[i] = grad;
        }
        return;
      }

      // handle reduce by one dimension
      int reduce_dim_index = dims[0];
      if (reduce_dim_index < 0) {
        // Negative index counts from the last dimension.
        reduce_dim_index += input0->dims().size();
      }

      // View X as [before_dim, reduce_dim, after_dim] and dOut as
      // [before_dim, after_dim].
      auto& input_dim = input0->dims();
      int64_t before_dim = 1;
      for (int i = 0; i < reduce_dim_index; ++i) {
        before_dim *= input_dim[i];
      }
      int64_t reduce_dim = input_dim[reduce_dim_index];
      int64_t after_dim = 1;
      for (int i = reduce_dim_index + 1; i < input_dim.size(); ++i) {
        after_dim *= input_dim[i];
      }
      // dX[i, j, k] = dOut[i, k] for every j along the reduced dimension.
      for (int64_t i = 0; i < before_dim; ++i) {
        for (int64_t j = 0; j < reduce_dim; ++j) {
          for (int64_t k = 0; k < after_dim; ++k) {
            output_d[i * reduce_dim * after_dim + j * after_dim + k] =
                input2_d[i * after_dim + k];
          }
        }
      }
      return;
    }

    // default use Eigen broadcast
    ReduceGradKernel<DeviceContext, T, Functor> kernel;
    kernel.Compute(context);
  }
};
struct SumFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
......@@ -31,7 +89,7 @@ struct SumGradFunctor {
typename DY, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
const Dim& dim, int size) {
dx->device(place) = dy->broadcast(dim);
dx->device(place) = dy->eval().broadcast(dim);
}
};
......
......@@ -24,6 +24,9 @@
#include "paddle/fluid/operators/tensorrt_engine_op.h"
namespace paddle {
DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT");
namespace operators {
using inference::Singleton;
......@@ -53,7 +56,6 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
PADDLE_ENFORCE_LE(shape.size(), 4UL,
"TensorRT' tensor input requires at most 4 dimensions");
// We should delete the batch size here.
switch (shape.size()) {
case 2:
return nvinfer1::Dims2(1, shape[1]);
......@@ -91,27 +93,36 @@ void TensorRTEngineKernel<DeviceContext, T>::Prepare(
engine->InitNetwork();
framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
VLOG(4) << "parsed var size " << block.AllVars().size();
// Add inputs
VLOG(4) << "declare inputs";
for (auto &input : context.Inputs("Xs")) {
VLOG(4) << "declare input " << input;
auto *var = block.FindVar(input);
// TensorRT engine need to create parameters. The parameter's description
// should be set in
PADDLE_ENFORCE(var, "no variable called %s", input);
PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
"TensorRT engine only takes LoDTensor as input");
auto shape = var->GetShape();
// For the special batch_size placeholder -1, drop it and pass the real
// shape of data.
// TODO(Superjomn) fix this with batch broadcast, or it can't handle
// variational batch size.
if (shape[0] == -1) {
shape[0] = FLAGS_tensorrt_engine_batch_size;
}
engine->DeclareInput(
input, FluidDataType2TRT(
var->Proto()->type().lod_tensor().tensor().data_type()),
Vec2TRT_Dims(var->GetShape()));
Vec2TRT_Dims(shape));
}
inference::Singleton<inference::tensorrt::OpConverter>::Global().ConvertBlock(
block_desc, parameters, context.scope(), engine);
// Add outputs
VLOG(4) << "declare outputs";
for (auto &output : context.Outputs("Ys")) {
VLOG(4) << "declare output " << output;
engine->DeclareOutput(output);
}
......@@ -152,4 +163,7 @@ REGISTER_OP_CPU_KERNEL(
ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, int>,
ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, int64_t>);
// A trick to compile with the needed TensorRT op converter.
USE_TRT_CONVERTER(mul)
#endif // PADDLE_WITH_CUDA
......@@ -24,6 +24,9 @@
#include "paddle/fluid/inference/tensorrt/engine.h"
namespace paddle {
DECLARE_int32(tensorrt_engine_batch_size);
namespace operators {
using inference::Singleton;
......@@ -53,7 +56,6 @@ template <typename DeviceContext, typename T>
class TensorRTEngineKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
VLOG(4) << "TensorRTEngineKernel executing";
auto engine_name = context.Attr<std::string>("engine_uniq_key");
if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) {
Prepare(context);
......@@ -61,11 +63,8 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
auto* engine = Singleton<TRT_EngineManager>::Global().Get(engine_name);
auto input_names = context.op().Inputs("Xs");
PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
// Try to determine a batch_size
auto& tensor0 = inference::analysis::GetFromScope<framework::LoDTensor>(
context.scope(), input_names.front());
int batch_size = tensor0.dims()[0];
PADDLE_ENFORCE_LE(batch_size, context.Attr<int>("max_batch"));
PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size,
context.Attr<int>("max_batch"));
// Convert input tensor from fluid to engine.
for (const auto& x : context.Inputs("Xs")) {
......@@ -81,8 +80,8 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
}
}
// Execute the engine.
PADDLE_ENFORCE_GT(batch_size, 0);
engine->Execute(batch_size);
PADDLE_ENFORCE_GT(FLAGS_tensorrt_engine_batch_size, 0);
engine->Execute(FLAGS_tensorrt_engine_batch_size);
// Convert output tensor from engine to fluid
for (const auto& y : context.Outputs("Ys")) {
// convert output and copy to fluid.
......@@ -94,16 +93,20 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
auto* fluid_v = context.scope().FindVar(y);
PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
auto* fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
fluid_t->Resize(framework::make_ddim(ddim));
if (platform::is_cpu_place(fluid_t->place())) {
// TODO(Superjomn) change this float to dtype size.
engine->GetOutputInCPU(
y, fluid_t->mutable_data<float>(platform::CPUPlace()));
} else {
engine->GetOutputInGPU(
y, fluid_t->mutable_data<float>(platform::CUDAPlace()));
}
// TODO(Superjomn) find some way to determine which device to output the
// tensor.
// if (platform::is_cpu_place(fluid_t->place())) {
// TODO(Superjomn) change this float to dtype size.
engine->GetOutputInCPU(
y, fluid_t->mutable_data<float>(platform::CPUPlace()));
//} else {
// engine->GetOutputInGPU(
// y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
// size * sizeof(float));
//}
}
cudaStreamSynchronize(*engine->stream());
......
......@@ -38,6 +38,7 @@ limitations under the License. */
#endif
#endif
#include <boost/any.hpp>
#include <boost/mpl/comparison.hpp>
#include <boost/mpl/less_equal.hpp>
#include <boost/variant.hpp>
......@@ -248,15 +248,11 @@ PYBIND11_PLUGIN(core) {
#endif
})
.def("rows", [](SelectedRows &self) {
#ifndef PADDLE_WITH_CUDA
return self.rows();
#else
auto rows = self.rows();
std::vector<int64_t> new_rows;
new_rows.reserve(rows.size());
std::copy(rows.begin(), rows.end(), std::back_inserter(new_rows));
return new_rows;
#endif
auto rows = self.rows();
std::vector<int64_t> new_rows;
new_rows.reserve(rows.size());
std::copy(rows.begin(), rows.end(), std::back_inserter(new_rows));
return new_rows;
});
py::class_<Variable>(m, "Variable", R"DOC(Variable Class.
......
......@@ -30,7 +30,9 @@ class RecordIOWriter {
public:
RecordIOWriter(const std::string& filename, recordio::Compressor compressor,
size_t max_num_record)
: stream_(filename), writer_(&stream_, compressor, max_num_record) {}
: closed_(false),
stream_(filename),
writer_(&stream_, compressor, max_num_record) {}
void AppendTensor(const framework::LoDTensor& tensor) {
tensors_.push_back(tensor);
......@@ -47,9 +49,17 @@ class RecordIOWriter {
PADDLE_ENFORCE(tensors_.empty());
writer_.Flush();
stream_.close();
closed_ = true;
}
// Closes the writer on destruction when the owner has not already done so
// (closed_ is still false), so that Close() is not run a second time.
// NOTE(review): the method body visible just above (presumably Close())
// runs PADDLE_ENFORCE(tensors_.empty()) before flushing. If that check can
// throw, it would now throw from a destructor -- confirm that callers always
// drain pending tensors before the writer is destroyed.
~RecordIOWriter() {
if (!closed_) {
Close();
}
}
private:
bool closed_;
std::vector<framework::LoDTensor> tensors_;
std::ofstream stream_;
recordio::Writer writer_;
......
......@@ -28,6 +28,7 @@ Scanner::Scanner(std::unique_ptr<std::istream> &&stream)
// Builds a Scanner that reads from the file at `filename`.
// The file is opened as a std::ifstream owned by stream_, and parser_ is
// constructed from that stream. PADDLE_ENFORCE reports "Cannot open file"
// when the stream is not in a good state, after which Reset() is called.
Scanner::Scanner(const std::string &filename)
: stream_(new std::ifstream(filename)), parser_(*stream_) {
PADDLE_ENFORCE(static_cast<bool>(*stream_), "Cannot open file %s", filename);
Reset();
}
......
......@@ -333,7 +333,7 @@ function assert_api_not_changed() {
python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API.spec new.spec
deactivate
API_CHANGE=`git diff --name-only HEAD^ | grep "paddle/fluid/API.spec" || true`
API_CHANGE=`git diff --name-only upstream/develop | grep "paddle/fluid/API.spec" || true`
echo "checking API.spec change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}"
if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then
# TODO: curl -H 'Authorization: token ${TOKEN}'
......@@ -599,11 +599,11 @@ function main() {
cicheck)
cmake_gen ${PYTHON_ABI:-""}
build
assert_api_not_changed
run_test
gen_capi_package
gen_fluid_inference_lib
test_fluid_inference_lib
assert_api_not_changed
;;
*)
print_usage
......
......@@ -15,7 +15,7 @@
__all__ = ['batch']
def batch(reader, batch_size, drop_last=True):
def batch(reader, batch_size, drop_last=False):
"""
Create a batched reader.
......
......@@ -68,8 +68,14 @@ def reader_creator(image_filename, label_filename, buffer_size):
for i in xrange(buffer_size):
yield images[i, :], int(labels[i])
finally:
m.terminate()
l.terminate()
try:
m.terminate()
except:
pass
try:
l.terminate()
except:
pass
return reader
......
......@@ -35,6 +35,7 @@ import io
import evaluator
import initializer
import layers
import contrib
import nets
import optimizer
import backward
......@@ -66,6 +67,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \
'io',
'initializer',
'layers',
'contrib',
'transpiler',
'nets',
'optimizer',
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Re-export everything the `decoder` subpackage declares public, so its names
# are reachable directly from this package.
import decoder
from decoder import *
__all__ = decoder.__all__
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Re-export everything `beam_search_decoder` declares public, so its names are
# reachable directly from this package.
import beam_search_decoder
from beam_search_decoder import *
__all__ = beam_search_decoder.__all__
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This module provides a general beam search decoder API for RNN based decoders.
The purpose of this API is to allow users to highly customize the behavior
within their RNN decoder(vanilla RNN, LSTM, attention + LSTM, future etc.),
without using the low level API such as while ops.
This API is still under active development and may change drastically.
"""
import contextlib
import numpy as np
from ... import layers
from ...framework import Variable
from ... import core
from ... import framework, unique_name
from ...layer_helper import LayerHelper
__all__ = ['InitState', 'StateCell', 'TrainingDecoder', 'BeamSearchDecoder']
class _DecoderType:
    # Tags that a decoder exposes through its `type` property.
    # StateCell._switch_decoder compares against these to decide whether a
    # state is backed by a _MemoryState (TRAINING) or an _ArrayState
    # (BEAM_SEARCH).
    TRAINING = 1
    BEAM_SEARCH = 2
class InitState(object):
    """
    Wrapper around the variable an RNN hidden state starts from. Instances
    are what `StateCell` expects as the values of its `states` dict.

    Args:
        init (Variable): Variable to use directly as the initial state. When
            None, the initial variable is built with
            `layers.fill_constant_batch_size_like` from `init_boot`, `shape`,
            `value` and `dtype`.
        shape (tuple|list): Shape forwarded to the fill op when `init` is
            None. Default None.
        value (float): Fill value forwarded to the fill op when `init` is
            None. Default 0.0.
        init_boot (Variable): Reference variable passed as `input` to the
            fill op. Required whenever `init` is None.
        need_reorder (bool): Stored and later handed to the RNN memory; the
            original documentation says it should be set when
            `batch_size > 1` so the state is sorted by lod rank.
        dtype (np.dtype|core.VarDesc.VarType|str): Data type forwarded to the
            fill op. Default 'float32'.

    Raises:
        ValueError: If both `init` and `init_boot` are None.

    Examples:
        See `StateCell`.
    """

    def __init__(self,
                 init=None,
                 shape=None,
                 value=0.0,
                 init_boot=None,
                 need_reorder=False,
                 dtype='float32'):
        if init is None:
            # Nothing supplied by the caller: synthesize a constant tensor,
            # which is only possible with a reference variable.
            if init_boot is None:
                raise ValueError(
                    'init_boot must be provided to infer the shape of InitState .\n')
            init = layers.fill_constant_batch_size_like(
                input=init_boot, value=value, shape=shape, dtype=dtype)

        self._init = init
        self._shape = shape
        self._value = value
        self._need_reorder = need_reorder
        self._dtype = dtype

    @property
    def value(self):
        """The variable holding the initial state (not the `value` argument)."""
        return self._init

    @property
    def need_reorder(self):
        """Whether the initial state was flagged for reordering."""
        return self._need_reorder
class _MemoryState(object):
    """
    State backing used when the StateCell is driven by a TRAINING decoder:
    the state lives in a memory obtained from `rnn_obj.memory(...)`.
    """

    def __init__(self, state_name, rnn_obj, init_state):
        self._state_name = state_name  # each is a rnn.memory
        self._rnn_obj = rnn_obj
        # The memory is seeded from the InitState's variable and inherits its
        # need_reorder flag.
        self._state_mem = self._rnn_obj.memory(
            init=init_state.value, need_reorder=init_state.need_reorder)

    def get_state(self):
        """Return the memory variable created in __init__."""
        return self._state_mem

    def update_state(self, state):
        """Register `state` as the new content of the memory via update_memory."""
        self._rnn_obj.update_memory(self._state_mem, state)
class _ArrayState(object):
    """
    State backing used when the StateCell is driven by a BEAM_SEARCH decoder:
    the state is kept in a LOD_TENSOR_ARRAY variable indexed by an int64
    counter, both created in `block`.
    """

    def __init__(self, state_name, block, init_state):
        self._state_name = state_name
        self._block = block

        # Array that holds one state entry per written step.
        self._state_array = self._block.create_var(
            name=unique_name.generate('array_state_array'),
            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
            dtype=init_state.value.dtype)

        # Index of the most recently written entry.
        self._counter = self._block.create_var(
            name=unique_name.generate('array_state_counter'),
            type=core.VarDesc.VarType.LOD_TENSOR,
            dtype='int64')

        # initialize counter
        self._block.append_op(
            type='fill_constant',
            inputs={},
            outputs={'Out': [self._counter]},
            attrs={
                'shape': [1],
                'dtype': self._counter.dtype,
                'value': float(0.0),
                'force_cpu': True
            })

        self._counter.stop_gradient = True

        # write initial state
        # (stored at the counter's initial position, i.e. index 0)
        block.append_op(
            type='write_to_array',
            inputs={'X': init_state.value,
                    'I': self._counter},
            outputs={'Out': self._state_array})

    def get_state(self):
        """Read the array entry at the current counter position."""
        state = layers.array_read(array=self._state_array, i=self._counter)
        return state

    def update_state(self, state):
        """Advance the counter by one in place, then write `state` there."""
        layers.increment(x=self._counter, value=1, in_place=True)
        layers.array_write(state, array=self._state_array, i=self._counter)
class StateCell(object):
    """
    The state cell class stores the hidden state of the RNN cell. A typical RNN
    cell has one or more hidden states, and one or more step inputs. This class
    allows you to define the names of hidden states as well as step inputs, and
    their associated variables.

    A StateCell is attached to exactly one decoder at a time. The first access
    to its states from inside a decoder lazily "switches" every state from its
    `InitState` to a decoder-specific backing store (an RNN memory for
    training decoders, a tensor array for beam search decoders).

    Args:
        inputs (dict): A feeding dict of {name(str) : Variable}. It specifies
            the names of step inputs for RNN cell, and the associated variables.
            The variable could initially be None and set manually during each
            RNN step.
        states (dict): A feeding dict of {name(str) : InitState object}. It
            specifies the names of hidden states and their initialized state.
        out_state (str): A string that specifies the name of hidden state that
            will be used to compute the score in beam search process.
        name (str): The name of the RNN cell. Default None.

    Raises:
        `ValueError`: If the initial state is not an instance of InitState, or
            the out_state is not in the dict of states.

    Returns:
        StateCell: The initialized StateCell object.

    Examples:
        .. code-block:: python

            hidden_state = InitState(init=encoder_out, need_reorder=True)
            state_cell = StateCell(
                inputs={'current_word': None},
                states={'h': hidden_state},
                out_state='h')
    """

    def __init__(self, inputs, states, out_state, name=None):
        self._helper = LayerHelper('state_cell', name=name)
        self._cur_states = {}
        self._state_names = []
        for state_name, state in states.items():
            if not isinstance(state, InitState):
                raise ValueError('state must be an InitState object.')
            self._cur_states[state_name] = state
            self._state_names.append(state_name)
        self._inputs = inputs  # inputs is place holder here
        self._cur_decoder_obj = None
        self._in_decoder = False
        # state name -> {id(decoder): backing state object}
        self._states_holder = {}
        self._switched_decoder = False
        self._state_updater = None
        self._out_state = out_state
        if self._out_state not in self._cur_states:
            raise ValueError('out_state must be one state in states')

    def _enter_decoder(self, decoder_obj):
        """Attach this cell to `decoder_obj`; a cell serves one decoder at a time."""
        if self._in_decoder or self._cur_decoder_obj is not None:
            raise ValueError('StateCell has already entered a decoder.')
        self._in_decoder = True
        self._cur_decoder_obj = decoder_obj
        self._switched_decoder = False

    def _leave_decoder(self, decoder_obj):
        """Detach from `decoder_obj`, which must be the decoder entered before."""
        if not self._in_decoder:
            raise ValueError('StateCell not in decoder, '
                             'invalid leaving operation.')
        if self._cur_decoder_obj != decoder_obj:
            raise ValueError('Inconsistent decoder object in StateCell.')
        self._in_decoder = False
        self._cur_decoder_obj = None
        self._switched_decoder = False

    def _switch_decoder(self):  # lazy switch
        """
        Replace every InitState with the state read from a backing store
        that matches the current decoder's type. Runs once per decoder.
        """
        if not self._in_decoder:
            raise ValueError('StateCell must be enter a decoder.')
        if self._switched_decoder:
            raise ValueError('StateCell already done switching.')

        for state_name in self._state_names:
            if state_name not in self._states_holder:
                state = self._cur_states[state_name]
                if not isinstance(state, InitState):
                    raise ValueError('Current type of state is %s, should be '
                                     'an InitState object.' % type(state))
                self._states_holder[state_name] = {}

                if self._cur_decoder_obj.type == _DecoderType.TRAINING:
                    self._states_holder[state_name][id(self._cur_decoder_obj)] \
                        = _MemoryState(state_name,
                                       self._cur_decoder_obj.dynamic_rnn,
                                       state)
                elif self._cur_decoder_obj.type == _DecoderType.BEAM_SEARCH:
                    self._states_holder[state_name][id(self._cur_decoder_obj)] \
                        = _ArrayState(state_name,
                                      self._cur_decoder_obj._parent_block(),
                                      state)
                else:
                    raise ValueError('Unknown decoder type, only support '
                                     '[TRAINING, BEAM_SEARCH]')

            # Read back, since current state should be LoDTensor
            self._cur_states[state_name] = \
                self._states_holder[state_name][
                    id(self._cur_decoder_obj)].get_state()

        self._switched_decoder = True

    def get_state(self, state_name):
        """
        The getter of state object. Find the state variable by its name.

        Args:
            state_name (str): A string of the state's name.

        Returns:
            The associated state object.

        Raises:
            ValueError: If no state with that name is known.
        """
        if self._in_decoder and not self._switched_decoder:
            self._switch_decoder()

        if state_name not in self._cur_states:
            raise ValueError(
                'Unknown state %s. Please make sure _switch_decoder() '
                'invoked.' % state_name)

        return self._cur_states[state_name]

    def get_input(self, input_name):
        """
        The getter of input variable. Find the input variable by its name.

        Args:
            input_name (str): The string of the input's name.

        Returns:
            The associated input variable.

        Raises:
            ValueError: If the name is unknown or its value is still None.
        """
        if input_name not in self._inputs or self._inputs[input_name] is None:
            raise ValueError('Invalid input %s.' % input_name)
        return self._inputs[input_name]

    def set_state(self, state_name, state_value):
        """
        The setter of the state variable. Change the variable of the given
        `state_name`.

        Args:
            state_name (str): The name of the state to change.
            state_value (Var): The variable of the new state.
        """
        self._cur_states[state_name] = state_value

    def state_updater(self, updater):
        """
        Set up the updater to update the hidden state every RNN step. The
        behavior of updater could be customized by users. The updater should be
        a function that takes a `StateCell` object as input and update the
        hidden state within it. The hidden state could be accessed through
        `get_state` method.

        Can be used as a decorator; the returned wrapper forwards to
        `updater` after checking its argument. `compute_state` always calls
        the raw `updater`, not the wrapper.

        Args:
            updater (func): the updater to update the state cell.

        Returns:
            func: A wrapper around `updater` that raises TypeError when it is
            called with something other than a StateCell.
        """
        self._state_updater = updater

        def _decorator(state_cell):
            # The original test (`state_cell == self`) rejected exactly the
            # owning cell, contradicting the error message below.
            if not isinstance(state_cell, StateCell):
                raise TypeError('Updater should only accept a StateCell object '
                                'as argument.')
            updater(state_cell)

        return _decorator

    def compute_state(self, inputs):
        """
        Provide the step input of RNN cell, and compute the new hidden state
        with updater and give step input.

        Args:
            inputs (dict): A feed dict, {name(str): Variable}. name should be
            the names of step inputs for this RNN cell, and Variable should be
            the associated variables.

        Raises:
            ValueError: If a name in `inputs` was not declared at construction.

        Examples:
        .. code-block:: python

            state_cell.compute_state(inputs={'x': current_word})
        """
        if self._in_decoder and not self._switched_decoder:
            self._switch_decoder()

        for input_name, input_value in inputs.items():
            if input_name not in self._inputs:
                raise ValueError('Unknown input %s. '
                                 'Please make sure %s in input '
                                 'place holder.' % (input_name, input_name))
            self._inputs[input_name] = input_value

        self._state_updater(self)

    def update_states(self):
        """
        Update and record state information after each RNN step.

        Raises:
            ValueError: If a state has no backing store for the current
                decoder.
        """
        if self._in_decoder and not self._switched_decoder:
            # Was `self._switched_decoder()`, i.e. calling the boolean flag,
            # which raised TypeError instead of performing the lazy switch.
            self._switch_decoder()

        for state_name, decoder_state in self._states_holder.items():
            if id(self._cur_decoder_obj) not in decoder_state:
                raise ValueError('Unknown decoder object, please make sure '
                                 'switch_decoder been invoked.')
            decoder_state[id(self._cur_decoder_obj)].update_state(
                self._cur_states[state_name])

    def out_state(self):
        """
        Get the output state variable. This must be called after update_states.

        Returns:
            The output variable of the RNN cell.
        """
        return self._cur_states[self._out_state]
class TrainingDecoder(object):
    """
    Training-only decoder built on a `layers.DynamicRNN`. It is constructed
    around a `StateCell`, and the per-step computation is declared inside the
    context returned by `block()`.

    The decoder moves through three phases: before the block, inside the
    block, and after the block. Step-level methods are only legal inside the
    block; the decoder's output can only be fetched after it.

    Args:
        state_cell (StateCell): A StateCell object that handles the input and
            state variables.
        name (str): The name of this decoder. Default None.

    Returns:
        TrainingDecoder: The initialized TrainingDecoder object.

    Examples:
        .. code-block:: python

            decoder = TrainingDecoder(state_cell)
            with decoder.block():
                current_word = decoder.step_input(trg_embedding)
                decoder.state_cell.compute_state(inputs={'x': current_word})
                current_score = layers.fc(input=decoder.state_cell.get_state('h'),
                                          size=32,
                                          act='softmax')
                decoder.state_cell.update_states()
                decoder.output(current_score)
    """
    # Lifecycle phases stored in self._status.
    BEFORE_DECODER = 0
    IN_DECODER = 1
    AFTER_DECODER = 2

    def __init__(self, state_cell, name=None):
        self._helper = LayerHelper('training_decoder', name=name)
        self._status = TrainingDecoder.BEFORE_DECODER
        self._dynamic_rnn = layers.DynamicRNN()
        self._type = _DecoderType.TRAINING
        self._state_cell = state_cell
        # Bind the cell to this decoder until block() finishes.
        self._state_cell._enter_decoder(self)

    @contextlib.contextmanager
    def block(self):
        """
        Context manager in which the per-time-step behavior is declared.
        May be entered once; on normal exit the state cell is released.
        """
        if self._status != TrainingDecoder.BEFORE_DECODER:
            raise ValueError('decoder.block() can only be invoked once')

        self._status = TrainingDecoder.IN_DECODER
        with self._dynamic_rnn.block():
            yield

        self._status = TrainingDecoder.AFTER_DECODER
        self._state_cell._leave_decoder(self)

    @property
    def state_cell(self):
        """The attached StateCell; only accessible inside the block."""
        self._assert_in_decoder_block('state_cell')
        return self._state_cell

    @property
    def dynamic_rnn(self):
        """The underlying DynamicRNN object."""
        return self._dynamic_rnn

    @property
    def type(self):
        """The decoder type tag (`_DecoderType.TRAINING`)."""
        return self._type

    def step_input(self, x):
        """
        Register `x` as a step input of the RNN. In machine translation, for
        instance, one word of the target sentence is consumed per time step,
        so the target sentence is a step input.

        Args:
            x (Variable): the variable to be used as step input.

        Returns:
            Variable: The variable as input of current step.

        Examples:
        .. code-block:: python

            current_word = decoder.step_input(trg_embedding)
        """
        self._assert_in_decoder_block('step_input')
        rnn = self._dynamic_rnn
        return rnn.step_input(x)

    def static_input(self, x):
        """
        Register `x` as a static input of the RNN. Unlike a step input, the
        variable is used as a whole inside the decode loop and is not
        scattered across time steps.

        Args:
            x (Variable): the variable to be used as static input.

        Returns:
            Variable: The variable as input of current step.

        Examples:
        .. code-block:: python

            encoder_vec = decoder.static_input(encoded_vector)
        """
        self._assert_in_decoder_block('static_input')
        rnn = self._dynamic_rnn
        return rnn.static_input(x)

    def __call__(self, *args, **kwargs):
        """
        Fetch the RNN output by calling the underlying DynamicRNN. Only legal
        once the block has been exited.

        Returns:
            Variable: The specified output of the RNN cell.
        """
        if self._status == TrainingDecoder.AFTER_DECODER:
            return self._dynamic_rnn(*args, **kwargs)
        raise ValueError('Output of training decoder can only be visited '
                         'outside the block.')

    def output(self, *outputs):
        """
        Declare the output variable(s) of the RNN cell.

        Args:
            *outputs (Variables): a series of variables that treated as output
                of the RNN cell.

        Examples:
        .. code-block:: python

            out = fluid.layers.fc(input=h,
                                  size=32,
                                  bias_attr=True,
                                  act='softmax')
            decoder.output(out)
        """
        self._assert_in_decoder_block('output')
        self._dynamic_rnn.output(*outputs)

    def _assert_in_decoder_block(self, method):
        """Raise ValueError unless the decoder is currently inside its block."""
        if self._status == TrainingDecoder.IN_DECODER:
            return
        raise ValueError('%s should be invoked inside block of '
                         'TrainingDecoder object.' % method)
class BeamSearchDecoder(object):
    """
    A beam search decoder that can be used for inference. The decoder should be
    initialized with a `StateCell` object. The decode process can be defined
    within its block.

    Args:
        state_cell (StateCell): A StateCell object that handles the input and
            state variables.
        init_ids (Variable): The init beam search token ids.
        init_scores (Variable): The associated score of each id.
        target_dict_dim (int): Size of dictionary.
        word_dim (int): Word embedding dimension.
        input_var_dict (dict): A feeding dict to feed the required input
            variables to the state cell. It will be used by state_cell 's
            compute method. Default empty. The dict is only read, never
            modified, by this class.
        topk_size (int): The topk size used for beam search. Default 50.
        sparse_emb (bool): Passed as `is_sparse` to the embedding layer built
            in `decode`. Default True.
        max_len (int): The maximum allowed length of the generated sentence.
            Default 100.
        beam_size (int): The beam width of beam search decode. Default 1.
        end_id (int): The id of end token within beam search.
        name (str): The name of this decoder. Default None.

    Returns:
        BeamSearchDecoder: A initialized BeamSearchDecoder object.

    Examples:
    .. code-block:: python

        decoder = BeamSearchDecoder(
            state_cell=state_cell,
            init_ids=init_ids,
            init_scores=init_scores,
            target_dict_dim=target_dict_dim,
            word_dim=word_dim,
            input_var_dict={},
            topk_size=topk_size,
            sparse_emb=IS_SPARSE,
            max_len=max_length,
            beam_size=beam_size,
            end_id=1,
            name=None
        )
        decoder.decode()
        translation_ids, translation_scores = decoder()
    """
    # Lifecycle phases stored in self._status.
    BEFORE_BEAM_SEARCH_DECODER = 0
    IN_BEAM_SEARCH_DECODER = 1
    AFTER_BEAM_SEARCH_DECODER = 2

    def __init__(self,
                 state_cell,
                 init_ids,
                 init_scores,
                 target_dict_dim,
                 word_dim,
                 input_var_dict={},
                 topk_size=50,
                 sparse_emb=True,
                 max_len=100,
                 beam_size=1,
                 end_id=1,
                 name=None):
        self._helper = LayerHelper('beam_search_decoder', name=name)
        # Step counter driving the while loop and the array indices.
        self._counter = layers.zeros(shape=[1], dtype='int64')
        self._counter.stop_gradient = True
        self._type = _DecoderType.BEAM_SEARCH
        self._max_len = layers.fill_constant(
            shape=[1], dtype='int64', value=max_len)
        # Loop condition: counter < max_len.
        self._cond = layers.less_than(
            x=self._counter,
            y=layers.fill_constant(
                shape=[1], dtype='int64', value=max_len))
        self._while_op = layers.While(self._cond)
        self._state_cell = state_cell
        self._state_cell._enter_decoder(self)
        self._status = BeamSearchDecoder.BEFORE_BEAM_SEARCH_DECODER
        self._zero_idx = layers.fill_constant(
            shape=[1], value=0, dtype='int64', force_cpu=True)
        # name of a variable returned by read_array -> the array it came from
        self._array_dict = {}
        # (value, array) pairs to be written at the end of every step
        self._array_link = []
        self._ids_array = None
        self._scores_array = None
        self._beam_size = beam_size
        self._end_id = end_id

        self._init_ids = init_ids
        self._init_scores = init_scores
        self._target_dict_dim = target_dict_dim
        self._topk_size = topk_size
        self._sparse_emb = sparse_emb
        self._word_dim = word_dim
        self._input_var_dict = input_var_dict

    @contextlib.contextmanager
    def block(self):
        """
        Define the behavior of the decoder for each RNN time step.

        After the caller's body, and still inside the while block, a switch
        case on the loop condition increments the counter, writes every
        value registered through `update_array` into its array at the new
        index, and recomputes the condition as `counter < max_len`.

        Raises:
            ValueError: If the block is entered more than once.
        """
        if self._status != BeamSearchDecoder.BEFORE_BEAM_SEARCH_DECODER:
            raise ValueError('block() can only be invoke once.')

        self._status = BeamSearchDecoder.IN_BEAM_SEARCH_DECODER

        with self._while_op.block():
            yield
            with layers.Switch() as switch:
                with switch.case(self._cond):
                    layers.increment(x=self._counter, value=1.0, in_place=True)

                    for value, array in self._array_link:
                        layers.array_write(
                            x=value, i=self._counter, array=array)

                    layers.less_than(
                        x=self._counter, y=self._max_len, cond=self._cond)

        self._status = BeamSearchDecoder.AFTER_BEAM_SEARCH_DECODER
        self._state_cell._leave_decoder(self)

    @property
    def type(self):
        """The decoder type tag (`_DecoderType.BEAM_SEARCH`)."""
        return self._type

    def early_stop(self):
        """
        Stop the generation process in advance. Could be used as "break".
        Implemented by overwriting the loop condition with a false constant.
        """
        layers.fill_constant(
            shape=[1], value=0, dtype='bool', force_cpu=True, out=self._cond)

    def decode(self):
        """
        Set up the computation within the decoder. Then you could call the
        decoder to get the result of beam search decode. If you want to define
        a more specific decoder, you could override this function.

        Raises:
            ValueError: If a key of `input_var_dict` is not an input of the
                state cell.

        Examples:
        .. code-block:: python

            decoder.decode()
            translation_ids, translation_scores = decoder()
        """
        with self.block():
            prev_ids = self.read_array(init=self._init_ids, is_ids=True)
            prev_scores = self.read_array(
                init=self._init_scores, is_scores=True)
            prev_ids_embedding = layers.embedding(
                input=prev_ids,
                size=[self._target_dict_dim, self._word_dim],
                dtype='float32',
                is_sparse=self._sparse_emb)

            feed_dict = {}
            update_dict = {}

            for init_var_name, init_var in self._input_var_dict.items():
                if init_var_name not in self.state_cell._inputs:
                    raise ValueError('Variable ' + init_var_name +
                                     ' not found in StateCell!\n')

                read_var = self.read_array(init=init_var)
                update_dict[init_var_name] = read_var
                feed_var_expanded = layers.sequence_expand(read_var,
                                                           prev_scores)
                feed_dict[init_var_name] = feed_var_expanded

            for state_str in self._state_cell._state_names:
                prev_state = self.state_cell.get_state(state_str)
                prev_state_expanded = layers.sequence_expand(prev_state,
                                                             prev_scores)
                self.state_cell.set_state(state_str, prev_state_expanded)

            # Any cell input not fed explicitly receives the embedding of the
            # previously selected ids.
            for input_name in self._state_cell._inputs:
                if input_name not in feed_dict:
                    feed_dict[input_name] = prev_ids_embedding

            self.state_cell.compute_state(inputs=feed_dict)
            current_state = self.state_cell.out_state()
            current_state_with_lod = layers.lod_reset(
                x=current_state, y=prev_scores)
            scores = layers.fc(input=current_state_with_lod,
                               size=self._target_dict_dim,
                               act='softmax')
            topk_scores, topk_indices = layers.topk(scores, k=self._topk_size)
            accu_scores = layers.elementwise_add(
                x=layers.log(x=topk_scores),
                y=layers.reshape(
                    prev_scores, shape=[-1]),
                axis=0)
            # Use the configured end token; this was hard-coded to 1, which
            # silently ignored the constructor's `end_id` (default is also 1).
            selected_ids, selected_scores = layers.beam_search(
                prev_ids,
                prev_scores,
                topk_indices,
                accu_scores,
                self._beam_size,
                end_id=self._end_id,
                level=0)

            with layers.Switch() as switch:
                with switch.case(layers.is_empty(selected_ids)):
                    self.early_stop()
                with switch.default():
                    self.state_cell.update_states()
                    self.update_array(prev_ids, selected_ids)
                    self.update_array(prev_scores, selected_scores)
                    for update_name, var_to_update in update_dict.items():
                        self.update_array(var_to_update, feed_dict[update_name])

    def read_array(self, init, is_ids=False, is_scores=False):
        """
        Read an array to get the decoded ids and scores generated by previous
        RNN step. At the first step of RNN, the init variable must be used to
        initialize the array.

        A new tensor array is created in the parent block, `init` is written
        at index 0, and the entry at the current counter is read back.

        Args:
            init (Variable): The initial variable for first step usage. init
                must be provided.
            is_ids (bool): Specify whether the variable is an id.
            is_scores (bool): Specify whether the variable is a score.

        Returns:
            The associated variable generated during previous RNN steps.

        Raises:
            ValueError: If called outside the block, or if both `is_ids` and
                `is_scores` are set.
            TypeError: If `init` is not a Variable.

        Examples:
            .. code-block:: python

                prev_ids = decoder.read_array(init=init_ids, is_ids=True)
                prev_scores = decoder.read_array(init=init_scores, is_scores=True)
        """
        self._assert_in_decoder_block('read_array')

        if is_ids and is_scores:
            # A separating space was missing between the two literal parts.
            raise ValueError('Shouldn\'t mark current array be ids array and '
                             'scores array at the same time.')

        if not isinstance(init, Variable):
            raise TypeError('The input argument `init` must be a Variable.')

        parent_block = self._parent_block()
        array = parent_block.create_var(
            name=unique_name.generate('beam_search_decoder_array'),
            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
            dtype=init.dtype)
        parent_block.append_op(
            type='write_to_array',
            inputs={'X': init,
                    'I': self._zero_idx},
            outputs={'Out': array})

        if is_ids:
            self._ids_array = array
        elif is_scores:
            self._scores_array = array

        read_value = layers.array_read(array=array, i=self._counter)
        # Remember which array this variable came from for update_array.
        self._array_dict[read_value.name] = array
        return read_value

    def update_array(self, array, value):
        """
        Store the value generated in current step in an array for each RNN step.
        This array could be accessed by read_array method.

        The write itself is deferred: the pair is queued here and written
        when the block finishes the current step.

        Args:
            array (Variable): The variable previously returned by
                `read_array`; it identifies the array to append to.
            value (Variable): The newly generated value to be stored.

        Raises:
            ValueError: If called outside the block, or if `array` was not
                obtained from `read_array`.
            TypeError: If `array` or `value` is not a Variable.
        """
        self._assert_in_decoder_block('update_array')

        if not isinstance(array, Variable):
            raise TypeError(
                'The input argument `array` of  must be a Variable.')
        if not isinstance(value, Variable):
            raise TypeError('The input argument `value` of must be a Variable.')

        array = self._array_dict.get(array.name, None)
        if array is None:
            raise ValueError('Please invoke read_array before update_array.')
        self._array_link.append((value, array))

    def __call__(self):
        """
        Run the decode process and return the final decode result.

        Returns:
            A tuple of decoded (id, score) pairs. id is a Variable that holds
            the generated tokens, and score is a Variable with the same shape
            as id, holds the score for each generated token.

        Raises:
            ValueError: If called before the block has been exited.
        """
        if self._status != BeamSearchDecoder.AFTER_BEAM_SEARCH_DECODER:
            raise ValueError('Output of BeamSearchDecoder object can '
                             'only be visited outside the block.')
        return layers.beam_search_decode(
            ids=self._ids_array,
            scores=self._scores_array,
            beam_size=self._beam_size,
            end_id=self._end_id)

    @property
    def state_cell(self):
        """The attached StateCell; only accessible inside the block."""
        self._assert_in_decoder_block('state_cell')
        return self._state_cell

    def _parent_block(self):
        """
        Getter of parent block.

        Returns:
            The parent block of decoder.

        Raises:
            ValueError: If the current block has no parent.
        """
        program = self._helper.main_program
        parent_block_idx = program.current_block().parent_idx
        if parent_block_idx < 0:
            raise ValueError('Invalid block with index %d.' % parent_block_idx)
        parent_block = program.block(parent_block_idx)
        return parent_block

    def _assert_in_decoder_block(self, method):
        """Raise ValueError unless the decoder is currently inside its block."""
        if self._status != BeamSearchDecoder.IN_BEAM_SEARCH_DECODER:
            raise ValueError('%s should be invoked inside block of '
                             'BeamSearchDecoder object.' % method)
......@@ -66,7 +66,8 @@ def is_persistable(var):
res = fluid.io.is_persistable(param)
"""
if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
var.desc.type() == core.VarDesc.VarType.FETCH_LIST:
var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
var.desc.type() == core.VarDesc.VarType.READER:
return False
return var.persistable
......
......@@ -25,9 +25,6 @@ import numpy
__all__ = [
'split_lod_tensor',
'merge_lod_tensor',
'BlockGuard',
'BlockGuardWithCompletion',
'WhileGuard',
'While',
'Switch',
'lod_rank_table',
......
......@@ -12,18 +12,21 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import multiprocessing
import threading
from .. import core
from ..framework import convert_np_dtype_to_dtype_, default_main_program, default_startup_program, Program
from ..unique_name import generate as unique_name
from ..data_feeder import DataFeeder
from control_flow import BlockGuard
from ..layer_helper import LayerHelper
from layer_function_generator import templatedoc
from .. import core
from ..executor import global_scope
from layer_function_generator import generate_layer_fn, templatedoc
from ..framework import convert_np_dtype_to_dtype_, default_main_program, \
default_startup_program, program_guard, Program
from ..layer_helper import LayerHelper
from ..unique_name import generate as unique_name
__all__ = [
'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'Recv',
'open_recordio_file', 'open_files', 'read_file', 'shuffle', 'batch',
'data', 'open_recordio_file', 'open_files', 'read_file', 'shuffle', 'batch',
'double_buffer', 'random_data_generator', 'py_reader', 'Preprocessor',
'load'
]
......@@ -446,10 +449,15 @@ def random_data_generator(low, high, shapes, lod_levels, for_parallel=True):
return monkey_patch_reader_methods(main_prog_var)
def py_reader(capacity, shapes, dtypes, lod_levels=None):
def py_reader(capacity,
shapes,
dtypes,
lod_levels=None,
name=None,
use_double_buffer=True):
"""
Create a reader and blocking queue for data feeding in Python
This layer returns a Reader Variable and a BlockingQueue.
The BlockingQueue provides `push()` method to push a `LoDTensorArray`
object into the queue in Python side. In C++ side, the Reader
......@@ -459,15 +467,18 @@ def py_reader(capacity, shapes, dtypes, lod_levels=None):
using `close()` method when unused.
Args:
use_double_buffer(bool): Whether use double buffer or not.
capacity(int): The maximum capacity of the BlockingQueue.
shapes(list): List of tuples which declaring data shapes.
dtypes(list): List of strs which declaring data type.
lod_levels(list): List of ints which declaring data lod_level.
shapes(list|tuple): List of tuples which declaring data shapes.
dtypes(list|tuple): List of strs which declaring data type.
lod_levels(list|tuple): List of ints which declaring data lod_level.
name(basestring): The prefix Python queue name and Reader name. None will
be generated automatically.
Returns:
tuple(Variable, BlockingQueue):
A Reader Variable from which we can get feeding data.
A BlockingQueue object for data feeding.
Examples:
......@@ -480,7 +491,7 @@ def py_reader(capacity, shapes, dtypes, lod_levels=None):
dtypes=['float32', 'int64'])
# Via the reader, we can use 'read_file' layer to get data:
image, label = fluid.layers.read_file(reader)
# Via the blocking queue, we can feed data using threads
def feed_data(queue, feed_images, feed_labels):
for feed_image, feed_label in zip(feed_images, feed_labels):
......@@ -488,7 +499,7 @@ def py_reader(capacity, shapes, dtypes, lod_levels=None):
data.append(feed_image)
data.append(feed_label)
queue.push(data)
thread = threading.Thread(target=feed_data, args=(queue, feed_images, feed_labels))
thread.start()
"""
......@@ -503,15 +514,23 @@ def py_reader(capacity, shapes, dtypes, lod_levels=None):
if lod_levels is None:
lod_levels = [0] * len(shapes)
queue_name = unique_name('lod_tensor_blocking_queue')
if name is None:
queue_name = unique_name('lod_tensor_blocking_queue')
reader_name = unique_name('create_py_reader')
double_buffer_name = unique_name('double_buffer')
else:
queue_name = "_".join([name, "queue"])
reader_name = "_".join([name, "reader"])
double_buffer_name = "_".join([name, "double_buffer"])
var = global_scope().var(queue_name)
feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes)
startup_blk = default_startup_program().current_block()
startup_var = startup_blk.create_var(name=unique_name('create_py_reader'))
startup_var = startup_blk.create_var(name=reader_name)
startup_blk.append_op(
type='create_py_reader',
inputs={'blocking_queue': queue_name},
inputs={'blocking_queue': [queue_name]},
outputs={'Out': [startup_var]},
attrs={
'shape_concat': shape_concat,
......@@ -525,17 +544,97 @@ def py_reader(capacity, shapes, dtypes, lod_levels=None):
main_prog_var = _copy_reader_var_(default_main_program().current_block(),
startup_var)
return monkey_patch_reader_methods(main_prog_var), feed_queue
reader = monkey_patch_reader_methods(main_prog_var)
if use_double_buffer:
double_buffer_reader = double_buffer(reader, name=double_buffer_name)
# we return a double buffer reader. However, the reset method comes from
# py_reader.
double_buffer_reader.reset = reader.reset
reader = double_buffer_reader
# monkey patch py_reader special methods
reader.queue = feed_queue
current_reset_method = reader.reset
reader.thread = None
reader.tensor_provider = None
reader.exited = False
def start_provide_thread(func):
def __provider_thread__():
for tensors in func():
array = core.LoDTensorArray()
for item in tensors:
if not isinstance(item, core.LoDTensor):
tmp = core.LoDTensor()
tmp.set(item, core.CPUPlace())
item = tmp
array.append(item)
if reader.exited:
break
feed_queue.push(array)
if reader.exited:
break
feed_queue.close()
reader.thread = threading.Thread(target=__provider_thread__)
reader.thread.daemon = True
reader.thread.start()
def __set_tensor_provider__(func):
reader.tensor_provider = func
def __set_paddle_reader__(paddle_reader):
with program_guard(Program(), Program()):
feed_list = []
counter = 0
for dtype, shape, lod_level in zip(dtypes, shapes, lod_levels):
name = str(counter)
feed_list.append(
data(
name=name,
dtype=dtype,
shape=shape,
lod_level=lod_level))
counter += 1
feeder = DataFeeder(feed_list=feed_list, place=core.CPUPlace())
paddle_reader = feeder.decorate_reader(
paddle_reader, multi_devices=False)
def __tensor_provider__():
for slots in paddle_reader():
yield [slots[str(idx)] for idx in xrange(counter)]
__set_tensor_provider__(__tensor_provider__)
def __reset__():
current_reset_method()
if reader.thread is not None and reader.tensor_provider is not None:
reader.exited = True
reader.thread.join()
reader.exited = False
def __start__():
start_provide_thread(reader.tensor_provider)
reader.reset = __reset__
reader.decorate_tensor_provider = __set_tensor_provider__
reader.decorate_paddle_reader = __set_paddle_reader__
reader.start = __start__
return reader
def open_files(filenames,
shapes,
lod_levels,
dtypes,
thread_num=1,
thread_num=None,
buffer_size=None,
pass_num=1,
for_parallel=True):
is_test=None):
"""
Open files
......@@ -548,14 +647,14 @@ def open_files(filenames,
shapes(list): List of tuples which declaring data shapes.
lod_levels(list): List of ints which declaring data lod_level.
dtypes(list): List of strs which declaring data type.
thread_num(int): The maximal concurrent prefetch thread number.
buffer_size(int|None): The size of prefetch buffer. If it is setted None,
buffer size will be thread_num * 3.
Default: None
thread_num(None): The number of thread to read files.
Default: min(len(filenames), cpu_number).
buffer_size(None): The buffer size of reader. Default: 3 * thread_num
pass_num(int): Number of passes to run.
for_parallel(Bool): Set it as True if you are going to run
subsequent operators in parallel.
Default: True
is_test(bool|None): Whether `open_files` used for testing or not. If it
is used for testing, the order of data generated is same as the file
order. Otherwise, it is not guaranteed the order of data is same
between every epoch. [Default: False].
Returns:
Variable: A Reader Variable via which we can get file data.
......@@ -567,15 +666,21 @@ def open_files(filenames,
'./data2.recordio'],
shapes=[(3,224,224), (1)],
lod_levels=[0, 0],
dtypes=['float32', 'int64'],
thread_num=2,
buffer_size=2)
dtypes=['float32', 'int64'])
# Via the reader, we can use 'read_file' layer to get data:
image, label = fluid.layers.io.read_file(reader)
"""
if thread_num is None:
thread_num = min(len(filenames), multiprocessing.cpu_count())
else:
thread_num = int(thread_num)
if buffer_size is None:
buffer_size = thread_num * 3
buffer_size = 3 * thread_num
else:
buffer_size = int(buffer_size)
if isinstance(filenames, basestring):
filenames = [filenames]
dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
......@@ -589,17 +694,18 @@ def open_files(filenames,
multi_file_reader_name = unique_name('multi_file_reader')
startup_blk = default_startup_program().current_block()
startup_reader = startup_blk.create_var(name=multi_file_reader_name)
attrs = {
'shape_concat': shape_concat,
'lod_levels': lod_levels,
'ranks': ranks,
'file_names': filenames,
'thread_num': thread_num,
'buffer_size': buffer_size
}
if is_test is not None:
attrs['is_test'] = is_test
startup_blk.append_op(
type='open_files',
outputs={'Out': [startup_reader]},
attrs={
'shape_concat': shape_concat,
'lod_levels': lod_levels,
'ranks': ranks,
'file_names': filenames,
'thread_num': thread_num,
'buffer_size': buffer_size
})
type='open_files', outputs={'Out': [startup_reader]}, attrs=attrs)
startup_reader.desc.set_dtypes(dtypes)
startup_reader.persistable = True
......@@ -802,7 +908,7 @@ class Preprocessor(object):
self.sink_var_names = None
self.status = Preprocessor.BEFORE_SUB_BLOCK
def is_completed(self):
def _is_completed(self):
return self.sub_block and self.source_var_names and self.sink_var_names
@contextlib.contextmanager
......@@ -812,7 +918,7 @@ class Preprocessor(object):
yield
self.main_prog.rollback()
self.status = Preprocessor.AFTER_SUB_BLOCK
if not self.is_completed():
if not self._is_completed():
raise RuntimeError(
"The definition of preprocessor is incompleted! "
"Please make sure that you have set input and output "
......
......@@ -114,23 +114,13 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
prediction = network(image, is_infer=True)
auc_out=fluid.layers.auc(input=prediction, label=label)
"""
warnings.warn(
"This interface is not recommended, fluid.layers.auc compute the auc at every minibatch, \
but can not aggregate them and get the pass AUC, because pass \
auc can not be averaged with weighted from the minibatch auc value. \
Please use fluid.metrics.Auc, it can compute the auc value via Python natively, \
which can get every minibatch and every pass auc value.", Warning)
helper = LayerHelper("auc", **locals())
topk_out = helper.create_tmp_variable(dtype=input.dtype)
topk_indices = helper.create_tmp_variable(dtype="int64")
topk_out, topk_indices = nn.topk(input, k=k)
auc_out = helper.create_tmp_variable(dtype="float32")
auc_out = helper.create_tmp_variable(dtype="float64")
# make tp, tn, fp, fn persistable, so that can accumulate all batches.
tp = helper.create_global_variable(persistable=True)
tn = helper.create_global_variable(persistable=True)
fp = helper.create_global_variable(persistable=True)
fn = helper.create_global_variable(persistable=True)
tp = helper.create_global_variable(persistable=True, dtype='int64')
tn = helper.create_global_variable(persistable=True, dtype='int64')
fp = helper.create_global_variable(persistable=True, dtype='int64')
fn = helper.create_global_variable(persistable=True, dtype='int64')
for var in [tp, tn, fp, fn]:
helper.set_variable_initializer(
var, Constant(
......@@ -139,8 +129,7 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
helper.append_op(
type="auc",
inputs={
"Out": [topk_out],
"Indices": [topk_indices],
"Predict": [input],
"Label": [label],
"TP": [tp],
"TN": [tn],
......@@ -156,4 +145,4 @@ def auc(input, label, curve='ROC', num_thresholds=200, topk=1):
"FPOut": [fp],
"FNOut": [fn]
})
return auc_out
return auc_out, [tp, tn, fp, fn]
......@@ -166,7 +166,8 @@ def fc(input,
param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
parameters/weights of this layer.
bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
of this layer. If it is set to None, no bias will be added to the output units.
of this layer. If it is set to False, no bias will be added to the output units.
If it is set to None, the bias is initialized zero. Default: None.
act (str, default None): Activation to be applied to the output of this layer.
is_test(bool): A flag indicating whether execution is in test phase.
use_mkldnn(bool): Use mkldnn kernel or not, it is valid only when the mkldnn
......@@ -2960,7 +2961,7 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
# x is a Tensor variable with following elements:
# [[0.2, 0.3, 0.5, 0.9]
# [0.1, 0.2, 0.6, 0.7]]
# Each example is followed by the correspending output tensor.
# Each example is followed by the corresponding output tensor.
fluid.layers.reduce_sum(x) # [3.5]
fluid.layers.reduce_sum(x, dim=0) # [0.3, 0.5, 1.1, 1.6]
fluid.layers.reduce_sum(x, dim=-1) # [1.9, 1.6]
......@@ -2969,7 +2970,7 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None):
# x is a Tensor variable with shape [2, 2, 2] and elements as below:
# [[[1, 2], [3, 4]],
# [[5, 6], [7, 8]]]
# Each example is followed by the correspending output tensor.
# Each example is followed by the corresponding output tensor.
fluid.layers.reduce_sum(x, dim=[1, 2]) # [10, 26]
fluid.layers.reduce_sum(x, dim=[0, 1]) # [16, 20]
......
......@@ -591,7 +591,7 @@ class Auc(MetricBase):
for i in range(self._num_thresholds - 2)]
thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]
# caculate TP, FN, TN, FP count
# calculate TP, FN, TN, FP count
for idx_thresh, thresh in enumerate(thresholds):
tp, fn, tn, fp = 0, 0, 0, 0
for i, lbl in enumerate(labels):
......
......@@ -324,7 +324,7 @@ class MomentumOptimizer(Optimizer):
& if (use\_nesterov):
&\quad param = param - gradient * learning\_rate + mu * velocity * learning\_rate
&\quad param = param - (gradient + mu * velocity) * learning\_rate
& else:
......@@ -1180,16 +1180,16 @@ class ModelAverage(Optimizer):
self._add_average_restore_op(block, param_grad)
def _add_average_apply_op(self, block, param_grad):
param = block.clone_variable(param_grad[0])
grad = block.clone_variable(param_grad[1])
sum_1 = block.clone_variable(self._get_accumulator('sum_1', param))
sum_2 = block.clone_variable(self._get_accumulator('sum_2', param))
sum_3 = block.clone_variable(self._get_accumulator('sum_3', param))
num_accumulates = block.clone_variable(
param = block._clone_variable(param_grad[0])
grad = block._clone_variable(param_grad[1])
sum_1 = block._clone_variable(self._get_accumulator('sum_1', param))
sum_2 = block._clone_variable(self._get_accumulator('sum_2', param))
sum_3 = block._clone_variable(self._get_accumulator('sum_3', param))
num_accumulates = block._clone_variable(
self._get_accumulator('num_accumulates', param))
old_num_accumulates = block.clone_variable(
old_num_accumulates = block._clone_variable(
self._get_accumulator('old_num_accumulates', param))
num_updates = block.clone_variable(
num_updates = block._clone_variable(
self._get_accumulator('num_updates', param))
# backup param value to grad
layers.assign(input=param, output=grad)
......@@ -1203,8 +1203,8 @@ class ModelAverage(Optimizer):
layers.elementwise_div(x=sum, y=tmp, out=param)
def _add_average_restore_op(self, block, param_grad):
param = block.clone_variable(param_grad[0])
grad = block.clone_variable(param_grad[1])
param = block._clone_variable(param_grad[0])
grad = block._clone_variable(param_grad[1])
layers.assign(input=grad, output=param)
def _append_average_accumulate_op(self, param):
......
......@@ -14,6 +14,7 @@
from __future__ import print_function
import argparse
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle
import sys
import numpy
......@@ -134,4 +135,4 @@ def main(use_cuda):
if __name__ == '__main__':
# for use_cuda in (False, True):
main(use_cuda=True)
main(use_cuda=core.is_compiled_with_cuda())
......@@ -13,6 +13,7 @@
# limitations under the License.
from __future__ import print_function
import paddle.fluid.core as core
import math
import os
import sys
......@@ -257,6 +258,8 @@ def inject_test_method(use_cuda, parallel, nn_type, combine):
def inject_all_tests():
for use_cuda in (False, True):
if use_cuda and not core.is_compiled_with_cuda():
continue
for parallel in (False, True):
for nn_type in ('mlp', 'conv'):
inject_test_method(use_cuda, parallel, nn_type, True)
......
......@@ -245,7 +245,7 @@ def inject_test_method(use_cuda, is_sparse, is_parallel):
is_sparse=is_sparse,
is_parallel=is_parallel)
if use_cuda and is_sparse:
if (not fluid.core.is_compiled_with_cuda() or use_cuda) and is_sparse:
fn = __impl__
else:
# skip the other test when on CI server
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy
import paddle
import paddle.dataset.mnist as mnist
import paddle.fluid as fluid
import paddle.v2
def network(is_train):
    """
    Build a small fully-connected classifier fed by a py_reader.

    Args:
        is_train(bool): Selects the reader name ("train_reader" or
            "test_reader") and whether dropout runs in training mode.

    Returns:
        tuple: The mean cross-entropy loss and the py_reader object.
    """
    reader_name = "train_reader" if is_train else "test_reader"
    reader = fluid.layers.py_reader(
        capacity=10,
        shapes=((-1, 784), (-1, 1)),
        dtypes=('float32', 'int64'),
        name=reader_name,
        use_double_buffer=True)
    img, label = fluid.layers.read_file(reader)

    # Two identical hidden stages: fc(tanh) followed by dropout.
    hidden = img
    for _ in xrange(2):
        hidden = fluid.layers.fc(input=hidden, size=100, act='tanh')
        hidden = fluid.layers.dropout(
            hidden, dropout_prob=0.5, is_test=not is_train)

    prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
    loss = fluid.layers.cross_entropy(input=prediction, label=label)
    mean_loss = fluid.layers.mean(loss)
    return mean_loss, reader
def main():
train_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(train_prog, startup_prog):
with fluid.unique_name.guard():
loss, train_reader = network(True)
adam = fluid.optimizer.Adam(learning_rate=0.01)
adam.minimize(loss)
test_prog = fluid.Program()
test_startup = fluid.Program()
with fluid.program_guard(test_prog, test_startup):
with fluid.unique_name.guard():
test_loss, test_reader = network(False)
use_cuda = fluid.core.is_compiled_with_cuda()
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
fluid.Executor(place).run(startup_prog)
fluid.Executor(place).run(test_startup)
trainer = fluid.ParallelExecutor(
use_cuda=use_cuda, loss_name=loss.name, main_program=train_prog)
tester = fluid.ParallelExecutor(
use_cuda=use_cuda, share_vars_from=trainer, main_program=test_prog)
train_reader.decorate_paddle_reader(
paddle.v2.reader.shuffle(
paddle.batch(mnist.train(), 512), buf_size=8192))
test_reader.decorate_paddle_reader(paddle.batch(mnist.test(), 512))
for epoch_id in xrange(10):
train_reader.start()
try:
while True:
print 'train_loss', numpy.array(
trainer.run(fetch_list=[loss.name]))
except fluid.core.EOFException:
print 'End of epoch', epoch_id
train_reader.reset()
test_reader.start()
try:
while True:
print 'test loss', numpy.array(
tester.run(fetch_list=[test_loss.name]))
except fluid.core.EOFException:
print 'End of testing'
test_reader.reset()
if __name__ == '__main__':
main()
......@@ -31,7 +31,10 @@ def load_vocab(filename):
# load word dict with paddle inner function
word_dict = load_vocab(sys.argv[1])
if len(sys.argv) == 1:
word_dict = paddle.dataset.imdb.word_dict()
else:
word_dict = load_vocab(sys.argv[1])
word_dict["<unk>"] = len(word_dict)
print "Dict dim = ", len(word_dict)
......
......@@ -41,16 +41,14 @@ def network_cfg(is_train, pass_num=100):
pass_num=pass_num,
shapes=[[-1, 1], [-1, 1]],
lod_levels=[1, 0],
dtypes=['int64', 'int64'],
thread_num=1)
dtypes=['int64', 'int64'])
test_file_obj = fluid.layers.open_files(
filenames=TEST_FILES,
pass_num=1,
shapes=[[-1, 1], [-1, 1]],
lod_levels=[1, 0],
dtypes=['int64', 'int64'],
thread_num=1)
dtypes=['int64', 'int64'])
if is_train:
file_obj = fluid.layers.shuffle(train_file_obj, buffer_size=1000)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
A simple machine translation demo using beam search decoder.
"""
import contextlib
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import paddle.fluid.framework as framework
import paddle.fluid.layers as layers
from paddle.fluid.executor import Executor
from paddle.fluid.contrib.decoder.beam_search_decoder import *
import unittest
import os
dict_size = 30000
source_dict_dim = target_dict_dim = dict_size
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
hidden_dim = 32
word_dim = 32
decoder_size = hidden_dim
IS_SPARSE = True
batch_size = 2
max_length = 8
topk_size = 50
trg_dic_size = 10000
beam_size = 2
def encoder():
    """
    Build the source-sentence encoder.

    Declares the `src_word` data layer, embeds it, applies an fc layer and a
    dynamic LSTM, and returns the last step of the LSTM hidden sequence.
    """
    src_word = layers.data(
        name="src_word", shape=[1], dtype='int64', lod_level=1)
    src_embedding = layers.embedding(
        input=src_word,
        size=[dict_size, word_dim],
        dtype='float32',
        is_sparse=IS_SPARSE)

    lstm_width = hidden_dim * 4
    projected = layers.fc(input=src_embedding, size=lstm_width, act='tanh')
    hidden_seq, _cell_seq = layers.dynamic_lstm(
        input=projected, size=hidden_dim * 4)
    return layers.sequence_last_step(input=hidden_seq)
def decoder_state_cell(context):
    """
    Create the decoder StateCell, with hidden state 'h' initialised from
    `context` and a single input slot 'x'.

    Args:
        context: Encoder output used as the initial value of state 'h'.

    Returns:
        The StateCell with its state updater registered.
    """
    initial_h = InitState(init=context, need_reorder=True)
    cell = StateCell(
        inputs={'x': None}, states={'h': initial_h}, out_state='h')

    @cell.state_updater
    def updater(state_cell):
        word = state_cell.get_input('x')
        previous = state_cell.get_state('h')
        # make sure the lod of the new h is inherited from the previous h
        new_h = layers.fc(input=[previous, word],
                          size=decoder_size,
                          act='tanh')
        state_cell.set_state('h', new_h)

    return cell
def decoder_train(state_cell):
    """
    Build the training-time decoder on top of `state_cell`.

    Declares the `target_word` data layer, embeds it, and inside a
    TrainingDecoder block computes a softmax score over the target
    dictionary for every step.

    Returns:
        The result of calling the TrainingDecoder object.
    """
    target_word = layers.data(
        name="target_word", shape=[1], dtype='int64', lod_level=1)
    target_embedding = layers.embedding(
        input=target_word,
        size=[dict_size, word_dim],
        dtype='float32',
        is_sparse=IS_SPARSE)

    decoder = TrainingDecoder(state_cell)
    with decoder.block():
        step_word = decoder.step_input(target_embedding)
        decoder.state_cell.compute_state(inputs={'x': step_word})
        hidden = decoder.state_cell.get_state('h')
        step_score = layers.fc(input=hidden,
                               size=target_dict_dim,
                               act='softmax')
        decoder.state_cell.update_states()
        decoder.output(step_score)

    return decoder()
def decoder_decode(state_cell):
    """
    Build the inference-time beam search decoder on top of `state_cell`.

    Declares the `init_ids` and `init_scores` data layers, runs
    `BeamSearchDecoder.decode()`, and returns the decoder's two outputs.

    Returns:
        tuple: (translation ids, translation scores).
    """
    init_ids = layers.data(
        name="init_ids", shape=[1], dtype="int64", lod_level=2)
    init_scores = layers.data(
        name="init_scores", shape=[1], dtype="float32", lod_level=2)

    beam_decoder = BeamSearchDecoder(
        state_cell=state_cell,
        init_ids=init_ids,
        init_scores=init_scores,
        target_dict_dim=target_dict_dim,
        word_dim=word_dim,
        input_var_dict={},
        topk_size=topk_size,
        sparse_emb=IS_SPARSE,
        max_len=max_length,
        beam_size=beam_size,
        end_id=1,
        name=None)
    beam_decoder.decode()

    # Unpack explicitly so that exactly two outputs are required.
    ids, scores = beam_decoder()
    return ids, scores
def train_main(use_cuda):
    """
    Build the translation model in training mode and run a few batches.

    Returns immediately when CUDA is requested but Paddle was not compiled
    with CUDA. Otherwise trains with Adagrad on WMT14 and stops once
    `batch_id` exceeds 3.

    Args:
        use_cuda(bool): Run on CUDAPlace(0) when True, else on CPUPlace.
    """
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return
    if use_cuda:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    # Model: encoder -> state cell -> training decoder -> mean cross entropy.
    context = encoder()
    state_cell = decoder_state_cell(context)
    rnn_out = decoder_train(state_cell)
    label = layers.data(
        name="target_next_word", shape=[1], dtype='int64', lod_level=1)
    cost = layers.cross_entropy(input=rnn_out, label=label)
    avg_cost = layers.mean(x=cost)

    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-3)
    optimizer.minimize(avg_cost)

    shuffled = paddle.reader.shuffle(
        paddle.dataset.wmt14.train(dict_size), buf_size=1000)
    train_reader = paddle.batch(shuffled, batch_size=batch_size)
    feed_order = ['src_word', 'target_word', 'target_next_word']

    exe = Executor(place)

    main_program = framework.default_main_program()
    exe.run(framework.default_startup_program())

    global_block = main_program.global_block()
    feed_list = [global_block.var(var_name) for var_name in feed_order]
    feeder = fluid.DataFeeder(feed_list, place)

    for pass_id in xrange(1):
        for batch_id, data in enumerate(train_reader()):
            outs = exe.run(main_program,
                           feed=feeder.feed(data),
                           fetch_list=[avg_cost])
            avg_cost_val = np.array(outs[0])
            print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
                  " avg_cost=" + str(avg_cost_val))
            # Only a handful of batches are needed for this smoke test.
            if batch_id > 3:
                break
def decode_main(use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
context = encoder()
state_cell = decoder_state_cell(context)
translation_ids, translation_scores = decoder_decode(state_cell)
exe = Executor(place)
exe.run(framework.default_startup_program())
init_ids_data = np.array([0 for _ in range(batch_size)], dtype='int64')
init_scores_data = np.array(
[1. for _ in range(batch_size)], dtype='float32')
init_ids_data = init_ids_data.reshape((batch_size, 1))
init_scores_data = init_scores_data.reshape((batch_size, 1))
init_lod = [1] * batch_size
init_lod = [init_lod, init_lod]
init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place)
init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place)
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.wmt14.train(dict_size), buf_size=1000),
batch_size=batch_size)
feed_order = ['src_word']
feed_list = [
framework.default_main_program().global_block().var(var_name)
for var_name in feed_order
]
feeder = fluid.DataFeeder(feed_list, place)
data = train_reader().next()
feed_dict = feeder.feed(map(lambda x: [x[0]], data))
feed_dict['init_ids'] = init_ids
feed_dict['init_scores'] = init_scores
result_ids, result_scores = exe.run(
framework.default_main_program(),
feed=feed_dict,
fetch_list=[translation_ids, translation_scores],
return_numpy=False)
print result_ids.lod()
class TestBeamSearchDecoder(unittest.TestCase):
    """Empty test case; its test methods are attached at import time by the
    inject_test_* helpers in this module."""
@contextlib.contextmanager
def scope_prog_guard():
    """
    Context manager that runs its body with a fresh Scope and a fresh pair
    of main/startup Programs installed, so the code inside does not touch
    the defaults that were active before entering.
    """
    main_prog = fluid.Program()
    startup_prog = fluid.Program()
    fresh_scope = fluid.core.Scope()
    with fluid.scope_guard(fresh_scope), \
            fluid.program_guard(main_prog, startup_prog):
        yield
def inject_test_train(use_cuda):
    """
    Attach a training test method to TestBeamSearchDecoder.

    The method is named `test_cuda_train` or `test_cpu_train` and runs
    `train_main(use_cuda)` inside `scope_prog_guard()`.
    """
    device = 'cuda' if use_cuda else 'cpu'

    def run_train(*args):
        with scope_prog_guard():
            train_main(use_cuda)

    setattr(TestBeamSearchDecoder, 'test_{0}_train'.format(device), run_train)
def inject_test_decode(use_cuda, decorator=None):
    """
    Attach a decoding test method to TestBeamSearchDecoder.

    The method is named `test_cuda_decode` or `test_cpu_decode` and runs
    `decode_main(use_cuda)` inside `scope_prog_guard()`.

    Args:
        use_cuda(bool): Device selector forwarded to `decode_main`.
        decorator(callable|None): If given, the test function is passed
            through it before being attached.
    """
    # The template has a single placeholder; the stray extra 'sparse'
    # argument that used to be passed to format() was ignored and is removed.
    f_name = 'test_{0}_decode'.format('cuda' if use_cuda else 'cpu')

    def f(*args):
        with scope_prog_guard():
            decode_main(use_cuda)

    if decorator is not None:
        f = decorator(f)

    setattr(TestBeamSearchDecoder, f_name, f)
# Register the CPU and CUDA variants of the training test.
for _use_cuda_ in (False, True):
    inject_test_train(_use_cuda_)

# Register the CPU and CUDA variants of the decoding test. No decorator is
# applied to either variant.
for _use_cuda_ in (False, True):
    _decorator_ = None
    inject_test_decode(use_cuda=_use_cuda_, decorator=_decorator_)

if __name__ == '__main__':
    unittest.main()
......@@ -12,6 +12,11 @@ endif(NOT WITH_MKLDNN)
# Without WITH_DISTRIBUTE, take the distributed tests out of TEST_OPS so the
# generic registration loop does not pick them up.
if(NOT WITH_DISTRIBUTE)
  list(REMOVE_ITEM TEST_OPS
    test_recv_op
    test_dist_transpiler
    test_simple_dist_transpiler
    test_listen_and_serv_op
    test_dist_mnist
    test_dist_word2vec)
endif()
list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
......@@ -43,13 +48,17 @@ list(REMOVE_ITEM TEST_OPS test_warpctc_op)
list(REMOVE_ITEM TEST_OPS test_dist_train)
list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
# Register every remaining entry of TEST_OPS as a Python test module.
foreach(TEST_OP ${TEST_OPS})
  py_test_modules(${TEST_OP} MODULES ${TEST_OP})
endforeach()

# Tests that were removed from TEST_OPS above are registered explicitly so
# that they can carry extra settings (environment, SERIAL).
py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL)

# Distributed tests exist only when WITH_DISTRIBUTE is on, so both the
# test_dist_train registration and the timeout properties live inside this
# guard and are no longer repeated unconditionally after it.
if(WITH_DISTRIBUTE)
  py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
  set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
  set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 180)
  set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 180)
endif()

py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import argparse
import time
import math
import paddle
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
from paddle.fluid import core
import unittest
from multiprocessing import Process
import os
import sys
import signal
# Fix seed for test
fluid.default_startup_program().random_seed = 1
fluid.default_main_program().random_seed = 1
# Static configuration dictionary; SE_ResNeXt.__init__ stores a reference to
# it as `self.params`.
# NOTE(review): the mean/std triples look like per-channel image
# normalization constants -- confirm; nothing in the visible code reads them.
train_parameters = {
"input_size": [3, 224, 224],
"input_mean": [0.485, 0.456, 0.406],
"input_std": [0.229, 0.224, 0.225],
"learning_strategy": {
"name": "piecewise_decay",
"epochs": [30, 60, 90],
"steps": [0.1, 0.01, 0.001, 0.0001]
}
}
class SE_ResNeXt():
def __init__(self, layers=50):
self.params = train_parameters
self.layers = layers
def net(self, input, class_dim=1000):
layers = self.layers
supported_layers = [50, 101, 152]
assert layers in supported_layers, \
"supported layers are {} but input layer is {}".format(supported_layers, layers)
if layers == 50:
cardinality = 32
reduction_ratio = 16
depth = [3, 4, 6, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act='relu')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
elif layers == 101:
cardinality = 32
reduction_ratio = 16
depth = [3, 4, 23, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=7,
stride=2,
act='relu')
conv = fluid.layers.pool2d(
input=conv,
pool_size=3,
pool_stride=2,
pool_padding=1,
pool_type='max')
elif layers == 152:
cardinality = 64
reduction_ratio = 16
depth = [3, 8, 36, 3]
num_filters = [128, 256, 512, 1024]
conv = self.conv_bn_layer(
input=input,
num_filters=64,
filter_size=3,
stride=2,
act='relu')
conv = self.conv_bn_layer(
input=conv, num_filters=64, filter_size=3, stride=1, act='relu')
conv = self.conv_bn_layer(
input=conv,
num_filters=128,
filter_size=3,
stride=1,
act='relu')
conv = fluid.layers.pool2d(
input=conv, pool_size=3, pool_stride=2, pool_padding=1, \
pool_type='max')
for block in range(len(depth)):
for i in range(depth[block]):
conv = self.bottleneck_block(
input=conv,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
cardinality=cardinality,
reduction_ratio=reduction_ratio)
pool = fluid.layers.pool2d(
input=conv, pool_size=7, pool_type='avg', global_pooling=True)
drop = fluid.layers.dropout(x=pool, dropout_prob=0.2)
stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0)
out = fluid.layers.fc(input=drop, size=class_dim, act='softmax')
return out
def shortcut(self, input, ch_out, stride):
ch_in = input.shape[1]
if ch_in != ch_out or stride != 1:
filter_size = 1
return self.conv_bn_layer(input, ch_out, filter_size, stride)
else:
return input
def bottleneck_block(self, input, num_filters, stride, cardinality,
                     reduction_ratio):
    """Build one SE-ResNeXt bottleneck block and return its output.

    The main branch is 1x1 conv-bn (relu) -> 3x3 grouped conv-bn (relu,
    ``groups=cardinality``, ``stride``) -> 1x1 conv-bn with twice
    ``num_filters`` filters and no activation -> squeeze-excitation. It is
    added to the shortcut of ``input`` and passed through relu.
    """
    out_channels = num_filters * 2

    reduced = self.conv_bn_layer(
        input=input, num_filters=num_filters, filter_size=1, act='relu')
    grouped = self.conv_bn_layer(
        input=reduced,
        num_filters=num_filters,
        filter_size=3,
        stride=stride,
        groups=cardinality,
        act='relu')
    expanded = self.conv_bn_layer(
        input=grouped, num_filters=out_channels, filter_size=1, act=None)

    # Channel re-weighting of the main branch.
    scaled = self.squeeze_excitation(
        input=expanded,
        num_channels=out_channels,
        reduction_ratio=reduction_ratio)

    residual = self.shortcut(input, out_channels, stride)
    return fluid.layers.elementwise_add(x=residual, y=scaled, act='relu')
def conv_bn_layer(self,
                  input,
                  num_filters,
                  filter_size,
                  stride=1,
                  groups=1,
                  act=None):
    """Apply a bias-free conv2d followed by batch normalization.

    Args:
        input: variable to convolve.
        num_filters: number of convolution filters.
        filter_size: convolution kernel size.
        stride: convolution stride.
        groups: number of convolution groups.
        act: activation applied by the batch-norm layer (the convolution
            itself has no activation).

    Returns:
        The output variable of the batch-norm layer.
    """
    conv = fluid.layers.conv2d(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        stride=stride,
        # Floor division keeps the padding an int on Python 3 as well;
        # plain `/` would produce a float there.
        padding=(filter_size - 1) // 2,
        groups=groups,
        act=None,
        bias_attr=False)
    return fluid.layers.batch_norm(input=conv, act=act)
def squeeze_excitation(self, input, num_channels, reduction_ratio):
    """Re-weight ``input`` with a squeeze-and-excitation gate.

    Args:
        input: variable to be re-weighted.
        num_channels: size of the excitation (sigmoid) layer.
        reduction_ratio: divisor applied to ``num_channels`` to get the
            size of the intermediate (relu) layer.

    Returns:
        ``input`` multiplied element-wise by the excitation output.
    """
    pool = fluid.layers.pool2d(
        input=input, pool_size=0, pool_type='avg', global_pooling=True)
    # Floor division keeps the layer size an int on Python 3 as well.
    squeeze = fluid.layers.fc(input=pool,
                              size=num_channels // reduction_ratio,
                              act='relu')
    excitation = fluid.layers.fc(input=squeeze,
                                 size=num_channels,
                                 act='sigmoid')
    scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
    return scale
def get_model(batch_size):
    """Build the SE-ResNeXt-50 training program in the default programs.

    Inputs are constant-filled tensors rather than fed data. Returns the
    tuple ``(test_program, avg_cost, train_reader, test_reader, acc_top1,
    out)``.
    """
    # Input data
    image = fluid.layers.fill_constant(
        shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0)
    label = fluid.layers.fill_constant(
        shape=[batch_size, 1], dtype='int64', value=0.0)

    # Train program
    out = SE_ResNeXt(layers=50).net(input=image, class_dim=102)
    avg_cost = fluid.layers.mean(
        x=fluid.layers.cross_entropy(input=out, label=label))
    acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
    acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)

    # Evaluator: cloned before the optimizer ops are appended.
    test_program = fluid.default_main_program().clone(for_test=True)

    # Optimization: piecewise-constant learning rate, divided by 10 at
    # each epoch boundary.
    total_images = 6149  # flowers
    base_lr = 0.1
    step = int(total_images / batch_size + 1)
    boundaries = [step * epoch for epoch in [30, 60, 90]]
    lr_values = [base_lr * (0.1**i) for i in range(len(boundaries) + 1)]
    learning_rate = fluid.layers.piecewise_decay(
        boundaries=boundaries, values=lr_values)
    optimizer = fluid.optimizer.Momentum(
        learning_rate=learning_rate,
        momentum=0.9,
        regularization=fluid.regularizer.L2Decay(1e-4))
    optimizer.minimize(avg_cost)

    # Reader
    train_reader = paddle.batch(
        paddle.dataset.flowers.train(), batch_size=batch_size)
    test_reader = paddle.batch(
        paddle.dataset.flowers.test(), batch_size=batch_size)

    return test_program, avg_cost, train_reader, test_reader, acc_top1, out
def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
    """Create a DistributeTranspiler, transpile ``main_program`` with the
    given trainer/pserver settings and return the transpiler."""
    transpiler = fluid.DistributeTranspiler()
    transpile_args = dict(
        trainer_id=trainer_id,
        program=main_program,
        pservers=pserver_endpoints,
        trainers=trainers)
    transpiler.transpile(**transpile_args)
    return transpiler
class DistSeResneXt2x2:
    """Runs the SE-ResNeXt model either as a parameter server or as a
    trainer, printing the first and last training loss in trainer mode."""

    def run_pserver(self, pserver_endpoints, trainers, current_endpoint,
                    trainer_id):
        """Build and transpile the model, then run the startup and pserver
        programs for ``current_endpoint`` on a CPU executor."""
        get_model(batch_size=2)
        t = get_transpiler(trainer_id,
                           fluid.default_main_program(), pserver_endpoints,
                           trainers)
        pserver_prog = t.get_pserver_program(current_endpoint)
        startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(startup_prog)
        exe.run(pserver_prog)

    def _wait_ps_ready(self, pid):
        """Poll every 3 seconds until ``/tmp/paddle.<pid>.port`` exists.

        Raises:
            AssertionError: once the retry budget is exhausted.
        """
        retry_times = 20
        while True:
            assert retry_times >= 0, "wait ps ready failed"
            time.sleep(3)
            print("waiting ps ready: ", pid)
            try:
                # the listen_and_serv_op would touch a file which contains the listen port
                # on the /tmp directory until it was ready to process all the RPC call.
                os.stat("/tmp/paddle.%d.port" % pid)
                return
            except os.error:
                retry_times -= 1

    def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True):
        """Train for a few steps and print the first and the last loss.

        Args:
            place: device place used for the startup executor and feeder.
            endpoints: pserver endpoints handed to the transpiler.
            trainer_id: id of this trainer.
            trainers: total number of trainers.
            is_dist: when False the program is not transpiled.
        """
        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = get_model(
            batch_size=20)
        if is_dist:
            t = get_transpiler(trainer_id,
                               fluid.default_main_program(), endpoints,
                               trainers)
            trainer_prog = t.get_trainer_program()
        else:
            trainer_prog = fluid.default_main_program()

        startup_exe = fluid.Executor(place)
        startup_exe.run(fluid.default_startup_program())

        strategy = fluid.ExecutionStrategy()
        strategy.num_threads = 1
        strategy.allow_op_delay = False
        exe = fluid.ParallelExecutor(
            True,
            loss_name=avg_cost.name,
            exec_strategy=strategy,
            num_trainers=trainers,
            trainer_id=trainer_id)

        # `.values()` instead of the Python-2-only `.itervalues()`.
        feed_var_list = [
            var for var in trainer_prog.global_block().vars.values()
            if var.is_data
        ]
        # NOTE(review): the feeder and reader below are never consumed; the
        # runs that follow pass no feed. Kept in case construction matters.
        feeder = fluid.DataFeeder(feed_var_list, place)
        reader_generator = train_reader()

        first_loss, = exe.run(fetch_list=[avg_cost.name])
        print(first_loss)
        # `range` instead of the Python-2-only `xrange`.
        for i in range(5):
            loss, = exe.run(fetch_list=[avg_cost.name])
        last_loss, = exe.run(fetch_list=[avg_cost.name])
        print(last_loss)
def main(role="pserver",
         endpoints="127.0.0.1:9123",
         trainer_id=0,
         current_endpoint="127.0.0.1:9123",
         trainers=1,
         is_dist=True):
    """Run the model as a parameter server when ``role`` is "pserver",
    otherwise as a trainer on GPU 0 if CUDA is compiled in, else on CPU."""
    model = DistSeResneXt2x2()
    if role == "pserver":
        model.run_pserver(endpoints, trainers, current_endpoint, trainer_id)
        return

    # Trainer role: choose the device first.
    if core.is_compiled_with_cuda():
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()
    model.run_trainer(place, endpoints, trainer_id, trainers, is_dist)
if __name__ == "__main__":
    # Exactly six arguments are expected after the script name.
    if len(sys.argv) != 7:
        print(
            "Usage: python dist_se_resnext.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]"
        )
        # Stop here: going on would only fail later while indexing or
        # converting sys.argv.
        sys.exit(1)

    role = sys.argv[1]
    endpoints = sys.argv[2]
    trainer_id = int(sys.argv[3])
    current_endpoint = sys.argv[4]
    trainers = int(sys.argv[5])
    # Only the exact string "TRUE" enables distributed mode.
    is_dist = sys.argv[6] == "TRUE"
    main(
        role=role,
        endpoints=endpoints,
        trainer_id=trainer_id,
        current_endpoint=current_endpoint,
        trainers=trainers,
        is_dist=is_dist)
......@@ -15,13 +15,13 @@
import unittest
import numpy as np
from op_test import OpTest
from paddle.fluid import metrics
class TestAucOp(OpTest):
def setUp(self):
self.op_type = "auc"
pred = np.random.random((128, 2)).astype("float32")
indices = np.random.randint(0, 2, (128, 2))
labels = np.random.randint(0, 2, (128, 1))
num_thresholds = 200
tp = np.zeros((num_thresholds, )).astype("int64")
......@@ -30,8 +30,7 @@ class TestAucOp(OpTest):
fn = np.zeros((num_thresholds, )).astype("int64")
self.inputs = {
'Out': pred,
'Indices': indices,
'Predict': pred,
'Label': labels,
'TP': tp,
'TN': tn,
......@@ -39,57 +38,18 @@ class TestAucOp(OpTest):
'FN': fn
}
self.attrs = {'curve': 'ROC', 'num_thresholds': num_thresholds}
# NOTE: sklearn use a different way to generate thresholds
# which will cause the result differs slightly:
# from sklearn.metrics import roc_curve, auc
# fpr, tpr, thresholds = roc_curve(labels, pred)
# auc_value = auc(fpr, tpr)
# we caculate AUC again using numpy for testing
kepsilon = 1e-7 # to account for floating point imprecisions
thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
for i in range(num_thresholds - 2)]
thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]
# caculate TP, FN, TN, FP count
tp_list = np.ndarray((num_thresholds, ))
fn_list = np.ndarray((num_thresholds, ))
tn_list = np.ndarray((num_thresholds, ))
fp_list = np.ndarray((num_thresholds, ))
for idx_thresh, thresh in enumerate(thresholds):
tp, fn, tn, fp = 0, 0, 0, 0
for i, lbl in enumerate(labels):
if lbl:
if pred[i, 0] >= thresh:
tp += 1
else:
fn += 1
else:
if pred[i, 0] >= thresh:
fp += 1
else:
tn += 1
tp_list[idx_thresh] = tp
fn_list[idx_thresh] = fn
tn_list[idx_thresh] = tn
fp_list[idx_thresh] = fp
epsilon = 1e-6
tpr = (tp_list.astype("float32") + epsilon) / (
tp_list + fn_list + epsilon)
fpr = fp_list.astype("float32") / (fp_list + tn_list + epsilon)
rec = (tp_list.astype("float32") + epsilon) / (
tp_list + fp_list + epsilon)
x = fpr[:num_thresholds - 1] - fpr[1:]
y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0
auc_value = np.sum(x * y)
python_auc = metrics.Auc(name="auc",
curve='ROC',
num_thresholds=num_thresholds)
python_auc.update(pred, labels)
self.outputs = {
'AUC': auc_value,
'TPOut': tp_list,
'FNOut': fn_list,
'TNOut': tn_list,
'FPOut': fp_list
'AUC': python_auc.eval(),
'TPOut': python_auc.tp_list,
'FNOut': python_auc.fn_list,
'TNOut': python_auc.tn_list,
'FPOut': python_auc.fp_list
}
def test_check_output(self):
......
......@@ -100,6 +100,8 @@ class TestBeamSearchDecodeOp(unittest.TestCase):
np.array_equal(np.array(sentence_scores), expected_data))
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestBeamSearchDecodeOpGPU(TestBeamSearchDecodeOp):
def setUp(self):
self.scope = core.Scope()
......
......@@ -191,12 +191,16 @@ class TestWithDilation(TestConv2dTransposeOp):
# ------------ test_cudnn ------------
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestCUDNN(TestConv2dTransposeOp):
def init_op_type(self):
self.use_cudnn = True
self.op_type = "conv2d_transpose"
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestCUDNNWithPad(TestWithPad):
def init_test_case(self):
self.pad = [1, 1]
......@@ -212,6 +216,8 @@ class TestCUDNNWithPad(TestWithPad):
self.op_type = "conv2d_transpose"
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestCUDNNWithStride(TestWithStride):
def init_test_case(self):
self.pad = [1, 1]
......@@ -227,6 +233,8 @@ class TestCUDNNWithStride(TestWithStride):
self.op_type = "conv2d_transpose"
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestCUDNNWithGroups(TestWithGroups):
def init_test_case(self):
self.pad = [1, 1]
......
......@@ -197,12 +197,16 @@ class TestWithDilation(TestConv3dTransposeOp):
# ------------ test_cudnn ------------
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestCUDNN(TestConv3dTransposeOp):
def init_op_type(self):
self.use_cudnn = True
self.op_type = "conv3d_transpose"
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestCUDNNWithPad(TestWithPad):
def init_test_case(self):
self.pad = [1, 1, 1]
......@@ -218,6 +222,8 @@ class TestCUDNNWithPad(TestWithPad):
self.op_type = "conv3d_transpose"
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestCUDNNWithStride(TestWithStride):
def init_test_case(self):
self.pad = [1, 1, 1]
......@@ -233,6 +239,8 @@ class TestCUDNNWithStride(TestWithStride):
self.op_type = "conv3d_transpose"
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestCUDNNWithGroups(TestWithGroups):
def init_test_case(self):
self.pad = [1, 1, 1]
......
......@@ -142,8 +142,7 @@ class TestDataBalance(unittest.TestCase):
filenames=[self.lod_data_file_name],
shapes=[[-1, 3], [-1, 1]],
lod_levels=[1, 0],
dtypes=['float32', 'int32'],
thread_num=1)
dtypes=['float32', 'int32'])
ins, label = fluid.layers.read_file(data_reader)
place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace()
......@@ -156,7 +155,7 @@ class TestDataBalance(unittest.TestCase):
main_program=main_prog,
build_strategy=build_strategy)
if (parallel_exe.device_count > self.batch_size):
if parallel_exe.device_count > self.batch_size:
print("WARNING: Unittest TestDataBalance skipped. \
For the result is not correct when device count \
is larger than batch size.")
......@@ -190,3 +189,7 @@ class TestDataBalance(unittest.TestCase):
def test_all(self):
self.main()
self.main_lod()
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import argparse
import time
import math
import unittest
import os
import signal
import subprocess
class TestDistSeResneXt2x2(unittest.TestCase):
    """Compares a local run of dist_se_resnext.py with a run of the same
    script using two parameter servers and two trainers."""

    def setUp(self):
        self._trainers = 2
        self._pservers = 2
        self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124"
        self._python_interp = "python"

    def start_pserver(self):
        """Start one pserver subprocess per endpoint.

        Returns:
            The two ``subprocess.Popen`` objects, in endpoint order.
        """
        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
        ps0_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \
            (self._python_interp, self._ps_endpoints, ps0_ep, self._trainers)
        ps1_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \
            (self._python_interp, self._ps_endpoints, ps1_ep, self._trainers)

        ps0_proc = subprocess.Popen(
            ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        ps1_proc = subprocess.Popen(
            ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        return ps0_proc, ps1_proc

    def _wait_ps_ready(self, pid):
        """Poll every 3 seconds until ``/tmp/paddle.<pid>.port`` exists.

        Raises:
            AssertionError: once the retry budget is exhausted.
        """
        retry_times = 20
        while True:
            assert retry_times >= 0, "wait ps ready failed"
            time.sleep(3)
            try:
                # the listen_and_serv_op would touch a file which contains the listen port
                # on the /tmp directory until it was ready to process all the RPC call.
                os.stat("/tmp/paddle.%d.port" % pid)
                return
            except os.error:
                retry_times -= 1

    def non_test_with_place(self):
        """Check that local and distributed training report the same first
        and last loss.

        The name does not start with ``test``, so the default unittest
        loader does not collect this method.
        """
        # *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN
        # Default to "" so an unset variable does not put None into the
        # environment handed to Popen.
        required_envs = {
            "PATH": os.getenv("PATH", ""),
            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
            "FLAGS_fraction_of_gpu_memory_to_use": "0.15"
        }

        # Run local to get a base line
        env_local = {"CUDA_VISIBLE_DEVICES": "0"}
        env_local.update(required_envs)
        local_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d FALSE" % \
            (self._python_interp, "127.0.0.1:1234", "127.0.0.1:1234", 1)
        local_proc = subprocess.Popen(
            local_cmd.split(" "), stdout=subprocess.PIPE, env=env_local)
        # communicate() drains the pipe while waiting, so the child cannot
        # block on a full stdout pipe as it could with wait() + read().
        local_ret, _ = local_proc.communicate()

        # Run dist train to compare with local results
        ps0, ps1 = self.start_pserver()
        try:
            self._wait_ps_ready(ps0.pid)
            self._wait_ps_ready(ps1.pid)

            ps0_ep, ps1_ep = self._ps_endpoints.split(",")
            tr0_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d TRUE" % \
                (self._python_interp, self._ps_endpoints, ps0_ep, self._trainers)
            tr1_cmd = "%s dist_se_resnext.py trainer %s 1 %s %d TRUE" % \
                (self._python_interp, self._ps_endpoints, ps1_ep, self._trainers)

            env0 = {"CUDA_VISIBLE_DEVICES": "0"}
            env1 = {"CUDA_VISIBLE_DEVICES": "1"}
            env0.update(required_envs)
            env1.update(required_envs)

            # The context manager closes the devnull handle even when a
            # trainer fails to start.
            with open(os.devnull, 'w') as FNULL:
                tr0_proc = subprocess.Popen(
                    tr0_cmd.split(" "),
                    stdout=subprocess.PIPE,
                    stderr=FNULL,
                    env=env0)
                tr1_proc = subprocess.Popen(
                    tr1_cmd.split(" "),
                    stdout=subprocess.PIPE,
                    stderr=FNULL,
                    env=env1)
                loss_data0, _ = tr0_proc.communicate()
                tr1_proc.communicate()

            # NOTE(review): eval() is applied to text printed by the
            # subprocesses started above; it must never see untrusted input.
            lines = loss_data0.split("\n")
            dist_first_loss = eval(lines[0].replace(" ", ","))[0]
            dist_last_loss = eval(lines[1].replace(" ", ","))[0]

            local_lines = local_ret.split("\n")
            local_first_loss = eval(local_lines[0])[0]
            local_last_loss = eval(local_lines[1])[0]

            self.assertAlmostEqual(local_first_loss, dist_first_loss)
            self.assertAlmostEqual(local_last_loss, dist_last_loss)
        finally:
            # Always kill the parameter servers, including when a check
            # above fails.
            os.kill(ps0.pid, signal.SIGKILL)
            os.kill(ps1.pid, signal.SIGKILL)
if __name__ == "__main__":
unittest.main()
......@@ -22,6 +22,9 @@ import numpy
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid.layers.io import ListenAndServ
from paddle.fluid.layers.io import Recv
from paddle.fluid.layers.io import Send
class TestSendOp(unittest.TestCase):
......@@ -65,8 +68,7 @@ class TestSendOp(unittest.TestCase):
main = fluid.Program()
with fluid.program_guard(main):
serv = layers.ListenAndServ(
"127.0.0.1:0", ["X"], optimizer_mode=False)
serv = ListenAndServ("127.0.0.1:0", ["X"], optimizer_mode=False)
with serv.do():
out_var = main.global_block().create_var(
name="scale_0.tmp_0",
......@@ -99,8 +101,8 @@ class TestSendOp(unittest.TestCase):
persistable=False,
shape=[32, 32])
fluid.initializer.Constant(value=2.3)(get_var, main.global_block())
layers.Send("127.0.0.1:%d" % port, [x])
o = layers.Recv("127.0.0.1:%d" % port, [get_var])
Send("127.0.0.1:%d" % port, [x])
o = Recv("127.0.0.1:%d" % port, [get_var])
exe = fluid.Executor(place)
self.dist_out = exe.run(main, fetch_list=o) # o is a list
......
......@@ -39,7 +39,7 @@ class TestMomentumOp1(OpTest):
velocity_out = mu * velocity + grad
if use_nesterov:
param_out = param - grad * learning_rate + \
param_out = param - grad * learning_rate - \
velocity_out * mu * learning_rate
else:
param_out = param - learning_rate * velocity_out
......@@ -75,7 +75,7 @@ class TestMomentumOp2(OpTest):
velocity_out = mu * velocity + grad
if use_nesterov:
param_out = param - grad * learning_rate + \
param_out = param - grad * learning_rate - \
velocity_out * mu * learning_rate
else:
param_out = param - learning_rate * velocity_out
......
......@@ -39,17 +39,17 @@ class TestMultipleReader(unittest.TestCase):
copyfile('./mnist_0.recordio', './mnist_1.recordio')
copyfile('./mnist_0.recordio', './mnist_2.recordio')
def main(self, thread_num):
def main(self, is_test=False):
file_list = [
'./mnist_0.recordio', './mnist_1.recordio', './mnist_2.recordio'
]
with fluid.program_guard(fluid.Program(), fluid.Program()):
data_files = fluid.layers.open_files(
filenames=file_list,
thread_num=thread_num,
shapes=[(-1, 784), (-1, 1)],
lod_levels=[0, 0],
dtypes=['float32', 'int64'])
dtypes=['float32', 'int64'],
is_test=is_test)
img, label = fluid.layers.read_file(data_files)
if fluid.core.is_compiled_with_cuda():
......@@ -71,6 +71,9 @@ class TestMultipleReader(unittest.TestCase):
self.assertEqual(batch_count, self.num_batch * 3)
def test_main(self):
self.main(thread_num=3) # thread number equals to file number
self.main(thread_num=10) # thread number is larger than file number
self.main(thread_num=2) # thread number is less than file number
self.main(is_test=False)
self.main(is_test=True)
if __name__ == '__main__':
unittest.main()
......@@ -15,6 +15,7 @@
import paddle.dataset.flowers as flowers
import math
import paddle.fluid as fluid
import paddle.fluid.core as core
import unittest
import numpy as np
import paddle
......@@ -92,7 +93,8 @@ class TestFetchOp(unittest.TestCase):
train_inputs.append(tst_reader_iter.next())
os.environ['CPU_NUM'] = str(4)
self.parallel_exe(train_inputs, seed=1, use_cuda=True)
if core.is_compiled_with_cuda():
self.parallel_exe(train_inputs, seed=1, use_cuda=True)
self.parallel_exe(train_inputs, seed=1, use_cuda=False)
......@@ -137,7 +139,8 @@ class TestFeedParallel(unittest.TestCase):
def test_feed_op(self):
os.environ['CPU_NUM'] = str(4)
self.parallel_exe(use_cuda=True, seed=1)
if core.is_compiled_with_cuda():
self.parallel_exe(use_cuda=True, seed=1)
self.parallel_exe(use_cuda=False, seed=1)
......
......@@ -14,6 +14,7 @@
from parallel_executor_test_base import TestParallelExecutorBase
import paddle.fluid as fluid
import paddle.fluid.core as core
import numpy as np
import paddle
import paddle.dataset.mnist as mnist
......@@ -32,9 +33,7 @@ def simple_fc_net(use_feed):
filenames=[MNIST_RECORDIO_FILE],
shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'],
thread_num=1,
for_parallel=True)
dtypes=['float32', 'int64'])
reader = fluid.layers.io.double_buffer(reader)
img, label = fluid.layers.read_file(reader)
hidden = img
......@@ -60,9 +59,7 @@ def fc_with_batchnorm(use_feed):
filenames=[MNIST_RECORDIO_FILE],
shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'],
thread_num=1,
for_parallel=True)
dtypes=['float32', 'int64'])
reader = fluid.layers.io.double_buffer(reader)
img, label = fluid.layers.read_file(reader)
......@@ -101,13 +98,25 @@ class TestMNIST(TestParallelExecutorBase):
fluid.recordio_writer.convert_reader_to_recordio_file(
MNIST_RECORDIO_FILE, reader, feeder)
def _init_data(self, random=True):
np.random.seed(5)
if random:
img = np.random.random(size=[32, 784]).astype(np.float32)
else:
img = np.ones(shape=[32, 784], dtype='float32')
label = np.ones(shape=[32, 1], dtype='int64')
return img, label
# simple_fc
def check_simple_fc_convergence(self, use_cuda, use_reduce=False):
if use_cuda and not core.is_compiled_with_cuda():
return
self.check_network_convergence(simple_fc_net, use_cuda=use_cuda)
self.check_network_convergence(
simple_fc_net, use_cuda=use_cuda, allow_op_delay=True)
img = np.zeros(shape=[32, 784], dtype='float32')
label = np.ones(shape=[32, 1], dtype='int64')
img, label = self._init_data()
self.check_network_convergence(
simple_fc_net,
feed_dict={"image": img,
......@@ -115,6 +124,37 @@ class TestMNIST(TestParallelExecutorBase):
use_cuda=use_cuda,
use_reduce=use_reduce)
def check_simple_fc_convergence_with_Reduce(self, use_cuda):
if use_cuda and not core.is_compiled_with_cuda():
return
self.check_network_convergence(
simple_fc_net, use_cuda=use_cuda, use_reduce=True)
self.check_network_convergence(
simple_fc_net,
use_cuda=use_cuda,
allow_op_delay=True,
use_reduce=True)
img, label = self._init_data()
all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
simple_fc_net,
feed_dict={"image": img,
"label": label},
use_cuda=use_cuda,
use_reduce=False)
reduce_first_loss, reduce_last_loss = self.check_network_convergence(
simple_fc_net,
feed_dict={"image": img,
"label": label},
use_cuda=use_cuda,
use_reduce=True)
for loss in zip(all_reduce_first_loss, reduce_first_loss):
self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
for loss in zip(all_reduce_last_loss, reduce_last_loss):
self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
def test_simple_fc(self):
# use_cuda
self.check_simple_fc_convergence(True)
......@@ -122,12 +162,15 @@ class TestMNIST(TestParallelExecutorBase):
def test_simple_fc_with_new_strategy(self):
# use_cuda, use_reduce
self.check_simple_fc_convergence(True, True)
self.check_simple_fc_convergence(False, True)
self.check_simple_fc_convergence_with_Reduce(True)
self.check_simple_fc_convergence_with_Reduce(False)
def check_simple_fc_parallel_accuracy(self, use_cuda):
if use_cuda and not core.is_compiled_with_cuda():
return
img, label = self._init_data(random=False)
def check_simple_fc_parallel_accuracy(self, use_cuda, use_reduce=False):
img = np.zeros(shape=[32, 784], dtype='float32')
label = np.ones(shape=[32, 1], dtype='int64')
single_first_loss, single_last_loss = self.check_network_convergence(
method=simple_fc_net,
seed=1000,
......@@ -141,8 +184,7 @@ class TestMNIST(TestParallelExecutorBase):
feed_dict={"image": img,
"label": label},
use_cuda=use_cuda,
use_parallel_executor=True,
use_reduce=use_reduce)
use_parallel_executor=True)
for p_f in parallel_first_loss:
self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6)
......@@ -153,30 +195,53 @@ class TestMNIST(TestParallelExecutorBase):
self.check_simple_fc_parallel_accuracy(True)
self.check_simple_fc_parallel_accuracy(False)
def test_simple_fc_parallel_accuracy_with_new_strategy(self):
# use_cuda, use_reduce
self.check_simple_fc_parallel_accuracy(True, True)
self.check_simple_fc_parallel_accuracy(False, True)
def check_batchnorm_fc_convergence(self, use_cuda):
if use_cuda and not core.is_compiled_with_cuda():
return
def check_batchnorm_fc_convergence(self, use_cuda, use_reduce=False):
self.check_network_convergence(fc_with_batchnorm, use_cuda=use_cuda)
img = np.zeros(shape=[32, 784], dtype='float32')
label = np.ones(shape=[32, 1], dtype='int64')
img, label = self._init_data()
self.check_network_convergence(
fc_with_batchnorm,
feed_dict={"image": img,
"label": label},
use_cuda=use_cuda)
def check_batchnorm_fc_convergence_use_reduce(self, use_cuda):
if use_cuda and not core.is_compiled_with_cuda():
return
self.check_network_convergence(
fc_with_batchnorm, use_cuda=use_cuda, use_reduce=True)
img, label = self._init_data()
all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
fc_with_batchnorm,
feed_dict={"image": img,
"label": label},
use_cuda=use_cuda,
use_reduce=use_reduce)
use_reduce=False)
reduce_first_loss, reduce_last_loss = self.check_network_convergence(
fc_with_batchnorm,
feed_dict={"image": img,
"label": label},
use_cuda=use_cuda,
use_reduce=True)
for loss in zip(all_reduce_first_loss, reduce_first_loss):
self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
for loss in zip(all_reduce_last_loss, reduce_last_loss):
self.assertAlmostEquals(loss[0], loss[1], delta=1e-4)
def test_batchnorm_fc(self):
self.check_batchnorm_fc_convergence(True)
self.check_batchnorm_fc_convergence(False)
def test_batchnorm_fc_with_new_strategy(self):
# use_cuda, use_reduce
self.check_batchnorm_fc_convergence(True, True)
self.check_batchnorm_fc_convergence(False, True)
self.check_batchnorm_fc_convergence_use_reduce(True)
self.check_batchnorm_fc_convergence_use_reduce(False)
if __name__ == '__main__':
......
......@@ -16,6 +16,7 @@ import paddle.fluid as fluid
import paddle.fluid.layers.ops as ops
from paddle.fluid.initializer import init_on_cpu
from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
import paddle.fluid.core as core
from parallel_executor_test_base import TestParallelExecutorBase
import unittest
import math
......@@ -140,6 +141,9 @@ class TestResnet(TestParallelExecutorBase):
use_reduce=False,
iter=20):
if use_cuda and not core.is_compiled_with_cuda():
return
os.environ['CPU_NUM'] = str(4)
def _cosine_decay(learning_rate, step_each_epoch, epochs=120):
......
......@@ -13,6 +13,7 @@
# limitations under the License.
import paddle.fluid as fluid
import paddle.fluid.core as core
import numpy as np
import unittest
import os
......@@ -92,16 +93,18 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
def test_parallel_testing(self):
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
self.check_network_convergence(
use_cuda=True, build_strategy=build_strategy)
if core.is_compiled_with_cuda():
self.check_network_convergence(
use_cuda=True, build_strategy=build_strategy)
self.check_network_convergence(
use_cuda=False, build_strategy=build_strategy)
def test_parallel_testing_with_new_strategy(self):
build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
self.check_network_convergence(
use_cuda=True, build_strategy=build_strategy)
if core.is_compiled_with_cuda():
self.check_network_convergence(
use_cuda=True, build_strategy=build_strategy)
self.check_network_convergence(
use_cuda=False, build_strategy=build_strategy)
......
......@@ -56,6 +56,8 @@ class TestPrintOpCPU(unittest.TestCase):
return_numpy=False)
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestPrintOpGPU(TestPrintOpCPU):
def setUp(self):
self.place = core.CUDAPlace(0)
......
......@@ -79,12 +79,18 @@ class TestProfiler(unittest.TestCase):
pass_acc_calculator.add(value=acc, weight=b_size)
pass_acc = pass_acc_calculator.eval()
@unittest.skipIf(not core.is_compiled_with_cuda(),
"profiler is enabled only with GPU")
def test_cpu_profiler(self):
self.net_profiler('CPU')
@unittest.skipIf(not core.is_compiled_with_cuda(),
"profiler is enabled only with GPU")
def test_cuda_profiler(self):
self.net_profiler('GPU')
@unittest.skipIf(not core.is_compiled_with_cuda(),
"profiler is enabled only with GPU")
def test_all_profiler(self):
self.net_profiler('All', '/tmp/profile_out')
with open('/tmp/profile_out', 'r') as f:
......
......@@ -45,12 +45,12 @@ class TestPyReader(unittest.TestCase):
) else fluid.CPUPlace()
executor = fluid.Executor(place)
data_file, feed_queue = fluid.layers.py_reader(
data_file = fluid.layers.py_reader(
capacity=self.capacity,
dtypes=self.dtypes,
lod_levels=self.lod_levels,
shapes=self.shapes)
feed_queue = data_file.queue
read_out_data = fluid.layers.read_file(data_file)
self.inputs = []
......
......@@ -52,11 +52,13 @@ def simple_fc_net(in_size,
batch_size,
queue_capacity,
use_double_buffer=False):
reader, feed_queue = fluid.layers.py_reader(
reader = fluid.layers.py_reader(
capacity=queue_capacity,
shapes=[[-1, in_size], [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'])
dtypes=['float32', 'int64'],
use_double_buffer=False)
feed_queue = reader.queue
reader = fluid.layers.batch(reader, batch_size=batch_size)
if use_double_buffer:
reader = fluid.layers.double_buffer(reader)
......
......@@ -89,15 +89,11 @@ class TestProdOp(OpTest):
self.check_grad(['X'], 'Out')
class TestKeepDimReduce(OpTest):
class Test1DReduce(OpTest):
def setUp(self):
self.op_type = "reduce_sum"
self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
self.attrs = {'dim': [-2], 'keep_dim': True}
self.outputs = {
'Out':
self.inputs['X'].sum(axis=tuple(self.attrs['dim']), keepdims=True)
}
self.inputs = {'X': np.random.random(20).astype("float64")}
self.outputs = {'Out': self.inputs['X'].sum(axis=0)}
def test_check_output(self):
self.check_output()
......@@ -106,32 +102,82 @@ class TestKeepDimReduce(OpTest):
self.check_grad(['X'], 'Out')
class Test1DReduce(OpTest):
class Test2DReduce0(Test1DReduce):
def setUp(self):
self.op_type = "reduce_sum"
self.inputs = {'X': np.random.random(20).astype("float64")}
self.attrs = {'dim': [0]}
self.inputs = {'X': np.random.random((20, 10)).astype("float64")}
self.outputs = {'Out': self.inputs['X'].sum(axis=0)}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out')
class Test2DReduce1(Test1DReduce):
def setUp(self):
self.op_type = "reduce_sum"
self.attrs = {'dim': [1]}
self.inputs = {'X': np.random.random((20, 10)).astype("float64")}
self.outputs = {
'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
}
class TestReduceAll(OpTest):
class Test3DReduce0(Test1DReduce):
def setUp(self):
self.op_type = "reduce_sum"
self.attrs = {'dim': [1]}
self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")}
self.outputs = {
'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
}
class Test3DReduce1(Test1DReduce):
def setUp(self):
self.op_type = "reduce_sum"
self.attrs = {'dim': [2]}
self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")}
self.outputs = {
'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
}
class Test3DReduce2(Test1DReduce):
def setUp(self):
self.op_type = "reduce_sum"
self.attrs = {'dim': [-2]}
self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")}
self.outputs = {
'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
}
class Test3DReduce3(Test1DReduce):
def setUp(self):
self.op_type = "reduce_sum"
self.attrs = {'dim': [1, 2]}
self.inputs = {'X': np.random.random((5, 6, 7)).astype("float64")}
self.outputs = {
'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']))
}
class TestKeepDimReduce(Test1DReduce):
def setUp(self):
self.op_type = "reduce_sum"
self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
self.attrs = {'dim': [1], 'keep_dim': True}
self.outputs = {
'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']),
keepdims=self.attrs['keep_dim'])
}
class TestReduceAll(Test1DReduce):
def setUp(self):
self.op_type = "reduce_sum"
self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")}
self.attrs = {'reduce_all': True}
self.outputs = {'Out': self.inputs['X'].sum()}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out')
## reduction in multi dims
class TestReduceMeanOpMultiAxises(OpTest):
......
......@@ -61,6 +61,8 @@ class TestSequenceSoftmaxOp(OpTest):
# ----------------cudnn Sequencesoftmax----------------
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestSequenceSoftmaxCUDNNOp(TestSequenceSoftmaxOp):
def init_op_type(self):
self.use_cudnn = True
......
......@@ -63,11 +63,15 @@ class TestSoftmaxOp(OpTest):
self.check_grad(["X"], "Out", max_relative_error=0.01)
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestSoftmaxCUDNNOp(TestSoftmaxOp):
def init_kernel_type(self):
self.use_cudnn = True
# Skipped unless core.is_compiled_with_cuda() reports a CUDA build.
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestSoftmaxFP16Op(TestSoftmaxOp):
# Overrides init_kernel_type to set the test dtype to numpy float16.
def init_kernel_type(self):
self.dtype = np.float16
......@@ -79,6 +83,8 @@ class TestSoftmaxFP16Op(TestSoftmaxOp):
self.check_output_with_place(place, atol=1e-3)
# Skipped unless core.is_compiled_with_cuda() reports a CUDA build.
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestSoftmaxFP16CUDNNOp(TestSoftmaxOp):
# Overrides init_kernel_type to set the use_cudnn flag on the test case.
def init_kernel_type(self):
self.use_cudnn = True
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import re
import paddle.version as fluid_version
class VersionTest(unittest.TestCase):
def setUp(self):
self._major_regex = "[0-9]+"
self._minor_regex = "[0-9]+"
self._patch_regex = "[0-9]+(\\.(a|b|rc)\\.[0-9]+)?"
self._rc_regex = "[0-9]+"
self._version_regex = "[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?"
self._commit_regex = "[0-9a-f]{5,49}"
def test_check_output(self):
# check commit format
self.assertTrue(re.match(self._commit_regex, fluid_version.commit))
self.assertTrue(isinstance(fluid_version.istaged, bool))
# check version format
if fluid_version.istaged:
self.assertEqual(fluid_version.major, 0)
self.assertEqual(fluid_version.minor, 0)
self.assertEqual(fluid_version.patch, "0")
self.assertEqual(fluid_version.rc, 0)
self.assertEqual(fluid_version.full_version, "0.0.0")
else:
self.assertTrue(re.match(self._major_regex, fluid_version.major))
self.assertTrue(re.match(self._minor_regex, fluid_version.minor))
self.assertTrue(re.match(self._patch_regex, fluid_version.patch))
self.assertTrue(re.match(self._rc_regex, fluid_version.rc))
self.assertTrue(
re.match(self._version_regex, fluid_version.full_version))
......@@ -68,8 +68,14 @@ def reader_creator(image_filename, label_filename, buffer_size):
for i in xrange(buffer_size):
yield images[i, :], int(labels[i])
finally:
m.terminate()
l.terminate()
try:
m.terminate()
except:
pass
try:
l.terminate()
except:
pass
return reader
......
......@@ -104,6 +104,8 @@ packages=['paddle',
'paddle.fluid.proto',
'paddle.fluid.proto.profiler',
'paddle.fluid.layers',
'paddle.fluid.contrib',
'paddle.fluid.contrib.decoder',
'paddle.fluid.transpiler',
'paddle.fluid.transpiler.details']
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册