diff --git a/benchmark/paddle/image/resnet.py b/benchmark/paddle/image/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ae1857642e8df4b3859eec68a3a5227d1c4fcb3
--- /dev/null
+++ b/benchmark/paddle/image/resnet.py
@@ -0,0 +1,213 @@
+#!/usr/bin/env python
+from paddle.trainer_config_helpers import *
+
+# Input image geometry and number of output classes.
+height = 224
+width = 224
+num_class = 1000
+# Values that can be overridden from the command line via config args.
+batch_size = get_config_arg('batch_size', int, 64)
+# layer_num selects the ResNet depth (50, 101 or 152) at the bottom of this file.
+layer_num = get_config_arg("layer_num", int, 50)
+# is_test is forwarded to batch_norm_layer as use_global_stats in conv_bn_layer.
+is_test = get_config_arg("is_test", bool, False)
+
+# args is handed to the `process` object of the `provider` module.
+args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+# NOTE(review): the second positional argument (None) is presumably the test
+# list, i.e. only training data is configured -- confirm against the API.
+define_py_data_sources2(
+ "train.list", None, module="provider", obj="process", args=args)
+
+# NOTE(review): learning rate is divided by batch_size and the L2 rate is
+# multiplied by it -- presumably to compensate for per-batch gradient
+# summation in the trainer; confirm before changing either value.
+settings(
+ batch_size=batch_size,
+ learning_rate=0.01 / batch_size,
+ learning_method=MomentumOptimizer(0.9),
+ regularization=L2Regularization(0.0005 * batch_size))
+
+
+#######################Network Configuration #############
+def conv_bn_layer(name,
+ input,
+ filter_size,
+ num_filters,
+ stride,
+ padding,
+ channels=None,
+ active_type=ReluActivation()):
+ """
+ Stack an img_conv_layer (linear act, no bias) and a batch_norm_layer.
+ The layers are named name + "_conv" and name + "_bn".
+ Note:
+ conv layer has no activation; active_type is applied by the
+ batch norm layer, whose output is returned.
+ """
+
+ tmp = img_conv_layer(
+ name=name + "_conv",
+ input=input,
+ filter_size=filter_size,
+ num_channels=channels,
+ num_filters=num_filters,
+ stride=stride,
+ padding=padding,
+ act=LinearActivation(),
+ bias_attr=False)
+ # is_test is the module-level config arg; it is passed as use_global_stats.
+ return batch_norm_layer(
+ name=name + "_bn", input=tmp, act=active_type, use_global_stats=is_test)
+
+
+def bottleneck_block(name, input, num_filters1, num_filters2):
+ """
+ A wrapper for the bottleneck building block in ResNet.
+ Three conv_bn_layers are chained: 1x1 and 3x3 with num_filters1 outputs,
+ then 1x1 with num_filters2 outputs. All of them use stride 1.
+ Last conv_bn_layer has no activation.
+ Addto layer sums the block input with the branch output and
+ has activation of relu.
+ """
+ last_name = conv_bn_layer(
+ name=name + '_branch2a',
+ input=input,
+ filter_size=1,
+ num_filters=num_filters1,
+ stride=1,
+ padding=0)
+ last_name = conv_bn_layer(
+ name=name + '_branch2b',
+ input=last_name,
+ filter_size=3,
+ num_filters=num_filters1,
+ stride=1,
+ padding=1)
+ last_name = conv_bn_layer(
+ name=name + '_branch2c',
+ input=last_name,
+ filter_size=1,
+ num_filters=num_filters2,
+ stride=1,
+ padding=0,
+ active_type=LinearActivation())
+
+ # Identity shortcut: the unmodified input is added to the branch output.
+ return addto_layer(
+ name=name + "_addto", input=[input, last_name], act=ReluActivation())
+
+
+def mid_projection(name, input, num_filters1, num_filters2, stride=2):
+ """
+ A wrapper for the middle projection block in ResNet.
+ Projection shortcuts are used for increasing dimensions,
+ and other shortcuts are identity.
+ branch1: 1x1 conv_bn_layer projection shortcut with num_filters2
+ outputs, has no activation.
+ branch2x: bottleneck branch made of 1x1, 3x3 and 1x1 conv_bn_layers.
+ branch1 and branch2c are summed by an addto layer with relu activation.
+ """
+ # stride (default 2) is applied only to branch1 and branch2a
+ branch1 = conv_bn_layer(
+ name=name + '_branch1',
+ input=input,
+ filter_size=1,
+ num_filters=num_filters2,
+ stride=stride,
+ padding=0,
+ active_type=LinearActivation())
+
+ last_name = conv_bn_layer(
+ name=name + '_branch2a',
+ input=input,
+ filter_size=1,
+ num_filters=num_filters1,
+ stride=stride,
+ padding=0)
+ last_name = conv_bn_layer(
+ name=name + '_branch2b',
+ input=last_name,
+ filter_size=3,
+ num_filters=num_filters1,
+ stride=1,
+ padding=1)
+
+ last_name = conv_bn_layer(
+ name=name + '_branch2c',
+ input=last_name,
+ filter_size=1,
+ num_filters=num_filters2,
+ stride=1,
+ padding=0,
+ active_type=LinearActivation())
+
+ return addto_layer(
+ name=name + "_addto", input=[branch1, last_name], act=ReluActivation())
+
+
+# Flattened input image; the factor 3 matches channels=3 passed to conv1 below.
+img = data_layer(name='image', size=height * width * 3)
+
+
+def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3):
+ """
+ A wrapper for 50, 101, 152 layers of ResNet, built on the global `img` layer.
+ Each stage starts with one mid_projection block followed by
+ (resN_num - 1) bottleneck_block layers, so each count includes that
+ first block.
+ res2_num: number of blocks stacked in conv2_x
+ res3_num: number of blocks stacked in conv3_x
+ res4_num: number of blocks stacked in conv4_x
+ res5_num: number of blocks stacked in conv5_x
+ Returns the final softmax fc_layer of size num_class.
+ """
+ # For ImageNet
+ # conv1: 112x112
+ tmp = conv_bn_layer(
+ "conv1",
+ input=img,
+ filter_size=7,
+ channels=3,
+ num_filters=64,
+ stride=2,
+ padding=3)
+ tmp = img_pool_layer(name="pool1", input=tmp, pool_size=3, stride=2)
+
+ # conv2_x: 56x56
+ # stride=1 here; the later stages use mid_projection's default stride of 2
+ tmp = mid_projection(
+ name="res2_1", input=tmp, num_filters1=64, num_filters2=256, stride=1)
+ for i in xrange(2, res2_num + 1, 1):
+ tmp = bottleneck_block(
+ name="res2_" + str(i), input=tmp, num_filters1=64, num_filters2=256)
+
+ # conv3_x: 28x28
+ tmp = mid_projection(
+ name="res3_1", input=tmp, num_filters1=128, num_filters2=512)
+ for i in xrange(2, res3_num + 1, 1):
+ tmp = bottleneck_block(
+ name="res3_" + str(i),
+ input=tmp,
+ num_filters1=128,
+ num_filters2=512)
+
+ # conv4_x: 14x14
+ tmp = mid_projection(
+ name="res4_1", input=tmp, num_filters1=256, num_filters2=1024)
+ for i in xrange(2, res4_num + 1, 1):
+ tmp = bottleneck_block(
+ name="res4_" + str(i),
+ input=tmp,
+ num_filters1=256,
+ num_filters2=1024)
+
+ # conv5_x: 7x7
+ tmp = mid_projection(
+ name="res5_1", input=tmp, num_filters1=512, num_filters2=2048)
+ for i in xrange(2, res5_num + 1, 1):
+ tmp = bottleneck_block(
+ name="res5_" + str(i),
+ input=tmp,
+ num_filters1=512,
+ num_filters2=2048)
+
+ # pool_size=7 matches the 7x7 conv5_x map noted above
+ tmp = img_pool_layer(
+ name='avgpool',
+ input=tmp,
+ pool_size=7,
+ stride=1,
+ pool_type=AvgPooling())
+
+ return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation())
+
+
+# Pick the per-stage block counts (conv2_x .. conv5_x) for the requested depth.
+if layer_num == 50:
+ resnet = deep_res_net(3, 4, 6, 3)
+elif layer_num == 101:
+ resnet = deep_res_net(3, 4, 23, 3)
+elif layer_num == 152:
+ resnet = deep_res_net(3, 8, 36, 3)
+else:
+ # Fail fast: merely printing would leave `resnet` undefined and surface
+ # below as an unrelated NameError when the loss layer is built.
+ raise ValueError(
+ "Wrong layer number %s, use 50, 101 or 152." % layer_num)
+
+# Wire the label input and the cross-entropy loss on the softmax output.
+lbl = data_layer(name="label", size=num_class)
+loss = cross_entropy(name='loss', input=resnet, label=lbl)
+inputs(img, lbl)
+outputs(loss)
diff --git a/benchmark/paddle/image/run_mkldnn.sh b/benchmark/paddle/image/run_mkldnn.sh
index e31fec1cd850157d90ddcab2d559d52381ecd317..a4527e04968cf8c8c3c31d16f50bc3e28381f6d8 100755
--- a/benchmark/paddle/image/run_mkldnn.sh
+++ b/benchmark/paddle/image/run_mkldnn.sh
@@ -5,22 +5,23 @@ function train() {
export OMP_DYNAMIC="FALSE"
export KMP_AFFINITY="granularity=fine,compact,0,0"
topology=$1
- bs=$2
- use_mkldnn=$3
- if [ $3 == "True" ]; then
+ layer_num=$2
+ bs=$3
+ use_mkldnn=$4
+ if [ $4 == "True" ]; then
thread=1
- log="logs/${topology}-mkldnn-${bs}.log"
- elif [ $3 == "False" ]; then
+ log="logs/${topology}-${layer_num}-mkldnn-${bs}.log"
+ elif [ $4 == "False" ]; then
thread=`nproc`
# each trainer_count use only 1 core to avoid conflict
export OMP_NUM_THREADS=1
export MKL_NUM_THREADS=1
- log="logs/${topology}-${thread}mklml-${bs}.log"
+ log="logs/${topology}-${layer_num}-${thread}mklml-${bs}.log"
else
echo "Wrong input $3, use True or False."
exit 0
fi
- args="batch_size=${bs}"
+ args="batch_size=${bs},layer_num=${layer_num}"
config="${topology}.py"
paddle train --job=time \
--config=$config \
@@ -40,12 +41,9 @@ if [ ! -d "logs" ]; then
mkdir logs
fi
-#========== mkldnn ==========#
-train vgg 64 True
-train vgg 128 True
-train vgg 256 True
-
-#========== mklml ===========#
-train vgg 64 False
-train vgg 128 False
-train vgg 256 False
+for use_mkldnn in True False; do
+ for batchsize in 64 128 256; do
+ train vgg 19 $batchsize $use_mkldnn
+ train resnet 50 $batchsize $use_mkldnn
+ done
+done
diff --git a/benchmark/paddle/image/vgg.py b/benchmark/paddle/image/vgg.py
index b8429975f5c83df6996e71478fe276b246e8b77b..420884ed8e1ae36a3f1772bfbe8323f3d0ea71e6 100644
--- a/benchmark/paddle/image/vgg.py
+++ b/benchmark/paddle/image/vgg.py
@@ -13,7 +13,7 @@ define_py_data_sources2(
settings(
batch_size=batch_size,
- learning_rate=0.01 / batch_size,
+ learning_rate=0.001 / batch_size,
learning_method=MomentumOptimizer(0.9),
regularization=L2Regularization(0.0005 * batch_size))
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 9686df00219001769d074ee815d9cc8db0258496..5a06825beb73e85d8a55b7b578b187bee2c4340c 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -46,16 +46,20 @@ IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
MESSAGE(STATUS "Build MKLDNN with ${MKLDNN_MKLROOT}")
ENDIF()
+SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} -Wno-error=strict-overflow")
+SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} -Wno-error=strict-overflow")
ExternalProject_Add(
${MKLDNN_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${MKLDNN_DEPENDS}
GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git"
- GIT_TAG "v0.10"
+ GIT_TAG "v0.11"
PREFIX ${MKLDNN_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
CMAKE_ARGS -DMKLROOT=${MKLDNN_MKLROOT}
+ CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
+ CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
-DMKLROOT:PATH=${MKLDNN_MKLROOT}
)
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index 74f3279831357c21038df133df0f5a432a6dfd20..20dbc32a738d982df2d3f035206279c82c8de264 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -27,8 +27,8 @@ ENDIF()
INCLUDE(ExternalProject)
SET(MKLML_PROJECT "extern_mklml")
-SET(MKLML_VER "mklml_lnx_2018.0.20170720")
-SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.10/${MKLML_VER}.tgz")
+SET(MKLML_VER "mklml_lnx_2018.0.1.20171007")
+SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.11/${MKLML_VER}.tgz")
SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml")
SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
SET(MKLML_DST_DIR "mklml")
diff --git a/doc/api/v2/data.rst b/doc/api/v2/data.rst
index fef87c4fbdb452771ecdb361c6eeae5b32bcee14..b56c7332cc284649c7e04328e51a7faa78593a39 100644
--- a/doc/api/v2/data.rst
+++ b/doc/api/v2/data.rst
@@ -2,112 +2,9 @@
Data Reader Interface and DataSets
==================================
+.. toctree::
+ :maxdepth: 1
-DataTypes
-=========
-
-.. automodule:: paddle.v2.data_type
- :members:
- :noindex:
-
-DataFeeder
-==========
-
-.. automodule:: paddle.v2.data_feeder
- :members:
- :noindex:
-
-Reader
-======
-
-.. automodule:: paddle.v2.reader
- :members:
- :noindex:
-
-.. automodule:: paddle.v2.reader.creator
- :members:
- :noindex:
-
-minibatch
-=========
-
-.. automodule:: paddle.v2.minibatch
- :members:
- :noindex:
-
-Dataset
-=======
-
-.. automodule:: paddle.v2.dataset
- :members:
- :noindex:
-
-mnist
-+++++
-
-.. automodule:: paddle.v2.dataset.mnist
- :members:
- :noindex:
-
-cifar
-+++++
-
-.. automodule:: paddle.v2.dataset.cifar
- :members:
- :noindex:
-
-conll05
-+++++++
-
-.. automodule:: paddle.v2.dataset.conll05
- :members: get_dict,get_embedding,test
- :noindex:
-
-imdb
-++++
-
-.. automodule:: paddle.v2.dataset.imdb
- :members:
- :noindex:
-
-imikolov
-++++++++
-
-.. automodule:: paddle.v2.dataset.imikolov
- :members:
- :noindex:
-
-movielens
-+++++++++
-
-.. automodule:: paddle.v2.dataset.movielens
- :members:
- :noindex:
-
-.. autoclass:: paddle.v2.dataset.movielens.MovieInfo
- :noindex:
-
-.. autoclass:: paddle.v2.dataset.movielens.UserInfo
- :noindex:
-
-sentiment
-+++++++++
-
-.. automodule:: paddle.v2.dataset.sentiment
- :members:
- :noindex:
-
-uci_housing
-+++++++++++
-
-.. automodule:: paddle.v2.dataset.uci_housing
- :members:
- :noindex:
-
-wmt14
-+++++
-
-.. automodule:: paddle.v2.dataset.wmt14
- :members:
- :noindex:
-
+ data/data_reader.rst
+ data/image.rst
+ data/dataset.rst
diff --git a/doc/api/v2/data/data_reader.rst b/doc/api/v2/data/data_reader.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2ccfec9c284877a7576e9751526b169a4ac78d8e
--- /dev/null
+++ b/doc/api/v2/data/data_reader.rst
@@ -0,0 +1,36 @@
+=====================
+Data Reader Interface
+=====================
+
+
+DataTypes
+=========
+
+.. automodule:: paddle.v2.data_type
+ :members:
+ :noindex:
+
+DataFeeder
+==========
+
+.. automodule:: paddle.v2.data_feeder
+ :members:
+ :noindex:
+
+Reader
+======
+
+.. automodule:: paddle.v2.reader
+ :members:
+ :noindex:
+
+.. automodule:: paddle.v2.reader.creator
+ :members:
+ :noindex:
+
+minibatch
+=========
+
+.. automodule:: paddle.v2.minibatch
+ :members:
+ :noindex:
diff --git a/doc/api/v2/data/dataset.rst b/doc/api/v2/data/dataset.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6a8ecc5bb1d855e0ded3719943ab3adb810de365
--- /dev/null
+++ b/doc/api/v2/data/dataset.rst
@@ -0,0 +1,75 @@
+Dataset
+=======
+
+.. automodule:: paddle.v2.dataset
+ :members:
+ :noindex:
+
+mnist
++++++
+
+.. automodule:: paddle.v2.dataset.mnist
+ :members:
+ :noindex:
+
+cifar
++++++
+
+.. automodule:: paddle.v2.dataset.cifar
+ :members:
+ :noindex:
+
+conll05
++++++++
+
+.. automodule:: paddle.v2.dataset.conll05
+ :members: get_dict,get_embedding,test
+ :noindex:
+
+imdb
+++++
+
+.. automodule:: paddle.v2.dataset.imdb
+ :members:
+ :noindex:
+
+imikolov
+++++++++
+
+.. automodule:: paddle.v2.dataset.imikolov
+ :members:
+ :noindex:
+
+movielens
++++++++++
+
+.. automodule:: paddle.v2.dataset.movielens
+ :members:
+ :noindex:
+
+.. autoclass:: paddle.v2.dataset.movielens.MovieInfo
+ :noindex:
+
+.. autoclass:: paddle.v2.dataset.movielens.UserInfo
+ :noindex:
+
+sentiment
++++++++++
+
+.. automodule:: paddle.v2.dataset.sentiment
+ :members:
+ :noindex:
+
+uci_housing
++++++++++++
+
+.. automodule:: paddle.v2.dataset.uci_housing
+ :members:
+ :noindex:
+
+wmt14
++++++
+
+.. automodule:: paddle.v2.dataset.wmt14
+ :members:
+ :noindex:
diff --git a/doc/api/v2/data/image.rst b/doc/api/v2/data/image.rst
new file mode 100644
index 0000000000000000000000000000000000000000..97651ffa6be56cf3ecaca2caca38a353fa5c1f49
--- /dev/null
+++ b/doc/api/v2/data/image.rst
@@ -0,0 +1,5 @@
+Image Interface
+===============
+
+.. automodule:: paddle.v2.image
+ :members:
diff --git a/doc/design/float16.md b/doc/design/float16.md
index bc1c20c3d122e783e0cd189372dab08d35042d45..078801ba2ed969d26dd31d5ec4ed268686cf7016 100644
--- a/doc/design/float16.md
+++ b/doc/design/float16.md
@@ -55,6 +55,6 @@ After float16 class is available, some of the future items are below:
- Update pybind/tensor_py.h to bind c++ float16 with numpy float16.
-- Modify `IndicateDataType()` method in `framework/operator.h` to make it compatible with float16.
+- Modify `GetKernelType()` method in `framework/operator.h` to make it compatible with float16.
- Create a type-casting operator that can convert the data type in tensor between float16 and other types.
diff --git a/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg b/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..8b0d90f7b9d8184b314b0ee4e521f53eb5f1b455
Binary files /dev/null and b/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg differ
diff --git a/doc/design/ops/sequence_decoder.md b/doc/design/ops/sequence_decoder.md
new file mode 100644
index 0000000000000000000000000000000000000000..9007aae7a8355ed06c6720a921351f81b859c1fe
--- /dev/null
+++ b/doc/design/ops/sequence_decoder.md
@@ -0,0 +1,245 @@
+# Design: Sequence Decoder Generating LoDTensors
+In tasks such as machine translation and image to text,
+a [sequence decoder](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md) is necessary to generate sequences.
+
+This documentation describes how to implement the sequence decoder as an operator.
+
+## Beam Search based Decoder
+The [beam search algorithm](https://en.wikipedia.org/wiki/Beam_search) is necessary when generating sequences,
+it is a heuristic search algorithm that explores the paths by expanding the most promising node in a limited set.
+
+In the old version of PaddlePaddle, a C++ class `RecurrentGradientMachine` implements the general sequence decoder based on beam search.
+Due to its complexity, the implementation relies on a lot of special data structures,
+which are quite fiddly and hard for users to customize.
+
+There are a lot of heuristic tricks in the sequence generation tasks,
+so the flexibility of sequence decoder is very important to users.
+
+During PaddlePaddle's refactoring work,
+some new concepts have been proposed, such as [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md), that can better support sequence usage,
+and they can help to make the implementation of a beam search based sequence decoder **more transparent and modular**.
+
+For example, the RNN states, candidate IDs and probabilities of beam search can be represented as `LoDTensors`;
+the selected candidate's IDs in each time step can be stored in a `TensorArray`, and `Packed` to the sentences translated.
+
+## Changing LoD's absolute offset to relative offsets
+The current `LoDTensor` is designed to store levels of variable-length sequences,
+it stores several arrays of integers each represents a level.
+
+The integers in each level represent the begin and end (not inclusive) offset of a sequence **in the underlying tensor**,
+let's call this format the **absolute-offset LoD** for clarity.
+
+The absolute-offset LoD can quickly retrieve any sequence but fails to represent empty sequences, for example, a two-level LoD is as follows
+```python
+[[0, 3, 9]
+ [0, 2, 3, 3, 3, 9]]
+```
+The first level tells that there are two sequences:
+- the first's offset is `[0, 3)`
+- the second's offset is `[3, 9)`
+
+while on the second level, there are several empty sequences that both begin and end at `3`.
+It is impossible to tell how many empty second-level sequences exist in the first-level sequences.
+
+There are many scenarios that rely on empty sequence representation,
+such as machine translation or image to text, one instance has no translations or the empty candidate set for a prefix.
+
+So let's introduce another format of LoD,
+it stores **the offsets of the lower level sequences** and is called **relative-offset** LoD.
+
+For example, to represent the same sequences of the above data
+
+```python
+[[0, 3, 5]
+ [0, 2, 3, 3, 3, 9]]
+```
+
+the first level represents that there are two sequences,
+their offsets in the second-level LoD is `[0, 3)` and `[3, 5)`.
+
+The second level is the same as in the absolute-offset example because the lower level is a tensor.
+It is easy to find out the second sequence in the first-level LoD has two empty sequences.
+
+The following demos are based on relative-offset LoD.
+
+## Usage in a simple machine translation model
+Let's start from a simple machine translation model that is simplified from [machine translation chapter](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation) to draw a simple blueprint of what a sequence decoder can do and how to use it.
+
+The model has an encoder that learns the semantic vector from a sequence,
+and a decoder which uses the sequence decoder to generate new sentences.
+
+**Encoder**
+```python
+import paddle as pd
+
+dict_size = 8000
+source_dict_size = dict_size
+target_dict_size = dict_size
+word_vector_dim = 128
+encoder_dim = 128
+decoder_dim = 128
+beam_size = 5
+max_length = 120
+
+# encoder
+src_word_id = pd.data(
+ name='source_language_word',
+ type=pd.data.integer_value_sequence(source_dict_size))
+src_embedding = pd.embedding(size=source_dict_size, size=word_vector_dim)
+
+src_word_vec = pd.lookup(src_embedding, src_word_id)
+
+encoder_out_seq = pd.gru(input=src_word_vec, size=encoder_dim)
+
+encoder_ctx = pd.last_seq(encoder_out_seq)
+# encoder_ctx_proj is the learned semantic vector
+encoder_ctx_proj = pd.fc(
+ encoder_ctx, size=decoder_dim, act=pd.activation.Tanh(), bias=None)
+```
+
+**Decoder**
+
+```python
+def generate():
+ decoder = pd.while_loop()
+ with decoder.step():
+ decoder_mem = decoder.memory(init=encoder_ctx) # mark the memory
+ generated_ids = decoder.memory() # TODO init to batch_size s
+ generated_scores = decoder.memory() # TODO init to batch_size 1s or 0s
+
+ target_word = pd.lookup(trg_embedding, generated_ids)
+ # expand encoder_ctx's batch to fit target_word's lod
+ # for example
+ # decoder_mem.lod is
+ # [[0 1 3],
+ # [0 1 3 6]]
+ # its tensor content is [a1 a2 a3 a4 a5]
+ # which means there are 2 sentences to translate
+ # - the first sentence has 1 translation prefixes, the offsets are [0, 1)
+ # - the second sentence has 2 translation prefixes, the offsets are [1, 3) and [3, 6)
+ # the target_word.lod is
+ # [[0, 1, 6]
+ # [0, 2, 4, 7, 9 12]]
+ # which means 2 sentences to translate, each has 1 and 5 prefixes
+ # the first prefix has 2 candidates
+ # the following has 2, 3, 2, 3 candidates
+ # the encoder_ctx_expanded's content will be
+ # [a1 a1 a2 a2 a3 a3 a3 a4 a4 a5 a5 a5]
+ encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word)
+ decoder_input = pd.fc(
+ act=pd.activation.Linear(),
+ input=[target_word, encoder_ctx],
+ size=3 * decoder_dim)
+ gru_out, cur_mem = pd.gru_step(
+ decoder_input, mem=decoder_mem, size=decoder_dim)
+ scores = pd.fc(
+ gru_out,
+ size=trg_dic_size,
+ bias=None,
+ act=pd.activation.Softmax())
+ # K is an config
+ topk_scores, topk_ids = pd.top_k(scores, K)
+ topk_generated_scores = pd.add_scalar(topk_scores, generated_scores)
+
+ selected_ids, selected_generation_scores = decoder.beam_search(
+ topk_ids, topk_generated_scores)
+
+ # update the states
+ decoder_mem.update(cur_mem) # tells how to update state
+ generated_ids.update(selected_ids)
+ generated_scores.update(selected_generation_scores)
+
+ decoder.output(selected_ids)
+ decoder.output(selected_generation_scores)
+
+translation_ids, translation_scores = decoder()
+```
+The `decoder.beam_search` is an operator that, given the candidates and the scores of translations including the candidates,
+returns the result of the beam search algorithm.
+
+In this way, users can customize anything on the inputs or outputs of beam search, for example, three ways to prune some translation prefixes
+
+1. make the corresponding elements in `topk_generated_scores` zero or some small values, beam_search will discard this candidate.
+2. remove some specific candidate in `selected_ids`
+3. get the final `translation_ids`, remove the translation sequence in it.
+
+The implementation of sequence decoder can reuse the C++ class [RNNAlgorithm](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/paddle/operators/dynamic_recurrent_op.h#L30),
+so the python syntax is quite similar to a [RNN](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/doc/design/block.md#blocks-with-for-and-rnnop).
+
+Both of them are two-level `LoDTensors`
+
+- the first level represents `batch_size` of (source) sentences;
+- the second level represents the candidate ID sets for translation prefix.
+
+for example, 3 source sentences to translate, and has 2, 3, 1 candidates.
+
+Unlike an RNN, in sequence decoder, the previous state and the current state have different LoD and shape,
+a `lod_expand` operator is used to expand the LoD of the previous state to fit the current state.
+
+For example, the previous state
+
+* LoD is `[0, 1, 3][0, 2, 5, 6]`
+* content of tensor is `a1 a2 b1 b2 b3 c1`
+
+the current state stored in `encoder_ctx_expanded`
+
+* LoD is `[0, 2, 7][0 3 5 8 9 11 11]`
+* the content is
+ - a1 a1 a1 (a1 has 3 candidates, so the state should be copied 3 times for each candidates)
+ - a2 a2
+ - b1 b1 b1
+ - b2
+ - b3 b3
+ - None (c1 has 0 candidates, so c1 is dropped)
+
+Thanks to the relative-offset LoD, an empty candidate set can be represented naturally.
+
+the status in each time step can be stored in `TensorArray`, and `Pack`ed to a final LoDTensor, the corresponding syntax is
+
+```python
+decoder.output(selected_ids)
+decoder.output(selected_generation_scores)
+```
+
+the `selected_ids` is the candidate ids for the prefixes,
+it will be `Packed` by `TensorArray` to a two-level `LoDTensor`,
+the first level represents the source sequences,
+the second level represents generated sequences.
+
+Pack the `selected_scores` will get a `LoDTensor` that stores scores of each candidate of translations.
+
+Pack the `selected_generation_scores` will get a `LoDTensor`, and each tail is the probability of the translation.
+
+## LoD and shape changes during decoding
+
+
+![LoD and shape changes during decoding](./images/LOD-and-shape-changes-during-decoding.jpg)
+
+According to the image above, the only phase that changes the LoD is beam search.
+
+## Beam search design
+The beam search algorithm will be implemented as one method of the sequence decoder, it has 3 inputs
+
+1. `topk_ids`, top K candidate ids for each prefix.
+2. `topk_scores`, the corresponding scores for `topk_ids`
+3. `generated_scores`, the score of the prefixes.
+
+All of them are LoDTensors, so that the sequence affiliation is clear.
+Beam search will keep a beam for each prefix and select a smaller candidate set for each prefix.
+
+It will return three variables
+
+1. `selected_ids`, the final candidate beam search function selected for the next step.
+2. `selected_scores`, the scores for the candidates.
+3. `generated_scores`, the updated scores for each prefix (with the new candidates appended).
+
+## Introducing the LoD-based `Pack` and `Unpack` methods in `TensorArray`
+The `selected_ids`, `selected_scores` and `generated_scores` are LoDTensors,
+and they exist in each time step,
+so it is natural to store them in arrays.
+
+Currently, PaddlePaddle has a module called `TensorArray` which can store an array of tensors,
+the results of beam search are better to store in a `TensorArray`.
+
+The `Pack` and `UnPack` in `TensorArray` are used to package tensors in the array to a `LoDTensor` or split the `LoDTensor` to an array of tensors.
+It needs some extensions to support pack or unpack an array of `LoDTensors`.
diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc
index 4e8d630c2634682ff63b38182108eadebb5c7ff9..d485cdf6109274377ad0057223bdd8401e964aa7 100644
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -21,7 +21,7 @@
#include "paddle/framework/var_desc.h"
#include "paddle/operators/net_op.h"
-USE_OP(fill_constant);
+USE_NO_KERNEL_OP(fill_constant);
namespace paddle {
namespace framework {
diff --git a/paddle/framework/data_type.h b/paddle/framework/data_type.h
index c5ae7b185460c8b0d68ba38bb9db9bd3d3fb14ea..3ec88d7a72c3339bf5e7d0ca3957a3f608f039b7 100644
--- a/paddle/framework/data_type.h
+++ b/paddle/framework/data_type.h
@@ -34,6 +34,21 @@ inline DataType ToDataType(std::type_index type) {
}
}
+// Maps a DataType enum value to the std::type_index of a C++ type, i.e. the
+// opposite direction from ToDataType(std::type_index) above. Only FP32, FP64,
+// INT32 and INT64 are handled; any other value goes through PADDLE_THROW.
+inline std::type_index ToTypeIndex(DataType type) {
+ switch (type) {
+ case DataType::FP32:
+ return typeid(float);
+ case DataType::FP64:
+ return typeid(double);
+ case DataType::INT32:
+ return typeid(int);
+ case DataType::INT64:
+ return typeid(int64_t);
+ default:
+ PADDLE_THROW("Not support type %d", type);
+ }
+}
+
template
inline void VisitDataType(DataType type, Visitor visitor) {
switch (type) {
diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc
index 239ae5e1233c7f5c506930df374b5d0cc8de7c8d..53b899a23997b71e723a298ec360a4e018d89878 100644
--- a/paddle/framework/ddim.cc
+++ b/paddle/framework/ddim.cc
@@ -79,6 +79,13 @@ DDim make_ddim(const std::vector& dims) {
return result;
}
+// Overload of make_ddim: copies `dims` element by element into a second
+// vector through static_cast and forwards that vector to the make_ddim
+// overload defined above.
+// NOTE(review): the element types are not visible in this view (template
+// arguments appear stripped); presumably int -> int64_t -- confirm.
+DDim make_ddim(const std::vector& dims) {
+ std::vector res(dims.size());
+ std::transform(dims.begin(), dims.end(), res.begin(),
+ [](int d) { return static_cast(d); });
+ return make_ddim(res);
+}
+
/// @cond HIDDEN
// XXX For some reason, putting this in an anonymous namespace causes errors
class DynamicMutableIndexer : public boost::static_visitor {
@@ -117,7 +124,7 @@ int64_t DDim::operator[](int idx) const {
return boost::apply_visitor(DynamicConstIndexer(idx), var);
}
-int64_t DDim::size() const { return arity(*this); }
+int DDim::size() const { return arity(*this); }
bool DDim::operator==(DDim d) const {
if (var.which() != d.getVar().which()) {
diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h
index 2a5e2d2b6948b045642dbac5e83992a048ecb63d..4ca5e49566b7ec006eba80f3f9808bacb1ff2615 100644
--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
@@ -71,7 +71,7 @@ struct DDim {
DDim operator*(DDim d) const;
- int64_t size() const;
+ int size() const;
};
/**
@@ -81,6 +81,8 @@ struct DDim {
*/
DDim make_ddim(const std::vector& dims);
+DDim make_ddim(const std::vector& dims);
+
/**
* \brief Make a DDim from an initializer list
*
diff --git a/paddle/framework/lod_rank_table.cc b/paddle/framework/lod_rank_table.cc
index 68a83def7e5a12fe3261be9e27cbb9bb54e1f8ad..1c2fba70c8ab0827ba6d1563f08cd0820650822e 100644
--- a/paddle/framework/lod_rank_table.cc
+++ b/paddle/framework/lod_rank_table.cc
@@ -31,6 +31,7 @@ void LoDRankTable::Reset(const LoD& lod, size_t level) {
TableItem item;
item.index = i;
item.length = vec[i + 1] - vec[i];
+ VLOG(10) << "Add item to rank table " << item.index << " " << item.length;
items_.emplace_back(item);
}
// NOTE(yuyang18):
diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc
index 2bcfffb134f46416301b28043e1875e822fbc3e4..a0f2906c749054c1ff9f624e47df432ec2bd6ac8 100644
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -27,6 +27,20 @@
namespace paddle {
namespace framework {
+// Writes a textual form of `lod` to `os` and returns `os`. Each level is
+// written as a brace group in which every entry is followed by a comma, and
+// the whole LoD is wrapped in one more pair of braces, e.g. {{0,2,4,}{0,1,}}.
+std::ostream& operator<<(std::ostream& os, const LoD& lod) {
+ os << "{";
+ for (auto& v : lod) {
+ os << "{";
+ for (auto& i : v) {
+ os << i << ",";
+ }
+ os << "}";
+ }
+ os << "}";
+
+ return os;
+}
+
LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) {
LoD new_lod;
new_lod.reserve(level_end - level_begin);
@@ -136,37 +150,35 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin,
ShareDataWith(Slice(begin, end));
}
-void GetFineGrainedLoDLength(const LoD& lod, size_t start_idx, size_t end_idx,
- std::vector>* lod_length,
- size_t* start_offset) {
- lod_length->clear();
- PADDLE_ENFORCE(start_idx < lod.size() - 1,
- "start_idx should be >= 0 and < lod.size() - 1.");
- PADDLE_ENFORCE(end_idx < lod.size(),
- "end_idx should be >= 0 and < lod.size().");
- PADDLE_ENFORCE_LE(start_idx, end_idx,
- "start_idx should be less than end_idx.");
- for (size_t level_idx = 0; level_idx < lod.size(); ++level_idx) {
+// Walks the LoD levels from `start_level` to the last one. At each level it
+// records the lengths (not the offsets) of items [start_idx, end_idx), then
+// maps start_idx and end_idx through that level's offsets to obtain the item
+// range for the next level. Returns the collected per-level lengths together
+// with the (start_idx, end_idx) pair reached after the last level.
+// Enforces start_idx <= end_idx and end_idx < level size at every level.
+using LoDAndOffset = std::pair>;
+LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD& lod, size_t start_idx,
+ size_t end_idx, size_t start_level) {
+ LoD sub_lod;
+
+ for (size_t level_idx = start_level; level_idx < lod.size(); ++level_idx) {
+ PADDLE_ENFORCE_LE(start_idx, end_idx);
+ PADDLE_ENFORCE_LT(end_idx, lod[level_idx].size());
std::vector level_lens;
for (size_t i = start_idx; i < end_idx; ++i) {
level_lens.push_back(lod[level_idx][i + 1] - lod[level_idx][i]);
}
- lod_length->emplace_back(level_lens);
+ sub_lod.emplace_back(level_lens);
start_idx = lod[level_idx][start_idx];
end_idx = lod[level_idx][end_idx];
}
- *start_offset = start_idx;
+
+ return LoDAndOffset{sub_lod, {start_idx, end_idx}};
}
-void AppendLoD(LoD* lod, const std::vector>& lod_length) {
- PADDLE_ENFORCE_EQ(
- lod->size(), lod_length.size(),
+void AppendLoD(LoD* lod, const LoD& lod_length) {
+ PADDLE_ENFORCE(
+ lod->empty() || lod->size() == lod_length.size(),
"The lod_length should has the same size with the appended lod.");
+ if (lod->empty()) {
+ *lod = LoD(lod_length.size(), std::vector({0}));
+ }
for (size_t i = 0; i < lod->size(); ++i) {
auto& level = (*lod)[i];
- if (level.empty()) {
- level.push_back(0);
- }
for (size_t len : lod_length[i]) {
level.push_back(level.back() + len);
}
diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h
index 1437da399a28288429527f9672ace0482825159f..7f8a51cc581e759bc707e506ac7cdeb3680f40ac 100644
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -56,6 +56,8 @@ using Vector = thrust::host_vector<
*/
using LoD = std::vector>;
+std::ostream& operator<<(std::ostream& os, const LoD& lod);
+
/*
* Slice levels from a LoD.
* NOTE the lowest level should always be the absolute offsets of the underlying
@@ -181,11 +183,10 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level,
return tensor;
}
-void GetFineGrainedLoDLength(const LoD& lod, size_t start_idx, size_t end_idx,
- std::vector>* lod_length,
- size_t* start_offset);
+std::pair> GetSubLoDAndAbsoluteOffset(
+ const LoD& lod, size_t start_idx, size_t end_idx, size_t start_level);
-void AppendLoD(LoD* lod, const std::vector>& lod_length);
+void AppendLoD(LoD* lod, const LoD& lod_length);
} // namespace framework
} // namespace paddle
diff --git a/paddle/framework/lod_tensor_test.cc b/paddle/framework/lod_tensor_test.cc
index bf61c9ee7aa99b06e78b9b27dff72d216c74a86e..02d84b68233f2fdfc66e1df2fc7ce20307cadd94 100644
--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
@@ -146,43 +146,44 @@ TEST(LodExpand, test) {
TEST(LoD, GetFineGrainedLoDLength) {
LoD lod;
- lod.push_back(std::vector{0, 2, 4, 5});
- lod.push_back(std::vector{0, 1, 6, 8, 10, 11});
+ lod.push_back(std::vector({0, 2, 4, 5}));
+ lod.push_back(std::vector({0, 1, 6, 8, 10, 11}));
lod.push_back(
- std::vector{0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26, 29});
+ std::vector({0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26, 29}));
- std::vector> lod_length;
- size_t start_offset;
- paddle::framework::GetFineGrainedLoDLength(lod, 1, 2, &lod_length,
- &start_offset);
+ auto lod_and_offset =
+ paddle::framework::GetSubLoDAndAbsoluteOffset(lod, 1, 2, 0);
+ LoD lod_length = lod_and_offset.first;
+ size_t start_offset = lod_and_offset.second.first;
+ size_t end_offset = lod_and_offset.second.second;
- std::vector> expected;
+ LoD expected;
expected.push_back(std::vector{2});
expected.push_back(std::vector{2, 2});
expected.push_back(std::vector{2, 3, 4, 2});
EXPECT_EQ(lod_length, expected);
EXPECT_EQ(start_offset, 15UL);
+ EXPECT_EQ(end_offset, 26UL);
}
TEST(LoD, AppendLoD) {
- std::vector> lod_lens;
- lod_lens.push_back(std::vector{2});
- lod_lens.push_back(std::vector{2, 2});
- lod_lens.push_back(std::vector{2, 3, 4, 2});
+ LoD lod_lens;
+ lod_lens.push_back(std::vector({2}));
+ lod_lens.push_back(std::vector({2, 2}));
+ lod_lens.push_back(std::vector({2, 3, 4, 2}));
LoD origin;
- origin.push_back(std::vector{0, 2});
- origin.push_back(std::vector{0, 1, 6});
- origin.push_back(std::vector{0, 2, 5, 7, 10, 12, 15});
+ origin.push_back(std::vector({0, 2}));
+ origin.push_back(std::vector({0, 1, 6}));
+ origin.push_back(std::vector({0, 2, 5, 7, 10, 12, 15}));
paddle::framework::AppendLoD(&origin, lod_lens);
LoD expected;
- expected.push_back(std::vector{0, 2, 4});
- expected.push_back(std::vector{0, 1, 6, 8, 10});
+ expected.push_back(std::vector({0, 2, 4}));
+ expected.push_back(std::vector({0, 1, 6, 8, 10}));
expected.push_back(
- std::vector{0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26});
-
+ std::vector({0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26}));
EXPECT_EQ(origin, expected);
}
diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index 2bb5e0e8ec29fb2df81549650aa0c65bc1e51c49..daade439e5232f06be72bc5bb1e2285124f2c3a4 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -92,8 +92,7 @@ struct OpKernelRegistrarFunctor {
void operator()(const char* op_type) const {
using T = typename KERNEL_TYPE::ELEMENT_TYPE;
- OperatorWithKernel::OpKernelKey key(ToDataType(std::type_index(typeid(T))),
- PlaceType());
+ OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType());
OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE);
constexpr auto size = std::tuple_size>::value;
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index 8150bf923926ed871ee69f2dd8c588451d68af51..3276f8af396fe58450a8dc6713fe61e49d5ca708 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -254,8 +254,7 @@ std::vector ExecutionContext::MultiOutput(
return res;
}
-std::ostream& operator<<(std::ostream& os,
- const OperatorWithKernel::OpKernelKey& kernel_key) {
+std::ostream& operator<<(std::ostream& os, const OpKernelType& kernel_key) {
os << "place[" << kernel_key.place_ << "]:data_type[" << kernel_key.data_type_
<< "]";
return os;
@@ -432,7 +431,7 @@ void OperatorWithKernel::Run(const Scope& scope,
// check if op[type] have kernel for kernel_key
OpKernelMap& kernels = kernels_iter->second;
- auto kernel_key = OpKernelKey(IndicateDataType(ctx), dev_ctx);
+ auto kernel_key = GetKernelType(ctx);
auto kernel_iter = kernels.find(kernel_key);
if (kernel_iter == kernels.end()) {
@@ -444,6 +443,38 @@ void OperatorWithKernel::Run(const Scope& scope,
// throws errors if have.
dev_ctx.Finish();
}
+OpKernelType OperatorWithKernel::GetKernelType(
+ const ExecutionContext& ctx) const {
+ return OpKernelType(IndicateDataType(ctx), ctx.device_context());
+}
+DataType OperatorWithKernel::IndicateDataType(
+ const ExecutionContext& ctx) const {
+ auto& scope = ctx.scope();
+ int data_type = -1;
+ for (auto& input : this->inputs_) {
+ for (auto& ipt_name : input.second) {
+ auto* var = scope.FindVar(ipt_name);
+ if (var != nullptr) {
+ const Tensor* t = nullptr;
+ if (var->IsType()) {
+ t = &var->Get();
+ } else if (var->IsType()) {
+ t = &var->Get();
+ } else if (var->IsType()) {
+ t = &(var->Get().value());
+ }
+ if (t != nullptr) {
+ int tmp = static_cast(ToDataType(t->type()));
+ PADDLE_ENFORCE(tmp == data_type || data_type == -1,
+ "DataType of Paddle Op %s must be the same.", Type());
+ data_type = tmp;
+ }
+ }
+ }
+ }
+ PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input");
+ return static_cast(data_type);
+}
} // namespace framework
} // namespace paddle
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index a1303a90980b40ff03bce1ab1a6f67bbbf952bcf..60861d92933dd100f877bec8d43f9b924f951e60 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -345,27 +345,10 @@ class OpKernel : public OpKernelBase {
using ELEMENT_TYPE = T;
};
-class OperatorWithKernel : public OperatorBase {
- public:
- struct OpKernelKey {
- platform::Place place_;
- DataType data_type_;
-
- OpKernelKey(DataType data_type, platform::Place place)
- : place_(place), data_type_(data_type) {}
-
- OpKernelKey(DataType data_type, const platform::DeviceContext& dev_ctx)
- : place_(dev_ctx.GetPlace()), data_type_(data_type) {}
-
- bool operator==(const OpKernelKey& o) const {
- return platform::places_are_same_class(place_, o.place_) &&
- data_type_ == o.data_type_;
- }
- };
-
- struct OpKernelHash {
+struct OpKernelType {
+ struct Hash {
std::hash hash_;
- size_t operator()(const OpKernelKey& key) const {
+ size_t operator()(const OpKernelType& key) const {
int place = key.place_.which();
int data_type = static_cast(key.data_type_);
int pre_hash = data_type << NUM_PLACE_TYPE_LIMIT_IN_BIT |
@@ -374,9 +357,26 @@ class OperatorWithKernel : public OperatorBase {
}
};
+ platform::Place place_;
+ DataType data_type_;
+
+ OpKernelType(DataType data_type, platform::Place place)
+ : place_(place), data_type_(data_type) {}
+
+ OpKernelType(DataType data_type, const platform::DeviceContext& dev_ctx)
+ : place_(dev_ctx.GetPlace()), data_type_(data_type) {}
+
+ bool operator==(const OpKernelType& o) const {
+ return platform::places_are_same_class(place_, o.place_) &&
+ data_type_ == o.data_type_;
+ }
+};
+
+class OperatorWithKernel : public OperatorBase {
+ public:
using OpKernelMap =
- std::unordered_map,
- OpKernelHash>;
+ std::unordered_map,
+ OpKernelType::Hash>;
OperatorWithKernel(const std::string& type, const VariableNameMap& inputs,
const VariableNameMap& outputs, const AttributeMap& attrs)
@@ -404,40 +404,15 @@ class OperatorWithKernel : public OperatorBase {
}
protected:
+ virtual OpKernelType GetKernelType(const ExecutionContext& ctx) const;
+
+ private:
// indicate kernel DataType by input data. Defaultly all input data must be
// same.
- virtual DataType IndicateDataType(const ExecutionContext& ctx) const {
- auto& scope = ctx.scope();
- int data_type = -1;
- for (auto& input : this->inputs_) {
- for (auto& ipt_name : input.second) {
- auto* var = scope.FindVar(ipt_name);
- if (var != nullptr) {
- const Tensor* t = nullptr;
- if (var->IsType()) {
- t = &var->Get();
- } else if (var->IsType()) {
- t = &var->Get();
- } else if (var->IsType()) {
- t = &(var->Get().value());
- }
- if (t != nullptr) {
- int tmp = static_cast(ToDataType(t->type()));
- PADDLE_ENFORCE(tmp == data_type || data_type == -1,
- "DataType of Paddle Op %s must be the same.",
- Type());
- data_type = tmp;
- }
- }
- }
- }
- PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input");
- return static_cast(data_type);
- }
+ DataType IndicateDataType(const ExecutionContext& ctx) const;
};
-std::ostream& operator<<(std::ostream& os,
- const OperatorWithKernel::OpKernelKey& kernel_key);
+std::ostream& operator<<(std::ostream& os, const OpKernelType& kernel_key);
extern bool OpSupportGPU(const std::string& op_type);
diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc
index 42e0d52eed3911d8e684e76a88bc690ca0783ce5..1e19f82b341768142258ba4a5dfa246d87ba4c43 100644
--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -114,8 +114,8 @@ class OpWithKernelTest : public OperatorWithKernel {
protected:
void InferShape(framework::InferShapeContext* ctx) const override {}
- DataType IndicateDataType(const ExecutionContext& ctx) const override {
- return DataType::FP32;
+ OpKernelType GetKernelType(const ExecutionContext& ctx) const override {
+ return OpKernelType(DataType::FP32, ctx.device_context());
}
};
diff --git a/paddle/framework/var_desc.cc b/paddle/framework/var_desc.cc
index 16aca192d41a64003de85ce45f7697bf45c556ed..0babec29f6f4412ed29deeafe24470e86b30a636 100644
--- a/paddle/framework/var_desc.cc
+++ b/paddle/framework/var_desc.cc
@@ -45,7 +45,8 @@ void VarDescBind::SetLoDLevel(int32_t lod_level) {
desc_.mutable_tensor_array()->set_lod_level(lod_level);
break;
default:
- PADDLE_THROW("Tensor type=%d does not support LoDLevel", desc_.type());
+ PADDLE_THROW("Tensor type=%d does not support LoDLevel",
+ desc_.tensor_array().lod_level());
}
}
@@ -56,7 +57,8 @@ int32_t VarDescBind::GetLodLevel() const {
case VarDesc::LOD_TENSOR_ARRAY:
return desc_.tensor_array().lod_level();
default:
- PADDLE_THROW("Tensor type=%d does not support LoDLevel", desc_.type());
+ PADDLE_THROW("Tensor type=%d does not support LoDLevel",
+ desc_.tensor_array().lod_level());
}
}
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 4fd72d64a90ae6f16dd1499ceb7fba6e40fe4cea..9b2779b42cad324253dadf27dbff20fd8e8c8e16 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -45,6 +45,7 @@ if(WITH_GPU)
add_simple_unittest(BlockExpandOpTest)
add_simple_unittest(CropOpTest)
add_simple_unittest(SwitchOpTest)
+ add_simple_unittest(ScaleSubRegionOpTest)
endif()
add_simple_unittest(Im2ColTest)
diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h
index ba446bf92da264fafa1fb47a2c30da9cb13176ce..370940532ef40335be54a3e6467de0409e923ec4 100644
--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
@@ -110,6 +110,7 @@ public:
function2_(FunctionBase::funcRegistrar_.createByType(name2)) {
function1_->init(config);
function2_->init(config);
+ initArgsCallback_ = nullptr;
}
~Compare2Function() {}
@@ -170,6 +171,10 @@ public:
*seq2_));
}
+ void registerInitCallback(std::function callback) {
+ initArgsCallback_ = callback;
+ }
+
// output need only contains shape, do not contains data.
void addOutputs(const BufferArg& output, ArgType argType = ASSIGN_TO) {
size_t size =
@@ -340,6 +345,10 @@ protected:
initArg(*func1Inputs_[i]);
}
+ if (initArgsCallback_ != nullptr) {
+ initArgsCallback_(*func1Inputs_[i], i);
+ }
+
copyArg_(*func1Inputs_[i], *func2Inputs_[i]);
}
}
@@ -386,6 +395,7 @@ protected:
std::shared_ptr seq1_;
std::shared_ptr seq2_;
test::CopyArgument copyArg_;
+ std::function initArgsCallback_;
};
class CpuGpuFuncCompare
diff --git a/paddle/function/ScaleSubRegionOp.cpp b/paddle/function/ScaleSubRegionOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a080505d7df83a6c0a9d88fbcb7863fc0e1f7b21
--- /dev/null
+++ b/paddle/function/ScaleSubRegionOp.cpp
@@ -0,0 +1,155 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ScaleSubRegionOp.h"
+#include "paddle/function/TensorShape.h"
+
+namespace paddle {
+
+template <>
+void ScaleSubRegion(real* outputs,
+ const real* inputs,
+ const real* indices,
+ const TensorShape shape,
+ const FuncConfig& conf) {
+ real value = conf.get("value");
+
+ int number = shape[0];
+ int channel = shape[1];
+ int height = shape[2];
+ int width = shape[3];
+
+ memcpy(outputs, inputs, number * channel * height * width * sizeof(real));
+
+ for (int n = 0; n < number; ++n) {
+ // indices start from 1
+ int offset = n * 6;
+ for (int c = indices[offset] - 1; c < indices[offset + 1]; ++c) {
+ for (int h = indices[offset + 2] - 1; h < indices[offset + 3]; ++h) {
+ for (int w = indices[offset + 4] - 1; w < indices[offset + 5]; ++w) {
+ int idx = ((n * channel + c) * height + h) * width + w;
+ outputs[idx] *= value;
+ }
+ }
+ }
+ }
+}
+
+template <>
+void ScaleSubRegionGrad(const real* inGrad,
+ real* outGrad,
+ const real* indices,
+ const TensorShape shape,
+ const FuncConfig& conf) {
+ real value = conf.get("value");
+
+ int number = shape[0];
+ int channel = shape[1];
+ int height = shape[2];
+ int width = shape[3];
+
+ for (int n = 0; n < number; ++n) {
+ for (int c = 0; c < channel; ++c) {
+ for (int h = 0; h < height; ++h) {
+ for (int w = 0; w < width; ++w) {
+ int idx = ((n * channel + c) * height + h) * width + w;
+ int offset = n * 6;
+ if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
+ h >= (indices[offset + 2] - 1) &&
+ h <= (indices[offset + 3] - 1) &&
+ w >= (indices[offset + 4] - 1) &&
+ w <= (indices[offset + 5] - 1)) {
+ outGrad[idx] += inGrad[idx] * value;
+ } else {
+ outGrad[idx] += inGrad[idx];
+ }
+ }
+ }
+ }
+ }
+}
+
+/**
+ * \brief For each instance, ScaleSubRegion multiplies a contiguous sub-region
+ * of the input by a value. The region's location and shape are given by
+ * 1-based, inclusive start and end indices for C, H and W.
+ *
+ * Argument in this Function:
+ * \param inputs A 4-D tensor with shape [N, C, H, W], only one input.
+ * \param indices A 2-D tensor with shape [N, 6], indicates the sub region.
+ * \param outputs A 4-D tensor with same shape as inputs, output value.
+ */
+template
+class ScaleSubRegionFunc : public FunctionBase {
+public:
+ void init(const FuncConfig& config) override { conf_ = config; }
+
+ void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+ CHECK_EQ(2UL, inputs.size());
+ CHECK_EQ(1UL, outputs.size());
+ CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+
+ TensorShape shape = inputs[0].shape();
+
+ ScaleSubRegion(outputs[0].data(),
+ inputs[0].data(),
+ inputs[1].data(),
+ shape,
+ conf_);
+ }
+
+private:
+ FuncConfig conf_;
+};
+
+/**
+ * \brief The backward propagation of ScaleSubRegion Function.
+ *
+ * Argument in this Function:
+ * \param inputs A 4-D tensor with shape [N, C, H, W], output gradient.
+ * \param indices A 2-D tensor with shape [N, 6], indicates the sub region.
+ * \param outputs A 4-D tensor with shape [N, C, H, W], gradient of input value.
+ */
+
+template
+class ScaleSubRegionGradFunc : public FunctionBase {
+public:
+ void init(const FuncConfig& config) override { conf_ = config; }
+
+ void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+ CHECK_EQ(2UL, inputs.size());
+ CHECK_EQ(1UL, outputs.size());
+ CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+
+ TensorShape shape = inputs[0].shape();
+
+ ScaleSubRegionGrad(inputs[0].data(),
+ outputs[0].data(),
+ inputs[1].data(),
+ shape,
+ conf_);
+ }
+
+private:
+ FuncConfig conf_;
+};
+
+REGISTER_TYPED_FUNC(ScaleSubRegion, CPU, ScaleSubRegionFunc);
+REGISTER_TYPED_FUNC(ScaleSubRegionGrad, CPU, ScaleSubRegionGradFunc);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_TYPED_FUNC(ScaleSubRegion, GPU, ScaleSubRegionFunc);
+REGISTER_TYPED_FUNC(ScaleSubRegionGrad, GPU, ScaleSubRegionGradFunc);
+#endif
+
+} // namespace paddle
diff --git a/paddle/function/ScaleSubRegionOp.h b/paddle/function/ScaleSubRegionOp.h
new file mode 100644
index 0000000000000000000000000000000000000000..0480c8577f3fbf3bc9e94b635df96a31b103e9e3
--- /dev/null
+++ b/paddle/function/ScaleSubRegionOp.h
@@ -0,0 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+
+namespace paddle {
+
+/**
+ * \brief Function to multiply the values in a specified contiguous sub-region
+ * by a value. Indices must be provided to indicate the location and shape
+ * of the region; the multiplier is passed in the configuration variable.
+ *
+ *
+ * \param[out] outputs Output value.
+ * \param[in] inputs Input data which contains NCHW information.
+ * \param[in] indices Indices data to indicate the sub region.
+ * \param[in] shape Tensor shape of input value.
+ * \param[in] conf Configure variable which contains the multiplied value.
+ */
+template
+void ScaleSubRegion(real* outputs,
+ const real* inputs,
+ const real* indices,
+ const TensorShape shape,
+ const FuncConfig& conf);
+
+/**
+ * \brief Backward propagation function of ScaleSubRegion.
+ *
+ * \param[in] inGrad Gradient w.r.t. the forward output (read only).
+ * \param[out] outGrad Gradient w.r.t. the forward input (accumulated into).
+ * \param[in] indices Indices data.
+ * \param[in] shape The Shape of input tensor.
+ * \param[in] conf Configure variable.
+ */
+template
+void ScaleSubRegionGrad(const real* inGrad,
+ real* outGrad,
+ const real* indices,
+ const TensorShape shape,
+ const FuncConfig& conf);
+} // namespace paddle
diff --git a/paddle/function/ScaleSubRegionOpGpu.cu b/paddle/function/ScaleSubRegionOpGpu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8aae2e44c3fdc8b516e66ecfd2e04f466a17dde9
--- /dev/null
+++ b/paddle/function/ScaleSubRegionOpGpu.cu
@@ -0,0 +1,116 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ScaleSubRegionOp.h"
+#include "hl_base.h"
+
+namespace paddle {
+
+__global__ void KeScaleSubRegion(real* outputs,
+ const real* inputs,
+ const real* indices,
+ real value,
+ int channel,
+ int height,
+ int width,
+ int nthreads) {
+ const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+ if (idx < nthreads) {
+ const int w = idx % width;
+ const int h = (idx / width) % height;
+ const int c = (idx / width / height) % channel;
+ const int n = idx / width / height / channel;
+
+ const int offset = n * 6;
+ if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
+ h >= (indices[offset + 2] - 1) && h <= (indices[offset + 3] - 1) &&
+ w >= (indices[offset + 4] - 1) && w <= (indices[offset + 5] - 1)) {
+ outputs[idx] = inputs[idx] * value;
+ } else {
+ outputs[idx] = inputs[idx];
+ }
+ }
+}
+
+template <>
+void ScaleSubRegion(real* outputs,
+ const real* inputs,
+ const real* indices,
+ const TensorShape shape,
+ const FuncConfig& conf) {
+ real value = conf.get("value");
+
+ int number = shape[0];
+ int channel = shape[1];
+ int height = shape[2];
+ int width = shape[3];
+
+ size_t nth = number * channel * height * width;
+ int blockSize = 1024;
+ int gridSize = (nth + blockSize - 1) / blockSize;
+
+ KeScaleSubRegion<<>>(
+ outputs, inputs, indices, value, channel, height, width, nth);
+ CHECK_SYNC("ScaleSubRegion");
+}
+
+__global__ void KeScaleSubRegionDiff(const real* inGrad,
+ real* outGrad,
+ const real* indices,
+ real value,
+ int channel,
+ int height,
+ int width,
+ int nthreads) {
+ const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+ if (idx < nthreads) {
+ const int w = idx % width;
+ const int h = (idx / width) % height;
+ const int c = (idx / width / height) % channel;
+ const int n = idx / width / height / channel;
+
+ const int offset = n * 6;
+ if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
+ h >= (indices[offset + 2] - 1) && h <= (indices[offset + 3] - 1) &&
+ w >= (indices[offset + 4] - 1) && w <= (indices[offset + 5] - 1)) {
+ outGrad[idx] += inGrad[idx] * value;
+ } else {
+ outGrad[idx] += inGrad[idx];
+ }
+ }
+}
+
+template <>
+void ScaleSubRegionGrad(const real* inGrad,
+ real* outGrad,
+ const real* indices,
+ const TensorShape shape,
+ const FuncConfig& conf) {
+ real value = conf.get("value");
+
+ int number = shape[0];
+ int channel = shape[1];
+ int height = shape[2];
+ int width = shape[3];
+
+ size_t nth = number * channel * height * width;
+ int blockSize = 1024;
+ int gridSize = (nth + blockSize - 1) / blockSize;
+
+ KeScaleSubRegionDiff<<>>(
+ inGrad, outGrad, indices, value, channel, height, width, nth);
+ CHECK_SYNC("ScaleSubRegionGrad");
+}
+
+} // namespace paddle
diff --git a/paddle/function/ScaleSubRegionOpTest.cpp b/paddle/function/ScaleSubRegionOpTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..43331f258dddaa43cbc8cc77519e299de7e98290
--- /dev/null
+++ b/paddle/function/ScaleSubRegionOpTest.cpp
@@ -0,0 +1,72 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include
+#include "FunctionTest.h"
+
+namespace paddle {
+
+TEST(ScaleSubRegion, real) {
+ for (size_t numSamples : {5, 32}) {
+ for (size_t channels : {5, 32}) {
+ for (size_t imgSizeH : {5, 33}) {
+ for (size_t imgSizeW : {5, 32}) {
+ for (real value : {-0.5, 0.0, 0.5}) {
+ for (bool firstHalf : {false, true}) {
+ VLOG(3) << " numSamples=" << numSamples
+ << " channels=" << channels << " imgSizeH=" << imgSizeH
+ << " imgSizeW=" << imgSizeW;
+
+ for (bool testGrad : {false, true}) {
+ CpuGpuFuncCompare compare(
+ testGrad ? "ScaleSubRegionGrad" : "ScaleSubRegion",
+ FuncConfig().set("value", value));
+
+ TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
+ TensorShape indicesShape{numSamples, 6};
+
+ compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+ compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, indicesShape));
+
+ compare.registerInitCallback([=](BufferArg& arg, size_t index) {
+ if (index == 1) {
+ real* data = (real*)arg.data();
+
+ for (size_t i = 0; i < numSamples; ++i) {
+ size_t offset = i * 6;
+ data[offset] = firstHalf ? 1 : channels / 2;
+ data[offset + 1] = firstHalf ? channels / 2 : channels;
+ data[offset + 2] = firstHalf ? 1 : imgSizeH / 2;
+ data[offset + 3] = firstHalf ? imgSizeH / 2 : imgSizeH;
+ data[offset + 4] = firstHalf ? 1 : imgSizeW / 2;
+ data[offset + 5] = firstHalf ? imgSizeW / 2 : imgSizeW;
+ }
+ }
+ });
+
+ compare.addOutputs(
+ BufferArg(
+ VALUE_TYPE_FLOAT, shape, testGrad ? ADD_TO : ASSIGN_TO),
+ testGrad ? ADD_TO : ASSIGN_TO);
+ compare.run();
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+} // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNAddtoLayer.cpp b/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
index 8eb700723f2cf7dda969739bb5e3d48358d278a0..6ffe4fbec643e50d27924a989875454d307f5b9b 100644
--- a/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
@@ -62,16 +62,14 @@ void MKLDNNAddtoLayer::resetFwd(std::vector& pipeline,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
- if (biases_) {
- LOG(FATAL) << "not implemented yet";
- }
- resetFwdBuffers(inVals_, out);
+ resetFwdBuffers(inVals_, bias, out);
in = inVals_[0];
std::shared_ptr fwdPD;
- resetFwdPD(fwdPD, inVals_, out);
+ std::shared_ptr biasPD;
+ resetFwdPD(fwdPD, biasPD, inVals_, bias, out);
- resetFwdPipeline(pipeline, fwdPD, inVals_, out);
+ resetFwdPipeline(pipeline, fwdPD, biasPD, inVals_, bias, out);
}
void MKLDNNAddtoLayer::resetBwd(std::vector& pipeline,
@@ -79,7 +77,7 @@ void MKLDNNAddtoLayer::resetBwd(std::vector& pipeline,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
- resetBwdBuffers(inGrads_, out);
+ resetBwdBuffers(inGrads_, bias, out);
in = inGrads_[0];
// backward only need share output grad to input grad
@@ -89,6 +87,20 @@ void MKLDNNAddtoLayer::resetBwd(std::vector& pipeline,
inputLayers_[i]->getOutputGrad()->setData(inGrads_[i]->getData());
}
}
+
+ // backward bias
+ bwdBias_ = nullptr;
+ if (bias) {
+ std::vector scales(bs_, 1.0);
+ std::vector srcPDs(bs_, bias->getPrimitiveDesc());
+ auto biasPD = sum::primitive_desc(bias->getMemoryDesc(), scales, srcPDs);
+ std::vector srcs;
+ for (size_t i = 0; i < grads_.size(); ++i) {
+ srcs.push_back(*(grads_[i]));
+ }
+ bwdBias_.reset(new sum(biasPD, srcs, *bias));
+ pipeline.push_back(*bwdBias_);
+ }
}
void MKLDNNAddtoLayer::updateWeights(const UpdateCallback& callback) {
@@ -97,7 +109,25 @@ void MKLDNNAddtoLayer::updateWeights(const UpdateCallback& callback) {
}
}
+void MKLDNNAddtoLayer::prepareBias(MKLDNNMatrixPtr& bias,
+ const MatrixPtr& biasMat,
+ const MKLDNNMatrixPtr& out,
+ std::vector& outs) {
+ auto pd = MKLDNNMatrix::createPrimitiveDesc(
+ {(int)layerSize_}, memory::format::x, engine_);
+ bias = MKLDNNMatrix::create(pd, biasMat);
+ outs.clear();
+ real* data = out->getData();
+ CHECK_EQ(bs_ * layerSize_, out->getElementCnt());
+ for (int i = 0; i < bs_; ++i) {
+ MatrixPtr tmp =
+ Matrix::create(data + i * layerSize_, 1, layerSize_, false, false);
+ outs.push_back(MKLDNNMatrix::create(bias->getPrimitiveDesc(), tmp));
+ }
+}
+
void MKLDNNAddtoLayer::resetFwdBuffers(std::vector& inputs,
+ MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
inputs.resize(inputLayers_.size());
for (size_t i = 0; i < inputs.size(); i++) {
@@ -110,12 +140,20 @@ void MKLDNNAddtoLayer::resetFwdBuffers(std::vector& inputs,
}
resetOutValue(out, inputs[0]->getPrimitiveDesc());
+
+ if (biases_ && biases_->getW()) {
+ prepareBias(bias, biases_->getW(), out, vals_);
+ } else {
+ bias = nullptr;
+ }
}
void MKLDNNAddtoLayer::resetFwdPD(std::shared_ptr& pd,
+ std::shared_ptr& biasPD,
std::vector& inputs,
+ MKLDNNMatrixPtr bias,
MKLDNNMatrixPtr out) {
- std::vector scales(inputs.size(), 1.0);
+ std::vector scales(inputs.size(), 1.0);
std::vector srcPDs;
for (size_t i = 0; i < inputs.size(); i++) {
srcPDs.push_back(inputs[i]->getPrimitiveDesc());
@@ -123,12 +161,23 @@ void MKLDNNAddtoLayer::resetFwdPD(std::shared_ptr& pd,
CHECK(out);
pd.reset(new sum::primitive_desc(out->getMemoryDesc(), scales, srcPDs));
CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
+
+ biasPD = nullptr;
+ if (bias) {
+ std::vector scales(2, 1.0);
+ std::vector srcPDs(2, bias->getPrimitiveDesc());
+ biasPD.reset(
+ new sum::primitive_desc(bias->getMemoryDesc(), scales, srcPDs));
+ CHECK_PRIMITIVE_DESC_EQ(bias, biasPD->dst_primitive_desc());
+ }
}
void MKLDNNAddtoLayer::resetFwdPipeline(
std::vector& pipeline,
std::shared_ptr& pd,
+ std::shared_ptr& biasPD,
std::vector& inputs,
+ MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
std::vector srcs;
for (size_t i = 0; i < inputs.size(); i++) {
@@ -136,9 +185,23 @@ void MKLDNNAddtoLayer::resetFwdPipeline(
}
fwd_.reset(new sum(*pd, srcs, *out));
pipeline.push_back(*fwd_);
+
+ fwdBias_.clear();
+ if (biasPD == nullptr || bias == nullptr) {
+ return;
+ }
+ fwdBias_.resize(vals_.size());
+ for (size_t i = 0; i < vals_.size(); ++i) {
+ std::vector srcs;
+ srcs.push_back(*(vals_[i]));
+ srcs.push_back(*bias);
+ fwdBias_[i].reset(new sum(*biasPD, srcs, *vals_[i]));
+ pipeline.push_back(*fwdBias_[i]);
+ }
}
void MKLDNNAddtoLayer::resetBwdBuffers(std::vector& inputs,
+ MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
CHECK(outVal_);
resetOutGrad(out, outVal_->getPrimitiveDesc());
@@ -149,6 +212,12 @@ void MKLDNNAddtoLayer::resetBwdBuffers(std::vector& inputs,
resetInGrad(inputs[i], inVal_->getPrimitiveDesc(), i);
CHECK_PRIMITIVE_DESC_EQ(inputs[i], out->getPrimitiveDesc());
}
+
+ if (biases_ && biases_->getWGrad()) {
+ prepareBias(bias, biases_->getWGrad(), out, grads_);
+ } else {
+ bias = nullptr;
+ }
}
} // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNAddtoLayer.h b/paddle/gserver/layers/MKLDNNAddtoLayer.h
index 15f74ec5bdf3d1e4ae5e09051be6be418590a67a..24504b7b4f50726e2b2757ca3029461cdc27b411 100644
--- a/paddle/gserver/layers/MKLDNNAddtoLayer.h
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.h
@@ -32,9 +32,15 @@ protected:
// layer size == ic * ih * iw == oc * oh *ow, and can not be changed
size_t layerSize_;
- // TODO(TJ): this part has not been optimized by MKL-DNN
std::unique_ptr biases_;
+ // buffers for adding bias
+ std::vector vals_;
+ std::vector grads_;
+ // primitives for adding bias
+ std::vector> fwdBias_;
+ std::shared_ptr bwdBias_;
+
public:
explicit MKLDNNAddtoLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
@@ -91,20 +97,34 @@ protected:
* reset pipeline.
*/
void resetFwdBuffers(std::vector& inputs,
+ MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out);
void resetFwdPD(std::shared_ptr& pd,
+ std::shared_ptr& biasPD,
std::vector& inputs,
+ MKLDNNMatrixPtr bias,
MKLDNNMatrixPtr out);
void resetFwdPipeline(std::vector& pipeline,
std::shared_ptr& pd,
+ std::shared_ptr& biasPD,
std::vector& inputs,
+ MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out);
/**
* Backward functions: reset buffers(inputs, output, bias)
*/
void resetBwdBuffers(std::vector& inputs,
+ MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out);
+
+ /**
+ * Wrap biasMat as the MKLDNN bias; fill outs with per-sample rows of out.
+ */
+ void prepareBias(MKLDNNMatrixPtr& bias,
+ const MatrixPtr& biasMat,
+ const MKLDNNMatrixPtr& out,
+ std::vector& outs);
};
} // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp
index d82063a7130ca928ba042e210eb216f90c7207cd..3429c53d2396e051d62fe0ae405934758e89f9c2 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -60,18 +60,16 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() {
}
CHECK(wgtVal_) << "should have been initialized";
- bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
auto targetDim = wgtVal_->getDims();
- auto srcFmt = hasNoSpatial_ ? format::io : format::ihwo;
+ auto srcFmt = targetDim.size() == 2 ? format::io : format::ihwo;
wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
hasInitedWgt_ = true;
}
void MKLDNNFcLayer::convertWeightsToPaddle() {
CHECK(wgtVal_) << "should have been initialized";
- bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
auto targetDim = wgtVal_->getDims();
- auto dstFmt = hasNoSpatial_ ? format::io : format::ihwo;
+ auto dstFmt = targetDim.size() == 2 ? format::io : format::ihwo;
wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
}
diff --git a/paddle/gserver/layers/MKLDNNLayer.cpp b/paddle/gserver/layers/MKLDNNLayer.cpp
index 5fd62f4f73b18df683ccf74143e45054c3631c22..e75ac5ba4647a8267b7bc189893bd7adb5c3053f 100644
--- a/paddle/gserver/layers/MKLDNNLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNLayer.cpp
@@ -181,21 +181,17 @@ void MKLDNNLayer::resetInValue(
auto extPD = MKLDNNMatrix::createPrimitiveDesc(
{bs_, ic_, ih_, iw_}, format::nchw, engine_);
const MatrixPtr& inMat = inputLayers_[inputIdx]->getOutputValue();
- in = std::dynamic_pointer_cast(inMat);
- CHECK_EQ(inputIsOnlyMKLDNN(), in != nullptr);
- if (in == nullptr || in->getFormat() == format::nc) {
- in = MKLDNNMatrix::create(extPD, inMat);
- }
- extInVal_ = isPaddleFormat(in->getFormat()) ? in : nullptr;
- if (in->getFormat() == format::nc) {
- CHECK(ih_ == 1 && iw_ == 1);
+ extInVal_ = std::dynamic_pointer_cast(inMat);
+ CHECK_EQ(inputIsOnlyMKLDNN(), extInVal_ != nullptr);
+ if (extInVal_ == nullptr || extInVal_->getFormat() == format::nc) {
+ extInVal_ = MKLDNNMatrix::create(extPD, inMat);
}
+ in = extInVal_;
if (nullptr == intPD || in->getPrimitiveDesc() == *intPD) {
return;
}
// need create reorder
in = MKLDNNMatrix::create(*intPD);
- extInVal_ = extInVal_ ? extInVal_ : MKLDNNMatrix::create(extPD, inMat);
cvtInVal_ = MKLDNNMatrix::createReorder(extInVal_, in);
CHECK(cvtInVal_) << "should not be emptry";
}
@@ -291,7 +287,7 @@ void MKLDNNLayer::resetMergeGrad(MKLDNNMatrixPtr& out) {
return;
}
CHECK(out) << "should have reset internal ouput grad";
- std::vector scales(outputMap_.size(), 1.0);
+ std::vector scales(outputMap_.size(), 1.0);
std::vector srcPDs;
std::vector srcs;
for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) {
diff --git a/paddle/gserver/layers/ScaleSubRegionLayer.cpp b/paddle/gserver/layers/ScaleSubRegionLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b18bc0c1b9065e09324d8ab4ed165679f6196d00
--- /dev/null
+++ b/paddle/gserver/layers/ScaleSubRegionLayer.cpp
@@ -0,0 +1,78 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ScaleSubRegionLayer.h"
+#include "paddle/utils/Stat.h"
+namespace paddle {
+
+REGISTER_LAYER(scale_sub_region, ScaleSubRegionLayer);
+
+// Sets up the layer from its config: requires exactly two inputs, reads the
+// scale value from the first input's scale_sub_region_conf and creates the
+// forward/backward functions configured with that value. Returns true.
+bool ScaleSubRegionLayer::init(const LayerMap& layerMap,
+ const ParameterMap& parameterMap) {
+ // NOTE(review): the return value of Layer::init is not checked here.
+ Layer::init(layerMap, parameterMap);
+ // Exactly two input layers are required.
+ CHECK_EQ(static_cast(inputLayers_.size()), 2);
+ auto& conf = config_.inputs(0).scale_sub_region_conf();
+ value_ = conf.value();
+
+ // Both functions receive the same "value" setting.
+ createFunction(forward_, "ScaleSubRegion", FuncConfig().set("value", value_));
+ createFunction(
+ backward_, "ScaleSubRegionGrad", FuncConfig().set("value", value_));
+
+ return true;
+}
+
+// Forward pass: runs the "ScaleSubRegion" function on input 0 (the image
+// batch) and input 1 (per-sample region indices) and assigns the result to
+// this layer's output, which has the same size as input 0.
+void ScaleSubRegionLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  auto in0 = getInput(0);
+  // Prefer the frame size carried by the input; fall back to the configured
+  // image size when the input reports zero for either dimension.
+  imgH_ = in0.getFrameHeight();
+  imgW_ = in0.getFrameWidth();
+  if (imgH_ == 0 || imgW_ == 0) {
+    auto& conf = config_.inputs(0).scale_sub_region_conf();
+    imgH_ = conf.image_conf().img_size_y();
+    imgW_ = conf.image_conf().img_size();
+  }
+  MatrixPtr imgV = in0.value;
+  size_t batchSize = imgV->getHeight();
+  size_t spatialSize = imgH_ * imgW_;
+  // The channel count is derived from the row width and the spatial size.
+  channelsNum_ = imgV->getWidth() / spatialSize;
+  shape_ = TensorShape({batchSize, channelsNum_, imgH_, imgW_});
+
+  resetOutput(batchSize, imgV->getWidth());
+  // Bind by reference: plain `auto` would copy the Argument, and the frame
+  // size set below would be applied to that copy instead of the layer output.
+  auto& out = getOutput();
+  out.setFrameHeight(imgH_);
+  out.setFrameWidth(imgW_);
+
+  MatrixPtr indicesV = getInputValue(1);
+  // Six index values per sample.
+  indicesShape_ = TensorShape({batchSize, 6});
+
+  REGISTER_TIMER_INFO("ScaleSubRegionForward", getName().c_str());
+  BufferArgs inArgs;
+  BufferArgs outArgs;
+  inArgs.addArg(*imgV, shape_);
+  inArgs.addArg(*indicesV, indicesShape_);
+  // ASSIGN_TO: the function result is assigned to the output buffer.
+  outArgs.addArg(*out.value, shape_, ASSIGN_TO);
+  forward_[0]->calc(inArgs, outArgs);
+}
+
+// Backward pass: feeds the output gradient and the region indices (input 1)
+// to the "ScaleSubRegionGrad" function and adds the result to the gradient
+// of input 0.
+void ScaleSubRegionLayer::backward(const UpdateCallback& callback) {
+  REGISTER_TIMER_INFO("ScaleSubRegionBackward", getName().c_str());
+
+  // Arguments read by the gradient function.
+  BufferArgs gradInputs;
+  gradInputs.addArg(*getOutputGrad(), shape_);
+  gradInputs.addArg(*getInputValue(1), indicesShape_);
+
+  // ADD_TO: the result is added to the input gradient buffer.
+  BufferArgs gradOutputs;
+  gradOutputs.addArg(*getInputGrad(0), shape_, ADD_TO);
+
+  backward_[0]->calc(gradInputs, gradOutputs);
+}
+
+} // namespace paddle
diff --git a/paddle/gserver/layers/ScaleSubRegionLayer.h b/paddle/gserver/layers/ScaleSubRegionLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..a27c56de93bb6fdde0f95cd4c5abe5dfabe4e858
--- /dev/null
+++ b/paddle/gserver/layers/ScaleSubRegionLayer.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * \brief For each instance, this layer multiplies a specified continuous
+ *        sub-region by a value. The start index and end index given for
+ *        C/H/W specify the location and shape of the region.
+ *
+ * input_0: Input value.
+ * input_1: Indices value specifying the location and shape of the region.
+ */
+class ScaleSubRegionLayer : public Layer {
+public:
+  explicit ScaleSubRegionLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~ScaleSubRegionLayer() {}
+
+  // `override` lets the compiler verify that these signatures match the
+  // virtual functions declared by Layer.
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+protected:
+  // Shape of the input/output passed to the forward and backward functions.
+  TensorShape shape_;
+  // Shape of the indices input passed to the forward and backward functions.
+  TensorShape indicesShape_;
+  // Image height, width and channel count of the input.
+  size_t imgH_;
+  size_t imgW_;
+  size_t channelsNum_;
+  // Value the selected sub-region is multiplied by.
+  real value_;
+};
+
+} // namespace paddle
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 1a46fb49153a0aa4228f58db481b950bc2d6de83..3f7d8810513ed4b765219be722cf8ae9adc7909f 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -2358,6 +2358,38 @@ TEST(Layer, ScaleShiftLayer) {
}
}
+// Gradient check for the scale_sub_region layer on CPU and GPU, using a
+// 4 x 32 x 32 input (size 4096) and one row of six region indices per sample.
+TEST(Layer, ScaleSubRegionLayer) {
+  const size_t batchSize = 64;
+  const size_t size = 4096;
+  // Number of index values per sample (width of the indices matrix).
+  const size_t indicesPerSample = 6;
+  TestConfig config;
+  config.layerConfig.set_type("scale_sub_region");
+  config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
+  MatrixPtr indicesV =
+      Matrix::create(batchSize, indicesPerSample, false, false);
+  auto* data = indicesV->getData();
+  for (size_t i = 0; i < batchSize; ++i) {
+    // The row stride must equal the matrix width. A stride of 2 made rows
+    // overlap and left most of the matrix uninitialised.
+    // NOTE(review): presumably (C start, C end, H start, H end, W start,
+    // W end) -- confirm against the ScaleSubRegion function.
+    auto* row = data + i * indicesPerSample;
+    row[0] = 2;
+    row[1] = 4;
+    row[2] = 16;
+    row[3] = 32;
+    row[4] = 16;
+    row[5] = 32;
+  }
+  config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "indices", indicesV, {}});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ScaleSubRegionConfig* scaleSubRegionConf =
+      input->mutable_scale_sub_region_conf();
+  ImageConfig* imgConf = scaleSubRegionConf->mutable_image_conf();
+  imgConf->set_img_size(32);
+  imgConf->set_img_size_y(32);
+  imgConf->set_channels(4);
+  scaleSubRegionConf->set_value(2.0);
+  // Second (empty) input config for the indices input.
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "scale_sub_region", batchSize, false, useGpu, false);
+  }
+}
+
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp
index 2e8d9f3333b36005c9b3b28449c76a4a44c74cc6..3960d699ac8dc08316ee413116878ee3eda65793 100644
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -300,13 +300,8 @@ void testAddtoLayer(const testImageDesc& pm, const size_t nInputs) {
TestConfig dnnConfig;
getAddtoConfig(dnnConfig, pm, nInputs);
dnnConfig.layerConfig.set_type("mkldnn_addto");
- // TODO(TJ): test with bias
- for (auto withBias : {false}) {
- if (withBias) {
- dnnConfig.biasSize = pm.ic * pm.ih * pm.iw;
- } else {
- dnnConfig.biasSize = 0;
- }
+ for (auto withBias : {false, true}) {
+ dnnConfig.biasSize = withBias ? pm.ic * pm.ih * pm.iw : 0;
RUN_MKLDNN_TEST_LAYER(dnnConfig, "addto", pm)
}
}
diff --git a/paddle/math/tests/TensorCheck.h b/paddle/math/tests/TensorCheck.h
index 5bc4a03067a75527fa30e5bb5526f93dc7b9fdcc..b998e5772e70d0a0ec79dc4064dcbaa2c302efd2 100644
--- a/paddle/math/tests/TensorCheck.h
+++ b/paddle/math/tests/TensorCheck.h
@@ -169,7 +169,7 @@ void TensorCheck(AssertEq compare,
count++;
}
}
- EXPECT_EQ(count, 0) << "There are " << count << " different element.";
+ EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
}
template
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index b497c877d1b564434807ac040112020445bd71f1..29ce44c23308cb5ae1c1df5c9be1412c28abe96f 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -170,6 +170,8 @@ set(DEPS_OPS
sequence_conv_op
sequence_pool_op
lod_rank_table_op
+ lod_tensor_to_array_op
+ array_to_lod_tensor_op
lstm_op
tensor_array_read_write_op
gru_op)
@@ -182,6 +184,8 @@ op_library(sum_op DEPS net_op selected_rows_functor)
op_library(pool_op DEPS pooling)
op_library(pool_with_index_op DEPS pooling)
op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table)
+op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op)
+op_library(array_to_lod_tensor_op SRCS array_to_lod_tensor_op.cc DEPS lod_rank_table_op)
op_library(tensor_array_read_write_op SRCS tensor_array_read_write_op.cc)
if(WITH_GPU)
op_library(nccl_op DEPS nccl_common)
@@ -191,8 +195,13 @@ op_library(sequence_pool_op DEPS sequence_pooling)
op_library(lstm_op DEPS sequence2batch lstm_compute)
op_library(conv_transpose_op DEPS vol2col)
op_library(gru_op DEPS sequence2batch gru_compute)
-op_library(dynamic_recurrent_op SRCS dynamic_recurrent_op.cc rnn/recurrent_op_utils.cc
- DEPS net_op tensor_array)
+if(WITH_TESTING)
+ op_library(dynamic_recurrent_op SRCS dynamic_recurrent_op.cc rnn/recurrent_op_utils.cc
+ DEPS net_op tensor_array gtest)
+else()
+ op_library(dynamic_recurrent_op SRCS dynamic_recurrent_op.cc rnn/recurrent_op_utils.cc
+ DEPS net_op tensor_array)
+endif()
op_library(recurrent_op SRCS recurrent_op.cc DEPS executor)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc
index eaafb9ad54b371837cbc0f3268f7f2bf169e83e8..03c2fa945d94a522d25e65103c8842a93852ba3d 100644
--- a/paddle/operators/accuracy_op.cc
+++ b/paddle/operators/accuracy_op.cc
@@ -47,10 +47,11 @@ class AccuracyOp : public framework::OperatorWithKernel {
}
protected:
- // IndicateDataType
- framework::DataType IndicateDataType(
+ framework::OpKernelType GetKernelType(
const framework::ExecutionContext &ctx) const override {
- return framework::ToDataType(ctx.Input("Out")->type());
+ return framework::OpKernelType(
+ framework::ToDataType(ctx.Input("Out")->type()),
+ ctx.device_context());
}
};
diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu
index d0c4c0d25d6f4e3ab7acd72d62a8a17fa102637b..1776f33105367447759aa91c25263dfc53bd2f99 100644
--- a/paddle/operators/accuracy_op.cu
+++ b/paddle/operators/accuracy_op.cu
@@ -65,7 +65,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel {
size_t num_samples = inference->dims()[0];
size_t infer_width = inference->dims()[1];
- cudaMemset((void**)&accuracy_data, 0, sizeof(float));
+ PADDLE_ENFORCE(cudaMemset(accuracy_data, 0, sizeof(float)));
if (num_samples == 0) {
return;
diff --git a/paddle/operators/array_operator.h b/paddle/operators/array_operator.h
new file mode 100644
index 0000000000000000000000000000000000000000..666043e824f885e9c0e79e319d0a38ba108c209a
--- /dev/null
+++ b/paddle/operators/array_operator.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#pragma once
+#include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+// Operator base class providing GetOffset(), which reads the single element
+// stored in input "I" and returns it as a size_t.
+class ArrayOp : public framework::OperatorBase {
+ public:
+ ArrayOp(const std::string &type, const framework::VariableNameMap &inputs,
+ const framework::VariableNameMap &outputs,
+ const framework::AttributeMap &attrs)
+ : OperatorBase(type, inputs, outputs, attrs) {}
+
+ protected:
+ // Returns the value held by input variable "I".
+ // Enforces that "I" exists in `scope` and has exactly one element. If the
+ // tensor lives on a GPU place it is copied to the CPU first, and `dev_ctx`
+ // is waited on before the value is read.
+ size_t GetOffset(const framework::Scope &scope,
+ const platform::DeviceContext &dev_ctx) const {
+ auto *i = scope.FindVar(Input("I"));
+ PADDLE_ENFORCE(i != nullptr, "I must be set");
+ auto &i_tensor = i->Get();
+ PADDLE_ENFORCE_EQ(i_tensor.numel(), 1);
+ size_t offset;
+ if (platform::is_gpu_place(i_tensor.place())) {
+ // FIXME: Avoid copy from GPU to CPU
+ framework::Tensor t;
+ t.CopyFrom(i_tensor, platform::CPUPlace(), dev_ctx);
+ // Wait on the device context before dereferencing the CPU copy.
+ dev_ctx.Wait();
+ offset = static_cast(*t.data());
+ } else {
+ offset = static_cast(*i_tensor.data());
+ }
+ return offset;
+ }
+};
+
+} // namespace operators
+} // namespace paddle
diff --git a/paddle/operators/array_to_lod_tensor_op.cc b/paddle/operators/array_to_lod_tensor_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c0903bb4e5ca7f160e19eefab99af7e3e4a8ed76
--- /dev/null
+++ b/paddle/operators/array_to_lod_tensor_op.cc
@@ -0,0 +1,170 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include
+#include "paddle/framework/lod_rank_table.h"
+#include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/memory/memcpy.h"
+
+namespace paddle {
+namespace operators {
+
+using LoD = framework::LoD;
+
+// Operator that packs the tensors of input array "X" into one LoDTensor
+// "Out". Rank-table items are visited in ascending order of their `index`
+// field; for each item, the matching slice of every array element it spans
+// is appended to "Out", and the LoD of "Out" is rebuilt from the rank
+// table's coarse LoD, the item lengths and the sub-LoDs of the slices.
+class ArrayToLoDTensorOp : public framework::OperatorBase {
+ public:
+  ArrayToLoDTensorOp(const std::string &type,
+                     const framework::VariableNameMap &inputs,
+                     const framework::VariableNameMap &outputs,
+                     const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto &x = scope.FindVar(Input("X"))->Get();
+    auto &rank_table =
+        scope.FindVar(Input("RankTable"))->Get();
+    auto *out =
+        scope.FindVar(Output("Out"))->GetMutable();
+
+    // Check dims, place and data type of input's elements and infer output's
+    // dim
+    PADDLE_ENFORCE(!x.empty(), "There's no element in the input array.");
+    int rank = x[0].dims().size();
+    platform::Place place = x[0].place();
+    std::type_index data_type = x[0].type();
+    // Per-instance dims: everything except the leading (batch) dimension.
+    framework::DDim ins_dims = framework::slice_ddim(x[0].dims(), 1, rank);
+    // The output's leading dimension is the sum over all array elements.
+    int64_t batch_size = x[0].dims()[0];
+    for (size_t i = 1; i < x.size(); ++i) {
+      PADDLE_ENFORCE_EQ(framework::slice_ddim(x[i].dims(), 1, rank), ins_dims,
+                        "The dimension of the %zu'th element in LoDTensorArray "
+                        "differs from previous ones.",
+                        i);
+      PADDLE_ENFORCE(platform::places_are_same_class(x[i].place(), place),
+                     "The place class of the %zu'th element in LoDTensorArray "
+                     "differs from previous ones.",
+                     i);
+      PADDLE_ENFORCE(x[i].type() == data_type,
+                     "The data type of the %zu'th element in LoDTensorArray "
+                     "differs from previous ones.",
+                     i);
+      batch_size += x[i].dims()[0];
+    }
+    auto ins_dim_vec = framework::vectorize(ins_dims);
+    ins_dim_vec.insert(ins_dim_vec.begin(), batch_size);
+    framework::DDim out_dims = framework::make_ddim(ins_dim_vec);
+    out->Resize(out_dims);
+    out->mutable_data(place, data_type);
+
+    auto &table_items = rank_table.items();
+    std::vector table_item_idx(table_items.size());
+    // table_item_idx = range(table_items_idx.size())
+    std::iota(table_item_idx.begin(), table_item_idx.end(), 0);
+    // Visit the rank-table items in ascending order of their `index` field.
+    std::sort(table_item_idx.begin(), table_item_idx.end(),
+              [&](size_t a, size_t b) {
+                return table_items[a].index < table_items[b].index;
+              });
+
+    // Build LoDTensor `out`
+    framework::LoD *out_lod = out->mutable_lod();
+    out_lod->clear();
+    size_t out_offset = 0;
+    // prefix_lod = coarse LoD of the rank table plus one new level holding
+    // the cumulative item lengths.
+    auto prefix_lod = rank_table.coarse_lod();
+    prefix_lod.emplace_back();
+    auto &cur_level_lod = prefix_lod.back();
+    cur_level_lod.push_back(0);
+    for (size_t idx : table_item_idx) {
+      cur_level_lod.push_back(cur_level_lod.back() + table_items[idx].length);
+      for (size_t x_idx = 0; x_idx < table_items[idx].length; ++x_idx) {
+        auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
+            x[x_idx].lod(), idx, idx + 1, 0);
+
+        auto &lod_length = lod_and_offset.first;
+        framework::AppendLoD(out_lod, lod_length);
+
+        size_t start_offset = lod_and_offset.second.first;
+        size_t end_offset = lod_and_offset.second.second;
+        // Log both ends of the copied range (start_offset was missing).
+        VLOG(10) << "idx=" << idx << " x_idx=" << x_idx << " ["
+                 << start_offset << ", " << end_offset << "]";
+        // Copy data
+        PADDLE_ENFORCE_GE(end_offset, start_offset);
+        size_t len = end_offset - start_offset;
+        if (len == 0) {
+          continue;
+        }
+        out->Slice(out_offset, out_offset + len)
+            .CopyFrom(x[x_idx].Slice(start_offset, end_offset), place, dev_ctx);
+        out_offset += len;
+      }
+    }
+    // Prepend the coarse levels in front of the levels appended above.
+    out_lod->insert(out_lod->begin(), prefix_lod.begin(), prefix_lod.end());
+  }
+};
+
+// Declares the interface of array_to_lod_tensor: inputs "X" and "RankTable",
+// output "Out", and the operator's documentation string.
+class ArrayToLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+ ArrayToLoDTensorOpProtoMaker(framework::OpProto *proto,
+ framework::OpAttrChecker *op_checker)
+ : OpProtoAndCheckerMaker(proto, op_checker) {
+ // The tensor array that is merged into the output.
+ AddInput("X",
+ "(std::vector) A vector of tensors that is going to "
+ "be casted to a big LoDTensor.");
+ // The rank table supplying the coarse LoD for the output.
+ AddInput("RankTable",
+ "(LoDRankTable) RankTable provides the coarse lod infomation to "
+ "build the output LoDTensor. See "
+ "'paddle/framework/lod_rank_table.h' for more details.");
+ AddOutput("Out", "(LoDTensor) The LoDTensor formed by input tensor array.");
+ AddComment(
+ R"DOC(This Op build a big LoDTensor from a std::vector
+ and a LoDRankTable. It is supposed to be used in getting dynamic RNN's
+ outputs back to a normal LoDTensor. The std::vector
+ would be the output of RNN Op and the LoDRankTable would be build
+ with RNN's input.)DOC");
+ }
+};
+
+// Shape inference for array_to_lod_tensor: requires inputs "X" and
+// "RankTable" and gives "Out" the dims reported for "X".
+class ArrayToLoDTensorInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    // Both inputs must be present.
+    PADDLE_ENFORCE(context->HasInput("X"),
+                   "ArrayToLoDTensorOp must has input X.");
+    PADDLE_ENFORCE(context->HasInput("RankTable"),
+                   "ArrayToLoDTensorOp must has input RankTable.");
+
+    // Out takes the dims of X.
+    auto x_dims = context->GetInputDim("X");
+    context->SetOutputDim("Out", x_dims);
+  }
+};
+
+// Builds the gradient op description for array_to_lod_tensor: a
+// "lod_tensor_to_array" op whose input X is the gradient of the forward Out,
+// whose RankTable is the forward RankTable, and whose output Out is the
+// gradient of the forward X. The forward attributes are passed through.
+class ArrayToLoDTensorGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+ using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+ std::unique_ptr Apply() const override {
+ auto *grad_op = new framework::OpDescBind();
+ grad_op->SetType("lod_tensor_to_array");
+ grad_op->SetInput("X", OutputGrad("Out"));
+ grad_op->SetInput("RankTable", Input("RankTable"));
+ grad_op->SetOutput("Out", InputGrad("X"));
+ grad_op->SetAttrMap(Attrs());
+ // Ownership of the raw pointer is handed to the returned unique_ptr.
+ return std::unique_ptr(grad_op);
+ }
+};
+
+} // namespace operators
+} // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(array_to_lod_tensor, ops::ArrayToLoDTensorOp,
+ ops::ArrayToLoDTensorOpProtoMaker,
+ ops::ArrayToLoDTensorInferShape,
+ ops::ArrayToLoDTensorGradMaker);
diff --git a/paddle/operators/auc_op.cc b/paddle/operators/auc_op.cc
index ccb969ab23a8e1df713278513a66b10f21690108..6c3f67ec32fb1b942241997e87a1e9c4752e707d 100644
--- a/paddle/operators/auc_op.cc
+++ b/paddle/operators/auc_op.cc
@@ -39,10 +39,11 @@ class AucOp : public framework::OperatorWithKernel {
}
protected:
- // IndicateDataType
- framework::DataType IndicateDataType(
+ framework::OpKernelType GetKernelType(
const framework::ExecutionContext &ctx) const override {
- return framework::ToDataType(ctx.Input("Out")->type());
+ return framework::OpKernelType(
+ framework::ToDataType(ctx.Input("Out")->type()),
+ ctx.device_context());
}
};
diff --git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc
index 7d73dfde786208fe217dac97325d432fb80052ad..8721ca352848fc4d69b206d4ea0ab7c581c8d055 100644
--- a/paddle/operators/batch_norm_op.cc
+++ b/paddle/operators/batch_norm_op.cc
@@ -303,7 +303,8 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
ctx->SetOutputDim(framework::GradVarName("Bias"), {C});
}
- framework::DataType IndicateDataType(
+ protected:
+ framework::OpKernelType GetKernelType(
const framework::ExecutionContext &ctx) const override {
const auto *var = ctx.InputVar(framework::GradVarName("Y"));
if (var == nullptr) {
@@ -318,7 +319,8 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
if (t == nullptr) {
PADDLE_THROW("can't find Y@GRAD");
}
- return framework::ToDataType(t->type());
+ return framework::OpKernelType(framework::ToDataType(t->type()),
+ ctx.device_context());
}
};
diff --git a/paddle/operators/clip_by_norm_op.cc b/paddle/operators/clip_by_norm_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d9fc532e39500fa397be80396b075e866bad9362
--- /dev/null
+++ b/paddle/operators/clip_by_norm_op.cc
@@ -0,0 +1,70 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include "paddle/operators/clip_by_norm_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Operator definition for clip_by_norm. InferShape requires input X and
+// output Out, enforces max_norm > 0, and gives Out the dims and LoD of X.
+class ClipByNormOp : public framework::OperatorWithKernel {
+ public:
+ using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+ void InferShape(framework::InferShapeContext* ctx) const override {
+ PADDLE_ENFORCE(ctx->HasInput("X"),
+ "Input(X) of ClipByNormOp should not be null.");
+ PADDLE_ENFORCE(ctx->HasOutput("Out"),
+ "Output(Out) of ClipByNormOp should not be null.");
+ auto max_norm = ctx->Attrs().Get("max_norm");
+ PADDLE_ENFORCE_GT(max_norm, 0, "max_norm should be greater than 0.");
+ // Out has the same shape as X and shares its LoD.
+ auto x_dims = ctx->GetInputDim("X");
+ ctx->SetOutputDim("Out", x_dims);
+ ctx->ShareLoD("X", /*->*/ "Out");
+ }
+};
+
+// Declares the interface of clip_by_norm: input X, output Out, attribute
+// max_norm, and the operator's documentation string.
+class ClipByNormOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+ ClipByNormOpMaker(framework::OpProto* proto,
+ framework::OpAttrChecker* op_checker)
+ : OpProtoAndCheckerMaker(proto, op_checker) {
+ // NOTE(review): the two adjacent literals below are concatenated without a
+ // separating space ("...op.The number...").
+ AddInput("X",
+ "(Tensor) The input of clip_by_norm op."
+ "The number of dimensions must be between [1, 9].");
+ AddOutput("Out",
+ "(Tensor) The output of clip_by_norm op with shape as input(X)");
+ AddAttr("max_norm", "(float) The maximum norm value.");
+ AddComment(R"DOC(
+ClipByNorm operator limits the L2 norm of the input 'X' within 'max_norm'.
+If the L2 norm of 'X' is less than or equal to 'max_norm', 'Out' will be
+the same as 'X'. If the L2 norm of 'X' is greater than 'max_norm', 'X' will
+be linearly scaled to make the L2 norm of 'Out' equal to 'max_norm', as
+shown in the following formula:
+
+'Out' = 'max_norm' * 'X' / norm('X'),
+
+where norm('X') represents the L2 norm of 'X'.
+)DOC");
+ }
+};
+
+} // namespace operators
+} // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(clip_by_norm, ops::ClipByNormOp,
+ ops::ClipByNormOpMaker);
+REGISTER_OP_CPU_KERNEL(
+ clip_by_norm, ops::ClipByNormKernel);
diff --git a/paddle/operators/fill_constant_op.cu b/paddle/operators/clip_by_norm_op.cu
similarity index 63%
rename from paddle/operators/fill_constant_op.cu
rename to paddle/operators/clip_by_norm_op.cu
index bca402a8b988b570a083e9ce253342304f4b8946..2593a24ebbf56ecd286a726e527d2414247576e8 100644
--- a/paddle/operators/fill_constant_op.cu
+++ b/paddle/operators/clip_by_norm_op.cu
@@ -12,13 +12,8 @@
See the License for the specific language governing permissions and
limitations under the License. */
-#define EIGEN_USE_GPU
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/fill_constant_op.h"
+#include "paddle/operators/clip_by_norm_op.h"
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(
- fill_constant, ops::FillConstantOpKernel,
- ops::FillConstantOpKernel,
- ops::FillConstantOpKernel,
- ops::FillConstantOpKernel);
+ clip_by_norm, ops::ClipByNormKernel);
diff --git a/paddle/operators/clip_by_norm_op.h b/paddle/operators/clip_by_norm_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..b26476cae9b5b2fa290bc9186b9a64c48ba703d6
--- /dev/null
+++ b/paddle/operators/clip_by_norm_op.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/transform.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template
+using EigenVector = framework::EigenVector;
+
+// Kernel for clip_by_norm: scales X so that its L2 norm does not exceed the
+// "max_norm" attribute and writes the result to Out.
+template
+class ClipByNormKernel : public framework::OpKernel {
+ public:
+ void Compute(const framework::ExecutionContext& context) const override {
+ auto max_norm = context.Attr("max_norm");
+ auto* input = context.Input("X");
+ auto* output = context.Output("Out");
+ output->mutable_data(context.GetPlace());
+
+ // Work on flattened views of the input and output.
+ auto x = EigenVector::Flatten(*input);
+ auto out = EigenVector::Flatten(*output);
+ // L2 norm of X: sqrt(sum(x^2)).
+ auto x_norm = x.square().sum().sqrt();
+ auto place = context.GetEigenDevice();
+
+ // temp is 1 when the norm is within max_norm (no clipping), else 0.
+ auto temp = (x_norm <= max_norm).template cast().eval();
+ // scaling is 1 when temp == 1 and max_norm / x_norm when temp == 0.
+ // NOTE(review): for x_norm == 0 the second term is 0 * max_norm / 0;
+ // verify this cannot produce NaN for an all-zero input.
+ auto scaling = temp + (static_cast(1) - temp) * max_norm / x_norm;
+ // Reshape the scaling factor to one element and broadcast it to
+ // input->numel() elements so it can multiply x element-wise.
+ Eigen::array one_dim{{1}};
+ Eigen::DSizes m_dsize(input->numel());
+ out.device(place) = x * scaling.reshape(one_dim).broadcast(m_dsize);
+ }
+};
+
+} // namespace operators
+} // namespace paddle
diff --git a/paddle/operators/compare_op.cc b/paddle/operators/compare_op.cc
index 8b425d14df3bc484437dc72f29abf13b887006bd..716b5ee92d0d8737d2069460f53989f691ff7c77 100644
--- a/paddle/operators/compare_op.cc
+++ b/paddle/operators/compare_op.cc
@@ -14,6 +14,7 @@
#include "paddle/operators/compare_op.h"
#include "paddle/framework/op_registry.h"
+
namespace paddle {
namespace operators {
template
@@ -61,19 +62,34 @@ class CompareOpInferShape : public framework::InferShapeBase {
}
};
+// Operator class for the comparison ops. It overrides kernel selection so
+// that the kernel's place is taken from input X rather than from the default
+// returned by OperatorWithKernel::GetKernelType.
+class CompareOp : public framework::OperatorWithKernel {
+ public:
+ using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+ // Returns the default kernel type with its place replaced by X's place.
+ framework::OpKernelType GetKernelType(
+ const framework::ExecutionContext &ctx) const override {
+ framework::OpKernelType kt = OperatorWithKernel::GetKernelType(ctx);
+ // CompareOp kernel's device type is decided by input tensor place
+ kt.place_ = ctx.Input("X")->place();
+ return kt;
+ }
+};
+
} // namespace operators
} // namespace paddle
-#define REGISTER_LOGICAL_OP(op_type, _equation) \
- struct _##op_type##Comment { \
- static char type[]; \
- static char equation[]; \
- }; \
- char _##op_type##Comment::type[]{#op_type}; \
- char _##op_type##Comment::equation[]{_equation}; \
- REGISTER_OP_WITH_KERNEL( \
- op_type, ::paddle::operators::CompareOpProtoMaker<_##op_type##Comment>, \
- ::paddle::operators::CompareOpInferShape<_##op_type##Comment>, \
+#define REGISTER_LOGICAL_OP(op_type, _equation) \
+ struct _##op_type##Comment { \
+ static char type[]; \
+ static char equation[]; \
+ }; \
+ char _##op_type##Comment::type[]{#op_type}; \
+ char _##op_type##Comment::equation[]{_equation}; \
+ REGISTER_OPERATOR( \
+ op_type, ::paddle::operators::CompareOp, \
+ ::paddle::operators::CompareOpProtoMaker<_##op_type##Comment>, \
+ ::paddle::operators::CompareOpInferShape<_##op_type##Comment>, \
::paddle::framework::EmptyGradOpMaker);
REGISTER_LOGICAL_OP(less_than, "Out = X < Y");
diff --git a/paddle/operators/crf_decoding_op.cc b/paddle/operators/crf_decoding_op.cc
index d1ce74c4b911545476f0b362df0e0f7a0d14cfb4..f418f489c0ff471464a23380598e9f4c8da16ca9 100644
--- a/paddle/operators/crf_decoding_op.cc
+++ b/paddle/operators/crf_decoding_op.cc
@@ -120,9 +120,11 @@ class CRFDecodingOp : public framework::OperatorWithKernel {
}
protected:
- framework::DataType IndicateDataType(
+ framework::OpKernelType GetKernelType(
const framework::ExecutionContext& ctx) const override {
- return framework::ToDataType(ctx.Input("Emission")->type());
+ return framework::OpKernelType(
+ framework::ToDataType(ctx.Input("Emission")->type()),
+ ctx.device_context());
}
};
} // namespace operators
diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc
index 9d41879b27a24f83090f5abf1325eca5f9488d00..1e82742eaf86711fe4f9d02d517ad1853131cf67 100644
--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -51,9 +51,11 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
protected:
// Explicitly set that the data type of computation kernel of cross_entropy
// is determined by its input "X".
- framework::DataType IndicateDataType(
+ framework::OpKernelType GetKernelType(
const framework::ExecutionContext& ctx) const override {
- return framework::ToDataType(ctx.Input("X")->type());
+ return framework::OpKernelType(
+ framework::ToDataType(ctx.Input("X")->type()),
+ ctx.device_context());
}
};
@@ -98,9 +100,11 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
protected:
// Explicitly set that the data type of computation kernel of cross_entropy
// is determined by its input "X".
- framework::DataType IndicateDataType(
+ framework::OpKernelType GetKernelType(
const framework::ExecutionContext& ctx) const override {
- return framework::ToDataType(ctx.Input("X")->type());
+ return framework::OpKernelType(
+ framework::ToDataType(ctx.Input("X")->type()),
+ ctx.device_context());
}
};
diff --git a/paddle/operators/fill_constant_batch_size_like_op.cc b/paddle/operators/fill_constant_batch_size_like_op.cc
index 232d88e26bd2fb17f52c2dc3626d47c2615b1d42..85871ebbfcd8ee38ef5e8078d1d6cb6bdda46a7b 100644
--- a/paddle/operators/fill_constant_batch_size_like_op.cc
+++ b/paddle/operators/fill_constant_batch_size_like_op.cc
@@ -49,9 +49,11 @@ class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel {
}
protected:
- framework::DataType IndicateDataType(
+ framework::OpKernelType GetKernelType(
const framework::ExecutionContext &ctx) const override {
- return static_cast(ctx.Attr("data_type"));
+ return framework::OpKernelType(
+ static_cast(ctx.Attr("data_type")),
+ ctx.device_context());
}
};
@@ -73,10 +75,10 @@ class FillConstantBatchSizeLikeOpMaker
"with the specified value");
AddAttr>("shape", "(vector) The shape of the output");
AddAttr("input_dim_idx",
- "(int, default 0) the index of input's batch size dimension")
+ "(int, default 0) The index of input's batch size dimension")
.SetDefault(0);
AddAttr("output_dim_idx",
- "(int, default 0) the index of output's batch size dimension")
+ "(int, default 0) The index of output's batch size dimension")
.SetDefault(0);
AddAttr("value", "(float, default 0) The value to be filled")
.SetDefault(0.0f);
diff --git a/paddle/operators/fill_constant_op.cc b/paddle/operators/fill_constant_op.cc
index f60425051cc93b77ef9e383f90a62eb9dfed44de..818f113b90a4c239a857791fb9957e51d3287b97 100644
--- a/paddle/operators/fill_constant_op.cc
+++ b/paddle/operators/fill_constant_op.cc
@@ -12,32 +12,41 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/operators/fill_constant_op.h"
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
namespace paddle {
namespace operators {
-class FillConstantOp : public framework::OperatorWithKernel {
+class FillConstantInferShape : public framework::InferShapeBase {
public:
- using framework::OperatorWithKernel::OperatorWithKernel;
-
- void InferShape(framework::InferShapeContext *ctx) const override {
+ void operator()(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of FillConstantOp should not be null.");
auto &shape = ctx->Attrs().Get>("shape");
- std::vector shape_int64(shape.size(), 0);
- std::transform(shape.begin(), shape.end(), shape_int64.begin(),
- [](int a) { return static_cast(a); });
- auto dims = framework::make_ddim(shape_int64);
- ctx->SetOutputDim("Out", dims);
+ ctx->SetOutputDim("Out", framework::make_ddim(shape));
}
+};
- protected:
- framework::DataType IndicateDataType(
- const framework::ExecutionContext &ctx) const override {
- int data_type = ctx.Attr("data_type");
- VLOG(10) << " FillConstant data_type = " << data_type;
- return static_cast(data_type);
+class FillConstantOp : public framework::OperatorBase {
+ public:
+ using framework::OperatorBase::OperatorBase;
+ void Run(const framework::Scope &scope,
+ const platform::DeviceContext &dev_ctx) const override {
+ auto data_type = static_cast(Attr("data_type"));
+ auto value = Attr("value");
+ auto force_cpu = Attr("force_cpu");
+ auto &out =
+ *scope.FindVar(Output("Out"))->GetMutable();
+ out.Resize(framework::make_ddim(Attr>("shape")));
+ if (force_cpu) {
+ auto cpu = platform::CPUPlace();
+ out.mutable_data(cpu, framework::ToTypeIndex(data_type));
+ } else {
+ out.mutable_data(dev_ctx.GetPlace(), framework::ToTypeIndex(data_type));
+ }
+ math::set_constant(dev_ctx, &out, value);
}
};
@@ -53,6 +62,11 @@ class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr>("shape", "(vector) The shape of the output");
AddAttr("value", "(float, default 0) The value to be filled")
.SetDefault(0.0f);
+ AddAttr("force_cpu",
+ "(bool, default false) Force fill output variable to cpu "
+ "memory. Otherwise, fill output variable to the running "
+ "device")
+ .SetDefault(false);
AddOutput("Out",
"(Tensor) Tensor of specified shape will be filled "
"with the specified value");
@@ -68,10 +82,6 @@ Fill up a variable with specified constant value.
} // namespace paddle
namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(fill_constant, ops::FillConstantOp,
- ops::FillConstantOpMaker);
-REGISTER_OP_CPU_KERNEL(
- fill_constant, ops::FillConstantOpKernel,
- ops::FillConstantOpKernel,
- ops::FillConstantOpKernel,
- ops::FillConstantOpKernel);
+REGISTER_OPERATOR(fill_constant, ops::FillConstantOp,
+ ops::FillConstantInferShape, ops::FillConstantOpMaker,
+ paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/operators/fill_constant_op.h b/paddle/operators/fill_constant_op.h
deleted file mode 100644
index 3668f42f1c29541e29463ff3969064e80703fa04..0000000000000000000000000000000000000000
--- a/paddle/operators/fill_constant_op.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-template
-class FillConstantOpKernel : public framework::OpKernel {
- public:
- void Compute(const framework::ExecutionContext& ctx) const override {
- auto* out = ctx.Output("Out");
- out->mutable_data(ctx.GetPlace());
- auto value = ctx.Attr("value");
-
- auto out_eigen = framework::EigenVector::Flatten(*out);
- auto place = ctx.GetEigenDevice();
- out_eigen.device(place) = out_eigen.constant(static_cast(value));
- }
-};
-
-} // namespace operators
-} // namespace paddle
diff --git a/paddle/operators/gather_op.cc b/paddle/operators/gather_op.cc
index aee672500ee5d1bf6cc7ef872f2cb6c408de6d9e..8f80fb162519f60fcce897b3c31a3507bbf6ba6d 100644
--- a/paddle/operators/gather_op.cc
+++ b/paddle/operators/gather_op.cc
@@ -40,9 +40,11 @@ class GatherOp : public framework::OperatorWithKernel {
}
protected:
- framework::DataType IndicateDataType(
+ framework::OpKernelType GetKernelType(
const framework::ExecutionContext& ctx) const override {
- return framework::ToDataType(ctx.Input("X")->type());
+ return framework::OpKernelType(
+ framework::ToDataType(ctx.Input("X")->type()),
+ ctx.device_context());
}
};
@@ -55,9 +57,11 @@ class GatherGradOp : public framework::OperatorWithKernel {
}
protected:
- framework::DataType IndicateDataType(
+ framework::OpKernelType GetKernelType(
const framework::ExecutionContext& ctx) const override {
- return framework::ToDataType(ctx.Input("X")->type());
+ return framework::OpKernelType(
+ framework::ToDataType(ctx.Input("X")->type()),
+ ctx.device_context());
}
};
diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc
index 802c98ae764d02af3143d1d39b714d486791da82..53ad86c6c48d1868f4495af51661d91b39a84f0b 100644
--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
@@ -57,9 +57,11 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
}
protected:
- framework::DataType IndicateDataType(
+ framework::OpKernelType GetKernelType(
const framework::ExecutionContext& ctx) const override {
- return static_cast(ctx.Attr("data_type"));
+ return framework::OpKernelType(
+ static_cast(ctx.Attr("data_type")),
+ ctx.device_context());
}
};
diff --git a/paddle/operators/linear_chain_crf_op.cc b/paddle/operators/linear_chain_crf_op.cc
index bcb48e13bd948b4e91ce8cbd7231a9619fac8d18..066bdf67aa037e9c25cfdfaff7ec8771eb59cde8 100644
--- a/paddle/operators/linear_chain_crf_op.cc
+++ b/paddle/operators/linear_chain_crf_op.cc
@@ -183,9 +183,11 @@ class LinearChainCRFOp : public framework::OperatorWithKernel {
protected:
// Explicitly set that the data type of computation kernel of linear_chain_crf
// is determined by its input "Emission".
- framework::DataType IndicateDataType(
+ framework::OpKernelType GetKernelType(
const framework::ExecutionContext& ctx) const override {
- return framework::ToDataType(ctx.Input("Emission")->type());
+ return framework::OpKernelType(
+ framework::ToDataType(ctx.Input("Emission")->type()),
+ ctx.device_context());
}
};
@@ -240,10 +242,13 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
protected:
// Explicitly set that the data type of output of the linear_chain_crf_grad
// operator is determined by its input: gradients of LogLikelihood.
- framework::DataType IndicateDataType(
+ framework::OpKernelType GetKernelType(
const framework::ExecutionContext& ctx) const override {
- return framework::ToDataType(
- ctx.Input(framework::GradVarName("LogLikelihood"))->type());
+ return framework::OpKernelType(
+ framework::ToDataType(
+ ctx.Input(framework::GradVarName("LogLikelihood"))
+ ->type()),
+ ctx.device_context());
}
};
diff --git a/paddle/operators/lod_rank_table_op.cc b/paddle/operators/lod_rank_table_op.cc
index be198951c241dc5e5587c8a2b8d94f67173d2b2a..ce010fcb91873b3099f6bf52cfe20c1ff61846ea 100644
--- a/paddle/operators/lod_rank_table_op.cc
+++ b/paddle/operators/lod_rank_table_op.cc
@@ -28,6 +28,7 @@ class LoDRankTableOp : public framework::OperatorBase {
auto x = scope.FindVar(Input("X"))->Get();
auto *out =
scope.FindVar(Output("Out"))->GetMutable();
+ VLOG(10) << "Level = " << static_cast