提交 420f9c38 编写于 作者: L Luo Tao

Merge branch 'develop' into warpctc_deps

...@@ -9,7 +9,7 @@ import subprocess ...@@ -9,7 +9,7 @@ import subprocess
import platform import platform
COPYRIGHT = ''' COPYRIGHT = '''
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
......
...@@ -31,9 +31,6 @@ if(NOT CMAKE_CROSSCOMPILING) ...@@ -31,9 +31,6 @@ if(NOT CMAKE_CROSSCOMPILING)
endif(NOT CMAKE_CROSSCOMPILING) endif(NOT CMAKE_CROSSCOMPILING)
find_package(Git REQUIRED) find_package(Git REQUIRED)
find_package(Threads REQUIRED) find_package(Threads REQUIRED)
if(NOT ANDROID AND NOT IOS)
find_package(Boost QUIET)
endif()
include(simd) include(simd)
...@@ -42,7 +39,7 @@ option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_F ...@@ -42,7 +39,7 @@ option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_F
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND}) option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check" ON) option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check" ON)
option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON) option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
...@@ -140,6 +137,7 @@ include(external/openblas) # download, build, install openblas ...@@ -140,6 +137,7 @@ include(external/openblas) # download, build, install openblas
include(external/mkldnn) # download, build, install mkldnn include(external/mkldnn) # download, build, install mkldnn
include(external/swig) # download, build, install swig include(external/swig) # download, build, install swig
include(external/warpctc) # download, build, install warpctc include(external/warpctc) # download, build, install warpctc
include(external/boost) # download, build, install boost
include(external/any) # download libn::any include(external/any) # download libn::any
include(external/eigen) # download eigen3 include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11 include(external/pybind11) # download pybind11
...@@ -164,7 +162,6 @@ include_directories("${PADDLE_SOURCE_DIR}") ...@@ -164,7 +162,6 @@ include_directories("${PADDLE_SOURCE_DIR}")
include_directories("${PADDLE_SOURCE_DIR}/paddle/cuda/include") include_directories("${PADDLE_SOURCE_DIR}/paddle/cuda/include")
include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto") include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c") include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c")
include_directories(${Boost_INCLUDE_DIRS})
set(EXTERNAL_LIBS set(EXTERNAL_LIBS
${GFLAGS_LIBRARIES} ${GFLAGS_LIBRARIES}
......
...@@ -27,7 +27,7 @@ RUN apt-get update && \ ...@@ -27,7 +27,7 @@ RUN apt-get update && \
curl sed grep graphviz libjpeg-dev zlib1g-dev \ curl sed grep graphviz libjpeg-dev zlib1g-dev \
python-matplotlib gcc-4.8 g++-4.8 \ python-matplotlib gcc-4.8 g++-4.8 \
automake locales clang-format swig doxygen cmake \ automake locales clang-format swig doxygen cmake \
liblapack-dev liblapacke-dev libboost-dev \ liblapack-dev liblapacke-dev \
clang-3.8 llvm-3.8 libclang-3.8-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \
net-tools libtool && \ net-tools libtool && \
apt-get clean -y apt-get clean -y
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Fetch a pinned Boost source tarball as an external project and expose it to
# the rest of the build through a `boost` target and include-directory
# variables. No Boost libraries are compiled here: the configure, build and
# install steps below are all empty.
include(ExternalProject)
# Pinned Boost release; BOOST_TAR is the tarball base name and also the name
# of the directory used as the include root below.
set(BOOST_PROJECT "extern_boost")
set(BOOST_VER "1.41.0")
set(BOOST_TAR "boost_1_41_0")
set(BOOST_URL "http://sourceforge.net/projects/boost/files/boost/${BOOST_VER}/${BOOST_TAR}.tar.gz")
# Directory layout under THIRD_PARTY_PATH (defined outside this file).
set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
# The extracted tarball directory is used directly as the include root; the
# cache entry is overwritten on every configure because of FORCE.
set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
include_directories(${BOOST_INCLUDE_DIR})
# The download step is a custom wget + tar command rather than the URL
# keyword. NOTE(review): wget runs with --no-check-certificate and no
# checksum is verified for the downloaded archive.
ExternalProject_Add(
${BOOST_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
DOWNLOAD_DIR ${BOOST_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate ${BOOST_URL} -c -q -O ${BOOST_TAR}.tar.gz
&& tar zxf ${BOOST_TAR}.tar.gz
DOWNLOAD_NO_PROGRESS 1
PREFIX ${BOOST_SOURCES_DIR}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
UPDATE_COMMAND ""
)
# Define a `boost` target that other targets can depend on. On CMake older
# than 3.3.0 a static library built from a generated one-line C file is used
# instead of an INTERFACE library (presumably because add_dependencies() on
# INTERFACE libraries is not supported there -- confirm against CMake docs).
if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c)
file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
add_library(boost STATIC ${dummyfile})
else()
add_library(boost INTERFACE)
endif()
# Make `boost` wait for the download/extract step and register it in the
# project-wide list of external dependencies.
add_dependencies(boost ${BOOST_PROJECT})
list(APPEND external_project_dependencies boost)
# Also publish the include path under the Boost_INCLUDE_DIR name.
set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR})
...@@ -229,12 +229,18 @@ function(cc_test TARGET_NAME) ...@@ -229,12 +229,18 @@ function(cc_test TARGET_NAME)
if(WITH_TESTING) if(WITH_TESTING)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS) set(multiValueArgs SRCS DEPS ARGS)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cc_test_SRCS}) add_executable(${TARGET_NAME} ${cc_test_SRCS})
target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
endif()
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) add_test(NAME ${TARGET_NAME}
COMMAND ${TARGET_NAME} ${cc_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif() endif()
endfunction(cc_test) endfunction(cc_test)
...@@ -462,7 +468,7 @@ endfunction() ...@@ -462,7 +468,7 @@ endfunction()
function(py_test TARGET_NAME) function(py_test TARGET_NAME)
if(WITH_TESTING) if(WITH_TESTING)
set(options STATIC static SHARED shared) set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS ARGS) set(multiValueArgs SRCS DEPS ARGS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
=========== ===========
DataFeeder data_feeder
=========== ===========
DataFeeder DataFeeder
----------- ----------
.. automodule:: paddle.v2.fluid.data_feeder
:members: DataFeeder .. autoclass:: paddle.v2.fluid.data_feeder.DataFeeder
:members:
:noindex: :noindex:
=========== .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
Evaluator !DO NOT EDIT THIS FILE MANUALLY!
===========
=========
Evaluator evaluator
----------- =========
.. automodule:: paddle.v2.fluid.evaluator
:members: Evaluator Accuracy
--------
.. autoclass:: paddle.v2.fluid.evaluator.Accuracy
:members:
:noindex: :noindex:
ChunkEvaluator
--------------
.. autoclass:: paddle.v2.fluid.evaluator.ChunkEvaluator
:members:
:noindex:
=========== .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
Executor !DO NOT EDIT THIS FILE MANUALLY!
===========
========
executor
========
Executor Executor
--------
.. autoclass:: paddle.v2.fluid.executor.Executor
:members:
:noindex:
global_scope
------------
.. autofunction:: paddle.v2.fluid.executor.global_scope
:noindex:
scope_guard
----------- -----------
.. automodule:: paddle.v2.fluid.executor
:members: Executor .. autofunction:: paddle.v2.fluid.executor.scope_guard
:noindex:
switch_scope
------------
.. autofunction:: paddle.v2.fluid.executor.switch_scope
:noindex: :noindex:
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import argparse
import sys
import types
import paddle.v2.fluid as fluid
def parse_arg():
    """Parse the command line of the documentation generator.

    Returns:
        argparse.Namespace: ``module`` holds the required positional module
        name; ``submodules`` holds the list given after ``--submodules``
        (``None`` when the option is absent).
    """
    cli = argparse.ArgumentParser()
    # Optional list of submodules; zero or more names may follow the flag.
    cli.add_argument('--submodules', nargs="*")
    module_help = 'Generate the documentation of which module'
    cli.add_argument('module', type=str, help=module_help)
    namespace = cli.parse_args()
    return namespace
class DocGenerator(object):
    """Write Sphinx reST API stubs for one ``paddle.v2.fluid`` module.

    The generator emits a "generated file" comment and a title on
    construction, then one ``autoclass`` / ``autofunction`` entry per name
    listed in the ``__all__`` of the module (or of selected submodules).
    """

    def __init__(self, module_name, stream=sys.stdout):
        """Resolve ``fluid.<module_name>`` and write the file preamble.

        Args:
            module_name: attribute name of the module on ``paddle.v2.fluid``.
            stream: writable text stream receiving the generated reST.

        Raises:
            ValueError: if ``fluid`` has no attribute ``module_name``.
        """
        self.stream = stream
        self.module_name = module_name
        if not hasattr(fluid, module_name):
            raise ValueError("Cannot find fluid.{0}".format(module_name))
        else:
            self.module = getattr(fluid, module_name)
        # The second line is indented so that it stays inside the reST
        # comment, and the blank line terminates the comment before the
        # title overline that follows.
        self.stream.write(
            ".. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`\n"
            "    !DO NOT EDIT THIS FILE MANUALLY!\n"
            "\n")
        self._print_header_(module_name, dot='=', is_title=True)

    def print_submodule(self, submodule_name):
        """Write a section for a submodule and an entry per exported name.

        Raises:
            ValueError: if the module has no such submodule.
        """
        # Default of None makes the check below reachable; without it a
        # missing attribute would surface as AttributeError instead.
        submodule = getattr(self.module, submodule_name, None)
        if submodule is None:
            raise ValueError("Cannot find submodule {0}".format(submodule_name))
        self.print_section(submodule_name)
        for item in submodule.__all__:
            self.print_item(item)

    def print_current_module(self):
        """Write an entry for every name in the module's ``__all__``."""
        for item in self.module.__all__:
            self.print_item(item)

    def print_section(self, name):
        """Write a section heading underlined with ``=``."""
        self._print_header_(name, dot='=', is_title=False)

    def print_item(self, name):
        """Write the entry for ``name``, looked up on the top-level module.

        Raises:
            RuntimeError: if the object is neither a class nor a function.
        """
        item = getattr(self.module, name)
        # `type` is the same object as the Python 2-only `types.TypeType`
        # and also exists on Python 3.
        if isinstance(item, type):
            self.print_class(name)
        elif isinstance(item, types.FunctionType):
            self.print_method(name)
        else:
            raise RuntimeError("Unsupported item {0}".format(name))

    def print_class(self, name):
        """Write a heading and an ``autoclass`` directive for ``name``."""
        self._print_header_(name, dot='-', is_title=False)
        # Directive options must be indented under the directive, and the
        # trailing blank line separates it from the next heading.
        self.stream.write((".. autoclass:: paddle.v2.fluid.{0}.{1}\n"
                           "    :members:\n"
                           "    :noindex:\n"
                           "\n").format(self.module_name, name))

    def print_method(self, name):
        """Write a heading and an ``autofunction`` directive for ``name``."""
        self._print_header_(name, dot='-', is_title=False)
        self.stream.write((".. autofunction:: paddle.v2.fluid.{0}.{1}\n"
                           "    :noindex:\n"
                           "\n").format(self.module_name, name))

    def _print_header_(self, name, dot, is_title):
        """Write ``name`` underlined (and overlined when ``is_title``) with
        ``dot`` repeated to the length of the name, then a blank line."""
        dot_line = dot * len(name)
        if is_title:
            self.stream.write(dot_line)
            self.stream.write('\n')
        self.stream.write(name)
        self.stream.write('\n')
        self.stream.write(dot_line)
        self.stream.write('\n')
        self.stream.write('\n')
def main():
    """Generate reST for the module named on the command line.

    Without ``--submodules`` every name exported by the module itself is
    documented; otherwise each listed submodule gets its own section.
    """
    cli_args = parse_arg()
    generator = DocGenerator(cli_args.module)
    requested = cli_args.submodules
    if requested is not None:
        for name in requested:
            generator.print_submodule(name)
        return
    generator.print_current_module()


if __name__ == '__main__':
    main()
#!/bin/bash
# Regenerate the fluid API reST files by running gen_doc.py once per module.
# `layers` is split into per-submodule sections; every other module is
# documented from its own exported names.
python gen_doc.py layers --submodules control_flow device io nn ops tensor > layers.rst

# Each module is listed exactly once (a repeated `io` entry only regenerated
# io.rst a second time).
for module in io data_feeder evaluator executor initializer nets optimizer param_attr profiler regularizer
do
  python gen_doc.py "${module}" > "${module}.rst"
done
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
=========== ===========
Initializer initializer
=========== ===========
Constant
--------
.. autoclass:: paddle.v2.fluid.initializer.Constant
Initializer :members:
-----------
.. automodule:: paddle.v2.fluid.initializer
:members: Initializer
:noindex:
ConstantInitializer
-------------------
.. automodule:: paddle.v2.fluid.initializer
:members: ConstantInitializer
:noindex: :noindex:
Uniform
-------
.. autoclass:: paddle.v2.fluid.initializer.Uniform
UniformInitializer :members:
------------------
.. automodule:: paddle.v2.fluid.initializer
:members: UniformInitializer
:noindex:
NormalInitializer
-----------------
.. automodule:: paddle.v2.fluid.initializer
:members: NormalInitializer
:noindex: :noindex:
Normal
------
XavierInitializer .. autoclass:: paddle.v2.fluid.initializer.Normal
----------------- :members:
.. automodule:: paddle.v2.fluid.initializer
:members: XavierInitializer
:noindex: :noindex:
Xavier
------
MSRAInitializer .. autoclass:: paddle.v2.fluid.initializer.Xavier
--------------- :members:
.. automodule:: paddle.v2.fluid.initializer
:members: MSRAInitializer
:noindex: :noindex:
=========== .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
IO !DO NOT EDIT THIS FILE MANUALLY!
===========
==
io
==
save_vars
---------
is_parameter .. autofunction:: paddle.v2.fluid.io.save_vars
:noindex:
save_params
----------- -----------
.. autofunction:: paddle.v2.fluid.io.is_parameter
.. autofunction:: paddle.v2.fluid.io.save_params
:noindex:
save_persistables
-----------------
.. autofunction:: paddle.v2.fluid.io.save_persistables
:noindex:
load_vars
---------
.. autofunction:: paddle.v2.fluid.io.load_vars
:noindex:
load_params
-----------
.. autofunction:: paddle.v2.fluid.io.load_params
:noindex: :noindex:
load_persistables
-----------------
.. autofunction:: paddle.v2.fluid.io.load_persistables
:noindex:
save_inference_model
--------------------
.. autofunction:: paddle.v2.fluid.io.save_inference_model
:noindex:
load_inference_model
--------------------
.. autofunction:: paddle.v2.fluid.io.load_inference_model
:noindex:
get_inference_program
---------------------
.. autofunction:: paddle.v2.fluid.io.get_inference_program
:noindex:
========== .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
Layers !DO NOT EDIT THIS FILE MANUALLY!
==========
======
layers
======
fc control_flow
--- ============
.. autofunction:: paddle.v2.fluid.layers.fc
split_lod_tensor
----------------
.. autofunction:: paddle.v2.fluid.layers.split_lod_tensor
:noindex: :noindex:
embedding merge_lod_tensor
--------- ----------------
.. autofunction:: paddle.v2.fluid.layers.embedding
.. autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
:noindex: :noindex:
dynamic_lstm BlockGuard
------------ ----------
.. autofunction:: paddle.v2.fluid.layers.dynamic_lstm
.. autoclass:: paddle.v2.fluid.layers.BlockGuard
:members:
:noindex: :noindex:
dynamic_gru BlockGuardWithCompletion
----------- ------------------------
.. autofunction:: paddle.v2.fluid.layers.dynamic_gru
.. autoclass:: paddle.v2.fluid.layers.BlockGuardWithCompletion
:members:
:noindex: :noindex:
data StaticRNNMemoryLink
---- -------------------
.. autofunction:: paddle.v2.fluid.layers.data
.. autoclass:: paddle.v2.fluid.layers.StaticRNNMemoryLink
:members:
:noindex: :noindex:
mean WhileGuard
---- ----------
.. autofunction:: paddle.v2.fluid.layers.mean
.. autoclass:: paddle.v2.fluid.layers.WhileGuard
:members:
:noindex: :noindex:
mul While
--- -----
.. autofunction:: paddle.v2.fluid.layers.mul
.. autoclass:: paddle.v2.fluid.layers.While
:members:
:noindex: :noindex:
elementwise_add lod_rank_table
--------------- --------------
.. autofunction:: paddle.v2.fluid.layers.elementwise_add
.. autofunction:: paddle.v2.fluid.layers.lod_rank_table
:noindex: :noindex:
elementwise_sub max_sequence_len
--------------- ----------------
.. autofunction:: paddle.v2.fluid.layers.elementwise_sub
.. autofunction:: paddle.v2.fluid.layers.max_sequence_len
:noindex: :noindex:
elementwise_mul topk
--------------- ----
.. autofunction:: paddle.v2.fluid.layers.elementwise_mul
.. autofunction:: paddle.v2.fluid.layers.topk
:noindex: :noindex:
elementwise_div lod_tensor_to_array
--------------- -------------------
.. autofunction:: paddle.v2.fluid.layers.elementwise_div
.. autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
:noindex: :noindex:
array_to_lod_tensor
-------------------
dropout .. autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
-------
.. autofunction:: paddle.v2.fluid.layers.dropout
:noindex: :noindex:
increment
---------
reshape .. autofunction:: paddle.v2.fluid.layers.increment
--------
.. autofunction:: paddle.v2.fluid.layers.reshape
:noindex: :noindex:
array_write
-----------
sigmoid .. autofunction:: paddle.v2.fluid.layers.array_write
---------
.. autofunction:: paddle.v2.fluid.layers.sigmoid
:noindex: :noindex:
create_array
------------
scale .. autofunction:: paddle.v2.fluid.layers.create_array
:noindex:
less_than
--------- ---------
.. autofunction:: paddle.v2.fluid.layers.scale
.. autofunction:: paddle.v2.fluid.layers.less_than
:noindex: :noindex:
array_read
----------
transpose .. autofunction:: paddle.v2.fluid.layers.array_read
:noindex:
shrink_memory
-------------
.. autofunction:: paddle.v2.fluid.layers.shrink_memory
:noindex:
array_length
------------
.. autofunction:: paddle.v2.fluid.layers.array_length
:noindex:
IfElse
------
.. autoclass:: paddle.v2.fluid.layers.IfElse
:members:
:noindex:
DynamicRNN
----------
.. autoclass:: paddle.v2.fluid.layers.DynamicRNN
:members:
:noindex:
ConditionalBlock
----------------
.. autoclass:: paddle.v2.fluid.layers.ConditionalBlock
:members:
:noindex:
StaticRNN
--------- ---------
.. autofunction:: paddle.v2.fluid.layers.transpose
.. autoclass:: paddle.v2.fluid.layers.StaticRNN
:members:
:noindex: :noindex:
reorder_lod_tensor_by_rank
--------------------------
sigmoid_cross_entropy_with_logits .. autofunction:: paddle.v2.fluid.layers.reorder_lod_tensor_by_rank
---------------------------------
.. autofunction:: paddle.v2.fluid.layers.esigmoid_cross_entropy_with_logits
:noindex: :noindex:
ParallelDo
----------
cast .. autoclass:: paddle.v2.fluid.layers.ParallelDo
:members:
:noindex:
Print
-----
.. autofunction:: paddle.v2.fluid.layers.Print
:noindex:
device
======
get_places
----------
.. autofunction:: paddle.v2.fluid.layers.get_places
:noindex:
io
==
data
---- ----
.. autofunction:: paddle.v2.fluid.layers.cast
.. autofunction:: paddle.v2.fluid.layers.data
:noindex: :noindex:
BlockGuardServ
--------------
concat .. autoclass:: paddle.v2.fluid.layers.BlockGuardServ
------- :members:
.. autofunction:: paddle.v2.fluid.layers.concat
:noindex: :noindex:
ListenAndServ
-------------
sums .. autoclass:: paddle.v2.fluid.layers.ListenAndServ
:members:
:noindex:
Send
---- ----
.. autofunction:: paddle.v2.fluid.layers.sums
.. autofunction:: paddle.v2.fluid.layers.Send
:noindex: :noindex:
nn
==
linear_chain_crf fc
---------------- --
.. autofunction:: paddle.v2.fluid.layers.linear_chain_crf
.. autofunction:: paddle.v2.fluid.layers.fc
:noindex: :noindex:
embedding
---------
assign
-------
.. autofunction:: paddle.v2.fluid.layers.embedding .. autofunction:: paddle.v2.fluid.layers.embedding
:noindex: :noindex:
dynamic_lstm
------------
split_lod_tensor .. autofunction:: paddle.v2.fluid.layers.dynamic_lstm
----------------
.. autofunction:: paddle.v2.fluid.layers.split_lod_tensor
:noindex: :noindex:
dynamic_lstmp
-------------
merge_lod_tensor .. autofunction:: paddle.v2.fluid.layers.dynamic_lstmp
:noindex:
dynamic_gru
-----------
.. autofunction:: paddle.v2.fluid.layers.dynamic_gru
:noindex:
gru_unit
--------
.. autofunction:: paddle.v2.fluid.layers.gru_unit
:noindex:
linear_chain_crf
---------------- ----------------
.. autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
.. autofunction:: paddle.v2.fluid.layers.linear_chain_crf
:noindex:
crf_decoding
------------
.. autofunction:: paddle.v2.fluid.layers.crf_decoding
:noindex: :noindex:
cos_sim cos_sim
-------- -------
.. autofunction:: paddle.v2.fluid.layers.cos_sim .. autofunction:: paddle.v2.fluid.layers.cos_sim
:noindex: :noindex:
cross_entropy cross_entropy
------------- -------------
.. autofunction:: paddle.v2.fluid.layers.cross_entropy .. autofunction:: paddle.v2.fluid.layers.cross_entropy
:noindex: :noindex:
square_error_cost square_error_cost
----------------- -----------------
.. autofunction:: paddle.v2.fluid.layers.square_error_cost .. autofunction:: paddle.v2.fluid.layers.square_error_cost
:noindex: :noindex:
accuracy accuracy
--------- --------
.. autofunction:: paddle.v2.fluid.layers.accuracy .. autofunction:: paddle.v2.fluid.layers.accuracy
:noindex: :noindex:
chunk_eval
----------
.. autofunction:: paddle.v2.fluid.layers.chunk_eval
:noindex:
sequence_conv sequence_conv
------------- -------------
.. autofunction:: paddle.v2.fluid.layers.sequence_conv .. autofunction:: paddle.v2.fluid.layers.sequence_conv
:noindex: :noindex:
conv2d conv2d
------ ------
.. autofunction:: paddle.v2.fluid.layers.conv2d .. autofunction:: paddle.v2.fluid.layers.conv2d
:noindex: :noindex:
sequence_pool sequence_pool
------------- -------------
.. autofunction:: paddle.v2.fluid.layers.sequence_pool .. autofunction:: paddle.v2.fluid.layers.sequence_pool
:noindex: :noindex:
pool2d
------
sequence_first_step .. autofunction:: paddle.v2.fluid.layers.pool2d
-------------------
.. autofunction:: paddle.v2.fluid.layers.sequence_first_step
:noindex: :noindex:
batch_norm
----------
.. autofunction:: paddle.v2.fluid.layers.batch_norm
:noindex:
sequence_last_step beam_search_decode
------------------ ------------------
.. autofunction:: paddle.v2.fluid.layers.sequence_last_step
.. autofunction:: paddle.v2.fluid.layers.beam_search_decode
:noindex:
conv2d_transpose
----------------
.. autofunction:: paddle.v2.fluid.layers.conv2d_transpose
:noindex: :noindex:
sequence_expand
---------------
pool2d .. autofunction:: paddle.v2.fluid.layers.sequence_expand
------
.. autofunction:: paddle.v2.fluid.layers.pool2d
:noindex: :noindex:
lstm_unit
---------
batch_norm .. autofunction:: paddle.v2.fluid.layers.lstm_unit
:noindex:
reduce_sum
---------- ----------
.. autofunction:: paddle.v2.fluid.layers.batch_norm
.. autofunction:: paddle.v2.fluid.layers.reduce_sum
:noindex: :noindex:
reduce_mean
-----------
beam_search_decode .. autofunction:: paddle.v2.fluid.layers.reduce_mean
:noindex:
reduce_max
----------
.. autofunction:: paddle.v2.fluid.layers.reduce_max
:noindex:
reduce_min
----------
.. autofunction:: paddle.v2.fluid.layers.reduce_min
:noindex:
sequence_first_step
-------------------
.. autofunction:: paddle.v2.fluid.layers.sequence_first_step
:noindex:
sequence_last_step
------------------ ------------------
.. autofunction:: paddle.v2.fluid.layers.beam_search_decode
.. autofunction:: paddle.v2.fluid.layers.sequence_last_step
:noindex:
dropout
-------
.. autofunction:: paddle.v2.fluid.layers.dropout
:noindex: :noindex:
split
-----
lod_rank_table .. autofunction:: paddle.v2.fluid.layers.split
--------------
.. autofunction:: paddle.v2.fluid.layers.lod_rank_table
:noindex: :noindex:
ctc_greedy_decoder
------------------
max_sequence_len .. autofunction:: paddle.v2.fluid.layers.ctc_greedy_decoder
----------------
.. autofunction:: paddle.v2.fluid.layers.max_sequence_len
:noindex: :noindex:
edit_distance
-------------
topk .. autofunction:: paddle.v2.fluid.layers.edit_distance
-----
.. autofunction:: paddle.v2.fluid.layers.topk
:noindex: :noindex:
l2_normalize
------------
lod_tensor_to_array .. autofunction:: paddle.v2.fluid.layers.l2_normalize
-------------------
.. autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
:noindex: :noindex:
matmul
------
.. autofunction:: paddle.v2.fluid.layers.matmul
array_to_lod_tensor
-------------------
.. autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
:noindex: :noindex:
warpctc
-------
.. autofunction:: paddle.v2.fluid.layers.warpctc
:noindex:
sequence_reshape
----------------
fill_constant .. autofunction:: paddle.v2.fluid.layers.sequence_reshape
-------------
.. autofunction:: paddle.v2.fluid.layers.fill_constant
:noindex: :noindex:
transpose
---------
.. autofunction:: paddle.v2.fluid.layers.transpose
:noindex:
fill_constant_batch_size_like im2sequence
----------------------------- -----------
.. autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
.. autofunction:: paddle.v2.fluid.layers.im2sequence
:noindex: :noindex:
nce
---
ones .. autofunction:: paddle.v2.fluid.layers.nce
----
.. autofunction:: paddle.v2.fluid.layers.ones
:noindex: :noindex:
beam_search
-----------
zeros .. autofunction:: paddle.v2.fluid.layers.beam_search
-----
.. autofunction:: paddle.v2.fluid.layers.zeros
:noindex: :noindex:
row_conv
--------
increment .. autofunction:: paddle.v2.fluid.layers.row_conv
---------
.. autofunction:: paddle.v2.fluid.layers.increment
:noindex: :noindex:
multiplex
---------
array_write .. autofunction:: paddle.v2.fluid.layers.multiplex
-----------
.. autofunction:: paddle.v2.fluid.layers.array_write
:noindex: :noindex:
ops
===
mean
----
create_array .. autofunction:: paddle.v2.fluid.layers.mean
------------
.. autofunction:: paddle.v2.fluid.layers.create_array
:noindex: :noindex:
mul
---
less_than .. autofunction:: paddle.v2.fluid.layers.mul
---------
.. autofunction:: paddle.v2.fluid.layers.less_than
:noindex: :noindex:
reshape
-------
array_read .. autofunction:: paddle.v2.fluid.layers.reshape
----------
.. autofunction:: paddle.v2.fluid.layers.array_read
:noindex: :noindex:
scale
-----
shrink_memory .. autofunction:: paddle.v2.fluid.layers.scale
--------------
.. autofunction:: paddle.v2.fluid.layers.shrink_memory
:noindex: :noindex:
sigmoid_cross_entropy_with_logits
---------------------------------
array_length .. autofunction:: paddle.v2.fluid.layers.sigmoid_cross_entropy_with_logits
-------------
.. autofunction:: paddle.v2.fluid.layers.array_length
:noindex: :noindex:
elementwise_add
---------------
conv2d_transpose .. autofunction:: paddle.v2.fluid.layers.elementwise_add
----------------
.. autofunction:: paddle.v2.fluid.layers.conv2d_transpose
:noindex: :noindex:
elementwise_div
sequence_expand
--------------- ---------------
.. autofunction:: paddle.v2.fluid.layers.sequence_expand
.. autofunction:: paddle.v2.fluid.layers.elementwise_div
:noindex: :noindex:
elementwise_sub
---------------
gru_unit .. autofunction:: paddle.v2.fluid.layers.elementwise_sub
--------
.. autofunction:: paddle.v2.fluid.layers.gru_unit
:noindex: :noindex:
elementwise_mul
---------------
lstm_unit .. autofunction:: paddle.v2.fluid.layers.elementwise_mul
---------
.. autofunction:: paddle.v2.fluid.layers.lstm_unit
:noindex: :noindex:
elementwise_max
---------------
sequence_softmax .. autofunction:: paddle.v2.fluid.layers.elementwise_max
----------------
.. autofunction:: paddle.v2.fluid.layers.sequence_softmax
:noindex: :noindex:
elementwise_min
---------------
reduce_sum .. autofunction:: paddle.v2.fluid.layers.elementwise_min
----------
.. autofunction:: paddle.v2.fluid.layers.reduce_sum
:noindex: :noindex:
elementwise_pow
---------------
reduce_mean .. autofunction:: paddle.v2.fluid.layers.elementwise_pow
-----------
.. autofunction:: paddle.v2.fluid.layers.reduce_mean
:noindex: :noindex:
clip
----
reduce_max .. autofunction:: paddle.v2.fluid.layers.clip
----------
.. autofunction:: paddle.v2.fluid.layers.reduce_max
:noindex: :noindex:
clip_by_norm
------------
reduce_min .. autofunction:: paddle.v2.fluid.layers.clip_by_norm
----------
.. autofunction:: paddle.v2.fluid.layers.reduce_min
:noindex: :noindex:
sequence_softmax
----------------
split .. autofunction:: paddle.v2.fluid.layers.sequence_softmax
-----
.. autofunction:: paddle.v2.fluid.layers.split
:noindex: :noindex:
sigmoid
-------
matmul .. autofunction:: paddle.v2.fluid.layers.sigmoid
------
.. autofunction:: paddle.v2.fluid.layers.matmul
:noindex: :noindex:
logsigmoid logsigmoid
---------- ----------
.. autofunction:: paddle.v2.fluid.layers.logsigmoid .. autofunction:: paddle.v2.fluid.layers.logsigmoid
:noindex: :noindex:
exp exp
--- ---
.. autofunction:: paddle.v2.fluid.layers.exp .. autofunction:: paddle.v2.fluid.layers.exp
:noindex: :noindex:
relu relu
---- ----
.. autofunction:: paddle.v2.fluid.layers.relu .. autofunction:: paddle.v2.fluid.layers.relu
:noindex: :noindex:
tanh tanh
---- ----
.. autofunction:: paddle.v2.fluid.layers.tanh .. autofunction:: paddle.v2.fluid.layers.tanh
:noindex: :noindex:
tanh_shrink tanh_shrink
----------- -----------
.. autofunction:: paddle.v2.fluid.layers.tanh_shrink .. autofunction:: paddle.v2.fluid.layers.tanh_shrink
:noindex: :noindex:
softshrink softshrink
---------- ----------
.. autofunction:: paddle.v2.fluid.layers.softshrink .. autofunction:: paddle.v2.fluid.layers.softshrink
:noindex: :noindex:
sqrt sqrt
---- ----
.. autofunction:: paddle.v2.fluid.layers.sqrt .. autofunction:: paddle.v2.fluid.layers.sqrt
:noindex: :noindex:
abs abs
---- ---
.. autofunction:: paddle.v2.fluid.layers.abs .. autofunction:: paddle.v2.fluid.layers.abs
:noindex: :noindex:
ceil ceil
---- ----
.. autofunction:: paddle.v2.fluid.layers.ceil .. autofunction:: paddle.v2.fluid.layers.ceil
:noindex: :noindex:
floor floor
----- -----
.. autofunction:: paddle.v2.fluid.layers.floor .. autofunction:: paddle.v2.fluid.layers.floor
:noindex: :noindex:
round round
----- -----
.. autofunction:: paddle.v2.fluid.layers.round .. autofunction:: paddle.v2.fluid.layers.round
:noindex: :noindex:
reciprocal reciprocal
---------- ----------
.. autofunction:: paddle.v2.fluid.layers.reciprocal .. autofunction:: paddle.v2.fluid.layers.reciprocal
:noindex: :noindex:
log log
--- ---
.. autofunction:: paddle.v2.fluid.layers.log .. autofunction:: paddle.v2.fluid.layers.log
:noindex: :noindex:
square square
------ ------
.. autofunction:: paddle.v2.fluid.layers.square .. autofunction:: paddle.v2.fluid.layers.square
:noindex: :noindex:
softplus softplus
-------- --------
.. autofunction:: paddle.v2.fluid.layers.softplus .. autofunction:: paddle.v2.fluid.layers.softplus
:noindex: :noindex:
softsign softsign
--------- --------
.. autofunction:: paddle.v2.fluid.layers.softsign .. autofunction:: paddle.v2.fluid.layers.softsign
:noindex: :noindex:
brelu brelu
----- -----
.. autofunction:: paddle.v2.fluid.layers.brelu .. autofunction:: paddle.v2.fluid.layers.brelu
:noindex: :noindex:
leaky_relu leaky_relu
---------- ----------
.. autofunction:: paddle.v2.fluid.layers.leaky_relu .. autofunction:: paddle.v2.fluid.layers.leaky_relu
:noindex: :noindex:
soft_relu soft_relu
--------- ---------
.. autofunction:: paddle.v2.fluid.layers.soft_relu .. autofunction:: paddle.v2.fluid.layers.soft_relu
:noindex: :noindex:
elu elu
---- ---
.. autofunction:: paddle.v2.fluid.layers.elu .. autofunction:: paddle.v2.fluid.layers.elu
:noindex: :noindex:
relu6 relu6
----- -----
.. autofunction:: paddle.v2.fluid.layers.relu6 .. autofunction:: paddle.v2.fluid.layers.relu6
:noindex: :noindex:
pow pow
---- ---
.. autofunction:: paddle.v2.fluid.layers.pow .. autofunction:: paddle.v2.fluid.layers.pow
:noindex: :noindex:
stanh
-----
.. autofunction:: paddle.v2.fluid.layers.stanh
:noindex:
hard_shrink hard_shrink
----------- -----------
.. autofunction:: paddle.v2.fluid.layers.hard_shrink .. autofunction:: paddle.v2.fluid.layers.hard_shrink
:noindex: :noindex:
thresholded_relu thresholded_relu
---------------- ----------------
.. autofunction:: paddle.v2.fluid.layers.thresholded_relu .. autofunction:: paddle.v2.fluid.layers.thresholded_relu
:noindex: :noindex:
hard_sigmoid hard_sigmoid
------------- ------------
.. autofunction:: paddle.v2.fluid.layers.hard_sigmoid .. autofunction:: paddle.v2.fluid.layers.hard_sigmoid
:noindex: :noindex:
swish swish
------ -----
.. autofunction:: paddle.v2.fluid.layers.swish .. autofunction:: paddle.v2.fluid.layers.swish
:noindex: :noindex:
im2sequence tensor
======
create_tensor
-------------
.. autofunction:: paddle.v2.fluid.layers.create_tensor
:noindex:
create_parameter
----------------
.. autofunction:: paddle.v2.fluid.layers.create_parameter
:noindex:
create_global_var
-----------------
.. autofunction:: paddle.v2.fluid.layers.create_global_var
:noindex:
cast
----
.. autofunction:: paddle.v2.fluid.layers.cast
:noindex:
concat
------ ------
.. autofunction:: paddle.v2.fluid.layers.im2sequence
.. autofunction:: paddle.v2.fluid.layers.concat
:noindex: :noindex:
edit_distance sums
--------------- ----
.. autofunction:: paddle.v2.fluid.layers.edit_distance_error
.. autofunction:: paddle.v2.fluid.layers.sums
:noindex: :noindex:
ctc_greedy_decoder assign
--------------- ------
.. autofunction:: paddle.v2.fluid.layers.ctc_greedy_decoder
.. autofunction:: paddle.v2.fluid.layers.assign
:noindex: :noindex:
l2_normalize fill_constant_batch_size_like
------------ -----------------------------
.. autofunction:: paddle.v2.fluid.layers.l2_normalize
.. autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
:noindex: :noindex:
sequence_reshape fill_constant
---------------- -------------
.. autofunction:: paddle.v2.fluid.layers.sequence_reshape
.. autofunction:: paddle.v2.fluid.layers.fill_constant
:noindex:
ones
----
.. autofunction:: paddle.v2.fluid.layers.ones
:noindex:
zeros
-----
.. autofunction:: paddle.v2.fluid.layers.zeros
:noindex: :noindex:
=========== .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
Nets !DO NOT EDIT THIS FILE MANUALLY!
===========
====
nets
====
simple_img_conv_pool simple_img_conv_pool
-------------------- --------------------
.. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
:noindex:
.. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
img_conv_group
---------------
.. autofunction:: paddle.v2.fluid.nets.img_conv_group
:noindex: :noindex:
sequence_conv_pool sequence_conv_pool
------------------ ------------------
.. autofunction:: paddle.v2.fluid.nets.sequence_conv_pool .. autofunction:: paddle.v2.fluid.nets.sequence_conv_pool
:noindex: :noindex:
glu glu
--- ---
.. autofunction:: paddle.v2.fluid.nets.glu .. autofunction:: paddle.v2.fluid.nets.glu
:noindex: :noindex:
scaled_dot_product_attention
----------------------------
dot_product_attention .. autofunction:: paddle.v2.fluid.nets.scaled_dot_product_attention
---------------------
.. autofunction:: paddle.v2.fluid.nets.dot_product_attention
:noindex: :noindex:
=========== .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
Optimizer !DO NOT EDIT THIS FILE MANUALLY!
===========
Optimizer
-----------
.. automodule:: paddle.v2.fluid.optimizer
:members: Optimizer
:noindex:
=========
optimizer
=========
SGDOptimizer SGD
----------- ---
.. automodule:: paddle.v2.fluid.optimizer
:members: SGDOptimizer
:noindex:
.. autoclass:: paddle.v2.fluid.optimizer.SGD
:members:
:noindex:
Momentum
--------
MomentumOptimizer .. autoclass:: paddle.v2.fluid.optimizer.Momentum
----------------- :members:
.. automodule:: paddle.v2.fluid.optimizer
:members: MomentumOptimizer
:noindex: :noindex:
Adagrad
-------
.. autoclass:: paddle.v2.fluid.optimizer.Adagrad
AdagradOptimizer :members:
----------------
.. automodule:: paddle.v2.fluid.optimizer
:members: AdagradOptimizer
:noindex: :noindex:
Adam
----
AdamOptimizer .. autoclass:: paddle.v2.fluid.optimizer.Adam
------------- :members:
.. automodule:: paddle.v2.fluid.optimizer
:members: AdamOptimizer
:noindex: :noindex:
Adamax
------
AdamaxOptimizer .. autoclass:: paddle.v2.fluid.optimizer.Adamax
----------- :members:
.. automodule:: paddle.v2.fluid.optimizer
:members: AdamaxOptimizer
:noindex: :noindex:
DecayedAdagrad
--------------
DecayedAdagradOptimizer .. autoclass:: paddle.v2.fluid.optimizer.DecayedAdagrad
----------------------- :members:
.. automodule:: paddle.v2.fluid.optimizer
:members: DecayedAdagradOptimizer
:noindex: :noindex:
=========== .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
==========
param_attr
==========
ParamAttr ParamAttr
=========== ---------
.. autoclass:: paddle.v2.fluid.param_attr.ParamAttr
:members:
:noindex:
WeightNormParamAttr
-------------------
ParamAttr .. autoclass:: paddle.v2.fluid.param_attr.WeightNormParamAttr
----------- :members:
.. automodule:: paddle.v2.fluid.param_attr
:members: ParamAttr
:noindex: :noindex:
=========== .. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
Profiler !DO NOT EDIT THIS FILE MANUALLY!
===========
========
profiler
========
cuda_profiler
-------------
Profiler
-----------
.. autofunction:: paddle.v2.fluid.profiler.cuda_profiler .. autofunction:: paddle.v2.fluid.profiler.cuda_profiler
:noindex: :noindex:
reset_profiler
--------------
.. autofunction:: paddle.v2.fluid.profiler.reset_profiler
:noindex:
profiler
--------
.. autofunction:: paddle.v2.fluid.profiler.profiler
:noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
=========== ===========
Regularizer regularizer
=========== ===========
WeightDecayRegularizer append_regularization_ops
---------------------- -------------------------
.. automodule:: paddle.v2.fluid.regularizer
:members: WeightDecayRegularizer
:noindex:
L2DecayRegularizer .. autofunction:: paddle.v2.fluid.regularizer.append_regularization_ops
------------------
.. automodule:: paddle.v2.fluid.regularizer
:members: L2DecayRegularizer
:noindex: :noindex:
L1Decay
-------
.. autoclass:: paddle.v2.fluid.regularizer.L1Decay
:members:
:noindex:
L1DecayRegularizer L2Decay
------------------- -------
.. automodule:: paddle.v2.fluid.regularizer
:members: L1DecayRegularizer
.. autoclass:: paddle.v2.fluid.regularizer.L2Decay
:members:
:noindex:
# Design Doc: CSP in PaddlePaddle Fluid
## Motivation
Concurrent programming is important for deep learning. A few example applications are:
1. The main thread keeps reading the next mini-batch while another thread uses the GPU for computing.
2. The main thread performs the computation while another thread uploads the local gradients from each trainer to the parameter server.
Most DL systems, including TensorFlow, Caffe2, and MxNet, can asynchronously execute operators in a graph. However, Fluid doesn't have the concept of a graph at all, as the design goal of Fluid is that of a programming language.
## Concurrent Programming Models
There are many concurrent programming models, implemented in various forms:
| concurrent programming model | implementation |
|-----|-----|
| mutex | types and functions in standard libraries |
| semaphore | types and functions in standard libraries |
| communicating sequential processes (CSP) | Go programming language |
| actor model | Erlang programming language |
| message passing | MPI |
| bulk synchronous parallel (BSP) | Pregel distributed programming framework |
Since Fluid was designed to be a programming language, we would like to implement CSP in Fluid.
### CSP vs. Actor Model
A well-known implementation of Actor Model is the Erlang programming language. In Actor Model, *processes* could send messages to another process and receive messages from another process given the process IDs. We can find the three ingredients, process with ID, send, and recv, in MPI too. Indeed, we can rewrite Erlang programs in Python + MPI with possibly fewer lines of code. Our concern with Actor Model is that it doesn't seem reasonable to implement process management in a programming language's runtime library; instead, it should be the operating systems' responsibility to manage processes and libraries like MPI for send/recv.
## CSP in Fluid
Fluid has two fundamental control-flows: *if-else* and *while*. If we are to implement CSP, we need the following:
1. a new data type: *channel* and operators *send* and *recv*,
1. *goroutine* or thread, and
1. a new control-flow: select.
We also need Python wrappers for the above components.
The type *channel* is conceptually a blocking queue. In Go, it is implemented as a [blocking circular queue](https://github.com/golang/go/blob/68ce117cf17b8debf5754bfd476345779b5b6616/src/runtime/chan.go#L31-L50), which supports send and recv.
The `select` operation has been in OS kernels long before Go language. All Unix kernels implement system calls *poll* and *select*. They monitor multiple file descriptors to see if I/O is possible on any of them. This takes O(N) time. Since Linux 2.6, a new system call, *epoll*, can do the same in O(1) time. In BSD systems, there is a similar system call *kqueue*. Go's Linux implementation uses epoll.
It might be a good idea to implement Fluid's select using epoll too. In this design doc, we start from the O(N) way, so we could focus on Python binding and the syntax.
### Type Channel
Fluid supports many data types:
1. Tensor,
1. Row-sparse Tensor
1. LoD Tensor,
1. Tensor array, etc
Each data type is registered in the [`framework.proto`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L117-L127) as an enum value. To add a new type channel, we need to add a new type enum.
To expose a C++ type to Python, we need to edit the [`pybind.cc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc) file. [Here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc#L120-L164) is an example of how we expose the C++ class LoDTensor.
## Syntax Design
### Create Channel
In Go, we create a channel by specifying the element type and buffer size:
```go
ch := make(chan int) // a channel without buffer
ch1 := make(chan int, 100) // a channel that can buffer 100 ints.
```
In Fluid, we should be able to do the same:
```python
ch = fluid.make_chan(dtype=INT)
ch1 = fluid.make_chan(dtype=INT, buffer_size=100)
```
In addition to that, we want channels that can hold more complex element types, e.g., Tensors of float16:
```python
ch = fluid.make_chan(dtype=Tensor, etype=float16)
```
or Tensors of Tensors of float16 etc.
The point here is that we need a consistent way to compose types, like in C++ we can have `Tensor<Tensor<...<float16>...> >`.
### Send and Recv
### Select
## Example Programs
### 1. RPC between Trainers and Parameter Servers
### 2. Concurrent Minibatch Loading
...@@ -152,12 +152,12 @@ for data in train_reader(): ...@@ -152,12 +152,12 @@ for data in train_reader():
`JobDesc` object describe the distributed job resource specification to run on `JobDesc` object describe the distributed job resource specification to run on
Cluster environment. Cluster environment.
<img src="src/remote_executor.png"/> <img src="src/remote_executor.png" width="500" align="center" />
`RemoteExecutor.run` sends the `ProgramDesc` and `RemoteExecutor.run` sends the `ProgramDesc` and
[TrainingJob](https://github.com/PaddlePaddle/cloud/blob/develop/doc/autoscale/README.md#training-job-resource) [TrainingJob](https://github.com/PaddlePaddle/cloud/blob/develop/doc/autoscale/README.md#training-job-resource)
to a server in the cluster which executes `RemoteExecutor.listen`. This server is responsible to a server in the cluster which executes `RemoteExecutor.listen`. This server is responsible
to start the final Kubernetes Jobs to run the different role of `ProgramDesc`. to start the final Kubernetes Jobs to run the different role of `ProgramDesc` from `ConfigMap`.
### Placement Algorithm ### Placement Algorithm
......
...@@ -9,16 +9,16 @@ different purposes. ...@@ -9,16 +9,16 @@ different purposes.
## Background ## Background
The previous implementations of the parameter server does not run a The previous implementations of the parameter server do not run a
fluid sub-program. Parameter initialization, optimizer computation, network fluid sub-program. Parameter initialization, optimizer computation, network
communication and checkpointing are implemented twice on both the communication and checkpointing are implemented twice on both the
trainer and the parameter server. trainer as well as the parameter server.
It would be great if we can write code once and use them on both the It would be great if we can write code once and use them on both: the
trainer and the parameter server: reduces code duplication and trainer and the parameter server, since this reduces code duplication and
improves extensibility. Given that after the current refactor, we are improves extensibility. Given that after the current refactoring, we are
representing everything as a computing graph on the representing everything as a computation graph on the
trainer. Representing everything as a computing graph on the parameter trainer. Representing everything as a computation graph on the parameter
server becomes a natural extension. server becomes a natural extension.
## Design ## Design
...@@ -30,9 +30,9 @@ into sub-programs to be scheduled on different nodes with the following ...@@ -30,9 +30,9 @@ into sub-programs to be scheduled on different nodes with the following
steps: steps:
1. OP placement: the OPs will be placed on different nodes according 1. OP placement: the OPs will be placed on different nodes according
to heuristic that minimizes estimated total computation to a heuristic that minimizes the estimated total computation
time. Currently we will use a simple heuristic that puts parameter time. Currently we will use a simple heuristic that puts parameter
varable on parameter server workers and everything else on trainer variable on parameter server workers and everything else on trainer
workers. workers.
1. Add communication OPs to enable the communication between nodes. 1. Add communication OPs to enable the communication between nodes.
...@@ -47,22 +47,22 @@ After converting: ...@@ -47,22 +47,22 @@ After converting:
<img src="src/dist-graph.png" width="700"/> <img src="src/dist-graph.png" width="700"/>
1. The parameter variable W and it's optimizer program are placed on the parameter server. 1. The parameter variable W and its optimizer program are placed on the parameter server.
1. Operators are added to the program. 1. Operators are added to the program.
- *Send* sends data to the connected *Recv* operator. The - *Send* sends data to the connected *Recv* operator. The
scheduler on the receive node will only schedule *Recv* operator scheduler on the receive node will only schedule *Recv* operator
to run when the *Send* operator has run (the *Send* OP will mark to run when the *Send* operator has run (the *Send* OP will mark
the *Recv* OP runnable automatically). the *Recv* OP runnable automatically).
- *Enueue* enqueues the input variable, it can block until space - *Enqueue* enqueues the input variable, it can block until space
becomes available in the queue. becomes available in the queue.
- *Dequeue* outputs configurable numbers of tensors from the - *Dequeue* outputs configurable numbers of tensors from the
queue. It will block until the queue have the required number of queue. It will block until the queue has the required number of
tensors. tensors.
### Benefits ### Benefits
- Model parallelism become easier to implement: it's an extension to - Model parallelism becomes easier to implement: it is an extension to
the trainer - parameter server approach. We can have several "Transpilers" the trainer - parameter server approach. We can have several "Transpilers"
to achieve different goals. to achieve different goals.
- User-defined optimizer is easier to add - user can now express it as - User-defined optimizer is easier to add - user can now express it as
...@@ -72,22 +72,22 @@ After converting: ...@@ -72,22 +72,22 @@ After converting:
### Challenges ### Challenges
- It's important to balance the parameter shards of on multiple - It is important to balance the parameter shards on multiple
parameter server. If a single parameter is very big (some parameter servers. If a single parameter is very big (for example: some
word-embedding, fully connected, softmax layer), we need to word-embedding, fully connected, softmax layer), we need to
automatically partition the single parameter onto different automatically partition the single parameter onto different
parameter servers when possible (only element-wise optimizer depends parameter servers when possible (only element-wise optimizer depends
on the parameter variable). on the parameter variable).
- In the "Aync SGD" figure, the "W" variable on the parameter server - In the "Async SGD" figure, the "W" variable on the parameter server
could be read and wrote concurrently. See could be read and written concurrently. See
[here](https://github.com/PaddlePaddle/Paddle/pull/6394) for more [here](https://github.com/PaddlePaddle/Paddle/pull/6394) for more
details about concurrent program in fluid. details about concurrent program in Fluid.
### Discussion ### Discussion
- Can the Enqueue OP be implemented under our current tensor design - Can the Enqueue OP be implemented under our current tensor design
(puts the input tensor into the queue tensor)? (put the input tensor into the queue tensor)?
- *Dequeue* OP will have variable numbers of output (depends on the - *Dequeue* OP will have variable numbers of output (depending on the
`min_count` attribute), does our current design support it? (similar `min_count` attribute), does our current design support it? (similar
question for the *Add* OP) question for the *Add* OP)
......
...@@ -22,7 +22,7 @@ The current `LoDTensor` is designed to store levels of variable-length sequences ...@@ -22,7 +22,7 @@ The current `LoDTensor` is designed to store levels of variable-length sequences
The integers in each level represent the begin and end (not inclusive) offset of a sequence **in the underlying tensor**, The integers in each level represent the begin and end (not inclusive) offset of a sequence **in the underlying tensor**,
let's call this format the **absolute-offset LoD** for clarity. let's call this format the **absolute-offset LoD** for clarity.
The relative-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows The absolute-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows
```python ```python
[[0, 3, 9] [[0, 3, 9]
[0, 2, 3, 3, 3, 9]] [0, 2, 3, 3, 3, 9]]
...@@ -119,7 +119,7 @@ def generate(): ...@@ -119,7 +119,7 @@ def generate():
encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word) encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word)
decoder_input = pd.fc( decoder_input = pd.fc(
act=pd.activation.Linear(), act=pd.activation.Linear(),
input=[target_word, encoder_ctx], input=[target_word, encoder_ctx_expanded],
size=3 * decoder_dim) size=3 * decoder_dim)
gru_out, cur_mem = pd.gru_step( gru_out, cur_mem = pd.gru_step(
decoder_input, mem=decoder_mem, size=decoder_dim) decoder_input, mem=decoder_mem, size=decoder_dim)
......
...@@ -140,7 +140,19 @@ TODO by Assignees ...@@ -140,7 +140,19 @@ TODO by Assignees
### Beam Search with CTC and LM ### Beam Search with CTC and LM
TODO by Assignees <div align="center">
<img src="image/beam_search.png" width=600><br/>
Figure 2. Algorithm for CTC Beam Search Decoder.
</div>
- The **Beam Search Decoder** for the DS2 CTC-trained network follows an approach similar to that in \[[3](#references)\] as shown in Figure 2, with two important modifications for the ambiguous parts:
- 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation, because one prefix may come from different paths;
- 2) the if condition ```if l^+ not in A_prev then``` after probabilities' computation is deprecated for it is hard to understand and seems unnecessary.
- An **external scorer** would be passed into the decoder to evaluate a candidate prefix during decoding whenever a white space is appended in English decoding or any character is appended in Mandarin decoding.
- Such external scorer consists of language model, word count or any other custom scorers.
- The **language model** is built from Task 5, and its parameters should be carefully tuned to achieve minimum WER/CER (cf. Task 7)
- This decoder needs to perform with **high efficiency** for the convenience of parameter tuning and speech recognition in reality.
## Future Work ## Future Work
...@@ -153,3 +165,4 @@ TODO by Assignees ...@@ -153,3 +165,4 @@ TODO by Assignees
1. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](http://proceedings.mlr.press/v48/amodei16.pdf). ICML 2016. 1. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](http://proceedings.mlr.press/v48/amodei16.pdf). ICML 2016.
2. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](https://arxiv.org/abs/1512.02595). arXiv:1512.02595. 2. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](https://arxiv.org/abs/1512.02595). arXiv:1512.02595.
3. Awni Y. Hannun, etc. [First-Pass Large Vocabulary Continuous Speech Recognition using Bi-Directional Recurrent DNNs](https://arxiv.org/abs/1408.2873). arXiv:1408.2873
...@@ -2,9 +2,9 @@ ...@@ -2,9 +2,9 @@
## Background ## Background
Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries flexibly and efficiently. Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries in a flexible and efficient manner.
On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example,Intel CPUs support Eigen and MKL computing libraries while Nvidia GPUs support Eigen and cuDNN computing libraries. We have to implement operator specific kernels for each computing library. On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example, Intel CPUs support Eigen and MKL computing libraries while Nvidia GPUs support Eigen and cuDNN computing libraries. We have to implement operator specific kernels for each computing library.
On the other hand, users usually do not want to care about the low-level hardware and computing libraries when writing a neural network configuration. In Fluid, `Layer` is exposed in `Python`, and `Operator` is exposed in `C++`. Both `Layer` and `Operator` are hardware independent. On the other hand, users usually do not want to care about the low-level hardware and computing libraries when writing a neural network configuration. In Fluid, `Layer` is exposed in `Python`, and `Operator` is exposed in `C++`. Both `Layer` and `Operator` are hardware independent.
...@@ -17,7 +17,7 @@ For a general overview of fluid, please refer to the [overview doc](https://gith ...@@ -17,7 +17,7 @@ For a general overview of fluid, please refer to the [overview doc](https://gith
There are mainly three parts that we have to consider while integrating a new device/library: There are mainly three parts that we have to consider while integrating a new device/library:
- Place and DeviceContext: indicates the device id and manages hardware resources - Place and DeviceContext: indicate the device id and manage hardware resources
- Memory and Tensor: malloc/free data on certain device - Memory and Tensor: malloc/free data on certain device
...@@ -25,10 +25,10 @@ There are mainly three parts that we have to consider while integrating a new de ...@@ -25,10 +25,10 @@ There are mainly three parts that we have to consider while integrating a new de
### Place and DeviceContext ### Place and DeviceContext
Please remind that device and computing library are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices. Please note that device and computing library are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices.
#### Place #### Place
Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add corresponding `DevicePlace`. Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add the corresponding `DevicePlace`.
``` ```
| CPUPlace | CPUPlace
...@@ -144,7 +144,7 @@ class Tensor { ...@@ -144,7 +144,7 @@ class Tensor {
}; };
``` ```
`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configure its shape, and then call `mutable_data` to allocate the actual memory. `Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configure its shape, and then call `mutable_data` to allocate the actual memory.
```cpp ```cpp
paddle::framework::Tensor t; paddle::framework::Tensor t;
...@@ -163,7 +163,7 @@ Fluid implements computing units based on different DeviceContexts. Some computi ...@@ -163,7 +163,7 @@ Fluid implements computing units based on different DeviceContexts. Some computi
Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/math/maxouting.h#L27) as an example: Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/math/maxouting.h#L27) as an example:
The interface is defined in header file. The interface is defined in the header file.
``` ```
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
...@@ -174,7 +174,7 @@ class MaxOutFunctor { ...@@ -174,7 +174,7 @@ class MaxOutFunctor {
}; };
``` ```
CPU implemention is in .cc file CPU implementation is in .cc file
``` ```
template <typename T> template <typename T>
...@@ -188,7 +188,7 @@ class MaxOutFunctor<platform::CPUDeviceContext, T> { ...@@ -188,7 +188,7 @@ class MaxOutFunctor<platform::CPUDeviceContext, T> {
}; };
``` ```
CUDA implemention is in .cu file CUDA implementation is in .cu file
``` ```
template <typename T> template <typename T>
...@@ -203,9 +203,9 @@ class MaxOutFunctor<platform::CUDADeviceContext, T> { ...@@ -203,9 +203,9 @@ class MaxOutFunctor<platform::CUDADeviceContext, T> {
``` ```
We get computing handle from a concrete DeviceContext, and make compution on tensors. We first obtain the computing handle from a concrete DeviceContext and then compute on tensors.
The implemention of `OpKernel` is similar to math functors, the extra thing we need to do is to register the OpKernel in a global map. The implementation of `OpKernel` is similar to math functors, the extra thing we need to do is to register the OpKernel in a global map.
Fluid provides different register interfaces in op_registry.h Fluid provides different register interfaces in op_registry.h
...@@ -231,7 +231,7 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -231,7 +231,7 @@ REGISTER_OP_CUDA_KERNEL(
## Advanced topics: How to switch between different Device/Library ## Advanced topics: How to switch between different Device/Library
Generally, we will impelement OpKernel for all Device/Library of an Operator. We can easily train a Convolutional Neural Network in GPU. However, some OpKernel is not sutibale on a specific Device. For example, crf operator can only run on CPU, whereas most other operators can run at GPU. To achieve high performance in such circumstance, we have to switch between different Device/Library. Generally, we will implement OpKernel for all Device/Library of an Operator. We can easily train a Convolutional Neural Network in GPU. However, some OpKernel is not suitable on a specific Device. For example, crf operator can only run on CPU, whereas most other operators can run on GPU. To achieve high performance in such circumstance, we have to switch between different Device/Library.
For more details, please refer to following docs: For more details, please refer to following docs:
......
...@@ -115,7 +115,7 @@ PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种B ...@@ -115,7 +115,7 @@ PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种B
"WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON" "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON"
"WITH_PYTHON", "是否内嵌PYTHON解释器", "ON" "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON"
"WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON" "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON"
"WITH_TESTING", "是否开启单元测试", "ON" "WITH_TESTING", "是否开启单元测试", "OFF"
"WITH_DOC", "是否编译中英文文档", "OFF" "WITH_DOC", "是否编译中英文文档", "OFF"
"WITH_SWIG_PY", "是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练", "Auto" "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练", "Auto"
"WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON" "WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON"
......
...@@ -126,7 +126,7 @@ You can add :code:`-D` argument to pass such options, like: ...@@ -126,7 +126,7 @@ You can add :code:`-D` argument to pass such options, like:
"WITH_AVX", "Build with AVX support", "ON" "WITH_AVX", "Build with AVX support", "ON"
"WITH_PYTHON", "Build with integrated Python interpreter", "ON" "WITH_PYTHON", "Build with integrated Python interpreter", "ON"
"WITH_STYLE_CHECK", "Check code style when building", "ON" "WITH_STYLE_CHECK", "Check code style when building", "ON"
"WITH_TESTING", "Build unit tests", "ON" "WITH_TESTING", "Build unit tests", "OFF"
"WITH_DOC", "Build documentations", "OFF" "WITH_DOC", "Build documentations", "OFF"
"WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto" "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
"WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON" "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON"
......
...@@ -95,6 +95,12 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note ...@@ -95,6 +95,12 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note
docker run -p 8888:8888 paddlepaddle/book docker run -p 8888:8888 paddlepaddle/book
国内用户可以使用下面的镜像源来加速访问:
.. code-block:: bash
docker run -p 8888:8888 docker.paddlepaddlehub.com/book
然后在浏览器中输入以下网址: 然后在浏览器中输入以下网址:
.. code-block:: text .. code-block:: text
......
...@@ -102,6 +102,12 @@ We provide a packaged book image, simply issue the command: ...@@ -102,6 +102,12 @@ We provide a packaged book image, simply issue the command:
docker run -p 8888:8888 paddlepaddle/book docker run -p 8888:8888 paddlepaddle/book
For users in China, we provide a faster mirror:
.. code-block:: bash
docker run -p 8888:8888 docker.paddlepaddlehub.com/book
Then, you would back and paste the address into the local browser: Then, you would back and paste the address into the local browser:
.. code-block:: text .. code-block:: text
......
...@@ -39,6 +39,7 @@ PaddlePaddle可以使用常用的Python包管理工具 ...@@ -39,6 +39,7 @@ PaddlePaddle可以使用常用的Python包管理工具
"cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无" "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
"cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
"cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
......
...@@ -42,6 +42,7 @@ If the links below shows up the login form, just click "Log in as guest" to star ...@@ -42,6 +42,7 @@ If the links below shows up the login form, just click "Log in as guest" to star
"cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available" "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
"cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
"cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
......
...@@ -60,8 +60,7 @@ each column is as follows: ...@@ -60,8 +60,7 @@ each column is as follows:
| column | meaning | | column | meaning |
| --- | --- | | --- | --- |
| ncalls | the number of calls into a function | | ncalls | the number of calls into a function |
| tottime | the total execution time of the function, not including the | tottime | the total execution time of the function, not including the execution time of other functions called by the function |
execution time of other functions called by the function |
| percall | tottime divided by ncalls | | percall | tottime divided by ncalls |
| cumtime | the total execution time of the function, including the execution time of other functions being called | | cumtime | the total execution time of the function, including the execution time of other functions being called |
| percall | cumtime divided by ncalls | | percall | cumtime divided by ncalls |
......
...@@ -16,6 +16,12 @@ PaddlePaddle must be installed on all nodes. If you have GPU cards on your nodes ...@@ -16,6 +16,12 @@ PaddlePaddle must be installed on all nodes. If you have GPU cards on your nodes
PaddlePaddle build and installation guide can be found [here](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html). PaddlePaddle build and installation guide can be found [here](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html).
In addition to the above, the `cmake` command should be run with the option `WITH_DISTRIBUTE` set to `ON`. A bare-minimum example `cmake` command looks as follows:
``` bash
cmake .. -DWITH_DOC=OFF -DWITH_GPU=OFF -DWITH_DISTRIBUTE=ON -DWITH_SWIG_PY=ON -DWITH_PYTHON=ON
```
### Update the training script ### Update the training script
#### Non-cluster training script #### Non-cluster training script
...@@ -119,7 +125,14 @@ for pass_id in range(100): ...@@ -119,7 +125,14 @@ for pass_id in range(100):
### E2E demo ### E2E demo
Please find the complete demo from [here](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py). In parameter server node run the following in the command line: Please find the complete demo from [here](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py).
First `cd` into the folder that contains the `python` files. In this case:
```bash
cd /paddle/python/paddle/v2/fluid/tests/book_distribute
```
In parameter server node run the following in the command line:
``` bash ``` bash
PSERVERS=192.168.1.2:6174 SERVER_ENDPOINT=192.168.1.2:6174 TRAINING_ROLE=PSERVER python notest_dist_fit_a_line.py PSERVERS=192.168.1.2:6174 SERVER_ENDPOINT=192.168.1.2:6174 TRAINING_ROLE=PSERVER python notest_dist_fit_a_line.py
......
...@@ -18,7 +18,7 @@ else() ...@@ -18,7 +18,7 @@ else()
add_subdirectory(capi) add_subdirectory(capi)
endif() endif()
if(Boost_FOUND) if(NOT ANDROID AND NOT IOS)
add_subdirectory(memory) add_subdirectory(memory)
add_subdirectory(platform) add_subdirectory(platform)
add_subdirectory(framework) add_subdirectory(framework)
......
# ddim lib # ddim lib
proto_library(framework_proto SRCS framework.proto) proto_library(framework_proto SRCS framework.proto)
cc_library(ddim SRCS ddim.cc DEPS eigen3) cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
nv_test(dim_test SRCS dim_test.cu DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim)
...@@ -22,11 +22,11 @@ cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) ...@@ -22,11 +22,11 @@ cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto) cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)
cc_test(variable_test SRCS variable_test.cc) cc_test(variable_test SRCS variable_test.cc)
cc_library(threadpool SRCS threadpool.cc) cc_library(threadpool SRCS threadpool.cc DEPS enforce)
cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool) cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
cc_library(scope SRCS scope.cc DEPS glog threadpool) cc_library(scope SRCS scope.cc DEPS glog threadpool)
...@@ -45,7 +45,7 @@ cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_ ...@@ -45,7 +45,7 @@ cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_
cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor
framework_proto selected_rows data_device_transform data_type_transform data_layout_transform) framework_proto selected_rows data_device_transform data_type_transform data_layout_transform)
cc_library(attribute SRCS attribute.cc DEPS framework_proto) cc_library(attribute SRCS attribute.cc DEPS framework_proto boost)
cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc
device_context) device_context)
cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute) cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute)
...@@ -74,7 +74,10 @@ cc_library(backward SRCS backward.cc DEPS net_op) ...@@ -74,7 +74,10 @@ cc_library(backward SRCS backward.cc DEPS net_op)
cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op) cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op)
cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward glog lod_rank_table) cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
framework_proto backward glog lod_rank_table profiler feed_fetch_method)
cc_library(prune SRCS prune.cc DEPS framework_proto) cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
...@@ -95,3 +98,5 @@ if(NOT WITH_C_API AND WITH_FLUID) ...@@ -95,3 +98,5 @@ if(NOT WITH_C_API AND WITH_FLUID)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/framework.pb.h DESTINATION include/paddle/framework) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/framework.pb.h DESTINATION include/paddle/framework)
install(FILES details/cow_ptr.h details/op_registry.h DESTINATION include/paddle/framework/details) install(FILES details/cow_ptr.h details/op_registry.h DESTINATION include/paddle/framework/details)
endif() endif()
cc_test(channel_test SRCS channel_test.cc)
...@@ -14,38 +14,45 @@ limitations under the License. */ ...@@ -14,38 +14,45 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/framework/block_desc.h" #include <stddef.h> // for size_t
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/program_desc.h"
namespace paddle { namespace paddle {
namespace framework {
// Channel is the abstract class of buffered and un-buffered channels.
template <typename T>
class Channel {
public:
virtual void Send(T*) = 0;
virtual void Receive(T*) = 0;
virtual size_t Cap() = 0;
virtual void Close() = 0;
virtual ~Channel() {}
};
class InferenceEngine { // Forward declaration of channel implementations.
public: namespace details {
InferenceEngine() : program_(nullptr), load_program_(nullptr) {} template <typename T>
~InferenceEngine() { class Buffered;
delete program_; template <typename T>
delete load_program_; class UnBuffered;
} // namespace details
template <typename T>
Channel<T>* MakeChannel(size_t buffer_size) {
if (buffer_size > 0) {
return new details::Buffered<T>(buffer_size);
} }
return new details::UnBuffered<T>();
}
void LoadInferenceModel(const std::string& dirname); template <typename T>
void LoadInferenceModel(const std::string& dirname, void CloseChannel(Channel<T>* ch) {
const std::vector<std::string>& feed_var_names, ch->Close();
const std::vector<std::string>& fetch_var_names); }
void Execute(const std::vector<framework::LoDTensor>& feeds,
std::vector<framework::LoDTensor>& fetchs);
private:
bool IsParameter(const framework::VarDesc* var);
void GenerateLoadProgram(const std::string& dirname);
void PrependFeedOp();
void AppendFetchOp();
private:
framework::ProgramDesc* program_;
framework::ProgramDesc* load_program_;
std::vector<std::string> feed_var_names_;
std::vector<std::string> fetch_var_names_;
};
} // namespace framework
} // namespace paddle } // namespace paddle
#include "paddle/framework/details/buffered_channel.h"
#include "paddle/framework/details/unbuffered_channel.h"
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/channel.h"
#include <atomic>
#include <chrono>
#include <thread>
#include "gtest/gtest.h"
using paddle::framework::Channel;
using paddle::framework::MakeChannel;
using paddle::framework::CloseChannel;
// Checks that MakeChannel selects the implementation from buffer_size:
// details::Buffered for a positive size, details::UnBuffered for zero.
TEST(Channel, MakeAndClose) {
  namespace details = paddle::framework::details;

  // buffer_size > 0: MakeChannel should return a buffered channel.
  {
    Channel<int>* buffered = MakeChannel<int>(10);
    EXPECT_NE(dynamic_cast<details::Buffered<int>*>(buffered), nullptr);
    EXPECT_EQ(dynamic_cast<details::UnBuffered<int>*>(buffered), nullptr);
    CloseChannel(buffered);
    delete buffered;
  }

  // buffer_size == 0: MakeChannel should return an un-buffered channel.
  {
    Channel<int>* unbuffered = MakeChannel<int>(0);
    EXPECT_EQ(dynamic_cast<details::Buffered<int>*>(unbuffered), nullptr);
    EXPECT_NE(dynamic_cast<details::UnBuffered<int>*>(unbuffered), nullptr);
    CloseChannel(unbuffered);
    delete unbuffered;
  }
}
// Fills a buffered channel exactly to capacity from a single thread and then
// drains it, expecting neither phase to block and values to come out in the
// order they were sent.
TEST(Channel, SufficientBufferSizeDoesntBlock) {
  const size_t capacity = 10;
  Channel<size_t>* ch = MakeChannel<size_t>(capacity);

  // Producer phase: capacity sends fit in the buffer, so none should block.
  size_t value = 0;
  while (value < capacity) {
    ch->Send(&value);
    ++value;
  }

  // Consumer phase: every receive finds an element waiting.
  size_t received;
  for (size_t expected = 0; expected != capacity; ++expected) {
    ch->Receive(&received);
    EXPECT_EQ(received, expected);
  }

  CloseChannel(ch);
  delete ch;
}
// A sender thread tries to push twice the buffer capacity while nobody
// receives. Only the first `buffer_size` sends can complete, so after a short
// wait the running sum must equal 0 + 1 + ... + 9 = 45. Closing the channel
// then releases the blocked sender so the thread can be joined.
TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
  const size_t buffer_size = 10;
  auto ch = MakeChannel<size_t>(buffer_size);
  // `sum` is written by the sender thread and read by the test thread, so it
  // must be atomic; a plain size_t here would be a data race.
  std::atomic<size_t> sum(0);
  std::thread t([&]() {
    // Try to write more than buffer size.
    for (size_t i = 0; i < 2 * buffer_size; ++i) {
      // Does not block for the first buffer_size items; afterwards it blocks
      // until the channel is closed.
      ch->Send(&i);
      sum += i;
    }
  });
  // Give the sender time to fill the buffer and block (0.1 sec).
  std::this_thread::sleep_for(std::chrono::milliseconds(100));
  EXPECT_EQ(sum.load(), 45U);
  CloseChannel(ch);
  t.join();
  delete ch;
}
...@@ -79,5 +79,33 @@ inline void VisitDataType(proto::DataType type, Visitor visitor) { ...@@ -79,5 +79,33 @@ inline void VisitDataType(proto::DataType type, Visitor visitor) {
} }
} }
// Maps a proto::DataType enumerator to its textual name (e.g. FP32 ->
// "float32"). Any enumerator without a case below is reported through
// PADDLE_THROW.
inline std::string DataTypeToString(const proto::DataType type) {
  switch (type) {
    // Floating-point types.
    case proto::DataType::FP16:
      return "float16";
    case proto::DataType::FP32:
      return "float32";
    case proto::DataType::FP64:
      return "float64";
    // Signed integer types.
    case proto::DataType::INT16:
      return "int16";
    case proto::DataType::INT32:
      return "int32";
    case proto::DataType::INT64:
      return "int64";
    // Boolean.
    case proto::DataType::BOOL:
      return "bool";
    default:
      PADDLE_THROW("Not support type %d", type);
  }
}
// Streams the textual name of `type`, as produced by DataTypeToString, and
// returns the stream so insertions can be chained.
inline std::ostream& operator<<(std::ostream& out,
                                const proto::DataType& type) {
  return out << DataTypeToString(type);
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <condition_variable>
#include <deque>
#include <mutex>
#include "paddle/framework/channel.h"
#include "paddle/platform/enforce.h"
namespace paddle {
namespace framework {
namespace details {
// Buffered is the Channel implementation backed by a bounded std::deque.
// Instances are created only through MakeChannel (the constructor is private
// and MakeChannel/CloseChannel are friends).
template <typename T>
class Buffered : public paddle::framework::Channel<T> {
  friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
  friend void paddle::framework::CloseChannel<T>(Channel<T>*);

 public:
  void Send(T*) override;
  void Receive(T*) override;
  // Capacity passed at construction time.
  size_t Cap() override { return cap_; }
  void Close() override;
  ~Buffered() override;

 private:
  size_t cap_;                             // maximum number of queued items
  std::mutex mu_;                          // guards channel_ and closed_
  std::condition_variable empty_cond_var_; // receivers wait here for an item
  std::condition_variable full_cond_var_;  // senders wait here for free space
  std::deque<T> channel_;                  // queued items, oldest at the front
  bool closed_;                            // set by Close() and the destructor

  // The capacity must be strictly positive; enforced at construction.
  Buffered(size_t cap) : cap_(cap), closed_(false) {
    PADDLE_ENFORCE_GT(cap, 0);
  }

  // Releases the given lock and wakes every thread waiting on
  // full_cond_var_.
  void NotifyAllSenders(std::unique_lock<std::mutex>*);
};
// Moves *item into the channel. Blocks while the buffer is at capacity and
// the channel is still open. If the channel has been closed, returns without
// enqueuing anything and leaves *item untouched.
template <typename T>
void Buffered<T>::Send(T* item) {
  std::unique_lock<std::mutex> guard(mu_);
  full_cond_var_.wait(
      guard, [this]() { return closed_ || channel_.size() < cap_; });
  if (closed_) {
    return;
  }
  channel_.push_back(std::move(*item));
  // Drop the mutex before signalling so the woken receiver can take it
  // immediately.
  guard.unlock();
  empty_cond_var_.notify_one();
}
// Moves the oldest queued element into *item. Blocks while the channel is
// empty and still open.
//
// If the channel is closed by the time the wait finishes, nothing is written
// to *item (even if elements are still queued) and the call simply returns.
// The interface returns void, so the caller gets no closed/open indication.
template <typename T>
void Buffered<T>::Receive(T* item) {
  std::unique_lock<std::mutex> lock(mu_);
  empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; });
  if (!closed_) {
    *item = std::move(channel_.front());
    channel_.pop_front();
    // A slot was freed: release the lock and wake blocked senders.
    NotifyAllSenders(&lock);
  }
  // Closed: intentionally leave *item untouched. The former
  // `item = nullptr;` only reassigned the local copy of the pointer and was
  // never visible to the caller, so it has been removed.
}
// Marks the channel as closed and wakes every blocked participant.
//
// Both condition variables must be signalled: senders wait on full_cond_var_
// and receivers wait on empty_cond_var_. Notifying only the senders would
// leave a receiver blocked on an empty channel forever, because its wait
// predicate (`!channel_.empty() || closed_`) is only re-evaluated on wake-up.
template <typename T>
void Buffered<T>::Close() {
  std::unique_lock<std::mutex> lock(mu_);
  closed_ = true;
  // Releases `lock` and wakes the senders.
  NotifyAllSenders(&lock);
  // Wake the receivers as well so they can observe closed_ and return.
  empty_cond_var_.notify_all();
}
// Closes the channel, discards any queued elements and wakes every waiter
// before the members are destroyed.
//
// A std::condition_variable may only be destroyed once all threads waiting on
// it have been notified, so receivers (empty_cond_var_) must be woken in
// addition to senders (full_cond_var_).
// NOTE(review): callers should still ensure no thread is using the channel
// when it is deleted; waking waiters does not make concurrent use of a
// destroyed object safe.
template <typename T>
Buffered<T>::~Buffered() {
  std::unique_lock<std::mutex> lock(mu_);
  closed_ = true;
  channel_.clear();
  // Releases `lock` and wakes the senders.
  NotifyAllSenders(&lock);
  // Wake the receivers too so none is left waiting on a dying object.
  empty_cond_var_.notify_all();
}
// Releases the caller's lock and then wakes every thread waiting on
// full_cond_var_. The lock is dropped first so woken threads do not
// immediately block on the mutex.
template <typename T>
void Buffered<T>::NotifyAllSenders(std::unique_lock<std::mutex>* held_lock) {
  held_lock->unlock();
  full_cond_var_.notify_all();
}
} // namespace details
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <condition_variable>
#include <deque>
#include <mutex>
#include "paddle/framework/channel.h"
namespace paddle {
namespace framework {
namespace details {
// Channel implementation with no internal buffer: Cap() always reports 0.
// The constructor is private; MakeChannel and CloseChannel are declared as
// friends so they can reach the private members.
template <typename T>
class UnBuffered : public paddle::framework::Channel<T> {
friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
friend void paddle::framework::CloseChannel<T>(Channel<T>*);
public:
virtual void Send(T*);
virtual void Receive(T*);
// An unbuffered channel has zero capacity.
virtual size_t Cap() { return 0; }
virtual void Close();
virtual ~UnBuffered();
private:
// Private so that instances are only created through the friend functions.
UnBuffered() {}
};
// NOTE(review): Send, Receive, Close and the destructor below are empty
// placeholders; they perform no synchronization or data transfer yet.

// Currently a no-op: the element is neither copied nor handed to a receiver.
template <typename T>
void UnBuffered<T>::Send(T* channel_element) {}
// Currently a no-op: the output argument is left untouched.
template <typename T>
void UnBuffered<T>::Receive(T*) {}
// Currently a no-op.
template <typename T>
void UnBuffered<T>::Close() {}
// Nothing to release.
template <typename T>
UnBuffered<T>::~UnBuffered() {}
} // namespace details
} // namespace framework
} // namespace paddle
...@@ -17,13 +17,15 @@ limitations under the License. */ ...@@ -17,13 +17,15 @@ limitations under the License. */
#include <set> #include <set>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "paddle/framework/feed_fetch_method.h"
#include "paddle/framework/feed_fetch_type.h" #include "paddle/framework/feed_fetch_type.h"
#include "paddle/framework/lod_rank_table.h" #include "paddle/framework/lod_rank_table.h"
#include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/platform/place.h" #include "paddle/platform/place.h"
#include "paddle/platform/profiler.h"
DECLARE_bool(do_memory_benchmark); DECLARE_bool(benchmark);
DEFINE_bool(check_nan_inf, false, DEFINE_bool(check_nan_inf, false,
"Checking whether operator produce NAN/INF or not. It will be " "Checking whether operator produce NAN/INF or not. It will be "
"extremely slow so please use this flag wisely."); "extremely slow so please use this flag wisely.");
...@@ -31,9 +33,6 @@ DEFINE_bool(check_nan_inf, false, ...@@ -31,9 +33,6 @@ DEFINE_bool(check_nan_inf, false,
namespace paddle { namespace paddle {
namespace framework { namespace framework {
const std::string kFeedOpType = "feed";
const std::string kFetchOpType = "fetch";
Executor::Executor(const platform::Place& place) : place_(place) {} Executor::Executor(const platform::Place& place) : place_(place) {}
static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) { static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
...@@ -116,9 +115,14 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, ...@@ -116,9 +115,14 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
for (auto& op_desc : block.AllOps()) { for (auto& op_desc : block.AllOps()) {
auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
VLOG(3) << op->DebugStringEx(local_scope); VLOG(4) << op->DebugStringEx(local_scope);
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(op->Type(), pool.Get(place_));
op->Run(*local_scope, place_); op->Run(*local_scope, place_);
if (FLAGS_do_memory_benchmark) { VLOG(3) << op->DebugStringEx(local_scope);
if (FLAGS_benchmark) {
VLOG(2) << "Memory used after operator " + op->Type() + " running: " VLOG(2) << "Memory used after operator " + op->Type() + " running: "
<< memory::memory_usage(place_); << memory::memory_usage(place_);
} }
...@@ -135,7 +139,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, ...@@ -135,7 +139,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
if (create_vars && create_local_scope) { if (create_vars && create_local_scope) {
scope->DeleteScope(local_scope); scope->DeleteScope(local_scope);
} }
if (FLAGS_do_memory_benchmark) { if (FLAGS_benchmark) {
VLOG(2) << "-------------------------------------------------------"; VLOG(2) << "-------------------------------------------------------";
VLOG(2) << "Memory used after deleting local scope: " VLOG(2) << "Memory used after deleting local scope: "
<< memory::memory_usage(place_); << memory::memory_usage(place_);
...@@ -143,5 +147,164 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, ...@@ -143,5 +147,164 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
} }
} }
// Reports whether `block` already contains feed operators.
//
// Every operator of type kFeedOpType is validated while scanning:
//   * its "X" input must be `feed_holder_name`;
//   * its "Out" output must be a key of `feed_targets`.
// If at least one feed operator exists, their count must equal
// feed_targets.size() and the block must already own a variable named
// `feed_holder_name` of type FEED_MINIBATCH. Any mismatch raises through the
// PADDLE_ENFORCE family.
//
// Returns false when the block has no feed operator, true otherwise.
static bool has_feed_operators(
    BlockDesc* block, std::map<std::string, const LoDTensor*>& feed_targets,
    const std::string& feed_holder_name) {
  size_t num_feed_ops = 0;
  for (auto* op : block->AllOps()) {
    if (op->Type() != kFeedOpType) {
      continue;
    }
    ++num_feed_ops;
    PADDLE_ENFORCE_EQ(op->Input("X")[0], feed_holder_name,
                      "Input to feed op should be '%s'", feed_holder_name);
    std::string target_name = op->Output("Out")[0];
    PADDLE_ENFORCE(
        feed_targets.count(target_name) != 0,
        "Feed operator output name '%s' cannot be found in 'feed_targets'",
        target_name);
  }

  if (num_feed_ops == 0) {
    return false;
  }

  PADDLE_ENFORCE_EQ(
      num_feed_ops, feed_targets.size(),
      "The number of feed operators should match 'feed_targets'");
  // Feed operators imply that the holder variable was created with them.
  auto holder = block->FindVar(feed_holder_name);
  PADDLE_ENFORCE_NOT_NULL(holder, "Block should already have a '%s' variable",
                          feed_holder_name);
  PADDLE_ENFORCE_EQ(holder->GetType(), proto::VarDesc::FEED_MINIBATCH,
                    "'%s' variable should be 'FEED_MINIBATCH' type",
                    feed_holder_name);
  return true;
}
// Reports whether `block` already contains fetch operators.
//
// Every operator of type kFetchOpType is validated while scanning:
//   * its "Out" output must be `fetch_holder_name`;
//   * its "X" input must be a key of `fetch_targets`.
// If at least one fetch operator exists, their count must equal
// fetch_targets.size() and the block must already own a variable named
// `fetch_holder_name` of type FETCH_LIST. Any mismatch raises through the
// PADDLE_ENFORCE family.
//
// Returns false when the block has no fetch operator, true otherwise.
static bool has_fetch_operators(
    BlockDesc* block, std::map<std::string, LoDTensor*>& fetch_targets,
    const std::string& fetch_holder_name) {
  size_t num_fetch_ops = 0;
  for (auto* op : block->AllOps()) {
    if (op->Type() != kFetchOpType) {
      continue;
    }
    ++num_fetch_ops;
    PADDLE_ENFORCE_EQ(op->Output("Out")[0], fetch_holder_name,
                      "Output of fetch op should be '%s'", fetch_holder_name);
    std::string target_name = op->Input("X")[0];
    PADDLE_ENFORCE(
        fetch_targets.count(target_name) != 0,
        "Fetch operator input name '%s' cannot be found in 'fetch_targets'",
        target_name);
  }

  if (num_fetch_ops == 0) {
    return false;
  }

  PADDLE_ENFORCE_EQ(
      num_fetch_ops, fetch_targets.size(),
      "The number of fetch operators should match 'fetch_targets'");
  // Fetch operators imply that the holder variable was created with them.
  auto holder = block->FindVar(fetch_holder_name);
  PADDLE_ENFORCE_NOT_NULL(holder, "Block should already have a '%s' variable",
                          fetch_holder_name);
  PADDLE_ENFORCE_EQ(holder->GetType(), proto::VarDesc::FETCH_LIST,
                    "'%s' variable should be 'FETCH_LIST' type",
                    fetch_holder_name);
  return true;
}
// Runs block 0 of `program` with explicit feed and fetch tensors.
//
// The caller's program is never modified: a private copy is made and, when
// the copy does not already contain them, feed operators are prepended and
// fetch operators are appended to its global block.
//
//   feed_targets      maps a variable name to the tensor that should be fed
//                     into it.
//   fetch_targets     maps a variable name to the tensor that receives the
//                     fetched result.
//   feed_holder_name  name of the FEED_MINIBATCH variable used as the source
//                     of all feed operators.
//   fetch_holder_name name of the FETCH_LIST variable used as the sink of all
//                     fetch operators.
//
// The program copy lives on the stack so that it is released even when one
// of the PADDLE_ENFORCE checks or an operator throws (the previous raw
// new/delete pair leaked in that case).
void Executor::Run(const ProgramDesc& program, Scope* scope,
                   std::map<std::string, const LoDTensor*>& feed_targets,
                   std::map<std::string, LoDTensor*>& fetch_targets,
                   const std::string& feed_holder_name,
                   const std::string& fetch_holder_name) {
  ProgramDesc copy_program(program);
  auto* global_block = copy_program.MutableBlock(0);

  if (!has_feed_operators(global_block, feed_targets, feed_holder_name)) {
    // create feed_holder variable
    auto* feed_holder = global_block->Var(feed_holder_name);
    feed_holder->SetType(proto::VarDesc::FEED_MINIBATCH);
    feed_holder->SetPersistable(true);

    int i = 0;
    for (auto& feed_target : feed_targets) {
      std::string var_name = feed_target.first;
      VLOG(3) << "feed target's name: " << var_name;

      // prepend feed op; "col" is the slot of this target inside the holder
      auto* op = global_block->PrependOp();
      op->SetType(kFeedOpType);
      op->SetInput("X", {feed_holder_name});
      op->SetOutput("Out", {var_name});
      op->SetAttr("col", {static_cast<int>(i)});
      op->CheckAttrs();

      i++;
    }
  }

  // map the data of feed_targets to feed_holder
  for (auto* op : global_block->AllOps()) {
    if (op->Type() == kFeedOpType) {
      std::string feed_target_name = op->Output("Out")[0];
      int idx = boost::get<int>(op->GetAttr("col"));
      SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name,
                      idx);
    }
  }

  if (!has_fetch_operators(global_block, fetch_targets, fetch_holder_name)) {
    // create fetch_holder variable
    auto* fetch_holder = global_block->Var(fetch_holder_name);
    fetch_holder->SetType(proto::VarDesc::FETCH_LIST);
    fetch_holder->SetPersistable(true);

    int i = 0;
    for (auto& fetch_target : fetch_targets) {
      std::string var_name = fetch_target.first;
      VLOG(3) << "fetch target's name: " << var_name;

      // append fetch op; "col" is the slot of this target inside the holder
      auto* op = global_block->AppendOp();
      op->SetType(kFetchOpType);
      op->SetInput("X", {var_name});
      op->SetOutput("Out", {fetch_holder_name});
      op->SetAttr("col", {static_cast<int>(i)});
      op->CheckAttrs();

      i++;
    }
  }

  Run(copy_program, scope, 0, true, true);

  // obtain the data of fetch_targets from fetch_holder
  for (auto* op : global_block->AllOps()) {
    if (op->Type() == kFetchOpType) {
      std::string fetch_target_name = op->Input("X")[0];
      int idx = boost::get<int>(op->GetAttr("col"));
      *fetch_targets[fetch_target_name] =
          GetFetchVariable(*scope, fetch_holder_name, idx);
    }
  }
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -41,6 +41,12 @@ class Executor { ...@@ -41,6 +41,12 @@ class Executor {
void Run(const ProgramDesc&, Scope*, int, bool create_local_scope = true, void Run(const ProgramDesc&, Scope*, int, bool create_local_scope = true,
bool create_vars = true); bool create_vars = true);
void Run(const ProgramDesc& program, Scope* scope,
std::map<std::string, const LoDTensor*>& feed_targets,
std::map<std::string, LoDTensor*>& fetch_targets,
const std::string& feed_holder_name = "feed",
const std::string& fetch_holder_name = "fetch");
private: private:
const platform::Place place_; const platform::Place place_;
}; };
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/feed_fetch_method.h"
#include "glog/logging.h"
#include "paddle/framework/variable.h"
namespace paddle {
namespace framework {
// Stores `input` in slot `index` of the feed list held by the variable
// `var_name` in `scope`.
//
// The variable is obtained with Scope::Var, so it is created when it does not
// exist yet. The list is grown when `index` is past its end. The slot shares
// the data of `input` (no copy is made) and takes over its LoD.
void SetFeedVariable(Scope* scope, const LoDTensor& input,
                     const std::string& var_name, size_t index) {
  VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
  Variable* holder = scope->Var(var_name);
  auto* feed_list =
      holder->GetMutable<std::vector<paddle::framework::LoDTensor>>();
  if (feed_list->size() <= index) {
    feed_list->resize(index + 1);
  }
  LoDTensor& slot = (*feed_list)[index];
  // share the underlying buffer with the input tensor
  slot.ShareDataWith(input);
  // carry the level-of-detail information over as well
  slot.set_lod(input.lod());
}
// Returns a reference to slot `index` of the fetch list held by the variable
// `var_name` in `scope`.
//
// The variable must already exist and hold a FeedFetchList, and `index` must
// be inside the list; each condition is enforced before the data is touched.
// (Previously the element was read before the bounds check and the lookup
// result was dereferenced without a null check.)
LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
                            size_t index) {
  // Since we want to fetch LodTensor from a variable, the variable must
  // be created already.
  Variable* g_fetch_value = scope.FindVar(var_name);
  PADDLE_ENFORCE_NOT_NULL(g_fetch_value,
                          "Variable '%s' is not found in scope", var_name);
  PADDLE_ENFORCE(g_fetch_value->IsType<FeedFetchList>(),
                 "Only %s can be invoked by GetFetchVariable",
                 typeid(FeedFetchList).name());
  auto& fetch_outputs = *g_fetch_value->GetMutable<FeedFetchList>();
  // Validate the index before using it to access the list.
  PADDLE_ENFORCE_LT(index, fetch_outputs.size());
  auto& tensor = fetch_outputs[index];
  VLOG(3) << "Fetch " << var_name << " with index " << index
          << " shape= " << tensor.dims();
  return tensor;
}
} // namespace framework
} // namespace paddle
...@@ -13,46 +13,18 @@ See the License for the specific language governing permissions and ...@@ -13,46 +13,18 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include "glog/logging.h"
#include "paddle/framework/feed_fetch_type.h" #include "paddle/framework/feed_fetch_type.h"
#include "paddle/framework/scope.h" #include "paddle/framework/scope.h"
#include "paddle/framework/variable.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
void SetFeedVariable(Scope* scope, const LoDTensor& input, void SetFeedVariable(Scope* scope, const LoDTensor& input,
const std::string& var_name, size_t index) { const std::string& var_name, size_t index);
// If var_name Variable is not found in GlobalScope, a new variable will
// be created.
VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
Variable* g_feed_value = scope->Var(var_name);
auto& feed_inputs =
*(g_feed_value->GetMutable<std::vector<paddle::framework::LoDTensor>>());
if (index >= feed_inputs.size()) {
feed_inputs.resize(index + 1);
}
// shared data with input tensor
feed_inputs[index].ShareDataWith(input);
// set lod
feed_inputs[index].set_lod(input.lod());
}
LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
size_t index) { size_t index);
// Since we want to fetch LodTensor from a variable, the variable must
// be created alreadly.
Variable* g_fetch_value = scope.FindVar(var_name);
PADDLE_ENFORCE(g_fetch_value->IsType<FeedFetchList>(),
"Only %s can be invoked by GetFetchVariable",
typeid(FeedFetchList).name());
auto& fetch_outputs = *g_fetch_value->GetMutable<FeedFetchList>();
auto& tensor = fetch_outputs[index];
VLOG(3) << "Fetch " << var_name << " with index " << index
<< " shape= " << tensor.dims();
PADDLE_ENFORCE_LT(index, fetch_outputs.size());
return tensor;
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <string>
#include <vector> #include <vector>
#include "paddle/framework/lod_tensor.h" #include "paddle/framework/lod_tensor.h"
...@@ -20,5 +21,8 @@ namespace paddle { ...@@ -20,5 +21,8 @@ namespace paddle {
namespace framework { namespace framework {
using FeedFetchType = LoDTensor; using FeedFetchType = LoDTensor;
using FeedFetchList = std::vector<FeedFetchType>; using FeedFetchList = std::vector<FeedFetchType>;
static const std::string kFeedOpType = "feed";
static const std::string kFetchOpType = "fetch";
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <string.h> // for strdup #include <string.h> // for strdup
#include <algorithm> #include <algorithm>
#include <stdexcept>
#include <string> #include <string>
#include "paddle/framework/init.h" #include "paddle/framework/init.h"
...@@ -46,17 +47,23 @@ void InitDevices() { ...@@ -46,17 +47,23 @@ void InitDevices() {
std::vector<platform::Place> places; std::vector<platform::Place> places;
places.emplace_back(platform::CPUPlace()); places.emplace_back(platform::CPUPlace());
int count = 0;
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
int count = platform::GetCUDADeviceCount(); try {
for (int i = 0; i < count; ++i) { count = platform::GetCUDADeviceCount();
places.emplace_back(platform::CUDAPlace(i)); } catch (const std::exception &exp) {
LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime.";
} }
#else #else
LOG(WARNING) LOG(WARNING)
<< "'GPU' is not supported, Please re-compile with WITH_GPU option"; << "'CUDA' is not supported, Please re-compile with WITH_GPU option";
#endif #endif
for (int i = 0; i < count; ++i) {
places.emplace_back(platform::CUDAPlace(i));
}
platform::DeviceContextPool::Init(places); platform::DeviceContextPool::Init(places);
} }
......
...@@ -20,7 +20,21 @@ TEST(InitDevices, CPU) { ...@@ -20,7 +20,21 @@ TEST(InitDevices, CPU) {
using paddle::framework::InitDevices; using paddle::framework::InitDevices;
using paddle::platform::DeviceContextPool; using paddle::platform::DeviceContextPool;
#ifndef PADDLE_WITH_CUDA
InitDevices(); InitDevices();
DeviceContextPool& pool = DeviceContextPool::Instance(); DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_GE(pool.size(), 1U); ASSERT_EQ(pool.size(), 1U);
#endif
}
TEST(InitDevices, CUDA) {
using paddle::framework::InitDevices;
using paddle::platform::DeviceContextPool;
#ifdef PADDLE_WITH_CUDA
int count = paddle::platform::GetCUDADeviceCount();
InitDevices();
DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count));
#endif
} }
...@@ -24,8 +24,6 @@ limitations under the License. */ ...@@ -24,8 +24,6 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <iterator> #include <iterator>
#include <glog/logging.h>
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -107,9 +105,10 @@ LoD ToAbsOffset(const LoD &in) { ...@@ -107,9 +105,10 @@ LoD ToAbsOffset(const LoD &in) {
// the lowest level stores relative offsets // the lowest level stores relative offsets
if (in.empty() || in.size() == 1) return in; if (in.empty() || in.size() == 1) return in;
LoD result = in; LoD result = in;
for (int level = result.size() - 2; level >= 0; level--) { for (auto level = static_cast<int>(in.size() - 2); level >= 0; level--) {
for (auto &ele : result[level]) { for (size_t i = 0; i < in[level].size(); ++i) {
ele = result[level + 1][ele]; size_t index = in[level][i];
result[level][i] = result[level + 1][index];
} }
} }
return result; return result;
......
...@@ -18,11 +18,11 @@ limitations under the License. */ ...@@ -18,11 +18,11 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <thrust/device_vector.h> #include <thrust/device_vector.h>
#include <thrust/host_vector.h> #include <thrust/host_vector.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>
#endif #endif
#include <glog/logging.h> #include <glog/logging.h>
#include "paddle/framework/ddim.h" #include "paddle/framework/ddim.h"
#include "paddle/framework/mixed_vector.h"
#include "paddle/framework/tensor.h" #include "paddle/framework/tensor.h"
#include "paddle/framework/tensor_util.h" #include "paddle/framework/tensor_util.h"
#include "paddle/platform/enforce.h" #include "paddle/platform/enforce.h"
...@@ -31,15 +31,6 @@ limitations under the License. */ ...@@ -31,15 +31,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
#ifndef PADDLE_WITH_CUDA
template <typename T>
using Vector = std::vector<T>;
#else
template <typename T>
using Vector = thrust::host_vector<
T, thrust::system::cuda::experimental::pinned_allocator<T>>;
#endif
/* /*
* LoD is short for Level of Details. * LoD is short for Level of Details.
* *
...@@ -55,7 +46,15 @@ using Vector = thrust::host_vector< ...@@ -55,7 +46,15 @@ using Vector = thrust::host_vector<
* 0 2 4 7 * 0 2 4 7
* 0 2 5 7 10 12 15 20 * 0 2 5 7 10 12 15 20
*/ */
using LoD = std::vector<Vector<size_t>>; struct LoD : public std::vector<Vector<size_t>> {
using std::vector<Vector<size_t>>::vector;
void CopyFromCUDA() {
for (auto it = this->begin(); it != this->end(); ++it) {
it->CopyFromCUDA();
}
}
};
std::ostream& operator<<(std::ostream& os, const LoD& lod); std::ostream& operator<<(std::ostream& os, const LoD& lod);
std::ostream& operator<<(std::ostream& os, const LoDTensor& t); std::ostream& operator<<(std::ostream& os, const LoDTensor& t);
...@@ -109,7 +108,10 @@ bool CheckAbsLoD(const LoD& in, int tensor_height = -1); ...@@ -109,7 +108,10 @@ bool CheckAbsLoD(const LoD& in, int tensor_height = -1);
*/ */
class LoDTensor : public Tensor { class LoDTensor : public Tensor {
public: public:
LoDTensor() {} LoDTensor() : Tensor() {}
/* Constructor with place should only be used in pybind */
explicit LoDTensor(const platform::Place& place) : Tensor(place) {}
explicit LoDTensor(const LoD& lod) : lod_(lod) {} explicit LoDTensor(const LoD& lod) : lod_(lod) {}
......
...@@ -23,6 +23,17 @@ ...@@ -23,6 +23,17 @@
namespace paddle { namespace paddle {
namespace framework { namespace framework {
// Builds a three-level LoD through the initializer-list constructor and
// push_back (from both an initializer list and a std::vector), then checks
// that the first level can be read back element by element.
TEST(LoD, data) {
  LoD lod{{0, 1, 2}};
  lod.push_back({0, 2, 4, 5});
  lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));

  auto& first_level = lod[0];
  const size_t count = first_level.size();
  for (size_t pos = 0; pos < count; ++pos) {
    // level 0 was initialised as {0, 1, 2}: each value equals its index
    EXPECT_EQ(first_level[pos], pos);
  }
}
TEST(LodExpand, test) { TEST(LodExpand, test) {
LoD lod{{0, 2}}; LoD lod{{0, 2}};
LoDTensor tensor; LoDTensor tensor;
......
...@@ -14,6 +14,8 @@ ...@@ -14,6 +14,8 @@
#include <cuda.h> #include <cuda.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <stdio.h>
#include "paddle/framework/init.h"
#include "paddle/framework/lod_tensor.h" #include "paddle/framework/lod_tensor.h"
#include "paddle/platform/assert.h" #include "paddle/platform/assert.h"
...@@ -26,7 +28,48 @@ __global__ void test(size_t* a, int size) { ...@@ -26,7 +28,48 @@ __global__ void test(size_t* a, int size) {
} }
} }
// Checks that Vector exposes its host data through data()/operator[] and that
// the contents survive a host -> device -> host round trip.
TEST(Vector, Normal) {
  using namespace paddle::framework;
  using namespace paddle::platform;
  using namespace paddle::memory;

  paddle::framework::InitDevices();

  paddle::framework::Vector<size_t> vec({1, 2, 3});
  size_t* ptr = vec.data();
  for (size_t i = 0; i < vec.size(); ++i) {
    EXPECT_EQ(vec[i], *(ptr + i));
  }

  // Upload the host data first. Without a device copy, CopyFromCUDA() returns
  // early and the loop below would index a cleared (empty) vector.
  vec.cuda_data();
  vec.clear();
  // Restores the size recorded at upload time and copies the elements back.
  vec.CopyFromCUDA();

  std::vector<size_t> v = {1, 2, 3};
  ASSERT_EQ(v.size(), vec.size());
  for (size_t i = 0; i < v.size(); ++i) {
    EXPECT_EQ(v[i], vec[i]);
  }
}
// Uploads the first LoD level to the device, runs the `test` kernel on it,
// copies the result back and expects every element to equal twice its index.
TEST(LoD, data) {
  paddle::framework::InitDevices();

  paddle::framework::LoD lod{{0, 1, 2}};
  lod.push_back({0, 2, 4, 5});
  lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));

  auto& level0 = lod[0];
  // cuda_data() synchronises the host contents to the device before the
  // kernel is launched.
  size_t* device_ptr = level0.cuda_data();
  test<<<1, 1>>>(device_ptr, level0.size());
  cudaDeviceSynchronize();

  // Bring the values modified by the kernel back to the host.
  level0.CopyFromCUDA();
  for (size_t pos = 0; pos < level0.size(); ++pos) {
    EXPECT_EQ(level0[pos], pos * 2);
  }
}
TEST(LoDTensor, LoDInGPU) { TEST(LoDTensor, LoDInGPU) {
paddle::framework::InitDevices();
paddle::framework::LoDTensor lod_tensor; paddle::framework::LoDTensor lod_tensor;
paddle::platform::CUDAPlace place(0); paddle::platform::CUDAPlace place(0);
...@@ -42,8 +85,9 @@ TEST(LoDTensor, LoDInGPU) { ...@@ -42,8 +85,9 @@ TEST(LoDTensor, LoDInGPU) {
auto lod = lod_tensor.lod(); auto lod = lod_tensor.lod();
test<<<1, 8>>>(lod[0].data(), lod[0].size()); test<<<1, 8>>>(lod[0].cuda_data(), lod[0].size());
cudaDeviceSynchronize(); cudaDeviceSynchronize();
lod.CopyFromCUDA();
for (size_t i = 0; i < src_lod[0].size(); ++i) { for (size_t i = 0; i < src_lod[0].size(); ++i) {
EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2); EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2);
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <initializer_list>
#include <vector>
#include "paddle/memory/memcpy.h"
#include "paddle/memory/memory.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/enforce.h"
#include "paddle/platform/place.h"
namespace paddle {
namespace framework {
/**
* @brief Vector support both cpu and gpu.
* host vector lifetime is same with Vector
* device vector is lazily malloc and modified.
*/
template <typename T>
class Vector : public std::vector<T> {
public:
/* NOTE(dzhwinter):
* Data always store and modified on Host.
* If the data is modified when use cuda_data interface,
* You need to call the CopyFromCUDA explicitly to synchronize data.
*
*/
enum class kDataPosition {
kDataOnHost = 0,
kDataOnDevice = 1,
};
public:
using std::vector<T>::vector;
Vector() {}
Vector(const std::vector<T> &v) : std::vector<T>(v) {} // NOLINT
virtual ~Vector() {
#ifdef PADDLE_WITH_CUDA
if (cuda_ptr_ != nullptr) {
memory::Free<platform::CUDAPlace>(place_, static_cast<void *>(cuda_ptr_));
}
#endif
}
T *cuda_data() {
CopyToCUDA();
PADDLE_ENFORCE_NOT_NULL(
cuda_ptr_, "No data or Insufficient CUDA memory to allocation");
return static_cast<T *>(cuda_ptr_);
}
T *data() { return std::vector<T>::data(); }
const T *data() const { return std::vector<T>::data(); }
void CopyToCUDA();
void CopyFromCUDA();
void CopyToPeer(platform::Place);
private:
void *cuda_ptr_ = nullptr;
size_t cuda_size_ = 0;
/*The DataPosition is unused now,
if we want support random access from cpu and cuda,
we need to overload all the vector method */
kDataPosition position_ = kDataPosition::kDataOnHost;
platform::CUDAPlace place_;
};
// Copies the host contents to the device buffer and records the number of
// elements uploaded. Does nothing when built without CUDA.
//
// The buffer is allocated on first use. It is reallocated when the host
// vector now holds more elements than the last upload, because the existing
// allocation may be too small and the copy would write past its end.
template <typename T>
void Vector<T>::CopyToCUDA() {
#ifdef PADDLE_WITH_CUDA
  if (cuda_ptr_ != nullptr && cuda_size_ < this->size()) {
    // The host vector grew since the last upload: drop the old buffer.
    memory::Free<platform::CUDAPlace>(place_, static_cast<void *>(cuda_ptr_));
    cuda_ptr_ = nullptr;
    cuda_size_ = 0;
  }
  if (cuda_ptr_ == nullptr) {
    cuda_ptr_ =
        memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T));
  }
  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
  auto *cuda_ctx = pool.GetByPlace(place_);

  memory::Copy(place_, static_cast<void *>(cuda_ptr_), platform::CPUPlace(),
               static_cast<const void *>(this->data()),
               this->size() * sizeof(T), cuda_ctx->stream());
  // Block until the copy issued on the stream has finished.
  cuda_ctx->Wait();

  cuda_size_ = this->size();
#endif
}
// Copies the device buffer back into the host vector. Does nothing when built
// without CUDA.
//
// When no device buffer exists a warning is logged and the host data is left
// unchanged. Otherwise the host vector is resized to the element count of the
// last upload before the copy, and the call waits for the copy to finish.
template <typename T>
void Vector<T>::CopyFromCUDA() {
#ifdef PADDLE_WITH_CUDA
  auto *cuda_ctx = platform::DeviceContextPool::Instance().GetByPlace(place_);
  if (cuda_ptr_ == nullptr) {
    LOG(WARNING) << "No uncommited cuda data.";
    return;
  }
  this->resize(cuda_size_);
  const size_t num_bytes = this->size() * sizeof(T);
  void *host_dst = static_cast<void *>(this->data());
  const void *device_src = static_cast<const void *>(cuda_ptr_);
  memory::Copy(platform::CPUPlace(), host_dst, place_, device_src, num_bytes,
               cuda_ctx->stream());
  cuda_ctx->Wait();
#endif
}
// Moves the device buffer to `peer_place`.
//
// A CPU destination is ignored. When nothing has been uploaded yet there is
// no buffer to move, so only the target place is recorded and the next upload
// allocates there (previously a null pointer was copied from and freed).
// Otherwise the elements of the last upload (cuda_size_) are copied into a
// new buffer on the peer and the old buffer is freed. The transfer is sized
// by the device element count rather than the current host size, which may
// differ from what the device buffer holds.
template <typename T>
void Vector<T>::CopyToPeer(platform::Place peer_place) {
  if (platform::is_cpu_place(peer_place)) {
    return;
  }
#ifdef PADDLE_WITH_CUDA
  platform::CUDAPlace dst_place = boost::get<platform::CUDAPlace>(peer_place);
  if (cuda_ptr_ == nullptr) {
    place_ = dst_place;
    return;
  }
  auto *cuda_ctx = platform::DeviceContextPool::Instance().GetByPlace(place_);
  const size_t num_bytes = cuda_size_ * sizeof(T);
  void *peer_cuda_ptr = memory::Alloc<platform::CUDAPlace>(dst_place, num_bytes);
  memory::Copy(dst_place, static_cast<void *>(peer_cuda_ptr), place_,
               static_cast<const void *>(cuda_ptr_), num_bytes,
               cuda_ctx->stream());
  // Wait for the copy before releasing its source buffer.
  cuda_ctx->Wait();
  memory::Free<platform::CUDAPlace>(place_, static_cast<void *>(cuda_ptr_));
  place_ = dst_place;
  cuda_ptr_ = peer_cuda_ptr;
#endif
}
// Explicit instantiations of Vector for the supported element types.
// NOTE(review): these are explicit instantiation definitions placed in a
// header guarded by #pragma once; confirm this is intended when the header is
// included from several translation units.
template class Vector<int>;
template class Vector<unsigned>;
template class Vector<size_t>;
template class Vector<int64_t>;
} // namespace framework
} // namespace paddle
...@@ -26,9 +26,9 @@ TEST(OpKernelType, ToString) { ...@@ -26,9 +26,9 @@ TEST(OpKernelType, ToString) {
OpKernelType op_kernel_type(DataType::FP32, CPUPlace(), DataLayout::kNCHW, OpKernelType op_kernel_type(DataType::FP32, CPUPlace(), DataLayout::kNCHW,
LibraryType::kCUDNN); LibraryType::kCUDNN);
ASSERT_EQ( ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type),
paddle::framework::KernelTypeToString(op_kernel_type), "data_type[float32]:data_layout[NCHW]:place[CPUPlace]:library_type["
"data_type[5]:data_layout[NCHW]:place[CPUPlace]:library_type[CUDNN]"); "CUDNN]");
} }
TEST(OpKernelType, Hash) { TEST(OpKernelType, Hash) {
......
...@@ -22,9 +22,7 @@ limitations under the License. */ ...@@ -22,9 +22,7 @@ limitations under the License. */
#include "paddle/framework/shape_inference.h" #include "paddle/framework/shape_inference.h"
#include "paddle/framework/var_type.h" #include "paddle/framework/var_type.h"
DEFINE_bool(op_sync, false, DECLARE_bool(benchmark);
"Default cuda is asynchronous device, set to True will"
"force op run in synchronous mode.");
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -531,7 +529,7 @@ void OperatorWithKernel::Run(const Scope& scope, ...@@ -531,7 +529,7 @@ void OperatorWithKernel::Run(const Scope& scope,
ExecutionContext(*this, new_scope, *new_dev_ctx)); ExecutionContext(*this, new_scope, *new_dev_ctx));
/*For profiling/benchmark only*/ /*For profiling/benchmark only*/
if (FLAGS_op_sync) { if (FLAGS_benchmark) {
new_dev_ctx->Wait(); new_dev_ctx->Wait();
} }
} }
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/framework/program_desc.h" #include "paddle/framework/program_desc.h"
#include "paddle/framework/block_desc.h" #include "paddle/framework/block_desc.h"
#include "paddle/framework/feed_fetch_type.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -64,5 +65,27 @@ ProgramDesc::ProgramDesc(const std::string &binary_str) { ...@@ -64,5 +65,27 @@ ProgramDesc::ProgramDesc(const std::string &binary_str) {
} }
} }
const std::vector<std::string> ProgramDesc::GetFeedTargetNames() {
BlockDesc *global_block = blocks_[0].get();
std::vector<std::string> feed_target_names;
for (auto *op : global_block->AllOps()) {
if (op->Type() == kFeedOpType) {
feed_target_names.insert(feed_target_names.begin(), op->Output("Out")[0]);
}
}
return feed_target_names;
}
const std::vector<std::string> ProgramDesc::GetFetchTargetNames() {
BlockDesc *global_block = blocks_[0].get();
std::vector<std::string> fetch_target_names;
for (auto *op : global_block->AllOps()) {
if (op->Type() == kFetchOpType) {
fetch_target_names.push_back(op->Input("X")[0]);
}
}
return fetch_target_names;
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <memory> #include <memory>
#include <vector> #include <vector>
#include "paddle/framework/block_desc.h"
#include "paddle/framework/framework.pb.h" #include "paddle/framework/framework.pb.h"
#include "paddle/framework/proto_desc.h" #include "paddle/framework/proto_desc.h"
#include "paddle/platform/macros.h" #include "paddle/platform/macros.h"
...@@ -45,6 +46,9 @@ class ProgramDesc { ...@@ -45,6 +46,9 @@ class ProgramDesc {
proto::ProgramDesc *Proto(); proto::ProgramDesc *Proto();
const std::vector<std::string> GetFeedTargetNames();
const std::vector<std::string> GetFetchTargetNames();
private: private:
proto::ProgramDesc desc_; proto::ProgramDesc desc_;
......
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <set> #include <set>
#include <string> #include <string>
#include <unordered_map>
#include <vector> #include <vector>
#include <glog/logging.h> #include <glog/logging.h>
...@@ -102,6 +103,32 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, ...@@ -102,6 +103,32 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
*op_field->Add() = input.blocks(block_id).ops(i); *op_field->Add() = input.blocks(block_id).ops(i);
} }
} }
// remove the VarDescs in BlockDesc that are not referenced in
// the pruned OpDescs
std::unordered_map<std::string, proto::VarDesc> var_map;
auto* var_field = output->mutable_blocks(block_id)->mutable_vars();
for (const auto& var : *var_field) {
var_map[var.name()] = var;
}
var_field->Clear();
for (const auto& op : *op_field) {
// add VarDescs of all input arguments for each OpDesc
auto& input_field = op.inputs();
for (auto& input_var : input_field) {
for (auto& arg : input_var.arguments()) {
*var_field->Add() = var_map[arg];
}
}
// add VarDescs of all output arguments for each OpDesc
auto& output_field = op.outputs();
for (auto& output_var : output_field) {
for (auto& arg : output_var.arguments()) {
*var_field->Add() = var_map[arg];
}
}
}
} }
// TODO(fengjiayi): Prune() could be inplaced to avoid unnecessary copies // TODO(fengjiayi): Prune() could be inplaced to avoid unnecessary copies
......
...@@ -20,9 +20,11 @@ limitations under the License. */ ...@@ -20,9 +20,11 @@ limitations under the License. */
#include "paddle/framework/threadpool.h" #include "paddle/framework/threadpool.h"
#include "paddle/string/printf.h" #include "paddle/string/printf.h"
DEFINE_bool(do_memory_benchmark, false, DEFINE_bool(benchmark, false,
"Doing memory benchmark. It will make deleting scope synchronized, " "Doing memory benchmark. It will make deleting scope synchronized, "
"and add some memory usage logs"); "and add some memory usage logs."
"Default cuda is asynchronous device, set to True will"
"force op run in synchronous mode.");
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -93,7 +95,7 @@ void Scope::DeleteScope(Scope* scope) { ...@@ -93,7 +95,7 @@ void Scope::DeleteScope(Scope* scope) {
PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
this->kids_.erase(it); this->kids_.erase(it);
// When making memory benchmark on Fluid, we have to delete scope sync. // When making memory benchmark on Fluid, we have to delete scope sync.
if (FLAGS_do_memory_benchmark) { if (FLAGS_benchmark) {
delete scope; delete scope;
} else { } else {
Async([scope] { delete scope; }); Async([scope] { delete scope; });
......
...@@ -47,6 +47,11 @@ class Tensor { ...@@ -47,6 +47,11 @@ class Tensor {
public: public:
Tensor() : offset_(0) {} Tensor() : offset_(0) {}
/*!
 * Constructor with place should only be used in pybind.
 *
 * Initializes offset_ to 0 and then forwards `place` to
 * holder_->set_place().
 *
 * NOTE(review): holder_ is dereferenced here although nothing in this
 * constructor assigns it first -- confirm it is guaranteed to be non-null
 * at this point, otherwise this call is a null dereference.
 */
explicit Tensor(const platform::Place& place) : offset_(0) {
holder_->set_place(place);
}
/*! Return a pointer to mutable memory block. */ /*! Return a pointer to mutable memory block. */
template <typename T> template <typename T>
inline T* data(); inline T* data();
...@@ -137,6 +142,7 @@ class Tensor { ...@@ -137,6 +142,7 @@ class Tensor {
virtual std::type_index type() const = 0; virtual std::type_index type() const = 0;
virtual platform::Place place() const = 0; virtual platform::Place place() const = 0;
virtual void set_type(std::type_index type) = 0; virtual void set_type(std::type_index type) = 0;
virtual void set_place(platform::Place place) = 0;
}; };
template <typename Place> template <typename Place>
...@@ -156,6 +162,7 @@ class Tensor { ...@@ -156,6 +162,7 @@ class Tensor {
virtual void* ptr() const { return static_cast<void*>(ptr_.get()); } virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
virtual std::type_index type() const { return type_; } virtual std::type_index type() const { return type_; }
virtual void set_type(std::type_index type) { type_ = type; } virtual void set_type(std::type_index type) { type_ = type; }
virtual void set_place(platform::Place place) { place_ = place; }
/*! the pointer of memory block. */ /*! the pointer of memory block. */
std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t, Place>> ptr_; std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t, Place>> ptr_;
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/framework/threadpool.h" #include "paddle/framework/threadpool.h"
#include "paddle/platform/enforce.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
std::unique_ptr<ThreadPool> ThreadPool::threadpool(nullptr); std::unique_ptr<ThreadPool> ThreadPool::threadpool_(nullptr);
std::once_flag ThreadPool::init_flag; std::once_flag ThreadPool::init_flag_;
// Returns the singleton ThreadPool. The first call runs Init() exactly once
// through std::call_once; the returned pointer is owned by threadpool_.
ThreadPool* ThreadPool::GetInstance() {
  std::call_once(init_flag_, [] { Init(); });
  ThreadPool* instance = threadpool_.get();
  return instance;
}
// Creates the singleton pool if it does not exist yet, sized to
// std::thread::hardware_concurrency(). Enforces that the reported thread
// count is greater than zero.
void ThreadPool::Init() {
  if (threadpool_) {
    return;  // already created
  }
  // TODO(Yancey1989): specify the max threads number
  int num_threads = std::thread::hardware_concurrency();
  PADDLE_ENFORCE_GT(num_threads, 0);
  threadpool_.reset(new ThreadPool(num_threads));
}
// Starts `num_threads` worker threads, each running TaskLoop() on this pool.
// All threads are initially counted as idle.
ThreadPool::ThreadPool(int num_threads)
    : total_threads_(num_threads), idle_threads_(num_threads), running_(true) {
  threads_.resize(num_threads);
  for (size_t i = 0; i < threads_.size(); ++i) {
    // TODO(Yancey1989): binding the thread on the specify CPU number
    threads_[i].reset(new std::thread([this] { TaskLoop(); }));
  }
}
// Stops the pool: clears running_, wakes every worker blocked on scheduled_,
// then joins and releases all worker threads.
ThreadPool::~ThreadPool() {
  {
    // Notify all threads to stop running. running_ is part of the predicate
    // the workers evaluate under mutex_, so it must be changed while holding
    // the same mutex; otherwise a worker could test the predicate, miss this
    // notification, and block forever, hanging the join() below.
    std::lock_guard<std::mutex> guard(mutex_);
    running_ = false;
    scheduled_.notify_all();
  }
  for (auto& t : threads_) {
    t->join();
    t.reset(nullptr);
  }
}
// Blocks the caller until Done() reports true, i.e. the task queue is empty
// and every thread is idle. Done() is evaluated while holding mutex_.
void ThreadPool::Wait() {
  std::unique_lock<std::mutex> guard(mutex_);
  while (!Done()) {
    completed_.wait(guard);
  }
}
void ThreadPool::TaskLoop() {
while (running_) {
std::unique_lock<std::mutex> lock(mutex_);
scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; });
if (!running_) {
break;
}
// pop a task from the task queue
auto task = std::move(tasks_.front());
tasks_.pop();
--idle_threads_;
lock.unlock();
// run the task
task();
{
std::unique_lock<std::mutex> lock(mutex_);
++idle_threads_;
if (Done()) {
completed_.notify_all();
}
}
}
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -20,52 +20,36 @@ limitations under the License. */ ...@@ -20,52 +20,36 @@ limitations under the License. */
#include <mutex> #include <mutex>
#include <queue> #include <queue>
#include <thread> #include <thread>
#include <vector>
#include "paddle/platform/enforce.h" #include "paddle/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
namespace paddle { namespace paddle {
namespace framework { namespace framework {
// ThreadPool maintains a queue of tasks, and runs them using a fixed
// number of threads.
class ThreadPool { class ThreadPool {
public: public:
typedef std::packaged_task<void()> Task; typedef std::packaged_task<void()> Task;
/** // Returns the singleton of ThreadPool.
* @brief Get a instance of threadpool, the thread number will static ThreadPool* GetInstance();
* be specified as the number of hardware thread contexts
*/
static ThreadPool* GetInstance() {
std::call_once(init_flag, &ThreadPool::Init);
return threadpool.get();
}
~ThreadPool() { ~ThreadPool();
{
// notify all threads to stop running
running_ = false;
scheduled_.notify_all();
}
for (auto& t : threads_) { // Returns the number of threads created by the constructor.
t->join(); size_t Threads() const { return total_threads_; }
t.reset(nullptr);
}
}
int GetNumThreads() const { return num_threads_; } // Returns the number of currently idle threads.
size_t IdleThreads() {
int GetAvailable() {
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
return available_; return idle_threads_;
} }
/** // Run pushes a function to the task queue and returns a std::future
* @brief Push a function to the queue, and will be scheduled and // object. To wait for the completion of the task, call
* executed if a thread is available. // std::future::wait().
* @param[in] Task, will be pushed to the task queue.
* @return std::future<void>, we could wait for the task finished by
* f.wait().
*/
template <typename Callback> template <typename Callback>
std::future<void> Run(Callback fn) { std::future<void> Run(Callback fn) {
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
...@@ -77,84 +61,40 @@ class ThreadPool { ...@@ -77,84 +61,40 @@ class ThreadPool {
return f; return f;
} }
/** // Wait until all the tasks are completed.
* @brief Wait until all the tasks are completed. void Wait();
*/
void Wait() {
std::unique_lock<std::mutex> lock(mutex_);
completed_.wait(lock, [=] { return Done() == true; });
}
private: private:
DISABLE_COPY_AND_ASSIGN(ThreadPool); DISABLE_COPY_AND_ASSIGN(ThreadPool);
explicit ThreadPool(int num_threads) explicit ThreadPool(int num_threads);
: num_threads_(num_threads), available_(num_threads), running_(true) {
threads_.resize(num_threads);
for (auto& thread : threads_) {
// TODO(Yancey1989): binding the thread on the specify CPU number
thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this)));
}
}
/** // If the task queue is empty and avaialbe is equal to the number of
* @brief If the task queue is empty and avaialbe // threads, means that all tasks are completed. Note: this function
* is equal to the number of threads, means that // is not thread-safe. Returns true if all tasks are completed.
* all tasks are completed. // Note: don't delete the data member total_threads_ and use
* // threads_.size() instead; because you'd need to lock the mutex
* Note: this function is not thread-safe. // before accessing threads_.
* bool Done() { return tasks_.empty() && idle_threads_ == total_threads_; }
* @return true if all tasks are completed.
*/
bool Done() { return tasks_.empty() && available_ == num_threads_; }
void TaskLoop() {
while (running_) {
std::unique_lock<std::mutex> lock(mutex_);
scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; });
if (!running_) { // The constructor starts threads to run TaskLoop, which retrieves
break; // and runs tasks from the queue.
} void TaskLoop();
// pop a task from the task queue
auto task = std::move(tasks_.front());
tasks_.pop();
--available_; // Init is called by GetInstance.
lock.unlock(); static void Init();
// run the task
task();
{
std::unique_lock<std::mutex> lock(mutex_);
++available_;
if (Done()) {
completed_.notify_all();
}
}
}
}
static void Init() {
if (threadpool.get() == nullptr) {
// TODO(Yancey1989): specify the max threads number
int num_threads = std::thread::hardware_concurrency();
PADDLE_ENFORCE_GT(num_threads, 0);
threadpool.reset(new ThreadPool(num_threads));
}
}
private: private:
static std::unique_ptr<ThreadPool> threadpool; static std::unique_ptr<ThreadPool> threadpool_;
static std::once_flag init_flag; static std::once_flag init_flag_;
int num_threads_;
int available_;
bool running_;
std::queue<Task> tasks_;
std::vector<std::unique_ptr<std::thread>> threads_; std::vector<std::unique_ptr<std::thread>> threads_;
const size_t total_threads_;
size_t idle_threads_;
std::queue<Task> tasks_;
std::mutex mutex_; std::mutex mutex_;
bool running_;
std::condition_variable scheduled_; std::condition_variable scheduled_;
std::condition_variable completed_; std::condition_variable completed_;
}; };
......
...@@ -22,11 +22,7 @@ namespace framework = paddle::framework; ...@@ -22,11 +22,7 @@ namespace framework = paddle::framework;
void do_sum(framework::ThreadPool* pool, std::atomic<int>& sum, int cnt) { void do_sum(framework::ThreadPool* pool, std::atomic<int>& sum, int cnt) {
std::vector<std::future<void>> fs; std::vector<std::future<void>> fs;
for (int i = 0; i < cnt; ++i) { for (int i = 0; i < cnt; ++i) {
auto f = pool->Run([&sum]() { sum.fetch_add(1); }); fs.push_back(framework::Async([&sum]() { sum.fetch_add(1); }));
fs.push_back(std::move(f));
}
for (auto& f : fs) {
f.wait();
} }
} }
......
...@@ -69,7 +69,7 @@ bool PriorBoxLayer::init(const LayerMap& layerMap, ...@@ -69,7 +69,7 @@ bool PriorBoxLayer::init(const LayerMap& layerMap,
if (maxSize_.size() > 0) CHECK_EQ(minSize_.size(), maxSize_.size()); if (maxSize_.size() > 0) CHECK_EQ(minSize_.size(), maxSize_.size());
// flip aspect ratios // flip aspect ratios
for (int index = 0; index < tmp.size(); index++) { for (unsigned index = 0; index < tmp.size(); index++) {
real ar = tmp[index]; real ar = tmp[index];
if (fabs(ar - 1.) < 1e-6) continue; if (fabs(ar - 1.) < 1e-6) continue;
aspectRatio_.push_back(ar); aspectRatio_.push_back(ar);
......
...@@ -991,8 +991,10 @@ TEST(Layer, SequenceLastInstanceLayer) { ...@@ -991,8 +991,10 @@ TEST(Layer, SequenceLastInstanceLayer) {
"seqlastins", "seqlastins",
"non-seq", "non-seq",
-1); // hasSubseq seqlastins to non-seq -1); // hasSubseq seqlastins to non-seq
testDegradeLayer( testDegradeLayer(true,
true, "seqlastins", "seq", -1); // hasSubseq seqlastins to seq "seqlastins",
"seq",
-1); // hasSubseq seqlastins to seq
} }
TEST(Layer, AverageLayer) { TEST(Layer, AverageLayer) {
...@@ -1001,8 +1003,10 @@ TEST(Layer, AverageLayer) { ...@@ -1001,8 +1003,10 @@ TEST(Layer, AverageLayer) {
"average", "average",
"non-seq", "non-seq",
5); // seq average to a shorten seq, stride window = 5 5); // seq average to a shorten seq, stride window = 5
testDegradeLayer( testDegradeLayer(true,
true, "average", "non-seq", -1); // hasSubseq average to non-seq "average",
"non-seq",
-1); // hasSubseq average to non-seq
testDegradeLayer(true, "average", "seq", -1); // hasSubseq average to seq testDegradeLayer(true, "average", "seq", -1); // hasSubseq average to seq
} }
...@@ -1287,8 +1291,9 @@ TEST(Layer, PoolLayer) { ...@@ -1287,8 +1291,9 @@ TEST(Layer, PoolLayer) {
testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true); testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true); testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true); testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
testPoolLayer2( testPoolLayer2("cudnn-avg-incl-pad-pool",
"cudnn-avg-incl-pad-pool", /* trans= */ false, /* useGpu= */ true); /* trans= */ false,
/* useGpu= */ true);
testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ true); testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ true);
#endif #endif
} }
...@@ -2431,18 +2436,21 @@ TEST(Layer, test3DDeConvLayer) { ...@@ -2431,18 +2436,21 @@ TEST(Layer, test3DDeConvLayer) {
} }
TEST(Layer, ScaleShiftLayer) { TEST(Layer, ScaleShiftLayer) {
const size_t batchSize = 16; // FIXME: Disable ScaleShiftLayer because it is not stable.
const size_t size = 32; // https://github.com/PaddlePaddle/Paddle/issues/7781
TestConfig config; return;
config.layerConfig.set_type("scale_shift"); // const size_t batchSize = 16;
config.layerConfig.set_size(size); // const size_t size = 32;
config.biasSize = 1; // TestConfig config;
config.inputDefs.push_back( // config.layerConfig.set_type("scale_shift");
{INPUT_DATA, "input", /* dim= */ size, /* paraSize= */ 1}); // config.layerConfig.set_size(size);
config.layerConfig.add_inputs(); // config.biasSize = 1;
for (auto useGpu : {false, true}) { // config.inputDefs.push_back(
testLayerGrad(config, "scale_shift", batchSize, false, useGpu, false); // {INPUT_DATA, "input", /* dim= */ size, /* paraSize= */ 1});
} // config.layerConfig.add_inputs();
// for (auto useGpu : {false, true}) {
// testLayerGrad(config, "scale_shift", batchSize, false, useGpu, false);
// }
} }
TEST(Layer, ScaleSubRegionLayer) { TEST(Layer, ScaleSubRegionLayer) {
......
set(FLUID_CORE_MODULES proto_desc paddle_memory executor prune init) set(FLUID_CORE_MODULES proto_desc paddle_memory lod_tensor executor prune init)
cc_library(paddle_fluid_api cc_library(paddle_fluid_api
SRCS inference.cc SRCS io.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
# Merge all modules into a single static library # Merge all modules into a single static library
cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
# Create shared library # Create shared library
add_library(paddle_fluid_shared SHARED inference.cc) add_library(paddle_fluid_shared SHARED io.cc)
target_circle_link_libraries(paddle_fluid_shared target_circle_link_libraries(paddle_fluid_shared
ARCHIVE_START ARCHIVE_START
...@@ -20,23 +20,10 @@ SET_TARGET_PROPERTIES(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) ...@@ -20,23 +20,10 @@ SET_TARGET_PROPERTIES(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
# install library & headers # install library & headers
if(NOT WITH_C_API AND WITH_FLUID) if(NOT WITH_C_API AND WITH_FLUID)
install(FILES inference.h DESTINATION include/paddle/inference) install(FILES io.h DESTINATION include/paddle/inference)
install(TARGETS paddle_fluid_shared DESTINATION lib) install(TARGETS paddle_fluid_shared DESTINATION lib)
endif() endif()
add_executable(example example.cc) if(WITH_TESTING)
if(APPLE) add_subdirectory(tests/book)
set(OPTIONAL_LINK_FLAGS)
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
set(OPTIONAL_LINK_FLAGS "-undefined dynamic_lookup")
endif()
target_link_libraries(example
-Wl,-force_load paddle_fluid
${OPTIONAL_LINK_FLAGS}
${PTOOLS_LIB})
else()
target_link_libraries(example
-Wl,--start-group -Wl,--whole-archive paddle_fluid
-Wl,--no-whole-archive -Wl,--end-group
${PTOOLS_LIB})
endif() endif()
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <time.h>
#include <iostream>
#include "gflags/gflags.h"
#include "paddle/inference/inference.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
// Example driver: loads the inference model found in --dirname, feeds one
// random 1x784 float tensor, runs the engine, and prints the dimensions and
// values of every fetched tensor. Returns 1 when --dirname is missing.
int main(int argc, char** argv) {
  google::ParseCommandLineFlags(&argc, &argv, true);
  if (FLAGS_dirname.empty()) {
    // Example:
    //   ./example --dirname=recognize_digits_mlp.inference.model
    std::cout << "Usage: ./example --dirname=path/to/your/model" << std::endl;
    return 1;
  }

  std::cout << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
  std::string dirname = FLAGS_dirname;

  // Value-initialized automatic object instead of new/delete, so the engine
  // is released on every exit path.
  paddle::InferenceEngine engine{};
  engine.LoadInferenceModel(dirname);

  // Fill the single input with uniform random values in [0, 1].
  paddle::framework::LoDTensor input;
  srand(time(0));
  float* input_ptr =
      input.mutable_data<float>({1, 784}, paddle::platform::CPUPlace());
  for (int i = 0; i < 784; ++i) {
    input_ptr[i] = rand() / (static_cast<float>(RAND_MAX));
  }

  std::vector<paddle::framework::LoDTensor> feeds;
  feeds.push_back(input);
  std::vector<paddle::framework::LoDTensor> fetchs;
  engine.Execute(feeds, fetchs);

  for (size_t i = 0; i < fetchs.size(); ++i) {
    auto dims_i = fetchs[i].dims();
    std::cout << "dims_i:";
    for (int j = 0; j < dims_i.size(); ++j) {
      std::cout << " " << dims_i[j];
    }
    std::cout << std::endl;

    std::cout << "result:";
    float* output_ptr = fetchs[i].data<float>();
    // The element count does not change inside the loop; compute it once.
    const auto num_elements = paddle::framework::product(dims_i);
    for (int j = 0; j < num_elements; ++j) {
      std::cout << " " << output_ptr[j];
    }
    std::cout << std::endl;
  }

  return 0;
}
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "inference.h"
#include <fstream>
#include "paddle/framework/executor.h"
#include "paddle/framework/feed_fetch_method.h"
#include "paddle/framework/init.h"
#include "paddle/framework/scope.h"
#ifdef PADDLE_USE_PTOOLS
#include "chooseser.h"
#endif
namespace paddle {
void InferenceEngine::LoadInferenceModel(const std::string& dirname) {
std::string model_filename = dirname + "/__model__.dat";
LOG(INFO) << "loading model from " << model_filename;
std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
std::string program_desc_str;
inputfs.seekg(0, std::ios::end);
program_desc_str.resize(inputfs.tellg());
inputfs.seekg(0, std::ios::beg);
LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
inputfs.read(&program_desc_str[0], program_desc_str.size());
inputfs.close();
program_ = new framework::ProgramDesc(program_desc_str);
GenerateLoadProgram(dirname);
framework::BlockDesc* global_block = program_->MutableBlock(0);
feed_var_names_.clear();
fetch_var_names_.clear();
for (auto* op : global_block->AllOps()) {
if (op->Type() == "feed") {
feed_var_names_.insert(feed_var_names_.begin(), op->Output("Out")[0]);
} else if (op->Type() == "fetch") {
fetch_var_names_.push_back(op->Input("X")[0]);
}
}
}
// Loads the serialized ProgramDesc from `dirname`/__model__.dat into
// program_, generates the parameter-loading program, and then wires the
// caller-supplied feed/fetch variable names into the program by prepending
// feed ops and appending fetch ops.
// Aborts via LOG(FATAL) if either name list is empty or the model file cannot
// be opened.
void InferenceEngine::LoadInferenceModel(
    const std::string& dirname,
    const std::vector<std::string>& feed_var_names,
    const std::vector<std::string>& fetch_var_names) {
  // Reject unusable arguments before touching the file system.
  if (feed_var_names.empty() || fetch_var_names.empty()) {
    LOG(FATAL) << "Please specify the feed_var_names and fetch_var_names.";
  }

  std::string model_filename = dirname + "/__model__.dat";
  LOG(INFO) << "loading model from " << model_filename;
  std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
  if (!inputfs.is_open()) {
    // Without this check tellg() below returns -1 and the resize() is asked
    // for an absurd size.
    LOG(FATAL) << "Cannot open model file " << model_filename;
  }

  // Read the whole file into a string.
  std::string program_desc_str;
  inputfs.seekg(0, std::ios::end);
  program_desc_str.resize(inputfs.tellg());
  inputfs.seekg(0, std::ios::beg);
  LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
  inputfs.read(&program_desc_str[0], program_desc_str.size());
  inputfs.close();

  program_ = new framework::ProgramDesc(program_desc_str);
  GenerateLoadProgram(dirname);

  feed_var_names_ = feed_var_names;
  fetch_var_names_ = fetch_var_names;
  PrependFeedOp();
  AppendFetchOp();
}
// Returns true when `var` is persistable, is not one of the special "feed" /
// "fetch" variables, and its name appears among the input argument names of
// at least one op in any block of program_.
bool InferenceEngine::IsParameter(const framework::VarDesc* var) {
  const bool is_candidate =
      var->Persistable() && var->Name() != "feed" && var->Name() != "fetch";
  if (!is_candidate) {
    return false;
  }
  // There are many unreachable variables in the program, so only variables
  // that some op actually reads are reported.
  for (size_t block_idx = 0; block_idx < program_->Size(); ++block_idx) {
    const framework::BlockDesc& block = program_->Block(block_idx);
    for (auto* op : block.AllOps()) {
      for (const auto& arg_name : op->InputArgumentNames()) {
        if (arg_name == var->Name()) {
          return true;
        }
      }
    }
  }
  return false;
}
void InferenceEngine::GenerateLoadProgram(const std::string& dirname) {
framework::BlockDesc* global_block = program_->MutableBlock(0);
load_program_ = new framework::ProgramDesc();
framework::BlockDesc* load_block = load_program_->MutableBlock(0);
for (auto* var : global_block->AllVars()) {
if (IsParameter(var)) {
LOG(INFO) << "parameter's name: " << var->Name();
framework::VarDesc* new_var = load_block->Var(var->Name());
new_var->SetShape(var->Shape());
new_var->SetDataType(var->GetDataType());
new_var->SetType(var->GetType());
new_var->SetLoDLevel(var->GetLoDLevel());
new_var->SetPersistable(true);
// append_op
framework::OpDesc* op = load_block->AppendOp();
op->SetType("load");
op->SetOutput("Out", {new_var->Name()});
op->SetAttr("file_path", {dirname + "/" + new_var->Name()});
op->CheckAttrs();
}
}
}
void InferenceEngine::PrependFeedOp() {
if (!program_) {
LOG(FATAL) << "Please initialize the program_ first.";
}
framework::BlockDesc* global_block = program_->MutableBlock(0);
// create_var
framework::VarDesc* feed_var = global_block->Var("feed");
feed_var->SetType(framework::proto::VarDesc::FEED_MINIBATCH);
feed_var->SetPersistable(true);
// prepend feed_op
for (size_t i = 0; i < feed_var_names_.size(); ++i) {
std::string var_name = feed_var_names_[i];
LOG(INFO) << "feed var's name: " << var_name;
// prepend_op
framework::OpDesc* op = global_block->PrependOp();
op->SetType("feed");
op->SetInput("X", {"feed"});
op->SetOutput("Out", {var_name});
op->SetAttr("col", {static_cast<int>(i)});
op->CheckAttrs();
}
}
void InferenceEngine::AppendFetchOp() {
if (!program_) {
LOG(FATAL) << "Please initialize the program_ first.";
}
framework::BlockDesc* global_block = program_->MutableBlock(0);
// create_var
framework::VarDesc* fetch_var = global_block->Var("fetch");
fetch_var->SetType(framework::proto::VarDesc::FETCH_LIST);
fetch_var->SetPersistable(true);
// append fetch_op
for (size_t i = 0; i < fetch_var_names_.size(); ++i) {
std::string var_name = fetch_var_names_[i];
LOG(INFO) << "fetch var's name: " << var_name;
// append_op
framework::OpDesc* op = global_block->AppendOp();
op->SetType("fetch");
op->SetInput("X", {var_name});
op->SetOutput("Out", {"fetch"});
op->SetAttr("col", {static_cast<int>(i)});
op->CheckAttrs();
}
}
// Runs one inference pass on the CPU.
//
// Runs load_program_ in a fresh scope, copies feeds[i] into column i of the
// "feed" variable for each feed name, runs program_, and then resizes
// `fetchs` to the number of fetch names and fills it from the "fetch"
// variable.
//
// Aborts via LOG(FATAL) when the programs are not initialized or when fewer
// tensors are supplied than there are feed names.
void InferenceEngine::Execute(const std::vector<framework::LoDTensor>& feeds,
                              std::vector<framework::LoDTensor>& fetchs) {
  if (!program_ || !load_program_) {
    LOG(FATAL) << "Please initialize the program_ and load_program_ first.";
  }

  if (feeds.size() < feed_var_names_.size()) {
    LOG(FATAL) << "Please feed " << feed_var_names_.size() << " input Tensors.";
  }

  // Automatic storage instead of new/delete: the place, executor and scope
  // are released even if one of the Run() calls below throws.
  platform::CPUPlace place;
  framework::InitDevices();
  framework::Executor executor(place);
  framework::Scope scope;

  executor.Run(*load_program_, &scope, 0, true, true);

  // set_feed_variable
  for (size_t i = 0; i < feed_var_names_.size(); ++i) {
    framework::SetFeedVariable(&scope, feeds[i], "feed", i);
  }

  executor.Run(*program_, &scope, 0, true, true);

  // get_fetch_variable
  fetchs.resize(fetch_var_names_.size());
  for (size_t i = 0; i < fetch_var_names_.size(); ++i) {
    fetchs[i] = framework::GetFetchVariable(scope, "fetch", i);
  }
}
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/inference/io.h"
#include <fstream>
#include "paddle/framework/block_desc.h"
#include "paddle/framework/feed_fetch_type.h"
namespace paddle {
namespace inference {
// Returns true when `var` is persistable and its name appears among the input
// argument names of at least one op of `main_program` (any block) whose type
// is not framework::kFeedOpType.
bool IsParameter(const framework::VarDesc* var,
                 const framework::ProgramDesc& main_program) {
  if (!var->Persistable()) {
    return false;
  }
  // There are many unreachable variables in the program, so only variables
  // that some op actually reads are reported.
  for (size_t block_idx = 0; block_idx < main_program.Size(); ++block_idx) {
    const framework::BlockDesc& block = main_program.Block(block_idx);
    for (auto* op : block.AllOps()) {
      if (op->Type() == framework::kFeedOpType) {
        continue;  // inputs of feed ops do not count as parameter uses
      }
      for (const auto& arg_name : op->InputArgumentNames()) {
        if (arg_name == var->Name()) {
          return true;
        }
      }
    }
  }
  return false;
}
// Loads the parameters of `main_program` from `dirname` into `scope`.
//
// Builds a temporary program containing, for every variable of block 0 that
// IsParameter() accepts, a persistable variable with the same description
// and a "load" op reading it from `dirname`/<variable name>, then runs that
// program with `executor`.
void LoadPersistables(framework::Executor& executor,
                      framework::Scope& scope,
                      const std::string& dirname,
                      const framework::ProgramDesc& main_program) {
  const framework::BlockDesc& global_block = main_program.Block(0);

  // Automatic storage instead of new/delete: the temporary program is
  // released even if executor.Run() throws.
  framework::ProgramDesc load_program;
  framework::BlockDesc* load_block = load_program.MutableBlock(0);
  for (auto* var : global_block.AllVars()) {
    if (IsParameter(var, main_program)) {
      VLOG(3) << "parameter's name: " << var->Name();

      framework::VarDesc* new_var = load_block->Var(var->Name());
      new_var->SetShape(var->Shape());
      new_var->SetDataType(var->GetDataType());
      new_var->SetType(var->GetType());
      new_var->SetLoDLevel(var->GetLoDLevel());
      new_var->SetPersistable(true);

      // append_op
      framework::OpDesc* op = load_block->AppendOp();
      op->SetType("load");
      op->SetOutput("Out", {new_var->Name()});
      op->SetAttr("file_path", {dirname + "/" + new_var->Name()});
      op->CheckAttrs();
    }
  }
  executor.Run(load_program, &scope, 0, true, true);
}
// Loads an inference model from `dirname`.
//
// Reads the binary file `dirname`/__model__, constructs a ProgramDesc from
// its contents, loads the program's parameters into `scope` through
// LoadPersistables(), and returns the program.
// Aborts via LOG(FATAL) if the model file cannot be opened.
std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
                                             framework::Scope& scope,
                                             const std::string& dirname) {
  std::string model_filename = dirname + "/__model__";
  LOG(INFO) << "loading model from " << model_filename;
  std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
  if (!inputfs.is_open()) {
    // Without this check tellg() below returns -1 and the resize() is asked
    // for an absurd size.
    LOG(FATAL) << "Cannot open model file " << model_filename;
  }

  // Read the whole file into a string.
  std::string program_desc_str;
  inputfs.seekg(0, std::ios::end);
  program_desc_str.resize(inputfs.tellg());
  inputfs.seekg(0, std::ios::beg);
  LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
  inputfs.read(&program_desc_str[0], program_desc_str.size());
  inputfs.close();

  std::unique_ptr<framework::ProgramDesc> main_program(
      new framework::ProgramDesc(program_desc_str));

  LoadPersistables(executor, scope, dirname, *main_program);
  return main_program;
}
} // namespace inference
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "paddle/framework/executor.h"
#include "paddle/framework/program_desc.h"
#include "paddle/framework/scope.h"
namespace paddle {
namespace inference {
// Loads the parameters of `main_program` into `scope` using `executor`.
// For each persistable variable of block 0 that is read as an input by some
// non-feed op, a "load" op reading the file `dirname`/<variable name> is
// generated; the generated program is then run once.
void LoadPersistables(framework::Executor& executor,
framework::Scope& scope,
const std::string& dirname,
const framework::ProgramDesc& main_program);
// Reads the serialized program from the file `dirname`/__model__, loads its
// parameters into `scope` via LoadPersistables(), and returns the resulting
// ProgramDesc, owned by the caller.
std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
framework::Scope& scope,
const std::string& dirname);
} // namespace inference
} // namespace paddle
# Root of the Python fluid tests; the model directory passed to the C++ test
# below is located underneath it.
set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests)
# C++ inference test for the recognize_digits MLP model. It is given the
# model directory through the --dirname flag.
# NOTE(review): ARCHIVE_START/ARCHIVE_END presumably make the linker keep the
# whole paddle_fluid archive -- confirm against the cc_test implementation.
cc_test(test_inference_recognize_digits_mlp
SRCS test_inference_recognize_digits.cc
DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
ARGS --dirname=${PYTHON_TESTS_DIR}/book/recognize_digits_mlp.inference.model)
# Order this test after test_recognize_digits_mlp_cpu.
# NOTE(review): presumably that Python test is what writes the
# recognize_digits_mlp.inference.model directory -- confirm.
set_tests_properties(test_inference_recognize_digits_mlp
PROPERTIES DEPENDS test_recognize_digits_mlp_cpu)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <time.h>
#include <sstream>
#include "gflags/gflags.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/inference/io.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
// Loads the inference model stored in `dirname` and runs it once on a device
// of type `Place`.
//
// Template parameters:
//   Place - place type; a default-constructed instance is used to build the
//           executor.
//   T     - not referenced in the body; kept so existing call sites compile.
//
// Parameters:
//   dirname    - model directory handed to paddle::inference::Load().
//   cpu_feeds  - input tensors; cpu_feeds[i] is bound to the i-th feed target
//                name reported by the loaded program.
//   cpu_fetchs - output tensors; cpu_fetchs[i] is bound to the i-th fetch
//                target name reported by the loaded program.
//
// Raises a fatal gtest failure and returns early when fewer tensors are
// supplied than the program has feed or fetch targets.
template <typename Place, typename T>
void TestInference(const std::string& dirname,
                   const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
                   std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
  // 1. Define place, executor and scope
  auto place = Place();
  auto executor = paddle::framework::Executor(place);
  // The scope has automatic storage so it is released on every exit path,
  // including the early returns below and any exception from Load() or Run().
  paddle::framework::Scope scope;

  // 2. Initialize the inference_program and load all parameters from file
  auto inference_program = paddle::inference::Load(executor, scope, dirname);

  // 3. Get the feed_target_names and fetch_target_names
  const std::vector<std::string>& feed_target_names =
      inference_program->GetFeedTargetNames();
  const std::vector<std::string>& fetch_target_names =
      inference_program->GetFetchTargetNames();

  // Both vectors are indexed by target position below; refuse to run rather
  // than read past the end of a vector that is too short.
  ASSERT_LE(feed_target_names.size(), cpu_feeds.size())
      << "Not enough feed tensors for the feed targets of the model.";
  ASSERT_LE(fetch_target_names.size(), cpu_fetchs.size())
      << "Not enough fetch tensors for the fetch targets of the model.";

  // 4. Prepare inputs: set up maps for feed targets
  std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
  for (size_t i = 0; i < feed_target_names.size(); ++i) {
    // Please make sure that cpu_feeds[i] is right for feed_target_names[i]
    feed_targets[feed_target_names[i]] = cpu_feeds[i];
  }

  // 5. Define Tensor to get the outputs: set up maps for fetch targets
  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
  for (size_t i = 0; i < fetch_target_names.size(); ++i) {
    fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
  }

  // 6. Run the inference program
  executor.Run(*inference_program, &scope, feed_targets, fetch_targets);
}
// Runs the model found in --dirname on one random {1, 28, 28} float input.
// The CPU run only logs the output dimensions. When built with
// PADDLE_WITH_CUDA the same input is also run on a CUDA place, and the two
// outputs must agree in shape and, element by element, within 1E-3.
TEST(inference, recognize_digits) {
  if (FLAGS_dirname.empty()) {
    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
  }

  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
  std::string dirname = FLAGS_dirname;

  // 0. Call `paddle::framework::InitDevices()` initialize all the devices
  // In unittests, this is done in paddle/testing/paddle_gtest_main.cc

  paddle::framework::LoDTensor input;
  srand(time(0));
  float* input_ptr =
      input.mutable_data<float>({1, 28, 28}, paddle::platform::CPUPlace());
  // Take the element count from the tensor itself so the fill loop cannot
  // drift from the shape requested above.
  const int64_t input_numel = input.numel();
  for (int64_t i = 0; i < input_numel; ++i) {
    input_ptr[i] = rand() / (static_cast<float>(RAND_MAX));
  }
  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
  cpu_feeds.push_back(&input);

  paddle::framework::LoDTensor output1;
  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
  cpu_fetchs1.push_back(&output1);

  // Run inference on CPU
  TestInference<paddle::platform::CPUPlace, float>(
      dirname, cpu_feeds, cpu_fetchs1);
  LOG(INFO) << output1.dims();

#ifdef PADDLE_WITH_CUDA
  paddle::framework::LoDTensor output2;
  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
  cpu_fetchs2.push_back(&output2);

  // Run inference on CUDA GPU
  TestInference<paddle::platform::CUDAPlace, float>(
      dirname, cpu_feeds, cpu_fetchs2);
  LOG(INFO) << output2.dims();

  EXPECT_EQ(output1.dims(), output2.dims());
  // Fatal on purpose: the loop below indexes both outputs by output1.numel(),
  // so continuing after a mismatch would read past the end of output2.
  ASSERT_EQ(output1.numel(), output2.numel());

  float err = 1E-3;
  int count = 0;
  for (int64_t i = 0; i < output1.numel(); ++i) {
    if (fabs(output1.data<float>()[i] - output2.data<float>()[i]) > err) {
      count++;
    }
  }
  EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
#endif
}
add_subdirectory(detail) add_subdirectory(detail)
cc_library(memory SRCS memory.cc DEPS place enforce) cc_library(memory SRCS memory.cc DEPS place enforce)
cc_library(memcpy SRCS memcpy.cc) cc_library(memcpy SRCS memcpy.cc DEPS place)
cc_library(paddle_memory cc_library(paddle_memory
DEPS DEPS
......
...@@ -147,6 +147,7 @@ op_library(max_sequence_len_op DEPS lod_rank_table) ...@@ -147,6 +147,7 @@ op_library(max_sequence_len_op DEPS lod_rank_table)
op_library(sequence_conv_op DEPS context_project) op_library(sequence_conv_op DEPS context_project)
op_library(sequence_pool_op DEPS sequence_pooling) op_library(sequence_pool_op DEPS sequence_pooling)
op_library(lstm_op DEPS sequence2batch lstm_compute) op_library(lstm_op DEPS sequence2batch lstm_compute)
op_library(lstmp_op DEPS sequence2batch lstm_compute)
op_library(gru_op DEPS sequence2batch gru_compute) op_library(gru_op DEPS sequence2batch gru_compute)
op_library(recurrent_op DEPS executor) op_library(recurrent_op DEPS executor)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale math_function) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale math_function)
...@@ -172,6 +173,8 @@ endif() ...@@ -172,6 +173,8 @@ endif()
# FIXME(typhoonzero): save/load depends lodtensor serialization functions # FIXME(typhoonzero): save/load depends lodtensor serialization functions
op_library(save_op DEPS lod_tensor) op_library(save_op DEPS lod_tensor)
op_library(load_op DEPS lod_tensor) op_library(load_op DEPS lod_tensor)
op_library(save_combine_op DEPS lod_tensor)
op_library(load_combine_op DEPS lod_tensor)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS}) foreach(src ${GENERAL_OPS})
...@@ -191,3 +194,4 @@ if(WITH_GPU) ...@@ -191,3 +194,4 @@ if(WITH_GPU)
cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif() endif()
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
...@@ -323,7 +323,7 @@ template <typename T> ...@@ -323,7 +323,7 @@ template <typename T>
struct FloorFunctor : public BaseActivationFunctor<T> { struct FloorFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out> template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const { void operator()(Device d, X x, Out out) const {
out.device(d) = x.ceil(); out.device(d) = x.floor();
} }
}; };
......
...@@ -82,7 +82,7 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> { ...@@ -82,7 +82,7 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
math::scatter::MergeAdd<platform::CUDADeviceContext, T> merge_func; math::scatter::MergeAdd<platform::CUDADeviceContext, T> merge_func;
auto grad_merge = merge_func(context, grad); auto grad_merge = merge_func(context, grad);
auto* grad_merge_data = grad_merge.mutable_value()->template data<T>(); auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
auto& merge_rows = grad_merge.rows(); framework::Vector<int64_t> merge_rows(grad_merge.rows());
// 2. m += g_m * g_m // 2. m += g_m * g_m
math::scatter::Mul<platform::CUDADeviceContext, T> sqare_func; math::scatter::Mul<platform::CUDADeviceContext, T> sqare_func;
auto grad_square = sqare_func(context, grad_merge, grad_merge); auto grad_square = sqare_func(context, grad_merge, grad_merge);
...@@ -101,8 +101,8 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> { ...@@ -101,8 +101,8 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
SparseAdagradFunctorKernel< SparseAdagradFunctorKernel<
T, 256><<<grid2, threads, 0, T, 256><<<grid2, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context) reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(grad_merge_data, grad_merge.rows().data(), .stream()>>>(grad_merge_data, merge_rows.cuda_data(), lr,
lr, param_data, moment_data, grad_width, param_data, moment_data, grad_width,
epsilon); epsilon);
} }
}; };
......
...@@ -199,7 +199,12 @@ class AdamOpKernel : public framework::OpKernel<T> { ...@@ -199,7 +199,12 @@ class AdamOpKernel : public framework::OpKernel<T> {
merge_func(ctx.template device_context<DeviceContext>(), grad); merge_func(ctx.template device_context<DeviceContext>(), grad);
auto& grad_tensor = grad_merge.value(); auto& grad_tensor = grad_merge.value();
const T* grad_data = grad_tensor.template data<T>(); const T* grad_data = grad_tensor.template data<T>();
auto* rows = grad_merge.rows().data(); int64_t* rows = nullptr;
if (platform::is_gpu_place(ctx.GetPlace())) {
rows = grad_merge.mutable_rows()->cuda_data();
} else {
rows = grad_merge.mutable_rows()->data();
}
auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
SparseAdamFunctor<T> functor( SparseAdamFunctor<T> functor(
......
...@@ -24,8 +24,18 @@ namespace operators { ...@@ -24,8 +24,18 @@ namespace operators {
void BeamSearch::operator()(const framework::LoDTensor &pre_ids, void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
framework::LoDTensor *selected_ids, framework::LoDTensor *selected_ids,
framework::LoDTensor *selected_scores) { framework::LoDTensor *selected_scores) {
auto abs_lod = framework::ToAbsOffset(ids_->lod());
auto &high_level = abs_lod[lod_level_];
auto items = SelectTopBeamSizeItems(); auto items = SelectTopBeamSizeItems();
auto selected_items = ToMap(items); auto selected_items = ToMap(items, high_level.back());
VLOG(3) << "selected_items:";
for (size_t i = 0; i < selected_items.size(); ++i) {
VLOG(3) << "offset:" << i;
for (auto &item : selected_items[i]) {
VLOG(3) << ItemToString(item);
}
}
PruneEndidCandidates(pre_ids, &selected_items); PruneEndidCandidates(pre_ids, &selected_items);
// calculate the output tensor's height // calculate the output tensor's height
size_t num_instances = std::accumulate( size_t num_instances = std::accumulate(
...@@ -63,11 +73,12 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids, ...@@ -63,11 +73,12 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
low_level.push_back(low_offset); low_level.push_back(low_offset);
// fill lod // fill lod
auto abs_lod = framework::ToAbsOffset(ids_->lod());
auto &high_level = abs_lod[lod_level_];
framework::LoD lod(2); framework::LoD lod(2);
lod[0].assign(high_level.begin(), high_level.end()); lod[0].assign(high_level.begin(), high_level.end());
lod[1].assign(low_level.begin(), low_level.end()); lod[1].assign(low_level.begin(), low_level.end());
if (!framework::CheckLoD(lod)) {
PADDLE_THROW("lod %s is not right", framework::LoDToString(lod));
}
selected_ids->set_lod(lod); selected_ids->set_lod(lod);
selected_scores->set_lod(lod); selected_scores->set_lod(lod);
} }
...@@ -90,13 +101,11 @@ int BeamSearch::PruneEndidCandidates(const framework::LoDTensor &pre_ids, ...@@ -90,13 +101,11 @@ int BeamSearch::PruneEndidCandidates(const framework::LoDTensor &pre_ids,
} }
std::vector<std::vector<BeamSearch::Item>> BeamSearch::ToMap( std::vector<std::vector<BeamSearch::Item>> BeamSearch::ToMap(
const std::vector<std::vector<Item>> &items) { const std::vector<std::vector<Item>> &items, size_t element_num) {
std::vector<std::vector<Item>> result; std::vector<std::vector<Item>> result;
result.resize(element_num);
for (auto &entries : items) { for (auto &entries : items) {
for (const auto &item : entries) { for (const auto &item : entries) {
if (item.offset >= result.size()) {
result.resize(item.offset + 1);
}
result[item.offset].push_back(item); result[item.offset].push_back(item);
} }
} }
...@@ -122,6 +131,14 @@ BeamSearch::SelectTopBeamSizeItems() { ...@@ -122,6 +131,14 @@ BeamSearch::SelectTopBeamSizeItems() {
} }
result.emplace_back(items); result.emplace_back(items);
} }
VLOG(3) << "SelectTopBeamSizeItems result size " << result.size();
for (auto &items : result) {
VLOG(3) << "item set:";
for (auto &item : items) {
VLOG(3) << ItemToString(item);
}
}
return result; return result;
} }
...@@ -159,6 +176,22 @@ bool BeamSearch::NextItemSet(std::vector<BeamSearch::Item> *items) { ...@@ -159,6 +176,22 @@ bool BeamSearch::NextItemSet(std::vector<BeamSearch::Item> *items) {
return true; return true;
} }
std::ostream &operator<<(std::ostream &os, const BeamSearch::Item &item) {
os << "{";
os << "offset: " << item.offset << ", ";
os << "id: " << item.id << ", ";
os << "score: " << item.score << "";
os << "}";
return os;
}
std::string ItemToString(const BeamSearch::Item &item) {
std::ostringstream stream;
stream << item;
return stream.str();
}
class BeamSearchProtoAndCheckerMaker class BeamSearchProtoAndCheckerMaker
: public framework::OpProtoAndCheckerMaker { : public framework::OpProtoAndCheckerMaker {
public: public:
...@@ -186,8 +219,40 @@ class BeamSearchProtoAndCheckerMaker ...@@ -186,8 +219,40 @@ class BeamSearchProtoAndCheckerMaker
} }
}; };
class BeamSearchInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
for (const std::string &arg :
std::vector<std::string>({"pre_ids", "ids", "scores"})) {
PADDLE_ENFORCE(context->HasInput(arg),
"BeamSearch need input argument '%s'", arg);
}
for (const std::string &arg :
std::vector<std::string>({"selected_ids", "selected_scores"})) {
PADDLE_ENFORCE(context->HasOutput(arg),
"BeamSearch need output argument '%s'", arg);
}
}
};
class BeamSearchInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
for (auto &o : op_desc.Output("selected_ids")) {
block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR);
}
for (auto &o : op_desc.Output("selected_scores")) {
block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR);
}
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_OP_WITHOUT_GRADIENT(beam_search, paddle::operators::BeamSearchOp, REGISTER_OPERATOR(beam_search, paddle::operators::BeamSearchOp,
paddle::operators::BeamSearchProtoAndCheckerMaker); paddle::operators::BeamSearchProtoAndCheckerMaker,
paddle::operators::BeamSearchInferShape,
paddle::operators::BeamSearchInferVarType,
paddle::framework::EmptyGradOpMaker);
...@@ -136,8 +136,6 @@ class BeamSearch { ...@@ -136,8 +136,6 @@ class BeamSearch {
void operator()(const framework::LoDTensor& pre_ids, void operator()(const framework::LoDTensor& pre_ids,
framework::LoDTensor* selected_ids, framework::LoDTensor* selected_ids,
framework::LoDTensor* selected_scores); framework::LoDTensor* selected_scores);
protected:
/* /*
* The basic items help to sort. * The basic items help to sort.
*/ */
...@@ -155,6 +153,7 @@ class BeamSearch { ...@@ -155,6 +153,7 @@ class BeamSearch {
score_t score; score_t score;
}; };
protected:
/* /*
* Delete all the records that follows the end token. * Delete all the records that follows the end token.
*/ */
...@@ -166,7 +165,7 @@ class BeamSearch { ...@@ -166,7 +165,7 @@ class BeamSearch {
* NOTE low performance * NOTE low performance
*/ */
std::vector<std::vector<Item>> ToMap( std::vector<std::vector<Item>> ToMap(
const std::vector<std::vector<Item>>& inputs); const std::vector<std::vector<Item>>& inputs, size_t element_num);
/* /*
* For each source, select top beam_size records. * For each source, select top beam_size records.
...@@ -187,6 +186,10 @@ class BeamSearch { ...@@ -187,6 +186,10 @@ class BeamSearch {
int end_id_{0}; int end_id_{0};
}; };
std::ostream& operator<<(std::ostream& os, const BeamSearch::Item& item);
std::string ItemToString(const BeamSearch::Item& item);
class BeamSearchOp : public framework::OperatorBase { class BeamSearchOp : public framework::OperatorBase {
public: public:
BeamSearchOp(const std::string& type, BeamSearchOp(const std::string& type,
...@@ -203,7 +206,6 @@ class BeamSearchOp : public framework::OperatorBase { ...@@ -203,7 +206,6 @@ class BeamSearchOp : public framework::OperatorBase {
void Run(const framework::Scope& scope, void Run(const framework::Scope& scope,
const platform::Place& dev_place) const override { const platform::Place& dev_place) const override {
LOG(INFO) << "run beam search op";
auto ids_var = scope.FindVar(Input("ids")); auto ids_var = scope.FindVar(Input("ids"));
auto scores_var = scope.FindVar(Input("scores")); auto scores_var = scope.FindVar(Input("scores"));
auto pre_ids_var = scope.FindVar(Input("pre_ids")); auto pre_ids_var = scope.FindVar(Input("pre_ids"));
...@@ -217,10 +219,8 @@ class BeamSearchOp : public framework::OperatorBase { ...@@ -217,10 +219,8 @@ class BeamSearchOp : public framework::OperatorBase {
size_t level = Attr<int>("level"); size_t level = Attr<int>("level");
size_t beam_size = Attr<int>("beam_size"); size_t beam_size = Attr<int>("beam_size");
int end_id = Attr<int>("end_id"); int end_id = Attr<int>("end_id");
LOG(INFO) << "init beam search";
BeamSearch alg(ids, scores, level, beam_size, end_id); BeamSearch alg(ids, scores, level, beam_size, end_id);
LOG(INFO) << "after beam search";
auto selected_ids_var = scope.FindVar(Output("selected_ids")); auto selected_ids_var = scope.FindVar(Output("selected_ids"));
auto selected_scores_var = scope.FindVar(Output("selected_scores")); auto selected_scores_var = scope.FindVar(Output("selected_scores"));
PADDLE_ENFORCE_NOT_NULL(selected_ids_var); PADDLE_ENFORCE_NOT_NULL(selected_ids_var);
...@@ -229,9 +229,7 @@ class BeamSearchOp : public framework::OperatorBase { ...@@ -229,9 +229,7 @@ class BeamSearchOp : public framework::OperatorBase {
*selected_ids_var->GetMutable<framework::LoDTensor>(); *selected_ids_var->GetMutable<framework::LoDTensor>();
auto& selected_scores_tensor = auto& selected_scores_tensor =
*selected_scores_var->GetMutable<framework::LoDTensor>(); *selected_scores_var->GetMutable<framework::LoDTensor>();
LOG(INFO) << "run beam search";
alg(pre_ids, &selected_ids_tensor, &selected_scores_tensor); alg(pre_ids, &selected_ids_tensor, &selected_scores_tensor);
LOG(INFO) << "finish beam search";
} }
}; };
......
...@@ -69,12 +69,11 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel<T> { ...@@ -69,12 +69,11 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
auto stream = ctx.cuda_device_context().stream(); auto stream = ctx.cuda_device_context().stream();
MergeAndDelCudaKernel<T><<<1, 1, 0, stream>>>( MergeAndDelCudaKernel<T><<<1, 1, 0, stream>>>(
num_tokens, tokens, num_seq, input_lod[level].data(), blank, num_tokens, tokens, num_seq, input_lod[level].cuda_data(), blank,
merge_repeated, dev_out_lod0_ptr, output_data); merge_repeated, dev_out_lod0_ptr, output_data);
// set output lod // set output lod
thrust::host_vector<size_t> host_out_lod0(dev_out_lod0.begin(), std::vector<size_t> host_out_lod0(dev_out_lod0.begin(), dev_out_lod0.end());
dev_out_lod0.end());
framework::LoD out_lod; framework::LoD out_lod;
out_lod.push_back(host_out_lod0); out_lod.push_back(host_out_lod0);
output->set_lod(out_lod); output->set_lod(out_lod);
......
...@@ -51,7 +51,7 @@ class CTCAlignKernel : public framework::OpKernel<T> { ...@@ -51,7 +51,7 @@ class CTCAlignKernel : public framework::OpKernel<T> {
T prev_token = -1; T prev_token = -1;
for (size_t i = input_lod[level][seq_idx]; for (size_t i = input_lod[level][seq_idx];
i < input_lod[level][seq_idx + 1]; ++i) { i < input_lod[level][seq_idx + 1]; ++i) {
if (input_data[i] != blank && if ((unsigned)input_data[i] != blank &&
!(merge_repeated && input_data[i] == prev_token)) { !(merge_repeated && input_data[i] == prev_token)) {
output_data[output_idx] = input_data[i]; output_data[output_idx] = input_data[i];
++output_idx; ++output_idx;
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "grpc_client.h" #include "grpc_client.h"
#include "paddle/framework/threadpool.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace detail {
...@@ -22,25 +23,32 @@ bool RPCClient::AsyncSendVariable(const std::string& ep, ...@@ -22,25 +23,32 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
const framework::Scope& scope, const framework::Scope& scope,
const std::string& var_name, const std::string& var_name,
int64_t time_out) { int64_t time_out) {
const platform::DeviceContext* p_ctx = &ctx;
const std::string ep_val = ep;
const std::string var_name_val = var_name;
const framework::Scope* p_scope = &scope;
const auto ch = GetChannel(ep_val);
framework::Async([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, this] {
auto* var = p_scope->FindVar(var_name_val);
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
auto* var = scope.FindVar(var_name); SerializeToMessage(var_name_val, var, *p_ctx, &req);
SerializeToMessage(var_name, var, ctx, &req);
// varhandle // varhandle
VarHandle var_h; VarHandle var_h;
var_h.ep = ep; var_h.ep = ep_val;
var_h.scope = &scope; var_h.scope = p_scope;
var_h.name = var_name; var_h.name = var_name_val;
var_h.ctx = &ctx; var_h.ctx = p_ctx;
// stub context // stub context
auto ch = GetChannel(ep);
SendProcessor* s = new SendProcessor(ch); SendProcessor* s = new SendProcessor(ch);
s->Prepare(var_h, time_out); s->Prepare(var_h, time_out);
s->response_call_back_ = NULL; s->response_call_back_ = NULL;
auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, (void*)s); rpc->Finish(&s->reply_, &s->status_, (void*)s);
});
req_count_++; req_count_++;
...@@ -50,8 +58,6 @@ bool RPCClient::AsyncSendVariable(const std::string& ep, ...@@ -50,8 +58,6 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
void ProcGetResponse(const VarHandle& var_h, void ProcGetResponse(const VarHandle& var_h,
const sendrecv::VariableMessage& ret_msg) { const sendrecv::VariableMessage& ret_msg) {
auto* outvar = var_h.scope->FindVar(var_h.name); auto* outvar = var_h.scope->FindVar(var_h.name);
std::istringstream iss(ret_msg.serialized());
DeserializeFromMessage(ret_msg, *var_h.ctx, outvar); DeserializeFromMessage(ret_msg, *var_h.ctx, outvar);
} }
...@@ -60,44 +66,78 @@ bool RPCClient::AsyncGetVariable(const std::string& ep, ...@@ -60,44 +66,78 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
const framework::Scope& scope, const framework::Scope& scope,
const std::string& var_name, const std::string& var_name,
int64_t time_out) { int64_t time_out) {
const platform::DeviceContext* p_ctx = &ctx;
const std::string ep_val = ep;
const std::string var_name_val = var_name;
const framework::Scope* p_scope = &scope;
const auto ch = GetChannel(ep_val);
framework::Async([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] {
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
req.set_varname(var_name); req.set_varname(var_name_val);
// varhandle // varhandle
VarHandle var_h; VarHandle var_h;
var_h.ep = ep; var_h.ep = ep_val;
var_h.scope = &scope; var_h.scope = p_scope;
var_h.name = var_name; var_h.name = var_name_val;
var_h.ctx = &ctx; var_h.ctx = p_ctx;
// stub context // stub context
auto ch = GetChannel(ep);
GetProcessor* s = new GetProcessor(ch); GetProcessor* s = new GetProcessor(ch);
s->Prepare(var_h, time_out); s->Prepare(var_h, time_out);
s->response_call_back_ = ProcGetResponse; s->response_call_back_ = ProcGetResponse;
auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, (void*)s); rpc->Finish(&s->reply_, &s->status_, (void*)s);
});
req_count_++; req_count_++;
return true; return true;
} }
bool RPCClient::Wait() { bool RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) {
bool ok = true; const auto ch = GetChannel(ep);
BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
s->Prepare(time_out);
sendrecv::VariableMessage req;
req.set_varname(BATCH_BARRIER_MESSAGE);
auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, (void*)s);
req_count_++;
return true;
}
while (true) { bool RPCClient::Wait() {
if (req_count_ <= 0) { if (req_count_ <= 0) {
break; return true;
}
const size_t kReqCnt = req_count_;
bool a[kReqCnt];
std::vector<std::future<void>> waits(req_count_);
for (int i = 0; i < req_count_; i++) {
waits[i] = framework::Async([i, &a, this] { a[i] = Proceed(); });
}
for (int i = 0; i < req_count_; i++) {
waits[i].wait();
} }
if (!Proceed()) { int last_req_count = req_count_;
req_count_ = 0;
for (int i = 0; i < last_req_count; i++) {
if (!a[i]) {
return false; return false;
} }
} }
return ok; return true;
} }
bool RPCClient::Proceed() { bool RPCClient::Proceed() {
...@@ -124,7 +164,6 @@ bool RPCClient::Proceed() { ...@@ -124,7 +164,6 @@ bool RPCClient::Proceed() {
c->Process(); c->Process();
delete c; delete c;
req_count_--;
return true; return true;
} }
......
...@@ -71,6 +71,15 @@ class ClientBase { ...@@ -71,6 +71,15 @@ class ClientBase {
context_->set_deadline(deadline); context_->set_deadline(deadline);
} }
virtual void Prepare(int64_t time_out) {
context_.reset(new grpc::ClientContext());
std::chrono::system_clock::time_point deadline =
std::chrono::system_clock::now() + std::chrono::milliseconds(time_out);
context_->set_deadline(deadline);
}
virtual void Process() = 0; virtual void Process() = 0;
std::unique_ptr<sendrecv::SendRecvService::Stub> stub_; std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
...@@ -117,6 +126,17 @@ class GetProcessor : public ClientBase { ...@@ -117,6 +126,17 @@ class GetProcessor : public ClientBase {
RequestGetCallBack response_call_back_ = ProcGetResponse; RequestGetCallBack response_call_back_ = ProcGetResponse;
}; };
class BatchBarrierProcessor : public ClientBase {
public:
explicit BatchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
: ClientBase(ch) {}
virtual ~BatchBarrierProcessor() {}
virtual void Process() {}
sendrecv::VoidMessage reply_;
};
class RPCClient { class RPCClient {
public: public:
bool AsyncSendVariable(const std::string& ep, bool AsyncSendVariable(const std::string& ep,
...@@ -130,6 +150,10 @@ class RPCClient { ...@@ -130,6 +150,10 @@ class RPCClient {
const framework::Scope& scope, const framework::Scope& scope,
const std::string& var_name, const std::string& var_name,
int64_t time_out = 600 * 1000); int64_t time_out = 600 * 1000);
bool AsyncSendBatchBarrier(const std::string& ep,
int64_t time_out = 600 * 1000);
bool Wait(); bool Wait();
private: private:
......
...@@ -132,6 +132,7 @@ void AsyncGRPCServer::RunSyncUpdate() { ...@@ -132,6 +132,7 @@ void AsyncGRPCServer::RunSyncUpdate() {
cq_send_ = builder.AddCompletionQueue(); cq_send_ = builder.AddCompletionQueue();
cq_get_ = builder.AddCompletionQueue(); cq_get_ = builder.AddCompletionQueue();
server_ = builder.BuildAndStart(); server_ = builder.BuildAndStart();
LOG(INFO) << "Server listening on " << address_ << std::endl; LOG(INFO) << "Server listening on " << address_ << std::endl;
...@@ -141,11 +142,11 @@ void AsyncGRPCServer::RunSyncUpdate() { ...@@ -141,11 +142,11 @@ void AsyncGRPCServer::RunSyncUpdate() {
std::bind(&AsyncGRPCServer::TryToRegisterNewGetOne, this); std::bind(&AsyncGRPCServer::TryToRegisterNewGetOne, this);
t_send_.reset( t_send_.reset(
new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, false, new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
cq_send_.get(), "cq_send", send_register))); cq_send_.get(), "cq_send", send_register)));
t_get_.reset( t_get_.reset(
new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, true, new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
cq_get_.get(), "cq_get", get_register))); cq_get_.get(), "cq_get", get_register)));
// wait server // wait server
...@@ -174,7 +175,7 @@ void AsyncGRPCServer::TryToRegisterNewSendOne() { ...@@ -174,7 +175,7 @@ void AsyncGRPCServer::TryToRegisterNewSendOne() {
} }
RequestSend* send = RequestSend* send =
new RequestSend(&service_, cq_send_.get(), &var_recv_queue_); new RequestSend(&service_, cq_send_.get(), &var_recv_queue_);
VLOG(4) << "create RequestSend status:" << send->Status(); VLOG(4) << "Create RequestSend status:" << send->Status();
} }
void AsyncGRPCServer::TryToRegisterNewGetOne() { void AsyncGRPCServer::TryToRegisterNewGetOne() {
...@@ -184,11 +185,11 @@ void AsyncGRPCServer::TryToRegisterNewGetOne() { ...@@ -184,11 +185,11 @@ void AsyncGRPCServer::TryToRegisterNewGetOne() {
} }
RequestGet* get = new RequestGet(&service_, cq_get_.get(), scope_, dev_ctx_, RequestGet* get = new RequestGet(&service_, cq_get_.get(), scope_, dev_ctx_,
&var_get_queue_); &var_get_queue_);
VLOG(4) << "create Requestget status:" << get->Status(); VLOG(4) << "Create RequestGet status:" << get->Status();
} }
// FIXME(typhoonzero): remove wait argument and change cq_name to enum. // FIXME(typhoonzero): change cq_name to enum.
void AsyncGRPCServer::HandleRequest(bool wait, grpc::ServerCompletionQueue* cq, void AsyncGRPCServer::HandleRequest(grpc::ServerCompletionQueue* cq,
std::string cq_name, std::string cq_name,
std::function<void()> TryToRegisterNewOne) { std::function<void()> TryToRegisterNewOne) {
TryToRegisterNewOne(); TryToRegisterNewOne();
......
...@@ -57,8 +57,7 @@ class AsyncGRPCServer final : public sendrecv::SendRecvService::Service { ...@@ -57,8 +57,7 @@ class AsyncGRPCServer final : public sendrecv::SendRecvService::Service {
void ShutDown(); void ShutDown();
protected: protected:
void HandleRequest(bool wait, grpc::ServerCompletionQueue *cq, void HandleRequest(grpc::ServerCompletionQueue *cq, std::string cq_name,
std::string cq_name,
std::function<void()> TryToRegisterNewOne); std::function<void()> TryToRegisterNewOne);
void TryToRegisterNewSendOne(); void TryToRegisterNewSendOne();
void TryToRegisterNewGetOne(); void TryToRegisterNewGetOne();
......
...@@ -30,6 +30,9 @@ namespace paddle { ...@@ -30,6 +30,9 @@ namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace detail {
#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
void SerializeToMessage(const std::string& name, const framework::Variable* var, void SerializeToMessage(const std::string& name, const framework::Variable* var,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
sendrecv::VariableMessage* msg); sendrecv::VariableMessage* msg);
......
...@@ -51,6 +51,13 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -51,6 +51,13 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
"'dropout_prob' must be between 0.0 and 1.0."); "'dropout_prob' must be between 0.0 and 1.0.");
}); });
AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false); AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
AddAttr<bool>("fix_seed",
"A flag indicating whether to use a fixed seed to generate "
"random mask. NOTE: DO NOT set this flag to true in "
"training. Setting this flag to true is only useful in "
"unittest or for debug that always the same output units "
"will be dropped.")
.SetDefault(false);
AddAttr<int>("seed", "Dropout random seed.").SetDefault(0); AddAttr<int>("seed", "Dropout random seed.").SetDefault(0);
AddComment(R"DOC( AddComment(R"DOC(
......
...@@ -62,7 +62,11 @@ class GPUDropoutKernel : public framework::OpKernel<T> { ...@@ -62,7 +62,11 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
auto* mask = context.Output<Tensor>("Mask"); auto* mask = context.Output<Tensor>("Mask");
auto* mask_data = mask->mutable_data<T>(context.GetPlace()); auto* mask_data = mask->mutable_data<T>(context.GetPlace());
int size = framework::product(mask->dims()); int size = framework::product(mask->dims());
int seed = context.Attr<int>("seed");
std::random_device rnd;
int seed =
context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
thrust::counting_iterator<unsigned int> index_sequence_begin(0); thrust::counting_iterator<unsigned int> index_sequence_begin(0);
thrust::transform(index_sequence_begin, index_sequence_begin + size, thrust::transform(index_sequence_begin, index_sequence_begin + size,
thrust::device_ptr<T>(mask_data), thrust::device_ptr<T>(mask_data),
......
...@@ -38,9 +38,15 @@ class CPUDropoutKernel : public framework::OpKernel<T> { ...@@ -38,9 +38,15 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
if (!context.Attr<bool>("is_test")) { if (!context.Attr<bool>("is_test")) {
auto* mask = context.Output<Tensor>("Mask"); auto* mask = context.Output<Tensor>("Mask");
auto* mask_data = mask->mutable_data<T>(context.GetPlace()); auto* mask_data = mask->mutable_data<T>(context.GetPlace());
int seed = context.Attr<int>("seed");
// NOTE: fixed seed should only be used in unittest or for debug.
// Guarantee to use random seed in training.
std::random_device rnd;
std::minstd_rand engine; std::minstd_rand engine;
int seed =
context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
engine.seed(seed); engine.seed(seed);
std::uniform_real_distribution<float> dist(0, 1); std::uniform_real_distribution<float> dist(0, 1);
size_t size = framework::product(mask->dims()); size_t size = framework::product(mask->dims());
for (size_t i = 0; i < size; ++i) { for (size_t i = 0; i < size; ++i) {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/elementwise_pow_op.h"
#include "paddle/operators/elementwise_op.h"
namespace paddle {
namespace operators {
// Proto maker for the elementwise_pow operator.
// Construction is forwarded to ElementwiseOpMaker; this class then sets the
// comment from the name "Pow" and the equation "Out = X ^ Y" and attaches it.
class ElementwisePowOpMaker : public ElementwiseOpMaker {
public:
ElementwisePowOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: ElementwiseOpMaker(proto, op_checker) {
// SetComment is expected to fill comment_, which is then published below.
// NOTE(review): both come from ElementwiseOpMaker -- confirm there.
SetComment("Pow", "Out = X ^ Y");
AddComment(comment_);
}
};
} // namespace operators
} // namespace paddle
// Short alias used by the registration macros below.
namespace ops = paddle::operators;
// Register elementwise_pow with ElementwiseOp as the operator class and
// ElementwisePowOpMaker as the proto maker. No gradient operator is
// registered for it.
REGISTER_OP_WITHOUT_GRADIENT(elementwise_pow, ops::ElementwiseOp,
ops::ElementwisePowOpMaker);
// CPU kernels are instantiated for float and double.
REGISTER_OP_CPU_KERNEL(
elementwise_pow,
ops::ElementwisePowKernel<paddle::platform::CPUDeviceContext, float>,
ops::ElementwisePowKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/operators/elementwise_pow_op.h"
// Short alias used by the registration macro below.
namespace ops = paddle::operators;
// CUDA kernels for elementwise_pow, instantiated for float and double.
REGISTER_OP_CUDA_KERNEL(
elementwise_pow,
ops::ElementwisePowKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwisePowKernel<paddle::platform::CUDADeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cmath>
#include "paddle/operators/elementwise_op_function.h"
namespace paddle {
namespace operators {
// Binary functor returning `base` raised to the power `exponent`.
// The result of std::pow is converted back to T before it is returned.
template <typename T>
struct PowFunctor {
  inline HOSTDEVICE T operator()(T base, T exponent) const {
    const T powered = std::pow(base, exponent);
    return powered;
  }
};
// Kernel of elementwise_pow: delegates the whole computation to
// ElementwiseComputeEx, parameterized with PowFunctor<T>.
template <typename DeviceContext, typename T>
class ElementwisePowKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    using Functor = PowFunctor<T>;
    ElementwiseComputeEx<Functor, DeviceContext, T>(ctx);
  }
};
} // namespace operators
} // namespace paddle
...@@ -135,12 +135,12 @@ class GRUOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -135,12 +135,12 @@ class GRUOpMaker : public framework::OpProtoAndCheckerMaker {
AddComment(R"DOC( AddComment(R"DOC(
GRU Operator implements part calculations of the complete GRU as following: GRU Operator implements part calculations of the complete GRU as following:
\f[ $$
update \ gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\ update\_gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\
reset \ gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r) \\ reset\_gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r) \\
output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\ output\_candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\
output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t) output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t)
\f] $$
@note To implement the complete GRU, fully-connected operator must be used @note To implement the complete GRU, fully-connected operator must be used
before to feed xu, xr and xc as the Input of GRU operator. before to feed xu, xr and xc as the Input of GRU operator.
......
...@@ -30,11 +30,12 @@ using Tensor = framework::Tensor; ...@@ -30,11 +30,12 @@ using Tensor = framework::Tensor;
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
inline void ReorderInitState(const DeviceContext& ctx, inline void ReorderInitState(const DeviceContext& ctx,
const framework::Tensor& src, const size_t* index, const framework::Tensor& src,
framework::Vector<size_t> index_lod,
framework::Tensor* dst, bool indexed_src) { framework::Tensor* dst, bool indexed_src) {
math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle; math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
dst->mutable_data<T>(src.dims(), ctx.GetPlace()); dst->mutable_data<T>(src.dims(), ctx.GetPlace());
row_shuffle(ctx, src, index, *dst, indexed_src); row_shuffle(ctx, src, index_lod, *dst, indexed_src);
} }
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
...@@ -76,7 +77,9 @@ class GRUKernel : public framework::OpKernel<T> { ...@@ -76,7 +77,9 @@ class GRUKernel : public framework::OpKernel<T> {
gru_value.state_weight = gru_value.state_weight =
const_cast<T*>(weight_data + 2 * frame_size * frame_size); const_cast<T*>(weight_data + 2 * frame_size * frame_size);
Tensor ordered_h0; Tensor ordered_h0;
const size_t* order = batch_gate->lod()[2].data();
framework::Vector<size_t> order(batch_gate->lod()[2]);
if (h0) { if (h0) {
// Since the batch computing for GRU reorders the input sequences // Since the batch computing for GRU reorders the input sequences
// according to their length. The initialized cell state also needs // according to their length. The initialized cell state also needs
...@@ -159,7 +162,9 @@ class GRUGradKernel : public framework::OpKernel<T> { ...@@ -159,7 +162,9 @@ class GRUGradKernel : public framework::OpKernel<T> {
zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast<T>(0.0)); zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast<T>(0.0));
Tensor ordered_h0, ordered_h0_grad; Tensor ordered_h0, ordered_h0_grad;
const size_t* order = batch_gate->lod()[2].data();
framework::Vector<size_t> order(batch_gate->lod()[2]);
if (h0) { if (h0) {
ReorderInitState<DeviceContext, T>(dev_ctx, *h0, order, &ordered_h0, ReorderInitState<DeviceContext, T>(dev_ctx, *h0, order, &ordered_h0,
true); true);
......
...@@ -79,7 +79,7 @@ class Im2SequenceKernel : public framework::OpKernel<T> { ...@@ -79,7 +79,7 @@ class Im2SequenceKernel : public framework::OpKernel<T> {
framework::LoD lod(1); framework::LoD lod(1);
lod[0].reserve(batch_size + 1); lod[0].reserve(batch_size + 1);
for (int i = 0, offset = 0; i < batch_size + 1; ++i) { for (int i = 0, offset = 0; i < batch_size + 1; ++i) {
lod[0][i] = offset; lod[0].push_back(offset);
offset += output_height * output_width; offset += output_height * output_width;
} }
out->set_lod(lod); out->set_lod(lod);
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/label_smooth_op.h"
namespace paddle {
namespace operators {
// Operator class of label_smooth: validates the inputs/outputs and infers
// the shape of Output(Out).
class LabelSmoothOp : public framework::OperatorWithKernel {
 public:
  LabelSmoothOp(const std::string &type,
                const framework::VariableNameMap &inputs,
                const framework::VariableNameMap &outputs,
                const framework::AttributeMap &attrs)
      : OperatorWithKernel(type, inputs, outputs, attrs) {}

  // Requires Input(X) and Output(Out). When Input(PriorDist) is present its
  // element count must equal the second dimension of X. Out shares X's LoD
  // and takes X's dimensions.
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of LabelSmoothOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of LabelSmoothOp should not be null.");

    const auto x_dims = ctx->GetInputDim("X");
    const bool has_prior = ctx->HasInput("PriorDist");
    if (has_prior) {
      const auto prior_numel =
          paddle::framework::product(ctx->GetInputDim("PriorDist"));
      const bool prior_matches_labels = (x_dims[1] == prior_numel);
      PADDLE_ENFORCE(
          prior_matches_labels,
          "The number of elements in Input(PriorDist) must be equal to the "
          "dimension of each label.");
    }

    ctx->ShareLoD("X", /*->*/ "Out");
    ctx->SetOutputDim("Out", x_dims);
  }
};
// Proto maker of label_smooth: declares the inputs X and (optional)
// PriorDist, the output Out, the float attribute epsilon (default 0.0f) and
// the operator documentation.
class LabelSmoothOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  LabelSmoothOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    // NOTE: adjacent string literals are concatenated verbatim, so every
    // fragment that is followed by more text must end with a space.
    AddInput("X",
             "(LoDTensor) The input labels of LabelSmooth operator. This "
             "input can be batched labels in one-hot encoding or output from "
             "softmax, with shape [N x K], where N is the batch size and K is "
             "the number of classes");
    AddInput("PriorDist",
             "(Tensor, optional) "
             "The prior distribution to be added to the smoothed label. It is "
             "fixed during training and the number of elements should be equal "
             "to the dimension K of each label. Default is uniform "
             "distribution and each element will be set to 1/K if not provided "
             "in input.")
        .AsDispensable();
    AddOutput("Out",
              "(LoDTensor) The smoothed label of LabelSmooth operator. It has "
              "the same shape and LoD with the Input(X).");
    AddAttr<float>("epsilon",
                   "(float, default 0.0f) "
                   "The smoothing parameter of LabelSmooth operator.")
        .SetDefault(0.0f);
    AddComment(R"DOC(
LabelSmooth Operator.
Label smoothing is a mechanism to regularize the classifier layer. In machine
learning, optimizing the log-likelihood of the correct label directly may
cause two problems. First, it may result in overfitting: if the model learns
to assign full probability to the ground-truth label for each training example,
it is not guaranteed to generalize. Second, it encourages the differences
between the largest logit and all others to become large, reducing the ability
of the model to adapt. Label smoothing is proposed to encourage the model to
be less confident, which replaces the ground-truth label $y$ with the weighted
sum of itself and some fixed distribution $\mu$, i.e.
$$
\tilde{y} = (1 - \epsilon) * y + \epsilon * \mu,
$$
where $(1 - \epsilon)$ and $\epsilon$ are the weights respectively, and
$\tilde{y}$ is the smoothed label. Usually uniform distribution is used for
$\mu$. This change in the ground-truth label is called label-smoothing
regularization or LSR.
See more details about label smoothing in https://arxiv.org/abs/1512.00567.
)DOC");
  }
};
// Gradient operator class of label_smooth.
class LabelSmoothGradOp : public framework::OperatorWithKernel {
 public:
  LabelSmoothGradOp(const std::string &type,
                    const framework::VariableNameMap &inputs,
                    const framework::VariableNameMap &outputs,
                    const framework::AttributeMap &attrs)
      : OperatorWithKernel(type, inputs, outputs, attrs) {}

  // Requires Input(X) and the gradient of Out; the gradient of X is given
  // the same dimensions as X.
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null.");

    const std::string out_grad_name = framework::GradVarName("Out");
    PADDLE_ENFORCE(ctx->HasInput(out_grad_name),
                   "Input(Out@GRAD) shouldn't be null.");

    const std::string x_grad_name = framework::GradVarName("X");
    ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X"));
  }
};
} // namespace operators
} // namespace paddle
// Short alias used by the registration macros below.
namespace ops = paddle::operators;
// Register label_smooth together with its gradient operator
// label_smooth_grad.
REGISTER_OP(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker,
label_smooth_grad, ops::LabelSmoothGradOp);
// CPU kernels of the forward operator, instantiated for float and double.
REGISTER_OP_CPU_KERNEL(
label_smooth,
ops::LabelSmoothKernel<paddle::platform::CPUDeviceContext, float>,
ops::LabelSmoothKernel<paddle::platform::CPUDeviceContext, double>);
// CPU kernels of the gradient operator, instantiated for float and double.
REGISTER_OP_CPU_KERNEL(
label_smooth_grad,
ops::LabelSmoothGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::LabelSmoothGradKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/label_smooth_op.h"
// Short alias used by the registration macros below.
namespace ops = paddle::operators;
// CUDA kernels of the forward operator, instantiated for float and double.
REGISTER_OP_CUDA_KERNEL(
label_smooth,
ops::LabelSmoothKernel<paddle::platform::CUDADeviceContext, float>,
ops::LabelSmoothKernel<paddle::platform::CUDADeviceContext, double>);
// CUDA kernels of the gradient operator, instantiated for float and double.
REGISTER_OP_CUDA_KERNEL(
label_smooth_grad,
ops::LabelSmoothGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::LabelSmoothGradKernel<paddle::platform::CUDADeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
// Forward kernel of label_smooth.
//
// Computes, element-wise over the flattened tensors,
//   Out = (1 - epsilon) * X + epsilon * mu
// where mu is Input(PriorDist) when it is provided and the constant
// 1 / label_dim otherwise (label_dim being the second dimension of X).
template <typename DeviceContext, typename T>
class LabelSmoothKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* out_t = ctx.Output<framework::LoDTensor>("Out");
    auto* in_t = ctx.Input<framework::LoDTensor>("X");
    // Optional input: null when no prior distribution was supplied.
    auto* dist_t = ctx.Input<framework::Tensor>("PriorDist");
    auto label_dim = in_t->dims()[1];
    out_t->mutable_data<T>(ctx.GetPlace());

    auto epsilon = ctx.Attr<float>("epsilon");
    auto out = framework::EigenVector<T>::Flatten(*out_t);
    auto in = framework::EigenVector<T>::Flatten(*in_t);
    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
    if (dist_t) {
      auto dist = framework::EigenVector<T>::Flatten(*dist_t);
      // broadcast() takes a replication factor, not a target size. The prior
      // is repeated once per label row (numel / label_dim) so that the
      // broadcast expression has exactly as many elements as `in`; this
      // presumes the prior holds label_dim elements (the op's InferShape
      // checks that -- confirm). The guard avoids an integer division by
      // zero when label_dim is 0.
      const auto num_rows = label_dim > 0 ? in_t->numel() / label_dim : 0;
      out.device(dev) =
          static_cast<T>(1 - epsilon) * in +
          static_cast<T>(epsilon) *
              dist.broadcast(Eigen::DSizes<int, 1>(num_rows));
    } else {
      // Uniform prior: every class receives epsilon / label_dim.
      out.device(dev) = static_cast<T>(1 - epsilon) * in +
                        static_cast<T>(epsilon / label_dim);
    }
  }
};
// Backward kernel of label_smooth.
//
// Computes X@GRAD = (1 - epsilon) * Out@GRAD element-wise over the flattened
// tensors.
template <typename DeviceContext, typename T>
class LabelSmoothGradKernel : public framework::OpKernel<T> {
 public:
  // Marked `override` like the other kernels so that a signature mismatch
  // with the base class is caught at compile time.
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* d_out_t = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
    auto* d_in_t = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
    d_in_t->mutable_data<T>(ctx.GetPlace());

    auto d_out = framework::EigenVector<T>::Flatten(*d_out_t);
    auto d_in = framework::EigenVector<T>::Flatten(*d_in_t);

    auto epsilon = ctx.Attr<float>("epsilon");
    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
    d_in.device(dev) = static_cast<T>(1 - epsilon) * d_out;
  }
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/layer_norm_op.h"
namespace paddle {
namespace operators {
// Local shorthands for framework types used throughout this file.
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using DataLayout = framework::DataLayout;
// Mutable Eigen view over an existing buffer, interpreted as a dynamically
// sized row-major matrix. The map does not own the memory it points to.
template <typename T>
using EigenMatrixMapRowMajor = Eigen::Map<
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
// Read-only counterpart of EigenMatrixMapRowMajor.
template <typename T>
using ConstEigenMatrixMapRowMajor = Eigen::Map<
const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
// Operator class of layer_norm: validates inputs/outputs and infers the
// output shapes.
class LayerNormOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // X is flattened to a 2-D [left, right] matrix at `begin_norm_axis`.
  // Scale and Bias, when present, must be 1-D with `right` elements.
  // Y takes X's dimensions and LoD; Mean and Variance are 1-D of size `left`.
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of LayerNormOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Y"),
                   "Output(Y) of LayerNormOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Mean"),
                   "Output(Mean) of LayerNormOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Variance"),
                   "Output(Variance) of LayerNormOp should not be null.");

    auto x_dim = ctx->GetInputDim("X");
    auto begin_norm_axis = ctx->Attrs().Get<int>("begin_norm_axis");
    PADDLE_ENFORCE_LT(begin_norm_axis, x_dim.size(),
                      "'begin_norm_axis' must be less than the rank of X.");

    auto flattened = framework::flatten_to_2d(x_dim, begin_norm_axis);
    int left = static_cast<int>(flattened[0]);
    int right = static_cast<int>(flattened[1]);

    const bool has_scale = ctx->HasInput("Scale");
    if (has_scale) {
      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], right);
    }
    const bool has_bias = ctx->HasInput("Bias");
    if (has_bias) {
      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], right);
    }

    ctx->SetOutputDim("Y", x_dim);
    ctx->SetOutputDim("Mean", {left});
    ctx->SetOutputDim("Variance", {left});
    ctx->ShareLoD("X", "Y");
  }
};
// Proto maker of layer_norm: declares the inputs X, Scale (optional) and
// Bias (optional), the outputs Y, Mean and Variance (the latter two marked
// intermediate), and the attributes epsilon and begin_norm_axis together
// with their range checks.
class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  LayerNormOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    // NOTE: adjacent string literals are concatenated verbatim, so every
    // fragment that is followed by more text must end with a space.
    AddInput("X", "(LoDTensor) The input tensor.");
    AddInput("Scale",
             "(Tensor, optional) Scale is a 1-dimensional tensor of size "
             "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H]). "
             "It is applied to the output.")
        .AsDispensable();
    AddInput("Bias",
             "(Tensor, optional) Bias is a 1-dimensional tensor of size "
             "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H]). "
             "It is applied to the output.")
        .AsDispensable();
    AddOutput("Y", "(LoDTensor) Result after normalization.");
    AddOutput("Mean", "(Tensor) Mean of the current mini batch.")
        .AsIntermediate();
    AddOutput("Variance", "(Tensor) Variance of the current mini batch.")
        .AsIntermediate();

    // epsilon must lie in [0, 0.001].
    AddAttr<float>("epsilon",
                   "(float, default 1e-5) Constant for "
                   "numerical stability")
        .SetDefault(1e-5)
        .AddCustomChecker([](const float &epsilon) {
          PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
                         "'epsilon' should be between 0.0 and 0.001.");
        });
    // begin_norm_axis must be strictly positive.
    AddAttr<int>("begin_norm_axis",
                 "(int default:1), the "
                 "axis of `begin_norm_axis ... Rank(X) - 1` will be "
                 "normalized. `begin_norm_axis` splits the tensor(`X`) to a "
                 "matrix [N,H].")
        .SetDefault(1)
        .AddCustomChecker([](const int &begin_norm_axis) {
          PADDLE_ENFORCE_GT(begin_norm_axis, 0,
                            "'begin_norm_axis' should be greater than zero.");
        });
    AddComment(R"DOC(
Layer Normalization.
Layer Norm has been implemented as discussed in the paper:
https://arxiv.org/abs/1607.06450
...
)DOC");
  }
};
// CPU forward kernel of layer_norm.
//
// X is viewed as a row-major [rows, cols] matrix split at `begin_norm_axis`.
// For every row the kernel writes
//   Mean     = mean(x)
//   Variance = mean((x - Mean)^2) + epsilon   (epsilon is folded in here)
//   Y        = (x - Mean) * sqrt(1 / Variance) [* Scale] [+ Bias]
// where Scale and Bias are applied only when the respective input exists.
template <typename T>
class LayerNormKernel<platform::CPUDeviceContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    const float epsilon = ctx.Attr<float>("epsilon");
    const auto *scale = ctx.Input<Tensor>("Scale");
    const auto *bias = ctx.Input<Tensor>("Bias");
    const auto *x = ctx.Input<Tensor>("X");
    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");

    auto *y = ctx.Output<Tensor>("Y");
    auto *mean = ctx.Output<Tensor>("Mean");
    auto *var = ctx.Output<Tensor>("Variance");
    y->mutable_data<T>(ctx.GetPlace());
    mean->mutable_data<T>(ctx.GetPlace());
    var->mutable_data<T>(ctx.GetPlace());

    const auto flat_dims =
        framework::flatten_to_2d(x->dims(), begin_norm_axis);
    const int rows = static_cast<int>(flat_dims[0]);
    const int cols = static_cast<int>(flat_dims[1]);

    auto input_map = ConstEigenMatrixMapRowMajor<T>(x->data<T>(), rows, cols);
    auto mean_map = EigenMatrixMapRowMajor<T>(mean->data<T>(), rows, 1);
    auto var_map = EigenMatrixMapRowMajor<T>(var->data<T>(), rows, 1);
    auto output_map = EigenMatrixMapRowMajor<T>(y->data<T>(), rows, cols);

    auto square = [](T v) { return v * v; };
    auto add_epsilon = [epsilon](T v) { return v + epsilon; };
    auto inv_std_func = [](T v) { return std::sqrt(1 / v); };

    mean_map = input_map.rowwise().mean();

    // Lazy expression (x - mean); it is re-evaluated wherever it is used.
    auto centered = input_map - mean_map.replicate(1, cols);
    var_map =
        centered.unaryExpr(square).rowwise().mean().unaryExpr(add_epsilon);

    // TODO(zcd): Some thinking about output_map, is it appropriate that
    // `output_map` and `input_map` point to the same memory.
    auto inv_std = var_map.unaryExpr(inv_std_func);
    // Lazy expression (x - mean) * inv_std shared by all four cases below.
    auto normalized = centered.cwiseProduct(inv_std.replicate(1, cols));

    if (scale && bias) {
      auto scale_map =
          ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, cols);
      auto bias_map = ConstEigenMatrixMapRowMajor<T>(bias->data<T>(), 1, cols);
      output_map = normalized.cwiseProduct(scale_map.replicate(rows, 1)) +
                   bias_map.replicate(rows, 1);
    } else if (scale) {
      auto scale_map =
          ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, cols);
      output_map = normalized.cwiseProduct(scale_map.replicate(rows, 1));
    } else if (bias) {
      auto bias_map = ConstEigenMatrixMapRowMajor<T>(bias->data<T>(), 1, cols);
      output_map = normalized + bias_map.replicate(rows, 1);
    } else {
      output_map = normalized;
    }
  }
};
// Gradient operator class of layer_norm.
class LayerNormGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Requires the inputs X, Scale, Mean, Variance and Y@GRAD. Each gradient
  // output that is present takes the dimensions of its forward input.
  void InferShape(framework::InferShapeContext *ctx) const override {
    // check input
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of LayerNormOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Scale"),
                   "Input(Scale) of LayerNormOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Mean"),
                   "Input(Mean) of LayerNormOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Variance"),
                   "Input(Variance) of LayerNormOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
                   "Input(Y@GRAD) of LayerNormOp should not be null.");

    // check output
    const std::string x_grad = framework::GradVarName("X");
    if (ctx->HasOutput(x_grad)) {
      ctx->SetOutputDim(x_grad, ctx->GetInputDim("X"));
    }
    const std::string scale_grad = framework::GradVarName("Scale");
    if (ctx->HasOutput(scale_grad)) {
      ctx->SetOutputDim(scale_grad, ctx->GetInputDim("Scale"));
    }
    const std::string bias_grad = framework::GradVarName("Bias");
    if (ctx->HasOutput(bias_grad)) {
      ctx->SetOutputDim(bias_grad, ctx->GetInputDim("Bias"));
    }
  }

 protected:
  // The kernel data type is taken from Y@GRAD, which may be stored either as
  // a Tensor or as a LoDTensor; throws when the variable is missing or holds
  // neither type.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    const auto *y_grad_var = ctx.InputVar(framework::GradVarName("Y"));
    if (y_grad_var == nullptr) {
      PADDLE_THROW("can't find Y@GRAD");
    }

    const Tensor *y_grad = nullptr;
    if (y_grad_var->IsType<Tensor>()) {
      y_grad = &y_grad_var->Get<Tensor>();
    } else if (y_grad_var->IsType<LoDTensor>()) {
      y_grad = &y_grad_var->Get<LoDTensor>();
    }
    if (y_grad == nullptr) {
      PADDLE_THROW("can't find Y@GRAD");
    }

    const auto data_type = framework::ToDataType(y_grad->type());
    return framework::OpKernelType(data_type, ctx.GetPlace());
  }
};
// CPU backward kernel of layer_norm.
//
// X and Y@GRAD are viewed as row-major [left, right] matrices split at
// `begin_norm_axis`; Mean and Variance are [left, 1] columns. sqrt(1 / var)
// is used directly as the inverse standard deviation (no epsilon is added
// in this kernel). Each gradient output is computed only when requested:
//   Bias@GRAD  = column sums of Y@GRAD
//   Scale@GRAD = column sums of ((x - mean) * inv_std) * Y@GRAD
//   X@GRAD     = dx_end + dx_mean + dx_var (direct, mean and variance paths)
template <typename T>
class LayerNormGradKernel<platform::CPUDeviceContext, T>
    : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    const auto *x = ctx.Input<Tensor>("X");
    const auto *mean = ctx.Input<Tensor>("Mean");
    const auto *var = ctx.Input<Tensor>("Variance");
    const auto *scale = ctx.Input<Tensor>("Scale");
    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));

    const auto &x_dims = x->dims();
    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
    auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
    int left = static_cast<int>(matrix_dim[0]);
    int right = static_cast<int>(matrix_dim[1]);

    // init output
    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));

    auto x_map = ConstEigenMatrixMapRowMajor<T>(x->data<T>(), left, right);
    auto d_y_map = ConstEigenMatrixMapRowMajor<T>(d_y->data<T>(), left, right);
    auto mean_map = ConstEigenMatrixMapRowMajor<T>(mean->data<T>(), left, 1);
    auto var_map = ConstEigenMatrixMapRowMajor<T>(var->data<T>(), left, 1);

    if (d_bias) {
      d_bias->mutable_data<T>(ctx.GetPlace());
      auto d_bias_map = EigenMatrixMapRowMajor<T>(d_bias->data<T>(), 1, right);
      d_bias_map = d_y_map.colwise().sum();
    }
    if (d_scale) {
      d_scale->mutable_data<T>(ctx.GetPlace());
      auto d_scale_map =
          EigenMatrixMapRowMajor<T>(d_scale->data<T>(), 1, right);
      auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
      // There are two equation to compute d_scale. One uses "Y" and the other
      // does not use "Y"
      d_scale_map =
          ((x_map - mean_map.replicate(1, right))
               .cwiseProduct(
                   var_map.unaryExpr(inv_std_func).replicate(1, right))
               .cwiseProduct(d_y_map))
              .colwise()
              .sum();
    }

    if (d_x) {
      d_x->mutable_data<T>(ctx.GetPlace());
      auto d_x_map = EigenMatrixMapRowMajor<T>(d_x->data<T>(), left, right);
      auto triple_product_func = [](T ele) { return ele * ele * ele; };
      auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
      // TODO(zcd): these code can be refined
      // The branch must depend on whether Scale itself is available, not on
      // whether Scale@GRAD was requested: X@GRAD has to account for Scale
      // whenever the forward pass used it, and this branch dereferences
      // `scale`.
      if (scale) {
        auto scale_map =
            ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
        // dy_dx
        auto dx_end = var_map.unaryExpr(inv_std_func)
                          .replicate(1, right)
                          .cwiseProduct(d_y_map)
                          .cwiseProduct(scale_map.replicate(left, 1));
        // dy_dmean_dx
        auto dx_mean = (T(-1.0) / right) *
                       var_map.unaryExpr(inv_std_func)
                           .replicate(1, right)
                           .cwiseProduct(d_y_map)
                           .cwiseProduct(scale_map.replicate(left, 1))
                           .rowwise()
                           .sum()
                           .replicate(1, right);
        // dy_var_dx
        auto dvar_end_part = (x_map - mean_map.replicate(1, right))
                                 .cwiseProduct(scale_map.replicate(left, 1))
                                 .cwiseProduct(d_y_map)
                                 .rowwise()
                                 .sum();
        auto dvar_end = var_map.unaryExpr(inv_std_func)
                            .unaryExpr(triple_product_func)
                            .cwiseProduct(dvar_end_part)
                            .replicate(1, right);
        auto dx_var =
            (T(-1.0) / right) *
            (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end);

        d_x_map = dx_end + dx_mean + dx_var;
      } else {
        // Same three terms as above, without the Scale factor.
        // dy_dx
        auto dx_end = var_map.unaryExpr(inv_std_func)
                          .replicate(1, right)
                          .cwiseProduct(d_y_map);
        // dy_dmean_dx
        auto dx_mean = (T(-1.0) / right) *
                       var_map.unaryExpr(inv_std_func)
                           .replicate(1, right)
                           .cwiseProduct(d_y_map)
                           .rowwise()
                           .sum()
                           .replicate(1, right);
        // dy_var_dx
        auto dvar_end_part = (x_map - mean_map.replicate(1, right))
                                 .cwiseProduct(d_y_map)
                                 .rowwise()
                                 .sum();
        auto dvar_end = var_map.unaryExpr(inv_std_func)
                            .unaryExpr(triple_product_func)
                            .cwiseProduct(dvar_end_part)
                            .replicate(1, right);
        auto dx_var =
            (T(-1.0) / right) *
            (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end);

        d_x_map = dx_end + dx_mean + dx_var;
      }
    }
  }
};
} // namespace operators
} // namespace paddle
// Short alias used by the registration macros below.
namespace ops = paddle::operators;
// Register layer_norm together with its gradient operator layer_norm_grad.
REGISTER_OP(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker,
layer_norm_grad, ops::LayerNormGradOp);
// Only a float CPU kernel is registered for the forward operator.
REGISTER_OP_CPU_KERNEL(
layer_norm,
ops::LayerNormKernel<paddle::platform::CPUDeviceContext, float>);
// Only a float CPU kernel is registered for the gradient operator.
REGISTER_OP_CPU_KERNEL(
layer_norm_grad,
ops::LayerNormGradKernel<paddle::platform::CPUDeviceContext, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
// Primary template of the layer_norm forward kernel.
// Compute is only declared here; an implementation has to be supplied per
// device context (e.g. by a specialization for platform::CPUDeviceContext).
template <typename DeviceContext, typename T>
class LayerNormKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override;
};
// Primary template of the layer_norm gradient kernel.
// Compute is only declared here; an implementation has to be supplied per
// device context (e.g. by a specialization for platform::CPUDeviceContext).
template <typename DeviceContext, typename T>
class LayerNormGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override;
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include "paddle/framework/op_registry.h"
#include "paddle/platform/device_context.h"
namespace paddle {
namespace operators {
// Operator that loads several LoDTensors, stored back to back in one file,
// into the variables listed in Output(Out).
class LoadCombineOp : public framework::OperatorBase {
 public:
  LoadCombineOp(const std::string &type,
                const framework::VariableNameMap &inputs,
                const framework::VariableNameMap &outputs,
                const framework::AttributeMap &attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}

  // Opens the file named by the `file_path` attribute and deserializes one
  // tensor per output variable, in the order the outputs are listed. When
  // `place` is a GPU place, each tensor is subsequently copied to `place`.
  // Fails (via PADDLE_ENFORCE) when the file cannot be opened, when there is
  // no output variable, when an output variable is missing from `scope`, or
  // when the stream is in a failed state before a tensor is read.
  void Run(const framework::Scope &scope,
           const platform::Place &place) const override {
    auto filename = Attr<std::string>("file_path");

    // The file is consumed by DeserializeFromStream rather than read as
    // text, so open it in binary mode: text mode may translate newline bytes
    // on platforms that distinguish the two modes.
    std::ifstream fin(filename, std::ios::binary);
    PADDLE_ENFORCE(static_cast<bool>(fin),
                   "Cannot open file %s for load_combine op", filename);

    auto out_var_names = Outputs("Out");
    PADDLE_ENFORCE_GT(
        static_cast<int>(out_var_names.size()), 0,
        "The number of output variables should be greater than 0.");

    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto &dev_ctx = *pool.Get(place);

    for (size_t i = 0; i < out_var_names.size(); i++) {
      auto *out_var = scope.FindVar(out_var_names[i]);

      PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found",
                     out_var_names[i]);

      auto *tensor = out_var->GetMutable<framework::LoDTensor>();

      // Error checking
      PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot read more from file %s",
                     filename);

      // Get data from fin to tensor
      DeserializeFromStream(fin, tensor, dev_ctx);

      if (platform::is_gpu_place(place)) {
        // copy CPU to GPU
        // Keep the just-loaded data (and its LoD) alive in a temporary that
        // shares the buffer, then rebuild the output variable on `place`.
        framework::LoDTensor cpu_tensor;
        cpu_tensor.ShareDataWith(*tensor);
        cpu_tensor.set_lod(tensor->lod());

        // reset tensor
        out_var->Clear();
        tensor = out_var->GetMutable<framework::LoDTensor>();
        tensor->set_lod(cpu_tensor.lod());
        Copy(cpu_tensor, place, dev_ctx, tensor);
      }
    }
  }
};
// Describes the load_combine operator: its duplicable "Out" output, the
// "file_path" string attribute and the user-facing documentation.
class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
  LoadCombineOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    // Checker attached to "file_path"; evaluates to true for a non-empty path.
    // NOTE(review): the checker only returns a bool - confirm that
    // AddCustomChecker actually rejects a false result.
    auto path_is_not_empty = [](const std::string &path) {
      return !path.empty();
    };

    AddOutput(
        "Out",
        "(vector) The output LoDTensors that will be read from the input file.")
        .AsDuplicable();

    AddAttr<std::string>("file_path",
                         "(string) "
                         "LoDTensors will be loaded from \"file_path\".")
        .AddCustomChecker(path_is_not_empty);

    AddComment(R"DOC(
LoadCombine Operator.
LoadCombine operator loads LoDTensor variables from a file. The file should
contain one or more LoDTensors serialized using the SaveCombine operator. The
LoadCombine operator applies a deserialization strategy to appropriately load
the LodTensors, and this strategy complements the serialization strategy used
in the SaveCombine operator. Hence, the LoadCombine operator is tightly coupled
with the SaveCombine operator, and can only deserialize one or more LoDTensors
that were saved using the SaveCombine operator.
)DOC");
  }
};
} // namespace operators
} // namespace paddle
// Short alias for the operators namespace, used by the registration below.
namespace ops = paddle::operators;
// Register the load_combine operator with its implementation class and its
// proto maker.
REGISTER_OPERATOR(load_combine, ops::LoadCombineOp,
ops::LoadCombineOpProtoMaker);
...@@ -125,8 +125,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> { ...@@ -125,8 +125,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
new_rows.resize(ids_dim[0]); new_rows.resize(ids_dim[0]);
auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace()); auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
memory::Copy(platform::CPUPlace(), new_rows.data(), gpu_place, ids_data, memory::Copy(platform::CPUPlace(), new_rows.cuda_data(), gpu_place,
ids_dim[0] * sizeof(int64_t), stream); ids_data, ids_dim[0] * sizeof(int64_t), stream);
d_table->set_rows(new_rows); d_table->set_rows(new_rows);
......
...@@ -27,11 +27,12 @@ using Tensor = framework::Tensor; ...@@ -27,11 +27,12 @@ using Tensor = framework::Tensor;
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
inline void ReorderInitState(const DeviceContext& ctx, inline void ReorderInitState(const DeviceContext& ctx,
const framework::Tensor& src, const size_t* index, const framework::Tensor& src,
framework::Vector<size_t> index_lod,
framework::Tensor* dst, bool indexed_src) { framework::Tensor* dst, bool indexed_src) {
math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle; math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
dst->mutable_data<T>(src.dims(), ctx.GetPlace()); dst->mutable_data<T>(src.dims(), ctx.GetPlace());
row_shuffle(ctx, src, index, *dst, indexed_src); row_shuffle(ctx, src, index_lod, *dst, indexed_src);
} }
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
...@@ -84,7 +85,9 @@ class LSTMKernel : public framework::OpKernel<T> { ...@@ -84,7 +85,9 @@ class LSTMKernel : public framework::OpKernel<T> {
} }
lstm_value.prev_state_value = nullptr; lstm_value.prev_state_value = nullptr;
Tensor ordered_c0; Tensor ordered_c0;
const size_t* order = batch_gate->lod()[2].data();
framework::Vector<size_t> order(batch_gate->lod()[2]);
if (cell_t0) { if (cell_t0) {
// Since the batch computing for LSTM reorders the input sequence // Since the batch computing for LSTM reorders the input sequence
// according to their length. The initialized cell state also needs // according to their length. The initialized cell state also needs
...@@ -202,7 +205,8 @@ class LSTMGradKernel : public framework::OpKernel<T> { ...@@ -202,7 +205,8 @@ class LSTMGradKernel : public framework::OpKernel<T> {
// ordered_h0_g/c0_g is the reordered gradient of hidden/cell // ordered_h0_g/c0_g is the reordered gradient of hidden/cell
// initialization. // initialization.
Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g;
const size_t* order = batch_gate->lod()[2].data(); framework::Vector<size_t> order(batch_gate->lod()[2]);
if (c0) { if (c0) {
ReorderInitState<DeviceContext, T>(device_ctx, *c0, order, &ordered_c0, ReorderInitState<DeviceContext, T>(device_ctx, *c0, order, &ordered_c0,
true); true);
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/lstmp_op.h"
namespace paddle {
namespace operators {
// Forward operator definition of LSTMP (LSTM with a recurrent projection
// layer). It validates the inputs/outputs, derives the output shapes and
// selects the kernel data type from the "Input" tensor.
class LSTMPOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Checks that all required inputs/outputs exist, validates the shapes of
  // Input, Weight, ProjWeight, Bias (and H0/C0 when given), then sets the
  // dims of every output and shares the LoD of Input with Projection and
  // Cell. Any violated check raises through PADDLE_ENFORCE*.
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Input"),
                   "Input(Input) of LSTMP operator should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Weight"),
                   "Input(Weight) of LSTMP operator should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("ProjWeight"),
                   "Input(ProjWeight) of LSTMP operator should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Bias"),
                   "Input(Bias) of LSTMP operator should not be null.");

    PADDLE_ENFORCE(ctx->HasOutput("Projection"),
                   "Output(Projection) of LSTMP operator should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Cell"),
                   "Output(Cell) of LSTMP operator should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("BatchGate"),
                   "Output(BatchGate) of LSTMP operator should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"),
                   "Output(BatchCellPreAct) of LSTMP operator should not be "
                   "null.");
    PADDLE_ENFORCE(ctx->HasOutput("BatchHidden"),
                   "Output(BatchHidden) of LSTMP operator should not be null.");

    auto in_dims = ctx->GetInputDim("Input");
    // The message names the actual input ("Input"); this operator has no
    // input called "X".
    PADDLE_ENFORCE_EQ(in_dims.size(), 2,
                      "Input(Input)'s rank of LSTMP operator must be 2.");

    // The second dimension of Input packs four gate blocks of frame_size.
    int frame_size = in_dims[1] / 4;
    auto w_dims = ctx->GetInputDim("Weight");
    auto proj_dims = ctx->GetInputDim("ProjWeight");

    // Validate both ranks first so that the element accesses below
    // (w_dims[0], w_dims[1], proj_dims[0], proj_dims[1]) are known to be in
    // range before they are evaluated.
    PADDLE_ENFORCE_EQ(w_dims.size(), 2,
                      "The rank of Input(Weight) should be 2.");
    PADDLE_ENFORCE_EQ(proj_dims.size(), 2,
                      "The rank of Input(ProjWeight) should be 2.");

    PADDLE_ENFORCE_EQ(w_dims[0], proj_dims[1],
                      "The first dimension of Input(Weight) "
                      "should be %d.",
                      proj_dims[1]);
    PADDLE_ENFORCE_EQ(w_dims[1], 4 * frame_size,
                      "The second dimension of Input(Weight) "
                      "should be 4 * %d.",
                      frame_size);
    PADDLE_ENFORCE_EQ(proj_dims[0], frame_size,
                      "The first dimension of Input(ProjWeight) "
                      "should be %d.",
                      frame_size);

    // H0 and C0 are optional, but C0 is required once H0 is given.
    if (ctx->HasInput("H0")) {
      PADDLE_ENFORCE(ctx->HasInput("C0"),
                     "Input(C0) of LSTMP operator should not be null after "
                     "Input(H0) provided.");
      auto h_dims = ctx->GetInputDim("H0");
      auto c_dims = ctx->GetInputDim("C0");
      PADDLE_ENFORCE(h_dims == c_dims,
                     "The dimension of Input(H0) and Input(C0) "
                     "should be the same.");
      ctx->SetOutputDim("OrderedP0", {h_dims[0], proj_dims[1]});
    }

    auto b_dims = ctx->GetInputDim("Bias");
    PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
    PADDLE_ENFORCE_EQ(b_dims[0], 1,
                      "The first dimension of Input(Bias) should be 1.");

    // With peepholes the bias carries three extra frame_size-wide blocks.
    if (ctx->Attrs().Get<bool>("use_peepholes")) {
      PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size,
                        "The second dimension of Input(Bias) should be "
                        "7 * %d if enable peepholes connection",
                        frame_size);
    } else {
      PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size,
                        "The second dimension of Input(Bias) should be "
                        "4 * %d if disable peepholes connection",
                        frame_size);
    }

    framework::DDim out_dims({in_dims[0], frame_size});
    framework::DDim proj_out_dims({in_dims[0], proj_dims[1]});
    ctx->SetOutputDim("Projection", proj_out_dims);
    ctx->SetOutputDim("Cell", out_dims);
    ctx->SetOutputDim("BatchGate", in_dims);
    ctx->SetOutputDim("BatchCellPreAct", out_dims);
    ctx->SetOutputDim("BatchHidden", out_dims);
    ctx->ShareLoD("Input", "Projection");
    ctx->ShareLoD("Input", "Cell");
  }

 protected:
  // The kernel is chosen from the data type of "Input" and the device
  // context the operator runs on.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()),
        ctx.device_context());
  }
};
// Declares the proto of the lstmp operator: its inputs, outputs, attributes
// and the user-facing documentation string.
// NOTE(review): several help strings below contain typos ("defalut",
// "espeacially", "reprenset"). They are runtime text and are therefore left
// untouched here; fix them in a dedicated change if desired.
class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
public:
LSTMPOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
// ---- Inputs. H0 and C0 are dispensable, all others are required. ----
AddInput("Input",
"(LoDTensor) the input for sequence data, which supports "
"variable-time length input sequence. The underlying tensor in "
"this LoDTensor is a matrix with shape (T X 4D), where T is the "
"total time steps in this mini-batch, D is the hidden size.");
AddInput("H0",
"(Tensor, optional) the initial hidden state is an optional "
"input. This is a tensor with shape (N x D), where N is the "
"batch size and D is the hidden size.")
.AsDispensable();
AddInput("C0",
"(Tensor, optional) the initial cell state is an optional "
"input. This is a tensor with shape (N x D), where N is the "
"batch size. `C0` should not be null if `H0` provided.")
.AsDispensable();
AddInput("Weight",
"(Tensor) the learnable hidden-hidden weights."
" - The shape is (P x 4D), where P is the projection layer size "
"and D is the hidden size."
" - Weight = {W_cr, W_ir, W_fr, W_or}");
AddInput("ProjWeight",
"(Tensor) the learnable weight of the projection layer."
" - The shape is (D x P), where P is the recurrent projection "
"layer size and D is the hidden size."
" - ProjWeight = {W_rh}");
AddInput("Bias",
"(Tensor) the learnable biases, which contains two parts: "
"input-hidden biases and peephole connections weights if "
"setting `use_peepholes` to `True`. "
"1. `use_peepholes = False` "
" - The shape is (1 x 4D). "
" - Bias = {b_c, b_i, b_f, b_o}."
"2. `use_peepholes = True` "
" - The shape is (1 x 7D). "
" - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}.");
// ---- Outputs. Projection and Cell are the results; the remaining ones
// are marked intermediate. ----
AddOutput("Projection",
"(LoDTensor) the projection of the hidden state of LSTMP "
"operator. The shape is (T x P), and LoD is the same with the "
"`Input`.");
AddOutput("Cell",
"(LoDTensor) the cell state of LSTMP operator. "
"The shape is (T x D), and lod is the same with the `Input`.");
AddOutput("BatchGate",
"(LoDTensor) This LoDTensor contains input gate, forget gate "
"and output gate after the activations. This LoDTensor has the "
"same shape as the reorganized input, which is also be called "
"batch input. The LoD size is 2. The first-level LoD is the "
"batch offsets and the second contains the indices, which "
"denotes the position of reorganized sequence in the raw input.")
.AsIntermediate();
AddOutput("BatchCellPreAct",
"(LoDTensor) the pre-activation cell state reorganized in batch. "
"This LoDTensor is obtained in the forward and used in the "
"backward.")
.AsIntermediate();
AddOutput("BatchHidden",
"(LoDTensor) the hidden state reorganized in batch. "
"This LoDTensor is obtained in the forward and used in the "
"backward.")
.AsIntermediate();
// NOTE(review): the text below calls P "the hidden size", while the other
// descriptions use P for the projection size - presumably a wording slip.
AddOutput("OrderedP0",
"(Tensor) the projection of the initial hidden state "
"H0. This is a tensor with shape (N x P), where N is the "
"batch size and P is the hidden size.")
.AsIntermediate();
// ---- Attributes. Every activation attribute is restricted to
// sigmoid / tanh / relu / identity. ----
AddAttr<bool>("use_peepholes",
"(bool, defalut: True) "
"whether to enable diagonal/peephole connections.")
.SetDefault(true);
AddAttr<bool>("is_reverse",
"(bool, defalut: False) "
"whether to compute reversed LSTMP.")
.SetDefault(false);
AddAttr<std::string>(
"gate_activation",
"(string, default: sigmoid)"
"The activation for input gate, forget gate and output "
"gate, `sigmoid` by default.")
.SetDefault("sigmoid")
.InEnum({"sigmoid", "tanh", "relu", "identity"});
AddAttr<std::string>("cell_activation",
"(string, default: tanh)"
"The activation for cell output, `tanh` by defalut.")
.SetDefault("tanh")
.InEnum({"sigmoid", "tanh", "relu", "identity"});
AddAttr<std::string>("candidate_activation",
"(string, default: tanh)"
"The activation for candidate hidden state, "
"`tanh` by default.")
.SetDefault("tanh")
.InEnum({"sigmoid", "tanh", "relu", "identity"});
AddAttr<std::string>("proj_activation",
"(string, default: tanh)"
"The activation for projection output, "
"`tanh` by defalut.")
.SetDefault("tanh")
.InEnum({"sigmoid", "tanh", "relu", "identity"});
// ---- Operator-level documentation (formulas use LaTeX markup). ----
AddComment(R"DOC(
Long-Short Term Memory with recurrent Projection layer (LSTMP) Operator.
LSTMP has a separate projection layer after the LSTM layer, projecting the
original hidden state to a lower-dimensional one, which is proposed to reduce
the number of total parameters and furthermore computational complexity for
the LSTM, espeacially for the case that the size of output units is relative
large (https://research.google.com/pubs/archive/43905.pdf).
The formula is as follows:
$$
i_t = \sigma(W_{ix}x_{t} + W_{ir}r_{t-1} + W_{ic}c_{t-1} + b_i) \\
f_t = \sigma(W_{fx}x_{t} + W_{fr}r_{t-1} + W_{fc}c_{t-1} + b_f) \\
\tilde{c_t} = act_g(W_{cx}x_t + W_{cr}r_{t-1} + b_c) \\
o_t = \sigma(W_{ox}x_{t} + W_{or}r_{t-1} + W_{oc}c_t + b_o) \\
c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\
h_t = o_t \odot act_h(c_t) \\
r_t = \overline{act_h}(W_{rh}h_t)
$$
where the W terms denote weight matrices (e.g. $W_{xi}$ is the matrix
of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$
are diagonal weight matrices for peephole connections. In our implementation,
we use vectors to reprenset these diagonal weight matrices. The b terms
denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$
is the activation, such as logistic sigmoid function, and
$i, f, o$ and $c$ are the input gate, forget gate, output gate,
and cell activation vectors, respectively, all of which have the same size as
the cell output activation vector $h$. Here $h$ is usually called the hidden
state and $r$ denotes its recurrent projection. And $\tilde{c_t}$ is also
called the candidate hidden state, whose computation is based on the current
input and previous hidden state.
The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$
are the cell input and cell output activation functions and `tanh` is usually
used for them. $\overline{act_h}$ is the activation function for the
projection output, usually using `identity` or same as $act_h$.
Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$
operations on the input $x_{t}$ are NOT included in this operator.
Users can choose to use fully-connected operator before LSTMP operator.
)DOC");
}
};
// Gradient operator definition of LSTMP. It checks that the forward
// tensors needed by the backward pass are present and gives every requested
// gradient output the shape of its forward counterpart.
class LSTMPGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Validates the required inputs and sets the dims of the gradient outputs
  // of Input, Weight, ProjWeight, Bias, H0 and C0 (each only when that
  // gradient output exists). Missing inputs raise through PADDLE_ENFORCE.
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("Input"),
                   "Input(Input) of LSTMP operator should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Projection"),
                   "Input(Projection) of LSTMP operator should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Cell"),
                   "Input(Cell) of LSTMP operator should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Weight"),
                   "Input(Weight) of LSTMP operator should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("ProjWeight"),
                   "Input(ProjWeight) of LSTMP operator should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Bias"),
                   "Input(Bias) of LSTMP operator should not be null.");

    PADDLE_ENFORCE(ctx->HasInput("BatchGate"),
                   "Input(BatchGate) of LSTMP operator should not be null.");
    // Report the input that is actually being checked; the message used to
    // name BatchGate here, which pointed users at the wrong tensor.
    PADDLE_ENFORCE(
        ctx->HasInput("BatchCellPreAct"),
        "Input(BatchCellPreAct) of LSTMP operator should not be null.");

    // Gives the gradient variable of `name` the same dims as `name` itself,
    // but only when that gradient is requested as an output.
    auto SetOutGradDim = [&ctx](const std::string& name) {
      auto g_name = framework::GradVarName(name);
      if (ctx->HasOutput(g_name)) {
        ctx->SetOutputDim(g_name, ctx->GetInputDim(name));
      }
    };

    SetOutGradDim("Input");
    SetOutGradDim("Weight");
    SetOutGradDim("ProjWeight");
    SetOutGradDim("Bias");
    SetOutGradDim("H0");
    SetOutGradDim("C0");
  }

 protected:
  // The kernel is chosen from the data type of "Input" and the device
  // context the operator runs on.
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()),
        ctx.device_context());
  }
};
} // namespace operators
} // namespace paddle
// Short alias for the operators namespace, used by the registrations below.
namespace ops = paddle::operators;
// Register the lstmp operator together with its gradient operator lstmp_grad.
REGISTER_OP(lstmp, ops::LSTMPOp, ops::LSTMPOpMaker, lstmp_grad,
ops::LSTMPGradOp);
// CPU kernels of the forward operator for float and double.
REGISTER_OP_CPU_KERNEL(
lstmp, ops::LSTMPKernel<paddle::platform::CPUDeviceContext, float>,
ops::LSTMPKernel<paddle::platform::CPUDeviceContext, double>);
// CPU kernels of the gradient operator for float and double.
REGISTER_OP_CPU_KERNEL(
lstmp_grad, ops::LSTMPGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::LSTMPGradKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/lstmp_op.h"
// Short alias for the operators namespace, used by the registrations below.
namespace ops = paddle::operators;
// CUDA kernels of the lstmp forward operator for float and double.
REGISTER_OP_CUDA_KERNEL(
lstmp, ops::LSTMPKernel<paddle::platform::CUDADeviceContext, float>,
ops::LSTMPKernel<paddle::platform::CUDADeviceContext, double>);
// CUDA kernels of the lstmp gradient operator for float and double.
REGISTER_OP_CUDA_KERNEL(
lstmp_grad,
ops::LSTMPGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::LSTMPGradKernel<paddle::platform::CUDADeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/operators/activation_op.h"
#include "paddle/operators/math/detail/activation_functions.h"
#include "paddle/operators/math/lstm_compute.h"
#include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/sequence2batch.h"
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
// Shorthand aliases for the framework tensor types used in this header.
using LoDTensor = framework::LoDTensor;
using Tensor = framework::Tensor;
// Alias for framework::EigenMatrix; defaults to row-major layout with
// Eigen::DenseIndex as the index type.
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
// Reorders the rows of an initial-state tensor.
//
// `dst` is allocated on the place of `ctx` with the dims of `src`; the rows
// are then copied by math::CopyMatrixRowsFunctor, which receives `index` and
// `indexed_src` unchanged.
template <typename DeviceContext, typename T>
inline void ReorderInitState(const DeviceContext& ctx,
                             const framework::Tensor& src,
                             framework::Vector<size_t> index,
                             framework::Tensor* dst, bool indexed_src) {
  // Make sure the destination buffer exists before any row is copied.
  dst->mutable_data<T>(src.dims(), ctx.GetPlace());

  math::CopyMatrixRowsFunctor<DeviceContext, T> copy_rows;
  copy_rows(ctx, src, index, *dst, indexed_src);
}
// Forward kernel of the lstmp operator.
//
// The input sequences are reorganized into batch form (BatchGate), the LSTM
// unit is evaluated batch step by batch step, each hidden state is projected
// through ProjWeight, and the batch results are finally converted back into
// the Projection and Cell outputs.
template <typename DeviceContext, typename T>
class LSTMPKernel : public framework::OpKernel<T> {
public:
// Applies the activation selected by `act_type` to `x` and writes the result
// to `y` on device `d`. Identity is a plain assignment; an unknown type
// raises through PADDLE_THROW.
template <typename Device, typename X, typename Y>
void ActCompute(const math::detail::ActivationType act_type, const Device& d,
X x, Y y) const {
if (act_type == math::detail::ActivationType::kIdentity)
y.device(d) = x;
else if (act_type == math::detail::ActivationType::kSigmoid)
SigmoidFunctor<T>()(d, x, y);
else if (act_type == math::detail::ActivationType::kTanh)
TanhFunctor<T>()(d, x, y);
else if (act_type == math::detail::ActivationType::kReLU)
ReluFunctor<T>()(d, x, y);
else
PADDLE_THROW("unsupported activation type");
}
// Runs the forward pass; see the class comment for the overall flow.
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<LoDTensor>("Input");
auto* weight = ctx.Input<Tensor>("Weight");
auto* proj_weight = ctx.Input<Tensor>("ProjWeight");
auto* bias = ctx.Input<Tensor>("Bias");
auto* hidden_t0 = ctx.Input<Tensor>("H0");
auto* ordered_proj0 = ctx.Output<Tensor>("OrderedP0");
auto* cell_t0 = ctx.Input<Tensor>("C0");
auto* batch_gate = ctx.Output<LoDTensor>("BatchGate");
batch_gate->mutable_data<T>(ctx.GetPlace());
auto* proj_out = ctx.Output<LoDTensor>("Projection");
proj_out->mutable_data<T>(ctx.GetPlace());
auto* cell_out = ctx.Output<LoDTensor>("Cell");
cell_out->mutable_data<T>(ctx.GetPlace());
bool is_reverse = ctx.Attr<bool>("is_reverse");
// Reorganize the input sequences into batch_gate.
math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
auto& device_ctx = ctx.template device_context<DeviceContext>();
to_batch(device_ctx, *input, *batch_gate, true, is_reverse);
auto in_dims = input->dims();
// The second dimension of Input holds four gate blocks of frame_size each.
int frame_size = static_cast<int>(in_dims[1] / 4);
framework::DDim dims({in_dims[0], frame_size});
framework::DDim proj_dims({in_dims[0], proj_weight->dims()[1]});
if (bias) {
// View the bias as (numel x 1), take its first 4 * frame_size entries and
// add them to batch_gate with RowwiseAdd.
Tensor b = *bias;
b.Resize({bias->numel(), 1});
Tensor gate_bias = b.Slice(0, 4 * frame_size);
math::RowwiseAdd<DeviceContext, T> add_bias;
add_bias(device_ctx, *batch_gate, gate_bias, batch_gate);
}
math::LstmMetaValue<T> lstmp_value;
if (bias && ctx.Attr<bool>("use_peepholes")) {
// The peephole weights are stored behind the four gate biases, in three
// consecutive blocks of frame_size elements.
T* bias_data = const_cast<T*>(bias->data<T>());
// the code style in LstmpMetaValue will be updated later.
lstmp_value.check_ig = bias_data + 4 * frame_size;
lstmp_value.check_fg = lstmp_value.check_ig + frame_size;
lstmp_value.check_og = lstmp_value.check_fg + frame_size;
} else {
lstmp_value.check_ig = nullptr;
lstmp_value.check_fg = nullptr;
lstmp_value.check_og = nullptr;
}
lstmp_value.prev_state_value = nullptr;
Tensor ordered_c0;
// Copy of the third LoD level of batch_gate; it is passed to
// ReorderInitState as the row index for the initial states.
framework::Vector<size_t> order(batch_gate->lod()[2]);
if (cell_t0) {
// Since the batch computing for LSTMP reorders the input sequence
// according to their length. The initialized cell state also needs
// to reorder.
ReorderInitState<DeviceContext, T>(device_ctx, *cell_t0, order,
&ordered_c0, true);
lstmp_value.prev_state_value = ordered_c0.data<T>();
}
// The batch projection and batch cell state are local temporaries; they are
// converted into the Projection and Cell outputs at the end.
LoDTensor batch_proj, batch_cell;
auto* batch_cell_pre_act = ctx.Output<LoDTensor>("BatchCellPreAct");
batch_cell_pre_act->mutable_data<T>(dims, ctx.GetPlace());
auto* batch_hidden = ctx.Output<LoDTensor>("BatchHidden");
batch_hidden->mutable_data<T>(dims, ctx.GetPlace()); // T x D
batch_proj.mutable_data<T>(proj_dims, ctx.GetPlace()); // T x P
batch_cell.mutable_data<T>(dims, ctx.GetPlace()); // T x D
// First LoD level of batch_gate: the row offsets of every batch step.
auto batch_starts = batch_gate->lod()[0];
size_t num_batch = batch_starts.size() - 1;
auto gate_act = math::detail::GetActivationType(
ctx.Attr<std::string>("gate_activation"));
auto cell_act = math::detail::GetActivationType(
ctx.Attr<std::string>("cell_activation"));
auto cand_act = math::detail::GetActivationType(
ctx.Attr<std::string>("candidate_activation"));
auto proj_act = math::detail::GetActivationType(
ctx.Attr<std::string>("proj_activation"));
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
for (size_t n = 0; n < num_batch; n++) {
// Rows [bstart, bend) of the batch tensors belong to batch step n.
int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]);
Tensor gate_t = batch_gate->Slice(bstart, bend);
Tensor hidden_t = batch_hidden->Slice(bstart, bend);
Tensor proj_t = batch_proj.Slice(bstart, bend);
Tensor cell_t = batch_cell.Slice(bstart, bend);
Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend);
int cur_batch_size = bend - bstart;
if (n > 0) {
// Multiply the first cur_batch_size rows of the previous step's projection
// with Weight and combine the product with gate_t (the trailing scalars
// are presumably alpha = 1 and beta = 1 - confirm against math::matmul).
int pre_h_start = static_cast<int>(batch_starts[n - 1]);
int pre_h_end = pre_h_start + cur_batch_size;
auto pre_proj_t = batch_proj.Slice(pre_h_start, pre_h_end);
math::matmul<DeviceContext, T>(device_ctx, pre_proj_t, false, *weight,
false, static_cast<T>(1.0), &gate_t,
static_cast<T>(1.0));
} else if (hidden_t0) {
// If n == 0 and there is no initialized hidden state, that is to say
// the H0 is zeros, the calculation W_h * H0 will be skipped.
// If n == 0 and there is initialized hidden state, calculate W_h * H0.
// Since the batch computing for LSTMP reorders the input sequence
// according to their length. The initialized hidden state also needs
// to reorder.
Tensor ordered_h0;
ordered_proj0->mutable_data<T>(ctx.GetPlace());
ReorderInitState<DeviceContext, T>(device_ctx, *hidden_t0, order,
&ordered_h0, true);
// Project the reordered H0 through ProjWeight into OrderedP0.
math::matmul<DeviceContext, T>(device_ctx, ordered_h0, false,
*proj_weight, false, static_cast<T>(1.0),
ordered_proj0, static_cast<T>(0.0));
// NOTE(review): the guard tests proj_act but the activation applied is
// cell_act. The gradient kernel follows the same pattern, so changing one
// side alone would make forward and backward disagree - confirm the intent
// and fix both together.
if (proj_act != math::detail::ActivationType::kIdentity) {
auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0);
ActCompute(cell_act, place, proj0_dev, proj0_dev);
}
math::matmul<DeviceContext, T>(device_ctx, *ordered_proj0, false,
*weight, false, static_cast<T>(1.0),
&gate_t, static_cast<T>(1.0));
}
// Point the LSTM unit at the slices of the current batch step.
lstmp_value.gate_value = gate_t.data<T>();
lstmp_value.output_value = hidden_t.data<T>();
lstmp_value.state_value = cell_t.data<T>();
lstmp_value.state_active_value = cell_pre_act_t.data<T>();
math::LstmUnitFunctor<DeviceContext, T>::compute(
device_ctx, lstmp_value, frame_size, cur_batch_size, gate_act,
cell_act, cand_act);
// The cell state of this step is the previous state of the next step.
lstmp_value.prev_state_value = lstmp_value.state_value;
// Project the hidden state of this step through ProjWeight into proj_t.
math::matmul<DeviceContext, T>(device_ctx, hidden_t, false, *proj_weight,
false, static_cast<T>(1.0), &proj_t,
static_cast<T>(0.0));
// NOTE(review): same cell_act / proj_act mismatch as for OrderedP0 above.
if (proj_act != math::detail::ActivationType::kIdentity) {
auto proj_t_dev = EigenMatrix<T>::From(proj_t);
ActCompute(cell_act, place, proj_t_dev, proj_t_dev);
}
}
math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
batch_proj.set_lod(batch_gate->lod());
// restore the output projection in LoDTensor from the batch projection
to_seq(device_ctx, batch_proj, *proj_out);
batch_cell.set_lod(batch_gate->lod());
// restore the output cell state in LoDTensor from the batch cell
to_seq(device_ctx, batch_cell, *cell_out);
}
};
template <typename DeviceContext, typename T>
class LSTMPGradKernel : public framework::OpKernel<T> {
public:
template <typename Device, typename X, typename Y, typename DX, typename DY>
void ActGradCompute(const math::detail::ActivationType act_type,
const Device& d, X x, Y y, DX dx, DY dy) const {
// x is dummy and won't be used even in Relu(use y instead)
if (act_type == math::detail::ActivationType::kIdentity)
dx.device(d) = dy;
else if (act_type == math::detail::ActivationType::kSigmoid)
SigmoidGradFunctor<T>()(d, x, y, dy, dx);
else if (act_type == math::detail::ActivationType::kTanh)
TanhGradFunctor<T>()(d, x, y, dy, dx);
else if (act_type == math::detail::ActivationType::kReLU)
ReluGradFunctor<T>()(d, x, y, dy, dx);
else
PADDLE_THROW("unsupported activation type");
}
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<LoDTensor>("Input");
auto* weight = ctx.Input<Tensor>("Weight");
auto* proj_weight = ctx.Input<Tensor>("ProjWeight");
auto* bias = ctx.Input<Tensor>("Bias");
auto* proj_out = ctx.Input<LoDTensor>("Projection");
auto* cell_out = ctx.Input<LoDTensor>("Cell");
auto* batch_gate = ctx.Input<LoDTensor>("BatchGate");
auto* batch_cell_pre_act = ctx.Input<LoDTensor>("BatchCellPreAct");
auto* batch_hidden = ctx.Input<LoDTensor>("BatchHidden");
auto* projection_g =
ctx.Input<LoDTensor>(framework::GradVarName("Projection"));
auto* in_g = ctx.Output<LoDTensor>(framework::GradVarName("Input"));
auto* weight_g = ctx.Output<Tensor>(framework::GradVarName("Weight"));
auto* proj_weight_g =
ctx.Output<Tensor>(framework::GradVarName("ProjWeight"));
auto* bias_g = ctx.Output<Tensor>(framework::GradVarName("Bias"));
auto* h0 = ctx.Input<Tensor>("H0");
auto* ordered_proj0 = ctx.Input<Tensor>("OrderedP0");
auto* c0 = ctx.Input<Tensor>("C0");
auto* h0_g = ctx.Output<Tensor>(framework::GradVarName("H0"));
auto* c0_g = ctx.Output<Tensor>(framework::GradVarName("C0"));
auto& device_ctx = ctx.template device_context<DeviceContext>();
math::SetConstant<DeviceContext, T> zero;
if (weight_g) {
weight_g->mutable_data<T>(ctx.GetPlace());
zero(device_ctx, weight_g, static_cast<T>(0.0));
}
if (proj_weight_g) {
proj_weight_g->mutable_data<T>(ctx.GetPlace());
zero(device_ctx, proj_weight_g, static_cast<T>(0.0));
}
// ordered_h0/c0 is the reordered hidden/cell initialization.
// ordered_h0_g/c0_g is the reordered gradient of hidden/cell
// initialization.
Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g;
framework::Vector<size_t> order(batch_gate->lod()[2]);
if (c0) {
ReorderInitState<DeviceContext, T>(device_ctx, *c0, order, &ordered_c0,
true);
}
if (c0 && c0_g) {
ordered_c0_g.mutable_data<T>(c0_g->dims(), ctx.GetPlace());
}
auto in_dims = input->dims();
auto out_dims = cell_out->dims();
framework::DDim proj_dims({in_dims[0], proj_weight->dims()[1]});
int frame_size = static_cast<int>(in_dims[1] / 4);
PADDLE_ENFORCE_EQ(frame_size, out_dims[1]);
math::LstmMetaValue<T> lstmp_value;
if (bias && ctx.Attr<bool>("use_peepholes")) {
T* bias_data = const_cast<T*>(bias->data<T>());
lstmp_value.check_ig = bias_data + 4 * frame_size;
lstmp_value.check_fg = lstmp_value.check_ig + frame_size;
lstmp_value.check_og = lstmp_value.check_fg + frame_size;
} else {
lstmp_value.check_ig = nullptr;
lstmp_value.check_fg = nullptr;
lstmp_value.check_og = nullptr;
}
math::LstmMetaGrad<T> lstmp_grad;
if (bias && bias_g) {
bias_g->mutable_data<T>(ctx.GetPlace());
zero(device_ctx, bias_g, static_cast<T>(0.0));
}
if (bias && bias_g && ctx.Attr<bool>("use_peepholes")) {
T* bias_g_data = bias_g->data<T>();
lstmp_grad.check_ig_grad = bias_g_data + 4 * frame_size;
lstmp_grad.check_fg_grad = lstmp_grad.check_ig_grad + frame_size;
lstmp_grad.check_og_grad = lstmp_grad.check_fg_grad + frame_size;
} else {
lstmp_grad.check_ig_grad = nullptr;
lstmp_grad.check_fg_grad = nullptr;
lstmp_grad.check_og_grad = nullptr;
}
math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
auto ToBatch = [&batch_gate, &to_batch](
const DeviceContext& ctx, const framework::LoDTensor& src,
const framework::DDim& dims, framework::LoDTensor& dst) {
dst.mutable_data<T>(dims, ctx.GetPlace());
dst.set_lod(batch_gate->lod());
to_batch(ctx, src, dst, false);
};
LoDTensor batch_hidden_g, batch_proj, batch_proj_g, batch_cell;
batch_hidden_g.mutable_data<T>(out_dims, ctx.GetPlace());
ToBatch(device_ctx, *proj_out, proj_dims, batch_proj); // T x P
ToBatch(device_ctx, *projection_g, proj_dims, batch_proj_g); // T x P
ToBatch(device_ctx, *cell_out, out_dims, batch_cell); // T x D
LoDTensor batch_cell_g, batch_gate_g;
batch_cell_g.mutable_data<T>(out_dims, ctx.GetPlace());
// TODO(qingqing) support the case output cell has gradient.
// to_batch(device_ctx, *cell_g, batch_cell_g, false);
zero(device_ctx, &batch_cell_g, static_cast<T>(0.0));
batch_gate_g.mutable_data<T>(batch_gate->dims(), ctx.GetPlace());
batch_gate_g.set_lod(batch_gate->lod());
auto gate_act = math::detail::GetActivationType(
ctx.Attr<std::string>("gate_activation"));
auto cell_act = math::detail::GetActivationType(
ctx.Attr<std::string>("cell_activation"));
auto cand_act = math::detail::GetActivationType(
ctx.Attr<std::string>("candidate_activation"));
auto proj_act = math::detail::GetActivationType(
ctx.Attr<std::string>("proj_activation"));
auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
auto batch_starts = batch_gate->lod()[0];
size_t num_batch = batch_starts.size() - 1;
for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]);
Tensor cur_proj = batch_proj.Slice(bstart, bend);
Tensor proj_g = batch_proj_g.Slice(bstart, bend);
if (proj_act != math::detail::ActivationType::kIdentity) {
auto cur_proj_dev = EigenMatrix<T>::From(cur_proj);
auto proj_g_dev = EigenMatrix<T>::From(proj_g);
ActGradCompute(cell_act, place, cur_proj_dev, cur_proj_dev, proj_g_dev,
proj_g_dev);
}
/* hidden state backward */
Tensor out_g = batch_hidden_g.Slice(bstart, bend);
math::matmul<DeviceContext, T>(device_ctx, proj_g, false, *proj_weight,
true, static_cast<T>(1.0), &out_g,
static_cast<T>(0.0));
/* projection weight backward*/
if (proj_weight_g) {
Tensor hidden_t = batch_hidden->Slice(bstart, bend);
math::matmul<DeviceContext, T>(device_ctx, hidden_t, true, proj_g,
false, static_cast<T>(1.0),
proj_weight_g, static_cast<T>(1.0));
}
Tensor gate = batch_gate->Slice(bstart, bend);
Tensor cell = batch_cell.Slice(bstart, bend);
Tensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend);
lstmp_value.gate_value = gate.data<T>();
lstmp_value.state_value = cell.data<T>();
lstmp_value.state_active_value = cell_pre_act.data<T>();
Tensor gate_g = batch_gate_g.Slice(bstart, bend);
Tensor cell_g = batch_cell_g.Slice(bstart, bend);
lstmp_grad.state_grad = cell_g.data<T>();
lstmp_grad.gate_grad = gate_g.data<T>();
lstmp_grad.output_grad = out_g.data<T>();
if (n > 0) {
int bstart_pre = static_cast<int>(batch_starts[n - 1]);
Tensor cell_pre = batch_cell.Slice(bstart_pre, bstart);
Tensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart);
lstmp_value.prev_state_value = cell_pre.data<T>();
lstmp_grad.prev_state_grad = cell_pre_g.data<T>();
} else {
lstmp_value.prev_state_value = c0 ? ordered_c0.data<T>() : nullptr;
lstmp_grad.prev_state_grad = c0_g ? ordered_c0_g.data<T>() : nullptr;
}
int cur_batch_size = bend - bstart;
math::LstmUnitGradFunctor<DeviceContext, T>::compute(
device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size,
gate_act, cell_act, cand_act);
if (n > 0) {
int pre_h_start = static_cast<int>(batch_starts[n - 1]);
int pre_h_end = pre_h_start + cur_batch_size;
auto pre_proj_g = batch_proj_g.Slice(pre_h_start, pre_h_end);
math::matmul<DeviceContext, T>(device_ctx, gate_g, false, *weight, true,
static_cast<T>(1.0), &pre_proj_g,
static_cast<T>(1.0));
if (weight_g) {
/* weight backward*/
auto pre_proj = batch_proj.Slice(pre_h_start, pre_h_end);
math::matmul<DeviceContext, T>(device_ctx, pre_proj, true, gate_g,
false, static_cast<T>(1.0), weight_g,
static_cast<T>(1.0));
}
} else {
if (h0 && weight_g) {
ReorderInitState<DeviceContext, T>(device_ctx, *h0, order,
&ordered_h0, true);
if (weight_g) {
math::matmul<DeviceContext, T>(device_ctx, *ordered_proj0, true,
gate_g, false, static_cast<T>(1.0),
weight_g, static_cast<T>(1.0));
}
}
if (h0 && (h0_g || proj_weight_g)) {
ordered_h0_g.mutable_data<T>(h0_g->dims(), ctx.GetPlace());
Tensor proj0_g;
proj0_g.Resize({in_dims[0], proj_weight->dims()[1]});
proj0_g.mutable_data<T>(ctx.GetPlace());
math::matmul<DeviceContext, T>(device_ctx, gate_g, false, *weight,
true, static_cast<T>(1.0), &proj0_g,
static_cast<T>(0.0));
if (proj_act != math::detail::ActivationType::kIdentity) {
auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0);
auto proj0_g_dev = EigenMatrix<T>::From(proj0_g);
ActGradCompute(cell_act, place, proj0_dev, proj0_dev, proj0_g_dev,
proj0_g_dev);
}
if (h0_g) {
math::matmul<DeviceContext, T>(
device_ctx, proj0_g, false, *proj_weight, true,
static_cast<T>(1.0), &ordered_h0_g, static_cast<T>(0.0));
}
if (proj_weight_g) {
math::matmul<DeviceContext, T>(device_ctx, ordered_h0, true,
proj0_g, false, static_cast<T>(1.0),
proj_weight_g, static_cast<T>(1.0));
}
}
}
}
math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
if (in_g) {
/* backward data */
in_g->mutable_data<T>(ctx.GetPlace());
to_seq(device_ctx, batch_gate_g, *in_g);
}
if (bias && bias_g) {
/* backward bias */
Tensor b_g = *bias_g;
b_g.Resize({bias_g->numel(), 1});
Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size);
math::ColwiseSum<DeviceContext, T> col_sum;
col_sum(device_ctx, batch_gate_g, &gate_bias_g);
}
if (h0 && h0_g) {
ReorderInitState<DeviceContext, T>(device_ctx, ordered_h0_g, order, h0_g,
false);
}
if (c0 && c0_g) {
ReorderInitState<DeviceContext, T>(device_ctx, ordered_c0_g, order, c0_g,
false);
}
}
};
} // namespace operators
} // namespace paddle
...@@ -11,7 +11,7 @@ if(WITH_GPU) ...@@ -11,7 +11,7 @@ if(WITH_GPU)
nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function) nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function)
nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor) nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor)
nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function) nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function)
nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context tensor) nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context tensor math_function)
nv_library(sequence_padding SRCS sequence_padding.cc sequence_padding.cu DEPS lod_tensor device_context) nv_library(sequence_padding SRCS sequence_padding.cc sequence_padding.cu DEPS lod_tensor device_context)
nv_library(sequence_scale SRCS sequence_scale.cc sequence_scale.cu DEPS lod_tensor device_context) nv_library(sequence_scale SRCS sequence_scale.cc sequence_scale.cu DEPS lod_tensor device_context)
nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions) nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
...@@ -28,7 +28,7 @@ else() ...@@ -28,7 +28,7 @@ else()
cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function) cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function)
cc_library(vol2col SRCS vol2col.cc DEPS device_context tensor) cc_library(vol2col SRCS vol2col.cc DEPS device_context tensor)
cc_library(context_project SRCS context_project.cc DEPS device_context math_function) cc_library(context_project SRCS context_project.cc DEPS device_context math_function)
cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context tensor) cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context tensor math_function)
cc_library(sequence_padding SRCS sequence_padding.cc DEPS lod_tensor device_context) cc_library(sequence_padding SRCS sequence_padding.cc DEPS lod_tensor device_context)
cc_library(sequence_scale SRCS sequence_scale.cc DEPS lod_tensor device_context) cc_library(sequence_scale SRCS sequence_scale.cc DEPS lod_tensor device_context)
cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions) cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
......
...@@ -31,7 +31,7 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> { ...@@ -31,7 +31,7 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
PADDLE_ENFORCE_EQ(in1_height, input2.height()); PADDLE_ENFORCE_EQ(in1_height, input2.height());
output->set_height(in1_height); output->set_height(in1_height);
auto& in1_rows = input1.rows(); framework::Vector<int64_t> in1_rows(input1.rows());
auto& in2_rows = input2.rows(); auto& in2_rows = input2.rows();
std::vector<int64_t> out_rows; std::vector<int64_t> out_rows;
out_rows.reserve(in1_rows.size() + in2_rows.size()); out_rows.reserve(in1_rows.size() + in2_rows.size());
...@@ -108,7 +108,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> { ...@@ -108,7 +108,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
PADDLE_ENFORCE_EQ(in1_height, out_dims[0]); PADDLE_ENFORCE_EQ(in1_height, out_dims[0]);
auto& in1_value = input1.value(); auto& in1_value = input1.value();
auto& in1_rows = input1.rows(); framework::Vector<int64_t> in1_rows(input1.rows());
int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height);
...@@ -126,7 +126,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> { ...@@ -126,7 +126,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
dim3 grid(1, in1_rows.size()); dim3 grid(1, in1_rows.size());
SelectedRowsAddTensorKernel< SelectedRowsAddTensorKernel<
T, block_size><<<grid, threads, 0, context.stream()>>>( T, block_size><<<grid, threads, 0, context.stream()>>>(
in1_data, in1_rows.data(), out_data, in1_row_numel); in1_data, in1_rows.cuda_data(), out_data, in1_row_numel);
auto out_eigen = framework::EigenVector<T>::Flatten(*output); auto out_eigen = framework::EigenVector<T>::Flatten(*output);
auto in2_eigen = framework::EigenVector<T>::Flatten(input2); auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
...@@ -146,7 +146,7 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> { ...@@ -146,7 +146,7 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
auto in1_height = input1.height(); auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2->height()); PADDLE_ENFORCE_EQ(in1_height, input2->height());
auto& in1_rows = input1.rows(); framework::Vector<int64_t> in1_rows(input1.rows());
auto& in2_rows = *(input2->mutable_rows()); auto& in2_rows = *(input2->mutable_rows());
auto& in1_value = input1.value(); auto& in1_value = input1.value();
...@@ -204,7 +204,7 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> { ...@@ -204,7 +204,7 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
auto& in1_value = input1.value(); auto& in1_value = input1.value();
auto& in1_rows = input1.rows(); framework::Vector<int64_t> in1_rows(input1.rows());
int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
...@@ -216,7 +216,7 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> { ...@@ -216,7 +216,7 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
dim3 grid(1, in1_rows.size()); dim3 grid(1, in1_rows.size());
SelectedRowsAddToTensorKernel< SelectedRowsAddToTensorKernel<
T, block_size><<<grid, threads, 0, context.stream()>>>( T, block_size><<<grid, threads, 0, context.stream()>>>(
in1_data, in1_rows.data(), in2_data, in1_row_numel); in1_data, in1_rows.cuda_data(), in2_data, in1_row_numel);
} }
}; };
...@@ -257,7 +257,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> { ...@@ -257,7 +257,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
framework::SelectedRows operator()(const platform::CUDADeviceContext& context, framework::SelectedRows operator()(const platform::CUDADeviceContext& context,
const framework::SelectedRows& input) { const framework::SelectedRows& input) {
framework::SelectedRows out; framework::SelectedRows out;
auto input_rows = input.rows(); framework::Vector<int64_t> input_rows(input.rows());
std::set<int64_t> row_set(input_rows.begin(), input_rows.end()); std::set<int64_t> row_set(input_rows.begin(), input_rows.end());
std::vector<int64_t> merge_rows(row_set.begin(), row_set.end()); std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
...@@ -283,9 +283,9 @@ struct MergeAdd<platform::CUDADeviceContext, T> { ...@@ -283,9 +283,9 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
MergeAddKernel< MergeAddKernel<
T, 256><<<grid1, threads, 0, T, 256><<<grid1, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context) reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(input_data, input.rows().data(), out_data, .stream()>>>(input_data, input_rows.cuda_data(), out_data,
out.rows().data(), out.rows().size(), out.mutable_rows()->cuda_data(),
input_width); out.rows().size(), input_width);
return out; return out;
} }
}; };
...@@ -370,8 +370,8 @@ struct UpdateToTensor<platform::CUDADeviceContext, T> { ...@@ -370,8 +370,8 @@ struct UpdateToTensor<platform::CUDADeviceContext, T> {
dim3 threads(platform::PADDLE_CUDA_NUM_THREADS, 1); dim3 threads(platform::PADDLE_CUDA_NUM_THREADS, 1);
dim3 grid(1, in1_rows.size()); dim3 grid(1, in1_rows.size());
UpdateToTensorKernel<T, platform::PADDLE_CUDA_NUM_THREADS><<< UpdateToTensorKernel<T, platform::PADDLE_CUDA_NUM_THREADS><<<
grid, threads, 0, context.stream()>>>(in1_data, in1_rows.data(), op, grid, threads, 0, context.stream()>>>(in1_data, in1_rows.cuda_data(),
in2_data, in1_row_numel); op, in2_data, in1_row_numel);
} }
}; };
} // namespace scatter } // namespace scatter
......
...@@ -23,8 +23,10 @@ template <typename T> ...@@ -23,8 +23,10 @@ template <typename T>
class CopyMatrixRowsFunctor<platform::CPUDeviceContext, T> { class CopyMatrixRowsFunctor<platform::CPUDeviceContext, T> {
public: public:
void operator()(const platform::CPUDeviceContext& context, void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& src, const size_t* index, const framework::Tensor& src,
framework::Tensor& dst, bool is_src_index) { framework::Vector<size_t> index_lod, framework::Tensor& dst,
bool is_src_index) {
size_t* index = index_lod.data();
auto src_dims = src.dims(); auto src_dims = src.dims();
auto dst_dims = dst.dims(); auto dst_dims = dst.dims();
PADDLE_ENFORCE_EQ(src_dims.size(), 2UL, PADDLE_ENFORCE_EQ(src_dims.size(), 2UL,
......
...@@ -42,8 +42,10 @@ template <typename T> ...@@ -42,8 +42,10 @@ template <typename T>
class CopyMatrixRowsFunctor<platform::CUDADeviceContext, T> { class CopyMatrixRowsFunctor<platform::CUDADeviceContext, T> {
public: public:
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& src, const size_t* index, const framework::Tensor& src,
framework::Tensor& dst, bool is_src_index) { framework::Vector<size_t> index_lod, framework::Tensor& dst,
bool is_src_index) {
size_t* index = index_lod.cuda_data();
auto src_dims = src.dims(); auto src_dims = src.dims();
auto dst_dims = dst.dims(); auto dst_dims = dst.dims();
PADDLE_ENFORCE_EQ(src_dims.size(), 2, PADDLE_ENFORCE_EQ(src_dims.size(), 2,
......
...@@ -35,7 +35,7 @@ class CopyMatrixRowsFunctor { ...@@ -35,7 +35,7 @@ class CopyMatrixRowsFunctor {
// copy the input src to the indexed rows of output dst. // copy the input src to the indexed rows of output dst.
// The indexed rows are based on the input index. // The indexed rows are based on the input index.
void operator()(const DeviceContext& context, const framework::Tensor& src, void operator()(const DeviceContext& context, const framework::Tensor& src,
const size_t* index, framework::Tensor& dst, framework::Vector<size_t> index_lod, framework::Tensor& dst,
bool is_src_index); bool is_src_index);
}; };
...@@ -66,7 +66,7 @@ class LoDTensor2BatchFunctor { ...@@ -66,7 +66,7 @@ class LoDTensor2BatchFunctor {
PADDLE_ENFORCE_EQ(lods[1].size(), PADDLE_ENFORCE_EQ(lods[1].size(),
static_cast<size_t>(lod_tensor.dims()[0])); static_cast<size_t>(lod_tensor.dims()[0]));
CopyMatrixRowsFunctor<DeviceContext, T> to_batch; CopyMatrixRowsFunctor<DeviceContext, T> to_batch;
to_batch(context, lod_tensor, lods[1].data(), batch, true); to_batch(context, lod_tensor, lods[1], batch, true);
return; return;
} }
...@@ -144,7 +144,7 @@ class LoDTensor2BatchFunctor { ...@@ -144,7 +144,7 @@ class LoDTensor2BatchFunctor {
batch.set_lod(batch_lods); batch.set_lod(batch_lods);
CopyMatrixRowsFunctor<DeviceContext, T> to_batch; CopyMatrixRowsFunctor<DeviceContext, T> to_batch;
to_batch(context, lod_tensor, seq2batch_idx, batch, true); to_batch(context, lod_tensor, batch_lods[1], batch, true);
} }
}; };
...@@ -159,8 +159,7 @@ class Batch2LoDTensorFunctor { ...@@ -159,8 +159,7 @@ class Batch2LoDTensorFunctor {
PADDLE_ENFORCE_EQ(in_lod[1].size(), PADDLE_ENFORCE_EQ(in_lod[1].size(),
static_cast<size_t>(lod_tensor.dims()[0])); static_cast<size_t>(lod_tensor.dims()[0]));
CopyMatrixRowsFunctor<DeviceContext, T> to_seq; CopyMatrixRowsFunctor<DeviceContext, T> to_seq;
size_t* index = in_lod[1].data(); to_seq(context, batch, in_lod[1], lod_tensor, false);
to_seq(context, batch, index, lod_tensor, false);
} }
}; };
......
...@@ -120,12 +120,14 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> { ...@@ -120,12 +120,14 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
T* padding_data = padding.data<T>(); T* padding_data = padding.data<T>();
if (norm_by_times) { if (norm_by_times) {
SequencePaddingKernel<T, 1, 1><<<grid, threads, 0, context.stream()>>>( SequencePaddingKernel<T, 1, 1><<<grid, threads, 0, context.stream()>>>(
padding_data, const_cast<T*>(seq_data), abs_offset_lod[level].data(), padding_data, const_cast<T*>(seq_data),
sequence_width, max_sequence_length, num_sequences); abs_offset_lod[level].cuda_data(), sequence_width,
max_sequence_length, num_sequences);
} else { } else {
SequencePaddingKernel<T, 0, 1><<<grid, threads, 0, context.stream()>>>( SequencePaddingKernel<T, 0, 1><<<grid, threads, 0, context.stream()>>>(
padding_data, const_cast<T*>(seq_data), abs_offset_lod[level].data(), padding_data, const_cast<T*>(seq_data),
sequence_width, max_sequence_length, num_sequences); abs_offset_lod[level].cuda_data(), sequence_width,
max_sequence_length, num_sequences);
} }
} }
}; };
...@@ -193,12 +195,14 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> { ...@@ -193,12 +195,14 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
T* seq_data = seq.data<T>(); T* seq_data = seq.data<T>();
if (norm_by_times) { if (norm_by_times) {
SequencePaddingKernel<T, 1, 0><<<grid, threads, 0, context.stream()>>>( SequencePaddingKernel<T, 1, 0><<<grid, threads, 0, context.stream()>>>(
const_cast<T*>(padding_data), seq_data, abs_offset_lod[level].data(), const_cast<T*>(padding_data), seq_data,
sequence_width, max_sequence_length, num_sequences); abs_offset_lod[level].cuda_data(), sequence_width,
max_sequence_length, num_sequences);
} else { } else {
SequencePaddingKernel<T, 0, 0><<<grid, threads, 0, context.stream()>>>( SequencePaddingKernel<T, 0, 0><<<grid, threads, 0, context.stream()>>>(
const_cast<T*>(padding_data), seq_data, abs_offset_lod[level].data(), const_cast<T*>(padding_data), seq_data,
sequence_width, max_sequence_length, num_sequences); abs_offset_lod[level].cuda_data(), sequence_width,
max_sequence_length, num_sequences);
} }
} }
}; };
......
...@@ -73,7 +73,7 @@ class MaxSeqPoolFunctor<platform::CUDADeviceContext, T> { ...@@ -73,7 +73,7 @@ class MaxSeqPoolFunctor<platform::CUDADeviceContext, T> {
dim3 grid(num_seq, 1); dim3 grid(num_seq, 1);
auto stream = context.stream(); auto stream = context.stream();
KeMaxSequencePool<T><<<grid, threads, 0, stream>>>( KeMaxSequencePool<T><<<grid, threads, 0, stream>>>(
in_data, starts.data(), out_data, max_index, num_seq, dim); in_data, starts.cuda_data(), out_data, max_index, num_seq, dim);
} }
}; };
......
...@@ -46,7 +46,7 @@ class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> { ...@@ -46,7 +46,7 @@ class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> {
SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS><<< SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS><<<
num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>( num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>(
seq_data, abs_offset_lod[level].data(), scales, seq_width); seq_data, abs_offset_lod[level].cuda_data(), scales, seq_width);
} }
}; };
......
...@@ -119,7 +119,13 @@ REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, ...@@ -119,7 +119,13 @@ REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker,
REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp); REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
multiplex, multiplex,
ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, float>); ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, float>,
ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, double>,
ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, int>,
ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
multiplex_grad, multiplex_grad,
ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, float>); ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, float>,
ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, double>,
ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, int>,
ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, int64_t>);
...@@ -90,7 +90,13 @@ namespace ops = paddle::operators; ...@@ -90,7 +90,13 @@ namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
multiplex, multiplex,
ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, float>); ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, float>,
ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, double>,
ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, int>,
ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
multiplex_grad, multiplex_grad,
ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, float>); ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, float>,
ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, double>,
ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, int>,
ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, int64_t>);
...@@ -241,7 +241,7 @@ TEST_F(NCCLTester, ncclReduceOp) { ...@@ -241,7 +241,7 @@ TEST_F(NCCLTester, ncclReduceOp) {
// ncclBcastOp with desc // ncclBcastOp with desc
TEST_F(NCCLTester, ncclBcastOp) { TEST_F(NCCLTester, ncclBcastOp) {
std::unique_ptr<f::OpDesc> op2(new f::OpDesc); std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
const int kRoot = 5; const int kRoot = 0;
op2->SetType("ncclBcast"); op2->SetType("ncclBcast");
op2->SetInput("X", {"st"}); op2->SetInput("X", {"st"});
op2->SetInput("Communicator", {"comm"}); op2->SetInput("Communicator", {"comm"});
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/operators/one_hot_op.h"
#include "paddle/framework/framework.pb.h"
namespace paddle {
namespace operators {
// Operator definition for `one_hot`: validates the input/output slots and
// infers the shape of the output from the input shape and the `depth`
// attribute.
class OneHotOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Infers the shape of Out.
  //
  // Out has the same dimensions as X except that the last dimension is
  // replaced by the `depth` attribute. The LoD of X is shared with Out.
  // Fails (via PADDLE_ENFORCE*) when X or Out is missing, when X has rank
  // below 2, when the last dimension of X is not 1, or when depth <= 0.
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of OneHotOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of OneHotOp should not be null.");

    auto x_dims = ctx->GetInputDim("X");
    PADDLE_ENFORCE_GE(x_dims.size(), 2,
                      "Rank of Input(X) should be at least 2.");
    // The kernels emit `depth` output elements per *input element*, while the
    // output buffer is sized from X's dims with the last one replaced by
    // `depth`. Those only agree when the last dimension is exactly 1; a
    // larger value would make the kernels write past the end of Out.
    PADDLE_ENFORCE_EQ(x_dims[x_dims.size() - 1], 1,
                      "Last dimension of Input(X) should be 1.");

    int depth = ctx->Attrs().Get<int>("depth");
    PADDLE_ENFORCE_GT(depth, 0, "Should provide a positive depth (%d).", depth);

    framework::DDim out_dims(x_dims);
    out_dims[out_dims.size() - 1] = depth;
    ctx->SetOutputDim("Out", out_dims);
    ctx->ShareLoD("X", /* --> */ "Out");
  }
};
// Declares the proto of the one_hot operator: one input ("X"), one output
// ("Out"), the attributes "depth" and "dtype", and the user-facing
// documentation string.
class OneHotOpMaker : public framework::OpProtoAndCheckerMaker {
public:
OneHotOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
// Index tensor to be encoded.
AddInput("X",
"(LoDTensor, LoDTensor<int>) Input variable with rank at least 2. "
"The last dimension of X should be 1. Each value of X is an index "
"to indicate the position.");
// One-hot encoded result.
AddOutput("Out",
"(Tensor, Tensor<float>) Output tensor with same rank as X. "
"The tensor consists of one-hot representations of values in X.");
// Length of each one-hot vector; no default is set here.
AddAttr<int>("depth",
"A positive integer to specify the length of one-hot vector.");
// Element type of Out, stored as an integer; defaults to the FP32 value of
// framework::proto::DataType.
AddAttr<int>("dtype",
"An integer to specify the data type of one-hot "
"vector. The default value is FP32.")
.SetDefault(paddle::framework::proto::DataType::FP32);
// Operator documentation with a worked example (depth = 4).
AddComment(R"DOC(
One Hot Operator. This operator creates the one-hot representations for input
index values. The following example will help to explain the function of this
operator:
X is a LoDTensor:
X.lod = [[0, 1, 4]]
X.shape = [4, 1]
X.data = [[1], [1], [3], [0]]
set depth = 4
Out is a LoDTensor:
Out.lod = [[0, 1, 4]]
Out.shape = [4, 4]
Out.data = [[0., 1., 0., 0.],
[0., 1., 0., 0.],
[0., 0., 0., 1.],
[1., 0., 0., 0.]]
)DOC");
}
};
} // namespace operators
} // namespace paddle
// Short alias used by the registration macros below.
namespace ops = paddle::operators;
// Registers the one_hot operator with its shape-inference class and proto
// maker. EmptyGradOpMaker is passed as the grad-op maker
// (NOTE(review): presumably this means no gradient operator is generated for
// one_hot -- confirm against EmptyGradOpMaker's definition).
REGISTER_OPERATOR(one_hot, ops::OneHotOp, ops::OneHotOpMaker,
paddle::framework::EmptyGradOpMaker);
// CPU kernels, instantiated for int and int64_t index inputs.
REGISTER_OP_CPU_KERNEL(
one_hot, ops::OneHotKernel<paddle::platform::CPUDeviceContext, int>,
ops::OneHotKernel<paddle::platform::CPUDeviceContext, int64_t>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/operators/one_hot_op.h"
#include "paddle/platform/cuda_helper.h"
#include "paddle/platform/gpu_info.h"
namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
// CUDA kernel that writes the "hot" entries of a one-hot encoding.
//
// Each thread handles at most one input element `idx` and sets
// p_out_data[idx * depth + p_in_data[idx]] to 1. Nothing else is written, so
// the output buffer must already be zero-filled by the caller.
//
//   p_in_data   input index values (numel elements)
//   p_out_data  output buffer, addressed as numel rows of `depth` elements
//   numel       number of input elements
//   depth       length of each one-hot row
template <typename InT, typename OutT>
__global__ void FillOutputKernel(const InT* p_in_data, OutT* p_out_data,
                                 const int64_t numel, const int depth) {
  // Compute the global index in 64 bits so that neither it nor the
  // idx * depth offset below overflows a 32-bit int on large tensors.
  const int64_t idx =
      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (idx >= numel) {
    return;
  }
  const InT hot = p_in_data[idx];
  // An index outside [0, depth) would land in a neighbouring row or outside
  // the buffer entirely; skip it and leave the row all-zero instead of
  // corrupting memory.
  if (hot >= 0 && hot < depth) {
    p_out_data[idx * depth + hot] = 1.0;
  }
}
// Visitor passed to framework::VisitDataType by the CUDA one_hot kernel.
//
// InT is the element type of the input index tensor; the output element type
// OutT is supplied by the visitor call through the templated operator().
template <typename DeviceContext, typename InT>
struct OneHotOpCUDAFunctor {
  const framework::LoDTensor* in_;  // input indices
  framework::LoDTensor* out_;       // one-hot output
  const DeviceContext& ctx_;        // device context providing place/stream
  int depth_;                       // length of each one-hot row

  // The initializer list follows the member declaration order above (members
  // are always initialized in declaration order; listing them differently
  // only produces a -Wreorder warning).
  OneHotOpCUDAFunctor(const framework::LoDTensor* in, framework::LoDTensor* out,
                      int depth, const DeviceContext& ctx)
      : in_(in), out_(out), ctx_(ctx), depth_(depth) {}

  // Allocates Out as OutT on the context's place, zero-fills it, then
  // launches FillOutputKernel on the context's stream to set the hot entries.
  template <typename OutT>
  void operator()() const {
    auto* p_in_data = in_->data<InT>();
    auto numel = in_->numel();
    auto* p_out_data = out_->mutable_data<OutT>(ctx_.GetPlace());
    auto stream = ctx_.stream();

    // The kernel only writes the 1 entries, so clear the buffer first.
    math::set_constant(ctx_, out_, 0.0);

    // Ceil-divide so that every input element is covered by a thread.
    FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /
                           PADDLE_CUDA_NUM_THREADS,
                       PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
        p_in_data, p_out_data, numel, depth_);
  }
};
using LoDTensor = framework::LoDTensor;
// CUDA kernel class of the one_hot operator.
//
// T is the element type of the input index tensor. The element type of the
// output is selected at run time from the integer "dtype" attribute and
// dispatched through framework::VisitDataType.
template <typename DeviceContext, typename T>
class OneHotCUDAKernel : public framework::OpKernel<T> {
 public:
  // Reads X, Out, "depth" and "dtype" from the execution context and runs
  // OneHotOpCUDAFunctor for the requested output type.
  void Compute(const framework::ExecutionContext& context) const override {
    auto* index_tensor = context.Input<LoDTensor>("X");
    auto* one_hot_tensor = context.Output<LoDTensor>("Out");
    int one_hot_len = context.Attr<int>("depth");

    auto out_type =
        static_cast<framework::proto::DataType>(context.Attr<int>("dtype"));
    auto& dev_ctx = context.template device_context<DeviceContext>();

    framework::VisitDataType(
        out_type, OneHotOpCUDAFunctor<DeviceContext, T>(
                      index_tensor, one_hot_tensor, one_hot_len, dev_ctx));
  }
};
} // namespace operators
} // namespace paddle
// Short alias used by the registration macro below.
namespace ops = paddle::operators;
// CUDA kernels of one_hot, instantiated for int and int64_t index inputs.
REGISTER_OP_CUDA_KERNEL(
one_hot, ops::OneHotCUDAKernel<paddle::platform::CUDADeviceContext, int>,
ops::OneHotCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/framework/op_registry.h"
#include "paddle/operators/math/math_function.h"
namespace paddle {
namespace operators {
// Visitor passed to framework::VisitDataType by the CPU one_hot kernel.
//
// InT is the element type of the input index tensor; the output element type
// OutT is supplied by the visitor call through the templated operator().
template <typename DeviceContext, typename InT>
struct OneHotOpFunctor {
  const framework::LoDTensor* in_;  // input indices
  framework::LoDTensor* out_;       // one-hot output
  int depth_;                       // length of each one-hot row
  const DeviceContext& ctx_;        // device context providing the place

  OneHotOpFunctor(const framework::LoDTensor* in, framework::LoDTensor* out,
                  int depth, const DeviceContext& ctx)
      : in_(in), out_(out), depth_(depth), ctx_(ctx) {}

  // Allocates Out as OutT, zero-fills it and sets, for every input element i,
  // the entry at offset i * depth_ + in[i] to 1. Fails (via PADDLE_ENFORCE*)
  // when an input value lies outside [0, depth_).
  template <typename OutT>
  void operator()() const {
    auto* p_in_data = in_->data<InT>();
    auto numel = in_->numel();
    auto* p_out_data = out_->mutable_data<OutT>(ctx_.GetPlace());
    math::set_constant(ctx_, out_, 0.0);

    // numel is 64-bit; iterate and compute offsets in 64 bits as well so
    // that neither the counter nor i * depth_ overflows a 32-bit int.
    const int64_t row_len = depth_;
    for (int64_t i = 0; i < numel; ++i) {
      PADDLE_ENFORCE_GE(p_in_data[i], 0,
                        "Illegal index value, should be at least 0.");
      PADDLE_ENFORCE_LT(p_in_data[i], depth_,
                        "Illegal index value, should be less than depth (%d).",
                        depth_);
      *(p_out_data + i * row_len + p_in_data[i]) = 1.0;
    }
  }
};
using LoDTensor = framework::LoDTensor;
// CPU kernel class of the one_hot operator.
//
// T is the element type of the input index tensor. The element type of the
// output is selected at run time from the integer "dtype" attribute and
// dispatched through framework::VisitDataType.
template <typename DeviceContext, typename T>
class OneHotKernel : public framework::OpKernel<T> {
 public:
  // Reads X, Out, "depth" and "dtype" from the execution context and runs
  // OneHotOpFunctor for the requested output type.
  void Compute(const framework::ExecutionContext& context) const override {
    auto* index_tensor = context.Input<LoDTensor>("X");
    auto* one_hot_tensor = context.Output<LoDTensor>("Out");
    int one_hot_len = context.Attr<int>("depth");

    auto out_type =
        static_cast<framework::proto::DataType>(context.Attr<int>("dtype"));
    auto& dev_ctx = context.template device_context<DeviceContext>();

    framework::VisitDataType(
        out_type, OneHotOpFunctor<DeviceContext, T>(
                      index_tensor, one_hot_tensor, one_hot_len, dev_ctx));
  }
};
} // namespace operators
} // namespace paddle
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include "paddle/framework/executor.h" #include "paddle/framework/executor.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/framework/threadpool.h" #include "paddle/framework/threadpool.h"
#include "paddle/operators/detail/safe_ref.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -31,6 +32,7 @@ static constexpr char kParallelScopes[] = "parallel_scopes"; ...@@ -31,6 +32,7 @@ static constexpr char kParallelScopes[] = "parallel_scopes";
static constexpr char kParallelBlock[] = "sub_block"; static constexpr char kParallelBlock[] = "sub_block";
using LoDTensor = framework::LoDTensor; using LoDTensor = framework::LoDTensor;
using SelectedRows = framework::SelectedRows;
static void SplitTensorAndMoveTensorToScopes( static void SplitTensorAndMoveTensorToScopes(
const framework::Scope &scope, std::vector<framework::Scope *> *sub_scopes, const framework::Scope &scope, std::vector<framework::Scope *> *sub_scopes,
...@@ -38,8 +40,10 @@ static void SplitTensorAndMoveTensorToScopes( ...@@ -38,8 +40,10 @@ static void SplitTensorAndMoveTensorToScopes(
const std::vector<std::string> &names) { const std::vector<std::string> &names) {
size_t num_sub_scopes = 0; size_t num_sub_scopes = 0;
for (auto &argu : names) { for (auto &argu : names) {
auto *var = scope.FindVar(argu); const auto &tensor =
const auto &tensor = var->Get<LoDTensor>(); detail::Ref(scope.FindVar(argu),
"Cannot find variable %s in the parent scope", argu)
.Get<LoDTensor>();
auto lod_tensors = tensor.SplitLoDTensor(places); auto lod_tensors = tensor.SplitLoDTensor(places);
for (auto &lod : lod_tensors) { for (auto &lod : lod_tensors) {
...@@ -59,11 +63,37 @@ static void SplitTensorAndMoveTensorToScopes( ...@@ -59,11 +63,37 @@ static void SplitTensorAndMoveTensorToScopes(
} }
for (size_t i = 0; i < lod_tensors.size(); ++i) { for (size_t i = 0; i < lod_tensors.size(); ++i) {
*(*sub_scopes)[i]->Var(argu)->GetMutable<LoDTensor>() = lod_tensors[i]; *detail::Ref(sub_scopes->at(i)->Var(argu),
"Cannot find variable in the sub-scope", argu)
.GetMutable<LoDTensor>() = lod_tensors[i];
} }
} }
} }
inline void CopyOrShare(const framework::Variable &src,
const platform::Place &dst_place,
framework::Variable *dst) {
if (src.IsType<LoDTensor>()) {
if (src.Get<LoDTensor>().place() == dst_place) {
dst->GetMutable<LoDTensor>()->ShareDataWith(src.Get<LoDTensor>());
} else {
Copy(src.Get<LoDTensor>(), dst_place, dst->GetMutable<LoDTensor>());
}
} else if (src.IsType<SelectedRows>()) {
auto &src_sr = src.Get<SelectedRows>();
auto *dst_sr = dst->GetMutable<SelectedRows>();
dst_sr->set_rows(src_sr.rows());
dst_sr->set_height(src_sr.height());
if (src_sr.value().place() == dst_place) {
dst_sr->mutable_value()->ShareDataWith(src_sr.value());
} else {
Copy(src_sr.value(), dst_place, dst_sr->mutable_value());
}
} else {
PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name());
}
}
void WaitOnPlace(const platform::Place place) { void WaitOnPlace(const platform::Place place) {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place); auto &dev_ctx = *pool.Get(place);
...@@ -210,30 +240,30 @@ class ParallelDoGradOp : public framework::OperatorBase { ...@@ -210,30 +240,30 @@ class ParallelDoGradOp : public framework::OperatorBase {
} }
WaitOnPlaces(places); WaitOnPlaces(places);
// merge grad AccumulateGrad(scope, place, sub_scopes, places);
}
void AccumulateGrad(const framework::Scope &scope,
const platform::Place &place,
const std::vector<framework::Scope *> &sub_scopes,
const platform::PlaceList &places) const {
for (auto &s : Outputs(framework::GradVarName(kParameters))) { for (auto &s : Outputs(framework::GradVarName(kParameters))) {
auto &result = sub_scopes[0]->FindVar(s)->Get<LoDTensor>();
std::string tmp_name; std::string tmp_name;
auto *tmp = sub_scopes[0]->Var(&tmp_name)->GetMutable<LoDTensor>(); auto *tmp = sub_scopes[0]->Var(&tmp_name);
for (size_t i = 1; i < sub_scopes.size(); ++i) { for (size_t i = 1; i < sub_scopes.size(); ++i) {
auto &tensor_to_merge = sub_scopes[i]->FindVar(s)->Get<LoDTensor>(); CopyOrShare(*sub_scopes[i]->FindVar(s), places[0], tmp);
if (!(places[i] == places[0])) {
framework::Copy(tensor_to_merge, places[0], tmp);
WaitOnPlace(places[0]); WaitOnPlace(places[0]);
} else {
tmp->ShareDataWith(tensor_to_merge);
}
auto sum_op = framework::OpRegistry::CreateOp( auto sum_op = framework::OpRegistry::CreateOp(
"sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}}, "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}},
framework::AttributeMap{}); framework::AttributeMap{});
VLOG(3) << sum_op->DebugStringEx(sub_scopes[0]);
sum_op->Run(*sub_scopes[0], places[0]); sum_op->Run(*sub_scopes[0], places[0]);
WaitOnPlace(places[0]); WaitOnPlace(places[0]);
} }
VLOG(3) << result; CopyOrShare(*sub_scopes[0]->FindVar(s), place, scope.FindVar(s));
framework::Copy(result, place, scope.FindVar(s)->GetMutable<LoDTensor>());
} }
WaitOnPlaces(places); WaitOnPlaces(places);
} }
...@@ -262,6 +292,17 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker { ...@@ -262,6 +292,17 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker {
this->InputGrad(input_param, false)); this->InputGrad(input_param, false));
} }
} }
auto *g_block = this->grad_block_[0];
// All variable name that needed by gradient operators
std::unordered_set<std::string> all_inputs_in_grad_blocks;
for (size_t i = 0; i < g_block->OpSize(); ++i) {
auto *op = g_block->Op(i);
for (auto &var_name : op->InputArgumentNames()) {
all_inputs_in_grad_blocks.insert(var_name);
}
}
for (auto &output_param : this->OutputNames()) { for (auto &output_param : this->OutputNames()) {
if (output_param == kParallelScopes) { if (output_param == kParallelScopes) {
...@@ -270,8 +311,17 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker { ...@@ -270,8 +311,17 @@ class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker {
this->Output(output_param)); this->Output(output_param));
} else { } else {
grad->SetInput(output_param, this->Output(output_param)); grad->SetInput(output_param, this->Output(output_param));
grad->SetInput(framework::GradVarName(output_param), std::vector<std::string> og_names;
this->OutputGrad(output_param)); for (auto &og_name : this->OutputGrad(output_param)) {
if (all_inputs_in_grad_blocks.count(og_name) != 0) {
// there are some gradient operators who need the OG. So make this
// OG as an input of parallel.do
og_names.push_back(og_name);
}
// else, there is no operator who need the OG. Do not use this OG as
// an input
}
grad->SetInput(framework::GradVarName(output_param), og_names);
} }
} }
grad->SetAttrMap(this->Attrs()); grad->SetAttrMap(this->Attrs());
...@@ -289,7 +339,7 @@ class ParallelDoGradOpShapeInference : public framework::InferShapeBase { ...@@ -289,7 +339,7 @@ class ParallelDoGradOpShapeInference : public framework::InferShapeBase {
PADDLE_ENFORCE(ctx->HasInputs(kParameters)); PADDLE_ENFORCE(ctx->HasInputs(kParameters));
PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters))); PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters)));
PADDLE_ENFORCE(ctx->HasInput(kInputs)); PADDLE_ENFORCE(ctx->HasInputs(kInputs));
for (auto &s : output) { for (auto &s : output) {
PADDLE_ENFORCE(ctx->HasInputs(s)); PADDLE_ENFORCE(ctx->HasInputs(s));
......
...@@ -139,10 +139,8 @@ class PoolGradKernel : public framework::OpKernel<T> { ...@@ -139,10 +139,8 @@ class PoolGradKernel : public framework::OpKernel<T> {
auto& dev_ctx = context.template device_context<DeviceContext>(); auto& dev_ctx = context.template device_context<DeviceContext>();
if (in_x_grad) { if (in_x_grad) {
in_x_grad->mutable_data<T>(context.GetPlace()); in_x_grad->mutable_data<T>(context.GetPlace());
auto temp = framework::EigenVector<T>::Flatten(*in_x_grad); paddle::operators::math::SetConstant<DeviceContext, T> set_constant;
temp.device( set_constant(dev_ctx, in_x_grad, 0.0);
*context.template device_context<DeviceContext>().eigen_device()) =
temp.constant(static_cast<T>(0));
switch (ksize.size()) { switch (ksize.size()) {
case 2: { case 2: {
......
...@@ -29,8 +29,6 @@ limitations under the License. */ ...@@ -29,8 +29,6 @@ limitations under the License. */
#include "paddle/operators/detail/simple_block_queue.h" #include "paddle/operators/detail/simple_block_queue.h"
#include "paddle/string/printf.h" #include "paddle/string/printf.h"
#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -95,7 +93,6 @@ class RecvOp : public framework::OperatorBase { ...@@ -95,7 +93,6 @@ class RecvOp : public framework::OperatorBase {
auto param_list = Attr<std::vector<std::string>>("ParamList"); auto param_list = Attr<std::vector<std::string>>("ParamList");
auto grad_list = Attr<std::vector<std::string>>("GradList"); auto grad_list = Attr<std::vector<std::string>>("GradList");
auto fan_in = Attr<int>("Fanin"); auto fan_in = Attr<int>("Fanin");
size_t param_count = param_list.size();
auto *block = Attr<framework::BlockDesc *>(kOptimizeBlock); auto *block = Attr<framework::BlockDesc *>(kOptimizeBlock);
auto *program = block->Program(); auto *program = block->Program();
...@@ -103,20 +100,28 @@ class RecvOp : public framework::OperatorBase { ...@@ -103,20 +100,28 @@ class RecvOp : public framework::OperatorBase {
// TODO(typhoonzero): change this to a while_op for every cluster-batch. // TODO(typhoonzero): change this to a while_op for every cluster-batch.
bool exit_flag = false; bool exit_flag = false;
size_t barrier_size = param_count * fan_in;
while (!exit_flag) { while (!exit_flag) {
// Get from multiple trainers, we don't care about the order in which // Get from multiple trainers, we don't care about the order in which
// the gradients arrives, just add suffix 0~n and merge the gradient. // the gradients arrives, just add suffix 0~n and merge the gradient.
rpc_service_->SetCond(0); rpc_service_->SetCond(0);
for (size_t i = 0; i < barrier_size; ++i) { size_t recv_var_cnt = 0;
int batch_barrier = 0;
while (batch_barrier != fan_in) {
const detail::MessageWithName &v = rpc_service_->Get(); const detail::MessageWithName &v = rpc_service_->Get();
auto grad_var_name = v.first; auto grad_var_name = v.first;
if (grad_var_name == LISTEN_TERMINATE_MESSAGE) { if (grad_var_name == LISTEN_TERMINATE_MESSAGE) {
LOG(INFO) << "received terminate message and exit"; LOG(INFO) << "received terminate message and exit";
exit_flag = true; exit_flag = true;
break; break;
} } else if (grad_var_name == BATCH_BARRIER_MESSAGE) {
auto it = std::find(grad_list.begin(), grad_list.end(), grad_var_name); VLOG(3) << "recv batch barrier message";
batch_barrier++;
continue;
} else {
// receive a variable
recv_var_cnt++;
auto it =
std::find(grad_list.begin(), grad_list.end(), grad_var_name);
std::string param_var_name; std::string param_var_name;
if (it != grad_list.end()) { if (it != grad_list.end()) {
param_var_name = param_list[it - grad_list.begin()]; param_var_name = param_list[it - grad_list.begin()];
...@@ -125,6 +130,7 @@ class RecvOp : public framework::OperatorBase { ...@@ -125,6 +130,7 @@ class RecvOp : public framework::OperatorBase {
} }
VLOG(3) << "received grad: " << grad_var_name VLOG(3) << "received grad: " << grad_var_name
<< " updating param: " << param_var_name; << " updating param: " << param_var_name;
if (fan_in > 1) { if (fan_in > 1) {
grad_var_name = this->GetGradVarNameForTrainer(grad_var_name); grad_var_name = this->GetGradVarNameForTrainer(grad_var_name);
} }
...@@ -135,6 +141,9 @@ class RecvOp : public framework::OperatorBase { ...@@ -135,6 +141,9 @@ class RecvOp : public framework::OperatorBase {
} }
detail::DeserializeFromMessage(v.second, dev_ctx, var); detail::DeserializeFromMessage(v.second, dev_ctx, var);
} }
}
VLOG(3) << "recv " << recv_var_cnt << " parmeters for one barrier.";
// TODO(Yancey1989): merge SelectedRows variables here
if (exit_flag) { if (exit_flag) {
break; break;
} }
...@@ -146,7 +155,7 @@ class RecvOp : public framework::OperatorBase { ...@@ -146,7 +155,7 @@ class RecvOp : public framework::OperatorBase {
LOG(ERROR) << "run sub program error " << e.what(); LOG(ERROR) << "run sub program error " << e.what();
} }
rpc_service_->SetCond(1); rpc_service_->SetCond(1);
rpc_service_->WaitClientGet(barrier_size); rpc_service_->WaitClientGet(recv_var_cnt);
grads_counter_.clear(); grads_counter_.clear();
} // while(true) } // while(true)
} }
...@@ -161,7 +170,6 @@ class RecvOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -161,7 +170,6 @@ class RecvOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
RecvOpMaker(OpProto *proto, OpAttrChecker *op_checker) RecvOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("RX", "(Tensor) Input tensor to be optimized").AsDuplicable();
AddComment(R"DOC( AddComment(R"DOC(
Recv operator Recv operator
......
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/operators/reduce_op.h" #include "paddle/operators/reduce_op.h"
#include "paddle/operators/net_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -38,10 +37,14 @@ class ReduceOp : public framework::OperatorWithKernel { ...@@ -38,10 +37,14 @@ class ReduceOp : public framework::OperatorWithKernel {
dim, x_rank, dim, x_rank,
"The dim should be in the range [-rank(input), rank(input))."); "The dim should be in the range [-rank(input), rank(input)).");
bool reduce_all = ctx->Attrs().Get<bool>("reduce_all"); bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
if (reduce_all) { if (reduce_all) {
if (keep_dim)
ctx->SetOutputDim(
"Out", framework::make_ddim(std::vector<int64_t>(x_rank, 1)));
else
ctx->SetOutputDim("Out", {1}); ctx->SetOutputDim("Out", {1});
} else { } else {
bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
auto dims_vector = vectorize(x_dims); auto dims_vector = vectorize(x_dims);
if (keep_dim || x_rank == 1) { if (keep_dim || x_rank == 1) {
dims_vector[dim] = 1; dims_vector[dim] = 1;
......
...@@ -90,14 +90,10 @@ Reshape Operator. ...@@ -90,14 +90,10 @@ Reshape Operator.
Reshape Input(X) into the shape specified by Attr(shape). Reshape Input(X) into the shape specified by Attr(shape).
An example: An example:
Given a 2-D tensor X with 2 rows and 2 columns Given a 2-D tensor X with 2 rows and 2 columns : [[1, 2], [3, 4]]
[[1, 2], [3, 4]]
and target shape = [1, 4], the reshape operator will transform and target shape = [1, 4], the reshape operator will transform
the tensor X into a 2-D tensor: the tensor X into a 2-D tensor: [[1, 2, 3, 4]]
[[1, 2, 3, 4]]
One dimension in the target shape can be set -1, representing that its One dimension in the target shape can be set -1, representing that its
size is unknown. In this case, the real dimension will be infered from size is unknown. In this case, the real dimension will be infered from
......
...@@ -307,7 +307,7 @@ class RowConvKernel<platform::CUDADeviceContext, T> ...@@ -307,7 +307,7 @@ class RowConvKernel<platform::CUDADeviceContext, T>
int input_dim = X->dims()[1]; int input_dim = X->dims()[1];
int num_sequence = batch_indices.size() - 1; int num_sequence = batch_indices.size() - 1;
int future_context = Filter->dims()[0]; int future_context = Filter->dims()[0];
size_t *idx = batch_indices.data(); size_t *idx = batch_indices.cuda_data();
auto stream = context.cuda_device_context().stream(); auto stream = context.cuda_device_context().stream();
if (future_context <= 32) { if (future_context <= 32) {
...@@ -345,7 +345,7 @@ class RowConvGradKernel<platform::CUDADeviceContext, T> ...@@ -345,7 +345,7 @@ class RowConvGradKernel<platform::CUDADeviceContext, T>
int input_dim = X->dims()[1]; int input_dim = X->dims()[1];
int num_sequence = batch_indices.size() - 1; int num_sequence = batch_indices.size() - 1;
int future_context = Filter->dims()[0]; int future_context = Filter->dims()[0];
size_t *idx = batch_indices.data(); size_t *idx = batch_indices.cuda_data();
auto &device_ctx = context.cuda_device_context(); auto &device_ctx = context.cuda_device_context();
math::SetConstant<platform::CUDADeviceContext, T> zero; math::SetConstant<platform::CUDADeviceContext, T> zero;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stdint.h>
#include <sys/stat.h>
#include <fstream>
#include <numeric>
#include <sstream>
#include "paddle/framework/data_type.h"
#include "paddle/framework/framework.pb.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/op_registry.h"
#include "paddle/platform/device_context.h"
namespace paddle {
namespace operators {
// TODO(sidgoyal78): These functions are needed by other files (save_op); move
// them to the paddle::filesystem namespace (as noted by yuyang18 in save_op).
// Path separator that DirName() searches for when splitting a file path.
constexpr char kSEP = '/';
// Returns true when stat() succeeds on `filepath`, i.e. when some file system
// entry can be found at that path; returns false for any stat() failure.
static bool FileExists(const std::string &filepath) {
  struct stat info;
  const int rc = stat(filepath.c_str(), &info);
  return rc == 0;
}
// Returns the part of `filepath` that precedes its last kSEP, or an empty
// string when `filepath` contains no separator at all.
static std::string DirName(const std::string &filepath) {
  const auto last_sep = filepath.rfind(kSEP);
  return last_sep == std::string::npos ? std::string()
                                       : filepath.substr(0, last_sep);
}
// Creates the single directory `path` with mode 0755. A failure whose errno
// is EEXIST is tolerated; any other mkdir() failure is reported through
// PADDLE_ENFORCE_EQ.
static void MkDir(const char *path) {
  const int rc = mkdir(path, 0755);
  if (rc == 0) return;
  PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path);
}
// Creates `fullpath` together with every missing ancestor directory, parents
// first. The recursion ends at an empty path or at the first path that
// already exists.
static void MkDirRecursively(const char *fullpath) {
  const bool is_empty = (fullpath[0] == '\0');
  if (is_empty || FileExists(fullpath)) {
    return;
  }
  const std::string parent = DirName(fullpath);
  MkDirRecursively(parent.c_str());
  MkDir(fullpath);
}
// Operator that serializes every LoDTensor listed in input slot "X", in
// order, into the single file named by the "file_path" attribute.
class SaveCombineOp : public framework::OperatorBase {
 public:
  SaveCombineOp(const std::string &type,
                const framework::VariableNameMap &inputs,
                const framework::VariableNameMap &outputs,
                const framework::AttributeMap &attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}

  // Writes all "X" variables found in `scope` to the target file, using the
  // device context of `place` for serialization.
  //
  // Fails (through PADDLE_THROW / PADDLE_ENFORCE) when:
  //   - the file already exists and the "overwrite" attribute is false;
  //   - the input list is empty;
  //   - the file cannot be opened for writing;
  //   - an input variable is missing from `scope` or is not a LoDTensor.
  void Run(const framework::Scope &scope,
           const platform::Place &place) const override {
    auto filename = Attr<std::string>("file_path");
    auto overwrite = Attr<bool>("overwrite");

    if (FileExists(filename) && !overwrite) {
      PADDLE_THROW("%s exists!, cannot save_combine to it when overwrite=false",
                   filename);
    }

    // Check the input count before touching the file system, so a call with
    // no inputs neither creates directories nor truncates an existing file.
    auto inp_var_names = Inputs("X");
    PADDLE_ENFORCE_GT(static_cast<int>(inp_var_names.size()), 0,
                      "The number of input variables should be greater than 0");

    MkDirRecursively(DirName(filename).c_str());
    // Serialized tensors are raw bytes; binary mode keeps the stream from
    // applying any platform-specific text translation to them.
    std::ofstream fout(filename, std::ios::binary);
    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
                   filename);

    // get device context from pool
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto &dev_ctx = *pool.Get(place);

    for (size_t i = 0; i < inp_var_names.size(); i++) {
      auto *var = scope.FindVar(inp_var_names[i]);
      PADDLE_ENFORCE(var != nullptr,
                     "Cannot find variable %s for save_combine_op",
                     inp_var_names[i]);
      PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
                     "SaveCombineOp only supports LoDTensor, %s has wrong type",
                     inp_var_names[i]);
      auto &tensor = var->Get<framework::LoDTensor>();
      // Tensors are appended back to back, in input order.
      framework::SerializeToStream(fout, tensor, dev_ctx);
    }
    fout.close();
  }
};
// Declares the interface of the save_combine operator: the duplicable input
// slot "X" plus the "overwrite" and "file_path" attributes.
class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
  SaveCombineOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput(
        "X",
        "(vector) Input LoDTensors that need to be saved together in a file.")
        .AsDuplicable();
    AddComment(R"DOC(
SaveCombine operator
This operator will serialize and write a list of input LoDTensor variables
to a file on disk.
)DOC");
    // Adjacent literals are concatenated by the compiler, so the type tag
    // needs its own trailing space to stay separated from the description.
    AddAttr<bool>("overwrite",
                  "(boolean, default true) "
                  "Overwrite the output file if it exists.")
        .SetDefault(true);
    AddAttr<std::string>(
        "file_path",
        "(string) "
        "The \"file_path\" where the LoDTensor variables will be saved.")
        // Reject an empty path when the attribute is checked.
        .AddCustomChecker(
            [](const std::string &path) { return !path.empty(); });
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
// Register the operator type "save_combine", pairing the SaveCombineOp
// implementation with SaveCombineOpProtoMaker for its inputs and attributes.
REGISTER_OPERATOR(save_combine, ops::SaveCombineOp,
ops::SaveCombineOpProtoMaker);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include <string>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/framework/op_registry.h"
// Reference the save_combine and load_combine operators that the tests below
// create through OpRegistry::CreateOp. Presumably this forces their
// registration to be linked into the test binary -- confirm against the
// USE_NO_KERNEL_OP definition in op_registry.h.
USE_NO_KERNEL_OP(save_combine);
USE_NO_KERNEL_OP(load_combine);
// Creates variable `var_name` in `scope` as an x-by-y int LoDTensor allocated
// on `place`, where element i of the data buffer holds the value i.
// `expect_lod` is resized to one level, the entries of `lod_info` are
// appended to that level, and the result is set as the tensor's LoD.
// Returns a pointer to the tensor's data buffer.
int* CreateForSaveCombineOp(int x, int y, const std::vector<int>& lod_info,
                            std::string var_name,
                            paddle::platform::CPUPlace& place,
                            paddle::framework::Scope& scope,
                            paddle::framework::LoD& expect_lod) {
  auto* lod_tensor =
      scope.Var(var_name)->GetMutable<paddle::framework::LoDTensor>();
  lod_tensor->Resize({x, y});

  expect_lod.resize(1);
  for (int offset : lod_info) {
    expect_lod[0].push_back(offset);
  }
  lod_tensor->set_lod(expect_lod);

  int* data = lod_tensor->mutable_data<int>(place);
  for (int64_t idx = 0; idx < lod_tensor->numel(); ++idx) {
    data[idx] = static_cast<int>(idx);
  }
  return data;
}
// Creates (or fetches) variable `out_var_name` in `scope` and returns its
// LoDTensor, to be used as the destination of a later load_combine run.
paddle::framework::LoDTensor* GeneratePlaceholderBeforeLoad(
    const std::string out_var_name, paddle::framework::Scope& scope) {
  return scope.Var(out_var_name)->GetMutable<paddle::framework::LoDTensor>();
}
// Reads back a loaded tensor: copies the LoD of `target` into `actual_lod`
// and returns a pointer to its int data. `scope` is not used by the body.
int* GetValuesAfterLoadCombineOp(paddle::framework::LoDTensor* target,
                                 paddle::framework::Scope& scope,
                                 paddle::framework::LoD& actual_lod) {
  // Fetch the data pointer first, then the LoD, as two separate steps.
  int* loaded_values = target->data<int>();
  actual_lod = target->lod();
  return loaded_values;
}
void CheckValues(int* expect, int* actual, paddle::framework::LoD expect_lod,
paddle::framework::LoD actual_lod, const int& numel) {
for (int64_t i = 0; i < numel; ++i) {
EXPECT_EQ(expect[i], actual[i]);
}
EXPECT_EQ(expect_lod.size(), actual_lod.size());
for (size_t i = 0; i < expect_lod.size(); ++i) {
for (size_t j = 0; j < expect_lod[i].size(); ++j) {
EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]);
}
}
}
// Round trip through one file: four LoDTensors are written with a single
// save_combine run, read back in the same order with a single load_combine
// run, and each loaded tensor is compared (values and LoD) with its source.
TEST(SaveLoadCombineOp, CPU) {
  paddle::framework::Scope scope;
  paddle::platform::CPUPlace place;

  // Source tensors. Every numelN equals rows * cols of the matching tensor.
  const std::vector<int> first_lod{0, 1, 2, 3, 10};
  const int numel1 = 100;
  paddle::framework::LoD expect_lod1;
  int* expect1 = CreateForSaveCombineOp(10, 10, first_lod, "test_var1", place,
                                        scope, expect_lod1);

  const std::vector<int> second_lod{0, 2, 5, 10};
  const int numel2 = 200;
  paddle::framework::LoD expect_lod2;
  int* expect2 = CreateForSaveCombineOp(10, 20, second_lod, "test_var2", place,
                                        scope, expect_lod2);

  const std::vector<int> third_lod{0, 2, 3, 20};
  const int numel3 = 4000;
  paddle::framework::LoD expect_lod3;
  int* expect3 = CreateForSaveCombineOp(20, 200, third_lod, "test_var3", place,
                                        scope, expect_lod3);

  const std::vector<int> fourth_lod{0, 1, 20};
  const int numel4 = 1000;
  paddle::framework::LoD expect_lod4;
  int* expect4 = CreateForSaveCombineOp(20, 50, fourth_lod, "test_var4", place,
                                        scope, expect_lod4);

  // Both operators share one attribute map, so they use the same file.
  paddle::framework::AttributeMap attrs;
  attrs.insert({"file_path", std::string("check_tensor.ls")});

  // Save all four tensors into the file.
  auto saver = paddle::framework::OpRegistry::CreateOp(
      "save_combine",
      {{"X", {"test_var1", "test_var2", "test_var3", "test_var4"}}}, {}, attrs);
  saver->Run(scope, place);

  // Destination variables must exist before load_combine runs.
  auto* target1 = GeneratePlaceholderBeforeLoad("out_var1", scope);
  auto* target2 = GeneratePlaceholderBeforeLoad("out_var2", scope);
  auto* target3 = GeneratePlaceholderBeforeLoad("out_var3", scope);
  auto* target4 = GeneratePlaceholderBeforeLoad("out_var4", scope);

  // Load the file back into the four destination variables.
  auto loader = paddle::framework::OpRegistry::CreateOp(
      "load_combine", {},
      {{"Out", {"out_var1", "out_var2", "out_var3", "out_var4"}}}, attrs);
  loader->Run(scope, place);

  paddle::framework::LoD actual_lod1;
  paddle::framework::LoD actual_lod2;
  paddle::framework::LoD actual_lod3;
  paddle::framework::LoD actual_lod4;
  int* actual1 = GetValuesAfterLoadCombineOp(target1, scope, actual_lod1);
  int* actual2 = GetValuesAfterLoadCombineOp(target2, scope, actual_lod2);
  int* actual3 = GetValuesAfterLoadCombineOp(target3, scope, actual_lod3);
  int* actual4 = GetValuesAfterLoadCombineOp(target4, scope, actual_lod4);

  CheckValues(expect1, actual1, expect_lod1, actual_lod1, numel1);
  CheckValues(expect2, actual2, expect_lod2, actual_lod2, numel2);
  CheckValues(expect3, actual3, expect_lod3, actual_lod3, numel3);
  CheckValues(expect4, actual4, expect_lod4, actual_lod4, numel4);
}
// Single-tensor round trip: one 3x10 int LoDTensor is saved with
// save_combine and read back with load_combine, then values and LoD are
// compared.
TEST(SaveLoadTestWithCombineOp, CPU) {
  paddle::framework::Scope scope;
  paddle::platform::CPUPlace place;

  auto var = scope.Var("test_var");
  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
  tensor->Resize({3, 10});

  // One LoD level with offsets 0, 1, 2, 3.
  paddle::framework::LoD expect_lod;
  expect_lod.resize(1);
  expect_lod[0].push_back(0);
  expect_lod[0].push_back(1);
  expect_lod[0].push_back(2);
  expect_lod[0].push_back(3);
  tensor->set_lod(expect_lod);

  // Element i of the source buffer holds the value i.
  int* expect = tensor->mutable_data<int>(place);
  for (int64_t i = 0; i < tensor->numel(); ++i) {
    expect[i] = static_cast<int>(i);
  }

  // Save and load share the attribute map, hence the same file.
  paddle::framework::AttributeMap attrs;
  attrs.insert({"file_path", std::string("check_t.save")});

  auto save_op = paddle::framework::OpRegistry::CreateOp(
      "save_combine", {{"X", {"test_var"}}}, {}, attrs);
  save_op->Run(scope, place);

  auto load_var = scope.Var("out_var");
  auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
  auto load_op = paddle::framework::OpRegistry::CreateOp(
      "load_combine", {}, {{"Out", {"out_var"}}}, attrs);
  load_op->Run(scope, place);

  int* actual = target->data<int>();
  for (int64_t i = 0; i < tensor->numel(); ++i) {
    EXPECT_EQ(expect[i], actual[i]);
  }

  auto& actual_lod = target->lod();
  // Size checks are fatal so that a mismatch stops the test before the loops
  // below index past the end of `actual_lod`.
  ASSERT_EQ(expect_lod.size(), actual_lod.size());
  for (size_t i = 0; i < expect_lod.size(); ++i) {
    ASSERT_EQ(expect_lod[i].size(), actual_lod[i].size());
    for (size_t j = 0; j < expect_lod[i].size(); ++j) {
      EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]);
    }
  }
}
...@@ -24,7 +24,7 @@ TEST(SaveLoadOp, CPU) { ...@@ -24,7 +24,7 @@ TEST(SaveLoadOp, CPU) {
auto var = scope.Var("test_var"); auto var = scope.Var("test_var");
auto tensor = var->GetMutable<paddle::framework::LoDTensor>(); auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
tensor->Resize({10, 10}); tensor->Resize({3, 10});
paddle::framework::LoD expect_lod; paddle::framework::LoD expect_lod;
expect_lod.resize(1); expect_lod.resize(1);
expect_lod[0].push_back(0); expect_lod[0].push_back(0);
......
...@@ -37,25 +37,37 @@ class SendOp : public framework::OperatorBase { ...@@ -37,25 +37,37 @@ class SendOp : public framework::OperatorBase {
auto ins = Inputs("X"); auto ins = Inputs("X");
auto outs = Outputs("Out"); auto outs = Outputs("Out");
std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap"); std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
std::vector<std::string> endpoints =
Attr<std::vector<std::string>>("endpoints");
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place); auto& ctx = *pool.Get(place);
auto client_var_name = Output("RPCClient");
PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
"Can not find variable '%s' in the scope.",
client_var_name);
auto* client_var = scope.FindVar(client_var_name);
detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
for (size_t i = 0; i < ins.size(); i++) { for (size_t i = 0; i < ins.size(); i++) {
VLOG(3) << "sending " << ins[i]; VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
client_.AsyncSendVariable(epmap[i], ctx, scope, ins[i]); rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
} }
PADDLE_ENFORCE(client_.Wait()); PADDLE_ENFORCE(rpc_client->Wait());
for (size_t i = 0; i < outs.size(); i++) { for (auto& ep : endpoints) {
VLOG(3) << "getting " << outs[i]; VLOG(3) << "batch barrier, ep: " << ep;
client_.AsyncGetVariable(epmap[i], ctx, scope, outs[i]); rpc_client->AsyncSendBatchBarrier(ep);
} }
PADDLE_ENFORCE(rpc_client->Wait());
PADDLE_ENFORCE(client_.Wait()); for (size_t i = 0; i < outs.size(); i++) {
VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
}
PADDLE_ENFORCE(rpc_client->Wait());
} }
private:
mutable detail::RPCClient client_;
}; };
class SendOpMaker : public framework::OpProtoAndCheckerMaker { class SendOpMaker : public framework::OpProtoAndCheckerMaker {
...@@ -65,6 +77,9 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -65,6 +77,9 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X", "(Tensor) Input tensor to be sent").AsDuplicable(); AddInput("X", "(Tensor) Input tensor to be sent").AsDuplicable();
AddOutput("Out", "(Tensor) Output tensor to be received from server") AddOutput("Out", "(Tensor) Output tensor to be received from server")
.AsDuplicable(); .AsDuplicable();
AddOutput("RPCClient",
"(RPCClient) The RPC client object which is"
"initialized at most once.");
AddComment(R"DOC( AddComment(R"DOC(
Send operator Send operator
......
...@@ -96,9 +96,8 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> { ...@@ -96,9 +96,8 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> {
GetOutLod<<<(lod_len - 1) / PADDLE_CUDA_NUM_THREADS + 1, GetOutLod<<<(lod_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>( PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
num_erased_ptr, dev_in_lod_ptr, lod_len, dev_out_lod_ptr); num_erased_ptr, dev_in_lod_ptr, lod_len, dev_out_lod_ptr);
// Set LoD for output // Set LoD for output
thrust::host_vector<size_t> out_lod0 = dev_out_lod; std::vector<size_t> out_lod0(dev_out_lod.begin(), dev_out_lod.end());
framework::LoD out_lod; framework::LoD out_lod;
out_lod.push_back(out_lod0); out_lod.push_back(out_lod0);
out->set_lod(out_lod); out->set_lod(out_lod);
......
...@@ -32,6 +32,7 @@ class SequenceExpandKernel : public framework::OpKernel<T> { ...@@ -32,6 +32,7 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
const T* x_data = x->data<T>(); const T* x_data = x->data<T>();
auto x_dims = x->dims(); auto x_dims = x->dims();
auto* y = context.Input<LoDTensor>("Y"); auto* y = context.Input<LoDTensor>("Y");
PADDLE_ENFORCE(!y->lod().empty(), "y should have lod");
PADDLE_ENFORCE_EQ(static_cast<size_t>(x_dims[0]), PADDLE_ENFORCE_EQ(static_cast<size_t>(x_dims[0]),
y->lod().back().size() - 1, y->lod().back().size() - 1,
"The size of last lod level in Input(Y)" "The size of last lod level in Input(Y)"
......
...@@ -30,8 +30,13 @@ class SequenceReshapeOp : public framework::OperatorWithKernel { ...@@ -30,8 +30,13 @@ class SequenceReshapeOp : public framework::OperatorWithKernel {
auto x_numel = product(x_dims); auto x_numel = product(x_dims);
PADDLE_ENFORCE_EQ(x_dims.size(), 2U, "Rank of Input(X) should be 2."); PADDLE_ENFORCE_EQ(x_dims.size(), 2U, "Rank of Input(X) should be 2.");
int new_dim = ctx->Attrs().Get<int>("new_dim"); int new_dim = ctx->Attrs().Get<int>("new_dim");
if (ctx->IsRuntime()) {
ctx->SetOutputDim("Out", ctx->SetOutputDim("Out",
{x_numel / new_dim, static_cast<int64_t>(new_dim)}); {x_numel / new_dim, static_cast<int64_t>(new_dim)});
} else {
// when compiling, the batch size is undetermined, just set to -1
ctx->SetOutputDim("Out", {-1, static_cast<int64_t>(new_dim)});
}
} }
}; };
......
...@@ -35,7 +35,7 @@ class SequenceReshapeKernel : public framework::OpKernel<T> { ...@@ -35,7 +35,7 @@ class SequenceReshapeKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ(in_lod.size(), 1UL, PADDLE_ENFORCE_EQ(in_lod.size(), 1UL,
"Only support one level sequence now."); "Only support one level sequence now.");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
in_dims[0], in_lod[0].back(), (uint64_t)in_dims[0], in_lod[0].back(),
"Inconsistent size between X.shape[0] and X.lod()[0].back()."); "Inconsistent size between X.shape[0] and X.lod()[0].back().");
auto in_lod_l0 = in_lod[0]; auto in_lod_l0 = in_lod[0];
......
...@@ -89,7 +89,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> { ...@@ -89,7 +89,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ(in_height, out_dims[0]); PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
auto& in_value = grad->value(); auto& in_value = grad->value();
auto& in_rows = grad->rows(); framework::Vector<int64_t> in_rows(grad->rows());
int64_t in_row_numel = in_value.numel() / in_rows.size(); int64_t in_row_numel = in_value.numel() / in_rows.size();
PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height); PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height);
...@@ -102,7 +102,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> { ...@@ -102,7 +102,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
dim3 grid(1, in_rows.size()); dim3 grid(1, in_rows.size());
SparseSGDFunctorKernel< SparseSGDFunctorKernel<
T, 256><<<grid, threads, 0, ctx.cuda_device_context().stream()>>>( T, 256><<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
in_data, in_rows.data(), learning_rate->data<T>(), out_data, in_data, in_rows.cuda_data(), learning_rate->data<T>(), out_data,
in_row_numel); in_row_numel);
} else { } else {
......
...@@ -68,7 +68,32 @@ class SumKernel : public framework::OpKernel<T> { ...@@ -68,7 +68,32 @@ class SumKernel : public framework::OpKernel<T> {
} }
} }
} else if (out_var->IsType<framework::SelectedRows>()) { } else if (out_var->IsType<framework::SelectedRows>()) {
PADDLE_ENFORCE(!in_place, "SelectedRows not support inplace sum now"); std::unique_ptr<framework::SelectedRows> in0;
if (in_place) {
// If is in_place, we store the input[0] to in0
auto &in_sel0 = in_vars[0]->Get<SelectedRows>();
auto &rows = in_sel0.rows();
#ifdef PADDLE_WITH_CUDA
std::vector<int64_t> rows_in_cpu;
rows_in_cpu.reserve(rows.size());
for (auto item : rows) {
rows_in_cpu.push_back(item);
}
in0.reset(new framework::SelectedRows(rows_in_cpu, in_sel0.height()));
#else
in0.reset(new framework::SelectedRows(rows, in_sel0.height()));
#endif
in0->mutable_value()->ShareDataWith(in_sel0.value());
}
auto get_selected_row = [&](size_t i) -> const SelectedRows & {
if (i == 0 && in0) {
return *in0.get();
} else {
return in_vars[i]->Get<SelectedRows>();
}
};
auto *out = context.Output<SelectedRows>("Out"); auto *out = context.Output<SelectedRows>("Out");
out->mutable_rows()->clear(); out->mutable_rows()->clear();
auto *out_value = out->mutable_value(); auto *out_value = out->mutable_value();
...@@ -76,24 +101,26 @@ class SumKernel : public framework::OpKernel<T> { ...@@ -76,24 +101,26 @@ class SumKernel : public framework::OpKernel<T> {
// Runtime InferShape // Runtime InferShape
size_t first_dim = 0; size_t first_dim = 0;
for (int i = 0; i < N; i++) { for (int i = 0; i < N; i++) {
first_dim += in_vars[i]->Get<SelectedRows>().rows().size(); auto &sel_row = get_selected_row(i);
first_dim += sel_row.rows().size();
} }
auto in_dim = in_vars[0]->Get<SelectedRows>().value().dims(); auto in_dim =
auto in_dim_vec = framework::vectorize(in_dim); framework::vectorize(get_selected_row(N - 1).value().dims());
in_dim_vec[0] = static_cast<int64_t>(first_dim); in_dim[0] = static_cast<int64_t>(first_dim);
out_value->Resize(framework::make_ddim(in_dim_vec)); out_value->Resize(framework::make_ddim(in_dim));
out_value->mutable_data<T>(context.GetPlace()); out_value->mutable_data<T>(context.GetPlace());
math::SelectedRowsAddTo<DeviceContext, T> functor; math::SelectedRowsAddTo<DeviceContext, T> functor;
int64_t offset = 0; int64_t offset = 0;
for (int i = 0; i < N; i++) { for (int i = 0; i < N; i++) {
PADDLE_ENFORCE_EQ(out->height(), auto &sel_row = get_selected_row(i);
in_vars[i]->Get<SelectedRows>().height());
functor(context.template device_context<DeviceContext>(), PADDLE_ENFORCE_EQ(out->height(), sel_row.height());
in_vars[i]->Get<SelectedRows>(), offset, out); functor(context.template device_context<DeviceContext>(), sel_row,
offset += in_vars[i]->Get<SelectedRows>().value().numel(); offset, out);
offset += sel_row.value().numel();
} }
} else if (out_var->IsType<framework::LoDTensorArray>()) { } else if (out_var->IsType<framework::LoDTensorArray>()) {
auto &out_array = *out_var->GetMutable<framework::LoDTensorArray>(); auto &out_array = *out_var->GetMutable<framework::LoDTensorArray>();
......
...@@ -22,6 +22,7 @@ namespace paddle { ...@@ -22,6 +22,7 @@ namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
template <typename T, int MajorType = Eigen::RowMajor, template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex> typename IndexType = Eigen::DenseIndex>
...@@ -33,9 +34,9 @@ class TopkKernel : public framework::OpKernel<T> { ...@@ -33,9 +34,9 @@ class TopkKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
// Get the top k elements of each row of input tensor // Get the top k elements of each row of input tensor
// FIXME: only deal with matrix(2d tensor). // FIXME: only deal with matrix(2d tensor).
auto* input = ctx.Input<Tensor>("X"); auto* input = ctx.Input<LoDTensor>("X");
auto* output = ctx.Output<Tensor>("Out"); auto* output = ctx.Output<LoDTensor>("Out");
auto* indices = ctx.Output<Tensor>("Indices"); auto* indices = ctx.Output<LoDTensor>("Indices");
// k is determined by Attr // k is determined by Attr
const size_t k = static_cast<int>(ctx.Attr<int>("k")); const size_t k = static_cast<int>(ctx.Attr<int>("k"));
......
...@@ -10,7 +10,7 @@ cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) ...@@ -10,7 +10,7 @@ cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce) nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce)
cc_library(place SRCS place.cc DEPS enforce) cc_library(place SRCS place.cc DEPS enforce boost)
cc_test(place_test SRCS place_test.cc DEPS place glog gflags) cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
add_subdirectory(dynload) add_subdirectory(dynload)
......
...@@ -29,20 +29,25 @@ namespace platform { ...@@ -29,20 +29,25 @@ namespace platform {
*/ */
template <typename Callable, typename... Args> template <typename Callable, typename... Args>
inline void call_once(std::once_flag& flag, Callable&& f, Args&&... args) { inline void call_once(std::once_flag& flag, Callable&& f, Args&&... args) {
bool good = false; bool good = true;
std::exception ex; std::exception ex;
try {
std::call_once(flag, std::call_once(flag,
[&](Args&&... args) { [&](Args&&... args) {
try { try {
f(args...); f(args...);
good = true;
} catch (const std::exception& e) { } catch (const std::exception& e) {
ex = e; ex = e;
good = false;
} catch (...) { } catch (...) {
ex = std::runtime_error("excption caught in call_once"); ex = std::runtime_error("excption caught in call_once");
good = false;
} }
}, },
args...); args...);
} catch (std::system_error& x) {
throw std::runtime_error("call once failed");
}
if (!good) { if (!good) {
throw std::exception(ex); throw std::exception(ex);
} }
......
...@@ -47,16 +47,16 @@ inline uint64_t GetTimeInNsec() { ...@@ -47,16 +47,16 @@ inline uint64_t GetTimeInNsec() {
} }
Event::Event(EventKind kind, std::string name, uint32_t thread_id, Event::Event(EventKind kind, std::string name, uint32_t thread_id,
DeviceContext* dev_ctx) const DeviceContext* dev_ctx)
: kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) { : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
has_cuda_ = dev_ctx ? platform::is_gpu_place(dev_ctx->GetPlace()) : false;
if (has_cuda_) {
auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx); auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
if (cuda_dev_ctx) {
PADDLE_ENFORCE(cudaGetDevice(&device_)); PADDLE_ENFORCE(cudaGetDevice(&device_));
PADDLE_ENFORCE(cudaEventCreate(&event_)); PADDLE_ENFORCE(cudaEventCreate(&event_));
auto stream = cuda_dev_ctx->stream(); auto stream = cuda_dev_ctx->stream();
PADDLE_ENFORCE(cudaEventRecord(event_, stream)); PADDLE_ENFORCE(cudaEventRecord(event_, stream));
has_cuda_ = true;
} }
#endif #endif
cpu_ns_ = GetTimeInNsec(); cpu_ns_ = GetTimeInNsec();
...@@ -114,19 +114,20 @@ inline EventList& GetEventList() { ...@@ -114,19 +114,20 @@ inline EventList& GetEventList() {
return *g_event_list; return *g_event_list;
} }
void Mark(const std::string& name, DeviceContext* dev_ctx) { void Mark(const std::string& name, const DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx); GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx);
} }
void PushEvent(const std::string& name, DeviceContext* dev_ctx) { void PushEvent(const std::string& name, const DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx); GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx);
} }
void PopEvent(const std::string& name, DeviceContext* dev_ctx) { void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx); GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx);
} }
RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx) { RecordEvent::RecordEvent(const std::string& name,
const DeviceContext* dev_ctx) {
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
dev_ctx_ = dev_ctx; dev_ctx_ = dev_ctx;
name_ = name; name_ = name;
...@@ -155,6 +156,7 @@ void EnableProfiler(ProfilerState state) { ...@@ -155,6 +156,7 @@ void EnableProfiler(ProfilerState state) {
DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d)); DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
Mark("_cuda_startup_", dev_ctx); Mark("_cuda_startup_", dev_ctx);
dev_ctx->Wait(); dev_ctx->Wait();
delete dev_ctx;
}); });
} }
} }
...@@ -163,14 +165,17 @@ void EnableProfiler(ProfilerState state) { ...@@ -163,14 +165,17 @@ void EnableProfiler(ProfilerState state) {
Mark("_start_profiler_", nullptr); Mark("_start_profiler_", nullptr);
} }
std::vector<std::vector<Event>> DisableProfiler() { void ResetProfiler() {
PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
"Can't disable profiling, since it's not starting.");
// Mark the profiling stop.
Mark("_stop_profiler_", nullptr);
g_state = ProfilerState::kDisabled;
std::vector<std::vector<Event>> result;
std::lock_guard<std::mutex> guard(g_all_event_lists_mutex); std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
++it) {
(*it)->Clear();
}
}
std::vector<std::vector<Event>> GetAllEvents() {
std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
std::vector<std::vector<Event>> result;
for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end(); for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
++it) { ++it) {
result.emplace_back((*it)->Reduce()); result.emplace_back((*it)->Reduce());
...@@ -178,6 +183,18 @@ std::vector<std::vector<Event>> DisableProfiler() { ...@@ -178,6 +183,18 @@ std::vector<std::vector<Event>> DisableProfiler() {
return result; return result;
} }
void DisableProfiler(EventSortingKey sorted_key) {
PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
"Can't disable profiling, since it's not starting.");
// Mark the profiling stop.
Mark("_stop_profiler_", nullptr);
g_state = ProfilerState::kDisabled;
std::vector<std::vector<Event>> all_events = GetAllEvents();
ParseEvents(all_events, sorted_key);
ResetProfiler();
}
void ParseEvents(std::vector<std::vector<Event>>& events, void ParseEvents(std::vector<std::vector<Event>>& events,
EventSortingKey sorted_by) { EventSortingKey sorted_by) {
if (g_profiler_place == "") return; if (g_profiler_place == "") return;
...@@ -291,10 +308,10 @@ void ParseEvents(std::vector<std::vector<Event>>& events, ...@@ -291,10 +308,10 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
} }
// Print report // Print report
PrintProfilingReport(events_table, sorted_domain, max_name_width + 4, 12); PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12);
} }
void PrintProfilingReport(std::vector<std::vector<EventItem>>& events_table, void PrintProfiler(std::vector<std::vector<EventItem>>& events_table,
std::string& sorted_domain, const size_t name_width, std::string& sorted_domain, const size_t name_width,
const size_t data_width) { const size_t data_width) {
// Output header information // Output header information
......
...@@ -29,7 +29,7 @@ class Event { ...@@ -29,7 +29,7 @@ class Event {
// The DeviceContext is used to get the cuda stream. // The DeviceContext is used to get the cuda stream.
// If CPU profiling mode, can pass nullptr. // If CPU profiling mode, can pass nullptr.
Event(EventKind kind, std::string name, uint32_t thread_id, Event(EventKind kind, std::string name, uint32_t thread_id,
DeviceContext* dev_ctx); const DeviceContext* dev_ctx);
std::string kind() const; std::string kind() const;
std::string name() const { return name_; } std::string name() const { return name_; }
...@@ -84,6 +84,8 @@ struct EventList { ...@@ -84,6 +84,8 @@ struct EventList {
return result; return result;
} }
void Clear() { event_blocks.clear(); }
std::forward_list<std::vector<Event>> event_blocks; std::forward_list<std::vector<Event>> event_blocks;
}; };
...@@ -93,29 +95,26 @@ enum ProfilerState { ...@@ -93,29 +95,26 @@ enum ProfilerState {
kCUDA, // GPU profiling state kCUDA, // GPU profiling state
}; };
void Mark(const std::string& name, DeviceContext* dev_ctx); void Mark(const std::string& name, const DeviceContext* dev_ctx);
void PushEvent(const std::string& name, DeviceContext* dev_ctx); void PushEvent(const std::string& name, const DeviceContext* dev_ctx);
void PopEvent(const std::string& name, DeviceContext* dev_ctx); void PopEvent(const std::string& name, const DeviceContext* dev_ctx);
struct RecordEvent { struct RecordEvent {
explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx); explicit RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
~RecordEvent(); ~RecordEvent();
// The device context is used by Event to get the current cuda stream. // The device context is used by Event to get the current cuda stream.
DeviceContext* dev_ctx_; const DeviceContext* dev_ctx_;
// Event name // Event name
std::string name_; std::string name_;
}; };
// Enable the profiling function.
void EnableProfiler(ProfilerState state);
// Return the event list of all threads. Asummed the returned value calls // Return the event list of all threads. Asummed the returned value calls
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread. // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
std::vector<std::vector<Event>> DisableProfiler(); std::vector<std::vector<Event>> GetAllEvents();
// The information of each event given in the profiling report // The information of each event given in the profiling report
struct EventItem { struct EventItem {
...@@ -130,13 +129,22 @@ struct EventItem { ...@@ -130,13 +129,22 @@ struct EventItem {
// Candidate keys to sort the profiling report // Candidate keys to sort the profiling report
enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve }; enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve };
// Enable the profiling function.
void EnableProfiler(ProfilerState state);
// Clear the g_all_event_lists, which is total event lists of all threads.
void ResetProfiler();
void DisableProfiler(EventSortingKey sorted_key);
// Parse the event list and output the profiling report // Parse the event list and output the profiling report
void ParseEvents(std::vector<std::vector<Event>>&, void ParseEvents(std::vector<std::vector<Event>>&,
EventSortingKey sorted_by = EventSortingKey::kDefault); EventSortingKey sorted_by = EventSortingKey::kDefault);
// Print results // Print results
void PrintProfilingReport(std::vector<std::vector<EventItem>>& events_table, void PrintProfiler(std::vector<std::vector<EventItem>>& events_table,
std::string& sorted_domain, const size_t name_width, std::string& sorted_domain, const size_t name_width,
const size_t data_width); const size_t data_width);
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -103,18 +103,14 @@ TEST(RecordEvent, RecordEvent) { ...@@ -103,18 +103,14 @@ TEST(RecordEvent, RecordEvent) {
// Bad Usage: // Bad Usage:
PushEvent("event_without_pop", dev_ctx); PushEvent("event_without_pop", dev_ctx);
PopEvent("event_without_push", dev_ctx); PopEvent("event_without_push", dev_ctx);
std::vector<std::vector<Event>> events = paddle::platform::DisableProfiler(); std::vector<std::vector<Event>> events = paddle::platform::GetAllEvents();
// Will remove parsing-related code from test later
ParseEvents(events, EventSortingKey::kTotal);
int cuda_startup_count = 0; int cuda_startup_count = 0;
int start_profiler_count = 0; int start_profiler_count = 0;
int stop_profiler_count = 0;
for (size_t i = 0; i < events.size(); ++i) { for (size_t i = 0; i < events.size(); ++i) {
for (size_t j = 0; j < events[i].size(); ++j) { for (size_t j = 0; j < events[i].size(); ++j) {
if (events[i][j].name() == "_cuda_startup_") ++cuda_startup_count; if (events[i][j].name() == "_cuda_startup_") ++cuda_startup_count;
if (events[i][j].name() == "_start_profiler_") ++start_profiler_count; if (events[i][j].name() == "_start_profiler_") ++start_profiler_count;
if (events[i][j].name() == "_stop_profiler_") ++stop_profiler_count;
if (events[i][j].name() == "push") { if (events[i][j].name() == "push") {
EXPECT_EQ(events[i][j + 1].name(), "pop"); EXPECT_EQ(events[i][j + 1].name(), "pop");
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -127,5 +123,7 @@ TEST(RecordEvent, RecordEvent) { ...@@ -127,5 +123,7 @@ TEST(RecordEvent, RecordEvent) {
} }
EXPECT_EQ(cuda_startup_count % 5, 0); EXPECT_EQ(cuda_startup_count % 5, 0);
EXPECT_EQ(start_profiler_count, 1); EXPECT_EQ(start_profiler_count, 1);
EXPECT_EQ(stop_profiler_count, 1);
// Will remove parsing-related code from test later
DisableProfiler(EventSortingKey::kTotal);
} }
if(WITH_PYTHON) if(WITH_PYTHON)
cc_library(paddle_pybind SHARED cc_library(paddle_pybind SHARED
SRCS pybind.cc exception.cc protobuf.cc const_value.cc SRCS pybind.cc exception.cc protobuf.cc const_value.cc
DEPS pybind python backward proto_desc paddle_memory executor prune init DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method
${GLOB_OP_LIB}) ${GLOB_OP_LIB})
if(NOT APPLE AND NOT ANDROID) if(NOT APPLE AND NOT ANDROID)
target_link_libraries(paddle_pybind rt) target_link_libraries(paddle_pybind rt)
......
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include <Python.h> #include <Python.h>
#include <fstream> #include <fstream>
#include <vector> #include <vector>
#include "paddle/platform/variant.h"
#include "pybind11/numpy.h" #include "pybind11/numpy.h"
#include "pybind11/pybind11.h" #include "pybind11/pybind11.h"
#include "pybind11/stl.h" #include "pybind11/stl.h"
......
...@@ -30,6 +30,7 @@ limitations under the License. */ ...@@ -30,6 +30,7 @@ limitations under the License. */
#include "paddle/operators/net_op.h" #include "paddle/operators/net_op.h"
#include "paddle/platform/enforce.h" #include "paddle/platform/enforce.h"
#include "paddle/platform/place.h" #include "paddle/platform/place.h"
#include "paddle/platform/profiler.h"
#include "paddle/pybind/const_value.h" #include "paddle/pybind/const_value.h"
#include "paddle/pybind/exception.h" #include "paddle/pybind/exception.h"
#include "paddle/pybind/pybind.h" #include "paddle/pybind/pybind.h"
...@@ -52,7 +53,7 @@ static size_t UniqueIntegerGenerator(const std::string &prefix) { ...@@ -52,7 +53,7 @@ static size_t UniqueIntegerGenerator(const std::string &prefix) {
return generators[prefix].fetch_add(1); return generators[prefix].fetch_add(1);
} }
bool IsCompileGPU() { bool IsCompiledWithCUDA() {
#ifndef PADDLE_WITH_CUDA #ifndef PADDLE_WITH_CUDA
return false; return false;
#else #else
...@@ -123,44 +124,25 @@ PYBIND11_PLUGIN(core) { ...@@ -123,44 +124,25 @@ PYBIND11_PLUGIN(core) {
.def( .def(
"__init__", "__init__",
[](LoDTensor &instance, const std::vector<std::vector<size_t>> &lod) { [](LoDTensor &instance, const std::vector<std::vector<size_t>> &lod) {
#ifndef PADDLE_WITH_CUDA
new (&instance) LoDTensor(lod);
#else
LoD new_lod; LoD new_lod;
new_lod.reserve(lod.size()); new_lod.reserve(lod.size());
std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
new (&instance) LoDTensor(new_lod); new (&instance) LoDTensor(new_lod);
#endif
}) })
.def("__init__", [](LoDTensor &instance) { new (&instance) LoDTensor(); }) .def("__init__", [](LoDTensor &instance) { new (&instance) LoDTensor(); })
.def("set_lod", .def("set_lod",
[](LoDTensor &self, const std::vector<std::vector<size_t>> &lod) { [](LoDTensor &self, const std::vector<std::vector<size_t>> &lod) {
#ifndef PADDLE_WITH_CUDA
self.set_lod(lod);
#else
LoD new_lod; LoD new_lod;
new_lod.reserve(lod.size()); new_lod.reserve(lod.size());
std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
self.set_lod(new_lod); self.set_lod(new_lod);
#endif
}) })
.def("lod", [](LoDTensor &self) -> std::vector<std::vector<size_t>> { .def("lod", [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
#ifndef PADDLE_WITH_CUDA
return self.lod();
#else
auto lod = self.lod(); auto lod = self.lod();
std::vector<std::vector<size_t>> new_lod; std::vector<std::vector<size_t>> new_lod;
new_lod.reserve(lod.size()); new_lod.reserve(lod.size());
std::transform(lod.begin(), lod.end(), std::back_inserter(new_lod), std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
[](Vector<size_t> item) ->
std::vector<size_t> {
std::vector<size_t> v;
v.reserve(item.size());
std::copy(item.begin(), item.end(), std::back_inserter(v));
return v;
});
return new_lod; return new_lod;
#endif
}); });
py::class_<SelectedRows>(m, "SelectedRows") py::class_<SelectedRows>(m, "SelectedRows")
...@@ -423,14 +405,16 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -423,14 +405,16 @@ All parameter, weight, gradient are variables in Paddle.
py::class_<framework::Executor>(m, "Executor") py::class_<framework::Executor>(m, "Executor")
.def(py::init<const platform::Place &>()) .def(py::init<const platform::Place &>())
.def("run", &Executor::Run); .def("run",
(void (Executor::*)(const ProgramDesc &, Scope *, int, bool, bool)) &
Executor::Run);
m.def("unique_integer", UniqueIntegerGenerator); m.def("unique_integer", UniqueIntegerGenerator);
m.def("init_gflags", framework::InitGflags); m.def("init_gflags", framework::InitGflags);
m.def("init_glog", framework::InitGLOG); m.def("init_glog", framework::InitGLOG);
m.def("init_devices", &framework::InitDevices); m.def("init_devices", &framework::InitDevices);
m.def("is_compile_gpu", IsCompileGPU); m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
m.def("set_feed_variable", framework::SetFeedVariable); m.def("set_feed_variable", framework::SetFeedVariable);
m.def("get_fetch_variable", framework::GetFetchVariable); m.def("get_fetch_variable", framework::GetFetchVariable);
...@@ -476,6 +460,24 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -476,6 +460,24 @@ All parameter, weight, gradient are variables in Paddle.
m.def("nvprof_stop", platform::CudaProfilerStop); m.def("nvprof_stop", platform::CudaProfilerStop);
#endif #endif
py::enum_<platform::ProfilerState>(m, "ProfilerState", py::arithmetic())
.value("kDisabled", platform::ProfilerState::kDisabled)
.value("kCPU", platform::ProfilerState::kCPU)
.value("kCUDA", platform::ProfilerState::kCUDA)
.export_values();
py::enum_<platform::EventSortingKey>(m, "EventSortingKey", py::arithmetic())
.value("kDefault", platform::EventSortingKey::kDefault)
.value("kCalls", platform::EventSortingKey::kCalls)
.value("kTotal", platform::EventSortingKey::kTotal)
.value("kMin", platform::EventSortingKey::kMin)
.value("kMax", platform::EventSortingKey::kMax)
.value("kAve", platform::EventSortingKey::kAve)
.export_values();
m.def("enable_profiler", platform::EnableProfiler);
m.def("disable_profiler", platform::DisableProfiler);
m.def("reset_profiler", platform::ResetProfiler);
return m.ptr(); return m.ptr();
} }
} // namespace pybind } // namespace pybind
......
...@@ -56,7 +56,7 @@ Users can specify the following Docker build arguments with either "ON" or "OFF" ...@@ -56,7 +56,7 @@ Users can specify the following Docker build arguments with either "ON" or "OFF"
| ------ | -------- | ----------- | | ------ | -------- | ----------- |
| `WITH_GPU` | OFF | Generates NVIDIA CUDA GPU code and relies on CUDA libraries. | | `WITH_GPU` | OFF | Generates NVIDIA CUDA GPU code and relies on CUDA libraries. |
| `WITH_AVX` | OFF | Set to "ON" to enable AVX support. | | `WITH_AVX` | OFF | Set to "ON" to enable AVX support. |
| `WITH_TESTING` | ON | Build unit tests binaries. | | `WITH_TESTING` | OFF | Build unit tests binaries. |
| `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. | | `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. |
| `WITH_GOLANG` | ON | Build fault-tolerant parameter server written in go. | | `WITH_GOLANG` | ON | Build fault-tolerant parameter server written in go. |
| `WITH_SWIG_PY` | ON | Build with SWIG python API support. | | `WITH_SWIG_PY` | ON | Build with SWIG python API support. |
......
...@@ -32,7 +32,7 @@ function cmake_gen() { ...@@ -32,7 +32,7 @@ function cmake_gen() {
cat <<EOF cat <<EOF
======================================== ========================================
Configuring cmake in /paddle/build ... Configuring cmake in /paddle/build ...
-DCMAKE_BUILD_TYPE=Release -DCMAKE_BUILD_TYPE=${BUILD_TYPE:Release}
${PYTHON_FLAGS} ${PYTHON_FLAGS}
-DWITH_DOC=OFF -DWITH_DOC=OFF
-DWITH_GPU=${WITH_GPU:-OFF} -DWITH_GPU=${WITH_GPU:-OFF}
...@@ -54,7 +54,7 @@ EOF ...@@ -54,7 +54,7 @@ EOF
# docker environment is fully controlled by this script. # docker environment is fully controlled by this script.
# See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option. # See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option.
cmake .. \ cmake .. \
-DCMAKE_BUILD_TYPE=Release \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE:Release} \
${PYTHON_FLAGS} \ ${PYTHON_FLAGS} \
-DWITH_DOC=OFF \ -DWITH_DOC=OFF \
-DWITH_GPU=${WITH_GPU:-OFF} \ -DWITH_GPU=${WITH_GPU:-OFF} \
......
...@@ -22,7 +22,9 @@ limitations under the License. */ ...@@ -22,7 +22,9 @@ limitations under the License. */
int main(int argc, char** argv) { int main(int argc, char** argv) {
std::vector<char*> new_argv; std::vector<char*> new_argv;
std::string gflags_env; std::string gflags_env;
new_argv.push_back(argv[0]); for (int i = 0; i < argc; ++i) {
new_argv.push_back(argv[i]);
}
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
new_argv.push_back( new_argv.push_back(
strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory")); strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory"));
......
...@@ -140,8 +140,13 @@ def init_config_environment( ...@@ -140,8 +140,13 @@ def init_config_environment(
g_submodel_stack=[], g_submodel_stack=[],
g_add_submodel_suffix=False, ): g_add_submodel_suffix=False, ):
for k, v in locals().iteritems(): # directly iterate through locals().iteritems() will change
globals()[k] = copy.deepcopy(v) # the size of locals() due to introducing k, v into scope
# which will break the process in some env
local_vars = copy.deepcopy(locals())
for k, v in local_vars.iteritems():
globals()[k] = v
# Because type is widely used as a variable name in this code. # Because type is widely used as a variable name in this code.
......
...@@ -26,6 +26,7 @@ import initializer ...@@ -26,6 +26,7 @@ import initializer
import layers import layers
import nets import nets
import optimizer import optimizer
import learning_rate_decay
import backward import backward
import regularizer import regularizer
from param_attr import ParamAttr from param_attr import ParamAttr
...@@ -35,27 +36,16 @@ from distribute_transpiler import DistributeTranspiler ...@@ -35,27 +36,16 @@ from distribute_transpiler import DistributeTranspiler
from distribute_transpiler_simple import SimpleDistributeTranspiler from distribute_transpiler_simple import SimpleDistributeTranspiler
import clip import clip
from memory_optimization_transpiler import memory_optimize from memory_optimization_transpiler import memory_optimize
import profiler
Tensor = LoDTensor Tensor = LoDTensor
__all__ = framework.__all__ + executor.__all__ + [ __all__ = framework.__all__ + executor.__all__ + [
'io', 'io', 'initializer', 'layers', 'nets', 'optimizer', 'learning_rate_decay',
'initializer', 'backward', 'regularizer', 'LoDTensor', 'CPUPlace', 'CUDAPlace', 'Tensor',
'layers',
'nets',
'optimizer',
'backward',
'regularizer',
'LoDTensor',
'CPUPlace',
'CUDAPlace',
'Tensor',
'ParamAttr' 'ParamAttr'
'DataFeeder', 'DataFeeder', 'clip', 'SimpleDistributeTranspiler', 'DistributeTranspiler',
'clip', 'memory_optimize', 'profiler'
'SimpleDistributeTranspiler',
'DistributeTranspiler',
'memory_optimize',
] ]
...@@ -86,11 +76,9 @@ def __bootstrap__(): ...@@ -86,11 +76,9 @@ def __bootstrap__():
os.environ['OMP_NUM_THREADS'] = str(num_threads) os.environ['OMP_NUM_THREADS'] = str(num_threads)
read_env_flags = [ read_env_flags = ['use_pinned_memory', 'check_nan_inf', 'benchmark']
'use_pinned_memory', 'check_nan_inf', 'do_memory_benchmark' if core.is_compiled_with_cuda():
] read_env_flags += ['fraction_of_gpu_memory_to_use']
if core.is_compile_gpu():
read_env_flags += ['fraction_of_gpu_memory_to_use', 'op_sync']
core.init_gflags([sys.argv[0]] + core.init_gflags([sys.argv[0]] +
["--tryfromenv=" + ",".join(read_env_flags)]) ["--tryfromenv=" + ",".join(read_env_flags)])
core.init_glog(sys.argv[0]) core.init_glog(sys.argv[0])
......
...@@ -178,7 +178,7 @@ def _remove_no_grad_branch_(op_descs, no_grad_set): ...@@ -178,7 +178,7 @@ def _remove_no_grad_branch_(op_descs, no_grad_set):
if _all_in_set_( if _all_in_set_(
filter(lambda name: name.find(core.grad_var_suffix()) != -1, filter(lambda name: name.find(core.grad_var_suffix()) != -1,
op_desc.input_arg_names()), no_grad_set): op_desc.input_arg_names()), no_grad_set):
no_grad_set.union(out_arg_names) no_grad_set.update(out_arg_names)
return True return True
return False return False
......
...@@ -30,6 +30,9 @@ __all__ = [ ...@@ -30,6 +30,9 @@ __all__ = [
class BaseErrorClipAttr(object): class BaseErrorClipAttr(object):
def __str__(self):
raise NotImplementedError()
def append_clip_op(self, block, grad_name): def append_clip_op(self, block, grad_name):
raise NotImplementedError() raise NotImplementedError()
...@@ -44,6 +47,9 @@ class ErrorClipByValue(BaseErrorClipAttr): ...@@ -44,6 +47,9 @@ class ErrorClipByValue(BaseErrorClipAttr):
self.max = max self.max = max
self.min = min self.min = min
def __str__(self):
return "ByValue, min=%f, max=%f" % (self.min, self.max)
def append_clip_op(self, block, grad_name): def append_clip_op(self, block, grad_name):
clip_op_desc = block.desc.append_op() clip_op_desc = block.desc.append_op()
clip_op_desc.set_type("clip") clip_op_desc.set_type("clip")
...@@ -71,6 +77,9 @@ def error_clip_callback(block, context): ...@@ -71,6 +77,9 @@ def error_clip_callback(block, context):
class BaseGradientClipAttr(object): class BaseGradientClipAttr(object):
def __str__(self):
    """Placeholder that always raises NotImplementedError.

    A subclass has to supply its own ``__str__`` before its instances can
    be converted to a string.
    """
    raise NotImplementedError()
def process_context(self, context, param, grad): def process_context(self, context, param, grad):
raise NotImplementedError() raise NotImplementedError()
...@@ -79,6 +88,9 @@ class BaseGradientClipAttr(object): ...@@ -79,6 +88,9 @@ class BaseGradientClipAttr(object):
class NullGradientClipAttr(BaseGradientClipAttr): class NullGradientClipAttr(BaseGradientClipAttr):
def __str__(self):
    """Return the fixed label "Null" for this clip attribute."""
    return "Null"
def process_context(self, context, param, grad): def process_context(self, context, param, grad):
pass pass
...@@ -96,6 +108,9 @@ class GradientClipByValue(BaseGradientClipAttr): ...@@ -96,6 +108,9 @@ class GradientClipByValue(BaseGradientClipAttr):
self.max = max self.max = max
self.min = min self.min = min
def __str__(self):
    """Describe this clip setting as text.

    Returns(str): "ByValue, min=<min>, max=<max>", where both bounds are
        rendered with the ``%f`` conversion.
    """
    return "ByValue, min=%f, max=%f" % (self.min, self.max)
def process_context(self, context, param, grad): def process_context(self, context, param, grad):
pass pass
...@@ -108,6 +123,9 @@ class GradientClipByNorm(BaseGradientClipAttr): ...@@ -108,6 +123,9 @@ class GradientClipByNorm(BaseGradientClipAttr):
def __init__(self, clip_norm): def __init__(self, clip_norm):
self.clip_norm = clip_norm self.clip_norm = clip_norm
def __str__(self):
    """Describe this clip setting as text.

    Returns(str): "ByNorm, clip_norm=<clip_norm>", with the norm rendered
        by the ``%f`` conversion.
    """
    return "ByNorm, clip_norm=%f" % self.clip_norm
def process_context(self, context, param, grad): def process_context(self, context, param, grad):
pass pass
...@@ -124,6 +142,10 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): ...@@ -124,6 +142,10 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
self.clip_norm = clip_norm self.clip_norm = clip_norm
self.group_name = group_name self.group_name = group_name
def __str__(self):
    """Describe this clip setting as text.

    Returns(str): "ByGlobalNorm, group_name=<group_name>,
        clip_norm=<clip_norm>"; the group name is rendered with ``%s`` and
        the norm with ``%f``.
    """
    return "ByGlobalNorm, group_name=%s, clip_norm=%f" % (self.group_name,
                                                          self.clip_norm)
def process_context(self, context, param, grad): def process_context(self, context, param, grad):
if self.group_name not in context: if self.group_name not in context:
context[self.group_name] = [] context[self.group_name] = []
...@@ -160,6 +182,17 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): ...@@ -160,6 +182,17 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
def set_gradient_clip(clip, param_list=None, program=None): def set_gradient_clip(clip, param_list=None, program=None):
"""
To specify parameters that require gradient clip.
Args:
clip(BaseGradientClipAttr): An instance of some derived class of BaseGradientClipAttr,
which describes the type and detailed attributes of required gradient clip.
param_list(list, None by default): Parameters that require gradient clip.
It can be a list of parameter or a list of parameter's name.
When it's None, all parameters in the program will be included.
program(Program, None by default): The program where parameters are.
Will be the default main program when assigned with None.
"""
if not isinstance(clip, BaseGradientClipAttr): if not isinstance(clip, BaseGradientClipAttr):
raise TypeError( raise TypeError(
"'clip' should be an instance of BaseGradientClipAttr's derived class" "'clip' should be an instance of BaseGradientClipAttr's derived class"
...@@ -199,3 +232,5 @@ def append_gradient_clip_ops(param_grad): ...@@ -199,3 +232,5 @@ def append_gradient_clip_ops(param_grad):
ClipByValue = GradientClipByValue ClipByValue = GradientClipByValue
ClipByNorm = GradientClipByNorm
ClipByGlobalNorm = GradientClipByGlobalNorm
...@@ -33,6 +33,10 @@ class VarBlock: ...@@ -33,6 +33,10 @@ class VarBlock:
return "%s:%d:%d" % (self.varname, self.offset, self.size) return "%s:%d:%d" % (self.varname, self.offset, self.size)
def same_or_split_var(p_name, var_name):
    """Tell whether ``p_name`` names ``var_name`` itself or one of its splits.

    A split is recognised purely by name: ``p_name`` has to start with
    ``var_name`` immediately followed by the literal ``".block"``.

    Args:
        p_name(str): the candidate variable name.
        var_name(str): the name of the original variable.

    Returns(bool): True when the two names are equal or when ``p_name``
        carries the ``<var_name>.block`` prefix, False otherwise.
    """
    if p_name == var_name:
        return True
    split_prefix = var_name + ".block"
    return p_name.startswith(split_prefix)
def split_dense_variable(var_list, def split_dense_variable(var_list,
pserver_count, pserver_count,
min_block_size=1024, min_block_size=1024,
...@@ -149,11 +153,18 @@ class DistributeTranspiler: ...@@ -149,11 +153,18 @@ class DistributeTranspiler:
self.param_grad_ep_mapping[ep]["params"].append(param) self.param_grad_ep_mapping[ep]["params"].append(param)
self.param_grad_ep_mapping[ep]["grads"].append(grad) self.param_grad_ep_mapping[ep]["grads"].append(grad)
rpc_client_var = program.global_block().create_var(
name="RPC_CLIENT_VAR",
psersistable=True,
dtype='float32', # dtype and shape is not used in fact
shape=[0])
# create send_op # create send_op
send_op = program.global_block().append_op( send_op = program.global_block().append_op(
type="send", type="send",
inputs={"X": send_inputs}, inputs={"X": send_inputs},
outputs={"Out": send_outputs}, outputs={"Out": send_outputs,
"RPCClient": rpc_client_var},
attrs={"endpoints": pserver_endpoints, attrs={"endpoints": pserver_endpoints,
"epmap": eplist}) "epmap": eplist})
# step4 # step4
...@@ -221,7 +232,7 @@ class DistributeTranspiler: ...@@ -221,7 +232,7 @@ class DistributeTranspiler:
if len(splited_vars) <= 1: if len(splited_vars) <= 1:
continue continue
orig_var = program.global_block().vars[varname] orig_var = program.global_block().vars[varname]
if orig_var == core.VarDesc.VarType.SELECTED_ROWS: if orig_var.type == core.VarDesc.VarType.SELECTED_ROWS:
height_sections = [] height_sections = []
for v in splited_vars: for v in splited_vars:
height_sections.append(v.shape[0]) height_sections.append(v.shape[0])
...@@ -230,7 +241,7 @@ class DistributeTranspiler: ...@@ -230,7 +241,7 @@ class DistributeTranspiler:
inputs={"X": orig_var}, inputs={"X": orig_var},
outputs={"Out": splited_vars}, outputs={"Out": splited_vars},
attrs={"height_sections": height_sections}) attrs={"height_sections": height_sections})
elif orig_var == core.VarDesc.VarType.LOD_TENSOR: elif orig_var.type == core.VarDesc.VarType.LOD_TENSOR:
sections = [] sections = []
for v in splited_vars: for v in splited_vars:
sections.append(v.shape[0]) sections.append(v.shape[0])
...@@ -303,8 +314,8 @@ class DistributeTranspiler: ...@@ -303,8 +314,8 @@ class DistributeTranspiler:
return True return True
else: else:
for n in param_names: for n in param_names:
if n.startswith(op.inputs["Param"].name+".block") and \ if same_or_split_var(n, op.inputs[
n != op.inputs["Param"].name: "Param"].name) and n != op.inputs["Param"].name:
return True return True
return False return False
else: else:
...@@ -335,7 +346,7 @@ class DistributeTranspiler: ...@@ -335,7 +346,7 @@ class DistributeTranspiler:
if key == "Grad": if key == "Grad":
grad_block = None grad_block = None
for g in self.param_grad_ep_mapping[endpoint]["grads"]: for g in self.param_grad_ep_mapping[endpoint]["grads"]:
if g.name.startswith(var.name): if same_or_split_var(g.name, var.name):
grad_block = g grad_block = g
break break
if not grad_block: if not grad_block:
...@@ -365,7 +376,7 @@ class DistributeTranspiler: ...@@ -365,7 +376,7 @@ class DistributeTranspiler:
# param is already created on global program # param is already created on global program
param_block = None param_block = None
for p in self.param_grad_ep_mapping[endpoint]["params"]: for p in self.param_grad_ep_mapping[endpoint]["params"]:
if p.name.startswith(var.name): if same_or_split_var(p.name, var.name):
param_block = p param_block = p
break break
if not param_block: if not param_block:
...@@ -470,8 +481,7 @@ class DistributeTranspiler: ...@@ -470,8 +481,7 @@ class DistributeTranspiler:
# Append the recv op # Append the recv op
pserver_program.global_block().append_op( pserver_program.global_block().append_op(
type="recv", type="recv",
inputs={"RX": self.param_grad_ep_mapping[endpoint]["grads"] inputs={},
}, # grads to recv
outputs={}, outputs={},
attrs={ attrs={
"OptimizeBlock": optimize_sub_program.global_block(), "OptimizeBlock": optimize_sub_program.global_block(),
...@@ -502,7 +512,7 @@ class DistributeTranspiler: ...@@ -502,7 +512,7 @@ class DistributeTranspiler:
def _get_splited_name_and_shape(varname): def _get_splited_name_and_shape(varname):
for idx, splited_param in enumerate(params): for idx, splited_param in enumerate(params):
pname = splited_param.name pname = splited_param.name
if pname.startswith(varname) and varname != pname: if same_or_split_var(pname, varname) and varname != pname:
return pname, splited_param.shape return pname, splited_param.shape
return "", [] return "", []
......
...@@ -68,6 +68,84 @@ def as_numpy(tensor): ...@@ -68,6 +68,84 @@ def as_numpy(tensor):
return ans return ans
def has_feed_operators(block, feed_targets, feed_holder_name):
    """ Report whether the block already starts with feed operators.

    Operators are scanned from the front of ``block.ops`` and the scan stops
    at the first operator whose type is not 'feed'. Every feed operator
    seen on the way is validated against ``feed_targets`` and
    ``feed_holder_name``.

    Args:
        block: a block instance (typically global block of a program)
        feed_targets: a dictionary of {feed_target_name: feed_target_data}
        feed_holder_name: the name of the variable that holds the data of
            all feed targets. The type of this feed_holder variable is
            FEED_MINIBATCH, which is essentially vector<LoDTensor>.

    Returns:
        False when no leading feed operator exists, True when the leading
        feed operators agree with feed_targets and feed_holder_name.

    Raises:
        AssertionError: a feed operator reads from a holder other than
            feed_holder_name.
        Exception: a feed operator writes a variable that is missing from
            feed_targets, or the number of feed operators differs from
            the number of feed targets.
    """
    num_feed_ops = 0
    for op in block.ops:
        desc = op.desc
        if desc.type() != 'feed':
            # Only the leading run of feed operators is inspected.
            break
        num_feed_ops += 1
        assert desc.input('X')[0] == feed_holder_name
        target_name = desc.output('Out')[0]
        if target_name not in feed_targets:
            raise Exception("'feed_targets' does not have {} variable".
                            format(target_name))

    if num_feed_ops == 0:
        return False
    if num_feed_ops != len(feed_targets):
        raise Exception(
            "Feed operators in program desc do not match 'feed_targets'")
    return True
def has_fetch_operators(block, fetch_targets, fetch_holder_name):
    """ Check whether the block already has fetch operators.

    Return false if the block does not have any fetch operators.
    If some fetch operators have been appended to the block, check that
    the info contained in these fetch operators matches the fetch_targets
    and fetch_holder_name. Raise exception when any mismatch is found.
    Return true when the block has fetch operators with matching info.

    Args:
        block: a block instance (typically global block of a program)
        fetch_targets: an indexable sequence of fetch target variables.
            Each element is asked for ``desc.name()``, and the sequence is
            indexed with the 'col' attribute of every fetch operator.
        fetch_holder_name: the name of the variable that holds the data of
            all fetch targets. The type of this fetch_holder variable is
            FETCH_LIST, which is essentially vector<LoDTensor>.

    Returns:
        A boolean value that indicates whether a block has fetch operators
        that match the info contained in fetch_targets and fetch_holder_name.

    Raises:
        AssertionError: a fetch operator writes to a holder other than
            fetch_holder_name, or its 'col' attribute points at a target
            with a different name.
        Exception: a fetch operator reads a variable that is not among
            fetch_targets, or the number of fetch operators differs from
            the number of fetch targets.
    """
    fetch_count = 0
    # Built on the first fetch operator only: when the block has no fetch
    # operator the targets are never inspected, exactly as before.
    target_names = None
    for op in block.ops:
        if op.desc.type() != 'fetch':
            continue
        fetch_count += 1
        assert op.desc.output('Out')[0] == fetch_holder_name
        fetch_target_name = op.desc.input('X')[0]
        if target_names is None:
            target_names = [var.desc.name() for var in fetch_targets]
        if fetch_target_name not in target_names:
            raise Exception("'fetch_targets' does not have {} variable".
                            format(fetch_target_name))
        idx = op.desc.attr('col')
        assert fetch_target_name == target_names[idx]

    if fetch_count > 0 and fetch_count != len(fetch_targets):
        raise Exception(
            "Fetch operators in program desc do not match 'fetch_targets'")

    return fetch_count > 0
class Executor(object): class Executor(object):
def __init__(self, places): def __init__(self, places):
if not isinstance(places, list) and not isinstance(places, tuple): if not isinstance(places, list) and not isinstance(places, tuple):
...@@ -147,27 +225,44 @@ class Executor(object): ...@@ -147,27 +225,44 @@ class Executor(object):
program = program.clone() program = program.clone()
global_block = program.global_block() global_block = program.global_block()
if feed_var_name in global_block.vars:
feed_var = global_block.var(feed_var_name)
else:
feed_var = global_block.create_var( feed_var = global_block.create_var(
name=feed_var_name, name=feed_var_name,
type=core.VarDesc.VarType.FEED_MINIBATCH, type=core.VarDesc.VarType.FEED_MINIBATCH,
persistable=True) persistable=True)
if fetch_var_name in global_block.vars:
fetch_var = global_block.var(fetch_var_name)
else:
fetch_var = global_block.create_var(
name=fetch_var_name,
type=core.VarDesc.VarType.FETCH_LIST,
persistable=True)
if not has_feed_operators(global_block, feed, feed_var_name):
for i, name in enumerate(feed): for i, name in enumerate(feed):
out = global_block.var(name) out = global_block.var(name)
global_block.prepend_op( global_block.prepend_op(
'feed', type='feed',
inputs={'X': [feed_var]}, inputs={'X': [feed_var]},
outputs={'Out': [out]}, outputs={'Out': [out]},
attrs={'col': i}) attrs={'col': i})
cur_feed = feed[name]
for op in global_block.ops:
if op.desc.type() == 'feed':
feed_target_name = op.desc.output('Out')[0]
cur_feed = feed[feed_target_name]
if not isinstance(cur_feed, core.LoDTensor): if not isinstance(cur_feed, core.LoDTensor):
cur_feed = self.aslodtensor(cur_feed) cur_feed = self.aslodtensor(cur_feed)
core.set_feed_variable(scope, cur_feed, feed_var.name, i) idx = op.desc.attr('col')
core.set_feed_variable(scope, cur_feed, feed_var_name, idx)
else:
break
fetch_var = global_block.create_var( if not has_fetch_operators(global_block, fetch_list, fetch_var_name):
name=fetch_var_name,
type=core.VarDesc.VarType.FETCH_LIST,
persistable=True)
for i, var in enumerate(fetch_list): for i, var in enumerate(fetch_list):
global_block.append_op( global_block.append_op(
type='fetch', type='fetch',
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
import collections import collections
import contextlib import contextlib
import re
import numpy as np import numpy as np
...@@ -239,20 +240,30 @@ class Variable(object): ...@@ -239,20 +240,30 @@ class Variable(object):
def __str__(self): def __str__(self):
return self.to_string(True) return self.to_string(True)
def to_string(self, throw_on_error): def to_string(self, throw_on_error, with_details=False):
""" """
Get debug string. Get debug string.
Args: Args:
throw_on_error(bool): True if raise an exception when self is not throw_on_error(bool): True if raise an exception when self is not
intialized. intialized.
with_details(bool): more details about variables and parameters
(e.g. trainable, optimize_attr, ...) will be printed when with_details is True
Returns(str): The debug string. Returns(str): The debug string.
""" """
assert isinstance(throw_on_error, bool) and isinstance(with_details,
bool)
protostr = self.desc.serialize_to_string() protostr = self.desc.serialize_to_string()
proto = framework_pb2.VarDesc.FromString(str(protostr)) proto = framework_pb2.VarDesc.FromString(str(protostr))
return _debug_string_(proto, throw_on_error) res_str = _debug_string_(proto, throw_on_error)
if with_details:
additional_attr = ("error_clip", "stop_gradient")
for attr_name in additional_attr:
res_str += "%s: %s\n" % (attr_name,
str(getattr(self, attr_name)))
return res_str
__repr__ = __str__ __repr__ = __str__
...@@ -629,10 +640,36 @@ class Block(object): ...@@ -629,10 +640,36 @@ class Block(object):
def __str__(self): def __str__(self):
return self.to_string(True) return self.to_string(True)
def to_string(self, throw_on_error): def to_string(self, throw_on_error, with_details=False):
"""
To debug string.
Args:
throw_on_error(bool): raise exception when self is not initialized
when throw_on_error is True
with_details(bool): more details about variables and parameters
(e.g. trainable, optimize_attr, ...) will be printed when with_details is True
Returns(str): The debug string.
"""
assert isinstance(throw_on_error, bool) and isinstance(with_details,
bool)
if with_details:
re_add_indent = re.compile(r"\n(.)")
res_str = "blocks {\n idx: %d\n parent_idx: %d" % (
self.idx, self.parent_idx)
for var in self.vars.itervalues():
res_str += "\n vars {\n %s }" % re_add_indent.sub(
r"\n \1", var.to_string(throw_on_error, with_details))
for op in self.ops:
res_str += "\n ops {\n %s }" % re_add_indent.sub(
r"\n \1", op.to_string(throw_on_error))
res_str += "\n}"
else:
protostr = self.desc.serialize_to_string() protostr = self.desc.serialize_to_string()
proto = framework_pb2.BlockDesc.FromString(str(protostr)) proto = framework_pb2.BlockDesc.FromString(str(protostr))
return _debug_string_(proto, throw_on_error) res_str = _debug_string_(proto, throw_on_error)
return res_str
__repr__ = __str__ __repr__ = __str__
...@@ -796,10 +833,29 @@ class Program(object): ...@@ -796,10 +833,29 @@ class Program(object):
def __str__(self): def __str__(self):
return self.to_string(True) return self.to_string(True)
def to_string(self, throw_on_error): def to_string(self, throw_on_error, with_details=False):
"""
To debug string.
Args:
throw_on_error(bool): raise exception when self is not initialized
when throw_on_error is True
with_details(bool): more details about variables and parameters
(e.g. trainable, optimize_attr, ...) will be printed when with_details is True
Returns(str): The debug string.
"""
assert isinstance(throw_on_error, bool) and isinstance(with_details,
bool)
if with_details:
res_str = ""
for block in self.blocks:
res_str += block.to_string(throw_on_error, with_details)
else:
protostr = self.desc.serialize_to_string() protostr = self.desc.serialize_to_string()
proto = framework_pb2.ProgramDesc.FromString(str(protostr)) proto = framework_pb2.ProgramDesc.FromString(str(protostr))
return _debug_string_(proto, throw_on_error) res_str = _debug_string_(proto, throw_on_error)
return res_str
def get_desc(self): def get_desc(self):
return self.desc return self.desc
...@@ -950,6 +1006,36 @@ class Parameter(Variable): ...@@ -950,6 +1006,36 @@ class Parameter(Variable):
self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None) self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None)
def __str__(self):
    """Return the debug string from ``to_string`` with throw_on_error=True."""
    return self.to_string(True)
def to_string(self, throw_on_error, with_details=False):
    """
    Build the debug string of this parameter.

    Args:
        throw_on_error(bool): forwarded unchanged to Variable.to_string
        with_details(bool): when True, the text produced by
            Variable.to_string is followed by one "<name>: <value>" line
            for each of trainable, optimize_attr, regularizer and
            gradient_clip_attr

    Returns(str): The debug string.
    """
    assert isinstance(throw_on_error, bool) and isinstance(with_details,
                                                           bool)
    # with_details is a bool at this point, so forwarding it is the same as
    # forwarding the literal True / False.
    res_str = Variable.to_string(self, throw_on_error, with_details)
    if not with_details:
        return res_str

    for attr_name in ("trainable", "optimize_attr", "regularizer",
                      "gradient_clip_attr"):
        attr_value = getattr(self, attr_name)
        res_str += "%s: %s\n" % (attr_name, str(attr_value))
    return res_str
__repr__ = __str__
# program is a global instance. # program is a global instance.
_main_program_ = Program() _main_program_ = Program()
......
...@@ -13,8 +13,8 @@ ...@@ -13,8 +13,8 @@
# limitations under the License. # limitations under the License.
import os import os
import cPickle as pickle
from paddle.v2.fluid.evaluator import Evaluator
from paddle.v2.fluid.framework import Program, Parameter, default_main_program, Variable from paddle.v2.fluid.framework import Program, Parameter, default_main_program, Variable
from . import core from . import core
...@@ -187,18 +187,28 @@ def get_inference_program(target_vars, main_program=None): ...@@ -187,18 +187,28 @@ def get_inference_program(target_vars, main_program=None):
main_program = default_main_program() main_program = default_main_program()
if not isinstance(target_vars, list): if not isinstance(target_vars, list):
target_vars = [target_vars] target_vars = [target_vars]
vars = []
pruned_program = main_program.prune(targets=target_vars) for var in target_vars:
if isinstance(var, Evaluator):
vars.extend(var.states)
vars.extend(var.metrics)
else:
vars.append(var)
pruned_program = main_program.prune(targets=vars)
inference_program = pruned_program.inference_optimize() inference_program = pruned_program.inference_optimize()
return inference_program return inference_program
def prepend_feed_ops(inference_program, feeded_var_names): def prepend_feed_ops(inference_program,
feed_target_names,
feed_holder_name='feed'):
global_block = inference_program.global_block() global_block = inference_program.global_block()
feed_var = global_block.create_var( feed_var = global_block.create_var(
name='feed', type=core.VarDesc.VarType.FEED_MINIBATCH, persistable=True) name=feed_holder_name,
type=core.VarDesc.VarType.FEED_MINIBATCH,
persistable=True)
for i, name in enumerate(feeded_var_names): for i, name in enumerate(feed_target_names):
out = global_block.var(name) out = global_block.var(name)
global_block.prepend_op( global_block.prepend_op(
type='feed', type='feed',
...@@ -207,12 +217,16 @@ def prepend_feed_ops(inference_program, feeded_var_names): ...@@ -207,12 +217,16 @@ def prepend_feed_ops(inference_program, feeded_var_names):
attrs={'col': i}) attrs={'col': i})
def append_fetch_ops(inference_program, fetch_var_names): def append_fetch_ops(inference_program,
fetch_target_names,
fetch_holder_name='fetch'):
global_block = inference_program.global_block() global_block = inference_program.global_block()
fetch_var = global_block.create_var( fetch_var = global_block.create_var(
name='fetch', type=core.VarDesc.VarType.FETCH_LIST, persistable=True) name=fetch_holder_name,
type=core.VarDesc.VarType.FETCH_LIST,
persistable=True)
for i, name in enumerate(fetch_var_names): for i, name in enumerate(fetch_target_names):
global_block.append_op( global_block.append_op(
type='fetch', type='fetch',
inputs={'X': [name]}, inputs={'X': [name]},
...@@ -262,21 +276,12 @@ def save_inference_model(dirname, ...@@ -262,21 +276,12 @@ def save_inference_model(dirname,
inference_program = pruned_program.inference_optimize() inference_program = pruned_program.inference_optimize()
fetch_var_names = [v.name for v in target_vars] fetch_var_names = [v.name for v in target_vars]
model_file_name = dirname + "/__model__"
with open(model_file_name, "w") as f:
pickle.dump({
"program_desc_str": inference_program.desc.serialize_to_string(),
"feed_var_names": feeded_var_names,
"fetch_var_names": fetch_var_names
}, f, -1)
prepend_feed_ops(inference_program, feeded_var_names) prepend_feed_ops(inference_program, feeded_var_names)
append_fetch_ops(inference_program, fetch_var_names) append_fetch_ops(inference_program, fetch_var_names)
# Save only programDesc of inference_program in binary format model_file_name = dirname + "/__model__"
# in another file: __model__.dat with open(model_file_name, "wb") as f:
with open(model_file_name + ".dat", "wb") as fp: f.write(inference_program.desc.serialize_to_string())
fp.write(inference_program.desc.serialize_to_string())
save_params(executor, dirname, main_program) save_params(executor, dirname, main_program)
...@@ -299,6 +304,24 @@ def load_persistables_if_exist(executor, dirname, main_program=None): ...@@ -299,6 +304,24 @@ def load_persistables_if_exist(executor, dirname, main_program=None):
predicate=_is_presistable_and_exist_) predicate=_is_presistable_and_exist_)
def get_feed_targets_names(program):
    """Collect the output names of the feed operators of a program.

    Args:
        program: object whose ``global_block().ops`` is scanned.

    Returns(list): the first 'Out' name of every operator whose type is
        'feed', listed in the reverse of the order in which the operators
        appear in the global block.
    """
    global_block = program.global_block()
    feed_targets_names = [
        op.desc.output('Out')[0] for op in global_block.ops
        if op.desc.type() == 'feed'
    ]
    # One reversal gives the same order as inserting every name at index 0,
    # without shifting the list once per operator.
    # NOTE(review): presumably reversed because feed operators are prepended
    # one by one, so the block holds them in reverse 'col' order; confirm.
    feed_targets_names.reverse()
    return feed_targets_names
def get_fetch_targets_names(program):
    """Collect the input names of the fetch operators of a program.

    Args:
        program: object whose ``global_block().ops`` is scanned.

    Returns(list): the first 'X' name of every operator whose type is
        'fetch', in the order the operators appear in the global block.
    """
    return [
        op.desc.input('X')[0] for op in program.global_block().ops
        if op.desc.type() == 'fetch'
    ]
def load_inference_model(dirname, executor): def load_inference_model(dirname, executor):
""" """
Load inference model from a directory Load inference model from a directory
...@@ -306,24 +329,28 @@ def load_inference_model(dirname, executor): ...@@ -306,24 +329,28 @@ def load_inference_model(dirname, executor):
:param dirname: directory path :param dirname: directory path
:param executor: executor that load inference model :param executor: executor that load inference model
:return: [program, feed_var_names, fetch_var_names] :return: [program, feed_target_names, fetch_targets]
program: program especially for inference. program: program especially for inference.
feeded_var_names: Names of variables that need to feed data feed_target_names: Names of variables that need to feed data
fetch_vars: Variables from which we can get inference results. fetch_targets: Variables from which we can get inference results.
""" """
if not os.path.isdir(dirname): if not os.path.isdir(dirname):
raise ValueError("There is no directory named '%s'", dirname) raise ValueError("There is no directory named '%s'", dirname)
model_file_name = dirname + "/__model__" model_file_name = dirname + "/__model__"
model = pickle.load(open(model_file_name, "r")) with open(model_file_name, "rb") as f:
program_desc_str = model["program_desc_str"] program_desc_str = f.read()
feed_var_names = model["feed_var_names"]
fetch_var_names = model["fetch_var_names"]
program = Program.parse_from_string(program_desc_str) program = Program.parse_from_string(program_desc_str)
load_persistables_if_exist(executor, dirname, program) load_persistables_if_exist(executor, dirname, program)
fetch_vars = [program.global_block().var(name) for name in fetch_var_names]
return [program, feed_var_names, fetch_vars] feed_target_names = get_feed_targets_names(program)
fetch_target_names = get_fetch_targets_names(program)
fetch_targets = [
program.global_block().var(name) for name in fetch_target_names
]
return [program, feed_target_names, fetch_targets]
def get_parameter_value(para, executor): def get_parameter_value(para, executor):
......
...@@ -18,7 +18,7 @@ import itertools ...@@ -18,7 +18,7 @@ import itertools
from framework import Variable, Parameter, default_main_program, default_startup_program, \ from framework import Variable, Parameter, default_main_program, default_startup_program, \
unique_name, dtype_is_floating unique_name, dtype_is_floating
from paddle.v2.fluid.initializer import Constant, Xavier from paddle.v2.fluid.initializer import Constant, Xavier
from param_attr import ParamAttr from param_attr import ParamAttr, WeightNormParamAttr
class LayerHelper(object): class LayerHelper(object):
...@@ -100,9 +100,181 @@ class LayerHelper(object): ...@@ -100,9 +100,181 @@ class LayerHelper(object):
if dtype is None: if dtype is None:
dtype = each.dtype dtype = each.dtype
elif dtype != each.dtype: elif dtype != each.dtype:
raise ValueError("Data Type mismatch") raise ValueError("Data Type mismatch: %d to %d" %
(dtype, each.dtype))
return dtype return dtype
def _create_weight_normalize(self, attr, shape, dtype):
    """Build a weight-normalized parameter ``w = v * (g / norm(v))``.

    Two parameters named ``attr.name + '_g'`` and ``attr.name + '_v'`` are
    created in both the startup program and the main program. In the
    startup program ``v`` is created with the initializer carried by
    ``attr`` and ``g`` is written by norm ops applied to ``v``. In the main
    program the ops that recombine ``g`` and ``v`` are appended to the
    current block.

    Args:
        attr: parameter attribute; this method reads ``attr.name`` and
            ``attr.dim`` and calls ``to_kwargs`` on deep copies of it.
        shape(list): shape of ``v``. ``g`` has the same rank with every
            entry equal to 1, except that entry ``attr.dim`` is
            ``shape[attr.dim]`` when ``attr.dim`` is not None.
        dtype: data type of both parameters and of every temporary
            variable created here.

    Returns: the output variable of the final ``elementwise_mul`` appended
        to the main program, i.e. the normalized weight.
    """
    from .layers import elementwise_mul, elementwise_div, reshape

    # Remove these ops when LayerHelper and layers support indicating
    # program and block.
    def __norm_op(x,
                  out=None,
                  p=2,
                  dim=None,
                  keep_dim=False,
                  block=self.startup_program.global_block()):
        """Append abs -> pow(p) -> reduce_sum -> pow(1/p) ops on ``x``."""
        if out is None:
            out = block.create_var(
                name=unique_name(".".join([self.name, 'weight_norm_norm'])),
                dtype=dtype,
                persistable=False)
        abs_out = block.create_var(
            name=unique_name(".".join([self.name, 'weight_norm_abs'])),
            dtype=dtype,
            persistable=False)
        block.append_op(
            type='abs', inputs={'X': x}, outputs={'Out': abs_out})
        pow_out = block.create_var(
            name=unique_name(".".join([self.name, 'weight_norm_pow'])),
            dtype=dtype,
            persistable=False)
        block.append_op(
            type='pow',
            inputs={'X': abs_out},
            outputs={'Out': pow_out},
            attrs={'factor': float(p)})
        sum_out = block.create_var(
            name=unique_name(".".join([self.name, 'weight_norm_sum'])),
            dtype=dtype,
            persistable=False)
        block.append_op(
            type='reduce_sum',
            inputs={'X': pow_out},
            outputs={'Out': sum_out},
            attrs={
                'dim': dim,
                'keep_dim': keep_dim,
                'reduce_all': True if dim is None else False
            })
        block.append_op(
            type='pow',
            inputs={'X': sum_out},
            outputs={'Out': out},
            attrs={'factor': 1. / p})
        return out

    def __reshape_op(x,
                     shape,
                     out=None,
                     block=self.startup_program.global_block()):
        """Append a reshape op on ``x``; create ``out`` when not given."""
        if out is None:
            out = block.create_var(
                name=unique_name(".".join(
                    [self.name, 'weight_norm_reshape'])),
                dtype=dtype,
                persistable=False)
        block.append_op(
            type='reshape',
            inputs={'X': x},
            outputs={'Out': out},
            attrs={'shape': shape})
        return out

    def __transpose_op(x,
                       axis,
                       out=None,
                       block=self.startup_program.global_block()):
        """Append a transpose op on ``x``; create ``out`` when not given."""
        if out is None:
            out = block.create_var(
                name=unique_name(".".join(
                    [self.name, 'weight_norm_transpose'])),
                dtype=dtype,
                persistable=False)
        block.append_op(
            type='transpose',
            inputs={'X': x},
            outputs={'Out': out},
            attrs={'axis': axis})
        return out

    def __norm_except_dim(x,
                          out=None,
                          dim=None,
                          block=self.startup_program.global_block()):
        """Computes the norm over all dimensions except dim"""
        if out is None:
            out = block.create_var(
                name=unique_name(".".join([self.name, 'weight_norm_norm'])),
                dtype=dtype,
                persistable=False)
        if dim is None:
            __norm_op(x, out, dim=dim, block=block)
        elif dim == 0:
            out_shape = [x.shape[0]] + [1] * (len(x.shape) - 1)
            reshape = __reshape_op(x, shape=[x.shape[0], -1], block=block)
            norm = __norm_op(reshape, dim=1, block=block)
            __reshape_op(norm, out=out, shape=out_shape, block=block)
        elif dim == len(x.shape) - 1:
            out_shape = [1] * (len(x.shape) - 1) + [x.shape[-1]]
            reshape = __reshape_op(x, shape=[-1, x.shape[-1]], block=block)
            norm = __norm_op(reshape, dim=0, block=block)
            __reshape_op(norm, out=out, shape=out_shape, block=block)
        else:
            # A real list is required here: the two entries are swapped by
            # item assignment, which a lazy range object does not support.
            perm = list(range(len(x.shape)))
            perm[0], perm[dim] = dim, 0
            # NOTE(review): unlike the two branches above, this one reduces
            # a single axis of the transposed tensor (dim=0) - confirm that
            # it really yields the norm over all dimensions except dim.
            transpose = __transpose_op(x, perm, block=block)
            norm = __norm_op(transpose, dim=0, block=block)
            __transpose_op(norm, perm, out=out, block=block)
        return out

    def __weight_normalize(g, v, dim):
        """Calculations for weight normalization"""
        norm = __norm_except_dim(
            v, dim=dim, block=self.main_program.current_block())
        scale = elementwise_div(
            x=g, y=norm)  # The shapes of g and norm are the same.
        # Currently, elementwise_mul only support broadcast when the shape
        # of y is a subset of the shape of x. Thus, we reshape y to squeeze
        # to achieve the subset.
        w = elementwise_mul(
            x=v,
            y=scale if dim is None else reshape(
                x=scale, shape=[v.shape[dim]]),
            axis=-1 if dim is None else dim)
        # To serialize the original parameter for inference, maybe a
        # parameter rather than a variable should be returned.
        return w

    g_param_attr = copy.deepcopy(attr)
    g_param_attr.name = attr.name + '_g'
    g_param_shape = [1] * len(shape)
    if attr.dim is not None:
        g_param_shape[attr.dim] = shape[attr.dim]
    v_param_attr = copy.deepcopy(attr)
    v_param_attr.name = attr.name + '_v'
    v_param_shape = shape

    # Add to startup_program to initialize g and v.
    # Try to reconstruct the initializer of w by initializing g and v.
    # Set the initializers of g and v as below, then the distribution
    # of w is the same as initializing w with the given initializer.
    # For Data-Dependent Initialization, please compute the init-values
    # of g and v in external and then feed the values to g and v by
    # executing an extra program.
    g_param = self.startup_program.global_block().create_parameter(
        dtype=dtype,
        shape=g_param_shape,
        **g_param_attr.to_kwargs(with_initializer=False))
    v_param = self.startup_program.global_block().create_parameter(
        dtype=dtype,
        shape=v_param_shape,
        **v_param_attr.to_kwargs(with_initializer=True))
    __norm_except_dim(
        x=v_param,
        out=g_param,
        dim=attr.dim,
        block=self.startup_program.global_block())

    # Add weight normalization to main_program
    g_param = self.main_program.global_block().create_parameter(
        dtype=dtype, shape=g_param_shape, **g_param_attr.to_kwargs())
    v_param = self.main_program.global_block().create_parameter(
        dtype=dtype, shape=v_param_shape, **v_param_attr.to_kwargs())
    w_param = __weight_normalize(g_param, v_param, dim=attr.dim)
    return w_param
def create_parameter(self, def create_parameter(self,
attr, attr,
shape, shape,
...@@ -110,18 +282,26 @@ class LayerHelper(object): ...@@ -110,18 +282,26 @@ class LayerHelper(object):
is_bias=False, is_bias=False,
default_initializer=None): default_initializer=None):
# Deepcopy the attr so that parameters can be shared in program # Deepcopy the attr so that parameters can be shared in program
attr = copy.deepcopy(attr)
assert isinstance(attr, ParamAttr) assert isinstance(attr, ParamAttr)
suffix = 'b' if is_bias else 'w' suffix = 'b' if is_bias else 'w'
if attr.name is None:
attr.name = unique_name(".".join([self.name, suffix]))
if default_initializer is None: if default_initializer is None and attr.initializer is None:
if is_bias: if is_bias:
attr.set_default_bias_initializer() attr.set_default_bias_initializer()
else: else:
attr.set_default_param_initializer() attr.set_default_param_initializer()
else: else:
attr.set_default_initializer(default_initializer) attr.set_default_initializer(default_initializer)
if attr.name is None:
attr.name = unique_name(".".join([self.name, suffix])) # If weight normalization is set, insert extra parameters and ops.
# Refer to https://arxiv.org/pdf/1602.07868.pdf
if isinstance(attr, WeightNormParamAttr):
param = self._create_weight_normalize(attr, shape, dtype)
WeightNormParamAttr.params_with_weight_norm.append(param)
return param
self.startup_program.global_block().create_parameter( self.startup_program.global_block().create_parameter(
dtype=dtype, shape=shape, **attr.to_kwargs(with_initializer=True)) dtype=dtype, shape=shape, **attr.to_kwargs(with_initializer=True))
......
...@@ -289,6 +289,7 @@ class ParallelDo(object): ...@@ -289,6 +289,7 @@ class ParallelDo(object):
for in_var_name in op.input(iname): for in_var_name in op.input(iname):
if in_var_name not in local_inputs: if in_var_name not in local_inputs:
params.append(in_var_name) params.append(in_var_name)
params = list(set(params))
return [parent_block.var(name) for name in params] return [parent_block.var(name) for name in params]
...@@ -769,7 +770,7 @@ def topk(input, k): ...@@ -769,7 +770,7 @@ def topk(input, k):
array = fluid.layers.topk(x, k) array = fluid.layers.topk(x, k)
""" """
helper = LayerHelper('topk', **locals()) helper = LayerHelper('topk', **locals())
topk_out = helper.create_tmp_variable(dtype=input.data_type) topk_out = helper.create_tmp_variable(dtype=input.dtype)
topk_indices = helper.create_tmp_variable(dtype='int64') topk_indices = helper.create_tmp_variable(dtype='int64')
helper.append_op( helper.append_op(
type='top_k', type='top_k',
......
...@@ -14,8 +14,10 @@ ...@@ -14,8 +14,10 @@
from .. import core from .. import core
from ..layer_helper import LayerHelper from ..layer_helper import LayerHelper
from control_flow import BlockGuard
from ..layer_helper import LayerHelper
__all__ = ['data'] __all__ = ['data', 'BlockGuardServ', 'ListenAndServ', 'Send']
def data(name, def data(name,
...@@ -74,3 +76,123 @@ def data(name, ...@@ -74,3 +76,123 @@ def data(name,
type=type, type=type,
stop_gradient=stop_gradient, stop_gradient=stop_gradient,
lod_level=lod_level) lod_level=lod_level)
class BlockGuardServ(BlockGuard):
    """
    Block guard used by ListenAndServ.do().

    The guard is built on top of BlockGuard, using the main program of the
    server's helper. When the guarded block is left without an exception,
    the server's `complete_op()` is invoked before BlockGuard's own exit
    handling runs.
    """

    def __init__(self, server):
        # Only a ListenAndServ provides the `helper` / `complete_op` pair
        # this guard relies on.
        if isinstance(server, ListenAndServ):
            super(BlockGuardServ, self).__init__(server.helper.main_program)
            self.server = server
        else:
            raise TypeError("BlockGuardServ takes a ListenAndServ")

    def __exit__(self, exc_type, exc_val, exc_tb):
        failed = exc_type is not None
        if failed:
            # Returning False lets the exception propagate; neither
            # complete_op() nor BlockGuard.__exit__ runs in that case.
            return False

        self.server.complete_op()
        parent_exit = super(BlockGuardServ, self).__exit__
        return parent_exit(exc_type, exc_val, exc_tb)
class ListenAndServ(object):
    """
    ListenAndServ class.

    ListenAndServ class is used to wrap listen_and_serv op to create a server
    which can receive variables from clients and run a block.

    The operators built inside the block opened by `do()` are handed to a
    `recv` operator (appended to the parent block) through its
    `OptimizeBlock` attribute.

    Args:
        endpoint: Value stored in the `endpoint` attribute of the op.
        fan_in: Value stored in the `Fanin` attribute of the op.
        optimizer_mode(bool): When True, only ops that have both a `Param`
            and a `Grad` input contribute to `ParamList` / `GradList`.
            When False, every input of every op in the block is listed.
    """

    def __init__(self, endpoint, fan_in=1, optimizer_mode=True):
        self.helper = LayerHelper("recv")
        self.inputs = []
        self.outputs = []
        self.endpoint = endpoint
        self.fan_in = fan_in
        # FIXME(typhoonzero): add optimizer_mode is stupid, should make it more
        # general.
        self.optimizer_mode = optimizer_mode

    def do(self):
        """Return the guard whose `with` block defines the server program."""
        return BlockGuardServ(self)

    def get_params_and_grads(self):
        """
        Collect the parameters and gradients used by the current block.

        Returns:
            tuple: `(params, grads)`, two lists in matching order. In
            optimizer mode the entries are variable *names*; otherwise they
            are the variables looked up in the parent block (each input
            appears in both lists).
        """
        main_program = self.helper.main_program
        current_block = main_program.current_block()
        parent_block = self.parent_block()
        # params and grads in the same order.
        params = list()
        grads = list()
        for op in current_block.ops:
            # FIXME(typhoonzero): op.inputs is None if it's cloned.
            if self.optimizer_mode:
                if "Grad" in op.inputs and "Param" in op.inputs:
                    params.append(op.inputs["Param"].name)
                    grads.append(op.inputs["Grad"].name)
            else:
                # simple recv mode, recv operators inputs.
                for iname in op.input_names:
                    for in_var_name in op.input(iname):
                        params.append(parent_block.var(in_var_name))
                        grads.append(parent_block.var(in_var_name))

        return params, grads

    def parent_block(self):
        """Return the block that encloses the program's current block."""
        prog = self.helper.main_program
        parent_idx = prog.current_block().parent_idx
        assert parent_idx >= 0
        parent_block = prog.block(parent_idx)
        return parent_block

    def complete_op(self):
        """Append the `recv` op describing this server to the parent block."""
        main_program = self.helper.main_program
        current_block = main_program.current_block()
        parent_block = self.parent_block()

        params, grads = self.get_params_and_grads()
        # get_params_and_grads() yields plain names in optimizer mode and
        # variable objects otherwise; calling `.name` unconditionally raised
        # AttributeError on the names. Accept both forms here.
        param_names = [getattr(p, "name", p) for p in params]
        grad_names = [getattr(g, "name", g) for g in grads]

        parent_block.append_op(
            type='recv',
            inputs={},
            outputs={},
            attrs={
                'endpoint': self.endpoint,
                'Fanin': self.fan_in,
                'ParamList': param_names,
                'GradList': grad_names,
                'OptimizeBlock': current_block
            })
def Send(endpoints, send_vars, get_vars):
    """
    Send layer

    Args:
        endpoints: comma separated IP:PORT pairs in the order
                   of send_vars to send
        send_vars: vars to send
        get_vars: vars to get from server after send completes.

    Send variables to the server side, and get vars from server
    side when server have finished running server side program.
    """
    assert isinstance(send_vars, list)
    assert isinstance(get_vars, list)

    # `epmap` keeps one entry per comma-separated item (duplicates included);
    # `endpoints` is the de-duplicated set. Sorting makes the attribute order
    # reproducible instead of depending on set iteration order.
    epmap = endpoints.split(",")
    endpoints = sorted(set(epmap))

    # NOTE: no extra locals are introduced above, so the keyword arguments
    # forwarded through **locals() stay the same as before.
    helper = LayerHelper("Send", **locals())
    helper.append_op(
        type="send",
        inputs={"X": send_vars},
        outputs={"Out": get_vars},
        attrs={"endpoints": endpoints,
               "epmap": epmap})
...@@ -145,7 +145,9 @@ def monkey_patch_variable(): ...@@ -145,7 +145,9 @@ def monkey_patch_variable():
# a*b == b*a. Do not need to reverse explicitly # a*b == b*a. Do not need to reverse explicitly
("__rmul__", "elementwise_mul", False), ("__rmul__", "elementwise_mul", False),
("__div__", "elementwise_div", False), ("__div__", "elementwise_div", False),
("__rdiv__", "elementwise_div", True)): ("__rdiv__", "elementwise_div", True),
("__pow__", "elementwise_pow", False),
("__rpow__", "elementwise_pow", True)):
setattr(Variable, method_name, setattr(Variable, method_name,
_elemwise_method_creator_(method_name, op_type, reverse)) _elemwise_method_creator_(method_name, op_type, reverse))
......
...@@ -26,6 +26,7 @@ __all__ = [ ...@@ -26,6 +26,7 @@ __all__ = [
'fc', 'fc',
'embedding', 'embedding',
'dynamic_lstm', 'dynamic_lstm',
'dynamic_lstmp',
'dynamic_gru', 'dynamic_gru',
'gru_unit', 'gru_unit',
'linear_chain_crf', 'linear_chain_crf',
...@@ -61,6 +62,9 @@ __all__ = [ ...@@ -61,6 +62,9 @@ __all__ = [
'transpose', 'transpose',
'im2sequence', 'im2sequence',
'nce', 'nce',
'beam_search',
'row_conv',
'multiplex',
] ]
...@@ -108,16 +112,17 @@ def fc(input, ...@@ -108,16 +112,17 @@ def fc(input,
into a 2-dimensional matrix. The parameter into a 2-dimensional matrix. The parameter
`num_flatten_dims` determines how the input tensor `num_flatten_dims` determines how the input tensor
is flattened: the first `num_flatten_dims` is flattened: the first `num_flatten_dims`
dimensions will be flatten to form the first (inclusive, index starts from 1) dimensions will
dimension of the final matrix (height of the be flatten to form the first dimension of the
matrix), and the rest `rank(X) - num_flatten_dims` final matrix (height of the matrix), and the rest
dimensions are flattened to form the second `rank(X) - num_flatten_dims` dimensions are
dimension of the final matrix (width of the matrix). flattened to form the second dimension of the
For example, suppose `X` is a 6-dimensional tensor final matrix (width of the matrix). For example,
with a shape [2, 3, 4, 5, 6], and suppose `X` is a 6-dimensional tensor with a shape
`num_flatten_dims` = 3. Then, the flattened matrix [2, 3, 4, 5, 6], and `num_flatten_dims` = 3. Then,
will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. the flattened matrix will have a shape
By default, `num_flatten_dims` is set to 1. [2 x 3 x 4, 5 x 6] = [24, 30]. By default,
`num_flatten_dims` is set to 1.
param_attr(ParamAttr|list): The parameter attribute for learnable param_attr(ParamAttr|list): The parameter attribute for learnable
parameters/weights of the fully connected parameters/weights of the fully connected
layer. layer.
...@@ -158,15 +163,14 @@ def fc(input, ...@@ -158,15 +163,14 @@ def fc(input,
param_shape = [ param_shape = [
reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1) reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1)
] + [size] ] + [size]
w = helper.create_parameter( w = helper.create_parameter(
attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False) attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)
tmp = helper.create_tmp_variable(dtype) tmp = helper.create_tmp_variable(dtype)
helper.append_op( helper.append_op(
type="mul", type="mul",
inputs={ inputs={"X": input_var,
"X": input_var, "Y": w},
"Y": w,
},
outputs={"Out": tmp}, outputs={"Out": tmp},
attrs={"x_num_col_dims": num_flatten_dims, attrs={"x_num_col_dims": num_flatten_dims,
"y_num_col_dims": 1}) "y_num_col_dims": 1})
...@@ -253,7 +257,8 @@ def dynamic_lstm(input, ...@@ -253,7 +257,8 @@ def dynamic_lstm(input,
gate_activation='sigmoid', gate_activation='sigmoid',
cell_activation='tanh', cell_activation='tanh',
candidate_activation='tanh', candidate_activation='tanh',
dtype='float32'): dtype='float32',
name=None):
""" """
**Dynamic LSTM Layer** **Dynamic LSTM Layer**
...@@ -279,7 +284,7 @@ def dynamic_lstm(input, ...@@ -279,7 +284,7 @@ def dynamic_lstm(input,
W_{fc}, W_{oc}` are diagonal weight matrices for peephole connections. In W_{fc}, W_{oc}` are diagonal weight matrices for peephole connections. In
our implementation, we use vectors to reprenset these diagonal weight our implementation, we use vectors to reprenset these diagonal weight
matrices. The :math:`b` terms denote bias vectors (:math:`b_i` is the input matrices. The :math:`b` terms denote bias vectors (:math:`b_i` is the input
gate bias vector), :math:`\sigma` is the non-line activations, such as gate bias vector), :math:`\sigma` is the non-linear activations, such as
logistic sigmoid function, and :math:`i, f, o` and :math:`c` are the input logistic sigmoid function, and :math:`i, f, o` and :math:`c` are the input
gate, forget gate, output gate, and cell activation vectors, respectively, gate, forget gate, output gate, and cell activation vectors, respectively,
all of which have the same size as the cell output activation vector :math:`h`. all of which have the same size as the cell output activation vector :math:`h`.
...@@ -305,25 +310,25 @@ def dynamic_lstm(input, ...@@ -305,25 +310,25 @@ def dynamic_lstm(input,
(T X 4D), where T is the total time steps in this (T X 4D), where T is the total time steps in this
mini-batch, D is the hidden size. mini-batch, D is the hidden size.
size(int): 4 * hidden size. size(int): 4 * hidden size.
param_attr(ParamAttr): The parameter attribute for the learnable param_attr(ParamAttr|None): The parameter attribute for the learnable
hidden-hidden weights. hidden-hidden weights.
- The shape is (D x 4D), where D is the hidden
size.
- Weights = {:math:`W_{ch}, W_{ih}, \ - Weights = {:math:`W_{ch}, W_{ih}, \
W_{fh}, W_{oh}`} W_{fh}, W_{oh}`}
bias_attr(ParamAttr): The bias attribute for the learnable bias - The shape is (D x 4D), where D is the hidden
size.
bias_attr(ParamAttr|None): The bias attribute for the learnable bias
weights, which contains two parts, input-hidden weights, which contains two parts, input-hidden
bias weights and peephole connections weights if bias weights and peephole connections weights if
setting `use_peepholes` to `True`. setting `use_peepholes` to `True`.
1. `use_peepholes = False` 1. `use_peepholes = False`
- The shape is (1 x 4D).
- Biases = {:math:`b_c, b_i, b_f, b_o`}. - Biases = {:math:`b_c, b_i, b_f, b_o`}.
- The shape is (1 x 4D).
2. `use_peepholes = True` 2. `use_peepholes = True`
- The shape is (1 x 7D).
- Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \ - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
W_{fc}, W_{oc}`}. W_{fc}, W_{oc}`}.
- The shape is (1 x 7D).
use_peepholes(bool): Whether to enable diagonal/peephole connections, use_peepholes(bool): Whether to enable diagonal/peephole connections,
default `True`. default `True`.
is_reverse(bool): Whether to compute reversed LSTM, default `False`. is_reverse(bool): Whether to compute reversed LSTM, default `False`.
...@@ -336,6 +341,8 @@ def dynamic_lstm(input, ...@@ -336,6 +341,8 @@ def dynamic_lstm(input,
Choices = ["sigmoid", "tanh", "relu", "identity"], Choices = ["sigmoid", "tanh", "relu", "identity"],
default "tanh". default "tanh".
dtype(str): Data type. Choices = ["float32", "float64"], default "float32". dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns: Returns:
tuple: The hidden state, and cell state of LSTM. The shape of both \ tuple: The hidden state, and cell state of LSTM. The shape of both \
...@@ -350,6 +357,7 @@ def dynamic_lstm(input, ...@@ -350,6 +357,7 @@ def dynamic_lstm(input,
forward, _ = fluid.layers.dynamic_lstm( forward, _ = fluid.layers.dynamic_lstm(
input=forward_proj, size=hidden_dim * 4, use_peepholes=False) input=forward_proj, size=hidden_dim * 4, use_peepholes=False)
""" """
helper = LayerHelper('lstm', **locals()) helper = LayerHelper('lstm', **locals())
size = size / 4 size = size / 4
weight = helper.create_parameter( weight = helper.create_parameter(
...@@ -386,6 +394,192 @@ def dynamic_lstm(input, ...@@ -386,6 +394,192 @@ def dynamic_lstm(input,
return hidden, cell return hidden, cell
def dynamic_lstmp(input,
                  size,
                  proj_size,
                  param_attr=None,
                  bias_attr=None,
                  use_peepholes=True,
                  is_reverse=False,
                  gate_activation='sigmoid',
                  cell_activation='tanh',
                  candidate_activation='tanh',
                  proj_activation='tanh',
                  dtype='float32',
                  name=None):
    """
    **Dynamic LSTMP Layer**

    LSTMP (LSTM with recurrent projection) layer has a separate projection
    layer after the LSTM layer, projecting the original hidden state to a
    lower-dimensional one, which is proposed to reduce the number of total
    parameters and furthermore computational complexity for the LSTM,
    especially for the case that the size of output units is relatively
    large (https://research.google.com/pubs/archive/43905.pdf).

    The formula is as follows:

    .. math::

        i_t & = \\sigma(W_{ix}x_{t} + W_{ir}r_{t-1} + W_{ic}c_{t-1} + b_i)

        f_t & = \\sigma(W_{fx}x_{t} + W_{fr}r_{t-1} + W_{fc}c_{t-1} + b_f)

        \\tilde{c_t} & = act_g(W_{cx}x_t + W_{cr}r_{t-1} + b_c)

        o_t & = \\sigma(W_{ox}x_{t} + W_{or}r_{t-1} + W_{oc}c_t + b_o)

        c_t & = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t}

        h_t & = o_t \\odot act_h(c_t)

        r_t & = \\overline{act_h}(W_{rh}h_t)

    In the above formula:

    * :math:`W`: Denotes weight matrices (e.g. :math:`W_{xi}` is \
          the matrix of weights from the input gate to the input).
    * :math:`W_{ic}`, :math:`W_{fc}`, :math:`W_{oc}`: Diagonal weight \
          matrices for peephole connections. In our implementation, \
          we use vectors to represent these diagonal weight matrices.
    * :math:`b`: Denotes bias vectors (e.g. :math:`b_i` is the input gate \
          bias vector).
    * :math:`\\sigma`: The activation, such as logistic sigmoid function.
    * :math:`i, f, o` and :math:`c`: The input gate, forget gate, output \
          gate, and cell activation vectors, respectively, all of which have \
          the same size as the cell output activation vector :math:`h`.
    * :math:`h`: The hidden state.
    * :math:`r`: The recurrent projection of the hidden state.
    * :math:`\\tilde{c_t}`: The candidate hidden state, whose \
          computation is based on the current input and previous hidden state.
    * :math:`\\odot`: The element-wise product of the vectors.
    * :math:`act_g` and :math:`act_h`: The cell input and cell output \
          activation functions and `tanh` is usually used for them.
    * :math:`\\overline{act_h}`: The activation function for the projection \
          output, usually using `identity` or same as :math:`act_h`.

    Set `use_peepholes` to `False` to disable peephole connection. The formula
    is omitted here, please refer to the paper
    http://www.bioinf.jku.at/publications/older/2604.pdf for details.

    Note that these :math:`W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}`
    operations on the input :math:`x_{t}` are NOT included in this operator.
    Users can choose to use fully-connected layer before LSTMP layer.

    Args:
        input(Variable): The input of dynamic_lstmp layer, which supports
                         variable-time length input sequence. The underlying
                         tensor in this Variable is a matrix with shape
                         (T X 4D), where T is the total time steps in this
                         mini-batch, D is the hidden size.
        size(int): 4 * hidden size.
        proj_size(int): The size of projection output.
        param_attr(ParamAttr|None): The parameter attribute for the learnable
                               hidden-hidden weight and projection weight.

                               - Hidden-hidden weight = {:math:`W_{ch}, W_{ih}, \
                                                W_{fh}, W_{oh}`}.
                               - The shape of hidden-hidden weight is (P x 4D),
                                 where P is the projection size and D the hidden
                                 size.
                               - Projection weight = {:math:`W_{rh}`}.
                               - The shape of projection weight is (D x P).
        bias_attr(ParamAttr|None): The bias attribute for the learnable bias
                              weights, which contains two parts, input-hidden
                              bias weights and peephole connections weights if
                              setting `use_peepholes` to `True`.

                              1. `use_peepholes = False`
                                - Biases = {:math:`b_c, b_i, b_f, b_o`}.
                                - The shape is (1 x 4D).
                              2. `use_peepholes = True`
                                - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
                                                 W_{fc}, W_{oc}`}.
                                - The shape is (1 x 7D).
        use_peepholes(bool): Whether to enable diagonal/peephole connections,
                             default `True`.
        is_reverse(bool): Whether to compute reversed LSTM, default `False`.
        gate_activation(str): The activation for input gate, forget gate and
                              output gate. Choices = ["sigmoid", "tanh", "relu",
                              "identity"], default "sigmoid".
        cell_activation(str): The activation for cell output. Choices = ["sigmoid",
                              "tanh", "relu", "identity"], default "tanh".
        candidate_activation(str): The activation for candidate hidden state.
                              Choices = ["sigmoid", "tanh", "relu", "identity"],
                              default "tanh".
        proj_activation(str): The activation for projection output.
                              Choices = ["sigmoid", "tanh", "relu", "identity"],
                              default "tanh".
        dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
        name(str|None): A name for this layer(optional). If set None, the layer
                        will be named automatically.

    Returns:
        tuple: The projection of hidden state, and cell state of LSTMP. The \
               shape of projection is (T x P), for the cell state which is \
               (T x D), and both LoD is the same with the `input`.

    Examples:
        .. code-block:: python

            hidden_dim, proj_dim = 512, 256
            fc_out = fluid.layers.fc(input=input_seq, size=hidden_dim * 4,
                                     act=None, bias_attr=None)
            proj_out, _ = fluid.layers.dynamic_lstmp(input=fc_out,
                                                     size=hidden_dim * 4,
                                                     proj_size=proj_dim,
                                                     use_peepholes=False,
                                                     is_reverse=True,
                                                     cell_activation="tanh",
                                                     proj_activation="tanh")
    """

    # Must stay the first statement: **locals() forwards exactly the call
    # arguments (param_attr, bias_attr, name, ...) to the helper.
    helper = LayerHelper('lstmp', **locals())

    # `size` is 4 * hidden size. Floor division keeps the hidden size an
    # integer on Python 3 as well, so the shapes below never contain floats.
    size = size // 4
    weight = helper.create_parameter(
        attr=helper.param_attr, shape=[proj_size, 4 * size], dtype=dtype)
    # NOTE(review): the projection weight is created from the same
    # helper.param_attr as the hidden-hidden weight; confirm this is intended
    # when a named ParamAttr is supplied.
    proj_weight = helper.create_parameter(
        attr=helper.param_attr, shape=[size, proj_size], dtype=dtype)

    # Peephole connections add three more D-sized vectors to the bias.
    bias_size = [1, 7 * size]
    if not use_peepholes:
        bias_size[1] = 4 * size
    bias = helper.create_parameter(
        attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)

    projection = helper.create_tmp_variable(dtype)
    cell = helper.create_tmp_variable(dtype)
    # The remaining outputs are wired to the op but not returned to the caller.
    ordered_proj0 = helper.create_tmp_variable(dtype)
    batch_hidden = helper.create_tmp_variable(dtype)
    batch_gate = helper.create_tmp_variable(dtype)
    batch_cell_pre_act = helper.create_tmp_variable(dtype)

    helper.append_op(
        type='lstmp',
        inputs={
            'Input': input,
            'Weight': weight,
            'ProjWeight': proj_weight,
            'Bias': bias
        },
        outputs={
            'Projection': projection,
            'Cell': cell,
            'OrderedP0': ordered_proj0,
            'BatchHidden': batch_hidden,
            'BatchGate': batch_gate,
            'BatchCellPreAct': batch_cell_pre_act
        },
        attrs={
            'use_peepholes': use_peepholes,
            'is_reverse': is_reverse,
            'gate_activation': gate_activation,
            'cell_activation': cell_activation,
            'candidate_activation': candidate_activation,
            'proj_activation': proj_activation
        })
    return projection, cell
def dynamic_gru(input, def dynamic_gru(input,
size, size,
param_attr=None, param_attr=None,
...@@ -530,8 +724,10 @@ def gru_unit(input, ...@@ -530,8 +724,10 @@ def gru_unit(input,
size (integer): The input dimension value. size (integer): The input dimension value.
weight (ParamAttr): The weight parameters for gru unit. Default: None weight (ParamAttr): The weight parameters for gru unit. Default: None
bias (ParamAttr): The bias parameters for gru unit. Default: None bias (ParamAttr): The bias parameters for gru unit. Default: None
activation (string): The activation type for cell (actNode). Default: 'tanh' activation (string): The activation type for cell (actNode).
gate_activation (string): The activation type for gates (actGate). Default: 'sigmoid' Default: 'tanh'
gate_activation (string): The activation type for gates (actGate).
Default: 'sigmoid'
Returns: Returns:
tuple: The hidden value, reset-hidden value and gate values. tuple: The hidden value, reset-hidden value and gate values.
...@@ -651,7 +847,35 @@ def cos_sim(X, Y, **kwargs): ...@@ -651,7 +847,35 @@ def cos_sim(X, Y, **kwargs):
return out return out
def dropout(x, dropout_prob, is_test=False, seed=0, **kwargs): def dropout(x, dropout_prob, is_test=False, seed=None, **kwargs):
"""
Computes dropout.
Drop or keep each element of `x` independently. Dropout is a regularization
technique for reducing overfitting by preventing neuron co-adaptation during
training. The dropout operator randomly sets (according to the given dropout
probability) the outputs of some units to zero, while the others remain
unchanged.
Args:
x(variable): The input tensor.
dropout_prob(float): Probability of setting units to zero.
is_test(bool): A flag indicating whether it is in test phase or not.
seed(int): A Python integer used to create random seeds. If this
parameter is set to None, a random seed is used.
NOTE: If an integer seed is given, always the same output
units will be dropped. DO NOT use a fixed seed in training.
Returns:
Variable: A tensor variable.
Examples:
.. code-block:: python
x = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
dropped = fluid.layers.dropout(x=x, dropout_prob=0.5)
"""
helper = LayerHelper('dropout', **kwargs) helper = LayerHelper('dropout', **kwargs)
out = helper.create_tmp_variable(dtype=x.dtype) out = helper.create_tmp_variable(dtype=x.dtype)
mask = helper.create_tmp_variable(dtype=x.dtype, stop_gradient=True) mask = helper.create_tmp_variable(dtype=x.dtype, stop_gradient=True)
...@@ -660,9 +884,12 @@ def dropout(x, dropout_prob, is_test=False, seed=0, **kwargs): ...@@ -660,9 +884,12 @@ def dropout(x, dropout_prob, is_test=False, seed=0, **kwargs):
inputs={'X': [x]}, inputs={'X': [x]},
outputs={'Out': [out], outputs={'Out': [out],
'Mask': [mask]}, 'Mask': [mask]},
attrs={'dropout_prob': dropout_prob, attrs={
'dropout_prob': dropout_prob,
'is_test': is_test, 'is_test': is_test,
'seed': seed}) 'fix_seed': seed is not None,
'seed': seed if seed is not None else 0
})
return out return out
...@@ -670,8 +897,9 @@ def cross_entropy(input, label, **kwargs): ...@@ -670,8 +897,9 @@ def cross_entropy(input, label, **kwargs):
""" """
**Cross Entropy Layer** **Cross Entropy Layer**
This layer computes the cross entropy between `input` and `label`. It supports This layer computes the cross entropy between `input` and `label`. It
both standard cross-entropy and soft-label cross-entropy loss computation. supports both standard cross-entropy and soft-label cross-entropy loss
computation.
1) One-hot cross-entropy: 1) One-hot cross-entropy:
`soft_label = False`, `Label[i, 0]` indicates the class index for sample i: `soft_label = False`, `Label[i, 0]` indicates the class index for sample i:
...@@ -698,23 +926,28 @@ def cross_entropy(input, label, **kwargs): ...@@ -698,23 +926,28 @@ def cross_entropy(input, label, **kwargs):
Args: Args:
input (Variable|list): a 2-D tensor with shape [N x D], where N is the input (Variable|list): a 2-D tensor with shape [N x D], where N is the
batch size and D is the number of classes. This input is a probability batch size and D is the number of classes. This
computed by the previous operator, which is almost always the result input is a probability computed by the previous
of a softmax operator. operator, which is almost always the result of
a softmax operator.
label (Variable|list): the ground truth which is a 2-D tensor. When label (Variable|list): the ground truth which is a 2-D tensor. When
`soft_label` is set to `False`, `label` is a tensor<int64> with shape `soft_label` is set to `False`, `label` is a
[N x 1]. When `soft_label` is set to `True`, `label` is a tensor<int64> with shape [N x 1]. When
`soft_label` is set to `True`, `label` is a
tensor<float/double> with shape [N x D]. tensor<float/double> with shape [N x D].
soft_label (bool, via `**kwargs`): a flag indicating whether to interpretate soft_label (bool, via `**kwargs`): a flag indicating whether to
the given labels as soft labels, default `False`. interpretate the given labels as soft
labels, default `False`.
Returns: Returns:
A 2-D tensor with shape [N x 1], the cross entropy loss. A 2-D tensor with shape [N x 1], the cross entropy loss.
Raises: Raises:
`ValueError`: 1) the 1st dimension of `input` and `label` are not equal; 2) when \ `ValueError`: 1) the 1st dimension of `input` and `label` are not equal.
`soft_label == True`, and the 2nd dimension of `input` and `label` are not \ 2) when `soft_label == True`, and the 2nd dimension of
equal; 3) when `soft_label == False`, and the 2nd dimension of `label` is not 1. `input` and `label` are not equal.
3) when `soft_label == False`, and the 2nd dimension of
`label` is not 1.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -737,7 +970,9 @@ def square_error_cost(input, label, **kwargs): ...@@ -737,7 +970,9 @@ def square_error_cost(input, label, **kwargs):
""" """
**Square error cost layer** **Square error cost layer**
This layer accepts input predictions and target label and returns the squared error cost. This layer accepts input predictions and target label and returns the
squared error cost.
For predictions, :math:`X`, and target labels, :math:`Y`, the equation is: For predictions, :math:`X`, and target labels, :math:`Y`, the equation is:
.. math:: .. math::
...@@ -755,8 +990,8 @@ def square_error_cost(input, label, **kwargs): ...@@ -755,8 +990,8 @@ def square_error_cost(input, label, **kwargs):
label(Variable): Label tensor, has target labels. label(Variable): Label tensor, has target labels.
Returns: Returns:
Variable: The tensor variable storing the element-wise squared error difference \ Variable: The tensor variable storing the element-wise squared error
of input and label. difference of input and label.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -852,7 +1087,8 @@ def chunk_eval(input, ...@@ -852,7 +1087,8 @@ def chunk_eval(input,
"chunk_scheme": chunk_scheme, "chunk_scheme": chunk_scheme,
"excluded_chunk_types": excluded_chunk_types or [] "excluded_chunk_types": excluded_chunk_types or []
}) })
return precision, recall, f1_score, num_infer_chunks, num_label_chunks, num_correct_chunks return (precision, recall, f1_score, num_infer_chunks, num_label_chunks,
num_correct_chunks)
def sequence_conv(input, def sequence_conv(input,
...@@ -910,13 +1146,14 @@ def conv2d(input, ...@@ -910,13 +1146,14 @@ def conv2d(input,
**Convlution2D Layer** **Convlution2D Layer**
The convolution2D layer calculates the output based on the input, filter The convolution2D layer calculates the output based on the input, filter
and strides, paddings, dilations, groups parameters. Input(Input) and Output(Output) and strides, paddings, dilations, groups parameters. Input(Input) and
are in NCHW format. Where N is batch size, C is the number of channels, H is the height Output(Output) are in NCHW format. Where N is batch size, C is the number of
of the feature, and W is the width of the feature. channels, H is the height of the feature, and W is the width of the feature.
The details of convolution layer, please refer UFLDL's `convolution, The details of convolution layer, please refer UFLDL's `convolution,
<http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_ . <http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_ .
If bias attribution and activation type are provided, bias is added to the output of the convolution, If bias attribution and activation type are provided, bias is added to the
and the corresponding activation function is applied to the final result. output of the convolution, and the corresponding activation function is
applied to the final result.
For each input :math:`X`, the equation is: For each input :math:`X`, the equation is:
...@@ -931,7 +1168,8 @@ def conv2d(input, ...@@ -931,7 +1168,8 @@ def conv2d(input,
* :math:`\\ast`: Convolution operation. * :math:`\\ast`: Convolution operation.
* :math:`b`: Bias value, a 2-D tensor with shape [M, 1]. * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
* :math:`\\sigma`: Activation function. * :math:`\\sigma`: Activation function.
* :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be
different.
Example: Example:
...@@ -976,17 +1214,20 @@ def conv2d(input, ...@@ -976,17 +1214,20 @@ def conv2d(input,
act(str): Activation type. Default: None act(str): Activation type. Default: None
Returns: Returns:
Variable: The tensor variable storing the convolution and \ Variable: The tensor variable storing the convolution and
non-linearity activation result. non-linearity activation result.
Raises: Raises:
ValueError: If the shapes of input, filter_size, stride, padding and groups mismatch. ValueError: If the shapes of input, filter_size, stride, padding and
groups mismatch.
Examples: Examples:
.. code-block:: python .. code-block:: python
data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32') data = fluid.layers.data(
conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu") name='data', shape=[3, 32, 32], dtype='float32')
conv2d = fluid.layers.conv2d(
input=data, num_filters=2, filter_size=3, act="relu")
""" """
if stride is None: if stride is None:
stride = [1, 1] stride = [1, 1]
...@@ -1349,7 +1590,8 @@ def conv2d_transpose(input, ...@@ -1349,7 +1590,8 @@ def conv2d_transpose(input,
H is the height of the feature, and W is the width of the feature. H is the height of the feature, and W is the width of the feature.
Parameters(dilations, strides, paddings) are two elements. These two elements Parameters(dilations, strides, paddings) are two elements. These two elements
represent height and width, respectively. The details of convolution transpose represent height and width, respectively. The details of convolution transpose
layer, please refer to the following explanation and references `therein <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_. layer, please refer to the following explanation and references
`therein <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_.
For each input :math:`X`, the equation is: For each input :math:`X`, the equation is:
...@@ -1362,7 +1604,8 @@ def conv2d_transpose(input, ...@@ -1362,7 +1604,8 @@ def conv2d_transpose(input,
* :math:`X`: Input value, a tensor with NCHW format. * :math:`X`: Input value, a tensor with NCHW format.
* :math:`W`: Filter value, a tensor with MCHW format. * :math:`W`: Filter value, a tensor with MCHW format.
* :math:`\\ast` : Convolution transpose operation. * :math:`\\ast` : Convolution transpose operation.
* :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be
different.
Example: Example:
...@@ -1403,7 +1646,8 @@ def conv2d_transpose(input, ...@@ -1403,7 +1646,8 @@ def conv2d_transpose(input,
dilation(int|tuple): The dilation size. If dilation is a tuple, it must dilation(int|tuple): The dilation size. If dilation is a tuple, it must
contain two integers, (dilation_H, dilation_W). Otherwise, the contain two integers, (dilation_H, dilation_W). Otherwise, the
dilation_H = dilation_W = dilation. Default: dilation = 1. dilation_H = dilation_W = dilation. Default: dilation = 1.
param_attr(ParamAttr): The parameters to the Conv2d_transpose Layer. Default: None param_attr(ParamAttr): The parameters to the Conv2d_transpose Layer.
Default: None
use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
library is installed. Default: True library is installed. Default: True
name(str|None): A name for this layer(optional). If set None, the layer name(str|None): A name for this layer(optional). If set None, the layer
...@@ -1413,13 +1657,16 @@ def conv2d_transpose(input, ...@@ -1413,13 +1657,16 @@ def conv2d_transpose(input,
Variable: The tensor variable storing the convolution transpose result. Variable: The tensor variable storing the convolution transpose result.
Raises: Raises:
ValueError: If the shapes of input, filter_size, stride, padding and groups mismatch. ValueError: If the shapes of input, filter_size, stride, padding and
groups mismatch.
Examples: Examples:
.. code-block:: python .. code-block:: python
data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32') data = fluid.layers.data(
conv2d_transpose = fluid.layers.conv2d_transpose(input=data, num_filters=2, filter_size=3) name='data', shape=[3, 32, 32], dtype='float32')
conv2d_transpose = fluid.layers.conv2d_transpose(
input=data, num_filters=2, filter_size=3)
""" """
helper = LayerHelper("conv2d_transpose", **locals()) helper = LayerHelper("conv2d_transpose", **locals())
if not isinstance(input, Variable): if not isinstance(input, Variable):
...@@ -1551,6 +1798,38 @@ def sequence_expand(x, y, name=None): ...@@ -1551,6 +1798,38 @@ def sequence_expand(x, y, name=None):
return tmp return tmp
def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0):
    '''
    Append a ``beam_search`` operator to the current program.

    Args:
        pre_ids: Forwarded to the operator as input ``pre_ids``.
        ids: Forwarded to the operator as input ``ids``; its dtype is used
            for the ``selected_ids`` output.
        scores: Forwarded to the operator as input ``scores``; its dtype is
            used for the ``selected_scores`` output.
        beam_size: Forwarded to the operator as attribute ``beam_size``.
        end_id: Forwarded to the operator as attribute ``end_id``.
        level: Forwarded to the operator as attribute ``level``. Default: 0.

    Returns:
        tuple: ``(selected_ids, selected_scores)``, the two temporary
        variables that receive the operator's outputs.
    '''
    # NOTE: ``locals()`` must be captured before any other local is bound so
    # that the helper only sees the function arguments.
    helper = LayerHelper('beam_search', **locals())
    score_dtype, id_dtype = scores.dtype, ids.dtype

    # The scores variable is created first, then the ids variable.
    selected_scores = helper.create_tmp_variable(dtype=score_dtype)
    selected_ids = helper.create_tmp_variable(dtype=id_dtype)

    op_inputs = {
        'pre_ids': pre_ids,
        'ids': ids,
        'scores': scores,
    }
    op_outputs = {
        'selected_ids': selected_ids,
        'selected_scores': selected_scores,
    }
    op_attrs = {
        # TODO(ChunweiYan) to assure other value support
        'level': level,
        'beam_size': beam_size,
        'end_id': end_id,
    }
    helper.append_op(
        type='beam_search',
        inputs=op_inputs,
        outputs=op_outputs,
        attrs=op_attrs)

    return selected_ids, selected_scores
def lstm_unit(x_t, def lstm_unit(x_t,
hidden_t_prev, hidden_t_prev,
cell_t_prev, cell_t_prev,
...@@ -1611,9 +1890,9 @@ def lstm_unit(x_t, ...@@ -1611,9 +1890,9 @@ def lstm_unit(x_t,
tuple: The hidden value and cell value of lstm unit. tuple: The hidden value and cell value of lstm unit.
Raises: Raises:
ValueError: The ranks of **x_t**, **hidden_t_prev** and **cell_t_prev**\ ValueError: The ranks of **x_t**, **hidden_t_prev** and **cell_t_prev**
not be 2 or the 1st dimensions of **x_t**, **hidden_t_prev** \ not be 2 or the 1st dimensions of **x_t**, **hidden_t_prev**
and **cell_t_prev** not be the same or the 2nd dimensions of \ and **cell_t_prev** not be the same or the 2nd dimensions of
**hidden_t_prev** and **cell_t_prev** not be the same. **hidden_t_prev** and **cell_t_prev** not be the same.
Examples: Examples:
...@@ -1946,7 +2225,7 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None): ...@@ -1946,7 +2225,7 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
data = fluid.layers.data(name="data", data = fluid.layers.data(name="data",
shape=(3, 17, 13), shape=(3, 17, 13),
dtype="float32") dtype="float32")
fc = fluid.layers.l2_normalize(x=data, axis=1) normed = fluid.layers.l2_normalize(x=data, axis=1)
""" """
if len(x.shape) == 1: axis = 0 if len(x.shape) == 1: axis = 0
...@@ -1998,9 +2277,10 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None): ...@@ -1998,9 +2277,10 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
def matmul(x, y, transpose_x=False, transpose_y=False, name=None): def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
""" """
Applies matrix multiplication to two tensors. Currently, the input Applies matrix multiplication to two tensors.
tensors' rank can be any, but when the rank of anyone inputs is
bigger than 3, this two inputs' rank should be equal. Currently, the input tensors' rank can be any, but when the rank of any
inputs is bigger than 3, this two inputs' rank should be equal.
The actual behavior depends on the shapes of :math:`x`, :math:`y` and the The actual behavior depends on the shapes of :math:`x`, :math:`y` and the
flag values of :attr:`transpose_x`, :attr:`transpose_y`. Specifically: flag values of :attr:`transpose_x`, :attr:`transpose_y`. Specifically:
...@@ -2041,25 +2321,56 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None): ...@@ -2041,25 +2321,56 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
# Examples to clarify shapes of the inputs and output # Examples to clarify shapes of the inputs and output
# x: [B, ..., M, K], y: [B, ..., K, N] # x: [B, ..., M, K], y: [B, ..., K, N]
fluid.layers.matmul(x, y) # out: [B, ..., M, N] fluid.layers.matmul(x, y) # out: [B, ..., M, N]
# x: [B, M, K], y: [B, K, N] # x: [B, M, K], y: [B, K, N]
fluid.layers.matmul(x, y) # out: [B, M, N] fluid.layers.matmul(x, y) # out: [B, M, N]
# x: [B, M, K], y: [K, N] # x: [B, M, K], y: [K, N]
fluid.layers.matmul(x, y) # out: [B, M, N] fluid.layers.matmul(x, y) # out: [B, M, N]
# x: [B, M, K], y: [K]
fluid.layers.matmul(x, y) # out: [B, M]
# x: [M, K], y: [K, N] # x: [M, K], y: [K, N]
fluid.layers.matmul(x, y) # out: [M, N] fluid.layers.matmul(x, y) # out: [M, N]
# x: [B, M, K], y: [K]
fluid.layers.matmul(x, y) # out: [B, M]
# x: [K], y: [K] # x: [K], y: [K]
fluid.layers.matmul(x, y) # out: [1] fluid.layers.matmul(x, y) # out: [1]
# x: [M], y: [N]
# x: [M], y: [N]
fluid.layers.matmul(x, y, True, True) # out: [M, N] fluid.layers.matmul(x, y, True, True) # out: [M, N]
""" """
def __check_input(x, y):
    """Check that the shapes of ``x`` and ``y`` are compatible for matmul.

    ``transpose_x`` and ``transpose_y`` are read from the enclosing
    ``matmul`` call.

    Raises:
        ValueError: If ``y`` has a higher rank than ``x``, if the inner
            dimensions (after the optional transposes) do not match, or if
            the leading dimensions differ when ``y`` has rank above 2.
    """
    if len(x.shape) < len(y.shape):
        raise ValueError(
            "Invalid inputs for matmul. "
            "x's rank should be always greater than or equal to y'rank.")

    lhs_dims = list(x.shape)
    rhs_dims = list(y.shape)

    # A 1-D operand is treated as a matrix: x becomes a row vector and y a
    # column vector.
    if len(lhs_dims) == 1:
        lhs_dims = [1] + lhs_dims
    if len(rhs_dims) == 1:
        rhs_dims = rhs_dims + [1]

    # check the inner 2 dimensions
    if transpose_x:
        lhs_dims[-2:] = [lhs_dims[-1], lhs_dims[-2]]
    if transpose_y:
        rhs_dims[-2:] = [rhs_dims[-1], rhs_dims[-2]]
    if lhs_dims[-1] != rhs_dims[-2]:
        raise ValueError("Invalid inputs for matmul.")

    # When y carries leading dimensions, they are compared position by
    # position with the leading dimensions of x.
    if len(rhs_dims) > 2:
        if any(lhs_dim != rhs_dims[pos]
               for pos, lhs_dim in enumerate(lhs_dims[:-2])):
            raise ValueError("Invalid inputs for matmul.")
__check_input(x, y)
helper = LayerHelper('matmul', **locals()) helper = LayerHelper('matmul', **locals())
assert max(len(x.shape), len(y.shape)) <= 3 or len(x.shape) == len( out = helper.create_tmp_variable(dtype=x.dtype)
y.
shape), 'Inputs\' rank should be equal or their rank should be less 4.'
out = helper.create_tmp_variable(dtype=helper.input_dtype())
helper.append_op( helper.append_op(
type='matmul', type='matmul',
inputs={'X': x, inputs={'X': x,
...@@ -2076,13 +2387,26 @@ def edit_distance(input, ...@@ -2076,13 +2387,26 @@ def edit_distance(input,
ignored_tokens=None, ignored_tokens=None,
name=None): name=None):
""" """
EditDistance operator computes the edit distances between a batch of hypothesis strings and their references. Edit distance, also called Levenshtein distance, measures how dissimilar two strings are by counting the minimum number of operations to transform one string into anthor. Here the operations include insertion, deletion, and substitution. For example, given hypothesis string A = "kitten" and reference B = "sitting", the edit distance is 3 for A will be transformed into B at least after two substitutions and one insertion: EditDistance operator computes the edit distances between a batch of
hypothesis strings and their references. Edit distance, also called
Levenshtein distance, measures how dissimilar two strings are by counting
the minimum number of operations to transform one string into another.
Here the operations include insertion, deletion, and substitution.
For example, given hypothesis string A = "kitten" and reference
B = "sitting", the edit distance is 3 for A will be transformed into B
at least after two substitutions and one insertion:
"kitten" -> "sitten" -> "sittin" -> "sitting" "kitten" -> "sitten" -> "sittin" -> "sitting"
Input(Hyps) is a LoDTensor consisting of all the hypothesis strings with the total number denoted by `batch_size`, and the separation is specified by the LoD information. And the `batch_size` reference strings are arranged in order in the same way in the LoDTensor Input(Refs). Input(Hyps) is a LoDTensor consisting of all the hypothesis strings with
the total number denoted by `batch_size`, and the separation is specified
by the LoD information. And the `batch_size` reference strings are arranged
in order in the same way in the LoDTensor Input(Refs).
Output(Out) contains the `batch_size` results and each stands for the edit stance for a pair of strings respectively. If Attr(normalized) is true, the edit distance will be divided by the length of reference string. Output(Out) contains the `batch_size` results and each stands for the edit
distance for a pair of strings respectively. If Attr(normalized) is true,
the edit distance will be divided by the length of reference string.
Args: Args:
...@@ -2090,9 +2414,11 @@ def edit_distance(input, ...@@ -2090,9 +2414,11 @@ def edit_distance(input,
label(Variable): The indices for reference strings. label(Variable): The indices for reference strings.
normalized(bool): Indicated whether to normalize the edit distance by the length of reference string. normalized(bool): Indicated whether to normalize the edit distance by
the length of reference string.
ignored_tokens(list of int): Tokens that should be removed before calculating edit distance. ignored_tokens(list of int): Tokens that should be removed before
calculating edit distance.
Returns: Returns:
Variable: sequence-to-sequence edit distance in shape [batch_size, 1]. Variable: sequence-to-sequence edit distance in shape [batch_size, 1].
...@@ -2143,8 +2469,10 @@ def edit_distance(input, ...@@ -2143,8 +2469,10 @@ def edit_distance(input,
def ctc_greedy_decoder(input, blank, name=None): def ctc_greedy_decoder(input, blank, name=None):
""" """
This op is used to decode sequences by greedy policy by below steps: This op is used to decode sequences by greedy policy by below steps:
1. Get the indexes of max value for each row in input. a.k.a. numpy.argmax(input, axis=0). 1. Get the indexes of max value for each row in input. a.k.a.
2. For each sequence in result of step1, merge repeated tokens between two blanks and delete all blanks. numpy.argmax(input, axis=0).
2. For each sequence in result of step1, merge repeated tokens between two
blanks and delete all blanks.
A simple example as below: A simple example as below:
...@@ -2174,9 +2502,16 @@ def ctc_greedy_decoder(input, blank, name=None): ...@@ -2174,9 +2502,16 @@ def ctc_greedy_decoder(input, blank, name=None):
Args: Args:
input(Variable): (LoDTensor<float>), the probabilities of variable-length sequences, which is a 2-D Tensor with LoD information. It's shape is [Lp, num_classes + 1], where Lp is the sum of all input sequences' length and num_classes is the true number of classes. (not including the blank label). input(Variable): (LoDTensor<float>), the probabilities of
variable-length sequences, which is a 2-D Tensor with
LoD information. Its shape is [Lp, num_classes + 1],
where Lp is the sum of all input sequences' length and
num_classes is the true number of classes. (not
including the blank label).
blank(int): the blank label index of Connectionist Temporal Classification (CTC) loss, which is in thehalf-opened interval [0, num_classes + 1). blank(int): the blank label index of Connectionist Temporal
Classification (CTC) loss, which is in the half-opened
interval [0, num_classes + 1).
Returns: Returns:
Variable: CTC greedy decode result. Variable: CTC greedy decode result.
...@@ -2244,8 +2579,10 @@ def warpctc(input, label, blank=0, norm_by_times=False, **kwargs): ...@@ -2244,8 +2579,10 @@ def warpctc(input, label, blank=0, norm_by_times=False, **kwargs):
Examples: Examples:
.. code-block:: python .. code-block:: python
y = layers.data(name='y', shape=[11, 8], dtype='float32', lod_level=1) y = layers.data(
y_predict = layers.data(name='y_predict', shape=[11, 1], dtype='float32') name='y', shape=[11, 8], dtype='float32', lod_level=1)
y_predict = layers.data(
name='y_predict', shape=[11, 1], dtype='float32')
cost = layers.warpctc(input=y_predict, label=y) cost = layers.warpctc(input=y_predict, label=y)
""" """
...@@ -2399,6 +2736,12 @@ def transpose(x, perm, name=None): ...@@ -2399,6 +2736,12 @@ def transpose(x, perm, name=None):
raise ValueError( raise ValueError(
"Input(perm) is the permutation of dimensions of Input(input). " "Input(perm) is the permutation of dimensions of Input(input). "
"It's length shoud be equal to Input(input)'s rank.") "It's length shoud be equal to Input(input)'s rank.")
for idx, dim in enumerate(perm):
if dim >= len(x.shape):
raise ValueError(
"Each element in perm should be less than x's rank. "
"%d-th element in perm is %d which accesses x's rank %d." %
(idx, perm[idx], len(x.shape)))
helper = LayerHelper('transpose', **locals()) helper = LayerHelper('transpose', **locals())
out = helper.create_tmp_variable(x.dtype) out = helper.create_tmp_variable(x.dtype)
...@@ -2507,7 +2850,8 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None): ...@@ -2507,7 +2850,8 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
.. code-block:: python .. code-block:: python
output = fluid.layers.im2sequence(input=layer, stride=[1, 1], filter_size=[2, 2]) output = fluid.layers.im2sequence(
input=layer, stride=[1, 1], filter_size=[2, 2])
""" """
...@@ -2533,3 +2877,108 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None): ...@@ -2533,3 +2877,108 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
'paddings': padding, 'paddings': padding,
}) })
return out return out
def row_conv(input, future_context_size, param_attr=None, act=None):
    r"""Row Conv Operator. This layer will apply lookahead convolution to
    **input**. The input variable should be a 2D LoDTensor with shape [T, D].
    Parameters with shape [future_context_size + 1, D] will be created. The
    math equation of row convolution is as follows:

    .. math::
        Out_{i} = \sum_{j = i} ^ {i + \tau} X_{j} \odot W_{i - j}

    In the above equation:

    * :math:`Out_{i}`: The i-th row of output variable with shape [1, D].
    * :math:`\tau`: Future context size.
    * :math:`X_{j}`: The j-th row of input variable with shape [1, D].
    * :math:`W_{i-j}`: The (i-j)-th row of parameters with shape [1, D].

    More details about row_conv please refer to the paper
    (http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf)
    and the design document
    (https://github.com/PaddlePaddle/Paddle/issues/2228#issuecomment-303903645).

    Args:
        input (Variable): Input variable, a 2D LoDTensor with shape [T, D].
        future_context_size (int): Future context size. Please note, the shape
            of convolution kernel is [future_context_size + 1, D].
        param_attr (ParamAttr): Attributes of parameters, including
            name, initializer etc.
        act (str): Non-linear activation to be applied to output variable.

    Returns:
        Variable: The output tensor with same shape as input tensor.

    Examples:
        .. code-block:: python

            x = fluid.layers.data(name='x', shape=[16],
                                  dtype='float32', lod_level=1)
            out = fluid.layers.row_conv(input=x, future_context_size=2)
    """
    # ``param_attr`` and ``act`` reach the helper through ``locals()``, so no
    # other local may be bound before this line.
    helper = LayerHelper('row_conv', **locals())
    data_type = helper.input_dtype()

    # One kernel row for the current step plus one per look-ahead step; the
    # width is taken from the second dimension of the input.
    kernel = helper.create_parameter(
        shape=[future_context_size + 1, input.shape[1]],
        attr=helper.param_attr,
        dtype=data_type)

    conv_out = helper.create_tmp_variable(data_type)
    helper.append_op(
        type='row_conv',
        inputs={'X': [input],
                'Filter': [kernel]},
        outputs={'Out': [conv_out]})
    return helper.append_activation(conv_out)
def multiplex(inputs, index):
    """
    **Multiplex Layer**

    Referring to the given index variable, this layer selects rows from the
    input variables to construct a multiplex variable. Assuming that there are
    :math:`m` input variables and :math:`I_i` represents the i-th input
    variable and :math:`i` is in [0, :math:`m`). All input variables are
    tensors with same shape [:math:`d_0`, :math:`d_1`, ..., :math:`d_R`].
    Please note that rank of the input tensor should be at least 2. Each input
    variable will be treated as a 2-D matrix with shape [:math:`M`, :math:`N`]
    where :math:`M` for :math:`d_0` and :math:`N` for :math:`d_1` * :math:`d_2`
    * ... * :math:`d_R`. Let :math:`I_i[j]` be the j-th row of the i-th input
    variable. The given index variable should be a 2-D tensor with shape
    [:math:`M`, 1]. Let `ID[i]` be the i-th index value of the index variable.
    Then the output variable will be a tensor with shape [:math:`d_0`,
    :math:`d_1`, ..., :math:`d_R`]. If we treat the output tensor as a 2-D
    matrix with shape [:math:`M`, :math:`N`] and let :math:`O[i]` be the i-th
    row of the matrix, then `O[i]` is equal to :math:`I_{ID[i]}[i]`.

    Args:
        inputs (list): A list of variables to gather from. All variables have
            the same shape and the rank is at least 2.
        index (Variable): Tensor<int32>, index variable which is a 2-D tensor
            with shape [M, 1] where M is the batch size.

    Returns:
        Variable: Multiplex variable gathered from input variables.

    Raises:
        ValueError: If ``inputs`` is not a list or contains fewer than 2
            elements.

    Examples:
        .. code-block:: python

            x1 = fluid.layers.data(name='x1', shape=[4], dtype='float32')
            x2 = fluid.layers.data(name='x2', shape=[4], dtype='float32')
            index = fluid.layers.data(name='index', shape=[1], dtype='int32')
            out = fluid.layers.multiplex(inputs=[x1, x2], index=index)
    """
    # Reject bad input before any helper or variable is created. The two
    # conditions are joined with ``or``: either one alone makes the input
    # invalid, and ``len`` is only evaluated once ``inputs`` is known to be a
    # list. No local is bound here, so ``locals()`` below still holds only the
    # function arguments.
    if not isinstance(inputs, list) or len(inputs) < 2:
        raise ValueError("inputs should be a list object and contains at least "
                         "2 elements.")

    helper = LayerHelper('multiplex', **locals())

    # The output takes its dtype from the first input.
    out = helper.create_tmp_variable(inputs[0].dtype)
    helper.append_op(
        type='multiplex',
        inputs={'X': inputs,
                'Ids': index},
        outputs={'Out': [out]})
    return out
...@@ -56,6 +56,7 @@ __all__ = [ ...@@ -56,6 +56,7 @@ __all__ = [
'elementwise_mul', 'elementwise_mul',
'elementwise_max', 'elementwise_max',
'elementwise_min', 'elementwise_min',
'elementwise_pow',
'clip', 'clip',
'clip_by_norm', 'clip_by_norm',
'sequence_softmax', 'sequence_softmax',
......
...@@ -16,12 +16,14 @@ from ..layer_helper import LayerHelper ...@@ -16,12 +16,14 @@ from ..layer_helper import LayerHelper
from ..param_attr import ParamAttr from ..param_attr import ParamAttr
from ..framework import convert_np_dtype_to_dtype_ from ..framework import convert_np_dtype_to_dtype_
from ..framework import Variable from ..framework import Variable
from ..initializer import Constant
from ..core import DataType from ..core import DataType
import numpy import numpy
__all__ = [ __all__ = [
'create_tensor', 'create_tensor',
'create_parameter', 'create_parameter',
'create_global_var',
'cast', 'cast',
'concat', 'concat',
'sums', 'sums',
...@@ -58,13 +60,22 @@ def create_parameter(shape, ...@@ -58,13 +60,22 @@ def create_parameter(shape,
Returns: Returns:
Parameter: the created parameter Parameter: the created parameter
""" """
helper = LayerHelper("create_parameter") helper = LayerHelper("create_parameter", **locals())
if attr is None: if attr is None:
attr = ParamAttr() attr = ParamAttr()
return helper.create_parameter(attr, shape, dtype, is_bias, return helper.create_parameter(attr, shape, dtype, is_bias,
default_initializer) default_initializer)
def create_global_var(shape, value, dtype, persistable=False, name=None):
    """
    Create a global variable initialized to a constant.

    Args:
        shape: Shape passed to the created global variable.
        value: Initial value; it is converted with ``float()`` and used by a
            ``Constant`` initializer.
        dtype: Data type passed to the created global variable.
        persistable (bool): Passed through to the created variable.
            Default: False.
        name (str|None): Name passed through to the created variable.
            Default: None.

    Returns:
        The created global variable.
    """
    helper = LayerHelper("global_var", **locals())

    global_var = helper.create_global_variable(
        dtype=dtype, shape=shape, persistable=persistable, name=name)

    # The initializer is built only after the variable exists.
    constant_init = Constant(value=float(value))
    helper.set_variable_initializer(global_var, initializer=constant_init)
    return global_var
def cast(x, dtype): def cast(x, dtype):
""" """
This function takes in the input with input_dtype This function takes in the input with input_dtype
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import layers
from framework import Variable
__all__ = ['exponential_decay', 'natural_exp_decay', 'inverse_time_decay']
"""
When training a model, it is often useful to decay the
learning rate as training progresses; this is called
learning rate decay. There are many strategies for doing
this, and this module provides some classical ones.
Users can also implement their own learning rate decay
strategies by following the examples in this module.
"""
def exponential_decay(learning_rate,
                      global_step,
                      decay_steps,
                      decay_rate,
                      staircase=False):
    """Applies exponential decay to the learning rate.

    ```python
    if staircase:
        decayed_learning_rate = learning_rate *
            decay_rate ^ floor(global_step / decay_steps)
    else:
        decayed_learning_rate = learning_rate *
            decay_rate ^ (global_step / decay_steps)
    ```

    Args:
        learning_rate: A scalar float32 value or a Variable. This
          will be the initial learning rate during training
        global_step: A Variable that record the training step.
        decay_steps: A Python `int32` number.
        decay_rate: A Python `float` number.
        staircase: Boolean. If set true, decay the learning rate every
          decay_steps.

    Returns:
        The decayed learning rate

    Raises:
        ValueError: If global_step is not a Variable.
    """
    if not isinstance(global_step, Variable):
        raise ValueError("global_step is required for exponential_decay.")

    # update learning_rate
    step_ratio = global_step / decay_steps
    exponent = layers.floor(x=step_ratio) if staircase else step_ratio
    decay_factor = decay_rate**exponent
    return learning_rate * decay_factor
def natural_exp_decay(learning_rate,
                      global_step,
                      decay_steps,
                      decay_rate,
                      staircase=False):
    """Applies natural exponential decay to the initial learning rate.

    ```python
    if staircase:
        decayed_learning_rate = learning_rate *
            exp(- decay_rate * floor(global_step / decay_steps))
    else:
        decayed_learning_rate = learning_rate *
            exp(- decay_rate * (global_step / decay_steps))
    ```

    Args:
        learning_rate: A scalar float32 value or a Variable. This
          will be the initial learning rate during training
        global_step: A Variable that record the training step.
        decay_steps: A Python `int32` number.
        decay_rate: A Python `float` number.
        staircase: Boolean. If set true, decay the learning rate every
          decay_steps.

    Returns:
        The decayed learning rate

    Raises:
        ValueError: If global_step is not a Variable.
    """
    if not isinstance(global_step, Variable):
        raise ValueError("global_step is required for natural_exp_decay.")

    step_ratio = global_step / decay_steps
    if staircase:
        step_ratio = layers.floor(x=step_ratio)

    # Operand order is kept as ``-1 * decay_rate * step_ratio`` so the scalar
    # part is folded first and then multiplied with the Variable.
    exponent = -1 * decay_rate * step_ratio
    return learning_rate * layers.exp(x=exponent)
def inverse_time_decay(learning_rate,
                       global_step,
                       decay_steps,
                       decay_rate,
                       staircase=False):
    """Applies inverse time decay to the initial learning rate.

    ```python
    if staircase:
        decayed_learning_rate = learning_rate /
            (1 + decay_rate * floor(global_step / decay_steps))
    else:
        decayed_learning_rate = learning_rate /
            (1 + decay_rate * global_step / decay_steps)
    ```

    Args:
        learning_rate: A scalar float32 value or a Variable. This
          will be the initial learning rate during training
        global_step: A Variable that record the training step.
        decay_steps: A Python `int32` number.
        decay_rate: A Python `float` number.
        staircase: Boolean. If set true, decay the learning rate every
          decay_steps.

    Returns:
        The decayed learning rate

    Raises:
        ValueError: If global_step is not a Variable.
    """
    if not isinstance(global_step, Variable):
        raise ValueError("global_step is required for inverse_time_decay.")

    step_ratio = global_step / decay_steps
    if staircase:
        step_ratio = layers.floor(x=step_ratio)

    denominator = 1 + decay_rate * step_ratio
    return learning_rate / denominator
...@@ -11,14 +11,13 @@ ...@@ -11,14 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import layers import layers
__all__ = [ __all__ = [
"simple_img_conv_pool", "simple_img_conv_pool",
"sequence_conv_pool", "sequence_conv_pool",
"glu", "glu",
"dot_product_attention", "scaled_dot_product_attention",
] ]
...@@ -160,7 +159,11 @@ def glu(input, dim=-1): ...@@ -160,7 +159,11 @@ def glu(input, dim=-1):
return out return out
def scaled_dot_product_attention(queries,
                                 keys,
                                 values,
                                 num_heads=1,
                                 dropout_rate=0.):
    r"""
    The (multi-head) scaled dot-product attention.

    The queries are scaled by the inverse square root of the per-head key
    size, multiplied with the transposed keys, normalized with softmax over
    the key positions and finally used to weight the values:

    .. math::

        Attention(Q, K, V)= softmax(\frac{QK^\mathrm{T}}{\sqrt{d_k}})V

    Refer to `Attention Is All You Need
    <https://arxiv.org/pdf/1706.03762.pdf>`_.

    Note that batch data containing sequences with different lengths is not
    supported by this because of the (batch) matrix multiplication.

    Args:
        queries (Variable): The input variable which should be a 3-D Tensor.
        keys (Variable): The input variable which should be a 3-D Tensor.
        values (Variable): The input variable which should be a 3-D Tensor.
        num_heads (int): Head number to compute the scaled dot product
                         attention. Default value is 1.
        dropout_rate (float): The dropout rate to drop the attention weight.
                              Default value is 0.

    Returns:
        Variable: A 3-D Tensor computed by multi-head scaled dot product
                  attention.

    Raises:
        ValueError: If input queries, keys, values are not 3-D Tensors, if
            the hidden sizes of queries and keys differ, if keys and values
            have different sequence lengths, or if the hidden size of keys
            or values is not divisible by num_heads.

    NOTE:
        1. When num_heads > 1, three linear projections are learned
           respectively to map input queries, keys and values into queries',
           keys' and values'. queries', keys' and values' have the same
           shapes with queries, keys and values.
        2. When num_heads == 1, scaled_dot_product_attention has no learnable
           parameters.

    Examples:
        .. code-block:: python

            # Suppose q, k, v are Tensors with the following shape:
            # q: [3, 5, 9], k: [3, 6, 9], v: [3, 6, 10]

            contexts = fluid.nets.scaled_dot_product_attention(q, k, v)
            contexts.shape  # [3, 5, 10]
    """
    if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
        raise ValueError(
            "Inputs queries, keys and values should all be 3-D tensors.")

    if queries.shape[-1] != keys.shape[-1]:
        raise ValueError(
            "The hidden size of queries and keys should be the same.")
    # Each key position must have exactly one value row to weight.
    if keys.shape[-2] != values.shape[-2]:
        raise ValueError(
            "The max sequence length in key batch and in value batch "
            "should be the same.")
    if keys.shape[-1] % num_heads != 0:
        raise ValueError("The hidden size of keys (%d) must be divisible "
                         "by the number of attention heads (%d)." %
                         (keys.shape[-1], num_heads))
    if values.shape[-1] % num_heads != 0:
        raise ValueError("The hidden size of values (%d) must be divisible "
                         "by the number of attention heads (%d)." %
                         (values.shape[-1], num_heads))

    def __compute_qkv(queries, keys, values, num_heads):
        """
        Add linear projection to queries, keys, and values.

        Args:
            queries(Tensor): a 3-D input Tensor.
            keys(Tensor): a 3-D input Tensor.
            values(Tensor): a 3-D input Tensor.
            num_heads(int): The number of heads. Linearly project the inputs
                            ONLY when num_heads > 1.

        Returns:
            Tensor: linearly projected output Tensors: queries', keys' and
                    values'. They have the same shapes with queries, keys and
                    values.
        """
        if num_heads == 1:
            return queries, keys, values

        q = layers.fc(input=queries, size=queries.shape[-1], num_flatten_dims=2)
        k = layers.fc(input=keys, size=keys.shape[-1], num_flatten_dims=2)
        v = layers.fc(input=values, size=values.shape[-1], num_flatten_dims=2)
        return q, k, v

    def __split_heads(x, num_heads):
        """
        Reshape the last dimension of input tensor x so that it becomes two
        dimensions.

        Args:
            x(Tensor): a 3-D input Tensor.
            num_heads(int): The number of heads.

        Returns:
            Tensor: a Tensor with shape [..., n, m/num_heads], where m is size
                    of the last dimension of x.
        """
        if num_heads == 1:
            return x

        hidden_size = x.shape[-1]
        # reshape the 3-D input: [batch_size, max_sequence_length, hidden_dim]
        # into a 4-D output:
        # [batch_size, max_sequence_length, num_heads, hidden_size_per_head].
        reshaped = layers.reshape(
            x=x,
            shape=list(x.shape[:-1]) + [num_heads, hidden_size // num_heads])

        # permute the dimensions into:
        # [batch_size, num_heads, max_sequence_len, hidden_size_per_head]
        return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])

    def __combine_heads(x):
        """
        Reshape the last two dimensions of input tensor x so that it becomes
        one dimension.

        Args:
            x(Tensor): a 4-D input Tensor with shape
                       [bs, num_heads, max_sequence_length, hidden_dim].

        Returns:
            Tensor: a Tensor with shape
                    [bs, max_sequence_length, num_heads * hidden_dim].
        """

        # A 3-D input means the heads were never split (num_heads == 1).
        if len(x.shape) == 3:
            return x
        if len(x.shape) != 4:
            raise ValueError("Input(x) should be a 4-D Tensor.")

        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
        # Build a real list (not a lazy ``map`` object) so the shape is a
        # concrete sequence on both Python 2 and Python 3.
        merged_shape = [
            int(dim)
            for dim in (trans_x.shape[0], trans_x.shape[1],
                        trans_x.shape[2] * trans_x.shape[3])
        ]
        return layers.reshape(x=trans_x, shape=merged_shape)

    q, k, v = __compute_qkv(queries, keys, values, num_heads)

    q = __split_heads(q, num_heads)
    k = __split_heads(k, num_heads)
    v = __split_heads(v, num_heads)

    key_dim_per_head = keys.shape[-1] // num_heads
    scaled_q = layers.scale(x=q, scale=key_dim_per_head**-0.5)
    # Q * K^T: one row of scores per query position and one column per key
    # position, so the later product with ``v`` is well-formed.
    product = layers.matmul(x=scaled_q, y=k, transpose_y=True)

    # Softmax is applied on a 2-D view (normalizing over key positions) and
    # the result is reshaped back to the shape of ``product``.
    weights = layers.reshape(
        x=layers.reshape(
            x=product, shape=[-1, product.shape[-1]], act="softmax"),
        shape=product.shape)
    if dropout_rate:
        # Dropout is applied to the attention weights themselves.
        weights = layers.dropout(
            weights, dropout_prob=dropout_rate, is_test=False)
    ctx_multiheads = layers.matmul(weights, v)
    return __combine_heads(ctx_multiheads)
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
from collections import defaultdict from collections import defaultdict
import framework import framework
import layers
from backward import append_backward from backward import append_backward
from framework import unique_name, program_guard from framework import unique_name, program_guard
from initializer import Constant from initializer import Constant
...@@ -33,9 +34,11 @@ class Optimizer(object): ...@@ -33,9 +34,11 @@ class Optimizer(object):
but need to use one of it's implementation. but need to use one of it's implementation.
""" """
def __init__(self, global_step=None, regularization=None): def __init__(self, learning_rate, global_step=None, regularization=None):
assert learning_rate is not None
self._global_step = global_step self._global_step = global_step
self.regularization = regularization self.regularization = regularization
self._global_learning_rate = learning_rate
# Dictionary of accumulators. Some optimizer subclasses need to # Dictionary of accumulators. Some optimizer subclasses need to
# allocate and manage extra variables associated with the parameters # allocate and manage extra variables associated with the parameters
# to train. These variables are called accumulators. # to train. These variables are called accumulators.
...@@ -43,6 +46,28 @@ class Optimizer(object): ...@@ -43,6 +46,28 @@ class Optimizer(object):
self._accumulators = defaultdict(lambda: dict()) self._accumulators = defaultdict(lambda: dict())
self.helper = None self.helper = None
def _create_global_learning_rate(self):
    """Make sure the global learning rate is held as a Variable.

    A Python ``float`` learning rate is replaced by a persistable global
    variable of shape ``[1]`` and dtype ``float32`` initialized to that
    value. A learning rate that is already a ``framework.Variable`` is left
    untouched.

    Raises:
        ValueError: If the learning rate is neither a ``float`` nor a
            ``framework.Variable``.
    """
    if isinstance(self._global_learning_rate, float):
        self._global_learning_rate = layers.create_global_var(
            name=unique_name("learning_rate"),
            shape=[1],
            value=float(self._global_learning_rate),
            dtype='float32',
            persistable=True)

    if not isinstance(self._global_learning_rate, framework.Variable):
        # Format the message with ``%``; passing the type as a second
        # positional argument would leave the ``%s`` placeholder unfilled.
        raise ValueError("learning rate should be a Variable, "
                         "actual type is %s" %
                         type(self._global_learning_rate))
@property
def global_learning_rate(self):
    """Return the optimizer-wide learning rate stored on this instance.

    The value is whatever ``self._global_learning_rate`` currently holds.
    """
    return self._global_learning_rate
def _append_optimize_op(self, block, param_and_grad): def _append_optimize_op(self, block, param_and_grad):
""" append optimize operator to block and return all the added optimize_op """ append optimize operator to block and return all the added optimize_op
""" """
...@@ -52,17 +77,7 @@ class Optimizer(object): ...@@ -52,17 +77,7 @@ class Optimizer(object):
# create learning rate variable for every parameter # create learning rate variable for every parameter
param = param_and_grad[0] param = param_and_grad[0]
param_lr = param.optimize_attr['learning_rate'] param_lr = param.optimize_attr['learning_rate']
param_lr_shape = [1] return self._global_learning_rate * param_lr
param_lr_var = self.helper.create_global_variable(
name=unique_name("learning_rate"),
dtype='float32',
shape=param_lr_shape,
lod_level=1,
persistable=True)
param_lr = param_lr * self._learning_rate
self.helper.set_variable_initializer(
var=param_lr_var, initializer=Constant(param_lr))
return param_lr_var
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
"""Create all accumulators needed by the parameters """Create all accumulators needed by the parameters
...@@ -178,6 +193,7 @@ class Optimizer(object): ...@@ -178,6 +193,7 @@ class Optimizer(object):
self.helper = LayerHelper(self.__class__.__name__) self.helper = LayerHelper(self.__class__.__name__)
self._create_accumulators(loss.block, self._create_accumulators(loss.block,
[p[0] for p in parameters_and_grads]) [p[0] for p in parameters_and_grads])
self._create_global_learning_rate()
optimize_ops = [] optimize_ops = []
for param_and_grad in parameters_and_grads: for param_and_grad in parameters_and_grads:
...@@ -231,9 +247,9 @@ class SGDOptimizer(Optimizer): ...@@ -231,9 +247,9 @@ class SGDOptimizer(Optimizer):
def __init__(self, learning_rate, **kwargs): def __init__(self, learning_rate, **kwargs):
assert learning_rate is not None assert learning_rate is not None
super(SGDOptimizer, self).__init__(**kwargs) super(SGDOptimizer, self).__init__(
learning_rate=learning_rate, **kwargs)
self.type = "sgd" self.type = "sgd"
self._learning_rate = learning_rate
def _append_optimize_op(self, block, param_and_grad): def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block) assert isinstance(block, framework.Block)
...@@ -259,9 +275,9 @@ class MomentumOptimizer(Optimizer): ...@@ -259,9 +275,9 @@ class MomentumOptimizer(Optimizer):
def __init__(self, learning_rate, momentum, use_nesterov=False, **kwargs): def __init__(self, learning_rate, momentum, use_nesterov=False, **kwargs):
assert learning_rate is not None assert learning_rate is not None
assert momentum is not None assert momentum is not None
super(MomentumOptimizer, self).__init__(**kwargs) super(MomentumOptimizer, self).__init__(
learning_rate=learning_rate, **kwargs)
self.type = "momentum" self.type = "momentum"
self._learning_rate = learning_rate
self._momentum = momentum self._momentum = momentum
self._use_nesterov = bool(use_nesterov) self._use_nesterov = bool(use_nesterov)
...@@ -303,9 +319,9 @@ class AdagradOptimizer(Optimizer): ...@@ -303,9 +319,9 @@ class AdagradOptimizer(Optimizer):
def __init__(self, learning_rate, epsilon=1.0e-6, **kwargs): def __init__(self, learning_rate, epsilon=1.0e-6, **kwargs):
assert learning_rate is not None assert learning_rate is not None
assert epsilon is not None assert epsilon is not None
super(AdagradOptimizer, self).__init__(**kwargs) super(AdagradOptimizer, self).__init__(
learning_rate=learning_rate, **kwargs)
self.type = "adagrad" self.type = "adagrad"
self._learning_rate = learning_rate
self._epsilon = epsilon self._epsilon = epsilon
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
...@@ -352,9 +368,9 @@ class AdamOptimizer(Optimizer): ...@@ -352,9 +368,9 @@ class AdamOptimizer(Optimizer):
assert beta1 is not None assert beta1 is not None
assert beta2 is not None assert beta2 is not None
assert epsilon is not None assert epsilon is not None
super(AdamOptimizer, self).__init__(**kwargs) super(AdamOptimizer, self).__init__(
learning_rate=learning_rate, **kwargs)
self.type = "adam" self.type = "adam"
self._learning_rate = learning_rate
self._beta1 = beta1 self._beta1 = beta1
self._beta2 = beta2 self._beta2 = beta2
self._epsilon = epsilon self._epsilon = epsilon
...@@ -457,9 +473,9 @@ class AdamaxOptimizer(Optimizer): ...@@ -457,9 +473,9 @@ class AdamaxOptimizer(Optimizer):
assert beta1 is not None assert beta1 is not None
assert beta2 is not None assert beta2 is not None
assert epsilon is not None assert epsilon is not None
super(AdamaxOptimizer, self).__init__(**kwargs) super(AdamaxOptimizer, self).__init__(
learning_rate=learning_rate, **kwargs)
self.type = "adamax" self.type = "adamax"
self._learning_rate = learning_rate
self._beta1 = beta1 self._beta1 = beta1
self._beta2 = beta2 self._beta2 = beta2
self._epsilon = epsilon self._epsilon = epsilon
...@@ -535,9 +551,9 @@ class DecayedAdagradOptimizer(Optimizer): ...@@ -535,9 +551,9 @@ class DecayedAdagradOptimizer(Optimizer):
assert decay is not None assert decay is not None
assert epsilon is not None assert epsilon is not None
super(DecayedAdagradOptimizer, self).__init__(**kwargs) super(DecayedAdagradOptimizer, self).__init__(
learning_rate=learning_rate, **kwargs)
self.type = "decayed_adagrad" self.type = "decayed_adagrad"
self._learning_rate = learning_rate
self._decay = decay self._decay = decay
self._epsilon = epsilon self._epsilon = epsilon
......
...@@ -15,7 +15,10 @@ ...@@ -15,7 +15,10 @@
from initializer import Initializer, Xavier, Constant from initializer import Initializer, Xavier, Constant
from regularizer import WeightDecayRegularizer from regularizer import WeightDecayRegularizer
__all__ = ['ParamAttr'] __all__ = [
'ParamAttr',
'WeightNormParamAttr',
]
class ParamAttr(object): class ParamAttr(object):
...@@ -82,3 +85,20 @@ class ParamAttr(object): ...@@ -82,3 +85,20 @@ class ParamAttr(object):
if with_initializer: if with_initializer:
kwargs['initializer'] = self.initializer kwargs['initializer'] = self.initializer
return kwargs return kwargs
class WeightNormParamAttr(ParamAttr):
    """
    Parameter attribute used for weight normalization.

    Every keyword accepted by ParamAttr may be passed here as well. The
    additional field ``dim`` indicates the dimension except which to
    normalize.
    """
    # Class-level list recording the parameters reparameterized by weight
    # normalization. If these parameters are treated as Variable rather than
    # Parameter, the list can be used to tell them apart and to help
    # serialize these parameters for inference.
    params_with_weight_norm = []

    def __init__(self, dim=None, **kwargs):
        # Forward all ordinary ParamAttr options unchanged, then remember
        # the weight-normalization dimension.
        super(WeightNormParamAttr, self).__init__(**kwargs)
        self.dim = dim
...@@ -12,11 +12,11 @@ ...@@ -12,11 +12,11 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import paddle.v2.fluid.core as core import core
from contextlib import contextmanager from contextlib import contextmanager
import os import os
__all__ = ['CudaProfiler'] __all__ = ['cuda_profiler', 'reset_profiler', 'profiler']
NVPROF_CONFIG = [ NVPROF_CONFIG = [
"gpustarttimestamp", "gpustarttimestamp",
...@@ -63,3 +63,58 @@ def cuda_profiler(output_file, output_mode=None, config=None): ...@@ -63,3 +63,58 @@ def cuda_profiler(output_file, output_mode=None, config=None):
# Disables profiler collection. # Disables profiler collection.
core.nvprof_stop() core.nvprof_stop()
os.remove(config_file) os.remove(config_file)
def reset_profiler():
    """Clear the profiler's previous time record.

    This is a thin wrapper that delegates to ``core.reset_profiler()``.
    """
    core.reset_profiler()
@contextmanager
def profiler(state, sorted_key=None):
    """The profiler interface.

    Different from cuda_profiler, this profiler can be used to profile both
    CPU and GPU programs. By default, it records the CPU and GPU operator
    kernels; if you want to profile other programs, refer to the profiling
    tutorial to add more records.

    Args:
        state (string) : The profiling state, which should be 'CPU' or 'GPU',
            telling the profiler to use CPU timer or GPU timer for profiling.
            Although users may have already specified the execution place
            (CPUPlace/CUDAPlace) in the beginning, for flexibility the
            profiler would not inherit this place.
        sorted_key (string) : If None, the profiling results will be printed
            in the order of first end time of events. Otherwise, the
            profiling results will be sorted by this flag. This flag should
            be one of 'calls', 'total', 'max', 'min' or 'ave'.
            The `calls` means sorting by the number of calls.
            The `total` means sorting by the total execution time.
            The `max` means sorting by the maximum execution time.
            The `min` means sorting by the minimum execution time.
            The `ave` means sorting by the average execution time.

    Raises:
        ValueError: if `state` is not 'CPU' or 'GPU', or if `sorted_key` is
            neither None nor one of the keys listed above.
    """
    # Validate both arguments up front, before the profiler is enabled, so
    # that a bad argument never leaves the profiler switched on.
    if state not in ['CPU', 'GPU']:
        raise ValueError("The state must be 'CPU' or 'GPU'.")
    if sorted_key is not None and sorted_key not in [
            'calls', 'total', 'max', 'min', 'ave'
    ]:
        raise ValueError("The sorted_key must be None or in 'calls', "
                         "'total', 'max', 'min', 'ave'")

    # None selects the default ordering.
    sorted_key = 'default' if sorted_key is None else sorted_key
    key_map = {
        'default': core.EventSortingKey.kDefault,
        'calls': core.EventSortingKey.kCalls,
        'total': core.EventSortingKey.kTotal,
        'max': core.EventSortingKey.kMax,
        'min': core.EventSortingKey.kMin,
        'ave': core.EventSortingKey.kAve,
    }

    prof_state = core.ProfilerState.kCUDA if state == "GPU" else core.ProfilerState.kCPU
    core.enable_profiler(prof_state)
    try:
        yield
    finally:
        # Always disable the profiler, even when the profiled body raises.
        # TODO(qingqing) : redirect C++ ostream to Python stream.
        # with core.ostream_redirect(stdout=True, stderr=True):
        core.disable_profiler(key_map[sorted_key])
...@@ -87,6 +87,11 @@ class WeightDecayRegularizer(object): ...@@ -87,6 +87,11 @@ class WeightDecayRegularizer(object):
""" """
raise NotImplementedError() raise NotImplementedError()
def __str__(self):
"""Debug string
"""
raise NotImplementedError()
class L2DecayRegularizer(WeightDecayRegularizer): class L2DecayRegularizer(WeightDecayRegularizer):
"""Implements the L2 Weight Decay Regularization """Implements the L2 Weight Decay Regularization
...@@ -123,6 +128,9 @@ class L2DecayRegularizer(WeightDecayRegularizer): ...@@ -123,6 +128,9 @@ class L2DecayRegularizer(WeightDecayRegularizer):
return decay return decay
def __str__(self):
return "L2Decay, regularization_coeff=%f" % self._regularization_coeff
class L1DecayRegularizer(WeightDecayRegularizer): class L1DecayRegularizer(WeightDecayRegularizer):
"""Implements the L1 Weight Decay Regularization """Implements the L1 Weight Decay Regularization
...@@ -163,6 +171,9 @@ class L1DecayRegularizer(WeightDecayRegularizer): ...@@ -163,6 +171,9 @@ class L1DecayRegularizer(WeightDecayRegularizer):
return decay return decay
def __str__(self):
return "L1Decay, regularization_coeff=%f" % self._regularization_coeff
# We short the class name, since users will use the regulaizer with the package # We short the class name, since users will use the regulaizer with the package
# name. The sample code: # name. The sample code:
......
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
if(NOT WITH_DISTRIBUTE)
list(REMOVE_ITEM TEST_OPS test_recv_op)
endif(NOT WITH_DISTRIBUTE)
foreach(src ${TEST_OPS}) foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py) py_test(${src} SRCS ${src}.py)
endforeach() endforeach()
......
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
list(REMOVE_ITEM TEST_OPS test_image_classification_train) list(REMOVE_ITEM TEST_OPS test_image_classification_train test_recognize_digits)
py_test(test_image_classification_train_resnet SRCS test_image_classification_train.py ARGS resnet) py_test(test_image_classification_train_resnet SRCS test_image_classification_train.py ARGS resnet)
py_test(test_image_classification_train_vgg SRCS test_image_classification_train.py ARGS vgg) py_test(test_image_classification_train_vgg SRCS test_image_classification_train.py ARGS vgg)
py_test(test_recognize_digits_mlp_cpu
SRCS test_recognize_digits.py
ARGS mlp)
py_test(test_recognize_digits_mlp_cuda
SRCS test_recognize_digits.py
ARGS mlp --use_cuda)
py_test(test_recognize_digits_conv_cpu
SRCS test_recognize_digits.py
ARGS conv)
py_test(test_recognize_digits_conv_cuda
SRCS test_recognize_digits.py
ARGS conv --use_cuda)
py_test(test_recognize_digits_mlp_cpu_parallel
SRCS test_recognize_digits.py
ARGS mlp --parallel)
py_test(test_recognize_digits_mlp_cuda_parallel
SRCS test_recognize_digits.py
ARGS mlp --use_cuda --parallel)
py_test(test_recognize_digits_conv_cpu_parallel
SRCS test_recognize_digits.py
ARGS conv --parallel)
py_test(test_recognize_digits_conv_cuda_parallel
SRCS test_recognize_digits.py
ARGS conv --use_cuda --parallel)
# default test # default test
foreach(src ${TEST_OPS}) foreach(src ${TEST_OPS})
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
...@@ -49,7 +49,7 @@ for pass_id in range(PASS_NUM): ...@@ -49,7 +49,7 @@ for pass_id in range(PASS_NUM):
avg_loss_value, = exe.run(fluid.default_main_program(), avg_loss_value, = exe.run(fluid.default_main_program(),
feed=feeder.feed(data), feed=feeder.feed(data),
fetch_list=[avg_cost]) fetch_list=[avg_cost])
print(avg_loss_value)
if avg_loss_value[0] < 10.0: if avg_loss_value[0] < 10.0:
exit(0) # if avg cost less than 10.0, we think our code is good. exit(0) # if avg cost less than 10.0, we think our code is good.
exit(1) exit(1)
...@@ -175,7 +175,7 @@ def main(): ...@@ -175,7 +175,7 @@ def main():
paddle.reader.shuffle( paddle.reader.shuffle(
paddle.dataset.conll05.test(), buf_size=8192), paddle.dataset.conll05.test(), buf_size=8192),
batch_size=BATCH_SIZE) batch_size=BATCH_SIZE)
#place = fluid.CPUPlace() # place = fluid.CPUPlace()
place = fluid.CUDAPlace(0) place = fluid.CUDAPlace(0)
feeder = fluid.DataFeeder( feeder = fluid.DataFeeder(
feed_list=[ feed_list=[
......
...@@ -17,7 +17,7 @@ import paddle.v2 as paddle ...@@ -17,7 +17,7 @@ import paddle.v2 as paddle
import paddle.v2.fluid as fluid import paddle.v2.fluid as fluid
import paddle.v2.fluid.core as core import paddle.v2.fluid.core as core
import paddle.v2.fluid.framework as framework import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as pd
from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.executor import Executor
dict_size = 30000 dict_size = 30000
...@@ -26,53 +26,136 @@ src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) ...@@ -26,53 +26,136 @@ src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
hidden_dim = 32 hidden_dim = 32
word_dim = 16 word_dim = 16
IS_SPARSE = True IS_SPARSE = True
batch_size = 10 batch_size = 2
max_length = 50 max_length = 8
topk_size = 50 topk_size = 50
trg_dic_size = 10000 trg_dic_size = 10000
beam_size = 2
decoder_size = hidden_dim decoder_size = hidden_dim
place = core.CPUPlace()
def encoder_decoder():
def encoder():
# encoder # encoder
src_word_id = layers.data( src_word_id = pd.data(
name="src_word_id", shape=[1], dtype='int64', lod_level=1) name="src_word_id", shape=[1], dtype='int64', lod_level=1)
src_embedding = layers.embedding( src_embedding = pd.embedding(
input=src_word_id, input=src_word_id,
size=[dict_size, word_dim], size=[dict_size, word_dim],
dtype='float32', dtype='float32',
is_sparse=IS_SPARSE, is_sparse=IS_SPARSE,
param_attr=fluid.ParamAttr(name='vemb')) param_attr=fluid.ParamAttr(name='vemb'))
fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4) lstm_hidden0, lstm_0 = pd.dynamic_lstm(input=fc1, size=hidden_dim * 4)
encoder_out = layers.sequence_last_step(input=lstm_hidden0) encoder_out = pd.sequence_last_step(input=lstm_hidden0)
return encoder_out
def decoder_train(context):
# decoder # decoder
trg_language_word = layers.data( trg_language_word = pd.data(
name="target_language_word", shape=[1], dtype='int64', lod_level=1) name="target_language_word", shape=[1], dtype='int64', lod_level=1)
trg_embedding = layers.embedding( trg_embedding = pd.embedding(
input=trg_language_word, input=trg_language_word,
size=[dict_size, word_dim], size=[dict_size, word_dim],
dtype='float32', dtype='float32',
is_sparse=IS_SPARSE, is_sparse=IS_SPARSE,
param_attr=fluid.ParamAttr(name='vemb')) param_attr=fluid.ParamAttr(name='vemb'))
rnn = fluid.layers.DynamicRNN() rnn = pd.DynamicRNN()
with rnn.block(): with rnn.block():
current_word = rnn.step_input(trg_embedding) current_word = rnn.step_input(trg_embedding)
mem = rnn.memory(init=encoder_out) pre_state = rnn.memory(init=context)
fc1 = fluid.layers.fc(input=[current_word, mem], current_state = pd.fc(input=[current_word, pre_state],
size=decoder_size, size=decoder_size,
act='tanh') act='tanh')
out = fluid.layers.fc(input=fc1, size=target_dict_dim, act='softmax')
rnn.update_memory(mem, fc1) current_score = pd.fc(input=current_state,
rnn.output(out) size=target_dict_dim,
act='softmax')
rnn.update_memory(pre_state, current_state)
rnn.output(current_score)
return rnn() return rnn()
def decoder_decode(context):
    """Build the beam-search decoding part of the program.

    Starting from ``context`` as the initial decoder state, this declares
    the ``init_ids`` / ``init_scores`` data inputs, writes them (and the
    state) into tensor arrays at index ``counter``, and appends a While
    block that repeats one decoding step while ``counter < max_length``.

    Args:
        context: value used as the initial state written at index 0 of the
            state array (the caller in this file passes the encoder output).

    Returns:
        The pair ``(translation_ids, translation_scores)`` produced by
        ``pd.beam_search_decode`` over the id and score arrays.
    """
    init_state = context
    # Loop bound and loop counter for the While block below.
    array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
    counter = pd.zeros(shape=[1], dtype='int64')

    # fill the first element with init_state
    state_array = pd.create_array('float32')
    pd.array_write(init_state, array=state_array, i=counter)

    # ids, scores as memory
    ids_array = pd.create_array('int64')
    scores_array = pd.create_array('float32')

    init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
    init_scores = pd.data(
        name="init_scores", shape=[1], dtype="float32", lod_level=2)

    pd.array_write(init_ids, array=ids_array, i=counter)
    pd.array_write(init_scores, array=scores_array, i=counter)

    cond = pd.less_than(x=counter, y=array_len)

    while_op = pd.While(cond=cond)
    with while_op.block():
        # Read the previous step's ids, state and scores at the current index.
        pre_ids = pd.array_read(array=ids_array, i=counter)
        pre_state = pd.array_read(array=state_array, i=counter)
        pre_score = pd.array_read(array=scores_array, i=counter)

        # expand the lod of pre_state to be the same with pre_score
        pre_state_expanded = pd.sequence_expand(pre_state, pre_score)

        pre_ids_emb = pd.embedding(
            input=pre_ids,
            size=[dict_size, word_dim],
            dtype='float32',
            is_sparse=IS_SPARSE)

        # use rnn unit to update rnn
        current_state = pd.fc(input=[pre_ids_emb, pre_state_expanded],
                              size=decoder_size,
                              act='tanh')

        # use score to do beam search
        current_score = pd.fc(input=current_state,
                              size=target_dict_dim,
                              act='softmax')
        # NOTE(review): k=50 matches the module-level topk_size and
        # end_id=10 is hard-coded -- confirm both are intended.
        topk_scores, topk_indices = pd.topk(current_score, k=50)
        selected_ids, selected_scores = pd.beam_search(
            pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0)

        # Advance the counter first so the writes below land at the next index.
        pd.increment(x=counter, value=1, in_place=True)

        # update the memories
        pd.array_write(current_state, array=state_array, i=counter)
        pd.array_write(selected_ids, array=ids_array, i=counter)
        pd.array_write(selected_scores, array=scores_array, i=counter)

        # Re-evaluate the loop condition into the existing cond variable.
        pd.less_than(x=counter, y=array_len, cond=cond)

    translation_ids, translation_scores = pd.beam_search_decode(
        ids=ids_array, scores=scores_array)

    # return init_ids, init_scores

    return translation_ids, translation_scores
def set_init_lod(data, lod, place):
    """Wrap ``data`` in a new ``core.LoDTensor`` on ``place`` and attach ``lod``.

    Returns the freshly created tensor.
    """
    tensor = core.LoDTensor()
    tensor.set(data, place)
    tensor.set_lod(lod)
    return tensor
def to_lodtensor(data, place): def to_lodtensor(data, place):
seq_lens = [len(seq) for seq in data] seq_lens = [len(seq) for seq in data]
cur_len = 0 cur_len = 0
...@@ -88,12 +171,13 @@ def to_lodtensor(data, place): ...@@ -88,12 +171,13 @@ def to_lodtensor(data, place):
return res return res
def main(): def train_main():
rnn_out = encoder_decoder() context = encoder()
label = layers.data( rnn_out = decoder_train(context)
label = pd.data(
name="target_language_next_word", shape=[1], dtype='int64', lod_level=1) name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
cost = layers.cross_entropy(input=rnn_out, label=label) cost = pd.cross_entropy(input=rnn_out, label=label)
avg_cost = fluid.layers.mean(x=cost) avg_cost = pd.mean(x=cost)
optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4) optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
optimizer.minimize(avg_cost) optimizer.minimize(avg_cost)
...@@ -103,13 +187,12 @@ def main(): ...@@ -103,13 +187,12 @@ def main():
paddle.dataset.wmt14.train(dict_size), buf_size=1000), paddle.dataset.wmt14.train(dict_size), buf_size=1000),
batch_size=batch_size) batch_size=batch_size)
place = core.CPUPlace()
exe = Executor(place) exe = Executor(place)
exe.run(framework.default_startup_program()) exe.run(framework.default_startup_program())
batch_id = 0 batch_id = 0
for pass_id in xrange(2): for pass_id in xrange(1):
for data in train_data(): for data in train_data():
word_data = to_lodtensor(map(lambda x: x[0], data), place) word_data = to_lodtensor(map(lambda x: x[0], data), place)
trg_word = to_lodtensor(map(lambda x: x[1], data), place) trg_word = to_lodtensor(map(lambda x: x[1], data), place)
...@@ -125,9 +208,48 @@ def main(): ...@@ -125,9 +208,48 @@ def main():
print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) + print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
" avg_cost=" + str(avg_cost_val)) " avg_cost=" + str(avg_cost_val))
if batch_id > 3: if batch_id > 3:
exit(0) break
batch_id += 1 batch_id += 1
def decode_main():
    """Build the encoder + beam-search decoder and run it on one batch.

    Runs the startup program, feeds a single shuffled wmt14 training batch
    together with constant initial ids/scores, and prints the LoD of the
    fetched translation ids.
    """
    context = encoder()
    translation_ids, translation_scores = decoder_decode(context)

    exe = Executor(place)
    exe.run(framework.default_startup_program())

    # One initial id and one initial score (both 1) per batch entry, as
    # column vectors of shape (batch_size, 1).
    init_ids_data = np.array(
        [1] * batch_size, dtype='int64').reshape((batch_size, 1))
    init_scores_data = np.array(
        [1.] * batch_size, dtype='float32').reshape((batch_size, 1))

    # Two identical LoD levels: [0, 1, ..., batch_size].
    lod_level = list(range(batch_size)) + [batch_size]
    init_lod = [lod_level, lod_level]

    train_data = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
        batch_size=batch_size)

    for data in train_data():
        init_ids = set_init_lod(init_ids_data, init_lod, place)
        init_scores = set_init_lod(init_scores_data, init_lod, place)

        src_word_data = to_lodtensor([sample[0] for sample in data], place)

        feed_dict = {
            'src_word_id': src_word_data,
            'init_ids': init_ids,
            'init_scores': init_scores
        }
        result_ids, result_scores = exe.run(
            framework.default_main_program(),
            feed=feed_dict,
            fetch_list=[translation_ids, translation_scores],
            return_numpy=False)
        print(result_ids.lod())
        # Only the first batch is decoded.
        break
if __name__ == '__main__': if __name__ == '__main__':
main() # train_main()
decode_main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import argparse
import paddle.v2.fluid as fluid
import paddle.v2 as paddle
import sys
import numpy
def parse_arg():
    """Parse the command line of this test script.

    Returns:
        The argparse namespace with ``nn_type`` ('mlp' or 'conv') and the
        boolean switches ``parallel`` and ``use_cuda`` (both False unless
        the corresponding flag is given).
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "nn_type",
        help="The neural network type, in ['mlp', 'conv']",
        type=str,
        choices=['mlp', 'conv'])

    # The two optional switches share the same shape: off by default,
    # switched on by the bare flag.
    switches = (
        ("--parallel", 'Run in parallel or not'),
        ("--use_cuda", "Run the program by using CUDA"), )
    for flag, description in switches:
        arg_parser.add_argument(
            flag, help=description, default=False, action="store_true")

    return arg_parser.parse_args()
BATCH_SIZE = 64
def loss_net(hidden, label):
    """Append the softmax classifier, loss and accuracy on top of ``hidden``.

    Returns:
        A tuple ``(prediction, avg_loss, acc)``: the 10-way softmax output,
        the mean cross-entropy against ``label``, and the accuracy layer
        output.
    """
    layers = fluid.layers
    prediction = layers.fc(input=hidden, size=10, act='softmax')
    per_sample_loss = layers.cross_entropy(input=prediction, label=label)
    avg_loss = layers.mean(x=per_sample_loss)
    acc = layers.accuracy(input=prediction, label=label)
    return prediction, avg_loss, acc
def mlp(img, label):
    """Two stacked 200-unit tanh fc layers followed by ``loss_net``."""
    hidden = img
    for _ in range(2):
        hidden = fluid.layers.fc(input=hidden, size=200, act='tanh')
    return loss_net(hidden, label)
def conv_net(img, label):
    """Two conv+pool stages (20 then 50 filters) followed by ``loss_net``.

    Both stages use a 5x5 filter, 2x2 pooling with stride 2 and relu.
    """
    feature = img
    for num_filters in (20, 50):
        feature = fluid.nets.simple_img_conv_pool(
            input=feature,
            filter_size=5,
            num_filters=num_filters,
            pool_size=2,
            pool_stride=2,
            act="relu")
    return loss_net(feature, label)
def train(args, save_dirname=None):
    """Train the selected network on MNIST until test accuracy exceeds 85%.

    Args:
        args: parsed command line; reads ``nn_type``, ``parallel`` and
            ``use_cuda``.
        save_dirname: when not None, the inference model (feeding "img",
            fetching the prediction) is saved to this directory once the
            accuracy threshold is reached.

    Returns:
        None, as soon as the mean test accuracy exceeds 0.85.

    Raises:
        AssertionError: if all passes finish without the test accuracy
            ever exceeding 0.85, so that the test run fails instead of
            passing without having converged.
    """
    print("recognize digits with args: {0}".format(" ".join(sys.argv[1:])))

    img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    if args.nn_type == 'mlp':
        net_conf = mlp
    else:
        net_conf = conv_net

    if args.parallel:
        places = fluid.layers.get_places()
        pd = fluid.layers.ParallelDo(places)
        with pd.do():
            img_ = pd.read_input(img)
            label_ = pd.read_input(label)
            prediction, avg_loss, acc = net_conf(img_, label_)
            for o in [avg_loss, acc]:
                pd.write_output(o)

        avg_loss, acc = pd()
        # get mean loss and acc through every devices.
        avg_loss = fluid.layers.mean(x=avg_loss)
        acc = fluid.layers.mean(x=acc)
    else:
        prediction, avg_loss, acc = net_conf(img, label)

    # Clone before minimize() so the test program has no optimizer ops.
    test_program = fluid.default_main_program().clone()

    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
    optimizer.minimize(avg_loss)

    place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()

    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.mnist.train(), buf_size=500),
        batch_size=BATCH_SIZE)
    test_reader = paddle.batch(
        paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)
    feeder = fluid.DataFeeder(feed_list=[img, label], place=place)

    PASS_NUM = 100
    for pass_id in range(PASS_NUM):
        for batch_id, data in enumerate(train_reader()):
            # train a mini-batch, fetch nothing
            exe.run(feed=feeder.feed(data))
            if (batch_id + 1) % 10 == 0:
                # Every 10th batch: evaluate on the whole test set.
                acc_set = []
                avg_loss_set = []
                for test_data in test_reader():
                    acc_np, avg_loss_np = exe.run(program=test_program,
                                                  feed=feeder.feed(test_data),
                                                  fetch_list=[acc, avg_loss])
                    acc_set.append(float(acc_np))
                    avg_loss_set.append(float(avg_loss_np))
                # get test acc and loss
                acc_val = numpy.array(acc_set).mean()
                avg_loss_val = numpy.array(avg_loss_set).mean()
                if float(acc_val) > 0.85:  # test acc > 85%
                    if save_dirname is not None:
                        fluid.io.save_inference_model(save_dirname, ["img"],
                                                      [prediction], exe)
                    return
                else:
                    print(
                        'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'.
                        format(pass_id, batch_id + 1,
                               float(avg_loss_val), float(acc_val)))

    # Falling through means the accuracy threshold was never reached. Fail
    # loudly so the test cannot pass without converging.
    raise AssertionError(
        "Test accuracy of recognize digits never exceeded 0.85")
def infer(args, save_dirname=None):
    """Load the saved inference model and run it on one random image.

    Does nothing when ``save_dirname`` is None. Otherwise the model stored
    in ``save_dirname`` is loaded, fed a random float32 tensor of shape
    (1, 1, 28, 28), and the first fetched result is printed.
    """
    if save_dirname is None:
        return

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()
    exe = fluid.Executor(place)

    # fluid.io.load_inference_model yields the inference program desc, the
    # names of the variables that are fed through feed operators, and the
    # variables whose data is obtained through fetch operators.
    loaded = fluid.io.load_inference_model(save_dirname, exe)
    inference_program, feed_target_names, fetch_targets = loaded

    # The input's dimension of conv should be 4-D or 5-D.
    tensor_img = numpy.random.rand(1, 1, 28, 28).astype("float32")

    # The feed is a dictionary {feed_target_name: feed_target_data}; the
    # results are a list of data corresponding to fetch_targets.
    feed_dict = {feed_target_names[0]: tensor_img}
    results = exe.run(inference_program,
                      feed=feed_dict,
                      fetch_list=fetch_targets)
    print("infer results: ", results[0])
if __name__ == '__main__':
    args = parse_arg()
    # The inference model is only saved (and later reloaded) for the plain
    # CPU, non-parallel configuration.
    save_dirname = None
    if not (args.use_cuda or args.parallel):
        save_dirname = "recognize_digits_" + args.nn_type + ".inference.model"
    train(args, save_dirname)
    infer(args, save_dirname)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
# Network definition: two conv+pool stages followed by a 10-way softmax
# classifier on 1x28x28 float32 inputs.
images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
conv_pool_1 = fluid.nets.simple_img_conv_pool(
    input=images,
    filter_size=5,
    num_filters=20,
    pool_size=2,
    pool_stride=2,
    act="relu")
conv_pool_2 = fluid.nets.simple_img_conv_pool(
    input=conv_pool_1,
    filter_size=5,
    num_filters=50,
    pool_size=2,
    pool_stride=2,
    act="relu")

predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax")
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
optimizer = fluid.optimizer.Adam(learning_rate=0.01)
optimizer.minimize(avg_cost)

accuracy = fluid.evaluator.Accuracy(input=predict, label=label)

BATCH_SIZE = 50
PASS_NUM = 3
train_reader = paddle.batch(
    paddle.reader.shuffle(
        paddle.dataset.mnist.train(), buf_size=500),
    batch_size=BATCH_SIZE)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
exe.run(fluid.default_startup_program())

# Training loop. The script exits with status 0 as soon as one batch meets
# the loss/accuracy criterion below, and with status 1 if PASS_NUM passes
# complete without meeting it.
for pass_id in range(PASS_NUM):
    # Reset the evaluator state at the start of every pass.
    accuracy.reset(exe)
    for data in train_reader():
        loss, acc = exe.run(fluid.default_main_program(),
                            feed=feeder.feed(data),
                            fetch_list=[avg_cost] + accuracy.metrics)
        pass_acc = accuracy.eval(exe)
        print("pass_id=" + str(pass_id) + " acc=" + str(acc) + " pass_acc=" +
              str(pass_acc))
        # print loss, acc
        if loss < 10.0 and pass_acc > 0.9:
            # if avg cost less than 10.0 and accuracy is larger than 0.9, we think our code is good.
            exit(0)

    pass_acc = accuracy.eval(exe)
    print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc))

exit(1)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
# Flat training script: builds a three-layer fully connected classifier, trains
# it on paddle.dataset.mnist.train(), evaluates it on paddle.dataset.mnist.test()
# and terminates the process with exit(0) or exit(1).
#
# NOTE(review): the indentation of this script was lost in this copy, so the
# nesting of the loop bodies below cannot be determined from the text alone.

# Training mini-batch size; also scales the L2 decay coefficient below.
BATCH_SIZE = 128
# Input layer named 'x' declared with shape [784].
image = fluid.layers.data(name='x', shape=[784], dtype='float32')
regularizer = fluid.regularizer.L2Decay(0.0005 * BATCH_SIZE)
# First hidden layer: 128 units, relu, with the L2 regularizer and a
# ClipByValue(10) gradient clip attached through an explicit ParamAttr.
hidden1 = fluid.layers.fc(input=image,
size=128,
act='relu',
param_attr=fluid.ParamAttr(
regularizer=regularizer,
gradient_clip=fluid.clip.ClipByValue(10)))
# The remaining layers pass the regularizer object itself as param_attr.
hidden2 = fluid.layers.fc(input=hidden1,
size=64,
act='relu',
param_attr=regularizer)
# Output layer: 10 units with softmax activation.
predict = fluid.layers.fc(input=hidden2,
size=10,
act='softmax',
param_attr=regularizer)
# Label layer named 'y'; these names are what the DataFeeder below feeds.
label = fluid.layers.data(name='y', shape=[1], dtype='int64')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
opts = optimizer.minimize(avg_cost)
# Evaluator that accumulates training accuracy; reset and eval'd in the loop.
accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
# Build the evaluation program: start from a clone of the main program, attach
# a second Accuracy evaluator under program_guard, then rebind
# inference_program to the result of get_inference_program(test_target).
inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program):
test_accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
test_target = [avg_cost] + test_accuracy.metrics + test_accuracy.states
inference_program = fluid.io.get_inference_program(test_target)
# Shuffled training reader (buffer of 8192 samples) and an unshuffled test
# reader, both batched.
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.mnist.train(), buf_size=8192),
batch_size=BATCH_SIZE)
test_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
# Run the startup program once before training.
exe.run(fluid.default_startup_program())
PASS_NUM = 100
for pass_id in range(PASS_NUM):
accuracy.reset(exe)
for data in train_reader():
out, acc = exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[avg_cost] + accuracy.metrics)
pass_acc = accuracy.eval(exe)
test_accuracy.reset(exe)
# NOTE(review): this loop rebinds `data`, `out` and `acc`, so the values
# printed below as train_cost/train_acc appear to come from the last test
# batch rather than from the training step -- confirm against the original.
for data in test_reader():
out, acc = exe.run(inference_program,
feed=feeder.feed(data),
fetch_list=[avg_cost] + test_accuracy.metrics)
test_pass_acc = test_accuracy.eval(exe)
print("pass_id=" + str(pass_id) + " train_cost=" + str(
out) + " train_acc=" + str(acc) + " train_pass_acc=" + str(pass_acc)
+ " test_acc=" + str(test_pass_acc))
# Success criterion: once test accuracy exceeds 0.7, save the inference model
# (feed name "x", target `predict`) and exit with status 0.
if test_pass_acc > 0.7:
fluid.io.save_inference_model(
"./recognize_digits_mlp.inference.model/", ["x"], [predict],
exe)
exit(0)
# Exit with status 1 when the threshold above was not reached.
exit(1)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
import paddle.v2.fluid.core as core
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.executor import Executor
# Module-level configuration for the sequence-to-sequence example.

# Vocabulary size; the source and target dictionaries use the same size.
dict_size = 30000
source_dict_dim = target_dict_dim = dict_size
# Fetched when the module is loaded (top-level call).
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
# hidden_dim is reused below as both encoder_size and decoder_size.
hidden_dim = 32
embedding_dim = 16
batch_size = 10
# NOTE(review): max_length, topk_size and IS_SPARSE are not referenced by the
# part of this file that is visible here -- confirm whether they are still needed.
max_length = 50
topk_size = 50
encoder_size = decoder_size = hidden_dim
IS_SPARSE = True
# Forwarded to both dynamic_lstm calls in bi_lstm_encoder.
USE_PEEPHOLES = False
def bi_lstm_encoder(input_seq, hidden_size):
    """Encode ``input_seq`` with a forward and a reversed dynamic LSTM.

    Each direction first projects ``input_seq`` through its own fc layer of
    width ``hidden_size * 4`` (with bias) and feeds the projection to
    ``dynamic_lstm``.

    Returns:
        A 2-tuple: the last step of the forward LSTM output and the first
        step of the reversed LSTM output.
    """
    projected_width = hidden_size * 4

    def project_input():
        # A fresh projection layer is created on every call, one per direction.
        return fluid.layers.fc(
            input=input_seq, size=projected_width, bias_attr=True)

    shared_lstm_args = dict(
        size=projected_width, use_peepholes=USE_PEEPHOLES)

    # Forward direction; is_reverse is left at the layer's default.
    forward_hidden, _ = fluid.layers.dynamic_lstm(
        input=project_input(), **shared_lstm_args)

    # Backward direction.
    backward_hidden, _ = fluid.layers.dynamic_lstm(
        input=project_input(), is_reverse=True, **shared_lstm_args)

    return (fluid.layers.sequence_last_step(input=forward_hidden),
            fluid.layers.sequence_first_step(input=backward_hidden))
# FIXME(peterzhang2029): Replace this function with the lstm_unit_op.
def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
    """Compute one LSTM step out of fc, sigmoid, tanh and elementwise layers.

    Every gate is an independent fc layer of width ``size`` (with bias) over
    ``[hidden_t_prev, x_t]``.

    Returns:
        A 2-tuple ``(hidden_t, cell_t)`` for the current step.
    """

    def gate(activation):
        pre_activation = fluid.layers.fc(
            input=[hidden_t_prev, x_t], size=size, bias_attr=True)
        return activation(x=pre_activation)

    # The creation order of the four fc layers is kept: forget, input,
    # output, candidate.
    forget_gate = gate(fluid.layers.sigmoid)
    input_gate = gate(fluid.layers.sigmoid)
    output_gate = gate(fluid.layers.sigmoid)
    candidate = gate(fluid.layers.tanh)

    kept_cell = fluid.layers.elementwise_mul(x=forget_gate, y=cell_t_prev)
    new_cell = fluid.layers.elementwise_mul(x=input_gate, y=candidate)
    cell_t = fluid.layers.sums(input=[kept_cell, new_cell])

    squashed_cell = fluid.layers.tanh(x=cell_t)
    hidden_t = fluid.layers.elementwise_mul(x=output_gate, y=squashed_cell)
    return hidden_t, cell_t
def lstm_decoder_without_attention(target_embedding, decoder_boot, context,
                                   decoder_size):
    """Decode ``target_embedding`` with a DynamicRNN built on ``lstm_step``.

    The hidden memory is initialised from ``decoder_boot``; the cell memory
    from a constant 0.0 tensor of shape ``[-1, decoder_size]`` created with
    ``fill_constant_batch_size_like`` on ``decoder_boot``.  At every step the
    static ``context`` is concatenated with the current word along axis 1 and
    fed to ``lstm_step``; the step output is a softmax fc layer of width
    ``target_dict_dim``.

    Returns:
        The result of calling the DynamicRNN.
    """
    rnn = fluid.layers.DynamicRNN()

    zero_cell = fluid.layers.fill_constant_batch_size_like(
        input=decoder_boot,
        value=0.0,
        shape=[-1, decoder_size],
        dtype='float32')
    zero_cell.stop_gradient = False

    with rnn.block():
        word_t = rnn.step_input(target_embedding)
        static_context = rnn.static_input(context)
        hidden_state = rnn.memory(init=decoder_boot, need_reorder=True)
        cell_state = rnn.memory(init=zero_cell)

        step_input = fluid.layers.concat(
            input=[static_context, word_t], axis=1)
        next_hidden, next_cell = lstm_step(step_input, hidden_state,
                                           cell_state, decoder_size)
        rnn.update_memory(hidden_state, next_hidden)
        rnn.update_memory(cell_state, next_cell)

        word_probs = fluid.layers.fc(input=next_hidden,
                                     size=target_dict_dim,
                                     bias_attr=True,
                                     act='softmax')
        rnn.output(word_probs)

    return rnn()
def seq_to_seq_net():
    """Build the seq2seq training graph and return its mean cost.

    Declares three level-1 LoD int64 inputs named ``source_sequence``,
    ``target_sequence`` and ``label_sequence``.  The source is embedded and
    encoded with ``bi_lstm_encoder``; the decoder is booted from a tanh fc
    layer over the backward encoder output and run by
    ``lstm_decoder_without_attention``.

    Returns:
        The mean of the cross-entropy between the decoder output and the
        label sequence.
    """

    def word_input(name):
        return fluid.layers.data(
            name=name, shape=[1], dtype='int64', lod_level=1)

    def embed(word_ids, vocab_size):
        return fluid.layers.embedding(
            input=word_ids,
            size=[vocab_size, embedding_dim],
            dtype='float32')

    # Encoder side.
    source_embedded = embed(word_input('source_sequence'), source_dict_dim)
    forward_last, backward_first = bi_lstm_encoder(
        input_seq=source_embedded, hidden_size=encoder_size)
    encoder_summary = fluid.layers.concat(
        input=[forward_last, backward_first], axis=1)
    boot_state = fluid.layers.fc(input=backward_first,
                                 size=decoder_size,
                                 bias_attr=False,
                                 act='tanh')

    # Decoder side.
    target_embedded = embed(word_input('target_sequence'), target_dict_dim)
    prediction = lstm_decoder_without_attention(
        target_embedded, boot_state, encoder_summary, decoder_size)

    # Loss.
    next_words = word_input('label_sequence')
    token_cost = fluid.layers.cross_entropy(
        input=prediction, label=next_words)
    return fluid.layers.mean(x=token_cost)
def to_lodtensor(data, place):
    """Pack a list of sequences into a single one-level LoDTensor.

    Args:
        data: sequences to pack; they are concatenated along axis 0 and cast
            to int64.
        place: passed through to ``LoDTensor.set``.

    Returns:
        A ``core.LoDTensor`` holding the concatenated values as a column
        (second dimension 1), with one LoD level made of the cumulative
        sequence offsets starting at 0.
    """
    # Running offsets: [0, len(s0), len(s0) + len(s1), ...].
    offsets = [0]
    for sequence in data:
        offsets.append(offsets[-1] + len(sequence))

    column = np.concatenate(data, axis=0).astype("int64")
    column = column.reshape([len(column), 1])

    tensor = core.LoDTensor()
    tensor.set(column, place)
    tensor.set_lod([offsets])
    return tensor
def main():
    """Train the seq2seq network on WMT14 for a few batches, then exit.

    Builds the network, attaches an Adagrad optimizer, and feeds shuffled
    WMT14 batches on the CPU.  The average cost is printed for every batch
    and the process exits with status 0 once the batch counter exceeds 3.
    """
    avg_cost = seq_to_seq_net()

    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
    optimizer.minimize(avg_cost)

    shuffled = paddle.reader.shuffle(
        paddle.dataset.wmt14.train(dict_size), buf_size=1000)
    train_data = paddle.batch(shuffled, batch_size=batch_size)

    place = core.CPUPlace()
    exe = Executor(place)
    exe.run(framework.default_startup_program())

    def column(batch, index):
        # Field `index` of every sample in the batch, packed as a LoDTensor.
        return to_lodtensor([sample[index] for sample in batch], place)

    batch_id = 0
    for pass_id in xrange(2):
        for batch in train_data():
            feed = {
                'source_sequence': column(batch, 0),
                'target_sequence': column(batch, 1),
                'label_sequence': column(batch, 2)
            }
            fetched = exe.run(framework.default_main_program(),
                              feed=feed,
                              fetch_list=[avg_cost])
            avg_cost_val = np.array(fetched[0])
            message = ('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
                       " avg_cost=" + str(avg_cost_val))
            print(message)
            # This is only a smoke test: stop once a handful of batches ran.
            if batch_id > 3:
                exit(0)
            batch_id += 1


if __name__ == '__main__':
    main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. # Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
...@@ -12,10 +12,10 @@ ...@@ -12,10 +12,10 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from __future__ import print_function import unittest
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid as fluid import paddle.v2.fluid as fluid
import paddle.v2 as paddle
import contextlib
def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32, def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
...@@ -40,62 +40,115 @@ def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32, ...@@ -40,62 +40,115 @@ def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
avg_cost = fluid.layers.mean(x=cost) avg_cost = fluid.layers.mean(x=cost)
adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002) adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
adam_optimizer.minimize(avg_cost) adam_optimizer.minimize(avg_cost)
accuracy = fluid.evaluator.Accuracy(input=prediction, label=label) accuracy = fluid.layers.accuracy(input=prediction, label=label)
return avg_cost, accuracy, accuracy.metrics[0] return avg_cost, accuracy
def to_lodtensor(data, place): def stacked_lstm_net(data,
seq_lens = [len(seq) for seq in data] label,
cur_len = 0 input_dim,
lod = [cur_len] class_dim=2,
for l in seq_lens: emb_dim=128,
cur_len += l hid_dim=512,
lod.append(cur_len) stacked_num=3):
flattened_data = np.concatenate(data, axis=0).astype("int64") assert stacked_num % 2 == 1
flattened_data = flattened_data.reshape([len(flattened_data), 1])
res = fluid.LoDTensor() emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim])
res.set(flattened_data, place) # add bias attr
res.set_lod([lod])
return res
def main():
BATCH_SIZE = 100
PASS_NUM = 5
word_dict = paddle.dataset.imdb.word_dict() # TODO(qijun) linear act
fc1 = fluid.layers.fc(input=emb, size=hid_dim)
lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim)
inputs = [fc1, lstm1]
for i in range(2, stacked_num + 1):
fc = fluid.layers.fc(input=inputs, size=hid_dim)
lstm, cell = fluid.layers.dynamic_lstm(
input=fc, size=hid_dim, is_reverse=(i % 2) == 0)
inputs = [fc, lstm]
fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max')
lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max')
prediction = fluid.layers.fc(input=[fc_last, lstm_last],
size=class_dim,
act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(x=cost)
adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
adam_optimizer.minimize(avg_cost)
accuracy = fluid.layers.accuracy(input=prediction, label=label)
return avg_cost, accuracy
def main(word_dict, net_method, use_cuda):
if use_cuda and not fluid.core.is_compiled_with_cuda():
return
BATCH_SIZE = 128
PASS_NUM = 5
dict_dim = len(word_dict) dict_dim = len(word_dict)
class_dim = 2 class_dim = 2
data = fluid.layers.data( data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1) name="words", shape=[1], dtype="int64", lod_level=1)
label = fluid.layers.data(name="label", shape=[1], dtype="int64") label = fluid.layers.data(name="label", shape=[1], dtype="int64")
cost, accuracy, acc_out = convolution_net( cost, acc_out = net_method(
data, label, input_dim=dict_dim, class_dim=class_dim) data, label, input_dim=dict_dim, class_dim=class_dim)
train_data = paddle.batch( train_data = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
paddle.dataset.imdb.train(word_dict), buf_size=1000), paddle.dataset.imdb.train(word_dict), buf_size=1000),
batch_size=BATCH_SIZE) batch_size=BATCH_SIZE)
place = fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=[data, label], place=place) feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
for pass_id in xrange(PASS_NUM): for pass_id in xrange(PASS_NUM):
accuracy.reset(exe)
for data in train_data(): for data in train_data():
cost_val, acc_val = exe.run(fluid.default_main_program(), cost_val, acc_val = exe.run(fluid.default_main_program(),
feed=feeder.feed(data), feed=feeder.feed(data),
fetch_list=[cost, acc_out]) fetch_list=[cost, acc_out])
pass_acc = accuracy.eval(exe) print("cost=" + str(cost_val) + " acc=" + str(acc_val))
print("cost=" + str(cost_val) + " acc=" + str(acc_val) + if cost_val < 0.4 and acc_val > 0.8:
" pass_acc=" + str(pass_acc)) return
if cost_val < 1.0 and pass_acc > 0.8: raise AssertionError("Cost is too large for {0}".format(
exit(0) net_method.__name__))
exit(1)
class TestUnderstandSentiment(unittest.TestCase):
    """Run ``main`` for each network builder on CPU and on CUDA.

    Every test executes inside its own Program pair and Scope.
    """

    @classmethod
    def setUpClass(cls):
        # Loaded once for the whole class and shared by every test.
        cls.word_dict = paddle.dataset.imdb.word_dict()

    @contextlib.contextmanager
    def new_program_scope(self):
        """Context manager that installs a fresh scope and program pair."""
        main_program = fluid.Program()
        startup_program = fluid.Program()
        fresh_scope = fluid.core.Scope()
        with fluid.scope_guard(fresh_scope):
            with fluid.program_guard(main_program, startup_program):
                yield

    def _run_isolated(self, net_method, use_cuda):
        # Shared body of the four tests below.
        with self.new_program_scope():
            main(self.word_dict, net_method=net_method, use_cuda=use_cuda)

    def test_conv_cpu(self):
        self._run_isolated(convolution_net, use_cuda=False)

    def test_stacked_lstm_cpu(self):
        self._run_isolated(stacked_lstm_net, use_cuda=False)

    def test_conv_gpu(self):
        self._run_isolated(convolution_net, use_cuda=True)

    def test_stacked_lstm_gpu(self):
        self._run_isolated(stacked_lstm_net, use_cuda=True)
if __name__ == '__main__': if __name__ == '__main__':
main() unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
from paddle.v2.fluid.layer_helper import LayerHelper
def lstm(x, c_pre_init, hidden_dim, forget_bias=None):
    """
    Unroll an ``lstm_unit`` operator over ``x`` with a StaticRNN.

    At every step the step input is concatenated with the previous cell
    state along axis 1, passed through an fc layer of width
    ``hidden_dim * 4`` and handed to the ``lstm_unit`` op together with the
    previous cell state.  Only the cell state is carried as RNN memory; the
    op's hidden output is the step output.

    Returns the result of calling the StaticRNN.
    """
    # Must remain the first statement: locals() is forwarded to LayerHelper
    # and should contain only the function arguments at this point.
    helper = LayerHelper('lstm_unit', **locals())

    rnn = fluid.layers.StaticRNN()
    with rnn.step():
        prev_cell = rnn.memory(init=c_pre_init)
        step_x = rnn.step_input(x)

        unit_input = fluid.layers.fc(
            input=fluid.layers.concat(input=[step_x, prev_cell], axis=1),
            size=hidden_dim * 4)

        # Output variables for the op; the cell variable is created first.
        out_dtype = x.dtype
        next_cell = helper.create_tmp_variable(out_dtype)
        hidden = helper.create_tmp_variable(out_dtype)

        op_inputs = {"X": unit_input, "C_prev": prev_cell}
        op_outputs = {"C": next_cell, "H": hidden}
        helper.append_op(
            type='lstm_unit',
            inputs=op_inputs,
            outputs=op_outputs,
            attrs={"forget_bias": forget_bias})

        rnn.update_memory(prev_cell, next_cell)
        rnn.output(hidden)

    return rnn()
def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50):
    """Build the fixed-shape LSTM sentiment classifier.

    Declares the inputs ``words`` (shape ``[seq_len * batch_size, 1]``) and
    ``label`` (shape ``[batch_size, 1]``), both without an appended batch
    dimension.  The embedding is reshaped to
    ``[batch_size, seq_len, emb_dim]`` and transposed with perm ``[1, 0, 2]``
    before being passed to ``lstm``, whose output is transposed back and fed
    to a softmax fc layer.  An Adam optimizer (learning rate 0.002) is
    attached to the mean cross-entropy cost.

    Returns:
        A 2-tuple ``(avg_cost, acc)``.
    """
    words = fluid.layers.data(
        name="words",
        shape=[seq_len * batch_size, 1],
        append_batch_size=False,
        dtype="int64",
        lod_level=1)
    label = fluid.layers.data(
        name="label",
        shape=[batch_size, 1],
        append_batch_size=False,
        dtype="int64")

    flat_embedding = fluid.layers.embedding(
        input=words, size=[dict_dim, emb_dim])
    per_sample = fluid.layers.reshape(
        x=flat_embedding, shape=[batch_size, seq_len, emb_dim])
    per_step = fluid.layers.transpose(x=per_sample, perm=[1, 0, 2])

    initial_cell = fluid.layers.fill_constant(
        dtype=per_step.dtype, shape=[batch_size, emb_dim], value=0.0)
    initial_cell.stop_gradient = False

    rnn_out = lstm(per_step, c_pre_init=initial_cell, hidden_dim=emb_dim)
    rnn_out = fluid.layers.transpose(x=rnn_out, perm=[1, 0, 2])

    prediction = fluid.layers.fc(input=rnn_out,
                                 size=class_dim,
                                 act="softmax")
    sample_cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=sample_cost)

    fluid.optimizer.Adam(learning_rate=0.002).minimize(avg_cost)

    acc = fluid.layers.accuracy(input=prediction, label=label)
    return avg_cost, acc
def to_lodtensor(data, place):
    """Pack a list of sequences into a single one-level LoDTensor.

    Args:
        data: sequences to pack; they are concatenated along axis 0 and cast
            to int64.
        place: passed through to ``LoDTensor.set``.

    Returns:
        A ``fluid.LoDTensor`` holding the concatenated values as a column
        (second dimension 1), with one LoD level made of the cumulative
        sequence offsets starting at 0.
    """
    # Running offsets: [0, len(s0), len(s0) + len(s1), ...].
    offsets = [0]
    for sequence in data:
        offsets.append(offsets[-1] + len(sequence))

    column = np.concatenate(data, axis=0).astype("int64")
    column = column.reshape([len(column), 1])

    tensor = fluid.LoDTensor()
    tensor.set(column, place)
    tensor.set_lod([offsets])
    return tensor
def chop_data(data, chop_len=80, batch_size=50):
    """Truncate long samples to ``chop_len`` items and keep at most a batch.

    Args:
        data: iterable of samples whose item 0 is a sequence and item 1 is
            its label.
        chop_len: samples with fewer than this many items are dropped; the
            rest are cut to exactly this length.
        batch_size: upper bound on the number of samples returned.

    Returns:
        A list of ``(truncated_sequence, label)`` tuples, in input order.
    """
    kept = []
    for sample in data:
        sequence = sample[0]
        if len(sequence) < chop_len:
            continue
        kept.append((sequence[:chop_len], sample[1]))
    return kept[:batch_size]
def prepare_feed_data(data, place):
    """Convert a batch of ``(words, label)`` samples into feedable tensors.

    Args:
        data: list of samples; item 0 is the word sequence, item 1 the label.
        place: passed through to ``to_lodtensor`` and ``LoDTensor.set``.

    Returns:
        A 2-tuple ``(tensor_words, tensor_label)``: the word sequences packed
        by ``to_lodtensor`` and the labels as an int64 column in a
        ``fluid.LoDTensor``.
    """
    word_sequences = [sample[0] for sample in data]
    tensor_words = to_lodtensor(word_sequences, place)

    label_column = np.array([sample[1] for sample in data]).astype("int64")
    label_column = label_column.reshape([len(label_column), 1])

    tensor_label = fluid.LoDTensor()
    tensor_label.set(label_column, place)
    return tensor_words, tensor_label
def main():
    """Train the StaticRNN LSTM sentiment model on IMDB and exit.

    Builds ``lstm_net``, then for up to ``PASS_NUM`` passes feeds chopped
    IMDB batches on the CPU, printing cost and accuracy for each batch.  The
    process exits with status 0 as soon as a batch accuracy exceeds 0.7, and
    with status 1 if that never happens.
    """
    BATCH_SIZE = 100
    PASS_NUM = 5

    word_dict = paddle.dataset.imdb.word_dict()
    # Call form of print, matching every other print in this file; the bare
    # print statement used here before is a syntax error outside Python 2.
    print("load word dict successfully")
    dict_dim = len(word_dict)
    class_dim = 2

    cost, acc = lstm_net(dict_dim=dict_dim, class_dim=class_dim)

    train_data = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.imdb.train(word_dict), buf_size=BATCH_SIZE * 10),
        batch_size=BATCH_SIZE)
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)

    exe.run(fluid.default_startup_program())

    for pass_id in range(PASS_NUM):
        for data in train_data():
            # NOTE(review): the reader yields batches of 100 samples, while
            # chop_data keeps at most 50 samples of at least 80 words and
            # lstm_net declares fixed shapes for exactly 50 x 80.  A batch
            # with fewer than 50 long-enough samples would not match the
            # declared input shapes -- confirm this cannot happen or guard it.
            chopped_data = chop_data(data)
            tensor_words, tensor_label = prepare_feed_data(chopped_data, place)

            outs = exe.run(fluid.default_main_program(),
                           feed={"words": tensor_words,
                                 "label": tensor_label},
                           fetch_list=[cost, acc])
            cost_val = np.array(outs[0])
            acc_val = np.array(outs[1])

            print("cost=" + str(cost_val) + " acc=" + str(acc_val))
            if acc_val > 0.7:
                exit(0)
    exit(1)


if __name__ == '__main__':
    main()
...@@ -12,76 +12,145 @@ ...@@ -12,76 +12,145 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import numpy as np
import paddle.v2 as paddle import paddle.v2 as paddle
import paddle.v2.fluid as fluid import paddle.v2.fluid as fluid
import unittest
import os
PASS_NUM = 100
EMBED_SIZE = 32
HIDDEN_SIZE = 256
N = 5
BATCH_SIZE = 32
IS_SPARSE = True
word_dict = paddle.dataset.imikolov.build_dict() def main(use_cuda, is_sparse, parallel):
dict_size = len(word_dict) if use_cuda and not fluid.core.is_compiled_with_cuda():
return
first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64') PASS_NUM = 100
second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64') EMBED_SIZE = 32
third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64') HIDDEN_SIZE = 256
forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64') N = 5
next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64') BATCH_SIZE = 32
IS_SPARSE = is_sparse
embed_first = fluid.layers.embedding( def __network__(words):
input=first_word, embed_first = fluid.layers.embedding(
input=words[0],
size=[dict_size, EMBED_SIZE], size=[dict_size, EMBED_SIZE],
dtype='float32', dtype='float32',
is_sparse=IS_SPARSE, is_sparse=IS_SPARSE,
param_attr='shared_w') param_attr='shared_w')
embed_second = fluid.layers.embedding( embed_second = fluid.layers.embedding(
input=second_word, input=words[1],
size=[dict_size, EMBED_SIZE], size=[dict_size, EMBED_SIZE],
dtype='float32', dtype='float32',
is_sparse=IS_SPARSE, is_sparse=IS_SPARSE,
param_attr='shared_w') param_attr='shared_w')
embed_third = fluid.layers.embedding( embed_third = fluid.layers.embedding(
input=third_word, input=words[2],
size=[dict_size, EMBED_SIZE], size=[dict_size, EMBED_SIZE],
dtype='float32', dtype='float32',
is_sparse=IS_SPARSE, is_sparse=IS_SPARSE,
param_attr='shared_w') param_attr='shared_w')
embed_forth = fluid.layers.embedding( embed_forth = fluid.layers.embedding(
input=forth_word, input=words[3],
size=[dict_size, EMBED_SIZE], size=[dict_size, EMBED_SIZE],
dtype='float32', dtype='float32',
is_sparse=IS_SPARSE, is_sparse=IS_SPARSE,
param_attr='shared_w') param_attr='shared_w')
concat_embed = fluid.layers.concat( concat_embed = fluid.layers.concat(
input=[embed_first, embed_second, embed_third, embed_forth], axis=1) input=[embed_first, embed_second, embed_third, embed_forth], axis=1)
hidden1 = fluid.layers.fc(input=concat_embed, size=HIDDEN_SIZE, act='sigmoid') hidden1 = fluid.layers.fc(input=concat_embed,
predict_word = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax') size=HIDDEN_SIZE,
cost = fluid.layers.cross_entropy(input=predict_word, label=next_word) act='sigmoid')
avg_cost = fluid.layers.mean(x=cost) predict_word = fluid.layers.fc(input=hidden1,
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) size=dict_size,
sgd_optimizer.minimize(avg_cost) act='softmax')
cost = fluid.layers.cross_entropy(input=predict_word, label=words[4])
train_reader = paddle.batch( avg_cost = fluid.layers.mean(x=cost)
return avg_cost
word_dict = paddle.dataset.imikolov.build_dict()
dict_size = len(word_dict)
first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64')
next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
if not parallel:
avg_cost = __network__(
[first_word, second_word, third_word, forth_word, next_word])
else:
places = fluid.layers.get_places()
pd = fluid.layers.ParallelDo(places)
with pd.do():
avg_cost = __network__(
map(pd.read_input, [
first_word, second_word, third_word, forth_word, next_word
]))
pd.write_output(avg_cost)
avg_cost = fluid.layers.mean(x=pd())
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
sgd_optimizer.minimize(avg_cost)
train_reader = paddle.batch(
paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE) paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
place = fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place) exe = fluid.Executor(place)
feeder = fluid.DataFeeder( feeder = fluid.DataFeeder(
feed_list=[first_word, second_word, third_word, forth_word, next_word], feed_list=[first_word, second_word, third_word, forth_word, next_word],
place=place) place=place)
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
for pass_id in range(PASS_NUM): for pass_id in range(PASS_NUM):
for data in train_reader(): for data in train_reader():
avg_cost_np = exe.run(fluid.default_main_program(), avg_cost_np = exe.run(fluid.default_main_program(),
feed=feeder.feed(data), feed=feeder.feed(data),
fetch_list=[avg_cost]) fetch_list=[avg_cost])
if avg_cost_np[0] < 5.0: if avg_cost_np[0] < 5.0:
exit(0) # if avg cost less than 10.0, we think our code is good. return
exit(1) raise AssertionError("Cost is too large {0:2.2}".format(avg_cost_np[0]))
# FULL_TEST is True when the FULL_TEST environment variable (default '0'),
# lower-cased, is one of the accepted truthy spellings.
_FULL_TEST_TRUTHY = ['true', '1', 't', 'y', 'yes', 'on']
FULL_TEST = os.getenv('FULL_TEST', '0').lower() in _FULL_TEST_TRUTHY
# Reason reported for tests that are skipped when FULL_TEST is off.
SKIP_REASON = "Only run minimum number of tests in CI server, to make CI faster"
class W2VTest(unittest.TestCase):
    """Empty test case; its test methods are attached by inject_test_method."""
def inject_test_method(use_cuda, is_sparse, parallel):
    """Attach one generated test method to ``W2VTest``.

    The method is named ``test_<cuda|cpu>_<sparse|dense>_<parallel|normal>``
    and runs ``main`` with the given flags inside a fresh scope and program
    pair.  Only the cuda + sparse + parallel combination is attached
    unconditionally; every other combination is wrapped in
    ``unittest.skipUnless(FULL_TEST, SKIP_REASON)``.
    """
    device_tag = "cuda" if use_cuda else "cpu"
    storage_tag = "sparse" if is_sparse else "dense"
    mode_tag = "parallel" if parallel else "normal"
    fn_name = "test_{0}_{1}_{2}".format(device_tag, storage_tag, mode_tag)

    def __impl__(*args, **kwargs):
        main_program = fluid.Program()
        startup_program = fluid.Program()
        fresh_scope = fluid.core.Scope()
        with fluid.scope_guard(fresh_scope):
            with fluid.program_guard(main_program, startup_program):
                main(use_cuda=use_cuda, is_sparse=is_sparse, parallel=parallel)

    if not (use_cuda and is_sparse and parallel):
        # skip the other test when on CI server
        fn = unittest.skipUnless(
            condition=FULL_TEST, reason=SKIP_REASON)(__impl__)
    else:
        fn = __impl__

    setattr(W2VTest, fn_name, fn)
# Generate one W2VTest method for every combination of the three switches.
_SWITCH_VALUES = (False, True)
for use_cuda in _SWITCH_VALUES:
    for is_sparse in _SWITCH_VALUES:
        for parallel in _SWITCH_VALUES:
            inject_test_method(use_cuda, is_sparse, parallel)

if __name__ == '__main__':
    unittest.main()
...@@ -68,10 +68,10 @@ else: ...@@ -68,10 +68,10 @@ else:
fluid.io.save_persistables(exe, "./fit_a_line.model/") fluid.io.save_persistables(exe, "./fit_a_line.model/")
fluid.io.load_persistables(exe, "./fit_a_line.model/") fluid.io.load_persistables(exe, "./fit_a_line.model/")
for data in train_reader(): for data in train_reader():
avg_loss_value, = exe.run(trainer_prog, avg_loss_value = exe.run(trainer_prog,
feed=feeder.feed(data), feed=feeder.feed(data),
fetch_list=[avg_cost]) fetch_list=[avg_cost])
print("loss:" + str(avg_loss_value))
if avg_loss_value[0] < 10.0: if avg_loss_value[0] < 10.0:
exit(0) exit(0)
exit(1) exit(1)
#Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved # Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
# #
#Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
#You may obtain a copy of the License at # You may obtain a copy of the License at
# #
# http://www.apache.org/licenses/LICENSE-2.0 # http://www.apache.org/licenses/LICENSE-2.0
# #
#Unless required by applicable law or agreed to in writing, software # Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS, # distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
#limitations under the License. # limitations under the License.
from __future__ import print_function from __future__ import print_function
import sys
import paddle.v2 as paddle import paddle.v2 as paddle
import paddle.v2.fluid as fluid import paddle.v2.fluid as fluid
import os import os
...@@ -106,10 +104,10 @@ if len(sys.argv) >= 2: ...@@ -106,10 +104,10 @@ if len(sys.argv) >= 2:
net_type = sys.argv[1] net_type = sys.argv[1]
if net_type == "vgg": if net_type == "vgg":
print("train vgg net") print("training vgg net")
net = vgg16_bn_drop(images) net = vgg16_bn_drop(images)
elif net_type == "resnet": elif net_type == "resnet":
print("train resnet") print("training resnet")
net = resnet_cifar10(images, 32) net = resnet_cifar10(images, 32)
else: else:
raise ValueError("%s network is not supported" % net_type) raise ValueError("%s network is not supported" % net_type)
...@@ -129,6 +127,7 @@ train_reader = paddle.batch( ...@@ -129,6 +127,7 @@ train_reader = paddle.batch(
batch_size=BATCH_SIZE) batch_size=BATCH_SIZE)
place = fluid.CPUPlace() place = fluid.CPUPlace()
feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
exe = fluid.Executor(place) exe = fluid.Executor(place)
t = fluid.DistributeTranspiler() t = fluid.DistributeTranspiler()
...@@ -146,17 +145,14 @@ if training_role == "PSERVER": ...@@ -146,17 +145,14 @@ if training_role == "PSERVER":
if not current_endpoint: if not current_endpoint:
print("need env SERVER_ENDPOINT") print("need env SERVER_ENDPOINT")
exit(1) exit(1)
print("start pserver at:", current_endpoint)
pserver_prog = t.get_pserver_program(current_endpoint) pserver_prog = t.get_pserver_program(current_endpoint)
pserver_startup = t.get_startup_program(current_endpoint, pserver_prog) pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
exe.run(pserver_startup) exe.run(pserver_startup)
exe.run(pserver_prog) exe.run(pserver_prog)
print("pserver run end")
elif training_role == "TRAINER": elif training_role == "TRAINER":
print("start trainer")
trainer_prog = t.get_trainer_program() trainer_prog = t.get_trainer_program()
feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
for pass_id in range(PASS_NUM): for pass_id in range(PASS_NUM):
accuracy.reset(exe) accuracy.reset(exe)
for data in train_reader(): for data in train_reader():
...@@ -164,9 +160,10 @@ elif training_role == "TRAINER": ...@@ -164,9 +160,10 @@ elif training_role == "TRAINER":
feed=feeder.feed(data), feed=feeder.feed(data),
fetch_list=[avg_cost] + accuracy.metrics) fetch_list=[avg_cost] + accuracy.metrics)
pass_acc = accuracy.eval(exe) pass_acc = accuracy.eval(exe)
print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str( print("pass_id:" + str(pass_id) + "loss:" + str(loss) + " pass_acc:"
pass_acc)) + str(pass_acc))
# this model is slow, so if we can train two mini batch, we think it works properly. # this model is slow, so if we can train two mini batches,
# we think it works properly.
print("trainer run end") print("trainer run end")
else: else:
print("environment var TRAINER_ROLE should be TRAINER os PSERVER") print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
import paddle.v2.fluid.core as core
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid.layers as layers
from paddle.v2.fluid.executor import Executor
import os
# Module-level configuration for the distributed encoder-decoder example.

# Vocabulary size; the source and target dictionaries use the same size.
dict_size = 30000
source_dict_dim = target_dict_dim = dict_size
# Fetched when the module is loaded (top-level call).
src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
hidden_dim = 32
word_dim = 16
# Passed as is_sparse to both embedding layers in encoder_decoder.
IS_SPARSE = True
batch_size = 10
# NOTE(review): max_length, topk_size and trg_dic_size are not referenced by
# the part of this file that is visible here -- confirm whether they are used.
max_length = 50
topk_size = 50
trg_dic_size = 10000
decoder_size = hidden_dim
def encoder_decoder():
    """Build the encoder-decoder graph and return the decoder output.

    Encoder: the ``src_word_id`` input is embedded, projected by a tanh fc
    layer of width ``hidden_dim * 4``, run through ``dynamic_lstm`` and
    reduced to its last step.  Decoder: the ``target_language_word`` input is
    embedded and consumed by a DynamicRNN whose memory starts from the
    encoder output; each step emits a softmax fc layer of width
    ``target_dict_dim``.  Both embeddings use the parameter named ``vemb``.

    Returns:
        The result of calling the DynamicRNN.
    """

    def shared_embedding(word_ids):
        return layers.embedding(
            input=word_ids,
            size=[dict_size, word_dim],
            dtype='float32',
            is_sparse=IS_SPARSE,
            param_attr=fluid.ParamAttr(name='vemb'))

    # encoder
    source_ids = layers.data(
        name="src_word_id", shape=[1], dtype='int64', lod_level=1)
    source_embedded = shared_embedding(source_ids)
    source_projection = fluid.layers.fc(
        input=source_embedded, size=hidden_dim * 4, act='tanh')
    encoder_hidden, _ = layers.dynamic_lstm(
        input=source_projection, size=hidden_dim * 4)
    encoder_out = layers.sequence_last_step(input=encoder_hidden)

    # decoder
    target_ids = layers.data(
        name="target_language_word", shape=[1], dtype='int64', lod_level=1)
    target_embedded = shared_embedding(target_ids)

    rnn = fluid.layers.DynamicRNN()
    with rnn.block():
        current_word = rnn.step_input(target_embedded)
        state = rnn.memory(init=encoder_out)
        next_state = fluid.layers.fc(input=[current_word, state],
                                     size=decoder_size,
                                     act='tanh')
        word_probs = fluid.layers.fc(
            input=next_state, size=target_dict_dim, act='softmax')
        rnn.update_memory(state, next_state)
        rnn.output(word_probs)

    return rnn()
def to_lodtensor(data, place):
    """Pack a list of sequences into a single-level ``core.LoDTensor``.

    Args:
        data: sequences to pack; each element must support ``len`` and be
            accepted by ``np.concatenate``.
        place: device place forwarded to ``LoDTensor.set``.

    Returns:
        A LoDTensor holding all elements as an int64 column vector, with one
        level of offsets marking where each sequence starts and ends.
    """
    # Cumulative offsets: [0, len(s0), len(s0) + len(s1), ...].
    offsets = [0]
    for sequence in data:
        offsets.append(offsets[-1] + len(sequence))

    flat = np.concatenate(data, axis=0).astype("int64")
    column = flat.reshape([len(flat), 1])

    tensor = core.LoDTensor()
    tensor.set(column, place)
    tensor.set_lod([offsets])
    return tensor
def main():
    """Run this script as a trainer or a parameter server.

    The role is read from the ``TRAINING_ROLE`` environment variable
    (default ``"TRAINER"``). ``PSERVERS`` supplies the parameter-server
    endpoints passed to the transpiler, and ``SERVER_ENDPOINT`` is this node's
    own endpoint (required when the role is ``"PSERVER"``).

    A trainer exits with status 0 once ``batch_id`` exceeds 3. A parameter
    server without ``SERVER_ENDPOINT`` exits with status 1. Any other role
    only prints a diagnostic.
    """
    rnn_out = encoder_decoder()
    label = layers.data(
        name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
    cost = layers.cross_entropy(input=rnn_out, label=label)
    avg_cost = fluid.layers.mean(x=cost)

    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
    optimize_ops, params_grads = optimizer.minimize(avg_cost)

    train_data = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
        batch_size=batch_size)

    place = core.CPUPlace()
    exe = Executor(place)

    t = fluid.DistributeTranspiler()
    # all parameter server endpoints list for splitting parameters
    pserver_endpoints = os.getenv("PSERVERS")
    # server endpoint for current node
    current_endpoint = os.getenv("SERVER_ENDPOINT")
    # run as trainer or parameter server
    training_role = os.getenv("TRAINING_ROLE", "TRAINER")
    t.transpile(
        optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)

    if training_role == "PSERVER":
        if not current_endpoint:
            print("need env SERVER_ENDPOINT")
            exit(1)
        pserver_prog = t.get_pserver_program(current_endpoint)
        pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
        exe.run(pserver_startup)
        exe.run(pserver_prog)
    elif training_role == "TRAINER":
        trainer_prog = t.get_trainer_program()
        exe.run(framework.default_startup_program())

        # batch_id counts across passes; it is not reset per pass.
        batch_id = 0
        for pass_id in range(2):
            for data in train_data():
                # Each sample is indexed as (source, target, next target).
                # List comprehensions keep these real lists on Python 2 and 3;
                # to_lodtensor iterates its input more than once.
                word_data = to_lodtensor([sample[0] for sample in data], place)
                trg_word = to_lodtensor([sample[1] for sample in data], place)
                trg_word_next = to_lodtensor(
                    [sample[2] for sample in data], place)
                outs = exe.run(trainer_prog,
                               feed={
                                   'src_word_id': word_data,
                                   'target_language_word': trg_word,
                                   'target_language_next_word': trg_word_next
                               },
                               fetch_list=[avg_cost])
                avg_cost_val = np.array(outs[0])
                print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
                      " avg_cost=" + str(avg_cost_val))
                # Smoke test only: stop after a handful of batches.
                if batch_id > 3:
                    exit(0)
                batch_id += 1
    else:
        # The variable read above is TRAINING_ROLE; name it correctly here.
        print("environment var TRAINING_ROLE should be TRAINER or PSERVER")


if __name__ == '__main__':
    main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import os
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
import paddle.v2.fluid.core as core
import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.nets as nets
from paddle.v2.fluid.optimizer import SGDOptimizer
# Forwarded as is_sparse to every embedding layer built in this script.
IS_SPARSE = True
# Mini-batch size handed to paddle.batch in main().
BATCH_SIZE = 256
# Upper bound on training passes; main() exits earlier once the fetched cost
# drops below 6.0.
PASS_NUM = 100
def get_usr_combined_features():
    """Build the user-side feature tower.

    Four integer inputs (``user_id``, ``gender_id``, ``age_id``, ``job_id``)
    are each embedded and passed through a fully-connected layer. The four
    results are concatenated along axis 1 and projected to 200 units with a
    tanh activation.

    Returns:
        The 200-unit combined user feature layer.
    """
    movielens = paddle.dataset.movielens

    # user id
    user_vocab = movielens.max_user_id() + 1
    user_id = layers.data(name='user_id', shape=[1], dtype='int64')
    user_emb = layers.embedding(
        input=user_id,
        dtype='float32',
        size=[user_vocab, 32],
        param_attr='user_table',
        is_sparse=IS_SPARSE)
    user_fc = layers.fc(input=user_emb, size=32)

    # gender
    gender_vocab = 2
    gender_id = layers.data(name='gender_id', shape=[1], dtype='int64')
    gender_emb = layers.embedding(
        input=gender_id,
        size=[gender_vocab, 16],
        param_attr='gender_table',
        is_sparse=IS_SPARSE)
    gender_fc = layers.fc(input=gender_emb, size=16)

    # age bucket
    age_vocab = len(movielens.age_table)
    age_id = layers.data(name='age_id', shape=[1], dtype="int64")
    age_emb = layers.embedding(
        input=age_id,
        size=[age_vocab, 16],
        is_sparse=IS_SPARSE,
        param_attr='age_table')
    age_fc = layers.fc(input=age_emb, size=16)

    # job
    job_vocab = movielens.max_job_id() + 1
    job_id = layers.data(name='job_id', shape=[1], dtype="int64")
    job_emb = layers.embedding(
        input=job_id,
        size=[job_vocab, 16],
        param_attr='job_table',
        is_sparse=IS_SPARSE)
    job_fc = layers.fc(input=job_emb, size=16)

    merged = layers.concat(
        input=[user_fc, gender_fc, age_fc, job_fc], axis=1)
    return layers.fc(input=merged, size=200, act="tanh")
def get_mov_combined_features():
    """Build the movie-side feature tower.

    ``movie_id`` is embedded and passed through a fully-connected layer.
    ``category_id`` is embedded and sum-pooled over the sequence.
    ``movie_title`` is embedded and fed to a sequence conv + sum pool. The
    three results are concatenated along axis 1 and projected to 200 units
    with a tanh activation.

    Returns:
        The 200-unit combined movie feature layer.
    """
    movielens = paddle.dataset.movielens

    # movie id
    movie_vocab = movielens.max_movie_id() + 1
    movie_id = layers.data(name='movie_id', shape=[1], dtype='int64')
    movie_emb = layers.embedding(
        input=movie_id,
        dtype='float32',
        size=[movie_vocab, 32],
        param_attr='movie_table',
        is_sparse=IS_SPARSE)
    movie_fc = layers.fc(input=movie_emb, size=32)

    # categories
    category_vocab = len(movielens.movie_categories())
    category_id = layers.data(name='category_id', shape=[1], dtype='int64')
    category_emb = layers.embedding(
        input=category_id, size=[category_vocab, 32], is_sparse=IS_SPARSE)
    category_pooled = layers.sequence_pool(
        input=category_emb, pool_type="sum")

    # title words
    title_vocab = len(movielens.get_movie_title_dict())
    title_id = layers.data(name='movie_title', shape=[1], dtype='int64')
    title_emb = layers.embedding(
        input=title_id, size=[title_vocab, 32], is_sparse=IS_SPARSE)
    title_conv = nets.sequence_conv_pool(
        input=title_emb,
        num_filters=32,
        filter_size=3,
        act="tanh",
        pool_type="sum")

    merged = layers.concat(
        input=[movie_fc, category_pooled, title_conv], axis=1)
    return layers.fc(input=merged, size=200, act="tanh")
def model():
    """Build the rating-prediction network and return its mean cost.

    The prediction is the cosine similarity of the user and movie feature
    towers scaled by 5.0. The cost is the mean squared error against the
    float32 ``score`` input.

    Returns:
        The mean of the squared-error cost layer.
    """
    user_features = get_usr_combined_features()
    movie_features = get_mov_combined_features()

    # need cos sim
    similarity = layers.cos_sim(X=user_features, Y=movie_features)
    predicted_score = layers.scale(x=similarity, scale=5.0)

    score = layers.data(name='score', shape=[1], dtype='float32')
    squared_error = layers.square_error_cost(
        input=predicted_score, label=score)
    return layers.mean(x=squared_error)
def func_feed(feeding, data, place):
    """Convert one mini-batch into a dict of LoDTensors keyed by input name.

    ``"category_id"`` and ``"movie_title"`` are treated as variable-length
    sequences: the per-sample arrays are concatenated and a single level of
    offsets is attached. Every other key is a dense column, float32 for
    ``"score"`` and int64 otherwise.

    Args:
        feeding: mapping from input name to the index of that field inside
            each sample of ``data``.
        data: sequence of samples; ``sample[idx]`` yields the field value.
        place: device place forwarded to ``LoDTensor.set``.

    Returns:
        dict mapping each key of ``feeding`` to a populated ``core.LoDTensor``.
    """
    sequence_keys = ("category_id", "movie_title")
    feed_tensors = {}
    # items() and list comprehensions work on both Python 2 and 3; the former
    # iteritems()/map() calls only worked on Python 2, where map returns a list.
    for key, idx in feeding.items():
        tensor = core.LoDTensor()
        if key not in sequence_keys:
            dtype = "float32" if key == "score" else "int64"
            numpy_data = np.array(
                [sample[idx] for sample in data]).astype(dtype)
        else:
            pieces = [np.array(sample[idx]).astype("int64") for sample in data]
            # Cumulative offsets delimiting each sample's sub-sequence.
            lod = [0]
            for piece in pieces:
                lod.append(lod[-1] + len(piece))
            numpy_data = np.concatenate(pieces, axis=0)
            tensor.set_lod([lod])

        # Every input is fed as a column vector.
        numpy_data = numpy_data.reshape([numpy_data.shape[0], 1])
        tensor.set(numpy_data, place)
        feed_tensors[key] = tensor
    return feed_tensors
def main():
    """Run this script as a trainer or a parameter server.

    The role is read from the ``TRAINING_ROLE`` environment variable
    (default ``"TRAINER"``). ``PSERVERS`` supplies the parameter-server
    endpoints passed to the transpiler, and ``SERVER_ENDPOINT`` is this node's
    own endpoint (required when the role is ``"PSERVER"``).

    A trainer exits with status 0 as soon as the fetched cost drops below
    6.0. A parameter server without ``SERVER_ENDPOINT`` exits with status 1.
    Any other role only prints a diagnostic.
    """
    cost = model()
    optimizer = SGDOptimizer(learning_rate=0.2)
    optimize_ops, params_grads = optimizer.minimize(cost)

    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.movielens.train(), buf_size=8192),
        batch_size=BATCH_SIZE)

    place = fluid.CPUPlace()
    exe = fluid.Executor(place)

    t = fluid.DistributeTranspiler()
    # all parameter server endpoints list for splitting parameters
    pserver_endpoints = os.getenv("PSERVERS")
    # server endpoint for current node
    current_endpoint = os.getenv("SERVER_ENDPOINT")
    # run as trainer or parameter server
    training_role = os.getenv("TRAINING_ROLE", "TRAINER")
    t.transpile(
        optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)

    if training_role == "PSERVER":
        if not current_endpoint:
            print("need env SERVER_ENDPOINT")
            exit(1)
        pserver_prog = t.get_pserver_program(current_endpoint)
        pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
        exe.run(pserver_startup)
        exe.run(pserver_prog)
    elif training_role == "TRAINER":
        exe.run(fluid.default_startup_program())
        trainer_prog = t.get_trainer_program()

        # Position of each named input inside a sample.
        feeding = {
            'user_id': 0,
            'gender_id': 1,
            'age_id': 2,
            'job_id': 3,
            'movie_id': 4,
            'category_id': 5,
            'movie_title': 6,
            'score': 7
        }

        for pass_id in range(PASS_NUM):
            for data in train_reader():
                outs = exe.run(trainer_prog,
                               feed=func_feed(feeding, data, place),
                               fetch_list=[cost])
                out = np.array(outs[0])
                print("cost=" + str(out[0]))
                if out[0] < 6.0:
                    print("Training complete. Average cost is less than 6.0.")
                    # if avg cost less than 6.0, we think our code is good.
                    exit(0)
    else:
        # The variable read above is TRAINING_ROLE; name it correctly here.
        print("environment var TRAINING_ROLE should be TRAINER or PSERVER")


if __name__ == '__main__':
    main()
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
# limitations under the License. # limitations under the License.
import numpy as np import numpy as np
import os
import paddle.v2 as paddle import paddle.v2 as paddle
import paddle.v2.fluid as fluid import paddle.v2.fluid as fluid
...@@ -50,9 +51,9 @@ def stacked_lstm_net(data, ...@@ -50,9 +51,9 @@ def stacked_lstm_net(data,
cost = fluid.layers.cross_entropy(input=prediction, label=label) cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(x=cost) avg_cost = fluid.layers.mean(x=cost)
adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002) adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
adam_optimizer.minimize(avg_cost) optimize_ops, params_grads = adam_optimizer.minimize(avg_cost)
accuracy = fluid.evaluator.Accuracy(input=prediction, label=label) accuracy = fluid.evaluator.Accuracy(input=prediction, label=label)
return avg_cost, accuracy, accuracy.metrics[0] return avg_cost, accuracy, accuracy.metrics[0], optimize_ops, params_grads
def to_lodtensor(data, place): def to_lodtensor(data, place):
...@@ -75,14 +76,14 @@ def main(): ...@@ -75,14 +76,14 @@ def main():
PASS_NUM = 5 PASS_NUM = 5
word_dict = paddle.dataset.imdb.word_dict() word_dict = paddle.dataset.imdb.word_dict()
print "load word dict successfully" print "loaded word dict successfully"
dict_dim = len(word_dict) dict_dim = len(word_dict)
class_dim = 2 class_dim = 2
data = fluid.layers.data( data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1) name="words", shape=[1], dtype="int64", lod_level=1)
label = fluid.layers.data(name="label", shape=[1], dtype="int64") label = fluid.layers.data(name="label", shape=[1], dtype="int64")
cost, accuracy, acc_out = stacked_lstm_net( cost, accuracy, acc_out, optimize_ops, params_grads = stacked_lstm_net(
data, label, input_dim=dict_dim, class_dim=class_dim) data, label, input_dim=dict_dim, class_dim=class_dim)
train_data = paddle.batch( train_data = paddle.batch(
...@@ -93,12 +94,32 @@ def main(): ...@@ -93,12 +94,32 @@ def main():
exe = fluid.Executor(place) exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=[data, label], place=place) feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
t = fluid.DistributeTranspiler()
# all parameter server endpoints list for spliting parameters
pserver_endpoints = os.getenv("PSERVERS")
# server endpoint for current node
current_endpoint = os.getenv("SERVER_ENDPOINT")
# run as trainer or parameter server
training_role = os.getenv(
"TRAINING_ROLE", "TRAINER") # get the training role: trainer/pserver
t.transpile(
optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
if training_role == "PSERVER":
if not current_endpoint:
print("need env SERVER_ENDPOINT")
exit(1)
pserver_prog = t.get_pserver_program(current_endpoint)
pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
exe.run(pserver_startup)
exe.run(pserver_prog)
elif training_role == "TRAINER":
exe.run(fluid.default_startup_program()) exe.run(fluid.default_startup_program())
trainer_prog = t.get_trainer_program()
for pass_id in xrange(PASS_NUM): for pass_id in xrange(PASS_NUM):
accuracy.reset(exe) accuracy.reset(exe)
for data in train_data(): for data in train_data():
cost_val, acc_val = exe.run(fluid.default_main_program(), cost_val, acc_val = exe.run(trainer_prog,
feed=feeder.feed(data), feed=feeder.feed(data),
fetch_list=[cost, acc_out]) fetch_list=[cost, acc_out])
pass_acc = accuracy.eval(exe) pass_acc = accuracy.eval(exe)
...@@ -106,7 +127,8 @@ def main(): ...@@ -106,7 +127,8 @@ def main():
" pass_acc=" + str(pass_acc)) " pass_acc=" + str(pass_acc))
if cost_val < 1.0 and acc_val > 0.8: if cost_val < 1.0 and acc_val > 0.8:
exit(0) exit(0)
exit(1) else:
print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -334,7 +334,7 @@ class OpTest(unittest.TestCase): ...@@ -334,7 +334,7 @@ class OpTest(unittest.TestCase):
def check_output(self, atol=1e-5): def check_output(self, atol=1e-5):
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compile_gpu() and core.op_support_gpu(self.op_type): if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
places.append(core.CUDAPlace(0)) places.append(core.CUDAPlace(0))
for place in places: for place in places:
self.check_output_with_place(place, atol) self.check_output_with_place(place, atol)
...@@ -367,7 +367,7 @@ class OpTest(unittest.TestCase): ...@@ -367,7 +367,7 @@ class OpTest(unittest.TestCase):
max_relative_error=0.005, max_relative_error=0.005,
user_defined_grads=None): user_defined_grads=None):
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compile_gpu() and core.op_support_gpu(self.op_type): if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
places.append(core.CUDAPlace(0)) places.append(core.CUDAPlace(0))
for place in places: for place in places:
self.check_grad_with_place(place, inputs_to_check, output_names, self.check_grad_with_place(place, inputs_to_check, output_names,
......
...@@ -186,8 +186,7 @@ class TestFloor(OpTest): ...@@ -186,8 +186,7 @@ class TestFloor(OpTest):
self.op_type = "floor" self.op_type = "floor"
x = np.random.uniform(-1, 1, [4, 4]).astype("float32") x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
self.inputs = {'X': x} self.inputs = {'X': x}
# numpy floor need +1 self.outputs = {'Out': np.floor(self.inputs['X'])}
self.outputs = {'Out': np.floor(self.inputs['X']) + 1.0}
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
......
...@@ -180,7 +180,7 @@ class TestSparseAdagradOp(unittest.TestCase): ...@@ -180,7 +180,7 @@ class TestSparseAdagradOp(unittest.TestCase):
def test_sparse_adagrad(self): def test_sparse_adagrad(self):
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compile_gpu(): if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0)) places.append(core.CUDAPlace(0))
for place in places: for place in places:
self.check_with_place(place) self.check_with_place(place)
......
...@@ -305,7 +305,7 @@ class TestSparseAdamOp(unittest.TestCase): ...@@ -305,7 +305,7 @@ class TestSparseAdamOp(unittest.TestCase):
def test_sparse_sgd(self): def test_sparse_sgd(self):
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compile_gpu(): if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0)) places.append(core.CUDAPlace(0))
for place in places: for place in places:
self.check_with_place(place) self.check_with_place(place)
......
...@@ -352,7 +352,7 @@ class TestBatchNormOp(OpTest): ...@@ -352,7 +352,7 @@ class TestBatchNormOp(OpTest):
print "op test backward passed: ", str(place), data_layout print "op test backward passed: ", str(place), data_layout
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compile_gpu() and core.op_support_gpu("batch_norm"): if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
places.append(core.CUDAPlace(0)) places.append(core.CUDAPlace(0))
for place in places: for place in places:
......
...@@ -21,7 +21,7 @@ class TestDropoutOp(OpTest): ...@@ -21,7 +21,7 @@ class TestDropoutOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "dropout" self.op_type = "dropout"
self.inputs = {'X': np.random.random((32, 64)).astype("float32")} self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
self.attrs = {'dropout_prob': 0.0, 'is_test': False} self.attrs = {'dropout_prob': 0.0, 'fix_seed': True, 'is_test': False}
self.outputs = { self.outputs = {
'Out': self.inputs['X'], 'Out': self.inputs['X'],
'Mask': np.ones((32, 64)).astype('float32') 'Mask': np.ones((32, 64)).astype('float32')
...@@ -38,7 +38,7 @@ class TestDropoutOp2(TestDropoutOp): ...@@ -38,7 +38,7 @@ class TestDropoutOp2(TestDropoutOp):
def setUp(self): def setUp(self):
self.op_type = "dropout" self.op_type = "dropout"
self.inputs = {'X': np.random.random((32, 64)).astype("float32")} self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
self.attrs = {'dropout_prob': 1.0, 'is_test': False} self.attrs = {'dropout_prob': 1.0, 'fix_seed': True, 'is_test': False}
self.outputs = { self.outputs = {
'Out': np.zeros((32, 64)).astype('float32'), 'Out': np.zeros((32, 64)).astype('float32'),
'Mask': np.zeros((32, 64)).astype('float32') 'Mask': np.zeros((32, 64)).astype('float32')
...@@ -49,7 +49,7 @@ class TestDropoutOp3(TestDropoutOp): ...@@ -49,7 +49,7 @@ class TestDropoutOp3(TestDropoutOp):
def setUp(self): def setUp(self):
self.op_type = "dropout" self.op_type = "dropout"
self.inputs = {'X': np.random.random((32, 64, 2)).astype("float32")} self.inputs = {'X': np.random.random((32, 64, 2)).astype("float32")}
self.attrs = {'dropout_prob': 0.0, 'is_test': False} self.attrs = {'dropout_prob': 0.0, 'fix_seed': True, 'is_test': False}
self.outputs = { self.outputs = {
'Out': self.inputs['X'], 'Out': self.inputs['X'],
'Mask': np.ones((32, 64, 2)).astype('float32') 'Mask': np.ones((32, 64, 2)).astype('float32')
...@@ -60,7 +60,7 @@ class TestDropoutOp4(OpTest): ...@@ -60,7 +60,7 @@ class TestDropoutOp4(OpTest):
def setUp(self): def setUp(self):
self.op_type = "dropout" self.op_type = "dropout"
self.inputs = {'X': np.random.random((32, 64)).astype("float32")} self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
self.attrs = {'dropout_prob': 0.35, 'is_test': True} self.attrs = {'dropout_prob': 0.35, 'fix_seed': True, 'is_test': True}
self.outputs = { self.outputs = {
'Out': self.inputs['X'] * (1.0 - self.attrs['dropout_prob']) 'Out': self.inputs['X'] * (1.0 - self.attrs['dropout_prob'])
} }
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
class TestElementwisePowOp(OpTest):
    """Check ``elementwise_pow`` against ``np.power`` on same-shape inputs."""

    def setUp(self):
        """Draw two float32 13x17 inputs from [0.1, 1) and the expected output."""
        base = np.random.uniform(0.1, 1, [13, 17]).astype("float32")
        exponent = np.random.uniform(0.1, 1, [13, 17]).astype("float32")
        self.op_type = "elementwise_pow"
        self.inputs = {'X': base, 'Y': exponent}
        self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}

    def test_check_output(self):
        """Compare the operator output with the NumPy reference."""
        self.check_output()
class TestElementwisePowOp_scalar(TestElementwisePowOp):
    """Same check with a one-element ``Y`` applied to a 2x3x4 ``X``."""

    def setUp(self):
        """Draw a float32 2x3x4 base and a single float32 exponent."""
        base = np.random.rand(2, 3, 4).astype('float32')
        exponent = np.random.rand(1).astype('float32')
        self.op_type = "elementwise_pow"
        self.inputs = {'X': base, 'Y': exponent}
        self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
# Run the test cases when this file is executed directly.
if __name__ == '__main__':
    unittest.main()
...@@ -33,7 +33,7 @@ class TestGaussianRandomOp(unittest.TestCase): ...@@ -33,7 +33,7 @@ class TestGaussianRandomOp(unittest.TestCase):
self.gaussian_random_test(place=fluid.CPUPlace()) self.gaussian_random_test(place=fluid.CPUPlace())
def test_gpu(self): def test_gpu(self):
if core.is_compile_gpu(): if core.is_compiled_with_cuda():
self.gaussian_random_test(place=fluid.CUDAPlace(0)) self.gaussian_random_test(place=fluid.CUDAPlace(0))
def gaussian_random_test(self, place): def gaussian_random_test(self, place):
......
文件模式从 100755 更改为 100644
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
class TestLabelSmoothOp(OpTest):
    """Check ``label_smooth`` with the uniform ``epsilon / label_dim`` term."""

    def config(self):
        """Set the op type, epsilon and a random 5x10 one-hot float64 label."""
        self.op_type = "label_smooth"
        self.epsilon = 0.1
        num_rows, self.label_dim = 5, 10
        one_hot = np.zeros((num_rows, self.label_dim)).astype("float64")
        # One randomly chosen column per row is set to 1.
        hot_columns = np.random.randint(self.label_dim, size=(num_rows))
        one_hot[np.arange(num_rows), hot_columns] = 1
        self.label = one_hot

    def setUp(self):
        """Compute the expected smoothed label and register op inputs/outputs."""
        self.config()
        expected = (1 - self.epsilon
                    ) * self.label + self.epsilon / self.label_dim
        self.inputs = {'X': self.label}
        self.attrs = {'epsilon': self.epsilon}
        self.outputs = {'Out': expected}

    def test_check_output(self):
        """Compare the operator output with the expected smoothed label."""
        self.check_output()

    def test_check_grad(self):
        """Check the gradient of ``Out`` with respect to ``X``."""
        self.check_grad(["X"], "Out")
class TestLabelSmoothOpWithPriorDist(TestLabelSmoothOp):
    """Same check, smoothing toward a random ``PriorDist`` row instead."""

    def setUp(self):
        """Draw a (1, label_dim) prior and compute the expected output."""
        self.config()
        prior = np.random.random((1, self.label_dim))
        expected = (1 - self.epsilon) * self.label + self.epsilon * prior
        self.inputs = {'X': self.label, 'PriorDist': prior}
        self.attrs = {'epsilon': self.epsilon}
        self.outputs = {'Out': expected}
# Run the test cases when this file is executed directly.
if __name__ == '__main__':
    unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from operator import mul
from op_test import OpTest
import paddle.v2.fluid.core as core
from paddle.v2.fluid.op import Operator
from paddle.v2.fluid.framework import grad_var_name
def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1):
x_shape = x.shape
N = reduce(mul, x_shape[0:begin_norm_axis], 1)
D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
x.shape = [N, D]
mean = np.mean(x, axis=1)
var = np.var(x, axis=1) + epsilon
output = scale.reshape([1, D]) * np.divide(
(x - mean.reshape([N, 1])),
(np.sqrt(var)).reshape([N, 1])) + beta.reshape([1, D])
x.shape, output.shape = x_shape, x_shape
return output, mean, var
def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1):
x_shape = x.shape
scale_shape = scale.shape
N = reduce(mul, x_shape[0:begin_norm_axis], 1)
D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
x.shape, grad_y.shape = [N, D], [N, D]
var.shape, mean.shape = [N, 1], [N, 1]
scale.shape = [1, D]
# d_bias
d_bias = np.sum(grad_y, axis=0).reshape([1, D])
# d_scale
d_scale = np.sum(((x - mean) * np.sqrt(1 / var)) * grad_y,
axis=0).reshape([1, D])
# dx
dx_end = scale * np.sqrt(1.0 / var) * grad_y
d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y * scale, axis=1).reshape(
[N, 1]) # the second part equals to zero.
d_mean = 1.0 / D * d_mean_0
d_std = np.sum(
-(1.0 / var) * (x - mean) * grad_y * scale, axis=1).reshape([N, 1]) * (
1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) * (x - mean))
grad_x = dx_end + d_mean + d_std
grad_y.shape = x_shape
x.shape = x_shape
scale.shape = scale_shape
return grad_x, d_scale, d_bias
def get_backward_op(scope, op, no_grad_set):
    """Build the backward operator of ``op`` and create its variables.

    Every input and output variable name of the backward operator is created
    in ``scope`` (via ``scope.var``) and its tensor is fetched once through
    ``get_tensor()``.

    Args:
        scope: scope in which the variables are created.
        op: forward operator.
        no_grad_set: forwarded unchanged to ``core.Operator.backward``.

    Returns:
        The backward operator.
    """
    backward_op = core.Operator.backward(op, no_grad_set)
    # Inputs first, then outputs, matching the original creation order.
    for names in (backward_op.input_vars(), backward_op.output_vars()):
        for var_name in names:
            scope.var(var_name).get_tensor()
    return backward_op
def create_or_get_tensor(scope, var_name, var, place):
    """Return the tensor of ``var_name`` in ``scope``, optionally filling it.

    Args:
        scope: scope holding (or receiving) the variable.
        var_name: name of the variable.
        var: ``None`` to leave the tensor untouched, otherwise an
            ``np.ndarray`` copied into the tensor.
        place: device place forwarded to ``tensor.set``.

    Returns:
        The tensor obtained from ``scope.var(var_name).get_tensor()``.
    """
    tensor = scope.var(var_name).get_tensor()
    if var is None:
        return tensor

    assert isinstance(var, np.ndarray)
    tensor.set_lod([[]])
    tensor.set_dims(var.shape)
    tensor.set(var, place)
    return tensor
def set_output_grad(scope, outputs, place, feed_dict=None):
    """Fill the gradient tensor of each named output.

    For every name in ``outputs`` the tensor of ``grad_var_name(name)`` is
    set to the matching entry of ``feed_dict``. When there is no entry, it is
    set to an array of ones with the shape and float type of the output
    tensor.

    Args:
        scope: scope holding the output variables; gradient variables are
            created in it through ``scope.var``.
        outputs: iterable of output variable names.
        place: device place forwarded to ``tensor.set``.
        feed_dict: optional mapping from output name to the gradient array.
            ``None`` (the default) is treated as an empty mapping; previously
            it raised ``TypeError`` on the membership test.

    Raises:
        ValueError: if a default gradient is needed for an output whose dtype
            is neither FP32 nor FP64.
    """
    if feed_dict is None:
        feed_dict = {}

    def __set_tensor__(name, data=None):
        out_tensor = scope.find_var(name).get_tensor()
        grad_tensor = scope.var(grad_var_name(name)).get_tensor()
        out_dtype = out_tensor.dtype()
        if data is None:
            if out_dtype == core.DataType.FP64:
                data = np.ones(out_tensor.shape(), dtype=np.float64)
            elif out_dtype == core.DataType.FP32:
                data = np.ones(out_tensor.shape(), dtype=np.float32)
            else:
                raise ValueError("Not supported data type " + str(out_dtype))
        grad_tensor.set(data, place)

    for output in outputs:
        __set_tensor__(output, feed_dict.get(output))
class TestLayerNormdOp(OpTest):
    """Compare the ``layer_norm`` operator with the NumPy reference.

    The forward outputs (``Y``, ``Mean``, ``Variance``) and the gradients of
    ``X``, ``Scale`` and ``Bias`` are checked on the CPU. They are also
    checked on CUDA device 0 when the build and the operator support it.
    """

    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
        """Assert ``tensor`` equals ``np_array`` within ``atol``."""
        self.assertTrue(
            np.allclose(
                np.array(tensor).reshape(np_array.shape), np_array, atol=atol),
            msg)

    def __assert_grad_close(self,
                            tensor,
                            np_array,
                            name,
                            place,
                            max_relative_error=0.02):
        """Assert the largest relative difference stays within the limit.

        The difference is taken relative to ``abs(tensor)``; entries smaller
        than 1e-5 are compared absolutely (their denominator is set to 1).
        """
        a = np.array(tensor).reshape(np_array.shape)
        b = np_array
        abs_a = np.abs(a)
        abs_a[abs_a < 1e-5] = 1

        diff_mat = np.abs(a - b) / abs_a
        max_diff = np.max(diff_mat)

        def err_msg():
            offset = np.argmax(diff_mat > max_relative_error)
            return ("%s Variable %s max gradient diff %f over limit %f, "
                    "the first error element is %d, %f, %f") % (
                        "Gradient Check On %s" % str(place), name, max_diff,
                        max_relative_error, offset, a.flatten()[offset],
                        b.flatten()[offset])

        self.assertLessEqual(max_diff, max_relative_error, err_msg())

    def check_forward_backward(self, shape, begin_norm_axis):
        """Run the forward and backward comparison on every available place.

        Args:
            shape: shape of the random input ``X``.
            begin_norm_axis: first normalised axis; must satisfy
                ``0 < begin_norm_axis < len(shape)``.
        """

        def test_with_place(place, shape, begin_norm_axis=1):
            # setUp
            assert begin_norm_axis > 0 and begin_norm_axis < len(
                shape), 'begin_norm_axis must be between 0 and len(shape)-1.'
            # attr
            epsilon = 0.00001
            x_shape = shape
            D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
            scale_shape = [D]

            # Seed the generator so every run sees the same data. The former
            # np.random.random(123) call only drew and discarded 123 samples.
            np.random.seed(123)
            x_val = np.random.random_sample(x_shape).astype(np.float32)
            scale_val = np.random.random_sample(scale_shape).astype(np.float32)
            bias_val = np.random.random_sample(scale_shape).astype(np.float32)
            y_grad = np.random.random_sample(x_shape).astype(np.float32)

            # run forward
            y_out, saved_mean, var_ref = _reference_layer_norm_naive(
                x_val, scale_val, bias_val, epsilon, begin_norm_axis)
            naive_fw = {"Y": y_out, "Mean": saved_mean, "Variance": var_ref}

            # get gradient
            x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_layer_norm_grad(
                x_val, y_grad, scale_val, saved_mean, var_ref, begin_norm_axis)
            naive_grad = {
                "X": x_grad_ref,
                "Scale": scale_grad_ref,
                "Bias": bias_grad_ref
            }

            scope = core.Scope()

            # create input
            input_map = {"X": x_val, "Scale": scale_val, "Bias": bias_val}
            for i_name in input_map:
                create_or_get_tensor(scope, i_name, input_map[i_name], place)

            # create output
            output_map = {"Y": None, "Mean": None, "Variance": None}
            output_tensor = {}
            for o_name in output_map:
                output_tensor[o_name] = create_or_get_tensor(
                    scope, o_name, output_map[o_name], place)

            layer_norm_op = Operator(
                "layer_norm",
                # inputs
                X="X",
                Scale="Scale",
                Bias="Bias",
                # outputs
                Y="Y",
                Mean="Mean",
                Variance="Variance",
                # attrs
                epsilon=epsilon,
                begin_norm_axis=begin_norm_axis)

            layer_norm_op.run(scope, place)

            # check forward result; a looser tolerance is used on CUDA
            atol = 5e-2 if isinstance(place, core.CUDAPlace) else 1e-4
            for o_tensor in output_tensor:
                self.__assert_close(output_tensor[o_tensor], naive_fw[o_tensor],
                                    o_tensor, atol)

            # run backward
            layer_norm_op_grad = get_backward_op(scope, layer_norm_op, set())
            set_output_grad(
                scope, ["Y", "Mean", "Variance"],
                place,
                feed_dict={"Y": y_grad})
            layer_norm_op_grad.run(scope, place)

            # get output
            grad_tensor = {}
            for o_name in naive_grad:
                grad_tensor[o_name] = create_or_get_tensor(
                    scope, grad_var_name(o_name), None, place)

            # check gradient output
            for o_grad in naive_grad:
                self.__assert_grad_close(grad_tensor[o_grad],
                                         naive_grad[o_grad], o_grad + "@GRAD",
                                         place)

        places = [core.CPUPlace()]
        if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"):
            places.append(core.CUDAPlace(0))

        for place in places:
            test_with_place(place, shape, begin_norm_axis)

    def test_check_forward_backward_with_scale_and_bias(self):
        """Check a 4-D input normalised from axis 1 and from axis 3."""
        self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1)
        self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3)

    def test_check_forward_backward_with_scale(self):
        pass  # TODO(zcd)

    def test_check_forward_backward_with_bias(self):
        pass  # TODO(zcd)

    def test_check_forward_backward(self):
        pass  # TODO(zcd)
# Run the test cases when this file is executed directly.
if __name__ == '__main__':
    unittest.main()
...@@ -202,6 +202,18 @@ class TestBook(unittest.TestCase): ...@@ -202,6 +202,18 @@ class TestBook(unittest.TestCase):
x_t=x_t, hidden_t_prev=prev_hidden, cell_t_prev=prev_cell)) x_t=x_t, hidden_t_prev=prev_hidden, cell_t_prev=prev_cell))
print(str(program)) print(str(program))
def test_dynamic_lstmp(self):
program = Program()
with program_guard(program):
hidden_dim, proj_dim = 16, 8
seq_data = layers.data(
name='seq_data', shape=[10, 10], dtype='float32', lod_level=1)
fc_out = layers.fc(input=seq_data, size=4 * hidden_dim)
self.assertIsNotNone(
layers.dynamic_lstmp(
input=fc_out, size=4 * hidden_dim, proj_size=proj_dim))
print(str(program))
def test_sequence_softmax(self): def test_sequence_softmax(self):
program = Program() program = Program()
with program_guard(program): with program_guard(program):
...@@ -271,6 +283,24 @@ class TestBook(unittest.TestCase): ...@@ -271,6 +283,24 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(avg_loss) self.assertIsNotNone(avg_loss)
print(str(default_main_program())) print(str(default_main_program()))
def test_row_conv(self):
program = Program()
with program_guard(program):
x = layers.data(name='x', shape=[16], dtype='float32', lod_level=1)
out = layers.row_conv(input=x, future_context_size=2)
self.assertIsNotNone(out)
print(str(program))
def test_multiplex(self):
program = Program()
with program_guard(program):
x1 = layers.data(name='x1', shape=[4], dtype='float32')
x2 = layers.data(name='x2', shape=[4], dtype='float32')
index = layers.data(name='index', shape=[1], dtype='int32')
out = layers.multiplex(inputs=[x1, x2], index=index)
self.assertIsNotNone(out)
print(str(program))
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import math
import paddle.v2.fluid.framework as framework
import paddle.v2.fluid as fluid
import paddle.v2.fluid.layers as layers
import paddle.v2.fluid.learning_rate_decay as lr_decay
def exponential_decay(learning_rate,
                      global_step,
                      decay_steps,
                      decay_rate,
                      staircase=False):
    """Pure-Python reference for exponential learning-rate decay.

    Computes ``learning_rate * decay_rate ** (global_step / decay_steps)``.
    With ``staircase`` set, the exponent is floored first, so the rate changes
    only once every ``decay_steps`` steps.
    """
    progress = float(global_step) / float(decay_steps)
    exponent = math.floor(progress) if staircase else progress
    return learning_rate * decay_rate**exponent
def natural_exp_decay(learning_rate,
                      global_step,
                      decay_steps,
                      decay_rate,
                      staircase=False):
    """Pure-Python reference for natural-exponential learning-rate decay.

    Returns ``learning_rate * exp(-decay_rate * global_step / decay_steps)``.

    :param learning_rate: initial learning rate.
    :param global_step: current step count.
    :param decay_steps: number of steps per decay period.
    :param decay_rate: multiplier applied to the (negated) exponent.
    :param staircase: when true, ``global_step / decay_steps`` is floored
        before it is used.
    """
    exponent = float(global_step) / float(decay_steps)
    if staircase:
        exponent = math.floor(exponent)
    return learning_rate * math.exp(-1 * decay_rate * exponent)
def inverse_time_decay(learning_rate,
                       global_step,
                       decay_steps,
                       decay_rate,
                       staircase=False):
    """Pure-Python reference for inverse-time learning-rate decay.

    Returns ``learning_rate / (1 + decay_rate * global_step / decay_steps)``.

    :param learning_rate: initial learning rate.
    :param global_step: current step count.
    :param decay_steps: number of steps per decay period.
    :param decay_rate: multiplier applied to the step ratio.
    :param staircase: when true, ``global_step / decay_steps`` is floored
        before it is used.
    """
    temp = float(global_step) / float(decay_steps)
    if staircase:
        temp = math.floor(temp)
    return learning_rate / (1 + decay_rate * temp)
class TestLearningRateDecay(unittest.TestCase):
    """Compare the fluid learning-rate decay functions with Python references."""

    def check_decay(self, python_decay_fn, fluid_decay_fn, staircase):
        """Run the decay program ten times and check each fetched rate.

        The rate fetched on the k-th run is compared (assertAlmostEqual)
        with ``python_decay_fn`` evaluated at ``global_step=k``.
        """
        # Schedule arguments shared by the fluid op and the Python reference.
        schedule = dict(
            learning_rate=1.0,
            decay_steps=5,
            decay_rate=0.5,
            staircase=staircase)

        step_var = layers.create_global_var(
            shape=[1], value=0.0, dtype='float32', persistable=True)
        decayed_lr = fluid_decay_fn(global_step=step_var, **schedule)
        layers.increment(step_var, 1.0)

        exe = fluid.Executor(fluid.CPUPlace())
        exe.run(fluid.default_startup_program())
        for step in range(10):
            # Both variables are fetched; only the learning rate is checked.
            _, lr_val = exe.run(fluid.default_main_program(),
                                feed=[],
                                fetch_list=[step_var, decayed_lr])
            expected_lr = python_decay_fn(global_step=step, **schedule)
            self.assertAlmostEqual(expected_lr, lr_val[0])

    def test_decay(self):
        """Check every decay function with staircase on and off.

        Each combination gets fresh main/startup programs.
        """
        fn_pairs = [
            (exponential_decay, lr_decay.exponential_decay),
            (natural_exp_decay, lr_decay.natural_exp_decay),
            (inverse_time_decay, lr_decay.inverse_time_decay),
        ]
        for py_decay_fn, fluid_decay_fn in fn_pairs:
            for staircase in (True, False):
                print("decay_fn=" + str(py_decay_fn) + " staircase=" + str(
                    staircase))
                main_program = framework.Program()
                startup_program = framework.Program()
                with framework.program_guard(main_program, startup_program):
                    self.check_decay(py_decay_fn, fluid_decay_fn, staircase)
if __name__ == '__main__':
unittest.main()
...@@ -42,7 +42,7 @@ def relu(x): ...@@ -42,7 +42,7 @@ def relu(x):
return np.maximum(x, 0) return np.maximum(x, 0)
ACTVATION = { ACTIVATION = {
'identity': identity, 'identity': identity,
'sigmoid': sigmoid, 'sigmoid': sigmoid,
'tanh': tanh, 'tanh': tanh,
...@@ -158,8 +158,8 @@ class TestLstmOp(OpTest): ...@@ -158,8 +158,8 @@ class TestLstmOp(OpTest):
w_b = b[:, 0:4 * self.D] w_b = b[:, 0:4 * self.D]
w_c = b[:, 4 * self.D:] if self.use_peepholes else None w_c = b[:, 4 * self.D:] if self.use_peepholes else None
h, c = lstm(x, self.lod, h0, c0, w, w_b, w_c, self.is_reverse, h, c = lstm(x, self.lod, h0, c0, w, w_b, w_c, self.is_reverse,
ACTVATION[self.act_gate], ACTVATION[self.act_cell], ACTIVATION[self.act_gate], ACTIVATION[self.act_cell],
ACTVATION[self.act_cand]) ACTIVATION[self.act_cand])
self.inputs = {'Input': (x, self.lod), 'Weight': w} self.inputs = {'Input': (x, self.lod), 'Weight': w}
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import unittest
import numpy as np
import test_lstm_op as LstmTest
# Maps each activation name to the same-named callable from test_lstm_op.
ACTIVATION = {
    act_name: getattr(LstmTest, act_name)
    for act_name in ('identity', 'sigmoid', 'tanh', 'relu')
}
# LSTM with recurrent projection Layer
def lstmp(
        input,  # T x 4D
        lod,  # 1 x N
        h0=None,  # N x D
        c0=None,  # N x D
        w_r=None,  # P x 4D
        w_rh=None,  # D x P
        w_b=None,  # 1 x 4D
        w_c=None,  # 1 x 3D
        is_reverse=False,
        act_gate=None,
        act_cell=None,
        act_cand=None,
        act_proj=None):
    """NumPy reference implementation of an LSTM with recurrent projection.

    Shapes follow the inline comments on the parameters (T total time steps,
    N sequences, D cell size, P projection size).

    :param input: pre-computed input contributions, one row per time step.
    :param lod: level-of-detail offsets; ``lod[0]`` holds the sequence
        boundaries into ``input``.
    :param h0: initial hidden state per sequence; zeros when ``None``.
    :param c0: initial cell state per sequence; zeros when ``None``.
    :param w_r: recurrent weight applied to the previous projection.
    :param w_rh: projection weight applied to the hidden state.
    :param w_b: optional bias added to every row of ``input``.
    :param w_c: optional peephole weights (input, forget, output thirds).
    :param is_reverse: process each sequence back to front.
    :param act_gate, act_cell, act_cand, act_proj: activation callables;
        all four must be supplied.
    :return: ``(projection, cell)`` as float64 arrays of shape T x P and
        T x D.
    """

    def _step(x, w_r, w_rh, w_c, r_pre, c_pre, act_gate, act_cell, act_cand,
              act_proj):
        g = np.dot(r_pre, w_r)  # 1 x 4D
        g = g + x
        g = np.reshape(g, (1, g.size))
        # Four equal chunks: candidate, then the i / f / o gate inputs.
        c, g_i, g_f, g_o = np.split(g, 4, axis=1)
        if w_c is None:
            g_i = act_gate(g_i)  # 1 x D
            g_f = act_gate(g_f)  # 1 x D
        else:
            w_ic, w_fc, w_oc = np.split(w_c, 3, axis=1)
            g_i = act_gate(g_i + w_ic * c_pre)  # 1 x D
            g_f = act_gate(g_f + w_fc * c_pre)  # 1 x D
        c = g_f * c_pre + g_i * act_cand(c)  # 1 x D

        # The output-gate peephole uses the *updated* cell state.
        if w_c is None:
            g_o = act_gate(g_o)  # 1 x D
        else:
            g_o = act_gate(g_o + w_oc * c)  # 1 x D
        h = g_o * act_cell(c)
        # projection
        r = np.dot(h, w_rh)
        r = act_proj(r)
        return r, c

    def _reverse(x, lod):
        # Flip the rows of every sequence independently.
        y = np.zeros_like(x)
        for i in range(len(lod) - 1):
            b, e = lod[i], lod[i + 1]
            y[b:e, :] = np.flip(x[b:e, :], 0)
        return y

    offset = lod[0]
    batch_size = len(offset) - 1
    hidden_size = input.shape[1] // 4

    # The signature advertises None defaults for the initial states; treat
    # them as all-zero states instead of failing on ``None[i]``.
    if h0 is None:
        h0 = np.zeros((batch_size, hidden_size))
    if c0 is None:
        c0 = np.zeros((batch_size, hidden_size))

    # recurrent projection state
    projection = []
    cell = []
    input = _reverse(input, offset) if is_reverse else input
    if w_b is not None:
        input = input + np.tile(w_b, (offset[-1], 1))
    for i in range(batch_size):
        # compute one sequence
        seq_len = offset[i + 1] - offset[i]
        x = input[offset[i]:offset[i + 1], :]
        r_pre = np.dot(h0[i], w_rh)  # 1 x P
        r_pre = act_proj(r_pre)
        c_pre = c0[i]  # 1 x D
        for j in range(seq_len):
            # compute one step
            r_pre, c_pre = _step(x[j], w_r, w_rh, w_c, r_pre, c_pre, act_gate,
                                 act_cell, act_cand, act_proj)
            projection.append(r_pre.flatten())
            cell.append(c_pre.flatten())

    projection = np.array(projection).astype('float64')
    cell = np.array(cell).astype('float64')

    projection = _reverse(projection, offset) if is_reverse else projection
    cell = _reverse(cell, offset) if is_reverse else cell

    assert projection.shape == (input.shape[0], w_r.shape[0])  # T x P
    # Floor division keeps the expected width an int on Python 3 as well.
    assert cell.shape == (input.shape[0], hidden_size)  # T x D
    return projection, cell
class TestLstmpOp(LstmTest.TestLstmOp):
    """Operator test for 'lstmp', checked against the NumPy `lstmp` reference."""

    def reset_argument(self):
        """Hook for subclasses to override test arguments; a no-op here."""
        pass

    def setUp(self):
        """Generate random inputs and the expected outputs for the op."""
        self.set_argument()
        # projection size
        self.P = 10
        self.act_proj = self.act_cell

        self.reset_argument()
        self.op_type = 'lstmp'

        seq_offsets = self.lod[0]
        total_steps = seq_offsets[-1]
        num_seqs = len(seq_offsets) - 1
        state_shape = (num_seqs, self.D)

        x = np.random.normal(size=(total_steps, 4 * self.D)).astype('float64')
        if self.has_initial_state:
            h0 = np.random.normal(size=state_shape).astype('float64')
            c0 = np.random.normal(size=state_shape).astype('float64')
        else:
            h0 = np.zeros(state_shape).astype('float64')
            c0 = np.zeros(state_shape).astype('float64')
        w = np.random.normal(size=(self.P, 4 * self.D)).astype('float64')

        # With peepholes the bias row carries 3*D extra peephole weights.
        bias_width = 7 * self.D if self.use_peepholes else 4 * self.D
        b = np.random.normal(size=(1, bias_width)).astype('float64')
        w_b = b[:, 0:4 * self.D]
        w_c = b[:, 4 * self.D:] if self.use_peepholes else None

        w_rh = np.random.normal(size=(self.D, self.P)).astype('float64')
        r, c = lstmp(x, self.lod, h0, c0, w, w_rh, w_b, w_c, self.is_reverse,
                     ACTIVATION[self.act_gate], ACTIVATION[self.act_cell],
                     ACTIVATION[self.act_cand], ACTIVATION[self.act_proj])

        self.inputs = {'Input': (x, self.lod), 'Weight': w, 'ProjWeight': w_rh}
        self.inputs['Bias'] = b
        if self.has_initial_state:
            self.inputs['H0'] = h0
            self.inputs['C0'] = c0

        self.outputs = {
            'Projection': (r, self.lod),
            'Cell': (c, self.lod),
        }
        self.attrs = {
            'use_peepholes': self.use_peepholes,
            'is_reverse': self.is_reverse,
            'gate_activation': self.act_gate,
            'cell_activation': self.act_cell,
            'candidate_activation': self.act_cand,
            'proj_activation': self.act_proj
        }

    def test_check_output(self):
        """Compare the op outputs with the reference at atol=1e-8."""
        self.check_output(atol=1e-8)

    def test_check_grad(self):
        """Gradient check of 'Projection' w.r.t. input, weights and bias."""
        # TODO(qingqing) remove following lines after the check_grad is refined.
        num_seqs = len(self.lod[0]) - 1
        placeholder_widths = (
            ('OrderedP0', self.P),
            ('BatchGate', 4 * self.D),
            ('BatchHidden', self.D),
            ('BatchCellPreAct', self.D), )
        for slot, width in placeholder_widths:
            self.outputs[slot] = np.zeros((num_seqs, width)).astype('float64')
        self.check_grad(
            ['Input', 'Weight', 'ProjWeight', 'Bias'], ['Projection'],
            max_relative_error=1e-2)
class TestLstmpOpHasInitial(TestLstmpOp):
    """lstmp tests with `has_initial_state` enabled (H0/C0 fed as inputs).

    Method names keep their original spelling so that selecting tests by
    name keeps working.
    """

    def reset_argument(self):
        self.has_initial_state = True

    def _add_placeholder_outputs(self):
        """Register zero-filled entries for the op's intermediate outputs."""
        # TODO(qingqing) remove following lines after the check_grad is refined.
        N = len(self.lod[0]) - 1
        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
        self.outputs['BatchCellPreAct'] = np.zeros(
            (N, self.D)).astype('float64')

    def test_check_grad(self):
        self._add_placeholder_outputs()
        self.check_grad(
            ['Input', 'Weight', 'ProjWeight', 'Bias', 'H0', 'C0'],
            ['Projection'],
            max_relative_error=1e-2)

    # NOTE: the no_grad_set arguments below were written as set('Bias') etc.,
    # which builds a set of single characters ({'B', 'i', 'a', 's'}) rather
    # than a set holding the variable name. Set literals fix that.

    def test_check_grad_ingore_bias(self):
        self._add_placeholder_outputs()
        self.check_grad(
            ['Input', 'ProjWeight', 'Weight'], ['Projection'],
            max_relative_error=1e-2,
            no_grad_set={'Bias'})

    def test_check_grad_ingore_weight(self):
        self._add_placeholder_outputs()
        self.check_grad(
            ['Input', 'ProjWeight', 'Bias'], ['Projection'],
            max_relative_error=1e-2,
            no_grad_set={'Weight'})

    def test_check_grad_ingore_proj_weight(self):
        self._add_placeholder_outputs()
        self.check_grad(
            ['Input', 'Weight', 'Bias'], ['Projection'],
            max_relative_error=1e-2,
            no_grad_set={'ProjWeight'})

    def test_check_grad_ingore_input(self):
        self._add_placeholder_outputs()
        self.check_grad(
            ['Weight', 'ProjWeight', 'Bias'], ['Projection'],
            max_relative_error=1e-2,
            no_grad_set={'Input'})

    def test_check_grad_ingore_h0(self):
        self._add_placeholder_outputs()
        self.check_grad(
            ['Input', 'Weight', 'ProjWeight', 'Bias', 'C0'], ['Projection'],
            max_relative_error=1e-2,
            no_grad_set={'H0'})

    def test_check_grad_ingore_c0(self):
        self._add_placeholder_outputs()
        self.check_grad(
            ['Input', 'Weight', 'ProjWeight', 'Bias', 'H0'], ['Projection'],
            max_relative_error=1e-2,
            no_grad_set={'C0'})
class TestLstmpOpRerverse(TestLstmpOp):
    """Runs the inherited lstmp tests with `is_reverse` switched on."""

    def reset_argument(self):
        self.is_reverse = True
class TestLstmpOpNotUsePeepholes(TestLstmpOp):
    """Runs the inherited lstmp tests with `use_peepholes` switched off."""

    def reset_argument(self):
        self.use_peepholes = False
class TestLstmpOpLinearProjection(TestLstmpOp):
    """Runs the inherited lstmp tests with an 'identity' projection activation."""

    def reset_argument(self):
        self.act_proj = 'identity'
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle.v2.fluid as fluid
import paddle.v2.fluid.core as core
import numpy as np
class TestMultiheadAttention(unittest.TestCase):
    """Smoke test: build and run `fluid.nets.scaled_dot_product_attention`."""

    def gen_random_input(self):
        """Generate random input data.
        """
        # batch_size, max_sequence_length, hidden dimension
        self.input_shape = (3, 13, 16)
        shape = self.input_shape
        self.queries = np.random.random(size=shape).astype("float32")
        self.keys = np.random.random(size=shape).astype("float32")

    def set_program(self):
        """Build the test program.
        """
        query_var = fluid.layers.data(
            name="queries",
            shape=self.input_shape,
            dtype="float32",
            append_batch_size=False)
        query_var.stop_gradient = False
        key_var = fluid.layers.data(
            name="keys",
            shape=self.input_shape,
            dtype="float32",
            append_batch_size=False)
        key_var.stop_gradient = False

        # The keys variable is passed as both `keys` and `values`.
        contexts = fluid.nets.scaled_dot_product_attention(
            queries=query_var,
            keys=key_var,
            values=key_var,
            num_heads=8,
            dropout_rate=0.)
        summed = fluid.layers.reduce_sum(contexts, dim=None)
        fluid.backward.append_backward(loss=summed)

        self.fetch_list = [contexts]

    def run_program(self):
        """Run the test program.
        """
        places = [core.CPUPlace()]
        if core.is_compiled_with_cuda():
            places.append(core.CUDAPlace(0))

        for place in places:
            self.set_inputs(place)
            exe = fluid.Executor(place)

            exe.run(fluid.default_startup_program())
            self.op_output = exe.run(fluid.default_main_program(),
                                     feed=self.inputs,
                                     fetch_list=self.fetch_list,
                                     return_numpy=True)

    def set_inputs(self, place):
        """Set the randomly generated data to the test program.
        """
        self.inputs = {}
        query_tensor = fluid.Tensor()
        query_tensor.set(self.queries, place)

        key_tensor = fluid.Tensor()
        key_tensor.set(self.keys, place)

        self.inputs["keys"] = key_tensor
        self.inputs["queries"] = query_tensor

    def test_multihead_attention(self):
        """Generate inputs, build the program and run it on every place."""
        self.gen_random_input()

        self.set_program()
        self.run_program()

        # FIXME(caoying): add more meaningful unit tests.
if __name__ == '__main__':
unittest.main()
...@@ -46,7 +46,7 @@ class TestNormalization(unittest.TestCase): ...@@ -46,7 +46,7 @@ class TestNormalization(unittest.TestCase):
"""Run the test program. """Run the test program.
""" """
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compile_gpu(): if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0)) places.append(core.CUDAPlace(0))
for place in places: for place in places:
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import math
from op_test import OpTest
import paddle.v2.fluid as fluid
import paddle.v2.fluid.core as core
import paddle.v2.fluid.framework as framework
from paddle.v2.fluid.framework import Program, program_guard
class TestOneHotOp(OpTest):
    """Check 'one_hot' output when the FP32 dtype attribute is given."""

    def setUp(self):
        self.op_type = 'one_hot'
        depth = 10
        x_lod = [[0, 4, 5, 8, 11]]
        num_rows = x_lod[0][-1]
        # `range` (not the Python-2-only `xrange`) runs on both 2 and 3.
        x = [np.random.randint(0, depth - 1) for _ in range(num_rows)]
        x = np.array(x).astype('int').reshape([num_rows, 1])

        # Expected result: one row per index, 1.0 in the indexed column.
        out = np.zeros(shape=(num_rows, depth)).astype('float32')
        out[np.arange(num_rows), x[:, 0]] = 1.0

        self.inputs = {'X': (x, x_lod)}
        self.attrs = {'depth': depth, 'dtype': int(core.DataType.FP32)}
        self.outputs = {'Out': (out, x_lod)}

    def test_check_output(self):
        self.check_output()
class TestOneHotOp_default_dtype(OpTest):
    """Check 'one_hot' output when no 'dtype' attribute is given."""

    def setUp(self):
        self.op_type = 'one_hot'
        depth = 10
        x_lod = [[0, 4, 5, 8, 11]]
        num_rows = x_lod[0][-1]
        # `range` (not the Python-2-only `xrange`) runs on both 2 and 3.
        x = [np.random.randint(0, depth - 1) for _ in range(num_rows)]
        x = np.array(x).astype('int').reshape([num_rows, 1])

        # Expected result: one row per index, 1.0 in the indexed column.
        out = np.zeros(shape=(num_rows, depth)).astype('float32')
        out[np.arange(num_rows), x[:, 0]] = 1.0

        self.inputs = {'X': (x, x_lod)}
        self.attrs = {'depth': depth}
        self.outputs = {'Out': (out, x_lod)}

    def test_check_output(self):
        self.check_output()
class TestOneHotOp_exception(OpTest):
    """Check that 'one_hot' raises EnforceNotMet for indices beyond `depth`."""

    def setUp(self):
        self.op_type = 'one_hot'
        self.depth = 10
        self.place = core.CPUPlace()
        self.dimension = 12
        self.x = core.LoDTensor()
        x_lod = [[0, 4, 5, 8, 11]]
        num_rows = x_lod[0][-1]
        # Indices are drawn from [11, 20), i.e. all larger than depth == 10.
        # `range` (not the Python-2-only `xrange`) runs on both 2 and 3.
        data = [np.random.randint(11, 20) for _ in range(num_rows)]
        data = np.array(data).astype('int').reshape([num_rows, 1])
        self.x.set(data, self.place)
        self.x.set_lod(x_lod)

    def test_check_output(self):
        program = Program()
        with program_guard(program):
            x = fluid.layers.data(
                name='x', shape=[self.dimension], dtype='float32', lod_level=1)
            block = program.current_block()
            one_hot_out = block.create_var(
                name="one_hot_out",
                type=core.VarDesc.VarType.LOD_TENSOR,
                dtype='float32')
            block.append_op(
                type='one_hot',
                inputs={'X': x},
                attrs={'depth': self.depth},
                outputs={'Out': one_hot_out})
            exe = fluid.Executor(self.place)

            def run():
                exe.run(feed={'x': self.x},
                        fetch_list=[one_hot_out],
                        return_numpy=False)

            self.assertRaises(core.EnforceNotMet, run)
if __name__ == '__main__':
unittest.main()
...@@ -18,7 +18,8 @@ import paddle.v2.fluid.core as core ...@@ -18,7 +18,8 @@ import paddle.v2.fluid.core as core
class TestOpSupportGPU(unittest.TestCase): class TestOpSupportGPU(unittest.TestCase):
def test_case(self): def test_case(self):
self.assertEqual(core.is_compile_gpu(), core.op_support_gpu("sum")) self.assertEqual(core.is_compiled_with_cuda(),
core.op_support_gpu("sum"))
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -53,7 +53,7 @@ class BaseParallelForTest(unittest.TestCase): ...@@ -53,7 +53,7 @@ class BaseParallelForTest(unittest.TestCase):
fetch=fetch, fetch=fetch,
place=cpu, place=cpu,
use_parallel=True) use_parallel=True)
if fluid.core.is_compile_gpu(): if fluid.core.is_compiled_with_cuda():
gpu = fluid.CUDAPlace(0) gpu = fluid.CUDAPlace(0)
result_gpu = self._run_test_impl_( result_gpu = self._run_test_impl_(
callback=callback, callback=callback,
...@@ -159,7 +159,7 @@ class ParallelOpTest(BaseParallelForTest): ...@@ -159,7 +159,7 @@ class ParallelOpTest(BaseParallelForTest):
def test_simple_fc(self): def test_simple_fc(self):
self.run_test( self.run_test(
callback=ParallelOpTest.__network__, callback=self.__network__,
feed={ feed={
'img': numpy.random.random(size=(51, 784)).astype('float32') 'img': numpy.random.random(size=(51, 784)).astype('float32')
}, },
...@@ -167,10 +167,35 @@ class ParallelOpTest(BaseParallelForTest): ...@@ -167,10 +167,35 @@ class ParallelOpTest(BaseParallelForTest):
def test_fc_with_tiny_data(self): def test_fc_with_tiny_data(self):
self.run_test( self.run_test(
callback=ParallelOpTest.__network__, callback=self.__network__,
feed={'img': numpy.random.random(size=(1, 784)).astype('float32')}, feed={'img': numpy.random.random(size=(1, 784)).astype('float32')},
fetch=['fc1.w@GRAD']) fetch=['fc1.w@GRAD'])
class ParallelOpTestMultipleInput(BaseParallelForTest):
    """Parallel-for test whose network takes two input tensors."""

    @staticmethod
    def __network__():
        """Generator: yields the two data layers, then the mean loss."""
        first, second = [
            fluid.layers.data(
                shape=[784],
                dtype='float32',
                name=input_name,
                stop_gradient=False) for input_name in ('img1', 'img2')
        ]
        yield [first, second]

        # Sum the inputs, then stack three 200-unit fc layers.
        hidden = first + second
        for weight_name in ('fc1.w', 'fc2.w', 'fc3.w'):
            hidden = fluid.layers.fc(
                input=hidden, size=200, param_attr=weight_name)
        yield fluid.layers.mean(x=hidden)

    def test_simple_fc(self):
        """Run the network on random 51x784 inputs and fetch the fc gradients."""
        batch_shape = (51, 784)
        feed = {}
        for input_name in ('img1', 'img2'):
            feed[input_name] = numpy.random.random(
                size=batch_shape).astype('float32')
        self.run_test(
            callback=self.__network__,
            feed=feed,
            fetch=['fc1.w@GRAD', 'fc2.w@GRAD', 'fc3.w@GRAD'])
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -13,16 +13,17 @@ ...@@ -13,16 +13,17 @@
# limitations under the License. # limitations under the License.
import unittest import unittest
import os
import numpy as np import numpy as np
import paddle.v2.fluid as fluid import paddle.v2.fluid as fluid
import paddle.v2.fluid.profiler as profiler import paddle.v2.fluid.profiler as profiler
import paddle.v2.fluid.layers as layers import paddle.v2.fluid.layers as layers
import os import paddle.v2.fluid.core as core
class TestProfiler(unittest.TestCase): class TestProfiler(unittest.TestCase):
def test_nvprof(self): def test_nvprof(self):
if not fluid.core.is_compile_gpu(): if not fluid.core.is_compiled_with_cuda():
return return
epoc = 8 epoc = 8
dshape = [4, 3, 28, 28] dshape = [4, 3, 28, 28]
...@@ -40,6 +41,50 @@ class TestProfiler(unittest.TestCase): ...@@ -40,6 +41,50 @@ class TestProfiler(unittest.TestCase):
exe.run(fluid.default_main_program(), feed={'data': input}) exe.run(fluid.default_main_program(), feed={'data': input})
os.remove(output_file) os.remove(output_file)
def net_profiler(self, state):
    """Train a small fc network for ten iterations under the profiler.

    :param state: 'CPU' or 'GPU'; passed to `profiler.profiler` and used to
        pick the execution place. The 'GPU' case returns immediately when
        the build has no CUDA support.
    """
    if state == 'GPU' and not core.is_compiled_with_cuda():
        return
    startup_program = fluid.Program()
    main_program = fluid.Program()

    with fluid.program_guard(main_program, startup_program):
        image = fluid.layers.data(name='x', shape=[784], dtype='float32')
        hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
        hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
        predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
        label = fluid.layers.data(name='y', shape=[1], dtype='int64')
        cost = fluid.layers.cross_entropy(input=predict, label=label)
        avg_cost = fluid.layers.mean(x=cost)
        accuracy = fluid.evaluator.Accuracy(input=predict, label=label)

    optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
    # The return value of minimize() is not needed here.
    optimizer.minimize(avg_cost, startup_program=startup_program)

    place = fluid.CPUPlace() if state == 'CPU' else fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(startup_program)

    accuracy.reset(exe)
    with profiler.profiler(state, 'total'):
        # `step` instead of `iter`, which shadowed the builtin.
        for step in range(10):
            if step == 2:
                profiler.reset_profiler()
            x = np.random.random((32, 784)).astype("float32")
            y = np.random.randint(0, 10, (32, 1)).astype("int64")

            outs = exe.run(main_program,
                           feed={'x': x,
                                 'y': y},
                           fetch_list=[avg_cost] + accuracy.metrics)
            # Results are computed but not asserted on.
            acc = np.array(outs[1])
            pass_acc = accuracy.eval(exe)
def test_cpu_profiler(self):
    """Run the profiled network with state 'CPU'."""
    self.net_profiler('CPU')
def test_cuda_profiler(self):
    """Run the profiled network with state 'GPU'."""
    self.net_profiler('GPU')
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import paddle.v2.fluid as fluid
import paddle.v2.fluid.layers as layers
import numpy
from multiprocessing import Process
import os, sys
class TestRecvOp(unittest.TestCase):
    """Start a ListenAndServ program in a child process and Send to it."""

    def test_send(self):
        # Run init_serv in a separate process
        place = fluid.CPUPlace()
        p = Process(target=self.init_serv, args=(place, ))
        p.daemon = True
        p.start()
        self.init_client(place)
        # FIXME(typhoonzero): find a way to gracefully shutdown the server.
        os.system("kill -9 %d" % p.pid)
        p.join()

    def init_serv(self, place):
        """Build and run the server program listening on 127.0.0.1:6174."""
        main = fluid.Program()

        with fluid.program_guard(main):
            x = layers.data(
                shape=[32, 32],
                dtype='float32',
                name="X",
                append_batch_size=False)
            fluid.initializer.Constant(value=1.0)(x, main.global_block())
            serv = layers.ListenAndServ("127.0.0.1:6174", optimizer_mode=False)
            with serv.do():
                o = layers.scale(x=x, scale=10.0)
            # The keyword was misspelled `psersistable`, so the intended
            # persistable=False never reached the variable.
            main.global_block().create_var(
                name=o.name, persistable=False, dtype=o.dtype, shape=o.shape)
        exe = fluid.Executor(place)
        exe.run(main)

    def init_client(self, place):
        """Build and run the client program that sends X to the server."""
        main = fluid.Program()
        with fluid.program_guard(main):
            x = layers.data(
                shape=[32, 32],
                dtype='float32',
                name='X',
                append_batch_size=False)
            fluid.initializer.Constant(value=1.0)(x, main.global_block())
            layers.Send("127.0.0.1:6174", [x], [x])
        exe = fluid.Executor(place)
        exe.run(main)
if __name__ == "__main__":
unittest.main()
...@@ -45,7 +45,7 @@ class TestReorderLoDTensor(unittest.TestCase): ...@@ -45,7 +45,7 @@ class TestReorderLoDTensor(unittest.TestCase):
outputs = [] outputs = []
input_grads = [] input_grads = []
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compile_gpu(): if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0)) places.append(core.CUDAPlace(0))
for place in places: for place in places:
self.set_inputs(place) self.set_inputs(place)
......
...@@ -91,7 +91,7 @@ class TestSparseSGDOp(unittest.TestCase): ...@@ -91,7 +91,7 @@ class TestSparseSGDOp(unittest.TestCase):
def test_sparse_sgd(self): def test_sparse_sgd(self):
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compile_gpu(): if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0)) places.append(core.CUDAPlace(0))
for place in places: for place in places:
self.check_with_place(place) self.check_with_place(place)
......
...@@ -21,7 +21,7 @@ from paddle.v2.fluid.op import Operator ...@@ -21,7 +21,7 @@ from paddle.v2.fluid.op import Operator
class TestSpliteSelectedRows(unittest.TestCase): class TestSpliteSelectedRows(unittest.TestCase):
def get_places(self): def get_places(self):
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compile_gpu(): if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0)) places.append(core.CUDAPlace(0))
return places return places
......
...@@ -108,9 +108,31 @@ class TestTensor(unittest.TestCase): ...@@ -108,9 +108,31 @@ class TestTensor(unittest.TestCase):
scope = core.Scope() scope = core.Scope()
place = core.CPUPlace() place = core.CPUPlace()
lod_py = [[0, 2, 5], [0, 2, 4, 5]] lod_py = [[0, 2, 5], [0, 2, 4, 5]]
lod_tensor = core.LoDTensor(lod_py) lod_tensor = core.LoDTensor()
lod_tensor.set_dims([5, 2, 3, 4]) lod_tensor.set_dims([5, 2, 3, 4])
lod_tensor.set_lod(lod_py)
lod_tensor.alloc_float(place)
tensor_array = numpy.array(lod_tensor)
tensor_array[0, 0, 0, 0] = 1.0
tensor_array[0, 0, 0, 1] = 2.0
lod_tensor.set(tensor_array, place)
lod_v = numpy.array(lod_tensor)
self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0])
self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1])
self.assertListEqual(lod_py, lod_tensor.lod())
def test_lod_tensor_gpu_init(self):
if not core.is_compiled_with_cuda():
return
scope = core.Scope()
place = core.CUDAPlace(0)
lod_py = [[0, 2, 5], [0, 2, 4, 5]]
lod_tensor = core.LoDTensor()
lod_tensor.set_dims([5, 2, 3, 4])
lod_tensor.set_lod(lod_py)
lod_tensor.alloc_float(place) lod_tensor.alloc_float(place)
tensor_array = numpy.array(lod_tensor) tensor_array = numpy.array(lod_tensor)
tensor_array[0, 0, 0, 0] = 1.0 tensor_array[0, 0, 0, 0] = 1.0
......
...@@ -36,7 +36,7 @@ class TestUniformRandomOp(unittest.TestCase): ...@@ -36,7 +36,7 @@ class TestUniformRandomOp(unittest.TestCase):
self.uniform_random_test(place=core.CPUPlace()) self.uniform_random_test(place=core.CPUPlace())
def test_gpu(self): def test_gpu(self):
if core.is_compile_gpu(): if core.is_compiled_with_cuda():
self.uniform_random_test(place=core.CUDAPlace(0)) self.uniform_random_test(place=core.CUDAPlace(0))
def uniform_random_test(self, place): def uniform_random_test(self, place):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy
import collections
import paddle.v2.fluid as fluid
import paddle.v2.fluid.core as core
from paddle.v2.fluid.initializer import ConstantInitializer
from paddle.v2.fluid.param_attr import WeightNormParamAttr
class TestWeightNormalization(unittest.TestCase):
    """Check an fc layer using WeightNormParamAttr against a NumPy reference."""

    batch_size = 3
    hidden_size = 5
    # One entry per input: name, shape, LoD level.
    data_desc = (['x', [10], 0], )

    @classmethod
    def setUpClass(cls):
        cls.set_program()

    @classmethod
    def set_program(cls):
        """Build the fc + reduce_sum program and record what to fetch."""
        data = fluid.layers.data(
            name=cls.data_desc[0][0], shape=cls.data_desc[0][1])
        out = fluid.layers.fc(input=data,
                              size=cls.hidden_size,
                              param_attr=WeightNormParamAttr(
                                  dim=None,
                                  name='weight_norm_param',
                                  initializer=ConstantInitializer(1.0)),
                              bias_attr=False,
                              act=None)
        loss = fluid.layers.reduce_sum(out)
        fluid.backward.append_backward(loss=loss)
        cls.fetch_list = [
            'weight_norm_param_g', 'weight_norm_param_v',
            'weight_norm_param_g@GRAD'
        ]

    def run_program(self):
        """Run the program on each available place and collect the outputs."""
        outputs = []
        places = [core.CPUPlace()]
        if core.is_compiled_with_cuda():
            places.append(core.CUDAPlace(0))
        for place in places:
            self.set_inputs(place)
            exe = fluid.Executor(place)
            exe.run(fluid.default_startup_program())
            output = exe.run(fluid.default_main_program(),
                             feed=self.inputs,
                             fetch_list=self.fetch_list,
                             return_numpy=False)
            outputs.append(output)
        self.actual_outputs = outputs

    def set_data(self):
        """Generate random values (and LoD, when requested) for each input."""
        self.data = collections.OrderedDict()
        for desc in self.data_desc:
            data_name = desc[0]
            data_shape = desc[1]
            data_lod_level = desc[2]
            data_lod = []
            for i in range(data_lod_level):
                # Level 0 has batch_size entries; deeper levels have one
                # entry per element of the previous level.
                lod_level_i = numpy.random.randint(
                    low=1,
                    high=5,
                    size=self.batch_size if i == 0 else lod_level_i[-1])
                lod_level_i = [0] + numpy.cumsum(lod_level_i).tolist()
                data_lod.append(lod_level_i)
            data_value = numpy.random.random(
                size=[data_lod[-1][-1] if data_lod else self.batch_size
                      ] + data_shape).astype('float32')
            self.data[data_name] = (data_value, data_lod)

    def set_inputs(self, place):
        """Wrap the generated data into tensors placed on `place`."""
        self.inputs = {}
        for desc in self.data_desc:
            tensor = fluid.Tensor()
            tensor.set(self.data[desc[0]][0], place)
            if self.data[desc[0]][1]:
                tensor.set_lod(self.data[desc[0]][1])
            self.inputs[desc[0]] = tensor

    def weight_normalize(self):
        """Compute the expected g, v and gradient of g with NumPy."""
        v = numpy.ones((self.data[self.data_desc[0][0]][0].shape[-1],
                        self.hidden_size))
        g = numpy.linalg.norm(v, axis=None, keepdims=True)
        w = g * v / numpy.linalg.norm(v, axis=None, keepdims=True)
        x = self.data[self.data_desc[0][0]][0]
        out = numpy.dot(x, w)
        g_grad = (numpy.dot(x.T, numpy.ones_like(out)) * (v / numpy.linalg.norm(
            v, axis=None, keepdims=True))).sum(axis=None, keepdims=True)
        return g, v, g_grad

    def test_weight_normalization(self):
        self.set_data()
        self.run_program()
        expect_output = self.weight_normalize()
        # Plain loops: the comprehension used before existed only for its
        # assertion side effects and built a throwaway list.
        for actual_output in self.actual_outputs:
            for expect, actual in zip(expect_output, actual_output):
                self.assertTrue(
                    numpy.allclose(
                        numpy.array(actual), expect, atol=0.001))
if __name__ == '__main__':
unittest.main()
...@@ -176,7 +176,6 @@ def resize_short(im, size): ...@@ -176,7 +176,6 @@ def resize_short(im, size):
:param size: the shorter edge size of image after resizing. :param size: the shorter edge size of image after resizing.
:type size: int :type size: int
""" """
assert im.shape[-1] == 1 or im.shape[-1] == 3
h, w = im.shape[:2] h, w = im.shape[:2]
h_new, w_new = size, size h_new, w_new = size, size
if h > w: if h > w:
...@@ -267,7 +266,7 @@ def random_crop(im, size, is_color=True): ...@@ -267,7 +266,7 @@ def random_crop(im, size, is_color=True):
return im return im
def left_right_flip(im): def left_right_flip(im, is_color=True):
""" """
Flip an image along the horizontal direction. Flip an image along the horizontal direction.
Return the flipped image. Return the flipped image.
...@@ -278,13 +277,15 @@ def left_right_flip(im): ...@@ -278,13 +277,15 @@ def left_right_flip(im):
im = left_right_flip(im) im = left_right_flip(im)
:paam im: input image with HWC layout :param im: input image with HWC layout or HW layout for gray image
:type im: ndarray :type im: ndarray
:param is_color: whether input image is color or not
:type is_color: bool
""" """
if len(im.shape) == 3: if len(im.shape) == 3 and is_color:
return im[:, ::-1, :] return im[:, ::-1, :]
else: else:
return im[:, ::-1, :] return im[:, ::-1]
def simple_transform(im, def simple_transform(im,
...@@ -321,8 +322,9 @@ def simple_transform(im, ...@@ -321,8 +322,9 @@ def simple_transform(im,
if is_train: if is_train:
im = random_crop(im, crop_size, is_color=is_color) im = random_crop(im, crop_size, is_color=is_color)
if np.random.randint(2) == 0: if np.random.randint(2) == 0:
im = left_right_flip(im) im = left_right_flip(im, is_color)
else: else:
im = center_crop(im, crop_size, is_color)
im = center_crop(im, crop_size, is_color=is_color) im = center_crop(im, crop_size, is_color=is_color)
if len(im.shape) == 3: if len(im.shape) == 3:
im = to_chw(im) im = to_chw(im)
...@@ -331,8 +333,10 @@ def simple_transform(im, ...@@ -331,8 +333,10 @@ def simple_transform(im,
if mean is not None: if mean is not None:
mean = np.array(mean, dtype=np.float32) mean = np.array(mean, dtype=np.float32)
# mean value, may be one value per channel # mean value, may be one value per channel
if mean.ndim == 1: if mean.ndim == 1 and is_color:
mean = mean[:, np.newaxis, np.newaxis] mean = mean[:, np.newaxis, np.newaxis]
elif mean.ndim == 1:
mean = mean
else: else:
# elementwise mean # elementwise mean
assert len(mean.shape) == len(im) assert len(mean.shape) == len(im)
...@@ -372,6 +376,6 @@ def load_and_transform(filename, ...@@ -372,6 +376,6 @@ def load_and_transform(filename,
mean values per channel. mean values per channel.
:type mean: numpy array | list :type mean: numpy array | list
""" """
im = load_image(filename) im = load_image(filename, is_color)
im = simple_transform(im, resize_size, crop_size, is_train, is_color, mean) im = simple_transform(im, resize_size, crop_size, is_train, is_color, mean)
return im return im
...@@ -35,7 +35,7 @@ RUN cd /opt && wget -q --no-check-certificate https://github.com/google/protobuf ...@@ -35,7 +35,7 @@ RUN cd /opt && wget -q --no-check-certificate https://github.com/google/protobuf
cd protobuf-3.1.0 && ./configure && make -j4 && make install && cd .. && rm -f protobuf-cpp-3.1.0.tar.gz cd protobuf-3.1.0 && ./configure && make -j4 && make install && cd .. && rm -f protobuf-cpp-3.1.0.tar.gz
RUN yum install -y sqlite-devel zlib-devel openssl-devel boost boost-devel pcre-devel vim tk-devel tkinter libtool RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool
RUN wget -O /root/requirements.txt https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt RUN wget -O /root/requirements.txt https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册