Commit 8d9c3fc6 authored by T typhoonzero

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into dist_train_benchmark_vgg16

@@ -9,7 +9,7 @@ import subprocess
 import platform
 COPYRIGHT = '''
-Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
...
@@ -31,9 +31,6 @@ if(NOT CMAKE_CROSSCOMPILING)
 endif(NOT CMAKE_CROSSCOMPILING)
 find_package(Git REQUIRED)
 find_package(Threads REQUIRED)
-if(NOT ANDROID AND NOT IOS)
-    find_package(Boost QUIET)
-endif()
 include(simd)
@@ -42,7 +39,7 @@ option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_F
 option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
 option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
 option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
-option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON)
+option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
 option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
 option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check" ON)
 option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
@@ -140,6 +137,7 @@ include(external/openblas) # download, build, install openblas
 include(external/mkldnn)   # download, build, install mkldnn
 include(external/swig)     # download, build, install swig
 include(external/warpctc)  # download, build, install warpctc
+include(external/boost)    # download, build, install boost
 include(external/any)      # download libn::any
 include(external/eigen)    # download eigen3
 include(external/pybind11) # download pybind11
@@ -164,7 +162,6 @@ include_directories("${PADDLE_SOURCE_DIR}")
 include_directories("${PADDLE_SOURCE_DIR}/paddle/cuda/include")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c")
-include_directories(${Boost_INCLUDE_DIRS})
 set(EXTERNAL_LIBS
     ${GFLAGS_LIBRARIES}
...
@@ -27,7 +27,7 @@ RUN apt-get update && \
     curl sed grep graphviz libjpeg-dev zlib1g-dev \
     python-matplotlib gcc-4.8 g++-4.8 \
     automake locales clang-format swig doxygen cmake  \
-    liblapack-dev liblapacke-dev libboost-dev \
+    liblapack-dev liblapacke-dev \
     clang-3.8 llvm-3.8 libclang-3.8-dev \
     net-tools libtool && \
     apt-get clean -y
...
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include(ExternalProject)
set(BOOST_PROJECT "extern_boost")
set(BOOST_VER "1.41.0")
set(BOOST_TAR "boost_1_41_0")
set(BOOST_URL "http://sourceforge.net/projects/boost/files/boost/${BOOST_VER}/${BOOST_TAR}.tar.gz")
set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
include_directories(${BOOST_INCLUDE_DIR})
ExternalProject_Add(
    ${BOOST_PROJECT}
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DOWNLOAD_DIR          ${BOOST_DOWNLOAD_DIR}
    DOWNLOAD_COMMAND      wget --no-check-certificate ${BOOST_URL} -c -q -O ${BOOST_TAR}.tar.gz
                          && tar zxf ${BOOST_TAR}.tar.gz
    DOWNLOAD_NO_PROGRESS  1
    PREFIX                ${BOOST_SOURCES_DIR}
    CONFIGURE_COMMAND     ""
    BUILD_COMMAND         ""
    INSTALL_COMMAND       ""
    UPDATE_COMMAND        ""
)

if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c)
    file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
    add_library(boost STATIC ${dummyfile})
else()
    add_library(boost INTERFACE)
endif()

add_dependencies(boost ${BOOST_PROJECT})
list(APPEND external_project_dependencies boost)
set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR})
@@ -224,12 +224,18 @@ function(cc_test TARGET_NAME)
   if(WITH_TESTING)
     set(options "")
     set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
+    set(multiValueArgs SRCS DEPS ARGS)
     cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_executable(${TARGET_NAME} ${cc_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
+    target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
+      list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
+    endif()
     add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
-    add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+    add_test(NAME ${TARGET_NAME}
+             COMMAND ${TARGET_NAME} ${cc_test_ARGS}
+             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
   endif()
 endfunction(cc_test)
@@ -457,7 +463,7 @@ endfunction()
 function(py_test TARGET_NAME)
   if(WITH_TESTING)
-    set(options STATIC static SHARED shared)
+    set(options "")
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS ARGS)
     cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
...
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

===========
data_feeder
===========

DataFeeder
----------
.. autoclass:: paddle.v2.fluid.data_feeder.DataFeeder
    :members:
    :noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

=========
evaluator
=========

Accuracy
--------
.. autoclass:: paddle.v2.fluid.evaluator.Accuracy
    :members:
    :noindex:

ChunkEvaluator
--------------
.. autoclass:: paddle.v2.fluid.evaluator.ChunkEvaluator
    :members:
    :noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

========
executor
========

Executor
--------
.. autoclass:: paddle.v2.fluid.executor.Executor
    :members:
    :noindex:

global_scope
------------
.. autofunction:: paddle.v2.fluid.executor.global_scope
    :noindex:

scope_guard
-----------
.. autofunction:: paddle.v2.fluid.executor.scope_guard
    :noindex:

switch_scope
------------
.. autofunction:: paddle.v2.fluid.executor.switch_scope
    :noindex:
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import argparse
import sys
import types

import paddle.v2.fluid as fluid


def parse_arg():
    parser = argparse.ArgumentParser()
    parser.add_argument('--submodules', nargs="*")
    parser.add_argument(
        'module', type=str, help='Generate the documentation of which module')
    return parser.parse_args()


class DocGenerator(object):
    def __init__(self, module_name, stream=sys.stdout):
        self.stream = stream
        self.module_name = module_name
        if not hasattr(fluid, module_name):
            raise ValueError("Cannot find fluid.{0}".format(module_name))
        else:
            self.module = getattr(fluid, module_name)
        self.stream.write('''.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

''')

        self._print_header_(module_name, dot='=', is_title=True)

    def print_submodule(self, submodule_name):
        submodule = getattr(self.module, submodule_name)
        if submodule is None:
            raise ValueError("Cannot find submodule {0}".format(submodule_name))
        self.print_section(submodule_name)

        for item in submodule.__all__:
            self.print_item(item)

    def print_current_module(self):
        for item in self.module.__all__:
            self.print_item(item)

    def print_section(self, name):
        self._print_header_(name, dot='=', is_title=False)

    def print_item(self, name):
        item = getattr(self.module, name)
        if isinstance(item, types.TypeType):
            self.print_class(name)
        elif isinstance(item, types.FunctionType):
            self.print_method(name)
        else:
            raise RuntimeError("Unsupported item {0}".format(name))

    def print_class(self, name):
        self._print_header_(name, dot='-', is_title=False)
        self.stream.write('''.. autoclass:: paddle.v2.fluid.{0}.{1}
    :members:
    :noindex:

'''.format(self.module_name, name))

    def print_method(self, name):
        self._print_header_(name, dot='-', is_title=False)
        self.stream.write('''.. autofunction:: paddle.v2.fluid.{0}.{1}
    :noindex:

'''.format(self.module_name, name))

    def _print_header_(self, name, dot, is_title):
        dot_line = dot * len(name)
        if is_title:
            self.stream.write(dot_line)
            self.stream.write('\n')
        self.stream.write(name)
        self.stream.write('\n')
        self.stream.write(dot_line)
        self.stream.write('\n')
        self.stream.write('\n')


def main():
    args = parse_arg()
    gen = DocGenerator(args.module)
    if args.submodules is None:
        gen.print_current_module()
    else:
        for submodule_name in args.submodules:
            gen.print_submodule(submodule_name)


if __name__ == '__main__':
    main()
#!/bin/bash
python gen_doc.py layers --submodules control_flow device io nn ops tensor > layers.rst
for module in io data_feeder evaluator executor initializer nets optimizer param_attr profiler regularizer
do
python gen_doc.py ${module} > ${module}.rst
done
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

===========
initializer
===========

Constant
--------
.. autoclass:: paddle.v2.fluid.initializer.Constant
    :members:
    :noindex:

Uniform
-------
.. autoclass:: paddle.v2.fluid.initializer.Uniform
    :members:
    :noindex:

Normal
------
.. autoclass:: paddle.v2.fluid.initializer.Normal
    :members:
    :noindex:

Xavier
------
.. autoclass:: paddle.v2.fluid.initializer.Xavier
    :members:
    :noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

==
io
==

save_vars
---------
.. autofunction:: paddle.v2.fluid.io.save_vars
    :noindex:

save_params
-----------
.. autofunction:: paddle.v2.fluid.io.save_params
    :noindex:

save_persistables
-----------------
.. autofunction:: paddle.v2.fluid.io.save_persistables
    :noindex:

load_vars
---------
.. autofunction:: paddle.v2.fluid.io.load_vars
    :noindex:

load_params
-----------
.. autofunction:: paddle.v2.fluid.io.load_params
    :noindex:

load_persistables
-----------------
.. autofunction:: paddle.v2.fluid.io.load_persistables
    :noindex:

save_inference_model
--------------------
.. autofunction:: paddle.v2.fluid.io.save_inference_model
    :noindex:

load_inference_model
--------------------
.. autofunction:: paddle.v2.fluid.io.load_inference_model
    :noindex:

get_inference_program
---------------------
.. autofunction:: paddle.v2.fluid.io.get_inference_program
    :noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

======
layers
======

Each entry below is documented with an ``autofunction``/``autoclass`` directive and ``:noindex:``.

control_flow
============

split_lod_tensor, merge_lod_tensor, BlockGuard, BlockGuardWithCompletion,
StaticRNNMemoryLink, WhileGuard, While, lod_rank_table, max_sequence_len, topk,
lod_tensor_to_array, array_to_lod_tensor, increment, array_write, create_array,
less_than, array_read, shrink_memory, array_length, IfElse, DynamicRNN,
ConditionalBlock, StaticRNN, reorder_lod_tensor_by_rank, ParallelDo, Print

device
======

get_places

io
==

data, BlockGuardServ, ListenAndServ, Send

nn
==

fc, embedding, dynamic_lstm, dynamic_lstmp, dynamic_gru, gru_unit,
linear_chain_crf, crf_decoding, cos_sim, cross_entropy, square_error_cost,
accuracy, chunk_eval, sequence_conv, conv2d, sequence_pool, sequence_first_step,
sequence_last_step, pool2d, batch_norm, beam_search_decode, conv2d_transpose,
sequence_expand, lstm_unit, reduce_sum, reduce_mean, reduce_max, reduce_min,
dropout, split, ctc_greedy_decoder, edit_distance, l2_normalize, matmul,
warpctc, sequence_reshape, transpose, im2sequence, nce, beam_search, row_conv,
multiplex

ops
===

mean, mul, reshape, scale, sigmoid_cross_entropy_with_logits, elementwise_add,
elementwise_div, elementwise_sub, elementwise_mul, elementwise_max,
elementwise_min, elementwise_pow, clip, clip_by_norm, sequence_softmax, sigmoid,
logsigmoid, exp, relu, tanh, tanh_shrink, softshrink, sqrt, abs, ceil, floor,
round, reciprocal, log, square, softplus, softsign, brelu, leaky_relu,
soft_relu, elu, relu6, pow, stanh, hard_shrink, thresholded_relu, hard_sigmoid,
swish

tensor
======

create_tensor, create_parameter, create_global_var, cast, concat, sums, assign,
fill_constant_batch_size_like, fill_constant, ones, zeros
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

====
nets
====

simple_img_conv_pool
--------------------
.. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
    :noindex:

img_conv_group
--------------
.. autofunction:: paddle.v2.fluid.nets.img_conv_group
    :noindex:

sequence_conv_pool
------------------
.. autofunction:: paddle.v2.fluid.nets.sequence_conv_pool
    :noindex:

glu
---
.. autofunction:: paddle.v2.fluid.nets.glu
    :noindex:

scaled_dot_product_attention
----------------------------
.. autofunction:: paddle.v2.fluid.nets.scaled_dot_product_attention
    :noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

=========
optimizer
=========

SGD
---
.. autoclass:: paddle.v2.fluid.optimizer.SGD
    :members:
    :noindex:

Momentum
--------
.. autoclass:: paddle.v2.fluid.optimizer.Momentum
    :members:
    :noindex:

Adagrad
-------
.. autoclass:: paddle.v2.fluid.optimizer.Adagrad
    :members:
    :noindex:

Adam
----
.. autoclass:: paddle.v2.fluid.optimizer.Adam
    :members:
    :noindex:

Adamax
------
.. autoclass:: paddle.v2.fluid.optimizer.Adamax
    :members:
    :noindex:

DecayedAdagrad
--------------
.. autoclass:: paddle.v2.fluid.optimizer.DecayedAdagrad
    :members:
    :noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

==========
param_attr
==========

ParamAttr
---------
.. autoclass:: paddle.v2.fluid.param_attr.ParamAttr
    :members:
    :noindex:

WeightNormParamAttr
-------------------
.. autoclass:: paddle.v2.fluid.param_attr.WeightNormParamAttr
    :members:
    :noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

========
profiler
========

cuda_profiler
-------------
.. autofunction:: paddle.v2.fluid.profiler.cuda_profiler
    :noindex:

reset_profiler
--------------
.. autofunction:: paddle.v2.fluid.profiler.reset_profiler
    :noindex:

profiler
--------
.. autofunction:: paddle.v2.fluid.profiler.profiler
    :noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
    !DO NOT EDIT THIS FILE MANUALLY!

===========
regularizer
===========

append_regularization_ops
-------------------------
.. autofunction:: paddle.v2.fluid.regularizer.append_regularization_ops
    :noindex:

L1Decay
-------
.. autoclass:: paddle.v2.fluid.regularizer.L1Decay
    :members:
    :noindex:

L2Decay
-------
.. autoclass:: paddle.v2.fluid.regularizer.L2Decay
    :members:
    :noindex:
# Design Doc: CSP in PaddlePaddle Fluid
## Motivation
Concurrent programming is important for deep learning. A few example applications are:
1. The main thread keeps reading the next mini-batch while another thread uses the GPU for computing.
2. The main thread performs the computation while another thread uploads the local gradients from each trainer to the parameter server.
Most DL systems, including TensorFlow, Caffe2, and MXNet, can asynchronously execute operators in a graph. However, Fluid doesn't have the concept of a graph at all, as the design goal of Fluid is that of a programming language.
## Concurrent Programming Models
There are many concurrent programming models, implemented in various forms:
| concurrent programming model | implementation |
|-----|-----|
| mutex | types and functions in standard libraries |
| semaphore | types and functions in standard libraries |
| communicating sequential processes (CSP) | Go programming language |
| actor model | Erlang programming language |
| message passing | MPI |
| bulk synchronous parallel (BSP) | Pregel distributed programming framework |
Since Fluid was designed to be a programming language, we would like to implement CSP in Fluid.
### CSP vs. Actor Model
A well-known implementation of the Actor Model is the Erlang programming language. In the Actor Model, *processes* can send messages to and receive messages from other processes, given their process IDs. We can find the same three ingredients, processes with IDs, send, and recv, in MPI too. Indeed, we can rewrite Erlang programs in Python + MPI with possibly fewer lines of code. Our concern with the Actor Model is that it doesn't seem reasonable to implement process management in a programming language's runtime library; instead, managing processes should be the operating system's responsibility, and libraries like MPI should handle send/recv.
## CSP in Fluid
Fluid has two fundamental control-flows: *if-else* and *while*. If we are to implement CSP, we need the following:
1. a new data type: *channel* and operators *send* and *recv*,
1. *goroutine* or thread, and
1. a new control-flow: select.
We also need Python wrappers for the above components.
The type *channel* is conceptually a blocking queue. In Go, it is implemented as a [blocking circular queue](https://github.com/golang/go/blob/68ce117cf17b8debf5754bfd476345779b5b6616/src/runtime/chan.go#L31-L50), which supports send and recv.
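To make the blocking semantics concrete, here is a minimal Python sketch of a bounded blocking channel built on the standard `threading` module; it only illustrates the queue behavior described above and is not the proposed Fluid operator implementation.

```python
import threading
from collections import deque

class Channel(object):
    """A bounded blocking channel: send blocks when full, recv blocks when empty.

    Illustration only; a real unbuffered Go channel additionally requires a
    sender/receiver rendezvous.
    """

    def __init__(self, capacity=1):
        self._capacity = capacity
        self._items = deque()
        self._cond = threading.Condition()

    def send(self, value):
        with self._cond:
            while len(self._items) >= self._capacity:
                self._cond.wait()        # block until space is available
            self._items.append(value)
            self._cond.notify_all()      # wake up blocked receivers

    def recv(self):
        with self._cond:
            while not self._items:
                self._cond.wait()        # block until an element arrives
            value = self._items.popleft()
            self._cond.notify_all()      # wake up blocked senders
            return value

ch = Channel(capacity=100)
ch.send(1)
print(ch.recv())                         # prints 1
```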
The `select` operation has been in OS kernels long before the Go language. All Unix kernels implement the system calls *poll* and *select*. They monitor multiple file descriptors to see if I/O is possible on any of them. This takes O(N) time. Since Linux 2.6, a new system call, *epoll*, can do the same in O(1) time. In BSD systems, there is a similar system call, *kqueue*. Go's Linux implementation uses epoll.
It might be a good idea to implement Fluid's select using epoll too. In this design doc, we start from the O(N) way, so that we can focus on the Python binding and the syntax.
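The O(N) starting point can be pictured as a polling loop over the channels. The sketch below uses the standard-library queue module purely to illustrate the idea; it is not tied to how Fluid's select would eventually be implemented.

```python
import random
import time
try:
    import queue            # Python 3
except ImportError:
    import Queue as queue   # Python 2

def naive_select(recv_cases):
    """O(N) select: poll every channel until one of them has data.

    `recv_cases` is a list of (channel, callback) pairs; the callback of the
    first ready channel is invoked with the received value.
    """
    while True:
        random.shuffle(recv_cases)        # avoid starving any single channel
        for ch, callback in recv_cases:
            try:
                value = ch.get_nowait()   # non-blocking probe of one channel
            except queue.Empty:
                continue
            return callback(value)
        time.sleep(0.001)                 # nothing ready: back off briefly

# Usage: two channels, one select call.
a, b = queue.Queue(), queue.Queue()
b.put("hello from b")
print(naive_select([(a, lambda v: ("a", v)), (b, lambda v: ("b", v))]))
```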
### Type Channel
Fluid supports many data types:

1. Tensor,
1. Row-sparse Tensor,
1. LoD Tensor,
1. Tensor array, etc.
Each data type is registered in the [`framework.proto`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L117-L127) as an enum value. To add a new channel type, we need to add a new type enum.
To expose a C++ type to Python, we need to edit the [`pybind.cc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc) file. [Here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc#L120-L164) is an example of how we expose the C++ class LoDTensor.
## Syntax Design
### Create Channel
In Go, we create a channel by specifying the element type and buffer size:
```go
ch := make(chan int) // a channel without buffer
ch1 := make(chan int, 100) // a channel that can buffer 100 ints.
```
In Fluid, we should be able to do the same:
```python
ch = fluid.make_chan(dtype=INT)                 # a channel without buffer
ch1 = fluid.make_chan(dtype=INT, capacity=100)  # a channel that can buffer 100 ints.
```
In addition to that, we want channels that can hold more complex element types, e.g., Tensors of float16:
```python
ch = fluid.make_chan(dtype=Tensor, etype=float16)
```
or Tensors of Tensors of float16, and so on.
The point here is that we need a consistent way to compose types, like in C++ we can have `Tensor<Tensor<...<float16>...> >`.
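One possible way to compose element types on the Python side is a small recursive type descriptor; the names `ChanType`, `Tensor`, and `float16` below are illustrative assumptions, not part of the existing Fluid API.

```python
class ChanType(object):
    """A recursive element-type descriptor, e.g. Tensor(Tensor(float16))."""

    def __init__(self, base, elem=None):
        self.base = base      # e.g. "Tensor", "float16", "int"
        self.elem = elem      # nested ChanType, or None for scalar types

    def __repr__(self):
        if self.elem is None:
            return self.base
        return "{0}<{1}>".format(self.base, repr(self.elem))

float16 = ChanType("float16")
Tensor = lambda elem: ChanType("Tensor", elem)

# Composes the same way the C++ template Tensor<Tensor<float16>> would.
print(Tensor(Tensor(float16)))   # Tensor<Tensor<float16>>
```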
### Send and Recv
### Select
## Example Programs
### 1. RPC between Trainers and Parameter Servers
### 2. Concurrent Minibatch Loading
@@ -152,12 +152,12 @@ for data in train_reader():

The `JobDesc` object describes the distributed job resource specification to run on the cluster environment.

<img src="src/remote_executor.png" width="500" align="center" />

`RemoteExecutor.run` sends the `ProgramDesc` and the
[TrainingJob](https://github.com/PaddlePaddle/cloud/blob/develop/doc/autoscale/README.md#training-job-resource)
to a server in the cluster which executes `RemoteExecutor.listen`. This server is responsible for
starting the final Kubernetes Jobs to run the different roles of `ProgramDesc` from `ConfigMap`.
### Placement Algorithm
...
@@ -9,16 +9,16 @@ different purposes.

## Background

The previous implementations of the parameter server do not run a
fluid sub-program. Parameter initialization, optimizer computation, network
communication and checkpointing are implemented twice, on both the
trainer and the parameter server.

It would be great if we could write code once and use it on both the
trainer and the parameter server, since this reduces code duplication and
improves extensibility. Given that after the current refactoring we are
representing everything as a computation graph on the trainer, representing
everything as a computation graph on the parameter server becomes a natural
extension.
## Design
@@ -30,9 +30,9 @@ into sub-programs to be scheduled on different nodes with the following
steps:

1. OP placement: the OPs will be placed on different nodes according
   to a heuristic that minimizes the estimated total computation
   time. Currently we will use a simple heuristic that puts parameter
   variables on parameter server workers and everything else on trainer
   workers (see the sketch after this list).
1. Add communication OPs to enable the communication between nodes.
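A minimal sketch of the simple heuristic in step 1; the operator representation and the list of optimizer OP types are assumptions for illustration, not the transpiler's actual implementation.

```python
from collections import namedtuple

Op = namedtuple("Op", ["type"])  # stand-in for an operator in the ProgramDesc

# OP types treated as parameter updates; illustrative, not the real transpiler's list.
OPTIMIZER_OP_TYPES = {"sgd", "adam", "adagrad", "momentum"}

def place_op(op):
    """Parameter-update OPs go to parameter-server workers; everything else
    stays on trainer workers."""
    return "pserver" if op.type in OPTIMIZER_OP_TYPES else "trainer"

program = [Op("mul"), Op("elementwise_add"), Op("sgd")]
print([place_op(op) for op in program])   # ['trainer', 'trainer', 'pserver']
```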
@@ -47,22 +47,22 @@ After converting:

<img src="src/dist-graph.png" width="700"/>

1. The parameter variable W and its optimizer program are placed on the parameter server.
1. Operators are added to the program.
   - *Send* sends data to the connected *Recv* operator. The
     scheduler on the receiving node will only schedule the *Recv* operator
     to run when the *Send* operator has run (the *Send* OP will mark
     the *Recv* OP runnable automatically).
   - *Enqueue* enqueues the input variable; it can block until space
     becomes available in the queue.
   - *Dequeue* outputs a configurable number of tensors from the
     queue. It will block until the queue has the required number of
     tensors.
### Benefits

- Model parallelism becomes easier to implement: it is an extension to
  the trainer - parameter server approach. We can have several "Transpilers"
  to achieve different goals.
- User-defined optimizer is easier to add - user can now express it as

@@ -72,22 +72,22 @@ After converting:
### Challenges

- It is important to balance the parameter shards on multiple
  parameter servers. If a single parameter is very big (for example, the
  parameter of a word-embedding, fully connected, or softmax layer), we need to
  automatically partition the single parameter onto different
  parameter servers when possible (only the element-wise optimizer depends
  on the parameter variable); a minimal sharding sketch follows this list.
- In the "Async SGD" figure, the "W" variable on the parameter server
  could be read and written concurrently. See
  [here](https://github.com/PaddlePaddle/Paddle/pull/6394) for more
  details about concurrent programs in Fluid.
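A minimal sketch of the balancing idea in the first bullet above: split one large parameter along its first dimension into roughly equal shards assigned to the parameter servers. The strategy and names are illustrative assumptions, not the transpiler's actual partitioning algorithm.

```python
def shard_parameter(param_shape, num_pservers):
    """Partition a big parameter along dimension 0 into `num_pservers`
    roughly equal shards (illustrative only)."""
    rows = param_shape[0]
    base, extra = divmod(rows, num_pservers)
    shards, start = [], 0
    for i in range(num_pservers):
        size = base + (1 if i < extra else 0)   # spread the remainder evenly
        shards.append(("pserver_%d" % i, (start, start + size)))
        start += size
    return shards

# A 10000 x 512 word-embedding split across 3 parameter servers:
print(shard_parameter((10000, 512), 3))
# [('pserver_0', (0, 3334)), ('pserver_1', (3334, 6667)), ('pserver_2', (6667, 10000))]
```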
### Discussion

- Can the Enqueue OP be implemented under our current tensor design
  (put the input tensor into the queue tensor)?
- The *Dequeue* OP will have a variable number of outputs (depending on the
  `min_count` attribute); does our current design support it? (similar
  question for the *Add* OP)
...
@@ -22,7 +22,7 @@ The current `LoDTensor` is designed to store levels of variable-length sequences
The integers in each level represent the begin and end (not inclusive) offsets of a sequence **in the underlying tensor**;
let's call this format the **absolute-offset LoD** for clarity.

The absolute-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences; for example, a two-level LoD is as follows
```python
[[0, 3, 9],
 [0, 2, 3, 3, 3, 9]]
```
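To see why retrieval is fast with absolute offsets, here is a small illustrative sketch (plain Python, not Fluid's internal API): sequence `i` at any level is simply the slice `offsets[i]:offsets[i+1]` of the underlying tensor.

```python
lod = [[0, 3, 9],
       [0, 2, 3, 3, 3, 9]]
data = list(range(9))          # stand-in for the 9 rows of the underlying tensor

def get_sequence(level, i):
    """Sequence i at a given level is offsets[i]:offsets[i+1] -- an O(1) lookup."""
    begin, end = lod[level][i], lod[level][i + 1]
    return data[begin:end]

print(get_sequence(0, 1))      # second top-level sequence -> rows [3, 4, 5, 6, 7, 8]
print(get_sequence(1, 1))      # second bottom-level sequence -> row [2]
```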
@@ -119,7 +119,7 @@ def generate():
        encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word)
        decoder_input = pd.fc(
            act=pd.activation.Linear(),
            input=[target_word, encoder_ctx_expanded],
            size=3 * decoder_dim)
        gru_out, cur_mem = pd.gru_step(
            decoder_input, mem=decoder_mem, size=decoder_dim)
...
@@ -140,7 +140,19 @@ TODO by Assignees

### Beam Search with CTC and LM

<div align="center">
<img src="image/beam_search.png" width=600><br/>
Figure 2. Algorithm for CTC Beam Search Decoder.
</div>
- The **Beam Search Decoder** for the DS2 CTC-trained network follows a similar approach to \[[3](#references)\], as shown in Figure 2, with two important modifications for the ambiguous parts:
  - 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation, because one prefix may come from different paths;
  - 2) the if condition ```if l^+ not in A_prev then``` after the probability computation is dropped, because it is hard to understand and seems unnecessary.
- An **external scorer** is passed into the decoder to evaluate a candidate prefix during decoding, whenever a white space is appended in English decoding or any character is appended in Mandarin decoding (see the sketch after this list).
  - The external scorer consists of a language model, a word count, or any other custom scorers.
  - The **language model** is built from Task 5, with parameters that should be carefully tuned to achieve the minimum WER/CER (c.f. Task 7).
  - This decoder needs to perform with **high efficiency** for the convenience of parameter tuning and real-world speech recognition.
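A minimal sketch of what such an external scorer could look like; the combination of a log language-model score with a word-count bonus, and all names below, are illustrative assumptions rather than the project's final interface.

```python
import math

class ExternalScorer(object):
    """Scores a decoding prefix with a language model and a word-count bonus.

    score(prefix) = alpha * log P_lm(prefix) + beta * word_count(prefix)
    The exact combination is an assumption for illustration only.
    """

    def __init__(self, lm_prob_fn, alpha=1.0, beta=1.0):
        self.lm_prob_fn = lm_prob_fn   # returns P_lm(prefix), e.g. from an n-gram LM
        self.alpha = alpha
        self.beta = beta

    def __call__(self, prefix):
        words = prefix.strip().split()
        lm_score = math.log(max(self.lm_prob_fn(prefix), 1e-30))
        return self.alpha * lm_score + self.beta * len(words)

# A toy LM that prefers shorter prefixes, just to make the sketch runnable.
toy_lm = lambda prefix: 0.5 ** max(len(prefix.split()), 1)
scorer = ExternalScorer(toy_lm, alpha=0.3, beta=0.2)
print(scorer("hello world"))   # invoked whenever a space is appended in English decoding
```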
## Future Work

@@ -153,3 +165,4 @@ TODO by Assignees

1. Dario Amodei, et al. [Deep Speech 2: End-to-End Speech Recognition in English and Mandarin](http://proceedings.mlr.press/v48/amodei16.pdf). ICML 2016.
2. Dario Amodei, et al. [Deep Speech 2: End-to-End Speech Recognition in English and Mandarin](https://arxiv.org/abs/1512.02595). arXiv:1512.02595.
3. Awni Y. Hannun, et al. [First-Pass Large Vocabulary Continuous Speech Recognition using Bi-Directional Recurrent DNNs](https://arxiv.org/abs/1408.2873). arXiv:1408.2873.
@@ -2,9 +2,9 @@

## Background

Deep learning has a high demand for computing resources. New high-performance devices and computing libraries appear frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries in a flexible and efficient manner.

On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example, Intel CPUs support the Eigen and MKL computing libraries while Nvidia GPUs support the Eigen and cuDNN computing libraries. We have to implement operator-specific kernels for each computing library.

On the other hand, users usually do not want to care about the low-level hardware and computing libraries when writing a neural network configuration. In Fluid, `Layer` is exposed in `Python`, and `Operator` is exposed in `C++`. Both `Layer` and `Operator` are hardware independent.
@@ -17,7 +17,7 @@ For a general overview of fluid, please refer to the [overview doc](https://gith

There are mainly three parts that we have to consider while integrating a new device/library:

- Place and DeviceContext: indicate the device id and manage hardware resources
- Memory and Tensor: malloc/free data on a certain device
@@ -25,10 +25,10 @@ There are mainly three parts that we have to consider while integrating a new de

### Place and DeviceContext

Please note that device and computing library are not in one-to-one correspondence. A device can have many computing libraries, and a computing library can also support several devices.

#### Place

Fluid uses the class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add the corresponding `DevicePlace`.
```
             | CPUPlace
...
```

@@ -144,7 +144,7 @@ class Tensor {
};
```
`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configure its shape, and then call `mutuable_data` to allocate the actual memory. `Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configurate its shape, and then call `mutuable_data` to allocate the actual memory.
```cpp ```cpp
paddle::framework::Tensor t; paddle::framework::Tensor t;
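// A hedged completion of this truncated snippet (the diff cuts the example
// off here): Resize configures the shape, and mutable_data then allocates
// memory for the tensor on the given place.
t.Resize(paddle::framework::make_ddim({2, 3}));
float* data = t.mutable_data<float>(paddle::platform::CPUPlace());
```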
...@@ -163,7 +163,7 @@ Fluid implements computing units based on different DeviceContexts. Some computi
Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/math/maxouting.h#L27) as an example:
The interface is defined in the header file.
```
template <typename DeviceContext, typename T>
...@@ -174,7 +174,7 @@ class MaxOutFunctor {
};
```
The CPU implementation is in a .cc file.
```
template <typename T>
...@@ -188,7 +188,7 @@ class MaxOutFunctor<platform::CPUDeviceContext, T> {
};
```
The CUDA implementation is in a .cu file.
```
template <typename T>
...@@ -203,9 +203,9 @@ class MaxOutFunctor<platform::CUDADeviceContext, T> {
```
We first obtain the computing handle from a concrete DeviceContext and then compute on tensors.
The implementation of `OpKernel` is similar to the math functors; the extra thing we need to do is to register the OpKernel in a global map.
Fluid provides different registration interfaces in op_registry.h; a sketch of how they are used follows.
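As a hedged sketch (the operator name `maxout` and the kernel class name are illustrative, not taken from this change), registering the CPU and CUDA kernels of an operator with these interfaces looks roughly like this:
```
// Each macro inserts one OpKernel into the global kernel map, keyed by the
// operator type and the (device, data type) the kernel is specialized for.
REGISTER_OP_CPU_KERNEL(
    maxout, ops::MaxOutKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CUDA_KERNEL(
    maxout, ops::MaxOutKernel<paddle::platform::CUDADeviceContext, float>);
```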
...@@ -231,7 +231,7 @@ REGISTER_OP_CUDA_KERNEL(
## Advanced topics: How to switch between different Device/Library
Generally, we implement an OpKernel for every Device/Library of an Operator, so we can easily train a Convolutional Neural Network on a GPU. However, some OpKernels are not suitable for a specific device. For example, the crf operator can only run on CPU, whereas most other operators can run on GPU. To achieve high performance in such circumstances, we have to switch between different Devices/Libraries.
For more details, please refer to the following docs:
......
...@@ -115,7 +115,7 @@ PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种B
"WITH_AVX", "Whether to build PaddlePaddle binaries that contain AVX instructions", "ON"
"WITH_PYTHON", "Whether to embed the Python interpreter", "ON"
"WITH_STYLE_CHECK", "Whether to perform code style checks at build time", "ON"
"WITH_TESTING", "Whether to enable unit tests", "OFF"
"WITH_DOC", "Whether to build the Chinese and English documentation", "OFF"
"WITH_SWIG_PY", "Whether to build the Python SWIG interface, which can be used for inference and customized training", "Auto"
"WITH_GOLANG", "Whether to build the fault-tolerant parameter server written in Go", "ON"
......
...@@ -126,7 +126,7 @@ You can add :code:`-D` argument to pass such options, like:
"WITH_AVX", "Build with AVX support", "ON"
"WITH_PYTHON", "Build with integrated Python interpreter", "ON"
"WITH_STYLE_CHECK", "Check code style when building", "ON"
"WITH_TESTING", "Build unit tests", "OFF"
"WITH_DOC", "Build documentations", "OFF"
"WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
"WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON"
......
...@@ -25,14 +25,14 @@
.. code-block:: bash
docker pull docker.paddlepaddlehub.com/paddle
Download the GPU version (cuda8.0_cudnn5_avx_mkl) Docker image:
.. code-block:: bash
docker pull paddlepaddle/paddle:latest-gpu
docker pull docker.paddlepaddlehub.com/paddle:latest-gpu
Choose and download a Docker image that uses a different BLAS library:
...@@ -49,7 +49,7 @@
docker pull paddlepaddle/paddle:[tag]
# for example:
docker pull docker.paddlepaddlehub.com/paddle:0.11.0-gpu
.. _docker_run:
...@@ -95,6 +95,12 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note
docker run -p 8888:8888 paddlepaddle/book
Users in China can use the following mirror to speed up access:
.. code-block:: bash
docker run -p 8888:8888 docker.paddlepaddlehub.com/book
Then enter the following URL in your browser:
.. code-block:: text
......
...@@ -26,14 +26,14 @@ For users in China, we provide a faster mirror:
.. code-block:: bash
docker pull docker.paddlepaddlehub.com/paddle
Download the GPU version (cuda8.0_cudnn5_avx_mkl) images:
.. code-block:: bash
docker pull paddlepaddle/paddle:latest-gpu
docker pull docker.paddlepaddlehub.com/paddle:latest-gpu
Choose between different BLAS versions:
...@@ -53,7 +53,7 @@ and run:
docker pull paddlepaddle/paddle:[tag]
# i.e.
docker pull docker.paddlepaddlehub.com/paddle:0.11.0-gpu
.. _docker_run:
...@@ -102,6 +102,12 @@ We provide a packaged book image, simply issue the command:
docker run -p 8888:8888 paddlepaddle/book
For users in China, we provide a faster mirror:
.. code-block:: bash
docker run -p 8888:8888 docker.paddlepaddlehub.com/book
Then, copy and paste the address into your local browser:
.. code-block:: text
......
...@@ -39,6 +39,7 @@ PaddlePaddle可以使用常用的Python包管理工具
"cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无" "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
"cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
"cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
......
...@@ -42,6 +42,7 @@ If the links below shows up the login form, just click "Log in as guest" to star
"cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available" "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
"cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
"cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
......
...@@ -60,8 +60,7 @@ each column is as follows:
| column | meaning |
| --- | --- |
| ncalls | the number of calls into a function |
| tottime | the total execution time of the function, not including the execution time of other functions called by the function |
| percall | tottime divided by ncalls |
| cumtime | the total execution time of the function, including the execution time of other functions being called |
| percall | cumtime divided by ncalls |
......
...@@ -16,6 +16,12 @@ PaddlePaddle must be installed on all nodes. If you have GPU cards on your nodes
PaddlePaddle build and installation guide can be found [here](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html).
In addition to the above, the `cmake` command should be run with the option `WITH_DISTRIBUTE` set to ON. A bare-minimum example `cmake` command looks as follows:
``` bash
cmake .. -DWITH_DOC=OFF -DWITH_GPU=OFF -DWITH_DISTRIBUTE=ON -DWITH_SWIG_PY=ON -DWITH_PYTHON=ON
```
### Update the training script
#### Non-cluster training script
...@@ -119,7 +125,14 @@ for pass_id in range(100):
### E2E demo
Please find the complete demo from [here](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py).
First `cd` into the folder that contains the `python` files. In this case:
```bash
cd /paddle/python/paddle/v2/fluid/tests/book_distribute
```
On the parameter server node, run the following on the command line:
``` bash
PSERVERS=192.168.1.2:6174 SERVER_ENDPOINT=192.168.1.2:6174 TRAINING_ROLE=PSERVER python notest_dist_fit_a_line.py
......
...@@ -18,7 +18,7 @@ else()
add_subdirectory(capi)
endif()
if(NOT ANDROID AND NOT IOS)
add_subdirectory(memory)
add_subdirectory(platform)
add_subdirectory(framework)
......
# ddim lib
proto_library(framework_proto SRCS framework.proto)
cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
nv_test(dim_test SRCS dim_test.cu DEPS ddim)
...@@ -22,11 +22,11 @@ cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)
cc_test(variable_test SRCS variable_test.cc)
cc_library(threadpool SRCS threadpool.cc DEPS enforce)
cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
cc_library(scope SRCS scope.cc DEPS glog threadpool)
...@@ -45,7 +45,7 @@ cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_
cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor
framework_proto selected_rows data_device_transform data_type_transform data_layout_transform)
cc_library(attribute SRCS attribute.cc DEPS framework_proto boost)
cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc
device_context)
cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute)
...@@ -74,7 +74,10 @@ cc_library(backward SRCS backward.cc DEPS net_op)
cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op)
cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
framework_proto backward glog lod_rank_table profiler feed_fetch_method)
cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
...@@ -95,3 +98,5 @@ if(NOT WITH_C_API AND WITH_FLUID)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/framework.pb.h DESTINATION include/paddle/framework)
install(FILES details/cow_ptr.h details/op_registry.h DESTINATION include/paddle/framework/details)
endif()
cc_test(channel_test SRCS channel_test.cc)
...@@ -61,6 +61,9 @@ Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc) {
}
return val;
}
case proto::AttrType::LONG: {
return attr_desc.l();
}
default:
PADDLE_THROW("Unsupport attr type %d", attr_desc.type());
}
......
...@@ -168,6 +168,32 @@ struct ExtractAttribute<bool> {
const std::string& attr_name_;
};
template <>
struct ExtractAttribute<int64_t> {
explicit ExtractAttribute(const std::string& attr_name)
: attr_name_(attr_name) {}
int64_t* operator()(Attribute& attr) const {
if (attr.type() == typeid(int)) { // NOLINT
int val = boost::get<int>(attr);
attr = static_cast<int64_t>(val);
} else if (attr.type() == typeid(float)) { // NOLINT
float val = boost::get<float>(attr);
attr = static_cast<int64_t>(val);
}
int64_t* attr_value = nullptr;
try {
attr_value = &boost::get<int64_t>(attr);
} catch (boost::bad_get& bad_get) {
PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s",
attr_name_, attr.type().name());
}
return attr_value;
}
const std::string& attr_name_;
};
// check whether a certain attribute fit its limits
// an attribute can have more than one limits
template <typename T>
......
...@@ -75,7 +75,7 @@ std::vector<VarDesc *> BlockDesc::AllVars() const {
OpDesc *BlockDesc::AppendOp() {
need_update_ = true;
ops_.emplace_back(new OpDesc(this));
return ops_.back().get();
}
...@@ -86,7 +86,7 @@ void BlockDesc::AppendAllocatedOp(std::unique_ptr<OpDesc> &&op_desc) {
OpDesc *BlockDesc::PrependOp() {
need_update_ = true;
ops_.emplace_front(new OpDesc(this));
return ops_.front().get();
}
...@@ -153,7 +153,7 @@ BlockDesc::BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc)
vars_[var_desc.name()].reset(new VarDesc(var_desc));
}
for (const proto::OpDesc &op_desc : desc_->ops()) {
ops_.emplace_back(new OpDesc(op_desc, prog, this));
}
}
...@@ -162,7 +162,7 @@ BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc,
: prog_(prog), desc_(desc) {
need_update_ = true;
for (auto &op : other.ops_) {
ops_.emplace_back(new OpDesc(*op, this));
}
for (auto &it : other.vars_) {
......
...@@ -14,38 +14,45 @@ limitations under the License. */
#pragma once
#include <stddef.h>  // for size_t
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/program_desc.h"
namespace paddle {
namespace framework {
// Channel is the abstract class of buffered and un-buffered channels.
template <typename T>
class Channel {
public:
virtual void Send(T*) = 0;
virtual void Receive(T*) = 0;
virtual size_t Cap() = 0;
virtual void Close() = 0;
virtual ~Channel() {}
};
// Forward declaration of channel implementations.
namespace details {
template <typename T>
class Buffered;
template <typename T>
class UnBuffered;
}  // namespace details
template <typename T>
Channel<T>* MakeChannel(size_t buffer_size) {
if (buffer_size > 0) {
return new details::Buffered<T>(buffer_size);
}
return new details::UnBuffered<T>();
}
template <typename T>
void CloseChannel(Channel<T>* ch) {
ch->Close();
}
void Execute(const std::vector<framework::LoDTensor>& feeds,
std::vector<framework::LoDTensor>& fetchs);
private:
bool IsParameter(const framework::VarDesc* var);
void GenerateLoadProgram(const std::string& dirname);
void PrependFeedOp();
void AppendFetchOp();
private:
framework::ProgramDesc* program_;
framework::ProgramDesc* load_program_;
std::vector<std::string> feed_var_names_;
std::vector<std::string> fetch_var_names_;
};
} // namespace framework
} // namespace paddle
#include "paddle/framework/details/buffered_channel.h"
#include "paddle/framework/details/unbuffered_channel.h"
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/channel.h"
#include <chrono>
#include <thread>
#include "gtest/gtest.h"
using paddle::framework::Channel;
using paddle::framework::MakeChannel;
using paddle::framework::CloseChannel;
TEST(Channel, MakeAndClose) {
using paddle::framework::details::Buffered;
using paddle::framework::details::UnBuffered;
{
// MakeChannel should return a buffered channel if buffer_size > 0.
auto ch = MakeChannel<int>(10);
EXPECT_NE(dynamic_cast<Buffered<int>*>(ch), nullptr);
EXPECT_EQ(dynamic_cast<UnBuffered<int>*>(ch), nullptr);
CloseChannel(ch);
delete ch;
}
{
// MakeChannel should return an un-buffered channel if buffer_size = 0.
auto ch = MakeChannel<int>(0);
EXPECT_EQ(dynamic_cast<Buffered<int>*>(ch), nullptr);
EXPECT_NE(dynamic_cast<UnBuffered<int>*>(ch), nullptr);
CloseChannel(ch);
delete ch;
}
}
TEST(Channel, SufficientBufferSizeDoesntBlock) {
const size_t buffer_size = 10;
auto ch = MakeChannel<size_t>(buffer_size);
for (size_t i = 0; i < buffer_size; ++i) {
ch->Send(&i); // should not block
}
size_t out;
for (size_t i = 0; i < buffer_size; ++i) {
ch->Receive(&out); // should not block
EXPECT_EQ(out, i);
}
CloseChannel(ch);
delete ch;
}
TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
const size_t buffer_size = 10;
auto ch = MakeChannel<size_t>(buffer_size);
size_t sum = 0;
std::thread t([&]() {
// Try to write more than buffer size.
for (size_t i = 0; i < 2 * buffer_size; ++i) {
ch->Send(&i); // should not block
sum += i;
}
});
std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec
EXPECT_EQ(sum, 45U);
CloseChannel(ch);
t.join();
delete ch;
}
...@@ -79,5 +79,33 @@ inline void VisitDataType(proto::DataType type, Visitor visitor) {
}
}
inline std::string DataTypeToString(const proto::DataType type) {
using namespace paddle::framework::proto;
switch (type) {
case DataType::FP16:
return "float16";
case DataType::FP32:
return "float32";
case DataType::FP64:
return "float64";
case DataType::INT16:
return "int16";
case DataType::INT32:
return "int32";
case DataType::INT64:
return "int64";
case DataType::BOOL:
return "bool";
default:
PADDLE_THROW("Not support type %d", type);
}
}
inline std::ostream& operator<<(std::ostream& out,
const proto::DataType& type) {
out << DataTypeToString(type);
return out;
}
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <condition_variable>
#include <deque>
#include <mutex>
#include "paddle/framework/channel.h"
#include "paddle/platform/enforce.h"
namespace paddle {
namespace framework {
namespace details {
template <typename T>
class Buffered : public paddle::framework::Channel<T> {
friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
friend void paddle::framework::CloseChannel<T>(Channel<T>*);
public:
virtual void Send(T*);
virtual void Receive(T*);
virtual size_t Cap() { return cap_; }
virtual void Close();
virtual ~Buffered();
private:
size_t cap_;
std::mutex mu_;
std::condition_variable empty_cond_var_;
std::condition_variable full_cond_var_;
std::deque<T> channel_;
bool closed_;
Buffered(size_t cap) : cap_(cap), closed_(false) {
PADDLE_ENFORCE_GT(cap, 0);
}
void NotifyAllSenders(std::unique_lock<std::mutex>*);
};
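// Send blocks on full_cond_var_ until there is room in the queue or the
// channel has been closed; Receive blocks on empty_cond_var_ until an element
// is available or the channel has been closed. NotifyAllSenders releases the
// lock before notifying so that a woken sender can re-acquire it immediately.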
template <typename T>
void Buffered<T>::Send(T* item) {
std::unique_lock<std::mutex> lock(mu_);
full_cond_var_.wait(lock,
[this]() { return channel_.size() < cap_ || closed_; });
if (!closed_) {
channel_.push_back(std::move(*item));
lock.unlock();
empty_cond_var_.notify_one();
}
}
template <typename T>
void Buffered<T>::Receive(T* item) {
std::unique_lock<std::mutex> lock(mu_);
empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; });
if (!closed_) {
*item = std::move(channel_.front());
channel_.pop_front();
NotifyAllSenders(&lock);
} else {
item = nullptr;
}
}
template <typename T>
void Buffered<T>::Close() {
std::unique_lock<std::mutex> lock(mu_);
closed_ = true;
NotifyAllSenders(&lock);
}
template <typename T>
Buffered<T>::~Buffered() {
std::unique_lock<std::mutex> lock(mu_);
closed_ = true;
channel_.clear();
NotifyAllSenders(&lock);
}
template <typename T>
void Buffered<T>::NotifyAllSenders(std::unique_lock<std::mutex>* lock) {
lock->unlock();
full_cond_var_.notify_all();
}
} // namespace details
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <condition_variable>
#include <deque>
#include <mutex>
#include "paddle/framework/channel.h"
namespace paddle {
namespace framework {
namespace details {
template <typename T>
class UnBuffered : public paddle::framework::Channel<T> {
friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
friend void paddle::framework::CloseChannel<T>(Channel<T>*);
public:
virtual void Send(T*);
virtual void Receive(T*);
virtual size_t Cap() { return 0; }
virtual void Close();
virtual ~UnBuffered();
private:
UnBuffered() {}
};
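// Note: the un-buffered channel is only a stub at this point; Send, Receive
// and Close below are empty placeholder implementations.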
template <typename T>
void UnBuffered<T>::Send(T* channel_element) {}
template <typename T>
void UnBuffered<T>::Receive(T*) {}
template <typename T>
void UnBuffered<T>::Close() {}
template <typename T>
UnBuffered<T>::~UnBuffered() {}
} // namespace details
} // namespace framework
} // namespace paddle
...@@ -17,13 +17,15 @@ limitations under the License. */
#include <set>
#include "gflags/gflags.h"
#include "paddle/framework/feed_fetch_method.h"
#include "paddle/framework/feed_fetch_type.h"
#include "paddle/framework/lod_rank_table.h"
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h"
#include "paddle/platform/place.h"
#include "paddle/platform/profiler.h"
DECLARE_bool(benchmark);
DEFINE_bool(check_nan_inf, false,
"Checking whether operator produce NAN/INF or not. It will be "
"extremely slow so please use this flag wisely.");
...@@ -31,9 +33,6 @@ DEFINE_bool(check_nan_inf, false,
namespace paddle {
namespace framework {
const std::string kFeedOpType = "feed";
const std::string kFetchOpType = "fetch";
Executor::Executor(const platform::Place& place) : place_(place) {}
static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
...@@ -116,9 +115,14 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
for (auto& op_desc : block.AllOps()) {
auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
VLOG(4) << op->DebugStringEx(local_scope);
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(op->Type(), pool.Get(place_));
op->Run(*local_scope, place_);
VLOG(3) << op->DebugStringEx(local_scope);
if (FLAGS_benchmark) {
VLOG(2) << "Memory used after operator " + op->Type() + " running: "
<< memory::memory_usage(place_);
}
...@@ -135,7 +139,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
if (create_vars && create_local_scope) {
scope->DeleteScope(local_scope);
}
if (FLAGS_benchmark) {
VLOG(2) << "-------------------------------------------------------";
VLOG(2) << "Memory used after deleting local scope: "
<< memory::memory_usage(place_);
...@@ -143,5 +147,164 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
}
}
// Check whether the block already has feed operators and feed_holder.
// Return false if the block does not have any feed operators.
// If some feed operators have been prepended to the block, check that
// the info contained in these feed operators matches the feed_targets
// and feed_holder_name. Raise exception when any mismatch is found.
// Return true if the block has feed operators and holder of matching info.
static bool has_feed_operators(
BlockDesc* block, std::map<std::string, const LoDTensor*>& feed_targets,
const std::string& feed_holder_name) {
size_t feed_count = 0;
for (auto* op : block->AllOps()) {
if (op->Type() == kFeedOpType) {
feed_count++;
PADDLE_ENFORCE_EQ(op->Input("X")[0], feed_holder_name,
"Input to feed op should be '%s'", feed_holder_name);
std::string feed_target_name = op->Output("Out")[0];
PADDLE_ENFORCE(
feed_targets.find(feed_target_name) != feed_targets.end(),
"Feed operator output name '%s' cannot be found in 'feed_targets'",
feed_target_name);
}
}
if (feed_count > 0) {
PADDLE_ENFORCE_EQ(
feed_count, feed_targets.size(),
"The number of feed operators should match 'feed_targets'");
// When feed operator are present, so should be feed_holder
auto var = block->FindVar(feed_holder_name);
PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
feed_holder_name);
PADDLE_ENFORCE_EQ(var->GetType(), proto::VarDesc::FEED_MINIBATCH,
"'%s' variable should be 'FEED_MINIBATCH' type",
feed_holder_name);
}
return feed_count > 0;
}
// Check whether the block already has fetch operators and fetch_holder.
// Return false if the block does not have any fetch operators.
// If some fetch operators have been appended to the block, check that
// the info contained in these fetch operators matches the fetch_targets
// and fetch_holder_name. Raise exception when any mismatch is found.
// Return true if the block has fetch operators and holder of matching info.
static bool has_fetch_operators(
BlockDesc* block, std::map<std::string, LoDTensor*>& fetch_targets,
const std::string& fetch_holder_name) {
size_t fetch_count = 0;
for (auto* op : block->AllOps()) {
if (op->Type() == kFetchOpType) {
fetch_count++;
PADDLE_ENFORCE_EQ(op->Output("Out")[0], fetch_holder_name,
"Output of fetch op should be '%s'", fetch_holder_name);
std::string fetch_target_name = op->Input("X")[0];
PADDLE_ENFORCE(
fetch_targets.find(fetch_target_name) != fetch_targets.end(),
"Fetch operator input name '%s' cannot be found in 'fetch_targets'",
fetch_target_name);
}
}
if (fetch_count > 0) {
PADDLE_ENFORCE_EQ(
fetch_count, fetch_targets.size(),
"The number of fetch operators should match 'fetch_targets'");
// When fetch operator are present, so should be fetch_holder
auto var = block->FindVar(fetch_holder_name);
PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
fetch_holder_name);
PADDLE_ENFORCE_EQ(var->GetType(), proto::VarDesc::FETCH_LIST,
"'%s' variable should be 'FETCH_LIST' type",
fetch_holder_name);
}
return fetch_count > 0;
}
void Executor::Run(const ProgramDesc& program, Scope* scope,
std::map<std::string, const LoDTensor*>& feed_targets,
std::map<std::string, LoDTensor*>& fetch_targets,
const std::string& feed_holder_name,
const std::string& fetch_holder_name) {
auto* copy_program = new ProgramDesc(program);
auto* global_block = copy_program->MutableBlock(0);
if (!has_feed_operators(global_block, feed_targets, feed_holder_name)) {
// create feed_holder variable
auto* feed_holder = global_block->Var(feed_holder_name);
feed_holder->SetType(proto::VarDesc::FEED_MINIBATCH);
feed_holder->SetPersistable(true);
int i = 0;
for (auto& feed_target : feed_targets) {
std::string var_name = feed_target.first;
VLOG(3) << "feed target's name: " << var_name;
// prepend feed op
auto* op = global_block->PrependOp();
op->SetType(kFeedOpType);
op->SetInput("X", {feed_holder_name});
op->SetOutput("Out", {var_name});
op->SetAttr("col", {static_cast<int>(i)});
op->CheckAttrs();
i++;
}
}
// map the data of feed_targets to feed_holder
for (auto* op : global_block->AllOps()) {
if (op->Type() == kFeedOpType) {
std::string feed_target_name = op->Output("Out")[0];
int idx = boost::get<int>(op->GetAttr("col"));
SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name,
idx);
}
}
if (!has_fetch_operators(global_block, fetch_targets, fetch_holder_name)) {
// create fetch_holder variable
auto* fetch_holder = global_block->Var(fetch_holder_name);
fetch_holder->SetType(proto::VarDesc::FETCH_LIST);
fetch_holder->SetPersistable(true);
int i = 0;
for (auto& fetch_target : fetch_targets) {
std::string var_name = fetch_target.first;
VLOG(3) << "fetch target's name: " << var_name;
// append fetch op
auto* op = global_block->AppendOp();
op->SetType(kFetchOpType);
op->SetInput("X", {var_name});
op->SetOutput("Out", {fetch_holder_name});
op->SetAttr("col", {static_cast<int>(i)});
op->CheckAttrs();
i++;
}
}
Run(*copy_program, scope, 0, true, true);
// obtain the data of fetch_targets from fetch_holder
for (auto* op : global_block->AllOps()) {
if (op->Type() == kFetchOpType) {
std::string fetch_target_name = op->Input("X")[0];
int idx = boost::get<int>(op->GetAttr("col"));
*fetch_targets[fetch_target_name] =
GetFetchVariable(*scope, fetch_holder_name, idx);
}
}
delete copy_program;
}
} // namespace framework
} // namespace paddle
...@@ -41,6 +41,12 @@ class Executor {
void Run(const ProgramDesc&, Scope*, int, bool create_local_scope = true,
bool create_vars = true);
void Run(const ProgramDesc& program, Scope* scope,
std::map<std::string, const LoDTensor*>& feed_targets,
std::map<std::string, LoDTensor*>& fetch_targets,
const std::string& feed_holder_name = "feed",
const std::string& fetch_holder_name = "fetch");
private:
const platform::Place place_;
};
......
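As a hedged usage sketch of the new `Run` overload (loading `program`, the feed name "x", and the fetch name "out" are assumptions for illustration), running an inference program now only requires building the two target maps:
```cpp
#include <map>
#include <string>
#include "paddle/framework/executor.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/scope.h"

// `program` is assumed to be a ProgramDesc loaded elsewhere, e.g. from a
// saved inference model, with feed target "x" and fetch target "out".
void RunInference(const paddle::framework::ProgramDesc& program,
                  const paddle::framework::LoDTensor& input) {
  paddle::platform::CPUPlace place;
  paddle::framework::Executor executor(place);
  paddle::framework::Scope scope;

  paddle::framework::LoDTensor output;
  std::map<std::string, const paddle::framework::LoDTensor*> feed_targets{
      {"x", &input}};
  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets{
      {"out", &output}};

  // Feed/fetch operators are prepended/appended to block 0 automatically,
  // the data is bound via SetFeedVariable/GetFetchVariable, and block 0 runs.
  executor.Run(program, &scope, feed_targets, fetch_targets);
}
```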
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/feed_fetch_method.h"
#include "glog/logging.h"
#include "paddle/framework/variable.h"
namespace paddle {
namespace framework {
void SetFeedVariable(Scope* scope, const LoDTensor& input,
const std::string& var_name, size_t index) {
// If var_name Variable is not found in GlobalScope, a new variable will
// be created.
VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
Variable* g_feed_value = scope->Var(var_name);
auto& feed_inputs =
*(g_feed_value->GetMutable<std::vector<paddle::framework::LoDTensor>>());
if (index >= feed_inputs.size()) {
feed_inputs.resize(index + 1);
}
// shared data with input tensor
feed_inputs[index].ShareDataWith(input);
// set lod
feed_inputs[index].set_lod(input.lod());
}
LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
size_t index) {
// Since we want to fetch LodTensor from a variable, the variable must
// be created alreadly.
Variable* g_fetch_value = scope.FindVar(var_name);
PADDLE_ENFORCE(g_fetch_value->IsType<FeedFetchList>(),
"Only %s can be invoked by GetFetchVariable",
typeid(FeedFetchList).name());
auto& fetch_outputs = *g_fetch_value->GetMutable<FeedFetchList>();
auto& tensor = fetch_outputs[index];
VLOG(3) << "Fetch " << var_name << " with index " << index
<< " shape= " << tensor.dims();
PADDLE_ENFORCE_LT(index, fetch_outputs.size());
return tensor;
}
} // namespace framework
} // namespace paddle
...@@ -13,46 +13,18 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "glog/logging.h"
#include "paddle/framework/feed_fetch_type.h"
#include "paddle/framework/scope.h"
#include "paddle/framework/variable.h"
namespace paddle {
namespace framework {
void SetFeedVariable(Scope* scope, const LoDTensor& input,
const std::string& var_name, size_t index);
// If var_name Variable is not found in GlobalScope, a new variable will
// be created.
VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
Variable* g_feed_value = scope->Var(var_name);
auto& feed_inputs =
*(g_feed_value->GetMutable<std::vector<paddle::framework::LoDTensor>>());
if (index >= feed_inputs.size()) {
feed_inputs.resize(index + 1);
}
// shared data with input tensor
feed_inputs[index].ShareDataWith(input);
// set lod
feed_inputs[index].set_lod(input.lod());
}
LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
size_t index);
// Since we want to fetch LodTensor from a variable, the variable must
// be created alreadly.
Variable* g_fetch_value = scope.FindVar(var_name);
PADDLE_ENFORCE(g_fetch_value->IsType<FeedFetchList>(),
"Only %s can be invoked by GetFetchVariable",
typeid(FeedFetchList).name());
auto& fetch_outputs = *g_fetch_value->GetMutable<FeedFetchList>();
auto& tensor = fetch_outputs[index];
VLOG(3) << "Fetch " << var_name << " with index " << index
<< " shape= " << tensor.dims();
PADDLE_ENFORCE_LT(index, fetch_outputs.size());
return tensor;
}
} // namespace framework
} // namespace paddle
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/framework/lod_tensor.h"
...@@ -20,5 +21,8 @@ namespace paddle {
namespace framework {
using FeedFetchType = LoDTensor;
using FeedFetchList = std::vector<FeedFetchType>;
static const std::string kFeedOpType = "feed";
static const std::string kFetchOpType = "fetch";
} // namespace framework
} // namespace paddle
...@@ -26,6 +26,7 @@ enum AttrType {
BOOLEAN = 6;
BOOLEANS = 7;
BLOCK = 8;
LONG = 9;
}
// OpDesc describes an instance of a C++ framework::OperatorBase
...@@ -44,6 +45,7 @@ message OpDesc {
optional bool b = 10;
repeated bool bools = 11;
optional int32 block_idx = 12;
optional int64 l = 13;
};
message Var {
......
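As a hedged sketch of what the new `LONG` attribute type enables (the value is arbitrary and only for illustration), an `int64_t` can now travel through `Attribute` and be read back with `boost::get`:
```cpp
// GetAttrValue() returns attr_desc.l() for LONG attributes, and
// ExtractAttribute<int64_t> converts int/float attributes on demand.
paddle::framework::Attribute attr = static_cast<int64_t>(1) << 40;
int64_t value = boost::get<int64_t>(attr);  // 1099511627776
```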
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <string.h> // for strdup
#include <algorithm>
#include <stdexcept>
#include <string>
#include "paddle/framework/init.h"
...@@ -46,17 +47,23 @@ void InitDevices() {
std::vector<platform::Place> places;
places.emplace_back(platform::CPUPlace());
int count = 0;
#ifdef PADDLE_WITH_CUDA
try {
count = platform::GetCUDADeviceCount();
} catch (const std::exception &exp) {
LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime.";
}
#else
LOG(WARNING)
<< "'CUDA' is not supported, Please re-compile with WITH_GPU option";
#endif
for (int i = 0; i < count; ++i) {
places.emplace_back(platform::CUDAPlace(i));
}
platform::DeviceContextPool::Init(places);
}
......
...@@ -20,7 +20,21 @@ TEST(InitDevices, CPU) {
using paddle::framework::InitDevices;
using paddle::platform::DeviceContextPool;
#ifndef PADDLE_WITH_CUDA
InitDevices();
DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_EQ(pool.size(), 1U);
#endif
}
TEST(InitDevices, CUDA) {
using paddle::framework::InitDevices;
using paddle::platform::DeviceContextPool;
#ifdef PADDLE_WITH_CUDA
int count = paddle::platform::GetCUDADeviceCount();
InitDevices();
DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count));
#endif
} }
...@@ -24,8 +24,6 @@ limitations under the License. */ ...@@ -24,8 +24,6 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <iterator> #include <iterator>
#include <glog/logging.h>
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -107,9 +105,10 @@ LoD ToAbsOffset(const LoD &in) { ...@@ -107,9 +105,10 @@ LoD ToAbsOffset(const LoD &in) {
// the lowest level stores relative offsets // the lowest level stores relative offsets
if (in.empty() || in.size() == 1) return in; if (in.empty() || in.size() == 1) return in;
LoD result = in; LoD result = in;
for (int level = result.size() - 2; level >= 0; level--) { for (auto level = static_cast<int>(in.size() - 2); level >= 0; level--) {
for (auto &ele : result[level]) { for (size_t i = 0; i < in[level].size(); ++i) {
ele = result[level + 1][ele]; size_t index = in[level][i];
result[level][i] = result[level + 1][index];
} }
} }
return result; return result;
......
...@@ -18,11 +18,11 @@ limitations under the License. */ ...@@ -18,11 +18,11 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <thrust/device_vector.h> #include <thrust/device_vector.h>
#include <thrust/host_vector.h> #include <thrust/host_vector.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>
#endif #endif
#include <glog/logging.h> #include <glog/logging.h>
#include "paddle/framework/ddim.h" #include "paddle/framework/ddim.h"
#include "paddle/framework/mixed_vector.h"
#include "paddle/framework/tensor.h" #include "paddle/framework/tensor.h"
#include "paddle/framework/tensor_util.h" #include "paddle/framework/tensor_util.h"
#include "paddle/platform/enforce.h" #include "paddle/platform/enforce.h"
...@@ -31,15 +31,6 @@ limitations under the License. */ ...@@ -31,15 +31,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
#ifndef PADDLE_WITH_CUDA
template <typename T>
using Vector = std::vector<T>;
#else
template <typename T>
using Vector = thrust::host_vector<
T, thrust::system::cuda::experimental::pinned_allocator<T>>;
#endif
/* /*
* LoD is short for Level of Details. * LoD is short for Level of Details.
* *
...@@ -55,7 +46,15 @@ using Vector = thrust::host_vector< ...@@ -55,7 +46,15 @@ using Vector = thrust::host_vector<
* 0 2 4 7 * 0 2 4 7
* 0 2 5 7 10 12 15 20 * 0 2 5 7 10 12 15 20
*/ */
using LoD = std::vector<Vector<size_t>>; struct LoD : public std::vector<Vector<size_t>> {
using std::vector<Vector<size_t>>::vector;
void CopyFromCUDA() {
for (auto it = this->begin(); it != this->end(); ++it) {
it->CopyFromCUDA();
}
}
};
std::ostream& operator<<(std::ostream& os, const LoD& lod); std::ostream& operator<<(std::ostream& os, const LoD& lod);
std::ostream& operator<<(std::ostream& os, const LoDTensor& t); std::ostream& operator<<(std::ostream& os, const LoDTensor& t);
...@@ -109,7 +108,10 @@ bool CheckAbsLoD(const LoD& in, int tensor_height = -1); ...@@ -109,7 +108,10 @@ bool CheckAbsLoD(const LoD& in, int tensor_height = -1);
*/ */
class LoDTensor : public Tensor { class LoDTensor : public Tensor {
public: public:
LoDTensor() {} LoDTensor() : Tensor() {}
/* Constructor with place should only be used in pybind */
explicit LoDTensor(const platform::Place& place) : Tensor(place) {}
explicit LoDTensor(const LoD& lod) : lod_(lod) {} explicit LoDTensor(const LoD& lod) : lod_(lod) {}
......
...@@ -23,6 +23,17 @@ ...@@ -23,6 +23,17 @@
namespace paddle { namespace paddle {
namespace framework { namespace framework {
TEST(LoD, data) {
LoD lod{{0, 1, 2}};
lod.push_back({0, 2, 4, 5});
lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
auto& v = lod[0];
for (size_t i = 0; i < v.size(); ++i) {
EXPECT_EQ(v[i], i);
}
}
TEST(LodExpand, test) { TEST(LodExpand, test) {
LoD lod{{0, 2}}; LoD lod{{0, 2}};
LoDTensor tensor; LoDTensor tensor;
......
...@@ -14,6 +14,8 @@ ...@@ -14,6 +14,8 @@
#include <cuda.h> #include <cuda.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <stdio.h>
#include "paddle/framework/init.h"
#include "paddle/framework/lod_tensor.h" #include "paddle/framework/lod_tensor.h"
#include "paddle/platform/assert.h" #include "paddle/platform/assert.h"
...@@ -26,7 +28,48 @@ __global__ void test(size_t* a, int size) { ...@@ -26,7 +28,48 @@ __global__ void test(size_t* a, int size) {
} }
} }
TEST(Vector, Normal) {
using namespace paddle::framework;
using namespace paddle::platform;
using namespace paddle::memory;
paddle::framework::InitDevices();
paddle::framework::Vector<size_t> vec({1, 2, 3});
size_t* ptr = vec.data();
for (size_t i = 0; i < vec.size(); ++i) {
EXPECT_EQ(vec[i], *(ptr + i));
}
vec.clear();
vec.CopyFromCUDA();
std::vector<size_t> v = {1, 2, 3};
for (size_t i = 0; i < v.size(); ++i) {
EXPECT_EQ(v[i], vec[i]);
}
}
TEST(LoD, data) {
paddle::framework::InitDevices();
paddle::framework::LoD lod{{0, 1, 2}};
lod.push_back({0, 2, 4, 5});
lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
auto& v = lod[0];
test<<<1, 1>>>(v.cuda_data(), v.size());
cudaDeviceSynchronize();
v.CopyFromCUDA();
for (size_t i = 0; i < v.size(); ++i) {
EXPECT_EQ(v[i], i * 2);
}
}
TEST(LoDTensor, LoDInGPU) { TEST(LoDTensor, LoDInGPU) {
paddle::framework::InitDevices();
paddle::framework::LoDTensor lod_tensor; paddle::framework::LoDTensor lod_tensor;
paddle::platform::CUDAPlace place(0); paddle::platform::CUDAPlace place(0);
...@@ -42,8 +85,9 @@ TEST(LoDTensor, LoDInGPU) { ...@@ -42,8 +85,9 @@ TEST(LoDTensor, LoDInGPU) {
auto lod = lod_tensor.lod(); auto lod = lod_tensor.lod();
test<<<1, 8>>>(lod[0].data(), lod[0].size()); test<<<1, 8>>>(lod[0].cuda_data(), lod[0].size());
cudaDeviceSynchronize(); cudaDeviceSynchronize();
lod.CopyFromCUDA();
for (size_t i = 0; i < src_lod[0].size(); ++i) { for (size_t i = 0; i < src_lod[0].size(); ++i) {
EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2); EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2);
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <initializer_list>
#include <vector>
#include "paddle/memory/memcpy.h"
#include "paddle/memory/memory.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/enforce.h"
#include "paddle/platform/place.h"
namespace paddle {
namespace framework {
/**
* @brief Vector supports both CPU and GPU.
* The host vector's lifetime is the same as the Vector's;
* the device vector is lazily allocated and modified.
*/
template <typename T>
class Vector : public std::vector<T> {
public:
/* NOTE(dzhwinter):
* Data is always stored and modified on the host.
* If the data is modified through the cuda_data interface,
* you need to call CopyFromCUDA explicitly to synchronize the data.
*
*/
enum class kDataPosition {
kDataOnHost = 0,
kDataOnDevice = 1,
};
public:
using std::vector<T>::vector;
Vector() {}
Vector(const std::vector<T> &v) : std::vector<T>(v) {} // NOLINT
virtual ~Vector() {
#ifdef PADDLE_WITH_CUDA
if (cuda_ptr_ != nullptr) {
memory::Free<platform::CUDAPlace>(place_, static_cast<void *>(cuda_ptr_));
}
#endif
}
T *cuda_data() {
CopyToCUDA();
PADDLE_ENFORCE_NOT_NULL(
cuda_ptr_, "No data or Insufficient CUDA memory to allocation");
return static_cast<T *>(cuda_ptr_);
}
T *data() { return std::vector<T>::data(); }
const T *data() const { return std::vector<T>::data(); }
void CopyToCUDA();
void CopyFromCUDA();
void CopyToPeer(platform::Place);
private:
void *cuda_ptr_ = nullptr;
size_t cuda_size_ = 0;
/* The DataPosition is unused now;
if we want to support random access from both CPU and CUDA,
we need to overload all the vector methods. */
kDataPosition position_ = kDataPosition::kDataOnHost;
platform::CUDAPlace place_;
};
template <typename T>
void Vector<T>::CopyToCUDA() {
#ifdef PADDLE_WITH_CUDA
if (cuda_ptr_ == nullptr) {
cuda_ptr_ =
memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T));
}
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *cuda_ctx = pool.GetByPlace(place_);
memory::Copy(place_, static_cast<void *>(cuda_ptr_), platform::CPUPlace(),
static_cast<const void *>(this->data()),
this->size() * sizeof(T), cuda_ctx->stream());
cuda_ctx->Wait();
cuda_size_ = this->size();
#endif
}
template <typename T>
void Vector<T>::CopyFromCUDA() {
#ifdef PADDLE_WITH_CUDA
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *cuda_ctx = pool.GetByPlace(place_);
if (cuda_ptr_ == nullptr) {
LOG(WARNING) << "No uncommited cuda data.";
return;
}
this->resize(cuda_size_);
memory::Copy(platform::CPUPlace(), static_cast<void *>(this->data()), place_,
static_cast<const void *>(cuda_ptr_), this->size() * sizeof(T),
cuda_ctx->stream());
cuda_ctx->Wait();
#endif
}
template <typename T>
void Vector<T>::CopyToPeer(platform::Place peer_place) {
if (platform::is_cpu_place(peer_place)) {
return;
}
#ifdef PADDLE_WITH_CUDA
auto *cuda_ctx = platform::DeviceContextPool::Instance().GetByPlace(place_);
void *peer_cuda_ptr_ = memory::Alloc<platform::CUDAPlace>(
boost::get<platform::CUDAPlace>(peer_place), this->size() * sizeof(T));
memory::Copy(boost::get<platform::CUDAPlace>(peer_place),
static_cast<void *>(peer_cuda_ptr_), place_,
static_cast<const void *>(cuda_ptr_), this->size() * sizeof(T),
cuda_ctx->stream());
cuda_ctx->Wait();
memory::Free<platform::CUDAPlace>(place_, static_cast<void *>(cuda_ptr_));
place_ = boost::get<platform::CUDAPlace>(peer_place);
cuda_ptr_ = peer_cuda_ptr_;
#endif
}
template class Vector<int>;
template class Vector<unsigned>;
template class Vector<size_t>;
template class Vector<int64_t>;
} // namespace framework
} // namespace paddle
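A minimal usage sketch of the Vector class above (illustrative only, assuming PADDLE_WITH_CUDA is enabled and devices have been initialized; it mirrors the CUDA test shown earlier):
  paddle::framework::InitDevices();
  paddle::framework::Vector<size_t> offsets({0, 2, 5});
  size_t* dev_ptr = offsets.cuda_data();  // lazily allocates device memory and copies host -> device
  // ... launch a kernel that reads/writes through dev_ptr ...
  offsets.CopyFromCUDA();                 // copy the (possibly modified) device data back to the host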
...@@ -97,7 +97,7 @@ void OpDesc::CopyFrom(const OpDesc &op_desc) { ...@@ -97,7 +97,7 @@ void OpDesc::CopyFrom(const OpDesc &op_desc) {
need_update_ = true; need_update_ = true;
} }
OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog) OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block)
: desc_(desc), need_update_(false) { : desc_(desc), need_update_(false) {
// restore inputs_ // restore inputs_
int input_size = desc_.inputs_size(); int input_size = desc_.inputs_size();
...@@ -131,6 +131,7 @@ OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog) ...@@ -131,6 +131,7 @@ OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog)
attrs_[attr_name] = prog->MutableBlock(bid); attrs_[attr_name] = prog->MutableBlock(bid);
} }
} }
this->block_ = block;
} }
proto::OpDesc *OpDesc::Proto() { proto::OpDesc *OpDesc::Proto() {
...@@ -282,6 +283,7 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> { ...@@ -282,6 +283,7 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {
VectorToRepeated(v, attr_->mutable_bools()); VectorToRepeated(v, attr_->mutable_bools());
} }
void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); } void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); }
void operator()(int64_t v) const { attr_->set_l(v); }
void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); } void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
}; };
......
...@@ -25,7 +25,6 @@ namespace framework { ...@@ -25,7 +25,6 @@ namespace framework {
class BlockDesc; class BlockDesc;
class ProgramDesc; class ProgramDesc;
class OpDesc { class OpDesc {
public: public:
OpDesc() {} OpDesc() {}
...@@ -33,7 +32,14 @@ class OpDesc { ...@@ -33,7 +32,14 @@ class OpDesc {
OpDesc(const std::string &type, const VariableNameMap &inputs, OpDesc(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs); const VariableNameMap &outputs, const AttributeMap &attrs);
OpDesc(const proto::OpDesc &desc, ProgramDesc *prog); OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block);
explicit OpDesc(BlockDesc *block) : block_(block) {}
OpDesc(const OpDesc &other, BlockDesc *block) {
*this = other;
block_ = block;
}
void CopyFrom(const OpDesc &op_desc); void CopyFrom(const OpDesc &op_desc);
...@@ -117,6 +123,10 @@ class OpDesc { ...@@ -117,6 +123,10 @@ class OpDesc {
void Flush(); void Flush();
BlockDesc *Block() { return this->block_; }
void SetBlock(BlockDesc *block) { this->block_ = block; }
private: private:
template <typename MapType> template <typename MapType>
static std::vector<typename MapType::key_type> MapKeys(const MapType &map) { static std::vector<typename MapType::key_type> MapKeys(const MapType &map) {
...@@ -129,6 +139,7 @@ class OpDesc { ...@@ -129,6 +139,7 @@ class OpDesc {
} }
proto::OpDesc desc_; proto::OpDesc desc_;
BlockDesc *block_; // not_own
// input arg name => input variable names // input arg name => input variable names
VariableNameMap inputs_; VariableNameMap inputs_;
// output arg name => output variable names // output arg name => output variable names
......
...@@ -26,9 +26,9 @@ TEST(OpKernelType, ToString) { ...@@ -26,9 +26,9 @@ TEST(OpKernelType, ToString) {
OpKernelType op_kernel_type(DataType::FP32, CPUPlace(), DataLayout::kNCHW, OpKernelType op_kernel_type(DataType::FP32, CPUPlace(), DataLayout::kNCHW,
LibraryType::kCUDNN); LibraryType::kCUDNN);
ASSERT_EQ( ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type),
paddle::framework::KernelTypeToString(op_kernel_type), "data_type[float32]:data_layout[NCHW]:place[CPUPlace]:library_type["
"data_type[5]:data_layout[NCHW]:place[CPUPlace]:library_type[CUDNN]"); "CUDNN]");
} }
TEST(OpKernelType, Hash) { TEST(OpKernelType, Hash) {
......
...@@ -22,9 +22,7 @@ limitations under the License. */ ...@@ -22,9 +22,7 @@ limitations under the License. */
#include "paddle/framework/shape_inference.h" #include "paddle/framework/shape_inference.h"
#include "paddle/framework/var_type.h" #include "paddle/framework/var_type.h"
DEFINE_bool(op_sync, false, DECLARE_bool(benchmark);
"Default cuda is asynchronous device, set to True will"
"force op run in synchronous mode.");
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -531,7 +529,7 @@ void OperatorWithKernel::Run(const Scope& scope, ...@@ -531,7 +529,7 @@ void OperatorWithKernel::Run(const Scope& scope,
ExecutionContext(*this, new_scope, *new_dev_ctx)); ExecutionContext(*this, new_scope, *new_dev_ctx));
/*For profiling/benchmark only*/ /*For profiling/benchmark only*/
if (FLAGS_op_sync) { if (FLAGS_benchmark) {
new_dev_ctx->Wait(); new_dev_ctx->Wait();
} }
} }
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/framework/program_desc.h" #include "paddle/framework/program_desc.h"
#include "paddle/framework/block_desc.h" #include "paddle/framework/block_desc.h"
#include "paddle/framework/feed_fetch_type.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -64,5 +65,27 @@ ProgramDesc::ProgramDesc(const std::string &binary_str) { ...@@ -64,5 +65,27 @@ ProgramDesc::ProgramDesc(const std::string &binary_str) {
} }
} }
const std::vector<std::string> ProgramDesc::GetFeedTargetNames() {
BlockDesc *global_block = blocks_[0].get();
std::vector<std::string> feed_target_names;
for (auto *op : global_block->AllOps()) {
if (op->Type() == kFeedOpType) {
feed_target_names.insert(feed_target_names.begin(), op->Output("Out")[0]);
}
}
return feed_target_names;
}
const std::vector<std::string> ProgramDesc::GetFetchTargetNames() {
BlockDesc *global_block = blocks_[0].get();
std::vector<std::string> fetch_target_names;
for (auto *op : global_block->AllOps()) {
if (op->Type() == kFetchOpType) {
fetch_target_names.push_back(op->Input("X")[0]);
}
}
return fetch_target_names;
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
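A hedged sketch of how the two new ProgramDesc accessors are intended to be consumed; `program` here is an assumed ProgramDesc loaded from a saved inference model:
  const std::vector<std::string>& feed_names = program.GetFeedTargetNames();
  const std::vector<std::string>& fetch_names = program.GetFetchTargetNames();
  // feed_names[i] names the variable produced by the i-th feed op;
  // fetch_names[i] names the variable consumed by the i-th fetch op.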
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <memory> #include <memory>
#include <vector> #include <vector>
#include "paddle/framework/block_desc.h"
#include "paddle/framework/framework.pb.h" #include "paddle/framework/framework.pb.h"
#include "paddle/framework/proto_desc.h" #include "paddle/framework/proto_desc.h"
#include "paddle/platform/macros.h" #include "paddle/platform/macros.h"
...@@ -45,6 +46,9 @@ class ProgramDesc { ...@@ -45,6 +46,9 @@ class ProgramDesc {
proto::ProgramDesc *Proto(); proto::ProgramDesc *Proto();
const std::vector<std::string> GetFeedTargetNames();
const std::vector<std::string> GetFetchTargetNames();
private: private:
proto::ProgramDesc desc_; proto::ProgramDesc desc_;
......
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <set> #include <set>
#include <string> #include <string>
#include <unordered_map>
#include <vector> #include <vector>
#include <glog/logging.h> #include <glog/logging.h>
...@@ -102,6 +103,32 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, ...@@ -102,6 +103,32 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
*op_field->Add() = input.blocks(block_id).ops(i); *op_field->Add() = input.blocks(block_id).ops(i);
} }
} }
// remove the VarDescs in BlockDesc that are not referenced in
// the pruned OpDescs
std::unordered_map<std::string, proto::VarDesc> var_map;
auto* var_field = output->mutable_blocks(block_id)->mutable_vars();
for (const auto& var : *var_field) {
var_map[var.name()] = var;
}
var_field->Clear();
for (const auto& op : *op_field) {
// add VarDescs of all input arguments for each OpDesc
auto& input_field = op.inputs();
for (auto& input_var : input_field) {
for (auto& arg : input_var.arguments()) {
*var_field->Add() = var_map[arg];
}
}
// add VarDescs of all output arguments for each OpDesc
auto& output_field = op.outputs();
for (auto& output_var : output_field) {
for (auto& arg : output_var.arguments()) {
*var_field->Add() = var_map[arg];
}
}
}
} }
// TODO(fengjiayi): Prune() could be inplaced to avoid unnecessary copies // TODO(fengjiayi): Prune() could be inplaced to avoid unnecessary copies
......
...@@ -20,9 +20,11 @@ limitations under the License. */ ...@@ -20,9 +20,11 @@ limitations under the License. */
#include "paddle/framework/threadpool.h" #include "paddle/framework/threadpool.h"
#include "paddle/string/printf.h" #include "paddle/string/printf.h"
DEFINE_bool(do_memory_benchmark, false, DEFINE_bool(benchmark, false,
"Doing memory benchmark. It will make deleting scope synchronized, " "Doing memory benchmark. It will make deleting scope synchronized, "
"and add some memory usage logs"); "and add some memory usage logs."
"Default cuda is asynchronous device, set to True will"
"force op run in synchronous mode.");
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -93,7 +95,7 @@ void Scope::DeleteScope(Scope* scope) { ...@@ -93,7 +95,7 @@ void Scope::DeleteScope(Scope* scope) {
PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
this->kids_.erase(it); this->kids_.erase(it);
// When making memory benchmark on Fluid, we have to delete scope sync. // When making memory benchmark on Fluid, we have to delete scope sync.
if (FLAGS_do_memory_benchmark) { if (FLAGS_benchmark) {
delete scope; delete scope;
} else { } else {
Async([scope] { delete scope; }); Async([scope] { delete scope; });
......
...@@ -47,6 +47,11 @@ class Tensor { ...@@ -47,6 +47,11 @@ class Tensor {
public: public:
Tensor() : offset_(0) {} Tensor() : offset_(0) {}
/*! Constructor with place should only be used in pybind. */
explicit Tensor(const platform::Place& place) : offset_(0) {
holder_->set_place(place);
}
/*! Return a pointer to mutable memory block. */ /*! Return a pointer to mutable memory block. */
template <typename T> template <typename T>
inline T* data(); inline T* data();
...@@ -137,6 +142,7 @@ class Tensor { ...@@ -137,6 +142,7 @@ class Tensor {
virtual std::type_index type() const = 0; virtual std::type_index type() const = 0;
virtual platform::Place place() const = 0; virtual platform::Place place() const = 0;
virtual void set_type(std::type_index type) = 0; virtual void set_type(std::type_index type) = 0;
virtual void set_place(platform::Place place) = 0;
}; };
template <typename Place> template <typename Place>
...@@ -156,6 +162,7 @@ class Tensor { ...@@ -156,6 +162,7 @@ class Tensor {
virtual void* ptr() const { return static_cast<void*>(ptr_.get()); } virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
virtual std::type_index type() const { return type_; } virtual std::type_index type() const { return type_; }
virtual void set_type(std::type_index type) { type_ = type; } virtual void set_type(std::type_index type) { type_ = type; }
virtual void set_place(platform::Place place) { place_ = place; }
/*! the pointer of memory block. */ /*! the pointer of memory block. */
std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t, Place>> ptr_; std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t, Place>> ptr_;
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/framework/threadpool.h" #include "paddle/framework/threadpool.h"
#include "paddle/platform/enforce.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
std::unique_ptr<ThreadPool> ThreadPool::threadpool(nullptr); std::unique_ptr<ThreadPool> ThreadPool::threadpool_(nullptr);
std::once_flag ThreadPool::init_flag; std::once_flag ThreadPool::init_flag_;
ThreadPool* ThreadPool::GetInstance() {
std::call_once(init_flag_, &ThreadPool::Init);
return threadpool_.get();
}
void ThreadPool::Init() {
if (threadpool_.get() == nullptr) {
// TODO(Yancey1989): specify the maximum number of threads
int num_threads = std::thread::hardware_concurrency();
PADDLE_ENFORCE_GT(num_threads, 0);
threadpool_.reset(new ThreadPool(num_threads));
}
}
ThreadPool::ThreadPool(int num_threads)
: total_threads_(num_threads), idle_threads_(num_threads), running_(true) {
threads_.resize(num_threads);
for (auto& thread : threads_) {
// TODO(Yancey1989): bind the thread to the specified CPU number
thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this)));
}
}
ThreadPool::~ThreadPool() {
{
// notify all threads to stop running
running_ = false;
scheduled_.notify_all();
}
for (auto& t : threads_) {
t->join();
t.reset(nullptr);
}
}
void ThreadPool::Wait() {
std::unique_lock<std::mutex> lock(mutex_);
completed_.wait(lock, [=] { return Done() == true; });
}
void ThreadPool::TaskLoop() {
while (running_) {
std::unique_lock<std::mutex> lock(mutex_);
scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; });
if (!running_) {
break;
}
// pop a task from the task queue
auto task = std::move(tasks_.front());
tasks_.pop();
--idle_threads_;
lock.unlock();
// run the task
task();
{
std::unique_lock<std::mutex> lock(mutex_);
++idle_threads_;
if (Done()) {
completed_.notify_all();
}
}
}
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -20,52 +20,36 @@ limitations under the License. */ ...@@ -20,52 +20,36 @@ limitations under the License. */
#include <mutex> #include <mutex>
#include <queue> #include <queue>
#include <thread> #include <thread>
#include <vector>
#include "paddle/platform/enforce.h" #include "paddle/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
namespace paddle { namespace paddle {
namespace framework { namespace framework {
// ThreadPool maintains a queue of tasks, and runs them using a fixed
// number of threads.
class ThreadPool { class ThreadPool {
public: public:
typedef std::packaged_task<void()> Task; typedef std::packaged_task<void()> Task;
/** // Returns the singleton of ThreadPool.
* @brief Get a instance of threadpool, the thread number will static ThreadPool* GetInstance();
* be specified as the number of hardware thread contexts
*/
static ThreadPool* GetInstance() {
std::call_once(init_flag, &ThreadPool::Init);
return threadpool.get();
}
~ThreadPool() { ~ThreadPool();
{
// notify all threads to stop running
running_ = false;
scheduled_.notify_all();
}
for (auto& t : threads_) {
t->join();
t.reset(nullptr);
}
}
int GetNumThreads() const { return num_threads_; } // Returns the number of threads created by the constructor.
size_t Threads() const { return total_threads_; }
int GetAvailable() { // Returns the number of currently idle threads.
size_t IdleThreads() {
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
return available_; return idle_threads_;
} }
/** // Run pushes a function to the task queue and returns a std::future
* @brief Push a function to the queue, and will be scheduled and // object. To wait for the completion of the task, call
* executed if a thread is available. // std::future::wait().
* @param[in] Task, will be pushed to the task queue.
* @return std::future<void>, we could wait for the task finished by
* f.wait().
*/
template <typename Callback> template <typename Callback>
std::future<void> Run(Callback fn) { std::future<void> Run(Callback fn) {
std::unique_lock<std::mutex> lock(mutex_); std::unique_lock<std::mutex> lock(mutex_);
...@@ -77,84 +61,40 @@ class ThreadPool { ...@@ -77,84 +61,40 @@ class ThreadPool {
return f; return f;
} }
/** // Wait until all the tasks are completed.
* @brief Wait until all the tasks are completed. void Wait();
*/
void Wait() {
std::unique_lock<std::mutex> lock(mutex_);
completed_.wait(lock, [=] { return Done() == true; });
}
private: private:
DISABLE_COPY_AND_ASSIGN(ThreadPool); DISABLE_COPY_AND_ASSIGN(ThreadPool);
explicit ThreadPool(int num_threads) explicit ThreadPool(int num_threads);
: num_threads_(num_threads), available_(num_threads), running_(true) {
threads_.resize(num_threads);
for (auto& thread : threads_) {
// TODO(Yancey1989): binding the thread on the specify CPU number
thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this)));
}
}
/** // If the task queue is empty and the number of idle threads equals the
* @brief If the task queue is empty and avaialbe // total number of threads, all tasks are completed. Note: this function
* is equal to the number of threads, means that // is not thread-safe. Returns true if all tasks are completed.
* all tasks are completed. // Note: don't delete the data member total_threads_ and use
* // threads_.size() instead, because you'd need to lock the mutex
* Note: this function is not thread-safe. // before accessing threads_.
* bool Done() { return tasks_.empty() && idle_threads_ == total_threads_; }
* @return true if all tasks are completed.
*/
bool Done() { return tasks_.empty() && available_ == num_threads_; }
void TaskLoop() {
while (running_) {
std::unique_lock<std::mutex> lock(mutex_);
scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; });
if (!running_) {
break;
}
// pop a task from the task queue
auto task = std::move(tasks_.front());
tasks_.pop();
--available_;
lock.unlock();
// run the task
task();
{
std::unique_lock<std::mutex> lock(mutex_);
++available_;
if (Done()) {
completed_.notify_all();
}
}
}
}
static void Init() { // The constructor starts threads to run TaskLoop, which retrieves
if (threadpool.get() == nullptr) { // and runs tasks from the queue.
// TODO(Yancey1989): specify the max threads number void TaskLoop();
int num_threads = std::thread::hardware_concurrency();
PADDLE_ENFORCE_GT(num_threads, 0); // Init is called by GetInstance.
threadpool.reset(new ThreadPool(num_threads)); static void Init();
}
}
private: private:
static std::unique_ptr<ThreadPool> threadpool; static std::unique_ptr<ThreadPool> threadpool_;
static std::once_flag init_flag; static std::once_flag init_flag_;
int num_threads_;
int available_;
bool running_;
std::queue<Task> tasks_;
std::vector<std::unique_ptr<std::thread>> threads_; std::vector<std::unique_ptr<std::thread>> threads_;
const size_t total_threads_;
size_t idle_threads_;
std::queue<Task> tasks_;
std::mutex mutex_; std::mutex mutex_;
bool running_;
std::condition_variable scheduled_; std::condition_variable scheduled_;
std::condition_variable completed_; std::condition_variable completed_;
}; };
......
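A small usage sketch of the reworked ThreadPool interface declared above (illustrative only, not part of the diff):
  auto* pool = paddle::framework::ThreadPool::GetInstance();
  std::future<void> f = pool->Run([] { /* do some work */ });
  f.wait();      // wait for this particular task
  pool->Wait();  // or block until the whole task queue drains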
...@@ -22,11 +22,7 @@ namespace framework = paddle::framework; ...@@ -22,11 +22,7 @@ namespace framework = paddle::framework;
void do_sum(framework::ThreadPool* pool, std::atomic<int>& sum, int cnt) { void do_sum(framework::ThreadPool* pool, std::atomic<int>& sum, int cnt) {
std::vector<std::future<void>> fs; std::vector<std::future<void>> fs;
for (int i = 0; i < cnt; ++i) { for (int i = 0; i < cnt; ++i) {
auto f = pool->Run([&sum]() { sum.fetch_add(1); }); fs.push_back(framework::Async([&sum]() { sum.fetch_add(1); }));
fs.push_back(std::move(f));
}
for (auto& f : fs) {
f.wait();
} }
} }
......
...@@ -35,7 +35,7 @@ using VariableNameMap = std::map<std::string, std::vector<std::string>>; ...@@ -35,7 +35,7 @@ using VariableNameMap = std::map<std::string, std::vector<std::string>>;
using Attribute = using Attribute =
boost::variant<boost::blank, int, float, std::string, std::vector<int>, boost::variant<boost::blank, int, float, std::string, std::vector<int>,
std::vector<float>, std::vector<std::string>, bool, std::vector<float>, std::vector<std::string>, bool,
std::vector<bool>, BlockDesc*>; std::vector<bool>, BlockDesc*, int64_t>;
using AttributeMap = std::unordered_map<std::string, Attribute>; using AttributeMap = std::unordered_map<std::string, Attribute>;
......
...@@ -66,6 +66,8 @@ class VarDesc { ...@@ -66,6 +66,8 @@ class VarDesc {
std::string Name() const { return desc_.name(); } std::string Name() const { return desc_.name(); }
void SetName(std::string name) { desc_.set_name(name); }
void SetShape(const std::vector<int64_t> &dims); void SetShape(const std::vector<int64_t> &dims);
void SetDataType(proto::DataType data_type); void SetDataType(proto::DataType data_type);
......
...@@ -12,19 +12,6 @@ ...@@ -12,19 +12,6 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
/*
Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include <memory> #include <memory>
#include <string> #include <string>
......
...@@ -69,7 +69,7 @@ bool PriorBoxLayer::init(const LayerMap& layerMap, ...@@ -69,7 +69,7 @@ bool PriorBoxLayer::init(const LayerMap& layerMap,
if (maxSize_.size() > 0) CHECK_EQ(minSize_.size(), maxSize_.size()); if (maxSize_.size() > 0) CHECK_EQ(minSize_.size(), maxSize_.size());
// flip aspect ratios // flip aspect ratios
for (int index = 0; index < tmp.size(); index++) { for (unsigned index = 0; index < tmp.size(); index++) {
real ar = tmp[index]; real ar = tmp[index];
if (fabs(ar - 1.) < 1e-6) continue; if (fabs(ar - 1.) < 1e-6) continue;
aspectRatio_.push_back(ar); aspectRatio_.push_back(ar);
......
...@@ -991,8 +991,10 @@ TEST(Layer, SequenceLastInstanceLayer) { ...@@ -991,8 +991,10 @@ TEST(Layer, SequenceLastInstanceLayer) {
"seqlastins", "seqlastins",
"non-seq", "non-seq",
-1); // hasSubseq seqlastins to non-seq -1); // hasSubseq seqlastins to non-seq
testDegradeLayer( testDegradeLayer(true,
true, "seqlastins", "seq", -1); // hasSubseq seqlastins to seq "seqlastins",
"seq",
-1); // hasSubseq seqlastins to seq
} }
TEST(Layer, AverageLayer) { TEST(Layer, AverageLayer) {
...@@ -1001,8 +1003,10 @@ TEST(Layer, AverageLayer) { ...@@ -1001,8 +1003,10 @@ TEST(Layer, AverageLayer) {
"average", "average",
"non-seq", "non-seq",
5); // seq average to a shorten seq, stride window = 5 5); // seq average to a shorten seq, stride window = 5
testDegradeLayer( testDegradeLayer(true,
true, "average", "non-seq", -1); // hasSubseq average to non-seq "average",
"non-seq",
-1); // hasSubseq average to non-seq
testDegradeLayer(true, "average", "seq", -1); // hasSubseq average to seq testDegradeLayer(true, "average", "seq", -1); // hasSubseq average to seq
} }
...@@ -1287,8 +1291,9 @@ TEST(Layer, PoolLayer) { ...@@ -1287,8 +1291,9 @@ TEST(Layer, PoolLayer) {
testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true); testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true); testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true); testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
testPoolLayer2( testPoolLayer2("cudnn-avg-incl-pad-pool",
"cudnn-avg-incl-pad-pool", /* trans= */ false, /* useGpu= */ true); /* trans= */ false,
/* useGpu= */ true);
testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ true); testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ true);
#endif #endif
} }
...@@ -2431,18 +2436,21 @@ TEST(Layer, test3DDeConvLayer) { ...@@ -2431,18 +2436,21 @@ TEST(Layer, test3DDeConvLayer) {
} }
TEST(Layer, ScaleShiftLayer) { TEST(Layer, ScaleShiftLayer) {
const size_t batchSize = 16; // FIXME: Disable ScaleShiftLayer because it is not stable.
const size_t size = 32; // https://github.com/PaddlePaddle/Paddle/issues/7781
TestConfig config; return;
config.layerConfig.set_type("scale_shift"); // const size_t batchSize = 16;
config.layerConfig.set_size(size); // const size_t size = 32;
config.biasSize = 1; // TestConfig config;
config.inputDefs.push_back( // config.layerConfig.set_type("scale_shift");
{INPUT_DATA, "input", /* dim= */ size, /* paraSize= */ 1}); // config.layerConfig.set_size(size);
config.layerConfig.add_inputs(); // config.biasSize = 1;
for (auto useGpu : {false, true}) { // config.inputDefs.push_back(
testLayerGrad(config, "scale_shift", batchSize, false, useGpu, false); // {INPUT_DATA, "input", /* dim= */ size, /* paraSize= */ 1});
} // config.layerConfig.add_inputs();
// for (auto useGpu : {false, true}) {
// testLayerGrad(config, "scale_shift", batchSize, false, useGpu, false);
// }
} }
TEST(Layer, ScaleSubRegionLayer) { TEST(Layer, ScaleSubRegionLayer) {
......
set(FLUID_CORE_MODULES proto_desc paddle_memory executor prune init) set(FLUID_CORE_MODULES proto_desc paddle_memory lod_tensor executor prune init)
cc_library(paddle_fluid_api cc_library(paddle_fluid_api
SRCS inference.cc SRCS io.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
# Merge all modules into a single static library # Merge all modules into a single static library
cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
# Create shared library # Create shared library
add_library(paddle_fluid_shared SHARED inference.cc) add_library(paddle_fluid_shared SHARED io.cc)
target_circle_link_libraries(paddle_fluid_shared target_circle_link_libraries(paddle_fluid_shared
ARCHIVE_START ARCHIVE_START
...@@ -20,23 +20,10 @@ SET_TARGET_PROPERTIES(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) ...@@ -20,23 +20,10 @@ SET_TARGET_PROPERTIES(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
# install library & headers # install library & headers
if(NOT WITH_C_API AND WITH_FLUID) if(NOT WITH_C_API AND WITH_FLUID)
install(FILES inference.h DESTINATION include/paddle/inference) install(FILES io.h DESTINATION include/paddle/inference)
install(TARGETS paddle_fluid_shared DESTINATION lib) install(TARGETS paddle_fluid_shared DESTINATION lib)
endif() endif()
add_executable(example example.cc) if(WITH_TESTING)
if(APPLE) add_subdirectory(tests/book)
set(OPTIONAL_LINK_FLAGS)
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
set(OPTIONAL_LINK_FLAGS "-undefined dynamic_lookup")
endif()
target_link_libraries(example
-Wl,-force_load paddle_fluid
${OPTIONAL_LINK_FLAGS}
${PTOOLS_LIB})
else()
target_link_libraries(example
-Wl,--start-group -Wl,--whole-archive paddle_fluid
-Wl,--no-whole-archive -Wl,--end-group
${PTOOLS_LIB})
endif() endif()
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <time.h>
#include <iostream>
#include "gflags/gflags.h"
#include "paddle/inference/inference.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
int main(int argc, char** argv) {
google::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_dirname.empty()) {
// Example:
// ./example --dirname=recognize_digits_mlp.inference.model
std::cout << "Usage: ./example --dirname=path/to/your/model" << std::endl;
exit(1);
}
std::cout << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
std::string dirname = FLAGS_dirname;
paddle::InferenceEngine* engine = new paddle::InferenceEngine();
engine->LoadInferenceModel(dirname);
paddle::framework::LoDTensor input;
srand(time(0));
float* input_ptr =
input.mutable_data<float>({1, 784}, paddle::platform::CPUPlace());
for (int i = 0; i < 784; ++i) {
input_ptr[i] = rand() / (static_cast<float>(RAND_MAX));
}
std::vector<paddle::framework::LoDTensor> feeds;
feeds.push_back(input);
std::vector<paddle::framework::LoDTensor> fetchs;
engine->Execute(feeds, fetchs);
for (size_t i = 0; i < fetchs.size(); ++i) {
auto dims_i = fetchs[i].dims();
std::cout << "dims_i:";
for (int j = 0; j < dims_i.size(); ++j) {
std::cout << " " << dims_i[j];
}
std::cout << std::endl;
std::cout << "result:";
float* output_ptr = fetchs[i].data<float>();
for (int j = 0; j < paddle::framework::product(dims_i); ++j) {
std::cout << " " << output_ptr[j];
}
std::cout << std::endl;
}
delete engine;
return 0;
}
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "inference.h"
#include <fstream>
#include "paddle/framework/executor.h"
#include "paddle/framework/feed_fetch_method.h"
#include "paddle/framework/init.h"
#include "paddle/framework/scope.h"
#ifdef PADDLE_USE_PTOOLS
#include "chooseser.h"
#endif
namespace paddle {
void InferenceEngine::LoadInferenceModel(const std::string& dirname) {
std::string model_filename = dirname + "/__model__.dat";
LOG(INFO) << "loading model from " << model_filename;
std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
std::string program_desc_str;
inputfs.seekg(0, std::ios::end);
program_desc_str.resize(inputfs.tellg());
inputfs.seekg(0, std::ios::beg);
LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
inputfs.read(&program_desc_str[0], program_desc_str.size());
inputfs.close();
program_ = new framework::ProgramDesc(program_desc_str);
GenerateLoadProgram(dirname);
framework::BlockDesc* global_block = program_->MutableBlock(0);
feed_var_names_.clear();
fetch_var_names_.clear();
for (auto* op : global_block->AllOps()) {
if (op->Type() == "feed") {
feed_var_names_.insert(feed_var_names_.begin(), op->Output("Out")[0]);
} else if (op->Type() == "fetch") {
fetch_var_names_.push_back(op->Input("X")[0]);
}
}
}
void InferenceEngine::LoadInferenceModel(
const std::string& dirname,
const std::vector<std::string>& feed_var_names,
const std::vector<std::string>& fetch_var_names) {
std::string model_filename = dirname + "/__model__.dat";
LOG(INFO) << "loading model from " << model_filename;
std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
std::string program_desc_str;
inputfs.seekg(0, std::ios::end);
program_desc_str.resize(inputfs.tellg());
inputfs.seekg(0, std::ios::beg);
LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
inputfs.read(&program_desc_str[0], program_desc_str.size());
inputfs.close();
program_ = new framework::ProgramDesc(program_desc_str);
GenerateLoadProgram(dirname);
if (feed_var_names.empty() || fetch_var_names.empty()) {
LOG(FATAL) << "Please specify the feed_var_names and fetch_var_names.";
}
feed_var_names_ = feed_var_names;
fetch_var_names_ = fetch_var_names;
PrependFeedOp();
AppendFetchOp();
}
bool InferenceEngine::IsParameter(const framework::VarDesc* var) {
if (var->Persistable() && var->Name() != "feed" && var->Name() != "fetch") {
// There are many unreachable variables in the program
for (size_t i = 0; i < program_->Size(); ++i) {
const framework::BlockDesc& block = program_->Block(i);
for (auto* op : block.AllOps()) {
for (auto input_argument_name : op->InputArgumentNames()) {
if (input_argument_name == var->Name()) {
return true;
}
}
}
}
}
return false;
}
void InferenceEngine::GenerateLoadProgram(const std::string& dirname) {
framework::BlockDesc* global_block = program_->MutableBlock(0);
load_program_ = new framework::ProgramDesc();
framework::BlockDesc* load_block = load_program_->MutableBlock(0);
for (auto* var : global_block->AllVars()) {
if (IsParameter(var)) {
LOG(INFO) << "parameter's name: " << var->Name();
framework::VarDesc* new_var = load_block->Var(var->Name());
new_var->SetShape(var->Shape());
new_var->SetDataType(var->GetDataType());
new_var->SetType(var->GetType());
new_var->SetLoDLevel(var->GetLoDLevel());
new_var->SetPersistable(true);
// append_op
framework::OpDesc* op = load_block->AppendOp();
op->SetType("load");
op->SetOutput("Out", {new_var->Name()});
op->SetAttr("file_path", {dirname + "/" + new_var->Name()});
op->CheckAttrs();
}
}
}
void InferenceEngine::PrependFeedOp() {
if (!program_) {
LOG(FATAL) << "Please initialize the program_ first.";
}
framework::BlockDesc* global_block = program_->MutableBlock(0);
// create_var
framework::VarDesc* feed_var = global_block->Var("feed");
feed_var->SetType(framework::proto::VarDesc::FEED_MINIBATCH);
feed_var->SetPersistable(true);
// prepend feed_op
for (size_t i = 0; i < feed_var_names_.size(); ++i) {
std::string var_name = feed_var_names_[i];
LOG(INFO) << "feed var's name: " << var_name;
// prepend_op
framework::OpDesc* op = global_block->PrependOp();
op->SetType("feed");
op->SetInput("X", {"feed"});
op->SetOutput("Out", {var_name});
op->SetAttr("col", {static_cast<int>(i)});
op->CheckAttrs();
}
}
void InferenceEngine::AppendFetchOp() {
if (!program_) {
LOG(FATAL) << "Please initialize the program_ first.";
}
framework::BlockDesc* global_block = program_->MutableBlock(0);
// create_var
framework::VarDesc* fetch_var = global_block->Var("fetch");
fetch_var->SetType(framework::proto::VarDesc::FETCH_LIST);
fetch_var->SetPersistable(true);
// append fetch_op
for (size_t i = 0; i < fetch_var_names_.size(); ++i) {
std::string var_name = fetch_var_names_[i];
LOG(INFO) << "fetch var's name: " << var_name;
// append_op
framework::OpDesc* op = global_block->AppendOp();
op->SetType("fetch");
op->SetInput("X", {var_name});
op->SetOutput("Out", {"fetch"});
op->SetAttr("col", {static_cast<int>(i)});
op->CheckAttrs();
}
}
void InferenceEngine::Execute(const std::vector<framework::LoDTensor>& feeds,
std::vector<framework::LoDTensor>& fetchs) {
if (!program_ || !load_program_) {
LOG(FATAL) << "Please initialize the program_ and load_program_ first.";
}
if (feeds.size() < feed_var_names_.size()) {
LOG(FATAL) << "Please feed " << feed_var_names_.size() << " input Tensors.";
}
auto* place = new platform::CPUPlace();
framework::InitDevices();
framework::Executor* executor = new framework::Executor(*place);
framework::Scope* scope = new framework::Scope();
executor->Run(*load_program_, scope, 0, true, true);
// set_feed_variable
for (size_t i = 0; i < feed_var_names_.size(); ++i) {
framework::SetFeedVariable(scope, feeds[i], "feed", i);
}
executor->Run(*program_, scope, 0, true, true);
// get_fetch_variable
fetchs.resize(fetch_var_names_.size());
for (size_t i = 0; i < fetch_var_names_.size(); ++i) {
fetchs[i] = framework::GetFetchVariable(*scope, "fetch", i);
}
delete place;
delete scope;
delete executor;
}
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/inference/io.h"
#include <fstream>
#include "paddle/framework/block_desc.h"
#include "paddle/framework/feed_fetch_type.h"
namespace paddle {
namespace inference {
bool IsParameter(const framework::VarDesc* var,
const framework::ProgramDesc& main_program) {
if (var->Persistable()) {
// There are many unreachable variables in the program
for (size_t i = 0; i < main_program.Size(); ++i) {
const framework::BlockDesc& block = main_program.Block(i);
for (auto* op : block.AllOps()) {
if (op->Type() == framework::kFeedOpType) {
continue;
}
for (auto input_argument_name : op->InputArgumentNames()) {
if (input_argument_name == var->Name()) {
return true;
}
}
}
}
}
return false;
}
void LoadPersistables(framework::Executor& executor,
framework::Scope& scope,
const std::string& dirname,
const framework::ProgramDesc& main_program) {
const framework::BlockDesc& global_block = main_program.Block(0);
framework::ProgramDesc* load_program = new framework::ProgramDesc();
framework::BlockDesc* load_block = load_program->MutableBlock(0);
for (auto* var : global_block.AllVars()) {
if (IsParameter(var, main_program)) {
VLOG(3) << "parameter's name: " << var->Name();
framework::VarDesc* new_var = load_block->Var(var->Name());
new_var->SetShape(var->Shape());
new_var->SetDataType(var->GetDataType());
new_var->SetType(var->GetType());
new_var->SetLoDLevel(var->GetLoDLevel());
new_var->SetPersistable(true);
// append_op
framework::OpDesc* op = load_block->AppendOp();
op->SetType("load");
op->SetOutput("Out", {new_var->Name()});
op->SetAttr("file_path", {dirname + "/" + new_var->Name()});
op->CheckAttrs();
}
}
executor.Run(*load_program, &scope, 0, true, true);
delete load_program;
}
std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
framework::Scope& scope,
const std::string& dirname) {
std::string model_filename = dirname + "/__model__";
LOG(INFO) << "loading model from " << model_filename;
std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
std::string program_desc_str;
inputfs.seekg(0, std::ios::end);
program_desc_str.resize(inputfs.tellg());
inputfs.seekg(0, std::ios::beg);
LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
inputfs.read(&program_desc_str[0], program_desc_str.size());
inputfs.close();
std::unique_ptr<framework::ProgramDesc> main_program(
new framework::ProgramDesc(program_desc_str));
LoadPersistables(executor, scope, dirname, *main_program);
return main_program;
}
} // namespace inference
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "paddle/framework/executor.h"
#include "paddle/framework/program_desc.h"
#include "paddle/framework/scope.h"
namespace paddle {
namespace inference {
void LoadPersistables(framework::Executor& executor,
framework::Scope& scope,
const std::string& dirname,
const framework::ProgramDesc& main_program);
std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
framework::Scope& scope,
const std::string& dirname);
} // namespace inference
} // namespace paddle
set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests)
cc_test(test_inference_recognize_digits_mlp
SRCS test_inference_recognize_digits.cc
DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
ARGS --dirname=${PYTHON_TESTS_DIR}/book/recognize_digits_mlp.inference.model)
set_tests_properties(test_inference_recognize_digits_mlp
PROPERTIES DEPENDS test_recognize_digits_mlp_cpu)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <time.h>
#include <sstream>
#include "gflags/gflags.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/inference/io.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
template <typename Place, typename T>
void TestInference(const std::string& dirname,
const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
// 1. Define place, executor and scope
auto place = Place();
auto executor = paddle::framework::Executor(place);
auto* scope = new paddle::framework::Scope();
// 2. Initialize the inference_program and load all parameters from file
auto inference_program = paddle::inference::Load(executor, *scope, dirname);
// 3. Get the feed_target_names and fetch_target_names
const std::vector<std::string>& feed_target_names =
inference_program->GetFeedTargetNames();
const std::vector<std::string>& fetch_target_names =
inference_program->GetFetchTargetNames();
// 4. Prepare inputs: set up maps for feed targets
std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
for (size_t i = 0; i < feed_target_names.size(); ++i) {
// Please make sure that cpu_feeds[i] is right for feed_target_names[i]
feed_targets[feed_target_names[i]] = cpu_feeds[i];
}
// 5. Define Tensor to get the outputs: set up maps for fetch targets
std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
for (size_t i = 0; i < fetch_target_names.size(); ++i) {
fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
}
// 6. Run the inference program
executor.Run(*inference_program, scope, feed_targets, fetch_targets);
delete scope;
}
TEST(inference, recognize_digits) {
if (FLAGS_dirname.empty()) {
LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
}
LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
std::string dirname = FLAGS_dirname;
// 0. Call `paddle::framework::InitDevices()` to initialize all the devices
// In unittests, this is done in paddle/testing/paddle_gtest_main.cc
paddle::framework::LoDTensor input;
srand(time(0));
float* input_ptr =
input.mutable_data<float>({1, 28, 28}, paddle::platform::CPUPlace());
for (int i = 0; i < 784; ++i) {
input_ptr[i] = rand() / (static_cast<float>(RAND_MAX));
}
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input);
paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
TestInference<paddle::platform::CPUPlace, float>(
dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace, float>(
dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.dims();
EXPECT_EQ(output1.dims(), output2.dims());
EXPECT_EQ(output1.numel(), output2.numel());
float err = 1E-3;
int count = 0;
for (int64_t i = 0; i < output1.numel(); ++i) {
if (fabs(output1.data<float>()[i] - output2.data<float>()[i]) > err) {
count++;
}
}
EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
#endif
}
add_subdirectory(detail) add_subdirectory(detail)
cc_library(memory SRCS memory.cc DEPS place enforce) cc_library(memory SRCS memory.cc DEPS place enforce)
cc_library(memcpy SRCS memcpy.cc) cc_library(memcpy SRCS memcpy.cc DEPS place)
cc_library(paddle_memory cc_library(paddle_memory
DEPS DEPS
......
...@@ -147,6 +147,7 @@ op_library(max_sequence_len_op DEPS lod_rank_table) ...@@ -147,6 +147,7 @@ op_library(max_sequence_len_op DEPS lod_rank_table)
op_library(sequence_conv_op DEPS context_project) op_library(sequence_conv_op DEPS context_project)
op_library(sequence_pool_op DEPS sequence_pooling) op_library(sequence_pool_op DEPS sequence_pooling)
op_library(lstm_op DEPS sequence2batch lstm_compute) op_library(lstm_op DEPS sequence2batch lstm_compute)
op_library(lstmp_op DEPS sequence2batch lstm_compute)
op_library(gru_op DEPS sequence2batch gru_compute) op_library(gru_op DEPS sequence2batch gru_compute)
op_library(recurrent_op DEPS executor) op_library(recurrent_op DEPS executor)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale math_function) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale math_function)
...@@ -172,6 +173,8 @@ endif() ...@@ -172,6 +173,8 @@ endif()
# FIXME(typhoonzero): save/load depends lodtensor serialization functions # FIXME(typhoonzero): save/load depends lodtensor serialization functions
op_library(save_op DEPS lod_tensor) op_library(save_op DEPS lod_tensor)
op_library(load_op DEPS lod_tensor) op_library(load_op DEPS lod_tensor)
op_library(save_combine_op DEPS lod_tensor)
op_library(load_combine_op DEPS lod_tensor)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS}) foreach(src ${GENERAL_OPS})
...@@ -191,3 +194,4 @@ if(WITH_GPU) ...@@ -191,3 +194,4 @@ if(WITH_GPU)
cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif() endif()
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
...@@ -323,7 +323,7 @@ template <typename T> ...@@ -323,7 +323,7 @@ template <typename T>
struct FloorFunctor : public BaseActivationFunctor<T> { struct FloorFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out> template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const { void operator()(Device d, X x, Out out) const {
out.device(d) = x.ceil(); out.device(d) = x.floor();
} }
}; };
......
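For reference, the one-line fix above changes FloorFunctor from rounding up to rounding down. A trivial standalone check of the two standard functions being swapped (plain C++, independent of the Eigen expression used by the functor):
#include <cmath>
#include <cstdio>
int main() {
  const float x = 2.7f;
  // FloorFunctor is expected to map 2.7 -> 2.0; the old code returned 3.0.
  std::printf("floor(%.1f) = %.1f, ceil(%.1f) = %.1f\n",
              x, std::floor(x), x, std::ceil(x));
  return 0;
}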
...@@ -82,7 +82,7 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> { ...@@ -82,7 +82,7 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
math::scatter::MergeAdd<platform::CUDADeviceContext, T> merge_func; math::scatter::MergeAdd<platform::CUDADeviceContext, T> merge_func;
auto grad_merge = merge_func(context, grad); auto grad_merge = merge_func(context, grad);
auto* grad_merge_data = grad_merge.mutable_value()->template data<T>(); auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
auto& merge_rows = grad_merge.rows(); framework::Vector<int64_t> merge_rows(grad_merge.rows());
// 2. m += g_m * g_m // 2. m += g_m * g_m
math::scatter::Mul<platform::CUDADeviceContext, T> sqare_func; math::scatter::Mul<platform::CUDADeviceContext, T> sqare_func;
auto grad_square = sqare_func(context, grad_merge, grad_merge); auto grad_square = sqare_func(context, grad_merge, grad_merge);
...@@ -101,8 +101,8 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> { ...@@ -101,8 +101,8 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
SparseAdagradFunctorKernel< SparseAdagradFunctorKernel<
T, 256><<<grid2, threads, 0, T, 256><<<grid2, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context) reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(grad_merge_data, grad_merge.rows().data(), .stream()>>>(grad_merge_data, merge_rows.cuda_data(), lr,
lr, param_data, moment_data, grad_width, param_data, moment_data, grad_width,
epsilon); epsilon);
} }
}; };
......
...@@ -199,7 +199,12 @@ class AdamOpKernel : public framework::OpKernel<T> { ...@@ -199,7 +199,12 @@ class AdamOpKernel : public framework::OpKernel<T> {
merge_func(ctx.template device_context<DeviceContext>(), grad); merge_func(ctx.template device_context<DeviceContext>(), grad);
auto& grad_tensor = grad_merge.value(); auto& grad_tensor = grad_merge.value();
const T* grad_data = grad_tensor.template data<T>(); const T* grad_data = grad_tensor.template data<T>();
auto* rows = grad_merge.rows().data(); int64_t* rows = nullptr;
if (platform::is_gpu_place(ctx.GetPlace())) {
rows = grad_merge.mutable_rows()->cuda_data();
} else {
rows = grad_merge.mutable_rows()->data();
}
auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
SparseAdamFunctor<T> functor( SparseAdamFunctor<T> functor(
......
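The adagrad and adam hunks above both change how the merged row indices are handed to the kernel: instead of the raw rows().data() pointer, the code asks a framework::Vector for data() on CPU and cuda_data() on GPU, so the pointer is valid on the place the kernel runs on. A minimal sketch of that selection, assuming only the mutable_rows()/data()/cuda_data() interface used in the hunks (the helper name RowsPointer and the header paths are illustrative):
#include "paddle/framework/selected_rows.h"
#include "paddle/platform/place.h"
namespace paddle {
namespace operators {
// Illustrative helper: return a pointer to the merged rows that is valid on
// the place the kernel will run on.
inline const int64_t* RowsPointer(framework::SelectedRows* grad_merge,
                                  const platform::Place& place) {
  if (platform::is_gpu_place(place)) {
    // Device pointer; the Vector syncs its host data to the device if needed.
    return grad_merge->mutable_rows()->cuda_data();
  }
  // Host pointer for CPU kernels.
  return grad_merge->mutable_rows()->data();
}
}  // namespace operators
}  // namespace paddle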
...@@ -24,8 +24,18 @@ namespace operators { ...@@ -24,8 +24,18 @@ namespace operators {
void BeamSearch::operator()(const framework::LoDTensor &pre_ids, void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
framework::LoDTensor *selected_ids, framework::LoDTensor *selected_ids,
framework::LoDTensor *selected_scores) { framework::LoDTensor *selected_scores) {
auto abs_lod = framework::ToAbsOffset(ids_->lod());
auto &high_level = abs_lod[lod_level_];
auto items = SelectTopBeamSizeItems(); auto items = SelectTopBeamSizeItems();
auto selected_items = ToMap(items); auto selected_items = ToMap(items, high_level.back());
VLOG(3) << "selected_items:";
for (size_t i = 0; i < selected_items.size(); ++i) {
VLOG(3) << "offset:" << i;
for (auto &item : selected_items[i]) {
VLOG(3) << ItemToString(item);
}
}
PruneEndidCandidates(pre_ids, &selected_items); PruneEndidCandidates(pre_ids, &selected_items);
// calculate the output tensor's height // calculate the output tensor's height
size_t num_instances = std::accumulate( size_t num_instances = std::accumulate(
...@@ -63,11 +73,12 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids, ...@@ -63,11 +73,12 @@ void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
low_level.push_back(low_offset); low_level.push_back(low_offset);
// fill lod // fill lod
auto abs_lod = framework::ToAbsOffset(ids_->lod());
auto &high_level = abs_lod[lod_level_];
framework::LoD lod(2); framework::LoD lod(2);
lod[0].assign(high_level.begin(), high_level.end()); lod[0].assign(high_level.begin(), high_level.end());
lod[1].assign(low_level.begin(), low_level.end()); lod[1].assign(low_level.begin(), low_level.end());
if (!framework::CheckLoD(lod)) {
PADDLE_THROW("lod %s is not right", framework::LoDToString(lod));
}
selected_ids->set_lod(lod); selected_ids->set_lod(lod);
selected_scores->set_lod(lod); selected_scores->set_lod(lod);
} }
...@@ -90,13 +101,11 @@ int BeamSearch::PruneEndidCandidates(const framework::LoDTensor &pre_ids, ...@@ -90,13 +101,11 @@ int BeamSearch::PruneEndidCandidates(const framework::LoDTensor &pre_ids,
} }
std::vector<std::vector<BeamSearch::Item>> BeamSearch::ToMap( std::vector<std::vector<BeamSearch::Item>> BeamSearch::ToMap(
const std::vector<std::vector<Item>> &items) { const std::vector<std::vector<Item>> &items, size_t element_num) {
std::vector<std::vector<Item>> result; std::vector<std::vector<Item>> result;
result.resize(element_num);
for (auto &entries : items) { for (auto &entries : items) {
for (const auto &item : entries) { for (const auto &item : entries) {
if (item.offset >= result.size()) {
result.resize(item.offset + 1);
}
result[item.offset].push_back(item); result[item.offset].push_back(item);
} }
} }
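The ToMap change above pre-sizes the result to element_num (high_level.back(), i.e. the last absolute offset at the beam's LoD level) instead of growing it on demand, presumably so that offsets with no surviving candidates still keep an empty slot. A standalone sketch of that bucketing, with Item reduced to an offset/id pair purely for illustration:
#include <cstdio>
#include <vector>
struct Item {
  size_t offset;  // which source prefix this candidate extends
  int id;         // candidate token id
};
std::vector<std::vector<Item>> ToBuckets(
    const std::vector<std::vector<Item>>& items, size_t element_num) {
  // Pre-size so that every offset keeps a slot, even if it has no candidates.
  std::vector<std::vector<Item>> result(element_num);
  for (const auto& entries : items) {
    for (const auto& item : entries) {
      result[item.offset].push_back(item);
    }
  }
  return result;
}
int main() {
  std::vector<std::vector<Item>> items = {{{0, 3}, {2, 7}}, {{2, 5}}};
  auto buckets = ToBuckets(items, 4);  // slots 1 and 3 stay empty
  for (size_t i = 0; i < buckets.size(); ++i) {
    std::printf("offset %zu: %zu item(s)\n", i, buckets[i].size());
  }
  return 0;
}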
...@@ -122,6 +131,14 @@ BeamSearch::SelectTopBeamSizeItems() { ...@@ -122,6 +131,14 @@ BeamSearch::SelectTopBeamSizeItems() {
} }
result.emplace_back(items); result.emplace_back(items);
} }
VLOG(3) << "SelectTopBeamSizeItems result size " << result.size();
for (auto &items : result) {
VLOG(3) << "item set:";
for (auto &item : items) {
VLOG(3) << ItemToString(item);
}
}
return result; return result;
} }
...@@ -159,6 +176,22 @@ bool BeamSearch::NextItemSet(std::vector<BeamSearch::Item> *items) { ...@@ -159,6 +176,22 @@ bool BeamSearch::NextItemSet(std::vector<BeamSearch::Item> *items) {
return true; return true;
} }
std::ostream &operator<<(std::ostream &os, const BeamSearch::Item &item) {
os << "{";
os << "offset: " << item.offset << ", ";
os << "id: " << item.id << ", ";
os << "score: " << item.score << "";
os << "}";
return os;
}
std::string ItemToString(const BeamSearch::Item &item) {
std::ostringstream stream;
stream << item;
return stream.str();
}
class BeamSearchProtoAndCheckerMaker class BeamSearchProtoAndCheckerMaker
: public framework::OpProtoAndCheckerMaker { : public framework::OpProtoAndCheckerMaker {
public: public:
...@@ -186,8 +219,40 @@ class BeamSearchProtoAndCheckerMaker ...@@ -186,8 +219,40 @@ class BeamSearchProtoAndCheckerMaker
} }
}; };
class BeamSearchInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
for (const std::string &arg :
std::vector<std::string>({"pre_ids", "ids", "scores"})) {
PADDLE_ENFORCE(context->HasInput(arg),
"BeamSearch need input argument '%s'", arg);
}
for (const std::string &arg :
std::vector<std::string>({"selected_ids", "selected_scores"})) {
PADDLE_ENFORCE(context->HasOutput(arg),
"BeamSearch need output argument '%s'", arg);
}
}
};
class BeamSearchInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
for (auto &o : op_desc.Output("selected_ids")) {
block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR);
}
for (auto &o : op_desc.Output("selected_scores")) {
block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR);
}
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_OP_WITHOUT_GRADIENT(beam_search, paddle::operators::BeamSearchOp, REGISTER_OPERATOR(beam_search, paddle::operators::BeamSearchOp,
paddle::operators::BeamSearchProtoAndCheckerMaker); paddle::operators::BeamSearchProtoAndCheckerMaker,
paddle::operators::BeamSearchInferShape,
paddle::operators::BeamSearchInferVarType,
paddle::framework::EmptyGradOpMaker);
...@@ -136,8 +136,6 @@ class BeamSearch { ...@@ -136,8 +136,6 @@ class BeamSearch {
void operator()(const framework::LoDTensor& pre_ids, void operator()(const framework::LoDTensor& pre_ids,
framework::LoDTensor* selected_ids, framework::LoDTensor* selected_ids,
framework::LoDTensor* selected_scores); framework::LoDTensor* selected_scores);
protected:
/* /*
* The basic items help to sort. * The basic items help to sort.
*/ */
...@@ -155,6 +153,7 @@ class BeamSearch { ...@@ -155,6 +153,7 @@ class BeamSearch {
score_t score; score_t score;
}; };
protected:
/* /*
* Delete all the records that follows the end token. * Delete all the records that follows the end token.
*/ */
...@@ -166,7 +165,7 @@ class BeamSearch { ...@@ -166,7 +165,7 @@ class BeamSearch {
* NOTE low performance * NOTE low performance
*/ */
std::vector<std::vector<Item>> ToMap( std::vector<std::vector<Item>> ToMap(
const std::vector<std::vector<Item>>& inputs); const std::vector<std::vector<Item>>& inputs, size_t element_num);
/* /*
* For each source, select top beam_size records. * For each source, select top beam_size records.
...@@ -187,6 +186,10 @@ class BeamSearch { ...@@ -187,6 +186,10 @@ class BeamSearch {
int end_id_{0}; int end_id_{0};
}; };
std::ostream& operator<<(std::ostream& os, const BeamSearch::Item& item);
std::string ItemToString(const BeamSearch::Item& item);
class BeamSearchOp : public framework::OperatorBase { class BeamSearchOp : public framework::OperatorBase {
public: public:
BeamSearchOp(const std::string& type, BeamSearchOp(const std::string& type,
...@@ -203,7 +206,6 @@ class BeamSearchOp : public framework::OperatorBase { ...@@ -203,7 +206,6 @@ class BeamSearchOp : public framework::OperatorBase {
void Run(const framework::Scope& scope, void Run(const framework::Scope& scope,
const platform::Place& dev_place) const override { const platform::Place& dev_place) const override {
LOG(INFO) << "run beam search op";
auto ids_var = scope.FindVar(Input("ids")); auto ids_var = scope.FindVar(Input("ids"));
auto scores_var = scope.FindVar(Input("scores")); auto scores_var = scope.FindVar(Input("scores"));
auto pre_ids_var = scope.FindVar(Input("pre_ids")); auto pre_ids_var = scope.FindVar(Input("pre_ids"));
...@@ -217,10 +219,8 @@ class BeamSearchOp : public framework::OperatorBase { ...@@ -217,10 +219,8 @@ class BeamSearchOp : public framework::OperatorBase {
size_t level = Attr<int>("level"); size_t level = Attr<int>("level");
size_t beam_size = Attr<int>("beam_size"); size_t beam_size = Attr<int>("beam_size");
int end_id = Attr<int>("end_id"); int end_id = Attr<int>("end_id");
LOG(INFO) << "init beam search";
BeamSearch alg(ids, scores, level, beam_size, end_id); BeamSearch alg(ids, scores, level, beam_size, end_id);
LOG(INFO) << "after beam search";
auto selected_ids_var = scope.FindVar(Output("selected_ids")); auto selected_ids_var = scope.FindVar(Output("selected_ids"));
auto selected_scores_var = scope.FindVar(Output("selected_scores")); auto selected_scores_var = scope.FindVar(Output("selected_scores"));
PADDLE_ENFORCE_NOT_NULL(selected_ids_var); PADDLE_ENFORCE_NOT_NULL(selected_ids_var);
...@@ -229,9 +229,7 @@ class BeamSearchOp : public framework::OperatorBase { ...@@ -229,9 +229,7 @@ class BeamSearchOp : public framework::OperatorBase {
*selected_ids_var->GetMutable<framework::LoDTensor>(); *selected_ids_var->GetMutable<framework::LoDTensor>();
auto& selected_scores_tensor = auto& selected_scores_tensor =
*selected_scores_var->GetMutable<framework::LoDTensor>(); *selected_scores_var->GetMutable<framework::LoDTensor>();
LOG(INFO) << "run beam search";
alg(pre_ids, &selected_ids_tensor, &selected_scores_tensor); alg(pre_ids, &selected_ids_tensor, &selected_scores_tensor);
LOG(INFO) << "finish beam search";
} }
}; };
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/op_registry.h"
#include "paddle/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
class BipartiteMatchOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("DistMat"),
"Input(DistMat) of BipartiteMatch should not be null.");
auto dims = ctx->GetInputDim("DistMat");
PADDLE_ENFORCE_EQ(dims.size(), 2, "The rank of Input(DistMat) must be 2.");
ctx->SetOutputDim("ColToRowMatchIndices", dims);
ctx->SetOutputDim("ColToRowMatchDis", dims);
}
};
template <typename T>
class BipartiteMatchKernel : public framework::OpKernel<T> {
public:
// The match_indices must be initialized to -1 before calling.
// The match_dist must be initialized to 0 before calling.
void BipartiteMatch(const Tensor& dist, int* match_indices,
T* match_dist) const {
constexpr T kEPS = static_cast<T>(1e-6);
PADDLE_ENFORCE_EQ(dist.dims().size(), 2, "The rank of dist must be 2.");
int64_t row = dist.dims()[0];
int64_t col = dist.dims()[1];
auto* dist_data = dist.data<T>();
std::vector<int> row_pool;
for (int i = 0; i < row; ++i) {
row_pool.push_back(i);
}
while (row_pool.size() > 0) {
int max_idx = -1;
int max_row_idx = -1;
T max_dist = -1;
for (int64_t j = 0; j < col; ++j) {
if (match_indices[j] != -1) {
continue;
}
for (size_t k = 0; k < row_pool.size(); ++k) {
int m = row_pool[k];
// the distance between the m-th row and the j-th column is (near) zero; skip it
if (dist_data[m * col + j] < kEPS) {
continue;
}
if (dist_data[m * col + j] > max_dist) {
max_idx = j;
max_row_idx = m;
max_dist = dist_data[m * col + j];
}
}
}
if (max_idx == -1) {
// Cannot find a good match.
break;
} else {
PADDLE_ENFORCE_EQ(match_indices[max_idx], -1);
match_indices[max_idx] = max_row_idx;
match_dist[max_idx] = max_dist;
// Erase the row index.
row_pool.erase(
std::find(row_pool.begin(), row_pool.end(), max_row_idx));
}
}
}
void Compute(const framework::ExecutionContext& context) const override {
auto* dist_mat = context.Input<LoDTensor>("DistMat");
auto* match_indices = context.Output<Tensor>("ColToRowMatchIndices");
auto* match_dist = context.Output<Tensor>("ColToRowMatchDis");
auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
auto col = dist_mat->dims()[1];
int64_t n = dist_mat->lod().size() == 0UL
? 1
: static_cast<int64_t>(dist_mat->lod().back().size() - 1);
if (dist_mat->lod().size()) {
PADDLE_ENFORCE_EQ(dist_mat->lod().size(), 1UL,
"Only support 1 level of LoD.");
}
match_indices->mutable_data<int>({n, col}, context.GetPlace());
match_dist->mutable_data<T>({n, col}, context.GetPlace());
math::SetConstant<platform::CPUDeviceContext, int> iset;
iset(dev_ctx, match_indices, static_cast<int>(-1));
math::SetConstant<platform::CPUDeviceContext, T> tset;
tset(dev_ctx, match_dist, static_cast<T>(0));
int* indices = match_indices->data<int>();
T* dist = match_dist->data<T>();
if (n == 1) {
BipartiteMatch(*dist_mat, indices, dist);
} else {
auto lod = dist_mat->lod().back();
for (size_t i = 0; i < lod.size() - 1; ++i) {
Tensor one_ins = dist_mat->Slice(lod[i], lod[i + 1]);
BipartiteMatch(one_ins, indices + i * col, dist + i * col);
}
}
}
};
class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker {
public:
BipartiteMatchOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(
"DistMat",
"(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
"[K, M]. It is pair-wise distance matrix between the entities "
"represented by each row and each column. For example, assumed one "
"entity is A with shape [K], another entity is B with shape [M]. The "
"DistMat[i][j] is the distance between A[i] and B[j]. The bigger "
"the distance is, the better macthing the pairs are. Please note, "
"This tensor can contain LoD information to represent a batch of "
"inputs. One instance of this batch can contain different numbers of "
"entities.");
AddOutput("ColToRowMatchIndices",
"(Tensor) A 2-D Tensor with shape [N, M] in int type. "
"N is the batch size. If ColToRowMatchIndices[i][j] is -1, it "
"means B[j] does not match any entity in i-th instance. "
"Otherwise, it means B[j] is matched to row "
"ColToRowMatchIndices[i][j] in i-th instance. The row number of "
"i-th instance is saved in ColToRowMatchIndices[i][j].");
AddOutput("ColToRowMatchDis",
"(Tensor) A 2-D Tensor with shape [N, M] in float type. "
"N is batch size. If ColToRowMatchIndices[i][j] is -1, "
"ColToRowMatchDis[i][j] is also -1.0. Otherwise, assumed "
"ColToRowMatchIndices[i][j] = d, and the row offsets of each "
"instance are called LoD. Then "
"ColToRowMatchDis[i][j] = DistMat[d+LoD[i]][j]");
AddComment(R"DOC(
This operator implements a greedy bipartite matching algorithm, which is used
to obtain the matching with the maximum distance based on the input
distance matrix. For an input 2-D matrix, the bipartite matching algorithm can
find the matched column for each row and the matched row for each column;
this operator only calculates the matched indices from column to row. For each
instance, the number of matched indices equals the number of columns of the
input distance matrix.
There are two outputs, saving the matched indices and distances respectively.
In short, the algorithm greedily matches the best (maximum-distance) row
entity to each column entity, and the matched indices are not duplicated
within a row of ColToRowMatchIndices. If a column entity is not matched to
any row entity, -1 is set in ColToRowMatchIndices.
Please note that the input DistMat can be a LoDTensor (with LoD) or a Tensor.
If it is a LoDTensor with LoD, the height of ColToRowMatchIndices is the batch
size. If it is a Tensor, the height of ColToRowMatchIndices is 1.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(bipartite_match, ops::BipartiteMatchOp,
ops::BipartiteMatchOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(bipartite_match, ops::BipartiteMatchKernel<float>,
ops::BipartiteMatchKernel<double>);
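The DOC above describes the matching rule in prose. Below is a minimal standalone sketch of the same greedy scheme on a plain float matrix (not the operator's LoDTensor interface; all names are illustrative), worked on a tiny 2x3 distance matrix:
#include <cstdio>
#include <vector>
// Greedy matching sketch: repeatedly take the globally largest remaining
// distance between an unmatched row and an unmatched column.
void GreedyBipartiteMatch(const std::vector<std::vector<float>>& dist,
                          std::vector<int>* col_to_row,
                          std::vector<float>* col_dist) {
  const int rows = static_cast<int>(dist.size());
  const int cols = static_cast<int>(dist[0].size());
  col_to_row->assign(cols, -1);
  col_dist->assign(cols, 0.f);
  std::vector<bool> row_used(rows, false);
  while (true) {
    int best_r = -1, best_c = -1;
    float best = 1e-6f;  // ignore (near-)zero distances, like the kernel's kEPS
    for (int r = 0; r < rows; ++r) {
      if (row_used[r]) continue;
      for (int c = 0; c < cols; ++c) {
        if ((*col_to_row)[c] != -1) continue;
        if (dist[r][c] > best) {
          best = dist[r][c];
          best_r = r;
          best_c = c;
        }
      }
    }
    if (best_r == -1) break;  // no positive-distance pair left
    (*col_to_row)[best_c] = best_r;
    (*col_dist)[best_c] = best;
    row_used[best_r] = true;
  }
}
int main() {
  std::vector<std::vector<float>> dist = {{0.1f, 0.9f, 0.3f},
                                          {0.8f, 0.2f, 0.4f}};
  std::vector<int> idx;
  std::vector<float> d;
  GreedyBipartiteMatch(dist, &idx, &d);
  // Expected: col 0 -> row 1 (0.8), col 1 -> row 0 (0.9), col 2 unmatched (-1).
  for (int c = 0; c < 3; ++c) {
    std::printf("col %d -> row %d, dist %.1f\n", c, idx[c], d[c]);
  }
  return 0;
}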
...@@ -69,12 +69,11 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel<T> { ...@@ -69,12 +69,11 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
auto stream = ctx.cuda_device_context().stream(); auto stream = ctx.cuda_device_context().stream();
MergeAndDelCudaKernel<T><<<1, 1, 0, stream>>>( MergeAndDelCudaKernel<T><<<1, 1, 0, stream>>>(
num_tokens, tokens, num_seq, input_lod[level].data(), blank, num_tokens, tokens, num_seq, input_lod[level].cuda_data(), blank,
merge_repeated, dev_out_lod0_ptr, output_data); merge_repeated, dev_out_lod0_ptr, output_data);
// set output lod // set output lod
thrust::host_vector<size_t> host_out_lod0(dev_out_lod0.begin(), std::vector<size_t> host_out_lod0(dev_out_lod0.begin(), dev_out_lod0.end());
dev_out_lod0.end());
framework::LoD out_lod; framework::LoD out_lod;
out_lod.push_back(host_out_lod0); out_lod.push_back(host_out_lod0);
output->set_lod(out_lod); output->set_lod(out_lod);
......
(The remaining file diffs in this commit are collapsed and not shown.)