diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7c7eb260aea8478f4833cb79253f4481e10b8685..e8ea828dd2a25f5f47b03e92ae86e083d4425dc9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -39,7 +39,7 @@ option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_F
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
-option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON)
+option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check" ON)
option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake
index 137f11da7f2f1c46eebf6590d93402786ef543c9..c70d83b3f4bb24740ed67b4e2f98a3ced26d1648 100644
--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -15,9 +15,9 @@
include(ExternalProject)
set(BOOST_PROJECT "extern_boost")
-set(BOOST_VER "1.66.0")
-set(BOOST_TAR "boost_1_66_0")
-set(BOOST_URL "https://dl.bintray.com/boostorg/release/${BOOST_VER}/source/${BOOST_TAR}.tar.gz")
+set(BOOST_VER "1.41.0")
+set(BOOST_TAR "boost_1_41_0")
+set(BOOST_URL "http://sourceforge.net/projects/boost/files/boost/${BOOST_VER}/${BOOST_TAR}.tar.gz")
set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 585db019d521b1699baadfae31ef95b5059c71b4..33ef6860e1d38f4e87c4431addf43f9f8a655fc2 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -186,6 +186,11 @@ function(cc_library TARGET_NAME)
add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
endif()
if (cc_library_DEPS)
+ # No need to link libwarpctc.so; only ensure it is built first.
+ if ("${cc_library_DEPS};" MATCHES "warpctc;")
+ list(REMOVE_ITEM cc_library_DEPS warpctc)
+ add_dependencies(${TARGET_NAME} warpctc)
+ endif()
add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
endif()
@@ -224,12 +229,18 @@ function(cc_test TARGET_NAME)
if(WITH_TESTING)
set(options "")
set(oneValueArgs "")
- set(multiValueArgs SRCS DEPS)
+ set(multiValueArgs SRCS DEPS ARGS)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cc_test_SRCS})
- target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+ # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
+ target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+ if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
+ list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
+ endif()
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
- add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+ add_test(NAME ${TARGET_NAME}
+ COMMAND ${TARGET_NAME} ${cc_test_ARGS}
+ WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif()
endfunction(cc_test)
@@ -457,12 +468,12 @@ endfunction()
function(py_test TARGET_NAME)
if(WITH_TESTING)
- set(options STATIC static SHARED shared)
+ set(options "")
set(oneValueArgs "")
- set(multiValueArgs SRCS DEPS ARGS)
+ set(multiValueArgs SRCS DEPS ARGS ENVS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_test(NAME ${TARGET_NAME}
- COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python
+ COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python ${py_test_ENVS}
${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif()
diff --git a/doc/api/v2/fluid/data_feeder.rst b/doc/api/v2/fluid/data_feeder.rst
index 0fa78f7dfb04c13be7eb83b7fd35cb03f2f4a7fa..a591c7334fd31c98a94b50a4344f251560a0f2f9 100644
--- a/doc/api/v2/fluid/data_feeder.rst
+++ b/doc/api/v2/fluid/data_feeder.rst
@@ -1,9 +1,14 @@
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
===========
-DataFeeder
+data_feeder
===========
DataFeeder
------------
-.. automodule:: paddle.v2.fluid.data_feeder
- :members: DataFeeder
+----------
+
+.. autoclass:: paddle.v2.fluid.data_feeder.DataFeeder
+ :members:
:noindex:
+
diff --git a/doc/api/v2/fluid/evaluator.rst b/doc/api/v2/fluid/evaluator.rst
index a23f3301d0331e0ea3733f06444515eb4680cd31..00dcecfd628a35d83d1c596bf0aea819a1705862 100644
--- a/doc/api/v2/fluid/evaluator.rst
+++ b/doc/api/v2/fluid/evaluator.rst
@@ -1,9 +1,21 @@
-===========
-Evaluator
-===========
-
-Evaluator
------------
-.. automodule:: paddle.v2.fluid.evaluator
- :members: Evaluator
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+=========
+evaluator
+=========
+
+Accuracy
+--------
+
+.. autoclass:: paddle.v2.fluid.evaluator.Accuracy
+ :members:
:noindex:
+
+ChunkEvaluator
+--------------
+
+.. autoclass:: paddle.v2.fluid.evaluator.ChunkEvaluator
+ :members:
+ :noindex:
+
diff --git a/doc/api/v2/fluid/executor.rst b/doc/api/v2/fluid/executor.rst
index 3a283538c120cfa1ef646c390bb71c6251c23675..a028f6283f2ca333bdf6c9857a98661c0222b41e 100644
--- a/doc/api/v2/fluid/executor.rst
+++ b/doc/api/v2/fluid/executor.rst
@@ -1,9 +1,32 @@
-===========
-Executor
-===========
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+========
+executor
+========
Executor
+--------
+
+.. autoclass:: paddle.v2.fluid.executor.Executor
+ :members:
+ :noindex:
+
+global_scope
+------------
+
+.. autofunction:: paddle.v2.fluid.executor.global_scope
+ :noindex:
+
+scope_guard
-----------
-.. automodule:: paddle.v2.fluid.executor
- :members: Executor
+
+.. autofunction:: paddle.v2.fluid.executor.scope_guard
+ :noindex:
+
+switch_scope
+------------
+
+.. autofunction:: paddle.v2.fluid.executor.switch_scope
:noindex:
+
diff --git a/doc/api/v2/fluid/gen_doc.py b/doc/api/v2/fluid/gen_doc.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2147fd3f7ea635d8f14210fbcd1a568ee2230ee
--- /dev/null
+++ b/doc/api/v2/fluid/gen_doc.py
@@ -0,0 +1,109 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import argparse
+import sys
+import types
+
+import paddle.v2.fluid as fluid
+
+
+def parse_arg():
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--submodules', nargs="*")
+ parser.add_argument(
+ 'module', type=str, help='Generate the documentation of which module')
+ return parser.parse_args()
+
+
+class DocGenerator(object):
+ def __init__(self, module_name, stream=sys.stdout):
+ self.stream = stream
+ self.module_name = module_name
+ if not hasattr(fluid, module_name):
+ raise ValueError("Cannot find fluid.{0}".format(module_name))
+ else:
+ self.module = getattr(fluid, module_name)
+ self.stream.write('''.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+''')
+
+ self._print_header_(module_name, dot='=', is_title=True)
+
+ def print_submodule(self, submodule_name):
+ submodule = getattr(self.module, submodule_name)
+ if submodule is None:
+ raise ValueError("Cannot find submodule {0}".format(submodule_name))
+ self.print_section(submodule_name)
+
+ for item in submodule.__all__:
+ self.print_item(item)
+
+ def print_current_module(self):
+ for item in self.module.__all__:
+ self.print_item(item)
+
+ def print_section(self, name):
+ self._print_header_(name, dot='=', is_title=False)
+
+ def print_item(self, name):
+ item = getattr(self.module, name)
+ if isinstance(item, types.TypeType):
+ self.print_class(name)
+ elif isinstance(item, types.FunctionType):
+ self.print_method(name)
+ else:
+ raise RuntimeError("Unsupported item {0}".format(name))
+
+ def print_class(self, name):
+ self._print_header_(name, dot='-', is_title=False)
+ self.stream.write('''.. autoclass:: paddle.v2.fluid.{0}.{1}
+ :members:
+ :noindex:
+
+'''.format(self.module_name, name))
+
+ def print_method(self, name):
+ self._print_header_(name, dot='-', is_title=False)
+ self.stream.write('''.. autofunction:: paddle.v2.fluid.{0}.{1}
+ :noindex:
+
+'''.format(self.module_name, name))
+
+ def _print_header_(self, name, dot, is_title):
+ dot_line = dot * len(name)
+ if is_title:
+ self.stream.write(dot_line)
+ self.stream.write('\n')
+ self.stream.write(name)
+ self.stream.write('\n')
+ self.stream.write(dot_line)
+ self.stream.write('\n')
+ self.stream.write('\n')
+
+
+def main():
+ args = parse_arg()
+ gen = DocGenerator(args.module)
+ if args.submodules is None:
+ gen.print_current_module()
+ else:
+ for submodule_name in args.submodules:
+ gen.print_submodule(submodule_name)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/doc/api/v2/fluid/gen_doc.sh b/doc/api/v2/fluid/gen_doc.sh
new file mode 100755
index 0000000000000000000000000000000000000000..ba7b7ba8e51399deb852b0a7c8ddd3128f521e85
--- /dev/null
+++ b/doc/api/v2/fluid/gen_doc.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+python gen_doc.py layers --submodules control_flow device io nn ops tensor > layers.rst
+
+for module in io data_feeder evaluator executor initializer nets optimizer param_attr profiler regularizer
+do
+ python gen_doc.py ${module} > ${module}.rst
+done
diff --git a/doc/api/v2/fluid/initializer.rst b/doc/api/v2/fluid/initializer.rst
index 8f587837e9873370722062404f511654a9460587..c38be033fff2997930525f51c93995db09daa2b6 100644
--- a/doc/api/v2/fluid/initializer.rst
+++ b/doc/api/v2/fluid/initializer.rst
@@ -1,50 +1,35 @@
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
===========
-Initializer
+initializer
===========
+Constant
+--------
-
-Initializer
------------
-.. automodule:: paddle.v2.fluid.initializer
- :members: Initializer
- :noindex:
-
-
-
-ConstantInitializer
--------------------
-.. automodule:: paddle.v2.fluid.initializer
- :members: ConstantInitializer
+.. autoclass:: paddle.v2.fluid.initializer.Constant
+ :members:
:noindex:
+Uniform
+-------
-
-UniformInitializer
-------------------
-.. automodule:: paddle.v2.fluid.initializer
- :members: UniformInitializer
- :noindex:
-
-
-
-NormalInitializer
------------------
-.. automodule:: paddle.v2.fluid.initializer
- :members: NormalInitializer
+.. autoclass:: paddle.v2.fluid.initializer.Uniform
+ :members:
:noindex:
+Normal
+------
-XavierInitializer
------------------
-.. automodule:: paddle.v2.fluid.initializer
- :members: XavierInitializer
+.. autoclass:: paddle.v2.fluid.initializer.Normal
+ :members:
:noindex:
+Xavier
+------
-MSRAInitializer
----------------
-.. automodule:: paddle.v2.fluid.initializer
- :members: MSRAInitializer
+.. autoclass:: paddle.v2.fluid.initializer.Xavier
+ :members:
:noindex:
diff --git a/doc/api/v2/fluid/io.rst b/doc/api/v2/fluid/io.rst
index 67f68c4e9e16b379207b8de114cdf769e056f78e..37c9c273e369532e8ff596e9649cb695a98a2505 100644
--- a/doc/api/v2/fluid/io.rst
+++ b/doc/api/v2/fluid/io.rst
@@ -1,10 +1,61 @@
-===========
-IO
-===========
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+==
+io
+==
+save_vars
+---------
-is_parameter
+.. autofunction:: paddle.v2.fluid.io.save_vars
+ :noindex:
+
+save_params
-----------
-.. autofunction:: paddle.v2.fluid.io.is_parameter
+
+.. autofunction:: paddle.v2.fluid.io.save_params
+ :noindex:
+
+save_persistables
+-----------------
+
+.. autofunction:: paddle.v2.fluid.io.save_persistables
+ :noindex:
+
+load_vars
+---------
+
+.. autofunction:: paddle.v2.fluid.io.load_vars
+ :noindex:
+
+load_params
+-----------
+
+.. autofunction:: paddle.v2.fluid.io.load_params
:noindex:
+
+load_persistables
+-----------------
+
+.. autofunction:: paddle.v2.fluid.io.load_persistables
+ :noindex:
+
+save_inference_model
+--------------------
+
+.. autofunction:: paddle.v2.fluid.io.save_inference_model
+ :noindex:
+
+load_inference_model
+--------------------
+
+.. autofunction:: paddle.v2.fluid.io.load_inference_model
+ :noindex:
+
+get_inference_program
+---------------------
+
+.. autofunction:: paddle.v2.fluid.io.get_inference_program
+ :noindex:
+
diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst
index f738bf15641d9fca0bfb0c10821de778ceee0d79..e24613b94b422b7cdf9c6383c359fa92a4faf6ff 100644
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@@ -1,541 +1,799 @@
-==========
-Layers
-==========
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+======
+layers
+======
-fc
----
-.. autofunction:: paddle.v2.fluid.layers.fc
+control_flow
+============
+
+split_lod_tensor
+----------------
+
+.. autofunction:: paddle.v2.fluid.layers.split_lod_tensor
:noindex:
-embedding
----------
-.. autofunction:: paddle.v2.fluid.layers.embedding
+merge_lod_tensor
+----------------
+
+.. autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
:noindex:
-dynamic_lstm
-------------
-.. autofunction:: paddle.v2.fluid.layers.dynamic_lstm
+BlockGuard
+----------
+
+.. autoclass:: paddle.v2.fluid.layers.BlockGuard
+ :members:
:noindex:
-dynamic_gru
------------
-.. autofunction:: paddle.v2.fluid.layers.dynamic_gru
+BlockGuardWithCompletion
+------------------------
+
+.. autoclass:: paddle.v2.fluid.layers.BlockGuardWithCompletion
+ :members:
:noindex:
-data
-----
-.. autofunction:: paddle.v2.fluid.layers.data
+StaticRNNMemoryLink
+-------------------
+
+.. autoclass:: paddle.v2.fluid.layers.StaticRNNMemoryLink
+ :members:
:noindex:
-mean
-----
-.. autofunction:: paddle.v2.fluid.layers.mean
+WhileGuard
+----------
+
+.. autoclass:: paddle.v2.fluid.layers.WhileGuard
+ :members:
:noindex:
-mul
----
-.. autofunction:: paddle.v2.fluid.layers.mul
+While
+-----
+
+.. autoclass:: paddle.v2.fluid.layers.While
+ :members:
:noindex:
-elementwise_add
----------------
-.. autofunction:: paddle.v2.fluid.layers.elementwise_add
+lod_rank_table
+--------------
+
+.. autofunction:: paddle.v2.fluid.layers.lod_rank_table
:noindex:
-elementwise_sub
----------------
-.. autofunction:: paddle.v2.fluid.layers.elementwise_sub
+max_sequence_len
+----------------
+
+.. autofunction:: paddle.v2.fluid.layers.max_sequence_len
:noindex:
-elementwise_mul
----------------
-.. autofunction:: paddle.v2.fluid.layers.elementwise_mul
+topk
+----
+
+.. autofunction:: paddle.v2.fluid.layers.topk
:noindex:
-elementwise_div
----------------
-.. autofunction:: paddle.v2.fluid.layers.elementwise_div
+lod_tensor_to_array
+-------------------
+
+.. autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
:noindex:
+array_to_lod_tensor
+-------------------
-dropout
--------
-.. autofunction:: paddle.v2.fluid.layers.dropout
+.. autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
:noindex:
+increment
+---------
-reshape
---------
-.. autofunction:: paddle.v2.fluid.layers.reshape
+.. autofunction:: paddle.v2.fluid.layers.increment
:noindex:
+array_write
+-----------
-sigmoid
----------
-.. autofunction:: paddle.v2.fluid.layers.sigmoid
+.. autofunction:: paddle.v2.fluid.layers.array_write
:noindex:
+create_array
+------------
-scale
+.. autofunction:: paddle.v2.fluid.layers.create_array
+ :noindex:
+
+less_than
---------
-.. autofunction:: paddle.v2.fluid.layers.scale
+
+.. autofunction:: paddle.v2.fluid.layers.less_than
:noindex:
+array_read
+----------
-transpose
+.. autofunction:: paddle.v2.fluid.layers.array_read
+ :noindex:
+
+shrink_memory
+-------------
+
+.. autofunction:: paddle.v2.fluid.layers.shrink_memory
+ :noindex:
+
+array_length
+------------
+
+.. autofunction:: paddle.v2.fluid.layers.array_length
+ :noindex:
+
+IfElse
+------
+
+.. autoclass:: paddle.v2.fluid.layers.IfElse
+ :members:
+ :noindex:
+
+DynamicRNN
+----------
+
+.. autoclass:: paddle.v2.fluid.layers.DynamicRNN
+ :members:
+ :noindex:
+
+ConditionalBlock
+----------------
+
+.. autoclass:: paddle.v2.fluid.layers.ConditionalBlock
+ :members:
+ :noindex:
+
+StaticRNN
---------
-.. autofunction:: paddle.v2.fluid.layers.transpose
+
+.. autoclass:: paddle.v2.fluid.layers.StaticRNN
+ :members:
:noindex:
+reorder_lod_tensor_by_rank
+--------------------------
-sigmoid_cross_entropy_with_logits
----------------------------------
-.. autofunction:: paddle.v2.fluid.layers.esigmoid_cross_entropy_with_logits
+.. autofunction:: paddle.v2.fluid.layers.reorder_lod_tensor_by_rank
:noindex:
+ParallelDo
+----------
-cast
+.. autoclass:: paddle.v2.fluid.layers.ParallelDo
+ :members:
+ :noindex:
+
+Print
+-----
+
+.. autofunction:: paddle.v2.fluid.layers.Print
+ :noindex:
+
+device
+======
+
+get_places
+----------
+
+.. autofunction:: paddle.v2.fluid.layers.get_places
+ :noindex:
+
+io
+==
+
+data
----
-.. autofunction:: paddle.v2.fluid.layers.cast
+
+.. autofunction:: paddle.v2.fluid.layers.data
:noindex:
+BlockGuardServ
+--------------
-concat
--------
-.. autofunction:: paddle.v2.fluid.layers.concat
+.. autoclass:: paddle.v2.fluid.layers.BlockGuardServ
+ :members:
:noindex:
+ListenAndServ
+-------------
+
+.. autoclass:: paddle.v2.fluid.layers.ListenAndServ
+ :members:
+ :noindex:
-sums
+Send
----
-.. autofunction:: paddle.v2.fluid.layers.sums
+
+.. autofunction:: paddle.v2.fluid.layers.Send
:noindex:
+nn
+==
-linear_chain_crf
-----------------
-.. autofunction:: paddle.v2.fluid.layers.linear_chain_crf
+fc
+--
+
+.. autofunction:: paddle.v2.fluid.layers.fc
:noindex:
+embedding
+---------
-assign
--------
.. autofunction:: paddle.v2.fluid.layers.embedding
:noindex:
+dynamic_lstm
+------------
-split_lod_tensor
-----------------
-.. autofunction:: paddle.v2.fluid.layers.split_lod_tensor
+.. autofunction:: paddle.v2.fluid.layers.dynamic_lstm
:noindex:
+dynamic_lstmp
+-------------
-merge_lod_tensor
+.. autofunction:: paddle.v2.fluid.layers.dynamic_lstmp
+ :noindex:
+
+dynamic_gru
+-----------
+
+.. autofunction:: paddle.v2.fluid.layers.dynamic_gru
+ :noindex:
+
+gru_unit
+--------
+
+.. autofunction:: paddle.v2.fluid.layers.gru_unit
+ :noindex:
+
+linear_chain_crf
----------------
-.. autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
+
+.. autofunction:: paddle.v2.fluid.layers.linear_chain_crf
+ :noindex:
+
+crf_decoding
+------------
+
+.. autofunction:: paddle.v2.fluid.layers.crf_decoding
:noindex:
cos_sim
---------
+-------
+
.. autofunction:: paddle.v2.fluid.layers.cos_sim
:noindex:
-
cross_entropy
-------------
+
.. autofunction:: paddle.v2.fluid.layers.cross_entropy
:noindex:
-
-
square_error_cost
-----------------
+
.. autofunction:: paddle.v2.fluid.layers.square_error_cost
:noindex:
-
accuracy
----------
+--------
+
.. autofunction:: paddle.v2.fluid.layers.accuracy
:noindex:
+chunk_eval
+----------
+
+.. autofunction:: paddle.v2.fluid.layers.chunk_eval
+ :noindex:
sequence_conv
-------------
+
.. autofunction:: paddle.v2.fluid.layers.sequence_conv
:noindex:
-
conv2d
------
+
.. autofunction:: paddle.v2.fluid.layers.conv2d
:noindex:
-
sequence_pool
-------------
+
.. autofunction:: paddle.v2.fluid.layers.sequence_pool
:noindex:
+pool2d
+------
-sequence_first_step
--------------------
-.. autofunction:: paddle.v2.fluid.layers.sequence_first_step
+.. autofunction:: paddle.v2.fluid.layers.pool2d
:noindex:
+batch_norm
+----------
-sequence_last_step
+.. autofunction:: paddle.v2.fluid.layers.batch_norm
+ :noindex:
+
+beam_search_decode
------------------
-.. autofunction:: paddle.v2.fluid.layers.sequence_last_step
+
+.. autofunction:: paddle.v2.fluid.layers.beam_search_decode
:noindex:
+conv2d_transpose
+----------------
-pool2d
-------
-.. autofunction:: paddle.v2.fluid.layers.pool2d
+.. autofunction:: paddle.v2.fluid.layers.conv2d_transpose
:noindex:
+sequence_expand
+---------------
-batch_norm
+.. autofunction:: paddle.v2.fluid.layers.sequence_expand
+ :noindex:
+
+lstm_unit
+---------
+
+.. autofunction:: paddle.v2.fluid.layers.lstm_unit
+ :noindex:
+
+reduce_sum
----------
-.. autofunction:: paddle.v2.fluid.layers.batch_norm
+
+.. autofunction:: paddle.v2.fluid.layers.reduce_sum
:noindex:
+reduce_mean
+-----------
-beam_search_decode
+.. autofunction:: paddle.v2.fluid.layers.reduce_mean
+ :noindex:
+
+reduce_max
+----------
+
+.. autofunction:: paddle.v2.fluid.layers.reduce_max
+ :noindex:
+
+reduce_min
+----------
+
+.. autofunction:: paddle.v2.fluid.layers.reduce_min
+ :noindex:
+
+sequence_first_step
+-------------------
+
+.. autofunction:: paddle.v2.fluid.layers.sequence_first_step
+ :noindex:
+
+sequence_last_step
------------------
-.. autofunction:: paddle.v2.fluid.layers.beam_search_decode
+
+.. autofunction:: paddle.v2.fluid.layers.sequence_last_step
+ :noindex:
+
+dropout
+-------
+
+.. autofunction:: paddle.v2.fluid.layers.dropout
:noindex:
+split
+-----
-lod_rank_table
---------------
-.. autofunction:: paddle.v2.fluid.layers.lod_rank_table
+.. autofunction:: paddle.v2.fluid.layers.split
:noindex:
+ctc_greedy_decoder
+------------------
-max_sequence_len
-----------------
-.. autofunction:: paddle.v2.fluid.layers.max_sequence_len
+.. autofunction:: paddle.v2.fluid.layers.ctc_greedy_decoder
:noindex:
+edit_distance
+-------------
-topk
------
-.. autofunction:: paddle.v2.fluid.layers.topk
+.. autofunction:: paddle.v2.fluid.layers.edit_distance
:noindex:
+l2_normalize
+------------
-lod_tensor_to_array
--------------------
-.. autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
+.. autofunction:: paddle.v2.fluid.layers.l2_normalize
:noindex:
+matmul
+------
-
-array_to_lod_tensor
--------------------
-.. autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
+.. autofunction:: paddle.v2.fluid.layers.matmul
:noindex:
+warpctc
+-------
+.. autofunction:: paddle.v2.fluid.layers.warpctc
+ :noindex:
+sequence_reshape
+----------------
-fill_constant
--------------
-.. autofunction:: paddle.v2.fluid.layers.fill_constant
+.. autofunction:: paddle.v2.fluid.layers.sequence_reshape
:noindex:
+transpose
+---------
+.. autofunction:: paddle.v2.fluid.layers.transpose
+ :noindex:
-fill_constant_batch_size_like
------------------------------
-.. autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
+im2sequence
+-----------
+
+.. autofunction:: paddle.v2.fluid.layers.im2sequence
:noindex:
+nce
+---
-ones
-----
-.. autofunction:: paddle.v2.fluid.layers.ones
+.. autofunction:: paddle.v2.fluid.layers.nce
:noindex:
+beam_search
+-----------
-zeros
------
-.. autofunction:: paddle.v2.fluid.layers.zeros
+.. autofunction:: paddle.v2.fluid.layers.beam_search
:noindex:
+row_conv
+--------
-increment
----------
-.. autofunction:: paddle.v2.fluid.layers.increment
+.. autofunction:: paddle.v2.fluid.layers.row_conv
:noindex:
+multiplex
+---------
-array_write
------------
-.. autofunction:: paddle.v2.fluid.layers.array_write
+.. autofunction:: paddle.v2.fluid.layers.multiplex
:noindex:
+ops
+===
+mean
+----
-create_array
-------------
-.. autofunction:: paddle.v2.fluid.layers.create_array
+.. autofunction:: paddle.v2.fluid.layers.mean
:noindex:
+mul
+---
-less_than
----------
-.. autofunction:: paddle.v2.fluid.layers.less_than
+.. autofunction:: paddle.v2.fluid.layers.mul
:noindex:
+reshape
+-------
-array_read
-----------
-.. autofunction:: paddle.v2.fluid.layers.array_read
+.. autofunction:: paddle.v2.fluid.layers.reshape
:noindex:
+scale
+-----
-shrink_memory
---------------
-.. autofunction:: paddle.v2.fluid.layers.shrink_memory
+.. autofunction:: paddle.v2.fluid.layers.scale
:noindex:
+sigmoid_cross_entropy_with_logits
+---------------------------------
-array_length
--------------
-.. autofunction:: paddle.v2.fluid.layers.array_length
+.. autofunction:: paddle.v2.fluid.layers.sigmoid_cross_entropy_with_logits
:noindex:
+elementwise_add
+---------------
-conv2d_transpose
-----------------
-.. autofunction:: paddle.v2.fluid.layers.conv2d_transpose
+.. autofunction:: paddle.v2.fluid.layers.elementwise_add
:noindex:
-
-sequence_expand
+elementwise_div
---------------
-.. autofunction:: paddle.v2.fluid.layers.sequence_expand
+
+.. autofunction:: paddle.v2.fluid.layers.elementwise_div
:noindex:
+elementwise_sub
+---------------
-gru_unit
---------
-.. autofunction:: paddle.v2.fluid.layers.gru_unit
+.. autofunction:: paddle.v2.fluid.layers.elementwise_sub
:noindex:
+elementwise_mul
+---------------
-lstm_unit
----------
-.. autofunction:: paddle.v2.fluid.layers.lstm_unit
+.. autofunction:: paddle.v2.fluid.layers.elementwise_mul
:noindex:
+elementwise_max
+---------------
-sequence_softmax
-----------------
-.. autofunction:: paddle.v2.fluid.layers.sequence_softmax
+.. autofunction:: paddle.v2.fluid.layers.elementwise_max
:noindex:
+elementwise_min
+---------------
-reduce_sum
-----------
-.. autofunction:: paddle.v2.fluid.layers.reduce_sum
+.. autofunction:: paddle.v2.fluid.layers.elementwise_min
:noindex:
+elementwise_pow
+---------------
-reduce_mean
------------
-.. autofunction:: paddle.v2.fluid.layers.reduce_mean
+.. autofunction:: paddle.v2.fluid.layers.elementwise_pow
:noindex:
+clip
+----
-reduce_max
-----------
-.. autofunction:: paddle.v2.fluid.layers.reduce_max
+.. autofunction:: paddle.v2.fluid.layers.clip
:noindex:
+clip_by_norm
+------------
-reduce_min
-----------
-.. autofunction:: paddle.v2.fluid.layers.reduce_min
+.. autofunction:: paddle.v2.fluid.layers.clip_by_norm
:noindex:
+sequence_softmax
+----------------
-split
------
-.. autofunction:: paddle.v2.fluid.layers.split
+.. autofunction:: paddle.v2.fluid.layers.sequence_softmax
:noindex:
+sigmoid
+-------
-matmul
-------
-.. autofunction:: paddle.v2.fluid.layers.matmul
+.. autofunction:: paddle.v2.fluid.layers.sigmoid
:noindex:
logsigmoid
----------
+
.. autofunction:: paddle.v2.fluid.layers.logsigmoid
:noindex:
exp
---
+
.. autofunction:: paddle.v2.fluid.layers.exp
:noindex:
relu
----
+
.. autofunction:: paddle.v2.fluid.layers.relu
:noindex:
tanh
----
+
.. autofunction:: paddle.v2.fluid.layers.tanh
:noindex:
tanh_shrink
-----------
+
.. autofunction:: paddle.v2.fluid.layers.tanh_shrink
:noindex:
softshrink
----------
+
.. autofunction:: paddle.v2.fluid.layers.softshrink
:noindex:
sqrt
----
+
.. autofunction:: paddle.v2.fluid.layers.sqrt
:noindex:
abs
-----
+---
+
.. autofunction:: paddle.v2.fluid.layers.abs
:noindex:
ceil
----
+
.. autofunction:: paddle.v2.fluid.layers.ceil
:noindex:
floor
-----
+
.. autofunction:: paddle.v2.fluid.layers.floor
:noindex:
round
-----
+
.. autofunction:: paddle.v2.fluid.layers.round
:noindex:
reciprocal
----------
+
.. autofunction:: paddle.v2.fluid.layers.reciprocal
:noindex:
log
---
+
.. autofunction:: paddle.v2.fluid.layers.log
:noindex:
square
------
+
.. autofunction:: paddle.v2.fluid.layers.square
:noindex:
softplus
--------
+
.. autofunction:: paddle.v2.fluid.layers.softplus
:noindex:
softsign
----------
+--------
+
.. autofunction:: paddle.v2.fluid.layers.softsign
:noindex:
brelu
-----
+
.. autofunction:: paddle.v2.fluid.layers.brelu
:noindex:
leaky_relu
----------
+
.. autofunction:: paddle.v2.fluid.layers.leaky_relu
:noindex:
soft_relu
---------
+
.. autofunction:: paddle.v2.fluid.layers.soft_relu
:noindex:
elu
-----
+---
+
.. autofunction:: paddle.v2.fluid.layers.elu
:noindex:
relu6
-----
+
.. autofunction:: paddle.v2.fluid.layers.relu6
:noindex:
pow
-----
+---
+
.. autofunction:: paddle.v2.fluid.layers.pow
:noindex:
+stanh
+-----
+
+.. autofunction:: paddle.v2.fluid.layers.stanh
+ :noindex:
+
hard_shrink
-----------
+
.. autofunction:: paddle.v2.fluid.layers.hard_shrink
:noindex:
thresholded_relu
----------------
+
.. autofunction:: paddle.v2.fluid.layers.thresholded_relu
:noindex:
hard_sigmoid
--------------
+------------
+
.. autofunction:: paddle.v2.fluid.layers.hard_sigmoid
:noindex:
swish
-------
+-----
+
.. autofunction:: paddle.v2.fluid.layers.swish
:noindex:
-im2sequence
+tensor
+======
+
+create_tensor
+-------------
+
+.. autofunction:: paddle.v2.fluid.layers.create_tensor
+ :noindex:
+
+create_parameter
+----------------
+
+.. autofunction:: paddle.v2.fluid.layers.create_parameter
+ :noindex:
+
+create_global_var
+-----------------
+
+.. autofunction:: paddle.v2.fluid.layers.create_global_var
+ :noindex:
+
+cast
+----
+
+.. autofunction:: paddle.v2.fluid.layers.cast
+ :noindex:
+
+concat
------
-.. autofunction:: paddle.v2.fluid.layers.im2sequence
+
+.. autofunction:: paddle.v2.fluid.layers.concat
:noindex:
-edit_distance
----------------
-.. autofunction:: paddle.v2.fluid.layers.edit_distance_error
+sums
+----
+
+.. autofunction:: paddle.v2.fluid.layers.sums
:noindex:
-ctc_greedy_decoder
----------------
-.. autofunction:: paddle.v2.fluid.layers.ctc_greedy_decoder
+assign
+------
+
+.. autofunction:: paddle.v2.fluid.layers.assign
:noindex:
-l2_normalize
-------------
-.. autofunction:: paddle.v2.fluid.layers.l2_normalize
+fill_constant_batch_size_like
+-----------------------------
+
+.. autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
:noindex:
-sequence_reshape
-----------------
-.. autofunction:: paddle.v2.fluid.layers.sequence_reshape
+fill_constant
+-------------
+
+.. autofunction:: paddle.v2.fluid.layers.fill_constant
:noindex:
-row_conv
---------
-.. autofunction:: paddle.v2.fluid.layers.row_conv
+ones
+----
+
+.. autofunction:: paddle.v2.fluid.layers.ones
:noindex:
-multiplex
----------
-.. autofunction:: paddle.v2.fluid.layers.multiplex
+zeros
+-----
+
+.. autofunction:: paddle.v2.fluid.layers.zeros
:noindex:
+
diff --git a/doc/api/v2/fluid/nets.rst b/doc/api/v2/fluid/nets.rst
index 500019bc507f859c4c91de5d322a82eb1e78e2de..015581b7660848bdb0845fafe2d3fc05405e6ae6 100644
--- a/doc/api/v2/fluid/nets.rst
+++ b/doc/api/v2/fluid/nets.rst
@@ -1,33 +1,31 @@
-===========
-Nets
-===========
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+====
+nets
+====
simple_img_conv_pool
--------------------
-.. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
- :noindex:
-
-img_conv_group
----------------
-.. autofunction:: paddle.v2.fluid.nets.img_conv_group
+.. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
:noindex:
-
sequence_conv_pool
------------------
+
.. autofunction:: paddle.v2.fluid.nets.sequence_conv_pool
:noindex:
-
glu
---
+
.. autofunction:: paddle.v2.fluid.nets.glu
:noindex:
-
scaled_dot_product_attention
----------------------------
+
.. autofunction:: paddle.v2.fluid.nets.scaled_dot_product_attention
:noindex:
diff --git a/doc/api/v2/fluid/optimizer.rst b/doc/api/v2/fluid/optimizer.rst
index 19b4940f08de3e2f7dc177f2961e538946d10a78..1691ebb9a7cb16da96e04147d0adea322374f529 100644
--- a/doc/api/v2/fluid/optimizer.rst
+++ b/doc/api/v2/fluid/optimizer.rst
@@ -1,54 +1,49 @@
-===========
-Optimizer
-===========
-
-Optimizer
------------
-.. automodule:: paddle.v2.fluid.optimizer
- :members: Optimizer
- :noindex:
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+=========
+optimizer
+=========
-SGDOptimizer
------------
-.. automodule:: paddle.v2.fluid.optimizer
- :members: SGDOptimizer
- :noindex:
+SGD
+---
+.. autoclass:: paddle.v2.fluid.optimizer.SGD
+ :members:
+ :noindex:
+Momentum
+--------
-MomentumOptimizer
------------------
-.. automodule:: paddle.v2.fluid.optimizer
- :members: MomentumOptimizer
+.. autoclass:: paddle.v2.fluid.optimizer.Momentum
+ :members:
:noindex:
+Adagrad
+-------
-
-AdagradOptimizer
-----------------
-.. automodule:: paddle.v2.fluid.optimizer
- :members: AdagradOptimizer
+.. autoclass:: paddle.v2.fluid.optimizer.Adagrad
+ :members:
:noindex:
+Adam
+----
-AdamOptimizer
--------------
-.. automodule:: paddle.v2.fluid.optimizer
- :members: AdamOptimizer
+.. autoclass:: paddle.v2.fluid.optimizer.Adam
+ :members:
:noindex:
+Adamax
+------
-AdamaxOptimizer
------------
-.. automodule:: paddle.v2.fluid.optimizer
- :members: AdamaxOptimizer
+.. autoclass:: paddle.v2.fluid.optimizer.Adamax
+ :members:
:noindex:
+DecayedAdagrad
+--------------
-DecayedAdagradOptimizer
------------------------
-.. automodule:: paddle.v2.fluid.optimizer
- :members: DecayedAdagradOptimizer
+.. autoclass:: paddle.v2.fluid.optimizer.DecayedAdagrad
+ :members:
:noindex:
diff --git a/doc/api/v2/fluid/param_attr.rst b/doc/api/v2/fluid/param_attr.rst
index ca0c8af9e8c4f2271de7a131ad0d27c0e8635f50..8083d0d858dafcd275eaddb9b475875ee42ef724 100644
--- a/doc/api/v2/fluid/param_attr.rst
+++ b/doc/api/v2/fluid/param_attr.rst
@@ -1,11 +1,21 @@
-===========
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
+==========
+param_attr
+==========
+
ParamAttr
-===========
+---------
+.. autoclass:: paddle.v2.fluid.param_attr.ParamAttr
+ :members:
+ :noindex:
+WeightNormParamAttr
+-------------------
-ParamAttr
------------
-.. automodule:: paddle.v2.fluid.param_attr
- :members: ParamAttr
+.. autoclass:: paddle.v2.fluid.param_attr.WeightNormParamAttr
+ :members:
:noindex:
+
diff --git a/doc/api/v2/fluid/profiler.rst b/doc/api/v2/fluid/profiler.rst
index 7d4042d1f41c12c4a551ba6576559d612116872a..4a1ff7cb6976e0054f77428b699ea679aa91394f 100644
--- a/doc/api/v2/fluid/profiler.rst
+++ b/doc/api/v2/fluid/profiler.rst
@@ -1,10 +1,25 @@
-===========
-Profiler
-===========
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+========
+profiler
+========
+cuda_profiler
+-------------
-Profiler
------------
.. autofunction:: paddle.v2.fluid.profiler.cuda_profiler
:noindex:
+
+reset_profiler
+--------------
+
+.. autofunction:: paddle.v2.fluid.profiler.reset_profiler
+ :noindex:
+
+profiler
+--------
+
+.. autofunction:: paddle.v2.fluid.profiler.profiler
+ :noindex:
+
diff --git a/doc/api/v2/fluid/regularizer.rst b/doc/api/v2/fluid/regularizer.rst
index 868e225ed3d59e79aeb217fb88081ea25f80fa2c..2c17d15599baa1d02eb87c7b6c40034769ebb3a4 100644
--- a/doc/api/v2/fluid/regularizer.rst
+++ b/doc/api/v2/fluid/regularizer.rst
@@ -1,25 +1,27 @@
+.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
+ !DO NOT EDIT THIS FILE MANUALLY!
+
===========
-Regularizer
+regularizer
===========
-WeightDecayRegularizer
-----------------------
-.. automodule:: paddle.v2.fluid.regularizer
- :members: WeightDecayRegularizer
- :noindex:
-
+append_regularization_ops
+-------------------------
-L2DecayRegularizer
-------------------
-.. automodule:: paddle.v2.fluid.regularizer
- :members: L2DecayRegularizer
+.. autofunction:: paddle.v2.fluid.regularizer.append_regularization_ops
:noindex:
+L1Decay
+-------
+.. autoclass:: paddle.v2.fluid.regularizer.L1Decay
+ :members:
+ :noindex:
-L1DecayRegularizer
--------------------
-.. automodule:: paddle.v2.fluid.regularizer
- :members: L1DecayRegularizer
+L2Decay
+-------
+.. autoclass:: paddle.v2.fluid.regularizer.L2Decay
+ :members:
+ :noindex:
diff --git a/doc/design/speech/README.MD b/doc/design/speech/deep_speech_2.md
similarity index 85%
rename from doc/design/speech/README.MD
rename to doc/design/speech/deep_speech_2.md
index 7304650e628dba210488cd2dc4836318b5383b2a..cfdc4d6df04344c70d3334626bd38eca997c31ff 100644
--- a/doc/design/speech/README.MD
+++ b/doc/design/speech/deep_speech_2.md
@@ -140,7 +140,19 @@ TODO by Assignees
### Beam Search with CTC and LM
-TODO by Assignees
+
+![](image/beam_search.png)
+
+Figure 2. Algorithm for CTC Beam Search Decoder.
+
+- The **Beam Search Decoder** for the DS2 CTC-trained network follows an approach similar to \[[3](#references)\], as shown in Figure 2, with two important modifications for the ambiguous parts:
+ - 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation, since one prefix may come from different paths (see the toy sketch after this list);
+ - 2) the condition ```if l^+ not in A_prev then``` after the probability computation is dropped, since it is hard to understand and seems unnecessary.
+- An **external scorer** is passed into the decoder to evaluate a candidate prefix during decoding, whenever a whitespace is appended in English decoding or any character is appended in Mandarin decoding.
+- Such an external scorer consists of a language model, a word count, or any other custom scorers.
+- The **language model** is built in Task 5, and its parameters should be carefully tuned to achieve the minimum WER/CER (c.f. Task 7).
+- This decoder needs to perform with **high efficiency** for convenient parameter tuning and for real-world speech recognition.
+
## Future Work
@@ -153,3 +165,4 @@ TODO by Assignees
1. Dario Amodei, et al., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](http://proceedings.mlr.press/v48/amodei16.pdf). ICML 2016.
2. Dario Amodei, et al., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](https://arxiv.org/abs/1512.02595). arXiv:1512.02595.
+3. Awni Y. Hannun, et al., [First-Pass Large Vocabulary Continuous Speech Recognition using Bi-Directional Recurrent DNNs](https://arxiv.org/abs/1408.2873). arXiv:1408.2873.
diff --git a/doc/design/speech/image/beam_search.png b/doc/design/speech/image/beam_search.png
new file mode 100644
index 0000000000000000000000000000000000000000..7f7e35f34223162d0f7f0ed97375909c43b830ae
Binary files /dev/null and b/doc/design/speech/image/beam_search.png differ
diff --git a/doc/design/support_new_device.md b/doc/design/support_new_device.md
index 4c5f10e2ecb9ec09b78926ca27552741d02d7cc9..8983df900460127fc130043c52373dab505363ba 100644
--- a/doc/design/support_new_device.md
+++ b/doc/design/support_new_device.md
@@ -2,9 +2,9 @@
## Background
-Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries flexibly and efficiently.
+Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries in a flexible and efficient manner.
-On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example,Intel CPUs support Eigen and MKL computing libraries while Nvidia GPUs support Eigen and cuDNN computing libraries. We have to implement operator specific kernels for each computing library.
+On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example, Intel CPUs support Eigen and MKL computing libraries while Nvidia GPUs support Eigen and cuDNN computing libraries. We have to implement operator specific kernels for each computing library.
On the other hand, users usually do not want to care about the low-level hardware and computing libraries when writing a neural network configuration. In Fluid, `Layer` is exposed in `Python`, and `Operator` is exposed in `C++`. Both `Layer` and `Operator` are hardware independent.
@@ -17,7 +17,7 @@ For a general overview of fluid, please refer to the [overview doc](https://gith
There are mainly three parts that we have to consider while integrating a new device/library:
-- Place and DeviceContext: indicates the device id and manages hardware resources
+- Place and DeviceContext: indicate the device id and manage hardware resources
- Memory and Tensor: malloc/free data on certain device
@@ -25,10 +25,10 @@ There are mainly three parts that we have to consider while integrating a new de
### Place and DeviceContext
-Please remind that device and computing library are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices.
+Please note that devices and computing libraries are not in one-to-one correspondence. A device can have many computing libraries, and a computing library can also support several devices.
#### Place
-Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add corresponding `DevicePlace`.
+Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add the corresponding `DevicePlace`.
```
| CPUPlace
@@ -144,7 +144,7 @@ class Tensor {
};
```
-`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, using `Resize` to configure its shape, and then call `mutuable_data` to allocate the actual memory.
+`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, use `Resize` to configure its shape, and then call `mutable_data` to allocate the actual memory.
```cpp
paddle::framework::Tensor t;
@@ -163,7 +163,7 @@ Fluid implements computing units based on different DeviceContexts. Some computi
Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/math/maxouting.h#L27) as an example:
-The interface is defined in header file.
+The interface is defined in the header file.
```
template <typename DeviceContext, typename T>
@@ -174,7 +174,7 @@ class MaxOutFunctor {
};
```
-CPU implemention is in .cc file
+The CPU implementation is in the .cc file.
```
template <typename DeviceContext, typename T>
@@ -188,7 +188,7 @@ class MaxOutFunctor {
};
```
-CUDA implemention is in .cu file
+The CUDA implementation is in the .cu file.
```
template <typename DeviceContext, typename T>
@@ -203,9 +203,9 @@ class MaxOutFunctor {
```
-We get computing handle from a concrete DeviceContext, and make compution on tensors.
+We first obtain the computing handle from a concrete DeviceContext and then compute on tensors.
-The implemention of `OpKernel` is similar to math functors, the extra thing we need to do is to register the OpKernel in a global map.
+The implementation of `OpKernel` is similar to that of the math functors; the extra thing we need to do is to register the OpKernel in a global map.
Fluid provides different register interfaces in op_registry.h
@@ -231,7 +231,7 @@ REGISTER_OP_CUDA_KERNEL(
## Advanced topics: How to switch between different Device/Library
-Generally, we will impelement OpKernel for all Device/Library of an Operator. We can easily train a Convolutional Neural Network in GPU. However, some OpKernel is not sutibale on a specific Device. For example, crf operator can only run on CPU, whereas most other operators can run at GPU. To achieve high performance in such circumstance, we have to switch between different Device/Library.
+Generally, we will implement an OpKernel for every Device/Library of an Operator. We can easily train a Convolutional Neural Network on GPU. However, some OpKernels are not suitable for a specific Device. For example, the crf operator can only run on CPU, whereas most other operators can run on GPU. To achieve high performance in such circumstances, we have to switch between different Devices/Libraries.
For more details, please refer to following docs:
diff --git a/doc/getstarted/build_and_install/build_from_source_cn.rst b/doc/getstarted/build_and_install/build_from_source_cn.rst
index 71904dc41ed0d946867d890cc585e1b88450ca8c..ff904b1022a41612c9680dce92d3fc2c69ad7e93 100644
--- a/doc/getstarted/build_and_install/build_from_source_cn.rst
+++ b/doc/getstarted/build_and_install/build_from_source_cn.rst
@@ -115,7 +115,7 @@ PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种B
"WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON"
"WITH_PYTHON", "是否内嵌PYTHON解释器", "ON"
"WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON"
- "WITH_TESTING", "是否开启单元测试", "ON"
+ "WITH_TESTING", "是否开启单元测试", "OFF"
"WITH_DOC", "是否编译中英文文档", "OFF"
"WITH_SWIG_PY", "是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练", "Auto"
"WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON"
diff --git a/doc/getstarted/build_and_install/build_from_source_en.rst b/doc/getstarted/build_and_install/build_from_source_en.rst
index 27f73b2e2c029b41d514e1612912ed1c335605b6..718fb869c23a1f7be82c87c726282bded9dad516 100644
--- a/doc/getstarted/build_and_install/build_from_source_en.rst
+++ b/doc/getstarted/build_and_install/build_from_source_en.rst
@@ -126,7 +126,7 @@ You can add :code:`-D` argument to pass such options, like:
"WITH_AVX", "Build with AVX support", "ON"
"WITH_PYTHON", "Build with integrated Python interpreter", "ON"
"WITH_STYLE_CHECK", "Check code style when building", "ON"
- "WITH_TESTING", "Build unit tests", "ON"
+ "WITH_TESTING", "Build unit tests", "OFF"
"WITH_DOC", "Build documentations", "OFF"
"WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
"WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON"
diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst
index 98fada7bdb46f4dd2927d6f93bcbcebbe7d18604..79d214635a069a739060e0b79424729f6ff90387 100644
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -95,6 +95,12 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note
docker run -p 8888:8888 paddlepaddle/book
+国内用户可以使用下面的镜像源来加速访问:
+
+ .. code-block:: bash
+
+ docker run -p 8888:8888 docker.paddlepaddlehub.com/book
+
然后在浏览器中输入以下网址:
.. code-block:: text
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst
index b1d0890b4cdddb77114a80276130afd07c22d270..e0e0559fb858a093db96a9b4ec1c5a45d6c71a38 100644
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -102,6 +102,12 @@ We provide a packaged book image, simply issue the command:
docker run -p 8888:8888 paddlepaddle/book
+For users in China, we provide a faster mirror:
+
+ .. code-block:: bash
+
+ docker run -p 8888:8888 docker.paddlepaddlehub.com/book
+
Then, you can copy and paste the address into the local browser:
.. code-block:: text
diff --git a/doc/howto/usage/cluster/cluster_train_cn.md b/doc/howto/usage/cluster/cluster_train_cn.md
index c2fc86687d7106aac7c74d6dd16bc229353cb7c1..0f3db59607fb6b43da01f5fdb46949087517ed6c 100644
--- a/doc/howto/usage/cluster/cluster_train_cn.md
+++ b/doc/howto/usage/cluster/cluster_train_cn.md
@@ -92,11 +92,11 @@ paddle.init(
参数说明
- use_gpu: **可选,默认False**,是否启用GPU训练
-- trainer_count:**必选,默认1**,当前训练任务trainer总个数
+- trainer_count:**必选,默认1**,当前trainer的线程数目
- port:**必选,默认7164**,连接到pserver的端口
- ports_num:**必选,默认1**,连接到pserver的端口个数
- ports_num_for_sparse:**必选,默认0**,和pserver之间用于稀疏类型参数通信的端口个数
-- num_gradient_servers:**必选,默认1**,当前训练任务pserver总数
+- num_gradient_servers:**必选,默认1**,当前训练任务trainer总数
- trainer_id:**必选,默认0**,每个trainer的唯一ID,从0开始的整数
- pservers:**必选,默认127.0.0.1**,当前训练任务启动的pserver的IP列表,多个IP使用“,”隔开
diff --git a/doc/howto/usage/cluster/cluster_train_en.md b/doc/howto/usage/cluster/cluster_train_en.md
index 28cd1fa7903e559e33a7fc2f00172fdfbe2fdc97..f9424f8f1a29fcf001c4e7976086512b22f6e858 100644
--- a/doc/howto/usage/cluster/cluster_train_en.md
+++ b/doc/howto/usage/cluster/cluster_train_en.md
@@ -95,11 +95,11 @@ paddle.init(
Parameter Description
- use_gpu: **optional, default False**, set to "True" to enable GPU training.
-- trainer_count: **required, default 1**, total count of trainers in the training job.
+- trainer_count: **required, default 1**, number of threads in current trainer.
- port: **required, default 7164**, port to connect to parameter server.
- ports_num: **required, default 1**, number of ports for communication.
- ports_num_for_sparse: **required, default 0**, number of ports for sparse type calculation.
-- num_gradient_servers: **required, default 1**, total number of gradient server.
+- num_gradient_servers: **required, default 1**, number of trainers in current job.
- trainer_id: **required, default 0**, ID for every trainer, start from 0.
- pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",".
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 8d9260811a8c9274dcaade9b090bab727d1952ca..8b71f73c36c33d882b34c833031c50cd14817e76 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -22,11 +22,11 @@ cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
-nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
+nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)
cc_test(variable_test SRCS variable_test.cc)
-cc_library(threadpool SRCS threadpool.cc)
+cc_library(threadpool SRCS threadpool.cc DEPS enforce)
cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
cc_library(scope SRCS scope.cc DEPS glog threadpool)
@@ -74,8 +74,10 @@ cc_library(backward SRCS backward.cc DEPS net_op)
cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op)
cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
+cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
+
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
-framework_proto backward glog lod_rank_table profiler)
+framework_proto backward glog lod_rank_table profiler feed_fetch_method)
cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
@@ -96,3 +98,5 @@ if(NOT WITH_C_API AND WITH_FLUID)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/framework.pb.h DESTINATION include/paddle/framework)
install(FILES details/cow_ptr.h details/op_registry.h DESTINATION include/paddle/framework/details)
endif()
+
+cc_test(channel_test SRCS channel_test.cc)
diff --git a/paddle/framework/channel.h b/paddle/framework/channel.h
new file mode 100644
index 0000000000000000000000000000000000000000..0570980c5a4d7fa45e672ae5baac65d2c65ddad9
--- /dev/null
+++ b/paddle/framework/channel.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stddef.h> // for size_t
+
+namespace paddle {
+namespace framework {
+
+// Channel is the abstract class of buffered and un-buffered channels.
+template <typename T>
+class Channel {
+ public:
+ virtual void Send(T*) = 0;
+ virtual void Receive(T*) = 0;
+ virtual size_t Cap() = 0;
+ virtual void Close() = 0;
+ virtual ~Channel() {}
+};
+
+// Forward declaration of channel implementations.
+namespace details {
+template <typename T>
+class Buffered;
+template <typename T>
+class UnBuffered;
+} // namespace details
+
+template <typename T>
+Channel<T>* MakeChannel(size_t buffer_size) {
+ if (buffer_size > 0) {
+ return new details::Buffered<T>(buffer_size);
+ }
+ return new details::UnBuffered<T>();
+}
+
+template <typename T>
+void CloseChannel(Channel<T>* ch) {
+ ch->Close();
+}
+
+} // namespace framework
+} // namespace paddle
+
+#include "paddle/framework/details/buffered_channel.h"
+#include "paddle/framework/details/unbuffered_channel.h"
diff --git a/paddle/framework/channel_test.cc b/paddle/framework/channel_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1510fb8abf54f05804bd404d9bd00ecc42fbef63
--- /dev/null
+++ b/paddle/framework/channel_test.cc
@@ -0,0 +1,80 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/channel.h"
+
+#include <chrono>
+#include <thread>
+
+#include "gtest/gtest.h"
+
+using paddle::framework::Channel;
+using paddle::framework::MakeChannel;
+using paddle::framework::CloseChannel;
+
+TEST(Channel, MakeAndClose) {
+ using paddle::framework::details::Buffered;
+ using paddle::framework::details::UnBuffered;
+ {
+ // MakeChannel should return a buffered channel if buffer_size > 0.
+ auto ch = MakeChannel<size_t>(10);
+ EXPECT_NE(dynamic_cast<Buffered<size_t>*>(ch), nullptr);
+ EXPECT_EQ(dynamic_cast<UnBuffered<size_t>*>(ch), nullptr);
+ CloseChannel(ch);
+ delete ch;
+ }
+ {
+ // MakeChannel should return an un-buffered channel if buffer_size is 0.
+ auto ch = MakeChannel<size_t>(0);
+ EXPECT_EQ(dynamic_cast<Buffered<size_t>*>(ch), nullptr);
+ EXPECT_NE(dynamic_cast<UnBuffered<size_t>*>(ch), nullptr);
+ CloseChannel(ch);
+ delete ch;
+ }
+}
+
+TEST(Channel, SufficientBufferSizeDoesntBlock) {
+ const size_t buffer_size = 10;
+ auto ch = MakeChannel<size_t>(buffer_size);
+ for (size_t i = 0; i < buffer_size; ++i) {
+ ch->Send(&i); // should not block
+ }
+
+ size_t out;
+ for (size_t i = 0; i < buffer_size; ++i) {
+ ch->Receive(&out); // should not block
+ EXPECT_EQ(out, i);
+ }
+ CloseChannel(ch);
+ delete ch;
+}
+
+TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
+ const size_t buffer_size = 10;
+ auto ch = MakeChannel<size_t>(buffer_size);
+ size_t sum = 0;
+ std::thread t([&]() {
+ // Try to write more than buffer size.
+ for (size_t i = 0; i < 2 * buffer_size; ++i) {
+ ch->Send(&i); // blocks once buffer_size items are unread
+ sum += i;
+ }
+ });
+ std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec for the sender thread
+ EXPECT_EQ(sum, 45U);
+
+ CloseChannel(ch);
+ t.join();
+ delete ch;
+}
diff --git a/paddle/framework/data_type.h b/paddle/framework/data_type.h
index 6a372ac32e48131eed28e2d42125feb5b92a11c7..98eb3e857d1943e71f1d41f24ecbedbe09e85b7b 100644
--- a/paddle/framework/data_type.h
+++ b/paddle/framework/data_type.h
@@ -79,5 +79,33 @@ inline void VisitDataType(proto::DataType type, Visitor visitor) {
}
}
+inline std::string DataTypeToString(const proto::DataType type) {
+ using namespace paddle::framework::proto;
+ switch (type) {
+ case DataType::FP16:
+ return "float16";
+ case DataType::FP32:
+ return "float32";
+ case DataType::FP64:
+ return "float64";
+ case DataType::INT16:
+ return "int16";
+ case DataType::INT32:
+ return "int32";
+ case DataType::INT64:
+ return "int64";
+ case DataType::BOOL:
+ return "bool";
+ default:
+ PADDLE_THROW("Unsupported type %d", type);
+ }
+}
+
+inline std::ostream& operator<<(std::ostream& out,
+ const proto::DataType& type) {
+ out << DataTypeToString(type);
+ return out;
+}
+
} // namespace framework
} // namespace paddle
diff --git a/paddle/framework/details/buffered_channel.h b/paddle/framework/details/buffered_channel.h
new file mode 100644
index 0000000000000000000000000000000000000000..b093e1589293b030ef2bedb82504a8e86b3dc857
--- /dev/null
+++ b/paddle/framework/details/buffered_channel.h
@@ -0,0 +1,102 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <condition_variable>
+#include <deque>
+#include <mutex>
+
+#include "paddle/framework/channel.h"
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+template <typename T>
+class Buffered : public paddle::framework::Channel<T> {
+ friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
+ friend void paddle::framework::CloseChannel<T>(Channel<T>*);
+
+ public:
+ virtual void Send(T*);
+ virtual void Receive(T*);
+ virtual size_t Cap() { return cap_; }
+ virtual void Close();
+ virtual ~Buffered();
+
+ private:
+ size_t cap_;
+ std::mutex mu_;
+ std::condition_variable empty_cond_var_;
+ std::condition_variable full_cond_var_;
+ std::deque<T> channel_;
+ bool closed_;
+
+ Buffered(size_t cap) : cap_(cap), closed_(false) {
+ PADDLE_ENFORCE_GT(cap, 0);
+ }
+
+ void NotifyAllSenders(std::unique_lock<std::mutex>*);
+};
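+
+// Usage sketch (illustrative, not part of this patch): Buffered has a
+// private constructor, so instances come from the factory functions in
+// paddle/framework/channel.h:
+//
+// auto* ch = paddle::framework::MakeChannel<int>(4); // capacity 4
+// int v = 1;
+// ch->Send(&v); // blocks only when the buffer is full
+// ch->Receive(&v); // blocks only when the buffer is empty
+// paddle::framework::CloseChannel(ch);
+// delete ch;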
+
+template <typename T>
+void Buffered<T>::Send(T* item) {
+ std::unique_lock<std::mutex> lock(mu_);
+ full_cond_var_.wait(lock,
+ [this]() { return channel_.size() < cap_ || closed_; });
+ if (!closed_) {
+ channel_.push_back(std::move(*item));
+ lock.unlock();
+ empty_cond_var_.notify_one();
+ }
+}
+
+template <typename T>
+void Buffered<T>::Receive(T* item) {
+ std::unique_lock<std::mutex> lock(mu_);
+ empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; });
+ if (!closed_) {
+ *item = std::move(channel_.front());
+ channel_.pop_front();
+ NotifyAllSenders(&lock);
+ } else {
+ item = nullptr;
+ }
+}
+
+template <typename T>
+void Buffered<T>::Close() {
+ std::unique_lock<std::mutex> lock(mu_);
+ closed_ = true;
+ NotifyAllSenders(&lock);
+}
+
+template <typename T>
+Buffered<T>::~Buffered() {
+ std::unique_lock<std::mutex> lock(mu_);
+ closed_ = true;
+ channel_.clear();
+ NotifyAllSenders(&lock);
+}
+
+template <typename T>
+void Buffered<T>::NotifyAllSenders(std::unique_lock<std::mutex>* lock) {
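+ // Unlock before notifying so woken senders do not immediately block on mu_.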
+ lock->unlock();
+ full_cond_var_.notify_all();
+}
+
+} // namespace details
+} // namespace framework
+} // namespace paddle
diff --git a/paddle/framework/details/unbuffered_channel.h b/paddle/framework/details/unbuffered_channel.h
new file mode 100644
index 0000000000000000000000000000000000000000..cc2d2e587eca981307d4e522bd569fbffa450207
--- /dev/null
+++ b/paddle/framework/details/unbuffered_channel.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <condition_variable>
+#include <mutex>
+#include <thread>
+
+#include "paddle/framework/channel.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+template <typename T>
+class UnBuffered : public paddle::framework::Channel<T> {
+ friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
+ friend void paddle::framework::CloseChannel<T>(Channel<T>*);
+
+ public:
+ virtual void Send(T*);
+ virtual void Receive(T*);
+ virtual size_t Cap() { return 0; }
+ virtual void Close();
+ virtual ~UnBuffered();
+
+ private:
+ UnBuffered() {}
+};
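+
+// NOTE: UnBuffered is only a stub in this change; Send/Receive/Close are
+// no-ops and the blocking rendezvous semantics are still to be implemented.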
+
+template <typename T>
+void UnBuffered<T>::Send(T* channel_element) {}
+
+template <typename T>
+void UnBuffered<T>::Receive(T*) {}
+
+template <typename T>
+void UnBuffered<T>::Close() {}
+
+template <typename T>
+UnBuffered<T>::~UnBuffered() {}
+
+} // namespace details
+} // namespace framework
+} // namespace paddle
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index c28ffefdd0872238299cdbb0653ee17cdad61699..9a232b08434d299d10bb2acdb6e96295de875d56 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -17,6 +17,7 @@ limitations under the License. */
#include <set>
#include "gflags/gflags.h"
+#include "paddle/framework/feed_fetch_method.h"
#include "paddle/framework/feed_fetch_type.h"
#include "paddle/framework/lod_rank_table.h"
#include "paddle/framework/lod_tensor_array.h"
@@ -24,7 +25,7 @@ limitations under the License. */
#include "paddle/platform/place.h"
#include "paddle/platform/profiler.h"
-DECLARE_bool(do_memory_benchmark);
+DECLARE_bool(benchmark);
DEFINE_bool(check_nan_inf, false,
"Checking whether operator produce NAN/INF or not. It will be "
"extremely slow so please use this flag wisely.");
@@ -32,9 +33,6 @@ DEFINE_bool(check_nan_inf, false,
namespace paddle {
namespace framework {
-const std::string kFeedOpType = "feed";
-const std::string kFetchOpType = "fetch";
-
Executor::Executor(const platform::Place& place) : place_(place) {}
static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
@@ -124,7 +122,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
op->Run(*local_scope, place_);
VLOG(3) << op->DebugStringEx(local_scope);
- if (FLAGS_do_memory_benchmark) {
+ if (FLAGS_benchmark) {
VLOG(2) << "Memory used after operator " + op->Type() + " running: "
<< memory::memory_usage(place_);
}
@@ -141,7 +139,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
if (create_vars && create_local_scope) {
scope->DeleteScope(local_scope);
}
- if (FLAGS_do_memory_benchmark) {
+ if (FLAGS_benchmark) {
VLOG(2) << "-------------------------------------------------------";
VLOG(2) << "Memory used after deleting local scope: "
<< memory::memory_usage(place_);
@@ -149,5 +147,164 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
}
}
+// Check whether the block already has feed operators and feed_holder.
+// Return false if the block does not have any feed operators.
+// If some feed operators have been prepended to the block, check that
+// the info contained in these feed operators matches the feed_targets
+// and feed_holder_name. Raise exception when any mismatch is found.
+// Return true if the block has feed operators and holder of matching info.
+static bool has_feed_operators(
+ BlockDesc* block, std::map<std::string, const LoDTensor*>& feed_targets,
+ const std::string& feed_holder_name) {
+ size_t feed_count = 0;
+ for (auto* op : block->AllOps()) {
+ if (op->Type() == kFeedOpType) {
+ feed_count++;
+ PADDLE_ENFORCE_EQ(op->Input("X")[0], feed_holder_name,
+ "Input to feed op should be '%s'", feed_holder_name);
+ std::string feed_target_name = op->Output("Out")[0];
+ PADDLE_ENFORCE(
+ feed_targets.find(feed_target_name) != feed_targets.end(),
+ "Feed operator output name '%s' cannot be found in 'feed_targets'",
+ feed_target_name);
+ }
+ }
+
+ if (feed_count > 0) {
+ PADDLE_ENFORCE_EQ(
+ feed_count, feed_targets.size(),
+ "The number of feed operators should match 'feed_targets'");
+
+ // When feed operators are present, so should be the feed_holder
+ auto var = block->FindVar(feed_holder_name);
+ PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
+ feed_holder_name);
+ PADDLE_ENFORCE_EQ(var->GetType(), proto::VarDesc::FEED_MINIBATCH,
+ "'%s' variable should be 'FEED_MINIBATCH' type",
+ feed_holder_name);
+ }
+
+ return feed_count > 0;
+}
+
+// Check whether the block already has fetch operators and fetch_holder.
+// Return false if the block does not have any fetch operators.
+// If some fetch operators have been appended to the block, check that
+// the info contained in these fetch operators matches the fetch_targets
+// and fetch_holder_name. Raise exception when any mismatch is found.
+// Return true if the block has fetch operators and holder of matching info.
+static bool has_fetch_operators(
+ BlockDesc* block, std::map<std::string, LoDTensor*>& fetch_targets,
+ const std::string& fetch_holder_name) {
+ size_t fetch_count = 0;
+ for (auto* op : block->AllOps()) {
+ if (op->Type() == kFetchOpType) {
+ fetch_count++;
+ PADDLE_ENFORCE_EQ(op->Output("Out")[0], fetch_holder_name,
+ "Output of fetch op should be '%s'", fetch_holder_name);
+ std::string fetch_target_name = op->Input("X")[0];
+ PADDLE_ENFORCE(
+ fetch_targets.find(fetch_target_name) != fetch_targets.end(),
+ "Fetch operator input name '%s' cannot be found in 'fetch_targets'",
+ fetch_target_name);
+ }
+ }
+
+ if (fetch_count > 0) {
+ PADDLE_ENFORCE_EQ(
+ fetch_count, fetch_targets.size(),
+ "The number of fetch operators should match 'fetch_targets'");
+
+ // When fetch operators are present, so should be the fetch_holder
+ auto var = block->FindVar(fetch_holder_name);
+ PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
+ fetch_holder_name);
+ PADDLE_ENFORCE_EQ(var->GetType(), proto::VarDesc::FETCH_LIST,
+ "'%s' variable should be 'FETCH_LIST' type",
+ fetch_holder_name);
+ }
+
+ return fetch_count > 0;
+}
+
+void Executor::Run(const ProgramDesc& program, Scope* scope,
+ std::map<std::string, const LoDTensor*>& feed_targets,
+ std::map<std::string, LoDTensor*>& fetch_targets,
+ const std::string& feed_holder_name,
+ const std::string& fetch_holder_name) {
+ auto* copy_program = new ProgramDesc(program);
+ auto* global_block = copy_program->MutableBlock(0);
+
+ if (!has_feed_operators(global_block, feed_targets, feed_holder_name)) {
+ // create feed_holder variable
+ auto* feed_holder = global_block->Var(feed_holder_name);
+ feed_holder->SetType(proto::VarDesc::FEED_MINIBATCH);
+ feed_holder->SetPersistable(true);
+
+ int i = 0;
+ for (auto& feed_target : feed_targets) {
+ std::string var_name = feed_target.first;
+ VLOG(3) << "feed target's name: " << var_name;
+
+ // prepend feed op
+ auto* op = global_block->PrependOp();
+ op->SetType(kFeedOpType);
+ op->SetInput("X", {feed_holder_name});
+ op->SetOutput("Out", {var_name});
+ op->SetAttr("col", {static_cast<int>(i)});
+ op->CheckAttrs();
+
+ i++;
+ }
+ }
+
+ // map the data of feed_targets to feed_holder
+ for (auto* op : global_block->AllOps()) {
+ if (op->Type() == kFeedOpType) {
+ std::string feed_target_name = op->Output("Out")[0];
+ int idx = boost::get<int>(op->GetAttr("col"));
+ SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name,
+ idx);
+ }
+ }
+
+ if (!has_fetch_operators(global_block, fetch_targets, fetch_holder_name)) {
+ // create fetch_holder variable
+ auto* fetch_holder = global_block->Var(fetch_holder_name);
+ fetch_holder->SetType(proto::VarDesc::FETCH_LIST);
+ fetch_holder->SetPersistable(true);
+
+ int i = 0;
+ for (auto& fetch_target : fetch_targets) {
+ std::string var_name = fetch_target.first;
+ VLOG(3) << "fetch target's name: " << var_name;
+
+ // append fetch op
+ auto* op = global_block->AppendOp();
+ op->SetType(kFetchOpType);
+ op->SetInput("X", {var_name});
+ op->SetOutput("Out", {fetch_holder_name});
+ op->SetAttr("col", {static_cast<int>(i)});
+ op->CheckAttrs();
+
+ i++;
+ }
+ }
+
+ Run(*copy_program, scope, 0, true, true);
+
+ // obtain the data of fetch_targets from fetch_holder
+ for (auto* op : global_block->AllOps()) {
+ if (op->Type() == kFetchOpType) {
+ std::string fetch_target_name = op->Input("X")[0];
+ int idx = boost::get<int>(op->GetAttr("col"));
+ *fetch_targets[fetch_target_name] =
+ GetFetchVariable(*scope, fetch_holder_name, idx);
+ }
+ }
+
+ delete copy_program;
+}
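+
+// Usage sketch (illustrative, not part of this patch):
+//
+// std::map<std::string, const LoDTensor*> feed_targets{{"x", &x_tensor}};
+// std::map<std::string, LoDTensor*> fetch_targets{{"y", &y_tensor}};
+// Executor exe(platform::CPUPlace());
+// exe.Run(program, &scope, feed_targets, fetch_targets);
+// // fetch_targets["y"] now holds the fetched result; the default
+// // "feed"/"fetch" holder variables were used.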
+
} // namespace framework
} // namespace paddle
diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h
index d869e18901b82959a40cc296aa0844c20ea63ac1..035ff48a52bd2fc4b1a46b48b1fbf1fbcb2ac70b 100644
--- a/paddle/framework/executor.h
+++ b/paddle/framework/executor.h
@@ -41,6 +41,12 @@ class Executor {
void Run(const ProgramDesc&, Scope*, int, bool create_local_scope = true,
bool create_vars = true);
+ void Run(const ProgramDesc& program, Scope* scope,
+ std::map<std::string, const LoDTensor*>& feed_targets,
+ std::map<std::string, LoDTensor*>& fetch_targets,
+ const std::string& feed_holder_name = "feed",
+ const std::string& fetch_holder_name = "fetch");
+
private:
const platform::Place place_;
};
diff --git a/paddle/framework/feed_fetch_method.cc b/paddle/framework/feed_fetch_method.cc
new file mode 100644
index 0000000000000000000000000000000000000000..21201b675519e34b11e9f1f3a6f2a135c06d63a7
--- /dev/null
+++ b/paddle/framework/feed_fetch_method.cc
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/feed_fetch_method.h"
+#include "glog/logging.h"
+#include "paddle/framework/variable.h"
+
+namespace paddle {
+namespace framework {
+
+void SetFeedVariable(Scope* scope, const LoDTensor& input,
+ const std::string& var_name, size_t index) {
+ // If var_name Variable is not found in GlobalScope, a new variable will
+ // be created.
+ VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
+ Variable* g_feed_value = scope->Var(var_name);
+ auto& feed_inputs =
+ *(g_feed_value->GetMutable<std::vector<paddle::framework::LoDTensor>>());
+ if (index >= feed_inputs.size()) {
+ feed_inputs.resize(index + 1);
+ }
+ // shared data with input tensor
+ feed_inputs[index].ShareDataWith(input);
+ // set lod
+ feed_inputs[index].set_lod(input.lod());
+}
+
+LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
+ size_t index) {
+ // Since we want to fetch a LoDTensor from a variable, the variable must
+ // be created already.
+ Variable* g_fetch_value = scope.FindVar(var_name);
+ PADDLE_ENFORCE(g_fetch_value->IsType<FeedFetchList>(),
+ "Only %s can be invoked by GetFetchVariable",
+ typeid(FeedFetchList).name());
+ auto& fetch_outputs = *g_fetch_value->GetMutable<FeedFetchList>();
+ PADDLE_ENFORCE_LT(index, fetch_outputs.size());
+ auto& tensor = fetch_outputs[index];
+ VLOG(3) << "Fetch " << var_name << " with index " << index
+ << " shape= " << tensor.dims();
+ return tensor;
+}
+
+} // namespace framework
+} // namespace paddle
diff --git a/paddle/framework/feed_fetch_method.h b/paddle/framework/feed_fetch_method.h
index 7feacb1e24708411e7fbb610f9909447cba9e291..b71945fcc8834d2e5fe21151e1e88788b4acd5c1 100644
--- a/paddle/framework/feed_fetch_method.h
+++ b/paddle/framework/feed_fetch_method.h
@@ -13,46 +13,18 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
-#include "glog/logging.h"
+
#include "paddle/framework/feed_fetch_type.h"
#include "paddle/framework/scope.h"
-#include "paddle/framework/variable.h"
namespace paddle {
namespace framework {
void SetFeedVariable(Scope* scope, const LoDTensor& input,
- const std::string& var_name, size_t index) {
- // If var_name Variable is not found in GlobalScope, a new variable will
- // be created.
- VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
- Variable* g_feed_value = scope->Var(var_name);
- auto& feed_inputs =
- *(g_feed_value->GetMutable<std::vector<paddle::framework::LoDTensor>>());
- if (index >= feed_inputs.size()) {
- feed_inputs.resize(index + 1);
- }
- // shared data with input tensor
- feed_inputs[index].ShareDataWith(input);
- // set lod
- feed_inputs[index].set_lod(input.lod());
-}
+ const std::string& var_name, size_t index);
LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
- size_t index) {
- // Since we want to fetch LodTensor from a variable, the variable must
- // be created alreadly.
- Variable* g_fetch_value = scope.FindVar(var_name);
- PADDLE_ENFORCE(g_fetch_value->IsType<FeedFetchList>(),
- "Only %s can be invoked by GetFetchVariable",
- typeid(FeedFetchList).name());
- auto& fetch_outputs = *g_fetch_value->GetMutable<FeedFetchList>();
- auto& tensor = fetch_outputs[index];
- VLOG(3) << "Fetch " << var_name << " with index " << index
- << " shape= " << tensor.dims();
- PADDLE_ENFORCE_LT(index, fetch_outputs.size());
- return tensor;
-}
+ size_t index);
} // namespace framework
} // namespace paddle
diff --git a/paddle/framework/feed_fetch_type.h b/paddle/framework/feed_fetch_type.h
index 9bc4a90c44828ecb7458d524f59609f01848cc5c..168f456675af508df86dd0520cdeb5d16d94ad31 100644
--- a/paddle/framework/feed_fetch_type.h
+++ b/paddle/framework/feed_fetch_type.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
+#include <string>
#include <vector>
#include "paddle/framework/lod_tensor.h"
@@ -20,5 +21,8 @@ namespace paddle {
namespace framework {
using FeedFetchType = LoDTensor;
using FeedFetchList = std::vector<FeedFetchType>;
+
+static const std::string kFeedOpType = "feed";
+static const std::string kFetchOpType = "fetch";
} // namespace framework
} // namespace paddle
diff --git a/paddle/framework/init.cc b/paddle/framework/init.cc
index 4ef82a541efaa35bcf831d5122570154f2fa2423..3f6ea121b3994979d89a7d5a8c20c59240a0c111 100644
--- a/paddle/framework/init.cc
+++ b/paddle/framework/init.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <string.h> // for strdup
#include <algorithm>
+#include <stdexcept>
#include <string>
#include "paddle/framework/init.h"
@@ -46,17 +47,23 @@ void InitDevices() {
std::vector<platform::Place> places;
places.emplace_back(platform::CPUPlace());
+ int count = 0;
#ifdef PADDLE_WITH_CUDA
- int count = platform::GetCUDADeviceCount();
- for (int i = 0; i < count; ++i) {
- places.emplace_back(platform::CUDAPlace(i));
+ try {
+ count = platform::GetCUDADeviceCount();
+ } catch (const std::exception &exp) {
+ LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime.";
}
#else
LOG(WARNING)
- << "'GPU' is not supported, Please re-compile with WITH_GPU option";
+ << "'CUDA' is not supported, Please re-compile with WITH_GPU option";
#endif
+ for (int i = 0; i < count; ++i) {
+ places.emplace_back(platform::CUDAPlace(i));
+ }
+
platform::DeviceContextPool::Init(places);
}
diff --git a/paddle/framework/init_test.cc b/paddle/framework/init_test.cc
index f837a965d3be7d40c20803ae4462b3bfd91bffd0..01e076dd8ea24831e3ed7c8a7f8fae6818a89335 100644
--- a/paddle/framework/init_test.cc
+++ b/paddle/framework/init_test.cc
@@ -20,7 +20,21 @@ TEST(InitDevices, CPU) {
using paddle::framework::InitDevices;
using paddle::platform::DeviceContextPool;
+#ifndef PADDLE_WITH_CUDA
InitDevices();
DeviceContextPool& pool = DeviceContextPool::Instance();
- ASSERT_GE(pool.size(), 1U);
+ ASSERT_EQ(pool.size(), 1U);
+#endif
+}
+
+TEST(InitDevices, CUDA) {
+ using paddle::framework::InitDevices;
+ using paddle::platform::DeviceContextPool;
+
+#ifdef PADDLE_WITH_CUDA
+ int count = paddle::platform::GetCUDADeviceCount();
+ InitDevices();
+ DeviceContextPool& pool = DeviceContextPool::Instance();
+ ASSERT_EQ(pool.size(), 1U + static_cast<size_t>(count));
+#endif
}
diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc
index 53b0d0fe083579da4f0bb600f292765aa2aa0d8a..cb27de6991674247e6215ce64a2da5000fa78ed4 100644
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -24,8 +24,6 @@ limitations under the License. */
#include <algorithm>
#include <iterator>
-#include <glog/logging.h>
-
namespace paddle {
namespace framework {
diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h
index 9d1294fdeb9bd76bf944f7ec3687e3c5bb333241..d0ab640485baf6d76ee629ea420b603f42b031b4 100644
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -18,11 +18,11 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
-#include <thrust/system/cuda/experimental/pinned_allocator.h>
#endif
#include <memory>
#include "paddle/framework/ddim.h"
+#include "paddle/framework/mixed_vector.h"
#include "paddle/framework/tensor.h"
#include "paddle/framework/tensor_util.h"
#include "paddle/platform/enforce.h"
@@ -31,15 +31,6 @@ limitations under the License. */
namespace paddle {
namespace framework {
-#ifndef PADDLE_WITH_CUDA
-template <typename T>
-using Vector = std::vector<T>;
-#else
-template <typename T>
-using Vector = thrust::host_vector<
- T, thrust::system::cuda::experimental::pinned_allocator<T>>;
-#endif
-
/*
* LoD is short for Level of Details.
*
@@ -55,7 +46,15 @@ using Vector = thrust::host_vector<
* 0 2 4 7
* 0 2 5 7 10 12 15 20
*/
-using LoD = std::vector<Vector<size_t>>;
+struct LoD : public std::vector<Vector<size_t>> {
+ using std::vector<Vector<size_t>>::vector;
+
+ void CopyFromCUDA() {
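+ // Sync every level's device buffer back into its host-side vector.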
+ for (auto it = this->begin(); it != this->end(); ++it) {
+ it->CopyFromCUDA();
+ }
+ }
+};
std::ostream& operator<<(std::ostream& os, const LoD& lod);
std::ostream& operator<<(std::ostream& os, const LoDTensor& t);
@@ -109,7 +108,10 @@ bool CheckAbsLoD(const LoD& in, int tensor_height = -1);
*/
class LoDTensor : public Tensor {
public:
- LoDTensor() {}
+ LoDTensor() : Tensor() {}
+
+ /* Constructor with place should only be used in pybind */
+ explicit LoDTensor(const platform::Place& place) : Tensor(place) {}
explicit LoDTensor(const LoD& lod) : lod_(lod) {}
diff --git a/paddle/framework/lod_tensor_test.cc b/paddle/framework/lod_tensor_test.cc
index 4d172c43c7cceacb7d0dfaf1c4d3028717350268..3b63020e685436396071fa05cd7697630ae56c95 100644
--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
@@ -23,6 +23,17 @@
namespace paddle {
namespace framework {
+TEST(LoD, data) {
+ LoD lod{{0, 1, 2}};
+ lod.push_back({0, 2, 4, 5});
+ lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
+
+ auto& v = lod[0];
+ for (size_t i = 0; i < v.size(); ++i) {
+ EXPECT_EQ(v[i], i);
+ }
+}
+
TEST(LodExpand, test) {
LoD lod{{0, 2}};
LoDTensor tensor;
diff --git a/paddle/framework/lod_tensor_test.cu b/paddle/framework/lod_tensor_test.cu
index 1e253a2f6f35e827fb2e5db6270da03705b39514..d4c9f00bd9c00f3cae68858ca46c5320fc117405 100644
--- a/paddle/framework/lod_tensor_test.cu
+++ b/paddle/framework/lod_tensor_test.cu
@@ -14,6 +14,8 @@
#include <cuda.h>
#include <cuda_runtime.h>
+#include
+#include "paddle/framework/init.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/platform/assert.h"
@@ -26,7 +28,48 @@ __global__ void test(size_t* a, int size) {
}
}
+TEST(Vector, Normal) {
+ using namespace paddle::framework;
+ using namespace paddle::platform;
+ using namespace paddle::memory;
+
+ paddle::framework::InitDevices();
+
+ paddle::framework::Vector<size_t> vec({1, 2, 3});
+ size_t* ptr = vec.data();
+ for (size_t i = 0; i < vec.size(); ++i) {
+ EXPECT_EQ(vec[i], *(ptr + i));
+ }
+
+ vec.clear();
+ vec.CopyFromCUDA();
+
+ std::vector<size_t> v = {1, 2, 3};
+ for (size_t i = 0; i < v.size(); ++i) {
+ EXPECT_EQ(v[i], vec[i]);
+ }
+}
+
+TEST(LoD, data) {
+ paddle::framework::InitDevices();
+
+ paddle::framework::LoD lod{{0, 1, 2}};
+ lod.push_back({0, 2, 4, 5});
+ lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
+
+ auto& v = lod[0];
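+ // The 'test' kernel doubles each element of the level-0 buffer in place.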
+ test<<<1, 1>>>(v.cuda_data(), v.size());
+ cudaDeviceSynchronize();
+
+ v.CopyFromCUDA();
+ for (size_t i = 0; i < v.size(); ++i) {
+ EXPECT_EQ(v[i], i * 2);
+ }
+}
+
TEST(LoDTensor, LoDInGPU) {
+ paddle::framework::InitDevices();
+
paddle::framework::LoDTensor lod_tensor;
paddle::platform::CUDAPlace place(0);
@@ -42,8 +85,9 @@ TEST(LoDTensor, LoDInGPU) {
auto lod = lod_tensor.lod();
- test<<<1, 8>>>(lod[0].data(), lod[0].size());
+ test<<<1, 8>>>(lod[0].cuda_data(), lod[0].size());
cudaDeviceSynchronize();
+ lod.CopyFromCUDA();
for (size_t i = 0; i < src_lod[0].size(); ++i) {
EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2);
diff --git a/paddle/framework/mixed_vector.h b/paddle/framework/mixed_vector.h
new file mode 100644
index 0000000000000000000000000000000000000000..85caac8dcd9ede4fe997e2fd246d1421aa73c80a
--- /dev/null
+++ b/paddle/framework/mixed_vector.h
@@ -0,0 +1,135 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#pragma once
+
+#include <initializer_list>
+#include <vector>
+
+#include "paddle/memory/memcpy.h"
+#include "paddle/memory/memory.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/enforce.h"
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace framework {
+
+/**
+ * @brief Vector supports both CPU and GPU.
+ * The host vector's lifetime is the same as the Vector's;
+ * the device vector is lazily allocated and updated.
+ */
+
+template <typename T>
+class Vector : public std::vector<T> {
+ public:
+ using std::vector<T>::vector;
+
+ Vector() {}
+ Vector(const std::vector<T> &v) : std::vector<T>(v) {} // NOLINT
+
+ virtual ~Vector() {
+#ifdef PADDLE_WITH_CUDA
+ if (cuda_ptr_ != nullptr) {
+ memory::Free(place_, cuda_ptr_);
+ }
+#endif
+ }
+
+ /* Get device vector */
+ T *cuda_data() {
+ CopyToCUDA();
+ PADDLE_ENFORCE_NOT_NULL(
+ cuda_ptr_, "No data or Insufficient CUDA memory to allocation");
+ return static_cast<T *>(cuda_ptr_);
+ }
+
+ /* Get host vector */
+ T *data() { return std::vector<T>::data(); }
+ const T *data() const { return std::vector<T>::data(); }
+
+ /* Synchronize host vector to device vector */
+ void CopyToCUDA();
+ /* Synchronize device vector to host vector */
+ void CopyFromCUDA();
+ /* Switch device vector location */
+ void CopyToPeer(platform::Place);
+
+ private:
+ void *cuda_ptr_ = nullptr;
+ size_t cuda_size_ = 0; // device vector numel
+ platform::CUDAPlace place_;
+};
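+
+// Usage sketch (illustrative): a Vector acts as a std::vector on the host;
+// cuda_data() lazily mirrors the host contents to the current CUDA place:
+//
+// Vector<size_t> v({1, 2, 3});
+// size_t* dev_ptr = v.cuda_data(); // host -> device copy
+// // ... launch a kernel that reads/writes dev_ptr ...
+// v.CopyFromCUDA(); // device -> host copy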
+
+template <typename T>
+void Vector<T>::CopyToCUDA() {
+#ifdef PADDLE_WITH_CUDA
+ if (cuda_size_ < this->size()) {
+ if (cuda_ptr_ != nullptr) {
+ memory::Free(place_, cuda_ptr_);
+ }
+ cuda_ptr_ =
+ memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T));
+ }
+ cuda_size_ = this->size();
+ platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+ auto *ctx = pool.GetByPlace(place_);
+ memory::Copy(place_, cuda_ptr_, platform::CPUPlace(),
+ static_cast<const void *>(this->data()),
+ this->size() * sizeof(T), ctx->stream());
+ ctx->Wait();
+#endif
+}
+
+template <typename T>
+void Vector<T>::CopyFromCUDA() {
+#ifdef PADDLE_WITH_CUDA
+ if (cuda_ptr_ == nullptr) {
+ LOG(WARNING) << "No uncommitted cuda data.";
+ return;
+ }
+ this->resize(cuda_size_);
+ platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+ auto *ctx = pool.GetByPlace(place_);
+ memory::Copy(platform::CPUPlace(), static_cast<void *>(this->data()), place_,
+ static_cast<const void *>(cuda_ptr_), this->size() * sizeof(T),
+ ctx->stream());
+ ctx->Wait();
+#endif
+}
+
+template <typename T>
+void Vector<T>::CopyToPeer(platform::Place peer_place) {
+#ifdef PADDLE_WITH_CUDA
+ auto *ctx = platform::DeviceContextPool::Instance().GetByPlace(place_);
+ void *peer_cuda_ptr = memory::Alloc<platform::CUDAPlace>(
+ boost::get<platform::CUDAPlace>(peer_place), this->size() * sizeof(T));
+ memory::Copy(boost::get<platform::CUDAPlace>(peer_place), peer_cuda_ptr,
+ place_, cuda_ptr_, this->size() * sizeof(T), ctx->stream());
+ ctx->Wait();
+
+ memory::Free(place_, cuda_ptr_);
+ place_ = boost::get<platform::CUDAPlace>(peer_place);
+ cuda_ptr_ = peer_cuda_ptr;
+#endif
+}
+
+template class Vector<int>;
+template class Vector<unsigned>;
+template class Vector<size_t>;
+template class Vector<int64_t>;
+
+} // namespace framework
+} // namespace paddle
diff --git a/paddle/framework/op_kernel_type_test.cc b/paddle/framework/op_kernel_type_test.cc
index 649afeee8a846b0579545f2edff77e9dbe3b4dd8..cb23bbde01493d1a3b5845e77d6160a75f409c7a 100644
--- a/paddle/framework/op_kernel_type_test.cc
+++ b/paddle/framework/op_kernel_type_test.cc
@@ -26,9 +26,9 @@ TEST(OpKernelType, ToString) {
OpKernelType op_kernel_type(DataType::FP32, CPUPlace(), DataLayout::kNCHW,
LibraryType::kCUDNN);
- ASSERT_EQ(
- paddle::framework::KernelTypeToString(op_kernel_type),
- "data_type[5]:data_layout[NCHW]:place[CPUPlace]:library_type[CUDNN]");
+ ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type),
+ "data_type[float32]:data_layout[NCHW]:place[CPUPlace]:library_type["
+ "CUDNN]");
}
TEST(OpKernelType, Hash) {
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index 831b1e2a1e10777d9e89364adcd4b1f367e86080..4e854f54dd43d760bab44fb5f7cafeb13314b27c 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -22,9 +22,7 @@ limitations under the License. */
#include "paddle/framework/shape_inference.h"
#include "paddle/framework/var_type.h"
-DEFINE_bool(op_sync, false,
- "Default cuda is asynchronous device, set to True will"
- "force op run in synchronous mode.");
+DECLARE_bool(benchmark);
namespace paddle {
namespace framework {
@@ -531,7 +529,7 @@ void OperatorWithKernel::Run(const Scope& scope,
ExecutionContext(*this, new_scope, *new_dev_ctx));
/*For profiling/benchmark only*/
- if (FLAGS_op_sync) {
+ if (FLAGS_benchmark) {
new_dev_ctx->Wait();
}
}
diff --git a/paddle/framework/program_desc.cc b/paddle/framework/program_desc.cc
index b5d9e5e385c1ba57169ef885824fc23b0f130692..15ea4035c6e6193105b621210a900e74d1466941 100644
--- a/paddle/framework/program_desc.cc
+++ b/paddle/framework/program_desc.cc
@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/framework/program_desc.h"
#include "paddle/framework/block_desc.h"
+#include "paddle/framework/feed_fetch_type.h"
namespace paddle {
namespace framework {
@@ -64,5 +65,27 @@ ProgramDesc::ProgramDesc(const std::string &binary_str) {
}
}
+const std::vector<std::string> ProgramDesc::GetFeedTargetNames() {
+ BlockDesc *global_block = blocks_[0].get();
+ std::vector feed_target_names;
+ for (auto *op : global_block->AllOps()) {
+ if (op->Type() == kFeedOpType) {
+ feed_target_names.insert(feed_target_names.begin(), op->Output("Out")[0]);
+ }
+ }
+ return feed_target_names;
+}
+
+const std::vector<std::string> ProgramDesc::GetFetchTargetNames() {
+ BlockDesc *global_block = blocks_[0].get();
+ std::vector fetch_target_names;
+ for (auto *op : global_block->AllOps()) {
+ if (op->Type() == kFetchOpType) {
+ fetch_target_names.push_back(op->Input("X")[0]);
+ }
+ }
+ return fetch_target_names;
+}
+
} // namespace framework
} // namespace paddle
diff --git a/paddle/framework/program_desc.h b/paddle/framework/program_desc.h
index 15a962bb696d6172acd1a83cf9bb1ffd0846d449..8e958eab6ee08436ca73b13bac010e66c7df2b8b 100644
--- a/paddle/framework/program_desc.h
+++ b/paddle/framework/program_desc.h
@@ -16,6 +16,7 @@ limitations under the License. */
#include <memory>
#include <vector>
+#include "paddle/framework/block_desc.h"
#include "paddle/framework/framework.pb.h"
#include "paddle/framework/proto_desc.h"
#include "paddle/platform/macros.h"
@@ -45,6 +46,9 @@ class ProgramDesc {
proto::ProgramDesc *Proto();
+ const std::vector<std::string> GetFeedTargetNames();
+ const std::vector<std::string> GetFetchTargetNames();
+
private:
proto::ProgramDesc desc_;
diff --git a/paddle/framework/prune.cc b/paddle/framework/prune.cc
index 25eb813ffb96e9b1e13299421ead9f85c02da59f..bff8e0bceaca9749101b2c45edddba526d565624 100644
--- a/paddle/framework/prune.cc
+++ b/paddle/framework/prune.cc
@@ -17,6 +17,7 @@ limitations under the License. */
#include <algorithm>
#include <set>
#include <string>
+#include <unordered_map>
#include <vector>
#include <glog/logging.h>
@@ -102,6 +103,32 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
*op_field->Add() = input.blocks(block_id).ops(i);
}
}
+
+ // remove the VarDescs in BlockDesc that are not referenced in
+ // the pruned OpDescs
+ std::unordered_map<std::string, proto::VarDesc> var_map;
+ auto* var_field = output->mutable_blocks(block_id)->mutable_vars();
+ for (const auto& var : *var_field) {
+ var_map[var.name()] = var;
+ }
+
+ var_field->Clear();
+ for (const auto& op : *op_field) {
+ // add VarDescs of all input arguments for each OpDesc
+ auto& input_field = op.inputs();
+ for (auto& input_var : input_field) {
+ for (auto& arg : input_var.arguments()) {
+ *var_field->Add() = var_map[arg];
+ }
+ }
+ // add VarDescs of all output arguments for each OpDesc
+ auto& output_field = op.outputs();
+ for (auto& output_var : output_field) {
+ for (auto& arg : output_var.arguments()) {
+ *var_field->Add() = var_map[arg];
+ }
+ }
+ }
}
// TODO(fengjiayi): Prune() could be inplaced to avoid unnecessary copies
diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
index a67ff910093d93060d07d849f6e968e5f4ce21cd..af08b2ab816f63c05d4c65df9601c787e57994f5 100644
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -20,9 +20,11 @@ limitations under the License. */
#include "paddle/framework/threadpool.h"
#include "paddle/string/printf.h"
-DEFINE_bool(do_memory_benchmark, false,
+DEFINE_bool(benchmark, false,
"Doing memory benchmark. It will make deleting scope synchronized, "
- "and add some memory usage logs");
+ "and add some memory usage logs."
+ "Default cuda is asynchronous device, set to True will"
+ "force op run in synchronous mode.");
namespace paddle {
namespace framework {
@@ -93,7 +95,7 @@ void Scope::DeleteScope(Scope* scope) {
PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
this->kids_.erase(it);
// When making memory benchmark on Fluid, we have to delete scope sync.
- if (FLAGS_do_memory_benchmark) {
+ if (FLAGS_benchmark) {
delete scope;
} else {
Async([scope] { delete scope; });
diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 4aaa29d794c95592832a1fe990e2dce274eba9d5..f0ea709a5c37e769e3ffa1b2e9d1e39721979251 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -47,6 +47,11 @@ class Tensor {
public:
Tensor() : offset_(0) {}
+ /*! Constructor with place should only be used in pybind. */
+ explicit Tensor(const platform::Place& place) : offset_(0) {
+ holder_->set_place(place);
+ }
+
/*! Return a pointer to mutable memory block. */
template <typename T>
inline T* data();
@@ -137,6 +142,7 @@ class Tensor {
virtual std::type_index type() const = 0;
virtual platform::Place place() const = 0;
virtual void set_type(std::type_index type) = 0;
+ virtual void set_place(platform::Place place) = 0;
};
template <typename Place>
@@ -156,6 +162,7 @@ class Tensor {
virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
virtual std::type_index type() const { return type_; }
virtual void set_type(std::type_index type) { type_ = type; }
+ virtual void set_place(platform::Place place) { place_ = place; }
/*! the pointer of memory block. */
std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t, Place>> ptr_;
diff --git a/paddle/framework/threadpool.cc b/paddle/framework/threadpool.cc
index 109a7e7dc440d91e8223f2c0924f489f54a06f64..b7d7c00bcf9d9770f58284023ca2defcda299d64 100644
--- a/paddle/framework/threadpool.cc
+++ b/paddle/framework/threadpool.cc
@@ -1,24 +1,95 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
- http://www.apache.org/licenses/LICENSE-2.0
+ http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
#include "paddle/framework/threadpool.h"
+#include "paddle/platform/enforce.h"
+
namespace paddle {
namespace framework {
-std::unique_ptr<ThreadPool> ThreadPool::threadpool(nullptr);
-std::once_flag ThreadPool::init_flag;
+std::unique_ptr<ThreadPool> ThreadPool::threadpool_(nullptr);
+std::once_flag ThreadPool::init_flag_;
+
+ThreadPool* ThreadPool::GetInstance() {
+ std::call_once(init_flag_, &ThreadPool::Init);
+ return threadpool_.get();
+}
+
+void ThreadPool::Init() {
+ if (threadpool_.get() == nullptr) {
+ // TODO(Yancey1989): specify the max threads number
+ int num_threads = std::thread::hardware_concurrency();
+ PADDLE_ENFORCE_GT(num_threads, 0);
+ threadpool_.reset(new ThreadPool(num_threads));
+ }
+}
+
+ThreadPool::ThreadPool(int num_threads)
+ : total_threads_(num_threads), idle_threads_(num_threads), running_(true) {
+ threads_.resize(num_threads);
+ for (auto& thread : threads_) {
+ // TODO(Yancey1989): binding the thread on the specify CPU number
+ thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this)));
+ }
+}
+
+ThreadPool::~ThreadPool() {
+ {
+ // notify all threads to stop running
+ running_ = false;
+ scheduled_.notify_all();
+ }
+
+ for (auto& t : threads_) {
+ t->join();
+ t.reset(nullptr);
+ }
+}
+
+void ThreadPool::Wait() {
+ std::unique_lock<std::mutex> lock(mutex_);
+ completed_.wait(lock, [=] { return Done() == true; });
+}
+
+void ThreadPool::TaskLoop() {
+ while (running_) {
+ std::unique_lock<std::mutex> lock(mutex_);
+ scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; });
+
+ if (!running_) {
+ break;
+ }
+ // pop a task from the task queue
+ auto task = std::move(tasks_.front());
+ tasks_.pop();
+
+ --idle_threads_;
+ lock.unlock();
+
+ // run the task
+ task();
+
+ {
+ std::unique_lock<std::mutex> lock(mutex_);
+ ++idle_threads_;
+ if (Done()) {
+ completed_.notify_all();
+ }
+ }
+ }
+}
} // namespace framework
} // namespace paddle
diff --git a/paddle/framework/threadpool.h b/paddle/framework/threadpool.h
index 3ac345851c38557f82698786dd3bc8e1202a4256..4e9b58679d9e7c84adf76b6245b397c7a8872483 100644
--- a/paddle/framework/threadpool.h
+++ b/paddle/framework/threadpool.h
@@ -20,52 +20,36 @@ limitations under the License. */
#include <mutex>
#include <queue>
#include <thread>
+#include <vector>
-#include "paddle/platform/enforce.h"
+#include "paddle/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
namespace paddle {
namespace framework {
+// ThreadPool maintains a queue of tasks, and runs them using a fixed
+// number of threads.
class ThreadPool {
public:
typedef std::packaged_task<void()> Task;
- /**
- * @brief Get a instance of threadpool, the thread number will
- * be specified as the number of hardware thread contexts
- */
- static ThreadPool* GetInstance() {
- std::call_once(init_flag, &ThreadPool::Init);
- return threadpool.get();
- }
+ // Returns the singleton of ThreadPool.
+ static ThreadPool* GetInstance();
- ~ThreadPool() {
- {
- // notify all threads to stop running
- running_ = false;
- scheduled_.notify_all();
- }
-
- for (auto& t : threads_) {
- t->join();
- t.reset(nullptr);
- }
- }
+ ~ThreadPool();
- int GetNumThreads() const { return num_threads_; }
+ // Returns the number of threads created by the constructor.
+ size_t Threads() const { return total_threads_; }
- int GetAvailable() {
+ // Returns the number of currently idle threads.
+ size_t IdleThreads() {
std::unique_lock<std::mutex> lock(mutex_);
- return available_;
+ return idle_threads_;
}
- /**
- * @brief Push a function to the queue, and will be scheduled and
- * executed if a thread is available.
- * @param[in] Task, will be pushed to the task queue.
- * @return std::future, we could wait for the task finished by
- * f.wait().
- */
+ // Run pushes a function to the task queue and returns a std::future
+ // object. To wait for the completion of the task, call
+ // std::future::wait().
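+ //
+ // Example (illustrative):
+ // std::future<void> f = ThreadPool::GetInstance()->Run(fn);
+ // f.wait();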
+ template <typename Callback>
std::future<void> Run(Callback fn) {
std::unique_lock<std::mutex> lock(mutex_);
@@ -77,84 +61,40 @@ class ThreadPool {
return f;
}
- /**
- * @brief Wait until all the tasks are completed.
- */
- void Wait() {
- std::unique_lock<std::mutex> lock(mutex_);
- completed_.wait(lock, [=] { return Done() == true; });
- }
+ // Wait until all the tasks are completed.
+ void Wait();
private:
DISABLE_COPY_AND_ASSIGN(ThreadPool);
- explicit ThreadPool(int num_threads)
- : num_threads_(num_threads), available_(num_threads), running_(true) {
- threads_.resize(num_threads);
- for (auto& thread : threads_) {
- // TODO(Yancey1989): binding the thread on the specify CPU number
- thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this)));
- }
- }
+ explicit ThreadPool(int num_threads);
- /**
- * @brief If the task queue is empty and avaialbe
- * is equal to the number of threads, means that
- * all tasks are completed.
- *
- * Note: this function is not thread-safe.
- *
- * @return true if all tasks are completed.
- */
- bool Done() { return tasks_.empty() && available_ == num_threads_; }
-
- void TaskLoop() {
- while (running_) {
- std::unique_lock<std::mutex> lock(mutex_);
- scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; });
-
- if (!running_) {
- break;
- }
- // pop a task from the task queue
- auto task = std::move(tasks_.front());
- tasks_.pop();
-
- --available_;
- lock.unlock();
-
- // run the task
- task();
-
- {
- std::unique_lock<std::mutex> lock(mutex_);
- ++available_;
- if (Done()) {
- completed_.notify_all();
- }
- }
- }
- }
+ // If the task queue is empty and idle_threads_ is equal to the number of
+ // threads, all tasks are completed. Note: this function is not
+ // thread-safe. Returns true if all tasks are completed.
+ // Note: don't delete the data member total_threads_ and use
+ // threads_.size() instead; because you'd need to lock the mutex
+ // before accessing threads_.
+ bool Done() { return tasks_.empty() && idle_threads_ == total_threads_; }
- static void Init() {
- if (threadpool.get() == nullptr) {
- // TODO(Yancey1989): specify the max threads number
- int num_threads = std::thread::hardware_concurrency();
- PADDLE_ENFORCE_GT(num_threads, 0);
- threadpool.reset(new ThreadPool(num_threads));
- }
- }
+ // The constructor starts threads to run TaskLoop, which retrieves
+ // and runs tasks from the queue.
+ void TaskLoop();
+
+ // Init is called by GetInstance.
+ static void Init();
private:
- static std::unique_ptr<ThreadPool> threadpool;
- static std::once_flag init_flag;
+ static std::unique_ptr<ThreadPool> threadpool_;
+ static std::once_flag init_flag_;
- int num_threads_;
- int available_;
- bool running_;
- std::queue tasks_;
std::vector<std::unique_ptr<std::thread>> threads_;
+ const size_t total_threads_;
+ size_t idle_threads_;
+
+ std::queue tasks_;
std::mutex mutex_;
+ bool running_;
std::condition_variable scheduled_;
std::condition_variable completed_;
};
diff --git a/paddle/framework/threadpool_test.cc b/paddle/framework/threadpool_test.cc
index 50b6238cd8786be9d8cf2d5f821daadea12bd208..3fbfe7efc867144dbd0dd2613c824c6a3c41b7d8 100644
--- a/paddle/framework/threadpool_test.cc
+++ b/paddle/framework/threadpool_test.cc
@@ -22,11 +22,7 @@ namespace framework = paddle::framework;
void do_sum(framework::ThreadPool* pool, std::atomic<int>& sum, int cnt) {
std::vector<std::future<void>> fs;
for (int i = 0; i < cnt; ++i) {
- auto f = pool->Run([&sum]() { sum.fetch_add(1); });
- fs.push_back(std::move(f));
- }
- for (auto& f : fs) {
- f.wait();
+ fs.push_back(framework::Async([&sum]() { sum.fetch_add(1); }));
}
}
diff --git a/paddle/inference/CMakeLists.txt b/paddle/inference/CMakeLists.txt
index ae4d3fd2f58daf87a650428e04722581610ed780..2289ddc139cbddfbaa5238e683b2f8e784a7291e 100644
--- a/paddle/inference/CMakeLists.txt
+++ b/paddle/inference/CMakeLists.txt
@@ -1,14 +1,14 @@
-set(FLUID_CORE_MODULES proto_desc paddle_memory executor prune init)
+set(FLUID_CORE_MODULES proto_desc paddle_memory lod_tensor executor prune init)
cc_library(paddle_fluid_api
- SRCS inference.cc
+ SRCS io.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
# Merge all modules into a single static library
cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
# Create shared library
-add_library(paddle_fluid_shared SHARED inference.cc)
+add_library(paddle_fluid_shared SHARED io.cc)
target_circle_link_libraries(paddle_fluid_shared
ARCHIVE_START
@@ -20,23 +20,10 @@ SET_TARGET_PROPERTIES(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
# install library & headers
if(NOT WITH_C_API AND WITH_FLUID)
- install(FILES inference.h DESTINATION include/paddle/inference)
+ install(FILES io.h DESTINATION include/paddle/inference)
install(TARGETS paddle_fluid_shared DESTINATION lib)
endif()
-add_executable(example example.cc)
-if(APPLE)
- set(OPTIONAL_LINK_FLAGS)
- if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
- set(OPTIONAL_LINK_FLAGS "-undefined dynamic_lookup")
- endif()
- target_link_libraries(example
- -Wl,-force_load paddle_fluid
- ${OPTIONAL_LINK_FLAGS}
- ${PTOOLS_LIB})
-else()
- target_link_libraries(example
- -Wl,--start-group -Wl,--whole-archive paddle_fluid
- -Wl,--no-whole-archive -Wl,--end-group
- ${PTOOLS_LIB})
+if(WITH_TESTING)
+ add_subdirectory(tests/book)
endif()
diff --git a/paddle/inference/example.cc b/paddle/inference/example.cc
deleted file mode 100644
index 0c18b45624dedcb5839d4b771e044b4a7b32af52..0000000000000000000000000000000000000000
--- a/paddle/inference/example.cc
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <time.h>
-#include <iostream>
-#include "gflags/gflags.h"
-#include "paddle/inference/inference.h"
-
-DEFINE_string(dirname, "", "Directory of the inference model.");
-
-int main(int argc, char** argv) {
- google::ParseCommandLineFlags(&argc, &argv, true);
- if (FLAGS_dirname.empty()) {
- // Example:
- // ./example --dirname=recognize_digits_mlp.inference.model
- std::cout << "Usage: ./example --dirname=path/to/your/model" << std::endl;
- exit(1);
- }
-
- std::cout << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
- std::string dirname = FLAGS_dirname;
-
- paddle::InferenceEngine* engine = new paddle::InferenceEngine();
- engine->LoadInferenceModel(dirname);
-
- paddle::framework::LoDTensor input;
- srand(time(0));
- float* input_ptr =
- input.mutable_data<float>({1, 784}, paddle::platform::CPUPlace());
- for (int i = 0; i < 784; ++i) {
- input_ptr[i] = rand() / (static_cast<float>(RAND_MAX));
- }
-
- std::vector<paddle::framework::LoDTensor> feeds;
- feeds.push_back(input);
- std::vector<paddle::framework::LoDTensor> fetchs;
- engine->Execute(feeds, fetchs);
-
- for (size_t i = 0; i < fetchs.size(); ++i) {
- auto dims_i = fetchs[i].dims();
- std::cout << "dims_i:";
- for (int j = 0; j < dims_i.size(); ++j) {
- std::cout << " " << dims_i[j];
- }
- std::cout << std::endl;
- std::cout << "result:";
- float* output_ptr = fetchs[i].data<float>();
- for (int j = 0; j < paddle::framework::product(dims_i); ++j) {
- std::cout << " " << output_ptr[j];
- }
- std::cout << std::endl;
- }
-
- delete engine;
- return 0;
-}
diff --git a/paddle/inference/inference.cc b/paddle/inference/inference.cc
deleted file mode 100644
index 09268ffb3a1410b22f1b7d997a5cc0e4176b6d55..0000000000000000000000000000000000000000
--- a/paddle/inference/inference.cc
+++ /dev/null
@@ -1,185 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "inference.h"
-#include <fstream>
-#include "paddle/framework/executor.h"
-#include "paddle/framework/feed_fetch_method.h"
-#include "paddle/framework/init.h"
-#include "paddle/framework/scope.h"
-
-namespace paddle {
-
-void InferenceEngine::LoadInferenceModel(const std::string& dirname) {
- std::string model_filename = dirname + "/__model__";
- LOG(INFO) << "loading model from " << model_filename;
- std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
- std::string program_desc_str;
- inputfs.seekg(0, std::ios::end);
- program_desc_str.resize(inputfs.tellg());
- inputfs.seekg(0, std::ios::beg);
- LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
- inputfs.read(&program_desc_str[0], program_desc_str.size());
- inputfs.close();
-
- program_ = new framework::ProgramDesc(program_desc_str);
- GenerateLoadProgram(dirname);
-
- framework::BlockDesc* global_block = program_->MutableBlock(0);
- feed_var_names_.clear();
- fetch_var_names_.clear();
- for (auto* op : global_block->AllOps()) {
- if (op->Type() == "feed") {
- feed_var_names_.insert(feed_var_names_.begin(), op->Output("Out")[0]);
- } else if (op->Type() == "fetch") {
- fetch_var_names_.push_back(op->Input("X")[0]);
- }
- }
-}
-
-bool InferenceEngine::IsParameter(const framework::VarDesc* var) {
- if (var->Persistable()) {
- // There are many unreachable variables in the program
- for (size_t i = 0; i < program_->Size(); ++i) {
- const framework::BlockDesc& block = program_->Block(i);
- for (auto* op : block.AllOps()) {
- if (op->Type() == "feed") {
- continue;
- }
- for (auto input_argument_name : op->InputArgumentNames()) {
- if (input_argument_name == var->Name()) {
- return true;
- }
- }
- }
- }
- }
- return false;
-}
-
-void InferenceEngine::GenerateLoadProgram(const std::string& dirname) {
- framework::BlockDesc* global_block = program_->MutableBlock(0);
-
- load_program_ = new framework::ProgramDesc();
- framework::BlockDesc* load_block = load_program_->MutableBlock(0);
- for (auto* var : global_block->AllVars()) {
- if (IsParameter(var)) {
- LOG(INFO) << "parameter's name: " << var->Name();
-
- framework::VarDesc* new_var = load_block->Var(var->Name());
- new_var->SetShape(var->Shape());
- new_var->SetDataType(var->GetDataType());
- new_var->SetType(var->GetType());
- new_var->SetLoDLevel(var->GetLoDLevel());
- new_var->SetPersistable(true);
-
- // append_op
- framework::OpDesc* op = load_block->AppendOp();
- op->SetType("load");
- op->SetOutput("Out", {new_var->Name()});
- op->SetAttr("file_path", {dirname + "/" + new_var->Name()});
- op->CheckAttrs();
- }
- }
-}
-
-void InferenceEngine::PrependFeedOp() {
- if (!program_) {
- LOG(FATAL) << "Please initialize the program_ first.";
- }
-
- framework::BlockDesc* global_block = program_->MutableBlock(0);
-
- // create_var
- framework::VarDesc* feed_var = global_block->Var("feed");
- feed_var->SetType(framework::proto::VarDesc::FEED_MINIBATCH);
- feed_var->SetPersistable(true);
-
- // prepend feed_op
- for (size_t i = 0; i < feed_var_names_.size(); ++i) {
- std::string var_name = feed_var_names_[i];
- LOG(INFO) << "feed var's name: " << var_name;
-
- // prepend_op
- framework::OpDesc* op = global_block->PrependOp();
- op->SetType("feed");
- op->SetInput("X", {"feed"});
- op->SetOutput("Out", {var_name});
- op->SetAttr("col", {static_cast(i)});
- op->CheckAttrs();
- }
-}
-
-void InferenceEngine::AppendFetchOp() {
- if (!program_) {
- LOG(FATAL) << "Please initialize the program_ first.";
- }
-
- framework::BlockDesc* global_block = program_->MutableBlock(0);
-
- // create_var
- framework::VarDesc* fetch_var = global_block->Var("fetch");
- fetch_var->SetType(framework::proto::VarDesc::FETCH_LIST);
- fetch_var->SetPersistable(true);
-
- // append fetch_op
- for (size_t i = 0; i < fetch_var_names_.size(); ++i) {
- std::string var_name = fetch_var_names_[i];
- LOG(INFO) << "fetch var's name: " << var_name;
-
- // append_op
- framework::OpDesc* op = global_block->AppendOp();
- op->SetType("fetch");
- op->SetInput("X", {var_name});
- op->SetOutput("Out", {"fetch"});
- op->SetAttr("col", {static_cast(i)});
- op->CheckAttrs();
- }
-}
-
-void InferenceEngine::Execute(const std::vector<framework::LoDTensor>& feeds,
- std::vector<framework::LoDTensor>& fetchs) {
- if (!program_ || !load_program_) {
- LOG(FATAL) << "Please initialize the program_ and load_program_ first.";
- }
-
- if (feeds.size() < feed_var_names_.size()) {
- LOG(FATAL) << "Please feed " << feed_var_names_.size() << " input Tensors.";
- }
-
- auto* place = new platform::CPUPlace();
- framework::InitDevices();
- framework::Executor* executor = new framework::Executor(*place);
- framework::Scope* scope = new framework::Scope();
-
- executor->Run(*load_program_, scope, 0, true, true);
-
- // set_feed_variable
- for (size_t i = 0; i < feed_var_names_.size(); ++i) {
- framework::SetFeedVariable(scope, feeds[i], "feed", i);
- }
-
- executor->Run(*program_, scope, 0, true, true);
-
- // get_fetch_variable
- fetchs.resize(fetch_var_names_.size());
- for (size_t i = 0; i < fetch_var_names_.size(); ++i) {
- fetchs[i] = framework::GetFetchVariable(*scope, "fetch", i);
- }
-
- delete place;
- delete scope;
- delete executor;
-}
-} // namespace paddle
diff --git a/paddle/inference/inference.h b/paddle/inference/inference.h
deleted file mode 100644
index 26f259824b945e260b370ced9d065842264075d5..0000000000000000000000000000000000000000
--- a/paddle/inference/inference.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/framework/block_desc.h"
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/framework/program_desc.h"
-
-namespace paddle {
-
-class InferenceEngine {
-public:
- InferenceEngine() : program_(nullptr), load_program_(nullptr) {}
- ~InferenceEngine() {
- delete program_;
- delete load_program_;
- }
-
- void LoadInferenceModel(const std::string& dirname);
- void Execute(const std::vector<framework::LoDTensor>& feeds,
- std::vector<framework::LoDTensor>& fetchs);
-
-private:
- bool IsParameter(const framework::VarDesc* var);
- void GenerateLoadProgram(const std::string& dirname);
- void PrependFeedOp();
- void AppendFetchOp();
-
-private:
- framework::ProgramDesc* program_;
- framework::ProgramDesc* load_program_;
- std::vector<std::string> feed_var_names_;
- std::vector<std::string> fetch_var_names_;
-};
-
-} // namespace paddle
diff --git a/paddle/inference/io.cc b/paddle/inference/io.cc
new file mode 100644
index 0000000000000000000000000000000000000000..60ad7af1c0a469beb6a07bf057a8647fcb98cca8
--- /dev/null
+++ b/paddle/inference/io.cc
@@ -0,0 +1,98 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/inference/io.h"
+
+#include <fstream>
+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/feed_fetch_type.h"
+
+namespace paddle {
+namespace inference {
+
+bool IsParameter(const framework::VarDesc* var,
+ const framework::ProgramDesc& main_program) {
+ if (var->Persistable()) {
+ // There are many unreachable variables in the program
+ for (size_t i = 0; i < main_program.Size(); ++i) {
+ const framework::BlockDesc& block = main_program.Block(i);
+ for (auto* op : block.AllOps()) {
+ if (op->Type() == framework::kFeedOpType) {
+ continue;
+ }
+ for (auto input_argument_name : op->InputArgumentNames()) {
+ if (input_argument_name == var->Name()) {
+ return true;
+ }
+ }
+ }
+ }
+ }
+ return false;
+}
+
+void LoadPersistables(framework::Executor& executor,
+ framework::Scope& scope,
+ const std::string& dirname,
+ const framework::ProgramDesc& main_program) {
+ const framework::BlockDesc& global_block = main_program.Block(0);
+
+ framework::ProgramDesc* load_program = new framework::ProgramDesc();
+ framework::BlockDesc* load_block = load_program->MutableBlock(0);
+ for (auto* var : global_block.AllVars()) {
+ if (IsParameter(var, main_program)) {
+ VLOG(3) << "parameter's name: " << var->Name();
+
+ framework::VarDesc* new_var = load_block->Var(var->Name());
+ new_var->SetShape(var->Shape());
+ new_var->SetDataType(var->GetDataType());
+ new_var->SetType(var->GetType());
+ new_var->SetLoDLevel(var->GetLoDLevel());
+ new_var->SetPersistable(true);
+
+ // append_op
+ framework::OpDesc* op = load_block->AppendOp();
+ op->SetType("load");
+ op->SetOutput("Out", {new_var->Name()});
+ op->SetAttr("file_path", {dirname + "/" + new_var->Name()});
+ op->CheckAttrs();
+ }
+ }
+ executor.Run(*load_program, &scope, 0, true, true);
+ delete load_program;
+}
+
+std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
+ framework::Scope& scope,
+ const std::string& dirname) {
+ std::string model_filename = dirname + "/__model__";
+ LOG(INFO) << "loading model from " << model_filename;
+ std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
+ std::string program_desc_str;
+ inputfs.seekg(0, std::ios::end);
+ program_desc_str.resize(inputfs.tellg());
+ inputfs.seekg(0, std::ios::beg);
+ LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
+ inputfs.read(&program_desc_str[0], program_desc_str.size());
+ inputfs.close();
+
+  std::unique_ptr<framework::ProgramDesc> main_program(
+ new framework::ProgramDesc(program_desc_str));
+
+ LoadPersistables(executor, scope, dirname, *main_program);
+ return main_program;
+}
+
+} // namespace inference
+} // namespace paddle
diff --git a/paddle/inference/io.h b/paddle/inference/io.h
new file mode 100644
index 0000000000000000000000000000000000000000..962b6c4e20d30de3cc28eae1c8c5c33b3ab5f6ac
--- /dev/null
+++ b/paddle/inference/io.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/framework/executor.h"
+#include "paddle/framework/program_desc.h"
+#include "paddle/framework/scope.h"
+
+namespace paddle {
+namespace inference {
+
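+// Loads all parameters (persistable variables) of main_program from dirname.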
+void LoadPersistables(framework::Executor& executor,
+ framework::Scope& scope,
+ const std::string& dirname,
+ const framework::ProgramDesc& main_program);
+
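+// Reads the ProgramDesc from dirname + "/__model__" and loads all of its
+// parameters into scope.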
+std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
+ framework::Scope& scope,
+ const std::string& dirname);
+
+} // namespace inference
+} // namespace paddle
diff --git a/paddle/inference/tests/book/CMakeLists.txt b/paddle/inference/tests/book/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d3798fb8fd8769aef5940d4ce724cb0cc8686422
--- /dev/null
+++ b/paddle/inference/tests/book/CMakeLists.txt
@@ -0,0 +1,7 @@
+set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests)
+cc_test(test_inference_recognize_digits_mlp
+ SRCS test_inference_recognize_digits.cc
+ DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
+ ARGS --dirname=${PYTHON_TESTS_DIR}/book/recognize_digits_mlp.inference.model)
+set_tests_properties(test_inference_recognize_digits_mlp
+ PROPERTIES DEPENDS test_recognize_digits_mlp_cpu)
diff --git a/paddle/inference/tests/book/test_inference_recognize_digits.cc b/paddle/inference/tests/book/test_inference_recognize_digits.cc
new file mode 100644
index 0000000000000000000000000000000000000000..26dc2aee04261d9a1fd29b4d75bfacc7870c09d8
--- /dev/null
+++ b/paddle/inference/tests/book/test_inference_recognize_digits.cc
@@ -0,0 +1,113 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <time.h>
+#include <map>
+#include "gflags/gflags.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/inference/io.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+template <typename Place, typename T>
+void TestInference(const std::string& dirname,
+                   const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
+                   std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
+ // 1. Define place, executor and scope
+ auto place = Place();
+ auto executor = paddle::framework::Executor(place);
+ auto* scope = new paddle::framework::Scope();
+
+ // 2. Initialize the inference_program and load all parameters from file
+ auto inference_program = paddle::inference::Load(executor, *scope, dirname);
+
+ // 3. Get the feed_target_names and fetch_target_names
+  const std::vector<std::string>& feed_target_names =
+      inference_program->GetFeedTargetNames();
+  const std::vector<std::string>& fetch_target_names =
+      inference_program->GetFetchTargetNames();
+
+ // 4. Prepare inputs: set up maps for feed targets
+  std::map<std::string, paddle::framework::LoDTensor*> feed_targets;
+ for (size_t i = 0; i < feed_target_names.size(); ++i) {
+ // Please make sure that cpu_feeds[i] is right for feed_target_names[i]
+ feed_targets[feed_target_names[i]] = cpu_feeds[i];
+ }
+
+ // 5. Define Tensor to get the outputs: set up maps for fetch targets
+  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
+ for (size_t i = 0; i < fetch_target_names.size(); ++i) {
+ fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
+ }
+
+ // 6. Run the inference program
+ executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+
+ delete scope;
+}
+
+TEST(inference, recognize_digits) {
+ if (FLAGS_dirname.empty()) {
+ LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+ }
+
+ LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
+ std::string dirname = FLAGS_dirname;
+
+ // 0. Call `paddle::framework::InitDevices()` initialize all the devices
+ // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
+
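+  // Feed a random 1 x 28 x 28 input; the test checks that CPU and GPU runs
+  // produce consistent outputs rather than checking prediction accuracy.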
+ paddle::framework::LoDTensor input;
+ srand(time(0));
+  float* input_ptr =
+      input.mutable_data<float>({1, 28, 28}, paddle::platform::CPUPlace());
+ for (int i = 0; i < 784; ++i) {
+    input_ptr[i] = rand() / (static_cast<float>(RAND_MAX));
+ }
+  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+ cpu_feeds.push_back(&input);
+
+ paddle::framework::LoDTensor output1;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+ cpu_fetchs1.push_back(&output1);
+
+ // Run inference on CPU
+  TestInference<paddle::platform::CPUPlace, float>(
+ dirname, cpu_feeds, cpu_fetchs1);
+ LOG(INFO) << output1.dims();
+
+#ifdef PADDLE_WITH_CUDA
+ paddle::framework::LoDTensor output2;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+ cpu_fetchs2.push_back(&output2);
+
+ // Run inference on CUDA GPU
+  TestInference<paddle::platform::CUDAPlace, float>(
+ dirname, cpu_feeds, cpu_fetchs2);
+ LOG(INFO) << output2.dims();
+
+ EXPECT_EQ(output1.dims(), output2.dims());
+ EXPECT_EQ(output1.numel(), output2.numel());
+
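+  // The two runs should agree element-wise within tolerance `err`.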
+ float err = 1E-3;
+ int count = 0;
+ for (int64_t i = 0; i < output1.numel(); ++i) {
+    if (fabs(output1.data<float>()[i] - output2.data<float>()[i]) > err) {
+ count++;
+ }
+ }
+ EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
+#endif
+}
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index f7d600414fc15e1a10b000900d6dfbe055866ea0..000c2089c176adf8d845a56a1f98528734f47ea1 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -122,9 +122,11 @@ if(WITH_DISTRIBUTE)
set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(recv_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
- cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
+ op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS})
+ set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+ cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op listen_and_serv_op sum_op executor)
else()
- set(DEPS_OPS ${DEPS_OPS} send_op recv_op)
+ set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op)
endif()
op_library(cond_op DEPS framework_proto tensor net_op)
@@ -147,6 +149,7 @@ op_library(max_sequence_len_op DEPS lod_rank_table)
op_library(sequence_conv_op DEPS context_project)
op_library(sequence_pool_op DEPS sequence_pooling)
op_library(lstm_op DEPS sequence2batch lstm_compute)
+op_library(lstmp_op DEPS sequence2batch lstm_compute)
op_library(gru_op DEPS sequence2batch gru_compute)
op_library(recurrent_op DEPS executor)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale math_function)
@@ -175,6 +178,8 @@ endif()
# FIXME(typhoonzero): save/load depends lodtensor serialization functions
op_library(save_op DEPS lod_tensor)
op_library(load_op DEPS lod_tensor)
+op_library(save_combine_op DEPS lod_tensor)
+op_library(load_combine_op DEPS lod_tensor)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS})
@@ -194,3 +199,4 @@ if(WITH_GPU)
cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif()
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
+cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h
index 88c3d1c597a853abdee7753a5110be4a1726e905..c0809abc05104c1e8c1f42331c0530724dd1472f 100644
--- a/paddle/operators/activation_op.h
+++ b/paddle/operators/activation_op.h
@@ -323,7 +323,7 @@ template <typename T>
 struct FloorFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
- out.device(d) = x.ceil();
+ out.device(d) = x.floor();
}
};
diff --git a/paddle/operators/adagrad_op.cu b/paddle/operators/adagrad_op.cu
index 4e579387924a5b0499f29609bc6b1322030a3c0d..00cb6e9cafb4e79ed3d59cd4a6e40ea132e5efda 100644
--- a/paddle/operators/adagrad_op.cu
+++ b/paddle/operators/adagrad_op.cu
@@ -82,7 +82,7 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
     math::scatter::MergeAdd<platform::CUDADeviceContext, T> merge_func;
auto grad_merge = merge_func(context, grad);
     auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
- auto& merge_rows = grad_merge.rows();
+    framework::Vector<int64_t> merge_rows(grad_merge.rows());
// 2. m += g_m * g_m
     math::scatter::Mul<platform::CUDADeviceContext, T> sqare_func;
auto grad_square = sqare_func(context, grad_merge, grad_merge);
@@ -101,8 +101,8 @@ struct SparseAdagradFunctor {
SparseAdagradFunctorKernel<
         T, 256><<<grid2, threads, 0,
                   reinterpret_cast<const platform::CUDADeviceContext&>(context)
- .stream()>>>(grad_merge_data, grad_merge.rows().data(),
- lr, param_data, moment_data, grad_width,
+ .stream()>>>(grad_merge_data, merge_rows.cuda_data(), lr,
+ param_data, moment_data, grad_width,
epsilon);
}
};
diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h
index 9cc34bdded780e61e8700eb4fa4a295c84fb48bc..bf536687d398b8342e6ae76a07c11e5fe47483e0 100644
--- a/paddle/operators/adam_op.h
+++ b/paddle/operators/adam_op.h
@@ -199,7 +199,12 @@ class AdamOpKernel : public framework::OpKernel<T> {
           merge_func(ctx.template device_context<DeviceContext>(), grad);
auto& grad_tensor = grad_merge.value();
       const T* grad_data = grad_tensor.template data<T>();
- auto* rows = grad_merge.rows().data();
+ int64_t* rows = nullptr;
+ if (platform::is_gpu_place(ctx.GetPlace())) {
+ rows = grad_merge.mutable_rows()->cuda_data();
+ } else {
+ rows = grad_merge.mutable_rows()->data();
+ }
auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
       SparseAdamFunctor<T> functor(
diff --git a/paddle/operators/ctc_align_op.cu b/paddle/operators/ctc_align_op.cu
index 45635f16745346b08f7e31db2f25905bdbc3aeeb..2a970cd9fa965b4126356eaa1519068f9c7a7f34 100644
--- a/paddle/operators/ctc_align_op.cu
+++ b/paddle/operators/ctc_align_op.cu
@@ -69,12 +69,11 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
auto stream = ctx.cuda_device_context().stream();
MergeAndDelCudaKernel<<<1, 1, 0, stream>>>(
- num_tokens, tokens, num_seq, input_lod[level].data(), blank,
+ num_tokens, tokens, num_seq, input_lod[level].cuda_data(), blank,
merge_repeated, dev_out_lod0_ptr, output_data);
// set output lod
-    thrust::host_vector<size_t> host_out_lod0(dev_out_lod0.begin(),
-                                              dev_out_lod0.end());
+    std::vector<size_t> host_out_lod0(dev_out_lod0.begin(), dev_out_lod0.end());
framework::LoD out_lod;
out_lod.push_back(host_out_lod0);
output->set_lod(out_lod);
diff --git a/paddle/operators/detail/grpc_client.cc b/paddle/operators/detail/grpc_client.cc
index d699dabf2fb982f267c4869180efaf0e600eb46c..9b5f7afc6a48f13ff999f635efeb9e7bf0a76fb5 100644
--- a/paddle/operators/detail/grpc_client.cc
+++ b/paddle/operators/detail/grpc_client.cc
@@ -97,12 +97,27 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
return true;
}
+bool RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) {
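+  // A batch barrier is sent as an ordinary variable message whose name is
+  // the reserved BATCH_BARRIER_MESSAGE marker.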
+ const auto ch = GetChannel(ep);
+
+ BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
+ s->Prepare(time_out);
+
+ sendrecv::VariableMessage req;
+ req.set_varname(BATCH_BARRIER_MESSAGE);
+ auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
+ rpc->Finish(&s->reply_, &s->status_, (void*)s);
+ req_count_++;
+
+ return true;
+}
+
bool RPCClient::Wait() {
if (req_count_ <= 0) {
return true;
}
-
-  std::vector<bool> a(req_count_);
+ const size_t kReqCnt = req_count_;
+ bool a[kReqCnt];
   std::vector<std::future<void>> waits(req_count_);
for (int i = 0; i < req_count_; i++) {
diff --git a/paddle/operators/detail/grpc_client.h b/paddle/operators/detail/grpc_client.h
index a62e70a2533ae52d84d010504b19fed5aeb15dc0..f9499f6dc70c541c214e0b659f10b2ed1e8e8581 100644
--- a/paddle/operators/detail/grpc_client.h
+++ b/paddle/operators/detail/grpc_client.h
@@ -71,6 +71,15 @@ class ClientBase {
context_->set_deadline(deadline);
}
+ virtual void Prepare(int64_t time_out) {
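+    // Recreate the client context and set an RPC deadline time_out
+    // milliseconds from now.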
+ context_.reset(new grpc::ClientContext());
+
+ std::chrono::system_clock::time_point deadline =
+ std::chrono::system_clock::now() + std::chrono::milliseconds(time_out);
+
+ context_->set_deadline(deadline);
+ }
+
virtual void Process() = 0;
   std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
@@ -117,6 +126,17 @@ class GetProcessor : public ClientBase<sendrecv::VariableMessage> {
RequestGetCallBack response_call_back_ = ProcGetResponse;
};
+class BatchBarrierProcessor : public ClientBase<sendrecv::VoidMessage> {
+ public:
+  explicit BatchBarrierProcessor(std::shared_ptr<grpc::Channel>