提交 4d12a813 编写于 作者: T typhoonzero

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into rename_rpc_ops

......@@ -39,7 +39,7 @@ option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_F
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check" ON)
option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
......
......@@ -224,12 +224,18 @@ function(cc_test TARGET_NAME)
if(WITH_TESTING)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
set(multiValueArgs SRCS DEPS ARGS)
cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cc_test_SRCS})
target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
# Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
endif()
add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
add_test(NAME ${TARGET_NAME}
COMMAND ${TARGET_NAME} ${cc_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
endif()
endfunction(cc_test)
......@@ -457,7 +463,7 @@ endfunction()
function(py_test TARGET_NAME)
if(WITH_TESTING)
set(options STATIC static SHARED shared)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS ARGS)
cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
===========
DataFeeder
data_feeder
===========
DataFeeder
-----------
.. automodule:: paddle.v2.fluid.data_feeder
:members: DataFeeder
----------
.. autoclass:: paddle.v2.fluid.data_feeder.DataFeeder
:members:
:noindex:
===========
Evaluator
===========
Evaluator
-----------
.. automodule:: paddle.v2.fluid.evaluator
:members: Evaluator
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
=========
evaluator
=========
Accuracy
--------
.. autoclass:: paddle.v2.fluid.evaluator.Accuracy
:members:
:noindex:
ChunkEvaluator
--------------
.. autoclass:: paddle.v2.fluid.evaluator.ChunkEvaluator
:members:
:noindex:
===========
Executor
===========
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
========
executor
========
Executor
--------
.. autoclass:: paddle.v2.fluid.executor.Executor
:members:
:noindex:
global_scope
------------
.. autofunction:: paddle.v2.fluid.executor.global_scope
:noindex:
scope_guard
-----------
.. automodule:: paddle.v2.fluid.executor
:members: Executor
.. autofunction:: paddle.v2.fluid.executor.scope_guard
:noindex:
switch_scope
------------
.. autofunction:: paddle.v2.fluid.executor.switch_scope
:noindex:
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import argparse
import sys
import types
import paddle.v2.fluid as fluid
def parse_arg():
parser = argparse.ArgumentParser()
parser.add_argument('--submodules', nargs="*")
parser.add_argument(
'module', type=str, help='Generate the documentation of which module')
return parser.parse_args()
class DocGenerator(object):
def __init__(self, module_name, stream=sys.stdout):
self.stream = stream
self.module_name = module_name
if not hasattr(fluid, module_name):
raise ValueError("Cannot find fluid.{0}".format(module_name))
else:
self.module = getattr(fluid, module_name)
self.stream.write('''.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
''')
self._print_header_(module_name, dot='=', is_title=True)
def print_submodule(self, submodule_name):
submodule = getattr(self.module, submodule_name)
if submodule is None:
raise ValueError("Cannot find submodule {0}".format(submodule_name))
self.print_section(submodule_name)
for item in submodule.__all__:
self.print_item(item)
def print_current_module(self):
for item in self.module.__all__:
self.print_item(item)
def print_section(self, name):
self._print_header_(name, dot='=', is_title=False)
def print_item(self, name):
item = getattr(self.module, name)
if isinstance(item, types.TypeType):
self.print_class(name)
elif isinstance(item, types.FunctionType):
self.print_method(name)
else:
raise RuntimeError("Unsupported item {0}".format(name))
def print_class(self, name):
self._print_header_(name, dot='-', is_title=False)
self.stream.write('''.. autoclass:: paddle.v2.fluid.{0}.{1}
:members:
:noindex:
'''.format(self.module_name, name))
def print_method(self, name):
self._print_header_(name, dot='-', is_title=False)
self.stream.write('''.. autofunction:: paddle.v2.fluid.{0}.{1}
:noindex:
'''.format(self.module_name, name))
def _print_header_(self, name, dot, is_title):
dot_line = dot * len(name)
if is_title:
self.stream.write(dot_line)
self.stream.write('\n')
self.stream.write(name)
self.stream.write('\n')
self.stream.write(dot_line)
self.stream.write('\n')
self.stream.write('\n')
def main():
args = parse_arg()
gen = DocGenerator(args.module)
if args.submodules is None:
gen.print_current_module()
else:
for submodule_name in args.submodules:
gen.print_submodule(submodule_name)
if __name__ == '__main__':
main()
#!/bin/bash
python gen_doc.py layers --submodules control_flow device io nn ops tensor > layers.rst
for module in io data_feeder evaluator executor initializer io nets optimizer param_attr profiler regularizer
do
python gen_doc.py ${module} > ${module}.rst
done
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
===========
Initializer
initializer
===========
Constant
--------
Initializer
-----------
.. automodule:: paddle.v2.fluid.initializer
:members: Initializer
:noindex:
ConstantInitializer
-------------------
.. automodule:: paddle.v2.fluid.initializer
:members: ConstantInitializer
.. autoclass:: paddle.v2.fluid.initializer.Constant
:members:
:noindex:
Uniform
-------
UniformInitializer
------------------
.. automodule:: paddle.v2.fluid.initializer
:members: UniformInitializer
:noindex:
NormalInitializer
-----------------
.. automodule:: paddle.v2.fluid.initializer
:members: NormalInitializer
.. autoclass:: paddle.v2.fluid.initializer.Uniform
:members:
:noindex:
Normal
------
XavierInitializer
-----------------
.. automodule:: paddle.v2.fluid.initializer
:members: XavierInitializer
.. autoclass:: paddle.v2.fluid.initializer.Normal
:members:
:noindex:
Xavier
------
MSRAInitializer
---------------
.. automodule:: paddle.v2.fluid.initializer
:members: MSRAInitializer
.. autoclass:: paddle.v2.fluid.initializer.Xavier
:members:
:noindex:
===========
IO
===========
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
==
io
==
save_vars
---------
is_parameter
.. autofunction:: paddle.v2.fluid.io.save_vars
:noindex:
save_params
-----------
.. autofunction:: paddle.v2.fluid.io.is_parameter
.. autofunction:: paddle.v2.fluid.io.save_params
:noindex:
save_persistables
-----------------
.. autofunction:: paddle.v2.fluid.io.save_persistables
:noindex:
load_vars
---------
.. autofunction:: paddle.v2.fluid.io.load_vars
:noindex:
load_params
-----------
.. autofunction:: paddle.v2.fluid.io.load_params
:noindex:
load_persistables
-----------------
.. autofunction:: paddle.v2.fluid.io.load_persistables
:noindex:
save_inference_model
--------------------
.. autofunction:: paddle.v2.fluid.io.save_inference_model
:noindex:
load_inference_model
--------------------
.. autofunction:: paddle.v2.fluid.io.load_inference_model
:noindex:
get_inference_program
---------------------
.. autofunction:: paddle.v2.fluid.io.get_inference_program
:noindex:
==========
Layers
==========
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
======
layers
======
fc
---
.. autofunction:: paddle.v2.fluid.layers.fc
control_flow
============
split_lod_tensor
----------------
.. autofunction:: paddle.v2.fluid.layers.split_lod_tensor
:noindex:
embedding
---------
.. autofunction:: paddle.v2.fluid.layers.embedding
merge_lod_tensor
----------------
.. autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
:noindex:
dynamic_lstm
------------
.. autofunction:: paddle.v2.fluid.layers.dynamic_lstm
BlockGuard
----------
.. autoclass:: paddle.v2.fluid.layers.BlockGuard
:members:
:noindex:
dynamic_lstmp
-------------
.. autofunction:: paddle.v2.fluid.layers.dynamic_lstmp
BlockGuardWithCompletion
------------------------
.. autoclass:: paddle.v2.fluid.layers.BlockGuardWithCompletion
:members:
:noindex:
dynamic_gru
-----------
.. autofunction:: paddle.v2.fluid.layers.dynamic_gru
StaticRNNMemoryLink
-------------------
.. autoclass:: paddle.v2.fluid.layers.StaticRNNMemoryLink
:members:
:noindex:
data
----
.. autofunction:: paddle.v2.fluid.layers.data
WhileGuard
----------
.. autoclass:: paddle.v2.fluid.layers.WhileGuard
:members:
:noindex:
mean
----
.. autofunction:: paddle.v2.fluid.layers.mean
While
-----
.. autoclass:: paddle.v2.fluid.layers.While
:members:
:noindex:
mul
---
.. autofunction:: paddle.v2.fluid.layers.mul
lod_rank_table
--------------
.. autofunction:: paddle.v2.fluid.layers.lod_rank_table
:noindex:
elementwise_add
---------------
.. autofunction:: paddle.v2.fluid.layers.elementwise_add
max_sequence_len
----------------
.. autofunction:: paddle.v2.fluid.layers.max_sequence_len
:noindex:
elementwise_sub
---------------
.. autofunction:: paddle.v2.fluid.layers.elementwise_sub
topk
----
.. autofunction:: paddle.v2.fluid.layers.topk
:noindex:
elementwise_mul
---------------
.. autofunction:: paddle.v2.fluid.layers.elementwise_mul
lod_tensor_to_array
-------------------
.. autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
:noindex:
elementwise_div
---------------
.. autofunction:: paddle.v2.fluid.layers.elementwise_div
array_to_lod_tensor
-------------------
.. autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
:noindex:
increment
---------
dropout
-------
.. autofunction:: paddle.v2.fluid.layers.dropout
.. autofunction:: paddle.v2.fluid.layers.increment
:noindex:
array_write
-----------
reshape
--------
.. autofunction:: paddle.v2.fluid.layers.reshape
.. autofunction:: paddle.v2.fluid.layers.array_write
:noindex:
create_array
------------
sigmoid
.. autofunction:: paddle.v2.fluid.layers.create_array
:noindex:
less_than
---------
.. autofunction:: paddle.v2.fluid.layers.sigmoid
.. autofunction:: paddle.v2.fluid.layers.less_than
:noindex:
array_read
----------
scale
---------
.. autofunction:: paddle.v2.fluid.layers.scale
.. autofunction:: paddle.v2.fluid.layers.array_read
:noindex:
shrink_memory
-------------
.. autofunction:: paddle.v2.fluid.layers.shrink_memory
:noindex:
array_length
------------
transpose
.. autofunction:: paddle.v2.fluid.layers.array_length
:noindex:
IfElse
------
.. autoclass:: paddle.v2.fluid.layers.IfElse
:members:
:noindex:
DynamicRNN
----------
.. autoclass:: paddle.v2.fluid.layers.DynamicRNN
:members:
:noindex:
ConditionalBlock
----------------
.. autoclass:: paddle.v2.fluid.layers.ConditionalBlock
:members:
:noindex:
StaticRNN
---------
.. autofunction:: paddle.v2.fluid.layers.transpose
.. autoclass:: paddle.v2.fluid.layers.StaticRNN
:members:
:noindex:
reorder_lod_tensor_by_rank
--------------------------
sigmoid_cross_entropy_with_logits
---------------------------------
.. autofunction:: paddle.v2.fluid.layers.esigmoid_cross_entropy_with_logits
.. autofunction:: paddle.v2.fluid.layers.reorder_lod_tensor_by_rank
:noindex:
ParallelDo
----------
cast
.. autoclass:: paddle.v2.fluid.layers.ParallelDo
:members:
:noindex:
Print
-----
.. autofunction:: paddle.v2.fluid.layers.Print
:noindex:
device
======
get_places
----------
.. autofunction:: paddle.v2.fluid.layers.get_places
:noindex:
io
==
data
----
.. autofunction:: paddle.v2.fluid.layers.cast
.. autofunction:: paddle.v2.fluid.layers.data
:noindex:
BlockGuardServ
--------------
concat
-------
.. autofunction:: paddle.v2.fluid.layers.concat
.. autoclass:: paddle.v2.fluid.layers.BlockGuardServ
:members:
:noindex:
ListenAndServ
-------------
sums
.. autoclass:: paddle.v2.fluid.layers.ListenAndServ
:members:
:noindex:
Send
----
.. autofunction:: paddle.v2.fluid.layers.sums
.. autofunction:: paddle.v2.fluid.layers.Send
:noindex:
nn
==
linear_chain_crf
----------------
.. autofunction:: paddle.v2.fluid.layers.linear_chain_crf
fc
--
.. autofunction:: paddle.v2.fluid.layers.fc
:noindex:
embedding
---------
assign
-------
.. autofunction:: paddle.v2.fluid.layers.embedding
:noindex:
dynamic_lstm
------------
split_lod_tensor
----------------
.. autofunction:: paddle.v2.fluid.layers.split_lod_tensor
.. autofunction:: paddle.v2.fluid.layers.dynamic_lstm
:noindex:
dynamic_lstmp
-------------
merge_lod_tensor
.. autofunction:: paddle.v2.fluid.layers.dynamic_lstmp
:noindex:
dynamic_gru
-----------
.. autofunction:: paddle.v2.fluid.layers.dynamic_gru
:noindex:
gru_unit
--------
.. autofunction:: paddle.v2.fluid.layers.gru_unit
:noindex:
linear_chain_crf
----------------
.. autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
.. autofunction:: paddle.v2.fluid.layers.linear_chain_crf
:noindex:
crf_decoding
------------
.. autofunction:: paddle.v2.fluid.layers.crf_decoding
:noindex:
cos_sim
--------
-------
.. autofunction:: paddle.v2.fluid.layers.cos_sim
:noindex:
cross_entropy
-------------
.. autofunction:: paddle.v2.fluid.layers.cross_entropy
:noindex:
square_error_cost
-----------------
.. autofunction:: paddle.v2.fluid.layers.square_error_cost
:noindex:
accuracy
---------
--------
.. autofunction:: paddle.v2.fluid.layers.accuracy
:noindex:
chunk_eval
----------
.. autofunction:: paddle.v2.fluid.layers.chunk_eval
:noindex:
sequence_conv
-------------
.. autofunction:: paddle.v2.fluid.layers.sequence_conv
:noindex:
conv2d
------
.. autofunction:: paddle.v2.fluid.layers.conv2d
:noindex:
sequence_pool
-------------
.. autofunction:: paddle.v2.fluid.layers.sequence_pool
:noindex:
pool2d
------
sequence_first_step
-------------------
.. autofunction:: paddle.v2.fluid.layers.sequence_first_step
.. autofunction:: paddle.v2.fluid.layers.pool2d
:noindex:
batch_norm
----------
.. autofunction:: paddle.v2.fluid.layers.batch_norm
:noindex:
sequence_last_step
beam_search_decode
------------------
.. autofunction:: paddle.v2.fluid.layers.sequence_last_step
.. autofunction:: paddle.v2.fluid.layers.beam_search_decode
:noindex:
conv2d_transpose
----------------
pool2d
------
.. autofunction:: paddle.v2.fluid.layers.pool2d
.. autofunction:: paddle.v2.fluid.layers.conv2d_transpose
:noindex:
sequence_expand
---------------
batch_norm
.. autofunction:: paddle.v2.fluid.layers.sequence_expand
:noindex:
lstm_unit
---------
.. autofunction:: paddle.v2.fluid.layers.lstm_unit
:noindex:
reduce_sum
----------
.. autofunction:: paddle.v2.fluid.layers.batch_norm
.. autofunction:: paddle.v2.fluid.layers.reduce_sum
:noindex:
reduce_mean
-----------
.. autofunction:: paddle.v2.fluid.layers.reduce_mean
:noindex:
reduce_max
----------
.. autofunction:: paddle.v2.fluid.layers.reduce_max
:noindex:
beam_search_decode
reduce_min
----------
.. autofunction:: paddle.v2.fluid.layers.reduce_min
:noindex:
sequence_first_step
-------------------
.. autofunction:: paddle.v2.fluid.layers.sequence_first_step
:noindex:
sequence_last_step
------------------
.. autofunction:: paddle.v2.fluid.layers.beam_search_decode
.. autofunction:: paddle.v2.fluid.layers.sequence_last_step
:noindex:
dropout
-------
.. autofunction:: paddle.v2.fluid.layers.dropout
:noindex:
split
-----
lod_rank_table
--------------
.. autofunction:: paddle.v2.fluid.layers.lod_rank_table
.. autofunction:: paddle.v2.fluid.layers.split
:noindex:
ctc_greedy_decoder
------------------
max_sequence_len
----------------
.. autofunction:: paddle.v2.fluid.layers.max_sequence_len
.. autofunction:: paddle.v2.fluid.layers.ctc_greedy_decoder
:noindex:
edit_distance
-------------
topk
-----
.. autofunction:: paddle.v2.fluid.layers.topk
.. autofunction:: paddle.v2.fluid.layers.edit_distance
:noindex:
l2_normalize
------------
lod_tensor_to_array
-------------------
.. autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
.. autofunction:: paddle.v2.fluid.layers.l2_normalize
:noindex:
matmul
------
array_to_lod_tensor
-------------------
.. autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
.. autofunction:: paddle.v2.fluid.layers.matmul
:noindex:
warpctc
-------
.. autofunction:: paddle.v2.fluid.layers.warpctc
:noindex:
sequence_reshape
----------------
fill_constant
-------------
.. autofunction:: paddle.v2.fluid.layers.fill_constant
.. autofunction:: paddle.v2.fluid.layers.sequence_reshape
:noindex:
transpose
---------
.. autofunction:: paddle.v2.fluid.layers.transpose
:noindex:
fill_constant_batch_size_like
-----------------------------
.. autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
im2sequence
-----------
.. autofunction:: paddle.v2.fluid.layers.im2sequence
:noindex:
nce
---
ones
----
.. autofunction:: paddle.v2.fluid.layers.ones
.. autofunction:: paddle.v2.fluid.layers.nce
:noindex:
beam_search
-----------
zeros
-----
.. autofunction:: paddle.v2.fluid.layers.zeros
.. autofunction:: paddle.v2.fluid.layers.beam_search
:noindex:
row_conv
--------
increment
---------
.. autofunction:: paddle.v2.fluid.layers.increment
.. autofunction:: paddle.v2.fluid.layers.row_conv
:noindex:
multiplex
---------
array_write
-----------
.. autofunction:: paddle.v2.fluid.layers.array_write
.. autofunction:: paddle.v2.fluid.layers.multiplex
:noindex:
ops
===
mean
----
create_array
------------
.. autofunction:: paddle.v2.fluid.layers.create_array
.. autofunction:: paddle.v2.fluid.layers.mean
:noindex:
mul
---
less_than
---------
.. autofunction:: paddle.v2.fluid.layers.less_than
.. autofunction:: paddle.v2.fluid.layers.mul
:noindex:
reshape
-------
array_read
----------
.. autofunction:: paddle.v2.fluid.layers.array_read
.. autofunction:: paddle.v2.fluid.layers.reshape
:noindex:
scale
-----
shrink_memory
--------------
.. autofunction:: paddle.v2.fluid.layers.shrink_memory
.. autofunction:: paddle.v2.fluid.layers.scale
:noindex:
sigmoid_cross_entropy_with_logits
---------------------------------
array_length
-------------
.. autofunction:: paddle.v2.fluid.layers.array_length
.. autofunction:: paddle.v2.fluid.layers.sigmoid_cross_entropy_with_logits
:noindex:
elementwise_add
---------------
conv2d_transpose
----------------
.. autofunction:: paddle.v2.fluid.layers.conv2d_transpose
.. autofunction:: paddle.v2.fluid.layers.elementwise_add
:noindex:
sequence_expand
elementwise_div
---------------
.. autofunction:: paddle.v2.fluid.layers.sequence_expand
.. autofunction:: paddle.v2.fluid.layers.elementwise_div
:noindex:
elementwise_sub
---------------
gru_unit
--------
.. autofunction:: paddle.v2.fluid.layers.gru_unit
.. autofunction:: paddle.v2.fluid.layers.elementwise_sub
:noindex:
elementwise_mul
---------------
lstm_unit
---------
.. autofunction:: paddle.v2.fluid.layers.lstm_unit
.. autofunction:: paddle.v2.fluid.layers.elementwise_mul
:noindex:
elementwise_max
---------------
sequence_softmax
----------------
.. autofunction:: paddle.v2.fluid.layers.sequence_softmax
.. autofunction:: paddle.v2.fluid.layers.elementwise_max
:noindex:
elementwise_min
---------------
reduce_sum
----------
.. autofunction:: paddle.v2.fluid.layers.reduce_sum
.. autofunction:: paddle.v2.fluid.layers.elementwise_min
:noindex:
elementwise_pow
---------------
reduce_mean
-----------
.. autofunction:: paddle.v2.fluid.layers.reduce_mean
.. autofunction:: paddle.v2.fluid.layers.elementwise_pow
:noindex:
clip
----
reduce_max
----------
.. autofunction:: paddle.v2.fluid.layers.reduce_max
.. autofunction:: paddle.v2.fluid.layers.clip
:noindex:
clip_by_norm
------------
reduce_min
----------
.. autofunction:: paddle.v2.fluid.layers.reduce_min
.. autofunction:: paddle.v2.fluid.layers.clip_by_norm
:noindex:
sequence_softmax
----------------
split
-----
.. autofunction:: paddle.v2.fluid.layers.split
.. autofunction:: paddle.v2.fluid.layers.sequence_softmax
:noindex:
sigmoid
-------
matmul
------
.. autofunction:: paddle.v2.fluid.layers.matmul
.. autofunction:: paddle.v2.fluid.layers.sigmoid
:noindex:
logsigmoid
----------
.. autofunction:: paddle.v2.fluid.layers.logsigmoid
:noindex:
exp
---
.. autofunction:: paddle.v2.fluid.layers.exp
:noindex:
relu
----
.. autofunction:: paddle.v2.fluid.layers.relu
:noindex:
tanh
----
.. autofunction:: paddle.v2.fluid.layers.tanh
:noindex:
tanh_shrink
-----------
.. autofunction:: paddle.v2.fluid.layers.tanh_shrink
:noindex:
softshrink
----------
.. autofunction:: paddle.v2.fluid.layers.softshrink
:noindex:
sqrt
----
.. autofunction:: paddle.v2.fluid.layers.sqrt
:noindex:
abs
----
---
.. autofunction:: paddle.v2.fluid.layers.abs
:noindex:
ceil
----
.. autofunction:: paddle.v2.fluid.layers.ceil
:noindex:
floor
-----
.. autofunction:: paddle.v2.fluid.layers.floor
:noindex:
round
-----
.. autofunction:: paddle.v2.fluid.layers.round
:noindex:
reciprocal
----------
.. autofunction:: paddle.v2.fluid.layers.reciprocal
:noindex:
log
---
.. autofunction:: paddle.v2.fluid.layers.log
:noindex:
square
------
.. autofunction:: paddle.v2.fluid.layers.square
:noindex:
softplus
--------
.. autofunction:: paddle.v2.fluid.layers.softplus
:noindex:
softsign
---------
--------
.. autofunction:: paddle.v2.fluid.layers.softsign
:noindex:
brelu
-----
.. autofunction:: paddle.v2.fluid.layers.brelu
:noindex:
leaky_relu
----------
.. autofunction:: paddle.v2.fluid.layers.leaky_relu
:noindex:
soft_relu
---------
.. autofunction:: paddle.v2.fluid.layers.soft_relu
:noindex:
elu
----
---
.. autofunction:: paddle.v2.fluid.layers.elu
:noindex:
relu6
-----
.. autofunction:: paddle.v2.fluid.layers.relu6
:noindex:
pow
----
---
.. autofunction:: paddle.v2.fluid.layers.pow
:noindex:
stanh
-----
.. autofunction:: paddle.v2.fluid.layers.stanh
:noindex:
hard_shrink
-----------
.. autofunction:: paddle.v2.fluid.layers.hard_shrink
:noindex:
thresholded_relu
----------------
.. autofunction:: paddle.v2.fluid.layers.thresholded_relu
:noindex:
hard_sigmoid
-------------
------------
.. autofunction:: paddle.v2.fluid.layers.hard_sigmoid
:noindex:
swish
------
-----
.. autofunction:: paddle.v2.fluid.layers.swish
:noindex:
im2sequence
tensor
======
create_tensor
-------------
.. autofunction:: paddle.v2.fluid.layers.create_tensor
:noindex:
create_parameter
----------------
.. autofunction:: paddle.v2.fluid.layers.create_parameter
:noindex:
create_global_var
-----------------
.. autofunction:: paddle.v2.fluid.layers.create_global_var
:noindex:
cast
----
.. autofunction:: paddle.v2.fluid.layers.cast
:noindex:
concat
------
.. autofunction:: paddle.v2.fluid.layers.im2sequence
.. autofunction:: paddle.v2.fluid.layers.concat
:noindex:
edit_distance
---------------
.. autofunction:: paddle.v2.fluid.layers.edit_distance_error
sums
----
.. autofunction:: paddle.v2.fluid.layers.sums
:noindex:
ctc_greedy_decoder
---------------
.. autofunction:: paddle.v2.fluid.layers.ctc_greedy_decoder
assign
------
.. autofunction:: paddle.v2.fluid.layers.assign
:noindex:
l2_normalize
------------
.. autofunction:: paddle.v2.fluid.layers.l2_normalize
fill_constant_batch_size_like
-----------------------------
.. autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
:noindex:
sequence_reshape
----------------
.. autofunction:: paddle.v2.fluid.layers.sequence_reshape
fill_constant
-------------
.. autofunction:: paddle.v2.fluid.layers.fill_constant
:noindex:
row_conv
--------
.. autofunction:: paddle.v2.fluid.layers.row_conv
ones
----
.. autofunction:: paddle.v2.fluid.layers.ones
:noindex:
multiplex
---------
.. autofunction:: paddle.v2.fluid.layers.multiplex
zeros
-----
.. autofunction:: paddle.v2.fluid.layers.zeros
:noindex:
===========
Nets
===========
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
====
nets
====
simple_img_conv_pool
--------------------
.. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
:noindex:
img_conv_group
---------------
.. autofunction:: paddle.v2.fluid.nets.img_conv_group
.. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
:noindex:
sequence_conv_pool
------------------
.. autofunction:: paddle.v2.fluid.nets.sequence_conv_pool
:noindex:
glu
---
.. autofunction:: paddle.v2.fluid.nets.glu
:noindex:
scaled_dot_product_attention
----------------------------
.. autofunction:: paddle.v2.fluid.nets.scaled_dot_product_attention
:noindex:
===========
Optimizer
===========
Optimizer
-----------
.. automodule:: paddle.v2.fluid.optimizer
:members: Optimizer
:noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
=========
optimizer
=========
SGDOptimizer
-----------
.. automodule:: paddle.v2.fluid.optimizer
:members: SGDOptimizer
:noindex:
SGD
---
.. autoclass:: paddle.v2.fluid.optimizer.SGD
:members:
:noindex:
Momentum
--------
MomentumOptimizer
-----------------
.. automodule:: paddle.v2.fluid.optimizer
:members: MomentumOptimizer
.. autoclass:: paddle.v2.fluid.optimizer.Momentum
:members:
:noindex:
Adagrad
-------
AdagradOptimizer
----------------
.. automodule:: paddle.v2.fluid.optimizer
:members: AdagradOptimizer
.. autoclass:: paddle.v2.fluid.optimizer.Adagrad
:members:
:noindex:
Adam
----
AdamOptimizer
-------------
.. automodule:: paddle.v2.fluid.optimizer
:members: AdamOptimizer
.. autoclass:: paddle.v2.fluid.optimizer.Adam
:members:
:noindex:
Adamax
------
AdamaxOptimizer
-----------
.. automodule:: paddle.v2.fluid.optimizer
:members: AdamaxOptimizer
.. autoclass:: paddle.v2.fluid.optimizer.Adamax
:members:
:noindex:
DecayedAdagrad
--------------
DecayedAdagradOptimizer
-----------------------
.. automodule:: paddle.v2.fluid.optimizer
:members: DecayedAdagradOptimizer
.. autoclass:: paddle.v2.fluid.optimizer.DecayedAdagrad
:members:
:noindex:
===========
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
==========
param_attr
==========
ParamAttr
===========
---------
.. autoclass:: paddle.v2.fluid.param_attr.ParamAttr
:members:
:noindex:
WeightNormParamAttr
-------------------
ParamAttr
-----------
.. automodule:: paddle.v2.fluid.param_attr
:members: ParamAttr
.. autoclass:: paddle.v2.fluid.param_attr.WeightNormParamAttr
:members:
:noindex:
===========
Profiler
===========
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
========
profiler
========
cuda_profiler
-------------
Profiler
-----------
.. autofunction:: paddle.v2.fluid.profiler.cuda_profiler
:noindex:
reset_profiler
--------------
.. autofunction:: paddle.v2.fluid.profiler.reset_profiler
:noindex:
profiler
--------
.. autofunction:: paddle.v2.fluid.profiler.profiler
:noindex:
.. THIS FILE IS GENERATED BY `gen_doc.{py|sh}`
!DO NOT EDIT THIS FILE MANUALLY!
===========
Regularizer
regularizer
===========
WeightDecayRegularizer
----------------------
.. automodule:: paddle.v2.fluid.regularizer
:members: WeightDecayRegularizer
:noindex:
append_regularization_ops
-------------------------
L2DecayRegularizer
------------------
.. automodule:: paddle.v2.fluid.regularizer
:members: L2DecayRegularizer
.. autofunction:: paddle.v2.fluid.regularizer.append_regularization_ops
:noindex:
L1Decay
-------
.. autoclass:: paddle.v2.fluid.regularizer.L1Decay
:members:
:noindex:
L1DecayRegularizer
-------------------
.. automodule:: paddle.v2.fluid.regularizer
:members: L1DecayRegularizer
L2Decay
-------
.. autoclass:: paddle.v2.fluid.regularizer.L2Decay
:members:
:noindex:
......@@ -140,7 +140,19 @@ TODO by Assignees
### Beam Search with CTC and LM
TODO by Assignees
<div align="center">
<img src="image/beam_search.png" width=600><br/>
Figure 2. Algorithm for CTC Beam Search Decoder.
</div>
- The **Beam Search Decoder** for DS2 CTC-trained network follows the similar approach in \[[3](#references)\] as shown in Figure 2, with two important modifications for the ambiguous parts:
- 1) in the iterative computation of probabilities, the assignment operation is changed to accumulation for one prefix may comes from different paths;
- 2) the if condition ```if l^+ not in A_prev then``` after probabilities' computation is deprecated for it is hard to understand and seems unnecessary.
- An **external scorer** would be passed into the decoder to evaluate a candidate prefix during decoding whenever a white space appended in English decoding and any character appended in Mandarin decoding.
- Such external scorer consists of language model, word count or any other custom scorers.
- The **language model** is built from Task 5, with parameters should be carefully tuned to achieve minimum WER/CER (c.f. Task 7)
- This decoder needs to perform with **high efficiency** for the convenience of parameters tuning and speech recognition in reality.
## Future Work
......@@ -153,3 +165,4 @@ TODO by Assignees
1. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](http://proceedings.mlr.press/v48/amodei16.pdf). ICML 2016.
2. Dario Amodei, etc., [Deep Speech 2 : End-to-End Speech Recognition in English and Mandarin](https://arxiv.org/abs/1512.02595). arXiv:1512.02595.
3. Awni Y. Hannun, etc. [First-Pass Large Vocabulary Continuous Speech Recognition using Bi-Directional Recurrent DNNs](https://arxiv.org/abs/1408.2873). arXiv:1408.2873
......@@ -115,7 +115,7 @@ PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种B
"WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON"
"WITH_PYTHON", "是否内嵌PYTHON解释器", "ON"
"WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON"
"WITH_TESTING", "是否开启单元测试", "ON"
"WITH_TESTING", "是否开启单元测试", "OFF"
"WITH_DOC", "是否编译中英文文档", "OFF"
"WITH_SWIG_PY", "是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练", "Auto"
"WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON"
......
......@@ -126,7 +126,7 @@ You can add :code:`-D` argument to pass such options, like:
"WITH_AVX", "Build with AVX support", "ON"
"WITH_PYTHON", "Build with integrated Python interpreter", "ON"
"WITH_STYLE_CHECK", "Check code style when building", "ON"
"WITH_TESTING", "Build unit tests", "ON"
"WITH_TESTING", "Build unit tests", "OFF"
"WITH_DOC", "Build documentations", "OFF"
"WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
"WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON"
......
......@@ -95,6 +95,12 @@ PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Note
docker run -p 8888:8888 paddlepaddle/book
国内用户可以使用下面的镜像源来加速访问:
.. code-block: bash
docker run -p 8888:8888 docker.paddlepaddlehub.com/book
然后在浏览器中输入以下网址:
.. code-block:: text
......
......@@ -102,6 +102,12 @@ We provide a packaged book image, simply issue the command:
docker run -p 8888:8888 paddlepaddle/book
For users in China, we provide a faster mirror:
.. code-block: bash
docker run -p 8888:8888 docker.paddlepaddlehub.com/book
Then, you would back and paste the address into the local browser:
.. code-block:: text
......
......@@ -22,11 +22,11 @@ cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto)
cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor init)
cc_test(variable_test SRCS variable_test.cc)
cc_library(threadpool SRCS threadpool.cc)
cc_library(threadpool SRCS threadpool.cc DEPS enforce)
cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
cc_library(scope SRCS scope.cc DEPS glog threadpool)
......
......@@ -26,9 +26,7 @@ class Channel {
virtual void Send(T*) = 0;
virtual void Receive(T*) = 0;
virtual size_t Cap() = 0;
// Don't delete channels; instead, call Channel::Close.
protected:
virtual void Close() = 0;
virtual ~Channel() {}
};
......@@ -50,11 +48,7 @@ Channel<T>* MakeChannel(size_t buffer_size) {
template <typename T>
void CloseChannel(Channel<T>* ch) {
if (ch->Cap() > 0) {
delete dynamic_cast<details::Buffered<T>*>(ch);
} else {
delete dynamic_cast<details::UnBuffered<T>*>(ch);
}
ch->Close();
}
} // namespace framework
......
......@@ -14,13 +14,67 @@ limitations under the License. */
#include "paddle/framework/channel.h"
#include <chrono>
#include <thread>
#include "gtest/gtest.h"
using paddle::framework::Channel;
using paddle::framework::MakeChannel;
using paddle::framework::CloseChannel;
TEST(Channel, MakeAndClose) {
using paddle::framework::Channel;
using paddle::framework::MakeChannel;
using paddle::framework::CloseChannel;
using paddle::framework::details::Buffered;
using paddle::framework::details::UnBuffered;
{
// MakeChannel should return a buffered channel is buffer_size > 0.
auto ch = MakeChannel<int>(10);
EXPECT_NE(dynamic_cast<Buffered<int>*>(ch), nullptr);
EXPECT_EQ(dynamic_cast<UnBuffered<int>*>(ch), nullptr);
CloseChannel(ch);
delete ch;
}
{
// MakeChannel should return an un-buffered channel is buffer_size = 0.
auto ch = MakeChannel<int>(0);
EXPECT_EQ(dynamic_cast<Buffered<int>*>(ch), nullptr);
EXPECT_NE(dynamic_cast<UnBuffered<int>*>(ch), nullptr);
CloseChannel(ch);
delete ch;
}
}
TEST(Channel, SufficientBufferSizeDoesntBlock) {
const size_t buffer_size = 10;
auto ch = MakeChannel<size_t>(buffer_size);
for (size_t i = 0; i < buffer_size; ++i) {
ch->Send(&i); // should not block
}
size_t out;
for (size_t i = 0; i < buffer_size; ++i) {
ch->Receive(&out); // should not block
EXPECT_EQ(out, i);
}
CloseChannel(ch);
delete ch;
}
TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
const size_t buffer_size = 10;
auto ch = MakeChannel<size_t>(buffer_size);
size_t sum = 0;
std::thread t([&]() {
// Try to write more than buffer size.
for (size_t i = 0; i < 2 * buffer_size; ++i) {
ch->Send(&i); // should not block
sum += i;
}
});
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.5 sec
EXPECT_EQ(sum, 45U);
Channel<int>* ch = MakeChannel<int>(10);
CloseChannel(ch);
t.join();
delete ch;
}
......@@ -18,6 +18,7 @@ limitations under the License. */
#include <mutex>
#include "paddle/framework/channel.h"
#include "paddle/platform/enforce.h"
namespace paddle {
namespace framework {
......@@ -32,6 +33,8 @@ class Buffered : public paddle::framework::Channel<T> {
virtual void Send(T*);
virtual void Receive(T*);
virtual size_t Cap() { return cap_; }
virtual void Close();
virtual ~Buffered();
private:
size_t cap_;
......@@ -39,9 +42,11 @@ class Buffered : public paddle::framework::Channel<T> {
std::condition_variable empty_cond_var_;
std::condition_variable full_cond_var_;
std::deque<T> channel_;
bool closed_;
Buffered(size_t cap) : cap_(cap) {}
virtual ~Buffered();
Buffered(size_t cap) : cap_(cap), closed_(false) {
PADDLE_ENFORCE_GT(cap, 0);
}
void NotifyAllSenders(std::unique_lock<std::mutex>*);
};
......@@ -49,24 +54,39 @@ class Buffered : public paddle::framework::Channel<T> {
template <typename T>
void Buffered<T>::Send(T* item) {
std::unique_lock<std::mutex> lock(mu_);
full_cond_var_.wait(lock, [this]() { return channel_.size() < cap_; });
channel_.push_back(std::move(*item));
lock.unlock();
empty_cond_var_.notify_one();
full_cond_var_.wait(lock,
[this]() { return channel_.size() < cap_ || closed_; });
if (!closed_) {
channel_.push_back(std::move(*item));
lock.unlock();
empty_cond_var_.notify_one();
}
}
template <typename T>
void Buffered<T>::Receive(T* item) {
std::unique_lock<std::mutex> lock(mu_);
empty_cond_var_.wait(lock, [this]() { return !channel_.empty(); });
*item = std::move(channel_.front());
channel_.pop_front();
empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; });
if (!closed_) {
*item = std::move(channel_.front());
channel_.pop_front();
NotifyAllSenders(&lock);
} else {
item = nullptr;
}
}
template <typename T>
void Buffered<T>::Close() {
std::unique_lock<std::mutex> lock(mu_);
closed_ = true;
NotifyAllSenders(&lock);
}
template <typename T>
Buffered<T>::~Buffered() {
std::unique_lock<std::mutex> lock(mu_);
closed_ = true;
channel_.clear();
NotifyAllSenders(&lock);
}
......@@ -74,7 +94,7 @@ Buffered<T>::~Buffered() {
template <typename T>
void Buffered<T>::NotifyAllSenders(std::unique_lock<std::mutex>* lock) {
lock->unlock();
full_cond_var_.notify_one();
full_cond_var_.notify_all();
}
} // namespace details
......
......@@ -32,10 +32,11 @@ class UnBuffered : public paddle::framework::Channel<T> {
virtual void Send(T*);
virtual void Receive(T*);
virtual size_t Cap() { return 0; }
virtual void Close();
virtual ~UnBuffered();
private:
UnBuffered() {}
virtual ~UnBuffered();
};
template <typename T>
......@@ -44,6 +45,9 @@ void UnBuffered<T>::Send(T* channel_element) {}
template <typename T>
void UnBuffered<T>::Receive(T*) {}
template <typename T>
void UnBuffered<T>::Close() {}
template <typename T>
UnBuffered<T>::~UnBuffered() {}
......
......@@ -25,7 +25,7 @@ limitations under the License. */
#include "paddle/platform/place.h"
#include "paddle/platform/profiler.h"
DECLARE_bool(do_memory_benchmark);
DECLARE_bool(benchmark);
DEFINE_bool(check_nan_inf, false,
"Checking whether operator produce NAN/INF or not. It will be "
"extremely slow so please use this flag wisely.");
......@@ -33,9 +33,6 @@ DEFINE_bool(check_nan_inf, false,
namespace paddle {
namespace framework {
const std::string kFeedOpType = "feed";
const std::string kFetchOpType = "fetch";
Executor::Executor(const platform::Place& place) : place_(place) {}
static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
......@@ -125,7 +122,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
op->Run(*local_scope, place_);
VLOG(3) << op->DebugStringEx(local_scope);
if (FLAGS_do_memory_benchmark) {
if (FLAGS_benchmark) {
VLOG(2) << "Memory used after operator " + op->Type() + " running: "
<< memory::memory_usage(place_);
}
......@@ -142,7 +139,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
if (create_vars && create_local_scope) {
scope->DeleteScope(local_scope);
}
if (FLAGS_do_memory_benchmark) {
if (FLAGS_benchmark) {
VLOG(2) << "-------------------------------------------------------";
VLOG(2) << "Memory used after deleting local scope: "
<< memory::memory_usage(place_);
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/framework/lod_tensor.h"
......@@ -20,5 +21,8 @@ namespace paddle {
namespace framework {
using FeedFetchType = LoDTensor;
using FeedFetchList = std::vector<FeedFetchType>;
static const std::string kFeedOpType = "feed";
static const std::string kFetchOpType = "fetch";
} // namespace framework
} // namespace paddle
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <string.h> // for strdup
#include <algorithm>
#include <stdexcept>
#include <string>
#include "paddle/framework/init.h"
......@@ -46,17 +47,23 @@ void InitDevices() {
std::vector<platform::Place> places;
places.emplace_back(platform::CPUPlace());
int count = 0;
#ifdef PADDLE_WITH_CUDA
int count = platform::GetCUDADeviceCount();
for (int i = 0; i < count; ++i) {
places.emplace_back(platform::CUDAPlace(i));
try {
count = platform::GetCUDADeviceCount();
} catch (const std::exception &exp) {
LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime.";
}
#else
LOG(WARNING)
<< "'GPU' is not supported, Please re-compile with WITH_GPU option";
<< "'CUDA' is not supported, Please re-compile with WITH_GPU option";
#endif
for (int i = 0; i < count; ++i) {
places.emplace_back(platform::CUDAPlace(i));
}
platform::DeviceContextPool::Init(places);
}
......
......@@ -20,7 +20,21 @@ TEST(InitDevices, CPU) {
using paddle::framework::InitDevices;
using paddle::platform::DeviceContextPool;
#ifndef PADDLE_WITH_CUDA
InitDevices();
DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_GE(pool.size(), 1U);
ASSERT_EQ(pool.size(), 1U);
#endif
}
TEST(InitDevices, CUDA) {
using paddle::framework::InitDevices;
using paddle::platform::DeviceContextPool;
#ifdef PADDLE_WITH_CUDA
int count = paddle::platform::GetCUDADeviceCount();
InitDevices();
DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count));
#endif
}
......@@ -24,8 +24,6 @@ limitations under the License. */
#include <algorithm>
#include <iterator>
#include <glog/logging.h>
namespace paddle {
namespace framework {
......
......@@ -18,11 +18,11 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>
#endif
#include <glog/logging.h>
#include "paddle/framework/ddim.h"
#include "paddle/framework/mixed_vector.h"
#include "paddle/framework/tensor.h"
#include "paddle/framework/tensor_util.h"
#include "paddle/platform/enforce.h"
......@@ -31,15 +31,6 @@ limitations under the License. */
namespace paddle {
namespace framework {
#ifndef PADDLE_WITH_CUDA
template <typename T>
using Vector = std::vector<T>;
#else
template <typename T>
using Vector = thrust::host_vector<
T, thrust::system::cuda::experimental::pinned_allocator<T>>;
#endif
/*
* LoD is short for Level of Details.
*
......@@ -55,7 +46,15 @@ using Vector = thrust::host_vector<
* 0 2 4 7
* 0 2 5 7 10 12 15 20
*/
using LoD = std::vector<Vector<size_t>>;
struct LoD : public std::vector<Vector<size_t>> {
using std::vector<Vector<size_t>>::vector;
void CopyFromCUDA() {
for (auto it = this->begin(); it != this->end(); ++it) {
it->CopyFromCUDA();
}
}
};
std::ostream& operator<<(std::ostream& os, const LoD& lod);
std::ostream& operator<<(std::ostream& os, const LoDTensor& t);
......@@ -109,7 +108,10 @@ bool CheckAbsLoD(const LoD& in, int tensor_height = -1);
*/
class LoDTensor : public Tensor {
public:
LoDTensor() {}
LoDTensor() : Tensor() {}
/* Constructor with place should only be used in pybind */
explicit LoDTensor(const platform::Place& place) : Tensor(place) {}
explicit LoDTensor(const LoD& lod) : lod_(lod) {}
......
......@@ -23,6 +23,17 @@
namespace paddle {
namespace framework {
TEST(LoD, data) {
LoD lod{{0, 1, 2}};
lod.push_back({0, 2, 4, 5});
lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
auto& v = lod[0];
for (size_t i = 0; i < v.size(); ++i) {
EXPECT_EQ(v[i], i);
}
}
TEST(LodExpand, test) {
LoD lod{{0, 2}};
LoDTensor tensor;
......
......@@ -14,6 +14,8 @@
#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include "paddle/framework/init.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/platform/assert.h"
......@@ -26,7 +28,48 @@ __global__ void test(size_t* a, int size) {
}
}
TEST(Vector, Normal) {
using namespace paddle::framework;
using namespace paddle::platform;
using namespace paddle::memory;
paddle::framework::InitDevices();
paddle::framework::Vector<size_t> vec({1, 2, 3});
size_t* ptr = vec.data();
for (size_t i = 0; i < vec.size(); ++i) {
EXPECT_EQ(vec[i], *(ptr + i));
}
vec.clear();
vec.CopyFromCUDA();
std::vector<size_t> v = {1, 2, 3};
for (size_t i = 0; i < v.size(); ++i) {
EXPECT_EQ(v[i], vec[i]);
}
}
TEST(LoD, data) {
paddle::framework::InitDevices();
paddle::framework::LoD lod{{0, 1, 2}};
lod.push_back({0, 2, 4, 5});
lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
auto& v = lod[0];
test<<<1, 1>>>(v.cuda_data(), v.size());
cudaDeviceSynchronize();
v.CopyFromCUDA();
for (size_t i = 0; i < v.size(); ++i) {
EXPECT_EQ(v[i], i * 2);
}
}
TEST(LoDTensor, LoDInGPU) {
paddle::framework::InitDevices();
paddle::framework::LoDTensor lod_tensor;
paddle::platform::CUDAPlace place(0);
......@@ -42,8 +85,9 @@ TEST(LoDTensor, LoDInGPU) {
auto lod = lod_tensor.lod();
test<<<1, 8>>>(lod[0].data(), lod[0].size());
test<<<1, 8>>>(lod[0].cuda_data(), lod[0].size());
cudaDeviceSynchronize();
lod.CopyFromCUDA();
for (size_t i = 0; i < src_lod[0].size(); ++i) {
EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2);
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <initializer_list>
#include <vector>
#include "paddle/memory/memcpy.h"
#include "paddle/memory/memory.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/enforce.h"
#include "paddle/platform/place.h"
namespace paddle {
namespace framework {
/**
* @brief Vector support both cpu and gpu.
* host vector lifetime is same with Vector
* device vector is lazily malloc and modified.
*/
template <typename T>
class Vector : public std::vector<T> {
public:
/* NOTE(dzhwinter):
* Data always store and modified on Host.
* If the data is modified when use cuda_data interface,
* You need to call the CopyFromCUDA explicitly to synchronize data.
*
*/
enum class kDataPosition {
kDataOnHost = 0,
kDataOnDevice = 1,
};
public:
using std::vector<T>::vector;
Vector() {}
Vector(const std::vector<T> &v) : std::vector<T>(v) {} // NOLINT
virtual ~Vector() {
#ifdef PADDLE_WITH_CUDA
if (cuda_ptr_ != nullptr) {
memory::Free<platform::CUDAPlace>(place_, static_cast<void *>(cuda_ptr_));
}
#endif
}
T *cuda_data() {
CopyToCUDA();
PADDLE_ENFORCE_NOT_NULL(
cuda_ptr_, "No data or Insufficient CUDA memory to allocation");
return static_cast<T *>(cuda_ptr_);
}
T *data() { return std::vector<T>::data(); }
const T *data() const { return std::vector<T>::data(); }
void CopyToCUDA();
void CopyFromCUDA();
void CopyToPeer(platform::Place);
private:
void *cuda_ptr_ = nullptr;
size_t cuda_size_ = 0;
/*The DataPosition is unused now,
if we want support random access from cpu and cuda,
we need to overload all the vector method */
kDataPosition position_ = kDataPosition::kDataOnHost;
platform::CUDAPlace place_;
};
template <typename T>
void Vector<T>::CopyToCUDA() {
#ifdef PADDLE_WITH_CUDA
if (cuda_ptr_ == nullptr) {
cuda_ptr_ =
memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T));
}
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *cuda_ctx = pool.GetByPlace(place_);
memory::Copy(place_, static_cast<void *>(cuda_ptr_), platform::CPUPlace(),
static_cast<const void *>(this->data()),
this->size() * sizeof(T), cuda_ctx->stream());
cuda_ctx->Wait();
cuda_size_ = this->size();
#endif
}
template <typename T>
void Vector<T>::CopyFromCUDA() {
#ifdef PADDLE_WITH_CUDA
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *cuda_ctx = pool.GetByPlace(place_);
if (cuda_ptr_ == nullptr) {
LOG(WARNING) << "No uncommited cuda data.";
return;
}
this->resize(cuda_size_);
memory::Copy(platform::CPUPlace(), static_cast<void *>(this->data()), place_,
static_cast<const void *>(cuda_ptr_), this->size() * sizeof(T),
cuda_ctx->stream());
cuda_ctx->Wait();
#endif
}
template <typename T>
void Vector<T>::CopyToPeer(platform::Place peer_place) {
if (platform::is_cpu_place(peer_place)) {
return;
}
#ifdef PADDLE_WITH_CUDA
auto *cuda_ctx = platform::DeviceContextPool::Instance().GetByPlace(place_);
void *peer_cuda_ptr_ = memory::Alloc<platform::CUDAPlace>(
boost::get<platform::CUDAPlace>(peer_place), this->size() * sizeof(T));
memory::Copy(boost::get<platform::CUDAPlace>(peer_place),
static_cast<void *>(peer_cuda_ptr_), place_,
static_cast<const void *>(cuda_ptr_), this->size() * sizeof(T),
cuda_ctx->stream());
cuda_ctx->Wait();
memory::Free<platform::CUDAPlace>(place_, static_cast<void *>(cuda_ptr_));
place_ = boost::get<platform::CUDAPlace>(peer_place);
cuda_ptr_ = peer_cuda_ptr_;
#endif
}
template class Vector<int>;
template class Vector<unsigned>;
template class Vector<size_t>;
template class Vector<int64_t>;
} // namespace framework
} // namespace paddle
......@@ -22,9 +22,7 @@ limitations under the License. */
#include "paddle/framework/shape_inference.h"
#include "paddle/framework/var_type.h"
DEFINE_bool(op_sync, false,
"Default cuda is asynchronous device, set to True will"
"force op run in synchronous mode.");
DECLARE_bool(benchmark);
namespace paddle {
namespace framework {
......@@ -531,7 +529,7 @@ void OperatorWithKernel::Run(const Scope& scope,
ExecutionContext(*this, new_scope, *new_dev_ctx));
/*For profiling/benchmark only*/
if (FLAGS_op_sync) {
if (FLAGS_benchmark) {
new_dev_ctx->Wait();
}
}
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/framework/program_desc.h"
#include "paddle/framework/block_desc.h"
#include "paddle/framework/feed_fetch_type.h"
namespace paddle {
namespace framework {
......@@ -64,5 +65,27 @@ ProgramDesc::ProgramDesc(const std::string &binary_str) {
}
}
const std::vector<std::string> ProgramDesc::GetFeedTargetNames() {
BlockDesc *global_block = blocks_[0].get();
std::vector<std::string> feed_target_names;
for (auto *op : global_block->AllOps()) {
if (op->Type() == kFeedOpType) {
feed_target_names.insert(feed_target_names.begin(), op->Output("Out")[0]);
}
}
return feed_target_names;
}
const std::vector<std::string> ProgramDesc::GetFetchTargetNames() {
BlockDesc *global_block = blocks_[0].get();
std::vector<std::string> fetch_target_names;
for (auto *op : global_block->AllOps()) {
if (op->Type() == kFetchOpType) {
fetch_target_names.push_back(op->Input("X")[0]);
}
}
return fetch_target_names;
}
} // namespace framework
} // namespace paddle
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <memory>
#include <vector>
#include "paddle/framework/block_desc.h"
#include "paddle/framework/framework.pb.h"
#include "paddle/framework/proto_desc.h"
#include "paddle/platform/macros.h"
......@@ -45,6 +46,9 @@ class ProgramDesc {
proto::ProgramDesc *Proto();
const std::vector<std::string> GetFeedTargetNames();
const std::vector<std::string> GetFetchTargetNames();
private:
proto::ProgramDesc desc_;
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <algorithm>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>
#include <glog/logging.h>
......@@ -102,6 +103,32 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
*op_field->Add() = input.blocks(block_id).ops(i);
}
}
// remove the VarDescs in BlockDesc that are not referenced in
// the pruned OpDescs
std::unordered_map<std::string, proto::VarDesc> var_map;
auto* var_field = output->mutable_blocks(block_id)->mutable_vars();
for (const auto& var : *var_field) {
var_map[var.name()] = var;
}
var_field->Clear();
for (const auto& op : *op_field) {
// add VarDescs of all input arguments for each OpDesc
auto& input_field = op.inputs();
for (auto& input_var : input_field) {
for (auto& arg : input_var.arguments()) {
*var_field->Add() = var_map[arg];
}
}
// add VarDescs of all output arguments for each OpDesc
auto& output_field = op.outputs();
for (auto& output_var : output_field) {
for (auto& arg : output_var.arguments()) {
*var_field->Add() = var_map[arg];
}
}
}
}
// TODO(fengjiayi): Prune() could be inplaced to avoid unnecessary copies
......
......@@ -20,9 +20,11 @@ limitations under the License. */
#include "paddle/framework/threadpool.h"
#include "paddle/string/printf.h"
DEFINE_bool(do_memory_benchmark, false,
DEFINE_bool(benchmark, false,
"Doing memory benchmark. It will make deleting scope synchronized, "
"and add some memory usage logs");
"and add some memory usage logs."
"Default cuda is asynchronous device, set to True will"
"force op run in synchronous mode.");
namespace paddle {
namespace framework {
......@@ -93,7 +95,7 @@ void Scope::DeleteScope(Scope* scope) {
PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
this->kids_.erase(it);
// When making memory benchmark on Fluid, we have to delete scope sync.
if (FLAGS_do_memory_benchmark) {
if (FLAGS_benchmark) {
delete scope;
} else {
Async([scope] { delete scope; });
......
......@@ -47,6 +47,11 @@ class Tensor {
public:
Tensor() : offset_(0) {}
/*! Constructor with place should only be used in pybind. */
explicit Tensor(const platform::Place& place) : offset_(0) {
holder_->set_place(place);
}
/*! Return a pointer to mutable memory block. */
template <typename T>
inline T* data();
......@@ -137,6 +142,7 @@ class Tensor {
virtual std::type_index type() const = 0;
virtual platform::Place place() const = 0;
virtual void set_type(std::type_index type) = 0;
virtual void set_place(platform::Place place) = 0;
};
template <typename Place>
......@@ -156,6 +162,7 @@ class Tensor {
virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
virtual std::type_index type() const { return type_; }
virtual void set_type(std::type_index type) { type_ = type; }
virtual void set_place(platform::Place place) { place_ = place; }
/*! the pointer of memory block. */
std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t, Place>> ptr_;
......
......@@ -14,6 +14,8 @@
#include "paddle/framework/threadpool.h"
#include "paddle/platform/enforce.h"
namespace paddle {
namespace framework {
......
......@@ -22,7 +22,7 @@ limitations under the License. */
#include <thread>
#include <vector>
#include "paddle/platform/enforce.h"
#include "paddle/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
namespace paddle {
namespace framework {
......
set(FLUID_CORE_MODULES proto_desc paddle_memory executor prune init)
set(FLUID_CORE_MODULES proto_desc paddle_memory lod_tensor executor prune init)
cc_library(paddle_fluid_api
SRCS inference.cc
SRCS io.cc
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
# Merge all modules into a single static library
cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
# Create shared library
add_library(paddle_fluid_shared SHARED inference.cc)
add_library(paddle_fluid_shared SHARED io.cc)
target_circle_link_libraries(paddle_fluid_shared
ARCHIVE_START
......@@ -20,23 +20,10 @@ SET_TARGET_PROPERTIES(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
# install library & headers
if(NOT WITH_C_API AND WITH_FLUID)
install(FILES inference.h DESTINATION include/paddle/inference)
install(FILES io.h DESTINATION include/paddle/inference)
install(TARGETS paddle_fluid_shared DESTINATION lib)
endif()
add_executable(example example.cc)
if(APPLE)
set(OPTIONAL_LINK_FLAGS)
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
set(OPTIONAL_LINK_FLAGS "-undefined dynamic_lookup")
endif()
target_link_libraries(example
-Wl,-force_load paddle_fluid
${OPTIONAL_LINK_FLAGS}
${PTOOLS_LIB})
else()
target_link_libraries(example
-Wl,--start-group -Wl,--whole-archive paddle_fluid
-Wl,--no-whole-archive -Wl,--end-group
${PTOOLS_LIB})
if(WITH_TESTING)
add_subdirectory(tests/book)
endif()
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <time.h>
#include <iostream>
#include "gflags/gflags.h"
#include "paddle/inference/inference.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
int main(int argc, char** argv) {
google::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_dirname.empty()) {
// Example:
// ./example --dirname=recognize_digits_mlp.inference.model
std::cout << "Usage: ./example --dirname=path/to/your/model" << std::endl;
exit(1);
}
std::cout << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
std::string dirname = FLAGS_dirname;
paddle::InferenceEngine* engine = new paddle::InferenceEngine();
engine->LoadInferenceModel(dirname);
paddle::framework::LoDTensor input;
srand(time(0));
float* input_ptr =
input.mutable_data<float>({1, 784}, paddle::platform::CPUPlace());
for (int i = 0; i < 784; ++i) {
input_ptr[i] = rand() / (static_cast<float>(RAND_MAX));
}
std::vector<paddle::framework::LoDTensor> feeds;
feeds.push_back(input);
std::vector<paddle::framework::LoDTensor> fetchs;
engine->Execute(feeds, fetchs);
for (size_t i = 0; i < fetchs.size(); ++i) {
auto dims_i = fetchs[i].dims();
std::cout << "dims_i:";
for (int j = 0; j < dims_i.size(); ++j) {
std::cout << " " << dims_i[j];
}
std::cout << std::endl;
std::cout << "result:";
float* output_ptr = fetchs[i].data<float>();
for (int j = 0; j < paddle::framework::product(dims_i); ++j) {
std::cout << " " << output_ptr[j];
}
std::cout << std::endl;
}
delete engine;
return 0;
}
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -12,48 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "inference.h"
#include "paddle/inference/io.h"
#include <fstream>
#include "paddle/framework/executor.h"
#include "paddle/framework/init.h"
#include "paddle/framework/scope.h"
#include "paddle/framework/block_desc.h"
#include "paddle/framework/feed_fetch_type.h"
namespace paddle {
namespace inference {
void InferenceEngine::LoadInferenceModel(const std::string& dirname) {
std::string model_filename = dirname + "/__model__";
LOG(INFO) << "loading model from " << model_filename;
std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
std::string program_desc_str;
inputfs.seekg(0, std::ios::end);
program_desc_str.resize(inputfs.tellg());
inputfs.seekg(0, std::ios::beg);
LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
inputfs.read(&program_desc_str[0], program_desc_str.size());
inputfs.close();
program_ = new framework::ProgramDesc(program_desc_str);
GenerateLoadProgram(dirname);
framework::BlockDesc* global_block = program_->MutableBlock(0);
feed_var_names_.clear();
fetch_var_names_.clear();
for (auto* op : global_block->AllOps()) {
if (op->Type() == "feed") {
feed_var_names_.insert(feed_var_names_.begin(), op->Output("Out")[0]);
} else if (op->Type() == "fetch") {
fetch_var_names_.push_back(op->Input("X")[0]);
}
}
}
bool InferenceEngine::IsParameter(const framework::VarDesc* var) {
bool IsParameter(const framework::VarDesc* var,
const framework::ProgramDesc& main_program) {
if (var->Persistable()) {
// There are many unreachable variables in the program
for (size_t i = 0; i < program_->Size(); ++i) {
const framework::BlockDesc& block = program_->Block(i);
for (size_t i = 0; i < main_program.Size(); ++i) {
const framework::BlockDesc& block = main_program.Block(i);
for (auto* op : block.AllOps()) {
if (op->Type() == "feed") {
if (op->Type() == framework::kFeedOpType) {
continue;
}
for (auto input_argument_name : op->InputArgumentNames()) {
......@@ -67,14 +42,17 @@ bool InferenceEngine::IsParameter(const framework::VarDesc* var) {
return false;
}
void InferenceEngine::GenerateLoadProgram(const std::string& dirname) {
framework::BlockDesc* global_block = program_->MutableBlock(0);
void LoadPersistables(framework::Executor& executor,
framework::Scope& scope,
const std::string& dirname,
const framework::ProgramDesc& main_program) {
const framework::BlockDesc& global_block = main_program.Block(0);
load_program_ = new framework::ProgramDesc();
framework::BlockDesc* load_block = load_program_->MutableBlock(0);
for (auto* var : global_block->AllVars()) {
if (IsParameter(var)) {
LOG(INFO) << "parameter's name: " << var->Name();
framework::ProgramDesc* load_program = new framework::ProgramDesc();
framework::BlockDesc* load_block = load_program->MutableBlock(0);
for (auto* var : global_block.AllVars()) {
if (IsParameter(var, main_program)) {
VLOG(3) << "parameter's name: " << var->Name();
framework::VarDesc* new_var = load_block->Var(var->Name());
new_var->SetShape(var->Shape());
......@@ -91,97 +69,30 @@ void InferenceEngine::GenerateLoadProgram(const std::string& dirname) {
op->CheckAttrs();
}
}
executor.Run(*load_program, &scope, 0, true, true);
delete load_program;
}
void InferenceEngine::PrependFeedOp() {
if (!program_) {
LOG(FATAL) << "Please initialize the program_ first.";
}
framework::BlockDesc* global_block = program_->MutableBlock(0);
// create_var
framework::VarDesc* feed_var = global_block->Var("feed");
feed_var->SetType(framework::proto::VarDesc::FEED_MINIBATCH);
feed_var->SetPersistable(true);
// prepend feed_op
for (size_t i = 0; i < feed_var_names_.size(); ++i) {
std::string var_name = feed_var_names_[i];
LOG(INFO) << "feed var's name: " << var_name;
// prepend_op
framework::OpDesc* op = global_block->PrependOp();
op->SetType("feed");
op->SetInput("X", {"feed"});
op->SetOutput("Out", {var_name});
op->SetAttr("col", {static_cast<int>(i)});
op->CheckAttrs();
}
}
void InferenceEngine::AppendFetchOp() {
if (!program_) {
LOG(FATAL) << "Please initialize the program_ first.";
}
framework::BlockDesc* global_block = program_->MutableBlock(0);
// create_var
framework::VarDesc* fetch_var = global_block->Var("fetch");
fetch_var->SetType(framework::proto::VarDesc::FETCH_LIST);
fetch_var->SetPersistable(true);
std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
framework::Scope& scope,
const std::string& dirname) {
std::string model_filename = dirname + "/__model__";
LOG(INFO) << "loading model from " << model_filename;
std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
std::string program_desc_str;
inputfs.seekg(0, std::ios::end);
program_desc_str.resize(inputfs.tellg());
inputfs.seekg(0, std::ios::beg);
LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
inputfs.read(&program_desc_str[0], program_desc_str.size());
inputfs.close();
// append fetch_op
for (size_t i = 0; i < fetch_var_names_.size(); ++i) {
std::string var_name = fetch_var_names_[i];
LOG(INFO) << "fetch var's name: " << var_name;
std::unique_ptr<framework::ProgramDesc> main_program(
new framework::ProgramDesc(program_desc_str));
// append_op
framework::OpDesc* op = global_block->AppendOp();
op->SetType("fetch");
op->SetInput("X", {var_name});
op->SetOutput("Out", {"fetch"});
op->SetAttr("col", {static_cast<int>(i)});
op->CheckAttrs();
}
LoadPersistables(executor, scope, dirname, *main_program);
return main_program;
}
void InferenceEngine::Execute(const std::vector<framework::LoDTensor>& feeds,
std::vector<framework::LoDTensor>& fetchs) {
if (!program_ || !load_program_) {
LOG(FATAL) << "Please initialize the program_ and load_program_ first.";
}
if (feeds.size() != feed_var_names_.size()) {
LOG(FATAL) << "Please feed " << feed_var_names_.size() << " input Tensors.";
}
auto* place = new platform::CPUPlace();
framework::InitDevices();
framework::Executor* executor = new framework::Executor(*place);
framework::Scope* scope = new framework::Scope();
executor->Run(*load_program_, scope, 0, true, true);
std::map<std::string, const framework::LoDTensor*> feed_targets;
std::map<std::string, framework::LoDTensor*> fetch_targets;
// set_feed_variable
for (size_t i = 0; i < feed_var_names_.size(); ++i) {
feed_targets[feed_var_names_[i]] = &feeds[i];
}
// get_fetch_variable
fetchs.resize(fetch_var_names_.size());
for (size_t i = 0; i < fetch_var_names_.size(); ++i) {
fetch_targets[fetch_var_names_[i]] = &fetchs[i];
}
executor->Run(*program_, scope, feed_targets, fetch_targets);
delete place;
delete scope;
delete executor;
}
} // namespace inference
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "paddle/framework/executor.h"
#include "paddle/framework/program_desc.h"
#include "paddle/framework/scope.h"
namespace paddle {
namespace inference {
void LoadPersistables(framework::Executor& executor,
framework::Scope& scope,
const std::string& dirname,
const framework::ProgramDesc& main_program);
std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
framework::Scope& scope,
const std::string& dirname);
} // namespace inference
} // namespace paddle
set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests)
cc_test(test_inference_recognize_digits_mlp
SRCS test_inference_recognize_digits.cc
DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
ARGS --dirname=${PYTHON_TESTS_DIR}/book/recognize_digits_mlp.inference.model)
set_tests_properties(test_inference_recognize_digits_mlp
PROPERTIES DEPENDS test_recognize_digits_mlp_cpu)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <time.h>
#include <sstream>
#include "gflags/gflags.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/inference/io.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
template <typename Place, typename T>
void TestInference(const std::string& dirname,
const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
// 1. Define place, executor and scope
auto place = Place();
auto executor = paddle::framework::Executor(place);
auto* scope = new paddle::framework::Scope();
// 2. Initialize the inference_program and load all parameters from file
auto inference_program = paddle::inference::Load(executor, *scope, dirname);
// 3. Get the feed_target_names and fetch_target_names
const std::vector<std::string>& feed_target_names =
inference_program->GetFeedTargetNames();
const std::vector<std::string>& fetch_target_names =
inference_program->GetFetchTargetNames();
// 4. Prepare inputs: set up maps for feed targets
std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
for (size_t i = 0; i < feed_target_names.size(); ++i) {
// Please make sure that cpu_feeds[i] is right for feed_target_names[i]
feed_targets[feed_target_names[i]] = cpu_feeds[i];
}
// 5. Define Tensor to get the outputs: set up maps for fetch targets
std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
for (size_t i = 0; i < fetch_target_names.size(); ++i) {
fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
}
// 6. Run the inference program
executor.Run(*inference_program, scope, feed_targets, fetch_targets);
delete scope;
}
TEST(inference, recognize_digits) {
if (FLAGS_dirname.empty()) {
LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
}
LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
std::string dirname = FLAGS_dirname;
// 0. Call `paddle::framework::InitDevices()` initialize all the devices
// In unittests, this is done in paddle/testing/paddle_gtest_main.cc
paddle::framework::LoDTensor input;
srand(time(0));
float* input_ptr =
input.mutable_data<float>({1, 28, 28}, paddle::platform::CPUPlace());
for (int i = 0; i < 784; ++i) {
input_ptr[i] = rand() / (static_cast<float>(RAND_MAX));
}
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input);
paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
TestInference<paddle::platform::CPUPlace, float>(
dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace, float>(
dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.dims();
EXPECT_EQ(output1.dims(), output2.dims());
EXPECT_EQ(output1.numel(), output2.numel());
float err = 1E-3;
int count = 0;
for (int64_t i = 0; i < output1.numel(); ++i) {
if (fabs(output1.data<float>()[i] - output2.data<float>()[i]) > err) {
count++;
}
}
EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
#endif
}
......@@ -175,6 +175,8 @@ endif()
# FIXME(typhoonzero): save/load depends lodtensor serialization functions
op_library(save_op DEPS lod_tensor)
op_library(load_op DEPS lod_tensor)
op_library(save_combine_op DEPS lod_tensor)
op_library(load_combine_op DEPS lod_tensor)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS})
......@@ -194,3 +196,4 @@ if(WITH_GPU)
cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif()
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
......@@ -82,7 +82,7 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
math::scatter::MergeAdd<platform::CUDADeviceContext, T> merge_func;
auto grad_merge = merge_func(context, grad);
auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
auto& merge_rows = grad_merge.rows();
framework::Vector<int64_t> merge_rows(grad_merge.rows());
// 2. m += g_m * g_m
math::scatter::Mul<platform::CUDADeviceContext, T> sqare_func;
auto grad_square = sqare_func(context, grad_merge, grad_merge);
......@@ -101,8 +101,8 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
SparseAdagradFunctorKernel<
T, 256><<<grid2, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(grad_merge_data, grad_merge.rows().data(),
lr, param_data, moment_data, grad_width,
.stream()>>>(grad_merge_data, merge_rows.cuda_data(), lr,
param_data, moment_data, grad_width,
epsilon);
}
};
......
......@@ -199,7 +199,12 @@ class AdamOpKernel : public framework::OpKernel<T> {
merge_func(ctx.template device_context<DeviceContext>(), grad);
auto& grad_tensor = grad_merge.value();
const T* grad_data = grad_tensor.template data<T>();
auto* rows = grad_merge.rows().data();
int64_t* rows = nullptr;
if (platform::is_gpu_place(ctx.GetPlace())) {
rows = grad_merge.mutable_rows()->cuda_data();
} else {
rows = grad_merge.mutable_rows()->data();
}
auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
SparseAdamFunctor<T> functor(
......
......@@ -69,12 +69,11 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
auto stream = ctx.cuda_device_context().stream();
MergeAndDelCudaKernel<T><<<1, 1, 0, stream>>>(
num_tokens, tokens, num_seq, input_lod[level].data(), blank,
num_tokens, tokens, num_seq, input_lod[level].cuda_data(), blank,
merge_repeated, dev_out_lod0_ptr, output_data);
// set output lod
thrust::host_vector<size_t> host_out_lod0(dev_out_lod0.begin(),
dev_out_lod0.end());
std::vector<size_t> host_out_lod0(dev_out_lod0.begin(), dev_out_lod0.end());
framework::LoD out_lod;
out_lod.push_back(host_out_lod0);
output->set_lod(out_lod);
......
......@@ -51,6 +51,13 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
"'dropout_prob' must be between 0.0 and 1.0.");
});
AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
AddAttr<bool>("fix_seed",
"A flag indicating whether to use a fixed seed to generate "
"random mask. NOTE: DO NOT set this flag to true in "
"training. Setting this flag to true is only useful in "
"unittest or for debug that always the same output units "
"will be dropped.")
.SetDefault(false);
AddAttr<int>("seed", "Dropout random seed.").SetDefault(0);
AddComment(R"DOC(
......
......@@ -62,7 +62,11 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
auto* mask = context.Output<Tensor>("Mask");
auto* mask_data = mask->mutable_data<T>(context.GetPlace());
int size = framework::product(mask->dims());
int seed = context.Attr<int>("seed");
std::random_device rnd;
int seed =
context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
thrust::counting_iterator<unsigned int> index_sequence_begin(0);
thrust::transform(index_sequence_begin, index_sequence_begin + size,
thrust::device_ptr<T>(mask_data),
......
......@@ -38,9 +38,15 @@ class CPUDropoutKernel : public framework::OpKernel<T> {
if (!context.Attr<bool>("is_test")) {
auto* mask = context.Output<Tensor>("Mask");
auto* mask_data = mask->mutable_data<T>(context.GetPlace());
int seed = context.Attr<int>("seed");
// NOTE: fixed seed should only be used in unittest or for debug.
// Guarantee to use random seed in training.
std::random_device rnd;
std::minstd_rand engine;
int seed =
context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
engine.seed(seed);
std::uniform_real_distribution<float> dist(0, 1);
size_t size = framework::product(mask->dims());
for (size_t i = 0; i < size; ++i) {
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/elementwise_pow_op.h"
#include "paddle/operators/elementwise_op.h"
namespace paddle {
namespace operators {
class ElementwisePowOpMaker : public ElementwiseOpMaker {
public:
ElementwisePowOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: ElementwiseOpMaker(proto, op_checker) {
SetComment("Pow", "Out = X ^ Y");
AddComment(comment_);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(elementwise_pow, ops::ElementwiseOp,
ops::ElementwisePowOpMaker);
REGISTER_OP_CPU_KERNEL(
elementwise_pow,
ops::ElementwisePowKernel<paddle::platform::CPUDeviceContext, float>,
ops::ElementwisePowKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/operators/elementwise_pow_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
elementwise_pow,
ops::ElementwisePowKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwisePowKernel<paddle::platform::CUDADeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cmath>
#include "paddle/operators/elementwise_op_function.h"
namespace paddle {
namespace operators {
template <typename T>
struct PowFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { return std::pow(a, b); }
};
template <typename DeviceContext, typename T>
class ElementwisePowKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
ElementwiseComputeEx<PowFunctor<T>, DeviceContext, T>(ctx);
}
};
} // namespace operators
} // namespace paddle
......@@ -30,11 +30,12 @@ using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
inline void ReorderInitState(const DeviceContext& ctx,
const framework::Tensor& src, const size_t* index,
const framework::Tensor& src,
framework::Vector<size_t> index_lod,
framework::Tensor* dst, bool indexed_src) {
math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
dst->mutable_data<T>(src.dims(), ctx.GetPlace());
row_shuffle(ctx, src, index, *dst, indexed_src);
row_shuffle(ctx, src, index_lod, *dst, indexed_src);
}
template <typename DeviceContext, typename T>
......@@ -76,7 +77,9 @@ class GRUKernel : public framework::OpKernel<T> {
gru_value.state_weight =
const_cast<T*>(weight_data + 2 * frame_size * frame_size);
Tensor ordered_h0;
const size_t* order = batch_gate->lod()[2].data();
framework::Vector<size_t> order(batch_gate->lod()[2]);
if (h0) {
// Since the batch computing for GRU reorders the input sequences
// according to their length. The initialized cell state also needs
......@@ -159,7 +162,9 @@ class GRUGradKernel : public framework::OpKernel<T> {
zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast<T>(0.0));
Tensor ordered_h0, ordered_h0_grad;
const size_t* order = batch_gate->lod()[2].data();
framework::Vector<size_t> order(batch_gate->lod()[2]);
if (h0) {
ReorderInitState<DeviceContext, T>(dev_ctx, *h0, order, &ordered_h0,
true);
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/label_smooth_op.h"
namespace paddle {
namespace operators {
class LabelSmoothOp : public framework::OperatorWithKernel {
public:
LabelSmoothOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of LabelSmoothOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of LabelSmoothOp should not be null.");
auto in_dims = ctx->GetInputDim("X");
if (ctx->HasInput("PriorDist")) {
auto noise_dims = ctx->GetInputDim("PriorDist");
auto noise_numel = paddle::framework::product(noise_dims);
PADDLE_ENFORCE(
in_dims[1] == noise_numel,
"The number of elements in Input(PriorDist) must be equal to the "
"dimension of each label.");
}
ctx->ShareLoD("X", /*->*/ "Out");
ctx->SetOutputDim("Out", in_dims);
}
};
class LabelSmoothOpMaker : public framework::OpProtoAndCheckerMaker {
public:
LabelSmoothOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X",
"(LoDTensor) The input labels of LabelSmooth operator. This "
"input can be batched labels in one-hot encoding or output from "
"softmax, with shape [N x K], where N is the batch size and K is "
"the number of classes");
AddInput("PriorDist",
"(Tensor, optional)"
"The prior distribution to be added to the smoothed label. It is "
"fixed during training and the number of elements should be equal "
"to the dimension K of each label. Default is uniform "
"distribution and each element will be set to 1/K if not provided "
"in input.")
.AsDispensable();
AddOutput("Out",
"(loDTensor) The smoothed label of LabelSmooth operator. It has"
"the same shape and LoD with the Input(LoDTensor).");
AddAttr<float>("epsilon",
"(float, default 0.0f)"
"The smoothing parameter of LabelSmooth operator.")
.SetDefault(0.0f);
AddComment(R"DOC(
LabelSmooth Operator.
Label smoothing is a mechanism to regularize the classifier layer. In machine
learning, optimizing the log-likelihood of the correct label directly may
cause two problems. First, it may result in overfitting: if the model learns
to assign full probability to the ground-truth label for each training example,
it is not guaranteed to generalize. Second, it encourages the differences
between the largest logit and all others to become large, reducing the ability
of the model to adapt. Label smoothing is proposed to encourage the model to
be less confident, which replaces the ground-truth label $y$ with the weighted
sum of itself and some fixed distribution $\mu$, i.e.
$$
\tilde{y} = (1 - \epsilon) * y + \epsilon * \mu,
$$
where $(1 - \epsilon)$ and $\epsilon$ are the weights respectively, and
$\tilde{y}$ is the smoothed label. Usually uniform distribution is used for
$\mu$. This change in the ground-truth label is called label-smoothing
regularization or LSR.
See more details about label smoothing in https://arxiv.org/abs/1512.00567.
)DOC");
}
};
class LabelSmoothGradOp : public framework::OperatorWithKernel {
public:
LabelSmoothGradOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@GRAD) shouldn't be null.");
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker,
label_smooth_grad, ops::LabelSmoothGradOp);
REGISTER_OP_CPU_KERNEL(
label_smooth,
ops::LabelSmoothKernel<paddle::platform::CPUDeviceContext, float>,
ops::LabelSmoothKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
label_smooth_grad,
ops::LabelSmoothGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::LabelSmoothGradKernel<paddle::platform::CPUDeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/label_smooth_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
label_smooth,
ops::LabelSmoothKernel<paddle::platform::CUDADeviceContext, float>,
ops::LabelSmoothKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
label_smooth_grad,
ops::LabelSmoothGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::LabelSmoothGradKernel<paddle::platform::CUDADeviceContext, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class LabelSmoothKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto* out_t = ctx.Output<framework::LoDTensor>("Out");
auto* in_t = ctx.Input<framework::LoDTensor>("X");
auto* dist_t = ctx.Input<framework::Tensor>("PriorDist");
auto label_dim = in_t->dims()[1];
out_t->mutable_data<T>(ctx.GetPlace());
auto epsilon = ctx.Attr<float>("epsilon");
auto out = framework::EigenVector<T>::Flatten(*out_t);
auto in = framework::EigenVector<T>::Flatten(*in_t);
auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
if (dist_t) {
auto dist = framework::EigenVector<T>::Flatten(*dist_t);
out.device(dev) =
static_cast<T>(1 - epsilon) * in +
epsilon * dist.broadcast(Eigen::DSizes<int, 1>(in_t->numel()));
} else {
out.device(dev) = static_cast<T>(1 - epsilon) * in +
static_cast<T>(epsilon / label_dim);
}
}
};
template <typename DeviceContext, typename T>
class LabelSmoothGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
auto* d_out_t = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* d_in_t = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
d_in_t->mutable_data<T>(ctx.GetPlace());
auto d_out = framework::EigenVector<T>::Flatten(*d_out_t);
auto d_in = framework::EigenVector<T>::Flatten(*d_in_t);
auto epsilon = ctx.Attr<float>("epsilon");
auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
d_in.device(dev) = static_cast<T>(1 - epsilon) * d_out;
}
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/layer_norm_op.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using DataLayout = framework::DataLayout;
template <typename T>
using EigenMatrixMapRowMajor = Eigen::Map<
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
template <typename T>
using ConstEigenMatrixMapRowMajor = Eigen::Map<
const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
class LayerNormOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of LayerNormOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Y"),
"Output(Y) of LayerNormOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Mean"),
"Output(Mean) of LayerNormOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Variance"),
"Output(Variance) of LayerNormOp should not be null.");
auto x_dim = ctx->GetInputDim("X");
auto begin_norm_axis = ctx->Attrs().Get<int>("begin_norm_axis");
PADDLE_ENFORCE_LT(begin_norm_axis, x_dim.size(),
"'begin_norm_axis' must be less than the rank of X.");
auto matrix_dim = framework::flatten_to_2d(x_dim, begin_norm_axis);
int left = static_cast<int>(matrix_dim[0]);
int right = static_cast<int>(matrix_dim[1]);
if (ctx->HasInput("Scale")) {
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], right);
}
if (ctx->HasInput("Bias")) {
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], right);
}
ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
ctx->SetOutputDim("Mean", {left});
ctx->SetOutputDim("Variance", {left});
ctx->ShareLoD("X", "Y");
}
};
class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
public:
LayerNormOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "(LoDTensor) The input tensor.");
AddInput("Scale",
"(Tensor, optional) Scale is a 1-dimensional tensor of size "
"H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
"It is applied to the output.")
.AsDispensable();
AddInput("Bias",
"(Tensor, optional) Bias is a 1-dimensional tensor of size "
"H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
"It is applied to the output.")
.AsDispensable();
AddOutput("Y", "(LoDTensor) Result after normalization.");
AddOutput("Mean", "(Tensor) Mean of the current mini batch.")
.AsIntermediate();
AddOutput("Variance", "(Tensor) Variance of the current mini batch.")
.AsIntermediate();
AddAttr<float>("epsilon",
"(float, default 1e-5) Constant for "
"numerical stability")
.SetDefault(1e-5)
.AddCustomChecker([](const float &epsilon) {
PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
"'epsilon' should be between 0.0 and 0.001.");
});
AddAttr<int>("begin_norm_axis",
"(int default:1), the "
"axis of `begin_norm_axis ... Rank(X) - 1` will be "
"normalized. `begin_norm_axis` splits the tensor(`X`) to a "
"matrix [N,H].")
.SetDefault(1)
.AddCustomChecker([](const int &begin_norm_axis) {
PADDLE_ENFORCE_GT(begin_norm_axis, 0,
"'begin_norm_axis' should be greater than zero.");
});
AddComment(R"DOC(
Layer Normalization.
Layer Norm has been implemented as discussed in the paper:
https://arxiv.org/abs/1607.06450
...
)DOC");
}
};
template <typename T>
class LayerNormKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon");
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *bias = ctx.Input<Tensor>("Bias");
const auto *x = ctx.Input<Tensor>("X");
const auto &x_dims = x->dims();
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
auto *output = ctx.Output<Tensor>("Y");
auto *mean = ctx.Output<Tensor>("Mean");
auto *var = ctx.Output<Tensor>("Variance");
output->mutable_data<T>(ctx.GetPlace());
mean->mutable_data<T>(ctx.GetPlace());
var->mutable_data<T>(ctx.GetPlace());
auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
int left = static_cast<int>(matrix_dim[0]);
int right = static_cast<int>(matrix_dim[1]);
auto input_map = ConstEigenMatrixMapRowMajor<T>(x->data<T>(), left, right);
auto mean_map = EigenMatrixMapRowMajor<T>(mean->data<T>(), left, 1);
auto var_map = EigenMatrixMapRowMajor<T>(var->data<T>(), left, 1);
auto output_map = EigenMatrixMapRowMajor<T>(output->data<T>(), left, right);
auto squre = [](T ele) { return ele * ele; };
auto add_epslion = [epsilon](T ele) { return ele + epsilon; };
mean_map = input_map.rowwise().mean();
var_map = (input_map - mean_map.replicate(1, right))
.unaryExpr(squre)
.rowwise()
.mean()
.unaryExpr(add_epslion);
auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
// TODO(zcd): Some thinking about output_map, is it appropriate that
// `output_map` and `input_map` point to the same memory.
auto inv_std = var_map.unaryExpr(inv_std_func);
if (scale && bias) {
auto scale_map =
ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
auto bias_map = ConstEigenMatrixMapRowMajor<T>(bias->data<T>(), 1, right);
output_map = (input_map - mean_map.replicate(1, right))
.cwiseProduct(inv_std.replicate(1, right))
.cwiseProduct(scale_map.replicate(left, 1)) +
bias_map.replicate(left, 1);
} else if (scale) {
auto scale_map =
ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
output_map = (input_map - mean_map.replicate(1, right))
.cwiseProduct(inv_std.replicate(1, right))
.cwiseProduct(scale_map.replicate(left, 1));
} else if (bias) {
auto bias_map = ConstEigenMatrixMapRowMajor<T>(bias->data<T>(), 1, right);
output_map = (input_map - mean_map.replicate(1, right))
.cwiseProduct(inv_std.replicate(1, right)) +
bias_map.replicate(left, 1);
} else {
output_map = (input_map - mean_map.replicate(1, right))
.cwiseProduct(inv_std.replicate(1, right));
}
}
};
class LayerNormGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
// check input
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of LayerNormOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Scale"),
"Input(Scale) of LayerNormOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Mean"),
"Input(Mean) of LayerNormOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Variance"),
"Input(Variance) of LayerNormOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
"Input(Y@GRAD) of LayerNormOp should not be null.");
// check output
if (ctx->HasOutput(framework::GradVarName("X"))) {
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
}
if (ctx->HasOutput(framework::GradVarName("Scale"))) {
ctx->SetOutputDim(framework::GradVarName("Scale"),
ctx->GetInputDim("Scale"));
}
if (ctx->HasOutput(framework::GradVarName("Bias"))) {
ctx->SetOutputDim(framework::GradVarName("Bias"),
ctx->GetInputDim("Bias"));
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
const auto *var = ctx.InputVar(framework::GradVarName("Y"));
if (var == nullptr) {
PADDLE_THROW("can't find Y@GRAD");
}
const Tensor *t = nullptr;
if (var->IsType<Tensor>()) {
t = &var->Get<Tensor>();
} else if (var->IsType<LoDTensor>()) {
t = &var->Get<LoDTensor>();
}
if (t == nullptr) {
PADDLE_THROW("can't find Y@GRAD");
}
return framework::OpKernelType(framework::ToDataType(t->type()),
ctx.GetPlace());
}
};
template <typename T>
class LayerNormGradKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *x = ctx.Input<Tensor>("X");
const auto *mean = ctx.Input<Tensor>("Mean");
const auto *var = ctx.Input<Tensor>("Variance");
const auto *scale = ctx.Input<Tensor>("Scale");
const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
const auto &x_dims = x->dims();
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
int left = static_cast<int>(matrix_dim[0]);
int right = static_cast<int>(matrix_dim[1]);
// init output
auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
auto x_map = ConstEigenMatrixMapRowMajor<T>(x->data<T>(), left, right);
auto d_y_map = ConstEigenMatrixMapRowMajor<T>(d_y->data<T>(), left, right);
auto mean_map = ConstEigenMatrixMapRowMajor<T>(mean->data<T>(), left, 1);
auto var_map = ConstEigenMatrixMapRowMajor<T>(var->data<T>(), left, 1);
if (d_bias) {
d_bias->mutable_data<T>(ctx.GetPlace());
auto d_bias_map = EigenMatrixMapRowMajor<T>(d_bias->data<T>(), 1, right);
d_bias_map = d_y_map.colwise().sum();
}
if (d_scale) {
d_scale->mutable_data<T>(ctx.GetPlace());
auto d_scale_map =
EigenMatrixMapRowMajor<T>(d_scale->data<T>(), 1, right);
auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
// There are two equation to compute d_scale. One uses "Y" and the other
// does not use "Y"
d_scale_map =
((x_map - mean_map.replicate(1, right))
.cwiseProduct(
var_map.unaryExpr(inv_std_func).replicate(1, right))
.cwiseProduct(d_y_map))
.colwise()
.sum();
}
if (d_x) {
d_x->mutable_data<T>(ctx.GetPlace());
auto d_x_map = EigenMatrixMapRowMajor<T>(d_x->data<T>(), left, right);
auto triple_product_func = [](T ele) { return ele * ele * ele; };
auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
// TODO(zcd): these code can be refined
if (d_scale) {
auto scale_map =
ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
// dy_dx
auto dx_end = var_map.unaryExpr(inv_std_func)
.replicate(1, right)
.cwiseProduct(d_y_map)
.cwiseProduct(scale_map.replicate(left, 1));
// dy_dmean_dx
auto dx_mean = (T(-1.0) / right) *
var_map.unaryExpr(inv_std_func)
.replicate(1, right)
.cwiseProduct(d_y_map)
.cwiseProduct(scale_map.replicate(left, 1))
.rowwise()
.sum()
.replicate(1, right);
// dy_var_dx
auto dvar_end_part = (x_map - mean_map.replicate(1, right))
.cwiseProduct(scale_map.replicate(left, 1))
.cwiseProduct(d_y_map)
.rowwise()
.sum();
auto dvar_end = var_map.unaryExpr(inv_std_func)
.unaryExpr(triple_product_func)
.cwiseProduct(dvar_end_part)
.replicate(1, right);
auto dx_var =
(T(-1.0) / right) *
(x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end);
d_x_map = dx_end + dx_mean + dx_var;
} else {
// dy_dx
auto dx_end = var_map.unaryExpr(inv_std_func)
.replicate(1, right)
.cwiseProduct(d_y_map);
// dy_dmean_dx
auto dx_mean = (T(-1.0) / right) *
var_map.unaryExpr(inv_std_func)
.replicate(1, right)
.cwiseProduct(d_y_map)
.rowwise()
.sum()
.replicate(1, right);
// dy_var_dx
auto dvar_end_part = (x_map - mean_map.replicate(1, right))
.cwiseProduct(d_y_map)
.rowwise()
.sum();
auto dvar_end = var_map.unaryExpr(inv_std_func)
.unaryExpr(triple_product_func)
.cwiseProduct(dvar_end_part)
.replicate(1, right);
auto dx_var =
(T(-1.0) / right) *
(x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end);
d_x_map = dx_end + dx_mean + dx_var;
}
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker,
layer_norm_grad, ops::LayerNormGradOp);
REGISTER_OP_CPU_KERNEL(
layer_norm,
ops::LayerNormKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
layer_norm_grad,
ops::LayerNormGradKernel<paddle::platform::CPUDeviceContext, float>);
......@@ -13,36 +13,23 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/block_desc.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/program_desc.h"
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class LayerNormKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override;
};
class InferenceEngine {
public:
InferenceEngine() : program_(nullptr), load_program_(nullptr) {}
~InferenceEngine() {
delete program_;
delete load_program_;
}
void LoadInferenceModel(const std::string& dirname);
void Execute(const std::vector<framework::LoDTensor>& feeds,
std::vector<framework::LoDTensor>& fetchs);
private:
bool IsParameter(const framework::VarDesc* var);
void GenerateLoadProgram(const std::string& dirname);
void PrependFeedOp();
void AppendFetchOp();
private:
framework::ProgramDesc* program_;
framework::ProgramDesc* load_program_;
std::vector<std::string> feed_var_names_;
std::vector<std::string> fetch_var_names_;
template <typename DeviceContext, typename T>
class LayerNormGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override;
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include "paddle/framework/op_registry.h"
#include "paddle/platform/device_context.h"
namespace paddle {
namespace operators {
class LoadCombineOp : public framework::OperatorBase {
public:
LoadCombineOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope,
const platform::Place &place) const override {
auto filename = Attr<std::string>("file_path");
std::ifstream fin(filename);
PADDLE_ENFORCE(static_cast<bool>(fin),
"Cannot open file %s for load_combine op", filename);
auto out_var_names = Outputs("Out");
PADDLE_ENFORCE_GT(
static_cast<int>(out_var_names.size()), 0,
"The number of output variables should be greater than 0.");
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
for (size_t i = 0; i < out_var_names.size(); i++) {
auto *out_var = scope.FindVar(out_var_names[i]);
PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found",
out_var_names[i]);
auto *tensor = out_var->GetMutable<framework::LoDTensor>();
// Error checking
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot read more from file %s",
filename);
// Get data from fin to tensor
DeserializeFromStream(fin, tensor, dev_ctx);
if (platform::is_gpu_place(place)) {
// copy CPU to GPU
framework::LoDTensor cpu_tensor;
cpu_tensor.ShareDataWith(*tensor);
cpu_tensor.set_lod(tensor->lod());
// reset tensor
out_var->Clear();
tensor = out_var->GetMutable<framework::LoDTensor>();
tensor->set_lod(cpu_tensor.lod());
Copy(cpu_tensor, place, dev_ctx, tensor);
}
}
}
};
class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public:
LoadCombineOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddOutput(
"Out",
"(vector) The output LoDTensors that will be read from the input file.")
.AsDuplicable();
AddAttr<std::string>("file_path",
"(string) "
"LoDTensors will be loaded from \"file_path\".")
.AddCustomChecker(
[](const std::string &path) { return !path.empty(); });
AddComment(R"DOC(
LoadCombine Operator.
LoadCombine operator loads LoDTensor variables from a file. The file should
contain one or more LoDTensors serialized using the SaveCombine operator. The
LoadCombine operator applies a deserialization strategy to appropriately load
the LodTensors, and this strategy complements the serialization strategy used
in the SaveCombine operator. Hence, the LoadCombine operator is tightly coupled
with the SaveCombine operator, and can only deserialize one or more LoDTensors
that were saved using the SaveCombine operator.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(load_combine, ops::LoadCombineOp,
ops::LoadCombineOpProtoMaker);
......@@ -125,8 +125,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
new_rows.resize(ids_dim[0]);
auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
memory::Copy(platform::CPUPlace(), new_rows.data(), gpu_place, ids_data,
ids_dim[0] * sizeof(int64_t), stream);
memory::Copy(platform::CPUPlace(), new_rows.cuda_data(), gpu_place,
ids_data, ids_dim[0] * sizeof(int64_t), stream);
d_table->set_rows(new_rows);
......
......@@ -27,11 +27,12 @@ using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
inline void ReorderInitState(const DeviceContext& ctx,
const framework::Tensor& src, const size_t* index,
const framework::Tensor& src,
framework::Vector<size_t> index_lod,
framework::Tensor* dst, bool indexed_src) {
math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
dst->mutable_data<T>(src.dims(), ctx.GetPlace());
row_shuffle(ctx, src, index, *dst, indexed_src);
row_shuffle(ctx, src, index_lod, *dst, indexed_src);
}
template <typename DeviceContext, typename T>
......@@ -84,7 +85,9 @@ class LSTMKernel : public framework::OpKernel<T> {
}
lstm_value.prev_state_value = nullptr;
Tensor ordered_c0;
const size_t* order = batch_gate->lod()[2].data();
framework::Vector<size_t> order(batch_gate->lod()[2]);
if (cell_t0) {
// Since the batch computing for LSTM reorders the input sequence
// according to their length. The initialized cell state also needs
......@@ -202,7 +205,8 @@ class LSTMGradKernel : public framework::OpKernel<T> {
// ordered_h0_g/c0_g is the reordered gradient of hidden/cell
// initialization.
Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g;
const size_t* order = batch_gate->lod()[2].data();
framework::Vector<size_t> order(batch_gate->lod()[2]);
if (c0) {
ReorderInitState<DeviceContext, T>(device_ctx, *c0, order, &ordered_c0,
true);
......
......@@ -34,7 +34,8 @@ using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename DeviceContext, typename T>
inline void ReorderInitState(const DeviceContext& ctx,
const framework::Tensor& src, const size_t* index,
const framework::Tensor& src,
framework::Vector<size_t> index,
framework::Tensor* dst, bool indexed_src) {
math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
dst->mutable_data<T>(src.dims(), ctx.GetPlace());
......@@ -109,7 +110,9 @@ class LSTMPKernel : public framework::OpKernel<T> {
}
lstmp_value.prev_state_value = nullptr;
Tensor ordered_c0;
const size_t* order = batch_gate->lod()[2].data();
framework::Vector<size_t> order(batch_gate->lod()[2]);
if (cell_t0) {
// Since the batch computing for LSTMP reorders the input sequence
// according to their length. The initialized cell state also needs
......@@ -275,7 +278,9 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
// ordered_h0_g/c0_g is the reordered gradient of hidden/cell
// initialization.
Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g;
const size_t* order = batch_gate->lod()[2].data();
framework::Vector<size_t> order(batch_gate->lod()[2]);
if (c0) {
ReorderInitState<DeviceContext, T>(device_ctx, *c0, order, &ordered_c0,
true);
......
......@@ -31,7 +31,7 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
PADDLE_ENFORCE_EQ(in1_height, input2.height());
output->set_height(in1_height);
auto& in1_rows = input1.rows();
framework::Vector<int64_t> in1_rows(input1.rows());
auto& in2_rows = input2.rows();
std::vector<int64_t> out_rows;
out_rows.reserve(in1_rows.size() + in2_rows.size());
......@@ -108,7 +108,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
PADDLE_ENFORCE_EQ(in1_height, out_dims[0]);
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
framework::Vector<int64_t> in1_rows(input1.rows());
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height);
......@@ -126,7 +126,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
dim3 grid(1, in1_rows.size());
SelectedRowsAddTensorKernel<
T, block_size><<<grid, threads, 0, context.stream()>>>(
in1_data, in1_rows.data(), out_data, in1_row_numel);
in1_data, in1_rows.cuda_data(), out_data, in1_row_numel);
auto out_eigen = framework::EigenVector<T>::Flatten(*output);
auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
......@@ -146,7 +146,7 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2->height());
auto& in1_rows = input1.rows();
framework::Vector<int64_t> in1_rows(input1.rows());
auto& in2_rows = *(input2->mutable_rows());
auto& in1_value = input1.value();
......@@ -204,7 +204,7 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
framework::Vector<int64_t> in1_rows(input1.rows());
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
......@@ -216,7 +216,7 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
dim3 grid(1, in1_rows.size());
SelectedRowsAddToTensorKernel<
T, block_size><<<grid, threads, 0, context.stream()>>>(
in1_data, in1_rows.data(), in2_data, in1_row_numel);
in1_data, in1_rows.cuda_data(), in2_data, in1_row_numel);
}
};
......@@ -257,7 +257,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
framework::SelectedRows operator()(const platform::CUDADeviceContext& context,
const framework::SelectedRows& input) {
framework::SelectedRows out;
auto input_rows = input.rows();
framework::Vector<int64_t> input_rows(input.rows());
std::set<int64_t> row_set(input_rows.begin(), input_rows.end());
std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
......@@ -283,9 +283,9 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
MergeAddKernel<
T, 256><<<grid1, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(input_data, input.rows().data(), out_data,
out.rows().data(), out.rows().size(),
input_width);
.stream()>>>(input_data, input_rows.cuda_data(), out_data,
out.mutable_rows()->cuda_data(),
out.rows().size(), input_width);
return out;
}
};
......@@ -370,8 +370,8 @@ struct UpdateToTensor<platform::CUDADeviceContext, T> {
dim3 threads(platform::PADDLE_CUDA_NUM_THREADS, 1);
dim3 grid(1, in1_rows.size());
UpdateToTensorKernel<T, platform::PADDLE_CUDA_NUM_THREADS><<<
grid, threads, 0, context.stream()>>>(in1_data, in1_rows.data(), op,
in2_data, in1_row_numel);
grid, threads, 0, context.stream()>>>(in1_data, in1_rows.cuda_data(),
op, in2_data, in1_row_numel);
}
};
} // namespace scatter
......
......@@ -23,8 +23,10 @@ template <typename T>
class CopyMatrixRowsFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& src, const size_t* index,
framework::Tensor& dst, bool is_src_index) {
const framework::Tensor& src,
framework::Vector<size_t> index_lod, framework::Tensor& dst,
bool is_src_index) {
size_t* index = index_lod.data();
auto src_dims = src.dims();
auto dst_dims = dst.dims();
PADDLE_ENFORCE_EQ(src_dims.size(), 2UL,
......
......@@ -42,8 +42,10 @@ template <typename T>
class CopyMatrixRowsFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& src, const size_t* index,
framework::Tensor& dst, bool is_src_index) {
const framework::Tensor& src,
framework::Vector<size_t> index_lod, framework::Tensor& dst,
bool is_src_index) {
size_t* index = index_lod.cuda_data();
auto src_dims = src.dims();
auto dst_dims = dst.dims();
PADDLE_ENFORCE_EQ(src_dims.size(), 2,
......
......@@ -35,7 +35,7 @@ class CopyMatrixRowsFunctor {
// copy the input src to the indexed rows of output dst.
// The indexed rows are based on the input index.
void operator()(const DeviceContext& context, const framework::Tensor& src,
const size_t* index, framework::Tensor& dst,
framework::Vector<size_t> index_lod, framework::Tensor& dst,
bool is_src_index);
};
......@@ -66,7 +66,7 @@ class LoDTensor2BatchFunctor {
PADDLE_ENFORCE_EQ(lods[1].size(),
static_cast<size_t>(lod_tensor.dims()[0]));
CopyMatrixRowsFunctor<DeviceContext, T> to_batch;
to_batch(context, lod_tensor, lods[1].data(), batch, true);
to_batch(context, lod_tensor, lods[1], batch, true);
return;
}
......@@ -144,7 +144,7 @@ class LoDTensor2BatchFunctor {
batch.set_lod(batch_lods);
CopyMatrixRowsFunctor<DeviceContext, T> to_batch;
to_batch(context, lod_tensor, seq2batch_idx, batch, true);
to_batch(context, lod_tensor, batch_lods[1], batch, true);
}
};
......@@ -159,8 +159,7 @@ class Batch2LoDTensorFunctor {
PADDLE_ENFORCE_EQ(in_lod[1].size(),
static_cast<size_t>(lod_tensor.dims()[0]));
CopyMatrixRowsFunctor<DeviceContext, T> to_seq;
size_t* index = in_lod[1].data();
to_seq(context, batch, index, lod_tensor, false);
to_seq(context, batch, in_lod[1], lod_tensor, false);
}
};
......
......@@ -120,12 +120,14 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
T* padding_data = padding.data<T>();
if (norm_by_times) {
SequencePaddingKernel<T, 1, 1><<<grid, threads, 0, context.stream()>>>(
padding_data, const_cast<T*>(seq_data), abs_offset_lod[level].data(),
sequence_width, max_sequence_length, num_sequences);
padding_data, const_cast<T*>(seq_data),
abs_offset_lod[level].cuda_data(), sequence_width,
max_sequence_length, num_sequences);
} else {
SequencePaddingKernel<T, 0, 1><<<grid, threads, 0, context.stream()>>>(
padding_data, const_cast<T*>(seq_data), abs_offset_lod[level].data(),
sequence_width, max_sequence_length, num_sequences);
padding_data, const_cast<T*>(seq_data),
abs_offset_lod[level].cuda_data(), sequence_width,
max_sequence_length, num_sequences);
}
}
};
......@@ -193,12 +195,14 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
T* seq_data = seq.data<T>();
if (norm_by_times) {
SequencePaddingKernel<T, 1, 0><<<grid, threads, 0, context.stream()>>>(
const_cast<T*>(padding_data), seq_data, abs_offset_lod[level].data(),
sequence_width, max_sequence_length, num_sequences);
const_cast<T*>(padding_data), seq_data,
abs_offset_lod[level].cuda_data(), sequence_width,
max_sequence_length, num_sequences);
} else {
SequencePaddingKernel<T, 0, 0><<<grid, threads, 0, context.stream()>>>(
const_cast<T*>(padding_data), seq_data, abs_offset_lod[level].data(),
sequence_width, max_sequence_length, num_sequences);
const_cast<T*>(padding_data), seq_data,
abs_offset_lod[level].cuda_data(), sequence_width,
max_sequence_length, num_sequences);
}
}
};
......
......@@ -73,7 +73,7 @@ class MaxSeqPoolFunctor<platform::CUDADeviceContext, T> {
dim3 grid(num_seq, 1);
auto stream = context.stream();
KeMaxSequencePool<T><<<grid, threads, 0, stream>>>(
in_data, starts.data(), out_data, max_index, num_seq, dim);
in_data, starts.cuda_data(), out_data, max_index, num_seq, dim);
}
};
......
......@@ -46,7 +46,7 @@ class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> {
SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS><<<
num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>(
seq_data, abs_offset_lod[level].data(), scales, seq_width);
seq_data, abs_offset_lod[level].cuda_data(), scales, seq_width);
}
};
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/reduce_op.h"
#include "paddle/operators/net_op.h"
namespace paddle {
namespace operators {
......@@ -38,10 +37,14 @@ class ReduceOp : public framework::OperatorWithKernel {
dim, x_rank,
"The dim should be in the range [-rank(input), rank(input)).");
bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
if (reduce_all) {
ctx->SetOutputDim("Out", {1});
if (keep_dim)
ctx->SetOutputDim(
"Out", framework::make_ddim(std::vector<int64_t>(x_rank, 1)));
else
ctx->SetOutputDim("Out", {1});
} else {
bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
auto dims_vector = vectorize(x_dims);
if (keep_dim || x_rank == 1) {
dims_vector[dim] = 1;
......
......@@ -307,7 +307,7 @@ class RowConvKernel<platform::CUDADeviceContext, T>
int input_dim = X->dims()[1];
int num_sequence = batch_indices.size() - 1;
int future_context = Filter->dims()[0];
size_t *idx = batch_indices.data();
size_t *idx = batch_indices.cuda_data();
auto stream = context.cuda_device_context().stream();
if (future_context <= 32) {
......@@ -345,7 +345,7 @@ class RowConvGradKernel<platform::CUDADeviceContext, T>
int input_dim = X->dims()[1];
int num_sequence = batch_indices.size() - 1;
int future_context = Filter->dims()[0];
size_t *idx = batch_indices.data();
size_t *idx = batch_indices.cuda_data();
auto &device_ctx = context.cuda_device_context();
math::SetConstant<platform::CUDADeviceContext, T> zero;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stdint.h>
#include <sys/stat.h>
#include <fstream>
#include <numeric>
#include <sstream>
#include "paddle/framework/data_type.h"
#include "paddle/framework/framework.pb.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/op_registry.h"
#include "paddle/platform/device_context.h"
namespace paddle {
namespace operators {
// TODO(sidgoyal78): These function are needed by other files (save_op), move
// them to paddle::filesystem namespace. (as noted by yuyang18 in save_op).
constexpr char kSEP = '/';
static bool FileExists(const std::string &filepath) {
struct stat buffer;
return (stat(filepath.c_str(), &buffer) == 0);
}
static std::string DirName(const std::string &filepath) {
auto pos = filepath.rfind(kSEP);
if (pos == std::string::npos) {
return "";
}
return filepath.substr(0, pos);
}
static void MkDir(const char *path) {
if (mkdir(path, 0755)) {
PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path);
}
}
static void MkDirRecursively(const char *fullpath) {
if (*fullpath == '\0') return; // empty string
if (FileExists(fullpath)) return;
MkDirRecursively(DirName(fullpath).c_str());
MkDir(fullpath);
}
class SaveCombineOp : public framework::OperatorBase {
public:
SaveCombineOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope,
const platform::Place &place) const override {
auto filename = Attr<std::string>("file_path");
auto overwrite = Attr<bool>("overwrite");
bool is_present = FileExists(filename);
if (is_present && !overwrite) {
PADDLE_THROW("%s exists!, cannot save_combine to it when overwrite=false",
filename, overwrite);
}
MkDirRecursively(DirName(filename).c_str());
std::ofstream fout(filename);
PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
filename);
auto inp_var_names = Inputs("X");
PADDLE_ENFORCE_GT(static_cast<int>(inp_var_names.size()), 0,
"The number of input variables should be greater than 0");
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
for (size_t i = 0; i < inp_var_names.size(); i++) {
auto *var = scope.FindVar(inp_var_names[i]);
PADDLE_ENFORCE(var != nullptr,
"Cannot find variable %s for save_combine_op",
inp_var_names[i]);
PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
"SaveCombineOp only supports LoDTensor, %s has wrong type",
inp_var_names[i]);
auto &tensor = var->Get<framework::LoDTensor>();
// Serialize tensor
framework::SerializeToStream(fout, tensor, dev_ctx);
}
fout.close();
}
};
class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public:
SaveCombineOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(
"X",
"(vector) Input LoDTensors that need to be saved together in a file.")
.AsDuplicable();
AddComment(R"DOC(
SaveCombine operator
This operator will serialize and write a list of input LoDTensor variables
to a file on disk.
)DOC");
AddAttr<bool>("overwrite",
"(boolean, default true)"
"Overwrite the output file if it exists.")
.SetDefault(true);
AddAttr<std::string>(
"file_path",
"(string)"
"The \"file_path\" where the LoDTensor variables will be saved.")
.AddCustomChecker(
[](const std::string &path) { return !path.empty(); });
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(save_combine, ops::SaveCombineOp,
ops::SaveCombineOpProtoMaker);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include <string>
#include <vector>
#include "gtest/gtest.h"
#include "paddle/framework/op_registry.h"
USE_NO_KERNEL_OP(save_combine);
USE_NO_KERNEL_OP(load_combine);
int* CreateForSaveCombineOp(int x, int y, const std::vector<int>& lod_info,
std::string var_name,
paddle::platform::CPUPlace& place,
paddle::framework::Scope& scope,
paddle::framework::LoD& expect_lod) {
auto var = scope.Var(var_name);
auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
tensor->Resize({x, y});
expect_lod.resize(1);
for (size_t i = 0; i < lod_info.size(); i++) {
expect_lod[0].push_back(lod_info[i]);
}
tensor->set_lod(expect_lod);
int* expect = tensor->mutable_data<int>(place);
for (int64_t i = 0; i < tensor->numel(); ++i) {
expect[i] = static_cast<int>(i);
}
return expect;
}
paddle::framework::LoDTensor* GeneratePlaceholderBeforeLoad(
const std::string out_var_name, paddle::framework::Scope& scope) {
auto load_var = scope.Var(out_var_name);
auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
return target;
}
int* GetValuesAfterLoadCombineOp(paddle::framework::LoDTensor* target,
paddle::framework::Scope& scope,
paddle::framework::LoD& actual_lod) {
int* actual = target->data<int>();
actual_lod = target->lod();
return actual;
}
void CheckValues(int* expect, int* actual, paddle::framework::LoD expect_lod,
paddle::framework::LoD actual_lod, const int& numel) {
for (int64_t i = 0; i < numel; ++i) {
EXPECT_EQ(expect[i], actual[i]);
}
EXPECT_EQ(expect_lod.size(), actual_lod.size());
for (size_t i = 0; i < expect_lod.size(); ++i) {
for (size_t j = 0; j < expect_lod[i].size(); ++j) {
EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]);
}
}
}
// Here, we create 4 LoDTensors and use save_combine_op to first save these
// in a single file. Then, we use load_combine_op to load these sequentially
TEST(SaveLoadCombineOp, CPU) {
paddle::framework::Scope scope;
paddle::platform::CPUPlace place;
std::vector<int> lod1 = {0, 1, 2, 3, 10};
int numel1 = 100;
paddle::framework::LoD expect_lod1;
int* expect1 = CreateForSaveCombineOp(10, 10, lod1, "test_var1", place, scope,
expect_lod1);
std::vector<int> lod2 = {0, 2, 5, 10};
int numel2 = 200;
paddle::framework::LoD expect_lod2;
int* expect2 = CreateForSaveCombineOp(10, 20, lod2, "test_var2", place, scope,
expect_lod2);
std::vector<int> lod3 = {0, 2, 3, 20};
int numel3 = 4000;
paddle::framework::LoD expect_lod3;
int* expect3 = CreateForSaveCombineOp(20, 200, lod3, "test_var3", place,
scope, expect_lod3);
std::vector<int> lod4 = {0, 1, 20};
int numel4 = 1000;
paddle::framework::LoD expect_lod4;
int* expect4 = CreateForSaveCombineOp(20, 50, lod4, "test_var4", place, scope,
expect_lod4);
// Set attributes
std::string filename = "check_tensor.ls";
paddle::framework::AttributeMap attrs;
attrs.insert({"file_path", std::string(filename)});
// Run the save_combine_op
auto save_combine_op = paddle::framework::OpRegistry::CreateOp(
"save_combine",
{{"X", {"test_var1", "test_var2", "test_var3", "test_var4"}}}, {}, attrs);
save_combine_op->Run(scope, place);
// Set up output vars
auto target1 = GeneratePlaceholderBeforeLoad("out_var1", scope);
auto target2 = GeneratePlaceholderBeforeLoad("out_var2", scope);
auto target3 = GeneratePlaceholderBeforeLoad("out_var3", scope);
auto target4 = GeneratePlaceholderBeforeLoad("out_var4", scope);
// Run the load_combine_op
auto load_combine_op = paddle::framework::OpRegistry::CreateOp(
"load_combine", {},
{{"Out", {"out_var1", "out_var2", "out_var3", "out_var4"}}}, attrs);
load_combine_op->Run(scope, place);
paddle::framework::LoD actual_lod1, actual_lod2, actual_lod3, actual_lod4;
int* actual1 = GetValuesAfterLoadCombineOp(target1, scope, actual_lod1);
int* actual2 = GetValuesAfterLoadCombineOp(target2, scope, actual_lod2);
int* actual3 = GetValuesAfterLoadCombineOp(target3, scope, actual_lod3);
int* actual4 = GetValuesAfterLoadCombineOp(target4, scope, actual_lod4);
CheckValues(expect1, actual1, expect_lod1, actual_lod1, numel1);
CheckValues(expect2, actual2, expect_lod2, actual_lod2, numel2);
CheckValues(expect3, actual3, expect_lod3, actual_lod3, numel3);
CheckValues(expect4, actual4, expect_lod4, actual_lod4, numel4);
}
// Test with original SaveLoadTest
TEST(SaveLoadTestWithCombineOp, CPU) {
paddle::framework::Scope scope;
paddle::platform::CPUPlace place;
auto var = scope.Var("test_var");
auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
tensor->Resize({3, 10});
paddle::framework::LoD expect_lod;
expect_lod.resize(1);
expect_lod[0].push_back(0);
expect_lod[0].push_back(1);
expect_lod[0].push_back(2);
expect_lod[0].push_back(3);
tensor->set_lod(expect_lod);
int* expect = tensor->mutable_data<int>(place);
for (int64_t i = 0; i < tensor->numel(); ++i) {
expect[i] = static_cast<int>(i);
}
paddle::framework::AttributeMap attrs;
attrs.insert({"file_path", std::string("check_t.save")});
auto save_op = paddle::framework::OpRegistry::CreateOp(
"save_combine", {{"X", {"test_var"}}}, {}, attrs);
save_op->Run(scope, place);
auto load_var = scope.Var("out_var");
auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
auto load_op = paddle::framework::OpRegistry::CreateOp(
"load_combine", {}, {{"Out", {"out_var"}}}, attrs);
load_op->Run(scope, place);
int* actual = target->data<int>();
for (int64_t i = 0; i < tensor->numel(); ++i) {
EXPECT_EQ(expect[i], actual[i]);
}
auto& actual_lod = target->lod();
EXPECT_EQ(expect_lod.size(), actual_lod.size());
for (size_t i = 0; i < expect_lod.size(); ++i) {
for (size_t j = 0; j < expect_lod[i].size(); ++j) {
EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]);
}
}
}
......@@ -24,7 +24,7 @@ TEST(SaveLoadOp, CPU) {
auto var = scope.Var("test_var");
auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
tensor->Resize({10, 10});
tensor->Resize({3, 10});
paddle::framework::LoD expect_lod;
expect_lod.resize(1);
expect_lod[0].push_back(0);
......
......@@ -42,17 +42,25 @@ class SendOp : public framework::OperatorBase {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
auto client_var_name = Output("RPCClient");
PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
"Can not find variable '%s' in the scope.",
client_var_name);
auto* client_var = scope.FindVar(client_var_name);
detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
for (size_t i = 0; i < ins.size(); i++) {
VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
client_.AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
}
PADDLE_ENFORCE(client_.Wait());
PADDLE_ENFORCE(rpc_client->Wait());
for (auto& ep : endpoints) {
VLOG(3) << "batch barrier, ep: " << ep;
client_.AsyncSendBatchBarrier(ep);
rpc_client->AsyncSendBatchBarrier(ep);
}
PADDLE_ENFORCE(client_.Wait());
PADDLE_ENFORCE(rpc_client->Wait());
if (outs.size() > 0) {
for (size_t i = 0; i < outs.size(); i++) {
......@@ -62,11 +70,6 @@ class SendOp : public framework::OperatorBase {
PADDLE_ENFORCE(client_.Wait());
}
}
private:
// TODO(typhoonzero): put RPCClient in a Variable, so that
// send and recv can use the same connection.
mutable detail::RPCClient client_;
};
class SendOpMaker : public framework::OpProtoAndCheckerMaker {
......@@ -76,6 +79,9 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X", "(Tensor) Input tensor to be sent").AsDuplicable();
AddOutput("Out", "(Tensor) Output tensor to be received from server")
.AsDuplicable();
AddOutput("RPCClient",
"(RPCClient) The RPC client object which is"
"initialized at most once.");
AddComment(R"DOC(
Send operator
......
......@@ -96,9 +96,8 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> {
GetOutLod<<<(lod_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
num_erased_ptr, dev_in_lod_ptr, lod_len, dev_out_lod_ptr);
// Set LoD for output
thrust::host_vector<size_t> out_lod0 = dev_out_lod;
std::vector<size_t> out_lod0(dev_out_lod.begin(), dev_out_lod.end());
framework::LoD out_lod;
out_lod.push_back(out_lod0);
out->set_lod(out_lod);
......
......@@ -30,8 +30,13 @@ class SequenceReshapeOp : public framework::OperatorWithKernel {
auto x_numel = product(x_dims);
PADDLE_ENFORCE_EQ(x_dims.size(), 2U, "Rank of Input(X) should be 2.");
int new_dim = ctx->Attrs().Get<int>("new_dim");
ctx->SetOutputDim("Out",
{x_numel / new_dim, static_cast<int64_t>(new_dim)});
if (ctx->IsRuntime()) {
ctx->SetOutputDim("Out",
{x_numel / new_dim, static_cast<int64_t>(new_dim)});
} else {
// when compiling, the batch size is undetermined, just set to -1
ctx->SetOutputDim("Out", {-1, static_cast<int64_t>(new_dim)});
}
}
};
......
......@@ -89,7 +89,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
auto& in_value = grad->value();
auto& in_rows = grad->rows();
framework::Vector<int64_t> in_rows(grad->rows());
int64_t in_row_numel = in_value.numel() / in_rows.size();
PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height);
......@@ -102,7 +102,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
dim3 grid(1, in_rows.size());
SparseSGDFunctorKernel<
T, 256><<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
in_data, in_rows.data(), learning_rate->data<T>(), out_data,
in_data, in_rows.cuda_data(), learning_rate->data<T>(), out_data,
in_row_numel);
} else {
......
......@@ -68,7 +68,32 @@ class SumKernel : public framework::OpKernel<T> {
}
}
} else if (out_var->IsType<framework::SelectedRows>()) {
PADDLE_ENFORCE(!in_place, "SelectedRows not support inplace sum now");
std::unique_ptr<framework::SelectedRows> in0;
if (in_place) {
// If is in_place, we store the input[0] to in0
auto &in_sel0 = in_vars[0]->Get<SelectedRows>();
auto &rows = in_sel0.rows();
#ifdef PADDLE_WITH_CUDA
std::vector<int64_t> rows_in_cpu;
rows_in_cpu.reserve(rows.size());
for (auto item : rows) {
rows_in_cpu.push_back(item);
}
in0.reset(new framework::SelectedRows(rows_in_cpu, in_sel0.height()));
#else
in0.reset(new framework::SelectedRows(rows, in_sel0.height()));
#endif
in0->mutable_value()->ShareDataWith(in_sel0.value());
}
auto get_selected_row = [&](size_t i) -> const SelectedRows & {
if (i == 0 && in0) {
return *in0.get();
} else {
return in_vars[i]->Get<SelectedRows>();
}
};
auto *out = context.Output<SelectedRows>("Out");
out->mutable_rows()->clear();
auto *out_value = out->mutable_value();
......@@ -76,24 +101,26 @@ class SumKernel : public framework::OpKernel<T> {
// Runtime InferShape
size_t first_dim = 0;
for (int i = 0; i < N; i++) {
first_dim += in_vars[i]->Get<SelectedRows>().rows().size();
auto &sel_row = get_selected_row(i);
first_dim += sel_row.rows().size();
}
auto in_dim = in_vars[0]->Get<SelectedRows>().value().dims();
auto in_dim_vec = framework::vectorize(in_dim);
in_dim_vec[0] = static_cast<int64_t>(first_dim);
auto in_dim =
framework::vectorize(get_selected_row(N - 1).value().dims());
in_dim[0] = static_cast<int64_t>(first_dim);
out_value->Resize(framework::make_ddim(in_dim_vec));
out_value->Resize(framework::make_ddim(in_dim));
out_value->mutable_data<T>(context.GetPlace());
math::SelectedRowsAddTo<DeviceContext, T> functor;
int64_t offset = 0;
for (int i = 0; i < N; i++) {
PADDLE_ENFORCE_EQ(out->height(),
in_vars[i]->Get<SelectedRows>().height());
functor(context.template device_context<DeviceContext>(),
in_vars[i]->Get<SelectedRows>(), offset, out);
offset += in_vars[i]->Get<SelectedRows>().value().numel();
auto &sel_row = get_selected_row(i);
PADDLE_ENFORCE_EQ(out->height(), sel_row.height());
functor(context.template device_context<DeviceContext>(), sel_row,
offset, out);
offset += sel_row.value().numel();
}
} else if (out_var->IsType<framework::LoDTensorArray>()) {
auto &out_array = *out_var->GetMutable<framework::LoDTensorArray>();
......
......@@ -124,44 +124,25 @@ PYBIND11_PLUGIN(core) {
.def(
"__init__",
[](LoDTensor &instance, const std::vector<std::vector<size_t>> &lod) {
#ifndef PADDLE_WITH_CUDA
new (&instance) LoDTensor(lod);
#else
LoD new_lod;
new_lod.reserve(lod.size());
std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
new (&instance) LoDTensor(new_lod);
#endif
LoD new_lod;
new_lod.reserve(lod.size());
std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
new (&instance) LoDTensor(new_lod);
})
.def("__init__", [](LoDTensor &instance) { new (&instance) LoDTensor(); })
.def("set_lod",
[](LoDTensor &self, const std::vector<std::vector<size_t>> &lod) {
#ifndef PADDLE_WITH_CUDA
self.set_lod(lod);
#else
LoD new_lod;
new_lod.reserve(lod.size());
std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
self.set_lod(new_lod);
#endif
})
.def("lod", [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
#ifndef PADDLE_WITH_CUDA
return self.lod();
#else
auto lod = self.lod();
std::vector<std::vector<size_t>> new_lod;
new_lod.reserve(lod.size());
std::transform(lod.begin(), lod.end(), std::back_inserter(new_lod),
[](Vector<size_t> item) ->
std::vector<size_t> {
std::vector<size_t> v;
v.reserve(item.size());
std::copy(item.begin(), item.end(), std::back_inserter(v));
return v;
});
return new_lod;
#endif
auto lod = self.lod();
std::vector<std::vector<size_t>> new_lod;
new_lod.reserve(lod.size());
std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
return new_lod;
});
py::class_<SelectedRows>(m, "SelectedRows")
......
......@@ -56,7 +56,7 @@ Users can specify the following Docker build arguments with either "ON" or "OFF"
| ------ | -------- | ----------- |
| `WITH_GPU` | OFF | Generates NVIDIA CUDA GPU code and relies on CUDA libraries. |
| `WITH_AVX` | OFF | Set to "ON" to enable AVX support. |
| `WITH_TESTING` | ON | Build unit tests binaries. |
| `WITH_TESTING` | OFF | Build unit tests binaries. |
| `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. |
| `WITH_GOLANG` | ON | Build fault-tolerant parameter server written in go. |
| `WITH_SWIG_PY` | ON | Build with SWIG python API support. |
......
......@@ -22,7 +22,9 @@ limitations under the License. */
int main(int argc, char** argv) {
std::vector<char*> new_argv;
std::string gflags_env;
new_argv.push_back(argv[0]);
for (int i = 0; i < argc; ++i) {
new_argv.push_back(argv[i]);
}
#ifdef PADDLE_WITH_CUDA
new_argv.push_back(
strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory"));
......
......@@ -140,8 +140,13 @@ def init_config_environment(
g_submodel_stack=[],
g_add_submodel_suffix=False, ):
for k, v in locals().iteritems():
globals()[k] = copy.deepcopy(v)
# directly iterate through locals().iteritems() will change
# the size of locals() due to introducing k, v into scope
# which will break the process in some env
local_vars = copy.deepcopy(locals())
for k, v in local_vars.iteritems():
globals()[k] = v
# Because type is widely used as a variable name in this code.
......
......@@ -26,6 +26,7 @@ import initializer
import layers
import nets
import optimizer
import learning_rate_decay
import backward
import regularizer
from param_attr import ParamAttr
......@@ -35,27 +36,16 @@ from distribute_transpiler import DistributeTranspiler
from distribute_transpiler_simple import SimpleDistributeTranspiler
import clip
from memory_optimization_transpiler import memory_optimize
import profiler
Tensor = LoDTensor
__all__ = framework.__all__ + executor.__all__ + [
'io',
'initializer',
'layers',
'nets',
'optimizer',
'backward',
'regularizer',
'LoDTensor',
'CPUPlace',
'CUDAPlace',
'Tensor',
'io', 'initializer', 'layers', 'nets', 'optimizer', 'learning_rate_decay',
'backward', 'regularizer', 'LoDTensor', 'CPUPlace', 'CUDAPlace', 'Tensor',
'ParamAttr'
'DataFeeder',
'clip',
'SimpleDistributeTranspiler',
'DistributeTranspiler',
'memory_optimize',
'DataFeeder', 'clip', 'SimpleDistributeTranspiler', 'DistributeTranspiler',
'memory_optimize', 'profiler'
]
......@@ -86,11 +76,9 @@ def __bootstrap__():
os.environ['OMP_NUM_THREADS'] = str(num_threads)
read_env_flags = [
'use_pinned_memory', 'check_nan_inf', 'do_memory_benchmark'
]
read_env_flags = ['use_pinned_memory', 'check_nan_inf', 'benchmark']
if core.is_compiled_with_cuda():
read_env_flags += ['fraction_of_gpu_memory_to_use', 'op_sync']
read_env_flags += ['fraction_of_gpu_memory_to_use']
core.init_gflags([sys.argv[0]] +
["--tryfromenv=" + ",".join(read_env_flags)])
core.init_glog(sys.argv[0])
......
......@@ -30,6 +30,9 @@ __all__ = [
class BaseErrorClipAttr(object):
def __str__(self):
raise NotImplementedError()
def append_clip_op(self, block, grad_name):
raise NotImplementedError()
......@@ -44,6 +47,9 @@ class ErrorClipByValue(BaseErrorClipAttr):
self.max = max
self.min = min
def __str__(self):
return "ByValue, min=%f, max=%f" % (self.min, self.max)
def append_clip_op(self, block, grad_name):
clip_op_desc = block.desc.append_op()
clip_op_desc.set_type("clip")
......@@ -71,6 +77,9 @@ def error_clip_callback(block, context):
class BaseGradientClipAttr(object):
def __str__(self):
raise NotImplementedError()
def process_context(self, context, param, grad):
raise NotImplementedError()
......@@ -79,6 +88,9 @@ class BaseGradientClipAttr(object):
class NullGradientClipAttr(BaseGradientClipAttr):
def __str__(self):
return "Null"
def process_context(self, context, param, grad):
pass
......@@ -96,6 +108,9 @@ class GradientClipByValue(BaseGradientClipAttr):
self.max = max
self.min = min
def __str__(self):
return "ByValue, min=%f, max=%f" % (self.min, self.max)
def process_context(self, context, param, grad):
pass
......@@ -108,6 +123,9 @@ class GradientClipByNorm(BaseGradientClipAttr):
def __init__(self, clip_norm):
self.clip_norm = clip_norm
def __str__(self):
return "ByNorm, clip_norm=%f" % self.clip_norm
def process_context(self, context, param, grad):
pass
......@@ -124,6 +142,10 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
self.clip_norm = clip_norm
self.group_name = group_name
def __str__(self):
return "ByGlobalNorm, group_name=%s, clip_norm=%f" % (self.group_name,
self.clip_norm)
def process_context(self, context, param, grad):
if self.group_name not in context:
context[self.group_name] = []
......@@ -160,6 +182,17 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
def set_gradient_clip(clip, param_list=None, program=None):
"""
To specify parameters that require gradient clip.
Args:
clip(BaseGradientClipAttr): An instance of some derived class of BaseGradientClipAttr,
which describes the type and detailed attributes of required gradient clip.
param_list(list, None by default): Parameters that require gradient clip.
It can be a list of parameter or a list of parameter's name.
When it's None, all parameters in the program will be included.
program(Program, None by default): The program where parameters are.
Will be the default main program when assigned with None.
"""
if not isinstance(clip, BaseGradientClipAttr):
raise TypeError(
"'clip' should be an instance of BaseGradientClipAttr's derived class"
......@@ -199,3 +232,5 @@ def append_gradient_clip_ops(param_grad):
ClipByValue = GradientClipByValue
ClipByNorm = GradientClipByNorm
ClipByGlobalNorm = GradientClipByGlobalNorm
......@@ -153,11 +153,18 @@ class DistributeTranspiler:
self.param_grad_ep_mapping[ep]["params"].append(param)
self.param_grad_ep_mapping[ep]["grads"].append(grad)
rpc_client_var = program.global_block().create_var(
name="RPC_CLIENT_VAR",
psersistable=True,
dtype='float32', # dtype and shape is not used in fact
shape=[0])
# create send_op
send_op = program.global_block().append_op(
type="send",
inputs={"X": send_inputs},
outputs={"Out": send_outputs},
outputs={"Out": send_outputs,
"RPCClient": rpc_client_var},
attrs={"endpoints": pserver_endpoints,
"epmap": eplist})
# step4
......
此差异已折叠。
......@@ -145,7 +145,9 @@ def monkey_patch_variable():
# a*b == b*a. Do not need to reverse explicitly
("__rmul__", "elementwise_mul", False),
("__div__", "elementwise_div", False),
("__rdiv__", "elementwise_div", True)):
("__rdiv__", "elementwise_div", True),
("__pow__", "elementwise_pow", False),
("__rpow__", "elementwise_pow", True)):
setattr(Variable, method_name,
_elemwise_method_creator_(method_name, op_type, reverse))
......
......@@ -847,7 +847,35 @@ def cos_sim(X, Y, **kwargs):
return out
def dropout(x, dropout_prob, is_test=False, seed=0, **kwargs):
def dropout(x, dropout_prob, is_test=False, seed=None, **kwargs):
"""
Computes dropout.
Drop or keep each element of `x` independently. Dropout is a regularization
technique for reducing overfitting by preventing neuron co-adaption during
training. The dropout operator randomly set (according to the given dropout
probability) the outputs of some units to zero, while others are remain
unchanged.
Args:
x(variable): The input tensor.
dropout_prob(float): Probability of setting units to zero.
is_test(bool): A flag indicating whether it is in test phrase or not.
seed(int): A Python integer used to create random seeds. If this
parameter is set to None, a random seed is used.
NOTE: If an integer seed is given, always the same output
units will be dropped. DO NOT use a fixed seed in training.
Returns:
Variable: A tensor variable.
Examples:
.. code-block:: python
x = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
droped = fluid.layers.dropout(input=x, dropout_rate=0.5)
"""
helper = LayerHelper('dropout', **kwargs)
out = helper.create_tmp_variable(dtype=x.dtype)
mask = helper.create_tmp_variable(dtype=x.dtype, stop_gradient=True)
......@@ -856,9 +884,12 @@ def dropout(x, dropout_prob, is_test=False, seed=0, **kwargs):
inputs={'X': [x]},
outputs={'Out': [out],
'Mask': [mask]},
attrs={'dropout_prob': dropout_prob,
'is_test': is_test,
'seed': seed})
attrs={
'dropout_prob': dropout_prob,
'is_test': is_test,
'fix_seed': seed is not None,
'seed': seed if seed is not None else 0
})
return out
......
......@@ -56,6 +56,7 @@ __all__ = [
'elementwise_mul',
'elementwise_max',
'elementwise_min',
'elementwise_pow',
'clip',
'clip_by_norm',
'sequence_softmax',
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册