Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
0098a494
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
0098a494
编写于
8月 15, 2018
作者:
T
tensor-tang
浏览文件
操作
浏览文件
下载
差异文件
Merge remote-tracking branch 'ups/develop' into refine/op/fc
上级
742300ba
842fb021
变更
46
隐藏空白更改
内联
并排
Showing
46 changed file
with
1317 addition
and
354 deletion
+1317
-354
CMakeLists.txt
CMakeLists.txt
+1
-2
cmake/configure.cmake
cmake/configure.cmake
+8
-0
cmake/external/anakin.cmake
cmake/external/anakin.cmake
+45
-36
cmake/inference_lib.cmake
cmake/inference_lib.cmake
+1
-1
doc/fluid/api/executor.rst
doc/fluid/api/executor.rst
+0
-8
doc/fluid/api/fluid.rst
doc/fluid/api/fluid.rst
+0
-16
paddle/fluid/API.spec
paddle/fluid/API.spec
+5
-20
paddle/fluid/framework/op_desc.cc
paddle/fluid/framework/op_desc.cc
+14
-1
paddle/fluid/framework/op_desc.h
paddle/fluid/framework/op_desc.h
+3
-1
paddle/fluid/framework/program_desc.cc
paddle/fluid/framework/program_desc.cc
+1
-1
paddle/fluid/inference/analysis/analyzer.cc
paddle/fluid/inference/analysis/analyzer.cc
+3
-3
paddle/fluid/inference/api/CMakeLists.txt
paddle/fluid/inference/api/CMakeLists.txt
+12
-17
paddle/fluid/inference/api/paddle_inference_api.h
paddle/fluid/inference/api/paddle_inference_api.h
+1
-1
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+4
-1
paddle/fluid/inference/tensorrt/convert/softmax_op.cc
paddle/fluid/inference/tensorrt/convert/softmax_op.cc
+49
-0
paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc
paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc
+49
-0
paddle/fluid/inference/tensorrt/convert/ut_helper.h
paddle/fluid/inference/tensorrt/convert/ut_helper.h
+17
-7
paddle/fluid/operators/CMakeLists.txt
paddle/fluid/operators/CMakeLists.txt
+7
-2
paddle/fluid/operators/crop_op.cc
paddle/fluid/operators/crop_op.cc
+3
-2
paddle/fluid/operators/crop_op.cu
paddle/fluid/operators/crop_op.cu
+3
-2
paddle/fluid/operators/crop_op.h
paddle/fluid/operators/crop_op.h
+57
-15
paddle/fluid/operators/elementwise_add_op.cu
paddle/fluid/operators/elementwise_add_op.cu
+0
-54
paddle/fluid/operators/elementwise_add_op.h
paddle/fluid/operators/elementwise_add_op.h
+8
-33
paddle/fluid/operators/layer_norm_op.cu
paddle/fluid/operators/layer_norm_op.cu
+505
-1
paddle/fluid/pybind/protobuf.cc
paddle/fluid/pybind/protobuf.cc
+2
-1
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+1
-1
python/paddle/fluid/__init__.py
python/paddle/fluid/__init__.py
+1
-3
python/paddle/fluid/backward.py
python/paddle/fluid/backward.py
+2
-2
python/paddle/fluid/concurrency.py
python/paddle/fluid/concurrency.py
+3
-4
python/paddle/fluid/executor.py
python/paddle/fluid/executor.py
+34
-34
python/paddle/fluid/framework.py
python/paddle/fluid/framework.py
+98
-35
python/paddle/fluid/initializer.py
python/paddle/fluid/initializer.py
+2
-1
python/paddle/fluid/io.py
python/paddle/fluid/io.py
+6
-2
python/paddle/fluid/layers/detection.py
python/paddle/fluid/layers/detection.py
+1
-1
python/paddle/fluid/parallel_executor.py
python/paddle/fluid/parallel_executor.py
+3
-3
python/paddle/fluid/tests/unittests/test_desc_clone.py
python/paddle/fluid/tests/unittests/test_desc_clone.py
+196
-0
python/paddle/fluid/tests/unittests/test_dist_base.py
python/paddle/fluid/tests/unittests/test_dist_base.py
+56
-13
python/paddle/fluid/tests/unittests/test_dist_transpiler.py
python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+31
-1
python/paddle/fluid/tests/unittests/test_fetch_var.py
python/paddle/fluid/tests/unittests/test_fetch_var.py
+1
-1
python/paddle/fluid/tests/unittests/test_program.py
python/paddle/fluid/tests/unittests/test_program.py
+34
-0
python/paddle/fluid/tests/unittests/test_protobuf_descs.py
python/paddle/fluid/tests/unittests/test_protobuf_descs.py
+1
-1
python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py
...n/paddle/fluid/tests/unittests/test_py_reader_push_pop.py
+2
-1
python/paddle/fluid/tests/unittests/transformer_model.py
python/paddle/fluid/tests/unittests/transformer_model.py
+1
-1
python/paddle/fluid/transpiler/distribute_transpiler.py
python/paddle/fluid/transpiler/distribute_transpiler.py
+41
-20
python/paddle/fluid/transpiler/inference_transpiler.py
python/paddle/fluid/transpiler/inference_transpiler.py
+4
-4
python/requirements.txt
python/requirements.txt
+1
-1
未找到文件。
CMakeLists.txt
浏览文件 @
0098a494
...
...
@@ -204,12 +204,11 @@ include(external/snappy) # download snappy
include
(
external/snappystream
)
include
(
external/threadpool
)
set
(
WITH_ANAKIN OFF CACHE STRING
"Disable Anakin first, will add it later."
FORCE
)
if
(
WITH_GPU
)
include
(
cuda
)
include
(
tensorrt
)
include
(
external/anakin
)
else
()
set
(
WITH_ANAKIN OFF CACHE STRING
"Anakin is valid only when GPU is set."
FORCE
)
endif
()
include
(
cudnn
)
# set cudnn libraries, must before configure
...
...
cmake/configure.cmake
浏览文件 @
0098a494
...
...
@@ -97,6 +97,14 @@ if(WITH_GPU)
endif
()
include_directories
(
${
TENSORRT_INCLUDE_DIR
}
)
endif
()
if
(
WITH_ANAKIN
)
if
(
${
CUDA_VERSION_MAJOR
}
VERSION_LESS 8
)
message
(
FATAL_ERROR
"Anakin needs CUDA >= 8.0 to compile"
)
endif
()
if
(
${
CUDNN_MAJOR_VERSION
}
VERSION_LESS 7
)
message
(
FATAL_ERROR
"Anakin needs CUDNN >= 7.0 to compile"
)
endif
()
endif
()
elseif
(
WITH_AMD_GPU
)
add_definitions
(
-DPADDLE_WITH_HIP
)
set
(
CMAKE_C_FLAGS
"
${
CMAKE_C_FLAGS
}
-D__HIP_PLATFORM_HCC__"
)
...
...
cmake/external/anakin.cmake
浏览文件 @
0098a494
...
...
@@ -2,10 +2,22 @@ if (NOT WITH_ANAKIN)
return
()
endif
()
set
(
ANAKIN_INSTALL_DIR
"
${
THIRD_PARTY_PATH
}
/install/anakin"
CACHE PATH
"Anakin install path."
FORCE
)
set
(
ANAKIN_INCLUDE
"
${
ANAKIN_INSTALL_DIR
}
"
CACHE STRING
"root of Anakin header files"
)
set
(
ANAKIN_LIBRARY
"
${
ANAKIN_INSTALL_DIR
}
"
CACHE STRING
"path of Anakin library"
)
INCLUDE
(
ExternalProject
)
set
(
ANAKIN_SOURCE_DIR
${
THIRD_PARTY_PATH
}
/anakin
)
# the anakin install dir is only default one now
set
(
ANAKIN_INSTALL_DIR
${
THIRD_PARTY_PATH
}
/anakin/src/extern_anakin/output
)
set
(
ANAKIN_INCLUDE
${
ANAKIN_INSTALL_DIR
}
)
set
(
ANAKIN_LIBRARY
${
ANAKIN_INSTALL_DIR
}
)
set
(
ANAKIN_SHARED_LIB
${
ANAKIN_LIBRARY
}
/libanakin.so
)
set
(
ANAKIN_SABER_LIB
${
ANAKIN_LIBRARY
}
/libanakin_saber_common.so
)
# TODO(luotao): ANAKIN_MODLE_URL will move to demo ci later.
set
(
ANAKIN_MODLE_URL
"http://paddle-inference-dist.bj.bcebos.com/mobilenet_v2.anakin.bin"
)
execute_process
(
COMMAND bash -c
"mkdir -p
${
ANAKIN_SOURCE_DIR
}
"
)
execute_process
(
COMMAND bash -c
"cd
${
ANAKIN_SOURCE_DIR
}
; wget -q --no-check-certificate
${
ANAKIN_MODLE_URL
}
"
)
include_directories
(
${
ANAKIN_INCLUDE
}
)
include_directories
(
${
ANAKIN_INCLUDE
}
/saber/
)
set
(
ANAKIN_COMPILE_EXTRA_FLAGS
-Wno-error=unused-but-set-variable -Wno-unused-but-set-variable
...
...
@@ -20,36 +32,33 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
-Wno-reorder
-Wno-error=cpp
)
set
(
ANAKIN_LIBRARY_URL
"https://github.com/pangge/Anakin/releases/download/Version0.1.0/anakin.tar.gz"
)
# A helper function used in Anakin, currently, to use it, one need to recursively include
# nearly all the header files.
function
(
fetch_include_recursively root_dir
)
if
(
IS_DIRECTORY
${
root_dir
}
)
include_directories
(
${
root_dir
}
)
endif
()
file
(
GLOB ALL_SUB RELATIVE
${
root_dir
}
${
root_dir
}
/*
)
foreach
(
sub
${
ALL_SUB
}
)
if
(
IS_DIRECTORY
${
root_dir
}
/
${
sub
}
)
fetch_include_recursively
(
${
root_dir
}
/
${
sub
}
)
endif
()
endforeach
()
endfunction
()
if
(
NOT EXISTS
"
${
ANAKIN_INSTALL_DIR
}
"
)
# download library
message
(
STATUS
"Download Anakin library from
${
ANAKIN_LIBRARY_URL
}
"
)
execute_process
(
COMMAND bash -c
"mkdir -p
${
ANAKIN_INSTALL_DIR
}
"
)
execute_process
(
COMMAND bash -c
"rm -rf
${
ANAKIN_INSTALL_DIR
}
/*"
)
execute_process
(
COMMAND bash -c
"cd
${
ANAKIN_INSTALL_DIR
}
; wget --no-check-certificate -q
${
ANAKIN_LIBRARY_URL
}
"
)
execute_process
(
COMMAND bash -c
"mkdir -p
${
ANAKIN_INSTALL_DIR
}
"
)
execute_process
(
COMMAND bash -c
"cd
${
ANAKIN_INSTALL_DIR
}
; tar xzf anakin.tar.gz"
)
endif
()
ExternalProject_Add
(
extern_anakin
${
EXTERNAL_PROJECT_LOG_ARGS
}
# TODO(luotao): use PaddlePaddle/Anakin later
GIT_REPOSITORY
"https://github.com/luotao1/Anakin"
GIT_TAG
"3957ae9263eaa0b1986758dac60a88852afb09be"
PREFIX
${
ANAKIN_SOURCE_DIR
}
UPDATE_COMMAND
""
CMAKE_ARGS -DUSE_GPU_PLACE=YES
-DUSE_X86_PLACE=YES
-DBUILD_WITH_UNIT_TEST=NO
-DPROTOBUF_ROOT=
${
THIRD_PARTY_PATH
}
/install/protobuf
-DMKLML_ROOT=
${
THIRD_PARTY_PATH
}
/install/mklml
-DCUDNN_ROOT=
${
CUDNN_ROOT
}
${
EXTERNAL_OPTIONAL_ARGS
}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=
${
ANAKIN_INSTALL_DIR
}
)
if
(
WITH_ANAKIN
)
message
(
STATUS
"Anakin for inference is enabled"
)
message
(
STATUS
"Anakin is set INCLUDE:
${
ANAKIN_INCLUDE
}
LIBRARY:
${
ANAKIN_LIBRARY
}
"
)
fetch_include_recursively
(
${
ANAKIN_INCLUDE
}
)
link_directories
(
${
ANAKIN_LIBRARY
}
)
endif
()
message
(
STATUS
"Anakin for inference is enabled"
)
message
(
STATUS
"Anakin is set INCLUDE:
${
ANAKIN_INCLUDE
}
LIBRARY:
${
ANAKIN_LIBRARY
}
"
)
add_library
(
anakin_shared SHARED IMPORTED GLOBAL
)
set_property
(
TARGET anakin_shared PROPERTY IMPORTED_LOCATION
${
ANAKIN_SHARED_LIB
}
)
add_dependencies
(
anakin_shared extern_anakin protobuf mklml
)
add_library
(
anakin_saber SHARED IMPORTED GLOBAL
)
set_property
(
TARGET anakin_saber PROPERTY IMPORTED_LOCATION
${
ANAKIN_SABER_LIB
}
)
add_dependencies
(
anakin_saber extern_anakin protobuf mklml
)
list
(
APPEND external_project_dependencies anakin_shared anakin_saber
)
cmake/inference_lib.cmake
浏览文件 @
0098a494
...
...
@@ -143,7 +143,7 @@ if (WITH_ANAKIN AND WITH_GPU)
copy
(
anakin_inference_lib DEPS paddle_inference_api inference_anakin_api
SRCS
${
PADDLE_BINARY_DIR
}
/paddle/fluid/inference/api/libinference_anakin_api*
# compiled anakin api
${
PADDLE_BINARY_DIR
}
/third_party/install/anakin/*.tar.gz
# anakin release
${
ANAKIN_INSTALL_DIR
}
# anakin release
DSTS
${
dst_dir
}
/inference/anakin
${
dst_dir
}
/inference/anakin
)
list
(
APPEND inference_deps anakin_inference_lib
)
endif
()
...
...
doc/fluid/api/executor.rst
浏览文件 @
0098a494
...
...
@@ -38,11 +38,3 @@ _switch_scope
.. autofunction:: paddle.fluid.executor._switch_scope
:noindex:
.. _api_fluid_executor_fetch_var:
fetch_var
---------
.. autofunction:: paddle.fluid.executor.fetch_var
:noindex:
doc/fluid/api/fluid.rst
浏览文件 @
0098a494
...
...
@@ -106,22 +106,6 @@ _switch_scope
.. autofunction:: paddle.fluid._switch_scope
:noindex:
.. _api_fluid_fetch_var:
fetch_var
---------
.. autofunction:: paddle.fluid.fetch_var
:noindex:
.. _api_fluid_Go:
Go
--
.. autoclass:: paddle.fluid.Go
:members:
:noindex:
.. _api_fluid_make_channel:
...
...
paddle/fluid/API.spec
浏览文件 @
0098a494
...
...
@@ -6,7 +6,7 @@ paddle.fluid.Program.create_block ArgSpec(args=['self', 'parent_idx'], varargs=N
paddle.fluid.Program.current_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.get_desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.inference_optimize ArgSpec(args=['self'
], varargs=None, keywords=None, defaults=None
)
paddle.fluid.Program.inference_optimize ArgSpec(args=['self'
, 'export_for_deployment'], varargs=None, keywords=None, defaults=(True,)
)
paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Program.optimized_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None)
...
...
@@ -18,6 +18,9 @@ paddle.fluid.Operator.all_attrs ArgSpec(args=['self'], varargs=None, keywords=No
paddle.fluid.Operator.attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.attr_type ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.block_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.block_attr_id ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.blocks_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.blocks_attr_ids ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.has_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.has_kernel ArgSpec(args=['self', 'op_type'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Operator.input ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None)
...
...
@@ -34,21 +37,10 @@ paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None,
paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.get_var ArgSpec(args=['name', 'program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.as_lodtensor ArgSpec(args=['self', 'data'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False))
paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None)
paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.fetch_var ArgSpec(args=['name', 'scope', 'return_numpy'], varargs=None, keywords=None, defaults=(None, True))
paddle.fluid.Go.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.Go.construct_go_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.make_channel ArgSpec(args=['dtype', 'capacity'], varargs=None, keywords=None, defaults=(0,))
paddle.fluid.channel_send ArgSpec(args=['channel', 'value', 'is_copy'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.channel_recv ArgSpec(args=['channel', 'return_value'], varargs=None, keywords=None, defaults=None)
paddle.fluid.channel_close ArgSpec(args=['channel'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Select.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.Select.case ArgSpec(args=['self', 'channel_action_fn', 'channel', 'value', 'is_copy'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.Select.default ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Trainer.__init__ ArgSpec(args=['self', 'train_func', 'optimizer_func', 'param_path', 'place', 'parallel', 'checkpoint_config'], varargs=None, keywords=None, defaults=(None, None, False, None))
paddle.fluid.Trainer.save_params ArgSpec(args=['self', 'param_path'], varargs=None, keywords=None, defaults=None)
paddle.fluid.Trainer.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
...
...
@@ -62,20 +54,16 @@ paddle.fluid.CheckpointConfig.__init__ ArgSpec(args=['self', 'checkpoint_dir', '
paddle.fluid.Inferencer.__init__ ArgSpec(args=['self', 'infer_func', 'param_path', 'place', 'parallel'], varargs=None, keywords=None, defaults=(None, False))
paddle.fluid.Inferencer.infer ArgSpec(args=['self', 'inputs', 'return_numpy'], varargs=None, keywords=None, defaults=(True,))
paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True))
paddle.fluid.InferenceTranspiler.__init__
paddle.fluid.InferenceTranspiler.fuse_batch_norm ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=None)
paddle.fluid.InferenceTranspiler.fuse_relu_mkldnn ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.DistributeTranspilerConfig.__init__
paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id'], varargs=None, keywords='kwargs', defaults=(None, None, None, None, None, 1, 0))
paddle.fluid.ParallelExecutor.bcast_params ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True))
paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None
paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None
...
...
@@ -89,7 +77,7 @@ paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_pro
paddle.fluid.io.load_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.io.load_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.io.load_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename'
], varargs=None, keywords=None, defaults=(None, None, Non
e))
paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename'
, 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, Tru
e))
paddle.fluid.io.load_inference_model ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.io.get_inference_program ArgSpec(args=['target_vars', 'main_program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.initializer.ConstantInitializer.__init__ ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False))
...
...
@@ -338,14 +326,11 @@ paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init',
paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None)
paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True))
paddle.fluid.transpiler.InferenceTranspiler.__init__
paddle.fluid.transpiler.InferenceTranspiler.fuse_batch_norm ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.InferenceTranspiler.fuse_relu_mkldnn ArgSpec(args=['self', 'program'], varargs=None, keywords=None, defaults=None)
paddle.fluid.transpiler.InferenceTranspiler.transpile ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0))
paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,))
...
...
paddle/fluid/framework/op_desc.cc
浏览文件 @
0098a494
...
...
@@ -238,7 +238,20 @@ Attribute OpDesc::GetNullableAttr(const std::string &name) const {
}
}
int
OpDesc
::
GetBlockAttr
(
const
std
::
string
&
name
)
const
{
std
::
vector
<
int
>
OpDesc
::
GetBlocksAttrIds
(
const
std
::
string
&
name
)
const
{
auto
it
=
attrs_
.
find
(
name
);
PADDLE_ENFORCE
(
it
!=
attrs_
.
end
(),
"Attribute %s is not found"
,
name
);
auto
blocks
=
boost
::
get
<
std
::
vector
<
BlockDesc
*>>
(
it
->
second
);
std
::
vector
<
int
>
ids
;
for
(
auto
n
:
blocks
)
{
ids
.
push_back
(
n
->
ID
());
}
return
ids
;
}
int
OpDesc
::
GetBlockAttrId
(
const
std
::
string
&
name
)
const
{
auto
it
=
attrs_
.
find
(
name
);
PADDLE_ENFORCE
(
it
!=
attrs_
.
end
(),
"Attribute %s is not found"
,
name
);
return
boost
::
get
<
BlockDesc
*>
(
it
->
second
)
->
ID
();
...
...
paddle/fluid/framework/op_desc.h
浏览文件 @
0098a494
...
...
@@ -83,7 +83,9 @@ class OpDesc {
Attribute
GetNullableAttr
(
const
std
::
string
&
name
)
const
;
int
GetBlockAttr
(
const
std
::
string
&
name
)
const
;
int
GetBlockAttrId
(
const
std
::
string
&
name
)
const
;
std
::
vector
<
int
>
GetBlocksAttrIds
(
const
std
::
string
&
name
)
const
;
void
Rename
(
const
std
::
string
&
old_name
,
const
std
::
string
&
new_name
);
...
...
paddle/fluid/framework/program_desc.cc
浏览文件 @
0098a494
...
...
@@ -58,7 +58,7 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) {
for
(
const
std
::
string
&
attr_name
:
op
->
AttrNames
())
{
if
(
op
->
GetAttrType
(
attr_name
)
==
proto
::
AttrType
::
BLOCK
)
{
int
sub_block_id
=
o
.
Block
(
block_id
).
Op
(
op_id
)
->
GetBlockAttr
(
attr_name
);
o
.
Block
(
block_id
).
Op
(
op_id
)
->
GetBlockAttr
Id
(
attr_name
);
op
->
SetBlockAttr
(
attr_name
,
MutableBlock
(
sub_block_id
));
}
}
...
...
paddle/fluid/inference/analysis/analyzer.cc
浏览文件 @
0098a494
...
...
@@ -44,13 +44,13 @@ class DfgPassManagerImpl final : public DfgPassManager {
if
(
FLAGS_inference_analysis_enable_tensorrt_subgraph_engine
)
{
auto
trt_teller
=
[
&
](
const
Node
*
node
)
{
std
::
unordered_set
<
std
::
string
>
teller_set
(
{
"elementwise_add"
,
"mul"
,
"conv2d"
,
"pool2d"
,
"relu"
});
{
"elementwise_add"
,
"mul"
,
"conv2d"
,
"pool2d"
,
"relu"
,
"softmax"
});
if
(
!
node
->
IsFunction
())
return
false
;
const
auto
*
func
=
static_cast
<
const
Function
*>
(
node
);
if
(
teller_set
.
count
(
func
->
func_type
()))
if
(
teller_set
.
count
(
func
->
func_type
()))
{
return
true
;
else
{
}
else
{
return
false
;
}
};
...
...
paddle/fluid/inference/api/CMakeLists.txt
浏览文件 @
0098a494
...
...
@@ -45,7 +45,6 @@ endfunction(inference_api_test)
cc_library
(
paddle_inference_api SRCS api.cc api_impl.cc DEPS lod_tensor
)
cc_test
(
test_paddle_inference_api
SRCS api_tester.cc
DEPS paddle_inference_api
)
...
...
@@ -62,22 +61,18 @@ inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_e
endif
()
if
(
WITH_ANAKIN
)
# only needed in CI
# Due to Anakin do not have official library releases and the versions of protobuf and cuda do not match Paddle's,
# so anakin library will not be merged to our official inference library. To use anakin prediction API, one need to
# compile the libinference_anakin_api.a and compile with anakin.so.
fetch_include_recursively
(
${
ANAKIN_INCLUDE
}
)
# compile the libinference_anakin_api.a and anakin.so.
nv_library
(
inference_anakin_api SRCS api.cc api_anakin_engine.cc
)
nv_library
(
inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc
)
target_compile_options
(
inference_anakin_api BEFORE PUBLIC
${
ANAKIN_COMPILE_EXTRA_FLAGS
}
)
target_compile_options
(
inference_anakin_api_shared BEFORE PUBLIC
${
ANAKIN_COMPILE_EXTRA_FLAGS
}
)
target_link_libraries
(
inference_anakin_api anakin anakin_saber_common
)
target_link_libraries
(
inference_anakin_api_shared anakin anakin_saber_common
)
nv_library
(
inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber
)
#nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin)
function
(
anakin_target target_name
)
target_compile_options
(
${
target_name
}
BEFORE PUBLIC
${
ANAKIN_COMPILE_EXTRA_FLAGS
}
)
endfunction
()
anakin_target
(
inference_anakin_api
)
#anakin_target(inference_anakin_api_shared)
if
(
WITH_TESTING
)
# this test is unstable, disable it first.
#cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc
#ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin
#DEPS inference_anakin_api_shared)
#target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
endif
(
WITH_TESTING
)
cc_test
(
inference_anakin_test SRCS api_anakin_engine_tester.cc
ARGS --model=
${
ANAKIN_SOURCE_DIR
}
/mobilenet_v2.anakin.bin
DEPS inference_anakin_api dynload_cuda SERIAL
)
target_compile_options
(
inference_anakin_test BEFORE PUBLIC
${
ANAKIN_COMPILE_EXTRA_FLAGS
}
)
endif
(
WITH_TESTING
)
endif
()
paddle/fluid/inference/api/paddle_inference_api.h
浏览文件 @
0098a494
...
...
@@ -45,7 +45,7 @@ class PaddleBuf {
PaddleBuf
(
void
*
data
,
size_t
length
)
:
data_
(
data
),
length_
(
length
),
memory_owned_
{
false
}
{}
// Own memory.
PaddleBuf
(
size_t
length
)
explicit
PaddleBuf
(
size_t
length
)
:
data_
(
new
char
[
length
]),
length_
(
length
),
memory_owned_
(
true
)
{}
// Resize to `length` bytes.
void
Resize
(
size_t
length
);
...
...
paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
浏览文件 @
0098a494
# Add TRT tests
nv_library
(
tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
activation_op.cc
activation_op.cc
softmax_op.cc
DEPS tensorrt_engine operator scope framework_proto op_registry
)
nv_test
(
test_op_converter SRCS test_op_converter.cc DEPS
...
...
@@ -21,3 +21,6 @@ nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
nv_test
(
test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine elementwise_add_op SERIAL
)
nv_test
(
test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc
DEPS
${
FLUID_CORE_MODULES
}
tensorrt_engine softmax_op SERIAL
)
paddle/fluid/inference/tensorrt/convert/softmax_op.cc
0 → 100644
浏览文件 @
0098a494
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
/*
* SoftMaxOp, ISoftMaxLayer in TRT. This Layer doesn't has weights.
*/
class
SoftMaxOpConverter
:
public
OpConverter
{
public:
void
operator
()(
const
framework
::
proto
::
OpDesc
&
op
,
const
framework
::
Scope
&
scope
,
bool
test_mode
)
override
{
VLOG
(
4
)
<<
"convert a fluid softmax op to tensorrt softmax layer without bias"
;
framework
::
OpDesc
op_desc
(
op
,
nullptr
);
// Declare inputs
auto
*
input1
=
engine_
->
GetITensor
(
op_desc
.
Input
(
"X"
)[
0
]);
auto
*
layer
=
TRT_ENGINE_ADD_LAYER
(
engine_
,
SoftMax
,
*
const_cast
<
nvinfer1
::
ITensor
*>
(
input1
));
auto
output_name
=
op_desc
.
Output
(
"Out"
)[
0
];
engine_
->
SetITensor
(
output_name
,
layer
->
getOutput
(
0
));
if
(
test_mode
)
{
engine_
->
DeclareOutput
(
output_name
);
}
}
};
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
USE_OP
(
softmax
);
REGISTER_TRT_OP_CONVERTER
(
softmax
,
SoftMaxOpConverter
);
paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc
0 → 100644
浏览文件 @
0098a494
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace
paddle
{
namespace
inference
{
namespace
tensorrt
{
TEST
(
SoftMaxOpConverter
,
main
)
{
framework
::
Scope
scope
;
std
::
unordered_set
<
std
::
string
>
parameters
;
TRTConvertValidation
validator
(
8
,
parameters
,
scope
,
1000
);
std
::
vector
<
int
>
tensor_shape
{
8
,
10
};
validator
.
DeclInputVar
(
"softmax-X"
,
tensor_shape
,
nvinfer1
::
DimsCHW
(
10
,
1
,
1
));
validator
.
DeclOutputVar
(
"softmax-Out"
,
nvinfer1
::
DimsCHW
(
10
,
1
,
1
));
// Prepare Op description
framework
::
OpDesc
desc
;
desc
.
SetType
(
"softmax"
);
desc
.
SetInput
(
"X"
,
{
"softmax-X"
});
desc
.
SetOutput
(
"Out"
,
{
"softmax-Out"
});
LOG
(
INFO
)
<<
"set OP"
;
validator
.
SetOp
(
*
desc
.
Proto
());
LOG
(
INFO
)
<<
"execute"
;
validator
.
Execute
(
3
);
}
}
// namespace tensorrt
}
// namespace inference
}
// namespace paddle
USE_OP
(
softmax
);
paddle/fluid/inference/tensorrt/convert/ut_helper.h
浏览文件 @
0098a494
...
...
@@ -79,6 +79,12 @@ class TRTConvertValidation {
}
// Declare a Variable as input with random initialization.
void
DeclInputVar
(
const
std
::
string
&
name
,
const
std
::
vector
<
int
>
tensor_dims
,
const
nvinfer1
::
Dims
&
trt_dims
)
{
DeclVar
(
name
,
tensor_dims
);
engine_
->
DeclareInput
(
name
,
nvinfer1
::
DataType
::
kFLOAT
,
trt_dims
);
}
void
DeclInputVar
(
const
std
::
string
&
name
,
const
nvinfer1
::
Dims
&
dims
)
{
DeclVar
(
name
,
dims
);
// Declare TRT inputs.
...
...
@@ -94,12 +100,18 @@ class TRTConvertValidation {
DeclVar
(
name
,
dims
);
}
// Declare a variable in a fluid Scope.
void
DeclVar
(
const
std
::
string
&
name
,
const
nvinfer1
::
Dims
&
dims
,
bool
is_param
=
false
)
{
void
DeclVar
(
const
std
::
string
&
name
,
const
std
::
vector
<
int
>
dim_vec
)
{
platform
::
CPUPlace
place
;
platform
::
CPUDeviceContext
ctx
(
place
);
auto
*
x
=
scope_
.
Var
(
name
);
auto
*
x_tensor
=
x
->
GetMutable
<
framework
::
LoDTensor
>
();
x_tensor
->
Resize
(
framework
::
make_ddim
(
dim_vec
));
RandomizeTensor
(
x_tensor
,
place
,
ctx
);
}
// Declare a variable in a fluid Scope.
void
DeclVar
(
const
std
::
string
&
name
,
const
nvinfer1
::
Dims
&
dims
,
bool
is_param
=
false
)
{
// Init Fluid tensor.
std
::
vector
<
int
>
dim_vec
(
dims
.
d
,
dims
.
d
+
dims
.
nbDims
);
// There is no batchsize in ITensor's shape, but We should add it to
...
...
@@ -107,10 +119,8 @@ class TRTConvertValidation {
// if_add_batch_ flag is true, add the max batchsize to dim_vec.
if
(
is_param
!=
true
&&
if_add_batch_
==
true
)
dim_vec
.
insert
(
dim_vec
.
begin
(),
max_batch_size_
);
auto
*
x
=
scope_
.
Var
(
name
);
auto
*
x_tensor
=
x
->
GetMutable
<
framework
::
LoDTensor
>
();
x_tensor
->
Resize
(
framework
::
make_ddim
(
dim_vec
));
RandomizeTensor
(
x_tensor
,
place
,
ctx
);
DeclVar
(
name
,
dim_vec
);
}
void
SetOp
(
const
framework
::
proto
::
OpDesc
&
desc
)
{
...
...
paddle/fluid/operators/CMakeLists.txt
浏览文件 @
0098a494
...
...
@@ -235,7 +235,12 @@ else()
endif
()
op_library
(
cross_entropy_op DEPS cross_entropy
)
op_library
(
softmax_with_cross_entropy_op DEPS cross_entropy softmax
)
if
(
WITH_GPU
)
op_library
(
softmax_with_cross_entropy_op DEPS cross_entropy softmax cub
)
else
()
op_library
(
softmax_with_cross_entropy_op DEPS cross_entropy softmax
)
endif
()
op_library
(
softmax_op DEPS softmax
)
op_library
(
sequence_softmax_op DEPS softmax
)
if
(
WITH_GPU AND TENSORRT_FOUND
)
...
...
@@ -273,9 +278,9 @@ op_library(squeeze_op DEPS reshape_op)
op_library
(
extract_rows_op DEPS memory
)
op_library
(
flatten_op DEPS reshape_op
)
if
(
WITH_GPU
)
op_library
(
conv_op DEPS vol2col depthwise_conv im2col
)
op_library
(
layer_norm_op DEPS cub
)
else
()
op_library
(
conv_op DEPS vol2col im2col
)
endif
()
...
...
paddle/fluid/operators/crop_op.cc
浏览文件 @
0098a494
/* Copyright (c) 201
6
PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 201
8
PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
...
...
@@ -188,6 +188,7 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR
(
crop
,
ops
::
CropOp
,
ops
::
CropOpMaker
,
paddle
::
framework
::
DefaultGradOpDescMaker
<
true
>
);
REGISTER_OPERATOR
(
crop_grad
,
ops
::
CropOpGrad
);
REGISTER_OP_CPU_KERNEL
(
crop
,
ops
::
CropKernel
<
float
>
);
REGISTER_OP_CPU_KERNEL
(
crop
,
ops
::
CropKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
);
REGISTER_OP_CPU_KERNEL
(
crop_grad
,
ops
::
CropGradKernel
<
paddle
::
platform
::
CPUDeviceContext
,
float
>
);
paddle/fluid/operators/crop_op.cu
浏览文件 @
0098a494
/* Copyright (c) 201
6
PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 201
8
PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
...
...
@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/operators/crop_op.h"
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CUDA_KERNEL
(
crop
,
ops
::
CropKernel
<
float
>
);
REGISTER_OP_CUDA_KERNEL
(
crop
,
ops
::
CropKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
);
REGISTER_OP_CUDA_KERNEL
(
crop_grad
,
ops
::
CropGradKernel
<
paddle
::
platform
::
CUDADeviceContext
,
float
>
);
paddle/fluid/operators/crop_op.h
浏览文件 @
0098a494
/* Copyright (c) 201
6
PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 201
8
PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
...
...
@@ -58,32 +58,74 @@ static std::vector<int> GetOffsets(const framework::ExecutionContext& ctx) {
return
res
;
}
template
<
typename
T
>
template
<
typename
DeviceContext
,
typename
T
,
size_t
D
>
void
CropFunction
(
const
framework
::
ExecutionContext
&
context
)
{
auto
*
x
=
context
.
Input
<
Tensor
>
(
"X"
);
auto
*
out
=
context
.
Output
<
Tensor
>
(
"Out"
);
auto
out_dims
=
out
->
dims
();
if
(
out_dims
[
0
]
==
-
1
)
{
out_dims
[
0
]
=
x
->
dims
()[
0
];
}
out
->
mutable_data
<
T
>
(
out_dims
,
context
.
GetPlace
());
auto
x_stride
=
framework
::
stride
(
x
->
dims
());
auto
out_stride
=
framework
::
stride
(
out
->
dims
());
auto
offsets
=
GetOffsets
(
context
);
int64_t
offset
=
0
;
for
(
size_t
i
=
0
;
i
<
offsets
.
size
();
++
i
)
{
offset
+=
(
x_stride
[
i
]
*
offsets
[
i
]);
}
auto
x_tensor
=
EigenTensor
<
T
,
D
>::
From
(
*
x
);
auto
out_tensor
=
EigenTensor
<
T
,
D
>::
From
(
*
out
);
Eigen
::
array
<
int
,
D
>
e_offsets
;
Eigen
::
array
<
int
,
D
>
e_shape
;
for
(
size_t
i
=
0
;
i
<
D
;
++
i
)
{
e_offsets
[
i
]
=
offsets
[
i
];
e_shape
[
i
]
=
out
->
dims
()[
i
];
}
auto
&
place
=
*
context
.
template
device_context
<
DeviceContext
>().
eigen_device
();
out_tensor
.
device
(
place
)
=
x_tensor
.
slice
(
e_offsets
,
e_shape
);
}
template
<
typename
DeviceContext
,
typename
T
>
class
CropKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
context
)
const
override
{
auto
*
x
=
context
.
Input
<
Tensor
>
(
"X"
);
auto
*
out
=
context
.
Output
<
Tensor
>
(
"Out"
);
const
T
*
x_data
=
x
->
data
<
T
>
();
T
*
out_data
=
out
->
mutable_data
<
T
>
(
context
.
GetPlace
());
auto
x_stride
=
framework
::
stride
(
x
->
dims
());
auto
out_stride
=
framework
::
stride
(
out
->
dims
());
auto
offsets
=
GetOffsets
(
context
);
int64_t
offset
=
0
;
for
(
size_t
i
=
0
;
i
<
offsets
.
size
();
++
i
)
{
offset
+=
(
x_stride
[
i
]
*
offsets
[
i
]);
int
rank
=
context
.
Input
<
Tensor
>
(
"X"
)
->
dims
().
size
();
switch
(
rank
)
{
case
1
:
CropFunction
<
DeviceContext
,
T
,
1
>
(
context
);
break
;
case
2
:
CropFunction
<
DeviceContext
,
T
,
2
>
(
context
);
break
;
case
3
:
CropFunction
<
DeviceContext
,
T
,
3
>
(
context
);
break
;
case
4
:
CropFunction
<
DeviceContext
,
T
,
4
>
(
context
);
break
;
case
5
:
CropFunction
<
DeviceContext
,
T
,
5
>
(
context
);
break
;
case
6
:
CropFunction
<
DeviceContext
,
T
,
6
>
(
context
);
break
;
default:
PADDLE_THROW
(
"CropOp only support tensors with no more than 6 dimensions."
);
}
StridedMemcpy
<
T
>
(
context
.
device_context
(),
x_data
+
offset
,
x_stride
,
out
->
dims
(),
out_stride
,
out_data
);
}
};
template
<
typename
DeviceContext
,
typename
T
,
size_t
D
>
void
CropGradFunction
(
const
framework
::
ExecutionContext
&
context
)
{
auto
*
d_x
=
context
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
auto
*
x
=
context
.
Input
<
Tensor
>
(
"X"
);
if
(
d_x
!=
nullptr
)
{
auto
*
d_out
=
context
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
d_x
->
mutable_data
<
T
>
(
context
.
GetPlace
());
d_x
->
mutable_data
<
T
>
(
x
->
dims
(),
context
.
GetPlace
());
auto
offsets
=
GetOffsets
(
context
);
Eigen
::
array
<
std
::
pair
<
int
,
int
>
,
D
>
paddings
;
for
(
size_t
i
=
0
;
i
<
D
;
++
i
)
{
...
...
paddle/fluid/operators/elementwise_add_op.cu
浏览文件 @
0098a494
...
...
@@ -16,60 +16,6 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise_add_op.h"
#include "paddle/fluid/platform/float16.h"
namespace
paddle
{
namespace
operators
{
template
<
typename
T
>
__global__
void
ElementwiseAddCUDAKernel
(
const
T
*
x
,
const
T
*
y
,
T
*
z
,
int
n
,
int
post
,
int
size
)
{
int
idx_x
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
if
(
idx_x
<
size
)
{
int
idx_y
=
idx_x
/
post
-
(
idx_x
/
(
n
*
post
))
*
n
;
z
[
idx_x
]
=
x
[
idx_x
]
+
y
[
idx_y
];
}
}
template
<
typename
T
>
class
ElementwiseAddKernel
<
platform
::
CUDADeviceContext
,
T
>
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
using
Tensor
=
framework
::
Tensor
;
const
auto
x
=
ctx
.
Input
<
Tensor
>
(
"X"
);
const
auto
y
=
ctx
.
Input
<
Tensor
>
(
"Y"
);
auto
z
=
ctx
.
Output
<
Tensor
>
(
"Out"
);
auto
*
z_data
=
z
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
&
device
=
*
(
ctx
.
cuda_device_context
().
eigen_device
());
const
framework
::
DDim
&
x_dim
=
x
->
dims
();
framework
::
DDim
y_dim
=
y
->
dims
();
int
size
=
x
->
numel
();
if
(
x_dim
==
y_dim
)
{
auto
dim
=
framework
::
make_ddim
({
size
});
auto
z_eigen
=
framework
::
EigenTensor
<
T
,
1
>::
From
(
*
z
,
dim
);
auto
x_eigen
=
framework
::
EigenTensor
<
T
,
1
>::
From
(
*
x
,
dim
);
auto
y_eigen
=
framework
::
EigenTensor
<
T
,
1
>::
From
(
*
y
,
dim
);
z_eigen
.
device
(
device
)
=
x_eigen
+
y_eigen
;
}
else
{
int
axis
=
ctx
.
Attr
<
int
>
(
"axis"
);
axis
=
(
axis
==
-
1
?
x_dim
.
size
()
-
y_dim
.
size
()
:
axis
);
y_dim
=
trim_trailing_singular_dims
(
y_dim
);
axis
=
(
y_dim
.
size
()
==
0
)
?
x_dim
.
size
()
:
axis
;
int
pre
,
n
,
post
;
get_mid_dims
(
x_dim
,
y_dim
,
axis
,
&
pre
,
&
n
,
&
post
);
int
threads
=
512
;
int
grids
=
(
size
+
threads
-
1
)
/
threads
;
auto
stream
=
ctx
.
cuda_device_context
().
stream
();
ElementwiseAddCUDAKernel
<
T
><<<
grids
,
threads
,
0
,
stream
>>>
(
x
->
data
<
T
>
(),
y
->
data
<
T
>
(),
z_data
,
n
,
post
,
size
);
}
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
...
...
paddle/fluid/operators/elementwise_add_op.h
浏览文件 @
0098a494
...
...
@@ -144,41 +144,16 @@ class ElementwiseAddGradKernel : public framework::OpKernel<T> {
auto
*
dout
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Out"
));
auto
*
dx
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
auto
*
dy
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Y"
));
// skip out, x, y
auto
*
out
=
dout
;
auto
*
x
=
dout
,
*
y
=
dout
;
if
(
dx
!=
nullptr
)
{
// In fact, we can just share memory, but it may cause a bug of memory
// optimizer
// dx->ShareDataWith(*dout);
framework
::
TensorCopy
(
*
dout
,
ctx
.
GetPlace
(),
ctx
.
template
device_context
<
DeviceContext
>(),
dx
);
}
if
(
dy
==
nullptr
)
return
;
const
framework
::
DDim
&
x_dim
=
dout
->
dims
();
framework
::
DDim
y_dim
=
dy
->
dims
();
if
(
x_dim
==
y_dim
)
{
// dy->ShareDataWith(*dout);
framework
::
TensorCopy
(
*
dout
,
ctx
.
GetPlace
(),
ctx
.
template
device_context
<
DeviceContext
>(),
dy
);
if
(
platform
::
is_cpu_place
(
ctx
.
GetPlace
())
&&
dx
!=
nullptr
&&
dy
!=
nullptr
&&
(
dx
->
dims
()
==
dy
->
dims
()))
{
elementwise_add_grad
<
DeviceContext
,
T
>
(
ctx
,
x
,
y
,
out
,
dout
,
dx
,
dy
);
}
else
{
dy
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
// Perform reduction to dout to calculate dy
int
axis
=
ctx
.
Attr
<
int
>
(
"axis"
);
axis
=
(
axis
==
-
1
?
x_dim
.
size
()
-
y_dim
.
size
()
:
axis
);
y_dim
=
trim_trailing_singular_dims
(
y_dim
);
axis
=
(
y_dim
.
size
()
==
0
)
?
x_dim
.
size
()
:
axis
;
auto
&
device
=
*
(
ctx
.
template
device_context
<
DeviceContext
>().
eigen_device
());
int
pre
,
n
,
post
;
get_mid_dims
(
x_dim
,
y_dim
,
axis
,
&
pre
,
&
n
,
&
post
);
auto
eigen_dout
=
framework
::
EigenTensor
<
T
,
3
>::
From
(
*
dout
,
framework
::
make_ddim
({
pre
,
n
,
post
}));
auto
eigen_dy
=
framework
::
EigenTensor
<
T
,
1
>::
From
(
*
dy
,
framework
::
make_ddim
({
n
}));
eigen_dy
.
device
(
device
)
=
eigen_dout
.
sum
(
framework
::
EigenDim
<
2
>::
From
(
framework
::
make_ddim
({
0
,
2
})));
default_elementwise_add_grad
<
DeviceContext
,
T
>
(
ctx
,
x
,
y
,
out
,
dout
,
dx
,
dy
);
}
}
};
...
...
paddle/fluid/operators/layer_norm_op.cu
浏览文件 @
0098a494
/* Copyright (c) 201
6
PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 201
8
PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
...
...
@@ -12,8 +12,512 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cub/cub.cuh>
#include "paddle/fluid/operators/layer_norm_op.h"
namespace
paddle
{
namespace
operators
{
inline
static
int
GetDesiredBlockDim
(
int
block_dim
)
{
const
int
kMaxBlockDim
=
512
;
return
block_dim
>=
kMaxBlockDim
?
kMaxBlockDim
:
(
1
<<
(
static_cast
<
int
>
(
std
::
log2f
(
block_dim
))));
}
#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...) \
case (1 << (log2_block_dim)): { \
constexpr auto kBlockDim = (1 << (log2_block_dim)); \
__VA_ARGS__; \
} break
#define FIXED_BLOCK_DIM_CASE(...) \
FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(2, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(1, ##__VA_ARGS__)
static
__device__
__forceinline__
float
real_sqrt
(
float
x
)
{
return
sqrtf
(
x
);
}
static
__device__
__forceinline__
double
real_sqrt
(
double
x
)
{
return
sqrt
(
x
);
}
template
<
typename
T
>
struct
PairForLayerNorm
{
__device__
__forceinline__
PairForLayerNorm
()
{}
__device__
__forceinline__
PairForLayerNorm
(
const
T
&
first
,
const
T
&
second
)
:
first_
(
first
),
second_
(
second
)
{}
T
first_
;
T
second_
;
};
template
<
typename
T
>
struct
PairForLayerNormAddFunctor
{
__device__
__forceinline__
PairForLayerNorm
<
T
>
operator
()(
const
PairForLayerNorm
<
T
>
&
p1
,
const
PairForLayerNorm
<
T
>
&
p2
)
{
return
PairForLayerNorm
<
T
>
(
p1
.
first_
+
p2
.
first_
,
p1
.
second_
+
p2
.
second_
);
}
};
template
<
typename
T
,
int
BlockDim
>
__global__
void
LayerNormForward
(
const
T
*
x
,
const
T
*
scale
,
const
T
*
bias
,
T
*
y
,
T
*
mean
,
T
*
var
,
float
epsilon
,
int
feature_size
)
{
using
BlockReduce
=
cub
::
BlockReduce
<
PairForLayerNorm
<
T
>
,
BlockDim
>
;
__shared__
typename
BlockReduce
::
TempStorage
temp_storage
;
int
beg_idx
=
blockIdx
.
x
*
feature_size
+
threadIdx
.
x
;
int
end_idx
=
(
blockIdx
.
x
+
1
)
*
feature_size
;
// Step 1: Reduce to calculate mean and var
T
mean_val
=
static_cast
<
T
>
(
0
);
T
var_val
=
static_cast
<
T
>
(
0
);
for
(
int
i
=
beg_idx
;
i
<
end_idx
;
i
+=
BlockDim
)
{
T
tmp
=
x
[
i
];
mean_val
+=
tmp
;
var_val
+=
(
tmp
*
tmp
);
}
auto
pair
=
BlockReduce
(
temp_storage
)
.
Reduce
(
PairForLayerNorm
<
T
>
(
mean_val
,
var_val
),
PairForLayerNormAddFunctor
<
T
>
());
if
(
threadIdx
.
x
==
0
)
{
auto
tmp
=
pair
.
first_
/
feature_size
;
mean
[
blockIdx
.
x
]
=
tmp
;
var
[
blockIdx
.
x
]
=
pair
.
second_
/
feature_size
-
tmp
*
tmp
;
}
__syncthreads
();
mean_val
=
mean
[
blockIdx
.
x
];
var_val
=
static_cast
<
T
>
(
real_sqrt
(
var
[
blockIdx
.
x
]
+
epsilon
));
// Step 2: Calculate y
if
(
scale
!=
nullptr
)
{
if
(
bias
!=
nullptr
)
{
for
(
int
i
=
beg_idx
,
j
=
threadIdx
.
x
;
i
<
end_idx
;
i
+=
BlockDim
,
j
+=
BlockDim
)
{
y
[
i
]
=
scale
[
j
]
*
(
x
[
i
]
-
mean_val
)
/
var_val
+
bias
[
j
];
}
}
else
{
for
(
int
i
=
beg_idx
,
j
=
threadIdx
.
x
;
i
<
end_idx
;
i
+=
BlockDim
,
j
+=
BlockDim
)
{
y
[
i
]
=
scale
[
j
]
*
(
x
[
i
]
-
mean_val
)
/
var_val
;
}
}
}
else
{
// scale == nullptr
if
(
bias
!=
nullptr
)
{
for
(
int
i
=
beg_idx
,
j
=
threadIdx
.
x
;
i
<
end_idx
;
i
+=
BlockDim
,
j
+=
BlockDim
)
{
y
[
i
]
=
(
x
[
i
]
-
mean_val
)
/
var_val
+
bias
[
j
];
}
}
else
{
for
(
int
i
=
beg_idx
,
j
=
threadIdx
.
x
;
i
<
end_idx
;
i
+=
BlockDim
,
j
+=
BlockDim
)
{
y
[
i
]
=
(
x
[
i
]
-
mean_val
)
/
var_val
;
}
}
}
}
// Make sure that d_scale != nullptr && d_bias != nullptr
// Since d_scale != nullptr, scale would not be nullptr
template
<
typename
T
,
int
BlockDim
,
bool
HasDx
>
__global__
void
LayerNormBackwardGradientAll
(
const
T
*
x
,
const
T
*
d_y
,
T
*
d_scale
,
T
*
d_bias
,
T
*
d_x
,
const
T
*
mean
,
const
T
*
var
,
const
T
*
scale
,
float
epsilon
,
int
batch_size
,
int
feature_size
)
{
using
BlockReduce
=
cub
::
BlockReduce
<
PairForLayerNorm
<
T
>
,
BlockDim
>
;
__shared__
typename
BlockReduce
::
TempStorage
temp_storage
;
int
beg_idx
=
threadIdx
.
x
*
feature_size
+
blockIdx
.
x
;
int
end_idx
=
batch_size
*
feature_size
+
blockIdx
.
x
;
int
stride
=
BlockDim
*
feature_size
;
T
d_scale_partial
=
0
,
d_bias_partial
=
0
;
for
(
int
i
=
beg_idx
;
i
<
end_idx
;
i
+=
stride
)
{
int
row_idx
=
i
/
feature_size
;
auto
var_val
=
static_cast
<
T
>
(
real_sqrt
(
var
[
row_idx
]
+
epsilon
));
d_scale_partial
+=
d_y
[
i
]
*
(
x
[
i
]
-
mean
[
row_idx
])
/
var_val
;
d_bias_partial
+=
d_y
[
i
];
if
(
HasDx
)
{
d_x
[
i
]
=
d_y
[
i
]
*
scale
[
blockIdx
.
x
]
/
var_val
;
}
}
auto
pair
=
BlockReduce
(
temp_storage
)
.
Reduce
(
PairForLayerNorm
<
T
>
(
d_scale_partial
,
d_bias_partial
),
PairForLayerNormAddFunctor
<
T
>
());
if
(
threadIdx
.
x
==
0
)
{
d_scale
[
blockIdx
.
x
]
=
pair
.
first_
;
d_bias
[
blockIdx
.
x
]
=
pair
.
second_
;
}
}
// Make sure that there is only one true expression: d_scale != nullptr
// or d_bias != nullptr
// Notice: scale may be nullptr
template
<
typename
T
,
int
BlockDim
,
bool
HasDx
,
bool
HasDScale
>
__global__
void
LayerNormBackwardGradientScaleOrBias
(
const
T
*
x
,
const
T
*
d_y
,
T
*
d_scale
,
T
*
d_bias
,
T
*
d_x
,
const
T
*
mean
,
const
T
*
var
,
const
T
*
scale
,
float
epsilon
,
int
batch_size
,
int
feature_size
)
{
using
BlockReduce
=
cub
::
BlockReduce
<
T
,
BlockDim
>
;
__shared__
typename
BlockReduce
::
TempStorage
temp_storage
;
int
beg_idx
=
threadIdx
.
x
*
feature_size
+
blockIdx
.
x
;
int
end_idx
=
batch_size
*
feature_size
+
blockIdx
.
x
;
int
stride
=
BlockDim
*
feature_size
;
T
d_scale_or_d_bias_partial
=
0
;
for
(
int
i
=
beg_idx
;
i
<
end_idx
;
i
+=
stride
)
{
int
row_idx
=
i
/
feature_size
;
auto
var_val
=
static_cast
<
T
>
(
real_sqrt
(
var
[
row_idx
]
+
epsilon
));
if
(
HasDScale
)
{
d_scale_or_d_bias_partial
+=
d_y
[
i
]
*
(
x
[
i
]
-
mean
[
row_idx
])
/
var_val
;
}
else
{
// d_bias != nullptr
d_scale_or_d_bias_partial
+=
d_y
[
i
];
}
if
(
HasDx
)
{
if
(
scale
!=
nullptr
)
{
d_x
[
i
]
=
d_y
[
i
]
*
scale
[
blockIdx
.
x
]
/
var_val
;
}
else
{
d_x
[
i
]
=
d_y
[
i
]
/
var_val
;
}
}
}
d_scale_or_d_bias_partial
=
BlockReduce
(
temp_storage
).
Reduce
(
d_scale_or_d_bias_partial
,
cub
::
Sum
());
if
(
threadIdx
.
x
==
0
)
{
if
(
HasDScale
)
{
d_scale
[
blockIdx
.
x
]
=
d_scale_or_d_bias_partial
;
}
else
{
d_bias
[
blockIdx
.
x
]
=
d_scale_or_d_bias_partial
;
}
}
}
template
<
typename
T
,
int
BlockDim
>
__global__
void
LayerNormBackwardPostProcessToCalculateDX
(
const
T
*
x
,
T
*
d_x
,
const
T
*
mean
,
const
T
*
var
,
float
epsilon
,
int
feature_size
)
{
using
BlockReduce
=
cub
::
BlockReduce
<
PairForLayerNorm
<
T
>
,
BlockDim
>
;
__shared__
typename
BlockReduce
::
TempStorage
temp_storage
;
__shared__
T
d_x_reduce_tmp
[
2
];
int
beg_idx
=
blockIdx
.
x
*
feature_size
+
threadIdx
.
x
;
int
end_idx
=
(
blockIdx
.
x
+
1
)
*
feature_size
;
T
block_mean
=
mean
[
blockIdx
.
x
];
T
block_var
=
var
[
blockIdx
.
x
];
T
d_x_mean_partial
=
0
,
d_x_var_partial
=
0
;
for
(
int
i
=
beg_idx
;
i
<
end_idx
;
i
+=
BlockDim
)
{
d_x_mean_partial
+=
d_x
[
i
];
d_x_var_partial
+=
d_x
[
i
]
*
(
x
[
i
]
-
block_mean
);
}
auto
pair
=
BlockReduce
(
temp_storage
)
.
Reduce
(
PairForLayerNorm
<
T
>
(
d_x_mean_partial
,
d_x_var_partial
),
PairForLayerNormAddFunctor
<
T
>
());
if
(
threadIdx
.
x
==
0
)
{
d_x_reduce_tmp
[
0
]
=
pair
.
first_
/
feature_size
;
d_x_reduce_tmp
[
1
]
=
pair
.
second_
/
(
feature_size
*
(
block_var
+
epsilon
));
}
__syncthreads
();
d_x_mean_partial
=
d_x_reduce_tmp
[
0
];
d_x_var_partial
=
d_x_reduce_tmp
[
1
];
for
(
int
i
=
beg_idx
;
i
<
end_idx
;
i
+=
BlockDim
)
{
d_x
[
i
]
-=
d_x_mean_partial
;
d_x
[
i
]
-=
(
x
[
i
]
-
block_mean
)
*
d_x_var_partial
;
}
}
// Here, we only calculate d_x
template
<
typename
T
,
int
BlockDim
>
__global__
void
LayerNormBackwardGradientOnlyDX
(
const
T
*
x
,
const
T
*
d_y
,
T
*
d_x
,
const
T
*
mean
,
const
T
*
var
,
const
T
*
scale
,
float
epsilon
,
int
feature_size
)
{
using
BlockReduce
=
cub
::
BlockReduce
<
PairForLayerNorm
<
T
>
,
BlockDim
>
;
__shared__
typename
BlockReduce
::
TempStorage
temp_storage
;
__shared__
T
d_x_reduce_tmp
[
2
];
int
beg_idx
=
blockIdx
.
x
*
feature_size
+
threadIdx
.
x
;
int
end_idx
=
(
blockIdx
.
x
+
1
)
*
feature_size
;
T
block_mean
=
mean
[
blockIdx
.
x
],
block_var
=
var
[
blockIdx
.
x
];
T
d_x_mean_partial
=
0
,
d_x_var_partial
=
0
;
for
(
int
i
=
beg_idx
;
i
<
end_idx
;
i
+=
BlockDim
)
{
auto
var_val
=
static_cast
<
T
>
(
real_sqrt
(
block_var
+
epsilon
));
if
(
scale
!=
nullptr
)
{
int
col_idx
=
i
%
feature_size
;
d_x
[
i
]
=
d_y
[
i
]
*
scale
[
col_idx
]
/
var_val
;
}
else
{
d_x
[
i
]
=
d_y
[
i
]
/
var_val
;
}
d_x_mean_partial
+=
d_x
[
i
];
d_x_var_partial
+=
d_x
[
i
]
*
(
x
[
i
]
-
block_mean
);
}
auto
pair
=
BlockReduce
(
temp_storage
)
.
Reduce
(
PairForLayerNorm
<
T
>
(
d_x_mean_partial
,
d_x_var_partial
),
PairForLayerNormAddFunctor
<
T
>
());
if
(
threadIdx
.
x
==
0
)
{
d_x_reduce_tmp
[
0
]
=
pair
.
first_
/
feature_size
;
d_x_reduce_tmp
[
1
]
=
pair
.
second_
/
(
feature_size
*
(
block_var
+
epsilon
));
}
__syncthreads
();
d_x_mean_partial
=
d_x_reduce_tmp
[
0
];
d_x_var_partial
=
d_x_reduce_tmp
[
1
];
for
(
int
i
=
beg_idx
;
i
<
end_idx
;
i
+=
BlockDim
)
{
d_x
[
i
]
-=
d_x_mean_partial
;
d_x
[
i
]
-=
(
x
[
i
]
-
block_mean
)
*
d_x_var_partial
;
}
}
template
<
typename
T
>
__global__
void
LayerNormBackwardWhenBatchSizeIsOne
(
const
T
*
x
,
const
T
*
d_y
,
T
*
d_x
,
T
*
d_scale
,
T
*
d_bias
,
const
T
*
mean
,
const
T
*
var
,
const
T
*
scale
,
float
epsilon
,
int
feature_size
)
{
int
idx
=
threadIdx
.
x
+
blockIdx
.
x
*
blockDim
.
x
;
if
(
idx
<
feature_size
)
{
auto
var_val
=
static_cast
<
T
>
(
real_sqrt
(
var
[
idx
]
+
epsilon
));
if
(
d_x
!=
nullptr
)
{
if
(
d_scale
==
nullptr
)
{
d_x
[
idx
]
=
d_y
[
idx
]
/
var_val
;
}
else
{
d_x
[
idx
]
=
d_y
[
idx
]
*
scale
[
idx
]
/
var_val
;
}
}
if
(
d_scale
!=
nullptr
)
{
d_scale
[
idx
]
=
d_y
[
idx
]
*
(
x
[
idx
]
-
mean
[
idx
])
/
var_val
;
}
if
(
d_bias
!=
nullptr
)
d_bias
[
idx
]
=
d_y
[
idx
];
}
}
template
<
typename
T
>
static
void
LayerNormBackward
(
const
T
*
x
,
const
T
*
d_y
,
const
T
*
scale
,
const
T
*
mean
,
const
T
*
var
,
T
*
d_x
,
T
*
d_scale
,
T
*
d_bias
,
float
epsilon
,
int
batch_size
,
int
feature_size
,
cudaStream_t
stream
)
{
const
int
kMaxBlockDim
=
512
;
int
gradient_flag
=
((
d_x
!=
nullptr
?
1
:
0
)
<<
2
)
|
((
d_scale
!=
nullptr
?
1
:
0
)
<<
1
)
|
((
d_bias
!=
nullptr
?
1
:
0
));
if
(
gradient_flag
==
0
)
return
;
if
(
batch_size
==
1
)
{
LayerNormBackwardWhenBatchSizeIsOne
<
T
><<<
(
feature_size
+
kMaxBlockDim
-
1
)
/
kMaxBlockDim
,
kMaxBlockDim
,
0
,
stream
>>>
(
x
,
d_y
,
d_x
,
d_scale
,
d_bias
,
mean
,
var
,
scale
,
epsilon
,
feature_size
);
if
(
d_x
!=
nullptr
)
{
switch
(
GetDesiredBlockDim
(
feature_size
))
{
FIXED_BLOCK_DIM_CASE
(
LayerNormBackwardPostProcessToCalculateDX
<
T
,
kBlockDim
><<<
1
,
kBlockDim
,
0
,
stream
>>>
(
x
,
d_x
,
mean
,
var
,
epsilon
,
feature_size
));
}
}
return
;
}
auto
block_dim
=
GetDesiredBlockDim
(
batch_size
);
switch
(
gradient_flag
)
{
case
1
:
// d_x == nulptr, d_scale == nullptr, d_bias != nullptr
switch
(
block_dim
)
{
FIXED_BLOCK_DIM_CASE
(
LayerNormBackwardGradientScaleOrBias
<
T
,
kBlockDim
,
false
,
false
><<<
feature_size
,
kBlockDim
,
0
,
stream
>>>
(
x
,
d_y
,
d_scale
,
d_bias
,
d_x
,
mean
,
var
,
scale
,
epsilon
,
batch_size
,
feature_size
));
}
break
;
case
2
:
// d_x == nullptr, d_scale != nullptr, d_bias == nullptr
switch
(
block_dim
)
{
FIXED_BLOCK_DIM_CASE
(
LayerNormBackwardGradientScaleOrBias
<
T
,
kBlockDim
,
false
,
true
><<<
feature_size
,
kBlockDim
,
0
,
stream
>>>
(
x
,
d_y
,
d_scale
,
d_bias
,
d_x
,
mean
,
var
,
scale
,
epsilon
,
batch_size
,
feature_size
));
}
break
;
case
3
:
// d_x == nullptr, d_scale != nulptr, d_bias != nullptr
switch
(
block_dim
)
{
FIXED_BLOCK_DIM_CASE
(
LayerNormBackwardGradientAll
<
T
,
kBlockDim
,
false
><<<
feature_size
,
kBlockDim
,
0
,
stream
>>>
(
x
,
d_y
,
d_scale
,
d_bias
,
d_x
,
mean
,
var
,
scale
,
epsilon
,
batch_size
,
feature_size
));
}
break
;
case
4
:
// d_x != nullptr, d_scale == nullptr, d_bias == nullptr
switch
(
GetDesiredBlockDim
(
feature_size
))
{
FIXED_BLOCK_DIM_CASE
(
LayerNormBackwardGradientOnlyDX
<
T
,
kBlockDim
><<<
batch_size
,
kBlockDim
,
0
,
stream
>>>
(
x
,
d_y
,
d_x
,
mean
,
var
,
scale
,
epsilon
,
feature_size
));
}
break
;
case
5
:
// d_x != nulptr, d_scale == nullptr, d_bias != nullptr
switch
(
block_dim
)
{
FIXED_BLOCK_DIM_CASE
(
LayerNormBackwardGradientScaleOrBias
<
T
,
kBlockDim
,
true
,
false
><<<
feature_size
,
kBlockDim
,
0
,
stream
>>>
(
x
,
d_y
,
d_scale
,
d_bias
,
d_x
,
mean
,
var
,
scale
,
epsilon
,
batch_size
,
feature_size
));
}
switch
(
GetDesiredBlockDim
(
feature_size
))
{
FIXED_BLOCK_DIM_CASE
(
LayerNormBackwardPostProcessToCalculateDX
<
T
,
kBlockDim
><<<
batch_size
,
kBlockDim
,
0
,
stream
>>>
(
x
,
d_x
,
mean
,
var
,
epsilon
,
feature_size
));
}
break
;
case
6
:
// d_x != nullptr, d_scale != nullptr, d_bias == nullptr
switch
(
block_dim
)
{
FIXED_BLOCK_DIM_CASE
(
LayerNormBackwardGradientScaleOrBias
<
T
,
kBlockDim
,
true
,
true
><<<
feature_size
,
kBlockDim
,
0
,
stream
>>>
(
x
,
d_y
,
d_scale
,
d_bias
,
d_x
,
mean
,
var
,
scale
,
epsilon
,
batch_size
,
feature_size
));
}
switch
(
GetDesiredBlockDim
(
feature_size
))
{
FIXED_BLOCK_DIM_CASE
(
LayerNormBackwardPostProcessToCalculateDX
<
T
,
kBlockDim
><<<
batch_size
,
kBlockDim
,
0
,
stream
>>>
(
x
,
d_x
,
mean
,
var
,
epsilon
,
feature_size
));
}
break
;
case
7
:
// d_x != nullptr, d_scale != nullptr, d_bias != nullptr
switch
(
block_dim
)
{
FIXED_BLOCK_DIM_CASE
(
LayerNormBackwardGradientAll
<
T
,
kBlockDim
,
true
><<<
feature_size
,
kBlockDim
,
0
,
stream
>>>
(
x
,
d_y
,
d_scale
,
d_bias
,
d_x
,
mean
,
var
,
scale
,
epsilon
,
batch_size
,
feature_size
));
}
switch
(
GetDesiredBlockDim
(
feature_size
))
{
FIXED_BLOCK_DIM_CASE
(
LayerNormBackwardPostProcessToCalculateDX
<
T
,
kBlockDim
><<<
batch_size
,
kBlockDim
,
0
,
stream
>>>
(
x
,
d_x
,
mean
,
var
,
epsilon
,
feature_size
));
}
break
;
default:
break
;
}
}
template
<
typename
T
>
class
LayerNormKernel
<
platform
::
CUDADeviceContext
,
T
>
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
float
epsilon
=
ctx
.
Attr
<
float
>
(
"epsilon"
);
auto
*
scale
=
ctx
.
Input
<
Tensor
>
(
"Scale"
);
auto
*
bias
=
ctx
.
Input
<
Tensor
>
(
"Bias"
);
auto
*
x
=
ctx
.
Input
<
Tensor
>
(
"X"
);
auto
*
y
=
ctx
.
Output
<
Tensor
>
(
"Y"
);
auto
*
mean
=
ctx
.
Output
<
Tensor
>
(
"Mean"
);
auto
*
var
=
ctx
.
Output
<
Tensor
>
(
"Variance"
);
const
auto
begin_norm_axis
=
ctx
.
Attr
<
int
>
(
"begin_norm_axis"
);
const
auto
x_dims
=
x
->
dims
();
auto
*
x_data
=
x
->
data
<
T
>
();
auto
*
y_data
=
y
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
*
mean_data
=
mean
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
*
var_data
=
var
->
mutable_data
<
T
>
(
ctx
.
GetPlace
());
auto
*
scale_data
=
(
scale
==
nullptr
?
nullptr
:
scale
->
data
<
T
>
());
auto
*
bias_data
=
(
bias
==
nullptr
?
nullptr
:
bias
->
data
<
T
>
());
auto
matrix_dim
=
framework
::
flatten_to_2d
(
x_dims
,
begin_norm_axis
);
int
batch_size
=
static_cast
<
int
>
(
matrix_dim
[
0
]);
int
feature_size
=
static_cast
<
int
>
(
matrix_dim
[
1
]);
auto
stream
=
ctx
.
cuda_device_context
().
stream
();
switch
(
GetDesiredBlockDim
(
feature_size
))
{
FIXED_BLOCK_DIM_CASE
(
LayerNormForward
<
T
,
kBlockDim
><<<
batch_size
,
kBlockDim
,
0
,
stream
>>>
(
x_data
,
scale_data
,
bias_data
,
y_data
,
mean_data
,
var_data
,
epsilon
,
feature_size
));
default:
PADDLE_THROW
(
"Product from begin_norm_axis to end must be larger than 1"
);
break
;
}
}
};
template
<
typename
T
>
class
LayerNormGradKernel
<
platform
::
CUDADeviceContext
,
T
>
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
const
float
epsilon
=
ctx
.
Attr
<
float
>
(
"epsilon"
);
// d_x, d_scale, d_bias may be nullptr
auto
*
d_x
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"X"
));
auto
*
d_scale
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Scale"
));
auto
*
d_bias
=
ctx
.
Output
<
Tensor
>
(
framework
::
GradVarName
(
"Bias"
));
auto
*
x
=
ctx
.
Input
<
Tensor
>
(
"X"
);
auto
*
mean
=
ctx
.
Input
<
Tensor
>
(
"Mean"
);
auto
*
var
=
ctx
.
Input
<
Tensor
>
(
"Variance"
);
auto
*
scale
=
ctx
.
Input
<
Tensor
>
(
"Scale"
);
auto
*
d_y
=
ctx
.
Input
<
Tensor
>
(
framework
::
GradVarName
(
"Y"
));
auto
*
x_data
=
x
->
data
<
T
>
();
auto
*
d_y_data
=
d_y
->
data
<
T
>
();
auto
*
mean_data
=
mean
->
data
<
T
>
();
auto
*
var_data
=
var
->
data
<
T
>
();
auto
*
scale_data
=
(
scale
==
nullptr
?
nullptr
:
scale
->
data
<
T
>
());
auto
*
d_scale_data
=
(
d_scale
==
nullptr
?
nullptr
:
d_scale
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
auto
*
d_bias_data
=
(
d_bias
==
nullptr
?
nullptr
:
d_bias
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
auto
*
d_x_data
=
(
d_x
==
nullptr
?
nullptr
:
d_x
->
mutable_data
<
T
>
(
ctx
.
GetPlace
()));
const
auto
&
x_dims
=
x
->
dims
();
const
auto
begin_norm_axis
=
ctx
.
Attr
<
int
>
(
"begin_norm_axis"
);
auto
matrix_dim
=
framework
::
flatten_to_2d
(
x_dims
,
begin_norm_axis
);
int
batch_size
=
static_cast
<
int
>
(
matrix_dim
[
0
]);
int
feature_size
=
static_cast
<
int
>
(
matrix_dim
[
1
]);
auto
stream
=
ctx
.
cuda_device_context
().
stream
();
LayerNormBackward
<
T
>
(
x_data
,
d_y_data
,
scale_data
,
mean_data
,
var_data
,
d_x_data
,
d_scale_data
,
d_bias_data
,
epsilon
,
batch_size
,
feature_size
,
stream
);
}
};
#undef FIXED_BLOCK_DIM_CASE_BASE
#undef FIXED_BLOCK_DIM_CASE
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OP_CUDA_KERNEL
(
layer_norm
,
...
...
paddle/fluid/pybind/protobuf.cc
浏览文件 @
0098a494
...
...
@@ -301,7 +301,8 @@ void BindOpDesc(pybind11::module *m) {
std
::
string
ser
(
seriralized
);
self
.
SetAttr
(
name
,
ser
);
})
.
def
(
"block_attr"
,
&
pd
::
OpDesc
::
GetBlockAttr
)
.
def
(
"block_attr_id"
,
&
pd
::
OpDesc
::
GetBlockAttrId
)
.
def
(
"blocks_attr_ids"
,
&
pd
::
OpDesc
::
GetBlocksAttrIds
)
.
def
(
"check_attrs"
,
&
pd
::
OpDesc
::
CheckAttrs
)
.
def
(
"infer_shape"
,
&
pd
::
OpDesc
::
InferShape
)
.
def
(
"infer_var_type"
,
&
pd
::
OpDesc
::
InferVarType
)
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
0098a494
...
...
@@ -664,7 +664,7 @@ All parameter, weight, gradient are variables in Paddle.
const
std
::
string
&
,
Scope
*
,
std
::
vector
<
Scope
*>
&
,
const
ExecutionStrategy
&
,
const
BuildStrategy
&
,
size_t
,
size_t
>
())
.
def
(
"bcast_params"
,
&
ParallelExecutor
::
BCastParamsToDevices
)
.
def
(
"
_
bcast_params"
,
&
ParallelExecutor
::
BCastParamsToDevices
)
// NOTE: even we return a vec<Scope*>* to Python use reference policy.
// We still cannot get local_scope from this vector, since the element
// of vec<Scope*> will be freed by Python GC. We can only return Scope*
...
...
python/paddle/fluid/__init__.py
浏览文件 @
0098a494
...
...
@@ -48,8 +48,6 @@ from .data_feeder import DataFeeder
from
.core
import
LoDTensor
,
LoDTensorArray
,
CPUPlace
,
CUDAPlace
,
CUDAPinnedPlace
,
Scope
from
.transpiler
import
DistributeTranspiler
,
InferenceTranspiler
,
\
memory_optimize
,
release_memory
,
DistributeTranspilerConfig
from
.concurrency
import
(
Go
,
make_channel
,
channel_send
,
channel_recv
,
channel_close
,
Select
)
from
.lod_tensor
import
create_lod_tensor
,
create_random_int_lodtensor
from
.
import
clip
from
.
import
profiler
...
...
@@ -61,7 +59,7 @@ from paddle.fluid.layers.math_op_patch import monkey_patch_variable
Tensor
=
LoDTensor
__all__
=
framework
.
__all__
+
executor
.
__all__
+
concurrency
.
__all__
+
\
__all__
=
framework
.
__all__
+
executor
.
__all__
+
\
trainer
.
__all__
+
inferencer
.
__all__
+
transpiler
.
__all__
+
\
parallel_executor
.
__all__
+
lod_tensor
.
__all__
+
[
'io'
,
...
...
python/paddle/fluid/backward.py
浏览文件 @
0098a494
...
...
@@ -344,7 +344,7 @@ def _append_backward_ops_(block,
grad_sub_block_list
=
[]
# If the op has its own sub-block, deal with the sub-block first
if
op
.
has_attr
(
"sub_block"
):
sub_block
=
program
.
block
(
op
.
block_attr
(
"sub_block"
))
sub_block
=
program
.
block
(
op
.
block_attr
_id
(
"sub_block"
))
grad_sub_block
=
program
.
create_block
()
grad_sub_block
.
_set_forward_block_idx
(
sub_block
.
idx
)
cb
=
_callback_lookup_
(
op
)
...
...
@@ -406,7 +406,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
for
op_idx
in
range
(
start_op_idx
,
block
.
desc
.
op_size
()):
op_desc
=
block
.
desc
.
op
(
op_idx
)
if
op_desc
.
has_attr
(
"sub_block"
):
sub_block
=
block
.
program
.
block
(
op_desc
.
block_attr
(
"sub_block"
))
sub_block
=
block
.
program
.
block
(
op_desc
.
block_attr
_id
(
"sub_block"
))
_append_backward_vars_
(
sub_block
,
0
,
grad_to_var
,
grad_info_map
)
new_vars
=
set
()
# create new gradient variables
...
...
python/paddle/fluid/concurrency.py
浏览文件 @
0098a494
...
...
@@ -19,8 +19,7 @@ from .layers import fill_constant
from
.
import
core
__all__
=
[
'Go'
,
'make_channel'
,
'channel_send'
,
'channel_recv'
,
'channel_close'
,
'Select'
'make_channel'
,
'channel_send'
,
'channel_recv'
,
'channel_close'
,
'Select'
]
...
...
@@ -35,10 +34,10 @@ class Go(BlockGuard):
def
__exit__
(
self
,
exc_type
,
exc_val
,
exc_tb
):
if
exc_type
is
not
None
:
return
False
self
.
construct_go_op
()
self
.
_
construct_go_op
()
return
super
(
Go
,
self
).
__exit__
(
exc_type
,
exc_val
,
exc_tb
)
def
construct_go_op
(
self
):
def
_
construct_go_op
(
self
):
main_program
=
self
.
helper
.
main_program
go_block
=
main_program
.
current_block
()
parent_block
=
main_program
.
block
(
main_program
.
current_block
()
...
...
python/paddle/fluid/executor.py
浏览文件 @
0098a494
...
...
@@ -18,9 +18,7 @@ import six
from
.framework
import
Program
,
default_main_program
,
Variable
from
.
import
core
__all__
=
[
'Executor'
,
'global_scope'
,
'scope_guard'
,
'_switch_scope'
,
'fetch_var'
]
__all__
=
[
'Executor'
,
'global_scope'
,
'scope_guard'
,
'_switch_scope'
]
g_scope
=
core
.
Scope
()
...
...
@@ -171,7 +169,7 @@ def has_fetch_operators(block, fetch_targets, fetch_holder_name):
return
fetch_count
>
0
def
fetch_var
(
name
,
scope
=
None
,
return_numpy
=
True
):
def
_
fetch_var
(
name
,
scope
=
None
,
return_numpy
=
True
):
"""
Fetch the value of the variable with the given name from the
given scope.
...
...
@@ -222,6 +220,37 @@ def _get_program_cache_key(feed, fetch_list):
return
str
(
feed_var_names
+
fetch_var_names
)
def
_as_lodtensor
(
data
,
place
):
"""
Convert numpy.ndarray to Tensor, its only support Tensor without LoD information.
For higher dimensional sequence data, please use LoDTensor directly.
Examples:
>>> import paddle.fluid as fluid
>>> place = fluid.CPUPlace()
>>> exe = fluid.executor(place)
>>> data = np.array(size=(100, 200, 300))
>>> np_outs = map(lambda x: fluid.executor._as_lodtensor(x, place), data)
>>> ...
Args:
data(numpy.ndarray): a instance of array
Returns:
LoDTensor
"""
if
isinstance
(
data
,
list
):
raise
RuntimeError
(
"Some of your feed data hold LoD information.
\
They can not be completely cast from a list of Python
\
ndarray to LoDTensor. Please convert data to LoDTensor
\
directly before feeding the data.
\
"
)
# single tensor case
tensor
=
core
.
LoDTensor
()
tensor
.
set
(
data
,
place
)
return
tensor
class
Executor
(
object
):
"""
An Executor in Python, only support the single-GPU running. For multi-cards, please refer to
...
...
@@ -250,35 +279,6 @@ class Executor(object):
self
.
program_caches
=
dict
()
self
.
_closed
=
False
def
as_lodtensor
(
self
,
data
):
"""
Convert numpy.ndarray to Tensor, its only support Tensor without LoD information.
For higher dimensional sequence data, please use LoDTensor directly.
Examples:
>>> import paddle.fluid as fluid
>>> exe = fluid.executor(fluid.CPUPlace())
>>> data = np.array(size=(100, 200, 300))
>>> np_outs = map(lambda x: exe.as_lodtensor(x), data)
>>> ...
Args:
data(numpy.ndarray): a instance of array
Returns:
LoDTensor
"""
if
isinstance
(
data
,
list
):
raise
RuntimeError
(
"Some of your feed data hold LoD information.
\
They can not be completely cast from a list of Python
\
ndarray to LoDTensor. Please convert data to LoDTensor
\
directly before feeding the data.
\
"
)
# single tensor case
tensor
=
core
.
LoDTensor
()
tensor
.
set
(
data
,
self
.
place
)
return
tensor
def
_get_program_cache
(
self
,
program_cache_key
):
return
self
.
program_caches
.
get
(
program_cache_key
,
None
)
...
...
@@ -337,7 +337,7 @@ class Executor(object):
feed_target_name
=
op
.
desc
.
output
(
'Out'
)[
0
]
cur_feed
=
feed
[
feed_target_name
]
if
not
isinstance
(
cur_feed
,
core
.
LoDTensor
):
cur_feed
=
self
.
as_lodtensor
(
cur_feed
)
cur_feed
=
_as_lodtensor
(
cur_feed
,
self
.
place
)
idx
=
op
.
desc
.
attr
(
'col'
)
core
.
set_feed_variable
(
scope
,
cur_feed
,
feed_var_name
,
idx
)
else
:
...
...
python/paddle/fluid/framework.py
浏览文件 @
0098a494
...
...
@@ -476,23 +476,25 @@ class Operator(object):
attrs
=
None
):
self
.
block
=
block
self
.
desc
=
desc
self
.
attrs
=
attrs
if
self
.
attrs
is
None
:
self
.
attrs
=
dict
()
# note: not add self.attrs here:
# https://github.com/PaddlePaddle/Paddle/pull/12583#pullrequestreview-145093173
op_attrs
=
attrs
if
op_attrs
is
None
:
op_attrs
=
dict
()
del
attrs
op_maker
=
core
.
op_proto_and_checker_maker
if
op_maker
.
kOpRoleAttrName
()
not
in
self
.
attrs
:
self
.
attrs
[
op_maker
.
kOpRoleAttrName
()]
=
self
.
block
.
program
.
op_role
if
op_maker
.
kOpRoleAttrName
()
not
in
op_
attrs
:
op_
attrs
[
op_maker
.
kOpRoleAttrName
()]
=
self
.
block
.
program
.
op_role
role_var_name
=
op_maker
.
kOpRoleVarAttrName
()
if
len
(
self
.
block
.
program
.
op_role_var
)
!=
0
and
role_var_name
not
in
self
.
attrs
:
self
.
attrs
[
role_var_name
]
=
self
.
block
.
program
.
op_role_var
op_role_var
)
!=
0
and
role_var_name
not
in
op_
attrs
:
op_
attrs
[
role_var_name
]
=
self
.
block
.
program
.
op_role_var
if
role_var_name
in
self
.
attrs
and
len
(
self
.
attrs
[
role_var_name
])
==
0
:
del
self
.
attrs
[
role_var_name
]
if
role_var_name
in
op_attrs
and
len
(
op_
attrs
[
role_var_name
])
==
0
:
del
op_
attrs
[
role_var_name
]
if
len
(
self
.
desc
.
type
())
!=
0
:
return
...
...
@@ -576,15 +578,14 @@ class Operator(object):
arg
.
op
=
self
self
.
desc
.
set_output
(
out_proto
.
name
,
out_arg_names
)
if
self
.
attrs
is
not
None
:
if
not
isinstance
(
self
.
attrs
,
dict
):
if
op_
attrs
is
not
None
:
if
not
isinstance
(
op_
attrs
,
dict
):
raise
TypeError
(
"'attrs' should be a dict."
)
for
attr
in
proto
.
attrs
:
attr_name
=
attr
.
name
if
(
attr_name
not
in
self
.
attrs
)
or
(
self
.
attrs
[
attr_name
]
is
None
):
if
(
attr_name
not
in
op_attrs
)
or
(
op_attrs
[
attr_name
]
is
None
):
continue
attr_val
=
self
.
attrs
[
attr_name
]
attr_val
=
op_
attrs
[
attr_name
]
self
.
_update_desc_attr
(
attr_name
,
attr_val
)
self
.
desc
.
check_attrs
()
...
...
@@ -732,7 +733,6 @@ class Operator(object):
Raises:
ValueError: If the type of value doesn't match with desc.attr_type(name).
"""
self
.
attrs
[
name
]
=
val
self
.
_update_desc_attr
(
name
,
val
)
def
_update_desc_attr
(
self
,
name
,
val
):
...
...
@@ -774,9 +774,9 @@ class Operator(object):
"""
return
self
.
desc
.
attr
(
name
)
def
block_attr
(
self
,
name
):
def
block_attr
_id
(
self
,
name
):
"""
Get the block attribute by name.
Get the block attribute
's id
by name.
Args:
name(str): the attribute name.
...
...
@@ -784,22 +784,74 @@ class Operator(object):
Returns:
int: the block index.
"""
return
self
.
desc
.
block_attr
(
name
)
return
self
.
desc
.
block_attr_id
(
name
)
def
block_attr
(
self
,
name
):
"""
Get the block attribute by name.
Args:
name(str): the attribute name.
Returns:
block: the block attribute.
"""
id
=
self
.
block_attr_id
(
name
)
assert
(
id
>=
0
and
id
<
len
(
self
.
block
.
program
.
blocks
))
return
self
.
block
.
program
.
blocks
[
id
]
def
blocks_attr
(
self
,
name
):
"""
Get the blocks attribute by name.
Args:
name(str): the attribute name.
Returns:
list: list of the blocks attribute.
"""
attrs
=
[]
for
i
in
self
.
blocks_attr_ids
(
name
):
assert
(
i
>=
0
and
i
<
len
(
self
.
block
.
program
.
blocks
))
attrs
.
append
(
self
.
block
.
program
.
blocks
[
i
])
return
attrs
def
blocks_attr_ids
(
self
,
name
):
"""
Get the blocks attribute's ids by name.
Args:
name(str): the attribute name.
Returns:
list: list of the blocks ids.
"""
return
self
.
desc
.
blocks_attr_ids
(
name
)
def
all_attrs
(
self
):
"""
Get the attribute dict.
Returns:
dict: The Operator's attribute dict.
dict: The Operator's attribute dict
, name->attr
.
"""
attr_names
=
self
.
attr_names
attr_map
=
{}
for
n
in
attr_names
:
if
n
==
'sub_block'
:
attr_type
=
self
.
desc
.
attr_type
(
n
)
if
attr_type
==
core
.
AttrType
.
BLOCK
:
attr_map
[
n
]
=
self
.
block_attr
(
n
)
else
:
attr_map
[
n
]
=
self
.
attr
(
n
)
continue
if
attr_type
==
core
.
AttrType
.
BLOCKS
:
attr_map
[
n
]
=
self
.
blocks_attr
(
n
)
continue
attr_map
[
n
]
=
self
.
attr
(
n
)
return
attr_map
...
...
@@ -1518,11 +1570,17 @@ class Program(object):
The two code snippets above will generate same programs.
"""
if
for_test
:
p
=
self
.
inference_optimize
()
p
=
self
.
inference_optimize
(
export_for_deployment
=
False
)
else
:
p
=
Program
()
p
.
current_block_idx
=
self
.
current_block_idx
p
.
_seed
=
self
.
_seed
p
.
desc
=
core
.
ProgramDesc
(
self
.
desc
)
p
.
blocks
=
[
Block
(
p
,
i
)
for
i
in
range
(
self
.
desc
.
num_blocks
())]
p
.
blocks
=
[
Block
(
p
,
i
)
for
i
in
xrange
(
self
.
desc
.
num_blocks
())]
p
.
_current_role
=
self
.
_current_role
p
.
_op_role_var
=
self
.
_op_role_var
p
.
_sync_with_cpp
()
p
.
_copy_param_info_from
(
self
)
...
...
@@ -1578,7 +1636,7 @@ class Program(object):
res
.
_sync_with_cpp
()
return
res
def
inference_optimize
(
self
):
def
inference_optimize
(
self
,
export_for_deployment
=
True
):
"""
This method will create a new program and do following adjustments on it:
1. Remove all reader variables and their creator ops if exist.
...
...
@@ -1589,6 +1647,10 @@ class Program(object):
attribute of operators to :code:`True`. All the :code:`Parameter`
information will be lost.
Args:
export_for_deployment(bool): remove the read ops that are added by py_reader
for cpp inference library
Notes: This API is a very low level API. Use
:code:`Program.clone(for_test=True)` instead.
...
...
@@ -1603,16 +1665,17 @@ class Program(object):
# remove all readers and the read_op if exist
read_op_idx
=
0
root_block
=
res
.
desc
.
block
(
0
)
while
True
:
if
read_op_idx
>=
root_block
.
op_size
()
or
root_block
.
op
(
read_op_idx
).
type
()
==
'read'
:
break
read_op_idx
+=
1
if
read_op_idx
<
root_block
.
op_size
():
root_block
.
_remove_op
(
0
,
read_op_idx
+
1
)
for
var
in
root_block
.
all_vars
():
if
var
.
type
()
==
core
.
VarDesc
.
VarType
.
READER
:
root_block
.
_remove_var
(
var
.
name
())
if
export_for_deployment
:
while
True
:
if
read_op_idx
>=
root_block
.
op_size
()
or
root_block
.
op
(
read_op_idx
).
type
()
==
'read'
:
break
read_op_idx
+=
1
if
read_op_idx
<
root_block
.
op_size
():
root_block
.
_remove_op
(
0
,
read_op_idx
+
1
)
for
var
in
root_block
.
all_vars
():
if
var
.
type
()
==
core
.
VarDesc
.
VarType
.
READER
:
root_block
.
_remove_var
(
var
.
name
())
# change all `is_test` attributes to True
for
i
in
range
(
res
.
desc
.
num_blocks
()):
...
...
python/paddle/fluid/initializer.py
浏览文件 @
0098a494
...
...
@@ -264,7 +264,8 @@ class NormalInitializer(Initializer):
"dtype"
:
int
(
var
.
dtype
),
"mean"
:
self
.
_mean
,
"std"
:
self
.
_std_dev
,
"seed"
:
self
.
_seed
"seed"
:
self
.
_seed
,
"use_mkldnn"
:
False
})
var
.
op
=
op
return
op
...
...
python/paddle/fluid/io.py
浏览文件 @
0098a494
...
...
@@ -555,7 +555,8 @@ def save_inference_model(dirname,
executor
,
main_program
=
None
,
model_filename
=
None
,
params_filename
=
None
):
params_filename
=
None
,
export_for_deployment
=
True
):
"""
Prune the given `main_program` to build a new program especially for inference,
and then save it and all related parameters to given `dirname` by the `executor`.
...
...
@@ -577,6 +578,8 @@ def save_inference_model(dirname,
params_filename(str|None): The name of file to save all related parameters.
If it is setted None, parameters will be saved
in separate files .
export_for_deployment(bool): remove the read ops that are added by py_reader
for cpp inference lib. Default True
Returns:
None
...
...
@@ -643,7 +646,8 @@ def save_inference_model(dirname,
copy_program
.
desc
.
flush
()
pruned_program
=
copy_program
.
prune
(
targets
=
target_vars
)
inference_program
=
pruned_program
.
inference_optimize
()
inference_program
=
pruned_program
.
inference_optimize
(
export_for_deployment
=
export_for_deployment
)
fetch_var_names
=
[
v
.
name
for
v
in
target_vars
]
prepend_feed_ops
(
inference_program
,
feeded_var_names
)
...
...
python/paddle/fluid/layers/detection.py
浏览文件 @
0098a494
...
...
@@ -164,7 +164,7 @@ def rpn_target_assign(loc,
})
# 4. Reshape and gather the target entry
scores
=
nn
.
reshape
(
x
=
scores
,
shape
=
(
-
1
,
1
))
scores
=
nn
.
reshape
(
x
=
scores
,
shape
=
(
-
1
,
2
))
loc
=
nn
.
reshape
(
x
=
loc
,
shape
=
(
-
1
,
4
))
target_label
=
nn
.
reshape
(
x
=
target_label
,
shape
=
(
-
1
,
1
))
target_bbox
=
nn
.
reshape
(
x
=
target_bbox
,
shape
=
(
-
1
,
4
))
...
...
python/paddle/fluid/parallel_executor.py
浏览文件 @
0098a494
...
...
@@ -273,19 +273,19 @@ class ParallelExecutor(object):
arr
=
self
.
scope
.
find_var
(
fetch_var_name
).
get_lod_tensor_array
()
if
self
.
is_dist
:
self
.
bcast_params
()
self
.
_
bcast_params
()
if
return_numpy
:
return
executor
.
as_numpy
(
arr
)
return
[
arr
[
i
]
for
i
in
range
(
len
(
arr
))]
def
bcast_params
(
self
):
def
_
bcast_params
(
self
):
"""
Broadcast the parameters to other devices. It is used during
distributed training.
"""
self
.
executor
.
bcast_params
(
set
(
self
.
persistable_vars
))
self
.
executor
.
_
bcast_params
(
set
(
self
.
persistable_vars
))
@
property
def
device_count
(
self
):
...
...
python/paddle/fluid/tests/unittests/test_desc_clone.py
0 → 100644
浏览文件 @
0098a494
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
numpy
as
np
import
argparse
import
time
import
math
import
paddle
import
paddle.fluid
as
fluid
import
paddle.fluid.profiler
as
profiler
from
paddle.fluid
import
core
import
unittest
from
multiprocessing
import
Process
import
os
import
signal
import
collections
SEED
=
1
DTYPE
=
"float32"
paddle
.
dataset
.
mnist
.
fetch
()
# random seed must set before configuring the network.
# fluid.default_startup_program().random_seed = SEED
def
cnn_model
(
data
):
conv_pool_1
=
fluid
.
nets
.
simple_img_conv_pool
(
input
=
data
,
filter_size
=
5
,
num_filters
=
20
,
pool_size
=
2
,
pool_stride
=
2
,
act
=
"relu"
)
conv_pool_2
=
fluid
.
nets
.
simple_img_conv_pool
(
input
=
conv_pool_1
,
filter_size
=
5
,
num_filters
=
50
,
pool_size
=
2
,
pool_stride
=
2
,
act
=
"relu"
)
# TODO(dzhwinter) : refine the initializer and random seed settting
SIZE
=
10
input_shape
=
conv_pool_2
.
shape
param_shape
=
[
reduce
(
lambda
a
,
b
:
a
*
b
,
input_shape
[
1
:],
1
)]
+
[
SIZE
]
scale
=
(
2.0
/
(
param_shape
[
0
]
**
2
*
SIZE
))
**
0.5
predict
=
fluid
.
layers
.
fc
(
input
=
conv_pool_2
,
size
=
SIZE
,
act
=
"softmax"
,
param_attr
=
fluid
.
param_attr
.
ParamAttr
(
initializer
=
fluid
.
initializer
.
NormalInitializer
(
loc
=
0.0
,
scale
=
scale
)))
return
predict
def
get_model
(
batch_size
):
# Input data
images
=
fluid
.
layers
.
data
(
name
=
'pixel'
,
shape
=
[
1
,
28
,
28
],
dtype
=
DTYPE
)
label
=
fluid
.
layers
.
data
(
name
=
'label'
,
shape
=
[
1
],
dtype
=
'int64'
)
# Train program
predict
=
cnn_model
(
images
)
cost
=
fluid
.
layers
.
cross_entropy
(
input
=
predict
,
label
=
label
)
avg_cost
=
fluid
.
layers
.
mean
(
x
=
cost
)
# Evaluator
batch_size_tensor
=
fluid
.
layers
.
create_tensor
(
dtype
=
'int64'
)
batch_acc
=
fluid
.
layers
.
accuracy
(
input
=
predict
,
label
=
label
,
total
=
batch_size_tensor
)
inference_program
=
fluid
.
default_main_program
().
clone
()
# Optimization
opt
=
fluid
.
optimizer
.
AdamOptimizer
(
learning_rate
=
0.001
,
beta1
=
0.9
,
beta2
=
0.999
)
# Reader
train_reader
=
paddle
.
batch
(
paddle
.
dataset
.
mnist
.
train
(),
batch_size
=
batch_size
)
test_reader
=
paddle
.
batch
(
paddle
.
dataset
.
mnist
.
test
(),
batch_size
=
batch_size
)
opt
.
minimize
(
avg_cost
)
return
inference_program
,
avg_cost
,
train_reader
,
test_reader
,
batch_acc
,
predict
def
get_transpiler
(
trainer_id
,
main_program
,
pserver_endpoints
,
trainers
):
t
=
fluid
.
DistributeTranspiler
()
t
.
transpile
(
trainer_id
=
trainer_id
,
program
=
main_program
,
pservers
=
pserver_endpoints
,
trainers
=
trainers
)
return
t
def
operator_equal
(
a
,
b
):
for
k
,
v
in
a
.
__dict__
.
iteritems
():
if
isinstance
(
v
,
fluid
.
framework
.
Program
)
or
\
isinstance
(
v
,
fluid
.
framework
.
Block
):
continue
elif
isinstance
(
v
,
core
.
OpDesc
):
if
v
.
serialize_to_string
()
!=
b
.
__dict__
[
k
].
serialize_to_string
():
raise
ValueError
(
"In operator_equal not equal:{0}
\n
"
.
format
(
k
))
elif
isinstance
(
v
,
collections
.
OrderedDict
):
v0
=
sorted
(
v
.
iteritems
(),
key
=
lambda
x
:
x
[
0
])
v1
=
sorted
(
b
.
__dict__
[
k
].
iteritems
(),
key
=
lambda
x
:
x
[
0
])
if
v0
!=
v1
:
raise
ValueError
(
"In operator_equal not equal:{0}
\n
"
.
format
(
k
))
elif
(
v
!=
b
.
__dict__
[
k
]):
raise
ValueError
(
"In operator_equal not equal:{0}
\n
"
.
format
(
k
))
return
True
def
block_equal
(
a
,
b
):
for
k
,
v
in
a
.
__dict__
.
iteritems
():
if
isinstance
(
v
,
core
.
ProgramDesc
)
or
isinstance
(
v
,
fluid
.
framework
.
Program
)
or
isinstance
(
v
,
core
.
BlockDesc
):
continue
elif
k
==
"ops"
:
for
i
in
range
(
0
,
len
(
a
.
ops
)):
if
not
operator_equal
(
a
.
ops
[
i
],
b
.
ops
[
i
]):
raise
ValueError
(
"In block_equal not equal:{0}
\n
"
.
format
(
k
))
assert
(
len
(
a
.
ops
)
==
len
(
b
.
ops
))
elif
isinstance
(
v
,
collections
.
OrderedDict
):
v0
=
sorted
(
v
.
iteritems
(),
key
=
lambda
x
:
x
[
0
])
v1
=
sorted
(
b
.
__dict__
[
k
].
iteritems
(),
key
=
lambda
x
:
x
[
0
])
if
v0
!=
v1
:
raise
ValueError
(
"In block_equal not equal:{0}
\n
"
.
format
(
k
))
elif
(
v
!=
b
.
__dict__
[
k
]):
raise
ValueError
(
"In block_equal not equal:{0}
\n
"
.
format
(
k
))
return
True
def
program_equal
(
a
,
b
):
for
k
,
v
in
a
.
__dict__
.
iteritems
():
if
isinstance
(
v
,
core
.
ProgramDesc
):
continue
elif
k
==
'blocks'
:
for
i
in
range
(
0
,
len
(
a
.
blocks
)):
if
not
block_equal
(
a
.
blocks
[
i
],
b
.
blocks
[
i
]):
raise
ValueError
(
"In operator_equal not equal:{0}
\n
"
.
format
(
k
))
return
False
assert
(
len
(
a
.
blocks
)
==
len
(
b
.
blocks
))
elif
(
v
!=
b
.
__dict__
[
k
]):
raise
ValueError
(
"In program_equal not equal:{0}
\n
"
.
format
(
k
))
return
True
class
TestDistMnist
(
unittest
.
TestCase
):
def
test_desc_clone
(
self
):
get_model
(
batch_size
=
20
)
pserver_endpoints
=
"127.0.0.1:9123"
trainers
=
1
current_endpoint
=
"127.0.0.1:9123"
t
=
get_transpiler
(
0
,
fluid
.
default_main_program
(),
pserver_endpoints
,
trainers
)
pserver_prog
=
t
.
get_pserver_program
(
current_endpoint
)
startup_prog
=
t
.
get_startup_program
(
current_endpoint
,
pserver_prog
)
main
=
pserver_prog
.
clone
()
startup
=
startup_prog
.
clone
()
self
.
assertTrue
(
program_equal
(
main
,
pserver_prog
))
self
.
assertTrue
(
program_equal
(
startup
,
startup_prog
))
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_dist_base.py
浏览文件 @
0098a494
...
...
@@ -130,7 +130,7 @@ class TestDistBase(unittest.TestCase):
self
.
_ps_endpoints
=
"127.0.0.1:9123,127.0.0.1:9124"
self
.
_python_interp
=
"python"
def
start_pserver
(
self
,
model_file
):
def
start_pserver
(
self
,
model_file
,
check_error_log
):
ps0_ep
,
ps1_ep
=
self
.
_ps_endpoints
.
split
(
","
)
ps0_cmd
=
"%s %s pserver %s 0 %s %d TRUE"
%
\
(
self
.
_python_interp
,
model_file
,
self
.
_ps_endpoints
,
ps0_ep
,
...
...
@@ -139,11 +139,23 @@ class TestDistBase(unittest.TestCase):
(
self
.
_python_interp
,
model_file
,
self
.
_ps_endpoints
,
ps1_ep
,
self
.
_trainers
)
ps0_pipe
=
subprocess
.
PIPE
ps1_pipe
=
subprocess
.
PIPE
if
check_error_log
:
print
(
"ps0_cmd:"
,
ps0_cmd
)
print
(
"ps1_cmd:"
,
ps1_cmd
)
ps0_pipe
=
open
(
"/tmp/ps0_err.log"
,
"wb"
)
ps1_pipe
=
open
(
"/tmp/ps1_err.log"
,
"wb"
)
ps0_proc
=
subprocess
.
Popen
(
ps0_cmd
.
split
(
" "
),
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
)
ps0_cmd
.
split
(
" "
),
stdout
=
subprocess
.
PIPE
,
stderr
=
ps0_pipe
)
ps1_proc
=
subprocess
.
Popen
(
ps1_cmd
.
split
(
" "
),
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
)
return
ps0_proc
,
ps1_proc
ps1_cmd
.
split
(
" "
),
stdout
=
subprocess
.
PIPE
,
stderr
=
ps1_pipe
)
if
not
check_error_log
:
return
ps0_proc
,
ps1_proc
,
None
,
None
else
:
return
ps0_proc
,
ps1_proc
,
ps0_pipe
,
ps1_pipe
def
_wait_ps_ready
(
self
,
pid
):
retry_times
=
50
...
...
@@ -160,7 +172,7 @@ class TestDistBase(unittest.TestCase):
(
e
,
retry_times
))
retry_times
-=
1
def
check_with_place
(
self
,
model_file
,
delta
=
1e-3
):
def
check_with_place
(
self
,
model_file
,
delta
=
1e-3
,
check_error_log
=
False
):
# *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN
required_envs
=
{
"PATH"
:
os
.
getenv
(
"PATH"
),
...
...
@@ -169,17 +181,32 @@ class TestDistBase(unittest.TestCase):
"FLAGS_fraction_of_gpu_memory_to_use"
:
"0.15"
,
"FLAGS_cudnn_deterministic"
:
"1"
}
if
check_error_log
:
required_envs
[
"GLOG_v"
]
=
"7"
required_envs
[
"GLOG_logtostderr"
]
=
"1"
# Run local to get a base line
env_local
=
{
"CUDA_VISIBLE_DEVICES"
:
"0"
}
env_local
.
update
(
required_envs
)
local_cmd
=
"%s %s trainer %s 0 %s %d FLASE"
%
\
(
self
.
_python_interp
,
model_file
,
"127.0.0.1:1234"
,
"127.0.0.1:1234"
,
1
)
local_proc
=
subprocess
.
Popen
(
local_cmd
.
split
(
" "
),
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
,
env
=
env_local
)
if
not
check_error_log
:
local_proc
=
subprocess
.
Popen
(
local_cmd
.
split
(
" "
),
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
,
env
=
env_local
)
else
:
print
(
"trainer cmd:"
,
local_cmd
)
err_log
=
open
(
"/tmp/trainer.err.log"
,
"wb"
)
local_proc
=
subprocess
.
Popen
(
local_cmd
.
split
(
" "
),
stdout
=
subprocess
.
PIPE
,
stderr
=
err_log
,
env
=
env_local
)
local_proc
.
wait
()
out
,
err
=
local_proc
.
communicate
()
local_ret
=
out
...
...
@@ -187,7 +214,8 @@ class TestDistBase(unittest.TestCase):
sys
.
stderr
.
write
(
'local_stderr: %s
\n
'
%
err
)
# Run dist train to compare with local results
ps0
,
ps1
=
self
.
start_pserver
(
model_file
)
ps0
,
ps1
,
ps0_pipe
,
ps1_pipe
=
self
.
start_pserver
(
model_file
,
check_error_log
)
self
.
_wait_ps_ready
(
ps0
.
pid
)
self
.
_wait_ps_ready
(
ps1
.
pid
)
...
...
@@ -205,15 +233,23 @@ class TestDistBase(unittest.TestCase):
env1
.
update
(
required_envs
)
FNULL
=
open
(
os
.
devnull
,
'w'
)
tr0_pipe
=
subprocess
.
PIPE
tr1_pipe
=
subprocess
.
PIPE
if
check_error_log
:
print
(
"tr0_cmd:"
,
tr0_cmd
)
print
(
"tr1_cmd:"
,
tr1_cmd
)
tr0_pipe
=
open
(
"/tmp/tr0_err.log"
,
"wb"
)
tr1_pipe
=
open
(
"/tmp/tr1_err.log"
,
"wb"
)
tr0_proc
=
subprocess
.
Popen
(
tr0_cmd
.
split
(
" "
),
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
,
stderr
=
tr0_pipe
,
env
=
env0
)
tr1_proc
=
subprocess
.
Popen
(
tr1_cmd
.
split
(
" "
),
stdout
=
subprocess
.
PIPE
,
stderr
=
subprocess
.
PIPE
,
stderr
=
tr1_pipe
,
env
=
env1
)
tr0_proc
.
wait
()
...
...
@@ -230,6 +266,13 @@ class TestDistBase(unittest.TestCase):
local_first_loss
=
eval
(
local_lines
[
0
])[
0
]
local_last_loss
=
eval
(
local_lines
[
1
])[
0
]
# close trainer file
if
check_error_log
:
tr0_pipe
.
close
()
tr1_pipe
.
close
()
ps0_pipe
.
close
()
ps1_pipe
.
close
()
# FIXME: use terminate() instead of sigkill.
os
.
kill
(
ps0
.
pid
,
signal
.
SIGKILL
)
os
.
kill
(
ps1
.
pid
,
signal
.
SIGKILL
)
...
...
python/paddle/fluid/tests/unittests/test_dist_transpiler.py
浏览文件 @
0098a494
...
...
@@ -259,7 +259,7 @@ class TestLRDecayConditional(TranspilerTest):
serv_op
=
pserver
.
blocks
[
0
].
ops
[
0
]
sub_blocks
=
[]
optimize_blocks
=
[]
for
b
in
serv_op
.
a
ttrs
[
"optimize_blocks"
]:
for
b
in
serv_op
.
a
ll_attrs
()
[
"optimize_blocks"
]:
optimize_blocks
.
append
(
b
.
idx
)
for
b
in
pserver
.
blocks
:
if
b
.
idx
not
in
optimize_blocks
:
...
...
@@ -536,5 +536,35 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase):
self
.
assertEqual
([
op
.
type
for
op
in
trainer
.
blocks
[
0
].
ops
],
ops
)
class
TestRMSPropOptimizer
(
TranspilerTest
):
def
net_conf
(
self
):
x
=
fluid
.
layers
.
data
(
name
=
'x'
,
shape
=
[
1000
],
dtype
=
'float32'
)
y_predict
=
fluid
.
layers
.
fc
(
input
=
x
,
size
=
1000
,
act
=
None
,
param_attr
=
fluid
.
ParamAttr
(
name
=
'fc_w'
),
bias_attr
=
fluid
.
ParamAttr
(
name
=
'fc_b'
))
y
=
fluid
.
layers
.
data
(
name
=
'y'
,
shape
=
[
1
],
dtype
=
'float32'
)
cost
=
fluid
.
layers
.
square_error_cost
(
input
=
y_predict
,
label
=
y
)
avg_cost
=
fluid
.
layers
.
mean
(
cost
)
optimizer
=
fluid
.
optimizer
.
RMSProp
(
learning_rate
=
0.1
)
optimizer
.
minimize
(
avg_cost
)
return
def
transpiler_test_impl
(
self
):
pserver
,
startup
=
self
.
get_pserver
(
self
.
pserver1_ep
)
pserver2
,
startup2
=
self
.
get_pserver
(
self
.
pserver2_ep
)
self
.
assertEqual
(
len
(
pserver
.
blocks
),
3
)
# block1~2: optimize pass
self
.
assertEqual
([
op
.
type
for
op
in
pserver
.
blocks
[
1
].
ops
],
[
"sum"
,
"scale"
,
"rmsprop"
])
# the variable #fc_w will be split into two blocks
fc_w_var
=
startup
.
global_block
().
var
(
"fc_w.block1"
)
self
.
assertEqual
(
fc_w_var
.
shape
,
(
500
,
1000
))
moment_var
=
startup
.
global_block
().
var
(
"momentum_1"
)
self
.
assertEqual
(
moment_var
.
shape
,
(
500
,
1000
))
if
__name__
==
"__main__"
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_fetch_var.py
浏览文件 @
0098a494
...
...
@@ -26,7 +26,7 @@ class TestFetchVar(op_test.OpTest):
layers
.
assign
(
input
=
val
,
output
=
x
)
exe
=
fluid
.
Executor
(
fluid
.
CPUPlace
())
exe
.
run
(
fluid
.
default_main_program
(),
feed
=
{},
fetch_list
=
[])
fetched_x
=
fluid
.
fetch_var
(
"x"
)
fetched_x
=
fluid
.
executor
.
_
fetch_var
(
"x"
)
self
.
assertTrue
(
numpy
.
array_equal
(
fetched_x
,
val
),
"fetch_x=%s val=%s"
%
(
fetched_x
,
val
))
...
...
python/paddle/fluid/tests/unittests/test_program.py
浏览文件 @
0098a494
...
...
@@ -17,6 +17,7 @@ import unittest
from
paddle.fluid.framework
import
Program
,
default_main_program
,
program_guard
,
grad_var_name
import
paddle.fluid.layers
as
layers
import
paddle.fluid
as
fluid
main_program
=
default_main_program
()
...
...
@@ -98,6 +99,39 @@ class TestProgram(unittest.TestCase):
new_program
=
main_program
.
clone
()
self
.
assertNotEqual
(
0
,
len
(
new_program
.
blocks
[
0
].
all_parameters
()))
def
test_program_inference_optimize
(
self
):
def
net
():
reader
=
fluid
.
layers
.
py_reader
(
capacity
=
10
,
shapes
=
[[
-
1
,
10
],
[
-
1
,
1
]],
lod_levels
=
[
0
,
0
],
dtypes
=
[
'float32'
,
'int64'
],
use_double_buffer
=
True
)
in_data
,
label
=
fluid
.
layers
.
read_file
(
reader
)
predict_label
=
fluid
.
layers
.
fc
(
in_data
,
size
=
2
,
act
=
'softmax'
)
loss
=
fluid
.
layers
.
mean
(
fluid
.
layers
.
cross_entropy
(
input
=
predict_label
,
label
=
label
))
optimizer
=
fluid
.
optimizer
.
Adam
()
optimizer
.
minimize
(
loss
)
startup_program
=
fluid
.
Program
()
main_program
=
fluid
.
Program
()
with
fluid
.
program_guard
(
main_program
,
startup_program
):
net
()
no_read_program
=
main_program
.
inference_optimize
()
keep_read_program
=
main_program
.
inference_optimize
(
export_for_deployment
=
False
)
no_read_ops
=
no_read_program
.
global_block
().
ops
keep_read_ops
=
keep_read_program
.
global_block
().
ops
self
.
assertEqual
(
len
(
keep_read_ops
)
-
len
(
no_read_ops
),
2
)
self
.
assertEqual
(
keep_read_ops
[
0
].
type
,
'create_double_buffer_reader'
)
self
.
assertEqual
(
keep_read_ops
[
1
].
type
,
'read'
)
for
i
in
range
(
len
(
no_read_ops
)):
self
.
assertEqual
(
no_read_ops
[
i
].
type
,
keep_read_ops
[
i
+
2
].
type
)
if
__name__
==
'__main__'
:
unittest
.
main
()
python/paddle/fluid/tests/unittests/test_protobuf_descs.py
浏览文件 @
0098a494
...
...
@@ -68,7 +68,7 @@ class TestOpDesc(unittest.TestCase):
self
.
assertEqual
(
8
,
len
(
op
.
attr_names
()))
op
.
set_block_attr
(
"block_attr"
,
program_desc
.
block
(
0
))
self
.
assertEqual
(
0
,
op
.
block_attr
(
"block_attr"
))
self
.
assertEqual
(
0
,
op
.
block_attr
_id
(
"block_attr"
))
mul_op
=
block
.
append_op
()
mul_op
.
set_type
(
"mul"
)
...
...
python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py
浏览文件 @
0098a494
...
...
@@ -62,7 +62,8 @@ class TestPyReader(unittest.TestCase):
next_data
=
np
.
random
.
uniform
(
low
=
0
,
high
=
1000
,
size
=
(
batch_size
,
)
+
shape
[
1
:]).
astype
(
dtype
)
in_data
.
append
(
executor
.
as_lodtensor
(
next_data
))
in_data
.
append
(
fluid
.
executor
.
_as_lodtensor
(
next_data
,
place
))
self
.
inputs
.
append
(
in_data
)
...
...
python/paddle/fluid/tests/unittests/transformer_model.py
浏览文件 @
0098a494
...
...
@@ -22,7 +22,7 @@ pos_enc_param_names = (
"src_pos_enc_table"
,
"trg_pos_enc_table"
,
)
batch_size
=
64
batch_size
=
2
def
position_encoding_init
(
n_position
,
d_pos_vec
):
...
...
python/paddle/fluid/transpiler/distribute_transpiler.py
浏览文件 @
0098a494
...
...
@@ -584,12 +584,12 @@ class DistributeTranspiler(object):
if
op
.
type
in
[
"gaussian_random"
,
"fill_constant"
,
"uniform_random"
]:
op
.
attrs
[
"shape"
]
=
new_outputs
[
"Out"
].
shape
op
.
set_attr
(
"shape"
,
list
(
new_outputs
[
"Out"
].
shape
))
s_prog
.
global_block
().
append_op
(
type
=
op
.
type
,
inputs
=
new_inputs
,
outputs
=
new_outputs
,
attrs
=
op
.
a
ttrs
)
attrs
=
op
.
a
ll_attrs
()
)
return
s_prog
# ====================== private transpiler functions =====================
...
...
@@ -603,7 +603,7 @@ class DistributeTranspiler(object):
self
.
table_name
=
None
for
op
in
self
.
origin_program
.
global_block
().
ops
:
if
op
.
type
==
LOOKUP_TABLE_TYPE
:
if
op
.
attr
s
[
'is_distributed'
]
is
True
:
if
op
.
attr
(
'is_distributed'
)
is
True
:
if
self
.
table_name
is
None
:
self
.
table_name
=
op
.
input
(
"W"
)[
0
]
if
self
.
table_name
!=
op
.
input
(
"W"
)[
0
]:
...
...
@@ -749,14 +749,14 @@ class DistributeTranspiler(object):
out_name
=
op
.
output
(
"Out"
)
ids_var
=
program
.
global_block
().
vars
[
ids_name
[
0
]]
prefetch_input_vars
=
self
.
create_splited_vars
(
prefetch_input_vars
=
self
.
_
create_splited_vars
(
source_var
=
ids_var
,
block
=
program
.
global_block
(),
tag
=
"_prefetch_in_"
)
self
.
all_prefetch_input_vars
.
append
(
prefetch_input_vars
)
out_var
=
program
.
global_block
().
vars
[
out_name
[
0
]]
prefetch_output_vars
=
self
.
create_splited_vars
(
prefetch_output_vars
=
self
.
_
create_splited_vars
(
source_var
=
out_var
,
block
=
program
.
global_block
(),
tag
=
"_prefetch_out_"
)
...
...
@@ -1038,7 +1038,7 @@ class DistributeTranspiler(object):
program
.
global_block
().
_sync_with_cpp
()
return
var_mapping
def
create_splited_vars
(
self
,
source_var
,
block
,
tag
):
def
_
create_splited_vars
(
self
,
source_var
,
block
,
tag
):
return
[
block
.
create_var
(
name
=
str
(
source_var
.
name
+
tag
+
str
(
index
)),
...
...
@@ -1182,18 +1182,39 @@ class DistributeTranspiler(object):
program
=
optimize_block
.
program
pserver_block
=
program
.
global_block
()
new_inputs
=
dict
()
# update param/grad shape first, then other inputs like
# moment can use the updated shape
def
_get_param_block
(
opt_op
):
# param is already created on global program
param_block
=
None
for
p
in
self
.
param_grad_ep_mapping
[
endpoint
][
"params"
]:
if
same_or_split_var
(
p
.
name
,
opt_op
.
input
(
"Param"
)[
0
]):
param_block
=
p
break
return
param_block
for
key
in
opt_op
.
input_names
:
if
key
==
"Grad"
:
new_inputs
[
key
]
=
merged_var
# For RMSProp optimizer
elif
key
==
"Moment"
or
key
==
"MeanSquare"
:
param_block
=
_get_param_block
(
opt_op
)
if
not
param_block
:
return
moment_var
=
origin_program
.
global_block
().
vars
[
opt_op
.
input
(
key
)[
0
]]
tmpvar
=
pserver_block
.
create_var
(
name
=
moment_var
.
name
,
persistable
=
moment_var
.
persistable
,
dtype
=
moment_var
.
dtype
,
# change to use same shape as param
# TODO(typhoonzero): didn't append .block in the var name,
# may affect checkpoint saving? Need to verify.
shape
=
param_block
.
shape
)
new_inputs
[
key
]
=
tmpvar
elif
key
==
"Param"
:
# param is already created on global program
param_block
=
None
for
p
in
self
.
param_grad_ep_mapping
[
endpoint
][
"params"
]:
if
same_or_split_var
(
p
.
name
,
opt_op
.
input
(
key
)[
0
]):
param_block
=
p
break
param_block
=
_get_param_block
(
opt_op
)
if
not
param_block
:
return
tmpvar
=
pserver_block
.
create_var
(
...
...
@@ -1219,7 +1240,7 @@ class DistributeTranspiler(object):
for
key
in
opt_op
.
input_names
:
new_shape
=
None
if
key
in
[
"Param"
,
"Grad"
,
"LearningRate"
]:
if
key
in
[
"Param"
,
"Grad"
,
"LearningRate"
,
"Moment"
,
"MeanSquare"
]:
continue
var
=
self
.
origin_program
.
global_block
().
vars
[
opt_op
.
input
(
key
)[
0
]]
# update accumulator variable shape
...
...
@@ -1242,7 +1263,7 @@ class DistributeTranspiler(object):
type
=
opt_op
.
type
,
inputs
=
new_inputs
,
outputs
=
outputs
,
attrs
=
opt_op
.
a
ttrs
)
attrs
=
opt_op
.
a
ll_attrs
()
)
def
_is_splited_grad_var
(
self
,
var
,
var_dict
):
grad_block
=
None
...
...
@@ -1275,7 +1296,7 @@ class DistributeTranspiler(object):
block
.
_clone_variable
(
var
)
return
block
.
append_op
(
type
=
op
.
type
,
inputs
=
inputs
,
outputs
=
outputs
,
attrs
=
op
.
a
ttrs
)
type
=
op
.
type
,
inputs
=
inputs
,
outputs
=
outputs
,
attrs
=
op
.
a
ll_attrs
()
)
def
_append_pserver_non_opt_ops
(
self
,
optimize_block
,
opt_op
):
program
=
optimize_block
.
program
...
...
@@ -1316,7 +1337,7 @@ class DistributeTranspiler(object):
type
=
opt_op
.
type
,
inputs
=
inputs
,
outputs
=
outputs
,
attrs
=
opt_op
.
a
ttrs
)
attrs
=
opt_op
.
a
ll_attrs
()
)
def
_is_op_connected
(
self
,
op1
,
op2
):
# If one op's input is another op's output or
...
...
@@ -1421,8 +1442,8 @@ class DistributeTranspiler(object):
# optimize
op_maker
=
core
.
op_proto_and_checker_maker
optimize_role
=
core
.
op_proto_and_checker_maker
.
OpRole
.
Optimize
if
op_maker
.
kOpRoleAttrName
()
in
op
.
attrs
and
\
int
(
op
.
attrs
[
op_maker
.
kOpRoleAttrName
()])
==
int
(
optimize_role
):
if
op_maker
.
kOpRoleAttrName
()
in
op
.
attr
_name
s
and
\
int
(
op
.
all_attrs
()
[
op_maker
.
kOpRoleAttrName
()])
==
int
(
optimize_role
):
return
True
return
False
...
...
@@ -1445,8 +1466,8 @@ class DistributeTranspiler(object):
# and op_role_var to get the pair.
for
input_name
in
op
.
input_arg_names
:
if
input_name
.
find
(
"@GRAD"
)
!=
-
1
and
\
op
.
attr
s
[
RPC_OP_ROLE_ATTR_NAME
]
:
param_name
=
op
.
attr
s
[
OP_ROLE_VAR_ATTR_NAME
]
[
0
]
op
.
attr
(
RPC_OP_ROLE_ATTR_NAME
)
:
param_name
=
op
.
attr
(
OP_ROLE_VAR_ATTR_NAME
)
[
0
]
params_grads
.
append
([
origin_var_dict
[
param_name
],
origin_var_dict
[
input_name
]
...
...
python/paddle/fluid/transpiler/inference_transpiler.py
浏览文件 @
0098a494
...
...
@@ -57,10 +57,10 @@ class InferenceTranspiler(object):
scope
=
global_scope
()
if
not
isinstance
(
scope
,
core
.
Scope
):
raise
TypeError
(
"scope should be as Scope type or None"
)
self
.
fuse_batch_norm
(
program
,
place
,
scope
)
self
.
fuse_relu_mkldnn
(
program
)
self
.
_
fuse_batch_norm
(
program
,
place
,
scope
)
self
.
_
fuse_relu_mkldnn
(
program
)
def
fuse_relu_mkldnn
(
self
,
program
):
def
_
fuse_relu_mkldnn
(
self
,
program
):
'''
Transpile the program by fused relu activation for MKLDNN program.
...
...
@@ -104,7 +104,7 @@ class InferenceTranspiler(object):
# And a better solution will be considered later.
program
=
program
.
clone
()
def
fuse_batch_norm
(
self
,
program
,
place
,
scope
):
def
_
fuse_batch_norm
(
self
,
program
,
place
,
scope
):
'''
Transpile the program by fused batch normalization.
...
...
python/requirements.txt
浏览文件 @
0098a494
requests==2.9.2
numpy>=1.12
numpy>=1.12
,<=1.14 #TODO:change to ">=1.12" when numpy fix bug in 1.15 and higher version
protobuf==3.1
recordio>=0.1.0
matplotlib
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录