diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake index ed054ff41ae0ec5a4b31dd256e397129cba3e8f1..84354c446e2f54fa13b90fa37221eed90968b251 100644 --- a/cmake/external/anakin.cmake +++ b/cmake/external/anakin.cmake @@ -52,6 +52,7 @@ ExternalProject_Add( PREFIX ${ANAKIN_SOURCE_DIR} UPDATE_COMMAND "" CMAKE_ARGS ${CMAKE_ARGS_PREFIX} + -DUSE_LOGGER=YES -DUSE_X86_PLACE=YES -DBUILD_WITH_UNIT_TEST=NO -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf diff --git a/cmake/flags.cmake b/cmake/flags.cmake index e0556a0babc74ba6efa0a190d4f7b77416bef3bf..331b2af367bdf261ffbf96fb88f61cc6958ee647 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -27,7 +27,6 @@ endfunction() CheckCompilerCXX11Flag() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") - # safe_set_flag # # Set a compile flag only if compiler is support @@ -71,6 +70,20 @@ macro(safe_set_nvflag flag_name) endif() endmacro() +macro(safe_set_static_flag) # set c_flags and cxx_flags to static or shared + if (BUILD_SHARED_LIBS) + return() # if build shared libs, the flags keep same with '/MD' + endif(BUILD_SHARED_LIBS) + foreach(flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO + CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE + CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif(${flag_var} MATCHES "/MD") + endforeach(flag_var) +endmacro() CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS) if(NOT UINT64_MAX_EXISTS) @@ -97,9 +110,13 @@ SET(CMAKE_EXTRA_INCLUDE_FILES "") # Common flags. the compiler flag used for C/C++ sources whenever release or debug # Do not care if this flag is support for gcc. + +# https://github.com/PaddlePaddle/Paddle/issues/12773 +if (NOT WIN32) set(COMMON_FLAGS -fPIC -fno-omit-frame-pointer + -Werror -Wall -Wextra -Wnon-virtual-dtor @@ -114,11 +131,6 @@ set(COMMON_FLAGS -Wno-error=terminate # Warning in PADDLE_ENFORCE ) -# https://github.com/PaddlePaddle/Paddle/issues/12773 -if (NOT WIN32) -list(APPEND COMMON_FLAGS -Werror) -endif() - set(GPU_COMMON_FLAGS -fPIC -fno-omit-frame-pointer @@ -133,30 +145,53 @@ set(GPU_COMMON_FLAGS -Wno-error=array-bounds # Warnings in Eigen::array ) +else(NOT WIN32) +set(COMMON_FLAGS + "/w") #disable all warnings. +set(GPU_COMMON_FLAGS + "/w") #disable all warnings +endif(NOT WIN32) + if (APPLE) if(NOT CMAKE_CROSSCOMPILING) # On Mac OS X build fat binaries with x86_64 architectures by default. set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE) endif() -else() +endif(APPLE) + +if(LINUX) set(GPU_COMMON_FLAGS -Wall -Wextra -Werror ${GPU_COMMON_FLAGS}) -endif() +endif(LINUX) if(UNIX AND NOT APPLE) # except apple from nix*Os family set(LINUX TRUE) endif(UNIX AND NOT APPLE) - foreach(flag ${COMMON_FLAGS}) safe_set_cflag(CMAKE_C_FLAGS ${flag}) safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag}) + endforeach() foreach(flag ${GPU_COMMON_FLAGS}) safe_set_nvflag(${flag}) endforeach() + +if(WIN32) +# windows build turn off warnings. 
+safe_set_static_flag() + foreach(flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO + CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE + CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "/W3") + string(REGEX REPLACE "/W3" "/w" ${flag_var} "${${flag_var}}") + endif(${flag_var} MATCHES "/W3") + endforeach(flag_var) +endif(WIN32) diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt index 0f9521616952a2857222feab8c38fb480761ee2d..a777a4974cc377db103a470698f817612a4e9a32 100644 --- a/doc/CMakeLists.txt +++ b/doc/CMakeLists.txt @@ -1,11 +1,9 @@ add_custom_target(paddle_apis ALL - DEPENDS paddle_v2_apis paddle_fluid_apis) + DEPENDS paddle_v2_apis) add_custom_target(paddle_docs ALL DEPENDS paddle_v2_docs paddle_v2_docs_cn - paddle_fluid_docs paddle_fluid_docs_cn paddle_mobile_docs paddle_mobile_docs_cn) add_subdirectory(v2) -add_subdirectory(fluid) add_subdirectory(mobile) diff --git a/paddle/contrib/float16/float16_transpiler.py b/paddle/contrib/float16/float16_transpiler.py index 66e0345c299730c113ffbdc8dd3c1fa32f872f3d..8d95dc0591e1d6bd815cc697528191c2ee8c1cfe 100644 --- a/paddle/contrib/float16/float16_transpiler.py +++ b/paddle/contrib/float16/float16_transpiler.py @@ -102,8 +102,8 @@ class Float16Transpiler: continue for input_arg in current_op.input_arg_names: if input_arg in self.input_map: - current_op.rename_input(input_arg, - self.input_map[input_arg]) + current_op._rename_input(input_arg, + self.input_map[input_arg]) def _remove_unused_var(self): ''' @@ -187,7 +187,7 @@ class Float16Transpiler: shape=var.shape, persistable=var.persistable) find_op(var) - var.op.rename_output(var_name, tmp_var_name) + var.op._rename_output(var_name, tmp_var_name) self.block._insert_op( i, type="cast", diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index d3583cf894991624f537a4073f14aacc470aadd0..452806a20e08c518b0f5aab7f63366eeb9341561 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -6,26 +6,9 @@ paddle.fluid.Program.global_block ArgSpec(args=['self'], varargs=None, keywords= paddle.fluid.Program.list_vars ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.parse_from_string ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None) paddle.fluid.Program.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,)) -paddle.fluid.Operator.__init__ ArgSpec(args=['self', 'block', 'desc', 'type', 'inputs', 'outputs', 'attrs'], varargs=None, keywords=None, defaults=(None, None, None, None)) -paddle.fluid.Operator.all_attrs ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.attr_type ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.block_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.block_attr_id ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.blocks_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.blocks_attr_ids ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.has_attr ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) 
-paddle.fluid.Operator.has_kernel ArgSpec(args=['self', 'op_type'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.input ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.output ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.rename_input ArgSpec(args=['self', 'old_name', 'new_name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.rename_output ArgSpec(args=['self', 'old_name', 'new_name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.set_attr ArgSpec(args=['self', 'name', 'val'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Operator.to_string ArgSpec(args=['self', 'throw_on_error'], varargs=None, keywords=None, defaults=None) paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) -paddle.fluid.get_var ArgSpec(args=['name', 'program'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.name_scope ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.Executor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None) paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) @@ -38,7 +21,7 @@ paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'en paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')) -paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) +paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False)) paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.DistributeTranspilerConfig.__init__ paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)) @@ -170,6 +153,13 @@ paddle.fluid.layers.elementwise_mul ArgSpec(args=['x', 'y', 'out', 'axis', 'use_ paddle.fluid.layers.elementwise_max ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) paddle.fluid.layers.elementwise_min ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) paddle.fluid.layers.elementwise_pow ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, 
False, None, None)) +paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=['input', 'shape', 'dtype', 'input_dim_idx', 'output_dim_idx', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', 0, 0, -1.0, 1.0, 0)) +paddle.fluid.layers.gaussian_random ArgSpec(args=['shape', 'mean', 'std', 'seed', 'dtype', 'use_mkldnn'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32', False)) +paddle.fluid.layers.sampling_id ArgSpec(args=['x', 'min', 'max', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32')) +paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=['input', 'shape', 'input_dim_idx', 'output_dim_idx', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0, 0, 0.0, 1.0, 0, 'float32')) +paddle.fluid.layers.sum ArgSpec(args=['x', 'use_mkldnn'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.layers.slice ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.shape ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) @@ -241,13 +231,6 @@ paddle.fluid.layers.logical_and ArgSpec(args=[], varargs='args', keywords='kwarg paddle.fluid.layers.logical_or ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.logical_xor ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.logical_not ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.gaussian_random ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.sampling_id ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.sigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.logsigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) @@ -286,7 +269,7 @@ paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kw paddle.fluid.layers.box_coder ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.polygon_box_transform ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) -paddle.fluid.layers.auc ArgSpec(args=['input', 
'label', 'curve', 'num_thresholds', 'topk'], varargs=None, keywords=None, defaults=('ROC', 4095, 1)) +paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)) paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.natural_exp_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.inverse_time_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) @@ -315,13 +298,18 @@ paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False)) paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.op_freq_statistic ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000)) +paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)) +paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')) -paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) +paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False)) paddle.fluid.transpiler.release_memory 
ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.HashName.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.HashName.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index ee1f655e25dedb8846bb26275072fd9f6c1f123e..519a00fb073b08f6c88de8186de187476b548fd3 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -13,3 +13,5 @@ if(WITH_INFERENCE) # NOTE: please add subdirectory inference at last. add_subdirectory(inference) endif() + +add_subdirectory(train) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 6d8cbe5d9e491555a94e9420995149041213ab79..39898dd23643c5742f209858c7d3dfad89968f7d 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -56,9 +56,9 @@ else() cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor) endif() if (NOT WIN32) -cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version) + cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version) else() -cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version) + cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version) endif (NOT WIN32) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) @@ -141,20 +141,22 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) +cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass) + if(WITH_DISTRIBUTE) cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass) + cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass elementwise_add_op) endif() if (NOT WIN32) - cc_library(parallel_executor SRCS parallel_executor.cc DEPS - threaded_ssa_graph_executor scope_buffered_ssa_graph_executor - graph graph_viz_pass multi_devices_graph_pass - multi_devices_graph_print_pass multi_devices_graph_check_pass - fast_threaded_ssa_graph_executor fuse_elewise_add_act_pass) +cc_library(parallel_executor SRCS parallel_executor.cc DEPS + threaded_ssa_graph_executor scope_buffered_ssa_graph_executor + graph build_strategy + fast_threaded_ssa_graph_executor) endif() # NOT WIN32 cc_library(prune SRCS prune.cc DEPS framework_proto) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index a8e0c4a3fedfd56e38de7568be6b3f2e76a4b25f..e0a3ef5a9c6c53c42ebea1a41cac0d18a77781b2 100644 --- 
a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -54,3 +54,8 @@ cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_execu
 # device_context reduce_op_handle )
 cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context)
+
+cc_library(build_strategy SRCS build_strategy.cc DEPS
+        graph_viz_pass multi_devices_graph_pass
+        multi_devices_graph_print_pass multi_devices_graph_check_pass
+        fuse_elewise_add_act_pass)
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6a6b497fa897e3882995688bf36704b1d77ea962
--- /dev/null
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -0,0 +1,126 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/details/build_strategy.h"
+
+#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
+#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_viz_pass.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class ParallelExecutorPassBuilder : public ir::PassBuilder {
+ public:
+  explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
+      : ir::PassBuilder(), strategy_(strategy) {
+    // Add a graph viz pass to record a graph.
+    if (!strategy_.debug_graphviz_path_.empty()) {
+      auto viz_pass = AppendPass("graph_viz_pass");
+      const std::string graph_path = string::Sprintf(
+          "%s%s", strategy_.debug_graphviz_path_.c_str(), "_original_graph");
+      viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
+    }
+
+    // Add op fusion.
+    if (strategy.fuse_elewise_add_act_ops_) {
+      auto fuse_elewise_add_act_pass = AppendPass("fuse_elewise_add_act_pass");
+      // Add a graph viz pass to record a graph.
+      if (!strategy.debug_graphviz_path_.empty()) {
+        auto viz_pass = AppendPass("graph_viz_pass");
+        const std::string graph_path = string::Sprintf(
+            "%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph");
+        viz_pass->Set<std::string>("graph_viz_path",
+                                   new std::string(graph_path));
+      }
+    }
+
+    // Convert graph to run on multi-devices.
+    auto multi_devices_pass = AppendPass("multi_devices_pass");
+    multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
+                                                         &strategy_);
+
+    // Add a graph print pass to record a graph with device info.
+    if (!strategy_.debug_graphviz_path_.empty()) {
+      auto multi_devices_print_pass = AppendPass("multi_devices_print_pass");
+      multi_devices_print_pass->SetNotOwned<const std::string>(
+          "debug_graphviz_path", &strategy_.debug_graphviz_path_);
+      multi_devices_print_pass->Set<details::GraphvizSSAGraphPrinter>(
+          "graph_printer", new details::GraphvizSSAGraphPrinter);
+    }
+
+    // Verify that the graph is correct for multi-device executor.
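+    // The default pipeline assembled above is therefore:
+    //   graph_viz_pass (optional) -> fuse_elewise_add_act_pass (optional,
+    //   with its own graph_viz_pass) -> multi_devices_pass ->
+    //   multi_devices_print_pass (optional) -> multi_devices_check_pass.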
+ AppendPass("multi_devices_check_pass"); + } + + private: + BuildStrategy strategy_; +}; + +std::shared_ptr BuildStrategy::CreatePassesFromStrategy() + const { + pass_builder_.reset(new ParallelExecutorPassBuilder(*this)); + return pass_builder_; +} + +std::unique_ptr BuildStrategy::Apply( + const ProgramDesc &main_program, const std::vector &places, + const std::string &loss_var_name, + const std::unordered_set ¶m_names, + const std::vector &local_scopes, +#ifdef PADDLE_WITH_CUDA + const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const { +#else + const bool use_cuda) const { +#endif + // Create a default one if not initialized by user. + if (!pass_builder_) { + CreatePassesFromStrategy(); + } + + std::unique_ptr graph(new ir::Graph(main_program)); + + for (std::shared_ptr &pass : pass_builder_->AllPasses()) { + if (pass->Type() == "multi_devices_pass") { + pass->Erase("places"); + pass->SetNotOwned>("places", &places); + pass->Erase("loss_var_name"); + pass->SetNotOwned("loss_var_name", &loss_var_name); + pass->Erase("params"); + pass->SetNotOwned>("params", + ¶m_names); + pass->Erase("local_scopes"); + pass->SetNotOwned>("local_scopes", + &local_scopes); +#ifdef PADDLE_WITH_CUDA + platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr; + pass->Erase("nccl_ctxs"); + pass->SetNotOwned("nccl_ctxs", nctx); +#endif + } + graph = pass->Apply(std::move(graph)); + } + return graph; +} +} // namespace details +} // namespace framework +} // namespace paddle + +USE_PASS(fuse_elewise_add_act_pass); +USE_PASS(graph_viz_pass); +USE_PASS(multi_devices_pass); +USE_PASS(multi_devices_check_pass); +USE_PASS(multi_devices_print_pass); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 77cafa49f18158ea4187908856e10adaa0f916c9..02c4bea16916d58a6d0fce8918f8fceb9ff9356e 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -15,6 +15,17 @@ #pragma once #include +#include + +#include "paddle/fluid/framework/ir/pass_builder.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" + +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/nccl_helper.h" +#endif namespace paddle { namespace framework { @@ -57,6 +68,30 @@ struct BuildStrategy { bool fuse_elewise_add_act_ops_{false}; bool enable_data_balance_{false}; + + // User normally doesn't need to call this API. + // The PassBuilder allows for more customized insert, remove of passes + // from python side. + // A new PassBuilder is created based on configs defined above and + // passes are owned by the PassBuilder. + std::shared_ptr CreatePassesFromStrategy() const; + + // Apply the passes built by the pass_builder_. The passes will be + // applied to the Program and output an ir::Graph. 
+  std::unique_ptr<ir::Graph> Apply(
+      const ProgramDesc &main_program,
+      const std::vector<platform::Place> &places,
+      const std::string &loss_var_name,
+      const std::unordered_set<std::string> &param_names,
+      const std::vector<Scope *> &local_scopes,
+#ifdef PADDLE_WITH_CUDA
+      const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const;
+#else
+      const bool use_cuda) const;
+#endif
+
+ private:
+  mutable std::shared_ptr<ir::PassBuilder> pass_builder_;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/cow_ptr.h b/paddle/fluid/framework/details/cow_ptr.h
index 21f75957be5f33f3dfc09c41fa9a1e1ca590f99e..090517ff3c1822c2e62e61fad05d49e1c8db8573 100644
--- a/paddle/fluid/framework/details/cow_ptr.h
+++ b/paddle/fluid/framework/details/cow_ptr.h
@@ -20,79 +20,37 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-// Change it to thread safe flags if needed.
-class ThreadUnsafeOwnershipFlags {
+template <typename T>
+class COWPtr {
  public:
-  explicit ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {}
-
-  ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete;
-  ThreadUnsafeOwnershipFlags& operator=(
-      const ThreadUnsafeOwnershipFlags& other) = delete;
-  ThreadUnsafeOwnershipFlags(ThreadUnsafeOwnershipFlags&& other) = default;
-
-  void SetOwnership(bool flag) { flag_ = flag; }
-
-  // Invoke the callback if it is not owned.
-  template <typename Callback>
-  void AcquireOwnershipOnce(Callback acquire) {
-    if (!flag_) {
-      acquire();
-      flag_ = true;
-    }
-  }
+  typedef std::shared_ptr<T> RefPtr;
 
 private:
-  bool flag_;
-};
+  RefPtr m_sp;
 
-// Copy-On-Write pointer.
-// It will hold a T* pointer, and only copy once when `MutableData` is invoked.
-//
-// The template parameter OwnershipFlags should have:
-// * a constructor takes a bool. True if own.
-// * SetOwnership(bool flag).
-// * AcquireOwnershipOnce(Callback). It will invoke the callback if it is not
-// owned.
-//
-// https://en.wikipedia.org/wiki/Copy-on-write
-template <typename T, typename OwnershipFlags = ThreadUnsafeOwnershipFlags>
-class COWPtr {
 public:
-  // Ctor from raw pointer.
-  explicit COWPtr(T* ptr) : payload_(ptr), ownership_{true} {}
+  COWPtr() : m_sp(nullptr) {}
+  explicit COWPtr(T* t) : m_sp(t) {}
 
-  // Move methods. Steal ownership from origin
-  COWPtr(COWPtr&& other)
-      : payload_(other.payload_), ownership_{std::move(other.ownership_)} {}
-  COWPtr& operator=(COWPtr&& origin) = default;
+  const T& Data() const { return *m_sp; }
 
-  // Copy methods. Not own payload
-  COWPtr(const COWPtr& other) : payload_(other.payload_), ownership_{false} {}
-  COWPtr& operator=(const COWPtr& other) {
-    payload_ = other.payload_;
-    ownership_.SetOwnership(false);
-    return *this;
-  }
-
-  // Access read only data.
-  const T& Data() const { return *payload_; }
-
-  // Access mutable data. If the data is not owned, the data will be copied
-  // before.
   T* MutableData() {
-    ownership_.AcquireOwnershipOnce(
-        [this] { payload_.reset(new T(*payload_)); });
-    return payload_.get();
+    DetachIfNotUnique();
+    return m_sp.get();
  }
 
- private:
-  // Actual data pointer.
-  std::shared_ptr<T> payload_;
+  void DetachIfNotUnique() {
+    T* tmp = m_sp.get();
+    if (!(tmp == nullptr || m_sp.unique())) {
+      Detach();
+    }
+  }
 
-  // Ownership flag.
-  OwnershipFlags ownership_;
+  void Detach() {
+    T* tmp = m_sp.get();
+    m_sp = RefPtr(new T(*tmp));
+  }
 };
-
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/details/cow_ptr_test.cc b/paddle/fluid/framework/details/cow_ptr_test.cc
index d2142af277c0b356d83941b3baab1947cce31dac..5b055d7cb4d127dc20f2cf70869134f24a93d429 100644
--- a/paddle/fluid/framework/details/cow_ptr_test.cc
+++ b/paddle/fluid/framework/details/cow_ptr_test.cc
@@ -30,6 +30,14 @@ TEST(COWPtr, all) {
   ASSERT_EQ(ptr2.Data(), 10);
 }
 
+TEST(COWPtr, change_old) {
+  COWPtr<int> ptr(new int{0});
+  COWPtr<int> ptr2 = ptr;
+  *ptr.MutableData() = 10;
+  ASSERT_EQ(ptr2.Data(), 0);
+  ASSERT_EQ(ptr.Data(), 10);
+}
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index 4dca3ceb4569fb708c7a98621c5239acbe217586..a0bf1afd402c4e4eebe13cc3fc43f44f23dccaed 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -28,9 +28,9 @@ cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph grap
 pass_library(graph_to_program_pass base)
 pass_library(graph_viz_pass base)
 pass_library(fc_fuse_pass inference)
-if(WITH_MKLDNN)
-  pass_library(conv_relu_mkldnn_fuse_pass inference)
-endif()
+if (WITH_MKLDNN)
+  pass_library(conv_relu_mkldnn_fuse_pass inference)
+endif ()
 pass_library(attention_lstm_fuse_pass inference)
 pass_library(infer_clean_graph_pass inference)
 pass_library(fc_lstm_fuse_pass inference)
@@ -41,12 +41,14 @@ cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass
 
 set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library")
 
+cc_library(pass_builder SRCS pass_builder.cc DEPS pass)
+
 cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
 cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
 cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
 cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
 cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
 cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
-if(WITH_MKLDNN)
-  cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
-endif()
+if (WITH_MKLDNN)
+  cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
+endif ()
diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
index bb52d7e498e55c02ddc2cd6d07ccccd51ce4edc5..1c75cb5a82029b6a542a3a2f031a353f5e40f4ea 100644
--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
@@ -257,6 +257,22 @@ std::unique_ptr<ir::Graph> AttentionLSTMFusePass::ApplyImpl(
     std::unique_ptr<ir::Graph> graph) const {
   PDPattern external_pattern, subblock_pattern;
 
+  // Use the following variables to tell whether this model is RNN1.
+  // This fusion only works on the RNN1 model.
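+  // The check below counts how many of these names occur as variable
+  // nodes in the graph and skips the fusion unless all of them are found.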
+  std::unordered_set<std::string> specified_vars({"data_lod_attention",
+                                                  "cell_init", "hidden_init",
+                                                  "data", "week", "minute"});
+  int count = 0;
+  for (auto* node : graph->Nodes()) {
+    if (node->IsVar() && specified_vars.count(node->Name())) {
+      ++count;
+    }
+  }
+  if (count < specified_vars.size()) {
+    return graph;
+  }
+
+  // Continue to fuse.
   FindWhileOp(graph.get());
   return graph;
 }
diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
index aa95d3e9f6c8221f6e48d192b73ad5135539dc75..f5c286486520391906a6cd7545041c8a7df614ea 100644
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
@@ -77,10 +77,12 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
   const std::string BatchedCellPreAct =
       patterns::UniqueKey("BatchedCellPreAct");
   const std::string BatchedGate = patterns::UniqueKey("BatchedGate");
+  const std::string CheckedCell = patterns::UniqueKey("CheckedCell");
 
   scope->Var(BatchedInput)->GetMutable<framework::LoDTensor>();
   scope->Var(BatchedCellPreAct)->GetMutable<framework::LoDTensor>();
   scope->Var(BatchedGate)->GetMutable<framework::LoDTensor>();
+  scope->Var(CheckedCell)->GetMutable<framework::LoDTensor>();
 
   op_desc.SetInput("H0", {});
   op_desc.SetInput("C0", {});
@@ -90,6 +92,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
   op_desc.SetOutput("BatchedGate", {BatchedGate});
   op_desc.SetOutput("BatchCellPreAct", {BatchedCellPreAct});
   op_desc.SetOutput("BatchedInput", {BatchedInput});
+  op_desc.SetOutput("CheckedCell", {CheckedCell});
   op_desc.SetAttr("is_reverse", lstm->Op()->GetAttr("is_reverse"));
   op_desc.SetAttr("use_peepholes", lstm->Op()->GetAttr("use_peepholes"));
   // TODO(TJ): get from attr
diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc
index 62f94a1c0e5a300438bbe5fea34b9a07df5d9ebf..c54766d95a61ac1a4b61566c6de62cbc86685a1d 100644
--- a/paddle/fluid/framework/ir/graph_helper.cc
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include <algorithm>
+#include <deque>
 #include <unordered_set>
-
-#include "paddle/fluid/framework/ir/graph_helper.h"
-
 namespace paddle {
 namespace framework {
 namespace ir {
@@ -113,6 +113,74 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
   return adj_list;
 }
 
+size_t GraphNum(const Graph &graph) {
+  std::unordered_set<ir::Node *> nodes = graph.Nodes();
+  std::unordered_set<ir::Node *> visited_nodes;
+  visited_nodes.reserve(nodes.size());
+  std::deque<ir::Node *> q_nodes;
+  std::vector<std::unordered_set<ir::Node *>> graph_nodes;
+  std::unordered_set<ir::Node *> g_nodes;
+  size_t graph_count = 0;
+
+  auto traverse_nodes = [&visited_nodes,
+                         &q_nodes](const std::vector<ir::Node *> &nodes) {
+    std::copy_if(
+        nodes.begin(), nodes.end(), std::back_inserter(q_nodes),
+        [&visited_nodes](Node *node) { return !visited_nodes.count(node); });
+  };
+
+  while (visited_nodes.size() != nodes.size()) {
+    if (!q_nodes.empty()) {
+      auto cur_node = q_nodes.front();
+      q_nodes.pop_front();
+      visited_nodes.insert(cur_node);
+      g_nodes.insert(cur_node);
+      traverse_nodes(cur_node->inputs);
+      traverse_nodes(cur_node->outputs);
+    } else {
+      ++graph_count;
+      if (g_nodes.size()) {
+        graph_nodes.emplace_back(g_nodes);
+      }
+      g_nodes.clear();
+      for (auto &n : nodes) {
+        if (visited_nodes.count(n) == 0) {
+          q_nodes.push_back(n);
+          break;
+        }
+      }
+    }
+  }
+
+  if (g_nodes.size()) {
+    graph_nodes.emplace_back(g_nodes);
+  }
+
+  if (VLOG_IS_ON(10)) {
+    VLOG(10) << "graph_num: " << graph_nodes.size();
+    for (auto &g_n : graph_nodes) {
+      VLOG(10) << "graph_nodes: " << g_n.size();
+      if (g_n.size() < 10) {
+        std::stringstream out;
+        for (auto &node : g_n) {
+          out << "\nNode: " << node->Name() << " in [";
+          for (auto &n : node->inputs) {
+            out << n->Name() << ", ";
+          }
+          out << "], out[";
+          for (auto &n : node->outputs) {
+            out << n->Name() << ", ";
+          }
+          out << "]";
+        }
+        VLOG(10) << out.str();
+      }
+    }
+  }
+
+  return graph_count;
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h
index cd6c53a07f8f56781989739d995226bd02b3d3d0..ec46b38c01b8c369ab37b4fbd5497ec120d8db91 100644
--- a/paddle/fluid/framework/ir/graph_helper.h
+++ b/paddle/fluid/framework/ir/graph_helper.h
@@ -27,6 +27,8 @@ namespace ir {
 // Test if the graph contains circle.
 bool HasCircle(const Graph &graph);
 
+size_t GraphNum(const Graph &graph);
+
 // Topology Sort the operations in the graph from inputs to outputs.
 // `graph` cannot contain circle.
std::vector TopologySortOperations(const Graph &graph); diff --git a/paddle/fluid/framework/ir/graph_helper_test.cc b/paddle/fluid/framework/ir/graph_helper_test.cc index a260dd3da2a7863c06e51aa4feafd824ea254139..cea902809339f9d45b0e2525163f08a3c1c44c95 100644 --- a/paddle/fluid/framework/ir/graph_helper_test.cc +++ b/paddle/fluid/framework/ir/graph_helper_test.cc @@ -120,6 +120,97 @@ TEST(GraphHelperTest, Basic) { ASSERT_EQ(node_map.at("op2"), 1UL); ASSERT_TRUE(node_map.at("op3") < node_map.at("op5")); } + +void BuildZeroGraph(Graph* g) {} + +void BuildOneGraph(Graph* g) { + ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation); + ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation); + ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation); + ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation); + ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation); + ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable); + ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable); + ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable); + ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable); + + // o1->v1->o2 + o1->outputs.push_back(v1); + o2->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o2); + // o2->v2->o3 + // o2->v2->o4 + o2->outputs.push_back(v2); + o3->inputs.push_back(v2); + o4->inputs.push_back(v2); + v2->inputs.push_back(o2); + v2->outputs.push_back(o3); + v2->outputs.push_back(o4); + // o2->v3->o5 + o2->outputs.push_back(v3); + o5->inputs.push_back(v3); + v3->inputs.push_back(o2); + v3->outputs.push_back(o5); + // o3-v4->o5 + o3->outputs.push_back(v4); + o5->inputs.push_back(v4); + v4->inputs.push_back(o3); + v4->outputs.push_back(o5); +} + +void BuildTwoGraphs(Graph* g) { + ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation); + ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation); + ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation); + ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation); + ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation); + ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable); + ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable); + ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable); + ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable); + + // o1->v1->o2 + o1->outputs.push_back(v1); + o2->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o2); + // o2->v2->o3 + // o2->v2->o4 + o2->outputs.push_back(v2); + o3->inputs.push_back(v2); + o4->inputs.push_back(v2); + v2->inputs.push_back(o2); + v2->outputs.push_back(o3); + v2->outputs.push_back(o4); + // o2->v3->o5 + // o2->outputs.push_back(v3); + o5->inputs.push_back(v3); + // v3->inputs.push_back(o2); + v3->outputs.push_back(o5); + // o3-v4->o5 + o3->outputs.push_back(v4); + // o5->inputs.push_back(v4); + v4->inputs.push_back(o3); + // v4->outputs.push_back(o5); +} + +TEST(GraphHelperTest, GraphNum) { + ProgramDesc prog; + + Graph g(prog); + BuildZeroGraph(&g); + ASSERT_EQ(GraphNum(g), 0); + + Graph g2(prog); + BuildOneGraph(&g2); + ASSERT_EQ(GraphNum(g2), 1); + + Graph g3(prog); + BuildTwoGraphs(&g3); + ASSERT_EQ(GraphNum(g3), 2); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_traits.cc b/paddle/fluid/framework/ir/graph_traits.cc index 
8f548913e4e1d9d5bc5bdace8b92db9065cf3b5e..084a4ba2def87eaa8badb3ca2c39865c6e5cb981 100644
--- a/paddle/fluid/framework/ir/graph_traits.cc
+++ b/paddle/fluid/framework/ir/graph_traits.cc
@@ -14,6 +14,8 @@
 
 #include "paddle/fluid/framework/ir/graph_traits.h"
 
+#include
+
 namespace paddle {
 namespace framework {
 namespace ir {
diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc
index d7158eba62686be57499df697466797e4034ea8f..6cf405efe63d2bc284c4650771a747b27bb4a9f6 100644
--- a/paddle/fluid/framework/ir/pass.cc
+++ b/paddle/fluid/framework/ir/pass.cc
@@ -19,7 +19,6 @@ namespace paddle {
 namespace framework {
 namespace ir {
 std::unique_ptr<Graph> Pass::Apply(std::unique_ptr<Graph> graph) const {
-  PADDLE_ENFORCE(!applied_, "Pass can only Apply() once.");
   PADDLE_ENFORCE(graph.get(), "graph passed to Pass::Apply() cannot be empty.");
   for (const std::string& attr : required_pass_attrs_) {
     PADDLE_ENFORCE(attrs_.find(attr) != attrs_.end(),
diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h
index 0f14083d259172f5b5f1ed80c7d38312d711beb5..9570c59cff2a6afeb1c607f7219b7b455974d6ce 100644
--- a/paddle/fluid/framework/ir/pass.h
+++ b/paddle/fluid/framework/ir/pass.h
@@ -42,6 +42,8 @@ class Pass {
     attr_dels_.clear();
   }
 
+  std::string Type() const { return type_; }
+
   std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const;
 
   // Get a reference to the attribute previously set.
@@ -52,6 +54,21 @@ class Pass {
     return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
   }
 
+  bool Has(const std::string &attr_name) const {
+    return attrs_.find(attr_name) != attrs_.end();
+  }
+
+  void Erase(const std::string &attr_name) {
+    if (!Has(attr_name)) {
+      return;
+    }
+    if (attr_dels_.find(attr_name) != attr_dels_.end()) {
+      attr_dels_[attr_name]();
+      attr_dels_.erase(attr_name);
+    }
+    attrs_.erase(attr_name);
+  }
+
   // Set a pointer to the attribute. Pass takes ownership of the attribute.
   template <typename AttrType>
   void Set(const std::string &attr_name, AttrType *attr) {
@@ -68,13 +85,15 @@
   // should delete the attribute.
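  // (For contrast: Set() above is handed a heap object that the pass will
  // delete, while SetNotOwned() borrows one that must stay alive for as
  // long as the pass uses it.)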
  template <typename AttrType>
  void SetNotOwned(const std::string &attr_name, AttrType *attr) {
-    PADDLE_ENFORCE(attrs_.count(attr_name) == 0);
+    PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the pass",
+                   attr_name);
     attrs_[attr_name] = attr;
   }
 
 protected:
-  virtual std::unique_ptr<Graph> ApplyImpl(
-      std::unique_ptr<Graph> graph) const = 0;
+  virtual std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const {
+    LOG(FATAL) << "Calling virtual Pass not implemented.";
+  }
 
 private:
  template <typename PassType>
@@ -89,7 +108,10 @@
     required_graph_attrs_.insert(attrs.begin(), attrs.end());
   }
 
+  void RegisterType(const std::string &type) { type_ = type; }
+
   mutable bool applied_{false};
+  std::string type_;
   std::unordered_set<std::string> required_pass_attrs_;
   std::unordered_set<std::string> required_graph_attrs_;
   std::map<std::string, boost::any> attrs_;
@@ -143,10 +165,11 @@ struct PassRegistrar : public Registrar {
     PADDLE_ENFORCE(!PassRegistry::Instance().Has(pass_type),
                    "'%s' is registered more than once.", pass_type);
     PassRegistry::Instance().Insert(
-        pass_type, [this]() -> std::unique_ptr<Pass> {
+        pass_type, [this, pass_type]() -> std::unique_ptr<Pass> {
           std::unique_ptr<Pass> pass(new PassType());
          pass->RegisterRequiredPassAttrs(this->required_pass_attrs_);
          pass->RegisterRequiredGraphAttrs(this->required_graph_attrs_);
+          pass->RegisterType(pass_type);
          return pass;
        });
  }
diff --git a/paddle/fluid/framework/ir/pass_builder.cc b/paddle/fluid/framework/ir/pass_builder.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e0719867b34d13666672b22070ce14dbaf80d85d
--- /dev/null
+++ b/paddle/fluid/framework/ir/pass_builder.cc
@@ -0,0 +1,43 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/pass_builder.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+std::shared_ptr<Pass> PassBuilder::AppendPass(const std::string& pass_type) {
+  auto pass = ir::PassRegistry::Instance().Get(pass_type);
+  passes_.emplace_back(pass.release());
+  return passes_.back();
+}
+
+void PassBuilder::RemovePass(size_t idx) {
+  PADDLE_ENFORCE(passes_.size() > idx);
+  passes_.erase(passes_.begin() + idx);
+}
+
+std::shared_ptr<Pass> PassBuilder::InsertPass(size_t idx,
+                                              const std::string& pass_type) {
+  PADDLE_ENFORCE(passes_.size() >= idx);
+  std::shared_ptr<Pass> pass(
+      ir::PassRegistry::Instance().Get(pass_type).release());
+  passes_.insert(passes_.begin() + idx, std::move(pass));
+  return passes_[idx];
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/pass_builder.h b/paddle/fluid/framework/ir/pass_builder.h
new file mode 100644
index 0000000000000000000000000000000000000000..733d3a3ad1ab8989ea30fe45cd7e1ffe9432de13
--- /dev/null
+++ b/paddle/fluid/framework/ir/pass_builder.h
@@ -0,0 +1,49 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <vector>
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class PassBuilder {
+ public:
+  PassBuilder() {}
+
+  virtual ~PassBuilder() {}
+
+  // Append a new pass to the end.
+  std::shared_ptr<Pass> AppendPass(const std::string& pass_type);
+
+  // Insert a new pass at `idx`.
+  std::shared_ptr<Pass> InsertPass(size_t idx, const std::string& pass_type);
+
+  // Remove the pass at `idx`.
+  void RemovePass(size_t idx);
+
+  // Returns a list of all passes.
+  std::vector<std::shared_ptr<Pass>> AllPasses() const { return passes_; }
+
+ protected:
+  std::vector<std::shared_ptr<Pass>> passes_;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/pass_test.cc b/paddle/fluid/framework/ir/pass_test.cc
index 5b5011412ed39e033a7a65921e9c64ce2d54c638..6ad7d1df8bdd016b617c820c022ef55f23ba21cd 100644
--- a/paddle/fluid/framework/ir/pass_test.cc
+++ b/paddle/fluid/framework/ir/pass_test.cc
@@ -82,12 +82,10 @@ TEST(PassTest, TestPassAttrCheck) {
   ASSERT_EQ(graph->Get<int>("copy_test_pass_attr"), 2);
   ASSERT_EQ(graph->Get<int>("copy_test_graph_attr"), 2);
 
-  try {
-    graph = pass->Apply(std::move(graph));
-  } catch (paddle::platform::EnforceNotMet e) {
-    exception = std::string(e.what());
-  }
-  ASSERT_TRUE(exception.find("Pass can only Apply() once") != exception.npos);
+  // Allow apply more than once.
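+  // The pass object itself can be reused; only the graph and its
+  // attributes are rebuilt, because Apply() consumes the graph passed in.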
+ graph.reset(new Graph(prog)); + graph->Set("test_graph_attr", new int); + graph = pass->Apply(std::move(graph)); pass = PassRegistry::Instance().Get("test_pass"); pass->SetNotOwned("test_pass_attr", &val); diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index 7836ecb1272a07a79a70c9cb040335f9a42e5684..77386f4f069489b6ff7b927a281bdc286ff816e0 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -17,10 +17,13 @@ #include #include #include +#include // NOLINT +#include #include - +#include "paddle/fluid/framework/details/cow_ptr.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/memory/memcpy.h" #include "glog/logging.h" @@ -28,206 +31,436 @@ namespace paddle { namespace framework { #if defined(PADDLE_WITH_CUDA) +namespace details { +struct CUDABuffer { + void *data_{nullptr}; + size_t size_{0}; + platform::CUDAPlace place_; + + CUDABuffer() {} + CUDABuffer(platform::Place place, size_t size) + : size_(size), place_(boost::get(place)) { + data_ = memory::Alloc(place_, size); + } + + ~CUDABuffer() { ClearMemory(); } + + CUDABuffer(const CUDABuffer &o) = delete; + CUDABuffer &operator=(const CUDABuffer &o) = delete; + + void Resize(platform::Place place, size_t size) { + ClearMemory(); + place_ = boost::get(place); + data_ = memory::Alloc(place_, size); + PADDLE_ENFORCE_NOT_NULL(data_); + size_ = size; + } + + void Swap(CUDABuffer &o) { + std::swap(data_, o.data_); + std::swap(place_, o.place_); + std::swap(size_, o.size_); + } + + private: + void ClearMemory() const { + if (data_ != nullptr) { + memory::Free(place_, data_); + } + } +}; +} // namespace details + // Vector implements the std::vector interface, and can get Data or // MutableData from any place. The data will be synced implicitly inside. template class Vector { public: using value_type = T; + using iterator = typename std::vector::iterator; + using const_iterator = typename std::vector::const_iterator; - // Default ctor. Create empty Vector - Vector() { InitEmpty(); } + private: + // The actual class to implement vector logic + class VectorData { + public: + VectorData() : flag_(kDataInCPU) {} + VectorData(size_t count, const T &value) + : cpu_(count, value), flag_(kDataInCPU) {} + VectorData(std::initializer_list init) : cpu_(init), flag_(kDataInCPU) {} + template + explicit VectorData(const std::vector &dat) + : cpu_(dat), flag_(kDataInCPU) {} + ~VectorData() {} + + VectorData(const VectorData &o) { + o.ImmutableCPU(); + cpu_ = o.cpu_; + flag_ = kDataInCPU; + } - // Fill vector with value. The vector size is `count`. 
- explicit Vector(size_t count, const T &value = T()) { - InitEmpty(); - if (count != 0) { - resize(count); - T *ptr = begin(); - for (size_t i = 0; i < count; ++i) { - ptr[i] = value; + VectorData &operator=(const VectorData &o) { + o.ImmutableCPU(); + cpu_ = o.cpu_; + flag_ = kDataInCPU; + details::CUDABuffer null; + gpu_.Swap(null); + return *this; + } + + T &operator[](size_t i) { + MutableCPU(); + return cpu_[i]; + } + + const T &operator[](size_t i) const { + ImmutableCPU(); + return cpu_[i]; + } + + size_t size() const { return cpu_.size(); } + + iterator begin() { + MutableCPU(); + return cpu_.begin(); + } + + iterator end() { + MutableCPU(); + return cpu_.end(); + } + + T &front() { + MutableCPU(); + return cpu_.front(); + } + + T &back() { + MutableCPU(); + return cpu_.back(); + } + + const_iterator begin() const { + ImmutableCPU(); + return cpu_.begin(); + } + + const_iterator end() const { + ImmutableCPU(); + return cpu_.end(); + } + + const T &back() const { + ImmutableCPU(); + return cpu_.back(); + } + + T *data() { return &(*this)[0]; } + + const T *data() const { return &(*this)[0]; } + + const T &front() const { + ImmutableCPU(); + return cpu_.front(); + } + + // assign this from iterator. + // NOTE: the iterator must support `end-begin` + template + void assign(Iter begin, Iter end) { + MutableCPU(); + cpu_.assign(begin, end); + } + + // push_back. If the previous capacity is not enough, the memory will + // double. + void push_back(T elem) { + MutableCPU(); + cpu_.push_back(elem); + } + + // extend a vector by iterator. + // NOTE: the iterator must support end-begin + template + void Extend(It begin, It end) { + MutableCPU(); + auto out_it = std::back_inserter>(this->cpu_); + std::copy(begin, end, out_it); + } + + // resize the vector + void resize(size_t size) { + MutableCPU(); + cpu_.resize(size); + } + + // get cuda ptr. immutable + const T *CUDAData(platform::Place place) const { + PADDLE_ENFORCE(platform::is_gpu_place(place), + "CUDA Data must on CUDA place"); + ImmutableCUDA(place); + return reinterpret_cast(gpu_.data_); + } + + // get cuda ptr. mutable + T *CUDAMutableData(platform::Place place) { + const T *ptr = CUDAData(place); + flag_ = kDirty | kDataInCUDA; + return const_cast(ptr); + } + + // clear + void clear() { + cpu_.clear(); + flag_ = kDirty | kDataInCPU; + } + + size_t capacity() const { return cpu_.capacity(); } + + // reserve data + void reserve(size_t size) const { cpu_.reserve(size); } + + // implicit cast operator. Vector can be cast to std::vector implicitly. + operator std::vector() const { + ImmutableCPU(); + return cpu_; + } + + bool operator==(const VectorData &other) const { + ImmutableCPU(); + other.ImmutableCPU(); + return cpu_ == other.cpu_; + } + + std::mutex &Mutex() const { return mtx_; } + + std::unique_ptr CUDAPlace() const { + if (gpu_.data_ == nullptr) { + return nullptr; + } else { + return std::unique_ptr( + new platform::CUDAPlace(gpu_.place_)); } } - } - // Ctor with init_list - Vector(std::initializer_list init) { - if (init.size() == 0) { - InitEmpty(); - } else { - InitByIter(init.size(), init.begin(), init.end()); + private: + enum DataFlag { + kDataInCPU = 0x01, + kDataInCUDA = 0x02, + // kDirty means the data has been changed in one device. 
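+      // Flags combine bitwise: e.g. (kDirty | kDataInCUDA) means the CUDA
+      // buffer holds the freshest copy and the CPU vector is stale until
+      // CopyToCPU() runs.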
+ kDirty = 0x10 + }; + + void CopyToCPU() const { + // COPY GPU Data To CPU + auto *dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get( + platform::Place(gpu_.place_))); + auto stream = dev_ctx->stream(); + void *src = gpu_.data_; + void *dst = cpu_.data(); + memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_, + stream); + dev_ctx->Wait(); + } + + void MutableCPU() { + if (IsInCUDA() && IsDirty()) { + CopyToCPU(); + } + flag_ = kDirty | kDataInCPU; } - } + + void ImmutableCUDA(platform::Place place) const { + if (IsDirty()) { + if (IsInCPU()) { + CopyCPUDataToCUDA(place); + UnsetFlag(kDirty); + SetFlag(kDataInCUDA); + } else if (IsInCUDA() && + !(boost::get(place) == gpu_.place_)) { + PADDLE_THROW("This situation should not happen"); + // Still dirty + } else { + // Dirty && DataInCUDA && Device is same + // Do nothing + } + } else { + if (!IsInCUDA()) { + // Even data is not dirty. However, data is not in CUDA. Copy data. + CopyCPUDataToCUDA(place); + SetFlag(kDataInCUDA); + } else if (!(boost::get(place) == gpu_.place_)) { + PADDLE_THROW("This situation should not happen."); + } else { + // Not Dirty && DataInCUDA && Device is same + // Do nothing. + } + } + } + + void CopyCPUDataToCUDA(const platform::Place &place) const { + void *src = cpu_.data(); + gpu_.Resize(place, cpu_.size() * sizeof(T)); + void *dst = gpu_.data_; + auto *dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + auto stream = dev_ctx->stream(); + memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_, + stream); + } + + void ImmutableCPU() const { + if (IsDirty() && !IsInCPU()) { // If data has been changed in CUDA, or + // CPU has no data. + CopyToCPU(); + UnsetFlag(kDirty); + } + SetFlag(kDataInCPU); + } + + void UnsetFlag(int flag) const { flag_ &= ~flag; } + void SetFlag(int flag) const { flag_ |= flag; } + + bool IsDirty() const { return flag_ & kDirty; } + + bool IsInCUDA() const { return flag_ & kDataInCUDA; } + + bool IsInCPU() const { return flag_ & kDataInCPU; } + + mutable std::vector cpu_; + mutable details::CUDABuffer gpu_; + mutable int flag_; + + mutable std::mutex mtx_; + }; + + public: + // Default ctor. Create empty Vector + Vector() : m_(new VectorData()) {} + + // Fill vector with value. The vector size is `count`. + explicit Vector(size_t count, const T &value = T()) + : m_(new VectorData(count, value)) {} + + // Ctor with init_list + Vector(std::initializer_list init) : m_(new VectorData(init)) {} // implicit cast from std::vector. template - Vector(const std::vector &dat) { // NOLINT - if (dat.size() == 0) { - InitEmpty(); - } else { - InitByIter(dat.size(), dat.begin(), dat.end()); - } + Vector(const std::vector &dat) : m_(new VectorData(dat)) { // NOLINT } // Copy ctor - Vector(const Vector &other) { this->operator=(other); } + Vector(const Vector &other) { m_ = other.m_; } // Copy operator Vector &operator=(const Vector &other) { - if (other.size() != 0) { - this->InitByIter(other.size(), other.begin(), other.end()); - } else { - InitEmpty(); - } + m_ = other.m_; return *this; } // Move ctor - Vector(Vector &&other) { - this->size_ = other.size_; - this->flag_ = other.flag_; - if (other.cuda_vec_.memory_size()) { - this->cuda_vec_.ShareDataWith(other.cuda_vec_); - } - if (other.cpu_vec_.memory_size()) { - this->cpu_vec_.ShareDataWith(other.cpu_vec_); - } - } + Vector(Vector &&other) { m_ = std::move(other.m_); } // CPU data access method. Mutable. 
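  // (Mutable access below goes through COWPtr::MutableData(), which
  // detaches any shared VectorData before the write, while const access
  // reads the shared VectorData in place.)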
   // CPU data access method. Mutable.
-  T &operator[](size_t i) {
-    MutableCPU();
-    return const_cast<T *>(cpu_vec_.data<T>())[i];
-  }
+  T &operator[](size_t i) { return (*m_.MutableData())[i]; }
 
   // CPU data access method. Immutable.
-  const T &operator[](size_t i) const {
-    ImmutableCPU();
-    return cpu_vec_.data<T>()[i];
-  }
+  const T &operator[](size_t i) const { return m_.Data()[i]; }
 
   // std::vector iterator methods. Based on CPU data access method
-  size_t size() const { return size_; }
+  size_t size() const { return m_.Data().size(); }
 
-  T *begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }
+  iterator begin() { return m_.MutableData()->begin(); }
 
-  T *end() {
-    return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
-  }
+  iterator end() { return m_.MutableData()->end(); }
 
-  T &front() { return *begin(); }
+  T &front() { return m_.MutableData()->front(); }
 
-  T &back() {
-    auto it = end();
-    --it;
-    return *it;
-  }
+  T &back() { return m_.MutableData()->back(); }
 
-  const T *begin() const {
-    return capacity() == 0 ? &EmptyDummy() : &this->operator[](0);
-  }
+  const_iterator begin() const { return m_.Data().begin(); }
 
-  const T *end() const {
-    return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
-  }
+  const_iterator end() const { return m_.Data().end(); }
 
-  const T *cbegin() const { return begin(); }
+  const_iterator cbegin() const { return begin(); }
 
-  const T *cend() const { return end(); }
+  const_iterator cend() const { return end(); }
 
-  const T &back() const {
-    auto it = end();
-    --it;
-    return *it;
-  }
+  const T &back() const { return m_.Data().back(); }
 
-  T *data() { return begin(); }
+  T *data() { return m_.MutableData()->data(); }
 
-  const T *data() const { return begin(); }
+  const T *data() const { return m_.Data().data(); }
 
-  const T &front() const { return *begin(); }
+  const T &front() const { return m_.Data().front(); }
   // end of std::vector iterator methods
 
   // assign this from iterator.
   // NOTE: the iterator must support `end-begin`
   template <typename Iter>
   void assign(Iter begin, Iter end) {
-    InitByIter(end - begin, begin, end);
+    m_.MutableData()->assign(begin, end);
   }
 
   // push_back. If the previous capacity is not enough, the memory will
   // double.
-  void push_back(T elem) {
-    if (size_ + 1 > capacity()) {
-      reserve((size_ + 1) << 1);
-    }
-    *end() = elem;
-    ++size_;
-  }
+  void push_back(T elem) { m_.MutableData()->push_back(elem); }
 
   // extend a vector by iterator.
   // NOTE: the iterator must support end-begin
   template <typename It>
   void Extend(It begin, It end) {
-    size_t pre_size = size_;
-    resize(pre_size + (end - begin));
-    T *ptr = this->begin() + pre_size;
-    for (; begin < end; ++begin, ++ptr) {
-      *ptr = *begin;
-    }
+    m_.MutableData()->Extend(begin, end);
   }
 
   // resize the vector
   void resize(size_t size) {
-    if (size + 1 <= capacity()) {
-      size_ = size;
-    } else {
-      MutableCPU();
-      Tensor cpu_tensor;
-      platform::Place cpu = platform::CPUPlace();
-      T *ptr = cpu_tensor.mutable_data<T>(
-          framework::make_ddim({static_cast<int64_t>(size)}), cpu);
-      const T *old_ptr =
-          cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data<T>();
-      if (old_ptr != nullptr) {
-        std::copy(old_ptr, old_ptr + size_, ptr);
-      }
-      size_ = size;
-      cpu_vec_.ShareDataWith(cpu_tensor);
+    if (m_.Data().size() != size) {
+      m_.MutableData()->resize(size);
     }
   }
 
   // get cuda ptr. immutable
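+  // The CUDA accessors below double-check the cached device place under the
+  // VectorData mutex; a request for a different place detaches the COW
+  // pointer so each copy keeps its own device buffer.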
   const T *CUDAData(platform::Place place) const {
-    PADDLE_ENFORCE(platform::is_gpu_place(place),
-                   "CUDA Data must on CUDA place");
-    ImmutableCUDA(place);
-    return cuda_vec_.data<T>();
+    {
+      auto &mtx = m_.Data().Mutex();
+      std::lock_guard<std::mutex> guard(mtx);
+      auto cuda_place = m_.Data().CUDAPlace();
+      if (cuda_place == nullptr ||
+          *cuda_place == boost::get<platform::CUDAPlace>(place)) {
+        return m_.Data().CUDAData(place);
+      }
+    }
+    // If m_ contains CUDAData in a different place, detach manually.
+    m_.Detach();
+    return CUDAData(place);
   }
 
   // get cuda ptr. mutable
   T *CUDAMutableData(platform::Place place) {
-    const T *ptr = CUDAData(place);
-    flag_ = kDirty | kDataInCUDA;
-    return const_cast<T *>(ptr);
+    {
+      auto &mtx = m_.Data().Mutex();
+      std::lock_guard<std::mutex> guard(mtx);
+      auto cuda_place = m_.Data().CUDAPlace();
+      if (cuda_place == nullptr ||
+          *cuda_place == boost::get<platform::CUDAPlace>(place)) {
+        return m_.MutableData()->CUDAMutableData(place);
+      }
+    }
+    // If m_ contains CUDAData in a different place, detach manually.
+    m_.Detach();
+    return CUDAMutableData(place);
   }
 
   // clear
-  void clear() {
-    size_ = 0;
-    flag_ = kDirty | kDataInCPU;
-  }
+  void clear() { m_.MutableData()->clear(); }
 
-  size_t capacity() const {
-    return cpu_vec_.memory_size() / SizeOfType(typeid(T));
-  }
+  size_t capacity() const { return m_.Data().capacity(); }
 
   // reserve data
-  void reserve(size_t size) {
-    size_t pre_size = size_;
-    resize(size);
-    resize(pre_size);
-  }
+  void reserve(size_t size) { m_.Data().reserve(size); }
 
   // the unify method to access CPU or CUDA data. immutable.
   const T *Data(platform::Place place) const {
@@ -248,12 +481,7 @@ class Vector {
   }
 
   // implicit cast operator. Vector can be cast to std::vector implicitly.
-  operator std::vector<T>() const {
-    std::vector<T> result;
-    result.resize(size());
-    std::copy(begin(), end(), result.begin());
-    return result;
-  }
+  operator std::vector<T>() const { return m_.Data(); }
 
   bool operator==(const Vector<T> &other) const {
     if (size() != other.size()) return false;
@@ -267,118 +495,11 @@ class Vector {
     return true;
   }
 
- private:
-  void InitEmpty() {
-    size_ = 0;
-    flag_ = kDataInCPU;
-  }
-
-  template <typename Iter>
-  void InitByIter(size_t size, Iter begin, Iter end) {
-    platform::Place cpu = platform::CPUPlace();
-    T *ptr = this->cpu_vec_.template mutable_data<T>(
-        framework::make_ddim({static_cast<int64_t>(size)}), cpu);
-    for (size_t i = 0; i < size; ++i) {
-      *ptr++ = *begin++;
-    }
-    flag_ = kDataInCPU | kDirty;
-    size_ = size;
-  }
-
-  enum DataFlag {
-    kDataInCPU = 0x01,
-    kDataInCUDA = 0x02,
-    // kDirty means the data has been changed in one device.
-    kDirty = 0x10
-  };
-
-  void CopyToCPU() const {
-    // COPY GPU Data To CPU
-    TensorCopy(cuda_vec_, platform::CPUPlace(), &cpu_vec_);
-    WaitPlace(cuda_vec_.place());
-  }
-
-  void MutableCPU() {
-    if (IsInCUDA() && IsDirty()) {
-      CopyToCPU();
-    }
-    flag_ = kDirty | kDataInCPU;
-  }
-
-  void ImmutableCUDA(platform::Place place) const {
-    if (IsDirty()) {
-      if (IsInCPU()) {
-        TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
-                   &cuda_vec_);
-        WaitPlace(place);
-        UnsetFlag(kDirty);
-        SetFlag(kDataInCUDA);
-      } else if (IsInCUDA() && !(place == cuda_vec_.place())) {
-        framework::Tensor tmp;
-        TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
-        WaitPlace(cuda_vec_.place());
-        cuda_vec_.ShareDataWith(tmp);
-        // Still dirty
-      } else {
-        // Dirty && DataInCUDA && Device is same
-        // Do nothing
-      }
-    } else {
-      if (!IsInCUDA()) {
-        // Even data is not dirty. However, data is not in CUDA. Copy data.
-        TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
-                   &cuda_vec_);
-        WaitPlace(place);
-        SetFlag(kDataInCUDA);
-      } else if (!(place == cuda_vec_.place())) {
-        framework::Tensor tmp;
-        WaitPlace(cuda_vec_.place());
-        TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
-        WaitPlace(cuda_vec_.place());
-        WaitPlace(place);
-        cuda_vec_.ShareDataWith(tmp);
-      } else {
-        // Not Dirty && DataInCUDA && Device is same
-        // Do nothing.
-      }
-    }
-  }
-
-  void ImmutableCPU() const {
-    if (IsDirty() &&
-        !IsInCPU()) {  // If data has been changed in CUDA, or CPU has no data.
-      CopyToCPU();
-      UnsetFlag(kDirty);
-    }
-    SetFlag(kDataInCPU);
-  }
-
-  void UnsetFlag(int flag) const { flag_ &= ~flag; }
-  void SetFlag(int flag) const { flag_ |= flag; }
+  const void *Handle() const { return &m_.Data(); }
 
-  bool IsDirty() const { return flag_ & kDirty; }
-
-  bool IsInCUDA() const { return flag_ & kDataInCUDA; }
-
-  bool IsInCPU() const { return flag_ & kDataInCPU; }
-
-  static void WaitPlace(const platform::Place place) {
-    if (platform::is_gpu_place(place)) {
-      platform::DeviceContextPool::Instance()
-          .Get(boost::get<platform::CUDAPlace>(place))
-          ->Wait();
-    }
-  }
-
-  static T &EmptyDummy() {
-    static T dummy = T();
-    return dummy;
-  }
-
-  mutable int flag_;
-  mutable Tensor cpu_vec_;
-  mutable Tensor cuda_vec_;
-  size_t size_;
+ private:
+  // Vector is a COW object.
+  mutable details::COWPtr<VectorData> m_;
 };
 
 #else  // PADDLE_WITH_CUDA
diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f681d4ecef9efe2b51c7154787230e8be2fb2702
--- /dev/null
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -0,0 +1,150 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/naive_executor.h"
+#include "paddle/fluid/framework/channel.h"
+#include "paddle/fluid/framework/feed_fetch_method.h"
+#include "paddle/fluid/framework/lod_rank_table.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/reader.h"
+#include "paddle/fluid/string/pretty_log.h"
+
+namespace paddle {
+namespace framework {
+
+// This code can be shared with Executor.
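+// InitializeVariable maps each proto::VarType to the concrete container the
+// operators expect; RAW intentionally defers construction to the operator.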
+static void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
+  if (var_type == proto::VarType::LOD_TENSOR) {
+    var->GetMutable<LoDTensor>();
+  } else if (var_type == proto::VarType::SELECTED_ROWS) {
+    var->GetMutable<SelectedRows>();
+  } else if (var_type == proto::VarType::FEED_MINIBATCH) {
+    var->GetMutable<FeedFetchList>();
+  } else if (var_type == proto::VarType::FETCH_LIST) {
+    var->GetMutable<FeedFetchList>();
+  } else if (var_type == proto::VarType::STEP_SCOPES) {
+    var->GetMutable<std::vector<framework::Scope>>();
+  } else if (var_type == proto::VarType::LOD_RANK_TABLE) {
+    var->GetMutable<LoDRankTable>();
+  } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
+    var->GetMutable<LoDTensorArray>();
+  } else if (var_type == proto::VarType::PLACE_LIST) {
+    var->GetMutable<platform::PlaceList>();
+  } else if (var_type == proto::VarType::READER) {
+    var->GetMutable<ReaderHolder>();
+  } else if (var_type == proto::VarType::CHANNEL) {
+    var->GetMutable<ChannelHolder>();
+  } else if (var_type == proto::VarType::RAW) {
+    // GetMutable will be called in operator
+  } else {
+    PADDLE_THROW(
+        "Variable type %d is not in "
+        "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
+        "LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]",
+        var_type);
+  }
+}
+
+void NaiveExecutor::Prepare(Scope *parent_scope,
+                            const ProgramDesc &program_desc, int block_id,
+                            bool with_feed_fetch_ops) {
+  if (!parent_scope) {
+    scope_ = new framework::Scope;
+  } else {
+    scope_ = &parent_scope->NewScope();
+  }
+  CreateVariables(program_desc, scope_, block_id);
+  CreateOps(program_desc, block_id, with_feed_fetch_ops);
+}
+
+void NaiveExecutor::Run() {
+  for (auto &op : ops_) {
+    VLOG(4) << "run " << op->Type();
+    op->Run(*scope_, place_);
+  }
+}
+
+void NaiveExecutor::CreateVariables(const ProgramDesc &desc, Scope *scope,
+                                    int block_id) {
+  PADDLE_ENFORCE(scope);
+  auto &global_block = desc.Block(block_id);
+
+  const Scope *ancestor_scope = scope;
+  while (ancestor_scope->parent()) {
+    ancestor_scope = ancestor_scope->parent();
+  }
+
+  if (ancestor_scope != scope) {
+    for (auto &var : global_block.AllVars()) {
+      if (var->Name() == framework::kEmptyVarName) {
+        continue;
+      }
+      // Create persistable vars in ancestor scope.
+      if (var->Persistable()) {
+        auto *ptr = const_cast<Scope *>(ancestor_scope)->Var(var->Name());
+        InitializeVariable(ptr, var->GetType());
+        VLOG(3) << "Create Variable " << var->Name()
+                << " global, which pointer is " << ptr;
+      } else {  // Create temporary variables in local scope.
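+        // Temporaries live only in the local (child) scope, while the
+        // persistable branch above keeps parameters in the ancestor scope so
+        // they can be shared across executors.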
+        auto *ptr = scope->Var(var->Name());
+        InitializeVariable(ptr, var->GetType());
+        VLOG(3) << "Create Variable " << var->Name()
+                << " locally, which pointer is " << ptr;
+      }
+    }
+  } else {
+    for (auto &var : global_block.AllVars()) {
+      auto *ptr = scope->Var(var->Name());
+      InitializeVariable(ptr, var->GetType());
+      VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
+              << ptr;
+    }
+  }
+}
+
+void NaiveExecutor::CreateOps(const ProgramDesc &desc, int block_id,
+                              bool with_feed_fetch_ops) {
+  for (const auto &op_desc : desc.Block(block_id).AllOps()) {
+    if (!with_feed_fetch_ops &&
+        (op_desc->Type() == "feed" || op_desc->Type() == "fetch")) {
+      string::PrettyLogEndl(string::Style::detail(), "--- skip [%s], %s -> %s",
+                            op_desc->Input("X")[0], op_desc->Type(),
+                            op_desc->Output("Out")[0]);
+      continue;
+    }
+    ops_.emplace_back(OpRegistry::CreateOp(*op_desc));
+  }
+}
+
+LoDTensor *NaiveExecutor::FindTensor(const std::string &name) {
+  PADDLE_ENFORCE(scope_, "Need to init scope first");
+  auto *var = scope_->FindVar(name);
+  PADDLE_ENFORCE(var, "No variable [%s] in the scope", name);
+  auto *tensor = const_cast<LoDTensor *>(&var->Get<LoDTensor>());
+  return tensor;
+}
+
+void NaiveExecutor::CleanFeedFetchOps() {
+  std::vector<std::unique_ptr<OperatorBase>> ops;
+  for (auto &op : ops_) {
+    if (op->Type() != "feed" && op->Type() != "fetch") {
+      ops.emplace_back(std::move(op));
+    }
+  }
+  ops_.swap(ops);
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h
new file mode 100644
index 0000000000000000000000000000000000000000..9355e9e36a6358aa91553dca35aaf1b658516a0a
--- /dev/null
+++ b/paddle/fluid/framework/naive_executor.h
@@ -0,0 +1,63 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+
+/*
+ * Simple, intuitive and effective. Only single thread is supported, and
+ * currently designed for inference.
+ */
+class NaiveExecutor {
+ public:
+  explicit NaiveExecutor(const platform::Place& place) : place_(place) {}
+
+  // Create child scope.
+  // Create variables.
+  // @with_feed_fetch_ops: whether to work with the feed and fetch operators.
+  void Prepare(Scope* parent_scope, const ProgramDesc& program_desc,
+               int block_id, bool with_feed_fetch_ops);
+
+  // Run all the operators.
+  void Run();
+
+  // Get a tensor to operate on directly, without the need for feed_ops.
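+  // A hedged usage sketch (assuming an already-built ProgramDesc `program`):
+  //   NaiveExecutor exe(platform::CPUPlace());
+  //   exe.Prepare(nullptr, program, 0, /*with_feed_fetch_ops=*/false);
+  //   exe.FindTensor("x")->mutable_data<float>(platform::CPUPlace());
+  //   exe.Run();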
+  LoDTensor* FindTensor(const std::string& name);
+
+  Scope* scope() { return scope_; }
+
+  void CleanFeedFetchOps();
+
+ protected:
+  void CreateVariables(const ProgramDesc& desc, Scope* scope, int block_id);
+
+  void CreateOps(const ProgramDesc& desc, int block_id,
+                 bool with_feed_fetch_ops);
+
+ private:
+  const platform::Place place_;
+  // Catch the required resource to avoid recreate.
+  std::vector<std::unique_ptr<OperatorBase>> ops_;
+  Scope* scope_;
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/naive_executor_test.cc b/paddle/fluid/framework/naive_executor_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6b9f79b9d398bc5a0ee6ba66587924daad0dbbc5
--- /dev/null
+++ b/paddle/fluid/framework/naive_executor_test.cc
@@ -0,0 +1,70 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/naive_executor.h"
+#include <gtest/gtest.h>
+#include <algorithm>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/program_desc.h"
+
+namespace paddle {
+namespace framework {
+
+TEST(NaiveExecutor, Basic) {
+  ProgramDesc program;
+  auto* main_block = program.MutableBlock(0);
+  auto* a = main_block->Var("a");  // input
+  auto* b = main_block->Var("b");  // input
+  auto* c = main_block->Var("c");  // output
+  a->SetType(proto::VarType::LOD_TENSOR);
+  b->SetType(proto::VarType::LOD_TENSOR);
+  c->SetType(proto::VarType::LOD_TENSOR);
+
+  auto* add = main_block->AppendOp();
+  add->SetType("elementwise_add");
+  add->SetInput("X", {"a"});
+  add->SetInput("Y", {"b"});
+  add->SetOutput("Out", {"c"});
+
+  auto place = platform::CPUPlace();
+  NaiveExecutor exe(place);
+  exe.Prepare(nullptr, program, 0, false /*with feed fetch ops*/);
+  auto* a_tensor = exe.FindTensor("a");
+  auto* b_tensor = exe.FindTensor("b");
+  auto* c_tensor = exe.FindTensor("c");
+
+  a_tensor->Resize({1, 4});
+  b_tensor->Resize({1, 4});
+  c_tensor->Resize({1, 4});
+  b_tensor->mutable_data<float>(place);
+  a_tensor->mutable_data<float>(place);
+
+  float a_arr[] = {0, 1, 2, 3};
+  float b_arr[] = {0.0, .1, .2, .3};
+
+  std::copy_n(a_arr, 4, a_tensor->mutable_data<float>(place));
+  std::copy_n(b_arr, 4, b_tensor->mutable_data<float>(place));
+
+  exe.Run();
+
+  auto* c_data = c_tensor->mutable_data<float>(place);
+  for (int i = 0; i < 4; i++) {
+    EXPECT_NEAR(c_data[i], 1.1 * i, 1e-3);
+  }
+}
+
+}  // namespace framework
+}  // namespace paddle
+
+USE_OP(elementwise_add);
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index e800cb9993ddde45de7c33b11994359e77710daf..96624e33c6323dee7b6534673278b6b1b6343ae0 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -154,9 +154,15 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
     platform::SetDeviceId(dev_id);
 #endif
   }
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  platform::RecordEvent record_event(Type(), pool.Get(place));
+
+  if (platform::IsProfileEnabled()) {
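+    // Profiling events are only recorded when the profiler has been enabled,
+    // so the common path pays no RecordEvent cost.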
+ platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + platform::RecordEvent record_event(Type(), pool.Get(place)); + } + RunImpl(scope, place); + if (VLOG_IS_ON(3)) { VLOG(3) << place << " " << DebugStringEx(&scope); } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index f5a54c0f48c98512dcb393d63a61f3c927e7ac1f..720d17a654bf96ca2bad43cc0c4374b2303ac233 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -13,21 +13,19 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/parallel_executor.h" - #include #include #include +#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_viz_pass.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/nccl_helper.h" #endif #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" -#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h" -#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/platform/profiler.h" @@ -35,80 +33,6 @@ limitations under the License. */ namespace paddle { namespace framework { -std::unique_ptr ApplyParallelExecutorPass( - const ProgramDesc &main_program, const std::vector &places, - const std::string &loss_var_name, - const std::unordered_set ¶m_names, - const std::vector &local_scopes, const bool use_cuda, -#ifdef PADDLE_WITH_CUDA - const BuildStrategy &strategy, platform::NCCLContextMap *nccl_ctxs) { -#else - const BuildStrategy &strategy) { -#endif - // Convert the program to graph. - std::unique_ptr graph(new ir::Graph(main_program)); - - // Apply a graph viz pass to record a graph. - if (!strategy.debug_graphviz_path_.empty()) { - auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass"); - const std::string graph_path = string::Sprintf( - "%s%s", strategy.debug_graphviz_path_.c_str(), "_original_graph"); - viz_pass->Set("graph_viz_path", new std::string(graph_path)); - graph = viz_pass->Apply(std::move(graph)); - } - - // Apply op fusion. - if (strategy.fuse_elewise_add_act_ops_) { - auto fuse_elewise_add_act_pass = - ir::PassRegistry::Instance().Get("fuse_elewise_add_act_pass"); - graph = fuse_elewise_add_act_pass->Apply(std::move(graph)); - // Apply a graph viz pass to record a graph. - if (!strategy.debug_graphviz_path_.empty()) { - auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass"); - const std::string graph_path = string::Sprintf( - "%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph"); - viz_pass->Set("graph_viz_path", new std::string(graph_path)); - graph = viz_pass->Apply(std::move(graph)); - } - } - - // Convert graph to run on multi-devices. 
- auto multi_devices_pass = - ir::PassRegistry::Instance().Get("multi_devices_pass"); - multi_devices_pass->SetNotOwned>("places", - &places); - multi_devices_pass->SetNotOwned("loss_var_name", - &loss_var_name); - multi_devices_pass->SetNotOwned>( - "params", ¶m_names); - multi_devices_pass->SetNotOwned>("local_scopes", - &local_scopes); - multi_devices_pass->SetNotOwned("strategy", &strategy); - -#ifdef PADDLE_WITH_CUDA - platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr; - multi_devices_pass->SetNotOwned("nccl_ctxs", nctx); -#endif - graph = multi_devices_pass->Apply(std::move(graph)); - - // Apply a graph print pass to record a graph with device info. - if (!strategy.debug_graphviz_path_.empty()) { - auto multi_devices_print_pass = - ir::PassRegistry::Instance().Get("multi_devices_print_pass"); - multi_devices_print_pass->SetNotOwned( - "debug_graphviz_path", &strategy.debug_graphviz_path_); - multi_devices_print_pass->Set( - "graph_printer", new details::GraphvizSSAGraphPrinter); - graph = multi_devices_print_pass->Apply(std::move(graph)); - } - - // Verify that the graph is correct for multi-device executor. - auto multi_devices_check_pass = - ir::PassRegistry::Instance().Get("multi_devices_check_pass"); - graph = multi_devices_check_pass->Apply(std::move(graph)); - return graph; -} - class ParallelExecutorPrivate { public: explicit ParallelExecutorPrivate(const std::vector &places) @@ -199,10 +123,9 @@ ParallelExecutor::ParallelExecutor( // Step 3. Convert main_program to SSA form and dependency graph. Also, insert // ncclOp #ifdef PADDLE_WITH_CUDA - std::unique_ptr graph = ApplyParallelExecutorPass( + std::unique_ptr graph = build_strategy.Apply( main_program, member_->places_, loss_var_name, params, - member_->local_scopes_, member_->use_cuda_, build_strategy, - member_->nccl_ctxs_.get()); + member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get()); auto max_memory_size = GetEagerDeletionThreshold(); if (max_memory_size >= 0) { @@ -228,11 +151,17 @@ ParallelExecutor::ParallelExecutor( } } #else - std::unique_ptr graph = ApplyParallelExecutorPass( - main_program, member_->places_, loss_var_name, params, - member_->local_scopes_, member_->use_cuda_, build_strategy); + std::unique_ptr graph = + build_strategy.Apply(main_program, member_->places_, loss_var_name, + params, member_->local_scopes_, member_->use_cuda_); #endif + // If the loss_var_name is given, the number of graph should be only one. + if (loss_var_name.size()) { + PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1, + "The number of graph should be only one"); + } + if (exec_strategy.type_ == ExecutionStrategy::kDefault) { member_->executor_.reset(new details::ThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, places, std::move(graph))); @@ -373,12 +302,6 @@ ParallelExecutor::~ParallelExecutor() { } // namespace framework } // namespace paddle - -USE_PASS(fuse_elewise_add_act_pass); -USE_PASS(graph_viz_pass); -USE_PASS(multi_devices_pass); -USE_PASS(multi_devices_check_pass); -USE_PASS(multi_devices_print_pass); #ifdef PADDLE_WITH_CUDA USE_PASS(reference_count_pass); #endif diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index c64906ff230df5f2b7cc9f5c6b29d68956ab8f33..fd386a5987f11ff64964e95eb7e9b83572dc790c 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -14,14 +14,14 @@ limitations under the License. 
*/ #pragma once -#include #include #include #include #include #include + +#include "paddle/fluid/framework/details/build_strategy.h" #include "paddle/fluid/framework/details/execution_strategy.h" -#include "paddle/fluid/framework/details/multi_devices_graph_pass.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/program_desc.h" diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 1a727a2c8c759d010606d5b605823b7252b35c69..40dee143f5d8f64a44bc2469bd5f38b89338ea5d 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -20,6 +20,13 @@ limitations under the License. */ #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/string/printf.h" +// The mutex is not needed by training and inference, only for distribution. +#if PADDLE_WITH_DISTRIBUTE +#define WITH_LOCK 1 +#else +#define WITH_LOCK 0 +#endif + DEFINE_bool(benchmark, false, "Doing memory benchmark. It will make deleting scope synchronized, " "and add some memory usage logs." @@ -49,18 +56,24 @@ int64_t GetEagerDeletionThreshold() { Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif kids_.push_back(new Scope(this)); return *kids_.back(); } Variable* Scope::Var(const std::string& name) { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif return VarInternal(name); } Variable* Scope::Var(std::string* name) { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif auto new_name = string::Sprintf("%p.%d", this, vars_.size()); if (name != nullptr) { *name = new_name; @@ -69,29 +82,39 @@ Variable* Scope::Var(std::string* name) { } Variable* Scope::FindVar(const std::string& name) const { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif return FindVarInternal(name); } const Scope* Scope::FindScope(const Variable* var) const { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif return FindScopeInternal(var); } void Scope::DropKids() { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif for (Scope* s : kids_) delete s; kids_.clear(); } bool Scope::HasKid(const Scope* scope) const { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); return it != this->kids_.end(); } std::vector Scope::LocalVarNames() const { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif std::vector known_vars; known_vars.reserve(this->vars_.size()); for (auto& p : vars_) { @@ -101,7 +124,9 @@ std::vector Scope::LocalVarNames() const { } void Scope::DeleteScope(Scope* scope) const { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); this->kids_.erase(it); @@ -114,7 +139,9 @@ void Scope::DeleteScope(Scope* scope) const { } void Scope::EraseVars(const std::vector& var_names) { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif std::set var_set(var_names.begin(), var_names.end()); for (auto it = vars_.begin(); it != vars_.end();) { if (var_set.find(it->first) != var_set.end()) { @@ -127,12 +154,16 @@ void Scope::EraseVars(const std::vector& var_names) { void Scope::Rename(const std::string& origin_name, const std::string& new_name) const { +#if WITH_LOCK std::unique_lock lock(mutex_); +#endif RenameInternal(origin_name, new_name); } std::string Scope::Rename(const std::string& origin_name) const { +#if WITH_LOCK std::unique_lock 
lock(mutex_); +#endif auto new_name = string::Sprintf("%p.%d", this, vars_.size()); RenameInternal(origin_name, new_name); return new_name; diff --git a/paddle/fluid/framework/selected_rows_test.cc b/paddle/fluid/framework/selected_rows_test.cc index 5ca864cfdf7176850dd31dd42ef3306061a742cf..928e1ad8b9168e61ddc5782066a4aa29a4296a94 100644 --- a/paddle/fluid/framework/selected_rows_test.cc +++ b/paddle/fluid/framework/selected_rows_test.cc @@ -27,8 +27,11 @@ class SelectedRowsTester : public ::testing::Test { selected_rows_.reset(new SelectedRows(rows, height)); Tensor* value = selected_rows_->mutable_value(); - value->mutable_data( + auto* data = value->mutable_data( make_ddim({static_cast(rows.size()), row_numel}), place_); + for (int64_t i = 0; i < value->numel(); ++i) { + data[i] = static_cast(i); + } } protected: @@ -60,6 +63,10 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) { ASSERT_EQ(selected_rows_->height(), dst_tensor.height()); ASSERT_EQ(selected_rows_->value().dims(), dst_tensor.value().dims()); ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims()); + auto* dst_data = dst_tensor.value().data(); + for (int64_t i = 0; i < dst_tensor.value().numel(); ++i) { + ASSERT_EQ(dst_data[i], static_cast(i)); + } } TEST(SelectedRows, SparseTable) { diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 6698efd1fa773127a84b4bcb28f57f4226dd7ae2..db381bbc3911ad9650162d9b9012580e5b638828 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -53,7 +53,7 @@ if(NOT APPLE) endif() if(WITH_TESTING) - # tests/book depends the models that generated by python/paddle/fluid/tests/book + # tests/book depends the models that generated by python/paddle/fluid/tests/book add_subdirectory(tests/book) if(WITH_INFERENCE_API_TEST) add_subdirectory(tests/api) diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index c2a1c6634bd8f8de0796456e91cb3c530d4c6823..c740ea009f6cfc2ea250d8f1abdd7d442c2a0bb0 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -1,6 +1,6 @@ cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass) set(analysis_deps - framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor pretty_log) + framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor pretty_log) cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc analyzer.cc diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 3b5be7f3ee33c73a9704bafa9f1b736c8a3cd9ea..f90910ac0d0a897ef01d4ca2bd0bca575baf4c40 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -37,12 +37,16 @@ TEST(Analyzer, analysis_without_tensorrt) { TEST(Analyzer, analysis_with_tensorrt) { FLAGS_IA_enable_tensorrt_subgraph_engine = true; Argument argument; + argument.Set("minimum_subgraph_size", new int(0)); + argument.Set("max_batch_size", new int(3)); + argument.Set("workspace_size", new int(1 << 20)); + argument.Set("precision_mode", new std::string("FP32")); argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); Analyzer analyser; analyser.Run(&argument); } -void TestWord2vecPrediction(const std::string &model_path) { +void TestWord2vecPrediction(const std::string& model_path) { 
NativeConfig config; config.model_dir = model_path; config.use_gpu = false; @@ -73,8 +77,8 @@ void TestWord2vecPrediction(const std::string &model_path) { // The outputs' buffers are in CPU memory. for (size_t i = 0; i < std::min(5UL, num_elements); i++) { LOG(INFO) << "data: " - << static_cast(outputs.front().data.data())[i]; - PADDLE_ENFORCE(static_cast(outputs.front().data.data())[i], + << static_cast(outputs.front().data.data())[i]; + PADDLE_ENFORCE(static_cast(outputs.front().data.data())[i], result[i]); } } diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc index 5652940ec6d4cc7ba9a1d3a3e65f7dca1690d8c4..cb549f4b50cf56154a951d16b58b022dbad3e990 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc @@ -97,8 +97,10 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node *node) { } } -void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, +void CreateTrtEngineOp(Node *node, Argument *argument, framework::proto::BlockDesc *block) { + PADDLE_ENFORCE(argument->main_dfg.get()); + const DataFlowGraph &graph = *(argument->main_dfg); static int counter{0}; PADDLE_ENFORCE(node->IsFunctionBlock()); framework::OpDesc desc; @@ -204,7 +206,10 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, PADDLE_ENFORCE(!block->vars().empty(), "the block has no var-desc"); // Set attrs + SetAttr(desc.Proto(), "subgraph", block->SerializeAsString()); + SetAttr(desc.Proto(), "max_batch_size", argument->Get("max_batch_size")); + SetAttr(desc.Proto(), "workspace_size", argument->Get("workspace_size")); SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++)); SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes())); SetAttr(desc.Proto(), "output_name_mapping", output_mapping); @@ -248,7 +253,7 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) { *block_desc.Proto()->mutable_vars() = argument_->origin_program_desc->blocks(0).vars(); PADDLE_ENFORCE(!block_desc.Proto()->vars().empty()); - CreateTrtEngineOp(node, *argument_->main_dfg, block_desc.Proto()); + CreateTrtEngineOp(node, argument_, block_desc.Proto()); auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex); auto *op = main_block->add_ops(); PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block"); diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc index b879067d2f2f6294c50e0adb21f9399a7c36698a..526bbbadfe90c3064d7c620cc22e30f7fef99088 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc @@ -309,6 +309,8 @@ void SubGraphFuse::operator()() { ReplaceNodesWithSubGraphs(); } void SubGraphFuse::ReplaceNodesWithSubGraphs() { auto subgraphs = SubGraphSplitter(graph_, node_inside_subgraph_teller_)(); for (auto &subgraph : subgraphs) { + if (subgraph.size() <= argument_->Get("minimum_subgraph_size")) + continue; std::unordered_set subgraph_uniq(subgraph.begin(), subgraph.end()); // replace this sub-graph with the first node. Two steps: 1. Create a Block // Node that contains this subgraph 2. 
Mark the nodes inside the sub-graph diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.h b/paddle/fluid/inference/analysis/subgraph_splitter.h index a31afbe6933da8d3c7a88142cc12d63b98b55796..76e4fda0249e03c617d1b37c079dcd97f21387c1 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter.h +++ b/paddle/fluid/inference/analysis/subgraph_splitter.h @@ -20,6 +20,7 @@ limitations under the License. */ #include +#include "paddle/fluid/inference/analysis/argument.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/node.h" @@ -63,8 +64,11 @@ class SubGraphFuse { public: using NodeInsideSubgraphTeller = SubGraphSplitter::NodeInsideSubgraphTeller; - SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller) - : graph_(graph), node_inside_subgraph_teller_(teller) {} + SubGraphFuse(DataFlowGraph *graph, const NodeInsideSubgraphTeller &teller, + Argument *argument) + : graph_(graph), + node_inside_subgraph_teller_(teller), + argument_(argument) {} // The main method which run all the logic. void operator()(); @@ -76,6 +80,7 @@ class SubGraphFuse { private: DataFlowGraph *graph_; NodeInsideSubgraphTeller node_inside_subgraph_teller_; + Argument *argument_; }; } // namespace analysis diff --git a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc index 531a170512f727d891aa6644ee08a60c25f16876..e1dc89fab5fb76d456b07c316ab1cabe6de23b26 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc @@ -66,10 +66,12 @@ TEST(SubGraphSplitter, Split) { TEST(SubGraphSplitter, Fuse) { auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); auto dfg = ProgramDescToDFG(desc); + Argument argument; + argument.Set("minimum_subgraph_size", new int(3)); size_t count0 = dfg.nodes.size(); - SubGraphFuse fuse(&dfg, teller); + SubGraphFuse fuse(&dfg, teller, &argument); fuse(); int count1 = 0; diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc index faf876de6d65d20cf7a084cd97392cfc8d791a42..cc1746ecb34c983d219693bcec17c8789c38fa9f 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc @@ -24,7 +24,7 @@ TensorRTSubGraphPass::TensorRTSubGraphPass( : node_inside_subgraph_teller_(teller) {} void TensorRTSubGraphPass::Run(DataFlowGraph *graph) { - SubGraphFuse(graph, node_inside_subgraph_teller_)(); + SubGraphFuse(graph, node_inside_subgraph_teller_, argument_)(); VLOG(4) << "debug info " << graph->HumanReadableInfo(false /*show_values*/, true /*show_functions*/); diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h index 219e3f5470f627e81005aabf94f9c72c33fd2eed..3545da9109d79964f36c3d7e738620cc2e0f9a6c 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h @@ -33,7 +33,10 @@ class TensorRTSubGraphPass : public DataFlowGraphPass { explicit TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller); - bool Initialize(Argument* argument) override { return true; } + bool Initialize(Argument* argument) override { + argument_ = argument; + return true; + } // This class get a sub-graph as input and determine whether to transform this // sub-graph into TensorRT. 
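 // Subgraphs whose node count is at or below the "minimum_subgraph_size"
 // argument are skipped by SubGraphFuse and stay on the fluid executor
 // instead of being fused into a TensorRT engine.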
@@ -46,6 +49,7 @@ class TensorRTSubGraphPass : public DataFlowGraphPass { private: NodeInsideSubgraphTeller node_inside_subgraph_teller_; + Argument* argument_; }; } // namespace analysis diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc index 67a5af83d89b771536ea11be51b35244ff5c09d6..9748e24b06295a4e7c2995429e6588cd0f225fe6 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc @@ -36,6 +36,10 @@ TEST(TensorRTSubGraphPass, main) { }; Argument argument(FLAGS_inference_model_dir); + argument.Set("minimum_subgraph_size", new int(0)); + argument.Set("max_batch_size", new int(3)); + argument.Set("workspace_size", new int(1 << 20)); + argument.Set("precision_mode", new std::string("FP32")); DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"}; DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"}; diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index e569df94c54c304852dab7c7496804c1b08d665c..32d58b87413c95908644ffba31bbec22d8e23201 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -18,10 +18,10 @@ if(APPLE) endif(APPLE) -set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager ${GLOB_PASS_LIB}) +set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB}) if(WITH_GPU AND TENSORRT_FOUND) - set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine) + set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine analysis_predictor) endif() function(inference_api_test TARGET_NAME) @@ -43,8 +43,10 @@ function(inference_api_test TARGET_NAME) endif(WITH_TESTING) endfunction(inference_api_test) -cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor) -cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis) +cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope) +cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor) +cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api) +cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api) cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api) @@ -52,18 +54,22 @@ cc_test(test_paddle_inference_api inference_api_test(test_api_impl SRC api_impl_tester.cc ARGS test_word2vec test_image_classification) +set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) +cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} paddle_inference_api + ARGS --dirname=${PYTHON_TESTS_DIR}/book) + if(WITH_GPU AND TENSORRT_FOUND) cc_library(paddle_inference_tensorrt_subgraph_engine SRCS api_tensorrt_subgraph_engine.cc - DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter) + DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter zero_copy_tensor_dummy) inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec) endif() if (WITH_ANAKIN AND WITH_MKL) # only needed in CI # 
compile the libinference_anakin_api.a and anakin.so. - cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml) - cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber) + cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml scope zero_copy_tensor_dummy) + cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber scope) function(anakin_target target_name) target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) endfunction() diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 1032aadcbda4f1b05841e08e1abe7c737c3aeb9c..0c11694d5a905be4d9f0c6ebbc6159a4dc4a346e 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -16,11 +16,15 @@ #include #include #include +#include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" +#include "paddle/fluid/inference/api/timer.h" #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/platform/profiler.h" @@ -28,8 +32,11 @@ DECLARE_bool(profile); namespace paddle { +using contrib::AnalysisConfig; + bool AnalysisPredictor::Init( - const std::shared_ptr& parent_scope) { + const std::shared_ptr &parent_scope, + const std::shared_ptr &program) { VLOG(3) << "Predictor::init()"; #if !defined(_WIN32) if (FLAGS_profile) { @@ -43,7 +50,8 @@ bool AnalysisPredictor::Init( if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); - LOG(WARNING) << "ir optimize only supports CPU currently"; + LOG(WARNING) << "ir optimize only supports CPU currently, enable_ir_optim " + "is turned false."; config_.enable_ir_optim = false; } else { place_ = paddle::platform::CPUPlace(); @@ -56,37 +64,134 @@ bool AnalysisPredictor::Init( scope_.reset(new paddle::framework::Scope()); } - executor_.reset(new paddle::framework::Executor(place_)); + executor_.reset(new paddle::framework::NaiveExecutor(place_)); - // Initialize the inference program - if (!config_.model_dir.empty()) { - // Parameters are saved in separate files sited in - // the specified `dirname`. - inference_program_ = paddle::inference::Load(executor_.get(), scope_.get(), - config_.model_dir); - } else if (!config_.prog_file.empty() && !config_.param_file.empty()) { - // All parameters are saved in a single file. - // The file names should be consistent with that used - // in Python API `fluid.io.save_inference_model`. 
- inference_program_ = paddle::inference::Load( - executor_.get(), scope_.get(), config_.prog_file, config_.param_file); + if (!program) { + if (!LoadProgramDesc()) return false; + OptimizeInferenceProgram(); } else { - LOG(ERROR) << "fail to load inference model from " << config_.model_dir; + inference_program_ = program; + } + executor_->Prepare(scope_.get(), *inference_program_, 0, + config_.use_feed_fetch_ops); + + // Get the feed_target_names and fetch_target_names + PrepareFeedFetch(); + return true; +} + +bool AnalysisPredictor::Run(const std::vector &inputs, + std::vector *output_data, + int batch_size) { + VLOG(3) << "Predictor::predict"; + inference::Timer timer; + timer.tic(); + // set feed variable + std::vector feeds; + framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get(); + if (!SetFeed(inputs, scope)) { + LOG(ERROR) << "fail to set feed"; return false; } + // Run the inference program + // if share variables, we need not create variables + executor_->Run(); - OptimizeInferenceProgram(); - if (config_._use_mkldnn) { - executor_->EnableMKLDNN(*inference_program_); + // get fetch variable + if (!GetFetch(output_data, scope)) { + LOG(ERROR) << "fail to get fetches"; + return false; } - ctx_ = executor_->Prepare(*inference_program_, 0); + VLOG(3) << "predict cost: " << timer.toc() << "ms"; + return true; +} - VLOG(5) << "to create variables"; - PADDLE_ENFORCE(scope_.get()); - executor_->CreateVariables(*inference_program_, - sub_scope_ ? sub_scope_ : scope_.get(), 0); - // Get the feed_target_names and fetch_target_names - PrepareFeedFetch(); +bool AnalysisPredictor::SetFeed(const std::vector &inputs, + framework::Scope *scope) { + VLOG(3) << "Predictor::set_feed"; + if (inputs.size() != feeds_.size()) { + LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get " + << inputs.size(); + return false; + } + + // Cache the inputs memory for better concurrency performance. + feed_tensors_.resize(inputs.size()); + + for (size_t i = 0; i < inputs.size(); ++i) { + auto &input = feed_tensors_[i]; + framework::DDim ddim = framework::make_ddim(inputs[i].shape); + void *input_ptr; + if (inputs[i].dtype == PaddleDType::INT64) { + input_ptr = input.mutable_data(ddim, platform::CPUPlace()); + } else if (inputs[i].dtype == PaddleDType::FLOAT32) { + input_ptr = input.mutable_data(ddim, platform::CPUPlace()); + } else { + LOG(ERROR) << "unsupported feed type " << inputs[i].dtype; + return false; + } + + // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. + std::memcpy(static_cast(input_ptr), inputs[i].data.data(), + inputs[i].data.length()); + // TODO(Superjomn) Low performance, need optimization for heavy LoD copy. + framework::LoD lod; + for (auto &level : inputs[i].lod) { + lod.emplace_back(level); + } + input.set_lod(lod); + int idx = -1; + if (config_.specify_input_name) { + idx = feed_names_[inputs[i].name]; + } else { + idx = boost::get(feeds_[i]->GetAttr("col")); + } + framework::SetFeedVariable(scope, input, "feed", idx); + } + return true; +} + +template +void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch, + PaddleTensor *output) { + // set shape. + auto shape = framework::vectorize(fetch.dims()); + output->shape.assign(shape.begin(), shape.end()); + // set data. + const T *data = fetch.data(); + int num_elems = inference::VecReduceToInt(shape); + output->data.Resize(num_elems * sizeof(T)); + // The fetched tensor output by fetch op, should always in CPU memory, so just + // copy. 
+ memcpy(output->data.data(), data, num_elems * sizeof(T)); + // set lod + output->lod.clear(); + for (auto &level : fetch.lod()) { + output->lod.emplace_back(level.begin(), level.end()); + } +} + +bool AnalysisPredictor::GetFetch(std::vector *outputs, + framework::Scope *scope) { + VLOG(3) << "Predictor::get_fetch"; + outputs->resize(fetchs_.size()); + for (size_t i = 0; i < fetchs_.size(); ++i) { + int idx = boost::get(fetchs_[i]->GetAttr("col")); + PADDLE_ENFORCE((size_t)idx == i); + framework::LoDTensor &fetch = + framework::GetFetchVariable(*scope, "fetch", idx); + auto type = fetch.type(); + auto output = &(outputs->at(i)); + if (type == typeid(float)) { + GetFetchOne(fetch, output); + output->dtype = PaddleDType::FLOAT32; + } else if (type == typeid(int64_t)) { + GetFetchOne(fetch, output); + output->dtype = PaddleDType::INT64; + } else { + LOG(ERROR) << "unknown type, only support float32 and int64 now."; + } + } return true; } @@ -107,6 +212,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { new std::string(config_.prog_file)); argument_.fluid_model_param_path.reset(new std::string(config_.param_file)); } + argument_.origin_program_desc.reset( new ProgramDesc(*inference_program_->Proto())); PADDLE_ENFORCE( @@ -127,9 +233,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() { } template <> -std::unique_ptr -CreatePaddlePredictor( - const contrib::AnalysisConfig& config) { +std::unique_ptr CreatePaddlePredictor< + AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) { VLOG(3) << "create AnalysisConfig"; if (config.use_gpu) { // 1. GPU memeroy @@ -150,15 +255,90 @@ CreatePaddlePredictor( } std::unique_ptr predictor(new AnalysisPredictor(config)); - if (!dynamic_cast(predictor.get())->Init(nullptr)) { + if (!dynamic_cast(predictor.get())->Init(nullptr)) { return nullptr; } return predictor; } +void AnalysisPredictor::PrepareFeedFetch() { + for (auto *op : inference_program_->Block(0).AllOps()) { + if (op->Type() == "feed") { + int idx = boost::get(op->GetAttr("col")); + if (feeds_.size() <= static_cast(idx)) { + feeds_.resize(idx + 1); + } + feeds_[idx] = op; + feed_names_[op->Output("Out")[0]] = idx; + } else if (op->Type() == "fetch") { + int idx = boost::get(op->GetAttr("col")); + if (fetchs_.size() <= static_cast(idx)) { + fetchs_.resize(idx + 1); + } + fetchs_[idx] = op; + } + } +} + +std::unique_ptr AnalysisPredictor::GetInputTensor( + const std::string &name) { + PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name); + std::unique_ptr res( + new ZeroCopyTensor(static_cast(executor_->scope()))); + res->input_or_output_ = true; + res->SetName(name); + return res; +} + +std::unique_ptr AnalysisPredictor::GetOutputTensor( + const std::string &name) { + PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name); + std::unique_ptr res( + new ZeroCopyTensor(static_cast(executor_->scope()))); + res->input_or_output_ = false; + res->SetName(name); + return res; +} + +bool AnalysisPredictor::ZeroCopyRun() { + executor_->Run(); + return true; +} + +bool AnalysisPredictor::LoadProgramDesc() { + // Initialize the inference program + std::unique_ptr tmp_exe( + new framework::Executor(platform::CPUPlace())); + if (!config_.model_dir.empty()) { + // Parameters are saved in separate files sited in + // the specified `dirname`. 
+ inference_program_ = paddle::inference::Load( + static_cast(tmp_exe.get()), scope_.get(), + config_.model_dir); + } else if (!config_.prog_file.empty() && !config_.param_file.empty()) { + // All parameters are saved in a single file. + // The file names should be consistent with that used + // in Python API `fluid.io.save_inference_model`. + inference_program_ = paddle::inference::Load( + static_cast(tmp_exe.get()), scope_.get(), + config_.prog_file, config_.param_file); + } else { + LOG(ERROR) << string::Sprintf( + "not valid model path '%s' or program path '%s'.", config_.model_dir, + config_.param_file); + return false; + } + return true; +} +std::unique_ptr AnalysisPredictor::Clone() { + auto *x = new AnalysisPredictor(config_); + x->Init(scope_, inference_program_); + return std::unique_ptr(x); +} + template <> std::unique_ptr CreatePaddlePredictor( - const contrib::AnalysisConfig& config) { + const contrib::AnalysisConfig &config) { return CreatePaddlePredictor(config); } diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index aa00e8be5c28c2e3bfe74fa0bff2c72210bd106e..0d01d7ac2b29ea6364b07af9bb3bdeb5ced6bd00 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -12,42 +12,81 @@ // See the License for the specific language governing permissions and // limitations under the License. +#pragma once #include #include +#include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/string/printf.h" namespace paddle { using inference::analysis::Argument; using inference::analysis::Analyzer; using framework::proto::ProgramDesc; +using framework::NaiveExecutor; +using contrib::AnalysisConfig; /* This predictor is based on the original native predictor with IR and Analysis * support. It will optimize IR and Parameters in the runtime. * TODO(Superjomn) Replace the Navive predictor? 
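 * A hedged usage sketch of the zero-copy path (mirrors the tester below;
 * tensor names are placeholders):
 *   AnalysisConfig config;
 *   config.use_feed_fetch_ops = false;
 *   auto pred = CreatePaddlePredictor<AnalysisConfig,
 *                                     PaddleEngineKind::kAnalysis>(config);
 *   auto in = pred->GetInputTensor("firstw");
 *   in->Reshape({4, 1});
 *   auto* in_data = in->mutable_data<float>(PaddlePlace::kCPU);
 *   // ... fill in_data ...
 *   pred->ZeroCopyRun();
 *   auto out = pred->GetOutputTensor("fc_1.tmp_2");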
*/ -class AnalysisPredictor : public NativePaddlePredictor { +class AnalysisPredictor : public PaddlePredictor { public: - explicit AnalysisPredictor(const contrib::AnalysisConfig& config) - : NativePaddlePredictor(config), config_(config) {} + explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {} - bool Init(const std::shared_ptr& parent_scope); + bool Init(const std::shared_ptr &parent_scope, + const std::shared_ptr &program = nullptr); - bool Run(const std::vector& inputs, - std::vector* output_data, - int batch_size = -1) override { - return NativePaddlePredictor::Run(inputs, output_data, batch_size); - } + bool Run(const std::vector &inputs, + std::vector *output_data, + int batch_size = -1) override; + + std::unique_ptr GetInputTensor( + const std::string &name) override; + std::unique_ptr GetOutputTensor( + const std::string &name) override; + + bool ZeroCopyRun() override; + + void PrepareFeedFetch(); void OptimizeInferenceProgram(); - Argument& analysis_argument() { return argument_; } + Argument &analysis_argument() { return argument_; } + + std::unique_ptr Clone() override; + + framework::Scope *scope() { return executor_->scope(); } + framework::ProgramDesc &program() { return *inference_program_; } + + protected: + bool LoadProgramDesc(); + + bool SetFeed(const std::vector &input_datas, + framework::Scope *scope); + bool GetFetch(std::vector *output_data, + framework::Scope *scope); + template + void GetFetchOne(const framework::LoDTensor &fetchs, + PaddleTensor *output_data); private: contrib::AnalysisConfig config_; Argument argument_; + std::unique_ptr executor_; + platform::Place place_; + std::shared_ptr scope_; + framework::Scope *sub_scope_{nullptr}; + std::shared_ptr inference_program_; + std::vector feeds_; + std::map feed_names_; + std::vector fetchs_; + // Memory buffer for feed inputs. The temporary LoDTensor will cause serious + // concurrency problems, so cache them. + std::vector feed_tensors_; }; } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..1d25f55b3188a684fe38df1417d114348cfa2e8a --- /dev/null +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+namespace paddle {
+namespace inference {
+using contrib::AnalysisConfig;
+
+TEST(AnalysisPredictor, ZeroCopy) {
+  AnalysisConfig config;
+  config.model_dir = FLAGS_dirname + "/word2vec.inference.model";
+  config.use_feed_fetch_ops = false;
+
+  auto predictor =
+      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+          config);
+
+  auto w0 = predictor->GetInputTensor("firstw");
+  auto w1 = predictor->GetInputTensor("secondw");
+  auto w2 = predictor->GetInputTensor("thirdw");
+  auto w3 = predictor->GetInputTensor("forthw");
+
+  w0->Reshape({4, 1});
+  w1->Reshape({4, 1});
+  w2->Reshape({4, 1});
+  w3->Reshape({4, 1});
+
+  auto* w0_data = w0->mutable_data<int64_t>(PaddlePlace::kCPU);
+  auto* w1_data = w1->mutable_data<int64_t>(PaddlePlace::kCPU);
+  auto* w2_data = w2->mutable_data<int64_t>(PaddlePlace::kCPU);
+  auto* w3_data = w3->mutable_data<int64_t>(PaddlePlace::kCPU);
+
+  for (int i = 0; i < 4; i++) {
+    w0_data[i] = i;
+    w1_data[i] = i;
+    w2_data[i] = i;
+    w3_data[i] = i;
+  }
+
+  predictor->ZeroCopyRun();
+
+  auto out = predictor->GetOutputTensor("fc_1.tmp_2");
+  PaddlePlace place;
+  int size = 0;
+  auto* out_data = out->data<float>(&place, &size);
+  // `size` is the number of output elements, not bytes; see
+  // ZeroCopyTensor::data() in details/zero_copy_tensor.cc.
+  LOG(INFO) << "output size: " << size;
+  LOG(INFO) << "output_data: " << out_data;
+}
+
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc
index c71769a32f604358fe68c927546591310649f116..01ea942d3c8d20180cfc9664b8601ba87a898e86 100644
--- a/paddle/fluid/inference/api/api.cc
+++ b/paddle/fluid/inference/api/api.cc
@@ -1,16 +1,22 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
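The api.cc hunk below is mostly a pointer-style cleanup, but the PaddleBuf methods it touches implement the buffer's ownership contract: Resize() (re)allocates memory the buffer owns and frees, Reset() wraps caller-owned memory that Free() leaves alone, and copy-assignment from a non-owning buffer keeps sharing the external pointer. A small illustrative sketch of that contract, based on the Resize/Reset/Free semantics in this file (the function itself is hypothetical):

```cpp
#include "paddle/fluid/inference/api/paddle_inference_api.h"

void PaddleBufOwnershipDemo() {
  // Owning buffer: PaddleBuf allocated this memory and will free it.
  paddle::PaddleBuf owned(4 * sizeof(float));

  // Non-owning buffer: wraps caller memory; PaddleBuf::Free() won't touch it.
  float external[4] = {0.f, 1.f, 2.f, 3.f};
  paddle::PaddleBuf borrowed(static_cast<void *>(external), sizeof(external));

  // Reset() frees any owned memory, then switches to non-owning mode.
  owned.Reset(static_cast<void *>(external), sizeof(external));

  // Copying a non-owning buffer shares the external pointer; copying an
  // owning buffer would deep-copy its contents instead.
  paddle::PaddleBuf alias = borrowed;
}
```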
+#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle_inference_api.h" namespace paddle { @@ -26,7 +32,7 @@ int PaddleDtypeSize(PaddleDType dtype) { } } -PaddleBuf::PaddleBuf(PaddleBuf&& other) +PaddleBuf::PaddleBuf(PaddleBuf &&other) : data_(other.data_), length_(other.length_), memory_owned_(other.memory_owned_) { @@ -35,9 +41,9 @@ PaddleBuf::PaddleBuf(PaddleBuf&& other) other.length_ = 0; } -PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; } +PaddleBuf::PaddleBuf(const PaddleBuf &other) { *this = other; } -PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) { +PaddleBuf &PaddleBuf::operator=(const PaddleBuf &other) { if (!other.memory_owned_) { data_ = other.data_; length_ = other.length_; @@ -51,7 +57,7 @@ PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) { return *this; } -PaddleBuf& PaddleBuf::operator=(PaddleBuf&& other) { +PaddleBuf &PaddleBuf::operator=(PaddleBuf &&other) { // only the buffer with external memory can be copied data_ = other.data_; length_ = other.length_; @@ -75,7 +81,7 @@ void PaddleBuf::Resize(size_t length) { } } -void PaddleBuf::Reset(void* data, size_t length) { +void PaddleBuf::Reset(void *data, size_t length) { Free(); memory_owned_ = false; data_ = data; @@ -85,7 +91,7 @@ void PaddleBuf::Reset(void* data, size_t length) { void PaddleBuf::Free() { if (memory_owned_ && data_) { PADDLE_ENFORCE_GT(length_, 0); - free(static_cast(data_)); + free(static_cast(data_)); data_ = nullptr; length_ = 0; } diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index dca4386b21b4a064c21b52218682321258f368c4..53740899cd4176ae007c09b7728e504675d13248 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -145,7 +145,7 @@ bool NativePaddlePredictor::Run(const std::vector &inputs, VLOG(4) << "Run prepared context"; executor_->RunPreparedContext(ctx_.get(), scope, false, /* don't create local scope each time*/ - false /* don't create variable eatch time */); + false /* don't create variable each time */); VLOG(4) << "Finish prepared context"; // get fetch variable if (!GetFetch(output_data, scope)) { diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h index 6386d601262b3dac0e957fae991d23768b52f2c0..7882f6a53c7ce9a2486158ea9b50c018d1814091 100644 --- a/paddle/fluid/inference/api/api_impl.h +++ b/paddle/fluid/inference/api/api_impl.h @@ -1,16 +1,16 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 +http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once @@ -30,6 +30,8 @@ #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/profiler.h" @@ -52,6 +54,8 @@ class NativePaddlePredictor : public PaddlePredictor { ~NativePaddlePredictor() override; + framework::Scope *scope() { return sub_scope_ ? sub_scope_ : scope_.get(); } + protected: bool SetFeed(const std::vector &input_datas, framework::Scope *scope); diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index fc1364b80ac1ee2d304eb2fe429eae5f56967516..106a941b2954bc7490c4ee6380b5249e126fbfb3 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -43,7 +43,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { NativeConfig GetConfig() { NativeConfig config; - config.model_dir = FLAGS_dirname + "word2vec.inference.model"; + config.model_dir = FLAGS_dirname + "/word2vec.inference.model"; LOG(INFO) << "dirname " << config.model_dir; config.fraction_of_gpu_memory = 0.15; #ifdef PADDLE_WITH_CUDA @@ -110,7 +110,7 @@ void MainImageClassification(bool use_gpu) { NativeConfig config = GetConfig(); config.use_gpu = use_gpu; config.model_dir = - FLAGS_dirname + "image_classification_resnet.inference.model"; + FLAGS_dirname + "/image_classification_resnet.inference.model"; const bool is_combined = false; std::vector> feed_target_shapes = @@ -214,7 +214,7 @@ void MainThreadsImageClassification(bool use_gpu) { NativeConfig config = GetConfig(); config.use_gpu = use_gpu; config.model_dir = - FLAGS_dirname + "image_classification_resnet.inference.model"; + FLAGS_dirname + "/image_classification_resnet.inference.model"; auto main_predictor = CreatePaddlePredictor(config); std::vector jobs(num_jobs); diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc index 6c7e63971b2d93f58e219dbd93637c8d389deb7c..5ee6a5a93168f58770067f76ca7f6bb6f67b2965 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc @@ -35,8 +35,6 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { bool Init(const std::shared_ptr& parent_scope) { FLAGS_IA_enable_tensorrt_subgraph_engine = true; VLOG(3) << "Predictor::init()"; - FLAGS_tensorrt_max_batch_size = config_.max_batch_size; - FLAGS_tensorrt_workspace_size = config_.workspace_size; if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); } else { @@ -92,6 +90,14 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { void OptimizeInferenceProgram() { // Analyze inference_program Argument argument; + + argument.Set("minimum_subgraph_size", + new int(config_.minimum_subgraph_size)); + argument.Set("max_batch_size", new int(config_.max_batch_size)); + argument.Set("workspace_size", new int(config_.workspace_size)); + argument.Set("precision_mode", + new 
                      std::string(config_.precision_mode));
+
   if (!config_.model_dir.empty()) {
     argument.fluid_model_dir.reset(new std::string(config_.model_dir));
   } else {
diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..14698f6dfc8885ec1d35f1912bad10a9caa13db4
--- /dev/null
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -0,0 +1,111 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+
+void ZeroCopyTensor::Reshape(const std::vector<int> &shape) {
+  PADDLE_ENFORCE(!name_.empty(),
+                 "Need to SetName first, so that the corresponding tensor can "
+                 "be retrieved.");
+  PADDLE_ENFORCE(input_or_output_,
+                 "Can't reshape the output tensor, it is readonly");
+  PADDLE_ENFORCE(scope_);
+  auto *scope = static_cast<framework::Scope *>(scope_);
+  auto *var = scope->FindVar(name_);
+  PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_);
+  auto *tensor = var->GetMutable<framework::LoDTensor>();
+  tensor->Resize(framework::make_ddim(shape));
+}
+
+template <typename T>
+T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
+  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
+  switch (static_cast<int>(place)) {
+    case static_cast<int>(PaddlePlace::kCPU): {
+      return tensor->mutable_data<T>(platform::CPUPlace());
+    }
+    case static_cast<int>(PaddlePlace::kGPU): {
+      return tensor->mutable_data<T>(platform::CUDAPlace());
+    }
+    default:
+      PADDLE_THROW("Unsupported place: %d", static_cast<int>(place));
+      break;
+  }
+  return nullptr;
+}
+
+template <typename T>
+T *ZeroCopyTensor::data(PaddlePlace *place, int *size) {
+  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
+  auto *res = tensor->data<T>();
+
+  if (platform::is_cpu_place(tensor->place())) {
+    *place = PaddlePlace::kCPU;
+  } else if (platform::is_gpu_place(tensor->place())) {
+    *place = PaddlePlace::kGPU;
+  } else {
+    *place = PaddlePlace::kUNK;
+  }
+
+  *size = tensor->numel();
+  return res;
+}
+
+template float *ZeroCopyTensor::data<float>(PaddlePlace *place, int *size);
+template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place, int *size);
+template float *ZeroCopyTensor::mutable_data<float>(PaddlePlace place);
+template int64_t *ZeroCopyTensor::mutable_data<int64_t>(PaddlePlace place);
+
+void *ZeroCopyTensor::FindTensor() const {
+  PADDLE_ENFORCE(!name_.empty(),
+                 "Need to SetName first, so that the corresponding tensor can "
+                 "be retrieved.");
+  PADDLE_ENFORCE(scope_);
+  auto *scope = static_cast<framework::Scope *>(scope_);
+  auto *var = scope->FindVar(name_);
+  PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_);
+  auto *tensor = var->GetMutable<framework::LoDTensor>();
+  return tensor;
+}
+
+std::vector<int64_t> ZeroCopyTensor::shape() {
+  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
+  PADDLE_ENFORCE(tensor, "not found tensor called %s in the scope", name_);
+  return
 framework::vectorize(tensor->dims());
+}
+
+void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
+  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
+  framework::LoD lod;
+  for (auto &level : x) {
+    lod.emplace_back(level);
+  }
+  tensor->set_lod(lod);
+}
+
+std::vector<std::vector<size_t>> ZeroCopyTensor::lod() const {
+  std::vector<std::vector<size_t>> res;
+  auto *tensor = static_cast<framework::LoDTensor *>(FindTensor());
+  for (auto &level : tensor->lod()) {
+    res.emplace_back(level);
+  }
+  return res;
+}
+
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2d5b561d801cd9e734cab13b28e7285493e30f94
--- /dev/null
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc
@@ -0,0 +1,46 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/api/paddle_inference_api.h"
+
+namespace paddle {
+
+void ZeroCopyTensor::Reshape(const std::vector<int> &shape) {}
+
+template <typename T>
+T *ZeroCopyTensor::mutable_data(PaddlePlace place) {
+  return nullptr;
+}
+
+template <typename T>
+T *ZeroCopyTensor::data(PaddlePlace *place, int *size) {
+  return nullptr;
+}
+
+template float *ZeroCopyTensor::data<float>(PaddlePlace *place, int *size);
+template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place, int *size);
+template float *ZeroCopyTensor::mutable_data<float>(PaddlePlace place);
+template int64_t *ZeroCopyTensor::mutable_data<int64_t>(PaddlePlace place);
+
+void *ZeroCopyTensor::FindTensor() const { return nullptr; }
+
+std::vector<int64_t> ZeroCopyTensor::shape() { return {}; }
+
+void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {}
+
+std::vector<std::vector<size_t>> ZeroCopyTensor::lod() const {
+  return std::vector<std::vector<size_t>>();
+}
+
+}  // namespace paddle
diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index 1fec2f96da0f9d978a3537b2d78e4ce5ef628c81..dbbd3f6a6786a4a4849002878263353919e8f31b 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -21,8 +21,10 @@
 #include <sstream>
 #include <string>
 #include <vector>
+#include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/timer.h"
+#include "paddle/fluid/string/printf.h"
 
 namespace paddle {
 namespace inference {
@@ -93,6 +95,20 @@ static void TensorAssignData(PaddleTensor *tensor,
   }
 }
 
+template <typename T>
+static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor,
+                                    const std::vector<std::vector<T>> &data) {
+  // Copy the nested feed data into the tensor's CPU buffer and return the
+  // number of elements written.
+  auto *ptr = tensor->mutable_data<T>(PaddlePlace::kCPU);
+  int c = 0;
+  for (const auto &f : data) {
+    for (T v : f) {
+      ptr[c++] = v;
+    }
+  }
+  return c;
+}
+
 static std::string DescribeTensor(const PaddleTensor &tensor) {
   std::stringstream os;
   os << "Tensor [" << tensor.name << "]\n";
@@ -138,5 +154,127 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
   }
 }
 
+template <typename T>
+std::string LoDTensorSummary(const framework::LoDTensor &tensor) {
+  std::stringstream ss;
+  ss << "\n---- tensor ---" << '\n';
+  ss << "lod: [";
+  for (const auto &level : tensor.lod()) {
+    ss << "[ ";
+    for (auto i : level) {
+      ss << i << ", ";
+    }
+    ss << "]";
+  }
+  ss << "]\n";
+
+  ss << "shape: [";
+  int size = 1;
+  for (int i = 0; i < tensor.dims().size(); i++) {
+    int dim = tensor.dims()[i];
+    ss << dim << ", ";
+    size *= dim;
+  }
+  ss << "]\n";
+
+  ss << "data: ";
+  for (int i = 0; i < std::min(20, size); i++) {
+    ss << tensor.data<T>()[i] << " ";
+  }
+  ss << "\n";
+
+  return ss.str();
+}
+
+static bool CompareLoD(const framework::LoD &a, const framework::LoD &b) {
+  if (a.size() != b.size()) {
+    LOG(ERROR) << string::Sprintf("lod size not match %d != %d", a.size(),
+                                  b.size());
+    return false;
+  }
+  for (size_t i = 0; i < a.size(); i++) {
+    auto &al = a[i];
+    auto &bl = b[i];
+    if (al.size() != bl.size()) {
+      LOG(ERROR) << string::Sprintf("level size %d != %d", al.size(),
+                                    bl.size());
+      return false;
+    }
+  }
+  return true;
+}
+
+static bool CompareShape(const std::vector<int64_t> &a,
+                         const std::vector<int64_t> &b) {
+  if (a.size() != b.size()) {
+    LOG(ERROR) << string::Sprintf("shape size not match %d != %d", a.size(),
+                                  b.size());
+    return false;
+  }
+  for (size_t i = 0; i < a.size(); i++) {
+    if (a[i] != b[i]) {
+      LOG(ERROR) << string::Sprintf("shape %d-th element not match %d != %d", i,
+                                    a[i], b[i]);
+      return false;
+    }
+  }
+  return true;
+}
+
+static bool CompareTensorData(const framework::LoDTensor &a,
+                              const framework::LoDTensor &b) {
+  auto a_shape = framework::vectorize(a.dims());
+  auto b_shape = framework::vectorize(b.dims());
+  size_t a_size = std::accumulate(a_shape.begin(), a_shape.end(), 1,
+                                  [](int a, int b) { return a * b; });
+  size_t b_size = std::accumulate(b_shape.begin(), b_shape.end(), 1,
+                                  [](int a, int b) { return a * b; });
+  if (a_size != b_size) {
+    LOG(ERROR) << string::Sprintf("tensor data size not match, %d != %d",
+                                  a_size, b_size);
+    return false;
+  }
+
+  for (size_t i = 0; i < a_size; i++) {
+    if (a.type() == typeid(float)) {
+      const auto *a_data = a.data<float>();
+      const auto *b_data = b.data<float>();
+      if (std::abs(a_data[i] - b_data[i]) > 1e-3) {
+        LOG(ERROR) << string::Sprintf(
+            "tensor data %d-th element not match, %f != %f", i, a_data[i],
+            b_data[i]);
+        return false;
+      }
+    } else if (a.type() == typeid(int64_t)) {
+      const auto *a_data = a.data<int64_t>();
+      const auto *b_data = b.data<int64_t>();
+      if (std::abs(a_data[i] - b_data[i]) > 1e-3) {
+        LOG(ERROR) << string::Sprintf(
+            "tensor data %d-th element not match, %d != %d", i, a_data[i],
+            b_data[i]);
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+static bool CompareTensor(const framework::LoDTensor &a,
+                          const framework::LoDTensor &b) {
+  if (!CompareLoD(a.lod(), b.lod())) {
+    return false;
+  }
+  if (!CompareShape(framework::vectorize(a.dims()),
+                    framework::vectorize(b.dims()))) {
+    return false;
+  }
+
+  if (!CompareTensorData(a, b)) {
+    return false;
+  }
+
+  return true;
+}
+
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h
index 2b4e5ed73704041e18bdbce32338405f3601e082..3aa5c614687953f824fc5a94e8bde29090dbeb5d 100644
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -101,6 +101,40 @@ struct PaddleTensor {
   std::vector<std::vector<size_t>> lod;  // Tensor+LoD equals LoDTensor
 };
 
+enum class PaddlePlace { kUNK = -1, kCPU, kGPU };
+// Tensor without copy, currently only supported by AnalysisPredictor.
+class ZeroCopyTensor {
+ public:
+  void Reshape(const std::vector<int>& shape);
+
+  // Get the memory in CPU or GPU with the specific data type; call Reshape
+  // first so that the data size is known.
+  // One can write through the returned pointer directly to feed the data.
+  // This is for writing the input tensor.
+  template <typename T>
+  T* mutable_data(PaddlePlace place);
+  // Get the memory directly; the place and the number of elements are
+  // returned through the pointer arguments.
+  // This is for reading the output tensor.
+  template <typename T>
+  T* data(PaddlePlace* place, int* size);
+
+  std::vector<int64_t> shape();
+
+  void SetLoD(const std::vector<std::vector<size_t>>& x);
+  std::vector<std::vector<size_t>> lod() const;
+
+ protected:
+  ZeroCopyTensor(void* scope) : scope_{scope} {}
+  void SetName(const std::string& name) { name_ = name; }
+  void* FindTensor() const;
+
+ private:
+  std::string name_;
+  bool input_or_output_;
+  friend class AnalysisPredictor;
+  void* scope_{nullptr};
+};
+
 /*
  * A simple Inference API for Paddle.
  */
@@ -120,6 +154,19 @@ class PaddlePredictor {
                    std::vector<PaddleTensor>* output_data,
                    int batch_size = -1) = 0;
 
+  // Zero copy input and output optimization.
+  // Get the input or output tensors, and operate on their memory directly,
+  // without copy.
+  virtual std::unique_ptr<ZeroCopyTensor> GetInputTensor(
+      const std::string& name) {
+    return nullptr;
+  }
+  virtual std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
+      const std::string& name) {
+    return nullptr;
+  }
+  virtual bool ZeroCopyRun() { return false; }
+
   // Clone a predictor that shares the model weights; the cloned predictor
   // should be thread-safe.
   virtual std::unique_ptr<PaddlePredictor> Clone() = 0;
@@ -194,6 +241,14 @@ struct MixedRTConfig : public NativeConfig {
   // For workspace_size, refer it from here:
   // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
   int workspace_size{1 << 30};
+  // We transform the Ops that can be converted into TRT layers in the model,
+  // and aggregate these Ops into subgraphs for TRT execution.
+  // This variable controls the minimum number of nodes in such a subgraph;
+  // the default value is 3.
+  int minimum_subgraph_size = 3;
+  // Reserved configuration.
+  // Only "FP32" is supported now; "FP16" and "INT8" will be supported later.
+  std::string precision_mode = "FP32";
 };
 
 // NOTE WIP, not stable yet.
@@ -204,12 +259,18 @@ struct AnalysisConfig : public NativeConfig {
     kExclude  // Specify the disabled passes in `ir_passes`.
   };
 
+  // Determine whether to perform graph optimization.
   bool enable_ir_optim = true;
+  // Manually determine the IR passes to run.
   IrPassMode ir_mode{IrPassMode::kExclude};
-  // attention lstm fuse works only on some specific models, disable as default.
-  std::vector<std::string> ir_passes{"attention_lstm_fuse_pass"};
+  std::vector<std::string> ir_passes;
+
+  // NOT stable yet.
+  bool use_feed_fetch_ops{true};
 
-  // NOTE this is just for internal development, please not use it.
+  // NOTE this is just for internal development, please do not use it. NOT
+  // stable yet.
bool _use_mkldnn{false}; }; diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index d7ab2ac980af2cf3bd9d95bfdbfa1887ef9a64d7..70f9e397c96cf3fe92779778950f3df71b5a67c9 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -90,3 +90,13 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI DEPS inference_anakin_api_shared dynload_cuda SERIAL) endif() endif() + +if(WITH_GPU AND TENSORRT_FOUND) + set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt") + if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR}) + inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz") + endif() + cc_test(test_trt_models SRCS trt_models_tester.cc + ARGS --dirname=${TRT_MODEL_INSTALL_DIR}/trt_test_models + DEPS paddle_inference_tensorrt_subgraph_engine) +endif() diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc index 36bbec473114cfd2e68c97a53264957477ade3fb..5fb551810fd4d1c56547a8aa581cb6c4587df031 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -18,6 +18,8 @@ namespace paddle { namespace inference { namespace analysis { +using contrib::AnalysisConfig; + struct DataRecord { std::vector data; std::vector lod; @@ -78,6 +80,7 @@ struct DataRecord { } } } + DataRecord NextBatch() { DataRecord data; data.data = batched_datas[batch_iter]; @@ -155,7 +158,9 @@ TEST(Analyzer_LAC, fuse_statis) { SetConfig(&cfg); int num_ops; - auto fuse_statis = GetFuseStatis(cfg, &num_ops); + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); ASSERT_TRUE(fuse_statis.count("fc_fuse")); ASSERT_TRUE(fuse_statis.count("fc_gru_fuse")); EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index 59020545cd609961487cafc4a08c20951a02c8ce..577b97e271aacab5d6740de7c8bc00bc87ae54dd 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -16,6 +16,7 @@ namespace paddle { namespace inference { +using contrib::AnalysisConfig; struct DataRecord { std::vector> word_data_all, mention_data_all; @@ -145,7 +146,9 @@ TEST(Analyzer_Chinese_ner, fuse_statis) { SetConfig(&cfg); int num_ops; - auto fuse_statis = GetFuseStatis(cfg, &num_ops); + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); ASSERT_TRUE(fuse_statis.count("fc_fuse")); ASSERT_TRUE(fuse_statis.count("fc_gru_fuse")); EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index 3bf5383d8f35347c767d6caee83e0dcc5fb0a446..d2e344111bdf84c936bbef7ff51246b0f248f41d 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -12,12 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
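The MixedRTConfig knobs introduced in paddle_inference_api.h above (max_batch_size, workspace_size, minimum_subgraph_size, precision_mode) and the TRT test target registered in the CMake hunk are exercised together by trt_models_tester.cc later in this section. A hedged configuration sketch; the helper name and the concrete values are illustrative, not defaults mandated by the patch:

```cpp
#include <string>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

// Illustrative values only; tune per model and GPU.
paddle::contrib::MixedRTConfig MakeTrtConfig(const std::string &model_dir) {
  paddle::contrib::MixedRTConfig config;
  config.model_dir = model_dir;
  config.use_gpu = true;
  config.device = 0;
  config.fraction_of_gpu_memory = 0.2;
  config.max_batch_size = 8;         // upper bound on the runtime batch size
  config.workspace_size = 1 << 30;   // TensorRT scratch space, in bytes
  config.minimum_subgraph_size = 3;  // skip TRT for subgraphs smaller than this
  config.precision_mode = "FP32";    // the only mode this patch supports
  return config;
}
```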
+#include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" +DEFINE_bool(with_precision_check, true, "turn on test"); + namespace paddle { namespace inference { using namespace framework; // NOLINT +using namespace contrib; // NOLINT struct DataRecord { std::vector>> link_step_data_all; @@ -29,10 +33,12 @@ struct DataRecord { size_t batch_iter{0}; size_t batch_size{1}; DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) : batch_size(batch_size) { Load(path); } + DataRecord NextBatch() { DataRecord data; size_t batch_end = batch_iter + batch_size; @@ -101,6 +107,7 @@ struct DataRecord { num_samples = num_lines; } }; + void PrepareInputs(std::vector *input_slots, DataRecord *data, int batch_size) { PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor, @@ -149,7 +156,55 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } } -void SetConfig(contrib::AnalysisConfig *cfg) { +void PrepareZeroCopyInputs(ZeroCopyTensor *lod_attention_tensor, + ZeroCopyTensor *cell_init_tensor, + ZeroCopyTensor *data_tensor, + ZeroCopyTensor *hidden_init_tensor, + ZeroCopyTensor *week_tensor, + ZeroCopyTensor *minute_tensor, + DataRecord *data_record, int batch_size) { + auto one_batch = data_record->NextBatch(); + std::vector rnn_link_data_shape( + {static_cast(one_batch.rnn_link_data.size()), + static_cast(one_batch.rnn_link_data.front().size())}); + lod_attention_tensor->Reshape({1, 2}); + lod_attention_tensor->SetLoD({one_batch.lod1, one_batch.lod2}); + + cell_init_tensor->Reshape({batch_size, 15}); + cell_init_tensor->SetLoD({one_batch.lod3}); + + hidden_init_tensor->Reshape({batch_size, 15}); + hidden_init_tensor->SetLoD({one_batch.lod3}); + + data_tensor->Reshape(rnn_link_data_shape); + data_tensor->SetLoD({one_batch.lod1}); + + week_tensor->Reshape( + {static_cast(one_batch.rnn_week_datas.size()), + static_cast(one_batch.rnn_week_datas.front().size())}); + week_tensor->SetLoD({one_batch.lod3}); + + minute_tensor->Reshape( + {static_cast(one_batch.rnn_minute_datas.size()), + static_cast(one_batch.rnn_minute_datas.front().size())}); + minute_tensor->SetLoD({one_batch.lod3}); + + // assign data + float arr0[] = {0, 0}; + std::vector zeros(batch_size * 15, 0); + std::copy_n(arr0, 2, + lod_attention_tensor->mutable_data(PaddlePlace::kCPU)); + std::copy_n(arr0, 2, data_tensor->mutable_data(PaddlePlace::kCPU)); + std::copy_n(zeros.begin(), zeros.size(), + cell_init_tensor->mutable_data(PaddlePlace::kCPU)); + std::copy_n(zeros.begin(), zeros.size(), + hidden_init_tensor->mutable_data(PaddlePlace::kCPU)); + ZeroCopyTensorAssignData(data_tensor, one_batch.rnn_link_data); + ZeroCopyTensorAssignData(week_tensor, one_batch.rnn_week_datas); + ZeroCopyTensorAssignData(minute_tensor, one_batch.rnn_minute_datas); +} + +void SetConfig(AnalysisConfig *cfg) { cfg->prog_file = FLAGS_infer_model + "/__model__"; cfg->param_file = FLAGS_infer_model + "/param"; cfg->use_gpu = false; @@ -187,7 +242,9 @@ TEST(Analyzer_rnn1, fuse_statis) { SetConfig(&cfg); int num_ops; - auto fuse_statis = GetFuseStatis(cfg, &num_ops); + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); ASSERT_TRUE(fuse_statis.count("fc_fuse")); EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM @@ -214,7 +271,229 @@ TEST(Analyzer_rnn1, multi_thread) { std::vector> input_slots_all; 
SetInput(&input_slots_all); - TestPrediction(cfg, input_slots_all, &outputs, 4 /* num_threads */); + TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); +} + +bool CompareTensors(framework::Scope &a_scope, framework::Scope &b_scope, + const std::vector &tensors) { + for (auto &x : tensors) { + auto *a_var = a_scope.FindVar(x); + auto *b_var = b_scope.FindVar(x); + if (a_var && b_var) { + if (a_var->Type() == typeid(framework::LoDTensor) || + a_var->Type() == typeid(framework::Tensor)) { + LOG(INFO) << "comparing tensor " << x; + auto &a_t = a_var->Get(); + auto &b_t = b_var->Get(); + if (!inference::CompareTensor(a_t, b_t)) { + LOG(ERROR) << string::Sprintf("tensor %s not match in two scopes", x); + } + } else { + LOG(INFO) << "skip no tensor " << x; + } + } else { + LOG(INFO) << "skip tensor " << x; + } + } + return true; +} + +// Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing +// on the complex RNN1 model. +TEST(Analyzer_rnn1, ZeroCopy) { + AnalysisConfig config; + SetConfig(&config); + config.use_feed_fetch_ops = false; + + PaddlePlace place; + int output_size{0}; + + auto predictor = + CreatePaddlePredictor( + config); + + config.use_feed_fetch_ops = true; + auto native_predictor = + CreatePaddlePredictor(config); + + config.use_feed_fetch_ops = true; // the analysis predictor needs feed/fetch. + auto analysis_predictor = + CreatePaddlePredictor( + config); + +#define NEW_TENSOR(name__) \ + auto name__##_tensor = predictor->GetInputTensor(#name__); + NEW_TENSOR(data_lod_attention); + NEW_TENSOR(cell_init); + NEW_TENSOR(data); + NEW_TENSOR(week); + NEW_TENSOR(minute); + NEW_TENSOR(hidden_init); + + // Prepare data for AnalysisPredictor + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + PrepareZeroCopyInputs(data_lod_attention_tensor.get(), cell_init_tensor.get(), + data_tensor.get(), hidden_init_tensor.get(), + week_tensor.get(), minute_tensor.get(), &data, + FLAGS_batch_size); + + // Prepare data for NativePredictor + std::vector> native_inputs; + SetInput(&native_inputs); + std::vector native_outputs; + std::vector analysis_outputs; + + auto output_tensor = predictor->GetOutputTensor("final_output.tmp_1"); + // Run analysis predictor + + int num_ops; + auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops); + ASSERT_TRUE(fuse_statis.count("fc_fuse")); + ASSERT_EQ(fuse_statis.at("fc_fuse"), 1); + ASSERT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM + ASSERT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1); + ASSERT_EQ(num_ops, + 13); // After graph optimization, only 13 operators exists. + + Timer timer; + double total_time{0}; + double native_total_time{0}; + double analysis_total_time{0.}; + + for (int i = 0; i < FLAGS_repeat; i++) { + timer.tic(); + predictor->ZeroCopyRun(); + total_time += timer.toc(); + } + + auto *output_data = output_tensor->data(&place, &output_size); + ASSERT_GT(output_size, 0); // more than one output! + + for (int i = 0; i < FLAGS_repeat; i++) { + // Run native predictor. 
+ timer.tic(); + ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs)); + native_total_time += timer.toc(); + } + + for (int i = 0; i < FLAGS_repeat; i++) { + timer.tic(); + ASSERT_TRUE( + analysis_predictor->Run(native_inputs.front(), &analysis_outputs)); + analysis_total_time += timer.toc(); + } + + if (!FLAGS_with_precision_check) { + return; + } + int native_output_size = VecReduceToInt(native_outputs.front().shape); + + EXPECT_EQ(native_output_size, output_size); + + // Compare tensors between analysis and zerocopy + auto *p0 = static_cast(predictor.get()); + auto *p1 = static_cast(analysis_predictor.get()); + auto *p2 = static_cast(native_predictor.get()); + + std::vector tensor_names; + for (auto &var_desc : p0->program().Block(0).AllVars()) { + tensor_names.push_back(var_desc->Name()); + } + + LOG(INFO) << "Comparing tensors"; + ASSERT_TRUE( + CompareTensors(*p0->scope(), *p1->scope(), {"final_output.tmp_1"})); + ASSERT_TRUE( + CompareTensors(*p0->scope(), *p2->scope(), {"final_output.tmp_1"})); + + LOG(INFO) << "output1 " << inference::LoDTensorSummary( + p0->scope() + ->FindVar("final_output.tmp_1") + ->Get()); + LOG(INFO) << "output2 " << inference::LoDTensorSummary( + p1->scope() + ->FindVar("final_output.tmp_1") + ->Get()); + LOG(INFO) << "output3 " << inference::LoDTensorSummary( + p2->scope() + ->FindVar("final_output.tmp_1") + ->Get()); + + for (int i = 0; i < output_size; i++) { + LOG(INFO) << output_data[i] << " " + << static_cast(native_outputs.front().data.data())[i] + << " " + << static_cast(analysis_outputs.front().data.data())[i]; + EXPECT_NEAR(output_data[i], + static_cast(native_outputs.front().data.data())[i], + 1e-3); + } + + LOG(INFO) << "batch_size: " << FLAGS_batch_size; + + LOG(INFO) << "zero average time: " + << total_time / (FLAGS_repeat * FLAGS_batch_size); + LOG(INFO) << "analysis average time: " + << analysis_total_time / (FLAGS_repeat * FLAGS_batch_size); + LOG(INFO) << "native average time: " + << native_total_time / (FLAGS_repeat * FLAGS_batch_size); +} + +TEST(Analyzer_rnn1, ZeroCopyMultiThread) { + AnalysisConfig config; + SetConfig(&config); + config.use_feed_fetch_ops = false; + +#define NEW_TENSOR(name__) \ + auto name__##_tensor = predictor->GetInputTensor(#name__); + + auto base_predictor = CreatePaddlePredictor(config); + double total_time_of_threads{0}; + std::vector threads; + std::vector> predictors; + for (int tid = 0; tid < FLAGS_num_threads; tid++) { + predictors.emplace_back(CreatePaddlePredictor(config)); + } + + for (int tid = 0; tid < FLAGS_num_threads; tid++) { + threads.emplace_back([config, &total_time_of_threads, &predictors, tid] { + // auto predictor = base_predictor->Clone(); + auto &predictor = predictors[tid]; + NEW_TENSOR(data_lod_attention); + NEW_TENSOR(cell_init); + NEW_TENSOR(data); + NEW_TENSOR(week); + NEW_TENSOR(minute); + NEW_TENSOR(hidden_init); + + // Prepare data for AnalysisPredictor + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + Timer timer; + double total_time{0}; + + for (int i = 0; i < FLAGS_repeat; i++) { + PrepareZeroCopyInputs(data_lod_attention_tensor.get(), + cell_init_tensor.get(), data_tensor.get(), + hidden_init_tensor.get(), week_tensor.get(), + minute_tensor.get(), &data, FLAGS_batch_size); + + timer.tic(); + predictor->ZeroCopyRun(); + total_time += timer.toc(); + } + + total_time_of_threads += total_time; + + LOG(INFO) << "thread time: " << total_time / FLAGS_repeat; + }); + } + + for (auto &t : threads) { + t.join(); + } + + LOG(INFO) << "average time: " + << 
total_time_of_threads / FLAGS_num_threads / FLAGS_repeat; } } // namespace inference diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc index 2f71ed46ffc9fd5f853f5b5b46de1446d28b9e69..cb4671c4379b5f6f144bfd5330866aa38163f4d4 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc @@ -182,7 +182,8 @@ TEST(Analyzer_seq_conv1, fuse_statis) { AnalysisConfig cfg; SetConfig(&cfg); int num_ops; - auto fuse_statis = GetFuseStatis(cfg, &num_ops); + auto predictor = CreatePaddlePredictor(cfg); + GetFuseStatis(predictor.get(), &num_ops); } // Compare result of NativeConfig and AnalysisConfig diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index 483ae66c5b24f6147b1b07da86494a914f80c34c..a2e86305b85dd893f578e97e0105fec828916fb4 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -19,6 +19,7 @@ limitations under the License. */ namespace paddle { namespace inference { namespace analysis { +using contrib::AnalysisConfig; struct Record { std::vector data; @@ -114,7 +115,8 @@ TEST(Analyzer_vis, fuse_statis) { AnalysisConfig cfg; SetConfig(&cfg); int num_ops; - GetFuseStatis(cfg, &num_ops); + auto predictor = CreatePaddlePredictor(cfg); + GetFuseStatis(predictor.get(), &num_ops); } // Compare result of NativeConfig and AnalysisConfig diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 9fcb5129d268a7730c11e5910077ad233050484e..cb36ddc8c879b1aff9838bba90364b17d53aa84e 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -86,11 +86,9 @@ std::unique_ptr CreateTestPredictor( size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); } -std::unordered_map GetFuseStatis(AnalysisConfig config, +std::unordered_map GetFuseStatis(PaddlePredictor *predictor, int *num_ops) { - auto predictor = CreateTestPredictor(config); - AnalysisPredictor *analysis_predictor = - dynamic_cast(predictor.get()); + auto *analysis_predictor = static_cast(predictor); auto &fuse_statis = analysis_predictor->analysis_argument() .Get>( framework::ir::kFuseStatisAttr); diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..bf320a0cbc2fff5f973c48768281e26d0fde232b --- /dev/null +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -0,0 +1,106 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
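The tester_helper.h hunk above changes the GetFuseStatis contract: instead of taking an AnalysisConfig and building a throwaway predictor internally, it now accepts a PaddlePredictor* that the caller owns, so the same predictor can first be inspected for fuse statistics and then reused to run inference. A condensed sketch of the new call pattern (the wrapper function is illustrative; the explicit AnalysisPredictor* cast used by the updated testers is optional at the call site, since the derived pointer converts implicitly):

```cpp
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"

void FuseStatisExample(const paddle::contrib::AnalysisConfig &cfg) {
  auto predictor =
      paddle::CreatePaddlePredictor<paddle::contrib::AnalysisConfig>(cfg);
  int num_ops = 0;
  // Note: the pointer must actually refer to an AnalysisPredictor, since
  // GetFuseStatis static_casts it internally.
  auto fuse_statis =
      paddle::inference::GetFuseStatis(predictor.get(), &num_ops);
  // `predictor` stays alive here, so the same instance can go on to serve
  // Run() or ZeroCopyRun() after the statistics are checked.
  (void)fuse_statis;
}
```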
+ +#include +#include +#include +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +namespace paddle { +using paddle::contrib::MixedRTConfig; + +DEFINE_string(dirname, "", "Directory of the inference model."); + +NativeConfig GetConfigNative() { + NativeConfig config; + config.model_dir = FLAGS_dirname; + // LOG(INFO) << "dirname " << config.model_dir; + config.fraction_of_gpu_memory = 0.45; + config.use_gpu = true; + config.device = 0; + return config; +} + +MixedRTConfig GetConfigTRT() { + MixedRTConfig config; + config.model_dir = FLAGS_dirname; + config.use_gpu = true; + config.fraction_of_gpu_memory = 0.2; + config.device = 0; + config.max_batch_size = 3; + return config; +} + +void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) { + NativeConfig config0 = GetConfigNative(); + config0.model_dir = model_dirname; + + MixedRTConfig config1 = GetConfigTRT(); + config1.model_dir = model_dirname; + config1.max_batch_size = batch_size; + + auto predictor0 = + CreatePaddlePredictor(config0); + auto predictor1 = + CreatePaddlePredictor(config1); + // Prepare inputs + int height = 224; + int width = 224; + float *data = new float[batch_size * 3 * height * width]; + memset(data, 0, sizeof(float) * (batch_size * 3 * height * width)); + data[0] = 1.0f; + + // Prepare inputs + PaddleTensor tensor; + tensor.name = "input_0"; + tensor.shape = std::vector({batch_size, 3, height, width}); + tensor.data = PaddleBuf(static_cast(data), + sizeof(float) * (batch_size * 3 * height * width)); + tensor.dtype = PaddleDType::FLOAT32; + std::vector paddle_tensor_feeds(1, tensor); + + // Prepare outputs + std::vector outputs0; + std::vector outputs1; + CHECK(predictor0->Run(paddle_tensor_feeds, &outputs0)); + + CHECK(predictor1->Run(paddle_tensor_feeds, &outputs1, batch_size)); + + // Get output. + ASSERT_EQ(outputs0.size(), 1UL); + ASSERT_EQ(outputs1.size(), 1UL); + + const size_t num_elements = outputs0.front().data.length() / sizeof(float); + const size_t num_elements1 = outputs1.front().data.length() / sizeof(float); + EXPECT_EQ(num_elements, num_elements1); + + auto *data0 = static_cast(outputs0.front().data.data()); + auto *data1 = static_cast(outputs1.front().data.data()); + + ASSERT_GT(num_elements, 0UL); + for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) { + EXPECT_NEAR(data0[i], data1[i], 1e-3); + } +} + +TEST(trt_models_test, main) { + std::vector infer_models = {"mobilenet", "resnet50", + "resnext50"}; + for (auto &model_dir : infer_models) { + CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + model_dir); + } +} +} // namespace paddle diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 283745e977533358ef52521b36e67f0ada950e61..0f13a4ea9c1af175771f5cc201ea5c0a8a0f7555 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -36,6 +36,8 @@ namespace memory { using BuddyAllocator = detail::BuddyAllocator; BuddyAllocator* GetCPUBuddyAllocator() { + // We tried thread_local for inference::RNN1 model, but that not works much + // for multi-thread test. static std::once_flag init_flag; static detail::BuddyAllocator* a = nullptr; @@ -48,6 +50,25 @@ BuddyAllocator* GetCPUBuddyAllocator() { return a; } +// We compared the NaiveAllocator with BuddyAllocator in CPU memory allocation, +// seems they are almost the same overhead. 
+struct NaiveAllocator {
+  void* Alloc(size_t size) { return malloc(size); }
+
+  void Free(void* p) {
+    PADDLE_ENFORCE(p);
+    free(p);
+  }
+
+  static NaiveAllocator* Instance() {
+    static NaiveAllocator x;
+    return &x;
+  }
+
+ private:
+  std::mutex lock_;
+};
+
 template <>
 void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
   VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
diff --git a/paddle/fluid/operators/auc_op.cc b/paddle/fluid/operators/auc_op.cc
index dfaa7456f917c1308984b361afed752f96ea6f59..0784920064a879963cd9725cd9acf4cec7b874ce 100644
--- a/paddle/fluid/operators/auc_op.cc
+++ b/paddle/fluid/operators/auc_op.cc
@@ -36,11 +36,16 @@ class AucOp : public framework::OperatorWithKernel {
                       "Out and Label should have same height.");
 
     int num_pred_buckets = ctx->Attrs().Get<int>("num_thresholds") + 1;
+    int slide_steps = ctx->Attrs().Get<int>("slide_steps");
+
+    PADDLE_ENFORCE_GE(num_pred_buckets, 1,
+                      "num_thresholds must be larger than 1");
+    PADDLE_ENFORCE_GE(slide_steps, 0, "slide_steps must be a natural number");
 
     ctx->SetOutputDim("AUC", {1});
-    ctx->SetOutputDim("BatchAUC", {1});
-    ctx->SetOutputDim("StatPosOut", {num_pred_buckets});
-    ctx->SetOutputDim("StatNegOut", {num_pred_buckets});
+
+    slide_steps = slide_steps == 0 ? 1 : slide_steps;
+    ctx->SetOutputDim("StatPosOut", {slide_steps, num_pred_buckets});
+    ctx->SetOutputDim("StatNegOut", {slide_steps, num_pred_buckets});
   }
 
  protected:
@@ -62,6 +67,7 @@ class AucOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Label",
              "A 2D int tensor indicating the label of the training data. "
              "shape: [batch_size, 1]");
+    // TODO(typhoonzero): support weight input
     AddInput("StatPos", "Statistic value when label = 1");
     AddInput("StatNeg", "Statistic value when label = 0");
 
@@ -69,18 +75,19 @@
     AddOutput("AUC",
               "A scalar representing the "
               "current area-under-the-curve.");
-    AddOutput("BatchAUC", "The AUC for current batch");
+
     AddOutput("StatPosOut", "Statistic value when label = 1");
     AddOutput("StatNegOut", "Statistic value when label = 0");
 
     AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.")
         .SetDefault("ROC");
-    AddAttr<int>("num_thresholds",
-                 "The number of thresholds to use when discretizing the"
-                 " roc curve.")
+    AddAttr<int>(
+        "num_thresholds",
+        "The number of thresholds to use when discretizing the roc curve.")
        .SetDefault((2 << 12) - 1);
-
+    AddAttr<int>("slide_steps", "Use slide steps to calculate the batch AUC.")
+        .SetDefault(1);
     AddComment(R"DOC(
 Area Under The Curve (AUC) Operator.
diff --git a/paddle/fluid/operators/auc_op.h b/paddle/fluid/operators/auc_op.h
index fb0517d70635e090f8c5b59ff9d8420fc34c747b..fb370842d1942c3b3eebecb1fe5e8ffb845cb34b 100644
--- a/paddle/fluid/operators/auc_op.h
+++ b/paddle/fluid/operators/auc_op.h
@@ -32,7 +32,9 @@ class AucKernel : public framework::OpKernel<T> {
     std::string curve = ctx.Attr<std::string>("curve");
     int num_thresholds = ctx.Attr<int>("num_thresholds");
+    // buckets contain numbers from 0 to num_thresholds
     int num_pred_buckets = num_thresholds + 1;
+    int slide_steps = ctx.Attr<int>("slide_steps");
 
     // Only use output var for now, make sure it's persistable and
     // not cleaned up for each batch.
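Before the kernel changes below: StatPosOut/StatNegOut are now [slide_steps, num_pred_buckets] buffers. On every step, statAuc shifts the stored rows up by one, writes the newest batch histogram into the last row, and sums the rows per bucket to obtain the windowed statistics that calcAuc consumes. A distilled, self-contained model of just the row-shifting update (plain vectors and a hypothetical function name; not the kernel itself):

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

// Models the sliding-window bookkeeping in AucKernel::statAuc: `window`
// holds slide_steps rows of num_pred_buckets bucket counts each.
void SlideWindowUpdate(std::vector<int64_t> *window, int slide_steps,
                       int num_pred_buckets,
                       const std::vector<int64_t> &new_batch) {
  const size_t row_bytes =
      static_cast<size_t>(num_pred_buckets) * sizeof(int64_t);
  // Drop the oldest row by shifting every row up one slot.
  for (int s = 1; s < slide_steps; ++s) {
    std::memcpy(window->data() + (s - 1) * num_pred_buckets,
                window->data() + s * num_pred_buckets, row_bytes);
  }
  // The newest batch histogram becomes the last row.
  std::memcpy(window->data() + (slide_steps - 1) * num_pred_buckets,
              new_batch.data(), row_bytes);
}
```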
@@ -40,16 +42,19 @@
     auto *stat_pos = ctx.Output<Tensor>("StatPosOut");
     auto *stat_neg = ctx.Output<Tensor>("StatNegOut");
 
-    auto *stat_pos_data = stat_pos->mutable_data<int64_t>(ctx.GetPlace());
-    auto *stat_neg_data = stat_neg->mutable_data<int64_t>(ctx.GetPlace());
-    calcAuc(ctx, label, predict, stat_pos_data, stat_neg_data, num_thresholds,
-            auc);
+    auto *origin_stat_pos = stat_pos->mutable_data<int64_t>(ctx.GetPlace());
+    auto *origin_stat_neg = stat_neg->mutable_data<int64_t>(ctx.GetPlace());
 
-    auto *batch_auc = ctx.Output<Tensor>("BatchAUC");
-    std::vector<int64_t> stat_pos_batch(num_pred_buckets, 0);
-    std::vector<int64_t> stat_neg_batch(num_pred_buckets, 0);
-    calcAuc(ctx, label, predict, stat_pos_batch.data(), stat_neg_batch.data(),
-            num_thresholds, batch_auc);
+    std::vector<int64_t> stat_pos_data(num_pred_buckets, 0);
+    std::vector<int64_t> stat_neg_data(num_pred_buckets, 0);
+
+    auto stat_pos_calc = stat_pos_data.data();
+    auto stat_neg_calc = stat_neg_data.data();
+
+    statAuc(label, predict, num_pred_buckets, num_thresholds, slide_steps,
+            origin_stat_pos, origin_stat_neg, &stat_pos_calc, &stat_neg_calc);
+
+    calcAuc(ctx, stat_pos_calc, stat_neg_calc, num_thresholds, auc);
   }
 
  private:
@@ -58,29 +63,76 @@
     return (X1 > X2 ? (X1 - X2) : (X2 - X1)) * (Y1 + Y2) / 2.0;
   }
 
-  inline static void calcAuc(const framework::ExecutionContext &ctx,
-                             const framework::Tensor *label,
+  inline static void statAuc(const framework::Tensor *label,
                              const framework::Tensor *predict,
-                             int64_t *stat_pos, int64_t *stat_neg,
-                             int num_thresholds,
-                             framework::Tensor *auc_tensor) {
+                             const int num_pred_buckets,
+                             const int num_thresholds, const int slide_steps,
+                             int64_t *origin_stat_pos, int64_t *origin_stat_neg,
+                             int64_t **stat_pos, int64_t **stat_neg) {
     size_t batch_size = predict->dims()[0];
     size_t inference_width = predict->dims()[1];
     const T *inference_data = predict->data<T>();
     const auto *label_data = label->data<int64_t>();
 
-    auto *auc = auc_tensor->mutable_data<double>(ctx.GetPlace());
-
     for (size_t i = 0; i < batch_size; i++) {
       uint32_t binIdx = static_cast<uint32_t>(
          inference_data[i * inference_width + 1] * num_thresholds);
       if (label_data[i]) {
-        stat_pos[binIdx] += 1.0;
+        (*stat_pos)[binIdx] += 1.0;
       } else {
-        stat_neg[binIdx] += 1.0;
+        (*stat_neg)[binIdx] += 1.0;
       }
     }
+
+    int bucket_length = num_pred_buckets * sizeof(int64_t);
+
+    // When slide_steps is 0, the statistics are accumulated over all batches
+    // without a sliding window.
+ if (slide_steps == 0) { + for (int slide = 0; slide < num_pred_buckets; ++slide) { + origin_stat_pos[slide] += (*stat_pos)[slide]; + origin_stat_neg[slide] += (*stat_neg)[slide]; + } + + *stat_pos = origin_stat_pos; + *stat_neg = origin_stat_neg; + + } else { + for (int slide = 1; slide < slide_steps; ++slide) { + int dst_idx = (slide - 1) * num_pred_buckets; + int src_inx = slide * num_pred_buckets; + std::memcpy(origin_stat_pos + dst_idx, origin_stat_pos + src_inx, + bucket_length); + std::memcpy(origin_stat_neg + dst_idx, origin_stat_neg + src_inx, + bucket_length); + } + + std::memcpy(origin_stat_pos + (slide_steps - 1) * num_pred_buckets, + *stat_pos, bucket_length); + std::memcpy(origin_stat_neg + (slide_steps - 1) * num_pred_buckets, + *stat_neg, bucket_length); + + std::memset(*stat_pos, 0, bucket_length); + std::memset(*stat_neg, 0, bucket_length); + + for (int slide = 0; slide < num_pred_buckets; ++slide) { + int stat_pos_steps = 0; + int stat_neg_steps = 0; + for (int step = 0; step < slide_steps; ++step) { + stat_pos_steps += origin_stat_pos[slide + step * num_pred_buckets]; + stat_neg_steps += origin_stat_neg[slide + step * num_pred_buckets]; + } + (*stat_pos)[slide] += stat_pos_steps; + (*stat_neg)[slide] += stat_neg_steps; + } + } + } + + inline static void calcAuc(const framework::ExecutionContext &ctx, + int64_t *stat_pos, int64_t *stat_neg, + int num_thresholds, + framework::Tensor *auc_tensor) { + auto *auc = auc_tensor->mutable_data(ctx.GetPlace()); + *auc = 0.0f; double totPos = 0.0; @@ -96,7 +148,6 @@ class AucKernel : public framework::OpKernel { totPos += stat_pos[idx]; totNeg += stat_neg[idx]; *auc += trapezoidArea(totNeg, totNegPrev, totPos, totPosPrev); - --idx; } diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 5a058ddbc59c6135bacf7c2dc4b5c8b687f9b2b1..aa8ed502fc94bd0970dfe5dbf00ef090e799ad30 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -30,7 +30,13 @@ detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc polygon_box_transform_op.cu) detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc) detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc) -detection_library(generate_proposals_op SRCS generate_proposals_op.cc) + +if(WITH_GPU) + detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub) +else() + detection_library(generate_proposals_op SRCS generate_proposals_op.cc) +endif() + detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu) #Export local libraries to parent set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE) diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index c33aa255362bc5234f2813fb93e70c943b03c33f..818d58ea9ee327fd99182ad2f8cbeed07e6aaea2 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/math/math_function.h" @@ -69,7 +70,7 @@ class GenerateProposalsOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Anchors")->type()), - platform::CPUPlace()); + ctx.device_context()); } }; @@ -162,7 +163,7 @@ void FilterBoxes(const platform::DeviceContext &ctx, Tensor *boxes, const T *im_info_data = im_info.data(); T *boxes_data = boxes->mutable_data(ctx.GetPlace()); T im_scale = im_info_data[2]; - keep->Resize({boxes->dims()[0], 1}); + keep->Resize({boxes->dims()[0]}); min_size = std::max(min_size, 1.0f); int *keep_data = keep->mutable_data(ctx.GetPlace()); @@ -463,7 +464,7 @@ class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("post_nms_topN", "post_nms_topN"); AddAttr("nms_thresh", "nms_thres"); AddAttr("min_size", "min size"); - AddAttr("eta", "eta"); + AddAttr("eta", "The parameter for adaptive NMS."); AddComment(R"DOC( Generate Proposals OP diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..6146ff509d768c0317a5c65ed22af1a3075977a2 --- /dev/null +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -0,0 +1,449 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include "cub/cub.cuh" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/operators/gather.cu.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +namespace { + +#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +int const kThreadsPerBlock = sizeof(uint64_t) * 8; + +template +__global__ void RangeInitKernel(const T start, const T delta, const int size, + T *out) { + CUDA_1D_KERNEL_LOOP(i, size) { out[i] = start + i * delta; } +} + +template +void SortDescending(const platform::CUDADeviceContext &ctx, const Tensor &value, + Tensor *value_out, Tensor *index_out) { + int num = value.numel(); + Tensor index_in_t; + int *idx_in = index_in_t.mutable_data({num}, ctx.GetPlace()); + int block = 512; + auto stream = ctx.stream(); + RangeInitKernel<<>>(0, 1, num, idx_in); + int *idx_out = index_out->mutable_data({num}, ctx.GetPlace()); + + const T *keys_in = value.data(); + T *keys_out = value_out->mutable_data({num}, ctx.GetPlace()); + + // Determine temporary device storage requirements + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortPairsDescending( + d_temp_storage, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, + num); + + // Allocate temporary storage + auto place = boost::get(ctx.GetPlace()); + d_temp_storage = memory::Alloc(place, temp_storage_bytes); + + // Run sorting operation + cub::DeviceRadixSort::SortPairsDescending( + d_temp_storage, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, + num); + + memory::Free(place, d_temp_storage); +} + +template +__device__ __forceinline__ T Min(T x, T y) { + return x < y ? x : y; +} + +template +__device__ __forceinline__ T Max(T x, T y) { + return x > y ? 
x : y; +} + +template +__global__ void BoxDecodeAndClipKernel(const T *anchor, const T *deltas, + const T *var, const int *index, + const T *im_info, const int num, + T *proposals) { + T kBBoxClipDefault = log(1000.0 / 16.0); + CUDA_1D_KERNEL_LOOP(i, num) { + int k = index[i] * 4; + T axmin = anchor[k]; + T aymin = anchor[k + 1]; + T axmax = anchor[k + 2]; + T aymax = anchor[k + 3]; + + T w = axmax - axmin + 1.0; + T h = aymax - aymin + 1.0; + T cx = axmin + 0.5 * w; + T cy = aymin + 0.5 * h; + + T dxmin = deltas[k]; + T dymin = deltas[k + 1]; + T dxmax = deltas[k + 2]; + T dymax = deltas[k + 3]; + + T d_cx = 0., d_cy = 0., d_w = 0., d_h = 0.; + if (var) { + d_cx = cx + dxmin * w * var[k]; + d_cy = cy + dymin * h * var[k + 1]; + d_w = exp(Min(dxmax * var[k + 2], kBBoxClipDefault)) * w; + d_h = exp(Min(dymax * var[k + 3], kBBoxClipDefault)) * h; + } else { + d_cx = cx + dxmin * w; + d_cy = cy + dymin * h; + d_w = exp(Min(dxmax, kBBoxClipDefault)) * w; + d_h = exp(Min(dymax, kBBoxClipDefault)) * h; + } + + T oxmin = d_cx - d_w * 0.5; + T oymin = d_cy - d_h * 0.5; + T oxmax = d_cx + d_w * 0.5 - 1.; + T oymax = d_cy + d_h * 0.5 - 1.; + + proposals[i * 4] = Max(Min(oxmin, im_info[1] - 1.), 0.); + proposals[i * 4 + 1] = Max(Min(oymin, im_info[0] - 1.), 0.); + proposals[i * 4 + 2] = Max(Min(oxmax, im_info[1] - 1.), 0.); + proposals[i * 4 + 3] = Max(Min(oymax, im_info[0] - 1.), 0.); + } +} + +template +__global__ void FilterBBoxes(const T *bboxes, const T *im_info, + const T min_size, const int num, int *keep_num, + int *keep) { + T im_h = im_info[0]; + T im_w = im_info[1]; + T im_scale = im_info[2]; + + int cnt = 0; + __shared__ int keep_index[BlockSize]; + + CUDA_1D_KERNEL_LOOP(i, num) { + keep_index[threadIdx.x] = -1; + __syncthreads(); + + int k = i * 4; + T xmin = bboxes[k]; + T ymin = bboxes[k + 1]; + T xmax = bboxes[k + 2]; + T ymax = bboxes[k + 3]; + + T w = xmax - xmin + 1.0; + T h = ymax - ymin + 1.0; + T cx = xmin + w / 2.; + T cy = ymin + h / 2.; + + T w_s = (xmax - xmin) / im_scale + 1.; + T h_s = (ymax - ymin) / im_scale + 1.; + + if (w_s >= min_size && h_s >= min_size && cx <= im_w && cy <= im_h) { + keep_index[threadIdx.x] = i; + } + __syncthreads(); + if (threadIdx.x == 0) { + int size = (num - i) < BlockSize ? 
+
+__device__ inline float IoU(const float *a, const float *b) {
+  float left = max(a[0], b[0]), right = min(a[2], b[2]);
+  float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
+  float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
+  float inter_s = width * height;
+  float s_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
+  float s_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
+  return inter_s / (s_a + s_b - inter_s);
+}
+
+__global__ void NMSKernel(const int n_boxes, const float nms_overlap_thresh,
+                          const float *dev_boxes, uint64_t *dev_mask) {
+  const int row_start = blockIdx.y;
+  const int col_start = blockIdx.x;
+
+  const int row_size =
+      min(n_boxes - row_start * kThreadsPerBlock, kThreadsPerBlock);
+  const int col_size =
+      min(n_boxes - col_start * kThreadsPerBlock, kThreadsPerBlock);
+
+  __shared__ float block_boxes[kThreadsPerBlock * 4];
+  if (threadIdx.x < col_size) {
+    block_boxes[threadIdx.x * 4 + 0] =
+        dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 0];
+    block_boxes[threadIdx.x * 4 + 1] =
+        dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 1];
+    block_boxes[threadIdx.x * 4 + 2] =
+        dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 2];
+    block_boxes[threadIdx.x * 4 + 3] =
+        dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 3];
+  }
+  __syncthreads();
+
+  if (threadIdx.x < row_size) {
+    const int cur_box_idx = kThreadsPerBlock * row_start + threadIdx.x;
+    const float *cur_box = dev_boxes + cur_box_idx * 4;
+    int i = 0;
+    uint64_t t = 0;
+    int start = 0;
+    if (row_start == col_start) {
+      start = threadIdx.x + 1;
+    }
+    for (i = start; i < col_size; i++) {
+      if (IoU(cur_box, block_boxes + i * 4) > nms_overlap_thresh) {
+        t |= 1ULL << i;
+      }
+    }
+    const int col_blocks = DIVUP(n_boxes, kThreadsPerBlock);
+    dev_mask[cur_box_idx * col_blocks + col_start] = t;
+  }
+}
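The kernel above only fills a bitmask: in the row for box i, bit j of column block c records that box i overlaps the j-th box of block c beyond the threshold. The greedy selection itself happens on the host, in NMS() just below. A plain C++ sketch of that mask walk (assuming boxes are already sorted by descending score, as in this operator):

```cpp
#include <cstdint>
#include <vector>

// mask has n_boxes rows of col_blocks 64-bit words, as written by the kernel.
std::vector<int> GreedyKeep(const std::vector<uint64_t> &mask, int n_boxes) {
  const int kBits = 64;
  const int col_blocks = (n_boxes + kBits - 1) / kBits;  // words per row
  std::vector<uint64_t> removed(col_blocks, 0);
  std::vector<int> keep;
  for (int i = 0; i < n_boxes; ++i) {
    // Skip boxes already suppressed by a higher-scoring kept box.
    if (removed[i / kBits] & (1ULL << (i % kBits))) continue;
    keep.push_back(i);
    // Everything box i overlaps too much is suppressed from now on.
    for (int j = 0; j < col_blocks; ++j) removed[j] |= mask[i * col_blocks + j];
  }
  return keep;
}
```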
+
+template <typename T>
+void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals,
+         const Tensor &sorted_indices, const T nms_threshold,
+         Tensor *keep_out) {
+  int boxes_num = proposals.dims()[0];
+  PADDLE_ENFORCE_EQ(boxes_num, sorted_indices.dims()[0]);
+
+  const int col_blocks = DIVUP(boxes_num, kThreadsPerBlock);
+  dim3 blocks(DIVUP(boxes_num, kThreadsPerBlock),
+              DIVUP(boxes_num, kThreadsPerBlock));
+  dim3 threads(kThreadsPerBlock);
+
+  const T *boxes = proposals.data<T>();
+  auto place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+  int size_bytes = boxes_num * col_blocks * sizeof(uint64_t);
+  uint64_t *d_mask =
+      reinterpret_cast<uint64_t *>(memory::Alloc(place, size_bytes));
+  NMSKernel<<<blocks, threads>>>(boxes_num, nms_threshold, boxes, d_mask);
+  uint64_t *h_mask = reinterpret_cast<uint64_t *>(
+      memory::Alloc(platform::CPUPlace(), size_bytes));
+  memory::Copy(platform::CPUPlace(), h_mask, place, d_mask, size_bytes, 0);
+
+  std::vector<uint64_t> remv(col_blocks);
+  memset(&remv[0], 0, sizeof(uint64_t) * col_blocks);
+
+  std::vector<int> keep_vec;
+  int num_to_keep = 0;
+  for (int i = 0; i < boxes_num; i++) {
+    int nblock = i / kThreadsPerBlock;
+    int inblock = i % kThreadsPerBlock;
+
+    if (!(remv[nblock] & (1ULL << inblock))) {
+      ++num_to_keep;
+      keep_vec.push_back(i);
+      uint64_t *p = &h_mask[0] + i * col_blocks;
+      for (int j = nblock; j < col_blocks; j++) {
+        remv[j] |= p[j];
+      }
+    }
+  }
+  int *keep = keep_out->mutable_data<int>({num_to_keep}, ctx.GetPlace());
+  memory::Copy(place, keep, platform::CPUPlace(), keep_vec.data(),
+               sizeof(int) * num_to_keep, 0);
+  memory::Free(place, d_mask);
+  memory::Free(platform::CPUPlace(), h_mask);
+}
+
+template <typename T>
+std::pair<Tensor, Tensor> ProposalForOneImage(
+    const platform::CUDADeviceContext &ctx, const Tensor &im_info,
+    const Tensor &anchors, const Tensor &variances,
+    const Tensor &bbox_deltas,  // [M, 4]
+    const Tensor &scores,       // [N, 1]
+    int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size,
+    float eta) {
+  // 1. pre nms
+  Tensor scores_sort, index_sort;
+  SortDescending<T>(ctx, scores, &scores_sort, &index_sort);
+  int num = scores.numel();
+  int pre_nms_num = (pre_nms_top_n <= 0 || pre_nms_top_n > num) ? scores.numel()
+                                                                : pre_nms_top_n;
+  scores_sort.Resize({pre_nms_num, 1});
+  index_sort.Resize({pre_nms_num, 1});
+
+  // 2. box decode and clipping
+  Tensor proposals;
+  proposals.mutable_data<T>({pre_nms_num, 4}, ctx.GetPlace());
+  int block = 512;
+  auto stream = ctx.stream();
+  BoxDecodeAndClipKernel<T><<<DIVUP(pre_nms_num, block), block, 0, stream>>>(
+      anchors.data<T>(), bbox_deltas.data<T>(), variances.data<T>(),
+      index_sort.data<int>(), im_info.data<T>(), pre_nms_num,
+      proposals.data<T>());
+
+  // 3. filter
+  Tensor keep_index, keep_num_t;
+  keep_index.mutable_data<int>({pre_nms_num}, ctx.GetPlace());
+  keep_num_t.mutable_data<int>({1}, ctx.GetPlace());
+  min_size = std::max(min_size, 1.0f);
+  FilterBBoxes<T, 512><<<1, 512, 0, stream>>>(
+      proposals.data<T>(), im_info.data<T>(), min_size, pre_nms_num,
+      keep_num_t.data<int>(), keep_index.data<int>());
+  int keep_num;
+  const auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+  memory::Copy(platform::CPUPlace(), &keep_num, gpu_place,
+               keep_num_t.data<int>(), sizeof(int), 0);
+  keep_index.Resize({keep_num});
+
+  Tensor scores_filter, proposals_filter;
+  proposals_filter.mutable_data<T>({keep_num, 4}, ctx.GetPlace());
+  scores_filter.mutable_data<T>({keep_num, 1}, ctx.GetPlace());
+  GPUGather<T>(ctx, proposals, keep_index, &proposals_filter);
+  GPUGather<T>(ctx, scores_sort, keep_index, &scores_filter);
+
+  if (nms_thresh <= 0) {
+    return std::make_pair(proposals_filter, scores_filter);
+  }
+
+  // 4. nms
+  Tensor keep_nms;
+  NMS<T>(ctx, proposals_filter, keep_index, nms_thresh, &keep_nms);
+  if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) {
+    keep_nms.Resize({post_nms_top_n});
+  }
+
+  Tensor scores_nms, proposals_nms;
+  proposals_nms.mutable_data<T>({keep_nms.numel(), 4}, ctx.GetPlace());
+  scores_nms.mutable_data<T>({keep_nms.numel(), 1}, ctx.GetPlace());
+  GPUGather<T>(ctx, proposals_filter, keep_nms, &proposals_nms);
+  GPUGather<T>(ctx, scores_filter, keep_nms, &scores_nms);
+
+  return std::make_pair(proposals_nms, scores_nms);
+}
+}  // namespace
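The GPUGather calls above compact rows by an index tensor. Their semantics, restated in plain C++ for a row-major [n, width] buffer (a sketch of the behaviour only; the real implementation is a device kernel):

```cpp
#include <vector>

// Build a new [index.size(), width] buffer whose i-th row is src row index[i].
std::vector<float> GatherRows(const std::vector<float> &src, int width,
                              const std::vector<int> &index) {
  std::vector<float> out(index.size() * width);
  for (size_t i = 0; i < index.size(); ++i)
    for (int c = 0; c < width; ++c)
      out[i * width + c] = src[index[i] * width + c];
  return out;
}
```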
nms + Tensor keep_nms; + NMS(ctx, proposals_filter, keep_index, nms_thresh, &keep_nms); + if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) { + keep_nms.Resize({post_nms_top_n}); + } + + Tensor scores_nms, proposals_nms; + proposals_nms.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); + scores_nms.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); + GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); + GPUGather(ctx, scores_filter, keep_nms, &scores_nms); + + return std::make_pair(proposals_nms, scores_nms); +} +} // namespace + +template +class CUDAGenerateProposalsKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *scores = context.Input("Scores"); + auto *bbox_deltas = context.Input("BboxDeltas"); + auto *im_info = context.Input("ImInfo"); + auto *anchors = context.Input("Anchors"); + auto *variances = context.Input("Variances"); + + auto *rpn_rois = context.Output("RpnRois"); + auto *rpn_roi_probs = context.Output("RpnRoiProbs"); + + int pre_nms_top_n = context.Attr("pre_nms_topN"); + int post_nms_top_n = context.Attr("post_nms_topN"); + float nms_thresh = context.Attr("nms_thresh"); + float min_size = context.Attr("min_size"); + float eta = context.Attr("eta"); + PADDLE_ENFORCE_GE(eta, 1., "Not support adaptive NMS."); + + auto &dev_ctx = context.template device_context(); + + auto scores_dim = scores->dims(); + int64_t num = scores_dim[0]; + int64_t c_score = scores_dim[1]; + int64_t h_score = scores_dim[2]; + int64_t w_score = scores_dim[3]; + + auto bbox_dim = bbox_deltas->dims(); + int64_t c_bbox = bbox_dim[1]; + int64_t h_bbox = bbox_dim[2]; + int64_t w_bbox = bbox_dim[3]; + + Tensor bbox_deltas_swap, scores_swap; + bbox_deltas_swap.mutable_data({num, h_bbox, w_bbox, c_bbox}, + dev_ctx.GetPlace()); + scores_swap.mutable_data({num, h_score, w_score, c_score}, + dev_ctx.GetPlace()); + + math::Transpose trans; + std::vector axis = {0, 2, 3, 1}; + trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis); + trans(dev_ctx, *scores, &scores_swap, axis); + + Tensor *anchor = const_cast(anchors); + anchor->Resize({anchors->numel() / 4, 4}); + Tensor *var = const_cast(variances); + var->Resize({var->numel() / 4, 4}); + + rpn_rois->mutable_data({bbox_deltas->numel() / 4, 4}, + context.GetPlace()); + rpn_roi_probs->mutable_data({scores->numel(), 1}, context.GetPlace()); + + T *rpn_rois_data = rpn_rois->data(); + T *rpn_roi_probs_data = rpn_roi_probs->data(); + + auto place = boost::get(dev_ctx.GetPlace()); + + int64_t num_proposals = 0; + std::vector offset(1, 0); + for (int64_t i = 0; i < num; ++i) { + Tensor im_info_slice = im_info->Slice(i, i + 1); + Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); + Tensor scores_slice = scores_swap.Slice(i, i + 1); + + bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4}); + scores_slice.Resize({h_score * w_score * c_score, 1}); + + std::pair box_score_pair = + ProposalForOneImage(dev_ctx, im_info_slice, *anchor, *var, + bbox_deltas_slice, scores_slice, pre_nms_top_n, + post_nms_top_n, nms_thresh, min_size, eta); + + Tensor proposals = box_score_pair.first; + Tensor scores = box_score_pair.second; + + memory::Copy(place, rpn_rois_data + num_proposals * 4, place, + proposals.data(), sizeof(T) * proposals.numel(), 0); + memory::Copy(place, rpn_roi_probs_data + num_proposals, place, + scores.data(), sizeof(T) * scores.numel(), 0); + num_proposals += proposals.dims()[0]; + offset.emplace_back(num_proposals); + } + framework::LoD lod; + 
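The kernel registered above concatenates every image's proposals into one tensor and records the per-image boundaries as a level-0 LoD, a running sum of proposal counts. A small sketch of that offset construction (illustrative, not Paddle API):

```cpp
#include <cstddef>
#include <vector>

// Image i owns rows [offset[i], offset[i+1]) of the concatenated RpnRois.
std::vector<size_t> BuildLoD(const std::vector<size_t> &counts) {
  std::vector<size_t> offset(1, 0);
  for (size_t c : counts) offset.push_back(offset.back() + c);
  return offset;  // e.g. counts {300, 240} -> offsets {0, 300, 540}
}
```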
diff --git a/paddle/fluid/operators/detection_map_op.h b/paddle/fluid/operators/detection_map_op.h
index dd1ab85fd8d0c8170afcd9dd2a49ee55c41dc8be..dd5d138a1e979826d59c4731920379b030e3b492 100644
--- a/paddle/fluid/operators/detection_map_op.h
+++ b/paddle/fluid/operators/detection_map_op.h
@@ -76,8 +76,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
     auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type"));
     int class_num = ctx.Attr<int>("class_num");
 
-    auto label_lod = in_label->lod();
-    auto detect_lod = in_detect->lod();
+    auto& label_lod = in_label->lod();
+    auto& detect_lod = in_detect->lod();
     PADDLE_ENFORCE_EQ(label_lod.size(), 1UL,
                       "Only support one level sequence now.");
     PADDLE_ENFORCE_EQ(label_lod[0].size(), detect_lod[0].size(),
@@ -166,11 +166,11 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
     auto labels = framework::EigenTensor<T, 2>::From(input_label);
     auto detect = framework::EigenTensor<T, 2>::From(input_detect);
 
-    auto label_lod = input_label.lod();
-    auto detect_lod = input_detect.lod();
+    auto& label_lod = input_label.lod();
+    auto& detect_lod = input_detect.lod();
 
     int batch_size = label_lod[0].size() - 1;
-    auto label_index = label_lod[0];
+    auto& label_index = label_lod[0];
 
     for (int n = 0; n < batch_size; ++n) {
       std::map<int, std::vector<Box>> boxes;
@@ -274,7 +274,6 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
 
     output_true_pos->set_lod(true_pos_lod);
     output_false_pos->set_lod(false_pos_lod);
-    return;
   }
 
   void GetInputPos(const framework::Tensor& input_pos_count,
@@ -292,7 +291,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
     auto SetData = [](const framework::LoDTensor& pos_tensor,
                       std::map<int, std::vector<std::pair<T, int>>>& pos) {
       const T* pos_data = pos_tensor.data<T>();
-      auto pos_data_lod = pos_tensor.lod()[0];
+      auto& pos_data_lod = pos_tensor.lod()[0];
       for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) {
         for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) {
           T score = pos_data[j * 2];
@@ -317,20 +316,23 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
       std::map<int, std::vector<std::pair<T, int>>>* false_pos) const {
     int batch_size = gt_boxes.size();
     for (int n = 0; n < batch_size; ++n) {
-      auto image_gt_boxes = gt_boxes[n];
-      for (auto it = image_gt_boxes.begin(); it != image_gt_boxes.end(); ++it) {
+      auto& image_gt_boxes = gt_boxes[n];
+      for (auto& image_gt_box : image_gt_boxes) {
         size_t count = 0;
-        auto labeled_bboxes = it->second;
+        auto& labeled_bboxes = image_gt_box.second;
         if (evaluate_difficult) {
           count = labeled_bboxes.size();
         } else {
-          for (size_t i = 0; i < labeled_bboxes.size(); ++i)
-            if (!(labeled_bboxes[i].is_difficult)) ++count;
+          for (auto& box : labeled_bboxes) {
+            if (!box.is_difficult) {
+              ++count;
+            }
+          }
         }
         if (count == 0) {
           continue;
         }
-        int label = it->first;
+        int label = image_gt_box.first;
         if (label_pos_count->find(label) == label_pos_count->end()) {
           (*label_pos_count)[label] = count;
         } else {
diff --git a/paddle/fluid/operators/extract_rows_op.cc b/paddle/fluid/operators/extract_rows_op.cc
index 9a297d03cfb041e584159a5fc5ba214f8ac404b4..3acae3bcdf4a509ab6e7e19f21c4b2ec4d72b7d7 100644
--- a/paddle/fluid/operators/extract_rows_op.cc
+++ b/paddle/fluid/operators/extract_rows_op.cc
@@ -50,7
+50,7 @@ class ExtractRowsOp : public framework::OperatorBase { auto &in = scope.FindVar(Input("X"))->Get(); auto out = scope.FindVar(Output("Out"))->GetMutable(); - auto in_rows = in.rows(); + auto &in_rows = in.rows(); auto out_dim = framework::make_ddim( std::vector{static_cast(in_rows.size()), 1}); auto dst_ptr = out->mutable_data(out_dim, in.place()); diff --git a/paddle/fluid/operators/fusion_lstm_op.cc b/paddle/fluid/operators/fusion_lstm_op.cc index 8ca79d20ec4f6412b00dbf3990068f81b65e2efd..23e8edd18d037a7f9127482951f25be3abf1b62f 100644 --- a/paddle/fluid/operators/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fusion_lstm_op.cc @@ -76,12 +76,18 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2."); PADDLE_ENFORCE_EQ(b_dims[0], 1, "The first dimension of Input(Bias) should be 1."); - PADDLE_ENFORCE_EQ( - b_dims[1], (ctx->Attrs().Get("use_peepholes") ? 7 : 4) * frame_size, - "The second dimension of Input(Bias) should be " - "7 * %d if enable peepholes connection or" - "4 * %d if disable peepholes", - frame_size, frame_size); + if (ctx->Attrs().Get("use_peepholes")) { + PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size, + "The second dimension of Input(Bias) should be " + "7 * %d if enable peepholes connection", + frame_size); + ctx->SetOutputDim("CheckedCell", {2, frame_size}); + } else { + PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size, + "The second dimension of Input(Bias) should be " + "4 * %d if disable peepholes", + frame_size); + } framework::DDim out_dims({x_dims[0], frame_size}); ctx->SetOutputDim("Hidden", out_dims); @@ -173,6 +179,8 @@ void FusionLSTMOpMaker::Make() { AddOutput("BatchedCell", "(LoDTensor) (T x D).").AsIntermediate(); AddOutput("ReorderedH0", "(LoDTensor) (N x D).").AsIntermediate(); AddOutput("ReorderedC0", "(LoDTensor) (N x D).").AsIntermediate(); + AddOutput("CheckedCell", "(Tensor) (2 x D) only for peephole.") + .AsIntermediate(); AddAttr("use_peepholes", "(bool, defalut: True) " "whether to enable diagonal/peephole connections.") @@ -250,19 +258,19 @@ class FuisonLSTMKernel : public framework::OpKernel { const int D3 = D * 3; \ const int D4 = wh_dims[1]; -#define INIT_BASE_INPUT_DATAS \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - /* diagonal weight*/ \ - const T* wc_data = bias->data() + D4; \ - /* for peephole only*/ \ - Tensor checked_cell; \ - T* checked_cell_data = nullptr; \ - auto place = ctx.GetPlace(); \ - if (use_peepholes) { \ - /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ - checked_cell_data = checked_cell.mutable_data({2, D}, place); \ +#define INIT_BASE_INPUT_DATAS \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + /* diagonal weight*/ \ + const T* wc_data = bias->data() + D4; \ + /* for peephole only*/ \ + T* checked_cell_data = nullptr; \ + auto place = ctx.GetPlace(); \ + if (use_peepholes) { \ + /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ + auto* checked_cell = ctx.Output("CheckedCell"); \ + checked_cell_data = checked_cell->mutable_data(place); \ } /// Compute LSTM diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 74823dab09cac358f647c074ac2f2ee2fed17e55..abd5dce8f7e7146a1671a387328c177e5e6e0a85 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -127,10 +127,8 @@ class LookupTableGradCUDAKernel : public 
framework::OpKernel { auto gpu_place = boost::get(context.GetPlace()); // TODO(yuyang18): Strange code here. - memory::Copy(platform::CPUPlace(), - new_rows.CUDAMutableData(context.GetPlace()), gpu_place, - ids_data, ids_num * sizeof(int64_t), stream); - + memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()), + gpu_place, ids_data, ids_num * sizeof(int64_t), stream); d_table->set_rows(new_rows); auto *d_table_value = d_table->mutable_value(); diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index b27880c232a51d32777569cf9ac67656ce02f232..ba8eccf82042b679f69a32f9d053f05ac8fb9a99 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -60,11 +60,9 @@ struct SelectedRowsAdd { auto out_place = context.GetPlace(); PADDLE_ENFORCE(platform::is_gpu_place(out_place)); - memory::Copy( - boost::get(out_place), out_data, - boost::get(in1_place), in1_data, - in1_value.numel() * sizeof(T), - reinterpret_cast(context).stream()); + memory::Copy(boost::get(out_place), out_data, + boost::get(in1_place), in1_data, + in1_value.numel() * sizeof(T), context.stream()); auto* in2_data = in2_value.data(); memory::Copy(boost::get(out_place), @@ -148,7 +146,7 @@ struct SelectedRowsAddTo { auto in1_height = input1.height(); PADDLE_ENFORCE_EQ(in1_height, input2->height()); - framework::Vector in1_rows(input1.rows()); + auto& in1_rows = input1.rows(); auto& in2_rows = *(input2->mutable_rows()); auto& in1_value = input1.value(); diff --git a/paddle/fluid/operators/sampling_id_op.cc b/paddle/fluid/operators/sampling_id_op.cc index 724463c95c4a29fb5c00fe791b389d3908771640..a4f41a170426a4650fd3bf8f7fec4758ff34e1b9 100644 --- a/paddle/fluid/operators/sampling_id_op.cc +++ b/paddle/fluid/operators/sampling_id_op.cc @@ -53,15 +53,16 @@ class SamplingIdOpMaker : public framework::OpProtoAndCheckerMaker { SamplingId Operator. A layer for sampling id from multinomial distribution from the input. Sampling one id for one sample.)DOC"); - AddAttr("min", "Minimum value of random. [default 0.0].") + AddAttr("min", "Minimum value of random. (float, default 0.0).") .SetDefault(0.0f); - AddAttr("max", "Maximun value of random. [default 1.0].") + AddAttr("max", "Maximun value of random. (float, default 1.0).") .SetDefault(1.0f); - AddAttr("seed", - "Random seed used for the random number engine. " - "0 means use a seed generated by the system." - "Note that if seed is not 0, this operator will always " - "generate the same random numbers every time. [default 0].") + AddAttr( + "seed", + "Random seed used for the random number engine. " + "0 means use a seed generated by the system." + "Note that if seed is not 0, this operator will always " + "generate the same random numbers every time. 
(int, default 0).") .SetDefault(0); } }; diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index bf4df4f600c14050b636b7ee6d7b6973b57adb94..981969d2aaa684731a615ec64ca7f7718b35cf09 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -77,8 +77,10 @@ class ScaleOpVarTypeInference : public framework::VarTypeInference { auto out_var_name = op_desc.Output("Out").front(); auto *out_var = block->FindVarRecursive(out_var_name); - out_var->SetType(in_var.GetType()); - out_var->SetDataType(in_var.GetDataType()); + if (in_var_name != out_var_name) { + out_var->SetType(in_var.GetType()); + out_var->SetDataType(in_var.GetDataType()); + } } }; diff --git a/paddle/fluid/operators/sequence_slice_op.h b/paddle/fluid/operators/sequence_slice_op.h index b5ea6ff49bbb29571f9a6ef6358ef881acd9be9e..03b59d71cc0ca2eddd1d9912e7ca25348507ba03 100644 --- a/paddle/fluid/operators/sequence_slice_op.h +++ b/paddle/fluid/operators/sequence_slice_op.h @@ -75,11 +75,11 @@ class SequenceSliceOpKernel : public framework::OpKernel { } for (size_t i = 0; i < n; ++i) { - PADDLE_ENFORCE_LT(0, offset_data[i], + PADDLE_ENFORCE_LE(0, offset_data[i], "The offset[%d] must greater than zero.", i); PADDLE_ENFORCE_LT(0, length_data[i], "The length[%d] must greater than zero.", i); - PADDLE_ENFORCE_LT(lod[0][i] + offset_data[i] + length_data[i], + PADDLE_ENFORCE_LE(lod[0][i] + offset_data[i] + length_data[i], lod[0][i + 1], "The target tensor's length overflow."); } diff --git a/paddle/fluid/operators/sgd_op.cu b/paddle/fluid/operators/sgd_op.cu index 4722be7a666d3e8f3c25c9499f88ddda835f60e3..243609075713305a90dc162991166ba24d54e835 100644 --- a/paddle/fluid/operators/sgd_op.cu +++ b/paddle/fluid/operators/sgd_op.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#define EIGEN_USE_GPU +#include #include "paddle/fluid/operators/sgd_op.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -33,22 +33,21 @@ __global__ void SGDKernel(const T* g, const T* p, const T* learning_rate, } } -template +template __global__ void SparseSGDFunctorKernel(const T* selected_rows, const int64_t* rows, const T* learning_rate, T* tensor_out, - int64_t row_numel) { - const int ty = blockIdx.y; - int tid = threadIdx.x; - - selected_rows += ty * row_numel; - tensor_out += rows[ty] * row_numel; - - for (int index = tid; index < row_numel; index += block_size) { - // Since index in rows of SelectedRows can be duplicate, we have to use - // Atomic Operation to avoid concurrent write error. - paddle::platform::CudaAtomicAdd( - tensor_out + index, -1.0 * learning_rate[0] * selected_rows[index]); + int64_t row_numel, int64_t limit) { + for (int64_t i = blockIdx.x; i < limit; i += gridDim.x) { + const T* selected_rows_ptr = selected_rows + i * row_numel; + T* tensor_out_ptr = tensor_out + rows[i] * row_numel; + for (int64_t index = threadIdx.x; index < row_numel; index += blockDim.x) { + // Since index in rows of SelectedRows can be duplicate, we have to use + // Atomic Operation to avoid concurrent write error. 
+ paddle::platform::CudaAtomicAdd( + tensor_out_ptr + index, + -1.0 * learning_rate[0] * selected_rows_ptr[index]); + } } } } // namespace @@ -89,7 +88,7 @@ class SGDOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(in_height, out_dims[0]); auto& in_value = grad->value(); - framework::Vector in_rows(grad->rows()); + auto& in_rows = grad->rows(); int64_t in_row_numel = in_value.numel() / in_rows.size(); PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height); @@ -97,13 +96,15 @@ class SGDOpCUDAKernel : public framework::OpKernel { auto* in_data = in_value.data(); auto* out_data = param_out->data(); - const int block_size = 256; - dim3 threads(block_size, 1); - dim3 grid(1, in_rows.size()); - SparseSGDFunctorKernel< - T, 256><<>>( + const int kThreadsPerBlock = 256; + int thread_x = kThreadsPerBlock; + int max_threads = ctx.cuda_device_context().GetMaxPhysicalThreadCount(); + int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); + + SparseSGDFunctorKernel<<>>( in_data, in_rows.CUDAData(ctx.GetPlace()), learning_rate->data(), - out_data, in_row_numel); + out_data, in_row_numel, in_rows.size()); } else { PADDLE_THROW("Unsupported Variable Type of Grad"); diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h index 6dffe527c1072ee97fcde1725bfc1a47ed1ad74a..34403c7a7aa717cca470be2931009e219e00e3ae 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -32,7 +32,7 @@ class SumKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto in_vars = context.MultiInputVar("X"); - int N = in_vars.size(); + size_t in_num = in_vars.size(); auto out_var = context.OutputVar("Out"); bool in_place = out_var == in_vars[0]; @@ -53,7 +53,7 @@ class SumKernel : public framework::OpKernel { auto &place = *context.template device_context().eigen_device(); // If in_place, just skip the first tensor - for (int i = in_place ? 1 : 0; i < N; i++) { + for (size_t i = in_place ? 1 : 0; i < in_num; i++) { if (in_vars[i]->IsType()) { auto &in_t = in_vars[i]->Get(); if (in_t.numel() == 0) { @@ -101,13 +101,13 @@ class SumKernel : public framework::OpKernel { // Runtime InferShape size_t first_dim = 0; - for (int i = 0; i < N; i++) { + for (size_t i = 0; i < in_num; i++) { auto &sel_row = get_selected_row(i); first_dim += sel_row.rows().size(); } std::vector in_dim; - for (int i = 0; i < N; i++) { + for (size_t i = 0; i < in_num; i++) { auto &sel_row = get_selected_row(i); if (sel_row.rows().size() > 0) { in_dim = framework::vectorize(sel_row.value().dims()); @@ -116,14 +116,14 @@ class SumKernel : public framework::OpKernel { } if (in_dim.empty()) { VLOG(3) << "WARNING: all the inputs are empty"; - in_dim = framework::vectorize(get_selected_row(N - 1).value().dims()); + in_dim = + framework::vectorize(get_selected_row(in_num - 1).value().dims()); } else { in_dim[0] = static_cast(first_dim); } out_value->Resize(framework::make_ddim(in_dim)); out_value->mutable_data(context.GetPlace()); - // if all the input sparse vars are empty, no need to // merge these vars. 
if (first_dim == 0UL) { @@ -133,7 +133,7 @@ class SumKernel : public framework::OpKernel { math::SelectedRowsAddTo functor; int64_t offset = 0; - for (int i = 0; i < N; i++) { + for (size_t i = 0; i < in_num; i++) { auto &sel_row = get_selected_row(i); if (sel_row.rows().size() == 0) { continue; diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc index 1048d3017140c9e31426a1580b2862667116a024..41a5786fe8c3295390144732221280e152d0a15a 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt_engine_op.cc @@ -22,8 +22,6 @@ namespace paddle { DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT"); -DEFINE_int32(tensorrt_max_batch_size, 1, "TensorRT maximum batch size"); -DEFINE_int32(tensorrt_workspace_size, 16 << 20, "TensorRT workspace size"); namespace operators { @@ -34,6 +32,8 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Ys", "A list of outputs").AsDuplicable(); AddAttr("subgraph", "the subgraph."); AddAttr("engine_uniq_key", "unique key for the TRT engine."); + AddAttr("max_batch_size", "the maximum batch size."); + AddAttr("workspace_size", "the workspace size."); AddComment("TensorRT engine operator."); } }; diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h index 69173ff5178d32634f9ab291b7d709a3f91cb368..3c78c29c1a30d74947be84cd2b52ad308e732a2d 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt_engine_op.h @@ -28,8 +28,6 @@ namespace paddle { DECLARE_int32(tensorrt_engine_batch_size); -DECLARE_int32(tensorrt_max_batch_size); -DECLARE_int32(tensorrt_workspace_size); namespace operators { @@ -92,14 +90,14 @@ class TensorRTEngineKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto engine_name = context.Attr("engine_uniq_key"); + int max_batch_size = context.Attr("max_batch_size"); if (!Singleton::Global().HasEngine(engine_name)) { Prepare(context); } auto* engine = Singleton::Global().Get(engine_name); auto input_names = context.op().Inputs("Xs"); PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs"); - PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, - FLAGS_tensorrt_max_batch_size); + PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, max_batch_size); std::vector output_maps = context.Attr>("output_name_mapping"); @@ -173,8 +171,9 @@ class TensorRTEngineKernel : public framework::OpKernel { // Get the ProgramDesc and pass to convert. 
framework::proto::BlockDesc block_desc; block_desc.ParseFromString(context.Attr("subgraph")); - int max_batch = FLAGS_tensorrt_max_batch_size; - auto max_workspace = FLAGS_tensorrt_workspace_size; + int max_batch_size = context.Attr("max_batch_size"); + int workspace_size = context.Attr("workspace_size"); + auto params = context.Attr>("parameters"); std::unordered_set parameters; for (const auto& param : params) { @@ -186,7 +185,7 @@ class TensorRTEngineKernel : public framework::OpKernel { // TODO(Superjomn) replace this with a different stream auto* engine = Singleton::Global().Create( - max_batch, max_workspace, nullptr /*engine hold its own stream*/, + max_batch_size, workspace_size, nullptr /*engine hold its own stream*/, context.Attr("engine_uniq_key"), boost::get(context.GetPlace()).device); diff --git a/paddle/fluid/operators/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt_engine_op_test.cc index 27c1d29762b3de5e57f877b271aae52e71eb7cf9..e21101e8d12f210af08284dbcebe5c14c1af6dd3 100644 --- a/paddle/fluid/operators/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc @@ -58,8 +58,6 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block, using inference::analysis::SetAttr; TEST(TensorRTEngineOp, manual) { - FLAGS_tensorrt_engine_batch_size = 2; - FLAGS_tensorrt_max_batch_size = 2; framework::ProgramDesc program; auto* block_ = program.Proto()->add_blocks(); block_->set_idx(0); @@ -101,6 +99,8 @@ TEST(TensorRTEngineOp, manual) { engine_op_desc.SetOutput("Ys", std::vector({"z0"})); SetAttr(engine_op_desc.Proto(), "subgraph", block_->SerializeAsString()); + SetAttr(engine_op_desc.Proto(), "max_batch_size", 2); + SetAttr(engine_op_desc.Proto(), "workspace_size", 2 << 10); SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "a_engine"); SetAttr>(engine_op_desc.Proto(), "parameters", std::vector({})); @@ -129,8 +129,6 @@ TEST(TensorRTEngineOp, manual) { } void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { - FLAGS_tensorrt_engine_batch_size = batch_size; - FLAGS_tensorrt_max_batch_size = batch_size; framework::ProgramDesc program; framework::Scope scope; platform::CUDAPlace place; @@ -195,8 +193,8 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { SetAttr(engine_op_desc.Proto(), "subgraph", block_->SerializeAsString()); - SetAttr(engine_op_desc.Proto(), "max_batch", batch_size); - SetAttr(engine_op_desc.Proto(), "max_workspace", 2 << 10); + SetAttr(engine_op_desc.Proto(), "max_batch_size", batch_size); + SetAttr(engine_op_desc.Proto(), "workspace_size", 2 << 10); SetAttr>( engine_op_desc.Proto(), "parameters", std::vector({"y0", "y1", "y2", "y3"})); diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index b5bd07d401f9ebfe441bc0f84f9bad317f0e8da9..e7f634c4a622b48e97040987836406cf73cb23b6 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,5 +1,5 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method) +set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder) set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc) if(NOT WIN32) list(APPEND PYBIND_DEPS parallel_executor profiler) diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 67501186d150171728194f23bc02d2c014848dd7..a5bc44122028c1191f511157bdde2e7c2d30c6aa 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ 
-285,12 +285,12 @@ void BindOpDesc(pybind11::module *m) { .def("set_output", &pd::OpDesc::SetOutput) .def("input_arg_names", &pd::OpDesc::InputArgumentNames) .def("output_arg_names", &pd::OpDesc::OutputArgumentNames) - .def("rename_input", &pd::OpDesc::RenameInput) - .def("rename_output", &pd::OpDesc::RenameOutput) + .def("_rename_input", &pd::OpDesc::RenameInput) + .def("_rename_output", &pd::OpDesc::RenameOutput) .def("has_attr", &pd::OpDesc::HasAttr) .def("attr_type", &pd::OpDesc::GetAttrType) .def("attr_names", &pd::OpDesc::AttrNames) - .def("set_attr", &pd::OpDesc::SetAttr) + .def("_set_attr", &pd::OpDesc::SetAttr) .def("attr", &pd::OpDesc::GetAttr) .def("set_block_attr", &pd::OpDesc::SetBlockAttr) .def("set_blocks_attr", &pd::OpDesc::SetBlocksAttr) @@ -300,8 +300,8 @@ void BindOpDesc(pybind11::module *m) { std::string ser(seriralized); self.SetAttr(name, ser); }) - .def("block_attr_id", &pd::OpDesc::GetBlockAttrId) - .def("blocks_attr_ids", &pd::OpDesc::GetBlocksAttrIds) + .def("_block_attr_id", &pd::OpDesc::GetBlockAttrId) + .def("_blocks_attr_ids", &pd::OpDesc::GetBlocksAttrIds) .def("check_attrs", &pd::OpDesc::CheckAttrs) .def("infer_shape", &pd::OpDesc::InferShape) .def("infer_var_type", &pd::OpDesc::InferVarType) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 8b62502e3f920a3bf7d80f9e7edc2f3647a0e5b1..ef2f1f2a20a2eddee8ac077ee4bbf4dfd777448d 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -25,6 +25,7 @@ limitations under the License. */ #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/ir/pass_builder.h" #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" @@ -595,6 +596,29 @@ All parameter, weight, gradient are variables in Paddle. m.def("is_profiler_enabled", platform::IsProfileEnabled); m.def("reset_profiler", platform::ResetProfiler); + py::class_> pass(m, "Pass"); + pass.def(py::init()) + .def("set_str", [](ir::Pass &self, const std::string &name, + const std::string &attr) { + self.Set(name, new std::string(attr)); + }); + + py::class_> pb( + m, "PassBuilder"); + pb.def(py::init()) + .def("append_pass", + [](ir::PassBuilder &self, + const std::string &pass_type) -> std::shared_ptr { + return self.AppendPass(pass_type); + }) + .def("all_passes", [](ir::PassBuilder &self) { return self.AllPasses(); }) + .def("insert_pass", + [](ir::PassBuilder &self, size_t idx, const std::string &pass_type) { + return self.InsertPass(idx, pass_type); + }) + .def("remove_pass", + [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); }); + // -- python binds for parallel executor. py::class_ pe(m, "ParallelExecutor"); py::class_ exec_strategy(pe, "ExecutionStrategy"); @@ -677,7 +701,11 @@ All parameter, weight, gradient are variables in Paddle. 
}, [](BuildStrategy &self, bool b) { self.fuse_elewise_add_act_ops_ = b; - }); + }) + .def("_create_passes_from_strategy", + [](BuildStrategy &self) -> std::shared_ptr { + return self.CreatePassesFromStrategy(); + }); pe.def(py::init &, const std::unordered_set &, diff --git a/paddle/fluid/string/pretty_log.h b/paddle/fluid/string/pretty_log.h index a3b4e38f453835828a4a53130e11c854ac3f4a74..10c9eb80d0a7e07d5974ca10d740e71e7717b5c5 100644 --- a/paddle/fluid/string/pretty_log.h +++ b/paddle/fluid/string/pretty_log.h @@ -56,13 +56,13 @@ struct Style { }; template -static void PrettyLogEndl(const std::string& style, const char* fmt, - const Args&... args) { +static void PrettyLogEndl(const std::string &style, const char *fmt, + const Args &... args) { std::cerr << style << Sprintf(fmt, args...) << reset() << std::endl; } template -static void PrettyLog(const std::string& style, const char* fmt, - const Args&... args) { +static void PrettyLog(const std::string &style, const char *fmt, + const Args &... args) { std::cerr << style << Sprintf(fmt, args...) << reset(); } diff --git a/paddle/fluid/train/CMakeLists.txt b/paddle/fluid/train/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..6cd9cbe379874e5ab7e40c1349e0483ff45bb63a --- /dev/null +++ b/paddle/fluid/train/CMakeLists.txt @@ -0,0 +1,30 @@ +function(train_test TARGET_NAME) + set(options "") + set(oneValueArgs "") + set(multiValueArgs ARGS) + cmake_parse_arguments(train_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) + set(arg_list "") + if(train_test_ARGS) + foreach(arg ${train_test_ARGS}) + list(APPEND arg_list "_${arg}") + endforeach() + else() + list(APPEND arg_list "_") + endif() + foreach(arg ${arg_list}) + string(REGEX REPLACE "^_$" "" arg "${arg}") + cc_test(test_train_${TARGET_NAME}${arg} + SRCS test_train_${TARGET_NAME}.cc + DEPS paddle_fluid_origin + ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.train.model/) + set_tests_properties(test_train_${TARGET_NAME}${arg} + PROPERTIES DEPENDS test_${TARGET_NAME}) + endforeach() +endfunction(train_test) + + +if(WITH_TESTING) + train_test(recognize_digits ARGS mlp conv) +endif() diff --git a/paddle/fluid/train/test_train_recognize_digits.cc b/paddle/fluid/train/test_train_recognize_digits.cc new file mode 100644 index 0000000000000000000000000000000000000000..e8731dd51ad698e53b7f10cc781c52134f2d17a8 --- /dev/null +++ b/paddle/fluid/train/test_train_recognize_digits.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "gflags/gflags.h" +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/init.h" +#include "paddle/fluid/platform/place.h" + +DEFINE_string(dirname, "", "Directory of the train model."); + +namespace paddle { + +void Train() { + CHECK(!FLAGS_dirname.empty()); + framework::InitDevices(false); + const auto cpu_place = platform::CPUPlace(); + framework::Executor executor(cpu_place); + framework::Scope scope; + + auto train_program = inference::Load( + &executor, &scope, FLAGS_dirname + "__model_combined__.main_program", + FLAGS_dirname + "__params_combined__"); + + std::string loss_name = ""; + for (auto op_desc : train_program->Block(0).AllOps()) { + if (op_desc->Type() == "mean") { + loss_name = op_desc->Output("Out")[0]; + break; + } + } + + PADDLE_ENFORCE_NE(loss_name, "", "loss not found"); + + // prepare data + auto x_var = scope.Var("img"); + auto x_tensor = x_var->GetMutable(); + x_tensor->Resize({64, 1, 28, 28}); + + auto x_data = x_tensor->mutable_data(cpu_place); + for (int i = 0; i < 64 * 28 * 28; ++i) { + x_data[i] = 1.0; + } + + auto y_var = scope.Var("label"); + auto y_tensor = y_var->GetMutable(); + y_tensor->Resize({64, 1}); + auto y_data = y_tensor->mutable_data(cpu_place); + for (int i = 0; i < 64 * 1; ++i) { + y_data[i] = static_cast(1); + } + + auto loss_var = scope.Var(loss_name); + float first_loss = 0.0; + float last_loss = 0.0; + for (int i = 0; i < 100; ++i) { + executor.Run(*train_program.get(), &scope, 0, false, true); + if (i == 0) { + first_loss = loss_var->Get().data()[0]; + } else if (i == 99) { + last_loss = loss_var->Get().data()[0]; + } + } + EXPECT_LT(last_loss, first_loss); +} + +TEST(train, recognize_digits) { Train(); } + +} // namespace paddle diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index e6a9524382be219e550017ed4f1a6070dca22fbf..7d2fb7c6ce9e6a89df2c777323fc6a547fc227f4 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -70,8 +70,8 @@ function cmake_gen() { PYTHON_FLAGS="" SYSTEM=`uname -s` if [ "$SYSTEM" == "Darwin" ]; then + echo "Using python abi: $1" if [[ "$1" == "cp27-cp27m" ]] || [[ "$1" == "" ]]; then - echo "using python abi: $1" if [ -d "/Library/Frameworks/Python.framework/Versions/2.7" ]; then export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7 export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7 @@ -82,7 +82,18 @@ function cmake_gen() { else exit 1 fi - # TODO: qiyang add python3 part here + elif [ "$1" == "cp35-cp35m" ]; then + if [ -d "/Library/Frameworks/Python.framework/Versions/3.5" ]; then + export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/ + export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/ + export PATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/:${PATH} + PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/python3 + -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.5/include/python3.5m/ + -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/libpython3.5m.dylib" + WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON} + else + exit 1 + fi fi else if [ 
"$1" != "" ]; then @@ -381,7 +392,7 @@ function run_mac_test() { EOF # TODO: jiabin need to refine this part when these tests fixed on mac - ctest --output-on-failure -j8 + ctest --output-on-failure -j $1 # make install should also be test when unittest make install -j 8 pip install /usr/local/opt/paddle/share/wheels/*.whl @@ -629,10 +640,10 @@ EOF function gen_capi_package() { if [[ ${WITH_C_API} == "ON" ]]; then - install_prefix="${PADDLE_ROOT}/build/capi_output" - rm -rf $install_prefix - make DESTDIR="$install_prefix" install - cd $install_prefix/usr/local + capi_install_prefix=${INSTALL_PREFIX:-/paddle/build}/capi_output + rm -rf $capi_install_prefix + make DESTDIR="$capi_install_prefix" install + cd $capi_install_prefix/ ls | egrep -v "^Found.*item$" | xargs tar -czf ${PADDLE_ROOT}/build/paddle.tgz fi } @@ -729,7 +740,11 @@ function main() { maccheck) cmake_gen ${PYTHON_ABI:-""} build_mac - run_mac_test + run_mac_test ${PROC_RUN:-1} + ;; + macbuild) + cmake_gen ${PYTHON_ABI:-""} + build_mac ;; cicheck_py35) cmake_gen ${PYTHON_ABI:-""} diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 9cdcb87df5dd1669066c204c86c269973df506f1..1c5ded943b3814688af1f177503d3bdc35073c3f 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -87,6 +87,7 @@ if (WITH_TESTING) endif() endif() add_subdirectory(paddle/fluid/tests) + add_subdirectory(paddle/fluid/contrib/tests) endif() install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR} DESTINATION opt/paddle/share/wheels diff --git a/python/paddle/dataset/common.py b/python/paddle/dataset/common.py index ece4046f5b7a7eff5be724d6f890665be7f3344e..58a4c66c206c3f783437126c855c2890644f1bc0 100644 --- a/python/paddle/dataset/common.py +++ b/python/paddle/dataset/common.py @@ -77,13 +77,14 @@ def download(url, module_name, md5sum, save_name=None): retry_limit = 3 while not (os.path.exists(filename) and md5file(filename) == md5sum): if os.path.exists(filename): - print("file md5", md5file(filename), md5sum) + sys.stderr.write("file %s md5 %s" % (md5file(filename), md5sum)) if retry < retry_limit: retry += 1 else: raise RuntimeError("Cannot download {0} within retry limit {1}". 
format(url, retry_limit)) - print("Cache file %s not found, downloading %s" % (filename, url)) + sys.stderr.write("Cache file %s not found, downloading %s" % + (filename, url)) r = requests.get(url, stream=True) total_length = r.headers.get('content-length') @@ -100,10 +101,11 @@ def download(url, module_name, md5sum, save_name=None): dl += len(data) f.write(data) done = int(50 * dl / total_length) - sys.stdout.write("\r[%s%s]" % ('=' * done, + sys.stderr.write("\r[%s%s]" % ('=' * done, ' ' * (50 - done))) sys.stdout.flush() - + sys.stderr.write("\n") + sys.stdout.flush() return filename diff --git a/python/paddle/dataset/wmt14.py b/python/paddle/dataset/wmt14.py index f8c1a33574e642b21feb6843d115b7f4205ef250..adc0c1aac80cbdb0b0c04535fc39b6a172d23eec 100644 --- a/python/paddle/dataset/wmt14.py +++ b/python/paddle/dataset/wmt14.py @@ -89,7 +89,8 @@ def reader_creator(tar_file, file_name, dict_size): ] for name in names: for line in f.extractfile(name): - line_split = line.strip().split(six.b('\t')) + line = cpt.to_text(line) + line_split = line.strip().split('\t') if len(line_split) != 2: continue src_seq = line_split[0] # one source sequence diff --git a/python/paddle/dataset/wmt16.py b/python/paddle/dataset/wmt16.py index f30dcd518ea6c0c685d027ede3ad6e0a1cb0c82c..9c02e0f41b04e113251e0fda72ca8abd976ab6f7 100644 --- a/python/paddle/dataset/wmt16.py +++ b/python/paddle/dataset/wmt16.py @@ -64,7 +64,8 @@ def __build_dict(tar_file, dict_size, save_path, lang): word_dict = defaultdict(int) with tarfile.open(tar_file, mode="r") as f: for line in f.extractfile("wmt16/train"): - line_split = line.strip().split(six.b("\t")) + line = cpt.to_text(line) + line_split = line.strip().split("\t") if len(line_split) != 2: continue sen = line_split[0] if lang == "en" else line_split[1] for w in sen.split(): @@ -123,7 +124,8 @@ def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang): with tarfile.open(tar_file, mode="r") as f: for line in f.extractfile(file_name): - line_split = line.strip().split(six.b("\t")) + line = cpt.to_text(line) + line_split = line.strip().split("\t") if len(line_split) != 2: continue src_words = line_split[src_col].split() diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 88eaae10dd55edcc7e811163acf17579eb32cbf1..17fe8dc3c8a28ad129e2d377820da95e8e7a02d9 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -38,8 +38,8 @@ def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None): op_desc = op_descs[i] if isinstance(op_desc, tuple): op_desc = op_desc[0] - op_desc.rename_input(old_name, new_name) - op_desc.rename_output(old_name, new_name) + op_desc._rename_input(old_name, new_name) + op_desc._rename_output(old_name, new_name) def _create_op_desc_(op_type, inputs, outputs, attrs): @@ -70,7 +70,7 @@ def _create_op_desc_(op_type, inputs, outputs, attrs): if isinstance(val, framework.Block): op_desc.set_block_attr(name, val.desc) else: - op_desc.set_attr(name, val) + op_desc._set_attr(name, val) return op_desc @@ -346,7 +346,7 @@ def _append_backward_ops_(block, grad_sub_block_list = [] # If the op has its own sub-block, deal with the sub-block first if op.has_attr("sub_block"): - sub_block = program.block(op.block_attr_id("sub_block")) + sub_block = program.block(op._block_attr_id("sub_block")) grad_sub_block = program._create_block() grad_sub_block._set_forward_block_idx(sub_block.idx) cb = _callback_lookup_(op) @@ -382,7 +382,7 @@ def _append_backward_ops_(block, for op_desc in 
grad_op_descs: new_op_desc = target_block.desc.append_op() new_op_desc.copy_from(op_desc) - new_op_desc.set_attr(op_role_attr_name, backward) + new_op_desc._set_attr(op_role_attr_name, backward) grad_to_var["__current_op_desc__"] = new_op_desc if callbacks is not None: assert (isinstance(callbacks, list)) @@ -408,7 +408,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): for op_idx in range(start_op_idx, block.desc.op_size()): op_desc = block.desc.op(op_idx) if op_desc.has_attr("sub_block"): - sub_block = block.program.block(op_desc.block_attr_id("sub_block")) + sub_block = block.program.block(op_desc._block_attr_id("sub_block")) _append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map) new_vars = set() # create new gradient variables @@ -438,12 +438,12 @@ def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map): op_desc = block.desc.op(op_idx) for name in op_desc.input_arg_names(): if name in var_map: - op_desc.rename_input(name, var_map[name]) + op_desc._rename_input(name, var_map[name]) for name in op_desc.output_arg_names(): if block.desc.find_var(name.encode("ascii")): new_name = unique_name.generate(name) - op_desc.rename_output(name, new_name) + op_desc._rename_output(name, new_name) var_map[name] = new_name for g, ng in six.iteritems(var_map): @@ -542,9 +542,9 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, if loss.op is None: raise ValueError("loss.op is None. Should not happend") - loss.op.set_attr(core.op_proto_and_checker_maker.kOpRoleAttrName(), - int(core.op_proto_and_checker_maker.OpRole.Forward) | - int(core.op_proto_and_checker_maker.OpRole.Loss)) + loss.op._set_attr(core.op_proto_and_checker_maker.kOpRoleAttrName(), + int(core.op_proto_and_checker_maker.OpRole.Forward) | + int(core.op_proto_and_checker_maker.OpRole.Loss)) if callbacks is not None: isinstance(callbacks, list) @@ -631,7 +631,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, attr_val = [p.name, g.name] if g.op.has_attr(op_role_var_attr_name): attr_val.extend(g.op.attr(op_role_var_attr_name)) - g.op.set_attr(op_role_var_attr_name, attr_val) + g.op._set_attr(op_role_var_attr_name, attr_val) return params_and_grads diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 32b8f1189fd65ba1e8da5aeaf316fc0ae05af552..e884185528282021fd16289ccc6a3533e22b9967 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -75,8 +75,8 @@ class ErrorClipByValue(BaseErrorClipAttr): clip_op_desc.set_type("clip") clip_op_desc.set_input("X", [grad_name]) clip_op_desc.set_output("Out", [grad_name]) - clip_op_desc.set_attr("min", self.min) - clip_op_desc.set_attr("max", self.max) + clip_op_desc._set_attr("min", self.min) + clip_op_desc._set_attr("max", self.max) def error_clip_callback(block, context): diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py index 5607f11932bbe6aff548be316dc39b4636e079f4..3bf2fe5db0cb2126295ebfda822eeac8762dbdb7 100644 --- a/python/paddle/fluid/contrib/__init__.py +++ b/python/paddle/fluid/contrib/__init__.py @@ -18,5 +18,13 @@ from . import decoder from .decoder import * from . import memory_usage_calc from .memory_usage_calc import * +from . import op_frequence +from .op_frequence import * +from . 
import quantize +from .quantize import * -__all__ = decoder.__all__ + memory_usage_calc.__all__ +__all__ = [] +__all__ += decoder.__all__ +__all__ += memory_usage_calc.__all__ +__all__ += op_frequence.__all__ +__all__ += quantize.__all__ diff --git a/python/paddle/fluid/contrib/op_frequence.py b/python/paddle/fluid/contrib/op_frequence.py new file mode 100644 index 0000000000000000000000000000000000000000..68dd0a946b4b69d47d51dce3de25ce147198f09a --- /dev/null +++ b/python/paddle/fluid/contrib/op_frequence.py @@ -0,0 +1,104 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +from collections import OrderedDict + +from ..framework import Program + +__all__ = ['op_freq_statistic'] + + +def op_freq_statistic(program): + """ + Statistics of Op frequency. + + Args: + program(Program): The current Program. + + Returns: + uni_op_freq(dict): the single op frequency. + adj_2_op_freq(dict): the two adjacent ops frequency. + + Examples: + + >>> import paddle.fluid as fluid + >>> uni_op_freq, adj_2_op_freq = fluid.contrib.op_freq_statistic( + >>> fluid.default_main_program()) + >>> for op_type, op_num in uni_op_freq: + >>> print("%s \t %d" % (op_type, op_num)) + >>> for op_type, op_num in adj_2_op_freq: + >>> print("%s \t %d" % (op_type, op_num)) + + """ + + if not isinstance(program, Program): + raise TypeError("The input type should be Porgram." 
+ "But you passed in %s" % (type(program))) + + uni_op_freq = OrderedDict() + adj_2_op_freq = OrderedDict() + op_in_ops = OrderedDict() + + parameters = [p.name for p in program.blocks[0].all_parameters()] + + # get uni_op_freq + for op in program.global_block().ops: + had_recorded = False + for var_name in op.output_arg_names: + if var_name in parameters: + continue + if not had_recorded and uni_op_freq.has_key(op.type): + uni_op_freq[op.type] += 1 + had_recorded = True + elif not had_recorded: + uni_op_freq[op.type] = 1 + had_recorded = True + + # get adj_2_op_freq + var_gen_op = {} + for op in program.global_block().ops: + for var_name in op.input_arg_names: + if var_name in parameters: + continue + if var_gen_op.has_key(var_name): + assert len(var_gen_op[var_name]) > 0 + if op_in_ops.has_key(op.type): + op_in_ops[op.type].append(var_gen_op[var_name][-1]) + else: + op_in_ops[op.type] = [var_gen_op[var_name][-1]] + else: + print("Var's generate op is not found,%s, %s" % + (var_name, op.type)) + + for var_name in op.output_arg_names: + if var_gen_op.has_key(var_name): + var_gen_op[var_name].append(op.type) + else: + var_gen_op[var_name] = [op.type] + + for op, in_ops in op_in_ops.iteritems(): + for in_op in in_ops: + op_op = in_op + "->" + op + if adj_2_op_freq.has_key(op_op): + adj_2_op_freq[op_op] += 1 + else: + adj_2_op_freq[op_op] = 1 + + uni_op_freq = sorted( + uni_op_freq.items(), key=lambda item: item[1], reverse=True) + adj_2_op_freq = sorted( + adj_2_op_freq.items(), key=lambda item: item[1], reverse=True) + + return uni_op_freq, adj_2_op_freq diff --git a/python/paddle/fluid/contrib/quantize/__init__.py b/python/paddle/fluid/contrib/quantize/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..14c208d0e7f35ebfbbe1c36d0b11a8d0f0efb4a6 --- /dev/null +++ b/python/paddle/fluid/contrib/quantize/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from . import quantize_transpiler +from .quantize_transpiler import * + +__all__ = quantize_transpiler.__all__ diff --git a/python/paddle/fluid/contrib/quantize/quantize_transpiler.py b/python/paddle/fluid/contrib/quantize/quantize_transpiler.py new file mode 100644 index 0000000000000000000000000000000000000000..032d0353ea6d80c4356ea9a9886ea59c48feec7a --- /dev/null +++ b/python/paddle/fluid/contrib/quantize/quantize_transpiler.py @@ -0,0 +1,557 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+import numpy as np
+
+from paddle.fluid.framework import default_main_program, default_startup_program, program_guard
+from paddle.fluid.layer_helper import LayerHelper
+from paddle.fluid import unique_name
+from paddle.fluid import core
+from paddle.fluid.initializer import Constant
+from paddle.fluid.param_attr import ParamAttr
+from paddle.fluid.layer_helper import LayerHelper
+from paddle.fluid.layers.nn import autoincreased_step_counter
+from paddle.fluid.framework import Variable
+from paddle.fluid.executor import global_scope
+from paddle.fluid.transpiler.inference_transpiler import InferenceTranspiler
+
+__all__ = ['QuantizeTranspiler']
+
+_QUANTIZABLE_OP_TYPES = ['conv2d', 'depthwise_conv2d', 'mul']
+
+
+def _quantized_var_name(var_name):
+    """
+    Return the quantized variable name for the input `var_name`.
+    """
+    return "%s.quantized" % (var_name)
+
+
+def _dequantized_var_name(var_name):
+    """
+    Return the dequantized variable name for the input `var_name`.
+    """
+    return "%s.dequantized" % (var_name)
+
+
+def _quantized_scale_name(var_name):
+    """
+    Return the scale name for the quantized variable `var_name`.
+    """
+    return "%s.scale" % (var_name)
+
+
+def _original_var_name(var_name):
+    """
+    Return the original variable name.
+    """
+    if var_name.endswith('.quantized.dequantized'):
+        return var_name[:-len('.quantized.dequantized')]
+    if var_name.endswith('.quantized'):
+        return var_name[:-len('.quantized')]
+    if var_name.endswith('.dequantized'):
+        return var_name[:-len('.dequantized')]
+    if var_name.endswith('.scale'):
+        return var_name[:-len('.scale')]
+    else:
+        return var_name
+
+
+def _is_float(v):
+    return isinstance(v, float) or isinstance(v, np.float32)
+
+
+def quant(x, scale, num_bits):
+    y = np.round(x / scale * ((1 << (num_bits - 1)) - 1))
+    return y
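The quant() helper above maps a float to one of 2^num_bits - 1 signed integer levels, with the tensor's abs_max as the scale. A C++ restatement of the round trip, together with the matching dequantization (a sketch for intuition only; the hypothetical Quant/Dequant names are not Paddle API):

```cpp
#include <cmath>
#include <cstdio>

float Quant(float x, float scale, int num_bits) {
  // 127 levels per side for 8 bits; scale is the abs_max of the tensor.
  float range = static_cast<float>((1 << (num_bits - 1)) - 1);
  return std::round(x / scale * range);
}

float Dequant(float q, float scale, int num_bits) {
  float range = static_cast<float>((1 << (num_bits - 1)) - 1);
  return q * scale / range;
}

int main() {
  float scale = 2.0f;            // assumed abs_max of the tensor
  float x = 0.37f;
  float q = Quant(x, scale, 8);  // -> 23
  std::printf("%f -> %f -> %f\n", x, q, Dequant(q, scale, 8));
}
```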
It can only be ", + "'abs_max' or 'range_abs_max'.", str(weight_quantize_type)) + if activation_quantize_type not in quant_type: + raise ValueError( + "Unknown activation_quantize_type : '%s'. It can only be ", + "'abs_max' or 'range_abs_max'.", str(activation_quantize_type)) + + self.weight_quantize_type = weight_quantize_type + self.activation_quantize_type = activation_quantize_type + + self.window_size = window_size + self.helper = LayerHelper(self.__class__.__name__) + self.fake_quant_op_types = [ + 'fake_quantize_abs_max', 'fake_quantize_range_abs_max' + ] + self.fake_dequant_op_types = ['fake_dequantize_max_abs'] + self.is_test = None + self.global_step = None + + def training_transpile(self, program=None, startup_program=None): + """Rewrites a training input program in place for simulated + quantization. Insert fake quantization and de-quantization ops into + program to simulate the error introduced by quantization. And change + the graident ops' input by using the faked quantization weights and + activation. Since the program is transformed in place, the graph + connection will change. + + Args: + program (Program): the input program to be transpile. + """ + self.is_test = False + program = default_main_program() if program is None else program + startup_program = default_startup_program() if startup_program is \ + None else startup_program + + # marked the variable which has been quantized and dequantized. + dequanted_vars = [ + collections.OrderedDict() for _ in range(len(program.blocks)) + ] + grad_op_types = ['%s_grad' % (type) for type in _QUANTIZABLE_OP_TYPES] + + params = [p.name for p in program.global_block().iter_parameters()] + + def _transpile_forward(block, op): + idx = block.ops.index(op) + block_id = block.idx + # insert quant op and dequant op + for name in op.input_arg_names: + if name in dequanted_vars[block_id]: + dequant_var = dequanted_vars[block_id][name] + else: + var = block.var(name) + quant_bits = self.weight_bits if var.name in params \ + else self.activation_bits + quant_type = self.weight_quantize_type if var.name \ + in params else self.activation_quantize_type + + quant_var, scale_var = self._insert_quant_op( + block, idx, var, quant_bits, quant_type) + dequant_var = self._insert_dequant_op( + block, idx + 1, quant_var, scale_var, quant_bits) + dequanted_vars[block_id][name] = dequant_var + # rename the forward op inputs + op._rename_input(name, dequant_var.name) + + def _transpile_backward(block, op): + block_id = block.idx + no_dequanted_input_vars = True + for name in op.input_arg_names: + if name in dequanted_vars[block_id]: + dequant_var = dequanted_vars[block_id][name] + op._rename_input(name, dequant_var.name) + no_dequanted_input_vars = False + if no_dequanted_input_vars: + raise ValueError("There is no dequanted inputs for op %s." % + (op.type)) + + with program_guard(program, startup_program): + self._create_global_step() + for block in program.blocks: + ops = list(block.ops) + block_id = block.idx + for op in ops: + # rewrite the forward ProgramDes + if op.type in _QUANTIZABLE_OP_TYPES: + _transpile_forward(block, op) + # rename the backward op inputs + if op.type in grad_op_types: + _transpile_backward(block, op) + + def _create_global_step(self): + if self.weight_quantize_type == 'range_abs_max' or \ + self.activation_quantize_type == 'range_abs_max': + self.global_step = autoincreased_step_counter() + + def freeze_program(self, program, place, fuse_bn=False, scope=None): + """Freeze input training program for inference. 
+ + Args: + program (Program): the input program to be transpile. + """ + + self.is_test = True + scope = global_scope() if scope is None else scope + program = default_main_program() if program is None else program + + if fuse_bn: + bn_fuse_transpiler = BNFuseTranspiler() + bn_fuse_transpiler.transpile(program, place) + + persistable_vars = [ + v.name + for v in filter(lambda var: var.persistable, program.list_vars()) + ] + op_in_rename_map = [ + collections.OrderedDict() for _ in range(len(program.blocks)) + ] + op_out_rename_map = [ + collections.OrderedDict() for _ in range(len(program.blocks)) + ] + var_scale_map = [ + collections.OrderedDict() for _ in range(len(program.blocks)) + ] + + def _remove_fake_quant_and_dequant_op(block, op): + idx = block.ops.index(op) + block_id = block.idx + k = op.output('Out')[0] + v = op.input('X')[0] + if v not in op_in_rename_map[block_id]: + op_in_rename_map[block_id][k] = v + else: + op_in_rename_map[block_id][k] = op_in_rename_map[block_id][v] + block._remove_op(idx) + + def _insert_post_dequant_op(block, op): + idx = block.ops.index(op) + block_id = block.idx + max_range = None + scale_var = None + for name in op.input_arg_names: + if name in op_in_rename_map[block_id]: + op._rename_input(name, op_in_rename_map[block_id][name]) + + scale_v = var_scale_map[block_id][_original_var_name(name)] + if _original_var_name(name) in persistable_vars: + param_range = (1 << (self.weight_bits - 1)) - 1 + act_range = (1 << (self.activation_bits - 1)) - 1 + assert _is_float(scale_v) + max_range = param_range * act_range / scale_v + else: + assert isinstance(scale_v, Variable) + scale_var = var_scale_map[block_id][_original_var_name( + name)] + + if len(op.output_arg_names) != 1: + raise ValueError("Only support one output, but op %s has" + " more than one output." 
% (op.type)) + out_var = block.var(op.output_arg_names[0]) + dequant_var = block.create_var( + name=_dequantized_var_name(out_var.name), + type=out_var.type, + shape=out_var.shape, + dtype=out_var.dtype) + # insert fake_dequantize_op + dequant_op = block._insert_op( + idx + 1, + type="fake_dequantize_max_abs", + attrs={'max_range': float(max_range)}, + inputs={"X": out_var, + 'Scale': scale_var}, + outputs={"Out": dequant_var}) + op_out_rename_map[block_id][out_var.name] = dequant_var.name + return dequant_var + + def _load_var(name): + return np.array(scope.find_var(name).get_tensor()) + + def _restore_var(name, arr): + t = scope.find_var(name).get_tensor() + t.set(arr, place) + + for block in program.blocks: + ops = list(block.ops) + block_id = block.idx + for op in ops: + op_type = op.type + + # insert dequant_op after fc/conv, need to rename + # input of the followed ops + for name in op.input_arg_names: + if name in op_out_rename_map[block_id]: + op._rename_input(name, + op_out_rename_map[block_id][name]) + + if op_type in self.fake_quant_op_types: + in_arg_name = op.input('X')[0] + if in_arg_name in persistable_vars: + if self.weight_quantize_type == 'abs_max': + param = _load_var(in_arg_name) + scale_v = np.max(np.abs(param)) + else: + scale_v = _load_var(op.output('OutScale')[0]) + var_scale_map[block_id][in_arg_name] = scale_v + else: + scale_v = block.var(op.output('OutScale')[0]) + var_scale_map[block_id][in_arg_name] = scale_v + + if in_arg_name in persistable_vars: + _remove_fake_quant_and_dequant_op(block, op) + # quantize weight and restore + param_t = _load_var(in_arg_name) + param_q_t = quant(param_t, scale_v, self.weight_bits) + _restore_var(in_arg_name, param_q_t) + + if op_type in self.fake_dequant_op_types: + _remove_fake_quant_and_dequant_op(block, op) + + if op_type in _QUANTIZABLE_OP_TYPES: + dequant_var = _insert_post_dequant_op(block, op) + + # remove the unused var in ProgramDesc + self._remove_unused_var(program) + #program = program.clone() + + def convert_to_int8(self, program, place, scope=None): + scope = global_scope() if scope is None else scope + program = default_main_program() if program is None else program + + def _load_var(name): + return np.array(scope.find_var(name).get_tensor()) + + global_block = program.global_block() + + def convert_to_int8(var): + int8_var_name = var.name + ".int8" + int8_var = global_block.create_parameter( + name=int8_var_name.encode('ascii'), + type=var.type, + dtype=core.VarDesc.VarType.INT8, + shape=var.shape) + + tensor = _load_var(var.name) + + scope.var(int8_var_name) + int8_tensor = scope.find_var(int8_var_name).get_tensor() + int8_tensor.set(tensor.astype(np.int8), place) + return int8_var + + input_map = {} + for block in program.blocks: + for op in list(block.ops): + if op.type in _QUANTIZABLE_OP_TYPES: + for name in op.input_arg_names: + var = block.var(name) + if var.persistable: + if name not in input_map: + int8_var = convert_to_int8(var) + input_map[name] = int8_var.name + op._rename_input(name, input_map[name]) + self._remove_unused_var(program) + + def _remove_unused_var(self, program): + all_remove_vars = [] + for block in program.blocks: + args = [] + for op in block.ops: + args += op.input_arg_names + args += op.output_arg_names + args = list(set(args)) + var_names = block.vars.keys() + sub_block_remove_vars = [] + for var in var_names: + if var not in args: + sub_block_remove_vars.append(var) + all_remove_vars.append(sub_block_remove_vars) + + remove_vars = [list(set(v)) for v in all_remove_vars] + for 
i, block in enumerate(program.blocks):
+            for v in remove_vars[i]:
+                block._remove_var(v)
+
+    def _insert_quant_abs_max_op(self, block, idx, var, quant_bits):
+        """Insert fake_quantize_abs_max op.
+        """
+        quant_var = block.create_var(
+            name=_quantized_var_name(var.name),
+            type=var.type,
+            shape=var.shape,
+            dtype=var.dtype)
+        scale = block.create_var(
+            name=_quantized_scale_name(var.name),
+            type=var.type,
+            shape=var.shape,
+            dtype=var.dtype)
+        quant_op = block._insert_op(
+            idx,
+            type='fake_quantize_abs_max',
+            attrs={'bit_length': quant_bits},
+            inputs={'X': var},
+            outputs={'Out': quant_var,
+                     'OutScale': scale})
+        return quant_var, scale
+
+    def _insert_quant_range_abs_max_op(self, block, idx, var, quant_bits):
+        """Insert fake_quantize_range_abs_max op.
+        """
+        quant_var = block.create_var(
+            name=_quantized_var_name(var.name),
+            type=var.type,
+            shape=var.shape,
+            dtype=var.dtype)
+        scale = self.helper.create_parameter(
+            attr=ParamAttr(
+                name=_quantized_scale_name(var.name),
+                initializer=Constant(0.001),
+                trainable=False),
+            shape=[1],
+            dtype=var.dtype)
+        scale.stop_gradient = True
+
+        ins = {'X': var, 'InScale': scale}
+        outs = {'Out': quant_var, 'OutScale': scale}
+        if not self.is_test:
+            # `Iter` is the int64 global step counter; `scales` holds the
+            # quantization scales of the last `window_size` steps.
+            scales = self.helper.create_global_variable(
+                name=unique_name.generate('scales'),
+                persistable=True,
+                dtype=var.dtype,
+                shape=[self.window_size])
+            self.helper.set_variable_initializer(
+                scales, initializer=Constant(value=0))
+
+            ins['Iter'] = self.global_step
+            outs['OutScales'] = scales
+
+        attrs = {
+            'window_size': self.window_size,
+            'bit_length': quant_bits,
+            'is_test': self.is_test
+        }
+
+        quant_op = block._insert_op(
+            idx,
+            type='fake_quantize_range_abs_max',
+            attrs=attrs,
+            inputs=ins,
+            outputs=outs)
+
+        return quant_var, scale
+
+    def _insert_quant_op(self, block, idx, var, quant_bits, quant_type):
+        """
+        Insert a fake_quantize op of the given `quant_type`.
+        """
+        if quant_type == 'abs_max':
+            return self._insert_quant_abs_max_op(block, idx, var, quant_bits)
+        elif quant_type == 'range_abs_max':
+            return self._insert_quant_range_abs_max_op(block, idx, var,
+                                                       quant_bits)
+
+    def _insert_dequant_op(self, block, idx, var, scale, quant_bits):
+        """
+        Insert a fake_dequantize_max_abs op.
+        """
+        dequant_var = block.create_var(
+            name=_dequantized_var_name(var.name),
+            type=var.type,
+            shape=var.shape,
+            dtype=var.dtype)
+        # insert fake_dequantize_op
+        max_range = (1 << (quant_bits - 1)) - 1
+        dequant_op = block._insert_op(
+            idx,
+            type="fake_dequantize_max_abs",
+            attrs={'max_range': float(max_range)},
+            inputs={"X": var,
+                    'Scale': scale},
+            outputs={"Out": dequant_var})
+        return dequant_var
+
+
+class BNFuseTranspiler(InferenceTranspiler):
+    def _fuse_param(self, current_op, bn_op, bias_op, with_bias):
+        def _update_param(op, param_name, new_param):
+            tensor = self.scope.find_var(param_name).get_tensor()
+            tensor.set(np.array(new_param), self.place)
+
+        def _load_param(param_name):
+            return np.array(self.scope.find_var(param_name).get_tensor())
+
+        bias_bn = _load_param(bn_op.input("Bias")[0])  # Bias
+        scale_bn = _load_param(bn_op.input("Scale")[0])  # Scale
+        mean_bn = _load_param(bn_op.input("Mean")[0])  # Mean
+        var_bn = _load_param(bn_op.input("Variance")[0])  # Variance
+
+        if current_op.type in ['conv2d', 'depthwise_conv2d']:
+            current_param = _load_param(
+                _original_var_name(current_op.input("Filter")[0]))
+        elif current_op.type == 'mul':
+            current_param = _load_param(
+                _original_var_name(current_op.input("Y")[0]))
+
+        std_bn = np.float32(np.sqrt(np.add(var_bn, 1e-5)))
+        tmp = np.float32(np.divide(scale_bn, std_bn))
+
+        # add bias of batch_norm_op to conv2d
+        if with_bias:
+            bias = _load_param(bias_op.input("Y")[0])
+        else:
+            bias = np.zeros(bias_bn.shape)
+        bias = np.float32(
+            np.add(np.multiply(np.subtract(bias, mean_bn), tmp), bias_bn))
+
+        # re-compute weight of conv2d/fc
+        tmp = tmp.reshape(tmp.shape[0], -1)
+        dst_param = current_param.reshape((tmp.shape[0], -1))
+        dst_param = np.float32(np.multiply(dst_param, tmp))
+        dst_param = dst_param.reshape(current_param.shape)
+
+        # update parameters
+        if current_op.type in ['conv2d', 'depthwise_conv2d']:
+            _update_param(current_op,
+                          _original_var_name(current_op.input("Filter")[0]),
+                          dst_param)
+        elif current_op.type == 'mul':
+            _update_param(current_op,
+                          _original_var_name(current_op.input("Y")[0]),
+                          dst_param)
+
+        _update_param(bias_op, bias_op.input("Y")[0], bias)
+
+        # collect the renamed input
+        self.input_map[bn_op.output("Y")[0]] = bias_op.output("Out")[0]
diff --git a/python/paddle/fluid/contrib/tests/CMakeLists.txt b/python/paddle/fluid/contrib/tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..79bec8c4ad34d682895250bc29b1fddb3a569bd4
--- /dev/null
+++ b/python/paddle/fluid/contrib/tests/CMakeLists.txt
@@ -0,0 +1,6 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+foreach(src ${TEST_OPS})
+  py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
new file mode 100644
index 0000000000000000000000000000000000000000..9af3a6c9fda121d411a8a19f3928238be84fe8a6
--- /dev/null
+++ b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
@@ -0,0 +1,272 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
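Illustration (editor's sketch, not part of the patch): the round trip that the transpiler above simulates for 'abs_max' quantization, mirroring the module-level quant() helper and assuming fake_dequantize_max_abs computes Out = X * Scale / max_range, as the attributes set in _insert_dequant_op imply:

    import numpy as np

    def quant(x, scale, num_bits):
        # same arithmetic as quant() in quantize_transpiler.py above
        return np.round(x / scale * ((1 << (num_bits - 1)) - 1))

    w = np.random.uniform(-1.0, 1.0, size=(4, 4)).astype(np.float32)
    scale = float(np.max(np.abs(w)))  # the 'abs_max' scale
    max_range = (1 << 7) - 1          # 127 for 8-bit quantization
    q = quant(w, scale, num_bits=8)   # integer levels in [-127, 127]
    w_hat = q * scale / max_range     # dequantized approximation of w
    # the round-trip error is at most half a quantization step
    assert np.abs(w - w_hat).max() <= 0.5 * scale / max_range + 1e-6
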
+
+import numpy as np
+import six
+
+import unittest
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.contrib.quantize.quantize_transpiler import _original_var_name
+from paddle.fluid.contrib.quantize.quantize_transpiler import QuantizeTranspiler
+
+
+def linear_fc(num):
+    data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    hidden = data
+    for _ in six.moves.xrange(num):
+        hidden = fluid.layers.fc(hidden, size=128, act='relu')
+    loss = fluid.layers.cross_entropy(input=hidden, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+def residual_block(num):
+    def conv_bn_layer(input,
+                      ch_out,
+                      filter_size,
+                      stride,
+                      padding,
+                      act='relu',
+                      bias_attr=False):
+        tmp = fluid.layers.conv2d(
+            input=input,
+            filter_size=filter_size,
+            num_filters=ch_out,
+            stride=stride,
+            padding=padding,
+            act=None,
+            bias_attr=bias_attr)
+        return fluid.layers.batch_norm(input=tmp, act=act)
+
+    data = fluid.layers.data(name='image', shape=[1, 32, 32], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    hidden = data
+    for _ in six.moves.xrange(num):
+        conv = conv_bn_layer(hidden, 16, 3, 1, 1, act=None, bias_attr=True)
+        short = conv_bn_layer(hidden, 16, 1, 1, 0, act=None)
+        hidden = fluid.layers.elementwise_add(x=conv, y=short, act='relu')
+    fc = fluid.layers.fc(input=hidden, size=10)
+    loss = fluid.layers.cross_entropy(input=fc, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+def conv_net(img, label):
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+        input=img,
+        filter_size=5,
+        num_filters=20,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    conv_pool_1 = fluid.layers.batch_norm(conv_pool_1)
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_loss = fluid.layers.mean(loss)
+    return avg_loss
+
+
+class TestQuantizeTranspiler(unittest.TestCase):
+    def setUp(self):
+        # the expected quant/dequant op types, and the quantizable ops
+        # together with their quantized inputs
+        self.weight_quant_op_type = 'fake_quantize_abs_max'
+        self.dequant_op_type = 'fake_dequantize_max_abs'
+        self.quantizable_op_and_inputs = {
+            'conv2d': ['Input', 'Filter'],
+            'depthwise_conv2d': ['Input', 'Filter'],
+            'mul': ['X', 'Y']
+        }
+        self.quantizable_op_grad_and_inputs = {
+            'conv2d_grad': ['Input', 'Filter'],
+            'depthwise_conv2d_grad': ['Input', 'Filter'],
+            'mul_grad': ['X', 'Y']
+        }
+
+    def check_program(self, program):
+        quantized_ops = {}
+
+        persistable_vars = [
+            v.name
+            for v in filter(lambda var: var.persistable, program.list_vars())
+        ]
+
+        for block in program.blocks:
+            for idx, op in enumerate(block.ops):
+                # check forward
+                if op.type in self.quantizable_op_and_inputs:
+                    for i, arg_name in enumerate(op.input_arg_names):
+                        quant_op_type = self.weight_quant_op_type if \
+                            _original_var_name(arg_name) \
+                            in persistable_vars else self.act_quant_op_type
+                        self.assertTrue(
+                            arg_name.endswith('.quantized.dequantized'))
+                        if arg_name not in quantized_ops:
+                            self.assertEqual(block.ops[idx - 2 * i - 1].type,
+                                             self.dequant_op_type)
+                            self.assertEqual(block.ops[idx - 2 * i - 2].type,
+                                             quant_op_type)
+                            quantized_ops[arg_name] = block.ops[idx - 2 * i - 2]
+                        else:
+                            op_idx = block.ops.index(quantized_ops[arg_name])
+                            self.assertLess(op_idx, idx)
+
+                # 
check backward + if op.type in self.quantizable_op_grad_and_inputs: + for pname in self.quantizable_op_grad_and_inputs[op.type]: + arg_name = op.input(pname)[0] + self.assertTrue( + arg_name.endswith('.quantized.dequantized')) + self.assertTrue(arg_name in quantized_ops) + + def linear_fc_quant(self, quant_type): + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + loss = linear_fc(3) + opt = fluid.optimizer.Adam(learning_rate=0.001) + opt.minimize(loss) + t = QuantizeTranspiler(activation_quantize_type=quant_type) + t.training_transpile(main) + self.check_program(main) + + def test_linear_fc_quant_abs_max(self): + self.act_quant_op_type = 'fake_quantize_abs_max' + self.linear_fc_quant('abs_max') + + def test_linear_fc_quant_range_abs_max(self): + self.act_quant_op_type = 'fake_quantize_range_abs_max' + self.linear_fc_quant('range_abs_max') + + def residual_block_quant(self, quant_type): + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + loss = residual_block(2) + opt = fluid.optimizer.Adam(learning_rate=0.001) + opt.minimize(loss) + t = QuantizeTranspiler(activation_quantize_type=quant_type) + t.training_transpile(main) + self.check_program(main) + + def test_residual_block_abs_max(self): + self.act_quant_op_type = 'fake_quantize_abs_max' + self.residual_block_quant('abs_max') + + def test_residual_block_range_abs_max(self): + self.act_quant_op_type = 'fake_quantize_range_abs_max' + self.residual_block_quant('range_abs_max') + + def freeze_program(self, use_cuda): + def build_program(main, startup, is_test): + with fluid.unique_name.guard(): + with fluid.program_guard(main, startup): + img = fluid.layers.data( + name='image', shape=[1, 28, 28], dtype='float32') + label = fluid.layers.data( + name='label', shape=[1], dtype='int64') + loss = conv_net(img, label) + if not is_test: + opt = fluid.optimizer.Adam(learning_rate=0.001) + opt.minimize(loss) + return [img, label], loss + + main = fluid.Program() + startup = fluid.Program() + test_program = fluid.Program() + + feeds, loss = build_program(main, startup, False) + build_program(test_program, startup, True) + test_program = test_program.clone(for_test=True) + + quant_transpiler = QuantizeTranspiler() + quant_transpiler.training_transpile(main) + quant_transpiler.training_transpile(test_program) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + iter = 5 + batch_size = 8 + class_num = 10 + exe.run(startup) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=500), + batch_size=batch_size) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=batch_size) + feeder = fluid.DataFeeder(feed_list=feeds, place=place) + + with fluid.program_guard(main): + for _ in range(iter): + data = next(train_reader()) + loss_v = exe.run(program=main, + feed=feeder.feed(data), + fetch_list=[loss]) + + with fluid.program_guard(test_program): + test_data = next(test_reader()) + w_var = fluid.framework._get_var('conv2d_1.w_0.quantized', + test_program) + # Testing during training + test_loss1, w_quant = exe.run(program=test_program, + feed=feeder.feed(test_data), + fetch_list=[loss, w_var]) + + # Freeze program for inference, but the weight of fc/conv is still float type. 
+ quant_transpiler.freeze_program(test_program, place) + test_loss2, = exe.run(program=test_program, + feed=feeder.feed(test_data), + fetch_list=[loss]) + self.assertAlmostEqual(test_loss1, test_loss2, delta=1e-3) + w_freeze = np.array(fluid.global_scope().find_var('conv2d_1.w_0') + .get_tensor()) + self.assertEqual(np.sum(w_freeze), np.sum(w_quant)) + + # Convert parameter to 8-bit. + quant_transpiler.convert_to_int8(test_program, place) + # Save the 8-bit parameter and model file. + fluid.io.save_inference_model('model_8bit', ['image', 'label'], + [loss], exe, test_program) + # Test whether the 8-bit parameter and model file can be loaded successfully. + [infer, feed, fetch] = fluid.io.load_inference_model('model_8bit', + exe) + # Check the loaded 8-bit weight. + w_8bit = np.array(fluid.global_scope().find_var('conv2d_1.w_0.int8') + .get_tensor()) + + self.assertEqual(w_8bit.dtype, np.int8) + self.assertEqual(np.sum(w_8bit), np.sum(w_freeze)) + + def test_freeze_program_cuda(self): + if fluid.core.is_compiled_with_cuda(): + with fluid.unique_name.guard(): + self.freeze_program(True) + + def test_freeze_program_cpu(self): + with fluid.unique_name.guard(): + self.freeze_program(False) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index bced5fd1d9c617ab614212c811e86422d65a2e56..d795b92d79b2b9c616639d2fc56f3d2be383f376 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -40,11 +40,9 @@ PADDLE_ON_MODEL_CE = os.environ.get('PADDLE_ON_MODEL_CE', None) is not None __all__ = [ 'Program', - 'Operator', 'default_startup_program', 'default_main_program', 'program_guard', - 'get_var', 'name_scope', ] @@ -663,11 +661,11 @@ class Operator(object): self._update_desc_attr(attr_name, attr_val) self.desc.check_attrs() - if self.has_kernel(type): + if self._has_kernel(type): self.desc.infer_var_type(self.block.desc) self.desc.infer_shape(self.block.desc) - def has_kernel(self, op_type): + def _has_kernel(self, op_type): return op_type not in self.OP_WITHOUT_KERNEL_SET def to_string(self, throw_on_error): @@ -708,7 +706,7 @@ class Operator(object): """ return self.desc.input(name) - def rename_input(self, old_name, new_name): + def _rename_input(self, old_name, new_name): """ Rename the `old_name` to `new_name`. @@ -719,9 +717,9 @@ class Operator(object): Returns: None """ - self.desc.rename_input(old_name, new_name) + self.desc._rename_input(old_name, new_name) - def rename_output(self, old_name, new_name): + def _rename_output(self, old_name, new_name): """ Rename the `old_name` to `new_name`. @@ -732,7 +730,7 @@ class Operator(object): Returns: None """ - self.desc.rename_output(old_name, new_name) + self.desc._rename_output(old_name, new_name) @property def input_names(self): @@ -796,7 +794,7 @@ class Operator(object): """ return self.desc.attr_type(name) - def set_attr(self, name, val): + def _set_attr(self, name, val): """ Set the value of attribute by attribute's name. @@ -829,7 +827,7 @@ class Operator(object): isinstance(val, core.ProgramDesc): self.desc.set_serialized_attr(name, val.serialize_to_string()) else: - self.desc.set_attr(name, val) + self.desc._set_attr(name, val) @property def attr_names(self): @@ -848,7 +846,7 @@ class Operator(object): """ return self.desc.attr(name) - def block_attr_id(self, name): + def _block_attr_id(self, name): """ Get the block attribute's id by name. @@ -858,9 +856,9 @@ class Operator(object): Returns: int: the block index. 
""" - return self.desc.block_attr_id(name) + return self.desc._block_attr_id(name) - def block_attr(self, name): + def _block_attr(self, name): """ Get the block attribute by name. @@ -871,11 +869,11 @@ class Operator(object): block: the block attribute. """ - id = self.block_attr_id(name) + id = self._block_attr_id(name) assert (id >= 0 and id < len(self.block.program.blocks)) return self.block.program.blocks[id] - def blocks_attr(self, name): + def _blocks_attr(self, name): """ Get the blocks attribute by name. @@ -886,13 +884,13 @@ class Operator(object): list: list of the blocks attribute. """ attrs = [] - for i in self.blocks_attr_ids(name): + for i in self._blocks_attr_ids(name): assert (i >= 0 and i < len(self.block.program.blocks)) attrs.append(self.block.program.blocks[i]) return attrs - def blocks_attr_ids(self, name): + def _blocks_attr_ids(self, name): """ Get the blocks attribute's ids by name. @@ -903,7 +901,7 @@ class Operator(object): list: list of the blocks ids. """ - return self.desc.blocks_attr_ids(name) + return self.desc._blocks_attr_ids(name) def all_attrs(self): """ @@ -917,11 +915,11 @@ class Operator(object): for n in attr_names: attr_type = self.desc.attr_type(n) if attr_type == core.AttrType.BLOCK: - attr_map[n] = self.block_attr(n) + attr_map[n] = self._block_attr(n) continue if attr_type == core.AttrType.BLOCKS: - attr_map[n] = self.blocks_attr(n) + attr_map[n] = self._blocks_attr(n) continue attr_map[n] = self.attr(n) @@ -1795,7 +1793,7 @@ class Program(object): for j in six.moves.range(block.op_size()): op = block.op(j) if op.has_attr('is_test'): - op.set_attr('is_test', True) + op._set_attr('is_test', True) res.blocks = [ Block(res, i) for i in six.moves.range(res.desc.num_blocks()) ] @@ -2169,7 +2167,7 @@ def program_guard(main_program, startup_program=None): switch_startup_program(startup_program) -def get_var(name, program=None): +def _get_var(name, program=None): """ Get a variable by name from the global block of a program. diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index e703e5ac7943b006741f12886a14bf344a6b9b28..604f3eacd75beff306915b224b30c369dd3a486f 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -600,7 +600,7 @@ def save_inference_model(dirname, """ if isinstance(feeded_var_names, six.string_types): feeded_var_names = [feeded_var_names] - else: + elif export_for_deployment: if len(feeded_var_names) > 0: # TODO(paddle-dev): polish these code blocks if not (bool(feeded_var_names) and all( @@ -610,61 +610,60 @@ def save_inference_model(dirname, if isinstance(target_vars, Variable): target_vars = [target_vars] - else: + elif export_for_deployment: if not (bool(target_vars) and all( isinstance(var, Variable) for var in target_vars)): raise ValueError("'target_vars' should be a list of Variable.") if main_program is None: main_program = default_main_program() - copy_program = main_program.clone() + + # if there is lookup table, the trainer 0 will notify all pserver to save. 
+ if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table: + lookup_table_filename = os.path.join(dirname, "__lookup_table__") + _save_lookup_tables_by_notify(executor, lookup_table_filename, + main_program._distributed_lookup_table, + main_program._endpoints) if not os.path.isdir(dirname): os.makedirs(dirname) + if model_filename is not None: + model_basename = os.path.basename(model_filename) + else: + model_basename = "__model__" + model_basename = os.path.join(dirname, model_basename) # When export_for_deployment is true, we modify the program online so that # it can only be loaded for inference directly. If it's false, the whole # original program and related meta are saved so that future usage can be # more flexible. if export_for_deployment: - global_block = copy_program.global_block() + main_program = main_program.clone() + global_block = main_program.global_block() for i, op in enumerate(global_block.ops): op.desc.set_is_target(False) if op.type == "feed" or op.type == "fetch": global_block._remove_op(i) - copy_program.desc.flush() + main_program.desc.flush() - pruned_program = copy_program._prune(targets=target_vars) - saved_program = pruned_program._inference_optimize(prune_read_op=True) + main_program = main_program._prune(targets=target_vars) + main_program = main_program._inference_optimize(prune_read_op=True) fetch_var_names = [v.name for v in target_vars] - prepend_feed_ops(saved_program, feeded_var_names) - append_fetch_ops(saved_program, fetch_var_names) + prepend_feed_ops(main_program, feeded_var_names) + append_fetch_ops(main_program, fetch_var_names) + + with open(model_basename, "wb") as f: + f.write(main_program.desc.serialize_to_string()) else: # TODO(panyx0718): Save more information so that it can also be used # for training and more flexible post-processing. - saved_program = copy_program - - if model_filename is not None: - model_filename = os.path.basename(model_filename) - else: - model_filename = "__model__" - model_filename = os.path.join(dirname, model_filename) + with open(model_basename + ".main_program", "wb") as f: + f.write(main_program.desc.serialize_to_string()) if params_filename is not None: params_filename = os.path.basename(params_filename) - - with open(model_filename, "wb") as f: - f.write(saved_program.desc.serialize_to_string()) - - save_persistables(executor, dirname, saved_program, params_filename) - - # if there is lookup table, the trainer 0 will notify all pserver to save. 
-    if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table:
-        lookup_table_filename = os.path.join(dirname, "__lookup_table__")
-        _save_lookup_tables_by_notify(executor, lookup_table_filename,
-                                      main_program._distributed_lookup_table,
-                                      main_program._endpoints)
+    save_persistables(executor, dirname, main_program, params_filename)
 
 
 def load_inference_model(dirname,
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 574d0d727cba9fa9de0cffbe116f71b9e65a7092..9772c65738a2c5373f657164e3bc379404ba642e 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -284,7 +284,7 @@ def detection_output(loc,
         target_box=loc,
         code_type='decode_center_size')
     compile_shape = scores.shape
-    run_shape = ops.shape(scores)
+    run_shape = nn.shape(scores)
     scores = nn.flatten(x=scores, axis=2)
     scores = nn.softmax(input=scores)
     scores = nn.reshape(x=scores, shape=compile_shape, actual_shape=run_shape)
@@ -697,7 +697,7 @@ def ssd_loss(location,
         raise ValueError("Only support mining_type == max_negative now.")
 
     num, num_prior, num_class = confidence.shape
-    conf_shape = ops.shape(confidence)
+    conf_shape = nn.shape(confidence)
 
     def __reshape_to_2d(var):
         return nn.flatten(x=var, axis=2)
@@ -724,7 +724,7 @@ def ssd_loss(location,
         target_label.stop_gradient = True
         conf_loss = nn.softmax_with_cross_entropy(confidence, target_label)
     # 3. Mining hard examples
-    actual_shape = ops.slice(conf_shape, axes=[0], starts=[0], ends=[2])
+    actual_shape = nn.slice(conf_shape, axes=[0], starts=[0], ends=[2])
     actual_shape.stop_gradient = True
     conf_loss = nn.reshape(
         x=conf_loss, shape=(num, num_prior), actual_shape=actual_shape)
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index d56fa76300e7054ef71a7729483a579fa35f1dac..81c78cba219007a9348af961e4b0dc227edba747 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -507,7 +507,6 @@ def py_reader(capacity,
 
        1. The basic usage of :code:`py_reader` is as follows:
 
-       >>> import paddle.v2
        >>> import paddle.fluid as fluid
        >>> import paddle.dataset.mnist as mnist
        >>>
@@ -515,7 +514,7 @@ def py_reader(capacity,
       >>>                                 shapes=[(-1,3,224,224), (-1,1)],
       >>>                                 dtypes=['float32', 'int64'])
       >>> reader.decorate_paddle_reader(
-      >>>     paddle.v2.reader.shuffle(paddle.batch(mnist.train())
+      >>>     paddle.reader.shuffle(paddle.batch(mnist.train(), 512), buf_size=8192))
       >>>
       >>> img, label = fluid.layers.read_file(reader)
       >>> loss = network(img, label)  # some network definition
@@ -534,7 +533,6 @@ def py_reader(capacity,
 
       2. When training and testing are both performed, two different
       :code:`py_reader` should be created with different names, e.g.:
 
-      >>> import paddle.v2
       >>> import paddle.fluid as fluid
       >>> import paddle.dataset.mnist as mnist
       >>>
@@ -548,7 +546,7 @@ def py_reader(capacity,
       >>>                                       dtypes=['float32', 'int64'],
       >>>                                       name='train_reader')
       >>> train_reader.decorate_paddle_reader(
-      >>>     paddle.v2.reader.shuffle(paddle.batch(mnist.train())
+      >>>     paddle.reader.shuffle(paddle.batch(mnist.train(), 512), buf_size=8192))
       >>>
       >>> test_reader = fluid.layers.py_reader(capacity=32,
       >>>                                      shapes=[(-1,3,224,224), (-1,1)],
diff --git a/python/paddle/fluid/layers/metric_op.py b/python/paddle/fluid/layers/metric_op.py
index b1598bfec210474ae1e17f9f88e8b57aa80b8452..a3064b565d096f7feda18379c66ffc8bf2f4a55c 100644
--- a/python/paddle/fluid/layers/metric_op.py
+++ b/python/paddle/fluid/layers/metric_op.py
@@ -78,7 +78,12 @@ def accuracy(input, label, k=1, correct=None, total=None):
     return acc_out
 
 
-def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
+def auc(input,
+        label,
+        curve='ROC',
+        num_thresholds=2**12 - 1,
+        topk=1,
+        slide_steps=1):
     """
     **Area Under the Curve (AUC) Layer**
 
@@ -105,6 +110,8 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
         num_thresholds(int): The number of thresholds to use when discretizing
             the roc curve. Default 200.
         topk(int): only topk number of prediction output will be used for auc.
+        slide_steps(int): when computing the batch AUC, the previous slide_steps batches are also used. slide_steps=1 means only the current batch, slide_steps=3 means the current batch plus the previous two batches, and slide_steps=0 means all batches so far. Default 1.
+
 
     Returns:
         Variable: A scalar representing the current AUC.
@@ -120,16 +127,48 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1):
     auc_out = helper.create_tmp_variable(dtype="float64")
     batch_auc_out = helper.create_tmp_variable(dtype="float64")
     # make tp, tn, fp, fn persistable, so that can accumulate all batches.
+ + # for batch auc + batch_stat_pos = helper.create_global_variable( + persistable=True, + dtype='int64', + shape=[slide_steps, num_thresholds + 1]) + batch_stat_neg = helper.create_global_variable( + persistable=True, + dtype='int64', + shape=[slide_steps, num_thresholds + 1]) + + # for global auc stat_pos = helper.create_global_variable( - persistable=True, dtype='int64', shape=[num_thresholds + 1]) + persistable=True, dtype='int64', shape=[1, num_thresholds + 1]) stat_neg = helper.create_global_variable( - persistable=True, dtype='int64', shape=[num_thresholds + 1]) + persistable=True, dtype='int64', shape=[1, num_thresholds + 1]) - for var in [stat_pos, stat_neg]: + for var in [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg]: helper.set_variable_initializer( var, Constant( value=0.0, force_cpu=True)) + # Batch AUC + helper.append_op( + type="auc", + inputs={ + "Predict": [input], + "Label": [label], + "StatPos": [batch_stat_pos], + "StatNeg": [batch_stat_neg] + }, + attrs={ + "curve": curve, + "num_thresholds": num_thresholds, + "slide_steps": slide_steps + }, + outputs={ + "AUC": [batch_auc_out], + "StatPosOut": [batch_stat_pos], + "StatNegOut": [batch_stat_neg] + }) + # Global AUC helper.append_op( type="auc", inputs={ @@ -138,12 +177,16 @@ def auc(input, label, curve='ROC', num_thresholds=2**12 - 1, topk=1): "StatPos": [stat_pos], "StatNeg": [stat_neg] }, - attrs={"curve": curve, - "num_thresholds": num_thresholds}, + attrs={ + "curve": curve, + "num_thresholds": num_thresholds, + "slide_steps": 0 + }, outputs={ "AUC": [auc_out], - "BatchAUC": [batch_auc_out], "StatPosOut": [stat_pos], "StatNegOut": [stat_neg] }) - return auc_out, batch_auc_out, [stat_pos, stat_neg] + return auc_out, batch_auc_out, [ + batch_stat_pos, batch_stat_neg, stat_pos, stat_neg + ] diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 2cb61a9cd25c744710ab7ac9ea591902740f78da..a9696ac20060d1069a99a02a79a755a740e760f0 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -29,110 +29,29 @@ from .. 
import unique_name from functools import reduce __all__ = [ - 'fc', - 'embedding', - 'dynamic_lstm', - 'dynamic_lstmp', - 'dynamic_gru', - 'gru_unit', - 'linear_chain_crf', - 'crf_decoding', - 'cos_sim', - 'cross_entropy', - 'square_error_cost', - 'chunk_eval', - 'sequence_conv', - 'conv2d', - 'conv3d', - 'sequence_pool', - 'sequence_softmax', - 'softmax', - 'pool2d', - 'pool3d', - 'batch_norm', - 'beam_search_decode', - 'conv2d_transpose', - 'conv3d_transpose', - 'sequence_expand', - 'sequence_expand_as', - 'sequence_pad', - 'lstm_unit', - 'reduce_sum', - 'reduce_mean', - 'reduce_max', - 'reduce_min', - 'reduce_prod', - 'sequence_first_step', - 'sequence_last_step', - 'dropout', - 'split', - 'ctc_greedy_decoder', - 'edit_distance', - 'l2_normalize', - 'matmul', - 'topk', - 'warpctc', - 'sequence_reshape', - 'transpose', - 'im2sequence', - 'nce', - 'hsigmoid', - 'beam_search', - 'row_conv', - 'multiplex', - 'layer_norm', - 'softmax_with_cross_entropy', - 'smooth_l1', - 'one_hot', - 'autoincreased_step_counter', - 'reshape', - 'squeeze', - 'unsqueeze', - 'lod_reset', - 'lrn', - 'pad', - 'pad_constant_like', - 'label_smooth', - 'roi_pool', - 'dice_loss', - 'image_resize', - 'image_resize_short', - 'resize_bilinear', - 'gather', - 'scatter', - 'sequence_scatter', - 'random_crop', - 'mean_iou', - 'relu', - 'log', - 'crop', - 'rank_loss', - 'elu', - 'relu6', - 'pow', - 'stanh', - 'hard_sigmoid', - 'swish', - 'prelu', - 'brelu', - 'leaky_relu', - 'soft_relu', - 'flatten', - 'sequence_mask', - 'stack', - 'pad2d', - 'unstack', - 'sequence_enumerate', - 'expand', - 'sequence_concat', - 'scale', - 'elementwise_add', - 'elementwise_div', - 'elementwise_sub', - 'elementwise_mul', - 'elementwise_max', - 'elementwise_min', - 'elementwise_pow', + 'fc', 'embedding', 'dynamic_lstm', 'dynamic_lstmp', 'dynamic_gru', + 'gru_unit', 'linear_chain_crf', 'crf_decoding', 'cos_sim', 'cross_entropy', + 'square_error_cost', 'chunk_eval', 'sequence_conv', 'conv2d', 'conv3d', + 'sequence_pool', 'sequence_softmax', 'softmax', 'pool2d', 'pool3d', + 'batch_norm', 'beam_search_decode', 'conv2d_transpose', 'conv3d_transpose', + 'sequence_expand', 'sequence_expand_as', 'sequence_pad', 'lstm_unit', + 'reduce_sum', 'reduce_mean', 'reduce_max', 'reduce_min', 'reduce_prod', + 'sequence_first_step', 'sequence_last_step', 'dropout', 'split', + 'ctc_greedy_decoder', 'edit_distance', 'l2_normalize', 'matmul', 'topk', + 'warpctc', 'sequence_reshape', 'transpose', 'im2sequence', 'nce', + 'hsigmoid', 'beam_search', 'row_conv', 'multiplex', 'layer_norm', + 'softmax_with_cross_entropy', 'smooth_l1', 'one_hot', + 'autoincreased_step_counter', 'reshape', 'squeeze', 'unsqueeze', + 'lod_reset', 'lrn', 'pad', 'pad_constant_like', 'label_smooth', 'roi_pool', + 'dice_loss', 'image_resize', 'image_resize_short', 'resize_bilinear', + 'gather', 'scatter', 'sequence_scatter', 'random_crop', 'mean_iou', 'relu', + 'log', 'crop', 'rank_loss', 'elu', 'relu6', 'pow', 'stanh', 'hard_sigmoid', + 'swish', 'prelu', 'brelu', 'leaky_relu', 'soft_relu', 'flatten', + 'sequence_mask', 'stack', 'pad2d', 'unstack', 'sequence_enumerate', + 'expand', 'sequence_concat', 'scale', 'elementwise_add', 'elementwise_div', + 'elementwise_sub', 'elementwise_mul', 'elementwise_max', 'elementwise_min', + 'elementwise_pow', 'uniform_random_batch_size_like', 'gaussian_random', + 'sampling_id', 'gaussian_random_batch_size_like', 'sum', 'slice', 'shape' ] @@ -6463,6 +6382,246 @@ def expand(x, expand_times, name=None): return out +from paddle.fluid.framework import 
convert_np_dtype_to_dtype_
+
+
+@templatedoc()
+def uniform_random_batch_size_like(input,
+                                   shape,
+                                   dtype='float32',
+                                   input_dim_idx=0,
+                                   output_dim_idx=0,
+                                   min=-1.0,
+                                   max=1.0,
+                                   seed=0):
+    """
+    ${comment}
+
+    Args:
+        input (Variable): ${input_comment}
+        shape (tuple|list): ${shape_comment}
+        input_dim_idx (Int): ${input_dim_idx_comment}
+        output_dim_idx (Int): ${output_dim_idx_comment}
+        min (Float): ${min_comment}
+        max (Float): ${max_comment}
+        seed (Int): ${seed_comment}
+        dtype(np.dtype|core.VarDesc.VarType|str): The output data type: float32, float16, int, etc.
+    Returns:
+        out (Variable): ${out_comment}
+
+    """
+
+    helper = LayerHelper('uniform_random_batch_size_like', **locals())
+    out = helper.create_tmp_variable(dtype)
+    c_dtype = convert_np_dtype_to_dtype_(dtype)
+    helper.append_op(
+        type='uniform_random_batch_size_like',
+        inputs={'Input': input},
+        outputs={'Out': out},
+        attrs={
+            'shape': shape,
+            'input_dim_idx': input_dim_idx,
+            'output_dim_idx': output_dim_idx,
+            'min': min,
+            'max': max,
+            'seed': seed,
+            'dtype': c_dtype
+        })
+
+    return out
+
+
+@templatedoc()
+def gaussian_random(shape,
+                    mean=0.0,
+                    std=1.0,
+                    seed=0,
+                    dtype='float32',
+                    use_mkldnn=False):
+    """
+    ${comment}
+
+    Args:
+        shape (tuple|list): ${shape_comment}
+        mean (Float): ${mean_comment}
+        std (Float): ${std_comment}
+        seed (Int): ${seed_comment}
+        dtype(np.dtype|core.VarDesc.VarType|str): Output data type.
+        use_mkldnn (Bool): Only used in mkldnn kernel.
+
+    Returns:
+        out (Variable): ${out_comment}
+
+    """
+
+    helper = LayerHelper('gaussian_random', **locals())
+    out = helper.create_tmp_variable(dtype)
+    c_dtype = convert_np_dtype_to_dtype_(dtype)
+    helper.append_op(
+        type='gaussian_random',
+        outputs={'Out': out},
+        attrs={
+            'shape': shape,
+            'mean': mean,
+            'std': std,
+            'seed': seed,
+            'dtype': c_dtype,
+            'use_mkldnn': use_mkldnn
+        })
+
+    return out
+
+
+@templatedoc()
+def sampling_id(x, min=0.0, max=1.0, seed=0, dtype='float32'):
+    """
+    ${comment}
+
+    Args:
+        x (Variable): ${x_comment}
+        min (Float): ${min_comment}
+        max (Float): ${max_comment}
+        seed (Int): ${seed_comment}
+        dtype(np.dtype|core.VarDesc.VarType|str): The output data type: float32, float16, int, etc.
+
+    Returns:
+        out (Variable): ${out_comment}
+
+    """
+
+    helper = LayerHelper('sampling_id', **locals())
+    out = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type='sampling_id',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs={'min': min,
+               'max': max,
+               'seed': seed})
+
+    return out
+
+
+@templatedoc()
+def gaussian_random_batch_size_like(input,
+                                    shape,
+                                    input_dim_idx=0,
+                                    output_dim_idx=0,
+                                    mean=0.0,
+                                    std=1.0,
+                                    seed=0,
+                                    dtype='float32'):
+    """
+    ${comment}
+
+    Args:
+        input (Variable): ${input_comment}
+        shape (tuple|list): ${shape_comment}
+        input_dim_idx (Int): ${input_dim_idx_comment}
+        output_dim_idx (Int): ${output_dim_idx_comment}
+        mean (Float): ${mean_comment}
+        std (Float): ${std_comment}
+        seed (Int): ${seed_comment}
+        dtype(np.dtype|core.VarDesc.VarType|str): The output data type: float32, float16, int, etc.
+
+    Returns:
+        out (Variable): ${out_comment}
+    """
+
+    helper = LayerHelper('gaussian_random_batch_size_like', **locals())
+    out = helper.create_tmp_variable(dtype)
+    c_dtype = convert_np_dtype_to_dtype_(dtype)
+    helper.append_op(
+        type='gaussian_random_batch_size_like',
+        inputs={'Input': input},
+        outputs={'Out': out},
+        attrs={
+            'shape': shape,
+            'input_dim_idx': input_dim_idx,
+            'output_dim_idx': output_dim_idx,
+            'mean': mean,
+            'std': std,
+            'seed': seed,
+            'dtype': c_dtype
+ }) + + return out + + +@templatedoc() +def sum(x, use_mkldnn=False): + """ + ${comment} + + Args: + x (Variable): ${x_comment} + use_mkldnn (Bool): ${use_mkldnn_comment} + + Returns: + out (Variable): ${out_comment} + """ + + helper = LayerHelper('sum', **locals()) + out = helper.create_tmp_variable(dtype=helper.input_dtype('x')) + helper.append_op( + type='sum', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'use_mkldnn': use_mkldnn}) + + return out + + +@templatedoc() +def slice(input, axes, starts, ends): + """ + ${comment} + + Args: + input (Variable): ${input_comment}. + axes (List): ${axes_comment} + starts (List): ${starts_comment} + ends (List): ${ends_comment} + + Returns: + out (Variable): ${out_comment} + + """ + + helper = LayerHelper('slice', **locals()) + out = helper.create_tmp_variable(dtype=helper.input_dtype('input')) + helper.append_op( + type='slice', + inputs={'Input': input}, + outputs={'Out': out}, + attrs={'axes': axes, + 'starts': starts, + 'ends': ends}) + + return out + + +@templatedoc() +def shape(input): + """ + ${comment} + + Args: + input (Variable): ${input_comment} + + Returns: + out (Variable): ${out_comment} + + """ + + helper = LayerHelper('shape', **locals()) + out = helper.create_tmp_variable(dtype=helper.input_dtype('input')) + helper.append_op( + type='shape', inputs={'Input': input}, outputs={'Out': out}) + + return out + + def _elementwise_op(helper): op_type = helper.layer_type x = helper.kwargs.get('x', None) diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 7867bfe00e25711643eab1ab8d0141dbbad3da52..220d065f8f1cc02508dea2679820e1f7f490866d 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -45,13 +45,6 @@ __all__ = [ 'logical_or', 'logical_xor', 'logical_not', - 'uniform_random_batch_size_like', - 'gaussian_random', - 'sampling_id', - 'gaussian_random_batch_size_like', - 'sum', - 'slice', - 'shape', 'maxout', ] @@ -63,6 +56,8 @@ for _OP in set(__all__): # e.g.: test_program_code.py, test_dist_train.py globals()['_scale'] = generate_layer_fn('scale') +globals()['_elementwise_div'] = generate_layer_fn('elementwise_div') + __all__ += __activations_noattr__ for _OP in set(__activations_noattr__): diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index ad09005d866b10146e6fcd7cf108c51f34322607..1b9571f6d3a6a69d1ac35f6be74b80eaa2ce6251 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -26,6 +26,7 @@ from .layer_helper import LayerHelper from .regularizer import append_regularization_ops from .clip import append_gradient_clip_ops, error_clip_callback from contextlib import contextmanager +from .layers import ops __all__ = [ 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl', @@ -1301,7 +1302,7 @@ class ModelAverage(Optimizer): x=tmp, dtype='float32' if self._dtype == None else self._dtype) sum = layers.cast( x=sum, dtype='float32' if self._dtype == None else self._dtype) - layers.elementwise_div(x=sum, y=tmp, out=param) + ops._elementwise_div(x=sum, y=tmp, out=param) def _add_average_restore_op(self, block, param_grad): param = block._clone_variable(param_grad[0]) diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py index 4b4f3e403776625fb5ca2f9b03d14ee7efe23d53..4a70976a4837c668a5e0ba6d49b598d046a8ec5d 100644 --- a/python/paddle/fluid/tests/book/test_recognize_digits.py +++ 
b/python/paddle/fluid/tests/book/test_recognize_digits.py @@ -67,6 +67,7 @@ def train(nn_type, use_cuda, parallel, save_dirname=None, + save_full_dirname=None, model_filename=None, params_filename=None, is_local=True): @@ -143,6 +144,13 @@ def train(nn_type, exe, model_filename=model_filename, params_filename=params_filename) + if save_full_dirname is not None: + fluid.io.save_inference_model( + save_full_dirname, [], [], + exe, + model_filename=model_filename, + params_filename=params_filename, + export_for_deployment=False) return else: print( @@ -214,10 +222,12 @@ def infer(use_cuda, def main(use_cuda, parallel, nn_type, combine): save_dirname = None + save_full_dirname = None model_filename = None params_filename = None if not use_cuda and not parallel: save_dirname = "recognize_digits_" + nn_type + ".inference.model" + save_full_dirname = "recognize_digits_" + nn_type + ".train.model" if combine == True: model_filename = "__model_combined__" params_filename = "__params_combined__" @@ -228,6 +238,7 @@ def main(use_cuda, parallel, nn_type, combine): use_cuda=use_cuda, parallel=parallel, save_dirname=save_dirname, + save_full_dirname=save_full_dirname, model_filename=model_filename, params_filename=params_filename) infer( diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index d02c890209e65bdceb5da23ba5b9c7c0356174b8..723f9eb9c978755b77724100c266be199e0f301a 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -28,7 +28,6 @@ list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/Paddl list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not a test - if(APPLE) if(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_OPS test_desc_clone) diff --git a/python/paddle/fluid/tests/unittests/dist_ctr.py b/python/paddle/fluid/tests/unittests/dist_ctr.py new file mode 100644 index 0000000000000000000000000000000000000000..902dc6544ed6858c4cd8d64b14d6af2367059091 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_ctr.py @@ -0,0 +1,109 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
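Illustration (editor's sketch, not part of the patch): the test_recognize_digits.py change above exercises the export_for_deployment=False branch added to io.py earlier in this patch. A minimal usage sketch, with an illustrative one-layer network and made-up directory names:

    import paddle.fluid as fluid

    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
    prediction = fluid.layers.fc(input=img, size=10, act='softmax')
    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())

    # default mode: pruned program with feed/fetch ops, saved as "__model__"
    fluid.io.save_inference_model('model.infer', ['image'], [prediction], exe)
    # new mode: the whole program is saved as "__model__.main_program";
    # the feed/fetch var lists are not validated in this branch
    fluid.io.save_inference_model('model.full', [], [], exe,
                                  export_for_deployment=False)
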
+
+from __future__ import print_function
+
+import paddle
+import paddle.fluid as fluid
+
+import dist_ctr_reader
+from test_dist_base import TestDistRunnerBase, runtime_main
+
+IS_SPARSE = True
+
+# Fix seed for test
+fluid.default_startup_program().random_seed = 1
+fluid.default_main_program().random_seed = 1
+
+
+class TestDistCTR2x2(TestDistRunnerBase):
+    def get_model(self, batch_size=2):
+        dnn_input_dim, lr_input_dim = dist_ctr_reader.load_data_meta()
+        # network definition
+        dnn_data = fluid.layers.data(
+            name="dnn_data",
+            shape=[-1, 1],
+            dtype="int64",
+            lod_level=1,
+            append_batch_size=False)
+        lr_data = fluid.layers.data(
+            name="lr_data",
+            shape=[-1, 1],
+            dtype="int64",
+            lod_level=1,
+            append_batch_size=False)
+        label = fluid.layers.data(
+            name="click",
+            shape=[-1, 1],
+            dtype="int64",
+            lod_level=0,
+            append_batch_size=False)
+
+        # build dnn model
+        dnn_layer_dims = [128, 64, 32, 1]
+        dnn_embedding = fluid.layers.embedding(
+            is_distributed=False,
+            input=dnn_data,
+            size=[dnn_input_dim, dnn_layer_dims[0]],
+            param_attr=fluid.ParamAttr(
+                name="deep_embedding",
+                initializer=fluid.initializer.Constant(value=0.01)),
+            is_sparse=IS_SPARSE)
+        dnn_pool = fluid.layers.sequence_pool(
+            input=dnn_embedding, pool_type="sum")
+        dnn_out = dnn_pool
+        for i, dim in enumerate(dnn_layer_dims[1:]):
+            fc = fluid.layers.fc(
+                input=dnn_out,
+                size=dim,
+                act="relu",
+                param_attr=fluid.ParamAttr(
+                    initializer=fluid.initializer.Constant(value=0.01)),
+                name='dnn-fc-%d' % i)
+            dnn_out = fc
+
+        # build lr model
+        lr_embedding = fluid.layers.embedding(
+            is_distributed=False,
+            input=lr_data,
+            size=[lr_input_dim, 1],
+            param_attr=fluid.ParamAttr(
+                name="wide_embedding",
+                initializer=fluid.initializer.Constant(value=0.01)),
+            is_sparse=IS_SPARSE)
+        lr_pool = fluid.layers.sequence_pool(input=lr_embedding, pool_type="sum")
+
+        merge_layer = fluid.layers.concat(input=[dnn_out, lr_pool], axis=1)
+
+        predict = fluid.layers.fc(input=merge_layer, size=2, act='softmax')
+        acc = fluid.layers.accuracy(input=predict, label=label)
+        auc_var, batch_auc_var, auc_states = fluid.layers.auc(input=predict,
+                                                              label=label)
+        cost = fluid.layers.cross_entropy(input=predict, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+
+        inference_program = paddle.fluid.default_main_program().clone()
+
+        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001)
+        sgd_optimizer.minimize(avg_cost)
+
+        dataset = dist_ctr_reader.Dataset()
+        train_reader = paddle.batch(dataset.train(), batch_size=batch_size)
+        test_reader = paddle.batch(dataset.test(), batch_size=batch_size)
+
+        return inference_program, avg_cost, train_reader, test_reader, None, predict
+
+
+if __name__ == "__main__":
+    runtime_main(TestDistCTR2x2)
diff --git a/python/paddle/fluid/tests/unittests/dist_ctr_reader.py b/python/paddle/fluid/tests/unittests/dist_ctr_reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..95e39d891f7e6a3dcb57540bd96fe70027443cda
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dist_ctr_reader.py
@@ -0,0 +1,172 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import paddle +import tarfile + +logging.basicConfig() +logger = logging.getLogger("paddle") +logger.setLevel(logging.INFO) + +DATA_URL = "http://paddle-ctr-data.cdn.bcebos.com/avazu_ctr_data.tgz" +DATA_MD5 = "c11df99fbd14e53cd4bfa6567344b26e" +""" +avazu_ctr_data/train.txt +avazu_ctr_data/infer.txt +avazu_ctr_data/test.txt +avazu_ctr_data/data.meta.txt +""" + + +def read_data(file_name): + path = paddle.dataset.common.download(DATA_URL, "avazu_ctr_data", DATA_MD5) + tar = tarfile.open(path, "r:gz") + tar_info = None + for member in tar.getmembers(): + if member.name.endswith(file_name): + tar_info = member + f = tar.extractfile(tar_info) + ret_lines = [_.decode('utf-8') for _ in f.readlines()] + return ret_lines + + +class TaskMode: + TRAIN_MODE = 0 + TEST_MODE = 1 + INFER_MODE = 2 + + def __init__(self, mode): + self.mode = mode + + def is_train(self): + return self.mode == self.TRAIN_MODE + + def is_test(self): + return self.mode == self.TEST_MODE + + def is_infer(self): + return self.mode == self.INFER_MODE + + @staticmethod + def create_train(): + return TaskMode(TaskMode.TRAIN_MODE) + + @staticmethod + def create_test(): + return TaskMode(TaskMode.TEST_MODE) + + @staticmethod + def create_infer(): + return TaskMode(TaskMode.INFER_MODE) + + +class ModelType: + CLASSIFICATION = 0 + REGRESSION = 1 + + def __init__(self, mode): + self.mode = mode + + def is_classification(self): + return self.mode == self.CLASSIFICATION + + def is_regression(self): + return self.mode == self.REGRESSION + + @staticmethod + def create_classification(): + return ModelType(ModelType.CLASSIFICATION) + + @staticmethod + def create_regression(): + return ModelType(ModelType.REGRESSION) + + +def load_dnn_input_record(sent): + return list(map(int, sent.split())) + + +def load_lr_input_record(sent): + res = [] + for _ in [x.split(':') for x in sent.split()]: + res.append(int(_[0])) + return res + + +feeding_index = {'dnn_input': 0, 'lr_input': 1, 'click': 2} + + +class Dataset(object): + def train(self): + ''' + Load trainset. + ''' + file_name = "train.txt" + logger.info("load trainset from %s" % file_name) + mode = TaskMode.create_train() + return self._parse_creator(file_name, mode) + + def test(self): + ''' + Load testset. + ''' + file_name = "test.txt" + logger.info("load testset from %s" % file_name) + mode = TaskMode.create_test() + return self._parse_creator(file_name, mode) + + def infer(self): + ''' + Load infer set. + ''' + file_name = "infer.txt" + logger.info("load inferset from %s" % file_name) + mode = TaskMode.create_infer() + return self._parse_creator(file_name, mode) + + def _parse_creator(self, file_name, mode): + ''' + Parse dataset. 
+ ''' + + def _parse(): + data = read_data(file_name) + for line_id, line in enumerate(data): + fs = line.strip().split('\t') + dnn_input = load_dnn_input_record(fs[0]) + lr_input = load_lr_input_record(fs[1]) + if not mode.is_infer(): + click = int(fs[2]) + yield [dnn_input, lr_input, click] + else: + yield [dnn_input, lr_input] + + return _parse + + +def load_data_meta(): + ''' + load data meta info from path, return (dnn_input_dim, lr_input_dim) + ''' + lines = read_data('data.meta.txt') + err_info = "wrong meta format" + assert len(lines) == 2, err_info + assert 'dnn_input_dim:' in lines[0] and 'lr_input_dim:' in lines[ + 1], err_info + res = map(int, [_.split(':')[1] for _ in lines]) + res = list(res) + logger.info('dnn input dim: %d' % res[0]) + logger.info('lr input dim: %d' % res[1]) + return res diff --git a/python/paddle/fluid/tests/unittests/dist_mnist.py b/python/paddle/fluid/tests/unittests/dist_mnist.py index 85a96c0b53f6bc08687965048d6251265055a6fe..877d21ae882ab4efb49beb6a846ab71a22c2aab7 100644 --- a/python/paddle/fluid/tests/unittests/dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/dist_mnist.py @@ -47,7 +47,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( - value=0.3))) + value=0.01))) conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, @@ -56,7 +56,7 @@ def cnn_model(data): pool_stride=2, act="relu", param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( - value=0.2))) + value=0.01))) SIZE = 10 input_shape = conv_pool_2.shape @@ -68,7 +68,7 @@ def cnn_model(data): size=SIZE, act="softmax", param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Constant(value=0.1))) + initializer=fluid.initializer.Constant(value=0.01))) return predict diff --git a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py new file mode 100644 index 0000000000000000000000000000000000000000..6456d1b53a129db04ace7ff4413a3d76e922ccde --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py @@ -0,0 +1,238 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
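# ---------------------------------------------------------------------------
# [Editor's note] Illustrative sketch, not part of the patch: every reader in
# dist_ctr_reader.py above follows the same protocol -- a zero-argument
# callable returning a generator of samples -- which paddle.batch() then
# wraps into a batched reader.  The toy reader below (hypothetical names,
# fake in-memory data) mirrors the [dnn_input, lr_input, click] samples that
# _parse() yields, assuming only that paddle is importable.
import paddle


def toy_ctr_reader():
    records = [([1, 2, 3], [7, 9], 0), ([4, 5], [8], 1), ([6], [10, 11], 0)]

    def _parse():
        for dnn_input, lr_input, click in records:
            yield [dnn_input, lr_input, click]

    return _parse


batched = paddle.batch(toy_ctr_reader(), batch_size=2)
for mini_batch in batched():
    print(mini_batch)  # a list of up to two [dnn_input, lr_input, click] samples
# ---------------------------------------------------------------------------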
+ +from __future__ import print_function + +import numpy as np +import argparse +import time +import math +import random + +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +from paddle.fluid import core +import unittest +from multiprocessing import Process +import os +import signal +from functools import reduce +from test_dist_base import TestDistRunnerBase, runtime_main + +DTYPE = "int64" +DATA_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/simnet.train.1000' +DATA_MD5 = '24e49366eb0611c552667989de2f57d5' + +# For Net +base_lr = 0.2 +emb_lr = base_lr * 3 +dict_dim = 1500 +emb_dim = 128 +hid_dim = 128 +margin = 0.1 +sample_rate = 1 + +# Fix seed for test +fluid.default_startup_program().random_seed = 1 +fluid.default_main_program().random_seed = 1 + + +def get_acc(cos_q_nt, cos_q_pt, batch_size): + cond = fluid.layers.less_than(cos_q_nt, cos_q_pt) + cond = fluid.layers.cast(cond, dtype='float64') + cond_3 = fluid.layers.reduce_sum(cond) + acc = fluid.layers.elementwise_div( + cond_3, + fluid.layers.fill_constant( + shape=[1], value=batch_size * 1.0, dtype='float64'), + name="simnet_acc") + return acc + + +def get_loss(cos_q_pt, cos_q_nt): + loss_op1 = fluid.layers.elementwise_sub( + fluid.layers.fill_constant_batch_size_like( + input=cos_q_pt, shape=[-1, 1], value=margin, dtype='float32'), + cos_q_pt) + loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt) + loss_op3 = fluid.layers.elementwise_max( + fluid.layers.fill_constant_batch_size_like( + input=loss_op2, shape=[-1, 1], value=0.0, dtype='float32'), + loss_op2) + avg_cost = fluid.layers.mean(loss_op3) + return avg_cost + + +def get_optimizer(): + # SGD optimizer + optimizer = fluid.optimizer.SGD(learning_rate=base_lr) + return optimizer + + +def train_network(batch_size, is_distributed=False, is_sparse=False): + # query + q = fluid.layers.data( + name="query_ids", shape=[1], dtype="int64", lod_level=1) + ## embedding + q_emb = fluid.layers.embedding( + input=q, + is_distributed=is_distributed, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__", + learning_rate=emb_lr), + is_sparse=is_sparse) + ## vsum + q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum') + q_ss = fluid.layers.softsign(q_sum) + ## fc layer after conv + q_fc = fluid.layers.fc( + input=q_ss, + size=hid_dim, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__q_fc__", + learning_rate=base_lr)) + # label data + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + # pt + pt = fluid.layers.data( + name="pos_title_ids", shape=[1], dtype="int64", lod_level=1) + ## embedding + pt_emb = fluid.layers.embedding( + input=pt, + is_distributed=is_distributed, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__emb__", + learning_rate=emb_lr), + is_sparse=is_sparse) + ## vsum + pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum') + pt_ss = fluid.layers.softsign(pt_sum) + ## fc layer + pt_fc = fluid.layers.fc( + input=pt_ss, + size=hid_dim, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01), + name="__fc__", + learning_rate=base_lr), + bias_attr=fluid.ParamAttr(name="__fc_b__")) + # nt + nt = fluid.layers.data( + name="neg_title_ids", shape=[1], dtype="int64", lod_level=1) + ## embedding + nt_emb = fluid.layers.embedding( + input=nt, + is_distributed=is_distributed, + 
size=[dict_dim, emb_dim],
+        param_attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Constant(value=0.01),
+            name="__emb__",
+            learning_rate=emb_lr),
+        is_sparse=is_sparse)
+    ## vsum
+    nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum')
+    nt_ss = fluid.layers.softsign(nt_sum)
+    ## fc layer
+    nt_fc = fluid.layers.fc(
+        input=nt_ss,
+        size=hid_dim,
+        param_attr=fluid.ParamAttr(
+            initializer=fluid.initializer.Constant(value=0.01),
+            name="__fc__",
+            learning_rate=base_lr),
+        bias_attr=fluid.ParamAttr(name="__fc_b__"))
+    cos_q_pt = fluid.layers.cos_sim(q_fc, pt_fc)
+    cos_q_nt = fluid.layers.cos_sim(q_fc, nt_fc)
+    # loss
+    avg_cost = get_loss(cos_q_pt, cos_q_nt)
+    # acc
+    acc = get_acc(cos_q_nt, cos_q_pt, batch_size)
+    return [avg_cost, acc, cos_q_pt]
+
+
+def combination(x, y):
+    res = [[[xi, yi] for yi in y] for xi in x]
+    return res[0]
+
+
+def get_one_data(file_list):
+    for file in file_list:
+        contents = []
+        with open(file, "r") as fin:
+            for i in fin:
+                contents.append(i.strip())
+        for index, q in enumerate(contents):
+            try:
+                one_data = [[int(j) for j in i.split(" ")]
+                            for i in q.split(";")[:-1]]
+                if one_data[1][0] + one_data[1][1] != len(one_data) - 3:
+                    # skip malformed records
+                    continue
+                tmp = combination(one_data[3:3 + one_data[1][0]],
+                                  one_data[3 + one_data[1][0]:])
+            except Exception as e:
+                continue
+
+            for each in tmp:
+                yield [one_data[2], 0, each[0], each[1]]
+
+
+def get_batch_reader(file_list, batch_size):
+    def batch_reader():
+        res = []
+        for i in get_one_data(file_list):
+            if random.random() <= sample_rate:
+                res.append(i)
+            if len(res) >= batch_size:
+                yield res
+                res = []
+
+    return batch_reader
+
+
+def get_train_reader(batch_size):
+    # The training data set.
+    train_file = os.path.join(paddle.dataset.common.DATA_HOME, "simnet",
+                              "train")
+    train_reader = get_batch_reader([train_file], batch_size)
+    train_feed = ["query_ids", "pos_title_ids", "neg_title_ids", "label"]
+    return train_reader, train_feed
+
+
+class TestDistSimnetBow2x2(TestDistRunnerBase):
+    def get_model(self, batch_size=2):
+        # Train program
+        avg_cost, acc, predict = \
+            train_network(batch_size, bool(int(os.environ["IS_DISTRIBUTED"])), bool(int(os.environ["IS_SPARSE"])))
+
+        inference_program = fluid.default_main_program().clone()
+
+        # Optimization
+        opt = get_optimizer()
+        opt.minimize(avg_cost)
+
+        # Reader
+        train_reader, _ = get_train_reader(batch_size)
+        return inference_program, avg_cost, train_reader, train_reader, acc, predict
+
+
+if __name__ == "__main__":
+    paddle.dataset.common.download(DATA_URL, 'simnet', DATA_MD5, "train")
+    runtime_main(TestDistSimnetBow2x2)
diff --git a/python/paddle/fluid/tests/unittests/dist_text_classification.py b/python/paddle/fluid/tests/unittests/dist_text_classification.py
new file mode 100644
index 0000000000000000000000000000000000000000..095a474fd3ac056c678f9051ed80ef363ae968c9
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dist_text_classification.py
@@ -0,0 +1,231 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import argparse +import time +import math + +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +from paddle.fluid import core +import unittest +from multiprocessing import Process +import os +import signal +import six +import tarfile +import string +import re +from functools import reduce +from test_dist_base import TestDistRunnerBase, runtime_main + +DTYPE = "float32" +VOCAB_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/imdb.vocab' +VOCAB_MD5 = '23c86a0533c0151b6f12fa52b106dcc2' +DATA_URL = 'http://paddle-dist-ce-data.bj.bcebos.com/text_classification.tar.gz' +DATA_MD5 = '29ebfc94f11aea9362bbb7f5e9d86b8a' + + +# Load dictionary. +def load_vocab(filename): + vocab = {} + if six.PY2: + with open(filename, 'r') as f: + for idx, line in enumerate(f): + vocab[line.strip()] = idx + else: + with open(filename, 'r', encoding="utf-8") as f: + for idx, line in enumerate(f): + vocab[line.strip()] = idx + return vocab + + +def get_worddict(dict_path): + word_dict = load_vocab(dict_path) + word_dict[""] = len(word_dict) + dict_dim = len(word_dict) + return word_dict, dict_dim + + +def conv_net(input, + dict_dim, + emb_dim=128, + window_size=3, + num_filters=128, + fc0_dim=96, + class_dim=2): + emb = fluid.layers.embedding( + input=input, + size=[dict_dim, emb_dim], + is_sparse=False, + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=0.01))) + + conv_3 = fluid.nets.sequence_conv_pool( + input=emb, + num_filters=num_filters, + filter_size=window_size, + act="tanh", + pool_type="max", + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01))) + + fc_0 = fluid.layers.fc( + input=[conv_3], + size=fc0_dim, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01))) + + prediction = fluid.layers.fc( + input=[fc_0], + size=class_dim, + act="softmax", + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01))) + + return prediction + + +def inference_network(dict_dim): + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + out = conv_net(data, dict_dim) + return out + + +def get_reader(word_dict, batch_size): + # The training data set. + train_reader = paddle.batch(train(word_dict), batch_size=batch_size) + + # The testing data set. 
+    test_reader = paddle.batch(test(word_dict), batch_size=batch_size)
+
+    return train_reader, test_reader
+
+
+def get_optimizer(learning_rate):
+    optimizer = fluid.optimizer.SGD(learning_rate=learning_rate)
+    return optimizer
+
+
+class TestDistTextClassification2x2(TestDistRunnerBase):
+    def get_model(self, batch_size=2):
+        vocab = os.path.join(paddle.dataset.common.DATA_HOME,
+                             "text_classification", "imdb.vocab")
+        word_dict, dict_dim = get_worddict(vocab)
+
+        # Input data
+        data = fluid.layers.data(
+            name="words", shape=[1], dtype="int64", lod_level=1)
+        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+        # Train program
+        predict = conv_net(data, dict_dim)
+        cost = fluid.layers.cross_entropy(input=predict, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+        acc = fluid.layers.accuracy(input=predict, label=label)
+        inference_program = fluid.default_main_program().clone()
+
+        # Optimization
+        opt = get_optimizer(learning_rate=0.001)
+        opt.minimize(avg_cost)
+
+        # Reader
+        train_reader, test_reader = get_reader(word_dict, batch_size)
+
+        return inference_program, avg_cost, train_reader, test_reader, acc, predict
+
+
+def tokenize(pattern):
+    """
+    Read files that match the given pattern.  Tokenize and yield each file.
+    """
+
+    with tarfile.open(
+            paddle.dataset.common.download(DATA_URL, 'text_classification',
+                                           DATA_MD5)) as tarf:
+        # Note that we should use tarfile.next(), which does
+        # sequential access of member files, rather than
+        # tarfile.extractfile, which does random access and might
+        # destroy hard disks.
+        tf = tarf.next()
+        while tf is not None:
+            if bool(pattern.match(tf.name)):
+                # newline and punctuations removal and ad-hoc tokenization.
+                yield tarf.extractfile(tf).read().rstrip(six.b(
+                    "\n\r")).translate(
+                        None, six.b(string.punctuation)).lower().split()
+            tf = tarf.next()
+
+
+def reader_creator(pos_pattern, neg_pattern, word_idx):
+    UNK = word_idx['<unk>']
+    INS = []
+
+    def load(pattern, out, label):
+        for doc in tokenize(pattern):
+            out.append(([word_idx.get(w, UNK) for w in doc], label))
+
+    load(pos_pattern, INS, 0)
+    load(neg_pattern, INS, 1)
+
+    def reader():
+        for doc, label in INS:
+            yield doc, label
+
+    return reader
+
+
+def train(word_idx):
+    """
+    IMDB training set creator.
+
+    It returns a reader creator, each sample in the reader is a zero-based ID
+    sequence and label in [0, 1].
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :return: Training reader creator
+    :rtype: callable
+    """
+    return reader_creator(
+        re.compile("train/pos/.*\.txt$"),
+        re.compile("train/neg/.*\.txt$"), word_idx)
+
+
+def test(word_idx):
+    """
+    IMDB test set creator.
+
+    It returns a reader creator, each sample in the reader is a zero-based ID
+    sequence and label in [0, 1].
+ + :param word_idx: word dictionary + :type word_idx: dict + :return: Test reader creator + :rtype: callable + """ + return reader_creator( + re.compile("test/pos/.*\.txt$"), + re.compile("test/neg/.*\.txt$"), word_idx) + + +if __name__ == "__main__": + paddle.dataset.common.download(VOCAB_URL, 'text_classification', VOCAB_MD5) + paddle.dataset.common.download(DATA_URL, 'text_classification', DATA_MD5) + runtime_main(TestDistTextClassification2x2) diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py index 175bd130e5a8324227953eeeb769474e78f94fd2..a2cc57425841100a2b61279d1b447b88ed4b9a54 100644 --- a/python/paddle/fluid/tests/unittests/dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/dist_transformer.py @@ -1488,7 +1488,7 @@ def wrap_decoder(trg_vocab_size, if weight_sharing: predict = layers.matmul( x=dec_output, - y=fluid.get_var(word_emb_param_names[0]), + y=fluid.framework._get_var(word_emb_param_names[0]), transpose_y=True) else: predict = layers.fc(input=dec_output, @@ -1699,10 +1699,9 @@ class DistTransformer2x2(TestDistRunnerBase): exe.run(startup_prog) exe.run(pserver_prog) - def run_trainer(self, use_cuda, args): - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - TrainTaskConfig.use_gpu = use_cuda - sum_cost, avg_cost, predict, token_num, local_lr_scheduler, test_program = get_model( + def run_trainer(self, args): + TrainTaskConfig.use_gpu = args.use_cuda + sum_cost, avg_cost, predict, token_num, local_lr_scheduler = get_model( args.is_dist, not args.sync_mode) if args.is_dist: @@ -1718,6 +1717,11 @@ class DistTransformer2x2(TestDistRunnerBase): TrainTaskConfig.batch_size = 20 trainer_prog = fluid.default_main_program() + if args.use_cuda: + place = fluid.CUDAPlace(0) + else: + place = fluid.CPUPlace() + startup_exe = fluid.Executor(place) TrainTaskConfig.local = not args.is_dist diff --git a/python/paddle/fluid/tests/unittests/dist_word2vec.py b/python/paddle/fluid/tests/unittests/dist_word2vec.py index f3e740fc7027a4a562b836c3113b87d55062c185..835306edd0f17490dd10110db40f42dce30b25bb 100644 --- a/python/paddle/fluid/tests/unittests/dist_word2vec.py +++ b/python/paddle/fluid/tests/unittests/dist_word2vec.py @@ -122,4 +122,7 @@ class TestDistWord2vec2x2(TestDistRunnerBase): if __name__ == "__main__": + import os + os.environ['CPU_NUM'] = '1' + os.environ['USE_CUDA'] = "FALSE" runtime_main(TestDistWord2vec2x2) diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index b5549c507ed753f4504afd655be59b444164e6f3..e97643cddef22465436051a41ef4b825e9634d23 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -345,7 +345,7 @@ class OpTest(unittest.TestCase): actual_t, expect_t, atol=atol, equal_nan=equal_nan), "Output (" + out_name + ") has diff at " + str(place) + "\nExpect " + str(expect_t) + "\n" + "But Got" + - str(actual_t)) + str(actual_t) + " in class " + self.__class__.__name__) if isinstance(expect, tuple): self.assertListEqual(actual.recursive_sequence_lengths(), expect[1], "Output (" + out_name + diff --git a/python/paddle/fluid/tests/unittests/test_auc_op.py b/python/paddle/fluid/tests/unittests/test_auc_op.py index 1de4a9d016a177944253d12094722d3a05614be2..810e8a1a8547a92de923877695178e780981edeb 100644 --- a/python/paddle/fluid/tests/unittests/test_auc_op.py +++ b/python/paddle/fluid/tests/unittests/test_auc_op.py @@ -36,7 +36,11 @@ class TestAucOp(OpTest): 
"StatPos": stat_pos, "StatNeg": stat_neg } - self.attrs = {'curve': 'ROC', 'num_thresholds': num_thresholds} + self.attrs = { + 'curve': 'ROC', + 'num_thresholds': num_thresholds, + "slide_steps": 1 + } python_auc = metrics.Auc(name="auc", curve='ROC', @@ -45,7 +49,6 @@ class TestAucOp(OpTest): self.outputs = { 'AUC': np.array(python_auc.eval()), - 'BatchAUC': np.array(python_auc.eval()), 'StatPosOut': np.array(python_auc._stat_pos), 'StatNegOut': np.array(python_auc._stat_neg) } diff --git a/python/paddle/fluid/tests/unittests/test_detection_map_op.py b/python/paddle/fluid/tests/unittests/test_detection_map_op.py index f6eb8f2c6d8b94f92e24ff789c91efb53a645a46..0c5343a97d5ef0f97fc6b144dfc82174eacb8573 100644 --- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py +++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py @@ -20,6 +20,7 @@ import six import sys import collections import math +import paddle.fluid as fluid from op_test import OpTest @@ -32,7 +33,7 @@ class TestDetectionMAPOp(OpTest): self.detect = np.array(self.detect).astype('float32') self.mAP = np.array(self.mAP).astype('float32') - if (len(self.class_pos_count) > 0): + if len(self.class_pos_count) > 0: self.class_pos_count = np.array(self.class_pos_count).astype( 'int32') self.true_pos = np.array(self.true_pos).astype('float32') @@ -273,7 +274,7 @@ class TestDetectionMAPOp11Point(TestDetectionMAPOp): class TestDetectionMAPOpMultiBatch(TestDetectionMAPOp): def init_test_case(self): super(TestDetectionMAPOpMultiBatch, self).init_test_case() - self.class_pos_count = [0, 2, 1] + self.class_pos_count = [0, 2, 1, 0] self.true_pos_lod = [[0, 3, 2]] self.true_pos = [[0.7, 1.], [0.3, 0.], [0.2, 1.], [0.8, 0.], [0.1, 1.]] self.false_pos_lod = [[0, 3, 2]] diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 37cad73019c529f64868b6ad3c6e2fffe59cc0d8..0b9af6d7f6d5eb2ba81c04a51169127bbdba1b1a 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -18,23 +18,27 @@ import time import unittest import os import sys -import six import signal import subprocess +import six import argparse +import paddle.fluid as fluid + +RUN_STEP = 10 + class TestDistRunnerBase(object): def get_model(self, batch_size=2): raise NotImplementedError( "get_model should be implemented by child classes.") - def get_transpiler(self, trainer_id, main_program, pserver_endpoints, - trainers, sync_mode): + @staticmethod + def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers, + sync_mode): # NOTE: import fluid until runtime, or else forking processes will cause error. 
- import paddle - import paddle.fluid as fluid - t = fluid.DistributeTranspiler() + config = fluid.DistributeTranspilerConfig() + t = fluid.DistributeTranspiler(config=config) t.transpile( trainer_id=trainer_id, program=main_program, @@ -44,11 +48,9 @@ class TestDistRunnerBase(object): return t def run_pserver(self, args): - import paddle - import paddle.fluid as fluid + self.get_model(batch_size=2) - if args.mem_opt: - fluid.memory_optimize(fluid.default_main_program()) + # NOTE: pserver should not call memory optimize t = self.get_transpiler(args.trainer_id, fluid.default_main_program(), args.endpoints, args.trainers, args.sync_mode) @@ -61,29 +63,34 @@ class TestDistRunnerBase(object): exe.run(startup_prog) exe.run(pserver_prog) - def run_trainer(self, use_cuda, args): - import paddle - import paddle.fluid as fluid - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + def run_trainer(self, args): test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \ self.get_model(batch_size=2) + if args.mem_opt: - fluid.memory_optimize(fluid.default_main_program()) + fluid.memory_optimize(fluid.default_main_program(), skip_grads=True) if args.is_dist: t = self.get_transpiler(args.trainer_id, fluid.default_main_program(), args.endpoints, args.trainers, args.sync_mode) + trainer_prog = t.get_trainer_program() else: trainer_prog = fluid.default_main_program() + if args.use_cuda: + place = fluid.CUDAPlace(0) + else: + place = fluid.CPUPlace() + startup_exe = fluid.Executor(place) startup_exe.run(fluid.default_startup_program()) strategy = fluid.ExecutionStrategy() strategy.num_threads = 1 strategy.allow_op_delay = False + build_stra = fluid.BuildStrategy() if args.use_reduce: @@ -92,7 +99,7 @@ class TestDistRunnerBase(object): build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce exe = fluid.ParallelExecutor( - use_cuda, + args.use_cuda, loss_name=avg_cost.name, exec_strategy=strategy, build_strategy=build_stra) @@ -103,27 +110,26 @@ class TestDistRunnerBase(object): ] feeder = fluid.DataFeeder(feed_var_list, place) - reader_generator = test_reader() - - data = next(reader_generator) - first_loss, = exe.run(fetch_list=[avg_cost.name], - feed=feeder.feed(data)) - print(first_loss) + reader_generator = train_reader() - for i in six.moves.xrange(5): - data = next(reader_generator) - loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data)) + def get_data(): + origin_batch = next(reader_generator) + if args.is_dist and args.use_reader_alloc: + new_batch = [] + for offset, item in enumerate(origin_batch): + if offset % 2 == args.trainer_id: + new_batch.append(item) + return new_batch + else: + return origin_batch - data = next(reader_generator) - last_loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data)) - print(last_loss) + for _ in six.moves.xrange(RUN_STEP): + loss, = exe.run(fetch_list=[avg_cost.name], + feed=feeder.feed(get_data())) + print(loss) def runtime_main(test_class): - import paddle - import paddle.fluid as fluid - import paddle.fluid.core as core - parser = argparse.ArgumentParser(description='Run dist test.') parser.add_argument( '--role', type=str, required=True, choices=['pserver', 'trainer']) @@ -135,7 +141,10 @@ def runtime_main(test_class): '--current_endpoint', type=str, required=False, default="") parser.add_argument('--sync_mode', action='store_true') parser.add_argument('--mem_opt', action='store_true') + parser.add_argument('--use_cuda', action='store_true') parser.add_argument('--use_reduce', 
action='store_true') + parser.add_argument( + '--use_reader_alloc', action='store_true', required=False, default=True) args = parser.parse_args() @@ -143,8 +152,7 @@ def runtime_main(test_class): if args.role == "pserver" and args.is_dist: model.run_pserver(args) else: - use_cuda = True if core.is_compiled_with_cuda() else False - model.run_trainer(use_cuda, args) + model.run_trainer(args) import paddle.compat as cpt @@ -163,8 +171,10 @@ class TestDistBase(unittest.TestCase): self._find_free_port(), self._find_free_port()) self._python_interp = "python" self._sync_mode = True + self._use_cuda = True self._mem_opt = False self._use_reduce = False + self._use_reader_alloc = True self._setup_config() def _find_free_port(self): @@ -172,15 +182,15 @@ class TestDistBase(unittest.TestCase): s.bind(('', 0)) return s.getsockname()[1] - def start_pserver(self, model_file, check_error_log): + def start_pserver(self, model_file, check_error_log, required_envs): ps0_ep, ps1_ep = self._ps_endpoints.split(",") ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist" ps0_cmd = ps_cmd % \ - (self._python_interp, model_file, self._ps_endpoints, ps0_ep, - self._trainers) + (self._python_interp, model_file, self._ps_endpoints, ps0_ep, + self._trainers) ps1_cmd = ps_cmd % \ - (self._python_interp, model_file, self._ps_endpoints, ps1_ep, - self._trainers) + (self._python_interp, model_file, self._ps_endpoints, ps1_ep, + self._trainers) if self._sync_mode: ps0_cmd += " --sync_mode" @@ -198,9 +208,15 @@ class TestDistBase(unittest.TestCase): ps1_pipe = open("/tmp/ps1_err.log", "wb") ps0_proc = subprocess.Popen( - ps0_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=ps0_pipe) + ps0_cmd.strip().split(" "), + stdout=subprocess.PIPE, + stderr=ps0_pipe, + env=required_envs) ps1_proc = subprocess.Popen( - ps1_cmd.strip().split(" "), stdout=subprocess.PIPE, stderr=ps1_pipe) + ps1_cmd.strip().split(" "), + stdout=subprocess.PIPE, + stderr=ps1_pipe, + env=required_envs) if not check_error_log: return ps0_proc, ps1_proc, None, None @@ -222,59 +238,60 @@ class TestDistBase(unittest.TestCase): (e, retry_times)) retry_times -= 1 - def check_with_place(self, model_file, delta=1e-3, check_error_log=False): - # TODO(typhoonzero): should auto adapt GPU count on the machine. 
-        required_envs = {
-            "PATH": os.getenv("PATH", ""),
-            "PYTHONPATH": os.getenv("PYTHONPATH", ""),
-            "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
-            "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
-            "FLAGS_cudnn_deterministic": "1",
-            "CPU_NUM": "1"
-        }
+    def _run_local(self, model, envs, check_error_log):
-        if check_error_log:
-            required_envs["GLOG_v"] = "7"
-            required_envs["GLOG_logtostderr"] = "1"
+        cmd = "%s %s --role trainer" % (self._python_interp, model)
+
+        if self._use_cuda:
+            cmd += " --use_cuda"
+            env_local = {"CUDA_VISIBLE_DEVICES": "0"}
+        else:
+            env_local = {'CPU_NUM': '1'}
+
+        envs.update(env_local)
-        # Run local to get a base line
-        env_local = {"CUDA_VISIBLE_DEVICES": "0"}
-        env_local.update(required_envs)
-        local_cmd = "%s %s --role trainer" % (self._python_interp, model_file)
         if not check_error_log:
             local_proc = subprocess.Popen(
-                local_cmd.split(" "),
+                cmd.split(" "),
                 stdout=subprocess.PIPE,
                 stderr=subprocess.PIPE,
-                env=env_local)
+                env=envs)
         else:
             err_log = open("/tmp/trainer.err.log", "wb")
             local_proc = subprocess.Popen(
-                local_cmd.split(" "),
+                cmd.split(" "),
                 stdout=subprocess.PIPE,
                 stderr=err_log,
-                env=env_local)
+                env=envs)
         local_proc.wait()
-        out, err = local_proc.communicate()
-        local_ret = cpt.to_text(out)
-        sys.stderr.write('local_loss: %s\n' % local_ret)
-        sys.stderr.write('local_stderr: %s\n' % err)
+        local_out, local_err = local_proc.communicate()
+        local_ret = cpt.to_text(local_out)
+
+        if check_error_log:
+            err_log.close()
+
+        sys.stderr.write('local_stdout: %s\n' % local_ret)
+        sys.stderr.write('local_stderr: %s\n' % local_err)
+        local_losses = local_ret.split("\n")
+        return local_losses
+
+    def _run_cluster(self, model, envs, check_error_log):
         # Run dist train to compare with local results
-        ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model_file,
-                                                          check_error_log)
+        ps0, ps1, ps0_pipe, ps1_pipe = self.start_pserver(model,
+                                                          check_error_log, envs)
         self._wait_ps_ready(ps0.pid)
         self._wait_ps_ready(ps1.pid)
-
         ps0_ep, ps1_ep = self._ps_endpoints.split(",")
+
         tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist"
         tr0_cmd = tr_cmd % \
-                  (self._python_interp, model_file, self._ps_endpoints,
-                   0, ps0_ep, self._trainers)
+                  (self._python_interp, model, self._ps_endpoints,
+                   0, ps0_ep, self._trainers)
         tr1_cmd = tr_cmd % \
-                  (self._python_interp, model_file, self._ps_endpoints,
-                   1, ps1_ep, self._trainers)
+                  (self._python_interp, model, self._ps_endpoints,
+                   1, ps1_ep, self._trainers)

         if self._sync_mode:
             tr0_cmd += " --sync_mode"
@@ -285,18 +302,28 @@ class TestDistBase(unittest.TestCase):
         if self._use_reduce:
             tr0_cmd += " --use_reduce"
             tr1_cmd += " --use_reduce"
+        if self._use_reader_alloc:
+            tr0_cmd += " --use_reader_alloc"
+            tr1_cmd += " --use_reader_alloc"
+        if self._use_cuda:
+            tr0_cmd += " --use_cuda"
+            tr1_cmd += " --use_cuda"
+            env0 = {"CUDA_VISIBLE_DEVICES": "0"}
+            env1 = {"CUDA_VISIBLE_DEVICES": "1"}
+        else:
+            env0 = {'CPU_NUM': '1'}
+            env1 = {'CPU_NUM': '1'}
+
+        env0.update(envs)
+        env1.update(envs)

-        env0 = {"CUDA_VISIBLE_DEVICES": "0"}
-        env1 = {"CUDA_VISIBLE_DEVICES": "1"}
-        env0.update(required_envs)
-        env1.update(required_envs)
         FNULL = open(os.devnull, 'w')

         tr0_pipe = subprocess.PIPE
         tr1_pipe = subprocess.PIPE
         if check_error_log:
-            print("tr0_cmd:", tr0_cmd)
-            print("tr1_cmd:", tr1_cmd)
+            print("tr0_cmd:{}, env0: {}".format(tr0_cmd, env0))
+            print("tr1_cmd:{}, env1: {}".format(tr1_cmd,
env1)) tr0_pipe = open("/tmp/tr0_err.log", "wb") tr1_pipe = open("/tmp/tr1_err.log", "wb") @@ -313,17 +340,11 @@ class TestDistBase(unittest.TestCase): tr0_proc.wait() tr1_proc.wait() - out, err = tr0_proc.communicate() - sys.stderr.write('dist_stderr: %s\n' % err) - loss_data0 = cpt.to_text(out) - sys.stderr.write('dist_loss: %s\n' % loss_data0) - lines = loss_data0.split("\n") - dist_first_loss = eval(lines[0].replace(" ", ","))[0] - dist_last_loss = eval(lines[1].replace(" ", ","))[0] - - local_lines = local_ret.split("\n") - local_first_loss = eval(local_lines[0])[0] - local_last_loss = eval(local_lines[1])[0] + + tr0_out, tr0_err = tr0_proc.communicate() + tr0_loss_text = cpt.to_text(tr0_out) + tr1_out, tr1_err = tr1_proc.communicate() + tr1_loss_text = cpt.to_text(tr1_out) # close trainer file if check_error_log: @@ -341,5 +362,47 @@ class TestDistBase(unittest.TestCase): ps1.wait() FNULL.close() - self.assertAlmostEqual(local_first_loss, dist_first_loss, delta=delta) - self.assertAlmostEqual(local_last_loss, dist_last_loss, delta=delta) + # print log + sys.stderr.write('trainer 0 stdout:\n %s\n' % tr0_loss_text) + sys.stderr.write('trainer 0 stderr:\n %s\n' % tr0_err) + sys.stderr.write('trainer 1 stdout: %s\n' % tr1_loss_text) + sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err) + + tr0_losses = tr0_loss_text.split("\n") + tr1_losses = tr1_loss_text.split("\n") + + return tr0_losses, tr1_losses + + def check_with_place(self, + model_file, + delta=1e-3, + check_error_log=False, + need_envs={}): + # TODO(typhoonzero): should auto adapt GPU count on the machine. + required_envs = { + "PATH": os.getenv("PATH", ""), + "PYTHONPATH": os.getenv("PYTHONPATH", ""), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), + "FLAGS_fraction_of_gpu_memory_to_use": "0.15", + "FLAGS_cudnn_deterministic": "1", + } + + required_envs.update(need_envs) + + if check_error_log: + required_envs["GLOG_v"] = "7" + required_envs["GLOG_logtostderr"] = "1" + + local_losses\ + = self._run_local(model_file, required_envs, + check_error_log) + tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs, + check_error_log) + + for step_id in range(RUN_STEP): + local_loss = eval(local_losses[step_id])[0] + tr0_loss = eval(tr0_losses[step_id])[0] + tr1_loss = eval(tr1_losses[step_id])[0] + dist_loss = (tr0_loss + tr1_loss) / 2 + print(str(local_loss) + ":" + str(dist_loss)) + self.assertAlmostEqual(local_loss, dist_loss, delta=delta) diff --git a/python/paddle/fluid/tests/unittests/test_dist_ctr.py b/python/paddle/fluid/tests/unittests/test_dist_ctr.py new file mode 100644 index 0000000000000000000000000000000000000000..081d6e9273ebaf7af643b8481399d11d1ab60e00 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_ctr.py @@ -0,0 +1,31 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
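# ---------------------------------------------------------------------------
# [Editor's note] Illustrative sketch, not part of the patch: the loop at the
# end of check_with_place() above accepts the distributed run when, at every
# step, the mean of the two trainers' losses tracks the local loss within
# `delta`.  With made-up loss values (the real code parses them from the
# subprocesses' stdout via eval()), the check reduces to:
RUN_STEP = 3
delta = 1e-3
local_losses = [0.6931, 0.6820, 0.6714]  # local baseline, one loss per step
tr0_losses = [0.6930, 0.6818, 0.6712]  # distributed trainer 0
tr1_losses = [0.6933, 0.6821, 0.6715]  # distributed trainer 1

for step_id in range(RUN_STEP):
    dist_loss = (tr0_losses[step_id] + tr1_losses[step_id]) / 2
    assert abs(local_losses[step_id] - dist_loss) <= delta, \
        "step %d: local %f vs dist %f" % (step_id, local_losses[step_id],
                                          dist_loss)
# ---------------------------------------------------------------------------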
+from __future__ import print_function + +import os +import unittest +from test_dist_base import TestDistBase + + +class TestDistCTR2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_cuda = False + + def test_dist_ctr(self): + self.check_with_place("dist_ctr.py", delta=1e-7, check_error_log=False) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py index 09b1c546e49bd02bf336f31885bf4c7339cc5a2c..f65dd7e2a28c4ace3988c0cc1267ebe981fbd9cb 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py @@ -23,7 +23,7 @@ class TestDistMnist2x2(TestDistBase): self._use_reduce = False def test_dist_train(self): - self.check_with_place("dist_mnist.py", delta=1e-7) + self.check_with_place("dist_mnist.py", delta=1e-5) class TestDistMnist2x2WithMemopt(TestDistBase): @@ -32,7 +32,7 @@ class TestDistMnist2x2WithMemopt(TestDistBase): self._mem_opt = True def test_dist_train(self): - self.check_with_place("dist_mnist.py", delta=1e-7) + self.check_with_place("dist_mnist.py", delta=1e-5) class TestDistMnistAsync(TestDistBase): diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py index c2b089694ea2f329e67ad6c50def26caa454720e..c0989ca709e100d8f147a08970b0e858c81ce09b 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py @@ -20,24 +20,25 @@ from test_dist_base import TestDistBase class TestDistSeResneXt2x2(TestDistBase): def _setup_config(self): self._sync_mode = True + self._use_reader_alloc = False def test_dist_train(self): - self.check_with_place("dist_se_resnext.py", delta=1e-7) + self.check_with_place("dist_se_resnext.py", delta=100) -# TODO(typhoonzero): fix this test -# class TestDistseResnXt2x2WithMemopt(TestDistBase): -# def _setup_config(self): -# self._sync_mode = True -# self._mem_opt = True +class TestDistseResnXt2x2WithMemopt(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._mem_opt = True -# def test_dist_train(self): -# self.check_with_place("dist_se_resnext.py", delta=1e-7) + def test_dist_train(self): + self.check_with_place("dist_se_resnext.py", delta=100) class TestDistSeResneXt2x2Async(TestDistBase): def _setup_config(self): self._sync_mode = False + self._use_reader_alloc = False def test_dist_train(self): self.check_with_place("dist_se_resnext.py", delta=100) diff --git a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py new file mode 100644 index 0000000000000000000000000000000000000000..6bc707c245ab13dd2dbe50b953ef5308aba05b78 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py @@ -0,0 +1,79 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import print_function + +import os +import unittest + +from test_dist_base import TestDistBase + + +class TestDistSimnetBowDense2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_cuda = False + + def test_simnet_bow(self): + need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'} + self.check_with_place( + "dist_simnet_bow.py", + delta=1e-5, + check_error_log=False, + need_envs=need_envs) + + +class TestDistSimnetBow2x2DenseAsync(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._use_cuda = False + + def test_simnet_bow(self): + need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '0'} + self.check_with_place( + "dist_simnet_bow.py", + delta=100, + check_error_log=False, + need_envs=need_envs) + + +class TestDistSimnetBowSparse2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_cuda = False + + def test_simnet_bow(self): + need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'} + self.check_with_place( + "dist_simnet_bow.py", + delta=1e-5, + check_error_log=False, + need_envs=need_envs) + + +class TestDistSimnetBow2x2SparseAsync(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._use_cuda = False + + def test_simnet_bow(self): + need_envs = {"IS_DISTRIBUTED": '0', "IS_SPARSE": '1'} + self.check_with_place( + "dist_simnet_bow.py", + delta=100, + check_error_log=False, + need_envs=need_envs) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_text_classification.py b/python/paddle/fluid/tests/unittests/test_dist_text_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..b830c965caf2e47c5cc648bc98960459fa6b30ee --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_text_classification.py @@ -0,0 +1,40 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
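# ---------------------------------------------------------------------------
# [Editor's note] Illustrative sketch, not part of the patch: the simnet test
# cases above select model variants purely through `need_envs`, which
# check_with_place() merges into the trainer subprocess environment; the
# dist_simnet_bow.py side then decodes the strings back into booleans.  In
# isolation the round trip looks like this (standard library only):
import os

os.environ["IS_DISTRIBUTED"] = '0'  # what need_envs injects
os.environ["IS_SPARSE"] = '1'

is_distributed = bool(int(os.environ["IS_DISTRIBUTED"]))  # False
is_sparse = bool(int(os.environ["IS_SPARSE"]))  # True
print(is_distributed, is_sparse)
# ---------------------------------------------------------------------------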
+ +from __future__ import print_function +import os +import unittest +from test_dist_base import TestDistBase + + +class TestDistTextClassification2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_cuda = False + + def test_text_classification(self): + self.check_with_place("dist_text_classification.py", delta=1e-6) + + +class TestDistTextClassification2x2Async(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._use_cuda = False + + def test_se_resnext(self): + self.check_with_place("dist_text_classification.py", delta=100) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index ecde407e6d85ea1bfc0181b4b60e095ea496fb1a..54a1c68a37f6929890aab697b48d621e6effb7d8 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -264,6 +264,25 @@ class TestLRDecay(TranspilerTest): ]) +class TestDecayedAdagrad(TranspilerTest): + def net_conf(self): + x = fluid.layers.data(name='x', shape=[1000], dtype='float32') + y_predict = fluid.layers.fc(input=x, + size=1000, + act=None, + param_attr=fluid.ParamAttr(name='fc_w'), + bias_attr=fluid.ParamAttr(name='fc_b')) + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + opt = fluid.optimizer.DecayedAdagrad(learning_rate=0.1) + opt.minimize(avg_cost) + + def transpiler_test_impl(self): + pserver, startup = self.get_pserver(self.pserver1_ep) + trainer, _ = self.get_trainer() + + class TestLRDecayConditional(TranspilerTest): def net_conf(self): x = fluid.layers.data(name='x', shape=[1000], dtype='float32') diff --git a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py index 33b39b262b95b0013e3696c3f15a288a2e801ce1..b26cbdbea12962a3a41036c774de5dfb61999205 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py +++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py @@ -39,7 +39,7 @@ class TestDistW2V2x2Async(TestDistBase): self._sync_mode = False def test_dist_train(self): - self.check_with_place("dist_word2vec.py", delta=1) + self.check_with_place("dist_word2vec.py", delta=100) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py index 86e27fe29ed945ec77fbbcdbd1c7cc6ecfba0fd5..9340d558577b4b3141df9317900ee33bbb683a0e 100644 --- a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py +++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py @@ -277,7 +277,6 @@ class TestGenerateProposalsOp(OpTest): 'eta': self.eta } - print("lod = ", self.lod) self.outputs = { 'RpnRois': (self.rpn_rois[0], [self.lod]), 'RpnRoiProbs': (self.rpn_roi_probs[0], [self.lod]) @@ -295,7 +294,7 @@ class TestGenerateProposalsOp(OpTest): self.post_nms_topN = 5000 # train 6000, test 1000 self.nms_thresh = 0.7 self.min_size = 3.0 - self.eta = 0.8 + self.eta = 1. 
def init_test_input(self): batch_size = 1 diff --git a/python/paddle/fluid/tests/unittests/test_infer_shape.py b/python/paddle/fluid/tests/unittests/test_infer_shape.py index a3d700aad8236fea7bb0e6d043323ad3bd0851f2..fdff22cacc28731a91ff4fd17407bd9edbdd9d8b 100644 --- a/python/paddle/fluid/tests/unittests/test_infer_shape.py +++ b/python/paddle/fluid/tests/unittests/test_infer_shape.py @@ -76,8 +76,8 @@ class TestInferShape(unittest.TestCase): mul_op_desc.set_input("X", ["x"]) mul_op_desc.set_input("Y", ["y"]) mul_op_desc.set_output("Out", ["out"]) - mul_op_desc.set_attr("x_num_col_dims", 1) - mul_op_desc.set_attr("y_num_col_dims", 1) + mul_op_desc._set_attr("x_num_col_dims", 1) + mul_op_desc._set_attr("y_num_col_dims", 1) mul_op_desc.check_attrs() mul_op_desc.infer_shape(block) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index f474cdae2054531d44724e0e3e0e58a35fb8ddcd..b8dc9e8ad7cd7cd100d5c3cb99319e6f5a37da91 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -541,7 +541,7 @@ class TestBook(unittest.TestCase): with program_guard(program): input = layers.data( name="input", shape=[3, 100, 100], dtype="float32") - out = layers.shape(input, name="shape") + out = layers.shape(input) self.assertIsNotNone(out) print(str(program)) @@ -758,6 +758,65 @@ class TestBook(unittest.TestCase): out = layers.expand(x, [1, 2]) print(str(program)) + def test_uniform_random_batch_size_like(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[13, 11], dtype='float32') + out = layers.uniform_random_batch_size_like(input, [-1, 11]) + self.assertIsNotNone(out) + print(str(program)) + + def test_gaussian_random(self): + program = Program() + with program_guard(program): + out = layers.gaussian_random(shape=[20, 30]) + self.assertIsNotNone(out) + print(str(program)) + + def test_sampling_id(self): + program = Program() + with program_guard(program): + x = layers.data( + name="X", + shape=[13, 11], + dtype='float32', + append_batch_size=False) + + out = layers.sampling_id(x) + self.assertIsNotNone(out) + print(str(program)) + + def test_gaussian_random_batch_size_like(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[13, 11], dtype='float32') + + out = layers.gaussian_random_batch_size_like( + input, shape=[-1, 11], mean=1.0, std=2.0) + self.assertIsNotNone(out) + print(str(program)) + + def test_sum(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[13, 11], dtype='float32') + + out = layers.sum(input) + self.assertIsNotNone(out) + print(str(program)) + + def test_slice(self): + starts = [1, 0, 2] + ends = [3, 3, 4] + axes = [0, 1, 2] + + program = Program() + with program_guard(program): + input = layers.data( + name="input", shape=[3, 4, 5, 6], dtype='float32') + + out = layers.slice(input, axes=axes, starts=starts, ends=ends) + def test_softshrink(self): program = Program() with program_guard(program): diff --git a/python/paddle/fluid/tests/unittests/test_pass_builder.py b/python/paddle/fluid/tests/unittests/test_pass_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..288c5f6a1f6b1760ca40c0c653e4c0726b799519 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_pass_builder.py @@ -0,0 +1,121 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import paddle.fluid as fluid +import paddle.fluid.core as core +import numpy as np +import unittest +import os +import sys +import math + + +def simple_fc_net(): + img = fluid.layers.data(name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + hidden = img + for _ in range(4): + hidden = fluid.layers.fc( + hidden, + size=200, + act='tanh', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + prediction = fluid.layers.fc(hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.mean(loss) + return loss + + +class TestPassBuilder(unittest.TestCase): + def check_network_convergence(self, use_cuda, build_strategy=None): + os.environ['CPU_NUM'] = str(4) + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + loss = simple_fc_net() + test_program = main.clone(for_test=True) + + opt = fluid.optimizer.SGD(learning_rate=0.001) + opt.minimize(loss) + + batch_size = 32 + image = np.random.normal(size=(batch_size, 784)).astype('float32') + label = np.random.randint(0, 10, (batch_size, 1), dtype="int64") + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup) + feed_dict = {'image': image, 'label': label} + + train_exe = fluid.ParallelExecutor( + use_cuda=use_cuda, + loss_name=loss.name, + main_program=main, + build_strategy=build_strategy) + + test_exe = fluid.ParallelExecutor( + use_cuda=use_cuda, + main_program=test_program, + share_vars_from=train_exe, + build_strategy=build_strategy) + + for i in range(5): + test_loss, = test_exe.run([loss.name], feed=feed_dict) + + train_loss, = train_exe.run([loss.name], feed=feed_dict) + + avg_test_loss_val = np.array(test_loss).mean() + if math.isnan(float(avg_test_loss_val)): + sys.exit("got NaN loss, testing failed.") + + avg_train_loss_val = np.array(train_loss).mean() + if math.isnan(float(avg_train_loss_val)): + sys.exit("got NaN loss, training failed.") + + self.assertTrue( + np.allclose( + train_loss, test_loss, atol=1e-8), + "Train loss: " + str(train_loss) + "\n Test loss:" + + str(test_loss)) + + def test_parallel_testing_with_new_strategy(self): + build_strategy = fluid.BuildStrategy() + pass_builder = build_strategy._create_passes_from_strategy() + origin_len = len(pass_builder.all_passes()) + + viz_pass = pass_builder.append_pass("graph_viz_pass") + self.assertEqual(origin_len + 1, len(pass_builder.all_passes())) + + pass_builder.insert_pass( + len(pass_builder.all_passes()), "graph_viz_pass") + self.assertEqual(origin_len + 2, len(pass_builder.all_passes())) + + pass_builder.remove_pass(len(pass_builder.all_passes()) - 1) + self.assertEqual(origin_len + 1, len(pass_builder.all_passes())) + viz_pass.set_str("graph_viz_path", "/tmp/test_viz_pass") + + self.check_network_convergence( + 
use_cuda=core.is_compiled_with_cuda(), + build_strategy=build_strategy) + try: + os.stat("/tmp/test_viz_pass") + except os.error: + self.assertFalse(True) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py index d24b5cbd06ddf9f332c1369ebd513bef27b77e14..7fb2171f611adea434d6f2710465810fb69d6979 100644 --- a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py +++ b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py @@ -38,40 +38,40 @@ class TestOpDesc(unittest.TestCase): self.assertEqual(['z'], op.output("Out")) self.assertEqual(["Out"], op.output_names()) - op.set_attr("int_attr", 1) + op._set_attr("int_attr", 1) self.assertEqual(1, op.attr("int_attr")) self.assertTrue(op.has_attr("int_attr")) self.assertEqual(core.AttrType.INT, op.attr_type("int_attr")) - op.set_attr("float_attr", -1.32) + op._set_attr("float_attr", -1.32) self.assertAlmostEqual(-1.32, op.attr("float_attr"), delta=1e-4) self.assertTrue(op.has_attr("float_attr")) - op.set_attr("bool_attr", False) + op._set_attr("bool_attr", False) self.assertFalse(op.attr("bool_attr")) - op.set_attr("string_attr", "abc") + op._set_attr("string_attr", "abc") self.assertEqual("abc", op.attr("string_attr")) self.assertTrue(op.has_attr("string_attr")) - op.set_attr("ints_attr", [1, 2, 3]) + op._set_attr("ints_attr", [1, 2, 3]) self.assertEqual([1, 2, 3], op.attr("ints_attr")) expected = [1.2, 2.3, 3.4] - op.set_attr("floats_attr", expected) + op._set_attr("floats_attr", expected) for e, a in zip(expected, op.attr("floats_attr")): self.assertAlmostEqual(e, a, delta=1e-4) - op.set_attr("strings_attr", ["a", "b", "c"]) + op._set_attr("strings_attr", ["a", "b", "c"]) self.assertEqual(["a", "b", "c"], op.attr("strings_attr")) - op.set_attr("bools_attr", [True, False, True]) + op._set_attr("bools_attr", [True, False, True]) self.assertEqual([True, False, True], op.attr("bools_attr")) self.assertEqual(8, len(op.attr_names())) - op.set_block_attr("block_attr", program_desc.block(0)) - self.assertEqual(0, op.block_attr_id("block_attr")) + op.set_block_attr("_block_attr", program_desc.block(0)) + self.assertEqual(0, op._block_attr_id("_block_attr")) mul_op = block.append_op() mul_op.set_type("mul") diff --git a/python/paddle/fluid/tests/unittests/transformer_model.py b/python/paddle/fluid/tests/unittests/transformer_model.py index ab7a18d4c5c4ce1e490e2951ff9fbb023324e753..143d187edc3a154418f9e639b7d492c8ce994d42 100644 --- a/python/paddle/fluid/tests/unittests/transformer_model.py +++ b/python/paddle/fluid/tests/unittests/transformer_model.py @@ -246,6 +246,7 @@ def prepare_encoder(src_word, padding_idx=pos_pad_idx, param_attr=fluid.ParamAttr( name=pos_enc_param_name, trainable=False)) + src_pos_enc.stop_gradient = True enc_input = src_word_emb + src_pos_enc # FIXME(guosheng): Decouple the program desc with batch_size. 
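# ---------------------------------------------------------------------------
# [Editor's note] Illustrative sketch, not part of the patch: the
# transformer_model.py hunk above sets stop_gradient on the positional
# encoding lookup so that backward never reaches a table that is already
# declared trainable=False.  The same idiom in isolation, using the layers
# API seen elsewhere in this patch (shapes and names are made up):
import paddle.fluid as fluid

word = fluid.layers.data(name='word', shape=[1], dtype='int64')
pos = fluid.layers.data(name='pos', shape=[1], dtype='int64')
word_emb = fluid.layers.embedding(input=word, size=[100, 8])
pos_enc = fluid.layers.embedding(
    input=pos,
    size=[100, 8],
    param_attr=fluid.ParamAttr(name='pos_enc_table', trainable=False))
pos_enc.stop_gradient = True  # no gradient ops are generated for this branch
enc_input = word_emb + pos_enc  # only word_emb receives gradients
# ---------------------------------------------------------------------------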
diff --git a/python/paddle/fluid/transpiler/__init__.py b/python/paddle/fluid/transpiler/__init__.py index 28c7ae5341b20f0f79da8cf682d279fc4cc3fa19..c9a8176a72fb744963ae466e965a25bdfb0a44de 100644 --- a/python/paddle/fluid/transpiler/__init__.py +++ b/python/paddle/fluid/transpiler/__init__.py @@ -20,6 +20,10 @@ from .memory_optimization_transpiler import memory_optimize, release_memory from .ps_dispatcher import HashName, RoundRobin __all__ = [ - "DistributeTranspiler", "memory_optimize", "release_memory", "HashName", - "RoundRobin", "DistributeTranspilerConfig" + "DistributeTranspiler", + "memory_optimize", + "release_memory", + "HashName", + "RoundRobin", + "DistributeTranspilerConfig", ] diff --git a/python/paddle/fluid/transpiler/details/program_utils.py b/python/paddle/fluid/transpiler/details/program_utils.py index 59899e7e9ab98f661699d5ac0645c92bd23a1512..391d6aa12bdd70b9ef988898bee8e86cd0a0d765 100644 --- a/python/paddle/fluid/transpiler/details/program_utils.py +++ b/python/paddle/fluid/transpiler/details/program_utils.py @@ -128,7 +128,7 @@ def op_to_code(op): attr_type = op.desc.attr_type(name) if attr_type == core.AttrType.BLOCK: a = "{name} = block[{value}]".format( - name=name, type=attr_type, value=op.block_attr_id(name)) + name=name, type=attr_type, value=op._block_attr_id(name)) attrs_str += a if i != len(attr_names) - 1: attrs_str += ", " @@ -136,7 +136,7 @@ def op_to_code(op): if attr_type == core.AttrType.BLOCKS: a = "{name} = blocks{value}".format( - name=name, type=attr_type, value=op.blocks_attr_ids(name)) + name=name, type=attr_type, value=op._blocks_attr_ids(name)) attrs_str += a if i != len(attr_names) - 1: attrs_str += ", " diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 43071def7a906e585909e50e4c0c52c56d981cde..ecdbe27f4d90268d755a712e25289cfaf4715f29 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -39,8 +39,8 @@ import six from .ps_dispatcher import RoundRobin, HashName, PSDispatcher from .. import core, framework from ..framework import Program, default_main_program, \ - default_startup_program, Block, \ - Parameter, grad_var_name + default_startup_program, Block, \ + Parameter, grad_var_name from .details import * from functools import reduce @@ -178,7 +178,7 @@ class DistributeTranspiler(object): pserver_program) elif role == "TRAINER": trainer_program = t.get_trainer_program() - + # for nccl2 mode config = fluid.DistributeTranspilerConfig() config.mode = "nccl2" @@ -470,7 +470,10 @@ class DistributeTranspiler(object): """ # remove optimize ops and add a send op to main_program # FIXME(typhoonzero): Also ops like clip_gradient, lrn_decay? + lr_ops = self._get_lr_ops() delete_ops(self.origin_program.global_block(), self.optimize_ops) + delete_ops(self.origin_program.global_block(), lr_ops) + self.origin_program.__str__() if wait_port: @@ -534,7 +537,7 @@ class DistributeTranspiler(object): }) for varname, splited_var in six.iteritems(self.param_var_mapping): - #add concat ops to merge splited parameters received from parameter servers. + # add concat ops to merge splited parameters received from parameter servers. if len(splited_var) <= 1: continue # NOTE: if enable memory optimization, origin vars maybe removed. 
@@ -668,7 +671,7 @@ in a single call.") __clone_lr_op_sub_block__(cloned_op, program, new_sub_block) # reset the block of op - op.set_attr('sub_block', new_sub_block) + op._set_attr('sub_block', new_sub_block) # append lr decay ops to the child block if exists lr_ops = self._get_lr_ops() @@ -734,19 +737,14 @@ in a single call.") table_opt_block = self._create_table_optimize_block( pserver_index, pserver_program, pre_block_idx, grad_to_block_id) optimize_blocks.append(table_opt_block) - prefetch_var_name_to_block_id = self._create_prefetch_block( + lookup_table_var_name_to_block_id = self._create_prefetch_block( pserver_index, pserver_program, table_opt_block) checkpoint_block_id = self._create_checkpoint_save_block( pserver_program, table_opt_block.idx) pserver_program._distributed_lookup_table = self.table_name - - # NOTE: if has_distributed_lookup_table is False, then prefetch_block will - # not be executed, so it's safe to use optimize_block to hold the place - if self.has_distributed_lookup_table: - assert len(prefetch_var_name_to_block_id) > 0 - else: - assert len(prefetch_var_name_to_block_id) == 0 + prefetch_var_name_to_block_id.extend( + lookup_table_var_name_to_block_id) attrs = { "optimize_blocks": optimize_blocks, @@ -755,11 +753,14 @@ in a single call.") "sync_mode": self.sync_mode, "grad_to_block_id": grad_to_block_id, } - if len(prefetch_var_name_to_block_id) > 0: - attrs['prefetch_var_name_to_block_id'] \ - = prefetch_var_name_to_block_id + + if self.has_distributed_lookup_table: attrs['checkpint_block_id'] = checkpoint_block_id + if len(prefetch_var_name_to_block_id) > 0: + attrs[ + 'prefetch_var_name_to_block_id'] = prefetch_var_name_to_block_id + # step5 append the listen_and_serv op pserver_program.global_block().append_op( type="listen_and_serv", @@ -864,7 +865,7 @@ to transpile() call.") if op.type in [ "gaussian_random", "fill_constant", "uniform_random" ]: - op.set_attr("shape", list(new_outputs["Out"].shape)) + op._set_attr("shape", list(new_outputs["Out"].shape)) s_prog.global_block().append_op( type=op.type, inputs=new_inputs, @@ -1013,7 +1014,7 @@ to transpile() call.") for g, p in zip(grad_blocks, param_blocks): g_name, g_bid, _ = g.split(":") p_name, p_bid, _ = p.split(":") - self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] = \ + self.grad_param_mapping[self.grad_var_mapping[g_name][int(g_bid)]] = \ self.param_var_mapping[p_name][int(p_bid)] # create mapping of endpoint -> split var to create pserver side program @@ -1320,7 +1321,7 @@ to transpile() call.") if len(splited) == 1: if self.sync_mode and add_trainer_suffix: new_var_name = "%s.trainer_%d" % \ - (orig_var.name, self.trainer_id) + (orig_var.name, self.trainer_id) program.global_block()._rename_var(varname, new_var_name) var_mapping[varname] = \ [program.global_block().var(new_var_name)] @@ -1343,10 +1344,10 @@ to transpile() call.") new_var_name = "" if self.sync_mode and add_trainer_suffix: new_var_name = "%s.block%d.trainer_%d" % \ - (varname, i, self.trainer_id) + (varname, i, self.trainer_id) else: new_var_name = "%s.block%d" % \ - (varname, i) + (varname, i) var = program.global_block().create_var( name=new_var_name, persistable=False, @@ -1430,6 +1431,9 @@ to transpile() call.") elif op_type == "rmsprop": if varkey in ["Moment", "MeanSquare"]: return param_shape + elif op_type == "decayed_adagrad": + if varkey == "Moment": + return param_shape elif op_type == "sgd": pass return orig_shape @@ -1484,9 +1488,8 @@ to transpile() call.") vars2merge = [] for i in 
                 per_trainer_name = "%s.trainer_%d" % \
-                    (merged_var_name, i)
+                                   (merged_var_name, i)
                 vars2merge.append(pserver_block.vars[per_trainer_name])
-
             optimize_block.append_op(
                 type="sum",
                 inputs={"X": vars2merge},
@@ -1645,7 +1648,7 @@ to transpile() call.")
         # one op's output is another op's input, we say
         # the two operator is connected.
         if set(op1.desc.output_arg_names()) & set(op2.desc.input_arg_names()) or \
-           set(op1.desc.input_arg_names()) & set(op2.desc.output_arg_names()):
+                set(op1.desc.input_arg_names()) & set(op2.desc.output_arg_names()):
             return True
         return False
 
@@ -1662,7 +1665,7 @@ to transpile() call.")
 
     def _is_optimizer_op(self, op):
         if "Param" in op.input_names and \
-           "LearningRate" in op.input_names:
+                "LearningRate" in op.input_names:
             return True
         return False
 
@@ -1737,7 +1740,7 @@ to transpile() call.")
                 # NOTE: we need to skip all optimize ops, since it is connected
                 # with forward/backward ops and lr ops, we only need the lr ops.
                 if op1 != op2 and self._is_op_connected(op1, op2) and \
-                   not self._is_optimizer_op(op1) and not self._is_optimizer_op(op2):
+                        not self._is_optimizer_op(op1) and not self._is_optimizer_op(op2):
                     ufind.union(op1, op2)
         # find all ops which is related with lr var
         for op1 in block.ops:
diff --git a/python/paddle/fluid/transpiler/inference_transpiler.py b/python/paddle/fluid/transpiler/inference_transpiler.py
index 49ba2cfd55bc881ed753fcefbd41f5b8fd4ebaf7..43d51b03e81895d7322d9e28a9c40b6d7cc69206 100644
--- a/python/paddle/fluid/transpiler/inference_transpiler.py
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py
@@ -163,7 +163,7 @@ class InferenceTranspiler(object):
                 next_op = self.block.ops[i + 1]
                 if next_op.type == 'relu':
                     # modify bnorm OP to include relu
-                    current_op.set_attr("fuse_with_relu", True)
+                    current_op._set_attr("fuse_with_relu", True)
                     # remove relu OP
                     self.block._remove_op(i + 1)
             i = i + 1
@@ -377,7 +377,7 @@ class InferenceTranspiler(object):
                     type=old_var.type,
                     dtype=old_var.dtype,
                     shape=old_var.shape)
-                op.rename_input(old_param_name, new_param_name)
+                op._rename_input(old_param_name, new_param_name)
                 self.scope.var(new_param_name)
 
                 tensor = self.scope.find_var(new_param_name).get_tensor()
@@ -463,8 +463,8 @@ class InferenceTranspiler(object):
             current_op = self.block.ops[i]
             for input_arg in current_op.input_arg_names:
                 if input_arg in self.input_map:
-                    current_op.rename_input(input_arg,
-                                            self.input_map[input_arg])
+                    current_op._rename_input(input_arg,
+                                             self.input_map[input_arg])
 
     def _remove_unused_var(self):
         '''
diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
index d5aa54d752305b188d292f95f05cd70d27702c35..861bb5fae5d7a8561ded1f547fbb86ae1e1a073e 100755
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
@@ -14,10 +14,10 @@
 
 from __future__ import print_function
 
-from collections import defaultdict, OrderedDict, Callable
+from collections import defaultdict, MutableSet
 from .. import core
 from ... import compat as cpt
-from ..framework import Program, default_main_program, Parameter, Variable
+from ..framework import Program, default_main_program, Parameter, Variable, core
 from ..backward import _rename_arg_
 from functools import reduce
 from six.moves import range
@@ -44,17 +44,82 @@ SUB_BLOCK_PAIR = [("while", "while_grad"), ("parallel_do", "parallel_do_grad"),
 
 PRINT_LOG = False
 
+
+class OrderedSet(MutableSet):
+    def __init__(self, iterable=None):
+        self.end = end = []
+        end += [None, end, end]  # sentinel node for doubly linked list
+        self.map = {}  # key --> [key, prev, next]
+        if iterable is not None:
+            self |= iterable
+
+    def __len__(self):
+        return len(self.map)
+
+    def __contains__(self, key):
+        return key in self.map
+
+    def add(self, key):
+        if key not in self.map:
+            end = self.end
+            curr = end[1]
+            curr[2] = end[1] = self.map[key] = [key, curr, end]
+
+    def update(self, other):
+        for e in other:
+            self.add(e)
+
+    def discard(self, key):
+        if key in self.map:
+            key, prev, next = self.map.pop(key)
+            prev[2] = next
+            next[1] = prev
+
+    def remove(self, key):
+        self.discard(key)
+
+    def __iter__(self):
+        end = self.end
+        curr = end[2]
+        while curr is not end:
+            yield curr[0]
+            curr = curr[2]
+
+    def __reversed__(self):
+        end = self.end
+        curr = end[1]
+        while curr is not end:
+            yield curr[0]
+            curr = curr[1]
+
+    def pop(self, last=True):
+        if not self:
+            raise KeyError('set is empty')
+        key = self.end[1][0] if last else self.end[2][0]
+        self.discard(key)
+        return key
+
+    def __repr__(self):
+        if not self:
+            return '%s()' % (self.__class__.__name__, )
+        return '%s(%r)' % (self.__class__.__name__, list(self))
+
+    def __eq__(self, other):
+        if isinstance(other, OrderedSet):
+            return len(self) == len(other) and list(self) == list(other)
+        return set(self) == set(other)
+
 
 class ControlFlowGraph(object):
     def __init__(self, program, ops, forward_num, skip_opt):
         self._program = program
         self._ops = ops
         self._forward_num = forward_num
-        self._successors = defaultdict(set)
-        self._presuccessors = defaultdict(set)
-        self._uses = defaultdict(set)
-        self._defs = defaultdict(set)
-        self._live_in = defaultdict(set)
-        self._live_out = defaultdict(set)
+        self._successors = defaultdict(OrderedSet)
+        self._presuccessors = defaultdict(OrderedSet)
+        self._uses = defaultdict(OrderedSet)
+        self._defs = defaultdict(OrderedSet)
+        self._live_in = defaultdict(OrderedSet)
+        self._live_out = defaultdict(OrderedSet)
         self._skip_opt = skip_opt
         self.pool = []
@@ -116,7 +181,7 @@ class ControlFlowGraph(object):
             # NOTE: must sort the in_diff set for cases that get different cache var.
             # FIXME(typhoonzero): maybe use a "sorted set" is better than this.
             can_optimize = [
-                x for x in sorted(list(in_diff))
+                x for x in in_diff
                 if self._check_var_validity(block_desc, x, is_forward)
             ]
             if can_optimize:
@@ -224,7 +289,7 @@
             if self.pool:
                 # NOTE: must sort the in_diff set for cases that get different cache var.
                 defs_can_optimize = [
-                    x for x in sorted(list(self._defs[i]))
+                    x for x in self._defs[i]
                     if self._check_var_validity(block_desc, x, is_forward)
                 ]
                 out_pair = [
@@ -381,7 +446,19 @@ def _get_cfgs(input_program):
     return cfgs
 
 
-def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0):
+def _is_opt_role_op(op):
+    op_maker = core.op_proto_and_checker_maker
+    optimize_role = core.op_proto_and_checker_maker.OpRole.Optimize
+    if op_maker.kOpRoleAttrName() in op.attr_names and \
+            int(op.all_attrs()[op_maker.kOpRoleAttrName()]) == int(optimize_role):
+        return True
+
+
+def memory_optimize(input_program,
+                    skip_opt_set=None,
+                    print_log=False,
+                    level=0,
+                    skip_grads=False):
     """Optimize memory by reusing var memory.
 
       Note: it doesn't not support subblock nested in subblock.
@@ -398,6 +475,19 @@ def memory_optimize(input_program, skip_opt_set=None, print_log=False, level=0):
         raise ValueError("only support opt_level 0 or 1.")
     global PRINT_LOG
     PRINT_LOG = print_log
+    if skip_grads:
+        grad_set = set()
+        OP_ROLE_VAR = core.op_proto_and_checker_maker.kOpRoleVarAttrName()
+        for op in input_program.global_block().ops:
+            if _is_opt_role_op(op):
+                if op.attr(OP_ROLE_VAR):
+                    grad_name = op.attr(OP_ROLE_VAR)[1]
+                    grad_set.add(grad_name)
+        if not skip_opt_set:
+            skip_opt_set = grad_set
+        else:
+            skip_opt_set.update(grad_set)
+
     cfgs = _get_cfgs(input_program)
     for cfg in cfgs:
         cfg.memory_optimize(skip_opt_set=skip_opt_set, level=level)
diff --git a/python/setup.py.in b/python/setup.py.in
index 786c9f2e39880b68700b8acb94b3d35a48323958..b376be0ea373f089ef17f27435d979712fbdff72 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -106,6 +106,7 @@ packages=['paddle',
           'paddle.fluid.layers',
           'paddle.fluid.contrib',
           'paddle.fluid.contrib.decoder',
+          'paddle.fluid.contrib.quantize',
          'paddle.fluid.transpiler',
          'paddle.fluid.transpiler.details']
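Taken together, the memory_optimization_transpiler changes replace plain set with the insertion-ordered OrderedSet, so liveness analysis picks the same cache variable on every run instead of depending on set iteration order, and add a skip_grads switch that keeps the gradient variables of optimize ops out of the reuse pool. A minimal usage sketch of the signature added above; the program itself is hypothetical:

    import paddle.fluid as fluid

    # build a network and append optimize ops first,
    # e.g. optimizer.minimize(loss), then:
    fluid.memory_optimize(
        fluid.default_main_program(),
        print_log=False,
        level=0,
        skip_grads=True)  # gradient vars of optimize ops join skip_opt_set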