diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake index 2e335579f32df4f146c8d88e05e684a9a8105e20..e66459fa3a1508fe4a3687f07bbe18f2a5421296 100644 --- a/cmake/external/ngraph.cmake +++ b/cmake/external/ngraph.cmake @@ -32,6 +32,8 @@ IF(NOT ${WITH_NGRAPH}) return() ENDIF() +INCLUDE(GNUInstallDirs) + INCLUDE(ExternalProject) SET(NGRAPH_PROJECT "extern_ngraph") @@ -40,10 +42,14 @@ SET(NGRAPH_GIT_TAG "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0") SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph) SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph) SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include) +SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}) SET(NGRAPH_SHARED_LIB_NAME libngraph.so.${NGRAPH_VERSION}) SET(NGRAPH_CPU_LIB_NAME libcpu_backend.so) SET(NGRAPH_TBB_LIB_NAME libtbb.so.2) SET(NGRAPH_GIT_REPO "https://github.com/NervanaSystems/ngraph.git") +SET(NGRAPH_SHARED_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME}) +SET(NGRAPH_CPU_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME}) +SET(NGRAPH_TBB_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME}) ExternalProject_Add( ${NGRAPH_PROJECT} @@ -63,18 +69,6 @@ ExternalProject_Add( CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib ) -if(UNIX AND NOT APPLE) - include(GNUInstallDirs) - SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}) -else() - SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/lib) -endif() -MESSAGE(STATUS "nGraph lib will be installed at: ${NGRAPH_LIB_DIR}") - -SET(NGRAPH_SHARED_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME}) -SET(NGRAPH_CPU_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME}) -SET(NGRAPH_TBB_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME}) - # Workaround for nGraph expecting mklml to be in mkldnn install directory. ExternalProject_Add_Step( ${NGRAPH_PROJECT} diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 0b95a780721b0771d55c4dbb2ddce33418612018..c679d8507d8a9d3bce48b7f38491dadd9f2fb7f6 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -129,6 +129,15 @@ if (WITH_MKLDNN) ) endif () +if (WITH_NGRAPH) + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/ngraph") + copy(ngraph_lib + SRCS ${NGRAPH_INC_DIR} ${NGRAPH_LIB_DIR} + DSTS ${dst_dir} ${dst_dir} + DEPS ngraph + ) +endif () + if (NOT WIN32) if (NOT MOBILE_INFERENCE AND NOT RPI) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy") diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 89726bf9859e71ee04c2f9380554090845fd44e5..2ced43f9e6c60da642f7a6252f889d9c9ab9748f 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -166,6 +166,8 @@ function(op_library TARGET) # Append first implemented MKLDNN activation operator if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n") + elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);\n") else() file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n") endif() diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index c40f6033419a2425d9996eb9a4584fc9cd1a70e3..2722ea078ebdf9a88fe2286fb4050fca652ffb7f 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -32,6 +32,13 @@ paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.c paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy) -> None paddle.fluid.create_lod_tensor ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None) paddle.fluid.create_random_int_lodtensor ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeedDesc.__init__ ArgSpec(args=['self', 'proto_file'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeedDesc.desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeedDesc.set_batch_size ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeedDesc.set_dense_slots ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.DataFeedDesc.set_use_slots ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'debug'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.io.save_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) @@ -69,7 +76,7 @@ paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'] paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None)) paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) -paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False)) +paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)) paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) @@ -175,7 +182,7 @@ paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)) -paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name'], varargs=None, keywords=None, defaults=(-100, None)) paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)) @@ -187,6 +194,9 @@ paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=Non paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)) paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)) +paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) @@ -291,6 +301,7 @@ paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'i paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None)) paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'class_num', 'ignore_thresh', 'loss_weight_xy', 'loss_weight_wh', 'loss_weight_conf_target', 'loss_weight_conf_notarget', 'loss_weight_class', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None)) paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)) paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) @@ -411,3 +422,17 @@ paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable +paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None) +paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None) +paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None) +paddle.reader.chain ArgSpec(args=[], varargs='readers', keywords=None, defaults=None) +paddle.reader.shuffle ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None) +paddle.reader.firstn ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None) +paddle.reader.xmap_readers ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)) +paddle.reader.PipeReader.__init__ ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain')) +paddle.reader.PipeReader.get_line ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n')) +paddle.reader.multiprocess_reader ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000)) +paddle.reader.Fake.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.reader.creator.np_array ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None) +paddle.reader.creator.text_file ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None) +paddle.reader.creator.recordio ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,)) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 52946c7f11f90490b1af1347f20db236a8fe24af..e4c471d86b7bff1bfb3b697ab24219144b4667f5 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -34,6 +34,7 @@ add_subdirectory(ir) add_subdirectory(details) # ddim lib proto_library(framework_proto SRCS framework.proto) +proto_library(async_executor_proto SRCS data_feed.proto) cc_library(ddim SRCS ddim.cc DEPS eigen3 boost) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) @@ -117,8 +118,9 @@ cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto) cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context) cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context) +cc_library(op_kernel_type SRCS op_kernel_type.cc DEPS device_context place) cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog - shape_inference data_transform lod_tensor profiler transfer_scope_cache) + shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) @@ -126,8 +128,9 @@ cc_library(version SRCS version.cc) cc_test(version_test SRCS version_test.cc DEPS version) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) -cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto) + if(NOT WIN32) +cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph) cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog shape_inference data_transform lod_tensor profiler) endif(NOT WIN32) @@ -135,7 +138,7 @@ endif(NOT WIN32) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) -py_proto_compile(framework_py_proto SRCS framework.proto) +py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto) # Generate an empty __init__.py to make framework_py_proto as a valid python module. add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_dependencies(framework_py_proto framework_py_proto_init) @@ -157,18 +160,19 @@ endif(NOT WIN32) cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) +cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor) -cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass) +cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) if(WITH_DISTRIBUTE) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() if(NOT WIN32) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper) else(NOT WIN32) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) endif(NOT WIN32) cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) endif() @@ -176,8 +180,11 @@ endif() cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph build_strategy - fast_threaded_ssa_graph_executor) + fast_threaded_ssa_graph_executor variable_helper) + +cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper) +cc_test(data_feed_test SRCS data_feed_test.cc DEPS async_executor) cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry @@ -185,7 +192,7 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) -cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto) +cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto op_kernel_type) cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc) cc_test(tuple_test SRCS tuple_test.cc ) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc new file mode 100644 index 0000000000000000000000000000000000000000..afb2dd2f064384da39904f6aceead4fa915a80f2 --- /dev/null +++ b/paddle/fluid/framework/async_executor.cc @@ -0,0 +1,138 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/async_executor.h" +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "google/protobuf/message.h" +#include "google/protobuf/text_format.h" + +#include "gflags/gflags.h" +#include "paddle/fluid/framework/data_feed_factory.h" +#include "paddle/fluid/framework/executor_thread_worker.h" +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/pybind/pybind.h" + +namespace paddle { +namespace framework { +AsyncExecutor::AsyncExecutor(Scope* scope, const platform::Place& place) + : root_scope_(scope), place_(place) {} + +void AsyncExecutor::CreateThreads( + ExecutorThreadWorker* worker, const ProgramDesc& main_program, + const std::shared_ptr& reader, + const std::vector& fetch_var_names, Scope* root_scope, + const int thread_index, const bool debug) { + worker->SetThreadId(thread_index); + worker->SetDebug(debug); + worker->SetRootScope(root_scope); + worker->CreateThreadResource(main_program, place_); + worker->SetDataFeed(reader); + worker->SetFetchVarNames(fetch_var_names); + worker->BindingDataFeedMemory(); +} + +void PrepareReaders(std::vector>& readers, // NOLINT + const int thread_num, const DataFeedDesc& data_feed_desc, + const std::vector& filelist) { + readers.resize(thread_num); + for (size_t i = 0; i < readers.size(); ++i) { + readers[i] = DataFeedFactory::CreateDataFeed(data_feed_desc.name()); + readers[i]->Init(data_feed_desc); // set batch_size and queue_size here + } + readers[0]->SetFileList(filelist); +} + +void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, + const std::string& data_feed_desc_str, + const std::vector& filelist, + const int thread_num, + const std::vector& fetch_var_names, + const bool debug) { + std::vector threads; + + auto& block = main_program.Block(0); + for (auto var_name : fetch_var_names) { + auto var_desc = block.FindVar(var_name); + auto shapes = var_desc->GetShape(); + PADDLE_ENFORCE(shapes[shapes.size() - 1] == 1, + "var %s: Fetched var has wrong shape, " + "only variables with the last dimension size 1 supported", + var_name); + } + + DataFeedDesc data_feed_desc; + google::protobuf::TextFormat::ParseFromString(data_feed_desc_str, + &data_feed_desc); + + int actual_thread_num = thread_num; + int file_cnt = filelist.size(); + PADDLE_ENFORCE(file_cnt > 0, "File list cannot be empty"); + + if (actual_thread_num > file_cnt) { + VLOG(1) << "Thread num = " << thread_num << ", file num = " << file_cnt + << ". Changing thread_num = " << file_cnt; + actual_thread_num = file_cnt; + } + + /* + readerDesc: protobuf description for reader initlization + argument: class_name, batch_size, use_slot, queue_size, buffer_size, + padding_index + + reader: + 1) each thread has a reader, reader will read input data and + put it into input queue + 2) each reader has a Next() iterface, that can fetch an instance + from the input queue + */ + // todo: should be factory method for creating datafeed + std::vector> readers; + PrepareReaders(readers, actual_thread_num, data_feed_desc, filelist); + + std::vector> workers; + workers.resize(actual_thread_num); + for (auto& worker : workers) { + worker.reset(new ExecutorThreadWorker); + } + + // prepare thread resource here + for (int thidx = 0; thidx < actual_thread_num; ++thidx) { + CreateThreads(workers[thidx].get(), main_program, readers[thidx], + fetch_var_names, root_scope_, thidx, debug); + } + + // start executing ops in multiple threads + for (int thidx = 0; thidx < actual_thread_num; ++thidx) { + threads.push_back( + std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get())); + } + + for (auto& th : threads) { + th.join(); + } + + root_scope_->DropKids(); + + return; +} + +} // einit_modelnd namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h new file mode 100644 index 0000000000000000000000000000000000000000..f4d2a79ac592e02f49ec0b988c824dc98883fbf6 --- /dev/null +++ b/paddle/fluid/framework/async_executor.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include // NOLINT +#include +#include +#include // NOLINT +#include +#include +#include "paddle/fluid/framework/data_feed.pb.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/executor_thread_worker.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { +class AsyncExecutor { + public: + AsyncExecutor(Scope* scope, const platform::Place& place); + virtual ~AsyncExecutor() {} + void RunFromFile(const ProgramDesc& main_program, + const std::string& data_feed_desc_str, + const std::vector& filelist, + const int thread_num, + const std::vector& fetch_names, + const bool debug = false); + + private: + void CreateThreads(ExecutorThreadWorker* worker, + const ProgramDesc& main_program, + const std::shared_ptr& reader, + const std::vector& fetch_var_names, + Scope* root_scope, const int thread_index, + const bool debug); + + public: + Scope* root_scope_; + platform::Place place_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc new file mode 100644 index 0000000000000000000000000000000000000000..291d8ffc3c3334c2836e1651a8997984bba084e1 --- /dev/null +++ b/paddle/fluid/framework/data_feed.cc @@ -0,0 +1,386 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "google/protobuf/message.h" +#include "google/protobuf/text_format.h" + +#include "gflags/gflags.h" +#include "paddle/fluid/framework/data_feed.h" +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/feed_fetch_type.h" + +namespace paddle { +namespace framework { + +std::vector DataFeed::filelist_; +size_t DataFeed::file_idx_; +std::mutex DataFeed::mutex_for_pick_file_; +bool DataFeed::finish_set_filelist_; + +void DataFeed::AddFeedVar(Variable* var, const std::string& name) { + CheckInit(); + for (size_t i = 0; i < use_slots_.size(); ++i) { + if (name == use_slots_[i]) { + if (use_slots_is_dense_[i]) { + feed_vec_[i] = MixTensor(var->GetMutable()); + } else { + feed_vec_[i] = MixTensor(var->GetMutable()); + } + } + } +} + +bool DataFeed::SetFileList(const std::vector& files) { + std::unique_lock lock(mutex_for_pick_file_); + CheckInit(); + if (finish_set_filelist_) { + VLOG(3) << "info: you have set the filelist."; + return false; + } + PADDLE_ENFORCE(files.size(), "You have set an empty filelist."); + filelist_.assign(files.begin(), files.end()); + file_idx_ = 0; + + finish_set_filelist_ = true; + return true; +} + +void DataFeed::SetBatchSize(int batch_size) { + PADDLE_ENFORCE(batch_size > 0, "Illegal batch size: %d.", batch_size); + default_batch_size_ = batch_size; +} + +bool DataFeed::PickOneFile(std::string* filename) { + std::unique_lock lock(mutex_for_pick_file_); + if (file_idx_ == filelist_.size()) { + return false; + } + *filename = filelist_[file_idx_++]; + return true; +} + +void DataFeed::CheckInit() { + PADDLE_ENFORCE(finish_init_, "Initialization did not succeed."); +} + +void DataFeed::CheckSetFileList() { + PADDLE_ENFORCE(finish_set_filelist_, "Set filelist did not succeed."); +} + +void DataFeed::CheckStart() { + PADDLE_ENFORCE(finish_start_, "Datafeed has not started running yet."); +} + +template +void PrivateQueueDataFeed::SetQueueSize(int queue_size) { + PADDLE_ENFORCE(queue_size > 0, "Illegal queue size: %d.", queue_size); + queue_size_ = queue_size; + queue_ = std::unique_ptr>( + new paddle::operators::reader::BlockingQueue(queue_size_)); +} + +template +bool PrivateQueueDataFeed::Start() { + CheckSetFileList(); + read_thread_ = std::thread(&PrivateQueueDataFeed::ReadThread, this); + read_thread_.detach(); + + finish_start_ = true; + return true; +} + +template +void PrivateQueueDataFeed::ReadThread() { + std::string filename; + while (PickOneFile(&filename)) { + file_.open(filename.c_str()); // is_text_feed + PADDLE_ENFORCE(file_.good(), "Open file<%s> fail.", filename.c_str()); + T instance; + while (ParseOneInstance(&instance)) { + queue_->Send(instance); + } + file_.close(); + } + queue_->Close(); +} + +template +int PrivateQueueDataFeed::Next() { + CheckStart(); + int index = 0; + T instance; + T ins_vec; + while (index < default_batch_size_) { + if (!queue_->Receive(&instance)) { + break; + } + AddInstanceToInsVec(&ins_vec, instance, index++); + } + batch_size_ = index; + if (batch_size_ != 0) { + PutToFeedVec(ins_vec); + } + return batch_size_; +} + +#ifdef _WIN32 +template class PrivateQueueDataFeed>; +#endif + +void MultiSlotDataFeed::Init( + const paddle::framework::DataFeedDesc& data_feed_desc) { + finish_init_ = false; + finish_set_filelist_ = false; + finish_start_ = false; + + PADDLE_ENFORCE(data_feed_desc.has_multi_slot_desc(), + "Multi_slot_desc has not been set."); + paddle::framework::MultiSlotDesc multi_slot_desc = + data_feed_desc.multi_slot_desc(); + SetBatchSize(data_feed_desc.batch_size()); + SetQueueSize(data_feed_desc.batch_size()); + size_t all_slot_num = multi_slot_desc.slots_size(); + all_slots_.resize(all_slot_num); + all_slots_type_.resize(all_slot_num); + use_slots_index_.resize(all_slot_num); + use_slots_.clear(); + use_slots_is_dense_.clear(); + for (size_t i = 0; i < all_slot_num; ++i) { + const auto& slot = multi_slot_desc.slots(i); + all_slots_[i] = slot.name(); + all_slots_type_[i] = slot.type(); + use_slots_index_[i] = slot.is_used() ? use_slots_.size() : -1; + if (slot.is_used()) { + use_slots_.push_back(all_slots_[i]); + use_slots_is_dense_.push_back(slot.is_dense()); + } + } + feed_vec_.resize(use_slots_.size()); + finish_init_ = true; +} + +bool MultiSlotDataFeed::CheckFile(const char* filename) { + CheckInit(); // get info of slots + std::ifstream fin(filename); + if (!fin.good()) { + VLOG(1) << "error: open file<" << filename << "> fail"; + return false; + } + std::string line; + int instance_cout = 0; + std::string all_slots_alias = ""; + for (const auto& alias : all_slots_) { + all_slots_alias += alias + " "; + } + std::string use_slots_alias = ""; + for (const auto& alias : use_slots_) { + use_slots_alias += alias + " "; + } + VLOG(3) << "total slots num: " << all_slots_.size(); + VLOG(3) << "total slots alias: " << all_slots_alias; + VLOG(3) << "used slots num: " << use_slots_.size(); + VLOG(3) << "used slots alias: " << use_slots_alias; + while (getline(fin, line)) { + ++instance_cout; + const char* str = line.c_str(); + char* endptr = const_cast(str); + int len = line.length(); + for (size_t i = 0; i < all_slots_.size(); ++i) { + int num = strtol(endptr, &endptr, 10); + if (num < 0) { + VLOG(0) << "error: the number of ids is a negative number: " << num; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } else if (num == 0) { + VLOG(0) + << "error: the number of ids can not be zero, you need " + "padding it in data generator; or if there is something wrong" + " with the data, please check if the data contains unresolvable " + "characters."; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } else if (errno == ERANGE || num > INT_MAX) { + VLOG(0) << "error: the number of ids greater than INT_MAX"; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } + if (all_slots_type_[i] == "float") { + for (int i = 0; i < num; ++i) { + strtof(endptr, &endptr); + if (errno == ERANGE) { + VLOG(0) << "error: the value is out of the range of " + "representable values for float"; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } + if (i + 1 != num && endptr - str == len) { + VLOG(0) << "error: there is a wrong with the number of ids."; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } + } + } else if (all_slots_type_[i] == "uint64") { + for (int i = 0; i < num; ++i) { + strtoull(endptr, &endptr, 10); + if (errno == ERANGE) { + VLOG(0) << "error: the value is out of the range of " + "representable values for uint64_t"; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } + if (i + 1 != num && endptr - str == len) { + VLOG(0) << "error: there is a wrong with the number of ids."; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } + } + } else { + VLOG(0) << "error: this type<" << all_slots_type_[i] + << "> is not supported"; + return false; + } + } + // It may be added '\t' character to the end of the output of reduce + // task when processes data by Hadoop(when the output of the reduce + // task of Hadoop has only one field, it will add a '\t' at the end + // of the line by default, and you can use this option to avoid it: + // `-D mapred.textoutputformat.ignoreseparator=true`), which does + // not affect the correctness of the data. Therefore, it should be + // judged that the data is not normal when the end of each line of + // data contains characters which are not spaces. + while (endptr - str != len) { + if (!isspace(*(endptr++))) { + VLOG(0) + << "error: there is some extra characters at the end of the line."; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } + } + } + VLOG(3) << "instances cout: " << instance_cout; + VLOG(3) << "The file format is correct"; + return true; +} + +bool MultiSlotDataFeed::ParseOneInstance(std::vector* instance) { + std::string line; + if (getline(file_, line)) { + int use_slots_num = use_slots_.size(); + instance->resize(use_slots_num); + // parse line + const char* str = line.c_str(); + char* endptr = const_cast(str); + int pos = 0; + for (size_t i = 0; i < use_slots_index_.size(); ++i) { + int idx = use_slots_index_[i]; + int num = strtol(&str[pos], &endptr, 10); + PADDLE_ENFORCE( + num, + "The number of ids can not be zero, you need padding " + "it in data generator; or if there is something wrong with " + "the data, please check if the data contains unresolvable " + "characters.\nplease check this error line: %s", + str); + if (idx != -1) { + (*instance)[idx].Init(all_slots_type_[i]); + if ((*instance)[idx].GetType()[0] == 'f') { // float + for (int j = 0; j < num; ++j) { + float feasign = strtof(endptr, &endptr); + (*instance)[idx].AddValue(feasign); + } + } else if ((*instance)[idx].GetType()[0] == 'u') { // uint64 + for (int j = 0; j < num; ++j) { + uint64_t feasign = (uint64_t)strtoull(endptr, &endptr, 10); + (*instance)[idx].AddValue(feasign); + } + } + pos = endptr - str; + } else { + for (int j = 0; j <= num; ++j) { + pos = line.find_first_of(' ', pos + 1); + } + } + } + } else { + return false; + } + return true; +} + +void MultiSlotDataFeed::AddInstanceToInsVec( + std::vector* ins_vec, + const std::vector& instance, int index) { + if (index == 0) { + ins_vec->resize(instance.size()); + for (size_t i = 0; i < instance.size(); ++i) { + (*ins_vec)[i].Init(instance[i].GetType()); + (*ins_vec)[i].InitOffset(); + } + } + for (size_t i = 0; i < instance.size(); ++i) { + (*ins_vec)[i].AddIns(instance[i]); + } +} + +void MultiSlotDataFeed::PutToFeedVec( + const std::vector& ins_vec) { + for (size_t i = 0; i < use_slots_.size(); ++i) { + const auto& type = ins_vec[i].GetType(); + const auto& offset = ins_vec[i].GetOffset(); + int total_instance = static_cast(offset.back()); + if (type[0] == 'f') { // float + const auto& feasign = ins_vec[i].GetFloatData(); + if (feed_vec_[i].IsDense()) { + int size_in_each_batch = total_instance / batch_size_; + float* tensor_ptr = feed_vec_[i].GetTensor()->mutable_data( + {batch_size_, size_in_each_batch}, platform::CPUPlace()); + memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float)); + } else { + float* tensor_ptr = feed_vec_[i].GetLoDTensor()->mutable_data( + {total_instance, 1}, platform::CPUPlace()); + memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float)); + LoD data_lod{offset}; + feed_vec_[i].GetLoDTensor()->set_lod(data_lod); + } + } else if (type[0] == 'u') { // uint64 + // no uint64_t type in paddlepaddle + const auto& feasign = ins_vec[i].GetUint64Data(); + if (feed_vec_[i].IsDense()) { + int size_in_each_batch = total_instance / batch_size_; + int64_t* tensor_ptr = feed_vec_[i].GetTensor()->mutable_data( + {batch_size_, size_in_each_batch}, platform::CPUPlace()); + memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t)); + } else { + int64_t* tensor_ptr = + feed_vec_[i].GetLoDTensor()->mutable_data( + {total_instance, 1}, platform::CPUPlace()); + memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t)); + LoD data_lod{offset}; + feed_vec_[i].GetLoDTensor()->set_lod(data_lod); + } + } + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h new file mode 100644 index 0000000000000000000000000000000000000000..a7f8d1d31752af200145bc7934e7880910338e9d --- /dev/null +++ b/paddle/fluid/framework/data_feed.h @@ -0,0 +1,269 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include // NOLINT +#include +#include // NOLINT +#include + +#include "paddle/fluid/framework/data_feed.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/reader/blocking_queue.h" + +namespace paddle { +namespace framework { + +// Pack Tensor type and LoDTensor type into MixTensor type, in order +// to record either Tensor or LoDTensor information at the same time. +class MixTensor { + public: + MixTensor() {} + explicit MixTensor(LoDTensor* lodtensor) { + is_dense_ = false; + lodtensor_ = lodtensor; + } + explicit MixTensor(Tensor* tensor) { + is_dense_ = true; + tensor_ = tensor; + } + bool IsDense() { return is_dense_; } + LoDTensor* GetLoDTensor() { + PADDLE_ENFORCE(!is_dense_, "Let a dense var return a LoDTensor ptr."); + return lodtensor_; + } + Tensor* GetTensor() { + PADDLE_ENFORCE(is_dense_, "Let a sparse var return a Tensor ptr."); + return tensor_; + } + + private: + bool is_dense_; + LoDTensor* lodtensor_; + Tensor* tensor_; +}; + +// DataFeed is the base virtual class for all ohther DataFeeds. +// It is used to read files and parse the data for subsequent trainer. +// Example: +// DataFeed* reader = +// paddle::framework::DataFeedFactory::CreateDataFeed(data_feed_name); +// reader->Init(data_feed_desc); // data_feed_desc is a protobuf object +// reader->SetFileList(filelist); +// const std::vector & use_slot_alias = +// reader->GetUseSlotAlias(); +// for (auto name: use_slot_alias){ // for binding memory +// reader->AddFeedVar(scope->Var(name), name); +// } +// reader->Start(); +// while (reader->Next()) { +// // trainer do something +// } +class DataFeed { + public: + DataFeed() {} + virtual ~DataFeed() {} + virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc) = 0; + virtual bool CheckFile(const char* filename) { + PADDLE_THROW("This function(CheckFile) is not implemented."); + } + // Set filelist for DataFeed. + // Pay attention that it must init all readers before call this function. + // Otherwise, Init() function will init finish_set_filelist_ flag. + virtual bool SetFileList(const std::vector& files); + virtual bool Start() = 0; + // The trainer calls the Next() function, and the DataFeed will load a new + // batch to the feed_vec. The return value of this function is the batch + // size of the current batch. + virtual int Next() = 0; + // Get all slots' alias which defined in protofile + virtual const std::vector& GetAllSlotAlias() { + return all_slots_; + } + // Get used slots' alias which defined in protofile + virtual const std::vector& GetUseSlotAlias() { + return use_slots_; + } + // This function is used for binding feed_vec memory + virtual void AddFeedVar(Variable* var, const std::string& name); + + protected: + // The following three functions are used to check if it is executed in this + // order: + // Init() -> SetFileList() -> Start() -> Next() + virtual void CheckInit(); + virtual void CheckSetFileList(); + virtual void CheckStart(); + virtual void SetBatchSize( + int batch); // batch size will be set in Init() function + // This function is used to pick one file from the global filelist(thread + // safe). + virtual bool PickOneFile(std::string* filename); + + static std::vector filelist_; + static size_t file_idx_; + static std::mutex mutex_for_pick_file_; + + // the alias of used slots, and its order is determined by + // data_feed_desc(proto object) + std::vector use_slots_; + std::vector use_slots_is_dense_; + + // the alias of all slots, and its order is determined by data_feed_desc(proto + // object) + std::vector all_slots_; + std::vector all_slots_type_; + std::vector + use_slots_index_; // -1: not used; >=0: the index of use_slots_ + + // The data read by DataFeed will be stored here + std::vector feed_vec_; + + // the batch size defined by user + int default_batch_size_; + // current batch size + int batch_size_; + + bool finish_init_; + static bool finish_set_filelist_; + bool finish_start_; +}; + +// PrivateQueueDataFeed is the base virtual class for ohther DataFeeds. +// It use a read-thread to read file and parse data to a private-queue +// (thread level), and get data from this queue when trainer call Next(). +template +class PrivateQueueDataFeed : public DataFeed { + public: + PrivateQueueDataFeed() {} + virtual ~PrivateQueueDataFeed() {} + virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc) = 0; + virtual bool Start(); + virtual int Next(); + + protected: + // The thread implementation function for reading file and parse. + virtual void ReadThread(); + // This function is used to set private-queue size, and the most + // efficient when the queue size is close to the batch size. + virtual void SetQueueSize(int queue_size); + // The reading and parsing method called in the ReadThread. + virtual bool ParseOneInstance(T* instance) = 0; + // This function is used to put instance to vec_ins + virtual void AddInstanceToInsVec(T* vec_ins, const T& instance, + int index) = 0; + // This function is used to put ins_vec to feed_vec + virtual void PutToFeedVec(const T& ins_vec) = 0; + + // The thread for read files + std::thread read_thread_; + // using ifstream one line and one line parse is faster + // than using fread one buffer and one buffer parse. + // for a 601M real data: + // ifstream one line and one line parse: 6034 ms + // fread one buffer and one buffer parse: 7097 ms + std::ifstream file_; + size_t queue_size_; + // The queue for store parsed data + std::unique_ptr> queue_; +}; + +// This class define the data type of instance(ins_vec) in MultiSlotDataFeed +class MultiSlotType { + public: + MultiSlotType() {} + ~MultiSlotType() {} + void Init(const std::string& type) { + CheckType(type); + if (type_[0] == 'f') { + float_feasign_.clear(); + } else if (type_[0] == 'u') { + uint64_feasign_.clear(); + } + type_ = type; + } + void InitOffset() { + offset_.resize(1); + // LoDTensor' lod is counted from 0, the size of lod + // is one size larger than the size of data. + offset_[0] = 0; + } + const std::vector& GetOffset() const { return offset_; } + void AddValue(const float v) { + CheckFloat(); + float_feasign_.push_back(v); + } + void AddValue(const uint64_t v) { + CheckUint64(); + uint64_feasign_.push_back(v); + } + void AddIns(const MultiSlotType& ins) { + if (ins.GetType()[0] == 'f') { // float + CheckFloat(); + auto& vec = ins.GetFloatData(); + offset_.push_back(offset_.back() + vec.size()); + float_feasign_.insert(float_feasign_.end(), vec.begin(), vec.end()); + } else if (ins.GetType()[0] == 'u') { // uint64 + CheckUint64(); + auto& vec = ins.GetUint64Data(); + offset_.push_back(offset_.back() + vec.size()); + uint64_feasign_.insert(uint64_feasign_.end(), vec.begin(), vec.end()); + } + } + const std::vector& GetFloatData() const { return float_feasign_; } + const std::vector& GetUint64Data() const { return uint64_feasign_; } + const std::string& GetType() const { return type_; } + + private: + void CheckType(const std::string& type) const { + PADDLE_ENFORCE((type == "uint64") || (type == "float"), + "There is no this type<%s>.", type); + } + void CheckFloat() const { + PADDLE_ENFORCE(type_[0] == 'f', "Add %s value to float slot.", type_); + } + void CheckUint64() const { + PADDLE_ENFORCE(type_[0] == 'u', "Add %s value to uint64 slot.", type_); + } + std::vector float_feasign_; + std::vector uint64_feasign_; + std::string type_; + std::vector offset_; +}; + +// This DataFeed is used to feed multi-slot type data. +// The format of multi-slot type data: +// [n feasign_0 feasign_1 ... feasign_n]* +class MultiSlotDataFeed + : public PrivateQueueDataFeed> { + public: + MultiSlotDataFeed() {} + virtual ~MultiSlotDataFeed() {} + virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc); + virtual bool CheckFile(const char* filename); + + protected: + virtual void AddInstanceToInsVec(std::vector* vec_ins, + const std::vector& instance, + int index); + virtual bool ParseOneInstance(std::vector* instance); + virtual void PutToFeedVec(const std::vector& ins_vec); +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_feed.proto b/paddle/fluid/framework/data_feed.proto new file mode 100644 index 0000000000000000000000000000000000000000..489fec08d86ccf61ece29bbba6d0204f25530b0f --- /dev/null +++ b/paddle/fluid/framework/data_feed.proto @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +syntax = "proto2"; +package paddle.framework; + +message Slot { + required string name = 1; + required string type = 2; + optional bool is_dense = 3 [ default = false ]; + optional bool is_used = 4 [ default = false ]; +} + +message MultiSlotDesc { repeated Slot slots = 1; } + +message DataFeedDesc { + optional string name = 1; + optional int32 batch_size = 2 [ default = 32 ]; + optional MultiSlotDesc multi_slot_desc = 3; +} diff --git a/paddle/fluid/framework/data_feed_factory.cc b/paddle/fluid/framework/data_feed_factory.cc new file mode 100644 index 0000000000000000000000000000000000000000..72148b9f7d343e19d60bb2be44d8270ad78d1412 --- /dev/null +++ b/paddle/fluid/framework/data_feed_factory.cc @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/data_feed_factory.h" +#include +#include +#include + +#include "paddle/fluid/framework/data_feed.h" + +namespace paddle { +namespace framework { +typedef std::shared_ptr (*Createdata_feedFunction)(); +typedef std::unordered_map data_feedMap; +data_feedMap g_data_feed_map; + +#define REGISTER_DATAFEED_CLASS(data_feed_class) \ + namespace { \ + std::shared_ptr Creator_##data_feed_class() { \ + return std::shared_ptr(new data_feed_class); \ + } \ + class __Registerer_##data_feed_class { \ + public: \ + __Registerer_##data_feed_class() { \ + g_data_feed_map[#data_feed_class] = &Creator_##data_feed_class; \ + } \ + }; \ + __Registerer_##data_feed_class g_registerer_##data_feed_class; \ + } // namespace + +std::string DataFeedFactory::DataFeedTypeList() { + std::string data_feed_types; + for (auto iter = g_data_feed_map.begin(); iter != g_data_feed_map.end(); + ++iter) { + if (iter != g_data_feed_map.begin()) { + data_feed_types += ", "; + } + data_feed_types += iter->first; + } + return data_feed_types; +} + +std::shared_ptr DataFeedFactory::CreateDataFeed( + std::string data_feed_class) { + if (g_data_feed_map.count(data_feed_class) < 1) { + exit(-1); + } + return g_data_feed_map[data_feed_class](); +} + +REGISTER_DATAFEED_CLASS(MultiSlotDataFeed); +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_feed_factory.h b/paddle/fluid/framework/data_feed_factory.h new file mode 100644 index 0000000000000000000000000000000000000000..13678edb0b8d084a0b3016d93f6e1bc32ce0169a --- /dev/null +++ b/paddle/fluid/framework/data_feed_factory.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/data_feed.h" + +namespace paddle { +namespace framework { +class DataFeedFactory { + public: + static std::string DataFeedTypeList(); + static std::shared_ptr CreateDataFeed(std::string data_feed_class); +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_feed_test.cc b/paddle/fluid/framework/data_feed_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..3974f8dbadf332801a822618d77f140db440b29d --- /dev/null +++ b/paddle/fluid/framework/data_feed_test.cc @@ -0,0 +1,337 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/data_feed.h" +#include +#include // NOLINT +#include +#include +#include +#include // NOLINT +#include +#include // NOLINT +#include +#include +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "google/protobuf/text_format.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/data_feed_factory.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" + +paddle::framework::DataFeedDesc load_datafeed_param_from_file( + const char* filename) { + paddle::framework::DataFeedDesc data_feed_desc; + int file_descriptor = open(filename, O_RDONLY); + PADDLE_ENFORCE(file_descriptor != -1, "Can not open %s.", filename); + google::protobuf::io::FileInputStream fileInput(file_descriptor); + google::protobuf::TextFormat::Parse(&fileInput, &data_feed_desc); + close(file_descriptor); + return data_feed_desc; +} + +const std::vector load_filelist_from_file(const char* filename) { + std::vector filelist; + std::ifstream fin(filename); + PADDLE_ENFORCE(fin.good(), "Can not open %s.", filename); + std::string line; + while (getline(fin, line)) { + filelist.push_back(line); + } + fin.close(); + return filelist; +} + +void GenerateFileForTest(const char* protofile, const char* filelist) { + std::ofstream w_protofile(protofile); + w_protofile << "name: \"MultiSlotDataFeed\"\n" + "batch_size: 2\n" + "multi_slot_desc {\n" + " slots {\n" + " name: \"uint64_sparse_slot\"\n" + " type: \"uint64\"\n" + " is_dense: false\n" + " is_used: true\n" + " }\n" + " slots {\n" + " name: \"float_sparse_slot\"\n" + " type: \"float\"\n" + " is_dense: false\n" + " is_used: true\n" + " }\n" + " slots {\n" + " name: \"uint64_dense_slot\"\n" + " type: \"uint64\"\n" + " is_dense: true\n" + " is_used: true\n" + " }\n" + " slots {\n" + " name: \"float_dense_slot\"\n" + " type: \"float\"\n" + " is_dense: true\n" + " is_used: true\n" + " }\n" + " slots {\n" + " name: \"not_used_slot\"\n" + " type: \"uint64\"\n" + " is_dense: false\n" + " is_used: false\n" + " }\n" + "}"; + w_protofile.close(); + std::ofstream w_filelist(filelist); + int total_file = 4; + for (int i = 0; i < total_file; ++i) { + std::string filename = "TestMultiSlotDataFeed.data." + std::to_string(i); + w_filelist << filename; + if (i + 1 != total_file) { + w_filelist << std::endl; + } + std::ofstream w_datafile(filename.c_str()); + w_datafile << "3 3978 620 82 1 1926.08 1 1926 1 6.02 1 1996\n" + "2 1300 2983353 1 985.211 1 8 1 0.618 1 12\n" + "1 19260827 2 3.14 2.718 1 27 1 2.236 1 28\n"; + w_datafile.close(); + } + w_filelist.close(); +} + +class MultiTypeSet { + public: + MultiTypeSet() { + uint64_set_.clear(); + float_set_.clear(); + } + ~MultiTypeSet() {} + void AddValue(uint64_t v) { uint64_set_.insert(v); } + void AddValue(float v) { float_set_.insert(v); } + const std::set& GetUint64Set() const { return uint64_set_; } + const std::set& GetFloatSet() const { return float_set_; } + + private: + std::set uint64_set_; + std::set float_set_; +}; + +void GetElemSetFromReader(std::vector* reader_elem_set, + const paddle::framework::DataFeedDesc& data_feed_desc, + const std::vector& filelist, + const int thread_num) { + int used_slot_num = 0; + for (auto i = 0; i < data_feed_desc.multi_slot_desc().slots_size(); ++i) { + if (data_feed_desc.multi_slot_desc().slots(i).is_used()) { + ++used_slot_num; + } + } + reader_elem_set->resize(used_slot_num); + std::vector threads; + std::vector> readers; + readers.resize(thread_num); + for (int i = 0; i < thread_num; ++i) { + readers[i] = paddle::framework::DataFeedFactory::CreateDataFeed( + data_feed_desc.name()); + readers[i]->Init(data_feed_desc); + } + readers[0]->SetFileList(filelist); + std::mutex mu; + for (int idx = 0; idx < thread_num; ++idx) { + threads.emplace_back(std::thread([&, idx] { + std::unique_ptr scope( + new paddle::framework::Scope()); + const auto& multi_slot_desc = data_feed_desc.multi_slot_desc(); + std::map + lodtensor_targets; + std::map tensor_targets; + for (int i = 0; i < multi_slot_desc.slots_size(); ++i) { + const auto& slot = multi_slot_desc.slots(i); + if (slot.is_used()) { + const auto& name = slot.name(); + readers[idx]->AddFeedVar(scope->Var(name), name); + if (slot.is_dense()) { + tensor_targets[name] = + &scope->FindVar(name)->Get(); + } else { + lodtensor_targets[name] = + &scope->FindVar(name)->Get(); + } + } + } + readers[idx]->Start(); + while (readers[idx]->Next()) { + int index = 0; + for (int k = 0; k < multi_slot_desc.slots_size(); ++k) { + const auto& slot = multi_slot_desc.slots(k); + if (!slot.is_used()) { + continue; + } + if (slot.is_dense()) { // dense branch + const paddle::framework::Tensor* tens = tensor_targets[slot.name()]; + if (slot.type() == "uint64") { + const int64_t* data = tens->data(); + int batch_size = tens->dims()[0]; + int dim = tens->dims()[1]; + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < dim; ++j) { + std::lock_guard lock(mu); + (*reader_elem_set)[index].AddValue( + (uint64_t)data[i * dim + j]); + } + } + } else if (slot.type() == "float") { + const float* data = tens->data(); + int batch_size = tens->dims()[0]; + int dim = tens->dims()[1]; + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < dim; ++j) { + std::lock_guard lock(mu); + (*reader_elem_set)[index].AddValue(data[i * dim + j]); + } + } + } else { + PADDLE_THROW("Error type in proto file."); + } + } else { // sparse branch + const paddle::framework::LoDTensor* tens = + lodtensor_targets[slot.name()]; + if (slot.type() == "uint64") { + const int64_t* data = tens->data(); + for (size_t i = 0; i < tens->NumElements(); ++i) { + std::pair element = tens->lod_element(0, i); + for (size_t j = element.first; j < element.second; ++j) { + std::lock_guard lock(mu); + (*reader_elem_set)[index].AddValue((uint64_t)data[j]); + } + } + } else if (slot.type() == "float") { + const float* data = tens->data(); + for (size_t i = 0; i < tens->NumElements(); ++i) { + std::pair element = tens->lod_element(0, i); + for (size_t j = element.first; j < element.second; ++j) { + std::lock_guard lock(mu); + (*reader_elem_set)[index].AddValue(data[j]); + } + } + } else { + PADDLE_THROW("Error type in proto file."); + } + } // end sparse branch + ++index; + } // end slots loop + } // end while Next() + })); // end anonymous function + } + for (auto& th : threads) { + th.join(); + } +} + +void CheckIsUnorderedSame(const std::vector& s1, + const std::vector& s2) { + EXPECT_EQ(s1.size(), s2.size()); + for (size_t i = 0; i < s1.size(); ++i) { + // check for uint64 + const std::set& uint64_s1 = s1[i].GetUint64Set(); + const std::set& uint64_s2 = s2[i].GetUint64Set(); + EXPECT_EQ(uint64_s1.size(), uint64_s2.size()); + auto uint64_it1 = uint64_s1.begin(); + auto uint64_it2 = uint64_s2.begin(); + while (uint64_it1 != uint64_s1.end()) { + EXPECT_EQ(*uint64_it1, *uint64_it2); + ++uint64_it1; + ++uint64_it2; + } + // check for float + const std::set& float_s1 = s1[i].GetFloatSet(); + const std::set& float_s2 = s2[i].GetFloatSet(); + EXPECT_EQ(float_s1.size(), float_s2.size()); + auto float_it1 = float_s1.begin(); + auto float_it2 = float_s2.begin(); + while (float_it1 != float_s1.end()) { + EXPECT_EQ(*float_it1, *float_it2); + ++float_it1; + ++float_it2; + } + } +} + +void GetElemSetFromFile(std::vector* file_elem_set, + const paddle::framework::DataFeedDesc& data_feed_desc, + const std::vector& filelist) { + int used_slot_num = 0; + for (auto i = 0; i < data_feed_desc.multi_slot_desc().slots_size(); ++i) { + if (data_feed_desc.multi_slot_desc().slots(i).is_used()) { + ++used_slot_num; + } + } + file_elem_set->resize(used_slot_num); + for (const auto& file : filelist) { + std::ifstream fin(file.c_str()); + PADDLE_ENFORCE(fin.good(), "Can not open %s.", file.c_str()); + while (1) { + bool end_flag = false; + int index = 0; + for (auto i = 0; i < data_feed_desc.multi_slot_desc().slots_size(); ++i) { + int num; + if (fin >> num) { + auto slot = data_feed_desc.multi_slot_desc().slots(i); + auto type = slot.type(); + if (type == "uint64") { + while (num--) { + uint64_t feasign; + fin >> feasign; + if (slot.is_used()) { + (*file_elem_set)[index].AddValue(feasign); + } + } + } else if (type == "float") { + while (num--) { + float feasign; + fin >> feasign; + if (slot.is_used()) { + (*file_elem_set)[index].AddValue(feasign); + } + } + } else { + PADDLE_THROW("Error type in proto file."); + } + if (slot.is_used()) { + ++index; + } + } else { + end_flag = true; + break; + } + } + if (end_flag) { + break; + } + } + fin.close(); + } +} + +TEST(DataFeed, MultiSlotUnitTest) { + const char* protofile = "data_feed_desc.prototxt"; + const char* filelist_name = "filelist.txt"; + GenerateFileForTest(protofile, filelist_name); + const std::vector filelist = + load_filelist_from_file(filelist_name); + paddle::framework::DataFeedDesc data_feed_desc = + load_datafeed_param_from_file(protofile); + std::vector reader_elem_set; + std::vector file_elem_set; + GetElemSetFromReader(&reader_elem_set, data_feed_desc, filelist, 4); + GetElemSetFromFile(&file_elem_set, data_feed_desc, filelist); + CheckIsUnorderedSame(reader_elem_set, file_elem_set); +} diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index a003995ae3f8e111881b4681554aa8eb17b60cc1..e8bf53e160e7382122c3c2f92a152fea058a032e 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -48,7 +48,14 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, void AllReduceOpHandle::RunImpl() { platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second); +// FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR, +// this is a distributed or inter-process call, find a better way. +#ifdef PADDLE_WITH_CUDA + if (NoDummyInputSize() == 1 && + local_scopes_[0]->FindLocalVar(NCCL_ID_VARNAME) == nullptr) { +#else if (NoDummyInputSize() == 1) { +#endif return; // No need to all reduce when GPU count = 1; } else { // Wait input done diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 523f9eadf2d7e2e08504c5920372fb7cdb0d7aba..1e1b945f63cf480308c05ffc0f9a3b9f0b0da02b 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -62,6 +62,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { auto multi_devices_pass = AppendPass("multi_devices_pass"); multi_devices_pass->SetNotOwned("strategy", &strategy_); + multi_devices_pass->Set("num_trainers", + new int(strategy_.num_trainers_)); // Add a graph print pass to record a graph with device info. if (!strategy_.debug_graphviz_path_.empty()) { diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index a36ad259265e01121f8fc0060058ed55406c9f97..cbae5321d9a3fdcaebca2bd9111fff6a10b235a8 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -133,6 +133,7 @@ static const char kPlaces[] = "places"; static const char kParams[] = "params"; static const char kLocalScopes[] = "local_scopes"; static const char kStrategy[] = "strategy"; +static const char kNumTrainers[] = "num_trainers"; void MultiDevSSAGraphBuilder::Init() const { all_vars_.clear(); @@ -299,6 +300,8 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( auto nodes = graph->ReleaseNodes(); ir::Graph &result = *graph; + int num_trainers = Get(kNumTrainers); + for (auto &node : nodes) { if (node->IsVar() && node->Var()) { all_vars_.emplace(node->Name(), node->Var()); @@ -383,7 +386,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( CreateComputationalOps(&result, node, places_.size()); } - if (!is_forwarding && places_.size() > 1) { + if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) { // Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. if (static_cast(boost::get(node->Op()->GetAttr( @@ -862,7 +865,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( if (node->Op()->Type() == "fetch_barrier") { outvar_dev_id = GetVarDeviceID(*result, output->Name(), *sharded_var_device); - PADDLE_ENFORCE_NE(outvar_dev_id, -1); + PADDLE_ENFORCE_NE(outvar_dev_id, -1, "output name %s", output->Name()); } p = places_[outvar_dev_id]; ir::Node *new_node = nullptr; @@ -895,4 +898,5 @@ REGISTER_PASS(multi_devices_pass, .RequirePassAttr(paddle::framework::details::kPlaces) .RequirePassAttr(paddle::framework::details::kParams) .RequirePassAttr(paddle::framework::details::kLocalScopes) - .RequirePassAttr(paddle::framework::details::kStrategy); + .RequirePassAttr(paddle::framework::details::kStrategy) + .RequirePassAttr(paddle::framework::details::kNumTrainers); diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index e5b1eaa7318aecde1dbf89de8fe242a3008db97c..499246a9856bb3ba67a155c6f00c3ad06af50edf 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -16,7 +16,7 @@ #include #include #include -#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/platform/profiler.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/framework/details/reference_count_op_handle.h" diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 96132a2c18233ca10d7bad4e26dfabadd39d84db..73cec21e20f2fd26e144872f1f7b5bb7065adb74 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/transfer_scope_cache.h" +#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" @@ -114,36 +115,6 @@ void Executor::Close() { #endif } -void InitializeVariable(Variable* var, proto::VarType::Type var_type) { - if (var_type == proto::VarType::LOD_TENSOR) { - var->GetMutable(); - } else if (var_type == proto::VarType::SELECTED_ROWS) { - var->GetMutable(); - } else if (var_type == proto::VarType::FEED_MINIBATCH) { - var->GetMutable(); - } else if (var_type == proto::VarType::FETCH_LIST) { - var->GetMutable(); - } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable>(); - } else if (var_type == proto::VarType::LOD_RANK_TABLE) { - var->GetMutable(); - } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { - var->GetMutable(); - } else if (var_type == proto::VarType::PLACE_LIST) { - var->GetMutable(); - } else if (var_type == proto::VarType::READER) { - var->GetMutable(); - } else if (var_type == proto::VarType::RAW) { - // GetMutable will be called in operator - } else { - PADDLE_THROW( - "Variable type %d is not in " - "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, " - "LOD_RANK_TABLE, PLACE_LIST, READER, RAW]", - var_type); - } -} - void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id) { auto& global_block = pdesc.Block(block_id); diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 36b36d49c2728dbef93042158dffa26d8f56d529..2d47903ffbd8d821b7c31386b225fe5e65ca2720 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -26,7 +26,6 @@ limitations under the License. */ namespace paddle { namespace framework { -extern void InitializeVariable(Variable* var, proto::VarType::Type var_type); template std::unordered_map GetNonPersistableReferenceCount( diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc new file mode 100644 index 0000000000000000000000000000000000000000..4e4001e979fdd0774779fa288402c7847af90637 --- /dev/null +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -0,0 +1,223 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/executor_thread_worker.h" +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "google/protobuf/message.h" +#include "google/protobuf/text_format.h" + +#include "gflags/gflags.h" +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/pybind/pybind.h" +namespace paddle { +namespace framework { + +void ExecutorThreadWorker::CreateThreadOperators(const ProgramDesc& program) { + auto& block = program.Block(0); + op_names_.clear(); + for (auto& op_desc : block.AllOps()) { + std::unique_ptr local_op = OpRegistry::CreateOp(*op_desc); + op_names_.push_back(op_desc->Type()); + OperatorBase* local_op_ptr = local_op.release(); + ops_.push_back(local_op_ptr); + continue; + } +} + +void ExecutorThreadWorker::CreateThreadResource( + const framework::ProgramDesc& program, + const paddle::platform::Place& place) { + CreateThreadScope(program); + CreateThreadOperators(program); + SetMainProgram(program); + SetPlace(place); +} + +void ExecutorThreadWorker::CreateThreadScope(const ProgramDesc& program) { + auto& block = program.Block(0); + + PADDLE_ENFORCE_NOT_NULL( + root_scope_, "root_scope should be set before creating thread scope"); + + thread_scope_ = &root_scope_->NewScope(); + for (auto& var : block.AllVars()) { + if (var->Persistable()) { + auto* ptr = root_scope_->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + } else { + auto* ptr = thread_scope_->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + } + } +} + +void ExecutorThreadWorker::SetDataFeed( + const std::shared_ptr& datafeed) { + thread_reader_ = datafeed; +} + +void ExecutorThreadWorker::BindingDataFeedMemory() { + const std::vector& input_feed = + thread_reader_->GetUseSlotAlias(); + for (auto name : input_feed) { + thread_reader_->AddFeedVar(thread_scope_->Var(name), name); + } +} + +void ExecutorThreadWorker::SetFetchVarNames( + const std::vector& fetch_var_names) { + fetch_var_names_.clear(); + fetch_var_names_.insert(fetch_var_names_.end(), fetch_var_names.begin(), + fetch_var_names.end()); +} + +void ExecutorThreadWorker::SetDevice() { +#if defined _WIN32 || defined __APPLE__ + return; +#else + static unsigned concurrency_cap = std::thread::hardware_concurrency(); + int thread_id = this->thread_id_; + + if (thread_id < concurrency_cap) { + unsigned proc = thread_id; + + cpu_set_t mask; + CPU_ZERO(&mask); + CPU_SET(proc, &mask); + + if (-1 == sched_setaffinity(0, sizeof(mask), &mask)) { + VLOG(1) << "WARNING: Failed to set thread affinity for thread " + << thread_id; + } else { + CPU_ZERO(&mask); + if ((0 != sched_getaffinity(0, sizeof(mask), &mask)) || + (CPU_ISSET(proc, &mask) == 0)) { + VLOG(3) << "WARNING: Failed to set thread affinity for thread " + << thread_id; + } + } + } else { + VLOG(1) << "WARNING: Failed to set thread affinity for thread " + << thread_id; + } +#endif +} + +template +void print_lod_tensor(std::string var_name, const LoDTensor& lod_tensor) { + auto inspect = lod_tensor.data(); + auto element_num = lod_tensor.numel(); + + std::ostringstream sstream; + sstream << var_name << " (element num " << element_num << "): ["; + sstream << inspect[0]; + for (int j = 1; j < element_num; ++j) { + sstream << " " << inspect[j]; + } + sstream << "]"; + + std::cout << sstream.str() << std::endl; +} + +void print_fetch_var(Scope* scope, std::string var_name) { + const LoDTensor& tensor = scope->FindVar(var_name)->Get(); + + if (std::type_index(tensor.type()) == + std::type_index(typeid(platform::float16))) { + print_lod_tensor(var_name, tensor); + } else if (std::type_index(tensor.type()) == std::type_index(typeid(float))) { + print_lod_tensor(var_name, tensor); + } else if (std::type_index(tensor.type()) == + std::type_index(typeid(double))) { + print_lod_tensor(var_name, tensor); + } else if (std::type_index(tensor.type()) == std::type_index(typeid(int))) { + print_lod_tensor(var_name, tensor); + } else if (std::type_index(tensor.type()) == + std::type_index(typeid(int64_t))) { + print_lod_tensor(var_name, tensor); + } else if (std::type_index(tensor.type()) == std::type_index(typeid(bool))) { + print_lod_tensor(var_name, tensor); + } else if (std::type_index(tensor.type()) == + std::type_index(typeid(uint8_t))) { + print_lod_tensor(var_name, tensor); + } else if (std::type_index(tensor.type()) == + std::type_index(typeid(int16_t))) { + print_lod_tensor(var_name, tensor); + } else if (std::type_index(tensor.type()) == + std::type_index(typeid(int8_t))) { + print_lod_tensor(var_name, tensor); + } else { + VLOG(1) << "print_fetch_var: unrecognized data type:" + << tensor.type().name(); + } + + return; +} + +void ExecutorThreadWorker::TrainFiles() { + // todo: configurable + SetDevice(); + + int fetch_var_num = fetch_var_names_.size(); + fetch_values_.clear(); + fetch_values_.resize(fetch_var_num); + + thread_reader_->Start(); + + int cur_batch; + int batch_cnt = 0; + while ((cur_batch = thread_reader_->Next()) > 0) { + // executor run here + for (auto& op : ops_) { + op->Run(*thread_scope_, place_); + } + + ++batch_cnt; + thread_scope_->DropKids(); + + if (debug_ == false || thread_id_ != 0) { + continue; + } + + for (int i = 0; i < fetch_var_num; ++i) { + print_fetch_var(thread_scope_, fetch_var_names_[i]); + } // end for (int i = 0...) + } // end while () +} + +void ExecutorThreadWorker::SetThreadId(int tid) { thread_id_ = tid; } + +void ExecutorThreadWorker::SetPlace(const platform::Place& place) { + place_ = place; +} + +void ExecutorThreadWorker::SetMainProgram( + const ProgramDesc& main_program_desc) { + main_program_.reset(new ProgramDesc(main_program_desc)); +} + +void ExecutorThreadWorker::SetRootScope(Scope* g_scope) { + root_scope_ = g_scope; +} + +} // einit_modelnd namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h new file mode 100644 index 0000000000000000000000000000000000000000..13ec2442c46459116320236bf98f23c91340f389 --- /dev/null +++ b/paddle/fluid/framework/executor_thread_worker.h @@ -0,0 +1,88 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include // NOLINT +#include +#include +#include // NOLINT +#include +#include "paddle/fluid/framework/data_feed.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { +void CreateTensor(Variable* var, proto::VarType::Type var_type); + +class ExecutorThreadWorker { + public: + ExecutorThreadWorker() + : thread_id_(-1), root_scope_(NULL), thread_scope_(NULL), debug_(false) {} + ~ExecutorThreadWorker() {} + + void CreateThreadResource(const framework::ProgramDesc& program, + const paddle::platform::Place& place); + void SetThreadId(int tid); + void SetDebug(const bool debug) { debug_ = debug; } + void SetRootScope(Scope* g_scope); + // set cpu device in this function + // cpu binding is used by default + void SetDevice(); + // since we read data into memory that can not be accessed by program + // we need to bind memory of data with corresponding variables in program + // this function should be called after data feed is set + void BindingDataFeedMemory(); + // set data feed declared in executor + void SetDataFeed(const std::shared_ptr& datafeed); + // A multi-thread training function + void TrainFiles(); + // set fetch variable names from python interface assigned by users + void SetFetchVarNames(const std::vector& fetch_var_names); + + private: + void CreateThreadScope(const framework::ProgramDesc& program); + void CreateThreadOperators(const framework::ProgramDesc& program); + void SetMainProgram(const ProgramDesc& main_program_desc); + void SetPlace(const paddle::platform::Place& place); + + protected: + // thread index + std::shared_ptr thread_reader_; // shared queue, thread buffer + int thread_id_; + // operator name + std::vector op_names_; + // thread level, local operators for forward and backward + std::vector ops_; + // main program for training + std::unique_ptr main_program_; + // execution place + platform::Place place_; + // root scope for model parameters + Scope* root_scope_; + // a thread scope, father scope is global score which is shared + Scope* thread_scope_; + + private: + std::vector fetch_var_names_; + std::vector> fetch_values_; + bool debug_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 947c934f0ff3e06e70f26cf9a9155e8d4b4a84ad..bb2d953afbd64dc33d735f483a8eafd9ce951530 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -177,14 +177,13 @@ class Graph { return nullptr; } - const ProgramDesc &program() const { return program_; } - std::map> InitFromProgram( - const ProgramDesc &program); - void ResolveHazard( const std::map> &var_nodes); private: + std::map> InitFromProgram( + const ProgramDesc &program); + // This method takes ownership of `node`. ir::Node *AddNode(ir::Node *node) { PADDLE_ENFORCE(node_set_.find(node) == node_set_.end()); diff --git a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc index 292f232ffce48593e1827fe2dfe1b8472360054e..6d8f020918d4e56fa7f125a659f7f8511ca067ca 100644 --- a/paddle/fluid/framework/ir/is_test_pass.cc +++ b/paddle/fluid/framework/ir/is_test_pass.cc @@ -38,7 +38,7 @@ std::unique_ptr IsTestPass::ApplyImpl( for (const Node* n : graph->Nodes()) { if (n->IsOp()) { auto* op = n->Op(); - if (op->HasAttr("is_test")) { + if (n->RuntimeHasAttr("is_test")) { op->SetAttr("is_test", true); } else if (std::find(begin(op_list), end(op_list), op->Type()) != end(op_list)) { diff --git a/paddle/fluid/framework/ir/is_test_pass_tester.cc b/paddle/fluid/framework/ir/is_test_pass_tester.cc index 9696441a21661db89146c448742a992d1f7df022..d9a68c7f1dd2a0dca5204719c4ce6cd9d68292a2 100644 --- a/paddle/fluid/framework/ir/is_test_pass_tester.cc +++ b/paddle/fluid/framework/ir/is_test_pass_tester.cc @@ -104,9 +104,9 @@ TEST(IsTestPass, basic) { auto* op = node->Op(); auto op_name = boost::get(op->GetAttr("name")); if (op_name == "conv3") { - ASSERT_FALSE(op->HasAttr("is_test")); + ASSERT_FALSE(node->RuntimeHasAttr("is_test")); } else { - ASSERT_TRUE(op->HasAttr("is_test")); + ASSERT_TRUE(node->RuntimeHasAttr("is_test")); EXPECT_TRUE(boost::get(op->GetAttr("is_test"))); } } diff --git a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc index 65be69b7f5b5e363d5d0753c45f9ff9e3f329fbe..1cf1315d3d3059261d84d0e8795a75df4deae005 100644 --- a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc @@ -22,7 +22,7 @@ std::unique_ptr MKLDNNPlacementPass::ApplyImpl( std::unique_ptr graph) const { VLOG(3) << "Aplies MKL-DNN placement strategy."; for (const Node* n : graph->Nodes()) { - if (n->IsOp() && n->Op()->HasAttr("use_mkldnn")) { + if (n->IsOp() && n->RuntimeHasAttr("use_mkldnn")) { n->Op()->SetAttr("use_mkldnn", true); } } diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc index 50d9113088903aa7681d6c6af5cc65f846d32787..7a88cb2b681c1aa5e1b75481b1aba66a125a1d9c 100644 --- a/paddle/fluid/framework/ir/node.cc +++ b/paddle/fluid/framework/ir/node.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/op_info.h" namespace paddle { namespace framework { @@ -24,10 +25,33 @@ constexpr char Node::kControlDepVarName[]; const char Node::kControlDepVarName[] = "__control_var"; #endif -std::unique_ptr CreateNodeForTest(const std::string& name, +std::unique_ptr CreateNodeForTest(const std::string &name, Node::Type type) { return std::unique_ptr(new Node(name, type)); } + +bool Node::RuntimeHasAttr(const std::string &name) const { + if (Op()->HasAttr(name)) { + return true; + } else { + auto &op_info = OpInfoMap::Instance(); + auto op_type = Op()->Type(); + if (op_info.Has(op_type)) { + auto op_info_ptr = op_info.Get(op_type); + if (op_info_ptr.HasOpProtoAndChecker()) { + const proto::OpProto &proto = op_info_ptr.Proto(); + for (int i = 0; i != proto.attrs_size(); ++i) { + const proto::OpProto::Attr &attr = proto.attrs(i); + if (attr.name() == name) { + return true; + } + } + } + } + } + return false; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index d2a393b3f19e9aab79098757dae663d030b0fa2b..1044a96430f060b750580ea0b225787ba6ebd2a0 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -108,6 +108,18 @@ class Node { Name().find(ir::Node::kControlDepVarName) != std::string::npos; } + // RuntimeHasAttr is different with HasAttr now. + // 1. For Op()->HasAttr(), it judges whether a stored program_desc_ has attr, + // thus, if stored program_desc_ are old which don't have an attr, a new + // library which adds the attr already will fail on this function. + // Details: + // https://github.com/PaddlePaddle/Paddle/pull/14608#issuecomment-442309087 + // 2. For Op()->RuntimeHasAttr, it judges the attr in runtime to avoid above + // problem. + // TODO(luotao): Maybe we should enhance HasAttr later, instead of adding + // RuntimeHasAttr. + bool RuntimeHasAttr(const std::string& name) const; + std::vector inputs; std::vector outputs; diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index e8295639520b5838dce3c9c9e443cc846bd9c1ec..f1642bc0d2b10f97295e80ee201db8f83bfd06ef 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -21,42 +21,11 @@ #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/string/pretty_log.h" namespace paddle { namespace framework { - -// These code can be shared with Executor. -static void InitializeVariable(Variable *var, proto::VarType::Type var_type) { - if (var_type == proto::VarType::LOD_TENSOR) { - var->GetMutable(); - } else if (var_type == proto::VarType::SELECTED_ROWS) { - var->GetMutable(); - } else if (var_type == proto::VarType::FEED_MINIBATCH) { - var->GetMutable(); - } else if (var_type == proto::VarType::FETCH_LIST) { - var->GetMutable(); - } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable>(); - } else if (var_type == proto::VarType::LOD_RANK_TABLE) { - var->GetMutable(); - } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { - var->GetMutable(); - } else if (var_type == proto::VarType::PLACE_LIST) { - var->GetMutable(); - } else if (var_type == proto::VarType::READER) { - var->GetMutable(); - } else if (var_type == proto::VarType::RAW) { - // GetMutable will be called in operator - } else { - PADDLE_THROW( - "Variable type %d is not in " - "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, " - "LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]", - var_type); - } -} - void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc, int block_id, bool with_feed_fetch_ops) { if (!scope) { diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/framework/ngraph_bridge.cc index 8177436d0bd90c3bcf8f91d5c55b66be188b19f9..e22c29037718a60ff7f24404d7749600e2edb80b 100644 --- a/paddle/fluid/framework/ngraph_bridge.cc +++ b/paddle/fluid/framework/ngraph_bridge.cc @@ -15,23 +15,105 @@ limitations under the License. */ #ifdef PADDLE_WITH_NGRAPH #include #include +#include #include "paddle/fluid/framework/ngraph_bridge.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" #include "ngraph/ngraph.hpp" namespace paddle { namespace framework { +static std::shared_ptr GetNode( + const std::shared_ptr& op, const std::string prm, + const VariableNameMap& var_map, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto& var_names = var_map.at(prm); + PADDLE_ENFORCE_EQ(var_names.size(), 1, + "op %s prm %s expects one associated var", op->Type(), prm); + if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) { + return (*ngb_node_map)[var_names[0]]; + } else { + return nullptr; + } +} + +static std::shared_ptr GetInputNode( + const std::shared_ptr& op, const std::string prm, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + return GetNode(op, prm, op->Inputs(), ngb_node_map); +} + +static std::shared_ptr GetOutputNode( + const std::shared_ptr& op, const std::string prm, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + return GetNode(op, prm, op->Outputs(), ngb_node_map); +} + +static void SetOutputNode( + const std::shared_ptr& op, const std::string prm, + std::shared_ptr node, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto& var_names = op->Outputs().at(prm); + if (var_names.size() == 1) { + (*ngb_node_map)[var_names[0]] = node; + } else if (var_names.size() == 0) { + (*ngb_node_map)[""] = node; + } else { + PADDLE_THROW("prm %s has more than 1 var_names.", prm); + } +} + +static bool HasOutput(const std::shared_ptr& op, + const std::string prm) { + auto& outputs = op->Outputs(); + if (outputs.find(prm) == outputs.end()) return false; + return outputs.at(prm).size() > 0; +} + +template +static void BuildBinaryNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto x = GetInputNode(op, "X", ngb_node_map); + auto y = GetInputNode(op, "Y", ngb_node_map); + auto out = std::make_shared(x, y); + SetOutputNode(op, "Out", out, ngb_node_map); +} + +template +static void BuildUnaryNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto input = GetInputNode(op, "X", ngb_node_map); + auto out = std::make_shared(input); + SetOutputNode(op, "Out", out, ngb_node_map); +} + std::map&, std::shared_ptr>>)>> - NgraphBridge::NG_NODE_MAP = {}; + NgraphBridge::NG_NODE_MAP = {{"relu", BuildUnaryNode}, + {"tanh", BuildUnaryNode}}; -void NgraphBridge::build_graph(const std::shared_ptr& op) { +void NgraphBridge::BuildNgNode(const std::shared_ptr& op) { auto& op_type = op->Type(); - NG_NODE_MAP[op_type](op, ngb_node_map); + NG_NODE_MAP[op_type](op, ngb_node_map_); } } // namespace framework diff --git a/paddle/fluid/framework/ngraph_bridge.h b/paddle/fluid/framework/ngraph_bridge.h index 55bf0d21f3471013b1fb780e852d813313345f03..9ed6b9510942136a61faa5755fd8fa74286939a8 100644 --- a/paddle/fluid/framework/ngraph_bridge.h +++ b/paddle/fluid/framework/ngraph_bridge.h @@ -20,16 +20,14 @@ limitations under the License. */ #include #include #include -#include -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/enforce.h" - -#include "ngraph/ngraph.hpp" +#include "ngraph/node.hpp" namespace paddle { namespace framework { +class OperatorBase; + class NgraphBridge { public: static std::map< @@ -43,14 +41,14 @@ class NgraphBridge { std::shared_ptr< std::unordered_map>> var_node_map) - : ngb_node_map(var_node_map) {} + : ngb_node_map_(var_node_map) {} - void build_graph(const std::shared_ptr& op); + void BuildNgNode(const std::shared_ptr& op); private: std::shared_ptr< std::unordered_map>> - ngb_node_map; + ngb_node_map_; }; } // namespace framework diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc index d967b2780c21713a2f9a73a3402964103f44269e..3fea753f0659019395c9b214e52a7912058c501c 100644 --- a/paddle/fluid/framework/ngraph_operator.cc +++ b/paddle/fluid/framework/ngraph_operator.cc @@ -19,14 +19,29 @@ limitations under the License. */ #include #include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/ngraph_bridge.h" #include "paddle/fluid/framework/ngraph_operator.h" -#include "paddle/fluid/framework/shape_inference.h" +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/framework/var_type.h" +#include "ngraph/ngraph.hpp" + namespace paddle { namespace framework { +static ngraph::Shape Ddim2Shape(const DDim& dims) { + ngraph::Shape sp; + for (int i = 0; i < dims.size(); ++i) { + int k = dims[i]; + k = k == 0 ? 1 : k; + sp.push_back(k); + } + return sp; +} + static std::map pd2ng_type_map = { {proto::VarType::FP32, ngraph::element::f32}, {proto::VarType::FP64, ngraph::element::f64}, @@ -42,6 +57,7 @@ typedef enum { /* nGraph support state on ops */ PARTIAL_TEST /* Support partial list of ops for test */ } op_state; +// perform graph build through bridge and execute computation class NgraphOperator { public: explicit NgraphOperator(const Scope& scope, const platform::Place& place, @@ -59,13 +75,23 @@ class NgraphOperator { persistables_(persist), fetches_(fetches), post_op_inputs_(post_op_inputs), - ng_op_state_(ng_op_state) {} + ng_op_state_(ng_op_state) { + var_in_node_map_ = std::make_shared< + std::unordered_map>>(); + + var_node_map_ = std::make_shared< + std::unordered_map>>(); + + BuildNgIO(); + + GetNgFunction(); + } void Run(const Scope& scope, const platform::Place& place) const; private: static std::unordered_map> - func_cache; + func_cache_; const Scope& scope_; const platform::Place& place_; std::vector> fused_ops_; @@ -74,6 +100,35 @@ class NgraphOperator { std::unordered_set fetches_; std::unordered_set post_op_inputs_; op_state ng_op_state_; + + // ngraph backend eg. CPU + static std::shared_ptr backend_; + // ngraph function to call and execute + std::shared_ptr ngraph_function_; + // var_name of inputs + std::vector var_in_; + // var_name of outputs from fetch in order + std::vector var_out_; + // map input vars to nodes + std::shared_ptr< + std::unordered_map>> + var_in_node_map_; + // map each var name with a ngraph node + std::shared_ptr< + std::unordered_map>> + var_node_map_; + // cache key to check if function is cached + std::shared_ptr GetCacheKey(); + // get ngraph input and define ngraph input parameters + void GetNgInputShape(std::shared_ptr op); + // Call ngraph bridge to map ops + void BuildNgNodes(); + // get the ngraph input and output var list + void BuildNgIO(); + // build ngraph function call + void BuildNgFunction(); + // Check cache for ngraph function or otherwise build the function + void GetNgFunction(); }; std::vector>::iterator>> @@ -86,7 +141,7 @@ FusedOperator::FusedOpIntervals( } size_t size = ops->size(); size_t left = 0; - while (left < size && ops.at(left)->Type() != kFeedOpType) { + while (left < size && ops->at(left)->Type() != kFeedOpType) { ++left; } if (left == size) { @@ -116,7 +171,7 @@ FusedOperator::FusedOpIntervals( size_t start = pivot, end = start; while (pivot < right && (paddle::framework::NgraphBridge::NG_NODE_MAP.find( - ops.at(pivot)->Type()) != + ops->at(pivot)->Type()) != paddle::framework::NgraphBridge::NG_NODE_MAP.end())) { ++pivot; ++end; @@ -136,7 +191,9 @@ FusedOperator::FusedOperator( std::vector>::iterator end, const std::string& type, const VariableNameMap& inputs, const VariableNameMap& outputs, const AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs), pdesc(prog), block(block_id) { + : OperatorBase(type, inputs, outputs, attrs), + pdesc_(prog), + block_(block_id) { for (std::vector>::iterator it = start; it != end; ++it) { fused_ops_.push_back(std::move(*it)); @@ -152,7 +209,7 @@ FusedOperator::FusedOperator( } if ((*(start - 1))->Type() == kFeedOpType && (*end)->Type() == kFetchOpType) { - is_complete = true; + is_full_ = true; } Process(); @@ -205,7 +262,7 @@ void FusedOperator::RunImpl(const Scope& scope, } } - if (is_full) { + if (is_full_) { ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN; } @@ -215,6 +272,280 @@ void FusedOperator::RunImpl(const Scope& scope, ngraph_op.Run(scope, place); } +std::unordered_map> + NgraphOperator::func_cache_ = {}; + +std::shared_ptr NgraphOperator::backend_ = + ngraph::runtime::Backend::create("CPU"); + +void NgraphOperator::GetNgInputShape(std::shared_ptr op) { + op->RuntimeInferShape(scope_, place_); + for (auto& var_name_item : op->Inputs()) { + for (auto& var_name : var_name_item.second) { + auto* var = scope_.FindVar(var_name); + if (var && var->IsType()) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + auto sp = Ddim2Shape(tensor_pd->dims()); + if (std::find(var_in_.begin(), var_in_.end(), var_name) != + var_in_.end()) { + if (var_node_map_->find(var_name) == var_node_map_->end()) { + auto ng_type = var_type_map_.at(var_name); + auto prm = + std::make_shared(ng_type, sp, true); + (*var_node_map_)[var_name] = prm; + (*var_in_node_map_)[var_name] = prm; + } + } + } + } + } +} + +void NgraphOperator::BuildNgNodes() { + for (auto& var_name : var_out_) { + if (var_node_map_->find(var_name) == var_node_map_->end()) { + auto* var = scope_.FindVar(var_name); + if (var && var->IsType()) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + auto& ddim = tensor_pd->dims(); + auto ng_shape = Ddim2Shape(ddim); + auto ng_type = var_type_map_.at(var_name); + auto prm = + std::make_shared(ng_type, ng_shape, true); + (*var_node_map_)[var_name] = prm; + } + } + } + + paddle::framework::NgraphBridge ngb(var_node_map_); + for (auto& op : fused_ops_) { + ngb.BuildNgNode(op); + } +} + +void NgraphOperator::BuildNgIO() { + std::unordered_set inputs; + std::unordered_set outputs; + + for (auto& op : fused_ops_) { + for (auto& var_name_item : op->Inputs()) { + for (auto& var_name : var_name_item.second) { + inputs.insert(var_name); + const bool is_output = outputs.find(var_name) != outputs.end(); + if (!is_output && + std::find(var_in_.begin(), var_in_.end(), var_name) == + var_in_.end()) { + // fill var_in here to keep lhs and rhs order + var_in_.push_back(var_name); + } + } + } + + if (op->Type() != "fill_constant") { + GetNgInputShape(op); + } + + for (auto& var_name_item : op->Outputs()) { + PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, + "op %s has more than 1 output - Not handling yet", + op->Type()); + for (auto& var_name : var_name_item.second) { + outputs.insert(var_name); + } + } + } + + // var_out.clear(); + for (auto& op : fused_ops_) { + for (auto& var_name_item : op->Outputs()) { + PADDLE_ENFORCE_LE(var_name_item.second.size(), 1, + "op %s has more than 1 output - Not handling yet", + op->Type()); + for (auto& var_name : var_name_item.second) { + switch (ng_op_state_) { + case PARTIAL_TEST: + if (post_op_inputs_.find(var_name) != post_op_inputs_.end() || + fetches_.find(var_name) != fetches_.end()) { + var_out_.push_back(var_name); + } + break; + case FULL_TEST: + if (fetches_.find(var_name) != fetches_.end()) { + var_out_.push_back(var_name); + } + break; + case PARTIAL_TRAIN: + if (fetches_.find(var_name) != fetches_.end() || + post_op_inputs_.find(var_name) != post_op_inputs_.end() || + persistables_.find(var_name) != persistables_.end()) { + var_out_.push_back(var_name); + } + break; + case FULL_TRAIN: + if (fetches_.find(var_name) != fetches_.end() || + persistables_.find(var_name) != persistables_.end()) { + var_out_.push_back(var_name); + } + break; + default: + var_out_.push_back(var_name); + } + } + } + } +} + +void NgraphOperator::BuildNgFunction() { + BuildNgNodes(); + ngraph_function_ = nullptr; + ngraph::NodeVector func_outputs; + ngraph::op::ParameterVector func_inputs; + + for (auto& vo : var_out_) { + func_outputs.push_back(var_node_map_->at(vo)); + } + + for (auto& vi : var_in_) { + std::shared_ptr prm = + std::dynamic_pointer_cast( + var_in_node_map_->at(vi)); + func_inputs.push_back(prm); + } + + ngraph_function_ = + std::make_shared(func_outputs, func_inputs); +} + +std::shared_ptr NgraphOperator::GetCacheKey() { + auto cache_key = std::make_shared(""); + *cache_key += std::to_string(fused_ops_.size()); + for (auto& op : fused_ops_) { + *cache_key += op->Type(); + } + for (auto& var_name : var_in_) { + auto shape = var_node_map_->at(var_name)->get_shape(); + *cache_key += var_name; + *cache_key += var_type_map_.at(var_name).c_type_string(); + for (size_t i = 0; i < shape.size(); ++i) { + *cache_key += std::to_string(shape.at(i)); + } + } + + for (auto& var_name : var_out_) { + auto* var = scope_.FindVar(var_name); + if (var && var->IsType()) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + auto& ddim = tensor_pd->dims(); + for (int i = 0; i < ddim.size(); ++i) { + *cache_key += std::to_string(ddim[i]); + } + } + } + return cache_key; +} + +void NgraphOperator::GetNgFunction() { + bool cache_on = true; + if (cache_on) { + std::string cache_key_val = *GetCacheKey(); + if (func_cache_.find(cache_key_val) != func_cache_.end()) { + ngraph_function_ = func_cache_.at(cache_key_val); + } else { + BuildNgFunction(); + func_cache_[cache_key_val] = ngraph_function_; + } + } else { + BuildNgFunction(); + } +} + +void NgraphOperator::Run(const Scope& scope, + const platform::Place& place) const { + std::vector> t_in; + std::vector> t_out; + + for (size_t i = 0; i < var_in_.size(); ++i) { + auto vi = var_in_.at(i); + auto sp = var_node_map_->at(vi)->get_shape(); + std::shared_ptr ti; + auto* var = scope.FindVar(vi); + if (var && var->IsType()) { + auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); + PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()), + "Ensure ngraph tensor layout align with paddle tensor"); + if (tensor_pd->type().hash_code() == + typeid(float).hash_code()) { // NOLINT + const float* arr = tensor_pd->data(); + ti = backend_->create_tensor(ngraph::element::f32, sp, + const_cast(arr)); + } else if (tensor_pd->type().hash_code() == + typeid(int).hash_code()) { // NOLINT + const int* arr = tensor_pd->data(); + ti = backend_->create_tensor(ngraph::element::i32, sp, + const_cast(arr)); + } else if (tensor_pd->type().hash_code() == typeid(int64_t).hash_code()) { + const int64_t* arr = tensor_pd->data(); + ti = backend_->create_tensor(ngraph::element::i64, sp, + const_cast(arr)); + } else if (tensor_pd->type().hash_code() == + typeid(double).hash_code()) { // NOLINT + const double* arr = tensor_pd->data(); + ti = backend_->create_tensor(ngraph::element::f64, sp, + const_cast(arr)); + } else if (tensor_pd->type().hash_code() == + typeid(bool).hash_code()) { // NOLINT + const bool* arr = tensor_pd->data(); + ti = backend_->create_tensor(ngraph::element::boolean, sp, + const_cast(arr)); + } else { + PADDLE_THROW("Data type not handling for var %s", vi); + } + } else { + PADDLE_THROW("Cannot find var or tensor with var name %s", vi); + } + bool is_test = (ng_op_state_ == PARTIAL_TEST || ng_op_state_ == FULL_TEST) + ? true + : false; + bool is_persistable = + (persistables_.find(vi) != persistables_.end()) ? true : false; + if (is_test && is_persistable) { + ti->set_stale(false); + } + t_in.push_back(ti); + } + + for (size_t i = 0; i < var_out_.size(); ++i) { + auto var_name = var_out_[i]; + auto* var = scope.FindVar(var_name); + std::shared_ptr to; + if (var && var->IsType()) { + auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); + auto dd = tensor_pd->dims(); + ngraph::Shape sp = Ddim2Shape(dd); + auto ng_type = var_type_map_.at(var_name); + if (ng_type == ngraph::element::f32) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ngraph::element::f32, sp, pd_arr); + } else if (ng_type == ngraph::element::i64) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ngraph::element::i64, sp, pd_arr); + } else if (ng_type == ngraph::element::f64) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ngraph::element::f64, sp, pd_arr); + } else if (ng_type == ngraph::element::boolean) { + auto pd_arr = tensor_pd->mutable_data(place); + to = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr); + } else { + PADDLE_THROW("Data type not handled in for var %s", var_name); + } + t_out.push_back(to); + } else { + PADDLE_THROW("Cannot find var or tensor with var name %s", var_name); + } + } + + backend_->call(ngraph_function_, t_out, t_in); +} // NgraphOperator::RunImpl } // namespace framework } // namespace paddle #endif diff --git a/paddle/fluid/framework/ngraph_operator.h b/paddle/fluid/framework/ngraph_operator.h index 0f655cef1dde624bcf4944b5c096279097e1c8ae..3ca023e11111c5b447b2cabbfb8bb29877297f65 100644 --- a/paddle/fluid/framework/ngraph_operator.h +++ b/paddle/fluid/framework/ngraph_operator.h @@ -17,24 +17,19 @@ limitations under the License. */ #ifdef PADDLE_WITH_NGRAPH #include -#include #include #include #include #include "paddle/fluid/framework/attribute.h" -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/ngraph_bridge.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_kernel_type.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/variant.h" -#include "ngraph/ngraph.hpp" +#include "ngraph/type/element_type.hpp" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/op_kernel_type.cc b/paddle/fluid/framework/op_kernel_type.cc new file mode 100644 index 0000000000000000000000000000000000000000..6d4801e4a0eed7083e671e1d49b8628dfb280cf9 --- /dev/null +++ b/paddle/fluid/framework/op_kernel_type.cc @@ -0,0 +1,54 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_kernel_type.h" + +namespace paddle { +namespace framework { + +size_t OpKernelType::Hash::operator()(const OpKernelType& key) const { + int cur_loc = 0; + + int place = key.place_.which(); + cur_loc += OpKernelType::kPlaceBits; + + int data_type = static_cast(key.data_type_) << cur_loc; + cur_loc += OpKernelType::kPrimaryDTypeBits; + + int data_layout = static_cast(key.data_layout_) << cur_loc; + cur_loc += OpKernelType::kLayoutBits; + + int library_type = static_cast(key.library_type_) << cur_loc; + cur_loc += OpKernelType::kLibBits; + + int customized_value = key.customized_type_value_; + PADDLE_ENFORCE(customized_value < (1 << OpKernelType::kCustomizeBits)); + customized_value = customized_value << cur_loc; + cur_loc += OpKernelType::kCustomizeBits; + PADDLE_ENFORCE(cur_loc < 64); + + std::hash hasher; + return hasher(place + data_type + data_layout + library_type + + customized_value); +} + +bool OpKernelType::operator==(const OpKernelType& o) const { + return platform::places_are_same_class(place_, o.place_) && + data_type_ == o.data_type_ && data_layout_ == o.data_layout_ && + library_type_ == o.library_type_ && + customized_type_value_ == o.customized_type_value_; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h index ac0330218973123771367ed5ba9477c90143a043..9edc1a3e150027b5a3dbd8483dc8b58d1d4ab918 100644 --- a/paddle/fluid/framework/op_kernel_type.h +++ b/paddle/fluid/framework/op_kernel_type.h @@ -24,54 +24,55 @@ limitations under the License. */ namespace paddle { namespace framework { -struct OpKernelType { - struct Hash { - size_t operator()(const OpKernelType& key) const { - int place = key.place_.which(); - int data_type = static_cast(key.data_type_) << LEFT_SHIFT; - int data_layout = static_cast(key.data_layout_) << (LEFT_SHIFT * 2); - int library_type = static_cast(key.library_type_) - << (LEFT_SHIFT * 3); - - std::hash hasher; - return hasher(place + data_type + data_layout + library_type); - } - }; +class OpKernelType { + public: + constexpr static int kDefaultCustomizedTypeValue = 0; - // place, data_type, library_type kinds less than 2^8 - constexpr static int LEFT_SHIFT = 8; - - proto::VarType::Type data_type_; - DataLayout data_layout_; - platform::Place place_; - LibraryType library_type_; + // In total should be smaller than 64. + constexpr static int kPlaceBits = 4; + constexpr static int kPrimaryDTypeBits = 8; + constexpr static int kLayoutBits = 4; + constexpr static int kLibBits = 4; + constexpr static int kCustomizeBits = 4; OpKernelType(proto::VarType::Type data_type, platform::Place place, DataLayout data_layout = DataLayout::kAnyLayout, - LibraryType library_type = LibraryType::kPlain) + LibraryType library_type = LibraryType::kPlain, + int customized_type_value = kDefaultCustomizedTypeValue) : data_type_(data_type), data_layout_(data_layout), place_(place), - library_type_(library_type) {} + library_type_(library_type), + customized_type_value_(customized_type_value) {} OpKernelType(proto::VarType::Type data_type, const platform::DeviceContext& dev_ctx, DataLayout data_layout = DataLayout::kAnyLayout, - LibraryType library_type = LibraryType::kPlain) + LibraryType library_type = LibraryType::kPlain, + int customized_type_value = kDefaultCustomizedTypeValue) : data_type_(data_type), data_layout_(data_layout), place_(dev_ctx.GetPlace()), - library_type_(library_type) {} + library_type_(library_type), + customized_type_value_(customized_type_value) {} + + virtual ~OpKernelType() {} + + struct Hash { + size_t operator()(const OpKernelType& key) const; + }; size_t hash_key() const { return Hash()(*this); } - bool operator==(const OpKernelType& o) const { - return platform::places_are_same_class(place_, o.place_) && - data_type_ == o.data_type_ && data_layout_ == o.data_layout_ && - library_type_ == o.library_type_; - } + bool operator==(const OpKernelType& o) const; bool operator!=(const OpKernelType& o) const { return !(*this == o); } + + proto::VarType::Type data_type_; + DataLayout data_layout_; + platform::Place place_; + LibraryType library_type_; + int customized_type_value_; }; inline std::ostream& operator<<(std::ostream& os, diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 0e6e74293c30d5f8caa58fe6bfa63657d2669b46..36673e48c2047bca54f604b082dfec123f1e2c82 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -35,6 +35,7 @@ limitations under the License. */ namespace paddle { namespace framework { + class Registrar { public: // In our design, various kinds of classes, e.g., operators and kernels, @@ -78,7 +79,7 @@ struct OpKernelRegistrarFunctor; template inline void RegisterKernelClass(const char* op_type, const char* library_type, - Func func) { + int customized_type_value, Func func) { std::string library(library_type); std::string data_layout = "ANYLAYOUT"; if (library == "MKLDNN") { @@ -86,7 +87,7 @@ inline void RegisterKernelClass(const char* op_type, const char* library_type, } OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(), StringToDataLayout(data_layout), - StringToLibraryType(library_type)); + StringToLibraryType(library_type), customized_type_value); OperatorWithKernel::AllOpKernels()[op_type][key] = func; } @@ -95,22 +96,26 @@ struct OpKernelRegistrarFunctor { using KERNEL_TYPE = typename std::tuple_element>::type; - void operator()(const char* op_type, const char* library_type) const { + void operator()(const char* op_type, const char* library_type, + int customized_type_value) const { using T = typename KERNEL_TYPE::ELEMENT_TYPE; RegisterKernelClass( - op_type, library_type, [](const framework::ExecutionContext& ctx) { + op_type, library_type, customized_type_value, + + [](const framework::ExecutionContext& ctx) { KERNEL_TYPE().Compute(ctx); }); constexpr auto size = std::tuple_size>::value; OpKernelRegistrarFunctor func; - func(op_type, library_type); + func(op_type, library_type, customized_type_value); } }; template struct OpKernelRegistrarFunctor { - void operator()(const char* op_type, const char* library_type) const {} + void operator()(const char* op_type, const char* library_type, + int customized_type_value) const {} }; // User can register many kernel in one place. The data type could be @@ -118,9 +123,10 @@ struct OpKernelRegistrarFunctor { template class OpKernelRegistrar : public Registrar { public: - explicit OpKernelRegistrar(const char* op_type, const char* library_type) { + explicit OpKernelRegistrar(const char* op_type, const char* library_type, + int customized_type_value) { OpKernelRegistrarFunctor func; - func(op_type, library_type); + func(op_type, library_type, customized_type_value); } }; @@ -130,17 +136,19 @@ struct OpKernelRegistrarFunctorEx; template class OpKernelRegistrarEx : public Registrar { public: - explicit OpKernelRegistrarEx(const char* op_type, const char* library_type) { + explicit OpKernelRegistrarEx(const char* op_type, const char* library_type, + int customized_type_value) { OpKernelRegistrarFunctorEx func; - func(op_type, library_type); + func(op_type, library_type, customized_type_value); } }; template struct OpKernelRegistrarFunctorEx { - void operator()(const char* op_type, const char* library_type) const {} + void operator()(const char* op_type, const char* library_type, + int customized_type_value) const {} }; template @@ -153,18 +161,21 @@ struct OpKernelRegistrarFunctorEx>::type; - void operator()(const char* op_type, const char* library_type) const { - RegisterKernelClass(op_type, library_type, Functor()); + void operator()(const char* op_type, const char* library_type, + int customized_type_value) const { + RegisterKernelClass(op_type, library_type, + customized_type_value, Functor()); constexpr auto size = std::tuple_size>::value; OpKernelRegistrarFunctorEx= size, I + 2, DataTypeAndKernelType...> func; - func(op_type, library_type); + func(op_type, library_type, customized_type_value); } }; +// clang-format off /** * check if MACRO is used in GLOBAL NAMESPACE. */ @@ -199,42 +210,64 @@ struct OpKernelRegistrarFunctorEx \ - __op_kernel_registrar_##op_type##_##library_type##__(#op_type, \ - #library_type); \ - int TouchOpKernelRegistrar_##op_type##_##library_type() { \ - __op_kernel_registrar_##op_type##_##library_type##__.Touch(); \ - return 0; \ +#define REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(op_type, library_type, \ + place_class, customized_name, \ + customized_type_value, ...) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_op_kernel_##op_type##_##library_type##_##customized_name##__, \ + "REGISTER_OP_KERNEL must be called in " \ + "global namespace"); \ + static ::paddle::framework::OpKernelRegistrar \ + __op_kernel_registrar_##op_type##_##library_type##_##customized_name##__(\ + #op_type, #library_type, customized_type_value); \ + int TouchOpKernelRegistrar_##op_type##_##library_type##_##customized_name() {\ + __op_kernel_registrar_##op_type##_##library_type##_##customized_name##__ \ + .Touch(); \ + return 0; \ } +#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...) \ + REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( \ + op_type, library_type, place_class, DEFAULT_TYPE, \ + ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \ + __VA_ARGS__) + #define REGISTER_OP_CUDA_KERNEL(op_type, ...) \ REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__) #define REGISTER_OP_CPU_KERNEL(op_type, ...) \ REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__) -#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, ...) \ - STATIC_ASSERT_GLOBAL_NAMESPACE( \ - __reg_op_kernel_##op_type##_##library_type##__, \ - "REGISTER_OP_KERNEL_EX must be called in global namespace"); \ - static ::paddle::framework::OpKernelRegistrarEx \ - __op_kernel_registrar_##op_type##_##library_type##__(#op_type, \ - #library_type); \ - int TouchOpKernelRegistrar_##op_type##_##library_type() { \ - __op_kernel_registrar_##op_type##_##library_type##__.Touch(); \ - return 0; \ +#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, \ + customized_name, \ + customized_type_value, \ + ...) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_op_kernel_##op_type##_##library_type##_##customized_name##__, \ + "REGISTER_OP_KERNEL_EX must be called in " \ + "global namespace"); \ + static ::paddle::framework::OpKernelRegistrarEx \ + __op_kernel_registrar_##op_type##_##library_type##_##customized_name##__(\ + #op_type, #library_type, customized_type_value); \ + int TouchOpKernelRegistrar_##op_type##_##library_type##_##customized_name() {\ + __op_kernel_registrar_##op_type##_##library_type##_##customized_name##__ \ + .Touch(); \ + return 0; \ } #define REGISTER_OP_CUDA_KERNEL_FUNCTOR(op_type, ...) \ - REGISTER_OP_KERNEL_EX(op_type, CUDA, ::paddle::platform::CUDAPlace, \ - __VA_ARGS__) + REGISTER_OP_KERNEL_EX( \ + op_type, CUDA, ::paddle::platform::CUDAPlace, DEFAULT_TYPE, \ + ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \ + __VA_ARGS__) -#define REGISTER_OP_CPU_KERNEL_FUNCTOR(op_type, ...) \ - REGISTER_OP_KERNEL_EX(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__) +#define REGISTER_OP_CPU_KERNEL_FUNCTOR(op_type, ...) \ + REGISTER_OP_KERNEL_EX( \ + op_type, CPU, ::paddle::platform::CPUPlace, DEFAULT_TYPE, \ + ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \ + __VA_ARGS__) /** * Macro to mark what Operator and Kernel @@ -248,13 +281,19 @@ struct OpKernelRegistrarFunctorExInferShape(&infer_shape_ctx); +} + void OperatorWithKernel::RunImpl(const Scope& scope, const platform::Place& place) const { RuntimeInferShapeContext infer_shape_ctx(*this, scope); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 5bd68f9ac2e1b30bc6ce3094960bb89842b99e01..0a6a28a5bce01d71cf56f25f5556033db94452c2 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -128,6 +128,8 @@ class OperatorBase { virtual std::vector OutputVars(bool has_intermediate) const; void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; } + virtual void RuntimeInferShape(const Scope& scope, + const platform::Place& place) const {} protected: std::string type_; @@ -348,6 +350,9 @@ class OperatorWithKernel : public OperatorBase { OpInfoMap::Instance().Get(Type()).infer_shape_(ctx); } + void RuntimeInferShape(const Scope& scope, + const platform::Place& place) const override; + protected: virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const; virtual OpKernelType GetKernelTypeForVar( diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc index ac9dd8245ad4e0e8842f219b23d3866b03fdaedb..ab14732e4d6eab9dd15364da02b436c10ed68a19 100644 --- a/paddle/fluid/framework/operator_test.cc +++ b/paddle/fluid/framework/operator_test.cc @@ -50,6 +50,8 @@ class OpWithoutKernelCheckerMaker : public OpProtoAndCheckerMaker { AddInput("input", "input of test op"); AddOutput("output", "output of test op"); AddAttr("scale", "scale of cosine op"); + AddAttr("kernel_sub_type", "kernels with different implementations.") + .SetDefault(0); AddComment("This is test op"); } }; @@ -95,6 +97,8 @@ TEST(OperatorBase, all) { namespace paddle { namespace framework { +static int special_type_value = 1; + class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { public: void Make() { @@ -103,11 +107,14 @@ class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { AddAttr("scale", "scale of cosine op") .SetDefault(1.0) .GreaterThan(0.0); + AddAttr("kernel_sub_type", "kernels with different implementations.") + .SetDefault(0); AddComment("This is test op"); } }; static int cpu_kernel_run_num = 0; +static int cpu_kernel2_run_num = 0; class OpWithKernelTest : public OperatorWithKernel { public: @@ -117,7 +124,10 @@ class OpWithKernelTest : public OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override {} OpKernelType GetExpectedKernelType( const ExecutionContext& ctx) const override { - return OpKernelType(proto::VarType::FP32, ctx.GetPlace()); + int sub_type = ctx.Attr("kernel_sub_type"); + return OpKernelType(proto::VarType::FP32, ctx.GetPlace(), + framework::DataLayout::kAnyLayout, + framework::LibraryType::kPlain, sub_type); } }; @@ -132,6 +142,17 @@ class CPUKernelTest : public OpKernel { } }; +template +class CPUKernel2Test : public OpKernel { + public: + void Compute(const ExecutionContext& ctx) const { + std::cout << ctx.op().DebugString() << std::endl; + cpu_kernel2_run_num++; + ASSERT_EQ(ctx.op().Input("x"), "IN1"); + ASSERT_EQ(ctx.op().Output("y"), "OUT1"); + } +}; + class OpKernelTestMultiInputsProtoAndCheckerMaker : public OpProtoAndCheckerMaker { public: @@ -142,6 +163,8 @@ class OpKernelTestMultiInputsProtoAndCheckerMaker AddAttr("scale", "scale of cosine op") .SetDefault(1.0) .GreaterThan(0.0); + AddAttr("kernel_sub_type", "kernels with different implementations.") + .SetDefault(0); AddComment("This is test op"); } }; @@ -189,9 +212,15 @@ class CPUKernalMultiInputsTest : public OpKernel { REGISTER_OP_WITHOUT_GRADIENT( op_with_kernel, paddle::framework::OpWithKernelTest, paddle::framework::OpKernelTestProtoAndCheckerMaker); + REGISTER_OP_CPU_KERNEL(op_with_kernel, paddle::framework::CPUKernelTest); +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( + op_with_kernel, CPU, paddle::platform::CPUPlace, MY_SPECIAL_NAME, + paddle::framework::special_type_value, + paddle::framework::CPUKernel2Test); + // test with single input TEST(OpKernel, all) { paddle::framework::InitDevices(true); @@ -211,7 +240,19 @@ TEST(OpKernel, all) { auto op = paddle::framework::OpRegistry::CreateOp(op_desc); ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0); op->Run(scope, cpu_place); + // kerne_sub_type = 0, hence cpu_kernel is called, cpu_kernel2 is not called. + ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1); + ASSERT_EQ(paddle::framework::cpu_kernel2_run_num, 0); + + attr = op_desc.mutable_attrs()->Add(); + attr->set_name("kernel_sub_type"); + attr->set_type(paddle::framework::proto::AttrType::INT); + attr->set_i(1); + auto op2 = paddle::framework::OpRegistry::CreateOp(op_desc); + op2->Run(scope, cpu_place); + // kerne_sub_type = 1, hence cpu_kernel2 is called, cpu_kernel is not called. ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1); + ASSERT_EQ(paddle::framework::cpu_kernel2_run_num, 1); } REGISTER_OP_WITHOUT_GRADIENT( diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index 44384082dbaf7a8d654e8461da87009bde33a3d5..e1bdba9b46a4cbdb664b70c7419f567ef95bdf31 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -32,8 +32,7 @@ namespace framework { class SelectedRows { /* * @brief We can use the SelectedRows structure to reproduce a sparse table. - * A sparse table is a key-value structure that the key is an `int64_t` - * number, + * A sparse table is a key-value structure that the key is an `int64_t`, * and the value is a Tensor which the first dimension is 0. * You can use the following interface to operate the sparse table, and you * can find diff --git a/paddle/fluid/framework/variable_helper.cc b/paddle/fluid/framework/variable_helper.cc new file mode 100644 index 0000000000000000000000000000000000000000..fc4525549caeebb06dea766ccb123b5ebc6d5b13 --- /dev/null +++ b/paddle/fluid/framework/variable_helper.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/variable_helper.h" + +#include + +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { +void InitializeVariable(Variable* var, proto::VarType::Type var_type) { + if (var_type == proto::VarType::LOD_TENSOR) { + var->GetMutable(); + } else if (var_type == proto::VarType::SELECTED_ROWS) { + var->GetMutable(); + } else if (var_type == proto::VarType::FEED_MINIBATCH) { + var->GetMutable(); + } else if (var_type == proto::VarType::FETCH_LIST) { + var->GetMutable(); + } else if (var_type == proto::VarType::STEP_SCOPES) { + var->GetMutable>(); + } else if (var_type == proto::VarType::LOD_RANK_TABLE) { + var->GetMutable(); + } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { + var->GetMutable(); + } else if (var_type == proto::VarType::PLACE_LIST) { + var->GetMutable(); + } else if (var_type == proto::VarType::READER) { + var->GetMutable(); + } else if (var_type == proto::VarType::RAW) { + // GetMutable will be called in operator + } else { + PADDLE_THROW( + "Variable type %d is not in " + "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, " + "LOD_RANK_TABLE, PLACE_LIST, READER, RAW]", + var_type); + } +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/variable_helper.h b/paddle/fluid/framework/variable_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..0e0c72c3621dce0a6b372f9a9110a63fbc0a1d71 --- /dev/null +++ b/paddle/fluid/framework/variable_helper.h @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/variable.h" +namespace paddle { +namespace framework { +void InitializeVariable(Variable *var, proto::VarType::Type var_type); +} +} diff --git a/paddle/fluid/inference/analysis/analysis_pass.h b/paddle/fluid/inference/analysis/analysis_pass.h index 299f235a74ae0ffb663be61079607d8ac1105a97..d5a972fab3beae4d4e2e512d1ccda3f0b8356682 100644 --- a/paddle/fluid/inference/analysis/analysis_pass.h +++ b/paddle/fluid/inference/analysis/analysis_pass.h @@ -46,8 +46,6 @@ class AnalysisPass { protected: // User should implement these. virtual void RunImpl(Argument* argument) = 0; - - Argument* argument_{nullptr}; }; } // namespace analysis diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index c6b7c05f784b7c44fe30dd69529fe48405538ab6..4ffe5f575c232ccfc0089cb86e28737e56b32f94 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -178,11 +178,12 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, output_mapping.push_back(output_name_map[name]); } - *block_desc.Proto()->mutable_vars() = - const_cast(&graph->program()) - ->Proto() - ->blocks(0) - .vars(); + auto *vars = block_desc.Proto()->mutable_vars(); + for (framework::ir::Node *node : graph->Nodes()) { + if (node->IsVar() && node->Var()) { + *vars->Add() = *node->Var()->Proto(); + } + } PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(), "the block has no var-desc"); PADDLE_ENFORCE(!output_mapping.empty()); diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt index a30c27b1183a75de8c0bb50ef3617d747b239fae..d3ea511d8f4d8cbec1be57633391f00e29a3e6e9 100644 --- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt @@ -1,6 +1,7 @@ cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager) cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager) -cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass) +cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager) +cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass ir_params_sync_among_devices_pass) set(analysis_deps ${analysis_deps} ir_graph_build_pass diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc index 108cb6f74b1208395a4faabdf6184152c300d244..c3a2b3ca1d3b09e71921fde0b0bad8d195aaa38f 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc @@ -61,6 +61,7 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) { void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) { std::vector passes({ "ir_graph_build_pass", "ir_analysis_pass", + "ir_params_sync_among_devices_pass", }); for (const auto &pass : passes) { VLOG(2) << "Run pass " << pass; diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc index d5e0d90de1da8e54e2411c266f7a8c09c33b0336..740030c3a80e4d7e2ac47998a304be97758b95cb 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc @@ -36,12 +36,7 @@ void IrGraphBuildPass::RunImpl(Argument *argument) { // so that the parameters will on the same device, or they will keep copying // between difference devices. platform::Place place; - if (argument->use_gpu()) { - PADDLE_ENFORCE(argument->gpu_device_id_valid()); - place = platform::CUDAPlace(argument->gpu_device_id()); - } else { - place = platform::CPUPlace(); - } + place = platform::CPUPlace(); if (argument->model_dir_valid()) { auto program = diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..8be2d3ac0b105e50fe619a720929dedaacb75537 --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h" +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { + PADDLE_ENFORCE(argument->scope_valid()); + PADDLE_ENFORCE(argument->use_gpu_valid()); + + platform::Place place; + + // The parameters are on the cpu, therefore, synchronization is not necessary. + if (!argument->use_gpu()) return; + + LOG(INFO) << "Sync params from CPU to GPU"; + + PADDLE_ENFORCE(argument->gpu_device_id_valid()); + place = platform::CUDAPlace(argument->gpu_device_id()); + + auto *scope = argument->scope_ptr(); + std::vector all_vars = scope->LocalVarNames(); + + // We get all the vars from local_scope instead of the ProgramDesc. + // Because there exists the case that new parameter variables are not added to + // the program in the analysis pass. + for (auto &var_name : all_vars) { + auto *var = scope->FindLocalVar(var_name); + PADDLE_ENFORCE(var != nullptr); + if (var->IsType() || + var->IsType()) { + auto *t = var->GetMutable(); + + platform::CPUPlace cpu_place; + framework::LoDTensor temp_tensor; + temp_tensor.Resize(t->dims()); + temp_tensor.mutable_data(cpu_place); + + // Copy the parameter data to a tmp tensor. + TensorCopySync(*t, cpu_place, &temp_tensor); + // Reallocation the space on GPU + t->mutable_data(place); + + // Copy parameter data to newly allocated GPU space. + TensorCopySync(temp_tensor, place, t); + } + } +} + +std::string IrParamsSyncAmongDevicesPass::repr() const { + return "ir-params-sync-among-devices-pass"; +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..a95f460df6f9636fc17a5cf76920f5f459385120 --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/analysis/analysis_pass.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace inference { +namespace analysis { + +/* + * Sync parameter from CPU to GPU. + */ +class IrParamsSyncAmongDevicesPass : public AnalysisPass { + public: + void RunImpl(Argument *argument) override; + std::string repr() const override; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/passes.cc b/paddle/fluid/inference/analysis/passes/passes.cc index 2ef515f45f2483df8d1238b4758d6729d0299ce9..9245e32cee28473c21e2acbc1c64165d8b475d3b 100644 --- a/paddle/fluid/inference/analysis/passes/passes.cc +++ b/paddle/fluid/inference/analysis/passes/passes.cc @@ -16,6 +16,7 @@ #include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc" #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h" #include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h" +#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h" namespace paddle { namespace inference { @@ -27,6 +28,9 @@ PassRegistry::PassRegistry() { std::unique_ptr(new IrGraphBuildPass)); passes_.emplace("ir_analysis_compose_pass", std::unique_ptr(new IrAnalysisComposePass)); + passes_.emplace( + "ir_params_sync_among_devices_pass", + std::unique_ptr(new IrParamsSyncAmongDevicesPass)); } } // namespace analysis diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 1862f61f0f4b94c9fa9636e876e943113d9aebd4..391330a7c0f2dda731fe8455fdab81b276e3f272 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -190,9 +190,13 @@ bool AnalysisPredictor::Run(const std::vector &inputs, } VLOG(3) << "predict cost: " << timer.toc() << "ms"; - // Fix TensorArray reuse not cleaned bug. - tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get()); - tensor_array_batch_cleaner_.ResetTensorArray(); + // All the containers in the scope will be hold in inference, but the + // operators assume that the container will be reset after each batch. + // Here is a bugfix, collect all the container variables, and reset then to a + // bool; the next time, the operator will call MutableData and construct a new + // container again, so that the container will be empty for each batch. + tensor_array_batch_cleaner_.CollectNoTensorVars(sub_scope_); + tensor_array_batch_cleaner_.ResetNoTensorVars(); return true; } @@ -417,7 +421,7 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( bool AnalysisPredictor::ZeroCopyRun() { executor_->Run(); // Fix TensorArray reuse not cleaned bug. - tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get()); + tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_); tensor_array_batch_cleaner_.ResetTensorArray(); return true; } diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 74369e886692fef3172d24c637b03a5bcf81a6c2..4c5b412a2c1717b8edbb17c238caaa11aeccebd3 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -154,9 +154,9 @@ bool NativePaddlePredictor::Run(const std::vector &inputs, } VLOG(3) << "predict cost: " << timer.toc() << "ms"; - // Fix TensorArray reuse not cleaned bug. - tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get()); - tensor_array_batch_cleaner_.ResetTensorArray(); + // For some other vector like containers not cleaned after each batch. + tensor_array_batch_cleaner_.CollectNoTensorVars(scope_.get()); + tensor_array_batch_cleaner_.ResetNoTensorVars(); return true; } diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index 8fb464c0f5443f116815b14324f6cbc966dc6482..ec93729cd2b379dc2ac39b51df6799b74c8529b6 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -79,6 +79,16 @@ link_directories("${PADDLE_LIB}/third_party/install/gflags/lib") link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib") link_directories("${PADDLE_LIB}/paddle/lib") +if (NOT WIN32) + set(NGRAPH_PATH "${PADDLE_LIB}/third_party/install/ngraph") + if(EXISTS ${NGRAPH_PATH}) + include(GNUInstallDirs) + include_directories("${NGRAPH_PATH}/include") + link_directories("${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}") + set(NGRAPH_LIB ${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}/libngraph${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif() +endif() + add_executable(${DEMO_NAME} ${DEMO_NAME}.cc) if(WITH_MKL) @@ -106,7 +116,7 @@ endif() if (NOT WIN32) set(EXTERNAL_LIB "-lrt -ldl -lpthread") set(DEPS ${DEPS} - ${MATH_LIB} ${MKLDNN_LIB} + ${MATH_LIB} ${MKLDNN_LIB} ${NGRAPH_LIB} glog gflags protobuf snappystream snappy z xxhash ${EXTERNAL_LIB}) else() diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.cc b/paddle/fluid/inference/api/details/reset_tensor_array.cc index 4ae6c6dc9f44650c1c62f5be5448864d817513b1..569a487328e2f1febe2ca5014b232dbd51d28079 100644 --- a/paddle/fluid/inference/api/details/reset_tensor_array.cc +++ b/paddle/fluid/inference/api/details/reset_tensor_array.cc @@ -46,5 +46,28 @@ void TensorArrayBatchCleaner::ResetTensorArray() { } } +void TensorArrayBatchCleaner::CollectNoTensorVars(framework::Scope *scope) { + if (no_tensor_flag_) { + for (auto &var_name : scope->LocalVarNames()) { + auto *var = scope->FindVar(var_name); + if (!var->IsInitialized()) continue; + if (!valid_types_.count(var->Type())) { + no_tensor_vars_.insert(var); + } + } + + for (auto *kid : scope->kids()) { + CollectTensorArrays(kid); + } + no_tensor_flag_ = false; // Only collect one time. + } +} + +void TensorArrayBatchCleaner::ResetNoTensorVars() { + for (auto *var : no_tensor_vars_) { + var->Clear(); + } +} + } // namespace details } // namespace paddle diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.h b/paddle/fluid/inference/api/details/reset_tensor_array.h index a39449ff0e67786815dfb8d2d30d79dcdba757d7..6a5ea64de66fcac44117d0d8f7798e8875703ec6 100644 --- a/paddle/fluid/inference/api/details/reset_tensor_array.h +++ b/paddle/fluid/inference/api/details/reset_tensor_array.h @@ -14,9 +14,11 @@ #pragma once +#include #include #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/variable.h" namespace paddle { namespace details { @@ -24,13 +26,28 @@ namespace details { // Clean the TensorArray each batch to make the behavior the same with the // training phase. struct TensorArrayBatchCleaner { + TensorArrayBatchCleaner() { + valid_types_.insert(typeid(framework::Tensor)); + valid_types_.insert(typeid(framework::LoDTensor)); + } + // Collect the variables that are not Tensor or LoDTensor, and reset them to a + // bool(trick), because some of them are containers, and some operators just + // keep inserting new items without clearing the containers first; So the + // memory grow larger and larger in inference service deployed online. + void CollectNoTensorVars(framework::Scope *scope); + void ResetNoTensorVars(); + // Fix the tensor array not clear in the inference scenarios. void CollectTensorArrays(framework::Scope *scope); void ResetTensorArray(); private: bool flag_{true}; + bool no_tensor_flag_{true}; std::vector arrays_; + + std::unordered_set valid_types_; + std::unordered_set no_tensor_vars_; }; } // namespace details diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 12e3a6f42e14010feedbbb5d8f8a98f60cea4556..825bee833bf918067497f56adebbbcaf55f892a2 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -116,12 +116,8 @@ class CpuPassStrategy : public PassStrategy { class GpuPassStrategy : public PassStrategy { public: GpuPassStrategy() : PassStrategy({}) { - // TODO(NHZlX) Problem with Data synchronization between GPU and CPU - // When running in GPU mode, the parameters are all on GPU. But the - // opearations of "conv_bn_fuse_pass" are on CPU. passes_.assign({ - "infer_clean_graph_pass", - // "infer_clean_graph_pass", "conv_bn_fuse_pass", + "infer_clean_graph_pass", "conv_bn_fuse_pass", }); } diff --git a/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc index 453f222f1f1e3f3b9ee8fa7bd49f4cab2286e7ea..b086c910d38a243d98315f2d6eb82ecc0ec5c06d 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc @@ -90,5 +90,4 @@ TEST(prelu_op, test_scalar) { } // namespace inference } // namespace paddle -// USE_OP(prelu); -USE_CPU_ONLY_OP(prelu); +USE_OP(prelu); diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index e822785ad6f4f6f67b72141f3e7b04aefa72e58b..95443e813327c1247ac530c4d2e68b3607ff0e73 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -1,4 +1,4 @@ nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu avg_pool_op_plugin.cu - DEPS enforce tensorrt_engine) + DEPS enforce tensorrt_engine prelu) diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu index e8f4254402a5d8a5e6c5a2384bf9fbe48341956e..3075e87ea6d719a3f49d14c8c4b8015f7d688a50 100644 --- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu @@ -14,92 +14,16 @@ #include #include +#include #include "glog/logging.h" #include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h" +#include "paddle/fluid/operators/math/prelu.h" namespace paddle { namespace inference { namespace tensorrt { namespace plugin { -static const int CUDA_NUM_THREADS = 1024; -static const int CUDA_MAX_NUM_BLOCKS = 65535; -inline static int GET_NUM_BLOCKS(const int N) { - return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; -} - -__global__ void PReluChannelWiseKernel(const float *input, const float *alpha, - float *output, int channel, - size_t spatial_size) { - size_t offset = blockIdx.x * spatial_size; - const float *in = input + offset; - float *out = output + offset; - float scale = alpha[blockIdx.x % channel]; - - for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) { - float x = in[i]; - out[i] = (x > 0) ? x : scale * x; - } -} - -__global__ void PReluElementWiseKernel(const float *input, const float *alpha, - float *output, size_t spatial_size) { - size_t offset = blockIdx.x * spatial_size; - const float *in = input + offset; - const float *scale = alpha + offset; - float *out = output + offset; - - for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) { - float x = in[i]; - out[i] = (x > 0) ? x : scale[i] * x; - } -} - -__global__ void PReluScalarKernel(const float *input, const float *alpha, - float *output, size_t spatial_size) { - size_t offset = blockIdx.x * spatial_size; - const float *in = input + offset; - float scale = *alpha; - float *out = output + offset; - - for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) { - float x = in[i]; - out[i] = (x > 0) ? x : scale * x; - } -} - -static inline void PReluChannelWise(cudaStream_t stream, const float *input, - const float *alpha, float *output, - int batch_size, - const nvinfer1::Dims &dims) { - size_t unroll = batch_size * dims.d[0]; - size_t spatial_size = dims.d[1] * dims.d[2]; - CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS); - PReluChannelWiseKernel<<>>( - input, alpha, output, dims.d[0], spatial_size); -} - -static inline void PReluElementWise(cudaStream_t stream, const float *input, - const float *alpha, float *output, - int batch_size, - const nvinfer1::Dims &dims) { - size_t unroll = batch_size * dims.d[0]; - size_t spatial_size = dims.d[1] * dims.d[2]; - CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS); - PReluElementWiseKernel<<>>( - input, alpha, output, spatial_size); -} - -static inline void PReluScalar(cudaStream_t stream, const float *input, - const float *alpha, float *output, - int batch_size, const nvinfer1::Dims &dims) { - size_t unroll = batch_size * dims.d[0]; - size_t spatial_size = dims.d[1] * dims.d[2]; - CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS); - PReluScalarKernel<<>>( - input, alpha, output, spatial_size); -} - nvinfer1::Dims PReluPlugin::getOutputDimensions(int index, const nvinfer1::Dims *inputDims, int nbInputs) { @@ -110,19 +34,31 @@ nvinfer1::Dims PReluPlugin::getOutputDimensions(int index, return output_dims; } -int PReluPlugin::enqueue(int batchSize, const void *const *inputs, +int PReluPlugin::enqueue(int batch_size, const void *const *inputs, void **outputs, void *workspace, cudaStream_t stream) { // input dims is CHW. const auto &input_dims = this->getInputDims(0); const float *input = reinterpret_cast(inputs[0]); const float *alpha = reinterpret_cast(alpha_.get().values); float *output = reinterpret_cast(outputs)[0]; + + std::vector input_shape; + input_shape.push_back(batch_size); + for (int i = 0; i < input_dims.nbDims; i++) { + input_shape.push_back(input_dims.d[i]); + } + if (mode_ == "channel") { - PReluChannelWise(stream, input, alpha, output, batchSize, input_dims); + operators::math::PreluChannelWiseDirectCUDAFunctor + prelu_channel_wise; + prelu_channel_wise(stream, input, alpha, output, input_shape); } else if (mode_ == "element") { - PReluElementWise(stream, input, alpha, output, batchSize, input_dims); + operators::math::PreluElementWiseDirectCUDAFunctor + prelu_element_wise; + prelu_element_wise(stream, input, alpha, output, input_shape); } else { - PReluScalar(stream, input, alpha, output, batchSize, input_dims); + operators::math::PreluScalarDirectCUDAFunctor prelu_scalar; + prelu_scalar(stream, input, alpha, output, input_shape); } return cudaGetLastError() != cudaSuccess; } diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 7dc88d9dd052c59aaa59b7802ee5a38ea9d89bc6..a07626a10315a6206f8c1ebc9a19df90663a88ee 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -46,11 +46,18 @@ set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2") download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2_tester.cc) -# DAM +# normal DAM set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam") download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc) +# small DAM +set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam") +download_model_and_data(${DAM_SMALL_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz") +inference_analysis_test(test_analyzer_small_dam SRCS analyzer_dam_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt --max_turn_num=1) + # chinese_ner set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner") download_model_and_data(${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz" "chinese_ner-data.txt.tar.gz") diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc index b369cba5c8b3f8aadd1123d6b7345fad6e47bd0f..a3a6130db7cfe75ef558dc901883c29a20088b3f 100644 --- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc @@ -14,38 +14,54 @@ #include "paddle/fluid/inference/tests/api/tester_helper.h" +DEFINE_int32(max_turn_num, 9, + "The max turn number: 1 for the small and 9 for the normal."); + namespace paddle { namespace inference { using contrib::AnalysisConfig; -#define MAX_TURN_NUM 9 -#define MAX_TURN_LEN 50 + +constexpr int32_t kMaxTurnLen = 50; + static std::vector result_data; struct DataRecord { - std::vector> - turns[MAX_TURN_NUM]; // turns data : MAX_TURN_NUM - std::vector> - turns_mask[MAX_TURN_NUM]; // turns mask data : MAX_TURN_NUM - std::vector> response; // response data : 1 + std::vector> *turns; + std::vector> *turns_mask; + std::vector> response; // response data : 1 std::vector> response_mask; // response mask data : 1 size_t batch_iter{0}; size_t batch_size{1}; size_t num_samples; // total number of samples - DataRecord() = default; + + DataRecord() { + turns = new std::vector>[FLAGS_max_turn_num]; // turns data : FLAGS_max_turn_num + turns_mask = new std::vector>[FLAGS_max_turn_num]; // turns mask data : FLAGS_max_turn_num + } + explicit DataRecord(const std::string &path, int batch_size = 1) - : batch_size(batch_size) { + : DataRecord() { + this->batch_size = batch_size; Load(path); } + + ~DataRecord() { + delete[] turns; + delete[] turns_mask; + } + DataRecord NextBatch() { DataRecord data; size_t batch_end = batch_iter + batch_size; // NOTE skip the final batch, if no enough data is provided. if (batch_end <= response.size()) { - for (int i = 0; i < MAX_TURN_NUM; ++i) { + for (int i = 0; i < FLAGS_max_turn_num; ++i) { data.turns[i].assign(turns[i].begin() + batch_iter, turns[i].begin() + batch_end); } - for (int i = 0; i < MAX_TURN_NUM; ++i) { + for (int i = 0; i < FLAGS_max_turn_num; ++i) { data.turns_mask[i].assign(turns_mask[i].begin() + batch_iter, turns_mask[i].begin() + batch_end); } @@ -60,6 +76,7 @@ struct DataRecord { batch_iter += batch_size; return data; } + void Load(const std::string &path) { std::ifstream file(path); std::string line; @@ -69,30 +86,30 @@ struct DataRecord { num_lines++; std::vector data; split(line, ',', &data); - CHECK_EQ(data.size(), (size_t)(2 * MAX_TURN_NUM + 3)); + CHECK_EQ(data.size(), (size_t)(2 * FLAGS_max_turn_num + 3)); // load turn data - std::vector turns_tmp[MAX_TURN_NUM]; - for (int i = 0; i < MAX_TURN_NUM; ++i) { + std::vector turns_tmp[FLAGS_max_turn_num]; + for (int i = 0; i < FLAGS_max_turn_num; ++i) { split_to_int64(data[i], ' ', &turns_tmp[i]); turns[i].push_back(std::move(turns_tmp[i])); } // load turn_mask data - std::vector turns_mask_tmp[MAX_TURN_NUM]; - for (int i = 0; i < MAX_TURN_NUM; ++i) { - split_to_float(data[MAX_TURN_NUM + i], ' ', &turns_mask_tmp[i]); + std::vector turns_mask_tmp[FLAGS_max_turn_num]; + for (int i = 0; i < FLAGS_max_turn_num; ++i) { + split_to_float(data[FLAGS_max_turn_num + i], ' ', &turns_mask_tmp[i]); turns_mask[i].push_back(std::move(turns_mask_tmp[i])); } // load response data std::vector response_tmp; - split_to_int64(data[2 * MAX_TURN_NUM], ' ', &response_tmp); + split_to_int64(data[2 * FLAGS_max_turn_num], ' ', &response_tmp); response.push_back(std::move(response_tmp)); // load response_mask data std::vector response_mask_tmp; - split_to_float(data[2 * MAX_TURN_NUM + 1], ' ', &response_mask_tmp); + split_to_float(data[2 * FLAGS_max_turn_num + 1], ' ', &response_mask_tmp); response_mask.push_back(std::move(response_mask_tmp)); // load result data float result_tmp; - result_tmp = std::stof(data[2 * MAX_TURN_NUM + 2]); + result_tmp = std::stof(data[2 * FLAGS_max_turn_num + 2]); result_data.push_back(result_tmp); } num_samples = num_lines; @@ -101,8 +118,8 @@ struct DataRecord { void PrepareInputs(std::vector *input_slots, DataRecord *data, int batch_size) { - PaddleTensor turns_tensor[MAX_TURN_NUM]; - PaddleTensor turns_mask_tensor[MAX_TURN_NUM]; + PaddleTensor turns_tensor[FLAGS_max_turn_num]; + PaddleTensor turns_mask_tensor[FLAGS_max_turn_num]; PaddleTensor response_tensor; PaddleTensor response_mask_tensor; std::string turn_pre = "turn_"; @@ -110,16 +127,16 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, auto one_batch = data->NextBatch(); int size = one_batch.response[0].size(); - CHECK_EQ(size, MAX_TURN_LEN); + CHECK_EQ(size, kMaxTurnLen); // turn tensor assignment - for (int i = 0; i < MAX_TURN_NUM; ++i) { + for (int i = 0; i < FLAGS_max_turn_num; ++i) { turns_tensor[i].name = turn_pre + std::to_string(i); turns_tensor[i].shape.assign({batch_size, size, 1}); turns_tensor[i].dtype = PaddleDType::INT64; TensorAssignData(&turns_tensor[i], one_batch.turns[i]); } // turn mask tensor assignment - for (int i = 0; i < MAX_TURN_NUM; ++i) { + for (int i = 0; i < FLAGS_max_turn_num; ++i) { turns_mask_tensor[i].name = turn_mask_pre + std::to_string(i); turns_mask_tensor[i].shape.assign({batch_size, size, 1}); turns_mask_tensor[i].dtype = PaddleDType::FLOAT32; @@ -137,10 +154,10 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, TensorAssignData(&response_mask_tensor, one_batch.response_mask); // Set inputs. - for (int i = 0; i < MAX_TURN_NUM; ++i) { + for (int i = 0; i < FLAGS_max_turn_num; ++i) { input_slots->push_back(std::move(turns_tensor[i])); } - for (int i = 0; i < MAX_TURN_NUM; ++i) { + for (int i = 0; i < FLAGS_max_turn_num; ++i) { input_slots->push_back(std::move(turns_mask_tensor[i])); } input_slots->push_back(std::move(response_tensor)); @@ -202,8 +219,6 @@ TEST(Analyzer_dam, fuse_statis) { auto fuse_statis = GetFuseStatis( static_cast(predictor.get()), &num_ops); ASSERT_TRUE(fuse_statis.count("fc_fuse")); - EXPECT_EQ(fuse_statis.at("fc_fuse"), 317); - EXPECT_EQ(num_ops, 2020); } // Compare result of NativeConfig and AnalysisConfig diff --git a/paddle/fluid/inference/utils/benchmark.cc b/paddle/fluid/inference/utils/benchmark.cc index 021edc2de5e90023fcd1431dd2025450e7462bd9..d03aa11b75ee58524746212e43a5796773f47932 100644 --- a/paddle/fluid/inference/utils/benchmark.cc +++ b/paddle/fluid/inference/utils/benchmark.cc @@ -33,7 +33,7 @@ std::string Benchmark::SerializeToString() const { ss << batch_size_ << "\t"; ss << num_threads_ << "\t"; ss << latency_ << "\t"; - ss << 1000 / latency_; + ss << 1000.0 / latency_; ss << '\n'; return ss.str(); } diff --git a/paddle/fluid/inference/utils/benchmark.h b/paddle/fluid/inference/utils/benchmark.h index 80e8f77adb4ff2cc81a2a3dd0c44e4e304800122..76a3dd2c2992ebdf2528c539b3d161f558b34a08 100644 --- a/paddle/fluid/inference/utils/benchmark.h +++ b/paddle/fluid/inference/utils/benchmark.h @@ -11,9 +11,11 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +#pragma once #include #include +#include namespace paddle { namespace inference { @@ -31,8 +33,8 @@ struct Benchmark { bool use_gpu() const { return use_gpu_; } void SetUseGpu() { use_gpu_ = true; } - int latency() const { return latency_; } - void SetLatency(int x) { latency_ = x; } + float latency() const { return latency_; } + void SetLatency(float x) { latency_ = x; } const std::string& name() const { return name_; } void SetName(const std::string& name) { name_ = name; } @@ -43,7 +45,7 @@ struct Benchmark { private: bool use_gpu_{false}; int batch_size_{0}; - int latency_; + float latency_; int num_threads_{1}; std::string name_; }; diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc index 26e2038a534c18d2b7ab77adf33846803dcffcf5..64aa63ffe9705d75e70c8d9d9cbc433dd6358596 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.cc +++ b/paddle/fluid/memory/allocation/legacy_allocator.cc @@ -14,11 +14,13 @@ #include "paddle/fluid/memory/allocation/legacy_allocator.h" #include +#include #include "glog/logging.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/string/printf.h" +#include "paddle/fluid/string/split.h" DEFINE_bool(init_allocated_mem, false, "It is a mistake that the values of the memory allocated by " @@ -86,7 +88,7 @@ struct NaiveAllocator { template <> void *Alloc(const platform::CPUPlace &place, size_t size) { - VLOG(1) << "Allocate " << size << " bytes on " << platform::Place(place); + VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); void *p = GetCPUBuddyAllocator()->Alloc(size); if (FLAGS_init_allocated_mem) { memset(p, 0xEF, size); @@ -97,7 +99,7 @@ void *Alloc(const platform::CPUPlace &place, size_t size) { template <> void Free(const platform::CPUPlace &place, void *p) { - VLOG(1) << "Free pointer=" << p << " on " << platform::Place(place); + VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); GetCPUBuddyAllocator()->Free(p); } @@ -110,19 +112,21 @@ size_t Used(const platform::CPUPlace &place) { BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { static std::once_flag init_flag; static detail::BuddyAllocator **a_arr = nullptr; + static std::vector devices; std::call_once(init_flag, [gpu_id]() { - int gpu_num = platform::GetCUDADeviceCount(); - PADDLE_ENFORCE(gpu_id < gpu_num, "gpu_id:%d should < gpu_num:%d", gpu_id, - gpu_num); + devices = platform::GetSelectedDevices(); + int gpu_num = devices.size(); a_arr = new BuddyAllocator *[gpu_num]; - for (int i = 0; i < gpu_num; i++) { + for (size_t i = 0; i < devices.size(); ++i) { + int dev_id = devices[i]; a_arr[i] = nullptr; - platform::SetDeviceId(i); - a_arr[i] = new BuddyAllocator( - std::unique_ptr(new detail::GPUAllocator(i)), - platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); + platform::SetDeviceId(dev_id); + a_arr[i] = new BuddyAllocator(std::unique_ptr( + new detail::GPUAllocator(dev_id)), + platform::GpuMinChunkSize(), + platform::GpuMaxChunkSize()); VLOG(10) << "\n\nNOTE: each GPU device use " << FLAGS_fraction_of_gpu_memory_to_use * 100 @@ -134,7 +138,9 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { }); platform::SetDeviceId(gpu_id); - return a_arr[gpu_id]; + auto pos = std::distance(devices.begin(), + std::find(devices.begin(), devices.end(), gpu_id)); + return a_arr[pos]; } #endif diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index de4f23515d8591f28b80ad00322365f8cdce768b..257bfc0a3f926d20abc4647b27e8e9cc2c49e014 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -37,7 +37,13 @@ if (WITH_GPU) SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub) endif() -register_operators(EXCLUDES warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS}) +SET(OP_PREFETCH_DEPS "") +if (WITH_DISTRIBUTE) + SET(OP_PREFETCH_DEPS ${OP_PREFETCH_DEPS} parameter_prefetch) +endif() + +register_operators(EXCLUDES warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) + # warpctc_op needs cudnn 7 above if (WITH_GPU AND NOT WIN32) @@ -64,7 +70,7 @@ endif() set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) if (WITH_GPU) - set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv) + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu) endif() # FIXME(typhoonzero): operator deps may not needed. diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 832245371e0b1966000ec0252a58ca02193332a7..9c5b8604f40ae56c463b54c71623feb61bd8d297 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -76,8 +76,8 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx, } #endif return framework::OpKernelType( - framework::ToDataType(ctx.Input(name)->type()), - ctx.GetPlace(), layout, library); + framework::GetDataTypeOfVar(ctx.InputVar(name)), ctx.GetPlace(), layout, + library); } class ActivationOp : public framework::OperatorWithKernel { diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index a0f8c5c14c48cb1e2be60b53a2198e30b050b33d..87d549678a0e6c183aac89539cf1f6331729de2c 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -41,6 +41,12 @@ static std::unordered_set InplaceOpSet = { "floor", "reciprocal", "relu6", "soft_relu", "hard_sigmoid", }; +/* The following operator can be used to process SelectedRows, because the + * output of those operator for zero is zero too. + */ +static std::unordered_set CanBeUsedBySelectedRows = { + "abs", "abs_grad", "square", "square_grad", "sqrt", "sqrt_grad"}; + static bool IsInplace(std::string op) { return InplaceOpSet.count(op); } template @@ -50,16 +56,38 @@ class ActivationKernel using T = typename Functor::ELEMENT_TYPE; void Compute(const framework::ExecutionContext& context) const override { - auto& X = detail::Ref(context.Input("X"), - "Cannot get input tensor X, variable name = %s", - context.op().Input("X")); - - auto& Out = detail::Ref(context.Output("Out"), - "Cannot get output tensor Out, variable name = %s", - context.op().Output("Out")); - Out.mutable_data(context.GetPlace()); + auto x_var = context.InputVar("X"); + auto out_var = context.OutputVar("Out"); + PADDLE_ENFORCE(x_var != nullptr, + "Cannot get input Variable X, variable name = %s", + context.op().Input("X")); + PADDLE_ENFORCE(out_var != nullptr, + "Cannot get output Variable Out, variable name = %s", + context.op().Output("Out")); + + framework::Tensor X, *Out; + + if (CanBeUsedBySelectedRows.count(context.op().Type())) { + X = detail::Ref( + paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var), + "Cannot get input Tensor X, variable name = %s", + context.op().Input("X")); + Out = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( + out_var); + } else { + X = detail::Ref(context.Input("X"), + "Cannot get input Tensor X, variable name = %s", + context.op().Input("X")); + Out = context.Output("Out"); + } + + PADDLE_ENFORCE(Out != nullptr, + "Cannot get output tensor Out, variable name = %s", + context.op().Output("Out")); + + Out->mutable_data(context.GetPlace()); auto x = framework::EigenVector::Flatten(X); - auto out = framework::EigenVector::Flatten(Out); + auto out = framework::EigenVector::Flatten(*Out); auto* place = context.template device_context().eigen_device(); Functor functor; @@ -78,14 +106,54 @@ class ActivationGradKernel public: using T = typename Functor::ELEMENT_TYPE; void Compute(const framework::ExecutionContext& context) const override { - auto* Out = context.Input("Out"); - auto* dOut = - context.Input(framework::GradVarName("Out")); - auto* dX = context.Output(framework::GradVarName("X")); + auto out_var = context.InputVar("Out"); + auto out_grad_var = context.InputVar(framework::GradVarName("Out")); + auto x_grad_var = context.OutputVar(framework::GradVarName("X")); + PADDLE_ENFORCE(out_var != nullptr, + "Cannot get input Variable Out, variable name = %s", + context.op().Input("Out")); + PADDLE_ENFORCE(out_grad_var != nullptr, + "Cannot get input Variable %s, variable name = %s", + framework::GradVarName("Out"), + context.op().Input(framework::GradVarName("Out"))); + PADDLE_ENFORCE(x_grad_var != nullptr, + "Cannot get output Variable %s, variable name = %s", + framework::GradVarName("X"), + context.op().Output(framework::GradVarName("X"))); + + framework::Tensor Out, dOut, *dX; + if (CanBeUsedBySelectedRows.count(context.op().Type())) { + Out = detail::Ref( + paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var), + "Cannot get input Tensor Out, variable name = %s", + context.op().Input("Out")); + dOut = + detail::Ref(paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar( + *out_grad_var), + "Cannot get input Tensor %s, variable name = %s", + framework::GradVarName("Out"), + context.op().Input(framework::GradVarName("Out"))); + dX = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( + x_grad_var); + } else { + Out = detail::Ref(context.Input("Out"), + "Cannot get input Tensor Out, variable name = %s", + context.op().Input("Out")); + dOut = detail::Ref( + context.Input(framework::GradVarName("Out")), + "Cannot get input Tensor %s, variable name = %s", + framework::GradVarName("Out"), + context.op().Input(framework::GradVarName("Out"))); + dX = context.Output(framework::GradVarName("X")); + } + PADDLE_ENFORCE(dX != nullptr, + "Cannot get output tensor %s, variable name = %s", + framework::GradVarName("X"), + context.op().Output(framework::GradVarName("X"))); dX->mutable_data(context.GetPlace()); - auto dout = framework::EigenVector::Flatten(*dOut); - auto out = framework::EigenVector::Flatten(*Out); + auto dout = framework::EigenVector::Flatten(dOut); + auto out = framework::EigenVector::Flatten(Out); auto dx = framework::EigenVector::Flatten(*dX); auto* place = context.template device_context().eigen_device(); @@ -96,8 +164,19 @@ class ActivationGradKernel } bool inplace = functor.Inplace(); if (!inplace) { - auto* X = context.Input("X"); - auto x = framework::EigenVector::Flatten(*X); + auto x_var = context.InputVar("X"); + PADDLE_ENFORCE(x_var != nullptr, + "Cannot get input tensor X, variable name = %s", + context.op().Input("X")); + framework::Tensor X; + if (CanBeUsedBySelectedRows.count(context.op().Type())) { + X = detail::Ref( + paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var)); + } else { + X = detail::Ref(context.Input("X")); + } + + auto x = framework::EigenVector::Flatten(X); functor(*place, x, out, dout, dx); } else { VLOG(10) << " Inplace activation "; diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 9b943440a869e213db4ed761cfe7c508bc5e94ae..75fc59125f21901b6781315eb3d7dba36b7f11f2 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -231,10 +231,10 @@ use lstm_x_t as input and compute as standard LSTM. template inline void bias_relu(const int n, const T* x, const T* bias, T* y) { if (bias) { - math::vec_add_bias(n, *bias, x, y); - math::vec_relu(n, y, y); + math::vec_add_bias(n, *bias, x, y); + math::vec_relu(n, y, y); } else { - math::vec_relu(n, x, y); + math::vec_relu(n, x, y); } } @@ -245,8 +245,8 @@ inline void vec_softmax(const int n, const T* x, T* y) { for (int i = 1; i < n; ++i) { scalar = scalar < x[i] ? x[i] : scalar; } - math::vec_add_bias(n, -scalar, x, y); // sub - math::vec_exp(n, y, y); // exp + math::vec_add_bias(n, -scalar, x, y); // sub + math::vec_exp(n, y, y); // exp // sum scalar = T(0); for (int i = 0; i < n; ++i) { @@ -302,13 +302,13 @@ class AttentionLSTMKernel : public framework::OpKernel { auto& act_gate_str = ctx.Attr("gate_activation"); auto& act_cell_str = ctx.Attr("cell_activation"); auto& act_cand_str = ctx.Attr("candidate_activation"); - if (platform::jit::MayIUse(platform::jit::avx)) { - math::VecActivations act_functor; + if (platform::MayIUse(platform::avx)) { + math::VecActivations act_functor; act_gate = act_functor(act_gate_str); act_cell = act_functor(act_cell_str); act_cand = act_functor(act_cand_str); } else { - math::VecActivations act_functor; + math::VecActivations act_functor; act_gate = act_functor(act_gate_str); act_cell = act_functor(act_cell_str); act_cand = act_functor(act_cand_str); diff --git a/paddle/fluid/operators/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/batch_norm_mkldnn_op.cc index de641cb08e4cc3322cc8387d873f2aaab279e1dd..bddca232e6c8a2a7fde998877006e37ee6d3d0dc 100644 --- a/paddle/fluid/operators/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "mkldnn.hpp" #include "paddle/fluid/operators/batch_norm_op.h" -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { namespace operators { @@ -146,7 +146,9 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { const float epsilon = ctx.Attr("epsilon"); const float momentum = ctx.Attr("momentum"); const bool is_test = ctx.Attr("is_test"); + const bool use_global_stats = ctx.Attr("use_global_stats"); const bool fuse_with_relu = ctx.Attr("fuse_with_relu"); + bool global_stats = is_test || use_global_stats; const auto *x = ctx.Input("X"); const auto *mean = ctx.Input("Mean"); @@ -177,13 +179,14 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { T *batch_mean_data = nullptr; T *batch_variance_data = nullptr; - if (!is_test) { + if (!global_stats) { batch_mean_data = batch_mean->mutable_data(ctx.GetPlace()); batch_variance_data = batch_variance->mutable_data(ctx.GetPlace()); } - auto propagation = is_test == true ? mkldnn::prop_kind::forward_scoring - : mkldnn::prop_kind::forward_training; + auto propagation = global_stats == true + ? mkldnn::prop_kind::forward_scoring + : mkldnn::prop_kind::forward_training; auto src_tz = paddle::framework::vectorize2int(x->dims()); auto scale_tz = paddle::framework::vectorize2int(scale->dims()); @@ -199,7 +202,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { shift->data() + ic, &scaleshift_data); unsigned flags = mkldnn::use_scale_shift; - if (is_test) flags |= mkldnn::use_global_stats; + if (global_stats) flags |= mkldnn::use_global_stats; if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu; // create mkldnn memory from input x tensor @@ -208,7 +211,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { // keys for backward pass const std::string key = BatchNormMKLDNNHandler::GetHash( - src_tz, epsilon, flags, is_test, input_format, + src_tz, epsilon, flags, global_stats, input_format, ctx.op().Output("SavedMean")); const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; @@ -239,7 +242,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { batch_norm_fwd_pd->dst_primitive_desc().desc(), y_data); std::shared_ptr batch_norm_p; - if (is_test) { + if (global_stats) { // create mkldnn memory for stats (as input) std::shared_ptr mean_memory = handler.AcquireMeanMemoryFromPrimitive(to_void_cast(mean_data)); @@ -269,7 +272,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { pipeline.push_back(*batch_norm_p); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); - if (!is_test) { + if (!global_stats) { // mkldnn only compute stats for current batch // so we need compute momentum stats via Eigen lib EigenVectorArrayMap batch_mean_e(batch_mean_data, ic); diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 2463c939bc5d19500ba36ba3c73db176bb82c62a..f66813989c64737a4b41e3f653d9ca654be72dd6 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -159,6 +159,14 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("fuse_with_relu", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr("use_global_stats", + "(bool, default false) Whether to use global mean and " + "variance. In inference or test mode, set use_global_stats " + "to true or is_test true. the behavior is equivalent. " + "In train mode, when setting use_global_stats True, the " + "global mean and variance are also used during train time, " + "the BN acts as scaling and shiffting.") + .SetDefault(false); AddComment(R"DOC( Batch Normalization. @@ -190,6 +198,10 @@ class BatchNormKernel const float epsilon = ctx.Attr("epsilon"); const float momentum = ctx.Attr("momentum"); const bool is_test = ctx.Attr("is_test"); + const bool use_global_stats = ctx.Attr("use_global_stats"); + + bool global_stats = is_test || use_global_stats; + const std::string data_layout_str = ctx.Attr("data_layout"); const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); @@ -217,7 +229,7 @@ class BatchNormKernel saved_mean->mutable_data(ctx.GetPlace()); saved_variance->mutable_data(ctx.GetPlace()); - if (!is_test) { + if (!global_stats) { // saved_xx is use just in this batch of data EigenVectorArrayMap saved_mean_e( saved_mean->mutable_data(ctx.GetPlace()), C); @@ -234,7 +246,7 @@ class BatchNormKernel if ((N * sample_size) == 1) { LOG(WARNING) << "Only 1 element in normalization dimension, " << "we skip the batch norm calculation, let y = x."; - framework::TensorCopySync(*x, ctx.GetPlace(), y); + framework::TensorCopy(*x, ctx.GetPlace(), y); return; } @@ -277,7 +289,7 @@ class BatchNormKernel // use SavedMean and SavedVariance to do normalize Eigen::Array inv_std(C); - if (is_test) { + if (global_stats) { ConstEigenVectorArrayMap var_arr( ctx.Input("Variance")->data(), C); inv_std = (var_arr + epsilon).sqrt().inverse(); @@ -289,8 +301,8 @@ class BatchNormKernel inv_std = saved_inv_std; } ConstEigenVectorArrayMap mean_arr( - is_test ? ctx.Input("Mean")->data() - : ctx.Output("SavedMean")->data(), + global_stats ? ctx.Input("Mean")->data() + : ctx.Output("SavedMean")->data(), C); // ((x - est_mean) * (inv_var) * scale + bias @@ -336,15 +348,27 @@ class BatchNormGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { // check input PADDLE_ENFORCE(ctx->HasInput("X")); - PADDLE_ENFORCE(ctx->HasInput("Scale"), ""); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), ""); - PADDLE_ENFORCE(ctx->HasInput("SavedMean"), ""); - PADDLE_ENFORCE(ctx->HasInput("SavedVariance"), ""); + PADDLE_ENFORCE(ctx->HasInput("Scale"), "Input(scale) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), + "Input(Y@GRAD) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("SavedMean"), + "Input(SavedMean) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("SavedVariance"), + "Input(SavedVariance) should not be null"); // check output PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), ""); - PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Scale")), ""); - PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")), ""); + if (ctx->HasOutput(framework::GradVarName("Scale"))) { + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")), + "Output(Scale@GRAD) and Output(Bias@GRAD) should not be " + "null at same time"); + } + const bool use_global_stats = ctx->Attrs().Get("use_global_stats"); + if (use_global_stats) { + PADDLE_ENFORCE(!ctx->Attrs().Get("use_mkldnn"), + "Using global stats during training is not supported " + "in gradient op kernel of batch_norm_mkldnn_op now."); + } const auto x_dims = ctx->GetInputDim("X"); const DataLayout data_layout = framework::StringToDataLayout( @@ -354,8 +378,10 @@ class BatchNormGradOp : public framework::OperatorWithKernel { : x_dims[x_dims.size() - 1]); ctx->SetOutputDim(framework::GradVarName("X"), x_dims); - ctx->SetOutputDim(framework::GradVarName("Scale"), {C}); - ctx->SetOutputDim(framework::GradVarName("Bias"), {C}); + if (ctx->HasOutput(framework::GradVarName("Scale"))) { + ctx->SetOutputDim(framework::GradVarName("Scale"), {C}); + ctx->SetOutputDim(framework::GradVarName("Bias"), {C}); + } } protected: @@ -405,6 +431,8 @@ class BatchNormGradKernel // SavedVariance have been reverted in forward operator const auto *saved_inv_variance = ctx.Input("SavedVariance"); const std::string data_layout_str = ctx.Attr("data_layout"); + const bool use_global_stats = ctx.Attr("use_global_stats"); + const float epsilon = ctx.Attr("epsilon"); const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); @@ -419,38 +447,60 @@ class BatchNormGradKernel : x_dims[x_dims.size() - 1]); const int sample_size = x->numel() / N / C; - ConstEigenVectorArrayMap scale_arr(scale->data(), C); - ConstEigenVectorArrayMap mean_arr(saved_mean->data(), C); - ConstEigenVectorArrayMap inv_var_arr(saved_inv_variance->data(), C); - // init output auto *d_x = ctx.Output(framework::GradVarName("X")); auto *d_scale = ctx.Output(framework::GradVarName("Scale")); auto *d_bias = ctx.Output(framework::GradVarName("Bias")); d_x->mutable_data(ctx.GetPlace()); - d_scale->mutable_data(ctx.GetPlace()); - d_bias->mutable_data(ctx.GetPlace()); + + const T *mean_data = saved_mean->data(); + const T *inv_var_data = saved_inv_variance->data(); + Tensor inv_var_tensor; + if (use_global_stats) { + const auto *running_mean = ctx.Input("Mean"); + const auto *running_variance = ctx.Input("Variance"); + mean_data = running_mean->data(); + T *running_inv_var_data = inv_var_tensor.mutable_data(ctx.GetPlace()); + EigenVectorArrayMap inv_var_tmp(running_inv_var_data, C); + ConstEigenVectorArrayMap var_arr(running_variance->data(), C); + + inv_var_tmp = (var_arr + epsilon).sqrt().inverse().eval(); + inv_var_data = running_inv_var_data; + } + + ConstEigenVectorArrayMap scale_arr(scale->data(), C); + ConstEigenVectorArrayMap mean_arr(mean_data, C); + ConstEigenVectorArrayMap inv_var_arr(inv_var_data, C); + + T *d_bias_data = nullptr; + T *d_scale_data = nullptr; + if (d_scale && d_bias) { + d_scale->mutable_data(ctx.GetPlace()); + d_bias->mutable_data(ctx.GetPlace()); + d_bias_data = d_bias->mutable_data(ctx.GetPlace()); + d_scale_data = d_scale->mutable_data(ctx.GetPlace()); + } // d_bias = np.sum(d_y, axis=0) // d_scale = np.sum((X - mean) / inv_std * dy, axis=0) // d_x = (1. / N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0) // - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0)) + EigenVectorArrayMap d_bias_arr(d_bias_data, C); + EigenVectorArrayMap d_scale_arr(d_scale_data, C); - EigenVectorArrayMap d_bias_arr(d_bias->mutable_data(ctx.GetPlace()), - C); - EigenVectorArrayMap d_scale_arr(d_scale->mutable_data(ctx.GetPlace()), - C); - - d_bias_arr.setZero(); - d_scale_arr.setZero(); + if (d_scale && d_bias) { + d_bias_arr.setZero(); + d_scale_arr.setZero(); + } - if ((N * sample_size) == 1) { - framework::TensorCopySync(*d_y, ctx.GetPlace(), d_x); + if ((N * sample_size) == 1 && !use_global_stats) { + framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); return; } - const auto scale_inv_var_nhw = scale_arr * inv_var_arr / (N * sample_size); + int scale_coefff = use_global_stats ? 1 : N * sample_size; + const auto scale_inv_var_nhw = scale_arr * inv_var_arr / scale_coefff; switch (data_layout) { case DataLayout::kNCHW: { @@ -460,19 +510,29 @@ class BatchNormGradKernel sample_size, N * C); d_x_arr.setZero(); - for (int nc = 0; nc < N * C; ++nc) { - int c = nc % C; - d_bias_arr(c) += d_y_arr.col(nc).sum(); - d_scale_arr(c) += - ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * d_y_arr.col(nc)) - .sum(); + if (d_scale && d_bias) { + for (int nc = 0; nc < N * C; ++nc) { + int c = nc % C; + d_bias_arr(c) += d_y_arr.col(nc).sum(); + d_scale_arr(c) += ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * + d_y_arr.col(nc)) + .sum(); + } } - for (int nc = 0; nc < N * C; ++nc) { - int c = nc % C; - d_x_arr.col(nc) += - scale_inv_var_nhw(c) * - (d_y_arr.col(nc) * N * sample_size - d_bias_arr(c) - - (x_arr.col(nc) - mean_arr[c]) * d_scale_arr(c) * inv_var_arr(c)); + if (!use_global_stats) { + for (int nc = 0; nc < N * C; ++nc) { + int c = nc % C; + d_x_arr.col(nc) += + scale_inv_var_nhw(c) * + (d_y_arr.col(nc) * N * sample_size - d_bias_arr(c) - + (x_arr.col(nc) - mean_arr[c]) * d_scale_arr(c) * + inv_var_arr(c)); + } + } else { + for (int nc = 0; nc < N * C; ++nc) { + int c = nc % C; + d_x_arr.col(nc) += scale_inv_var_nhw(c) * d_y_arr.col(nc); + } } break; } @@ -488,15 +548,27 @@ class BatchNormGradKernel const auto d_y_mul_x_minus_mean_row_sum = (d_y_arr * x_minus_mean).rowwise().sum(); const auto inv_var_sqr = inv_var_arr * inv_var_arr; - for (int nhw = 0; nhw < N * sample_size; ++nhw) { - d_bias_arr += d_y_arr.col(nhw); - d_scale_arr += - (x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw); - d_x_arr.col(nhw) += - scale_inv_var_nhw * - (d_y_arr.col(nhw) * N * sample_size - d_y_row_sum - - x_minus_mean.col(nhw) * inv_var_sqr * - d_y_mul_x_minus_mean_row_sum); + + if (d_scale && d_bias) { + for (int nhw = 0; nhw < N * sample_size; ++nhw) { + d_bias_arr += d_y_arr.col(nhw); + d_scale_arr += + (x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw); + } + } + + if (!use_global_stats) { + for (int nhw = 0; nhw < N * sample_size; ++nhw) { + d_x_arr.col(nhw) += + scale_inv_var_nhw * + (d_y_arr.col(nhw) * N * sample_size - d_y_row_sum - + x_minus_mean.col(nhw) * inv_var_sqr * + d_y_mul_x_minus_mean_row_sum); + } + } else { + for (int nhw = 0; nhw < N * sample_size; ++nhw) { + d_x_arr.col(nhw) += scale_inv_var_nhw * d_y_arr.col(nhw); + } } break; } @@ -522,6 +594,10 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker { op->SetInput("SavedMean", Output("SavedMean")); op->SetInput("SavedVariance", Output("SavedVariance")); + // used when setting use_global_stats True during training + op->SetInput("Mean", Output("MeanOut")); + op->SetInput("Variance", Output("VarianceOut")); + op->SetAttrMap(Attrs()); op->SetOutput(framework::GradVarName("X"), InputGrad("X")); diff --git a/paddle/fluid/operators/batch_norm_op.cu.cc b/paddle/fluid/operators/batch_norm_op.cu similarity index 57% rename from paddle/fluid/operators/batch_norm_op.cu.cc rename to paddle/fluid/operators/batch_norm_op.cu index aaed335c905c0d80cd519afc5fecb06af73fcfe7..1c45746a92ad057a97d9f65aa256df616fc37f3d 100644 --- a/paddle/fluid/operators/batch_norm_op.cu.cc +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -12,9 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/batch_norm_op.h" +#include #include +#include +#include +#include "cub/cub.cuh" #include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/float16.h" @@ -59,6 +63,7 @@ class BatchNormKernel double epsilon = static_cast(ctx.Attr("epsilon")); const float momentum = ctx.Attr("momentum"); const bool is_test = ctx.Attr("is_test"); + const bool use_global_stats = ctx.Attr("use_global_stats"); const std::string data_layout_str = ctx.Attr("data_layout"); const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); @@ -121,7 +126,7 @@ class BatchNormKernel auto handle = dev_ctx.cudnn_handle(); // Now, depending on whether we are running test or not, we have two paths. - if (is_test) { + if (is_test || use_global_stats) { // only when test we use input to do computation. const auto *est_mean = ctx.Input("Mean"); const auto *est_var = ctx.Input("Variance"); @@ -163,7 +168,7 @@ class BatchNormKernel if ((N * H * W * D) == 1) { LOG(WARNING) << "Only 1 element in normalization dimension, " << "we skip the batch norm calculation, let y = x."; - framework::TensorCopySync(*x, ctx.GetPlace(), y); + framework::TensorCopy(*x, ctx.GetPlace(), y); } else { double this_factor = 1. - momentum; @@ -191,6 +196,58 @@ class BatchNormKernel } }; +template +static __global__ void KeBNBackwardData(const T *dy, + const BatchNormParamType *scale, + const BatchNormParamType *variance, + const double epsilon, const int C, + const int HxW, const int num, T *dx) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = gid; i < num; i += stride) { + const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C; + BatchNormParamType inv_var = 1.0 / sqrt(variance[c] + epsilon); + dx[i] = static_cast(static_cast>(dy[i]) * + scale[c] * inv_var); + } +} + +template +static __global__ void KeBNBackwardScaleBias( + const T *dy, const T *x, const BatchNormParamType *mean, + const BatchNormParamType *variance, const double epsilon, const int N, + const int C, const int HxW, BatchNormParamType *dscale, + BatchNormParamType *dbias) { + const int outer_size = C; + const int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage ds_storage; + __shared__ typename BlockReduce::TempStorage db_storage; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType ds_sum = static_cast>(0); + BatchNormParamType db_sum = static_cast>(0); + + BatchNormParamType inv_var_i = 1.0 / sqrt(variance[i] + epsilon); + BatchNormParamType mean_i = mean[i]; + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + ds_sum += static_cast>(dy[index]) * + (static_cast>(x[index]) - mean_i); + db_sum += static_cast>(dy[index]); + } + ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); + db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); + if (threadIdx.x == 0) { + dscale[i] = ds_sum * inv_var_i; + dbias[i] = db_sum; + } + __syncthreads(); + } +} + template class BatchNormGradKernel : public framework::OpKernel { @@ -200,6 +257,8 @@ class BatchNormGradKernel "It must use CUDAPlace."); double epsilon = static_cast(ctx.Attr("epsilon")); const std::string data_layout_str = ctx.Attr("data_layout"); + const bool use_global_stats = ctx.Attr("use_global_stats"); + const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); const auto *x = ctx.Input("X"); @@ -219,42 +278,13 @@ class BatchNormGradKernel auto *d_bias = ctx.Output(framework::GradVarName("Bias")); d_x->mutable_data(ctx.GetPlace()); - d_scale->mutable_data>(ctx.GetPlace()); - d_bias->mutable_data>(ctx.GetPlace()); - - auto &dev_ctx = ctx.template device_context(); - if ((N * H * W * D) == 1) { - framework::TensorCopySync(*d_y, ctx.GetPlace(), d_x); - math::SetConstant> - functor; - functor(dev_ctx, d_scale, static_cast>(0)); - functor(dev_ctx, d_bias, static_cast>(0)); - return; + if (d_scale && d_bias) { + d_scale->mutable_data>(ctx.GetPlace()); + d_bias->mutable_data>(ctx.GetPlace()); } - PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL); PADDLE_ENFORCE_EQ(scale->dims()[0], C); - // ------------------- cudnn descriptors --------------------- - cudnnTensorDescriptor_t data_desc_; - cudnnTensorDescriptor_t bn_param_desc_; - cudnnBatchNormMode_t mode_; - - CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - CUDNN_ENFORCE( - platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); - if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { - LOG(ERROR) << "Provided epsilon is smaller than " - << "CUDNN_BN_MIN_EPSILON. Setting it to " - << "CUDNN_BN_MIN_EPSILON instead."; - } - epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); -#if CUDNN_VERSION_MIN(7, 0, 0) - mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; -#else - mode_ = CUDNN_BATCHNORM_SPATIAL; -#endif - std::vector dims; std::vector strides; if (data_layout == DataLayout::kNCHW) { @@ -264,34 +294,114 @@ class BatchNormGradKernel dims = {N, C, H, W, D}; strides = {H * W * C * D, 1, W * D * C, D * C, C}; } - CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); - CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor( - bn_param_desc_, data_desc_, mode_)); - - const auto *saved_mean = ctx.Input("SavedMean"); - const auto *saved_var = ctx.Input("SavedVariance"); - const void *saved_mean_data = - saved_mean->template data>(); - const void *saved_var_data = - saved_var->template data>(); - - CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward( - dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), - CudnnDataType::kZero(), CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, x->template data(), - data_desc_, d_y->template data(), data_desc_, - d_x->template mutable_data(ctx.GetPlace()), bn_param_desc_, - scale->template data>(), - d_scale->template mutable_data>(ctx.GetPlace()), - d_bias->template mutable_data>(ctx.GetPlace()), - epsilon, saved_mean_data, saved_var_data)); - // clean when exit. - CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - CUDNN_ENFORCE( - platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); + auto &dev_ctx = ctx.template device_context(); + if (!use_global_stats) { + if ((N * H * W * D) == 1) { + framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); + math::SetConstant> + functor; + functor(dev_ctx, d_scale, static_cast>(0)); + functor(dev_ctx, d_bias, static_cast>(0)); + return; + } + + // ------------------- cudnn descriptors --------------------- + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t bn_param_desc_; + cudnnBatchNormMode_t mode_; + + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); +#if CUDNN_VERSION_MIN(7, 0, 0) + mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; +#else + mode_ = CUDNN_BATCHNORM_SPATIAL; +#endif + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); + CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, mode_)); + + const auto *saved_mean = ctx.Input("SavedMean"); + const auto *saved_var = ctx.Input("SavedVariance"); + const void *saved_mean_data = + saved_mean->template data>(); + const void *saved_var_data = + saved_var->template data>(); + + CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward( + dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), + CudnnDataType::kZero(), CudnnDataType::kOne(), + CudnnDataType::kZero(), data_desc_, x->template data(), + data_desc_, d_y->template data(), data_desc_, + d_x->template mutable_data(ctx.GetPlace()), bn_param_desc_, + scale->template data>(), + d_scale->template mutable_data>(ctx.GetPlace()), + d_bias->template mutable_data>(ctx.GetPlace()), + epsilon, saved_mean_data, saved_var_data)); + + // clean when exit. + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); + } else { + const auto *running_mean = ctx.Input("Mean"); + const auto *running_var = ctx.Input("Variance"); + + const auto *running_mean_data = + running_mean->template data>(); + const auto *running_var_data = + running_var->template data>(); + + const int num = x->numel(); + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + int grid1 = (num + block - 1) / block; + int grid2 = std::min(C, max_blocks); + + if (data_layout == framework::DataLayout::kNCHW) { + if (d_x) { + KeBNBackwardData<<< + grid1, block, 0, dev_ctx.stream()>>>( + d_y->data(), scale->data>(), + running_var_data, epsilon, C, H * W, num, d_x->data()); + } + if (d_scale && d_bias) { + KeBNBackwardScaleBias<<< + grid2, block, 0, dev_ctx.stream()>>>( + d_y->data(), x->data(), running_mean_data, running_var_data, + epsilon, C, H * W, num, d_scale->data>(), + d_bias->data>()); + } + } else { + if (d_x) { + KeBNBackwardData<<< + grid1, block, 0, dev_ctx.stream()>>>( + d_y->data(), scale->data>(), + running_var_data, epsilon, C, H * W, num, d_x->data()); + } + if (d_scale && d_bias) { + KeBNBackwardScaleBias<<< + grid2, block, 0, dev_ctx.stream()>>>( + d_y->data(), x->data(), running_mean_data, running_var_data, + epsilon, C, H * W, num, d_scale->data>(), + d_bias->data>()); + } + } + } } }; diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc index 2c09ee7394ad605f7a324d021ce0468a79bb71ca..3235ad52b999e1ca3f992034781edaab9921a300 100644 --- a/paddle/fluid/operators/conv_fusion_op.cu.cc +++ b/paddle/fluid/operators/conv_fusion_op.cu.cc @@ -110,11 +110,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { auto x_dims = framework::vectorize(input->dims()); auto f_dims = framework::vectorize(filter->dims()); - if (activation == "identity") { - // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is - // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib. - algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; - } else if (!exhaustive_search) { + if (!exhaustive_search) { CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, @@ -165,18 +161,42 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, "workspace_size to be allocated exceeds the limit"); - // ------------------- cudnn conv+bias+act forward -------------------- - ScalingParamType alpha1 = 1.0f; - ScalingParamType alpha2 = residual ? 1.0f : 0.0f; - auto cudnn_func = [&](void* cudnn_workspace) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( - handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc, - filter_data, cudnn_conv_desc, algo, cudnn_workspace, - workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data, - cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc, + if ((activation == "identity") && + (algo != CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) && + (!residual)) { + // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is + // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib. + // But test in some case, the speed is slower, change to use + // cudnnConvolutionForward and cudnnAddTensor + // ------------- cudnn conv forward and bias add --------------------- + ScalingParamType alpha = 1.0f, beta = 0.0f; + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, algo, cudnn_workspace, + workspace_size_in_bytes, &beta, cudnn_output_desc, output_data)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); + CUDNN_ENFORCE(platform::dynload::cudnnAddTensor( + handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc, output_data)); - }; - workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); + } else { + if (activation == "identity") { + algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; + } + // ------------------- cudnn conv+bias+act forward -------------------- + ScalingParamType alpha1 = 1.0f; + ScalingParamType alpha2 = residual ? 1.0f : 0.0f; + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( + handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, algo, cudnn_workspace, + workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data, + cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc, + output_data)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); + } } }; #endif diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index 9e2e2cf818000d9181447a0aa6b4ac4878781f35..ce45dd58419ab20cccf00544288b79d869515578 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -15,7 +15,7 @@ #include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/conv_op.h" -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { namespace operators { @@ -28,259 +28,6 @@ using mkldnn::stream; using platform::to_void_cast; using platform::GetMKLDNNFormat; -class ConvMKLDNNHandler : public platform::MKLDNNHandler { - public: - ConvMKLDNNHandler( - std::shared_ptr conv_pd, - const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, - const std::string& base_key) - : platform::MKLDNNHandler(dev_ctx, engine, base_key) { - conv_pd_ = conv_pd; - } - - ConvMKLDNNHandler( - std::shared_ptr conv_pd, - std::shared_ptr - conv_bwd_data_pd, - std::shared_ptr - conv_bwd_weights_pd, - const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, - const std::string& base_key) - : platform::MKLDNNHandler(dev_ctx, engine, base_key), - conv_pd_(conv_pd), - conv_bwd_weights_pd_(conv_bwd_weights_pd), - conv_bwd_data_pd_(conv_bwd_data_pd) { - // If we are in Grad operatgor then update a key with BWD suffix to - // distinguish from FWD memory primitives - key_ += "-BWD"; - } - - size_t GetDstMemorySize() const { - return conv_pd_->dst_primitive_desc().get_size(); - } - - mkldnn::memory::format GetDstFormat() const { - return static_cast( - conv_pd_->dst_primitive_desc().desc().data.format); - } - - size_t GetDiffWeightsMemorySize() const { - return conv_bwd_weights_pd_->diff_weights_primitive_desc().get_size(); - } - - size_t GetDiffSourceMemorySize() const { - return conv_bwd_data_pd_->diff_src_primitive_desc().get_size(); - } - - std::shared_ptr AcquireSrcMemoryFromWeightsPrimitive( - const std::shared_ptr user_memory_p, - std::vector& pipeline) { // NOLINT - auto src_pd = conv_bwd_weights_pd_->src_primitive_desc(); - auto user_pd = user_memory_p->get_primitive_desc(); - return this->AcquireMemory(src_pd, user_pd, user_memory_p, - "@weights-src_mem_p", pipeline); - } - - std::shared_ptr AcquireDiffDstMemoryFromWeightsPrimitive( - const std::shared_ptr user_memory_p, - std::vector& pipeline) { // NOLINT - auto diff_dst_pd = conv_bwd_weights_pd_->diff_dst_primitive_desc(); - auto user_pd = user_memory_p->get_primitive_desc(); - return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p, - "@weights-diff_dst_mem_p", pipeline); - } - - std::shared_ptr AcquireDiffWeightsMemoryFromWeightsPrimitive( - void* ptr) { - return this->AcquireMemoryFromPrimitive( - conv_bwd_weights_pd_->diff_weights_primitive_desc(), ptr, - "@diff_weights_mem_p"); - } - - std::shared_ptr AcquireDiffDstMemoryFromDataPrimitive( - const std::shared_ptr user_memory_p, - std::vector& pipeline) { // NOLINT - auto diff_dst_pd = conv_bwd_data_pd_->diff_dst_primitive_desc(); - auto user_pd = user_memory_p->get_primitive_desc(); - return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p, - "@data-diff_dst_mem_p", pipeline); - } - - std::shared_ptr AcquireWeightsMemoryFromDataPrimitive( - const std::shared_ptr user_weights_memory_p, - std::vector& pipeline) { // NOLINT - auto weights_pd = conv_bwd_data_pd_->weights_primitive_desc(); - auto user_pd = user_weights_memory_p->get_primitive_desc(); - return this->AcquireMemory(weights_pd, user_pd, user_weights_memory_p, - "@data-weights_mem_p", pipeline); - } - - std::shared_ptr AcquireResidualDataMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_residual_data_mem_p"); - } - - std::shared_ptr AcquireDstMemoryFromResidualDataMemory( - const std::shared_ptr& user_residual_memory_p, - void* dst_ptr, - std::vector& pipeline) { // NOLINT - return this->AcquireMemory(user_residual_memory_p, - this->AcquireDstMemoryFromPrimitive(dst_ptr), - "@residual_data_mem_p", pipeline); - } - - std::shared_ptr AcquireDiffSrcMemoryFromDataPrimitive( - void* ptr) { - return this->AcquireMemoryFromPrimitive( - conv_bwd_data_pd_->diff_src_primitive_desc(), ptr, "@diff_src_mem_p"); - } - - std::shared_ptr AcquireDstMemoryFromPrimitive(void* ptr) { - return this->AcquireMemoryFromPrimitive(conv_pd_->dst_primitive_desc(), ptr, - "@dst_mem_p"); - } - - std::shared_ptr AcquireSrcMemoryFromPrimitive( - const std::shared_ptr user_memory_p, - std::vector& pipeline) { // NOLINT - auto src_pd = conv_pd_->src_primitive_desc(); - auto user_pd = user_memory_p->get_primitive_desc(); - return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p", - pipeline); - } - - std::shared_ptr AcquireWeightsMemoryFromPrimitive( - const std::shared_ptr user_weights_memory_p, - std::vector& pipeline, // NOLINT - bool is_persistent = false) { - auto user_weights_pd = user_weights_memory_p->get_primitive_desc(); - auto weights_pd = conv_pd_->weights_primitive_desc(); - return this->AcquireMemory(weights_pd, user_weights_pd, - user_weights_memory_p, "@weights_mem_p", - pipeline, is_persistent); - } - - std::shared_ptr AcquireBiasMemoryFromPrimitive( - const std::shared_ptr user_bias_memory_p, - std::vector& pipeline) { // NOLINT - auto user_bias_pd = user_bias_memory_p->get_primitive_desc(); - auto bias_pd = conv_pd_->bias_primitive_desc(); - return this->AcquireMemory(bias_pd, user_bias_pd, user_bias_memory_p, - "@bias_mem_p", pipeline); - } - - std::shared_ptr AcquireConvolution( - std::shared_ptr src_memory_p, - std::shared_ptr weights_memory_p, - std::shared_ptr dst_memory_p) { - auto prim_key = key_ + "@conv_p"; - auto conv_p = std::static_pointer_cast( - dev_ctx_.GetBlob(prim_key)); - PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false), - "Fail to find convolution primitive in device context"); - if (conv_p == nullptr) { - conv_p = std::make_shared( - *conv_pd_, *(src_memory_p), *(weights_memory_p.get()), - *(dst_memory_p.get())); - - dev_ctx_.SetBlob(prim_key, conv_p); - } else { - is_reusing_ = true; - } - return conv_p; - } - - std::shared_ptr AcquireConvolution( - std::shared_ptr src_memory_p, - std::shared_ptr weights_memory_p, - std::shared_ptr bias_memory_p, - std::shared_ptr dst_memory_p) { - auto prim_key = key_ + "@conv_p"; - auto conv_p = std::static_pointer_cast( - dev_ctx_.GetBlob(prim_key)); - PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false), - "Fail to find convolution primitive in device context"); - if (conv_p == nullptr) { - conv_p = std::make_shared( - *conv_pd_, *(src_memory_p), *(weights_memory_p.get()), - *(bias_memory_p.get()), *(dst_memory_p.get())); - - dev_ctx_.SetBlob(prim_key, conv_p); - } else { - is_reusing_ = true; - } - return conv_p; - } - - std::shared_ptr - AcquireConvolutionBackwardWeights( - std::shared_ptr src_memory_p, - std::shared_ptr diff_dst_memory_p, - std::shared_ptr diff_weights_memory_p) { - auto prim_key = key_ + "@conv_bwd_weights_p"; - auto conv_bwd_weights_p = - std::static_pointer_cast( - dev_ctx_.GetBlob(prim_key)); - PADDLE_ENFORCE( - (conv_bwd_weights_p != nullptr) || (is_reusing_ == false), - "Fail to find convolution bwd weights primitive in device context"); - if (conv_bwd_weights_p == nullptr) { - // create backward conv primitive for weights - conv_bwd_weights_p = - std::make_shared( - *conv_bwd_weights_pd_, *src_memory_p, *diff_dst_memory_p, - *diff_weights_memory_p); - dev_ctx_.SetBlob(prim_key, conv_bwd_weights_p); - } else { - is_reusing_ = true; - } - return conv_bwd_weights_p; - } - - std::shared_ptr - AcquireConvolutionBackwardData( - std::shared_ptr diff_dst_memory_p, - std::shared_ptr weights_memory_p, - std::shared_ptr diff_src_memory_p) { - auto prim_key = key_ + "@conv_bwd_data_p"; - auto conv_bwd_data_p = - std::static_pointer_cast( - dev_ctx_.GetBlob(prim_key)); - PADDLE_ENFORCE( - (conv_bwd_data_p != nullptr) || (is_reusing_ == false), - "Fail to find convolution bwd data primitive in device context"); - if (conv_bwd_data_p == nullptr) { - conv_bwd_data_p = std::make_shared( - *conv_bwd_data_pd_, *diff_dst_memory_p, *weights_memory_p, - *diff_src_memory_p); - dev_ctx_.SetBlob(prim_key, conv_bwd_data_p); - } else { - is_reusing_ = true; - } - return conv_bwd_data_p; - } - - // Generate keys for storing/retriving primitives for this operator - // TODO(jczaja): Make hashing function more optimial - static std::string GetHash(memory::dims& input_dims, // NOLINT - memory::dims& weights_dims, // NOLINT - std::vector& strides, // NOLINT - std::vector& paddings, // NOLINT - std::vector& dilations, // NOLINT - int groups, const std::string& suffix) { - return dims2str(input_dims) + dims2str(weights_dims) + dims2str(strides) + - dims2str(paddings) + dims2str(dilations) + std::to_string(groups) + - suffix; - } - - private: - std::shared_ptr conv_pd_; - std::shared_ptr - conv_bwd_weights_pd_; - std::shared_ptr - conv_bwd_data_pd_; -}; - template class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { public: @@ -351,7 +98,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); // Get unique name for storing MKLDNN primitives - const std::string key = ConvMKLDNNHandler::GetHash( + const std::string key = platform::ConvMKLDNNHandler::GetHash( src_tz, weights_tz, strides, paddings, dilations, groups, ctx.op().Output("Output")); const std::string key_conv_pd = key + "@conv_pd"; @@ -400,7 +147,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { // Save conv_pd/src_memory/weights_memory for backward pass if (!is_test) dev_ctx.SetBlob(key_conv_pd, conv_pd); - ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key); + platform::ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key); // create mkldnn memory from input tensors (data/weights) auto user_src_memory_p = @@ -616,9 +363,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { // Get an unique name from "argument" name of "Output" variable // as well as attributes of primitive to be created // This name will be used as key when saving info into device context - const std::string key = - ConvMKLDNNHandler::GetHash(src_tz, weights_tz, strides, paddings, - dilations, groups, ctx.op().Input("Output")); + const std::string key = platform::ConvMKLDNNHandler::GetHash( + src_tz, weights_tz, strides, paddings, dilations, groups, + ctx.op().Input("Output")); const std::string key_conv_pd = key + "@conv_pd"; std::vector pipeline; @@ -673,8 +420,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { std::make_shared( conv_bwd_data_desc, mkldnn_engine, *conv_pd); - ConvMKLDNNHandler handler(conv_pd, conv_bwd_data_pd, conv_bwd_weights_pd, - dev_ctx, mkldnn_engine, key); + platform::ConvMKLDNNHandler handler(conv_pd, conv_bwd_data_pd, + conv_bwd_weights_pd, dev_ctx, + mkldnn_engine, key); // create mkldnn memory from input tensors (data/weights) auto user_src_memory_p = @@ -743,8 +491,12 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { namespace ops = paddle::operators; -REGISTER_OP_KERNEL(conv2d, MKLDNN, ::paddle::platform::CPUPlace, - ops::ConvMKLDNNOpKernel); +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, + ::paddle::platform::CPUPlace, FP32, + ops::kConvMKLDNNFP32, + ops::ConvMKLDNNOpKernel); -REGISTER_OP_KERNEL(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::ConvMKLDNNGradOpKernel); +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN, + ::paddle::platform::CPUPlace, FP32, + ops::kConvMKLDNNFP32, + ops::ConvMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 342525be49e28f1785e25d4daad38c3c81b4774f..7455b9492f054b32ee7fb1fc90b1a344367ceb81 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -74,6 +74,8 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType ConvOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { + int customized_type_value = + framework::OpKernelType::kDefaultCustomizedTypeValue; framework::LibraryType library{framework::LibraryType::kPlain}; // TODO(pzelazko-intel): enable MKLDNN layout when it's ready std::string data_format = ctx.Attr("data_format"); @@ -89,6 +91,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( platform::CanMKLDNNBeUsed(ctx)) { library = framework::LibraryType::kMKLDNN; layout = framework::DataLayout::kMKLDNN; + customized_type_value = kConvMKLDNNFP32; } #endif @@ -105,7 +108,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( } return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, - library); + library, customized_type_value); } void Conv2DOpMaker::Make() { @@ -342,6 +345,8 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType ConvOpGrad::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { + int customized_type_value = + framework::OpKernelType::kDefaultCustomizedTypeValue; framework::LibraryType library_{framework::LibraryType::kPlain}; // TODO(pzelazko-intel): enable MKLDNN layout when it's ready std::string data_format = ctx.Attr("data_format"); @@ -357,12 +362,13 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( platform::CanMKLDNNBeUsed(ctx)) { library_ = framework::LibraryType::kMKLDNN; layout_ = framework::DataLayout::kMKLDNN; + customized_type_value = kConvMKLDNNFP32; } #endif return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), - layout_, library_); + layout_, library_, customized_type_value); } } // namespace operators diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index e69814001e4da5d10e51ee57c1dbe291338b8b49..249f308c13ff5636fbaa6747b28cab7886b7e736 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -27,6 +27,8 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; +constexpr int kConvMKLDNNFP32 = 1; +constexpr int kConvMKLDNNINT8 = 2; // Base convolution operator definations for other conv // like operators to reuse the implementation. diff --git a/paddle/fluid/operators/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/conv_transpose_mkldnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..317d4cebe26b81ff03c212e6328233d5152ed1b4 --- /dev/null +++ b/paddle/fluid/operators/conv_transpose_mkldnn_op.cc @@ -0,0 +1,299 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/framework/data_layout_transform.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using framework::DataLayout; + +template +class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + const bool is_test = ctx.Attr("is_test"); + PADDLE_ENFORCE( + is_test == true, + "ConvTransposeMKLDNN works only for inference!. Set is_test = True"); + + auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* bias = ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; + auto* output = ctx.Output("Output"); + + PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && + input->format() != mkldnn::memory::format::format_undef, + "Wrong layout/format set for Input tensor"); + PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && + filter->format() != mkldnn::memory::format::format_undef, + "Wrong layout/format set for Filter tensor"); + PADDLE_ENFORCE(input->dims().size() == 4, + "Input must be with 4 dimensions, i.e. NCHW"); + PADDLE_ENFORCE(filter->dims().size() == 4, + "Filter must be with 4 dimensions, i.e. OIHW"); + + if (bias) { + PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN && + bias->format() != mkldnn::memory::format::format_undef, + "Wrong layout/format set for Bias tensor"); + PADDLE_ENFORCE(bias->dims().size() == 1, + "Bias must only have 1 dimension, i.e. X"); + } + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + + // TODO(tpatejko): add support for dilation + PADDLE_ENFORCE( + dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, + "dilation in convolution is not implemented yet"); + + const T* input_data = input->data(); + const T* filter_data = filter->data(); + + std::vector src_tz = paddle::framework::vectorize2int(input->dims()); + std::vector iohw_weights_tz = + paddle::framework::vectorize2int(filter->dims()); + std::vector weights_tz = iohw_weights_tz; + // IOHW -> OIHW + weights_tz[0] = iohw_weights_tz[1]; + weights_tz[1] = iohw_weights_tz[0]; + + // Custom Reorder from IOHW to OIHW + auto iohw2oihw_reorder = + [&iohw_weights_tz](const T* filter_data) -> std::shared_ptr { + int o = iohw_weights_tz[1]; + int c = iohw_weights_tz[0]; + int h = iohw_weights_tz[2]; + int w = iohw_weights_tz[3]; + std::shared_ptr reordered_filter_data(new T[o * c * h * w](), + std::default_delete()); + for (int i = 0; i < c; ++i) { + for (int j = 0; j < o; ++j) { + int in_offset = j * h * w + i * o * h * w; + int out_offset = j * c * h * w + i * h * w; + std::memcpy(&(reordered_filter_data.get())[out_offset], + &filter_data[in_offset], h * w * sizeof(T)); + } + } + + return reordered_filter_data; + }; + + int g = std::max(groups, 1); + if (g > 1) { + int o = weights_tz[0]; + int i = weights_tz[1]; + int h = weights_tz[2]; + int w = weights_tz[3]; + weights_tz.resize(5); + weights_tz[0] = g; + weights_tz[1] = o / g; + weights_tz[2] = i; + weights_tz[3] = h; + weights_tz[4] = w; + } + std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + + // Get unique name for storing MKLDNN primitives + const std::string key = platform::ConvTransposeMKLDNNHandler::GetHash( + src_tz, weights_tz, strides, paddings, dilations, groups, + ctx.op().Output("Output")); + const std::string key_conv_transpose_pd = key + "@conv_transpose_pd"; + + std::vector pipeline; + + auto user_src_md = platform::MKLDNNMemDesc( + {src_tz}, platform::MKLDNNGetDataType(), input->format()); + auto user_weights_md = + platform::MKLDNNMemDesc({weights_tz}, platform::MKLDNNGetDataType(), + (g == 1) ? mkldnn::memory::format::oihw + : mkldnn::memory::format::goihw); + + /* create memory descriptor for convolution without specified format + * ('any') which lets a primitive (convolution in this case) choose + * the memory format preferred for best performance + */ + std::string data_format = ctx.Attr("data_format"); + auto chosen_memory_format = + platform::data_format_to_memory_format(data_format); + bool fuse_relu = ctx.Attr("fuse_relu"); + + auto src_md = platform::MKLDNNMemDesc( + src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + auto weights_md = platform::MKLDNNMemDesc( + weights_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + std::vector bias_tz; // TODO(mgallus): avoid empty vector creation. + // Currently used whenever bias is != nullptr. + auto dst_md = platform::MKLDNNMemDesc( + dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + + // create a deconv(conv transpose) primitive descriptor and save it for + // usage in backward + std::shared_ptr + conv_transpose_pd; + auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference + : mkldnn::prop_kind::forward_training; + if (bias) { + bias_tz = paddle::framework::vectorize2int(bias->dims()); + auto bias_md = platform::MKLDNNMemDesc( + bias_tz, platform::MKLDNNGetDataType(), mkldnn::memory::format::x); + conv_transpose_pd = ConvTransposeFwdPrimitiveDesc( + src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine, + fuse_relu, fwd_prop_kind); + } else { + conv_transpose_pd = ConvTransposeFwdPrimitiveDesc( + src_md, weights_md, dst_md, strides, paddings, mkldnn_engine, + fuse_relu, fwd_prop_kind); + } + // Save conv_pd/src_memory/weights_memory for backward pass + if (!is_test) dev_ctx.SetBlob(key_conv_transpose_pd, conv_transpose_pd); + + platform::ConvTransposeMKLDNNHandler handler(conv_transpose_pd, dev_ctx, + mkldnn_engine, key); + + // create mkldnn memory from input tensors (data/weights) + auto user_src_memory_p = handler.AcquireSrcMemory( + user_src_md, platform::to_void_cast(input_data)); + auto user_weights_memory_p = handler.AcquireWeightsMemory( + user_weights_md, platform::to_void_cast(filter_data), + is_test ? iohw2oihw_reorder : platform::user_function()); + + // create reorder primitive if the input format is not the preferred one + auto src_memory_p = + handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline); + auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive( + user_weights_memory_p, pipeline, is_test); + + std::shared_ptr dst_memory_p; + + auto output_data = output->mutable_data( + ctx.GetPlace(), paddle::memory::Allocator::kDefault, + handler.GetDstMemorySize()); + dst_memory_p = handler.AcquireDstMemoryFromPrimitive( + platform::to_void_cast(output_data)); + + // create convolution op primitive + std::shared_ptr conv_p; + if (bias) { + const T* bias_data = bias->data(); + auto user_bias_md = + platform::MKLDNNMemDesc({bias_tz}, platform::MKLDNNGetDataType(), + mkldnn::memory::format::x); + auto user_bias_memory_p = handler.AcquireBiasMemory( + user_bias_md, platform::to_void_cast(bias_data)); + + auto bias_memory_p = + handler.AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline); + conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p, + bias_memory_p, dst_memory_p); + } else { + conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p, + dst_memory_p); + } + + // push primitive to stream and wait until it's executed + pipeline.push_back(*conv_p); + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + + output->set_layout(DataLayout::kMKLDNN); + output->set_format(platform::GetMKLDNNFormat(*dst_memory_p)); + } + + private: + mkldnn::primitive_attr CreatePostOps(bool fuse_relu) const { + mkldnn::primitive_attr conv_attr; + mkldnn::post_ops post_operations; + // Fusion with ReLU layer is executed through the PostOps feature. Create a + // PostOps object and configure it to execute an eltwise relu operation. + if (fuse_relu) { + constexpr float scale = 1.0f; + constexpr float negative_slope = 0.0f; + constexpr float placeholder = 0.0f; + post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu, + negative_slope, placeholder); + } + conv_attr.set_post_ops(post_operations); + return conv_attr; + } + + std::unique_ptr + ConvTransposeFwdPrimitiveDesc( + const mkldnn::memory::desc& src, const mkldnn::memory::desc& weights, + const mkldnn::memory::desc& dst, const std::vector& strides, + const std::vector& paddings, const mkldnn::engine& engine, + const bool fuse_relu, mkldnn::prop_kind fwd_prop_kind) const { + mkldnn::memory::dims stride_dims = {strides[0], strides[1]}; + mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]}; + + auto deconv_desc = mkldnn::deconvolution_forward::desc( + fwd_prop_kind, mkldnn::deconvolution_direct, src, weights, dst, + stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); + + mkldnn::primitive_attr deconv_attr = CreatePostOps(fuse_relu); + + auto p_conv_transpose_pd = + new mkldnn::deconvolution_forward::primitive_desc(deconv_desc, + deconv_attr, engine); + + return std::unique_ptr( + p_conv_transpose_pd); + } + + std::unique_ptr + ConvTransposeFwdPrimitiveDesc( + const mkldnn::memory::desc& src, const mkldnn::memory::desc& weights, + const mkldnn::memory::desc& bias, const mkldnn::memory::desc& dst, + const std::vector& strides, const std::vector& paddings, + const mkldnn::engine& engine, const bool fuse_relu, + mkldnn::prop_kind fwd_prop_kind) const { + mkldnn::memory::dims stride_dims = {strides[0], strides[1]}; + mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]}; + + auto deconv_desc = mkldnn::deconvolution_forward::desc( + fwd_prop_kind, mkldnn::deconvolution_direct, src, weights, bias, dst, + stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); + + mkldnn::primitive_attr deconv_attr = CreatePostOps(fuse_relu); + + auto p_conv_transpose_pd = + new mkldnn::deconvolution_forward::primitive_desc(deconv_desc, + deconv_attr, engine); + + return std::unique_ptr( + p_conv_transpose_pd); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(conv2d_transpose, MKLDNN, ::paddle::platform::CPUPlace, + ops::ConvTransposeMKLDNNOpKernel); diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index a916dd3496ffaffa138529a8a2f7e20ef26fcc96..2fdfc40d194224f0328161f5689da6246b1aae7f 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -16,6 +16,10 @@ limitations under the License. */ #include #include +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + namespace paddle { namespace operators { @@ -78,29 +82,38 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType ConvTransposeOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { + framework::LibraryType library_{framework::LibraryType::kPlain}; + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); bool use_cudnn = ctx.Attr("use_cudnn"); use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(ctx.GetPlace())) { auto& dev_ctx = ctx.template device_context(); use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } } #endif - framework::LibraryType library_; - if (use_cudnn) { - library_ = framework::LibraryType::kCUDNN; - } else { - library_ = framework::LibraryType::kPlain; +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; } +#endif - std::string data_format = ctx.Attr("data_format"); - framework::DataLayout layout_ = framework::StringToDataLayout(data_format); return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), layout_, library_); } void Conv2DTransposeOpMaker::Make() { + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); AddInput( "Input", "(Tensor) The input tensor of convolution transpose operator. " @@ -145,6 +158,11 @@ void Conv2DTransposeOpMaker::Make() { "use_cudnn", "(bool, default false) Only used in cudnn kernel, need install cudnn") .SetDefault(false); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr("fuse_relu", "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddAttr( "data_format", "(string, default NCHW) Only used in " @@ -238,6 +256,9 @@ void Conv3DTransposeOpMaker::Make() { "use_cudnn", "(bool, default false) Only used in cudnn kernel, need install cudnn") .SetDefault(false); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddAttr( "data_format", "(string, default NCHW) Only used in " diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e63d57be57a66e8e02f7ef88acd01246302bc53c --- /dev/null +++ b/paddle/fluid/operators/cudnn_lstm_op.cc @@ -0,0 +1,218 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class CudnnLSTMOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), + "Input(Weight) of LSTM should not be null."); + + PADDLE_ENFORCE(ctx->HasInput("InitH"), + "Input(init_h) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("InitC"), + "Input(init_c) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Cache"), + "Input(Cache) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("last_h"), + "Output(last_h) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("last_c"), + "Output(last_c) of LSTM should not be null."); + + auto in_dims = ctx->GetInputDim("Input"); + PADDLE_ENFORCE_EQ(in_dims.size(), 3, "Input(X)'s rank must be 3."); + + ctx->SetOutputDim("Out", ctx->GetInputDim("Input")); + ctx->SetOutputDim("last_h", ctx->GetInputDim("InitH")); + ctx->SetOutputDim("last_c", ctx->GetInputDim("InitC")); + } +}; + +class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "Input", + "(Tensor) RNN input tensor, which support variable-time length input " + "sequence." + "The shape of the Tensor MUST be ( seq_len * batch_size * input_size)" + "seq_len is the total time step in this mini-batch (CAN be change in " + "different batch)" + "batch_size is the instance number of this batch" + "input_size is the hidden size of the input." + "input_hidden_size and the hidden_size in the next may not be same"); + AddInput("InitH", + "(Tensor) the initial hidden state of the LSTM" + "input. This is a tensor with shape (num_layers x batch_size x " + "hidden_size)" + "and When is_bidirec is True, the shape will be (num_layers*2 x " + "batch_size x hidden_size)"); + AddInput("InitC", + "(Tensor) the initial cell state of the LSTm " + "input. This is a tensor with shape (num_layers x batch_size x " + "hidden_size)" + "and When is_bidirec is True, the shape will be (num_layers*2 x " + "batch_size x hidden_size)"); + AddInput("W", + "(Tensor) the learnable hidden-hidden weights." + " The shape is (N), where N is total weight size of the LSTM. " + " cudnn concatenate all the weight to one Tensor"); + AddInput("Cache", + "The cache of dropout op, a RAW type variable including random " + "number generator states and some descriptors, which is used in " + "cudnn kernel.") + .AsDispensable(); + AddOutput("Out", + "(Tensor) the hidden state of LSTM operator. " + "The shape is ( seq_len x batch_size x hidden_size) if " + "is_bidirec is False" + "and When is_bidirec is True, the shape will be ( seq_len x " + "batch_size x hidden_size * 2) "); + AddOutput("last_h", + "(Tensor) the hidden state of the last step. " + "The shape is ( num_layers x batch_size x hidden_size) if " + "is_bidirec is False" + "and When is_bidirec is True, the shape will be (num_layers*2 x " + "batch_size x hidden_size)"); + AddOutput("last_c", + "(Tensor) the cell state of the last step" + "The shape is ( num_layers x batch_size x hidden_size) if " + "is_bidirec is False" + "and When is_bidirect is True, the shape will be (num_layers*2 x " + "batch_size x hidden_size*2)"); + AddAttr("max_len", + "max length of the LSTM op" + "the first dim of the Input can NOT be greater than max_len") + .SetDefault(20); + AddAttr( + "dropout_prob", + "dropout prob of the dropout op" + "the dropout ONLY work between lstm layers, not between time steps" + "There is no dropout work on the Out tensor") + .SetDefault(0.0); + AddAttr("is_bidirec", + "is_bidirec" + "if it is bidirection rnn" + "The will affect the shape of the Out, last_h, and last_c") + .SetDefault(false); + AddAttr("input_size", "input size ot the Input Tensor").SetDefault(10); + AddAttr("hidden_size", "hidden size of the LSTM").SetDefault(100); + AddAttr("num_layers", "the total layer number of the LSTM") + .SetDefault(1); + AddAttr("is_test", "True if in test phase.").SetDefault(false); + AddAttr("seed", "seed to used if fix_seed is True").SetDefault(-1); + AddComment(R"DOC( +CUDNN LSTM implementation + +A four-gate Long Short-Term Memory network with no peephole connections. +In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, +the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations: + +$$ i_t = sigmoid(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$ + +$$ f_t = sigmoid(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$ + +$$ o_t = sigmoid(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$ + +$$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$ + +$$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$ + +$$ h_t = o_t \\odot tanh(c_t) $$ + +- W terms denote weight matrices (e.g. $W_{ix}$ is the matrix + of weights from the input gate to the input) +- The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector). +- sigmoid is the logistic sigmoid function. +- $i, f, o$ and $c$ are the input gate, forget gate, output gate, + and cell activation vectors, respectively, all of which have the same size as + the cell output activation vector $h$. +- The $\odot$ is the element-wise product of the vectors. +- `tanh` is the activation functions. +- $\tilde{c_t}$ is also called candidate hidden state, + which is computed based on the current input and the previous hidden state. + +Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, +X represensts a matrix multiplication + + +)DOC"); + } +}; + +class CudnnLSTMGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("last_h"), + "Input(last_h) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("last_c"), + "Input(last_c) of LSTM should not be null."); + + PADDLE_ENFORCE(ctx->HasInput("Cache"), + "Input(last_c) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("InitH"), + "Input(init_h) of LSTM should not be null."); + + PADDLE_ENFORCE(ctx->HasInput("InitC"), + "Input(init_c) of LSTM should not be null."); + + auto SetOutGradDim = [&ctx](const std::string& name) { + auto g_name = framework::GradVarName(name); + if (ctx->HasOutput(g_name)) { + ctx->SetOutputDim(g_name, ctx->GetInputDim(name)); + } + }; + + SetOutGradDim("Input"); + SetOutGradDim("W"); + SetOutGradDim("InitH"); + SetOutGradDim("InitC"); + } +}; + +template +class NotImpleKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW( + "CPU is not support for this kernel now. Will be add in the future"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(cudnn_lstm, ops::CudnnLSTMOp, ops::CudnnLSTMOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(cudnn_lstm_grad, ops::CudnnLSTMGradOp); + +REGISTER_OP_CPU_KERNEL(cudnn_lstm, ops::NotImpleKernel); +REGISTER_OP_CPU_KERNEL(cudnn_lstm_grad, ops::NotImpleKernel); diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..dd64cc327fc383937bc9a9d6e7daa0cec488e4cc --- /dev/null +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -0,0 +1,493 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +struct CudnnRNNCache { + CudnnRNNCache() { + x_desc_ = NULL; + y_desc_ = NULL; + dx_desc_ = NULL; + dy_desc_ = NULL; + } + ~CudnnRNNCache() { release(); } + + cudnnRNNDescriptor_t rnn_desc_; + cudnnTensorDescriptor_t *x_desc_; + cudnnTensorDescriptor_t *y_desc_; + cudnnTensorDescriptor_t *dx_desc_; + cudnnTensorDescriptor_t *dy_desc_; + + cudnnTensorDescriptor_t hx_desc_; + cudnnTensorDescriptor_t cx_desc_; + cudnnTensorDescriptor_t hy_desc_; + cudnnTensorDescriptor_t cy_desc_; + + cudnnTensorDescriptor_t dhx_desc_; + cudnnTensorDescriptor_t dcx_desc_; + cudnnTensorDescriptor_t dhy_desc_; + cudnnTensorDescriptor_t dcy_desc_; + + cudnnTensorDescriptor_t output_x_desc_; + cudnnTensorDescriptor_t output_y_desc_; + + cudnnDropoutDescriptor_t dropout_desc_; + + size_t weights_size_; + cudnnFilterDescriptor_t w_desc_; + cudnnFilterDescriptor_t dw_desc_; + + size_t workspace_size_; + size_t reserve_size_; + Tensor reserve_data_; + Tensor workspace_data_; + + Tensor dropout_state_; + + size_t max_length_; + + float dropout_prob_; + bool is_bidirec_; + + int batch_size_; + int input_size_; + int hidden_size_; + int num_layers_; + int seed_; + + void init(cudnnHandle_t handle, const framework::ExecutionContext &ctx, + size_t max_len, int batch_size, int input_size, int hidden_size, + int num_layers, float dropout_prob, bool is_bidirec, int seed, + int weight_numel) { + max_length_ = max_len; + batch_size_ = batch_size; + input_size_ = input_size; + hidden_size_ = hidden_size; + num_layers_ = num_layers; + dropout_prob_ = dropout_prob; + is_bidirec_ = is_bidirec; + seed_ = seed; + + x_desc_ = new cudnnTensorDescriptor_t[max_length_]; + y_desc_ = new cudnnTensorDescriptor_t[max_length_]; + dx_desc_ = new cudnnTensorDescriptor_t[max_length_]; + dy_desc_ = new cudnnTensorDescriptor_t[max_length_]; + int dim_a[3]; + int stride_a[3]; + + for (size_t i = 0; i < max_length_; ++i) { + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&dx_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&dy_desc_[i])); + dim_a[0] = batch_size_; + dim_a[1] = input_size_; + dim_a[2] = 1; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + x_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dx_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + + dim_a[0] = batch_size_; + dim_a[1] = is_bidirec_ ? hidden_size_ * 2 : hidden_size_; + dim_a[2] = 1; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + y_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dy_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + } + + dim_a[0] = num_layers_ * (is_bidirec_ ? 2 : 1); + dim_a[1] = batch_size_; + dim_a[2] = hidden_size_; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + hx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + cx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + hy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + cy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dhx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dcx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dhy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dcy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + + CUDNN_ENFORCE( + platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_)); + + size_t state_size; + CUDNN_ENFORCE( + platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size); + dropout_state_.Resize({static_cast(state_size)})); + auto *dropout_state_data = + dropout_state_.mutable_data(ctx.GetPlace()); + CUDNN_ENFORCE(platform::dynload::cudnnSetDropoutDescriptor( + dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size, + seed_)); + + CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_)); + +#if CUDNN_VERSION >= 6000 + CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6( + handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, + CUDNN_LINEAR_INPUT, + is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, + CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT)); +#else + CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor( + rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, + is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, + CUDNN_DATA_FLOAT)); +#endif + + CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNParamsSize( + handle, rnn_desc_, x_desc_[0], &weights_size_, CUDNN_DATA_FLOAT)); + + PADDLE_ENFORCE_EQ(weights_size_, sizeof(float) * weight_numel, + "cudnn lstm weight size should be SAME"); + int dim_w[3]; + dim_w[0] = weights_size_ / sizeof(float); + dim_w[1] = 1; + dim_w[2] = 1; + CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( + w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); + CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( + dw_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); + + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNWorkspaceSize( + handle, rnn_desc_, max_length_, x_desc_, &workspace_size_)); + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNTrainingReserveSize( + handle, rnn_desc_, max_length_, x_desc_, &reserve_size_)); + + reserve_data_.Resize({static_cast(reserve_size_)}); + reserve_data_.mutable_data(ctx.GetPlace()); + + workspace_data_.Resize({static_cast(workspace_size_)}); + workspace_data_.mutable_data(ctx.GetPlace()); + } + + void release() { + for (size_t i = 0; i < max_length_; ++i) { + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(dx_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(dy_desc_[i])); + } + + delete[] x_desc_; + delete[] y_desc_; + delete[] dx_desc_; + delete[] dy_desc_; + + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_)); + + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(w_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_)); + } +}; + +template +class CudnnLSTMGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const Tensor *x = ctx.Input("Input"); + const Tensor *init_h = ctx.Input("InitH"); + const Tensor *init_c = ctx.Input("InitC"); + + auto w = ctx.Input("W"); + + Tensor *out = ctx.Output("Out"); + Tensor *last_h = ctx.Output("last_h"); + Tensor *last_c = ctx.Output("last_c"); + + const T *x_data = x->data(); + const T *init_h_data = init_h->data(); + const T *init_c_data = init_c->data(); + + const T *w_data = w->data(); + + T *out_data = out->mutable_data(ctx.GetPlace()); + T *last_h_data = last_h->mutable_data(ctx.GetPlace()); + T *last_c_data = last_c->mutable_data(ctx.GetPlace()); + + size_t max_len = ctx.Attr("max_len"); + float dropout_prob = ctx.Attr("dropout_prob"); + bool is_bidirec = ctx.Attr("is_bidirec"); + int input_size = ctx.Attr("input_size"); + int hidden_size = ctx.Attr("hidden_size"); + int num_layers = ctx.Attr("num_layers"); + bool is_test = ctx.Attr("is_test"); + + auto &dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + auto *cache_var = ctx.InputVar("Cache"); + if (!cache_var) { + // The RAW type cache variable wouldn't be created and broadcasted on + // multi-devices before the first running. + // use parent scope to make cache persistable + auto *scope = const_cast(ctx.scope().parent()); + auto cache_var_name = ctx.Inputs("Cache")[0]; + cache_var = scope->Var(cache_var_name); + } + CudnnRNNCache *cudnn_rnn_cache = nullptr; + if (cache_var->IsInitialized()) { + cudnn_rnn_cache = const_cast(cache_var) + ->GetMutable(); + } else { + cudnn_rnn_cache = const_cast(cache_var) + ->GetMutable(); + std::random_device rnd; + int seed = ctx.Attr("seed"); + if (seed == -1) { + seed = rnd(); + } + + auto input_w_numel = w->numel(); + auto batch_size = x->dims()[1]; + cudnn_rnn_cache->init(handle, ctx, max_len, batch_size, input_size, + hidden_size, num_layers, dropout_prob, is_bidirec, + seed, input_w_numel); + } + + auto run_seq_len = x->dims()[0]; + + if (is_test) { + // for inference + CUDNN_ENFORCE(platform::dynload::cudnnRNNForwardInference( + handle, cudnn_rnn_cache->rnn_desc_, run_seq_len, + cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_, + init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data, + cudnn_rnn_cache->w_desc_, w_data, cudnn_rnn_cache->y_desc_, out_data, + cudnn_rnn_cache->hy_desc_, last_h_data, cudnn_rnn_cache->cy_desc_, + last_c_data, cudnn_rnn_cache->workspace_data_.data(), + cudnn_rnn_cache->workspace_size_)); + } else { + // for train + CUDNN_ENFORCE(platform::dynload::cudnnRNNForwardTraining( + handle, cudnn_rnn_cache->rnn_desc_, run_seq_len, + cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_, + init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data, + cudnn_rnn_cache->w_desc_, w_data, cudnn_rnn_cache->y_desc_, out_data, + cudnn_rnn_cache->hy_desc_, last_h_data, cudnn_rnn_cache->cy_desc_, + last_c_data, cudnn_rnn_cache->workspace_data_.data(), + cudnn_rnn_cache->workspace_size_, + cudnn_rnn_cache->reserve_data_.data(), + cudnn_rnn_cache->reserve_size_)); + } + } +}; + +template +class CudnnLSTMGPUGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *input = ctx.Input("Input"); + auto *weight = ctx.Input("W"); + auto *init_h = ctx.Input("InitH"); + auto *init_c = ctx.Input("InitC"); + // auto * last_h = ctx.Input("last_h"); + // auto * last_c = ctx.Input("last_c"); + auto *out = ctx.Input("Out"); + auto *out_grad = ctx.Input(framework::GradVarName("Out")); + auto *last_h_grad = ctx.Input(framework::GradVarName("last_h")); + auto *last_c_grad = ctx.Input(framework::GradVarName("last_c")); + + // auto* init_h = ctx.Input("init_h"); + // auto* init_c = ctx.Input("init_c"); + + auto *in_grad = ctx.Output(framework::GradVarName("Input")); + auto *weight_grad = ctx.Output(framework::GradVarName("W")); + auto *init_h_grad = ctx.Output(framework::GradVarName("InitH")); + auto *init_c_grad = ctx.Output(framework::GradVarName("InitC")); + + auto &dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + auto *cache_var = ctx.InputVar("Cache"); + PADDLE_ENFORCE(cache_var->IsInitialized()); + CudnnRNNCache *cudnn_rnn_cache = + const_cast(cache_var) + ->GetMutable(); + + auto input_dims = input->dims(); + auto weight_dims = weight->dims(); + auto init_h_dims = init_h->dims(); + auto init_c_dims = init_c->dims(); + in_grad->mutable_data(ctx.GetPlace()); + weight_grad->mutable_data(ctx.GetPlace()); + math::SetConstant zero; + zero(dev_ctx, in_grad, static_cast(0.0)); + zero(dev_ctx, weight_grad, static_cast(0.0)); + + T *init_h_grad_data = NULL; + if (init_h_grad == nullptr) { + Tensor init_h_grad_temp; + init_h_grad_temp.mutable_data(init_h_dims, ctx.GetPlace()); + zero(dev_ctx, &init_h_grad_temp, static_cast(0.0)); + + init_h_grad_data = init_h_grad_temp.data(); + } else { + init_h_grad->mutable_data(init_h_dims, ctx.GetPlace()); + zero(dev_ctx, init_h_grad, static_cast(0.0)); + init_h_grad_data = init_h_grad->data(); + } + + T *init_c_grad_data = NULL; + if (init_c_grad == nullptr) { + Tensor init_c_grad_temp; + init_c_grad_temp.mutable_data(init_c_dims, ctx.GetPlace()); + zero(dev_ctx, &init_c_grad_temp, static_cast(0.0)); + + init_c_grad_data = init_c_grad_temp.data(); + } else { + init_c_grad->mutable_data(init_c_dims, ctx.GetPlace()); + zero(dev_ctx, init_c_grad, static_cast(0.0)); + init_c_grad_data = init_c_grad->data(); + } + + const T *last_h_grad_data = NULL; + if (last_h_grad == nullptr) { + Tensor last_h_grad_temp; + last_h_grad_temp.mutable_data(init_h_dims, ctx.GetPlace()); + zero(dev_ctx, &last_h_grad_temp, static_cast(0.0)); + + last_h_grad_data = (const T *)last_h_grad_temp.data(); + } else { + last_h_grad_data = last_h_grad->data(); + } + + const T *last_c_grad_data = NULL; + if (last_c_grad == nullptr) { + Tensor last_c_grad_temp; + last_c_grad_temp.mutable_data(init_c_dims, ctx.GetPlace()); + zero(dev_ctx, &last_c_grad_temp, static_cast(0.0)); + + last_c_grad_data = (const T *)last_c_grad_temp.data(); + } else { + last_c_grad_data = last_c_grad->data(); + } + + const T *out_grad_data = NULL; + if (out_grad == nullptr) { + Tensor out_grad_temp; + out_grad_temp.mutable_data(out->dims(), ctx.GetPlace()); + zero(dev_ctx, &out_grad_temp, static_cast(0.0)); + + out_grad_data = (const T *)out_grad_temp.data(); + } else { + out_grad_data = out_grad->data(); + } + + // zero( dev_ctx, last_h_grad, static_cast(0.0)); + // zero( dev_ctx, last_c_grad, static_cast(0.0)); + + auto out_data = out->data(); + // auto out_grad_data = out_grad->data(); + auto weight_data = weight->data(); + auto init_h_data = init_h->data(); + auto init_c_data = init_c->data(); + auto in_grad_data = in_grad->data(); + + auto work_data = cudnn_rnn_cache->workspace_data_.data(); + auto reserve_data = cudnn_rnn_cache->reserve_data_.data(); + + auto run_seq_len = input_dims[0]; + PADDLE_ENFORCE_LE((size_t)run_seq_len, cudnn_rnn_cache->max_length_, + "cudnn running seq_len CAN not greater max_lengh"); + CUDNN_ENFORCE(platform::dynload::cudnnRNNBackwardData( + handle, cudnn_rnn_cache->rnn_desc_, run_seq_len, + cudnn_rnn_cache->y_desc_, out_data, cudnn_rnn_cache->dy_desc_, + out_grad_data, cudnn_rnn_cache->dhy_desc_, last_h_grad_data, + cudnn_rnn_cache->dcy_desc_, last_c_grad_data, cudnn_rnn_cache->w_desc_, + weight_data, cudnn_rnn_cache->hx_desc_, init_h_data, + cudnn_rnn_cache->cx_desc_, init_c_data, cudnn_rnn_cache->dx_desc_, + in_grad_data, cudnn_rnn_cache->dhx_desc_, init_h_grad_data, + cudnn_rnn_cache->dcx_desc_, init_c_grad_data, work_data, + cudnn_rnn_cache->workspace_size_, reserve_data, + cudnn_rnn_cache->reserve_size_)); + + CUDNN_ENFORCE(platform::dynload::cudnnRNNBackwardWeights( + handle, cudnn_rnn_cache->rnn_desc_, run_seq_len, + cudnn_rnn_cache->x_desc_, input->data(), cudnn_rnn_cache->hx_desc_, + init_h->data(), cudnn_rnn_cache->y_desc_, out->data(), + cudnn_rnn_cache->workspace_data_.data(), + cudnn_rnn_cache->workspace_size_, cudnn_rnn_cache->dw_desc_, + weight_grad->data(), cudnn_rnn_cache->reserve_data_.data(), + cudnn_rnn_cache->reserve_size_)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(cudnn_lstm, ops::CudnnLSTMGPUKernel); +REGISTER_OP_CUDA_KERNEL(cudnn_lstm_grad, ops::CudnnLSTMGPUGradKernel); diff --git a/paddle/fluid/operators/detection/box_coder_op.h b/paddle/fluid/operators/detection/box_coder_op.h index 5ed8520acddfa8fe2105a7c1615bcb3243cb130f..b2a2bcdce932032a761a1fc064fe622f7629f9bf 100644 --- a/paddle/fluid/operators/detection/box_coder_op.h +++ b/paddle/fluid/operators/detection/box_coder_op.h @@ -43,6 +43,9 @@ class BoxCoderKernel : public framework::OpKernel { const T* prior_box_var_data = nullptr; if (prior_box_var) prior_box_var_data = prior_box_var->data(); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(2) +#endif for (int64_t i = 0; i < row; ++i) { for (int64_t j = 0; j < col; ++j) { T prior_box_width = prior_box_data[j * len + 2] - @@ -96,6 +99,9 @@ class BoxCoderKernel : public framework::OpKernel { const T* prior_box_var_data = nullptr; if (prior_box_var) prior_box_var_data = prior_box_var->data(); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(2) +#endif for (int64_t i = 0; i < row; ++i) { for (int64_t j = 0; j < col; ++j) { size_t offset = i * col * len + j * len; diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 21db93958a4a586c74a1e060f1f04b5af1dcd889..36979de68f3abfdedfcc4a49cc312c1f849f5676 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -9,36 +9,37 @@ else() endif() configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY) +set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + if(WITH_GRPC) grpc_library(sendrecvop_grpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc PROTO send_recv.proto DEPS lod_tensor selected_rows memory) - set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(grpc_serde_test SRCS grpc_serde_test.cc DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL) cc_test(rpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL) cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler) - return() -endif() - - -set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_grpc memory) +else() + set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc + brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc - brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc + brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc + PROTO send_recv.proto + DEPS lod_tensor selected_rows memory) -brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc - brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc - PROTO send_recv.proto - DEPS lod_tensor selected_rows memory) + cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_brpc memory) -set(brpc_test_depends sendrecvop_brpc brpc ssl crypto protobuf leveldb gflags glog executor proto_desc lookup_table_op snappystream snappy) + set(brpc_test_depends sendrecvop_brpc brpc ssl crypto protobuf leveldb gflags glog executor proto_desc lookup_table_op snappystream snappy) -cc_test(brpc_server_test SRCS rpc_server_test.cc - DEPS ${brpc_test_depends} SERIAL) + cc_test(brpc_server_test SRCS rpc_server_test.cc + DEPS ${brpc_test_depends} SERIAL) -cc_test(brpc_serde_test SRCS brpc_serde_test.cc - DEPS ${brpc_test_depends} SERIAL) + cc_test(brpc_serde_test SRCS brpc_serde_test.cc + DEPS ${brpc_test_depends} SERIAL) +endif() diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index 62a2c4d94dea51f87c23503390713776d6b2adce..d7f3ea86aff9e7df3cd9ff3dca573a1ec6ccc27a 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -171,11 +171,13 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, const framework::Scope& scope, const std::string& in_var_name, const std::string& out_var_name, + const std::string& table_name, int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string in_var_name_val = in_var_name; const std::string out_var_name_val = out_var_name; + const std::string table_name_val = table_name; const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); GetProcessor* s = new GetProcessor(ch); @@ -186,11 +188,12 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, s->Prepare(h, time_out); framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, - s, method, h, this] { + s, method, h, table_name_val, this] { auto* var = p_scope->FindVar(in_var_name_val); ::grpc::ByteBuffer req; - SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val); + SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val, + 0, table_name_val); VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h index d8e9cee85bd734c2ed4b1cae03ecee04e304b651..a31a465645ee4256a76573576ea7fa5af7a5a101 100644 --- a/paddle/fluid/operators/distributed/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc_client.h @@ -194,6 +194,7 @@ class GRPCClient : public RPCClient { const framework::Scope& scope, const std::string& in_var_name, const std::string& out_var_name, + const std::string& table_name = "", int64_t time_out = FLAGS_rpc_deadline) override; VarHandlePtr AsyncSendBatchBarrier( diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc_serde.cc index e6856676d49e867214801810949076151e34356a..31fac2133cf159719474207407c52bb96e80e131 100644 --- a/paddle/fluid/operators/distributed/grpc_serde.cc +++ b/paddle/fluid/operators/distributed/grpc_serde.cc @@ -42,7 +42,8 @@ static void SerializeDestroyCallback(void* payload) { void SerializeToByteBuffer(const std::string& name, framework::Variable* var, const platform::DeviceContext& ctx, ::grpc::ByteBuffer* msg, const std::string& out_name, - const int trainer_id) { + const int trainer_id, + const std::string& table_name) { platform::RecordRPCEvent record_event("serial", &ctx); VarMsg request; TensorPayload* payload = nullptr; @@ -63,6 +64,9 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, if (!out_name.empty()) { request.set_out_varname(out_name); } + if (!table_name.empty()) { + request.set_table_name(table_name); + } if (var->IsType()) { request.set_type(::sendrecv::LOD_TENSOR); payload = new TensorPayload(GetTensorPayload(var, ctx, &request)); diff --git a/paddle/fluid/operators/distributed/grpc_serde.h b/paddle/fluid/operators/distributed/grpc_serde.h index 17290d3fb4478191c59623913a82d4142d3c49f9..16f5293b0eb413dc43a28193cfd224090aeed659 100644 --- a/paddle/fluid/operators/distributed/grpc_serde.h +++ b/paddle/fluid/operators/distributed/grpc_serde.h @@ -40,7 +40,8 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, const platform::DeviceContext& ctx, ::grpc::ByteBuffer* msg, const std::string& out_varname = std::string(), - const int trainer_id = 0); + const int trainer_id = 0, + const std::string& table_name = std::string()); void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, const platform::DeviceContext& ctx, diff --git a/paddle/fluid/operators/distributed/grpc_serde_test.cc b/paddle/fluid/operators/distributed/grpc_serde_test.cc index 96ea05e74ed76768248a27ab435dc801b7d1b995..1936c2c623a779c2599aa560247fa5e24f28cd62 100644 --- a/paddle/fluid/operators/distributed/grpc_serde_test.cc +++ b/paddle/fluid/operators/distributed/grpc_serde_test.cc @@ -130,7 +130,8 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) { math::set_constant(ctx, tensor, 31.9); ::grpc::ByteBuffer msg; - operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg); + operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg, + "outvar", 0, "table_name"); EXPECT_GT(msg.Length(), static_cast(0)); // deserialize diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc index 28a8f1eda043880a2b99a1259c7c5071f3aef61c..d9200c98b23601f8ffaa8eb7a7092a9cf881ca24 100644 --- a/paddle/fluid/operators/distributed/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc_server.cc @@ -183,6 +183,7 @@ class RequestPrefetch final : public RequestBase { // prefetch process... std::string in_var_name = request_->Varname(); std::string out_var_name = request_->OutVarname(); + std::string table_name = request_->TableName(); int trainer_id = request_->GetTrainerId(); VLOG(4) << "RequestPrefetch, in_var_name: " << in_var_name << " out_var_name: " << out_var_name; @@ -193,7 +194,7 @@ class RequestPrefetch final : public RequestBase { framework::Variable* outvar = scope->Var(out_var_name); request_handler_->Handle(in_var_name, scope, invar, &outvar, trainer_id, - out_var_name); + out_var_name, table_name); SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(), &reply_); diff --git a/paddle/fluid/operators/distributed/grpc_variable_response.cc b/paddle/fluid/operators/distributed/grpc_variable_response.cc index d6d219d4369ba785e5c369538d4a18dc682952c1..76ad02b0300a58cd19ff2541ad53d067197f4177 100644 --- a/paddle/fluid/operators/distributed/grpc_variable_response.cc +++ b/paddle/fluid/operators/distributed/grpc_variable_response.cc @@ -301,6 +301,20 @@ int GRPCVariableResponse::Parse(Source* source) { meta_.set_trainer_id(trainer_id); break; } + case sendrecv::VariableMessage::kTableNameFieldNumber: { + uint32_t length; + if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { + return tag; + } + + std::string temp; + if (!input.ReadString(&temp, length)) { + return tag; + } + + meta_.set_table_name(temp); + break; + } default: { // Unknown tag, return unknown error. return -1; diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc new file mode 100644 index 0000000000000000000000000000000000000000..cf14538b1c284d297242197088a66cc156b1762c --- /dev/null +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -0,0 +1,255 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "paddle/fluid/operators/distributed/parameter_prefetch.h" + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor.h" + +#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/rpc_client.h" +#include "paddle/fluid/operators/distributed/variable_response.h" +#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" + +namespace paddle { +namespace operators { +namespace distributed { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; +using DDim = framework::DDim; + +static size_t GetSectionIndex(int64_t id, + const std::vector& abs_sections) { + for (size_t i = 1; i < abs_sections.size(); ++i) { + if (id < abs_sections[i]) { + return i - 1; + } + } + return abs_sections.size() - 1; +} + +static std::vector ToAbsoluteSection( + const std::vector& height_sections) { + std::vector abs_sections; + abs_sections.resize(height_sections.size()); + abs_sections[0] = 0; + for (size_t i = 1; i < height_sections.size(); ++i) { + abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1]; + } + return abs_sections; +} + +static std::vector> SplitIds( + const std::vector& ids_vector, + const std::vector& height_section, framework::Scope* scope) { + std::set all_ids; + for (auto id : ids_vector) { + all_ids.insert(id); + } + + auto abs_sections = ToAbsoluteSection(height_section); + std::vector> splited_ids; + splited_ids.resize(height_section.size() + 1); + for (auto& id : all_ids) { + auto section_index = GetSectionIndex(id, abs_sections); + splited_ids[section_index].push_back(id - abs_sections[section_index]); + } + return splited_ids; +} + +static void SplitIdsIntoMultipleVarsBySection( + const std::vector& in_var_names, + const std::vector& height_section, + const std::vector>& splited_ids, + framework::Scope* scope) { + PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size(), ""); + + auto place = platform::CPUPlace(); + + for (size_t i = 0; i < in_var_names.size(); ++i) { + auto* id_tensor = + scope->Var(in_var_names[i])->GetMutable(); + auto& ids = splited_ids[i]; + if (!ids.empty()) { + auto* id_tensor_data = id_tensor->mutable_data( + framework::make_ddim({static_cast(ids.size()), 1}), place); + memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size()); + } + } +} + +static void MergeMultipleVarsIntoOneBySection( + const std::string& id_name, const std::vector& ids_vector, + const std::string& out_name, const std::vector& out_var_names, + const std::vector& height_section, + const std::vector>& splited_ids, + const framework::ExecutionContext& context, framework::Scope* scope, + platform::DeviceContext* actual_ctx) { + PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size(), ""); + + auto cpu_place = platform::CPUPlace(); + + auto abs_sections = ToAbsoluteSection(height_section); + std::unordered_map> id_to_offset; + for (size_t i = 0; i < ids_vector.size(); ++i) { + id_to_offset[ids_vector[i]].push_back(i); + } + + auto& id_tensor = scope->FindVar(id_name)->Get(); + auto* out_tensor = + scope->FindVar(out_name)->GetMutable(); + auto* out_tensor_data = out_tensor->mutable_data(id_tensor.place()); + + bool is_on_cpu_place = true; + if (!platform::is_cpu_place(id_tensor.place())) { + is_on_cpu_place = false; + } + + for (size_t section_idx = 0; section_idx < out_var_names.size(); + ++section_idx) { + auto& ids_in_this_section = splited_ids[section_idx]; + if (!ids_in_this_section.empty()) { + auto& prefetch_out_var = + scope->Var(out_var_names[section_idx])->Get(); + const auto* out_var_data = prefetch_out_var.data(); + auto& dims = prefetch_out_var.dims(); + + PADDLE_ENFORCE_EQ(dims.size(), 2, ""); + PADDLE_ENFORCE_EQ(ids_in_this_section.size(), dims[0]); + + auto row_numel = dims[1]; + + for (size_t i = 0; i < dims[0]; ++i) { + auto id = ids_in_this_section[i]; + auto origin_id = id + abs_sections[section_idx]; + auto& offsets = id_to_offset[origin_id]; + for (auto& offset : offsets) { + // should support GPU tensor + if (is_on_cpu_place) { + memory::Copy(cpu_place, out_tensor_data + offset * row_numel, + cpu_place, out_var_data + i * row_numel, + sizeof(float) * row_numel); + } else { +#ifndef PADDLE_WITH_CUDA + PADDLE_THROW("paddle is not compiled with CUDA!"); +#else + auto stream = + static_cast(actual_ctx)->stream(); + memory::Copy(boost::get(id_tensor.place()), + out_tensor_data + offset * row_numel, cpu_place, + out_var_data + i * row_numel, + sizeof(float) * row_numel, stream); +#endif + } + } + } + } else { + VLOG(3) << "ids in this section is empty"; + } + } +} + +void prefetch(const std::string& id_name, const std::string& out_name, + const std::vector& table_names, + const std::vector& epmap, + const std::vector& height_sections, + const framework::ExecutionContext& context) { + auto& local_scope = context.scope().NewScope(); + + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& cpu_ctx = *pool.Get(platform::CPUPlace()); + auto& actual_ctx = *pool.Get(context.GetPlace()); + + distributed::RPCClient* rpc_client = + distributed::RPCClient::GetInstance( + context.Attr("trainer_id")); + + std::vector in_var_names; + std::vector out_var_names; + for (size_t i = 0; i < epmap.size(); ++i) { + in_var_names.push_back(id_name + "@" + epmap[i]); + out_var_names.push_back(out_name + "@" + epmap[i]); + } + + auto& id_tensor = local_scope.FindVar(id_name)->Get(); + std::vector ids_vector; + if (platform::is_cpu_place(id_tensor.place())) { + auto* id_data = id_tensor.data(); + for (size_t i = 0; i < id_tensor.numel(); ++i) { + ids_vector.push_back(id_data[i]); + } + } else { +#ifndef PADDLE_WITH_CUDA + PADDLE_THROW("paddle is not compiled with CUDA!"); +#else + auto cpu_place = platform::CPUPlace(); + framework::Tensor cpu_tensor; + auto* cpu_tensor_data = + cpu_tensor.mutable_data(id_tensor.dims(), cpu_place); + auto stream = + static_cast(&actual_ctx)->stream(); + memory::Copy(cpu_place, cpu_tensor_data, + boost::get(id_tensor.place()), + id_tensor.data(), sizeof(int64_t) * id_tensor.numel(), + stream); + for (size_t i = 0; i < cpu_tensor.numel(); ++i) { + ids_vector.push_back(cpu_tensor_data[i]); + } +#endif + } + + auto splited_ids = SplitIds(ids_vector, height_sections, &local_scope); + SplitIdsIntoMultipleVarsBySection(in_var_names, height_sections, splited_ids, + &local_scope); + + // create output var in local scope + for (auto& name : out_var_names) { + local_scope.Var(name)->GetMutable(); + } + + std::vector rets; + for (size_t i = 0; i < in_var_names.size(); i++) { + if (NeedSend(local_scope, in_var_names[i])) { + VLOG(3) << "sending " << in_var_names[i] << " to " << epmap[i] + << " to get " << out_var_names[i] << " back"; + rets.push_back(rpc_client->AsyncPrefetchVar( + epmap[i], cpu_ctx, local_scope, in_var_names[i], out_var_names[i], + table_names[i])); + } else { + VLOG(3) << "don't send no-initialied variable: " << out_var_names[i]; + } + } + + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + } + + MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name, + out_var_names, height_sections, splited_ids, + context, &local_scope, &actual_ctx); + + context.scope().DeleteScope(&local_scope); +} + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h new file mode 100644 index 0000000000000000000000000000000000000000..53b0fbfb51f60fa86351cca34fd1665c7802591b --- /dev/null +++ b/paddle/fluid/operators/distributed/parameter_prefetch.h @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { +namespace distributed { + +void prefetch(const std::string& id_name, const std::string& out_name, + const std::vector& table_names, + const std::vector& epmap, + const std::vector& height_sections, + const framework::ExecutionContext& context); + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h index 3c1db147098055e9974c9dc607266cdaf2e43dae..5272afd42851160ca5352ef474d940a5d2dd2456 100644 --- a/paddle/fluid/operators/distributed/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -191,7 +191,8 @@ class RequestHandler { virtual bool Handle(const std::string& varname, framework::Scope* scope, framework::Variable* var, framework::Variable** outvar, const int trainer_id, - const std::string& out_var_name = "") = 0; + const std::string& out_var_name = "", + const std::string& table_name = "") = 0; protected: const bool sync_mode_; diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index 025528fe70b8f4d353ab92f29b1bd71c77cf7850..9722f8c96e91d2dfbe929dcc11645a40c44afb4e 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include #include #include @@ -20,7 +21,7 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/request_handler_impl.h" +#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/operators/distributed/rpc_server.h" #include "paddle/fluid/string/printf.h" @@ -37,7 +38,8 @@ bool RequestSendHandler::Handle(const std::string& varname, framework::Variable* invar, framework::Variable** outvar, const int trainer_id, - const std::string& out_var_name) { + const std::string& out_var_name, + const std::string& table_name) { VLOG(4) << "RequestSendHandler:" << varname; // Sync @@ -77,8 +79,10 @@ bool RequestGetHandler::Handle(const std::string& varname, framework::Variable* invar, framework::Variable** outvar, const int trainer_id, - const std::string& out_var_name) { + const std::string& out_var_name, + const std::string& table_name) { VLOG(4) << "RequestGetHandler:" << varname; + if (sync_mode_) { if (varname == FETCH_BARRIER_MESSAGE) { VLOG(3) << "sync: recv fetch barrier message"; @@ -113,14 +117,22 @@ bool RequestPrefetchHandler::Handle(const std::string& varname, framework::Variable* invar, framework::Variable** outvar, const int trainer_id, - const std::string& out_var_name) { + const std::string& out_var_name, + const std::string& table_name) { VLOG(4) << "RequestPrefetchHandler " << varname; - auto var_desc = program_->Block(0).FindVar(out_var_name); - InitializeVariable(*outvar, var_desc->GetType()); - executor_->RunPreparedContext( - (*prefetch_var_name_to_prepared_ctx_)[varname].get(), scope); - + if (table_name.empty()) { + auto var_desc = program_->Block(0).FindVar(out_var_name); + InitializeVariable(*outvar, var_desc->GetType()); + executor_->RunPreparedContext( + (*prefetch_var_name_to_prepared_ctx_)[varname].get(), scope); + } else { + (*outvar)->GetMutable(); + auto lookup_table_op = + BuildLookupTableOp(table_name, varname, out_var_name); + paddle::platform::CPUPlace cpu_place; + lookup_table_op->Run(*scope, cpu_place); + } return true; } @@ -129,7 +141,8 @@ bool RequestCheckpointHandler::Handle(const std::string& varname, framework::Variable* invar, framework::Variable** outvar, const int trainer_id, - const std::string& out_var_name) { + const std::string& out_var_name, + const std::string& table_name) { PADDLE_ENFORCE( checkpoint_notify_id != -1, "when checkpoint_notify_id = -1, there should be no RPC invoke."); diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h index c1afda9dd2445e492d8b93659c9ff13e6e1030b8..5e0b25c5c2ce161dee0948a07baab32dfff9be6f 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.h +++ b/paddle/fluid/operators/distributed/request_handler_impl.h @@ -24,6 +24,7 @@ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" @@ -43,8 +44,8 @@ class RequestSendHandler final : public RequestHandler { virtual ~RequestSendHandler() {} bool Handle(const std::string& varname, framework::Scope* scope, framework::Variable* var, framework::Variable** outvar, - const int trainer_id, - const std::string& out_var_name = "") override; + const int trainer_id, const std::string& out_var_name = "", + const std::string& table_name = "") override; private: bool enable_dc_asgd_; @@ -59,21 +60,44 @@ class RequestGetHandler final : public RequestHandler { virtual ~RequestGetHandler() {} bool Handle(const std::string& varname, framework::Scope* scope, framework::Variable* var, framework::Variable** outvar, - const int trainer_id, - const std::string& out_var_name = "") override; + const int trainer_id, const std::string& out_var_name = "", + const std::string& table_name = "") override; private: bool enable_dc_asgd_; }; +static inline void BuildVar(const std::string& param_name, + std::initializer_list arguments, + paddle::framework::proto::OpDesc::Var* var) { + var->set_parameter(param_name); + for (auto& arg_name : arguments) { + *var->mutable_arguments()->Add() = arg_name; + } +} + class RequestPrefetchHandler final : public RequestHandler { public: explicit RequestPrefetchHandler(bool sync_mode) : RequestHandler(sync_mode) {} virtual ~RequestPrefetchHandler() {} bool Handle(const std::string& varname, framework::Scope* scope, framework::Variable* var, framework::Variable** outvar, - const int trainer_id, - const std::string& out_var_name = "") override; + const int trainer_id, const std::string& out_var_name = "", + const std::string& table_name = "") override; + + private: + std::unique_ptr BuildLookupTableOp( + const std::string& table_name, const std::string& id_name, + const std::string& out_name) { + paddle::framework::proto::OpDesc op_desc; + op_desc.set_type("lookup_table"); + BuildVar("W", {table_name.data()}, op_desc.add_inputs()); + BuildVar("Ids", {id_name.data()}, op_desc.add_inputs()); + BuildVar("Out", {out_name.data()}, op_desc.add_outputs()); + + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + return op; + } }; class RequestCheckpointHandler final : public RequestHandler { @@ -85,8 +109,8 @@ class RequestCheckpointHandler final : public RequestHandler { virtual ~RequestCheckpointHandler() {} bool Handle(const std::string& varname, framework::Scope* scope, framework::Variable* var, framework::Variable** outvar, - const int trainer_id, - const std::string& out_var_name = "") override; + const int trainer_id, const std::string& out_var_name = "", + const std::string& table_name = "") override; private: int checkpoint_notify_id; diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h index 1983802e49506c79041112ac87d429e4c084ddfd..4cd3abb5a61068bc4f9f5b38cafc2daa8406d448 100644 --- a/paddle/fluid/operators/distributed/rpc_client.h +++ b/paddle/fluid/operators/distributed/rpc_client.h @@ -48,7 +48,7 @@ class RPCClient { virtual VarHandlePtr AsyncPrefetchVar( const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& in_var_name, - const std::string& out_var_name, + const std::string& out_var_name, const std::string& table_name = "", int64_t time_out = FLAGS_rpc_deadline) = 0; virtual VarHandlePtr AsyncSendBatchBarrier( diff --git a/paddle/fluid/operators/distributed/send_recv.proto.in b/paddle/fluid/operators/distributed/send_recv.proto.in index 55820c980e8139625c1b589f9d2d68dfee74a212..7b7d069f17fd0f9e6a776fa4d1a19cf01914cfeb 100644 --- a/paddle/fluid/operators/distributed/send_recv.proto.in +++ b/paddle/fluid/operators/distributed/send_recv.proto.in @@ -80,6 +80,7 @@ message VariableMessage { // when profile switches from 1 to 2. int64 profile = 11; int64 trainer_id = 12; + string table_name = 13; } message VoidMessage {} diff --git a/paddle/fluid/operators/distributed/variable_response.h b/paddle/fluid/operators/distributed/variable_response.h index 4c7fcbbdfb305ce6b4fc9d1edd9738899b200ec6..a4324f67bb99bfdaa19c1a6dba8e907f17635d14 100644 --- a/paddle/fluid/operators/distributed/variable_response.h +++ b/paddle/fluid/operators/distributed/variable_response.h @@ -85,6 +85,7 @@ class VariableResponse { inline framework::Scope* GetMutableLocalScope() const { return local_scope_; } inline std::string Varname() const { return meta_.varname(); } inline std::string OutVarname() const { return meta_.out_varname(); } + inline std::string TableName() const { return meta_.table_name(); } // should call parse first. framework::Variable* GetVar() { diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index dc25bc57103286ce183a4649964fd96c62169b7f..a8b8a67a114b956f2d6b1b072ef343a179114b34 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -60,15 +60,37 @@ template class ElementwiseMulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); + auto x_var = ctx.InputVar("X"); + PADDLE_ENFORCE(x_var != nullptr, + "Cannot get input Variable X, variable name = %s", + ctx.op().Input("X")); auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); + + framework::Tensor x, *z; + if (x_var->IsType()) { + PADDLE_ENFORCE(y->dims().size() == 1 && y->dims()[0] == 1, + "For elementwise_op, if X is Sparse, Y must be scalar."); + auto& x_sele = x_var->Get(); + auto out_sele = ctx.Output("Out"); + x = x_sele.value(); + out_sele->set_rows(x_sele.rows()); + out_sele->set_height(x_sele.height()); + out_sele->mutable_value()->Resize(x_sele.value().dims()); + out_sele->mutable_value()->mutable_data(ctx.GetPlace(), x.type()); + z = ctx.Output("Out")->mutable_value(); + } else if (x_var->IsType()) { + x = x_var->Get(); + z = ctx.Output("Out"); + } else { + PADDLE_THROW("X's type[%s] is not supported by elementwise_op.", + x_var->Type().name()); + } z->mutable_data(ctx.GetPlace()); - if (x->numel() == y->numel()) { - elementwise_mul(ctx, x, y, z); + if (x.numel() == y->numel()) { + elementwise_mul(ctx, &x, y, z); } else { - default_elementwise_mul(ctx, x, y, z); + default_elementwise_mul(ctx, &x, y, z); } } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 85a7817be9b3a82d40853b417d78a7fdf67f6c1f..87bf7c6b156f32b8f6a1abc30b0676e1d4711d64 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -40,21 +40,28 @@ class ElementwiseOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of elementwise op should not be null."); - PADDLE_ENFORCE( - ctx->GetInputsVarType("X").front() == - framework::proto::VarType::LOD_TENSOR, - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("X").front(), ctx->GetInputsVarType("X").front()); PADDLE_ENFORCE( ctx->GetInputsVarType("Y").front() == framework::proto::VarType::LOD_TENSOR, - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Y").front(), ctx->GetInputsVarType("Y").front()); - - auto x_dim = ctx->GetInputDim("X"); - auto y_dim = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), - "Rank of first input must >= rank of second input."); + "The input var's type should be LoDTensor, but the received is %s [%s]", + ctx->GetInputsVarType("Y").front(), ctx->Inputs("Y").front()); + + if (ctx->GetInputsVarType("X").front() == + framework::proto::VarType::LOD_TENSOR) { + auto x_dim = ctx->GetInputDim("X"); + auto y_dim = ctx->GetInputDim("Y"); + PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), + "Rank of first input must >= rank of second input."); + } else if (ctx->GetInputsVarType("X").front() == + framework::proto::VarType::SELECTED_ROWS) { + PADDLE_ENFORCE((ctx->GetInputDim("Y").size() == 1u) && + (ctx->GetInputDim("Y")[0] == 1), + "For elementwise_op, if X is Sparse, " + "Y must be scalar."); + } else { + PADDLE_THROW("X's type[%s] is not supported by elementwise_op.", + ctx->GetInputsVarType("X").front()); + } ctx->ShareDim("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out"); diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc index 6d463538d232e1a38f845e7abc3786568ca3bb21..1eb6523a2dfb358490a07bf1b806d5638442a4d5 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc @@ -217,13 +217,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { auto& act_gate_str = ctx.Attr("gate_activation"); \ auto& act_cell_str = ctx.Attr("cell_activation"); \ auto& act_cand_str = ctx.Attr("candidate_activation"); \ - if (platform::jit::MayIUse(platform::jit::avx)) { \ - math::VecActivations act_functor; \ + if (platform::MayIUse(platform::avx)) { \ + math::VecActivations act_functor; \ act_gate = act_functor(act_gate_str); \ act_cell = act_functor(act_cell_str); \ act_cand = act_functor(act_cand_str); \ } else { \ - math::VecActivations act_functor; \ + math::VecActivations act_functor; \ act_gate = act_functor(act_gate_str); \ act_cell = act_functor(act_cell_str); \ act_cand = act_functor(act_cand_str); \ diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc index 288b56fc2485138b20c5b53af3e950f1c1886ba5..17ed9771d074cf7ae8c6735e4cb859139503a0af 100644 --- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc @@ -151,11 +151,11 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel { std::function fc_act; auto& fc_act_str = ctx.Attr("fc_activation"); - if (platform::jit::MayIUse(platform::jit::avx)) { - math::VecActivations act_functor; + if (platform::MayIUse(platform::avx)) { + math::VecActivations act_functor; fc_act = act_functor(fc_act_str); } else { - math::VecActivations act_functor; + math::VecActivations act_functor; fc_act = act_functor(fc_act_str); } diff --git a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a4ae19d9c1e3bb2af3eb95650fbb5aabb8944a36 --- /dev/null +++ b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc @@ -0,0 +1,117 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" + +namespace paddle { +namespace operators { + +class GetTensorFromSelectedRowsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "GetTensorFromSelectedRowsOp must has input X."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "GetTensorFromSelectedRowsOp must has output Out."); + PADDLE_ENFORCE( + ctx->GetInputsVarType("X").front() == + framework::proto::VarType::SELECTED_ROWS, + "The input X's type should be SelectedRows, but the received is %s", + ctx->Inputs("X").front(), ctx->GetInputsVarType("X").front()); + PADDLE_ENFORCE( + ctx->GetOutputsVarType("Out").front() == + framework::proto::VarType::LOD_TENSOR, + "The output Out's type should be LoDTensor, but the received is %s", + ctx->Outputs("Out").front(), ctx->GetOutputsVarType("Out").front()); + + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::GetDataTypeOfVar(ctx.InputVar("X")), ctx.device_context()); + } +}; + +class GetTensorFromSelectedRowsKernel { + public: + void operator()(const framework::ExecutionContext &ctx) const { + auto *x = ctx.Input("X"); + auto *out = ctx.Output("Out"); + + out->Resize(x->value().dims()); + out->mutable_data(ctx.GetPlace(), x->value().type()); + framework::TensorCopy(x->value(), ctx.GetPlace(), ctx.device_context(), + out); + } +}; + +class GetTensorFromSelectedRowsOpProtoMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input type is SelectedRows."); + AddOutput("Out", "The output type is LoDTensor."); + AddComment( + R"DOC( +GetTensorFromSelectedRows Operator + +GetTensorFromSelectedRows is used to get the tensor from SelectedRows. + +)DOC"); + } +}; + +class GetTensorFromSelectedRowsOpVarTypeInference + : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const final { + auto out_var_name = op_desc.Output("Out").front(); + auto in_var_name = op_desc.Input("X").front(); + + auto out_var = block->FindRecursiveOrCreateVar(out_var_name); + auto in_var = block->FindRecursiveOrCreateVar(in_var_name); + out_var.SetType(framework::proto::VarType::LOD_TENSOR); + out_var.SetDataType(in_var.GetDataType()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(get_tensor_from_selected_rows, + ops::GetTensorFromSelectedRowsOp, + ops::GetTensorFromSelectedRowsOpProtoMaker, + ops::GetTensorFromSelectedRowsOpVarTypeInference); + +REGISTER_OP_CPU_KERNEL_FUNCTOR(get_tensor_from_selected_rows, float, + ops::GetTensorFromSelectedRowsKernel, double, + ops::GetTensorFromSelectedRowsKernel, int, + ops::GetTensorFromSelectedRowsKernel, int64_t, + ops::GetTensorFromSelectedRowsKernel); + +#ifdef PADDLE_WITH_CUDA +REGISTER_OP_CUDA_KERNEL_FUNCTOR(get_tensor_from_selected_rows, float, + ops::GetTensorFromSelectedRowsKernel, double, + ops::GetTensorFromSelectedRowsKernel, int, + ops::GetTensorFromSelectedRowsKernel, int64_t, + ops::GetTensorFromSelectedRowsKernel); +#endif diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index 972dcf5494e9acd47e7ff615db45f056a43724a6..0dbcc442dfa1a395cdb0ffbd69eb78ad66cfaa17 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -150,14 +150,14 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel { "Output(W@Grad should not be null."); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "Output(X@Grad should not be null."); - if (!ctx->Attrs().Get("is_sparse")) { - if (ctx->HasOutput(framework::GradVarName("Bias"))) { - ctx->SetOutputDim(framework::GradVarName("Bias"), - ctx->GetInputDim("Bias")); - } - ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W")); + + if (ctx->HasOutput(framework::GradVarName("Bias"))) { + ctx->SetOutputDim(framework::GradVarName("Bias"), + ctx->GetInputDim("Bias")); } + ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W")); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); } protected: diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index 07ff8f947e59d2954783e2ba537bfce3cb320f22..b73a32af89e882ac02623dd1d312f400a78fc47a 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -185,7 +185,6 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { ctx.Output(framework::GradVarName("W")); w_grad->set_rows(real_rows); // Build a map of id -> row_index to speed up finding the index of one id - w_grad->SyncIndex(); w_grad->set_height(w.dims()[0]); auto* w_grad_value = w_grad->mutable_value(); framework::DDim temp_dim(w.dims()); diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc index 3226a727b1f5f6de9e97ce2068381be7c9b69ff3..0029932bc068c7f61ddb41cf3f87c9e1a5cd7749 100644 --- a/paddle/fluid/operators/lookup_table_op.cc +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -87,6 +87,25 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { "(boolean, default false) " "If the grad op reuse the input's variable.") .SetDefault(false); + + // for parameter prefetch + AddAttr("remote_prefetch", "").SetDefault(false); + AddAttr("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0); + AddAttr>("height_sections", + "Height for each output SelectedRows.") + .SetDefault(std::vector({})); + AddAttr>( + "epmap", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints in the order of input variables for mapping") + .SetDefault({}); + AddAttr>( + "table_names", + "(string vector, the splited table names that will be fetched from " + "parameter server)" + "in the order of input variables for mapping") + .SetDefault({}); + AddComment(R"DOC( Lookup Table Operator. diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index abd5dce8f7e7146a1671a387328c177e5e6e0a85..6a0d6bad512fe7cc15e60ed25028bc3cbbbca2ab 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -31,8 +31,8 @@ __global__ void LookupTable(T *output, const T *table, const int64_t *ids, while (idy < K) { int64_t id = ids[idy]; - PADDLE_ASSERT(id >= 0); - PADDLE_ASSERT(id < N); + PADDLE_ASSERT_MSG_CODE(id >= 0, "received id:", id); + PADDLE_ASSERT_MSG_CODE(id < N, "received id:", id); T *out = output + idy * D; const T *tab = table + id * D; for (int i = idx; i < D; i += BlockDimX) { @@ -57,9 +57,9 @@ __global__ void LookupTableGrad(T *table, const T *output, const int64_t *ids, int idy = blockIdx.x + threadIdx.y * GridDimX; while (idy < K) { - int id = ids[idy]; - PADDLE_ASSERT(id >= 0); - PADDLE_ASSERT(id < N); + int64_t id = ids[idy]; + PADDLE_ASSERT_MSG_CODE(id >= 0, "received id:", id); + PADDLE_ASSERT_MSG_CODE(id < N, "received id:", id); const T *out = output + idy * D; T *tab = table + id * D; for (int i = idx; i < D; i += BlockDimX) { @@ -78,27 +78,47 @@ class LookupTableCUDAKernel : public framework::OpKernel { auto *output_t = context.Output("Out"); int64_t padding_idx = context.Attr("padding_idx"); - size_t N = table_t->dims()[0]; - size_t D = table_t->dims()[1]; - size_t K = ids_t->numel(); - - auto *ids = ids_t->data(); - auto *table = table_t->data(); - auto *output = output_t->mutable_data(context.GetPlace()); - - dim3 threads(128, 8); - dim3 grids(8, 1); - - if (padding_idx == -1) - LookupTable< - T, 128, 8, 8, - false><<>>( - output, table, ids, N, K, D, padding_idx); - else - LookupTable< - T, 128, 8, 8, - true><<>>( - output, table, ids, N, K, D, padding_idx); + auto id_name = context.Inputs("Ids").front(); + auto out_name = context.Outputs("Out").front(); + + // for remote prefetch + auto epmap = context.Attr>("epmap"); + auto height_sections = context.Attr>("height_sections"); + auto table_names = context.Attr>("table_names"); + + if (!epmap.empty()) { +// if epmap is not empty, then the parameter will be fetched from remote +// parameter +// server +#ifdef PADDLE_WITH_DISTRIBUTE + operators::distributed::prefetch(id_name, out_name, table_names, epmap, + height_sections, context); +#else + PADDLE_THROW( + "paddle is not compiled with distribute support, can not do " + "parameter prefetch!"); +#endif + } else { + size_t N = table_t->dims()[0]; + size_t D = table_t->dims()[1]; + size_t K = ids_t->numel(); + + auto *ids = ids_t->data(); + auto *table = table_t->data(); + auto *output = output_t->mutable_data(context.GetPlace()); + + dim3 threads(128, 8); + dim3 grids(8, 1); + + if (padding_idx == -1) + LookupTable<<< + grids, threads, 0, context.cuda_device_context().stream()>>>( + output, table, ids, N, K, D, padding_idx); + else + LookupTable<<< + grids, threads, 0, context.cuda_device_context().stream()>>>( + output, table, ids, N, K, D, padding_idx); + } } }; @@ -109,6 +129,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { auto &dev_ctx = context.template device_context(); bool is_sparse = context.Attr("is_sparse"); + // Since paddings are not trainable and fixed in forward, the gradient of // paddings makes no sense and we don't deal with it in backward. if (is_sparse) { diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index e504c4f0cd5c0feaef4a251fad57b389a10a2ce7..3a73a7637c6d7d3eff7443802a4a52be9149e0ef 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -23,6 +23,10 @@ limitations under the License. */ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/operators/math/blas.h" +#ifdef PADDLE_WITH_DISTRIBUTE +#include "paddle/fluid/operators/distributed/parameter_prefetch.h" +#endif + namespace paddle { namespace operators { @@ -41,44 +45,66 @@ class LookupTableKernel : public framework::OpKernel { auto *output_t = context.Output("Out"); // float tensor auto *table_var = context.InputVar("W"); - int64_t padding_idx = context.Attr("padding_idx"); - int64_t *ids = const_cast(ids_t->data()); - int64_t ids_numel = ids_t->numel(); - - if (table_var->IsType()) { - auto *table_t = context.Input("W"); - int64_t row_number = table_t->dims()[0]; - int64_t row_width = table_t->dims()[1]; - - auto *table = table_t->data(); - auto *output = output_t->mutable_data(context.GetPlace()); - - for (int64_t i = 0; i < ids_numel; ++i) { - if (padding_idx != kNoPadding && ids[i] == padding_idx) { - memset(output + i * row_width, 0, row_width * sizeof(T)); - } else { - PADDLE_ENFORCE_LT(ids[i], row_number); - PADDLE_ENFORCE_GE(ids[i], 0, "ids %d", i); - memcpy(output + i * row_width, table + ids[i] * row_width, - row_width * sizeof(T)); + auto id_name = context.Inputs("Ids").front(); + auto out_name = context.Outputs("Out").front(); + + // for remote prefetch + auto epmap = context.Attr>("epmap"); + auto height_sections = context.Attr>("height_sections"); + auto table_names = context.Attr>("table_names"); + + if (!epmap.empty()) { +// if epmap is not empty, then the parameter will be fetched from remote +// parameter +// server +#ifdef PADDLE_WITH_DISTRIBUTE + operators::distributed::prefetch(id_name, out_name, table_names, epmap, + height_sections, context); +#else + PADDLE_THROW( + "paddle is not compiled with distribute support, can not do " + "parameter prefetch!"); +#endif + } else { + int64_t padding_idx = context.Attr("padding_idx"); + int64_t *ids = const_cast(ids_t->data()); + int64_t ids_numel = ids_t->numel(); + + if (table_var->IsType()) { + auto *table_t = context.Input("W"); + int64_t row_number = table_t->dims()[0]; + int64_t row_width = table_t->dims()[1]; + + auto *table = table_t->data(); + auto *output = output_t->mutable_data(context.GetPlace()); + + for (int64_t i = 0; i < ids_numel; ++i) { + if (padding_idx != kNoPadding && ids[i] == padding_idx) { + memset(output + i * row_width, 0, row_width * sizeof(T)); + } else { + PADDLE_ENFORCE_LT(ids[i], row_number); + PADDLE_ENFORCE_GE(ids[i], 0, "ids %d", i); + memcpy(output + i * row_width, table + ids[i] * row_width, + row_width * sizeof(T)); + } } - } - } else if (table_var->IsType()) { - const auto &table_t = table_var->Get(); - int64_t row_width = table_t.value().dims()[1]; - const auto *table = table_t.value().data(); - auto *output = output_t->mutable_data(context.GetPlace()); - - auto blas = math::GetBlas(context); - for (int64_t i = 0; i < ids_numel; ++i) { - if (padding_idx != kNoPadding && ids[i] == padding_idx) { - memset(output + i * row_width, 0, row_width * sizeof(T)); - } else { - PADDLE_ENFORCE_GE(ids[i], 0); - auto id_index = table_t.Index(ids[i]); - PADDLE_ENFORCE_GE(id_index, 0, "the input key should be exists."); - blas.VCOPY(row_width, table + id_index * row_width, - output + i * row_width); + } else if (table_var->IsType()) { + const auto &table_t = table_var->Get(); + int64_t row_width = table_t.value().dims()[1]; + const auto *table = table_t.value().data(); + auto *output = output_t->mutable_data(context.GetPlace()); + + auto blas = math::GetBlas(context); + for (int64_t i = 0; i < ids_numel; ++i) { + if (padding_idx != kNoPadding && ids[i] == padding_idx) { + memset(output + i * row_width, 0, row_width * sizeof(T)); + } else { + PADDLE_ENFORCE_GE(ids[i], 0); + auto id_index = table_t.Index(ids[i]); + PADDLE_ENFORCE_GE(id_index, 0, "the input key should be exists."); + blas.VCOPY(row_width, table + id_index * row_width, + output + i * row_width); + } } } } diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 63363086adbf12c38ac09949ac20483116ccf4ee..b3d2ea38eb1bfffadc1f68c5a34bc4d557bdea3b 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -59,6 +59,7 @@ math_library(matrix_bit_code) math_library(unpooling) math_library(vol2col) +math_library(prelu) cc_test(math_function_test SRCS math_function_test.cc DEPS math_function) cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h index 7d81aee596934308763002d440f52400f45b5f20..e1e4d168db3ca594b44396a6e30c5bfc03483eaf 100644 --- a/paddle/fluid/operators/math/cpu_vec.h +++ b/paddle/fluid/operators/math/cpu_vec.h @@ -77,7 +77,7 @@ inline void vec_scal(const int n, const double a, double* x) { #endif // MKL scal only support inplace, choose this if src and dst are not equal -template +template inline void vec_scal(const int n, const T a, const T* x, T* y) { for (int i = 0; i < n; ++i) { y[i] = a * x[i]; @@ -85,12 +85,12 @@ inline void vec_scal(const int n, const T a, const T* x, T* y) { } template <> -inline void vec_scal(const int n, const float a, - const float* x, float* y) { +inline void vec_scal(const int n, const float a, + const float* x, float* y) { #ifdef __AVX__ constexpr int block = YMM_FLOAT_BLOCK; if (n < block) { - vec_scal(n, a, x, y); + vec_scal(n, a, x, y); return; } const int rest = n % block; @@ -114,24 +114,24 @@ inline void vec_scal(const int n, const float a, y[i] = a * x[i]; } #else - vec_scal(n, a, x, y); + vec_scal(n, a, x, y); #endif } template <> -inline void vec_scal(const int n, const float a, - const float* x, float* y) { - vec_scal(n, a, x, y); +inline void vec_scal(const int n, const float a, + const float* x, float* y) { + vec_scal(n, a, x, y); } template <> -inline void vec_scal(const int n, const float a, - const float* x, float* y) { +inline void vec_scal(const int n, const float a, + const float* x, float* y) { // TODO(TJ): enable me - vec_scal(n, a, x, y); + vec_scal(n, a, x, y); } -template +template inline void vec_bias_sub(const int n, const T a, const T* x, T* y) { for (int i = 0; i < n; ++i) { y[i] = a - x[i]; @@ -139,12 +139,12 @@ inline void vec_bias_sub(const int n, const T a, const T* x, T* y) { } template <> -inline void vec_bias_sub(const int n, const float a, - const float* x, float* y) { +inline void vec_bias_sub(const int n, const float a, + const float* x, float* y) { #ifdef __AVX__ constexpr int block = YMM_FLOAT_BLOCK; if (n < block) { - vec_bias_sub(n, a, x, y); + vec_bias_sub(n, a, x, y); return; } const int rest = n % block; @@ -168,27 +168,25 @@ inline void vec_bias_sub(const int n, const float a, y[i] = a - x[i]; } #else - vec_bias_sub(n, a, x, y); + vec_bias_sub(n, a, x, y); #endif } template <> -inline void vec_bias_sub(const int n, const float a, - const float* x, float* y) { - vec_bias_sub(n, a, x, y); +inline void vec_bias_sub(const int n, const float a, + const float* x, float* y) { + vec_bias_sub(n, a, x, y); } template <> -inline void vec_bias_sub(const int n, - const float a, - const float* x, - float* y) { +inline void vec_bias_sub(const int n, const float a, + const float* x, float* y) { // TODO(TJ): enable me - vec_bias_sub(n, a, x, y); + vec_bias_sub(n, a, x, y); } // out = x*y + (1-x)*z -template +template inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) { for (int i = 0; i < n; ++i) { out[i] = x[i] * y[i] + (static_cast(1) - x[i]) * z[i]; @@ -196,13 +194,13 @@ inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) { } template <> -inline void vec_cross(const int n, const float* x, - const float* y, const float* z, - float* out) { +inline void vec_cross(const int n, const float* x, + const float* y, const float* z, + float* out) { #ifdef __AVX__ constexpr int block = YMM_FLOAT_BLOCK; if (n < block) { - vec_cross(n, x, y, z, out); + vec_cross(n, x, y, z, out); return; } const int rest = n % block; @@ -228,25 +226,26 @@ inline void vec_cross(const int n, const float* x, out[i] = x[i] * y[i] + (1.f - x[i]) * z[i]; } #else - vec_cross(n, x, y, z, out); + vec_cross(n, x, y, z, out); #endif } template <> -inline void vec_cross(const int n, const float* x, - const float* y, - const float* z, float* out) { - vec_cross(n, x, y, z, out); +inline void vec_cross(const int n, const float* x, + const float* y, const float* z, + float* out) { + vec_cross(n, x, y, z, out); } template <> -inline void vec_cross( - const int n, const float* x, const float* y, const float* z, float* out) { +inline void vec_cross(const int n, const float* x, + const float* y, const float* z, + float* out) { // TODO(TJ): enable me - vec_cross(n, x, y, z, out); + vec_cross(n, x, y, z, out); } -template +template inline void vec_add_bias(const int n, const T a, const T* x, T* y) { for (int i = 0; i < n; ++i) { y[i] = x[i] + a; @@ -254,12 +253,12 @@ inline void vec_add_bias(const int n, const T a, const T* x, T* y) { } template <> -inline void vec_add_bias(const int n, const float a, - const float* x, float* y) { +inline void vec_add_bias(const int n, const float a, + const float* x, float* y) { #ifdef __AVX__ constexpr int block = YMM_FLOAT_BLOCK; if (n < block) { - vec_add_bias(n, a, x, y); + vec_add_bias(n, a, x, y); return; } const int rest = n % block; @@ -283,32 +282,30 @@ inline void vec_add_bias(const int n, const float a, y[i] = x[i] + a; } #else - vec_add_bias(n, a, x, y); + vec_add_bias(n, a, x, y); #endif } template <> -inline void vec_add_bias(const int n, const float a, - const float* x, float* y) { - vec_add_bias(n, a, x, y); +inline void vec_add_bias(const int n, const float a, + const float* x, float* y) { + vec_add_bias(n, a, x, y); } template <> -inline void vec_add_bias(const int n, - const float a, - const float* x, - float* y) { +inline void vec_add_bias(const int n, const float a, + const float* x, float* y) { // TODO(TJ): enable me - vec_add_bias(n, a, x, y); + vec_add_bias(n, a, x, y); } -template +template inline void vec_identity(const int n, const T* x, T* y) { // do nothing return; } -template +template inline void vec_sigmoid(const int n, const T* x, T* y) { const T min = SIGMOID_THRESHOLD_MIN; const T max = SIGMOID_THRESHOLD_MAX; @@ -323,12 +320,12 @@ inline void vec_sigmoid(const int n, const T* x, T* y) { } template <> -inline void vec_sigmoid(const int n, const float* x, - float* y) { +inline void vec_sigmoid(const int n, const float* x, + float* y) { #ifdef __AVX__ constexpr int block = YMM_FLOAT_BLOCK; if (n < block) { - vec_sigmoid(n, x, y); + vec_sigmoid(n, x, y); return; } const int rest = n % block; @@ -377,25 +374,24 @@ inline void vec_sigmoid(const int n, const float* x, y[i] = 1.f / (1.f + y[i]); } #else - vec_sigmoid(n, x, y); + vec_sigmoid(n, x, y); #endif } template <> -inline void vec_sigmoid(const int n, const float* x, - float* y) { - vec_sigmoid(n, x, y); +inline void vec_sigmoid(const int n, const float* x, + float* y) { + vec_sigmoid(n, x, y); } template <> -inline void vec_sigmoid(const int n, - const float* x, - float* y) { +inline void vec_sigmoid(const int n, const float* x, + float* y) { // TODO(TJ): enable me - vec_sigmoid(n, x, y); + vec_sigmoid(n, x, y); } -template +template inline void vec_tanh(const int n, const T* x, T* y) { vec_scal(n, static_cast(2), x, y); vec_sigmoid(n, y, y); @@ -404,7 +400,7 @@ inline void vec_tanh(const int n, const T* x, T* y) { } // TODO(TJ): make relu clip -template +template inline void vec_relu(const int n, const T* x, T* y) { for (int i = 0; i < n; ++i) { y[i] = x[i] > 0 ? x[i] : 0; @@ -412,12 +408,12 @@ inline void vec_relu(const int n, const T* x, T* y) { } template <> -inline void vec_relu(const int n, const float* x, - float* y) { +inline void vec_relu(const int n, const float* x, + float* y) { #ifdef __AVX__ constexpr int block = YMM_FLOAT_BLOCK; if (n < block * 4) { - vec_relu(n, x, y); + vec_relu(n, x, y); return; } @@ -441,26 +437,26 @@ inline void vec_relu(const int n, const float* x, #undef MOVE_ONE_STEP #else - vec_relu(n, x, y); + vec_relu(n, x, y); #endif } template <> -inline void vec_relu(const int n, const float* x, - float* y) { - vec_relu(n, x, y); +inline void vec_relu(const int n, const float* x, + float* y) { + vec_relu(n, x, y); } template <> -inline void vec_relu(const int n, const float* x, - float* y) { +inline void vec_relu(const int n, const float* x, + float* y) { // TODO(TJ): enable me - vec_relu(n, x, y); + vec_relu(n, x, y); } // TODO(TJ): optimize double of sigmoid, tanh and relu if necessary -template +template class VecActivations { public: std::function operator()( diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/fluid/operators/math/cpu_vec_test.cc index c37fa291a259550a3cb6d4f3dd9d5a415c3a2130..28eb9cadc9d4258bf4f8f71a06e029531e448014 100644 --- a/paddle/fluid/operators/math/cpu_vec_test.cc +++ b/paddle/fluid/operators/math/cpu_vec_test.cc @@ -104,38 +104,42 @@ void TestAndBench(const int n, std::function tgt, } TEST(CpuVecTest, sigmoid) { - namespace jit = paddle::platform::jit; + namespace platform = paddle::platform; using namespace paddle::operators::math; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestAndBench(sz, vec_sigmoid, ref_sigmoid); - TestAndBench(sz, vec_sigmoid, ref_sigmoid); - TestAndBench(sz, vec_sigmoid, ref_sigmoid); - TestAndBench(sz, vec_sigmoid, + TestAndBench(sz, vec_sigmoid, + ref_sigmoid); + TestAndBench(sz, vec_sigmoid, + ref_sigmoid); + TestAndBench(sz, vec_sigmoid, ref_sigmoid); } TestAndBench(30, vec_sigmoid, ref_sigmoid); } TEST(CpuVecTest, tanh) { - namespace jit = paddle::platform::jit; + namespace platform = paddle::platform; using namespace paddle::operators::math; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestAndBench(sz, vec_tanh, ref_tanh); - TestAndBench(sz, vec_tanh, ref_tanh); - TestAndBench(sz, vec_tanh, ref_tanh); - TestAndBench(sz, vec_tanh, ref_tanh); + TestAndBench(sz, vec_tanh, ref_tanh); + TestAndBench(sz, vec_tanh, ref_tanh); + TestAndBench(sz, vec_tanh, + ref_tanh); } TestAndBench(30, vec_tanh, ref_tanh); } TEST(CpuVecTest, relu) { - namespace jit = paddle::platform::jit; + namespace platform = paddle::platform; using namespace paddle::operators::math; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestAndBench(sz, vec_relu, ref_relu); - TestAndBench(sz, vec_relu, ref_relu); - TestAndBench(sz, vec_relu, ref_relu); - TestAndBench(sz, vec_relu, ref_relu); + TestAndBench(sz, vec_relu, ref_relu); + TestAndBench(sz, vec_relu, ref_relu); + TestAndBench(sz, vec_relu, + ref_relu); } TestAndBench(30, vec_relu, ref_relu); } @@ -162,38 +166,40 @@ void TestInplace(const int n, std::function tgt, } TEST(CpuVecTest, inplace_sigmoid) { - namespace jit = paddle::platform::jit; + namespace platform = paddle::platform; using namespace paddle::operators::math; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestInplace(sz, vec_sigmoid, ref_sigmoid); - TestInplace(sz, vec_sigmoid, ref_sigmoid); - TestInplace(sz, vec_sigmoid, ref_sigmoid); - TestInplace(sz, vec_sigmoid, + TestInplace(sz, vec_sigmoid, + ref_sigmoid); + TestInplace(sz, vec_sigmoid, + ref_sigmoid); + TestInplace(sz, vec_sigmoid, ref_sigmoid); } TestInplace(30, vec_sigmoid, ref_sigmoid); } TEST(CpuVecTest, inplace_tanh) { - namespace jit = paddle::platform::jit; + namespace platform = paddle::platform; using namespace paddle::operators::math; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestInplace(sz, vec_tanh, ref_tanh); - TestInplace(sz, vec_tanh, ref_tanh); - TestInplace(sz, vec_tanh, ref_tanh); - TestInplace(sz, vec_tanh, ref_tanh); + TestInplace(sz, vec_tanh, ref_tanh); + TestInplace(sz, vec_tanh, ref_tanh); + TestInplace(sz, vec_tanh, ref_tanh); } TestInplace(30, vec_tanh, ref_tanh); } TEST(CpuVecTest, inplace_relu) { - namespace jit = paddle::platform::jit; + namespace platform = paddle::platform; using namespace paddle::operators::math; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestInplace(sz, vec_relu, ref_relu); - TestInplace(sz, vec_relu, ref_relu); - TestInplace(sz, vec_relu, ref_relu); - TestInplace(sz, vec_relu, ref_relu); + TestInplace(sz, vec_relu, ref_relu); + TestInplace(sz, vec_relu, ref_relu); + TestInplace(sz, vec_relu, ref_relu); } TestInplace(30, vec_relu, ref_relu); } diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index 52cbdf685dee651cbc1490dc6faacb8680004c89..78d0c3e8808f0daf6a18d2217664e965773b95ff 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -22,7 +22,7 @@ namespace math { namespace jitkernel { namespace gen { -using namespace platform::jit; // NOLINT +using namespace platform; // NOLINT bool VXXJitCode::init(int d, int scalar_index) { // It's not necessary to use avx512 since it would slow down the frequency diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index a9214621295a7740b804b26c02d216dd5118d8bb..e2b4761435594fdc952ff5dba5b5fa4f4aa98e6c 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -179,7 +179,7 @@ class VActJitCode : public JitCode { template void exp_jmm(JMM& dst, JMM& src, int src_idx = 11, int fx_idx = 12, // NOLINT int fy_idx = 13, int mask_idx = 14, int tmp_idx = 15) { - using namespace platform::jit; // NOLINT + using namespace platform; // NOLINT // check all idx can not equal JMM jmm_src = JMM(src_idx); JMM jmm_fx = JMM(fx_idx); diff --git a/paddle/fluid/operators/math/jit_gen.cc b/paddle/fluid/operators/math/jit_gen.cc index 6af39518ed926554c8c839bba701d3827923dba0..5c6672928e8c03ccb1920bd828f785084e422fc2 100644 --- a/paddle/fluid/operators/math/jit_gen.cc +++ b/paddle/fluid/operators/math/jit_gen.cc @@ -36,7 +36,7 @@ void JitCode::preCode() { for (int i = 0; i < num_g_abi_regs; ++i) { push(Xbyak::Reg64(g_abi_regs[i])); } - if (platform::jit::MayIUse(platform::jit::avx512f)) { + if (platform::MayIUse(platform::avx512f)) { mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt); } } diff --git a/paddle/fluid/operators/math/jit_kernel.cc b/paddle/fluid/operators/math/jit_kernel.cc index 68b708b345334bc63b5e2e88c308d20ca6378e6b..118696ba47986e2dbf97535333c9817b7c264a54 100644 --- a/paddle/fluid/operators/math/jit_kernel.cc +++ b/paddle/fluid/operators/math/jit_kernel.cc @@ -21,8 +21,6 @@ namespace operators { namespace math { namespace jitkernel { -namespace jit = platform::jit; - KernelPool& KernelPool::Instance() { static thread_local KernelPool g_jit_kernels; return g_jit_kernels; diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc index a0f93fd8e7eb7d81211724a6991a681e2a0ed9ce..8cf588efba52314650bfd376b95b10e6d4336b2e 100644 --- a/paddle/fluid/operators/math/jit_kernel_blas.cc +++ b/paddle/fluid/operators/math/jit_kernel_blas.cc @@ -30,7 +30,6 @@ namespace paddle { namespace operators { namespace math { namespace jitkernel { -namespace jit = platform::jit; #ifdef PADDLE_WITH_MKLML template @@ -125,7 +124,7 @@ bool VMulKernelImpl::useJIT(int d) { #ifdef PADDLE_WITH_MKLML template <> bool VMulKernelImpl::useMKL(int d) { - return jit::MayIUse(jit::avx512f) && d > 512; + return platform::MayIUse(platform::avx512f) && d > 512; } template <> diff --git a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc index 4d26b81948238f18b097f535534fcfe9049b93c3..eeb305a88bee8f0e21b205684d24b19ca4631f65 100644 --- a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc +++ b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc @@ -25,10 +25,8 @@ namespace operators { namespace math { namespace jitkernel { -namespace jit = platform::jit; - /* CRF Decode JitKernel */ -template +template class CRFDecodeKernelImpl : public CRFDecodeKernel { public: explicit CRFDecodeKernelImpl(int tag_num) : CRFDecodeKernel() { @@ -101,7 +99,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel { #define INTRIAVX_FLOAT(block) \ template <> \ - CRFDecodeKernelImpl::CRFDecodeKernelImpl( \ + CRFDecodeKernelImpl::CRFDecodeKernelImpl( \ int tag_num) \ : CRFDecodeKernel() { \ this->num_ = tag_num; \ @@ -109,7 +107,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel { this->rest_ = this->num_ % YMM_FLOAT_BLOCK; \ } \ template <> \ - void CRFDecodeKernelImpl::Compute( \ + void CRFDecodeKernelImpl::Compute( \ const int seq_len, const float* x, const float* w, float* alpha, \ int* track) const { \ INIT_ALPHA(YMM_FLOAT_BLOCK) \ @@ -204,7 +202,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel { #define INTRIAVX512_FLOAT(block) \ template <> \ - CRFDecodeKernelImpl::CRFDecodeKernelImpl( \ + CRFDecodeKernelImpl::CRFDecodeKernelImpl( \ int tag_num) \ : CRFDecodeKernel() { \ this->num_ = tag_num; \ @@ -212,7 +210,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel { this->rest_ = this->num_ % ZMM_FLOAT_BLOCK; \ } \ template <> \ - void CRFDecodeKernelImpl::Compute( \ + void CRFDecodeKernelImpl::Compute( \ const int seq_len, const float* x, const float* w, float* alpha, \ int* track) const { \ INIT_ALPHA(ZMM_FLOAT_BLOCK) \ @@ -270,14 +268,14 @@ INTRIAVX_FLOAT(kEQ16); INTRIAVX_FLOAT(kGT16); #endif #ifdef __AVX2__ -INTRIAVX2_FLOAT(jit::avx2, kEQ8); -INTRIAVX2_FLOAT(jit::avx2, kGT8LT16); -INTRIAVX2_FLOAT(jit::avx2, kEQ16); -INTRIAVX2_FLOAT(jit::avx2, kGT16); +INTRIAVX2_FLOAT(platform::avx2, kEQ8); +INTRIAVX2_FLOAT(platform::avx2, kGT8LT16); +INTRIAVX2_FLOAT(platform::avx2, kEQ16); +INTRIAVX2_FLOAT(platform::avx2, kGT16); #endif #ifdef __AVX512F__ -INTRIAVX2_FLOAT(jit::avx512f, kEQ8); -INTRIAVX2_FLOAT(jit::avx512f, kGT8LT16); +INTRIAVX2_FLOAT(platform::avx512f, kEQ8); +INTRIAVX2_FLOAT(platform::avx512f, kGT8LT16); INTRIAVX512_FLOAT(kEQ16); INTRIAVX512_FLOAT(kGT16); #endif diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc index 686f3dd9836cb9192088771753065c6add639620..7945cfb253a61b7d1191c39537254126e2bb85dd 100644 --- a/paddle/fluid/operators/math/jit_kernel_exp.cc +++ b/paddle/fluid/operators/math/jit_kernel_exp.cc @@ -29,7 +29,6 @@ namespace paddle { namespace operators { namespace math { namespace jitkernel { -namespace jit = platform::jit; #ifdef PADDLE_WITH_MKLML // try to use MKL to speedup diff --git a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc index 49904e6e8c7cd346bcbfb67c3a7574118b36e058..fead13ebadcd131afafc308740cdd39b1c53bc08 100644 --- a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc +++ b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc @@ -22,10 +22,8 @@ namespace operators { namespace math { namespace jitkernel { -namespace jit = platform::jit; - /* Layer Norm JitKernel */ -template +template class LayerNormKernelImpl : public LayerNormKernel { public: explicit LayerNormKernelImpl(int right) : LayerNormKernel() { @@ -90,7 +88,7 @@ class LayerNormKernelImpl : public LayerNormKernel { this->end_ = this->num_ - this->rest_; \ } \ template <> \ - void LayerNormKernelImpl::Compute( \ + void LayerNormKernelImpl::Compute( \ float* x, float* out, float* mean, float* var, const float* scale, \ const float* bias, int height, const float epsilon) const { \ __m256 sum; \ @@ -219,16 +217,16 @@ class LayerNormKernelImpl : public LayerNormKernel { } #ifdef __AVX__ -INTRIAVX_FLOAT(jit::avx, kEQ8); -INTRIAVX_FLOAT(jit::avx, kGT8LT16); -INTRIAVX_FLOAT(jit::avx, kEQ16); -INTRIAVX_FLOAT(jit::avx, kGT16); +INTRIAVX_FLOAT(platform::avx, kEQ8); +INTRIAVX_FLOAT(platform::avx, kGT8LT16); +INTRIAVX_FLOAT(platform::avx, kEQ16); +INTRIAVX_FLOAT(platform::avx, kGT16); #endif #ifdef __AVX2__ -INTRIAVX_FLOAT(jit::avx2, kEQ8); -INTRIAVX_FLOAT(jit::avx2, kGT8LT16); -INTRIAVX_FLOAT(jit::avx2, kEQ16); -INTRIAVX_FLOAT(jit::avx2, kGT16); +INTRIAVX_FLOAT(platform::avx2, kEQ8); +INTRIAVX_FLOAT(platform::avx2, kGT8LT16); +INTRIAVX_FLOAT(platform::avx2, kEQ16); +INTRIAVX_FLOAT(platform::avx2, kGT16); #endif #undef INTRIAVX_FLOAT diff --git a/paddle/fluid/operators/math/jit_kernel_macro.h b/paddle/fluid/operators/math/jit_kernel_macro.h index 5a3efd979f803d396a5084c199b1d71b88a77126..4dba3b56810794cb4839d26386ae77a8f4507977 100644 --- a/paddle/fluid/operators/math/jit_kernel_macro.h +++ b/paddle/fluid/operators/math/jit_kernel_macro.h @@ -92,7 +92,6 @@ namespace jitkernel { JITKERNEL_DECLARE, JITKERNEL_FIND_KEY, \ JITKERNEL_IMPL) -namespace jit = platform::jit; // TODO(TJ): below defines are deprecated, would be remove recently #define SEARCH_BLOCK(macro_, ker, dtype, isa) \ if (d < YMM_FLOAT_BLOCK) { \ @@ -107,15 +106,15 @@ namespace jit = platform::jit; macro_(ker, dtype, isa, kGT16); \ } -#define SEARCH_ISA_BLOCK(macro_, ker, dtype) \ - if (jit::MayIUse(jit::avx512f)) { \ - SEARCH_BLOCK(macro_, ker, dtype, jit::avx512f); \ - } else if (jit::MayIUse(jit::avx2)) { \ - SEARCH_BLOCK(macro_, ker, dtype, jit::avx2); \ - } else if (jit::MayIUse(jit::avx)) { \ - SEARCH_BLOCK(macro_, ker, dtype, jit::avx); \ - } else { \ - SEARCH_BLOCK(macro_, ker, dtype, jit::isa_any); \ +#define SEARCH_ISA_BLOCK(macro_, ker, dtype) \ + if (platform::MayIUse(platform::avx512f)) { \ + SEARCH_BLOCK(macro_, ker, dtype, platform::avx512f); \ + } else if (platform::MayIUse(platform::avx2)) { \ + SEARCH_BLOCK(macro_, ker, dtype, platform::avx2); \ + } else if (platform::MayIUse(platform::avx)) { \ + SEARCH_BLOCK(macro_, ker, dtype, platform::avx); \ + } else { \ + SEARCH_BLOCK(macro_, ker, dtype, platform::isa_any); \ } #define JITKERNEL_KEY(ker_key, dtype_key) \ @@ -156,10 +155,10 @@ namespace jit = platform::jit; marco_declare, macro_key, macro_impl) #define FOR_EACH_ISA(macro_, block) \ - macro_(jit::avx512f, block); \ - macro_(jit::avx2, block); \ - macro_(jit::avx, block); \ - macro_(jit::isa_any, block) + macro_(platform::avx512f, block); \ + macro_(platform::avx2, block); \ + macro_(platform::avx, block); \ + macro_(platform::isa_any, block) #define FOR_EACH_BLOCK(macro_, isa) \ macro_(isa, kLT8); \ @@ -168,11 +167,11 @@ namespace jit = platform::jit; macro_(isa, kEQ16); \ macro_(isa, kGT16) -#define FOR_EACH_ISA_BLOCK(macro_) \ - FOR_EACH_BLOCK(macro_, jit::avx512f); \ - FOR_EACH_BLOCK(macro_, jit::avx2); \ - FOR_EACH_BLOCK(macro_, jit::avx); \ - FOR_EACH_BLOCK(macro_, jit::isa_any) +#define FOR_EACH_ISA_BLOCK(macro_) \ + FOR_EACH_BLOCK(macro_, platform::avx512f); \ + FOR_EACH_BLOCK(macro_, platform::avx2); \ + FOR_EACH_BLOCK(macro_, platform::avx); \ + FOR_EACH_BLOCK(macro_, platform::isa_any) } // namespace jitkernel } // namespace math diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc index ed86a47e159cacd4f5572e22c7633f725aaeb516..19f7bd8909499c12fd5bee4db0d0a71a632e7f19 100644 --- a/paddle/fluid/operators/math/jit_kernel_test.cc +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -705,7 +705,7 @@ TEST(JitKernel, pool) { jit::lstm_attr_t attr(frame_size, act_gate, act_cand, act_cell, false); // empty call it to avoid unknown flag 'use_pinned_memory' on Mac - paddle::platform::jit::MayIUse(paddle::platform::jit::avx); + paddle::platform::MayIUse(paddle::platform::avx); const auto& plstm1 = jit::KernelPool::Instance() .template Get, const jit::lstm_attr_t&>(attr); diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index 71b9293eeded77553ca06a8574cca3941fa36b6a..5a6e64b6f87d33249f0153e5f391deaf78e53de5 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -89,6 +89,8 @@ template void MatrixBitCodeFunctor::Mul(framework::Tensor* tmat, const framework::Tensor& weight, const framework::Tensor& input) { + auto blas = + GetBlas(platform::CPUDeviceContext()); size_t num_samples = tmat->dims()[0]; size_t tmat_width = tmat->dims()[1]; size_t input_width = input.dims()[1]; @@ -99,13 +101,12 @@ void MatrixBitCodeFunctor::Mul(framework::Tensor* tmat, for (size_t i = 0; i < num_samples; ++i) { auto code = code_table_->get_code(i); int code_length = code->get_length(); + const T* input_row = input_value + input_width * i; for (int j = 0; j < code_length; ++j) { size_t index = code->calc_index(j); + const T* weight_row = weight_value + weight_width * index; T sum = static_cast(0.0); - for (size_t k = 0; k < input_width; ++k) { - sum += weight_value[weight_width * index + k] * - input_value[input_width * i + k]; - } + sum = blas.DOT(input_width, weight_row, input_row); tmat_value[i * tmat_width + j] += sum; } } @@ -115,6 +116,8 @@ template void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, framework::Tensor* weight, const framework::Tensor& input) { + auto blas = + GetBlas(platform::CPUDeviceContext()); size_t num_samples = tmat.dims()[0]; size_t input_width = input.dims()[1]; size_t tmat_width = tmat.dims()[1]; @@ -122,16 +125,25 @@ void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, auto tmat_value = tmat.data(); auto weight_value = weight->data(); auto input_value = input.data(); + + std::unordered_map>> ops; + for (size_t i = 0; i < num_samples; ++i) { auto code = code_table_->get_code(i); int code_length = code->get_length(); + const T* input_value_row = input_value + input_width * i; + const T* tmat_row = tmat_value + i * tmat_width; for (int j = 0; j < code_length; ++j) { - size_t index = code->calc_index(j); - - for (size_t k = 0; k < input_width; ++k) { - weight_value[weight_width * index + k] += - tmat_value[i * tmat_width + j] * input_value[input_width * i + k]; - } + ops[code->calc_index(j)].emplace_back(tmat_row[j], input_value_row); + } + } + for (auto& op : ops) { + auto& op_in_row = op.second; + for (auto& pair : op_in_row) { + auto& scale = pair.first; + auto* input_row = pair.second; + T* weight_row = weight_value + op.first * weight_width; + blas.AXPY(input_width, scale, input_row, weight_row); } } } @@ -140,6 +152,8 @@ template void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, framework::SelectedRows* weight, const framework::Tensor& input) { + auto blas = + GetBlas(platform::CPUDeviceContext()); size_t num_samples = tmat.dims()[0]; size_t input_width = input.dims()[1]; size_t tmat_width = tmat.dims()[1]; @@ -147,17 +161,28 @@ void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, auto tmat_value = tmat.data(); auto weight_value = weight->mutable_value()->data(); auto input_value = input.data(); + + std::unordered_map>> ops; + ops.reserve(weight->rows().size()); + for (size_t i = 0; i < num_samples; ++i) { auto code = code_table_->get_code(i); int code_length = code->get_length(); + const T* input_value_row = input_value + input_width * i; + const T* tmat_row = tmat_value + i * tmat_width; for (int j = 0; j < code_length; ++j) { - size_t index = code->calc_index(j); - for (size_t k = 0; k < input_width; ++k) { - int64_t row_index = weight->GetIndexFromId(static_cast(index)); - weight_value[row_index * weight_width + k] += - tmat_value[i * tmat_width + j] * input_value[input_width * i + k]; - } + ops[code->calc_index(j)].emplace_back(tmat_row[j], input_value_row); + } + } + + for (auto& row : weight->rows()) { + auto& op_in_row = ops[row]; + for (auto& pair : op_in_row) { + auto& scale = pair.first; + auto* input_row = pair.second; + blas.AXPY(input_width, scale, input_row, weight_value); } + weight_value += weight_width; } } diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index c30bb52641e865efe57659a551bc4b493634c6b9..35ca73802b48982ddf3ed7485b56f50221c9f28c 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -13,10 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/platform/device_context.h" #if defined(_WIN32) diff --git a/paddle/fluid/operators/math/prelu.cu b/paddle/fluid/operators/math/prelu.cu new file mode 100644 index 0000000000000000000000000000000000000000..701a802080f65ea32b95402682dc46362ccf0966 --- /dev/null +++ b/paddle/fluid/operators/math/prelu.cu @@ -0,0 +1,148 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/prelu.h" + +namespace paddle { +namespace operators { +namespace math { + +static const int CUDA_NUM_THREADS = 1024; +static const int CUDA_MAX_NUM_BLOCKS = 65535; +inline static int GET_NUM_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +template +__global__ void PReluChannelWiseKernel(const T *input, const T *alpha, + T *output, int channel, + size_t spatial_size) { + size_t offset = blockIdx.x * spatial_size; + const T *in = input + offset; + T *out = output + offset; + T scale = alpha[blockIdx.x % channel]; + + for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) { + T x = in[i]; + out[i] = (x > 0) ? x : scale * x; + } +} + +template +__global__ void PReluElementWiseKernel(const T *input, const T *alpha, + T *output, size_t spatial_size) { + size_t offset = blockIdx.x * spatial_size; + const T *in = input + offset; + const T *scale = alpha + offset; + T *out = output + offset; + + for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) { + T x = in[i]; + out[i] = (x > 0) ? x : scale[i] * x; + } +} + +template +__global__ void PReluScalarKernel(const T *input, const T *alpha, T *output, + size_t spatial_size) { + size_t offset = blockIdx.x * spatial_size; + const T *in = input + offset; + T scale = *alpha; + T *out = output + offset; + + for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) { + T x = in[i]; + out[i] = (x > 0) ? x : scale * x; + } +} + +template +static inline void PReluChannelWise(cudaStream_t stream, const T *input, + const T *alpha, T *output, + std::vector input_shape) { + size_t unroll = input_shape[0] * input_shape[1]; + size_t spatial_size = input_shape[2] * input_shape[3]; + CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS); + PReluChannelWiseKernel<<>>( + input, alpha, output, input_shape[1], spatial_size); +} + +template +static inline void PReluElementWise(cudaStream_t stream, const T *input, + const T *alpha, T *output, + std::vector input_shape) { + size_t unroll = input_shape[0] * input_shape[1]; + size_t spatial_size = input_shape[2] * input_shape[3]; + CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS); + PReluElementWiseKernel<<>>( + input, alpha, output, spatial_size); +} + +template +static inline void PReluScalar(cudaStream_t stream, const T *input, + const T *alpha, T *output, + std::vector input_shape) { + size_t unroll = input_shape[0] * input_shape[1]; + size_t spatial_size = input_shape[2] * input_shape[3]; + CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS); + PReluScalarKernel<<>>( + input, alpha, output, spatial_size); +} + +template +void PreluChannelWiseDirectCUDAFunctor::operator()( + cudaStream_t stream, const T *input, const T *alpha, T *output, + std::vector input_shape) { + size_t unroll = input_shape[0] * input_shape[1]; + size_t spatial_size = input_shape[2] * input_shape[3]; + CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS); + PReluChannelWiseKernel<<>>( + input, alpha, output, input_shape[1], spatial_size); +} + +template +void PreluElementWiseDirectCUDAFunctor::operator()( + cudaStream_t stream, const T *input, const T *alpha, T *output, + std::vector input_shape) { + size_t unroll = input_shape[0] * input_shape[1]; + size_t spatial_size = input_shape[2] * input_shape[3]; + CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS); + PReluElementWiseKernel<<>>( + input, alpha, output, spatial_size); +} + +template +void PreluScalarDirectCUDAFunctor::operator()(cudaStream_t stream, + const T *input, const T *alpha, + T *output, + std::vector input_shape) { + size_t unroll = input_shape[0] * input_shape[1]; + size_t spatial_size = input_shape[2] * input_shape[3]; + CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS); + PReluScalarKernel<<>>( + input, alpha, output, spatial_size); +} + +template class PreluChannelWiseDirectCUDAFunctor; +template class PreluChannelWiseDirectCUDAFunctor; + +template class PreluElementWiseDirectCUDAFunctor; +template class PreluElementWiseDirectCUDAFunctor; + +template class PreluScalarDirectCUDAFunctor; +template class PreluScalarDirectCUDAFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/prelu.h b/paddle/fluid/operators/math/prelu.h new file mode 100644 index 0000000000000000000000000000000000000000..3237c6d4cbf956aafb4046ea2ffa42efe62e7b28 --- /dev/null +++ b/paddle/fluid/operators/math/prelu.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +#ifdef PADDLE_WITH_CUDA +template +class PreluChannelWiseDirectCUDAFunctor { + public: + void operator()(cudaStream_t stream, const T *input, const T *alpha, + T *output, std::vector input_shape); +}; + +template +class PreluElementWiseDirectCUDAFunctor { + public: + void operator()(cudaStream_t stream, const T *input, const T *alpha, + T *output, std::vector input_shape); +}; + +template +class PreluScalarDirectCUDAFunctor { + public: + void operator()(cudaStream_t stream, const T *input, const T *alpha, + T *output, std::vector input_shape); +}; +#endif + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/merge_selected_rows_op.cc b/paddle/fluid/operators/merge_selected_rows_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3c15c839554599104d21a5225c078d41735c4a60 --- /dev/null +++ b/paddle/fluid/operators/merge_selected_rows_op.cc @@ -0,0 +1,72 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/merge_selected_rows_op.h" + +namespace paddle { +namespace operators { + +class MergeSelectedRowsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of MergeSelectedRowsOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of MergeSelectedRowsOp should not be null."); + ctx->ShareDim("X", /*->*/ "Out"); + } +}; + +class MergeSelectedRowsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "The input type is SelectedRows, and the selected rows may be " + "duplicated."); + AddOutput("Out", + "The output type is SelectedRows, and the selected rows are not " + "duplicated."); + AddComment( + R"DOC( +MergeSelectedRows Operator. + +MergeSelectedRows is used to merge the duplicated rows of the input. +)DOC"); + } +}; + +class MergeSelectedRowsOpInferVarType + : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map GetInputOutputWithSameType() + const override { + return std::unordered_map{{"X", /*->*/ "Out"}}; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OPERATOR(merge_selected_rows, ops::MergeSelectedRowsOp, + ops::MergeSelectedRowsOpMaker, + ops::MergeSelectedRowsOpInferVarType); + +REGISTER_OP_CPU_KERNEL( + merge_selected_rows, + ops::MergeSelectedRowsKernel, + ops::MergeSelectedRowsKernel); diff --git a/paddle/fluid/operators/merge_selected_rows_op.cu.cc b/paddle/fluid/operators/merge_selected_rows_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..90d5fb3eaeb1f155eeea29ea0cf3f5ecd610f5f0 --- /dev/null +++ b/paddle/fluid/operators/merge_selected_rows_op.cu.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/merge_selected_rows_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + merge_selected_rows, + ops::MergeSelectedRowsKernel, + ops::MergeSelectedRowsKernel); diff --git a/paddle/fluid/operators/merge_selected_rows_op.h b/paddle/fluid/operators/merge_selected_rows_op.h new file mode 100644 index 0000000000000000000000000000000000000000..4c977e94b175c988e4253b273365b0cabc4b87aa --- /dev/null +++ b/paddle/fluid/operators/merge_selected_rows_op.h @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" + +namespace paddle { +namespace operators { + +template +class MergeSelectedRowsKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + + math::scatter::MergeAdd merge_func; + merge_func(context.template device_context(), *x, out); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/metrics/auc_op.h b/paddle/fluid/operators/metrics/auc_op.h index fb370842d1942c3b3eebecb1fe5e8ffb845cb34b..4ab5cfe53c67eeaa995d7e955eec63a065c5eec5 100644 --- a/paddle/fluid/operators/metrics/auc_op.h +++ b/paddle/fluid/operators/metrics/auc_op.h @@ -75,8 +75,13 @@ class AucKernel : public framework::OpKernel { const auto *label_data = label->data(); for (size_t i = 0; i < batch_size; i++) { - uint32_t binIdx = static_cast( - inference_data[i * inference_width + 1] * num_thresholds); + auto predict_data = inference_data[i * inference_width + 1]; + PADDLE_ENFORCE_LE(predict_data, 1, + "The predict data must less or equal 1."); + PADDLE_ENFORCE_GE(predict_data, 0, + "The predict data must gather or equal 0."); + + uint32_t binIdx = static_cast(predict_data * num_thresholds); if (label_data[i]) { (*stat_pos)[binIdx] += 1.0; } else { diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc index a706d05fd7c35ef993f5199f0f893622cb863c5d..a9da21f47902e20cc7460461caca79c3f3292c5a 100644 --- a/paddle/fluid/operators/pad2d_op.cc +++ b/paddle/fluid/operators/pad2d_op.cc @@ -319,20 +319,46 @@ void Pad2DGradEdgeNHWC(T* d_in_data, const int num, const int channels, } } +static inline void GetPaddings(int* paddings, + const framework::ExecutionContext& context) { + auto* paddings_t = context.Input("Paddings"); + if (paddings_t) { + auto paddings_data = paddings_t->data(); + paddings[0] = paddings_data[0]; + paddings[1] = paddings_data[1]; + paddings[2] = paddings_data[2]; + paddings[3] = paddings_data[3]; + } else { + auto pads = context.Attr>("paddings"); + std::copy(pads.begin(), pads.end(), paddings); + } +} + template class Pad2dCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto pads = context.Attr>("paddings"); + int pads[4]; + GetPaddings(pads, context); auto mode = context.Attr("mode"); auto data_format = context.Attr("data_format"); T value = context.Attr("pad_value"); + auto* x = context.Input("X"); - auto* out = context.Output("Out"); auto in_dims = x->dims(); - auto out_dims = out->dims(); const T* in_data = x->data(); + + auto* out = context.Output("Out"); + if (data_format == "NCHW") { + out->Resize({in_dims[0], in_dims[1], in_dims[2] + pads[0] + pads[1], + in_dims[3] + pads[2] + pads[3]}); + } else { + out->Resize({in_dims[0], in_dims[1] + pads[0] + pads[1], + in_dims[2] + pads[2] + pads[3], in_dims[3]}); + } + auto out_dims = out->dims(); T* out_data = out->mutable_data(context.GetPlace()); + const int pad_top = pads[0]; const int pad_left = pads[2]; const int num = in_dims[0]; @@ -376,7 +402,8 @@ template class Pad2dGradCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto pads = context.Attr>("paddings"); + int pads[4]; + GetPaddings(pads, context); auto mode = context.Attr("mode"); auto data_format = context.Attr("data_format"); auto* d_out = context.Input(framework::GradVarName("Out")); @@ -442,21 +469,35 @@ class Pad2dOp : public framework::OperatorWithKernel { "Output(Out) of Pad2dOp should not be null."); auto x_dim = ctx->GetInputDim("X"); - auto paddings = ctx->Attrs().Get>("paddings"); PADDLE_ENFORCE_EQ(x_dim.size(), 4, - "Size of paddings should be equal to 4."); - std::vector out_dims(x_dim.size()); + "The size of input(X)'s dimension should be equal to 4."); + std::vector out_dims(x_dim.size()); auto data_format = ctx->Attrs().Get("data_format"); out_dims[0] = x_dim[0]; - if (data_format == "NCHW") { + if (ctx->HasInput("Paddings")) { + auto paddings_dim = ctx->GetInputDim("Paddings"); + PADDLE_ENFORCE_EQ( + paddings_dim.size(), 1, + "Size of Input(Paddings)'s dimension should be equal to 1."); + PADDLE_ENFORCE_EQ(paddings_dim[0], 4, + "Shape of Input(Paddings) should be equal to [4]."); out_dims[1] = x_dim[1]; - out_dims[2] = x_dim[2] + paddings[0] + paddings[1]; // height - out_dims[3] = x_dim[3] + paddings[2] + paddings[3]; // width - } else { // NHWC + out_dims[2] = x_dim[2]; out_dims[3] = x_dim[3]; - out_dims[1] = x_dim[1] + paddings[0] + paddings[1]; - out_dims[2] = x_dim[2] + paddings[2] + paddings[3]; + } else { + auto paddings = ctx->Attrs().Get>("paddings"); + PADDLE_ENFORCE_EQ(paddings.size(), 4, + "Size of paddings should be equal to 4."); + if (data_format == "NCHW") { + out_dims[1] = x_dim[1]; + out_dims[2] = x_dim[2] + paddings[0] + paddings[1]; // height + out_dims[3] = x_dim[3] + paddings[2] + paddings[3]; // width + } else { // NHWC + out_dims[3] = x_dim[3]; + out_dims[1] = x_dim[1] + paddings[0] + paddings[1]; + out_dims[2] = x_dim[2] + paddings[2] + paddings[3]; + } } ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); @@ -466,6 +507,13 @@ class Pad2dOp : public framework::OperatorWithKernel { ctx->ShareLoD("X", /*->*/ "Out"); } } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace()); + } }; class Pad2dOpMaker : public framework::OpProtoAndCheckerMaker { @@ -477,6 +525,12 @@ class Pad2dOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "The output of pad2d op. " "A tensor with the same shape as X."); + AddInput("Paddings", + "A 1-D tensor to describe the padding rules." + "paddings=[0, 1, 2, 3] means " + "padding 0 row to top, 1 row to bottom, 2 columns to left " + "and 3 columns to right. Size of paddings must be 4.") + .AsDispensable(); AddAttr>( "paddings", "(vector) " @@ -554,6 +608,13 @@ class Pad2dOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(x_grad_name, x_dims); } } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace()); + } }; class Pad2dOpGradMaker : public framework::SingleGradOpDescMaker { @@ -564,6 +625,7 @@ class Pad2dOpGradMaker : public framework::SingleGradOpDescMaker { std::unique_ptr Apply() const override { auto* bind = new framework::OpDesc(); bind->SetInput("X", Input("X")); + bind->SetInput("Paddings", Input("Paddings")); bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); bind->SetOutput(framework::GradVarName("X"), InputGrad("X")); bind->SetAttrMap(Attrs()); diff --git a/paddle/fluid/operators/pad2d_op.cu b/paddle/fluid/operators/pad2d_op.cu index 9ba0ddbd84a43cfd5f028ce072b5c7606fae343d..72eca08b06b144335424a669241b5754beda758d 100644 --- a/paddle/fluid/operators/pad2d_op.cu +++ b/paddle/fluid/operators/pad2d_op.cu @@ -287,20 +287,50 @@ __global__ void Pad2DGradEdgeNHWC(const int out_size, T* d_in_data, } } +static inline void GetPaddings(int* paddings, + const framework::ExecutionContext& context) { + auto* paddings_t = context.Input("Paddings"); + if (paddings_t) { + Tensor pads; + framework::TensorCopySync(*paddings_t, platform::CPUPlace(), &pads); + auto pads_data = pads.data(); + paddings[0] = pads_data[0]; + paddings[1] = pads_data[1]; + paddings[2] = pads_data[2]; + paddings[3] = pads_data[3]; + } else { + auto pads = context.Attr>("paddings"); + std::copy(pads.begin(), pads.end(), paddings); + } +} + template class Pad2dCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto pads = context.Attr>("paddings"); + int pads[4]; + GetPaddings(pads, context); auto mode = context.Attr("mode"); auto data_format = context.Attr("data_format"); T value = context.Attr("pad_value"); + auto* x = context.Input("X"); - auto* out = context.Output("Out"); auto in_dims = x->dims(); - auto out_dims = out->dims(); const T* in_data = x->data(); - T* out_data = out->mutable_data(context.GetPlace()); + auto* out = context.Output("Out"); + auto out_dims = out->dims(); + if (data_format == "NCHW") { + out_dims[0] = in_dims[0]; + out_dims[1] = in_dims[1]; + out_dims[2] = in_dims[2] + pads[0] + pads[1]; + out_dims[3] = in_dims[3] + pads[2] + pads[3]; + } else { + out_dims[0] = in_dims[0]; + out_dims[1] = in_dims[1] + pads[0] + pads[1]; + out_dims[2] = in_dims[2] + pads[2] + pads[3]; + out_dims[3] = in_dims[3]; + } + T* out_data = out->mutable_data(out_dims, context.GetPlace()); const int pad_top = pads[0]; const int pad_left = pads[2]; const int num = in_dims[0]; @@ -356,7 +386,8 @@ template class Pad2dGradCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto pads = context.Attr>("paddings"); + int pads[4]; + GetPaddings(pads, context); auto mode = context.Attr("mode"); auto data_format = context.Attr("data_format"); auto* d_out = context.Input(framework::GradVarName("Out")); diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index 58cfbb76e93a1c15c9b7cf9f9e596066c29b7ebb..64d94ab6044c1992145062319120b0372f5061c0 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -58,7 +58,7 @@ class PReluOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), - platform::CPUPlace()); + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/prelu_op.cu b/paddle/fluid/operators/prelu_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..36b5259ae5106914f5668625cad535ebc8aa72ec --- /dev/null +++ b/paddle/fluid/operators/prelu_op.cu @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/prelu.h" +#include "paddle/fluid/operators/prelu_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class CUDAPReluKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* alpha = context.Input("Alpha"); + auto* out = context.Output("Out"); + + const T* x_ptr = x->data(); + T* o_ptr = out->mutable_data(context.GetPlace()); + + const T* alpha_ptr = alpha->data(); + auto& mode = context.Attr("mode"); + + int numel = x->numel(); + auto dim = x->dims(); + std::vector input_shape = framework::vectorize2int(dim); + + if (mode == "channel") { + math::PreluChannelWiseDirectCUDAFunctor prelu_channel_wise; + prelu_channel_wise(context.cuda_device_context().stream(), x_ptr, + alpha_ptr, o_ptr, input_shape); + } else if (mode == "element") { + math::PreluElementWiseDirectCUDAFunctor prelu_element_wise; + prelu_element_wise(context.cuda_device_context().stream(), x_ptr, + alpha_ptr, o_ptr, input_shape); + } else { + math::PreluScalarDirectCUDAFunctor prelu_scalar; + prelu_scalar(context.cuda_device_context().stream(), x_ptr, alpha_ptr, + o_ptr, input_shape); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + prelu, ops::CUDAPReluKernel, + ops::CUDAPReluKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h index 18acb735cecabd1e01f7821c880fd8ed5e52971f..8fceed3558b4357b7863368c18add329ea9922b3 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h @@ -36,12 +36,10 @@ class SequenceMaskOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must exist"); PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) must exist"); - auto maxlen = ctx->Attrs().Get("maxlen"); - if (maxlen > 0) { // We can only infershape when maxlen > 0 - auto dim = framework::vectorize2int(ctx->GetInputDim("X")); - dim.push_back(maxlen); - ctx->SetOutputDim("Y", framework::make_ddim(dim)); - } + int maxlen = ctx->Attrs().Get("maxlen"); + auto dim = framework::vectorize2int(ctx->GetInputDim("X")); + dim.push_back(maxlen > 0 ? maxlen : -1); + ctx->SetOutputDim("Y", framework::make_ddim(dim)); } }; diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index 193de05422bb78572c0e5eaf4cd46744c3bcb113..14746fa95159d707be7c10c69a4ffc2211e17a93 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -18,6 +18,7 @@ namespace paddle { namespace operators { using framework::Tensor; +const int kIgnoreIndex = -100; class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel { public: @@ -100,6 +101,11 @@ class SigmoidCrossEntropyWithLogitsOpMaker AddOutput("Out", "(Tensor, default Tensor), a 2-D tensor with shape N x D " " of elementwise logistic losses."); + AddAttr("ignore_index", + "(int, default kIgnoreIndex), Specifies a target value that " + "is ignored and" + "does not contribute to the input gradient.") + .SetDefault(kIgnoreIndex); AddComment(R"DOC( SigmoidCrossEntropyWithLogits Operator. diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h index faef72866eb491887bbf221d32a8121b21fc3c66..b8731c232753074fa9e76b028485d3598c9a7295 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h @@ -15,33 +15,72 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/legacy/utils/Logging.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; +template +using EigenMatrix = framework::EigenMatrix; + +template +struct SigmoidCrossEntropyWithLogitsForward { + HOSTDEVICE SigmoidCrossEntropyWithLogitsForward(const int &ignore_index) + : ignore_index(ignore_index) {} + + HOSTDEVICE T operator()(const T &x, const T &label) const { + if (static_cast(label) == ignore_index) { + return static_cast(0.); + } + T term1 = (x > 0) ? x : 0; + T term2 = x * label; + T term3 = std::log(static_cast(1) + std::exp(-(std::abs(x)))); + return term1 - term2 + term3; + } + + int ignore_index; +}; + +template +struct SigmoidCrossEntropyWithLogitsBackward { + HOSTDEVICE SigmoidCrossEntropyWithLogitsBackward(const int &ignore_index) + : ignore_index(ignore_index) {} + + HOSTDEVICE T operator()(const T &x, const T &label) const { + if (static_cast(label) == ignore_index) { + return static_cast(0.); + } + T simoid_x = static_cast(1) / (static_cast(1) + std::exp(-x)); + return simoid_x - label; + } + + int ignore_index; +}; + // Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) template class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - const framework::Tensor *X = context.Input("X"); - const framework::Tensor *Labels = context.Input("Label"); - framework::Tensor *Out = context.Output("Out"); + const Tensor *X = context.Input("X"); + const Tensor *Labels = context.Input("Label"); + Tensor *Out = context.Output("Out"); Out->mutable_data(context.GetPlace()); + int ignore_index = context.Attr("ignore_index"); - auto x = framework::EigenVector::Flatten(*X); - auto labels = framework::EigenVector::Flatten(*Labels); - auto out = framework::EigenVector::Flatten(*Out); + auto x = EigenVector::Flatten(*X); + auto labels = EigenVector::Flatten(*Labels); + auto out = EigenVector::Flatten(*Out); auto &place = *context.device_context().eigen_device(); - // term1 = max(x, 0) - auto term1 = x.cwiseMax(static_cast(0)); - // term2 = x * labels - auto term2 = x * labels; - // term3 = log(1 + exp(-abs(x))) - auto term3 = (static_cast(1) + (-(x.abs())).exp()).log(); - - out.device(place) = term1 - term2 + term3; + out.device(place) = x.binaryExpr( + labels, SigmoidCrossEntropyWithLogitsForward(ignore_index)); } }; @@ -50,23 +89,23 @@ template class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - const framework::Tensor *X = context.Input("X"); - const framework::Tensor *Labels = context.Input("Label"); - const framework::Tensor *dOut = - context.Input(framework::GradVarName("Out")); - framework::Tensor *dX = - context.Output(framework::GradVarName("X")); + const Tensor *X = context.Input("X"); + const Tensor *Labels = context.Input("Label"); + const Tensor *dOut = context.Input(framework::GradVarName("Out")); + Tensor *dX = context.Output(framework::GradVarName("X")); dX->mutable_data(context.GetPlace()); - auto x = framework::EigenVector::Flatten(*X); - auto labels = framework::EigenVector::Flatten(*Labels); - auto dout = framework::EigenVector::Flatten(*dOut); - auto dx = framework::EigenVector::Flatten(*dX); + auto ignore_index = context.Attr("ignore_index"); + auto x = EigenVector::Flatten(*X); + auto labels = EigenVector::Flatten(*Labels); + auto dout = EigenVector::Flatten(*dOut); + auto dx = EigenVector::Flatten(*dX); auto &place = *context.template device_context().eigen_device(); - auto sigmoid_x = static_cast(1) / (static_cast(1) + (-x).exp()); - dx.device(place) = dout * (sigmoid_x - labels); + auto diff = x.binaryExpr(labels, SigmoidCrossEntropyWithLogitsBackward( + static_cast(ignore_index))); + dx.device(place) = dout * diff; } }; diff --git a/paddle/fluid/operators/softmax_mkldnn_op.cc b/paddle/fluid/operators/softmax_mkldnn_op.cc index 01819f53e3ab0973f6140c5a81f18f954b6a0376..d2b149535426d097fea4b8fffa9efe82bd6edc64 100644 --- a/paddle/fluid/operators/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/softmax_mkldnn_op.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include "mkldnn.hpp" #include "paddle/fluid/operators/softmax_op.h" -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/yolov3_loss_op.cc b/paddle/fluid/operators/yolov3_loss_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e7597f732430038a4a180297e730340d1bc47b8c --- /dev/null +++ b/paddle/fluid/operators/yolov3_loss_op.cc @@ -0,0 +1,221 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/operators/yolov3_loss_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class Yolov3LossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of Yolov3LossOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("GTBox"), + "Input(GTBox) of Yolov3LossOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("GTLabel"), + "Input(GTLabel) of Yolov3LossOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Loss"), + "Output(Loss) of Yolov3LossOp should not be null."); + + auto dim_x = ctx->GetInputDim("X"); + auto dim_gtbox = ctx->GetInputDim("GTBox"); + auto dim_gtlabel = ctx->GetInputDim("GTLabel"); + auto anchors = ctx->Attrs().Get>("anchors"); + auto class_num = ctx->Attrs().Get("class_num"); + PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor."); + PADDLE_ENFORCE_EQ(dim_x[2], dim_x[3], + "Input(X) dim[3] and dim[4] should be euqal."); + PADDLE_ENFORCE_EQ(dim_x[1], anchors.size() / 2 * (5 + class_num), + "Input(X) dim[1] should be equal to (anchor_number * (5 " + "+ class_num))."); + PADDLE_ENFORCE_EQ(dim_gtbox.size(), 3, + "Input(GTBox) should be a 3-D tensor"); + PADDLE_ENFORCE_EQ(dim_gtbox[2], 4, "Input(GTBox) dim[2] should be 5"); + PADDLE_ENFORCE_EQ(dim_gtlabel.size(), 2, + "Input(GTBox) should be a 2-D tensor"); + PADDLE_ENFORCE_EQ(dim_gtlabel[0], dim_gtbox[0], + "Input(GTBox) and Input(GTLabel) dim[0] should be same"); + PADDLE_ENFORCE_EQ(dim_gtlabel[1], dim_gtbox[1], + "Input(GTBox) and Input(GTLabel) dim[1] should be same"); + PADDLE_ENFORCE_GT(anchors.size(), 0, + "Attr(anchors) length should be greater then 0."); + PADDLE_ENFORCE_EQ(anchors.size() % 2, 0, + "Attr(anchors) length should be even integer."); + PADDLE_ENFORCE_GT(class_num, 0, + "Attr(class_num) should be an integer greater then 0."); + + std::vector dim_out({1}); + ctx->SetOutputDim("Loss", framework::make_ddim(dim_out)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + platform::CPUPlace()); + } +}; + +class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "The input tensor of YOLO v3 loss operator, " + "This is a 4-D tensor with shape of [N, C, H, W]." + "H and W should be same, and the second dimention(C) stores" + "box locations, confidence score and classification one-hot" + "key of each anchor box"); + AddInput("GTBox", + "The input tensor of ground truth boxes, " + "This is a 3-D tensor with shape of [N, max_box_num, 5], " + "max_box_num is the max number of boxes in each image, " + "In the third dimention, stores x, y, w, h coordinates, " + "x, y is the center cordinate of boxes and w, h is the " + "width and height and x, y, w, h should be divided by " + "input image height to scale to [0, 1]."); + AddInput("GTLabel", + "The input tensor of ground truth label, " + "This is a 2-D tensor with shape of [N, max_box_num], " + "and each element shoudl be an integer to indicate the " + "box class id."); + AddOutput("Loss", + "The output yolov3 loss tensor, " + "This is a 1-D tensor with shape of [1]"); + + AddAttr("class_num", "The number of classes to predict."); + AddAttr>("anchors", + "The anchor width and height, " + "it will be parsed pair by pair."); + AddAttr("ignore_thresh", + "The ignore threshold to ignore confidence loss."); + AddAttr("loss_weight_xy", "The weight of x, y location loss.") + .SetDefault(1.0); + AddAttr("loss_weight_wh", "The weight of w, h location loss.") + .SetDefault(1.0); + AddAttr( + "loss_weight_conf_target", + "The weight of confidence score loss in locations with target object.") + .SetDefault(1.0); + AddAttr("loss_weight_conf_notarget", + "The weight of confidence score loss in locations without " + "target object.") + .SetDefault(1.0); + AddAttr("loss_weight_class", "The weight of classification loss.") + .SetDefault(1.0); + AddComment(R"DOC( + This operator generate yolov3 loss by given predict result and ground + truth boxes. + + The output of previous network is in shape [N, C, H, W], while H and W + should be the same, specify the grid size, each grid point predict given + number boxes, this given number is specified by anchors, it should be + half anchors length, which following will be represented as S. In the + second dimention(the channel dimention), C should be S * (class_num + 5), + class_num is the box categoriy number of source dataset(such as coco), + so in the second dimention, stores 4 box location coordinates x, y, w, h + and confidence score of the box and class one-hot key of each anchor box. + + While the 4 location coordinates if $$tx, ty, tw, th$$, the box predictions + correspnd to: + + $$ + b_x = \sigma(t_x) + c_x + b_y = \sigma(t_y) + c_y + b_w = p_w e^{t_w} + b_h = p_h e^{t_h} + $$ + + While $$c_x, c_y$$ is the left top corner of current grid and $$p_w, p_h$$ + is specified by anchors. + + As for confidence score, it is the logistic regression value of IoU between + anchor boxes and ground truth boxes, the score of the anchor box which has + the max IoU should be 1, and if the anchor box has IoU bigger then ignore + thresh, the confidence score loss of this anchor box will be ignored. + + Therefore, the yolov3 loss consist of three major parts, box location loss, + confidence score loss, and classification loss. The MSE loss is used for + box location, and binary cross entropy loss is used for confidence score + loss and classification loss. + + Final loss will be represented as follow. + + $$ + loss = \loss_weight_{xy} * loss_{xy} + \loss_weight_{wh} * loss_{wh} + + \loss_weight_{conf_target} * loss_{conf_target} + + \loss_weight_{conf_notarget} * loss_{conf_notarget} + + \loss_weight_{class} * loss_{class} + $$ + )DOC"); + } +}; + +class Yolov3LossOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")), + "Input(Loss@GRAD) should not be null"); + auto dim_x = ctx->GetInputDim("X"); + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), dim_x); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + platform::CPUPlace()); + } +}; + +class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType("yolov3_loss_grad"); + op->SetInput("X", Input("X")); + op->SetInput("GTBox", Input("GTBox")); + op->SetInput("GTLabel", Input("GTLabel")); + op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); + + op->SetAttrMap(Attrs()); + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("GTBox"), {}); + op->SetOutput(framework::GradVarName("GTLabel"), {}); + return std::unique_ptr(op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(yolov3_loss, ops::Yolov3LossOp, ops::Yolov3LossOpMaker, + ops::Yolov3LossGradMaker); +REGISTER_OPERATOR(yolov3_loss_grad, ops::Yolov3LossOpGrad); +REGISTER_OP_CPU_KERNEL(yolov3_loss, ops::Yolov3LossKernel, + ops::Yolov3LossKernel); +REGISTER_OP_CPU_KERNEL(yolov3_loss_grad, ops::Yolov3LossGradKernel, + ops::Yolov3LossGradKernel); diff --git a/paddle/fluid/operators/yolov3_loss_op.h b/paddle/fluid/operators/yolov3_loss_op.h new file mode 100644 index 0000000000000000000000000000000000000000..0bb285722ddedf721d98237760ec9868e2134442 --- /dev/null +++ b/paddle/fluid/operators/yolov3_loss_op.h @@ -0,0 +1,483 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenTensor = framework::EigenTensor; +template +using EigenVector = framework::EigenVector; + +using Array5 = Eigen::DSizes; + +template +static inline bool isZero(T x) { + return fabs(x) < 1e-6; +} + +template +static inline T sigmoid(T x) { + return 1.0 / (exp(-1.0 * x) + 1.0); +} + +template +static inline T CalcMaskPointNum(const Tensor& mask) { + auto mask_t = EigenVector::Flatten(mask); + T count = 0.0; + for (int i = 0; i < mask_t.dimensions()[0]; i++) { + if (mask_t(i)) { + count += 1.0; + } + } + return count; +} + +template +static inline T CalcMSEWithMask(const Tensor& x, const Tensor& y, + const Tensor& mask) { + auto x_t = EigenVector::Flatten(x); + auto y_t = EigenVector::Flatten(y); + auto mask_t = EigenVector::Flatten(mask); + + T error_sum = 0.0; + T points = 0.0; + for (int i = 0; i < x_t.dimensions()[0]; i++) { + if (mask_t(i)) { + error_sum += pow(x_t(i) - y_t(i), 2); + points += 1; + } + } + return (error_sum / points); +} + +template +static void CalcMSEGradWithMask(Tensor* grad, const Tensor& x, const Tensor& y, + const Tensor& mask, T mf) { + auto grad_t = EigenVector::Flatten(*grad).setConstant(0.0); + auto x_t = EigenVector::Flatten(x); + auto y_t = EigenVector::Flatten(y); + auto mask_t = EigenVector::Flatten(mask); + + for (int i = 0; i < x_t.dimensions()[0]; i++) { + if (mask_t(i)) { + grad_t(i) = 2.0 * (x_t(i) - y_t(i)) / mf; + } + } +} + +template +static inline T CalcBCEWithMask(const Tensor& x, const Tensor& y, + const Tensor& mask) { + auto x_t = EigenVector::Flatten(x); + auto y_t = EigenVector::Flatten(y); + auto mask_t = EigenVector::Flatten(mask); + + T error_sum = 0.0; + T points = 0.0; + for (int i = 0; i < x_t.dimensions()[0]; i++) { + if (mask_t(i)) { + error_sum += + -1.0 * (y_t(i) * log(x_t(i)) + (1.0 - y_t(i)) * log(1.0 - x_t(i))); + points += 1; + } + } + return (error_sum / points); +} + +template +static inline void CalcBCEGradWithMask(Tensor* grad, const Tensor& x, + const Tensor& y, const Tensor& mask, + T mf) { + auto grad_t = EigenVector::Flatten(*grad).setConstant(0.0); + auto x_t = EigenVector::Flatten(x); + auto y_t = EigenVector::Flatten(y); + auto mask_t = EigenVector::Flatten(mask); + + for (int i = 0; i < x_t.dimensions()[0]; i++) { + if (mask_t(i)) { + grad_t(i) = ((1.0 - y_t(i)) / (1.0 - x_t(i)) - y_t(i) / x_t(i)) / mf; + } + } +} + +template +static void CalcPredResult(const Tensor& input, Tensor* pred_conf, + Tensor* pred_class, Tensor* pred_x, Tensor* pred_y, + Tensor* pred_w, Tensor* pred_h, const int anchor_num, + const int class_num) { + const int n = input.dims()[0]; + const int h = input.dims()[2]; + const int w = input.dims()[3]; + const int box_attr_num = 5 + class_num; + + auto input_t = EigenTensor::From(input); + auto pred_conf_t = EigenTensor::From(*pred_conf); + auto pred_class_t = EigenTensor::From(*pred_class); + auto pred_x_t = EigenTensor::From(*pred_x); + auto pred_y_t = EigenTensor::From(*pred_y); + auto pred_w_t = EigenTensor::From(*pred_w); + auto pred_h_t = EigenTensor::From(*pred_h); + + for (int i = 0; i < n; i++) { + for (int an_idx = 0; an_idx < anchor_num; an_idx++) { + for (int j = 0; j < h; j++) { + for (int k = 0; k < w; k++) { + pred_x_t(i, an_idx, j, k) = + sigmoid(input_t(i, box_attr_num * an_idx, j, k)); + pred_y_t(i, an_idx, j, k) = + sigmoid(input_t(i, box_attr_num * an_idx + 1, j, k)); + pred_w_t(i, an_idx, j, k) = + input_t(i, box_attr_num * an_idx + 2, j, k); + pred_h_t(i, an_idx, j, k) = + input_t(i, box_attr_num * an_idx + 3, j, k); + + pred_conf_t(i, an_idx, j, k) = + sigmoid(input_t(i, box_attr_num * an_idx + 4, j, k)); + + for (int c = 0; c < class_num; c++) { + pred_class_t(i, an_idx, j, k, c) = + sigmoid(input_t(i, box_attr_num * an_idx + 5 + c, j, k)); + } + } + } + } + } +} + +template +static T CalcBoxIoU(std::vector box1, std::vector box2) { + T b1_x1 = box1[0] - box1[2] / 2; + T b1_x2 = box1[0] + box1[2] / 2; + T b1_y1 = box1[1] - box1[3] / 2; + T b1_y2 = box1[1] + box1[3] / 2; + T b2_x1 = box2[0] - box2[2] / 2; + T b2_x2 = box2[0] + box2[2] / 2; + T b2_y1 = box2[1] - box2[3] / 2; + T b2_y2 = box2[1] + box2[3] / 2; + + T b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1); + T b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1); + + T inter_rect_x1 = std::max(b1_x1, b2_x1); + T inter_rect_y1 = std::max(b1_y1, b2_y1); + T inter_rect_x2 = std::min(b1_x2, b2_x2); + T inter_rect_y2 = std::min(b1_y2, b2_y2); + T inter_area = std::max(inter_rect_x2 - inter_rect_x1, static_cast(0.0)) * + std::max(inter_rect_y2 - inter_rect_y1, static_cast(0.0)); + + return inter_area / (b1_area + b2_area - inter_area); +} + +template +static void PreProcessGTBox(const Tensor& gt_box, const Tensor& gt_label, + const float ignore_thresh, std::vector anchors, + const int grid_size, Tensor* obj_mask, + Tensor* noobj_mask, Tensor* tx, Tensor* ty, + Tensor* tw, Tensor* th, Tensor* tconf, + Tensor* tclass) { + const int n = gt_box.dims()[0]; + const int b = gt_box.dims()[1]; + const int anchor_num = anchors.size() / 2; + auto gt_box_t = EigenTensor::From(gt_box); + auto gt_label_t = EigenTensor::From(gt_label); + auto obj_mask_t = EigenTensor::From(*obj_mask).setConstant(0); + auto noobj_mask_t = EigenTensor::From(*noobj_mask).setConstant(1); + auto tx_t = EigenTensor::From(*tx).setConstant(0.0); + auto ty_t = EigenTensor::From(*ty).setConstant(0.0); + auto tw_t = EigenTensor::From(*tw).setConstant(0.0); + auto th_t = EigenTensor::From(*th).setConstant(0.0); + auto tconf_t = EigenTensor::From(*tconf).setConstant(0.0); + auto tclass_t = EigenTensor::From(*tclass).setConstant(0.0); + + for (int i = 0; i < n; i++) { + for (int j = 0; j < b; j++) { + if (isZero(gt_box_t(i, j, 0)) && isZero(gt_box_t(i, j, 1)) && + isZero(gt_box_t(i, j, 2)) && isZero(gt_box_t(i, j, 3))) { + continue; + } + + int cur_label = gt_label_t(i, j); + T gx = gt_box_t(i, j, 0) * grid_size; + T gy = gt_box_t(i, j, 1) * grid_size; + T gw = gt_box_t(i, j, 2) * grid_size; + T gh = gt_box_t(i, j, 3) * grid_size; + int gi = static_cast(gx); + int gj = static_cast(gy); + + T max_iou = static_cast(0); + T iou; + int best_an_index = -1; + std::vector gt_box_shape({0, 0, gw, gh}); + for (int an_idx = 0; an_idx < anchor_num; an_idx++) { + std::vector anchor_shape({0, 0, static_cast(anchors[2 * an_idx]), + static_cast(anchors[2 * an_idx + 1])}); + iou = CalcBoxIoU(gt_box_shape, anchor_shape); + if (iou > max_iou) { + max_iou = iou; + best_an_index = an_idx; + } + if (iou > ignore_thresh) { + noobj_mask_t(i, an_idx, gj, gi) = 0; + } + } + obj_mask_t(i, best_an_index, gj, gi) = 1; + noobj_mask_t(i, best_an_index, gj, gi) = 0; + tx_t(i, best_an_index, gj, gi) = gx - gi; + ty_t(i, best_an_index, gj, gi) = gy - gj; + tw_t(i, best_an_index, gj, gi) = log(gw / anchors[2 * best_an_index]); + th_t(i, best_an_index, gj, gi) = log(gh / anchors[2 * best_an_index + 1]); + tclass_t(i, best_an_index, gj, gi, cur_label) = 1; + tconf_t(i, best_an_index, gj, gi) = 1; + } + } +} + +static void ExpandObjMaskByClassNum(Tensor* obj_mask_expand, + const Tensor& obj_mask) { + const int n = obj_mask_expand->dims()[0]; + const int an_num = obj_mask_expand->dims()[1]; + const int h = obj_mask_expand->dims()[2]; + const int w = obj_mask_expand->dims()[3]; + const int class_num = obj_mask_expand->dims()[4]; + auto obj_mask_expand_t = EigenTensor::From(*obj_mask_expand); + auto obj_mask_t = EigenTensor::From(obj_mask); + + obj_mask_expand_t = obj_mask_t.reshape(Array5(n, an_num, h, w, 1)) + .broadcast(Array5(1, 1, 1, 1, class_num)); +} + +template +static void AddAllGradToInputGrad( + Tensor* grad, T loss, const Tensor& pred_x, const Tensor& pred_y, + const Tensor& pred_conf, const Tensor& pred_class, const Tensor& grad_x, + const Tensor& grad_y, const Tensor& grad_w, const Tensor& grad_h, + const Tensor& grad_conf_target, const Tensor& grad_conf_notarget, + const Tensor& grad_class, const int class_num, const float loss_weight_xy, + const float loss_weight_wh, const float loss_weight_conf_target, + const float loss_weight_conf_notarget, const float loss_weight_class) { + const int n = pred_x.dims()[0]; + const int an_num = pred_x.dims()[1]; + const int h = pred_x.dims()[2]; + const int w = pred_x.dims()[3]; + const int attr_num = class_num + 5; + auto grad_t = EigenTensor::From(*grad).setConstant(0.0); + auto pred_x_t = EigenTensor::From(pred_x); + auto pred_y_t = EigenTensor::From(pred_y); + auto pred_conf_t = EigenTensor::From(pred_conf); + auto pred_class_t = EigenTensor::From(pred_class); + auto grad_x_t = EigenTensor::From(grad_x); + auto grad_y_t = EigenTensor::From(grad_y); + auto grad_w_t = EigenTensor::From(grad_w); + auto grad_h_t = EigenTensor::From(grad_h); + auto grad_conf_target_t = EigenTensor::From(grad_conf_target); + auto grad_conf_notarget_t = EigenTensor::From(grad_conf_notarget); + auto grad_class_t = EigenTensor::From(grad_class); + + for (int i = 0; i < n; i++) { + for (int j = 0; j < an_num; j++) { + for (int k = 0; k < h; k++) { + for (int l = 0; l < w; l++) { + grad_t(i, j * attr_num, k, l) = + grad_x_t(i, j, k, l) * pred_x_t(i, j, k, l) * + (1.0 - pred_x_t(i, j, k, l)) * loss * loss_weight_xy; + grad_t(i, j * attr_num + 1, k, l) = + grad_y_t(i, j, k, l) * pred_y_t(i, j, k, l) * + (1.0 - pred_y_t(i, j, k, l)) * loss * loss_weight_xy; + grad_t(i, j * attr_num + 2, k, l) = + grad_w_t(i, j, k, l) * loss * loss_weight_wh; + grad_t(i, j * attr_num + 3, k, l) = + grad_h_t(i, j, k, l) * loss * loss_weight_wh; + grad_t(i, j * attr_num + 4, k, l) = + grad_conf_target_t(i, j, k, l) * pred_conf_t(i, j, k, l) * + (1.0 - pred_conf_t(i, j, k, l)) * loss * loss_weight_conf_target; + grad_t(i, j * attr_num + 4, k, l) += + grad_conf_notarget_t(i, j, k, l) * pred_conf_t(i, j, k, l) * + (1.0 - pred_conf_t(i, j, k, l)) * loss * + loss_weight_conf_notarget; + + for (int c = 0; c < class_num; c++) { + grad_t(i, j * attr_num + 5 + c, k, l) = + grad_class_t(i, j, k, l, c) * pred_class_t(i, j, k, l, c) * + (1.0 - pred_class_t(i, j, k, l, c)) * loss * loss_weight_class; + } + } + } + } + } +} + +template +class Yolov3LossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* gt_box = ctx.Input("GTBox"); + auto* gt_label = ctx.Input("GTLabel"); + auto* loss = ctx.Output("Loss"); + auto anchors = ctx.Attr>("anchors"); + int class_num = ctx.Attr("class_num"); + float ignore_thresh = ctx.Attr("ignore_thresh"); + float loss_weight_xy = ctx.Attr("loss_weight_xy"); + float loss_weight_wh = ctx.Attr("loss_weight_wh"); + float loss_weight_conf_target = ctx.Attr("loss_weight_conf_target"); + float loss_weight_conf_notarget = + ctx.Attr("loss_weight_conf_notarget"); + float loss_weight_class = ctx.Attr("loss_weight_class"); + + const int n = input->dims()[0]; + const int h = input->dims()[2]; + const int w = input->dims()[3]; + const int an_num = anchors.size() / 2; + + Tensor pred_x, pred_y, pred_w, pred_h; + Tensor pred_conf, pred_class; + pred_x.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_y.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_w.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_h.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_conf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_class.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); + CalcPredResult(*input, &pred_conf, &pred_class, &pred_x, &pred_y, + &pred_w, &pred_h, an_num, class_num); + + Tensor obj_mask, noobj_mask; + Tensor tx, ty, tw, th, tconf, tclass; + obj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + noobj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tx.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + ty.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tw.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + th.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tconf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tclass.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); + PreProcessGTBox(*gt_box, *gt_label, ignore_thresh, anchors, h, &obj_mask, + &noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass); + + Tensor obj_mask_expand; + obj_mask_expand.mutable_data({n, an_num, h, w, class_num}, + ctx.GetPlace()); + ExpandObjMaskByClassNum(&obj_mask_expand, obj_mask); + + T loss_x = CalcMSEWithMask(pred_x, tx, obj_mask); + T loss_y = CalcMSEWithMask(pred_y, ty, obj_mask); + T loss_w = CalcMSEWithMask(pred_w, tw, obj_mask); + T loss_h = CalcMSEWithMask(pred_h, th, obj_mask); + T loss_conf_target = CalcBCEWithMask(pred_conf, tconf, obj_mask); + T loss_conf_notarget = CalcBCEWithMask(pred_conf, tconf, noobj_mask); + T loss_class = CalcBCEWithMask(pred_class, tclass, obj_mask_expand); + + auto* loss_data = loss->mutable_data({1}, ctx.GetPlace()); + loss_data[0] = loss_weight_xy * (loss_x + loss_y) + + loss_weight_wh * (loss_w + loss_h) + + loss_weight_conf_target * loss_conf_target + + loss_weight_conf_notarget * loss_conf_notarget + + loss_weight_class * loss_class; + } +}; + +template +class Yolov3LossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* gt_box = ctx.Input("GTBox"); + auto* gt_label = ctx.Input("GTLabel"); + auto anchors = ctx.Attr>("anchors"); + int class_num = ctx.Attr("class_num"); + float ignore_thresh = ctx.Attr("ignore_thresh"); + auto* input_grad = ctx.Output(framework::GradVarName("X")); + auto* output_grad = ctx.Input(framework::GradVarName("Loss")); + const T loss = output_grad->data()[0]; + float loss_weight_xy = ctx.Attr("loss_weight_xy"); + float loss_weight_wh = ctx.Attr("loss_weight_wh"); + float loss_weight_conf_target = ctx.Attr("loss_weight_conf_target"); + float loss_weight_conf_notarget = + ctx.Attr("loss_weight_conf_notarget"); + float loss_weight_class = ctx.Attr("loss_weight_class"); + + const int n = input->dims()[0]; + const int c = input->dims()[1]; + const int h = input->dims()[2]; + const int w = input->dims()[3]; + const int an_num = anchors.size() / 2; + + Tensor pred_x, pred_y, pred_w, pred_h; + Tensor pred_conf, pred_class; + pred_x.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_y.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_w.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_h.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_conf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_class.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); + CalcPredResult(*input, &pred_conf, &pred_class, &pred_x, &pred_y, + &pred_w, &pred_h, an_num, class_num); + + Tensor obj_mask, noobj_mask; + Tensor tx, ty, tw, th, tconf, tclass; + obj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + noobj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tx.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + ty.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tw.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + th.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tconf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tclass.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); + PreProcessGTBox(*gt_box, *gt_label, ignore_thresh, anchors, h, &obj_mask, + &noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass); + + Tensor obj_mask_expand; + obj_mask_expand.mutable_data({n, an_num, h, w, class_num}, + ctx.GetPlace()); + ExpandObjMaskByClassNum(&obj_mask_expand, obj_mask); + + Tensor grad_x, grad_y, grad_w, grad_h; + Tensor grad_conf_target, grad_conf_notarget, grad_class; + grad_x.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_y.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_w.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_h.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_conf_target.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_conf_notarget.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_class.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); + T obj_mf = CalcMaskPointNum(obj_mask); + T noobj_mf = CalcMaskPointNum(noobj_mask); + T obj_expand_mf = CalcMaskPointNum(obj_mask_expand); + CalcMSEGradWithMask(&grad_x, pred_x, tx, obj_mask, obj_mf); + CalcMSEGradWithMask(&grad_y, pred_y, ty, obj_mask, obj_mf); + CalcMSEGradWithMask(&grad_w, pred_w, tw, obj_mask, obj_mf); + CalcMSEGradWithMask(&grad_h, pred_h, th, obj_mask, obj_mf); + CalcBCEGradWithMask(&grad_conf_target, pred_conf, tconf, obj_mask, + obj_mf); + CalcBCEGradWithMask(&grad_conf_notarget, pred_conf, tconf, noobj_mask, + noobj_mf); + CalcBCEGradWithMask(&grad_class, pred_class, tclass, obj_mask_expand, + obj_expand_mf); + + input_grad->mutable_data({n, c, h, w}, ctx.GetPlace()); + AddAllGradToInputGrad( + input_grad, loss, pred_x, pred_y, pred_conf, pred_class, grad_x, grad_y, + grad_w, grad_h, grad_conf_target, grad_conf_notarget, grad_class, + class_num, loss_weight_xy, loss_weight_wh, loss_weight_conf_target, + loss_weight_conf_notarget, loss_weight_class); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/platform/assert.h b/paddle/fluid/platform/assert.h index 2ce9b31bb81de867ff4ed6ee14afddecd95317b9..2e8fa7c1b8f7f7b8f3154aae691bb100375981dd 100644 --- a/paddle/fluid/platform/assert.h +++ b/paddle/fluid/platform/assert.h @@ -36,6 +36,15 @@ limitations under the License. */ asm("trap;"); \ } \ } while (0) + +#define PADDLE_ASSERT_MSG_CODE(e, m, c) \ + do { \ + if (!(e)) { \ + printf("%s:%d Assertion `%s` failed (%s %d).\n", __FILE__, __LINE__, \ + TOSTRING(e), m, c); \ + asm("trap;"); \ + } \ + } while (0) #else #include // For cuda, the assertions can affect performance and it is therefore @@ -43,4 +52,5 @@ limitations under the License. */ // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#assertion #define PADDLE_ASSERT(e) assert((e)) #define PADDLE_ASSERT_MSG(e, m) assert((e) && (m)) +#define PADDLE_ASSERT_MSG_CODE(e, m, c) assert((e) && (m) && (c || 1)) #endif diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index d466f28d1ea0a8327f8d7a45c3e55c5aacd61544..f9a32bfa4c15261ba6b79fc4efd3a1961f7c6d4d 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -123,7 +123,6 @@ size_t CUDAPinnedMaxChunkSize() { return CUDAPinnedMaxAllocSize() / 256; } -namespace jit { #ifdef PADDLE_WITH_XBYAK static Xbyak::util::Cpu cpu; bool MayIUse(const cpu_isa_t cpu_isa) { @@ -165,6 +164,5 @@ bool MayIUse(const cpu_isa_t cpu_isa) { } #endif -} // namespace jit } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index fd31ef77b46d5b5b641983a0421da31914c87c18..55dba545ff133b1c219ee58f6d1bb2d2130d1a59 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -39,7 +39,6 @@ size_t CUDAPinnedMinChunkSize(); //! Get the maximum chunk size for buddy allocator. size_t CUDAPinnedMaxChunkSize(); -namespace jit { typedef enum { isa_any, sse42, @@ -55,7 +54,5 @@ typedef enum { // May I use some instruction bool MayIUse(const cpu_isa_t cpu_isa); -} // namespace jit - } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index dc1d751141187edb7738e42c41514614d4d399b0..0a4563ead65b1e45adca1d1a1fce066a1a55d932 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -143,7 +143,7 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: { auto *kernel = reinterpret_cast(record); - tracer->AddKernelRecords(kernel->start, kernel->end, + tracer->AddKernelRecords(kernel->name, kernel->start, kernel->end, kernel->deviceId, kernel->streamId, kernel->correlationId); break; @@ -224,8 +224,9 @@ class DeviceTracerImpl : public DeviceTracer { stream_id, correlation_id, bytes}); } - void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id, - int64_t stream_id, uint32_t correlation_id) { + void AddKernelRecords(std::string name, uint64_t start, uint64_t end, + int64_t device_id, int64_t stream_id, + uint32_t correlation_id) { // 0 means timestamp information could not be collected for the kernel. if (start == 0 || end == 0) { VLOG(3) << correlation_id << " cannot be traced"; @@ -233,7 +234,7 @@ class DeviceTracerImpl : public DeviceTracer { } std::lock_guard l(trace_mu_); kernel_records_.push_back( - KernelRecord{start, end, device_id, stream_id, correlation_id}); + KernelRecord{name, start, end, device_id, stream_id, correlation_id}); } bool IsEnabled() { @@ -276,13 +277,13 @@ class DeviceTracerImpl : public DeviceTracer { profile_pb.set_start_ns(start_ns_); profile_pb.set_end_ns(end_ns_); for (const KernelRecord &r : kernel_records_) { - if (correlations_.find(r.correlation_id) == correlations_.end()) { - fprintf(stderr, "cannot relate a kernel activity\n"); - continue; - } auto *event = profile_pb.add_events(); event->set_type(proto::Event::GPUKernel); - event->set_name(correlations_.at(r.correlation_id)); + if (correlations_.find(r.correlation_id) != correlations_.end()) { + event->set_name(correlations_.at(r.correlation_id)); + } else { + event->set_name(r.name); + } event->set_start_ns(r.start_ns); event->set_end_ns(r.end_ns); event->set_sub_device_id(r.stream_id); diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h index eaf047d4744762f69d50bff8d467da8e3b8317cc..bf0786be2d0fafbf4b610d16ef587ac219399203 100644 --- a/paddle/fluid/platform/device_tracer.h +++ b/paddle/fluid/platform/device_tracer.h @@ -39,6 +39,7 @@ inline uint64_t PosixInNsec() { class DeviceTracer { public: struct KernelRecord { + std::string name; uint64_t start_ns; uint64_t end_ns; int64_t device_id; @@ -84,8 +85,9 @@ class DeviceTracer { // Add a cuda kernel stats. `correlation_id` will be mapped to annotation // added before for human readability. - virtual void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id, - int64_t stream_id, uint32_t correlation_id) = 0; + virtual void AddKernelRecords(std::string name, uint64_t start, uint64_t end, + int64_t device_id, int64_t stream_id, + uint32_t correlation_id) = 0; // Generate a proto after done (Disabled). virtual proto::Profile GenProfile(const std::string& profile_path) = 0; diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index db62377898339def415a13d185f85f34de326d7f..550fe2edee13d628e761eca194809823537a4024 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -111,7 +111,22 @@ extern void EnforceCUDNNLoaded(const char* fn_name); __macro(cudnnFindConvolutionForwardAlgorithmEx); \ __macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \ __macro(cudnnFindConvolutionBackwardDataAlgorithmEx); \ - __macro(cudnnGetErrorString); + __macro(cudnnGetErrorString); \ + __macro(cudnnCreateDropoutDescriptor); \ + __macro(cudnnDropoutGetStatesSize); \ + __macro(cudnnSetDropoutDescriptor); \ + __macro(cudnnCreateRNNDescriptor); \ + __macro(cudnnSetRNNDescriptor); \ + __macro(cudnnGetRNNParamsSize); \ + __macro(cudnnGetRNNWorkspaceSize); \ + __macro(cudnnGetRNNTrainingReserveSize); \ + __macro(cudnnRNNForwardTraining); \ + __macro(cudnnRNNBackwardData); \ + __macro(cudnnRNNBackwardWeights); \ + __macro(cudnnRNNForwardInference); \ + __macro(cudnnDestroyDropoutDescriptor); \ + __macro(cudnnDestroyRNNDescriptor); + CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \ @@ -149,6 +164,12 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif +// APIs in R6 +#if CUDNN_VERSION >= 6000 +#define CUDNN_DNN_ROUTINE_EACH_R6(__macro) __macro(cudnnSetRNNDescriptor_v6); +CUDNN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + #if CUDNN_VERSION >= 7001 #define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \ __macro(cudnnSetConvolutionGroupCount); \ diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index 6954e4c6a9df8dea01ec2b0f193965d835503b17..ca89d91aadb2d3e9005e6dd06cef124428d7e250 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include "gflags/gflags.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/split.h" #ifndef _WIN32 constexpr static float fraction_of_gpu_memory_to_use = 0.92f; @@ -45,6 +46,15 @@ DEFINE_bool( "input and output must be half precision) and recurrent neural networks " "(RNNs)."); +DEFINE_string(selected_gpus, "", + "A list of device ids separated by comma, like: 0,1,2,3. " + "This option is useful when doing multi process training and " + "each process have only one device (GPU). If you want to use " + "all visible devices, set this to empty string. NOTE: the " + "reason of doing this is that we want to use P2P communication" + "between GPU devices, use CUDA_VISIBLE_DEVICES can only use" + "share-memory only."); + namespace paddle { namespace platform { @@ -121,6 +131,24 @@ int GetCurrentDeviceId() { return device_id; } +//! Get a list of device ids from environment variable or use all. +std::vector GetSelectedDevices() { + // use user specified GPUs in single-node multi-process mode. + std::vector devices; + if (!FLAGS_selected_gpus.empty()) { + auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ','); + for (auto id : devices_str) { + devices.push_back(atoi(id.c_str())); + } + } else { + int count = GetCUDADeviceCount(); + for (int i = 0; i < count; ++i) { + devices.push_back(i); + } + } + return devices; +} + void SetDeviceId(int id) { // TODO(qijun): find a better way to cache the cuda device count PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h index 6a0b3c8e02d49068c2dbe14c7feea7e139947694..1e1ab2503f53fe20bbe62c48f65d8535947f1aa8 100644 --- a/paddle/fluid/platform/gpu_info.h +++ b/paddle/fluid/platform/gpu_info.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include #include +#include namespace paddle { namespace platform { @@ -47,6 +48,9 @@ int GetCUDAMaxThreadsPerMultiProcessor(int i); //! Get the current GPU device id in system. int GetCurrentDeviceId(); +//! Get a list of device ids from environment variable or use all. +std::vector GetSelectedDevices(); + //! Set the GPU device id for next execution. void SetDeviceId(int device_id); diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 258779ba51026d0cc418257a37b78f346fa48efa..0d10d82d74a2011b1b2bc088fe88cbfdb49600b8 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/string/split.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" #endif @@ -82,10 +83,8 @@ void InitDevices(bool init_p2p) { std::vector devices; #ifdef PADDLE_WITH_CUDA try { - int count = platform::GetCUDADeviceCount(); - for (int i = 0; i < count; ++i) { - devices.push_back(i); - } + // use user specified GPUs in single-node multi-process mode. + devices = platform::GetSelectedDevices(); } catch (const std::exception &exp) { LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime."; } @@ -95,20 +94,15 @@ void InitDevices(bool init_p2p) { void InitDevices(bool init_p2p, const std::vector devices) { std::vector places; - int count = 0; -#ifdef PADDLE_WITH_CUDA - try { - count = platform::GetCUDADeviceCount(); - } catch (const std::exception &exp) { - LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime."; - } -#endif for (size_t i = 0; i < devices.size(); ++i) { - if (devices[i] >= count || devices[i] < 0) { + // In multi process multi gpu mode, we may have gpuid = 7 + // but count = 1. + if (devices[i] < 0) { LOG(WARNING) << "Invalid devices id."; continue; } + places.emplace_back(platform::CUDAPlace(devices[i])); } if (init_p2p) { @@ -122,7 +116,7 @@ void InitDevices(bool init_p2p, const std::vector devices) { #endif #if !defined(_WIN32) && !defined(__APPLE__) && !defined(__OSX__) - if (platform::jit::MayIUse(platform::jit::avx)) { + if (platform::MayIUse(platform::avx)) { #ifndef __AVX__ LOG(WARNING) << "AVX is available, Please re-compile on local machine"; #endif @@ -137,10 +131,10 @@ void InitDevices(bool init_p2p, const std::vector devices) { " version or compile from source code." #ifdef __AVX512F__ - if (!platform::jit::MayIUse(platform::jit::avx512f)) { - if (platform::jit::MayIUse(platform::jit::avx2)) { + if (!platform::MayIUse(platform::avx512f)) { + if (platform::MayIUse(platform::avx2)) { AVX_GUIDE(AVX512, AVX2); - } else if (platform::jit::MayIUse(platform::jit::avx)) { + } else if (platform::MayIUse(platform::avx)) { AVX_GUIDE(AVX512, AVX); } else { AVX_GUIDE(AVX512, NonAVX); @@ -149,8 +143,8 @@ void InitDevices(bool init_p2p, const std::vector devices) { #endif #ifdef __AVX2__ - if (!platform::jit::MayIUse(platform::jit::avx2)) { - if (platform::jit::MayIUse(platform::jit::avx)) { + if (!platform::MayIUse(platform::avx2)) { + if (platform::MayIUse(platform::avx)) { AVX_GUIDE(AVX2, AVX); } else { AVX_GUIDE(AVX2, NonAVX); @@ -159,7 +153,7 @@ void InitDevices(bool init_p2p, const std::vector devices) { #endif #ifdef __AVX__ - if (!platform::jit::MayIUse(platform::jit::avx)) { + if (!platform::MayIUse(platform::avx)) { AVX_GUIDE(AVX, NonAVX); } #endif diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 761a9815e098098cb4c4080bd8605dde7f6870a4..167bd4e81d0ddbbba260417b460d083dbeb932b6 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -107,170 +107,6 @@ inline mkldnn::memory::format GetMKLDNNFormat( memory.dst_primitive_desc().desc().data.format); } -class MKLDNNHandler { - public: - MKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, - const std::string& base_key) - : dev_ctx_(dev_ctx), - engine_(engine), - key_(base_key), - is_reusing_(false) {} - - std::shared_ptr AcquireSrcMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_src_mem_p"); - } - - std::shared_ptr AcquireWeightsMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_weights_mem_p"); - } - - std::shared_ptr AcquireBiasMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_bias_mem_p"); - } - - std::shared_ptr AcquireDstMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_dst_mem_p"); - } - - std::shared_ptr AcquireDiffDstMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_diff_dst_mem_p"); - } - - std::shared_ptr AcquireDiffSrcMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_diff_src_mem_p"); - } - - std::shared_ptr AcquireMemoryFromPrimitive( - mkldnn::memory::primitive_desc mdp, void* ptr, - const std::string& suffix) { - auto local_key = key_ + suffix; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), - "Fail to find mem primitive in device context"); - if (mem_p == nullptr) { - mem_p = std::make_shared(mdp, ptr); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - mem_p->set_data_handle(ptr); - // Mark that reusing happenned. All primitives from operator instance - // should be reused or none of them. So we check consistency - is_reusing_ = true; - } - return mem_p; - } - - std::shared_ptr AcquireMemory(const mkldnn::memory::desc& md, - void* ptr, - const std::string& suffix) { - /*Generate key*/ - auto local_key = key_ + suffix; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), - "Fail to find mem primitive in device context"); - if (mem_p == nullptr) { - mem_p = std::make_shared( - mkldnn::memory::primitive_desc{md, engine_}, ptr); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - mem_p->set_data_handle(ptr); - // Mark that reusing happenned. All primitives from operator instance - // should be reused or none of them. So we check consistency - is_reusing_ = true; - } - return mem_p; - } - - std::shared_ptr AcquireMemory( - const std::shared_ptr& user_memory_p, - const std::shared_ptr& target_memory_p, - const std::string& suffix, - std::vector& pipeline) { // NOLINT - auto local_key = key_ + suffix; - auto key_reorder_p = key_ + suffix + "reorder_p"; - - auto stored_reorder_p = std::static_pointer_cast( - dev_ctx_.GetBlob(key_reorder_p)); - - if (stored_reorder_p) { - pipeline.push_back(*stored_reorder_p); - } else { - auto reorder_p = - std::make_shared(*user_memory_p, *target_memory_p); - dev_ctx_.SetBlob(key_reorder_p, reorder_p); - pipeline.push_back(*reorder_p); - } - - return target_memory_p; - } - - std::shared_ptr AcquireMemory( - mkldnn::memory::primitive_desc& mpd, // NOLINT - mkldnn::memory::primitive_desc& user_mpd, // NOLINT - const std::shared_ptr user_memory_p, - const std::string& suffix, - std::vector& pipeline, // NOLINT - bool is_persistent = false) { - // create reorder primitive if the input format is not the preferred one - auto local_key = key_ + suffix; - auto key_reorder_p = key_ + suffix + "reorder_p"; - - auto target_memory_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - PADDLE_ENFORCE((target_memory_p != nullptr) || (is_reusing_ == false), - "Fail to find mem primitive in device context"); - if (target_memory_p == nullptr) { - target_memory_p = user_memory_p; - std::shared_ptr reorder_p; - if (mpd != user_mpd) { - target_memory_p = std::make_shared(mpd); - - auto reorder_p = - std::make_shared(*user_memory_p, *target_memory_p); - dev_ctx_.SetBlob(key_reorder_p, reorder_p); - pipeline.push_back(*reorder_p); - } - dev_ctx_.SetBlob(local_key, target_memory_p); - } else if (!is_persistent) { - // Make reorder if needed - auto reorder_p = std::static_pointer_cast( - dev_ctx_.GetBlob(key_reorder_p)); - if (reorder_p != nullptr) { - pipeline.push_back(*reorder_p); - } - is_reusing_ = true; - } - return target_memory_p; - } - - static std::string GetHash(mkldnn::memory::dims& operand_dims, // NOLINT - const std::string& suffix) { - return dims2str(operand_dims) + suffix; - } - - protected: - static std::string dims2str(const mkldnn::memory::dims& operand_dims) { - std::string dstr = ""; - for (size_t i = 0; i < operand_dims.size(); ++i) { - dstr += std::to_string(operand_dims[i]) + "-"; - } - return dstr; - } - - protected: - const MKLDNNDeviceContext& dev_ctx_; - mkldnn::engine engine_; - std::string key_; - bool is_reusing_; -}; - inline mkldnn::memory::format MKLDNNFormatForSize( size_t dims_size, mkldnn::memory::format data_format) { if (dims_size == 1) { diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h new file mode 100644 index 0000000000000000000000000000000000000000..1c6421f3fa6ffbe7d3c682611def9e87d2fae5b0 --- /dev/null +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -0,0 +1,458 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include +#include +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace platform { + +using user_function = std::function(const float*)>; + +class MKLDNNHandler { + public: + MKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, + const std::string& base_key) + : dev_ctx_(dev_ctx), + engine_(engine), + key_(base_key), + is_reusing_(false) {} + + std::shared_ptr AcquireSrcMemory( + const mkldnn::memory::desc& md, void* ptr) { + return this->AcquireMemory(md, ptr, "@user_src_mem_p"); + } + + std::shared_ptr AcquireWeightsMemory( + const mkldnn::memory::desc& md, void* ptr, + user_function custom_func = {}) { + return this->AcquireMemory(md, ptr, "@user_weights_mem_p", custom_func); + } + + std::shared_ptr AcquireBiasMemory( + const mkldnn::memory::desc& md, void* ptr) { + return this->AcquireMemory(md, ptr, "@user_bias_mem_p"); + } + + std::shared_ptr AcquireDstMemory( + const mkldnn::memory::desc& md, void* ptr) { + return this->AcquireMemory(md, ptr, "@user_dst_mem_p"); + } + + std::shared_ptr AcquireDiffDstMemory( + const mkldnn::memory::desc& md, void* ptr) { + return this->AcquireMemory(md, ptr, "@user_diff_dst_mem_p"); + } + + std::shared_ptr AcquireDiffSrcMemory( + const mkldnn::memory::desc& md, void* ptr) { + return this->AcquireMemory(md, ptr, "@user_diff_src_mem_p"); + } + + std::shared_ptr AcquireMemoryFromPrimitive( + mkldnn::memory::primitive_desc mdp, void* ptr, + const std::string& suffix) { + auto local_key = key_ + suffix; + auto mem_p = + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), + "Fail to find mem primitive in device context"); + if (mem_p == nullptr) { + mem_p = std::make_shared(mdp, ptr); + dev_ctx_.SetBlob(local_key, mem_p); + } else { + mem_p->set_data_handle(ptr); + // Mark that reusing happenned. All primitives from operator instance + // should be reused or none of them. So we check consistency + is_reusing_ = true; + } + return mem_p; + } + + // This incarnation of AcquireMemory can call user function eg. custom reorder + // or preprocessing routine if needed + std::shared_ptr AcquireMemory( + const mkldnn::memory::desc& md, void* ptr, const std::string& suffix, + user_function custom_func = {}) { + /*Generate key*/ + auto local_key = key_ + suffix; + auto mem_p = + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), + "Fail to find mem primitive in device context"); + if (mem_p == nullptr) { + // Call custom reorder/preprocessing func if available + if (custom_func) { + auto reordered_data = custom_func(reinterpret_cast(ptr)); + dev_ctx_.SetBlob(local_key + "-custom_reorder", reordered_data); + ptr = reinterpret_cast(reordered_data.get()); + } + + mem_p = std::make_shared( + mkldnn::memory::primitive_desc{md, engine_}, ptr); + dev_ctx_.SetBlob(local_key, mem_p); + } else { + mem_p->set_data_handle(ptr); + // Mark that reusing happenned. All primitives from operator instance + // should be reused or none of them. So we check consistency + is_reusing_ = true; + } + return mem_p; + } + + std::shared_ptr AcquireMemory( + const std::shared_ptr& user_memory_p, + const std::shared_ptr& target_memory_p, + const std::string& suffix, + std::vector& pipeline) { // NOLINT + auto local_key = key_ + suffix; + auto key_reorder_p = key_ + suffix + "reorder_p"; + + auto stored_reorder_p = std::static_pointer_cast( + dev_ctx_.GetBlob(key_reorder_p)); + + if (stored_reorder_p) { + pipeline.push_back(*stored_reorder_p); + } else { + auto reorder_p = + std::make_shared(*user_memory_p, *target_memory_p); + dev_ctx_.SetBlob(key_reorder_p, reorder_p); + pipeline.push_back(*reorder_p); + } + + return target_memory_p; + } + + std::shared_ptr AcquireMemory( + mkldnn::memory::primitive_desc& mpd, // NOLINT + mkldnn::memory::primitive_desc& user_mpd, // NOLINT + const std::shared_ptr user_memory_p, + const std::string& suffix, + std::vector& pipeline, // NOLINT + bool is_persistent = false) { + // create reorder primitive if the input format is not the preferred one + auto local_key = key_ + suffix; + auto key_reorder_p = key_ + suffix + "reorder_p"; + + auto target_memory_p = + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + PADDLE_ENFORCE((target_memory_p != nullptr) || (is_reusing_ == false), + "Fail to find mem primitive in device context"); + if (target_memory_p == nullptr) { + target_memory_p = user_memory_p; + std::shared_ptr reorder_p; + if (mpd != user_mpd) { + target_memory_p = std::make_shared(mpd); + auto reorder_p = + std::make_shared(*user_memory_p, *target_memory_p); + dev_ctx_.SetBlob(key_reorder_p, reorder_p); + pipeline.push_back(*reorder_p); + } + dev_ctx_.SetBlob(local_key, target_memory_p); + } else if (!is_persistent) { + // Make reorder if needed + auto reorder_p = std::static_pointer_cast( + dev_ctx_.GetBlob(key_reorder_p)); + if (reorder_p != nullptr) { + pipeline.push_back(*reorder_p); + } + is_reusing_ = true; + } + return target_memory_p; + } + + static std::string GetHash(mkldnn::memory::dims& operand_dims, // NOLINT + const std::string& suffix) { + return dims2str(operand_dims) + suffix; + } + + protected: + static std::string dims2str(const mkldnn::memory::dims& operand_dims) { + std::string dstr = ""; + for (size_t i = 0; i < operand_dims.size(); ++i) { + dstr += std::to_string(operand_dims[i]) + "-"; + } + return dstr; + } + + protected: + const MKLDNNDeviceContext& dev_ctx_; + mkldnn::engine engine_; + std::string key_; + bool is_reusing_; +}; + +template +class ConvMKLDNNTemplateHandler : public MKLDNNHandler { + public: + ConvMKLDNNTemplateHandler( + std::shared_ptr conv_pd, + const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, + const std::string& base_key) + : platform::MKLDNNHandler(dev_ctx, engine, base_key) { + conv_pd_ = conv_pd; + } + + ConvMKLDNNTemplateHandler( + std::shared_ptr conv_pd, + std::shared_ptr + conv_bwd_data_pd, + std::shared_ptr + conv_bwd_weights_pd, + const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, + const std::string& base_key) + : platform::MKLDNNHandler(dev_ctx, engine, base_key), + conv_pd_(conv_pd), + conv_bwd_weights_pd_(conv_bwd_weights_pd), + conv_bwd_data_pd_(conv_bwd_data_pd) { + // If we are in Grad operatgor then update a key with BWD suffix to + // distinguish from FWD memory primitives + key_ += "-BWD"; + } + + size_t GetDstMemorySize() const { + return conv_pd_->dst_primitive_desc().get_size(); + } + + mkldnn::memory::format GetDstFormat() const { + return static_cast( + conv_pd_->dst_primitive_desc().desc().data.format); + } + + size_t GetDiffWeightsMemorySize() const { + return conv_bwd_weights_pd_->diff_weights_primitive_desc().get_size(); + } + + size_t GetDiffSourceMemorySize() const { + return conv_bwd_data_pd_->diff_src_primitive_desc().get_size(); + } + + std::shared_ptr AcquireSrcMemoryFromWeightsPrimitive( + const std::shared_ptr user_memory_p, + std::vector& pipeline) { // NOLINT + auto src_pd = conv_bwd_weights_pd_->src_primitive_desc(); + auto user_pd = user_memory_p->get_primitive_desc(); + return this->AcquireMemory(src_pd, user_pd, user_memory_p, + "@weights-src_mem_p", pipeline); + } + + std::shared_ptr AcquireDiffDstMemoryFromWeightsPrimitive( + const std::shared_ptr user_memory_p, + std::vector& pipeline) { // NOLINT + auto diff_dst_pd = conv_bwd_weights_pd_->diff_dst_primitive_desc(); + auto user_pd = user_memory_p->get_primitive_desc(); + return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p, + "@weights-diff_dst_mem_p", pipeline); + } + + std::shared_ptr AcquireDiffWeightsMemoryFromWeightsPrimitive( + void* ptr) { + return this->AcquireMemoryFromPrimitive( + conv_bwd_weights_pd_->diff_weights_primitive_desc(), ptr, + "@diff_weights_mem_p"); + } + + std::shared_ptr AcquireDiffDstMemoryFromDataPrimitive( + const std::shared_ptr user_memory_p, + std::vector& pipeline) { // NOLINT + auto diff_dst_pd = conv_bwd_data_pd_->diff_dst_primitive_desc(); + auto user_pd = user_memory_p->get_primitive_desc(); + return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p, + "@data-diff_dst_mem_p", pipeline); + } + + std::shared_ptr AcquireWeightsMemoryFromDataPrimitive( + const std::shared_ptr user_weights_memory_p, + std::vector& pipeline) { // NOLINT + auto weights_pd = conv_bwd_data_pd_->weights_primitive_desc(); + auto user_pd = user_weights_memory_p->get_primitive_desc(); + return this->AcquireMemory(weights_pd, user_pd, user_weights_memory_p, + "@data-weights_mem_p", pipeline); + } + + std::shared_ptr AcquireResidualDataMemory( + const mkldnn::memory::desc& md, void* ptr) { + return this->AcquireMemory(md, ptr, "@user_residual_data_mem_p"); + } + + std::shared_ptr AcquireDstMemoryFromResidualDataMemory( + const std::shared_ptr& user_residual_memory_p, + void* dst_ptr, + std::vector& pipeline) { // NOLINT + return this->AcquireMemory(user_residual_memory_p, + this->AcquireDstMemoryFromPrimitive(dst_ptr), + "@residual_data_mem_p", pipeline); + } + + std::shared_ptr AcquireDiffSrcMemoryFromDataPrimitive( + void* ptr) { + return this->AcquireMemoryFromPrimitive( + conv_bwd_data_pd_->diff_src_primitive_desc(), ptr, "@diff_src_mem_p"); + } + + std::shared_ptr AcquireDstMemoryFromPrimitive(void* ptr) { + return this->AcquireMemoryFromPrimitive(conv_pd_->dst_primitive_desc(), ptr, + "@dst_mem_p"); + } + + std::shared_ptr AcquireSrcMemoryFromPrimitive( + const std::shared_ptr user_memory_p, + std::vector& pipeline) { // NOLINT + auto src_pd = conv_pd_->src_primitive_desc(); + auto user_pd = user_memory_p->get_primitive_desc(); + return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p", + pipeline); + } + + std::shared_ptr AcquireWeightsMemoryFromPrimitive( + const std::shared_ptr user_weights_memory_p, + std::vector& pipeline, // NOLINT + bool is_persistent = false) { + auto user_weights_pd = user_weights_memory_p->get_primitive_desc(); + auto weights_pd = conv_pd_->weights_primitive_desc(); + return this->AcquireMemory(weights_pd, user_weights_pd, + user_weights_memory_p, "@weights_mem_p", + pipeline, is_persistent); + } + + std::shared_ptr AcquireBiasMemoryFromPrimitive( + const std::shared_ptr user_bias_memory_p, + std::vector& pipeline) { // NOLINT + auto user_bias_pd = user_bias_memory_p->get_primitive_desc(); + auto bias_pd = conv_pd_->bias_primitive_desc(); + return this->AcquireMemory(bias_pd, user_bias_pd, user_bias_memory_p, + "@bias_mem_p", pipeline); + } + + std::shared_ptr AcquireConvolution( + std::shared_ptr src_memory_p, + std::shared_ptr weights_memory_p, + std::shared_ptr dst_memory_p) { + auto prim_key = key_ + "@conv_p"; + auto conv_p = + std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); + PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false), + "Fail to find convolution primitive in device context"); + if (conv_p == nullptr) { + conv_p = std::make_shared(*conv_pd_, *(src_memory_p), + *(weights_memory_p.get()), + *(dst_memory_p.get())); + + dev_ctx_.SetBlob(prim_key, conv_p); + } else { + is_reusing_ = true; + } + return conv_p; + } + + std::shared_ptr AcquireConvolution( + std::shared_ptr src_memory_p, + std::shared_ptr weights_memory_p, + std::shared_ptr bias_memory_p, + std::shared_ptr dst_memory_p) { + auto prim_key = key_ + "@conv_p"; + auto conv_p = + std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); + PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false), + "Fail to find convolution primitive in device context"); + if (conv_p == nullptr) { + conv_p = std::make_shared( + *conv_pd_, *(src_memory_p), *(weights_memory_p.get()), + *(bias_memory_p.get()), *(dst_memory_p.get())); + + dev_ctx_.SetBlob(prim_key, conv_p); + } else { + is_reusing_ = true; + } + return conv_p; + } + + std::shared_ptr AcquireConvolutionBackwardWeights( + std::shared_ptr src_memory_p, + std::shared_ptr diff_dst_memory_p, + std::shared_ptr diff_weights_memory_p) { + auto prim_key = key_ + "@conv_bwd_weights_p"; + auto conv_bwd_weights_p = std::static_pointer_cast( + dev_ctx_.GetBlob(prim_key)); + PADDLE_ENFORCE( + (conv_bwd_weights_p != nullptr) || (is_reusing_ == false), + "Fail to find convolution bwd weights primitive in device context"); + if (conv_bwd_weights_p == nullptr) { + // create backward conv primitive for weights + conv_bwd_weights_p = std::make_shared( + *conv_bwd_weights_pd_, *src_memory_p, *diff_dst_memory_p, + *diff_weights_memory_p); + dev_ctx_.SetBlob(prim_key, conv_bwd_weights_p); + } else { + is_reusing_ = true; + } + return conv_bwd_weights_p; + } + + std::shared_ptr AcquireConvolutionBackwardData( + std::shared_ptr diff_dst_memory_p, + std::shared_ptr weights_memory_p, + std::shared_ptr diff_src_memory_p) { + auto prim_key = key_ + "@conv_bwd_data_p"; + auto conv_bwd_data_p = + std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); + PADDLE_ENFORCE( + (conv_bwd_data_p != nullptr) || (is_reusing_ == false), + "Fail to find convolution bwd data primitive in device context"); + if (conv_bwd_data_p == nullptr) { + conv_bwd_data_p = std::make_shared( + *conv_bwd_data_pd_, *diff_dst_memory_p, *weights_memory_p, + *diff_src_memory_p); + dev_ctx_.SetBlob(prim_key, conv_bwd_data_p); + } else { + is_reusing_ = true; + } + return conv_bwd_data_p; + } + + // Generate keys for storing/retriving primitives for this operator + // TODO(jczaja): Make hashing function more optimial + static std::string GetHash(mkldnn::memory::dims& input_dims, // NOLINT + mkldnn::memory::dims& weights_dims, // NOLINT + std::vector& strides, // NOLINT + std::vector& paddings, // NOLINT + std::vector& dilations, // NOLINT + int groups, const std::string& suffix) { + return dims2str(input_dims) + dims2str(weights_dims) + dims2str(strides) + + dims2str(paddings) + dims2str(dilations) + std::to_string(groups) + + suffix; + } + + private: + std::shared_ptr conv_pd_; + std::shared_ptr + conv_bwd_weights_pd_; + std::shared_ptr conv_bwd_data_pd_; +}; + +using ConvMKLDNNHandler = + ConvMKLDNNTemplateHandler; + +using ConvTransposeMKLDNNHandler = + ConvMKLDNNTemplateHandler; +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index fc903b548c70e9b72c6121dd24c014973e3cd1d4..7c539d25f6dd02fc09aa1234d7bf0164b54a610f 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -97,7 +97,7 @@ struct NCCLContextMap { order_.size(), contexts_.size(), "NCCL Context Map does not support contain two or more same device"); - if (places.size() <= 1) { + if (places.size() <= 1 && num_trainers == 1) { return; } std::unique_ptr comms(new ncclComm_t[order_.size()]); @@ -111,12 +111,19 @@ struct NCCLContextMap { { int nranks = num_trainers * order_.size(); NCCLGroupGuard gurad; - for (auto &gpu_id : order_) { - int rank = trainer_id * order_.size() + gpu_id; - VLOG(3) << "init nccl rank: " << rank << " nranks: " << nranks; + for (size_t i = 0; i < order_.size(); ++i) { + int gpu_id = order_[i]; + int rank; + if (order_.size() > 1) { + rank = trainer_id * order_.size() + i; + } else { + rank = trainer_id; + } + VLOG(30) << "init nccl rank: " << rank << " nranks: " << nranks + << "gpu id: " << gpu_id; PADDLE_ENFORCE(cudaSetDevice(gpu_id)); PADDLE_ENFORCE(platform::dynload::ncclCommInitRank( - comms.get() + gpu_id, nranks, *nccl_id, rank)); + comms.get() + i, nranks, *nccl_id, rank)); } } } diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 25d241d9768c16e1da304a78f259d5a626f702fc..d602613fc82223e14f48830a87533880696eb550 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,6 +1,6 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder parallel_executor profiler) -set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc) +set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler) +set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc) if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED diff --git a/paddle/fluid/pybind/async_executor_py.cc b/paddle/fluid/pybind/async_executor_py.cc new file mode 100644 index 0000000000000000000000000000000000000000..470e8b050808295d49728bbdb757b6a612df9a01 --- /dev/null +++ b/paddle/fluid/pybind/async_executor_py.cc @@ -0,0 +1,53 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include + +// To avoid conflicting definition in gcc-4.8.2 headers and pyconfig.h (2.7.3) +#ifdef _POSIX_C_SOURCE +#undef _POSIX_C_SOURCE +#endif + +#ifdef _XOPEN_SOURCE +#undef _XOPEN_SOURCE +#endif +#include +#include + +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "google/protobuf/text_format.h" +#include "paddle/fluid/framework/async_executor.h" +#include "paddle/fluid/framework/data_feed.h" +#include "paddle/fluid/framework/data_feed.pb.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/variant.h" +#include "paddle/fluid/pybind/async_executor_py.h" + +namespace py = pybind11; +namespace pd = paddle::framework; + +namespace paddle { +namespace pybind { +using set_name_func = void (pd::DataFeedDesc::*)(const std::string&); +void BindAsyncExecutor(py::module* m) { + py::class_(*m, "AsyncExecutor") + .def(py::init([](framework::Scope* scope, const platform::Place& place) { + return std::unique_ptr( + new framework::AsyncExecutor(scope, place)); + })) + .def("run_from_files", &framework::AsyncExecutor::RunFromFile); +} // end BindAsyncExecutor +} // end namespace pybind +} // end namespace paddle diff --git a/paddle/fluid/pybind/async_executor_py.h b/paddle/fluid/pybind/async_executor_py.h new file mode 100644 index 0000000000000000000000000000000000000000..a99d6e04218c9310ede00de7d9bdfc015889bd22 --- /dev/null +++ b/paddle/fluid/pybind/async_executor_py.h @@ -0,0 +1,28 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { + +void BindAsyncExecutor(py::module* m); + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 1835c064055635a4284fc64f4ca4dd8728f933ca..fc7991d2974c9262e6225de1537025944c1068c1 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -42,6 +42,7 @@ limitations under the License. */ #include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/pybind/async_executor_py.h" #include "paddle/fluid/pybind/const_value.h" #include "paddle/fluid/pybind/exception.h" #include "paddle/fluid/pybind/protobuf.h" @@ -932,6 +933,7 @@ All parameter, weight, gradient are variables in Paddle. }); BindRecordIOWriter(&m); + BindAsyncExecutor(&m); } } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/string/CMakeLists.txt b/paddle/fluid/string/CMakeLists.txt index 8572dc1e8e543b552e3ed5a180ec942faf90a624..169a925d12328e7d1df744635445b5674c19b125 100644 --- a/paddle/fluid/string/CMakeLists.txt +++ b/paddle/fluid/string/CMakeLists.txt @@ -3,3 +3,4 @@ cc_library(pretty_log SRCS pretty_log.cc) cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags) cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags) cc_test(to_string_test SRCS to_string_test.cc) +cc_test(split_test SRCS split_test.cc) diff --git a/paddle/fluid/string/split.h b/paddle/fluid/string/split.h new file mode 100644 index 0000000000000000000000000000000000000000..ccb96b8a9cb68f03acbca592a2149ba5001f34d2 --- /dev/null +++ b/paddle/fluid/string/split.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include + +namespace paddle { +namespace string { + +static inline std::vector Split(std::string const& original, + char separator) { + std::vector results; + std::string token; + std::istringstream is(original); + while (std::getline(is, token, separator)) { + if (!token.empty()) { + results.push_back(token); + } + } + return results; +} + +} // namespace string +} // namespace paddle diff --git a/paddle/fluid/string/split_test.cc b/paddle/fluid/string/split_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..c85dc1eed40dbe25d922c0f4810a747d1bd2d60f --- /dev/null +++ b/paddle/fluid/string/split_test.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/string/split.h" + +#include + +#include "gtest/gtest.h" + +TEST(StringSplit, StringSplit) { + std::string to_split = "0,1,2,3,4,5"; + int i = 0; + for (auto s : paddle::string::Split(to_split, ',')) { + EXPECT_EQ(atoi(s.c_str()), i); + i++; + } +} diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index a6720fa798ec5cf60a8806a7f72fe6febaf4f7ac..6299b166af8a5f65cf587ae282c955f33db0044b 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -437,14 +437,32 @@ EOF export http_proxy= export https_proxy= # TODO: jiabin need to refine this part when these tests fixed on mac - ctest --output-on-failure -j $1 + ctest --output-on-failure -j $2 # make install should also be test when unittest make install -j 8 - pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl + if [ "$1" == "cp27-cp27m" ]; then + pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl + elif [ "$1" == "cp35-cp35m" ]; then + pip3.5 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl + elif [ "$1" == "cp36-cp36m" ]; then + pip3.6 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl + elif [ "$1" == "cp37-cp37m" ]; then + pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl + fi + if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then paddle version fi - pip uninstall -y paddlepaddle + + if [ "$1" == "cp27-cp27m" ]; then + pip uninstall -y paddlepaddle + elif [ "$1" == "cp35-cp35m" ]; then + pip3.5 uninstall -y paddlepaddle + elif [ "$1" == "cp36-cp36m" ]; then + pip3.6 uninstall -y paddlepaddle + elif [ "$1" == "cp37-cp37m" ]; then + pip3.7 uninstall -y paddlepaddle + fi fi } @@ -454,12 +472,15 @@ function assert_api_not_changed() { virtualenv .env source .env/bin/activate pip install ${PADDLE_ROOT}/build/python/dist/*whl - python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec + python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid,paddle.reader > new.spec if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ]; then # Use sed to make python2 and python3 sepc keeps the same sed -i 's/arg0: str/arg0: unicode/g' new.spec sed -i "s/\(.*Transpiler.*\).__init__ ArgSpec(args=\['self'].*/\1.__init__ /g" new.spec fi + # ComposeNotAligned has significant difference between py2 and py3 + sed -i '/.*ComposeNotAligned.*/d' new.spec + python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API.spec new.spec deactivate } @@ -469,7 +490,19 @@ function assert_api_spec_approvals() { BRANCH="develop" fi - API_FILES=("paddle/fluid/API.spec" "paddle/fluid/framework/operator.h") + API_FILES=("paddle/fluid/API.spec" + "paddle/fluid/framework/operator.h" + "paddle/fluid/framework/tensor.h" + "paddle/fluid/framework/lod_tensor.h" + "paddle/fluid/framework/selected_rows.h" + "paddle/fluid/framework/op_desc.h" + "paddle/fluid/framework/block_desc.h" + "paddle/fluid/framework/var_desc.h" + "paddle/fluid/framework/scope.h" + "paddle/fluid/framework/ir/node.h" + "paddle/fluid/framework/ir/graph.h" + "paddle/fluid/framework/framework.proto" + "paddle/fluid/operators/distributed/send_recv.proto.in") for API_FILE in ${API_FILES[*]}; do API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "${API_FILE}" || true` echo "checking ${API_FILE} change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}" @@ -883,7 +916,7 @@ function main() { maccheck) cmake_gen ${PYTHON_ABI:-""} build_mac - run_mac_test ${PROC_RUN:-1} + run_mac_test ${PYTHON_ABI:-""} ${PROC_RUN:-1} ;; macbuild) cmake_gen ${PYTHON_ABI:-""} diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index f7fefb3e5b767e25373665058d4fd6a298fb3d60..2a53519188e7454b54424cfdd4a713ae37a2326b 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -20,6 +20,13 @@ from .framework import * # import all class inside executor into fluid module from . import executor from .executor import * + +from . import data_feed_desc +from .data_feed_desc import * + +from . import async_executor +from .async_executor import * + from . import trainer from . import inferencer @@ -54,7 +61,8 @@ Tensor = LoDTensor __all__ = framework.__all__ + executor.__all__ + \ trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \ - parallel_executor.__all__ + lod_tensor.__all__ + [ + parallel_executor.__all__ + lod_tensor.__all__ + \ + data_feed_desc.__all__ + async_executor.__all__ + [ 'io', 'initializer', 'layers', @@ -139,7 +147,7 @@ def __bootstrap__(): read_env_flags += [ 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic', 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', - 'cudnn_exhaustive_search' + 'cudnn_exhaustive_search', 'selected_gpus' ] core.init_gflags([sys.argv[0]] + ["--tryfromenv=" + ",".join(read_env_flags)]) diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py new file mode 100644 index 0000000000000000000000000000000000000000..2664a7301db3bf471126ff26504e7042f02b7d84 --- /dev/null +++ b/python/paddle/fluid/async_executor.py @@ -0,0 +1,151 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import contextlib +import six +from .framework import Program, default_main_program, Variable +from . import core +from .executor import global_scope, Executor +from paddle.fluid.proto import data_feed_pb2 +from google.protobuf import text_format +from . import io +from .data_feed_desc import DataFeedDesc + +__all__ = ['AsyncExecutor'] + + +class AsyncExecutor(object): + """ + An asynchronous Executor in Python. Through exploiting the power of + multi-core processor and data queueing, AsyncExecutor makes data reading + and cosuming decoupled, each run in multiple threads in parallel. + + Instead of reading data in python side, AsyncExecutor accepts a training + file list, which will be retrieved in C++, then training inputs will be + read, parsed and fed to training network within C++ code. + + AsyncExecutor is in active development and the API might change in the near + future. + + Example: + >>> data_feed = fluid.DataFeedDesc('data.proto') + >>> startup_program = fluid.default_startup_program() + >>> main_program = fluid.default_main_program() + >>> filelist = ["train_data/part-%d" % i for i in range(100)] + >>> thread_num = len(filelist) / 4 + >>> + >>> place = fluid.CPUPlace() + >>> async_executor = fluid.AsyncExecutor(place) + >>> + >>> async_executor.run_startup_program(startup_program) + >>> + >>> epoch = 10 + >>> for i in range(epoch): + >>> async_executor.run(main_program, + >>> data_feed, + >>> filelist, + >>> thread_num, + >>> [acc], + >>> debug=False) + + Args: + place(fluid.CPUPlace|None): indicate the executor run on which device. + Only CPUPlace supported + + Note: + For debugging complicated network in parallel-GPUs, you can test it + on the executor. They has the exactly same arguments, and expected + the same results. + + Note: Only running on CPUPlace supported. + """ + + def __init__(self, place=None): + if place is None: + place = core.CPUPlace() + if not isinstance(place, core.CPUPlace): + raise ValueError("AsyncExecutor only supports CPU device") + + p = core.Place() + p.set_place(place) + + scope = global_scope() + self.executor = core.AsyncExecutor(scope, p) + + def run(self, program, data_feed, filelist, thread_num, fetch, debug=False): + """ + Run program by this AsyncExecutor. Training dataset will be in filelist. + Users can also inspect certain variables by naming them in parameter + :code:`fetch`, like in fluid.Executor. Unlike fluid.Executor, however, + AsyncExecutor doesn't return fetched variables, instead, it will dump + the values of each fetched variable to stdandard output. + + Running the dataset will be on multiple threads, within each a thread + local scope will be created, then all OPs also created in that scope. + Parameters are updated by all the OPs simultaneously. + + Args: + program(Program): the program that need to run, if not provied, + then default_main_program will be used. + data_feed(DataFeedDesc): A DataFeedDesc object + filelist(str): a file containing the training dataset file list + thread_num(int): number of concurrent training threads. See + :code:`Note` for how to set this properly + fetch(str|list): the var name or a list of var names to inspect + debug(bool): When set to True, fetch vars will be printed to + standard output after each minibatch + + Note: + the executor will run all operators in the program but not only + the operators dependent by the fetch_list. + + Note: + Running AsyncExecutor will be on multiple threads, each bound to a + CPU core. To achieve best performance, it's suggested to set thread + num to be equal or slightly less than that of CPU cores. + """ + if program is None: + program = default_main_program() + program_desc = program.desc + + if data_feed is None: + raise ValueError('ValueError: data_feed should be provided') + + if filelist is None: + raise ValueError('ValueError: filelist should be provided') + + if isinstance(filelist, str): + filelist = [filelist] + + if not isinstance(thread_num, int): + raise TypeError('TypeError: thread_num should be a positive number') + + if fetch is not None: + if isinstance(fetch, Variable): + fetch = [fetch] + fetch_var_names = [var.name for var in fetch] + for fetch_var in fetch: + shape = fetch_var.shape + if shape[len(shape) - 1] != 1: + raise AssertionError( + "%s: Fetch variable has wrong shape. Only varibles " + "with the last dimension size 1 supported." % + (fetch_var.name)) + + self.executor.run_from_files(program_desc, + data_feed.desc(), filelist, thread_num, + fetch_var_names, debug) diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 1738afe93e99f1de28bec2fb23be8e1a309d9288..0f7dd531b3e5992caa558def6bbdf446a7d2ffaa 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -134,12 +134,12 @@ class GradientClipByValue(BaseGradientClipAttr): Examples: .. code-block:: python - w_param_attrs = ParamAttr(name=None, - initializer=UniformInitializer(low=-1.0, high=1.0, seed=0), + w_param_attrs = fluid.ParamAttr(name=None, + initializer=fluid.initializer.UniformInitializer(low=-1.0, high=1.0, seed=0), learning_rate=1.0, - regularizer=L1Decay(1.0), + regularizer=fluid.regularizer.L1Decay(1.0), trainable=True, - clip=GradientClipByValue(-1.0, 1.0)) + clip=fluid.clip.GradientClipByValue(-1.0, 1.0)) y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs) """ @@ -185,12 +185,12 @@ class GradientClipByNorm(BaseGradientClipAttr): Examples: .. code-block:: python - w_param_attrs = ParamAttr(name=None, - initializer=UniformInitializer(low=-1.0, high=1.0, seed=0), + w_param_attrs = flui.ParamAttr(name=None, + initializer=fluid.initializer.UniformInitializer(low=-1.0, high=1.0, seed=0), learning_rate=1.0, - regularizer=L1Decay(1.0), + regularizer=fluid.regularizer.L1Decay(1.0), trainable=True, - clip=GradientClipByNorm(clip_norm=2.0)) + clip=fluid.clip.GradientClipByNorm(clip_norm=2.0)) y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs) """ @@ -271,7 +271,12 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): "All parameters' 'clip_norm' of a same group should be the same" ) - square = grad * grad + merge_grad = grad + if grad.type == core.VarDesc.VarType.SELECTED_ROWS: + merge_grad = layers.merge_selected_rows(grad) + merge_grad = layers.get_tensor_from_selected_rows(merge_grad) + + square = layers.square(merge_grad) local_norm_var = layers.reduce_sum(input=square) context[self.group_name].append(local_norm_var) @@ -292,6 +297,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): new_grad = layers.elementwise_mul( x=grad, y=self.context[group_scale_name]) + return param, new_grad diff --git a/python/paddle/fluid/data_feed_desc.py b/python/paddle/fluid/data_feed_desc.py new file mode 100644 index 0000000000000000000000000000000000000000..d2ec74d6cfdeb34c1f48c086a3aa30d5100c3efb --- /dev/null +++ b/python/paddle/fluid/data_feed_desc.py @@ -0,0 +1,152 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.fluid.proto import data_feed_pb2 +from google.protobuf import text_format + +__all__ = ['DataFeedDesc'] + + +class DataFeedDesc(object): + """ + Datafeed descriptor, describing input training data format. This class is + currently only used for AsyncExecutor (See comments for class AsyncExecutor + for a brief introduction) + + DataFeedDesc shall be initialized from a valid protobuf message from disk: + >>> data_feed = fluid.DataFeedDesc('data.proto') + + See :code:`paddle/fluid/framework/data_feed.proto` for message definition. + A typical message might look like: + + >>> name: "MultiSlotDataFeed" + >>> batch_size: 2 + >>> multi_slot_desc { + >>> slots { + >>> name: "words" + >>> type: "uint64" + >>> is_dense: false + >>> is_used: true + >>> } + >>> slots { + >>> name: "label" + >>> type: "uint64" + >>> is_dense: false + >>> is_used: true + >>> } + >>> } + + However, users usually shouldn't care about the message format; instead, + they are encouragd to use :code:`Data Generator` as a tool to generate a + valid data description, in the process of converting their raw log files to + training files acceptable to AsyncExecutor. + + DataFeedDesc can also be changed during runtime. Once you got familiar with + what each field mean, you can modify it to better suit your need. E.g.: + >>> data_feed.set_batch_size(128) + >>> data_feed.set_dense_slots('wd') # The slot named 'wd' will be dense + >>> data_feed.set_use_slots('wd') # The slot named 'wd' will be used + + Finally, the content can be dumped out for debugging purpose: + >>> print(data_feed.desc()) + + Args: + proto_file(string): Disk file containing a data feed description. + + """ + + def __init__(self, proto_file): + self.proto_desc = data_feed_pb2.DataFeedDesc() + with open(proto_file, 'r') as f: + text_format.Parse(f.read(), self.proto_desc) + if self.proto_desc.name == "MultiSlotDataFeed": + self.__name_to_index = { + slot.name: i + for i, slot in enumerate(self.proto_desc.multi_slot_desc.slots) + } + + def set_batch_size(self, batch_size): + """ + Set batch size. Will be effective during training + + Example: + >>> data_feed = fluid.DataFeedDesc('data.proto') + >>> data_feed.set_batch_size(128) + + Args: + batch_size: batch size + + """ + self.proto_desc.batch_size = batch_size + + def set_dense_slots(self, dense_slots_name): + """ + Set if a specific slot will be dense. Will be effective during training. + features for a dense slot will be fed into a Tensor, while those for a + sparse slot will be fed into a LoDTensor + + Example: + >>> data_feed = fluid.DataFeedDesc('data.proto') + >>> data_feed.set_dense_slots(['words']) + + Args: + dense_slots_name: a list of slot names which will be set dense + + Note: + Default is sparse for all slots + """ + if self.proto_desc.name != "MultiSlotDataFeed": + raise ValueError( + "Only MultiSlotDataFeed need set_dense_slots, pls check your datafeed.proto" + ) + for name in dense_slots_name: + self.proto_desc.multi_slot_desc.slots[self.__name_to_index[ + name]].is_dense = True + + def set_use_slots(self, use_slots_name): + """ + Set if a specific slot will be used for training. A dataset shall + contain a lot of features, through this function one can select which + ones will be used for a specific model. + + Example: + >>> data_feed = fluid.DataFeedDesc('data.proto') + >>> data_feed.set_use_slots(['words']) + + Args: + use_slots_name: a list of slot names which will be used in training + + Note: + Default is not used for all slots + """ + if self.proto_desc.name != "MultiSlotDataFeed": + raise ValueError( + "Only MultiSlotDataFeed need set_use_slots, pls check your datafeed.proto" + ) + for name in use_slots_name: + self.proto_desc.multi_slot_desc.slots[self.__name_to_index[ + name]].is_used = True + + def desc(self): + """ + Returns a protobuf message for this DataFeedDesc + + Example: + >>> data_feed = fluid.DataFeedDesc('data.proto') + >>> print(data_feed.desc()) + + Returns: + A string message + """ + return text_format.MessageToString(self.proto_desc) diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 288951cd7cd32155f136125fb817c35dd2ec6444..f2886090d75f87654b33cf7aa6f98ebf6f2e27d1 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -20,7 +20,7 @@ import six from .framework import Program, default_main_program, Variable from . import core -__all__ = ['Executor', 'global_scope', 'scope_guard', '_switch_scope'] +__all__ = ['Executor', 'global_scope', 'scope_guard'] g_scope = core.Scope() @@ -278,6 +278,7 @@ class Executor(object): p = core.Place() p.set_place(place) self.executor = core.Executor(p) + self.program_caches = dict() self._closed = False @@ -406,16 +407,17 @@ class Executor(object): Examples: - >>> data = layers.data(name='X', shape=[1], dtype='float32') - >>> hidden = layers.fc(input=data, size=10) - >>> layers.assign(hidden, out) - >>> loss = layers.mean(out) + >>> data = fluid.layers.data(name='X', shape=[1], dtype='float32') + >>> out = fluid.layers.create_tensor(dtype='float32') + >>> hidden = fluid.layers.fc(input=data, size=10) + >>> fluid.layers.assign(hidden,out) + >>> loss = fluid.layers.mean(out) >>> adam = fluid.optimizer.Adam() - >>> adam.minimize(loss) + >>> adam.minimize(loss) >>> cpu = core.CPUPlace() - >>> exe = Executor(cpu) - >>> exe.run(default_startup_program()) + >>> exe = fluid.Executor(cpu) + >>> exe.run(fluid.default_startup_program()) >>> x = numpy.random.random(size=(10, 1)).astype('float32') >>> outs = exe.run( diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index b991187d424108db176ebd6996d7d161f11dcd3d..b156db53d2928daefed0959fc3e0731709855343 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -89,12 +89,13 @@ def name_scope(prefix=None): Examples: .. code-block:: python + with name_scope("encoder"): ... with name_scope("decoder"): ... - with name_scope("attention"): - ... + with name_scope("attention"): + ... """ # TODO(panyx0718): Only [0-9a-z]. assert prefix, "namescope prefix cannot be empty." diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 4843af8340310e0f47964d41708b13216fcd2161..ce731f39ea099a4d8948812989ad19b3cce119ff 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -20,6 +20,7 @@ from __future__ import print_function from .layer_function_generator import generate_layer_fn from .layer_function_generator import autodoc, templatedoc from ..layer_helper import LayerHelper +from ..framework import Variable from . import tensor from . import nn from . import ops @@ -46,6 +47,7 @@ __all__ = [ 'iou_similarity', 'box_coder', 'polygon_box_transform', + 'yolov3_loss', ] @@ -401,6 +403,113 @@ def polygon_box_transform(input, name=None): return output +@templatedoc(op_type="yolov3_loss") +def yolov3_loss(x, + gtbox, + gtlabel, + anchors, + class_num, + ignore_thresh, + loss_weight_xy=None, + loss_weight_wh=None, + loss_weight_conf_target=None, + loss_weight_conf_notarget=None, + loss_weight_class=None, + name=None): + """ + ${comment} + + Args: + x (Variable): ${x_comment} + gtbox (Variable): groud truth boxes, should be in shape of [N, B, 4], + in the third dimenstion, x, y, w, h should be stored + and x, y, w, h should be relative value of input image. + N is the batch number and B is the max box number in + an image. + gtlabel (Variable): class id of ground truth boxes, shoud be ins shape + of [N, B]. + anchors (list|tuple): ${anchors_comment} + class_num (int): ${class_num_comment} + ignore_thresh (float): ${ignore_thresh_comment} + loss_weight_xy (float|None): ${loss_weight_xy_comment} + loss_weight_wh (float|None): ${loss_weight_wh_comment} + loss_weight_conf_target (float|None): ${loss_weight_conf_target_comment} + loss_weight_conf_notarget (float|None): ${loss_weight_conf_notarget_comment} + loss_weight_class (float|None): ${loss_weight_class_comment} + name (string): the name of yolov3 loss + + Returns: + Variable: A 1-D tensor with shape [1], the value of yolov3 loss + + Raises: + TypeError: Input x of yolov3_loss must be Variable + TypeError: Input gtbox of yolov3_loss must be Variable" + TypeError: Input gtlabel of yolov3_loss must be Variable" + TypeError: Attr anchors of yolov3_loss must be list or tuple + TypeError: Attr class_num of yolov3_loss must be an integer + TypeError: Attr ignore_thresh of yolov3_loss must be a float number + + Examples: + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32') + gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32') + gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32') + anchors = [10, 13, 16, 30, 33, 23] + loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80 + anchors=anchors, ignore_thresh=0.5) + """ + helper = LayerHelper('yolov3_loss', **locals()) + + if not isinstance(x, Variable): + raise TypeError("Input x of yolov3_loss must be Variable") + if not isinstance(gtbox, Variable): + raise TypeError("Input gtbox of yolov3_loss must be Variable") + if not isinstance(gtlabel, Variable): + raise TypeError("Input gtlabel of yolov3_loss must be Variable") + if not isinstance(anchors, list) and not isinstance(anchors, tuple): + raise TypeError("Attr anchors of yolov3_loss must be list or tuple") + if not isinstance(class_num, int): + raise TypeError("Attr class_num of yolov3_loss must be an integer") + if not isinstance(ignore_thresh, float): + raise TypeError( + "Attr ignore_thresh of yolov3_loss must be a float number") + + if name is None: + loss = helper.create_variable_for_type_inference(dtype=x.dtype) + else: + loss = helper.create_variable( + name=name, dtype=x.dtype, persistable=False) + + attrs = { + "anchors": anchors, + "class_num": class_num, + "ignore_thresh": ignore_thresh, + } + + if loss_weight_xy is not None and isinstance(loss_weight_xy, float): + self.attrs['loss_weight_xy'] = loss_weight_xy + if loss_weight_wh is not None and isinstance(loss_weight_wh, float): + self.attrs['loss_weight_wh'] = loss_weight_wh + if loss_weight_conf_target is not None and isinstance( + loss_weight_conf_target, float): + self.attrs['loss_weight_conf_target'] = loss_weight_conf_target + if loss_weight_conf_notarget is not None and isinstance( + loss_weight_conf_notarget, float): + self.attrs['loss_weight_conf_notarget'] = loss_weight_conf_notarget + if loss_weight_class is not None and isinstance(loss_weight_class, float): + self.attrs['loss_weight_class'] = loss_weight_class + + helper.append_op( + type='yolov3_loss', + inputs={"X": x, + "GTBox": gtbox, + "GTLabel": gtlabel}, + outputs={'Loss': loss}, + attrs=attrs) + return loss + + @templatedoc() def detection_map(detect_res, label, diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 3f47053961bcc41b82f1b6776e9365166e78ddbf..42f4959a83fe113d6cbbe0db355249a9c203d602 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -943,7 +943,18 @@ def __create_unshared_decorated_reader__(op_type, reader, attrs, name=None): def shuffle(reader, buffer_size): """ - Shuffle the reader. + Creates a data reader whose data output is shuffled. + Output from the iterator that created by original reader will be + buffered into shuffle buffer, and then shuffled. The size of shuffle buffer + is determined by argument buf_size. + + Args: + param reader: the original reader whose output will be shuffled. + type reader: callable + param buf_size: shuffle buffer size. + type buf_size: int + return: the new reader whose output is shuffled. + rtype: callable """ return __create_unshared_decorated_reader__( 'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)}) diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index eea0a362a0c31083f304a2167d0fdadfb30fb640..09b1b30216b03e71253ca8da1d462db897e1a607 100644 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -20,7 +20,7 @@ import string from six.moves import cStringIO from ..proto import framework_pb2 -from ..framework import OpProtoHolder, Variable +from ..framework import OpProtoHolder, Variable, core, convert_np_dtype_to_dtype_ from ..layer_helper import LayerHelper __all__ = [ @@ -178,6 +178,15 @@ def generate_layer_fn(op_type): "operator {0} must input same dtype. {1} vs {2}".format( op_type, dtype, each.dtype)) + if dtype is None: + arg_dtype = kwargs.get("dtype") + if arg_dtype: + if not isinstance(arg_dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(arg_dtype) + else: + dtype = arg_dtype + else: + dtype = core.VarDesc.VarType.FP32 return dtype def func(*args, **kwargs): diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index 149224bb68ac869dec14ac9f953f0072bd24c7e2..dde05189722fef77e03a1c2d8f3cbae44a3e8245 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -308,13 +308,9 @@ def piecewise_decay(boundaries, values): def append_LARS(params_grads, learning_rate, weight_decay): - """Applies LARS (LAYER-WISE ADAPTIVE RATE SCALING) to learning rate for - each layer. - - ```python - learning_rate *= local_gw_ratio * sqrt(sumsq(param)) - / (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param))) - ``` + """ + Applies LARS (LAYER-WISE ADAPTIVE RATE SCALING) to learning rate for + each layer. Args: learning_rate: A learning rate Variable. This @@ -323,6 +319,11 @@ def append_LARS(params_grads, learning_rate, weight_decay): Returns: The decayed learning rate + Examples: + .. code-block:: python + + learning_rate *= local_gw_ratio * sqrt(sumsq(param)) + / (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param))) """ def _balanced_weight(param_norm, grad_norm): diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 6d05ca8461bc7e03db5294cd99c36c349ed1ba92..06d7e429ae25cb3fb074ff621e295ea682d14d61 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -170,8 +170,13 @@ __all__ = [ 'log_loss', 'add_position_encoding', 'bilinear_tensor_product', + 'merge_selected_rows', + 'get_tensor_from_selected_rows', + 'lstm', ] +kIgnoreIndex = -100 + def fc(input, size, @@ -327,6 +332,11 @@ def embedding(input, """ helper = LayerHelper('embedding', **locals()) + remote_prefetch = False + if os.environ.get('PADDLE_ENABLE_REMOTE_PREFETCH'): + remote_prefetch = True + if remote_prefetch: + assert is_sparse is True and is_distributed is False w = helper.create_parameter( attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False) tmp = helper.create_variable_for_type_inference(dtype) @@ -340,6 +350,7 @@ def embedding(input, attrs={ 'is_sparse': is_sparse, 'is_distributed': is_distributed, + 'remote_prefetch': remote_prefetch, 'padding_idx': padding_idx }) return tmp @@ -467,6 +478,168 @@ def dynamic_lstm(input, return hidden, cell +def lstm(input, + init_h, + init_c, + max_len, + hidden_size, + num_layers, + dropout_prob=0.0, + is_bidirec=False, + is_test=False, + name=None, + default_initializer=None, + seed=-1): + """ + If Device is GPU, This op will use cudnn LSTM implementation + + A four-gate Long Short-Term Memory network with no peephole connections. + In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, + the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations: + + $$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$ + + $$ f_t = \\sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$ + + $$ o_t = \\sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$ + + $$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$ + + $$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$ + + $$ h_t = o_t \\odot tanh(c_t) $$ + + - W terms denote weight matrices (e.g. $W_{ix}$ is the matrix + of weights from the input gate to the input) + - The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector). + - sigmoid is the logistic sigmoid function. + - $i, f, o$ and $c$ are the input gate, forget gate, output gate, + and cell activation vectors, respectively, all of which have the same size as + the cell output activation vector $h$. + - The $\odot$ is the element-wise product of the vectors. + - `tanh` is the activation functions. + - $\tilde{c_t}$ is also called candidate hidden state, + which is computed based on the current input and the previous hidden state. + + Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, + X represensts a matrix multiplication + + + Args: + input (Variable): LSTM input tensor, shape MUST be ( seq_len x batch_size x input_size ) + init_h(Variable): The initial hidden state of the LSTM + This is a tensor with shape ( num_layers x batch_size x hidden_size) + if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size) + init_c(Variable): The initial cell state of the LSTM. + This is a tensor with shape ( num_layers x batch_size x hidden_size ) + if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size) + max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len + hidden_size (int): hidden size of the LSTM + num_layers (int): total layers number of the LSTM + dropout_prob(float|0.0): dropout prob, dropout ONLY work between rnn layers, NOT between time steps + There is NO dropout work on rnn output of the last RNN layers + is_bidirec (bool): If it is bidirectional + is_test (bool): If it is in test phrase + name (str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + default_initializer(Initialize|None): Where use initializer to initialize the Weight + If set None, defaule initializer will be used + seed(int): Seed for dropout in LSTM, If it's -1, dropout will use random seed + + + Returns: + rnn_out(Tensor): result of LSTM hidden, shape is (seq_len x batch_size x hidden_size) + if is_bidirec set to True, shape will be ( seq_len x batch_sze x hidden_size*2) + last_h(Tensor): the hidden state of the last step of LSTM + shape is ( num_layers x batch_size x hidden_size ) + if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) + last_c(Tensor): the cell state of the last step of LSTM + shape is ( num_layers x batch_size x hidden_size ) + if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size) + + + Examples: + .. code-block:: python + + input = embedding + batch_size = 20 + max_len = 100 + dropout_prob = 0.2 + input_size = 100 + hidden_size = 150 + num_layers = 1 + init_hidden1 = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0, stop_grad=False) + init_cell1 = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0, stop_grad=False) + + rnn_out, last_h, last_c = layers.lstm( input, init_h, init_c, \ + max_len, dropout_prob, input_size, hidden_size, \ + num_layers) + """ + + helper = LayerHelper('cudnn_lstm', **locals()) + + dtype = input.dtype + input_shape = list(input.shape) + input_size = input_shape[-1] + weight_size = 0 + for i in range(num_layers): + if i == 0: + input_weight_size = (input_size * hidden_size) * 4 + else: + if is_bidirec: + input_weight_size = (hidden_size * 2 * hidden_size) * 4 + else: + input_weight_size = (hidden_size * hidden_size) * 4 + + hidden_weight_size = (hidden_size * hidden_size) * 4 + + if is_bidirec: + weight_size += (input_weight_size + hidden_weight_size) * 2 + weight_size += hidden_size * 8 * 2 + else: + weight_size += input_weight_size + hidden_weight_size + weight_size += hidden_size * 8 + + weight = helper.create_parameter( + attr=helper.param_attr, + shape=[weight_size], + dtype=dtype, + default_initializer=default_initializer) + + out = helper.create_variable_for_type_inference(dtype) + last_h = helper.create_variable_for_type_inference(dtype) + last_c = helper.create_variable_for_type_inference(dtype) + + cache = helper.create_variable( + persistable=True, type=core.VarDesc.VarType.RAW, stop_gradient=True) + + helper.append_op( + type='cudnn_lstm', + inputs={ + 'Input': input, + 'InitH': init_h, + 'InitC': init_c, + 'W': weight, + 'Cache': cache, + }, + outputs={ + 'Out': out, + 'last_h': last_h, + 'last_c': last_c, + }, + attrs={ + 'max_len': max_len, + 'is_bidirec': is_bidirec, + 'input_size': input_size, + 'hidden_size': hidden_size, + 'num_layers': num_layers, + 'is_test': is_test, + 'dropout_prob': dropout_prob, + 'seed': seed, + }) + return out, last_h, last_c + + def dynamic_lstmp(input, size, proj_size, @@ -758,7 +931,7 @@ def dynamic_gru(input, emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim]) hidden_dim = 512 x = fluid.layers.fc(input=emb, size=hidden_dim * 3) - hidden = fluid.layers.dynamic_gru(input=x, dim=hidden_dim) + hidden = fluid.layers.dynamic_gru(input=x, size=hidden_dim) """ helper = LayerHelper('gru', **locals()) @@ -1099,7 +1272,7 @@ def dropout(x, return out -def cross_entropy(input, label, soft_label=False, ignore_index=-100): +def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex): """ **Cross Entropy Layer** @@ -1146,7 +1319,7 @@ def cross_entropy(input, label, soft_label=False, ignore_index=-100): labels. Default: `False`. ignore_index (int): Specifies a target value that is ignored and does not contribute to the input gradient. Only valid - if soft_label is set to False. Default: -100 + if soft_label is set to False. Default: kIgnoreIndex Returns: A 2-D tensor with shape [N x 1], the cross entropy loss. @@ -2313,7 +2486,8 @@ def batch_norm(input, moving_mean_name=None, moving_variance_name=None, do_model_average_for_mean_and_var=False, - fuse_with_relu=False): + fuse_with_relu=False, + use_global_stats=False): """ **Batch Normalization Layer** @@ -2340,6 +2514,19 @@ def batch_norm(input, \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + + When use_global_stats = True, the :math:`\\mu_{\\beta}` + and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. + They are global (or running) statistics. (It usually got from the + pre-trained model.) + The training and testing (or inference) have the same behavior: + + .. math:: + + \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ + \\sigma_{\\beta}^{2} + \\epsilon}} \\\\ + y_i &\\gets \\gamma \\hat{x_i} + \\beta + Args: input(variable): The input variable which is a LoDTensor. act(string, Default None): Activation type, linear|relu|prelu|... @@ -2362,6 +2549,11 @@ def batch_norm(input, moving_variance_name(string, Default None): The name of the moving_variance which store the global Variance. do_model_average_for_mean_and_var(bool, Default False): Do model average for mean and variance or not. fuse_with_relu (bool): if True, this OP performs relu after batch norm. + use_global_stats(bool, Default False): Whether to use global mean and + variance. In inference or test mode, set use_global_stats to true + or is_test to true, and the behavior is equivalent. + In train mode, when setting use_global_stats True, the global mean + and variance are also used during train period. Returns: Variable: A tensor variable which is the result after applying batch normalization on the input. @@ -2394,9 +2586,15 @@ def batch_norm(input, shape=param_shape, dtype=dtype, default_initializer=Constant(1.0)) + # setting stop_gradient=True to reduce computation + if use_global_stats and helper.param_attr.learning_rate == 0.: + scale.stop_gradient = True bias = helper.create_parameter( attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True) + # setting stop_gradient=True to reduce computation + if use_global_stats and helper.bias_attr.learning_rate == 0.: + scale.stop_gradient = True mean = helper.create_parameter( attr=ParamAttr( @@ -2452,7 +2650,8 @@ def batch_norm(input, "epsilon": epsilon, "is_test": is_test, "use_mkldnn": False, - "fuse_with_relu": fuse_with_relu + "fuse_with_relu": fuse_with_relu, + "use_global_stats": use_global_stats }) return helper.append_activation(batch_norm_out) @@ -3402,6 +3601,7 @@ def beam_search_decode(ids, scores, beam_size, end_id, name=None): Examples: .. code-block:: python + # Suppose `ids` and `scores` are LodTensorArray variables reserving # the selected ids and scores of all steps finished_ids, finished_scores = layers.beam_search_decode( @@ -4231,8 +4431,15 @@ def ctc_greedy_decoder(input, blank, name=None): [0.5, 0.1, 0.3, 0.1]] input.lod = [[4, 4]] + + Computation: - Then: + step1: Apply argmax to first input sequence which is input.data[0:4]. Then we get: + [[0], [2], [1], [0]] + step2: merge repeated tokens and remove blank which is 0. Then we get first output sequence: + [[2], [1]] + + Finally: output.data = [[2], [1], @@ -4240,6 +4447,7 @@ def ctc_greedy_decoder(input, blank, name=None): output.lod = [[2, 1]] + Args: input(Variable): (LoDTensor), the probabilities of @@ -4254,8 +4462,10 @@ def ctc_greedy_decoder(input, blank, name=None): name (str): The name of this layer. It is optional. Returns: - Variable: CTC greedy decode result. If all the sequences in result were - empty, the result LoDTensor will be [-1] with LoD [[]] and dims [1, 1]. + Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1]. + 'Lp' is the sum if all output sequences' length. If all the sequences + in result were empty, the result LoDTensor will be [-1] with + LoD [[]] and dims [1, 1]. Examples: .. code-block:: python @@ -4889,7 +5099,7 @@ def im2sequence(input, output.lod = [[4, 4]] - Examples: + Examples: .. code-block:: python @@ -4993,7 +5203,7 @@ def multiplex(inputs, index): def softmax_with_cross_entropy(logits, label, soft_label=False, - ignore_index=-100, + ignore_index=kIgnoreIndex, numeric_stable_mode=False, return_softmax=False): """ @@ -5051,7 +5261,7 @@ def softmax_with_cross_entropy(logits, labels as soft labels. By default, `soft_label` is set to False. ignore_index (int): Specifies a target value that is ignored and does not contribute to the input gradient. Only valid - if soft_label is set to False. Default: -100 + if soft_label is set to False. Default: kIgnoreIndex numeric_stable_mode (bool): A flag to indicate whether to use a more numerically stable algorithm. Only valid when soft_label is False and GPU is used. @@ -5676,24 +5886,23 @@ def pad_constant_like(x, y, pad_value=0., name=None): [[38, 39, 40]], [[41, 42, 43]]]] Y.shape = (1, 3, 1, 3) + And + pad_value = -1, - And - pad_value = -1, - - Return: - Out = [[[[35, 36, 37], - [-1, -1, -1]], - [[38, 39, 40], - [-1, -1, -1]], - [[41, 42, 43], - [-1, -1, -1]]], - [[[-1, -1, -1], - [-1, -1, -1]], - [[-1, -1, -1], - [-1, -1, -1]], - [[-1, -1, -1], - [-1, -1, -1]]]] - Out.shape = (2, 3, 2, 3) + Return: + Out = [[[[35, 36, 37], + [-1, -1, -1]], + [[38, 39, 40], + [-1, -1, -1]], + [[41, 42, 43], + [-1, -1, -1]]], + [[[-1, -1, -1], + [-1, -1, -1]], + [[-1, -1, -1], + [-1, -1, -1]], + [[-1, -1, -1], + [-1, -1, -1]]]] + Out.shape = (2, 3, 2, 3) Args: x (Variable): The input tensor variable. @@ -5932,6 +6141,7 @@ def image_resize(input, Supporting resample methods: 'BILINEAR' : Bilinear interpolation + 'NEAREST' : Nearest neighbor interpolation Args: @@ -6587,7 +6797,7 @@ def crop(x, shape=None, offsets=None, name=None): # or z = fluid.layers.data(name="z", shape=[3, 5], dtype="float32") - crop = fluid.layers.crop(z, shape=[2, 3]) + crop = fluid.layers.crop(z, shape=[-1, 2, 3]) """ helper = LayerHelper('crop', **locals()) @@ -6868,44 +7078,45 @@ def pad2d(input, than height-1. And the width dimension has the same condition. Example: + .. code-block:: text - Given that X is a channel of image from input: + Given that X is a channel of image from input: - X = [[1, 2, 3], - [4, 5, 6]] + X = [[1, 2, 3], + [4, 5, 6]] - Case 0: + Case 0: - paddings = [0, 1, 2, 3], - mode = 'constant' - pad_value = 0 + paddings = [0, 1, 2, 3], + mode = 'constant' + pad_value = 0 - Out = [[0, 0, 1, 2, 3, 0, 0, 0] - [0, 0, 4, 5, 6, 0, 0, 0] - [0, 0, 0, 0, 0, 0, 0, 0]] + Out = [[0, 0, 1, 2, 3, 0, 0, 0] + [0, 0, 4, 5, 6, 0, 0, 0] + [0, 0, 0, 0, 0, 0, 0, 0]] - Case 1: + Case 1: - paddings = [0, 1, 2, 1], - mode = 'reflect' + paddings = [0, 1, 2, 1], + mode = 'reflect' - Out = [[3, 2, 1, 2, 3, 2] - [6, 5, 4, 5, 6, 5] - [3, 2, 1, 2, 3, 2]] + Out = [[3, 2, 1, 2, 3, 2] + [6, 5, 4, 5, 6, 5] + [3, 2, 1, 2, 3, 2]] - Case 2: + Case 2: - paddings = [0, 1, 2, 1], - mode = 'edge' + paddings = [0, 1, 2, 1], + mode = 'edge' - Out = [[1, 1, 1, 2, 3, 3] - [4, 4, 4, 5, 6, 6] - [4, 4, 4, 5, 6, 6]] + Out = [[1, 1, 1, 2, 3, 3] + [4, 4, 4, 5, 6, 6] + [4, 4, 4, 5, 6, 6]] Args: input (Variable): The input image with [N, C, H, W] format or [N, H, W, C] format. - paddings (tuple|list): The padding size. If padding is a tuple, it must + paddings (tuple|list|Variable): The padding size. If padding is a tuple, it must contain four integers, (padding_top, padding_bottom, padding_left, padding_right). Default: padding = [0, 0, 0, 0]. mode (str): Three modes: constant(default), reflect, edge. Default: constant @@ -6930,16 +7141,17 @@ def pad2d(input, helper = LayerHelper('pad2d', **locals()) dtype = helper.input_dtype(input_param_name='input') out = helper.create_variable_for_type_inference(dtype) + inputs = {'X': input} + attrs = {'mode': mode, 'pad_value': pad_value, 'data_format': data_format} + + if isinstance(paddings, Variable): + inputs['Paddings'] = paddings + attrs['paddings'] = [] + else: + attrs['paddings'] = paddings + helper.append_op( - type='pad2d', - inputs={'X': input}, - outputs={"Out": out}, - attrs={ - 'paddings': paddings, - 'mode': mode, - 'pad_value': pad_value, - 'data_frmat': data_format - }) + type='pad2d', inputs=inputs, outputs={"Out": out}, attrs=attrs) return out @@ -7137,13 +7349,13 @@ def prelu(x, mode, param_attr=None, name=None): Args: x (Variable): The input tensor. param_attr(ParamAttr|None): The parameter attribute for the learnable - weight (alpha). + weight (alpha). mode (string): The mode for weight sharing. It supports all, channel - and element. all: all elements share same weight - channel:elements in a channel share same weight - element:each element has a weight + and element. all: all elements share same weight + channel:elements in a channel share same weight + element:each element has a weight name(str|None): A name for this layer(optional). If set None, the layer - will be named automatically. + will be named automatically. Returns: Variable: The output tensor with the same shape as input. @@ -7587,6 +7799,11 @@ def uniform_random_batch_size_like(input, Returns: out (Variable): ${out_comment} + Examples: + .. code-block:: python + + input = layers.data(name="input", shape=[13, 11], dtype='float32') + out = layers.uniform_random_batch_size_like(input, [-1, 11]) """ helper = LayerHelper('uniform_random_batch_size_like', **locals()) @@ -7624,6 +7841,10 @@ def gaussian_random(shape, mean=0.0, std=1.0, seed=0, dtype='float32'): Returns: out (Variable): ${out_comment} + Examples: + .. code-block:: python + + out = layers.gaussian_random(shape=[20, 30]) """ helper = LayerHelper('gaussian_random', **locals()) @@ -7659,6 +7880,16 @@ def sampling_id(x, min=0.0, max=1.0, seed=0, dtype='float32'): Returns: out (Variable): ${out_comment} + Examples: + .. code-block:: python + + x = layers.data( + name="X", + shape=[13, 11], + dtype='float32', + append_batch_size=False) + + out = layers.sampling_id(x) """ helper = LayerHelper('sampling_id', **locals()) @@ -7698,6 +7929,14 @@ def gaussian_random_batch_size_like(input, Returns: out (Variable): ${out_comment} + + Examples: + .. code-block:: python + + input = layers.data(name="input", shape=[13, 11], dtype='float32') + + out = layers.gaussian_random_batch_size_like( + input, shape=[-1, 11], mean=1.0, std=2.0) """ helper = LayerHelper('gaussian_random_batch_size_like', **locals()) @@ -7730,6 +7969,12 @@ def sum(x): Returns: out (Variable): ${out_comment} + + Examples: + .. code-block:: python + + input = layers.data(name="input", shape=[13, 11], dtype='float32') + out = layers.sum(input) """ helper = LayerHelper('sum', **locals()) @@ -7758,6 +8003,17 @@ def slice(input, axes, starts, ends): Returns: out (Variable): ${out_comment} + Examples: + .. code-block:: python + + starts = [1, 0, 2] + ends = [3, 3, 4] + axes = [0, 1, 2] + + input = layers.data( + name="input", shape=[3, 4, 5, 6], dtype='float32') + + out = layers.slice(input, axes=axes, starts=starts, ends=ends) """ helper = LayerHelper('slice', **locals()) @@ -7785,6 +8041,12 @@ def shape(input): Returns: out (Variable): ${out_comment} + Examples: + .. code-block:: python + + input = layers.data( + name="input", shape=[3, 100, 100], dtype="float32") + out = layers.shape(input) """ helper = LayerHelper('shape', **locals()) @@ -8135,6 +8397,29 @@ def mean(x, name=None): return out +@templatedoc() +def merge_selected_rows(x, name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + name(basestring|None): Name of the output. + + Returns: + out(${out_type}): ${out_comment} + """ + + helper = LayerHelper("merge_selected_rows", **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type="merge_selected_rows", + inputs={"X": x}, + attrs={}, + outputs={"Out": out}) + return out + + @templatedoc() def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None): """ @@ -8172,13 +8457,17 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None): @templatedoc() -def sigmoid_cross_entropy_with_logits(x, label, name=None): +def sigmoid_cross_entropy_with_logits(x, + label, + ignore_index=kIgnoreIndex, + name=None): """ ${comment} Args: x(${x_type}): ${x_comment} label(${label_type}): ${label_comment} + ignore_index(&{ignore_index}): ${ignore_index_comment} name(basestring|None): Name of the output. Returns: @@ -8197,7 +8486,7 @@ def sigmoid_cross_entropy_with_logits(x, label, name=None): type="sigmoid_cross_entropy_with_logits", inputs={"X": x, "Label": label}, - attrs={}, + attrs={"ignore_index": ignore_index}, outputs={"Out": out}) return out @@ -8783,3 +9072,26 @@ def bilinear_tensor_product(x, # add activation return helper.append_activation(out) + + +@templatedoc() +def get_tensor_from_selected_rows(x, name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + name(basestring|None): Name of the output. + + Returns: + out(${out_type}): ${out_comment} + """ + + helper = LayerHelper('get_tensor_from_selected_rows', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type='get_tensor_from_selected_rows', + inputs={'X': x}, + outputs={'Out': out}, + attrs={}) + return out diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index ff32c00104171bf42c00be33f05758a4387228e1..49a486cf0c3d11b18417e8838aead07d748f3e02 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -622,7 +622,7 @@ def reverse(x, axis): out = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op( type='reverse', - inputs={'Input': x}, + inputs={'X': x}, outputs={'Out': [out]}, attrs={'axis': axis}) return out diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index 829154f1b23d6e49bf963762be6b6488c98ec94a..85af8fea13d5b9a1e22014fbd727e1baed3247be 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -222,13 +222,13 @@ class Precision(MetricBase): Examples: .. code-block:: python - metric = fluid.metrics.Precision() - for pass in range(PASSES): - metric.reset() - for data in train_reader(): - loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) - metric.update(preds=preds, labels=labels) - numpy_precision = metric.eval() + metric = fluid.metrics.Precision() + for pass in range(PASSES): + metric.reset() + for data in train_reader(): + loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) + metric.update(preds=preds, labels=labels) + numpy_precision = metric.eval() """ def __init__(self, name=None): @@ -267,13 +267,13 @@ class Recall(MetricBase): Examples: .. code-block:: python - metric = fluid.metrics.Recall() - for pass in range(PASSES): - metric.reset() - for data in train_reader(): - loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) - metric.update(preds=preds, labels=labels) - numpy_recall = metric.eval() + metric = fluid.metrics.Recall() + for pass in range(PASSES): + metric.reset() + for data in train_reader(): + loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) + metric.update(preds=preds, labels=labels) + numpy_recall = metric.eval() """ def __init__(self, name=None): @@ -449,8 +449,9 @@ class EditDistance(MetricBase): distance_evaluator.update(distances, seq_num) distance, instance_error = distance_evaluator.eval() - In the above example: + In the above example: 'distance' is the average of the edit distance in a pass. + 'instance_error' is the instance error rate in a pass. """ diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index bdcd045341212d6cf9dbfbc3cebc72f320e37e9d..dc27a8eabb5e3671250958b67ea7212ad9bdf08b 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -95,7 +95,14 @@ class ParallelExecutor(object): self._places = [] self._act_places = [] if use_cuda: - for i in six.moves.range(core.get_cuda_device_count()): + gpus = [] + gpus_env = os.getenv("FLAGS_selected_gpus") + if gpus_env: + gpus = [int(s) for s in gpus_env.split(",")] + else: + for i in six.moves.range(core.get_cuda_device_count()): + gpus.append(i) + for i in gpus: p = core.Place() self._act_places.append(core.CUDAPlace(i)) p.set_place(self._act_places[-1]) diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index a51607bfdb1dde3d25f490770cc2ba368ceb27ff..38ddf93198d7c58382e36a5b7af488f56e6f9878 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -50,8 +50,9 @@ class ParamAttr(object): w_param_attrs = fluid.ParamAttr(name="fc_weight", learning_rate=0.5, - regularizer=fluid.L2Decay(1.0), + regularizer=fluid.regularizer.L2Decay(1.0), trainable=True) + x = fluid.layers.data(name='X', shape=[1], dtype='float32') y_predict = fluid.layers.fc(input=x, size=10, param_attr=w_param_attrs) """ diff --git a/python/paddle/fluid/tests/demo/async_executor.py b/python/paddle/fluid/tests/demo/async_executor.py new file mode 100644 index 0000000000000000000000000000000000000000..fe8da0aab74bd5fc6219666236a04423a6d60489 --- /dev/null +++ b/python/paddle/fluid/tests/demo/async_executor.py @@ -0,0 +1,100 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import tarfile +import paddle.fluid as fluid +import paddle +from paddle.fluid import core + +URL = 'http://paddle-unittest-data.gz.bcebos.com/python_paddle_fluid_tests_demo_async-executor/train_data.tar.gz' +MD5 = '2a405a31508969b3ab823f42c0f522ca' + + +def bow_net(data, + label, + dict_dim=89528, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2): + """ + BOW net + This model is from https://github.com/PaddlePaddle/models: + models/fluid/PaddleNLP/text_classification/nets.py + """ + # embedding + emb = fluid.layers.embedding( + input=data, size=[dict_dim, emb_dim], is_sparse=True) + bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') + bowh = fluid.layers.tanh(bow) + # fc layer after conv + fc_1 = fluid.layers.fc(input=bowh, size=hid_dim, act="tanh") + fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") + # probability of each class + prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") + # cross entropy loss + cost = fluid.layers.cross_entropy(input=prediction, label=label) + # mean loss + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + return avg_cost, acc, prediction + + +def train(): + # Download data + with tarfile.open(paddle.dataset.common.download(URL, "imdb", MD5)) as tarf: + tarf.extractall(path='./') + tarf.close() + + # Initialize dataset description + dataset = fluid.DataFeedDesc('train_data/data.prototxt') + dataset.set_batch_size(128) # See API doc for how to change other fields + print dataset.desc() # Debug purpose: see what we get + + # define network + # input text data + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + # label data + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + avg_cost, acc, prediction = bow_net(data, label) + sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.002) + opt_ops, weight_and_grad = sgd_optimizer.minimize(avg_cost) + + # Run startup program + startup_program = fluid.default_startup_program() + place = fluid.CPUPlace() + executor = fluid.Executor(place) + executor.run(startup_program) + + async_executor = fluid.AsyncExecutor(place) + main_program = fluid.default_main_program() + epochs = 10 + filelist = ["train_data/part-%d" % i for i in range(12)] + for i in range(epochs): + thread_num = 4 + async_executor.run( + main_program, # This can be changed during iteration + dataset, # This can be changed during iteration + filelist, # This can be changed during iteration + thread_num, # This can be changed during iteration + [data, acc], # Multiple fetch targets can be specified + debug=False) + fluid.io.save_inference_model('imdb/epoch%d.model' % i, + [data.name, label.name], [acc], executor) + + +if __name__ == "__main__": + train() diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index a2eca5541a152ca99804a7f87c9b0bc3d12d4eee..d99eaa0634f93dcd16dd80ae172f11e8090a2623 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -388,5 +388,18 @@ class TestGenerateProposals(unittest.TestCase): print(rpn_rois.shape) +class TestYoloDetection(unittest.TestCase): + def test_yolov3_loss(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[30, 7, 7], dtype='float32') + gtbox = layers.data(name='gtbox', shape=[10, 4], dtype='float32') + gtlabel = layers.data(name='gtlabel', shape=[10], dtype='int32') + loss = layers.yolov3_loss(x, gtbox, gtlabel, [10, 13, 30, 13], 10, + 0.5) + + self.assertIsNotNone(loss) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/test_gradient_clip.py b/python/paddle/fluid/tests/test_gradient_clip.py deleted file mode 100644 index 266687fcd092dfdeec9343e2592f4c22b683d588..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/test_gradient_clip.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import print_function - -import numpy as np -import paddle -import paddle.fluid as fluid - -BATCH_SIZE = 128 -CLIP = 1 - -prog = fluid.framework.Program() -with fluid.program_guard(main_program=prog): - image = fluid.layers.data(name='x', shape=[784], dtype='float32') - - hidden1 = fluid.layers.fc(input=image, size=128, act='relu') - hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu') - predict = fluid.layers.fc(input=hidden2, size=10, act='softmax') - - label = fluid.layers.data(name='y', shape=[1], dtype='int64') - - cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(cost) - -prog_clip = prog.clone() - -avg_cost_clip = prog_clip.block(0).var(avg_cost.name) - -p_g = fluid.backward.append_backward(loss=avg_cost) -p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip) - -with fluid.program_guard(main_program=prog_clip): - fluid.clip.set_gradient_clip( - fluid.clip.GradientClipByGlobalNorm(clip_norm=CLIP)) - p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip) - -grad_list = [elem[1] for elem in p_g] -grad_clip_list = [elem[1] for elem in p_g_clip] - -train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.mnist.train(), buf_size=8192), - batch_size=BATCH_SIZE) - -place = fluid.CPUPlace() -exe = fluid.Executor(place) -feeder = fluid.DataFeeder(feed_list=[image, label], place=place) -exe.run(fluid.default_startup_program()) - -count = 0 -for data in train_reader(): - count += 1 - if count > 5: - break - out = exe.run(prog, feed=feeder.feed(data), fetch_list=grad_list) - out_clip = exe.run(prog_clip, - feed=feeder.feed(data), - fetch_list=grad_clip_list) - global_norm = 0 - for v in out[1:]: - global_norm += np.sum(np.power(v, 2)) - global_norm = np.sqrt(global_norm) - - global_norm_clip = 0 - for v in out_clip[1:]: - global_norm_clip += np.sum(np.power(v, 2)) - global_norm_clip = np.sqrt(global_norm_clip) - - if not np.isclose( - a=global_norm_clip, b=np.minimum(global_norm, CLIP), rtol=5e-3): - exit(1) -exit(0) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 26035f303e72a87b81fdb120fbb92894d78e996b..61cfdb80af04ede49621bc680fa0fa733a21a2a3 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -43,13 +43,14 @@ if(APPLE) list(REMOVE_ITEM TEST_OPS test_desc_clone) list(REMOVE_ITEM TEST_OPS test_program_code) endif(NOT WITH_DISTRIBUTE) - message(WARNING "These tests has been disabled in OSX before being fixed: \n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext") + message(WARNING "These tests has been disabled in OSX before being fixed: \n test_gradient_clip \n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext") # this op is not support on mac list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op) # TODO: add the unitest back when it fixed list(REMOVE_ITEM TEST_OPS test_detection_map_op) list(REMOVE_ITEM TEST_OPS test_dist_se_resnext) list(REMOVE_ITEM TEST_OPS test_fuse_elewise_add_act_pass) + list(REMOVE_ITEM TEST_OPS test_gradient_clip) endif() if(NOT WITH_MKLML) # this op is not support on openblas @@ -95,13 +96,12 @@ if(WITH_DISTRIBUTE) if(NOT APPLE) set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200) + py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext) + set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000) # FIXME(typhoonzero): add these tests back - # py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext) - # set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000) # py_test_modules(test_dist_transformer MODULES test_dist_transformer) # set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000) - # TODO(typhoonzero): make dist test parallel when fix port management issue - set_tests_properties(test_dist_mnist test_dist_word2vec test_dist_ctr test_dist_simnet_bow test_dist_save_load test_dist_text_classification test_dist_mnist_batch_merge PROPERTIES RUN_SERIAL TRUE) + set_tests_properties(test_dist_ctr test_dist_mnist test_dist_mnist_batch_merge test_dist_save_load test_dist_se_resnext test_dist_simnet_bow test_dist_text_classification test_dist_train test_dist_word2vec PROPERTIES RUN_SERIAL TRUE) endif(NOT APPLE) py_test_modules(test_dist_transpiler MODULES test_dist_transpiler) endif() diff --git a/python/paddle/fluid/tests/unittests/dist_ctr.py b/python/paddle/fluid/tests/unittests/dist_ctr.py index 902dc6544ed6858c4cd8d64b14d6af2367059091..65969824338a5c354415cac8a34bd3863716bef4 100644 --- a/python/paddle/fluid/tests/unittests/dist_ctr.py +++ b/python/paddle/fluid/tests/unittests/dist_ctr.py @@ -16,11 +16,13 @@ from __future__ import print_function import paddle import paddle.fluid as fluid +import os import dist_ctr_reader from test_dist_base import TestDistRunnerBase, runtime_main IS_SPARSE = True +os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1" # Fix seed for test fluid.default_startup_program().random_seed = 1 diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 271b9c740fd99554e9a7aa8d476a52cf6385b1d9..76a707efdc0804be0316ab12c347ffed6199529a 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -216,6 +216,15 @@ class OpTest(unittest.TestCase): self.dtype) outputs = append_input_output(block, op_proto, self.outputs, False, self.dtype) + + if hasattr(self, "cache_name_list"): + for name in self.cache_name_list: + inputs[name] = block.create_var( + name=name, + persistable=True, + type=core.VarDesc.VarType.RAW, + stop_gradient=True) + op = block.append_op( type=self.op_type, inputs=inputs, @@ -428,8 +437,17 @@ class OpTest(unittest.TestCase): op_inputs = self.inputs if hasattr(self, "inputs") else dict() op_outputs = self.outputs if hasattr(self, "outputs") else dict() op_attrs = self.attrs if hasattr(self, "attrs") else dict() - self.op = create_op(self.scope, self.op_type, op_inputs, op_outputs, - op_attrs) + + cache_list = None + if hasattr(self, "cache_name_list"): + cache_list = self.cache_name_list + self.op = create_op( + self.scope, + self.op_type, + op_inputs, + op_outputs, + op_attrs, + cache_list=cache_list) if no_grad_set is None: no_grad_set = set() diff --git a/python/paddle/fluid/tests/unittests/test_async_executor.py b/python/paddle/fluid/tests/unittests/test_async_executor.py new file mode 100644 index 0000000000000000000000000000000000000000..43855b95f9e3096d58ca3e8acfdb25f034bab175 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_async_executor.py @@ -0,0 +1,142 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid as fluid +import paddle +import unittest +import tarfile +import os +import shutil + +proto_str = ('name: "MultiSlotDataFeed"\n' + 'batch_size: 2\n' + 'multi_slot_desc {\n' + ' slots {\n' + ' name: "words"\n' + ' type: "uint64"\n' + ' is_dense: false\n' + ' is_used: true\n' + ' }\n' + ' slots {\n' + ' name: "label"\n' + ' type: "uint64"\n' + ' is_dense: false\n' + ' is_used: true\n' + ' }\n' + '}') + +URL = 'http://paddle-unittest-data.gz.bcebos.com/python_paddle_fluid_tests_demo_async-executor/train_data.tar.gz' +MD5 = '2a405a31508969b3ab823f42c0f522ca' + + +def bow_net(data, + label, + dict_dim=89528, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2): + """ + BOW net + This model is from https://github.com/PaddlePaddle/models: + models/fluid/PaddleNLP/text_classification/nets.py + """ + # embedding + emb = fluid.layers.embedding( + input=data, size=[dict_dim, emb_dim], is_sparse=True) + bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') + bowh = fluid.layers.tanh(bow) + # fc layer after conv + fc_1 = fluid.layers.fc(input=bowh, size=hid_dim, act="tanh") + fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") + # probability of each class + prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") + # cross entropy loss + cost = fluid.layers.cross_entropy(input=prediction, label=label) + # mean loss + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=prediction, label=label) + return avg_cost, acc, prediction + + +class TestAsyncExecutor(unittest.TestCase): + def setUp(self): + with open('./data.prototxt', 'w+') as f: + f.write(proto_str) + f.close() + + with tarfile.open(paddle.dataset.common.download(URL, "imdb", + MD5)) as tarf: + tarf.extractall(path='./') + tarf.close() + + def test_data_feed_desc(self): + data_feed = fluid.DataFeedDesc('./data.prototxt') + # assertEqueal(data_feed.proto_desc.batch, 2) + # assertEqual(len(data_feed.proto_desc.multi_slot_desc), 2) + self.assertEqual(" ".join(data_feed.desc().split()), + " ".join(proto_str.split())) + + def test_run(self): + # Initialize dataset description + data_feed = fluid.DataFeedDesc('train_data/data.prototxt') + data_feed.set_batch_size( + 128) # See API doc for how to change other fields + + # define network + # input text data + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + # label data + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + avg_cost, acc, prediction = bow_net(data, label) + sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.002) + opt_ops, weight_and_grad = sgd_optimizer.minimize(avg_cost) + + # Run startup program + startup_program = fluid.default_startup_program() + place = fluid.CPUPlace() + executor = fluid.Executor(place) + executor.run(startup_program) + + main_program = fluid.default_main_program() + async_executor = fluid.AsyncExecutor(place) + + self.assertRaises(TypeError, async_executor.run) + self.assertRaises(TypeError, async_executor.run, main_program) + self.assertRaises(TypeError, async_executor.run, main_program, + data_feed) + + filelist = ['train_data/part-%d' % i for i in range(10)] + self.assertRaises(TypeError, async_executor.run, main_program, + data_feed, filelist) + + thread_num = 4 + self.assertRaises(TypeError, async_executor.run, main_program, + data_feed, filelist, thread_num) + + async_executor.run(main_program, data_feed, filelist, thread_num, [acc]) + fluid.io.save_inference_model("imdb.model", [data.name, label.name], + [acc], executor) + statinfo = os.stat('imdb.model/__model__') + self.assertGreater(statinfo.st_size, 0) + + os.remove('./data.prototxt') + shutil.rmtree('./train_data') + shutil.rmtree('./imdb.model') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py index 80261eff4e747f87658bc7c9114c21bee511df09..2869a6ba53bfb9120ae68d67d10eb5080be5f07b 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py @@ -54,6 +54,19 @@ def _reference_testing(x, scale, offset, mean, var, epsilon, data_format): return y +def _cal_mean_variance(x, epsilon, data_format): + assert data_format in ['NCHW', 'NHWC'] + x_square = x * x + axis = (0, 2, 3) if data_format == 'NCHW' else (0, 1, 2) + C = x.shape[1] if data_format == 'NCHW' else x.shape[-1] + x_square_sum = np.sum(x_square, axis) + x_sum = np.sum(x, axis=axis) + element_count = np.size(x) / C + mean = x_sum / element_count + var = x_square_sum / element_count - mean * mean + return mean, var + + def _reference_training(x, scale, offset, epsilon, data_format): x_shape = x.shape @@ -294,7 +307,18 @@ class TestBatchNormOpTraining(unittest.TestCase): self.use_mkldnn = False self.fuse_with_relu = False self.data_formats = ["NCHW", "NHWC"] + self.momentum = 0.9 + self.epsilon = 0.00001 self.init_kernel_type() + self.init_test_case() + + def init_test_case(self): + self.use_global_stats = False + self.no_grad_set = set() + self.fetch_list = [ + 'y', 'mean', 'variance', 'saved_mean', 'saved_variance', 'x@GRAD', + 'scale@GRAD', 'bias@GRAD' + ] def __assert_close(self, tensor, np_array, msg, atol=1e-4): np.allclose(np.array(tensor), np_array, atol=atol) @@ -313,11 +337,22 @@ class TestBatchNormOpTraining(unittest.TestCase): return y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad + def set_mean_variance(self, scale_shape, x, data_layout): + mean = np.zeros(scale_shape).astype(np.float32) + variance = np.ones(scale_shape).astype(np.float32) + # computing global mean/variance for one step + if self.use_global_stats: + mom = self.momentum + x_mean, x_var = _cal_mean_variance(x, self.epsilon, data_layout) + mean = x_mean * (1. - mom) + mom * mean + variance = x_var * (1. - mom) + mom * variance + return mean, variance + def test_forward_backward(self): def test_with_place(place, data_layout, shape): # attr - epsilon = 0.00001 - momentum = 0.9 + epsilon = self.epsilon + momentum = self.momentum if data_layout == "NCHW": n, c, h, w = shape[0], shape[1], shape[2], shape[3] else: @@ -328,9 +363,7 @@ class TestBatchNormOpTraining(unittest.TestCase): x = np.random.random_sample(shape).astype(np.float32) scale = np.random.random_sample(scale_shape).astype(np.float32) bias = np.random.random_sample(scale_shape).astype(np.float32) - mean = np.zeros(scale_shape).astype(np.float32) - variance = np.ones(scale_shape).astype(np.float32) - + mean, variance = self.set_mean_variance(scale_shape, x, data_layout) y_grad = np.random.random_sample(shape).astype(np.float32) y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad = self.ref_forward_backward( @@ -339,6 +372,9 @@ class TestBatchNormOpTraining(unittest.TestCase): var_dict = locals() var_dict['y@GRAD'] = y_grad + var_dict['x@GRAD'] = x_grad + var_dict['scale@GRAD'] = scale_grad + var_dict['bias@GRAD'] = bias_grad var_names = [ 'x', 'scale', 'bias', 'mean', 'variance', 'y', 'saved_mean', @@ -365,9 +401,8 @@ class TestBatchNormOpTraining(unittest.TestCase): }, outputs={ "Y": block.var('y'), - "MeanOut": block.var('mean'), # share the same memory - "VarianceOut": - block.var('variance'), # share the same memory + "MeanOut": block.var('mean'), # share memory + "VarianceOut": block.var('variance'), # share memory "SavedMean": block.var('saved_mean'), "SavedVariance": block.var('saved_variance') }, @@ -377,13 +412,14 @@ class TestBatchNormOpTraining(unittest.TestCase): "is_test": False, "data_layout": data_layout, "use_mkldnn": self.use_mkldnn, - "fuse_with_relu": self.fuse_with_relu + "fuse_with_relu": self.fuse_with_relu, + "use_global_stats": self.use_global_stats }) block.create_var(name='y@GRAD', dtype='float32', shape=y.shape) # generate backward op_desc grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc( - bn_op.desc, set(), []) + bn_op.desc, self.no_grad_set, []) grad_op_desc = grad_op_desc_list[0] new_op_desc = block.desc.append_op() new_op_desc.copy_from(grad_op_desc) @@ -403,20 +439,10 @@ class TestBatchNormOpTraining(unittest.TestCase): for name in ['x', 'scale', 'bias', 'mean', 'variance', 'y@GRAD'] }, - fetch_list=[ - 'y', 'mean', 'variance', 'saved_mean', 'saved_variance', - 'x@GRAD', 'scale@GRAD', 'bias@GRAD' - ]) - - self.__assert_close(y, out[0], "y") - self.__assert_close(mean_out, out[1], "mean") - self.__assert_close(variance_out, out[2], "variance", 1e-3) - self.__assert_close(saved_mean, out[3], "saved_mean") - self.__assert_close(saved_variance, out[4], "saved_variance", 1e-3) - self.__assert_close(x_grad, out[5], "x_grad") - self.__assert_close(scale_grad, out[6], "scale_grad") - self.__assert_close(bias_grad, out[7], "bias_grad") + fetch_list=self.fetch_list) + for id, name in enumerate(self.fetch_list): + self.__assert_close(var_dict[name], out[id], name) print("op test forward passed: ", str(place), data_layout) places = [core.CPUPlace()] @@ -432,5 +458,66 @@ class TestBatchNormOpTraining(unittest.TestCase): pass +class TestBatchNormOpFreezeStatsTraining(TestBatchNormOpTraining): + def init_test_case(self): + self.use_global_stats = True + self.no_grad_set = set() + self.fetch_list = [ + 'y', 'mean', 'variance', 'x@GRAD', 'scale@GRAD', 'bias@GRAD' + ] + + def reference_grad(self, x, y_grad, scale, mean, var, epsilon, data_format): + if data_format == "NCHW": + x = np.transpose(x, (0, 2, 3, 1)) + y_grad = np.transpose(y_grad, (0, 2, 3, 1)) + + x_grad = scale * y_grad / np.sqrt(var + epsilon) + grad_scale = np.sum(y_grad * (x - mean) / np.sqrt(var + epsilon), + axis=(0, 1, 2)) + grad_offset = np.sum(y_grad, axis=(0, 1, 2)) + + # transfer back to N, C, H, W + if data_format == "NCHW": + x_grad = np.transpose(x_grad, (0, 3, 1, 2)) + x = np.transpose(x, (0, 3, 1, 2)) + y_grad = np.transpose(y_grad, (0, 3, 1, 2)) + + return x_grad, grad_scale, grad_offset + + def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance, + epsilon, momentum, shape, data_layout): + if data_layout != "NCHW" and data_layout != "NHWC": + raise ValueError("Unknown data order.") + + if data_layout == "NCHW": + x = np.transpose(x, (0, 2, 3, 1)) + + # run normalizaton + normalized = (x - mean) / np.sqrt(variance + epsilon) + y = normalized * scale + bias + + # transfer back to N, C, H, W + if data_layout == "NCHW": + x = np.transpose(x, (0, 3, 1, 2)) + y = np.transpose(y, (0, 3, 1, 2)) + + mean_out = mean + variance_out = variance + saved_variance = 1. / np.sqrt(variance + epsilon) + # run backward + x_grad, scale_grad, bias_grad = self.reference_grad( + x, y_grad, scale, mean, variance, epsilon, data_layout) + + return y, mean_out, variance_out, mean, saved_variance, x_grad, scale_grad, bias_grad + + +class TestBatchNormOpFreezeStatsAndScaleBiasTraining( + TestBatchNormOpFreezeStatsTraining): + def init_test_case(self): + self.use_global_stats = True + self.no_grad_set = set(['scale@GRAD', 'bias@GRAD']) + self.fetch_list = ['y', 'mean', 'variance', 'x@GRAD'] + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py index 9f3f2f348166864be9583855fcd1949fd4ac818c..6cd71e39e41dae5d07e5761fc9caeca113f3b47e 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py @@ -128,6 +128,12 @@ class TestIdentityActivation(TestConv2dFusionOp): self.activation = 'identity' +class TestIdentityActivation(TestConv2dFusionOp): + def init_activation(self): + self.activation = 'identity' + self.add_residual_data = False + + class TestWithGroup(TestConv2dFusionOp): def init_group(self): self.groups = 3 diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py new file mode 100644 index 0000000000000000000000000000000000000000..deefdd09abe6b9f9ca362654f21850f598337245 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_mkldnn_op.py @@ -0,0 +1,77 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest + +from test_conv2d_transpose_op import TestConv2dTransposeOp, TestWithPad, TestWithStride + + +class TestMKLDNN(TestConv2dTransposeOp): + def init_op_type(self): + self.is_test = True + self.use_mkldnn = True + self.data_format = "NCHW" + self.op_type = "conv2d_transpose" + self._cpu_only = True + + def test_check_grad(self): + return + + def test_check_grad_no_input(self): + return + + def test_check_grad_no_filter(self): + return + + +class TestMKLDNNWithPad(TestWithPad): + def init_op_type(self): + self.is_test = True + self.use_mkldnn = True + self.data_format = "NCHW" + self.op_type = "conv2d_transpose" + self._cpu_only = True + + def test_check_grad(self): + return + + def test_check_grad_no_input(self): + return + + def test_check_grad_no_filter(self): + return + + +class TestMKLDNNWithStride(TestWithStride): + def init_op_type(self): + self.is_test = True + self.use_mkldnn = True + self.data_format = "NCHW" + self.op_type = "conv2d_transpose" + self._cpu_only = True + + def test_check_grad(self): + return + + def test_check_grad_no_input(self): + return + + def test_check_grad_no_filter(self): + return + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py index 5bb769b16891d3b7163874751f9bcd25593b4b44..3b820f6ad716e5717e45d0c6341fb89010406d59 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py @@ -68,8 +68,11 @@ def conv2dtranspose_forward_naive(input_, filter_, attrs): class TestConv2dTransposeOp(OpTest): def setUp(self): # init as conv transpose + self.is_test = False self.use_cudnn = False + self.use_mkldnn = False self.output_size = None + self.data_format = "AnyLayout" self.init_op_type() self.init_test_case() @@ -83,7 +86,9 @@ class TestConv2dTransposeOp(OpTest): 'groups': self.groups, 'dilations': self.dilations, 'use_cudnn': self.use_cudnn, - 'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fix latter + 'is_test': self.is_test, + 'use_mkldnn': self.use_mkldnn, + 'data_format': self.data_format } if self.output_size is not None: self.attrs['output_size'] = self.output_size diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 97e7ee6229f081ff67ca3e2aedcad0a2e3d9cabf..160969c63fcbb1b552df195795017d9677c4fe3a 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -291,8 +291,8 @@ class TestDistBase(unittest.TestCase): if check_error_log: err_log.close() - sys.stderr.write('local_stdout: %s\n' % pickle.loads(local_out)) sys.stderr.write('local_stderr: %s\n' % local_err) + sys.stderr.write('local_stdout: %s\n' % pickle.loads(local_out)) return pickle.loads(local_out) diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index d132dd3c48f55c07725515e40faeb5076398adeb..194387bc98752e66acd2c08a4abcaddfc34ad155 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -782,5 +782,46 @@ class TestNCCL2Transpile(TranspilerTest): pass +# test for remote prefetch +class TestRemoteLookupTable(TestDistLookupTableBase): + def net_conf(self): + import os + os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1" + self.network_with_table(is_sparse=True, is_distributed=False) + + def transpiler_test_impl(self): + pserver1, startup1 = self.get_pserver(self.pserver1_ep) + + self.assertEqual(len(pserver1.blocks), 4) + # 0 listen_and_serv + # 1 optimize for fc_w or fc_b adam + self.assertEqual([op.type for op in pserver1.blocks[1].ops], + ["sum", "scale", "adam", "scale", "scale"]) + # 2 optimize for table adam + # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num + self.assertEqual([op.type for op in pserver1.blocks[2].ops], + ["sum", "scale", "adam", "scale", "scale"]) + + # 3 optimize for table 2 adam + # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num + self.assertEqual([op.type for op in pserver1.blocks[3].ops], + ["sum", "scale", "adam", "scale", "scale"]) + + trainer, _ = self.get_trainer() + self.assertEqual(len(trainer.blocks), 1) + ops = [ + 'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool', + 'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add', + 'cross_entropy', 'mean', 'fill_constant', 'mean_grad', + 'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad', + 'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad', + 'split_selected_rows', 'send', 'sequence_pool_grad', + 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', + 'sum', 'split_selected_rows', 'send', 'send_barrier', 'recv', + 'recv', 'fetch_barrier' + ] + self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py b/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py new file mode 100644 index 0000000000000000000000000000000000000000..021b950b3b6245caecab22d476bbb9d6b6b45c5e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py @@ -0,0 +1,65 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import paddle.fluid.core as core +import numpy as np +from paddle.fluid.op import Operator + + +class TestGetTensorFromSelectedRows(unittest.TestCase): + def get_places(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + return places + + def check_with_place(self, place): + scope = core.Scope() + x_rows = [0, 5, 5, 4, 20] + height = 20 + row_numel = 2 + + np_array = np.ones((len(x_rows), row_numel)).astype("float32") + np_array[1, :] = 2.0 + np_array[2, :] = 3.0 + np_array[3, :] = 4.0 + + # initialize input variable X + x = scope.var('X').get_selected_rows() + x.set_rows(x_rows) + x.set_height(height) + x_tensor = x.get_tensor() + x_tensor.set(np_array, place) + + # initialize input variable Out + out = scope.var("Out").get_tensor() + + op = Operator("get_tensor_from_selected_rows", X="X", Out="Out") + + op.run(scope, place) + + out_array = np.array(out) + self.assertEqual((5, 2), out_array.shape) + assert (out_array == np_array).all() + + def test_check_output(self): + for place in self.get_places(): + self.check_with_place(place) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..e4b3168ba6636253055f546fb3eec8a536714209 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py @@ -0,0 +1,162 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid + +BATCH_SIZE = 128 +CLIP = 1 + + +def bow_net(data, + label, + dict_dim, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2): + """ + BOW net + This model is from https://github.com/PaddlePaddle/models: + fluid/PaddleNLP/text_classification/nets.py + """ + emb = fluid.layers.embedding( + input=data, is_sparse=True, size=[dict_dim, emb_dim]) + bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') + bow_tanh = fluid.layers.tanh(bow) + fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh") + fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") + prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + + return avg_cost + + +class TestGradientClip(unittest.TestCase): + def setUp(self): + self.word_dict = paddle.dataset.imdb.word_dict() + self.BATCH_SIZE = 2 + self.train_data = paddle.batch( + paddle.dataset.imdb.train(self.word_dict), + batch_size=self.BATCH_SIZE) + + def get_places(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + return places + + def check_operators(self, place): + prog = fluid.framework.Program() + startup_program = fluid.framework.Program() + with fluid.program_guard( + main_program=prog, startup_program=startup_program): + image = fluid.layers.data(name='x', shape=[784], dtype='float32') + label = fluid.layers.data(name='y', shape=[1], dtype='int64') + + hidden1 = fluid.layers.fc(input=image, size=128, act='relu') + hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu') + predict = fluid.layers.fc(input=hidden2, size=10, act='softmax') + + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(cost) + + prog_clip = prog.clone() + + avg_cost_clip = prog_clip.block(0).var(avg_cost.name) + + p_g = fluid.backward.append_backward(loss=avg_cost) + p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip) + + with fluid.program_guard(main_program=prog_clip): + fluid.clip.set_gradient_clip( + fluid.clip.GradientClipByGlobalNorm(clip_norm=CLIP)) + p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip) + + grad_list = [elem[1] for elem in p_g] + grad_clip_list = [elem[1] for elem in p_g_clip] + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=8192), + batch_size=BATCH_SIZE) + + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=[image, label], place=place) + exe.run(startup_program) + + count = 0 + for data in train_reader(): + count += 1 + if count > 5: + break + out = exe.run(prog, feed=feeder.feed(data), fetch_list=grad_list) + out_clip = exe.run(prog_clip, + feed=feeder.feed(data), + fetch_list=grad_clip_list) + global_norm = 0 + for v in out[1:]: + global_norm += np.sum(np.power(v, 2)) + global_norm = np.sqrt(global_norm) + + global_norm_clip = 0 + for v in out_clip[1:]: + global_norm_clip += np.sum(np.power(v, 2)) + global_norm_clip = np.sqrt(global_norm_clip) + + assert np.isclose( + a=global_norm_clip, b=np.minimum(global_norm, CLIP), rtol=5e-3) + + def check_sparse_gradient_clip(self, place): + prog = fluid.framework.Program() + startup_program = fluid.framework.Program() + with fluid.program_guard( + main_program=prog, startup_program=startup_program): + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + cost = bow_net(data, label, len(self.word_dict)) + + fluid.clip.set_gradient_clip( + clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)) + + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01) + sgd_optimizer.minimize(cost) + + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=[data, label], place=place) + exe.run(startup_program) + + data = next(self.train_data()) + val = exe.run(prog, feed=feeder.feed(data), fetch_list=[cost])[0] + self.assertEqual((1, ), val.shape) + print(val) + self.assertFalse(np.isnan(val)) + + def test_operators(self): + self.check_operators(core.CPUPlace()) + + def test_sparse_gradient_clip(self): + for place in self.get_places(): + self.check_sparse_gradient_clip(place) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 541160771152dd2ebc8a782863bb4ad3643892e5..be51fb06a37a376f6f410336184c95981ded35dc 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -170,9 +170,10 @@ class TestBook(unittest.TestCase): with program_guard(program): dat = layers.data(name='data', shape=[10], dtype='float32') lbl = layers.data(name='label', shape=[10], dtype='float32') + ignore_index = -1 self.assertIsNotNone( layers.sigmoid_cross_entropy_with_logits( - x=dat, label=lbl)) + x=dat, label=lbl, ignore_index=ignore_index)) print(str(program)) def test_hsigmoid(self): @@ -636,13 +637,21 @@ class TestBook(unittest.TestCase): with program_guard(program): input = layers.data( name="input", shape=[3, 100, 100], dtype="float32") + paddings = layers.fill_constant(shape=[4], dtype='int32', value=1) out = layers.pad2d( input, paddings=[1, 2, 3, 4], mode='reflect', data_format='NCHW', name="shape") + out_1 = layers.pad2d( + input, + paddings=paddings, + mode='reflect', + data_format='NCHW', + name="shape") self.assertIsNotNone(out) + self.assertIsNotNone(out_1) print(str(program)) def test_prelu(self): @@ -955,6 +964,15 @@ class TestBook(unittest.TestCase): print(str(program)) + def test_batch_norm(self): + program = Program() + with program_guard(program): + data = layers.data( + name='data', shape=[32, 128, 128], dtype="float32") + out = layers.batch_norm(data) + + print(str(program)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py new file mode 100644 index 0000000000000000000000000000000000000000..47830fb56b4e31018c2691cfa38c8d0d9cb4016e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py @@ -0,0 +1,203 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import signal +import time +import unittest +from multiprocessing import Process + +import numpy as np +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from paddle.fluid.framework import Program, program_guard + + +def run_pserver(pserver_id, use_cuda, sync_mode): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + # create table parameter in scope + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + # create and initialize Param Variable + param = scope.var('table').get_tensor() + + param_array = np.ones((10, 8)).astype("float32") + for i in range(len(param_array)): + param_array[i] *= param_array[i] * i + pserver_id * 10 + param.set(param_array, place) + + optimize_block = program._create_block(program.global_block().idx) + program.global_block().append_op( + type="listen_and_serv", + inputs={'X': []}, + outputs={}, + attrs={ + "optimize_blocks": [optimize_block], + "endpoint": '127.0.0.1:0', + "Fanin": 1, + "sync_mode": True, + "grad_to_block_id": [] + }) + + exe = fluid.Executor(place) + exe.run(program) + + +class TestListenAndServOp(unittest.TestCase): + def setUp(self): + self.ps_timeout = 5 + + def _start_pserver(self, pserver_id, use_cuda, sync_mode, pserver_func): + p = Process(target=pserver_func, args=(pserver_id, use_cuda, sync_mode)) + p.daemon = True + p.start() + return p + + def _wait_ps_ready(self, pid): + start_left_time = self.ps_timeout + sleep_time = 0.5 + while True: + assert start_left_time >= 0, "wait ps ready failed" + time.sleep(sleep_time) + try: + # the listen_and_serv_op would touch a file which contains the listen port + # on the /tmp directory until it was ready to process all the RPC call. + os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error: + start_left_time -= sleep_time + + def _get_pserver_port(self, pid): + with open("/tmp/paddle.%d.port" % pid, 'r') as f: + port = int(f.read().strip()) + return port + + def _run_lookup_table_op_one_pserver(self, place, port): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + # create and initialize Param Variable + param = scope.var('W').get_tensor() + param_array = np.full((10, 8), 1.0).astype("float32") + param.set(param_array, place) + + ids = scope.var('Ids').get_tensor() + ids_array = np.array([[1], [2], [5]]).astype("int64") + ids.set(ids_array, place) + ids_lod = [[0, 1, 2, 3]] + ids.set_lod(ids_lod) + + out = scope.var('Out').get_tensor() + + emaps = ['127.0.0.1:' + str(port)] + table_names = ['table'] + height_sections = [10] + + # create and run sgd operator + lookup_table_op = Operator( + "lookup_table", + W='W', + Ids='Ids', + Out='Out', + remote_prefetch=True, + epmap=emaps, + table_names=table_names, + height_sections=height_sections) + lookup_table_op.run(scope, place) + + # get and compare result + result_array = np.array(out) + + self.assertEqual(out.lod(), ids_lod) + self.assertEqual(list(result_array.shape), [len(ids_array), 8]) + for i in range(len(ids_array)): + id = ids_array[i][0] + self.assertTrue((result_array[i] == id).all()) + + def _run_lookup_table_op_two_pserver(self, place, port0, port1): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + # create and initialize Param Variable + param = scope.var('W').get_tensor() + param_array = np.full((10, 8), 1.0).astype("float32") + param.set(param_array, place) + + ids = scope.var('Ids').get_tensor() + ids_array = np.array([[1], [2], [11], [13]]).astype("int64") + ids.set(ids_array, place) + ids_lod = [[0, 2, 3, 4]] + ids.set_lod(ids_lod) + + out = scope.var('Out').get_tensor() + + emaps = ['127.0.0.1:' + str(port0), '127.0.0.1:' + str(port1)] + table_names = ['table', 'table'] + height_sections = [10, 20] + + # create and run sgd operator + lookup_table_op = Operator( + "lookup_table", + W='W', + Ids='Ids', + Out='Out', + remote_prefetch=True, + epmap=emaps, + table_names=table_names, + height_sections=height_sections) + lookup_table_op.run(scope, place) + + # get and compare result + result_array = np.array(out) + self.assertEqual(out.lod(), ids_lod) + self.assertEqual(list(result_array.shape), [len(ids_array), 8]) + for i in range(len(ids_array)): + id = ids_array[i][0] + self.assertTrue((result_array[i] == id).all()) + + def test_lookup_remote_table(self): + os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1" + # run pserver on CPU in sync mode + p0 = self._start_pserver(0, False, True, run_pserver) + self._wait_ps_ready(p0.pid) + port0 = self._get_pserver_port(p0.pid) + + p1 = self._start_pserver(1, False, True, run_pserver) + self._wait_ps_ready(p1.pid) + port1 = self._get_pserver_port(p1.pid) + + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + + for place in places: + self._run_lookup_table_op_one_pserver(place, port0) + self._run_lookup_table_op_two_pserver(place, port0, port1) + + # raise SIGTERM to pserver + os.kill(p0.pid, signal.SIGINT) + p0.join() + os.kill(p1.pid, signal.SIGINT) + p1.join() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py new file mode 100644 index 0000000000000000000000000000000000000000..0e9e2e8429e51a328e397f9e2a05ab7209c9c1a2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py @@ -0,0 +1,192 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np + +import paddle.fluid.core as core +from op_test import OpTest +import paddle.fluid as fluid + +SIGMOID_THRESHOLD_MIN = -40.0 +SIGMOID_THRESHOLD_MAX = 13.0 +EXP_MAX_INPUT = 40.0 + + +def lstm_naive( + input, + w, ): + seq_len, batch_size, hidden_size = input.shape + + offset = 0 + wi = w[offset:offset + hidden_size * hidden_size].reshape( + (hidden_size, hidden_size)).transpose() + offset += hidden_size * hidden_size + wf = w[offset:offset + hidden_size * hidden_size].reshape( + (hidden_size, hidden_size)).transpose() + offset += hidden_size * hidden_size + wc = w[offset:offset + hidden_size * hidden_size].reshape( + (hidden_size, hidden_size)).transpose() + offset += hidden_size * hidden_size + wo = w[offset:offset + hidden_size * hidden_size].reshape( + (hidden_size, hidden_size)).transpose() + offset += hidden_size * hidden_size + ri = w[offset:offset + hidden_size * hidden_size].reshape( + (hidden_size, hidden_size)).transpose() + offset += hidden_size * hidden_size + rf = w[offset:offset + hidden_size * hidden_size].reshape( + (hidden_size, hidden_size)).transpose() + offset += hidden_size * hidden_size + rc = w[offset:offset + hidden_size * hidden_size].reshape( + (hidden_size, hidden_size)).transpose() + offset += hidden_size * hidden_size + ro = w[offset:offset + hidden_size * hidden_size].reshape( + (hidden_size, hidden_size)).transpose() + offset += hidden_size * hidden_size + + bi_1 = w[offset:offset + hidden_size] + offset += hidden_size + bf_1 = w[offset:offset + hidden_size] + offset += hidden_size + bc_1 = w[offset:offset + hidden_size] + offset += hidden_size + bo_1 = w[offset:offset + hidden_size] + offset += hidden_size + + bi_2 = w[offset:offset + hidden_size] + offset += hidden_size + bf_2 = w[offset:offset + hidden_size] + offset += hidden_size + bc_2 = w[offset:offset + hidden_size] + offset += hidden_size + bo_2 = w[offset:offset + hidden_size] + + def sigmoid(x): + y = np.copy(x) + y[x < SIGMOID_THRESHOLD_MIN] = SIGMOID_THRESHOLD_MIN + y[x > SIGMOID_THRESHOLD_MAX] = SIGMOID_THRESHOLD_MAX + return 1. / (1. + np.exp(-y)) + + def tanh(x): + y = -2. * x + y[y > EXP_MAX_INPUT] = EXP_MAX_INPUT + return (2. / (1. + np.exp(y))) - 1. + + output = [] + pre_h = np.zeros((batch_size, hidden_size), dtype=input.dtype) + pre_c = np.zeros((batch_size, hidden_size), dtype=input.dtype) + + for i in range(seq_len): + emb_1 = input[i] + + input_gate = sigmoid( + np.matmul(emb_1, wi) + np.matmul(pre_h, ri) + bi_1 + bi_2) + forget_gate = sigmoid( + np.matmul(emb_1, wf) + np.matmul(pre_h, rf) + bf_1 + bf_2) + output_gate = sigmoid( + np.matmul(emb_1, wo) + np.matmul(pre_h, ro) + bo_1 + bo_2) + c_t_temp = tanh( + np.matmul(emb_1, wc) + np.matmul(pre_h, rc) + bc_1 + bc_2) + new_c = input_gate * c_t_temp + forget_gate * pre_c + new_h = output_gate * tanh(new_c) + + pre_h = new_h + pre_c = new_c + + output.append(new_h) + + output = np.concatenate(output, -1) + output = output.reshape((batch_size, -1, hidden_size)) + + output = output.transpose((1, 0, 2)) + + return output, pre_h, pre_c + + +class TestCUDNNLstmOp(OpTest): + def setUp(self): + self.op_type = "cudnn_lstm" + self.dtype = np.float32 + + num_steps = 20 + batch_size = 5 + hidden_size = 20 + + input_weight_size = (hidden_size * hidden_size) * 4 + hidden_weight_size = (hidden_size * hidden_size) * 4 + weight_size = input_weight_size + hidden_weight_size + weight_size += hidden_size * 8 + + input = np.random.uniform( + low=-0.1, high=0.1, size=(num_steps, batch_size, + hidden_size)).astype(self.dtype) + flat_w = np.random.uniform( + low=-0.1, high=0.1, size=(weight_size)).astype(self.dtype) + + output, last_hidden, last_cell = lstm_naive(input, flat_w) + + init_h = np.zeros((batch_size, hidden_size), dtype=np.float32) + init_c = np.zeros((batch_size, hidden_size), dtype=np.float32) + scope = core.Scope() + program = fluid.Program() + block = program.global_block() + + cache_temp = block.create_var( + name="Cache", + persistable=True, + type=core.VarDesc.VarType.RAW, + stop_gradient=True) + self.inputs = { + 'Input': OpTest.np_dtype_to_fluid_dtype(input), + 'W': OpTest.np_dtype_to_fluid_dtype(flat_w), + 'InitH': OpTest.np_dtype_to_fluid_dtype(init_h), + 'InitC': OpTest.np_dtype_to_fluid_dtype(init_c), + } + self.cache_name_list = ['Cache'] + self.attrs = { + 'max_len': num_steps, + 'dropout_prob': 0.0, + 'is_bidirec': False, + 'input_size': hidden_size, + 'hidden_size': hidden_size, + 'num_layers': 1, + } + self.outputs = { + 'Out': output, + "last_h": last_hidden, + 'last_c': last_cell + } + + def test_output_with_place(self): + if self.testcuda(): + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=1e-5) + + def test_grad_with_place(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + set(['Input', 'W', 'InitH', 'InitC']), + ['Out', 'last_h', 'last_c'], + max_relative_error=0.02) + + def testcuda(self): + return core.is_compiled_with_cuda() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_merge_selectedrows_op.py b/python/paddle/fluid/tests/unittests/test_merge_selectedrows_op.py new file mode 100644 index 0000000000000000000000000000000000000000..ce64da0478d3997f4889ca942c67e0defac80b45 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_merge_selectedrows_op.py @@ -0,0 +1,73 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import paddle.fluid.core as core +import numpy as np +from paddle.fluid.op import Operator + + +class TestMergeSelectedRows(unittest.TestCase): + def get_places(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + return places + + def check_with_place(self, place): + scope = core.Scope() + x_rows = [0, 5, 5, 4, 20] + out_rows = [0, 4, 5, 20] + height = 20 + row_numel = 2 + + np_array = np.ones((len(x_rows), row_numel)).astype("float32") + np_array[1, :] = 2.0 + np_array[2, :] = 3.0 + np_array[3, :] = 4.0 + + # initialize input variable X + x = scope.var('X').get_selected_rows() + x.set_rows(x_rows) + x.set_height(height) + x_tensor = x.get_tensor() + x_tensor.set(np_array, place) + + # initialize input variable Out + out = scope.var("Out").get_selected_rows() + + op = Operator("merge_selected_rows", X="X", Out="Out") + + op.run(scope, place) + + self.assertEqual(out.rows(), out_rows) + self.assertEqual(out.height(), height) + + out_array = np.array(out.get_tensor()) + self.assertEqual((4, 2), out_array.shape) + + assert (out_array[0, :] == 1.0).all() + assert (out_array[1, :] == 4.0).all() + assert (out_array[2, :] == 5.0).all() + assert (out_array[3, :] == 1.0).all() + + def test_check_output(self): + for place in self.get_places(): + self.check_with_place(place) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pad2d_op.py b/python/paddle/fluid/tests/unittests/test_pad2d_op.py index 728b8c181a4410d7df7f304bcc8d2816e91ea6d8..5c4a6ca59e53d0edafda87eae19516a80ec32c40 100644 --- a/python/paddle/fluid/tests/unittests/test_pad2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pad2d_op.py @@ -20,11 +20,17 @@ from op_test import OpTest class TestPad2dOp(OpTest): def setUp(self): self.pad_value = 0.0 + self.variable_paddings = False self.initTestCase() self.op_type = "pad2d" self.inputs = {'X': np.random.random(self.shape).astype("float32"), } self.attrs = {} - self.attrs['paddings'] = np.array(self.paddings).flatten() + if self.variable_paddings: + self.attrs['paddings'] = [] + self.inputs['Paddings'] = np.array(self.paddings).flatten().astype( + "int32") + else: + self.attrs['paddings'] = np.array(self.paddings).flatten() self.attrs['pad_value'] = self.pad_value self.attrs['mode'] = self.mode self.attrs['data_format'] = self.data_format @@ -98,5 +104,24 @@ class TestCase5(TestPad2dOp): self.data_format = "NHWC" +class TestCase6(TestPad2dOp): + def initTestCase(self): + self.shape = (2, 4, 4, 2) + self.paddings = [0, 1, 2, 3] + self.mode = "constant" + self.pad_value = 1.2 + self.data_format = "NHWC" + self.variable_paddings = True + + +class TestCase7(TestPad2dOp): + def initTestCase(self): + self.shape = (2, 3, 4, 4) + self.paddings = [0, 1, 2, 3] + self.mode = "reflect" + self.data_format = "NCHW" + self.variable_paddings = True + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py index 97ff203499c0bf223930c904de46e1abdd902799..41797a241cab9f2b3bc4b492a1c4b6db89ac2948 100644 --- a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py +++ b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py @@ -56,6 +56,40 @@ class TestSigmoidCrossEntropyWithLogitsOp2(OpTest): """Test sigmoid_cross_entropy_with_logit_op with probabalistic label """ + def setUp(self): + self.op_type = "sigmoid_cross_entropy_with_logits" + batch_size = 64 + num_classes = 20 + ignore_index = -1 + self.inputs = { + 'X': logit( + np.random.uniform(0, 1, (batch_size, num_classes)) + .astype("float32")), + 'Label': np.random.randint(-1, 2, (batch_size, num_classes)) + .astype("float32") + } + self.attrs = {'ignore_index': ignore_index, } + # Fw Pass is implemented as elementwise sigmoid followed by + # elementwise logistic loss + # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X)) + sigmoid_X = expit(self.inputs['X']) + term1 = self.inputs['Label'] * np.log(sigmoid_X) + term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X) + out = -term1 - term2 + out[np.where(self.inputs['Label'] == ignore_index)] = 0 + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestSigmoidCrossEntropyWithLogitsOp3(OpTest): + """Test sigmoid_cross_entropy_with_logit_op with probabalistic label + """ + def setUp(self): self.op_type = "sigmoid_cross_entropy_with_logits" batch_size = 64 diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py new file mode 100644 index 0000000000000000000000000000000000000000..544fe4b4f81909b69a05d9751316e3d3137fdc45 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py @@ -0,0 +1,215 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division + +import unittest +import numpy as np +from op_test import OpTest + +from paddle.fluid import core + + +def sigmoid(x): + return 1.0 / (1.0 + np.exp(-1.0 * x)) + + +def mse(x, y, num): + return ((y - x)**2).sum() / num + + +def bce(x, y, mask): + x = x.reshape((-1)) + y = y.reshape((-1)) + mask = mask.reshape((-1)) + + error_sum = 0.0 + count = 0 + for i in range(x.shape[0]): + if mask[i] > 0: + error_sum += y[i] * np.log(x[i]) + (1 - y[i]) * np.log(1 - x[i]) + count += 1 + return error_sum / (-1.0 * count) + + +def box_iou(box1, box2): + b1_x1 = box1[0] - box1[2] / 2 + b1_x2 = box1[0] + box1[2] / 2 + b1_y1 = box1[1] - box1[3] / 2 + b1_y2 = box1[1] + box1[3] / 2 + b2_x1 = box2[0] - box2[2] / 2 + b2_x2 = box2[0] + box2[2] / 2 + b2_y1 = box2[1] - box2[3] / 2 + b2_y2 = box2[1] + box2[3] / 2 + + b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1) + b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1) + + inter_rect_x1 = max(b1_x1, b2_x1) + inter_rect_y1 = max(b1_y1, b2_y1) + inter_rect_x2 = min(b1_x2, b2_x2) + inter_rect_y2 = min(b1_y2, b2_y2) + inter_area = max(inter_rect_x2 - inter_rect_x1, 0) * max( + inter_rect_y2 - inter_rect_y1, 0) + + return inter_area / (b1_area + b2_area + inter_area) + + +def build_target(gtboxs, gtlabel, attrs, grid_size): + n, b, _ = gtboxs.shape + ignore_thresh = attrs["ignore_thresh"] + anchors = attrs["anchors"] + class_num = attrs["class_num"] + an_num = len(anchors) // 2 + obj_mask = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') + noobj_mask = np.ones((n, an_num, grid_size, grid_size)).astype('float32') + tx = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') + ty = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') + tw = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') + th = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') + tconf = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') + tcls = np.zeros( + (n, an_num, grid_size, grid_size, class_num)).astype('float32') + + for i in range(n): + for j in range(b): + if gtboxs[i, j, :].sum() == 0: + continue + + gt_label = gtlabel[i, j] + gx = gtboxs[i, j, 0] * grid_size + gy = gtboxs[i, j, 1] * grid_size + gw = gtboxs[i, j, 2] * grid_size + gh = gtboxs[i, j, 3] * grid_size + + gi = int(gx) + gj = int(gy) + + gtbox = [0, 0, gw, gh] + max_iou = 0 + for k in range(an_num): + anchor_box = [0, 0, anchors[2 * k], anchors[2 * k + 1]] + iou = box_iou(gtbox, anchor_box) + if iou > max_iou: + max_iou = iou + best_an_index = k + if iou > ignore_thresh: + noobj_mask[i, best_an_index, gj, gi] = 0 + + obj_mask[i, best_an_index, gj, gi] = 1 + noobj_mask[i, best_an_index, gj, gi] = 0 + tx[i, best_an_index, gj, gi] = gx - gi + ty[i, best_an_index, gj, gi] = gy - gj + tw[i, best_an_index, gj, gi] = np.log(gw / anchors[2 * + best_an_index]) + th[i, best_an_index, gj, gi] = np.log( + gh / anchors[2 * best_an_index + 1]) + tconf[i, best_an_index, gj, gi] = 1 + tcls[i, best_an_index, gj, gi, gt_label] = 1 + + return (tx, ty, tw, th, tconf, tcls, obj_mask, noobj_mask) + + +def YoloV3Loss(x, gtbox, gtlabel, attrs): + n, c, h, w = x.shape + an_num = len(attrs['anchors']) // 2 + class_num = attrs["class_num"] + x = x.reshape((n, an_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2)) + pred_x = sigmoid(x[:, :, :, :, 0]) + pred_y = sigmoid(x[:, :, :, :, 1]) + pred_w = x[:, :, :, :, 2] + pred_h = x[:, :, :, :, 3] + pred_conf = sigmoid(x[:, :, :, :, 4]) + pred_cls = sigmoid(x[:, :, :, :, 5:]) + + tx, ty, tw, th, tconf, tcls, obj_mask, noobj_mask = build_target( + gtbox, gtlabel, attrs, x.shape[2]) + + obj_mask_expand = np.tile( + np.expand_dims(obj_mask, 4), (1, 1, 1, 1, int(attrs['class_num']))) + loss_x = mse(pred_x * obj_mask, tx * obj_mask, obj_mask.sum()) + loss_y = mse(pred_y * obj_mask, ty * obj_mask, obj_mask.sum()) + loss_w = mse(pred_w * obj_mask, tw * obj_mask, obj_mask.sum()) + loss_h = mse(pred_h * obj_mask, th * obj_mask, obj_mask.sum()) + loss_conf_target = bce(pred_conf * obj_mask, tconf * obj_mask, obj_mask) + loss_conf_notarget = bce(pred_conf * noobj_mask, tconf * noobj_mask, + noobj_mask) + loss_class = bce(pred_cls * obj_mask_expand, tcls * obj_mask_expand, + obj_mask_expand) + + return attrs['loss_weight_xy'] * (loss_x + loss_y) \ + + attrs['loss_weight_wh'] * (loss_w + loss_h) \ + + attrs['loss_weight_conf_target'] * loss_conf_target \ + + attrs['loss_weight_conf_notarget'] * loss_conf_notarget \ + + attrs['loss_weight_class'] * loss_class + + +class TestYolov3LossOp(OpTest): + def setUp(self): + self.loss_weight_xy = 1.0 + self.loss_weight_wh = 1.0 + self.loss_weight_conf_target = 1.0 + self.loss_weight_conf_notarget = 1.0 + self.loss_weight_class = 1.0 + self.initTestCase() + self.op_type = 'yolov3_loss' + x = np.random.random(size=self.x_shape).astype('float32') + gtbox = np.random.random(size=self.gtbox_shape).astype('float32') + gtlabel = np.random.randint(0, self.class_num, + self.gtbox_shape[:2]).astype('int32') + + self.attrs = { + "anchors": self.anchors, + "class_num": self.class_num, + "ignore_thresh": self.ignore_thresh, + "loss_weight_xy": self.loss_weight_xy, + "loss_weight_wh": self.loss_weight_wh, + "loss_weight_conf_target": self.loss_weight_conf_target, + "loss_weight_conf_notarget": self.loss_weight_conf_notarget, + "loss_weight_class": self.loss_weight_class, + } + + self.inputs = {'X': x, 'GTBox': gtbox, 'GTLabel': gtlabel} + self.outputs = { + 'Loss': np.array( + [YoloV3Loss(x, gtbox, gtlabel, self.attrs)]).astype('float32') + } + + def test_check_output(self): + place = core.CPUPlace() + self.check_output_with_place(place, atol=1e-3) + + def test_check_grad_ignore_gtbox(self): + place = core.CPUPlace() + self.check_grad_with_place( + place, ['X'], + 'Loss', + no_grad_set=set(["GTBox", "GTLabel"]), + max_relative_error=0.06) + + def initTestCase(self): + self.anchors = [10, 13, 12, 12] + self.class_num = 10 + self.ignore_thresh = 0.5 + self.x_shape = (5, len(self.anchors) // 2 * (5 + self.class_num), 7, 7) + self.gtbox_shape = (5, 10, 4) + self.loss_weight_xy = 2.5 + self.loss_weight_wh = 0.8 + self.loss_weight_conf_target = 1.5 + self.loss_weight_conf_notarget = 0.5 + self.loss_weight_class = 1.2 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/testsuite.py b/python/paddle/fluid/tests/unittests/testsuite.py index 34fbb1b549cf5fc5f75bcc0715e5c83665f1d200..dc3b2cb8bc15836a4bf067caa05c3a37a917ecad 100644 --- a/python/paddle/fluid/tests/unittests/testsuite.py +++ b/python/paddle/fluid/tests/unittests/testsuite.py @@ -20,7 +20,7 @@ import paddle.fluid.core as core from paddle.fluid.op import Operator -def create_op(scope, op_type, inputs, outputs, attrs): +def create_op(scope, op_type, inputs, outputs, attrs, cache_list=None): kwargs = dict() op_maker = core.op_proto_and_checker_maker @@ -43,6 +43,11 @@ def create_op(scope, op_type, inputs, outputs, attrs): __create_var__(in_name, sub_in_name) else: __create_var__(in_name, in_name) + if cache_list != None and isinstance(cache_list, list): + for name in cache_list: + kwargs[name] = [] + scope.var(name) + kwargs[name].append(name) for out_name, out_dup in Operator.get_op_outputs(op_type): if out_name in outputs: diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index ebd0d18d36eed4fffed86ba0903ff76f6052ef7a..1d867d9194347cf55fd1bd8b1962856d599be7ec 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -125,13 +125,14 @@ def slice_variable(var_list, slice_count, min_block_size): class DistributeTranspilerConfig(object): """ - slice_var_up (bool): Do Tensor slice for pservers, default is True. - split_method (PSDispatcher): RoundRobin or HashName can be used - try to choose the best method to balance loads for pservers. - min_block_size (int): Minimum splitted element number in block. - According:https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156 - We can use bandwidth effiently when data size is larger than 2MB.If you - want to change it, please be sure you see the slice_variable function. + Args: + slice_var_up (bool): Do Tensor slice for pservers, default is True. + split_method (PSDispatcher): RoundRobin or HashName can be used + try to choose the best method to balance loads for pservers. + min_block_size (int): Minimum splitted element number in block. + According:https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156 + We can use bandwidth effiently when data size is larger than 2MB.If you + want to change it, please be sure you see the slice_variable function. """ slice_var_up = True @@ -163,35 +164,35 @@ class DistributeTranspiler(object): Examples: .. code-block:: python - # for pserver mode - pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" - trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174" - current_endpoint = "192.168.0.1:6174" - trainer_id = 0 - trainers = 4 - role = os.getenv("PADDLE_TRAINING_ROLE") - - t = fluid.DistributeTranspiler() - t.transpile( - trainer_id, pservers=pserver_endpoints, trainers=trainers) - if role == "PSERVER": - pserver_program = t.get_pserver_program(current_endpoint) - pserver_startup_program = t.get_startup_program(current_endpoint, + # for pserver mode + pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" + trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174" + current_endpoint = "192.168.0.1:6174" + trainer_id = 0 + trainers = 4 + role = os.getenv("PADDLE_TRAINING_ROLE") + + t = fluid.DistributeTranspiler() + t.transpile( + trainer_id, pservers=pserver_endpoints, trainers=trainers) + if role == "PSERVER": + pserver_program = t.get_pserver_program(current_endpoint) + pserver_startup_program = t.get_startup_program(current_endpoint, pserver_program) - elif role == "TRAINER": - trainer_program = t.get_trainer_program() - - # for nccl2 mode - config = fluid.DistributeTranspilerConfig() - config.mode = "nccl2" - t = fluid.DistributeTranspiler(config=config) - t.transpile(trainer_id, workers=workers, current_endpoint=curr_ep) - exe = fluid.ParallelExecutor( - use_cuda, - loss_name=loss_var.name, - num_trainers=len(trainers.split(",)), - trainer_id=trainer_id - ) + elif role == "TRAINER": + trainer_program = t.get_trainer_program() + + # for nccl2 mode + config = fluid.DistributeTranspilerConfig() + config.mode = "nccl2" + t = fluid.DistributeTranspiler(config=config) + t.transpile(trainer_id, workers=workers, current_endpoint=curr_ep) + exe = fluid.ParallelExecutor( + use_cuda, + loss_name=loss_var.name, + num_trainers=len(trainers.split(",)), + trainer_id=trainer_id + ) """ def __init__(self, config=None): @@ -236,6 +237,31 @@ class DistributeTranspiler(object): else: raise ValueError("must set trainer_id > 0") + def _get_all_remote_sparse_update_op(self, main_program): + sparse_update_ops = [] + sparse_update_op_types = ["lookup_table"] + for op in main_program.global_block().ops: + if op.type in sparse_update_op_types and op.attr( + 'remote_prefetch') is True and not op.attr( + 'is_distributed'): + sparse_update_ops.append(op) + return sparse_update_ops + + def _update_remote_sparse_update_op(self, param_varname, height_sections, + endpint_map, table_names): + for op in self.sparse_update_ops: + if param_varname in op.input_arg_names: + op._set_attr('epmap', endpint_map) + op._set_attr('table_names', table_names) + op._set_attr('height_sections', height_sections) + op._set_attr('trainer_id', self.trainer_id) + + def _is_input_of_remote_sparse_update_op(self, param_name): + for op in self.sparse_update_ops: + if param_name in op.input_arg_names: + return True + return False + def transpile(self, trainer_id, program=None, @@ -299,6 +325,12 @@ class DistributeTranspiler(object): self.param_name_to_grad_name[param_var.name] = grad_var.name self.grad_name_to_param_name[grad_var.name] = param_var.name + # get all sparse update ops + self.sparse_update_ops = self._get_all_remote_sparse_update_op( + self.origin_program) + # use_sparse_update_param_name -> split_height_section + self.sparse_param_to_height_sections = dict() + # add distributed attrs to program self.origin_program._is_distributed = True self.origin_program._endpoints = self.pserver_endpoints @@ -336,6 +368,13 @@ class DistributeTranspiler(object): splited_grad_varname = splited_vars[0].name index = find_op_by_output_arg( program.global_block(), splited_grad_varname, reverse=True) + if splited_vars[0].type == core.VarDesc.VarType.SELECTED_ROWS: + sparse_param_name = self.grad_name_to_param_name[ + grad_varname] + if self._is_input_of_remote_sparse_update_op( + sparse_param_name): + self.sparse_param_to_height_sections[ + sparse_param_name] = [splited_vars[0].shape[0]] elif len(splited_vars) > 1: orig_var = program.global_block().vars[splited_grad_varname] index = find_op_by_output_arg( @@ -406,16 +445,18 @@ class DistributeTranspiler(object): all_recv_outputs = [] for param_varname, splited_var in six.iteritems(self.param_var_mapping): eps = [] + table_names = [] for var in splited_var: index = [v.name for v in recv_vars].index(var.name) eps.append(eplist[index]) + table_names.append(var.name) if self.sync_mode: recv_dep_in = send_barrier_out else: # connect deps to send op in async mode recv_dep_in = self.grad_name_to_send_dummy_out[ self.param_name_to_grad_name[param_varname]] - all_recv_outputs.extend(splited_var) + # get recv op_role_var, if not splited, the grad should have .trainer suffix # if splited, grad should be the original grad var name. ParallelExecutor # will use op_role_var to get expected device place to run this op. @@ -425,18 +466,25 @@ class DistributeTranspiler(object): if len(splited_trainer_grad) == 1: recv_op_role_var_name = splited_trainer_grad[0].name - program.global_block().append_op( - type="recv", - inputs={"X": [recv_dep_in]}, - outputs={"Out": splited_var}, - attrs={ - "epmap": eps, - "trainer_id": self.trainer_id, - RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, - OP_ROLE_VAR_ATTR_NAME: - [param_varname, recv_op_role_var_name], - "sync_mode": not self.sync_mode - }) + if param_varname in self.sparse_param_to_height_sections: + height_sections = self.sparse_param_to_height_sections[ + param_varname] + self._update_remote_sparse_update_op( + param_varname, height_sections, eps, table_names) + else: + all_recv_outputs.extend(splited_var) + program.global_block().append_op( + type="recv", + inputs={"X": [recv_dep_in]}, + outputs={"Out": splited_var}, + attrs={ + "epmap": eps, + "trainer_id": self.trainer_id, + RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, + OP_ROLE_VAR_ATTR_NAME: + [param_varname, recv_op_role_var_name], + "sync_mode": not self.sync_mode + }) if self.sync_mode: # form a WAW dependency @@ -454,14 +502,15 @@ class DistributeTranspiler(object): if len(splited_var) <= 1: continue orig_param = program.global_block().vars[param_varname] - program.global_block().append_op( - type="concat", - inputs={"X": splited_var}, - outputs={"Out": [orig_param]}, - attrs={ - "axis": 0, - RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE - }) + if param_varname not in self.sparse_param_to_height_sections: + program.global_block().append_op( + type="concat", + inputs={"X": splited_var}, + outputs={"Out": [orig_param]}, + attrs={ + "axis": 0, + RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE + }) self._get_trainer_startup_program(recv_vars=recv_vars, eplist=eplist) @@ -1420,6 +1469,10 @@ to transpile() call.") height_sections = [] for v in splited_vars: height_sections.append(v.shape[0]) + sparse_param_name = self.grad_name_to_param_name[orig_var.name] + if self._is_input_of_remote_sparse_update_op(sparse_param_name): + self.sparse_param_to_height_sections[ + sparse_param_name] = height_sections program.global_block()._insert_op( index=index + 1, type="split_selected_rows", diff --git a/python/paddle/reader/tests/decorator_test.py b/python/paddle/reader/tests/decorator_test.py index b9af8348e16c051db64d57a9594aee303d83aef2..a9dddbbcc82e649b6c98db0fd58c62b58435b8db 100644 --- a/python/paddle/reader/tests/decorator_test.py +++ b/python/paddle/reader/tests/decorator_test.py @@ -62,10 +62,10 @@ class TestBuffered(unittest.TestCase): for idx, i in enumerate(b()): elapsed_time = time.time() - last_time if i == 0: - time.sleep(0.3) + time.sleep(1) else: # read time should be short, meaning already buffered. - self.assertLess(elapsed_time, 0.05) + self.assertLess(elapsed_time, 0.08) last_time = time.time() diff --git a/python/setup.py.in b/python/setup.py.in index 200b96ec54ee5daeb905e155d0b7b57ab7740250..5aee26b63832889272cde09c553b4615efb8872a 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -165,9 +165,9 @@ if '${WITH_MKL}' == 'ON': shutil.copy('${MKLML_LIB}', libs_path) shutil.copy('${MKLML_IOMP_LIB}', libs_path) package_data['paddle.libs']+=['libmklml_intel' + ext_name,'libiomp5' + ext_name] -if '${CMAKE_BUILD_TYPE}' == 'Release': - # only change rpath in Release mode. - if '${WITH_MKLDNN}' == 'ON': +if '${WITH_MKLDNN}' == 'ON': + if '${CMAKE_BUILD_TYPE}' == 'Release': + # only change rpath in Release mode. # TODO(typhoonzero): use install_name_tool to patch mkl libs once # we can support mkl on mac. # @@ -177,14 +177,19 @@ if '${CMAKE_BUILD_TYPE}' == 'Release': command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}" if os.system(command) != 0: raise Exception("patch libmkldnn.so failed, command: %s" % command) - package_data['paddle.libs']+=['libmkldnn.so.0'] - shutil.copy('${MKLDNN_SHARED_LIB}', libs_path) + package_data['paddle.libs']+=['libmkldnn.so.0'] + shutil.copy('${MKLDNN_SHARED_LIB}', libs_path) if '${WITH_NGRAPH}' == 'ON': + # only change rpath in Release mode, + # since in Debug mode, nGraph lib may be too large to be changed? if '${CMAKE_BUILD_TYPE}' == 'Release': - # only change rpath in Release mode. - command = "patchelf --set-rpath '$ORIGIN/' ${NGRAPH_SHARED_LIB}" - if os.system(command) != 0: - raise Exception("patch ${NGRAPH_SHARED_LIB_NAME} failed, command: %s" % command) + if os.name != 'nt': + if "@APPLE@" == "1": + command = "install_name_tool -id \"@loader_path/\" ${NGRAPH_SHARED_LIB}" + else: + command = "patchelf --set-rpath '$ORIGIN/' ${NGRAPH_SHARED_LIB}" + if os.system(command) != 0: + raise Exception("patch ${NGRAPH_SHARED_LIB_NAME} failed, command: %s" % command) shutil.copy('${NGRAPH_SHARED_LIB}', libs_path) shutil.copy('${NGRAPH_CPU_LIB}', libs_path) shutil.copy('${NGRAPH_TBB_LIB}', libs_path) diff --git a/tools/print_signatures.py b/tools/print_signatures.py index e2805c4e7e6aa26a5865b64a874feef672bf9b36..5c5266f904f5dcf74dd1d4ee7e98081f74a79907 100644 --- a/tools/print_signatures.py +++ b/tools/print_signatures.py @@ -15,7 +15,7 @@ Print all signature of a python module in alphabet order. Usage: - ./print_signature "paddle.fluid" > signature.txt + ./print_signature "paddle.fluid,paddle.reader" > signature.txt """ from __future__ import print_function @@ -43,7 +43,8 @@ def visit_member(parent_name, member): line.strip() for line in pydoc.render_doc(member).split('\n') if "->" in line ]) - + elif inspect.isgetsetdescriptor(member): + return else: raise RuntimeError("Unsupported generate signature of member, type {0}". format(str(type(member)))) @@ -63,7 +64,9 @@ def visit_all_module(mod): visit_member(mod.__name__, instance) -visit_all_module(importlib.import_module(sys.argv[1])) +modules = sys.argv[1].split(",") +for m in modules: + visit_all_module(importlib.import_module(m)) for name in member_dict: print(name, member_dict[name])