diff --git a/cmake/external/anakin.cmake b/cmake/external/anakin.cmake index ed054ff41ae0ec5a4b31dd256e397129cba3e8f1..84354c446e2f54fa13b90fa37221eed90968b251 100644 --- a/cmake/external/anakin.cmake +++ b/cmake/external/anakin.cmake @@ -52,6 +52,7 @@ ExternalProject_Add( PREFIX ${ANAKIN_SOURCE_DIR} UPDATE_COMMAND "" CMAKE_ARGS ${CMAKE_ARGS_PREFIX} + -DUSE_LOGGER=YES -DUSE_X86_PLACE=YES -DBUILD_WITH_UNIT_TEST=NO -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index d26eebc8ff64784bdd9c37e123f49f7471ee0d50..59d18acedaffbf9f539608a90e919a634ca7cdee 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -21,7 +21,7 @@ paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'en paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')) -paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) +paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False)) paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.DistributeTranspilerConfig.__init__ paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)) @@ -145,14 +145,14 @@ paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, key paddle.fluid.layers.sequence_enumerate ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0, None)) paddle.fluid.layers.expand ArgSpec(args=['x', 'expand_times', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_concat ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.scale ArgSpec(args=['x', 'scale', 'bias', 'bias_after_scale', 'out', 'act', 'name'], varargs=None, keywords=None, defaults=(1.0, 0.0, True, None, None, None)) -paddle.fluid.layers.elementwise_add ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) -paddle.fluid.layers.elementwise_div ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) -paddle.fluid.layers.elementwise_sub ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) -paddle.fluid.layers.elementwise_mul ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) -paddle.fluid.layers.elementwise_max ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) -paddle.fluid.layers.elementwise_min ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) -paddle.fluid.layers.elementwise_pow ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) +paddle.fluid.layers.scale ArgSpec(args=['x', 'scale', 'bias', 'bias_after_scale', 'act', 'name'], varargs=None, keywords=None, defaults=(1.0, 0.0, True, None, None)) +paddle.fluid.layers.elementwise_add ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None)) +paddle.fluid.layers.elementwise_div ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None)) +paddle.fluid.layers.elementwise_sub ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None)) +paddle.fluid.layers.elementwise_mul ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None)) +paddle.fluid.layers.elementwise_max ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None)) +paddle.fluid.layers.elementwise_min ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None)) +paddle.fluid.layers.elementwise_pow ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None)) paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=['input', 'shape', 'dtype', 'input_dim_idx', 'output_dim_idx', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', 0, 0, -1.0, 1.0, 0)) paddle.fluid.layers.gaussian_random ArgSpec(args=['shape', 'mean', 'std', 'seed', 'dtype', 'use_mkldnn'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32', False)) paddle.fluid.layers.sampling_id ArgSpec(args=['x', 'min', 'max', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32')) @@ -160,6 +160,12 @@ paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=['input', 'shap paddle.fluid.layers.sum ArgSpec(args=['x', 'use_mkldnn'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.slice ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.shape ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.logical_and ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.layers.logical_or ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.layers.logical_xor ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.layers.logical_not ArgSpec(args=['x', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) @@ -225,12 +231,6 @@ paddle.fluid.layers.is_empty ArgSpec(args=['x', 'cond'], varargs=None, keywords= paddle.fluid.layers.mean ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.mul ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.clip ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.clip_by_norm ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.logical_and ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.logical_or ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.logical_xor ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.logical_not ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.sigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.logsigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) @@ -299,13 +299,17 @@ paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None) paddle.fluid.contrib.op_freq_statistic ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.QuantizeTranspiler.__init__ ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000)) +paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.contrib.QuantizeTranspiler.freeze_program ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)) +paddle.fluid.contrib.QuantizeTranspiler.training_transpile ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')) -paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) +paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, False)) paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.HashName.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.HashName.dispatch ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 69c6dd02005902d9ee486a0b7f8f0c9a343be255..de960dba8f79b7efb1d6948ef9ec647ac8530c84 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -56,9 +56,9 @@ else() cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor) endif() if (NOT WIN32) -cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version) + cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version) else() -cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version) + cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version) endif (NOT WIN32) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) @@ -141,12 +141,15 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) +cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass) + if(WITH_DISTRIBUTE) cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass) + cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass elementwise_add_op) endif() if (NOT WIN32) @@ -166,15 +169,8 @@ cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto) cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc) -# cc_test(channel_test SRCS channel_test.cc) cc_test(tuple_test SRCS tuple_test.cc ) if (NOT WIN32) cc_test(rw_lock_test SRCS rw_lock_test.cc) endif (NOT WIN32) - -# disable test temporarily. -# TODO https://github.com/PaddlePaddle/Paddle/issues/11971 -# cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op -# channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op -# conditional_block_op while_op assign_op print_op executor proto_desc) diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h deleted file mode 100644 index 722bf8e8ecba0c9cbc5e3ad737dbf73148d2873c..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/channel.h +++ /dev/null @@ -1,291 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include // for size_t -#include // NOLINT -#include -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace framework { - -enum class ChannelAction { - SEND = 0, - RECEIVE = 1, - CLOSE = 2, -}; - -// Channel is the abstract class of buffered and un-buffered channels. -template -class Channel { - public: - virtual bool CanSend() = 0; - virtual bool CanReceive() = 0; - virtual void Send(T*) = 0; - virtual bool Receive(T*) = 0; - virtual size_t Cap() = 0; - virtual void Lock() = 0; - - virtual void Unlock() = 0; - virtual bool IsClosed() = 0; - virtual void Close() = 0; - virtual ~Channel() {} - - virtual void AddToSendQ(const void* referrer, T* data, - std::shared_ptr cond, - std::function cb) = 0; - virtual void AddToReceiveQ(const void* referrer, T* data, - std::shared_ptr cond, - std::function cb) = 0; - virtual void RemoveFromSendQ(const void* referrer) = 0; - virtual void RemoveFromReceiveQ(const void* referrer) = 0; -}; - -// Forward declaration of channel implementations. -template -class ChannelImpl; - -template -Channel* MakeChannel(size_t buffer_size) { - return new ChannelImpl(buffer_size); -} - -template -void CloseChannel(Channel* ch) { - ch->Close(); -} - -/* - * The ChannelHolder class serves two main purposes: - * 1. It acts as a unified wrapper for the different kinds of - * channels, i.e. Buffered and Unbuffered channels. This is - * similar to the ReaderHolder class. - * 2. It also helps us in TypeHiding. This is similar to the - * PlaceHolder implementations in variable.h and tensor.h. - */ -class ChannelHolder { - public: - template - void Reset(size_t buffer_size) { - holder_.reset(new PlaceholderImpl(buffer_size)); - } - - template - void Send(T* data) { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - PADDLE_ENFORCE_EQ( - holder_->Type(), std::type_index(typeid(T)), - "Channel type is not same as the type of the data being sent"); - // Static cast should be safe because we have ensured that types are same - Channel* channel = static_cast*>(holder_->Ptr()); - PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null."); - channel->Send(data); - } - - template - bool Receive(T* data) { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - PADDLE_ENFORCE_EQ( - holder_->Type(), std::type_index(typeid(T)), - "Channel type is not same as the type of the data being sent"); - Channel* channel = static_cast*>(holder_->Ptr()); - PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null."); - return channel->Receive(data); - } - - bool IsClosed() { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - return holder_->IsClosed(); - } - - bool CanSend() { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - return holder_->CanSend(); - } - - bool CanReceive() { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - return holder_->CanReceive(); - } - - void close() { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - holder_->Close(); - } - - size_t Cap() { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - return holder_->Cap(); - } - - void Lock() { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - holder_->Lock(); - } - - void Unlock() { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - holder_->Unlock(); - } - - template - void AddToSendQ(const void* referrer, T* data, - std::shared_ptr cond, - std::function cb) { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - Channel* channel = static_cast*>(holder_->Ptr()); - if (channel != nullptr) { - channel->AddToSendQ(referrer, data, cond, cb); - } - } - - template - void AddToReceiveQ(const void* referrer, T* data, - std::shared_ptr cond, - std::function cb) { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - Channel* channel = static_cast*>(holder_->Ptr()); - if (channel != nullptr) { - channel->AddToReceiveQ(referrer, data, cond, cb); - } - } - - void RemoveFromSendQ(const void* referrer) { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - holder_->RemoveFromSendQ(referrer); - } - - void RemoveFromReceiveQ(const void* referrer) { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - holder_->RemoveFromReceiveQ(referrer); - } - - inline bool IsInitialized() const { return holder_ != nullptr; } - - inline const std::type_index Type() { - PADDLE_ENFORCE_EQ(IsInitialized(), true, - "The Channel hasn't been initialized"); - return holder_->Type(); - } - - private: - /** - * @note Placeholder hides type T, so it doesn't appear as a template - * parameter of ChannelHolder. - */ - struct Placeholder { - virtual ~Placeholder() {} - virtual const std::type_index Type() const = 0; - virtual void* Ptr() const = 0; - virtual bool IsClosed() = 0; - virtual bool CanSend() = 0; - virtual bool CanReceive() = 0; - virtual void RemoveFromSendQ(const void* referrer) = 0; - virtual void RemoveFromReceiveQ(const void* referrer) = 0; - virtual void Close() = 0; - virtual void Lock() = 0; - virtual void Unlock() = 0; - virtual size_t Cap() = 0; - }; - - template - struct PlaceholderImpl : public Placeholder { - explicit PlaceholderImpl(size_t buffer_size) - : type_(std::type_index(typeid(T))) { - channel_.reset(MakeChannel(buffer_size)); - } - - virtual const std::type_index Type() const { return type_; } - - virtual void* Ptr() const { return static_cast(channel_.get()); } - - virtual bool IsClosed() { - if (channel_) { - return channel_->IsClosed(); - } - return false; - } - - virtual bool CanSend() { - if (channel_) { - return channel_->CanSend(); - } - return false; - } - - virtual bool CanReceive() { - if (channel_) { - return channel_->CanReceive(); - } - return false; - } - - virtual void RemoveFromSendQ(const void* referrer) { - if (channel_) { - channel_->RemoveFromSendQ(referrer); - } - } - - virtual void RemoveFromReceiveQ(const void* referrer) { - if (channel_) { - channel_->RemoveFromReceiveQ(referrer); - } - } - - virtual void Close() { - if (channel_) channel_->Close(); - } - - virtual size_t Cap() { - if (channel_) - return channel_->Cap(); - else - return -1; - } - - virtual void Lock() { - if (channel_) channel_->Lock(); - } - - virtual void Unlock() { - if (channel_) channel_->Unlock(); - } - - std::unique_ptr> channel_; - const std::type_index type_; - }; - - // Pointer to a PlaceholderImpl object - std::unique_ptr holder_; -}; - -} // namespace framework -} // namespace paddle - -#include "paddle/fluid/framework/channel_impl.h" diff --git a/paddle/fluid/framework/channel_impl.h b/paddle/fluid/framework/channel_impl.h deleted file mode 100644 index 26d454534e1ae38c4f83376c0836a45781ea9101..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/channel_impl.h +++ /dev/null @@ -1,369 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include // for size_t -#include -#include // NOLINT -#include -#include "paddle/fluid/framework/channel.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace framework { - -template -class ChannelImpl : public paddle::framework::Channel { - friend Channel *paddle::framework::MakeChannel(size_t); - friend void paddle::framework::CloseChannel(Channel *); - - public: - virtual bool CanSend(); - virtual bool CanReceive(); - virtual void Send(T *); - virtual bool Receive(T *); - virtual size_t Cap() { return cap_; } - virtual void Lock(); - virtual void Unlock(); - virtual bool IsClosed(); - virtual void Close(); - explicit ChannelImpl(size_t); - virtual ~ChannelImpl(); - - virtual void AddToSendQ(const void *referrer, T *data, - std::shared_ptr cond, - std::function cb); - virtual void AddToReceiveQ(const void *referrer, T *data, - std::shared_ptr cond, - std::function cb); - - virtual void RemoveFromSendQ(const void *referrer); - virtual void RemoveFromReceiveQ(const void *referrer); - - private: - struct QueueMessage { - T *data; - std::shared_ptr cond; - bool chan_closed = false; - bool completed = false; - const void *referrer; // TODO(thuan): figure out better way to do this - std::function callback; - - explicit QueueMessage(T *item) - : data(item), cond(std::make_shared()) {} - - QueueMessage(T *item, std::shared_ptr cond) - : data(item), cond(cond) {} - - void Wait(std::unique_lock &lock) { - cond->wait(lock, [this]() { return completed; }); - } - - void Notify() { - completed = true; - cond->notify_all(); - } - }; - - void send_return() { - send_ctr--; - destructor_cond_.notify_all(); - } - - bool recv_return(bool value) { - recv_ctr--; - destructor_cond_.notify_all(); - return value; - } - - std::shared_ptr get_first_message( - std::deque> *queue, ChannelAction action) { - while (!queue->empty()) { - // Check whether this message was added by Select - // If this was added by Select then execute the callback - // to check if you can execute this message. The callback - // can return false if some other case was executed in Select. - // In that case just discard this QueueMessage and process next. - std::shared_ptr m = queue->front(); - queue->pop_front(); - if (m->callback == nullptr || m->callback(action)) return m; - } - return nullptr; - } - - size_t cap_; - std::recursive_mutex mu_; - bool closed_; - std::deque buf_; - std::deque> recvq; - std::deque> sendq; - std::atomic send_ctr{0}; - std::atomic recv_ctr{0}; - std::condition_variable_any destructor_cond_; -}; - -template -ChannelImpl::ChannelImpl(size_t capacity) - : cap_(capacity), closed_(false), send_ctr(0), recv_ctr(0) { - PADDLE_ENFORCE_GE(capacity, 0); -} - -template -bool ChannelImpl::CanSend() { - std::lock_guard lock{mu_}; - return !closed_ && (!recvq.empty() || buf_.size() < cap_); -} - -template -bool ChannelImpl::CanReceive() { - std::lock_guard lock{mu_}; - return !(closed_ && buf_.empty()) && (!sendq.empty() || buf_.size() > 0); -} - -template -void ChannelImpl::Send(T *item) { - send_ctr++; - std::unique_lock lock{mu_}; - - // If channel is closed, throw exception - if (closed_) { - send_return(); - lock.unlock(); - PADDLE_THROW("Cannot send on closed channel"); - } - - // If there is a receiver, directly pass the value we want - // to send to the receiver, bypassing the channel buffer if any - if (!recvq.empty()) { - std::shared_ptr m = - get_first_message(&recvq, ChannelAction::SEND); - - if (m != nullptr) { - *(m->data) = std::move(*item); - m->Notify(); - send_return(); - return; - } else { - Send(item); - send_return(); - return; - } - } - - // Unbuffered channel will always bypass this - // If buffered channel has space in buffer, - // write the element to the buffer. - if (buf_.size() < cap_) { - // Copy to buffer - buf_.push_back(std::move(*item)); - send_return(); - return; - } - - // Block on channel, because some receiver will complete - // the operation for us - auto m = std::make_shared(item); - sendq.push_back(m); - m->Wait(lock); - if (m->chan_closed) { - send_return(); - lock.unlock(); - PADDLE_THROW("Cannot send on closed channel"); - } - send_return(); -} - -template -bool ChannelImpl::Receive(T *item) { - recv_ctr++; - std::unique_lock lock{mu_}; - - // If channel is closed and buffer is empty or - // channel is unbuffered - if (closed_ && buf_.empty()) return recv_return(false); - - // If there is a sender, directly receive the value we want - // from the sender. In case of a buffered channel, read from - // buffer and move front of send queue to the buffer - if (!sendq.empty()) { - std::shared_ptr m = - get_first_message(&sendq, ChannelAction::RECEIVE); - if (buf_.size() > 0) { - // Case 1 : Channel is Buffered - // Do Data transfer from front of buffer - // and add a QueueMessage to the buffer - *item = std::move(buf_.front()); - buf_.pop_front(); - // If first message from sendq is not null - // add it to the buffer and notify it - if (m != nullptr) { - // Copy to buffer - buf_.push_back(std::move(*(m->data))); - m->Notify(); - } // Ignore if there is no first message - } else { - // Case 2: Channel is Unbuffered - // Do data transfer from front of SendQ - // If front is nullptr, then recursively call itself - if (m != nullptr) { - *item = std::move(*(m->data)); - m->Notify(); - } else { - return recv_return(Receive(item)); - } - } - return recv_return(true); - } - - // If this is a buffered channel and there are items in buffer - if (buf_.size() > 0) { - // Directly read from buffer - *item = std::move(buf_.front()); - buf_.pop_front(); - // return true - return recv_return(true); - } - - // No sender available, block on this channel - // Some receiver will complete the option for us - auto m = std::make_shared(item); - recvq.push_back(m); - m->Wait(lock); - - return recv_return(!m->chan_closed); -} - -template -void ChannelImpl::Lock() { - mu_.lock(); -} - -template -void ChannelImpl::Unlock() { - mu_.unlock(); -} - -template -bool ChannelImpl::IsClosed() { - std::lock_guard lock{mu_}; - return closed_; -} - -template -void ChannelImpl::Close() { - std::unique_lock lock{mu_}; - - if (closed_) { - // TODO(abhinavarora): closing an already closed channel should panic - lock.unlock(); - return; - } - - closed_ = true; - - // Empty the readers - while (!recvq.empty()) { - std::shared_ptr m = recvq.front(); - recvq.pop_front(); - m->chan_closed = true; - - // Execute callback function (if any) - if (m->callback != nullptr) { - m->callback(ChannelAction::CLOSE); - } - - m->Notify(); - } - - // Empty the senders - while (!sendq.empty()) { - std::shared_ptr m = sendq.front(); - sendq.pop_front(); - m->chan_closed = true; - - // Execute callback function (if any) - if (m->callback != nullptr) { - m->callback(ChannelAction::CLOSE); - } - - m->Notify(); - } -} - -template -void ChannelImpl::AddToSendQ( - const void *referrer, T *data, - std::shared_ptr cond, - std::function cb) { - std::lock_guard lock{mu_}; - auto m = std::make_shared(data, cond); - m->referrer = referrer; - m->callback = cb; - sendq.push_back(m); -} - -template -void ChannelImpl::AddToReceiveQ( - const void *referrer, T *data, - std::shared_ptr cond, - std::function cb) { - std::lock_guard lock{mu_}; - auto m = std::make_shared(data, cond); - m->referrer = referrer; - m->callback = cb; - recvq.push_back(m); -} - -template -void ChannelImpl::RemoveFromSendQ(const void *referrer) { - std::lock_guard lock{mu_}; - - for (auto it = sendq.begin(); it != sendq.end();) { - std::shared_ptr sendMsg = (std::shared_ptr)*it; - - if (sendMsg->referrer == referrer) { - it = sendq.erase(it); - } else { - ++it; - } - } -} - -template -void ChannelImpl::RemoveFromReceiveQ(const void *referrer) { - std::lock_guard lock{mu_}; - - for (auto it = recvq.begin(); it != recvq.end();) { - std::shared_ptr recvMsg = (std::shared_ptr)*it; - - if (recvMsg->referrer == referrer) { - it = recvq.erase(it); - } else { - ++it; - } - } -} - -template -ChannelImpl::~ChannelImpl() { - Close(); - // The destructor must wait for all readers and writers to complete their task - // The channel has been closed, so we will not accept new readers and writers - std::unique_lock lock{mu_}; - destructor_cond_.wait(lock, - [this]() { return send_ctr == 0 && recv_ctr == 0; }); -} - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/channel_test.cc b/paddle/fluid/framework/channel_test.cc deleted file mode 100644 index 542d791f6bbdf7d68a4786998ccc0233fff6473d..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/channel_test.cc +++ /dev/null @@ -1,1008 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/channel.h" - -#include // NOLINT -#include // NOLINT -#include "gtest/gtest.h" - -using paddle::framework::Channel; -using paddle::framework::ChannelHolder; -using paddle::framework::MakeChannel; -using paddle::framework::CloseChannel; - -TEST(Channel, ChannelCapacityTest) { - const size_t buffer_size = 10; - auto ch = MakeChannel(buffer_size); - EXPECT_EQ(ch->Cap(), buffer_size); - CloseChannel(ch); - delete ch; - - ch = MakeChannel(0); - EXPECT_EQ(ch->Cap(), 0U); - CloseChannel(ch); - delete ch; -} - -void RecevingOrderEqualToSendingOrder(Channel *ch, int num_items) { - unsigned sum_send = 0; - std::thread t([&]() { - for (int i = 0; i < num_items; i++) { - ch->Send(&i); - sum_send += i; - } - }); - std::this_thread::sleep_for(std::chrono::milliseconds(200)); - for (int i = 0; i < num_items; i++) { - int recv = -1; - EXPECT_EQ(ch->Receive(&recv), true); - EXPECT_EQ(recv, i); - } - std::this_thread::sleep_for(std::chrono::milliseconds(200)); - CloseChannel(ch); - t.join(); - unsigned expected_sum = (num_items * (num_items - 1)) / 2; - EXPECT_EQ(sum_send, expected_sum); - delete ch; -} - -TEST(Channel, SufficientBufferSizeDoesntBlock) { - const size_t buffer_size = 10; - auto ch = MakeChannel(buffer_size); - for (size_t i = 0; i < buffer_size; ++i) { - ch->Send(&i); - } - - size_t out; - for (size_t i = 0; i < buffer_size; ++i) { - EXPECT_EQ(ch->Receive(&out), true); // should not block - EXPECT_EQ(out, i); - } - CloseChannel(ch); - delete ch; -} - -// This tests that a channel must return false -// on send and receive performed after closing the channel. -// Receive will only return false after close when queue is empty. -// By creating separate threads for sending and receiving, we make this -// function able to test both buffered and unbuffered channels. -void SendReceiveWithACloseChannelShouldPanic(Channel *ch) { - const size_t data = 5; - std::thread send_thread{[&]() { - size_t i = data; - ch->Send(&i); // should not block - }}; - - std::thread recv_thread{[&]() { - size_t i; - EXPECT_EQ(ch->Receive(&i), true); // should not block - EXPECT_EQ(i, data); - }}; - - send_thread.join(); - recv_thread.join(); - - // After closing send should panic. Receive should - // also false as there is no data in queue. - CloseChannel(ch); - send_thread = std::thread{[&]() { - size_t i = data; - bool is_exception = false; - try { - ch->Send(&i); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - EXPECT_EQ(is_exception, true); - }}; - recv_thread = std::thread{[&]() { - size_t i; - // should return false because channel is closed and queue is empty - EXPECT_EQ(ch->Receive(&i), false); - }}; - - send_thread.join(); - recv_thread.join(); -} - -TEST(Channel, SendReceiveClosedBufferedChannelPanics) { - size_t buffer_size = 10; - auto ch = MakeChannel(buffer_size); - SendReceiveWithACloseChannelShouldPanic(ch); - delete ch; -} - -TEST(Channel, SendReceiveClosedUnBufferedChannelPanics) { - auto ch = MakeChannel(0); - SendReceiveWithACloseChannelShouldPanic(ch); - delete ch; -} - -TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) { - const size_t buffer_size = 10; - auto ch = MakeChannel(buffer_size); - - for (size_t i = 0; i < buffer_size; ++i) { - ch->Send(&i); // sending should not block - } - - size_t out; - for (size_t i = 0; i < buffer_size / 2; ++i) { - EXPECT_EQ(ch->Receive(&out), true); // receiving should not block - EXPECT_EQ(out, i); - } - - CloseChannel(ch); - - for (size_t i = buffer_size / 2; i < buffer_size; ++i) { - EXPECT_EQ(ch->Receive(&out), - true); // receving should return residual values. - EXPECT_EQ(out, i); - } - - for (size_t i = 0; i < buffer_size; ++i) { - EXPECT_EQ(ch->Receive(&out), - false); // receiving on closed channel should return false - } - delete ch; -} - -TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) { - const size_t buffer_size = 10; - auto ch = MakeChannel(buffer_size); - std::thread t([&]() { - // Try to write more than buffer size. - for (size_t i = 0; i < 2 * buffer_size; ++i) { - if (i < buffer_size) { - ch->Send(&i); // should block after 10 iterations - } else { - bool is_exception = false; - try { - ch->Send(&i); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - EXPECT_EQ(is_exception, true); - } - } - }); - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec - CloseChannel(ch); - t.join(); - delete ch; -} - -TEST(Channel, RecevingOrderEqualToSendingOrderWithUnBufferedChannel) { - auto ch = MakeChannel(0); - RecevingOrderEqualToSendingOrder(ch, 20); -} - -TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel1) { - // Test that Receive Order is same as Send Order when number of items - // sent is less than size of buffer - auto ch = MakeChannel(10); - RecevingOrderEqualToSendingOrder(ch, 5); -} - -TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel2) { - // Test that Receive Order is same as Send Order when number of items - // sent is equal to size of buffer - auto ch = MakeChannel(10); - RecevingOrderEqualToSendingOrder(ch, 10); -} - -TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel3) { - // Test that Receive Order is same as Send Order when number of items - // sent is greater than the size of buffer - auto ch = MakeChannel(10); - RecevingOrderEqualToSendingOrder(ch, 20); -} - -void ChannelCloseUnblocksReceiversTest(Channel *ch) { - const size_t kNumThreads = 5; - std::thread t[kNumThreads]; - bool thread_ended[kNumThreads]; - - // Launches threads that try to read and are blocked because of no writers - for (size_t i = 0; i < kNumThreads; i++) { - thread_ended[i] = false; - t[i] = std::thread( - [&](bool *p) { - int data; - EXPECT_EQ(ch->Receive(&data), false); - *p = true; - }, - &thread_ended[i]); - } - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec - - // Verify that all the threads are blocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], false); - } - - // Explicitly close the channel - // This should unblock all receivers - CloseChannel(ch); - - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec - - // Verify that all threads got unblocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], true); - } - - for (size_t i = 0; i < kNumThreads; i++) t[i].join(); -} - -void ChannelCloseUnblocksSendersTest(Channel *ch, bool isBuffered) { - const size_t kNumThreads = 5; - std::thread t[kNumThreads]; - bool thread_ended[kNumThreads]; - bool send_success[kNumThreads]; - - // Launches threads that try to write and are blocked because of no readers - for (size_t i = 0; i < kNumThreads; i++) { - thread_ended[i] = false; - send_success[i] = false; - t[i] = std::thread( - [&](bool *ended, bool *success) { - int data = 10; - bool is_exception = false; - try { - ch->Send(&data); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - *success = !is_exception; - *ended = true; - }, - &thread_ended[i], &send_success[i]); - } - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait - - if (isBuffered) { - // If ch is Buffered, atleast 4 threads must be blocked. - int ct = 0; - for (size_t i = 0; i < kNumThreads; i++) { - if (!thread_ended[i]) ct++; - } - EXPECT_GE(ct, 4); - } else { - // If ch is UnBuffered, all the threads should be blocked. - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], false); - } - } - // Explicitly close the thread - // This should unblock all senders - CloseChannel(ch); - - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait - - // Verify that all threads got unblocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], true); - } - - if (isBuffered) { - // Verify that only 1 send was successful - int ct = 0; - for (size_t i = 0; i < kNumThreads; i++) { - if (send_success[i]) ct++; - } - // Only 1 send must be successful - EXPECT_EQ(ct, 1); - } - - for (size_t i = 0; i < kNumThreads; i++) t[i].join(); -} - -// This tests that closing a buffered channel also unblocks -// any receivers waiting on the channel -TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) { - auto ch = MakeChannel(1); - ChannelCloseUnblocksReceiversTest(ch); - delete ch; -} - -// This tests that closing a buffered channel also unblocks -// any senders waiting for channel to have write space -TEST(Channel, BufferedChannelCloseUnblocksSendersTest) { - auto ch = MakeChannel(1); - ChannelCloseUnblocksSendersTest(ch, true); - delete ch; -} - -// This tests that closing an unbuffered channel also unblocks -// unblocks any receivers waiting for senders -TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) { - auto ch = MakeChannel(0); - ChannelCloseUnblocksReceiversTest(ch); - delete ch; -} - -// This tests that closing an unbuffered channel also unblocks -// unblocks any senders waiting for senders -TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) { - auto ch = MakeChannel(0); - ChannelCloseUnblocksSendersTest(ch, false); - delete ch; -} - -TEST(Channel, UnbufferedLessReceiveMoreSendTest) { - auto ch = MakeChannel(0); - unsigned sum_send = 0; - // Send should block after three iterations - // since we only have three receivers. - std::thread t([&]() { - // Try to send more number of times - // than receivers - for (int i = 0; i < 4; i++) { - try { - ch->Send(&i); - sum_send += i; - } catch (paddle::platform::EnforceNotMet e) { - } - } - }); - for (int i = 0; i < 3; i++) { - int recv; - ch->Receive(&recv); - EXPECT_EQ(recv, i); - } - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec - EXPECT_EQ(sum_send, 3U); - - CloseChannel(ch); - t.join(); - delete ch; -} - -TEST(Channel, UnbufferedMoreReceiveLessSendTest) { - auto ch = MakeChannel(0); - unsigned sum_send = 0; - unsigned sum_receive = 0; - // The receiver should block after 5 - // iterations, since there are only 5 senders. - std::thread t([&]() { - for (int i = 0; i < 8; i++) { - int recv; - ch->Receive(&recv); // should block after the fifth iteration. - EXPECT_EQ(recv, i); - sum_receive += i; - } - }); - for (int i = 0; i < 5; i++) { - ch->Send(&i); - sum_send += i; - } - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec - EXPECT_EQ(sum_send, 10U); - EXPECT_EQ(sum_receive, 10U); - // send three more elements - for (int i = 5; i < 8; i++) { - ch->Send(&i); - sum_send += i; - } - - CloseChannel(ch); - t.join(); - EXPECT_EQ(sum_send, 28U); - EXPECT_EQ(sum_receive, 28U); - delete ch; -} - -// This tests that destroying a channel unblocks -// any senders waiting for channel to have write space -void ChannelDestroyUnblockSenders(Channel *ch, bool isBuffered) { - const size_t kNumThreads = 5; - std::thread t[kNumThreads]; - bool thread_ended[kNumThreads]; - bool send_success[kNumThreads]; - - // Launches threads that try to write and are blocked because of no readers - for (size_t i = 0; i < kNumThreads; i++) { - thread_ended[i] = false; - send_success[i] = false; - t[i] = std::thread( - [&](bool *ended, bool *success) { - int data = 10; - bool is_exception = false; - try { - ch->Send(&data); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - *success = !is_exception; - *ended = true; - }, - &thread_ended[i], &send_success[i]); - } - - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec - - if (isBuffered) { - // If channel is buffered, verify that atleast 4 threads are blocked - int ct = 0; - for (size_t i = 0; i < kNumThreads; i++) { - if (thread_ended[i] == false) ct++; - } - // Atleast 4 threads must be blocked - EXPECT_GE(ct, 4); - } else { - // Verify that all the threads are blocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], false); - } - } - // Explicitly destroy the channel - delete ch; - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait - - // Verify that all threads got unblocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], true); - } - - // Count number of successful sends - int ct = 0; - for (size_t i = 0; i < kNumThreads; i++) { - if (send_success[i]) ct++; - } - - if (isBuffered) { - // Only 1 send must be successful - EXPECT_EQ(ct, 1); - } else { - // In unbuffered channel, no send should be successful - EXPECT_EQ(ct, 0); - } - - // Join all threads - for (size_t i = 0; i < kNumThreads; i++) t[i].join(); -} - -// This tests that destroying a channel also unblocks -// any receivers waiting on the channel -void ChannelDestroyUnblockReceivers(Channel *ch) { - const size_t kNumThreads = 5; - std::thread t[kNumThreads]; - bool thread_ended[kNumThreads]; - - // Launches threads that try to read and are blocked because of no writers - for (size_t i = 0; i < kNumThreads; i++) { - thread_ended[i] = false; - t[i] = std::thread( - [&](bool *p) { - int data; - // All reads should return false - EXPECT_EQ(ch->Receive(&data), false); - *p = true; - }, - &thread_ended[i]); - } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait - - // Verify that all threads are blocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], false); - } - // delete the channel - delete ch; - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait - // Verify that all threads got unblocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], true); - } - - for (size_t i = 0; i < kNumThreads; i++) t[i].join(); -} - -TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) { - size_t buffer_size = 1; - auto ch = MakeChannel(buffer_size); - ChannelDestroyUnblockReceivers(ch); -} - -TEST(Channel, BufferedChannelDestroyUnblocksSendersTest) { - size_t buffer_size = 1; - auto ch = MakeChannel(buffer_size); - ChannelDestroyUnblockSenders(ch, true); -} - -// This tests that destroying an unbuffered channel also unblocks -// unblocks any receivers waiting for senders -TEST(Channel, UnbufferedChannelDestroyUnblocksReceiversTest) { - auto ch = MakeChannel(0); - ChannelDestroyUnblockReceivers(ch); -} - -TEST(Channel, UnbufferedChannelDestroyUnblocksSendersTest) { - auto ch = MakeChannel(0); - ChannelDestroyUnblockSenders(ch, false); -} - -TEST(ChannelHolder, ChannelHolderCapacityTest) { - const size_t buffer_size = 10; - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(buffer_size); - EXPECT_EQ(ch->Cap(), buffer_size); - delete ch; - - ch = new ChannelHolder(); - ch->Reset(0); - EXPECT_EQ(ch->Cap(), 0U); - delete ch; -} - -void ChannelHolderSendReceive(ChannelHolder *ch) { - unsigned sum_send = 0; - std::thread t([&]() { - for (int i = 0; i < 5; i++) { - ch->Send(&i); - sum_send += i; - } - }); - for (int i = 0; i < 5; i++) { - int recv; - EXPECT_EQ(ch->Receive(&recv), true); - EXPECT_EQ(recv, i); - } - - ch->close(); - t.join(); - EXPECT_EQ(sum_send, 10U); -} - -TEST(ChannelHolder, ChannelHolderBufferedSendReceiveTest) { - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(10); - ChannelHolderSendReceive(ch); - delete ch; -} - -TEST(ChannelHolder, ChannelHolderUnBufferedSendReceiveTest) { - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(0); - ChannelHolderSendReceive(ch); - delete ch; -} - -TEST(ChannelHolder, ChannelUninitializedTest) { - ChannelHolder *ch = new ChannelHolder(); - EXPECT_EQ(ch->IsInitialized(), false); - int i = 10; - bool send_exception = false; - try { - ch->Send(&i); - } catch (paddle::platform::EnforceNotMet e) { - send_exception = true; - } - EXPECT_EQ(send_exception, true); - - bool recv_exception = false; - try { - ch->Receive(&i); - } catch (paddle::platform::EnforceNotMet e) { - recv_exception = true; - } - EXPECT_EQ(recv_exception, true); - - bool is_exception = false; - try { - ch->Type(); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - EXPECT_EQ(is_exception, true); - delete ch; -} - -TEST(ChannelHolder, ChannelInitializedTest) { - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(2); - EXPECT_EQ(ch->IsInitialized(), true); - // Channel should remain intialized even after close - ch->close(); - EXPECT_EQ(ch->IsInitialized(), true); - delete ch; -} - -TEST(ChannelHolder, TypeMismatchSendTest) { - // Test with unbuffered channel - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(0); - bool is_exception = false; - bool boolean_data = true; - try { - ch->Send(&boolean_data); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - EXPECT_EQ(is_exception, true); - delete ch; - - // Test with Buffered Channel - ch = new ChannelHolder(); - ch->Reset(10); - is_exception = false; - int int_data = 23; - try { - ch->Send(&int_data); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - EXPECT_EQ(is_exception, true); - delete ch; -} - -TEST(ChannelHolder, TypeMismatchReceiveTest) { - // Test with unbuffered channel - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(0); - bool is_exception = false; - bool float_data; - try { - ch->Receive(&float_data); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - EXPECT_EQ(is_exception, true); - delete ch; - - // Test with Buffered Channel - ch = new ChannelHolder(); - ch->Reset(10); - is_exception = false; - int int_data = 23; - try { - ch->Receive(&int_data); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - EXPECT_EQ(is_exception, true); - delete ch; -} - -void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) { - const size_t kNumThreads = 5; - std::thread t[kNumThreads]; - bool thread_ended[kNumThreads]; - - // Launches threads that try to read and are blocked because of no writers - for (size_t i = 0; i < kNumThreads; i++) { - thread_ended[i] = false; - t[i] = std::thread( - [&](bool *p) { - int data; - EXPECT_EQ(ch->Receive(&data), false); - *p = true; - }, - &thread_ended[i]); - } - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec - - // Verify that all the threads are blocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], false); - } - - // Explicitly close the channel - // This should unblock all receivers - ch->close(); - - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec - - // Verify that all threads got unblocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], true); - } - - for (size_t i = 0; i < kNumThreads; i++) t[i].join(); -} - -void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) { - const size_t kNumThreads = 5; - std::thread t[kNumThreads]; - bool thread_ended[kNumThreads]; - bool send_success[kNumThreads]; - - // Launches threads that try to write and are blocked because of no readers - for (size_t i = 0; i < kNumThreads; i++) { - thread_ended[i] = false; - send_success[i] = false; - t[i] = std::thread( - [&](bool *ended, bool *success) { - int data = 10; - bool is_exception = false; - try { - ch->Send(&data); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - *success = !is_exception; - *ended = true; - }, - &thread_ended[i], &send_success[i]); - } - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait - - if (isBuffered) { - // If ch is Buffered, atleast 4 threads must be blocked. - int ct = 0; - for (size_t i = 0; i < kNumThreads; i++) { - if (!thread_ended[i]) ct++; - } - EXPECT_GE(ct, 4); - } else { - // If ch is UnBuffered, all the threads should be blocked. - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], false); - } - } - // Explicitly close the thread - // This should unblock all senders - ch->close(); - - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait - - // Verify that all threads got unblocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], true); - } - - if (isBuffered) { - // Verify that only 1 send was successful - int ct = 0; - for (size_t i = 0; i < kNumThreads; i++) { - if (send_success[i]) ct++; - } - // Only 1 send must be successful - EXPECT_EQ(ct, 1); - } - - for (size_t i = 0; i < kNumThreads; i++) t[i].join(); -} - -// This tests that closing a channelholder unblocks -// any receivers waiting on the channel -TEST(ChannelHolder, ChannelHolderCloseUnblocksReceiversTest) { - // Check for buffered channel - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(1); - ChannelHolderCloseUnblocksReceiversTest(ch); - delete ch; - - // Check for unbuffered channel - ch = new ChannelHolder(); - ch->Reset(0); - ChannelHolderCloseUnblocksReceiversTest(ch); - delete ch; -} - -// This tests that closing a channelholder unblocks -// any senders waiting for channel to have write space -TEST(Channel, ChannelHolderCloseUnblocksSendersTest) { - // Check for buffered channel - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(1); - ChannelHolderCloseUnblocksSendersTest(ch, true); - delete ch; - - // Check for unbuffered channel - ch = new ChannelHolder(); - ch->Reset(0); - ChannelHolderCloseUnblocksSendersTest(ch, false); - delete ch; -} - -// This tests that destroying a channelholder unblocks -// any senders waiting for channel -void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) { - const size_t kNumThreads = 5; - std::thread t[kNumThreads]; - bool thread_ended[kNumThreads]; - bool send_success[kNumThreads]; - - // Launches threads that try to write and are blocked because of no readers - for (size_t i = 0; i < kNumThreads; i++) { - thread_ended[i] = false; - send_success[i] = false; - t[i] = std::thread( - [&](bool *ended, bool *success) { - int data = 10; - bool is_exception = false; - try { - ch->Send(&data); - } catch (paddle::platform::EnforceNotMet e) { - is_exception = true; - } - *success = !is_exception; - *ended = true; - }, - &thread_ended[i], &send_success[i]); - } - - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec - if (isBuffered) { - // If channel is buffered, verify that atleast 4 threads are blocked - int ct = 0; - for (size_t i = 0; i < kNumThreads; i++) { - if (thread_ended[i] == false) ct++; - } - // Atleast 4 threads must be blocked - EXPECT_GE(ct, 4); - } else { - // Verify that all the threads are blocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], false); - } - } - // Explicitly destroy the channel - delete ch; - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait - - // Verify that all threads got unblocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], true); - } - - // Count number of successfuld sends - int ct = 0; - for (size_t i = 0; i < kNumThreads; i++) { - if (send_success[i]) ct++; - } - - if (isBuffered) { - // Only 1 send must be successful - EXPECT_EQ(ct, 1); - } else { - // In unbuffered channel, no send should be successful - EXPECT_EQ(ct, 0); - } - - // Join all threads - for (size_t i = 0; i < kNumThreads; i++) t[i].join(); -} - -// This tests that destroying a channelholder also unblocks -// any receivers waiting on the channel -void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) { - const size_t kNumThreads = 5; - std::thread t[kNumThreads]; - bool thread_ended[kNumThreads]; - - // Launches threads that try to read and are blocked because of no writers - for (size_t i = 0; i < kNumThreads; i++) { - thread_ended[i] = false; - t[i] = std::thread( - [&](bool *p) { - int data; - // All reads should return false - EXPECT_EQ(ch->Receive(&data), false); - *p = true; - }, - &thread_ended[i]); - } - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait - - // Verify that all threads are blocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], false); - } - // delete the channel - delete ch; - std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait - // Verify that all threads got unblocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], true); - } - - for (size_t i = 0; i < kNumThreads; i++) t[i].join(); -} - -TEST(ChannelHolder, ChannelHolderDestroyUnblocksReceiversTest) { - // Check for Buffered Channel - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(1); - ChannelHolderDestroyUnblockReceivers(ch); - // ch is already deleted already deleted in - // ChannelHolderDestroyUnblockReceivers - - // Check for Unbuffered channel - ch = new ChannelHolder(); - ch->Reset(0); - ChannelHolderDestroyUnblockReceivers(ch); -} - -TEST(ChannelHolder, ChannelHolderDestroyUnblocksSendersTest) { - // Check for Buffered Channel - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(1); - ChannelHolderDestroyUnblockSenders(ch, true); - // ch is already deleted already deleted in - // ChannelHolderDestroyUnblockReceivers - - // Check for Unbuffered channel - ch = new ChannelHolder(); - ch->Reset(0); - ChannelHolderDestroyUnblockSenders(ch, false); -} - -// This tests that closing a channelholder many times. -void ChannelHolderManyTimesClose(ChannelHolder *ch) { - const int kNumThreads = 15; - std::thread t[kNumThreads]; - bool thread_ended[kNumThreads]; - - // Launches threads that try to send data to channel. - for (size_t i = 0; i < kNumThreads / 3; i++) { - thread_ended[i] = false; - t[i] = std::thread( - [&](bool *ended) { - int data = 10; - ch->Send(&data); - *ended = true; - }, - &thread_ended[i]); - } - - // Launches threads that try to receive data to channel. - for (size_t i = kNumThreads / 3; i < 2 * kNumThreads / 3; i++) { - thread_ended[i] = false; - t[i] = std::thread( - [&](bool *p) { - int data; - if (ch->Receive(&data)) { - EXPECT_EQ(data, 10); - } - *p = true; - }, - &thread_ended[i]); - } - - // Launches threads that try to close the channel. - for (size_t i = 2 * kNumThreads / 3; i < kNumThreads; i++) { - thread_ended[i] = false; - t[i] = std::thread( - [&](bool *p) { - if (!ch->IsClosed()) { - ch->close(); - } - *p = true; - }, - &thread_ended[i]); - } - - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait - - // Verify that all threads are unblocked - for (size_t i = 0; i < kNumThreads; i++) { - EXPECT_EQ(thread_ended[i], true); - } - EXPECT_TRUE(ch->IsClosed()); - // delete the channel - delete ch; - for (size_t i = 0; i < kNumThreads; i++) t[i].join(); -} - -TEST(ChannelHolder, ChannelHolderManyTimesCloseTest) { - // Check for Buffered Channel - ChannelHolder *ch = new ChannelHolder(); - ch->Reset(10); - ChannelHolderManyTimesClose(ch); -} diff --git a/paddle/fluid/framework/concurrency_test.cc b/paddle/fluid/framework/concurrency_test.cc deleted file mode 100644 index bbf67f5ba92150f70cf45d49e3f4ca0a16393541..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/concurrency_test.cc +++ /dev/null @@ -1,292 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include // NOLINT - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/channel.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/op_registry.h" - -USE_NO_KERNEL_OP(go); -USE_NO_KERNEL_OP(channel_close); -USE_NO_KERNEL_OP(channel_create); -USE_NO_KERNEL_OP(channel_recv); -USE_NO_KERNEL_OP(channel_send); -USE_NO_KERNEL_OP(elementwise_add); -USE_NO_KERNEL_OP(select); -USE_NO_KERNEL_OP(conditional_block); -USE_NO_KERNEL_OP(equal); -USE_NO_KERNEL_OP(assign); -USE_NO_KERNEL_OP(while); -USE_NO_KERNEL_OP(print); - -namespace f = paddle::framework; -namespace p = paddle::platform; - -namespace paddle { -namespace framework { - -template -LoDTensor *CreateVariable(Scope *scope, const p::CPUPlace &place, - std::string name, T value) { - // Create LoDTensor of dim [1] - auto var = scope->Var(name); - auto tensor = var->GetMutable(); - tensor->Resize({1}); - T *expect = tensor->mutable_data(place); - expect[0] = value; - return tensor; -} - -void AddOp(const std::string &type, const VariableNameMap &inputs, - const VariableNameMap &outputs, AttributeMap attrs, - BlockDesc *block) { - // insert op - auto op = block->AppendOp(); - op->SetType(type); - for (auto &kv : inputs) { - op->SetInput(kv.first, kv.second); - } - for (auto &kv : outputs) { - op->SetOutput(kv.first, kv.second); - } - op->SetAttrMap(attrs); -} - -void AddCase(ProgramDesc *program, Scope *scope, p::CPUPlace *place, - BlockDesc *casesBlock, int caseId, int caseType, - std::string caseChannel, std::string caseVarName, - std::function func) { - std::string caseCondName = std::string("caseCond") + std::to_string(caseId); - std::string caseCondXVarName = - std::string("caseCondX") + std::to_string(caseId); - - BlockDesc *caseBlock = program->AppendBlock(*casesBlock); - func(caseBlock, scope); - - CreateVariable(scope, *place, caseCondName, false); - CreateVariable(scope, *place, caseCondXVarName, caseId); - CreateVariable(scope, *place, caseVarName, caseId); - - scope->Var("step_scope"); - - AddOp("equal", {{"X", {caseCondXVarName}}, {"Y", {"caseToExecute"}}}, - {{"Out", {caseCondName}}}, {}, casesBlock); - - AddOp("conditional_block", {{"X", {caseCondName}}, {"Params", {}}}, - {{"Out", {}}, {"Scope", {"step_scope"}}}, - {{"sub_block", caseBlock}, {"is_scalar_condition", true}}, casesBlock); -} - -void AddFibonacciSelect(Scope *scope, p::CPUPlace *place, ProgramDesc *program, - BlockDesc *parentBlock, std::string dataChanName, - std::string quitChanName) { - BlockDesc *whileBlock = program->AppendBlock(*parentBlock); - - CreateVariable(scope, *place, "whileExitCond", true); - CreateVariable(scope, *place, "caseToExecute", -1); - CreateVariable(scope, *place, "case1var", 0); - - CreateVariable(scope, *place, "xtemp", 0); - - // TODO(thuan): Need to create fibXToSend, since channel send moves the actual - // data, - // which causes the data to be no longer accessible to do the fib calculation - // TODO(abhinav): Change channel send to do a copy instead of a move! - CreateVariable(scope, *place, "fibXToSend", 0); - - CreateVariable(scope, *place, "fibX", 0); - CreateVariable(scope, *place, "fibY", 1); - CreateVariable(scope, *place, "quitVar", 0); - - BlockDesc *casesBlock = program->AppendBlock(*whileBlock); - std::function f = [](BlockDesc *caseBlock) {}; - - // TODO(thuan): Remove this once we change channel send to do a copy instead - // of move - AddOp("assign", {{"X", {"fibX"}}}, {{"Out", {"fibXToSend"}}}, {}, whileBlock); - - // Case 0: Send to dataChanName - std::function case0Func = [&]( - BlockDesc *caseBlock, Scope *scope) { - AddOp("assign", {{"X", {"fibX"}}}, {{"Out", {"xtemp"}}}, {}, caseBlock); - AddOp("assign", {{"X", {"fibY"}}}, {{"Out", {"fibX"}}}, {}, caseBlock); - AddOp("elementwise_add", {{"X", {"xtemp"}}, {"Y", {"fibY"}}}, - {{"Out", {"fibY"}}}, {}, caseBlock); - }; - AddCase(program, scope, place, casesBlock, 0, 1, dataChanName, "fibXToSend", - case0Func); - std::string case0Config = - std::string("0,1,") + dataChanName + std::string(",fibXToSend"); - - // Case 1: Receive from quitChanName - std::function case2Func = [&]( - BlockDesc *caseBlock, Scope *scope) { - // Exit the while loop after we receive from quit channel. - // We assign a false to "whileExitCond" variable, which will - // break out of while_op loop - CreateVariable(scope, *place, "whileFalse", false); - AddOp("assign", {{"X", {"whileFalse"}}}, {{"Out", {"whileExitCond"}}}, {}, - caseBlock); - }; - AddCase(program, scope, place, casesBlock, 1, 2, quitChanName, "quitVar", - case2Func); - std::string case1Config = - std::string("1,2,") + quitChanName + std::string(",quitVar"); - - // Select block - AddOp("select", {{"X", {dataChanName, quitChanName}}, - {"case_to_execute", {"caseToExecute"}}}, - {{"Out", {}}}, - {{"sub_block", casesBlock}, - {"cases", std::vector{case0Config, case1Config}}}, - whileBlock); - - scope->Var("stepScopes"); - AddOp("while", - {{"X", {dataChanName, quitChanName}}, {"Condition", {"whileExitCond"}}}, - {{"Out", {}}, {"StepScopes", {"stepScopes"}}}, - {{"sub_block", whileBlock}}, parentBlock); -} - -TEST(Concurrency, Go_Op) { - Scope scope; - p::CPUPlace place; - - // Initialize scope variables - p::CPUDeviceContext ctx(place); - - // Create channel variable - scope.Var("Channel"); - - // Create Variables, x0 will be put into channel, - // result will be pulled from channel - CreateVariable(&scope, place, "Status", false); - CreateVariable(&scope, place, "x0", 99); - CreateVariable(&scope, place, "result", 0); - - framework::Executor executor(place); - ProgramDesc program; - BlockDesc *block = program.MutableBlock(0); - - // Create channel OP - AddOp("channel_create", {}, {{"Out", {"Channel"}}}, - {{"capacity", 10}, {"data_type", f::proto::VarType::LOD_TENSOR}}, - block); - - // Create Go Op routine - BlockDesc *goOpBlock = program.AppendBlock(program.Block(0)); - AddOp("channel_send", {{"Channel", {"Channel"}}, {"X", {"x0"}}}, - {{"Status", {"Status"}}}, {}, goOpBlock); - - // Create Go Op - AddOp("go", {{"X", {"Channel", "x0"}}}, {}, {{"sub_block", goOpBlock}}, - block); - - // Create Channel Receive Op - AddOp("channel_recv", {{"Channel", {"Channel"}}}, - {{"Status", {"Status"}}, {"Out", {"result"}}}, {}, block); - - // Create Channel Close Op - AddOp("channel_close", {{"Channel", {"Channel"}}}, {}, {}, block); - - // Check the result tensor to make sure it is set to 0 - const LoDTensor &tensor = (scope.FindVar("result"))->Get(); - auto *initialData = tensor.data(); - EXPECT_EQ(initialData[0], 0); - - executor.Run(program, &scope, 0, true, true); - - // After we call executor.run, the Go operator should do a channel_send to - // set the "result" variable to 99. - auto *finalData = tensor.data(); - EXPECT_EQ(finalData[0], 99); -} - -/** - * This test implements the fibonacci function using go_op and select_op - */ -TEST(Concurrency, Select) { - Scope scope; - p::CPUPlace place; - - // Initialize scope variables - p::CPUDeviceContext ctx(place); - - CreateVariable(&scope, place, "Status", false); - CreateVariable(&scope, place, "result", 0); - CreateVariable(&scope, place, "currentXFib", 0); - - framework::Executor executor(place); - ProgramDesc program; - BlockDesc *block = program.MutableBlock(0); - - // Create channel OP - std::string dataChanName = "Channel"; - scope.Var(dataChanName); - AddOp("channel_create", {}, {{"Out", {dataChanName}}}, - {{"capacity", 0}, {"data_type", f::proto::VarType::LOD_TENSOR}}, block); - - std::string quitChanName = "Quit"; - scope.Var(quitChanName); - AddOp("channel_create", {}, {{"Out", {quitChanName}}}, - {{"capacity", 0}, {"data_type", f::proto::VarType::LOD_TENSOR}}, block); - - // Create Go Op routine, which loops 10 times over fibonacci sequence - CreateVariable(&scope, place, "xReceiveVar", 0); - - BlockDesc *goOpBlock = program.AppendBlock(program.Block(0)); - for (int i = 0; i < 10; ++i) { - AddOp("channel_recv", {{"Channel", {dataChanName}}}, - {{"Status", {"Status"}}, {"Out", {"currentXFib"}}}, {}, goOpBlock); - AddOp("print", {{"In", {"currentXFib"}}}, {{"Out", {"currentXFib"}}}, - {{"first_n", 100}, - {"summarize", -1}, - {"print_tensor_name", false}, - {"print_tensor_type", true}, - {"print_tensor_shape", false}, - {"print_tensor_lod", false}, - {"print_phase", std::string("FORWARD")}, - {"message", std::string("X: ")}}, - goOpBlock); - } - - CreateVariable(&scope, place, "quitSignal", 0); - AddOp("channel_send", {{"Channel", {quitChanName}}, {"X", {"quitSignal"}}}, - {{"Status", {"Status"}}}, {}, goOpBlock); - - // Create Go Op - AddOp("go", {{"X", {dataChanName, quitChanName}}}, {}, - {{"sub_block", goOpBlock}}, block); - - AddFibonacciSelect(&scope, &place, &program, block, dataChanName, - quitChanName); - - // Create Channel Close Op - AddOp("channel_close", {{"Channel", {dataChanName}}}, {}, {}, block); - AddOp("channel_close", {{"Channel", {quitChanName}}}, {}, {}, block); - - executor.Run(program, &scope, 0, true, true); - - // After we call executor.run, "result" variable should be equal to 34 - // (which is 10 loops through fibonacci sequence) - const LoDTensor &tensor = (scope.FindVar("currentXFib"))->Get(); - auto *finalData = tensor.data(); - EXPECT_EQ(finalData[0], 34); -} - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 8d8042a0563a21dad216ffd53a474322c378ace6..70ec6e90a4d0106b7f838e51b8357798daa4b10d 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -14,7 +14,6 @@ limitations under the License. */ #include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor_array.h" @@ -76,15 +75,13 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) { var->GetMutable(); } else if (var_type == proto::VarType::READER) { var->GetMutable(); - } else if (var_type == proto::VarType::CHANNEL) { - var->GetMutable(); } else if (var_type == proto::VarType::RAW) { // GetMutable will be called in operator } else { PADDLE_THROW( "Variable type %d is not in " "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, " - "LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]", + "LOD_RANK_TABLE, PLACE_LIST, READER, RAW]", var_type); } } diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index 460401df5473f8650f450a2bd247a703d91b6048..25f0ba418433571343c5b2bbfdbf9fb940eaec52 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -126,7 +126,6 @@ message VarType { LOD_TENSOR_ARRAY = 13; PLACE_LIST = 14; READER = 15; - CHANNEL = 16; // Any runtime decided variable type is raw // raw variables should manage their own allocations // in operators like nccl_op @@ -158,12 +157,6 @@ message VarType { message ReaderDesc { repeated LoDTensorDesc lod_tensor = 1; } optional ReaderDesc reader = 5; - message ChannelDesc { - required Type data_type = 1; - required int64 capacity = 2; - } - optional ChannelDesc channel = 6; - message Tuple { repeated Type element_type = 1; } optional Tuple tuple = 7; } diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 9796f2778955ce47fe9fd1b13baa36404b1574c5..b56912689b82be384ac5fda0e91a54d690cd9586 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -28,12 +28,13 @@ cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph grap pass_library(graph_to_program_pass base) pass_library(graph_viz_pass base) pass_library(fc_fuse_pass inference) -if(WITH_MKLDNN) - pass_library(conv_relu_mkldnn_fuse_pass inference) -endif() +if (WITH_MKLDNN) + pass_library(conv_relu_mkldnn_fuse_pass inference) +endif () pass_library(attention_lstm_fuse_pass inference) pass_library(infer_clean_graph_pass inference) pass_library(fc_lstm_fuse_pass inference) +pass_library(embedding_fc_lstm_fuse_pass inference) pass_library(fc_gru_fuse_pass inference) pass_library(seq_concat_fc_fuse_pass inference) @@ -49,6 +50,6 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) -if(WITH_MKLDNN) - cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) -endif() +if (WITH_MKLDNN) + cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) +endif () diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..ba11f19c9273650113096be3fa23ca077bbc7dd9 --- /dev/null +++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc @@ -0,0 +1,243 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h" +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" + +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/operators/math/fc_compute.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace framework { +namespace ir { + +static int BuildFusion(Graph* graph, const std::string& name_scope, + Scope* scope, bool with_fc_bias) { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + // Build pattern + PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "x")) + ->assert_is_op_input("lookup_table") + ->assert_var_not_persistable(); + patterns::Embedding embedding_pattern(pattern, name_scope); + // TODO(jczaja): Intermediate can only be for val that are not used anywhere + // but lookup table output may go into other LSTM (for reverse + // direction) + auto* embedding_out = embedding_pattern(x); + patterns::FC fc_pattern(pattern, name_scope); + + // fc_out is a tmp var, will be removed after fuse, so marked as intermediate. + auto* fc_out = fc_pattern(embedding_out, with_fc_bias)->AsIntermediate(); + patterns::LSTM lstm_pattern(pattern, name_scope); + lstm_pattern(fc_out); + + // Create New OpDesc + auto embedding_lstm_creator = [&](Node* embedding, Node* W, Node* lstm, + Node* input, Node* weight_x, Node* weight_h, + Node* bias, Node* hidden, Node* cell, + Node* xx, Node* fc_bias) { + OpDesc op_desc; + op_desc.SetType("fused_embedding_fc_lstm"); +#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__->Name()}); + SET_IN(Ids, input); + SET_IN(WeightH, weight_h); + // Neet to have this passed as We need Wc data for peephole connections + SET_IN(Bias, bias); +#undef SET_IN + + // Multiply embeddings with Weights + PADDLE_ENFORCE(scope); + const std::string& embeddings = patterns::UniqueKey("Embeddings"); + auto* embeddings_var = scope->Var(embeddings); + PADDLE_ENFORCE(embeddings_var); + auto* embeddings_tensor = + embeddings_var->GetMutable(); + // Get WeightX size: [single_embedding, fc_size] + // and embedding size: [dict_size, single_embedding] + // and create new size of embeddings eg. [dict_size , hidden_size] + auto* embedding_var = scope->FindVar(W->Name()); + PADDLE_ENFORCE(embedding_var); + const auto& embedding_tensor = embedding_var->Get(); + + const auto& weightx_tensor = + scope->FindVar(weight_x->Name())->Get(); + embeddings_tensor->Resize( + {embedding_tensor.dims()[0], weightx_tensor.dims()[1]}); + + // Multiplie embeddings via WeightsX and add bias + auto embedding_data = embedding_tensor.data(); + auto weightx_data = weightx_tensor.data(); + auto embeddings_data = + embeddings_tensor->mutable_data(platform::CPUPlace()); + + // Adding biases to GEMM result to be + auto* lstm_bias_var = scope->FindVar(bias->Name()); + PADDLE_ENFORCE(lstm_bias_var); + const auto& lstm_bias_tensor = lstm_bias_var->Get(); + + auto alpha = 1.0f; + auto beta = 1.0f; + int m = embedding_tensor.dims()[0]; + int n = weightx_tensor.dims()[1]; + int k = embedding_tensor.dims()[1]; + + // Copy only gate biases values (only actual bias data, not peephole + // weights) + std::vector combined_biases; + combined_biases.reserve(n); + std::copy_n(lstm_bias_tensor.data(), n, + std::back_inserter(combined_biases)); + + if (with_fc_bias) { + // Add FC-bias with LSTM-bias (into GEMM result to be) + auto* fc_bias_var = scope->FindVar(fc_bias->Name()); + const auto& fc_bias_tensor = fc_bias_var->Get(); + for (int i = 0; i < fc_bias_tensor.numel(); i++) { + combined_biases[i] += fc_bias_tensor.data()[i]; + } + } + + // broadcast biases + std::vector ones(m, 1.0f); + paddle::operators::math::CBlas::GEMM( + CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, 1, alpha, &ones[0], 1, + &combined_biases[0], n, 0.0f, embeddings_data, n); + + // Wx*embeddings + biases + paddle::operators::math::CBlas::GEMM( + CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha, + embedding_data, k, weightx_data, n, beta, embeddings_data, n); + op_desc.SetInput("Embeddings", {embeddings}); + + // Create temp variables. + const std::string BatchedInput = patterns::UniqueKey("BatchedInput"); + const std::string BatchedCellPreAct = + patterns::UniqueKey("BatchedCellPreAct"); + const std::string BatchedGate = patterns::UniqueKey("BatchedGate"); + + scope->Var(BatchedInput)->GetMutable(); + scope->Var(BatchedCellPreAct)->GetMutable(); + scope->Var(BatchedGate)->GetMutable(); + + op_desc.SetInput("H0", {}); + op_desc.SetInput("C0", {}); + op_desc.SetOutput("Hidden", {hidden->Name()}); + op_desc.SetOutput("Cell", {cell->Name()}); + op_desc.SetOutput("XX", {xx->Name()}); + op_desc.SetOutput("BatchedGate", {BatchedGate}); + op_desc.SetOutput("BatchCellPreAct", {BatchedCellPreAct}); + op_desc.SetOutput("BatchedInput", {BatchedInput}); + op_desc.SetAttr("is_reverse", lstm->Op()->GetAttr("is_reverse")); + op_desc.SetAttr("use_peepholes", lstm->Op()->GetAttr("use_peepholes")); + // TODO(TJ): get from attr + op_desc.SetAttr("use_seq", true); + + PADDLE_ENFORCE(graph->Has(kParamScopeAttr)); + auto* scope = graph->Get(kParamScopeAttr); +#define OP_SET_OUT(x) \ + const std::string x = patterns::UniqueKey(#x); \ + op_desc.SetOutput(#x, {x}); \ + scope->Var(x)->GetMutable() + OP_SET_OUT(BatchedCell); + OP_SET_OUT(BatchedHidden); + OP_SET_OUT(ReorderedH0); + OP_SET_OUT(ReorderedC0); +#undef OP_SET_OUT + + auto* op = graph->CreateOpNode(&op_desc); + IR_NODE_LINK_TO(input, op); + IR_NODE_LINK_TO(weight_x, op); + IR_NODE_LINK_TO(weight_h, op); + IR_NODE_LINK_TO(bias, op); + IR_NODE_LINK_TO(op, hidden); + return op; + }; + + int fusion_count{0}; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(lstm, lstm, lstm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, lstm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, lstm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(Cell, Cell, lstm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(Hidden, Hidden, lstm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table, lookup_table, embedding_pattern); + GET_IR_NODE_FROM_SUBGRAPH(W, W, embedding_pattern); + GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern); + + // TODO(jczaja): Add support for is_sparse / is_distributed + auto is_sparse = boost::get(lookup_table->Op()->GetAttr("is_sparse")); + auto is_distributed = + boost::get(lookup_table->Op()->GetAttr("is_distributed")); + + if (is_sparse == true || is_distributed == true) { + return; + } + + if (with_fc_bias) { + GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern); + embedding_lstm_creator(lookup_table, W, lstm, subgraph.at(x), w, Weight, + Bias, Hidden, Cell, fc_out, fc_bias); + // Remove unneeded nodes. + // TODO(jczaja): Proper removing of lookup table + std::unordered_set marked_nodes( + //{lookup_table, mul, lstm, elementwise_add, fc_bias, W}); + {mul, lstm, elementwise_add, fc_bias}); + GraphSafeRemoveNodes(graph, marked_nodes); + } else { + GET_IR_NODE_FROM_SUBGRAPH(fc_out, mul_out, fc_pattern); + embedding_lstm_creator(lookup_table, W, lstm, subgraph.at(x), w, Weight, + Bias, Hidden, Cell, fc_out, nullptr); + // Remove unneeded nodes. + // TODO(jczaja): Proper removing of lookup table + // std::unordered_set marked_nodes({lookup_table, W, mul, + // lstm}); + std::unordered_set marked_nodes({mul, lstm}); + GraphSafeRemoveNodes(graph, marked_nodes); + } + + ++fusion_count; + }; + + gpd(graph, handler); + + return fusion_count; +} + +std::unique_ptr EmbeddingFCLSTMFusePass::ApplyImpl( + std::unique_ptr graph) const { + FusePassBase::Init(name_scope_, graph.get()); + + int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(), + true /*with_fc_bias*/); + + AddStatis(fusion_count); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(embedding_fc_lstm_fuse_pass, + paddle::framework::ir::EmbeddingFCLSTMFusePass); diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..e5ad3067ec4060e41f1464395f3fc76183de3e66 --- /dev/null +++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h @@ -0,0 +1,40 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +// Fusing of Embedding , FC and LSTM op + +// Just FC without bias +class EmbeddingFCLSTMFusePass : public FusePassBase { + public: + virtual ~EmbeddingFCLSTMFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + + const std::string name_scope_{"embedding_fc_lstm_fuse"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 6d2c51b0e9bed8461f6491b84a36a3bf6663a138..46c6a52c09e896596aa6d8e1e901955a68a4957d 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -692,6 +692,24 @@ PDNode *patterns::FC::operator()(paddle::framework::ir::PDNode *x, } } +PDNode *patterns::Embedding::operator()(PDNode *x) { + x->assert_is_op_input("lookup_table", "Ids"); + auto *lookup_table_op = + pattern->NewNode(lookup_table_repr())->assert_is_op("lookup_table"); +#define NEW_NODE(arg__, io__) \ + auto *arg__ = pattern->NewNode(arg__##_repr()) \ + ->assert_is_op_##io__("lookup_table", #arg__); + + NEW_NODE(W, input); + + NEW_NODE(Out, output); +#undef NEW_NODE + + lookup_table_op->LinksFrom({x, W}); + lookup_table_op->LinksTo({Out}); + return Out; +} + PDNode *patterns::LSTM::operator()(PDNode *x) { x->assert_is_op_input("lstm", "Input"); auto *lstm_op = pattern->NewNode(lstm_repr())->assert_is_op("lstm"); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 69b486c29d8bd1102a8372f5041051c25ce19359..508113bf4fcab274394f2705c36eddbf4ba3c77a 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -418,6 +418,23 @@ struct FC : public PatternBase { PATTERN_DECL_NODE(Out); }; +// Embedding +struct Embedding : public PatternBase { + Embedding(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "embedding") {} + + PDNode* operator()(PDNode* x); + + // declare operator node's name + PATTERN_DECL_NODE(lookup_table); + // Inputs + // + PATTERN_DECL_NODE(Ids); + PATTERN_DECL_NODE(W); // embeddings + // Outputs + PATTERN_DECL_NODE(Out); +}; + struct LSTM : public PatternBase { LSTM(PDPattern* pattern, const std::string& name_scope) : PatternBase(pattern, name_scope, "lstm") {} diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc new file mode 100644 index 0000000000000000000000000000000000000000..53d39513f3686cea59e2d56ff62eec9869f3b2de --- /dev/null +++ b/paddle/fluid/framework/naive_executor.cc @@ -0,0 +1,150 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace framework { + +// These code can be shared with Executor. +static void InitializeVariable(Variable *var, proto::VarType::Type var_type) { + if (var_type == proto::VarType::LOD_TENSOR) { + var->GetMutable(); + } else if (var_type == proto::VarType::SELECTED_ROWS) { + var->GetMutable(); + } else if (var_type == proto::VarType::FEED_MINIBATCH) { + var->GetMutable(); + } else if (var_type == proto::VarType::FETCH_LIST) { + var->GetMutable(); + } else if (var_type == proto::VarType::STEP_SCOPES) { + var->GetMutable>(); + } else if (var_type == proto::VarType::LOD_RANK_TABLE) { + var->GetMutable(); + } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { + var->GetMutable(); + } else if (var_type == proto::VarType::PLACE_LIST) { + var->GetMutable(); + } else if (var_type == proto::VarType::READER) { + var->GetMutable(); + } else if (var_type == proto::VarType::RAW) { + // GetMutable will be called in operator + } else { + PADDLE_THROW( + "Variable type %d is not in " + "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, " + "LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]", + var_type); + } +} + +void NaiveExecutor::Prepare(Scope *parent_scope, + const ProgramDesc &program_desc, int block_id, + bool with_feed_fetch_ops) { + if (!parent_scope) { + scope_ = new framework::Scope; + } else { + scope_ = &parent_scope->NewScope(); + } + CreateVariables(program_desc, scope_, block_id); + CreateOps(program_desc, block_id, with_feed_fetch_ops); +} + +void NaiveExecutor::Run() { + for (auto &op : ops_) { + VLOG(4) << "run " << op->Type(); + op->Run(*scope_, place_); + } +} + +void NaiveExecutor::CreateVariables(const ProgramDesc &desc, Scope *scope, + int block_id) { + PADDLE_ENFORCE(scope); + auto &global_block = desc.Block(block_id); + + const Scope *ancestor_scope = scope; + while (ancestor_scope->parent()) { + ancestor_scope = ancestor_scope->parent(); + } + + if (ancestor_scope != scope) { + for (auto &var : global_block.AllVars()) { + if (var->Name() == framework::kEmptyVarName) { + continue; + } + // Create persistable vars in ancestor scope. + if (var->Persistable()) { + auto *ptr = const_cast(ancestor_scope)->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " global, which pointer is " << ptr; + } else { // Create temporary variables in local scope. + auto *ptr = scope->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " locally, which pointer is " << ptr; + } + } + } else { + for (auto &var : global_block.AllVars()) { + auto *ptr = scope->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + VLOG(3) << "Create variable " << var->Name() << ", which pointer is " + << ptr; + } + } +} + +void NaiveExecutor::CreateOps(const ProgramDesc &desc, int block_id, + bool with_feed_fetch_ops) { + for (const auto &op_desc : desc.Block(block_id).AllOps()) { + if (!with_feed_fetch_ops && + (op_desc->Type() == "feed" || op_desc->Type() == "fetch")) { + string::PrettyLogEndl(string::Style::detail(), "--- skip [%s], %s -> %s", + op_desc->Input("X")[0], op_desc->Type(), + op_desc->Output("Out")[0]); + continue; + } + ops_.emplace_back(OpRegistry::CreateOp(*op_desc)); + } +} + +LoDTensor *NaiveExecutor::FindTensor(const std::string &name) { + PADDLE_ENFORCE(scope_, "Need to init scope first"); + auto *var = scope_->FindVar(name); + PADDLE_ENFORCE(var, "No variable [%s] in the scope"); + auto *tensor = const_cast(&var->Get()); + return tensor; +} + +void NaiveExecutor::CleanFeedFetchOps() { + std::vector> ops; + for (auto &op : ops_) { + if (op->Type() != "feed" && op->Type() != "fetch") { + ops.emplace_back(std::move(op)); + } + } + ops_.swap(ops); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h new file mode 100644 index 0000000000000000000000000000000000000000..9355e9e36a6358aa91553dca35aaf1b658516a0a --- /dev/null +++ b/paddle/fluid/framework/naive_executor.h @@ -0,0 +1,63 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { + +/* + * Simple, intuitive and effective. Only single thread is supported, and + * currently designed for inference. + */ +class NaiveExecutor { + public: + explicit NaiveExecutor(const platform::Place& place) : place_(place) {} + + // Create child scope. + // Create variables. + // @with_feed_fetch_ops: whether to work with the feed and fetch operators. + void Prepare(Scope* parent_scope, const ProgramDesc& program_desc, + int block_id, bool with_feed_fetch_ops); + + // Run all the operators. + void Run(); + + // Get an tensor to operating directly, without the need for feed_ops. + LoDTensor* FindTensor(const std::string& name); + + Scope* scope() { return scope_; } + + void CleanFeedFetchOps(); + + protected: + void CreateVariables(const ProgramDesc& desc, Scope* scope, int block_id); + + void CreateOps(const ProgramDesc& desc, int block_id, + bool with_feed_fetch_ops); + + private: + const platform::Place place_; + // Catch the required resource to avoid recreate. + std::vector> ops_; + Scope* scope_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/naive_executor_test.cc b/paddle/fluid/framework/naive_executor_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6b9f79b9d398bc5a0ee6ba66587924daad0dbbc5 --- /dev/null +++ b/paddle/fluid/framework/naive_executor_test.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/naive_executor.h" +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { + +TEST(NaiveExecutor, Basic) { + ProgramDesc program; + auto* main_block = program.MutableBlock(0); + auto* a = main_block->Var("a"); // input + auto* b = main_block->Var("b"); // input + auto* c = main_block->Var("c"); // input + a->SetType(proto::VarType::LOD_TENSOR); + b->SetType(proto::VarType::LOD_TENSOR); + c->SetType(proto::VarType::LOD_TENSOR); + + auto* add = main_block->AppendOp(); + add->SetType("elementwise_add"); + add->SetInput("X", {"a"}); + add->SetInput("Y", {"b"}); + add->SetOutput("Out", {"c"}); + + auto place = platform::CPUPlace(); + NaiveExecutor exe(place); + exe.Prepare(nullptr, program, 0, false /*with feed fetch ops*/); + auto* a_tensor = exe.FindTensor("a"); + auto* b_tensor = exe.FindTensor("b"); + auto* c_tensor = exe.FindTensor("c"); + + a_tensor->Resize({1, 4}); + b_tensor->Resize({1, 4}); + c_tensor->Resize({1, 4}); + b_tensor->mutable_data(place); + a_tensor->mutable_data(place); + + float a_arr[] = {0, 1, 2, 3}; + float b_arr[] = {0.0, .1, .2, .3}; + + std::copy_n(a_arr, 4, a_tensor->mutable_data(place)); + std::copy_n(b_arr, 4, b_tensor->mutable_data(place)); + + exe.Run(); + + auto* c_data = c_tensor->mutable_data(place); + for (int i = 0; i < 4; i++) { + EXPECT_NEAR(c_data[i], 1.1 * i, 1e-3); + } +} + +} // namespace framework +} // namespace paddle + +USE_OP(elementwise_add); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index e800cb9993ddde45de7c33b11994359e77710daf..96624e33c6323dee7b6534673278b6b1b6343ae0 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -154,9 +154,15 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { platform::SetDeviceId(dev_id); #endif } - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - platform::RecordEvent record_event(Type(), pool.Get(place)); + + if (platform::IsProfileEnabled()) { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + platform::RecordEvent record_event(Type(), pool.Get(place)); + } + RunImpl(scope, place); + if (VLOG_IS_ON(3)) { VLOG(3) << place << " " << DebugStringEx(&scope); } diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 720d17a654bf96ca2bad43cc0c4374b2303ac233..ed4feaec1c1696ca837d98e2029eff438036fdd5 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -156,10 +156,12 @@ ParallelExecutor::ParallelExecutor( params, member_->local_scopes_, member_->use_cuda_); #endif - // If the loss_var_name is given, the number of graph should be only one. - if (loss_var_name.size()) { - PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1, - "The number of graph should be only one"); + if (VLOG_IS_ON(5)) { + // If the loss_var_name is given, the number of graph should be only one. + if (loss_var_name.size()) { + PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1, + "The number of graph should be only one"); + } } if (exec_strategy.type_ == ExecutionStrategy::kDefault) { diff --git a/paddle/fluid/framework/tuple.h b/paddle/fluid/framework/tuple.h index f6c6a1fec13d8b12efd1fa71a7a93316e484d045..508ee931c6ed7f66e09abd8f0e4b33c3d3c135fd 100644 --- a/paddle/fluid/framework/tuple.h +++ b/paddle/fluid/framework/tuple.h @@ -17,7 +17,6 @@ limitations under the License. */ #include #include #include -#include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/var_desc.h" diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc index 1aa0ae0f7c1946d91736ab61236a65a45c203fe3..7e3f002b53351ba5892aaa50482b21a83db94069 100644 --- a/paddle/fluid/framework/var_desc.cc +++ b/paddle/fluid/framework/var_desc.cc @@ -88,13 +88,7 @@ std::vector> VarDesc::GetShapes() const { } void VarDesc::SetDataType(proto::VarType::Type data_type) { - switch (desc_.type().type()) { - case proto::VarType::CHANNEL: - mutable_channel_desc()->set_data_type(data_type); - break; - default: - mutable_tensor_desc()->set_data_type(data_type); - } + mutable_tensor_desc()->set_data_type(data_type); } void VarDesc::SetDataTypes( @@ -115,13 +109,7 @@ void VarDesc::SetDataTypes( } proto::VarType::Type VarDesc::GetDataType() const { - switch (desc_.type().type()) { - case proto::VarType::CHANNEL: - return channel_desc().data_type(); - break; - default: - return tensor_desc().data_type(); - } + return tensor_desc().data_type(); } std::vector VarDesc::GetDataTypes() const { @@ -134,17 +122,6 @@ std::vector VarDesc::GetDataTypes() const { return res; } -void VarDesc::SetCapacity(int64_t capacity) { - switch (desc_.type().type()) { - case proto::VarType::CHANNEL: - desc_.mutable_type()->mutable_channel()->set_capacity(capacity); - break; - default: - PADDLE_THROW("Setting 'capacity' is not supported by the type of var %s.", - this->Name()); - } -} - void VarDesc::SetLoDLevel(int32_t lod_level) { switch (desc_.type().type()) { case proto::VarType::LOD_TENSOR: @@ -214,19 +191,6 @@ std::vector VarDesc::GetLoDLevels() const { } } -const proto::VarType::ChannelDesc &VarDesc::channel_desc() const { - PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set."); - PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set."); - switch (desc_.type().type()) { - case proto::VarType::CHANNEL: - return desc_.type().channel(); - default: - PADDLE_THROW( - "Getting 'channel_desc' is not supported by the type of var %s.", - this->Name()); - } -} - const proto::VarType::TensorDesc &VarDesc::tensor_desc() const { PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set."); PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set."); @@ -262,20 +226,6 @@ std::vector VarDesc::tensor_descs() const { } } -proto::VarType::ChannelDesc *VarDesc::mutable_channel_desc() { - PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set."); - PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set."); - switch (desc_.type().type()) { - case proto::VarType::CHANNEL: - return desc_.mutable_type()->mutable_channel(); - default: - PADDLE_THROW( - "Getting 'mutable_channel_desc' is not supported by the type of var " - "%s.", - this->Name()); - } -} - proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() { PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set."); PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set."); diff --git a/paddle/fluid/framework/var_desc.h b/paddle/fluid/framework/var_desc.h index 9f7a21ef42b8d3e74b6e211d6254294ba1fa2341..e33849ef502fb10b913e7e28cbd0abdb8b8ff9bb 100644 --- a/paddle/fluid/framework/var_desc.h +++ b/paddle/fluid/framework/var_desc.h @@ -87,8 +87,6 @@ class VarDesc { void SetDataTypes( const std::vector &multiple_data_type); - void SetCapacity(int64_t capacity); - proto::VarType::Type GetDataType() const; std::vector GetDataTypes() const; @@ -110,10 +108,8 @@ class VarDesc { void SetPersistable(bool persistable) { desc_.set_persistable(persistable); } private: - const proto::VarType::ChannelDesc &channel_desc() const; const proto::VarType::TensorDesc &tensor_desc() const; std::vector tensor_descs() const; - proto::VarType::ChannelDesc *mutable_channel_desc(); proto::VarType::TensorDesc *mutable_tensor_desc(); std::vector mutable_tensor_descs(); diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h index e9550dbfb976bee70741158b94b04084919e8271..3b6f1cdb8f24ab20bfc80eeeba88891d7b41d1f9 100644 --- a/paddle/fluid/framework/var_type.h +++ b/paddle/fluid/framework/var_type.h @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -41,8 +40,6 @@ inline proto::VarType::Type ToVarType(std::type_index type) { return proto::VarType_Type_SELECTED_ROWS; } else if (IsType(type)) { return proto::VarType_Type_READER; - } else if (IsType(type)) { - return proto::VarType_Type_CHANNEL; } else { PADDLE_THROW("ToVarType:Unsupported type %s", type.name()); } @@ -66,9 +63,6 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) { case proto::VarType_Type_READER: visitor(var.Get()); return; - case proto::VarType_Type_CHANNEL: - visitor(var.Get()); - return; default: PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type())); } diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 6698efd1fa773127a84b4bcb28f57f4226dd7ae2..db381bbc3911ad9650162d9b9012580e5b638828 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -53,7 +53,7 @@ if(NOT APPLE) endif() if(WITH_TESTING) - # tests/book depends the models that generated by python/paddle/fluid/tests/book + # tests/book depends the models that generated by python/paddle/fluid/tests/book add_subdirectory(tests/book) if(WITH_INFERENCE_API_TEST) add_subdirectory(tests/api) diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index c2a1c6634bd8f8de0796456e91cb3c530d4c6823..c740ea009f6cfc2ea250d8f1abdd7d442c2a0bb0 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -1,6 +1,6 @@ cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass) set(analysis_deps - framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor pretty_log) + framework_proto proto_desc ir_pass_manager graph pass paddle_fluid_api executor pretty_log) cc_library(analysis SRCS pass_manager.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc analyzer.cc diff --git a/paddle/fluid/inference/analysis/analysis_pass.h b/paddle/fluid/inference/analysis/analysis_pass.h index b6edb5529ace2ad5bd1b35bfbee1f7a744457cc3..13805ea4acf936b242bcd86b2faf89813753a9fe 100644 --- a/paddle/fluid/inference/analysis/analysis_pass.h +++ b/paddle/fluid/inference/analysis/analysis_pass.h @@ -41,12 +41,6 @@ class AnalysisPass { // all passes have run. virtual bool Finalize() { return false; } - // Get a Pass appropriate to print the Node this pass operates on. - virtual AnalysisPass *CreatePrinterPass(std::ostream &os, - const std::string &banner) const { - return nullptr; - } - // Create a debugger Pass that draw the DFG by graphviz toolkit. virtual AnalysisPass *CreateGraphvizDebugerPass() const { return nullptr; } diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index 9bdbefc07cbc4bf7a4714927c84855837610430e..0aa9367bf5692e53e2a1f1247523cf9a4f0b3a1d 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -64,14 +64,15 @@ class Analyzer : public OrderedRegistry { // larger fusion. const std::vector all_ir_passes_{{ // Manual update the passes here. - "infer_clean_graph_pass", // - "attention_lstm_fuse_pass", // - "fc_lstm_fuse_pass", // - "mul_lstm_fuse_pass", // - "fc_gru_fuse_pass", // - "mul_gru_fuse_pass", // - "seq_concat_fc_fuse_pass", // - "fc_fuse_pass", // + "infer_clean_graph_pass", // + "attention_lstm_fuse_pass", // + "embedding_fc_lstm_fuse_pass", // + "fc_lstm_fuse_pass", // + "mul_lstm_fuse_pass", // + "fc_gru_fuse_pass", // + "mul_gru_fuse_pass", // + "seq_concat_fc_fuse_pass", // + "fc_fuse_pass", // #ifdef PADDLE_WITH_MKLDNN "conv_relu_mkldnn_fuse_pass", // #endif diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index e569df94c54c304852dab7c7496804c1b08d665c..32d58b87413c95908644ffba31bbec22d8e23201 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -18,10 +18,10 @@ if(APPLE) endif(APPLE) -set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager ${GLOB_PASS_LIB}) +set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB}) if(WITH_GPU AND TENSORRT_FOUND) - set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine) + set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine analysis_predictor) endif() function(inference_api_test TARGET_NAME) @@ -43,8 +43,10 @@ function(inference_api_test TARGET_NAME) endif(WITH_TESTING) endfunction(inference_api_test) -cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor) -cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis) +cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope) +cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor) +cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api) +cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api) cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api) @@ -52,18 +54,22 @@ cc_test(test_paddle_inference_api inference_api_test(test_api_impl SRC api_impl_tester.cc ARGS test_word2vec test_image_classification) +set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) +cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} paddle_inference_api + ARGS --dirname=${PYTHON_TESTS_DIR}/book) + if(WITH_GPU AND TENSORRT_FOUND) cc_library(paddle_inference_tensorrt_subgraph_engine SRCS api_tensorrt_subgraph_engine.cc - DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter) + DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter zero_copy_tensor_dummy) inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec) endif() if (WITH_ANAKIN AND WITH_MKL) # only needed in CI # compile the libinference_anakin_api.a and anakin.so. - cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml) - cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber) + cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber mklml scope zero_copy_tensor_dummy) + cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber scope) function(anakin_target target_name) target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) endfunction() diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 1032aadcbda4f1b05841e08e1abe7c737c3aeb9c..0c11694d5a905be4d9f0c6ebbc6159a4dc4a346e 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -16,11 +16,15 @@ #include #include #include +#include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" +#include "paddle/fluid/inference/api/timer.h" #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/platform/profiler.h" @@ -28,8 +32,11 @@ DECLARE_bool(profile); namespace paddle { +using contrib::AnalysisConfig; + bool AnalysisPredictor::Init( - const std::shared_ptr& parent_scope) { + const std::shared_ptr &parent_scope, + const std::shared_ptr &program) { VLOG(3) << "Predictor::init()"; #if !defined(_WIN32) if (FLAGS_profile) { @@ -43,7 +50,8 @@ bool AnalysisPredictor::Init( if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); - LOG(WARNING) << "ir optimize only supports CPU currently"; + LOG(WARNING) << "ir optimize only supports CPU currently, enable_ir_optim " + "is turned false."; config_.enable_ir_optim = false; } else { place_ = paddle::platform::CPUPlace(); @@ -56,37 +64,134 @@ bool AnalysisPredictor::Init( scope_.reset(new paddle::framework::Scope()); } - executor_.reset(new paddle::framework::Executor(place_)); + executor_.reset(new paddle::framework::NaiveExecutor(place_)); - // Initialize the inference program - if (!config_.model_dir.empty()) { - // Parameters are saved in separate files sited in - // the specified `dirname`. - inference_program_ = paddle::inference::Load(executor_.get(), scope_.get(), - config_.model_dir); - } else if (!config_.prog_file.empty() && !config_.param_file.empty()) { - // All parameters are saved in a single file. - // The file names should be consistent with that used - // in Python API `fluid.io.save_inference_model`. - inference_program_ = paddle::inference::Load( - executor_.get(), scope_.get(), config_.prog_file, config_.param_file); + if (!program) { + if (!LoadProgramDesc()) return false; + OptimizeInferenceProgram(); } else { - LOG(ERROR) << "fail to load inference model from " << config_.model_dir; + inference_program_ = program; + } + executor_->Prepare(scope_.get(), *inference_program_, 0, + config_.use_feed_fetch_ops); + + // Get the feed_target_names and fetch_target_names + PrepareFeedFetch(); + return true; +} + +bool AnalysisPredictor::Run(const std::vector &inputs, + std::vector *output_data, + int batch_size) { + VLOG(3) << "Predictor::predict"; + inference::Timer timer; + timer.tic(); + // set feed variable + std::vector feeds; + framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get(); + if (!SetFeed(inputs, scope)) { + LOG(ERROR) << "fail to set feed"; return false; } + // Run the inference program + // if share variables, we need not create variables + executor_->Run(); - OptimizeInferenceProgram(); - if (config_._use_mkldnn) { - executor_->EnableMKLDNN(*inference_program_); + // get fetch variable + if (!GetFetch(output_data, scope)) { + LOG(ERROR) << "fail to get fetches"; + return false; } - ctx_ = executor_->Prepare(*inference_program_, 0); + VLOG(3) << "predict cost: " << timer.toc() << "ms"; + return true; +} - VLOG(5) << "to create variables"; - PADDLE_ENFORCE(scope_.get()); - executor_->CreateVariables(*inference_program_, - sub_scope_ ? sub_scope_ : scope_.get(), 0); - // Get the feed_target_names and fetch_target_names - PrepareFeedFetch(); +bool AnalysisPredictor::SetFeed(const std::vector &inputs, + framework::Scope *scope) { + VLOG(3) << "Predictor::set_feed"; + if (inputs.size() != feeds_.size()) { + LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get " + << inputs.size(); + return false; + } + + // Cache the inputs memory for better concurrency performance. + feed_tensors_.resize(inputs.size()); + + for (size_t i = 0; i < inputs.size(); ++i) { + auto &input = feed_tensors_[i]; + framework::DDim ddim = framework::make_ddim(inputs[i].shape); + void *input_ptr; + if (inputs[i].dtype == PaddleDType::INT64) { + input_ptr = input.mutable_data(ddim, platform::CPUPlace()); + } else if (inputs[i].dtype == PaddleDType::FLOAT32) { + input_ptr = input.mutable_data(ddim, platform::CPUPlace()); + } else { + LOG(ERROR) << "unsupported feed type " << inputs[i].dtype; + return false; + } + + // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. + std::memcpy(static_cast(input_ptr), inputs[i].data.data(), + inputs[i].data.length()); + // TODO(Superjomn) Low performance, need optimization for heavy LoD copy. + framework::LoD lod; + for (auto &level : inputs[i].lod) { + lod.emplace_back(level); + } + input.set_lod(lod); + int idx = -1; + if (config_.specify_input_name) { + idx = feed_names_[inputs[i].name]; + } else { + idx = boost::get(feeds_[i]->GetAttr("col")); + } + framework::SetFeedVariable(scope, input, "feed", idx); + } + return true; +} + +template +void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch, + PaddleTensor *output) { + // set shape. + auto shape = framework::vectorize(fetch.dims()); + output->shape.assign(shape.begin(), shape.end()); + // set data. + const T *data = fetch.data(); + int num_elems = inference::VecReduceToInt(shape); + output->data.Resize(num_elems * sizeof(T)); + // The fetched tensor output by fetch op, should always in CPU memory, so just + // copy. + memcpy(output->data.data(), data, num_elems * sizeof(T)); + // set lod + output->lod.clear(); + for (auto &level : fetch.lod()) { + output->lod.emplace_back(level.begin(), level.end()); + } +} + +bool AnalysisPredictor::GetFetch(std::vector *outputs, + framework::Scope *scope) { + VLOG(3) << "Predictor::get_fetch"; + outputs->resize(fetchs_.size()); + for (size_t i = 0; i < fetchs_.size(); ++i) { + int idx = boost::get(fetchs_[i]->GetAttr("col")); + PADDLE_ENFORCE((size_t)idx == i); + framework::LoDTensor &fetch = + framework::GetFetchVariable(*scope, "fetch", idx); + auto type = fetch.type(); + auto output = &(outputs->at(i)); + if (type == typeid(float)) { + GetFetchOne(fetch, output); + output->dtype = PaddleDType::FLOAT32; + } else if (type == typeid(int64_t)) { + GetFetchOne(fetch, output); + output->dtype = PaddleDType::INT64; + } else { + LOG(ERROR) << "unknown type, only support float32 and int64 now."; + } + } return true; } @@ -107,6 +212,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { new std::string(config_.prog_file)); argument_.fluid_model_param_path.reset(new std::string(config_.param_file)); } + argument_.origin_program_desc.reset( new ProgramDesc(*inference_program_->Proto())); PADDLE_ENFORCE( @@ -127,9 +233,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() { } template <> -std::unique_ptr -CreatePaddlePredictor( - const contrib::AnalysisConfig& config) { +std::unique_ptr CreatePaddlePredictor< + AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) { VLOG(3) << "create AnalysisConfig"; if (config.use_gpu) { // 1. GPU memeroy @@ -150,15 +255,90 @@ CreatePaddlePredictor( } std::unique_ptr predictor(new AnalysisPredictor(config)); - if (!dynamic_cast(predictor.get())->Init(nullptr)) { + if (!dynamic_cast(predictor.get())->Init(nullptr)) { return nullptr; } return predictor; } +void AnalysisPredictor::PrepareFeedFetch() { + for (auto *op : inference_program_->Block(0).AllOps()) { + if (op->Type() == "feed") { + int idx = boost::get(op->GetAttr("col")); + if (feeds_.size() <= static_cast(idx)) { + feeds_.resize(idx + 1); + } + feeds_[idx] = op; + feed_names_[op->Output("Out")[0]] = idx; + } else if (op->Type() == "fetch") { + int idx = boost::get(op->GetAttr("col")); + if (fetchs_.size() <= static_cast(idx)) { + fetchs_.resize(idx + 1); + } + fetchs_[idx] = op; + } + } +} + +std::unique_ptr AnalysisPredictor::GetInputTensor( + const std::string &name) { + PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name); + std::unique_ptr res( + new ZeroCopyTensor(static_cast(executor_->scope()))); + res->input_or_output_ = true; + res->SetName(name); + return res; +} + +std::unique_ptr AnalysisPredictor::GetOutputTensor( + const std::string &name) { + PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name); + std::unique_ptr res( + new ZeroCopyTensor(static_cast(executor_->scope()))); + res->input_or_output_ = false; + res->SetName(name); + return res; +} + +bool AnalysisPredictor::ZeroCopyRun() { + executor_->Run(); + return true; +} + +bool AnalysisPredictor::LoadProgramDesc() { + // Initialize the inference program + std::unique_ptr tmp_exe( + new framework::Executor(platform::CPUPlace())); + if (!config_.model_dir.empty()) { + // Parameters are saved in separate files sited in + // the specified `dirname`. + inference_program_ = paddle::inference::Load( + static_cast(tmp_exe.get()), scope_.get(), + config_.model_dir); + } else if (!config_.prog_file.empty() && !config_.param_file.empty()) { + // All parameters are saved in a single file. + // The file names should be consistent with that used + // in Python API `fluid.io.save_inference_model`. + inference_program_ = paddle::inference::Load( + static_cast(tmp_exe.get()), scope_.get(), + config_.prog_file, config_.param_file); + } else { + LOG(ERROR) << string::Sprintf( + "not valid model path '%s' or program path '%s'.", config_.model_dir, + config_.param_file); + return false; + } + return true; +} +std::unique_ptr AnalysisPredictor::Clone() { + auto *x = new AnalysisPredictor(config_); + x->Init(scope_, inference_program_); + return std::unique_ptr(x); +} + template <> std::unique_ptr CreatePaddlePredictor( - const contrib::AnalysisConfig& config) { + const contrib::AnalysisConfig &config) { return CreatePaddlePredictor(config); } diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index aa00e8be5c28c2e3bfe74fa0bff2c72210bd106e..0d01d7ac2b29ea6364b07af9bb3bdeb5ced6bd00 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -12,42 +12,81 @@ // See the License for the specific language governing permissions and // limitations under the License. +#pragma once #include #include +#include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/string/printf.h" namespace paddle { using inference::analysis::Argument; using inference::analysis::Analyzer; using framework::proto::ProgramDesc; +using framework::NaiveExecutor; +using contrib::AnalysisConfig; /* This predictor is based on the original native predictor with IR and Analysis * support. It will optimize IR and Parameters in the runtime. * TODO(Superjomn) Replace the Navive predictor? */ -class AnalysisPredictor : public NativePaddlePredictor { +class AnalysisPredictor : public PaddlePredictor { public: - explicit AnalysisPredictor(const contrib::AnalysisConfig& config) - : NativePaddlePredictor(config), config_(config) {} + explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {} - bool Init(const std::shared_ptr& parent_scope); + bool Init(const std::shared_ptr &parent_scope, + const std::shared_ptr &program = nullptr); - bool Run(const std::vector& inputs, - std::vector* output_data, - int batch_size = -1) override { - return NativePaddlePredictor::Run(inputs, output_data, batch_size); - } + bool Run(const std::vector &inputs, + std::vector *output_data, + int batch_size = -1) override; + + std::unique_ptr GetInputTensor( + const std::string &name) override; + std::unique_ptr GetOutputTensor( + const std::string &name) override; + + bool ZeroCopyRun() override; + + void PrepareFeedFetch(); void OptimizeInferenceProgram(); - Argument& analysis_argument() { return argument_; } + Argument &analysis_argument() { return argument_; } + + std::unique_ptr Clone() override; + + framework::Scope *scope() { return executor_->scope(); } + framework::ProgramDesc &program() { return *inference_program_; } + + protected: + bool LoadProgramDesc(); + + bool SetFeed(const std::vector &input_datas, + framework::Scope *scope); + bool GetFetch(std::vector *output_data, + framework::Scope *scope); + template + void GetFetchOne(const framework::LoDTensor &fetchs, + PaddleTensor *output_data); private: contrib::AnalysisConfig config_; Argument argument_; + std::unique_ptr executor_; + platform::Place place_; + std::shared_ptr scope_; + framework::Scope *sub_scope_{nullptr}; + std::shared_ptr inference_program_; + std::vector feeds_; + std::map feed_names_; + std::vector fetchs_; + // Memory buffer for feed inputs. The temporary LoDTensor will cause serious + // concurrency problems, so cache them. + std::vector feed_tensors_; }; } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..1d25f55b3188a684fe38df1417d114348cfa2e8a --- /dev/null +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +DEFINE_string(dirname, "", "dirname to tests."); + +namespace paddle { +namespace inference { +using contrib::AnalysisConfig; + +TEST(AnalysisPredictor, ZeroCopy) { + AnalysisConfig config; + config.model_dir = FLAGS_dirname + "/word2vec.inference.model"; + config.use_feed_fetch_ops = false; + + auto predictor = + CreatePaddlePredictor( + config); + + auto w0 = predictor->GetInputTensor("firstw"); + auto w1 = predictor->GetInputTensor("secondw"); + auto w2 = predictor->GetInputTensor("thirdw"); + auto w3 = predictor->GetInputTensor("forthw"); + + w0->Reshape({4, 1}); + w1->Reshape({4, 1}); + w2->Reshape({4, 1}); + w3->Reshape({4, 1}); + + auto* w0_data = w0->mutable_data(PaddlePlace::kCPU); + auto* w1_data = w1->mutable_data(PaddlePlace::kCPU); + auto* w2_data = w2->mutable_data(PaddlePlace::kCPU); + auto* w3_data = w3->mutable_data(PaddlePlace::kCPU); + + for (int i = 0; i < 4; i++) { + w0_data[i] = i; + w1_data[i] = i; + w2_data[i] = i; + w3_data[i] = i; + } + + predictor->ZeroCopyRun(); + + auto out = predictor->GetOutputTensor("fc_1.tmp_2"); + PaddlePlace place; + int size = 0; + auto* out_data = out->data(&place, &size); + LOG(INFO) << "output size: " << size / sizeof(float); + LOG(INFO) << "output_data: " << out_data; +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc index c71769a32f604358fe68c927546591310649f116..01ea942d3c8d20180cfc9664b8601ba87a898e86 100644 --- a/paddle/fluid/inference/api/api.cc +++ b/paddle/fluid/inference/api/api.cc @@ -1,16 +1,22 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle_inference_api.h" namespace paddle { @@ -26,7 +32,7 @@ int PaddleDtypeSize(PaddleDType dtype) { } } -PaddleBuf::PaddleBuf(PaddleBuf&& other) +PaddleBuf::PaddleBuf(PaddleBuf &&other) : data_(other.data_), length_(other.length_), memory_owned_(other.memory_owned_) { @@ -35,9 +41,9 @@ PaddleBuf::PaddleBuf(PaddleBuf&& other) other.length_ = 0; } -PaddleBuf::PaddleBuf(const PaddleBuf& other) { *this = other; } +PaddleBuf::PaddleBuf(const PaddleBuf &other) { *this = other; } -PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) { +PaddleBuf &PaddleBuf::operator=(const PaddleBuf &other) { if (!other.memory_owned_) { data_ = other.data_; length_ = other.length_; @@ -51,7 +57,7 @@ PaddleBuf& PaddleBuf::operator=(const PaddleBuf& other) { return *this; } -PaddleBuf& PaddleBuf::operator=(PaddleBuf&& other) { +PaddleBuf &PaddleBuf::operator=(PaddleBuf &&other) { // only the buffer with external memory can be copied data_ = other.data_; length_ = other.length_; @@ -75,7 +81,7 @@ void PaddleBuf::Resize(size_t length) { } } -void PaddleBuf::Reset(void* data, size_t length) { +void PaddleBuf::Reset(void *data, size_t length) { Free(); memory_owned_ = false; data_ = data; @@ -85,7 +91,7 @@ void PaddleBuf::Reset(void* data, size_t length) { void PaddleBuf::Free() { if (memory_owned_ && data_) { PADDLE_ENFORCE_GT(length_, 0); - free(static_cast(data_)); + free(static_cast(data_)); data_ = nullptr; length_ = 0; } diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index dca4386b21b4a064c21b52218682321258f368c4..53740899cd4176ae007c09b7728e504675d13248 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -145,7 +145,7 @@ bool NativePaddlePredictor::Run(const std::vector &inputs, VLOG(4) << "Run prepared context"; executor_->RunPreparedContext(ctx_.get(), scope, false, /* don't create local scope each time*/ - false /* don't create variable eatch time */); + false /* don't create variable each time */); VLOG(4) << "Finish prepared context"; // get fetch variable if (!GetFetch(output_data, scope)) { diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h index 6386d601262b3dac0e957fae991d23768b52f2c0..7882f6a53c7ce9a2486158ea9b50c018d1814091 100644 --- a/paddle/fluid/inference/api/api_impl.h +++ b/paddle/fluid/inference/api/api_impl.h @@ -1,16 +1,16 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 +http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #pragma once @@ -30,6 +30,8 @@ #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/profiler.h" @@ -52,6 +54,8 @@ class NativePaddlePredictor : public PaddlePredictor { ~NativePaddlePredictor() override; + framework::Scope *scope() { return sub_scope_ ? sub_scope_ : scope_.get(); } + protected: bool SetFeed(const std::vector &input_datas, framework::Scope *scope); diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index fc1364b80ac1ee2d304eb2fe429eae5f56967516..bed7c871311e476b4ed113d982c690383ad732de 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -21,6 +21,12 @@ limitations under the License. */ #include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/tests/test_helper.h" +#ifdef __clang__ +#define ACC_DIFF 4e-3 +#else +#define ACC_DIFF 1e-3 +#endif + DEFINE_string(dirname, "", "Directory of the inference model."); namespace paddle { @@ -43,7 +49,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { NativeConfig GetConfig() { NativeConfig config; - config.model_dir = FLAGS_dirname + "word2vec.inference.model"; + config.model_dir = FLAGS_dirname + "/word2vec.inference.model"; LOG(INFO) << "dirname " << config.model_dir; config.fraction_of_gpu_memory = 0.15; #ifdef PADDLE_WITH_CUDA @@ -99,8 +105,8 @@ void MainWord2Vec(bool use_gpu) { float* lod_data = output1.data(); for (int i = 0; i < output1.numel(); ++i) { - EXPECT_LT(lod_data[i] - data[i], 1e-3); - EXPECT_GT(lod_data[i] - data[i], -1e-3); + EXPECT_LT(lod_data[i] - data[i], ACC_DIFF); + EXPECT_GT(lod_data[i] - data[i], -ACC_DIFF); } } @@ -110,7 +116,7 @@ void MainImageClassification(bool use_gpu) { NativeConfig config = GetConfig(); config.use_gpu = use_gpu; config.model_dir = - FLAGS_dirname + "image_classification_resnet.inference.model"; + FLAGS_dirname + "/image_classification_resnet.inference.model"; const bool is_combined = false; std::vector> feed_target_shapes = @@ -144,7 +150,7 @@ void MainImageClassification(bool use_gpu) { float* data = static_cast(outputs[0].data.data()); float* lod_data = output1.data(); for (size_t j = 0; j < len / sizeof(float); ++j) { - EXPECT_NEAR(lod_data[j], data[j], 1e-3); + EXPECT_NEAR(lod_data[j], data[j], ACC_DIFF); } } @@ -199,7 +205,7 @@ void MainThreadsWord2Vec(bool use_gpu) { float* ref_data = refs[tid].data(); EXPECT_EQ(refs[tid].numel(), static_cast(len / sizeof(float))); for (int i = 0; i < refs[tid].numel(); ++i) { - EXPECT_NEAR(ref_data[i], data[i], 1e-3); + EXPECT_NEAR(ref_data[i], data[i], ACC_DIFF); } }); } @@ -214,7 +220,7 @@ void MainThreadsImageClassification(bool use_gpu) { NativeConfig config = GetConfig(); config.use_gpu = use_gpu; config.model_dir = - FLAGS_dirname + "image_classification_resnet.inference.model"; + FLAGS_dirname + "/image_classification_resnet.inference.model"; auto main_predictor = CreatePaddlePredictor(config); std::vector jobs(num_jobs); @@ -251,7 +257,7 @@ void MainThreadsImageClassification(bool use_gpu) { float* ref_data = refs[tid].data(); EXPECT_EQ((size_t)refs[tid].numel(), len / sizeof(float)); for (int i = 0; i < refs[tid].numel(); ++i) { - EXPECT_NEAR(ref_data[i], data[i], 1e-3); + EXPECT_NEAR(ref_data[i], data[i], ACC_DIFF); } }); } diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index 0f7d541c5edfc62e80cf50f83b491f06dcb42644..44335a872f2e00b34e29a9e7601cb390a460362c 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -2,6 +2,9 @@ set -x PADDLE_ROOT=$1 TURN_ON_MKL=$2 # use MKL or Openblas TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode +DATA_DIR=$4 # dataset +cd `dirname $0` +current_dir=`pwd` if [ $2 == ON ]; then # You can export yourself if move the install path MKL_LIB=${PADDLE_ROOT}/build/fluid_install_dir/third_party/install/mklml/lib @@ -29,15 +32,15 @@ function download() { fi cd .. } -mkdir -p data -cd data +mkdir -p $DATA_DIR +cd $DATA_DIR vis_demo_list='se_resnext50 ocr mobilenet' for vis_demo_name in $vis_demo_list; do download $vis_demo_name done -cd .. # compile and test the demo +cd $current_dir mkdir -p build cd build @@ -73,9 +76,9 @@ for WITH_STATIC_LIB in ON OFF; do for use_gpu in $use_gpu_list; do for vis_demo_name in $vis_demo_list; do ./vis_demo \ - --modeldir=../data/$vis_demo_name/model \ - --data=../data/$vis_demo_name/data.txt \ - --refer=../data/$vis_demo_name/result.txt \ + --modeldir=$DATA_DIR/$vis_demo_name/model \ + --data=$DATA_DIR/$vis_demo_name/data.txt \ + --refer=$DATA_DIR/$vis_demo_name/result.txt \ --use_gpu=$use_gpu if [ $? -ne 0 ]; then echo "vis demo $vis_demo_name runs fail." diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc new file mode 100644 index 0000000000000000000000000000000000000000..14698f6dfc8885ec1d35f1912bad10a9caa13db4 --- /dev/null +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { + +void ZeroCopyTensor::Reshape(const std::vector &shape) { + PADDLE_ENFORCE(!name_.empty(), + "Need to SetName first, so that the corresponding tensor can " + "be retrieved."); + PADDLE_ENFORCE(input_or_output_, + "Can't reshape the output tensor, it is readonly"); + PADDLE_ENFORCE(scope_); + auto *scope = static_cast(scope_); + auto *var = scope->FindVar(name_); + PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_); + auto *tensor = var->GetMutable(); + tensor->Resize(framework::make_ddim(shape)); +} + +template +T *ZeroCopyTensor::mutable_data(PaddlePlace place) { + auto *tensor = static_cast(FindTensor()); + switch (static_cast(place)) { + case static_cast(PaddlePlace::kCPU): { + return tensor->mutable_data(platform::CPUPlace()); + } + case static_cast(PaddlePlace::kGPU): { + return tensor->mutable_data(platform::CUDAPlace()); + } + default: + PADDLE_THROW("Unsupported place: %d", static_cast(place)); + break; + } + return nullptr; +} + +template +T *ZeroCopyTensor::data(PaddlePlace *place, int *size) { + auto *tensor = static_cast(FindTensor()); + auto *res = tensor->data(); + + if (platform::is_cpu_place(tensor->place())) { + *place = PaddlePlace::kCPU; + } else if (platform::is_gpu_place(tensor->place())) { + *place = PaddlePlace::kGPU; + } else { + *place = PaddlePlace::kUNK; + } + + *size = tensor->numel(); + return res; +} + +template float *ZeroCopyTensor::data(PaddlePlace *place, int *size); +template int64_t *ZeroCopyTensor::data(PaddlePlace *place, int *size); +template float *ZeroCopyTensor::mutable_data(PaddlePlace place); +template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place); + +void *ZeroCopyTensor::FindTensor() const { + PADDLE_ENFORCE(!name_.empty(), + "Need to SetName first, so that the corresponding tensor can " + "be retrieved."); + PADDLE_ENFORCE(scope_); + auto *scope = static_cast(scope_); + auto *var = scope->FindVar(name_); + PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_); + auto *tensor = var->GetMutable(); + return tensor; +} + +std::vector ZeroCopyTensor::shape() { + auto *tensor = static_cast(FindTensor()); + PADDLE_ENFORCE(tensor, "not found tensor called %s in the scope", name_); + return framework::vectorize(tensor->dims()); +} + +void ZeroCopyTensor::SetLoD(const std::vector> &x) { + auto *tensor = static_cast(FindTensor()); + framework::LoD lod; + for (auto &level : x) { + lod.emplace_back(level); + } + tensor->set_lod(lod); +} + +std::vector> ZeroCopyTensor::lod() const { + std::vector> res; + auto *tensor = static_cast(FindTensor()); + for (auto &level : tensor->lod()) { + res.emplace_back(level); + } + return res; +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc new file mode 100644 index 0000000000000000000000000000000000000000..2d5b561d801cd9e734cab13b28e7285493e30f94 --- /dev/null +++ b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc @@ -0,0 +1,46 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +namespace paddle { + +void ZeroCopyTensor::Reshape(const std::vector &shape) {} + +template +T *ZeroCopyTensor::mutable_data(PaddlePlace place) { + return nullptr; +} + +template +T *ZeroCopyTensor::data(PaddlePlace *place, int *size) { + return nullptr; +} + +template float *ZeroCopyTensor::data(PaddlePlace *place, int *size); +template int64_t *ZeroCopyTensor::data(PaddlePlace *place, int *size); +template float *ZeroCopyTensor::mutable_data(PaddlePlace place); +template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place); + +void *ZeroCopyTensor::FindTensor() const { return nullptr; } + +std::vector ZeroCopyTensor::shape() { return {}; } + +void ZeroCopyTensor::SetLoD(const std::vector> &x) {} + +std::vector> ZeroCopyTensor::lod() const { + return std::vector>(); +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 1fec2f96da0f9d978a3537b2d78e4ce5ef628c81..dbbd3f6a6786a4a4849002878263353919e8f31b 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -21,8 +21,10 @@ #include #include #include +#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/timer.h" +#include "paddle/fluid/string/printf.h" namespace paddle { namespace inference { @@ -93,6 +95,20 @@ static void TensorAssignData(PaddleTensor *tensor, } } +template +static int ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, + const std::vector> &data) { + int size{0}; + auto *ptr = tensor->mutable_data(PaddlePlace::kCPU); + int c = 0; + for (const auto &f : data) { + for (T v : f) { + ptr[c++] = v; + } + } + return size; +} + static std::string DescribeTensor(const PaddleTensor &tensor) { std::stringstream os; os << "Tensor [" << tensor.name << "]\n"; @@ -138,5 +154,127 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid, } } +template +std::string LoDTensorSummary(const framework::LoDTensor &tensor) { + std::stringstream ss; + ss << "\n---- tensor ---" << '\n'; + ss << "lod: ["; + for (const auto &level : tensor.lod()) { + ss << "[ "; + for (auto i : level) { + ss << i << ", "; + } + ss << "]"; + } + ss << "]\n"; + + ss << "shape: ["; + int size = 1; + for (int i = 0; i < tensor.dims().size(); i++) { + int dim = tensor.dims()[i]; + ss << dim << ", "; + size *= dim; + } + ss << "]\n"; + + ss << "data: "; + for (int i = 0; i < std::min(20, size); i++) { + ss << tensor.data()[i] << " "; + } + ss << "\n"; + + return ss.str(); +} + +static bool CompareLoD(const framework::LoD &a, const framework::LoD &b) { + if (a.size() != b.size()) { + LOG(ERROR) << string::Sprintf("lod size not match %d != %d", a.size(), + b.size()); + return false; + } + for (size_t i = 0; i < a.size(); i++) { + auto &al = a[i]; + auto &bl = b[i]; + if (al.size() != bl.size()) { + LOG(ERROR) << string::Sprintf("level size %d != %d", al.size(), + bl.size()); + return false; + } + } + return true; +} + +static bool CompareShape(const std::vector &a, + const std::vector &b) { + if (a.size() != b.size()) { + LOG(ERROR) << string::Sprintf("shape size not match %d != %d", a.size(), + b.size()); + return false; + } + for (size_t i = 0; i < a.size(); i++) { + if (a[i] != b[i]) { + LOG(ERROR) << string::Sprintf("shape %d-th element not match %d != %d", i, + a[i], b[i]); + return false; + } + } + return true; +} + +static bool CompareTensorData(const framework::LoDTensor &a, + const framework::LoDTensor &b) { + auto a_shape = framework::vectorize(a.dims()); + auto b_shape = framework::vectorize(b.dims()); + size_t a_size = std::accumulate(a_shape.begin(), a_shape.end(), 1, + [](int a, int b) { return a * b; }); + size_t b_size = std::accumulate(b_shape.begin(), b_shape.end(), 1, + [](int a, int b) { return a * b; }); + if (a_size != b_size) { + LOG(ERROR) << string::Sprintf("tensor data size not match, %d != %d", + a_size, b_size); + } + + for (size_t i = 0; i < a_size; i++) { + if (a.type() == typeid(float)) { + const auto *a_data = a.data(); + const auto *b_data = b.data(); + if (std::abs(a_data[i] - b_data[i]) > 1e-3) { + LOG(ERROR) << string::Sprintf( + "tensor data %d-th element not match, %f != %f", i, a_data[i], + b_data[i]); + return false; + } + } else if (a.type() == typeid(int64_t)) { + const auto *a_data = a.data(); + const auto *b_data = b.data(); + if (std::abs(a_data[i] - b_data[i]) > 1e-3) { + LOG(ERROR) << string::Sprintf( + "tensor data %d-th element not match, %f != %f", i, a_data[i], + b_data[i]); + return false; + } + } + } + + return true; +} + +static bool CompareTensor(const framework::LoDTensor &a, + const framework::LoDTensor &b) { + if (!CompareLoD(a.lod(), b.lod())) { + return false; + } + if (!CompareShape(framework::vectorize(a.dims()), + framework::vectorize(b.dims()))) { + return false; + } + + if (!CompareTensorData(a, b)) { + return false; + } + + return true; +} + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 984358b2bd90daf768cea0a6e36b5805d81050d6..9ee25f85d94eb2c96fb3603a8f0e6a1e27c8e9dd 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -101,6 +101,40 @@ struct PaddleTensor { std::vector> lod; // Tensor+LoD equals LoDTensor }; +enum class PaddlePlace { kUNK = -1, kCPU, kGPU }; +// Tensor without copy, currently only supports AnalysisPredictor. +class ZeroCopyTensor { + public: + void Reshape(const std::vector& shape); + + // Get the memory in CPU or GPU with specific data type, should Reshape first + // to tell the data size. + // Once can directly call this data to feed the data. + // This is for write the input tensor. + template + T* mutable_data(PaddlePlace place); + // Get the memory directly, will return the place and memory size by pointer. + // This is for reading the output tensor. + template + T* data(PaddlePlace* place, int* size); + + std::vector shape(); + + void SetLoD(const std::vector>& x); + std::vector> lod() const; + + protected: + ZeroCopyTensor(void* scope) : scope_{scope} {} + void SetName(const std::string& name) { name_ = name; } + void* FindTensor() const; + + private: + std::string name_; + bool input_or_output_; + friend class AnalysisPredictor; + void* scope_{nullptr}; +}; + /* * A simple Inference API for Paddle. */ @@ -120,6 +154,19 @@ class PaddlePredictor { std::vector* output_data, int batch_size = -1) = 0; + // Zero copy input and output optimization. + // Get the input or output tensors, and operate on their memory directly, + // without copy. + virtual std::unique_ptr GetInputTensor( + const std::string& name) { + return nullptr; + } + virtual std::unique_ptr GetOutputTensor( + const std::string& name) { + return nullptr; + } + virtual bool ZeroCopyRun() { return false; } + // Clone a predictor that share the model weights, the Cloned predictor should // be thread-safe. virtual std::unique_ptr Clone() = 0; @@ -216,9 +263,14 @@ struct AnalysisConfig : public NativeConfig { bool enable_ir_optim = true; // Manually determine the IR passes to run. IrPassMode ir_mode{IrPassMode::kExclude}; - std::vector ir_passes; + std::vector ir_passes{"embedding_fc_lstm_fuse_pass"}; + + // NOT stable yet. + bool use_feed_fetch_ops{true}; - // NOTE this is just for internal development, please not use it. + // NOTE this is just for internal development, please not use it. NOT + // stable + // yet. bool _use_mkldnn{false}; }; diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc index 36bbec473114cfd2e68c97a53264957477ade3fb..5fb551810fd4d1c56547a8aa581cb6c4587df031 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -18,6 +18,8 @@ namespace paddle { namespace inference { namespace analysis { +using contrib::AnalysisConfig; + struct DataRecord { std::vector data; std::vector lod; @@ -78,6 +80,7 @@ struct DataRecord { } } } + DataRecord NextBatch() { DataRecord data; data.data = batched_datas[batch_iter]; @@ -155,7 +158,9 @@ TEST(Analyzer_LAC, fuse_statis) { SetConfig(&cfg); int num_ops; - auto fuse_statis = GetFuseStatis(cfg, &num_ops); + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); ASSERT_TRUE(fuse_statis.count("fc_fuse")); ASSERT_TRUE(fuse_statis.count("fc_gru_fuse")); EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index 59020545cd609961487cafc4a08c20951a02c8ce..577b97e271aacab5d6740de7c8bc00bc87ae54dd 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -16,6 +16,7 @@ namespace paddle { namespace inference { +using contrib::AnalysisConfig; struct DataRecord { std::vector> word_data_all, mention_data_all; @@ -145,7 +146,9 @@ TEST(Analyzer_Chinese_ner, fuse_statis) { SetConfig(&cfg); int num_ops; - auto fuse_statis = GetFuseStatis(cfg, &num_ops); + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); ASSERT_TRUE(fuse_statis.count("fc_fuse")); ASSERT_TRUE(fuse_statis.count("fc_gru_fuse")); EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index 3bf5383d8f35347c767d6caee83e0dcc5fb0a446..d2e344111bdf84c936bbef7ff51246b0f248f41d 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -12,12 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" +DEFINE_bool(with_precision_check, true, "turn on test"); + namespace paddle { namespace inference { using namespace framework; // NOLINT +using namespace contrib; // NOLINT struct DataRecord { std::vector>> link_step_data_all; @@ -29,10 +33,12 @@ struct DataRecord { size_t batch_iter{0}; size_t batch_size{1}; DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) : batch_size(batch_size) { Load(path); } + DataRecord NextBatch() { DataRecord data; size_t batch_end = batch_iter + batch_size; @@ -101,6 +107,7 @@ struct DataRecord { num_samples = num_lines; } }; + void PrepareInputs(std::vector *input_slots, DataRecord *data, int batch_size) { PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor, @@ -149,7 +156,55 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } } -void SetConfig(contrib::AnalysisConfig *cfg) { +void PrepareZeroCopyInputs(ZeroCopyTensor *lod_attention_tensor, + ZeroCopyTensor *cell_init_tensor, + ZeroCopyTensor *data_tensor, + ZeroCopyTensor *hidden_init_tensor, + ZeroCopyTensor *week_tensor, + ZeroCopyTensor *minute_tensor, + DataRecord *data_record, int batch_size) { + auto one_batch = data_record->NextBatch(); + std::vector rnn_link_data_shape( + {static_cast(one_batch.rnn_link_data.size()), + static_cast(one_batch.rnn_link_data.front().size())}); + lod_attention_tensor->Reshape({1, 2}); + lod_attention_tensor->SetLoD({one_batch.lod1, one_batch.lod2}); + + cell_init_tensor->Reshape({batch_size, 15}); + cell_init_tensor->SetLoD({one_batch.lod3}); + + hidden_init_tensor->Reshape({batch_size, 15}); + hidden_init_tensor->SetLoD({one_batch.lod3}); + + data_tensor->Reshape(rnn_link_data_shape); + data_tensor->SetLoD({one_batch.lod1}); + + week_tensor->Reshape( + {static_cast(one_batch.rnn_week_datas.size()), + static_cast(one_batch.rnn_week_datas.front().size())}); + week_tensor->SetLoD({one_batch.lod3}); + + minute_tensor->Reshape( + {static_cast(one_batch.rnn_minute_datas.size()), + static_cast(one_batch.rnn_minute_datas.front().size())}); + minute_tensor->SetLoD({one_batch.lod3}); + + // assign data + float arr0[] = {0, 0}; + std::vector zeros(batch_size * 15, 0); + std::copy_n(arr0, 2, + lod_attention_tensor->mutable_data(PaddlePlace::kCPU)); + std::copy_n(arr0, 2, data_tensor->mutable_data(PaddlePlace::kCPU)); + std::copy_n(zeros.begin(), zeros.size(), + cell_init_tensor->mutable_data(PaddlePlace::kCPU)); + std::copy_n(zeros.begin(), zeros.size(), + hidden_init_tensor->mutable_data(PaddlePlace::kCPU)); + ZeroCopyTensorAssignData(data_tensor, one_batch.rnn_link_data); + ZeroCopyTensorAssignData(week_tensor, one_batch.rnn_week_datas); + ZeroCopyTensorAssignData(minute_tensor, one_batch.rnn_minute_datas); +} + +void SetConfig(AnalysisConfig *cfg) { cfg->prog_file = FLAGS_infer_model + "/__model__"; cfg->param_file = FLAGS_infer_model + "/param"; cfg->use_gpu = false; @@ -187,7 +242,9 @@ TEST(Analyzer_rnn1, fuse_statis) { SetConfig(&cfg); int num_ops; - auto fuse_statis = GetFuseStatis(cfg, &num_ops); + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); ASSERT_TRUE(fuse_statis.count("fc_fuse")); EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM @@ -214,7 +271,229 @@ TEST(Analyzer_rnn1, multi_thread) { std::vector> input_slots_all; SetInput(&input_slots_all); - TestPrediction(cfg, input_slots_all, &outputs, 4 /* num_threads */); + TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); +} + +bool CompareTensors(framework::Scope &a_scope, framework::Scope &b_scope, + const std::vector &tensors) { + for (auto &x : tensors) { + auto *a_var = a_scope.FindVar(x); + auto *b_var = b_scope.FindVar(x); + if (a_var && b_var) { + if (a_var->Type() == typeid(framework::LoDTensor) || + a_var->Type() == typeid(framework::Tensor)) { + LOG(INFO) << "comparing tensor " << x; + auto &a_t = a_var->Get(); + auto &b_t = b_var->Get(); + if (!inference::CompareTensor(a_t, b_t)) { + LOG(ERROR) << string::Sprintf("tensor %s not match in two scopes", x); + } + } else { + LOG(INFO) << "skip no tensor " << x; + } + } else { + LOG(INFO) << "skip tensor " << x; + } + } + return true; +} + +// Validate that the AnalysisPredictor + ZeroCopyTensor really works by testing +// on the complex RNN1 model. +TEST(Analyzer_rnn1, ZeroCopy) { + AnalysisConfig config; + SetConfig(&config); + config.use_feed_fetch_ops = false; + + PaddlePlace place; + int output_size{0}; + + auto predictor = + CreatePaddlePredictor( + config); + + config.use_feed_fetch_ops = true; + auto native_predictor = + CreatePaddlePredictor(config); + + config.use_feed_fetch_ops = true; // the analysis predictor needs feed/fetch. + auto analysis_predictor = + CreatePaddlePredictor( + config); + +#define NEW_TENSOR(name__) \ + auto name__##_tensor = predictor->GetInputTensor(#name__); + NEW_TENSOR(data_lod_attention); + NEW_TENSOR(cell_init); + NEW_TENSOR(data); + NEW_TENSOR(week); + NEW_TENSOR(minute); + NEW_TENSOR(hidden_init); + + // Prepare data for AnalysisPredictor + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + PrepareZeroCopyInputs(data_lod_attention_tensor.get(), cell_init_tensor.get(), + data_tensor.get(), hidden_init_tensor.get(), + week_tensor.get(), minute_tensor.get(), &data, + FLAGS_batch_size); + + // Prepare data for NativePredictor + std::vector> native_inputs; + SetInput(&native_inputs); + std::vector native_outputs; + std::vector analysis_outputs; + + auto output_tensor = predictor->GetOutputTensor("final_output.tmp_1"); + // Run analysis predictor + + int num_ops; + auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops); + ASSERT_TRUE(fuse_statis.count("fc_fuse")); + ASSERT_EQ(fuse_statis.at("fc_fuse"), 1); + ASSERT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM + ASSERT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1); + ASSERT_EQ(num_ops, + 13); // After graph optimization, only 13 operators exists. + + Timer timer; + double total_time{0}; + double native_total_time{0}; + double analysis_total_time{0.}; + + for (int i = 0; i < FLAGS_repeat; i++) { + timer.tic(); + predictor->ZeroCopyRun(); + total_time += timer.toc(); + } + + auto *output_data = output_tensor->data(&place, &output_size); + ASSERT_GT(output_size, 0); // more than one output! + + for (int i = 0; i < FLAGS_repeat; i++) { + // Run native predictor. + timer.tic(); + ASSERT_TRUE(native_predictor->Run(native_inputs.front(), &native_outputs)); + native_total_time += timer.toc(); + } + + for (int i = 0; i < FLAGS_repeat; i++) { + timer.tic(); + ASSERT_TRUE( + analysis_predictor->Run(native_inputs.front(), &analysis_outputs)); + analysis_total_time += timer.toc(); + } + + if (!FLAGS_with_precision_check) { + return; + } + int native_output_size = VecReduceToInt(native_outputs.front().shape); + + EXPECT_EQ(native_output_size, output_size); + + // Compare tensors between analysis and zerocopy + auto *p0 = static_cast(predictor.get()); + auto *p1 = static_cast(analysis_predictor.get()); + auto *p2 = static_cast(native_predictor.get()); + + std::vector tensor_names; + for (auto &var_desc : p0->program().Block(0).AllVars()) { + tensor_names.push_back(var_desc->Name()); + } + + LOG(INFO) << "Comparing tensors"; + ASSERT_TRUE( + CompareTensors(*p0->scope(), *p1->scope(), {"final_output.tmp_1"})); + ASSERT_TRUE( + CompareTensors(*p0->scope(), *p2->scope(), {"final_output.tmp_1"})); + + LOG(INFO) << "output1 " << inference::LoDTensorSummary( + p0->scope() + ->FindVar("final_output.tmp_1") + ->Get()); + LOG(INFO) << "output2 " << inference::LoDTensorSummary( + p1->scope() + ->FindVar("final_output.tmp_1") + ->Get()); + LOG(INFO) << "output3 " << inference::LoDTensorSummary( + p2->scope() + ->FindVar("final_output.tmp_1") + ->Get()); + + for (int i = 0; i < output_size; i++) { + LOG(INFO) << output_data[i] << " " + << static_cast(native_outputs.front().data.data())[i] + << " " + << static_cast(analysis_outputs.front().data.data())[i]; + EXPECT_NEAR(output_data[i], + static_cast(native_outputs.front().data.data())[i], + 1e-3); + } + + LOG(INFO) << "batch_size: " << FLAGS_batch_size; + + LOG(INFO) << "zero average time: " + << total_time / (FLAGS_repeat * FLAGS_batch_size); + LOG(INFO) << "analysis average time: " + << analysis_total_time / (FLAGS_repeat * FLAGS_batch_size); + LOG(INFO) << "native average time: " + << native_total_time / (FLAGS_repeat * FLAGS_batch_size); +} + +TEST(Analyzer_rnn1, ZeroCopyMultiThread) { + AnalysisConfig config; + SetConfig(&config); + config.use_feed_fetch_ops = false; + +#define NEW_TENSOR(name__) \ + auto name__##_tensor = predictor->GetInputTensor(#name__); + + auto base_predictor = CreatePaddlePredictor(config); + double total_time_of_threads{0}; + std::vector threads; + std::vector> predictors; + for (int tid = 0; tid < FLAGS_num_threads; tid++) { + predictors.emplace_back(CreatePaddlePredictor(config)); + } + + for (int tid = 0; tid < FLAGS_num_threads; tid++) { + threads.emplace_back([config, &total_time_of_threads, &predictors, tid] { + // auto predictor = base_predictor->Clone(); + auto &predictor = predictors[tid]; + NEW_TENSOR(data_lod_attention); + NEW_TENSOR(cell_init); + NEW_TENSOR(data); + NEW_TENSOR(week); + NEW_TENSOR(minute); + NEW_TENSOR(hidden_init); + + // Prepare data for AnalysisPredictor + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + Timer timer; + double total_time{0}; + + for (int i = 0; i < FLAGS_repeat; i++) { + PrepareZeroCopyInputs(data_lod_attention_tensor.get(), + cell_init_tensor.get(), data_tensor.get(), + hidden_init_tensor.get(), week_tensor.get(), + minute_tensor.get(), &data, FLAGS_batch_size); + + timer.tic(); + predictor->ZeroCopyRun(); + total_time += timer.toc(); + } + + total_time_of_threads += total_time; + + LOG(INFO) << "thread time: " << total_time / FLAGS_repeat; + }); + } + + for (auto &t : threads) { + t.join(); + } + + LOG(INFO) << "average time: " + << total_time_of_threads / FLAGS_num_threads / FLAGS_repeat; } } // namespace inference diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc index 2f71ed46ffc9fd5f853f5b5b46de1446d28b9e69..cb4671c4379b5f6f144bfd5330866aa38163f4d4 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc @@ -182,7 +182,8 @@ TEST(Analyzer_seq_conv1, fuse_statis) { AnalysisConfig cfg; SetConfig(&cfg); int num_ops; - auto fuse_statis = GetFuseStatis(cfg, &num_ops); + auto predictor = CreatePaddlePredictor(cfg); + GetFuseStatis(predictor.get(), &num_ops); } // Compare result of NativeConfig and AnalysisConfig diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc index 340ef152f0b1a15a451f840b36ae845ef4984740..ca19475bda372398d425b0fa6f9a732cd79a8166 100644 --- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc @@ -104,5 +104,18 @@ TEST(Analyzer_Text_Classification, compare) { CompareNativeAndAnalysis(cfg, input_slots_all); } +TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) { + AnalysisConfig cfg; + SetConfig(&cfg); + // Enable embedding_fc_lstm_fuse_pass (disabled by default) + auto it = std::find(cfg.ir_passes.begin(), cfg.ir_passes.end(), + "embedding_fc_lstm_fuse_pass"); + if (it != cfg.ir_passes.end()) cfg.ir_passes.erase(it); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis(cfg, input_slots_all); +} + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index 483ae66c5b24f6147b1b07da86494a914f80c34c..a2e86305b85dd893f578e97e0105fec828916fb4 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -19,6 +19,7 @@ limitations under the License. */ namespace paddle { namespace inference { namespace analysis { +using contrib::AnalysisConfig; struct Record { std::vector data; @@ -114,7 +115,8 @@ TEST(Analyzer_vis, fuse_statis) { AnalysisConfig cfg; SetConfig(&cfg); int num_ops; - GetFuseStatis(cfg, &num_ops); + auto predictor = CreatePaddlePredictor(cfg); + GetFuseStatis(predictor.get(), &num_ops); } // Compare result of NativeConfig and AnalysisConfig diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 9fcb5129d268a7730c11e5910077ad233050484e..cb36ddc8c879b1aff9838bba90364b17d53aa84e 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -86,11 +86,9 @@ std::unique_ptr CreateTestPredictor( size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); } -std::unordered_map GetFuseStatis(AnalysisConfig config, +std::unordered_map GetFuseStatis(PaddlePredictor *predictor, int *num_ops) { - auto predictor = CreateTestPredictor(config); - AnalysisPredictor *analysis_predictor = - dynamic_cast(predictor.get()); + auto *analysis_predictor = static_cast(predictor); auto &fuse_statis = analysis_predictor->analysis_argument() .Get>( framework::ir::kFuseStatisAttr); diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 283745e977533358ef52521b36e67f0ada950e61..0f13a4ea9c1af175771f5cc201ea5c0a8a0f7555 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -36,6 +36,8 @@ namespace memory { using BuddyAllocator = detail::BuddyAllocator; BuddyAllocator* GetCPUBuddyAllocator() { + // We tried thread_local for inference::RNN1 model, but that not works much + // for multi-thread test. static std::once_flag init_flag; static detail::BuddyAllocator* a = nullptr; @@ -48,6 +50,25 @@ BuddyAllocator* GetCPUBuddyAllocator() { return a; } +// We compared the NaiveAllocator with BuddyAllocator in CPU memory allocation, +// seems they are almost the same overhead. +struct NaiveAllocator { + void* Alloc(size_t size) { return malloc(size); } + + void Free(void* p) { + PADDLE_ENFORCE(p); + free(p); + } + + static NaiveAllocator* Instance() { + static NaiveAllocator x; + return &x; + } + + private: + std::mutex lock_; +}; + template <> void* Alloc(platform::CPUPlace place, size_t size) { VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 9c67df7bdfb2c4e5d1c9fe60676c412ab11b4fa5..b61bca8c3d1e2a63faed43b437459f4e9318e46e 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -301,6 +301,7 @@ op_library(fusion_lstm_op DEPS cpu_lstm_compute) if (WITH_GPU) op_library(conv_op DEPS vol2col depthwise_conv im2col) op_library(layer_norm_op DEPS cub) + op_library(reduce_mean_op DEPS cub) else() op_library(conv_op DEPS vol2col im2col) endif() @@ -313,11 +314,6 @@ op_library(save_combine_op DEPS lod_tensor) op_library(load_combine_op DEPS lod_tensor) op_library(concat_op DEPS concat) -# FIXME(thuan): Move CSP operators to paddle/fluid/framework/operators/concurrency -add_subdirectory(concurrency) -op_library(channel_send_op DEPS concurrency) -op_library(channel_recv_op DEPS concurrency) - list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) foreach(src ${GENERAL_OPS}) diff --git a/paddle/fluid/operators/channel_close_op.cc b/paddle/fluid/operators/channel_close_op.cc deleted file mode 100644 index 8e2db250a069c488ee98f618bc03df6485022456..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/channel_close_op.cc +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/channel.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace pf = paddle::framework; -static constexpr char kChannel[] = "Channel"; - -namespace paddle { -namespace operators { - -class ChannelCloseOp : public framework::OperatorBase { - public: - ChannelCloseOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - auto &inp = *scope.FindVar(Input(kChannel)); - - // Get the mutable version of the channel variable and closes it. - pf::ChannelHolder *ch = inp.GetMutable(); - ch->close(); - } -}; - -class ChannelCloseOpOpInferShape : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *context) const override { - PADDLE_ENFORCE(context->HasInput("Channel"), - "The input of ChannelClose op must be set"); - } -}; - -class ChannelCloseOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput(kChannel, - "The Channel Variable that should be closed by" - " the ChannelClose Op."); - AddComment(R"DOC( -Channel Close Operator. - -This operator closes an open channel. -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(channel_close, paddle::operators::ChannelCloseOp, - paddle::framework::EmptyGradOpMaker, - paddle::operators::ChannelCloseOpMaker); diff --git a/paddle/fluid/operators/channel_create_op.cc b/paddle/fluid/operators/channel_create_op.cc deleted file mode 100644 index a7f59e4088e3fb328e5b5a83eed65f0f90edb9f0..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/channel_create_op.cc +++ /dev/null @@ -1,113 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/channel.h" -#include "paddle/fluid/framework/lod_rank_table.h" -#include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/reader.h" - -namespace pf = paddle::framework; - -static constexpr char kOutput[] = "Out"; - -namespace paddle { -namespace operators { - -class ChannelCreateOp : public framework::OperatorBase { - public: - ChannelCreateOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - auto &out = *scope.FindVar(Output(kOutput)); - - // Determine the datatype and capacity of the channel to be created - // from the attributes provided. - auto dtype = - static_cast(Attr("data_type")); - auto capacity = Attr("capacity"); - - // Based on the datatype, create a new channel holder initialized with - // the given capacity. When capacity is 0, an unbuffered channel is - // created. - pf::ChannelHolder *ch = out.GetMutable(); - if (dtype == framework::proto::VarType::LOD_TENSOR) { - ch->Reset(capacity); - } else if (dtype == framework::proto::VarType::SELECTED_ROWS) { - ch->Reset(capacity); - } else if (dtype == framework::proto::VarType::LOD_RANK_TABLE) { - ch->Reset(capacity); - } else if (dtype == framework::proto::VarType::LOD_TENSOR_ARRAY) { - ch->Reset(capacity); - } else if (dtype == framework::proto::VarType::READER) { - ch->Reset(capacity); - } else if (dtype == framework::proto::VarType::CHANNEL) { - ch->Reset(capacity); - } else if (dtype == framework::proto::VarType::BOOL) { - ch->Reset(capacity); - } else if (dtype == framework::proto::VarType::INT32) { - ch->Reset(capacity); - } else if (dtype == framework::proto::VarType::INT64) { - ch->Reset(capacity); - } else if (dtype == framework::proto::VarType::FP32) { - ch->Reset(capacity); - } else if (dtype == framework::proto::VarType::FP64) { - ch->Reset(capacity); - } else { - PADDLE_THROW( - "Data type %d is not in " - "[LOD_TENSOR, SELECTED_ROWS, LOD_RANK_TABLE, LOD_TENSOR_ARRAY, " - "READER, CHANNEL, BOOL, INT32, INT64, FP32, FP64]", - dtype); - } - } -}; - -class ChannelCreateOpOpInferShape : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *context) const override { - PADDLE_ENFORCE(context->HasOutput(kOutput), - "The output of ChannelCreate op must be set"); - context->SetOutputDim(kOutput, {1}); - } -}; - -class ChannelCreateOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddOutput(kOutput, - "The object of a Channel type created by ChannelCreate Op."); - AddAttr("capacity", "The size of the buffer of Channel.") - .SetDefault(0); - AddAttr("data_type", "The data type of elements inside the Channel."); - AddComment(R"DOC( -Channel Create Operator. - -This operator creates an object of the VarType Channel and returns it. -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(channel_create, paddle::operators::ChannelCreateOp, - paddle::framework::EmptyGradOpMaker, - paddle::operators::ChannelCreateOpMaker); diff --git a/paddle/fluid/operators/channel_recv_op.cc b/paddle/fluid/operators/channel_recv_op.cc deleted file mode 100644 index 101015e837e28b504b71d919abd5f908a102c812..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/channel_recv_op.cc +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/channel.h" -#include -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/concurrency/channel_util.h" -#include "paddle/fluid/operators/math/math_function.h" - -static constexpr char Channel[] = "Channel"; -static constexpr char Status[] = "Status"; -static constexpr char Out[] = "Out"; - -namespace paddle { -namespace operators { - -void SetReceiveStatus(const platform::Place &dev_place, - framework::Variable *status_var, bool status) { - auto cpu = platform::CPUPlace(); - auto status_tensor = - status_var->GetMutable()->mutable_data({1}, - cpu); - status_tensor[0] = status; -} - -class ChannelRecvOp : public framework::OperatorBase { - public: - ChannelRecvOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - - void InferShape(framework::InferShapeContext *ctx) const { - PADDLE_ENFORCE(ctx->HasInput(Channel), - "Input(Channel) of ChannelRecvOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput(Out), - "Input(Channel) of ChannelRecvOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput(Status), - "Output(Status) of ChannelRecvOp should not be null."); - ctx->SetOutputDim("Status", {1}); - } - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - // Get the channel holder created by channel_create op, passed as input. - framework::ChannelHolder *ch = - scope.FindVar(Input(Channel))->GetMutable(); - auto output_var = scope.FindVar(Output(Out)); - // Receive the data from the channel. - bool ok = concurrency::ChannelReceive(ch, output_var); - - // Set the status output of the `ChannelReceive` call. - SetReceiveStatus(dev_place, scope.FindVar(Output(Status)), ok); - } -}; - -class ChannelRecvOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput(Channel, - "(Channel) A variable which \"receives\" the a value sent" - "to it by a channel_send op.") - .AsDuplicable(); - AddOutput(Out, - "(Variable) Output Variable that will hold the data received" - " from the Channel") - .AsDuplicable(); - AddOutput(Status, - "(Tensor) An LoD Tensor that returns a boolean status of the" - "result of the receive operation.") - .AsDuplicable(); - AddComment(R"DOC( -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(channel_recv, paddle::operators::ChannelRecvOp, - paddle::framework::EmptyGradOpMaker, - paddle::operators::ChannelRecvOpMaker); diff --git a/paddle/fluid/operators/channel_send_op.cc b/paddle/fluid/operators/channel_send_op.cc deleted file mode 100644 index 67d6deb511d883ac69426ddd34be2199367cd4c7..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/channel_send_op.cc +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/channel.h" -#include -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/concurrency/channel_util.h" -#include "paddle/fluid/operators/math/math_function.h" - -static constexpr char Channel[] = "Channel"; -static constexpr char X[] = "X"; - -namespace paddle { -namespace operators { - -class ChannelSendOp : public framework::OperatorBase { - public: - ChannelSendOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - - void InferShape(framework::InferShapeContext *ctx) const { - PADDLE_ENFORCE(ctx->HasInput(Channel), - "Input(Channel) of ChannelSendOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput(X), - "Input(X) of ChannelSendOp should not be null."); - } - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - // Get the channel holder created by channel_create op, passed as input. - framework::ChannelHolder *ch = - scope.FindVar(Input(Channel))->GetMutable(); - auto input_var = scope.FindVar(Input(X)); - - // Send the input data through the channel. - concurrency::ChannelSend(ch, input_var); - } -}; - -class ChannelSendOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput(Channel, - "(Channel) A variable which \"sends\" the passed in value to " - "a listening receiver.") - .AsDuplicable(); - AddInput(X, "(Variable) The value which gets sent by the channel.") - .AsDuplicable(); - AddComment(R"DOC( -)DOC"); - } -}; -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(channel_send, paddle::operators::ChannelSendOp, - paddle::framework::EmptyGradOpMaker, - paddle::operators::ChannelSendOpMaker); diff --git a/paddle/fluid/operators/concurrency/CMakeLists.txt b/paddle/fluid/operators/concurrency/CMakeLists.txt deleted file mode 100644 index e4617440d152b4c15d09e81cd19c76739b95b979..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/concurrency/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -cc_library(concurrency SRCS channel_util.cc DEPS device_context framework_proto boost eigen3) diff --git a/paddle/fluid/operators/concurrency/channel_util.cc b/paddle/fluid/operators/concurrency/channel_util.cc deleted file mode 100644 index fba4abf1897bceea615222b2438700085ed8e551..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/concurrency/channel_util.cc +++ /dev/null @@ -1,111 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/concurrency/channel_util.h" -#include "paddle/fluid/framework/var_type.h" - -namespace poc = paddle::operators::concurrency; - -void poc::ChannelSend(framework::ChannelHolder *ch, framework::Variable *var) { - auto type = framework::ToVarType(var->Type()); - if (type == framework::proto::VarType_Type_LOD_TENSOR) - ch->Send(var->GetMutable()); - else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) - ch->Send(var->GetMutable()); - else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) - ch->Send(var->GetMutable()); - else if (type == framework::proto::VarType_Type_SELECTED_ROWS) - ch->Send(var->GetMutable()); - else if (type == framework::proto::VarType_Type_READER) - ch->Send(var->GetMutable()); - else if (type == framework::proto::VarType_Type_CHANNEL) - ch->Send(var->GetMutable()); - else - PADDLE_THROW("ChannelSend:Unsupported type"); -} - -bool poc::ChannelReceive(framework::ChannelHolder *ch, - framework::Variable *var) { - // Get type of channel and use that to call mutable data for Variable - auto type = framework::ToVarType(ch->Type()); - if (type == framework::proto::VarType_Type_LOD_TENSOR) - return ch->Receive(var->GetMutable()); - else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) - return ch->Receive(var->GetMutable()); - else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) - return ch->Receive(var->GetMutable()); - else if (type == framework::proto::VarType_Type_SELECTED_ROWS) - return ch->Receive(var->GetMutable()); - else if (type == framework::proto::VarType_Type_READER) - return ch->Receive(var->GetMutable()); - else if (type == framework::proto::VarType_Type_CHANNEL) - return ch->Receive(var->GetMutable()); - else - PADDLE_THROW("ChannelReceive:Unsupported type"); -} - -void poc::ChannelAddToSendQ(framework::ChannelHolder *ch, const void *referrer, - framework::Variable *var, - std::shared_ptr cond, - std::function cb) { - auto type = framework::ToVarType(var->Type()); - if (type == framework::proto::VarType_Type_LOD_TENSOR) { - ch->AddToSendQ(referrer, var->GetMutable(), cond, cb); - } else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) { - ch->AddToSendQ(referrer, var->GetMutable(), cond, - cb); - } else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) { - ch->AddToSendQ(referrer, var->GetMutable(), cond, - cb); - } else if (type == framework::proto::VarType_Type_SELECTED_ROWS) { - ch->AddToSendQ(referrer, var->GetMutable(), cond, - cb); - } else if (type == framework::proto::VarType_Type_READER) { - ch->AddToSendQ(referrer, var->GetMutable(), cond, - cb); - } else if (type == framework::proto::VarType_Type_CHANNEL) { - ch->AddToSendQ(referrer, var->GetMutable(), cond, - cb); - } else { - PADDLE_THROW("ChannelAddToSendQ:Unsupported type"); - } -} - -void poc::ChannelAddToReceiveQ( - framework::ChannelHolder *ch, const void *referrer, - framework::Variable *var, std::shared_ptr cond, - std::function cb) { - auto type = framework::ToVarType(var->Type()); - if (type == framework::proto::VarType_Type_LOD_TENSOR) { - ch->AddToReceiveQ(referrer, var->GetMutable(), cond, - cb); - } else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) { - ch->AddToReceiveQ(referrer, var->GetMutable(), - cond, cb); - } else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) { - ch->AddToReceiveQ(referrer, var->GetMutable(), - cond, cb); - } else if (type == framework::proto::VarType_Type_SELECTED_ROWS) { - ch->AddToReceiveQ(referrer, var->GetMutable(), - cond, cb); - } else if (type == framework::proto::VarType_Type_READER) { - ch->AddToReceiveQ(referrer, var->GetMutable(), - cond, cb); - } else if (type == framework::proto::VarType_Type_CHANNEL) { - ch->AddToReceiveQ(referrer, var->GetMutable(), - cond, cb); - } else { - PADDLE_THROW("ChannelAddToReceiveQ:Unsupported type"); - } -} diff --git a/paddle/fluid/operators/concurrency/channel_util.h b/paddle/fluid/operators/concurrency/channel_util.h deleted file mode 100644 index cd18ca78c6fdecdc6c72748611ccdd9c2690ef46..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/concurrency/channel_util.h +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/channel.h" -#include "paddle/fluid/framework/variable.h" - -namespace paddle { -namespace operators { -namespace concurrency { - -void ChannelSend(framework::ChannelHolder *ch, framework::Variable *var); -bool ChannelReceive(framework::ChannelHolder *ch, framework::Variable *var); - -void ChannelAddToSendQ(framework::ChannelHolder *ch, const void *referrer, - framework::Variable *var, - std::shared_ptr cond, - std::function cb); -void ChannelAddToReceiveQ(framework::ChannelHolder *ch, const void *referrer, - framework::Variable *var, - std::shared_ptr cond, - std::function cb); - -} // namespace concurrency -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index b3140116dfe6a17a400bb88219ff43b249ecb32a..ef76106f17218a03d24ebc0eca43dbb0ae935093 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -380,7 +380,8 @@ class DepthwiseConvKernel : public framework::OpKernel { math::DepthwiseConvFunctor depthwiseConv; auto& dev_ctx = context.template device_context(); - depthwiseConv(dev_ctx, *input, filter, strides, paddings, output); + depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, + output); } }; @@ -415,14 +416,14 @@ class DepthwiseConvGradKernel : public framework::OpKernel { input_grad->mutable_data(context.GetPlace()); set_zero(dev_ctx, input_grad, static_cast(0)); depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, - paddings, input_grad); + paddings, dilations, input_grad); } if (filter_grad) { filter_grad->mutable_data(context.GetPlace()); set_zero(dev_ctx, filter_grad, static_cast(0)); depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, paddings, - filter_grad); + dilations, filter_grad); } } }; diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h index 0d9c6a62fec1ea24bee5c24b4a7b792781f14d9e..88c578b1410558b9adcd55f1cd6b53fb9cb124e2 100644 --- a/paddle/fluid/operators/conv_transpose_op.h +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -345,7 +345,7 @@ class DepthwiseConvTransposeKernel : public framework::OpKernel { math::DepthwiseConvInputGradFunctor depthwiseConvInputGrad; depthwiseConvInputGrad(dev_ctx, *output, filter, *input, strides, paddings, - output); + dilations, output); } }; @@ -367,10 +367,11 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel { auto& dev_ctx = context.template device_context(); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); if (input_grad) { math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings, + depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings, dilations, input_grad); } @@ -382,7 +383,7 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel { math::DepthwiseConvFilterGradFunctor depthwiseConvFilterGrad; depthwiseConvFilterGrad(dev_ctx, *output_grad, *input, strides, paddings, - filter_grad); + dilations, filter_grad); } } }; diff --git a/paddle/fluid/operators/cub_reduce.h b/paddle/fluid/operators/cub_reduce.h new file mode 100644 index 0000000000000000000000000000000000000000..16fdad775f7befaac04b1ac59a601f04e0ab2bdc --- /dev/null +++ b/paddle/fluid/operators/cub_reduce.h @@ -0,0 +1,322 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include // NOLINT +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { + +namespace detail { +template +struct Array { + public: + HOSTDEVICE inline Array() {} + + HOSTDEVICE inline T& operator[](size_t index) { return data_[index]; } + + HOSTDEVICE inline const T& operator[](size_t index) const { + return data_[index]; + } + + HOSTDEVICE constexpr inline size_t size() const { return ElementCount; } + + template + static inline Array From(const VectorLikeType& vec) { + PADDLE_ENFORCE_EQ(vec.size(), ElementCount, "size not match"); + size_t n = static_cast(vec.size()); + Array ret; + for (size_t i = 0; i < n; ++i) ret[i] = vec[i]; + return ret; + } + + private: + T data_[ElementCount]; +}; + +// reduce the last axis of 2d array +template +__global__ void ReduceKernel2D(const Tx* x, Ty* y, ReduceOp reducer, + TransformOp transformer, Ty init, + int reduce_num) { + __shared__ typename cub::BlockReduce::TempStorage temp_storage; + int idx_x = blockIdx.x * reduce_num; + int idx_y = threadIdx.x; + Ty reduce_var = init; + for (int idx_y = threadIdx.x; idx_y < reduce_num; idx_y += BlockDim) + reduce_var = reducer(reduce_var, transformer(x[idx_x + idx_y])); + + reduce_var = + cub::BlockReduce(temp_storage).Reduce(reduce_var, reducer); + + if (threadIdx.x == 0) { + y[blockIdx.x] = reduce_var; + } +} + +template +__global__ void ReduceKernel(const Tx* x, Ty* y, ReduceOp reducer, + TransformOp transformer, Ty init, int reduce_num, + Array x_strides, + Array reduce_dim, + Array reduce_strides, + Array left_dim, + Array left_strides) { + __shared__ typename cub::BlockReduce::TempStorage temp_storage; + Array sub_index; + int left_idx = blockIdx.x; + for (int i = 0; i < Rank - ReduceRank; ++i) { + sub_index[left_dim[i]] = left_idx / left_strides[i]; + left_idx %= left_strides[i]; + } + + int reduce_idx = threadIdx.x; + for (int j = 0; j < ReduceRank; ++j) { + sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j]; + reduce_idx %= reduce_strides[j]; + } + + int idx_x = 0; + for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]); + Ty reduce_var = static_cast(transformer(x[idx_x])); + + for (int i = threadIdx.x + BlockDim; i < reduce_num; i += BlockDim) { + int reduce_idx = i; + for (int j = 0; j < ReduceRank; ++j) { + sub_index[reduce_dim[j]] = reduce_idx / reduce_strides[j]; + reduce_idx %= reduce_strides[j]; + } + + int idx_x = 0; + for (int k = 0; k < Rank; ++k) idx_x += (sub_index[k] * x_strides[k]); + reduce_var = static_cast(reducer(reduce_var, transformer(x[idx_x]))); + } + + reduce_var = + cub::BlockReduce(temp_storage).Reduce(reduce_var, reducer); + + if (threadIdx.x == 0) { + y[blockIdx.x] = reduce_var; + } +} + +static inline std::vector GetStrides(const std::vector& dims) { + int n = static_cast(dims.size()); + if (n == 0) return std::vector(); + std::vector strides(n); + strides.back() = 1; + for (int i = n - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * dims[i + 1]; + } + return strides; +} + +static inline std::vector GetStrides(const std::vector& dims, + const std::vector& idx) { + int n = static_cast(idx.size()); + if (n == 0) return std::vector(); + std::vector strides(n); + strides.back() = 1; + for (int i = n - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * dims[idx[i + 1]]; + } + return strides; +} + +constexpr int kMaxBlockDim = 512; + +static inline int GetDesiredBlockDim(int block_dim) { + return block_dim >= kMaxBlockDim + ? kMaxBlockDim + : (1 << static_cast(std::log2(block_dim))); +} + +template +static void TensorReduceImpl( + const Tx* x_data, Ty* y_data, const platform::Place& place, + const ReduceOp& reducer, const TransformOp& transformer, const Ty& init, + int left_num, int reduce_num, const std::vector& x_strides, + const std::vector& reduce_dim, const std::vector& reduce_strides, + const std::vector& left_dim, const std::vector& left_strides, + cudaStream_t stream) { +#define CUB_RANK_CASE(i, ...) \ + case i: { \ + constexpr auto kRank = i; \ + switch (reduce_rank) { __VA_ARGS__; } \ + } break + +#define CUB_REDUCE_RANK_CASE(i, ...) \ + case i: { \ + constexpr auto kReduceRank = i; \ + ReduceKernel<<>>( \ + x_data, y_data, reducer, transformer, init, reduce_num, \ + Array::From(x_strides), \ + Array::From(reduce_dim), \ + Array::From(reduce_strides), \ + Array::From(left_dim), \ + Array::From(left_strides)); \ + } break + + int rank = x_strides.size(); + int reduce_rank = reduce_strides.size(); + if (rank == reduce_rank) { + cub::TransformInputIterator trans_x( + x_data, transformer); + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Reduce(nullptr, temp_storage_bytes, trans_x, y_data, + reduce_num, reducer, init, stream); + framework::Tensor tmp; + auto* temp_storage = tmp.mutable_data( + framework::make_ddim({static_cast(temp_storage_bytes)}), + place); + cub::DeviceReduce::Reduce(temp_storage, temp_storage_bytes, trans_x, y_data, + reduce_num, reducer, init, stream); + return; + } + if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) { + ReduceKernel2D<<>>( + x_data, y_data, reducer, transformer, init, reduce_num); + return; + } + /* + if (rank == 3 && reduce_rank == 1 && reduce_dim[0] == 1) { + // TODO(liangdun): we can optimize 3d case which the 2nd axis is reduced. + // Currently, it is handled by code below, but inefficient + return; + } + */ + + switch (rank) { + CUB_RANK_CASE(2, CUB_REDUCE_RANK_CASE(1);); + + CUB_RANK_CASE(3, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2);); + + CUB_RANK_CASE(4, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2); + CUB_REDUCE_RANK_CASE(3);); + + CUB_RANK_CASE(5, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2); + CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4);); + + CUB_RANK_CASE(6, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2); + CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4); + CUB_REDUCE_RANK_CASE(5);); + + CUB_RANK_CASE(7, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2); + CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4); + CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6);); + + CUB_RANK_CASE(8, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2); + CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4); + CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6);); + + CUB_RANK_CASE(9, CUB_REDUCE_RANK_CASE(1); CUB_REDUCE_RANK_CASE(2); + CUB_REDUCE_RANK_CASE(3); CUB_REDUCE_RANK_CASE(4); + CUB_REDUCE_RANK_CASE(5); CUB_REDUCE_RANK_CASE(6); + CUB_REDUCE_RANK_CASE(7); CUB_REDUCE_RANK_CASE(8);); + } + +#undef CUB_REDUCE_RANK_CASE +#undef CUB_RANK_CASE +} + +} // namespace detail + +template +void TensorReduce(const framework::Tensor& x, framework::Tensor* y, + std::vector origin_reduce_dims, const Ty& init, + const ReduceOp& reducer, const TransformOp& transformer, + cudaStream_t stream) { + auto x_dim = framework::vectorize2int(x.dims()); + std::vector new_x_dim, new_reduce_dims; + int is_reduced = 0; + for (auto e : origin_reduce_dims) { + auto pos = e >= 0 ? e : e + x_dim.size(); + is_reduced |= 1 << e; + } + for (int i = 0; i < x_dim.size(); i++) { + if ((i == 0) || (((is_reduced >> i) ^ (is_reduced >> (i - 1))) & 1)) { + new_x_dim.push_back(x_dim[i]); + if ((is_reduced >> i) & 1) + new_reduce_dims.push_back(new_x_dim.size() - 1); + } else { + new_x_dim[new_x_dim.size() - 1] *= x_dim[i]; + } + } + x_dim = new_x_dim; + origin_reduce_dims = new_reduce_dims; + int x_rank = static_cast(x_dim.size()); + std::set left_set, reduce_set; + for (int i = 0; i < x_rank; ++i) left_set.insert(i); + + for (auto e : origin_reduce_dims) { + left_set.erase(e); + reduce_set.insert(e); + } + + std::vector reduce_dim(reduce_set.begin(), reduce_set.end()); + std::vector left_dim(left_set.begin(), left_set.end()); + + std::vector x_strides = detail::GetStrides(x_dim); + std::vector reduce_strides = detail::GetStrides(x_dim, reduce_dim); + std::vector left_strides = detail::GetStrides(x_dim, left_dim); + int reduce_num = reduce_strides[0] * x_dim[reduce_dim[0]]; + int left_num = 1; + if (left_dim.size()) left_num = left_strides[0] * x_dim[left_dim[0]]; + + std::vector y_dim(left_dim.size()); + for (int i = 0; i < left_dim.size(); ++i) { + y_dim[i] = x_dim[left_dim[i]]; + } + auto x_data = x.data(); + auto y_data = y->mutable_data(x.place()); + if (reduce_num == 1) return; + +#define CUB_BLOCK_DIM_CASE(block_dim) \ + case block_dim: { \ + constexpr auto kBlockDim = block_dim; \ + detail::TensorReduceImpl( \ + x_data, y_data, x.place(), reducer, transformer, init, left_num, \ + reduce_num, x_strides, reduce_dim, reduce_strides, left_dim, \ + left_strides, stream); \ + } break + + switch (detail::GetDesiredBlockDim(reduce_num)) { + CUB_BLOCK_DIM_CASE(512); + CUB_BLOCK_DIM_CASE(256); + CUB_BLOCK_DIM_CASE(128); + CUB_BLOCK_DIM_CASE(64); + CUB_BLOCK_DIM_CASE(32); + CUB_BLOCK_DIM_CASE(16); + CUB_BLOCK_DIM_CASE(8); + CUB_BLOCK_DIM_CASE(4); + CUB_BLOCK_DIM_CASE(2); + } +#undef CUB_BLOCK_DIM_CASE +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h index 75a3662316462a222760bfbb7d7906c70f46d143..d8e9cee85bd734c2ed4b1cae03ecee04e304b651 100644 --- a/paddle/fluid/operators/distributed/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc_client.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include +#include #include // NOLINT #include // NOLINT diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h index 3dbbd75b1e945208395c42ace3235db7891936c5..5be7095acd3c5ac6f880a8a26c246f60a93643b5 100644 --- a/paddle/fluid/operators/distributed/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -15,6 +15,7 @@ #pragma once #include +#include // NOLINT #include #include diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h index d88e8c640ffb5ea44e88318cc973c9a783862435..f3e61e1575ced0b9ffbad23e6973121daca9751b 100644 --- a/paddle/fluid/operators/distributed/rpc_server.h +++ b/paddle/fluid/operators/distributed/rpc_server.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include // NOLINT diff --git a/paddle/fluid/operators/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused_embedding_fc_lstm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0b917a403620e2ffb2cbb4ca7856cce9584e1eef --- /dev/null +++ b/paddle/fluid/operators/fused_embedding_fc_lstm_op.cc @@ -0,0 +1,604 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fused_embedding_fc_lstm_op.h" +#include +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/operators/math/fc_compute.h" +#include "paddle/fluid/operators/math/sequence2batch.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { + +void FusedEmbeddingFCLSTMOp::InferShape( + framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("Embeddings"), + "Assert only one Input(Embeddings) of LSTM."); + PADDLE_ENFORCE(ctx->HasInput("WeightH"), + "Assert only one Input(WeightH) of LSTM."); + PADDLE_ENFORCE(ctx->HasInput("Bias"), "Assert only one Input(Bias) of LSTM."); + PADDLE_ENFORCE(ctx->HasOutput("XX"), "Assert only one Output(XX) of LSTM."); + PADDLE_ENFORCE(ctx->HasOutput("Hidden"), + "Assert only one Output(Hidden) of LSTM."); + PADDLE_ENFORCE(ctx->HasOutput("Cell"), + "Assert only one Output(Cell) of LSTM."); + PADDLE_ENFORCE(ctx->HasInput("Ids"), + "Input(Ids) of LookupTableOp should not be null."); + + auto table_dims = ctx->GetInputDim("Embeddings"); + auto ids_dims = ctx->GetInputDim("Ids"); + int ids_rank = ids_dims.size(); + + PADDLE_ENFORCE_EQ(table_dims.size(), 2); + PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1, + "The last dimension of the 'Ids' tensor must be 1."); + + auto x_dims = ctx->GetInputDim("Ids"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(Ids)'s rank must be 2."); + + if (ctx->HasInput("H0")) { + PADDLE_ENFORCE(ctx->HasInput("C0"), + "Input(Cell) and Input(Hidden) of LSTM should not " + "be null at the same time."); + auto h_dims = ctx->GetInputDim("H0"); + auto c_dims = ctx->GetInputDim("C0"); + PADDLE_ENFORCE(h_dims == c_dims, + "The dimension of Input(H0) and Input(C0) " + "should be the same."); + } + + auto embeddings_dims = ctx->GetInputDim("Embeddings"); + PADDLE_ENFORCE_EQ(embeddings_dims.size(), 2, + "The rank of Input(Embeddings) should be 2."); + + auto wh_dims = ctx->GetInputDim("WeightH"); + int frame_size = wh_dims[1] / 4; + PADDLE_ENFORCE_EQ(wh_dims.size(), 2, + "The rank of Input(WeightH) should be 2."); + PADDLE_ENFORCE_EQ(wh_dims[0], frame_size, + "The first dimension of Input(WeightH) " + "should be %d.", + frame_size); + PADDLE_ENFORCE_EQ(wh_dims[1], 4 * frame_size, + "The second dimension of Input(WeightH) " + "should be 4 * %d.", + frame_size); + + auto b_dims = ctx->GetInputDim("Bias"); + PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2."); + PADDLE_ENFORCE_EQ(b_dims[0], 1, + "The first dimension of Input(Bias) should be 1."); + PADDLE_ENFORCE_EQ( + b_dims[1], (ctx->Attrs().Get("use_peepholes") ? 7 : 4) * frame_size, + "The second dimension of Input(Bias) should be " + "7 * %d if enable peepholes connection or" + "4 * %d if disable peepholes", + frame_size, frame_size); + + framework::DDim out_dims({x_dims[0], frame_size}); + ctx->SetOutputDim("Hidden", out_dims); + ctx->SetOutputDim("Cell", out_dims); + ctx->ShareLoD("Ids", "Hidden"); + ctx->ShareLoD("Ids", "Cell"); + int xx_width; + if (ctx->Attrs().Get("use_seq")) { + xx_width = wh_dims[1]; + } else { + xx_width = x_dims[1] > wh_dims[1] ? wh_dims[1] : x_dims[1]; + PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"), + "Assert only one Output(BatchedInput) of LSTM."); + PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"), + "Assert only one Output(BatchedHidden) of LSTM."); + PADDLE_ENFORCE(ctx->HasOutput("BatchedCell"), + "Assert only one Output(BatchedCell) of LSTM."); + PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"), + "Assert only one Output(ReorderedH0) of LSTM"); + PADDLE_ENFORCE(ctx->HasOutput("ReorderedC0"), + "Assert only one Output(ReorderedC0) of LSTM."); + ctx->SetOutputDim("BatchedInput", {x_dims[0], wh_dims[1]}); + ctx->SetOutputDim("BatchedHidden", out_dims); + ctx->SetOutputDim("BatchedCell", out_dims); + } + ctx->SetOutputDim("XX", {x_dims[0], xx_width}); + ctx->ShareLoD("Ids", "XX"); +} + +framework::OpKernelType FusedEmbeddingFCLSTMOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + return framework::OpKernelType( + framework::ToDataType( + ctx.Input("Embeddings")->type()), + ctx.device_context()); +} + +void FusedEmbeddingFCLSTMOpMaker::Make() { + AddInput("Ids", + "An input with type int32 or int64 " + "contains the ids to be looked up in W. " + "The last dimension size must be 1."); + AddInput("Embeddings", + "(Tensor) the learnable weights of X." + " - The shape is (M x 4D), where M is the dim size of x, D is the " + "hidden size. " + " - Weight = {W_cx, W_ix, W_fx, W_ox}"); + AddInput("WeightH", + "(Tensor) same as LSTMOp, the learnable hidden-hidden weights." + " - The shape is (D x 4D), where D is the hidden size. " + " - Weight = {W_ch, W_ih, W_fh, W_oh}"); + AddInput("Bias", + "(Tensor) the learnable weights. Almost same as LSTMOp" + "Note: we should add the fc bias into this (1x4D) in bias." + "input-hidden bias weight and peephole connections weight if " + "setting `use_peepholes` True. " + "1. `use_peepholes = False` " + " - The shape is (1 x 4D). " + " - Bias = {b_c, b_i, b_f, b_o}." + "2. `use_peepholes = True` " + " - The shape is (1 x 7D). " + " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}."); + AddInput("H0", + "(Tensor, optional) (same as LSTMOp) the initial hidden state is an " + "optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size and D is the hidden size.") + .AsDispensable(); + AddInput("C0", + "(Tensor, optional) (same as LSTMOp) (the initial cell state is an " + "optional " + "input. This is a tensor with shape (N x D), where N is the " + "batch size. `H0` and `C0` can be NULL but only at the same time.") + .AsDispensable(); + AddOutput("Hidden", + "(LoDTensor) (same as LSTMOp) the hidden state of LSTM operator. " + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("Cell", + "(LoDTensor) (same as LSTMOp) the cell state of LSTM operator. " + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("XX", + "(LoDTensor) the result after X * WeightX (size is T x 4D)" + " or batched_X (size is T x M), this will be automatically chosen," + " where T is the total time steps in this mini-batch," + " D is the hidden size, M is the dim size of x input.") + .AsIntermediate(); + AddOutput("BatchedInput", "(LoDTensor) (T x 4D).").AsIntermediate(); + AddOutput("BatchedHidden", "(LoDTensor) (T x D).").AsIntermediate(); + AddOutput("BatchedCell", "(LoDTensor) (T x D).").AsIntermediate(); + AddOutput("ReorderedH0", "(LoDTensor) (N x D).").AsIntermediate(); + AddOutput("ReorderedC0", "(LoDTensor) (N x D).").AsIntermediate(); + AddAttr("use_peepholes", + "(bool, defalut: True) " + "whether to enable diagonal/peephole connections.") + .SetDefault(true); + AddAttr("is_reverse", + "(bool, defalut: False) " + "whether to compute reversed LSTM.") + .SetDefault(false); + AddAttr("use_seq", + "(bool, defalut: True) " + "whether to use seq mode to compute.") + .SetDefault(true); + AddAttr("gate_activation", + "(string, default: sigmoid)" + "The activation for input gate, forget gate and output " + "gate, `sigmoid` by default.") + .SetDefault("sigmoid") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("cell_activation", + "(string, default: tanh)" + "The activation for cell output, `tanh` by defalut.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("candidate_activation", + "(string, default: tanh)" + "The activation for candidate hidden state, " + "`tanh` by default.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddComment(R"DOC( +Fusion Long-Short Term Memory (LSTM) Operator. +This operator fuse the X into LSTM, more details can refer to LSTM op. +)DOC"); +} + +template +class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { + public: +#define INIT_VEC_FUNC \ + std::function act_gate, act_cell, act_cand; \ + auto& act_gate_str = ctx.Attr("gate_activation"); \ + auto& act_cell_str = ctx.Attr("cell_activation"); \ + auto& act_cand_str = ctx.Attr("candidate_activation"); \ + if (platform::jit::MayIUse(platform::jit::avx)) { \ + math::VecActivations act_functor; \ + act_gate = act_functor(act_gate_str); \ + act_cell = act_functor(act_cell_str); \ + act_cand = act_functor(act_cand_str); \ + } else { \ + math::VecActivations act_functor; \ + act_gate = act_functor(act_gate_str); \ + act_cell = act_functor(act_cell_str); \ + act_cand = act_functor(act_cand_str); \ + } + +#define INIT_BASE_INPUT_OUTPUT \ + auto* ids = ctx.Input("Ids"); \ + auto* h0 = ctx.Input("H0"); \ + auto* c0 = ctx.Input("C0"); \ + auto* embeddings = ctx.Input("Embeddings"); \ + auto* wh = ctx.Input("WeightH"); \ + auto* bias = ctx.Input("Bias"); \ + auto* xx = ctx.Output("XX"); \ + auto* hidden_out = ctx.Output("Hidden"); \ + auto* cell_out = ctx.Output("Cell"); \ + bool is_reverse = ctx.Attr("is_reverse"); \ + bool use_peepholes = ctx.Attr("use_peepholes"); + +#define INIT_BASE_SIZES \ + auto ids_dims = ids->dims(); /* T x M*/ \ + auto ids_numel = ids->numel(); /* T x 1*/ \ + auto wh_dims = wh->dims(); /* D x 4D*/ \ + const int D = wh_dims[0]; \ + const int D2 = D * 2; \ + const int D3 = D * 3; \ + int64_t row_number = embeddings->dims()[0]; \ + int64_t row_width = embeddings->dims()[1]; \ + const int D4 = wh_dims[1]; + +#define INIT_BASE_INPUT_DATAS \ + const int64_t* ids_data = ids->data(); \ + const T* embeddings_data = embeddings->data(); \ + const T* wh_data = wh->data(); \ + /* diagonal weight*/ \ + const T* wc_data = bias->data() + D4; \ + /* for peephole only*/ \ + Tensor checked_cell; \ + T* checked_cell_data = nullptr; \ + auto place = ctx.GetPlace(); \ + if (use_peepholes) { \ + /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ + checked_cell_data = checked_cell.mutable_data({2, D}, place); \ + } + +/// Compute LSTM +#define GEMM_WH_ADDON(bs, prev, out) \ + blas.GEMM(CblasNoTrans, CblasNoTrans, bs, D4, D, static_cast(1), prev, D, \ + wh_data, D4, static_cast(1), out, D4) + +// gates: W_ch, W_ih, W_fh, W_oh +#define GET_Ct(ct_1, gates, ct) \ + /* C_t = C_t-1 * fgated + cand_gated * igated*/ \ + act_cand(D, gates, gates); \ + blas.VMUL(D, gates, gates + D, gates + D); \ + blas.VMUL(D, ct_1, gates + D2, gates + D2); \ + blas.VADD(D, gates + D, gates + D2, ct) + +#define GET_Ht(ct, gates, ht) \ + /* H_t = act_cell(C_t) * ogated */ \ + act_cell(D, ct, gates + D2); \ + blas.VMUL(D, gates + D2, gates + D3, ht) + +#define GET_Ct_NOH0C0(gates, ct) \ + /* C_t = igated * cgated*/ \ + act_gate(D, gates + D, gates + D); \ + act_cand(D, gates, gates); \ + blas.VMUL(D, gates, gates + D, ct) + +#define COMPUTE_CtHt_NOH0C0(gates, ct, ht) \ + GET_Ct_NOH0C0(gates, ct); \ + act_gate(D, gates + D3, gates + D3); \ + GET_Ht(ct, gates, ht) + +#define COMPUTE_CtHt_PEEPHOLE_NOH0C0(gates, ct, ht) \ + GET_Ct_NOH0C0(gates, ct); \ + /* get outgated, put W_oc * C_t on igated */ \ + blas.VMUL(D, wc_data + D2, ct, gates + D); \ + blas.VADD(D, gates + D, gates + D3, gates + D3); \ + act_gate(D, gates + D3, gates + D3); \ + GET_Ht(ct, gates, ht) + +#define COMPUTE_CtHt(gates, ct_1, ct, ht) \ + act_gate(D3, gates + D, gates + D); \ + GET_Ct(ct_1, gates, ct); \ + GET_Ht(ct, gates, ht) + +#define COMPUTE_CtHt_PEEPHOLE(gates, ct_1, ct, ht) \ + /* get fgated and igated*/ \ + blas.VMUL(D, wc_data, ct_1, checked_cell_data); \ + blas.VMUL(D, wc_data + D, ct_1, checked_cell_data + D); \ + blas.VADD(D2, checked_cell_data, gates + D, gates + D); \ + act_gate(D2, gates + D, gates + D); \ + GET_Ct(ct_1, gates, ct); \ + /* get ogated*/ \ + blas.VMUL(D, wc_data + D2, ct, gates + D); \ + blas.VADD(D, gates + D, gates + D3, gates + D3); \ + act_gate(D, gates + D3, gates + D3); \ + GET_Ht(ct, gates, ht) + + void SeqCompute(const framework::ExecutionContext& ctx) const { + using DeviceContext = paddle::platform::CPUDeviceContext; + INIT_BASE_INPUT_OUTPUT + INIT_BASE_SIZES + INIT_VEC_FUNC + INIT_BASE_INPUT_DATAS + + // std::cout << "====> SeqCompute" << std::endl; + auto ids_lod = ids->lod(); + const int total_T = ids_dims[0]; + const int N = ids_lod[0].size() - 1; + const T* h0_data = h0 ? h0->data() : nullptr; + const T* c0_data = c0 ? c0->data() : nullptr; + T* xx_data = xx->mutable_data(place); + T* h_out_data = hidden_out->mutable_data(place); + T* c_out_data = cell_out->mutable_data(place); + auto blas = math::GetBlas(ctx); + + for (int64_t i = 0; i < ids_numel; ++i) { + PADDLE_ENFORCE_LT(ids_data[i], row_number); + PADDLE_ENFORCE_GE(ids_data[i], 0, "ids %d", i); + memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width, + row_width * sizeof(T)); + } + + int xx_offset = D4; + int gate_offset = D; + if (is_reverse) { + const int offset = (total_T - 1) * D; + xx_data = xx_data + offset * 4; + h_out_data = h_out_data + offset; + c_out_data = c_out_data + offset; + xx_offset = -D4; + gate_offset = -D; + } + +#define MOVE_ONE_STEP \ + prev_h_data = h_out_data; \ + prev_c_data = c_out_data; \ + xx_data = xx_data + xx_offset; \ + h_out_data = h_out_data + gate_offset; \ + c_out_data = c_out_data + gate_offset + +#define PROCESS_H0C0_DEFINES \ + int bid = is_reverse ? N - 1 - i : i; \ + int seq_len = ids_lod[0][bid + 1] - ids_lod[0][bid]; \ + const T* prev_c_data = nullptr; \ + const T* prev_h_data = nullptr; \ + int tstart = 0 + +#define PROCESS_H0C0_PEEPHOLE \ + PROCESS_H0C0_DEFINES; \ + if (h0_data) { \ + prev_h_data = h0_data + bid * D; \ + prev_c_data = c0_data + bid * D; \ + } else { \ + COMPUTE_CtHt_PEEPHOLE_NOH0C0(xx_data, c_out_data, h_out_data); \ + MOVE_ONE_STEP; \ + tstart = 1; \ + } + +#define PROCESS_H0C0 \ + PROCESS_H0C0_DEFINES; \ + if (h0_data) { \ + prev_h_data = h0_data + bid * D; \ + prev_c_data = c0_data + bid * D; \ + } else { \ + COMPUTE_CtHt_NOH0C0(xx_data, c_out_data, h_out_data); \ + MOVE_ONE_STEP; \ + tstart = 1; \ + } + + if (use_peepholes) { + for (int i = 0; i < N; ++i) { + PROCESS_H0C0_PEEPHOLE + for (int step = tstart; step < seq_len; ++step) { + GEMM_WH_ADDON(1, prev_h_data, xx_data); + COMPUTE_CtHt_PEEPHOLE(xx_data, prev_c_data, c_out_data, h_out_data); + MOVE_ONE_STEP; + } + } + } else { + for (int i = 0; i < N; ++i) { + PROCESS_H0C0 + for (int step = tstart; step < seq_len; ++step) { + GEMM_WH_ADDON(1, prev_h_data, xx_data); + COMPUTE_CtHt(xx_data, prev_c_data, c_out_data, h_out_data); + MOVE_ONE_STEP; + } + } + } +#undef PROCESS_H0C0_DEFINES +#undef PROCESS_H0C0_PEEPHOLE +#undef PROCESS_H0C0 +#undef MOVE_ONE_STEP + } + + void BatchCompute(const framework::ExecutionContext& ctx) const { + using DeviceContext = platform::CPUDeviceContext; + INIT_BASE_INPUT_OUTPUT + if (ids->lod()[0].size() == 2) { + SeqCompute(ctx); + return; + } + INIT_BASE_SIZES + INIT_VEC_FUNC + INIT_BASE_INPUT_DATAS + + // std::cout << "===> Batch Compute" << std::endl; + + auto* reordered_h0 = ctx.Output("ReorderedH0"); + auto* reordered_c0 = ctx.Output("ReorderedC0"); + auto* batched_input = ctx.Output("BatchedInput"); + auto* batched_c_out = ctx.Output("BatchedCell"); + auto* batched_h_out = ctx.Output("BatchedHidden"); + T* xx_data = xx->mutable_data(place); + T* batched_input_data = batched_input->mutable_data(place); + T* batched_c_out_data = batched_c_out->mutable_data(place); + T* batched_h_out_data = batched_h_out->mutable_data(place); + hidden_out->mutable_data(place); + cell_out->mutable_data(place); + + math::LoDTensor2BatchFunctor to_batch; + auto& dev_ctx = ctx.template device_context(); + auto blas = math::GetBlas(dev_ctx); + + for (int64_t i = 0; i < ids_numel; ++i) { + PADDLE_ENFORCE_LT(ids_data[i], row_number); + PADDLE_ENFORCE_GE(ids_data[i], 0, "ids %d", i); + memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width, + row_width * sizeof(T)); + } + + to_batch(dev_ctx, *xx, batched_input, true, is_reverse); + + auto batched_lod = batched_input->lod(); + const auto& seq_order = batched_lod[2]; + const int max_bs = seq_order.size(); + reordered_h0->Resize({max_bs, D}); + reordered_c0->Resize({max_bs, D}); + + int tstart = 0; + T* prev_h_data = nullptr; + T* prev_c_data = nullptr; + if (h0) { + // reorder h0, c0 + T* reordered_h0_data = reordered_h0->mutable_data(place); + T* reordered_c0_data = reordered_c0->mutable_data(place); + const T* h0_data = h0->data(); + const T* c0_data = c0->data(); + prev_h_data = reordered_h0_data; + prev_c_data = reordered_c0_data; + size_t sz = sizeof(T) * D; + for (int i = 0; i < max_bs; ++i) { + std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz); + std::memcpy(reordered_c0_data, c0_data + seq_order[i] * D, sz); + reordered_h0_data += D; + reordered_c0_data += D; + } + } else { + // compute without h0, c0 + T* cur_in_data = batched_input_data; + T* cur_h_out_data = batched_h_out_data; + T* cur_c_out_data = batched_c_out_data; + for (int i = 0; i < max_bs; ++i) { + GET_Ct_NOH0C0(cur_in_data, cur_c_out_data); + if (use_peepholes) { + blas.VMUL(D, wc_data + D2, cur_c_out_data, cur_in_data + D); + blas.VADD(D, cur_in_data + D, cur_in_data + D3, cur_in_data + D3); + } + act_gate(D, cur_in_data + D3, cur_in_data + D3); + GET_Ht(cur_c_out_data, cur_in_data, cur_h_out_data); + cur_in_data += D4; + cur_c_out_data += D; + cur_h_out_data += D; + } + tstart = 1; + prev_h_data = batched_h_out_data; + prev_c_data = batched_c_out_data; + } + const auto& batch_starts = batched_lod[0]; + const int max_seq_len = batch_starts.size() - 1; + const int offset = tstart * max_bs * D; + batched_input_data = batched_input_data + offset * 4; + batched_h_out_data = batched_h_out_data + offset; + batched_c_out_data = batched_c_out_data + offset; + +#define DEFINE_CUR \ + T* cur_in_data = batched_input_data; \ + T* cur_prev_c_data = prev_c_data; \ + T* cur_c_out_data = batched_c_out_data; \ + T* cur_h_out_data = batched_h_out_data + +#define MOVE_ONE_BATCH \ + cur_in_data += D4; \ + cur_prev_c_data += D; \ + cur_c_out_data += D; \ + cur_h_out_data += D + +#define MOVE_ONE_STEP \ + prev_c_data = batched_c_out_data; \ + prev_h_data = batched_h_out_data; \ + batched_c_out_data = cur_c_out_data; \ + batched_h_out_data = cur_h_out_data; \ + batched_input_data = cur_in_data + + if (use_peepholes) { + for (int step = tstart; step < max_seq_len; ++step) { + const int cur_bs = batch_starts[step + 1] - batch_starts[step]; + GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data); + DEFINE_CUR; + for (int i = 0; i < cur_bs; ++i) { + COMPUTE_CtHt_PEEPHOLE(cur_in_data, cur_prev_c_data, cur_c_out_data, + cur_h_out_data); + MOVE_ONE_BATCH; + } + MOVE_ONE_STEP; + } + } else { + for (int step = tstart; step < max_seq_len; ++step) { + const int cur_bs = batch_starts[step + 1] - batch_starts[step]; + GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data); + DEFINE_CUR; + for (int i = 0; i < cur_bs; ++i) { + COMPUTE_CtHt(cur_in_data, cur_prev_c_data, cur_c_out_data, + cur_h_out_data); + MOVE_ONE_BATCH; + } + MOVE_ONE_STEP; + } + } +#undef MOVE_ONE_STEP +#undef MOVE_ONE_BATCH +#undef DEFINE_CUR + + math::Batch2LoDTensorFunctor to_seq; + batched_h_out->set_lod(batched_lod); + to_seq(dev_ctx, *batched_h_out, hidden_out); + batched_c_out->set_lod(batched_lod); + to_seq(dev_ctx, *batched_c_out, cell_out); + } + + void Compute(const framework::ExecutionContext& ctx) const override { + if (ctx.Attr("use_seq")) { + SeqCompute(ctx); + } else { + BatchCompute(ctx); + } + } + +#undef COMPUTE_CtHt_PEEPHOLE +#undef COMPUTE_CtHt +#undef GET_Ct_NOH0C0 +#undef COMPUTE_CtHt_NOH0C0 +#undef COMPUTE_CtHt_PEEPHOLE_NOH0C0 +#undef GET_Ht +#undef GET_Ct +#undef GEMM_WH_ADDON +#undef INIT_BASE_INPUT_DATAS +#undef INIT_BASE_SIZES +#undef INIT_BASE_INPUT_OUTPUT +#undef INIT_VEC_FUNC +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fused_embedding_fc_lstm, ops::FusedEmbeddingFCLSTMOp, + ops::FusedEmbeddingFCLSTMOpMaker, + paddle::framework::DefaultGradOpDescMaker); + +REGISTER_OP_CPU_KERNEL(fused_embedding_fc_lstm, + ops::FusedEmbeddingFCLSTMKernel, + ops::FusedEmbeddingFCLSTMKernel); diff --git a/paddle/fluid/operators/fused_embedding_fc_lstm_op.h b/paddle/fluid/operators/fused_embedding_fc_lstm_op.h new file mode 100644 index 0000000000000000000000000000000000000000..2775b2ac04d2890355fe6d75a1e2507a2668dc95 --- /dev/null +++ b/paddle/fluid/operators/fused_embedding_fc_lstm_op.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +class FusedEmbeddingFCLSTMOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class FusedEmbeddingFCLSTMOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/fluid/operators/math/depthwise_conv.cu index 027e2de48d229761f12f974dc73625c8ea1b3567..3be389912307f7aac6dda6d1018943eb8f08696d 100644 --- a/paddle/fluid/operators/math/depthwise_conv.cu +++ b/paddle/fluid/operators/math/depthwise_conv.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -20,149 +21,268 @@ namespace paddle { namespace operators { namespace math { +template +__inline__ __device__ T warpReduceSum(T val) { +#if CUDA_VERSION < 9000 + for (int offset = 16; offset > 0; offset /= 2) + val += __shfl_down(val, offset); + return val; +#else +#define FULL_MASK 0xffffffff + for (int offset = 16; offset > 0; offset /= 2) + val += __shfl_down_sync(FULL_MASK, val, offset); + return val; +#endif +} +__forceinline__ __device__ unsigned lane_id() { + unsigned ret; + asm volatile("mov.u32 %0, %laneid;" : "=r"(ret)); + return ret; +} + +__forceinline__ __device__ unsigned warp_id() { + unsigned ret; + asm volatile("mov.u32 %0, %warpid;" : "=r"(ret)); + return ret; +} + // A Cuda kernel to compute the depthwise convolution forward pass // in NCHW format. template -__global__ void KernelDepthwiseConv( - const int nthreads, const T* const input_data, const T* const filter_data, - const int batch_size, const int output_channels, const int output_height, - const int output_width, const int input_channels, const int input_height, - const int input_width, const int filter_multiplier, const int filter_height, +__device__ __inline__ void KernelDepthwiseConv( + const T* const input_data, const T* const filter_data, const int batch_size, + const int output_channels, const int output_height, const int output_width, + const int input_channels, const int input_height, const int input_width, + const int filter_multiplier, const int filter_height, const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, T* const output_data) { - int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - - if (index < nthreads) { - const int batch = index / output_channels / output_height / output_width; - const int c_out = (index / output_height / output_width) % output_channels; - const int h_out = (index / output_width) % output_height; - const int w_out = index % output_width; - - const int c_in = c_out / filter_multiplier; - const T* weight = filter_data + c_out * filter_height * filter_width; - T value = 0; - const int h_in_start = -padding_height + h_out * stride_height; - const int w_in_start = -padding_width + w_out * stride_width; - const int h_in_end = h_in_start + filter_height; - const int w_in_end = w_in_start + filter_width; - - const int in_offset = - ((batch * input_channels + c_in) * input_height) * input_width; - - const int h_end = h_in_end < input_height ? h_in_end : input_height; - const int w_end = w_in_end < input_width ? w_in_end : input_width; - const int h_start = h_in_start > 0 ? h_in_start : 0; - const int w_start = w_in_start > 0 ? w_in_start : 0; - - for (int h_in = h_start; h_in < h_end; h_in++) { - for (int w_in = w_start; w_in < w_end; w_in++) { - const int offset = in_offset + h_in * input_width + w_in; - value += - weight[(h_in - h_in_start) * filter_width + (w_in - w_in_start)] * - input_data[offset]; + const int padding_height, const int padding_width, const int dilate_height, + const int dilate_width, T* const output_data) { + for (int w_out = threadIdx.x; w_out < output_width; w_out += blockDim.x) { + for (int h_out = threadIdx.y; h_out < output_height; h_out += blockDim.y) { + const int batch = blockIdx.y; + const int c_out = blockIdx.x; + + const int c_in = c_out / filter_multiplier; + const T* weight = filter_data + c_out * filter_height * filter_width; + T value = 0; + const int h_in_start = -padding_height + h_out * stride_height; + const int w_in_start = -padding_width + w_out * stride_width; + const int h_in_end = h_in_start + filter_height * dilate_height; + const int w_in_end = w_in_start + filter_width * dilate_width; + + const int in_offset = + ((batch * input_channels + c_in) * input_height) * input_width; + + const int h_end = h_in_end < input_height ? h_in_end : input_height; + const int w_end = w_in_end < input_width ? w_in_end : input_width; + const int h_start = h_in_start > 0 ? h_in_start : 0; + const int w_start = w_in_start > 0 ? w_in_start : 0; + int weight_offset = 0; + + for (int h_in = h_in_start; h_in < h_in_end; h_in += dilate_height) { + for (int w_in = w_in_start; w_in < w_in_end; w_in += dilate_width) { + if (h_in >= h_start && h_in < h_end && w_in >= w_start && + w_in < w_end) { + const int offset = in_offset + h_in * input_width + w_in; + value += weight[weight_offset] * input_data[offset]; + } + weight_offset++; + } } + int index = + ((batch * gridDim.x + c_out) * output_height + h_out) * output_width + + w_out; + output_data[index] = value; } - output_data[index] = value; } } +template +__global__ void KernelDepthwiseConvSp( + const T* const input_data, const T* const filter_data, const int batch_size, + const int output_channels, const int output_height, const int output_width, + const int input_channels, const int input_height, const int input_width, + const int filter_multiplier, const int filter_height, + const int filter_width, const int stride_height, const int stride_width, + const int padding_height, const int padding_width, const int dilate_height, + const int dilate_width, T* const output_data) { + if (c_filter_multiplier == 0) + KernelDepthwiseConv(input_data, filter_data, batch_size, output_channels, + output_height, output_width, input_channels, + input_height, input_width, filter_multiplier, + filter_height, filter_width, stride_height, + stride_width, padding_height, padding_width, + dilate_height, dilate_width, output_data); + + else + KernelDepthwiseConv(input_data, filter_data, batch_size, output_channels, + output_height, output_width, input_channels, + input_height, input_width, c_filter_multiplier, + filter_height, filter_height, c_stride, c_stride, + padding_height, padding_width, dilate_height, + dilate_width, output_data); +} + // CUDA kernel to compute the depthwise convolution backprop w.r.t input. template -__global__ void KernelDepthwiseConvInputGrad( - const int nthreads, const T* const output_grad_data, - const T* const filter_data, const int batch_size, const int output_channels, - const int output_height, const int output_width, const int input_channels, - const int input_height, const int input_width, const int filter_multiplier, - const int filter_height, const int filter_width, const int stride_height, - const int stride_width, const int padding_height, const int padding_width, - T* const input_grad_data) { - int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - if (index < nthreads) { - const int batch = index / input_channels / input_height / input_width; - const int c_in = (index / input_height / input_width) % input_channels; - const int h_in = (index / input_width) % input_height; - const int w_in = index % input_width; - - const int c_out_start = c_in * filter_multiplier; - - int h_out_start = - (h_in - filter_height + padding_height + stride_height) / stride_height; - h_out_start = 0 > h_out_start ? 0 : h_out_start; - - int h_out_end = (h_in + padding_height) / stride_height; - h_out_end = output_height - 1 < h_out_end ? output_height - 1 : h_out_end; - - int w_out_start = - (w_in - filter_width + padding_width + stride_width) / stride_width; - w_out_start = 0 > w_out_start ? 0 : w_out_start; - - int w_out_end = (w_in + padding_width) / stride_width; - w_out_end = output_width - 1 < w_out_end ? output_width - 1 : w_out_end; - - T value = 0; - - for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier; - c_out++) { - for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) { - const int filter_h = h_in + padding_height - h_out * stride_height; - for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) { - const int filter_w = w_in + padding_width - w_out * stride_width; - const int filter_offset = c_out * filter_height * filter_width + - filter_h * filter_width + filter_w; - const int output_grad_offset = - ((batch * output_channels + c_out) * output_height + h_out) * - output_width + - w_out; - value += - output_grad_data[output_grad_offset] * filter_data[filter_offset]; +__device__ __inline__ void KernelDepthwiseConvInputGrad( + const T* const output_grad_data, const T* const filter_data, + const int batch_size, const int output_channels, const int output_height, + const int output_width, const int input_channels, const int input_height, + const int input_width, const int filter_multiplier, const int filter_height, + const int filter_width, const int stride_height, const int stride_width, + const int padding_height, const int padding_width, const int dilate_height, + const int dilate_width, T* const input_grad_data) { + for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) { + for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) { + const int batch = blockIdx.y; + const int c_in = blockIdx.x; + + const int c_out_start = c_in * filter_multiplier; + + int h_out_start = + h_in - (filter_height - 1) * dilate_height + padding_height; + + int h_out_end = h_in + padding_height; + + int w_out_start = + w_in - (filter_width - 1) * dilate_width + padding_width; + + int w_out_end = w_in + padding_width; + + T value = 0; + + for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier; + c_out++) { + int filter_offset = (c_out + 1) * filter_height * filter_width; + for (int h_out = h_out_start; h_out <= h_out_end; + h_out += dilate_height) { + for (int w_out = w_out_start; w_out <= w_out_end; + w_out += dilate_width) { + filter_offset--; + int s_h_out = h_out / stride_height; + int s_w_out = w_out / stride_width; + if (h_out % stride_height == 0 && w_out % stride_width == 0 && + s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 && + s_w_out < output_width) { + const int output_grad_offset = + ((batch * output_channels + c_out) * output_height + + s_h_out) * + output_width + + s_w_out; + value += output_grad_data[output_grad_offset] * + filter_data[filter_offset]; + } + } } } + int index = + ((batch * gridDim.x + c_in) * input_height + h_in) * input_width + + w_in; + input_grad_data[index] = value; } - input_grad_data[index] += value; } } +template +__global__ void KernelDepthwiseConvInputGradSp( + const T* const output_grad_data, const T* const filter_data, + const int batch_size, const int output_channels, const int output_height, + const int output_width, const int input_channels, const int input_height, + const int input_width, const int filter_multiplier, const int filter_height, + const int filter_width, const int stride_height, const int stride_width, + const int padding_height, const int padding_width, const int dilate_height, + const int dilate_width, T* const input_grad_data) { + if (c_filter_multiplier == 0) + KernelDepthwiseConvInputGrad( + output_grad_data, filter_data, batch_size, output_channels, + output_height, output_width, input_channels, input_height, input_width, + filter_multiplier, filter_height, filter_width, stride_height, + stride_width, padding_height, padding_width, dilate_height, + dilate_width, input_grad_data); + else + KernelDepthwiseConvInputGrad( + output_grad_data, filter_data, batch_size, output_channels, + output_height, output_width, input_channels, input_height, input_width, + c_filter_multiplier, filter_height, filter_width, c_stride, c_stride, + padding_height, padding_width, dilate_height, dilate_width, + input_grad_data); +} + // Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. template -__global__ void KernelDepthwiseConvFilterGrad( - const int nthreads, const T* const output_grad_data, - const T* const input_data, const int num, const int output_channels, - const int output_height, const int output_width, const int input_channels, - const int input_height, const int input_width, const int filter_multiplier, - const int filter_height, const int filter_width, const int stride_height, - const int stride_width, const int padding_height, const int padding_width, - T* const filter_grad_data) { - int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - if (index < nthreads) { - const int w_out = index % output_width; - const int h_out = (index / output_width) % output_height; - const int c_out = (index / output_width / output_height) % output_channels; - const int batch = (index / output_width / output_height / output_channels); - const int c_in = c_out / filter_multiplier; - const int h_in_start = -padding_height + h_out * stride_height; - const int w_in_start = -padding_width + w_out * stride_width; - const int h_in_end = - -padding_height + h_out * stride_height + filter_height; - const int w_in_end = -padding_width + w_out * stride_width + filter_width; - const int in_offset = - (batch * input_channels + c_in) * input_height * input_width; - - T* addr_offset = filter_grad_data + c_out * filter_height * filter_width; - const int h_end = h_in_end < input_height ? h_in_end : input_height; - const int w_end = w_in_end < input_width ? w_in_end : input_width; - const int h_start = h_in_start > 0 ? h_in_start : 0; - const int w_start = w_in_start > 0 ? w_in_start : 0; - - for (int h_in = h_start; h_in < h_end; h_in++) { - for (int w_in = w_start; w_in < w_end; w_in++) { - const int offset = in_offset + h_in * input_width + w_in; - const T diff_temp = output_grad_data[index] * input_data[offset]; - T* addr = addr_offset + (h_in - h_in_start) * filter_width + - (w_in - w_in_start); - paddle::platform::CudaAtomicAdd(addr, diff_temp); +__device__ __inline__ void KernelDepthwiseConvFilterGrad( + const T* output_grad_data, const T* input_data, const int num, + const int output_channels, const int output_height, const int output_width, + const int input_channels, const int input_height, const int input_width, + const int filter_multiplier, const int filter_height, + const int filter_width, const int stride_height, const int stride_width, + const int padding_height, const int padding_width, const int dilate_height, + const int dilate_width, T* filter_grad_data) { + T s = 0; + + int gbid = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x; + int lid = lane_id(); + + for (int image_w = threadIdx.x; image_w < output_width; + image_w += blockDim.x) { + for (int bid = 0; bid < num; bid++) { + for (int image_h = threadIdx.y; image_h < output_height; + image_h += blockDim.y) { + int kernel_id = blockIdx.z; + int kernel_h = blockIdx.y * dilate_height - padding_height; + int kernel_w = blockIdx.x * dilate_width - padding_width; + + int image_hk = image_h * stride_height + kernel_h; + int image_wk = image_w * stride_width + kernel_w; + if (image_hk < 0 || image_hk >= input_height) continue; + if (image_wk < 0 || image_wk >= input_width) continue; +#define gaid(N, C, H, W) \ + ((((N)*gridDim.z + (C)) * output_height + (H)) * output_width + (W)) + + s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] * + input_data[((bid * (gridDim.z / filter_multiplier) + + kernel_id / filter_multiplier) * + input_height + + image_hk) * + input_width + + image_wk]; + +#undef gaid } } } +#if __CUDA_ARCH__ >= 530 + s = warpReduceSum(s); + if (lid == 0) paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s); +#else + paddle::platform::CudaAtomicAdd(&filter_grad_data[gbid], s); +#endif +} + +template +__global__ void KernelDepthwiseConvFilterGradSp( + const T* output_grad_data, const T* input_data, const int num, + const int output_channels, const int output_height, const int output_width, + const int input_channels, const int input_height, const int input_width, + const int filter_multiplier, const int filter_height, + const int filter_width, const int stride_height, const int stride_width, + const int padding_height, const int padding_width, const int dilate_height, + const int dilate_width, T* filter_grad_data) { + if (c_filter_multiplier == 0) + KernelDepthwiseConvFilterGrad( + output_grad_data, input_data, num, output_channels, output_height, + output_width, input_channels, input_height, input_width, + filter_multiplier, filter_height, filter_width, stride_height, + stride_width, padding_height, padding_width, dilate_height, + dilate_width, filter_grad_data); + else + KernelDepthwiseConvFilterGrad( + output_grad_data, input_data, num, output_channels, output_height, + output_width, input_channels, input_height, input_width, + c_filter_multiplier, filter_height, filter_width, stride_height, + stride_width, padding_height, padding_width, dilate_height, + dilate_width, filter_grad_data); } /* @@ -177,7 +297,9 @@ class DepthwiseConvFunctor { const framework::Tensor& input, const framework::Tensor& filter, const std::vector& strides, - const std::vector& paddings, framework::Tensor* output) { + const std::vector& paddings, + const std::vector& dilations, + framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -191,22 +313,37 @@ class DepthwiseConvFunctor { const int stride_width = strides[1]; const int padding_height = paddings[0]; const int padding_width = paddings[1]; + const int dilate_height = dilations[0]; + const int dilate_width = dilations[1]; const T* input_data = input.data(); const T* filter_data = filter.data(); T* output_data = output->mutable_data(context.GetPlace()); - int nthreads = batch_size * output_channels * output_height * output_width; - int blocks = (nthreads + 1024 - 1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocks, 1); - - KernelDepthwiseConv<<>>( - nthreads, input_data, filter_data, batch_size, output_channels, - output_height, output_width, input_channels, input_height, input_width, - output_channels / input_channels, ksize_height, ksize_width, - stride_height, stride_width, padding_height, padding_width, - output_data); + int thread = 512; + int blocks = std::min(std::max(thread / output_width, 1), output_height); + dim3 threads(std::min(output_width, thread), blocks, 1); + dim3 grid(output_channels, batch_size, 1); + int filter_multiplier = output_channels / input_channels; +#define check_case(c_filter_multiplier, c_stride) \ + if (c_filter_multiplier == 0 || \ + filter_multiplier == c_filter_multiplier && \ + stride_height == stride_width && stride_height == c_stride) { \ + KernelDepthwiseConvSp<<>>( \ + input_data, filter_data, batch_size, output_channels, output_height, \ + output_width, input_channels, input_height, input_width, \ + filter_multiplier, ksize_height, ksize_width, stride_height, \ + stride_width, padding_height, padding_width, dilate_height, \ + dilate_width, output_data); \ + return; \ + } + check_case(1, 1); + check_case(1, 2); + // NOTE(liangdun): 0,0 for other case + // add other case if needed, e.g. check_case(2^n,1) + check_case(0, 0); +#undef check_case } }; @@ -219,6 +356,7 @@ class DepthwiseConvInputGradFunctor { const framework::Tensor& output_grad, const std::vector& strides, const std::vector& paddings, + const std::vector& dilations, framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -233,22 +371,39 @@ class DepthwiseConvInputGradFunctor { const int stride_width = strides[1]; const int padding_height = paddings[0]; const int padding_width = paddings[1]; + const int dilate_height = dilations[0]; + const int dilate_width = dilations[1]; const T* filter_data = filter.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); - int nthreads = batch_size * input_channels * input_height * input_width; - int blocks = (nthreads + 1024 - 1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocks, 1); - - KernelDepthwiseConvInputGrad<<>>( - nthreads, output_grad_data, filter_data, batch_size, output_channels, - output_height, output_width, input_channels, input_height, input_width, - output_channels / input_channels, ksize_height, ksize_width, - stride_height, stride_width, padding_height, padding_width, - input_grad_data); + int thread = 512; + int blocks = std::min(std::max(thread / input_width, 1), input_height); + dim3 threads(std::min(input_width, thread), blocks, 1); + dim3 grid(input_channels, batch_size, 1); + int filter_multiplier = output_channels / input_channels; + +#define check_case(c_filter_multiplier, c_stride) \ + if (c_filter_multiplier == 0 || \ + filter_multiplier == c_filter_multiplier && \ + stride_height == stride_width && stride_height == c_stride) { \ + KernelDepthwiseConvInputGradSp< \ + T, c_filter_multiplier, \ + c_stride><<>>( \ + output_grad_data, filter_data, batch_size, output_channels, \ + output_height, output_width, input_channels, input_height, \ + input_width, filter_multiplier, ksize_height, ksize_width, \ + stride_height, stride_width, padding_height, padding_width, \ + dilate_height, dilate_width, input_grad_data); \ + return; \ + } + check_case(1, 1); + check_case(1, 2); + // NOTE(liangdun): 0,0 for other case + // add other case if needed, e.g. check_case(2^n,1) + check_case(0, 0); +#undef check_case } }; @@ -260,6 +415,7 @@ class DepthwiseConvFilterGradFunctor { const framework::Tensor& output_grad, const std::vector& strides, const std::vector& paddings, + const std::vector& dilations, framework::Tensor* filter_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -274,23 +430,34 @@ class DepthwiseConvFilterGradFunctor { const int stride_width = strides[1]; const int padding_height = paddings[0]; const int padding_width = paddings[1]; + const int dilate_height = dilations[0]; + const int dilate_width = dilations[1]; const T* input_data = input.data(); const T* output_grad_data = output_grad.data(); T* filter_grad_data = filter_grad->mutable_data(context.GetPlace()); - int nthreads = batch_size * output_channels * output_height * output_width; - - int blocks = (nthreads + 1024 - 1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocks, 1); - - KernelDepthwiseConvFilterGrad<<>>( - nthreads, output_grad_data, input_data, batch_size, output_channels, - output_height, output_width, input_channels, input_height, input_width, - output_channels / input_channels, ksize_height, ksize_width, - stride_height, stride_width, padding_height, padding_width, - filter_grad_data); + int block_size = 512; + int crop_output_height = + std::min(std::max(block_size / output_width, 1), output_height); + dim3 grid(ksize_width, ksize_height, output_channels); + dim3 threads(std::min(output_width, block_size), crop_output_height, 1); + int filter_multiplier = output_channels / input_channels; + +#define check_case(c_filter_multiplier) \ + if (c_filter_multiplier == 0 || c_filter_multiplier == filter_multiplier) { \ + KernelDepthwiseConvFilterGradSp< \ + T, c_filter_multiplier><<>>( \ + output_grad_data, input_data, batch_size, output_channels, \ + output_height, output_width, input_channels, input_height, \ + input_width, filter_multiplier, ksize_height, ksize_width, \ + stride_height, stride_width, padding_height, padding_width, \ + dilate_height, dilate_width, filter_grad_data); \ + return; \ + } + check_case(1); + check_case(0); +#undef check_case } }; diff --git a/paddle/fluid/operators/math/depthwise_conv.h b/paddle/fluid/operators/math/depthwise_conv.h index 97aec401889a56d3fc9ac08e766d931bb3725b01..71f6fcb23df1942d6dcf7177165f2ec1022a9b35 100644 --- a/paddle/fluid/operators/math/depthwise_conv.h +++ b/paddle/fluid/operators/math/depthwise_conv.h @@ -32,7 +32,8 @@ class DepthwiseConvFunctor { void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& filter, const std::vector& strides, - const std::vector& paddings, framework::Tensor* output); + const std::vector& paddings, + const std::vector& dilations, framework::Tensor* output); }; template @@ -43,6 +44,7 @@ class DepthwiseConvInputGradFunctor { const framework::Tensor& output_grad, const std::vector& strides, const std::vector& paddings, + const std::vector& dilations, framework::Tensor* input_grad); }; @@ -53,6 +55,7 @@ class DepthwiseConvFilterGradFunctor { const framework::Tensor& output_grad, const std::vector& strides, const std::vector& paddings, + const std::vector& dilations, framework::Tensor* filter_grad); }; diff --git a/paddle/fluid/operators/reduce_mean_op.cu b/paddle/fluid/operators/reduce_mean_op.cu index 960cb3235be7f4cc98b97d3b088ceaeb3d4a4209..59b30244839849d79e3e531953134633503c4090 100644 --- a/paddle/fluid/operators/reduce_mean_op.cu +++ b/paddle/fluid/operators/reduce_mean_op.cu @@ -12,17 +12,64 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include +#include "paddle/fluid/operators/cub_reduce.h" #include "paddle/fluid/operators/reduce_mean_op.h" -REGISTER_OP_CUDA_KERNEL(reduce_mean, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +namespace paddle { +namespace operators { + +template +struct DivideFunctor { + HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {} + + HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; } + + private: + T n_inv; +}; + +template +class ReduceMeanKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + + auto dims = context.Attr>("dim"); + bool keep_dim = context.Attr("keep_dim"); + + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(input->dims().size()); + for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + } else { + for (auto e : dims) { + reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); + } + } + + int reduce_num = 1; + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_num *= input->dims()[reduce_dims[i]]; + } + + auto stream = context.cuda_device_context().stream(); + TensorReduce>( + *input, output, reduce_dims, static_cast(0), cub::Sum(), + DivideFunctor(reduce_num), stream); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel, + ops::ReduceMeanKernel, + ops::ReduceMeanKernel, + ops::ReduceMeanKernel); + REGISTER_OP_CUDA_KERNEL( reduce_mean_grad, ops::ReduceGradKernel, diff --git a/paddle/fluid/operators/reduce_sum_op.cu b/paddle/fluid/operators/reduce_sum_op.cu index f2e16955a50dc6a7feda9fbaf968c929ef3d8a4f..53cd9e9419dd9aecee730917ae21d7a4ab332ffc 100644 --- a/paddle/fluid/operators/reduce_sum_op.cu +++ b/paddle/fluid/operators/reduce_sum_op.cu @@ -12,17 +12,59 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/operators/cub_reduce.h" #include "paddle/fluid/operators/reduce_sum_op.h" -REGISTER_OP_CUDA_KERNEL(reduce_sum, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); +namespace paddle { +namespace operators { + +template +struct IdentityFunctor { + HOSTDEVICE explicit inline IdentityFunctor() {} + + HOSTDEVICE inline T operator()(const T& x) const { return x; } +}; + +template +class ReduceSumKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + + auto dims = context.Attr>("dim"); + bool keep_dim = context.Attr("keep_dim"); + + std::vector reduce_dims; + if (reduce_all) { + reduce_dims.resize(input->dims().size()); + for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; + } else { + for (auto e : dims) { + reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); + } + } + + int reduce_num = 1; + for (int i = 0; i < reduce_dims.size(); ++i) { + reduce_num *= input->dims()[reduce_dims[i]]; + } + + auto stream = context.cuda_device_context().stream(); + TensorReduce>( + *input, output, reduce_dims, static_cast(0), cub::Sum(), + IdentityFunctor(), stream); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(reduce_sum, ops::ReduceSumKernel, + ops::ReduceSumKernel, ops::ReduceSumKernel, + ops::ReduceSumKernel); + REGISTER_OP_CUDA_KERNEL( reduce_sum_grad, ops::ReduceGradKernel, diff --git a/paddle/fluid/operators/select_op.cc b/paddle/fluid/operators/select_op.cc deleted file mode 100644 index e71841d4d1815d50cd9800910c9db34e121beffc..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/select_op.cc +++ /dev/null @@ -1,419 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include // NOLINT -#include -#include "paddle/fluid/framework/channel.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/concurrency/channel_util.h" - -#include - -namespace paddle { -namespace operators { - -static constexpr char kX[] = "X"; -static constexpr char kCaseToExecute[] = "case_to_execute"; -static constexpr char kOutputs[] = "Out"; - -static constexpr char kCases[] = "cases"; -static constexpr char kCasesBlock[] = "sub_block"; - -class SelectOp : public framework::OperatorBase { - public: - SelectOp(const std::string &type, const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - - private: - enum class SelectOpCaseType { - DEFAULT = 0, - SEND = 1, - RECEIVE = 2, - }; - - struct SelectOpCase { - int caseIndex; - SelectOpCaseType caseType; - std::string channelName; - std::string varName; - - SelectOpCase() {} - - SelectOpCase(int caseIndex, SelectOpCaseType caseType, - std::string channelName, std::string varName) - : caseIndex(caseIndex), - caseType(caseType), - channelName(channelName), - varName(varName) {} - }; - - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { - std::vector casesConfigs = - Attr>(kCases); - - framework::BlockDesc *casesBlock = - Attr(kCasesBlock); - - framework::Scope &casesBlockScope = scope.NewScope(); - - std::string caseToExecuteVarName = Input(kCaseToExecute); - framework::Variable *caseToExecuteVar = - casesBlockScope.FindVar(caseToExecuteVarName); - - // Construct cases from "conditional_block_op"(s) in the casesBlock - std::vector> cases = - ParseAndShuffleCases(&casesConfigs); - - // Get all unique channels involved in select - std::set channelsSet; - for (auto c : cases) { - if (!c->channelName.empty()) { - auto channelVar = scope.FindVar(c->channelName); - framework::ChannelHolder *ch = - channelVar->GetMutable(); - - if (channelsSet.find(ch) == channelsSet.end()) { - channelsSet.insert(ch); - } - } - } - - // Order all channels by their pointer address - std::vector channels(channelsSet.begin(), - channelsSet.end()); - std::sort(channels.begin(), channels.end()); - - // Poll all cases - int32_t caseToExecute = pollCases(&scope, &cases, channels); - - // At this point, the case to execute has already been determined, - // so we can proceed with executing the cases block - framework::LoDTensor *caseToExecuteTensor = - caseToExecuteVar->GetMutable(); - caseToExecuteTensor->data()[0] = caseToExecute; - - // Execute the cases block, only one case will be executed since we set the - // case_to_execute value to the index of the case we want to execute - framework::Executor executor(dev_place); - framework::ProgramDesc *program = casesBlock->Program(); - executor.Run(*program, &casesBlockScope, casesBlock->ID(), - false /*create_local_scope*/); - } - - /** - * Goes through all operators in the casesConfigs and processes - * "conditional_block" operators. These operators are mapped to our - * SelectOpCase objects. We randomize the case orders, and set the - * default case (if any exists) as the last case) - * @param casesBlock - * @return - */ - std::vector> ParseAndShuffleCases( - std::vector *casesConfigs) const { - std::vector> cases; - std::shared_ptr defaultCase; - - if (casesConfigs != nullptr) { - boost::char_delimiters_separator sep(false, ",", ""); - for (std::vector::iterator itr = casesConfigs->begin(); - itr < casesConfigs->end(); ++itr) { - std::string caseConfig = *itr; - boost::tokenizer<> tokens(caseConfig, sep); - - boost::tokenizer<>::iterator tok_iter = tokens.begin(); - PADDLE_ENFORCE(tok_iter != tokens.end(), "Cannot get case index"); - std::string caseIndexString = *tok_iter; - int caseIndex = std::stoi(caseIndexString); - - ++tok_iter; - PADDLE_ENFORCE(tok_iter != tokens.end(), "Cannot get case type"); - std::string caseTypeString = *tok_iter; - SelectOpCaseType caseType = (SelectOpCaseType)std::stoi(caseTypeString); - - std::string caseChannel; - std::string caseChannelVar; - - ++tok_iter; - if (caseType != SelectOpCaseType::DEFAULT) { - PADDLE_ENFORCE(tok_iter != tokens.end(), "Cannot get case channel"); - caseChannel = *tok_iter; - - ++tok_iter; - PADDLE_ENFORCE(tok_iter != tokens.end(), - "Cannot get case channel variable"); - caseChannelVar = *tok_iter; - } - - auto c = std::make_shared(caseIndex, caseType, - caseChannel, caseChannelVar); - - if (caseType == SelectOpCaseType::DEFAULT) { - PADDLE_ENFORCE(defaultCase == nullptr, - "Select can only contain one default case."); - defaultCase = c; - } else { - cases.push_back(c); - } - } - } - - // Randomly sort cases, with default case being last - std::random_shuffle(cases.begin(), cases.end()); - if (defaultCase != nullptr) { - cases.push_back(defaultCase); - } - - return cases; - } - - /** - * This method will recursively poll the cases and determines if any case - * condition is true. - * If none of the cases conditions are true (and there is no default case), - * then block - * the thread. The thread may be woken up by a channel operation, at which - * point we - * execute the case. - * @param scope - * @param cases - * @param channels - * @return - */ - int32_t pollCases(const framework::Scope *scope, - std::vector> *cases, - std::vector channels) const { - // Lock all involved channels - lockChannels(channels); - - std::atomic caseToExecute(-1); - - std::vector>::iterator it = cases->begin(); - while (it != cases->end()) { - std::shared_ptr c = *it; - - auto chVar = scope->FindVar(c->channelName); - framework::ChannelHolder *ch = - chVar->GetMutable(); - - switch (c->caseType) { - case SelectOpCaseType::SEND: - PADDLE_ENFORCE(!ch->IsClosed(), "Cannot send to a closed channel"); - if (ch->CanSend()) { - // We can send to channel directly, send the data to channel - // and execute case - auto chVar = scope->FindVar(c->varName); - concurrency::ChannelSend(ch, chVar); - caseToExecute = c->caseIndex; - } - break; - case SelectOpCaseType::RECEIVE: - if (ch->CanReceive()) { - // We can receive from channel directly, send the data to channel - // and execute case - auto chVar = scope->FindVar(c->varName); - concurrency::ChannelReceive(ch, chVar); - caseToExecute = c->caseIndex; - } - break; - case SelectOpCaseType::DEFAULT: - caseToExecute = c->caseIndex; - break; - } - - if (caseToExecute != -1) { - // We found a case to execute, stop looking at other case statements - break; - } - - ++it; - } - - if (caseToExecute == -1) { - // None of the cases are eligible to execute, enqueue current thread - // into all the sending/receiving queue of each involved channel - std::atomic completed(false); - std::recursive_mutex mutex; - std::unique_lock lock{mutex}; - // std::condition_variable_any selectCond; - auto selectCond = std::make_shared(); - - std::recursive_mutex callbackMutex; - pushThreadOnChannelQueues(scope, cases, selectCond, &caseToExecute, - &completed, &callbackMutex); - - // TODO(thuan): Atomically unlock all channels and sleep current thread - unlockChannels(channels); - selectCond->wait(lock, [&completed]() { return completed.load(); }); - - // Select has been woken up by case operation - lockChannels(channels); - removeThreadOnChannelQueues(scope, cases); - - if (caseToExecute == -1) { - // Recursively poll cases, since we were woken up by a channel close - // TODO(thuan): Need to test if this is a valid case - unlockChannels(channels); - return pollCases(scope, cases, channels); - } - } - - // At this point, caseToExecute != -1, and we can proceed with executing - // the case block - unlockChannels(channels); - - return caseToExecute; - } - - void lockChannels(std::vector chs) const { - std::vector::iterator it = chs.begin(); - while (it != chs.end()) { - framework::ChannelHolder *ch = *it; - ch->Lock(); - ++it; - } - } - - void unlockChannels(std::vector chs) const { - std::vector::reverse_iterator it = chs.rbegin(); - while (it != chs.rend()) { - framework::ChannelHolder *ch = *it; - ch->Unlock(); - ++it; - } - } - - void pushThreadOnChannelQueues( - const framework::Scope *scope, - std::vector> *cases, - std::shared_ptr rCond, - std::atomic *caseToExecute, std::atomic *completed, - std::recursive_mutex *callbackMutex) const { - std::vector>::iterator it = cases->begin(); - while (it != cases->end()) { - std::shared_ptr c = *it; - - auto chVar = scope->FindVar(c->channelName); - framework::ChannelHolder *ch = - chVar->GetMutable(); - - std::function cb = - [&caseToExecute, &completed, &callbackMutex, - c](framework::ChannelAction channelAction) { - std::lock_guard lock{*callbackMutex}; - - bool canProcess = false; - if (!(*completed)) { - // If the channel wasn't closed, we set the caseToExecute index - // as this current case - if (channelAction != framework::ChannelAction::CLOSE) { - *caseToExecute = c->caseIndex; - } - // This will allow our conditional variable to break out of wait - *completed = true; - canProcess = true; - } - - return canProcess; - }; - - switch (c->caseType) { - case SelectOpCaseType::SEND: { - auto chOutputVar = scope->FindVar(c->varName); - concurrency::ChannelAddToSendQ(ch, this, chOutputVar, rCond, cb); - break; - } - case SelectOpCaseType::RECEIVE: { - auto chOutputVar = scope->FindVar(c->varName); - concurrency::ChannelAddToReceiveQ(ch, this, chOutputVar, rCond, cb); - break; - } - default: - break; - } - ++it; - } - } - - void removeThreadOnChannelQueues( - const framework::Scope *scope, - std::vector> *cases) const { - std::vector>::iterator it = cases->begin(); - while (it != cases->end()) { - std::shared_ptr c = *it; - - auto chVar = scope->FindVar(c->channelName); - framework::ChannelHolder *ch = - chVar->GetMutable(); - switch (c->caseType) { - case SelectOpCaseType::SEND: { - ch->RemoveFromSendQ(this); - break; - } - case SelectOpCaseType::RECEIVE: { - ch->RemoveFromReceiveQ(this); - break; - } - default: - break; - } - ++it; - } - } -}; - -class SelectOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput(kX, - "A set of variables, which are required by operators inside the " - "cases of Select Op") - .AsDuplicable(); - AddInput(kCaseToExecute, - "(Int) The variable the sets the index of the case to execute, " - "after evaluating the channels being sent to and received from") - .AsDuplicable(); - AddOutput(kOutputs, - "A set of variables, which will be assigned with values " - "generated by the operators inside the cases of Select Op.") - .AsDuplicable(); - AddAttr>(kCases, - "(String vector) Serialized list of" - "all cases in the select op. Each" - "case is serialized as: " - "',,,'" - "where type is 0 for default, 1 for" - "send, and 2 for receive" - "No channel and values are needed for" - "default cases."); - AddAttr(kCasesBlock, - "The cases block inside select_op"); - AddComment(R"DOC( -)DOC"); - } -}; - -// TODO(thuan): Implement Gradient Operator for SELECT_OP - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(select, paddle::operators::SelectOp, - paddle::framework::EmptyGradOpMaker, - paddle::operators::SelectOpMaker); diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index c7c533bd42859c374c4783d43ec4cdd34a6a994a..4ea0cd7283b55649dbdbbf97f81f10c69ac6a1d2 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -55,7 +55,7 @@ extern void *cublas_dso_handle; struct DynLoad__##__name { \ template \ inline cublasStatus_t operator()(Args... args) { \ - return __name(args...); \ + return ::__name(args...); \ } \ }; \ extern DynLoad__##__name __name diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index 0103e7a3accf88f3c83f109298010c3c9af3d549..e6353f67ef118072a2d8e49111e8ecc486589998 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#define GLOG_NO_ABBREVIATED_SEVERITIES +#define GOOGLE_GLOG_DLL_DECL +#include #include #include // NOLINT @@ -47,13 +50,13 @@ extern void EnforceCUDNNLoaded(const char* fn_name); #else -#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ - return __name(args...); \ - } \ - }; \ +#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline cudnnStatus_t operator()(Args... args) { \ + return ::__name(args...); \ + } \ + }; \ extern DynLoad__##__name __name #endif diff --git a/paddle/fluid/platform/dynload/curand.h b/paddle/fluid/platform/dynload/curand.h index 2daf1b4215ce1f7f771bbac72bfe103b0b941976..0bb300ec33076d9ddfaf69190f14131279cc888e 100644 --- a/paddle/fluid/platform/dynload/curand.h +++ b/paddle/fluid/platform/dynload/curand.h @@ -44,7 +44,7 @@ extern void *curand_dso_handle; struct DynLoad__##__name { \ template \ curandStatus_t operator()(Args... args) { \ - return __name(args...); \ + return ::__name(args...); \ } \ }; \ extern DynLoad__##__name __name diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 6a3ad2151081504fda2a3818c5f99ad47039d91d..cc5cda6106c188f3156d33480b5d3641eed32556 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -107,7 +107,11 @@ static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path, static inline void* GetDsoHandleFromSearchPath(const std::string& search_root, const std::string& dso_name, bool throw_on_error = true) { +#if !defined(_WIN32) int dynload_flags = RTLD_LAZY | RTLD_LOCAL; +#else + int dynload_flags = 0; +#endif // !_WIN32 void* dso_handle = nullptr; std::string dlPath = dso_name; @@ -117,10 +121,15 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root, // search xxx.so from custom path dlPath = join(search_root, dso_name); dso_handle = dlopen(dlPath.c_str(), dynload_flags); +#if !defined(_WIN32) + auto errorno = dlerror(); +#else + auto errorno = GetLastError(); +#endif // !_WIN32 // if not found, search from default path if (nullptr == dso_handle) { LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " (" - << dlerror() << ")"; + << errorno << ")"; if (dlPath.find("nccl") != std::string::npos) { std::cout << "You may need to install 'nccl2' from NVIDIA official website: " @@ -139,10 +148,15 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root, "export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, " "using the DYLD_LIBRARY_PATH is impossible unless System " "Integrity Protection (SIP) is disabled."; +#if !defined(_WIN32) + auto errorno = dlerror(); +#else + auto errorno = GetLastError(); +#endif // !_WIN32 if (throw_on_error) { - PADDLE_ENFORCE(nullptr != dso_handle, error_msg, dlPath, dlerror()); + PADDLE_ENFORCE(nullptr != dso_handle, error_msg, dlPath, errorno); } else if (nullptr == dso_handle) { - LOG(WARNING) << string::Sprintf(error_msg, dlPath, dlerror()); + LOG(WARNING) << string::Sprintf(error_msg, dlPath, errorno); } return dso_handle; diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index a5bc44122028c1191f511157bdde2e7c2d30c6aa..3b22718a8c6f994dbc2dc3e7aaa19a7163f716ba 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -214,7 +214,6 @@ void BindVarDsec(pybind11::module *m) { .def("set_shapes", &pd::VarDesc::SetShapes) .def("set_dtype", &pd::VarDesc::SetDataType) .def("set_dtypes", &pd::VarDesc::SetDataTypes) - .def("set_capacity", &pd::VarDesc::SetCapacity) .def("shape", &pd::VarDesc::GetShape, pybind11::return_value_policy::reference) .def("shapes", &pd::VarDesc::GetShapes, @@ -251,7 +250,6 @@ void BindVarDsec(pybind11::module *m) { .value("STEP_SCOPES", pd::proto::VarType::STEP_SCOPES) .value("LOD_RANK_TABLE", pd::proto::VarType::LOD_RANK_TABLE) .value("LOD_TENSOR_ARRAY", pd::proto::VarType::LOD_TENSOR_ARRAY) - .value("CHANNEL", pd::proto::VarType::CHANNEL) .value("PLACE_LIST", pd::proto::VarType::PLACE_LIST) .value("READER", pd::proto::VarType::READER) .value("RAW", pd::proto::VarType::RAW); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index ef2f1f2a20a2eddee8ac077ee4bbf4dfd777448d..295af1c5837d70c32b522cc47c8c3e12d5bd61c7 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -21,7 +21,6 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/channel.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/framework.pb.h" diff --git a/paddle/fluid/string/pretty_log.h b/paddle/fluid/string/pretty_log.h index a3b4e38f453835828a4a53130e11c854ac3f4a74..10c9eb80d0a7e07d5974ca10d740e71e7717b5c5 100644 --- a/paddle/fluid/string/pretty_log.h +++ b/paddle/fluid/string/pretty_log.h @@ -56,13 +56,13 @@ struct Style { }; template -static void PrettyLogEndl(const std::string& style, const char* fmt, - const Args&... args) { +static void PrettyLogEndl(const std::string &style, const char *fmt, + const Args &... args) { std::cerr << style << Sprintf(fmt, args...) << reset() << std::endl; } template -static void PrettyLog(const std::string& style, const char* fmt, - const Args&... args) { +static void PrettyLog(const std::string &style, const char *fmt, + const Args &... args) { std::cerr << style << Sprintf(fmt, args...) << reset(); } diff --git a/paddle/legacy/trainer/tests/CMakeLists.txt b/paddle/legacy/trainer/tests/CMakeLists.txt index 08548bea4c4a7fc4fa99d9305208abd4ee442572..fbefcced5643b65372072856bfeb6c87cd4071a8 100644 --- a/paddle/legacy/trainer/tests/CMakeLists.txt +++ b/paddle/legacy/trainer/tests/CMakeLists.txt @@ -16,7 +16,11 @@ endfunction() trainer_test(test_Compare) trainer_test(test_PyDataProviderWrapper) trainer_test(test_recurrent_machine_generation) -trainer_test(test_Trainer) +if(NOT APPLE) + trainer_test(test_Trainer) +else() + message(WARNING "These tests has been disabled in OSX for random fail: \n test_Trainer") +endif() ############### test_TrainerOnePass ########################## if(WITH_PYTHON) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 7d2fb7c6ce9e6a89df2c777323fc6a547fc227f4..02eb3dbfd7750b47904b70de4f4fd3757404d29c 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -395,7 +395,7 @@ EOF ctest --output-on-failure -j $1 # make install should also be test when unittest make install -j 8 - pip install /usr/local/opt/paddle/share/wheels/*.whl + pip install ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then paddle version fi @@ -654,11 +654,21 @@ function gen_fluid_inference_lib() { if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then cat <