Commit 0514882b authored by nhzlx

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_ut_for_trt

@@ -136,10 +136,6 @@ def parse_args():
'--no_random',
action='store_true',
help='If set, keep the random seed and do not shuffle the data.')
parser.add_argument(
'--use_lars',
action='store_true',
help='If set, use lars for optimizers, ONLY support resnet module.')
parser.add_argument(
'--reduce_strategy',
type=str,
...
@@ -200,11 +200,6 @@ def get_model(args, is_train, main_prog, startup_prog):
# configure optimize
optimizer = None
if is_train:
if args.use_lars:
lars_decay = 1.0
else:
lars_decay = 0.0
total_images = 1281167 / trainer_count
step = int(total_images / (args.batch_size * args.gpus) + 1)
...
@@ -224,11 +224,6 @@ def get_model(args, is_train, main_prog, startup_prog):
# configure optimize
optimizer = None
if is_train:
if args.use_lars:
lars_decay = 1.0
else:
lars_decay = 0.0
total_images = 1281167 / trainer_count
step = int(total_images / args.batch_size + 1)
...
@@ -244,11 +244,6 @@ def get_model(args, is_train, main_prog, startup_prog):
optimizer = None
if is_train:
if args.use_lars:
lars_decay = 1.0
else:
lars_decay = 0.0
total_images = 1281167 / trainer_count
step = int(total_images / args.batch_size + 1)
@@ -262,8 +257,7 @@ def get_model(args, is_train, main_prog, startup_prog):
learning_rate=fluid.layers.piecewise_decay(
boundaries=bd, values=lr),
momentum=0.9,
regularization=fluid.regularizer.L2Decay(1e-4),
LARS_weight_decay=lars_decay)
regularization=fluid.regularizer.L2Decay(1e-4))
optimizer.minimize(avg_cost)
if args.memory_optimize:
...
@@ -29,7 +29,7 @@ INCLUDE(ExternalProject)
SET(MKLML_PROJECT "extern_mklml")
IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL))
MESSAGE(STATUS "use pre defined download url")
SET(MKLML_VER "mklml_lnx_2018.0.3.20180406" CACHE STRING "" FORCE)
SET(MKLML_VER "mklml_lnx_2019.0.20180710" CACHE STRING "" FORCE)
SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
ENDIF()
MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}")
...
# For Readers and Developers
Thank you for reading the PaddlePaddle documentation.
Since **September 17th, 2018**, the documentation source for **0.15.0 and develop** has been moved to the [Fluiddoc Repo](https://github.com/PaddlePaddle/Paddle) and is maintained there.
Please refer to the Fluiddoc Repo for the latest documentation.
@@ -73,7 +73,6 @@ paddle.fluid.io.load_params ArgSpec(args=['executor', 'dirname', 'main_program',
paddle.fluid.io.load_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True))
paddle.fluid.io.load_inference_model ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.io.get_inference_program ArgSpec(args=['target_vars', 'main_program'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.initializer.ConstantInitializer.__init__ ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False))
paddle.fluid.initializer.UniformInitializer.__init__ ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0))
paddle.fluid.initializer.NormalInitializer.__init__ ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0))
@@ -296,6 +295,7 @@ paddle.fluid.layers.ssd_loss ArgSpec(args=['location', 'confidence', 'gt_box', '
paddle.fluid.layers.detection_map ArgSpec(args=['detect_res', 'label', 'class_num', 'background_label', 'overlap_threshold', 'evaluate_difficult', 'has_state', 'input_states', 'out_states', 'ap_version'], varargs=None, keywords=None, defaults=(0, 0.3, True, None, None, None, 'integral'))
paddle.fluid.layers.rpn_target_assign ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True))
paddle.fluid.layers.anchor_generator ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None))
paddle.fluid.layers.roi_perspective_transform ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,))
paddle.fluid.layers.generate_proposal_labels ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True))
paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None))
paddle.fluid.layers.iou_similarity ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None)
@@ -350,25 +350,25 @@ paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'fi
paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max'))
paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,))
paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0))
paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate'], varargs=None, keywords='kwargs', defaults=None)
paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.optimizer.SGDOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov'], varargs=None, keywords='kwargs', defaults=(False,))
paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None))
paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon'], varargs=None, keywords='kwargs', defaults=(1e-06,))
paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None))
paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon'], varargs=None, keywords='kwargs', defaults=(0.001, 0.9, 0.999, 1e-08))
paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon'], varargs=None, keywords='kwargs', defaults=(0.001, 0.9, 0.999, 1e-08))
paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06))
paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None))
paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power'], varargs=None, keywords='kwargs', defaults=(0.0, 0.0, -0.5))
paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None))
paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06, 0.0, False))
paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None))
paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho'], varargs=None, keywords='kwargs', defaults=(1e-06, 0.95))
paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None))
paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window'], varargs=None, keywords='kwargs', defaults=(10000, 10000))
paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None))
paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.ModelAverage.restore ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None)
...
@@ -148,13 +148,13 @@ if(WITH_DISTRIBUTE)
else()
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
endif()
if (NOT WIN32)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
graph graph_viz_pass multi_devices_graph_pass
multi_devices_graph_print_pass multi_devices_graph_check_pass
fast_threaded_ssa_graph_executor)
fast_threaded_ssa_graph_executor fuse_elewise_add_act_pass)
endif() # NOT WIN32
cc_library(prune SRCS prune.cc DEPS framework_proto)
...
@@ -54,6 +54,8 @@ struct BuildStrategy {
std::string debug_graphviz_path_{""};
bool fuse_elewise_add_act_ops_{false};
bool enable_data_balance_{false};
};
...
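The new fuse_elewise_add_act_ops_ switch sits alongside the other strategy flags above, so turning the fusion on is a one-line change wherever a BuildStrategy is constructed. The sketch below is illustrative only: the two member names come from the diff, while the helper function and the way the strategy is later handed to an executor are assumptions.

#include "paddle/fluid/framework/details/build_strategy.h"

// Hypothetical helper: build a strategy with the new fusion flag enabled.
paddle::framework::details::BuildStrategy MakeFusedBuildStrategy() {
  paddle::framework::details::BuildStrategy strategy;
  strategy.fuse_elewise_add_act_ops_ = true;  // enable elementwise_add + activation fusion
  strategy.enable_data_balance_ = false;      // unrelated flag, left at its default
  return strategy;
}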
@@ -20,41 +20,79 @@ namespace paddle {
namespace framework {
namespace details {
template <class T>
class COWPtr {
public:
typedef std::shared_ptr<T> RefPtr;
private:
RefPtr m_sp;
void detach() {
T* tmp = m_sp.get();
if (!(tmp == nullptr || m_sp.unique())) {
m_sp = RefPtr(new T(*tmp));
}
}
public:
COWPtr() : m_sp(nullptr) {}
explicit COWPtr(T* t) : m_sp(t) {}
explicit COWPtr(const RefPtr& refptr) : m_sp(refptr) {}
const T& Data() const { return operator*(); }
T* MutableData() { return operator->(); }
const T& operator*() const { return *m_sp; }
T& operator*() {
detach();
return *m_sp;
}
const T* operator->() const { return m_sp.operator->(); }
T* operator->() {
detach();
return m_sp.operator->();
}
};
// Change it to thread safe flags if needed.
class ThreadUnsafeOwnershipFlags {
public:
explicit ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {}
ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete;
ThreadUnsafeOwnershipFlags& operator=(
const ThreadUnsafeOwnershipFlags& other) = delete;
ThreadUnsafeOwnershipFlags(ThreadUnsafeOwnershipFlags&& other) = default;
void SetOwnership(bool flag) { flag_ = flag; }
// Invoke the callback if it is not owned.
template <typename Callback>
void AcquireOwnershipOnce(Callback acquire) {
if (!flag_) {
acquire();
flag_ = true;
}
}
private:
bool flag_;
};
// Copy-On-Write pointer.
// It will hold a T* pointer, and only copy once when `MutableData` is invoked.
//
// The template parameter OwnershipFlags should have:
// * a constructor takes a bool. True if own.
// * SetOwnership(bool flag).
// * AcquireOwnershipOnce(Callback). It will invoke the callback if it is not
// owned.
//
// https://en.wikipedia.org/wiki/Copy-on-write
template <typename T, typename OwnershipFlags = ThreadUnsafeOwnershipFlags>
class COWPtr {
public:
// Ctor from raw pointer.
explicit COWPtr(T* ptr) : payload_(ptr), ownership_{true} {}
// Move methods. Steal ownership from origin
COWPtr(COWPtr&& other)
: payload_(other.payload_), ownership_{std::move(other.ownership_)} {}
COWPtr& operator=(COWPtr&& origin) = default;
// Copy methods. Not own payload
COWPtr(const COWPtr& other) : payload_(other.payload_), ownership_{false} {}
COWPtr& operator=(const COWPtr& other) {
payload_ = other.payload_;
ownership_.SetOwnership(false);
return *this;
}
// Access read only data.
const T& Data() const { return *payload_; }
// Access mutable data. If the data is not owned, the data will be copied
// before.
T* MutableData() {
ownership_.AcquireOwnershipOnce(
[this] { payload_.reset(new T(*payload_)); });
return payload_.get();
}
private:
// Actual data pointer.
std::shared_ptr<T> payload_;
// Ownership flag.
OwnershipFlags ownership_;
};
} // namespace details
} // namespace framework
} // namespace paddle
@@ -30,14 +30,6 @@ TEST(COWPtr, all) {
ASSERT_EQ(ptr2.Data(), 10);
}
TEST(COWPtr, change_old) {
COWPtr<int> ptr(new int{0});
COWPtr<int> ptr2 = ptr;
*ptr.MutableData() = 10;
ASSERT_EQ(ptr2.Data(), 0);
ASSERT_EQ(ptr.Data(), 10);
}
} // namespace details
} // namespace framework
} // namespace paddle
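The removed change_old test exercised the old detach-on-write behavior; under the rewritten class, ownership rather than use count decides when a copy is made. A minimal usage sketch (not part of the commit, and assuming the header lives at paddle/fluid/framework/details/cow_ptr.h):

#include <iostream>
#include "paddle/fluid/framework/details/cow_ptr.h"

void CowPtrDemo() {
  using paddle::framework::details::COWPtr;
  COWPtr<int> owner(new int{0});  // constructed from a raw pointer, owns its payload
  COWPtr<int> copy(owner);        // a copy shares the payload but does NOT own it
  *copy.MutableData() = 10;       // first mutable access on a non-owner clones the payload
  std::cout << owner.Data() << " " << copy.Data() << std::endl;  // prints "0 10"
}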
@@ -210,43 +210,6 @@ std::vector<std::string> MultiDevSSAGraphBuilder::FindDistTrainRecvVars(
return recv_vars;
}
bool MultiDevSSAGraphBuilder::IsDistTrainOp(
ir::Node *node, const std::vector<std::string> &send_vars,
const std::vector<std::string> &recv_vars) const {
if (send_vars.size() == 0 || recv_vars.size() == 0) {
return false;
}
/**
* Check any of opvars contains `.block` and in sendvars
*/
auto checker = [](const std::vector<std::string> &opvars,
const std::vector<std::string> &rpc_vars) -> bool {
for (auto &var : opvars) {
// a variable name with the suffix `.block` means it's a splited
// variable by (DistributeTranspiler)
// [python/paddle/fluid/transpiler/distribute_transpiler.py]
if (var.find(".block") != std::string::npos &&
std::find(rpc_vars.begin(), rpc_vars.end(), var) != rpc_vars.end()) {
return true;
}
}
return false;
};
std::vector<std::string> input_var_names;
std::vector<std::string> output_var_names;
for (ir::Node *input : node->inputs) {
input_var_names.push_back(input->Name());
}
for (ir::Node *output : node->outputs) {
output_var_names.push_back(output->Name());
}
return checker(output_var_names, send_vars) ||
checker(input_var_names, recv_vars);
}
size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID(
const std::vector<std::string> &var_names) const {
int64_t numel_sum = 0;
@@ -370,7 +333,9 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
}
}
is_dist_train = true;
} else if (IsDistTrainOp(node, send_vars, recv_vars)) {
} else if (boost::get<int>(node->Op()->GetAttr(
OpProtoAndCheckerMaker::OpRoleAttrName())) ==
static_cast<int>(OpRole::kDist)) {
int op_dev_id = CreateDistTrainOp(&result, node);
if (node->Op()->Type() == "concat") {
auto origin_param_name = node->Op()->OutputArgumentNames()[0];
@@ -736,6 +701,7 @@ int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
.emplace(varname, op_dev_id);
}
} else {
LOG(ERROR) << "got unexpected dist op: " << node->Op()->Type();
PADDLE_THROW(
"the distribute training related op should be in [split_byref, "
"concat].");
...
@@ -51,12 +51,6 @@ class MultiDevSSAGraphBuilder : public ir::Pass {
int CreateRPCOp(ir::Graph *result, ir::Node *node) const;
int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const;
/**
* Is this operator as the end-point operator before/after send operator.
*/
bool IsDistTrainOp(ir::Node *node, const std::vector<std::string> &send_vars,
const std::vector<std::string> &recv_vars) const;
std::vector<std::string> FindDistTrainSendVars(
const std::vector<ir::Node *> &nodes) const;
...
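With IsDistTrainOp removed, the builder above now recognizes distributed-training ops purely by the OpRole attribute checked in the ApplyImpl diff. The sketch below shows the attribute an op description would have to carry for the new branch to fire; the helper function is hypothetical, and the transpiler code that actually sets the role is not part of this diff.

#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/op_proto_maker.h"

// Hypothetical helper: tag an op as a distributed-training op so the
// OpRole::kDist branch in MultiDevSSAGraphBuilder::ApplyImpl handles it.
void MarkAsDistOp(paddle::framework::OpDesc *op) {
  op->SetAttr(paddle::framework::OpProtoAndCheckerMaker::OpRoleAttrName(),
              static_cast<int>(paddle::framework::OpRole::kDist));
}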
@@ -37,6 +37,8 @@ pass_library(fc_lstm_fuse_pass inference)
pass_library(fc_gru_fuse_pass inference)
pass_library(seq_concat_fc_fuse_pass inference)
cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector )
set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library")
cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
...
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h"
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace ir {
std::unique_ptr<ir::Graph> FuseElewiseAddActPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
std::unordered_set<std::string> act_types = {"relu", "scale"};
graph = FuseActElewiseAdd(std::move(graph), act_types);
graph = FuseElewiseAddAct(std::move(graph), act_types);
// backward
{
std::unordered_set<std::string> in_place_act_types = {"relu_grad"};
graph = FuseElewiseAddActInplaceGrad(std::move(graph), in_place_act_types);
}
// Remove the removable intermediate_out.
RemoveIntermediateOut(graph.get());
return graph;
}
// ele_add(x, act(y))
std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddAct(
std::unique_ptr<ir::Graph> graph,
const std::unordered_set<std::string> &act_types) const {
PADDLE_ENFORCE(graph.get());
FusePassBase::Init("elewise_add_act", graph.get());
GraphPatternDetector gpd;
auto *x = gpd.mutable_pattern()
->NewNode("elewise_add_act/x")
->AsInput()
->assert_is_op_input("elementwise_add", "X");
patterns::ElewiseAddAct elewise_add_act_pattern(gpd.mutable_pattern(),
"elementwise_add");
elewise_add_act_pattern(x, act_types);
int found_elewise_add_act_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
Graph *g) {
VLOG(4) << "handle FuseElewiseAddAct fuse";
GET_IR_NODE_FROM_SUBGRAPH(ele_y, ele_y, elewise_add_act_pattern);
GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out,
elewise_add_act_pattern);
GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, elewise_add_act_pattern);
GET_IR_NODE_FROM_SUBGRAPH(act, act, elewise_add_act_pattern);
GET_IR_NODE_FROM_SUBGRAPH(ele_add, ele_add, elewise_add_act_pattern);
std::string ele_x_n = subgraph.at(x)->Name();
std::string ele_y_n = ele_y->Name();
std::string ele_out_n = ele_out->Name();
std::string act_out_n = act_out->Name();
Node *elewise_add_act_node = CreateFuseElewiseAddActNode(
g, act, ele_add, ele_x_n, ele_y_n, ele_out_n, act_out_n);
VLOG(4) << "\n\t " << ele_x_n << " and " << ele_y_n << " -> "
<< ele_add->Name() << " -> " << ele_out_n << "\n"
<< "\t " << ele_out_n << " -> " << act->Name() << " -> "
<< act_out_n;
ReLinkNodes(g, ele_out, ele_add, act, elewise_add_act_node);
found_elewise_add_act_count++;
};
gpd(graph.get(), handler);
AddStatis(found_elewise_add_act_count);
return graph;
}
// act(ele_add(x,y))
std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseActElewiseAdd(
std::unique_ptr<ir::Graph> graph,
const std::unordered_set<std::string> &act_types) const {
PADDLE_ENFORCE(graph.get());
FusePassBase::Init("act_elewise_add", graph.get());
GraphPatternDetector gpd;
auto *x = gpd.mutable_pattern()
->NewNode("act_elewise_add/x")
->AsInput()
->assert_is_ops_input(act_types, "X");
patterns::ActElewiseAdd act_elewise_add_pattern(gpd.mutable_pattern(),
"act_elewise_add");
act_elewise_add_pattern(x, act_types);
int found_elewise_add_act_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
Graph *g) {
VLOG(4) << "handle FuseElewiseAddAct fuse";
GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, act_elewise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(ele_x, ele_x, act_elewise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out,
act_elewise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(act, act, act_elewise_add_pattern);
GET_IR_NODE_FROM_SUBGRAPH(ele_add, ele_add, act_elewise_add_pattern);
std::string act_i_n = subgraph.at(x)->Name();
std::string act_o_n = act_out->Name();
std::string elewise_add_x_n = ele_x->Name();
std::string elewise_add_out_n = ele_out->Name();
Node *elewise_add_act_node = CreateFuseElewiseAddActNode(
g, ele_add, act, elewise_add_x_n, act_i_n, act_o_n, elewise_add_out_n);
VLOG(4) << "\n\t " << act_i_n << " -> " << act->Name() << " -> " << act_o_n
<< "\n\t " << act_o_n << " and " << elewise_add_x_n << " -> "
<< ele_add->Name() << " -> " << elewise_add_out_n;
ReLinkNodes(g, act_out, act, ele_add, elewise_add_act_node);
found_elewise_add_act_count++;
};
gpd(graph.get(), handler);
AddStatis(found_elewise_add_act_count);
return graph;
}
// the backward of act(ele_add(x,y))
// act_grad: in["Out", "Out@GRAD"], out["X@GRAD"]
// ele_add_grad: in["Y", "Out@GRAD"], out["X@GRAD", "Y@GRAD"]
std::unique_ptr<ir::Graph> FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad(
std::unique_ptr<ir::Graph> graph,
const std::unordered_set<std::string> &act_types) const {
PADDLE_ENFORCE(graph.get());
FusePassBase::Init("elewise_add_act_grad", graph.get());
GraphPatternDetector gpd;
auto *d_act_out = gpd.mutable_pattern()
->NewNode("elewise_add_act_grad_inplace/x")
->AsInput()
->assert_is_ops_input(act_types, GradVarName("Out"));
patterns::ElewiseAddActInplaceGrad elewise_add_act_grad_pattern(
gpd.mutable_pattern(), "elewise_add_act_grad_inplace");
elewise_add_act_grad_pattern(d_act_out, act_types);
int found_elewise_add_act_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
Graph *g) {
VLOG(4) << "handle FuseElewiseAddActGrad1 fuse";
GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, elewise_add_act_grad_pattern);
GET_IR_NODE_FROM_SUBGRAPH(act_grad, act_grad, elewise_add_act_grad_pattern);
GET_IR_NODE_FROM_SUBGRAPH(d_itermediate_out, d_itermediate_out,
elewise_add_act_grad_pattern);
GET_IR_NODE_FROM_SUBGRAPH(ele_y, ele_y, elewise_add_act_grad_pattern);
GET_IR_NODE_FROM_SUBGRAPH(ele_add_grad, ele_add_grad,
elewise_add_act_grad_pattern);
GET_IR_NODE_FROM_SUBGRAPH(d_ele_x, d_ele_x, elewise_add_act_grad_pattern);
GET_IR_NODE_FROM_SUBGRAPH(d_ele_y, d_ele_y, elewise_add_act_grad_pattern);
std::string d_act_out_n = subgraph.at(d_act_out)->Name();
std::string act_out_n = act_out->Name();
std::string d_itermediate_out_n = d_itermediate_out->Name();
std::string ele_y_n = ele_y->Name();
std::string d_ele_x_n = d_ele_x->Name();
std::string d_ele_y_n = d_ele_y->Name();
OpDesc desc;
desc.SetType("fused_elemwise_activation_grad");
desc.SetInput("IntermediateOut", {});
desc.SetInput("X", {});
desc.SetInput("Y", std::vector<std::string>({ele_y_n}));
desc.SetInput("Out", std::vector<std::string>({act_out_n}));
desc.SetInput(GradVarName("Out"), std::vector<std::string>({d_act_out_n}));
desc.SetOutput(GradVarName("X"), std::vector<std::string>({d_ele_x_n}));
desc.SetOutput(GradVarName("Y"), std::vector<std::string>({d_ele_y_n}));
desc.SetOutput(GradVarName("IntermediateOut"),
std::vector<std::string>({d_itermediate_out_n}));
desc.SetAttr("save_intermediate_out", false);
desc.SetAttr("functor_list",
std::vector<std::string>(
{act_grad->Op()->Type(), ele_add_grad->Op()->Type()}));
for (auto &n : {act_grad->Op(), ele_add_grad->Op()}) {
for (auto &m_ele : n->GetAttrMap()) {
desc.SetAttr(m_ele.first, m_ele.second);
}
}
auto fused_node = g->CreateOpNode(&desc);
VLOG(4) << "\n\t " << d_act_out_n << " and " << act_out_n << " -> "
<< act_grad->Name() << " -> " << d_itermediate_out_n << "\n\t "
<< d_itermediate_out_n << " and " << act_out_n << " -> "
<< ele_add_grad->Name() << " -> " << d_itermediate_out_n;
ReLinkNodes(g, d_itermediate_out, act_grad, ele_add_grad, fused_node);
found_elewise_add_act_count++;
};
gpd(graph.get(), handler);
AddStatis(found_elewise_add_act_count);
return graph;
}
Node *FuseElewiseAddActPass::CreateFuseElewiseAddActNode(
Graph *g, const Node *op_1, const Node *op_2, const std::string &ele_x_n,
const std::string &ele_y_n, const std::string &ele_out_n,
const std::string &act_out_n) const {
OpDesc desc;
desc.SetInput("X", std::vector<std::string>({ele_x_n}));
desc.SetInput("Y", std::vector<std::string>({ele_y_n}));
desc.SetOutput("Out", std::vector<std::string>({act_out_n}));
desc.SetOutput("IntermediateOut", std::vector<std::string>({ele_out_n}));
desc.SetType("fused_elemwise_activation");
desc.SetAttr("save_intermediate_out", true);
desc.SetAttr("functor_list", std::vector<std::string>(
{op_1->Op()->Type(), op_2->Op()->Type()}));
// Set attrs
for (auto &n : {op_1->Op(), op_2->Op()}) {
for (auto &m_ele : n->GetAttrMap()) {
desc.SetAttr(m_ele.first, m_ele.second);
}
}
auto elewise_add_act_node = g->CreateOpNode(&desc);
return elewise_add_act_node;
}
void FuseElewiseAddActPass::RemoveIntermediateOut(Graph *graph) const {
std::unordered_set<const Node *> need_removed_nodes;
for (auto &cur_node : graph->Nodes()) {
if (cur_node->IsVar()) continue;
if (cur_node->Name() == "fused_elemwise_activation") {
bool save_intermediate_out =
boost::get<bool>(cur_node->Op()->GetAttr("save_intermediate_out"));
auto intermediate_out_args = cur_node->Op()->Output("IntermediateOut");
PADDLE_ENFORCE(
save_intermediate_out && !intermediate_out_args.empty(),
"The %s should save the intermediate_out in the fusing stage.",
cur_node->Name());
// If the intermediate_out's output is empty, it should be removed.
auto cur_node_outputs = cur_node->outputs;
for (auto &out : cur_node_outputs) {
if (out->Name() == intermediate_out_args[0]) {
if (out->outputs.size() == 0) {
cur_node->outputs = this->RemoveNode(out, cur_node->outputs);
need_removed_nodes.insert(std::move(out));
cur_node->Op()->SetAttr("save_intermediate_out", false);
}
}
}
} else if (cur_node->Name() == "fused_elemwise_activation_grad") {
auto intermediate_out_grad_args =
cur_node->Op()->Output(GradVarName("IntermediateOut"));
PADDLE_ENFORCE(
!intermediate_out_grad_args.empty(),
"The %s should save the intermediate_out in the fusing stage.",
cur_node->Name());
auto cur_node_outputs = cur_node->outputs;
// If the intermediate_out_g's output is empty, it should be removed.
for (auto &out : cur_node_outputs) {
if (out->Name() == intermediate_out_grad_args[0] &&
out->outputs.empty()) {
cur_node->Op()->SetOutput(GradVarName("IntermediateOut"), {});
cur_node->outputs = this->RemoveNode(out, cur_node->outputs);
need_removed_nodes.insert(std::move(out));
}
}
}
}
GraphSafeRemoveNodes(graph, need_removed_nodes);
}
void FuseElewiseAddActPass::ReLinkNodes(Graph *graph,
const Node *intermediate_out,
Node *op_1, Node *op_2,
Node *fused_op) const { // delete act
for (auto &in : op_1->inputs) {
fused_op->inputs.emplace_back(in);
in->outputs = this->ReplaceNode(op_1, fused_op, in->outputs);
}
std::unordered_set<const Node *> nodes2delete;
for (auto &out : op_1->outputs) {
if (out->IsCtrlVar()) {
auto result_iter = std::find_if(
op_2->inputs.begin(), op_2->inputs.end(),
[&out](const Node *node) -> bool { return node == out; });
if (result_iter == op_2->inputs.end()) {
IR_OP_VAR_LINK(fused_op, out);
} else {
nodes2delete.emplace(out);
}
} else {
PADDLE_ENFORCE(out == intermediate_out);
IR_OP_VAR_LINK(fused_op, out);
}
}
for (auto &in : op_2->inputs) {
if (in == intermediate_out || nodes2delete.count(in)) {
continue;
}
fused_op->inputs.emplace_back(in);
in->outputs = this->ReplaceNode(op_2, fused_op, in->outputs);
}
for (auto &out : op_2->outputs) {
IR_OP_VAR_LINK(fused_op, out);
}
nodes2delete.insert(std::move(op_1));
nodes2delete.insert(std::move(op_2));
GraphSafeRemoveNodes(graph, nodes2delete);
}
std::vector<Node *> FuseElewiseAddActPass::ReplaceNode(
Node *cur_node, Node *new_node, const std::vector<Node *> &nodes) const {
std::vector<Node *> new_list(nodes.size());
bool has_replaced = false;
std::transform(nodes.begin(), nodes.end(), new_list.begin(),
[&](Node *node) -> Node * {
if (node == cur_node) {
has_replaced = true;
return new_node;
}
return node;
});
PADDLE_ENFORCE(has_replaced, "Not find %s in the node list.",
cur_node->Name());
return new_list;
}
std::vector<Node *> FuseElewiseAddActPass::RemoveNode(
Node *trg_node, const std::vector<Node *> &nodes) const {
std::vector<Node *> new_list(nodes.size());
auto end_iter =
std::copy_if(nodes.begin(), nodes.end(), new_list.begin(),
[&](Node *node) -> bool { return node != trg_node; });
new_list.resize(
static_cast<uint64_t>(std::distance(new_list.begin(), end_iter)));
return new_list;
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(fuse_elewise_add_act_pass,
paddle::framework::ir::FuseElewiseAddActPass);
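Because the pass is registered as fuse_elewise_add_act_pass via REGISTER_PASS above, a caller such as the parallel executor can obtain it from the pass registry and run it over a graph. A minimal sketch, assuming the existing ir::PassRegistry::Instance().Get and Pass::Apply interfaces; only the pass name is taken from this commit.

#include <memory>
#include <utility>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"

// Sketch: fetch the registered pass by name and apply it to an IR graph.
std::unique_ptr<paddle::framework::ir::Graph> RunFuseElewiseAddActPass(
    std::unique_ptr<paddle::framework::ir::Graph> graph) {
  auto pass = paddle::framework::ir::PassRegistry::Instance().Get(
      "fuse_elewise_add_act_pass");
  return pass->Apply(std::move(graph));
}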
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass.h"
namespace paddle {
namespace framework {
namespace ir {
/*
* Fuse the ElewiseAdd and activation
*/
class FuseElewiseAddActPass : public FusePassBase {
public:
virtual ~FuseElewiseAddActPass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
std::unique_ptr<ir::Graph> FuseElewiseAddAct(
std::unique_ptr<ir::Graph> graph,
const std::unordered_set<std::string> &act_types) const;
std::unique_ptr<ir::Graph> FuseActElewiseAdd(
std::unique_ptr<ir::Graph> graph,
const std::unordered_set<std::string> &act_types) const;
std::unique_ptr<ir::Graph> FuseElewiseAddActInplaceGrad(
std::unique_ptr<ir::Graph> graph,
const std::unordered_set<std::string> &act_types) const;
/**
* Remove the removable intermediate_out.
* - If the intermediate_out is only used by the backward op, but the
* backward op doesn't use intermediate_out.
* - If the intermediate_out_grad is not used by any op.
*/
void RemoveIntermediateOut(Graph *graph) const;
std::vector<Node *> ReplaceNode(Node *cur_node, Node *new_node,
const std::vector<Node *> &nodes) const;
std::vector<Node *> RemoveNode(Node *trg_node,
const std::vector<Node *> &nodes) const;
void ReLinkNodes(Graph *graph, const Node *intermediate_out, Node *op_1,
Node *op_2, Node *fused_op) const;
Node *CreateFuseElewiseAddActNode(Graph *g, const Node *op_1,
const Node *op_2,
const std::string &ele_x_n,
const std::string &ele_y_n,
const std::string &ele_out_n,
const std::string &act_out_n) const;
};
} // namespace ir
} // namespace framework
} // namespace paddle
@@ -20,10 +20,10 @@
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_traits.h"
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/string/pretty_log.h"
#include "paddle/fluid/string/printf.h"
namespace paddle {
namespace framework {
namespace ir {
@@ -34,7 +34,7 @@ using string::Style;
size_t PDPattern::id_ = 0UL;
PDNode *PDPattern::NewNode(const std::string &name) {
if (!name.empty()) {
PADDLE_ENFORCE_EQ(node_map_.count(name), 0,
"PDNode's name should be unique, get duplicate [%s]",
@@ -42,12 +42,12 @@ PDNode* PDPattern::NewNode(const std::string& name) {
}
nodes_.emplace_back(new PDNode(this, name));
auto *cur = nodes_.back().get();
node_map_[name] = cur;
return cur;
}
PDNode *PDPattern::NewNode(PDNode::teller_t &&teller, const std::string &name) {
if (!name.empty()) {
PADDLE_ENFORCE_EQ(node_map_.count(name), 0,
"PDNode's name should be unique, get duplicate [%s]",
@@ -55,12 +55,12 @@ PDNode* PDPattern::NewNode(PDNode::teller_t&& teller, const std::string& name) {
}
nodes_.emplace_back(new PDNode(std::move(teller), this, name));
auto *cur = nodes_.back().get();
node_map_[name] = cur;
return cur;
}
PDNode *PDPattern::RetrieveNode(const std::string &id) const {
auto it = node_map_.find(id);
if (it == node_map_.end()) {
return nullptr;
@@ -69,14 +69,14 @@ PDNode* PDPattern::RetrieveNode(const std::string& id) const {
return it->second;
}
void PDPattern::AddEdge(PDNode *a, PDNode *b) {
PADDLE_ENFORCE(a);
PADDLE_ENFORCE(b);
PADDLE_ENFORCE(a != b, "can't connect to the same nodes.");
edges_.emplace_back(a, b);
}
void GraphPatternDetector::operator()(Graph *graph,
GraphPatternDetector::handle_t handler) {
if (!MarkPDNodesInGraph(*graph)) {
return;
@@ -90,18 +90,18 @@ void GraphPatternDetector::operator()(Graph* graph,
if (subgraphs.empty()) return;
PrettyLogEndl(Style::detail(), "--- detect %d subgraphs", subgraphs.size());
int id = 0;
for (auto &g : subgraphs) {
VLOG(3) << "optimizing #" << id++ << " subgraph";
handler(g, graph);
}
}
bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) {
VLOG(3) << "mark pdnodes in graph";
if (graph.Nodes().empty()) return false;
for (auto &node : GraphTraits::DFS(graph)) {
for (const auto &pdnode : pattern_.nodes()) {
if (pdnode->Tell(&node)) {
VLOG(4) << "pdnode " << pdnode->name() << " marked";
pdnodes2nodes_[pdnode.get()].insert(&node);
@@ -109,15 +109,15 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) {
}
}
// Check to early stop if some PDNode can't find matched Node.
for (auto &pdnode : pattern_.nodes()) {
if (!pdnodes2nodes_.count(pdnode.get())) {
VLOG(4) << pdnode->name() << " can't find matched Node, early stop";
// return false;
}
}
for (auto &item : pdnodes2nodes_) {
for (auto &n : item.second) {
GetMarkedNodes(const_cast<Graph *>(&graph)).insert(n);
}
}
VLOG(3) << pdnodes2nodes_.size() << " nodes marked";
@@ -128,28 +128,28 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph& graph) {
// The intermediate Nodes can only link to the nodes inside the pattern, or this
// subgraph will be droped.
void GraphPatternDetector::ValidateByNodeRole(
std::vector<GraphPatternDetector::subgraph_t> *subgraphs) {
std::vector<GraphPatternDetector::subgraph_t> result;
subgraphs->erase(
std::remove_if(
subgraphs->begin(), subgraphs->end(),
[](const GraphPatternDetector::subgraph_t &subgraph) -> bool {
// Collect the inputs and outputs.
std::unordered_set<Node *> ios;
for (auto &item : subgraph) {
if (!item.first->IsIntermediate()) {
ios.insert(item.second);
}
}
for (auto &item : subgraph) {
if (item.first->IsIntermediate()) {
for (auto *x : item.second->inputs) {
if (!ios.count(x)) {
return true;
}
}
for (auto *x : item.second->outputs) {
if (!ios.count(x)) {
return true;
}
@@ -162,9 +162,9 @@ void GraphPatternDetector::ValidateByNodeRole(
}
struct HitGroup {
std::unordered_map<PDNode *, Node *> roles;
bool Match(Node *node, PDNode *pat) {
if (nodes_.count(node)) {
if (!roles.count(pat)) return false;
return roles[pat] == node;
@@ -172,18 +172,18 @@ struct HitGroup {
return !roles.count(pat) || roles.at(pat) == node;
}
void Register(Node *node, PDNode *pat) {
roles[pat] = node;
nodes_.insert(node);
}
private:
std::unordered_set<Node *> nodes_;
};
// Tell whether Node a links to b.
bool IsNodesLink(Node *a, Node *b) {
for (auto *node : a->outputs) {
if (b == node) {
return true;
}
@@ -198,10 +198,10 @@ GraphPatternDetector::DetectPatterns() {
std::vector<HitGroup> init_groups;
std::array<std::vector<HitGroup>, 2> bi_records;
// PADDLE_ENFORCE(!pattern_.edges().empty(), "At least one edge is needed");
auto *first_pnode = pattern_.edges().empty() ? pattern().nodes().front().get()
: pattern_.edges().front().first;
if (!pdnodes2nodes_.count(first_pnode)) return result;
for (auto *node : pdnodes2nodes_[first_pnode]) {
HitGroup group;
group.roles[first_pnode] = node;
init_groups.emplace_back(group);
@@ -212,21 +212,21 @@ GraphPatternDetector::DetectPatterns() {
// Extend a PDNode to subgraphs by deducing the connection relations defined
// in edges of PDNodes.
for (const auto &edge : pattern_.edges()) {
VLOG(4) << "check " << edge.first->name() << " -> " << edge.second->name();
// TODO(Superjomn) Fix bug here, the groups might be duplicate here.
// Each role has two PDNodes, which indicates two roles.
// Detect two Nodes that can match these two roles and they are connected.
auto &pre_groups = bi_records[step % 2];
auto &cur_groups = bi_records[1 - (step++ % 2)];
cur_groups.clear();
if (pre_groups.empty()) break;
// source -> target
for (Node *source : pdnodes2nodes_[edge.first]) {
for (Node *target : pdnodes2nodes_[edge.second]) {
VLOG(8) << "check " << source->id() << " -- " << target->id();
// TODO(Superjomn) add some prune strategies.
for (const auto &group : pre_groups) {
HitGroup new_group = group;
if (IsNodesLink(source, target) &&
new_group.Match(source, edge.first)) {
@@ -241,17 +241,17 @@ GraphPatternDetector::DetectPatterns() {
}
}
VLOG(3) << "step " << step << " get records: " << cur_groups.size();
for (auto &group : cur_groups) {
for (auto &item : group.roles) {
VLOG(4) << "node " << item.second->id() << " as " << item.first->name();
}
VLOG(4) << "=========================================================";
}
}
for (auto &group : bi_records[step % 2]) {
GraphPatternDetector::subgraph_t subgraph;
for (auto &role : group.roles) {
subgraph.emplace(role.first, role.second);
}
result.emplace_back(subgraph);
@@ -260,16 +260,16 @@ GraphPatternDetector::DetectPatterns() {
}
void GraphPatternDetector::UniquePatterns(
std::vector<GraphPatternDetector::subgraph_t> *subgraphs) {
if (subgraphs->empty()) return;
std::vector<GraphPatternDetector::subgraph_t> result;
std::unordered_set<size_t> set;
for (auto &g : *subgraphs) {
size_t key = 0;
for (auto &item : g) {
key ^= std::hash<void *>{}(item.first);
key ^= std::hash<void *>{}(item.second);
}
if (!set.count(key)) {
result.emplace_back(g);
@@ -280,20 +280,20 @@ void GraphPatternDetector::UniquePatterns(
}
void GraphPatternDetector::RemoveOverlappedMatch(
std::vector<subgraph_t> *subgraphs) {
std::vector<subgraph_t> result;
std::unordered_set<Node *> node_set;
for (const auto &subgraph : *subgraphs) {
bool valid = true;
for (auto &item : subgraph) {
if (item.first->IsIntermediate() && node_set.count(item.second)) {
valid = false;
break;
}
}
if (valid) {
for (auto &item : subgraph) {
node_set.insert(item.second);
}
result.push_back(subgraph);
...@@ -307,71 +307,81 @@ std::string PDPattern::DotString() const { ...@@ -307,71 +307,81 @@ std::string PDPattern::DotString() const {
Dot dot; Dot dot;
int id = 0; int id = 0;
// Create Nodes // Create Nodes
std::unordered_map<PDNode*, std::string> node2dot; std::unordered_map<PDNode *, std::string> node2dot;
for (const auto& node : nodes()) { for (const auto &node : nodes()) {
std::string node_id = "Node" + std::to_string(id++); std::string node_id = "Node" + std::to_string(id++);
dot.AddNode(node_id, {}, node->name()); dot.AddNode(node_id, {}, node->name());
node2dot[node.get()] = node_id; node2dot[node.get()] = node_id;
} }
// Create Edges // Create Edges
for (const auto& edge : edges()) { for (const auto &edge : edges()) {
if (!node2dot.count(edge.first) || !node2dot.count(edge.second)) { if (!node2dot.count(edge.first) || !node2dot.count(edge.second)) {
LOG(ERROR) << "no node " << edge.first << " " << edge.second; LOG(ERROR) << "no node " << edge.first << " " << edge.second;
continue; continue;
} }
auto& src = node2dot.at(edge.first); auto &src = node2dot.at(edge.first);
auto& trg = node2dot.at(edge.second); auto &trg = node2dot.at(edge.second);
dot.AddEdge(src, trg, {}); dot.AddEdge(src, trg, {});
} }
return dot.Build(); return dot.Build();
} }
PDNode& PDNode::LinksTo(const std::vector<PDNode*>& others) { PDNode &PDNode::LinksTo(const std::vector<PDNode *> &others) {
// extend outlinks. // extend outlinks.
for (PDNode* x : others) { for (PDNode *x : others) {
pattern_->AddEdge(this, x); pattern_->AddEdge(this, x);
} }
return *this; return *this;
} }
PDNode& PDNode::LinksFrom(const std::vector<PDNode*>& others) { PDNode &PDNode::LinksFrom(const std::vector<PDNode *> &others) {
// extend inlinks. // extend inlinks.
for (PDNode* x : others) { for (PDNode *x : others) {
pattern_->AddEdge(x, this); pattern_->AddEdge(x, this);
} }
return *this; return *this;
} }
PDNode* PDNode::assert_is_op() { PDNode *PDNode::assert_is_op() {
asserts_.emplace_back([](Node* x) { return x && x->IsOp(); }); asserts_.emplace_back([](Node *x) { return x && x->IsOp(); });
return this; return this;
} }
PDNode* PDNode::assert_is_op(const std::string& op_type) {
asserts_.emplace_back([op_type](Node* x) { PDNode *PDNode::assert_is_op(const std::string &op_type) {
asserts_.emplace_back([op_type](Node *x) {
return x && x->IsOp() && x->Op()->Type() == op_type; return x && x->IsOp() && x->Op()->Type() == op_type;
}); });
return this; return this;
} }
PDNode* PDNode::assert_is_var() {
asserts_.emplace_back([](Node* x) { return x && x->IsVar(); }); PDNode *PDNode::assert_is_var() {
asserts_.emplace_back([](Node *x) { return x && x->IsVar(); });
return this;
}
PDNode *PDNode::assert_is_not_ctrl_var() {
asserts_.emplace_back([](Node *x) { return x && !x->IsCtrlVar(); });
return this; return this;
} }
PDNode* PDNode::assert_var_not_persistable() {
PDNode *PDNode::assert_var_not_persistable() {
assert_is_var(); assert_is_var();
asserts_.emplace_back([](Node* x) { return !x->Var()->Persistable(); }); asserts_.emplace_back([](Node *x) { return !x->Var()->Persistable(); });
return this; return this;
} }
PDNode* PDNode::assert_is_persistable_var() {
PDNode *PDNode::assert_is_persistable_var() {
assert_is_var(); assert_is_var();
asserts_.emplace_back([=](Node* x) { return x->Var()->Persistable(); }); asserts_.emplace_back([=](Node *x) { return x->Var()->Persistable(); });
return this; return this;
} }
PDNode* PDNode::assert_is_op_nth_input(const std::string& op_type,
const std::string& argument, int nth) { PDNode *PDNode::assert_is_op_nth_input(const std::string &op_type,
const std::string &argument, int nth) {
assert_is_var(); assert_is_var();
assert_is_op_input(op_type); assert_is_op_input(op_type);
asserts_.emplace_back([=](Node* x) { asserts_.emplace_back([=](Node *x) {
for (auto* op : x->outputs) { for (auto *op : x->outputs) {
if (op->IsOp() && op->Op()->Type() == op_type && if (op->IsOp() && op->Op()->Type() == op_type &&
IsNthInput(x, op, argument, nth)) IsNthInput(x, op, argument, nth))
return true; return true;
...@@ -380,11 +390,12 @@ PDNode* PDNode::assert_is_op_nth_input(const std::string& op_type, ...@@ -380,11 +390,12 @@ PDNode* PDNode::assert_is_op_nth_input(const std::string& op_type,
}); });
return this; return this;
} }
PDNode* PDNode::assert_is_op_nth_output(const std::string& op_type,
const std::string& argument, int nth) { PDNode *PDNode::assert_is_op_nth_output(const std::string &op_type,
const std::string &argument, int nth) {
assert_is_var(); assert_is_var();
asserts_.emplace_back([=](Node* x) { asserts_.emplace_back([=](Node *x) {
for (auto* op : x->inputs) { for (auto *op : x->inputs) {
if (op->IsOp() && op->Op()->Type() == op_type && if (op->IsOp() && op->Op()->Type() == op_type &&
IsNthOutput(x, op, argument, nth)) IsNthOutput(x, op, argument, nth))
return true; return true;
...@@ -393,10 +404,11 @@ PDNode* PDNode::assert_is_op_nth_output(const std::string& op_type, ...@@ -393,10 +404,11 @@ PDNode* PDNode::assert_is_op_nth_output(const std::string& op_type,
}); });
return this; return this;
} }
PDNode* PDNode::assert_is_only_input_of_op(const std::string& op_type) {
PDNode *PDNode::assert_is_only_input_of_op(const std::string &op_type) {
assert_is_var(); assert_is_var();
asserts_.emplace_back([=](Node* x) { asserts_.emplace_back([=](Node *x) {
for (auto* op : x->outputs) { for (auto *op : x->outputs) {
if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type && if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type &&
op->inputs.size() == 1) { op->inputs.size() == 1) {
return true; return true;
...@@ -406,10 +418,11 @@ PDNode* PDNode::assert_is_only_input_of_op(const std::string& op_type) { ...@@ -406,10 +418,11 @@ PDNode* PDNode::assert_is_only_input_of_op(const std::string& op_type) {
}); });
return this; return this;
} }
PDNode* PDNode::assert_is_only_output_of_op(const std::string& op_type) {
PDNode *PDNode::assert_is_only_output_of_op(const std::string &op_type) {
assert_is_var(); assert_is_var();
asserts_.emplace_back([=](Node* x) { asserts_.emplace_back([=](Node *x) {
for (auto* op : x->inputs) { for (auto *op : x->inputs) {
if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type && if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type &&
op->outputs.size() == 1) { op->outputs.size() == 1) {
return true; return true;
...@@ -419,10 +432,11 @@ PDNode* PDNode::assert_is_only_output_of_op(const std::string& op_type) { ...@@ -419,10 +432,11 @@ PDNode* PDNode::assert_is_only_output_of_op(const std::string& op_type) {
}); });
return this; return this;
} }
PDNode* PDNode::assert_is_op_output(const std::string& op_type) {
PDNode *PDNode::assert_is_op_output(const std::string &op_type) {
assert_is_var(); assert_is_var();
asserts_.emplace_back([=](Node* x) { asserts_.emplace_back([=](Node *x) {
for (auto* op : x->inputs) { for (auto *op : x->inputs) {
if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type) { if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type) {
return true; return true;
} }
...@@ -431,16 +445,17 @@ PDNode* PDNode::assert_is_op_output(const std::string& op_type) { ...@@ -431,16 +445,17 @@ PDNode* PDNode::assert_is_op_output(const std::string& op_type) {
}); });
return this; return this;
} }
PDNode* PDNode::assert_is_op_output(const std::string& op_type,
const std::string& argument) { PDNode *PDNode::assert_is_op_output(const std::string &op_type,
const std::string &argument) {
assert_is_var(); assert_is_var();
assert_is_op_nth_output(op_type, argument, 0); assert_is_op_nth_output(op_type, argument, 0);
return this; return this;
} }
PDNode* PDNode::assert_is_op_input(const std::string& op_type) { PDNode *PDNode::assert_is_op_input(const std::string &op_type) {
assert_is_var(); assert_is_var();
asserts_.emplace_back([=](Node* x) { asserts_.emplace_back([=](Node *x) {
for (auto* op : x->outputs) { for (auto *op : x->outputs) {
if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type) { if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type) {
return true; return true;
} }
...@@ -449,72 +464,161 @@ PDNode* PDNode::assert_is_op_input(const std::string& op_type) { ...@@ -449,72 +464,161 @@ PDNode* PDNode::assert_is_op_input(const std::string& op_type) {
}); });
return this; return this;
} }
PDNode* PDNode::assert_is_op_input(const std::string& op_type,
const std::string& argument) { PDNode *PDNode::assert_is_op_input(const std::string &op_type,
const std::string &argument) {
assert_is_var(); assert_is_var();
assert_is_op_nth_input(op_type, argument, 0); assert_is_op_nth_input(op_type, argument, 0);
return this; return this;
} }
PDNode* PDNode::assert_op_has_n_inputs(const std::string& op_type, size_t n) {
PDNode *PDNode::assert_op_has_n_inputs(const std::string &op_type, size_t n) {
assert_is_op(op_type); assert_is_op(op_type);
asserts_.emplace_back([=](Node* x) { return x->inputs.size() == n; }); asserts_.emplace_back([=](Node *x) { return x->inputs.size() == n; });
return this; return this;
} }
PDNode* PDNode::assert_op_has_n_outputs(const std::string& op_type, size_t n) {
PDNode *PDNode::assert_op_has_n_outputs(const std::string &op_type, size_t n) {
assert_is_op(op_type); assert_is_op(op_type);
asserts_.emplace_back([=](Node* x) { return x->outputs.size() == n; }); asserts_.emplace_back([=](Node *x) { return x->outputs.size() == n; });
return this; return this;
} }
PDNode* PDNode::assert_more(PDNode::teller_t&& teller) {
PDNode *PDNode::assert_more(PDNode::teller_t &&teller) {
asserts_.emplace_back(std::move(teller)); asserts_.emplace_back(std::move(teller));
return this; return this;
} }
bool VarLinksToOp(Node* node, const std::string& op_type) { PDNode *PDNode::assert_is_ops(const std::unordered_set<std::string> &op_types) {
for (auto* out : node->outputs) { asserts_.emplace_back([op_types](Node *x) {
return x && x->IsOp() && op_types.count(x->Op()->Type());
});
return this;
}
PDNode *PDNode::assert_is_ops_nth_input(
const std::unordered_set<std::string> &op_types,
const std::string &argument, int nth) {
assert_is_var();
assert_is_ops_input(op_types);
asserts_.emplace_back([=](Node *x) {
for (auto *op : x->outputs) {
if (op->IsOp() && op_types.count(op->Op()->Type()) &&
IsNthInput(x, op, argument, nth))
return true;
}
return false;
});
return this;
}
PDNode *PDNode::assert_is_ops_nth_output(
const std::unordered_set<std::string> &op_types,
const std::string &argument, int nth) {
assert_is_var();
asserts_.emplace_back([=](Node *x) {
for (auto *op : x->inputs) {
if (op->IsOp() && op_types.count(op->Op()->Type()) &&
IsNthOutput(x, op, argument, nth))
return true;
}
return false;
});
return this;
}
PDNode *PDNode::assert_is_ops_output(
const std::unordered_set<std::string> &op_types) {
assert_is_var();
asserts_.emplace_back([=](Node *x) {
for (auto *op : x->inputs) {
if (op && op->IsOp() && op->Op() && op_types.count(op->Op()->Type())) {
return true;
}
}
return false;
});
return this;
}
PDNode *PDNode::assert_is_ops_output(
const std::unordered_set<std::string> &op_types,
const std::string &argument) {
assert_is_var();
assert_is_ops_nth_output(op_types, argument, 0);
return this;
}
PDNode *PDNode::assert_is_ops_input(
const std::unordered_set<std::string> &op_types) {
assert_is_var();
asserts_.emplace_back([=](Node *x) {
for (auto *op : x->outputs) {
if (op && op->IsOp() && op->Op() && op_types.count(op->Op()->Type())) {
return true;
}
}
return false;
});
return this;
}
PDNode *PDNode::assert_is_ops_input(
const std::unordered_set<std::string> &op_types,
const std::string &argument) {
assert_is_var();
assert_is_ops_nth_input(op_types, argument, 0);
return this;
}
bool VarLinksToOp(Node *node, const std::string &op_type) {
for (auto *out : node->outputs) {
if (out->IsOp() && out->Op()->Type() == op_type) { if (out->IsOp() && out->Op()->Type() == op_type) {
return true; return true;
} }
} }
return false; return false;
} }
bool IsNthInput(Node* var, Node* op, const std::string& argument, size_t nth) {
bool IsNthInput(Node *var, Node *op, const std::string &argument, size_t nth) {
PADDLE_ENFORCE(var->IsVar()); PADDLE_ENFORCE(var->IsVar());
PADDLE_ENFORCE(op->IsOp()); PADDLE_ENFORCE(op->IsOp());
if (op->Op()->Input(argument).size() <= nth) return false; if (op->Op()->Input(argument).size() <= nth) return false;
return var->Name() == op->Op()->Input(argument)[nth]; return var->Name() == op->Op()->Input(argument)[nth];
} }
bool IsNthOutput(Node* var, Node* op, const std::string& argument, size_t nth) {
bool IsNthOutput(Node *var, Node *op, const std::string &argument, size_t nth) {
PADDLE_ENFORCE(var->IsVar()); PADDLE_ENFORCE(var->IsVar());
PADDLE_ENFORCE(op->IsOp()); PADDLE_ENFORCE(op->IsOp());
if (op->Op()->Output(argument).size() <= nth) return false; if (op->Op()->Output(argument).size() <= nth) return false;
return var->Name() == op->Op()->Output(argument)[nth]; return var->Name() == op->Op()->Output(argument)[nth];
} }
void GraphSafeRemoveNodes(Graph* graph,
const std::unordered_set<const Node*>& nodes) { void GraphSafeRemoveNodes(Graph *graph,
for (auto* node : nodes) { const std::unordered_set<const Node *> &nodes) {
graph->RemoveNode(const_cast<Node*>(node)); for (auto *node : nodes) {
graph->RemoveNode(const_cast<Node *>(node));
} }
for (auto* node : graph->Nodes()) { for (auto *node : graph->Nodes()) {
for (auto it = node->inputs.begin(); it != node->inputs.end();) { for (auto it = node->inputs.begin(); it != node->inputs.end();) {
if (nodes.count(*it)) { if (nodes.count(*it)) {
it = const_cast<Node*>(node)->inputs.erase(it); it = const_cast<Node *>(node)->inputs.erase(it);
} else { } else {
it++; it++;
} }
} }
for (auto it = node->outputs.begin(); it != node->outputs.end();) { for (auto it = node->outputs.begin(); it != node->outputs.end();) {
if (nodes.count(*it)) { if (nodes.count(*it)) {
it = const_cast<Node*>(node)->outputs.erase(it); it = const_cast<Node *>(node)->outputs.erase(it);
} else { } else {
it++; it++;
} }
} }
} }
} }
bool VarLinksFromOp(Node* node, const std::string& op_type) {
for (auto* out : node->inputs) { bool VarLinksFromOp(Node *node, const std::string &op_type) {
for (auto *out : node->inputs) {
if (out->IsOp() && out->Op()->Type() == op_type) { if (out->IsOp() && out->Op()->Type() == op_type) {
return true; return true;
} }
...@@ -522,30 +626,30 @@ bool VarLinksFromOp(Node* node, const std::string& op_type) { ...@@ -522,30 +626,30 @@ bool VarLinksFromOp(Node* node, const std::string& op_type) {
return false; return false;
} }
PDNode* patterns::ConvReLU::operator()( PDNode *patterns::ConvReLU::operator()(
paddle::framework::ir::PDNode* conv_input) { paddle::framework::ir::PDNode *conv_input) {
// Create Operators // Create Operators
conv_input->assert_is_op_input("conv2d", "Input"); conv_input->assert_is_op_input("conv2d", "Input");
auto* conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d"); auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d");
auto* relu_op = pattern->NewNode(relu_repr())->assert_is_op("relu"); auto *relu_op = pattern->NewNode(relu_repr())->assert_is_op("relu");
// Create variables // Create variables
// Filter // Filter
auto* conv_weight_var = pattern->NewNode(conv_weight_repr()) auto *conv_weight_var = pattern->NewNode(conv_weight_repr())
->AsInput() ->AsInput()
->assert_is_persistable_var() ->assert_is_persistable_var()
->assert_is_op_input("conv2d", "Filter"); ->assert_is_op_input("conv2d", "Filter");
// Bias // Bias
auto* conv_bias_var = pattern->NewNode(conv_bias_repr()) auto *conv_bias_var = pattern->NewNode(conv_bias_repr())
->AsInput() ->AsInput()
->assert_is_persistable_var() ->assert_is_persistable_var()
->assert_is_op_input("conv2d", "Bias"); ->assert_is_op_input("conv2d", "Bias");
// intermediate variable, will be removed in the IR after fuse. // intermediate variable, will be removed in the IR after fuse.
auto* conv_out_var = pattern->NewNode(conv_out_repr()) auto *conv_out_var = pattern->NewNode(conv_out_repr())
->AsIntermediate() ->AsIntermediate()
->assert_is_only_output_of_op("conv2d") ->assert_is_only_output_of_op("conv2d")
->assert_is_op_input("relu"); ->assert_is_op_input("relu");
// output // output
auto* relu_out_var = pattern->NewNode(relu_out_repr()) auto *relu_out_var = pattern->NewNode(relu_out_repr())
->AsOutput() ->AsOutput()
->assert_is_op_output("relu"); ->assert_is_op_output("relu");
...@@ -555,18 +659,18 @@ PDNode* patterns::ConvReLU::operator()( ...@@ -555,18 +659,18 @@ PDNode* patterns::ConvReLU::operator()(
return relu_out_var; return relu_out_var;
} }
PDNode* patterns::FC::operator()(paddle::framework::ir::PDNode* x, PDNode *patterns::FC::operator()(paddle::framework::ir::PDNode *x,
bool with_bias) { bool with_bias) {
// Create shared nodes. // Create shared nodes.
x->assert_is_op_input("mul", "X"); x->assert_is_op_input("mul", "X");
auto* mul = pattern->NewNode(mul_repr())->assert_is_op("mul"); auto *mul = pattern->NewNode(mul_repr())->assert_is_op("mul");
auto* mul_w_var = pattern->NewNode(w_repr()) auto *mul_w_var = pattern->NewNode(w_repr())
->AsInput() ->AsInput()
->assert_is_persistable_var() ->assert_is_persistable_var()
->assert_is_op_input("mul", "Y"); ->assert_is_op_input("mul", "Y");
auto* mul_out_var = auto *mul_out_var =
pattern->NewNode(mul_out_repr())->assert_is_op_output("mul"); pattern->NewNode(mul_out_repr())->assert_is_op_output("mul");
if (!with_bias) { // not with bias if (!with_bias) { // not with bias
...@@ -577,14 +681,14 @@ PDNode* patterns::FC::operator()(paddle::framework::ir::PDNode* x, ...@@ -577,14 +681,14 @@ PDNode* patterns::FC::operator()(paddle::framework::ir::PDNode* x,
} else { // with bias } else { // with bias
mul_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); mul_out_var->AsIntermediate()->assert_is_op_input("elementwise_add");
// Create operators. // Create operators.
auto* elementwise_add = pattern->NewNode(elementwise_add_repr()) auto *elementwise_add = pattern->NewNode(elementwise_add_repr())
->assert_is_op("elementwise_add"); ->assert_is_op("elementwise_add");
// Create variables. // Create variables.
auto* bias = pattern->NewNode(bias_repr()) auto *bias = pattern->NewNode(bias_repr())
->assert_is_op_input("elementwise_add") ->assert_is_op_input("elementwise_add")
->AsInput(); ->AsInput();
auto* fc_out = pattern->NewNode(Out_repr()) auto *fc_out = pattern->NewNode(Out_repr())
->AsOutput() ->AsOutput()
->assert_is_op_output("elementwise_add"); ->assert_is_op_output("elementwise_add");
...@@ -594,11 +698,11 @@ PDNode* patterns::FC::operator()(paddle::framework::ir::PDNode* x, ...@@ -594,11 +698,11 @@ PDNode* patterns::FC::operator()(paddle::framework::ir::PDNode* x,
} }
} }
PDNode* patterns::LSTM::operator()(PDNode* x) { PDNode *patterns::LSTM::operator()(PDNode *x) {
x->assert_is_op_input("lstm", "Input"); x->assert_is_op_input("lstm", "Input");
auto* lstm_op = pattern->NewNode(lstm_repr())->assert_is_op("lstm"); auto *lstm_op = pattern->NewNode(lstm_repr())->assert_is_op("lstm");
#define NEW_NODE(arg__, io__) \ #define NEW_NODE(arg__, io__) \
auto* arg__ = \ auto *arg__ = \
pattern->NewNode(arg__##_repr())->assert_is_op_##io__("lstm", #arg__); pattern->NewNode(arg__##_repr())->assert_is_op_##io__("lstm", #arg__);
// Currently, the H0 and C0 are optional // Currently, the H0 and C0 are optional
...@@ -619,11 +723,11 @@ PDNode* patterns::LSTM::operator()(PDNode* x) { ...@@ -619,11 +723,11 @@ PDNode* patterns::LSTM::operator()(PDNode* x) {
return Hidden; return Hidden;
} }
PDNode* patterns::GRU::operator()(PDNode* x) { PDNode *patterns::GRU::operator()(PDNode *x) {
x->assert_is_op_input("gru", "Input"); x->assert_is_op_input("gru", "Input");
auto* gru_op = pattern->NewNode(gru_repr())->assert_is_op("gru"); auto *gru_op = pattern->NewNode(gru_repr())->assert_is_op("gru");
#define NEW_NODE(arg__, io__) \ #define NEW_NODE(arg__, io__) \
auto* arg__ = \ auto *arg__ = \
pattern->NewNode(arg__##_repr())->assert_is_op_##io__("gru", #arg__); pattern->NewNode(arg__##_repr())->assert_is_op_##io__("gru", #arg__);
NEW_NODE(Weight, input); NEW_NODE(Weight, input);
...@@ -648,6 +752,100 @@ PDNode* patterns::GRU::operator()(PDNode* x) { ...@@ -648,6 +752,100 @@ PDNode* patterns::GRU::operator()(PDNode* x) {
return Hidden; return Hidden;
} }
PDNode *patterns::ActElewiseAdd::operator()(
paddle::framework::ir::PDNode *in_var,
std::unordered_set<std::string> act_types) {
in_var->assert_is_ops_input(act_types, "X");
auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types);
auto *act_out_var = pattern->NewNode(act_out_repr())
->assert_is_not_ctrl_var()
->assert_is_ops_output(act_types);
act_out_var->AsIntermediate()->assert_is_op_input("elementwise_add");
auto *ele_x_var = pattern->NewNode(ele_x_repr())
->assert_is_not_ctrl_var()
->assert_is_op_input("elementwise_add")
->AsInput();
auto *elementwise_add =
pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add");
auto *elewise_add_out = pattern->NewNode(elewise_add_out_repr())
->AsOutput()
->assert_is_op_output("elementwise_add", "Out");
act->LinksFrom({in_var}).LinksTo({act_out_var});
elementwise_add->LinksFrom({act_out_var, ele_x_var})
.LinksTo({elewise_add_out});
return elewise_add_out;
}
PDNode *patterns::ElewiseAddAct::operator()(
paddle::framework::ir::PDNode *ele_x_var,
std::unordered_set<std::string> act_types) {
auto *ele_y_var = pattern->NewNode(ele_y_repr())
->assert_is_op_input("elementwise_add", "Y");
auto *ele_add =
pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add");
auto *ele_out_var = pattern->NewNode(elewise_add_out_repr())
->assert_is_op_output("elementwise_add", "Out");
ele_out_var->AsIntermediate()->assert_is_ops_input(act_types);
auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types);
auto *act_out_var =
pattern->NewNode(act_out_repr())->assert_is_ops_output(act_types, "Out");
ele_add->LinksFrom({ele_x_var, ele_y_var}).LinksTo({ele_out_var});
act->LinksFrom({ele_out_var}).LinksTo({act_out_var});
return act_out_var;
}
PDNode *patterns::ElewiseAddActInplaceGrad::operator()(
paddle::framework::ir::PDNode *d_act_out_var,
std::unordered_set<std::string> act_types) {
// act_grad: in["Out", "Out@GRAD"], out["X@GRAD"]
// ele_add_grad: in["Y", "Out@GRAD"], out["X@GRAD", "Y@GRAD"]
auto *act_grad = pattern->NewNode(act_grad_repr())->assert_is_ops(act_types);
auto *act_out_var =
pattern->NewNode(act_out_repr())->assert_is_ops_input(act_types, "Out");
auto *d_intermediate_var =
pattern->NewNode(d_itermediate_out_repr())
->assert_is_ops_output(act_types, GradVarName("X"));
act_grad->LinksFrom({d_act_out_var, act_out_var})
.LinksTo({d_intermediate_var});
auto *ele_y_var = pattern->NewNode(ele_y_repr())
->assert_is_not_ctrl_var()
->assert_is_op_input("elementwise_add_grad", "Y");
auto *ele_add_grad = pattern->NewNode(ele_add_grad_repr())
->assert_is_op("elementwise_add_grad");
auto *d_ele_x_var =
pattern->NewNode(d_ele_x_repr())
->assert_is_not_ctrl_var()
->assert_is_op_output("elementwise_add_grad", GradVarName("X"));
auto *d_ele_y_var =
pattern->NewNode(d_ele_y_repr())
->assert_is_not_ctrl_var()
->assert_is_op_output("elementwise_add_grad", GradVarName("Y"));
ele_add_grad->LinksFrom({d_intermediate_var, ele_y_var})
.LinksTo({d_ele_x_var, d_ele_y_var});
return ele_add_grad;
}
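
A rough sketch (also not part of the patch) of how a fuse pass might drive one of these new patterns; it assumes the detector exposes a mutable_pattern() accessor and an operator()(Graph*, handler) entry point, and the handler body is only outlined:

GraphPatternDetector gpd;
auto *pattern = gpd.mutable_pattern();
// The free input of the fused subgraph.
auto *x = pattern->NewNode("x")->AsInput()->assert_is_op_input(
    "elementwise_add", "X");
patterns::ElewiseAddAct elewise_add_act(pattern, "elewise_add_act_fuse");
elewise_add_act(x, {"relu"});
gpd(graph, [&](const GraphPatternDetector::subgraph_t &subgraph, Graph *g) {
  // Look up the matched nodes, insert the fused op, then drop the
  // intermediate nodes with GraphSafeRemoveNodes().
});
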
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -95,6 +95,7 @@ struct PDNode { ...@@ -95,6 +95,7 @@ struct PDNode {
PDNode* assert_is_op(); PDNode* assert_is_op();
PDNode* assert_is_op(const std::string& op_type); PDNode* assert_is_op(const std::string& op_type);
PDNode* assert_is_var(); PDNode* assert_is_var();
PDNode* assert_is_not_ctrl_var();
PDNode* assert_var_not_persistable(); PDNode* assert_var_not_persistable();
PDNode* assert_is_persistable_var(); PDNode* assert_is_persistable_var();
PDNode* assert_is_op_output(const std::string& op_type); PDNode* assert_is_op_output(const std::string& op_type);
...@@ -113,6 +114,20 @@ struct PDNode { ...@@ -113,6 +114,20 @@ struct PDNode {
PDNode* assert_op_has_n_outputs(const std::string& op_type, size_t n); PDNode* assert_op_has_n_outputs(const std::string& op_type, size_t n);
PDNode* assert_more(teller_t&& teller); PDNode* assert_more(teller_t&& teller);
PDNode* assert_is_ops_output(const std::unordered_set<std::string>& op_types);
PDNode* assert_is_ops(const std::unordered_set<std::string>& op_types);
PDNode* assert_is_ops_output(const std::unordered_set<std::string>& op_types,
const std::string& argument);
PDNode* assert_is_ops_nth_input(
const std::unordered_set<std::string>& op_types,
const std::string& argument, int nth);
PDNode* assert_is_ops_input(const std::unordered_set<std::string>& op_types);
PDNode* assert_is_ops_input(const std::unordered_set<std::string>& op_types,
const std::string& argument);
PDNode* assert_is_ops_nth_output(
const std::unordered_set<std::string>& op_types,
const std::string& argument, int nth);
private: private:
PDNode(PDPattern* pattern, const std::string& name = "", PDNode(PDPattern* pattern, const std::string& name = "",
Type type = Type::kVar) Type type = Type::kVar)
...@@ -447,6 +462,68 @@ struct GRU : public PatternBase { ...@@ -447,6 +462,68 @@ struct GRU : public PatternBase {
PATTERN_DECL_NODE(Hidden); PATTERN_DECL_NODE(Hidden);
}; };
// The following patterns are used to fuse elewise_add and act
// formula: act(ele_add(x, y))
// op: elementwise_add + act
// named nodes: elementwise_add, act
// ele_x, ele_y, elewise_add_out, act_out
struct ElewiseAddAct : public PatternBase {
ElewiseAddAct(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "elewise_add_act") {}
PDNode* operator()(PDNode* x, std::unordered_set<std::string> acts);
// declare operator node's name
PATTERN_DECL_NODE(ele_add);
PATTERN_DECL_NODE(act);
// declare variable node's name
PATTERN_DECL_NODE(elewise_add_out);
PATTERN_DECL_NODE(ele_y);
PATTERN_DECL_NODE(act_out);
};
// formula: ele_add(x, act(y))
// op: elementwise_add + act
// named nodes: elementwise_add, act
// act_in, act_out, ele_x, elewise_add_out
struct ActElewiseAdd : public PatternBase {
ActElewiseAdd(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "act_elewise_add") {}
PDNode* operator()(PDNode* x, std::unordered_set<std::string> acts);
// declare operator node's name
PATTERN_DECL_NODE(act);
PATTERN_DECL_NODE(ele_add);
// declare variable node's name
PATTERN_DECL_NODE(act_out);
PATTERN_DECL_NODE(ele_x);
PATTERN_DECL_NODE(elewise_add_out);
};
// the backward of act(ele_add(x, y))
// the act is inplace.
// op: elementwise_add_grad + act_grad
// named nodes: elementwise_add_grad, act_grad
// act_out, act_out_g, ele_y, d_itermediate_out, d_ele_x, d_ele_y
struct ElewiseAddActInplaceGrad : public PatternBase {
ElewiseAddActInplaceGrad(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "elewise_add_act_grad1") {}
// act_grad: in["Out", "Out@GRAD"], out["X@GRAD"]
// ele_add_grad: in["Y", "Out@GRAD"], out["X@GRAD", "Y@GRAD"]
PDNode* operator()(PDNode* x, std::unordered_set<std::string> acts);
// declare operator node's name
PATTERN_DECL_NODE(act_grad);
PATTERN_DECL_NODE(ele_add_grad);
// declare variable node's name
PATTERN_DECL_NODE(act_out);
PATTERN_DECL_NODE(d_itermediate_out);
PATTERN_DECL_NODE(d_ele_x);
PATTERN_DECL_NODE(d_ele_y);
PATTERN_DECL_NODE(ele_y);
};
} // namespace patterns } // namespace patterns
// Link two ir::Nodes from each other. // Link two ir::Nodes from each other.
...@@ -454,6 +531,12 @@ struct GRU : public PatternBase { ...@@ -454,6 +531,12 @@ struct GRU : public PatternBase {
a->outputs.push_back(b); \ a->outputs.push_back(b); \
b->inputs.push_back(a); b->inputs.push_back(a);
// Set the out_var as the output of the op
#define IR_OP_VAR_LINK(op, out_var) \
op->outputs.push_back(out_var); \
out_var->inputs.clear(); \
out_var->inputs.push_back(op);
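
The new macro intentionally clears out_var's previous producers before linking, so the freshly created fused op becomes the variable's only input. A hypothetical use when stitching a fused op into the graph (node names are placeholders; the input-side macro is the existing node-link helper defined just above, IR_NODE_LINK_TO in the full header):

IR_NODE_LINK_TO(x_var, fused_op);   // data input -> fused op
IR_OP_VAR_LINK(fused_op, out_var);  // fused op becomes out_var's sole producer
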
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -48,6 +48,10 @@ class Node { ...@@ -48,6 +48,10 @@ class Node {
bool IsOp() const { return type_ == Type::kOperation; } bool IsOp() const { return type_ == Type::kOperation; }
bool IsVar() const { return type_ == Type::kVariable; } bool IsVar() const { return type_ == Type::kVariable; }
bool IsCtrlVar() const {
return type_ == Type::kVariable &&
Name().find(ir::Node::kControlDepVarName) != std::string::npos;
}
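
IsCtrlVar() spots the control-dependency variables that dependency passes insert, by looking for the kControlDepVarName marker in the variable name, so passes and patterns can restrict themselves to real data variables. A small illustrative filter:

for (Node *n : graph->Nodes()) {
  if (n->IsVar() && !n->IsCtrlVar()) {
    // only genuine data variables are handled here
  }
}
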
std::vector<Node*> inputs; std::vector<Node*> inputs;
std::vector<Node*> outputs; std::vector<Node*> outputs;
......
...@@ -17,12 +17,10 @@ ...@@ -17,12 +17,10 @@
#include <algorithm> #include <algorithm>
#include <initializer_list> #include <initializer_list>
#include <memory> #include <memory>
#include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/cow_ptr.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/memory/memcpy.h"
#include "glog/logging.h" #include "glog/logging.h"
...@@ -30,401 +28,206 @@ namespace paddle { ...@@ -30,401 +28,206 @@ namespace paddle {
namespace framework { namespace framework {
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
namespace details {
struct CUDABuffer {
void *data_{nullptr};
size_t size_{0};
platform::CUDAPlace place_;
CUDABuffer() {}
CUDABuffer(platform::Place place, size_t size)
: size_(size), place_(boost::get<platform::CUDAPlace>(place)) {
data_ = memory::Alloc(place_, size);
}
~CUDABuffer() { ClearMemory(); }
CUDABuffer(const CUDABuffer &o) = delete;
CUDABuffer &operator=(const CUDABuffer &o) = delete;
void Resize(platform::Place place, size_t size) {
ClearMemory();
place_ = boost::get<platform::CUDAPlace>(place);
data_ = memory::Alloc(place_, size);
size_ = size;
}
void Swap(CUDABuffer &o) {
std::swap(data_, o.data_);
std::swap(place_, o.place_);
std::swap(size_, o.size_);
}
private:
void ClearMemory() const {
if (data_) {
memory::Free(place_, data_);
}
}
};
} // namespace details
// Vector<T> implements the std::vector interface, and can get Data or // Vector<T> implements the std::vector interface, and can get Data or
// MutableData from any place. The data will be synced implicitly inside. // MutableData from any place. The data will be synced implicitly inside.
template <typename T> template <typename T>
class Vector { class Vector {
public: public:
using value_type = T; using value_type = T;
using iterator = typename std::vector<T>::iterator;
using const_iterator = typename std::vector<T>::const_iterator;
private:
// The actual class to implement vector logic
class VectorData {
public:
VectorData() : flag_(kDataInCPU) {}
VectorData(size_t count, const T &value)
: cpu_(count, value), flag_(kDataInCPU) {}
VectorData(std::initializer_list<T> init) : cpu_(init), flag_(kDataInCPU) {}
template <typename U>
explicit VectorData(const std::vector<U> &dat)
: cpu_(dat), flag_(kDataInCPU) {}
VectorData(const VectorData &o) {
o.ImmutableCPU();
cpu_ = o.cpu_;
flag_ = kDataInCPU;
}
VectorData &operator=(const VectorData &o) {
o.ImmutableCPU();
cpu_ = o.cpu_;
flag_ = kDataInCPU;
details::CUDABuffer null;
gpu_.Swap(null);
return *this;
}
T &operator[](size_t i) {
MutableCPU();
return cpu_[i];
}
const T &operator[](size_t i) const {
ImmutableCPU();
return cpu_[i];
}
size_t size() const { return cpu_.size(); }
iterator begin() {
MutableCPU();
return cpu_.begin();
}
iterator end() {
MutableCPU();
return cpu_.end();
}
T &front() {
MutableCPU();
return cpu_.front();
}
T &back() {
MutableCPU();
return cpu_.back();
}
const_iterator begin() const {
ImmutableCPU();
return cpu_.begin();
}
const_iterator end() const {
ImmutableCPU();
return cpu_.end();
}
const T &back() const {
ImmutableCPU();
return cpu_.back();
}
T *data() { return &(*this)[0]; }
const T *data() const { return &(*this)[0]; }
const T &front() const {
ImmutableCPU();
return cpu_.front();
}
// assign this from iterator.
// NOTE: the iterator must support `end-begin`
template <typename Iter>
void assign(Iter begin, Iter end) {
MutableCPU();
cpu_.assign(begin, end);
}
// push_back. If the previous capacity is not enough, the memory will
// double.
void push_back(T elem) {
MutableCPU();
cpu_.push_back(elem);
}
// extend a vector by iterator.
// NOTE: the iterator must support end-begin
template <typename It>
void Extend(It begin, It end) {
MutableCPU();
auto out_it = std::back_inserter<std::vector<T>>(this->cpu_);
std::copy(begin, end, out_it);
}
// resize the vector
void resize(size_t size) {
MutableCPU();
cpu_.resize(size);
}
// get cuda ptr. immutable
const T *CUDAData(platform::Place place) const {
PADDLE_ENFORCE(platform::is_gpu_place(place),
"CUDA Data must on CUDA place");
ImmutableCUDA(place);
return reinterpret_cast<T *>(gpu_.data_);
}
// get cuda ptr. mutable
T *CUDAMutableData(platform::Place place) {
const T *ptr = CUDAData(place);
flag_ = kDirty | kDataInCUDA;
return const_cast<T *>(ptr);
}
// clear
void clear() {
cpu_.clear();
flag_ = kDirty | kDataInCPU;
}
size_t capacity() const { return cpu_.capacity(); }
// reserve data
void reserve(size_t size) { cpu_.reserve(size); }
// implicit cast operator. Vector can be cast to std::vector implicitly.
operator std::vector<T>() const {
ImmutableCPU();
return cpu_;
}
bool operator==(const VectorData &other) const {
ImmutableCPU();
other.ImmutableCPU();
return cpu_ == other.cpu_;
}
private:
enum DataFlag {
kDataInCPU = 0x01,
kDataInCUDA = 0x02,
// kDirty means the data has been changed in one device.
kDirty = 0x10
};
void CopyToCPU() const {
// COPY GPU Data To CPU
void *src = gpu_.data_;
void *dst = cpu_.data();
memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_,
nullptr);
}
void MutableCPU() {
if (IsInCUDA() && IsDirty()) {
CopyToCPU();
}
flag_ = kDirty | kDataInCPU;
}
void ImmutableCUDA(platform::Place place) const {
if (IsDirty()) {
if (IsInCPU()) {
CopyCPUDataToCUDA(place);
UnsetFlag(kDirty);
SetFlag(kDataInCUDA);
} else if (IsInCUDA() &&
!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
CopyCUDADataToAnotherPlace(place);
// Still dirty
} else {
// Dirty && DataInCUDA && Device is same
// Do nothing
}
} else {
if (!IsInCUDA()) {
// Even data is not dirty. However, data is not in CUDA. Copy data.
CopyCPUDataToCUDA(place);
SetFlag(kDataInCUDA);
} else if (!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
CopyCUDADataToAnotherPlace(place);
} else {
// Not Dirty && DataInCUDA && Device is same
// Do nothing.
}
}
}
void CopyCUDADataToAnotherPlace(const platform::Place &place) const {
details::CUDABuffer tmp(place, gpu_.size_);
const void *src = gpu_.data_;
void *dst = tmp.data_;
memory::Copy(tmp.place_, dst, gpu_.place_, src, gpu_.size_, nullptr);
gpu_.Swap(tmp);
}
void CopyCPUDataToCUDA(const platform::Place &place) const {
void *src = cpu_.data();
gpu_.Resize(place, cpu_.size() * sizeof(T));
void *dst = gpu_.data_;
auto stream = static_cast<platform::CUDADeviceContext *>(
platform::DeviceContextPool::Instance().Get(place))
->stream();
memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_,
stream);
}
void ImmutableCPU() const {
if (IsDirty() && !IsInCPU()) { // If data has been changed in CUDA, or
// CPU has no data.
CopyToCPU();
UnsetFlag(kDirty);
}
SetFlag(kDataInCPU);
}
void UnsetFlag(int flag) const { flag_ &= ~flag; }
void SetFlag(int flag) const { flag_ |= flag; }
bool IsDirty() const { return flag_ & kDirty; }
bool IsInCUDA() const { return flag_ & kDataInCUDA; }
bool IsInCPU() const { return flag_ & kDataInCPU; }
mutable std::vector<T> cpu_;
mutable details::CUDABuffer gpu_;
mutable int flag_;
};
public:
// Default ctor. Create empty Vector // Default ctor. Create empty Vector
Vector() : m_(new VectorData()) {} Vector() { InitEmpty(); }
// Fill vector with value. The vector size is `count`. // Fill vector with value. The vector size is `count`.
explicit Vector(size_t count, const T &value = T()) explicit Vector(size_t count, const T &value = T()) {
: m_(new VectorData(count, value)) {} InitEmpty();
if (count != 0) {
resize(count);
T *ptr = begin();
for (size_t i = 0; i < count; ++i) {
ptr[i] = value;
}
}
}
// Ctor with init_list // Ctor with init_list
Vector(std::initializer_list<T> init) : m_(new VectorData(init)) {} Vector(std::initializer_list<T> init) {
if (init.size() == 0) {
InitEmpty();
} else {
InitByIter(init.size(), init.begin(), init.end());
}
}
// implicit cast from std::vector. // implicit cast from std::vector.
template <typename U> template <typename U>
Vector(const std::vector<U> &dat) : m_(new VectorData(dat)) { // NOLINT Vector(const std::vector<U> &dat) { // NOLINT
if (dat.size() == 0) {
InitEmpty();
} else {
InitByIter(dat.size(), dat.begin(), dat.end());
}
} }
// Copy ctor // Copy ctor
Vector(const Vector<T> &other) { m_ = other.m_; } Vector(const Vector<T> &other) { this->operator=(other); }
// Copy operator // Copy operator
Vector<T> &operator=(const Vector<T> &other) { Vector<T> &operator=(const Vector<T> &other) {
m_ = other.m_; if (other.size() != 0) {
this->InitByIter(other.size(), other.begin(), other.end());
} else {
InitEmpty();
}
return *this; return *this;
} }
// Move ctor // Move ctor
Vector(Vector<T> &&other) { m_ = std::move(other.m_); } Vector(Vector<T> &&other) {
this->size_ = other.size_;
this->flag_ = other.flag_;
if (other.cuda_vec_.memory_size()) {
this->cuda_vec_.ShareDataWith(other.cuda_vec_);
}
if (other.cpu_vec_.memory_size()) {
this->cpu_vec_.ShareDataWith(other.cpu_vec_);
}
}
// CPU data access method. Mutable. // CPU data access method. Mutable.
T &operator[](size_t i) { return (*m_)[i]; } T &operator[](size_t i) {
MutableCPU();
return const_cast<T *>(cpu_vec_.data<T>())[i];
}
// CPU data access method. Immutable. // CPU data access method. Immutable.
const T &operator[](size_t i) const { return (*m_)[i]; } const T &operator[](size_t i) const {
ImmutableCPU();
return cpu_vec_.data<T>()[i];
}
// std::vector iterator methods. Based on CPU data access method // std::vector iterator methods. Based on CPU data access method
size_t size() const { return m_->size(); } size_t size() const { return size_; }
iterator begin() { return m_->begin(); } T *begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }
iterator end() { return m_->end(); } T *end() {
return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
}
T &front() { return m_->front(); } T &front() { return *begin(); }
T &back() { return m_->back(); } T &back() {
auto it = end();
--it;
return *it;
}
const_iterator begin() const { return m_->begin(); } const T *begin() const {
return capacity() == 0 ? &EmptyDummy() : &this->operator[](0);
}
const_iterator end() const { return m_->end(); } const T *end() const {
return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
}
const_iterator cbegin() const { return begin(); } const T *cbegin() const { return begin(); }
const_iterator cend() const { return end(); } const T *cend() const { return end(); }
const T &back() const { return m_->back(); } const T &back() const {
auto it = end();
--it;
return *it;
}
T *data() { return m_->data(); } T *data() { return begin(); }
const T *data() const { return m_->data(); } const T *data() const { return begin(); }
const T &front() const { return m_->front(); } const T &front() const { return *begin(); }
// end of std::vector iterator methods // end of std::vector iterator methods
// assign this from iterator. // assign this from iterator.
// NOTE: the iterator must support `end-begin` // NOTE: the iterator must support `end-begin`
template <typename Iter> template <typename Iter>
void assign(Iter begin, Iter end) { void assign(Iter begin, Iter end) {
m_->assign(begin, end); InitByIter(end - begin, begin, end);
} }
// push_back. If the previous capacity is not enough, the memory will // push_back. If the previous capacity is not enough, the memory will
// double. // double.
void push_back(T elem) { m_->push_back(elem); } void push_back(T elem) {
if (size_ + 1 > capacity()) {
reserve((size_ + 1) << 1);
}
*end() = elem;
++size_;
}
// extend a vector by iterator. // extend a vector by iterator.
// NOTE: the iterator must support end-begin // NOTE: the iterator must support end-begin
template <typename It> template <typename It>
void Extend(It begin, It end) { void Extend(It begin, It end) {
m_->Extend(begin, end); size_t pre_size = size_;
resize(pre_size + (end - begin));
T *ptr = this->begin() + pre_size;
for (; begin < end; ++begin, ++ptr) {
*ptr = *begin;
}
} }
// resize the vector // resize the vector
void resize(size_t size) { void resize(size_t size) {
if (m_.Data().size() != size) { if (size + 1 <= capacity()) {
m_->resize(size); size_ = size;
} else {
MutableCPU();
Tensor cpu_tensor;
platform::Place cpu = platform::CPUPlace();
T *ptr = cpu_tensor.mutable_data<T>(
framework::make_ddim({static_cast<int64_t>(size)}), cpu);
const T *old_ptr =
cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data<T>();
if (old_ptr != nullptr) {
std::copy(old_ptr, old_ptr + size_, ptr);
}
size_ = size;
cpu_vec_.ShareDataWith(cpu_tensor);
} }
} }
// get cuda ptr. immutable // get cuda ptr. immutable
const T *CUDAData(platform::Place place) const { const T *CUDAData(platform::Place place) const {
return m_.Data().CUDAData(place); PADDLE_ENFORCE(platform::is_gpu_place(place),
"CUDA Data must on CUDA place");
ImmutableCUDA(place);
return cuda_vec_.data<T>();
} }
// get cuda ptr. mutable // get cuda ptr. mutable
T *CUDAMutableData(platform::Place place) { T *CUDAMutableData(platform::Place place) {
return m_->CUDAMutableData(place); const T *ptr = CUDAData(place);
flag_ = kDirty | kDataInCUDA;
return const_cast<T *>(ptr);
} }
// clear // clear
void clear() { m_->clear(); } void clear() {
size_ = 0;
flag_ = kDirty | kDataInCPU;
}
size_t capacity() const { return m_->capacity(); } size_t capacity() const {
return cpu_vec_.memory_size() / SizeOfType(typeid(T));
}
// reserve data // reserve data
void reserve(size_t size) { m_->reserve(size); } void reserve(size_t size) {
size_t pre_size = size_;
resize(size);
resize(pre_size);
}
// the unify method to access CPU or CUDA data. immutable. // the unify method to access CPU or CUDA data. immutable.
const T *Data(platform::Place place) const { const T *Data(platform::Place place) const {
...@@ -445,7 +248,12 @@ class Vector { ...@@ -445,7 +248,12 @@ class Vector {
} }
// implicit cast operator. Vector can be cast to std::vector implicitly. // implicit cast operator. Vector can be cast to std::vector implicitly.
operator std::vector<T>() const { return *m_; } operator std::vector<T>() const {
std::vector<T> result;
result.resize(size());
std::copy(begin(), end(), result.begin());
return result;
}
bool operator==(const Vector<T> &other) const { bool operator==(const Vector<T> &other) const {
if (size() != other.size()) return false; if (size() != other.size()) return false;
...@@ -459,11 +267,118 @@ class Vector { ...@@ -459,11 +267,118 @@ class Vector {
return true; return true;
} }
const void *Handle() const { return &m_.Data(); }
private: private:
// Vector is an COW object. void InitEmpty() {
details::COWPtr<VectorData> m_; size_ = 0;
flag_ = kDataInCPU;
}
template <typename Iter>
void InitByIter(size_t size, Iter begin, Iter end) {
platform::Place cpu = platform::CPUPlace();
T *ptr = this->cpu_vec_.template mutable_data<T>(
framework::make_ddim({static_cast<int64_t>(size)}), cpu);
for (size_t i = 0; i < size; ++i) {
*ptr++ = *begin++;
}
flag_ = kDataInCPU | kDirty;
size_ = size;
}
enum DataFlag {
kDataInCPU = 0x01,
kDataInCUDA = 0x02,
// kDirty means the data has been changed in one device.
kDirty = 0x10
};
void CopyToCPU() const {
// COPY GPU Data To CPU
TensorCopy(cuda_vec_, platform::CPUPlace(), &cpu_vec_);
WaitPlace(cuda_vec_.place());
}
void MutableCPU() {
if (IsInCUDA() && IsDirty()) {
CopyToCPU();
}
flag_ = kDirty | kDataInCPU;
}
void ImmutableCUDA(platform::Place place) const {
if (IsDirty()) {
if (IsInCPU()) {
TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
&cuda_vec_);
WaitPlace(place);
UnsetFlag(kDirty);
SetFlag(kDataInCUDA);
} else if (IsInCUDA() && !(place == cuda_vec_.place())) {
framework::Tensor tmp;
TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
WaitPlace(cuda_vec_.place());
cuda_vec_.ShareDataWith(tmp);
// Still dirty
} else {
// Dirty && DataInCUDA && Device is same
// Do nothing
}
} else {
if (!IsInCUDA()) {
// Even though the data is not dirty, it is not in CUDA yet. Copy it over.
TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
&cuda_vec_);
WaitPlace(place);
SetFlag(kDataInCUDA);
} else if (!(place == cuda_vec_.place())) {
framework::Tensor tmp;
WaitPlace(cuda_vec_.place());
TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
WaitPlace(cuda_vec_.place());
WaitPlace(place);
cuda_vec_.ShareDataWith(tmp);
} else {
// Not Dirty && DataInCUDA && Device is same
// Do nothing.
}
}
}
void ImmutableCPU() const {
if (IsDirty() &&
!IsInCPU()) { // If data has been changed in CUDA, or CPU has no data.
CopyToCPU();
UnsetFlag(kDirty);
}
SetFlag(kDataInCPU);
}
void UnsetFlag(int flag) const { flag_ &= ~flag; }
void SetFlag(int flag) const { flag_ |= flag; }
bool IsDirty() const { return flag_ & kDirty; }
bool IsInCUDA() const { return flag_ & kDataInCUDA; }
bool IsInCPU() const { return flag_ & kDataInCPU; }
static void WaitPlace(const platform::Place place) {
if (platform::is_gpu_place(place)) {
platform::DeviceContextPool::Instance()
.Get(boost::get<platform::CUDAPlace>(place))
->Wait();
}
}
static T &EmptyDummy() {
static T dummy = T();
return dummy;
}
mutable int flag_;
mutable Tensor cpu_vec_;
mutable Tensor cuda_vec_;
size_t size_;
}; };
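
The rewritten Vector drops the copy-on-write VectorData/CUDABuffer layers and keeps the elements directly in a CPU Tensor plus a CUDA Tensor, with flag_ recording which copy is current. A minimal usage sketch (host code; `gpu` stands in for some platform::CUDAPlace and is not a value from this patch):

framework::Vector<int64_t> lod{0, 4, 9};
lod.push_back(15);                         // grows the CPU copy and marks it dirty
const int64_t *d_lod = lod.CUDAData(gpu);  // copies to the device on demand
int64_t last = lod.back();                 // reads back through the CPU copy
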
#else // PADDLE_WITH_CUDA #else // PADDLE_WITH_CUDA
......
...@@ -120,6 +120,7 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto, ...@@ -120,6 +120,7 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
{static_cast<int>(OpRole::kForward), {static_cast<int>(OpRole::kForward),
static_cast<int>(OpRole::kBackward), static_cast<int>(OpRole::kBackward),
static_cast<int>(OpRole::kOptimize), static_cast<int>(OpRole::kRPC), static_cast<int>(OpRole::kOptimize), static_cast<int>(OpRole::kRPC),
static_cast<int>(OpRole::kDist), static_cast<int>(OpRole::kLRSched),
static_cast<int>(OpRole::kLoss) | static_cast<int>(OpRole::kForward), static_cast<int>(OpRole::kLoss) | static_cast<int>(OpRole::kForward),
static_cast<int>(OpRole::kLoss) | static_cast<int>(OpRole::kLoss) |
static_cast<int>(OpRole::kBackward), static_cast<int>(OpRole::kBackward),
......
...@@ -26,7 +26,13 @@ enum class OpRole { ...@@ -26,7 +26,13 @@ enum class OpRole {
kForward = 0x0000, kForward = 0x0000,
kBackward = 0x0001, kBackward = 0x0001,
kOptimize = 0x0002, kOptimize = 0x0002,
// RPC role is for send/recv related ops
kRPC = 0x0003, kRPC = 0x0003,
// Dist role is for split_byref/split_selected_rows/concat
// used for distributed training.
kDist = 0x0004,
// Tag all learning rate scheduler operators.
kLRSched = 0x0005,
kLoss = 0x0100, kLoss = 0x0100,
// The default value of op's role. This should be only used for unittests and // The default value of op's role. This should be only used for unittests and
......
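
The two new roles let distributed-training helper ops and learning-rate scheduling ops be tagged so later passes and the transpiler can treat them specially. A sketch of tagging an op, assuming the op-role attribute accessor on OpProtoAndCheckerMaker (the accessor itself is not part of this patch):

op_desc->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
                 static_cast<int>(OpRole::kLRSched));
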
...@@ -57,6 +57,21 @@ std::unique_ptr<ir::Graph> ApplyParallelExecutorPass( ...@@ -57,6 +57,21 @@ std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
graph = viz_pass->Apply(std::move(graph)); graph = viz_pass->Apply(std::move(graph));
} }
// Apply op fusion.
if (strategy.fuse_elewise_add_act_ops_) {
auto fuse_elewise_add_act_pass =
ir::PassRegistry::Instance().Get("fuse_elewise_add_act_pass");
graph = fuse_elewise_add_act_pass->Apply(std::move(graph));
// Apply a graph viz pass to record a graph.
if (!strategy.debug_graphviz_path_.empty()) {
auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
const std::string graph_path = string::Sprintf(
"%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph");
viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
graph = viz_pass->Apply(std::move(graph));
}
}
// Convert graph to run on multi-devices. // Convert graph to run on multi-devices.
auto multi_devices_pass = auto multi_devices_pass =
ir::PassRegistry::Instance().Get("multi_devices_pass"); ir::PassRegistry::Instance().Get("multi_devices_pass");
...@@ -359,6 +374,7 @@ ParallelExecutor::~ParallelExecutor() { ...@@ -359,6 +374,7 @@ ParallelExecutor::~ParallelExecutor() {
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
USE_PASS(fuse_elewise_add_act_pass);
USE_PASS(graph_viz_pass); USE_PASS(graph_viz_pass);
USE_PASS(multi_devices_pass); USE_PASS(multi_devices_pass);
USE_PASS(multi_devices_check_pass); USE_PASS(multi_devices_check_pass);
......
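
The fusion above only runs when the build strategy asks for it, and it runs before multi_devices_pass so the fused ops are what gets replicated across devices. Enabling it from C++ is a one-field switch (the member name is taken from the condition above; how it is surfaced to Python is outside this patch):

BuildStrategy build_strategy;
build_strategy.fuse_elewise_add_act_ops_ = true;
// hand build_strategy to the ParallelExecutor as usual
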
...@@ -103,108 +103,74 @@ void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -103,108 +103,74 @@ void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
input_slots->assign({input_tensor}); input_slots->assign({input_tensor});
} }
const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25, void SetConfig(AnalysisConfig *cfg) {
25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43, cfg->model_dir = FLAGS_infer_model;
44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39, cfg->use_gpu = false;
14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23}; cfg->device = 0;
cfg->specify_input_name = true;
void TestLACPrediction(const std::string &model_path, cfg->enable_ir_optim = true;
const std::string &data_file, const int batch_size, }
const int repeat, bool use_analysis = false) {
AnalysisConfig cfg;
cfg.model_dir = model_path;
cfg.use_gpu = false;
cfg.device = 0;
cfg.specify_input_name = true;
cfg.enable_ir_optim = true;
std::vector<PaddleTensor> input_slots, outputs_slots; void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
DataRecord data(data_file, batch_size); DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
GetOneBatch(&input_slots, &data, batch_size); std::vector<PaddleTensor> input_slots;
std::unique_ptr<PaddlePredictor> predictor; int epoch = FLAGS_test_all_data ? data.batched_datas.size() : 1;
if (use_analysis) { LOG(INFO) << "number of samples: " << epoch;
predictor = for (int bid = 0; bid < epoch; ++bid) {
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg); GetOneBatch(&input_slots, &data, FLAGS_batch_size);
} else { (*inputs).emplace_back(input_slots);
predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
}
for (int i = 0; i < FLAGS_burning; i++) {
predictor->Run(input_slots, &outputs_slots);
} }
Timer timer; }
if (FLAGS_test_all_data) {
LOG(INFO) << "test all data";
std::vector<std::vector<PaddleTensor>> input_slots_all;
for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
GetOneBatch(&input_slots, &data, batch_size);
input_slots_all.emplace_back(input_slots);
}
LOG(INFO) << "total number of samples: " << data.datasets.size();
TestPrediction(cfg, input_slots_all, &outputs_slots, FLAGS_num_threads);
return;
}
timer.tic();
for (int i = 0; i < repeat; i++) {
predictor->Run(input_slots, &outputs_slots);
}
PrintTime(batch_size, repeat, 1, 0, timer.toc() / repeat);
// check result // Easy for profiling independently.
EXPECT_EQ(outputs_slots.size(), 1UL); TEST(Analyzer_LAC, profile) {
auto &out = outputs_slots[0]; AnalysisConfig cfg;
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, SetConfig(&cfg);
[](int a, int b) { return a * b; }); std::vector<PaddleTensor> outputs;
size_t batch1_size = sizeof(lac_ref_data) / sizeof(int64_t);
PADDLE_ENFORCE_GT(size, 0);
EXPECT_GE(size, batch1_size);
int64_t *pdata = static_cast<int64_t *>(out.data.data());
for (size_t i = 0; i < batch1_size; ++i) {
EXPECT_EQ(pdata[i], lac_ref_data[i]);
}
if (use_analysis) { std::vector<std::vector<PaddleTensor>> input_slots_all;
// run once for comparion as reference SetInput(&input_slots_all);
auto ref_predictor = TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
std::vector<PaddleTensor> ref_outputs_slots;
ref_predictor->Run(input_slots, &ref_outputs_slots);
CompareResult(ref_outputs_slots, outputs_slots);
AnalysisPredictor *analysis_predictor = if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
dynamic_cast<AnalysisPredictor *>(predictor.get()); // the first inference result
auto &fuse_statis = analysis_predictor->analysis_argument() const int64_t lac_ref_data[] = {
.Get<std::unordered_map<std::string, int>>( 24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25, 25, 25, 25, 25,
framework::ir::kFuseStatisAttr); 44, 24, 25, 25, 25, 36, 42, 43, 44, 14, 15, 44, 14, 15, 44, 14,
for (auto &item : fuse_statis) { 15, 44, 38, 39, 14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23};
LOG(INFO) << "fused " << item.first << " " << item.second; PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
} size_t size = GetSize(outputs[0]);
int num_ops = 0; size_t batch1_size = sizeof(lac_ref_data) / sizeof(int64_t);
for (auto &node : PADDLE_ENFORCE_GE(size, batch1_size);
analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) { int64_t *pdata = static_cast<int64_t *>(outputs[0].data.data());
if (node->IsFunction()) { for (size_t i = 0; i < batch1_size; ++i) {
++num_ops; EXPECT_EQ(pdata[i], lac_ref_data[i]);
}
} }
LOG(INFO) << "has num ops: " << num_ops;
ASSERT_TRUE(fuse_statis.count("fc_fuse"));
ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 4);
EXPECT_EQ(num_ops, 11);
} }
} }
TEST(Analyzer_LAC, native) { // Check the fuse status
LOG(INFO) << "LAC with native"; TEST(Analyzer_LAC, fuse_statis) {
TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size, AnalysisConfig cfg;
FLAGS_repeat); SetConfig(&cfg);
int num_ops;
auto fuse_statis = GetFuseStatis(cfg, &num_ops);
ASSERT_TRUE(fuse_statis.count("fc_fuse"));
ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 4);
EXPECT_EQ(num_ops, 11);
} }
TEST(Analyzer_LAC, analysis) { // Compare result of NativeConfig and AnalysisConfig
LOG(INFO) << "LAC with analysis"; TEST(Analyzer_LAC, compare) {
TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size, AnalysisConfig cfg;
FLAGS_repeat, true); SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all);
} }
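
After this refactor every analyzer model gets the same three focused tests (profile, fuse_statis, compare) built from small SetConfig/SetInput helpers; TestPrediction, GetFuseStatis and CompareNativeAndAnalysis are assumed to come from the shared tester helper these files include. A new analyzer test would follow the same skeleton (the model name below is a placeholder):

TEST(Analyzer_some_model, profile) {
  AnalysisConfig cfg;
  SetConfig(&cfg);
  std::vector<PaddleTensor> outputs;
  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
}
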
} // namespace analysis } // namespace analysis
......
...@@ -95,97 +95,73 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -95,97 +95,73 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
} }
} }
// the first inference result void SetConfig(AnalysisConfig *cfg) {
const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26, cfg->prog_file = FLAGS_infer_model + "/__model__";
48, 39, 38, 16, 25}; cfg->param_file = FLAGS_infer_model + "/param";
cfg->use_gpu = false;
void TestChineseNERPrediction(bool use_analysis) { cfg->device = 0;
AnalysisConfig cfg; cfg->specify_input_name = true;
cfg.prog_file = FLAGS_infer_model + "/__model__"; cfg->enable_ir_optim = true;
cfg.param_file = FLAGS_infer_model + "/param"; }
cfg.use_gpu = false;
cfg.device = 0;
cfg.specify_input_name = true;
cfg.enable_ir_optim = true;
std::vector<PaddleTensor> input_slots, outputs;
std::unique_ptr<PaddlePredictor> predictor;
Timer timer;
if (use_analysis) {
predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
} else {
predictor =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
}
if (FLAGS_test_all_data) { void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
LOG(INFO) << "test all data";
DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
std::vector<std::vector<PaddleTensor>> input_slots_all;
for (size_t bid = 0; bid < data.num_samples / FLAGS_batch_size; ++bid) {
PrepareInputs(&input_slots, &data, FLAGS_batch_size);
input_slots_all.emplace_back(input_slots);
}
LOG(INFO) << "total number of samples: " << data.num_samples;
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
return;
}
// Prepare inputs.
DataRecord data(FLAGS_infer_data, FLAGS_batch_size); DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
PrepareInputs(&input_slots, &data, FLAGS_batch_size); std::vector<PaddleTensor> input_slots;
int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
timer.tic(); LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size;
for (int i = 0; i < FLAGS_repeat; i++) { for (int bid = 0; bid < epoch; ++bid) {
predictor->Run(input_slots, &outputs); PrepareInputs(&input_slots, &data, FLAGS_batch_size);
(*inputs).emplace_back(input_slots);
} }
PrintTime(FLAGS_batch_size, FLAGS_repeat, 1, 0, timer.toc() / FLAGS_repeat); }
PADDLE_ENFORCE(outputs.size(), 1UL); // Easy for profiling independently.
auto &out = outputs[0]; TEST(Analyzer_Chinese_ner, profile) {
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, AnalysisConfig cfg;
[](int a, int b) { return a * b; }); SetConfig(&cfg);
PADDLE_ENFORCE_GT(size, 0); std::vector<PaddleTensor> outputs;
int64_t *result = static_cast<int64_t *>(out.data.data());
for (size_t i = 0; i < std::min(11UL, size); i++) {
PADDLE_ENFORCE(result[i], chinese_ner_result_data[i]);
}
if (use_analysis) { std::vector<std::vector<PaddleTensor>> input_slots_all;
// run once for comparison as reference SetInput(&input_slots_all);
auto ref_predictor = TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
std::vector<PaddleTensor> ref_outputs_slots;
ref_predictor->Run(input_slots, &ref_outputs_slots);
CompareResult(ref_outputs_slots, outputs);
AnalysisPredictor *analysis_predictor = if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
dynamic_cast<AnalysisPredictor *>(predictor.get()); // the first inference result
auto &fuse_statis = analysis_predictor->analysis_argument() const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
.Get<std::unordered_map<std::string, int>>( 48, 39, 38, 16, 25};
framework::ir::kFuseStatisAttr); PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
for (auto &item : fuse_statis) { size_t size = GetSize(outputs[0]);
LOG(INFO) << "fused " << item.first << " " << item.second; PADDLE_ENFORCE_GT(size, 0);
} int64_t *result = static_cast<int64_t *>(outputs[0].data.data());
int num_ops = 0; for (size_t i = 0; i < std::min(11UL, size); i++) {
for (auto &node : EXPECT_EQ(result[i], chinese_ner_result_data[i]);
analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
if (node->IsFunction()) {
++num_ops;
}
} }
LOG(INFO) << "has num ops: " << num_ops;
ASSERT_TRUE(fuse_statis.count("fc_fuse"));
ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 2);
EXPECT_EQ(num_ops, 14);
} }
} }
TEST(Analyzer_Chinese_ner, native) { TestChineseNERPrediction(false); } // Check the fuse status
TEST(Analyzer_Chinese_ner, fuse_statis) {
AnalysisConfig cfg;
SetConfig(&cfg);
TEST(Analyzer_Chinese_ner, analysis) { TestChineseNERPrediction(true); } int num_ops;
auto fuse_statis = GetFuseStatis(cfg, &num_ops);
ASSERT_TRUE(fuse_statis.count("fc_fuse"));
ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 2);
EXPECT_EQ(num_ops, 14);
}
// Compare result of NativeConfig and AnalysisConfig
TEST(Analyzer_Chinese_ner, compare) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all);
}
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -25,6 +25,7 @@ struct DataRecord { ...@@ -25,6 +25,7 @@ struct DataRecord {
std::vector<size_t> lod1, lod2, lod3; std::vector<size_t> lod1, lod2, lod3;
std::vector<std::vector<float>> rnn_link_data, rnn_week_datas, std::vector<std::vector<float>> rnn_link_data, rnn_week_datas,
rnn_minute_datas; rnn_minute_datas;
size_t num_samples; // total number of samples
size_t batch_iter{0}; size_t batch_iter{0};
size_t batch_size{1}; size_t batch_size{1};
DataRecord() = default; DataRecord() = default;
...@@ -97,6 +98,7 @@ struct DataRecord { ...@@ -97,6 +98,7 @@ struct DataRecord {
week_data_all.push_back(std::move(week_data)); week_data_all.push_back(std::move(week_data));
minute_data_all.push_back(std::move(minute_data)); minute_data_all.push_back(std::move(minute_data));
} }
num_samples = num_lines;
} }
}; };
void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
...@@ -147,89 +149,72 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -147,89 +149,72 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
} }
} }
// Test with a really complicated model. void SetConfig(AnalysisConfig *cfg) {
void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) { cfg->prog_file = FLAGS_infer_model + "/__model__";
AnalysisConfig config; cfg->param_file = FLAGS_infer_model + "/param";
config.prog_file = FLAGS_infer_model + "/__model__"; cfg->use_gpu = false;
config.param_file = FLAGS_infer_model + "/param"; cfg->device = 0;
config.use_gpu = false; cfg->specify_input_name = true;
config.device = 0; cfg->enable_ir_optim = true;
config.specify_input_name = true; cfg->ir_passes.clear(); // Do not exclude any pass.
config.enable_ir_optim = activate_ir; }
PADDLE_ENFORCE(config.ir_mode ==
AnalysisConfig::IrPassMode::kExclude); // default
config.ir_passes.clear(); // Do not exclude any pass.
int batch_size = FLAGS_batch_size;
auto base_predictor = void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
auto predictor =
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
std::vector<PaddleTensor> input_slots; std::vector<PaddleTensor> input_slots;
DataRecord data(FLAGS_infer_data, batch_size); int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
// Prepare inputs. LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size;
PrepareInputs(&input_slots, &data, batch_size); for (int bid = 0; bid < epoch; ++bid) {
std::vector<PaddleTensor> outputs, base_outputs; PrepareInputs(&input_slots, &data, FLAGS_batch_size);
(*inputs).emplace_back(input_slots);
}
}
base_predictor->Run(input_slots, &base_outputs); // Easy for profiling independently.
TEST(Analyzer_rnn1, profile) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
input_slots_all.emplace_back(input_slots); SetInput(&input_slots_all);
if (num_threads == 1) { TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
TestOneThreadPrediction(config, input_slots_all, &outputs); }
CompareResult(outputs, base_outputs);
} else {
// only return the output of first thread
TestMultiThreadPrediction(config, input_slots_all, &outputs, num_threads);
}
if (use_analysis && activate_ir) { // Check the fuse status
AnalysisPredictor *analysis_predictor = TEST(Analyzer_rnn1, fuse_statis) {
dynamic_cast<AnalysisPredictor *>(predictor.get()); AnalysisConfig cfg;
auto &fuse_statis = analysis_predictor->analysis_argument() SetConfig(&cfg);
.Get<std::unordered_map<std::string, int>>(
framework::ir::kFuseStatisAttr);
for (auto &item : fuse_statis) {
LOG(INFO) << "fused " << item.first << " " << item.second;
}
int num_ops = 0; int num_ops;
for (auto &node : auto fuse_statis = GetFuseStatis(cfg, &num_ops);
analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) { ASSERT_TRUE(fuse_statis.count("fc_fuse"));
if (node->IsFunction()) { EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
++num_ops; EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM
} EXPECT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1);
} EXPECT_EQ(num_ops,
LOG(INFO) << "has num ops: " << num_ops; 13); // After graph optimization, only 13 operators exists.
}
ASSERT_TRUE(fuse_statis.count("fc_fuse")); // Compare result of NativeConfig and AnalysisConfig
EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); TEST(Analyzer_rnn1, compare) {
EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM AnalysisConfig cfg;
EXPECT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1); SetConfig(&cfg);
EXPECT_EQ(num_ops,
13); // After graph optimization, only 13 operators exist. std::vector<std::vector<PaddleTensor>> input_slots_all;
} SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all);
} }
// Inference with analysis and IR, easy for profiling independently. // Test Multi-Thread.
TEST(Analyzer, rnn1) { TestRNN1Prediction(true, true, FLAGS_num_threads); } TEST(Analyzer_rnn1, multi_thread) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<PaddleTensor> outputs;
// Other unit-tests of RNN1, test different options of use_analysis, std::vector<std::vector<PaddleTensor>> input_slots_all;
// activate_ir and multi-threads. SetInput(&input_slots_all);
TEST(Analyzer, RNN_tests) { TestPrediction(cfg, input_slots_all, &outputs, 4 /* num_threads */);
int num_threads[2] = {1, 4};
for (auto i : num_threads) {
// Directly infer with the original model.
TestRNN1Prediction(false, false, i);
// Inference with the original model with the analysis turned on, the
// analysis module will transform the program to a data flow graph.
TestRNN1Prediction(true, false, i);
// Inference with analysis and IR. The IR module will fuse some large
// kernels.
TestRNN1Prediction(true, true, i);
}
} }
} // namespace inference } // namespace inference
......
...@@ -12,24 +12,7 @@ ...@@ -12,24 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/tests/api/tester_helper.h"
#include <google/protobuf/text_format.h>
#include <gtest/gtest.h>
#include <thread> // NOLINT
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
DEFINE_string(infer_model, "", "model path");
DEFINE_string(infer_data, "", "data path");
DEFINE_int32(batch_size, 1, "batch size.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
namespace paddle { namespace paddle {
namespace inference { namespace inference {
...@@ -41,6 +24,7 @@ struct DataRecord { ...@@ -41,6 +24,7 @@ struct DataRecord {
std::vector<size_t> lod; std::vector<size_t> lod;
std::vector<std::vector<float>> rnn_link_data; std::vector<std::vector<float>> rnn_link_data;
std::vector<float> result_data; std::vector<float> result_data;
size_t num_samples; // total number of samples
size_t batch_iter{0}; size_t batch_iter{0};
size_t batch_size{1}; size_t batch_size{1};
DataRecord() = default; DataRecord() = default;
...@@ -100,6 +84,7 @@ struct DataRecord { ...@@ -100,6 +84,7 @@ struct DataRecord {
result_data.insert(result_data.end(), tmp.begin(), tmp.end()); result_data.insert(result_data.end(), tmp.begin(), tmp.end());
} }
} }
num_samples = num_lines / 2;
} }
}; };
void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
...@@ -118,64 +103,58 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -118,64 +103,58 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
input_slots->assign({feed_tensor}); input_slots->assign({feed_tensor});
} }
void CompareResult(const std::vector<PaddleTensor> &outputs, void SetConfig(AnalysisConfig *cfg) {
const std::vector<float> &base_result) { cfg->prog_file = FLAGS_infer_model + "/__model__";
PADDLE_ENFORCE_GT(outputs.size(), 0); cfg->param_file = FLAGS_infer_model + "/param";
for (size_t i = 0; i < outputs.size(); i++) { cfg->use_gpu = false;
auto &out = outputs[i]; cfg->device = 0;
size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, cfg->specify_input_name = true;
[](int a, int b) { return a * b; }); cfg->enable_ir_optim = true;
PADDLE_ENFORCE_GT(size, 0); }
float *data = static_cast<float *>(out.data.data());
for (size_t i = 0; i < size; i++) { void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
EXPECT_NEAR(data[i], base_result[i], 1e-3); DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
} std::vector<PaddleTensor> input_slots;
int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size;
for (int bid = 0; bid < epoch; ++bid) {
PrepareInputs(&input_slots, &data, FLAGS_batch_size);
(*inputs).emplace_back(input_slots);
} }
} }
// Test with a really complicated model.
void TestRNN2Prediction() {
AnalysisConfig config;
config.prog_file = FLAGS_infer_model + "/__model__";
config.param_file = FLAGS_infer_model + "/param";
config.use_gpu = false;
config.device = 0;
config.specify_input_name = true;
config.enable_ir_optim = true;
PADDLE_ENFORCE(config.ir_mode ==
AnalysisConfig::IrPassMode::kExclude); // default
int batch_size = FLAGS_batch_size; // Easy for profiling independently.
int num_times = FLAGS_repeat; TEST(Analyzer_rnn2, profile) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<PaddleTensor> outputs;
auto base_predictor = std::vector<std::vector<PaddleTensor>> input_slots_all;
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); SetInput(&input_slots_all);
auto predictor = TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
std::vector<PaddleTensor> input_slots;
DataRecord data(FLAGS_infer_data, batch_size);
PrepareInputs(&input_slots, &data, batch_size);
std::vector<PaddleTensor> outputs, base_outputs;
Timer timer1; if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
timer1.tic(); // the first inference result
for (int i = 0; i < num_times; i++) { DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
base_predictor->Run(input_slots, &base_outputs); PADDLE_ENFORCE_GT(outputs.size(), 0);
size_t size = GetSize(outputs[0]);
PADDLE_ENFORCE_GT(size, 0);
float *result = static_cast<float *>(outputs[0].data.data());
for (size_t i = 0; i < size; i++) {
EXPECT_NEAR(result[i], data.result_data[i], 1e-3);
}
} }
PrintTime(batch_size, num_times, 1, 0, timer1.toc() / num_times); }
Timer timer2; // Compare result of NativeConfig and AnalysisConfig
timer2.tic(); TEST(Analyzer_rnn2, compare) {
for (int i = 0; i < num_times; i++) { AnalysisConfig cfg;
predictor->Run(input_slots, &outputs); SetConfig(&cfg);
}
PrintTime(batch_size, num_times, 1, 0, timer2.toc() / num_times);
CompareResult(base_outputs, data.result_data); std::vector<std::vector<PaddleTensor>> input_slots_all;
CompareResult(outputs, data.result_data); SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all);
} }
TEST(Analyzer, rnn2) { TestRNN2Prediction(); }
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -46,54 +46,63 @@ struct DataReader { ...@@ -46,54 +46,63 @@ struct DataReader {
std::unique_ptr<std::ifstream> file; std::unique_ptr<std::ifstream> file;
}; };
void Main(int batch_size) { void SetConfig(AnalysisConfig *cfg) {
// shape -- cfg->model_dir = FLAGS_infer_model;
// Create Predictor -- cfg->use_gpu = false;
AnalysisConfig config; cfg->device = 0;
config.model_dir = FLAGS_infer_model; cfg->specify_input_name = true;
config.use_gpu = false; cfg->enable_ir_optim = true;
config.enable_ir_optim = true; }
std::vector<PaddleTensor> input_slots, output_slots; void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
std::vector<PaddleTensor> input_slots;
DataReader reader(FLAGS_infer_data); DataReader reader(FLAGS_infer_data);
std::vector<std::vector<PaddleTensor>> input_slots_all; int num_batches = 0;
while (reader.NextBatch(&input_slots, FLAGS_batch_size)) {
if (FLAGS_test_all_data) { (*inputs).emplace_back(input_slots);
LOG(INFO) << "test all data"; ++num_batches;
int num_batches = 0; if (!FLAGS_test_all_data) return;
while (reader.NextBatch(&input_slots, FLAGS_batch_size)) {
input_slots_all.emplace_back(input_slots);
++num_batches;
}
LOG(INFO) << "total number of samples: " << num_batches * FLAGS_batch_size;
TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads);
return;
} }
LOG(INFO) << "total number of samples: " << num_batches * FLAGS_batch_size;
}
// one batch starts // Easy for profiling independently.
// data -- TEST(Analyzer_Text_Classification, profile) {
reader.NextBatch(&input_slots, FLAGS_batch_size); AnalysisConfig cfg;
input_slots_all.emplace_back(input_slots); SetConfig(&cfg);
TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads); std::vector<PaddleTensor> outputs;
// Get output std::vector<std::vector<PaddleTensor>> input_slots_all;
LOG(INFO) << "get outputs " << output_slots.size(); SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
for (auto &output : output_slots) { if (FLAGS_num_threads == 1) {
LOG(INFO) << "output.shape: " << to_string(output.shape); // Get output
// no lod ? LOG(INFO) << "get outputs " << outputs.size();
CHECK_EQ(output.lod.size(), 0UL); for (auto &output : outputs) {
LOG(INFO) << "output.dtype: " << output.dtype; LOG(INFO) << "output.shape: " << to_string(output.shape);
std::stringstream ss; // no lod ?
for (int i = 0; i < 5; i++) { CHECK_EQ(output.lod.size(), 0UL);
ss << static_cast<float *>(output.data.data())[i] << " "; LOG(INFO) << "output.dtype: " << output.dtype;
std::stringstream ss;
for (int i = 0; i < 5; i++) {
ss << static_cast<float *>(output.data.data())[i] << " ";
}
LOG(INFO) << "output.data summary: " << ss.str();
// one batch ends
} }
LOG(INFO) << "output.data summary: " << ss.str();
// one batch ends
} }
} }
TEST(text_classification, basic) { Main(FLAGS_batch_size); } // Compare result of NativeConfig and AnalysisConfig
TEST(Analyzer_Text_Classification, compare) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all);
}
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -49,84 +49,83 @@ Record ProcessALine(const std::string &line) { ...@@ -49,84 +49,83 @@ Record ProcessALine(const std::string &line) {
return record; return record;
} }
/* void SetConfig(AnalysisConfig *cfg) {
* Use the native and analysis fluid engines to run inference on the demo models. cfg->param_file = FLAGS_infer_model + "/__params__";
* ocr, mobilenet and se_resnext50 cfg->prog_file = FLAGS_infer_model + "/__model__";
*/ cfg->use_gpu = false;
void TestVisualPrediction(bool use_mkldnn) { cfg->device = 0;
std::unique_ptr<PaddlePredictor> predictor; cfg->enable_ir_optim = true;
AnalysisConfig cfg; cfg->specify_input_name = true;
cfg.param_file = FLAGS_infer_model + "/__params__";
cfg.prog_file = FLAGS_infer_model + "/__model__";
cfg.use_gpu = false;
cfg._use_mkldnn = use_mkldnn;
cfg.device = 0;
cfg.enable_ir_optim = true;
// TODO(TJ): fix fusion gru // TODO(TJ): fix fusion gru
cfg.ir_passes.push_back("fc_gru_fuse_pass"); cfg->ir_passes.push_back("fc_gru_fuse_pass");
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
cfg->_use_mkldnn = true;
// disable mkldnn fuse since it should have some bugs // disable mkldnn fuse since it should have some bugs
cfg.ir_passes.push_back("conv_relu_mkldnn_fuse_pass"); cfg->ir_passes.push_back("conv_relu_mkldnn_fuse_pass");
#endif #endif
predictor = }
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
// Only have single batch of data. void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data.");
std::string line; std::string line;
std::ifstream file(FLAGS_infer_data); std::ifstream file(FLAGS_infer_data);
std::getline(file, line); std::getline(file, line);
auto record = ProcessALine(line); auto record = ProcessALine(line);
file.close();
// Inference.
PaddleTensor input; PaddleTensor input;
input.shape = record.shape; input.shape = record.shape;
input.data =
PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
input.dtype = PaddleDType::FLOAT32; input.dtype = PaddleDType::FLOAT32;
size_t input_size = record.data.size() * sizeof(float);
input.data.Resize(input_size);
memcpy(input.data.data(), record.data.data(), input_size);
std::vector<PaddleTensor> input_slots;
input_slots.assign({input});
(*inputs).emplace_back(input_slots);
}
std::vector<PaddleTensor> outputs_slots; // Easy for profiling independently.
Timer timer; // ocr, mobilenet and se_resnext50
timer.tic(); TEST(Analyzer_vis, profile) {
for (int i = 0; i < FLAGS_repeat; i++) { AnalysisConfig cfg;
predictor->Run({input}, &outputs_slots); SetConfig(&cfg);
} std::vector<PaddleTensor> outputs;
PrintTime(/*batch size*/ 1, FLAGS_repeat, /*num threads*/ 1, /*thread id*/ 0,
timer.toc() / FLAGS_repeat); std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
VLOG(3) << "output.size " << outputs_slots.size(); TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
// run native as reference if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
auto ref_predictor = const float ocr_result_data[] = {
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg); 5.273636460856323538e-08, 3.296741795111302054e-07,
std::vector<PaddleTensor> ref_outputs_slots; 1.873261190610264748e-08, 3.403730275408634043e-08,
ref_predictor->Run({input}, &ref_outputs_slots); 3.383312474625199684e-08};
CompareResult(outputs_slots, ref_outputs_slots); PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
// print what are fused size_t size = GetSize(outputs[0]);
AnalysisPredictor *analysis_predictor = PADDLE_ENFORCE_GT(size, 0);
dynamic_cast<AnalysisPredictor *>(predictor.get()); float *result = static_cast<float *>(outputs[0].data.data());
auto &fuse_statis = analysis_predictor->analysis_argument() for (size_t i = 0; i < std::min(5UL, size); i++) {
.Get<std::unordered_map<std::string, int>>( EXPECT_NEAR(result[i], ocr_result_data[i], 1e-3);
framework::ir::kFuseStatisAttr);
for (auto &item : fuse_statis) {
LOG(INFO) << "fused " << item.first << " " << item.second;
}
int num_ops = 0;
for (auto &node :
analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
if (node->IsFunction()) {
++num_ops;
} }
} }
LOG(INFO) << "has num ops: " << num_ops;
} }
TEST(Analyzer_vis, analysis) { TestVisualPrediction(/*use_mkldnn*/ false); } // Check the fuse status
#ifdef PADDLE_WITH_MKLDNN TEST(Analyzer_vis, fuse_statis) {
TEST(Analyzer_vis, analysis_mkldnn) { AnalysisConfig cfg;
TestVisualPrediction(/*use_mkldnn*/ true); SetConfig(&cfg);
int num_ops;
GetFuseStatis(cfg, &num_ops);
}
// Compare result of NativeConfig and AnalysisConfig
TEST(Analyzer_vis, compare) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareNativeAndAnalysis(cfg, input_slots_all);
} }
#endif
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#pragma once #pragma once
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <string>
#include <thread> // NOLINT #include <thread> // NOLINT
#include <vector> #include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
...@@ -28,17 +29,18 @@ ...@@ -28,17 +29,18 @@
DEFINE_string(infer_model, "", "model path"); DEFINE_string(infer_model, "", "model path");
DEFINE_string(infer_data, "", "data file"); DEFINE_string(infer_data, "", "data file");
DEFINE_int32(batch_size, 1, "batch size."); DEFINE_int32(batch_size, 1, "batch size.");
DEFINE_int32(burning, 0, "Number of burning (warm-up) runs before the timed repeats.");
DEFINE_int32(repeat, 1, "Running the inference program repeat times."); DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
DEFINE_bool(test_all_data, false, "Test all the data in the data file."); DEFINE_bool(test_all_data, false, "Test all the data in the data file.");
DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads."); DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
DEFINE_bool(use_analysis, true,
"Running the inference program in analysis mode.");
namespace paddle { namespace paddle {
namespace inference { namespace inference {
void CompareResult(const std::vector<PaddleTensor> &outputs, void CompareResult(const std::vector<PaddleTensor> &outputs,
const std::vector<PaddleTensor> &ref_outputs) { const std::vector<PaddleTensor> &ref_outputs) {
EXPECT_GT(outputs.size(), 0); EXPECT_GT(outputs.size(), 0UL);
EXPECT_EQ(outputs.size(), ref_outputs.size()); EXPECT_EQ(outputs.size(), ref_outputs.size());
for (size_t i = 0; i < outputs.size(); i++) { for (size_t i = 0; i < outputs.size(); i++) {
auto &out = outputs[i]; auto &out = outputs[i];
...@@ -72,14 +74,50 @@ void CompareResult(const std::vector<PaddleTensor> &outputs, ...@@ -72,14 +74,50 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
} }
} }
std::unique_ptr<PaddlePredictor> GetPrediction(AnalysisConfig config,
bool use_analysis = true) {
if (use_analysis) {
return CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
} else {
return CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
config);
}
}
size_t GetSize(const PaddleTensor &out) {
return std::accumulate(out.shape.begin(), out.shape.end(), 1,
[](int a, int b) { return a * b; });
}
std::unordered_map<std::string, int> GetFuseStatis(AnalysisConfig config,
int *num_ops) {
auto predictor = GetPrediction(config);
AnalysisPredictor *analysis_predictor =
dynamic_cast<AnalysisPredictor *>(predictor.get());
auto &fuse_statis = analysis_predictor->analysis_argument()
.Get<std::unordered_map<std::string, int>>(
framework::ir::kFuseStatisAttr);
for (auto &item : fuse_statis) {
LOG(INFO) << "fused " << item.first << " " << item.second;
}
int num = 0;
for (auto &node :
analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
if (node->IsFunction()) {
++num;
}
}
*num_ops = num;
return fuse_statis;
}
void TestOneThreadPrediction( void TestOneThreadPrediction(
AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs, AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs,
std::vector<PaddleTensor> *outputs) { std::vector<PaddleTensor> *outputs, bool use_analysis = true) {
int batch_size = FLAGS_batch_size; int batch_size = FLAGS_batch_size;
int num_times = FLAGS_repeat; int num_times = FLAGS_repeat;
auto predictor = auto predictor = GetPrediction(config, use_analysis);
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config);
Timer timer; Timer timer;
timer.tic(); timer.tic();
for (int i = 0; i < num_times; i++) { for (int i = 0; i < num_times; i++) {
...@@ -93,7 +131,8 @@ void TestOneThreadPrediction( ...@@ -93,7 +131,8 @@ void TestOneThreadPrediction(
void TestMultiThreadPrediction( void TestMultiThreadPrediction(
AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs, AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs,
std::vector<PaddleTensor> *outputs, int num_threads) { std::vector<PaddleTensor> *outputs, int num_threads,
bool use_analysis = true) {
int batch_size = FLAGS_batch_size; int batch_size = FLAGS_batch_size;
int num_times = FLAGS_repeat; int num_times = FLAGS_repeat;
std::vector<std::thread> threads; std::vector<std::thread> threads;
...@@ -101,9 +140,7 @@ void TestMultiThreadPrediction( ...@@ -101,9 +140,7 @@ void TestMultiThreadPrediction(
// TODO(yanchunwei): Bug here, the analyzer phase can't be parallelized // because AttentionLSTM's hard-coded node id will be damaged.
// because AttentionLSTM's hard-coded node id will be damaged. // because AttentionLSTM's hard-coded node id will be damaged.
for (int tid = 0; tid < num_threads; ++tid) { for (int tid = 0; tid < num_threads; ++tid) {
predictors.emplace_back( predictors.emplace_back(GetPrediction(config, use_analysis));
CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
config));
} }
for (int tid = 0; tid < num_threads; ++tid) { for (int tid = 0; tid < num_threads; ++tid) {
threads.emplace_back([&, tid]() { threads.emplace_back([&, tid]() {
...@@ -129,13 +166,25 @@ void TestMultiThreadPrediction( ...@@ -129,13 +166,25 @@ void TestMultiThreadPrediction(
void TestPrediction(AnalysisConfig config, void TestPrediction(AnalysisConfig config,
const std::vector<std::vector<PaddleTensor>> inputs, const std::vector<std::vector<PaddleTensor>> inputs,
std::vector<PaddleTensor> *outputs, int num_threads) { std::vector<PaddleTensor> *outputs, int num_threads,
bool use_analysis = FLAGS_use_analysis) {
LOG(INFO) << "use_analysis: " << use_analysis;
if (num_threads == 1) { if (num_threads == 1) {
TestOneThreadPrediction(config, inputs, outputs); TestOneThreadPrediction(config, inputs, outputs, use_analysis);
} else { } else {
TestMultiThreadPrediction(config, inputs, outputs, num_threads); TestMultiThreadPrediction(config, inputs, outputs, num_threads,
use_analysis);
} }
} }
void CompareNativeAndAnalysis(
AnalysisConfig config,
const std::vector<std::vector<PaddleTensor>> inputs) {
std::vector<PaddleTensor> native_outputs, analysis_outputs;
TestOneThreadPrediction(config, inputs, &native_outputs, false);
TestOneThreadPrediction(config, inputs, &analysis_outputs, true);
CompareResult(analysis_outputs, native_outputs);
}
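For illustration only, a minimal sketch of how a further analyzer test could reuse the helpers above; the model name Analyzer_foo and its SetConfig/SetInput are hypothetical stand-ins, not part of this change:

TEST(Analyzer_foo, compare) {
  AnalysisConfig cfg;
  SetConfig(&cfg);  // hypothetical per-model config, like the tests above
  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);  // hypothetical per-model input preparation
  // One native run and one analysis run; outputs are checked element-wise
  // by CompareResult inside CompareNativeAndAnalysis.
  CompareNativeAndAnalysis(cfg, input_slots_all);
}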
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -174,12 +174,13 @@ struct SparseAdamFunctor { ...@@ -174,12 +174,13 @@ struct SparseAdamFunctor {
const int64_t* rows_; const int64_t* rows_;
int64_t row_numel_; int64_t row_numel_;
int64_t row_count_;
SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow, SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
const T* beta2_pow, const T* mom1, T* mom1_out, const T* beta2_pow, const T* mom1, T* mom1_out,
const T* mom2, T* mom2_out, const T* lr, const T* grad, const T* mom2, T* mom2_out, const T* lr, const T* grad,
const T* param, T* param_out, const int64_t* rows, const T* param, T* param_out, const int64_t* rows,
int64_t row_numel) int64_t row_numel, int64_t row_count)
: beta1_(beta1), : beta1_(beta1),
beta2_(beta2), beta2_(beta2),
epsilon_(epsilon), epsilon_(epsilon),
...@@ -194,28 +195,47 @@ struct SparseAdamFunctor { ...@@ -194,28 +195,47 @@ struct SparseAdamFunctor {
param_(param), param_(param),
param_out_(param_out), param_out_(param_out),
rows_(rows), rows_(rows),
row_numel_(row_numel) {} row_numel_(row_numel),
row_count_(row_count) {}
inline HOSTDEVICE int64_t BinarySearchInRows(int64_t row) const {
int64_t beg = 0, end = row_count_ - 1;
while (beg <= end) {
auto mid = ((beg + end) >> 1);
if (rows_[mid] == row)
return mid;
else if (rows_[mid] < row)
beg = mid + 1;
else
end = mid - 1;
}
return -1;
}
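To see what the per-element lookup buys, a short worked trace with made-up values (not part of the kernel):

// row_numel_ = 3, rows_ = {2, 5} (sorted), so grad_ holds 2 x 3 entries.
// i = 7  ->  dense row 7 / 3 = 2  ->  BinarySearchInRows(2) == 0,
//            so g = grad_[0 * 3 + 7 % 3] = grad_[1].
// i = 4  ->  dense row 1 is not in rows_  ->  row_idx == -1, g == 0,
//            and operator() below reduces to a decay-only Adam step.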
inline HOSTDEVICE void operator()(size_t i) const { inline HOSTDEVICE void operator()(size_t i) const {
int64_t row = i / row_numel_;
auto row_idx = BinarySearchInRows(row);
T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
// The following code is the same as dense
T mom1 = moment1_[i];
T mom2 = moment2_[i];
T lr = *lr_;
T beta1_pow = *beta1_pow_; T beta1_pow = *beta1_pow_;
T beta2_pow = *beta2_pow_; T beta2_pow = *beta2_pow_;
for (int64_t j = 0; j < row_numel_; ++j) { T p = param_[i];
T g = grad_[i * row_numel_ + j];
T mom1 = moment1_[rows_[i] * row_numel_ + j]; // Calculation
T mom2 = moment2_[rows_[i] * row_numel_ + j]; lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);
T lr = *lr_;
T p = param_[rows_[i] * row_numel_ + j]; mom1 = beta1_ * mom1 + (1 - beta1_) * g;
mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
mom1 = beta1_ * mom1 + (1 - beta1_) * g; // Write back to global memory
mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; moment1_out_[i] = mom1;
p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); moment2_out_[i] = mom2;
param_out_[i] = p;
moment1_out_[rows_[i] * row_numel_ + j] = mom1;
moment2_out_[rows_[i] * row_numel_ + j] = mom2;
param_out_[rows_[i] * row_numel_ + j] = p;
} // for col id
} }
}; };
...@@ -287,9 +307,14 @@ class AdamOpKernel : public framework::OpKernel<T> { ...@@ -287,9 +307,14 @@ class AdamOpKernel : public framework::OpKernel<T> {
return; return;
} }
// merge duplicated rows if any. // merge duplicated rows if any.
// The rows of grad_merge have been sorted inside MergeAdd functor
scatter::MergeAdd<DeviceContext, T> merge_func; scatter::MergeAdd<DeviceContext, T> merge_func;
auto grad_merge = auto& grad_merge = *(ctx.scope()
merge_func(ctx.template device_context<DeviceContext>(), grad); .NewScope()
.Var("sparse_adam_grad_merge")
->GetMutable<framework::SelectedRows>());
merge_func(ctx.template device_context<DeviceContext>(), grad,
&grad_merge);
auto& grad_tensor = grad_merge.value(); auto& grad_tensor = grad_merge.value();
const T* grad_data = grad_tensor.template data<T>(); const T* grad_data = grad_tensor.template data<T>();
int64_t* rows = nullptr; int64_t* rows = nullptr;
...@@ -314,10 +339,11 @@ class AdamOpKernel : public framework::OpKernel<T> { ...@@ -314,10 +339,11 @@ class AdamOpKernel : public framework::OpKernel<T> {
mom2.template data<T>(), mom2.template data<T>(),
mom2_out.template mutable_data<T>(ctx.GetPlace()), mom2_out.template mutable_data<T>(ctx.GetPlace()),
lr.template data<T>(), grad_data, param.template data<T>(), lr.template data<T>(), grad_data, param.template data<T>(),
param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel); param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel,
grad_merge.rows().size());
platform::ForRange<DeviceContext> for_range( platform::ForRange<DeviceContext> for_range(
static_cast<const DeviceContext&>(ctx.device_context()), static_cast<const DeviceContext&>(ctx.device_context()),
grad_merge.rows().size()); param.numel());
for_range(functor); for_range(functor);
} else { } else {
PADDLE_THROW("Variable type not supported by adam_op"); PADDLE_THROW("Variable type not supported by adam_op");
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/transform.h" #include "paddle/fluid/platform/transform.h"
namespace paddle { namespace paddle {
...@@ -61,14 +62,32 @@ class ClipKernel : public framework::OpKernel<T> { ...@@ -61,14 +62,32 @@ class ClipKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto max = context.Attr<T>("max"); auto max = context.Attr<T>("max");
auto min = context.Attr<T>("min"); auto min = context.Attr<T>("min");
auto* x = context.Input<Tensor>("X"); auto* x_var = context.InputVar("X");
auto* out = context.Output<Tensor>("Out"); if (x_var->IsType<framework::LoDTensor>()) {
T* out_data = out->mutable_data<T>(context.GetPlace()); auto* x = context.Input<framework::LoDTensor>("X");
const T* x_data = x->data<T>(); auto* out = context.Output<framework::LoDTensor>("Out");
int64_t numel = x->numel(); T* out_data = out->mutable_data<T>(context.GetPlace());
Transform<DeviceContext> trans; const T* x_data = x->data<T>();
trans(context.template device_context<DeviceContext>(), x_data, int64_t numel = x->numel();
x_data + numel, out_data, ClipFunctor<T>(min, max)); Transform<DeviceContext> trans;
trans(context.template device_context<DeviceContext>(), x_data,
x_data + numel, out_data, ClipFunctor<T>(min, max));
} else if (x_var->IsType<framework::SelectedRows>()) {
auto* x = context.Input<framework::SelectedRows>("X");
auto* out = context.Output<framework::SelectedRows>("Out");
PADDLE_ENFORCE_NE(x, out,
"Inplace clip is not allowed when x is SelectedRows");
math::scatter::MergeAdd<DeviceContext, T> merge_func;
merge_func(context.template device_context<DeviceContext>(), *x, out);
auto* out_tensor = out->mutable_value();
auto* out_data = out_tensor->data<T>();
int64_t numel = out_tensor->numel();
Transform<DeviceContext> trans;
trans(context.template device_context<DeviceContext>(), out_data,
out_data + numel, out_data, ClipFunctor<T>(min, max));
} else {
PADDLE_THROW("ClipOp only supports LoDTensor and SelectedRows");
}
} }
}; };
...@@ -78,10 +97,12 @@ class ClipGradKernel : public framework::OpKernel<T> { ...@@ -78,10 +97,12 @@ class ClipGradKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto max = context.Attr<T>("max"); auto max = context.Attr<T>("max");
auto min = context.Attr<T>("min"); auto min = context.Attr<T>("min");
auto* d_out = context.Input<Tensor>(framework::GradVarName("Out")); auto* d_out =
auto* d_x = context.Output<Tensor>(framework::GradVarName("X")); context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
auto* d_x =
context.Output<framework::LoDTensor>(framework::GradVarName("X"));
if (d_x != nullptr) { if (d_x != nullptr) {
auto* x = context.Input<Tensor>("X"); auto* x = context.Input<framework::LoDTensor>("X");
int64_t numel = d_out->numel(); int64_t numel = d_out->numel();
auto* d_x_data = d_x->mutable_data<T>(context.GetPlace()); auto* d_x_data = d_x->mutable_data<T>(context.GetPlace());
const T* d_out_data = d_out->data<T>(); const T* d_out_data = d_out->data<T>();
......
...@@ -31,5 +31,6 @@ polygon_box_transform_op.cu) ...@@ -31,5 +31,6 @@ polygon_box_transform_op.cu)
detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc) detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc)
detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc) detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc)
detection_library(generate_proposals_op SRCS generate_proposals_op.cc) detection_library(generate_proposals_op SRCS generate_proposals_op.cc)
detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu)
#Export local libraries to parent #Export local libraries to parent
set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE) set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
static constexpr int kROISize = 4;
template <typename T>
bool GT_E(T a, T b) {
return (a > b) || fabs(a - b) < 1e-4;
}
template <typename T>
bool LT_E(T a, T b) {
return (a < b) || fabs(a - b) < 1e-4;
}
template <typename T>
bool GT(T a, T b) {
return (a - b) > 1e-4;
}
/*
* Check if (x, y) is inside the boundary of the RoI.
*/
template <typename T>
bool in_quad(T x, T y, T roi_x[], T roi_y[]) {
for (int i = 0; i < 4; i++) {
T xs = roi_x[i];
T ys = roi_y[i];
T xe = roi_x[(i + 1) % 4];
T ye = roi_y[(i + 1) % 4];
if (fabs(ys - ye) < 1e-4) {
if (fabs(y - ys) < 1e-4 && fabs(y - ye) < 1e-4 &&
GT_E<T>(x, std::min(xs, xe)) && LT_E<T>(x, std::max(xs, xe))) {
return true;
}
} else {
T intersec_x = (y - ys) * (xe - xs) / (ye - ys) + xs;
if (fabs(intersec_x - x) < 1e-4 && GT_E<T>(y, std::min(ys, ye)) &&
LT_E<T>(y, std::max(ys, ye))) {
return true;
}
}
}
int n_cross = 0;
for (int i = 0; i < 4; i++) {
T xs = roi_x[i];
T ys = roi_y[i];
T xe = roi_x[(i + 1) % 4];
T ye = roi_y[(i + 1) % 4];
if (fabs(ys - ye) < 1e-4) {
continue;
}
if (LT_E<T>(y, std::min(ys, ye)) || GT<T>(y, std::max(ys, ye))) {
continue;
}
T intersec_x = (y - ys) * (xe - xs) / (ye - ys) + xs;
if (fabs(intersec_x - x) < 1e-4) {
return true;
}
if (GT<T>(intersec_x, x)) {
n_cross++;
}
}
return (n_cross % 2 == 1);
}
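A brief worked example of the crossing test, with made-up coordinates (not part of the op):

// For the axis-aligned quad roi_x = {0, 4, 4, 0}, roi_y = {0, 0, 4, 4}:
//   point (2, 2): the two horizontal edges are skipped (ys == ye); the edge
//                 x == 4 crosses the rightward ray (n_cross = 1, odd),
//                 the edge x == 0 does not, so the point is inside.
//   point (5, 2): no edge lies to its right, n_cross = 0, so it is outside.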
/**
* Get the matrix of perspective transform.
*
* dx1 = x1 - x2
* dx2 = x3 - x2
* dx3 = x0 - x1 + x2 - x3
* dy1 = y1 - y2
* dy2 = y3 - y2
* dy3 = y0 - y1 + y2 - y3
*
* a11 = (x1 - x0 + a31 * (w - 1) * x1) / (w - 1)
* a12 = (x3 - x0 + a32 * (h - 1) * x3) / (h - 1)
* a13 = x0
* a21 = (y1 - y0 + a31 * (w - 1) * y1) / (w - 1)
* a22 = (y3 - y0 + a32 * (h - 1) * y3) / (h - 1)
* a23 = y0
* a31 = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) / (w - 1)
* a32 = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) / (h - 1)
* a33 = 1
*
*/
template <typename T>
void get_transform_matrix(const int transformed_width,
const int transformed_height, T roi_x[], T roi_y[],
T matrix[]) {
T x0 = roi_x[0];
T x1 = roi_x[1];
T x2 = roi_x[2];
T x3 = roi_x[3];
T y0 = roi_y[0];
T y1 = roi_y[1];
T y2 = roi_y[2];
T y3 = roi_y[3];
// Estimate the height and width of RoI
T len1 = sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1));
T len2 = sqrt((x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2));
T len3 = sqrt((x2 - x3) * (x2 - x3) + (y2 - y3) * (y2 - y3));
T len4 = sqrt((x3 - x0) * (x3 - x0) + (y3 - y0) * (y3 - y0));
T estimated_height = (len2 + len4) / 2.0;
T estimated_width = (len1 + len3) / 2.0;
// Get the normalized height and normalized width
int normalized_height = transformed_height;
int normalized_width =
std::round(estimated_width * (normalized_height - 1) / estimated_height) +
1;
normalized_width = std::min(normalized_width, transformed_width);
T dx1 = x1 - x2;
T dx2 = x3 - x2;
T dx3 = x0 - x1 + x2 - x3;
T dy1 = y1 - y2;
T dy2 = y3 - y2;
T dy3 = y0 - y1 + y2 - y3;
matrix[6] = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) /
(normalized_width - 1);
matrix[7] = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) /
(normalized_height - 1);
matrix[8] = 1;
matrix[3] = (y1 - y0 + matrix[6] * (normalized_width - 1) * y1) /
(normalized_width - 1);
matrix[4] = (y3 - y0 + matrix[7] * (normalized_height - 1) * y3) /
(normalized_height - 1);
matrix[5] = y0;
matrix[0] = (x1 - x0 + matrix[6] * (normalized_width - 1) * x1) /
(normalized_width - 1);
matrix[1] = (x3 - x0 + matrix[7] * (normalized_height - 1) * x3) /
(normalized_height - 1);
matrix[2] = x0;
}
/**
* Get the source coordinates in the input feature map.
*
* (u, v, w)^T = matrix * (out_w, out_h, 1)^T
*
* in_w = u / w
* in_h = v / w
*
*/
template <typename T>
void get_source_coords(T matrix[], int out_w, int out_h, T* in_w, T* in_h) {
T u = matrix[0] * out_w + matrix[1] * out_h + matrix[2];
T v = matrix[3] * out_w + matrix[4] * out_h + matrix[5];
T w = matrix[6] * out_w + matrix[7] * out_h + matrix[8];
in_w[0] = u / w;
in_h[0] = v / w;
}
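As an illustrative sanity check on the two routines above (values made up, not part of the op):

// T rx[4] = {0, 4, 4, 0}, ry[4] = {0, 0, 4, 4};  // axis-aligned 5x5 RoI
// T m[9];
// get_transform_matrix<T>(5, 5, rx, ry, m);
// // Here dx3 = dy3 = 0, so m[6] = m[7] = 0 and m is the 3x3 identity.
// T iw, ih;
// get_source_coords<T>(m, 2, 3, &iw, &ih);  // yields iw == 2, ih == 3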
/**
* Perform bilinear interpolation in the input feature map.
*/
template <typename T>
void bilinear_interpolate(const T* in_data, const int channels, const int width,
const int height, int in_n, int in_c, T in_w, T in_h,
T* val) {
// Deal with cases where the source coords fall outside the feature map boundary
if (GT<T>(-0.5, in_w) || GT<T>(in_w, width - 0.5) || GT<T>(-0.5, in_h) ||
GT<T>(in_h, height - 0.5)) {
// empty
val[0] = 0.0;
return;
}
if (GT<T>(0, in_w)) {
in_w = 0;
}
if (GT<T>(0, in_h)) {
in_h = 0;
}
int in_w_floor = floor(in_w);
int in_h_floor = floor(in_h);
int in_w_ceil;
int in_h_ceil;
if (GT_E<T>(in_w_floor, width - 1)) {
in_w_ceil = in_w_floor = width - 1;
in_w = static_cast<T>(in_w_floor);
} else {
in_w_ceil = in_w_floor + 1;
}
if (GT_E<T>(in_h_floor, height - 1)) {
in_h_ceil = in_h_floor = height - 1;
in_h = static_cast<T>(in_h_floor);
} else {
in_h_ceil = in_h_floor + 1;
}
T w_floor = in_w - in_w_floor;
T h_floor = in_h - in_h_floor;
T w_ceil = 1 - w_floor;
T h_ceil = 1 - h_floor;
const T* data = in_data + (in_n * channels + in_c) * height * width;
// Do bilinear interpolation
T v1 = data[in_h_floor * width + in_w_floor];
T v2 = data[in_h_ceil * width + in_w_floor];
T v3 = data[in_h_ceil * width + in_w_ceil];
T v4 = data[in_h_floor * width + in_w_ceil];
T w1 = w_ceil * h_ceil;
T w2 = w_ceil * h_floor;
T w3 = w_floor * h_floor;
T w4 = w_floor * h_ceil;
val[0] = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
}
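A quick numeric illustration of the interpolation weights (made-up fractional coordinates):

// in_w = 1.25, in_h = 2.5  ->  floors (1, 2), ceils (2, 3),
// w_floor = 0.25, h_floor = 0.5, w_ceil = 0.75, h_ceil = 0.5,
// so w1 = 0.375 for pixel (h=2, w=1), w2 = 0.375 for (3, 1),
// w3 = 0.125 for (3, 2), w4 = 0.125 for (2, 2); the weights sum to 1.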
template <typename T>
class CPUROIPerspectiveTransformOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X");
auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
auto* out = ctx.Output<framework::Tensor>("Out");
auto transformed_height = ctx.Attr<int>("transformed_height");
auto transformed_width = ctx.Attr<int>("transformed_width");
auto spatial_scale = ctx.Attr<float>("spatial_scale");
auto in_dims = in->dims();
int channels = in_dims[1];
int in_height = in_dims[2];
int in_width = in_dims[3];
int rois_num = rois->dims()[0];
const T* input_data = in->data<T>();
framework::Tensor roi2image;
roi2image.Resize({rois_num});
int* roi2image_data = roi2image.mutable_data<int>(ctx.GetPlace());
auto lod = rois->lod().back();
for (int i = 0; i < lod.size() - 1; ++i) {
for (int j = lod[i]; j < lod[i + 1]; ++j) {
roi2image_data[j] = i;
}
}
T* output_data = out->mutable_data<T>(ctx.GetPlace());
const T* rois_data = rois->data<T>();
for (int n = 0; n < rois_num; ++n) {
const T* n_rois = rois_data + n * 8;
T roi_x[4];
T roi_y[4];
for (int k = 0; k < 4; ++k) {
roi_x[k] = n_rois[2 * k] * spatial_scale;
roi_y[k] = n_rois[2 * k + 1] * spatial_scale;
}
int image_id = roi2image_data[n];
// Get transform matrix
T transform_matrix[9];
get_transform_matrix<T>(transformed_width, transformed_height, roi_x,
roi_y, transform_matrix);
for (int c = 0; c < channels; ++c) {
for (int out_h = 0; out_h < transformed_height; ++out_h) {
for (int out_w = 0; out_w < transformed_width; ++out_w) {
int out_index =
n * channels * transformed_height * transformed_width +
c * transformed_height * transformed_width +
out_h * transformed_width + out_w;
T in_w, in_h;
get_source_coords<T>(transform_matrix, out_w, out_h, &in_w, &in_h);
if (in_quad<T>(in_w, in_h, roi_x, roi_y)) {
if (GT<T>(-0.5, in_w) ||
GT<T>(in_w, static_cast<T>(in_width - 0.5)) ||
GT<T>(-0.5, in_h) ||
GT<T>(in_h, static_cast<T>(in_height - 0.5))) {
output_data[out_index] = 0.0;
} else {
bilinear_interpolate(input_data, channels, in_width, in_height,
image_id, c, in_w, in_h,
output_data + out_index);
}
} else {
output_data[out_index] = 0.0;
}
}
}
}
}
}
};
template <typename T>
T get_feature_gradient(T xs, T ys, int w, int h, const int width,
const int height) {
if (GT<T>(-0.5, xs) || GT<T>(xs, width - 0.5) || GT<T>(-0.5, ys) ||
GT<T>(ys, height - 0.5)) {
return 0;
}
if (GT<T>(0, xs)) {
xs = 0;
}
if (GT<T>(0, ys)) {
ys = 0;
}
int xs_floor = floor(xs);
int ys_floor = floor(ys);
int xs_ceil;
int ys_ceil;
if (GT_E(xs_floor, width - 1)) {
xs_ceil = xs_floor = width - 1;
xs = static_cast<T>(xs_floor);
} else {
xs_ceil = xs_floor + 1;
}
if (GT_E(ys_floor, height - 1)) {
ys_ceil = ys_floor = height - 1;
ys = static_cast<T>(ys_floor);
} else {
ys_ceil = ys_floor + 1;
}
T weight = 0;
if (w == xs_floor) {
if (h == ys_floor) {
weight = (w + 1 - xs) * (h + 1 - ys);
} else if (h == ys_ceil) {
weight = (w + 1 - xs) * (ys + 1 - h);
}
} else if (w == xs_ceil) {
if (h == ys_floor) {
weight = (xs + 1 - w) * (h + 1 - ys);
} else if (h == ys_ceil) {
weight = (xs + 1 - w) * (ys + 1 - h);
}
}
return weight;
}
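Continuing the same made-up example, the backward weight mirrors the forward interpolation:

// For xs = 1.25, ys = 2.5, the neighbour (w, h) = (1, 2), i.e. (xs_floor,
// ys_floor), gets weight (w + 1 - xs) * (h + 1 - ys) = 0.75 * 0.5 = 0.375,
// matching w1 in bilinear_interpolate above.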
template <typename T>
class CPUROIPerspectiveTransformGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X");
auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
auto* out_grad =
ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
auto transformed_height = ctx.Attr<int>("transformed_height");
auto transformed_width = ctx.Attr<int>("transformed_width");
auto spatial_scale = ctx.Attr<float>("spatial_scale");
auto in_dims = in->dims();
int batch_size = in_dims[0];
int channels = in_dims[1];
int in_height = in_dims[2];
int in_width = in_dims[3];
int rois_num = rois->dims()[0];
T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
const T* out_grad_data = out_grad->data<T>();
const T* rois_data = rois->data<T>();
framework::Tensor roi2image;
roi2image.Resize({rois_num});
int* roi2image_data = roi2image.mutable_data<int>(ctx.GetPlace());
auto lod = rois->lod().back();
for (int i = 0; i < lod.size() - 1; ++i) {
for (int j = lod[i]; j < lod[i + 1]; ++j) {
roi2image_data[j] = i;
}
}
for (int n = 0; n < batch_size; ++n) {
for (int c = 0; c < channels; ++c) {
for (int in_h = 0; in_h < in_height; ++in_h) {
for (int in_w = 0; in_w < in_width; ++in_w) {
T gradient = 0.0;
for (int roi_idx = lod[n]; roi_idx < lod[n + 1]; ++roi_idx) {
const T* rois = rois_data + roi_idx * 8;
T roi_x[4];
T roi_y[4];
for (int k = 0; k < 4; ++k) {
roi_x[k] = rois[2 * k] * spatial_scale;
roi_y[k] = rois[2 * k + 1] * spatial_scale;
}
// Get transform matrix
T matrix[9];
get_transform_matrix<T>(transformed_width, transformed_height,
roi_x, roi_y, matrix);
const T* out_grad_ptr = out_grad_data +
(roi_idx * channels + c) *
transformed_height *
transformed_width;
for (int out_h = 0; out_h < transformed_height; ++out_h) {
for (int out_w = 0; out_w < transformed_width; ++out_w) {
T src_w;
T src_h;
get_source_coords<T>(matrix, out_w, out_h, &src_w, &src_h);
if (in_quad<T>(src_w, src_h, roi_x, roi_y)) {
if (GT<T>(-0.5, src_w) ||
GT<T>(src_w, static_cast<T>(in_width - 0.5)) ||
GT<T>(-0.5, src_h) ||
GT<T>(src_h, static_cast<T>(in_height - 0.5))) {
continue;
}
T weight = get_feature_gradient<T>(src_w, src_h, in_w, in_h,
in_width, in_height);
gradient +=
out_grad_ptr[out_h * transformed_width + out_w] *
weight;
}
}
}
}
int out_idx = (n * channels + c) * in_height * in_width +
in_h * in_width + in_w;
in_grad_data[out_idx] = gradient;
}
}
}
}
}
};
class ROIPerspectiveTransformOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of ROIPerspectiveTransformOp should not be null.");
PADDLE_ENFORCE(
ctx->HasInput("ROIs"),
"Input(ROIs) of ROIPerspectiveTransformOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("Out"),
"Output(Out) of ROIPerspectiveTransformOp should not be null.");
auto input_dims = ctx->GetInputDim("X");
auto rois_dims = ctx->GetInputDim("ROIs");
PADDLE_ENFORCE(input_dims.size() == 4,
"The format of input tensor is NCHW.");
PADDLE_ENFORCE(rois_dims.size() == 2,
"ROIs should be a 2-D LoDTensor of shape (num_rois, 8)"
"given as [[x0, y0, x1, y1, x2, y2, x3, y3], ...]");
PADDLE_ENFORCE(rois_dims[1] == 8,
"ROIs should be a 2-D LoDTensor of shape (num_rois, 8)"
"given as [[x0, y0, x1, y1, x2, y2, x3, y3], ...].");
int transformed_height = ctx->Attrs().Get<int>("transformed_height");
int transformed_width = ctx->Attrs().Get<int>("transformed_width");
float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
PADDLE_ENFORCE_GT(transformed_height, 0,
"The transformed output height must greater than 0");
PADDLE_ENFORCE_GT(transformed_width, 0,
"The transformed output width must greater than 0");
PADDLE_ENFORCE_GT(spatial_scale, 0.0f,
"The spatial scale must greater than 0");
std::vector<int64_t> out_dims_v({rois_dims[0], // num_rois
input_dims[1], // channels
static_cast<int64_t>(transformed_height),
static_cast<int64_t>(transformed_width)});
auto out_dims = framework::make_ddim(out_dims_v);
ctx->SetOutputDim("Out", out_dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
ctx.device_context());
}
};
class ROIPerspectiveTransformGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"The gradient of Out should not be null.");
PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")),
"The gradient of X should not be null.");
ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X"));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
ctx.device_context());
}
};
class ROIPerspectiveTransformOpMaker
: public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(Tensor), "
"the input of ROIPerspectiveTransformOp. "
"The format of input tensor is NCHW. Where N is batch size, "
"C is the number of input channels, "
"H is the height of the feature, and "
"W is the width of the feature.");
AddInput("ROIs",
"(LoDTensor), "
"ROIs (Regions of Interest) to be transformed. "
"should be a 2-D LoDTensor of shape (num_rois, 8)"
"given as [[x1, y1, x2, y2, x3, y3, x4, y4], ...]."
"(x1, y1) is the top left coordinates, and "
"(x2, y2) is the top right coordinates, and"
"(x3, y3) is the bottom right coordinates, and"
"(x4, y4) is the bottom left coordinates.");
AddOutput(
"Out",
"(Tensor), "
"The output of ROIPerspectiveTransformOp is a 4-D tensor with shape "
"(num_rois, channels, transformed_h, transformed_w).");
AddAttr<float>("spatial_scale",
"(float, default 1.0), "
"Spatial scale factor to scale ROI coords.")
.SetDefault(1.0);
AddAttr<int>("transformed_height",
"(int, default 1), "
"The height of transformed output.")
.SetDefault(1);
AddAttr<int>("transformed_width",
"(int, default 1), "
"The width of transformed output.")
.SetDefault(1);
AddComment(R"DOC(
**ROIPerspectiveTransform Operator**
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(roi_perspective_transform, ops::ROIPerspectiveTransformOp,
ops::ROIPerspectiveTransformOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(roi_perspective_transform_grad,
ops::ROIPerspectiveTransformGradOp);
REGISTER_OP_CPU_KERNEL(roi_perspective_transform,
ops::CPUROIPerspectiveTransformOpKernel<float>);
REGISTER_OP_CPU_KERNEL(roi_perspective_transform_grad,
ops::CPUROIPerspectiveTransformGradOpKernel<float>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle {
namespace operators {
// CUDA: index helpers
#define idx4_4(index, d1, d2, d3, d4) (index % d4)
#define idx4_3(index, d1, d2, d3, d4) ((index / d4) % d3)
#define idx4_2(index, d1, d2, d3, d4) ((index / d4 / d3) % d2)
#define idx4_1(index, d1, d2, d3, d4) ((index / d4 / d3 / d2) % d1)
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
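// Example: for a contiguous 4-D tensor with dims (d1, d2, d3, d4) =
// (2, 3, 4, 5) and flat index 37, the macros above recover the coordinates
//   idx4_4 -> 37 % 5               = 2  (innermost, w)
//   idx4_3 -> (37 / 5) % 4         = 3  (h)
//   idx4_2 -> (37 / 5 / 4) % 3     = 1  (c)
//   idx4_1 -> (37 / 5 / 4 / 3) % 2 = 0  (outermost, n)
// CUDA_1D_KERNEL_LOOP is the usual grid-stride loop, so one fixed launch
// configuration can cover any number of elements n.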
template <typename T>
__device__ bool GT_E(T a, T b) {
return (a > b) || fabs(a - b) < 1e-4;
}
template <typename T>
__device__ bool LT_E(T a, T b) {
return (a < b) || fabs(a - b) < 1e-4;
}
template <typename T>
__device__ bool GT(T a, T b) {
return (a - b) > 1e-4;
}
template <typename T>
__device__ T max(T a, T b) {
return a > b ? a : b;
}
template <typename T>
__device__ T min(T a, T b) {
return a < b ? a : b;
}
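// Note: GT_E, LT_E and GT treat values within 1e-4 of each other as equal,
// so the geometric tests below tolerate small floating-point error. For
// instance, GT_E(0.99995f, 1.0f) holds while GT(1.00005f, 1.0f) does not.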
/*
 * Check whether (x, y) lies inside the quadrilateral ROI (boundary included).
 */
template <typename T>
__device__ bool in_quad(T x, T y, T roi_x[], T roi_y[]) {
for (int i = 0; i < 4; i++) {
T start_w = roi_x[i];
T start_h = roi_y[i];
T end_w = roi_x[(i + 1) % 4];
T end_h = roi_y[(i + 1) % 4];
if (fabs(start_h - end_h) < 1e-4) {
if (fabs(y - start_h) < 1e-4 && fabs(y - end_h) < 1e-4 &&
GT_E<T>(x, min<T>(start_w, end_w)) &&
LT_E<T>(x, max<T>(start_w, end_w))) {
return true;
}
} else {
T intersec_x =
(y - start_h) * (end_w - start_w) / (end_h - start_h) + start_w;
if (fabs(intersec_x - x) < 1e-4 && GT_E(y, min<T>(start_h, end_h)) &&
LT_E<T>(y, max<T>(start_h, end_h))) {
return true;
}
}
}
int n_cross = 0;
for (int i = 0; i < 4; i++) {
T start_w = roi_x[i];
T start_h = roi_y[i];
T end_w = roi_x[(i + 1) % 4];
T end_h = roi_y[(i + 1) % 4];
if (fabs(start_h - end_h) < 1e-4) {
continue;
}
if (LT_E<T>(y, min<T>(start_h, end_h)) ||
GT<T>(y, max<T>(start_h, end_h))) {
continue;
}
T intersec_x =
(y - start_h) * (end_w - start_w) / (end_h - start_h) + start_w;
if (fabs(intersec_x - x) < 1e-4) {
return true;
}
if (GT<T>(intersec_x, x)) {
n_cross++;
}
}
return (n_cross % 2 == 1);
}
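// Illustration: in_quad first reports true if (x, y) lies on one of the four
// edges, then falls back to a crossing-number test: a ray from (x, y) towards
// +x crosses the non-horizontal edges an odd number of times iff the point is
// inside. E.g. for the unit square (0,0)-(1,0)-(1,1)-(0,1), the ray from
// (0.5, 0.5) crosses only the edge x = 1, so n_cross == 1 and the point is
// inside, whereas the ray from (2, 0.5) has no edge to its right and the
// point is outside.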
/**
* Perform bilinear interpolation in the input feature map.
*/
template <typename T>
__device__ void bilinear_interpolate(const T* in_data, const int channels,
const int width, const int height,
int in_n, int in_c, T in_w, T in_h,
T* val) {
// Deal with cases where the source coords fall outside the feature map boundary
if (GT<T>(-0.5, in_w) || GT<T>(in_w, width - 0.5) || GT<T>(-0.5, in_h) ||
GT<T>(in_h, height - 0.5)) {
val[0] = 0.0;
return;
}
if (GT<T>(0, in_w)) {
in_w = 0;
}
if (GT<T>(0, in_h)) {
in_h = 0;
}
int in_w_floor = floor(in_w);
int in_h_floor = floor(in_h);
int in_w_ceil;
int in_h_ceil;
if (GT_E<T>(in_w_floor, width - 1)) {
in_w_ceil = in_w_floor = width - 1;
in_w = static_cast<T>(in_w_floor);
} else {
in_w_ceil = in_w_floor + 1;
}
if (GT_E<T>(in_h_floor, height - 1)) {
in_h_ceil = in_h_floor = height - 1;
in_h = static_cast<T>(in_h_floor);
} else {
in_h_ceil = in_h_floor + 1;
}
T w_floor = in_w - in_w_floor;
T h_floor = in_h - in_h_floor;
T w_ceil = 1 - w_floor;
T h_ceil = 1 - h_floor;
const T* data = in_data + (in_n * channels + in_c) * height * width;
// Do bilinear interpolation
T v1 = data[in_h_floor * width + in_w_floor];
T v2 = data[in_h_ceil * width + in_w_floor];
T v3 = data[in_h_ceil * width + in_w_ceil];
T v4 = data[in_h_floor * width + in_w_ceil];
T w1 = w_ceil * h_ceil;
T w2 = w_ceil * h_floor;
T w3 = w_floor * h_floor;
T w4 = w_floor * h_ceil;
val[0] = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
}
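// Illustration: with fx = in_w - floor(in_w) and fy = in_h - floor(in_h),
// the result is the standard bilinear blend
//   val = (1-fx)(1-fy)*v1 + (1-fx)*fy*v2 + fx*fy*v3 + fx*(1-fy)*v4,
// where v1..v4 are the floor/ceil neighbours defined above. For example, if
// v1 = 1, v2 = 2, v3 = 3, v4 = 4 and fx = fy = 0.5, then
// val = 0.25 * (1 + 2 + 3 + 4) = 2.5.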
/**
* Get the source coordinates in the input feature map.
*
 * (u, v, w)' = matrix * (out_w, out_h, 1)'   (as column vectors)
*
* in_w = u / w
* in_h = v / w
*
*/
template <typename T>
__device__ void get_source_coords(T matrix[], int out_w, int out_h, T* in_w,
T* in_h) {
T u = matrix[0] * out_w + matrix[1] * out_h + matrix[2];
T v = matrix[3] * out_w + matrix[4] * out_h + matrix[5];
T w = matrix[6] * out_w + matrix[7] * out_h + matrix[8];
in_w[0] = u / w;
in_h[0] = v / w;
}
/**
* Get the matrix of perspective transform.
*
* dx1 = x1 - x2
* dx2 = x3 - x2
* dx3 = x0 - x1 + x2 - x3
* dy1 = y1 - y2
* dy2 = y3 - y2
* dy3 = y0 - y1 + y2 - y3
*
* a11 = (x1 - x0 + a31 * (w - 1) * x1) / (w - 1)
* a12 = (x3 - x0 + a32 * (h - 1) * x3) / (h - 1)
* a13 = x0
* a21 = (y1 - y0 + a31 * (w - 1) * y1) / (w - 1)
* a22 = (y3 - y0 + a32 * (h - 1) * y3) / (h - 1)
* a23 = y0
* a31 = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) / (w - 1)
* a32 = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) / (h - 1)
* a33 = 1
*
*/
template <typename T>
__device__ void get_transform_matrix(const int transformed_width,
const int transformed_height, T roi_x[],
T roi_y[], T matrix[]) {
T x0 = roi_x[0];
T x1 = roi_x[1];
T x2 = roi_x[2];
T x3 = roi_x[3];
T y0 = roi_y[0];
T y1 = roi_y[1];
T y2 = roi_y[2];
T y3 = roi_y[3];
// Estimate the height and width of RoI
T len1 = sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1));
T len2 = sqrt((x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2));
T len3 = sqrt((x2 - x3) * (x2 - x3) + (y2 - y3) * (y2 - y3));
T len4 = sqrt((x3 - x0) * (x3 - x0) + (y3 - y0) * (y3 - y0));
T estimated_height = (len2 + len4) / 2.0;
T estimated_width = (len1 + len3) / 2.0;
// Get the normalized height and normalized width
int normalized_height = transformed_height;
int normalized_width =
round(estimated_width * (normalized_height - 1) / estimated_height) + 1;
normalized_width = min(normalized_width, transformed_width);
T dx1 = x1 - x2;
T dx2 = x3 - x2;
T dx3 = x0 - x1 + x2 - x3;
T dy1 = y1 - y2;
T dy2 = y3 - y2;
T dy3 = y0 - y1 + y2 - y3;
matrix[6] = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) /
(normalized_width - 1);
matrix[7] = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) /
(normalized_height - 1);
matrix[8] = 1;
matrix[3] = (y1 - y0 + matrix[6] * (normalized_width - 1) * y1) /
(normalized_width - 1);
matrix[4] = (y3 - y0 + matrix[7] * (normalized_height - 1) * y3) /
(normalized_height - 1);
matrix[5] = y0;
matrix[0] = (x1 - x0 + matrix[6] * (normalized_width - 1) * x1) /
(normalized_width - 1);
matrix[1] = (x3 - x0 + matrix[7] * (normalized_height - 1) * x3) /
(normalized_height - 1);
matrix[2] = x0;
}
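// Sanity check (illustrative): the matrix maps output coordinates
// (out_w, out_h) in [0, w-1] x [0, h-1] onto the ROI corners:
// (0, 0) -> (x0, y0), (w-1, 0) -> (x1, y1), (w-1, h-1) -> (x2, y2) and
// (0, h-1) -> (x3, y3), where w and h are the normalized width and height.
// For an axis-aligned ROI with corners (0, 0), (W, 0), (W, H), (0, H) the
// cross terms vanish (dx3 = dy3 = 0), so the mapping degenerates to a pure
// scaling: in_w = W / (w - 1) * out_w and in_h = H / (h - 1) * out_h.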
template <typename T>
__global__ void RoiTransformKernel(const float* input_data,
const float* rois_data,
const int* roi2image_data, int num_rois,
int in_height, int in_width, int channels,
int transformed_height,
int transformed_width, float spatial_scale,
T* output_data) {
int output_size =
num_rois * transformed_height * transformed_width * channels;
CUDA_1D_KERNEL_LOOP(index, output_size) {
// (n, c, out_h, out_w) is an element in the transformed output
int out_w = idx4_4(index, num_rois, channels, transformed_height,
transformed_width);
int out_h = idx4_3(index, num_rois, channels, transformed_height,
transformed_width);
int c = idx4_2(index, num_rois, channels, transformed_height,
transformed_width);
int n = idx4_1(index, num_rois, channels, transformed_height,
transformed_width);
auto bottom_rois = rois_data + n * 8;
int roi_batch_ind = bottom_rois[0];
T roi_x[4];
T roi_y[4];
for (int k = 0; k < 4; ++k) {
roi_x[k] = bottom_rois[2 * k] * spatial_scale;
roi_y[k] = bottom_rois[2 * k + 1] * spatial_scale;
}
// Get transform matrix
T matrix[9];
get_transform_matrix<T>(transformed_width, transformed_height, roi_x, roi_y,
matrix);
// Get source coords
T in_w;
T in_h;
get_source_coords<T>(matrix, out_w, out_h, &in_w, &in_h);
if (in_quad<T>(in_w, in_h, roi_x, roi_y)) {
if (GT<T>(-0.5, in_w) || GT<T>(in_w, static_cast<T>(in_width - 0.5)) ||
GT<T>(-0.5, in_h) || GT<T>(in_h, static_cast<T>(in_height - 0.5))) {
// Skip if the source coords are not inside the input image
output_data[index] = 0.0;
} else {
// Perform bilinear interpolation
int in_n = roi2image_data[n];
bilinear_interpolate<T>(input_data, channels, in_width, in_height, in_n,
c, in_w, in_h, output_data + index);
}
} else {
// Skip if the source coords are not inside the quad
output_data[index] = 0.0;
}
}
}
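// Each thread writes one output element: the flat index is decomposed into
// (n, c, out_h, out_w), the per-ROI homography is rebuilt from the 8 corner
// values, the output pixel is mapped back to a source location, and the value
// is bilinearly interpolated if that location lies inside both the ROI quad
// and the image, or set to zero otherwise.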
template <typename T>
class CUDAROIPerspectiveTransformOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X");
auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
auto* out = ctx.Output<framework::Tensor>("Out");
auto transformed_height = ctx.Attr<int>("transformed_height");
auto transformed_width = ctx.Attr<int>("transformed_width");
auto spatial_scale = ctx.Attr<float>("spatial_scale");
auto in_dims = in->dims();
int batch_size = in_dims[0];
int channels = in_dims[1];
int in_height = in_dims[2];
int in_width = in_dims[3];
int rois_num = rois->dims()[0];
const T* input_data = in->data<T>();
T* output_data = out->mutable_data<T>(ctx.GetPlace());
const T* rois_data = rois->data<T>();
framework::Tensor roi2image;
framework::Tensor roi2image_dev;
roi2image.Resize({rois_num});
int* roi2image_data = roi2image.mutable_data<int>(platform::CPUPlace());
auto lod = rois->lod().back();
for (int i = 0; i < lod.size() - 1; ++i) {
for (int j = lod[i]; j < lod[i + 1]; ++j) {
roi2image_data[j] = i;
}
}
TensorCopySync(roi2image, ctx.GetPlace(), &roi2image_dev);
int out_size = rois_num * transformed_height * transformed_width * channels;
auto stream = ctx.cuda_device_context().stream();
int block = 512;
int grid = (out_size + block - 1) / block;
RoiTransformKernel<T><<<grid, block, 0, stream>>>(
input_data, rois_data, roi2image_dev.data<int>(), rois_num, in_height,
in_width, channels, transformed_height, transformed_width,
spatial_scale, output_data);
}
};
template <typename T>
__device__ T get_feature_gradient(T xs, T ys, int w, int h, const int width,
const int height) {
if (GT<T>(-0.5, xs) || GT<T>(xs, width - 0.5) || GT<T>(-0.5, ys) ||
GT<T>(ys, height - 0.5)) {
return 0;
}
if (GT<T>(0, xs)) {
xs = 0;
}
if (GT<T>(0, ys)) {
ys = 0;
}
int xs_floor = floor(xs);
int ys_floor = floor(ys);
int xs_ceil;
int ys_ceil;
if (GT_E<T>(xs_floor, width - 1)) {
xs_ceil = xs_floor = width - 1;
xs = static_cast<T>(xs_floor);
} else {
xs_ceil = xs_floor + 1;
}
if (GT_E(ys_floor, height - 1)) {
ys_ceil = ys_floor = height - 1;
ys = static_cast<T>(ys_floor);
} else {
ys_ceil = ys_floor + 1;
}
T weight = 0;
if (w == xs_floor) {
if (h == ys_floor) {
weight = (w + 1 - xs) * (h + 1 - ys);
} else if (h == ys_ceil) {
weight = (w + 1 - xs) * (ys + 1 - h);
}
} else if (w == xs_ceil) {
if (h == ys_floor) {
weight = (xs + 1 - w) * (h + 1 - ys);
} else if (h == ys_ceil) {
weight = (xs + 1 - w) * (ys + 1 - h);
}
}
return weight;
}
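// Note: the returned weight equals the bilinear coefficient that the forward
// pass assigned to input pixel (w, h) when sampling (xs, ys), so the backward
// kernel scatters each output gradient with the same weights. For example,
// with xs = ys = 0.5 each of the four surrounding pixels gets weight 0.25.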
template <typename T>
__global__ void RoiTransformGradKernel(
const size_t* lod, const T* rois_data, int batch_size, int num_rois,
int in_height, int in_width, int channels, int transformed_height,
int transformed_width, float spatial_scale, const T* out_grad_data,
T* in_grad_data) {
int input_size = batch_size * in_height * in_width * channels;
CUDA_1D_KERNEL_LOOP(index, input_size) {
// (n, c, h, w) coords in input
int in_w = idx4_4(index, batch_size, channels, in_height, in_width);
int in_h = idx4_3(index, batch_size, channels, in_height, in_width);
int c = idx4_2(index, batch_size, channels, in_height, in_width);
int n = idx4_1(index, batch_size, channels, in_height, in_width);
T gradient = 0.0;
// Accumulate gradient over all RoIs that interpolated this element
for (int roi_idx = lod[n]; roi_idx < lod[n + 1]; ++roi_idx) {
const T* rois = rois_data + roi_idx * 8;
T roi_x[4];
T roi_y[4];
for (int k = 0; k < 4; ++k) {
roi_x[k] = rois[2 * k] * spatial_scale;
roi_y[k] = rois[2 * k + 1] * spatial_scale;
}
// Get transform matrix
T matrix[9];
get_transform_matrix<T>(transformed_width, transformed_height, roi_x,
roi_y, matrix);
const T* out_grad_ptr =
out_grad_data +
(roi_idx * channels + c) * transformed_height * transformed_width;
for (int out_h = 0; out_h < transformed_height; ++out_h) {
for (int out_w = 0; out_w < transformed_width; ++out_w) {
T src_w;
T src_h;
get_source_coords<T>(matrix, out_w, out_h, &src_w, &src_h);
if (in_quad<T>(src_w, src_h, roi_x, roi_y)) {
if (GT<T>(-0.5, src_w) ||
GT<T>(src_w, static_cast<T>(in_width - 0.5)) ||
GT<T>(-0.5, src_h) ||
GT<T>(src_h, static_cast<T>(in_height - 0.5))) {
continue;
}
T weight = get_feature_gradient<T>(src_w, src_h, in_w, in_h,
in_width, in_height);
gradient +=
out_grad_ptr[out_h * transformed_width + out_w] * weight;
}
}
}
}
in_grad_data[index] = gradient;
}
}
template <typename T>
class CUDAROIPerspectiveTransformGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* in = ctx.Input<framework::Tensor>("X");
auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
auto* out_grad =
ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
auto transformed_height = ctx.Attr<int>("transformed_height");
auto transformed_width = ctx.Attr<int>("transformed_width");
auto spatial_scale = ctx.Attr<float>("spatial_scale");
auto in_dims = in->dims();
int batch_size = in_dims[0];
int channels = in_dims[1];
int in_height = in_dims[2];
int in_width = in_dims[3];
int rois_num = rois->dims()[0];
T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
const T* out_grad_data = out_grad->data<T>();
const T* rois_data = rois->data<T>();
auto lod = rois->lod().back();
auto lod_data = lod.CUDAData(ctx.GetPlace());
int in_size = in->numel();
auto stream = ctx.cuda_device_context().stream();
int block = 512;
int grid = (in_size + block - 1) / block;
RoiTransformGradKernel<T><<<grid, block, 0, stream>>>(
lod_data, rois_data, batch_size, rois_num, in_height, in_width,
channels, transformed_height, transformed_width, spatial_scale,
out_grad_data, in_grad_data);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(roi_perspective_transform,
ops::CUDAROIPerspectiveTransformOpKernel<float>);
REGISTER_OP_CUDA_KERNEL(roi_perspective_transform_grad,
ops::CUDAROIPerspectiveTransformGradOpKernel<float>);
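// ---------------------------------------------------------------------------
// The following standalone sketch is not part of the Paddle source; it is a
// hypothetical host-only program added to illustrate and sanity-check the
// homography construction used above. It mirrors get_transform_matrix and
// get_source_coords (ignoring the width-normalization step) and verifies that
// the four output corners map back onto the ROI corners. The file name and
// build line are assumptions, e.g. `g++ -std=c++11 roi_transform_check.cc`.
#include <cstdio>

namespace {
// Same construction as get_transform_matrix, with the normalized size taken
// to be the transformed size.
void BuildMatrix(int w, int h, const double rx[4], const double ry[4],
                 double m[9]) {
  double dx1 = rx[1] - rx[2], dx2 = rx[3] - rx[2];
  double dx3 = rx[0] - rx[1] + rx[2] - rx[3];
  double dy1 = ry[1] - ry[2], dy2 = ry[3] - ry[2];
  double dy3 = ry[0] - ry[1] + ry[2] - ry[3];
  m[6] = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) / (w - 1);
  m[7] = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) / (h - 1);
  m[8] = 1.0;
  m[0] = (rx[1] - rx[0] + m[6] * (w - 1) * rx[1]) / (w - 1);
  m[1] = (rx[3] - rx[0] + m[7] * (h - 1) * rx[3]) / (h - 1);
  m[2] = rx[0];
  m[3] = (ry[1] - ry[0] + m[6] * (w - 1) * ry[1]) / (w - 1);
  m[4] = (ry[3] - ry[0] + m[7] * (h - 1) * ry[3]) / (h - 1);
  m[5] = ry[0];
}

// Same mapping as get_source_coords: output pixel -> source coordinate.
void MapPoint(const double m[9], double ox, double oy, double* ix,
              double* iy) {
  double u = m[0] * ox + m[1] * oy + m[2];
  double v = m[3] * ox + m[4] * oy + m[5];
  double w = m[6] * ox + m[7] * oy + m[8];
  *ix = u / w;
  *iy = v / w;
}
}  // namespace

int main() {
  // A slanted quadrilateral ROI and an 8x4 transformed output.
  const int w = 8, h = 4;
  const double rx[4] = {1.0, 9.0, 11.0, 2.0};
  const double ry[4] = {2.0, 1.0, 6.0, 7.0};
  double m[9];
  BuildMatrix(w, h, rx, ry, m);
  // The four output corners must map back onto the four ROI corners.
  const double cx[4] = {0.0, w - 1.0, w - 1.0, 0.0};
  const double cy[4] = {0.0, 0.0, h - 1.0, h - 1.0};
  for (int k = 0; k < 4; ++k) {
    double ix, iy;
    MapPoint(m, cx[k], cy[k], &ix, &iy);
    std::printf("corner %d: got (%.3f, %.3f), expected (%.1f, %.1f)\n", k, ix,
                iy, rx[k], ry[k]);
  }
  return 0;
}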
...@@ -76,8 +76,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -76,8 +76,8 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type")); auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type"));
int class_num = ctx.Attr<int>("class_num"); int class_num = ctx.Attr<int>("class_num");
auto& label_lod = in_label->lod(); auto label_lod = in_label->lod();
auto& detect_lod = in_detect->lod(); auto detect_lod = in_detect->lod();
PADDLE_ENFORCE_EQ(label_lod.size(), 1UL, PADDLE_ENFORCE_EQ(label_lod.size(), 1UL,
"Only support one level sequence now."); "Only support one level sequence now.");
PADDLE_ENFORCE_EQ(label_lod[0].size(), detect_lod[0].size(), PADDLE_ENFORCE_EQ(label_lod[0].size(), detect_lod[0].size(),
...@@ -166,11 +166,11 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -166,11 +166,11 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto labels = framework::EigenTensor<T, 2>::From(input_label); auto labels = framework::EigenTensor<T, 2>::From(input_label);
auto detect = framework::EigenTensor<T, 2>::From(input_detect); auto detect = framework::EigenTensor<T, 2>::From(input_detect);
auto& label_lod = input_label.lod(); auto label_lod = input_label.lod();
auto& detect_lod = input_detect.lod(); auto detect_lod = input_detect.lod();
int batch_size = label_lod[0].size() - 1; int batch_size = label_lod[0].size() - 1;
auto& label_index = label_lod[0]; auto label_index = label_lod[0];
for (int n = 0; n < batch_size; ++n) { for (int n = 0; n < batch_size; ++n) {
std::map<int, std::vector<Box>> boxes; std::map<int, std::vector<Box>> boxes;
...@@ -274,6 +274,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -274,6 +274,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
output_true_pos->set_lod(true_pos_lod); output_true_pos->set_lod(true_pos_lod);
output_false_pos->set_lod(false_pos_lod); output_false_pos->set_lod(false_pos_lod);
return;
} }
void GetInputPos(const framework::Tensor& input_pos_count, void GetInputPos(const framework::Tensor& input_pos_count,
...@@ -291,7 +292,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -291,7 +292,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
auto SetData = [](const framework::LoDTensor& pos_tensor, auto SetData = [](const framework::LoDTensor& pos_tensor,
std::map<int, std::vector<std::pair<T, int>>>& pos) { std::map<int, std::vector<std::pair<T, int>>>& pos) {
const T* pos_data = pos_tensor.data<T>(); const T* pos_data = pos_tensor.data<T>();
auto& pos_data_lod = pos_tensor.lod()[0]; auto pos_data_lod = pos_tensor.lod()[0];
for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) { for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) {
for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) { for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) {
T score = pos_data[j * 2]; T score = pos_data[j * 2];
...@@ -316,23 +317,20 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -316,23 +317,20 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
std::map<int, std::vector<std::pair<T, int>>>* false_pos) const { std::map<int, std::vector<std::pair<T, int>>>* false_pos) const {
int batch_size = gt_boxes.size(); int batch_size = gt_boxes.size();
for (int n = 0; n < batch_size; ++n) { for (int n = 0; n < batch_size; ++n) {
auto& image_gt_boxes = gt_boxes[n]; auto image_gt_boxes = gt_boxes[n];
for (auto& image_gt_box : image_gt_boxes) { for (auto it = image_gt_boxes.begin(); it != image_gt_boxes.end(); ++it) {
size_t count = 0; size_t count = 0;
auto& labeled_bboxes = image_gt_box.second; auto labeled_bboxes = it->second;
if (evaluate_difficult) { if (evaluate_difficult) {
count = labeled_bboxes.size(); count = labeled_bboxes.size();
} else { } else {
for (auto& box : labeled_bboxes) { for (size_t i = 0; i < labeled_bboxes.size(); ++i)
if (!box.is_difficult) { if (!(labeled_bboxes[i].is_difficult)) ++count;
++count;
}
}
} }
if (count == 0) { if (count == 0) {
continue; continue;
} }
int label = image_gt_box.first; int label = it->first;
if (label_pos_count->find(label) == label_pos_count->end()) { if (label_pos_count->find(label) == label_pos_count->end()) {
(*label_pos_count)[label] = count; (*label_pos_count)[label] = count;
} else { } else {
......
...@@ -92,9 +92,14 @@ bool VariableResponse::CopyLodTensorData( ...@@ -92,9 +92,14 @@ bool VariableResponse::CopyLodTensorData(
::google::protobuf::io::CodedInputStream* input, ::google::protobuf::io::CodedInputStream* input,
const platform::DeviceContext& ctx, const framework::DDim& dims, const platform::DeviceContext& ctx, const framework::DDim& dims,
int length) { int length) {
auto server_var = GetVar();
if (!server_var) {
LOG(ERROR) << "recved var should not on current server: "
<< meta_.varname();
return false;
}
auto* tensor = GetVar()->GetMutable<framework::LoDTensor>(); auto* tensor = GetVar()->GetMutable<framework::LoDTensor>();
tensor->Resize(dims); tensor->Resize(dims);
framework::LoD lod; framework::LoD lod;
for (int i = 0; i < meta_.lod_level(); ++i) { for (int i = 0; i < meta_.lod_level(); ++i) {
framework::Vector<size_t> v; framework::Vector<size_t> v;
...@@ -107,7 +112,6 @@ bool VariableResponse::CopyLodTensorData( ...@@ -107,7 +112,6 @@ bool VariableResponse::CopyLodTensorData(
void* tensor_data = void* tensor_data =
tensor->mutable_data(ctx.GetPlace(), ToTypeIndex(meta_.data_type())); tensor->mutable_data(ctx.GetPlace(), ToTypeIndex(meta_.data_type()));
if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) { if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) {
return false; return false;
} }
......
...@@ -987,18 +987,28 @@ void FusedElemwiseAndActComputeWithBroadcast( ...@@ -987,18 +987,28 @@ void FusedElemwiseAndActComputeWithBroadcast(
} }
// --- backward // --- backward
template <typename T, typename DX_OP, typename DY_OP, bool UseIntermediateOut> template <typename T, typename DX_OP, typename DY_OP, typename DIntermediate_OP,
bool UseIntermediateOut>
struct FusedElemwiseAndActGradNoBroadcast { struct FusedElemwiseAndActGradNoBroadcast {
HOSTDEVICE void operator()(size_t i) { HOSTDEVICE void operator()(size_t i) {
if (dx_ != nullptr) { if (dx_ != nullptr) {
dx_[i] = UseIntermediateOut ? dx_op_(x_[i], y_[i], intermediate_out_[i], dx_[i] = UseIntermediateOut
out_[i], dout_[i]) ? dx_op_.UseIntermediateOut(
: dx_op_(x_[i], y_[i], out_[i], dout_[i]); x_[i], y_[i], intermediate_out_[i], out_[i], dout_[i])
: dx_op_.Recompute(x_[i], y_[i], out_[i], dout_[i]);
} }
if (dy_ != nullptr) { if (dy_ != nullptr) {
dy_[i] = UseIntermediateOut ? dy_op_(x_[i], y_[i], intermediate_out_[i], dy_[i] = UseIntermediateOut
out_[i], dout_[i]) ? dy_op_.UseIntermediateOut(
: dy_op_(x_[i], y_[i], out_[i], dout_[i]); x_[i], y_[i], intermediate_out_[i], out_[i], dout_[i])
: dy_op_.Recompute(x_[i], y_[i], out_[i], dout_[i]);
}
if (dintermediate_ != nullptr) {
dintermediate_[i] =
UseIntermediateOut
? dintermediate_op_.UseIntermediateOut(
x_[i], intermediate_out_[i], out_[i], dout_[i])
: dintermediate_op_.Recompute(x_[i], y_[i], out_[i], dout_[i]);
} }
} }
...@@ -1009,37 +1019,44 @@ struct FusedElemwiseAndActGradNoBroadcast { ...@@ -1009,37 +1019,44 @@ struct FusedElemwiseAndActGradNoBroadcast {
const T *dout_; const T *dout_;
DX_OP dx_op_; DX_OP dx_op_;
DY_OP dy_op_; DY_OP dy_op_;
DIntermediate_OP dintermediate_op_;
T *dx_; T *dx_;
T *dy_; T *dy_;
T *dintermediate_;
}; };
template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP, template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP,
bool UseIntermediateOut> typename DIntermediate_OP, bool UseIntermediateOut>
void FusedElemwiseAndActGradComputeNoBroadcast( void FusedElemwiseAndActGradComputeNoBroadcast(
const framework::ExecutionContext &ctx, const framework::DDim &x_dim, const framework::ExecutionContext &ctx, const framework::DDim &x_dim,
const framework::DDim &y_dim, const framework::Tensor *x, const framework::DDim &y_dim, const framework::Tensor *x,
const framework::Tensor *y, const framework::Tensor *intermediate_out, const framework::Tensor *y, const framework::Tensor *intermediate_out,
const framework::Tensor *out, const framework::Tensor *dout, int axis, const framework::Tensor *out, const framework::Tensor *dout, int axis,
framework::Tensor *dx, framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) { framework::Tensor *dx, framework::Tensor *dy,
framework::Tensor *dintermediate, DX_OP dx_op, DY_OP dy_op,
DIntermediate_OP dintermediate_op) {
size_t N = static_cast<size_t>(framework::product(x_dim)); size_t N = static_cast<size_t>(framework::product(x_dim));
platform::ForRange<DeviceContext> for_range( platform::ForRange<DeviceContext> for_range(
ctx.template device_context<DeviceContext>(), N); ctx.template device_context<DeviceContext>(), N);
for_range( for_range(
FusedElemwiseAndActGradNoBroadcast<T, DX_OP, DY_OP, UseIntermediateOut>{ FusedElemwiseAndActGradNoBroadcast<T, DX_OP, DY_OP, DIntermediate_OP,
UseIntermediateOut>{
x->data<T>(), y->data<T>(), x->data<T>(), y->data<T>(),
intermediate_out ? intermediate_out->data<T>() : nullptr, intermediate_out ? intermediate_out->data<T>() : nullptr,
out->data<T>(), dout->data<T>(), dx_op, dy_op, out->data<T>(), dout->data<T>(), dx_op, dy_op, dintermediate_op,
dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()), dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace())}); dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()),
dintermediate == nullptr ? nullptr : dintermediate->mutable_data<T>(
ctx.GetPlace())});
} }
template <typename T, typename DX_OP, typename DY_OP, bool UseIntermediateOut, template <typename T, typename DX_OP, typename DY_OP, typename DIntermediate_OP,
bool BcastY, bool SameShapeOfIntermediateOutAndOut> bool UseIntermediateOut, bool BcastY,
static void FusedElemwiseAndActGradBroadcast1CPU(const T *x, const T *y, bool SameShapeOfIntermediateOutAndOut>
const T *intermediate_out, static void FusedElemwiseAndActGradBroadcast1CPU(
const T *out, const T *dout, const T *x, const T *y, const T *intermediate_out, const T *out,
int h, int w, DX_OP dx_op, const T *dout, int h, int w, DX_OP dx_op, DY_OP dy_op,
DY_OP dy_op, T *dx, T *dy) { DIntermediate_OP dintermediate_op, T *dx, T *dy, T *d_intermediate) {
int64_t tmp_out_idx, x_idx, y_idx; int64_t tmp_out_idx, x_idx, y_idx;
for (int i = 0; i < h; ++i) { for (int i = 0; i < h; ++i) {
for (int j = 0; j < w; ++j) { for (int j = 0; j < w; ++j) {
...@@ -1055,9 +1072,11 @@ static void FusedElemwiseAndActGradBroadcast1CPU(const T *x, const T *y, ...@@ -1055,9 +1072,11 @@ static void FusedElemwiseAndActGradBroadcast1CPU(const T *x, const T *y,
if (dx != nullptr) { if (dx != nullptr) {
T tmp = UseIntermediateOut T tmp = UseIntermediateOut
? dx_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], ? dx_op.UseIntermediateOut(x[x_idx], y[y_idx],
out[offset], dout[offset]) intermediate_out[tmp_out_idx],
: dx_op(x[x_idx], y[y_idx], out[offset], dout[offset]); out[offset], dout[offset])
: dx_op.Recompute(x[x_idx], y[y_idx], out[offset],
dout[offset]);
if (BcastY) { if (BcastY) {
dx[x_idx] = tmp; dx[x_idx] = tmp;
...@@ -1071,9 +1090,11 @@ static void FusedElemwiseAndActGradBroadcast1CPU(const T *x, const T *y, ...@@ -1071,9 +1090,11 @@ static void FusedElemwiseAndActGradBroadcast1CPU(const T *x, const T *y,
} }
if (dy != nullptr) { if (dy != nullptr) {
T tmp = UseIntermediateOut T tmp = UseIntermediateOut
? dy_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], ? dy_op.UseIntermediateOut(x[x_idx], y[y_idx],
out[offset], dout[offset]) intermediate_out[tmp_out_idx],
: dy_op(x[x_idx], y[y_idx], out[offset], dout[offset]); out[offset], dout[offset])
: dy_op.Recompute(x[x_idx], y[y_idx], out[offset],
dout[offset]);
if (BcastY) { if (BcastY) {
if (i == 0) { if (i == 0) {
dy[y_idx] = tmp; dy[y_idx] = tmp;
...@@ -1084,18 +1105,34 @@ static void FusedElemwiseAndActGradBroadcast1CPU(const T *x, const T *y, ...@@ -1084,18 +1105,34 @@ static void FusedElemwiseAndActGradBroadcast1CPU(const T *x, const T *y,
dy[y_idx] = tmp; dy[y_idx] = tmp;
} }
} }
if (d_intermediate != nullptr) {
T tmp = UseIntermediateOut
? dintermediate_op.UseIntermediateOut(
x[x_idx], intermediate_out[tmp_out_idx], out[offset],
dout[offset])
: dintermediate_op.Recompute(x[x_idx], y[y_idx],
out[offset], dout[i]);
if (SameShapeOfIntermediateOutAndOut) {
d_intermediate[tmp_out_idx] = tmp;
} else {
if (i == 0) {
d_intermediate[tmp_out_idx] = tmp;
} else {
d_intermediate[tmp_out_idx] += tmp;
}
}
}
} }
} }
} }
template <typename T, typename DX_OP, typename DY_OP, bool UseIntermediateOut, template <typename T, typename DX_OP, typename DY_OP, typename DIntermediate_OP,
bool BcastY, bool SameShapeOfIntermediateOutAndOut> bool UseIntermediateOut, bool BcastY,
static void FusedElemwiseAndActGradBroadcast2CPU(const T *x, const T *y, bool SameShapeOfIntermediateOutAndOut>
const T *intermediate_out, static void FusedElemwiseAndActGradBroadcast2CPU(
const T *out, const T *dout, const T *x, const T *y, const T *intermediate_out, const T *out,
int pre, int n, int post, const T *dout, int pre, int n, int post, DX_OP dx_op, DY_OP dy_op,
DX_OP dx_op, DY_OP dy_op, DIntermediate_OP dintermediate_op, T *dx, T *dy, T *d_intermediate) {
T *dx, T *dy) {
int64_t tmp_out_idx, x_idx, y_idx; int64_t tmp_out_idx, x_idx, y_idx;
for (int i = 0; i < pre; ++i) { for (int i = 0; i < pre; ++i) {
for (int j = 0; j < n; ++j) { for (int j = 0; j < n; ++j) {
...@@ -1112,9 +1149,11 @@ static void FusedElemwiseAndActGradBroadcast2CPU(const T *x, const T *y, ...@@ -1112,9 +1149,11 @@ static void FusedElemwiseAndActGradBroadcast2CPU(const T *x, const T *y,
if (dx != nullptr) { if (dx != nullptr) {
T tmp = UseIntermediateOut T tmp = UseIntermediateOut
? dx_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], ? dx_op.UseIntermediateOut(x[x_idx], y[y_idx],
out[offset], dout[offset]) intermediate_out[tmp_out_idx],
: dx_op(x[x_idx], y[y_idx], out[offset], dout[offset]); out[offset], dout[offset])
: dx_op.Recompute(x[x_idx], y[y_idx], out[offset],
dout[offset]);
if (BcastY) { if (BcastY) {
dx[x_idx] = tmp; dx[x_idx] = tmp;
...@@ -1128,9 +1167,11 @@ static void FusedElemwiseAndActGradBroadcast2CPU(const T *x, const T *y, ...@@ -1128,9 +1167,11 @@ static void FusedElemwiseAndActGradBroadcast2CPU(const T *x, const T *y,
} }
if (dy != nullptr) { if (dy != nullptr) {
T tmp = UseIntermediateOut T tmp = UseIntermediateOut
? dy_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], ? dy_op.UseIntermediateOut(x[x_idx], y[y_idx],
out[offset], dout[offset]) intermediate_out[tmp_out_idx],
: dy_op(x[x_idx], y[y_idx], out[offset], dout[offset]); out[offset], dout[offset])
: dy_op.Recompute(x[x_idx], y[y_idx], out[offset],
dout[offset]);
if (BcastY) { if (BcastY) {
if (i == 0 && k == 0) { if (i == 0 && k == 0) {
dy[y_idx] = tmp; dy[y_idx] = tmp;
...@@ -1141,21 +1182,40 @@ static void FusedElemwiseAndActGradBroadcast2CPU(const T *x, const T *y, ...@@ -1141,21 +1182,40 @@ static void FusedElemwiseAndActGradBroadcast2CPU(const T *x, const T *y,
dy[y_idx] = tmp; dy[y_idx] = tmp;
} }
} }
if (d_intermediate != nullptr) {
T tmp = UseIntermediateOut
? dintermediate_op.UseIntermediateOut(
x[x_idx], intermediate_out[tmp_out_idx],
out[offset], dout[offset])
: dintermediate_op.Recompute(x[x_idx], y[y_idx],
out[offset], dout[i]);
if (SameShapeOfIntermediateOutAndOut) {
d_intermediate[tmp_out_idx] = tmp;
} else {
if (i == 0) {
d_intermediate[tmp_out_idx] = tmp;
} else {
d_intermediate[tmp_out_idx] += tmp;
}
}
}
} }
} }
} }
} }
#ifdef __NVCC__ #ifdef __NVCC__
template <typename T, typename DX_OP, typename DY_OP, bool UseIntermediateOut, template <typename T, typename DX_OP, typename DY_OP, typename DIntermediate_OP,
bool BcastY, bool SameShapeOfIntermediateOutAndOut> bool UseIntermediateOut, bool BcastY,
bool SameShapeOfIntermediateOutAndOut>
static __global__ void FusedElemwiseAndActGradBroadcast1CUDAKernel( static __global__ void FusedElemwiseAndActGradBroadcast1CUDAKernel(
const T *x, const T *y, const T *intermediate_out, const T *out, const T *x, const T *y, const T *intermediate_out, const T *out,
const T *dout, int h, int w, DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) { const T *dout, int h, int w, DX_OP dx_op, DY_OP dy_op,
DIntermediate_OP dintermediate_op, T *dx, T *dy, T *d_intermediate) {
int j = blockIdx.x; int j = blockIdx.x;
int i = threadIdx.x; int i = threadIdx.x;
int tid = threadIdx.x; int tid = threadIdx.x;
T val(0); T val(0), inter_val(0);
int64_t tmp_out_idx, x_idx, y_idx; int64_t tmp_out_idx, x_idx, y_idx;
do { do {
...@@ -1170,10 +1230,12 @@ static __global__ void FusedElemwiseAndActGradBroadcast1CUDAKernel( ...@@ -1170,10 +1230,12 @@ static __global__ void FusedElemwiseAndActGradBroadcast1CUDAKernel(
} }
if (dx != nullptr) { if (dx != nullptr) {
T tmp = UseIntermediateOut T tmp =
? dx_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], UseIntermediateOut
out[offset], dout[offset]) ? dx_op.UseIntermediateOut(x[x_idx], y[y_idx],
: dx_op(x[x_idx], y[y_idx], out[offset], dout[offset]); intermediate_out[tmp_out_idx],
out[offset], dout[offset])
: dx_op.Recompute(x[x_idx], y[y_idx], out[offset], dout[offset]);
if (BcastY) { if (BcastY) {
dx[x_idx] = tmp; dx[x_idx] = tmp;
...@@ -1182,23 +1244,38 @@ static __global__ void FusedElemwiseAndActGradBroadcast1CUDAKernel( ...@@ -1182,23 +1244,38 @@ static __global__ void FusedElemwiseAndActGradBroadcast1CUDAKernel(
} }
} }
if (dy != nullptr) { if (dy != nullptr) {
T tmp = UseIntermediateOut T tmp =
? dy_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], UseIntermediateOut
out[offset], dout[offset]) ? dy_op.UseIntermediateOut(x[x_idx], y[y_idx],
: dy_op(x[x_idx], y[y_idx], out[offset], dout[offset]); intermediate_out[tmp_out_idx],
out[offset], dout[offset])
: dy_op.Recompute(x[x_idx], y[y_idx], out[offset], dout[offset]);
if (BcastY) { if (BcastY) {
val += tmp; val += tmp;
} else { } else {
dy[y_idx] = tmp; dy[y_idx] = tmp;
} }
} }
if (d_intermediate != nullptr) {
T tmp = UseIntermediateOut
? dintermediate_op.UseIntermediateOut(
y[y_idx], intermediate_out[tmp_out_idx], out[offset],
dout[offset])
: dintermediate_op.Recompute(x[x_idx], y[y_idx], out[offset],
dout[offset]);
if (SameShapeOfIntermediateOutAndOut) {
d_intermediate[tmp_out_idx] = tmp;
} else {
inter_val += tmp;
}
}
i += ELEMWISE_MAX_BLOCK_DIM; i += ELEMWISE_MAX_BLOCK_DIM;
} while (i < h); } while (i < h);
h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
if (BcastY) { if (BcastY) {
if (dy) { if (dy) {
h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
val = paddle::platform::reduceSum(val, tid, h); val = paddle::platform::reduceSum(val, tid, h);
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
dy[j] = val; dy[j] = val;
...@@ -1206,41 +1283,49 @@ static __global__ void FusedElemwiseAndActGradBroadcast1CUDAKernel( ...@@ -1206,41 +1283,49 @@ static __global__ void FusedElemwiseAndActGradBroadcast1CUDAKernel(
} }
} else { } else {
if (dx) { if (dx) {
h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
val = paddle::platform::reduceSum(val, tid, h); val = paddle::platform::reduceSum(val, tid, h);
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
dx[j] = val; dx[j] = val;
} }
} }
} }
if (!SameShapeOfIntermediateOutAndOut) {
if (d_intermediate) {
inter_val = paddle::platform::reduceSum(inter_val, tid, h);
if (threadIdx.x == 0) {
d_intermediate[j] = inter_val;
}
}
}
} }
template <typename T, typename DX_OP, typename DY_OP, bool UseIntermediateOut, template <typename T, typename DX_OP, typename DY_OP, typename DIntermediate_OP,
bool BcastY, bool SameShapeOfIntermediateOutAndOut> bool UseIntermediateOut, bool BcastY,
static void FusedElemwiseAndActGradBroadcast1CUDA(cudaStream_t stream, bool SameShapeOfIntermediateOutAndOut>
const T *x, const T *y, static void FusedElemwiseAndActGradBroadcast1CUDA(
const T *intermediate_out, cudaStream_t stream, const T *x, const T *y, const T *intermediate_out,
const T *out, const T *dout, const T *out, const T *dout, int h, int w, DX_OP dx_op, DY_OP dy_op,
int h, int w, DX_OP dx_op, DIntermediate_OP dintermediate_op, T *dx, T *dy, T *d_intermediate) {
DY_OP dy_op, T *dx, T *dy) {
int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h); int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h);
int gird_size = w; int gird_size = w;
FusedElemwiseAndActGradBroadcast1CUDAKernel< FusedElemwiseAndActGradBroadcast1CUDAKernel<
T, DX_OP, DY_OP, UseIntermediateOut, BcastY, T, DX_OP, DY_OP, DIntermediate_OP, UseIntermediateOut, BcastY,
SameShapeOfIntermediateOutAndOut><<<gird_size, block_size, 0, stream>>>( SameShapeOfIntermediateOutAndOut><<<gird_size, block_size, 0, stream>>>(
x, y, intermediate_out, out, dout, h, w, dx_op, dy_op, dx, dy); x, y, intermediate_out, out, dout, h, w, dx_op, dy_op, dintermediate_op,
dx, dy, d_intermediate);
} }
template <typename T, typename DX_OP, typename DY_OP, bool UseIntermediateOut, template <typename T, typename DX_OP, typename DY_OP, typename DIntermediate_OP,
bool BcastY, bool SameShapeOfIntermediateOutAndOut> bool UseIntermediateOut, bool BcastY,
bool SameShapeOfIntermediateOutAndOut>
static __global__ void FusedElemwiseAndActGradBroadcast2CUDAKernel( static __global__ void FusedElemwiseAndActGradBroadcast2CUDAKernel(
const T *x, const T *y, const T *intermediate_out, const T *out, const T *x, const T *y, const T *intermediate_out, const T *out,
const T *dout, int pre, int n, int post, DX_OP dx_op, DY_OP dy_op, T *dx, const T *dout, int pre, int n, int post, DX_OP dx_op, DY_OP dy_op,
T *dy) { DIntermediate_OP dintermediate_op, T *dx, T *dy, T *d_intermediate) {
int tid = threadIdx.x; int tid = threadIdx.x;
int j = blockIdx.x; int j = blockIdx.x;
T val(0); T val(0), inter_val(0);
int ttid = tid; int ttid = tid;
int64_t tmp_out_idx, x_idx, y_idx; int64_t tmp_out_idx, x_idx, y_idx;
while (true) { while (true) {
...@@ -1259,10 +1344,12 @@ static __global__ void FusedElemwiseAndActGradBroadcast2CUDAKernel( ...@@ -1259,10 +1344,12 @@ static __global__ void FusedElemwiseAndActGradBroadcast2CUDAKernel(
} }
if (dx != nullptr) { if (dx != nullptr) {
T tmp = UseIntermediateOut T tmp =
? dx_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], UseIntermediateOut
out[offset], dout[offset]) ? dx_op.UseIntermediateOut(x[x_idx], y[y_idx],
: dx_op(x[x_idx], y[y_idx], out[offset], dout[offset]); intermediate_out[tmp_out_idx],
out[offset], dout[offset])
: dx_op.Recompute(x[x_idx], y[y_idx], out[offset], dout[offset]);
if (BcastY) { if (BcastY) {
dx[x_idx] = tmp; dx[x_idx] = tmp;
...@@ -1271,24 +1358,38 @@ static __global__ void FusedElemwiseAndActGradBroadcast2CUDAKernel( ...@@ -1271,24 +1358,38 @@ static __global__ void FusedElemwiseAndActGradBroadcast2CUDAKernel(
} }
} }
if (dy != nullptr) { if (dy != nullptr) {
T tmp = UseIntermediateOut T tmp =
? dy_op(x[x_idx], y[y_idx], intermediate_out[tmp_out_idx], UseIntermediateOut
out[offset], dout[offset]) ? dy_op.UseIntermediateOut(x[x_idx], y[y_idx],
: dy_op(x[x_idx], y[y_idx], out[offset], dout[offset]); intermediate_out[tmp_out_idx],
out[offset], dout[offset])
: dy_op.Recompute(x[x_idx], y[y_idx], out[offset], dout[offset]);
if (BcastY) { if (BcastY) {
val += tmp; val += tmp;
} else { } else {
dy[y_idx] = tmp; dy[y_idx] = tmp;
} }
} }
if (d_intermediate != nullptr) {
T tmp = UseIntermediateOut
? dintermediate_op.UseIntermediateOut(
y[y_idx], intermediate_out[tmp_out_idx], out[offset],
dout[offset])
: dintermediate_op.Recompute(x[x_idx], y[y_idx], out[offset],
dout[offset]);
if (SameShapeOfIntermediateOutAndOut) {
d_intermediate[tmp_out_idx] = tmp;
} else {
inter_val += tmp;
}
}
ttid += ELEMWISE_MAX_BLOCK_DIM; ttid += ELEMWISE_MAX_BLOCK_DIM;
} }
int h = pre * post;
h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
if (BcastY) { if (BcastY) {
if (dy) { if (dy) {
int h = pre * post;
h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
val = paddle::platform::reduceSum(val, tid, h); val = paddle::platform::reduceSum(val, tid, h);
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
dy[j] = val; dy[j] = val;
...@@ -1296,40 +1397,51 @@ static __global__ void FusedElemwiseAndActGradBroadcast2CUDAKernel( ...@@ -1296,40 +1397,51 @@ static __global__ void FusedElemwiseAndActGradBroadcast2CUDAKernel(
} }
} else { } else {
if (dx) { if (dx) {
int h = pre * post;
h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
val = paddle::platform::reduceSum(val, tid, h); val = paddle::platform::reduceSum(val, tid, h);
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
dx[j] = val; dx[j] = val;
} }
} }
} }
if (!SameShapeOfIntermediateOutAndOut) {
if (d_intermediate) {
inter_val = paddle::platform::reduceSum(inter_val, tid, h);
if (threadIdx.x == 0) {
d_intermediate[j] = inter_val;
}
}
}
} }
template <typename T, typename DX_OP, typename DY_OP, bool UseIntermediateOut, template <typename T, typename DX_OP, typename DY_OP, typename DIntermediate_OP,
bool BcastY, bool SameShapeOfIntermediateOutAndOut> bool UseIntermediateOut, bool BcastY,
bool SameShapeOfIntermediateOutAndOut>
static void FusedElemwiseAndActGradBroadcast2CUDA( static void FusedElemwiseAndActGradBroadcast2CUDA(
cudaStream_t stream, const T *x, const T *y, const T *intermediate_out, cudaStream_t stream, const T *x, const T *y, const T *intermediate_out,
const T *out, const T *dout, int pre, int n, int post, DX_OP dx_op, const T *out, const T *dout, int pre, int n, int post, DX_OP dx_op,
DY_OP dy_op, T *dx, T *dy) { DY_OP dy_op, DIntermediate_OP dintermediate_op, T *dx, T *dy,
T *dintermediate) {
int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, pre * post); int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, pre * post);
int gird_size = n; int gird_size = n;
FusedElemwiseAndActGradBroadcast2CUDAKernel< FusedElemwiseAndActGradBroadcast2CUDAKernel<
T, DX_OP, DY_OP, UseIntermediateOut, BcastY, T, DX_OP, DY_OP, DIntermediate_OP, UseIntermediateOut, BcastY,
SameShapeOfIntermediateOutAndOut><<<gird_size, block_size, 0, stream>>>( SameShapeOfIntermediateOutAndOut><<<gird_size, block_size, 0, stream>>>(
x, y, intermediate_out, out, dout, pre, n, post, dx_op, dy_op, dx, dy); x, y, intermediate_out, out, dout, pre, n, post, dx_op, dy_op,
dintermediate_op, dx, dy, dintermediate);
} }
#endif #endif
template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP, template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP,
bool UseIntermediateOut, bool BcastY, typename DIntermediate_OP, bool UseIntermediateOut, bool BcastY,
bool SameShapeOfIntermediateOutAndOut> bool SameShapeOfIntermediateOutAndOut>
void FusedElemwiseAndActGradComputeWithBroadcast( void FusedElemwiseAndActGradComputeWithBroadcast(
const framework::ExecutionContext &ctx, const framework::DDim &x_dim, const framework::ExecutionContext &ctx, const framework::DDim &x_dim,
const framework::DDim &y_dim_untrimed, const framework::Tensor *x, const framework::DDim &y_dim_untrimed, const framework::Tensor *x,
const framework::Tensor *y, const framework::Tensor *intermediate_out, const framework::Tensor *y, const framework::Tensor *intermediate_out,
const framework::Tensor *out, const framework::Tensor *dout, int axis, const framework::Tensor *out, const framework::Tensor *dout, int axis,
framework::Tensor *dx, framework::Tensor *dy, DX_OP dx_op, DY_OP dy_op) { framework::Tensor *dx, framework::Tensor *dy,
framework::Tensor *dintermediate, DX_OP dx_op, DY_OP dy_op,
DIntermediate_OP dintermediate_op) {
axis = (axis == -1 ? x_dim.size() - y_dim_untrimed.size() : axis); axis = (axis == -1 ? x_dim.size() - y_dim_untrimed.size() : axis);
auto y_dim = trim_trailing_singular_dims(y_dim_untrimed); auto y_dim = trim_trailing_singular_dims(y_dim_untrimed);
axis = (y_dim.size() == 0) ? x_dim.size() : axis; axis = (y_dim.size() == 0) ? x_dim.size() : axis;
...@@ -1341,70 +1453,82 @@ void FusedElemwiseAndActGradComputeWithBroadcast( ...@@ -1341,70 +1453,82 @@ void FusedElemwiseAndActGradComputeWithBroadcast(
int w = n; int w = n;
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef __NVCC__ #ifdef __NVCC__
FusedElemwiseAndActGradBroadcast1CUDA<T, DX_OP, DY_OP, UseIntermediateOut, FusedElemwiseAndActGradBroadcast1CUDA<T, DX_OP, DY_OP, DIntermediate_OP,
BcastY, UseIntermediateOut, BcastY,
SameShapeOfIntermediateOutAndOut>( SameShapeOfIntermediateOutAndOut>(
ctx.template device_context<DeviceContext>().stream(), x->data<T>(), ctx.template device_context<DeviceContext>().stream(), x->data<T>(),
y->data<T>(), y->data<T>(),
intermediate_out == nullptr ? nullptr : intermediate_out->data<T>(), intermediate_out == nullptr ? nullptr : intermediate_out->data<T>(),
out->data<T>(), dout->data<T>(), h, w, dx_op, dy_op, out->data<T>(), dout->data<T>(), h, w, dx_op, dy_op, dintermediate_op,
dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()), dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace())); dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()),
dintermediate == nullptr ? nullptr : dintermediate->mutable_data<T>(
ctx.GetPlace()));
#endif #endif
} else { } else {
FusedElemwiseAndActGradBroadcast1CPU<T, DX_OP, DY_OP, UseIntermediateOut, FusedElemwiseAndActGradBroadcast1CPU<T, DX_OP, DY_OP, DIntermediate_OP,
BcastY, UseIntermediateOut, BcastY,
SameShapeOfIntermediateOutAndOut>( SameShapeOfIntermediateOutAndOut>(
x->data<T>(), y->data<T>(), x->data<T>(), y->data<T>(),
intermediate_out == nullptr ? nullptr : intermediate_out->data<T>(), intermediate_out == nullptr ? nullptr : intermediate_out->data<T>(),
out->data<T>(), dout->data<T>(), h, w, dx_op, dy_op, out->data<T>(), dout->data<T>(), h, w, dx_op, dy_op, dintermediate_op,
dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()), dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace())); dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()),
dintermediate == nullptr ? nullptr : dintermediate->mutable_data<T>(
ctx.GetPlace()));
} }
} else { } else {
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef __NVCC__ #ifdef __NVCC__
FusedElemwiseAndActGradBroadcast2CUDA<T, DX_OP, DY_OP, UseIntermediateOut, FusedElemwiseAndActGradBroadcast2CUDA<T, DX_OP, DY_OP, DIntermediate_OP,
BcastY, UseIntermediateOut, BcastY,
SameShapeOfIntermediateOutAndOut>( SameShapeOfIntermediateOutAndOut>(
ctx.template device_context<DeviceContext>().stream(), x->data<T>(), ctx.template device_context<DeviceContext>().stream(), x->data<T>(),
y->data<T>(), y->data<T>(),
intermediate_out == nullptr ? nullptr : intermediate_out->data<T>(), intermediate_out == nullptr ? nullptr : intermediate_out->data<T>(),
out->data<T>(), dout->data<T>(), pre, n, post, dx_op, dy_op, out->data<T>(), dout->data<T>(), pre, n, post, dx_op, dy_op,
dintermediate_op,
dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()), dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace())); dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()),
dintermediate == nullptr ? nullptr : dintermediate->mutable_data<T>(
ctx.GetPlace()));
#endif #endif
} else { } else {
FusedElemwiseAndActGradBroadcast2CPU<T, DX_OP, DY_OP, UseIntermediateOut, FusedElemwiseAndActGradBroadcast2CPU<T, DX_OP, DY_OP, DIntermediate_OP,
BcastY, UseIntermediateOut, BcastY,
SameShapeOfIntermediateOutAndOut>( SameShapeOfIntermediateOutAndOut>(
x->data<T>(), y->data<T>(), x->data<T>(), y->data<T>(),
intermediate_out == nullptr ? nullptr : intermediate_out->data<T>(), intermediate_out == nullptr ? nullptr : intermediate_out->data<T>(),
out->data<T>(), dout->data<T>(), pre, n, post, dx_op, dy_op, out->data<T>(), dout->data<T>(), pre, n, post, dx_op, dy_op,
dintermediate_op,
dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()), dx == nullptr ? nullptr : dx->mutable_data<T>(ctx.GetPlace()),
dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace())); dy == nullptr ? nullptr : dy->mutable_data<T>(ctx.GetPlace()),
dintermediate == nullptr ? nullptr : dintermediate->mutable_data<T>(
ctx.GetPlace()));
} }
} }
} }
template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP, template <typename DeviceContext, typename T, typename DX_OP, typename DY_OP,
bool UseIntermediateOut, bool SameShapeOfIntermediateOutAndOut> typename DIntermediate_OP, bool UseIntermediateOut,
bool SameShapeOfIntermediateOutAndOut>
void FusedElemwiseAndActGradComputeEx( void FusedElemwiseAndActGradComputeEx(
const framework::ExecutionContext &ctx, const framework::Tensor *x, const framework::ExecutionContext &ctx, const framework::Tensor *x,
const framework::Tensor *y, const framework::Tensor *out, const framework::Tensor *y, const framework::Tensor *out,
const framework::Tensor *intermediate_out, const framework::Tensor *dout, const framework::Tensor *intermediate_out, const framework::Tensor *dout,
int axis, framework::Tensor *dx, framework::Tensor *dy, DX_OP dx_op, int axis, framework::Tensor *dx, framework::Tensor *dy,
DY_OP dy_op) { framework::Tensor *dintermediate, DX_OP dx_op, DY_OP dy_op,
DIntermediate_OP dintermediate_op) {
const framework::DDim &x_dim = x->dims(); const framework::DDim &x_dim = x->dims();
const framework::DDim &y_dim = y->dims(); const framework::DDim &y_dim = y->dims();
if (UseIntermediateOut) { if (UseIntermediateOut) {
PADDLE_ENFORCE(intermediate_out, "intermediate_out should not be nullptr"); PADDLE_ENFORCE(intermediate_out, "intermediate_out should not be nullptr");
} }
if (x_dim == y_dim) { if (x_dim == y_dim) {
FusedElemwiseAndActGradComputeNoBroadcast<DeviceContext, T, DX_OP, DY_OP, FusedElemwiseAndActGradComputeNoBroadcast<
UseIntermediateOut>( DeviceContext, T, DX_OP, DY_OP, DIntermediate_OP, UseIntermediateOut>(
ctx, x_dim, y_dim, x, y, intermediate_out, out, dout, axis, dx, dy, ctx, x_dim, y_dim, x, y, intermediate_out, out, dout, axis, dx, dy,
dx_op, dy_op); dintermediate, dx_op, dy_op, dintermediate_op);
} else { // Y is a scalar } else { // Y is a scalar
bool bcast_y = x_dim.size() >= y_dim.size(); bool bcast_y = x_dim.size() >= y_dim.size();
if (x_dim.size() == y_dim.size()) { if (x_dim.size() == y_dim.size()) {
...@@ -1420,16 +1544,16 @@ void FusedElemwiseAndActGradComputeEx( ...@@ -1420,16 +1544,16 @@ void FusedElemwiseAndActGradComputeEx(
// z = f1(f2(x, y)) // z = f1(f2(x, y))
if (bcast_y) { // Y should be broadcast. if (bcast_y) { // Y should be broadcast.
FusedElemwiseAndActGradComputeWithBroadcast< FusedElemwiseAndActGradComputeWithBroadcast<
DeviceContext, T, DX_OP, DY_OP, UseIntermediateOut, true /*BcastY*/, DeviceContext, T, DX_OP, DY_OP, DIntermediate_OP, UseIntermediateOut,
SameShapeOfIntermediateOutAndOut>(ctx, x_dim, y_dim, x, y, true /*BcastY*/, SameShapeOfIntermediateOutAndOut>(
intermediate_out, out, dout, axis, ctx, x_dim, y_dim, x, y, intermediate_out, out, dout, axis, dx, dy,
dx, dy, dx_op, dy_op); dintermediate, dx_op, dy_op, dintermediate_op);
} else { } else {
FusedElemwiseAndActGradComputeWithBroadcast< FusedElemwiseAndActGradComputeWithBroadcast<
DeviceContext, T, DX_OP, DY_OP, UseIntermediateOut, false /*BcastY*/, DeviceContext, T, DX_OP, DY_OP, DIntermediate_OP, UseIntermediateOut,
SameShapeOfIntermediateOutAndOut>(ctx, y_dim, x_dim, x, y, false /*BcastY*/, SameShapeOfIntermediateOutAndOut>(
intermediate_out, out, dout, axis, ctx, y_dim, x_dim, x, y, intermediate_out, out, dout, axis, dx, dy,
dx, dy, dx_op, dy_op); dintermediate, dx_op, dy_op, dintermediate_op);
} }
} }
} }
...@@ -1444,7 +1568,7 @@ void FusedElemwiseAndActComputeEx(const framework::ExecutionContext &ctx, ...@@ -1444,7 +1568,7 @@ void FusedElemwiseAndActComputeEx(const framework::ExecutionContext &ctx,
framework::Tensor *intermediate_out) { framework::Tensor *intermediate_out) {
if (KeepIntermediateOut) { if (KeepIntermediateOut) {
PADDLE_ENFORCE(intermediate_out, PADDLE_ENFORCE(intermediate_out,
"The keep_intermediate_value is opened, " "The save_intermediate_out is opened, "
"intermediate_out should not be nullptr."); "intermediate_out should not be nullptr.");
} }
......
...@@ -50,7 +50,7 @@ class ExtractRowsOp : public framework::OperatorBase { ...@@ -50,7 +50,7 @@ class ExtractRowsOp : public framework::OperatorBase {
auto &in = scope.FindVar(Input("X"))->Get<framework::SelectedRows>(); auto &in = scope.FindVar(Input("X"))->Get<framework::SelectedRows>();
auto out = scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>(); auto out = scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
auto &in_rows = in.rows(); auto in_rows = in.rows();
auto out_dim = framework::make_ddim( auto out_dim = framework::make_ddim(
std::vector<int64_t>{static_cast<int64_t>(in_rows.size()), 1}); std::vector<int64_t>{static_cast<int64_t>(in_rows.size()), 1});
auto dst_ptr = out->mutable_data<int64_t>(out_dim, in.place()); auto dst_ptr = out->mutable_data<int64_t>(out_dim, in.place());
......
...@@ -13,18 +13,11 @@ See the License for the specific language governing permissions and ...@@ -13,18 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/fused_elemwise_activation_op.h" #include "paddle/fluid/operators/fused_elemwise_activation_op.h"
#include <string>
#include <vector>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
/* bool IsUnaryCompound(const std::vector<std::string> &functor_list) {
* Whether the compound function is Unary(Binary(X, Y)).
* For Unary(Binary(X, Y)), the intermediate_out's shape is the same the final
* out.
*/
static bool IsUnaryCompound(const std::vector<std::string> &functor_list) {
PADDLE_ENFORCE_EQ(functor_list.size(), 2); PADDLE_ENFORCE_EQ(functor_list.size(), 2);
static std::unordered_set<std::string> binary_fun = { static std::unordered_set<std::string> binary_fun = {
"elementwise_add", "elementwise_mul", "elementwise_add_grad", "elementwise_add", "elementwise_mul", "elementwise_add_grad",
...@@ -32,10 +25,17 @@ static bool IsUnaryCompound(const std::vector<std::string> &functor_list) { ...@@ -32,10 +25,17 @@ static bool IsUnaryCompound(const std::vector<std::string> &functor_list) {
return binary_fun.count(functor_list[1]) != 0; return binary_fun.count(functor_list[1]) != 0;
} }
/* bool HasInPlaceUnary(const std::vector<std::string> &functor_list) {
* Whether the Input(X) could be absent. PADDLE_ENFORCE_EQ(functor_list.size(), 2);
*/ static std::unordered_set<std::string> InplaceOpSet = {"relu", "relu_grad"};
static bool InputXCanBeAbsent(const std::vector<std::string> &functor_list) { bool is_in_place = false;
for (auto &func_name : functor_list) {
is_in_place |= (InplaceOpSet.count(func_name) == 1);
}
return is_in_place;
}
bool InputXCanBeAbsent(const std::vector<std::string> &functor_list) {
PADDLE_ENFORCE_EQ(functor_list.size(), 2); PADDLE_ENFORCE_EQ(functor_list.size(), 2);
static std::unordered_set<std::string> binary_fun = {"elementwise_add_grad"}; static std::unordered_set<std::string> binary_fun = {"elementwise_add_grad"};
return binary_fun.count(functor_list[0]) != 0 || return binary_fun.count(functor_list[0]) != 0 ||
...@@ -86,20 +86,12 @@ class FusedElemwiseActivationOp : public framework::OperatorWithKernel { ...@@ -86,20 +86,12 @@ class FusedElemwiseActivationOp : public framework::OperatorWithKernel {
// Whether the shape of Y is a continuous subsequence of X, // Whether the shape of Y is a continuous subsequence of X,
// For more information please refer to the op's introduction. // For more information please refer to the op's introduction.
bool bcast_y = x_dim.size() >= y_dim.size(); bool bcast_y = IsBcastY(x_dim, y_dim);
if (x_dim.size() == y_dim.size()) {
for (int i = 0; i < x_dim.size(); ++i) {
if (x_dim[i] < y_dim[i]) {
bcast_y = false;
break;
}
}
}
auto &out_dim = bcast_y ? x_dim : y_dim; auto &out_dim = bcast_y ? x_dim : y_dim;
std::string out_lod = bcast_y ? "X" : "Y"; std::string out_lod = bcast_y ? "X" : "Y";
if (ctx->Attrs().Get<bool>("keep_intermediate_value")) { if (ctx->Attrs().Get<bool>("save_intermediate_out")) {
PADDLE_ENFORCE(ctx->HasOutput("IntermediateOut"), PADDLE_ENFORCE(ctx->HasOutput("IntermediateOut"),
"Output(IntermediateOut) of FusedElemwiseActivationOp " "Output(IntermediateOut) of FusedElemwiseActivationOp "
"should not be null."); "should not be null.");
...@@ -123,6 +115,20 @@ class FusedElemwiseActivationOp : public framework::OperatorWithKernel { ...@@ -123,6 +115,20 @@ class FusedElemwiseActivationOp : public framework::OperatorWithKernel {
ctx->ShareLoD(out_lod, /*->*/ "Out"); ctx->ShareLoD(out_lod, /*->*/ "Out");
} }
static bool IsBcastY(const framework::DDim &x_dim,
const framework::DDim &y_dim) {
bool bcast_y = x_dim.size() >= y_dim.size();
if (x_dim.size() == y_dim.size()) {
for (int i = 0; i < x_dim.size(); ++i) {
if (x_dim[i] < y_dim[i]) {
bcast_y = false;
break;
}
}
}
return bcast_y;
}
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
...@@ -157,17 +163,7 @@ class FusedElemwiseActivationMaker : public framework::OpProtoAndCheckerMaker { ...@@ -157,17 +163,7 @@ class FusedElemwiseActivationMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<float>("scale", AddAttr<float>("scale",
"scale is used by scale_op, the default value is 0.0.") "scale is used by scale_op, the default value is 0.0.")
.SetDefault(0.0); .SetDefault(0.0);
AddAttr<bool>( AddAttr<bool>("save_intermediate_out",
"recomputation",
"Whether to recompute the Out."
"The computation of fused_elemwise_activation_grad has two methods to "
"get the dx and dy, one is to use the 'Out', and the other is not. "
"The former method will save the time of recomputing the 'Out', but it "
"must occupy the memory to store the 'out'. While, the later method "
"can avoid occupying the memory, but it must recompute the 'Out'. "
"It is useful for Unary(Binary(X, Y)). The default value is true.")
.SetDefault(true);
AddAttr<bool>("keep_intermediate_value",
"Whether to save the intermediate_out.") "Whether to save the intermediate_out.")
.SetDefault(false); .SetDefault(false);
AddAttr<std::vector<std::string>>("functor_list", AddAttr<std::vector<std::string>>("functor_list",
...@@ -227,30 +223,38 @@ class FusedElemwiseActivationGradMaker ...@@ -227,30 +223,38 @@ class FusedElemwiseActivationGradMaker
protected: protected:
std::unique_ptr<framework::OpDesc> Apply() const override { std::unique_ptr<framework::OpDesc> Apply() const override {
auto *op_desc_ptr = new framework::OpDesc(); auto *grad_op = new framework::OpDesc();
op_desc_ptr->SetType(this->ForwardOpType() + "_grad"); grad_op->SetType(this->ForwardOpType() + "_grad");
for (auto &input_param : this->InputNames()) { for (auto &input_param : this->InputNames()) {
op_desc_ptr->SetInput(input_param, this->Input(input_param)); grad_op->SetInput(input_param, this->Input(input_param));
op_desc_ptr->SetOutput(framework::GradVarName(input_param), grad_op->SetOutput(framework::GradVarName(input_param),
this->InputGrad(input_param, true)); this->InputGrad(input_param, true));
} }
for (auto &output_param : this->OutputNames()) { grad_op->SetInput("Out", this->Output("Out"));
op_desc_ptr->SetInput(output_param, this->Output(output_param)); grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
op_desc_ptr->SetInput(framework::GradVarName(output_param),
this->OutputGrad(output_param));
}
op_desc_ptr->SetAttrMap(this->Attrs()); grad_op->SetAttrMap(this->Attrs());
std::vector<std::string> functor_names = std::vector<std::string> functor_names =
boost::get<std::vector<std::string>>( boost::get<std::vector<std::string>>(grad_op->GetAttr("functor_list"));
op_desc_ptr->GetAttr("functor_list"));
functor_names[0] += "_grad"; functor_names[0] += "_grad";
functor_names[1] += "_grad"; functor_names[1] += "_grad";
op_desc_ptr->SetAttr("functor_list", functor_names); grad_op->SetAttr("functor_list", functor_names);
return std::unique_ptr<framework::OpDesc>(op_desc_ptr);
if (boost::get<bool>(grad_op->GetAttr("save_intermediate_out"))) {
PADDLE_ENFORCE_NE(Output("IntermediateOut").size(), 0);
grad_op->SetInput("IntermediateOut", this->Output("IntermediateOut"));
grad_op->SetOutput(framework::GradVarName("IntermediateOut"),
this->OutputGrad("IntermediateOut"));
} else {
grad_op->SetInput("IntermediateOut", {});
grad_op->SetOutput(framework::GradVarName("IntermediateOut"), {});
}
return std::unique_ptr<framework::OpDesc>(grad_op);
} }
}; };
...@@ -261,56 +265,65 @@ class FusedElemwiseActivationOpGrad : public framework::OperatorWithKernel { ...@@ -261,56 +265,65 @@ class FusedElemwiseActivationOpGrad : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
"Input(Out@Grad) should not be null"); "Input(Out@Grad) should not be null");
if (ctx->Attrs().Get<bool>("keep_intermediate_value")) {
auto functor_list =
ctx->Attrs().Get<std::vector<std::string>>("functor_list");
if (ctx->Attrs().Get<bool>("save_intermediate_out")) {
PADDLE_ENFORCE(ctx->HasInput("IntermediateOut"), PADDLE_ENFORCE(ctx->HasInput("IntermediateOut"),
"Input(IntermediateOut) should not be null"); "Input(IntermediateOut) should not be null");
} else { } else {
PADDLE_ENFORCE_EQ(ctx->Inputs(framework::GradVarName("Out")).size(), 1); if (!InputXCanBeAbsent(functor_list)) {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
}
} }
auto funtor_list =
ctx->Attrs().Get<std::vector<std::string>>("functor_list");
auto x_grad_name = framework::GradVarName("X"); auto x_grad_name = framework::GradVarName("X");
auto y_grad_name = framework::GradVarName("Y"); auto y_grad_name = framework::GradVarName("Y");
auto inter_grad_name = framework::GradVarName("IntermediateOut");
if (ctx->HasOutput(x_grad_name)) { if (ctx->HasOutput(x_grad_name)) {
if (ctx->HasInputs("X")) { if (ctx->HasInputs("X")) {
ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X")); ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X"));
ctx->ShareLoD("X", x_grad_name); ctx->ShareLoD("X", x_grad_name);
} else { } else {
// Node: If "X" is absence, the shape of Y should be a continuous
// subsequence of X, if not, we could not infer the shape of dx.
// Currently, only when Binary is elementwise_add or elementwise_sub, // Currently, only when Binary is elementwise_add or elementwise_sub,
// the "X" could be absent. // the "X" could be absent.
PADDLE_ENFORCE(InputXCanBeAbsent(funtor_list), PADDLE_ENFORCE(InputXCanBeAbsent(functor_list),
"Only when BinaryFunctor is elementwise_add, the 'X' " "Only when BinaryFunctor is elementwise_add, the 'X' "
"could be absent."); "could be absent.");
// For Unary(Binary(X, Y)), IntermediateOut should not be empty. // Note: If "X" is absent, the shape of Y should be a continuous
if (IsUnaryCompound(funtor_list)) { // subsequence of X; otherwise, we could not infer the shape of dx.
PADDLE_ENFORCE(
ctx->HasInputs("IntermediateOut"),
"If the compound_functor is Unary(Binary(X, Y)) and Binary "
"is elementwise_add, the intermediate_out must be not absent.");
}
ctx->SetOutputDim(x_grad_name, ctx->SetOutputDim(x_grad_name,
ctx->GetInputDim(framework::GradVarName("Out"))); ctx->GetInputDim(framework::GradVarName("Out")));
ctx->ShareLoD(framework::GradVarName("Out"), x_grad_name); ctx->ShareLoD(framework::GradVarName("Out"), x_grad_name);
} }
} }
if (ctx->HasOutput(y_grad_name)) { if (ctx->HasOutput(y_grad_name)) {
PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
ctx->SetOutputDim(y_grad_name, ctx->GetInputDim("Y")); ctx->SetOutputDim(y_grad_name, ctx->GetInputDim("Y"));
ctx->ShareLoD("Y", y_grad_name); ctx->ShareLoD("Y", y_grad_name);
} }
if (ctx->HasOutput(inter_grad_name)) {
// For Unary(Binary(X, Y)), IntermediateOut should not be empty.
if (IsUnaryCompound(functor_list)) {
ctx->SetOutputDim(inter_grad_name,
ctx->GetInputDim(framework::GradVarName("Out")));
ctx->ShareLoD(framework::GradVarName("Out"), inter_grad_name);
} else {
ctx->SetOutputDim(inter_grad_name, ctx->GetInputDim("Y"));
ctx->ShareLoD("Y", inter_grad_name);
}
}
} }
protected: protected:
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
// PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
auto input_data_type_index = ctx.Input<framework::Tensor>("Y")->type(); auto input_data_type_index = ctx.Input<framework::Tensor>("Y")->type();
auto input_data_type = framework::ToDataType(input_data_type_index); auto input_data_type = framework::ToDataType(input_data_type_index);
return framework::OpKernelType(input_data_type, ctx.GetPlace()); return framework::OpKernelType(input_data_type, ctx.GetPlace());
......
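A short editorial aside on the InputXCanBeAbsent checks above (reasoning mine, not text from the patch): when the binary functor is elementwise_add, the gradient of the sum with respect to either operand is 1, so the values of X (and of Out) are never needed to form dX:

\[
Z = X + U(Y) \;\Rightarrow\; \frac{\partial Z}{\partial X} = 1, \qquad
dX = dZ, \qquad dY = dZ \cdot U'(Y).
\]

Since dX equals dOut elementwise, InferShape can take dX's dims and LoD from Out@Grad when X is not fed.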
...@@ -26,6 +26,24 @@ limitations under the License. */ ...@@ -26,6 +26,24 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
/**
* Whether the compound function is Unary(Binary(X, Y)).
 * For Unary(Binary(X, Y)), the intermediate_out's shape is the same as the
 * final out.
*/
bool IsUnaryCompound(const std::vector<std::string> &functor_list);
/**
 * For the in-place unary functor, the inputs of the op_desc contain only Out
 * and Out@Grad.
*/
bool HasInPlaceUnary(const std::vector<std::string> &functor_list);
/**
* Whether the Input(X) could be absent.
*/
bool InputXCanBeAbsent(const std::vector<std::string> &functor_list);
template <typename DeviceContext, typename T, typename BinaryFunctor, template <typename DeviceContext, typename T, typename BinaryFunctor,
typename UnaryFunctor> typename UnaryFunctor>
static void RunBinaryCompoundFunctor( static void RunBinaryCompoundFunctor(
...@@ -39,7 +57,7 @@ static void RunBinaryCompoundFunctor( ...@@ -39,7 +57,7 @@ static void RunBinaryCompoundFunctor(
paddle::operators::math::BinaryCompoundFunctor<T, BinaryFunctor, UnaryFunctor> paddle::operators::math::BinaryCompoundFunctor<T, BinaryFunctor, UnaryFunctor>
compound_func(binary_functor, unary_functor); compound_func(binary_functor, unary_functor);
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
if (ctx.Attr<bool>("keep_intermediate_value")) { if (ctx.Attr<bool>("save_intermediate_out")) {
FusedElemwiseAndActComputeEx<DeviceContext, T, FusedElemwiseAndActComputeEx<DeviceContext, T,
paddle::operators::math::BinaryCompoundFunctor< paddle::operators::math::BinaryCompoundFunctor<
T, BinaryFunctor, UnaryFunctor>, T, BinaryFunctor, UnaryFunctor>,
...@@ -71,7 +89,7 @@ static void RunUnaryCompoundFunctors( ...@@ -71,7 +89,7 @@ static void RunUnaryCompoundFunctors(
paddle::operators::math::UnaryCompoundFunctor<T, UnaryFunctor, BinaryFunctor> paddle::operators::math::UnaryCompoundFunctor<T, UnaryFunctor, BinaryFunctor>
compound_func(unary_functor, binary_functor); compound_func(unary_functor, binary_functor);
if (ctx.Attr<bool>("keep_intermediate_value")) { if (ctx.Attr<bool>("save_intermediate_out")) {
FusedElemwiseAndActComputeEx<DeviceContext, T, FusedElemwiseAndActComputeEx<DeviceContext, T,
paddle::operators::math::UnaryCompoundFunctor< paddle::operators::math::UnaryCompoundFunctor<
T, UnaryFunctor, BinaryFunctor>, T, UnaryFunctor, BinaryFunctor>,
...@@ -89,7 +107,7 @@ static void RunUnaryCompoundFunctors( ...@@ -89,7 +107,7 @@ static void RunUnaryCompoundFunctors(
} }
template <typename DeviceContext, typename T, typename BinaryGradFunctor, template <typename DeviceContext, typename T, typename BinaryGradFunctor,
typename UnaryFunctor, typename UnaryGradFunctor> typename UnaryFunctor, typename UnaryGradFunctor, bool InPlace>
static void RunBinaryCompoundGradFunctors( static void RunBinaryCompoundGradFunctors(
const framework::ExecutionContext &ctx, const framework::ExecutionContext &ctx,
const BinaryGradFunctor &binary_grad_functor, const BinaryGradFunctor &binary_grad_functor,
...@@ -98,7 +116,7 @@ static void RunBinaryCompoundGradFunctors( ...@@ -98,7 +116,7 @@ static void RunBinaryCompoundGradFunctors(
const framework::Tensor *in_y, const framework::Tensor *in_out, const framework::Tensor *in_y, const framework::Tensor *in_out,
const framework::Tensor *in_intermediate_out, const framework::Tensor *in_intermediate_out,
const framework::Tensor *in_out_grad, framework::Tensor *x_grad, const framework::Tensor *in_out_grad, framework::Tensor *x_grad,
framework::Tensor *y_grad) { framework::Tensor *y_grad, framework::Tensor *d_intermediate_out) {
// Z = Binary(X, Unary(Y)) // Z = Binary(X, Unary(Y))
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
...@@ -107,32 +125,40 @@ static void RunBinaryCompoundGradFunctors( ...@@ -107,32 +125,40 @@ static void RunBinaryCompoundGradFunctors(
UnaryFunctor>; UnaryFunctor>;
using BinaryCompoundDyFunctor = using BinaryCompoundDyFunctor =
paddle::operators::math::BinaryCompoundGradDyFunctor< paddle::operators::math::BinaryCompoundGradDyFunctor<
T, BinaryGradFunctor, UnaryFunctor, UnaryGradFunctor>; T, BinaryGradFunctor, UnaryFunctor, UnaryGradFunctor, InPlace>;
using BinaryCompoundDIntermedaiteOutFunctor =
paddle::operators::math::BinaryCompoundGradDIntermedaiteOutFunctor<
T, BinaryGradFunctor, UnaryFunctor>;
if (in_intermediate_out) { if (in_intermediate_out) {
FusedElemwiseAndActGradComputeEx< FusedElemwiseAndActGradComputeEx<
DeviceContext, T, BinaryCompoundDxFunctor, BinaryCompoundDyFunctor, DeviceContext, T, BinaryCompoundDxFunctor, BinaryCompoundDyFunctor,
true /*UseIntermediateOut*/, BinaryCompoundDIntermedaiteOutFunctor, true /*UseIntermediateOut*/,
false /*SameShapeOfIntermediateOutAndOut*/>( false /*SameShapeOfIntermediateOutAndOut*/>(
ctx, in_x, in_y, in_out, in_intermediate_out, in_out_grad, axis, x_grad, ctx, in_x, in_y, in_out, in_intermediate_out, in_out_grad, axis, x_grad,
y_grad, BinaryCompoundDxFunctor(binary_grad_functor, unary_functor), y_grad, d_intermediate_out,
BinaryCompoundDxFunctor(binary_grad_functor, unary_functor),
BinaryCompoundDyFunctor(binary_grad_functor, unary_functor, BinaryCompoundDyFunctor(binary_grad_functor, unary_functor,
unary_grad_functor)); unary_grad_functor),
BinaryCompoundDIntermedaiteOutFunctor(binary_grad_functor,
unary_functor));
} else { } else {
FusedElemwiseAndActGradComputeEx< FusedElemwiseAndActGradComputeEx<
DeviceContext, T, BinaryCompoundDxFunctor, BinaryCompoundDyFunctor, DeviceContext, T, BinaryCompoundDxFunctor, BinaryCompoundDyFunctor,
false /*UseIntermediateOut*/, BinaryCompoundDIntermedaiteOutFunctor, false /*UseIntermediateOut*/,
false /*SameShapeOfIntermediateOutAndOut*/>( false /*SameShapeOfIntermediateOutAndOut*/>(
ctx, in_x, in_y, in_out, in_intermediate_out, in_out_grad, axis, x_grad, ctx, in_x, in_y, in_out, in_intermediate_out, in_out_grad, axis, x_grad,
y_grad, BinaryCompoundDxFunctor(binary_grad_functor, unary_functor), y_grad, d_intermediate_out,
BinaryCompoundDxFunctor(binary_grad_functor, unary_functor),
BinaryCompoundDyFunctor(binary_grad_functor, unary_functor, BinaryCompoundDyFunctor(binary_grad_functor, unary_functor,
unary_grad_functor)); unary_grad_functor),
BinaryCompoundDIntermedaiteOutFunctor(binary_grad_functor,
unary_functor));
} }
} }
template <typename DeviceContext, typename T, typename UnaryGradFunctor, template <typename DeviceContext, typename T, typename UnaryGradFunctor,
typename BinaryFunctor, typename BinaryGradFunctor, typename BinaryFunctor, typename BinaryGradFunctor, bool InPlace>
bool Recomputation = true>
static void RunUnaryCompoundGradFunctors( static void RunUnaryCompoundGradFunctors(
const framework::ExecutionContext &ctx, const framework::ExecutionContext &ctx,
const UnaryGradFunctor &unary_grad_functor, const UnaryGradFunctor &unary_grad_functor,
...@@ -141,36 +167,44 @@ static void RunUnaryCompoundGradFunctors( ...@@ -141,36 +167,44 @@ static void RunUnaryCompoundGradFunctors(
const framework::Tensor *in_y, const framework::Tensor *in_out, const framework::Tensor *in_y, const framework::Tensor *in_out,
const framework::Tensor *in_intermediate_out, const framework::Tensor *in_intermediate_out,
const framework::Tensor *in_out_grad, framework::Tensor *x_grad, const framework::Tensor *in_out_grad, framework::Tensor *x_grad,
framework::Tensor *y_grad) { framework::Tensor *y_grad, framework::Tensor *d_intermediate_out) {
// Z = Unary(Binary(X, Y)) // Z = Unary(Binary(X, Y))
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
using UnaryCompoundDxFunctor = using UnaryCompoundDxFunctor =
paddle::operators::math::UnaryCompoundGradDxFunctor< paddle::operators::math::UnaryCompoundGradDxFunctor<
T, UnaryGradFunctor, BinaryFunctor, BinaryGradFunctor, Recomputation>; T, UnaryGradFunctor, BinaryFunctor, BinaryGradFunctor, InPlace>;
using UnaryCompoundDyFunctor = using UnaryCompoundDyFunctor =
paddle::operators::math::UnaryCompoundGradDyFunctor< paddle::operators::math::UnaryCompoundGradDyFunctor<
T, UnaryGradFunctor, BinaryFunctor, BinaryGradFunctor, Recomputation>; T, UnaryGradFunctor, BinaryFunctor, BinaryGradFunctor, InPlace>;
using UnaryCompoundDIntermediateFunctor =
paddle::operators::math::UnaryCompoundGradDIntermediateFunctor<
T, UnaryGradFunctor, BinaryFunctor, InPlace>;
if (in_intermediate_out) { if (in_intermediate_out) {
FusedElemwiseAndActGradComputeEx< FusedElemwiseAndActGradComputeEx<
DeviceContext, T, UnaryCompoundDxFunctor, UnaryCompoundDyFunctor, DeviceContext, T, UnaryCompoundDxFunctor, UnaryCompoundDyFunctor,
true /*UseIntermediateOut*/, true /*SameShapeOfIntermediateOutAndOut*/>( UnaryCompoundDIntermediateFunctor, true /*UseIntermediateOut*/,
true /*SameShapeOfIntermediateOutAndOut*/>(
ctx, in_x, in_y, in_out, in_intermediate_out, in_out_grad, axis, x_grad, ctx, in_x, in_y, in_out, in_intermediate_out, in_out_grad, axis, x_grad,
y_grad, UnaryCompoundDxFunctor(unary_grad_functor, binary_functor, y_grad, d_intermediate_out,
binary_grad_functor), UnaryCompoundDxFunctor(unary_grad_functor, binary_functor,
binary_grad_functor),
UnaryCompoundDyFunctor(unary_grad_functor, binary_functor, UnaryCompoundDyFunctor(unary_grad_functor, binary_functor,
binary_grad_functor)); binary_grad_functor),
UnaryCompoundDIntermediateFunctor(unary_grad_functor, binary_functor));
} else { } else {
FusedElemwiseAndActGradComputeEx<DeviceContext, T, UnaryCompoundDxFunctor, FusedElemwiseAndActGradComputeEx<
UnaryCompoundDyFunctor, DeviceContext, T, UnaryCompoundDxFunctor, UnaryCompoundDyFunctor,
false /*UseIntermediateOut*/, UnaryCompoundDIntermediateFunctor, false /*UseIntermediateOut*/,
true /*SameShapeOfIntermediateOutAndOut*/>( true /*SameShapeOfIntermediateOutAndOut*/>(
ctx, in_x, in_y, in_out, in_intermediate_out, in_out_grad, axis, x_grad, ctx, in_x, in_y, in_out, in_intermediate_out, in_out_grad, axis, x_grad,
y_grad, UnaryCompoundDxFunctor(unary_grad_functor, binary_functor, y_grad, d_intermediate_out,
binary_grad_functor), UnaryCompoundDxFunctor(unary_grad_functor, binary_functor,
binary_grad_functor),
UnaryCompoundDyFunctor(unary_grad_functor, binary_functor, UnaryCompoundDyFunctor(unary_grad_functor, binary_functor,
binary_grad_functor)); binary_grad_functor),
UnaryCompoundDIntermediateFunctor(unary_grad_functor, binary_functor));
} }
} }
...@@ -226,72 +260,67 @@ static void RunFunctors(const framework::ExecutionContext &ctx, ...@@ -226,72 +260,67 @@ static void RunFunctors(const framework::ExecutionContext &ctx,
} }
} }
template <typename DeviceContext, typename T, bool ReComputation> template <typename DeviceContext, typename T, bool InPlace>
static void RunGradFunctors(const framework::ExecutionContext &ctx, static void RunGradFunctors(
const framework::Tensor *in_x, const framework::ExecutionContext &ctx, const framework::Tensor *in_x,
const framework::Tensor *in_y, const framework::Tensor *in_y, const framework::Tensor *in_out,
const framework::Tensor *in_out, const framework::Tensor *in_intermediate_out,
const framework::Tensor *in_intermediate_out, const framework::Tensor *in_out_grad, framework::Tensor *x_grad,
const framework::Tensor *in_out_grad, framework::Tensor *y_grad, framework::Tensor *d_intermediate_out) {
framework::Tensor *x_grad,
framework::Tensor *y_grad) {
auto &functors = ctx.Attr<std::vector<std::string>>("functor_list"); auto &functors = ctx.Attr<std::vector<std::string>>("functor_list");
auto funcs_str = functors[0] + "," + functors[1]; auto funcs_str = functors[0] + "," + functors[1];
// TODO(zcd): The following code can be refined, for example, by using registration.
if (funcs_str == "elementwise_add_grad,scale_grad") { if (funcs_str == "elementwise_add_grad,scale_grad") {
// The backward of Z = Binary(X, Unary(Y)) // The backward of Z = Binary(X, Unary(Y))
T scale = static_cast<T>(ctx.Attr<float>("scale")); T scale = static_cast<T>(ctx.Attr<float>("scale"));
RunBinaryCompoundGradFunctors<DeviceContext, T, RunBinaryCompoundGradFunctors<
paddle::operators::math::AddGradFunctor<T>, DeviceContext, T, paddle::operators::math::AddGradFunctor<T>,
paddle::operators::math::ScaleFunctor<T>, paddle::operators::math::ScaleFunctor<T>,
paddle::operators::math::ScaleGradFunctor<T>>( paddle::operators::math::ScaleGradFunctor<T>, InPlace>(
ctx, paddle::operators::math::AddGradFunctor<T>(), ctx, paddle::operators::math::AddGradFunctor<T>(),
paddle::operators::math::ScaleFunctor<T>(scale), paddle::operators::math::ScaleFunctor<T>(scale),
paddle::operators::math::ScaleGradFunctor<T>(scale), in_x, in_y, in_out, paddle::operators::math::ScaleGradFunctor<T>(scale), in_x, in_y, in_out,
in_intermediate_out, in_out_grad, x_grad, y_grad); in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out);
} else if (funcs_str == "scale_grad,elementwise_add_grad") { } else if (funcs_str == "scale_grad,elementwise_add_grad") {
// The backward of Z = Unary(Binary(X, Y)) // The backward of Z = Unary(Binary(X, Y))
T scale = static_cast<T>(ctx.Attr<float>("scale")); T scale = static_cast<T>(ctx.Attr<float>("scale"));
RunUnaryCompoundGradFunctors<DeviceContext, T, RunUnaryCompoundGradFunctors<
paddle::operators::math::ScaleGradFunctor<T>, DeviceContext, T, paddle::operators::math::ScaleGradFunctor<T>,
paddle::operators::math::AddFunctor<T>, paddle::operators::math::AddFunctor<T>,
paddle::operators::math::AddGradFunctor<T>, paddle::operators::math::AddGradFunctor<T>, InPlace>(
ReComputation /*Recomputation*/>(
ctx, paddle::operators::math::ScaleGradFunctor<T>(scale), ctx, paddle::operators::math::ScaleGradFunctor<T>(scale),
paddle::operators::math::AddFunctor<T>(), paddle::operators::math::AddFunctor<T>(),
paddle::operators::math::AddGradFunctor<T>(), in_x, in_y, in_out, paddle::operators::math::AddGradFunctor<T>(), in_x, in_y, in_out,
in_intermediate_out, in_out_grad, x_grad, y_grad); in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out);
} else if (funcs_str == "elementwise_add_grad,relu_grad") { } else if (funcs_str == "elementwise_add_grad,relu_grad") {
RunBinaryCompoundGradFunctors<DeviceContext, T, RunBinaryCompoundGradFunctors<
paddle::operators::math::AddGradFunctor<T>, DeviceContext, T, paddle::operators::math::AddGradFunctor<T>,
paddle::operators::math::ReluFunctor<T>, paddle::operators::math::ReluFunctor<T>,
paddle::operators::math::ReluGradFunctor<T>>( paddle::operators::math::ReluGradFunctor<T>, InPlace>(
ctx, paddle::operators::math::AddGradFunctor<T>(), ctx, paddle::operators::math::AddGradFunctor<T>(),
paddle::operators::math::ReluFunctor<T>(), paddle::operators::math::ReluFunctor<T>(),
paddle::operators::math::ReluGradFunctor<T>(), in_x, in_y, in_out, paddle::operators::math::ReluGradFunctor<T>(), in_x, in_y, in_out,
in_intermediate_out, in_out_grad, x_grad, y_grad); in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out);
} else if (funcs_str == "relu_grad,elementwise_add_grad") { } else if (funcs_str == "relu_grad,elementwise_add_grad") {
RunUnaryCompoundGradFunctors<DeviceContext, T, RunUnaryCompoundGradFunctors<
paddle::operators::math::ReluGradFunctor<T>, DeviceContext, T, paddle::operators::math::ReluGradFunctor<T>,
paddle::operators::math::AddFunctor<T>, paddle::operators::math::AddFunctor<T>,
paddle::operators::math::AddGradFunctor<T>, paddle::operators::math::AddGradFunctor<T>, InPlace>(
ReComputation /*Recomputation*/>(
ctx, paddle::operators::math::ReluGradFunctor<T>(), ctx, paddle::operators::math::ReluGradFunctor<T>(),
paddle::operators::math::AddFunctor<T>(), paddle::operators::math::AddFunctor<T>(),
paddle::operators::math::AddGradFunctor<T>(), in_x, in_y, in_out, paddle::operators::math::AddGradFunctor<T>(), in_x, in_y, in_out,
in_intermediate_out, in_out_grad, x_grad, y_grad); in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out);
} else if (funcs_str == "elementwise_mul_grad,scale_grad") { } else if (funcs_str == "elementwise_mul_grad,scale_grad") {
// The backward of Z = Binary(X, Unary(Y)) // The backward of Z = Binary(X, Unary(Y))
T scale = static_cast<T>(ctx.Attr<float>("scale")); T scale = static_cast<T>(ctx.Attr<float>("scale"));
RunBinaryCompoundGradFunctors<DeviceContext, T, RunBinaryCompoundGradFunctors<
paddle::operators::math::MulGradFunctor<T>, DeviceContext, T, paddle::operators::math::MulGradFunctor<T>,
paddle::operators::math::ScaleFunctor<T>, paddle::operators::math::ScaleFunctor<T>,
paddle::operators::math::ScaleGradFunctor<T>>( paddle::operators::math::ScaleGradFunctor<T>, InPlace>(
ctx, paddle::operators::math::MulGradFunctor<T>(), ctx, paddle::operators::math::MulGradFunctor<T>(),
paddle::operators::math::ScaleFunctor<T>(scale), paddle::operators::math::ScaleFunctor<T>(scale),
paddle::operators::math::ScaleGradFunctor<T>(scale), in_x, in_y, in_out, paddle::operators::math::ScaleGradFunctor<T>(scale), in_x, in_y, in_out,
in_intermediate_out, in_out_grad, x_grad, y_grad); in_intermediate_out, in_out_grad, x_grad, y_grad, d_intermediate_out);
} else { } else {
PADDLE_THROW("%s has not been implemented.", funcs_str); PADDLE_THROW("%s has not been implemented.", funcs_str);
} }
...@@ -313,9 +342,9 @@ class FusedElemwiseActivationKernel : public framework::OpKernel<T> { ...@@ -313,9 +342,9 @@ class FusedElemwiseActivationKernel : public framework::OpKernel<T> {
std::vector<framework::Tensor *> outputs; std::vector<framework::Tensor *> outputs;
outputs.emplace_back(output); outputs.emplace_back(output);
if (ctx.Attr<bool>("keep_intermediate_value")) { if (ctx.Attr<bool>("save_intermediate_out")) {
PADDLE_ENFORCE(ctx.HasOutput("IntermediateOut"), PADDLE_ENFORCE(ctx.HasOutput("IntermediateOut"),
"The keep_intermediate_value is enable, so the " "The save_intermediate_out is enable, so the "
"IntermediateOut should not be empty."); "IntermediateOut should not be empty.");
auto intermediate_out = ctx.Output<framework::Tensor>("IntermediateOut"); auto intermediate_out = ctx.Output<framework::Tensor>("IntermediateOut");
outputs.emplace_back(intermediate_out); outputs.emplace_back(intermediate_out);
...@@ -331,65 +360,63 @@ template <typename DeviceContext, typename T> ...@@ -331,65 +360,63 @@ template <typename DeviceContext, typename T>
class FusedElemwiseActivationGradKernel : public framework::OpKernel<T> { class FusedElemwiseActivationGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext &ctx) const override { void Compute(const framework::ExecutionContext &ctx) const override {
auto x = ctx.Input<framework::Tensor>("X"); auto in_y = ctx.Input<framework::Tensor>("Y");
auto y = ctx.Input<framework::Tensor>("Y"); PADDLE_ENFORCE(in_y != nullptr, "Input(Y) should not be nullptr.");
auto in_out = ctx.Input<framework::Tensor>("Out"); auto in_out = ctx.Input<framework::Tensor>("Out");
PADDLE_ENFORCE(in_out != nullptr, "Input(Out) should not be nullptr.");
auto in_out_grad = auto in_out_grad =
ctx.Input<framework::Tensor>(framework::GradVarName("Out")); ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
PADDLE_ENFORCE(in_out_grad != nullptr,
"Input(Out@Grad) should not be nullptr.");
framework::Tensor *in_x =
const_cast<framework::Tensor *>(ctx.Input<framework::Tensor>("X"));
framework::Tensor *x_grad = framework::Tensor *x_grad =
ctx.Output<framework::Tensor>(framework::GradVarName("X")); ctx.Output<framework::Tensor>(framework::GradVarName("X"));
framework::Tensor *y_grad = framework::Tensor *y_grad =
ctx.Output<framework::Tensor>(framework::GradVarName("Y")); ctx.Output<framework::Tensor>(framework::GradVarName("Y"));
framework::Tensor *d_intermediate_out = ctx.Output<framework::Tensor>(
framework::GradVarName("IntermediateOut"));
PADDLE_ENFORCE(y != nullptr, "Input(Y) should not be nullptr.");
if (ctx.Attr<bool>("recomputation")) {
PADDLE_ENFORCE(
x != nullptr,
"The recomputation is opened, so Input(X) should not be absent.");
} else {
PADDLE_ENFORCE(in_out != nullptr,
"The recomputation is disabled, so the Input('Out') "
"should not be empty.");
}
framework::Tensor *in_x;
auto functor_list = ctx.Attr<std::vector<std::string>>("functor_list"); auto functor_list = ctx.Attr<std::vector<std::string>>("functor_list");
// If functor_list contains elementwise_add, the backward doesn't use // Get intermediate_out
// in_x, and in_outs. framework::Tensor *in_intermediate_out = nullptr;
if (x == nullptr) { if (ctx.Attr<bool>("save_intermediate_out")) {
PADDLE_ENFORCE(functor_list[0] == "elementwise_add_grad" || // If save_intermediate_out is true, then for Unary(Binary(x, y)) and
functor_list[1] == "elementwise_add_grad", // Binary(x, Unary(y)), Binary(x, y) and Unary(y) do not need to
"Only when the compoundfunctor contains " // be recomputed.
"elementwise_add_grad, the 'X' could be absent.");
in_x = const_cast<framework::Tensor *>(in_out_grad);
in_out = const_cast<framework::Tensor *>(in_out_grad);
} else {
in_x = const_cast<framework::Tensor *>(x);
}
framework::Tensor *in_intermediate_out;
if (ctx.Attr<bool>("keep_intermediate_value")) {
in_intermediate_out = const_cast<framework::Tensor *>( in_intermediate_out = const_cast<framework::Tensor *>(
ctx.Input<framework::Tensor>("IntermediateOut")); ctx.Input<framework::Tensor>("IntermediateOut"));
PADDLE_ENFORCE(in_intermediate_out != nullptr, PADDLE_ENFORCE(in_intermediate_out != nullptr,
"The option of 'keep_intermediate_value' is opened, " "The option of 'save_intermediate_out' is opened, "
"so the number of 'Out' should be two."); "so the number of 'Out' should be two.");
} else { } else {
in_intermediate_out = nullptr; if (!InputXCanBeAbsent(functor_list)) {
PADDLE_ENFORCE(in_x != nullptr, "Input(X) should not be null.");
}
}
// Get in_x
if (ctx.HasInput("X")) {
PADDLE_ENFORCE(in_x != nullptr, "Input(X) should not be nullptr.");
} else {
// If functor_list contains elementwise_add, the backward doesn't use
// in_x, in_y and in_out.
PADDLE_ENFORCE(InputXCanBeAbsent(functor_list),
"Only when the compoundfunctor contains "
"elementwise_add_grad, the 'X' could be absent.");
in_x = const_cast<framework::Tensor *>(in_out_grad);
} }
if (ctx.Attr<bool>("recomputation")) { bool has_in_place = HasInPlaceUnary(functor_list);
RunGradFunctors<DeviceContext, T, true /*Recomputation*/>( if (has_in_place) {
ctx, in_x, y, in_out, in_intermediate_out, in_out_grad, x_grad, RunGradFunctors<DeviceContext, T, true /*InPlace*/>(
y_grad); ctx, in_x, in_y, in_out, in_intermediate_out, in_out_grad, x_grad,
y_grad, d_intermediate_out);
} else { } else {
RunGradFunctors<DeviceContext, T, false /*Recomputation*/>( RunGradFunctors<DeviceContext, T, false /*InPlace*/>(
ctx, in_x, y, in_out, in_intermediate_out, in_out_grad, x_grad, ctx, in_x, in_y, in_out, in_intermediate_out, in_out_grad, x_grad,
y_grad); y_grad, d_intermediate_out);
} }
} }
}; };
......
...@@ -22,11 +22,11 @@ namespace paddle { ...@@ -22,11 +22,11 @@ namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
// Z = BinaryFunctor(X, UnaryFunctor(Y))
template <typename T, typename BinaryFunctor, typename UnaryFunctor> template <typename T, typename BinaryFunctor, typename UnaryFunctor>
struct BinaryCompoundFunctor { struct BinaryCompoundFunctor {
BinaryCompoundFunctor(const BinaryFunctor func1, const UnaryFunctor func2) BinaryCompoundFunctor(const BinaryFunctor func1, const UnaryFunctor func2)
: func1_(func1), func2_(func2) {} : func1_(func1), func2_(func2) {}
// Z = BinaryFunctor(X, UnaryFunctor(Y))
inline HOSTDEVICE T GetOut(T x, T y) { return func1_(x, func2_(y)); } inline HOSTDEVICE T GetOut(T x, T y) { return func1_(x, func2_(y)); }
...@@ -40,11 +40,11 @@ struct BinaryCompoundFunctor { ...@@ -40,11 +40,11 @@ struct BinaryCompoundFunctor {
UnaryFunctor func2_; UnaryFunctor func2_;
}; };
// Z = UnaryFunctor(BinaryFunctor(X, Y))
template <typename T, typename UnaryFunctor, typename BinaryFunctor> template <typename T, typename UnaryFunctor, typename BinaryFunctor>
struct UnaryCompoundFunctor { struct UnaryCompoundFunctor {
UnaryCompoundFunctor(const UnaryFunctor func1, const BinaryFunctor func2) UnaryCompoundFunctor(const UnaryFunctor func1, const BinaryFunctor func2)
: func1_(func1), func2_(func2) {} : func1_(func1), func2_(func2) {}
// Z = UnaryFunctor(BinaryFunctor(X, Y))
inline HOSTDEVICE T GetOut(T x, T y) { return func1_(func2_(x, y)); } inline HOSTDEVICE T GetOut(T x, T y) { return func1_(func2_(x, y)); }
...@@ -58,23 +58,19 @@ struct UnaryCompoundFunctor { ...@@ -58,23 +58,19 @@ struct UnaryCompoundFunctor {
BinaryFunctor func2_; BinaryFunctor func2_;
}; };
// FIXME(zcd): DBinaryFun and DUnaryFun have to method to get // Z = BinaryFunctor(X, UnaryFunctor(Y))
// the dx, one is to use the 'out', and the other is not to use it.
// the former method will save the time of recomputing the
// 'out', but it must occupy the memory to store the 'out'.
// While the later method can avoid occupying this memory,
// but it must recompute the 'out'.
template <typename T, typename DBinaryFun, typename UnaryFun> template <typename T, typename DBinaryFun, typename UnaryFun>
struct BinaryCompoundGradDxFunctor { struct BinaryCompoundGradDxFunctor {
BinaryCompoundGradDxFunctor(const DBinaryFun &d_binary_fun, BinaryCompoundGradDxFunctor(const DBinaryFun &d_binary_fun,
const UnaryFun &unary_fun) const UnaryFun &unary_fun)
: d_binary_fun_(d_binary_fun), unary_fun_(unary_fun) {} : d_binary_fun_(d_binary_fun), unary_fun_(unary_fun) {}
inline HOSTDEVICE T operator()(T x, T y, T out, T dout) { inline HOSTDEVICE T Recompute(T x, T y, T out, T dout) {
return dout * d_binary_fun_.Dx(x, unary_fun_(y)); return dout * d_binary_fun_.Dx(x, unary_fun_(y));
} }
inline HOSTDEVICE T operator()(T x, T y, T intermediate_out, T out, T dout) { inline HOSTDEVICE T UseIntermediateOut(T x, T y, T intermediate_out, T out,
T dout) {
return dout * d_binary_fun_.Dx(x, intermediate_out); return dout * d_binary_fun_.Dx(x, intermediate_out);
} }
...@@ -83,8 +79,9 @@ struct BinaryCompoundGradDxFunctor { ...@@ -83,8 +79,9 @@ struct BinaryCompoundGradDxFunctor {
UnaryFun unary_fun_; UnaryFun unary_fun_;
}; };
// Z = BinaryFunctor(X, UnaryFunctor(Y))
template <typename T, typename DBinaryFun, typename UnaryFun, template <typename T, typename DBinaryFun, typename UnaryFun,
typename DUnaryFun> typename DUnaryFun, bool InPlace>
struct BinaryCompoundGradDyFunctor { struct BinaryCompoundGradDyFunctor {
BinaryCompoundGradDyFunctor(const DBinaryFun &d_binary_fun, BinaryCompoundGradDyFunctor(const DBinaryFun &d_binary_fun,
const UnaryFun &unary_fun, const UnaryFun &unary_fun,
...@@ -93,13 +90,19 @@ struct BinaryCompoundGradDyFunctor { ...@@ -93,13 +90,19 @@ struct BinaryCompoundGradDyFunctor {
unary_fun_(unary_fun), unary_fun_(unary_fun),
d_unary_fun_(d_unary_fun) {} d_unary_fun_(d_unary_fun) {}
inline HOSTDEVICE T operator()(T x, T y, T out, T dout) { inline HOSTDEVICE T Recompute(T x, T y, T out, T dout) {
return dout * d_binary_fun_.Dy(x, unary_fun_(y)) * d_unary_fun_(y); return dout * d_binary_fun_.Dy(x, unary_fun_(y)) * d_unary_fun_.UseX(y);
} }
inline HOSTDEVICE T operator()(T x, T y, T intermediate_out, T out, T dout) { inline HOSTDEVICE T UseIntermediateOut(T x, T y, T intermediate_out, T out,
return dout * d_binary_fun_.Dy(x, intermediate_out) * T dout) {
d_unary_fun_(y, intermediate_out); if (InPlace) {
return dout * d_binary_fun_.Dy(x, intermediate_out) *
d_unary_fun_.UseOut(intermediate_out);
} else {
return dout * d_binary_fun_.Dy(x, intermediate_out) *
d_unary_fun_.UseXAndOut(y, intermediate_out);
}
} }
private: private:
...@@ -108,8 +111,9 @@ struct BinaryCompoundGradDyFunctor { ...@@ -108,8 +111,9 @@ struct BinaryCompoundGradDyFunctor {
DUnaryFun d_unary_fun_; DUnaryFun d_unary_fun_;
}; };
// Z = UnaryFunctor(BinaryFunctor(X, Y))
template <typename T, typename DUnaryFun, typename BinaryFun, template <typename T, typename DUnaryFun, typename BinaryFun,
typename DBinaryFun, bool Recomputation = true> typename DBinaryFun, bool InPlace>
struct UnaryCompoundGradDxFunctor { struct UnaryCompoundGradDxFunctor {
UnaryCompoundGradDxFunctor(const DUnaryFun &d_unary_fun, UnaryCompoundGradDxFunctor(const DUnaryFun &d_unary_fun,
const BinaryFun &binary_fun, const BinaryFun &binary_fun,
...@@ -118,22 +122,23 @@ struct UnaryCompoundGradDxFunctor { ...@@ -118,22 +122,23 @@ struct UnaryCompoundGradDxFunctor {
binary_fun_(binary_fun), binary_fun_(binary_fun),
d_binary_fun_(d_binary_fun) {} d_binary_fun_(d_binary_fun) {}
inline HOSTDEVICE T operator()(T x, T y, T out, T dout) { inline HOSTDEVICE T Recompute(T x, T y, T out, T dout) {
T base; T base;
if (Recomputation) { if (InPlace) {
base = dout * d_unary_fun_(binary_fun_(x, y)); base = dout * d_unary_fun_.UseOut(out);
} else { } else {
base = dout * d_unary_fun_(binary_fun_(x, y), out); base = dout * d_unary_fun_.UseXAndOut(binary_fun_(x, y), out);
} }
return base * d_binary_fun_.Dx(x, y); return base * d_binary_fun_.Dx(x, y);
} }
inline HOSTDEVICE T operator()(T x, T y, T intermediate_out, T out, T dout) { inline HOSTDEVICE T UseIntermediateOut(T x, T y, T intermediate_out, T out,
T dout) {
T base; T base;
if (Recomputation) { if (InPlace) {
base = dout * d_unary_fun_(intermediate_out); base = dout * d_unary_fun_.UseOut(out);
} else { } else {
base = dout * d_unary_fun_(intermediate_out, out); base = dout * d_unary_fun_.UseXAndOut(intermediate_out, out);
} }
return base * d_binary_fun_.Dx(x, y); return base * d_binary_fun_.Dx(x, y);
} }
...@@ -144,8 +149,9 @@ struct UnaryCompoundGradDxFunctor { ...@@ -144,8 +149,9 @@ struct UnaryCompoundGradDxFunctor {
DBinaryFun d_binary_fun_; DBinaryFun d_binary_fun_;
}; };
// Z = UnaryFunctor(BinaryFunctor(X, Y))
template <typename T, typename DUnaryFun, typename BinaryFun, template <typename T, typename DUnaryFun, typename BinaryFun,
typename DBinaryFun, bool Recomputation = true> typename DBinaryFun, bool InPlace>
struct UnaryCompoundGradDyFunctor { struct UnaryCompoundGradDyFunctor {
UnaryCompoundGradDyFunctor(const DUnaryFun &d_unary_fun, UnaryCompoundGradDyFunctor(const DUnaryFun &d_unary_fun,
const BinaryFun &binary_fun, const BinaryFun &binary_fun,
...@@ -154,22 +160,23 @@ struct UnaryCompoundGradDyFunctor { ...@@ -154,22 +160,23 @@ struct UnaryCompoundGradDyFunctor {
binary_fun_(binary_fun), binary_fun_(binary_fun),
d_binary_fun_(d_binary_fun) {} d_binary_fun_(d_binary_fun) {}
inline HOSTDEVICE T operator()(T x, T y, T out, T dout) { inline HOSTDEVICE T Recompute(T x, T y, T out, T dout) {
T base; T base;
if (Recomputation) { if (InPlace) {
base = dout * d_unary_fun_(binary_fun_(x, y)); base = dout * d_unary_fun_.UseOut(out);
} else { } else {
base = dout * d_unary_fun_(binary_fun_(x, y), out); base = dout * d_unary_fun_.UseXAndOut(binary_fun_(x, y), out);
} }
return base * d_binary_fun_.Dy(x, y); return base * d_binary_fun_.Dy(x, y);
} }
inline HOSTDEVICE T operator()(T x, T y, T intermediate_out, T out, T dout) { inline HOSTDEVICE T UseIntermediateOut(T x, T y, T intermediate_out, T out,
T dout) {
T base; T base;
if (Recomputation) { if (InPlace) {
base = dout * d_unary_fun_(intermediate_out); base = dout * d_unary_fun_.UseOut(out);
} else { } else {
base = dout * d_unary_fun_(intermediate_out, out); base = dout * d_unary_fun_.UseXAndOut(intermediate_out, out);
} }
return base * d_binary_fun_.Dy(x, y); return base * d_binary_fun_.Dy(x, y);
} }
...@@ -180,6 +187,56 @@ struct UnaryCompoundGradDyFunctor { ...@@ -180,6 +187,56 @@ struct UnaryCompoundGradDyFunctor {
DBinaryFun d_binary_fun_; DBinaryFun d_binary_fun_;
}; };
// Z = BinaryFunctor(X, UnaryFunctor(Y))
template <typename T, typename DBinaryFun, typename UnaryFun>
struct BinaryCompoundGradDIntermedaiteOutFunctor {
BinaryCompoundGradDIntermedaiteOutFunctor(const DBinaryFun &d_binary_fun,
const UnaryFun &unary_fun)
: d_binary_fun_(d_binary_fun), unary_fun_(unary_fun) {}
inline HOSTDEVICE T Recompute(T x, T y, T out, T dout) {
return dout * d_binary_fun_.Dy(x, unary_fun_(y));
}
inline HOSTDEVICE T UseIntermediateOut(T x, T intermediate_out, T out,
T dout) {
return dout * d_binary_fun_.Dy(x, intermediate_out);
}
private:
DBinaryFun d_binary_fun_;
UnaryFun unary_fun_;
};
// Z = UnaryFunctor(BinaryFunctor(X, Y))
template <typename T, typename DUnaryFun, typename BinaryFun, bool InPlace>
struct UnaryCompoundGradDIntermediateFunctor {
UnaryCompoundGradDIntermediateFunctor(const DUnaryFun &d_unary_fun,
const BinaryFun &binary_fun)
: d_unary_fun_(d_unary_fun), binary_fun_(binary_fun) {}
inline HOSTDEVICE T Recompute(T x, T y, T out, T dout) {
if (InPlace) {
return dout * d_unary_fun_.UseOut(out);
} else {
return dout * d_unary_fun_.UseXAndOut(binary_fun_(x, y), out);
}
}
inline HOSTDEVICE T UseIntermediateOut(T x, T intermediate_out, T out,
T dout) {
if (InPlace) {
return dout * d_unary_fun_.UseOut(out);
} else {
return dout * d_unary_fun_.UseXAndOut(intermediate_out, out);
}
}
private:
DUnaryFun d_unary_fun_;
BinaryFun binary_fun_;
};
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
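As a hedged summary of what the compound gradient functors in this header compute (notation mine, not from the patch), the two fused forms follow the ordinary chain rule, with the intermediate result I either stored (UseIntermediateOut) or recomputed (Recompute):

\[
Z = B(X, I),\; I = U(Y):\quad
dX = dZ\,\frac{\partial B}{\partial X}(X, I),\quad
dI = dZ\,\frac{\partial B}{\partial I}(X, I),\quad
dY = dI\cdot U'(Y),
\]
\[
Z = U(I),\; I = B(X, Y):\quad
dI = dZ\cdot U'(I),\quad
dX = dI\,\frac{\partial B}{\partial X}(X, Y),\quad
dY = dI\,\frac{\partial B}{\partial Y}(X, Y).
\]

The InPlace template flag only changes how U' is evaluated (from Out alone versus from its inputs and Out); the formulas themselves are unchanged.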
...@@ -85,26 +85,59 @@ void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output, ...@@ -85,26 +85,59 @@ void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output,
T *prev_output_value, int frame_size, T *prev_output_value, int frame_size,
ActivationType active_gate) { ActivationType active_gate) {
#ifdef __AVX__ #ifdef __AVX__
__m256 r_value_update_gate; __m256 r_value_update_gate, r_value_update_gate_last = _mm256_set1_ps(0.0f);
__m256 r_value_reset_gate; __m256 r_value_reset_gate, r_value_reset_gate_last = _mm256_set1_ps(0.0f);
__m256 r_value_reset_output; __m256 r_value_reset_output;
__m256 r_prev_out = _mm256_set1_ps(0.0f); __m256 r_prev_out = _mm256_set1_ps(0.0f),
__m256 *update_gate = reinterpret_cast<__m256 *>(gate_value); r_prev_out_last = _mm256_set1_ps(0.0f);
__m256 *reset_gate = reinterpret_cast<__m256 *>(gate_value + frame_size); T *update_gate = gate_value;
T *reset_gate = gate_value + frame_size;
int block = 8;
const int n = frame_size;
const int rest = n % block;
const int end = n - rest;
int i = 0;
if (rest > 0) {
i = n - block;
r_value_update_gate_last =
_mm256_loadu_ps((const float *)(update_gate + i));
r_value_reset_gate_last = _mm256_loadu_ps((const float *)(reset_gate + i));
if (prev_output_value) {
r_prev_out_last = _mm256_loadu_ps((const float *)(prev_output_value + i));
}
}
for (int i = 0; i < frame_size / 8; i++) { for (i = 0; i < end; i += block) {
r_value_update_gate = update_gate[i]; r_value_update_gate = _mm256_loadu_ps((const float *)(update_gate + i));
r_value_reset_gate = reset_gate[i]; r_value_reset_gate = _mm256_loadu_ps((const float *)(reset_gate + i));
if (prev_output_value) { if (prev_output_value) {
r_prev_out = (reinterpret_cast<__m256 *>(prev_output_value))[i]; r_prev_out = _mm256_loadu_ps((const float *)(prev_output_value + i));
} }
op_reset_output(&r_value_update_gate, &r_value_reset_gate, &r_prev_out, op_reset_output(&r_value_update_gate, &r_value_reset_gate, &r_prev_out,
&r_value_reset_output, active_gate); &r_value_reset_output, active_gate);
update_gate[i] = r_value_update_gate; _mm256_storeu_ps(reinterpret_cast<float *>(update_gate + i),
reset_gate[i] = r_value_reset_gate; r_value_update_gate);
(reinterpret_cast<__m256 *>(reset_output_value))[i] = r_value_reset_output; _mm256_storeu_ps(reinterpret_cast<float *>(reset_gate + i),
r_value_reset_gate);
_mm256_storeu_ps(reinterpret_cast<float *>(reset_output_value + i),
r_value_reset_output);
}
if (rest > 0) {
i = n - block;
op_reset_output(&r_value_update_gate_last, &r_value_reset_gate_last,
&r_prev_out_last, &r_value_reset_output, active_gate);
_mm256_storeu_ps(reinterpret_cast<float *>(update_gate + i),
r_value_update_gate_last);
_mm256_storeu_ps(reinterpret_cast<float *>(reset_gate + i),
r_value_reset_gate_last);
_mm256_storeu_ps(reinterpret_cast<float *>(reset_output_value + i),
r_value_reset_output);
} }
#endif #endif
} }
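The rewritten AVX path above switches from aligned __m256 indexing to unaligned loadu/storeu and adds an explicit tail pass, so frame_size no longer needs to be a multiple of 8 (the dispatch change further down now only requires frame_size >= 8). A minimal scalar sketch of the overlapped-tail blocking it uses, with an 8-element array standing in for the vector register (names mine; assumes n >= 8, which the caller guarantees):

static void ScaleBlockedExample(float *data, int n, float factor) {
  const int block = 8;
  const int rest = n % block;
  const int end = n - rest;
  float last[8];
  int i = 0;
  if (rest > 0) {
    // Snapshot the last (overlapping) block up front, mirroring the
    // r_value_*_last loads above, so the overlap is transformed exactly once.
    i = n - block;
    for (int j = 0; j < block; ++j) last[j] = data[i + j];
  }
  for (i = 0; i < end; i += block) {
    for (int j = 0; j < block; ++j) data[i + j] *= factor;  // full blocks
  }
  if (rest > 0) {
    // Re-anchor the tail at n - block and store results computed from the
    // snapshot; nothing is read or written past the end of the buffer.
    i = n - block;
    for (int j = 0; j < block; ++j) data[i + j] = last[j] * factor;
  }
}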
...@@ -115,26 +148,55 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output, ...@@ -115,26 +148,55 @@ void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
T *output_value, int frame_size, T *output_value, int frame_size,
ActivationType active_node) { ActivationType active_node) {
#ifdef __AVX__ #ifdef __AVX__
__m256 r_value_update_gate; __m256 r_value_update_gate, r_value_update_gate_last = _mm256_set1_ps(0.0f);
__m256 r_value_frame_state; __m256 r_value_frame_state, r_value_frame_state_last = _mm256_set1_ps(0.0f);
__m256 r_prev_out = _mm256_set1_ps(0.0f); __m256 r_prev_out = _mm256_set1_ps(0.0f),
r_prev_out_last = _mm256_set1_ps(0.0f);
__m256 r_output; __m256 r_output;
__m256 *update_gate = reinterpret_cast<__m256 *>(gate_value); T *update_gate = gate_value;
__m256 *frame_state = reinterpret_cast<__m256 *>(gate_value + frame_size * 2); T *frame_state = gate_value + frame_size * 2;
int block = 8;
const int n = frame_size;
const int rest = n % block;
const int end = n - rest;
int i = 0;
if (rest > 0) {
i = n - block;
r_value_update_gate_last =
_mm256_loadu_ps((const float *)(update_gate + i));
r_value_frame_state_last =
_mm256_loadu_ps((const float *)(frame_state + i));
if (prev_output_value) {
r_prev_out_last = _mm256_loadu_ps((const float *)(prev_output_value + i));
}
}
for (int i = 0; i < frame_size / 8; i++) { for (i = 0; i < end; i += block) {
r_value_update_gate = update_gate[i]; r_value_update_gate = _mm256_loadu_ps((const float *)(update_gate + i));
r_value_frame_state = frame_state[i]; r_value_frame_state = _mm256_loadu_ps((const float *)(frame_state + i));
if (prev_output_value) { if (prev_output_value) {
r_prev_out = (reinterpret_cast<__m256 *>(prev_output_value))[i]; r_prev_out = _mm256_loadu_ps((const float *)(prev_output_value + i));
} }
op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out, op_final_output(&r_value_update_gate, &r_value_frame_state, &r_prev_out,
&r_output, active_node); &r_output, active_node);
frame_state[i] = r_value_frame_state; _mm256_storeu_ps(reinterpret_cast<float *>(frame_state + i),
(reinterpret_cast<__m256 *>(output_value))[i] = r_output; r_value_frame_state);
_mm256_storeu_ps(reinterpret_cast<float *>(output_value + i), r_output);
}
if (rest > 0) {
i = n - block;
op_final_output(&r_value_update_gate_last, &r_value_frame_state_last,
&r_prev_out_last, &r_output, active_node);
_mm256_storeu_ps(reinterpret_cast<float *>(frame_state + i),
r_value_frame_state_last);
_mm256_storeu_ps(reinterpret_cast<float *>(output_value + i), r_output);
} }
#endif #endif
} }
...@@ -143,7 +205,8 @@ inline void forward_reset_output(OpResetOutput op_reset_output, ...@@ -143,7 +205,8 @@ inline void forward_reset_output(OpResetOutput op_reset_output,
GRUMetaValue<T> value, int frame_size, GRUMetaValue<T> value, int frame_size,
int batch_size, ActivationType active_gate) { int batch_size, ActivationType active_gate) {
for (int b = 0; b < batch_size; b++) { for (int b = 0; b < batch_size; b++) {
if (OpResetOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { if (OpResetOutput::avx && (frame_size > static_cast<int>(8 - 1)) &&
(sizeof(T) == 4)) {
hl_avx_gru_forward_reset_output( hl_avx_gru_forward_reset_output(
op_reset_output, value.gate_value, value.reset_output_value, op_reset_output, value.gate_value, value.reset_output_value,
value.prev_out_value, frame_size, active_gate); value.prev_out_value, frame_size, active_gate);
...@@ -166,7 +229,8 @@ inline void forward_final_output(OpFinalOutput op_final_output, ...@@ -166,7 +229,8 @@ inline void forward_final_output(OpFinalOutput op_final_output,
GRUMetaValue<T> value, int frame_size, GRUMetaValue<T> value, int frame_size,
int batch_size, ActivationType active_node) { int batch_size, ActivationType active_node) {
for (int b = 0; b < batch_size; b++) { for (int b = 0; b < batch_size; b++) {
if (OpFinalOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { if (OpFinalOutput::avx && (frame_size > static_cast<int>(8 - 1)) &&
(sizeof(T) == 4)) {
hl_avx_gru_forward_final_output(op_final_output, value.gate_value, hl_avx_gru_forward_final_output(op_final_output, value.gate_value,
value.prev_out_value, value.output_value, value.prev_out_value, value.output_value,
frame_size, active_node); frame_size, active_node);
......
...@@ -58,9 +58,9 @@ template <typename T> ...@@ -58,9 +58,9 @@ template <typename T>
struct ScaleGradFunctor { struct ScaleGradFunctor {
explicit ScaleGradFunctor(T coeff) : coeff_(coeff) {} explicit ScaleGradFunctor(T coeff) : coeff_(coeff) {}
inline HOSTDEVICE T operator()(T x) { return coeff_; } inline HOSTDEVICE T UseX(T x) { return coeff_; }
inline HOSTDEVICE T UseOut(T out) { return coeff_; }
inline HOSTDEVICE T operator()(T x, T out) { return coeff_; } inline HOSTDEVICE T UseXAndOut(T x, T out) { return coeff_; }
private: private:
T coeff_; T coeff_;
...@@ -73,9 +73,9 @@ struct ReluFunctor { ...@@ -73,9 +73,9 @@ struct ReluFunctor {
template <typename T> template <typename T>
struct ReluGradFunctor { struct ReluGradFunctor {
inline HOSTDEVICE T operator()(T x) { return x > 0 ? 1 : 0; } inline HOSTDEVICE T UseX(T x) { return x > 0 ? 1 : 0; }
inline HOSTDEVICE T UseOut(T out) { return out > 0 ? 1 : 0; }
inline HOSTDEVICE T operator()(T x, T out) { return x > 0 ? 1 : 0; } inline HOSTDEVICE T UseXAndOut(T x, T out) { return out > 0 ? 1 : 0; }
}; };
} // namespace math } // namespace math
......
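An editorial note on the UseX / UseOut / UseXAndOut split introduced above (reasoning mine): it only works for activations whose derivative can be recovered from either the input or the output. That holds for the two functors in this file, since for ReLU

\[
\mathrm{relu}'(x) = \mathbf{1}[x > 0] = \mathbf{1}[\mathrm{relu}(x) > 0],
\]

and the scale functor c·x has the constant derivative c regardless of which argument is supplied.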
...@@ -199,6 +199,14 @@ struct MergeAdd<platform::CPUDeviceContext, T> { ...@@ -199,6 +199,14 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
framework::SelectedRows operator()(const platform::CPUDeviceContext& context, framework::SelectedRows operator()(const platform::CPUDeviceContext& context,
const framework::SelectedRows& input) { const framework::SelectedRows& input) {
framework::SelectedRows out; framework::SelectedRows out;
(*this)(context, input, &out);
return out;
}
void operator()(const platform::CPUDeviceContext& context,
const framework::SelectedRows& input,
framework::SelectedRows* output) {
framework::SelectedRows& out = *output;
auto input_rows = input.rows(); auto input_rows = input.rows();
std::set<int64_t> row_set(input_rows.begin(), input_rows.end()); std::set<int64_t> row_set(input_rows.begin(), input_rows.end());
std::vector<int64_t> merge_rows(row_set.begin(), row_set.end()); std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
...@@ -223,7 +231,6 @@ struct MergeAdd<platform::CPUDeviceContext, T> { ...@@ -223,7 +231,6 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
out_data[out_i * input_width + j] += input_data[i * input_width + j]; out_data[out_i * input_width + j] += input_data[i * input_width + j];
} }
} }
return out;
} }
}; };
......
...@@ -60,9 +60,11 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> { ...@@ -60,9 +60,11 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
auto out_place = context.GetPlace(); auto out_place = context.GetPlace();
PADDLE_ENFORCE(platform::is_gpu_place(out_place)); PADDLE_ENFORCE(platform::is_gpu_place(out_place));
memory::Copy(boost::get<platform::CUDAPlace>(out_place), out_data, memory::Copy(
boost::get<platform::CUDAPlace>(in1_place), in1_data, boost::get<platform::CUDAPlace>(out_place), out_data,
in1_value.numel() * sizeof(T), context.stream()); boost::get<platform::CUDAPlace>(in1_place), in1_data,
in1_value.numel() * sizeof(T),
reinterpret_cast<const platform::CUDADeviceContext&>(context).stream());
auto* in2_data = in2_value.data<T>(); auto* in2_data = in2_value.data<T>();
memory::Copy(boost::get<platform::CUDAPlace>(out_place), memory::Copy(boost::get<platform::CUDAPlace>(out_place),
...@@ -107,7 +109,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> { ...@@ -107,7 +109,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
PADDLE_ENFORCE_EQ(in1_height, out_dims[0]); PADDLE_ENFORCE_EQ(in1_height, out_dims[0]);
auto& in1_value = input1.value(); auto& in1_value = input1.value();
framework::Vector<int64_t> in1_rows(input1.rows()); auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height);
...@@ -146,7 +148,7 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> { ...@@ -146,7 +148,7 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
auto in1_height = input1.height(); auto in1_height = input1.height();
PADDLE_ENFORCE_EQ(in1_height, input2->height()); PADDLE_ENFORCE_EQ(in1_height, input2->height());
auto& in1_rows = input1.rows(); framework::Vector<int64_t> in1_rows(input1.rows());
auto& in2_rows = *(input2->mutable_rows()); auto& in2_rows = *(input2->mutable_rows());
auto& in1_value = input1.value(); auto& in1_value = input1.value();
...@@ -206,7 +208,7 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> { ...@@ -206,7 +208,7 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
auto& in1_value = input1.value(); auto& in1_value = input1.value();
framework::Vector<int64_t> in1_rows(input1.rows()); auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
...@@ -234,7 +236,7 @@ template <typename T, int block_size> ...@@ -234,7 +236,7 @@ template <typename T, int block_size>
__global__ void MergeAddKernel(const T* input, const int64_t* input_rows, __global__ void MergeAddKernel(const T* input, const int64_t* input_rows,
T* out, const int64_t* out_rows, T* out, const int64_t* out_rows,
size_t out_rows_size, int64_t row_numel) { size_t out_rows_size, int64_t row_numel) {
const int ty = blockIdx.y; const int ty = blockIdx.x;
int tid = threadIdx.x; int tid = threadIdx.x;
__shared__ size_t out_idx; __shared__ size_t out_idx;
...@@ -260,6 +262,14 @@ struct MergeAdd<platform::CUDADeviceContext, T> { ...@@ -260,6 +262,14 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
framework::SelectedRows operator()(const platform::CUDADeviceContext& context, framework::SelectedRows operator()(const platform::CUDADeviceContext& context,
const framework::SelectedRows& input) { const framework::SelectedRows& input) {
framework::SelectedRows out; framework::SelectedRows out;
(*this)(context, input, &out);
return out;
}
void operator()(const platform::CUDADeviceContext& context,
const framework::SelectedRows& input,
framework::SelectedRows* output) {
framework::SelectedRows& out = *output;
framework::Vector<int64_t> input_rows(input.rows()); framework::Vector<int64_t> input_rows(input.rows());
std::set<int64_t> row_set(input_rows.begin(), input_rows.end()); std::set<int64_t> row_set(input_rows.begin(), input_rows.end());
std::vector<int64_t> merge_rows(row_set.begin(), row_set.end()); std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
...@@ -281,16 +291,12 @@ struct MergeAdd<platform::CUDADeviceContext, T> { ...@@ -281,16 +291,12 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
const int block_size = 256; const int block_size = 256;
dim3 threads(block_size, 1); dim3 threads(block_size, 1);
dim3 grid1(1, input_rows.size()); dim3 grid1(input_rows.size(), 1);
MergeAddKernel< MergeAddKernel<T, 256><<<grid1, threads, 0, context.stream()>>>(
T, 256><<<grid1, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(
input_data, input_rows.CUDAData(context.GetPlace()), out_data, input_data, input_rows.CUDAData(context.GetPlace()), out_data,
out.mutable_rows()->CUDAMutableData(context.GetPlace()), out.mutable_rows()->CUDAMutableData(context.GetPlace()),
out.rows().size(), input_width); out.rows().size(), input_width);
return out;
} }
}; };
......
...@@ -65,6 +65,9 @@ struct MergeAdd { ...@@ -65,6 +65,9 @@ struct MergeAdd {
// the input SelectedRows object. // the input SelectedRows object.
framework::SelectedRows operator()(const DeviceContext& context, framework::SelectedRows operator()(const DeviceContext& context,
const framework::SelectedRows& input); const framework::SelectedRows& input);
void operator()(const DeviceContext& context,
const framework::SelectedRows& input,
framework::SelectedRows* output);
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
......
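The MergeAdd refactor above keeps the old value-returning operator() but adds an in-place overload and has the former delegate to the latter, so callers that already hold an output SelectedRows can skip copying the return value. A minimal sketch of that delegation pattern (types simplified, names mine):

#include <vector>

struct MergeAddSketch {
  // Returning form: kept for existing callers, forwards to the in-place form.
  std::vector<int> operator()(const std::vector<int> &input) {
    std::vector<int> out;
    (*this)(input, &out);
    return out;
  }
  // In-place form: does the actual work, writing into caller-owned storage.
  void operator()(const std::vector<int> &input, std::vector<int> *output) {
    output->assign(input.begin(), input.end());  // stand-in for the real merge
  }
};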
...@@ -20,7 +20,9 @@ limitations under the License. */ ...@@ -20,7 +20,9 @@ limitations under the License. */
TEST(selected_rows_functor, gpu_add) { TEST(selected_rows_functor, gpu_add) {
paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CPUPlace cpu_place; paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDADeviceContext ctx(gpu_place); paddle::platform::CUDADeviceContext& ctx =
*reinterpret_cast<paddle::platform::CUDADeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(gpu_place));
paddle::operators::math::SetConstant<paddle::platform::CUDADeviceContext, paddle::operators::math::SetConstant<paddle::platform::CUDADeviceContext,
float> float>
functor; functor;
...@@ -132,7 +134,9 @@ TEST(selected_rows_functor, gpu_add) { ...@@ -132,7 +134,9 @@ TEST(selected_rows_functor, gpu_add) {
TEST(selected_rows_functor, gpu_add_to) { TEST(selected_rows_functor, gpu_add_to) {
paddle::platform::CUDAPlace gpu_place(0); paddle::platform::CUDAPlace gpu_place(0);
paddle::platform::CPUPlace cpu_place; paddle::platform::CPUPlace cpu_place;
paddle::platform::CUDADeviceContext ctx(gpu_place); paddle::platform::CUDADeviceContext& ctx =
*reinterpret_cast<paddle::platform::CUDADeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(gpu_place));
paddle::operators::math::SetConstant<paddle::platform::CUDADeviceContext, paddle::operators::math::SetConstant<paddle::platform::CUDADeviceContext,
float> float>
functor; functor;
......
...@@ -123,6 +123,7 @@ class SumKernel : public framework::OpKernel<T> { ...@@ -123,6 +123,7 @@ class SumKernel : public framework::OpKernel<T> {
out_value->Resize(framework::make_ddim(in_dim)); out_value->Resize(framework::make_ddim(in_dim));
out_value->mutable_data<T>(context.GetPlace()); out_value->mutable_data<T>(context.GetPlace());
// if all the input sparse vars are empty, no need to // if all the input sparse vars are empty, no need to
// merge these vars. // merge these vars.
if (first_dim == 0UL) { if (first_dim == 0UL) {
......
...@@ -36,7 +36,9 @@ void BindConstValue(pybind11::module* m) { ...@@ -36,7 +36,9 @@ void BindConstValue(pybind11::module* m) {
.value("Backward", framework::OpRole::kBackward) .value("Backward", framework::OpRole::kBackward)
.value("Optimize", framework::OpRole::kOptimize) .value("Optimize", framework::OpRole::kOptimize)
.value("Loss", framework::OpRole::kLoss) .value("Loss", framework::OpRole::kLoss)
.value("RPC", framework::OpRole::kRPC); .value("RPC", framework::OpRole::kRPC)
.value("Dist", framework::OpRole::kDist)
.value("LRSched", framework::OpRole::kLRSched);
op_proto_and_checker_maker.def( op_proto_and_checker_maker.def(
"kOpRoleAttrName", framework::OpProtoAndCheckerMaker::OpRoleAttrName); "kOpRoleAttrName", framework::OpProtoAndCheckerMaker::OpRoleAttrName);
......
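The two enum values added here become visible from Python once the module is rebuilt. A small hedged check, assuming the paddle.fluid.core.op_proto_and_checker_maker layout used by the framework code later in this diff:

    import paddle.fluid.core as core

    OpRole = core.op_proto_and_checker_maker.OpRole
    # Dist and LRSched are the roles newly exposed by the binding above
    print(OpRole.Forward, OpRole.Dist, OpRole.LRSched)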
...@@ -670,7 +670,14 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -670,7 +670,14 @@ All parameter, weight, gradient are variables in Paddle.
.def_property( .def_property(
"enable_data_balance", "enable_data_balance",
[](const BuildStrategy &self) { return self.enable_data_balance_; }, [](const BuildStrategy &self) { return self.enable_data_balance_; },
[](BuildStrategy &self, bool b) { self.enable_data_balance_ = b; }); [](BuildStrategy &self, bool b) { self.enable_data_balance_ = b; })
.def_property("fuse_elewise_add_act_ops",
[](const BuildStrategy &self) {
return self.fuse_elewise_add_act_ops_;
},
[](BuildStrategy &self, bool b) {
self.fuse_elewise_add_act_ops_ = b;
});
pe.def(py::init<const std::vector<platform::Place> &, pe.def(py::init<const std::vector<platform::Place> &,
const std::unordered_set<std::string> &, const std::unordered_set<std::string> &,
......
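The new fuse_elewise_add_act_ops flag is consumed through BuildStrategy. A hedged usage sketch; `loss` is assumed to be the loss Variable of a program already built with fluid:

    import paddle.fluid as fluid

    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_elewise_add_act_ops = True  # enable the elementwise_add + activation fuse pass
    exe = fluid.ParallelExecutor(
        use_cuda=False, loss_name=loss.name, build_strategy=build_strategy)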
...@@ -46,7 +46,7 @@ from . import transpiler ...@@ -46,7 +46,7 @@ from . import transpiler
from .param_attr import ParamAttr, WeightNormParamAttr from .param_attr import ParamAttr, WeightNormParamAttr
from .data_feeder import DataFeeder from .data_feeder import DataFeeder
from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope
from .transpiler import DistributeTranspiler, InferenceTranspiler, \ from .transpiler import DistributeTranspiler, \
memory_optimize, release_memory, DistributeTranspilerConfig memory_optimize, release_memory, DistributeTranspilerConfig
from .lod_tensor import create_lod_tensor, create_random_int_lodtensor from .lod_tensor import create_lod_tensor, create_random_int_lodtensor
from . import clip from . import clip
......
...@@ -1509,6 +1509,30 @@ class Program(object): ...@@ -1509,6 +1509,30 @@ class Program(object):
self._op_role_var = [] self._op_role_var = []
self._current_role = OpRole.Forward self._current_role = OpRole.Forward
@contextlib.contextmanager
def _lr_schedule_guard(self):
"""
A with guard to set :code:`LRSched` :code:`OpRole` and
:code:`OpRoleVar` automatically. The :code:`OpRoleVar` is
set to the target learning rate.
Notes: This is a very low level API. Users should not use it directly.
Examples:
>>> p, g = backward(...)
>>> with program._lr_schedule_guard():
>>> lr = lr * decay
"""
OpRole = core.op_proto_and_checker_maker.OpRole
self._current_role = OpRole.LRSched
# TODO(typhoonzero): how to set target learning rate var
self._op_role_var = []
yield
self._op_role_var = []
self._current_role = OpRole.Forward
def __str__(self): def __str__(self):
""" """
Get the protobuf debug string of this Program. Get the protobuf debug string of this Program.
......
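A minimal sketch of how the decay helpers changed below are expected to use this guard; `lr` stands in for the learning-rate Variable being updated:

    import paddle.fluid as fluid

    program = fluid.default_main_program()
    with program._lr_schedule_guard():
        # every op appended inside this block is tagged with OpRole.LRSched
        decayed_lr = lr * 0.1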
...@@ -74,7 +74,7 @@ class Initializer(object): ...@@ -74,7 +74,7 @@ class Initializer(object):
directly, but need to use one of its implementations. directly, but need to use one of its implementations.
""" """
def __init_(self): def __init__(self):
pass pass
def __call__(self, param, block): def __call__(self, param, block):
...@@ -293,7 +293,7 @@ class TruncatedNormalInitializer(Initializer): ...@@ -293,7 +293,7 @@ class TruncatedNormalInitializer(Initializer):
assert loc is not None assert loc is not None
assert scale is not None assert scale is not None
assert seed is not None assert seed is not None
super(NormalInitializer, self).__init__() super(TruncatedNormalInitializer, self).__init__()
self._mean = loc self._mean = loc
self._std_dev = scale self._std_dev = scale
self._seed = seed self._seed = seed
......
...@@ -27,8 +27,7 @@ from . import core ...@@ -27,8 +27,7 @@ from . import core
__all__ = [ __all__ = [
'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params', 'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params',
'load_persistables', 'save_inference_model', 'load_inference_model', 'load_persistables', 'save_inference_model', 'load_inference_model'
'get_inference_program'
] ]
...@@ -504,23 +503,6 @@ def load_persistables(executor, dirname, main_program=None, filename=None): ...@@ -504,23 +503,6 @@ def load_persistables(executor, dirname, main_program=None, filename=None):
filename=filename) filename=filename)
def get_inference_program(target_vars, main_program=None):
if main_program is None:
main_program = default_main_program()
if not isinstance(target_vars, list):
target_vars = [target_vars]
vars = []
for var in target_vars:
if isinstance(var, Evaluator):
vars.extend(var.states)
vars.extend(var.metrics)
else:
vars.append(var)
pruned_program = main_program._prune(targets=vars)
inference_program = pruned_program._inference_optimize()
return inference_program
def prepend_feed_ops(inference_program, def prepend_feed_ops(inference_program,
feed_target_names, feed_target_names,
feed_holder_name='feed'): feed_holder_name='feed'):
......
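With get_inference_program removed from the public list, a test/inference program is obtained by cloning the main program, as the transformer test later in this diff now does:

    import paddle.fluid as fluid

    # replaces the removed fluid.io.get_inference_program([avg_cost]) pattern
    test_program = fluid.default_main_program().clone(for_test=True)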
...@@ -39,6 +39,7 @@ __all__ = [ ...@@ -39,6 +39,7 @@ __all__ = [
'detection_map', 'detection_map',
'rpn_target_assign', 'rpn_target_assign',
'anchor_generator', 'anchor_generator',
'roi_perspective_transform',
'generate_proposal_labels', 'generate_proposal_labels',
'generate_proposals', 'generate_proposals',
] ]
...@@ -1262,6 +1263,54 @@ def anchor_generator(input, ...@@ -1262,6 +1263,54 @@ def anchor_generator(input,
return anchor, var return anchor, var
def roi_perspective_transform(input,
rois,
transformed_height,
transformed_width,
spatial_scale=1.0):
"""
ROI perspective transform op.
Args:
input (Variable): The input of ROIPerspectiveTransformOp. The format of
input tensor is NCHW. Where N is batch size, C is the
number of input channels, H is the height of the feature,
and W is the width of the feature.
rois (Variable): ROIs (Regions of Interest) to be transformed. It should be
a 2-D LoDTensor of shape (num_rois, 8). Given as
[[x1, y1, x2, y2, x3, y3, x4, y4], ...], (x1, y1) is the
top left coordinates, and (x2, y2) is the top right
coordinates, and (x3, y3) is the bottom right coordinates,
and (x4, y4) is the bottom left coordinates.
transformed_height (integer): The height of transformed output.
transformed_width (integer): The width of transformed output.
spatial_scale (float): Spatial scale factor to scale ROI coords. Default: 1.0
Returns:
Variable: The output of ROIPerspectiveTransformOp which is a 4-D tensor with shape
(num_rois, channels, transformed_h, transformed_w).
Examples:
.. code-block:: python
out = fluid.layers.roi_perspective_transform(input, rois, 7, 7, 1.0)
"""
helper = LayerHelper('roi_perspective_transform', **locals())
dtype = helper.input_dtype()
out = helper.create_tmp_variable(dtype)
helper.append_op(
type="roi_perspective_transform",
inputs={"X": input,
"ROIs": rois},
outputs={"Out": out},
attrs={
"transformed_height": transformed_height,
"transformed_width": transformed_width,
"spatial_scale": spatial_scale
})
return out
def generate_proposal_labels(rpn_rois, def generate_proposal_labels(rpn_rois,
gt_classes, gt_classes,
is_crowd, is_crowd,
......
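A hedged end-to-end call of the new layer; the two data layers are illustrative assumptions, and only the final call mirrors the docstring above:

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[3, 64, 64], dtype='float32')
    rois = fluid.layers.data(name='rois', shape=[8], dtype='float32', lod_level=1)
    # one 7x7 transformed patch per ROI; ROI coords already in input scale
    out = fluid.layers.roi_perspective_transform(x, rois, 7, 7, spatial_scale=1.0)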
...@@ -27,7 +27,7 @@ from . import nn ...@@ -27,7 +27,7 @@ from . import nn
from . import ops from . import ops
from . import tensor from . import tensor
from ..initializer import init_on_cpu from ..initializer import init_on_cpu
from ..framework import default_main_program, Parameter from ..framework import default_main_program, Parameter, unique_name
__all__ = [ __all__ = [
'exponential_decay', 'natural_exp_decay', 'inverse_time_decay', 'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
...@@ -63,11 +63,12 @@ def noam_decay(d_model, warmup_steps): ...@@ -63,11 +63,12 @@ def noam_decay(d_model, warmup_steps):
Returns: Returns:
The decayed learning rate. The decayed learning rate.
""" """
global_step = _decay_step_counter(1) with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter(1)
a = global_step**-0.5 a = global_step**-0.5
b = (warmup_steps**-1.5) * global_step b = (warmup_steps**-1.5) * global_step
lr_value = (d_model**-0.5) * ops.elementwise_min(a, b) lr_value = (d_model**-0.5) * ops.elementwise_min(a, b)
return lr_value return lr_value
...@@ -108,14 +109,15 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): ...@@ -108,14 +109,15 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
sgd_optimizer.minimize(avg_cost) sgd_optimizer.minimize(avg_cost)
""" """
global_step = _decay_step_counter() with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter()
div_res = global_step / decay_steps div_res = global_step / decay_steps
if staircase: if staircase:
div_res = ops.floor(div_res) div_res = ops.floor(div_res)
decayed_lr = learning_rate * (decay_rate**div_res) decayed_lr = learning_rate * (decay_rate**div_res)
return decayed_lr return decayed_lr
def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
...@@ -136,14 +138,15 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): ...@@ -136,14 +138,15 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
Returns: Returns:
The decayed learning rate The decayed learning rate
""" """
global_step = _decay_step_counter() with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter()
div_res = global_step / decay_steps div_res = global_step / decay_steps
if staircase: if staircase:
div_res = ops.floor(div_res) div_res = ops.floor(div_res)
decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res) decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res)
return decayed_lr return decayed_lr
def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
...@@ -181,15 +184,16 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): ...@@ -181,15 +184,16 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
staircase=True)) staircase=True))
sgd_optimizer.minimize(avg_cost) sgd_optimizer.minimize(avg_cost)
""" """
global_step = _decay_step_counter() with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter()
div_res = global_step / decay_steps div_res = global_step / decay_steps
if staircase: if staircase:
div_res = ops.floor(div_res) div_res = ops.floor(div_res)
decayed_lr = learning_rate / (1 + decay_rate * div_res) decayed_lr = learning_rate / (1 + decay_rate * div_res)
return decayed_lr return decayed_lr
def polynomial_decay(learning_rate, def polynomial_decay(learning_rate,
...@@ -220,25 +224,28 @@ def polynomial_decay(learning_rate, ...@@ -220,25 +224,28 @@ def polynomial_decay(learning_rate,
Returns: Returns:
Variable: The decayed learning rate Variable: The decayed learning rate
""" """
global_step = _decay_step_counter() with default_main_program()._lr_schedule_guard():
global_step = _decay_step_counter()
if cycle:
div_res = ops.ceil(global_step / decay_steps) if cycle:
zero_var = tensor.fill_constant(shape=[1], dtype='float32', value=0.0) div_res = ops.ceil(global_step / decay_steps)
one_var = tensor.fill_constant(shape=[1], dtype='float32', value=1.0) zero_var = tensor.fill_constant(
shape=[1], dtype='float32', value=0.0)
with control_flow.Switch() as switch: one_var = tensor.fill_constant(
with switch.case(global_step == zero_var): shape=[1], dtype='float32', value=1.0)
tensor.assign(input=one_var, output=div_res)
decay_steps = decay_steps * div_res with control_flow.Switch() as switch:
else: with switch.case(global_step == zero_var):
decay_steps_var = tensor.fill_constant( tensor.assign(input=one_var, output=div_res)
shape=[1], dtype='float32', value=float(decay_steps)) decay_steps = decay_steps * div_res
global_step = ops.elementwise_min(x=global_step, y=decay_steps_var) else:
decay_steps_var = tensor.fill_constant(
shape=[1], dtype='float32', value=float(decay_steps))
global_step = ops.elementwise_min(x=global_step, y=decay_steps_var)
decayed_lr = (learning_rate - end_learning_rate) * \ decayed_lr = (learning_rate - end_learning_rate) * \
((1 - global_step / decay_steps) ** power) + end_learning_rate ((1 - global_step / decay_steps) ** power) + end_learning_rate
return decayed_lr return decayed_lr
def piecewise_decay(boundaries, values): def piecewise_decay(boundaries, values):
...@@ -266,34 +273,36 @@ def piecewise_decay(boundaries, values): ...@@ -266,34 +273,36 @@ def piecewise_decay(boundaries, values):
""" """
with default_main_program()._lr_schedule_guard():
if len(values) - len(boundaries) != 1:
raise ValueError("len(values) - len(boundaries) should be 1")
if len(values) - len(boundaries) != 1: global_step = _decay_step_counter()
raise ValueError("len(values) - len(boundaries) should be 1")
global_step = _decay_step_counter()
lr = tensor.create_global_var( lr = tensor.create_global_var(
shape=[1], shape=[1],
value=0.0, value=0.0,
dtype='float32', dtype='float32',
persistable=True, persistable=True,
name="learning_rate") name="learning_rate")
with control_flow.Switch() as switch: with control_flow.Switch() as switch:
for i in range(len(boundaries)): for i in range(len(boundaries)):
boundary_val = tensor.fill_constant( boundary_val = tensor.fill_constant(
shape=[1],
dtype='float32',
value=float(boundaries[i]),
force_cpu=True)
value_var = tensor.fill_constant(
shape=[1], dtype='float32', value=float(values[i]))
with switch.case(global_step < boundary_val):
tensor.assign(value_var, lr)
last_value_var = tensor.fill_constant(
shape=[1], shape=[1],
dtype='float32', dtype='float32',
value=float(boundaries[i]), value=float(values[len(values) - 1]))
force_cpu=True) with switch.default():
value_var = tensor.fill_constant( tensor.assign(last_value_var, lr)
shape=[1], dtype='float32', value=float(values[i]))
with switch.case(global_step < boundary_val):
tensor.assign(value_var, lr)
last_value_var = tensor.fill_constant(
shape=[1], dtype='float32', value=float(values[len(values) - 1]))
with switch.default():
tensor.assign(last_value_var, lr)
return lr return lr
......
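Each decay helper above now builds its ops inside _lr_schedule_guard, so caller code is unchanged. A sketch with piecewise decay; the boundaries and values are illustrative only:

    import paddle.fluid as fluid

    optimizer = fluid.optimizer.Momentum(
        learning_rate=fluid.layers.piecewise_decay(
            boundaries=[10000, 20000], values=[0.1, 0.01, 0.001]),
        momentum=0.9)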
...@@ -43,11 +43,7 @@ class Optimizer(object): ...@@ -43,11 +43,7 @@ class Optimizer(object):
but need to use one of its implementations. but need to use one of its implementations.
""" """
def __init__(self, def __init__(self, learning_rate, regularization=None, name=None):
learning_rate,
regularization=None,
LARS_weight_decay=0.0,
name=None):
if not isinstance(learning_rate, float) and \ if not isinstance(learning_rate, float) and \
not isinstance(learning_rate, framework.Variable): not isinstance(learning_rate, framework.Variable):
raise TypeError("learning rate should be float or Variable") raise TypeError("learning rate should be float or Variable")
...@@ -68,7 +64,6 @@ class Optimizer(object): ...@@ -68,7 +64,6 @@ class Optimizer(object):
# {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...} # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...}
self._accumulators = defaultdict(lambda: dict()) self._accumulators = defaultdict(lambda: dict())
self.helper = None self.helper = None
self._LARS_weight_decay = LARS_weight_decay
def _create_global_learning_rate(self): def _create_global_learning_rate(self):
lr = self._global_learning_rate() lr = self._global_learning_rate()
...@@ -109,7 +104,6 @@ class Optimizer(object): ...@@ -109,7 +104,6 @@ class Optimizer(object):
param = param_and_grad[0] param = param_and_grad[0]
param_lr = param.optimize_attr['learning_rate'] param_lr = param.optimize_attr['learning_rate']
if type(param_lr) == Variable: if type(param_lr) == Variable:
# param learning rate has been updated (LARS)
print("returns updated param lr ", param_lr) print("returns updated param lr ", param_lr)
return param_lr return param_lr
else: else:
...@@ -227,10 +221,6 @@ class Optimizer(object): ...@@ -227,10 +221,6 @@ class Optimizer(object):
self._create_accumulators(loss.block, self._create_accumulators(loss.block,
[p[0] for p in parameters_and_grads]) [p[0] for p in parameters_and_grads])
self._create_global_learning_rate() self._create_global_learning_rate()
if self._LARS_weight_decay > 0.0:
layers.append_LARS(parameters_and_grads,
self._global_learning_rate(),
self._LARS_weight_decay)
optimize_ops = [] optimize_ops = []
for param_and_grad in parameters_and_grads: for param_and_grad in parameters_and_grads:
...@@ -287,6 +277,9 @@ class SGDOptimizer(Optimizer): ...@@ -287,6 +277,9 @@ class SGDOptimizer(Optimizer):
Args: Args:
learning_rate (float|Variable): the learning rate used to update parameters. \ learning_rate (float|Variable): the learning rate used to update parameters. \
Can be a float value or a Variable with one float value as data element. Can be a float value or a Variable with one float value as data element.
regularization: A Regularizer, such as
fluid.regularizer.L2DecayRegularizer.
name: An optional name prefix.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -295,10 +288,12 @@ class SGDOptimizer(Optimizer): ...@@ -295,10 +288,12 @@ class SGDOptimizer(Optimizer):
sgd_optimizer.minimize(cost) sgd_optimizer.minimize(cost)
""" """
def __init__(self, learning_rate, **kwargs): def __init__(self, learning_rate, regularization=None, name=None):
assert learning_rate is not None assert learning_rate is not None
super(SGDOptimizer, self).__init__( super(SGDOptimizer, self).__init__(
learning_rate=learning_rate, **kwargs) learning_rate=learning_rate,
regularization=regularization,
name=name)
self.type = "sgd" self.type = "sgd"
def _append_optimize_op(self, block, param_and_grad): def _append_optimize_op(self, block, param_and_grad):
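With **kwargs replaced by explicit regularization and name arguments, the accepted options are visible in the signature. A short sketch of the intended construction; `avg_cost` is assumed to be the model loss:

    import paddle.fluid as fluid

    sgd = fluid.optimizer.SGD(
        learning_rate=0.01,
        regularization=fluid.regularizer.L2Decay(1e-4),
        name="sgd_opt")
    sgd.minimize(avg_cost)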
...@@ -343,6 +338,9 @@ class MomentumOptimizer(Optimizer): ...@@ -343,6 +338,9 @@ class MomentumOptimizer(Optimizer):
Can be a float value or a Variable with one float value as data element. Can be a float value or a Variable with one float value as data element.
momentum (float): momentum factor momentum (float): momentum factor
use_nesterov (bool): enables Nesterov momentum use_nesterov (bool): enables Nesterov momentum
regularization: A Regularizer, such as
fluid.regularizer.L2DecayRegularizer.
name: An optional name prefix.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -352,11 +350,18 @@ class MomentumOptimizer(Optimizer): ...@@ -352,11 +350,18 @@ class MomentumOptimizer(Optimizer):
""" """
_velocity_acc_str = "velocity" _velocity_acc_str = "velocity"
def __init__(self, learning_rate, momentum, use_nesterov=False, **kwargs): def __init__(self,
learning_rate,
momentum,
use_nesterov=False,
regularization=None,
name=None):
assert learning_rate is not None assert learning_rate is not None
assert momentum is not None assert momentum is not None
super(MomentumOptimizer, self).__init__( super(MomentumOptimizer, self).__init__(
learning_rate=learning_rate, **kwargs) learning_rate=learning_rate,
regularization=regularization,
name=name)
self.type = "momentum" self.type = "momentum"
self._momentum = momentum self._momentum = momentum
self._use_nesterov = bool(use_nesterov) self._use_nesterov = bool(use_nesterov)
...@@ -412,6 +417,9 @@ class AdagradOptimizer(Optimizer): ...@@ -412,6 +417,9 @@ class AdagradOptimizer(Optimizer):
learning_rate (float|Variable): the learning rate used to update parameters. \ learning_rate (float|Variable): the learning rate used to update parameters. \
Can be a float value or a Variable with one float value as data element. Can be a float value or a Variable with one float value as data element.
epsilon (float): a small float value for numerical stability. epsilon (float): a small float value for numerical stability.
regularization: A Regularizer, such as
fluid.regularizer.L2DecayRegularizer.
name: An optional name prefix.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -421,11 +429,17 @@ class AdagradOptimizer(Optimizer): ...@@ -421,11 +429,17 @@ class AdagradOptimizer(Optimizer):
""" """
_moment_acc_str = "moment" _moment_acc_str = "moment"
def __init__(self, learning_rate, epsilon=1.0e-6, **kwargs): def __init__(self,
learning_rate,
epsilon=1.0e-6,
regularization=None,
name=None):
assert learning_rate is not None assert learning_rate is not None
assert epsilon is not None assert epsilon is not None
super(AdagradOptimizer, self).__init__( super(AdagradOptimizer, self).__init__(
learning_rate=learning_rate, **kwargs) learning_rate=learning_rate,
regularization=regularization,
name=name)
self.type = "adagrad" self.type = "adagrad"
self._epsilon = epsilon self._epsilon = epsilon
...@@ -485,6 +499,9 @@ class AdamOptimizer(Optimizer): ...@@ -485,6 +499,9 @@ class AdamOptimizer(Optimizer):
beta1 (float): The exponential decay rate for the 1st moment estimates. beta1 (float): The exponential decay rate for the 1st moment estimates.
beta2 (float): The exponential decay rate for the 2nd moment estimates. beta2 (float): The exponential decay rate for the 2nd moment estimates.
epsilon (float): a small float value for numerical stability. epsilon (float): a small float value for numerical stability.
regularization: A Regularizer, such as
fluid.regularizer.L2DecayRegularizer.
name: An optional name prefix.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -503,13 +520,16 @@ class AdamOptimizer(Optimizer): ...@@ -503,13 +520,16 @@ class AdamOptimizer(Optimizer):
beta1=0.9, beta1=0.9,
beta2=0.999, beta2=0.999,
epsilon=1e-8, epsilon=1e-8,
**kwargs): regularization=None,
name=None):
assert learning_rate is not None assert learning_rate is not None
assert beta1 is not None assert beta1 is not None
assert beta2 is not None assert beta2 is not None
assert epsilon is not None assert epsilon is not None
super(AdamOptimizer, self).__init__( super(AdamOptimizer, self).__init__(
learning_rate=learning_rate, **kwargs) learning_rate=learning_rate,
regularization=regularization,
name=name)
self.type = "adam" self.type = "adam"
self._beta1 = beta1 self._beta1 = beta1
self._beta2 = beta2 self._beta2 = beta2
...@@ -629,6 +649,9 @@ class AdamaxOptimizer(Optimizer): ...@@ -629,6 +649,9 @@ class AdamaxOptimizer(Optimizer):
beta1 (float): The exponential decay rate for the 1st moment estimates. beta1 (float): The exponential decay rate for the 1st moment estimates.
beta2 (float): The exponential decay rate for the 2nd moment estimates. beta2 (float): The exponential decay rate for the 2nd moment estimates.
epsilon (float): a small float value for numerical stability. epsilon (float): a small float value for numerical stability.
regularization: A Regularizer, such as
fluid.regularizer.L2DecayRegularizer.
name: An optional name prefix.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -645,13 +668,16 @@ class AdamaxOptimizer(Optimizer): ...@@ -645,13 +668,16 @@ class AdamaxOptimizer(Optimizer):
beta1=0.9, beta1=0.9,
beta2=0.999, beta2=0.999,
epsilon=1e-8, epsilon=1e-8,
**kwargs): regularization=None,
name=None):
assert learning_rate is not None assert learning_rate is not None
assert beta1 is not None assert beta1 is not None
assert beta2 is not None assert beta2 is not None
assert epsilon is not None assert epsilon is not None
super(AdamaxOptimizer, self).__init__( super(AdamaxOptimizer, self).__init__(
learning_rate=learning_rate, **kwargs) learning_rate=learning_rate,
regularization=regularization,
name=name)
self.type = "adamax" self.type = "adamax"
self._beta1 = beta1 self._beta1 = beta1
self._beta2 = beta2 self._beta2 = beta2
...@@ -742,6 +768,9 @@ class DecayedAdagradOptimizer(Optimizer): ...@@ -742,6 +768,9 @@ class DecayedAdagradOptimizer(Optimizer):
Can be a float value or a Variable with one float value as data element. Can be a float value or a Variable with one float value as data element.
decay (float): decay rate. decay (float): decay rate.
epsilon (float): a small float value for numerical stability. epsilon (float): a small float value for numerical stability.
regularization: A Regularizer, such as
fluid.regularizer.L2DecayRegularizer.
name: An optional name prefix.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -751,13 +780,20 @@ class DecayedAdagradOptimizer(Optimizer): ...@@ -751,13 +780,20 @@ class DecayedAdagradOptimizer(Optimizer):
""" """
_moment_acc_str = "moment" _moment_acc_str = "moment"
def __init__(self, learning_rate, decay=0.95, epsilon=1.0e-6, **kwargs): def __init__(self,
learning_rate,
decay=0.95,
epsilon=1.0e-6,
regularization=None,
name=None):
assert learning_rate is not None assert learning_rate is not None
assert decay is not None assert decay is not None
assert epsilon is not None assert epsilon is not None
super(DecayedAdagradOptimizer, self).__init__( super(DecayedAdagradOptimizer, self).__init__(
learning_rate=learning_rate, **kwargs) learning_rate=learning_rate,
regularization=regularization,
name=name)
self.type = "decayed_adagrad" self.type = "decayed_adagrad"
self._decay = decay self._decay = decay
self._epsilon = epsilon self._epsilon = epsilon
...@@ -811,6 +847,9 @@ class AdadeltaOptimizer(Optimizer): ...@@ -811,6 +847,9 @@ class AdadeltaOptimizer(Optimizer):
learning_rate(float): global learning rate learning_rate(float): global learning rate
rho(float): rho in equation rho(float): rho in equation
epsilon(float): epsilon in equation epsilon(float): epsilon in equation
regularization: A Regularizer, such as
fluid.regularizer.L2DecayRegularizer.
name: An optional name prefix.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -823,7 +862,12 @@ class AdadeltaOptimizer(Optimizer): ...@@ -823,7 +862,12 @@ class AdadeltaOptimizer(Optimizer):
_avg_squared_grad_acc_str = "_avg_squared_grad" _avg_squared_grad_acc_str = "_avg_squared_grad"
_avg_squared_update_acc_str = "_avg_squared_update" _avg_squared_update_acc_str = "_avg_squared_update"
def __init__(self, learning_rate, epsilon=1.0e-6, rho=0.95, **kwargs): def __init__(self,
learning_rate,
epsilon=1.0e-6,
rho=0.95,
regularization=None,
name=None):
if learning_rate is None: if learning_rate is None:
raise ValueError("learning_rate is not set.") raise ValueError("learning_rate is not set.")
if epsilon is None: if epsilon is None:
...@@ -831,7 +875,9 @@ class AdadeltaOptimizer(Optimizer): ...@@ -831,7 +875,9 @@ class AdadeltaOptimizer(Optimizer):
if rho is None: if rho is None:
raise ValueError("rho is not set.") raise ValueError("rho is not set.")
super(AdadeltaOptimizer, self).__init__( super(AdadeltaOptimizer, self).__init__(
learning_rate=learning_rate, **kwargs) learning_rate=learning_rate,
regularization=regularization,
name=name)
self.type = "adadelta" self.type = "adadelta"
self._epsilon = epsilon self._epsilon = epsilon
self._rho = rho self._rho = rho
...@@ -932,6 +978,9 @@ class RMSPropOptimizer(Optimizer): ...@@ -932,6 +978,9 @@ class RMSPropOptimizer(Optimizer):
the gradient; if False, by the uncentered second moment. Setting this to the gradient; if False, by the uncentered second moment. Setting this to
True may help with training, but is slightly more expensive in terms of True may help with training, but is slightly more expensive in terms of
computation and memory. Defaults to False. computation and memory. Defaults to False.
regularization: A Regularizer, such as
fluid.regularizer.L2DecayRegularizer.
name: An optional name prefix.
Raises: Raises:
ValueError: If learning_rate, rho, epsilon, momentum are None. ValueError: If learning_rate, rho, epsilon, momentum are None.
...@@ -953,9 +1002,12 @@ class RMSPropOptimizer(Optimizer): ...@@ -953,9 +1002,12 @@ class RMSPropOptimizer(Optimizer):
epsilon=1.0e-6, epsilon=1.0e-6,
momentum=0.0, momentum=0.0,
centered=False, centered=False,
**kwargs): regularization=None,
name=None):
super(RMSPropOptimizer, self).__init__( super(RMSPropOptimizer, self).__init__(
learning_rate=learning_rate, **kwargs) learning_rate=learning_rate,
regularization=regularization,
name=name)
if learning_rate is None: if learning_rate is None:
raise ValueError("learning_rate is not set.") raise ValueError("learning_rate is not set.")
if rho is None: if rho is None:
...@@ -1061,6 +1113,9 @@ class FtrlOptimizer(Optimizer): ...@@ -1061,6 +1113,9 @@ class FtrlOptimizer(Optimizer):
l1 (float): l1 (float):
l2 (float): l2 (float):
lr_power (float): lr_power (float):
regularization: A Regularizer, such as
fluid.regularizer.L2DecayRegularizer.
name: An optional name prefix.
Raises: Raises:
ValueError: If learning_rate, rho, epsilon, momentum are None. ValueError: If learning_rate, rho, epsilon, momentum are None.
...@@ -1075,9 +1130,17 @@ class FtrlOptimizer(Optimizer): ...@@ -1075,9 +1130,17 @@ class FtrlOptimizer(Optimizer):
_squared_acc_str = "squared" _squared_acc_str = "squared"
_linear_acc_str = "linear" _linear_acc_str = "linear"
def __init__(self, learning_rate, l1=0.0, l2=0.0, lr_power=-0.5, **kwargs): def __init__(self,
learning_rate,
l1=0.0,
l2=0.0,
lr_power=-0.5,
regularization=None,
name=None):
super(FtrlOptimizer, self).__init__( super(FtrlOptimizer, self).__init__(
learning_rate=learning_rate, **kwargs) learning_rate=learning_rate,
regularization=regularization,
name=name)
if learning_rate is None: if learning_rate is None:
raise ValueError("learning_rate is not set.") raise ValueError("learning_rate is not set.")
...@@ -1155,7 +1218,9 @@ class ModelAverage(Optimizer): ...@@ -1155,7 +1218,9 @@ class ModelAverage(Optimizer):
average_window_rate: The rate of average window. average_window_rate: The rate of average window.
min_average_window: The minimum size of average window. min_average_window: The minimum size of average window.
max_average_window: The maximum size of average window. max_average_window: The maximum size of average window.
regularization: A Regularizer, such as
fluid.regularizer.L2DecayRegularizer.
name: An optional name prefix.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -1178,8 +1243,10 @@ class ModelAverage(Optimizer): ...@@ -1178,8 +1243,10 @@ class ModelAverage(Optimizer):
average_window_rate, average_window_rate,
min_average_window=10000, min_average_window=10000,
max_average_window=10000, max_average_window=10000,
**kwargs): regularization=None,
super(ModelAverage, self).__init__(0.0, **kwargs) name=None):
super(ModelAverage, self).__init__(
0.0, regularization=regularization, name=name)
self.average_window = average_window_rate self.average_window = average_window_rate
self.min_average_window = min_average_window self.min_average_window = min_average_window
self.max_average_window = max_average_window self.max_average_window = max_average_window
......
...@@ -190,14 +190,11 @@ class L1DecayRegularizer(WeightDecayRegularizer): ...@@ -190,14 +190,11 @@ class L1DecayRegularizer(WeightDecayRegularizer):
Examples: Examples:
.. code-block:: python .. code-block:: python
program = fluid.framework.Program() optimizer = fluid.optimizer.Adagrad(
block = program.global_block() learning_rate=1e-4,
mul_x = block.create_parameter( regularization=fluid.regularizer.L1DecayRegularizer(
dtype="float32", regularization_coeff=0.1))
shape=[5, 10], optimizer.minimize(avg_cost)
lod_level=0,
name="mul.x",
regularizer=fluid.regularizer.L1DecayRegularizer(0.5))
""" """
def __init__(self, regularization_coeff=0.0): def __init__(self, regularization_coeff=0.0):
......
...@@ -99,7 +99,7 @@ def train(nn_type, ...@@ -99,7 +99,7 @@ def train(nn_type,
test_program = fluid.default_main_program().clone(for_test=True) test_program = fluid.default_main_program().clone(for_test=True)
optimizer = fluid.optimizer.Adam(learning_rate=0.001, LARS_weight_decay=0.3) optimizer = fluid.optimizer.Adam(learning_rate=0.001)
optimizer.minimize(avg_loss) optimizer.minimize(avg_loss)
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
......
...@@ -34,12 +34,13 @@ if(APPLE) ...@@ -34,12 +34,13 @@ if(APPLE)
list(REMOVE_ITEM TEST_OPS test_desc_clone) list(REMOVE_ITEM TEST_OPS test_desc_clone)
list(REMOVE_ITEM TEST_OPS test_program_code) list(REMOVE_ITEM TEST_OPS test_program_code)
endif(NOT WITH_DISTRIBUTE) endif(NOT WITH_DISTRIBUTE)
message(WARNING "These tests has been disabled in OSX before being fixed: \n test_detection_map_op \n test_dist_se_resnext") message(WARNING "These tests has been disabled in OSX before being fixed: \n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext")
# this op is not supported on mac # this op is not supported on mac
list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op) list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op)
# TODO: add the unittest back when it is fixed # TODO: add the unittest back when it is fixed
list(REMOVE_ITEM TEST_OPS test_detection_map_op) list(REMOVE_ITEM TEST_OPS test_detection_map_op)
list(REMOVE_ITEM TEST_OPS test_dist_se_resnext) list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
list(REMOVE_ITEM TEST_OPS test_fuse_elewise_add_act_pass)
endif() endif()
function(py_test_modules TARGET_NAME) function(py_test_modules TARGET_NAME)
...@@ -79,7 +80,8 @@ if(WITH_DISTRIBUTE) ...@@ -79,7 +80,8 @@ if(WITH_DISTRIBUTE)
py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL) py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL)
endif(NOT APPLE) endif(NOT APPLE)
py_test_modules(test_dist_transpiler MODULES test_dist_transpiler) py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL) #FIXME(gongwb): random fails.
#py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL)
endif() endif()
py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
......
...@@ -437,13 +437,8 @@ def split_data(data, num_part): ...@@ -437,13 +437,8 @@ def split_data(data, num_part):
] ]
def test_context(train_progm, avg_cost, train_exe, dev_count, data_input_names, def test_context(test_program, avg_cost, train_exe, dev_count, data_input_names,
sum_cost, token_num): sum_cost, token_num):
# Context to do validation.
test_program = train_progm.clone()
with fluid.program_guard(test_program):
test_program = fluid.io.get_inference_program([avg_cost])
val_data = DataReader( val_data = DataReader(
src_vocab_fpath=TrainTaskConfig.src_vocab_fpath, src_vocab_fpath=TrainTaskConfig.src_vocab_fpath,
trg_vocab_fpath=TrainTaskConfig.trg_vocab_fpath, trg_vocab_fpath=TrainTaskConfig.trg_vocab_fpath,
...@@ -505,7 +500,7 @@ def test_context(train_progm, avg_cost, train_exe, dev_count, data_input_names, ...@@ -505,7 +500,7 @@ def test_context(train_progm, avg_cost, train_exe, dev_count, data_input_names,
def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler, def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
token_num, predict): token_num, predict, test_program):
# Initialize the parameters. # Initialize the parameters.
if TrainTaskConfig.ckpt_path: if TrainTaskConfig.ckpt_path:
lr_scheduler.current_steps = TrainTaskConfig.start_step lr_scheduler.current_steps = TrainTaskConfig.start_step
...@@ -554,7 +549,7 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler, ...@@ -554,7 +549,7 @@ def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
-1] + label_data_input_fields -1] + label_data_input_fields
if TrainTaskConfig.val_file_pattern is not None: if TrainTaskConfig.val_file_pattern is not None:
test = test_context(train_progm, avg_cost, train_exe, dev_count, test = test_context(test_program, avg_cost, train_exe, dev_count,
data_input_names, sum_cost, token_num) data_input_names, sum_cost, token_num)
# the best cross-entropy value with label smoothing # the best cross-entropy value with label smoothing
...@@ -1647,6 +1642,8 @@ def get_model(is_dist, is_async): ...@@ -1647,6 +1642,8 @@ def get_model(is_dist, is_async):
local_lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model, local_lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model,
TrainTaskConfig.warmup_steps, TrainTaskConfig.warmup_steps,
TrainTaskConfig.learning_rate) TrainTaskConfig.learning_rate)
# Context to do validation.
test_program = fluid.default_main_program().clone(for_test=True)
if not is_dist: if not is_dist:
optimizer = fluid.optimizer.Adam( optimizer = fluid.optimizer.Adam(
...@@ -1671,7 +1668,7 @@ def get_model(is_dist, is_async): ...@@ -1671,7 +1668,7 @@ def get_model(is_dist, is_async):
epsilon=TrainTaskConfig.eps) epsilon=TrainTaskConfig.eps)
optimizer.minimize(sum_cost) optimizer.minimize(sum_cost)
return sum_cost, avg_cost, predict, token_num, local_lr_scheduler return sum_cost, avg_cost, predict, token_num, local_lr_scheduler, test_program
def update_args(): def update_args():
...@@ -1705,7 +1702,7 @@ class DistTransformer2x2(TestDistRunnerBase): ...@@ -1705,7 +1702,7 @@ class DistTransformer2x2(TestDistRunnerBase):
def run_trainer(self, use_cuda, args): def run_trainer(self, use_cuda, args):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
TrainTaskConfig.use_gpu = use_cuda TrainTaskConfig.use_gpu = use_cuda
sum_cost, avg_cost, predict, token_num, local_lr_scheduler = get_model( sum_cost, avg_cost, predict, token_num, local_lr_scheduler, test_program = get_model(
args.is_dist, not args.sync_mode) args.is_dist, not args.sync_mode)
if args.is_dist: if args.is_dist:
...@@ -1726,7 +1723,7 @@ class DistTransformer2x2(TestDistRunnerBase): ...@@ -1726,7 +1723,7 @@ class DistTransformer2x2(TestDistRunnerBase):
TrainTaskConfig.local = not args.is_dist TrainTaskConfig.local = not args.is_dist
train_loop(startup_exe, trainer_prog, 1, sum_cost, avg_cost, train_loop(startup_exe, trainer_prog, 1, sum_cost, avg_cost,
local_lr_scheduler, token_num, predict) local_lr_scheduler, token_num, predict, test_program)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -47,8 +47,7 @@ def get_numeric_gradient(place, ...@@ -47,8 +47,7 @@ def get_numeric_gradient(place,
input_to_check, input_to_check,
output_names, output_names,
delta=0.005, delta=0.005,
in_place=False, in_place=False):
sum_outputs=None):
# FIXME: change this method by compile time concepts # FIXME: change this method by compile time concepts
set_input(scope, op, inputs, place) set_input(scope, op, inputs, place)
...@@ -59,8 +58,6 @@ def get_numeric_gradient(place, ...@@ -59,8 +58,6 @@ def get_numeric_gradient(place,
sum = [] sum = []
op.run(scope, place) op.run(scope, place)
for output_name in output_names: for output_name in output_names:
if sum_outputs and output_name not in sum_outputs:
continue
sum.append( sum.append(
np.array(scope.find_var(output_name).get_tensor()).mean()) np.array(scope.find_var(output_name).get_tensor()).mean())
return np.array(sum).sum() / len(output_names) return np.array(sum).sum() / len(output_names)
...@@ -348,7 +345,7 @@ class OpTest(unittest.TestCase): ...@@ -348,7 +345,7 @@ class OpTest(unittest.TestCase):
actual_t, expect_t, atol=atol, equal_nan=equal_nan), actual_t, expect_t, atol=atol, equal_nan=equal_nan),
"Output (" + out_name + ") has diff at " + str(place) + "Output (" + out_name + ") has diff at " + str(place) +
"\nExpect " + str(expect_t) + "\n" + "But Got" + "\nExpect " + str(expect_t) + "\n" + "But Got" +
str(actual_t) + " in class " + self.__class__.__name__) str(actual_t))
if isinstance(expect, tuple): if isinstance(expect, tuple):
self.assertListEqual(actual.recursive_sequence_lengths(), self.assertListEqual(actual.recursive_sequence_lengths(),
expect[1], "Output (" + out_name + expect[1], "Output (" + out_name +
...@@ -407,14 +404,13 @@ class OpTest(unittest.TestCase): ...@@ -407,14 +404,13 @@ class OpTest(unittest.TestCase):
numeric_grad_delta=0.005, numeric_grad_delta=0.005,
in_place=False, in_place=False,
max_relative_error=0.005, max_relative_error=0.005,
user_defined_grads=None, user_defined_grads=None):
sum_outputs=None):
places = self._get_places() places = self._get_places()
for place in places: for place in places:
self.check_grad_with_place(place, inputs_to_check, output_names, self.check_grad_with_place(place, inputs_to_check, output_names,
no_grad_set, numeric_grad_delta, no_grad_set, numeric_grad_delta,
in_place, max_relative_error, in_place, max_relative_error,
user_defined_grads, sum_outputs) user_defined_grads)
def check_grad_with_place(self, def check_grad_with_place(self,
place, place,
...@@ -424,8 +420,7 @@ class OpTest(unittest.TestCase): ...@@ -424,8 +420,7 @@ class OpTest(unittest.TestCase):
numeric_grad_delta=0.005, numeric_grad_delta=0.005,
in_place=False, in_place=False,
max_relative_error=0.005, max_relative_error=0.005,
user_defined_grads=None, user_defined_grads=None):
sum_outputs=None):
self.scope = core.Scope() self.scope = core.Scope()
op_inputs = self.inputs if hasattr(self, "inputs") else dict() op_inputs = self.inputs if hasattr(self, "inputs") else dict()
op_outputs = self.outputs if hasattr(self, "outputs") else dict() op_outputs = self.outputs if hasattr(self, "outputs") else dict()
...@@ -448,8 +443,7 @@ class OpTest(unittest.TestCase): ...@@ -448,8 +443,7 @@ class OpTest(unittest.TestCase):
input_to_check, input_to_check,
output_names, output_names,
delta=numeric_grad_delta, delta=numeric_grad_delta,
in_place=in_place, in_place=in_place) for input_to_check in inputs_to_check
sum_outputs=sum_outputs) for input_to_check in inputs_to_check
] ]
analytic_grads = self._get_gradient(inputs_to_check, place, analytic_grads = self._get_gradient(inputs_to_check, place,
output_names, no_grad_set) output_names, no_grad_set)
......
...@@ -38,6 +38,7 @@ class TestParallelExecutorBase(unittest.TestCase): ...@@ -38,6 +38,7 @@ class TestParallelExecutorBase(unittest.TestCase):
seed=None, seed=None,
use_parallel_executor=True, use_parallel_executor=True,
use_reduce=False, use_reduce=False,
fuse_elewise_add_act_ops=False,
optimizer=fluid.optimizer.Adam, optimizer=fluid.optimizer.Adam,
use_fast_executor=False): use_fast_executor=False):
def run_executor(exe, feed, fetch_list, program=None): def run_executor(exe, feed, fetch_list, program=None):
...@@ -78,6 +79,7 @@ class TestParallelExecutorBase(unittest.TestCase): ...@@ -78,6 +79,7 @@ class TestParallelExecutorBase(unittest.TestCase):
build_strategy = fluid.BuildStrategy() build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
if use_parallel_executor: if use_parallel_executor:
exe = fluid.ParallelExecutor( exe = fluid.ParallelExecutor(
......
...@@ -20,7 +20,6 @@ import six ...@@ -20,7 +20,6 @@ import six
import sys import sys
import collections import collections
import math import math
import paddle.fluid as fluid
from op_test import OpTest from op_test import OpTest
...@@ -33,7 +32,7 @@ class TestDetectionMAPOp(OpTest): ...@@ -33,7 +32,7 @@ class TestDetectionMAPOp(OpTest):
self.detect = np.array(self.detect).astype('float32') self.detect = np.array(self.detect).astype('float32')
self.mAP = np.array(self.mAP).astype('float32') self.mAP = np.array(self.mAP).astype('float32')
if len(self.class_pos_count) > 0: if (len(self.class_pos_count) > 0):
self.class_pos_count = np.array(self.class_pos_count).astype( self.class_pos_count = np.array(self.class_pos_count).astype(
'int32') 'int32')
self.true_pos = np.array(self.true_pos).astype('float32') self.true_pos = np.array(self.true_pos).astype('float32')
...@@ -274,7 +273,7 @@ class TestDetectionMAPOp11Point(TestDetectionMAPOp): ...@@ -274,7 +273,7 @@ class TestDetectionMAPOp11Point(TestDetectionMAPOp):
class TestDetectionMAPOpMultiBatch(TestDetectionMAPOp): class TestDetectionMAPOpMultiBatch(TestDetectionMAPOp):
def init_test_case(self): def init_test_case(self):
super(TestDetectionMAPOpMultiBatch, self).init_test_case() super(TestDetectionMAPOpMultiBatch, self).init_test_case()
self.class_pos_count = [0, 2, 1, 0] self.class_pos_count = [0, 2, 1]
self.true_pos_lod = [[0, 3, 2]] self.true_pos_lod = [[0, 3, 2]]
self.true_pos = [[0.7, 1.], [0.3, 0.], [0.2, 1.], [0.8, 0.], [0.1, 1.]] self.true_pos = [[0.7, 1.], [0.3, 0.], [0.2, 1.], [0.8, 0.], [0.1, 1.]]
self.false_pos_lod = [[0, 3, 2]] self.false_pos_lod = [[0, 3, 2]]
......
...@@ -22,7 +22,7 @@ class TestDistMnist2x2(TestDistBase): ...@@ -22,7 +22,7 @@ class TestDistMnist2x2(TestDistBase):
self._sync_mode = True self._sync_mode = True
self._use_reduce = False self._use_reduce = False
def test_se_resnext(self): def test_dist_train(self):
self.check_with_place("dist_mnist.py", delta=1e-7) self.check_with_place("dist_mnist.py", delta=1e-7)
...@@ -31,7 +31,7 @@ class TestDistMnist2x2WithMemopt(TestDistBase): ...@@ -31,7 +31,7 @@ class TestDistMnist2x2WithMemopt(TestDistBase):
self._sync_mode = True self._sync_mode = True
self._mem_opt = True self._mem_opt = True
def test_se_resnext(self): def test_dist_train(self):
self.check_with_place("dist_mnist.py", delta=1e-7) self.check_with_place("dist_mnist.py", delta=1e-7)
...@@ -40,7 +40,7 @@ class TestDistMnistAsync(TestDistBase): ...@@ -40,7 +40,7 @@ class TestDistMnistAsync(TestDistBase):
self._sync_mode = False self._sync_mode = False
self._use_reduce = False self._use_reduce = False
def test_se_resnext(self): def test_dist_train(self):
self.check_with_place("dist_mnist.py", delta=200) self.check_with_place("dist_mnist.py", delta=200)
......
...@@ -21,7 +21,16 @@ class TestDistSeResneXt2x2(TestDistBase): ...@@ -21,7 +21,16 @@ class TestDistSeResneXt2x2(TestDistBase):
def _setup_config(self): def _setup_config(self):
self._sync_mode = True self._sync_mode = True
def test_se_resnext(self): def test_dist_train(self):
self.check_with_place("dist_se_resnext.py", delta=1e-7)
class TestDistseResnXt2x2WithMemopt(TestDistBase):
def _setup_config(self):
self._sync_mode = True
self._mem_opt = True
def test_dist_train(self):
self.check_with_place("dist_se_resnext.py", delta=1e-7) self.check_with_place("dist_se_resnext.py", delta=1e-7)
...@@ -29,7 +38,7 @@ class TestDistSeResneXt2x2Async(TestDistBase): ...@@ -29,7 +38,7 @@ class TestDistSeResneXt2x2Async(TestDistBase):
def _setup_config(self): def _setup_config(self):
self._sync_mode = False self._sync_mode = False
def test_se_resnext(self): def test_dist_train(self):
self.check_with_place("dist_se_resnext.py", delta=100) self.check_with_place("dist_se_resnext.py", delta=100)
......
...@@ -59,7 +59,7 @@ class TestDistTransformer2x2Sync(TestDistBase): ...@@ -59,7 +59,7 @@ class TestDistTransformer2x2Sync(TestDistBase):
def _setup_config(self): def _setup_config(self):
self._sync_mode = True self._sync_mode = True
def test_transformer(self): def test_dist_train(self):
download_files() download_files()
self.check_with_place("dist_transformer.py", delta=1e-5) self.check_with_place("dist_transformer.py", delta=1e-5)
...@@ -68,7 +68,7 @@ class TestDistTransformer2x2Async(TestDistBase): ...@@ -68,7 +68,7 @@ class TestDistTransformer2x2Async(TestDistBase):
def _setup_config(self): def _setup_config(self):
self._sync_mode = False self._sync_mode = False
def test_transformer(self): def test_dist_train(self):
download_files() download_files()
self.check_with_place("dist_transformer.py", delta=1.0) self.check_with_place("dist_transformer.py", delta=1.0)
......
...@@ -17,19 +17,28 @@ import unittest ...@@ -17,19 +17,28 @@ import unittest
from test_dist_base import TestDistBase from test_dist_base import TestDistBase
class TestDistSeResneXt2x2(TestDistBase): class TestDistW2V2x2(TestDistBase):
def _setup_config(self): def _setup_config(self):
self._sync_mode = True self._sync_mode = True
def test_se_resnext(self): def test_dist_train(self):
self.check_with_place("dist_word2vec.py", delta=1e-4) self.check_with_place("dist_word2vec.py", delta=1e-4)
class TestDistSeResneXt2x2Async(TestDistBase): class TestDistW2V2x2WithMemOpt(TestDistBase):
def _setup_config(self):
self._sync_mode = True
self._mem_opt = True
def test_dist_train(self):
self.check_with_place("dist_word2vec.py", delta=1e-4)
class TestDistW2V2x2Async(TestDistBase):
def _setup_config(self): def _setup_config(self):
self._sync_mode = False self._sync_mode = False
def test_se_resnext(self): def test_dist_train(self):
self.check_with_place("dist_word2vec.py", delta=1) self.check_with_place("dist_word2vec.py", delta=1)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parallel_executor_test_base import TestParallelExecutorBase
import paddle.fluid as fluid
import paddle.fluid.core as core
import numpy as np
import paddle
import paddle.dataset.mnist as mnist
import unittest
import os
MNIST_RECORDIO_FILE = "./mnist_test_pe.recordio"
def simple_fc_net(use_feed):
if use_feed:
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
else:
reader = fluid.layers.open_files(
filenames=[MNIST_RECORDIO_FILE],
shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'])
reader = fluid.layers.io.double_buffer(reader)
img, label = fluid.layers.read_file(reader)
hidden = img
for _ in range(4):
hidden = fluid.layers.fc(
hidden,
size=200,
act='relu',
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1.0)))
prediction = fluid.layers.fc(hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
return loss
def fc_with_batchnorm(use_feed):
if use_feed:
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
else:
reader = fluid.layers.open_files(
filenames=[MNIST_RECORDIO_FILE],
shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'])
reader = fluid.layers.io.double_buffer(reader)
img, label = fluid.layers.read_file(reader)
hidden = img
for _ in range(2):
hidden = fluid.layers.fc(
hidden,
size=200,
act='relu',
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1.0)))
hidden = fluid.layers.batch_norm(input=hidden)
prediction = fluid.layers.fc(hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
return loss
class TestMNIST(TestParallelExecutorBase):
@classmethod
def setUpClass(cls):
os.environ['CPU_NUM'] = str(4)
# Convert mnist to recordio file
with fluid.program_guard(fluid.Program(), fluid.Program()):
reader = paddle.batch(mnist.train(), batch_size=4)
feeder = fluid.DataFeeder(
feed_list=[ # order is image and label
fluid.layers.data(
name='image', shape=[784]),
fluid.layers.data(
name='label', shape=[1], dtype='int64'),
],
place=fluid.CPUPlace())
fluid.recordio_writer.convert_reader_to_recordio_file(
MNIST_RECORDIO_FILE, reader, feeder)
def _init_data(self, random=True):
np.random.seed(5)
if random:
img = np.random.random(size=[32, 784]).astype(np.float32)
else:
img = np.ones(shape=[32, 784], dtype='float32')
label = np.ones(shape=[32, 1], dtype='int64')
return img, label
def _compare_fuse_elewise_add_act_ops(self,
model,
use_cuda,
random_data=True):
if use_cuda and not core.is_compiled_with_cuda():
return
img, label = self._init_data(random_data)
def _optimizer(learning_rate=1e-6):
optimizer = fluid.optimizer.SGD(
learning_rate=learning_rate,
regularization=fluid.regularizer.L2Decay(1e-6))
return optimizer
not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
model,
feed_dict={"image": img,
"label": label},
use_cuda=use_cuda,
fuse_elewise_add_act_ops=False,
memory_opt=False,
optimizer=_optimizer)
fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
model,
feed_dict={"image": img,
"label": label},
use_cuda=use_cuda,
fuse_elewise_add_act_ops=True,
memory_opt=False,
optimizer=_optimizer)
for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss):
self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
for loss in zip(not_fuse_op_last_loss, fuse_op_last_loss):
self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
def test_simple_fc_with_fuse_op(self):
self._compare_fuse_elewise_add_act_ops(simple_fc_net, True)
self._compare_fuse_elewise_add_act_ops(simple_fc_net, False)
def test_batchnorm_fc_with_fuse_op(self):
self._compare_fuse_elewise_add_act_ops(fc_with_batchnorm, True)
self._compare_fuse_elewise_add_act_ops(fc_with_batchnorm, False)
if __name__ == '__main__':
unittest.main()
...@@ -48,7 +48,7 @@ def create_test_class(test_case, callback, attrs): ...@@ -48,7 +48,7 @@ def create_test_class(test_case, callback, attrs):
'X': OpTest.np_dtype_to_fluid_dtype(self.x), 'X': OpTest.np_dtype_to_fluid_dtype(self.x),
'Y': OpTest.np_dtype_to_fluid_dtype(self.y) 'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
} }
if self.attrs["keep_intermediate_value"]: if self.attrs["save_intermediate_out"]:
self.outputs = { self.outputs = {
'Out': self.out, 'Out': self.out,
"IntermediateOut": self.intermediate_out "IntermediateOut": self.intermediate_out
...@@ -73,22 +73,19 @@ def create_test_class(test_case, callback, attrs): ...@@ -73,22 +73,19 @@ def create_test_class(test_case, callback, attrs):
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
# FIXME(zcd): the intermediate_out_grad is not checked.
def test_check_grad_normal(self): def test_check_grad_normal(self):
if self.attrs["keep_intermediate_value"]: if self.attrs["save_intermediate_out"]:
self.check_grad( self.check_grad(['X', 'Y'], ['Out'], max_relative_error=0.005)
['X', 'Y'], ['Out', 'IntermediateOut'],
max_relative_error=0.005,
sum_outputs=['Out'])
else: else:
self.check_grad(['X', 'Y'], ['Out'], max_relative_error=0.005) self.check_grad(['X', 'Y'], ['Out'], max_relative_error=0.005)
def test_check_grad_ingore_x(self): def test_check_grad_ingore_x(self):
if self.attrs["keep_intermediate_value"]: if self.attrs["save_intermediate_out"]:
self.check_grad( self.check_grad(
['Y'], ['Out', 'IntermediateOut'], ['Y'], ['Out'],
max_relative_error=0.005, max_relative_error=0.005,
no_grad_set=set("X"), no_grad_set=set("X"))
sum_outputs=['Out'])
else: else:
self.check_grad( self.check_grad(
['Y'], ['Out'], ['Y'], ['Out'],
...@@ -96,12 +93,11 @@ def create_test_class(test_case, callback, attrs): ...@@ -96,12 +93,11 @@ def create_test_class(test_case, callback, attrs):
no_grad_set=set("X")) no_grad_set=set("X"))
def test_check_grad_ingore_y(self): def test_check_grad_ingore_y(self):
if self.attrs["keep_intermediate_value"]: if self.attrs["save_intermediate_out"]:
self.check_grad( self.check_grad(
['X'], ['Out', 'IntermediateOut'], ['X'], ['Out'],
max_relative_error=0.005, max_relative_error=0.005,
no_grad_set=set("Y"), no_grad_set=set("Y"))
sum_outputs=['Out'])
else: else:
self.check_grad( self.check_grad(
['X'], ['Out'], ['X'], ['Out'],
...@@ -303,39 +299,32 @@ for mode in {0, 1}: ...@@ -303,39 +299,32 @@ for mode in {0, 1}:
relu_add_func = partial(relu_add_func, mode=mode) relu_add_func = partial(relu_add_func, mode=mode)
add_relu_func = partial(add_relu_func, mode=mode) add_relu_func = partial(add_relu_func, mode=mode)
for recomputation in {True, False}: for save_intermediate_out in {True, False}:
for keep_intermediate_value in {True, False}: suffix = ("_save_intermediate_out" if save_intermediate_out else "") \
suffix = ("_keep_intermediate_value" if keep_intermediate_value else "") \ + ("_mode_"+ str(mode))
+ ("_recomputation" if recomputation else "") \ create_test_class('scale_add' + suffix, scale_add_func, {
+ ("_mode_"+ str(mode)) 'scale': scale,
create_test_class('scale_add' + suffix, scale_add_func, { 'functor_list': ["scale", "elementwise_add"],
'scale': scale, 'save_intermediate_out': save_intermediate_out,
'functor_list': ["scale", "elementwise_add"], })
'keep_intermediate_value': keep_intermediate_value, create_test_class('add_scale' + suffix, add_scale_func, {
'recomputation': recomputation 'scale': scale,
}) 'functor_list': ["elementwise_add", "scale"],
create_test_class('add_scale' + suffix, add_scale_func, { 'save_intermediate_out': save_intermediate_out,
'scale': scale, })
'functor_list': ["elementwise_add", "scale"], create_test_class('add_relu' + suffix, add_relu_func, {
'keep_intermediate_value': keep_intermediate_value, 'functor_list': ["elementwise_add", "relu"],
'recomputation': recomputation 'save_intermediate_out': save_intermediate_out,
}) })
create_test_class('add_relu' + suffix, add_relu_func, { create_test_class('relu_add' + suffix, relu_add_func, {
'functor_list': ["elementwise_add", "relu"], 'functor_list': ["relu", "elementwise_add"],
'keep_intermediate_value': keep_intermediate_value, 'save_intermediate_out': save_intermediate_out,
'recomputation': recomputation })
}) create_test_class('mul_scale' + suffix, mul_scale_func, {
create_test_class('relu_add' + suffix, relu_add_func, { 'scale': scale,
'functor_list': ["relu", "elementwise_add"], 'functor_list': ["elementwise_mul", "scale"],
'keep_intermediate_value': keep_intermediate_value, 'save_intermediate_out': save_intermediate_out,
'recomputation': recomputation })
})
create_test_class('mul_scale' + suffix, mul_scale_func, {
'scale': scale,
'functor_list': ["elementwise_mul", "scale"],
'keep_intermediate_value': keep_intermediate_value,
'recomputation': recomputation
})
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -30,7 +30,8 @@ def gru( ...@@ -30,7 +30,8 @@ def gru(
bias, # 1 x 3D bias, # 1 x 3D
is_reverse, is_reverse,
act_state, act_state,
act_gate): act_gate,
dtype='float32'):
def _seq_to_batch(lod, is_reverse): def _seq_to_batch(lod, is_reverse):
idx_in_seq_list = [] idx_in_seq_list = []
seq_lens = lod[0] seq_lens = lod[0]
...@@ -71,10 +72,10 @@ def gru( ...@@ -71,10 +72,10 @@ def gru(
T = sum(lod[0]) T = sum(lod[0])
N = len(lod[0]) N = len(lod[0])
D = weight.shape[0] D = weight.shape[0]
batch_gate = np.zeros((T, 3 * D), dtype='float64') batch_gate = np.zeros((T, 3 * D), dtype=dtype)
batch_reset_hidden_prev = np.zeros((T, D), dtype='float64') batch_reset_hidden_prev = np.zeros((T, D), dtype=dtype)
batch_hidden = np.zeros((T, D), dtype='float64') batch_hidden = np.zeros((T, D), dtype=dtype)
hidden = np.zeros((T, D), dtype='float64') hidden = np.zeros((T, D), dtype=dtype)
idx_in_seq_list, sorted_seqs = _seq_to_batch(lod, is_reverse) idx_in_seq_list, sorted_seqs = _seq_to_batch(lod, is_reverse)
h_p = h0[sorted_seqs] h_p = h0[sorted_seqs]
...@@ -108,23 +109,24 @@ class TestGRUOp(OpTest): ...@@ -108,23 +109,24 @@ class TestGRUOp(OpTest):
self.with_bias = True self.with_bias = True
self.act_state = 'tanh' self.act_state = 'tanh'
self.act_gate = 'sigmoid' self.act_gate = 'sigmoid'
self.dtype = 'float64'
self.set_confs() self.set_confs()
T = sum(self.lod[0]) T = sum(self.lod[0])
N = len(self.lod[0]) N = len(self.lod[0])
input = np.random.rand(T, 3 * self.D).astype('float64') input = np.random.rand(T, 3 * self.D).astype(self.dtype)
weight = np.random.rand(self.D, 3 * self.D).astype('float64') weight = np.random.rand(self.D, 3 * self.D).astype(self.dtype)
bias = np.random.rand( bias = np.random.rand(
1, 3 * self.D).astype('float64') if self.with_bias else np.zeros( 1, 3 * self.D).astype(self.dtype) if self.with_bias else np.zeros(
(1, 3 * self.D), dtype='float64') (1, 3 * self.D), dtype=self.dtype)
h0 = np.random.rand( h0 = np.random.rand(
N, self.D).astype('float64') if self.with_h0 else np.zeros( N, self.D).astype(self.dtype) if self.with_h0 else np.zeros(
(N, self.D), dtype='float64') (N, self.D), dtype=self.dtype)
batch_gate, batch_reset_hidden_prev, batch_hidden, hidden = gru( batch_gate, batch_reset_hidden_prev, batch_hidden, hidden = gru(
input, self.lod, h0, weight, bias, self.is_reverse, input, self.lod, h0, weight, bias, self.is_reverse,
ACTIVATION[self.act_state], ACTIVATION[self.act_gate]) ACTIVATION[self.act_state], ACTIVATION[self.act_gate], self.dtype)
self.inputs = {'Input': (input, self.lod), 'Weight': weight} self.inputs = {'Input': (input, self.lod), 'Weight': weight}
if self.with_bias: if self.with_bias:
...@@ -153,6 +155,12 @@ class TestGRUOp(OpTest): ...@@ -153,6 +155,12 @@ class TestGRUOp(OpTest):
self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden']) self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden'])
class TestGRUOp2(TestGRUOp):
def set_confs(self):
self.D = 19
self.dtype = 'float32'
class TestGRUOpNoInitial(TestGRUOp): class TestGRUOpNoInitial(TestGRUOp):
def set_confs(self): def set_confs(self):
self.with_h0 = False self.with_h0 = False
......
...@@ -573,6 +573,16 @@ class TestBook(unittest.TestCase): ...@@ -573,6 +573,16 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(out) self.assertIsNotNone(out)
print(str(program)) print(str(program))
def test_roi_perspective_transform(self):
program = Program()
with program_guard(program):
x = layers.data(name="x", shape=[256, 30, 30], dtype="float32")
rois = layers.data(
name="rois", shape=[8], dtype="float32", lod_level=1)
output = layers.roi_perspective_transform(x, rois, 7, 7, 0.6)
self.assertIsNotNone(output)
print(str(program))
def test_sequence_enumerate(self): def test_sequence_enumerate(self):
program = Program() program = Program()
with program_guard(program): with program_guard(program):
......
...@@ -79,7 +79,7 @@ class TestReshapeOpWithInputShape(OpTest): ...@@ -79,7 +79,7 @@ class TestReshapeOpWithInputShape(OpTest):
self.check_output(no_check_set=['XShape']) self.check_output(no_check_set=['XShape'])
def test_check_grad(self): def test_check_grad(self):
self.check_grad(["X"], "Out", sum_outputs=["Out"]) self.check_grad(["X"], "Out")
if __name__ == "__main__": if __name__ == "__main__":
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License")
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import math
import sys
import paddle.compat as cpt
from op_test import OpTest
from math import sqrt
from math import floor
def gt_e(a, b):
return a > b or abs(a - b) < 1e-4
def gt(a, b):
return (a - b) > 1e-4
def lt_e(a, b):
return a < b or abs(a - b) < 1e-4
def in_quad(x, y, roi_x, roi_y):
# check if (x, y) is in the boundary of roi
for i in range(4):
xs = roi_x[i]
ys = roi_y[i]
xe = roi_x[(i + 1) % 4]
ye = roi_y[(i + 1) % 4]
if abs(ys - ye) < 1e-4:
if abs(y - ys) < 1e-4 and abs(y - ye) < 1e-4 and gt_e(
x, min(xs, xe)) and lt_e(x, max(xs, xe)):
return True
else:
intersec_x = (y - ys) * (xe - xs) / (ye - ys) + xs
if abs(intersec_x - x) < 1e-4 and gt_e(y, min(ys, ye)) and lt_e(
y, max(ys, ye)):
return True
n_cross = 0
for i in range(4):
xs = roi_x[i]
ys = roi_y[i]
xe = roi_x[(i + 1) % 4]
ye = roi_y[(i + 1) % 4]
if abs(ys - ye) < 1e-4:
continue
if lt_e(y, min(ys, ye)) or gt(y, max(ys, ye)):
continue
intersec_x = (y - ys) * (xe - xs) / (ye - ys) + xs
if abs(intersec_x - x) < 1e-4:
return True
if gt(intersec_x, x):
n_cross += 1
return (n_cross % 2 == 1)
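A hedged usage sketch of the ray-casting check above (not part of the diff): a point strictly inside a unit-square ROI is reported as inside, a point well outside is not.

```python
print(in_quad(0.5, 0.5, [0, 1, 1, 0], [0, 0, 1, 1]))  # expected: True
print(in_quad(2.0, 2.0, [0, 1, 1, 0], [0, 0, 1, 1]))  # expected: False
```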
def get_transform_matrix(transformed_width, transformed_height, roi_x, roi_y):
x0 = roi_x[0]
x1 = roi_x[1]
x2 = roi_x[2]
x3 = roi_x[3]
y0 = roi_y[0]
y1 = roi_y[1]
y2 = roi_y[2]
y3 = roi_y[3]
len1 = sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1))
len2 = sqrt((x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2))
len3 = sqrt((x2 - x3) * (x2 - x3) + (y2 - y3) * (y2 - y3))
len4 = sqrt((x3 - x0) * (x3 - x0) + (y3 - y0) * (y3 - y0))
estimated_height = (len2 + len4) / 2.0
estimated_width = (len1 + len3) / 2.0
normalized_height = transformed_height
normalized_width = round(estimated_width *
(normalized_height - 1) / estimated_height) + 1
normalized_width = min(normalized_width, transformed_width)
dx1 = x1 - x2
dx2 = x3 - x2
dx3 = x0 - x1 + x2 - x3
dy1 = y1 - y2
dy2 = y3 - y2
dy3 = y0 - y1 + y2 - y3
matrix = np.zeros([9])
matrix[6] = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) / (
normalized_width - 1)
matrix[7] = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) / (
normalized_height - 1)
matrix[8] = 1
matrix[3] = (y1 - y0 + matrix[6] *
(normalized_width - 1) * y1) / (normalized_width - 1)
matrix[4] = (y3 - y0 + matrix[7] *
(normalized_height - 1) * y3) / (normalized_height - 1)
matrix[5] = y0
matrix[0] = (x1 - x0 + matrix[6] *
(normalized_width - 1) * x1) / (normalized_width - 1)
matrix[1] = (x3 - x0 + matrix[7] *
(normalized_height - 1) * x3) / (normalized_height - 1)
matrix[2] = x0
return matrix
def get_source_coords(matrix, out_w, out_h):
u = matrix[0] * out_w + matrix[1] * out_h + matrix[2]
v = matrix[3] * out_w + matrix[4] * out_h + matrix[5]
w = matrix[6] * out_w + matrix[7] * out_h + matrix[8]
in_w = u / w
in_h = v / w
return in_w, in_h
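A small self-contained check of the two helpers above (not part of the diff): for an axis-aligned square ROI, the computed perspective matrix maps the output-grid corners straight back onto the ROI corners.

```python
roi_x = [0.0, 4.0, 4.0, 0.0]
roi_y = [0.0, 0.0, 4.0, 4.0]
matrix = get_transform_matrix(3, 3, roi_x, roi_y)
print(get_source_coords(matrix, 0, 0))  # (0.0, 0.0) -> first ROI corner
print(get_source_coords(matrix, 2, 2))  # (4.0, 4.0) -> opposite ROI corner
```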
def bilinear_interpolate(in_data, in_n, in_c, in_w, in_h):
batch_size = in_data.shape[0]
channels = in_data.shape[1]
height = in_data.shape[2]
width = in_data.shape[3]
if gt(-0.5, in_w) or gt(in_w, width - 0.5) or gt(-0.5, in_h) or gt(
in_h, height - 0.5):
return 0.0
if gt(0, in_w):
in_w = 0
if gt(0, in_h):
in_h = 0
in_w_floor = floor(in_w)
in_h_floor = floor(in_h)
if gt_e(in_w_floor, width - 1):
in_w_ceil = width - 1
in_w_floor = width - 1
in_w = in_w_floor
else:
in_w_ceil = in_w_floor + 1
if gt_e(in_h_floor, height - 1):
in_h_ceil = height - 1
in_h_floor = height - 1
in_h = in_h_floor
else:
in_h_ceil = in_h_floor + 1
w_floor = in_w - in_w_floor
h_floor = in_h - in_h_floor
w_ceil = 1 - w_floor
h_ceil = 1 - h_floor
v1 = in_data[in_n][in_c][int(in_h_floor)][int(in_w_floor)]
v2 = in_data[in_n][in_c][int(in_h_ceil)][int(in_w_floor)]
v3 = in_data[in_n][in_c][int(in_h_ceil)][int(in_w_ceil)]
v4 = in_data[in_n][in_c][int(in_h_floor)][int(in_w_ceil)]
w1 = w_ceil * h_ceil
w2 = w_ceil * h_floor
w3 = w_floor * h_floor
w4 = w_floor * h_ceil
val = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4
return val
def lod_convert(lod):
ret = [0]
for count in lod:
ret.append(ret[-1] + count)
return ret
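Quick illustration of the helper above: a length-based LoD such as `[1, 2]` (one ROI for the first image, two for the second) becomes the offset form used for slicing.

```python
print(lod_convert([1, 2]))  # [0, 1, 3]
```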
def roi_transform(in_data, rois, rois_lod, transformed_height,
transformed_width, spatial_scale):
channels = in_data.shape[1]
in_height = in_data.shape[2]
in_width = in_data.shape[3]
rois_num = rois.shape[0]
roi2image = [0] * rois_num
rois_lod = lod_convert(rois_lod[0])
for i in range(len(rois_lod) - 1):
for j in range(rois_lod[i], rois_lod[i + 1]):
roi2image[j] = i
out = np.zeros([rois_num, channels, transformed_height, transformed_width])
for n in range(rois_num):
roi_x = []
roi_y = []
for k in range(4):
roi_x.append(rois[n][2 * k] * spatial_scale)
roi_y.append(rois[n][2 * k + 1] * spatial_scale)
image_id = roi2image[n]
transform_matrix = get_transform_matrix(
transformed_width, transformed_height, roi_x, roi_y)
for c in range(channels):
for out_h in range(transformed_height):
for out_w in range(transformed_width):
in_w, in_h = get_source_coords(transform_matrix, out_w,
out_h)
if in_quad(in_w, in_h, roi_x, roi_y) and gt_e(
in_w, -0.5) and lt_e(in_w, in_width - 0.5) and gt_e(
in_h, -0.5) and lt_e(in_h, in_height - 0.5):
out[n][c][out_h][out_w] = bilinear_interpolate(
in_data, image_id, c, in_w, in_h)
else:
out[n][c][out_h][out_w] = 0.0
return out.astype("float32")
class TestROIPoolOp(OpTest):
def set_data(self):
self.init_test_case()
self.make_rois()
self.inputs = {'X': self.x, 'ROIs': (self.rois, self.rois_lod)}
self.attrs = {
'spatial_scale': self.spatial_scale,
'transformed_height': self.transformed_height,
'transformed_width': self.transformed_width
}
out = roi_transform(self.x, self.rois, self.rois_lod,
self.transformed_height, self.transformed_width,
self.spatial_scale)
self.outputs = {'Out': out}
def init_test_case(self):
self.batch_size = 2
self.channels = 2
self.height = 8
self.width = 8
# n, c, h, w
self.x_dim = (self.batch_size, self.channels, self.height, self.width)
self.spatial_scale = 1.0 / 2.0
self.transformed_height = 2
self.transformed_width = 3
self.x = np.random.random(self.x_dim).astype('float32')
def make_rois(self):
rois = []
self.rois_lod = [[]]
for bno in range(self.batch_size):
self.rois_lod[0].append(bno + 1)
for i in range(bno + 1):
x1 = np.random.randint(
0,
self.width // self.spatial_scale - self.transformed_width)
y1 = np.random.randint(
0,
self.height // self.spatial_scale - self.transformed_height)
x2 = np.random.randint(x1 + self.transformed_width,
self.width // self.spatial_scale)
y2 = np.random.randint(
0,
self.height // self.spatial_scale - self.transformed_height)
x3 = np.random.randint(x1 + self.transformed_width,
self.width // self.spatial_scale)
y3 = np.random.randint(y1 + self.transformed_height,
self.height // self.spatial_scale)
x4 = np.random.randint(
0,
self.width // self.spatial_scale - self.transformed_width)
y4 = np.random.randint(y1 + self.transformed_height,
self.height // self.spatial_scale)
roi = [x1, y1, x2, y2, x3, y3, x4, y4]
rois.append(roi)
self.rois_num = len(rois)
self.rois = np.array(rois).astype("float32")
def setUp(self):
self.op_type = "roi_perspective_transform"
self.set_data()
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out')
if __name__ == '__main__':
unittest.main()
...@@ -34,7 +34,7 @@ class TestTransposeOp(OpTest): ...@@ -34,7 +34,7 @@ class TestTransposeOp(OpTest):
self.check_output(no_check_set=['XShape']) self.check_output(no_check_set=['XShape'])
def test_check_grad(self): def test_check_grad(self):
self.check_grad(['X'], 'Out', sum_outputs=['Out']) self.check_grad(['X'], 'Out')
def initTestCase(self): def initTestCase(self):
self.shape = (3, 4) self.shape = (3, 4)
......
...@@ -21,13 +21,12 @@ import paddle ...@@ -21,13 +21,12 @@ import paddle
def delete_ops(block, ops): def delete_ops(block, ops):
try: for op in ops:
start = list(block.ops).index(ops[0]) try:
end = list(block.ops).index(ops[-1]) idx = list(block.ops).index(op)
[block._remove_op(start) for _ in six.moves.range(end - start + 1)] block._remove_op(idx)
except Exception as e: except Exception as e:
raise e print(e)
block.program._sync_with_cpp()
def find_op_by_input_arg(block, arg_name): def find_op_by_input_arg(block, arg_name):
...@@ -37,10 +36,18 @@ def find_op_by_input_arg(block, arg_name): ...@@ -37,10 +36,18 @@ def find_op_by_input_arg(block, arg_name):
return -1 return -1
def find_op_by_output_arg(block, arg_name): def find_op_by_output_arg(block, arg_name, reverse=False):
for index, op in enumerate(block.ops): if reverse:
if arg_name in op.output_arg_names: pos = len(block.ops) - 1
return index while pos >= 0:
op = block.ops[pos]
if arg_name in op.output_arg_names:
return pos
pos -= 1
else:
for index, op in enumerate(block.ops):
if arg_name in op.output_arg_names:
return index
return -1 return -1
......
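The new `reverse=True` path in `find_op_by_output_arg` matters when a gradient variable is written more than once (for example by the grad op and then by a later clip or scale op): the send op has to be inserted after the *last* writer. A framework-free sketch of that idea, assuming each op exposes `output_arg_names`:

```python
def find_last_writer(ops, arg_name):
    # scan from the tail so the most recent writer wins
    for pos in range(len(ops) - 1, -1, -1):
        if arg_name in ops[pos].output_arg_names:
            return pos
    return -1
```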
...@@ -50,6 +50,15 @@ OP_ROLE_VAR_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleVarAttrName() ...@@ -50,6 +50,15 @@ OP_ROLE_VAR_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleVarAttrName()
RPC_OP_ROLE_ATTR_NAME = op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName( RPC_OP_ROLE_ATTR_NAME = op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName(
) )
RPC_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.RPC RPC_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.RPC
DIST_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.Dist
LR_SCHED_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.LRSched
PRINT_LOG = False
def log(*args):
if PRINT_LOG:
print(args)
class VarBlock: class VarBlock:
...@@ -127,6 +136,7 @@ class DistributeTranspilerConfig(object): ...@@ -127,6 +136,7 @@ class DistributeTranspilerConfig(object):
slice_var_up = True slice_var_up = True
split_method = None split_method = None
min_block_size = 8192 min_block_size = 8192
print_log = False
class DistributeTranspiler(object): class DistributeTranspiler(object):
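A minimal sketch of enabling the new `print_log` switch, assuming the config class and transpiler are importable from `paddle.fluid.transpiler` as in current releases; the endpoint arguments are elided:

```python
from paddle.fluid.transpiler import DistributeTranspiler, DistributeTranspilerConfig

config = DistributeTranspilerConfig()
config.print_log = True  # turn on the transpiler-side logging added above
t = DistributeTranspiler(config=config)
# t.transpile(trainer_id=0, pservers="127.0.0.1:6174", trainers=1)
```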
...@@ -174,6 +184,9 @@ class DistributeTranspiler(object): ...@@ -174,6 +184,9 @@ class DistributeTranspiler(object):
if self.config.split_method is None: if self.config.split_method is None:
self.config.split_method = RoundRobin self.config.split_method = RoundRobin
global PRINT_LOG
if self.config.print_log:
PRINT_LOG = True
assert (self.config.min_block_size >= 8192) assert (self.config.min_block_size >= 8192)
assert (self.config.split_method.__bases__[0] == PSDispatcher) assert (self.config.split_method.__bases__[0] == PSDispatcher)
...@@ -257,12 +270,12 @@ class DistributeTranspiler(object): ...@@ -257,12 +270,12 @@ class DistributeTranspiler(object):
splited_grad_varname = grad_varname splited_grad_varname = grad_varname
if len(splited_vars) == 1: if len(splited_vars) == 1:
splited_grad_varname = splited_vars[0].name splited_grad_varname = splited_vars[0].name
index = find_op_by_output_arg(program.global_block(), index = find_op_by_output_arg(
splited_grad_varname) program.global_block(), splited_grad_varname, reverse=True)
elif len(splited_vars) > 1: elif len(splited_vars) > 1:
orig_var = program.global_block().vars[splited_grad_varname] orig_var = program.global_block().vars[splited_grad_varname]
index = find_op_by_output_arg(program.global_block(), index = find_op_by_output_arg(
splited_grad_varname) program.global_block(), splited_grad_varname, reverse=True)
self._insert_split_op(program, orig_var, index, splited_vars) self._insert_split_op(program, orig_var, index, splited_vars)
index += 1 index += 1
else: else:
...@@ -301,7 +314,7 @@ class DistributeTranspiler(object): ...@@ -301,7 +314,7 @@ class DistributeTranspiler(object):
self.grad_name_to_send_dummy_out[ self.grad_name_to_send_dummy_out[
self.table_name] = program.global_block().create_var( self.table_name] = program.global_block().create_var(
name=framework.generate_control_dev_var_name()) name=framework.generate_control_dev_var_name())
input_deps = self.grad_name_to_send_dummy_out.values() input_deps = list(self.grad_name_to_send_dummy_out.values())
program.global_block().append_op( program.global_block().append_op(
type="send_barrier", type="send_barrier",
...@@ -377,7 +390,10 @@ class DistributeTranspiler(object): ...@@ -377,7 +390,10 @@ class DistributeTranspiler(object):
type="concat", type="concat",
inputs={"X": splited_var}, inputs={"X": splited_var},
outputs={"Out": [orig_param]}, outputs={"Out": [orig_param]},
attrs={"axis": 0}) attrs={
"axis": 0,
RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE
})
self._get_trainer_startup_program(recv_vars=recv_vars, eplist=eplist) self._get_trainer_startup_program(recv_vars=recv_vars, eplist=eplist)
...@@ -496,9 +512,9 @@ class DistributeTranspiler(object): ...@@ -496,9 +512,9 @@ class DistributeTranspiler(object):
# NOTE: assume blocks of the same variable is not distributed # NOTE: assume blocks of the same variable is not distributed
# on the same pserver, only change param/grad varnames for # on the same pserver, only change param/grad varnames for
# trainers to fetch. # trainers to fetch.
sys.stderr.write("get_pserver_program() is deprecated, call\ sys.stderr.write("get_pserver_program() is deprecated, call \
get_pserver_programs() to get pserver main and startup\ get_pserver_programs() to get pserver main and startup \
in a single call.") in a single call.")
# step1 # step1
pserver_program = Program() pserver_program = Program()
pserver_program.random_seed = self.origin_program.random_seed pserver_program.random_seed = self.origin_program.random_seed
...@@ -615,22 +631,31 @@ class DistributeTranspiler(object): ...@@ -615,22 +631,31 @@ class DistributeTranspiler(object):
for idx, opt_op in enumerate(opt_op_on_pserver): for idx, opt_op in enumerate(opt_op_on_pserver):
per_opt_block = pserver_program._create_block(pre_block_idx) per_opt_block = pserver_program._create_block(pre_block_idx)
optimize_blocks.append(per_opt_block) optimize_blocks.append(per_opt_block)
optimize_target_param_name = opt_op.attr(OP_ROLE_VAR_ATTR_NAME)[0]
# append grad merging ops before clip and weight decay # append grad merging ops before clip and weight decay
# cases may like: # e.g. merge grad -> L2Decay op -> clip op -> optimize
# L2Decay op -> clip op -> optimize merged_var = None
for _, op in enumerate(self.optimize_ops): for _, op in enumerate(self.optimize_ops):
# find the origin @GRAD var before clipping # find the origin grad var before clipping/L2Decay,
grad_varname_for_block = __op_have_grad_input__(op) # merged_var should be the input var name of L2Decay
if ufind.is_connected(op, opt_op) and grad_varname_for_block: grad_varname_for_block = op.attr(OP_ROLE_VAR_ATTR_NAME)[1]
if op.attr(OP_ROLE_VAR_ATTR_NAME)[
0] == optimize_target_param_name:
merged_var = self._append_pserver_grad_merge_ops( merged_var = self._append_pserver_grad_merge_ops(
per_opt_block, grad_varname_for_block, endpoint, per_opt_block, grad_varname_for_block, endpoint,
grad_to_block_id, self.origin_program) grad_to_block_id, self.origin_program)
break # append optimize op once then append other ops. if merged_var:
for _, op in enumerate(self.optimize_ops): break # append optimize op once then append other ops.
# optimizer is connected to itself if merged_var:
if ufind.is_connected(op, opt_op) and op not in global_ops: for _, op in enumerate(self.optimize_ops):
__append_optimize_op__(op, per_opt_block, grad_to_block_id, # optimizer is connected to itself
merged_var, lr_ops) if op.attr(OP_ROLE_VAR_ATTR_NAME)[0] == optimize_target_param_name and \
op not in global_ops:
log("append opt op: ", op.type, op.input_arg_names,
merged_var)
__append_optimize_op__(op, per_opt_block,
grad_to_block_id, merged_var,
lr_ops)
# dedup grad to ids list # dedup grad to ids list
grad_to_block_id = list(set(grad_to_block_id)) grad_to_block_id = list(set(grad_to_block_id))
...@@ -726,17 +751,17 @@ class DistributeTranspiler(object): ...@@ -726,17 +751,17 @@ class DistributeTranspiler(object):
Returns: Returns:
Program: parameter server side startup program. Program: parameter server side startup program.
""" """
sys.stderr.write("get_startup_program() is deprecated, call\ sys.stderr.write("get_startup_program() is deprecated, call \
get_pserver_programs() to get pserver main and startup\ get_pserver_programs() to get pserver main and startup \
in a single call.") in a single call.")
if pserver_program != None: if pserver_program != None:
sys.stderr.write("passing pserver_program to get_startup_program()\ sys.stderr.write("passing pserver_program to get_startup_program() \
is deprecated, you can use new API get_pserver_programs() to\ is deprecated, you can use new API get_pserver_programs() to \
get both pserver main program and startup program.") get both pserver main program and startup program.")
if startup_program != None: if startup_program != None:
sys.stderr.write("passing startup_program to get_startup_program()\ sys.stderr.write("passing startup_program to get_startup_program() \
is deprecated, use fluid.program_guard() or pass this argument\ is deprecated, use fluid.program_guard() or pass this argument \
to transpile() call.") to transpile() call.")
s_prog = Program() s_prog = Program()
orig_s_prog = self.startup_program orig_s_prog = self.startup_program
...@@ -1302,7 +1327,10 @@ class DistributeTranspiler(object): ...@@ -1302,7 +1327,10 @@ class DistributeTranspiler(object):
type="split_selected_rows", type="split_selected_rows",
inputs={"X": orig_var}, inputs={"X": orig_var},
outputs={"Out": splited_vars}, outputs={"Out": splited_vars},
attrs={"height_sections": height_sections}) attrs={
"height_sections": height_sections,
RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE
})
elif orig_var.type == core.VarDesc.VarType.LOD_TENSOR: elif orig_var.type == core.VarDesc.VarType.LOD_TENSOR:
sections = [] sections = []
for v in splited_vars: for v in splited_vars:
...@@ -1312,8 +1340,10 @@ class DistributeTranspiler(object): ...@@ -1312,8 +1340,10 @@ class DistributeTranspiler(object):
type="split_byref", type="split_byref",
inputs={"X": orig_var}, inputs={"X": orig_var},
outputs={"Out": splited_vars}, outputs={"Out": splited_vars},
attrs={"sections": sections} # assume split evenly attrs={
) "sections": sections,
RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE
})
else: else:
AssertionError("Variable type should be in set " AssertionError("Variable type should be in set "
"[LOD_TENSOR, SELECTED_ROWS]") "[LOD_TENSOR, SELECTED_ROWS]")
...@@ -1381,15 +1411,15 @@ class DistributeTranspiler(object): ...@@ -1381,15 +1411,15 @@ class DistributeTranspiler(object):
if not grad_block: if not grad_block:
# do not append this op if current endpoint # do not append this op if current endpoint
# is not dealing with this grad block # is not dealing with this grad block
return return None
orig_varname, block_name, trainer_name = self._get_varname_parts( orig_varname, block_name, trainer_name = self._get_varname_parts(
grad_block.name) grad_block.name)
if block_name: if block_name:
merged_var_name = '.'.join([orig_varname, block_name]) merged_var_name = '.'.join([orig_varname, block_name])
else: else:
merged_var_name = orig_varname merged_var_name = orig_varname
merged_var = \
pserver_block.vars[merged_var_name] merged_var = pserver_block.vars[merged_var_name]
grad_to_block_id.append(merged_var.name + ":" + str(optimize_block.idx)) grad_to_block_id.append(merged_var.name + ":" + str(optimize_block.idx))
if self.sync_mode and self.trainer_num > 1: if self.sync_mode and self.trainer_num > 1:
vars2merge = [] vars2merge = []
...@@ -1473,7 +1503,6 @@ class DistributeTranspiler(object): ...@@ -1473,7 +1503,6 @@ class DistributeTranspiler(object):
outputs = self._get_output_map_from_op( outputs = self._get_output_map_from_op(
self.origin_program.global_block().vars, opt_op) self.origin_program.global_block().vars, opt_op)
outputs["ParamOut"] = new_inputs["Param"] outputs["ParamOut"] = new_inputs["Param"]
optimize_block.append_op( optimize_block.append_op(
type=opt_op.type, type=opt_op.type,
inputs=new_inputs, inputs=new_inputs,
...@@ -1618,6 +1647,16 @@ class DistributeTranspiler(object): ...@@ -1618,6 +1647,16 @@ class DistributeTranspiler(object):
return iomap return iomap
def _get_lr_ops(self): def _get_lr_ops(self):
lr_ops = []
block = self.origin_program.global_block()
for op in block.ops:
if int(op.attr(RPC_OP_ROLE_ATTR_NAME)) == int(
LR_SCHED_OP_ROLE_ATTR_VALUE):
lr_ops.append(op)
log("append lr op: ", op.type)
return lr_ops
def _get_lr_ops_deprecated(self):
lr_ops = [] lr_ops = []
# find learning rate variables by optimize op # find learning rate variables by optimize op
lr_vars = set() lr_vars = set()
...@@ -1670,20 +1709,21 @@ class DistributeTranspiler(object): ...@@ -1670,20 +1709,21 @@ class DistributeTranspiler(object):
block = self.origin_program.global_block() block = self.origin_program.global_block()
opt_ops = [] opt_ops = []
params_grads = [] params_grads = []
# tmp set to dedup
optimize_params = set()
origin_var_dict = self.origin_program.global_block().vars origin_var_dict = self.origin_program.global_block().vars
for op in block.ops: for op in block.ops:
if self._is_opt_role_op(op): if self._is_opt_role_op(op):
opt_ops.append(op) opt_ops.append(op)
# HACK(wuyi): if we find grad vars from input of optimize if op.attr(OP_ROLE_VAR_ATTR_NAME):
# ops, we may get the output of clip op. Use syntax "@GRAD" param_name = op.attr(OP_ROLE_VAR_ATTR_NAME)[0]
# and op_role_var to get the pair. grad_name = op.attr(OP_ROLE_VAR_ATTR_NAME)[1]
for input_name in op.input_arg_names: if not param_name in optimize_params:
if input_name.find("@GRAD") != -1 and \ optimize_params.add(param_name)
op.attr(RPC_OP_ROLE_ATTR_NAME): log("adding param_grad pair: ", param_name, grad_name)
param_name = op.attr(OP_ROLE_VAR_ATTR_NAME)[0]
params_grads.append([ params_grads.append([
origin_var_dict[param_name], origin_var_dict[param_name],
origin_var_dict[input_name] origin_var_dict[grad_name]
]) ])
else: else:
pass pass
......
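The last hunk above replaces the old "@GRAD"-name matching with the op's `op_role_var` attribute when collecting parameter/gradient pairs. A framework-free sketch of that logic, with the attribute name and the `[param_name, grad_name]` layout assumed from the diff:

```python
def collect_param_grad_pairs(opt_ops, var_dict):
    # op_role_var is assumed to hold [param_name, grad_name] on optimize ops
    seen, pairs = set(), []
    for op in opt_ops:
        role_var = op.attr("op_role_var")
        if role_var:
            param_name, grad_name = role_var[0], role_var[1]
            if param_name not in seen:  # dedup by parameter
                seen.add(param_name)
                pairs.append([var_dict[param_name], var_dict[grad_name]])
    return pairs
```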
...@@ -14,10 +14,10 @@ ...@@ -14,10 +14,10 @@
from __future__ import print_function from __future__ import print_function
from collections import defaultdict from collections import defaultdict, OrderedDict, Callable
from .. import core from .. import core
from ... import compat as cpt from ... import compat as cpt
from ..framework import Program, default_main_program, Parameter from ..framework import Program, default_main_program, Parameter, Variable
from ..backward import _rename_arg_ from ..backward import _rename_arg_
from functools import reduce from functools import reduce
from six.moves import range from six.moves import range
...@@ -113,8 +113,10 @@ class ControlFlowGraph(object): ...@@ -113,8 +113,10 @@ class ControlFlowGraph(object):
def _fill_pool(self, i, is_forward): def _fill_pool(self, i, is_forward):
block_desc = self._ops[i].block() block_desc = self._ops[i].block()
in_diff, _ = self._get_diff(self._live_in[i], self._live_out[i]) in_diff, _ = self._get_diff(self._live_in[i], self._live_out[i])
# NOTE: must sort the in_diff set for cases that get different cache var.
# FIXME(typhoonzero): maybe use a "sorted set" is better than this.
can_optimize = [ can_optimize = [
x for x in in_diff x for x in sorted(list(in_diff))
if self._check_var_validity(block_desc, x, is_forward) if self._check_var_validity(block_desc, x, is_forward)
] ]
if can_optimize: if can_optimize:
...@@ -220,8 +222,9 @@ class ControlFlowGraph(object): ...@@ -220,8 +222,9 @@ class ControlFlowGraph(object):
block_desc = op.block() block_desc = op.block()
is_forward = i < self._forward_num is_forward = i < self._forward_num
if self.pool: if self.pool:
# NOTE: must sort the in_diff set for cases that get different cache var.
defs_can_optimize = [ defs_can_optimize = [
x for x in self._defs[i] x for x in sorted(list(self._defs[i]))
if self._check_var_validity(block_desc, x, is_forward) if self._check_var_validity(block_desc, x, is_forward)
] ]
out_pair = [ out_pair = [
...@@ -271,6 +274,8 @@ class ControlFlowGraph(object): ...@@ -271,6 +274,8 @@ class ControlFlowGraph(object):
self._program.block(block_desc.id).var(cpt.to_text( self._program.block(block_desc.id).var(cpt.to_text(
x)).desc = self._find_var(block_desc, cache_var, x)).desc = self._find_var(block_desc, cache_var,
is_forward) is_forward)
self._program.block(block_desc.id).vars[cpt.to_text(x)] = \
Variable(self._program.block(block_desc.id), name=cpt.to_text(x))
self._update_graph(x, cache_var, begin_idx=i) self._update_graph(x, cache_var, begin_idx=i)
break break
self._fill_pool(i, is_forward) self._fill_pool(i, is_forward)
......
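A tiny illustration (not from the diff) of why the candidate sets in the memory-optimize pass are now sorted: Python set iteration order is not stable across processes, so "pick the first valid cache variable" could choose a different variable on different runs; sorting the names first makes the reuse decision deterministic.

```python
candidates = {"fc_0.tmp_1", "fc_0.tmp_0", "relu_0.tmp_0"}
print(sorted(candidates)[0])  # always 'fc_0.tmp_0'
```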