提交 3821fc39 编写于 作者: Y Yihua Xu

Merge branch 'develop' into develop_4f71a6ee_conv3d_bias_fusion_mkldnn_impl

test=develop
...@@ -32,6 +32,8 @@ IF(NOT ${WITH_NGRAPH}) ...@@ -32,6 +32,8 @@ IF(NOT ${WITH_NGRAPH})
return() return()
ENDIF() ENDIF()
INCLUDE(GNUInstallDirs)
INCLUDE(ExternalProject) INCLUDE(ExternalProject)
SET(NGRAPH_PROJECT "extern_ngraph") SET(NGRAPH_PROJECT "extern_ngraph")
...@@ -40,10 +42,14 @@ SET(NGRAPH_GIT_TAG "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0") ...@@ -40,10 +42,14 @@ SET(NGRAPH_GIT_TAG "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0")
SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph) SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph)
SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph) SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph)
SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include) SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include)
SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
SET(NGRAPH_SHARED_LIB_NAME libngraph.so.${NGRAPH_VERSION}) SET(NGRAPH_SHARED_LIB_NAME libngraph.so.${NGRAPH_VERSION})
SET(NGRAPH_CPU_LIB_NAME libcpu_backend.so) SET(NGRAPH_CPU_LIB_NAME libcpu_backend.so)
SET(NGRAPH_TBB_LIB_NAME libtbb.so.2) SET(NGRAPH_TBB_LIB_NAME libtbb.so.2)
SET(NGRAPH_GIT_REPO "https://github.com/NervanaSystems/ngraph.git") SET(NGRAPH_GIT_REPO "https://github.com/NervanaSystems/ngraph.git")
SET(NGRAPH_SHARED_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME})
SET(NGRAPH_CPU_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME})
SET(NGRAPH_TBB_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME})
ExternalProject_Add( ExternalProject_Add(
${NGRAPH_PROJECT} ${NGRAPH_PROJECT}
...@@ -63,18 +69,6 @@ ExternalProject_Add( ...@@ -63,18 +69,6 @@ ExternalProject_Add(
CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib
) )
if(UNIX AND NOT APPLE)
include(GNUInstallDirs)
SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
else()
SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/lib)
endif()
MESSAGE(STATUS "nGraph lib will be installed at: ${NGRAPH_LIB_DIR}")
SET(NGRAPH_SHARED_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME})
SET(NGRAPH_CPU_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME})
SET(NGRAPH_TBB_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME})
# Workaround for nGraph expecting mklml to be in mkldnn install directory. # Workaround for nGraph expecting mklml to be in mkldnn install directory.
ExternalProject_Add_Step( ExternalProject_Add_Step(
${NGRAPH_PROJECT} ${NGRAPH_PROJECT}
......
...@@ -129,6 +129,15 @@ if (WITH_MKLDNN) ...@@ -129,6 +129,15 @@ if (WITH_MKLDNN)
) )
endif () endif ()
if (WITH_NGRAPH)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/ngraph")
copy(ngraph_lib
SRCS ${NGRAPH_INC_DIR} ${NGRAPH_LIB_DIR}
DSTS ${dst_dir} ${dst_dir}
DEPS ngraph
)
endif ()
if (NOT WIN32) if (NOT WIN32)
if (NOT MOBILE_INFERENCE AND NOT RPI) if (NOT MOBILE_INFERENCE AND NOT RPI)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy") set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
......
...@@ -166,6 +166,8 @@ function(op_library TARGET) ...@@ -166,6 +166,8 @@ function(op_library TARGET)
# Append first implemented MKLDNN activation operator # Append first implemented MKLDNN activation operator
if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op") if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n")
elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);\n")
else() else()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
endif() endif()
......
...@@ -182,7 +182,7 @@ paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, ...@@ -182,7 +182,7 @@ paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None,
paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)) paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name'], varargs=None, keywords=None, defaults=(-100, None))
paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
...@@ -194,6 +194,8 @@ paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=Non ...@@ -194,6 +194,8 @@ paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=Non
paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)) paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)) paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
...@@ -299,6 +301,7 @@ paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'i ...@@ -299,6 +301,7 @@ paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'i
paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None)) paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None))
paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'class_num', 'ignore_thresh', 'loss_weight_xy', 'loss_weight_wh', 'loss_weight_conf_target', 'loss_weight_conf_notarget', 'loss_weight_class', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None))
paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)) paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1))
paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
...@@ -419,3 +422,17 @@ paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None ...@@ -419,3 +422,17 @@ paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None
paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable
paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope
paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable
paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None)
paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None)
paddle.reader.compose ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None)
paddle.reader.chain ArgSpec(args=[], varargs='readers', keywords=None, defaults=None)
paddle.reader.shuffle ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None)
paddle.reader.firstn ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None)
paddle.reader.xmap_readers ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,))
paddle.reader.PipeReader.__init__ ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain'))
paddle.reader.PipeReader.get_line ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n'))
paddle.reader.multiprocess_reader ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000))
paddle.reader.Fake.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.reader.creator.np_array ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
paddle.reader.creator.text_file ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None)
paddle.reader.creator.recordio ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,))
...@@ -118,8 +118,9 @@ cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto) ...@@ -118,8 +118,9 @@ cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context) cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context) cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context)
cc_library(op_kernel_type SRCS op_kernel_type.cc DEPS device_context place)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler transfer_scope_cache) shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
...@@ -127,8 +128,9 @@ cc_library(version SRCS version.cc) ...@@ -127,8 +128,9 @@ cc_library(version SRCS version.cc)
cc_test(version_test SRCS version_test.cc DEPS version) cc_test(version_test SRCS version_test.cc DEPS version)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto)
if(NOT WIN32) if(NOT WIN32)
cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler) shape_inference data_transform lod_tensor profiler)
endif(NOT WIN32) endif(NOT WIN32)
...@@ -190,7 +192,7 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry ...@@ -190,7 +192,7 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto) cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto op_kernel_type)
cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc) cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
cc_test(tuple_test SRCS tuple_test.cc ) cc_test(tuple_test SRCS tuple_test.cc )
......
...@@ -33,11 +33,7 @@ void DataFeed::AddFeedVar(Variable* var, const std::string& name) { ...@@ -33,11 +33,7 @@ void DataFeed::AddFeedVar(Variable* var, const std::string& name) {
CheckInit(); CheckInit();
for (size_t i = 0; i < use_slots_.size(); ++i) { for (size_t i = 0; i < use_slots_.size(); ++i) {
if (name == use_slots_[i]) { if (name == use_slots_[i]) {
if (use_slots_is_dense_[i]) { feed_vec_[i] = var->GetMutable<LoDTensor>();
feed_vec_[i] = MixTensor(var->GetMutable<Tensor>());
} else {
feed_vec_[i] = MixTensor(var->GetMutable<LoDTensor>());
}
} }
} }
} }
...@@ -301,6 +297,7 @@ bool MultiSlotDataFeed::ParseOneInstance(std::vector<MultiSlotType>* instance) { ...@@ -301,6 +297,7 @@ bool MultiSlotDataFeed::ParseOneInstance(std::vector<MultiSlotType>* instance) {
"the data, please check if the data contains unresolvable " "the data, please check if the data contains unresolvable "
"characters.\nplease check this error line: %s", "characters.\nplease check this error line: %s",
str); str);
if (idx != -1) { if (idx != -1) {
(*instance)[idx].Init(all_slots_type_[i]); (*instance)[idx].Init(all_slots_type_[i]);
if ((*instance)[idx].GetType()[0] == 'f') { // float if ((*instance)[idx].GetType()[0] == 'f') { // float
...@@ -337,6 +334,7 @@ void MultiSlotDataFeed::AddInstanceToInsVec( ...@@ -337,6 +334,7 @@ void MultiSlotDataFeed::AddInstanceToInsVec(
(*ins_vec)[i].InitOffset(); (*ins_vec)[i].InitOffset();
} }
} }
for (size_t i = 0; i < instance.size(); ++i) { for (size_t i = 0; i < instance.size(); ++i) {
(*ins_vec)[i].AddIns(instance[i]); (*ins_vec)[i].AddIns(instance[i]);
} }
...@@ -348,36 +346,25 @@ void MultiSlotDataFeed::PutToFeedVec( ...@@ -348,36 +346,25 @@ void MultiSlotDataFeed::PutToFeedVec(
const auto& type = ins_vec[i].GetType(); const auto& type = ins_vec[i].GetType();
const auto& offset = ins_vec[i].GetOffset(); const auto& offset = ins_vec[i].GetOffset();
int total_instance = static_cast<int>(offset.back()); int total_instance = static_cast<int>(offset.back());
if (type[0] == 'f') { // float if (type[0] == 'f') { // float
const auto& feasign = ins_vec[i].GetFloatData(); const auto& feasign = ins_vec[i].GetFloatData();
if (feed_vec_[i].IsDense()) { float* tensor_ptr = feed_vec_[i]->mutable_data<float>(
int size_in_each_batch = total_instance / batch_size_; {total_instance, 1}, platform::CPUPlace());
float* tensor_ptr = feed_vec_[i].GetTensor()->mutable_data<float>( memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float));
{batch_size_, size_in_each_batch}, platform::CPUPlace());
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float));
} else {
float* tensor_ptr = feed_vec_[i].GetLoDTensor()->mutable_data<float>(
{total_instance, 1}, platform::CPUPlace());
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float));
LoD data_lod{offset};
feed_vec_[i].GetLoDTensor()->set_lod(data_lod);
}
} else if (type[0] == 'u') { // uint64 } else if (type[0] == 'u') { // uint64
// no uint64_t type in paddlepaddle // no uint64_t type in paddlepaddle
const auto& feasign = ins_vec[i].GetUint64Data(); const auto& feasign = ins_vec[i].GetUint64Data();
if (feed_vec_[i].IsDense()) { int64_t* tensor_ptr = feed_vec_[i]->mutable_data<int64_t>(
int size_in_each_batch = total_instance / batch_size_; {total_instance, 1}, platform::CPUPlace());
int64_t* tensor_ptr = feed_vec_[i].GetTensor()->mutable_data<int64_t>( memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t));
{batch_size_, size_in_each_batch}, platform::CPUPlace()); }
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t));
} else { LoD data_lod{offset};
int64_t* tensor_ptr = feed_vec_[i]->set_lod(data_lod);
feed_vec_[i].GetLoDTensor()->mutable_data<int64_t>( if (use_slots_is_dense_[i]) {
{total_instance, 1}, platform::CPUPlace()); int dim = total_instance / batch_size_;
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t)); feed_vec_[i]->Resize({batch_size_, dim});
LoD data_lod{offset};
feed_vec_[i].GetLoDTensor()->set_lod(data_lod);
}
} }
} }
} }
......
...@@ -30,35 +30,6 @@ limitations under the License. */ ...@@ -30,35 +30,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
// Pack Tensor type and LoDTensor type into MixTensor type, in order
// to record either Tensor or LoDTensor information at the same time.
class MixTensor {
public:
MixTensor() {}
explicit MixTensor(LoDTensor* lodtensor) {
is_dense_ = false;
lodtensor_ = lodtensor;
}
explicit MixTensor(Tensor* tensor) {
is_dense_ = true;
tensor_ = tensor;
}
bool IsDense() { return is_dense_; }
LoDTensor* GetLoDTensor() {
PADDLE_ENFORCE(!is_dense_, "Let a dense var return a LoDTensor ptr.");
return lodtensor_;
}
Tensor* GetTensor() {
PADDLE_ENFORCE(is_dense_, "Let a sparse var return a Tensor ptr.");
return tensor_;
}
private:
bool is_dense_;
LoDTensor* lodtensor_;
Tensor* tensor_;
};
// DataFeed is the base virtual class for all ohther DataFeeds. // DataFeed is the base virtual class for all ohther DataFeeds.
// It is used to read files and parse the data for subsequent trainer. // It is used to read files and parse the data for subsequent trainer.
// Example: // Example:
...@@ -133,7 +104,7 @@ class DataFeed { ...@@ -133,7 +104,7 @@ class DataFeed {
use_slots_index_; // -1: not used; >=0: the index of use_slots_ use_slots_index_; // -1: not used; >=0: the index of use_slots_
// The data read by DataFeed will be stored here // The data read by DataFeed will be stored here
std::vector<MixTensor> feed_vec_; std::vector<LoDTensor*> feed_vec_;
// the batch size defined by user // the batch size defined by user
int default_batch_size_; int default_batch_size_;
......
...@@ -152,19 +152,13 @@ void GetElemSetFromReader(std::vector<MultiTypeSet>* reader_elem_set, ...@@ -152,19 +152,13 @@ void GetElemSetFromReader(std::vector<MultiTypeSet>* reader_elem_set,
const auto& multi_slot_desc = data_feed_desc.multi_slot_desc(); const auto& multi_slot_desc = data_feed_desc.multi_slot_desc();
std::map<std::string, const paddle::framework::LoDTensor*> std::map<std::string, const paddle::framework::LoDTensor*>
lodtensor_targets; lodtensor_targets;
std::map<std::string, const paddle::framework::Tensor*> tensor_targets;
for (int i = 0; i < multi_slot_desc.slots_size(); ++i) { for (int i = 0; i < multi_slot_desc.slots_size(); ++i) {
const auto& slot = multi_slot_desc.slots(i); const auto& slot = multi_slot_desc.slots(i);
if (slot.is_used()) { if (slot.is_used()) {
const auto& name = slot.name(); const auto& name = slot.name();
readers[idx]->AddFeedVar(scope->Var(name), name); readers[idx]->AddFeedVar(scope->Var(name), name);
if (slot.is_dense()) { lodtensor_targets[name] =
tensor_targets[name] = &scope->FindVar(name)->Get<paddle::framework::LoDTensor>();
&scope->FindVar(name)->Get<paddle::framework::Tensor>();
} else {
lodtensor_targets[name] =
&scope->FindVar(name)->Get<paddle::framework::LoDTensor>();
}
} }
} }
readers[idx]->Start(); readers[idx]->Start();
...@@ -175,8 +169,9 @@ void GetElemSetFromReader(std::vector<MultiTypeSet>* reader_elem_set, ...@@ -175,8 +169,9 @@ void GetElemSetFromReader(std::vector<MultiTypeSet>* reader_elem_set,
if (!slot.is_used()) { if (!slot.is_used()) {
continue; continue;
} }
const paddle::framework::LoDTensor* tens =
lodtensor_targets[slot.name()];
if (slot.is_dense()) { // dense branch if (slot.is_dense()) { // dense branch
const paddle::framework::Tensor* tens = tensor_targets[slot.name()];
if (slot.type() == "uint64") { if (slot.type() == "uint64") {
const int64_t* data = tens->data<int64_t>(); const int64_t* data = tens->data<int64_t>();
int batch_size = tens->dims()[0]; int batch_size = tens->dims()[0];
...@@ -202,8 +197,6 @@ void GetElemSetFromReader(std::vector<MultiTypeSet>* reader_elem_set, ...@@ -202,8 +197,6 @@ void GetElemSetFromReader(std::vector<MultiTypeSet>* reader_elem_set,
PADDLE_THROW("Error type in proto file."); PADDLE_THROW("Error type in proto file.");
} }
} else { // sparse branch } else { // sparse branch
const paddle::framework::LoDTensor* tens =
lodtensor_targets[slot.name()];
if (slot.type() == "uint64") { if (slot.type() == "uint64") {
const int64_t* data = tens->data<int64_t>(); const int64_t* data = tens->data<int64_t>();
for (size_t i = 0; i < tens->NumElements(); ++i) { for (size_t i = 0; i < tens->NumElements(); ++i) {
......
...@@ -48,7 +48,14 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, ...@@ -48,7 +48,14 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
void AllReduceOpHandle::RunImpl() { void AllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second); platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
// FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR,
// this is a distributed or inter-process call, find a better way.
#ifdef PADDLE_WITH_CUDA
if (NoDummyInputSize() == 1 &&
local_scopes_[0]->FindLocalVar(NCCL_ID_VARNAME) == nullptr) {
#else
if (NoDummyInputSize() == 1) { if (NoDummyInputSize() == 1) {
#endif
return; // No need to all reduce when GPU count = 1; return; // No need to all reduce when GPU count = 1;
} else { } else {
// Wait input done // Wait input done
......
...@@ -62,6 +62,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { ...@@ -62,6 +62,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
auto multi_devices_pass = AppendPass("multi_devices_pass"); auto multi_devices_pass = AppendPass("multi_devices_pass");
multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy", multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
&strategy_); &strategy_);
multi_devices_pass->Set<int>("num_trainers",
new int(strategy_.num_trainers_));
// Add a graph print pass to record a graph with device info. // Add a graph print pass to record a graph with device info.
if (!strategy_.debug_graphviz_path_.empty()) { if (!strategy_.debug_graphviz_path_.empty()) {
......
...@@ -133,6 +133,7 @@ static const char kPlaces[] = "places"; ...@@ -133,6 +133,7 @@ static const char kPlaces[] = "places";
static const char kParams[] = "params"; static const char kParams[] = "params";
static const char kLocalScopes[] = "local_scopes"; static const char kLocalScopes[] = "local_scopes";
static const char kStrategy[] = "strategy"; static const char kStrategy[] = "strategy";
static const char kNumTrainers[] = "num_trainers";
void MultiDevSSAGraphBuilder::Init() const { void MultiDevSSAGraphBuilder::Init() const {
all_vars_.clear(); all_vars_.clear();
...@@ -299,6 +300,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl( ...@@ -299,6 +300,8 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
auto nodes = graph->ReleaseNodes(); auto nodes = graph->ReleaseNodes();
ir::Graph &result = *graph; ir::Graph &result = *graph;
int num_trainers = Get<int>(kNumTrainers);
for (auto &node : nodes) { for (auto &node : nodes) {
if (node->IsVar() && node->Var()) { if (node->IsVar() && node->Var()) {
all_vars_.emplace(node->Name(), node->Var()); all_vars_.emplace(node->Name(), node->Var());
...@@ -383,7 +386,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl( ...@@ -383,7 +386,7 @@ std::unique_ptr<ir::Graph> MultiDevSSAGraphBuilder::ApplyImpl(
CreateComputationalOps(&result, node, places_.size()); CreateComputationalOps(&result, node, places_.size());
} }
if (!is_forwarding && places_.size() > 1) { if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) {
// Currently, we assume that once gradient is generated, it can be // Currently, we assume that once gradient is generated, it can be
// broadcast, and each gradient is only broadcast once. // broadcast, and each gradient is only broadcast once.
if (static_cast<bool>(boost::get<int>(node->Op()->GetAttr( if (static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
...@@ -895,4 +898,5 @@ REGISTER_PASS(multi_devices_pass, ...@@ -895,4 +898,5 @@ REGISTER_PASS(multi_devices_pass,
.RequirePassAttr(paddle::framework::details::kPlaces) .RequirePassAttr(paddle::framework::details::kPlaces)
.RequirePassAttr(paddle::framework::details::kParams) .RequirePassAttr(paddle::framework::details::kParams)
.RequirePassAttr(paddle::framework::details::kLocalScopes) .RequirePassAttr(paddle::framework::details::kLocalScopes)
.RequirePassAttr(paddle::framework::details::kStrategy); .RequirePassAttr(paddle::framework::details::kStrategy)
.RequirePassAttr(paddle::framework::details::kNumTrainers);
...@@ -32,9 +32,7 @@ enum OpInfoFillType { ...@@ -32,9 +32,7 @@ enum OpInfoFillType {
kOpProtoAndCheckerMaker = 1, kOpProtoAndCheckerMaker = 1,
kGradOpDescMaker = 2, kGradOpDescMaker = 2,
kVarTypeInference = 3, kVarTypeInference = 3,
kShapeInference = 4, kShapeInference = 4
kEstimateFlops = 5,
kUnknown = -1
}; };
template <typename T> template <typename T>
...@@ -50,10 +48,8 @@ struct OpInfoFillTypeID { ...@@ -50,10 +48,8 @@ struct OpInfoFillTypeID {
? kVarTypeInference ? kVarTypeInference
: (std::is_base_of<InferShapeBase, T>::value : (std::is_base_of<InferShapeBase, T>::value
? kShapeInference ? kShapeInference
: (std::is_base_of<EstimateFlopsBase, : static_cast<OpInfoFillType>(
T>::value -1)))));
? kEstimateFlops
: kUnknown)))));
} }
}; };
...@@ -143,16 +139,6 @@ struct OpInfoFiller<T, kShapeInference> { ...@@ -143,16 +139,6 @@ struct OpInfoFiller<T, kShapeInference> {
} }
}; };
template <typename T>
struct OpInfoFiller<T, kEstimateFlops> {
void operator()(const char* op_tpe, OpInfo* info) const {
info->estimate_flops_ = [](InferShapeContext* ctx) {
T estimate_flops;
return estimate_flops(ctx);
};
}
};
} // namespace details } // namespace details
} // namespace framework } // namespace framework
......
...@@ -97,7 +97,7 @@ void ExecutorThreadWorker::SetDevice() { ...@@ -97,7 +97,7 @@ void ExecutorThreadWorker::SetDevice() {
static unsigned concurrency_cap = std::thread::hardware_concurrency(); static unsigned concurrency_cap = std::thread::hardware_concurrency();
int thread_id = this->thread_id_; int thread_id = this->thread_id_;
if (thread_id < concurrency_cap) { if (static_cast<unsigned>(thread_id) < concurrency_cap) {
unsigned proc = thread_id; unsigned proc = thread_id;
cpu_set_t mask; cpu_set_t mask;
......
...@@ -177,14 +177,13 @@ class Graph { ...@@ -177,14 +177,13 @@ class Graph {
return nullptr; return nullptr;
} }
const ProgramDesc &program() const { return program_; }
std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
const ProgramDesc &program);
void ResolveHazard( void ResolveHazard(
const std::map<std::string, std::vector<ir::Node *>> &var_nodes); const std::map<std::string, std::vector<ir::Node *>> &var_nodes);
private: private:
std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
const ProgramDesc &program);
// This method takes ownership of `node`. // This method takes ownership of `node`.
ir::Node *AddNode(ir::Node *node) { ir::Node *AddNode(ir::Node *node) {
PADDLE_ENFORCE(node_set_.find(node) == node_set_.end()); PADDLE_ENFORCE(node_set_.find(node) == node_set_.end());
......
...@@ -38,7 +38,7 @@ std::unique_ptr<ir::Graph> IsTestPass::ApplyImpl( ...@@ -38,7 +38,7 @@ std::unique_ptr<ir::Graph> IsTestPass::ApplyImpl(
for (const Node* n : graph->Nodes()) { for (const Node* n : graph->Nodes()) {
if (n->IsOp()) { if (n->IsOp()) {
auto* op = n->Op(); auto* op = n->Op();
if (op->HasAttr("is_test")) { if (n->RuntimeHasAttr("is_test")) {
op->SetAttr("is_test", true); op->SetAttr("is_test", true);
} else if (std::find(begin(op_list), end(op_list), op->Type()) != } else if (std::find(begin(op_list), end(op_list), op->Type()) !=
end(op_list)) { end(op_list)) {
......
...@@ -104,9 +104,9 @@ TEST(IsTestPass, basic) { ...@@ -104,9 +104,9 @@ TEST(IsTestPass, basic) {
auto* op = node->Op(); auto* op = node->Op();
auto op_name = boost::get<std::string>(op->GetAttr("name")); auto op_name = boost::get<std::string>(op->GetAttr("name"));
if (op_name == "conv3") { if (op_name == "conv3") {
ASSERT_FALSE(op->HasAttr("is_test")); ASSERT_FALSE(node->RuntimeHasAttr("is_test"));
} else { } else {
ASSERT_TRUE(op->HasAttr("is_test")); ASSERT_TRUE(node->RuntimeHasAttr("is_test"));
EXPECT_TRUE(boost::get<bool>(op->GetAttr("is_test"))); EXPECT_TRUE(boost::get<bool>(op->GetAttr("is_test")));
} }
} }
......
...@@ -22,7 +22,7 @@ std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl( ...@@ -22,7 +22,7 @@ std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const { std::unique_ptr<ir::Graph> graph) const {
VLOG(3) << "Aplies MKL-DNN placement strategy."; VLOG(3) << "Aplies MKL-DNN placement strategy.";
for (const Node* n : graph->Nodes()) { for (const Node* n : graph->Nodes()) {
if (n->IsOp() && n->Op()->HasAttr("use_mkldnn")) { if (n->IsOp() && n->RuntimeHasAttr("use_mkldnn")) {
n->Op()->SetAttr("use_mkldnn", true); n->Op()->SetAttr("use_mkldnn", true);
} }
} }
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/op_info.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -24,10 +25,33 @@ constexpr char Node::kControlDepVarName[]; ...@@ -24,10 +25,33 @@ constexpr char Node::kControlDepVarName[];
const char Node::kControlDepVarName[] = "__control_var"; const char Node::kControlDepVarName[] = "__control_var";
#endif #endif
std::unique_ptr<Node> CreateNodeForTest(const std::string& name, std::unique_ptr<Node> CreateNodeForTest(const std::string &name,
Node::Type type) { Node::Type type) {
return std::unique_ptr<Node>(new Node(name, type)); return std::unique_ptr<Node>(new Node(name, type));
} }
bool Node::RuntimeHasAttr(const std::string &name) const {
if (Op()->HasAttr(name)) {
return true;
} else {
auto &op_info = OpInfoMap::Instance();
auto op_type = Op()->Type();
if (op_info.Has(op_type)) {
auto op_info_ptr = op_info.Get(op_type);
if (op_info_ptr.HasOpProtoAndChecker()) {
const proto::OpProto &proto = op_info_ptr.Proto();
for (int i = 0; i != proto.attrs_size(); ++i) {
const proto::OpProto::Attr &attr = proto.attrs(i);
if (attr.name() == name) {
return true;
}
}
}
}
}
return false;
}
} // namespace ir } // namespace ir
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -108,6 +108,18 @@ class Node { ...@@ -108,6 +108,18 @@ class Node {
Name().find(ir::Node::kControlDepVarName) != std::string::npos; Name().find(ir::Node::kControlDepVarName) != std::string::npos;
} }
// RuntimeHasAttr is different with HasAttr now.
// 1. For Op()->HasAttr(), it judges whether a stored program_desc_ has attr,
// thus, if stored program_desc_ are old which don't have an attr, a new
// library which adds the attr already will fail on this function.
// Details:
// https://github.com/PaddlePaddle/Paddle/pull/14608#issuecomment-442309087
// 2. For Op()->RuntimeHasAttr, it judges the attr in runtime to avoid above
// problem.
// TODO(luotao): Maybe we should enhance HasAttr later, instead of adding
// RuntimeHasAttr.
bool RuntimeHasAttr(const std::string& name) const;
std::vector<Node*> inputs; std::vector<Node*> inputs;
std::vector<Node*> outputs; std::vector<Node*> outputs;
......
...@@ -15,23 +15,105 @@ limitations under the License. */ ...@@ -15,23 +15,105 @@ limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH #ifdef PADDLE_WITH_NGRAPH
#include <algorithm> #include <algorithm>
#include <functional> #include <functional>
#include <vector>
#include "paddle/fluid/framework/ngraph_bridge.h" #include "paddle/fluid/framework/ngraph_bridge.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
static std::shared_ptr<ngraph::Node> GetNode(
const std::shared_ptr<OperatorBase>& op, const std::string prm,
const VariableNameMap& var_map,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto& var_names = var_map.at(prm);
PADDLE_ENFORCE_EQ(var_names.size(), 1,
"op %s prm %s expects one associated var", op->Type(), prm);
if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) {
return (*ngb_node_map)[var_names[0]];
} else {
return nullptr;
}
}
static std::shared_ptr<ngraph::Node> GetInputNode(
const std::shared_ptr<OperatorBase>& op, const std::string prm,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
return GetNode(op, prm, op->Inputs(), ngb_node_map);
}
static std::shared_ptr<ngraph::Node> GetOutputNode(
const std::shared_ptr<OperatorBase>& op, const std::string prm,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
return GetNode(op, prm, op->Outputs(), ngb_node_map);
}
static void SetOutputNode(
const std::shared_ptr<OperatorBase>& op, const std::string prm,
std::shared_ptr<ngraph::Node> node,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto& var_names = op->Outputs().at(prm);
if (var_names.size() == 1) {
(*ngb_node_map)[var_names[0]] = node;
} else if (var_names.size() == 0) {
(*ngb_node_map)[""] = node;
} else {
PADDLE_THROW("prm %s has more than 1 var_names.", prm);
}
}
static bool HasOutput(const std::shared_ptr<OperatorBase>& op,
const std::string prm) {
auto& outputs = op->Outputs();
if (outputs.find(prm) == outputs.end()) return false;
return outputs.at(prm).size() > 0;
}
template <typename T>
static void BuildBinaryNode(
const std::shared_ptr<OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto x = GetInputNode(op, "X", ngb_node_map);
auto y = GetInputNode(op, "Y", ngb_node_map);
auto out = std::make_shared<T>(x, y);
SetOutputNode(op, "Out", out, ngb_node_map);
}
template <typename T>
static void BuildUnaryNode(
const std::shared_ptr<OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto input = GetInputNode(op, "X", ngb_node_map);
auto out = std::make_shared<T>(input);
SetOutputNode(op, "Out", out, ngb_node_map);
}
std::map<std::string, std::map<std::string,
std::function<void(const std::shared_ptr<OperatorBase>&, std::function<void(const std::shared_ptr<OperatorBase>&,
std::shared_ptr<std::unordered_map< std::shared_ptr<std::unordered_map<
std::string, std::shared_ptr<ngraph::Node>>>)>> std::string, std::shared_ptr<ngraph::Node>>>)>>
NgraphBridge::NG_NODE_MAP = {}; NgraphBridge::NG_NODE_MAP = {{"relu", BuildUnaryNode<ngraph::op::Relu>},
{"tanh", BuildUnaryNode<ngraph::op::Tanh>}};
void NgraphBridge::build_graph(const std::shared_ptr<OperatorBase>& op) { void NgraphBridge::BuildNgNode(const std::shared_ptr<OperatorBase>& op) {
auto& op_type = op->Type(); auto& op_type = op->Type();
NG_NODE_MAP[op_type](op, ngb_node_map); NG_NODE_MAP[op_type](op, ngb_node_map_);
} }
} // namespace framework } // namespace framework
......
...@@ -20,16 +20,14 @@ limitations under the License. */ ...@@ -20,16 +20,14 @@ limitations under the License. */
#include <map> #include <map>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/operator.h" #include "ngraph/node.hpp"
#include "paddle/fluid/platform/enforce.h"
#include "ngraph/ngraph.hpp"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
class OperatorBase;
class NgraphBridge { class NgraphBridge {
public: public:
static std::map< static std::map<
...@@ -43,14 +41,14 @@ class NgraphBridge { ...@@ -43,14 +41,14 @@ class NgraphBridge {
std::shared_ptr< std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>> std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
var_node_map) var_node_map)
: ngb_node_map(var_node_map) {} : ngb_node_map_(var_node_map) {}
void build_graph(const std::shared_ptr<OperatorBase>& op); void BuildNgNode(const std::shared_ptr<OperatorBase>& op);
private: private:
std::shared_ptr< std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>> std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map; ngb_node_map_;
}; };
} // namespace framework } // namespace framework
......
...@@ -19,14 +19,29 @@ limitations under the License. */ ...@@ -19,14 +19,29 @@ limitations under the License. */
#include <map> #include <map>
#include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/ngraph_bridge.h"
#include "paddle/fluid/framework/ngraph_operator.h" #include "paddle/fluid/framework/ngraph_operator.h"
#include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "ngraph/ngraph.hpp"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
static ngraph::Shape Ddim2Shape(const DDim& dims) {
ngraph::Shape sp;
for (int i = 0; i < dims.size(); ++i) {
int k = dims[i];
k = k == 0 ? 1 : k;
sp.push_back(k);
}
return sp;
}
static std::map<proto::VarType::Type, ngraph::element::Type> pd2ng_type_map = { static std::map<proto::VarType::Type, ngraph::element::Type> pd2ng_type_map = {
{proto::VarType::FP32, ngraph::element::f32}, {proto::VarType::FP32, ngraph::element::f32},
{proto::VarType::FP64, ngraph::element::f64}, {proto::VarType::FP64, ngraph::element::f64},
...@@ -42,6 +57,7 @@ typedef enum { /* nGraph support state on ops */ ...@@ -42,6 +57,7 @@ typedef enum { /* nGraph support state on ops */
PARTIAL_TEST /* Support partial list of ops for test */ PARTIAL_TEST /* Support partial list of ops for test */
} op_state; } op_state;
// perform graph build through bridge and execute computation
class NgraphOperator { class NgraphOperator {
public: public:
explicit NgraphOperator(const Scope& scope, const platform::Place& place, explicit NgraphOperator(const Scope& scope, const platform::Place& place,
...@@ -59,13 +75,23 @@ class NgraphOperator { ...@@ -59,13 +75,23 @@ class NgraphOperator {
persistables_(persist), persistables_(persist),
fetches_(fetches), fetches_(fetches),
post_op_inputs_(post_op_inputs), post_op_inputs_(post_op_inputs),
ng_op_state_(ng_op_state) {} ng_op_state_(ng_op_state) {
var_in_node_map_ = std::make_shared<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
var_node_map_ = std::make_shared<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
BuildNgIO();
GetNgFunction();
}
void Run(const Scope& scope, const platform::Place& place) const; void Run(const Scope& scope, const platform::Place& place) const;
private: private:
static std::unordered_map<std::string, std::shared_ptr<ngraph::Function>> static std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
func_cache; func_cache_;
const Scope& scope_; const Scope& scope_;
const platform::Place& place_; const platform::Place& place_;
std::vector<std::shared_ptr<OperatorBase>> fused_ops_; std::vector<std::shared_ptr<OperatorBase>> fused_ops_;
...@@ -74,6 +100,35 @@ class NgraphOperator { ...@@ -74,6 +100,35 @@ class NgraphOperator {
std::unordered_set<std::string> fetches_; std::unordered_set<std::string> fetches_;
std::unordered_set<std::string> post_op_inputs_; std::unordered_set<std::string> post_op_inputs_;
op_state ng_op_state_; op_state ng_op_state_;
// ngraph backend eg. CPU
static std::shared_ptr<ngraph::runtime::Backend> backend_;
// ngraph function to call and execute
std::shared_ptr<ngraph::Function> ngraph_function_;
// var_name of inputs
std::vector<std::string> var_in_;
// var_name of outputs from fetch in order
std::vector<std::string> var_out_;
// map input vars to nodes
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
var_in_node_map_;
// map each var name with a ngraph node
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
var_node_map_;
// cache key to check if function is cached
std::shared_ptr<std::string> GetCacheKey();
// get ngraph input and define ngraph input parameters
void GetNgInputShape(std::shared_ptr<OperatorBase> op);
// Call ngraph bridge to map ops
void BuildNgNodes();
// get the ngraph input and output var list
void BuildNgIO();
// build ngraph function call
void BuildNgFunction();
// Check cache for ngraph function or otherwise build the function
void GetNgFunction();
}; };
std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>> std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
...@@ -86,7 +141,7 @@ FusedOperator::FusedOpIntervals( ...@@ -86,7 +141,7 @@ FusedOperator::FusedOpIntervals(
} }
size_t size = ops->size(); size_t size = ops->size();
size_t left = 0; size_t left = 0;
while (left < size && ops.at(left)->Type() != kFeedOpType) { while (left < size && ops->at(left)->Type() != kFeedOpType) {
++left; ++left;
} }
if (left == size) { if (left == size) {
...@@ -116,7 +171,7 @@ FusedOperator::FusedOpIntervals( ...@@ -116,7 +171,7 @@ FusedOperator::FusedOpIntervals(
size_t start = pivot, end = start; size_t start = pivot, end = start;
while (pivot < right && while (pivot < right &&
(paddle::framework::NgraphBridge::NG_NODE_MAP.find( (paddle::framework::NgraphBridge::NG_NODE_MAP.find(
ops.at(pivot)->Type()) != ops->at(pivot)->Type()) !=
paddle::framework::NgraphBridge::NG_NODE_MAP.end())) { paddle::framework::NgraphBridge::NG_NODE_MAP.end())) {
++pivot; ++pivot;
++end; ++end;
...@@ -136,7 +191,9 @@ FusedOperator::FusedOperator( ...@@ -136,7 +191,9 @@ FusedOperator::FusedOperator(
std::vector<std::unique_ptr<OperatorBase>>::iterator end, std::vector<std::unique_ptr<OperatorBase>>::iterator end,
const std::string& type, const VariableNameMap& inputs, const std::string& type, const VariableNameMap& inputs,
const VariableNameMap& outputs, const AttributeMap& attrs) const VariableNameMap& outputs, const AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs), pdesc(prog), block(block_id) { : OperatorBase(type, inputs, outputs, attrs),
pdesc_(prog),
block_(block_id) {
for (std::vector<std::unique_ptr<OperatorBase>>::iterator it = start; for (std::vector<std::unique_ptr<OperatorBase>>::iterator it = start;
it != end; ++it) { it != end; ++it) {
fused_ops_.push_back(std::move(*it)); fused_ops_.push_back(std::move(*it));
...@@ -152,7 +209,7 @@ FusedOperator::FusedOperator( ...@@ -152,7 +209,7 @@ FusedOperator::FusedOperator(
} }
if ((*(start - 1))->Type() == kFeedOpType && (*end)->Type() == kFetchOpType) { if ((*(start - 1))->Type() == kFeedOpType && (*end)->Type() == kFetchOpType) {
is_complete = true; is_full_ = true;
} }
Process(); Process();
...@@ -205,7 +262,7 @@ void FusedOperator::RunImpl(const Scope& scope, ...@@ -205,7 +262,7 @@ void FusedOperator::RunImpl(const Scope& scope,
} }
} }
if (is_full) { if (is_full_) {
ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN; ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN;
} }
...@@ -215,6 +272,280 @@ void FusedOperator::RunImpl(const Scope& scope, ...@@ -215,6 +272,280 @@ void FusedOperator::RunImpl(const Scope& scope,
ngraph_op.Run(scope, place); ngraph_op.Run(scope, place);
} }
std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
NgraphOperator::func_cache_ = {};
std::shared_ptr<ngraph::runtime::Backend> NgraphOperator::backend_ =
ngraph::runtime::Backend::create("CPU");
void NgraphOperator::GetNgInputShape(std::shared_ptr<OperatorBase> op) {
op->RuntimeInferShape(scope_, place_);
for (auto& var_name_item : op->Inputs()) {
for (auto& var_name : var_name_item.second) {
auto* var = scope_.FindVar(var_name);
if (var && var->IsType<LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto sp = Ddim2Shape(tensor_pd->dims());
if (std::find(var_in_.begin(), var_in_.end(), var_name) !=
var_in_.end()) {
if (var_node_map_->find(var_name) == var_node_map_->end()) {
auto ng_type = var_type_map_.at(var_name);
auto prm =
std::make_shared<ngraph::op::Parameter>(ng_type, sp, true);
(*var_node_map_)[var_name] = prm;
(*var_in_node_map_)[var_name] = prm;
}
}
}
}
}
}
void NgraphOperator::BuildNgNodes() {
for (auto& var_name : var_out_) {
if (var_node_map_->find(var_name) == var_node_map_->end()) {
auto* var = scope_.FindVar(var_name);
if (var && var->IsType<LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto& ddim = tensor_pd->dims();
auto ng_shape = Ddim2Shape(ddim);
auto ng_type = var_type_map_.at(var_name);
auto prm =
std::make_shared<ngraph::op::Parameter>(ng_type, ng_shape, true);
(*var_node_map_)[var_name] = prm;
}
}
}
paddle::framework::NgraphBridge ngb(var_node_map_);
for (auto& op : fused_ops_) {
ngb.BuildNgNode(op);
}
}
void NgraphOperator::BuildNgIO() {
std::unordered_set<std::string> inputs;
std::unordered_set<std::string> outputs;
for (auto& op : fused_ops_) {
for (auto& var_name_item : op->Inputs()) {
for (auto& var_name : var_name_item.second) {
inputs.insert(var_name);
const bool is_output = outputs.find(var_name) != outputs.end();
if (!is_output &&
std::find(var_in_.begin(), var_in_.end(), var_name) ==
var_in_.end()) {
// fill var_in here to keep lhs and rhs order
var_in_.push_back(var_name);
}
}
}
if (op->Type() != "fill_constant") {
GetNgInputShape(op);
}
for (auto& var_name_item : op->Outputs()) {
PADDLE_ENFORCE_LE(var_name_item.second.size(), 1,
"op %s has more than 1 output - Not handling yet",
op->Type());
for (auto& var_name : var_name_item.second) {
outputs.insert(var_name);
}
}
}
// var_out.clear();
for (auto& op : fused_ops_) {
for (auto& var_name_item : op->Outputs()) {
PADDLE_ENFORCE_LE(var_name_item.second.size(), 1,
"op %s has more than 1 output - Not handling yet",
op->Type());
for (auto& var_name : var_name_item.second) {
switch (ng_op_state_) {
case PARTIAL_TEST:
if (post_op_inputs_.find(var_name) != post_op_inputs_.end() ||
fetches_.find(var_name) != fetches_.end()) {
var_out_.push_back(var_name);
}
break;
case FULL_TEST:
if (fetches_.find(var_name) != fetches_.end()) {
var_out_.push_back(var_name);
}
break;
case PARTIAL_TRAIN:
if (fetches_.find(var_name) != fetches_.end() ||
post_op_inputs_.find(var_name) != post_op_inputs_.end() ||
persistables_.find(var_name) != persistables_.end()) {
var_out_.push_back(var_name);
}
break;
case FULL_TRAIN:
if (fetches_.find(var_name) != fetches_.end() ||
persistables_.find(var_name) != persistables_.end()) {
var_out_.push_back(var_name);
}
break;
default:
var_out_.push_back(var_name);
}
}
}
}
}
void NgraphOperator::BuildNgFunction() {
BuildNgNodes();
ngraph_function_ = nullptr;
ngraph::NodeVector func_outputs;
ngraph::op::ParameterVector func_inputs;
for (auto& vo : var_out_) {
func_outputs.push_back(var_node_map_->at(vo));
}
for (auto& vi : var_in_) {
std::shared_ptr<ngraph::op::Parameter> prm =
std::dynamic_pointer_cast<ngraph::op::Parameter>(
var_in_node_map_->at(vi));
func_inputs.push_back(prm);
}
ngraph_function_ =
std::make_shared<ngraph::Function>(func_outputs, func_inputs);
}
std::shared_ptr<std::string> NgraphOperator::GetCacheKey() {
auto cache_key = std::make_shared<std::string>("");
*cache_key += std::to_string(fused_ops_.size());
for (auto& op : fused_ops_) {
*cache_key += op->Type();
}
for (auto& var_name : var_in_) {
auto shape = var_node_map_->at(var_name)->get_shape();
*cache_key += var_name;
*cache_key += var_type_map_.at(var_name).c_type_string();
for (size_t i = 0; i < shape.size(); ++i) {
*cache_key += std::to_string(shape.at(i));
}
}
for (auto& var_name : var_out_) {
auto* var = scope_.FindVar(var_name);
if (var && var->IsType<LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto& ddim = tensor_pd->dims();
for (int i = 0; i < ddim.size(); ++i) {
*cache_key += std::to_string(ddim[i]);
}
}
}
return cache_key;
}
void NgraphOperator::GetNgFunction() {
bool cache_on = true;
if (cache_on) {
std::string cache_key_val = *GetCacheKey();
if (func_cache_.find(cache_key_val) != func_cache_.end()) {
ngraph_function_ = func_cache_.at(cache_key_val);
} else {
BuildNgFunction();
func_cache_[cache_key_val] = ngraph_function_;
}
} else {
BuildNgFunction();
}
}
void NgraphOperator::Run(const Scope& scope,
const platform::Place& place) const {
std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_in;
std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_out;
for (size_t i = 0; i < var_in_.size(); ++i) {
auto vi = var_in_.at(i);
auto sp = var_node_map_->at(vi)->get_shape();
std::shared_ptr<ngraph::runtime::Tensor> ti;
auto* var = scope.FindVar(vi);
if (var && var->IsType<LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()),
"Ensure ngraph tensor layout align with paddle tensor");
if (tensor_pd->type().hash_code() ==
typeid(float).hash_code()) { // NOLINT
const float* arr = tensor_pd->data<float>();
ti = backend_->create_tensor(ngraph::element::f32, sp,
const_cast<float*>(arr));
} else if (tensor_pd->type().hash_code() ==
typeid(int).hash_code()) { // NOLINT
const int* arr = tensor_pd->data<int>();
ti = backend_->create_tensor(ngraph::element::i32, sp,
const_cast<int*>(arr));
} else if (tensor_pd->type().hash_code() == typeid(int64_t).hash_code()) {
const int64_t* arr = tensor_pd->data<int64_t>();
ti = backend_->create_tensor(ngraph::element::i64, sp,
const_cast<int64_t*>(arr));
} else if (tensor_pd->type().hash_code() ==
typeid(double).hash_code()) { // NOLINT
const double* arr = tensor_pd->data<double>();
ti = backend_->create_tensor(ngraph::element::f64, sp,
const_cast<double*>(arr));
} else if (tensor_pd->type().hash_code() ==
typeid(bool).hash_code()) { // NOLINT
const bool* arr = tensor_pd->data<bool>();
ti = backend_->create_tensor(ngraph::element::boolean, sp,
const_cast<bool*>(arr));
} else {
PADDLE_THROW("Data type not handling for var %s", vi);
}
} else {
PADDLE_THROW("Cannot find var or tensor with var name %s", vi);
}
bool is_test = (ng_op_state_ == PARTIAL_TEST || ng_op_state_ == FULL_TEST)
? true
: false;
bool is_persistable =
(persistables_.find(vi) != persistables_.end()) ? true : false;
if (is_test && is_persistable) {
ti->set_stale(false);
}
t_in.push_back(ti);
}
for (size_t i = 0; i < var_out_.size(); ++i) {
auto var_name = var_out_[i];
auto* var = scope.FindVar(var_name);
std::shared_ptr<ngraph::runtime::Tensor> to;
if (var && var->IsType<LoDTensor>()) {
auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
auto dd = tensor_pd->dims();
ngraph::Shape sp = Ddim2Shape(dd);
auto ng_type = var_type_map_.at(var_name);
if (ng_type == ngraph::element::f32) {
auto pd_arr = tensor_pd->mutable_data<float>(place);
to = backend_->create_tensor(ngraph::element::f32, sp, pd_arr);
} else if (ng_type == ngraph::element::i64) {
auto pd_arr = tensor_pd->mutable_data<int64_t>(place);
to = backend_->create_tensor(ngraph::element::i64, sp, pd_arr);
} else if (ng_type == ngraph::element::f64) {
auto pd_arr = tensor_pd->mutable_data<double>(place);
to = backend_->create_tensor(ngraph::element::f64, sp, pd_arr);
} else if (ng_type == ngraph::element::boolean) {
auto pd_arr = tensor_pd->mutable_data<bool>(place);
to = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr);
} else {
PADDLE_THROW("Data type not handled in for var %s", var_name);
}
t_out.push_back(to);
} else {
PADDLE_THROW("Cannot find var or tensor with var name %s", var_name);
}
}
backend_->call(ngraph_function_, t_out, t_in);
} // NgraphOperator::RunImpl
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
#endif #endif
...@@ -17,24 +17,19 @@ limitations under the License. */ ...@@ -17,24 +17,19 @@ limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH #ifdef PADDLE_WITH_NGRAPH
#include <algorithm> #include <algorithm>
#include <atomic>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/ngraph_bridge.h"
#include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_kernel_type.h" #include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/variant.h" #include "paddle/fluid/platform/variant.h"
#include "ngraph/ngraph.hpp" #include "ngraph/type/element_type.hpp"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
......
...@@ -31,12 +31,6 @@ class InferShapeBase { ...@@ -31,12 +31,6 @@ class InferShapeBase {
virtual void operator()(InferShapeContext*) const = 0; virtual void operator()(InferShapeContext*) const = 0;
}; };
class EstimateFlopsBase {
public:
virtual ~EstimateFlopsBase() = default;
virtual size_t operator()(InferShapeContext*) const = 0;
};
struct OpInfo { struct OpInfo {
OpCreator creator_; OpCreator creator_;
GradOpMakerFN grad_op_maker_; GradOpMakerFN grad_op_maker_;
...@@ -44,7 +38,6 @@ struct OpInfo { ...@@ -44,7 +38,6 @@ struct OpInfo {
OpAttrChecker* checker_{nullptr}; OpAttrChecker* checker_{nullptr};
InferVarTypeFN infer_var_type_; InferVarTypeFN infer_var_type_;
InferShapeFN infer_shape_; InferShapeFN infer_shape_;
EstimateFlopsFN estimate_flops_;
bool HasOpProtoAndChecker() const { bool HasOpProtoAndChecker() const {
return proto_ != nullptr && checker_ != nullptr; return proto_ != nullptr && checker_ != nullptr;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_kernel_type.h"
namespace paddle {
namespace framework {
size_t OpKernelType::Hash::operator()(const OpKernelType& key) const {
int cur_loc = 0;
int place = key.place_.which();
cur_loc += OpKernelType::kPlaceBits;
int data_type = static_cast<int>(key.data_type_) << cur_loc;
cur_loc += OpKernelType::kPrimaryDTypeBits;
int data_layout = static_cast<int>(key.data_layout_) << cur_loc;
cur_loc += OpKernelType::kLayoutBits;
int library_type = static_cast<int>(key.library_type_) << cur_loc;
cur_loc += OpKernelType::kLibBits;
int customized_value = key.customized_type_value_;
PADDLE_ENFORCE(customized_value < (1 << OpKernelType::kCustomizeBits));
customized_value = customized_value << cur_loc;
cur_loc += OpKernelType::kCustomizeBits;
PADDLE_ENFORCE(cur_loc < 64);
std::hash<int> hasher;
return hasher(place + data_type + data_layout + library_type +
customized_value);
}
bool OpKernelType::operator==(const OpKernelType& o) const {
return platform::places_are_same_class(place_, o.place_) &&
data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&
library_type_ == o.library_type_ &&
customized_type_value_ == o.customized_type_value_;
}
} // namespace framework
} // namespace paddle
...@@ -24,54 +24,55 @@ limitations under the License. */ ...@@ -24,54 +24,55 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
struct OpKernelType { class OpKernelType {
struct Hash { public:
size_t operator()(const OpKernelType& key) const { constexpr static int kDefaultCustomizedTypeValue = 0;
int place = key.place_.which();
int data_type = static_cast<int>(key.data_type_) << LEFT_SHIFT;
int data_layout = static_cast<int>(key.data_layout_) << (LEFT_SHIFT * 2);
int library_type = static_cast<int>(key.library_type_)
<< (LEFT_SHIFT * 3);
std::hash<int> hasher;
return hasher(place + data_type + data_layout + library_type);
}
};
// place, data_type, library_type kinds less than 2^8 // In total should be smaller than 64.
constexpr static int LEFT_SHIFT = 8; constexpr static int kPlaceBits = 4;
constexpr static int kPrimaryDTypeBits = 8;
proto::VarType::Type data_type_; constexpr static int kLayoutBits = 4;
DataLayout data_layout_; constexpr static int kLibBits = 4;
platform::Place place_; constexpr static int kCustomizeBits = 4;
LibraryType library_type_;
OpKernelType(proto::VarType::Type data_type, platform::Place place, OpKernelType(proto::VarType::Type data_type, platform::Place place,
DataLayout data_layout = DataLayout::kAnyLayout, DataLayout data_layout = DataLayout::kAnyLayout,
LibraryType library_type = LibraryType::kPlain) LibraryType library_type = LibraryType::kPlain,
int customized_type_value = kDefaultCustomizedTypeValue)
: data_type_(data_type), : data_type_(data_type),
data_layout_(data_layout), data_layout_(data_layout),
place_(place), place_(place),
library_type_(library_type) {} library_type_(library_type),
customized_type_value_(customized_type_value) {}
OpKernelType(proto::VarType::Type data_type, OpKernelType(proto::VarType::Type data_type,
const platform::DeviceContext& dev_ctx, const platform::DeviceContext& dev_ctx,
DataLayout data_layout = DataLayout::kAnyLayout, DataLayout data_layout = DataLayout::kAnyLayout,
LibraryType library_type = LibraryType::kPlain) LibraryType library_type = LibraryType::kPlain,
int customized_type_value = kDefaultCustomizedTypeValue)
: data_type_(data_type), : data_type_(data_type),
data_layout_(data_layout), data_layout_(data_layout),
place_(dev_ctx.GetPlace()), place_(dev_ctx.GetPlace()),
library_type_(library_type) {} library_type_(library_type),
customized_type_value_(customized_type_value) {}
virtual ~OpKernelType() {}
struct Hash {
size_t operator()(const OpKernelType& key) const;
};
size_t hash_key() const { return Hash()(*this); } size_t hash_key() const { return Hash()(*this); }
bool operator==(const OpKernelType& o) const { bool operator==(const OpKernelType& o) const;
return platform::places_are_same_class(place_, o.place_) &&
data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&
library_type_ == o.library_type_;
}
bool operator!=(const OpKernelType& o) const { return !(*this == o); } bool operator!=(const OpKernelType& o) const { return !(*this == o); }
proto::VarType::Type data_type_;
DataLayout data_layout_;
platform::Place place_;
LibraryType library_type_;
int customized_type_value_;
}; };
inline std::ostream& operator<<(std::ostream& os, inline std::ostream& operator<<(std::ostream& os,
......
...@@ -35,6 +35,7 @@ limitations under the License. */ ...@@ -35,6 +35,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
class Registrar { class Registrar {
public: public:
// In our design, various kinds of classes, e.g., operators and kernels, // In our design, various kinds of classes, e.g., operators and kernels,
...@@ -78,7 +79,7 @@ struct OpKernelRegistrarFunctor; ...@@ -78,7 +79,7 @@ struct OpKernelRegistrarFunctor;
template <typename PlaceType, typename T, typename Func> template <typename PlaceType, typename T, typename Func>
inline void RegisterKernelClass(const char* op_type, const char* library_type, inline void RegisterKernelClass(const char* op_type, const char* library_type,
Func func) { int customized_type_value, Func func) {
std::string library(library_type); std::string library(library_type);
std::string data_layout = "ANYLAYOUT"; std::string data_layout = "ANYLAYOUT";
if (library == "MKLDNN") { if (library == "MKLDNN") {
...@@ -86,7 +87,7 @@ inline void RegisterKernelClass(const char* op_type, const char* library_type, ...@@ -86,7 +87,7 @@ inline void RegisterKernelClass(const char* op_type, const char* library_type,
} }
OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(), OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(),
StringToDataLayout(data_layout), StringToDataLayout(data_layout),
StringToLibraryType(library_type)); StringToLibraryType(library_type), customized_type_value);
OperatorWithKernel::AllOpKernels()[op_type][key] = func; OperatorWithKernel::AllOpKernels()[op_type][key] = func;
} }
...@@ -95,22 +96,26 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> { ...@@ -95,22 +96,26 @@ struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
using KERNEL_TYPE = using KERNEL_TYPE =
typename std::tuple_element<I, std::tuple<KernelTypes...>>::type; typename std::tuple_element<I, std::tuple<KernelTypes...>>::type;
void operator()(const char* op_type, const char* library_type) const { void operator()(const char* op_type, const char* library_type,
int customized_type_value) const {
using T = typename KERNEL_TYPE::ELEMENT_TYPE; using T = typename KERNEL_TYPE::ELEMENT_TYPE;
RegisterKernelClass<PlaceType, T>( RegisterKernelClass<PlaceType, T>(
op_type, library_type, [](const framework::ExecutionContext& ctx) { op_type, library_type, customized_type_value,
[](const framework::ExecutionContext& ctx) {
KERNEL_TYPE().Compute(ctx); KERNEL_TYPE().Compute(ctx);
}); });
constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value; constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
OpKernelRegistrarFunctor<PlaceType, I + 1 == size, I + 1, KernelTypes...> OpKernelRegistrarFunctor<PlaceType, I + 1 == size, I + 1, KernelTypes...>
func; func;
func(op_type, library_type); func(op_type, library_type, customized_type_value);
} }
}; };
template <typename PlaceType, size_t I, typename... KernelType> template <typename PlaceType, size_t I, typename... KernelType>
struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> { struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> {
void operator()(const char* op_type, const char* library_type) const {} void operator()(const char* op_type, const char* library_type,
int customized_type_value) const {}
}; };
// User can register many kernel in one place. The data type could be // User can register many kernel in one place. The data type could be
...@@ -118,9 +123,10 @@ struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> { ...@@ -118,9 +123,10 @@ struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> {
template <typename PlaceType, typename... KernelType> template <typename PlaceType, typename... KernelType>
class OpKernelRegistrar : public Registrar { class OpKernelRegistrar : public Registrar {
public: public:
explicit OpKernelRegistrar(const char* op_type, const char* library_type) { explicit OpKernelRegistrar(const char* op_type, const char* library_type,
int customized_type_value) {
OpKernelRegistrarFunctor<PlaceType, false, 0, KernelType...> func; OpKernelRegistrarFunctor<PlaceType, false, 0, KernelType...> func;
func(op_type, library_type); func(op_type, library_type, customized_type_value);
} }
}; };
...@@ -130,17 +136,19 @@ struct OpKernelRegistrarFunctorEx; ...@@ -130,17 +136,19 @@ struct OpKernelRegistrarFunctorEx;
template <typename PlaceType, typename... DataTypeAndKernelType> template <typename PlaceType, typename... DataTypeAndKernelType>
class OpKernelRegistrarEx : public Registrar { class OpKernelRegistrarEx : public Registrar {
public: public:
explicit OpKernelRegistrarEx(const char* op_type, const char* library_type) { explicit OpKernelRegistrarEx(const char* op_type, const char* library_type,
int customized_type_value) {
OpKernelRegistrarFunctorEx<PlaceType, false, 0, DataTypeAndKernelType...> OpKernelRegistrarFunctorEx<PlaceType, false, 0, DataTypeAndKernelType...>
func; func;
func(op_type, library_type); func(op_type, library_type, customized_type_value);
} }
}; };
template <typename PlaceType, size_t I, typename... DataTypeAndKernelType> template <typename PlaceType, size_t I, typename... DataTypeAndKernelType>
struct OpKernelRegistrarFunctorEx<PlaceType, true, I, struct OpKernelRegistrarFunctorEx<PlaceType, true, I,
DataTypeAndKernelType...> { DataTypeAndKernelType...> {
void operator()(const char* op_type, const char* library_type) const {} void operator()(const char* op_type, const char* library_type,
int customized_type_value) const {}
}; };
template <typename PlaceType, size_t I, typename... DataTypeAndKernelType> template <typename PlaceType, size_t I, typename... DataTypeAndKernelType>
...@@ -153,18 +161,21 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I, ...@@ -153,18 +161,21 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
typename std::tuple_element<I, typename std::tuple_element<I,
std::tuple<DataTypeAndKernelType...>>::type; std::tuple<DataTypeAndKernelType...>>::type;
void operator()(const char* op_type, const char* library_type) const { void operator()(const char* op_type, const char* library_type,
RegisterKernelClass<PlaceType, T>(op_type, library_type, Functor()); int customized_type_value) const {
RegisterKernelClass<PlaceType, T>(op_type, library_type,
customized_type_value, Functor());
constexpr auto size = constexpr auto size =
std::tuple_size<std::tuple<DataTypeAndKernelType...>>::value; std::tuple_size<std::tuple<DataTypeAndKernelType...>>::value;
OpKernelRegistrarFunctorEx<PlaceType, I + 2 >= size, I + 2, OpKernelRegistrarFunctorEx<PlaceType, I + 2 >= size, I + 2,
DataTypeAndKernelType...> DataTypeAndKernelType...>
func; func;
func(op_type, library_type); func(op_type, library_type, customized_type_value);
} }
}; };
// clang-format off
/** /**
* check if MACRO is used in GLOBAL NAMESPACE. * check if MACRO is used in GLOBAL NAMESPACE.
*/ */
...@@ -199,42 +210,64 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I, ...@@ -199,42 +210,64 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
/** /**
* Macro to register OperatorKernel. * Macro to register OperatorKernel.
*/ */
#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...) \ #define REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(op_type, library_type, \
STATIC_ASSERT_GLOBAL_NAMESPACE( \ place_class, customized_name, \
__reg_op_kernel_##op_type##_##library_type##__, \ customized_type_value, ...) \
"REGISTER_OP_KERNEL must be called in global namespace"); \ STATIC_ASSERT_GLOBAL_NAMESPACE( \
static ::paddle::framework::OpKernelRegistrar<place_class, __VA_ARGS__> \ __reg_op_kernel_##op_type##_##library_type##_##customized_name##__, \
__op_kernel_registrar_##op_type##_##library_type##__(#op_type, \ "REGISTER_OP_KERNEL must be called in " \
#library_type); \ "global namespace"); \
int TouchOpKernelRegistrar_##op_type##_##library_type() { \ static ::paddle::framework::OpKernelRegistrar<place_class, \
__op_kernel_registrar_##op_type##_##library_type##__.Touch(); \ __VA_ARGS__> \
return 0; \ __op_kernel_registrar_##op_type##_##library_type##_##customized_name##__(\
#op_type, #library_type, customized_type_value); \
int TouchOpKernelRegistrar_##op_type##_##library_type##_##customized_name() {\
__op_kernel_registrar_##op_type##_##library_type##_##customized_name##__ \
.Touch(); \
return 0; \
} }
#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...) \
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( \
op_type, library_type, place_class, DEFAULT_TYPE, \
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
#define REGISTER_OP_CUDA_KERNEL(op_type, ...) \ #define REGISTER_OP_CUDA_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__) REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__)
#define REGISTER_OP_CPU_KERNEL(op_type, ...) \ #define REGISTER_OP_CPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__) REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, ...) \ #define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, \
STATIC_ASSERT_GLOBAL_NAMESPACE( \ customized_name, \
__reg_op_kernel_##op_type##_##library_type##__, \ customized_type_value, \
"REGISTER_OP_KERNEL_EX must be called in global namespace"); \ ...) \
static ::paddle::framework::OpKernelRegistrarEx<place_class, __VA_ARGS__> \ STATIC_ASSERT_GLOBAL_NAMESPACE( \
__op_kernel_registrar_##op_type##_##library_type##__(#op_type, \ __reg_op_kernel_##op_type##_##library_type##_##customized_name##__, \
#library_type); \ "REGISTER_OP_KERNEL_EX must be called in " \
int TouchOpKernelRegistrar_##op_type##_##library_type() { \ "global namespace"); \
__op_kernel_registrar_##op_type##_##library_type##__.Touch(); \ static ::paddle::framework::OpKernelRegistrarEx<place_class, \
return 0; \ __VA_ARGS__> \
__op_kernel_registrar_##op_type##_##library_type##_##customized_name##__(\
#op_type, #library_type, customized_type_value); \
int TouchOpKernelRegistrar_##op_type##_##library_type##_##customized_name() {\
__op_kernel_registrar_##op_type##_##library_type##_##customized_name##__ \
.Touch(); \
return 0; \
} }
#define REGISTER_OP_CUDA_KERNEL_FUNCTOR(op_type, ...) \ #define REGISTER_OP_CUDA_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX(op_type, CUDA, ::paddle::platform::CUDAPlace, \ REGISTER_OP_KERNEL_EX( \
__VA_ARGS__) op_type, CUDA, ::paddle::platform::CUDAPlace, DEFAULT_TYPE, \
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
#define REGISTER_OP_CPU_KERNEL_FUNCTOR(op_type, ...) \ #define REGISTER_OP_CPU_KERNEL_FUNCTOR(op_type, ...) \
REGISTER_OP_KERNEL_EX(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__) REGISTER_OP_KERNEL_EX( \
op_type, CPU, ::paddle::platform::CPUPlace, DEFAULT_TYPE, \
::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
__VA_ARGS__)
/** /**
* Macro to mark what Operator and Kernel * Macro to mark what Operator and Kernel
...@@ -248,13 +281,19 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I, ...@@ -248,13 +281,19 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
extern int TouchOpRegistrar_##op_type(); \ extern int TouchOpRegistrar_##op_type(); \
UNUSED static int use_op_itself_##op_type##_ = TouchOpRegistrar_##op_type() UNUSED static int use_op_itself_##op_type##_ = TouchOpRegistrar_##op_type()
#define USE_OP_DEVICE_KERNEL(op_type, LIBRARY_TYPE) \ #define USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(op_type, \
STATIC_ASSERT_GLOBAL_NAMESPACE( \ LIBRARY_TYPE, \
__use_op_kernel_##op_type##_##LIBRARY_TYPE##__, \ customized_name) \
"USE_OP_DEVICE_KERNEL must be in global namespace"); \ STATIC_ASSERT_GLOBAL_NAMESPACE( \
extern int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE(); \ __use_op_kernel_##op_type##_##LIBRARY_TYPE##_##customized_name##__, \
UNUSED static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_ = \ "USE_OP_DEVICE_KERNEL must be in global namespace"); \
TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE() extern int \
TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE##_##customized_name(); \
UNUSED static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_##DEFAULT_TYPE##_ = /* NOLINT */ \
TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE##_##customized_name()
#define USE_OP_DEVICE_KERNEL(op_type, LIBRARY_TYPE) \
USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(op_type, LIBRARY_TYPE, DEFAULT_TYPE)
// TODO(fengjiayi): The following macros // TODO(fengjiayi): The following macros
// seems ugly, do we have better method? // seems ugly, do we have better method?
...@@ -280,6 +319,7 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I, ...@@ -280,6 +319,7 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
#define USE_OP(op_type) \ #define USE_OP(op_type) \
USE_OP_ITSELF(op_type); \ USE_OP_ITSELF(op_type); \
USE_OP_KERNEL(op_type) USE_OP_KERNEL(op_type)
// clang-format off
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -695,6 +695,12 @@ static void CheckTensorNANOrInf(const std::string& name, ...@@ -695,6 +695,12 @@ static void CheckTensorNANOrInf(const std::string& name,
"Tensor %s contains NAN", name); "Tensor %s contains NAN", name);
} }
void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
const platform::Place& place) const {
RuntimeInferShapeContext infer_shape_ctx(*this, scope);
this->InferShape(&infer_shape_ctx);
}
void OperatorWithKernel::RunImpl(const Scope& scope, void OperatorWithKernel::RunImpl(const Scope& scope,
const platform::Place& place) const { const platform::Place& place) const {
RuntimeInferShapeContext infer_shape_ctx(*this, scope); RuntimeInferShapeContext infer_shape_ctx(*this, scope);
......
...@@ -128,6 +128,8 @@ class OperatorBase { ...@@ -128,6 +128,8 @@ class OperatorBase {
virtual std::vector<std::string> OutputVars(bool has_intermediate) const; virtual std::vector<std::string> OutputVars(bool has_intermediate) const;
void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; } void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; }
virtual void RuntimeInferShape(const Scope& scope,
const platform::Place& place) const {}
protected: protected:
std::string type_; std::string type_;
...@@ -348,6 +350,9 @@ class OperatorWithKernel : public OperatorBase { ...@@ -348,6 +350,9 @@ class OperatorWithKernel : public OperatorBase {
OpInfoMap::Instance().Get(Type()).infer_shape_(ctx); OpInfoMap::Instance().Get(Type()).infer_shape_(ctx);
} }
void RuntimeInferShape(const Scope& scope,
const platform::Place& place) const override;
protected: protected:
virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const; virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const;
virtual OpKernelType GetKernelTypeForVar( virtual OpKernelType GetKernelTypeForVar(
......
...@@ -50,6 +50,8 @@ class OpWithoutKernelCheckerMaker : public OpProtoAndCheckerMaker { ...@@ -50,6 +50,8 @@ class OpWithoutKernelCheckerMaker : public OpProtoAndCheckerMaker {
AddInput("input", "input of test op"); AddInput("input", "input of test op");
AddOutput("output", "output of test op"); AddOutput("output", "output of test op");
AddAttr<float>("scale", "scale of cosine op"); AddAttr<float>("scale", "scale of cosine op");
AddAttr<int>("kernel_sub_type", "kernels with different implementations.")
.SetDefault(0);
AddComment("This is test op"); AddComment("This is test op");
} }
}; };
...@@ -95,6 +97,8 @@ TEST(OperatorBase, all) { ...@@ -95,6 +97,8 @@ TEST(OperatorBase, all) {
namespace paddle { namespace paddle {
namespace framework { namespace framework {
static int special_type_value = 1;
class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
public: public:
void Make() { void Make() {
...@@ -103,11 +107,14 @@ class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { ...@@ -103,11 +107,14 @@ class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
AddAttr<float>("scale", "scale of cosine op") AddAttr<float>("scale", "scale of cosine op")
.SetDefault(1.0) .SetDefault(1.0)
.GreaterThan(0.0); .GreaterThan(0.0);
AddAttr<int>("kernel_sub_type", "kernels with different implementations.")
.SetDefault(0);
AddComment("This is test op"); AddComment("This is test op");
} }
}; };
static int cpu_kernel_run_num = 0; static int cpu_kernel_run_num = 0;
static int cpu_kernel2_run_num = 0;
class OpWithKernelTest : public OperatorWithKernel { class OpWithKernelTest : public OperatorWithKernel {
public: public:
...@@ -117,7 +124,10 @@ class OpWithKernelTest : public OperatorWithKernel { ...@@ -117,7 +124,10 @@ class OpWithKernelTest : public OperatorWithKernel {
void InferShape(framework::InferShapeContext* ctx) const override {} void InferShape(framework::InferShapeContext* ctx) const override {}
OpKernelType GetExpectedKernelType( OpKernelType GetExpectedKernelType(
const ExecutionContext& ctx) const override { const ExecutionContext& ctx) const override {
return OpKernelType(proto::VarType::FP32, ctx.GetPlace()); int sub_type = ctx.Attr<int>("kernel_sub_type");
return OpKernelType(proto::VarType::FP32, ctx.GetPlace(),
framework::DataLayout::kAnyLayout,
framework::LibraryType::kPlain, sub_type);
} }
}; };
...@@ -132,6 +142,17 @@ class CPUKernelTest : public OpKernel<float> { ...@@ -132,6 +142,17 @@ class CPUKernelTest : public OpKernel<float> {
} }
}; };
template <typename T1, typename T2>
class CPUKernel2Test : public OpKernel<float> {
public:
void Compute(const ExecutionContext& ctx) const {
std::cout << ctx.op().DebugString() << std::endl;
cpu_kernel2_run_num++;
ASSERT_EQ(ctx.op().Input("x"), "IN1");
ASSERT_EQ(ctx.op().Output("y"), "OUT1");
}
};
class OpKernelTestMultiInputsProtoAndCheckerMaker class OpKernelTestMultiInputsProtoAndCheckerMaker
: public OpProtoAndCheckerMaker { : public OpProtoAndCheckerMaker {
public: public:
...@@ -142,6 +163,8 @@ class OpKernelTestMultiInputsProtoAndCheckerMaker ...@@ -142,6 +163,8 @@ class OpKernelTestMultiInputsProtoAndCheckerMaker
AddAttr<float>("scale", "scale of cosine op") AddAttr<float>("scale", "scale of cosine op")
.SetDefault(1.0) .SetDefault(1.0)
.GreaterThan(0.0); .GreaterThan(0.0);
AddAttr<int>("kernel_sub_type", "kernels with different implementations.")
.SetDefault(0);
AddComment("This is test op"); AddComment("This is test op");
} }
}; };
...@@ -189,9 +212,15 @@ class CPUKernalMultiInputsTest : public OpKernel<float> { ...@@ -189,9 +212,15 @@ class CPUKernalMultiInputsTest : public OpKernel<float> {
REGISTER_OP_WITHOUT_GRADIENT( REGISTER_OP_WITHOUT_GRADIENT(
op_with_kernel, paddle::framework::OpWithKernelTest, op_with_kernel, paddle::framework::OpWithKernelTest,
paddle::framework::OpKernelTestProtoAndCheckerMaker); paddle::framework::OpKernelTestProtoAndCheckerMaker);
REGISTER_OP_CPU_KERNEL(op_with_kernel, REGISTER_OP_CPU_KERNEL(op_with_kernel,
paddle::framework::CPUKernelTest<float, float>); paddle::framework::CPUKernelTest<float, float>);
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(
op_with_kernel, CPU, paddle::platform::CPUPlace, MY_SPECIAL_NAME,
paddle::framework::special_type_value,
paddle::framework::CPUKernel2Test<float, float>);
// test with single input // test with single input
TEST(OpKernel, all) { TEST(OpKernel, all) {
paddle::framework::InitDevices(true); paddle::framework::InitDevices(true);
...@@ -211,7 +240,19 @@ TEST(OpKernel, all) { ...@@ -211,7 +240,19 @@ TEST(OpKernel, all) {
auto op = paddle::framework::OpRegistry::CreateOp(op_desc); auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0); ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0);
op->Run(scope, cpu_place); op->Run(scope, cpu_place);
// kerne_sub_type = 0, hence cpu_kernel is called, cpu_kernel2 is not called.
ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1);
ASSERT_EQ(paddle::framework::cpu_kernel2_run_num, 0);
attr = op_desc.mutable_attrs()->Add();
attr->set_name("kernel_sub_type");
attr->set_type(paddle::framework::proto::AttrType::INT);
attr->set_i(1);
auto op2 = paddle::framework::OpRegistry::CreateOp(op_desc);
op2->Run(scope, cpu_place);
// kerne_sub_type = 1, hence cpu_kernel2 is called, cpu_kernel is not called.
ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1); ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1);
ASSERT_EQ(paddle::framework::cpu_kernel2_run_num, 1);
} }
REGISTER_OP_WITHOUT_GRADIENT( REGISTER_OP_WITHOUT_GRADIENT(
......
...@@ -32,8 +32,7 @@ namespace framework { ...@@ -32,8 +32,7 @@ namespace framework {
class SelectedRows { class SelectedRows {
/* /*
* @brief We can use the SelectedRows structure to reproduce a sparse table. * @brief We can use the SelectedRows structure to reproduce a sparse table.
* A sparse table is a key-value structure that the key is an `int64_t` * A sparse table is a key-value structure that the key is an `int64_t`,
* number,
* and the value is a Tensor which the first dimension is 0. * and the value is a Tensor which the first dimension is 0.
* You can use the following interface to operate the sparse table, and you * You can use the following interface to operate the sparse table, and you
* can find * can find
......
...@@ -54,7 +54,5 @@ using InferVarTypeFN = ...@@ -54,7 +54,5 @@ using InferVarTypeFN =
using InferShapeFN = std::function<void(InferShapeContext*)>; using InferShapeFN = std::function<void(InferShapeContext*)>;
using EstimateFlopsFN = std::function<void(InferShapeContext*)>;
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -103,6 +103,7 @@ struct Argument { ...@@ -103,6 +103,7 @@ struct Argument {
// Model specified with program and parameters files. // Model specified with program and parameters files.
DECL_ARGUMENT_FIELD(model_program_path, ModelProgramPath, std::string); DECL_ARGUMENT_FIELD(model_program_path, ModelProgramPath, std::string);
DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string); DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string);
DECL_ARGUMENT_FIELD(model_from_memory, ModelFromMemory, bool);
// The overall graph to work on. // The overall graph to work on.
DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph); DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph);
......
...@@ -178,11 +178,12 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, ...@@ -178,11 +178,12 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
output_mapping.push_back(output_name_map[name]); output_mapping.push_back(output_name_map[name]);
} }
*block_desc.Proto()->mutable_vars() = auto *vars = block_desc.Proto()->mutable_vars();
const_cast<framework::ProgramDesc *>(&graph->program()) for (framework::ir::Node *node : graph->Nodes()) {
->Proto() if (node->IsVar() && node->Var()) {
->blocks(0) *vars->Add() = *node->Var()->Proto();
.vars(); }
}
PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(), PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
"the block has no var-desc"); "the block has no var-desc");
PADDLE_ENFORCE(!output_mapping.empty()); PADDLE_ENFORCE(!output_mapping.empty());
......
...@@ -46,7 +46,7 @@ void IrGraphBuildPass::RunImpl(Argument *argument) { ...@@ -46,7 +46,7 @@ void IrGraphBuildPass::RunImpl(Argument *argument) {
argument->model_params_path_valid()) { argument->model_params_path_valid()) {
auto program = auto program =
LoadModel(argument->model_program_path(), argument->model_params_path(), LoadModel(argument->model_program_path(), argument->model_params_path(),
argument->scope_ptr(), place); argument->scope_ptr(), place, argument->model_from_memory());
argument->SetMainProgram(program.release()); argument->SetMainProgram(program.release());
} else { } else {
PADDLE_THROW( PADDLE_THROW(
...@@ -68,9 +68,14 @@ std::unique_ptr<framework::ProgramDesc> IrGraphBuildPass::LoadModel( ...@@ -68,9 +68,14 @@ std::unique_ptr<framework::ProgramDesc> IrGraphBuildPass::LoadModel(
std::unique_ptr<framework::ProgramDesc> IrGraphBuildPass::LoadModel( std::unique_ptr<framework::ProgramDesc> IrGraphBuildPass::LoadModel(
const std::string &program_path, const std::string &params_path, const std::string &program_path, const std::string &params_path,
framework::Scope *scope, const platform::Place &place) { framework::Scope *scope, const platform::Place &place,
bool model_from_memory) {
framework::Executor exe(place); framework::Executor exe(place);
return Load(&exe, scope, program_path, params_path); if (!model_from_memory) {
return Load(&exe, scope, program_path, params_path);
} else {
return LoadFromMemory(&exe, scope, program_path, params_path);
}
} }
std::string IrGraphBuildPass::repr() const { return "ir-graph-build-pass"; } std::string IrGraphBuildPass::repr() const { return "ir-graph-build-pass"; }
......
...@@ -24,7 +24,7 @@ namespace inference { ...@@ -24,7 +24,7 @@ namespace inference {
namespace analysis { namespace analysis {
/* /*
* Load program and parameter to memory from the disk. * Load program and parameter to memory from the disk or directly from memory.
*/ */
class IrGraphBuildPass : public AnalysisPass { class IrGraphBuildPass : public AnalysisPass {
public: public:
...@@ -38,7 +38,8 @@ class IrGraphBuildPass : public AnalysisPass { ...@@ -38,7 +38,8 @@ class IrGraphBuildPass : public AnalysisPass {
const platform::Place &place); const platform::Place &place);
std::unique_ptr<framework::ProgramDesc> LoadModel( std::unique_ptr<framework::ProgramDesc> LoadModel(
const std::string &program_path, const std::string &params_path, const std::string &program_path, const std::string &params_path,
framework::Scope *scope, const platform::Place &place); framework::Scope *scope, const platform::Place &place,
bool model_from_memory);
std::string model_binary_str_; std::string model_binary_str_;
}; };
......
...@@ -53,6 +53,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { ...@@ -53,6 +53,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
use_tensorrt_ = other.use_tensorrt_; use_tensorrt_ = other.use_tensorrt_;
tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_;
tensorrt_workspace_size_ = other.tensorrt_workspace_size_; tensorrt_workspace_size_ = other.tensorrt_workspace_size_;
model_from_memory_ = other.model_from_memory_;
if (use_gpu) { if (use_gpu) {
pass_builder_.reset(new GpuPassStrategy( pass_builder_.reset(new GpuPassStrategy(
...@@ -80,6 +81,8 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) { ...@@ -80,6 +81,8 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) {
use_tensorrt_ = other.use_tensorrt_; use_tensorrt_ = other.use_tensorrt_;
tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_;
tensorrt_workspace_size_ = other.tensorrt_workspace_size_; tensorrt_workspace_size_ = other.tensorrt_workspace_size_;
model_from_memory_ = other.model_from_memory_;
pass_builder_ = std::move(other.pass_builder_); pass_builder_ = std::move(other.pass_builder_);
} }
...@@ -102,4 +105,13 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, ...@@ -102,4 +105,13 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
pass_builder()->InsertPass(1, "tensorrt_subgraph_pass"); pass_builder()->InsertPass(1, "tensorrt_subgraph_pass");
} }
void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer,
size_t prog_buffer_size,
const char *param_buffer,
size_t param_buffer_size) {
prog_file = std::string(prog_buffer, prog_buffer + prog_buffer_size);
param_file = std::string(param_buffer, param_buffer + param_buffer_size);
model_from_memory_ = true;
}
} // namespace paddle } // namespace paddle
...@@ -308,6 +308,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { ...@@ -308,6 +308,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
argument_.SetUseGPU(config_.use_gpu); argument_.SetUseGPU(config_.use_gpu);
argument_.SetGPUDeviceId(config_.device); argument_.SetGPUDeviceId(config_.device);
argument_.SetModelFromMemory(config_.model_from_memory_);
// Analyze inference_program // Analyze inference_program
if (!config_.model_dir.empty()) { if (!config_.model_dir.empty()) {
argument_.SetModelDir(config_.model_dir); argument_.SetModelDir(config_.model_dir);
...@@ -448,20 +449,24 @@ bool AnalysisPredictor::LoadProgramDesc() { ...@@ -448,20 +449,24 @@ bool AnalysisPredictor::LoadProgramDesc() {
return false; return false;
} }
std::string pb_content;
// Read binary
std::ifstream fin(filename, std::ios::in | std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
fin.seekg(0, std::ios::end);
pb_content.resize(fin.tellg());
fin.seekg(0, std::ios::beg);
fin.read(&(pb_content.at(0)), pb_content.size());
fin.close();
// Create ProgramDesc // Create ProgramDesc
framework::proto::ProgramDesc proto; framework::proto::ProgramDesc proto;
proto.ParseFromString(pb_content); if (!config_.model_from_memory()) {
std::string pb_content;
// Read binary
std::ifstream fin(filename, std::ios::in | std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fin.is_open()), "Cannot open file %s",
filename);
fin.seekg(0, std::ios::end);
pb_content.resize(fin.tellg());
fin.seekg(0, std::ios::beg);
fin.read(&(pb_content.at(0)), pb_content.size());
fin.close();
proto.ParseFromString(pb_content);
} else {
proto.ParseFromString(config_.prog_file);
}
inference_program_.reset(new framework::ProgramDesc(proto)); inference_program_.reset(new framework::ProgramDesc(proto));
return true; return true;
} }
...@@ -469,6 +474,7 @@ bool AnalysisPredictor::LoadProgramDesc() { ...@@ -469,6 +474,7 @@ bool AnalysisPredictor::LoadProgramDesc() {
bool AnalysisPredictor::LoadParameters() { bool AnalysisPredictor::LoadParameters() {
PADDLE_ENFORCE_NOT_NULL(inference_program_.get(), PADDLE_ENFORCE_NOT_NULL(inference_program_.get(),
"The inference program should be loaded first."); "The inference program should be loaded first.");
const auto &global_block = inference_program_->MutableBlock(0); const auto &global_block = inference_program_->MutableBlock(0);
// create a temporary program to load parameters. // create a temporary program to load parameters.
......
...@@ -79,6 +79,16 @@ link_directories("${PADDLE_LIB}/third_party/install/gflags/lib") ...@@ -79,6 +79,16 @@ link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib") link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib")
link_directories("${PADDLE_LIB}/paddle/lib") link_directories("${PADDLE_LIB}/paddle/lib")
if (NOT WIN32)
set(NGRAPH_PATH "${PADDLE_LIB}/third_party/install/ngraph")
if(EXISTS ${NGRAPH_PATH})
include(GNUInstallDirs)
include_directories("${NGRAPH_PATH}/include")
link_directories("${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}")
set(NGRAPH_LIB ${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}/libngraph${CMAKE_SHARED_LIBRARY_SUFFIX})
endif()
endif()
add_executable(${DEMO_NAME} ${DEMO_NAME}.cc) add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
if(WITH_MKL) if(WITH_MKL)
...@@ -106,7 +116,7 @@ endif() ...@@ -106,7 +116,7 @@ endif()
if (NOT WIN32) if (NOT WIN32)
set(EXTERNAL_LIB "-lrt -ldl -lpthread") set(EXTERNAL_LIB "-lrt -ldl -lpthread")
set(DEPS ${DEPS} set(DEPS ${DEPS}
${MATH_LIB} ${MKLDNN_LIB} ${MATH_LIB} ${MKLDNN_LIB} ${NGRAPH_LIB}
glog gflags protobuf snappystream snappy z xxhash glog gflags protobuf snappystream snappy z xxhash
${EXTERNAL_LIB}) ${EXTERNAL_LIB})
else() else()
......
...@@ -52,10 +52,13 @@ struct AnalysisConfig : public NativeConfig { ...@@ -52,10 +52,13 @@ struct AnalysisConfig : public NativeConfig {
bool use_tensorrt() const { return use_tensorrt_; } bool use_tensorrt() const { return use_tensorrt_; }
void EnableMKLDNN(); void EnableMKLDNN();
// NOTE this is just for internal development, please not use it.
// NOT stable yet.
bool use_mkldnn() const { return use_mkldnn_; } bool use_mkldnn() const { return use_mkldnn_; }
// Specify the memory buffer of program and parameter
void SetModelBuffer(const char* prog_buffer, size_t prog_buffer_size,
const char* program_buffer, size_t program_buffer_size);
bool model_from_memory() const { return model_from_memory_; }
friend class ::paddle::AnalysisPredictor; friend class ::paddle::AnalysisPredictor;
protected: protected:
...@@ -64,6 +67,7 @@ struct AnalysisConfig : public NativeConfig { ...@@ -64,6 +67,7 @@ struct AnalysisConfig : public NativeConfig {
int tensorrt_workspace_size_; int tensorrt_workspace_size_;
int tensorrt_max_batchsize_; int tensorrt_max_batchsize_;
std::unique_ptr<PassStrategy> pass_builder_; std::unique_ptr<PassStrategy> pass_builder_;
bool model_from_memory_{false};
}; };
// Configurations for Anakin engine. // Configurations for Anakin engine.
......
...@@ -69,7 +69,8 @@ bool IsPersistable(const framework::VarDesc* var) { ...@@ -69,7 +69,8 @@ bool IsPersistable(const framework::VarDesc* var) {
void LoadPersistables(framework::Executor* executor, framework::Scope* scope, void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
const framework::ProgramDesc& main_program, const framework::ProgramDesc& main_program,
const std::string& dirname, const std::string& dirname,
const std::string& param_filename) { const std::string& param_filename,
bool model_from_memory = false) {
const framework::BlockDesc& global_block = main_program.Block(0); const framework::BlockDesc& global_block = main_program.Block(0);
framework::ProgramDesc* load_program = new framework::ProgramDesc(); framework::ProgramDesc* load_program = new framework::ProgramDesc();
...@@ -108,6 +109,7 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope, ...@@ -108,6 +109,7 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
op->SetType("load_combine"); op->SetType("load_combine");
op->SetOutput("Out", paramlist); op->SetOutput("Out", paramlist);
op->SetAttr("file_path", {param_filename}); op->SetAttr("file_path", {param_filename});
op->SetAttr("model_from_memory", {model_from_memory});
op->CheckAttrs(); op->CheckAttrs();
} }
...@@ -130,16 +132,17 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor, ...@@ -130,16 +132,17 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
"model version %ld is not supported.", "model version %ld is not supported.",
main_program->Version()); main_program->Version());
LoadPersistables(executor, scope, *main_program, dirname, ""); // model_from_memory is false in seperate parameters.
LoadPersistables(executor, scope, *main_program, dirname, "",
false /* model_from_memory */);
return main_program; return main_program;
} }
std::unique_ptr<framework::ProgramDesc> Load( std::unique_ptr<framework::ProgramDesc> Load(
framework::Executor* executor, framework::Scope* scope, framework::Executor* executor, framework::Scope* scope,
const std::string& prog_filename, const std::string& param_filename) { const std::string& prog_filename, const std::string& param_filename) {
std::string model_filename = prog_filename;
std::string program_desc_str; std::string program_desc_str;
ReadBinaryFile(model_filename, &program_desc_str); ReadBinaryFile(prog_filename, &program_desc_str);
std::unique_ptr<framework::ProgramDesc> main_program( std::unique_ptr<framework::ProgramDesc> main_program(
new framework::ProgramDesc(program_desc_str)); new framework::ProgramDesc(program_desc_str));
...@@ -147,7 +150,22 @@ std::unique_ptr<framework::ProgramDesc> Load( ...@@ -147,7 +150,22 @@ std::unique_ptr<framework::ProgramDesc> Load(
"model version %ld is not supported.", "model version %ld is not supported.",
main_program->Version()); main_program->Version());
LoadPersistables(executor, scope, *main_program, "", param_filename); LoadPersistables(executor, scope, *main_program, "", param_filename,
false /* model_from_memory */);
return main_program;
}
std::unique_ptr<framework::ProgramDesc> LoadFromMemory(
framework::Executor* executor, framework::Scope* scope,
const std::string& prog_buffer, const std::string& param_buffer) {
std::unique_ptr<framework::ProgramDesc> main_program(
new framework::ProgramDesc(prog_buffer));
PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()),
"model version %ld is not supported.",
main_program->Version());
LoadPersistables(executor, scope, *main_program, "", param_buffer,
true /* model_filename */);
return main_program; return main_program;
} }
......
...@@ -30,7 +30,8 @@ void Init(const std::vector<std::string> argv); ...@@ -30,7 +30,8 @@ void Init(const std::vector<std::string> argv);
void LoadPersistables(framework::Executor* executor, framework::Scope* scope, void LoadPersistables(framework::Executor* executor, framework::Scope* scope,
const framework::ProgramDesc& main_program, const framework::ProgramDesc& main_program,
const std::string& dirname, const std::string& dirname,
const std::string& param_filename); const std::string& param_filename,
bool model_from_memory);
std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor, std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
framework::Scope* scope, framework::Scope* scope,
...@@ -41,6 +42,10 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor, ...@@ -41,6 +42,10 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor* executor,
const std::string& prog_filename, const std::string& prog_filename,
const std::string& param_filename); const std::string& param_filename);
std::unique_ptr<framework::ProgramDesc> LoadFromMemory(
framework::Executor* executor, framework::Scope* scope,
const std::string& prog_buffer, const std::string& param_buffer);
// Save the variables from a scope to disk. // Save the variables from a scope to disk.
void SaveVars(const framework::Scope& scope, void SaveVars(const framework::Scope& scope,
const std::vector<std::string>& vars, const std::string& dirname, const std::vector<std::string>& vars, const std::string& dirname,
......
...@@ -90,5 +90,4 @@ TEST(prelu_op, test_scalar) { ...@@ -90,5 +90,4 @@ TEST(prelu_op, test_scalar) {
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
// USE_OP(prelu); USE_OP(prelu);
USE_CPU_ONLY_OP(prelu);
nv_library(tensorrt_plugin nv_library(tensorrt_plugin
SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu
avg_pool_op_plugin.cu avg_pool_op_plugin.cu
DEPS enforce tensorrt_engine) DEPS enforce tensorrt_engine prelu)
...@@ -14,92 +14,16 @@ ...@@ -14,92 +14,16 @@
#include <stdio.h> #include <stdio.h>
#include <cassert> #include <cassert>
#include <vector>
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h" #include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h"
#include "paddle/fluid/operators/math/prelu.h"
namespace paddle { namespace paddle {
namespace inference { namespace inference {
namespace tensorrt { namespace tensorrt {
namespace plugin { namespace plugin {
static const int CUDA_NUM_THREADS = 1024;
static const int CUDA_MAX_NUM_BLOCKS = 65535;
inline static int GET_NUM_BLOCKS(const int N) {
return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
__global__ void PReluChannelWiseKernel(const float *input, const float *alpha,
float *output, int channel,
size_t spatial_size) {
size_t offset = blockIdx.x * spatial_size;
const float *in = input + offset;
float *out = output + offset;
float scale = alpha[blockIdx.x % channel];
for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
float x = in[i];
out[i] = (x > 0) ? x : scale * x;
}
}
__global__ void PReluElementWiseKernel(const float *input, const float *alpha,
float *output, size_t spatial_size) {
size_t offset = blockIdx.x * spatial_size;
const float *in = input + offset;
const float *scale = alpha + offset;
float *out = output + offset;
for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
float x = in[i];
out[i] = (x > 0) ? x : scale[i] * x;
}
}
__global__ void PReluScalarKernel(const float *input, const float *alpha,
float *output, size_t spatial_size) {
size_t offset = blockIdx.x * spatial_size;
const float *in = input + offset;
float scale = *alpha;
float *out = output + offset;
for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
float x = in[i];
out[i] = (x > 0) ? x : scale * x;
}
}
static inline void PReluChannelWise(cudaStream_t stream, const float *input,
const float *alpha, float *output,
int batch_size,
const nvinfer1::Dims &dims) {
size_t unroll = batch_size * dims.d[0];
size_t spatial_size = dims.d[1] * dims.d[2];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluChannelWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, dims.d[0], spatial_size);
}
static inline void PReluElementWise(cudaStream_t stream, const float *input,
const float *alpha, float *output,
int batch_size,
const nvinfer1::Dims &dims) {
size_t unroll = batch_size * dims.d[0];
size_t spatial_size = dims.d[1] * dims.d[2];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluElementWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, spatial_size);
}
static inline void PReluScalar(cudaStream_t stream, const float *input,
const float *alpha, float *output,
int batch_size, const nvinfer1::Dims &dims) {
size_t unroll = batch_size * dims.d[0];
size_t spatial_size = dims.d[1] * dims.d[2];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluScalarKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, spatial_size);
}
nvinfer1::Dims PReluPlugin::getOutputDimensions(int index, nvinfer1::Dims PReluPlugin::getOutputDimensions(int index,
const nvinfer1::Dims *inputDims, const nvinfer1::Dims *inputDims,
int nbInputs) { int nbInputs) {
...@@ -110,19 +34,31 @@ nvinfer1::Dims PReluPlugin::getOutputDimensions(int index, ...@@ -110,19 +34,31 @@ nvinfer1::Dims PReluPlugin::getOutputDimensions(int index,
return output_dims; return output_dims;
} }
int PReluPlugin::enqueue(int batchSize, const void *const *inputs, int PReluPlugin::enqueue(int batch_size, const void *const *inputs,
void **outputs, void *workspace, cudaStream_t stream) { void **outputs, void *workspace, cudaStream_t stream) {
// input dims is CHW. // input dims is CHW.
const auto &input_dims = this->getInputDims(0); const auto &input_dims = this->getInputDims(0);
const float *input = reinterpret_cast<const float *>(inputs[0]); const float *input = reinterpret_cast<const float *>(inputs[0]);
const float *alpha = reinterpret_cast<const float *>(alpha_.get().values); const float *alpha = reinterpret_cast<const float *>(alpha_.get().values);
float *output = reinterpret_cast<float **>(outputs)[0]; float *output = reinterpret_cast<float **>(outputs)[0];
std::vector<int> input_shape;
input_shape.push_back(batch_size);
for (int i = 0; i < input_dims.nbDims; i++) {
input_shape.push_back(input_dims.d[i]);
}
if (mode_ == "channel") { if (mode_ == "channel") {
PReluChannelWise(stream, input, alpha, output, batchSize, input_dims); operators::math::PreluChannelWiseDirectCUDAFunctor<float>
prelu_channel_wise;
prelu_channel_wise(stream, input, alpha, output, input_shape);
} else if (mode_ == "element") { } else if (mode_ == "element") {
PReluElementWise(stream, input, alpha, output, batchSize, input_dims); operators::math::PreluElementWiseDirectCUDAFunctor<float>
prelu_element_wise;
prelu_element_wise(stream, input, alpha, output, input_shape);
} else { } else {
PReluScalar(stream, input, alpha, output, batchSize, input_dims); operators::math::PreluScalarDirectCUDAFunctor<float> prelu_scalar;
prelu_scalar(stream, input, alpha, output, input_shape);
} }
return cudaGetLastError() != cudaSuccess; return cudaGetLastError() != cudaSuccess;
} }
......
...@@ -93,9 +93,17 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -93,9 +93,17 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
} }
} }
void SetConfig(contrib::AnalysisConfig *cfg) { void SetConfig(contrib::AnalysisConfig *cfg, bool memory_load = false) {
cfg->prog_file = FLAGS_infer_model + "/__model__"; if (memory_load) {
cfg->param_file = FLAGS_infer_model + "/param"; std::string buffer_prog, buffer_param;
ReadBinaryFile(FLAGS_infer_model + "/__model__", &buffer_prog);
ReadBinaryFile(FLAGS_infer_model + "/param", &buffer_param);
cfg->SetModelBuffer(&buffer_prog[0], buffer_prog.size(), &buffer_param[0],
buffer_param.size());
} else {
cfg->prog_file = FLAGS_infer_model + "/__model__";
cfg->param_file = FLAGS_infer_model + "/param";
}
cfg->use_gpu = false; cfg->use_gpu = false;
cfg->device = 0; cfg->device = 0;
cfg->specify_input_name = true; cfg->specify_input_name = true;
...@@ -114,9 +122,9 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { ...@@ -114,9 +122,9 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
} }
// Easy for profiling independently. // Easy for profiling independently.
TEST(Analyzer_Chinese_ner, profile) { void profile(bool memory_load = false) {
contrib::AnalysisConfig cfg; contrib::AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg, memory_load);
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
...@@ -138,6 +146,12 @@ TEST(Analyzer_Chinese_ner, profile) { ...@@ -138,6 +146,12 @@ TEST(Analyzer_Chinese_ner, profile) {
} }
} }
TEST(Analyzer_Chinese_ner, profile) { profile(); }
TEST(Analyzer_Chinese_ner, profile_memory_load) {
profile(true /* memory_load */);
}
// Check the fuse status // Check the fuse status
TEST(Analyzer_Chinese_ner, fuse_statis) { TEST(Analyzer_Chinese_ner, fuse_statis) {
contrib::AnalysisConfig cfg; contrib::AnalysisConfig cfg;
......
...@@ -49,8 +49,6 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) { ...@@ -49,8 +49,6 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) {
os << GenSpaces(num_spaces) << "device: " << config.device << "\n"; os << GenSpaces(num_spaces) << "device: " << config.device << "\n";
os << GenSpaces(num_spaces) os << GenSpaces(num_spaces)
<< "fraction_of_gpu_memory: " << config.fraction_of_gpu_memory << "\n"; << "fraction_of_gpu_memory: " << config.fraction_of_gpu_memory << "\n";
os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n";
os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n";
os << GenSpaces(num_spaces) os << GenSpaces(num_spaces)
<< "specify_input_name: " << config.specify_input_name << "\n"; << "specify_input_name: " << config.specify_input_name << "\n";
os << GenSpaces(num_spaces) os << GenSpaces(num_spaces)
...@@ -65,6 +63,13 @@ std::ostream &operator<<(std::ostream &os, ...@@ -65,6 +63,13 @@ std::ostream &operator<<(std::ostream &os,
os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n"; os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n";
num_spaces++; num_spaces++;
os << *reinterpret_cast<const NativeConfig *>(&config); os << *reinterpret_cast<const NativeConfig *>(&config);
if (!config.model_from_memory()) {
os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n";
os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n";
} else {
os << GenSpaces(num_spaces)
<< "prog_file and param_file: load from memory \n";
}
os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.enable_ir_optim os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.enable_ir_optim
<< "\n"; << "\n";
os << GenSpaces(num_spaces) os << GenSpaces(num_spaces)
......
cc_library(benchmark SRCS benchmark.cc DEPS enforce) cc_library(benchmark SRCS benchmark.cc DEPS enforce)
cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark) cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark)
cc_binary(visualizer SRCS visualizer.cc DEPS analysis
paddle_pass_builder ir_pass_manager pass graph_viz_pass analysis_passes)
if(WIN32)
target_link_libraries(visualizer shlwapi)
endif(WIN32)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/utils/visualizer.h"
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <fstream>
#include <memory>
#include "paddle/fluid/framework/ir/graph_viz_pass.h"
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
#include "paddle/fluid/platform/init.h"
DEFINE_string(model_dir, "", "model directory");
DEFINE_string(model_program_path, "", "model program path");
DEFINE_string(model_params_path, "", "model params path");
USE_PASS(graph_viz_pass);
USE_PASS(graph_to_program_pass);
using paddle::inference::analysis::Argument;
namespace paddle {
namespace inference {
namespace utils {
void Visualizer::SetArgument(Argument *argument) { argument_ = argument; }
bool Visualizer::Run() {
paddle::framework::InitDevices(false);
paddle::inference::analysis::Analyzer().Run(argument_);
return true;
}
} // namespace utils
} // namespace inference
} // namespace paddle
// Generate a dot file describing the structure of graph.
// To use this tool, run command: ./visualizer [options...]
// Options:
// --model_dir: the directory of model
// --model_program_path: the path of program
// --model_params_path: the path of params
int main(int argc, char *argv[]) {
gflags::ParseCommandLineFlags(&argc, &argv, true);
google::InitGoogleLogging(argv[0]);
paddle::inference::analysis::Argument argument;
argument.SetUseGPU(false);
argument.SetUseTensorRT(false);
if (FLAGS_model_dir.empty()) {
if (FLAGS_model_program_path.empty() || FLAGS_model_params_path.empty()) {
LOG(ERROR) << "Please set model_dir"
" or model_program_path and model_params_path";
return -1;
} else {
argument.SetModelProgramPath(FLAGS_model_program_path);
argument.SetModelParamsPath(FLAGS_model_params_path);
}
} else {
argument.SetModelDir(FLAGS_model_dir);
}
// Only 1 pass, default filename is 0_ir_origin.dot
// For more details, looking for paddle::inference::analysis::IRPassManager
argument.SetIrAnalysisPasses({"graph_viz_pass"});
std::unique_ptr<paddle::framework::Scope> scope{
new paddle::framework::Scope()};
argument.SetScopeNotOwned(
const_cast<paddle::framework::Scope *>(scope.get()));
paddle::inference::utils::Visualizer visualizer;
visualizer.SetArgument(&argument);
visualizer.Run();
return 0;
}
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/fluid/inference/analysis/argument.h"
namespace paddle {
namespace inference {
namespace utils {
using paddle::inference::analysis::Argument;
class Visualizer final {
public:
Visualizer() = default;
~Visualizer() = default;
Visualizer(const Visualizer &) = delete;
Visualizer &operator=(const Visualizer &) = delete;
void SetArgument(Argument *);
bool Run();
private:
Argument *argument_;
};
} // namespace utils
} // namespace inference
} // namespace paddle
...@@ -14,11 +14,13 @@ ...@@ -14,11 +14,13 @@
#include "paddle/fluid/memory/allocation/legacy_allocator.h" #include "paddle/fluid/memory/allocation/legacy_allocator.h"
#include <string> #include <string>
#include <vector>
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/memory/detail/buddy_allocator.h" #include "paddle/fluid/memory/detail/buddy_allocator.h"
#include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/split.h"
DEFINE_bool(init_allocated_mem, false, DEFINE_bool(init_allocated_mem, false,
"It is a mistake that the values of the memory allocated by " "It is a mistake that the values of the memory allocated by "
...@@ -86,7 +88,7 @@ struct NaiveAllocator { ...@@ -86,7 +88,7 @@ struct NaiveAllocator {
template <> template <>
void *Alloc<platform::CPUPlace>(const platform::CPUPlace &place, size_t size) { void *Alloc<platform::CPUPlace>(const platform::CPUPlace &place, size_t size) {
VLOG(1) << "Allocate " << size << " bytes on " << platform::Place(place); VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
void *p = GetCPUBuddyAllocator()->Alloc(size); void *p = GetCPUBuddyAllocator()->Alloc(size);
if (FLAGS_init_allocated_mem) { if (FLAGS_init_allocated_mem) {
memset(p, 0xEF, size); memset(p, 0xEF, size);
...@@ -97,7 +99,7 @@ void *Alloc<platform::CPUPlace>(const platform::CPUPlace &place, size_t size) { ...@@ -97,7 +99,7 @@ void *Alloc<platform::CPUPlace>(const platform::CPUPlace &place, size_t size) {
template <> template <>
void Free<platform::CPUPlace>(const platform::CPUPlace &place, void *p) { void Free<platform::CPUPlace>(const platform::CPUPlace &place, void *p) {
VLOG(1) << "Free pointer=" << p << " on " << platform::Place(place); VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetCPUBuddyAllocator()->Free(p); GetCPUBuddyAllocator()->Free(p);
} }
...@@ -110,19 +112,21 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) { ...@@ -110,19 +112,21 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
static std::once_flag init_flag; static std::once_flag init_flag;
static detail::BuddyAllocator **a_arr = nullptr; static detail::BuddyAllocator **a_arr = nullptr;
static std::vector<int> devices;
std::call_once(init_flag, [gpu_id]() { std::call_once(init_flag, [gpu_id]() {
int gpu_num = platform::GetCUDADeviceCount(); devices = platform::GetSelectedDevices();
PADDLE_ENFORCE(gpu_id < gpu_num, "gpu_id:%d should < gpu_num:%d", gpu_id, int gpu_num = devices.size();
gpu_num);
a_arr = new BuddyAllocator *[gpu_num]; a_arr = new BuddyAllocator *[gpu_num];
for (int i = 0; i < gpu_num; i++) { for (size_t i = 0; i < devices.size(); ++i) {
int dev_id = devices[i];
a_arr[i] = nullptr; a_arr[i] = nullptr;
platform::SetDeviceId(i); platform::SetDeviceId(dev_id);
a_arr[i] = new BuddyAllocator( a_arr[i] = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
std::unique_ptr<detail::SystemAllocator>(new detail::GPUAllocator(i)), new detail::GPUAllocator(dev_id)),
platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); platform::GpuMinChunkSize(),
platform::GpuMaxChunkSize());
VLOG(10) << "\n\nNOTE: each GPU device use " VLOG(10) << "\n\nNOTE: each GPU device use "
<< FLAGS_fraction_of_gpu_memory_to_use * 100 << FLAGS_fraction_of_gpu_memory_to_use * 100
...@@ -134,7 +138,9 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { ...@@ -134,7 +138,9 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) {
}); });
platform::SetDeviceId(gpu_id); platform::SetDeviceId(gpu_id);
return a_arr[gpu_id]; auto pos = std::distance(devices.begin(),
std::find(devices.begin(), devices.end(), gpu_id));
return a_arr[pos];
} }
#endif #endif
......
...@@ -70,7 +70,7 @@ endif() ...@@ -70,7 +70,7 @@ endif()
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
if (WITH_GPU) if (WITH_GPU)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu)
endif() endif()
# FIXME(typhoonzero): operator deps may not needed. # FIXME(typhoonzero): operator deps may not needed.
......
...@@ -76,8 +76,8 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx, ...@@ -76,8 +76,8 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx,
} }
#endif #endif
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>(name)->type()), framework::GetDataTypeOfVar(ctx.InputVar(name)), ctx.GetPlace(), layout,
ctx.GetPlace(), layout, library); library);
} }
class ActivationOp : public framework::OperatorWithKernel { class ActivationOp : public framework::OperatorWithKernel {
......
...@@ -41,6 +41,12 @@ static std::unordered_set<std::string> InplaceOpSet = { ...@@ -41,6 +41,12 @@ static std::unordered_set<std::string> InplaceOpSet = {
"floor", "reciprocal", "relu6", "soft_relu", "hard_sigmoid", "floor", "reciprocal", "relu6", "soft_relu", "hard_sigmoid",
}; };
/* The following operator can be used to process SelectedRows, because the
* output of those operator for zero is zero too.
*/
static std::unordered_set<std::string> CanBeUsedBySelectedRows = {
"abs", "abs_grad", "square", "square_grad", "sqrt", "sqrt_grad"};
static bool IsInplace(std::string op) { return InplaceOpSet.count(op); } static bool IsInplace(std::string op) { return InplaceOpSet.count(op); }
template <typename DeviceContext, typename Functor> template <typename DeviceContext, typename Functor>
...@@ -50,16 +56,38 @@ class ActivationKernel ...@@ -50,16 +56,38 @@ class ActivationKernel
using T = typename Functor::ELEMENT_TYPE; using T = typename Functor::ELEMENT_TYPE;
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto& X = detail::Ref(context.Input<framework::Tensor>("X"), auto x_var = context.InputVar("X");
"Cannot get input tensor X, variable name = %s", auto out_var = context.OutputVar("Out");
context.op().Input("X")); PADDLE_ENFORCE(x_var != nullptr,
"Cannot get input Variable X, variable name = %s",
auto& Out = detail::Ref(context.Output<framework::Tensor>("Out"), context.op().Input("X"));
"Cannot get output tensor Out, variable name = %s", PADDLE_ENFORCE(out_var != nullptr,
context.op().Output("Out")); "Cannot get output Variable Out, variable name = %s",
Out.mutable_data<T>(context.GetPlace()); context.op().Output("Out"));
framework::Tensor X, *Out;
if (CanBeUsedBySelectedRows.count(context.op().Type())) {
X = detail::Ref(
paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var),
"Cannot get input Tensor X, variable name = %s",
context.op().Input("X"));
Out = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
out_var);
} else {
X = detail::Ref(context.Input<framework::Tensor>("X"),
"Cannot get input Tensor X, variable name = %s",
context.op().Input("X"));
Out = context.Output<framework::Tensor>("Out");
}
PADDLE_ENFORCE(Out != nullptr,
"Cannot get output tensor Out, variable name = %s",
context.op().Output("Out"));
Out->mutable_data<T>(context.GetPlace());
auto x = framework::EigenVector<T>::Flatten(X); auto x = framework::EigenVector<T>::Flatten(X);
auto out = framework::EigenVector<T>::Flatten(Out); auto out = framework::EigenVector<T>::Flatten(*Out);
auto* place = auto* place =
context.template device_context<DeviceContext>().eigen_device(); context.template device_context<DeviceContext>().eigen_device();
Functor functor; Functor functor;
...@@ -78,14 +106,54 @@ class ActivationGradKernel ...@@ -78,14 +106,54 @@ class ActivationGradKernel
public: public:
using T = typename Functor::ELEMENT_TYPE; using T = typename Functor::ELEMENT_TYPE;
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext& context) const override {
auto* Out = context.Input<framework::Tensor>("Out"); auto out_var = context.InputVar("Out");
auto* dOut = auto out_grad_var = context.InputVar(framework::GradVarName("Out"));
context.Input<framework::Tensor>(framework::GradVarName("Out")); auto x_grad_var = context.OutputVar(framework::GradVarName("X"));
auto* dX = context.Output<framework::Tensor>(framework::GradVarName("X")); PADDLE_ENFORCE(out_var != nullptr,
"Cannot get input Variable Out, variable name = %s",
context.op().Input("Out"));
PADDLE_ENFORCE(out_grad_var != nullptr,
"Cannot get input Variable %s, variable name = %s",
framework::GradVarName("Out"),
context.op().Input(framework::GradVarName("Out")));
PADDLE_ENFORCE(x_grad_var != nullptr,
"Cannot get output Variable %s, variable name = %s",
framework::GradVarName("X"),
context.op().Output(framework::GradVarName("X")));
framework::Tensor Out, dOut, *dX;
if (CanBeUsedBySelectedRows.count(context.op().Type())) {
Out = detail::Ref(
paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var),
"Cannot get input Tensor Out, variable name = %s",
context.op().Input("Out"));
dOut =
detail::Ref(paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(
*out_grad_var),
"Cannot get input Tensor %s, variable name = %s",
framework::GradVarName("Out"),
context.op().Input(framework::GradVarName("Out")));
dX = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
x_grad_var);
} else {
Out = detail::Ref(context.Input<framework::Tensor>("Out"),
"Cannot get input Tensor Out, variable name = %s",
context.op().Input("Out"));
dOut = detail::Ref(
context.Input<framework::Tensor>(framework::GradVarName("Out")),
"Cannot get input Tensor %s, variable name = %s",
framework::GradVarName("Out"),
context.op().Input(framework::GradVarName("Out")));
dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
}
PADDLE_ENFORCE(dX != nullptr,
"Cannot get output tensor %s, variable name = %s",
framework::GradVarName("X"),
context.op().Output(framework::GradVarName("X")));
dX->mutable_data<T>(context.GetPlace()); dX->mutable_data<T>(context.GetPlace());
auto dout = framework::EigenVector<T>::Flatten(*dOut); auto dout = framework::EigenVector<T>::Flatten(dOut);
auto out = framework::EigenVector<T>::Flatten(*Out); auto out = framework::EigenVector<T>::Flatten(Out);
auto dx = framework::EigenVector<T>::Flatten(*dX); auto dx = framework::EigenVector<T>::Flatten(*dX);
auto* place = auto* place =
context.template device_context<DeviceContext>().eigen_device(); context.template device_context<DeviceContext>().eigen_device();
...@@ -96,8 +164,19 @@ class ActivationGradKernel ...@@ -96,8 +164,19 @@ class ActivationGradKernel
} }
bool inplace = functor.Inplace(); bool inplace = functor.Inplace();
if (!inplace) { if (!inplace) {
auto* X = context.Input<framework::Tensor>("X"); auto x_var = context.InputVar("X");
auto x = framework::EigenVector<T>::Flatten(*X); PADDLE_ENFORCE(x_var != nullptr,
"Cannot get input tensor X, variable name = %s",
context.op().Input("X"));
framework::Tensor X;
if (CanBeUsedBySelectedRows.count(context.op().Type())) {
X = detail::Ref(
paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var));
} else {
X = detail::Ref(context.Input<framework::Tensor>("X"));
}
auto x = framework::EigenVector<T>::Flatten(X);
functor(*place, x, out, dout, dx); functor(*place, x, out, dout, dx);
} else { } else {
VLOG(10) << " Inplace activation "; VLOG(10) << " Inplace activation ";
......
...@@ -231,10 +231,10 @@ use lstm_x_t as input and compute as standard LSTM. ...@@ -231,10 +231,10 @@ use lstm_x_t as input and compute as standard LSTM.
template <typename T> template <typename T>
inline void bias_relu(const int n, const T* x, const T* bias, T* y) { inline void bias_relu(const int n, const T* x, const T* bias, T* y) {
if (bias) { if (bias) {
math::vec_add_bias<T, platform::jit::avx>(n, *bias, x, y); math::vec_add_bias<T, platform::avx>(n, *bias, x, y);
math::vec_relu<T, platform::jit::avx>(n, y, y); math::vec_relu<T, platform::avx>(n, y, y);
} else { } else {
math::vec_relu<T, platform::jit::avx>(n, x, y); math::vec_relu<T, platform::avx>(n, x, y);
} }
} }
...@@ -245,8 +245,8 @@ inline void vec_softmax(const int n, const T* x, T* y) { ...@@ -245,8 +245,8 @@ inline void vec_softmax(const int n, const T* x, T* y) {
for (int i = 1; i < n; ++i) { for (int i = 1; i < n; ++i) {
scalar = scalar < x[i] ? x[i] : scalar; scalar = scalar < x[i] ? x[i] : scalar;
} }
math::vec_add_bias<T, platform::jit::avx>(n, -scalar, x, y); // sub math::vec_add_bias<T, platform::avx>(n, -scalar, x, y); // sub
math::vec_exp<T>(n, y, y); // exp math::vec_exp<T>(n, y, y); // exp
// sum // sum
scalar = T(0); scalar = T(0);
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
...@@ -302,13 +302,13 @@ class AttentionLSTMKernel : public framework::OpKernel<T> { ...@@ -302,13 +302,13 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
auto& act_gate_str = ctx.Attr<std::string>("gate_activation"); auto& act_gate_str = ctx.Attr<std::string>("gate_activation");
auto& act_cell_str = ctx.Attr<std::string>("cell_activation"); auto& act_cell_str = ctx.Attr<std::string>("cell_activation");
auto& act_cand_str = ctx.Attr<std::string>("candidate_activation"); auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");
if (platform::jit::MayIUse(platform::jit::avx)) { if (platform::MayIUse(platform::avx)) {
math::VecActivations<T, platform::jit::avx> act_functor; math::VecActivations<T, platform::avx> act_functor;
act_gate = act_functor(act_gate_str); act_gate = act_functor(act_gate_str);
act_cell = act_functor(act_cell_str); act_cell = act_functor(act_cell_str);
act_cand = act_functor(act_cand_str); act_cand = act_functor(act_cand_str);
} else { } else {
math::VecActivations<T, platform::jit::isa_any> act_functor; math::VecActivations<T, platform::isa_any> act_functor;
act_gate = act_functor(act_gate_str); act_gate = act_functor(act_gate_str);
act_cell = act_functor(act_cell_str); act_cell = act_functor(act_cell_str);
act_cand = act_functor(act_cand_str); act_cand = act_functor(act_cand_str);
......
...@@ -110,11 +110,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -110,11 +110,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
auto x_dims = framework::vectorize(input->dims()); auto x_dims = framework::vectorize(input->dims());
auto f_dims = framework::vectorize(filter->dims()); auto f_dims = framework::vectorize(filter->dims());
if (activation == "identity") { if (!exhaustive_search) {
// Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is
// enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib.
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
} else if (!exhaustive_search) {
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
...@@ -165,18 +161,42 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> { ...@@ -165,18 +161,42 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit,
"workspace_size to be allocated exceeds the limit"); "workspace_size to be allocated exceeds the limit");
// ------------------- cudnn conv+bias+act forward -------------------- if ((activation == "identity") &&
ScalingParamType<T> alpha1 = 1.0f; (algo != CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) &&
ScalingParamType<T> alpha2 = residual ? 1.0f : 0.0f; (!residual)) {
auto cudnn_func = [&](void* cudnn_workspace) { // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib.
handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc, // But test in some case, the speed is slower, change to use
filter_data, cudnn_conv_desc, algo, cudnn_workspace, // cudnnConvolutionForward and cudnnAddTensor
workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data, // ------------- cudnn conv forward and bias add ---------------------
cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc, ScalingParamType<T> alpha = 1.0f, beta = 0.0f;
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward(
handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc,
filter_data, cudnn_conv_desc, algo, cudnn_workspace,
workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
CUDNN_ENFORCE(platform::dynload::cudnnAddTensor(
handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc,
output_data)); output_data));
}; } else {
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); if (activation == "identity") {
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
}
// ------------------- cudnn conv+bias+act forward --------------------
ScalingParamType<T> alpha1 = 1.0f;
ScalingParamType<T> alpha2 = residual ? 1.0f : 0.0f;
auto cudnn_func = [&](void* cudnn_workspace) {
CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward(
handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc,
filter_data, cudnn_conv_desc, algo, cudnn_workspace,
workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data,
cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc,
output_data));
};
workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
}
} }
}; };
#endif #endif
......
...@@ -28,6 +28,46 @@ using mkldnn::stream; ...@@ -28,6 +28,46 @@ using mkldnn::stream;
using platform::to_void_cast; using platform::to_void_cast;
using platform::GetMKLDNNFormat; using platform::GetMKLDNNFormat;
inline void GetWeightsTz(std::vector<int>& weights_tz, int groups, // NOLINT
bool is_conv3d) {
if (groups > 1) {
if (is_conv3d) {
int output = weights_tz[0];
int input = weights_tz[1];
int dimension = weights_tz[2];
int height = weights_tz[3];
int width = weights_tz[4];
weights_tz.resize(6);
weights_tz[0] = groups;
weights_tz[1] = output / groups;
weights_tz[2] = input;
weights_tz[3] = dimension;
weights_tz[4] = height;
weights_tz[5] = width;
} else {
int output = weights_tz[0];
int input = weights_tz[1];
int height = weights_tz[2];
int width = weights_tz[3];
weights_tz.resize(5);
weights_tz[0] = groups;
weights_tz[1] = output / groups;
weights_tz[2] = input;
weights_tz[3] = height;
weights_tz[4] = width;
}
}
}
inline mkldnn::memory::format GetWeightsFormat(mkldnn::memory::format format,
int groups, bool is_conv3d) {
if (is_conv3d) {
return (groups == 1) ? format : mkldnn::memory::format::goidhw;
} else {
return (groups == 1) ? format : mkldnn::memory::format::goihw;
}
}
template <typename T> template <typename T>
class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
public: public:
...@@ -52,10 +92,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -52,10 +92,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
filter->format() != memory::format::format_undef, filter->format() != memory::format::format_undef,
"Wrong layout/format set for Filter tensor"); "Wrong layout/format set for Filter tensor");
PADDLE_ENFORCE(input->dims().size() == 4, PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5,
"Input must be with 4 dimensions, i.e. NCHW"); "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW");
PADDLE_ENFORCE(filter->dims().size() == 4, PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5,
"Filter must be with 4 dimensions, i.e. OIHW"); "Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW");
if (bias) { if (bias) {
PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN && PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN &&
bias->format() != memory::format::format_undef, bias->format() != memory::format::format_undef,
...@@ -71,9 +111,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -71,9 +111,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection"); bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
int groups = ctx.Attr<int>("groups"); int groups = ctx.Attr<int>("groups");
bool is_conv3d = strides.size() == 3U;
// TODO(tpatejko): add support for dilation // TODO(tpatejko): add support for dilation
PADDLE_ENFORCE( PADDLE_ENFORCE(
dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, is_conv3d
? dilations.size() == 3 && dilations[0] == 1 && dilations[1] == 1 &&
dilations[2] == 1
: dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
"dilation in convolution is not implemented yet"); "dilation in convolution is not implemented yet");
const T* input_data = input->data<T>(); const T* input_data = input->data<T>();
...@@ -83,18 +127,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -83,18 +127,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std::vector<int> weights_tz = std::vector<int> weights_tz =
paddle::framework::vectorize2int(filter->dims()); paddle::framework::vectorize2int(filter->dims());
int g = std::max(groups, 1); int g = std::max(groups, 1);
if (g > 1) { GetWeightsTz(weights_tz, g, is_conv3d);
int o = weights_tz[0];
int i = weights_tz[1];
int h = weights_tz[2];
int w = weights_tz[3];
weights_tz.resize(5);
weights_tz[0] = g;
weights_tz[1] = o / g;
weights_tz[2] = i;
weights_tz[3] = h;
weights_tz[4] = w;
}
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims()); std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// Get unique name for storing MKLDNN primitives // Get unique name for storing MKLDNN primitives
...@@ -105,11 +138,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -105,11 +138,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std::vector<primitive> pipeline; std::vector<primitive> pipeline;
auto src_format = input->format();
mkldnn::memory::format weights_format =
GetWeightsFormat(filter->format(), g, is_conv3d);
auto user_src_md = platform::MKLDNNMemDesc( auto user_src_md = platform::MKLDNNMemDesc(
{src_tz}, platform::MKLDNNGetDataType<T>(), input->format()); {src_tz}, platform::MKLDNNGetDataType<T>(), src_format);
auto user_weights_md = platform::MKLDNNMemDesc( auto user_weights_md = platform::MKLDNNMemDesc(
{weights_tz}, platform::MKLDNNGetDataType<T>(), {weights_tz}, platform::MKLDNNGetDataType<T>(), weights_format);
(g == 1) ? filter->format() : mkldnn::memory::format::goihw);
/* create memory descriptor for convolution without specified format /* create memory descriptor for convolution without specified format
* ('any') which lets a primitive (convolution in this case) choose * ('any') which lets a primitive (convolution in this case) choose
...@@ -119,10 +155,16 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -119,10 +155,16 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto chosen_memory_format = auto chosen_memory_format =
platform::data_format_to_memory_format(data_format); platform::data_format_to_memory_format(data_format);
if (is_conv3d) {
chosen_memory_format =
platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format);
}
weights_format = GetWeightsFormat(chosen_memory_format, g, is_conv3d);
auto src_md = platform::MKLDNNMemDesc( auto src_md = platform::MKLDNNMemDesc(
src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format); src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
auto weights_md = platform::MKLDNNMemDesc( auto weights_md = platform::MKLDNNMemDesc(
weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format); weights_tz, platform::MKLDNNGetDataType<T>(), weights_format);
std::vector<int> bias_tz; // TODO(mgallus): avoid empty vector creation. std::vector<int> bias_tz; // TODO(mgallus): avoid empty vector creation.
// Currently used whenever bias is != nullptr. // Currently used whenever bias is != nullptr.
auto dst_md = platform::MKLDNNMemDesc( auto dst_md = platform::MKLDNNMemDesc(
...@@ -263,8 +305,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -263,8 +305,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const mkldnn::engine& engine, const bool fuse_relu, const mkldnn::engine& engine, const bool fuse_relu,
const bool fuse_residual_conn, const bool fuse_residual_conn,
mkldnn::prop_kind fwd_prop_kind) const { mkldnn::prop_kind fwd_prop_kind) const {
memory::dims stride_dims = {strides[0], strides[1]}; memory::dims stride_dims = strides;
memory::dims padding_dims = {paddings[0], paddings[1]}; memory::dims padding_dims = paddings;
auto conv_desc = mkldnn::convolution_forward::desc( auto conv_desc = mkldnn::convolution_forward::desc(
fwd_prop_kind, mkldnn::convolution_direct, src, weights, dst, fwd_prop_kind, mkldnn::convolution_direct, src, weights, dst,
...@@ -288,8 +330,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -288,8 +330,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const mkldnn::engine& engine, const bool fuse_relu, const mkldnn::engine& engine, const bool fuse_relu,
const bool fuse_residual_conn, const bool fuse_residual_conn,
mkldnn::prop_kind fwd_prop_kind) const { mkldnn::prop_kind fwd_prop_kind) const {
memory::dims stride_dims = {strides[0], strides[1]}; memory::dims stride_dims = strides;
memory::dims padding_dims = {paddings[0], paddings[1]}; memory::dims padding_dims = paddings;
auto conv_desc = mkldnn::convolution_forward::desc( auto conv_desc = mkldnn::convolution_forward::desc(
fwd_prop_kind, mkldnn::convolution_direct, src, weights, bias, dst, fwd_prop_kind, mkldnn::convolution_direct, src, weights, bias, dst,
...@@ -349,6 +391,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -349,6 +391,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations"); std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups"); int groups = ctx.Attr<int>("groups");
bool is_conv3d = strides.size() == 3U;
const T* input_data = input->data<T>(); const T* input_data = input->data<T>();
const T* filter_data = filter->data<T>(); const T* filter_data = filter->data<T>();
const T* output_grad_data = output_grad->data<T>(); const T* output_grad_data = output_grad->data<T>();
...@@ -358,8 +401,14 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -358,8 +401,14 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims()); std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> weights_tz = std::vector<int> weights_tz =
paddle::framework::vectorize2int(filter->dims()); paddle::framework::vectorize2int(filter->dims());
int g = std::max(groups, 1);
GetWeightsTz(weights_tz, g, is_conv3d);
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims()); std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
auto src_format = input->format();
mkldnn::memory::format weights_format =
GetWeightsFormat(filter->format(), g, is_conv3d);
// Get an unique name from "argument" name of "Output" variable // Get an unique name from "argument" name of "Output" variable
// as well as attributes of primitive to be created // as well as attributes of primitive to be created
// This name will be used as key when saving info into device context // This name will be used as key when saving info into device context
...@@ -372,9 +421,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -372,9 +421,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
// Create user memory descriptors // Create user memory descriptors
auto user_src_md = platform::MKLDNNMemDesc( auto user_src_md = platform::MKLDNNMemDesc(
{src_tz}, platform::MKLDNNGetDataType<T>(), input->format()); {src_tz}, platform::MKLDNNGetDataType<T>(), src_format);
auto user_weights_md = platform::MKLDNNMemDesc( auto user_weights_md = platform::MKLDNNMemDesc(
{weights_tz}, platform::MKLDNNGetDataType<T>(), filter->format()); {weights_tz}, platform::MKLDNNGetDataType<T>(), weights_format);
auto user_diff_dst_md = platform::MKLDNNMemDesc( auto user_diff_dst_md = platform::MKLDNNMemDesc(
{dst_tz}, platform::MKLDNNGetDataType<T>(), output_grad->format()); {dst_tz}, platform::MKLDNNGetDataType<T>(), output_grad->format());
...@@ -386,14 +435,20 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -386,14 +435,20 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
auto chosen_memory_format = auto chosen_memory_format =
platform::data_format_to_memory_format(data_format); platform::data_format_to_memory_format(data_format);
if (is_conv3d) {
chosen_memory_format =
platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format);
}
weights_format = GetWeightsFormat(chosen_memory_format, g, is_conv3d);
auto src_md = platform::MKLDNNMemDesc( auto src_md = platform::MKLDNNMemDesc(
src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format); src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
auto diff_src_md = platform::MKLDNNMemDesc( auto diff_src_md = platform::MKLDNNMemDesc(
src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format); src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
auto weights_md = platform::MKLDNNMemDesc( auto weights_md = platform::MKLDNNMemDesc(
weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format); weights_tz, platform::MKLDNNGetDataType<T>(), weights_format);
auto diff_weights_md = platform::MKLDNNMemDesc( auto diff_weights_md = platform::MKLDNNMemDesc(
weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format); weights_tz, platform::MKLDNNGetDataType<T>(), weights_format);
auto diff_dst_md = platform::MKLDNNMemDesc( auto diff_dst_md = platform::MKLDNNMemDesc(
dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format); dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
...@@ -491,8 +546,22 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> { ...@@ -491,8 +546,22 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_KERNEL(conv2d, MKLDNN, ::paddle::platform::CPUPlace, REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN,
ops::ConvMKLDNNOpKernel<float>); ::paddle::platform::CPUPlace, FP32,
ops::kConvMKLDNNFP32,
REGISTER_OP_KERNEL(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace, ops::ConvMKLDNNOpKernel<float>);
ops::ConvMKLDNNGradOpKernel<float>);
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN,
::paddle::platform::CPUPlace, FP32,
ops::kConvMKLDNNFP32,
ops::ConvMKLDNNGradOpKernel<float>);
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d, MKLDNN,
::paddle::platform::CPUPlace, FP32,
ops::kConvMKLDNNFP32,
ops::ConvMKLDNNOpKernel<float>);
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d_grad, MKLDNN,
::paddle::platform::CPUPlace, FP32,
ops::kConvMKLDNNFP32,
ops::ConvMKLDNNGradOpKernel<float>);
...@@ -74,6 +74,8 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -74,6 +74,8 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType ConvOp::GetExpectedKernelType( framework::OpKernelType ConvOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const { const framework::ExecutionContext& ctx) const {
int customized_type_value =
framework::OpKernelType::kDefaultCustomizedTypeValue;
framework::LibraryType library{framework::LibraryType::kPlain}; framework::LibraryType library{framework::LibraryType::kPlain};
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
std::string data_format = ctx.Attr<std::string>("data_format"); std::string data_format = ctx.Attr<std::string>("data_format");
...@@ -89,6 +91,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( ...@@ -89,6 +91,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
platform::CanMKLDNNBeUsed(ctx)) { platform::CanMKLDNNBeUsed(ctx)) {
library = framework::LibraryType::kMKLDNN; library = framework::LibraryType::kMKLDNN;
layout = framework::DataLayout::kMKLDNN; layout = framework::DataLayout::kMKLDNN;
customized_type_value = kConvMKLDNNFP32;
} }
#endif #endif
...@@ -105,7 +108,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( ...@@ -105,7 +108,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
} }
return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
library); library, customized_type_value);
} }
void Conv2DOpMaker::Make() { void Conv2DOpMaker::Make() {
...@@ -131,14 +134,14 @@ void Conv2DOpMaker::Make() { ...@@ -131,14 +134,14 @@ void Conv2DOpMaker::Make() {
"The format of output tensor is X (one-dimensional) of size equal" "The format of output tensor is X (one-dimensional) of size equal"
"to the number of output channels. Only used with MKL-DNN.") "to the number of output channels. Only used with MKL-DNN.")
.AsDispensable(); .AsDispensable();
AddOutput("Output",
"(Tensor) The output tensor of convolution operator. "
"The format of output tensor is also NCHW.");
AddInput("ResidualData", AddInput("ResidualData",
"(Tensor) Tensor with residual data " "(Tensor) Tensor with residual data "
"to which convolution output will be added." "to which convolution output will be added."
"Used with fuse_residual_connection fusion.") "Used with fuse_residual_connection fusion.")
.AsDispensable(); .AsDispensable();
AddOutput("Output",
"(Tensor) The output tensor of convolution operator. "
"The format of output tensor is also NCHW.");
AddAttr<std::vector<int>>("strides", AddAttr<std::vector<int>>("strides",
"(vector<int> default:{1, 1}), the " "(vector<int> default:{1, 1}), the "
"strides(h_stride, w_stride) of " "strides(h_stride, w_stride) of "
...@@ -229,6 +232,10 @@ $$ ...@@ -229,6 +232,10 @@ $$
} }
void Conv3DOpMaker::Make() { void Conv3DOpMaker::Make() {
AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
AddInput( AddInput(
"Input", "Input",
"(Tensor) The input tensor of convolution operator. " "(Tensor) The input tensor of convolution operator. "
...@@ -244,6 +251,11 @@ void Conv3DOpMaker::Make() { ...@@ -244,6 +251,11 @@ void Conv3DOpMaker::Make() {
"is the width of the filter." "is the width of the filter."
"If the groups attribute is greater than 1, C equals the number of " "If the groups attribute is greater than 1, C equals the number of "
"input image channels divided by the groups."); "input image channels divided by the groups.");
AddInput("ResidualData",
"(Tensor) Tensor with residual data "
"to which convolution output will be added."
"Used with fuse_residual_connection fusion.")
.AsDispensable();
AddOutput("Output", AddOutput("Output",
"(Tensor) The output tensor of convolution operator." "(Tensor) The output tensor of convolution operator."
"The format of output tensor is also NCDHW."); "The format of output tensor is also NCDHW.");
...@@ -277,6 +289,13 @@ void Conv3DOpMaker::Make() { ...@@ -277,6 +289,13 @@ void Conv3DOpMaker::Make() {
AddAttr<bool>("use_mkldnn", AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel") "(bool, default false) Only used in mkldnn kernel")
.SetDefault(false); .SetDefault(false);
AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<bool>("fuse_residual_connection",
"(bool, default false) Only used in mkldnn kernel. Used "
"whenever convolution output is as an input to residual "
"connection.")
.SetDefault(false);
AddAttr<std::string>( AddAttr<std::string>(
"data_format", "data_format",
"(string, default NCHW) Only used in " "(string, default NCHW) Only used in "
...@@ -342,6 +361,8 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const { ...@@ -342,6 +361,8 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType ConvOpGrad::GetExpectedKernelType( framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const { const framework::ExecutionContext& ctx) const {
int customized_type_value =
framework::OpKernelType::kDefaultCustomizedTypeValue;
framework::LibraryType library_{framework::LibraryType::kPlain}; framework::LibraryType library_{framework::LibraryType::kPlain};
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready // TODO(pzelazko-intel): enable MKLDNN layout when it's ready
std::string data_format = ctx.Attr<std::string>("data_format"); std::string data_format = ctx.Attr<std::string>("data_format");
...@@ -357,12 +378,13 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( ...@@ -357,12 +378,13 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
platform::CanMKLDNNBeUsed(ctx)) { platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN; library_ = framework::LibraryType::kMKLDNN;
layout_ = framework::DataLayout::kMKLDNN; layout_ = framework::DataLayout::kMKLDNN;
customized_type_value = kConvMKLDNNFP32;
} }
#endif #endif
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(), framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
layout_, library_); layout_, library_, customized_type_value);
} }
} // namespace operators } // namespace operators
......
...@@ -27,6 +27,8 @@ namespace paddle { ...@@ -27,6 +27,8 @@ namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor; using Tensor = framework::Tensor;
constexpr int kConvMKLDNNFP32 = 1;
constexpr int kConvMKLDNNINT8 = 2;
// Base convolution operator definations for other conv // Base convolution operator definations for other conv
// like operators to reuse the implementation. // like operators to reuse the implementation.
......
...@@ -177,11 +177,19 @@ struct CudnnRNNCache { ...@@ -177,11 +177,19 @@ struct CudnnRNNCache {
seed_)); seed_));
CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_)); CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_));
#if CUDNN_VERSION >= 6000
CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6( CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6(
handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_,
CUDNN_LINEAR_INPUT, CUDNN_LINEAR_INPUT,
is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT)); CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT));
#else
CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor(
rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT,
is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
CUDNN_DATA_FLOAT));
#endif
CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_)); CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_)); CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_));
......
...@@ -60,15 +60,37 @@ template <typename DeviceContext, typename T> ...@@ -60,15 +60,37 @@ template <typename DeviceContext, typename T>
class ElementwiseMulKernel : public framework::OpKernel<T> { class ElementwiseMulKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<framework::LoDTensor>("X"); auto x_var = ctx.InputVar("X");
PADDLE_ENFORCE(x_var != nullptr,
"Cannot get input Variable X, variable name = %s",
ctx.op().Input("X"));
auto* y = ctx.Input<framework::LoDTensor>("Y"); auto* y = ctx.Input<framework::LoDTensor>("Y");
auto* z = ctx.Output<framework::LoDTensor>("Out");
framework::Tensor x, *z;
if (x_var->IsType<framework::SelectedRows>()) {
PADDLE_ENFORCE(y->dims().size() == 1 && y->dims()[0] == 1,
"For elementwise_op, if X is Sparse, Y must be scalar.");
auto& x_sele = x_var->Get<framework::SelectedRows>();
auto out_sele = ctx.Output<framework::SelectedRows>("Out");
x = x_sele.value();
out_sele->set_rows(x_sele.rows());
out_sele->set_height(x_sele.height());
out_sele->mutable_value()->Resize(x_sele.value().dims());
out_sele->mutable_value()->mutable_data(ctx.GetPlace(), x.type());
z = ctx.Output<framework::SelectedRows>("Out")->mutable_value();
} else if (x_var->IsType<framework::LoDTensor>()) {
x = x_var->Get<framework::LoDTensor>();
z = ctx.Output<framework::LoDTensor>("Out");
} else {
PADDLE_THROW("X's type[%s] is not supported by elementwise_op.",
x_var->Type().name());
}
z->mutable_data<T>(ctx.GetPlace()); z->mutable_data<T>(ctx.GetPlace());
if (x->numel() == y->numel()) { if (x.numel() == y->numel()) {
elementwise_mul<DeviceContext, T>(ctx, x, y, z); elementwise_mul<DeviceContext, T>(ctx, &x, y, z);
} else { } else {
default_elementwise_mul<DeviceContext, T>(ctx, x, y, z); default_elementwise_mul<DeviceContext, T>(ctx, &x, y, z);
} }
} }
}; };
......
...@@ -40,21 +40,28 @@ class ElementwiseOp : public framework::OperatorWithKernel { ...@@ -40,21 +40,28 @@ class ElementwiseOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasOutput("Out"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of elementwise op should not be null."); "Output(Out) of elementwise op should not be null.");
PADDLE_ENFORCE(
ctx->GetInputsVarType("X").front() ==
framework::proto::VarType::LOD_TENSOR,
"The input var's type should be LoDTensor, but the received is %s",
ctx->Inputs("X").front(), ctx->GetInputsVarType("X").front());
PADDLE_ENFORCE( PADDLE_ENFORCE(
ctx->GetInputsVarType("Y").front() == ctx->GetInputsVarType("Y").front() ==
framework::proto::VarType::LOD_TENSOR, framework::proto::VarType::LOD_TENSOR,
"The input var's type should be LoDTensor, but the received is %s", "The input var's type should be LoDTensor, but the received is %s [%s]",
ctx->Inputs("Y").front(), ctx->GetInputsVarType("Y").front()); ctx->GetInputsVarType("Y").front(), ctx->Inputs("Y").front());
auto x_dim = ctx->GetInputDim("X"); if (ctx->GetInputsVarType("X").front() ==
auto y_dim = ctx->GetInputDim("Y"); framework::proto::VarType::LOD_TENSOR) {
PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), auto x_dim = ctx->GetInputDim("X");
"Rank of first input must >= rank of second input."); auto y_dim = ctx->GetInputDim("Y");
PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
"Rank of first input must >= rank of second input.");
} else if (ctx->GetInputsVarType("X").front() ==
framework::proto::VarType::SELECTED_ROWS) {
PADDLE_ENFORCE((ctx->GetInputDim("Y").size() == 1u) &&
(ctx->GetInputDim("Y")[0] == 1),
"For elementwise_op, if X is Sparse, "
"Y must be scalar.");
} else {
PADDLE_THROW("X's type[%s] is not supported by elementwise_op.",
ctx->GetInputsVarType("X").front());
}
ctx->ShareDim("X", /*->*/ "Out"); ctx->ShareDim("X", /*->*/ "Out");
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
......
...@@ -217,13 +217,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> { ...@@ -217,13 +217,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
auto& act_gate_str = ctx.Attr<std::string>("gate_activation"); \ auto& act_gate_str = ctx.Attr<std::string>("gate_activation"); \
auto& act_cell_str = ctx.Attr<std::string>("cell_activation"); \ auto& act_cell_str = ctx.Attr<std::string>("cell_activation"); \
auto& act_cand_str = ctx.Attr<std::string>("candidate_activation"); \ auto& act_cand_str = ctx.Attr<std::string>("candidate_activation"); \
if (platform::jit::MayIUse(platform::jit::avx)) { \ if (platform::MayIUse(platform::avx)) { \
math::VecActivations<T, platform::jit::avx> act_functor; \ math::VecActivations<T, platform::avx> act_functor; \
act_gate = act_functor(act_gate_str); \ act_gate = act_functor(act_gate_str); \
act_cell = act_functor(act_cell_str); \ act_cell = act_functor(act_cell_str); \
act_cand = act_functor(act_cand_str); \ act_cand = act_functor(act_cand_str); \
} else { \ } else { \
math::VecActivations<T, platform::jit::isa_any> act_functor; \ math::VecActivations<T, platform::isa_any> act_functor; \
act_gate = act_functor(act_gate_str); \ act_gate = act_functor(act_gate_str); \
act_cell = act_functor(act_cell_str); \ act_cell = act_functor(act_cell_str); \
act_cand = act_functor(act_cand_str); \ act_cand = act_functor(act_cand_str); \
......
...@@ -151,11 +151,11 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel<T> { ...@@ -151,11 +151,11 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel<T> {
std::function<void(const int, const T*, T*)> fc_act; std::function<void(const int, const T*, T*)> fc_act;
auto& fc_act_str = ctx.Attr<std::string>("fc_activation"); auto& fc_act_str = ctx.Attr<std::string>("fc_activation");
if (platform::jit::MayIUse(platform::jit::avx)) { if (platform::MayIUse(platform::avx)) {
math::VecActivations<T, platform::jit::avx> act_functor; math::VecActivations<T, platform::avx> act_functor;
fc_act = act_functor(fc_act_str); fc_act = act_functor(fc_act_str);
} else { } else {
math::VecActivations<T, platform::jit::isa_any> act_functor; math::VecActivations<T, platform::isa_any> act_functor;
fc_act = act_functor(fc_act_str); fc_act = act_functor(fc_act_str);
} }
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
namespace paddle {
namespace operators {
class GetTensorFromSelectedRowsOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"GetTensorFromSelectedRowsOp must has input X.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"GetTensorFromSelectedRowsOp must has output Out.");
PADDLE_ENFORCE(
ctx->GetInputsVarType("X").front() ==
framework::proto::VarType::SELECTED_ROWS,
"The input X's type should be SelectedRows, but the received is %s",
ctx->Inputs("X").front(), ctx->GetInputsVarType("X").front());
PADDLE_ENFORCE(
ctx->GetOutputsVarType("Out").front() ==
framework::proto::VarType::LOD_TENSOR,
"The output Out's type should be LoDTensor, but the received is %s",
ctx->Outputs("Out").front(), ctx->GetOutputsVarType("Out").front());
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::GetDataTypeOfVar(ctx.InputVar("X")), ctx.device_context());
}
};
class GetTensorFromSelectedRowsKernel {
public:
void operator()(const framework::ExecutionContext &ctx) const {
auto *x = ctx.Input<framework::SelectedRows>("X");
auto *out = ctx.Output<framework::LoDTensor>("Out");
out->Resize(x->value().dims());
out->mutable_data(ctx.GetPlace(), x->value().type());
framework::TensorCopy(x->value(), ctx.GetPlace(), ctx.device_context(),
out);
}
};
class GetTensorFromSelectedRowsOpProtoMaker
: public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "The input type is SelectedRows.");
AddOutput("Out", "The output type is LoDTensor.");
AddComment(
R"DOC(
GetTensorFromSelectedRows Operator
GetTensorFromSelectedRows is used to get the tensor from SelectedRows.
)DOC");
}
};
class GetTensorFromSelectedRowsOpVarTypeInference
: public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const final {
auto out_var_name = op_desc.Output("Out").front();
auto in_var_name = op_desc.Input("X").front();
auto out_var = block->FindRecursiveOrCreateVar(out_var_name);
auto in_var = block->FindRecursiveOrCreateVar(in_var_name);
out_var.SetType(framework::proto::VarType::LOD_TENSOR);
out_var.SetDataType(in_var.GetDataType());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(get_tensor_from_selected_rows,
ops::GetTensorFromSelectedRowsOp,
ops::GetTensorFromSelectedRowsOpProtoMaker,
ops::GetTensorFromSelectedRowsOpVarTypeInference);
REGISTER_OP_CPU_KERNEL_FUNCTOR(get_tensor_from_selected_rows, float,
ops::GetTensorFromSelectedRowsKernel, double,
ops::GetTensorFromSelectedRowsKernel, int,
ops::GetTensorFromSelectedRowsKernel, int64_t,
ops::GetTensorFromSelectedRowsKernel);
#ifdef PADDLE_WITH_CUDA
REGISTER_OP_CUDA_KERNEL_FUNCTOR(get_tensor_from_selected_rows, float,
ops::GetTensorFromSelectedRowsKernel, double,
ops::GetTensorFromSelectedRowsKernel, int,
ops::GetTensorFromSelectedRowsKernel, int64_t,
ops::GetTensorFromSelectedRowsKernel);
#endif
...@@ -150,14 +150,14 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel { ...@@ -150,14 +150,14 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel {
"Output(W@Grad should not be null."); "Output(W@Grad should not be null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
"Output(X@Grad should not be null."); "Output(X@Grad should not be null.");
if (!ctx->Attrs().Get<bool>("is_sparse")) {
if (ctx->HasOutput(framework::GradVarName("Bias"))) { if (ctx->HasOutput(framework::GradVarName("Bias"))) {
ctx->SetOutputDim(framework::GradVarName("Bias"), ctx->SetOutputDim(framework::GradVarName("Bias"),
ctx->GetInputDim("Bias")); ctx->GetInputDim("Bias"));
}
ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W"));
} }
ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W"));
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
} }
protected: protected:
......
...@@ -185,7 +185,6 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> { ...@@ -185,7 +185,6 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
ctx.Output<framework::SelectedRows>(framework::GradVarName("W")); ctx.Output<framework::SelectedRows>(framework::GradVarName("W"));
w_grad->set_rows(real_rows); w_grad->set_rows(real_rows);
// Build a map of id -> row_index to speed up finding the index of one id // Build a map of id -> row_index to speed up finding the index of one id
w_grad->SyncIndex();
w_grad->set_height(w.dims()[0]); w_grad->set_height(w.dims()[0]);
auto* w_grad_value = w_grad->mutable_value(); auto* w_grad_value = w_grad->mutable_value();
framework::DDim temp_dim(w.dims()); framework::DDim temp_dim(w.dims());
......
...@@ -32,16 +32,26 @@ class LoadCombineOp : public framework::OperatorBase { ...@@ -32,16 +32,26 @@ class LoadCombineOp : public framework::OperatorBase {
const platform::Place &place) const override { const platform::Place &place) const override {
auto filename = Attr<std::string>("file_path"); auto filename = Attr<std::string>("file_path");
auto load_as_fp16 = Attr<bool>("load_as_fp16"); auto load_as_fp16 = Attr<bool>("load_as_fp16");
auto model_from_memory = Attr<bool>("model_from_memory");
std::ifstream fin(filename);
PADDLE_ENFORCE(static_cast<bool>(fin),
"Cannot open file %s for load_combine op", filename);
auto out_var_names = Outputs("Out"); auto out_var_names = Outputs("Out");
PADDLE_ENFORCE_GT( PADDLE_ENFORCE_GT(
static_cast<int>(out_var_names.size()), 0, static_cast<int>(out_var_names.size()), 0,
"The number of output variables should be greater than 0."); "The number of output variables should be greater than 0.");
if (!model_from_memory) {
std::ifstream fin(filename);
PADDLE_ENFORCE(static_cast<bool>(fin),
"Cannot open file %s for load_combine op", filename);
LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names);
} else {
PADDLE_ENFORCE(!filename.empty(), "Cannot load file from memory");
std::stringstream fin(filename);
LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names);
}
}
void LoadParamsFromBuffer(
const framework::Scope &scope, const platform::Place &place,
std::istream *buffer, bool load_as_fp16,
const std::vector<std::string> &out_var_names) const {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place); auto &dev_ctx = *pool.Get(place);
...@@ -54,11 +64,10 @@ class LoadCombineOp : public framework::OperatorBase { ...@@ -54,11 +64,10 @@ class LoadCombineOp : public framework::OperatorBase {
auto *tensor = out_var->GetMutable<framework::LoDTensor>(); auto *tensor = out_var->GetMutable<framework::LoDTensor>();
// Error checking // Error checking
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot read more from file %s", PADDLE_ENFORCE(static_cast<bool>(buffer), "Cannot read more");
filename);
// Get data from fin to tensor // Get data from fin to tensor
DeserializeFromStream(fin, tensor, dev_ctx); DeserializeFromStream(*buffer, tensor, dev_ctx);
auto in_dtype = framework::ToDataType(tensor->type()); auto in_dtype = framework::ToDataType(tensor->type());
auto out_dtype = auto out_dtype =
...@@ -103,11 +112,17 @@ class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker { ...@@ -103,11 +112,17 @@ class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
"LoDTensors will be loaded from \"file_path\".") "LoDTensors will be loaded from \"file_path\".")
.AddCustomChecker( .AddCustomChecker(
[](const std::string &path) { return !path.empty(); }); [](const std::string &path) { return !path.empty(); });
AddAttr<bool>("model_from_memory",
"(boolean, default false)"
"If true, file_path is in memory, and LoDTensors will be "
"loaded directly from memory")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
LoadCombine Operator. LoadCombine Operator.
LoadCombine operator loads LoDTensor variables from a file. The file should LoadCombine operator loads LoDTensor variables from a file, which could be
contain one or more LoDTensors serialized using the SaveCombine operator. The loaded in memory already. The file should contain one or more LoDTensors
serialized using the SaveCombine operator. The
LoadCombine operator applies a deserialization strategy to appropriately load LoadCombine operator applies a deserialization strategy to appropriately load
the LodTensors, and this strategy complements the serialization strategy used the LodTensors, and this strategy complements the serialization strategy used
in the SaveCombine operator. Hence, the LoadCombine operator is tightly coupled in the SaveCombine operator. Hence, the LoadCombine operator is tightly coupled
......
...@@ -59,6 +59,7 @@ math_library(matrix_bit_code) ...@@ -59,6 +59,7 @@ math_library(matrix_bit_code)
math_library(unpooling) math_library(unpooling)
math_library(vol2col) math_library(vol2col)
math_library(prelu)
cc_test(math_function_test SRCS math_function_test.cc DEPS math_function) cc_test(math_function_test SRCS math_function_test.cc DEPS math_function)
cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor)
......
...@@ -77,7 +77,7 @@ inline void vec_scal<double>(const int n, const double a, double* x) { ...@@ -77,7 +77,7 @@ inline void vec_scal<double>(const int n, const double a, double* x) {
#endif #endif
// MKL scal only support inplace, choose this if src and dst are not equal // MKL scal only support inplace, choose this if src and dst are not equal
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any> template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_scal(const int n, const T a, const T* x, T* y) { inline void vec_scal(const int n, const T a, const T* x, T* y) {
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
y[i] = a * x[i]; y[i] = a * x[i];
...@@ -85,12 +85,12 @@ inline void vec_scal(const int n, const T a, const T* x, T* y) { ...@@ -85,12 +85,12 @@ inline void vec_scal(const int n, const T a, const T* x, T* y) {
} }
template <> template <>
inline void vec_scal<float, platform::jit::avx>(const int n, const float a, inline void vec_scal<float, platform::avx>(const int n, const float a,
const float* x, float* y) { const float* x, float* y) {
#ifdef __AVX__ #ifdef __AVX__
constexpr int block = YMM_FLOAT_BLOCK; constexpr int block = YMM_FLOAT_BLOCK;
if (n < block) { if (n < block) {
vec_scal<float, platform::jit::isa_any>(n, a, x, y); vec_scal<float, platform::isa_any>(n, a, x, y);
return; return;
} }
const int rest = n % block; const int rest = n % block;
...@@ -114,24 +114,24 @@ inline void vec_scal<float, platform::jit::avx>(const int n, const float a, ...@@ -114,24 +114,24 @@ inline void vec_scal<float, platform::jit::avx>(const int n, const float a,
y[i] = a * x[i]; y[i] = a * x[i];
} }
#else #else
vec_scal<float, platform::jit::isa_any>(n, a, x, y); vec_scal<float, platform::isa_any>(n, a, x, y);
#endif #endif
} }
template <> template <>
inline void vec_scal<float, platform::jit::avx2>(const int n, const float a, inline void vec_scal<float, platform::avx2>(const int n, const float a,
const float* x, float* y) { const float* x, float* y) {
vec_scal<float, platform::jit::avx>(n, a, x, y); vec_scal<float, platform::avx>(n, a, x, y);
} }
template <> template <>
inline void vec_scal<float, platform::jit::avx512f>(const int n, const float a, inline void vec_scal<float, platform::avx512f>(const int n, const float a,
const float* x, float* y) { const float* x, float* y) {
// TODO(TJ): enable me // TODO(TJ): enable me
vec_scal<float, platform::jit::avx2>(n, a, x, y); vec_scal<float, platform::avx2>(n, a, x, y);
} }
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any> template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_bias_sub(const int n, const T a, const T* x, T* y) { inline void vec_bias_sub(const int n, const T a, const T* x, T* y) {
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
y[i] = a - x[i]; y[i] = a - x[i];
...@@ -139,12 +139,12 @@ inline void vec_bias_sub(const int n, const T a, const T* x, T* y) { ...@@ -139,12 +139,12 @@ inline void vec_bias_sub(const int n, const T a, const T* x, T* y) {
} }
template <> template <>
inline void vec_bias_sub<float, platform::jit::avx>(const int n, const float a, inline void vec_bias_sub<float, platform::avx>(const int n, const float a,
const float* x, float* y) { const float* x, float* y) {
#ifdef __AVX__ #ifdef __AVX__
constexpr int block = YMM_FLOAT_BLOCK; constexpr int block = YMM_FLOAT_BLOCK;
if (n < block) { if (n < block) {
vec_bias_sub<float, platform::jit::isa_any>(n, a, x, y); vec_bias_sub<float, platform::isa_any>(n, a, x, y);
return; return;
} }
const int rest = n % block; const int rest = n % block;
...@@ -168,27 +168,25 @@ inline void vec_bias_sub<float, platform::jit::avx>(const int n, const float a, ...@@ -168,27 +168,25 @@ inline void vec_bias_sub<float, platform::jit::avx>(const int n, const float a,
y[i] = a - x[i]; y[i] = a - x[i];
} }
#else #else
vec_bias_sub<float, platform::jit::isa_any>(n, a, x, y); vec_bias_sub<float, platform::isa_any>(n, a, x, y);
#endif #endif
} }
template <> template <>
inline void vec_bias_sub<float, platform::jit::avx2>(const int n, const float a, inline void vec_bias_sub<float, platform::avx2>(const int n, const float a,
const float* x, float* y) { const float* x, float* y) {
vec_bias_sub<float, platform::jit::avx>(n, a, x, y); vec_bias_sub<float, platform::avx>(n, a, x, y);
} }
template <> template <>
inline void vec_bias_sub<float, platform::jit::avx512f>(const int n, inline void vec_bias_sub<float, platform::avx512f>(const int n, const float a,
const float a, const float* x, float* y) {
const float* x,
float* y) {
// TODO(TJ): enable me // TODO(TJ): enable me
vec_bias_sub<float, platform::jit::avx2>(n, a, x, y); vec_bias_sub<float, platform::avx2>(n, a, x, y);
} }
// out = x*y + (1-x)*z // out = x*y + (1-x)*z
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any> template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) { inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) {
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
out[i] = x[i] * y[i] + (static_cast<T>(1) - x[i]) * z[i]; out[i] = x[i] * y[i] + (static_cast<T>(1) - x[i]) * z[i];
...@@ -196,13 +194,13 @@ inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) { ...@@ -196,13 +194,13 @@ inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) {
} }
template <> template <>
inline void vec_cross<float, platform::jit::avx>(const int n, const float* x, inline void vec_cross<float, platform::avx>(const int n, const float* x,
const float* y, const float* z, const float* y, const float* z,
float* out) { float* out) {
#ifdef __AVX__ #ifdef __AVX__
constexpr int block = YMM_FLOAT_BLOCK; constexpr int block = YMM_FLOAT_BLOCK;
if (n < block) { if (n < block) {
vec_cross<float, platform::jit::isa_any>(n, x, y, z, out); vec_cross<float, platform::isa_any>(n, x, y, z, out);
return; return;
} }
const int rest = n % block; const int rest = n % block;
...@@ -228,25 +226,26 @@ inline void vec_cross<float, platform::jit::avx>(const int n, const float* x, ...@@ -228,25 +226,26 @@ inline void vec_cross<float, platform::jit::avx>(const int n, const float* x,
out[i] = x[i] * y[i] + (1.f - x[i]) * z[i]; out[i] = x[i] * y[i] + (1.f - x[i]) * z[i];
} }
#else #else
vec_cross<float, platform::jit::isa_any>(n, x, y, z, out); vec_cross<float, platform::isa_any>(n, x, y, z, out);
#endif #endif
} }
template <> template <>
inline void vec_cross<float, platform::jit::avx2>(const int n, const float* x, inline void vec_cross<float, platform::avx2>(const int n, const float* x,
const float* y, const float* y, const float* z,
const float* z, float* out) { float* out) {
vec_cross<float, platform::jit::avx>(n, x, y, z, out); vec_cross<float, platform::avx>(n, x, y, z, out);
} }
template <> template <>
inline void vec_cross<float, platform::jit::avx512f>( inline void vec_cross<float, platform::avx512f>(const int n, const float* x,
const int n, const float* x, const float* y, const float* z, float* out) { const float* y, const float* z,
float* out) {
// TODO(TJ): enable me // TODO(TJ): enable me
vec_cross<float, platform::jit::avx>(n, x, y, z, out); vec_cross<float, platform::avx>(n, x, y, z, out);
} }
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any> template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_add_bias(const int n, const T a, const T* x, T* y) { inline void vec_add_bias(const int n, const T a, const T* x, T* y) {
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
y[i] = x[i] + a; y[i] = x[i] + a;
...@@ -254,12 +253,12 @@ inline void vec_add_bias(const int n, const T a, const T* x, T* y) { ...@@ -254,12 +253,12 @@ inline void vec_add_bias(const int n, const T a, const T* x, T* y) {
} }
template <> template <>
inline void vec_add_bias<float, platform::jit::avx>(const int n, const float a, inline void vec_add_bias<float, platform::avx>(const int n, const float a,
const float* x, float* y) { const float* x, float* y) {
#ifdef __AVX__ #ifdef __AVX__
constexpr int block = YMM_FLOAT_BLOCK; constexpr int block = YMM_FLOAT_BLOCK;
if (n < block) { if (n < block) {
vec_add_bias<float, platform::jit::isa_any>(n, a, x, y); vec_add_bias<float, platform::isa_any>(n, a, x, y);
return; return;
} }
const int rest = n % block; const int rest = n % block;
...@@ -283,32 +282,30 @@ inline void vec_add_bias<float, platform::jit::avx>(const int n, const float a, ...@@ -283,32 +282,30 @@ inline void vec_add_bias<float, platform::jit::avx>(const int n, const float a,
y[i] = x[i] + a; y[i] = x[i] + a;
} }
#else #else
vec_add_bias<float, platform::jit::isa_any>(n, a, x, y); vec_add_bias<float, platform::isa_any>(n, a, x, y);
#endif #endif
} }
template <> template <>
inline void vec_add_bias<float, platform::jit::avx2>(const int n, const float a, inline void vec_add_bias<float, platform::avx2>(const int n, const float a,
const float* x, float* y) { const float* x, float* y) {
vec_add_bias<float, platform::jit::avx>(n, a, x, y); vec_add_bias<float, platform::avx>(n, a, x, y);
} }
template <> template <>
inline void vec_add_bias<float, platform::jit::avx512f>(const int n, inline void vec_add_bias<float, platform::avx512f>(const int n, const float a,
const float a, const float* x, float* y) {
const float* x,
float* y) {
// TODO(TJ): enable me // TODO(TJ): enable me
vec_add_bias<float, platform::jit::avx2>(n, a, x, y); vec_add_bias<float, platform::avx2>(n, a, x, y);
} }
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any> template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_identity(const int n, const T* x, T* y) { inline void vec_identity(const int n, const T* x, T* y) {
// do nothing // do nothing
return; return;
} }
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any> template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_sigmoid(const int n, const T* x, T* y) { inline void vec_sigmoid(const int n, const T* x, T* y) {
const T min = SIGMOID_THRESHOLD_MIN; const T min = SIGMOID_THRESHOLD_MIN;
const T max = SIGMOID_THRESHOLD_MAX; const T max = SIGMOID_THRESHOLD_MAX;
...@@ -323,12 +320,12 @@ inline void vec_sigmoid(const int n, const T* x, T* y) { ...@@ -323,12 +320,12 @@ inline void vec_sigmoid(const int n, const T* x, T* y) {
} }
template <> template <>
inline void vec_sigmoid<float, platform::jit::avx>(const int n, const float* x, inline void vec_sigmoid<float, platform::avx>(const int n, const float* x,
float* y) { float* y) {
#ifdef __AVX__ #ifdef __AVX__
constexpr int block = YMM_FLOAT_BLOCK; constexpr int block = YMM_FLOAT_BLOCK;
if (n < block) { if (n < block) {
vec_sigmoid<float, platform::jit::isa_any>(n, x, y); vec_sigmoid<float, platform::isa_any>(n, x, y);
return; return;
} }
const int rest = n % block; const int rest = n % block;
...@@ -377,25 +374,24 @@ inline void vec_sigmoid<float, platform::jit::avx>(const int n, const float* x, ...@@ -377,25 +374,24 @@ inline void vec_sigmoid<float, platform::jit::avx>(const int n, const float* x,
y[i] = 1.f / (1.f + y[i]); y[i] = 1.f / (1.f + y[i]);
} }
#else #else
vec_sigmoid<float, platform::jit::isa_any>(n, x, y); vec_sigmoid<float, platform::isa_any>(n, x, y);
#endif #endif
} }
template <> template <>
inline void vec_sigmoid<float, platform::jit::avx2>(const int n, const float* x, inline void vec_sigmoid<float, platform::avx2>(const int n, const float* x,
float* y) { float* y) {
vec_sigmoid<float, platform::jit::avx>(n, x, y); vec_sigmoid<float, platform::avx>(n, x, y);
} }
template <> template <>
inline void vec_sigmoid<float, platform::jit::avx512f>(const int n, inline void vec_sigmoid<float, platform::avx512f>(const int n, const float* x,
const float* x, float* y) {
float* y) {
// TODO(TJ): enable me // TODO(TJ): enable me
vec_sigmoid<float, platform::jit::avx2>(n, x, y); vec_sigmoid<float, platform::avx2>(n, x, y);
} }
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any> template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_tanh(const int n, const T* x, T* y) { inline void vec_tanh(const int n, const T* x, T* y) {
vec_scal<T, isa>(n, static_cast<T>(2), x, y); vec_scal<T, isa>(n, static_cast<T>(2), x, y);
vec_sigmoid<T, isa>(n, y, y); vec_sigmoid<T, isa>(n, y, y);
...@@ -404,7 +400,7 @@ inline void vec_tanh(const int n, const T* x, T* y) { ...@@ -404,7 +400,7 @@ inline void vec_tanh(const int n, const T* x, T* y) {
} }
// TODO(TJ): make relu clip // TODO(TJ): make relu clip
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any> template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_relu(const int n, const T* x, T* y) { inline void vec_relu(const int n, const T* x, T* y) {
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
y[i] = x[i] > 0 ? x[i] : 0; y[i] = x[i] > 0 ? x[i] : 0;
...@@ -412,12 +408,12 @@ inline void vec_relu(const int n, const T* x, T* y) { ...@@ -412,12 +408,12 @@ inline void vec_relu(const int n, const T* x, T* y) {
} }
template <> template <>
inline void vec_relu<float, platform::jit::avx>(const int n, const float* x, inline void vec_relu<float, platform::avx>(const int n, const float* x,
float* y) { float* y) {
#ifdef __AVX__ #ifdef __AVX__
constexpr int block = YMM_FLOAT_BLOCK; constexpr int block = YMM_FLOAT_BLOCK;
if (n < block * 4) { if (n < block * 4) {
vec_relu<float, platform::jit::isa_any>(n, x, y); vec_relu<float, platform::isa_any>(n, x, y);
return; return;
} }
...@@ -441,26 +437,26 @@ inline void vec_relu<float, platform::jit::avx>(const int n, const float* x, ...@@ -441,26 +437,26 @@ inline void vec_relu<float, platform::jit::avx>(const int n, const float* x,
#undef MOVE_ONE_STEP #undef MOVE_ONE_STEP
#else #else
vec_relu<float, platform::jit::isa_any>(n, x, y); vec_relu<float, platform::isa_any>(n, x, y);
#endif #endif
} }
template <> template <>
inline void vec_relu<float, platform::jit::avx2>(const int n, const float* x, inline void vec_relu<float, platform::avx2>(const int n, const float* x,
float* y) { float* y) {
vec_relu<float, platform::jit::avx>(n, x, y); vec_relu<float, platform::avx>(n, x, y);
} }
template <> template <>
inline void vec_relu<float, platform::jit::avx512f>(const int n, const float* x, inline void vec_relu<float, platform::avx512f>(const int n, const float* x,
float* y) { float* y) {
// TODO(TJ): enable me // TODO(TJ): enable me
vec_relu<float, platform::jit::avx2>(n, x, y); vec_relu<float, platform::avx2>(n, x, y);
} }
// TODO(TJ): optimize double of sigmoid, tanh and relu if necessary // TODO(TJ): optimize double of sigmoid, tanh and relu if necessary
template <typename T, platform::jit::cpu_isa_t isa = platform::jit::isa_any> template <typename T, platform::cpu_isa_t isa = platform::isa_any>
class VecActivations { class VecActivations {
public: public:
std::function<void(const int, const T*, T*)> operator()( std::function<void(const int, const T*, T*)> operator()(
......
...@@ -104,38 +104,42 @@ void TestAndBench(const int n, std::function<void(const int, const T*, T*)> tgt, ...@@ -104,38 +104,42 @@ void TestAndBench(const int n, std::function<void(const int, const T*, T*)> tgt,
} }
TEST(CpuVecTest, sigmoid) { TEST(CpuVecTest, sigmoid) {
namespace jit = paddle::platform::jit; namespace platform = paddle::platform;
using namespace paddle::operators::math; // NOLINT using namespace paddle::operators::math; // NOLINT
for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
TestAndBench<float>(sz, vec_sigmoid<float>, ref_sigmoid<float>); TestAndBench<float>(sz, vec_sigmoid<float>, ref_sigmoid<float>);
TestAndBench<float>(sz, vec_sigmoid<float, jit::avx>, ref_sigmoid<float>); TestAndBench<float>(sz, vec_sigmoid<float, platform::avx>,
TestAndBench<float>(sz, vec_sigmoid<float, jit::avx2>, ref_sigmoid<float>); ref_sigmoid<float>);
TestAndBench<float>(sz, vec_sigmoid<float, jit::avx512f>, TestAndBench<float>(sz, vec_sigmoid<float, platform::avx2>,
ref_sigmoid<float>);
TestAndBench<float>(sz, vec_sigmoid<float, platform::avx512f>,
ref_sigmoid<float>); ref_sigmoid<float>);
} }
TestAndBench<double>(30, vec_sigmoid<double>, ref_sigmoid<double>); TestAndBench<double>(30, vec_sigmoid<double>, ref_sigmoid<double>);
} }
TEST(CpuVecTest, tanh) { TEST(CpuVecTest, tanh) {
namespace jit = paddle::platform::jit; namespace platform = paddle::platform;
using namespace paddle::operators::math; // NOLINT using namespace paddle::operators::math; // NOLINT
for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
TestAndBench<float>(sz, vec_tanh<float>, ref_tanh<float>); TestAndBench<float>(sz, vec_tanh<float>, ref_tanh<float>);
TestAndBench<float>(sz, vec_tanh<float, jit::avx>, ref_tanh<float>); TestAndBench<float>(sz, vec_tanh<float, platform::avx>, ref_tanh<float>);
TestAndBench<float>(sz, vec_tanh<float, jit::avx2>, ref_tanh<float>); TestAndBench<float>(sz, vec_tanh<float, platform::avx2>, ref_tanh<float>);
TestAndBench<float>(sz, vec_tanh<float, jit::avx512f>, ref_tanh<float>); TestAndBench<float>(sz, vec_tanh<float, platform::avx512f>,
ref_tanh<float>);
} }
TestAndBench<double>(30, vec_tanh<double>, ref_tanh<double>); TestAndBench<double>(30, vec_tanh<double>, ref_tanh<double>);
} }
TEST(CpuVecTest, relu) { TEST(CpuVecTest, relu) {
namespace jit = paddle::platform::jit; namespace platform = paddle::platform;
using namespace paddle::operators::math; // NOLINT using namespace paddle::operators::math; // NOLINT
for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
TestAndBench<float>(sz, vec_relu<float>, ref_relu<float>); TestAndBench<float>(sz, vec_relu<float>, ref_relu<float>);
TestAndBench<float>(sz, vec_relu<float, jit::avx>, ref_relu<float>); TestAndBench<float>(sz, vec_relu<float, platform::avx>, ref_relu<float>);
TestAndBench<float>(sz, vec_relu<float, jit::avx2>, ref_relu<float>); TestAndBench<float>(sz, vec_relu<float, platform::avx2>, ref_relu<float>);
TestAndBench<float>(sz, vec_relu<float, jit::avx512f>, ref_relu<float>); TestAndBench<float>(sz, vec_relu<float, platform::avx512f>,
ref_relu<float>);
} }
TestAndBench<double>(30, vec_relu<double>, ref_relu<double>); TestAndBench<double>(30, vec_relu<double>, ref_relu<double>);
} }
...@@ -162,38 +166,40 @@ void TestInplace(const int n, std::function<void(const int, const T*, T*)> tgt, ...@@ -162,38 +166,40 @@ void TestInplace(const int n, std::function<void(const int, const T*, T*)> tgt,
} }
TEST(CpuVecTest, inplace_sigmoid) { TEST(CpuVecTest, inplace_sigmoid) {
namespace jit = paddle::platform::jit; namespace platform = paddle::platform;
using namespace paddle::operators::math; // NOLINT using namespace paddle::operators::math; // NOLINT
for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
TestInplace<float>(sz, vec_sigmoid<float>, ref_sigmoid<float>); TestInplace<float>(sz, vec_sigmoid<float>, ref_sigmoid<float>);
TestInplace<float>(sz, vec_sigmoid<float, jit::avx>, ref_sigmoid<float>); TestInplace<float>(sz, vec_sigmoid<float, platform::avx>,
TestInplace<float>(sz, vec_sigmoid<float, jit::avx2>, ref_sigmoid<float>); ref_sigmoid<float>);
TestInplace<float>(sz, vec_sigmoid<float, jit::avx512f>, TestInplace<float>(sz, vec_sigmoid<float, platform::avx2>,
ref_sigmoid<float>);
TestInplace<float>(sz, vec_sigmoid<float, platform::avx512f>,
ref_sigmoid<float>); ref_sigmoid<float>);
} }
TestInplace<double>(30, vec_sigmoid<double>, ref_sigmoid<double>); TestInplace<double>(30, vec_sigmoid<double>, ref_sigmoid<double>);
} }
TEST(CpuVecTest, inplace_tanh) { TEST(CpuVecTest, inplace_tanh) {
namespace jit = paddle::platform::jit; namespace platform = paddle::platform;
using namespace paddle::operators::math; // NOLINT using namespace paddle::operators::math; // NOLINT
for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
TestInplace<float>(sz, vec_tanh<float>, ref_tanh<float>); TestInplace<float>(sz, vec_tanh<float>, ref_tanh<float>);
TestInplace<float>(sz, vec_tanh<float, jit::avx>, ref_tanh<float>); TestInplace<float>(sz, vec_tanh<float, platform::avx>, ref_tanh<float>);
TestInplace<float>(sz, vec_tanh<float, jit::avx2>, ref_tanh<float>); TestInplace<float>(sz, vec_tanh<float, platform::avx2>, ref_tanh<float>);
TestInplace<float>(sz, vec_tanh<float, jit::avx512f>, ref_tanh<float>); TestInplace<float>(sz, vec_tanh<float, platform::avx512f>, ref_tanh<float>);
} }
TestInplace<double>(30, vec_tanh<double>, ref_tanh<double>); TestInplace<double>(30, vec_tanh<double>, ref_tanh<double>);
} }
TEST(CpuVecTest, inplace_relu) { TEST(CpuVecTest, inplace_relu) {
namespace jit = paddle::platform::jit; namespace platform = paddle::platform;
using namespace paddle::operators::math; // NOLINT using namespace paddle::operators::math; // NOLINT
for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) {
TestInplace<float>(sz, vec_relu<float>, ref_relu<float>); TestInplace<float>(sz, vec_relu<float>, ref_relu<float>);
TestInplace<float>(sz, vec_relu<float, jit::avx>, ref_relu<float>); TestInplace<float>(sz, vec_relu<float, platform::avx>, ref_relu<float>);
TestInplace<float>(sz, vec_relu<float, jit::avx2>, ref_relu<float>); TestInplace<float>(sz, vec_relu<float, platform::avx2>, ref_relu<float>);
TestInplace<float>(sz, vec_relu<float, jit::avx512f>, ref_relu<float>); TestInplace<float>(sz, vec_relu<float, platform::avx512f>, ref_relu<float>);
} }
TestInplace<double>(30, vec_relu<double>, ref_relu<double>); TestInplace<double>(30, vec_relu<double>, ref_relu<double>);
} }
...@@ -22,7 +22,7 @@ namespace math { ...@@ -22,7 +22,7 @@ namespace math {
namespace jitkernel { namespace jitkernel {
namespace gen { namespace gen {
using namespace platform::jit; // NOLINT using namespace platform; // NOLINT
bool VXXJitCode::init(int d, int scalar_index) { bool VXXJitCode::init(int d, int scalar_index) {
// It's not necessary to use avx512 since it would slow down the frequency // It's not necessary to use avx512 since it would slow down the frequency
......
...@@ -179,7 +179,7 @@ class VActJitCode : public JitCode { ...@@ -179,7 +179,7 @@ class VActJitCode : public JitCode {
template <typename JMM> template <typename JMM>
void exp_jmm(JMM& dst, JMM& src, int src_idx = 11, int fx_idx = 12, // NOLINT void exp_jmm(JMM& dst, JMM& src, int src_idx = 11, int fx_idx = 12, // NOLINT
int fy_idx = 13, int mask_idx = 14, int tmp_idx = 15) { int fy_idx = 13, int mask_idx = 14, int tmp_idx = 15) {
using namespace platform::jit; // NOLINT using namespace platform; // NOLINT
// check all idx can not equal // check all idx can not equal
JMM jmm_src = JMM(src_idx); JMM jmm_src = JMM(src_idx);
JMM jmm_fx = JMM(fx_idx); JMM jmm_fx = JMM(fx_idx);
......
...@@ -36,7 +36,7 @@ void JitCode::preCode() { ...@@ -36,7 +36,7 @@ void JitCode::preCode() {
for (int i = 0; i < num_g_abi_regs; ++i) { for (int i = 0; i < num_g_abi_regs; ++i) {
push(Xbyak::Reg64(g_abi_regs[i])); push(Xbyak::Reg64(g_abi_regs[i]));
} }
if (platform::jit::MayIUse(platform::jit::avx512f)) { if (platform::MayIUse(platform::avx512f)) {
mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt); mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt);
} }
} }
......
...@@ -21,8 +21,6 @@ namespace operators { ...@@ -21,8 +21,6 @@ namespace operators {
namespace math { namespace math {
namespace jitkernel { namespace jitkernel {
namespace jit = platform::jit;
KernelPool& KernelPool::Instance() { KernelPool& KernelPool::Instance() {
static thread_local KernelPool g_jit_kernels; static thread_local KernelPool g_jit_kernels;
return g_jit_kernels; return g_jit_kernels;
......
...@@ -30,7 +30,6 @@ namespace paddle { ...@@ -30,7 +30,6 @@ namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
namespace jitkernel { namespace jitkernel {
namespace jit = platform::jit;
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
template <typename T> template <typename T>
...@@ -125,7 +124,7 @@ bool VMulKernelImpl<float>::useJIT(int d) { ...@@ -125,7 +124,7 @@ bool VMulKernelImpl<float>::useJIT(int d) {
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
template <> template <>
bool VMulKernelImpl<float>::useMKL(int d) { bool VMulKernelImpl<float>::useMKL(int d) {
return jit::MayIUse(jit::avx512f) && d > 512; return platform::MayIUse(platform::avx512f) && d > 512;
} }
template <> template <>
......
...@@ -25,10 +25,8 @@ namespace operators { ...@@ -25,10 +25,8 @@ namespace operators {
namespace math { namespace math {
namespace jitkernel { namespace jitkernel {
namespace jit = platform::jit;
/* CRF Decode JitKernel */ /* CRF Decode JitKernel */
template <typename T, platform::jit::cpu_isa_t isa, jit_block> template <typename T, platform::cpu_isa_t isa, jit_block>
class CRFDecodeKernelImpl : public CRFDecodeKernel<T> { class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
public: public:
explicit CRFDecodeKernelImpl(int tag_num) : CRFDecodeKernel<T>() { explicit CRFDecodeKernelImpl(int tag_num) : CRFDecodeKernel<T>() {
...@@ -101,7 +99,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> { ...@@ -101,7 +99,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
#define INTRIAVX_FLOAT(block) \ #define INTRIAVX_FLOAT(block) \
template <> \ template <> \
CRFDecodeKernelImpl<float, jit::avx, block>::CRFDecodeKernelImpl( \ CRFDecodeKernelImpl<float, platform::avx, block>::CRFDecodeKernelImpl( \
int tag_num) \ int tag_num) \
: CRFDecodeKernel<float>() { \ : CRFDecodeKernel<float>() { \
this->num_ = tag_num; \ this->num_ = tag_num; \
...@@ -109,7 +107,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> { ...@@ -109,7 +107,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
this->rest_ = this->num_ % YMM_FLOAT_BLOCK; \ this->rest_ = this->num_ % YMM_FLOAT_BLOCK; \
} \ } \
template <> \ template <> \
void CRFDecodeKernelImpl<float, jit::avx, block>::Compute( \ void CRFDecodeKernelImpl<float, platform::avx, block>::Compute( \
const int seq_len, const float* x, const float* w, float* alpha, \ const int seq_len, const float* x, const float* w, float* alpha, \
int* track) const { \ int* track) const { \
INIT_ALPHA(YMM_FLOAT_BLOCK) \ INIT_ALPHA(YMM_FLOAT_BLOCK) \
...@@ -204,7 +202,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> { ...@@ -204,7 +202,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
#define INTRIAVX512_FLOAT(block) \ #define INTRIAVX512_FLOAT(block) \
template <> \ template <> \
CRFDecodeKernelImpl<float, jit::avx512f, block>::CRFDecodeKernelImpl( \ CRFDecodeKernelImpl<float, platform::avx512f, block>::CRFDecodeKernelImpl( \
int tag_num) \ int tag_num) \
: CRFDecodeKernel<float>() { \ : CRFDecodeKernel<float>() { \
this->num_ = tag_num; \ this->num_ = tag_num; \
...@@ -212,7 +210,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> { ...@@ -212,7 +210,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
this->rest_ = this->num_ % ZMM_FLOAT_BLOCK; \ this->rest_ = this->num_ % ZMM_FLOAT_BLOCK; \
} \ } \
template <> \ template <> \
void CRFDecodeKernelImpl<float, jit::avx512f, block>::Compute( \ void CRFDecodeKernelImpl<float, platform::avx512f, block>::Compute( \
const int seq_len, const float* x, const float* w, float* alpha, \ const int seq_len, const float* x, const float* w, float* alpha, \
int* track) const { \ int* track) const { \
INIT_ALPHA(ZMM_FLOAT_BLOCK) \ INIT_ALPHA(ZMM_FLOAT_BLOCK) \
...@@ -270,14 +268,14 @@ INTRIAVX_FLOAT(kEQ16); ...@@ -270,14 +268,14 @@ INTRIAVX_FLOAT(kEQ16);
INTRIAVX_FLOAT(kGT16); INTRIAVX_FLOAT(kGT16);
#endif #endif
#ifdef __AVX2__ #ifdef __AVX2__
INTRIAVX2_FLOAT(jit::avx2, kEQ8); INTRIAVX2_FLOAT(platform::avx2, kEQ8);
INTRIAVX2_FLOAT(jit::avx2, kGT8LT16); INTRIAVX2_FLOAT(platform::avx2, kGT8LT16);
INTRIAVX2_FLOAT(jit::avx2, kEQ16); INTRIAVX2_FLOAT(platform::avx2, kEQ16);
INTRIAVX2_FLOAT(jit::avx2, kGT16); INTRIAVX2_FLOAT(platform::avx2, kGT16);
#endif #endif
#ifdef __AVX512F__ #ifdef __AVX512F__
INTRIAVX2_FLOAT(jit::avx512f, kEQ8); INTRIAVX2_FLOAT(platform::avx512f, kEQ8);
INTRIAVX2_FLOAT(jit::avx512f, kGT8LT16); INTRIAVX2_FLOAT(platform::avx512f, kGT8LT16);
INTRIAVX512_FLOAT(kEQ16); INTRIAVX512_FLOAT(kEQ16);
INTRIAVX512_FLOAT(kGT16); INTRIAVX512_FLOAT(kGT16);
#endif #endif
......
...@@ -29,7 +29,6 @@ namespace paddle { ...@@ -29,7 +29,6 @@ namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
namespace jitkernel { namespace jitkernel {
namespace jit = platform::jit;
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
// try to use MKL to speedup // try to use MKL to speedup
......
...@@ -22,10 +22,8 @@ namespace operators { ...@@ -22,10 +22,8 @@ namespace operators {
namespace math { namespace math {
namespace jitkernel { namespace jitkernel {
namespace jit = platform::jit;
/* Layer Norm JitKernel */ /* Layer Norm JitKernel */
template <typename T, platform::jit::cpu_isa_t isa, jit_block> template <typename T, platform::cpu_isa_t isa, jit_block>
class LayerNormKernelImpl : public LayerNormKernel<T> { class LayerNormKernelImpl : public LayerNormKernel<T> {
public: public:
explicit LayerNormKernelImpl(int right) : LayerNormKernel<T>() { explicit LayerNormKernelImpl(int right) : LayerNormKernel<T>() {
...@@ -90,7 +88,7 @@ class LayerNormKernelImpl : public LayerNormKernel<T> { ...@@ -90,7 +88,7 @@ class LayerNormKernelImpl : public LayerNormKernel<T> {
this->end_ = this->num_ - this->rest_; \ this->end_ = this->num_ - this->rest_; \
} \ } \
template <> \ template <> \
void LayerNormKernelImpl<float, jit::avx, block>::Compute( \ void LayerNormKernelImpl<float, platform::avx, block>::Compute( \
float* x, float* out, float* mean, float* var, const float* scale, \ float* x, float* out, float* mean, float* var, const float* scale, \
const float* bias, int height, const float epsilon) const { \ const float* bias, int height, const float epsilon) const { \
__m256 sum; \ __m256 sum; \
...@@ -219,16 +217,16 @@ class LayerNormKernelImpl : public LayerNormKernel<T> { ...@@ -219,16 +217,16 @@ class LayerNormKernelImpl : public LayerNormKernel<T> {
} }
#ifdef __AVX__ #ifdef __AVX__
INTRIAVX_FLOAT(jit::avx, kEQ8); INTRIAVX_FLOAT(platform::avx, kEQ8);
INTRIAVX_FLOAT(jit::avx, kGT8LT16); INTRIAVX_FLOAT(platform::avx, kGT8LT16);
INTRIAVX_FLOAT(jit::avx, kEQ16); INTRIAVX_FLOAT(platform::avx, kEQ16);
INTRIAVX_FLOAT(jit::avx, kGT16); INTRIAVX_FLOAT(platform::avx, kGT16);
#endif #endif
#ifdef __AVX2__ #ifdef __AVX2__
INTRIAVX_FLOAT(jit::avx2, kEQ8); INTRIAVX_FLOAT(platform::avx2, kEQ8);
INTRIAVX_FLOAT(jit::avx2, kGT8LT16); INTRIAVX_FLOAT(platform::avx2, kGT8LT16);
INTRIAVX_FLOAT(jit::avx2, kEQ16); INTRIAVX_FLOAT(platform::avx2, kEQ16);
INTRIAVX_FLOAT(jit::avx2, kGT16); INTRIAVX_FLOAT(platform::avx2, kGT16);
#endif #endif
#undef INTRIAVX_FLOAT #undef INTRIAVX_FLOAT
......
...@@ -92,7 +92,6 @@ namespace jitkernel { ...@@ -92,7 +92,6 @@ namespace jitkernel {
JITKERNEL_DECLARE, JITKERNEL_FIND_KEY, \ JITKERNEL_DECLARE, JITKERNEL_FIND_KEY, \
JITKERNEL_IMPL) JITKERNEL_IMPL)
namespace jit = platform::jit;
// TODO(TJ): below defines are deprecated, would be remove recently // TODO(TJ): below defines are deprecated, would be remove recently
#define SEARCH_BLOCK(macro_, ker, dtype, isa) \ #define SEARCH_BLOCK(macro_, ker, dtype, isa) \
if (d < YMM_FLOAT_BLOCK) { \ if (d < YMM_FLOAT_BLOCK) { \
...@@ -107,15 +106,15 @@ namespace jit = platform::jit; ...@@ -107,15 +106,15 @@ namespace jit = platform::jit;
macro_(ker, dtype, isa, kGT16); \ macro_(ker, dtype, isa, kGT16); \
} }
#define SEARCH_ISA_BLOCK(macro_, ker, dtype) \ #define SEARCH_ISA_BLOCK(macro_, ker, dtype) \
if (jit::MayIUse(jit::avx512f)) { \ if (platform::MayIUse(platform::avx512f)) { \
SEARCH_BLOCK(macro_, ker, dtype, jit::avx512f); \ SEARCH_BLOCK(macro_, ker, dtype, platform::avx512f); \
} else if (jit::MayIUse(jit::avx2)) { \ } else if (platform::MayIUse(platform::avx2)) { \
SEARCH_BLOCK(macro_, ker, dtype, jit::avx2); \ SEARCH_BLOCK(macro_, ker, dtype, platform::avx2); \
} else if (jit::MayIUse(jit::avx)) { \ } else if (platform::MayIUse(platform::avx)) { \
SEARCH_BLOCK(macro_, ker, dtype, jit::avx); \ SEARCH_BLOCK(macro_, ker, dtype, platform::avx); \
} else { \ } else { \
SEARCH_BLOCK(macro_, ker, dtype, jit::isa_any); \ SEARCH_BLOCK(macro_, ker, dtype, platform::isa_any); \
} }
#define JITKERNEL_KEY(ker_key, dtype_key) \ #define JITKERNEL_KEY(ker_key, dtype_key) \
...@@ -156,10 +155,10 @@ namespace jit = platform::jit; ...@@ -156,10 +155,10 @@ namespace jit = platform::jit;
marco_declare, macro_key, macro_impl) marco_declare, macro_key, macro_impl)
#define FOR_EACH_ISA(macro_, block) \ #define FOR_EACH_ISA(macro_, block) \
macro_(jit::avx512f, block); \ macro_(platform::avx512f, block); \
macro_(jit::avx2, block); \ macro_(platform::avx2, block); \
macro_(jit::avx, block); \ macro_(platform::avx, block); \
macro_(jit::isa_any, block) macro_(platform::isa_any, block)
#define FOR_EACH_BLOCK(macro_, isa) \ #define FOR_EACH_BLOCK(macro_, isa) \
macro_(isa, kLT8); \ macro_(isa, kLT8); \
...@@ -168,11 +167,11 @@ namespace jit = platform::jit; ...@@ -168,11 +167,11 @@ namespace jit = platform::jit;
macro_(isa, kEQ16); \ macro_(isa, kEQ16); \
macro_(isa, kGT16) macro_(isa, kGT16)
#define FOR_EACH_ISA_BLOCK(macro_) \ #define FOR_EACH_ISA_BLOCK(macro_) \
FOR_EACH_BLOCK(macro_, jit::avx512f); \ FOR_EACH_BLOCK(macro_, platform::avx512f); \
FOR_EACH_BLOCK(macro_, jit::avx2); \ FOR_EACH_BLOCK(macro_, platform::avx2); \
FOR_EACH_BLOCK(macro_, jit::avx); \ FOR_EACH_BLOCK(macro_, platform::avx); \
FOR_EACH_BLOCK(macro_, jit::isa_any) FOR_EACH_BLOCK(macro_, platform::isa_any)
} // namespace jitkernel } // namespace jitkernel
} // namespace math } // namespace math
......
...@@ -705,7 +705,7 @@ TEST(JitKernel, pool) { ...@@ -705,7 +705,7 @@ TEST(JitKernel, pool) {
jit::lstm_attr_t attr(frame_size, act_gate, act_cand, act_cell, false); jit::lstm_attr_t attr(frame_size, act_gate, act_cand, act_cell, false);
// empty call it to avoid unknown flag 'use_pinned_memory' on Mac // empty call it to avoid unknown flag 'use_pinned_memory' on Mac
paddle::platform::jit::MayIUse(paddle::platform::jit::avx); paddle::platform::MayIUse(paddle::platform::avx);
const auto& plstm1 = const auto& plstm1 =
jit::KernelPool::Instance() jit::KernelPool::Instance()
.template Get<jit::LSTMKernel<float>, const jit::lstm_attr_t&>(attr); .template Get<jit::LSTMKernel<float>, const jit::lstm_attr_t&>(attr);
......
...@@ -89,6 +89,8 @@ template <typename T> ...@@ -89,6 +89,8 @@ template <typename T>
void MatrixBitCodeFunctor<T>::Mul(framework::Tensor* tmat, void MatrixBitCodeFunctor<T>::Mul(framework::Tensor* tmat,
const framework::Tensor& weight, const framework::Tensor& weight,
const framework::Tensor& input) { const framework::Tensor& input) {
auto blas =
GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
size_t num_samples = tmat->dims()[0]; size_t num_samples = tmat->dims()[0];
size_t tmat_width = tmat->dims()[1]; size_t tmat_width = tmat->dims()[1];
size_t input_width = input.dims()[1]; size_t input_width = input.dims()[1];
...@@ -99,13 +101,12 @@ void MatrixBitCodeFunctor<T>::Mul(framework::Tensor* tmat, ...@@ -99,13 +101,12 @@ void MatrixBitCodeFunctor<T>::Mul(framework::Tensor* tmat,
for (size_t i = 0; i < num_samples; ++i) { for (size_t i = 0; i < num_samples; ++i) {
auto code = code_table_->get_code(i); auto code = code_table_->get_code(i);
int code_length = code->get_length(); int code_length = code->get_length();
const T* input_row = input_value + input_width * i;
for (int j = 0; j < code_length; ++j) { for (int j = 0; j < code_length; ++j) {
size_t index = code->calc_index(j); size_t index = code->calc_index(j);
const T* weight_row = weight_value + weight_width * index;
T sum = static_cast<T>(0.0); T sum = static_cast<T>(0.0);
for (size_t k = 0; k < input_width; ++k) { sum = blas.DOT(input_width, weight_row, input_row);
sum += weight_value[weight_width * index + k] *
input_value[input_width * i + k];
}
tmat_value[i * tmat_width + j] += sum; tmat_value[i * tmat_width + j] += sum;
} }
} }
...@@ -115,6 +116,8 @@ template <typename T> ...@@ -115,6 +116,8 @@ template <typename T>
void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat, void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
framework::Tensor* weight, framework::Tensor* weight,
const framework::Tensor& input) { const framework::Tensor& input) {
auto blas =
GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
size_t num_samples = tmat.dims()[0]; size_t num_samples = tmat.dims()[0];
size_t input_width = input.dims()[1]; size_t input_width = input.dims()[1];
size_t tmat_width = tmat.dims()[1]; size_t tmat_width = tmat.dims()[1];
...@@ -122,16 +125,25 @@ void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat, ...@@ -122,16 +125,25 @@ void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
auto tmat_value = tmat.data<T>(); auto tmat_value = tmat.data<T>();
auto weight_value = weight->data<T>(); auto weight_value = weight->data<T>();
auto input_value = input.data<T>(); auto input_value = input.data<T>();
std::unordered_map<int, std::vector<std::pair<T, const T*>>> ops;
for (size_t i = 0; i < num_samples; ++i) { for (size_t i = 0; i < num_samples; ++i) {
auto code = code_table_->get_code(i); auto code = code_table_->get_code(i);
int code_length = code->get_length(); int code_length = code->get_length();
const T* input_value_row = input_value + input_width * i;
const T* tmat_row = tmat_value + i * tmat_width;
for (int j = 0; j < code_length; ++j) { for (int j = 0; j < code_length; ++j) {
size_t index = code->calc_index(j); ops[code->calc_index(j)].emplace_back(tmat_row[j], input_value_row);
}
for (size_t k = 0; k < input_width; ++k) { }
weight_value[weight_width * index + k] += for (auto& op : ops) {
tmat_value[i * tmat_width + j] * input_value[input_width * i + k]; auto& op_in_row = op.second;
} for (auto& pair : op_in_row) {
auto& scale = pair.first;
auto* input_row = pair.second;
T* weight_row = weight_value + op.first * weight_width;
blas.AXPY(input_width, scale, input_row, weight_row);
} }
} }
} }
...@@ -140,6 +152,8 @@ template <typename T> ...@@ -140,6 +152,8 @@ template <typename T>
void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat, void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
framework::SelectedRows* weight, framework::SelectedRows* weight,
const framework::Tensor& input) { const framework::Tensor& input) {
auto blas =
GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
size_t num_samples = tmat.dims()[0]; size_t num_samples = tmat.dims()[0];
size_t input_width = input.dims()[1]; size_t input_width = input.dims()[1];
size_t tmat_width = tmat.dims()[1]; size_t tmat_width = tmat.dims()[1];
...@@ -147,17 +161,28 @@ void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat, ...@@ -147,17 +161,28 @@ void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
auto tmat_value = tmat.data<T>(); auto tmat_value = tmat.data<T>();
auto weight_value = weight->mutable_value()->data<T>(); auto weight_value = weight->mutable_value()->data<T>();
auto input_value = input.data<T>(); auto input_value = input.data<T>();
std::unordered_map<int, std::vector<std::pair<T, const T*>>> ops;
ops.reserve(weight->rows().size());
for (size_t i = 0; i < num_samples; ++i) { for (size_t i = 0; i < num_samples; ++i) {
auto code = code_table_->get_code(i); auto code = code_table_->get_code(i);
int code_length = code->get_length(); int code_length = code->get_length();
const T* input_value_row = input_value + input_width * i;
const T* tmat_row = tmat_value + i * tmat_width;
for (int j = 0; j < code_length; ++j) { for (int j = 0; j < code_length; ++j) {
size_t index = code->calc_index(j); ops[code->calc_index(j)].emplace_back(tmat_row[j], input_value_row);
for (size_t k = 0; k < input_width; ++k) { }
int64_t row_index = weight->GetIndexFromId(static_cast<int64_t>(index)); }
weight_value[row_index * weight_width + k] +=
tmat_value[i * tmat_width + j] * input_value[input_width * i + k]; for (auto& row : weight->rows()) {
} auto& op_in_row = ops[row];
for (auto& pair : op_in_row) {
auto& scale = pair.first;
auto* input_row = pair.second;
blas.AXPY(input_width, scale, input_row, weight_value);
} }
weight_value += weight_width;
} }
} }
......
...@@ -13,10 +13,14 @@ See the License for the specific language governing permissions and ...@@ -13,10 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#if defined(_WIN32) #if defined(_WIN32)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/prelu.h"
namespace paddle {
namespace operators {
namespace math {
static const int CUDA_NUM_THREADS = 1024;
static const int CUDA_MAX_NUM_BLOCKS = 65535;
inline static int GET_NUM_BLOCKS(const int N) {
return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
template <typename T>
__global__ void PReluChannelWiseKernel(const T *input, const T *alpha,
T *output, int channel,
size_t spatial_size) {
size_t offset = blockIdx.x * spatial_size;
const T *in = input + offset;
T *out = output + offset;
T scale = alpha[blockIdx.x % channel];
for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
T x = in[i];
out[i] = (x > 0) ? x : scale * x;
}
}
template <typename T>
__global__ void PReluElementWiseKernel(const T *input, const T *alpha,
T *output, size_t spatial_size) {
size_t offset = blockIdx.x * spatial_size;
const T *in = input + offset;
const T *scale = alpha + offset;
T *out = output + offset;
for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
T x = in[i];
out[i] = (x > 0) ? x : scale[i] * x;
}
}
template <typename T>
__global__ void PReluScalarKernel(const T *input, const T *alpha, T *output,
size_t spatial_size) {
size_t offset = blockIdx.x * spatial_size;
const T *in = input + offset;
T scale = *alpha;
T *out = output + offset;
for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) {
T x = in[i];
out[i] = (x > 0) ? x : scale * x;
}
}
template <typename T>
static inline void PReluChannelWise(cudaStream_t stream, const T *input,
const T *alpha, T *output,
std::vector<int> input_shape) {
size_t unroll = input_shape[0] * input_shape[1];
size_t spatial_size = input_shape[2] * input_shape[3];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluChannelWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, input_shape[1], spatial_size);
}
template <typename T>
static inline void PReluElementWise(cudaStream_t stream, const T *input,
const T *alpha, T *output,
std::vector<int> input_shape) {
size_t unroll = input_shape[0] * input_shape[1];
size_t spatial_size = input_shape[2] * input_shape[3];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluElementWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, spatial_size);
}
template <typename T>
static inline void PReluScalar(cudaStream_t stream, const T *input,
const T *alpha, T *output,
std::vector<int> input_shape) {
size_t unroll = input_shape[0] * input_shape[1];
size_t spatial_size = input_shape[2] * input_shape[3];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluScalarKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, spatial_size);
}
template <typename T>
void PreluChannelWiseDirectCUDAFunctor<T>::operator()(
cudaStream_t stream, const T *input, const T *alpha, T *output,
std::vector<int> input_shape) {
size_t unroll = input_shape[0] * input_shape[1];
size_t spatial_size = input_shape[2] * input_shape[3];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluChannelWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, input_shape[1], spatial_size);
}
template <typename T>
void PreluElementWiseDirectCUDAFunctor<T>::operator()(
cudaStream_t stream, const T *input, const T *alpha, T *output,
std::vector<int> input_shape) {
size_t unroll = input_shape[0] * input_shape[1];
size_t spatial_size = input_shape[2] * input_shape[3];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluElementWiseKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, spatial_size);
}
template <typename T>
void PreluScalarDirectCUDAFunctor<T>::operator()(cudaStream_t stream,
const T *input, const T *alpha,
T *output,
std::vector<int> input_shape) {
size_t unroll = input_shape[0] * input_shape[1];
size_t spatial_size = input_shape[2] * input_shape[3];
CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS);
PReluScalarKernel<<<unroll, CUDA_NUM_THREADS, 0, stream>>>(
input, alpha, output, spatial_size);
}
template class PreluChannelWiseDirectCUDAFunctor<float>;
template class PreluChannelWiseDirectCUDAFunctor<double>;
template class PreluElementWiseDirectCUDAFunctor<float>;
template class PreluElementWiseDirectCUDAFunctor<double>;
template class PreluScalarDirectCUDAFunctor<float>;
template class PreluScalarDirectCUDAFunctor<double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace paddle {
namespace operators {
namespace math {
#ifdef PADDLE_WITH_CUDA
template <typename T>
class PreluChannelWiseDirectCUDAFunctor {
public:
void operator()(cudaStream_t stream, const T *input, const T *alpha,
T *output, std::vector<int> input_shape);
};
template <typename T>
class PreluElementWiseDirectCUDAFunctor {
public:
void operator()(cudaStream_t stream, const T *input, const T *alpha,
T *output, std::vector<int> input_shape);
};
template <typename T>
class PreluScalarDirectCUDAFunctor {
public:
void operator()(cudaStream_t stream, const T *input, const T *alpha,
T *output, std::vector<int> input_shape);
};
#endif
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/merge_selected_rows_op.h"
namespace paddle {
namespace operators {
class MergeSelectedRowsOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of MergeSelectedRowsOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of MergeSelectedRowsOp should not be null.");
ctx->ShareDim("X", /*->*/ "Out");
}
};
class MergeSelectedRowsOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"The input type is SelectedRows, and the selected rows may be "
"duplicated.");
AddOutput("Out",
"The output type is SelectedRows, and the selected rows are not "
"duplicated.");
AddComment(
R"DOC(
MergeSelectedRows Operator.
MergeSelectedRows is used to merge the duplicated rows of the input.
)DOC");
}
};
class MergeSelectedRowsOpInferVarType
: public framework::PassInDtypeAndVarTypeToOutput {
protected:
std::unordered_map<std::string, std::string> GetInputOutputWithSameType()
const override {
return std::unordered_map<std::string, std::string>{{"X", /*->*/ "Out"}};
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OPERATOR(merge_selected_rows, ops::MergeSelectedRowsOp,
ops::MergeSelectedRowsOpMaker,
ops::MergeSelectedRowsOpInferVarType);
REGISTER_OP_CPU_KERNEL(
merge_selected_rows,
ops::MergeSelectedRowsKernel<plat::CPUDeviceContext, float>,
ops::MergeSelectedRowsKernel<plat::CPUDeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/merge_selected_rows_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
merge_selected_rows,
ops::MergeSelectedRowsKernel<plat::CUDADeviceContext, float>,
ops::MergeSelectedRowsKernel<plat::CUDADeviceContext, double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class MergeSelectedRowsKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<framework::SelectedRows>("X");
auto* out = context.Output<framework::SelectedRows>("Out");
math::scatter::MergeAdd<DeviceContext, T> merge_func;
merge_func(context.template device_context<DeviceContext>(), *x, out);
}
};
} // namespace operators
} // namespace paddle
...@@ -58,7 +58,7 @@ class PReluOp : public framework::OperatorWithKernel { ...@@ -58,7 +58,7 @@ class PReluOp : public framework::OperatorWithKernel {
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), framework::ToDataType(ctx.Input<Tensor>("X")->type()),
platform::CPUPlace()); ctx.device_context());
} }
}; };
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/prelu.h"
#include "paddle/fluid/operators/prelu_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
class CUDAPReluKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* x = context.Input<Tensor>("X");
auto* alpha = context.Input<Tensor>("Alpha");
auto* out = context.Output<Tensor>("Out");
const T* x_ptr = x->data<T>();
T* o_ptr = out->mutable_data<T>(context.GetPlace());
const T* alpha_ptr = alpha->data<T>();
auto& mode = context.Attr<std::string>("mode");
int numel = x->numel();
auto dim = x->dims();
std::vector<int> input_shape = framework::vectorize2int(dim);
if (mode == "channel") {
math::PreluChannelWiseDirectCUDAFunctor<T> prelu_channel_wise;
prelu_channel_wise(context.cuda_device_context().stream(), x_ptr,
alpha_ptr, o_ptr, input_shape);
} else if (mode == "element") {
math::PreluElementWiseDirectCUDAFunctor<T> prelu_element_wise;
prelu_element_wise(context.cuda_device_context().stream(), x_ptr,
alpha_ptr, o_ptr, input_shape);
} else {
math::PreluScalarDirectCUDAFunctor<T> prelu_scalar;
prelu_scalar(context.cuda_device_context().stream(), x_ptr, alpha_ptr,
o_ptr, input_shape);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
prelu, ops::CUDAPReluKernel<paddle::platform::CUDADeviceContext, float>,
ops::CUDAPReluKernel<paddle::platform::CUDADeviceContext, double>);
...@@ -36,12 +36,10 @@ class SequenceMaskOp : public framework::OperatorWithKernel { ...@@ -36,12 +36,10 @@ class SequenceMaskOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must exist"); PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must exist");
PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) must exist"); PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) must exist");
auto maxlen = ctx->Attrs().Get<int>("maxlen"); int maxlen = ctx->Attrs().Get<int>("maxlen");
if (maxlen > 0) { // We can only infershape when maxlen > 0 auto dim = framework::vectorize2int(ctx->GetInputDim("X"));
auto dim = framework::vectorize2int(ctx->GetInputDim("X")); dim.push_back(maxlen > 0 ? maxlen : -1);
dim.push_back(maxlen); ctx->SetOutputDim("Y", framework::make_ddim(dim));
ctx->SetOutputDim("Y", framework::make_ddim(dim));
}
} }
}; };
......
...@@ -18,6 +18,7 @@ namespace paddle { ...@@ -18,6 +18,7 @@ namespace paddle {
namespace operators { namespace operators {
using framework::Tensor; using framework::Tensor;
const int kIgnoreIndex = -100;
class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel { class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel {
public: public:
...@@ -100,6 +101,11 @@ class SigmoidCrossEntropyWithLogitsOpMaker ...@@ -100,6 +101,11 @@ class SigmoidCrossEntropyWithLogitsOpMaker
AddOutput("Out", AddOutput("Out",
"(Tensor, default Tensor<float>), a 2-D tensor with shape N x D " "(Tensor, default Tensor<float>), a 2-D tensor with shape N x D "
" of elementwise logistic losses."); " of elementwise logistic losses.");
AddAttr<int>("ignore_index",
"(int, default kIgnoreIndex), Specifies a target value that "
"is ignored and"
"does not contribute to the input gradient.")
.SetDefault(kIgnoreIndex);
AddComment(R"DOC( AddComment(R"DOC(
SigmoidCrossEntropyWithLogits Operator. SigmoidCrossEntropyWithLogits Operator.
......
...@@ -15,33 +15,72 @@ limitations under the License. */ ...@@ -15,33 +15,72 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/hostdevice.h"
#include "paddle/legacy/utils/Logging.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T>
struct SigmoidCrossEntropyWithLogitsForward {
HOSTDEVICE SigmoidCrossEntropyWithLogitsForward(const int &ignore_index)
: ignore_index(ignore_index) {}
HOSTDEVICE T operator()(const T &x, const T &label) const {
if (static_cast<int>(label) == ignore_index) {
return static_cast<T>(0.);
}
T term1 = (x > 0) ? x : 0;
T term2 = x * label;
T term3 = std::log(static_cast<T>(1) + std::exp(-(std::abs(x))));
return term1 - term2 + term3;
}
int ignore_index;
};
template <typename T>
struct SigmoidCrossEntropyWithLogitsBackward {
HOSTDEVICE SigmoidCrossEntropyWithLogitsBackward(const int &ignore_index)
: ignore_index(ignore_index) {}
HOSTDEVICE T operator()(const T &x, const T &label) const {
if (static_cast<int>(label) == ignore_index) {
return static_cast<T>(0.);
}
T simoid_x = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-x));
return simoid_x - label;
}
int ignore_index;
};
// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) // Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X)))
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> { class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext &context) const override { void Compute(const framework::ExecutionContext &context) const override {
const framework::Tensor *X = context.Input<framework::Tensor>("X"); const Tensor *X = context.Input<Tensor>("X");
const framework::Tensor *Labels = context.Input<framework::Tensor>("Label"); const Tensor *Labels = context.Input<Tensor>("Label");
framework::Tensor *Out = context.Output<framework::Tensor>("Out"); Tensor *Out = context.Output<Tensor>("Out");
Out->mutable_data<T>(context.GetPlace()); Out->mutable_data<T>(context.GetPlace());
int ignore_index = context.Attr<int>("ignore_index");
auto x = framework::EigenVector<T>::Flatten(*X); auto x = EigenVector<T>::Flatten(*X);
auto labels = framework::EigenVector<T>::Flatten(*Labels); auto labels = EigenVector<T>::Flatten(*Labels);
auto out = framework::EigenVector<T>::Flatten(*Out); auto out = EigenVector<T>::Flatten(*Out);
auto &place = *context.device_context<DeviceContext>().eigen_device(); auto &place = *context.device_context<DeviceContext>().eigen_device();
// term1 = max(x, 0) out.device(place) = x.binaryExpr(
auto term1 = x.cwiseMax(static_cast<T>(0)); labels, SigmoidCrossEntropyWithLogitsForward<T>(ignore_index));
// term2 = x * labels
auto term2 = x * labels;
// term3 = log(1 + exp(-abs(x)))
auto term3 = (static_cast<T>(1) + (-(x.abs())).exp()).log();
out.device(place) = term1 - term2 + term3;
} }
}; };
...@@ -50,23 +89,23 @@ template <typename DeviceContext, typename T> ...@@ -50,23 +89,23 @@ template <typename DeviceContext, typename T>
class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel<T> { class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext &context) const override { void Compute(const framework::ExecutionContext &context) const override {
const framework::Tensor *X = context.Input<framework::Tensor>("X"); const Tensor *X = context.Input<Tensor>("X");
const framework::Tensor *Labels = context.Input<framework::Tensor>("Label"); const Tensor *Labels = context.Input<Tensor>("Label");
const framework::Tensor *dOut = const Tensor *dOut = context.Input<Tensor>(framework::GradVarName("Out"));
context.Input<framework::Tensor>(framework::GradVarName("Out")); Tensor *dX = context.Output<Tensor>(framework::GradVarName("X"));
framework::Tensor *dX =
context.Output<framework::Tensor>(framework::GradVarName("X"));
dX->mutable_data<T>(context.GetPlace()); dX->mutable_data<T>(context.GetPlace());
auto x = framework::EigenVector<T>::Flatten(*X); auto ignore_index = context.Attr<int>("ignore_index");
auto labels = framework::EigenVector<T>::Flatten(*Labels); auto x = EigenVector<T>::Flatten(*X);
auto dout = framework::EigenVector<T>::Flatten(*dOut); auto labels = EigenVector<T>::Flatten(*Labels);
auto dx = framework::EigenVector<T>::Flatten(*dX); auto dout = EigenVector<T>::Flatten(*dOut);
auto dx = EigenVector<T>::Flatten(*dX);
auto &place = auto &place =
*context.template device_context<DeviceContext>().eigen_device(); *context.template device_context<DeviceContext>().eigen_device();
auto sigmoid_x = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp()); auto diff = x.binaryExpr(labels, SigmoidCrossEntropyWithLogitsBackward<T>(
dx.device(place) = dout * (sigmoid_x - labels); static_cast<int>(ignore_index)));
dx.device(place) = dout * diff;
} }
}; };
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/yolov3_loss_op.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using framework::Tensor;
class Yolov3LossOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of Yolov3LossOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("GTBox"),
"Input(GTBox) of Yolov3LossOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("GTLabel"),
"Input(GTLabel) of Yolov3LossOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Loss"),
"Output(Loss) of Yolov3LossOp should not be null.");
auto dim_x = ctx->GetInputDim("X");
auto dim_gtbox = ctx->GetInputDim("GTBox");
auto dim_gtlabel = ctx->GetInputDim("GTLabel");
auto anchors = ctx->Attrs().Get<std::vector<int>>("anchors");
auto class_num = ctx->Attrs().Get<int>("class_num");
PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor.");
PADDLE_ENFORCE_EQ(dim_x[2], dim_x[3],
"Input(X) dim[3] and dim[4] should be euqal.");
PADDLE_ENFORCE_EQ(dim_x[1], anchors.size() / 2 * (5 + class_num),
"Input(X) dim[1] should be equal to (anchor_number * (5 "
"+ class_num)).");
PADDLE_ENFORCE_EQ(dim_gtbox.size(), 3,
"Input(GTBox) should be a 3-D tensor");
PADDLE_ENFORCE_EQ(dim_gtbox[2], 4, "Input(GTBox) dim[2] should be 5");
PADDLE_ENFORCE_EQ(dim_gtlabel.size(), 2,
"Input(GTBox) should be a 2-D tensor");
PADDLE_ENFORCE_EQ(dim_gtlabel[0], dim_gtbox[0],
"Input(GTBox) and Input(GTLabel) dim[0] should be same");
PADDLE_ENFORCE_EQ(dim_gtlabel[1], dim_gtbox[1],
"Input(GTBox) and Input(GTLabel) dim[1] should be same");
PADDLE_ENFORCE_GT(anchors.size(), 0,
"Attr(anchors) length should be greater then 0.");
PADDLE_ENFORCE_EQ(anchors.size() % 2, 0,
"Attr(anchors) length should be even integer.");
PADDLE_ENFORCE_GT(class_num, 0,
"Attr(class_num) should be an integer greater then 0.");
std::vector<int64_t> dim_out({1});
ctx->SetOutputDim("Loss", framework::make_ddim(dim_out));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()),
platform::CPUPlace());
}
};
class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"The input tensor of YOLO v3 loss operator, "
"This is a 4-D tensor with shape of [N, C, H, W]."
"H and W should be same, and the second dimention(C) stores"
"box locations, confidence score and classification one-hot"
"key of each anchor box");
AddInput("GTBox",
"The input tensor of ground truth boxes, "
"This is a 3-D tensor with shape of [N, max_box_num, 5], "
"max_box_num is the max number of boxes in each image, "
"In the third dimention, stores x, y, w, h coordinates, "
"x, y is the center cordinate of boxes and w, h is the "
"width and height and x, y, w, h should be divided by "
"input image height to scale to [0, 1].");
AddInput("GTLabel",
"The input tensor of ground truth label, "
"This is a 2-D tensor with shape of [N, max_box_num], "
"and each element shoudl be an integer to indicate the "
"box class id.");
AddOutput("Loss",
"The output yolov3 loss tensor, "
"This is a 1-D tensor with shape of [1]");
AddAttr<int>("class_num", "The number of classes to predict.");
AddAttr<std::vector<int>>("anchors",
"The anchor width and height, "
"it will be parsed pair by pair.");
AddAttr<float>("ignore_thresh",
"The ignore threshold to ignore confidence loss.");
AddAttr<float>("loss_weight_xy", "The weight of x, y location loss.")
.SetDefault(1.0);
AddAttr<float>("loss_weight_wh", "The weight of w, h location loss.")
.SetDefault(1.0);
AddAttr<float>(
"loss_weight_conf_target",
"The weight of confidence score loss in locations with target object.")
.SetDefault(1.0);
AddAttr<float>("loss_weight_conf_notarget",
"The weight of confidence score loss in locations without "
"target object.")
.SetDefault(1.0);
AddAttr<float>("loss_weight_class", "The weight of classification loss.")
.SetDefault(1.0);
AddComment(R"DOC(
This operator generate yolov3 loss by given predict result and ground
truth boxes.
The output of previous network is in shape [N, C, H, W], while H and W
should be the same, specify the grid size, each grid point predict given
number boxes, this given number is specified by anchors, it should be
half anchors length, which following will be represented as S. In the
second dimention(the channel dimention), C should be S * (class_num + 5),
class_num is the box categoriy number of source dataset(such as coco),
so in the second dimention, stores 4 box location coordinates x, y, w, h
and confidence score of the box and class one-hot key of each anchor box.
While the 4 location coordinates if $$tx, ty, tw, th$$, the box predictions
correspnd to:
$$
b_x = \sigma(t_x) + c_x
b_y = \sigma(t_y) + c_y
b_w = p_w e^{t_w}
b_h = p_h e^{t_h}
$$
While $$c_x, c_y$$ is the left top corner of current grid and $$p_w, p_h$$
is specified by anchors.
As for confidence score, it is the logistic regression value of IoU between
anchor boxes and ground truth boxes, the score of the anchor box which has
the max IoU should be 1, and if the anchor box has IoU bigger then ignore
thresh, the confidence score loss of this anchor box will be ignored.
Therefore, the yolov3 loss consist of three major parts, box location loss,
confidence score loss, and classification loss. The MSE loss is used for
box location, and binary cross entropy loss is used for confidence score
loss and classification loss.
Final loss will be represented as follow.
$$
loss = \loss_weight_{xy} * loss_{xy} + \loss_weight_{wh} * loss_{wh}
+ \loss_weight_{conf_target} * loss_{conf_target}
+ \loss_weight_{conf_notarget} * loss_{conf_notarget}
+ \loss_weight_{class} * loss_{class}
$$
)DOC");
}
};
class Yolov3LossOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")),
"Input(Loss@GRAD) should not be null");
auto dim_x = ctx->GetInputDim("X");
if (ctx->HasOutput(framework::GradVarName("X"))) {
ctx->SetOutputDim(framework::GradVarName("X"), dim_x);
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()),
platform::CPUPlace());
}
};
class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
auto* op = new framework::OpDesc();
op->SetType("yolov3_loss_grad");
op->SetInput("X", Input("X"));
op->SetInput("GTBox", Input("GTBox"));
op->SetInput("GTLabel", Input("GTLabel"));
op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
op->SetAttrMap(Attrs());
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetOutput(framework::GradVarName("GTBox"), {});
op->SetOutput(framework::GradVarName("GTLabel"), {});
return std::unique_ptr<framework::OpDesc>(op);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(yolov3_loss, ops::Yolov3LossOp, ops::Yolov3LossOpMaker,
ops::Yolov3LossGradMaker);
REGISTER_OPERATOR(yolov3_loss_grad, ops::Yolov3LossOpGrad);
REGISTER_OP_CPU_KERNEL(yolov3_loss, ops::Yolov3LossKernel<float>,
ops::Yolov3LossKernel<double>);
REGISTER_OP_CPU_KERNEL(yolov3_loss_grad, ops::Yolov3LossGradKernel<float>,
ops::Yolov3LossGradKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T, size_t D, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
using Array5 = Eigen::DSizes<int64_t, 5>;
template <typename T>
static inline bool isZero(T x) {
return fabs(x) < 1e-6;
}
template <typename T>
static inline T sigmoid(T x) {
return 1.0 / (exp(-1.0 * x) + 1.0);
}
template <typename T>
static inline T CalcMaskPointNum(const Tensor& mask) {
auto mask_t = EigenVector<int>::Flatten(mask);
T count = 0.0;
for (int i = 0; i < mask_t.dimensions()[0]; i++) {
if (mask_t(i)) {
count += 1.0;
}
}
return count;
}
template <typename T>
static inline T CalcMSEWithMask(const Tensor& x, const Tensor& y,
const Tensor& mask) {
auto x_t = EigenVector<T>::Flatten(x);
auto y_t = EigenVector<T>::Flatten(y);
auto mask_t = EigenVector<int>::Flatten(mask);
T error_sum = 0.0;
T points = 0.0;
for (int i = 0; i < x_t.dimensions()[0]; i++) {
if (mask_t(i)) {
error_sum += pow(x_t(i) - y_t(i), 2);
points += 1;
}
}
return (error_sum / points);
}
template <typename T>
static void CalcMSEGradWithMask(Tensor* grad, const Tensor& x, const Tensor& y,
const Tensor& mask, T mf) {
auto grad_t = EigenVector<T>::Flatten(*grad).setConstant(0.0);
auto x_t = EigenVector<T>::Flatten(x);
auto y_t = EigenVector<T>::Flatten(y);
auto mask_t = EigenVector<int>::Flatten(mask);
for (int i = 0; i < x_t.dimensions()[0]; i++) {
if (mask_t(i)) {
grad_t(i) = 2.0 * (x_t(i) - y_t(i)) / mf;
}
}
}
template <typename T>
static inline T CalcBCEWithMask(const Tensor& x, const Tensor& y,
const Tensor& mask) {
auto x_t = EigenVector<T>::Flatten(x);
auto y_t = EigenVector<T>::Flatten(y);
auto mask_t = EigenVector<int>::Flatten(mask);
T error_sum = 0.0;
T points = 0.0;
for (int i = 0; i < x_t.dimensions()[0]; i++) {
if (mask_t(i)) {
error_sum +=
-1.0 * (y_t(i) * log(x_t(i)) + (1.0 - y_t(i)) * log(1.0 - x_t(i)));
points += 1;
}
}
return (error_sum / points);
}
template <typename T>
static inline void CalcBCEGradWithMask(Tensor* grad, const Tensor& x,
const Tensor& y, const Tensor& mask,
T mf) {
auto grad_t = EigenVector<T>::Flatten(*grad).setConstant(0.0);
auto x_t = EigenVector<T>::Flatten(x);
auto y_t = EigenVector<T>::Flatten(y);
auto mask_t = EigenVector<int>::Flatten(mask);
for (int i = 0; i < x_t.dimensions()[0]; i++) {
if (mask_t(i)) {
grad_t(i) = ((1.0 - y_t(i)) / (1.0 - x_t(i)) - y_t(i) / x_t(i)) / mf;
}
}
}
template <typename T>
static void CalcPredResult(const Tensor& input, Tensor* pred_conf,
Tensor* pred_class, Tensor* pred_x, Tensor* pred_y,
Tensor* pred_w, Tensor* pred_h, const int anchor_num,
const int class_num) {
const int n = input.dims()[0];
const int h = input.dims()[2];
const int w = input.dims()[3];
const int box_attr_num = 5 + class_num;
auto input_t = EigenTensor<T, 4>::From(input);
auto pred_conf_t = EigenTensor<T, 4>::From(*pred_conf);
auto pred_class_t = EigenTensor<T, 5>::From(*pred_class);
auto pred_x_t = EigenTensor<T, 4>::From(*pred_x);
auto pred_y_t = EigenTensor<T, 4>::From(*pred_y);
auto pred_w_t = EigenTensor<T, 4>::From(*pred_w);
auto pred_h_t = EigenTensor<T, 4>::From(*pred_h);
for (int i = 0; i < n; i++) {
for (int an_idx = 0; an_idx < anchor_num; an_idx++) {
for (int j = 0; j < h; j++) {
for (int k = 0; k < w; k++) {
pred_x_t(i, an_idx, j, k) =
sigmoid(input_t(i, box_attr_num * an_idx, j, k));
pred_y_t(i, an_idx, j, k) =
sigmoid(input_t(i, box_attr_num * an_idx + 1, j, k));
pred_w_t(i, an_idx, j, k) =
input_t(i, box_attr_num * an_idx + 2, j, k);
pred_h_t(i, an_idx, j, k) =
input_t(i, box_attr_num * an_idx + 3, j, k);
pred_conf_t(i, an_idx, j, k) =
sigmoid(input_t(i, box_attr_num * an_idx + 4, j, k));
for (int c = 0; c < class_num; c++) {
pred_class_t(i, an_idx, j, k, c) =
sigmoid(input_t(i, box_attr_num * an_idx + 5 + c, j, k));
}
}
}
}
}
}
template <typename T>
static T CalcBoxIoU(std::vector<T> box1, std::vector<T> box2) {
T b1_x1 = box1[0] - box1[2] / 2;
T b1_x2 = box1[0] + box1[2] / 2;
T b1_y1 = box1[1] - box1[3] / 2;
T b1_y2 = box1[1] + box1[3] / 2;
T b2_x1 = box2[0] - box2[2] / 2;
T b2_x2 = box2[0] + box2[2] / 2;
T b2_y1 = box2[1] - box2[3] / 2;
T b2_y2 = box2[1] + box2[3] / 2;
T b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1);
T b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1);
T inter_rect_x1 = std::max(b1_x1, b2_x1);
T inter_rect_y1 = std::max(b1_y1, b2_y1);
T inter_rect_x2 = std::min(b1_x2, b2_x2);
T inter_rect_y2 = std::min(b1_y2, b2_y2);
T inter_area = std::max(inter_rect_x2 - inter_rect_x1, static_cast<T>(0.0)) *
std::max(inter_rect_y2 - inter_rect_y1, static_cast<T>(0.0));
return inter_area / (b1_area + b2_area - inter_area);
}
template <typename T>
static void PreProcessGTBox(const Tensor& gt_box, const Tensor& gt_label,
const float ignore_thresh, std::vector<int> anchors,
const int grid_size, Tensor* obj_mask,
Tensor* noobj_mask, Tensor* tx, Tensor* ty,
Tensor* tw, Tensor* th, Tensor* tconf,
Tensor* tclass) {
const int n = gt_box.dims()[0];
const int b = gt_box.dims()[1];
const int anchor_num = anchors.size() / 2;
auto gt_box_t = EigenTensor<T, 3>::From(gt_box);
auto gt_label_t = EigenTensor<int, 2>::From(gt_label);
auto obj_mask_t = EigenTensor<int, 4>::From(*obj_mask).setConstant(0);
auto noobj_mask_t = EigenTensor<int, 4>::From(*noobj_mask).setConstant(1);
auto tx_t = EigenTensor<T, 4>::From(*tx).setConstant(0.0);
auto ty_t = EigenTensor<T, 4>::From(*ty).setConstant(0.0);
auto tw_t = EigenTensor<T, 4>::From(*tw).setConstant(0.0);
auto th_t = EigenTensor<T, 4>::From(*th).setConstant(0.0);
auto tconf_t = EigenTensor<T, 4>::From(*tconf).setConstant(0.0);
auto tclass_t = EigenTensor<T, 5>::From(*tclass).setConstant(0.0);
for (int i = 0; i < n; i++) {
for (int j = 0; j < b; j++) {
if (isZero<T>(gt_box_t(i, j, 0)) && isZero<T>(gt_box_t(i, j, 1)) &&
isZero<T>(gt_box_t(i, j, 2)) && isZero<T>(gt_box_t(i, j, 3))) {
continue;
}
int cur_label = gt_label_t(i, j);
T gx = gt_box_t(i, j, 0) * grid_size;
T gy = gt_box_t(i, j, 1) * grid_size;
T gw = gt_box_t(i, j, 2) * grid_size;
T gh = gt_box_t(i, j, 3) * grid_size;
int gi = static_cast<int>(gx);
int gj = static_cast<int>(gy);
T max_iou = static_cast<T>(0);
T iou;
int best_an_index = -1;
std::vector<T> gt_box_shape({0, 0, gw, gh});
for (int an_idx = 0; an_idx < anchor_num; an_idx++) {
std::vector<T> anchor_shape({0, 0, static_cast<T>(anchors[2 * an_idx]),
static_cast<T>(anchors[2 * an_idx + 1])});
iou = CalcBoxIoU<T>(gt_box_shape, anchor_shape);
if (iou > max_iou) {
max_iou = iou;
best_an_index = an_idx;
}
if (iou > ignore_thresh) {
noobj_mask_t(i, an_idx, gj, gi) = 0;
}
}
obj_mask_t(i, best_an_index, gj, gi) = 1;
noobj_mask_t(i, best_an_index, gj, gi) = 0;
tx_t(i, best_an_index, gj, gi) = gx - gi;
ty_t(i, best_an_index, gj, gi) = gy - gj;
tw_t(i, best_an_index, gj, gi) = log(gw / anchors[2 * best_an_index]);
th_t(i, best_an_index, gj, gi) = log(gh / anchors[2 * best_an_index + 1]);
tclass_t(i, best_an_index, gj, gi, cur_label) = 1;
tconf_t(i, best_an_index, gj, gi) = 1;
}
}
}
static void ExpandObjMaskByClassNum(Tensor* obj_mask_expand,
const Tensor& obj_mask) {
const int n = obj_mask_expand->dims()[0];
const int an_num = obj_mask_expand->dims()[1];
const int h = obj_mask_expand->dims()[2];
const int w = obj_mask_expand->dims()[3];
const int class_num = obj_mask_expand->dims()[4];
auto obj_mask_expand_t = EigenTensor<int, 5>::From(*obj_mask_expand);
auto obj_mask_t = EigenTensor<int, 4>::From(obj_mask);
obj_mask_expand_t = obj_mask_t.reshape(Array5(n, an_num, h, w, 1))
.broadcast(Array5(1, 1, 1, 1, class_num));
}
template <typename T>
static void AddAllGradToInputGrad(
Tensor* grad, T loss, const Tensor& pred_x, const Tensor& pred_y,
const Tensor& pred_conf, const Tensor& pred_class, const Tensor& grad_x,
const Tensor& grad_y, const Tensor& grad_w, const Tensor& grad_h,
const Tensor& grad_conf_target, const Tensor& grad_conf_notarget,
const Tensor& grad_class, const int class_num, const float loss_weight_xy,
const float loss_weight_wh, const float loss_weight_conf_target,
const float loss_weight_conf_notarget, const float loss_weight_class) {
const int n = pred_x.dims()[0];
const int an_num = pred_x.dims()[1];
const int h = pred_x.dims()[2];
const int w = pred_x.dims()[3];
const int attr_num = class_num + 5;
auto grad_t = EigenTensor<T, 4>::From(*grad).setConstant(0.0);
auto pred_x_t = EigenTensor<T, 4>::From(pred_x);
auto pred_y_t = EigenTensor<T, 4>::From(pred_y);
auto pred_conf_t = EigenTensor<T, 4>::From(pred_conf);
auto pred_class_t = EigenTensor<T, 5>::From(pred_class);
auto grad_x_t = EigenTensor<T, 4>::From(grad_x);
auto grad_y_t = EigenTensor<T, 4>::From(grad_y);
auto grad_w_t = EigenTensor<T, 4>::From(grad_w);
auto grad_h_t = EigenTensor<T, 4>::From(grad_h);
auto grad_conf_target_t = EigenTensor<T, 4>::From(grad_conf_target);
auto grad_conf_notarget_t = EigenTensor<T, 4>::From(grad_conf_notarget);
auto grad_class_t = EigenTensor<T, 5>::From(grad_class);
for (int i = 0; i < n; i++) {
for (int j = 0; j < an_num; j++) {
for (int k = 0; k < h; k++) {
for (int l = 0; l < w; l++) {
grad_t(i, j * attr_num, k, l) =
grad_x_t(i, j, k, l) * pred_x_t(i, j, k, l) *
(1.0 - pred_x_t(i, j, k, l)) * loss * loss_weight_xy;
grad_t(i, j * attr_num + 1, k, l) =
grad_y_t(i, j, k, l) * pred_y_t(i, j, k, l) *
(1.0 - pred_y_t(i, j, k, l)) * loss * loss_weight_xy;
grad_t(i, j * attr_num + 2, k, l) =
grad_w_t(i, j, k, l) * loss * loss_weight_wh;
grad_t(i, j * attr_num + 3, k, l) =
grad_h_t(i, j, k, l) * loss * loss_weight_wh;
grad_t(i, j * attr_num + 4, k, l) =
grad_conf_target_t(i, j, k, l) * pred_conf_t(i, j, k, l) *
(1.0 - pred_conf_t(i, j, k, l)) * loss * loss_weight_conf_target;
grad_t(i, j * attr_num + 4, k, l) +=
grad_conf_notarget_t(i, j, k, l) * pred_conf_t(i, j, k, l) *
(1.0 - pred_conf_t(i, j, k, l)) * loss *
loss_weight_conf_notarget;
for (int c = 0; c < class_num; c++) {
grad_t(i, j * attr_num + 5 + c, k, l) =
grad_class_t(i, j, k, l, c) * pred_class_t(i, j, k, l, c) *
(1.0 - pred_class_t(i, j, k, l, c)) * loss * loss_weight_class;
}
}
}
}
}
}
template <typename T>
class Yolov3LossKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<Tensor>("X");
auto* gt_box = ctx.Input<Tensor>("GTBox");
auto* gt_label = ctx.Input<Tensor>("GTLabel");
auto* loss = ctx.Output<Tensor>("Loss");
auto anchors = ctx.Attr<std::vector<int>>("anchors");
int class_num = ctx.Attr<int>("class_num");
float ignore_thresh = ctx.Attr<float>("ignore_thresh");
float loss_weight_xy = ctx.Attr<float>("loss_weight_xy");
float loss_weight_wh = ctx.Attr<float>("loss_weight_wh");
float loss_weight_conf_target = ctx.Attr<float>("loss_weight_conf_target");
float loss_weight_conf_notarget =
ctx.Attr<float>("loss_weight_conf_notarget");
float loss_weight_class = ctx.Attr<float>("loss_weight_class");
const int n = input->dims()[0];
const int h = input->dims()[2];
const int w = input->dims()[3];
const int an_num = anchors.size() / 2;
Tensor pred_x, pred_y, pred_w, pred_h;
Tensor pred_conf, pred_class;
pred_x.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
pred_y.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
pred_w.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
pred_h.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
pred_conf.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
pred_class.mutable_data<T>({n, an_num, h, w, class_num}, ctx.GetPlace());
CalcPredResult<T>(*input, &pred_conf, &pred_class, &pred_x, &pred_y,
&pred_w, &pred_h, an_num, class_num);
Tensor obj_mask, noobj_mask;
Tensor tx, ty, tw, th, tconf, tclass;
obj_mask.mutable_data<int>({n, an_num, h, w}, ctx.GetPlace());
noobj_mask.mutable_data<int>({n, an_num, h, w}, ctx.GetPlace());
tx.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
ty.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
tw.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
th.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
tconf.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
tclass.mutable_data<T>({n, an_num, h, w, class_num}, ctx.GetPlace());
PreProcessGTBox<T>(*gt_box, *gt_label, ignore_thresh, anchors, h, &obj_mask,
&noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass);
Tensor obj_mask_expand;
obj_mask_expand.mutable_data<int>({n, an_num, h, w, class_num},
ctx.GetPlace());
ExpandObjMaskByClassNum(&obj_mask_expand, obj_mask);
T loss_x = CalcMSEWithMask<T>(pred_x, tx, obj_mask);
T loss_y = CalcMSEWithMask<T>(pred_y, ty, obj_mask);
T loss_w = CalcMSEWithMask<T>(pred_w, tw, obj_mask);
T loss_h = CalcMSEWithMask<T>(pred_h, th, obj_mask);
T loss_conf_target = CalcBCEWithMask<T>(pred_conf, tconf, obj_mask);
T loss_conf_notarget = CalcBCEWithMask<T>(pred_conf, tconf, noobj_mask);
T loss_class = CalcBCEWithMask<T>(pred_class, tclass, obj_mask_expand);
auto* loss_data = loss->mutable_data<T>({1}, ctx.GetPlace());
loss_data[0] = loss_weight_xy * (loss_x + loss_y) +
loss_weight_wh * (loss_w + loss_h) +
loss_weight_conf_target * loss_conf_target +
loss_weight_conf_notarget * loss_conf_notarget +
loss_weight_class * loss_class;
}
};
template <typename T>
class Yolov3LossGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<Tensor>("X");
auto* gt_box = ctx.Input<Tensor>("GTBox");
auto* gt_label = ctx.Input<Tensor>("GTLabel");
auto anchors = ctx.Attr<std::vector<int>>("anchors");
int class_num = ctx.Attr<int>("class_num");
float ignore_thresh = ctx.Attr<float>("ignore_thresh");
auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
const T loss = output_grad->data<T>()[0];
float loss_weight_xy = ctx.Attr<float>("loss_weight_xy");
float loss_weight_wh = ctx.Attr<float>("loss_weight_wh");
float loss_weight_conf_target = ctx.Attr<float>("loss_weight_conf_target");
float loss_weight_conf_notarget =
ctx.Attr<float>("loss_weight_conf_notarget");
float loss_weight_class = ctx.Attr<float>("loss_weight_class");
const int n = input->dims()[0];
const int c = input->dims()[1];
const int h = input->dims()[2];
const int w = input->dims()[3];
const int an_num = anchors.size() / 2;
Tensor pred_x, pred_y, pred_w, pred_h;
Tensor pred_conf, pred_class;
pred_x.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
pred_y.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
pred_w.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
pred_h.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
pred_conf.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
pred_class.mutable_data<T>({n, an_num, h, w, class_num}, ctx.GetPlace());
CalcPredResult<T>(*input, &pred_conf, &pred_class, &pred_x, &pred_y,
&pred_w, &pred_h, an_num, class_num);
Tensor obj_mask, noobj_mask;
Tensor tx, ty, tw, th, tconf, tclass;
obj_mask.mutable_data<int>({n, an_num, h, w}, ctx.GetPlace());
noobj_mask.mutable_data<int>({n, an_num, h, w}, ctx.GetPlace());
tx.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
ty.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
tw.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
th.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
tconf.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
tclass.mutable_data<T>({n, an_num, h, w, class_num}, ctx.GetPlace());
PreProcessGTBox<T>(*gt_box, *gt_label, ignore_thresh, anchors, h, &obj_mask,
&noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass);
Tensor obj_mask_expand;
obj_mask_expand.mutable_data<int>({n, an_num, h, w, class_num},
ctx.GetPlace());
ExpandObjMaskByClassNum(&obj_mask_expand, obj_mask);
Tensor grad_x, grad_y, grad_w, grad_h;
Tensor grad_conf_target, grad_conf_notarget, grad_class;
grad_x.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
grad_y.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
grad_w.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
grad_h.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
grad_conf_target.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
grad_conf_notarget.mutable_data<T>({n, an_num, h, w}, ctx.GetPlace());
grad_class.mutable_data<T>({n, an_num, h, w, class_num}, ctx.GetPlace());
T obj_mf = CalcMaskPointNum<int>(obj_mask);
T noobj_mf = CalcMaskPointNum<int>(noobj_mask);
T obj_expand_mf = CalcMaskPointNum<int>(obj_mask_expand);
CalcMSEGradWithMask<T>(&grad_x, pred_x, tx, obj_mask, obj_mf);
CalcMSEGradWithMask<T>(&grad_y, pred_y, ty, obj_mask, obj_mf);
CalcMSEGradWithMask<T>(&grad_w, pred_w, tw, obj_mask, obj_mf);
CalcMSEGradWithMask<T>(&grad_h, pred_h, th, obj_mask, obj_mf);
CalcBCEGradWithMask<T>(&grad_conf_target, pred_conf, tconf, obj_mask,
obj_mf);
CalcBCEGradWithMask<T>(&grad_conf_notarget, pred_conf, tconf, noobj_mask,
noobj_mf);
CalcBCEGradWithMask<T>(&grad_class, pred_class, tclass, obj_mask_expand,
obj_expand_mf);
input_grad->mutable_data<T>({n, c, h, w}, ctx.GetPlace());
AddAllGradToInputGrad<T>(
input_grad, loss, pred_x, pred_y, pred_conf, pred_class, grad_x, grad_y,
grad_w, grad_h, grad_conf_target, grad_conf_notarget, grad_class,
class_num, loss_weight_xy, loss_weight_wh, loss_weight_conf_target,
loss_weight_conf_notarget, loss_weight_class);
}
};
} // namespace operators
} // namespace paddle
...@@ -123,7 +123,6 @@ size_t CUDAPinnedMaxChunkSize() { ...@@ -123,7 +123,6 @@ size_t CUDAPinnedMaxChunkSize() {
return CUDAPinnedMaxAllocSize() / 256; return CUDAPinnedMaxAllocSize() / 256;
} }
namespace jit {
#ifdef PADDLE_WITH_XBYAK #ifdef PADDLE_WITH_XBYAK
static Xbyak::util::Cpu cpu; static Xbyak::util::Cpu cpu;
bool MayIUse(const cpu_isa_t cpu_isa) { bool MayIUse(const cpu_isa_t cpu_isa) {
...@@ -165,6 +164,5 @@ bool MayIUse(const cpu_isa_t cpu_isa) { ...@@ -165,6 +164,5 @@ bool MayIUse(const cpu_isa_t cpu_isa) {
} }
#endif #endif
} // namespace jit
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -39,7 +39,6 @@ size_t CUDAPinnedMinChunkSize(); ...@@ -39,7 +39,6 @@ size_t CUDAPinnedMinChunkSize();
//! Get the maximum chunk size for buddy allocator. //! Get the maximum chunk size for buddy allocator.
size_t CUDAPinnedMaxChunkSize(); size_t CUDAPinnedMaxChunkSize();
namespace jit {
typedef enum { typedef enum {
isa_any, isa_any,
sse42, sse42,
...@@ -55,7 +54,5 @@ typedef enum { ...@@ -55,7 +54,5 @@ typedef enum {
// May I use some instruction // May I use some instruction
bool MayIUse(const cpu_isa_t cpu_isa); bool MayIUse(const cpu_isa_t cpu_isa);
} // namespace jit
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -143,7 +143,7 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, ...@@ -143,7 +143,7 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: { case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: {
auto *kernel = auto *kernel =
reinterpret_cast<const CUpti_ActivityKernel3 *>(record); reinterpret_cast<const CUpti_ActivityKernel3 *>(record);
tracer->AddKernelRecords(kernel->start, kernel->end, tracer->AddKernelRecords(kernel->name, kernel->start, kernel->end,
kernel->deviceId, kernel->streamId, kernel->deviceId, kernel->streamId,
kernel->correlationId); kernel->correlationId);
break; break;
...@@ -224,8 +224,9 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -224,8 +224,9 @@ class DeviceTracerImpl : public DeviceTracer {
stream_id, correlation_id, bytes}); stream_id, correlation_id, bytes});
} }
void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id, void AddKernelRecords(std::string name, uint64_t start, uint64_t end,
int64_t stream_id, uint32_t correlation_id) { int64_t device_id, int64_t stream_id,
uint32_t correlation_id) {
// 0 means timestamp information could not be collected for the kernel. // 0 means timestamp information could not be collected for the kernel.
if (start == 0 || end == 0) { if (start == 0 || end == 0) {
VLOG(3) << correlation_id << " cannot be traced"; VLOG(3) << correlation_id << " cannot be traced";
...@@ -233,7 +234,7 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -233,7 +234,7 @@ class DeviceTracerImpl : public DeviceTracer {
} }
std::lock_guard<std::mutex> l(trace_mu_); std::lock_guard<std::mutex> l(trace_mu_);
kernel_records_.push_back( kernel_records_.push_back(
KernelRecord{start, end, device_id, stream_id, correlation_id}); KernelRecord{name, start, end, device_id, stream_id, correlation_id});
} }
bool IsEnabled() { bool IsEnabled() {
...@@ -276,13 +277,13 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -276,13 +277,13 @@ class DeviceTracerImpl : public DeviceTracer {
profile_pb.set_start_ns(start_ns_); profile_pb.set_start_ns(start_ns_);
profile_pb.set_end_ns(end_ns_); profile_pb.set_end_ns(end_ns_);
for (const KernelRecord &r : kernel_records_) { for (const KernelRecord &r : kernel_records_) {
if (correlations_.find(r.correlation_id) == correlations_.end()) {
fprintf(stderr, "cannot relate a kernel activity\n");
continue;
}
auto *event = profile_pb.add_events(); auto *event = profile_pb.add_events();
event->set_type(proto::Event::GPUKernel); event->set_type(proto::Event::GPUKernel);
event->set_name(correlations_.at(r.correlation_id)); if (correlations_.find(r.correlation_id) != correlations_.end()) {
event->set_name(correlations_.at(r.correlation_id));
} else {
event->set_name(r.name);
}
event->set_start_ns(r.start_ns); event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns); event->set_end_ns(r.end_ns);
event->set_sub_device_id(r.stream_id); event->set_sub_device_id(r.stream_id);
......
...@@ -39,6 +39,7 @@ inline uint64_t PosixInNsec() { ...@@ -39,6 +39,7 @@ inline uint64_t PosixInNsec() {
class DeviceTracer { class DeviceTracer {
public: public:
struct KernelRecord { struct KernelRecord {
std::string name;
uint64_t start_ns; uint64_t start_ns;
uint64_t end_ns; uint64_t end_ns;
int64_t device_id; int64_t device_id;
...@@ -84,8 +85,9 @@ class DeviceTracer { ...@@ -84,8 +85,9 @@ class DeviceTracer {
// Add a cuda kernel stats. `correlation_id` will be mapped to annotation // Add a cuda kernel stats. `correlation_id` will be mapped to annotation
// added before for human readability. // added before for human readability.
virtual void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id, virtual void AddKernelRecords(std::string name, uint64_t start, uint64_t end,
int64_t stream_id, uint32_t correlation_id) = 0; int64_t device_id, int64_t stream_id,
uint32_t correlation_id) = 0;
// Generate a proto after done (Disabled). // Generate a proto after done (Disabled).
virtual proto::Profile GenProfile(const std::string& profile_path) = 0; virtual proto::Profile GenProfile(const std::string& profile_path) = 0;
......
...@@ -125,8 +125,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name); ...@@ -125,8 +125,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(cudnnRNNBackwardWeights); \ __macro(cudnnRNNBackwardWeights); \
__macro(cudnnRNNForwardInference); \ __macro(cudnnRNNForwardInference); \
__macro(cudnnDestroyDropoutDescriptor); \ __macro(cudnnDestroyDropoutDescriptor); \
__macro(cudnnDestroyRNNDescriptor); \ __macro(cudnnDestroyRNNDescriptor);
__macro(cudnnSetRNNDescriptor_v6);
CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
...@@ -165,6 +164,12 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) ...@@ -165,6 +164,12 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif #endif
// APIs in R6
#if CUDNN_VERSION >= 6000
#define CUDNN_DNN_ROUTINE_EACH_R6(__macro) __macro(cudnnSetRNNDescriptor_v6);
CUDNN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#endif
#if CUDNN_VERSION >= 7001 #if CUDNN_VERSION >= 7001
#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \ #define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \
__macro(cudnnSetConvolutionGroupCount); \ __macro(cudnnSetConvolutionGroupCount); \
......
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/string/split.h"
#ifndef _WIN32 #ifndef _WIN32
constexpr static float fraction_of_gpu_memory_to_use = 0.92f; constexpr static float fraction_of_gpu_memory_to_use = 0.92f;
...@@ -45,6 +46,15 @@ DEFINE_bool( ...@@ -45,6 +46,15 @@ DEFINE_bool(
"input and output must be half precision) and recurrent neural networks " "input and output must be half precision) and recurrent neural networks "
"(RNNs)."); "(RNNs).");
DEFINE_string(selected_gpus, "",
"A list of device ids separated by comma, like: 0,1,2,3. "
"This option is useful when doing multi process training and "
"each process have only one device (GPU). If you want to use "
"all visible devices, set this to empty string. NOTE: the "
"reason of doing this is that we want to use P2P communication"
"between GPU devices, use CUDA_VISIBLE_DEVICES can only use"
"share-memory only.");
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -121,6 +131,24 @@ int GetCurrentDeviceId() { ...@@ -121,6 +131,24 @@ int GetCurrentDeviceId() {
return device_id; return device_id;
} }
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedDevices() {
// use user specified GPUs in single-node multi-process mode.
std::vector<int> devices;
if (!FLAGS_selected_gpus.empty()) {
auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ',');
for (auto id : devices_str) {
devices.push_back(atoi(id.c_str()));
}
} else {
int count = GetCUDADeviceCount();
for (int i = 0; i < count; ++i) {
devices.push_back(i);
}
}
return devices;
}
void SetDeviceId(int id) { void SetDeviceId(int id) {
// TODO(qijun): find a better way to cache the cuda device count // TODO(qijun): find a better way to cache the cuda device count
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
......
...@@ -19,6 +19,7 @@ limitations under the License. */ ...@@ -19,6 +19,7 @@ limitations under the License. */
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <stddef.h> #include <stddef.h>
#include <string> #include <string>
#include <vector>
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -47,6 +48,9 @@ int GetCUDAMaxThreadsPerMultiProcessor(int i); ...@@ -47,6 +48,9 @@ int GetCUDAMaxThreadsPerMultiProcessor(int i);
//! Get the current GPU device id in system. //! Get the current GPU device id in system.
int GetCurrentDeviceId(); int GetCurrentDeviceId();
//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedDevices();
//! Set the GPU device id for next execution. //! Set the GPU device id for next execution.
void SetDeviceId(int device_id); void SetDeviceId(int device_id);
......
...@@ -19,6 +19,7 @@ limitations under the License. */ ...@@ -19,6 +19,7 @@ limitations under the License. */
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/string/split.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/cuda_device_guard.h"
#endif #endif
...@@ -82,10 +83,8 @@ void InitDevices(bool init_p2p) { ...@@ -82,10 +83,8 @@ void InitDevices(bool init_p2p) {
std::vector<int> devices; std::vector<int> devices;
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
try { try {
int count = platform::GetCUDADeviceCount(); // use user specified GPUs in single-node multi-process mode.
for (int i = 0; i < count; ++i) { devices = platform::GetSelectedDevices();
devices.push_back(i);
}
} catch (const std::exception &exp) { } catch (const std::exception &exp) {
LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime."; LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime.";
} }
...@@ -95,20 +94,15 @@ void InitDevices(bool init_p2p) { ...@@ -95,20 +94,15 @@ void InitDevices(bool init_p2p) {
void InitDevices(bool init_p2p, const std::vector<int> devices) { void InitDevices(bool init_p2p, const std::vector<int> devices) {
std::vector<platform::Place> places; std::vector<platform::Place> places;
int count = 0;
#ifdef PADDLE_WITH_CUDA
try {
count = platform::GetCUDADeviceCount();
} catch (const std::exception &exp) {
LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime.";
}
#endif
for (size_t i = 0; i < devices.size(); ++i) { for (size_t i = 0; i < devices.size(); ++i) {
if (devices[i] >= count || devices[i] < 0) { // In multi process multi gpu mode, we may have gpuid = 7
// but count = 1.
if (devices[i] < 0) {
LOG(WARNING) << "Invalid devices id."; LOG(WARNING) << "Invalid devices id.";
continue; continue;
} }
places.emplace_back(platform::CUDAPlace(devices[i])); places.emplace_back(platform::CUDAPlace(devices[i]));
} }
if (init_p2p) { if (init_p2p) {
...@@ -122,7 +116,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) { ...@@ -122,7 +116,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
#endif #endif
#if !defined(_WIN32) && !defined(__APPLE__) && !defined(__OSX__) #if !defined(_WIN32) && !defined(__APPLE__) && !defined(__OSX__)
if (platform::jit::MayIUse(platform::jit::avx)) { if (platform::MayIUse(platform::avx)) {
#ifndef __AVX__ #ifndef __AVX__
LOG(WARNING) << "AVX is available, Please re-compile on local machine"; LOG(WARNING) << "AVX is available, Please re-compile on local machine";
#endif #endif
...@@ -137,10 +131,10 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) { ...@@ -137,10 +131,10 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
" version or compile from source code." " version or compile from source code."
#ifdef __AVX512F__ #ifdef __AVX512F__
if (!platform::jit::MayIUse(platform::jit::avx512f)) { if (!platform::MayIUse(platform::avx512f)) {
if (platform::jit::MayIUse(platform::jit::avx2)) { if (platform::MayIUse(platform::avx2)) {
AVX_GUIDE(AVX512, AVX2); AVX_GUIDE(AVX512, AVX2);
} else if (platform::jit::MayIUse(platform::jit::avx)) { } else if (platform::MayIUse(platform::avx)) {
AVX_GUIDE(AVX512, AVX); AVX_GUIDE(AVX512, AVX);
} else { } else {
AVX_GUIDE(AVX512, NonAVX); AVX_GUIDE(AVX512, NonAVX);
...@@ -149,8 +143,8 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) { ...@@ -149,8 +143,8 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
#endif #endif
#ifdef __AVX2__ #ifdef __AVX2__
if (!platform::jit::MayIUse(platform::jit::avx2)) { if (!platform::MayIUse(platform::avx2)) {
if (platform::jit::MayIUse(platform::jit::avx)) { if (platform::MayIUse(platform::avx)) {
AVX_GUIDE(AVX2, AVX); AVX_GUIDE(AVX2, AVX);
} else { } else {
AVX_GUIDE(AVX2, NonAVX); AVX_GUIDE(AVX2, NonAVX);
...@@ -159,7 +153,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) { ...@@ -159,7 +153,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
#endif #endif
#ifdef __AVX__ #ifdef __AVX__
if (!platform::jit::MayIUse(platform::jit::avx)) { if (!platform::MayIUse(platform::avx)) {
AVX_GUIDE(AVX, NonAVX); AVX_GUIDE(AVX, NonAVX);
} }
#endif #endif
......
...@@ -113,6 +113,18 @@ inline mkldnn::memory::format MKLDNNFormatForSize( ...@@ -113,6 +113,18 @@ inline mkldnn::memory::format MKLDNNFormatForSize(
return mkldnn::memory::format::x; return mkldnn::memory::format::x;
} else if (dims_size == 2) { } else if (dims_size == 2) {
return mkldnn::memory::format::nc; return mkldnn::memory::format::nc;
} else if (dims_size == 3) {
if (data_format == mkldnn::memory::format::nchw) {
return mkldnn::memory::format::ncw;
} else if (data_format == mkldnn::memory::format::nhwc) {
return mkldnn::memory::format::nwc;
}
} else if (dims_size == 5) {
if (data_format == mkldnn::memory::format::nchw) {
return mkldnn::memory::format::ncdhw;
} else if (data_format == mkldnn::memory::format::nhwc) {
return mkldnn::memory::format::ndhwc;
}
} }
return data_format; return data_format;
} }
......
...@@ -97,7 +97,7 @@ struct NCCLContextMap { ...@@ -97,7 +97,7 @@ struct NCCLContextMap {
order_.size(), contexts_.size(), order_.size(), contexts_.size(),
"NCCL Context Map does not support contain two or more same device"); "NCCL Context Map does not support contain two or more same device");
if (places.size() <= 1) { if (places.size() <= 1 && num_trainers == 1) {
return; return;
} }
std::unique_ptr<ncclComm_t[]> comms(new ncclComm_t[order_.size()]); std::unique_ptr<ncclComm_t[]> comms(new ncclComm_t[order_.size()]);
...@@ -111,12 +111,19 @@ struct NCCLContextMap { ...@@ -111,12 +111,19 @@ struct NCCLContextMap {
{ {
int nranks = num_trainers * order_.size(); int nranks = num_trainers * order_.size();
NCCLGroupGuard gurad; NCCLGroupGuard gurad;
for (auto &gpu_id : order_) { for (size_t i = 0; i < order_.size(); ++i) {
int rank = trainer_id * order_.size() + gpu_id; int gpu_id = order_[i];
VLOG(3) << "init nccl rank: " << rank << " nranks: " << nranks; int rank;
if (order_.size() > 1) {
rank = trainer_id * order_.size() + i;
} else {
rank = trainer_id;
}
VLOG(30) << "init nccl rank: " << rank << " nranks: " << nranks
<< "gpu id: " << gpu_id;
PADDLE_ENFORCE(cudaSetDevice(gpu_id)); PADDLE_ENFORCE(cudaSetDevice(gpu_id));
PADDLE_ENFORCE(platform::dynload::ncclCommInitRank( PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
comms.get() + gpu_id, nranks, *nccl_id, rank)); comms.get() + i, nranks, *nccl_id, rank));
} }
} }
} }
......
...@@ -3,3 +3,4 @@ cc_library(pretty_log SRCS pretty_log.cc) ...@@ -3,3 +3,4 @@ cc_library(pretty_log SRCS pretty_log.cc)
cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags) cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags)
cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags) cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags)
cc_test(to_string_test SRCS to_string_test.cc) cc_test(to_string_test SRCS to_string_test.cc)
cc_test(split_test SRCS split_test.cc)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <sstream>
#include <string>
#include <vector>
namespace paddle {
namespace string {
static inline std::vector<std::string> Split(std::string const& original,
char separator) {
std::vector<std::string> results;
std::string token;
std::istringstream is(original);
while (std::getline(is, token, separator)) {
if (!token.empty()) {
results.push_back(token);
}
}
return results;
}
} // namespace string
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/string/split.h"
#include <string>
#include "gtest/gtest.h"
TEST(StringSplit, StringSplit) {
std::string to_split = "0,1,2,3,4,5";
int i = 0;
for (auto s : paddle::string::Split(to_split, ',')) {
EXPECT_EQ(atoi(s.c_str()), i);
i++;
}
}
...@@ -437,7 +437,7 @@ EOF ...@@ -437,7 +437,7 @@ EOF
export http_proxy= export http_proxy=
export https_proxy= export https_proxy=
# TODO: jiabin need to refine this part when these tests fixed on mac # TODO: jiabin need to refine this part when these tests fixed on mac
ctest --output-on-failure -j $1 ctest --output-on-failure -j $2
# make install should also be test when unittest # make install should also be test when unittest
make install -j 8 make install -j 8
if [ "$1" == "cp27-cp27m" ]; then if [ "$1" == "cp27-cp27m" ]; then
...@@ -449,7 +449,7 @@ EOF ...@@ -449,7 +449,7 @@ EOF
elif [ "$1" == "cp37-cp37m" ]; then elif [ "$1" == "cp37-cp37m" ]; then
pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
fi fi
if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
paddle version paddle version
fi fi
...@@ -472,12 +472,15 @@ function assert_api_not_changed() { ...@@ -472,12 +472,15 @@ function assert_api_not_changed() {
virtualenv .env virtualenv .env
source .env/bin/activate source .env/bin/activate
pip install ${PADDLE_ROOT}/build/python/dist/*whl pip install ${PADDLE_ROOT}/build/python/dist/*whl
python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid,paddle.reader > new.spec
if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ]; then if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ]; then
# Use sed to make python2 and python3 sepc keeps the same # Use sed to make python2 and python3 sepc keeps the same
sed -i 's/arg0: str/arg0: unicode/g' new.spec sed -i 's/arg0: str/arg0: unicode/g' new.spec
sed -i "s/\(.*Transpiler.*\).__init__ ArgSpec(args=\['self'].*/\1.__init__ /g" new.spec sed -i "s/\(.*Transpiler.*\).__init__ ArgSpec(args=\['self'].*/\1.__init__ /g" new.spec
fi fi
# ComposeNotAligned has significant difference between py2 and py3
sed -i '/.*ComposeNotAligned.*/d' new.spec
python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API.spec new.spec python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API.spec new.spec
deactivate deactivate
} }
...@@ -487,7 +490,19 @@ function assert_api_spec_approvals() { ...@@ -487,7 +490,19 @@ function assert_api_spec_approvals() {
BRANCH="develop" BRANCH="develop"
fi fi
API_FILES=("paddle/fluid/API.spec" "paddle/fluid/framework/operator.h") API_FILES=("paddle/fluid/API.spec"
"paddle/fluid/framework/operator.h"
"paddle/fluid/framework/tensor.h"
"paddle/fluid/framework/lod_tensor.h"
"paddle/fluid/framework/selected_rows.h"
"paddle/fluid/framework/op_desc.h"
"paddle/fluid/framework/block_desc.h"
"paddle/fluid/framework/var_desc.h"
"paddle/fluid/framework/scope.h"
"paddle/fluid/framework/ir/node.h"
"paddle/fluid/framework/ir/graph.h"
"paddle/fluid/framework/framework.proto"
"paddle/fluid/operators/distributed/send_recv.proto.in")
for API_FILE in ${API_FILES[*]}; do for API_FILE in ${API_FILES[*]}; do
API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "${API_FILE}" || true` API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "${API_FILE}" || true`
echo "checking ${API_FILE} change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}" echo "checking ${API_FILE} change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}"
...@@ -901,7 +916,7 @@ function main() { ...@@ -901,7 +916,7 @@ function main() {
maccheck) maccheck)
cmake_gen ${PYTHON_ABI:-""} cmake_gen ${PYTHON_ABI:-""}
build_mac build_mac
run_mac_test ${PROC_RUN:-1} run_mac_test ${PYTHON_ABI:-""} ${PROC_RUN:-1}
;; ;;
macbuild) macbuild)
cmake_gen ${PYTHON_ABI:-""} cmake_gen ${PYTHON_ABI:-""}
......
...@@ -147,7 +147,7 @@ def __bootstrap__(): ...@@ -147,7 +147,7 @@ def __bootstrap__():
read_env_flags += [ read_env_flags += [
'fraction_of_gpu_memory_to_use', 'cudnn_deterministic', 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic',
'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit',
'cudnn_exhaustive_search' 'cudnn_exhaustive_search', 'selected_gpus'
] ]
core.init_gflags([sys.argv[0]] + core.init_gflags([sys.argv[0]] +
["--tryfromenv=" + ",".join(read_env_flags)]) ["--tryfromenv=" + ",".join(read_env_flags)])
......
...@@ -134,12 +134,12 @@ class GradientClipByValue(BaseGradientClipAttr): ...@@ -134,12 +134,12 @@ class GradientClipByValue(BaseGradientClipAttr):
Examples: Examples:
.. code-block:: python .. code-block:: python
w_param_attrs = ParamAttr(name=None, w_param_attrs = fluid.ParamAttr(name=None,
initializer=UniformInitializer(low=-1.0, high=1.0, seed=0), initializer=fluid.initializer.UniformInitializer(low=-1.0, high=1.0, seed=0),
learning_rate=1.0, learning_rate=1.0,
regularizer=L1Decay(1.0), regularizer=fluid.regularizer.L1Decay(1.0),
trainable=True, trainable=True,
clip=GradientClipByValue(-1.0, 1.0)) clip=fluid.clip.GradientClipByValue(-1.0, 1.0))
y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs) y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs)
""" """
...@@ -185,12 +185,12 @@ class GradientClipByNorm(BaseGradientClipAttr): ...@@ -185,12 +185,12 @@ class GradientClipByNorm(BaseGradientClipAttr):
Examples: Examples:
.. code-block:: python .. code-block:: python
w_param_attrs = ParamAttr(name=None, w_param_attrs = flui.ParamAttr(name=None,
initializer=UniformInitializer(low=-1.0, high=1.0, seed=0), initializer=fluid.initializer.UniformInitializer(low=-1.0, high=1.0, seed=0),
learning_rate=1.0, learning_rate=1.0,
regularizer=L1Decay(1.0), regularizer=fluid.regularizer.L1Decay(1.0),
trainable=True, trainable=True,
clip=GradientClipByNorm(clip_norm=2.0)) clip=fluid.clip.GradientClipByNorm(clip_norm=2.0))
y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs) y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs)
""" """
...@@ -271,7 +271,12 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): ...@@ -271,7 +271,12 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
"All parameters' 'clip_norm' of a same group should be the same" "All parameters' 'clip_norm' of a same group should be the same"
) )
square = grad * grad merge_grad = grad
if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
merge_grad = layers.merge_selected_rows(grad)
merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
square = layers.square(merge_grad)
local_norm_var = layers.reduce_sum(input=square) local_norm_var = layers.reduce_sum(input=square)
context[self.group_name].append(local_norm_var) context[self.group_name].append(local_norm_var)
...@@ -292,6 +297,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): ...@@ -292,6 +297,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr):
new_grad = layers.elementwise_mul( new_grad = layers.elementwise_mul(
x=grad, y=self.context[group_scale_name]) x=grad, y=self.context[group_scale_name])
return param, new_grad return param, new_grad
......
...@@ -20,7 +20,7 @@ import six ...@@ -20,7 +20,7 @@ import six
from .framework import Program, default_main_program, Variable from .framework import Program, default_main_program, Variable
from . import core from . import core
__all__ = ['Executor', 'global_scope', 'scope_guard', '_switch_scope'] __all__ = ['Executor', 'global_scope', 'scope_guard']
g_scope = core.Scope() g_scope = core.Scope()
...@@ -407,16 +407,17 @@ class Executor(object): ...@@ -407,16 +407,17 @@ class Executor(object):
Examples: Examples:
>>> data = layers.data(name='X', shape=[1], dtype='float32') >>> data = fluid.layers.data(name='X', shape=[1], dtype='float32')
>>> hidden = layers.fc(input=data, size=10) >>> out = fluid.layers.create_tensor(dtype='float32')
>>> layers.assign(hidden, out) >>> hidden = fluid.layers.fc(input=data, size=10)
>>> loss = layers.mean(out) >>> fluid.layers.assign(hidden,out)
>>> loss = fluid.layers.mean(out)
>>> adam = fluid.optimizer.Adam() >>> adam = fluid.optimizer.Adam()
>>> adam.minimize(loss) >>> adam.minimize(loss)
>>> cpu = core.CPUPlace() >>> cpu = core.CPUPlace()
>>> exe = Executor(cpu) >>> exe = fluid.Executor(cpu)
>>> exe.run(default_startup_program()) >>> exe.run(fluid.default_startup_program())
>>> x = numpy.random.random(size=(10, 1)).astype('float32') >>> x = numpy.random.random(size=(10, 1)).astype('float32')
>>> outs = exe.run( >>> outs = exe.run(
......
...@@ -89,12 +89,13 @@ def name_scope(prefix=None): ...@@ -89,12 +89,13 @@ def name_scope(prefix=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
with name_scope("encoder"): with name_scope("encoder"):
... ...
with name_scope("decoder"): with name_scope("decoder"):
... ...
with name_scope("attention"): with name_scope("attention"):
... ...
""" """
# TODO(panyx0718): Only [0-9a-z]. # TODO(panyx0718): Only [0-9a-z].
assert prefix, "namescope prefix cannot be empty." assert prefix, "namescope prefix cannot be empty."
......
...@@ -20,6 +20,7 @@ from __future__ import print_function ...@@ -20,6 +20,7 @@ from __future__ import print_function
from .layer_function_generator import generate_layer_fn from .layer_function_generator import generate_layer_fn
from .layer_function_generator import autodoc, templatedoc from .layer_function_generator import autodoc, templatedoc
from ..layer_helper import LayerHelper from ..layer_helper import LayerHelper
from ..framework import Variable
from . import tensor from . import tensor
from . import nn from . import nn
from . import ops from . import ops
...@@ -46,6 +47,7 @@ __all__ = [ ...@@ -46,6 +47,7 @@ __all__ = [
'iou_similarity', 'iou_similarity',
'box_coder', 'box_coder',
'polygon_box_transform', 'polygon_box_transform',
'yolov3_loss',
] ]
...@@ -401,6 +403,113 @@ def polygon_box_transform(input, name=None): ...@@ -401,6 +403,113 @@ def polygon_box_transform(input, name=None):
return output return output
@templatedoc(op_type="yolov3_loss")
def yolov3_loss(x,
gtbox,
gtlabel,
anchors,
class_num,
ignore_thresh,
loss_weight_xy=None,
loss_weight_wh=None,
loss_weight_conf_target=None,
loss_weight_conf_notarget=None,
loss_weight_class=None,
name=None):
"""
${comment}
Args:
x (Variable): ${x_comment}
gtbox (Variable): groud truth boxes, should be in shape of [N, B, 4],
in the third dimenstion, x, y, w, h should be stored
and x, y, w, h should be relative value of input image.
N is the batch number and B is the max box number in
an image.
gtlabel (Variable): class id of ground truth boxes, shoud be ins shape
of [N, B].
anchors (list|tuple): ${anchors_comment}
class_num (int): ${class_num_comment}
ignore_thresh (float): ${ignore_thresh_comment}
loss_weight_xy (float|None): ${loss_weight_xy_comment}
loss_weight_wh (float|None): ${loss_weight_wh_comment}
loss_weight_conf_target (float|None): ${loss_weight_conf_target_comment}
loss_weight_conf_notarget (float|None): ${loss_weight_conf_notarget_comment}
loss_weight_class (float|None): ${loss_weight_class_comment}
name (string): the name of yolov3 loss
Returns:
Variable: A 1-D tensor with shape [1], the value of yolov3 loss
Raises:
TypeError: Input x of yolov3_loss must be Variable
TypeError: Input gtbox of yolov3_loss must be Variable"
TypeError: Input gtlabel of yolov3_loss must be Variable"
TypeError: Attr anchors of yolov3_loss must be list or tuple
TypeError: Attr class_num of yolov3_loss must be an integer
TypeError: Attr ignore_thresh of yolov3_loss must be a float number
Examples:
.. code-block:: python
x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32')
gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32')
gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32')
anchors = [10, 13, 16, 30, 33, 23]
loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80
anchors=anchors, ignore_thresh=0.5)
"""
helper = LayerHelper('yolov3_loss', **locals())
if not isinstance(x, Variable):
raise TypeError("Input x of yolov3_loss must be Variable")
if not isinstance(gtbox, Variable):
raise TypeError("Input gtbox of yolov3_loss must be Variable")
if not isinstance(gtlabel, Variable):
raise TypeError("Input gtlabel of yolov3_loss must be Variable")
if not isinstance(anchors, list) and not isinstance(anchors, tuple):
raise TypeError("Attr anchors of yolov3_loss must be list or tuple")
if not isinstance(class_num, int):
raise TypeError("Attr class_num of yolov3_loss must be an integer")
if not isinstance(ignore_thresh, float):
raise TypeError(
"Attr ignore_thresh of yolov3_loss must be a float number")
if name is None:
loss = helper.create_variable_for_type_inference(dtype=x.dtype)
else:
loss = helper.create_variable(
name=name, dtype=x.dtype, persistable=False)
attrs = {
"anchors": anchors,
"class_num": class_num,
"ignore_thresh": ignore_thresh,
}
if loss_weight_xy is not None and isinstance(loss_weight_xy, float):
self.attrs['loss_weight_xy'] = loss_weight_xy
if loss_weight_wh is not None and isinstance(loss_weight_wh, float):
self.attrs['loss_weight_wh'] = loss_weight_wh
if loss_weight_conf_target is not None and isinstance(
loss_weight_conf_target, float):
self.attrs['loss_weight_conf_target'] = loss_weight_conf_target
if loss_weight_conf_notarget is not None and isinstance(
loss_weight_conf_notarget, float):
self.attrs['loss_weight_conf_notarget'] = loss_weight_conf_notarget
if loss_weight_class is not None and isinstance(loss_weight_class, float):
self.attrs['loss_weight_class'] = loss_weight_class
helper.append_op(
type='yolov3_loss',
inputs={"X": x,
"GTBox": gtbox,
"GTLabel": gtlabel},
outputs={'Loss': loss},
attrs=attrs)
return loss
@templatedoc() @templatedoc()
def detection_map(detect_res, def detection_map(detect_res,
label, label,
......
...@@ -943,7 +943,18 @@ def __create_unshared_decorated_reader__(op_type, reader, attrs, name=None): ...@@ -943,7 +943,18 @@ def __create_unshared_decorated_reader__(op_type, reader, attrs, name=None):
def shuffle(reader, buffer_size): def shuffle(reader, buffer_size):
""" """
Shuffle the reader. Creates a data reader whose data output is shuffled.
Output from the iterator that created by original reader will be
buffered into shuffle buffer, and then shuffled. The size of shuffle buffer
is determined by argument buf_size.
Args:
param reader: the original reader whose output will be shuffled.
type reader: callable
param buf_size: shuffle buffer size.
type buf_size: int
return: the new reader whose output is shuffled.
rtype: callable
""" """
return __create_unshared_decorated_reader__( return __create_unshared_decorated_reader__(
'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)}) 'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)})
......
...@@ -20,7 +20,7 @@ import string ...@@ -20,7 +20,7 @@ import string
from six.moves import cStringIO from six.moves import cStringIO
from ..proto import framework_pb2 from ..proto import framework_pb2
from ..framework import OpProtoHolder, Variable from ..framework import OpProtoHolder, Variable, core, convert_np_dtype_to_dtype_
from ..layer_helper import LayerHelper from ..layer_helper import LayerHelper
__all__ = [ __all__ = [
...@@ -178,6 +178,15 @@ def generate_layer_fn(op_type): ...@@ -178,6 +178,15 @@ def generate_layer_fn(op_type):
"operator {0} must input same dtype. {1} vs {2}".format( "operator {0} must input same dtype. {1} vs {2}".format(
op_type, dtype, each.dtype)) op_type, dtype, each.dtype))
if dtype is None:
arg_dtype = kwargs.get("dtype")
if arg_dtype:
if not isinstance(arg_dtype, core.VarDesc.VarType):
dtype = convert_np_dtype_to_dtype_(arg_dtype)
else:
dtype = arg_dtype
else:
dtype = core.VarDesc.VarType.FP32
return dtype return dtype
def func(*args, **kwargs): def func(*args, **kwargs):
......
...@@ -308,13 +308,9 @@ def piecewise_decay(boundaries, values): ...@@ -308,13 +308,9 @@ def piecewise_decay(boundaries, values):
def append_LARS(params_grads, learning_rate, weight_decay): def append_LARS(params_grads, learning_rate, weight_decay):
"""Applies LARS (LAYER-WISE ADAPTIVE RATE SCALING) to learning rate for """
each layer. Applies LARS (LAYER-WISE ADAPTIVE RATE SCALING) to learning rate for
each layer.
```python
learning_rate *= local_gw_ratio * sqrt(sumsq(param))
/ (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param)))
```
Args: Args:
learning_rate: A learning rate Variable. This learning_rate: A learning rate Variable. This
...@@ -323,6 +319,11 @@ def append_LARS(params_grads, learning_rate, weight_decay): ...@@ -323,6 +319,11 @@ def append_LARS(params_grads, learning_rate, weight_decay):
Returns: Returns:
The decayed learning rate The decayed learning rate
Examples:
.. code-block:: python
learning_rate *= local_gw_ratio * sqrt(sumsq(param))
/ (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param)))
""" """
def _balanced_weight(param_norm, grad_norm): def _balanced_weight(param_norm, grad_norm):
......
...@@ -169,9 +169,13 @@ __all__ = [ ...@@ -169,9 +169,13 @@ __all__ = [
'log_loss', 'log_loss',
'add_position_encoding', 'add_position_encoding',
'bilinear_tensor_product', 'bilinear_tensor_product',
'merge_selected_rows',
'get_tensor_from_selected_rows',
'lstm', 'lstm',
] ]
kIgnoreIndex = -100
def fc(input, def fc(input,
size, size,
...@@ -926,7 +930,7 @@ def dynamic_gru(input, ...@@ -926,7 +930,7 @@ def dynamic_gru(input,
emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim]) emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
hidden_dim = 512 hidden_dim = 512
x = fluid.layers.fc(input=emb, size=hidden_dim * 3) x = fluid.layers.fc(input=emb, size=hidden_dim * 3)
hidden = fluid.layers.dynamic_gru(input=x, dim=hidden_dim) hidden = fluid.layers.dynamic_gru(input=x, size=hidden_dim)
""" """
helper = LayerHelper('gru', **locals()) helper = LayerHelper('gru', **locals())
...@@ -1267,7 +1271,7 @@ def dropout(x, ...@@ -1267,7 +1271,7 @@ def dropout(x,
return out return out
def cross_entropy(input, label, soft_label=False, ignore_index=-100): def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
""" """
**Cross Entropy Layer** **Cross Entropy Layer**
...@@ -1314,7 +1318,7 @@ def cross_entropy(input, label, soft_label=False, ignore_index=-100): ...@@ -1314,7 +1318,7 @@ def cross_entropy(input, label, soft_label=False, ignore_index=-100):
labels. Default: `False`. labels. Default: `False`.
ignore_index (int): Specifies a target value that is ignored and does ignore_index (int): Specifies a target value that is ignored and does
not contribute to the input gradient. Only valid not contribute to the input gradient. Only valid
if soft_label is set to False. Default: -100 if soft_label is set to False. Default: kIgnoreIndex
Returns: Returns:
A 2-D tensor with shape [N x 1], the cross entropy loss. A 2-D tensor with shape [N x 1], the cross entropy loss.
...@@ -3584,6 +3588,7 @@ def beam_search_decode(ids, scores, beam_size, end_id, name=None): ...@@ -3584,6 +3588,7 @@ def beam_search_decode(ids, scores, beam_size, end_id, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
# Suppose `ids` and `scores` are LodTensorArray variables reserving # Suppose `ids` and `scores` are LodTensorArray variables reserving
# the selected ids and scores of all steps # the selected ids and scores of all steps
finished_ids, finished_scores = layers.beam_search_decode( finished_ids, finished_scores = layers.beam_search_decode(
...@@ -5081,7 +5086,7 @@ def im2sequence(input, ...@@ -5081,7 +5086,7 @@ def im2sequence(input,
output.lod = [[4, 4]] output.lod = [[4, 4]]
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -5185,7 +5190,7 @@ def multiplex(inputs, index): ...@@ -5185,7 +5190,7 @@ def multiplex(inputs, index):
def softmax_with_cross_entropy(logits, def softmax_with_cross_entropy(logits,
label, label,
soft_label=False, soft_label=False,
ignore_index=-100, ignore_index=kIgnoreIndex,
numeric_stable_mode=False, numeric_stable_mode=False,
return_softmax=False): return_softmax=False):
""" """
...@@ -5243,7 +5248,7 @@ def softmax_with_cross_entropy(logits, ...@@ -5243,7 +5248,7 @@ def softmax_with_cross_entropy(logits,
labels as soft labels. By default, `soft_label` is set to False. labels as soft labels. By default, `soft_label` is set to False.
ignore_index (int): Specifies a target value that is ignored and does ignore_index (int): Specifies a target value that is ignored and does
not contribute to the input gradient. Only valid not contribute to the input gradient. Only valid
if soft_label is set to False. Default: -100 if soft_label is set to False. Default: kIgnoreIndex
numeric_stable_mode (bool): A flag to indicate whether to use a more numeric_stable_mode (bool): A flag to indicate whether to use a more
numerically stable algorithm. Only valid numerically stable algorithm. Only valid
when soft_label is False and GPU is used. when soft_label is False and GPU is used.
...@@ -5868,24 +5873,23 @@ def pad_constant_like(x, y, pad_value=0., name=None): ...@@ -5868,24 +5873,23 @@ def pad_constant_like(x, y, pad_value=0., name=None):
[[38, 39, 40]], [[38, 39, 40]],
[[41, 42, 43]]]] [[41, 42, 43]]]]
Y.shape = (1, 3, 1, 3) Y.shape = (1, 3, 1, 3)
And
pad_value = -1,
And Return:
pad_value = -1, Out = [[[[35, 36, 37],
[-1, -1, -1]],
Return: [[38, 39, 40],
Out = [[[[35, 36, 37], [-1, -1, -1]],
[-1, -1, -1]], [[41, 42, 43],
[[38, 39, 40], [-1, -1, -1]]],
[-1, -1, -1]], [[[-1, -1, -1],
[[41, 42, 43], [-1, -1, -1]],
[-1, -1, -1]]], [[-1, -1, -1],
[[[-1, -1, -1], [-1, -1, -1]],
[-1, -1, -1]], [[-1, -1, -1],
[[-1, -1, -1], [-1, -1, -1]]]]
[-1, -1, -1]], Out.shape = (2, 3, 2, 3)
[[-1, -1, -1],
[-1, -1, -1]]]]
Out.shape = (2, 3, 2, 3)
Args: Args:
x (Variable): The input tensor variable. x (Variable): The input tensor variable.
...@@ -6124,6 +6128,7 @@ def image_resize(input, ...@@ -6124,6 +6128,7 @@ def image_resize(input,
Supporting resample methods: Supporting resample methods:
'BILINEAR' : Bilinear interpolation 'BILINEAR' : Bilinear interpolation
'NEAREST' : Nearest neighbor interpolation 'NEAREST' : Nearest neighbor interpolation
Args: Args:
...@@ -6779,7 +6784,7 @@ def crop(x, shape=None, offsets=None, name=None): ...@@ -6779,7 +6784,7 @@ def crop(x, shape=None, offsets=None, name=None):
# or # or
z = fluid.layers.data(name="z", shape=[3, 5], dtype="float32") z = fluid.layers.data(name="z", shape=[3, 5], dtype="float32")
crop = fluid.layers.crop(z, shape=[2, 3]) crop = fluid.layers.crop(z, shape=[-1, 2, 3])
""" """
helper = LayerHelper('crop', **locals()) helper = LayerHelper('crop', **locals())
...@@ -7060,39 +7065,40 @@ def pad2d(input, ...@@ -7060,39 +7065,40 @@ def pad2d(input,
than height-1. And the width dimension has the same condition. than height-1. And the width dimension has the same condition.
Example: Example:
.. code-block:: text
Given that X is a channel of image from input: Given that X is a channel of image from input:
X = [[1, 2, 3], X = [[1, 2, 3],
[4, 5, 6]] [4, 5, 6]]
Case 0: Case 0:
paddings = [0, 1, 2, 3], paddings = [0, 1, 2, 3],
mode = 'constant' mode = 'constant'
pad_value = 0 pad_value = 0
Out = [[0, 0, 1, 2, 3, 0, 0, 0] Out = [[0, 0, 1, 2, 3, 0, 0, 0]
[0, 0, 4, 5, 6, 0, 0, 0] [0, 0, 4, 5, 6, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0]] [0, 0, 0, 0, 0, 0, 0, 0]]
Case 1: Case 1:
paddings = [0, 1, 2, 1], paddings = [0, 1, 2, 1],
mode = 'reflect' mode = 'reflect'
Out = [[3, 2, 1, 2, 3, 2] Out = [[3, 2, 1, 2, 3, 2]
[6, 5, 4, 5, 6, 5] [6, 5, 4, 5, 6, 5]
[3, 2, 1, 2, 3, 2]] [3, 2, 1, 2, 3, 2]]
Case 2: Case 2:
paddings = [0, 1, 2, 1], paddings = [0, 1, 2, 1],
mode = 'edge' mode = 'edge'
Out = [[1, 1, 1, 2, 3, 3] Out = [[1, 1, 1, 2, 3, 3]
[4, 4, 4, 5, 6, 6] [4, 4, 4, 5, 6, 6]
[4, 4, 4, 5, 6, 6]] [4, 4, 4, 5, 6, 6]]
Args: Args:
...@@ -7330,13 +7336,13 @@ def prelu(x, mode, param_attr=None, name=None): ...@@ -7330,13 +7336,13 @@ def prelu(x, mode, param_attr=None, name=None):
Args: Args:
x (Variable): The input tensor. x (Variable): The input tensor.
param_attr(ParamAttr|None): The parameter attribute for the learnable param_attr(ParamAttr|None): The parameter attribute for the learnable
weight (alpha). weight (alpha).
mode (string): The mode for weight sharing. It supports all, channel mode (string): The mode for weight sharing. It supports all, channel
and element. all: all elements share same weight and element. all: all elements share same weight
channel:elements in a channel share same weight channel:elements in a channel share same weight
element:each element has a weight element:each element has a weight
name(str|None): A name for this layer(optional). If set None, the layer name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically. will be named automatically.
Returns: Returns:
Variable: The output tensor with the same shape as input. Variable: The output tensor with the same shape as input.
...@@ -8378,6 +8384,29 @@ def mean(x, name=None): ...@@ -8378,6 +8384,29 @@ def mean(x, name=None):
return out return out
@templatedoc()
def merge_selected_rows(x, name=None):
"""
${comment}
Args:
x(${x_type}): ${x_comment}
name(basestring|None): Name of the output.
Returns:
out(${out_type}): ${out_comment}
"""
helper = LayerHelper("merge_selected_rows", **locals())
out = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op(
type="merge_selected_rows",
inputs={"X": x},
attrs={},
outputs={"Out": out})
return out
@templatedoc() @templatedoc()
def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None): def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None):
""" """
...@@ -8415,13 +8444,17 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None): ...@@ -8415,13 +8444,17 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None):
@templatedoc() @templatedoc()
def sigmoid_cross_entropy_with_logits(x, label, name=None): def sigmoid_cross_entropy_with_logits(x,
label,
ignore_index=kIgnoreIndex,
name=None):
""" """
${comment} ${comment}
Args: Args:
x(${x_type}): ${x_comment} x(${x_type}): ${x_comment}
label(${label_type}): ${label_comment} label(${label_type}): ${label_comment}
ignore_index(&{ignore_index}): ${ignore_index_comment}
name(basestring|None): Name of the output. name(basestring|None): Name of the output.
Returns: Returns:
...@@ -8440,7 +8473,7 @@ def sigmoid_cross_entropy_with_logits(x, label, name=None): ...@@ -8440,7 +8473,7 @@ def sigmoid_cross_entropy_with_logits(x, label, name=None):
type="sigmoid_cross_entropy_with_logits", type="sigmoid_cross_entropy_with_logits",
inputs={"X": x, inputs={"X": x,
"Label": label}, "Label": label},
attrs={}, attrs={"ignore_index": ignore_index},
outputs={"Out": out}) outputs={"Out": out})
return out return out
...@@ -9026,3 +9059,26 @@ def bilinear_tensor_product(x, ...@@ -9026,3 +9059,26 @@ def bilinear_tensor_product(x,
# add activation # add activation
return helper.append_activation(out) return helper.append_activation(out)
@templatedoc()
def get_tensor_from_selected_rows(x, name=None):
"""
${comment}
Args:
x(${x_type}): ${x_comment}
name(basestring|None): Name of the output.
Returns:
out(${out_type}): ${out_comment}
"""
helper = LayerHelper('get_tensor_from_selected_rows', **locals())
out = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op(
type='get_tensor_from_selected_rows',
inputs={'X': x},
outputs={'Out': out},
attrs={})
return out
...@@ -622,7 +622,7 @@ def reverse(x, axis): ...@@ -622,7 +622,7 @@ def reverse(x, axis):
out = helper.create_variable_for_type_inference(dtype=x.dtype) out = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op( helper.append_op(
type='reverse', type='reverse',
inputs={'Input': x}, inputs={'X': x},
outputs={'Out': [out]}, outputs={'Out': [out]},
attrs={'axis': axis}) attrs={'axis': axis})
return out return out
......
...@@ -222,13 +222,13 @@ class Precision(MetricBase): ...@@ -222,13 +222,13 @@ class Precision(MetricBase):
Examples: Examples:
.. code-block:: python .. code-block:: python
metric = fluid.metrics.Precision() metric = fluid.metrics.Precision()
for pass in range(PASSES): for pass in range(PASSES):
metric.reset() metric.reset()
for data in train_reader(): for data in train_reader():
loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) loss, preds, labels = exe.run(fetch_list=[cost, preds, labels])
metric.update(preds=preds, labels=labels) metric.update(preds=preds, labels=labels)
numpy_precision = metric.eval() numpy_precision = metric.eval()
""" """
def __init__(self, name=None): def __init__(self, name=None):
...@@ -267,13 +267,13 @@ class Recall(MetricBase): ...@@ -267,13 +267,13 @@ class Recall(MetricBase):
Examples: Examples:
.. code-block:: python .. code-block:: python
metric = fluid.metrics.Recall() metric = fluid.metrics.Recall()
for pass in range(PASSES): for pass in range(PASSES):
metric.reset() metric.reset()
for data in train_reader(): for data in train_reader():
loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) loss, preds, labels = exe.run(fetch_list=[cost, preds, labels])
metric.update(preds=preds, labels=labels) metric.update(preds=preds, labels=labels)
numpy_recall = metric.eval() numpy_recall = metric.eval()
""" """
def __init__(self, name=None): def __init__(self, name=None):
...@@ -449,8 +449,9 @@ class EditDistance(MetricBase): ...@@ -449,8 +449,9 @@ class EditDistance(MetricBase):
distance_evaluator.update(distances, seq_num) distance_evaluator.update(distances, seq_num)
distance, instance_error = distance_evaluator.eval() distance, instance_error = distance_evaluator.eval()
In the above example: In the above example:
'distance' is the average of the edit distance in a pass. 'distance' is the average of the edit distance in a pass.
'instance_error' is the instance error rate in a pass. 'instance_error' is the instance error rate in a pass.
""" """
......
...@@ -95,7 +95,14 @@ class ParallelExecutor(object): ...@@ -95,7 +95,14 @@ class ParallelExecutor(object):
self._places = [] self._places = []
self._act_places = [] self._act_places = []
if use_cuda: if use_cuda:
for i in six.moves.range(core.get_cuda_device_count()): gpus = []
gpus_env = os.getenv("FLAGS_selected_gpus")
if gpus_env:
gpus = [int(s) for s in gpus_env.split(",")]
else:
for i in six.moves.range(core.get_cuda_device_count()):
gpus.append(i)
for i in gpus:
p = core.Place() p = core.Place()
self._act_places.append(core.CUDAPlace(i)) self._act_places.append(core.CUDAPlace(i))
p.set_place(self._act_places[-1]) p.set_place(self._act_places[-1])
......
...@@ -50,8 +50,9 @@ class ParamAttr(object): ...@@ -50,8 +50,9 @@ class ParamAttr(object):
w_param_attrs = fluid.ParamAttr(name="fc_weight", w_param_attrs = fluid.ParamAttr(name="fc_weight",
learning_rate=0.5, learning_rate=0.5,
regularizer=fluid.L2Decay(1.0), regularizer=fluid.regularizer.L2Decay(1.0),
trainable=True) trainable=True)
x = fluid.layers.data(name='X', shape=[1], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=10, param_attr=w_param_attrs) y_predict = fluid.layers.fc(input=x, size=10, param_attr=w_param_attrs)
""" """
......
...@@ -388,5 +388,18 @@ class TestGenerateProposals(unittest.TestCase): ...@@ -388,5 +388,18 @@ class TestGenerateProposals(unittest.TestCase):
print(rpn_rois.shape) print(rpn_rois.shape)
class TestYoloDetection(unittest.TestCase):
def test_yolov3_loss(self):
program = Program()
with program_guard(program):
x = layers.data(name='x', shape=[30, 7, 7], dtype='float32')
gtbox = layers.data(name='gtbox', shape=[10, 4], dtype='float32')
gtlabel = layers.data(name='gtlabel', shape=[10], dtype='int32')
loss = layers.yolov3_loss(x, gtbox, gtlabel, [10, 13, 30, 13], 10,
0.5)
self.assertIsNotNone(loss)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import paddle
import paddle.fluid as fluid
BATCH_SIZE = 128
CLIP = 1
prog = fluid.framework.Program()
with fluid.program_guard(main_program=prog):
image = fluid.layers.data(name='x', shape=[784], dtype='float32')
hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
label = fluid.layers.data(name='y', shape=[1], dtype='int64')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(cost)
prog_clip = prog.clone()
avg_cost_clip = prog_clip.block(0).var(avg_cost.name)
p_g = fluid.backward.append_backward(loss=avg_cost)
p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip)
with fluid.program_guard(main_program=prog_clip):
fluid.clip.set_gradient_clip(
fluid.clip.GradientClipByGlobalNorm(clip_norm=CLIP))
p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip)
grad_list = [elem[1] for elem in p_g]
grad_clip_list = [elem[1] for elem in p_g_clip]
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.mnist.train(), buf_size=8192),
batch_size=BATCH_SIZE)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
exe.run(fluid.default_startup_program())
count = 0
for data in train_reader():
count += 1
if count > 5:
break
out = exe.run(prog, feed=feeder.feed(data), fetch_list=grad_list)
out_clip = exe.run(prog_clip,
feed=feeder.feed(data),
fetch_list=grad_clip_list)
global_norm = 0
for v in out[1:]:
global_norm += np.sum(np.power(v, 2))
global_norm = np.sqrt(global_norm)
global_norm_clip = 0
for v in out_clip[1:]:
global_norm_clip += np.sum(np.power(v, 2))
global_norm_clip = np.sqrt(global_norm_clip)
if not np.isclose(
a=global_norm_clip, b=np.minimum(global_norm, CLIP), rtol=5e-3):
exit(1)
exit(0)
...@@ -43,13 +43,14 @@ if(APPLE) ...@@ -43,13 +43,14 @@ if(APPLE)
list(REMOVE_ITEM TEST_OPS test_desc_clone) list(REMOVE_ITEM TEST_OPS test_desc_clone)
list(REMOVE_ITEM TEST_OPS test_program_code) list(REMOVE_ITEM TEST_OPS test_program_code)
endif(NOT WITH_DISTRIBUTE) endif(NOT WITH_DISTRIBUTE)
message(WARNING "These tests has been disabled in OSX before being fixed: \n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext") message(WARNING "These tests has been disabled in OSX before being fixed: \n test_gradient_clip \n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext")
# this op is not support on mac # this op is not support on mac
list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op) list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op)
# TODO: add the unitest back when it fixed # TODO: add the unitest back when it fixed
list(REMOVE_ITEM TEST_OPS test_detection_map_op) list(REMOVE_ITEM TEST_OPS test_detection_map_op)
list(REMOVE_ITEM TEST_OPS test_dist_se_resnext) list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
list(REMOVE_ITEM TEST_OPS test_fuse_elewise_add_act_pass) list(REMOVE_ITEM TEST_OPS test_fuse_elewise_add_act_pass)
list(REMOVE_ITEM TEST_OPS test_gradient_clip)
endif() endif()
if(NOT WITH_MKLML) if(NOT WITH_MKLML)
# this op is not support on openblas # this op is not support on openblas
...@@ -95,13 +96,12 @@ if(WITH_DISTRIBUTE) ...@@ -95,13 +96,12 @@ if(WITH_DISTRIBUTE)
if(NOT APPLE) if(NOT APPLE)
set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200)
set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200)
py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext)
set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000)
# FIXME(typhoonzero): add these tests back # FIXME(typhoonzero): add these tests back
# py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext)
# set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000)
# py_test_modules(test_dist_transformer MODULES test_dist_transformer) # py_test_modules(test_dist_transformer MODULES test_dist_transformer)
# set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000) # set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000)
# TODO(typhoonzero): make dist test parallel when fix port management issue set_tests_properties(test_dist_ctr test_dist_mnist test_dist_mnist_batch_merge test_dist_save_load test_dist_se_resnext test_dist_simnet_bow test_dist_text_classification test_dist_train test_dist_word2vec PROPERTIES RUN_SERIAL TRUE)
set_tests_properties(test_dist_mnist test_dist_word2vec test_dist_ctr test_dist_simnet_bow test_dist_save_load test_dist_text_classification test_dist_mnist_batch_merge PROPERTIES RUN_SERIAL TRUE)
endif(NOT APPLE) endif(NOT APPLE)
py_test_modules(test_dist_transpiler MODULES test_dist_transpiler) py_test_modules(test_dist_transpiler MODULES test_dist_transpiler)
endif() endif()
......
...@@ -102,7 +102,7 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2): ...@@ -102,7 +102,7 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2):
if args.mem_opt: if args.mem_opt:
fluid.memory_optimize(fluid.default_main_program(), skip_grads=True) fluid.memory_optimize(fluid.default_main_program(), skip_grads=True)
if args.is_dist: if args.update_method == "pserver":
t = self.get_transpiler(args.trainer_id, t = self.get_transpiler(args.trainer_id,
fluid.default_main_program(), fluid.default_main_program(),
args.endpoints, args.trainers, args.endpoints, args.trainers,
...@@ -147,7 +147,7 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2): ...@@ -147,7 +147,7 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2):
def get_data(): def get_data():
origin_batch = next(reader_generator) origin_batch = next(reader_generator)
if args.is_dist and args.use_reader_alloc: if args.update_method == "pserver" and args.use_reader_alloc:
new_batch = [] new_batch = []
for offset, item in enumerate(origin_batch): for offset, item in enumerate(origin_batch):
if offset % 2 == args.trainer_id: if offset % 2 == args.trainer_id:
......
...@@ -128,6 +128,12 @@ class TestIdentityActivation(TestConv2dFusionOp): ...@@ -128,6 +128,12 @@ class TestIdentityActivation(TestConv2dFusionOp):
self.activation = 'identity' self.activation = 'identity'
class TestIdentityActivation(TestConv2dFusionOp):
def init_activation(self):
self.activation = 'identity'
self.add_residual_data = False
class TestWithGroup(TestConv2dFusionOp): class TestWithGroup(TestConv2dFusionOp):
def init_group(self): def init_group(self):
self.groups = 3 self.groups = 3
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
from test_conv3d_op import TestConv3dOp, TestCase1, TestWithGroup1, TestWithGroup2, TestWith1x1, TestWithInput1x1Filter1x1
class TestMKLDNN(TestConv3dOp):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
class TestMKLDNNCase1(TestCase1):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
class TestMKLDNNGroup1(TestWithGroup1):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
class TestMKLDNNGroup2(TestWithGroup2):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
class TestMKLDNNWith1x1(TestWith1x1):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
class TestMKLDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1):
def init_kernel_type(self):
self.use_mkldnn = True
self.data_format = "NCHW"
if __name__ == '__main__':
unittest.main()
...@@ -74,6 +74,8 @@ class TestConv3dOp(OpTest): ...@@ -74,6 +74,8 @@ class TestConv3dOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "conv3d" self.op_type = "conv3d"
self.use_cudnn = False self.use_cudnn = False
self.use_mkldnn = False
self.data_format = "AnyLayout"
self.dtype = np.float32 self.dtype = np.float32
self.init_kernel_type() self.init_kernel_type()
self.init_group() self.init_group()
...@@ -83,8 +85,7 @@ class TestConv3dOp(OpTest): ...@@ -83,8 +85,7 @@ class TestConv3dOp(OpTest):
conv3d_param = { conv3d_param = {
'stride': self.stride, 'stride': self.stride,
'pad': self.pad, 'pad': self.pad,
'dilations': self.dilations, 'dilations': self.dilations
'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fix latter
} }
input = np.random.random(self.input_size).astype(self.dtype) input = np.random.random(self.input_size).astype(self.dtype)
...@@ -101,7 +102,9 @@ class TestConv3dOp(OpTest): ...@@ -101,7 +102,9 @@ class TestConv3dOp(OpTest):
'paddings': self.pad, 'paddings': self.pad,
'groups': self.groups, 'groups': self.groups,
'dilations': self.dilations, 'dilations': self.dilations,
'use_cudnn': self.use_cudnn 'use_cudnn': self.use_cudnn,
'use_mkldnn': self.use_mkldnn,
'data_format': self.data_format
} }
self.outputs = {'Output': output} self.outputs = {'Output': output}
...@@ -109,59 +112,35 @@ class TestConv3dOp(OpTest): ...@@ -109,59 +112,35 @@ class TestConv3dOp(OpTest):
return core.is_compiled_with_cuda() and self.use_cudnn return core.is_compiled_with_cuda() and self.use_cudnn
def test_check_output(self): def test_check_output(self):
if self.testcudnn(): place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
place = core.CUDAPlace(0) self.check_output_with_place(place, atol=1e-5)
self.check_output_with_place(place, atol=1e-5)
else:
self.check_output()
def test_check_grad(self): def test_check_grad(self):
if self.dtype == np.float16: if self.dtype == np.float16:
return return
if self.testcudnn(): place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
place = core.CUDAPlace(0) self.check_grad_with_place(
self.check_grad_with_place( place, {'Input', 'Filter'}, 'Output', max_relative_error=0.03)
place,
set(['Input', 'Filter']),
'Output',
max_relative_error=0.03)
else:
self.check_grad(
set(['Input', 'Filter']), 'Output', max_relative_error=0.03)
def test_check_grad_no_filter(self): def test_check_grad_no_filter(self):
if self.dtype == np.float16: if self.dtype == np.float16:
return return
if self.testcudnn(): place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
place = core.CUDAPlace(0) self.check_grad_with_place(
self.check_grad_with_place( place, ['Input'],
place, ['Input'], 'Output',
'Output', max_relative_error=0.03,
max_relative_error=0.03, no_grad_set=set(['Filter']))
no_grad_set=set(['Filter']))
else:
self.check_grad(
['Input'],
'Output',
max_relative_error=0.03,
no_grad_set=set(['Filter']))
def test_check_grad_no_input(self): def test_check_grad_no_input(self):
if self.dtype == np.float16: if self.dtype == np.float16:
return return
if self.testcudnn(): place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
place = core.CUDAPlace(0) self.check_grad_with_place(
self.check_grad_with_place( place, ['Input'],
place, ['Filter'], 'Output',
'Output', max_relative_error=0.03,
max_relative_error=0.03, no_grad_set=set(['Input']))
no_grad_set=set(['Input']))
else:
self.check_grad(
['Filter'],
'Output',
max_relative_error=0.03,
no_grad_set=set(['Input']))
def init_test_case(self): def init_test_case(self):
self.pad = [0, 0, 0] self.pad = [0, 0, 0]
......
...@@ -76,12 +76,24 @@ class TestDistRunnerBase(object): ...@@ -76,12 +76,24 @@ class TestDistRunnerBase(object):
if args.mem_opt: if args.mem_opt:
fluid.memory_optimize(fluid.default_main_program(), skip_grads=True) fluid.memory_optimize(fluid.default_main_program(), skip_grads=True)
if args.is_dist: if args.update_method == "pserver":
t = self.get_transpiler(args.trainer_id, t = self.get_transpiler(args.trainer_id,
fluid.default_main_program(), fluid.default_main_program(),
args.endpoints, args.trainers, args.endpoints, args.trainers,
args.sync_mode, args.dc_asgd) args.sync_mode, args.dc_asgd)
trainer_prog = t.get_trainer_program() trainer_prog = t.get_trainer_program()
elif args.update_method == "nccl2":
# transpile for nccl2
config = fluid.DistributeTranspilerConfig()
config.mode = "nccl2"
nccl2_t = fluid.DistributeTranspiler(config=config)
nccl2_t.transpile(
args.trainer_id,
program=fluid.default_main_program(),
startup_program=fluid.default_startup_program(),
trainers=args.endpoints,
current_endpoint=args.current_endpoint)
trainer_prog = fluid.default_main_program()
else: else:
trainer_prog = fluid.default_main_program() trainer_prog = fluid.default_main_program()
...@@ -110,11 +122,20 @@ class TestDistRunnerBase(object): ...@@ -110,11 +122,20 @@ class TestDistRunnerBase(object):
len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass") len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass")
mypass.set_int("num_repeats", args.batch_merge_repeat) mypass.set_int("num_repeats", args.batch_merge_repeat)
if args.update_method == "nccl2":
num_trainers = len(args.endpoints.split(","))
trainer_id = args.trainer_id
else:
num_trainers = 1
trainer_id = 0
exe = fluid.ParallelExecutor( exe = fluid.ParallelExecutor(
args.use_cuda, args.use_cuda,
loss_name=avg_cost.name, loss_name=avg_cost.name,
exec_strategy=strategy, exec_strategy=strategy,
build_strategy=build_stra) build_strategy=build_stra,
num_trainers=num_trainers,
trainer_id=trainer_id)
feed_var_list = [ feed_var_list = [
var for var in trainer_prog.global_block().vars.values() var for var in trainer_prog.global_block().vars.values()
...@@ -126,7 +147,7 @@ class TestDistRunnerBase(object): ...@@ -126,7 +147,7 @@ class TestDistRunnerBase(object):
def get_data(): def get_data():
origin_batch = next(reader_generator) origin_batch = next(reader_generator)
if args.is_dist and args.use_reader_alloc: if args.update_method != "local" and args.use_reader_alloc:
new_batch = [] new_batch = []
for offset, item in enumerate(origin_batch): for offset, item in enumerate(origin_batch):
if offset % 2 == args.trainer_id: if offset % 2 == args.trainer_id:
...@@ -151,7 +172,11 @@ def runtime_main(test_class): ...@@ -151,7 +172,11 @@ def runtime_main(test_class):
parser.add_argument( parser.add_argument(
'--role', type=str, required=True, choices=['pserver', 'trainer']) '--role', type=str, required=True, choices=['pserver', 'trainer'])
parser.add_argument('--endpoints', type=str, required=False, default="") parser.add_argument('--endpoints', type=str, required=False, default="")
parser.add_argument('--is_dist', action='store_true') parser.add_argument(
'--update_method',
type=str,
default="local",
choices=["pserver", "nccl2", "local"])
parser.add_argument('--trainer_id', type=int, required=False, default=0) parser.add_argument('--trainer_id', type=int, required=False, default=0)
parser.add_argument('--trainers', type=int, required=False, default=1) parser.add_argument('--trainers', type=int, required=False, default=1)
parser.add_argument( parser.add_argument(
...@@ -170,7 +195,7 @@ def runtime_main(test_class): ...@@ -170,7 +195,7 @@ def runtime_main(test_class):
args = parser.parse_args() args = parser.parse_args()
model = test_class() model = test_class()
if args.role == "pserver" and args.is_dist: if args.role == "pserver" and args.update_method == "pserver":
model.run_pserver(args) model.run_pserver(args)
else: else:
model.run_trainer(args) model.run_trainer(args)
...@@ -208,6 +233,7 @@ class TestDistBase(unittest.TestCase): ...@@ -208,6 +233,7 @@ class TestDistBase(unittest.TestCase):
self._use_reduce = False self._use_reduce = False
self._dc_asgd = False # must use with async mode self._dc_asgd = False # must use with async mode
self._use_reader_alloc = True self._use_reader_alloc = True
self._nccl2_mode = False
self._setup_config() self._setup_config()
self._after_setup_config() self._after_setup_config()
...@@ -218,7 +244,7 @@ class TestDistBase(unittest.TestCase): ...@@ -218,7 +244,7 @@ class TestDistBase(unittest.TestCase):
def start_pserver(self, model_file, check_error_log, required_envs): def start_pserver(self, model_file, check_error_log, required_envs):
ps0_ep, ps1_ep = self._ps_endpoints.split(",") ps0_ep, ps1_ep = self._ps_endpoints.split(",")
ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist" ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --update_method pserver"
ps0_cmd = ps_cmd % \ ps0_cmd = ps_cmd % \
(self._python_interp, model_file, self._ps_endpoints, ps0_ep, (self._python_interp, model_file, self._ps_endpoints, ps0_ep,
self._trainers) self._trainers)
...@@ -270,7 +296,8 @@ class TestDistBase(unittest.TestCase): ...@@ -270,7 +296,8 @@ class TestDistBase(unittest.TestCase):
else: else:
env_local = {'CPU_NUM': '1'} env_local = {'CPU_NUM': '1'}
envs.update(env_local) env_local.update(envs)
print("local_cmd: {}, env: {}".format(cmd, env_local))
if check_error_log: if check_error_log:
err_log = open("/tmp/trainer.err.log", "wb") err_log = open("/tmp/trainer.err.log", "wb")
...@@ -278,21 +305,21 @@ class TestDistBase(unittest.TestCase): ...@@ -278,21 +305,21 @@ class TestDistBase(unittest.TestCase):
cmd.split(" "), cmd.split(" "),
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=err_log, stderr=err_log,
env=envs) env=env_local)
else: else:
local_proc = subprocess.Popen( local_proc = subprocess.Popen(
cmd.split(" "), cmd.split(" "),
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, stderr=subprocess.PIPE,
env=envs) env=env_local)
local_out, local_err = local_proc.communicate() local_out, local_err = local_proc.communicate()
if check_error_log: if check_error_log:
err_log.close() err_log.close()
sys.stderr.write('local_stdout: %s\n' % pickle.loads(local_out))
sys.stderr.write('local_stderr: %s\n' % local_err) sys.stderr.write('local_stderr: %s\n' % local_err)
sys.stderr.write('local_stdout: %s\n' % pickle.loads(local_out))
return pickle.loads(local_out) return pickle.loads(local_out)
...@@ -303,7 +330,7 @@ class TestDistBase(unittest.TestCase): ...@@ -303,7 +330,7 @@ class TestDistBase(unittest.TestCase):
ps0_ep, ps1_ep = self._ps_endpoints.split(",") ps0_ep, ps1_ep = self._ps_endpoints.split(",")
tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist" tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --update_method pserver"
tr0_cmd = tr_cmd % \ tr0_cmd = tr_cmd % \
(self._python_interp, model, self._ps_endpoints, (self._python_interp, model, self._ps_endpoints,
0, ps0_ep, self._trainers) 0, ps0_ep, self._trainers)
...@@ -335,8 +362,8 @@ class TestDistBase(unittest.TestCase): ...@@ -335,8 +362,8 @@ class TestDistBase(unittest.TestCase):
env0.update(envs) env0.update(envs)
env1.update(envs) env1.update(envs)
print("tr0_cmd:{}".format(tr0_cmd)) print("tr0_cmd: {}, env: {}".format(tr0_cmd, env0))
print("tr1_cmd:{}".format(tr1_cmd)) print("tr1_cmd: {}, env: {}".format(tr1_cmd, env1))
tr0_pipe = open("/tmp/tr0_err.log", "wb") tr0_pipe = open("/tmp/tr0_err.log", "wb")
tr1_pipe = open("/tmp/tr1_err.log", "wb") tr1_pipe = open("/tmp/tr1_err.log", "wb")
...@@ -357,12 +384,9 @@ class TestDistBase(unittest.TestCase): ...@@ -357,12 +384,9 @@ class TestDistBase(unittest.TestCase):
# close trainer file # close trainer file
tr0_pipe.close() tr0_pipe.close()
tr1_pipe.close() tr1_pipe.close()
ps0_pipe.close() ps0_pipe.close()
ps1_pipe.close() ps1_pipe.close()
# FIXME: use terminate() instead of sigkill.
os.kill(ps0.pid, signal.SIGKILL)
os.kill(ps1.pid, signal.SIGKILL)
ps0.terminate() ps0.terminate()
ps1.terminate() ps1.terminate()
...@@ -372,7 +396,71 @@ class TestDistBase(unittest.TestCase): ...@@ -372,7 +396,71 @@ class TestDistBase(unittest.TestCase):
sys.stderr.write('trainer 1 stdout: %s\n' % pickle.loads(tr1_out)) sys.stderr.write('trainer 1 stdout: %s\n' % pickle.loads(tr1_out))
sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err) sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err)
# return tr0_losses, tr1_losses return pickle.loads(tr0_out), pickle.loads(tr1_out)
def _run_cluster_nccl2(self, model, envs, check_error_log):
# NOTE: we reuse ps_endpoints as nccl2 worker endpoints
worker_endpoints = self._ps_endpoints.split(",")
w0_ep, w1_ep = worker_endpoints
tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method nccl2"
tr0_cmd = tr_cmd % \
(self._python_interp, model, self._ps_endpoints,
0, w0_ep)
tr1_cmd = tr_cmd % \
(self._python_interp, model, self._ps_endpoints,
1, w1_ep)
if self._mem_opt:
tr0_cmd += " --mem_opt"
tr1_cmd += " --mem_opt"
if self._use_reduce:
tr0_cmd += " --use_reduce"
tr1_cmd += " --use_reduce"
if self._use_reader_alloc:
tr0_cmd += " --use_reader_alloc"
tr1_cmd += " --use_reader_alloc"
if self.__use_cuda:
tr0_cmd += " --use_cuda"
tr1_cmd += " --use_cuda"
env0 = {"CUDA_VISIBLE_DEVICES": "0"}
env1 = {"CUDA_VISIBLE_DEVICES": "1"}
else:
env0 = {'CPU_NUM': '1'}
env1 = {'CPU_NUM': '1'}
env0.update(envs)
env1.update(envs)
print("tr0_cmd:{}, env: {}".format(tr0_cmd, env0))
print("tr1_cmd:{}, env: {}".format(tr1_cmd, env1))
tr0_pipe = open("/tmp/tr0_err.log", "wb")
tr1_pipe = open("/tmp/tr1_err.log", "wb")
tr0_proc = subprocess.Popen(
tr0_cmd.strip().split(" "),
stdout=subprocess.PIPE,
stderr=tr0_pipe,
env=env0)
tr1_proc = subprocess.Popen(
tr1_cmd.strip().split(" "),
stdout=subprocess.PIPE,
stderr=tr1_pipe,
env=env1)
tr0_out, tr0_err = tr0_proc.communicate()
tr1_out, tr1_err = tr1_proc.communicate()
# close trainer file
tr0_pipe.close()
tr1_pipe.close()
# print log
sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err)
sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err)
sys.stderr.write('trainer 0 stdout: %s\n' % tr0_out)
sys.stderr.write('trainer 1 stdout: %s\n' % tr1_out)
return pickle.loads(tr0_out), pickle.loads(tr1_out) return pickle.loads(tr0_out), pickle.loads(tr1_out)
def check_with_place(self, def check_with_place(self,
...@@ -387,20 +475,25 @@ class TestDistBase(unittest.TestCase): ...@@ -387,20 +475,25 @@ class TestDistBase(unittest.TestCase):
"LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""),
"FLAGS_fraction_of_gpu_memory_to_use": "0.15", "FLAGS_fraction_of_gpu_memory_to_use": "0.15",
"FLAGS_cudnn_deterministic": "1", "FLAGS_cudnn_deterministic": "1",
"http_proxy": "" "http_proxy": "",
"NCCL_P2P_DISABLE": "1"
} }
required_envs.update(need_envs) required_envs.update(need_envs)
if check_error_log: if check_error_log:
required_envs["GLOG_v"] = "7" required_envs["GLOG_v"] = "3"
required_envs["GLOG_logtostderr"] = "1" required_envs["GLOG_logtostderr"] = "1"
local_losses\ local_losses\
= self._run_local(model_file, required_envs, = self._run_local(model_file, required_envs,
check_error_log) check_error_log)
tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs, if self._nccl2_mode:
check_error_log) tr0_losses, tr1_losses = self._run_cluster_nccl2(
model_file, required_envs, check_error_log)
else:
tr0_losses, tr1_losses = self._run_cluster(
model_file, required_envs, check_error_log)
for step_id in range(RUN_STEP): for step_id in range(RUN_STEP):
local_loss = local_losses[step_id] local_loss = local_losses[step_id]
......
...@@ -26,6 +26,19 @@ class TestDistMnist2x2(TestDistBase): ...@@ -26,6 +26,19 @@ class TestDistMnist2x2(TestDistBase):
self.check_with_place("dist_mnist.py", delta=1e-5) self.check_with_place("dist_mnist.py", delta=1e-5)
class TestDistMnistNCCL2(TestDistBase):
def _setup_config(self):
self._sync_mode = True
self._use_reduce = False
self._use_reader_alloc = False
self._nccl2_mode = True
def test_dist_train(self):
import paddle.fluid as fluid
if fluid.core.is_compiled_with_cuda():
self.check_with_place("dist_mnist.py", delta=1)
class TestDistMnist2x2Lars(TestDistBase): class TestDistMnist2x2Lars(TestDistBase):
def _setup_config(self): def _setup_config(self):
self._sync_mode = True self._sync_mode = True
......
...@@ -44,7 +44,7 @@ class TestDistSaveLoadDense2x2(TestDistBase): ...@@ -44,7 +44,7 @@ class TestDistSaveLoadDense2x2(TestDistBase):
required_envs.update(need_envs) required_envs.update(need_envs)
if check_error_log: if check_error_log:
required_envs["GLOG_v"] = "7" required_envs["GLOG_v"] = "3"
required_envs["GLOG_logtostderr"] = "1" required_envs["GLOG_logtostderr"] = "1"
model_dir = tempfile.mkdtemp() model_dir = tempfile.mkdtemp()
......
...@@ -769,6 +769,7 @@ class TestNCCL2Transpile(TranspilerTest): ...@@ -769,6 +769,7 @@ class TestNCCL2Transpile(TranspilerTest):
config = fluid.DistributeTranspilerConfig() config = fluid.DistributeTranspilerConfig()
config.mode = "nccl2" config.mode = "nccl2"
config.wait_port = False
t = fluid.DistributeTranspiler(config=config) t = fluid.DistributeTranspiler(config=config)
t.transpile( t.transpile(
0, 0,
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle.fluid.core as core
import numpy as np
from paddle.fluid.op import Operator
class TestGetTensorFromSelectedRows(unittest.TestCase):
def get_places(self):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
return places
def check_with_place(self, place):
scope = core.Scope()
x_rows = [0, 5, 5, 4, 20]
height = 20
row_numel = 2
np_array = np.ones((len(x_rows), row_numel)).astype("float32")
np_array[1, :] = 2.0
np_array[2, :] = 3.0
np_array[3, :] = 4.0
# initialize input variable X
x = scope.var('X').get_selected_rows()
x.set_rows(x_rows)
x.set_height(height)
x_tensor = x.get_tensor()
x_tensor.set(np_array, place)
# initialize input variable Out
out = scope.var("Out").get_tensor()
op = Operator("get_tensor_from_selected_rows", X="X", Out="Out")
op.run(scope, place)
out_array = np.array(out)
self.assertEqual((5, 2), out_array.shape)
assert (out_array == np_array).all()
def test_check_output(self):
for place in self.get_places():
self.check_with_place(place)
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle
import paddle.fluid.core as core
import paddle.fluid as fluid
BATCH_SIZE = 128
CLIP = 1
def bow_net(data,
label,
dict_dim,
emb_dim=128,
hid_dim=128,
hid_dim2=96,
class_dim=2):
"""
BOW net
This model is from https://github.com/PaddlePaddle/models:
fluid/PaddleNLP/text_classification/nets.py
"""
emb = fluid.layers.embedding(
input=data, is_sparse=True, size=[dict_dim, emb_dim])
bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
bow_tanh = fluid.layers.tanh(bow)
fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
cost = fluid.layers.cross_entropy(input=prediction, label=label)
avg_cost = fluid.layers.mean(x=cost)
return avg_cost
class TestGradientClip(unittest.TestCase):
def setUp(self):
self.word_dict = paddle.dataset.imdb.word_dict()
self.BATCH_SIZE = 2
self.train_data = paddle.batch(
paddle.dataset.imdb.train(self.word_dict),
batch_size=self.BATCH_SIZE)
def get_places(self):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
return places
def check_operators(self, place):
prog = fluid.framework.Program()
startup_program = fluid.framework.Program()
with fluid.program_guard(
main_program=prog, startup_program=startup_program):
image = fluid.layers.data(name='x', shape=[784], dtype='float32')
label = fluid.layers.data(name='y', shape=[1], dtype='int64')
hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(cost)
prog_clip = prog.clone()
avg_cost_clip = prog_clip.block(0).var(avg_cost.name)
p_g = fluid.backward.append_backward(loss=avg_cost)
p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip)
with fluid.program_guard(main_program=prog_clip):
fluid.clip.set_gradient_clip(
fluid.clip.GradientClipByGlobalNorm(clip_norm=CLIP))
p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip)
grad_list = [elem[1] for elem in p_g]
grad_clip_list = [elem[1] for elem in p_g_clip]
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.mnist.train(), buf_size=8192),
batch_size=BATCH_SIZE)
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
exe.run(startup_program)
count = 0
for data in train_reader():
count += 1
if count > 5:
break
out = exe.run(prog, feed=feeder.feed(data), fetch_list=grad_list)
out_clip = exe.run(prog_clip,
feed=feeder.feed(data),
fetch_list=grad_clip_list)
global_norm = 0
for v in out[1:]:
global_norm += np.sum(np.power(v, 2))
global_norm = np.sqrt(global_norm)
global_norm_clip = 0
for v in out_clip[1:]:
global_norm_clip += np.sum(np.power(v, 2))
global_norm_clip = np.sqrt(global_norm_clip)
assert np.isclose(
a=global_norm_clip, b=np.minimum(global_norm, CLIP), rtol=5e-3)
def check_sparse_gradient_clip(self, place):
prog = fluid.framework.Program()
startup_program = fluid.framework.Program()
with fluid.program_guard(
main_program=prog, startup_program=startup_program):
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
cost = bow_net(data, label, len(self.word_dict))
fluid.clip.set_gradient_clip(
clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0))
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01)
sgd_optimizer.minimize(cost)
exe = fluid.Executor(place)
feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
exe.run(startup_program)
data = next(self.train_data())
val = exe.run(prog, feed=feeder.feed(data), fetch_list=[cost])[0]
self.assertEqual((1, ), val.shape)
print(val)
self.assertFalse(np.isnan(val))
def test_operators(self):
self.check_operators(core.CPUPlace())
def test_sparse_gradient_clip(self):
for place in self.get_places():
self.check_sparse_gradient_clip(place)
if __name__ == '__main__':
unittest.main()
...@@ -170,9 +170,10 @@ class TestBook(unittest.TestCase): ...@@ -170,9 +170,10 @@ class TestBook(unittest.TestCase):
with program_guard(program): with program_guard(program):
dat = layers.data(name='data', shape=[10], dtype='float32') dat = layers.data(name='data', shape=[10], dtype='float32')
lbl = layers.data(name='label', shape=[10], dtype='float32') lbl = layers.data(name='label', shape=[10], dtype='float32')
ignore_index = -1
self.assertIsNotNone( self.assertIsNotNone(
layers.sigmoid_cross_entropy_with_logits( layers.sigmoid_cross_entropy_with_logits(
x=dat, label=lbl)) x=dat, label=lbl, ignore_index=ignore_index))
print(str(program)) print(str(program))
def test_hsigmoid(self): def test_hsigmoid(self):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle.fluid.core as core
import numpy as np
from paddle.fluid.op import Operator
class TestMergeSelectedRows(unittest.TestCase):
def get_places(self):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
return places
def check_with_place(self, place):
scope = core.Scope()
x_rows = [0, 5, 5, 4, 20]
out_rows = [0, 4, 5, 20]
height = 20
row_numel = 2
np_array = np.ones((len(x_rows), row_numel)).astype("float32")
np_array[1, :] = 2.0
np_array[2, :] = 3.0
np_array[3, :] = 4.0
# initialize input variable X
x = scope.var('X').get_selected_rows()
x.set_rows(x_rows)
x.set_height(height)
x_tensor = x.get_tensor()
x_tensor.set(np_array, place)
# initialize input variable Out
out = scope.var("Out").get_selected_rows()
op = Operator("merge_selected_rows", X="X", Out="Out")
op.run(scope, place)
self.assertEqual(out.rows(), out_rows)
self.assertEqual(out.height(), height)
out_array = np.array(out.get_tensor())
self.assertEqual((4, 2), out_array.shape)
assert (out_array[0, :] == 1.0).all()
assert (out_array[1, :] == 4.0).all()
assert (out_array[2, :] == 5.0).all()
assert (out_array[3, :] == 1.0).all()
def test_check_output(self):
for place in self.get_places():
self.check_with_place(place)
if __name__ == "__main__":
unittest.main()
...@@ -56,6 +56,40 @@ class TestSigmoidCrossEntropyWithLogitsOp2(OpTest): ...@@ -56,6 +56,40 @@ class TestSigmoidCrossEntropyWithLogitsOp2(OpTest):
"""Test sigmoid_cross_entropy_with_logit_op with probabalistic label """Test sigmoid_cross_entropy_with_logit_op with probabalistic label
""" """
def setUp(self):
self.op_type = "sigmoid_cross_entropy_with_logits"
batch_size = 64
num_classes = 20
ignore_index = -1
self.inputs = {
'X': logit(
np.random.uniform(0, 1, (batch_size, num_classes))
.astype("float32")),
'Label': np.random.randint(-1, 2, (batch_size, num_classes))
.astype("float32")
}
self.attrs = {'ignore_index': ignore_index, }
# Fw Pass is implemented as elementwise sigmoid followed by
# elementwise logistic loss
# Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X))
sigmoid_X = expit(self.inputs['X'])
term1 = self.inputs['Label'] * np.log(sigmoid_X)
term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X)
out = -term1 - term2
out[np.where(self.inputs['Label'] == ignore_index)] = 0
self.outputs = {'Out': out}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out')
class TestSigmoidCrossEntropyWithLogitsOp3(OpTest):
"""Test sigmoid_cross_entropy_with_logit_op with probabalistic label
"""
def setUp(self): def setUp(self):
self.op_type = "sigmoid_cross_entropy_with_logits" self.op_type = "sigmoid_cross_entropy_with_logits"
batch_size = 64 batch_size = 64
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
import unittest
import numpy as np
from op_test import OpTest
from paddle.fluid import core
def sigmoid(x):
return 1.0 / (1.0 + np.exp(-1.0 * x))
def mse(x, y, num):
return ((y - x)**2).sum() / num
def bce(x, y, mask):
x = x.reshape((-1))
y = y.reshape((-1))
mask = mask.reshape((-1))
error_sum = 0.0
count = 0
for i in range(x.shape[0]):
if mask[i] > 0:
error_sum += y[i] * np.log(x[i]) + (1 - y[i]) * np.log(1 - x[i])
count += 1
return error_sum / (-1.0 * count)
def box_iou(box1, box2):
b1_x1 = box1[0] - box1[2] / 2
b1_x2 = box1[0] + box1[2] / 2
b1_y1 = box1[1] - box1[3] / 2
b1_y2 = box1[1] + box1[3] / 2
b2_x1 = box2[0] - box2[2] / 2
b2_x2 = box2[0] + box2[2] / 2
b2_y1 = box2[1] - box2[3] / 2
b2_y2 = box2[1] + box2[3] / 2
b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1)
b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1)
inter_rect_x1 = max(b1_x1, b2_x1)
inter_rect_y1 = max(b1_y1, b2_y1)
inter_rect_x2 = min(b1_x2, b2_x2)
inter_rect_y2 = min(b1_y2, b2_y2)
inter_area = max(inter_rect_x2 - inter_rect_x1, 0) * max(
inter_rect_y2 - inter_rect_y1, 0)
return inter_area / (b1_area + b2_area + inter_area)
def build_target(gtboxs, gtlabel, attrs, grid_size):
n, b, _ = gtboxs.shape
ignore_thresh = attrs["ignore_thresh"]
anchors = attrs["anchors"]
class_num = attrs["class_num"]
an_num = len(anchors) // 2
obj_mask = np.zeros((n, an_num, grid_size, grid_size)).astype('float32')
noobj_mask = np.ones((n, an_num, grid_size, grid_size)).astype('float32')
tx = np.zeros((n, an_num, grid_size, grid_size)).astype('float32')
ty = np.zeros((n, an_num, grid_size, grid_size)).astype('float32')
tw = np.zeros((n, an_num, grid_size, grid_size)).astype('float32')
th = np.zeros((n, an_num, grid_size, grid_size)).astype('float32')
tconf = np.zeros((n, an_num, grid_size, grid_size)).astype('float32')
tcls = np.zeros(
(n, an_num, grid_size, grid_size, class_num)).astype('float32')
for i in range(n):
for j in range(b):
if gtboxs[i, j, :].sum() == 0:
continue
gt_label = gtlabel[i, j]
gx = gtboxs[i, j, 0] * grid_size
gy = gtboxs[i, j, 1] * grid_size
gw = gtboxs[i, j, 2] * grid_size
gh = gtboxs[i, j, 3] * grid_size
gi = int(gx)
gj = int(gy)
gtbox = [0, 0, gw, gh]
max_iou = 0
for k in range(an_num):
anchor_box = [0, 0, anchors[2 * k], anchors[2 * k + 1]]
iou = box_iou(gtbox, anchor_box)
if iou > max_iou:
max_iou = iou
best_an_index = k
if iou > ignore_thresh:
noobj_mask[i, best_an_index, gj, gi] = 0
obj_mask[i, best_an_index, gj, gi] = 1
noobj_mask[i, best_an_index, gj, gi] = 0
tx[i, best_an_index, gj, gi] = gx - gi
ty[i, best_an_index, gj, gi] = gy - gj
tw[i, best_an_index, gj, gi] = np.log(gw / anchors[2 *
best_an_index])
th[i, best_an_index, gj, gi] = np.log(
gh / anchors[2 * best_an_index + 1])
tconf[i, best_an_index, gj, gi] = 1
tcls[i, best_an_index, gj, gi, gt_label] = 1
return (tx, ty, tw, th, tconf, tcls, obj_mask, noobj_mask)
def YoloV3Loss(x, gtbox, gtlabel, attrs):
n, c, h, w = x.shape
an_num = len(attrs['anchors']) // 2
class_num = attrs["class_num"]
x = x.reshape((n, an_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2))
pred_x = sigmoid(x[:, :, :, :, 0])
pred_y = sigmoid(x[:, :, :, :, 1])
pred_w = x[:, :, :, :, 2]
pred_h = x[:, :, :, :, 3]
pred_conf = sigmoid(x[:, :, :, :, 4])
pred_cls = sigmoid(x[:, :, :, :, 5:])
tx, ty, tw, th, tconf, tcls, obj_mask, noobj_mask = build_target(
gtbox, gtlabel, attrs, x.shape[2])
obj_mask_expand = np.tile(
np.expand_dims(obj_mask, 4), (1, 1, 1, 1, int(attrs['class_num'])))
loss_x = mse(pred_x * obj_mask, tx * obj_mask, obj_mask.sum())
loss_y = mse(pred_y * obj_mask, ty * obj_mask, obj_mask.sum())
loss_w = mse(pred_w * obj_mask, tw * obj_mask, obj_mask.sum())
loss_h = mse(pred_h * obj_mask, th * obj_mask, obj_mask.sum())
loss_conf_target = bce(pred_conf * obj_mask, tconf * obj_mask, obj_mask)
loss_conf_notarget = bce(pred_conf * noobj_mask, tconf * noobj_mask,
noobj_mask)
loss_class = bce(pred_cls * obj_mask_expand, tcls * obj_mask_expand,
obj_mask_expand)
return attrs['loss_weight_xy'] * (loss_x + loss_y) \
+ attrs['loss_weight_wh'] * (loss_w + loss_h) \
+ attrs['loss_weight_conf_target'] * loss_conf_target \
+ attrs['loss_weight_conf_notarget'] * loss_conf_notarget \
+ attrs['loss_weight_class'] * loss_class
class TestYolov3LossOp(OpTest):
def setUp(self):
self.loss_weight_xy = 1.0
self.loss_weight_wh = 1.0
self.loss_weight_conf_target = 1.0
self.loss_weight_conf_notarget = 1.0
self.loss_weight_class = 1.0
self.initTestCase()
self.op_type = 'yolov3_loss'
x = np.random.random(size=self.x_shape).astype('float32')
gtbox = np.random.random(size=self.gtbox_shape).astype('float32')
gtlabel = np.random.randint(0, self.class_num,
self.gtbox_shape[:2]).astype('int32')
self.attrs = {
"anchors": self.anchors,
"class_num": self.class_num,
"ignore_thresh": self.ignore_thresh,
"loss_weight_xy": self.loss_weight_xy,
"loss_weight_wh": self.loss_weight_wh,
"loss_weight_conf_target": self.loss_weight_conf_target,
"loss_weight_conf_notarget": self.loss_weight_conf_notarget,
"loss_weight_class": self.loss_weight_class,
}
self.inputs = {'X': x, 'GTBox': gtbox, 'GTLabel': gtlabel}
self.outputs = {
'Loss': np.array(
[YoloV3Loss(x, gtbox, gtlabel, self.attrs)]).astype('float32')
}
def test_check_output(self):
place = core.CPUPlace()
self.check_output_with_place(place, atol=1e-3)
def test_check_grad_ignore_gtbox(self):
place = core.CPUPlace()
self.check_grad_with_place(
place, ['X'],
'Loss',
no_grad_set=set(["GTBox", "GTLabel"]),
max_relative_error=0.06)
def initTestCase(self):
self.anchors = [10, 13, 12, 12]
self.class_num = 10
self.ignore_thresh = 0.5
self.x_shape = (5, len(self.anchors) // 2 * (5 + self.class_num), 7, 7)
self.gtbox_shape = (5, 10, 4)
self.loss_weight_xy = 2.5
self.loss_weight_wh = 0.8
self.loss_weight_conf_target = 1.5
self.loss_weight_conf_notarget = 0.5
self.loss_weight_class = 1.2
if __name__ == "__main__":
unittest.main()
...@@ -125,13 +125,14 @@ def slice_variable(var_list, slice_count, min_block_size): ...@@ -125,13 +125,14 @@ def slice_variable(var_list, slice_count, min_block_size):
class DistributeTranspilerConfig(object): class DistributeTranspilerConfig(object):
""" """
slice_var_up (bool): Do Tensor slice for pservers, default is True. Args:
split_method (PSDispatcher): RoundRobin or HashName can be used slice_var_up (bool): Do Tensor slice for pservers, default is True.
try to choose the best method to balance loads for pservers. split_method (PSDispatcher): RoundRobin or HashName can be used
min_block_size (int): Minimum splitted element number in block. try to choose the best method to balance loads for pservers.
According:https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156 min_block_size (int): Minimum splitted element number in block.
We can use bandwidth effiently when data size is larger than 2MB.If you According:https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156
want to change it, please be sure you see the slice_variable function. We can use bandwidth effiently when data size is larger than 2MB.If you
want to change it, please be sure you see the slice_variable function.
""" """
slice_var_up = True slice_var_up = True
...@@ -141,6 +142,7 @@ class DistributeTranspilerConfig(object): ...@@ -141,6 +142,7 @@ class DistributeTranspilerConfig(object):
# supported modes: pserver, nccl2 # supported modes: pserver, nccl2
mode = "pserver" mode = "pserver"
print_log = False print_log = False
wait_port = True
class DistributeTranspiler(object): class DistributeTranspiler(object):
...@@ -163,35 +165,34 @@ class DistributeTranspiler(object): ...@@ -163,35 +165,34 @@ class DistributeTranspiler(object):
Examples: Examples:
.. code-block:: python .. code-block:: python
# for pserver mode # for pserver mode
pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174"
trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174" trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174"
current_endpoint = "192.168.0.1:6174" current_endpoint = "192.168.0.1:6174"
trainer_id = 0 trainer_id = 0
trainers = 4 trainers = 4
role = os.getenv("PADDLE_TRAINING_ROLE") role = os.getenv("PADDLE_TRAINING_ROLE")
t = fluid.DistributeTranspiler()
t = fluid.DistributeTranspiler() t.transpile(
t.transpile( trainer_id, pservers=pserver_endpoints, trainers=trainers)
trainer_id, pservers=pserver_endpoints, trainers=trainers) if role == "PSERVER":
if role == "PSERVER": pserver_program = t.get_pserver_program(current_endpoint)
pserver_program = t.get_pserver_program(current_endpoint) pserver_startup_program = t.get_startup_program(current_endpoint,
pserver_startup_program = t.get_startup_program(current_endpoint,
pserver_program) pserver_program)
elif role == "TRAINER": elif role == "TRAINER":
trainer_program = t.get_trainer_program() trainer_program = t.get_trainer_program()
# for nccl2 mode # for nccl2 mode
config = fluid.DistributeTranspilerConfig() config = fluid.DistributeTranspilerConfig()
config.mode = "nccl2" config.mode = "nccl2"
t = fluid.DistributeTranspiler(config=config) t = fluid.DistributeTranspiler(config=config)
t.transpile(trainer_id, workers=workers, current_endpoint=curr_ep) t.transpile(trainer_id, workers=workers, current_endpoint=curr_ep)
exe = fluid.ParallelExecutor( exe = fluid.ParallelExecutor(
use_cuda, use_cuda,
loss_name=loss_var.name, loss_name=loss_var.name,
num_trainers=len(trainers.split(",)), num_trainers=len(trainers.split(",)),
trainer_id=trainer_id trainer_id=trainer_id
) )
""" """
def __init__(self, config=None): def __init__(self, config=None):
...@@ -213,13 +214,16 @@ class DistributeTranspiler(object): ...@@ -213,13 +214,16 @@ class DistributeTranspiler(object):
trainer_id, trainer_id,
trainers, trainers,
current_endpoint, current_endpoint,
startup_program=None): startup_program=None,
wait_port=True):
if not startup_program: if not startup_program:
startup_program = default_startup_program() startup_program = default_startup_program()
if trainer_id >= 0: if trainer_id >= 0:
worker_endpoints = trainers.split(",") worker_endpoints = trainers.split(",")
# send NCCL_ID to others or recv from trainer 0 # send NCCL_ID to others or recv from trainer 0
worker_endpoints.remove(current_endpoint) worker_endpoints.remove(current_endpoint)
if trainer_id == 0 and wait_port:
wait_server_ready(worker_endpoints)
nccl_id_var = startup_program.global_block().create_var( nccl_id_var = startup_program.global_block().create_var(
name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW) name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW)
...@@ -305,7 +309,8 @@ class DistributeTranspiler(object): ...@@ -305,7 +309,8 @@ class DistributeTranspiler(object):
trainer_id, trainer_id,
trainers, trainers,
current_endpoint, current_endpoint,
startup_program=startup_program) startup_program=startup_program,
wait_port=self.config.wait_port)
return return
self.trainer_num = trainers self.trainer_num = trainers
...@@ -651,9 +656,6 @@ class DistributeTranspiler(object): ...@@ -651,9 +656,6 @@ class DistributeTranspiler(object):
# NOTE: assume blocks of the same variable is not distributed # NOTE: assume blocks of the same variable is not distributed
# on the same pserver, only change param/grad varnames for # on the same pserver, only change param/grad varnames for
# trainers to fetch. # trainers to fetch.
sys.stderr.write("get_pserver_program() is deprecated, call \
get_pserver_programs() to get pserver main and startup \
in a single call.")
# step1 # step1
pserver_program = Program() pserver_program = Program()
pserver_program.random_seed = self.origin_program.random_seed pserver_program.random_seed = self.origin_program.random_seed
...@@ -921,18 +923,6 @@ in a single call.") ...@@ -921,18 +923,6 @@ in a single call.")
Returns: Returns:
Program: parameter server side startup program. Program: parameter server side startup program.
""" """
sys.stderr.write("get_startup_program() is deprecated, call \
get_pserver_programs() to get pserver main and startup \
in a single call.")
if pserver_program != None:
sys.stderr.write("passing pserver_program to get_startup_program() \
is deprecated, you can use new API get_pserver_programs() to \
get both pserver main program and startup program.")
if startup_program != None:
sys.stderr.write("passing startup_program to get_startup_program() \
is deprecated, use fluid.program_guard() or pass this argument \
to transpile() call.")
s_prog = Program() s_prog = Program()
orig_s_prog = self.startup_program orig_s_prog = self.startup_program
s_prog.random_seed = orig_s_prog.random_seed s_prog.random_seed = orig_s_prog.random_seed
......
...@@ -165,9 +165,9 @@ if '${WITH_MKL}' == 'ON': ...@@ -165,9 +165,9 @@ if '${WITH_MKL}' == 'ON':
shutil.copy('${MKLML_LIB}', libs_path) shutil.copy('${MKLML_LIB}', libs_path)
shutil.copy('${MKLML_IOMP_LIB}', libs_path) shutil.copy('${MKLML_IOMP_LIB}', libs_path)
package_data['paddle.libs']+=['libmklml_intel' + ext_name,'libiomp5' + ext_name] package_data['paddle.libs']+=['libmklml_intel' + ext_name,'libiomp5' + ext_name]
if '${CMAKE_BUILD_TYPE}' == 'Release': if '${WITH_MKLDNN}' == 'ON':
# only change rpath in Release mode. if '${CMAKE_BUILD_TYPE}' == 'Release':
if '${WITH_MKLDNN}' == 'ON': # only change rpath in Release mode.
# TODO(typhoonzero): use install_name_tool to patch mkl libs once # TODO(typhoonzero): use install_name_tool to patch mkl libs once
# we can support mkl on mac. # we can support mkl on mac.
# #
...@@ -177,14 +177,19 @@ if '${CMAKE_BUILD_TYPE}' == 'Release': ...@@ -177,14 +177,19 @@ if '${CMAKE_BUILD_TYPE}' == 'Release':
command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}" command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}"
if os.system(command) != 0: if os.system(command) != 0:
raise Exception("patch libmkldnn.so failed, command: %s" % command) raise Exception("patch libmkldnn.so failed, command: %s" % command)
package_data['paddle.libs']+=['libmkldnn.so.0'] package_data['paddle.libs']+=['libmkldnn.so.0']
shutil.copy('${MKLDNN_SHARED_LIB}', libs_path) shutil.copy('${MKLDNN_SHARED_LIB}', libs_path)
if '${WITH_NGRAPH}' == 'ON': if '${WITH_NGRAPH}' == 'ON':
# only change rpath in Release mode,
# since in Debug mode, nGraph lib may be too large to be changed?
if '${CMAKE_BUILD_TYPE}' == 'Release': if '${CMAKE_BUILD_TYPE}' == 'Release':
# only change rpath in Release mode. if os.name != 'nt':
command = "patchelf --set-rpath '$ORIGIN/' ${NGRAPH_SHARED_LIB}" if "@APPLE@" == "1":
if os.system(command) != 0: command = "install_name_tool -id \"@loader_path/\" ${NGRAPH_SHARED_LIB}"
raise Exception("patch ${NGRAPH_SHARED_LIB_NAME} failed, command: %s" % command) else:
command = "patchelf --set-rpath '$ORIGIN/' ${NGRAPH_SHARED_LIB}"
if os.system(command) != 0:
raise Exception("patch ${NGRAPH_SHARED_LIB_NAME} failed, command: %s" % command)
shutil.copy('${NGRAPH_SHARED_LIB}', libs_path) shutil.copy('${NGRAPH_SHARED_LIB}', libs_path)
shutil.copy('${NGRAPH_CPU_LIB}', libs_path) shutil.copy('${NGRAPH_CPU_LIB}', libs_path)
shutil.copy('${NGRAPH_TBB_LIB}', libs_path) shutil.copy('${NGRAPH_TBB_LIB}', libs_path)
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
Print all signature of a python module in alphabet order. Print all signature of a python module in alphabet order.
Usage: Usage:
./print_signature "paddle.fluid" > signature.txt ./print_signature "paddle.fluid,paddle.reader" > signature.txt
""" """
from __future__ import print_function from __future__ import print_function
...@@ -43,7 +43,8 @@ def visit_member(parent_name, member): ...@@ -43,7 +43,8 @@ def visit_member(parent_name, member):
line.strip() for line in pydoc.render_doc(member).split('\n') line.strip() for line in pydoc.render_doc(member).split('\n')
if "->" in line if "->" in line
]) ])
elif inspect.isgetsetdescriptor(member):
return
else: else:
raise RuntimeError("Unsupported generate signature of member, type {0}". raise RuntimeError("Unsupported generate signature of member, type {0}".
format(str(type(member)))) format(str(type(member))))
...@@ -63,7 +64,9 @@ def visit_all_module(mod): ...@@ -63,7 +64,9 @@ def visit_all_module(mod):
visit_member(mod.__name__, instance) visit_member(mod.__name__, instance)
visit_all_module(importlib.import_module(sys.argv[1])) modules = sys.argv[1].split(",")
for m in modules:
visit_all_module(importlib.import_module(m))
for name in member_dict: for name in member_dict:
print(name, member_dict[name]) print(name, member_dict[name])
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册