提交 64ad051b 编写于 作者: S sneaxiy

merge develop

test=develop
......@@ -32,6 +32,8 @@ IF(NOT ${WITH_NGRAPH})
return()
ENDIF()
INCLUDE(GNUInstallDirs)
INCLUDE(ExternalProject)
SET(NGRAPH_PROJECT "extern_ngraph")
......@@ -40,10 +42,14 @@ SET(NGRAPH_GIT_TAG "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0")
SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph)
SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph)
SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include)
SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
SET(NGRAPH_SHARED_LIB_NAME libngraph.so.${NGRAPH_VERSION})
SET(NGRAPH_CPU_LIB_NAME libcpu_backend.so)
SET(NGRAPH_TBB_LIB_NAME libtbb.so.2)
SET(NGRAPH_GIT_REPO "https://github.com/NervanaSystems/ngraph.git")
SET(NGRAPH_SHARED_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME})
SET(NGRAPH_CPU_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME})
SET(NGRAPH_TBB_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME})
ExternalProject_Add(
${NGRAPH_PROJECT}
......@@ -63,18 +69,6 @@ ExternalProject_Add(
CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib
)
if(UNIX AND NOT APPLE)
include(GNUInstallDirs)
SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
else()
SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/lib)
endif()
MESSAGE(STATUS "nGraph lib will be installed at: ${NGRAPH_LIB_DIR}")
SET(NGRAPH_SHARED_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME})
SET(NGRAPH_CPU_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME})
SET(NGRAPH_TBB_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME})
# Workaround for nGraph expecting mklml to be in mkldnn install directory.
ExternalProject_Add_Step(
${NGRAPH_PROJECT}
......
......@@ -129,6 +129,15 @@ if (WITH_MKLDNN)
)
endif ()
if (WITH_NGRAPH)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/ngraph")
copy(ngraph_lib
SRCS ${NGRAPH_INC_DIR} ${NGRAPH_LIB_DIR}
DSTS ${dst_dir} ${dst_dir}
DEPS ngraph
)
endif ()
if (NOT WIN32)
if (NOT MOBILE_INFERENCE AND NOT RPI)
set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
......
......@@ -32,6 +32,13 @@ paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.c
paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core.ParallelExecutor.BuildStrategy) -> None
paddle.fluid.create_lod_tensor ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None)
paddle.fluid.create_random_int_lodtensor ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DataFeedDesc.__init__ ArgSpec(args=['self', 'proto_file'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DataFeedDesc.desc ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DataFeedDesc.set_batch_size ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DataFeedDesc.set_dense_slots ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.DataFeedDesc.set_use_slots ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None)
paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'debug'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.io.save_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None))
......@@ -69,7 +76,7 @@ paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name']
paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False))
paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False))
paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
paddle.fluid.layers.conv3d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
......@@ -175,7 +182,7 @@ paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None,
paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name'], varargs=None, keywords=None, defaults=(-100, None))
paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
......@@ -187,6 +194,7 @@ paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=Non
paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1))
paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None)
......
......@@ -34,6 +34,7 @@ add_subdirectory(ir)
add_subdirectory(details)
# ddim lib
proto_library(framework_proto SRCS framework.proto)
proto_library(async_executor_proto SRCS data_feed.proto)
cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
......@@ -126,8 +127,9 @@ cc_library(version SRCS version.cc)
cc_test(version_test SRCS version_test.cc DEPS version)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto)
if(NOT WIN32)
cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler)
endif(NOT WIN32)
......@@ -135,7 +137,7 @@ endif(NOT WIN32)
cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
py_proto_compile(framework_py_proto SRCS framework.proto)
py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto)
# Generate an empty __init__.py to make framework_py_proto as a valid python module.
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(framework_py_proto framework_py_proto_init)
......@@ -157,18 +159,19 @@ endif(NOT WIN32)
cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor)
cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
if(WITH_DISTRIBUTE)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
else()
if(NOT WIN32)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper)
else(NOT WIN32)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
endif(NOT WIN32)
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif()
......@@ -176,8 +179,11 @@ endif()
cc_library(parallel_executor SRCS parallel_executor.cc DEPS
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
graph build_strategy
fast_threaded_ssa_graph_executor)
fast_threaded_ssa_graph_executor variable_helper)
cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper)
cc_test(data_feed_test SRCS data_feed_test.cc DEPS async_executor)
cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/async_executor.h"
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "google/protobuf/message.h"
#include "google/protobuf/text_format.h"
#include "gflags/gflags.h"
#include "paddle/fluid/framework/data_feed_factory.h"
#include "paddle/fluid/framework/executor_thread_worker.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/pybind/pybind.h"
namespace paddle {
namespace framework {
AsyncExecutor::AsyncExecutor(Scope* scope, const platform::Place& place)
: root_scope_(scope), place_(place) {}
void AsyncExecutor::CreateThreads(
ExecutorThreadWorker* worker, const ProgramDesc& main_program,
const std::shared_ptr<DataFeed>& reader,
const std::vector<std::string>& fetch_var_names, Scope* root_scope,
const int thread_index, const bool debug) {
worker->SetThreadId(thread_index);
worker->SetDebug(debug);
worker->SetRootScope(root_scope);
worker->CreateThreadResource(main_program, place_);
worker->SetDataFeed(reader);
worker->SetFetchVarNames(fetch_var_names);
worker->BindingDataFeedMemory();
}
void PrepareReaders(std::vector<std::shared_ptr<DataFeed>>& readers, // NOLINT
const int thread_num, const DataFeedDesc& data_feed_desc,
const std::vector<std::string>& filelist) {
readers.resize(thread_num);
for (size_t i = 0; i < readers.size(); ++i) {
readers[i] = DataFeedFactory::CreateDataFeed(data_feed_desc.name());
readers[i]->Init(data_feed_desc); // set batch_size and queue_size here
}
readers[0]->SetFileList(filelist);
}
void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
const std::string& data_feed_desc_str,
const std::vector<std::string>& filelist,
const int thread_num,
const std::vector<std::string>& fetch_var_names,
const bool debug) {
std::vector<std::thread> threads;
auto& block = main_program.Block(0);
for (auto var_name : fetch_var_names) {
auto var_desc = block.FindVar(var_name);
auto shapes = var_desc->GetShape();
PADDLE_ENFORCE(shapes[shapes.size() - 1] == 1,
"var %s: Fetched var has wrong shape, "
"only variables with the last dimension size 1 supported",
var_name);
}
DataFeedDesc data_feed_desc;
google::protobuf::TextFormat::ParseFromString(data_feed_desc_str,
&data_feed_desc);
int actual_thread_num = thread_num;
int file_cnt = filelist.size();
PADDLE_ENFORCE(file_cnt > 0, "File list cannot be empty");
if (actual_thread_num > file_cnt) {
VLOG(1) << "Thread num = " << thread_num << ", file num = " << file_cnt
<< ". Changing thread_num = " << file_cnt;
actual_thread_num = file_cnt;
}
/*
readerDesc: protobuf description for reader initlization
argument: class_name, batch_size, use_slot, queue_size, buffer_size,
padding_index
reader:
1) each thread has a reader, reader will read input data and
put it into input queue
2) each reader has a Next() iterface, that can fetch an instance
from the input queue
*/
// todo: should be factory method for creating datafeed
std::vector<std::shared_ptr<DataFeed>> readers;
PrepareReaders(readers, actual_thread_num, data_feed_desc, filelist);
std::vector<std::shared_ptr<ExecutorThreadWorker>> workers;
workers.resize(actual_thread_num);
for (auto& worker : workers) {
worker.reset(new ExecutorThreadWorker);
}
// prepare thread resource here
for (int thidx = 0; thidx < actual_thread_num; ++thidx) {
CreateThreads(workers[thidx].get(), main_program, readers[thidx],
fetch_var_names, root_scope_, thidx, debug);
}
// start executing ops in multiple threads
for (int thidx = 0; thidx < actual_thread_num; ++thidx) {
threads.push_back(
std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get()));
}
for (auto& th : threads) {
th.join();
}
root_scope_->DropKids();
return;
}
} // einit_modelnd namespace framework
} // end namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <set>
#include <string>
#include <thread> // NOLINT
#include <typeinfo>
#include <vector>
#include "paddle/fluid/framework/data_feed.pb.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/executor_thread_worker.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
class AsyncExecutor {
public:
AsyncExecutor(Scope* scope, const platform::Place& place);
virtual ~AsyncExecutor() {}
void RunFromFile(const ProgramDesc& main_program,
const std::string& data_feed_desc_str,
const std::vector<std::string>& filelist,
const int thread_num,
const std::vector<std::string>& fetch_names,
const bool debug = false);
private:
void CreateThreads(ExecutorThreadWorker* worker,
const ProgramDesc& main_program,
const std::shared_ptr<DataFeed>& reader,
const std::vector<std::string>& fetch_var_names,
Scope* root_scope, const int thread_index,
const bool debug);
public:
Scope* root_scope_;
platform::Place place_;
};
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "google/protobuf/message.h"
#include "google/protobuf/text_format.h"
#include "gflags/gflags.h"
#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
namespace paddle {
namespace framework {
std::vector<std::string> DataFeed::filelist_;
size_t DataFeed::file_idx_;
std::mutex DataFeed::mutex_for_pick_file_;
bool DataFeed::finish_set_filelist_;
void DataFeed::AddFeedVar(Variable* var, const std::string& name) {
CheckInit();
for (size_t i = 0; i < use_slots_.size(); ++i) {
if (name == use_slots_[i]) {
if (use_slots_is_dense_[i]) {
feed_vec_[i] = MixTensor(var->GetMutable<Tensor>());
} else {
feed_vec_[i] = MixTensor(var->GetMutable<LoDTensor>());
}
}
}
}
bool DataFeed::SetFileList(const std::vector<std::string>& files) {
std::unique_lock<std::mutex> lock(mutex_for_pick_file_);
CheckInit();
if (finish_set_filelist_) {
VLOG(3) << "info: you have set the filelist.";
return false;
}
PADDLE_ENFORCE(files.size(), "You have set an empty filelist.");
filelist_.assign(files.begin(), files.end());
file_idx_ = 0;
finish_set_filelist_ = true;
return true;
}
void DataFeed::SetBatchSize(int batch_size) {
PADDLE_ENFORCE(batch_size > 0, "Illegal batch size: %d.", batch_size);
default_batch_size_ = batch_size;
}
bool DataFeed::PickOneFile(std::string* filename) {
std::unique_lock<std::mutex> lock(mutex_for_pick_file_);
if (file_idx_ == filelist_.size()) {
return false;
}
*filename = filelist_[file_idx_++];
return true;
}
void DataFeed::CheckInit() {
PADDLE_ENFORCE(finish_init_, "Initialization did not succeed.");
}
void DataFeed::CheckSetFileList() {
PADDLE_ENFORCE(finish_set_filelist_, "Set filelist did not succeed.");
}
void DataFeed::CheckStart() {
PADDLE_ENFORCE(finish_start_, "Datafeed has not started running yet.");
}
template <typename T>
void PrivateQueueDataFeed<T>::SetQueueSize(int queue_size) {
PADDLE_ENFORCE(queue_size > 0, "Illegal queue size: %d.", queue_size);
queue_size_ = queue_size;
queue_ = std::unique_ptr<paddle::operators::reader::BlockingQueue<T>>(
new paddle::operators::reader::BlockingQueue<T>(queue_size_));
}
template <typename T>
bool PrivateQueueDataFeed<T>::Start() {
CheckSetFileList();
read_thread_ = std::thread(&PrivateQueueDataFeed::ReadThread, this);
read_thread_.detach();
finish_start_ = true;
return true;
}
template <typename T>
void PrivateQueueDataFeed<T>::ReadThread() {
std::string filename;
while (PickOneFile(&filename)) {
file_.open(filename.c_str()); // is_text_feed
PADDLE_ENFORCE(file_.good(), "Open file<%s> fail.", filename.c_str());
T instance;
while (ParseOneInstance(&instance)) {
queue_->Send(instance);
}
file_.close();
}
queue_->Close();
}
template <typename T>
int PrivateQueueDataFeed<T>::Next() {
CheckStart();
int index = 0;
T instance;
T ins_vec;
while (index < default_batch_size_) {
if (!queue_->Receive(&instance)) {
break;
}
AddInstanceToInsVec(&ins_vec, instance, index++);
}
batch_size_ = index;
if (batch_size_ != 0) {
PutToFeedVec(ins_vec);
}
return batch_size_;
}
#ifdef _WIN32
template class PrivateQueueDataFeed<std::vector<MultiSlotType>>;
#endif
void MultiSlotDataFeed::Init(
const paddle::framework::DataFeedDesc& data_feed_desc) {
finish_init_ = false;
finish_set_filelist_ = false;
finish_start_ = false;
PADDLE_ENFORCE(data_feed_desc.has_multi_slot_desc(),
"Multi_slot_desc has not been set.");
paddle::framework::MultiSlotDesc multi_slot_desc =
data_feed_desc.multi_slot_desc();
SetBatchSize(data_feed_desc.batch_size());
SetQueueSize(data_feed_desc.batch_size());
size_t all_slot_num = multi_slot_desc.slots_size();
all_slots_.resize(all_slot_num);
all_slots_type_.resize(all_slot_num);
use_slots_index_.resize(all_slot_num);
use_slots_.clear();
use_slots_is_dense_.clear();
for (size_t i = 0; i < all_slot_num; ++i) {
const auto& slot = multi_slot_desc.slots(i);
all_slots_[i] = slot.name();
all_slots_type_[i] = slot.type();
use_slots_index_[i] = slot.is_used() ? use_slots_.size() : -1;
if (slot.is_used()) {
use_slots_.push_back(all_slots_[i]);
use_slots_is_dense_.push_back(slot.is_dense());
}
}
feed_vec_.resize(use_slots_.size());
finish_init_ = true;
}
bool MultiSlotDataFeed::CheckFile(const char* filename) {
CheckInit(); // get info of slots
std::ifstream fin(filename);
if (!fin.good()) {
VLOG(1) << "error: open file<" << filename << "> fail";
return false;
}
std::string line;
int instance_cout = 0;
std::string all_slots_alias = "";
for (const auto& alias : all_slots_) {
all_slots_alias += alias + " ";
}
std::string use_slots_alias = "";
for (const auto& alias : use_slots_) {
use_slots_alias += alias + " ";
}
VLOG(3) << "total slots num: " << all_slots_.size();
VLOG(3) << "total slots alias: " << all_slots_alias;
VLOG(3) << "used slots num: " << use_slots_.size();
VLOG(3) << "used slots alias: " << use_slots_alias;
while (getline(fin, line)) {
++instance_cout;
const char* str = line.c_str();
char* endptr = const_cast<char*>(str);
int len = line.length();
for (size_t i = 0; i < all_slots_.size(); ++i) {
int num = strtol(endptr, &endptr, 10);
if (num < 0) {
VLOG(0) << "error: the number of ids is a negative number: " << num;
VLOG(0) << "please check line<" << instance_cout << "> in file<"
<< filename << ">";
return false;
} else if (num == 0) {
VLOG(0)
<< "error: the number of ids can not be zero, you need "
"padding it in data generator; or if there is something wrong"
" with the data, please check if the data contains unresolvable "
"characters.";
VLOG(0) << "please check line<" << instance_cout << "> in file<"
<< filename << ">";
return false;
} else if (errno == ERANGE || num > INT_MAX) {
VLOG(0) << "error: the number of ids greater than INT_MAX";
VLOG(0) << "please check line<" << instance_cout << "> in file<"
<< filename << ">";
return false;
}
if (all_slots_type_[i] == "float") {
for (int i = 0; i < num; ++i) {
strtof(endptr, &endptr);
if (errno == ERANGE) {
VLOG(0) << "error: the value is out of the range of "
"representable values for float";
VLOG(0) << "please check line<" << instance_cout << "> in file<"
<< filename << ">";
return false;
}
if (i + 1 != num && endptr - str == len) {
VLOG(0) << "error: there is a wrong with the number of ids.";
VLOG(0) << "please check line<" << instance_cout << "> in file<"
<< filename << ">";
return false;
}
}
} else if (all_slots_type_[i] == "uint64") {
for (int i = 0; i < num; ++i) {
strtoull(endptr, &endptr, 10);
if (errno == ERANGE) {
VLOG(0) << "error: the value is out of the range of "
"representable values for uint64_t";
VLOG(0) << "please check line<" << instance_cout << "> in file<"
<< filename << ">";
return false;
}
if (i + 1 != num && endptr - str == len) {
VLOG(0) << "error: there is a wrong with the number of ids.";
VLOG(0) << "please check line<" << instance_cout << "> in file<"
<< filename << ">";
return false;
}
}
} else {
VLOG(0) << "error: this type<" << all_slots_type_[i]
<< "> is not supported";
return false;
}
}
// It may be added '\t' character to the end of the output of reduce
// task when processes data by Hadoop(when the output of the reduce
// task of Hadoop has only one field, it will add a '\t' at the end
// of the line by default, and you can use this option to avoid it:
// `-D mapred.textoutputformat.ignoreseparator=true`), which does
// not affect the correctness of the data. Therefore, it should be
// judged that the data is not normal when the end of each line of
// data contains characters which are not spaces.
while (endptr - str != len) {
if (!isspace(*(endptr++))) {
VLOG(0)
<< "error: there is some extra characters at the end of the line.";
VLOG(0) << "please check line<" << instance_cout << "> in file<"
<< filename << ">";
return false;
}
}
}
VLOG(3) << "instances cout: " << instance_cout;
VLOG(3) << "The file format is correct";
return true;
}
bool MultiSlotDataFeed::ParseOneInstance(std::vector<MultiSlotType>* instance) {
std::string line;
if (getline(file_, line)) {
int use_slots_num = use_slots_.size();
instance->resize(use_slots_num);
// parse line
const char* str = line.c_str();
char* endptr = const_cast<char*>(str);
int pos = 0;
for (size_t i = 0; i < use_slots_index_.size(); ++i) {
int idx = use_slots_index_[i];
int num = strtol(&str[pos], &endptr, 10);
PADDLE_ENFORCE(
num,
"The number of ids can not be zero, you need padding "
"it in data generator; or if there is something wrong with "
"the data, please check if the data contains unresolvable "
"characters.\nplease check this error line: %s",
str);
if (idx != -1) {
(*instance)[idx].Init(all_slots_type_[i]);
if ((*instance)[idx].GetType()[0] == 'f') { // float
for (int j = 0; j < num; ++j) {
float feasign = strtof(endptr, &endptr);
(*instance)[idx].AddValue(feasign);
}
} else if ((*instance)[idx].GetType()[0] == 'u') { // uint64
for (int j = 0; j < num; ++j) {
uint64_t feasign = (uint64_t)strtoull(endptr, &endptr, 10);
(*instance)[idx].AddValue(feasign);
}
}
pos = endptr - str;
} else {
for (int j = 0; j <= num; ++j) {
pos = line.find_first_of(' ', pos + 1);
}
}
}
} else {
return false;
}
return true;
}
void MultiSlotDataFeed::AddInstanceToInsVec(
std::vector<MultiSlotType>* ins_vec,
const std::vector<MultiSlotType>& instance, int index) {
if (index == 0) {
ins_vec->resize(instance.size());
for (size_t i = 0; i < instance.size(); ++i) {
(*ins_vec)[i].Init(instance[i].GetType());
(*ins_vec)[i].InitOffset();
}
}
for (size_t i = 0; i < instance.size(); ++i) {
(*ins_vec)[i].AddIns(instance[i]);
}
}
void MultiSlotDataFeed::PutToFeedVec(
const std::vector<MultiSlotType>& ins_vec) {
for (size_t i = 0; i < use_slots_.size(); ++i) {
const auto& type = ins_vec[i].GetType();
const auto& offset = ins_vec[i].GetOffset();
int total_instance = static_cast<int>(offset.back());
if (type[0] == 'f') { // float
const auto& feasign = ins_vec[i].GetFloatData();
if (feed_vec_[i].IsDense()) {
int size_in_each_batch = total_instance / batch_size_;
float* tensor_ptr = feed_vec_[i].GetTensor()->mutable_data<float>(
{batch_size_, size_in_each_batch}, platform::CPUPlace());
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float));
} else {
float* tensor_ptr = feed_vec_[i].GetLoDTensor()->mutable_data<float>(
{total_instance, 1}, platform::CPUPlace());
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float));
LoD data_lod{offset};
feed_vec_[i].GetLoDTensor()->set_lod(data_lod);
}
} else if (type[0] == 'u') { // uint64
// no uint64_t type in paddlepaddle
const auto& feasign = ins_vec[i].GetUint64Data();
if (feed_vec_[i].IsDense()) {
int size_in_each_batch = total_instance / batch_size_;
int64_t* tensor_ptr = feed_vec_[i].GetTensor()->mutable_data<int64_t>(
{batch_size_, size_in_each_batch}, platform::CPUPlace());
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t));
} else {
int64_t* tensor_ptr =
feed_vec_[i].GetLoDTensor()->mutable_data<int64_t>(
{total_instance, 1}, platform::CPUPlace());
memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t));
LoD data_lod{offset};
feed_vec_[i].GetLoDTensor()->set_lod(data_lod);
}
}
}
}
} // namespace framework
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <fstream>
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "paddle/fluid/framework/data_feed.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/operators/reader/blocking_queue.h"
namespace paddle {
namespace framework {
// Pack Tensor type and LoDTensor type into MixTensor type, in order
// to record either Tensor or LoDTensor information at the same time.
class MixTensor {
public:
MixTensor() {}
explicit MixTensor(LoDTensor* lodtensor) {
is_dense_ = false;
lodtensor_ = lodtensor;
}
explicit MixTensor(Tensor* tensor) {
is_dense_ = true;
tensor_ = tensor;
}
bool IsDense() { return is_dense_; }
LoDTensor* GetLoDTensor() {
PADDLE_ENFORCE(!is_dense_, "Let a dense var return a LoDTensor ptr.");
return lodtensor_;
}
Tensor* GetTensor() {
PADDLE_ENFORCE(is_dense_, "Let a sparse var return a Tensor ptr.");
return tensor_;
}
private:
bool is_dense_;
LoDTensor* lodtensor_;
Tensor* tensor_;
};
// DataFeed is the base virtual class for all ohther DataFeeds.
// It is used to read files and parse the data for subsequent trainer.
// Example:
// DataFeed* reader =
// paddle::framework::DataFeedFactory::CreateDataFeed(data_feed_name);
// reader->Init(data_feed_desc); // data_feed_desc is a protobuf object
// reader->SetFileList(filelist);
// const std::vector<std::string> & use_slot_alias =
// reader->GetUseSlotAlias();
// for (auto name: use_slot_alias){ // for binding memory
// reader->AddFeedVar(scope->Var(name), name);
// }
// reader->Start();
// while (reader->Next()) {
// // trainer do something
// }
class DataFeed {
public:
DataFeed() {}
virtual ~DataFeed() {}
virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc) = 0;
virtual bool CheckFile(const char* filename) {
PADDLE_THROW("This function(CheckFile) is not implemented.");
}
// Set filelist for DataFeed.
// Pay attention that it must init all readers before call this function.
// Otherwise, Init() function will init finish_set_filelist_ flag.
virtual bool SetFileList(const std::vector<std::string>& files);
virtual bool Start() = 0;
// The trainer calls the Next() function, and the DataFeed will load a new
// batch to the feed_vec. The return value of this function is the batch
// size of the current batch.
virtual int Next() = 0;
// Get all slots' alias which defined in protofile
virtual const std::vector<std::string>& GetAllSlotAlias() {
return all_slots_;
}
// Get used slots' alias which defined in protofile
virtual const std::vector<std::string>& GetUseSlotAlias() {
return use_slots_;
}
// This function is used for binding feed_vec memory
virtual void AddFeedVar(Variable* var, const std::string& name);
protected:
// The following three functions are used to check if it is executed in this
// order:
// Init() -> SetFileList() -> Start() -> Next()
virtual void CheckInit();
virtual void CheckSetFileList();
virtual void CheckStart();
virtual void SetBatchSize(
int batch); // batch size will be set in Init() function
// This function is used to pick one file from the global filelist(thread
// safe).
virtual bool PickOneFile(std::string* filename);
static std::vector<std::string> filelist_;
static size_t file_idx_;
static std::mutex mutex_for_pick_file_;
// the alias of used slots, and its order is determined by
// data_feed_desc(proto object)
std::vector<std::string> use_slots_;
std::vector<bool> use_slots_is_dense_;
// the alias of all slots, and its order is determined by data_feed_desc(proto
// object)
std::vector<std::string> all_slots_;
std::vector<std::string> all_slots_type_;
std::vector<int>
use_slots_index_; // -1: not used; >=0: the index of use_slots_
// The data read by DataFeed will be stored here
std::vector<MixTensor> feed_vec_;
// the batch size defined by user
int default_batch_size_;
// current batch size
int batch_size_;
bool finish_init_;
static bool finish_set_filelist_;
bool finish_start_;
};
// PrivateQueueDataFeed is the base virtual class for ohther DataFeeds.
// It use a read-thread to read file and parse data to a private-queue
// (thread level), and get data from this queue when trainer call Next().
template <typename T>
class PrivateQueueDataFeed : public DataFeed {
public:
PrivateQueueDataFeed() {}
virtual ~PrivateQueueDataFeed() {}
virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc) = 0;
virtual bool Start();
virtual int Next();
protected:
// The thread implementation function for reading file and parse.
virtual void ReadThread();
// This function is used to set private-queue size, and the most
// efficient when the queue size is close to the batch size.
virtual void SetQueueSize(int queue_size);
// The reading and parsing method called in the ReadThread.
virtual bool ParseOneInstance(T* instance) = 0;
// This function is used to put instance to vec_ins
virtual void AddInstanceToInsVec(T* vec_ins, const T& instance,
int index) = 0;
// This function is used to put ins_vec to feed_vec
virtual void PutToFeedVec(const T& ins_vec) = 0;
// The thread for read files
std::thread read_thread_;
// using ifstream one line and one line parse is faster
// than using fread one buffer and one buffer parse.
// for a 601M real data:
// ifstream one line and one line parse: 6034 ms
// fread one buffer and one buffer parse: 7097 ms
std::ifstream file_;
size_t queue_size_;
// The queue for store parsed data
std::unique_ptr<paddle::operators::reader::BlockingQueue<T>> queue_;
};
// This class define the data type of instance(ins_vec) in MultiSlotDataFeed
class MultiSlotType {
public:
MultiSlotType() {}
~MultiSlotType() {}
void Init(const std::string& type) {
CheckType(type);
if (type_[0] == 'f') {
float_feasign_.clear();
} else if (type_[0] == 'u') {
uint64_feasign_.clear();
}
type_ = type;
}
void InitOffset() {
offset_.resize(1);
// LoDTensor' lod is counted from 0, the size of lod
// is one size larger than the size of data.
offset_[0] = 0;
}
const std::vector<size_t>& GetOffset() const { return offset_; }
void AddValue(const float v) {
CheckFloat();
float_feasign_.push_back(v);
}
void AddValue(const uint64_t v) {
CheckUint64();
uint64_feasign_.push_back(v);
}
void AddIns(const MultiSlotType& ins) {
if (ins.GetType()[0] == 'f') { // float
CheckFloat();
auto& vec = ins.GetFloatData();
offset_.push_back(offset_.back() + vec.size());
float_feasign_.insert(float_feasign_.end(), vec.begin(), vec.end());
} else if (ins.GetType()[0] == 'u') { // uint64
CheckUint64();
auto& vec = ins.GetUint64Data();
offset_.push_back(offset_.back() + vec.size());
uint64_feasign_.insert(uint64_feasign_.end(), vec.begin(), vec.end());
}
}
const std::vector<float>& GetFloatData() const { return float_feasign_; }
const std::vector<uint64_t>& GetUint64Data() const { return uint64_feasign_; }
const std::string& GetType() const { return type_; }
private:
void CheckType(const std::string& type) const {
PADDLE_ENFORCE((type == "uint64") || (type == "float"),
"There is no this type<%s>.", type);
}
void CheckFloat() const {
PADDLE_ENFORCE(type_[0] == 'f', "Add %s value to float slot.", type_);
}
void CheckUint64() const {
PADDLE_ENFORCE(type_[0] == 'u', "Add %s value to uint64 slot.", type_);
}
std::vector<float> float_feasign_;
std::vector<uint64_t> uint64_feasign_;
std::string type_;
std::vector<size_t> offset_;
};
// This DataFeed is used to feed multi-slot type data.
// The format of multi-slot type data:
// [n feasign_0 feasign_1 ... feasign_n]*
class MultiSlotDataFeed
: public PrivateQueueDataFeed<std::vector<MultiSlotType>> {
public:
MultiSlotDataFeed() {}
virtual ~MultiSlotDataFeed() {}
virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc);
virtual bool CheckFile(const char* filename);
protected:
virtual void AddInstanceToInsVec(std::vector<MultiSlotType>* vec_ins,
const std::vector<MultiSlotType>& instance,
int index);
virtual bool ParseOneInstance(std::vector<MultiSlotType>* instance);
virtual void PutToFeedVec(const std::vector<MultiSlotType>& ins_vec);
};
} // namespace framework
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
syntax = "proto2";
package paddle.framework;
message Slot {
required string name = 1;
required string type = 2;
optional bool is_dense = 3 [ default = false ];
optional bool is_used = 4 [ default = false ];
}
message MultiSlotDesc { repeated Slot slots = 1; }
message DataFeedDesc {
optional string name = 1;
optional int32 batch_size = 2 [ default = 32 ];
optional MultiSlotDesc multi_slot_desc = 3;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/data_feed_factory.h"
#include <memory>
#include <string>
#include <unordered_map>
#include "paddle/fluid/framework/data_feed.h"
namespace paddle {
namespace framework {
typedef std::shared_ptr<DataFeed> (*Createdata_feedFunction)();
typedef std::unordered_map<std::string, Createdata_feedFunction> data_feedMap;
data_feedMap g_data_feed_map;
#define REGISTER_DATAFEED_CLASS(data_feed_class) \
namespace { \
std::shared_ptr<DataFeed> Creator_##data_feed_class() { \
return std::shared_ptr<DataFeed>(new data_feed_class); \
} \
class __Registerer_##data_feed_class { \
public: \
__Registerer_##data_feed_class() { \
g_data_feed_map[#data_feed_class] = &Creator_##data_feed_class; \
} \
}; \
__Registerer_##data_feed_class g_registerer_##data_feed_class; \
} // namespace
std::string DataFeedFactory::DataFeedTypeList() {
std::string data_feed_types;
for (auto iter = g_data_feed_map.begin(); iter != g_data_feed_map.end();
++iter) {
if (iter != g_data_feed_map.begin()) {
data_feed_types += ", ";
}
data_feed_types += iter->first;
}
return data_feed_types;
}
std::shared_ptr<DataFeed> DataFeedFactory::CreateDataFeed(
std::string data_feed_class) {
if (g_data_feed_map.count(data_feed_class) < 1) {
exit(-1);
}
return g_data_feed_map[data_feed_class]();
}
REGISTER_DATAFEED_CLASS(MultiSlotDataFeed);
} // namespace framework
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include "paddle/fluid/framework/data_feed.h"
namespace paddle {
namespace framework {
class DataFeedFactory {
public:
static std::string DataFeedTypeList();
static std::shared_ptr<DataFeed> CreateDataFeed(std::string data_feed_class);
};
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/data_feed.h"
#include <fcntl.h>
#include <chrono> // NOLINT
#include <fstream>
#include <iostream>
#include <map>
#include <mutex> // NOLINT
#include <set>
#include <thread> // NOLINT
#include <utility>
#include <vector>
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "google/protobuf/text_format.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/data_feed_factory.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
paddle::framework::DataFeedDesc load_datafeed_param_from_file(
const char* filename) {
paddle::framework::DataFeedDesc data_feed_desc;
int file_descriptor = open(filename, O_RDONLY);
PADDLE_ENFORCE(file_descriptor != -1, "Can not open %s.", filename);
google::protobuf::io::FileInputStream fileInput(file_descriptor);
google::protobuf::TextFormat::Parse(&fileInput, &data_feed_desc);
close(file_descriptor);
return data_feed_desc;
}
const std::vector<std::string> load_filelist_from_file(const char* filename) {
std::vector<std::string> filelist;
std::ifstream fin(filename);
PADDLE_ENFORCE(fin.good(), "Can not open %s.", filename);
std::string line;
while (getline(fin, line)) {
filelist.push_back(line);
}
fin.close();
return filelist;
}
void GenerateFileForTest(const char* protofile, const char* filelist) {
std::ofstream w_protofile(protofile);
w_protofile << "name: \"MultiSlotDataFeed\"\n"
"batch_size: 2\n"
"multi_slot_desc {\n"
" slots {\n"
" name: \"uint64_sparse_slot\"\n"
" type: \"uint64\"\n"
" is_dense: false\n"
" is_used: true\n"
" }\n"
" slots {\n"
" name: \"float_sparse_slot\"\n"
" type: \"float\"\n"
" is_dense: false\n"
" is_used: true\n"
" }\n"
" slots {\n"
" name: \"uint64_dense_slot\"\n"
" type: \"uint64\"\n"
" is_dense: true\n"
" is_used: true\n"
" }\n"
" slots {\n"
" name: \"float_dense_slot\"\n"
" type: \"float\"\n"
" is_dense: true\n"
" is_used: true\n"
" }\n"
" slots {\n"
" name: \"not_used_slot\"\n"
" type: \"uint64\"\n"
" is_dense: false\n"
" is_used: false\n"
" }\n"
"}";
w_protofile.close();
std::ofstream w_filelist(filelist);
int total_file = 4;
for (int i = 0; i < total_file; ++i) {
std::string filename = "TestMultiSlotDataFeed.data." + std::to_string(i);
w_filelist << filename;
if (i + 1 != total_file) {
w_filelist << std::endl;
}
std::ofstream w_datafile(filename.c_str());
w_datafile << "3 3978 620 82 1 1926.08 1 1926 1 6.02 1 1996\n"
"2 1300 2983353 1 985.211 1 8 1 0.618 1 12\n"
"1 19260827 2 3.14 2.718 1 27 1 2.236 1 28\n";
w_datafile.close();
}
w_filelist.close();
}
class MultiTypeSet {
public:
MultiTypeSet() {
uint64_set_.clear();
float_set_.clear();
}
~MultiTypeSet() {}
void AddValue(uint64_t v) { uint64_set_.insert(v); }
void AddValue(float v) { float_set_.insert(v); }
const std::set<uint64_t>& GetUint64Set() const { return uint64_set_; }
const std::set<float>& GetFloatSet() const { return float_set_; }
private:
std::set<uint64_t> uint64_set_;
std::set<float> float_set_;
};
void GetElemSetFromReader(std::vector<MultiTypeSet>* reader_elem_set,
const paddle::framework::DataFeedDesc& data_feed_desc,
const std::vector<std::string>& filelist,
const int thread_num) {
int used_slot_num = 0;
for (auto i = 0; i < data_feed_desc.multi_slot_desc().slots_size(); ++i) {
if (data_feed_desc.multi_slot_desc().slots(i).is_used()) {
++used_slot_num;
}
}
reader_elem_set->resize(used_slot_num);
std::vector<std::thread> threads;
std::vector<std::shared_ptr<paddle::framework::DataFeed>> readers;
readers.resize(thread_num);
for (int i = 0; i < thread_num; ++i) {
readers[i] = paddle::framework::DataFeedFactory::CreateDataFeed(
data_feed_desc.name());
readers[i]->Init(data_feed_desc);
}
readers[0]->SetFileList(filelist);
std::mutex mu;
for (int idx = 0; idx < thread_num; ++idx) {
threads.emplace_back(std::thread([&, idx] {
std::unique_ptr<paddle::framework::Scope> scope(
new paddle::framework::Scope());
const auto& multi_slot_desc = data_feed_desc.multi_slot_desc();
std::map<std::string, const paddle::framework::LoDTensor*>
lodtensor_targets;
std::map<std::string, const paddle::framework::Tensor*> tensor_targets;
for (int i = 0; i < multi_slot_desc.slots_size(); ++i) {
const auto& slot = multi_slot_desc.slots(i);
if (slot.is_used()) {
const auto& name = slot.name();
readers[idx]->AddFeedVar(scope->Var(name), name);
if (slot.is_dense()) {
tensor_targets[name] =
&scope->FindVar(name)->Get<paddle::framework::Tensor>();
} else {
lodtensor_targets[name] =
&scope->FindVar(name)->Get<paddle::framework::LoDTensor>();
}
}
}
readers[idx]->Start();
while (readers[idx]->Next()) {
int index = 0;
for (int k = 0; k < multi_slot_desc.slots_size(); ++k) {
const auto& slot = multi_slot_desc.slots(k);
if (!slot.is_used()) {
continue;
}
if (slot.is_dense()) { // dense branch
const paddle::framework::Tensor* tens = tensor_targets[slot.name()];
if (slot.type() == "uint64") {
const int64_t* data = tens->data<int64_t>();
int batch_size = tens->dims()[0];
int dim = tens->dims()[1];
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < dim; ++j) {
std::lock_guard<std::mutex> lock(mu);
(*reader_elem_set)[index].AddValue(
(uint64_t)data[i * dim + j]);
}
}
} else if (slot.type() == "float") {
const float* data = tens->data<float>();
int batch_size = tens->dims()[0];
int dim = tens->dims()[1];
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < dim; ++j) {
std::lock_guard<std::mutex> lock(mu);
(*reader_elem_set)[index].AddValue(data[i * dim + j]);
}
}
} else {
PADDLE_THROW("Error type in proto file.");
}
} else { // sparse branch
const paddle::framework::LoDTensor* tens =
lodtensor_targets[slot.name()];
if (slot.type() == "uint64") {
const int64_t* data = tens->data<int64_t>();
for (size_t i = 0; i < tens->NumElements(); ++i) {
std::pair<size_t, size_t> element = tens->lod_element(0, i);
for (size_t j = element.first; j < element.second; ++j) {
std::lock_guard<std::mutex> lock(mu);
(*reader_elem_set)[index].AddValue((uint64_t)data[j]);
}
}
} else if (slot.type() == "float") {
const float* data = tens->data<float>();
for (size_t i = 0; i < tens->NumElements(); ++i) {
std::pair<size_t, size_t> element = tens->lod_element(0, i);
for (size_t j = element.first; j < element.second; ++j) {
std::lock_guard<std::mutex> lock(mu);
(*reader_elem_set)[index].AddValue(data[j]);
}
}
} else {
PADDLE_THROW("Error type in proto file.");
}
} // end sparse branch
++index;
} // end slots loop
} // end while Next()
})); // end anonymous function
}
for (auto& th : threads) {
th.join();
}
}
void CheckIsUnorderedSame(const std::vector<MultiTypeSet>& s1,
const std::vector<MultiTypeSet>& s2) {
EXPECT_EQ(s1.size(), s2.size());
for (size_t i = 0; i < s1.size(); ++i) {
// check for uint64
const std::set<uint64_t>& uint64_s1 = s1[i].GetUint64Set();
const std::set<uint64_t>& uint64_s2 = s2[i].GetUint64Set();
EXPECT_EQ(uint64_s1.size(), uint64_s2.size());
auto uint64_it1 = uint64_s1.begin();
auto uint64_it2 = uint64_s2.begin();
while (uint64_it1 != uint64_s1.end()) {
EXPECT_EQ(*uint64_it1, *uint64_it2);
++uint64_it1;
++uint64_it2;
}
// check for float
const std::set<float>& float_s1 = s1[i].GetFloatSet();
const std::set<float>& float_s2 = s2[i].GetFloatSet();
EXPECT_EQ(float_s1.size(), float_s2.size());
auto float_it1 = float_s1.begin();
auto float_it2 = float_s2.begin();
while (float_it1 != float_s1.end()) {
EXPECT_EQ(*float_it1, *float_it2);
++float_it1;
++float_it2;
}
}
}
void GetElemSetFromFile(std::vector<MultiTypeSet>* file_elem_set,
const paddle::framework::DataFeedDesc& data_feed_desc,
const std::vector<std::string>& filelist) {
int used_slot_num = 0;
for (auto i = 0; i < data_feed_desc.multi_slot_desc().slots_size(); ++i) {
if (data_feed_desc.multi_slot_desc().slots(i).is_used()) {
++used_slot_num;
}
}
file_elem_set->resize(used_slot_num);
for (const auto& file : filelist) {
std::ifstream fin(file.c_str());
PADDLE_ENFORCE(fin.good(), "Can not open %s.", file.c_str());
while (1) {
bool end_flag = false;
int index = 0;
for (auto i = 0; i < data_feed_desc.multi_slot_desc().slots_size(); ++i) {
int num;
if (fin >> num) {
auto slot = data_feed_desc.multi_slot_desc().slots(i);
auto type = slot.type();
if (type == "uint64") {
while (num--) {
uint64_t feasign;
fin >> feasign;
if (slot.is_used()) {
(*file_elem_set)[index].AddValue(feasign);
}
}
} else if (type == "float") {
while (num--) {
float feasign;
fin >> feasign;
if (slot.is_used()) {
(*file_elem_set)[index].AddValue(feasign);
}
}
} else {
PADDLE_THROW("Error type in proto file.");
}
if (slot.is_used()) {
++index;
}
} else {
end_flag = true;
break;
}
}
if (end_flag) {
break;
}
}
fin.close();
}
}
TEST(DataFeed, MultiSlotUnitTest) {
const char* protofile = "data_feed_desc.prototxt";
const char* filelist_name = "filelist.txt";
GenerateFileForTest(protofile, filelist_name);
const std::vector<std::string> filelist =
load_filelist_from_file(filelist_name);
paddle::framework::DataFeedDesc data_feed_desc =
load_datafeed_param_from_file(protofile);
std::vector<MultiTypeSet> reader_elem_set;
std::vector<MultiTypeSet> file_elem_set;
GetElemSetFromReader(&reader_elem_set, data_feed_desc, filelist, 4);
GetElemSetFromFile(&file_elem_set, data_feed_desc, filelist);
CheckIsUnorderedSame(reader_elem_set, file_elem_set);
}
......@@ -862,7 +862,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(
if (node->Op()->Type() == "fetch_barrier") {
outvar_dev_id =
GetVarDeviceID(*result, output->Name(), *sharded_var_device);
PADDLE_ENFORCE_NE(outvar_dev_id, -1);
PADDLE_ENFORCE_NE(outvar_dev_id, -1, "output name %s", output->Name());
}
p = places_[outvar_dev_id];
ir::Node *new_node = nullptr;
......
......@@ -32,7 +32,9 @@ enum OpInfoFillType {
kOpProtoAndCheckerMaker = 1,
kGradOpDescMaker = 2,
kVarTypeInference = 3,
kShapeInference = 4
kShapeInference = 4,
kEstimateFlops = 5,
kUnknown = -1
};
template <typename T>
......@@ -48,8 +50,10 @@ struct OpInfoFillTypeID {
? kVarTypeInference
: (std::is_base_of<InferShapeBase, T>::value
? kShapeInference
: static_cast<OpInfoFillType>(
-1)))));
: (std::is_base_of<EstimateFlopsBase,
T>::value
? kEstimateFlops
: kUnknown)))));
}
};
......@@ -139,6 +143,16 @@ struct OpInfoFiller<T, kShapeInference> {
}
};
template <typename T>
struct OpInfoFiller<T, kEstimateFlops> {
void operator()(const char* op_tpe, OpInfo* info) const {
info->estimate_flops_ = [](InferShapeContext* ctx) {
T estimate_flops;
return estimate_flops(ctx);
};
}
};
} // namespace details
} // namespace framework
......
......@@ -16,7 +16,7 @@
#include <stdexcept>
#include <string>
#include <vector>
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle {
......
......@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/transfer_scope_cache.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -156,36 +157,6 @@ void Executor::Close() {
#endif
}
void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
if (var_type == proto::VarType::LOD_TENSOR) {
var->GetMutable<LoDTensor>();
} else if (var_type == proto::VarType::SELECTED_ROWS) {
var->GetMutable<SelectedRows>();
} else if (var_type == proto::VarType::FEED_MINIBATCH) {
var->GetMutable<FeedFetchList>();
} else if (var_type == proto::VarType::FETCH_LIST) {
var->GetMutable<FeedFetchList>();
} else if (var_type == proto::VarType::STEP_SCOPES) {
var->GetMutable<std::vector<framework::Scope*>>();
} else if (var_type == proto::VarType::LOD_RANK_TABLE) {
var->GetMutable<LoDRankTable>();
} else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
var->GetMutable<LoDTensorArray>();
} else if (var_type == proto::VarType::PLACE_LIST) {
var->GetMutable<platform::PlaceList>();
} else if (var_type == proto::VarType::READER) {
var->GetMutable<ReaderHolder>();
} else if (var_type == proto::VarType::RAW) {
// GetMutable will be called in operator
} else {
PADDLE_THROW(
"Variable type %d is not in "
"[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
"LOD_RANK_TABLE, PLACE_LIST, READER, RAW]",
var_type);
}
}
void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
int block_id) {
auto& global_block = pdesc.Block(block_id);
......
......@@ -26,7 +26,6 @@ limitations under the License. */
namespace paddle {
namespace framework {
extern void InitializeVariable(Variable* var, proto::VarType::Type var_type);
struct ExecutorPrepareContext {
ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id,
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/executor_thread_worker.h"
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "google/protobuf/message.h"
#include "google/protobuf/text_format.h"
#include "gflags/gflags.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/pybind/pybind.h"
namespace paddle {
namespace framework {
void ExecutorThreadWorker::CreateThreadOperators(const ProgramDesc& program) {
auto& block = program.Block(0);
op_names_.clear();
for (auto& op_desc : block.AllOps()) {
std::unique_ptr<OperatorBase> local_op = OpRegistry::CreateOp(*op_desc);
op_names_.push_back(op_desc->Type());
OperatorBase* local_op_ptr = local_op.release();
ops_.push_back(local_op_ptr);
continue;
}
}
void ExecutorThreadWorker::CreateThreadResource(
const framework::ProgramDesc& program,
const paddle::platform::Place& place) {
CreateThreadScope(program);
CreateThreadOperators(program);
SetMainProgram(program);
SetPlace(place);
}
void ExecutorThreadWorker::CreateThreadScope(const ProgramDesc& program) {
auto& block = program.Block(0);
PADDLE_ENFORCE_NOT_NULL(
root_scope_, "root_scope should be set before creating thread scope");
thread_scope_ = &root_scope_->NewScope();
for (auto& var : block.AllVars()) {
if (var->Persistable()) {
auto* ptr = root_scope_->Var(var->Name());
InitializeVariable(ptr, var->GetType());
} else {
auto* ptr = thread_scope_->Var(var->Name());
InitializeVariable(ptr, var->GetType());
}
}
}
void ExecutorThreadWorker::SetDataFeed(
const std::shared_ptr<DataFeed>& datafeed) {
thread_reader_ = datafeed;
}
void ExecutorThreadWorker::BindingDataFeedMemory() {
const std::vector<std::string>& input_feed =
thread_reader_->GetUseSlotAlias();
for (auto name : input_feed) {
thread_reader_->AddFeedVar(thread_scope_->Var(name), name);
}
}
void ExecutorThreadWorker::SetFetchVarNames(
const std::vector<std::string>& fetch_var_names) {
fetch_var_names_.clear();
fetch_var_names_.insert(fetch_var_names_.end(), fetch_var_names.begin(),
fetch_var_names.end());
}
void ExecutorThreadWorker::SetDevice() {
#if defined _WIN32 || defined __APPLE__
return;
#else
static unsigned concurrency_cap = std::thread::hardware_concurrency();
int thread_id = this->thread_id_;
if (thread_id < concurrency_cap) {
unsigned proc = thread_id;
cpu_set_t mask;
CPU_ZERO(&mask);
CPU_SET(proc, &mask);
if (-1 == sched_setaffinity(0, sizeof(mask), &mask)) {
VLOG(1) << "WARNING: Failed to set thread affinity for thread "
<< thread_id;
} else {
CPU_ZERO(&mask);
if ((0 != sched_getaffinity(0, sizeof(mask), &mask)) ||
(CPU_ISSET(proc, &mask) == 0)) {
VLOG(3) << "WARNING: Failed to set thread affinity for thread "
<< thread_id;
}
}
} else {
VLOG(1) << "WARNING: Failed to set thread affinity for thread "
<< thread_id;
}
#endif
}
template <typename T>
void print_lod_tensor(std::string var_name, const LoDTensor& lod_tensor) {
auto inspect = lod_tensor.data<T>();
auto element_num = lod_tensor.numel();
std::ostringstream sstream;
sstream << var_name << " (element num " << element_num << "): [";
sstream << inspect[0];
for (int j = 1; j < element_num; ++j) {
sstream << " " << inspect[j];
}
sstream << "]";
std::cout << sstream.str() << std::endl;
}
void print_fetch_var(Scope* scope, std::string var_name) {
const LoDTensor& tensor = scope->FindVar(var_name)->Get<LoDTensor>();
if (std::type_index(tensor.type()) ==
std::type_index(typeid(platform::float16))) {
print_lod_tensor<platform::float16>(var_name, tensor);
} else if (std::type_index(tensor.type()) == std::type_index(typeid(float))) {
print_lod_tensor<float>(var_name, tensor);
} else if (std::type_index(tensor.type()) ==
std::type_index(typeid(double))) {
print_lod_tensor<double>(var_name, tensor);
} else if (std::type_index(tensor.type()) == std::type_index(typeid(int))) {
print_lod_tensor<int>(var_name, tensor);
} else if (std::type_index(tensor.type()) ==
std::type_index(typeid(int64_t))) {
print_lod_tensor<int64_t>(var_name, tensor);
} else if (std::type_index(tensor.type()) == std::type_index(typeid(bool))) {
print_lod_tensor<bool>(var_name, tensor);
} else if (std::type_index(tensor.type()) ==
std::type_index(typeid(uint8_t))) {
print_lod_tensor<uint8_t>(var_name, tensor);
} else if (std::type_index(tensor.type()) ==
std::type_index(typeid(int16_t))) {
print_lod_tensor<int16_t>(var_name, tensor);
} else if (std::type_index(tensor.type()) ==
std::type_index(typeid(int8_t))) {
print_lod_tensor<int8_t>(var_name, tensor);
} else {
VLOG(1) << "print_fetch_var: unrecognized data type:"
<< tensor.type().name();
}
return;
}
void ExecutorThreadWorker::TrainFiles() {
// todo: configurable
SetDevice();
int fetch_var_num = fetch_var_names_.size();
fetch_values_.clear();
fetch_values_.resize(fetch_var_num);
thread_reader_->Start();
int cur_batch;
int batch_cnt = 0;
while ((cur_batch = thread_reader_->Next()) > 0) {
// executor run here
for (auto& op : ops_) {
op->Run(*thread_scope_, place_);
}
++batch_cnt;
thread_scope_->DropKids();
if (debug_ == false || thread_id_ != 0) {
continue;
}
for (int i = 0; i < fetch_var_num; ++i) {
print_fetch_var(thread_scope_, fetch_var_names_[i]);
} // end for (int i = 0...)
} // end while ()
}
void ExecutorThreadWorker::SetThreadId(int tid) { thread_id_ = tid; }
void ExecutorThreadWorker::SetPlace(const platform::Place& place) {
place_ = place;
}
void ExecutorThreadWorker::SetMainProgram(
const ProgramDesc& main_program_desc) {
main_program_.reset(new ProgramDesc(main_program_desc));
}
void ExecutorThreadWorker::SetRootScope(Scope* g_scope) {
root_scope_ = g_scope;
}
} // einit_modelnd namespace framework
} // end namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <set>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
namespace paddle {
namespace framework {
void CreateTensor(Variable* var, proto::VarType::Type var_type);
class ExecutorThreadWorker {
public:
ExecutorThreadWorker()
: thread_id_(-1), root_scope_(NULL), thread_scope_(NULL), debug_(false) {}
~ExecutorThreadWorker() {}
void CreateThreadResource(const framework::ProgramDesc& program,
const paddle::platform::Place& place);
void SetThreadId(int tid);
void SetDebug(const bool debug) { debug_ = debug; }
void SetRootScope(Scope* g_scope);
// set cpu device in this function
// cpu binding is used by default
void SetDevice();
// since we read data into memory that can not be accessed by program
// we need to bind memory of data with corresponding variables in program
// this function should be called after data feed is set
void BindingDataFeedMemory();
// set data feed declared in executor
void SetDataFeed(const std::shared_ptr<DataFeed>& datafeed);
// A multi-thread training function
void TrainFiles();
// set fetch variable names from python interface assigned by users
void SetFetchVarNames(const std::vector<std::string>& fetch_var_names);
private:
void CreateThreadScope(const framework::ProgramDesc& program);
void CreateThreadOperators(const framework::ProgramDesc& program);
void SetMainProgram(const ProgramDesc& main_program_desc);
void SetPlace(const paddle::platform::Place& place);
protected:
// thread index
std::shared_ptr<DataFeed> thread_reader_; // shared queue, thread buffer
int thread_id_;
// operator name
std::vector<std::string> op_names_;
// thread level, local operators for forward and backward
std::vector<OperatorBase*> ops_;
// main program for training
std::unique_ptr<framework::ProgramDesc> main_program_;
// execution place
platform::Place place_;
// root scope for model parameters
Scope* root_scope_;
// a thread scope, father scope is global score which is shared
Scope* thread_scope_;
private:
std::vector<std::string> fetch_var_names_;
std::vector<std::vector<float>> fetch_values_;
bool debug_;
};
} // namespace framework
} // namespace paddle
......@@ -38,7 +38,7 @@ std::unique_ptr<ir::Graph> IsTestPass::ApplyImpl(
for (const Node* n : graph->Nodes()) {
if (n->IsOp()) {
auto* op = n->Op();
if (op->HasAttr("is_test")) {
if (n->RuntimeHasAttr("is_test")) {
op->SetAttr("is_test", true);
} else if (std::find(begin(op_list), end(op_list), op->Type()) !=
end(op_list)) {
......
......@@ -104,9 +104,9 @@ TEST(IsTestPass, basic) {
auto* op = node->Op();
auto op_name = boost::get<std::string>(op->GetAttr("name"));
if (op_name == "conv3") {
ASSERT_FALSE(op->HasAttr("is_test"));
ASSERT_FALSE(node->RuntimeHasAttr("is_test"));
} else {
ASSERT_TRUE(op->HasAttr("is_test"));
ASSERT_TRUE(node->RuntimeHasAttr("is_test"));
EXPECT_TRUE(boost::get<bool>(op->GetAttr("is_test")));
}
}
......
......@@ -22,7 +22,7 @@ std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
std::unique_ptr<ir::Graph> graph) const {
VLOG(3) << "Aplies MKL-DNN placement strategy.";
for (const Node* n : graph->Nodes()) {
if (n->IsOp() && n->Op()->HasAttr("use_mkldnn")) {
if (n->IsOp() && n->RuntimeHasAttr("use_mkldnn")) {
n->Op()->SetAttr("use_mkldnn", true);
}
}
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/ir/node.h"
#include "paddle/fluid/framework/op_info.h"
namespace paddle {
namespace framework {
......@@ -24,10 +25,33 @@ constexpr char Node::kControlDepVarName[];
const char Node::kControlDepVarName[] = "__control_var";
#endif
std::unique_ptr<Node> CreateNodeForTest(const std::string& name,
std::unique_ptr<Node> CreateNodeForTest(const std::string &name,
Node::Type type) {
return std::unique_ptr<Node>(new Node(name, type));
}
bool Node::RuntimeHasAttr(const std::string &name) const {
if (Op()->HasAttr(name)) {
return true;
} else {
auto &op_info = OpInfoMap::Instance();
auto op_type = Op()->Type();
if (op_info.Has(op_type)) {
auto op_info_ptr = op_info.Get(op_type);
if (op_info_ptr.HasOpProtoAndChecker()) {
const proto::OpProto &proto = op_info_ptr.Proto();
for (int i = 0; i != proto.attrs_size(); ++i) {
const proto::OpProto::Attr &attr = proto.attrs(i);
if (attr.name() == name) {
return true;
}
}
}
}
}
return false;
}
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -108,6 +108,18 @@ class Node {
Name().find(ir::Node::kControlDepVarName) != std::string::npos;
}
// RuntimeHasAttr is different with HasAttr now.
// 1. For Op()->HasAttr(), it judges whether a stored program_desc_ has attr,
// thus, if stored program_desc_ are old which don't have an attr, a new
// library which adds the attr already will fail on this function.
// Details:
// https://github.com/PaddlePaddle/Paddle/pull/14608#issuecomment-442309087
// 2. For Op()->RuntimeHasAttr, it judges the attr in runtime to avoid above
// problem.
// TODO(luotao): Maybe we should enhance HasAttr later, instead of adding
// RuntimeHasAttr.
bool RuntimeHasAttr(const std::string& name) const;
std::vector<Node*> inputs;
std::vector<Node*> outputs;
......
......@@ -21,42 +21,11 @@
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/string/pretty_log.h"
namespace paddle {
namespace framework {
// These code can be shared with Executor.
static void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
if (var_type == proto::VarType::LOD_TENSOR) {
var->GetMutable<LoDTensor>();
} else if (var_type == proto::VarType::SELECTED_ROWS) {
var->GetMutable<SelectedRows>();
} else if (var_type == proto::VarType::FEED_MINIBATCH) {
var->GetMutable<FeedFetchList>();
} else if (var_type == proto::VarType::FETCH_LIST) {
var->GetMutable<FeedFetchList>();
} else if (var_type == proto::VarType::STEP_SCOPES) {
var->GetMutable<std::vector<framework::Scope *>>();
} else if (var_type == proto::VarType::LOD_RANK_TABLE) {
var->GetMutable<LoDRankTable>();
} else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
var->GetMutable<LoDTensorArray>();
} else if (var_type == proto::VarType::PLACE_LIST) {
var->GetMutable<platform::PlaceList>();
} else if (var_type == proto::VarType::READER) {
var->GetMutable<ReaderHolder>();
} else if (var_type == proto::VarType::RAW) {
// GetMutable will be called in operator
} else {
PADDLE_THROW(
"Variable type %d is not in "
"[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
"LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]",
var_type);
}
}
void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc,
int block_id, bool with_feed_fetch_ops) {
if (!scope) {
......
......@@ -15,23 +15,105 @@ limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#include <algorithm>
#include <functional>
#include <vector>
#include "paddle/fluid/framework/ngraph_bridge.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
#include "ngraph/ngraph.hpp"
namespace paddle {
namespace framework {
static std::shared_ptr<ngraph::Node> GetNode(
const std::shared_ptr<OperatorBase>& op, const std::string prm,
const VariableNameMap& var_map,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto& var_names = var_map.at(prm);
PADDLE_ENFORCE_EQ(var_names.size(), 1,
"op %s prm %s expects one associated var", op->Type(), prm);
if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) {
return (*ngb_node_map)[var_names[0]];
} else {
return nullptr;
}
}
static std::shared_ptr<ngraph::Node> GetInputNode(
const std::shared_ptr<OperatorBase>& op, const std::string prm,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
return GetNode(op, prm, op->Inputs(), ngb_node_map);
}
static std::shared_ptr<ngraph::Node> GetOutputNode(
const std::shared_ptr<OperatorBase>& op, const std::string prm,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
return GetNode(op, prm, op->Outputs(), ngb_node_map);
}
static void SetOutputNode(
const std::shared_ptr<OperatorBase>& op, const std::string prm,
std::shared_ptr<ngraph::Node> node,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto& var_names = op->Outputs().at(prm);
if (var_names.size() == 1) {
(*ngb_node_map)[var_names[0]] = node;
} else if (var_names.size() == 0) {
(*ngb_node_map)[""] = node;
} else {
PADDLE_THROW("prm %s has more than 1 var_names.", prm);
}
}
static bool HasOutput(const std::shared_ptr<OperatorBase>& op,
const std::string prm) {
auto& outputs = op->Outputs();
if (outputs.find(prm) == outputs.end()) return false;
return outputs.at(prm).size() > 0;
}
template <typename T>
static void BuildBinaryNode(
const std::shared_ptr<OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto x = GetInputNode(op, "X", ngb_node_map);
auto y = GetInputNode(op, "Y", ngb_node_map);
auto out = std::make_shared<T>(x, y);
SetOutputNode(op, "Out", out, ngb_node_map);
}
template <typename T>
static void BuildUnaryNode(
const std::shared_ptr<OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto input = GetInputNode(op, "X", ngb_node_map);
auto out = std::make_shared<T>(input);
SetOutputNode(op, "Out", out, ngb_node_map);
}
std::map<std::string,
std::function<void(const std::shared_ptr<OperatorBase>&,
std::shared_ptr<std::unordered_map<
std::string, std::shared_ptr<ngraph::Node>>>)>>
NgraphBridge::NG_NODE_MAP = {};
NgraphBridge::NG_NODE_MAP = {{"relu", BuildUnaryNode<ngraph::op::Relu>},
{"tanh", BuildUnaryNode<ngraph::op::Tanh>}};
void NgraphBridge::build_graph(const std::shared_ptr<OperatorBase>& op) {
void NgraphBridge::BuildNgNode(const std::shared_ptr<OperatorBase>& op) {
auto& op_type = op->Type();
NG_NODE_MAP[op_type](op, ngb_node_map);
NG_NODE_MAP[op_type](op, ngb_node_map_);
}
} // namespace framework
......
......@@ -20,16 +20,14 @@ limitations under the License. */
#include <map>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/enforce.h"
#include "ngraph/ngraph.hpp"
#include "ngraph/node.hpp"
namespace paddle {
namespace framework {
class OperatorBase;
class NgraphBridge {
public:
static std::map<
......@@ -43,14 +41,14 @@ class NgraphBridge {
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
var_node_map)
: ngb_node_map(var_node_map) {}
: ngb_node_map_(var_node_map) {}
void build_graph(const std::shared_ptr<OperatorBase>& op);
void BuildNgNode(const std::shared_ptr<OperatorBase>& op);
private:
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map;
ngb_node_map_;
};
} // namespace framework
......
......@@ -19,14 +19,29 @@ limitations under the License. */
#include <map>
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/ngraph_bridge.h"
#include "paddle/fluid/framework/ngraph_operator.h"
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/framework/var_type.h"
#include "ngraph/ngraph.hpp"
namespace paddle {
namespace framework {
static ngraph::Shape Ddim2Shape(const DDim& dims) {
ngraph::Shape sp;
for (int i = 0; i < dims.size(); ++i) {
int k = dims[i];
k = k == 0 ? 1 : k;
sp.push_back(k);
}
return sp;
}
static std::map<proto::VarType::Type, ngraph::element::Type> pd2ng_type_map = {
{proto::VarType::FP32, ngraph::element::f32},
{proto::VarType::FP64, ngraph::element::f64},
......@@ -42,6 +57,7 @@ typedef enum { /* nGraph support state on ops */
PARTIAL_TEST /* Support partial list of ops for test */
} op_state;
// perform graph build through bridge and execute computation
class NgraphOperator {
public:
explicit NgraphOperator(const Scope& scope, const platform::Place& place,
......@@ -59,13 +75,23 @@ class NgraphOperator {
persistables_(persist),
fetches_(fetches),
post_op_inputs_(post_op_inputs),
ng_op_state_(ng_op_state) {}
ng_op_state_(ng_op_state) {
var_in_node_map_ = std::make_shared<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
var_node_map_ = std::make_shared<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>();
BuildNgIO();
GetNgFunction();
}
void Run(const Scope& scope, const platform::Place& place) const;
private:
static std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
func_cache;
func_cache_;
const Scope& scope_;
const platform::Place& place_;
std::vector<std::shared_ptr<OperatorBase>> fused_ops_;
......@@ -74,6 +100,35 @@ class NgraphOperator {
std::unordered_set<std::string> fetches_;
std::unordered_set<std::string> post_op_inputs_;
op_state ng_op_state_;
// ngraph backend eg. CPU
static std::shared_ptr<ngraph::runtime::Backend> backend_;
// ngraph function to call and execute
std::shared_ptr<ngraph::Function> ngraph_function_;
// var_name of inputs
std::vector<std::string> var_in_;
// var_name of outputs from fetch in order
std::vector<std::string> var_out_;
// map input vars to nodes
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
var_in_node_map_;
// map each var name with a ngraph node
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
var_node_map_;
// cache key to check if function is cached
std::shared_ptr<std::string> GetCacheKey();
// get ngraph input and define ngraph input parameters
void GetNgInputShape(std::shared_ptr<OperatorBase> op);
// Call ngraph bridge to map ops
void BuildNgNodes();
// get the ngraph input and output var list
void BuildNgIO();
// build ngraph function call
void BuildNgFunction();
// Check cache for ngraph function or otherwise build the function
void GetNgFunction();
};
std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
......@@ -86,7 +141,7 @@ FusedOperator::FusedOpIntervals(
}
size_t size = ops->size();
size_t left = 0;
while (left < size && ops.at(left)->Type() != kFeedOpType) {
while (left < size && ops->at(left)->Type() != kFeedOpType) {
++left;
}
if (left == size) {
......@@ -116,7 +171,7 @@ FusedOperator::FusedOpIntervals(
size_t start = pivot, end = start;
while (pivot < right &&
(paddle::framework::NgraphBridge::NG_NODE_MAP.find(
ops.at(pivot)->Type()) !=
ops->at(pivot)->Type()) !=
paddle::framework::NgraphBridge::NG_NODE_MAP.end())) {
++pivot;
++end;
......@@ -136,7 +191,9 @@ FusedOperator::FusedOperator(
std::vector<std::unique_ptr<OperatorBase>>::iterator end,
const std::string& type, const VariableNameMap& inputs,
const VariableNameMap& outputs, const AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs), pdesc(prog), block(block_id) {
: OperatorBase(type, inputs, outputs, attrs),
pdesc_(prog),
block_(block_id) {
for (std::vector<std::unique_ptr<OperatorBase>>::iterator it = start;
it != end; ++it) {
fused_ops_.push_back(std::move(*it));
......@@ -152,7 +209,7 @@ FusedOperator::FusedOperator(
}
if ((*(start - 1))->Type() == kFeedOpType && (*end)->Type() == kFetchOpType) {
is_complete = true;
is_full_ = true;
}
Process();
......@@ -205,7 +262,7 @@ void FusedOperator::RunImpl(const Scope& scope,
}
}
if (is_full) {
if (is_full_) {
ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN;
}
......@@ -215,6 +272,280 @@ void FusedOperator::RunImpl(const Scope& scope,
ngraph_op.Run(scope, place);
}
std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
NgraphOperator::func_cache_ = {};
std::shared_ptr<ngraph::runtime::Backend> NgraphOperator::backend_ =
ngraph::runtime::Backend::create("CPU");
void NgraphOperator::GetNgInputShape(std::shared_ptr<OperatorBase> op) {
op->RuntimeInferShape(scope_, place_);
for (auto& var_name_item : op->Inputs()) {
for (auto& var_name : var_name_item.second) {
auto* var = scope_.FindVar(var_name);
if (var && var->IsType<LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto sp = Ddim2Shape(tensor_pd->dims());
if (std::find(var_in_.begin(), var_in_.end(), var_name) !=
var_in_.end()) {
if (var_node_map_->find(var_name) == var_node_map_->end()) {
auto ng_type = var_type_map_.at(var_name);
auto prm =
std::make_shared<ngraph::op::Parameter>(ng_type, sp, true);
(*var_node_map_)[var_name] = prm;
(*var_in_node_map_)[var_name] = prm;
}
}
}
}
}
}
void NgraphOperator::BuildNgNodes() {
for (auto& var_name : var_out_) {
if (var_node_map_->find(var_name) == var_node_map_->end()) {
auto* var = scope_.FindVar(var_name);
if (var && var->IsType<LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto& ddim = tensor_pd->dims();
auto ng_shape = Ddim2Shape(ddim);
auto ng_type = var_type_map_.at(var_name);
auto prm =
std::make_shared<ngraph::op::Parameter>(ng_type, ng_shape, true);
(*var_node_map_)[var_name] = prm;
}
}
}
paddle::framework::NgraphBridge ngb(var_node_map_);
for (auto& op : fused_ops_) {
ngb.BuildNgNode(op);
}
}
void NgraphOperator::BuildNgIO() {
std::unordered_set<std::string> inputs;
std::unordered_set<std::string> outputs;
for (auto& op : fused_ops_) {
for (auto& var_name_item : op->Inputs()) {
for (auto& var_name : var_name_item.second) {
inputs.insert(var_name);
const bool is_output = outputs.find(var_name) != outputs.end();
if (!is_output &&
std::find(var_in_.begin(), var_in_.end(), var_name) ==
var_in_.end()) {
// fill var_in here to keep lhs and rhs order
var_in_.push_back(var_name);
}
}
}
if (op->Type() != "fill_constant") {
GetNgInputShape(op);
}
for (auto& var_name_item : op->Outputs()) {
PADDLE_ENFORCE_LE(var_name_item.second.size(), 1,
"op %s has more than 1 output - Not handling yet",
op->Type());
for (auto& var_name : var_name_item.second) {
outputs.insert(var_name);
}
}
}
// var_out.clear();
for (auto& op : fused_ops_) {
for (auto& var_name_item : op->Outputs()) {
PADDLE_ENFORCE_LE(var_name_item.second.size(), 1,
"op %s has more than 1 output - Not handling yet",
op->Type());
for (auto& var_name : var_name_item.second) {
switch (ng_op_state_) {
case PARTIAL_TEST:
if (post_op_inputs_.find(var_name) != post_op_inputs_.end() ||
fetches_.find(var_name) != fetches_.end()) {
var_out_.push_back(var_name);
}
break;
case FULL_TEST:
if (fetches_.find(var_name) != fetches_.end()) {
var_out_.push_back(var_name);
}
break;
case PARTIAL_TRAIN:
if (fetches_.find(var_name) != fetches_.end() ||
post_op_inputs_.find(var_name) != post_op_inputs_.end() ||
persistables_.find(var_name) != persistables_.end()) {
var_out_.push_back(var_name);
}
break;
case FULL_TRAIN:
if (fetches_.find(var_name) != fetches_.end() ||
persistables_.find(var_name) != persistables_.end()) {
var_out_.push_back(var_name);
}
break;
default:
var_out_.push_back(var_name);
}
}
}
}
}
void NgraphOperator::BuildNgFunction() {
BuildNgNodes();
ngraph_function_ = nullptr;
ngraph::NodeVector func_outputs;
ngraph::op::ParameterVector func_inputs;
for (auto& vo : var_out_) {
func_outputs.push_back(var_node_map_->at(vo));
}
for (auto& vi : var_in_) {
std::shared_ptr<ngraph::op::Parameter> prm =
std::dynamic_pointer_cast<ngraph::op::Parameter>(
var_in_node_map_->at(vi));
func_inputs.push_back(prm);
}
ngraph_function_ =
std::make_shared<ngraph::Function>(func_outputs, func_inputs);
}
std::shared_ptr<std::string> NgraphOperator::GetCacheKey() {
auto cache_key = std::make_shared<std::string>("");
*cache_key += std::to_string(fused_ops_.size());
for (auto& op : fused_ops_) {
*cache_key += op->Type();
}
for (auto& var_name : var_in_) {
auto shape = var_node_map_->at(var_name)->get_shape();
*cache_key += var_name;
*cache_key += var_type_map_.at(var_name).c_type_string();
for (size_t i = 0; i < shape.size(); ++i) {
*cache_key += std::to_string(shape.at(i));
}
}
for (auto& var_name : var_out_) {
auto* var = scope_.FindVar(var_name);
if (var && var->IsType<LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto& ddim = tensor_pd->dims();
for (int i = 0; i < ddim.size(); ++i) {
*cache_key += std::to_string(ddim[i]);
}
}
}
return cache_key;
}
void NgraphOperator::GetNgFunction() {
bool cache_on = true;
if (cache_on) {
std::string cache_key_val = *GetCacheKey();
if (func_cache_.find(cache_key_val) != func_cache_.end()) {
ngraph_function_ = func_cache_.at(cache_key_val);
} else {
BuildNgFunction();
func_cache_[cache_key_val] = ngraph_function_;
}
} else {
BuildNgFunction();
}
}
void NgraphOperator::Run(const Scope& scope,
const platform::Place& place) const {
std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_in;
std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_out;
for (size_t i = 0; i < var_in_.size(); ++i) {
auto vi = var_in_.at(i);
auto sp = var_node_map_->at(vi)->get_shape();
std::shared_ptr<ngraph::runtime::Tensor> ti;
auto* var = scope.FindVar(vi);
if (var && var->IsType<LoDTensor>()) {
auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var);
PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()),
"Ensure ngraph tensor layout align with paddle tensor");
if (tensor_pd->type().hash_code() ==
typeid(float).hash_code()) { // NOLINT
const float* arr = tensor_pd->data<float>();
ti = backend_->create_tensor(ngraph::element::f32, sp,
const_cast<float*>(arr));
} else if (tensor_pd->type().hash_code() ==
typeid(int).hash_code()) { // NOLINT
const int* arr = tensor_pd->data<int>();
ti = backend_->create_tensor(ngraph::element::i32, sp,
const_cast<int*>(arr));
} else if (tensor_pd->type().hash_code() == typeid(int64_t).hash_code()) {
const int64_t* arr = tensor_pd->data<int64_t>();
ti = backend_->create_tensor(ngraph::element::i64, sp,
const_cast<int64_t*>(arr));
} else if (tensor_pd->type().hash_code() ==
typeid(double).hash_code()) { // NOLINT
const double* arr = tensor_pd->data<double>();
ti = backend_->create_tensor(ngraph::element::f64, sp,
const_cast<double*>(arr));
} else if (tensor_pd->type().hash_code() ==
typeid(bool).hash_code()) { // NOLINT
const bool* arr = tensor_pd->data<bool>();
ti = backend_->create_tensor(ngraph::element::boolean, sp,
const_cast<bool*>(arr));
} else {
PADDLE_THROW("Data type not handling for var %s", vi);
}
} else {
PADDLE_THROW("Cannot find var or tensor with var name %s", vi);
}
bool is_test = (ng_op_state_ == PARTIAL_TEST || ng_op_state_ == FULL_TEST)
? true
: false;
bool is_persistable =
(persistables_.find(vi) != persistables_.end()) ? true : false;
if (is_test && is_persistable) {
ti->set_stale(false);
}
t_in.push_back(ti);
}
for (size_t i = 0; i < var_out_.size(); ++i) {
auto var_name = var_out_[i];
auto* var = scope.FindVar(var_name);
std::shared_ptr<ngraph::runtime::Tensor> to;
if (var && var->IsType<LoDTensor>()) {
auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var);
auto dd = tensor_pd->dims();
ngraph::Shape sp = Ddim2Shape(dd);
auto ng_type = var_type_map_.at(var_name);
if (ng_type == ngraph::element::f32) {
auto pd_arr = tensor_pd->mutable_data<float>(place);
to = backend_->create_tensor(ngraph::element::f32, sp, pd_arr);
} else if (ng_type == ngraph::element::i64) {
auto pd_arr = tensor_pd->mutable_data<int64_t>(place);
to = backend_->create_tensor(ngraph::element::i64, sp, pd_arr);
} else if (ng_type == ngraph::element::f64) {
auto pd_arr = tensor_pd->mutable_data<double>(place);
to = backend_->create_tensor(ngraph::element::f64, sp, pd_arr);
} else if (ng_type == ngraph::element::boolean) {
auto pd_arr = tensor_pd->mutable_data<bool>(place);
to = backend_->create_tensor(ngraph::element::boolean, sp, pd_arr);
} else {
PADDLE_THROW("Data type not handled in for var %s", var_name);
}
t_out.push_back(to);
} else {
PADDLE_THROW("Cannot find var or tensor with var name %s", var_name);
}
}
backend_->call(ngraph_function_, t_out, t_in);
} // NgraphOperator::RunImpl
} // namespace framework
} // namespace paddle
#endif
......@@ -17,24 +17,19 @@ limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH
#include <algorithm>
#include <atomic>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/ngraph_bridge.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/variant.h"
#include "ngraph/ngraph.hpp"
#include "ngraph/type/element_type.hpp"
namespace paddle {
namespace framework {
......
......@@ -31,6 +31,12 @@ class InferShapeBase {
virtual void operator()(InferShapeContext*) const = 0;
};
class EstimateFlopsBase {
public:
virtual ~EstimateFlopsBase() = default;
virtual size_t operator()(InferShapeContext*) const = 0;
};
struct OpInfo {
OpCreator creator_;
GradOpMakerFN grad_op_maker_;
......@@ -38,6 +44,7 @@ struct OpInfo {
OpAttrChecker* checker_{nullptr};
InferVarTypeFN infer_var_type_;
InferShapeFN infer_shape_;
EstimateFlopsFN estimate_flops_;
bool HasOpProtoAndChecker() const {
return proto_ != nullptr && checker_ != nullptr;
......
......@@ -695,6 +695,12 @@ static void CheckTensorNANOrInf(const std::string& name,
"Tensor %s contains NAN", name);
}
void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
const platform::Place& place) const {
RuntimeInferShapeContext infer_shape_ctx(*this, scope);
this->InferShape(&infer_shape_ctx);
}
void OperatorWithKernel::RunImpl(const Scope& scope,
const platform::Place& place) const {
RuntimeInferShapeContext infer_shape_ctx(*this, scope);
......
......@@ -128,6 +128,8 @@ class OperatorBase {
virtual std::vector<std::string> OutputVars(bool has_intermediate) const;
void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; }
virtual void RuntimeInferShape(const Scope& scope,
const platform::Place& place) const {}
protected:
std::string type_;
......@@ -348,6 +350,9 @@ class OperatorWithKernel : public OperatorBase {
OpInfoMap::Instance().Get(Type()).infer_shape_(ctx);
}
void RuntimeInferShape(const Scope& scope,
const platform::Place& place) const override;
protected:
virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const;
virtual OpKernelType GetKernelTypeForVar(
......
......@@ -54,5 +54,7 @@ using InferVarTypeFN =
using InferShapeFN = std::function<void(InferShapeContext*)>;
using EstimateFlopsFN = std::function<void(InferShapeContext*)>;
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/variable_helper.h"
#include <vector>
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/lod_rank_table.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace framework {
void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
if (var_type == proto::VarType::LOD_TENSOR) {
var->GetMutable<LoDTensor>();
} else if (var_type == proto::VarType::SELECTED_ROWS) {
var->GetMutable<SelectedRows>();
} else if (var_type == proto::VarType::FEED_MINIBATCH) {
var->GetMutable<FeedFetchList>();
} else if (var_type == proto::VarType::FETCH_LIST) {
var->GetMutable<FeedFetchList>();
} else if (var_type == proto::VarType::STEP_SCOPES) {
var->GetMutable<std::vector<framework::Scope*>>();
} else if (var_type == proto::VarType::LOD_RANK_TABLE) {
var->GetMutable<LoDRankTable>();
} else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
var->GetMutable<LoDTensorArray>();
} else if (var_type == proto::VarType::PLACE_LIST) {
var->GetMutable<platform::PlaceList>();
} else if (var_type == proto::VarType::READER) {
var->GetMutable<ReaderHolder>();
} else if (var_type == proto::VarType::RAW) {
// GetMutable will be called in operator
} else {
PADDLE_THROW(
"Variable type %d is not in "
"[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
"LOD_RANK_TABLE, PLACE_LIST, READER, RAW]",
var_type);
}
}
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/variable.h"
namespace paddle {
namespace framework {
void InitializeVariable(Variable *var, proto::VarType::Type var_type);
}
}
......@@ -46,8 +46,6 @@ class AnalysisPass {
protected:
// User should implement these.
virtual void RunImpl(Argument* argument) = 0;
Argument* argument_{nullptr};
};
} // namespace analysis
......
cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager)
cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager)
cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass)
cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager)
cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass ir_params_sync_among_devices_pass)
set(analysis_deps ${analysis_deps}
ir_graph_build_pass
......
......@@ -61,6 +61,7 @@ void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) {
void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) {
std::vector<std::string> passes({
"ir_graph_build_pass", "ir_analysis_pass",
"ir_params_sync_among_devices_pass",
});
for (const auto &pass : passes) {
VLOG(2) << "Run pass " << pass;
......
......@@ -36,12 +36,7 @@ void IrGraphBuildPass::RunImpl(Argument *argument) {
// so that the parameters will on the same device, or they will keep copying
// between difference devices.
platform::Place place;
if (argument->use_gpu()) {
PADDLE_ENFORCE(argument->gpu_device_id_valid());
place = platform::CUDAPlace(argument->gpu_device_id());
} else {
place = platform::CPUPlace();
}
if (argument->model_dir_valid()) {
auto program =
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace inference {
namespace analysis {
void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
PADDLE_ENFORCE(argument->scope_valid());
PADDLE_ENFORCE(argument->use_gpu_valid());
platform::Place place;
// The parameters are on the cpu, therefore, synchronization is not necessary.
if (!argument->use_gpu()) return;
LOG(INFO) << "Sync params from CPU to GPU";
PADDLE_ENFORCE(argument->gpu_device_id_valid());
place = platform::CUDAPlace(argument->gpu_device_id());
auto *scope = argument->scope_ptr();
std::vector<std::string> all_vars = scope->LocalVarNames();
// We get all the vars from local_scope instead of the ProgramDesc.
// Because there exists the case that new parameter variables are not added to
// the program in the analysis pass.
for (auto &var_name : all_vars) {
auto *var = scope->FindLocalVar(var_name);
PADDLE_ENFORCE(var != nullptr);
if (var->IsType<framework::LoDTensor>() ||
var->IsType<framework::Tensor>()) {
auto *t = var->GetMutable<framework::LoDTensor>();
platform::CPUPlace cpu_place;
framework::LoDTensor temp_tensor;
temp_tensor.Resize(t->dims());
temp_tensor.mutable_data<float>(cpu_place);
// Copy the parameter data to a tmp tensor.
TensorCopySync(*t, cpu_place, &temp_tensor);
// Reallocation the space on GPU
t->mutable_data<float>(place);
// Copy parameter data to newly allocated GPU space.
TensorCopySync(temp_tensor, place, t);
}
}
}
std::string IrParamsSyncAmongDevicesPass::repr() const {
return "ir-params-sync-among-devices-pass";
}
} // namespace analysis
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace inference {
namespace analysis {
/*
* Sync parameter from CPU to GPU.
*/
class IrParamsSyncAmongDevicesPass : public AnalysisPass {
public:
void RunImpl(Argument *argument) override;
std::string repr() const override;
};
} // namespace analysis
} // namespace inference
} // namespace paddle
......@@ -16,6 +16,7 @@
#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc"
#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
namespace paddle {
namespace inference {
......@@ -27,6 +28,9 @@ PassRegistry::PassRegistry() {
std::unique_ptr<AnalysisPass>(new IrGraphBuildPass));
passes_.emplace("ir_analysis_compose_pass",
std::unique_ptr<AnalysisPass>(new IrAnalysisComposePass));
passes_.emplace(
"ir_params_sync_among_devices_pass",
std::unique_ptr<AnalysisPass>(new IrParamsSyncAmongDevicesPass));
}
} // namespace analysis
......
......@@ -190,9 +190,13 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
}
VLOG(3) << "predict cost: " << timer.toc() << "ms";
// Fix TensorArray reuse not cleaned bug.
tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
tensor_array_batch_cleaner_.ResetTensorArray();
// All the containers in the scope will be hold in inference, but the
// operators assume that the container will be reset after each batch.
// Here is a bugfix, collect all the container variables, and reset then to a
// bool; the next time, the operator will call MutableData and construct a new
// container again, so that the container will be empty for each batch.
tensor_array_batch_cleaner_.CollectNoTensorVars(sub_scope_);
tensor_array_batch_cleaner_.ResetNoTensorVars();
return true;
}
......@@ -417,7 +421,7 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
bool AnalysisPredictor::ZeroCopyRun() {
executor_->Run();
// Fix TensorArray reuse not cleaned bug.
tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_);
tensor_array_batch_cleaner_.ResetTensorArray();
return true;
}
......
......@@ -154,9 +154,9 @@ bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
}
VLOG(3) << "predict cost: " << timer.toc() << "ms";
// Fix TensorArray reuse not cleaned bug.
tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get());
tensor_array_batch_cleaner_.ResetTensorArray();
// For some other vector like containers not cleaned after each batch.
tensor_array_batch_cleaner_.CollectNoTensorVars(scope_.get());
tensor_array_batch_cleaner_.ResetNoTensorVars();
return true;
}
......
......@@ -79,6 +79,16 @@ link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib")
link_directories("${PADDLE_LIB}/paddle/lib")
if (NOT WIN32)
set(NGRAPH_PATH "${PADDLE_LIB}/third_party/install/ngraph")
if(EXISTS ${NGRAPH_PATH})
include(GNUInstallDirs)
include_directories("${NGRAPH_PATH}/include")
link_directories("${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}")
set(NGRAPH_LIB ${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}/libngraph${CMAKE_SHARED_LIBRARY_SUFFIX})
endif()
endif()
add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
if(WITH_MKL)
......@@ -106,7 +116,7 @@ endif()
if (NOT WIN32)
set(EXTERNAL_LIB "-lrt -ldl -lpthread")
set(DEPS ${DEPS}
${MATH_LIB} ${MKLDNN_LIB}
${MATH_LIB} ${MKLDNN_LIB} ${NGRAPH_LIB}
glog gflags protobuf snappystream snappy z xxhash
${EXTERNAL_LIB})
else()
......
......@@ -46,5 +46,28 @@ void TensorArrayBatchCleaner::ResetTensorArray() {
}
}
void TensorArrayBatchCleaner::CollectNoTensorVars(framework::Scope *scope) {
if (no_tensor_flag_) {
for (auto &var_name : scope->LocalVarNames()) {
auto *var = scope->FindVar(var_name);
if (!var->IsInitialized()) continue;
if (!valid_types_.count(var->Type())) {
no_tensor_vars_.insert(var);
}
}
for (auto *kid : scope->kids()) {
CollectTensorArrays(kid);
}
no_tensor_flag_ = false; // Only collect one time.
}
}
void TensorArrayBatchCleaner::ResetNoTensorVars() {
for (auto *var : no_tensor_vars_) {
var->Clear();
}
}
} // namespace details
} // namespace paddle
......@@ -14,9 +14,11 @@
#pragma once
#include <unordered_set>
#include <vector>
#include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable.h"
namespace paddle {
namespace details {
......@@ -24,13 +26,28 @@ namespace details {
// Clean the TensorArray each batch to make the behavior the same with the
// training phase.
struct TensorArrayBatchCleaner {
TensorArrayBatchCleaner() {
valid_types_.insert(typeid(framework::Tensor));
valid_types_.insert(typeid(framework::LoDTensor));
}
// Collect the variables that are not Tensor or LoDTensor, and reset them to a
// bool(trick), because some of them are containers, and some operators just
// keep inserting new items without clearing the containers first; So the
// memory grow larger and larger in inference service deployed online.
void CollectNoTensorVars(framework::Scope *scope);
void ResetNoTensorVars();
// Fix the tensor array not clear in the inference scenarios.
void CollectTensorArrays(framework::Scope *scope);
void ResetTensorArray();
private:
bool flag_{true};
bool no_tensor_flag_{true};
std::vector<framework::LoDTensorArray *> arrays_;
std::unordered_set<std::type_index> valid_types_;
std::unordered_set<framework::Variable *> no_tensor_vars_;
};
} // namespace details
......
......@@ -116,12 +116,8 @@ class CpuPassStrategy : public PassStrategy {
class GpuPassStrategy : public PassStrategy {
public:
GpuPassStrategy() : PassStrategy({}) {
// TODO(NHZlX) Problem with Data synchronization between GPU and CPU
// When running in GPU mode, the parameters are all on GPU. But the
// opearations of "conv_bn_fuse_pass" are on CPU.
passes_.assign({
"infer_clean_graph_pass",
// "infer_clean_graph_pass", "conv_bn_fuse_pass",
"infer_clean_graph_pass", "conv_bn_fuse_pass",
});
}
......
......@@ -46,11 +46,18 @@ set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2")
download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2_tester.cc)
# DAM
# normal DAM
set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam")
download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc)
# small DAM
set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam")
download_model_and_data(${DAM_SMALL_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz")
inference_analysis_test(test_analyzer_small_dam SRCS analyzer_dam_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt --max_turn_num=1)
# chinese_ner
set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner")
download_model_and_data(${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz" "chinese_ner-data.txt.tar.gz")
......
......@@ -14,38 +14,54 @@
#include "paddle/fluid/inference/tests/api/tester_helper.h"
DEFINE_int32(max_turn_num, 9,
"The max turn number: 1 for the small and 9 for the normal.");
namespace paddle {
namespace inference {
using contrib::AnalysisConfig;
#define MAX_TURN_NUM 9
#define MAX_TURN_LEN 50
constexpr int32_t kMaxTurnLen = 50;
static std::vector<float> result_data;
struct DataRecord {
std::vector<std::vector<int64_t>>
turns[MAX_TURN_NUM]; // turns data : MAX_TURN_NUM
std::vector<std::vector<float>>
turns_mask[MAX_TURN_NUM]; // turns mask data : MAX_TURN_NUM
std::vector<std::vector<int64_t>> *turns;
std::vector<std::vector<float>> *turns_mask;
std::vector<std::vector<int64_t>> response; // response data : 1
std::vector<std::vector<float>> response_mask; // response mask data : 1
size_t batch_iter{0};
size_t batch_size{1};
size_t num_samples; // total number of samples
DataRecord() = default;
DataRecord() {
turns = new std::vector<std::vector<
int64_t>>[FLAGS_max_turn_num]; // turns data : FLAGS_max_turn_num
turns_mask = new std::vector<std::vector<
float>>[FLAGS_max_turn_num]; // turns mask data : FLAGS_max_turn_num
}
explicit DataRecord(const std::string &path, int batch_size = 1)
: batch_size(batch_size) {
: DataRecord() {
this->batch_size = batch_size;
Load(path);
}
~DataRecord() {
delete[] turns;
delete[] turns_mask;
}
DataRecord NextBatch() {
DataRecord data;
size_t batch_end = batch_iter + batch_size;
// NOTE skip the final batch, if no enough data is provided.
if (batch_end <= response.size()) {
for (int i = 0; i < MAX_TURN_NUM; ++i) {
for (int i = 0; i < FLAGS_max_turn_num; ++i) {
data.turns[i].assign(turns[i].begin() + batch_iter,
turns[i].begin() + batch_end);
}
for (int i = 0; i < MAX_TURN_NUM; ++i) {
for (int i = 0; i < FLAGS_max_turn_num; ++i) {
data.turns_mask[i].assign(turns_mask[i].begin() + batch_iter,
turns_mask[i].begin() + batch_end);
}
......@@ -60,6 +76,7 @@ struct DataRecord {
batch_iter += batch_size;
return data;
}
void Load(const std::string &path) {
std::ifstream file(path);
std::string line;
......@@ -69,30 +86,30 @@ struct DataRecord {
num_lines++;
std::vector<std::string> data;
split(line, ',', &data);
CHECK_EQ(data.size(), (size_t)(2 * MAX_TURN_NUM + 3));
CHECK_EQ(data.size(), (size_t)(2 * FLAGS_max_turn_num + 3));
// load turn data
std::vector<int64_t> turns_tmp[MAX_TURN_NUM];
for (int i = 0; i < MAX_TURN_NUM; ++i) {
std::vector<int64_t> turns_tmp[FLAGS_max_turn_num];
for (int i = 0; i < FLAGS_max_turn_num; ++i) {
split_to_int64(data[i], ' ', &turns_tmp[i]);
turns[i].push_back(std::move(turns_tmp[i]));
}
// load turn_mask data
std::vector<float> turns_mask_tmp[MAX_TURN_NUM];
for (int i = 0; i < MAX_TURN_NUM; ++i) {
split_to_float(data[MAX_TURN_NUM + i], ' ', &turns_mask_tmp[i]);
std::vector<float> turns_mask_tmp[FLAGS_max_turn_num];
for (int i = 0; i < FLAGS_max_turn_num; ++i) {
split_to_float(data[FLAGS_max_turn_num + i], ' ', &turns_mask_tmp[i]);
turns_mask[i].push_back(std::move(turns_mask_tmp[i]));
}
// load response data
std::vector<int64_t> response_tmp;
split_to_int64(data[2 * MAX_TURN_NUM], ' ', &response_tmp);
split_to_int64(data[2 * FLAGS_max_turn_num], ' ', &response_tmp);
response.push_back(std::move(response_tmp));
// load response_mask data
std::vector<float> response_mask_tmp;
split_to_float(data[2 * MAX_TURN_NUM + 1], ' ', &response_mask_tmp);
split_to_float(data[2 * FLAGS_max_turn_num + 1], ' ', &response_mask_tmp);
response_mask.push_back(std::move(response_mask_tmp));
// load result data
float result_tmp;
result_tmp = std::stof(data[2 * MAX_TURN_NUM + 2]);
result_tmp = std::stof(data[2 * FLAGS_max_turn_num + 2]);
result_data.push_back(result_tmp);
}
num_samples = num_lines;
......@@ -101,8 +118,8 @@ struct DataRecord {
void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
int batch_size) {
PaddleTensor turns_tensor[MAX_TURN_NUM];
PaddleTensor turns_mask_tensor[MAX_TURN_NUM];
PaddleTensor turns_tensor[FLAGS_max_turn_num];
PaddleTensor turns_mask_tensor[FLAGS_max_turn_num];
PaddleTensor response_tensor;
PaddleTensor response_mask_tensor;
std::string turn_pre = "turn_";
......@@ -110,16 +127,16 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
auto one_batch = data->NextBatch();
int size = one_batch.response[0].size();
CHECK_EQ(size, MAX_TURN_LEN);
CHECK_EQ(size, kMaxTurnLen);
// turn tensor assignment
for (int i = 0; i < MAX_TURN_NUM; ++i) {
for (int i = 0; i < FLAGS_max_turn_num; ++i) {
turns_tensor[i].name = turn_pre + std::to_string(i);
turns_tensor[i].shape.assign({batch_size, size, 1});
turns_tensor[i].dtype = PaddleDType::INT64;
TensorAssignData<int64_t>(&turns_tensor[i], one_batch.turns[i]);
}
// turn mask tensor assignment
for (int i = 0; i < MAX_TURN_NUM; ++i) {
for (int i = 0; i < FLAGS_max_turn_num; ++i) {
turns_mask_tensor[i].name = turn_mask_pre + std::to_string(i);
turns_mask_tensor[i].shape.assign({batch_size, size, 1});
turns_mask_tensor[i].dtype = PaddleDType::FLOAT32;
......@@ -137,10 +154,10 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
TensorAssignData<float>(&response_mask_tensor, one_batch.response_mask);
// Set inputs.
for (int i = 0; i < MAX_TURN_NUM; ++i) {
for (int i = 0; i < FLAGS_max_turn_num; ++i) {
input_slots->push_back(std::move(turns_tensor[i]));
}
for (int i = 0; i < MAX_TURN_NUM; ++i) {
for (int i = 0; i < FLAGS_max_turn_num; ++i) {
input_slots->push_back(std::move(turns_mask_tensor[i]));
}
input_slots->push_back(std::move(response_tensor));
......@@ -202,8 +219,6 @@ TEST(Analyzer_dam, fuse_statis) {
auto fuse_statis = GetFuseStatis(
static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
ASSERT_TRUE(fuse_statis.count("fc_fuse"));
EXPECT_EQ(fuse_statis.at("fc_fuse"), 317);
EXPECT_EQ(num_ops, 2020);
}
// Compare result of NativeConfig and AnalysisConfig
......
......@@ -33,7 +33,7 @@ std::string Benchmark::SerializeToString() const {
ss << batch_size_ << "\t";
ss << num_threads_ << "\t";
ss << latency_ << "\t";
ss << 1000 / latency_;
ss << 1000.0 / latency_;
ss << '\n';
return ss.str();
}
......
......@@ -11,9 +11,11 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <fstream>
#include <iostream>
#include <string>
namespace paddle {
namespace inference {
......@@ -31,8 +33,8 @@ struct Benchmark {
bool use_gpu() const { return use_gpu_; }
void SetUseGpu() { use_gpu_ = true; }
int latency() const { return latency_; }
void SetLatency(int x) { latency_ = x; }
float latency() const { return latency_; }
void SetLatency(float x) { latency_ = x; }
const std::string& name() const { return name_; }
void SetName(const std::string& name) { name_ = name; }
......@@ -43,7 +45,7 @@ struct Benchmark {
private:
bool use_gpu_{false};
int batch_size_{0};
int latency_;
float latency_;
int num_threads_{1};
std::string name_;
};
......
......@@ -37,7 +37,13 @@ if (WITH_GPU)
SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub)
endif()
register_operators(EXCLUDES warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS})
SET(OP_PREFETCH_DEPS "")
if (WITH_DISTRIBUTE)
SET(OP_PREFETCH_DEPS ${OP_PREFETCH_DEPS} parameter_prefetch)
endif()
register_operators(EXCLUDES warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS})
# warpctc_op needs cudnn 7 above
if (WITH_GPU AND NOT WIN32)
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#include "mkldnn.hpp"
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
namespace paddle {
namespace operators {
......@@ -146,7 +146,9 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const float epsilon = ctx.Attr<float>("epsilon");
const float momentum = ctx.Attr<float>("momentum");
const bool is_test = ctx.Attr<bool>("is_test");
const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const bool fuse_with_relu = ctx.Attr<bool>("fuse_with_relu");
bool global_stats = is_test || use_global_stats;
const auto *x = ctx.Input<Tensor>("X");
const auto *mean = ctx.Input<Tensor>("Mean");
......@@ -177,12 +179,13 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
T *batch_mean_data = nullptr;
T *batch_variance_data = nullptr;
if (!is_test) {
if (!global_stats) {
batch_mean_data = batch_mean->mutable_data<T>(ctx.GetPlace());
batch_variance_data = batch_variance->mutable_data<T>(ctx.GetPlace());
}
auto propagation = is_test == true ? mkldnn::prop_kind::forward_scoring
auto propagation = global_stats == true
? mkldnn::prop_kind::forward_scoring
: mkldnn::prop_kind::forward_training;
auto src_tz = paddle::framework::vectorize2int(x->dims());
......@@ -199,7 +202,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
shift->data<T>() + ic, &scaleshift_data);
unsigned flags = mkldnn::use_scale_shift;
if (is_test) flags |= mkldnn::use_global_stats;
if (global_stats) flags |= mkldnn::use_global_stats;
if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu;
// create mkldnn memory from input x tensor
......@@ -208,7 +211,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
// keys for backward pass
const std::string key = BatchNormMKLDNNHandler::GetHash(
src_tz, epsilon, flags, is_test, input_format,
src_tz, epsilon, flags, global_stats, input_format,
ctx.op().Output("SavedMean"));
const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
......@@ -239,7 +242,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
batch_norm_fwd_pd->dst_primitive_desc().desc(), y_data);
std::shared_ptr<batch_norm_fwd> batch_norm_p;
if (is_test) {
if (global_stats) {
// create mkldnn memory for stats (as input)
std::shared_ptr<memory> mean_memory =
handler.AcquireMeanMemoryFromPrimitive(to_void_cast(mean_data));
......@@ -269,7 +272,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
pipeline.push_back(*batch_norm_p);
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
if (!is_test) {
if (!global_stats) {
// mkldnn only compute stats for current batch
// so we need compute momentum stats via Eigen lib
EigenVectorArrayMap<T> batch_mean_e(batch_mean_data, ic);
......
......@@ -159,6 +159,14 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<bool>("fuse_with_relu",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<bool>("use_global_stats",
"(bool, default false) Whether to use global mean and "
"variance. In inference or test mode, set use_global_stats "
"to true or is_test true. the behavior is equivalent. "
"In train mode, when setting use_global_stats True, the "
"global mean and variance are also used during train time, "
"the BN acts as scaling and shiffting.")
.SetDefault(false);
AddComment(R"DOC(
Batch Normalization.
......@@ -190,6 +198,10 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
const float epsilon = ctx.Attr<float>("epsilon");
const float momentum = ctx.Attr<float>("momentum");
const bool is_test = ctx.Attr<bool>("is_test");
const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
bool global_stats = is_test || use_global_stats;
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
......@@ -217,7 +229,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
saved_mean->mutable_data<T>(ctx.GetPlace());
saved_variance->mutable_data<T>(ctx.GetPlace());
if (!is_test) {
if (!global_stats) {
// saved_xx is use just in this batch of data
EigenVectorArrayMap<T> saved_mean_e(
saved_mean->mutable_data<T>(ctx.GetPlace()), C);
......@@ -234,7 +246,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
if ((N * sample_size) == 1) {
LOG(WARNING) << "Only 1 element in normalization dimension, "
<< "we skip the batch norm calculation, let y = x.";
framework::TensorCopySync(*x, ctx.GetPlace(), y);
framework::TensorCopy(*x, ctx.GetPlace(), y);
return;
}
......@@ -277,7 +289,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
// use SavedMean and SavedVariance to do normalize
Eigen::Array<T, Eigen::Dynamic, 1> inv_std(C);
if (is_test) {
if (global_stats) {
ConstEigenVectorArrayMap<T> var_arr(
ctx.Input<Tensor>("Variance")->data<T>(), C);
inv_std = (var_arr + epsilon).sqrt().inverse();
......@@ -289,7 +301,7 @@ class BatchNormKernel<platform::CPUDeviceContext, T>
inv_std = saved_inv_std;
}
ConstEigenVectorArrayMap<T> mean_arr(
is_test ? ctx.Input<Tensor>("Mean")->data<T>()
global_stats ? ctx.Input<Tensor>("Mean")->data<T>()
: ctx.Output<Tensor>("SavedMean")->data<T>(),
C);
......@@ -336,15 +348,27 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
void InferShape(framework::InferShapeContext *ctx) const override {
// check input
PADDLE_ENFORCE(ctx->HasInput("X"));
PADDLE_ENFORCE(ctx->HasInput("Scale"), "");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), "");
PADDLE_ENFORCE(ctx->HasInput("SavedMean"), "");
PADDLE_ENFORCE(ctx->HasInput("SavedVariance"), "");
PADDLE_ENFORCE(ctx->HasInput("Scale"), "Input(scale) should not be null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
"Input(Y@GRAD) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("SavedMean"),
"Input(SavedMean) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("SavedVariance"),
"Input(SavedVariance) should not be null");
// check output
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Scale")), "");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")), "");
if (ctx->HasOutput(framework::GradVarName("Scale"))) {
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")),
"Output(Scale@GRAD) and Output(Bias@GRAD) should not be "
"null at same time");
}
const bool use_global_stats = ctx->Attrs().Get<bool>("use_global_stats");
if (use_global_stats) {
PADDLE_ENFORCE(!ctx->Attrs().Get<bool>("use_mkldnn"),
"Using global stats during training is not supported "
"in gradient op kernel of batch_norm_mkldnn_op now.");
}
const auto x_dims = ctx->GetInputDim("X");
const DataLayout data_layout = framework::StringToDataLayout(
......@@ -354,9 +378,11 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
: x_dims[x_dims.size() - 1]);
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
if (ctx->HasOutput(framework::GradVarName("Scale"))) {
ctx->SetOutputDim(framework::GradVarName("Scale"), {C});
ctx->SetOutputDim(framework::GradVarName("Bias"), {C});
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
......@@ -405,6 +431,8 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
// SavedVariance have been reverted in forward operator
const auto *saved_inv_variance = ctx.Input<Tensor>("SavedVariance");
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const float epsilon = ctx.Attr<float>("epsilon");
const DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
......@@ -419,38 +447,60 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
: x_dims[x_dims.size() - 1]);
const int sample_size = x->numel() / N / C;
ConstEigenVectorArrayMap<T> scale_arr(scale->data<T>(), C);
ConstEigenVectorArrayMap<T> mean_arr(saved_mean->data<T>(), C);
ConstEigenVectorArrayMap<T> inv_var_arr(saved_inv_variance->data<T>(), C);
// init output
auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
d_x->mutable_data<T>(ctx.GetPlace());
const T *mean_data = saved_mean->data<T>();
const T *inv_var_data = saved_inv_variance->data<T>();
Tensor inv_var_tensor;
if (use_global_stats) {
const auto *running_mean = ctx.Input<Tensor>("Mean");
const auto *running_variance = ctx.Input<Tensor>("Variance");
mean_data = running_mean->data<T>();
T *running_inv_var_data = inv_var_tensor.mutable_data<T>(ctx.GetPlace());
EigenVectorArrayMap<T> inv_var_tmp(running_inv_var_data, C);
ConstEigenVectorArrayMap<T> var_arr(running_variance->data<T>(), C);
inv_var_tmp = (var_arr + epsilon).sqrt().inverse().eval();
inv_var_data = running_inv_var_data;
}
ConstEigenVectorArrayMap<T> scale_arr(scale->data<T>(), C);
ConstEigenVectorArrayMap<T> mean_arr(mean_data, C);
ConstEigenVectorArrayMap<T> inv_var_arr(inv_var_data, C);
T *d_bias_data = nullptr;
T *d_scale_data = nullptr;
if (d_scale && d_bias) {
d_scale->mutable_data<T>(ctx.GetPlace());
d_bias->mutable_data<T>(ctx.GetPlace());
d_bias_data = d_bias->mutable_data<T>(ctx.GetPlace());
d_scale_data = d_scale->mutable_data<T>(ctx.GetPlace());
}
// d_bias = np.sum(d_y, axis=0)
// d_scale = np.sum((X - mean) / inv_std * dy, axis=0)
// d_x = (1. / N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0)
// - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0))
EigenVectorArrayMap<T> d_bias_arr(d_bias_data, C);
EigenVectorArrayMap<T> d_scale_arr(d_scale_data, C);
EigenVectorArrayMap<T> d_bias_arr(d_bias->mutable_data<T>(ctx.GetPlace()),
C);
EigenVectorArrayMap<T> d_scale_arr(d_scale->mutable_data<T>(ctx.GetPlace()),
C);
if (d_scale && d_bias) {
d_bias_arr.setZero();
d_scale_arr.setZero();
}
if ((N * sample_size) == 1) {
framework::TensorCopySync(*d_y, ctx.GetPlace(), d_x);
if ((N * sample_size) == 1 && !use_global_stats) {
framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
return;
}
const auto scale_inv_var_nhw = scale_arr * inv_var_arr / (N * sample_size);
int scale_coefff = use_global_stats ? 1 : N * sample_size;
const auto scale_inv_var_nhw = scale_arr * inv_var_arr / scale_coefff;
switch (data_layout) {
case DataLayout::kNCHW: {
......@@ -460,19 +510,29 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
sample_size, N * C);
d_x_arr.setZero();
if (d_scale && d_bias) {
for (int nc = 0; nc < N * C; ++nc) {
int c = nc % C;
d_bias_arr(c) += d_y_arr.col(nc).sum();
d_scale_arr(c) +=
((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * d_y_arr.col(nc))
d_scale_arr(c) += ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) *
d_y_arr.col(nc))
.sum();
}
}
if (!use_global_stats) {
for (int nc = 0; nc < N * C; ++nc) {
int c = nc % C;
d_x_arr.col(nc) +=
scale_inv_var_nhw(c) *
(d_y_arr.col(nc) * N * sample_size - d_bias_arr(c) -
(x_arr.col(nc) - mean_arr[c]) * d_scale_arr(c) * inv_var_arr(c));
(x_arr.col(nc) - mean_arr[c]) * d_scale_arr(c) *
inv_var_arr(c));
}
} else {
for (int nc = 0; nc < N * C; ++nc) {
int c = nc % C;
d_x_arr.col(nc) += scale_inv_var_nhw(c) * d_y_arr.col(nc);
}
}
break;
}
......@@ -488,16 +548,28 @@ class BatchNormGradKernel<platform::CPUDeviceContext, T>
const auto d_y_mul_x_minus_mean_row_sum =
(d_y_arr * x_minus_mean).rowwise().sum();
const auto inv_var_sqr = inv_var_arr * inv_var_arr;
if (d_scale && d_bias) {
for (int nhw = 0; nhw < N * sample_size; ++nhw) {
d_bias_arr += d_y_arr.col(nhw);
d_scale_arr +=
(x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw);
}
}
if (!use_global_stats) {
for (int nhw = 0; nhw < N * sample_size; ++nhw) {
d_x_arr.col(nhw) +=
scale_inv_var_nhw *
(d_y_arr.col(nhw) * N * sample_size - d_y_row_sum -
x_minus_mean.col(nhw) * inv_var_sqr *
d_y_mul_x_minus_mean_row_sum);
}
} else {
for (int nhw = 0; nhw < N * sample_size; ++nhw) {
d_x_arr.col(nhw) += scale_inv_var_nhw * d_y_arr.col(nhw);
}
}
break;
}
default:
......@@ -522,6 +594,10 @@ class BatchNormGradMaker : public framework::SingleGradOpDescMaker {
op->SetInput("SavedMean", Output("SavedMean"));
op->SetInput("SavedVariance", Output("SavedVariance"));
// used when setting use_global_stats True during training
op->SetInput("Mean", Output("MeanOut"));
op->SetInput("Variance", Output("VarianceOut"));
op->SetAttrMap(Attrs());
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
......
......@@ -12,9 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/batch_norm_op.h"
#include <algorithm>
#include <cfloat>
#include <string>
#include <vector>
#include "cub/cub.cuh"
#include "paddle/fluid/framework/data_layout.h"
#include "paddle/fluid/operators/batch_norm_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/cudnn_helper.h"
#include "paddle/fluid/platform/float16.h"
......@@ -59,6 +63,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
const float momentum = ctx.Attr<float>("momentum");
const bool is_test = ctx.Attr<bool>("is_test");
const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
......@@ -121,7 +126,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
auto handle = dev_ctx.cudnn_handle();
// Now, depending on whether we are running test or not, we have two paths.
if (is_test) {
if (is_test || use_global_stats) {
// only when test we use input to do computation.
const auto *est_mean = ctx.Input<Tensor>("Mean");
const auto *est_var = ctx.Input<Tensor>("Variance");
......@@ -163,7 +168,7 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
if ((N * H * W * D) == 1) {
LOG(WARNING) << "Only 1 element in normalization dimension, "
<< "we skip the batch norm calculation, let y = x.";
framework::TensorCopySync(*x, ctx.GetPlace(), y);
framework::TensorCopy(*x, ctx.GetPlace(), y);
} else {
double this_factor = 1. - momentum;
......@@ -191,6 +196,58 @@ class BatchNormKernel<platform::CUDADeviceContext, T>
}
};
template <typename T, framework::DataLayout layout>
static __global__ void KeBNBackwardData(const T *dy,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *variance,
const double epsilon, const int C,
const int HxW, const int num, T *dx) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = gid; i < num; i += stride) {
const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C;
BatchNormParamType<T> inv_var = 1.0 / sqrt(variance[c] + epsilon);
dx[i] = static_cast<T>(static_cast<BatchNormParamType<T>>(dy[i]) *
scale[c] * inv_var);
}
}
template <typename T, int BlockDim, framework::DataLayout layout>
static __global__ void KeBNBackwardScaleBias(
const T *dy, const T *x, const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *variance, const double epsilon, const int N,
const int C, const int HxW, BatchNormParamType<T> *dscale,
BatchNormParamType<T> *dbias) {
const int outer_size = C;
const int inner_size = N * HxW;
typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage ds_storage;
__shared__ typename BlockReduce::TempStorage db_storage;
for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
BatchNormParamType<T> ds_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> db_sum = static_cast<BatchNormParamType<T>>(0);
BatchNormParamType<T> inv_var_i = 1.0 / sqrt(variance[i] + epsilon);
BatchNormParamType<T> mean_i = mean[i];
for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
const int index = layout == framework::DataLayout::kNCHW
? (j / HxW * C + i) * HxW + j % HxW
: j * outer_size + i;
ds_sum += static_cast<BatchNormParamType<T>>(dy[index]) *
(static_cast<BatchNormParamType<T>>(x[index]) - mean_i);
db_sum += static_cast<BatchNormParamType<T>>(dy[index]);
}
ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum());
db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum());
if (threadIdx.x == 0) {
dscale[i] = ds_sum * inv_var_i;
dbias[i] = db_sum;
}
__syncthreads();
}
}
template <typename T>
class BatchNormGradKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
......@@ -200,6 +257,8 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
"It must use CUDAPlace.");
double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
const bool use_global_stats = ctx.Attr<bool>("use_global_stats");
const DataLayout data_layout =
framework::StringToDataLayout(data_layout_str);
const auto *x = ctx.Input<Tensor>("X");
......@@ -219,12 +278,27 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
d_x->mutable_data<T>(ctx.GetPlace());
if (d_scale && d_bias) {
d_scale->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
d_bias->mutable_data<BatchNormParamType<T>>(ctx.GetPlace());
}
PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL);
PADDLE_ENFORCE_EQ(scale->dims()[0], C);
std::vector<int> dims;
std::vector<int> strides;
if (data_layout == DataLayout::kNCHW) {
dims = {N, C, H, W, D};
strides = {C * H * W * D, H * W * D, W * D, D, 1};
} else {
dims = {N, C, H, W, D};
strides = {H * W * C * D, 1, W * D * C, D * C, C};
}
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
if (!use_global_stats) {
if ((N * H * W * D) == 1) {
framework::TensorCopySync(*d_y, ctx.GetPlace(), d_x);
framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
math::SetConstant<platform::CUDADeviceContext, BatchNormParamType<T>>
functor;
functor(dev_ctx, d_scale, static_cast<BatchNormParamType<T>>(0));
......@@ -232,15 +306,13 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
return;
}
PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL);
PADDLE_ENFORCE_EQ(scale->dims()[0], C);
// ------------------- cudnn descriptors ---------------------
cudnnTensorDescriptor_t data_desc_;
cudnnTensorDescriptor_t bn_param_desc_;
cudnnBatchNormMode_t mode_;
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
CUDNN_ENFORCE(
platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
CUDNN_ENFORCE(
platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
......@@ -255,15 +327,6 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
mode_ = CUDNN_BATCHNORM_SPATIAL;
#endif
std::vector<int> dims;
std::vector<int> strides;
if (data_layout == DataLayout::kNCHW) {
dims = {N, C, H, W, D};
strides = {C * H * W * D, H * W * D, W * D, D, 1};
} else {
dims = {N, C, H, W, D};
strides = {H * W * C * D, 1, W * D * C, D * C, C};
}
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
data_desc_, CudnnDataType<T>::type,
x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
......@@ -289,9 +352,56 @@ class BatchNormGradKernel<platform::CUDADeviceContext, T>
epsilon, saved_mean_data, saved_var_data));
// clean when exit.
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
} else {
const auto *running_mean = ctx.Input<Tensor>("Mean");
const auto *running_var = ctx.Input<Tensor>("Variance");
const auto *running_mean_data =
running_mean->template data<BatchNormParamType<T>>();
const auto *running_var_data =
running_var->template data<BatchNormParamType<T>>();
const int num = x->numel();
const int block = 512;
int max_threads = dev_ctx.GetMaxPhysicalThreadCount();
const int max_blocks = std::max(max_threads / block, 1);
int grid1 = (num + block - 1) / block;
int grid2 = std::min(C, max_blocks);
if (data_layout == framework::DataLayout::kNCHW) {
if (d_x) {
KeBNBackwardData<T, framework::DataLayout::kNCHW><<<
grid1, block, 0, dev_ctx.stream()>>>(
d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
running_var_data, epsilon, C, H * W, num, d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<T, block, framework::DataLayout::kNCHW><<<
grid2, block, 0, dev_ctx.stream()>>>(
d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data,
epsilon, C, H * W, num, d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
} else {
if (d_x) {
KeBNBackwardData<T, framework::DataLayout::kNHWC><<<
grid1, block, 0, dev_ctx.stream()>>>(
d_y->data<T>(), scale->data<BatchNormParamType<T>>(),
running_var_data, epsilon, C, H * W, num, d_x->data<T>());
}
if (d_scale && d_bias) {
KeBNBackwardScaleBias<T, block, framework::DataLayout::kNCHW><<<
grid2, block, 0, dev_ctx.stream()>>>(
d_y->data<T>(), x->data<T>(), running_mean_data, running_var_data,
epsilon, C, H * W, num, d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>());
}
}
}
}
};
......
......@@ -15,7 +15,7 @@
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
namespace paddle {
namespace operators {
......@@ -28,259 +28,6 @@ using mkldnn::stream;
using platform::to_void_cast;
using platform::GetMKLDNNFormat;
class ConvMKLDNNHandler : public platform::MKLDNNHandler {
public:
ConvMKLDNNHandler(
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd,
const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
const std::string& base_key)
: platform::MKLDNNHandler(dev_ctx, engine, base_key) {
conv_pd_ = conv_pd;
}
ConvMKLDNNHandler(
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd,
std::shared_ptr<mkldnn::convolution_backward_data::primitive_desc>
conv_bwd_data_pd,
std::shared_ptr<mkldnn::convolution_backward_weights::primitive_desc>
conv_bwd_weights_pd,
const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
const std::string& base_key)
: platform::MKLDNNHandler(dev_ctx, engine, base_key),
conv_pd_(conv_pd),
conv_bwd_weights_pd_(conv_bwd_weights_pd),
conv_bwd_data_pd_(conv_bwd_data_pd) {
// If we are in Grad operatgor then update a key with BWD suffix to
// distinguish from FWD memory primitives
key_ += "-BWD";
}
size_t GetDstMemorySize() const {
return conv_pd_->dst_primitive_desc().get_size();
}
mkldnn::memory::format GetDstFormat() const {
return static_cast<mkldnn::memory::format>(
conv_pd_->dst_primitive_desc().desc().data.format);
}
size_t GetDiffWeightsMemorySize() const {
return conv_bwd_weights_pd_->diff_weights_primitive_desc().get_size();
}
size_t GetDiffSourceMemorySize() const {
return conv_bwd_data_pd_->diff_src_primitive_desc().get_size();
}
std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromWeightsPrimitive(
const std::shared_ptr<mkldnn::memory> user_memory_p,
std::vector<mkldnn::primitive>& pipeline) { // NOLINT
auto src_pd = conv_bwd_weights_pd_->src_primitive_desc();
auto user_pd = user_memory_p->get_primitive_desc();
return this->AcquireMemory(src_pd, user_pd, user_memory_p,
"@weights-src_mem_p", pipeline);
}
std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromWeightsPrimitive(
const std::shared_ptr<mkldnn::memory> user_memory_p,
std::vector<mkldnn::primitive>& pipeline) { // NOLINT
auto diff_dst_pd = conv_bwd_weights_pd_->diff_dst_primitive_desc();
auto user_pd = user_memory_p->get_primitive_desc();
return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p,
"@weights-diff_dst_mem_p", pipeline);
}
std::shared_ptr<mkldnn::memory> AcquireDiffWeightsMemoryFromWeightsPrimitive(
void* ptr) {
return this->AcquireMemoryFromPrimitive(
conv_bwd_weights_pd_->diff_weights_primitive_desc(), ptr,
"@diff_weights_mem_p");
}
std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromDataPrimitive(
const std::shared_ptr<mkldnn::memory> user_memory_p,
std::vector<mkldnn::primitive>& pipeline) { // NOLINT
auto diff_dst_pd = conv_bwd_data_pd_->diff_dst_primitive_desc();
auto user_pd = user_memory_p->get_primitive_desc();
return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p,
"@data-diff_dst_mem_p", pipeline);
}
std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromDataPrimitive(
const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
std::vector<mkldnn::primitive>& pipeline) { // NOLINT
auto weights_pd = conv_bwd_data_pd_->weights_primitive_desc();
auto user_pd = user_weights_memory_p->get_primitive_desc();
return this->AcquireMemory(weights_pd, user_pd, user_weights_memory_p,
"@data-weights_mem_p", pipeline);
}
std::shared_ptr<mkldnn::memory> AcquireResidualDataMemory(
const mkldnn::memory::desc& md, void* ptr) {
return this->AcquireMemory(md, ptr, "@user_residual_data_mem_p");
}
std::shared_ptr<mkldnn::memory> AcquireDstMemoryFromResidualDataMemory(
const std::shared_ptr<mkldnn::memory>& user_residual_memory_p,
void* dst_ptr,
std::vector<mkldnn::primitive>& pipeline) { // NOLINT
return this->AcquireMemory(user_residual_memory_p,
this->AcquireDstMemoryFromPrimitive(dst_ptr),
"@residual_data_mem_p", pipeline);
}
std::shared_ptr<mkldnn::memory> AcquireDiffSrcMemoryFromDataPrimitive(
void* ptr) {
return this->AcquireMemoryFromPrimitive(
conv_bwd_data_pd_->diff_src_primitive_desc(), ptr, "@diff_src_mem_p");
}
std::shared_ptr<mkldnn::memory> AcquireDstMemoryFromPrimitive(void* ptr) {
return this->AcquireMemoryFromPrimitive(conv_pd_->dst_primitive_desc(), ptr,
"@dst_mem_p");
}
std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromPrimitive(
const std::shared_ptr<mkldnn::memory> user_memory_p,
std::vector<mkldnn::primitive>& pipeline) { // NOLINT
auto src_pd = conv_pd_->src_primitive_desc();
auto user_pd = user_memory_p->get_primitive_desc();
return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p",
pipeline);
}
std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive(
const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
std::vector<mkldnn::primitive>& pipeline, // NOLINT
bool is_persistent = false) {
auto user_weights_pd = user_weights_memory_p->get_primitive_desc();
auto weights_pd = conv_pd_->weights_primitive_desc();
return this->AcquireMemory(weights_pd, user_weights_pd,
user_weights_memory_p, "@weights_mem_p",
pipeline, is_persistent);
}
std::shared_ptr<mkldnn::memory> AcquireBiasMemoryFromPrimitive(
const std::shared_ptr<mkldnn::memory> user_bias_memory_p,
std::vector<mkldnn::primitive>& pipeline) { // NOLINT
auto user_bias_pd = user_bias_memory_p->get_primitive_desc();
auto bias_pd = conv_pd_->bias_primitive_desc();
return this->AcquireMemory(bias_pd, user_bias_pd, user_bias_memory_p,
"@bias_mem_p", pipeline);
}
std::shared_ptr<mkldnn::convolution_forward> AcquireConvolution(
std::shared_ptr<mkldnn::memory> src_memory_p,
std::shared_ptr<mkldnn::memory> weights_memory_p,
std::shared_ptr<mkldnn::memory> dst_memory_p) {
auto prim_key = key_ + "@conv_p";
auto conv_p = std::static_pointer_cast<mkldnn::convolution_forward>(
dev_ctx_.GetBlob(prim_key));
PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false),
"Fail to find convolution primitive in device context");
if (conv_p == nullptr) {
conv_p = std::make_shared<mkldnn::convolution_forward>(
*conv_pd_, *(src_memory_p), *(weights_memory_p.get()),
*(dst_memory_p.get()));
dev_ctx_.SetBlob(prim_key, conv_p);
} else {
is_reusing_ = true;
}
return conv_p;
}
std::shared_ptr<mkldnn::convolution_forward> AcquireConvolution(
std::shared_ptr<mkldnn::memory> src_memory_p,
std::shared_ptr<mkldnn::memory> weights_memory_p,
std::shared_ptr<mkldnn::memory> bias_memory_p,
std::shared_ptr<mkldnn::memory> dst_memory_p) {
auto prim_key = key_ + "@conv_p";
auto conv_p = std::static_pointer_cast<mkldnn::convolution_forward>(
dev_ctx_.GetBlob(prim_key));
PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false),
"Fail to find convolution primitive in device context");
if (conv_p == nullptr) {
conv_p = std::make_shared<mkldnn::convolution_forward>(
*conv_pd_, *(src_memory_p), *(weights_memory_p.get()),
*(bias_memory_p.get()), *(dst_memory_p.get()));
dev_ctx_.SetBlob(prim_key, conv_p);
} else {
is_reusing_ = true;
}
return conv_p;
}
std::shared_ptr<mkldnn::convolution_backward_weights>
AcquireConvolutionBackwardWeights(
std::shared_ptr<mkldnn::memory> src_memory_p,
std::shared_ptr<mkldnn::memory> diff_dst_memory_p,
std::shared_ptr<mkldnn::memory> diff_weights_memory_p) {
auto prim_key = key_ + "@conv_bwd_weights_p";
auto conv_bwd_weights_p =
std::static_pointer_cast<mkldnn::convolution_backward_weights>(
dev_ctx_.GetBlob(prim_key));
PADDLE_ENFORCE(
(conv_bwd_weights_p != nullptr) || (is_reusing_ == false),
"Fail to find convolution bwd weights primitive in device context");
if (conv_bwd_weights_p == nullptr) {
// create backward conv primitive for weights
conv_bwd_weights_p =
std::make_shared<mkldnn::convolution_backward_weights>(
*conv_bwd_weights_pd_, *src_memory_p, *diff_dst_memory_p,
*diff_weights_memory_p);
dev_ctx_.SetBlob(prim_key, conv_bwd_weights_p);
} else {
is_reusing_ = true;
}
return conv_bwd_weights_p;
}
std::shared_ptr<mkldnn::convolution_backward_data>
AcquireConvolutionBackwardData(
std::shared_ptr<mkldnn::memory> diff_dst_memory_p,
std::shared_ptr<mkldnn::memory> weights_memory_p,
std::shared_ptr<mkldnn::memory> diff_src_memory_p) {
auto prim_key = key_ + "@conv_bwd_data_p";
auto conv_bwd_data_p =
std::static_pointer_cast<mkldnn::convolution_backward_data>(
dev_ctx_.GetBlob(prim_key));
PADDLE_ENFORCE(
(conv_bwd_data_p != nullptr) || (is_reusing_ == false),
"Fail to find convolution bwd data primitive in device context");
if (conv_bwd_data_p == nullptr) {
conv_bwd_data_p = std::make_shared<mkldnn::convolution_backward_data>(
*conv_bwd_data_pd_, *diff_dst_memory_p, *weights_memory_p,
*diff_src_memory_p);
dev_ctx_.SetBlob(prim_key, conv_bwd_data_p);
} else {
is_reusing_ = true;
}
return conv_bwd_data_p;
}
// Generate keys for storing/retriving primitives for this operator
// TODO(jczaja): Make hashing function more optimial
static std::string GetHash(memory::dims& input_dims, // NOLINT
memory::dims& weights_dims, // NOLINT
std::vector<int>& strides, // NOLINT
std::vector<int>& paddings, // NOLINT
std::vector<int>& dilations, // NOLINT
int groups, const std::string& suffix) {
return dims2str(input_dims) + dims2str(weights_dims) + dims2str(strides) +
dims2str(paddings) + dims2str(dilations) + std::to_string(groups) +
suffix;
}
private:
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd_;
std::shared_ptr<mkldnn::convolution_backward_weights::primitive_desc>
conv_bwd_weights_pd_;
std::shared_ptr<mkldnn::convolution_backward_data::primitive_desc>
conv_bwd_data_pd_;
};
template <typename T>
class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
public:
......@@ -351,7 +98,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// Get unique name for storing MKLDNN primitives
const std::string key = ConvMKLDNNHandler::GetHash(
const std::string key = platform::ConvMKLDNNHandler::GetHash(
src_tz, weights_tz, strides, paddings, dilations, groups,
ctx.op().Output("Output"));
const std::string key_conv_pd = key + "@conv_pd";
......@@ -400,7 +147,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
// Save conv_pd/src_memory/weights_memory for backward pass
if (!is_test) dev_ctx.SetBlob(key_conv_pd, conv_pd);
ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key);
platform::ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key);
// create mkldnn memory from input tensors (data/weights)
auto user_src_memory_p =
......@@ -616,9 +363,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
// Get an unique name from "argument" name of "Output" variable
// as well as attributes of primitive to be created
// This name will be used as key when saving info into device context
const std::string key =
ConvMKLDNNHandler::GetHash(src_tz, weights_tz, strides, paddings,
dilations, groups, ctx.op().Input("Output"));
const std::string key = platform::ConvMKLDNNHandler::GetHash(
src_tz, weights_tz, strides, paddings, dilations, groups,
ctx.op().Input("Output"));
const std::string key_conv_pd = key + "@conv_pd";
std::vector<primitive> pipeline;
......@@ -673,8 +420,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
std::make_shared<mkldnn::convolution_backward_data::primitive_desc>(
conv_bwd_data_desc, mkldnn_engine, *conv_pd);
ConvMKLDNNHandler handler(conv_pd, conv_bwd_data_pd, conv_bwd_weights_pd,
dev_ctx, mkldnn_engine, key);
platform::ConvMKLDNNHandler handler(conv_pd, conv_bwd_data_pd,
conv_bwd_weights_pd, dev_ctx,
mkldnn_engine, key);
// create mkldnn memory from input tensors (data/weights)
auto user_src_memory_p =
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using framework::DataLayout;
template <typename T>
class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
const bool is_test = ctx.Attr<bool>("is_test");
PADDLE_ENFORCE(
is_test == true,
"ConvTransposeMKLDNN works only for inference!. Set is_test = True");
auto& dev_ctx =
ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
auto* input = ctx.Input<Tensor>("Input");
auto* filter = ctx.Input<Tensor>("Filter");
auto* bias = ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;
auto* output = ctx.Output<Tensor>("Output");
PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
input->format() != mkldnn::memory::format::format_undef,
"Wrong layout/format set for Input tensor");
PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
filter->format() != mkldnn::memory::format::format_undef,
"Wrong layout/format set for Filter tensor");
PADDLE_ENFORCE(input->dims().size() == 4,
"Input must be with 4 dimensions, i.e. NCHW");
PADDLE_ENFORCE(filter->dims().size() == 4,
"Filter must be with 4 dimensions, i.e. OIHW");
if (bias) {
PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN &&
bias->format() != mkldnn::memory::format::format_undef,
"Wrong layout/format set for Bias tensor");
PADDLE_ENFORCE(bias->dims().size() == 1,
"Bias must only have 1 dimension, i.e. X");
}
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups");
// TODO(tpatejko): add support for dilation
PADDLE_ENFORCE(
dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
"dilation in convolution is not implemented yet");
const T* input_data = input->data<T>();
const T* filter_data = filter->data<T>();
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> iohw_weights_tz =
paddle::framework::vectorize2int(filter->dims());
std::vector<int> weights_tz = iohw_weights_tz;
// IOHW -> OIHW
weights_tz[0] = iohw_weights_tz[1];
weights_tz[1] = iohw_weights_tz[0];
// Custom Reorder from IOHW to OIHW
auto iohw2oihw_reorder =
[&iohw_weights_tz](const T* filter_data) -> std::shared_ptr<T> {
int o = iohw_weights_tz[1];
int c = iohw_weights_tz[0];
int h = iohw_weights_tz[2];
int w = iohw_weights_tz[3];
std::shared_ptr<T> reordered_filter_data(new T[o * c * h * w](),
std::default_delete<T[]>());
for (int i = 0; i < c; ++i) {
for (int j = 0; j < o; ++j) {
int in_offset = j * h * w + i * o * h * w;
int out_offset = j * c * h * w + i * h * w;
std::memcpy(&(reordered_filter_data.get())[out_offset],
&filter_data[in_offset], h * w * sizeof(T));
}
}
return reordered_filter_data;
};
int g = std::max(groups, 1);
if (g > 1) {
int o = weights_tz[0];
int i = weights_tz[1];
int h = weights_tz[2];
int w = weights_tz[3];
weights_tz.resize(5);
weights_tz[0] = g;
weights_tz[1] = o / g;
weights_tz[2] = i;
weights_tz[3] = h;
weights_tz[4] = w;
}
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// Get unique name for storing MKLDNN primitives
const std::string key = platform::ConvTransposeMKLDNNHandler::GetHash(
src_tz, weights_tz, strides, paddings, dilations, groups,
ctx.op().Output("Output"));
const std::string key_conv_transpose_pd = key + "@conv_transpose_pd";
std::vector<mkldnn::primitive> pipeline;
auto user_src_md = platform::MKLDNNMemDesc(
{src_tz}, platform::MKLDNNGetDataType<T>(), input->format());
auto user_weights_md =
platform::MKLDNNMemDesc({weights_tz}, platform::MKLDNNGetDataType<T>(),
(g == 1) ? mkldnn::memory::format::oihw
: mkldnn::memory::format::goihw);
/* create memory descriptor for convolution without specified format
* ('any') which lets a primitive (convolution in this case) choose
* the memory format preferred for best performance
*/
std::string data_format = ctx.Attr<std::string>("data_format");
auto chosen_memory_format =
platform::data_format_to_memory_format(data_format);
bool fuse_relu = ctx.Attr<bool>("fuse_relu");
auto src_md = platform::MKLDNNMemDesc(
src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
auto weights_md = platform::MKLDNNMemDesc(
weights_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
std::vector<int> bias_tz; // TODO(mgallus): avoid empty vector creation.
// Currently used whenever bias is != nullptr.
auto dst_md = platform::MKLDNNMemDesc(
dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
// create a deconv(conv transpose) primitive descriptor and save it for
// usage in backward
std::shared_ptr<mkldnn::deconvolution_forward::primitive_desc>
conv_transpose_pd;
auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference
: mkldnn::prop_kind::forward_training;
if (bias) {
bias_tz = paddle::framework::vectorize2int(bias->dims());
auto bias_md = platform::MKLDNNMemDesc(
bias_tz, platform::MKLDNNGetDataType<T>(), mkldnn::memory::format::x);
conv_transpose_pd = ConvTransposeFwdPrimitiveDesc(
src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine,
fuse_relu, fwd_prop_kind);
} else {
conv_transpose_pd = ConvTransposeFwdPrimitiveDesc(
src_md, weights_md, dst_md, strides, paddings, mkldnn_engine,
fuse_relu, fwd_prop_kind);
}
// Save conv_pd/src_memory/weights_memory for backward pass
if (!is_test) dev_ctx.SetBlob(key_conv_transpose_pd, conv_transpose_pd);
platform::ConvTransposeMKLDNNHandler handler(conv_transpose_pd, dev_ctx,
mkldnn_engine, key);
// create mkldnn memory from input tensors (data/weights)
auto user_src_memory_p = handler.AcquireSrcMemory(
user_src_md, platform::to_void_cast<T>(input_data));
auto user_weights_memory_p = handler.AcquireWeightsMemory(
user_weights_md, platform::to_void_cast<T>(filter_data),
is_test ? iohw2oihw_reorder : platform::user_function());
// create reorder primitive if the input format is not the preferred one
auto src_memory_p =
handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
auto weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive(
user_weights_memory_p, pipeline, is_test);
std::shared_ptr<mkldnn::memory> dst_memory_p;
auto output_data = output->mutable_data<T>(
ctx.GetPlace(), paddle::memory::Allocator::kDefault,
handler.GetDstMemorySize());
dst_memory_p = handler.AcquireDstMemoryFromPrimitive(
platform::to_void_cast<T>(output_data));
// create convolution op primitive
std::shared_ptr<mkldnn::deconvolution_forward> conv_p;
if (bias) {
const T* bias_data = bias->data<T>();
auto user_bias_md =
platform::MKLDNNMemDesc({bias_tz}, platform::MKLDNNGetDataType<T>(),
mkldnn::memory::format::x);
auto user_bias_memory_p = handler.AcquireBiasMemory(
user_bias_md, platform::to_void_cast<T>(bias_data));
auto bias_memory_p =
handler.AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline);
conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
bias_memory_p, dst_memory_p);
} else {
conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
dst_memory_p);
}
// push primitive to stream and wait until it's executed
pipeline.push_back(*conv_p);
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
output->set_layout(DataLayout::kMKLDNN);
output->set_format(platform::GetMKLDNNFormat(*dst_memory_p));
}
private:
mkldnn::primitive_attr CreatePostOps(bool fuse_relu) const {
mkldnn::primitive_attr conv_attr;
mkldnn::post_ops post_operations;
// Fusion with ReLU layer is executed through the PostOps feature. Create a
// PostOps object and configure it to execute an eltwise relu operation.
if (fuse_relu) {
constexpr float scale = 1.0f;
constexpr float negative_slope = 0.0f;
constexpr float placeholder = 0.0f;
post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu,
negative_slope, placeholder);
}
conv_attr.set_post_ops(post_operations);
return conv_attr;
}
std::unique_ptr<mkldnn::deconvolution_forward::primitive_desc>
ConvTransposeFwdPrimitiveDesc(
const mkldnn::memory::desc& src, const mkldnn::memory::desc& weights,
const mkldnn::memory::desc& dst, const std::vector<int>& strides,
const std::vector<int>& paddings, const mkldnn::engine& engine,
const bool fuse_relu, mkldnn::prop_kind fwd_prop_kind) const {
mkldnn::memory::dims stride_dims = {strides[0], strides[1]};
mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]};
auto deconv_desc = mkldnn::deconvolution_forward::desc(
fwd_prop_kind, mkldnn::deconvolution_direct, src, weights, dst,
stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
mkldnn::primitive_attr deconv_attr = CreatePostOps(fuse_relu);
auto p_conv_transpose_pd =
new mkldnn::deconvolution_forward::primitive_desc(deconv_desc,
deconv_attr, engine);
return std::unique_ptr<mkldnn::deconvolution_forward::primitive_desc>(
p_conv_transpose_pd);
}
std::unique_ptr<mkldnn::deconvolution_forward::primitive_desc>
ConvTransposeFwdPrimitiveDesc(
const mkldnn::memory::desc& src, const mkldnn::memory::desc& weights,
const mkldnn::memory::desc& bias, const mkldnn::memory::desc& dst,
const std::vector<int>& strides, const std::vector<int>& paddings,
const mkldnn::engine& engine, const bool fuse_relu,
mkldnn::prop_kind fwd_prop_kind) const {
mkldnn::memory::dims stride_dims = {strides[0], strides[1]};
mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]};
auto deconv_desc = mkldnn::deconvolution_forward::desc(
fwd_prop_kind, mkldnn::deconvolution_direct, src, weights, bias, dst,
stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
mkldnn::primitive_attr deconv_attr = CreatePostOps(fuse_relu);
auto p_conv_transpose_pd =
new mkldnn::deconvolution_forward::primitive_desc(deconv_desc,
deconv_attr, engine);
return std::unique_ptr<mkldnn::deconvolution_forward::primitive_desc>(
p_conv_transpose_pd);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(conv2d_transpose, MKLDNN, ::paddle::platform::CPUPlace,
ops::ConvTransposeMKLDNNOpKernel<float>);
......@@ -16,6 +16,10 @@ limitations under the License. */
#include <string>
#include <vector>
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
......@@ -78,29 +82,38 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType ConvTransposeOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
framework::LibraryType library_{framework::LibraryType::kPlain};
std::string data_format = ctx.Attr<std::string>("data_format");
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
bool use_cudnn = ctx.Attr<bool>("use_cudnn");
use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
}
#endif
framework::LibraryType library_;
if (use_cudnn) {
library_ = framework::LibraryType::kCUDNN;
} else {
library_ = framework::LibraryType::kPlain;
}
}
#endif
#ifdef PADDLE_WITH_MKLDNN
if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
layout_ = framework::DataLayout::kMKLDNN;
}
#endif
std::string data_format = ctx.Attr<std::string>("data_format");
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
layout_, library_);
}
void Conv2DTransposeOpMaker::Make() {
AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
AddInput(
"Input",
"(Tensor) The input tensor of convolution transpose operator. "
......@@ -145,6 +158,11 @@ void Conv2DTransposeOpMaker::Make() {
"use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn")
.SetDefault(false);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<bool>("fuse_relu", "(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<std::string>(
"data_format",
"(string, default NCHW) Only used in "
......@@ -238,6 +256,9 @@ void Conv3DTransposeOpMaker::Make() {
"use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn")
.SetDefault(false);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<std::string>(
"data_format",
"(string, default NCHW) Only used in "
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
class CudnnLSTMOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input(Input) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasInput("W"),
"Input(Weight) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasInput("InitH"),
"Input(init_h) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasInput("InitC"),
"Input(init_c) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Cache"),
"Input(Cache) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("last_h"),
"Output(last_h) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("last_c"),
"Output(last_c) of LSTM should not be null.");
auto in_dims = ctx->GetInputDim("Input");
PADDLE_ENFORCE_EQ(in_dims.size(), 3, "Input(X)'s rank must be 3.");
ctx->SetOutputDim("Out", ctx->GetInputDim("Input"));
ctx->SetOutputDim("last_h", ctx->GetInputDim("InitH"));
ctx->SetOutputDim("last_c", ctx->GetInputDim("InitC"));
}
};
class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput(
"Input",
"(Tensor) RNN input tensor, which support variable-time length input "
"sequence."
"The shape of the Tensor MUST be ( seq_len * batch_size * input_size)"
"seq_len is the total time step in this mini-batch (CAN be change in "
"different batch)"
"batch_size is the instance number of this batch"
"input_size is the hidden size of the input."
"input_hidden_size and the hidden_size in the next may not be same");
AddInput("InitH",
"(Tensor) the initial hidden state of the LSTM"
"input. This is a tensor with shape (num_layers x batch_size x "
"hidden_size)"
"and When is_bidirec is True, the shape will be (num_layers*2 x "
"batch_size x hidden_size)");
AddInput("InitC",
"(Tensor) the initial cell state of the LSTm "
"input. This is a tensor with shape (num_layers x batch_size x "
"hidden_size)"
"and When is_bidirec is True, the shape will be (num_layers*2 x "
"batch_size x hidden_size)");
AddInput("W",
"(Tensor) the learnable hidden-hidden weights."
" The shape is (N), where N is total weight size of the LSTM. "
" cudnn concatenate all the weight to one Tensor");
AddInput("Cache",
"The cache of dropout op, a RAW type variable including random "
"number generator states and some descriptors, which is used in "
"cudnn kernel.")
.AsDispensable();
AddOutput("Out",
"(Tensor) the hidden state of LSTM operator. "
"The shape is ( seq_len x batch_size x hidden_size) if "
"is_bidirec is False"
"and When is_bidirec is True, the shape will be ( seq_len x "
"batch_size x hidden_size * 2) ");
AddOutput("last_h",
"(Tensor) the hidden state of the last step. "
"The shape is ( num_layers x batch_size x hidden_size) if "
"is_bidirec is False"
"and When is_bidirec is True, the shape will be (num_layers*2 x "
"batch_size x hidden_size)");
AddOutput("last_c",
"(Tensor) the cell state of the last step"
"The shape is ( num_layers x batch_size x hidden_size) if "
"is_bidirec is False"
"and When is_bidirect is True, the shape will be (num_layers*2 x "
"batch_size x hidden_size*2)");
AddAttr<int>("max_len",
"max length of the LSTM op"
"the first dim of the Input can NOT be greater than max_len")
.SetDefault(20);
AddAttr<float>(
"dropout_prob",
"dropout prob of the dropout op"
"the dropout ONLY work between lstm layers, not between time steps"
"There is no dropout work on the Out tensor")
.SetDefault(0.0);
AddAttr<bool>("is_bidirec",
"is_bidirec"
"if it is bidirection rnn"
"The will affect the shape of the Out, last_h, and last_c")
.SetDefault(false);
AddAttr<int>("input_size", "input size ot the Input Tensor").SetDefault(10);
AddAttr<int>("hidden_size", "hidden size of the LSTM").SetDefault(100);
AddAttr<int>("num_layers", "the total layer number of the LSTM")
.SetDefault(1);
AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
AddAttr<int>("seed", "seed to used if fix_seed is True").SetDefault(-1);
AddComment(R"DOC(
CUDNN LSTM implementation
A four-gate Long Short-Term Memory network with no peephole connections.
In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1,
the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations:
$$ i_t = sigmoid(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$
$$ f_t = sigmoid(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$
$$ o_t = sigmoid(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$
$$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$
$$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$
$$ h_t = o_t \\odot tanh(c_t) $$
- W terms denote weight matrices (e.g. $W_{ix}$ is the matrix
of weights from the input gate to the input)
- The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector).
- sigmoid is the logistic sigmoid function.
- $i, f, o$ and $c$ are the input gate, forget gate, output gate,
and cell activation vectors, respectively, all of which have the same size as
the cell output activation vector $h$.
- The $\odot$ is the element-wise product of the vectors.
- `tanh` is the activation functions.
- $\tilde{c_t}$ is also called candidate hidden state,
which is computed based on the current input and the previous hidden state.
Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication,
X represensts a matrix multiplication
)DOC");
}
};
class CudnnLSTMGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input(Input) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasInput("last_h"),
"Input(last_h) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasInput("last_c"),
"Input(last_c) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Cache"),
"Input(last_c) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasInput("InitH"),
"Input(init_h) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasInput("InitC"),
"Input(init_c) of LSTM should not be null.");
auto SetOutGradDim = [&ctx](const std::string& name) {
auto g_name = framework::GradVarName(name);
if (ctx->HasOutput(g_name)) {
ctx->SetOutputDim(g_name, ctx->GetInputDim(name));
}
};
SetOutGradDim("Input");
SetOutGradDim("W");
SetOutGradDim("InitH");
SetOutGradDim("InitC");
}
};
template <typename T>
class NotImpleKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_THROW(
"CPU is not support for this kernel now. Will be add in the future");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(cudnn_lstm, ops::CudnnLSTMOp, ops::CudnnLSTMOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(cudnn_lstm_grad, ops::CudnnLSTMGradOp);
REGISTER_OP_CPU_KERNEL(cudnn_lstm, ops::NotImpleKernel<float>);
REGISTER_OP_CPU_KERNEL(cudnn_lstm_grad, ops::NotImpleKernel<float>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/cudnn_helper.h"
namespace paddle {
namespace operators {
using LoDTensor = framework::LoDTensor;
using Tensor = framework::Tensor;
struct CudnnRNNCache {
CudnnRNNCache() {
x_desc_ = NULL;
y_desc_ = NULL;
dx_desc_ = NULL;
dy_desc_ = NULL;
}
~CudnnRNNCache() { release(); }
cudnnRNNDescriptor_t rnn_desc_;
cudnnTensorDescriptor_t *x_desc_;
cudnnTensorDescriptor_t *y_desc_;
cudnnTensorDescriptor_t *dx_desc_;
cudnnTensorDescriptor_t *dy_desc_;
cudnnTensorDescriptor_t hx_desc_;
cudnnTensorDescriptor_t cx_desc_;
cudnnTensorDescriptor_t hy_desc_;
cudnnTensorDescriptor_t cy_desc_;
cudnnTensorDescriptor_t dhx_desc_;
cudnnTensorDescriptor_t dcx_desc_;
cudnnTensorDescriptor_t dhy_desc_;
cudnnTensorDescriptor_t dcy_desc_;
cudnnTensorDescriptor_t output_x_desc_;
cudnnTensorDescriptor_t output_y_desc_;
cudnnDropoutDescriptor_t dropout_desc_;
size_t weights_size_;
cudnnFilterDescriptor_t w_desc_;
cudnnFilterDescriptor_t dw_desc_;
size_t workspace_size_;
size_t reserve_size_;
Tensor reserve_data_;
Tensor workspace_data_;
Tensor dropout_state_;
size_t max_length_;
float dropout_prob_;
bool is_bidirec_;
int batch_size_;
int input_size_;
int hidden_size_;
int num_layers_;
int seed_;
void init(cudnnHandle_t handle, const framework::ExecutionContext &ctx,
size_t max_len, int batch_size, int input_size, int hidden_size,
int num_layers, float dropout_prob, bool is_bidirec, int seed,
int weight_numel) {
max_length_ = max_len;
batch_size_ = batch_size;
input_size_ = input_size;
hidden_size_ = hidden_size;
num_layers_ = num_layers;
dropout_prob_ = dropout_prob;
is_bidirec_ = is_bidirec;
seed_ = seed;
x_desc_ = new cudnnTensorDescriptor_t[max_length_];
y_desc_ = new cudnnTensorDescriptor_t[max_length_];
dx_desc_ = new cudnnTensorDescriptor_t[max_length_];
dy_desc_ = new cudnnTensorDescriptor_t[max_length_];
int dim_a[3];
int stride_a[3];
for (size_t i = 0; i < max_length_; ++i) {
CUDNN_ENFORCE(
platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnCreateTensorDescriptor(&dx_desc_[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnCreateTensorDescriptor(&dy_desc_[i]));
dim_a[0] = batch_size_;
dim_a[1] = input_size_;
dim_a[2] = 1;
stride_a[0] = dim_a[2] * dim_a[1];
stride_a[1] = dim_a[2];
stride_a[2] = 1;
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
x_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
dx_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
dim_a[0] = batch_size_;
dim_a[1] = is_bidirec_ ? hidden_size_ * 2 : hidden_size_;
dim_a[2] = 1;
stride_a[0] = dim_a[2] * dim_a[1];
stride_a[1] = dim_a[2];
stride_a[2] = 1;
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
y_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
dy_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
}
dim_a[0] = num_layers_ * (is_bidirec_ ? 2 : 1);
dim_a[1] = batch_size_;
dim_a[2] = hidden_size_;
stride_a[0] = dim_a[2] * dim_a[1];
stride_a[1] = dim_a[2];
stride_a[2] = 1;
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
hx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
cx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
hy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
cy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
dhx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
dcx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
dhy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
dcy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a));
CUDNN_ENFORCE(
platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_));
size_t state_size;
CUDNN_ENFORCE(
platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size);
dropout_state_.Resize({static_cast<int64_t>(state_size)}));
auto *dropout_state_data =
dropout_state_.mutable_data<uint8_t>(ctx.GetPlace());
CUDNN_ENFORCE(platform::dynload::cudnnSetDropoutDescriptor(
dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size,
seed_));
CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6(
handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_,
CUDNN_LINEAR_INPUT,
is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM,
CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT));
CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnGetRNNParamsSize(
handle, rnn_desc_, x_desc_[0], &weights_size_, CUDNN_DATA_FLOAT));
PADDLE_ENFORCE_EQ(weights_size_, sizeof(float) * weight_numel,
"cudnn lstm weight size should be SAME");
int dim_w[3];
dim_w[0] = weights_size_ / sizeof(float);
dim_w[1] = 1;
dim_w[2] = 1;
CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor(
w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w));
CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor(
dw_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w));
CUDNN_ENFORCE(platform::dynload::cudnnGetRNNWorkspaceSize(
handle, rnn_desc_, max_length_, x_desc_, &workspace_size_));
CUDNN_ENFORCE(platform::dynload::cudnnGetRNNTrainingReserveSize(
handle, rnn_desc_, max_length_, x_desc_, &reserve_size_));
reserve_data_.Resize({static_cast<int64_t>(reserve_size_)});
reserve_data_.mutable_data<uint8_t>(ctx.GetPlace());
workspace_data_.Resize({static_cast<int64_t>(workspace_size_)});
workspace_data_.mutable_data<uint8_t>(ctx.GetPlace());
}
void release() {
for (size_t i = 0; i < max_length_; ++i) {
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyTensorDescriptor(dx_desc_[i]));
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyTensorDescriptor(dy_desc_[i]));
}
delete[] x_desc_;
delete[] y_desc_;
delete[] dx_desc_;
delete[] dy_desc_;
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_));
CUDNN_ENFORCE(
platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(w_desc_));
CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_));
}
};
template <typename T>
class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const Tensor *x = ctx.Input<Tensor>("Input");
const Tensor *init_h = ctx.Input<Tensor>("InitH");
const Tensor *init_c = ctx.Input<Tensor>("InitC");
auto w = ctx.Input<Tensor>("W");
Tensor *out = ctx.Output<Tensor>("Out");
Tensor *last_h = ctx.Output<Tensor>("last_h");
Tensor *last_c = ctx.Output<Tensor>("last_c");
const T *x_data = x->data<T>();
const T *init_h_data = init_h->data<T>();
const T *init_c_data = init_c->data<T>();
const T *w_data = w->data<T>();
T *out_data = out->mutable_data<T>(ctx.GetPlace());
T *last_h_data = last_h->mutable_data<T>(ctx.GetPlace());
T *last_c_data = last_c->mutable_data<T>(ctx.GetPlace());
size_t max_len = ctx.Attr<int>("max_len");
float dropout_prob = ctx.Attr<float>("dropout_prob");
bool is_bidirec = ctx.Attr<bool>("is_bidirec");
int input_size = ctx.Attr<int>("input_size");
int hidden_size = ctx.Attr<int>("hidden_size");
int num_layers = ctx.Attr<int>("num_layers");
bool is_test = ctx.Attr<bool>("is_test");
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
auto *cache_var = ctx.InputVar("Cache");
if (!cache_var) {
// The RAW type cache variable wouldn't be created and broadcasted on
// multi-devices before the first running.
// use parent scope to make cache persistable
auto *scope = const_cast<framework::Scope *>(ctx.scope().parent());
auto cache_var_name = ctx.Inputs("Cache")[0];
cache_var = scope->Var(cache_var_name);
}
CudnnRNNCache *cudnn_rnn_cache = nullptr;
if (cache_var->IsInitialized()) {
cudnn_rnn_cache = const_cast<framework::Variable *>(cache_var)
->GetMutable<CudnnRNNCache>();
} else {
cudnn_rnn_cache = const_cast<framework::Variable *>(cache_var)
->GetMutable<CudnnRNNCache>();
std::random_device rnd;
int seed = ctx.Attr<int>("seed");
if (seed == -1) {
seed = rnd();
}
auto input_w_numel = w->numel();
auto batch_size = x->dims()[1];
cudnn_rnn_cache->init(handle, ctx, max_len, batch_size, input_size,
hidden_size, num_layers, dropout_prob, is_bidirec,
seed, input_w_numel);
}
auto run_seq_len = x->dims()[0];
if (is_test) {
// for inference
CUDNN_ENFORCE(platform::dynload::cudnnRNNForwardInference(
handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_,
init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data,
cudnn_rnn_cache->w_desc_, w_data, cudnn_rnn_cache->y_desc_, out_data,
cudnn_rnn_cache->hy_desc_, last_h_data, cudnn_rnn_cache->cy_desc_,
last_c_data, cudnn_rnn_cache->workspace_data_.data<uint8_t>(),
cudnn_rnn_cache->workspace_size_));
} else {
// for train
CUDNN_ENFORCE(platform::dynload::cudnnRNNForwardTraining(
handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_,
init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data,
cudnn_rnn_cache->w_desc_, w_data, cudnn_rnn_cache->y_desc_, out_data,
cudnn_rnn_cache->hy_desc_, last_h_data, cudnn_rnn_cache->cy_desc_,
last_c_data, cudnn_rnn_cache->workspace_data_.data<uint8_t>(),
cudnn_rnn_cache->workspace_size_,
cudnn_rnn_cache->reserve_data_.data<uint8_t>(),
cudnn_rnn_cache->reserve_size_));
}
}
};
template <typename T>
class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto *input = ctx.Input<Tensor>("Input");
auto *weight = ctx.Input<Tensor>("W");
auto *init_h = ctx.Input<Tensor>("InitH");
auto *init_c = ctx.Input<Tensor>("InitC");
// auto * last_h = ctx.Input<Tensor>("last_h");
// auto * last_c = ctx.Input<Tensor>("last_c");
auto *out = ctx.Input<Tensor>("Out");
auto *out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto *last_h_grad = ctx.Input<Tensor>(framework::GradVarName("last_h"));
auto *last_c_grad = ctx.Input<Tensor>(framework::GradVarName("last_c"));
// auto* init_h = ctx.Input<Tensor>("init_h");
// auto* init_c = ctx.Input<Tensor>("init_c");
auto *in_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
auto *weight_grad = ctx.Output<Tensor>(framework::GradVarName("W"));
auto *init_h_grad = ctx.Output<Tensor>(framework::GradVarName("InitH"));
auto *init_c_grad = ctx.Output<Tensor>(framework::GradVarName("InitC"));
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
auto *cache_var = ctx.InputVar("Cache");
PADDLE_ENFORCE(cache_var->IsInitialized());
CudnnRNNCache *cudnn_rnn_cache =
const_cast<framework::Variable *>(cache_var)
->GetMutable<CudnnRNNCache>();
auto input_dims = input->dims();
auto weight_dims = weight->dims();
auto init_h_dims = init_h->dims();
auto init_c_dims = init_c->dims();
in_grad->mutable_data<T>(ctx.GetPlace());
weight_grad->mutable_data<T>(ctx.GetPlace());
math::SetConstant<paddle::platform::CUDADeviceContext, T> zero;
zero(dev_ctx, in_grad, static_cast<T>(0.0));
zero(dev_ctx, weight_grad, static_cast<T>(0.0));
T *init_h_grad_data = NULL;
if (init_h_grad == nullptr) {
Tensor init_h_grad_temp;
init_h_grad_temp.mutable_data<T>(init_h_dims, ctx.GetPlace());
zero(dev_ctx, &init_h_grad_temp, static_cast<T>(0.0));
init_h_grad_data = init_h_grad_temp.data<T>();
} else {
init_h_grad->mutable_data<T>(init_h_dims, ctx.GetPlace());
zero(dev_ctx, init_h_grad, static_cast<T>(0.0));
init_h_grad_data = init_h_grad->data<T>();
}
T *init_c_grad_data = NULL;
if (init_c_grad == nullptr) {
Tensor init_c_grad_temp;
init_c_grad_temp.mutable_data<T>(init_c_dims, ctx.GetPlace());
zero(dev_ctx, &init_c_grad_temp, static_cast<T>(0.0));
init_c_grad_data = init_c_grad_temp.data<T>();
} else {
init_c_grad->mutable_data<T>(init_c_dims, ctx.GetPlace());
zero(dev_ctx, init_c_grad, static_cast<T>(0.0));
init_c_grad_data = init_c_grad->data<T>();
}
const T *last_h_grad_data = NULL;
if (last_h_grad == nullptr) {
Tensor last_h_grad_temp;
last_h_grad_temp.mutable_data<T>(init_h_dims, ctx.GetPlace());
zero(dev_ctx, &last_h_grad_temp, static_cast<T>(0.0));
last_h_grad_data = (const T *)last_h_grad_temp.data<T>();
} else {
last_h_grad_data = last_h_grad->data<T>();
}
const T *last_c_grad_data = NULL;
if (last_c_grad == nullptr) {
Tensor last_c_grad_temp;
last_c_grad_temp.mutable_data<T>(init_c_dims, ctx.GetPlace());
zero(dev_ctx, &last_c_grad_temp, static_cast<T>(0.0));
last_c_grad_data = (const T *)last_c_grad_temp.data<T>();
} else {
last_c_grad_data = last_c_grad->data<T>();
}
const T *out_grad_data = NULL;
if (out_grad == nullptr) {
Tensor out_grad_temp;
out_grad_temp.mutable_data<T>(out->dims(), ctx.GetPlace());
zero(dev_ctx, &out_grad_temp, static_cast<T>(0.0));
out_grad_data = (const T *)out_grad_temp.data<T>();
} else {
out_grad_data = out_grad->data<T>();
}
// zero( dev_ctx, last_h_grad, static_cast<T>(0.0));
// zero( dev_ctx, last_c_grad, static_cast<T>(0.0));
auto out_data = out->data<T>();
// auto out_grad_data = out_grad->data<T>();
auto weight_data = weight->data<T>();
auto init_h_data = init_h->data<T>();
auto init_c_data = init_c->data<T>();
auto in_grad_data = in_grad->data<T>();
auto work_data = cudnn_rnn_cache->workspace_data_.data<uint8_t>();
auto reserve_data = cudnn_rnn_cache->reserve_data_.data<uint8_t>();
auto run_seq_len = input_dims[0];
PADDLE_ENFORCE_LE((size_t)run_seq_len, cudnn_rnn_cache->max_length_,
"cudnn running seq_len CAN not greater max_lengh");
CUDNN_ENFORCE(platform::dynload::cudnnRNNBackwardData(
handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
cudnn_rnn_cache->y_desc_, out_data, cudnn_rnn_cache->dy_desc_,
out_grad_data, cudnn_rnn_cache->dhy_desc_, last_h_grad_data,
cudnn_rnn_cache->dcy_desc_, last_c_grad_data, cudnn_rnn_cache->w_desc_,
weight_data, cudnn_rnn_cache->hx_desc_, init_h_data,
cudnn_rnn_cache->cx_desc_, init_c_data, cudnn_rnn_cache->dx_desc_,
in_grad_data, cudnn_rnn_cache->dhx_desc_, init_h_grad_data,
cudnn_rnn_cache->dcx_desc_, init_c_grad_data, work_data,
cudnn_rnn_cache->workspace_size_, reserve_data,
cudnn_rnn_cache->reserve_size_));
CUDNN_ENFORCE(platform::dynload::cudnnRNNBackwardWeights(
handle, cudnn_rnn_cache->rnn_desc_, run_seq_len,
cudnn_rnn_cache->x_desc_, input->data<T>(), cudnn_rnn_cache->hx_desc_,
init_h->data<T>(), cudnn_rnn_cache->y_desc_, out->data<T>(),
cudnn_rnn_cache->workspace_data_.data<uint8_t>(),
cudnn_rnn_cache->workspace_size_, cudnn_rnn_cache->dw_desc_,
weight_grad->data<T>(), cudnn_rnn_cache->reserve_data_.data<uint8_t>(),
cudnn_rnn_cache->reserve_size_));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(cudnn_lstm, ops::CudnnLSTMGPUKernel<float>);
REGISTER_OP_CUDA_KERNEL(cudnn_lstm_grad, ops::CudnnLSTMGPUGradKernel<float>);
......@@ -43,6 +43,9 @@ class BoxCoderKernel : public framework::OpKernel<T> {
const T* prior_box_var_data = nullptr;
if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2)
#endif
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
T prior_box_width = prior_box_data[j * len + 2] -
......@@ -96,6 +99,9 @@ class BoxCoderKernel : public framework::OpKernel<T> {
const T* prior_box_var_data = nullptr;
if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2)
#endif
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
size_t offset = i * col * len + j * len;
......
......@@ -9,36 +9,37 @@ else()
endif()
configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
if(WITH_GRPC)
grpc_library(sendrecvop_grpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc
request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc
PROTO send_recv.proto
DEPS lod_tensor selected_rows memory)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(grpc_serde_test SRCS grpc_serde_test.cc
DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
cc_test(rpc_server_test SRCS rpc_server_test.cc
DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL)
cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler)
return()
endif()
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc
cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_grpc memory)
else()
set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc
brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc
brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc
brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc
PROTO send_recv.proto
DEPS lod_tensor selected_rows memory)
set(brpc_test_depends sendrecvop_brpc brpc ssl crypto protobuf leveldb gflags glog executor proto_desc lookup_table_op snappystream snappy)
cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_brpc memory)
set(brpc_test_depends sendrecvop_brpc brpc ssl crypto protobuf leveldb gflags glog executor proto_desc lookup_table_op snappystream snappy)
cc_test(brpc_server_test SRCS rpc_server_test.cc
cc_test(brpc_server_test SRCS rpc_server_test.cc
DEPS ${brpc_test_depends} SERIAL)
cc_test(brpc_serde_test SRCS brpc_serde_test.cc
cc_test(brpc_serde_test SRCS brpc_serde_test.cc
DEPS ${brpc_test_depends} SERIAL)
endif()
......@@ -171,11 +171,13 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
const framework::Scope& scope,
const std::string& in_var_name,
const std::string& out_var_name,
const std::string& table_name,
int64_t time_out) {
const platform::DeviceContext* p_ctx = &ctx;
const std::string ep_val = ep;
const std::string in_var_name_val = in_var_name;
const std::string out_var_name_val = out_var_name;
const std::string table_name_val = table_name;
const framework::Scope* p_scope = &scope;
const auto ch = GetChannel(ep_val);
GetProcessor* s = new GetProcessor(ch);
......@@ -186,11 +188,12 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
s->Prepare(h, time_out);
framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx,
s, method, h, this] {
s, method, h, table_name_val, this] {
auto* var = p_scope->FindVar(in_var_name_val);
::grpc::ByteBuffer req;
SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val);
SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val,
0, table_name_val);
VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
......
......@@ -194,6 +194,7 @@ class GRPCClient : public RPCClient {
const framework::Scope& scope,
const std::string& in_var_name,
const std::string& out_var_name,
const std::string& table_name = "",
int64_t time_out = FLAGS_rpc_deadline) override;
VarHandlePtr AsyncSendBatchBarrier(
......
......@@ -42,7 +42,8 @@ static void SerializeDestroyCallback(void* payload) {
void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
const platform::DeviceContext& ctx,
::grpc::ByteBuffer* msg, const std::string& out_name,
const int trainer_id) {
const int trainer_id,
const std::string& table_name) {
platform::RecordRPCEvent record_event("serial", &ctx);
VarMsg request;
TensorPayload* payload = nullptr;
......@@ -63,6 +64,9 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
if (!out_name.empty()) {
request.set_out_varname(out_name);
}
if (!table_name.empty()) {
request.set_table_name(table_name);
}
if (var->IsType<framework::LoDTensor>()) {
request.set_type(::sendrecv::LOD_TENSOR);
payload = new TensorPayload(GetTensorPayload(var, ctx, &request));
......
......@@ -40,7 +40,8 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
const platform::DeviceContext& ctx,
::grpc::ByteBuffer* msg,
const std::string& out_varname = std::string(),
const int trainer_id = 0);
const int trainer_id = 0,
const std::string& table_name = std::string());
void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx,
......
......@@ -130,7 +130,8 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) {
math::set_constant(ctx, tensor, 31.9);
::grpc::ByteBuffer msg;
operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg);
operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg,
"outvar", 0, "table_name");
EXPECT_GT(msg.Length(), static_cast<size_t>(0));
// deserialize
......
......@@ -183,6 +183,7 @@ class RequestPrefetch final : public RequestBase {
// prefetch process...
std::string in_var_name = request_->Varname();
std::string out_var_name = request_->OutVarname();
std::string table_name = request_->TableName();
int trainer_id = request_->GetTrainerId();
VLOG(4) << "RequestPrefetch, in_var_name: " << in_var_name
<< " out_var_name: " << out_var_name;
......@@ -193,7 +194,7 @@ class RequestPrefetch final : public RequestBase {
framework::Variable* outvar = scope->Var(out_var_name);
request_handler_->Handle(in_var_name, scope, invar, &outvar, trainer_id,
out_var_name);
out_var_name, table_name);
SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(),
&reply_);
......
......@@ -301,6 +301,20 @@ int GRPCVariableResponse::Parse(Source* source) {
meta_.set_trainer_id(trainer_id);
break;
}
case sendrecv::VariableMessage::kTableNameFieldNumber: {
uint32_t length;
if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) {
return tag;
}
std::string temp;
if (!input.ReadString(&temp, length)) {
return tag;
}
meta_.set_table_name(temp);
break;
}
default: {
// Unknown tag, return unknown error.
return -1;
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <set>
#include <string>
#include <vector>
#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/detail/macros.h"
#include "paddle/fluid/operators/distributed/rpc_client.h"
#include "paddle/fluid/operators/distributed/variable_response.h"
#include "paddle/fluid/operators/distributed_ops/send_recv_util.h"
namespace paddle {
namespace operators {
namespace distributed {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
using SelectedRows = framework::SelectedRows;
using DDim = framework::DDim;
static size_t GetSectionIndex(int64_t id,
const std::vector<int64_t>& abs_sections) {
for (size_t i = 1; i < abs_sections.size(); ++i) {
if (id < abs_sections[i]) {
return i - 1;
}
}
return abs_sections.size() - 1;
}
static std::vector<int64_t> ToAbsoluteSection(
const std::vector<int>& height_sections) {
std::vector<int64_t> abs_sections;
abs_sections.resize(height_sections.size());
abs_sections[0] = 0;
for (size_t i = 1; i < height_sections.size(); ++i) {
abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1];
}
return abs_sections;
}
static std::vector<std::vector<int64_t>> SplitIds(
const std::vector<int64_t>& ids_vector,
const std::vector<int>& height_section, framework::Scope* scope) {
std::set<int64_t> all_ids;
for (auto id : ids_vector) {
all_ids.insert(id);
}
auto abs_sections = ToAbsoluteSection(height_section);
std::vector<std::vector<int64_t>> splited_ids;
splited_ids.resize(height_section.size() + 1);
for (auto& id : all_ids) {
auto section_index = GetSectionIndex(id, abs_sections);
splited_ids[section_index].push_back(id - abs_sections[section_index]);
}
return splited_ids;
}
static void SplitIdsIntoMultipleVarsBySection(
const std::vector<std::string>& in_var_names,
const std::vector<int>& height_section,
const std::vector<std::vector<int64_t>>& splited_ids,
framework::Scope* scope) {
PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size(), "");
auto place = platform::CPUPlace();
for (size_t i = 0; i < in_var_names.size(); ++i) {
auto* id_tensor =
scope->Var(in_var_names[i])->GetMutable<framework::LoDTensor>();
auto& ids = splited_ids[i];
if (!ids.empty()) {
auto* id_tensor_data = id_tensor->mutable_data<int64_t>(
framework::make_ddim({static_cast<int64_t>(ids.size()), 1}), place);
memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size());
}
}
}
static void MergeMultipleVarsIntoOneBySection(
const std::string& id_name, const std::vector<int64_t>& ids_vector,
const std::string& out_name, const std::vector<std::string>& out_var_names,
const std::vector<int>& height_section,
const std::vector<std::vector<int64_t>>& splited_ids,
const framework::ExecutionContext& context, framework::Scope* scope,
platform::DeviceContext* actual_ctx) {
PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size(), "");
auto cpu_place = platform::CPUPlace();
auto abs_sections = ToAbsoluteSection(height_section);
std::unordered_map<int64_t, std::vector<size_t>> id_to_offset;
for (size_t i = 0; i < ids_vector.size(); ++i) {
id_to_offset[ids_vector[i]].push_back(i);
}
auto& id_tensor = scope->FindVar(id_name)->Get<framework::LoDTensor>();
auto* out_tensor =
scope->FindVar(out_name)->GetMutable<framework::LoDTensor>();
auto* out_tensor_data = out_tensor->mutable_data<float>(id_tensor.place());
bool is_on_cpu_place = true;
if (!platform::is_cpu_place(id_tensor.place())) {
is_on_cpu_place = false;
}
for (size_t section_idx = 0; section_idx < out_var_names.size();
++section_idx) {
auto& ids_in_this_section = splited_ids[section_idx];
if (!ids_in_this_section.empty()) {
auto& prefetch_out_var =
scope->Var(out_var_names[section_idx])->Get<framework::LoDTensor>();
const auto* out_var_data = prefetch_out_var.data<float>();
auto& dims = prefetch_out_var.dims();
PADDLE_ENFORCE_EQ(dims.size(), 2, "");
PADDLE_ENFORCE_EQ(ids_in_this_section.size(), dims[0]);
auto row_numel = dims[1];
for (size_t i = 0; i < dims[0]; ++i) {
auto id = ids_in_this_section[i];
auto origin_id = id + abs_sections[section_idx];
auto& offsets = id_to_offset[origin_id];
for (auto& offset : offsets) {
// should support GPU tensor
if (is_on_cpu_place) {
memory::Copy(cpu_place, out_tensor_data + offset * row_numel,
cpu_place, out_var_data + i * row_numel,
sizeof(float) * row_numel);
} else {
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW("paddle is not compiled with CUDA!");
#else
auto stream =
static_cast<platform::CUDADeviceContext*>(actual_ctx)->stream();
memory::Copy(boost::get<platform::CUDAPlace>(id_tensor.place()),
out_tensor_data + offset * row_numel, cpu_place,
out_var_data + i * row_numel,
sizeof(float) * row_numel, stream);
#endif
}
}
}
} else {
VLOG(3) << "ids in this section is empty";
}
}
}
void prefetch(const std::string& id_name, const std::string& out_name,
const std::vector<std::string>& table_names,
const std::vector<std::string>& epmap,
const std::vector<int>& height_sections,
const framework::ExecutionContext& context) {
auto& local_scope = context.scope().NewScope();
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& cpu_ctx = *pool.Get(platform::CPUPlace());
auto& actual_ctx = *pool.Get(context.GetPlace());
distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>(
context.Attr<int>("trainer_id"));
std::vector<std::string> in_var_names;
std::vector<std::string> out_var_names;
for (size_t i = 0; i < epmap.size(); ++i) {
in_var_names.push_back(id_name + "@" + epmap[i]);
out_var_names.push_back(out_name + "@" + epmap[i]);
}
auto& id_tensor = local_scope.FindVar(id_name)->Get<framework::LoDTensor>();
std::vector<int64_t> ids_vector;
if (platform::is_cpu_place(id_tensor.place())) {
auto* id_data = id_tensor.data<int64_t>();
for (size_t i = 0; i < id_tensor.numel(); ++i) {
ids_vector.push_back(id_data[i]);
}
} else {
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW("paddle is not compiled with CUDA!");
#else
auto cpu_place = platform::CPUPlace();
framework::Tensor cpu_tensor;
auto* cpu_tensor_data =
cpu_tensor.mutable_data<int64_t>(id_tensor.dims(), cpu_place);
auto stream =
static_cast<platform::CUDADeviceContext*>(&actual_ctx)->stream();
memory::Copy(cpu_place, cpu_tensor_data,
boost::get<platform::CUDAPlace>(id_tensor.place()),
id_tensor.data<int64_t>(), sizeof(int64_t) * id_tensor.numel(),
stream);
for (size_t i = 0; i < cpu_tensor.numel(); ++i) {
ids_vector.push_back(cpu_tensor_data[i]);
}
#endif
}
auto splited_ids = SplitIds(ids_vector, height_sections, &local_scope);
SplitIdsIntoMultipleVarsBySection(in_var_names, height_sections, splited_ids,
&local_scope);
// create output var in local scope
for (auto& name : out_var_names) {
local_scope.Var(name)->GetMutable<framework::LoDTensor>();
}
std::vector<distributed::VarHandlePtr> rets;
for (size_t i = 0; i < in_var_names.size(); i++) {
if (NeedSend(local_scope, in_var_names[i])) {
VLOG(3) << "sending " << in_var_names[i] << " to " << epmap[i]
<< " to get " << out_var_names[i] << " back";
rets.push_back(rpc_client->AsyncPrefetchVar(
epmap[i], cpu_ctx, local_scope, in_var_names[i], out_var_names[i],
table_names[i]));
} else {
VLOG(3) << "don't send no-initialied variable: " << out_var_names[i];
}
}
for (size_t i = 0; i < rets.size(); i++) {
PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient");
}
MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name,
out_var_names, height_sections, splited_ids,
context, &local_scope, &actual_ctx);
context.scope().DeleteScope(&local_scope);
}
}; // namespace distributed
}; // namespace operators
}; // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/operator.h"
namespace paddle {
namespace operators {
namespace distributed {
void prefetch(const std::string& id_name, const std::string& out_name,
const std::vector<std::string>& table_names,
const std::vector<std::string>& epmap,
const std::vector<int>& height_sections,
const framework::ExecutionContext& context);
}; // namespace distributed
}; // namespace operators
}; // namespace paddle
......@@ -191,7 +191,8 @@ class RequestHandler {
virtual bool Handle(const std::string& varname, framework::Scope* scope,
framework::Variable* var, framework::Variable** outvar,
const int trainer_id,
const std::string& out_var_name = "") = 0;
const std::string& out_var_name = "",
const std::string& table_name = "") = 0;
protected:
const bool sync_mode_;
......
......@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include <iostream>
#include <string>
#include <vector>
......@@ -20,7 +21,7 @@
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/distributed/rpc_server.h"
#include "paddle/fluid/string/printf.h"
......@@ -37,7 +38,8 @@ bool RequestSendHandler::Handle(const std::string& varname,
framework::Variable* invar,
framework::Variable** outvar,
const int trainer_id,
const std::string& out_var_name) {
const std::string& out_var_name,
const std::string& table_name) {
VLOG(4) << "RequestSendHandler:" << varname;
// Sync
......@@ -77,8 +79,10 @@ bool RequestGetHandler::Handle(const std::string& varname,
framework::Variable* invar,
framework::Variable** outvar,
const int trainer_id,
const std::string& out_var_name) {
const std::string& out_var_name,
const std::string& table_name) {
VLOG(4) << "RequestGetHandler:" << varname;
if (sync_mode_) {
if (varname == FETCH_BARRIER_MESSAGE) {
VLOG(3) << "sync: recv fetch barrier message";
......@@ -113,14 +117,22 @@ bool RequestPrefetchHandler::Handle(const std::string& varname,
framework::Variable* invar,
framework::Variable** outvar,
const int trainer_id,
const std::string& out_var_name) {
const std::string& out_var_name,
const std::string& table_name) {
VLOG(4) << "RequestPrefetchHandler " << varname;
if (table_name.empty()) {
auto var_desc = program_->Block(0).FindVar(out_var_name);
InitializeVariable(*outvar, var_desc->GetType());
executor_->RunPreparedContext(
(*prefetch_var_name_to_prepared_ctx_)[varname].get(), scope);
} else {
(*outvar)->GetMutable<framework::LoDTensor>();
auto lookup_table_op =
BuildLookupTableOp(table_name, varname, out_var_name);
paddle::platform::CPUPlace cpu_place;
lookup_table_op->Run(*scope, cpu_place);
}
return true;
}
......@@ -129,7 +141,8 @@ bool RequestCheckpointHandler::Handle(const std::string& varname,
framework::Variable* invar,
framework::Variable** outvar,
const int trainer_id,
const std::string& out_var_name) {
const std::string& out_var_name,
const std::string& table_name) {
PADDLE_ENFORCE(
checkpoint_notify_id != -1,
"when checkpoint_notify_id = -1, there should be no RPC invoke.");
......
......@@ -24,6 +24,7 @@
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
......@@ -43,8 +44,8 @@ class RequestSendHandler final : public RequestHandler {
virtual ~RequestSendHandler() {}
bool Handle(const std::string& varname, framework::Scope* scope,
framework::Variable* var, framework::Variable** outvar,
const int trainer_id,
const std::string& out_var_name = "") override;
const int trainer_id, const std::string& out_var_name = "",
const std::string& table_name = "") override;
private:
bool enable_dc_asgd_;
......@@ -59,21 +60,44 @@ class RequestGetHandler final : public RequestHandler {
virtual ~RequestGetHandler() {}
bool Handle(const std::string& varname, framework::Scope* scope,
framework::Variable* var, framework::Variable** outvar,
const int trainer_id,
const std::string& out_var_name = "") override;
const int trainer_id, const std::string& out_var_name = "",
const std::string& table_name = "") override;
private:
bool enable_dc_asgd_;
};
static inline void BuildVar(const std::string& param_name,
std::initializer_list<const char*> arguments,
paddle::framework::proto::OpDesc::Var* var) {
var->set_parameter(param_name);
for (auto& arg_name : arguments) {
*var->mutable_arguments()->Add() = arg_name;
}
}
class RequestPrefetchHandler final : public RequestHandler {
public:
explicit RequestPrefetchHandler(bool sync_mode) : RequestHandler(sync_mode) {}
virtual ~RequestPrefetchHandler() {}
bool Handle(const std::string& varname, framework::Scope* scope,
framework::Variable* var, framework::Variable** outvar,
const int trainer_id,
const std::string& out_var_name = "") override;
const int trainer_id, const std::string& out_var_name = "",
const std::string& table_name = "") override;
private:
std::unique_ptr<paddle::framework::OperatorBase> BuildLookupTableOp(
const std::string& table_name, const std::string& id_name,
const std::string& out_name) {
paddle::framework::proto::OpDesc op_desc;
op_desc.set_type("lookup_table");
BuildVar("W", {table_name.data()}, op_desc.add_inputs());
BuildVar("Ids", {id_name.data()}, op_desc.add_inputs());
BuildVar("Out", {out_name.data()}, op_desc.add_outputs());
auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
return op;
}
};
class RequestCheckpointHandler final : public RequestHandler {
......@@ -85,8 +109,8 @@ class RequestCheckpointHandler final : public RequestHandler {
virtual ~RequestCheckpointHandler() {}
bool Handle(const std::string& varname, framework::Scope* scope,
framework::Variable* var, framework::Variable** outvar,
const int trainer_id,
const std::string& out_var_name = "") override;
const int trainer_id, const std::string& out_var_name = "",
const std::string& table_name = "") override;
private:
int checkpoint_notify_id;
......
......@@ -48,7 +48,7 @@ class RPCClient {
virtual VarHandlePtr AsyncPrefetchVar(
const std::string& ep, const platform::DeviceContext& ctx,
const framework::Scope& scope, const std::string& in_var_name,
const std::string& out_var_name,
const std::string& out_var_name, const std::string& table_name = "",
int64_t time_out = FLAGS_rpc_deadline) = 0;
virtual VarHandlePtr AsyncSendBatchBarrier(
......
......@@ -80,6 +80,7 @@ message VariableMessage {
// when profile switches from 1 to 2.
int64 profile = 11;
int64 trainer_id = 12;
string table_name = 13;
}
message VoidMessage {}
......@@ -85,6 +85,7 @@ class VariableResponse {
inline framework::Scope* GetMutableLocalScope() const { return local_scope_; }
inline std::string Varname() const { return meta_.varname(); }
inline std::string OutVarname() const { return meta_.out_varname(); }
inline std::string TableName() const { return meta_.table_name(); }
// should call parse first.
framework::Variable* GetVar() {
......
......@@ -87,6 +87,25 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
"(boolean, default false) "
"If the grad op reuse the input's variable.")
.SetDefault(false);
// for parameter prefetch
AddAttr<bool>("remote_prefetch", "").SetDefault(false);
AddAttr<int>("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
AddAttr<std::vector<int>>("height_sections",
"Height for each output SelectedRows.")
.SetDefault(std::vector<int>({}));
AddAttr<std::vector<std::string>>(
"epmap",
"(string vector, default 127.0.0.1:6164)"
"Server endpoints in the order of input variables for mapping")
.SetDefault({});
AddAttr<std::vector<std::string>>(
"table_names",
"(string vector, the splited table names that will be fetched from "
"parameter server)"
"in the order of input variables for mapping")
.SetDefault({});
AddComment(R"DOC(
Lookup Table Operator.
......
......@@ -31,8 +31,8 @@ __global__ void LookupTable(T *output, const T *table, const int64_t *ids,
while (idy < K) {
int64_t id = ids[idy];
PADDLE_ASSERT(id >= 0);
PADDLE_ASSERT(id < N);
PADDLE_ASSERT_MSG_CODE(id >= 0, "received id:", id);
PADDLE_ASSERT_MSG_CODE(id < N, "received id:", id);
T *out = output + idy * D;
const T *tab = table + id * D;
for (int i = idx; i < D; i += BlockDimX) {
......@@ -57,9 +57,9 @@ __global__ void LookupTableGrad(T *table, const T *output, const int64_t *ids,
int idy = blockIdx.x + threadIdx.y * GridDimX;
while (idy < K) {
int id = ids[idy];
PADDLE_ASSERT(id >= 0);
PADDLE_ASSERT(id < N);
int64_t id = ids[idy];
PADDLE_ASSERT_MSG_CODE(id >= 0, "received id:", id);
PADDLE_ASSERT_MSG_CODE(id < N, "received id:", id);
const T *out = output + idy * D;
T *tab = table + id * D;
for (int i = idx; i < D; i += BlockDimX) {
......@@ -78,6 +78,27 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
auto *output_t = context.Output<LoDTensor>("Out");
int64_t padding_idx = context.Attr<int64_t>("padding_idx");
auto id_name = context.Inputs("Ids").front();
auto out_name = context.Outputs("Out").front();
// for remote prefetch
auto epmap = context.Attr<std::vector<std::string>>("epmap");
auto height_sections = context.Attr<std::vector<int>>("height_sections");
auto table_names = context.Attr<std::vector<std::string>>("table_names");
if (!epmap.empty()) {
// if epmap is not empty, then the parameter will be fetched from remote
// parameter
// server
#ifdef PADDLE_WITH_DISTRIBUTE
operators::distributed::prefetch(id_name, out_name, table_names, epmap,
height_sections, context);
#else
PADDLE_THROW(
"paddle is not compiled with distribute support, can not do "
"parameter prefetch!");
#endif
} else {
size_t N = table_t->dims()[0];
size_t D = table_t->dims()[1];
size_t K = ids_t->numel();
......@@ -90,16 +111,15 @@ class LookupTableCUDAKernel : public framework::OpKernel<T> {
dim3 grids(8, 1);
if (padding_idx == -1)
LookupTable<
T, 128, 8, 8,
false><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
LookupTable<T, 128, 8, 8, false><<<
grids, threads, 0, context.cuda_device_context().stream()>>>(
output, table, ids, N, K, D, padding_idx);
else
LookupTable<
T, 128, 8, 8,
true><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
LookupTable<T, 128, 8, 8, true><<<
grids, threads, 0, context.cuda_device_context().stream()>>>(
output, table, ids, N, K, D, padding_idx);
}
}
};
template <typename T>
......@@ -109,6 +129,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
auto &dev_ctx =
context.template device_context<platform::CUDADeviceContext>();
bool is_sparse = context.Attr<bool>("is_sparse");
// Since paddings are not trainable and fixed in forward, the gradient of
// paddings makes no sense and we don't deal with it in backward.
if (is_sparse) {
......
......@@ -23,6 +23,10 @@ limitations under the License. */
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/operators/math/blas.h"
#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
#endif
namespace paddle {
namespace operators {
......@@ -41,6 +45,27 @@ class LookupTableKernel : public framework::OpKernel<T> {
auto *output_t = context.Output<LoDTensor>("Out"); // float tensor
auto *table_var = context.InputVar("W");
auto id_name = context.Inputs("Ids").front();
auto out_name = context.Outputs("Out").front();
// for remote prefetch
auto epmap = context.Attr<std::vector<std::string>>("epmap");
auto height_sections = context.Attr<std::vector<int>>("height_sections");
auto table_names = context.Attr<std::vector<std::string>>("table_names");
if (!epmap.empty()) {
// if epmap is not empty, then the parameter will be fetched from remote
// parameter
// server
#ifdef PADDLE_WITH_DISTRIBUTE
operators::distributed::prefetch(id_name, out_name, table_names, epmap,
height_sections, context);
#else
PADDLE_THROW(
"paddle is not compiled with distribute support, can not do "
"parameter prefetch!");
#endif
} else {
int64_t padding_idx = context.Attr<int64_t>("padding_idx");
int64_t *ids = const_cast<int64_t *>(ids_t->data<int64_t>());
int64_t ids_numel = ids_t->numel();
......@@ -83,6 +108,7 @@ class LookupTableKernel : public framework::OpKernel<T> {
}
}
}
}
};
template <typename T>
......
......@@ -75,8 +75,13 @@ class AucKernel : public framework::OpKernel<T> {
const auto *label_data = label->data<int64_t>();
for (size_t i = 0; i < batch_size; i++) {
uint32_t binIdx = static_cast<uint32_t>(
inference_data[i * inference_width + 1] * num_thresholds);
auto predict_data = inference_data[i * inference_width + 1];
PADDLE_ENFORCE_LE(predict_data, 1,
"The predict data must less or equal 1.");
PADDLE_ENFORCE_GE(predict_data, 0,
"The predict data must gather or equal 0.");
uint32_t binIdx = static_cast<uint32_t>(predict_data * num_thresholds);
if (label_data[i]) {
(*stat_pos)[binIdx] += 1.0;
} else {
......
......@@ -319,20 +319,46 @@ void Pad2DGradEdgeNHWC(T* d_in_data, const int num, const int channels,
}
}
static inline void GetPaddings(int* paddings,
const framework::ExecutionContext& context) {
auto* paddings_t = context.Input<Tensor>("Paddings");
if (paddings_t) {
auto paddings_data = paddings_t->data<int>();
paddings[0] = paddings_data[0];
paddings[1] = paddings_data[1];
paddings[2] = paddings_data[2];
paddings[3] = paddings_data[3];
} else {
auto pads = context.Attr<std::vector<int>>("paddings");
std::copy(pads.begin(), pads.end(), paddings);
}
}
template <typename T>
class Pad2dCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto pads = context.Attr<std::vector<int>>("paddings");
int pads[4];
GetPaddings(pads, context);
auto mode = context.Attr<std::string>("mode");
auto data_format = context.Attr<std::string>("data_format");
T value = context.Attr<T>("pad_value");
auto* x = context.Input<Tensor>("X");
auto* out = context.Output<Tensor>("Out");
auto in_dims = x->dims();
auto out_dims = out->dims();
const T* in_data = x->data<T>();
auto* out = context.Output<Tensor>("Out");
if (data_format == "NCHW") {
out->Resize({in_dims[0], in_dims[1], in_dims[2] + pads[0] + pads[1],
in_dims[3] + pads[2] + pads[3]});
} else {
out->Resize({in_dims[0], in_dims[1] + pads[0] + pads[1],
in_dims[2] + pads[2] + pads[3], in_dims[3]});
}
auto out_dims = out->dims();
T* out_data = out->mutable_data<T>(context.GetPlace());
const int pad_top = pads[0];
const int pad_left = pads[2];
const int num = in_dims[0];
......@@ -376,7 +402,8 @@ template <typename T>
class Pad2dGradCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto pads = context.Attr<std::vector<int>>("paddings");
int pads[4];
GetPaddings(pads, context);
auto mode = context.Attr<std::string>("mode");
auto data_format = context.Attr<std::string>("data_format");
auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
......@@ -442,13 +469,26 @@ class Pad2dOp : public framework::OperatorWithKernel {
"Output(Out) of Pad2dOp should not be null.");
auto x_dim = ctx->GetInputDim("X");
auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
PADDLE_ENFORCE_EQ(x_dim.size(), 4,
"Size of paddings should be equal to 4.");
std::vector<int64_t> out_dims(x_dim.size());
"The size of input(X)'s dimension should be equal to 4.");
std::vector<int64_t> out_dims(x_dim.size());
auto data_format = ctx->Attrs().Get<std::string>("data_format");
out_dims[0] = x_dim[0];
if (ctx->HasInput("Paddings")) {
auto paddings_dim = ctx->GetInputDim("Paddings");
PADDLE_ENFORCE_EQ(
paddings_dim.size(), 1,
"Size of Input(Paddings)'s dimension should be equal to 1.");
PADDLE_ENFORCE_EQ(paddings_dim[0], 4,
"Shape of Input(Paddings) should be equal to [4].");
out_dims[1] = x_dim[1];
out_dims[2] = x_dim[2];
out_dims[3] = x_dim[3];
} else {
auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
PADDLE_ENFORCE_EQ(paddings.size(), 4,
"Size of paddings should be equal to 4.");
if (data_format == "NCHW") {
out_dims[1] = x_dim[1];
out_dims[2] = x_dim[2] + paddings[0] + paddings[1]; // height
......@@ -458,6 +498,7 @@ class Pad2dOp : public framework::OperatorWithKernel {
out_dims[1] = x_dim[1] + paddings[0] + paddings[1];
out_dims[2] = x_dim[2] + paddings[2] + paddings[3];
}
}
ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
if (out_dims[0] == x_dim[0]) {
......@@ -466,6 +507,13 @@ class Pad2dOp : public framework::OperatorWithKernel {
ctx->ShareLoD("X", /*->*/ "Out");
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace());
}
};
class Pad2dOpMaker : public framework::OpProtoAndCheckerMaker {
......@@ -477,6 +525,12 @@ class Pad2dOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("Out",
"The output of pad2d op. "
"A tensor with the same shape as X.");
AddInput("Paddings",
"A 1-D tensor to describe the padding rules."
"paddings=[0, 1, 2, 3] means "
"padding 0 row to top, 1 row to bottom, 2 columns to left "
"and 3 columns to right. Size of paddings must be 4.")
.AsDispensable();
AddAttr<std::vector<int>>(
"paddings",
"(vector<int>) "
......@@ -554,6 +608,13 @@ class Pad2dOpGrad : public framework::OperatorWithKernel {
ctx->SetOutputDim(x_grad_name, x_dims);
}
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace());
}
};
class Pad2dOpGradMaker : public framework::SingleGradOpDescMaker {
......@@ -564,6 +625,7 @@ class Pad2dOpGradMaker : public framework::SingleGradOpDescMaker {
std::unique_ptr<framework::OpDesc> Apply() const override {
auto* bind = new framework::OpDesc();
bind->SetInput("X", Input("X"));
bind->SetInput("Paddings", Input("Paddings"));
bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
bind->SetOutput(framework::GradVarName("X"), InputGrad("X"));
bind->SetAttrMap(Attrs());
......
......@@ -287,20 +287,50 @@ __global__ void Pad2DGradEdgeNHWC(const int out_size, T* d_in_data,
}
}
static inline void GetPaddings(int* paddings,
const framework::ExecutionContext& context) {
auto* paddings_t = context.Input<Tensor>("Paddings");
if (paddings_t) {
Tensor pads;
framework::TensorCopySync(*paddings_t, platform::CPUPlace(), &pads);
auto pads_data = pads.data<int>();
paddings[0] = pads_data[0];
paddings[1] = pads_data[1];
paddings[2] = pads_data[2];
paddings[3] = pads_data[3];
} else {
auto pads = context.Attr<std::vector<int>>("paddings");
std::copy(pads.begin(), pads.end(), paddings);
}
}
template <typename T>
class Pad2dCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto pads = context.Attr<std::vector<int>>("paddings");
int pads[4];
GetPaddings(pads, context);
auto mode = context.Attr<std::string>("mode");
auto data_format = context.Attr<std::string>("data_format");
T value = context.Attr<T>("pad_value");
auto* x = context.Input<Tensor>("X");
auto* out = context.Output<Tensor>("Out");
auto in_dims = x->dims();
auto out_dims = out->dims();
const T* in_data = x->data<T>();
T* out_data = out->mutable_data<T>(context.GetPlace());
auto* out = context.Output<Tensor>("Out");
auto out_dims = out->dims();
if (data_format == "NCHW") {
out_dims[0] = in_dims[0];
out_dims[1] = in_dims[1];
out_dims[2] = in_dims[2] + pads[0] + pads[1];
out_dims[3] = in_dims[3] + pads[2] + pads[3];
} else {
out_dims[0] = in_dims[0];
out_dims[1] = in_dims[1] + pads[0] + pads[1];
out_dims[2] = in_dims[2] + pads[2] + pads[3];
out_dims[3] = in_dims[3];
}
T* out_data = out->mutable_data<T>(out_dims, context.GetPlace());
const int pad_top = pads[0];
const int pad_left = pads[2];
const int num = in_dims[0];
......@@ -356,7 +386,8 @@ template <typename T>
class Pad2dGradCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto pads = context.Attr<std::vector<int>>("paddings");
int pads[4];
GetPaddings(pads, context);
auto mode = context.Attr<std::string>("mode");
auto data_format = context.Attr<std::string>("data_format");
auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
......
......@@ -18,6 +18,7 @@ namespace paddle {
namespace operators {
using framework::Tensor;
const int kIgnoreIndex = -100;
class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel {
public:
......@@ -100,6 +101,11 @@ class SigmoidCrossEntropyWithLogitsOpMaker
AddOutput("Out",
"(Tensor, default Tensor<float>), a 2-D tensor with shape N x D "
" of elementwise logistic losses.");
AddAttr<int>("ignore_index",
"(int, default kIgnoreIndex), Specifies a target value that "
"is ignored and"
"does not contribute to the input gradient.")
.SetDefault(kIgnoreIndex);
AddComment(R"DOC(
SigmoidCrossEntropyWithLogits Operator.
......
......@@ -15,33 +15,72 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/hostdevice.h"
#include "paddle/legacy/utils/Logging.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
template <typename T>
struct SigmoidCrossEntropyWithLogitsForward {
HOSTDEVICE SigmoidCrossEntropyWithLogitsForward(const int &ignore_index)
: ignore_index(ignore_index) {}
HOSTDEVICE T operator()(const T &x, const T &label) const {
if (static_cast<int>(label) == ignore_index) {
return static_cast<T>(0.);
}
T term1 = (x > 0) ? x : 0;
T term2 = x * label;
T term3 = std::log(static_cast<T>(1) + std::exp(-(std::abs(x))));
return term1 - term2 + term3;
}
int ignore_index;
};
template <typename T>
struct SigmoidCrossEntropyWithLogitsBackward {
HOSTDEVICE SigmoidCrossEntropyWithLogitsBackward(const int &ignore_index)
: ignore_index(ignore_index) {}
HOSTDEVICE T operator()(const T &x, const T &label) const {
if (static_cast<int>(label) == ignore_index) {
return static_cast<T>(0.);
}
T simoid_x = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-x));
return simoid_x - label;
}
int ignore_index;
};
// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X)))
template <typename DeviceContext, typename T>
class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
const framework::Tensor *X = context.Input<framework::Tensor>("X");
const framework::Tensor *Labels = context.Input<framework::Tensor>("Label");
framework::Tensor *Out = context.Output<framework::Tensor>("Out");
const Tensor *X = context.Input<Tensor>("X");
const Tensor *Labels = context.Input<Tensor>("Label");
Tensor *Out = context.Output<Tensor>("Out");
Out->mutable_data<T>(context.GetPlace());
int ignore_index = context.Attr<int>("ignore_index");
auto x = framework::EigenVector<T>::Flatten(*X);
auto labels = framework::EigenVector<T>::Flatten(*Labels);
auto out = framework::EigenVector<T>::Flatten(*Out);
auto x = EigenVector<T>::Flatten(*X);
auto labels = EigenVector<T>::Flatten(*Labels);
auto out = EigenVector<T>::Flatten(*Out);
auto &place = *context.device_context<DeviceContext>().eigen_device();
// term1 = max(x, 0)
auto term1 = x.cwiseMax(static_cast<T>(0));
// term2 = x * labels
auto term2 = x * labels;
// term3 = log(1 + exp(-abs(x)))
auto term3 = (static_cast<T>(1) + (-(x.abs())).exp()).log();
out.device(place) = term1 - term2 + term3;
out.device(place) = x.binaryExpr(
labels, SigmoidCrossEntropyWithLogitsForward<T>(ignore_index));
}
};
......@@ -50,23 +89,23 @@ template <typename DeviceContext, typename T>
class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &context) const override {
const framework::Tensor *X = context.Input<framework::Tensor>("X");
const framework::Tensor *Labels = context.Input<framework::Tensor>("Label");
const framework::Tensor *dOut =
context.Input<framework::Tensor>(framework::GradVarName("Out"));
framework::Tensor *dX =
context.Output<framework::Tensor>(framework::GradVarName("X"));
const Tensor *X = context.Input<Tensor>("X");
const Tensor *Labels = context.Input<Tensor>("Label");
const Tensor *dOut = context.Input<Tensor>(framework::GradVarName("Out"));
Tensor *dX = context.Output<Tensor>(framework::GradVarName("X"));
dX->mutable_data<T>(context.GetPlace());
auto x = framework::EigenVector<T>::Flatten(*X);
auto labels = framework::EigenVector<T>::Flatten(*Labels);
auto dout = framework::EigenVector<T>::Flatten(*dOut);
auto dx = framework::EigenVector<T>::Flatten(*dX);
auto ignore_index = context.Attr<int>("ignore_index");
auto x = EigenVector<T>::Flatten(*X);
auto labels = EigenVector<T>::Flatten(*Labels);
auto dout = EigenVector<T>::Flatten(*dOut);
auto dx = EigenVector<T>::Flatten(*dX);
auto &place =
*context.template device_context<DeviceContext>().eigen_device();
auto sigmoid_x = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
dx.device(place) = dout * (sigmoid_x - labels);
auto diff = x.binaryExpr(labels, SigmoidCrossEntropyWithLogitsBackward<T>(
static_cast<int>(ignore_index)));
dx.device(place) = dout * diff;
}
};
......
......@@ -15,7 +15,7 @@ limitations under the License. */
#include <iostream>
#include "mkldnn.hpp"
#include "paddle/fluid/operators/softmax_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
namespace paddle {
namespace operators {
......
......@@ -36,6 +36,15 @@ limitations under the License. */
asm("trap;"); \
} \
} while (0)
#define PADDLE_ASSERT_MSG_CODE(e, m, c) \
do { \
if (!(e)) { \
printf("%s:%d Assertion `%s` failed (%s %d).\n", __FILE__, __LINE__, \
TOSTRING(e), m, c); \
asm("trap;"); \
} \
} while (0)
#else
#include <assert.h>
// For cuda, the assertions can affect performance and it is therefore
......@@ -43,4 +52,5 @@ limitations under the License. */
// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#assertion
#define PADDLE_ASSERT(e) assert((e))
#define PADDLE_ASSERT_MSG(e, m) assert((e) && (m))
#define PADDLE_ASSERT_MSG_CODE(e, m, c) assert((e) && (m) && (c || 1))
#endif
......@@ -111,7 +111,23 @@ extern void EnforceCUDNNLoaded(const char* fn_name);
__macro(cudnnFindConvolutionForwardAlgorithmEx); \
__macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \
__macro(cudnnFindConvolutionBackwardDataAlgorithmEx); \
__macro(cudnnGetErrorString);
__macro(cudnnGetErrorString); \
__macro(cudnnCreateDropoutDescriptor); \
__macro(cudnnDropoutGetStatesSize); \
__macro(cudnnSetDropoutDescriptor); \
__macro(cudnnCreateRNNDescriptor); \
__macro(cudnnSetRNNDescriptor); \
__macro(cudnnGetRNNParamsSize); \
__macro(cudnnGetRNNWorkspaceSize); \
__macro(cudnnGetRNNTrainingReserveSize); \
__macro(cudnnRNNForwardTraining); \
__macro(cudnnRNNBackwardData); \
__macro(cudnnRNNBackwardWeights); \
__macro(cudnnRNNForwardInference); \
__macro(cudnnDestroyDropoutDescriptor); \
__macro(cudnnDestroyRNNDescriptor); \
__macro(cudnnSetRNNDescriptor_v6);
CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
#define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \
......
......@@ -107,170 +107,6 @@ inline mkldnn::memory::format GetMKLDNNFormat(
memory.dst_primitive_desc().desc().data.format);
}
class MKLDNNHandler {
public:
MKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
const std::string& base_key)
: dev_ctx_(dev_ctx),
engine_(engine),
key_(base_key),
is_reusing_(false) {}
std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
const mkldnn::memory::desc& md, void* ptr) {
return this->AcquireMemory(md, ptr, "@user_src_mem_p");
}
std::shared_ptr<mkldnn::memory> AcquireWeightsMemory(
const mkldnn::memory::desc& md, void* ptr) {
return this->AcquireMemory(md, ptr, "@user_weights_mem_p");
}
std::shared_ptr<mkldnn::memory> AcquireBiasMemory(
const mkldnn::memory::desc& md, void* ptr) {
return this->AcquireMemory(md, ptr, "@user_bias_mem_p");
}
std::shared_ptr<mkldnn::memory> AcquireDstMemory(
const mkldnn::memory::desc& md, void* ptr) {
return this->AcquireMemory(md, ptr, "@user_dst_mem_p");
}
std::shared_ptr<mkldnn::memory> AcquireDiffDstMemory(
const mkldnn::memory::desc& md, void* ptr) {
return this->AcquireMemory(md, ptr, "@user_diff_dst_mem_p");
}
std::shared_ptr<mkldnn::memory> AcquireDiffSrcMemory(
const mkldnn::memory::desc& md, void* ptr) {
return this->AcquireMemory(md, ptr, "@user_diff_src_mem_p");
}
std::shared_ptr<mkldnn::memory> AcquireMemoryFromPrimitive(
mkldnn::memory::primitive_desc mdp, void* ptr,
const std::string& suffix) {
auto local_key = key_ + suffix;
auto mem_p =
std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
"Fail to find mem primitive in device context");
if (mem_p == nullptr) {
mem_p = std::make_shared<mkldnn::memory>(mdp, ptr);
dev_ctx_.SetBlob(local_key, mem_p);
} else {
mem_p->set_data_handle(ptr);
// Mark that reusing happenned. All primitives from operator instance
// should be reused or none of them. So we check consistency
is_reusing_ = true;
}
return mem_p;
}
std::shared_ptr<mkldnn::memory> AcquireMemory(const mkldnn::memory::desc& md,
void* ptr,
const std::string& suffix) {
/*Generate key*/
auto local_key = key_ + suffix;
auto mem_p =
std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
"Fail to find mem primitive in device context");
if (mem_p == nullptr) {
mem_p = std::make_shared<mkldnn::memory>(
mkldnn::memory::primitive_desc{md, engine_}, ptr);
dev_ctx_.SetBlob(local_key, mem_p);
} else {
mem_p->set_data_handle(ptr);
// Mark that reusing happenned. All primitives from operator instance
// should be reused or none of them. So we check consistency
is_reusing_ = true;
}
return mem_p;
}
std::shared_ptr<mkldnn::memory> AcquireMemory(
const std::shared_ptr<mkldnn::memory>& user_memory_p,
const std::shared_ptr<mkldnn::memory>& target_memory_p,
const std::string& suffix,
std::vector<mkldnn::primitive>& pipeline) { // NOLINT
auto local_key = key_ + suffix;
auto key_reorder_p = key_ + suffix + "reorder_p";
auto stored_reorder_p = std::static_pointer_cast<mkldnn::reorder>(
dev_ctx_.GetBlob(key_reorder_p));
if (stored_reorder_p) {
pipeline.push_back(*stored_reorder_p);
} else {
auto reorder_p =
std::make_shared<mkldnn::reorder>(*user_memory_p, *target_memory_p);
dev_ctx_.SetBlob(key_reorder_p, reorder_p);
pipeline.push_back(*reorder_p);
}
return target_memory_p;
}
std::shared_ptr<mkldnn::memory> AcquireMemory(
mkldnn::memory::primitive_desc& mpd, // NOLINT
mkldnn::memory::primitive_desc& user_mpd, // NOLINT
const std::shared_ptr<mkldnn::memory> user_memory_p,
const std::string& suffix,
std::vector<mkldnn::primitive>& pipeline, // NOLINT
bool is_persistent = false) {
// create reorder primitive if the input format is not the preferred one
auto local_key = key_ + suffix;
auto key_reorder_p = key_ + suffix + "reorder_p";
auto target_memory_p =
std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
PADDLE_ENFORCE((target_memory_p != nullptr) || (is_reusing_ == false),
"Fail to find mem primitive in device context");
if (target_memory_p == nullptr) {
target_memory_p = user_memory_p;
std::shared_ptr<mkldnn::primitive> reorder_p;
if (mpd != user_mpd) {
target_memory_p = std::make_shared<mkldnn::memory>(mpd);
auto reorder_p =
std::make_shared<mkldnn::reorder>(*user_memory_p, *target_memory_p);
dev_ctx_.SetBlob(key_reorder_p, reorder_p);
pipeline.push_back(*reorder_p);
}
dev_ctx_.SetBlob(local_key, target_memory_p);
} else if (!is_persistent) {
// Make reorder if needed
auto reorder_p = std::static_pointer_cast<mkldnn::reorder>(
dev_ctx_.GetBlob(key_reorder_p));
if (reorder_p != nullptr) {
pipeline.push_back(*reorder_p);
}
is_reusing_ = true;
}
return target_memory_p;
}
static std::string GetHash(mkldnn::memory::dims& operand_dims, // NOLINT
const std::string& suffix) {
return dims2str(operand_dims) + suffix;
}
protected:
static std::string dims2str(const mkldnn::memory::dims& operand_dims) {
std::string dstr = "";
for (size_t i = 0; i < operand_dims.size(); ++i) {
dstr += std::to_string(operand_dims[i]) + "-";
}
return dstr;
}
protected:
const MKLDNNDeviceContext& dev_ctx_;
mkldnn::engine engine_;
std::string key_;
bool is_reusing_;
};
inline mkldnn::memory::format MKLDNNFormatForSize(
size_t dims_size, mkldnn::memory::format data_format) {
if (dims_size == 1) {
......
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace platform {
using user_function = std::function<std::shared_ptr<float>(const float*)>;
class MKLDNNHandler {
public:
MKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
const std::string& base_key)
: dev_ctx_(dev_ctx),
engine_(engine),
key_(base_key),
is_reusing_(false) {}
std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
const mkldnn::memory::desc& md, void* ptr) {
return this->AcquireMemory(md, ptr, "@user_src_mem_p");
}
std::shared_ptr<mkldnn::memory> AcquireWeightsMemory(
const mkldnn::memory::desc& md, void* ptr,
user_function custom_func = {}) {
return this->AcquireMemory(md, ptr, "@user_weights_mem_p", custom_func);
}
std::shared_ptr<mkldnn::memory> AcquireBiasMemory(
const mkldnn::memory::desc& md, void* ptr) {
return this->AcquireMemory(md, ptr, "@user_bias_mem_p");
}
std::shared_ptr<mkldnn::memory> AcquireDstMemory(
const mkldnn::memory::desc& md, void* ptr) {
return this->AcquireMemory(md, ptr, "@user_dst_mem_p");
}
std::shared_ptr<mkldnn::memory> AcquireDiffDstMemory(
const mkldnn::memory::desc& md, void* ptr) {
return this->AcquireMemory(md, ptr, "@user_diff_dst_mem_p");
}
std::shared_ptr<mkldnn::memory> AcquireDiffSrcMemory(
const mkldnn::memory::desc& md, void* ptr) {
return this->AcquireMemory(md, ptr, "@user_diff_src_mem_p");
}
std::shared_ptr<mkldnn::memory> AcquireMemoryFromPrimitive(
mkldnn::memory::primitive_desc mdp, void* ptr,
const std::string& suffix) {
auto local_key = key_ + suffix;
auto mem_p =
std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
"Fail to find mem primitive in device context");
if (mem_p == nullptr) {
mem_p = std::make_shared<mkldnn::memory>(mdp, ptr);
dev_ctx_.SetBlob(local_key, mem_p);
} else {
mem_p->set_data_handle(ptr);
// Mark that reusing happenned. All primitives from operator instance
// should be reused or none of them. So we check consistency
is_reusing_ = true;
}
return mem_p;
}
// This incarnation of AcquireMemory can call user function eg. custom reorder
// or preprocessing routine if needed
std::shared_ptr<mkldnn::memory> AcquireMemory(
const mkldnn::memory::desc& md, void* ptr, const std::string& suffix,
user_function custom_func = {}) {
/*Generate key*/
auto local_key = key_ + suffix;
auto mem_p =
std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false),
"Fail to find mem primitive in device context");
if (mem_p == nullptr) {
// Call custom reorder/preprocessing func if available
if (custom_func) {
auto reordered_data = custom_func(reinterpret_cast<const float*>(ptr));
dev_ctx_.SetBlob(local_key + "-custom_reorder", reordered_data);
ptr = reinterpret_cast<void*>(reordered_data.get());
}
mem_p = std::make_shared<mkldnn::memory>(
mkldnn::memory::primitive_desc{md, engine_}, ptr);
dev_ctx_.SetBlob(local_key, mem_p);
} else {
mem_p->set_data_handle(ptr);
// Mark that reusing happenned. All primitives from operator instance
// should be reused or none of them. So we check consistency
is_reusing_ = true;
}
return mem_p;
}
std::shared_ptr<mkldnn::memory> AcquireMemory(
const std::shared_ptr<mkldnn::memory>& user_memory_p,
const std::shared_ptr<mkldnn::memory>& target_memory_p,
const std::string& suffix,
std::vector<mkldnn::primitive>& pipeline) { // NOLINT
auto local_key = key_ + suffix;
auto key_reorder_p = key_ + suffix + "reorder_p";
auto stored_reorder_p = std::static_pointer_cast<mkldnn::reorder>(
dev_ctx_.GetBlob(key_reorder_p));
if (stored_reorder_p) {
pipeline.push_back(*stored_reorder_p);
} else {
auto reorder_p =
std::make_shared<mkldnn::reorder>(*user_memory_p, *target_memory_p);
dev_ctx_.SetBlob(key_reorder_p, reorder_p);
pipeline.push_back(*reorder_p);
}
return target_memory_p;
}
std::shared_ptr<mkldnn::memory> AcquireMemory(
mkldnn::memory::primitive_desc& mpd, // NOLINT
mkldnn::memory::primitive_desc& user_mpd, // NOLINT
const std::shared_ptr<mkldnn::memory> user_memory_p,
const std::string& suffix,
std::vector<mkldnn::primitive>& pipeline, // NOLINT
bool is_persistent = false) {
// create reorder primitive if the input format is not the preferred one
auto local_key = key_ + suffix;
auto key_reorder_p = key_ + suffix + "reorder_p";
auto target_memory_p =
std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
PADDLE_ENFORCE((target_memory_p != nullptr) || (is_reusing_ == false),
"Fail to find mem primitive in device context");
if (target_memory_p == nullptr) {
target_memory_p = user_memory_p;
std::shared_ptr<mkldnn::primitive> reorder_p;
if (mpd != user_mpd) {
target_memory_p = std::make_shared<mkldnn::memory>(mpd);
auto reorder_p =
std::make_shared<mkldnn::reorder>(*user_memory_p, *target_memory_p);
dev_ctx_.SetBlob(key_reorder_p, reorder_p);
pipeline.push_back(*reorder_p);
}
dev_ctx_.SetBlob(local_key, target_memory_p);
} else if (!is_persistent) {
// Make reorder if needed
auto reorder_p = std::static_pointer_cast<mkldnn::reorder>(
dev_ctx_.GetBlob(key_reorder_p));
if (reorder_p != nullptr) {
pipeline.push_back(*reorder_p);
}
is_reusing_ = true;
}
return target_memory_p;
}
static std::string GetHash(mkldnn::memory::dims& operand_dims, // NOLINT
const std::string& suffix) {
return dims2str(operand_dims) + suffix;
}
protected:
static std::string dims2str(const mkldnn::memory::dims& operand_dims) {
std::string dstr = "";
for (size_t i = 0; i < operand_dims.size(); ++i) {
dstr += std::to_string(operand_dims[i]) + "-";
}
return dstr;
}
protected:
const MKLDNNDeviceContext& dev_ctx_;
mkldnn::engine engine_;
std::string key_;
bool is_reusing_;
};
template <class forward_t, class backward_data_t, class backward_weights_t>
class ConvMKLDNNTemplateHandler : public MKLDNNHandler {
public:
ConvMKLDNNTemplateHandler(
std::shared_ptr<typename forward_t::primitive_desc> conv_pd,
const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
const std::string& base_key)
: platform::MKLDNNHandler(dev_ctx, engine, base_key) {
conv_pd_ = conv_pd;
}
ConvMKLDNNTemplateHandler(
std::shared_ptr<typename forward_t::primitive_desc> conv_pd,
std::shared_ptr<typename backward_data_t::primitive_desc>
conv_bwd_data_pd,
std::shared_ptr<typename backward_weights_t::primitive_desc>
conv_bwd_weights_pd,
const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
const std::string& base_key)
: platform::MKLDNNHandler(dev_ctx, engine, base_key),
conv_pd_(conv_pd),
conv_bwd_weights_pd_(conv_bwd_weights_pd),
conv_bwd_data_pd_(conv_bwd_data_pd) {
// If we are in Grad operatgor then update a key with BWD suffix to
// distinguish from FWD memory primitives
key_ += "-BWD";
}
size_t GetDstMemorySize() const {
return conv_pd_->dst_primitive_desc().get_size();
}
mkldnn::memory::format GetDstFormat() const {
return static_cast<mkldnn::memory::format>(
conv_pd_->dst_primitive_desc().desc().data.format);
}
size_t GetDiffWeightsMemorySize() const {
return conv_bwd_weights_pd_->diff_weights_primitive_desc().get_size();
}
size_t GetDiffSourceMemorySize() const {
return conv_bwd_data_pd_->diff_src_primitive_desc().get_size();
}
std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromWeightsPrimitive(
const std::shared_ptr<mkldnn::memory> user_memory_p,
std::vector<mkldnn::primitive>& pipeline) { // NOLINT
auto src_pd = conv_bwd_weights_pd_->src_primitive_desc();
auto user_pd = user_memory_p->get_primitive_desc();
return this->AcquireMemory(src_pd, user_pd, user_memory_p,
"@weights-src_mem_p", pipeline);
}
std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromWeightsPrimitive(
const std::shared_ptr<mkldnn::memory> user_memory_p,
std::vector<mkldnn::primitive>& pipeline) { // NOLINT
auto diff_dst_pd = conv_bwd_weights_pd_->diff_dst_primitive_desc();
auto user_pd = user_memory_p->get_primitive_desc();
return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p,
"@weights-diff_dst_mem_p", pipeline);
}
std::shared_ptr<mkldnn::memory> AcquireDiffWeightsMemoryFromWeightsPrimitive(
void* ptr) {
return this->AcquireMemoryFromPrimitive(
conv_bwd_weights_pd_->diff_weights_primitive_desc(), ptr,
"@diff_weights_mem_p");
}
std::shared_ptr<mkldnn::memory> AcquireDiffDstMemoryFromDataPrimitive(
const std::shared_ptr<mkldnn::memory> user_memory_p,
std::vector<mkldnn::primitive>& pipeline) { // NOLINT
auto diff_dst_pd = conv_bwd_data_pd_->diff_dst_primitive_desc();
auto user_pd = user_memory_p->get_primitive_desc();
return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p,
"@data-diff_dst_mem_p", pipeline);
}
std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromDataPrimitive(
const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
std::vector<mkldnn::primitive>& pipeline) { // NOLINT
auto weights_pd = conv_bwd_data_pd_->weights_primitive_desc();
auto user_pd = user_weights_memory_p->get_primitive_desc();
return this->AcquireMemory(weights_pd, user_pd, user_weights_memory_p,
"@data-weights_mem_p", pipeline);
}
std::shared_ptr<mkldnn::memory> AcquireResidualDataMemory(
const mkldnn::memory::desc& md, void* ptr) {
return this->AcquireMemory(md, ptr, "@user_residual_data_mem_p");
}
std::shared_ptr<mkldnn::memory> AcquireDstMemoryFromResidualDataMemory(
const std::shared_ptr<mkldnn::memory>& user_residual_memory_p,
void* dst_ptr,
std::vector<mkldnn::primitive>& pipeline) { // NOLINT
return this->AcquireMemory(user_residual_memory_p,
this->AcquireDstMemoryFromPrimitive(dst_ptr),
"@residual_data_mem_p", pipeline);
}
std::shared_ptr<mkldnn::memory> AcquireDiffSrcMemoryFromDataPrimitive(
void* ptr) {
return this->AcquireMemoryFromPrimitive(
conv_bwd_data_pd_->diff_src_primitive_desc(), ptr, "@diff_src_mem_p");
}
std::shared_ptr<mkldnn::memory> AcquireDstMemoryFromPrimitive(void* ptr) {
return this->AcquireMemoryFromPrimitive(conv_pd_->dst_primitive_desc(), ptr,
"@dst_mem_p");
}
std::shared_ptr<mkldnn::memory> AcquireSrcMemoryFromPrimitive(
const std::shared_ptr<mkldnn::memory> user_memory_p,
std::vector<mkldnn::primitive>& pipeline) { // NOLINT
auto src_pd = conv_pd_->src_primitive_desc();
auto user_pd = user_memory_p->get_primitive_desc();
return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p",
pipeline);
}
std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive(
const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
std::vector<mkldnn::primitive>& pipeline, // NOLINT
bool is_persistent = false) {
auto user_weights_pd = user_weights_memory_p->get_primitive_desc();
auto weights_pd = conv_pd_->weights_primitive_desc();
return this->AcquireMemory(weights_pd, user_weights_pd,
user_weights_memory_p, "@weights_mem_p",
pipeline, is_persistent);
}
std::shared_ptr<mkldnn::memory> AcquireBiasMemoryFromPrimitive(
const std::shared_ptr<mkldnn::memory> user_bias_memory_p,
std::vector<mkldnn::primitive>& pipeline) { // NOLINT
auto user_bias_pd = user_bias_memory_p->get_primitive_desc();
auto bias_pd = conv_pd_->bias_primitive_desc();
return this->AcquireMemory(bias_pd, user_bias_pd, user_bias_memory_p,
"@bias_mem_p", pipeline);
}
std::shared_ptr<forward_t> AcquireConvolution(
std::shared_ptr<mkldnn::memory> src_memory_p,
std::shared_ptr<mkldnn::memory> weights_memory_p,
std::shared_ptr<mkldnn::memory> dst_memory_p) {
auto prim_key = key_ + "@conv_p";
auto conv_p =
std::static_pointer_cast<forward_t>(dev_ctx_.GetBlob(prim_key));
PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false),
"Fail to find convolution primitive in device context");
if (conv_p == nullptr) {
conv_p = std::make_shared<forward_t>(*conv_pd_, *(src_memory_p),
*(weights_memory_p.get()),
*(dst_memory_p.get()));
dev_ctx_.SetBlob(prim_key, conv_p);
} else {
is_reusing_ = true;
}
return conv_p;
}
std::shared_ptr<forward_t> AcquireConvolution(
std::shared_ptr<mkldnn::memory> src_memory_p,
std::shared_ptr<mkldnn::memory> weights_memory_p,
std::shared_ptr<mkldnn::memory> bias_memory_p,
std::shared_ptr<mkldnn::memory> dst_memory_p) {
auto prim_key = key_ + "@conv_p";
auto conv_p =
std::static_pointer_cast<forward_t>(dev_ctx_.GetBlob(prim_key));
PADDLE_ENFORCE((conv_p != nullptr) || (is_reusing_ == false),
"Fail to find convolution primitive in device context");
if (conv_p == nullptr) {
conv_p = std::make_shared<forward_t>(
*conv_pd_, *(src_memory_p), *(weights_memory_p.get()),
*(bias_memory_p.get()), *(dst_memory_p.get()));
dev_ctx_.SetBlob(prim_key, conv_p);
} else {
is_reusing_ = true;
}
return conv_p;
}
std::shared_ptr<backward_weights_t> AcquireConvolutionBackwardWeights(
std::shared_ptr<mkldnn::memory> src_memory_p,
std::shared_ptr<mkldnn::memory> diff_dst_memory_p,
std::shared_ptr<mkldnn::memory> diff_weights_memory_p) {
auto prim_key = key_ + "@conv_bwd_weights_p";
auto conv_bwd_weights_p = std::static_pointer_cast<backward_weights_t>(
dev_ctx_.GetBlob(prim_key));
PADDLE_ENFORCE(
(conv_bwd_weights_p != nullptr) || (is_reusing_ == false),
"Fail to find convolution bwd weights primitive in device context");
if (conv_bwd_weights_p == nullptr) {
// create backward conv primitive for weights
conv_bwd_weights_p = std::make_shared<backward_weights_t>(
*conv_bwd_weights_pd_, *src_memory_p, *diff_dst_memory_p,
*diff_weights_memory_p);
dev_ctx_.SetBlob(prim_key, conv_bwd_weights_p);
} else {
is_reusing_ = true;
}
return conv_bwd_weights_p;
}
std::shared_ptr<backward_data_t> AcquireConvolutionBackwardData(
std::shared_ptr<mkldnn::memory> diff_dst_memory_p,
std::shared_ptr<mkldnn::memory> weights_memory_p,
std::shared_ptr<mkldnn::memory> diff_src_memory_p) {
auto prim_key = key_ + "@conv_bwd_data_p";
auto conv_bwd_data_p =
std::static_pointer_cast<backward_data_t>(dev_ctx_.GetBlob(prim_key));
PADDLE_ENFORCE(
(conv_bwd_data_p != nullptr) || (is_reusing_ == false),
"Fail to find convolution bwd data primitive in device context");
if (conv_bwd_data_p == nullptr) {
conv_bwd_data_p = std::make_shared<backward_data_t>(
*conv_bwd_data_pd_, *diff_dst_memory_p, *weights_memory_p,
*diff_src_memory_p);
dev_ctx_.SetBlob(prim_key, conv_bwd_data_p);
} else {
is_reusing_ = true;
}
return conv_bwd_data_p;
}
// Generate keys for storing/retriving primitives for this operator
// TODO(jczaja): Make hashing function more optimial
static std::string GetHash(mkldnn::memory::dims& input_dims, // NOLINT
mkldnn::memory::dims& weights_dims, // NOLINT
std::vector<int>& strides, // NOLINT
std::vector<int>& paddings, // NOLINT
std::vector<int>& dilations, // NOLINT
int groups, const std::string& suffix) {
return dims2str(input_dims) + dims2str(weights_dims) + dims2str(strides) +
dims2str(paddings) + dims2str(dilations) + std::to_string(groups) +
suffix;
}
private:
std::shared_ptr<typename forward_t::primitive_desc> conv_pd_;
std::shared_ptr<typename backward_weights_t::primitive_desc>
conv_bwd_weights_pd_;
std::shared_ptr<typename backward_data_t::primitive_desc> conv_bwd_data_pd_;
};
using ConvMKLDNNHandler =
ConvMKLDNNTemplateHandler<mkldnn::convolution_forward,
mkldnn::convolution_backward_data,
mkldnn::convolution_backward_weights>;
using ConvTransposeMKLDNNHandler =
ConvMKLDNNTemplateHandler<mkldnn::deconvolution_forward,
mkldnn::deconvolution_backward_data,
mkldnn::deconvolution_backward_weights>;
} // namespace platform
} // namespace paddle
set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder parallel_executor profiler)
set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc)
set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler)
set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc)
if(WITH_PYTHON)
if(WITH_AMD_GPU)
hip_library(paddle_pybind SHARED
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fcntl.h>
// To avoid conflicting definition in gcc-4.8.2 headers and pyconfig.h (2.7.3)
#ifdef _POSIX_C_SOURCE
#undef _POSIX_C_SOURCE
#endif
#ifdef _XOPEN_SOURCE
#undef _XOPEN_SOURCE
#endif
#include <string>
#include <vector>
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "google/protobuf/text_format.h"
#include "paddle/fluid/framework/async_executor.h"
#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/data_feed.pb.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/variant.h"
#include "paddle/fluid/pybind/async_executor_py.h"
namespace py = pybind11;
namespace pd = paddle::framework;
namespace paddle {
namespace pybind {
using set_name_func = void (pd::DataFeedDesc::*)(const std::string&);
void BindAsyncExecutor(py::module* m) {
py::class_<framework::AsyncExecutor>(*m, "AsyncExecutor")
.def(py::init([](framework::Scope* scope, const platform::Place& place) {
return std::unique_ptr<framework::AsyncExecutor>(
new framework::AsyncExecutor(scope, place));
}))
.def("run_from_files", &framework::AsyncExecutor::RunFromFile);
} // end BindAsyncExecutor
} // end namespace pybind
} // end namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
namespace py = pybind11;
namespace paddle {
namespace pybind {
void BindAsyncExecutor(py::module* m);
} // namespace pybind
} // namespace paddle
......@@ -42,6 +42,7 @@ limitations under the License. */
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/pybind/async_executor_py.h"
#include "paddle/fluid/pybind/const_value.h"
#include "paddle/fluid/pybind/exception.h"
#include "paddle/fluid/pybind/protobuf.h"
......@@ -932,6 +933,7 @@ All parameter, weight, gradient are variables in Paddle.
});
BindRecordIOWriter(&m);
BindAsyncExecutor(&m);
}
} // namespace pybind
} // namespace paddle
......@@ -440,11 +440,29 @@ EOF
ctest --output-on-failure -j $1
# make install should also be test when unittest
make install -j 8
if [ "$1" == "cp27-cp27m" ]; then
pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
elif [ "$1" == "cp35-cp35m" ]; then
pip3.5 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
elif [ "$1" == "cp36-cp36m" ]; then
pip3.6 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
elif [ "$1" == "cp37-cp37m" ]; then
pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
fi
if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
paddle version
fi
if [ "$1" == "cp27-cp27m" ]; then
pip uninstall -y paddlepaddle
elif [ "$1" == "cp35-cp35m" ]; then
pip3.5 uninstall -y paddlepaddle
elif [ "$1" == "cp36-cp36m" ]; then
pip3.6 uninstall -y paddlepaddle
elif [ "$1" == "cp37-cp37m" ]; then
pip3.7 uninstall -y paddlepaddle
fi
fi
}
......
......@@ -20,6 +20,13 @@ from .framework import *
# import all class inside executor into fluid module
from . import executor
from .executor import *
from . import data_feed_desc
from .data_feed_desc import *
from . import async_executor
from .async_executor import *
from . import trainer
from . import inferencer
......@@ -54,7 +61,8 @@ Tensor = LoDTensor
__all__ = framework.__all__ + executor.__all__ + \
trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \
parallel_executor.__all__ + lod_tensor.__all__ + [
parallel_executor.__all__ + lod_tensor.__all__ + \
data_feed_desc.__all__ + async_executor.__all__ + [
'io',
'initializer',
'layers',
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import contextlib
import six
from .framework import Program, default_main_program, Variable
from . import core
from .executor import global_scope, Executor
from paddle.fluid.proto import data_feed_pb2
from google.protobuf import text_format
from . import io
from .data_feed_desc import DataFeedDesc
__all__ = ['AsyncExecutor']
class AsyncExecutor(object):
"""
An asynchronous Executor in Python. Through exploiting the power of
multi-core processor and data queueing, AsyncExecutor makes data reading
and cosuming decoupled, each run in multiple threads in parallel.
Instead of reading data in python side, AsyncExecutor accepts a training
file list, which will be retrieved in C++, then training inputs will be
read, parsed and fed to training network within C++ code.
AsyncExecutor is in active development and the API might change in the near
future.
Example:
>>> data_feed = fluid.DataFeedDesc('data.proto')
>>> startup_program = fluid.default_startup_program()
>>> main_program = fluid.default_main_program()
>>> filelist = ["train_data/part-%d" % i for i in range(100)]
>>> thread_num = len(filelist) / 4
>>>
>>> place = fluid.CPUPlace()
>>> async_executor = fluid.AsyncExecutor(place)
>>>
>>> async_executor.run_startup_program(startup_program)
>>>
>>> epoch = 10
>>> for i in range(epoch):
>>> async_executor.run(main_program,
>>> data_feed,
>>> filelist,
>>> thread_num,
>>> [acc],
>>> debug=False)
Args:
place(fluid.CPUPlace|None): indicate the executor run on which device.
Only CPUPlace supported
Note:
For debugging complicated network in parallel-GPUs, you can test it
on the executor. They has the exactly same arguments, and expected
the same results.
Note: Only running on CPUPlace supported.
"""
def __init__(self, place=None):
if place is None:
place = core.CPUPlace()
if not isinstance(place, core.CPUPlace):
raise ValueError("AsyncExecutor only supports CPU device")
p = core.Place()
p.set_place(place)
scope = global_scope()
self.executor = core.AsyncExecutor(scope, p)
def run(self, program, data_feed, filelist, thread_num, fetch, debug=False):
"""
Run program by this AsyncExecutor. Training dataset will be in filelist.
Users can also inspect certain variables by naming them in parameter
:code:`fetch`, like in fluid.Executor. Unlike fluid.Executor, however,
AsyncExecutor doesn't return fetched variables, instead, it will dump
the values of each fetched variable to stdandard output.
Running the dataset will be on multiple threads, within each a thread
local scope will be created, then all OPs also created in that scope.
Parameters are updated by all the OPs simultaneously.
Args:
program(Program): the program that need to run, if not provied,
then default_main_program will be used.
data_feed(DataFeedDesc): A DataFeedDesc object
filelist(str): a file containing the training dataset file list
thread_num(int): number of concurrent training threads. See
:code:`Note` for how to set this properly
fetch(str|list): the var name or a list of var names to inspect
debug(bool): When set to True, fetch vars will be printed to
standard output after each minibatch
Note:
the executor will run all operators in the program but not only
the operators dependent by the fetch_list.
Note:
Running AsyncExecutor will be on multiple threads, each bound to a
CPU core. To achieve best performance, it's suggested to set thread
num to be equal or slightly less than that of CPU cores.
"""
if program is None:
program = default_main_program()
program_desc = program.desc
if data_feed is None:
raise ValueError('ValueError: data_feed should be provided')
if filelist is None:
raise ValueError('ValueError: filelist should be provided')
if isinstance(filelist, str):
filelist = [filelist]
if not isinstance(thread_num, int):
raise TypeError('TypeError: thread_num should be a positive number')
if fetch is not None:
if isinstance(fetch, Variable):
fetch = [fetch]
fetch_var_names = [var.name for var in fetch]
for fetch_var in fetch:
shape = fetch_var.shape
if shape[len(shape) - 1] != 1:
raise AssertionError(
"%s: Fetch variable has wrong shape. Only varibles "
"with the last dimension size 1 supported." %
(fetch_var.name))
self.executor.run_from_files(program_desc,
data_feed.desc(), filelist, thread_num,
fetch_var_names, debug)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.fluid.proto import data_feed_pb2
from google.protobuf import text_format
__all__ = ['DataFeedDesc']
class DataFeedDesc(object):
"""
Datafeed descriptor, describing input training data format. This class is
currently only used for AsyncExecutor (See comments for class AsyncExecutor
for a brief introduction)
DataFeedDesc shall be initialized from a valid protobuf message from disk:
>>> data_feed = fluid.DataFeedDesc('data.proto')
See :code:`paddle/fluid/framework/data_feed.proto` for message definition.
A typical message might look like:
>>> name: "MultiSlotDataFeed"
>>> batch_size: 2
>>> multi_slot_desc {
>>> slots {
>>> name: "words"
>>> type: "uint64"
>>> is_dense: false
>>> is_used: true
>>> }
>>> slots {
>>> name: "label"
>>> type: "uint64"
>>> is_dense: false
>>> is_used: true
>>> }
>>> }
However, users usually shouldn't care about the message format; instead,
they are encouragd to use :code:`Data Generator` as a tool to generate a
valid data description, in the process of converting their raw log files to
training files acceptable to AsyncExecutor.
DataFeedDesc can also be changed during runtime. Once you got familiar with
what each field mean, you can modify it to better suit your need. E.g.:
>>> data_feed.set_batch_size(128)
>>> data_feed.set_dense_slots('wd') # The slot named 'wd' will be dense
>>> data_feed.set_use_slots('wd') # The slot named 'wd' will be used
Finally, the content can be dumped out for debugging purpose:
>>> print(data_feed.desc())
Args:
proto_file(string): Disk file containing a data feed description.
"""
def __init__(self, proto_file):
self.proto_desc = data_feed_pb2.DataFeedDesc()
with open(proto_file, 'r') as f:
text_format.Parse(f.read(), self.proto_desc)
if self.proto_desc.name == "MultiSlotDataFeed":
self.__name_to_index = {
slot.name: i
for i, slot in enumerate(self.proto_desc.multi_slot_desc.slots)
}
def set_batch_size(self, batch_size):
"""
Set batch size. Will be effective during training
Example:
>>> data_feed = fluid.DataFeedDesc('data.proto')
>>> data_feed.set_batch_size(128)
Args:
batch_size: batch size
"""
self.proto_desc.batch_size = batch_size
def set_dense_slots(self, dense_slots_name):
"""
Set if a specific slot will be dense. Will be effective during training.
features for a dense slot will be fed into a Tensor, while those for a
sparse slot will be fed into a LoDTensor
Example:
>>> data_feed = fluid.DataFeedDesc('data.proto')
>>> data_feed.set_dense_slots(['words'])
Args:
dense_slots_name: a list of slot names which will be set dense
Note:
Default is sparse for all slots
"""
if self.proto_desc.name != "MultiSlotDataFeed":
raise ValueError(
"Only MultiSlotDataFeed need set_dense_slots, pls check your datafeed.proto"
)
for name in dense_slots_name:
self.proto_desc.multi_slot_desc.slots[self.__name_to_index[
name]].is_dense = True
def set_use_slots(self, use_slots_name):
"""
Set if a specific slot will be used for training. A dataset shall
contain a lot of features, through this function one can select which
ones will be used for a specific model.
Example:
>>> data_feed = fluid.DataFeedDesc('data.proto')
>>> data_feed.set_use_slots(['words'])
Args:
use_slots_name: a list of slot names which will be used in training
Note:
Default is not used for all slots
"""
if self.proto_desc.name != "MultiSlotDataFeed":
raise ValueError(
"Only MultiSlotDataFeed need set_use_slots, pls check your datafeed.proto"
)
for name in use_slots_name:
self.proto_desc.multi_slot_desc.slots[self.__name_to_index[
name]].is_used = True
def desc(self):
"""
Returns a protobuf message for this DataFeedDesc
Example:
>>> data_feed = fluid.DataFeedDesc('data.proto')
>>> print(data_feed.desc())
Returns:
A string message
"""
return text_format.MessageToString(self.proto_desc)
......@@ -278,6 +278,7 @@ class Executor(object):
p = core.Place()
p.set_place(place)
self.executor = core.Executor(p)
self.program_caches = dict()
self._closed = False
......
......@@ -169,8 +169,11 @@ __all__ = [
'log_loss',
'add_position_encoding',
'bilinear_tensor_product',
'lstm',
]
kIgnoreIndex = -100
def fc(input,
size,
......@@ -326,6 +329,11 @@ def embedding(input,
"""
helper = LayerHelper('embedding', **locals())
remote_prefetch = False
if os.environ.get('PADDLE_ENABLE_REMOTE_PREFETCH'):
remote_prefetch = True
if remote_prefetch:
assert is_sparse is True and is_distributed is False
w = helper.create_parameter(
attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False)
tmp = helper.create_variable_for_type_inference(dtype)
......@@ -339,6 +347,7 @@ def embedding(input,
attrs={
'is_sparse': is_sparse,
'is_distributed': is_distributed,
'remote_prefetch': remote_prefetch,
'padding_idx': padding_idx
})
return tmp
......@@ -466,6 +475,168 @@ def dynamic_lstm(input,
return hidden, cell
def lstm(input,
init_h,
init_c,
max_len,
hidden_size,
num_layers,
dropout_prob=0.0,
is_bidirec=False,
is_test=False,
name=None,
default_initializer=None,
seed=-1):
"""
If Device is GPU, This op will use cudnn LSTM implementation
A four-gate Long Short-Term Memory network with no peephole connections.
In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1,
the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations:
$$ i_t = \\sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$
$$ f_t = \\sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$
$$ o_t = \\sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$
$$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$
$$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$
$$ h_t = o_t \\odot tanh(c_t) $$
- W terms denote weight matrices (e.g. $W_{ix}$ is the matrix
of weights from the input gate to the input)
- The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector).
- sigmoid is the logistic sigmoid function.
- $i, f, o$ and $c$ are the input gate, forget gate, output gate,
and cell activation vectors, respectively, all of which have the same size as
the cell output activation vector $h$.
- The $\odot$ is the element-wise product of the vectors.
- `tanh` is the activation functions.
- $\tilde{c_t}$ is also called candidate hidden state,
which is computed based on the current input and the previous hidden state.
Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication,
X represensts a matrix multiplication
Args:
input (Variable): LSTM input tensor, shape MUST be ( seq_len x batch_size x input_size )
init_h(Variable): The initial hidden state of the LSTM
This is a tensor with shape ( num_layers x batch_size x hidden_size)
if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
init_c(Variable): The initial cell state of the LSTM.
This is a tensor with shape ( num_layers x batch_size x hidden_size )
if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len
hidden_size (int): hidden size of the LSTM
num_layers (int): total layers number of the LSTM
dropout_prob(float|0.0): dropout prob, dropout ONLY work between rnn layers, NOT between time steps
There is NO dropout work on rnn output of the last RNN layers
is_bidirec (bool): If it is bidirectional
is_test (bool): If it is in test phrase
name (str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
default_initializer(Initialize|None): Where use initializer to initialize the Weight
If set None, defaule initializer will be used
seed(int): Seed for dropout in LSTM, If it's -1, dropout will use random seed
Returns:
rnn_out(Tensor): result of LSTM hidden, shape is (seq_len x batch_size x hidden_size)
if is_bidirec set to True, shape will be ( seq_len x batch_sze x hidden_size*2)
last_h(Tensor): the hidden state of the last step of LSTM
shape is ( num_layers x batch_size x hidden_size )
if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
last_c(Tensor): the cell state of the last step of LSTM
shape is ( num_layers x batch_size x hidden_size )
if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
Examples:
.. code-block:: python
input = embedding
batch_size = 20
max_len = 100
dropout_prob = 0.2
input_size = 100
hidden_size = 150
num_layers = 1
init_hidden1 = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0, stop_grad=False)
init_cell1 = layers.fill_constant( [num_layers, batch_size, hidden_size], 'float32', 0.0, stop_grad=False)
rnn_out, last_h, last_c = layers.lstm( input, init_h, init_c, \
max_len, dropout_prob, input_size, hidden_size, \
num_layers)
"""
helper = LayerHelper('cudnn_lstm', **locals())
dtype = input.dtype
input_shape = list(input.shape)
input_size = input_shape[-1]
weight_size = 0
for i in range(num_layers):
if i == 0:
input_weight_size = (input_size * hidden_size) * 4
else:
if is_bidirec:
input_weight_size = (hidden_size * 2 * hidden_size) * 4
else:
input_weight_size = (hidden_size * hidden_size) * 4
hidden_weight_size = (hidden_size * hidden_size) * 4
if is_bidirec:
weight_size += (input_weight_size + hidden_weight_size) * 2
weight_size += hidden_size * 8 * 2
else:
weight_size += input_weight_size + hidden_weight_size
weight_size += hidden_size * 8
weight = helper.create_parameter(
attr=helper.param_attr,
shape=[weight_size],
dtype=dtype,
default_initializer=default_initializer)
out = helper.create_variable_for_type_inference(dtype)
last_h = helper.create_variable_for_type_inference(dtype)
last_c = helper.create_variable_for_type_inference(dtype)
cache = helper.create_variable(
persistable=True, type=core.VarDesc.VarType.RAW, stop_gradient=True)
helper.append_op(
type='cudnn_lstm',
inputs={
'Input': input,
'InitH': init_h,
'InitC': init_c,
'W': weight,
'Cache': cache,
},
outputs={
'Out': out,
'last_h': last_h,
'last_c': last_c,
},
attrs={
'max_len': max_len,
'is_bidirec': is_bidirec,
'input_size': input_size,
'hidden_size': hidden_size,
'num_layers': num_layers,
'is_test': is_test,
'dropout_prob': dropout_prob,
'seed': seed,
})
return out, last_h, last_c
def dynamic_lstmp(input,
size,
proj_size,
......@@ -1098,7 +1269,7 @@ def dropout(x,
return out
def cross_entropy(input, label, soft_label=False, ignore_index=-100):
def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
"""
**Cross Entropy Layer**
......@@ -1145,7 +1316,7 @@ def cross_entropy(input, label, soft_label=False, ignore_index=-100):
labels. Default: `False`.
ignore_index (int): Specifies a target value that is ignored and does
not contribute to the input gradient. Only valid
if soft_label is set to False. Default: -100
if soft_label is set to False. Default: kIgnoreIndex
Returns:
A 2-D tensor with shape [N x 1], the cross entropy loss.
......@@ -2300,7 +2471,8 @@ def batch_norm(input,
moving_mean_name=None,
moving_variance_name=None,
do_model_average_for_mean_and_var=False,
fuse_with_relu=False):
fuse_with_relu=False,
use_global_stats=False):
"""
**Batch Normalization Layer**
......@@ -2327,6 +2499,19 @@ def batch_norm(input,
\\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
When use_global_stats = True, the :math:`\\mu_{\\beta}`
and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch.
They are global (or running) statistics. (It usually got from the
pre-trained model.)
The training and testing (or inference) have the same behavior:
.. math::
\\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
\\sigma_{\\beta}^{2} + \\epsilon}} \\\\
y_i &\\gets \\gamma \\hat{x_i} + \\beta
Args:
input(variable): The input variable which is a LoDTensor.
act(string, Default None): Activation type, linear|relu|prelu|...
......@@ -2349,6 +2534,11 @@ def batch_norm(input,
moving_variance_name(string, Default None): The name of the moving_variance which store the global Variance.
do_model_average_for_mean_and_var(bool, Default False): Do model average for mean and variance or not.
fuse_with_relu (bool): if True, this OP performs relu after batch norm.
use_global_stats(bool, Default False): Whether to use global mean and
variance. In inference or test mode, set use_global_stats to true
or is_test to true, and the behavior is equivalent.
In train mode, when setting use_global_stats True, the global mean
and variance are also used during train period.
Returns:
Variable: A tensor variable which is the result after applying batch normalization on the input.
......@@ -2381,9 +2571,15 @@ def batch_norm(input,
shape=param_shape,
dtype=dtype,
default_initializer=Constant(1.0))
# setting stop_gradient=True to reduce computation
if use_global_stats and helper.param_attr.learning_rate == 0.:
scale.stop_gradient = True
bias = helper.create_parameter(
attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
# setting stop_gradient=True to reduce computation
if use_global_stats and helper.bias_attr.learning_rate == 0.:
scale.stop_gradient = True
mean = helper.create_parameter(
attr=ParamAttr(
......@@ -2439,7 +2635,8 @@ def batch_norm(input,
"epsilon": epsilon,
"is_test": is_test,
"use_mkldnn": False,
"fuse_with_relu": fuse_with_relu
"fuse_with_relu": fuse_with_relu,
"use_global_stats": use_global_stats
})
return helper.append_activation(batch_norm_out)
......@@ -4219,7 +4416,14 @@ def ctc_greedy_decoder(input, blank, name=None):
input.lod = [[4, 4]]
Then:
Computation:
step1: Apply argmax to first input sequence which is input.data[0:4]. Then we get:
[[0], [2], [1], [0]]
step2: merge repeated tokens and remove blank which is 0. Then we get first output sequence:
[[2], [1]]
Finally:
output.data = [[2],
[1],
......@@ -4227,6 +4431,7 @@ def ctc_greedy_decoder(input, blank, name=None):
output.lod = [[2, 1]]
Args:
input(Variable): (LoDTensor<float>), the probabilities of
......@@ -4241,8 +4446,10 @@ def ctc_greedy_decoder(input, blank, name=None):
name (str): The name of this layer. It is optional.
Returns:
Variable: CTC greedy decode result. If all the sequences in result were
empty, the result LoDTensor will be [-1] with LoD [[]] and dims [1, 1].
Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1].
'Lp' is the sum if all output sequences' length. If all the sequences
in result were empty, the result LoDTensor will be [-1] with
LoD [[]] and dims [1, 1].
Examples:
.. code-block:: python
......@@ -4980,7 +5187,7 @@ def multiplex(inputs, index):
def softmax_with_cross_entropy(logits,
label,
soft_label=False,
ignore_index=-100,
ignore_index=kIgnoreIndex,
numeric_stable_mode=False,
return_softmax=False):
"""
......@@ -5038,7 +5245,7 @@ def softmax_with_cross_entropy(logits,
labels as soft labels. By default, `soft_label` is set to False.
ignore_index (int): Specifies a target value that is ignored and does
not contribute to the input gradient. Only valid
if soft_label is set to False. Default: -100
if soft_label is set to False. Default: kIgnoreIndex
numeric_stable_mode (bool): A flag to indicate whether to use a more
numerically stable algorithm. Only valid
when soft_label is False and GPU is used.
......@@ -6892,7 +7099,7 @@ def pad2d(input,
Args:
input (Variable): The input image with [N, C, H, W] format or [N, H, W, C] format.
paddings (tuple|list): The padding size. If padding is a tuple, it must
paddings (tuple|list|Variable): The padding size. If padding is a tuple, it must
contain four integers, (padding_top, padding_bottom, padding_left, padding_right).
Default: padding = [0, 0, 0, 0].
mode (str): Three modes: constant(default), reflect, edge. Default: constant
......@@ -6917,16 +7124,17 @@ def pad2d(input,
helper = LayerHelper('pad2d', **locals())
dtype = helper.input_dtype(input_param_name='input')
out = helper.create_variable_for_type_inference(dtype)
inputs = {'X': input}
attrs = {'mode': mode, 'pad_value': pad_value, 'data_format': data_format}
if isinstance(paddings, Variable):
inputs['Paddings'] = paddings
attrs['paddings'] = []
else:
attrs['paddings'] = paddings
helper.append_op(
type='pad2d',
inputs={'X': input},
outputs={"Out": out},
attrs={
'paddings': paddings,
'mode': mode,
'pad_value': pad_value,
'data_frmat': data_format
})
type='pad2d', inputs=inputs, outputs={"Out": out}, attrs=attrs)
return out
......@@ -7574,6 +7782,11 @@ def uniform_random_batch_size_like(input,
Returns:
out (Variable): ${out_comment}
Examples:
.. code-block:: python
input = layers.data(name="input", shape=[13, 11], dtype='float32')
out = layers.uniform_random_batch_size_like(input, [-1, 11])
"""
helper = LayerHelper('uniform_random_batch_size_like', **locals())
......@@ -7611,6 +7824,10 @@ def gaussian_random(shape, mean=0.0, std=1.0, seed=0, dtype='float32'):
Returns:
out (Variable): ${out_comment}
Examples:
.. code-block:: python
out = layers.gaussian_random(shape=[20, 30])
"""
helper = LayerHelper('gaussian_random', **locals())
......@@ -7646,6 +7863,16 @@ def sampling_id(x, min=0.0, max=1.0, seed=0, dtype='float32'):
Returns:
out (Variable): ${out_comment}
Examples:
.. code-block:: python
x = layers.data(
name="X",
shape=[13, 11],
dtype='float32',
append_batch_size=False)
out = layers.sampling_id(x)
"""
helper = LayerHelper('sampling_id', **locals())
......@@ -7685,6 +7912,14 @@ def gaussian_random_batch_size_like(input,
Returns:
out (Variable): ${out_comment}
Examples:
.. code-block:: python
input = layers.data(name="input", shape=[13, 11], dtype='float32')
out = layers.gaussian_random_batch_size_like(
input, shape=[-1, 11], mean=1.0, std=2.0)
"""
helper = LayerHelper('gaussian_random_batch_size_like', **locals())
......@@ -7717,6 +7952,12 @@ def sum(x):
Returns:
out (Variable): ${out_comment}
Examples:
.. code-block:: python
input = layers.data(name="input", shape=[13, 11], dtype='float32')
out = layers.sum(input)
"""
helper = LayerHelper('sum', **locals())
......@@ -7745,6 +7986,17 @@ def slice(input, axes, starts, ends):
Returns:
out (Variable): ${out_comment}
Examples:
.. code-block:: python
starts = [1, 0, 2]
ends = [3, 3, 4]
axes = [0, 1, 2]
input = layers.data(
name="input", shape=[3, 4, 5, 6], dtype='float32')
out = layers.slice(input, axes=axes, starts=starts, ends=ends)
"""
helper = LayerHelper('slice', **locals())
......@@ -7772,6 +8024,12 @@ def shape(input):
Returns:
out (Variable): ${out_comment}
Examples:
.. code-block:: python
input = layers.data(
name="input", shape=[3, 100, 100], dtype="float32")
out = layers.shape(input)
"""
helper = LayerHelper('shape', **locals())
......@@ -8159,13 +8417,17 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None):
@templatedoc()
def sigmoid_cross_entropy_with_logits(x, label, name=None):
def sigmoid_cross_entropy_with_logits(x,
label,
ignore_index=kIgnoreIndex,
name=None):
"""
${comment}
Args:
x(${x_type}): ${x_comment}
label(${label_type}): ${label_comment}
ignore_index(&{ignore_index}): ${ignore_index_comment}
name(basestring|None): Name of the output.
Returns:
......@@ -8184,7 +8446,7 @@ def sigmoid_cross_entropy_with_logits(x, label, name=None):
type="sigmoid_cross_entropy_with_logits",
inputs={"X": x,
"Label": label},
attrs={},
attrs={"ignore_index": ignore_index},
outputs={"Out": out})
return out
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tarfile
import paddle.fluid as fluid
import paddle
from paddle.fluid import core
URL = 'http://paddle-unittest-data.gz.bcebos.com/python_paddle_fluid_tests_demo_async-executor/train_data.tar.gz'
MD5 = '2a405a31508969b3ab823f42c0f522ca'
def bow_net(data,
label,
dict_dim=89528,
emb_dim=128,
hid_dim=128,
hid_dim2=96,
class_dim=2):
"""
BOW net
This model is from https://github.com/PaddlePaddle/models:
models/fluid/PaddleNLP/text_classification/nets.py
"""
# embedding
emb = fluid.layers.embedding(
input=data, size=[dict_dim, emb_dim], is_sparse=True)
bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
bowh = fluid.layers.tanh(bow)
# fc layer after conv
fc_1 = fluid.layers.fc(input=bowh, size=hid_dim, act="tanh")
fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
# probability of each class
prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
# cross entropy loss
cost = fluid.layers.cross_entropy(input=prediction, label=label)
# mean loss
avg_cost = fluid.layers.mean(x=cost)
acc = fluid.layers.accuracy(input=prediction, label=label)
return avg_cost, acc, prediction
def train():
# Download data
with tarfile.open(paddle.dataset.common.download(URL, "imdb", MD5)) as tarf:
tarf.extractall(path='./')
tarf.close()
# Initialize dataset description
dataset = fluid.DataFeedDesc('train_data/data.prototxt')
dataset.set_batch_size(128) # See API doc for how to change other fields
print dataset.desc() # Debug purpose: see what we get
# define network
# input text data
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
# label data
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
avg_cost, acc, prediction = bow_net(data, label)
sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
opt_ops, weight_and_grad = sgd_optimizer.minimize(avg_cost)
# Run startup program
startup_program = fluid.default_startup_program()
place = fluid.CPUPlace()
executor = fluid.Executor(place)
executor.run(startup_program)
async_executor = fluid.AsyncExecutor(place)
main_program = fluid.default_main_program()
epochs = 10
filelist = ["train_data/part-%d" % i for i in range(12)]
for i in range(epochs):
thread_num = 4
async_executor.run(
main_program, # This can be changed during iteration
dataset, # This can be changed during iteration
filelist, # This can be changed during iteration
thread_num, # This can be changed during iteration
[data, acc], # Multiple fetch targets can be specified
debug=False)
fluid.io.save_inference_model('imdb/epoch%d.model' % i,
[data.name, label.name], [acc], executor)
if __name__ == "__main__":
train()
......@@ -16,11 +16,13 @@ from __future__ import print_function
import paddle
import paddle.fluid as fluid
import os
import dist_ctr_reader
from test_dist_base import TestDistRunnerBase, runtime_main
IS_SPARSE = True
os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1"
# Fix seed for test
fluid.default_startup_program().random_seed = 1
......
......@@ -216,6 +216,15 @@ class OpTest(unittest.TestCase):
self.dtype)
outputs = append_input_output(block, op_proto, self.outputs, False,
self.dtype)
if hasattr(self, "cache_name_list"):
for name in self.cache_name_list:
inputs[name] = block.create_var(
name=name,
persistable=True,
type=core.VarDesc.VarType.RAW,
stop_gradient=True)
op = block.append_op(
type=self.op_type,
inputs=inputs,
......@@ -428,8 +437,17 @@ class OpTest(unittest.TestCase):
op_inputs = self.inputs if hasattr(self, "inputs") else dict()
op_outputs = self.outputs if hasattr(self, "outputs") else dict()
op_attrs = self.attrs if hasattr(self, "attrs") else dict()
self.op = create_op(self.scope, self.op_type, op_inputs, op_outputs,
op_attrs)
cache_list = None
if hasattr(self, "cache_name_list"):
cache_list = self.cache_name_list
self.op = create_op(
self.scope,
self.op_type,
op_inputs,
op_outputs,
op_attrs,
cache_list=cache_list)
if no_grad_set is None:
no_grad_set = set()
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import paddle
import unittest
import tarfile
import os
import shutil
proto_str = ('name: "MultiSlotDataFeed"\n'
'batch_size: 2\n'
'multi_slot_desc {\n'
' slots {\n'
' name: "words"\n'
' type: "uint64"\n'
' is_dense: false\n'
' is_used: true\n'
' }\n'
' slots {\n'
' name: "label"\n'
' type: "uint64"\n'
' is_dense: false\n'
' is_used: true\n'
' }\n'
'}')
URL = 'http://paddle-unittest-data.gz.bcebos.com/python_paddle_fluid_tests_demo_async-executor/train_data.tar.gz'
MD5 = '2a405a31508969b3ab823f42c0f522ca'
def bow_net(data,
label,
dict_dim=89528,
emb_dim=128,
hid_dim=128,
hid_dim2=96,
class_dim=2):
"""
BOW net
This model is from https://github.com/PaddlePaddle/models:
models/fluid/PaddleNLP/text_classification/nets.py
"""
# embedding
emb = fluid.layers.embedding(
input=data, size=[dict_dim, emb_dim], is_sparse=True)
bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
bowh = fluid.layers.tanh(bow)
# fc layer after conv
fc_1 = fluid.layers.fc(input=bowh, size=hid_dim, act="tanh")
fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
# probability of each class
prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
# cross entropy loss
cost = fluid.layers.cross_entropy(input=prediction, label=label)
# mean loss
avg_cost = fluid.layers.mean(x=cost)
acc = fluid.layers.accuracy(input=prediction, label=label)
return avg_cost, acc, prediction
class TestAsyncExecutor(unittest.TestCase):
def setUp(self):
with open('./data.prototxt', 'w+') as f:
f.write(proto_str)
f.close()
with tarfile.open(paddle.dataset.common.download(URL, "imdb",
MD5)) as tarf:
tarf.extractall(path='./')
tarf.close()
def test_data_feed_desc(self):
data_feed = fluid.DataFeedDesc('./data.prototxt')
# assertEqueal(data_feed.proto_desc.batch, 2)
# assertEqual(len(data_feed.proto_desc.multi_slot_desc), 2)
self.assertEqual(" ".join(data_feed.desc().split()),
" ".join(proto_str.split()))
def test_run(self):
# Initialize dataset description
data_feed = fluid.DataFeedDesc('train_data/data.prototxt')
data_feed.set_batch_size(
128) # See API doc for how to change other fields
# define network
# input text data
data = fluid.layers.data(
name="words", shape=[1], dtype="int64", lod_level=1)
# label data
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
avg_cost, acc, prediction = bow_net(data, label)
sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.002)
opt_ops, weight_and_grad = sgd_optimizer.minimize(avg_cost)
# Run startup program
startup_program = fluid.default_startup_program()
place = fluid.CPUPlace()
executor = fluid.Executor(place)
executor.run(startup_program)
main_program = fluid.default_main_program()
async_executor = fluid.AsyncExecutor(place)
self.assertRaises(TypeError, async_executor.run)
self.assertRaises(TypeError, async_executor.run, main_program)
self.assertRaises(TypeError, async_executor.run, main_program,
data_feed)
filelist = ['train_data/part-%d' % i for i in range(10)]
self.assertRaises(TypeError, async_executor.run, main_program,
data_feed, filelist)
thread_num = 4
self.assertRaises(TypeError, async_executor.run, main_program,
data_feed, filelist, thread_num)
async_executor.run(main_program, data_feed, filelist, thread_num, [acc])
fluid.io.save_inference_model("imdb.model", [data.name, label.name],
[acc], executor)
statinfo = os.stat('imdb.model/__model__')
self.assertGreater(statinfo.st_size, 0)
os.remove('./data.prototxt')
shutil.rmtree('./train_data')
shutil.rmtree('./imdb.model')
if __name__ == '__main__':
unittest.main()
......@@ -54,6 +54,19 @@ def _reference_testing(x, scale, offset, mean, var, epsilon, data_format):
return y
def _cal_mean_variance(x, epsilon, data_format):
assert data_format in ['NCHW', 'NHWC']
x_square = x * x
axis = (0, 2, 3) if data_format == 'NCHW' else (0, 1, 2)
C = x.shape[1] if data_format == 'NCHW' else x.shape[-1]
x_square_sum = np.sum(x_square, axis)
x_sum = np.sum(x, axis=axis)
element_count = np.size(x) / C
mean = x_sum / element_count
var = x_square_sum / element_count - mean * mean
return mean, var
def _reference_training(x, scale, offset, epsilon, data_format):
x_shape = x.shape
......@@ -294,7 +307,18 @@ class TestBatchNormOpTraining(unittest.TestCase):
self.use_mkldnn = False
self.fuse_with_relu = False
self.data_formats = ["NCHW", "NHWC"]
self.momentum = 0.9
self.epsilon = 0.00001
self.init_kernel_type()
self.init_test_case()
def init_test_case(self):
self.use_global_stats = False
self.no_grad_set = set()
self.fetch_list = [
'y', 'mean', 'variance', 'saved_mean', 'saved_variance', 'x@GRAD',
'scale@GRAD', 'bias@GRAD'
]
def __assert_close(self, tensor, np_array, msg, atol=1e-4):
np.allclose(np.array(tensor), np_array, atol=atol)
......@@ -313,11 +337,22 @@ class TestBatchNormOpTraining(unittest.TestCase):
return y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad
def set_mean_variance(self, scale_shape, x, data_layout):
mean = np.zeros(scale_shape).astype(np.float32)
variance = np.ones(scale_shape).astype(np.float32)
# computing global mean/variance for one step
if self.use_global_stats:
mom = self.momentum
x_mean, x_var = _cal_mean_variance(x, self.epsilon, data_layout)
mean = x_mean * (1. - mom) + mom * mean
variance = x_var * (1. - mom) + mom * variance
return mean, variance
def test_forward_backward(self):
def test_with_place(place, data_layout, shape):
# attr
epsilon = 0.00001
momentum = 0.9
epsilon = self.epsilon
momentum = self.momentum
if data_layout == "NCHW":
n, c, h, w = shape[0], shape[1], shape[2], shape[3]
else:
......@@ -328,9 +363,7 @@ class TestBatchNormOpTraining(unittest.TestCase):
x = np.random.random_sample(shape).astype(np.float32)
scale = np.random.random_sample(scale_shape).astype(np.float32)
bias = np.random.random_sample(scale_shape).astype(np.float32)
mean = np.zeros(scale_shape).astype(np.float32)
variance = np.ones(scale_shape).astype(np.float32)
mean, variance = self.set_mean_variance(scale_shape, x, data_layout)
y_grad = np.random.random_sample(shape).astype(np.float32)
y, mean_out, variance_out, saved_mean, saved_variance, x_grad, scale_grad, bias_grad = self.ref_forward_backward(
......@@ -339,6 +372,9 @@ class TestBatchNormOpTraining(unittest.TestCase):
var_dict = locals()
var_dict['y@GRAD'] = y_grad
var_dict['x@GRAD'] = x_grad
var_dict['scale@GRAD'] = scale_grad
var_dict['bias@GRAD'] = bias_grad
var_names = [
'x', 'scale', 'bias', 'mean', 'variance', 'y', 'saved_mean',
......@@ -365,9 +401,8 @@ class TestBatchNormOpTraining(unittest.TestCase):
},
outputs={
"Y": block.var('y'),
"MeanOut": block.var('mean'), # share the same memory
"VarianceOut":
block.var('variance'), # share the same memory
"MeanOut": block.var('mean'), # share memory
"VarianceOut": block.var('variance'), # share memory
"SavedMean": block.var('saved_mean'),
"SavedVariance": block.var('saved_variance')
},
......@@ -377,13 +412,14 @@ class TestBatchNormOpTraining(unittest.TestCase):
"is_test": False,
"data_layout": data_layout,
"use_mkldnn": self.use_mkldnn,
"fuse_with_relu": self.fuse_with_relu
"fuse_with_relu": self.fuse_with_relu,
"use_global_stats": self.use_global_stats
})
block.create_var(name='y@GRAD', dtype='float32', shape=y.shape)
# generate backward op_desc
grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
bn_op.desc, set(), [])
bn_op.desc, self.no_grad_set, [])
grad_op_desc = grad_op_desc_list[0]
new_op_desc = block.desc.append_op()
new_op_desc.copy_from(grad_op_desc)
......@@ -403,20 +439,10 @@ class TestBatchNormOpTraining(unittest.TestCase):
for name in
['x', 'scale', 'bias', 'mean', 'variance', 'y@GRAD']
},
fetch_list=[
'y', 'mean', 'variance', 'saved_mean', 'saved_variance',
'x@GRAD', 'scale@GRAD', 'bias@GRAD'
])
self.__assert_close(y, out[0], "y")
self.__assert_close(mean_out, out[1], "mean")
self.__assert_close(variance_out, out[2], "variance", 1e-3)
self.__assert_close(saved_mean, out[3], "saved_mean")
self.__assert_close(saved_variance, out[4], "saved_variance", 1e-3)
self.__assert_close(x_grad, out[5], "x_grad")
self.__assert_close(scale_grad, out[6], "scale_grad")
self.__assert_close(bias_grad, out[7], "bias_grad")
fetch_list=self.fetch_list)
for id, name in enumerate(self.fetch_list):
self.__assert_close(var_dict[name], out[id], name)
print("op test forward passed: ", str(place), data_layout)
places = [core.CPUPlace()]
......@@ -432,5 +458,66 @@ class TestBatchNormOpTraining(unittest.TestCase):
pass
class TestBatchNormOpFreezeStatsTraining(TestBatchNormOpTraining):
def init_test_case(self):
self.use_global_stats = True
self.no_grad_set = set()
self.fetch_list = [
'y', 'mean', 'variance', 'x@GRAD', 'scale@GRAD', 'bias@GRAD'
]
def reference_grad(self, x, y_grad, scale, mean, var, epsilon, data_format):
if data_format == "NCHW":
x = np.transpose(x, (0, 2, 3, 1))
y_grad = np.transpose(y_grad, (0, 2, 3, 1))
x_grad = scale * y_grad / np.sqrt(var + epsilon)
grad_scale = np.sum(y_grad * (x - mean) / np.sqrt(var + epsilon),
axis=(0, 1, 2))
grad_offset = np.sum(y_grad, axis=(0, 1, 2))
# transfer back to N, C, H, W
if data_format == "NCHW":
x_grad = np.transpose(x_grad, (0, 3, 1, 2))
x = np.transpose(x, (0, 3, 1, 2))
y_grad = np.transpose(y_grad, (0, 3, 1, 2))
return x_grad, grad_scale, grad_offset
def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance,
epsilon, momentum, shape, data_layout):
if data_layout != "NCHW" and data_layout != "NHWC":
raise ValueError("Unknown data order.")
if data_layout == "NCHW":
x = np.transpose(x, (0, 2, 3, 1))
# run normalizaton
normalized = (x - mean) / np.sqrt(variance + epsilon)
y = normalized * scale + bias
# transfer back to N, C, H, W
if data_layout == "NCHW":
x = np.transpose(x, (0, 3, 1, 2))
y = np.transpose(y, (0, 3, 1, 2))
mean_out = mean
variance_out = variance
saved_variance = 1. / np.sqrt(variance + epsilon)
# run backward
x_grad, scale_grad, bias_grad = self.reference_grad(
x, y_grad, scale, mean, variance, epsilon, data_layout)
return y, mean_out, variance_out, mean, saved_variance, x_grad, scale_grad, bias_grad
class TestBatchNormOpFreezeStatsAndScaleBiasTraining(
TestBatchNormOpFreezeStatsTraining):
def init_test_case(self):
self.use_global_stats = True
self.no_grad_set = set(['scale@GRAD', 'bias@GRAD'])
self.fetch_list = ['y', 'mean', 'variance', 'x@GRAD']
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
from test_conv2d_transpose_op import TestConv2dTransposeOp, TestWithPad, TestWithStride
class TestMKLDNN(TestConv2dTransposeOp):
def init_op_type(self):
self.is_test = True
self.use_mkldnn = True
self.data_format = "NCHW"
self.op_type = "conv2d_transpose"
self._cpu_only = True
def test_check_grad(self):
return
def test_check_grad_no_input(self):
return
def test_check_grad_no_filter(self):
return
class TestMKLDNNWithPad(TestWithPad):
def init_op_type(self):
self.is_test = True
self.use_mkldnn = True
self.data_format = "NCHW"
self.op_type = "conv2d_transpose"
self._cpu_only = True
def test_check_grad(self):
return
def test_check_grad_no_input(self):
return
def test_check_grad_no_filter(self):
return
class TestMKLDNNWithStride(TestWithStride):
def init_op_type(self):
self.is_test = True
self.use_mkldnn = True
self.data_format = "NCHW"
self.op_type = "conv2d_transpose"
self._cpu_only = True
def test_check_grad(self):
return
def test_check_grad_no_input(self):
return
def test_check_grad_no_filter(self):
return
if __name__ == '__main__':
unittest.main()
......@@ -68,8 +68,11 @@ def conv2dtranspose_forward_naive(input_, filter_, attrs):
class TestConv2dTransposeOp(OpTest):
def setUp(self):
# init as conv transpose
self.is_test = False
self.use_cudnn = False
self.use_mkldnn = False
self.output_size = None
self.data_format = "AnyLayout"
self.init_op_type()
self.init_test_case()
......@@ -83,7 +86,9 @@ class TestConv2dTransposeOp(OpTest):
'groups': self.groups,
'dilations': self.dilations,
'use_cudnn': self.use_cudnn,
'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fix latter
'is_test': self.is_test,
'use_mkldnn': self.use_mkldnn,
'data_format': self.data_format
}
if self.output_size is not None:
self.attrs['output_size'] = self.output_size
......
......@@ -782,5 +782,46 @@ class TestNCCL2Transpile(TranspilerTest):
pass
# test for remote prefetch
class TestRemoteLookupTable(TestDistLookupTableBase):
def net_conf(self):
import os
os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1"
self.network_with_table(is_sparse=True, is_distributed=False)
def transpiler_test_impl(self):
pserver1, startup1 = self.get_pserver(self.pserver1_ep)
self.assertEqual(len(pserver1.blocks), 4)
# 0 listen_and_serv
# 1 optimize for fc_w or fc_b adam
self.assertEqual([op.type for op in pserver1.blocks[1].ops],
["sum", "scale", "adam", "scale", "scale"])
# 2 optimize for table adam
# NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
self.assertEqual([op.type for op in pserver1.blocks[2].ops],
["sum", "scale", "adam", "scale", "scale"])
# 3 optimize for table 2 adam
# NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num
self.assertEqual([op.type for op in pserver1.blocks[3].ops],
["sum", "scale", "adam", "scale", "scale"])
trainer, _ = self.get_trainer()
self.assertEqual(len(trainer.blocks), 1)
ops = [
'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool',
'lookup_table', 'sequence_pool', 'concat', 'mul', 'elementwise_add',
'cross_entropy', 'mean', 'fill_constant', 'mean_grad',
'cross_entropy_grad', 'elementwise_add_grad', 'send', 'mul_grad',
'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad',
'split_selected_rows', 'send', 'sequence_pool_grad',
'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad',
'sum', 'split_selected_rows', 'send', 'send_barrier', 'recv',
'recv', 'fetch_barrier'
]
self.assertEqual([op.type for op in trainer.blocks[0].ops], ops)
if __name__ == "__main__":
unittest.main()
......@@ -170,9 +170,10 @@ class TestBook(unittest.TestCase):
with program_guard(program):
dat = layers.data(name='data', shape=[10], dtype='float32')
lbl = layers.data(name='label', shape=[10], dtype='float32')
ignore_index = -1
self.assertIsNotNone(
layers.sigmoid_cross_entropy_with_logits(
x=dat, label=lbl))
x=dat, label=lbl, ignore_index=ignore_index))
print(str(program))
def test_hsigmoid(self):
......@@ -636,13 +637,21 @@ class TestBook(unittest.TestCase):
with program_guard(program):
input = layers.data(
name="input", shape=[3, 100, 100], dtype="float32")
paddings = layers.fill_constant(shape=[4], dtype='int32', value=1)
out = layers.pad2d(
input,
paddings=[1, 2, 3, 4],
mode='reflect',
data_format='NCHW',
name="shape")
out_1 = layers.pad2d(
input,
paddings=paddings,
mode='reflect',
data_format='NCHW',
name="shape")
self.assertIsNotNone(out)
self.assertIsNotNone(out_1)
print(str(program))
def test_prelu(self):
......@@ -955,6 +964,15 @@ class TestBook(unittest.TestCase):
print(str(program))
def test_batch_norm(self):
program = Program()
with program_guard(program):
data = layers.data(
name='data', shape=[32, 128, 128], dtype="float32")
out = layers.batch_norm(data)
print(str(program))
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
import signal
import time
import unittest
from multiprocessing import Process
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.op import Operator
from paddle.fluid.framework import Program, program_guard
def run_pserver(pserver_id, use_cuda, sync_mode):
scope = fluid.core.Scope()
program = Program()
with fluid.scope_guard(scope):
with program_guard(program, startup_program=Program()):
# create table parameter in scope
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
# create and initialize Param Variable
param = scope.var('table').get_tensor()
param_array = np.ones((10, 8)).astype("float32")
for i in range(len(param_array)):
param_array[i] *= param_array[i] * i + pserver_id * 10
param.set(param_array, place)
optimize_block = program._create_block(program.global_block().idx)
program.global_block().append_op(
type="listen_and_serv",
inputs={'X': []},
outputs={},
attrs={
"optimize_blocks": [optimize_block],
"endpoint": '127.0.0.1:0',
"Fanin": 1,
"sync_mode": True,
"grad_to_block_id": []
})
exe = fluid.Executor(place)
exe.run(program)
class TestListenAndServOp(unittest.TestCase):
def setUp(self):
self.ps_timeout = 5
def _start_pserver(self, pserver_id, use_cuda, sync_mode, pserver_func):
p = Process(target=pserver_func, args=(pserver_id, use_cuda, sync_mode))
p.daemon = True
p.start()
return p
def _wait_ps_ready(self, pid):
start_left_time = self.ps_timeout
sleep_time = 0.5
while True:
assert start_left_time >= 0, "wait ps ready failed"
time.sleep(sleep_time)
try:
# the listen_and_serv_op would touch a file which contains the listen port
# on the /tmp directory until it was ready to process all the RPC call.
os.stat("/tmp/paddle.%d.port" % pid)
return
except os.error:
start_left_time -= sleep_time
def _get_pserver_port(self, pid):
with open("/tmp/paddle.%d.port" % pid, 'r') as f:
port = int(f.read().strip())
return port
def _run_lookup_table_op_one_pserver(self, place, port):
scope = fluid.core.Scope()
program = Program()
with fluid.scope_guard(scope):
with program_guard(program, startup_program=Program()):
# create and initialize Param Variable
param = scope.var('W').get_tensor()
param_array = np.full((10, 8), 1.0).astype("float32")
param.set(param_array, place)
ids = scope.var('Ids').get_tensor()
ids_array = np.array([[1], [2], [5]]).astype("int64")
ids.set(ids_array, place)
ids_lod = [[0, 1, 2, 3]]
ids.set_lod(ids_lod)
out = scope.var('Out').get_tensor()
emaps = ['127.0.0.1:' + str(port)]
table_names = ['table']
height_sections = [10]
# create and run sgd operator
lookup_table_op = Operator(
"lookup_table",
W='W',
Ids='Ids',
Out='Out',
remote_prefetch=True,
epmap=emaps,
table_names=table_names,
height_sections=height_sections)
lookup_table_op.run(scope, place)
# get and compare result
result_array = np.array(out)
self.assertEqual(out.lod(), ids_lod)
self.assertEqual(list(result_array.shape), [len(ids_array), 8])
for i in range(len(ids_array)):
id = ids_array[i][0]
self.assertTrue((result_array[i] == id).all())
def _run_lookup_table_op_two_pserver(self, place, port0, port1):
scope = fluid.core.Scope()
program = Program()
with fluid.scope_guard(scope):
with program_guard(program, startup_program=Program()):
# create and initialize Param Variable
param = scope.var('W').get_tensor()
param_array = np.full((10, 8), 1.0).astype("float32")
param.set(param_array, place)
ids = scope.var('Ids').get_tensor()
ids_array = np.array([[1], [2], [11], [13]]).astype("int64")
ids.set(ids_array, place)
ids_lod = [[0, 2, 3, 4]]
ids.set_lod(ids_lod)
out = scope.var('Out').get_tensor()
emaps = ['127.0.0.1:' + str(port0), '127.0.0.1:' + str(port1)]
table_names = ['table', 'table']
height_sections = [10, 20]
# create and run sgd operator
lookup_table_op = Operator(
"lookup_table",
W='W',
Ids='Ids',
Out='Out',
remote_prefetch=True,
epmap=emaps,
table_names=table_names,
height_sections=height_sections)
lookup_table_op.run(scope, place)
# get and compare result
result_array = np.array(out)
self.assertEqual(out.lod(), ids_lod)
self.assertEqual(list(result_array.shape), [len(ids_array), 8])
for i in range(len(ids_array)):
id = ids_array[i][0]
self.assertTrue((result_array[i] == id).all())
def test_lookup_remote_table(self):
os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1"
# run pserver on CPU in sync mode
p0 = self._start_pserver(0, False, True, run_pserver)
self._wait_ps_ready(p0.pid)
port0 = self._get_pserver_port(p0.pid)
p1 = self._start_pserver(1, False, True, run_pserver)
self._wait_ps_ready(p1.pid)
port1 = self._get_pserver_port(p1.pid)
places = [core.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
for place in places:
self._run_lookup_table_op_one_pserver(place, port0)
self._run_lookup_table_op_two_pserver(place, port0, port1)
# raise SIGTERM to pserver
os.kill(p0.pid, signal.SIGINT)
p0.join()
os.kill(p1.pid, signal.SIGINT)
p1.join()
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle.fluid.core as core
from op_test import OpTest
import paddle.fluid as fluid
SIGMOID_THRESHOLD_MIN = -40.0
SIGMOID_THRESHOLD_MAX = 13.0
EXP_MAX_INPUT = 40.0
def lstm_naive(
input,
w, ):
seq_len, batch_size, hidden_size = input.shape
offset = 0
wi = w[offset:offset + hidden_size * hidden_size].reshape(
(hidden_size, hidden_size)).transpose()
offset += hidden_size * hidden_size
wf = w[offset:offset + hidden_size * hidden_size].reshape(
(hidden_size, hidden_size)).transpose()
offset += hidden_size * hidden_size
wc = w[offset:offset + hidden_size * hidden_size].reshape(
(hidden_size, hidden_size)).transpose()
offset += hidden_size * hidden_size
wo = w[offset:offset + hidden_size * hidden_size].reshape(
(hidden_size, hidden_size)).transpose()
offset += hidden_size * hidden_size
ri = w[offset:offset + hidden_size * hidden_size].reshape(
(hidden_size, hidden_size)).transpose()
offset += hidden_size * hidden_size
rf = w[offset:offset + hidden_size * hidden_size].reshape(
(hidden_size, hidden_size)).transpose()
offset += hidden_size * hidden_size
rc = w[offset:offset + hidden_size * hidden_size].reshape(
(hidden_size, hidden_size)).transpose()
offset += hidden_size * hidden_size
ro = w[offset:offset + hidden_size * hidden_size].reshape(
(hidden_size, hidden_size)).transpose()
offset += hidden_size * hidden_size
bi_1 = w[offset:offset + hidden_size]
offset += hidden_size
bf_1 = w[offset:offset + hidden_size]
offset += hidden_size
bc_1 = w[offset:offset + hidden_size]
offset += hidden_size
bo_1 = w[offset:offset + hidden_size]
offset += hidden_size
bi_2 = w[offset:offset + hidden_size]
offset += hidden_size
bf_2 = w[offset:offset + hidden_size]
offset += hidden_size
bc_2 = w[offset:offset + hidden_size]
offset += hidden_size
bo_2 = w[offset:offset + hidden_size]
def sigmoid(x):
y = np.copy(x)
y[x < SIGMOID_THRESHOLD_MIN] = SIGMOID_THRESHOLD_MIN
y[x > SIGMOID_THRESHOLD_MAX] = SIGMOID_THRESHOLD_MAX
return 1. / (1. + np.exp(-y))
def tanh(x):
y = -2. * x
y[y > EXP_MAX_INPUT] = EXP_MAX_INPUT
return (2. / (1. + np.exp(y))) - 1.
output = []
pre_h = np.zeros((batch_size, hidden_size), dtype=input.dtype)
pre_c = np.zeros((batch_size, hidden_size), dtype=input.dtype)
for i in range(seq_len):
emb_1 = input[i]
input_gate = sigmoid(
np.matmul(emb_1, wi) + np.matmul(pre_h, ri) + bi_1 + bi_2)
forget_gate = sigmoid(
np.matmul(emb_1, wf) + np.matmul(pre_h, rf) + bf_1 + bf_2)
output_gate = sigmoid(
np.matmul(emb_1, wo) + np.matmul(pre_h, ro) + bo_1 + bo_2)
c_t_temp = tanh(
np.matmul(emb_1, wc) + np.matmul(pre_h, rc) + bc_1 + bc_2)
new_c = input_gate * c_t_temp + forget_gate * pre_c
new_h = output_gate * tanh(new_c)
pre_h = new_h
pre_c = new_c
output.append(new_h)
output = np.concatenate(output, -1)
output = output.reshape((batch_size, -1, hidden_size))
output = output.transpose((1, 0, 2))
return output, pre_h, pre_c
class TestCUDNNLstmOp(OpTest):
def setUp(self):
self.op_type = "cudnn_lstm"
self.dtype = np.float32
num_steps = 20
batch_size = 5
hidden_size = 20
input_weight_size = (hidden_size * hidden_size) * 4
hidden_weight_size = (hidden_size * hidden_size) * 4
weight_size = input_weight_size + hidden_weight_size
weight_size += hidden_size * 8
input = np.random.uniform(
low=-0.1, high=0.1, size=(num_steps, batch_size,
hidden_size)).astype(self.dtype)
flat_w = np.random.uniform(
low=-0.1, high=0.1, size=(weight_size)).astype(self.dtype)
output, last_hidden, last_cell = lstm_naive(input, flat_w)
init_h = np.zeros((batch_size, hidden_size), dtype=np.float32)
init_c = np.zeros((batch_size, hidden_size), dtype=np.float32)
scope = core.Scope()
program = fluid.Program()
block = program.global_block()
cache_temp = block.create_var(
name="Cache",
persistable=True,
type=core.VarDesc.VarType.RAW,
stop_gradient=True)
self.inputs = {
'Input': OpTest.np_dtype_to_fluid_dtype(input),
'W': OpTest.np_dtype_to_fluid_dtype(flat_w),
'InitH': OpTest.np_dtype_to_fluid_dtype(init_h),
'InitC': OpTest.np_dtype_to_fluid_dtype(init_c),
}
self.cache_name_list = ['Cache']
self.attrs = {
'max_len': num_steps,
'dropout_prob': 0.0,
'is_bidirec': False,
'input_size': hidden_size,
'hidden_size': hidden_size,
'num_layers': 1,
}
self.outputs = {
'Out': output,
"last_h": last_hidden,
'last_c': last_cell
}
def test_output_with_place(self):
if self.testcuda():
place = core.CUDAPlace(0)
self.check_output_with_place(place, atol=1e-5)
def test_grad_with_place(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
self.check_grad_with_place(
place,
set(['Input', 'W', 'InitH', 'InitC']),
['Out', 'last_h', 'last_c'],
max_relative_error=0.02)
def testcuda(self):
return core.is_compiled_with_cuda()
if __name__ == '__main__':
unittest.main()
......@@ -20,10 +20,16 @@ from op_test import OpTest
class TestPad2dOp(OpTest):
def setUp(self):
self.pad_value = 0.0
self.variable_paddings = False
self.initTestCase()
self.op_type = "pad2d"
self.inputs = {'X': np.random.random(self.shape).astype("float32"), }
self.attrs = {}
if self.variable_paddings:
self.attrs['paddings'] = []
self.inputs['Paddings'] = np.array(self.paddings).flatten().astype(
"int32")
else:
self.attrs['paddings'] = np.array(self.paddings).flatten()
self.attrs['pad_value'] = self.pad_value
self.attrs['mode'] = self.mode
......@@ -98,5 +104,24 @@ class TestCase5(TestPad2dOp):
self.data_format = "NHWC"
class TestCase6(TestPad2dOp):
def initTestCase(self):
self.shape = (2, 4, 4, 2)
self.paddings = [0, 1, 2, 3]
self.mode = "constant"
self.pad_value = 1.2
self.data_format = "NHWC"
self.variable_paddings = True
class TestCase7(TestPad2dOp):
def initTestCase(self):
self.shape = (2, 3, 4, 4)
self.paddings = [0, 1, 2, 3]
self.mode = "reflect"
self.data_format = "NCHW"
self.variable_paddings = True
if __name__ == '__main__':
unittest.main()
......@@ -56,6 +56,40 @@ class TestSigmoidCrossEntropyWithLogitsOp2(OpTest):
"""Test sigmoid_cross_entropy_with_logit_op with probabalistic label
"""
def setUp(self):
self.op_type = "sigmoid_cross_entropy_with_logits"
batch_size = 64
num_classes = 20
ignore_index = -1
self.inputs = {
'X': logit(
np.random.uniform(0, 1, (batch_size, num_classes))
.astype("float32")),
'Label': np.random.randint(-1, 2, (batch_size, num_classes))
.astype("float32")
}
self.attrs = {'ignore_index': ignore_index, }
# Fw Pass is implemented as elementwise sigmoid followed by
# elementwise logistic loss
# Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X))
sigmoid_X = expit(self.inputs['X'])
term1 = self.inputs['Label'] * np.log(sigmoid_X)
term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X)
out = -term1 - term2
out[np.where(self.inputs['Label'] == ignore_index)] = 0
self.outputs = {'Out': out}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out')
class TestSigmoidCrossEntropyWithLogitsOp3(OpTest):
"""Test sigmoid_cross_entropy_with_logit_op with probabalistic label
"""
def setUp(self):
self.op_type = "sigmoid_cross_entropy_with_logits"
batch_size = 64
......
......@@ -20,7 +20,7 @@ import paddle.fluid.core as core
from paddle.fluid.op import Operator
def create_op(scope, op_type, inputs, outputs, attrs):
def create_op(scope, op_type, inputs, outputs, attrs, cache_list=None):
kwargs = dict()
op_maker = core.op_proto_and_checker_maker
......@@ -43,6 +43,11 @@ def create_op(scope, op_type, inputs, outputs, attrs):
__create_var__(in_name, sub_in_name)
else:
__create_var__(in_name, in_name)
if cache_list != None and isinstance(cache_list, list):
for name in cache_list:
kwargs[name] = []
scope.var(name)
kwargs[name].append(name)
for out_name, out_dup in Operator.get_op_outputs(op_type):
if out_name in outputs:
......
......@@ -236,6 +236,31 @@ class DistributeTranspiler(object):
else:
raise ValueError("must set trainer_id > 0")
def _get_all_remote_sparse_update_op(self, main_program):
sparse_update_ops = []
sparse_update_op_types = ["lookup_table"]
for op in main_program.global_block().ops:
if op.type in sparse_update_op_types and op.attr(
'remote_prefetch') is True and not op.attr(
'is_distributed'):
sparse_update_ops.append(op)
return sparse_update_ops
def _update_remote_sparse_update_op(self, param_varname, height_sections,
endpint_map, table_names):
for op in self.sparse_update_ops:
if param_varname in op.input_arg_names:
op._set_attr('epmap', endpint_map)
op._set_attr('table_names', table_names)
op._set_attr('height_sections', height_sections)
op._set_attr('trainer_id', self.trainer_id)
def _is_input_of_remote_sparse_update_op(self, param_name):
for op in self.sparse_update_ops:
if param_name in op.input_arg_names:
return True
return False
def transpile(self,
trainer_id,
program=None,
......@@ -299,6 +324,12 @@ class DistributeTranspiler(object):
self.param_name_to_grad_name[param_var.name] = grad_var.name
self.grad_name_to_param_name[grad_var.name] = param_var.name
# get all sparse update ops
self.sparse_update_ops = self._get_all_remote_sparse_update_op(
self.origin_program)
# use_sparse_update_param_name -> split_height_section
self.sparse_param_to_height_sections = dict()
# add distributed attrs to program
self.origin_program._is_distributed = True
self.origin_program._endpoints = self.pserver_endpoints
......@@ -336,6 +367,13 @@ class DistributeTranspiler(object):
splited_grad_varname = splited_vars[0].name
index = find_op_by_output_arg(
program.global_block(), splited_grad_varname, reverse=True)
if splited_vars[0].type == core.VarDesc.VarType.SELECTED_ROWS:
sparse_param_name = self.grad_name_to_param_name[
grad_varname]
if self._is_input_of_remote_sparse_update_op(
sparse_param_name):
self.sparse_param_to_height_sections[
sparse_param_name] = [splited_vars[0].shape[0]]
elif len(splited_vars) > 1:
orig_var = program.global_block().vars[splited_grad_varname]
index = find_op_by_output_arg(
......@@ -406,16 +444,18 @@ class DistributeTranspiler(object):
all_recv_outputs = []
for param_varname, splited_var in six.iteritems(self.param_var_mapping):
eps = []
table_names = []
for var in splited_var:
index = [v.name for v in recv_vars].index(var.name)
eps.append(eplist[index])
table_names.append(var.name)
if self.sync_mode:
recv_dep_in = send_barrier_out
else:
# connect deps to send op in async mode
recv_dep_in = self.grad_name_to_send_dummy_out[
self.param_name_to_grad_name[param_varname]]
all_recv_outputs.extend(splited_var)
# get recv op_role_var, if not splited, the grad should have .trainer suffix
# if splited, grad should be the original grad var name. ParallelExecutor
# will use op_role_var to get expected device place to run this op.
......@@ -425,6 +465,13 @@ class DistributeTranspiler(object):
if len(splited_trainer_grad) == 1:
recv_op_role_var_name = splited_trainer_grad[0].name
if param_varname in self.sparse_param_to_height_sections:
height_sections = self.sparse_param_to_height_sections[
param_varname]
self._update_remote_sparse_update_op(
param_varname, height_sections, eps, table_names)
else:
all_recv_outputs.extend(splited_var)
program.global_block().append_op(
type="recv",
inputs={"X": [recv_dep_in]},
......@@ -454,6 +501,7 @@ class DistributeTranspiler(object):
if len(splited_var) <= 1:
continue
orig_param = program.global_block().vars[param_varname]
if param_varname not in self.sparse_param_to_height_sections:
program.global_block().append_op(
type="concat",
inputs={"X": splited_var},
......@@ -1420,6 +1468,10 @@ to transpile() call.")
height_sections = []
for v in splited_vars:
height_sections.append(v.shape[0])
sparse_param_name = self.grad_name_to_param_name[orig_var.name]
if self._is_input_of_remote_sparse_update_op(sparse_param_name):
self.sparse_param_to_height_sections[
sparse_param_name] = height_sections
program.global_block()._insert_op(
index=index + 1,
type="split_selected_rows",
......
......@@ -62,10 +62,10 @@ class TestBuffered(unittest.TestCase):
for idx, i in enumerate(b()):
elapsed_time = time.time() - last_time
if i == 0:
time.sleep(0.3)
time.sleep(1)
else:
# read time should be short, meaning already buffered.
self.assertLess(elapsed_time, 0.05)
self.assertLess(elapsed_time, 0.08)
last_time = time.time()
......
......@@ -165,9 +165,9 @@ if '${WITH_MKL}' == 'ON':
shutil.copy('${MKLML_LIB}', libs_path)
shutil.copy('${MKLML_IOMP_LIB}', libs_path)
package_data['paddle.libs']+=['libmklml_intel' + ext_name,'libiomp5' + ext_name]
if '${CMAKE_BUILD_TYPE}' == 'Release':
if '${WITH_MKLDNN}' == 'ON':
if '${CMAKE_BUILD_TYPE}' == 'Release':
# only change rpath in Release mode.
if '${WITH_MKLDNN}' == 'ON':
# TODO(typhoonzero): use install_name_tool to patch mkl libs once
# we can support mkl on mac.
#
......@@ -180,8 +180,13 @@ if '${CMAKE_BUILD_TYPE}' == 'Release':
package_data['paddle.libs']+=['libmkldnn.so.0']
shutil.copy('${MKLDNN_SHARED_LIB}', libs_path)
if '${WITH_NGRAPH}' == 'ON':
# only change rpath in Release mode,
# since in Debug mode, nGraph lib may be too large to be changed?
if '${CMAKE_BUILD_TYPE}' == 'Release':
# only change rpath in Release mode.
if os.name != 'nt':
if "@APPLE@" == "1":
command = "install_name_tool -id \"@loader_path/\" ${NGRAPH_SHARED_LIB}"
else:
command = "patchelf --set-rpath '$ORIGIN/' ${NGRAPH_SHARED_LIB}"
if os.system(command) != 0:
raise Exception("patch ${NGRAPH_SHARED_LIB_NAME} failed, command: %s" % command)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册