提交 0abf173e 编写于 作者: M minqiyang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into update_simple_distranspiler

...@@ -58,6 +58,8 @@ PaddlePaddle uses this [Git branching model](http://nvie.com/posts/a-successful- ...@@ -58,6 +58,8 @@ PaddlePaddle uses this [Git branching model](http://nvie.com/posts/a-successful-
create mode 100644 233 create mode 100644 233
``` ```
NOTE: The `yapf` versions installed by `pip install pre-commit` and `conda install -c conda-forge pre-commit` differ slightly. Paddle developers use `pip install pre-commit`.
1. Build and test 1. Build and test
Users can build PaddlePaddle natively on Linux and Mac OS X. But to unify the building environment and to make it easy for debugging, the recommended way is [using Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/build_en.md). Users can build PaddlePaddle natively on Linux and Mac OS X. But to unify the building environment and to make it easy for debugging, the recommended way is [using Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/build_en.md).
......
...@@ -98,6 +98,8 @@ def parse_args(): ...@@ -98,6 +98,8 @@ def parse_args():
'--use_fake_data', '--use_fake_data',
action='store_true', action='store_true',
help='If set ommit the actual read data operators.') help='If set ommit the actual read data operators.')
parser.add_argument(
'--profile', action='store_true', help='If set, profile a few steps.')
parser.add_argument( parser.add_argument(
'--update_method', '--update_method',
type=str, type=str,
...@@ -108,8 +110,8 @@ def parse_args(): ...@@ -108,8 +110,8 @@ def parse_args():
return args return args
def append_nccl2_prepare(): def append_nccl2_prepare(trainer_id):
if os.getenv("PADDLE_TRAINER_ID", None) != None: if trainer_id >= 0:
# append gen_nccl_id at the end of startup program # append gen_nccl_id at the end of startup program
trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
port = os.getenv("PADDLE_PSERVER_PORT") port = os.getenv("PADDLE_PSERVER_PORT")
...@@ -136,12 +138,12 @@ def append_nccl2_prepare(): ...@@ -136,12 +138,12 @@ def append_nccl2_prepare():
}) })
return nccl_id_var, num_trainers, trainer_id return nccl_id_var, num_trainers, trainer_id
else: else:
raise Exception( raise Exception("must set positive PADDLE_TRAINER_ID env variables for "
"must set PADDLE_TRAINER_ID env variables for dist train.") "nccl-based dist train.")
def dist_transpile(): def dist_transpile(trainer_id):
if "PADDLE_TRAINING_ROLE" not in os.environ: if trainer_id < 0:
return None, None return None, None
# the port of all pservers, needed by both trainer and pserver # the port of all pservers, needed by both trainer and pserver
...@@ -158,9 +160,6 @@ def dist_transpile(): ...@@ -158,9 +160,6 @@ def dist_transpile():
trainers = int(os.getenv("PADDLE_TRAINERS")) trainers = int(os.getenv("PADDLE_TRAINERS"))
# the IP of the local machine, needed by pserver only # the IP of the local machine, needed by pserver only
current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
# the unique trainer id, starting from 0, needed by trainer
# only
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
# the role, should be either PSERVER or TRAINER # the role, should be either PSERVER or TRAINER
training_role = os.getenv("PADDLE_TRAINING_ROLE") training_role = os.getenv("PADDLE_TRAINING_ROLE")
...@@ -295,6 +294,11 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, ...@@ -295,6 +294,11 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
iters = 0 iters = 0
start_time = time.time() start_time = time.time()
for batch_id, data in enumerate(train_reader()): for batch_id, data in enumerate(train_reader()):
if args.profile and pass_id == 0 and batch_id == 5:
profiler.start_profiler("All")
elif args.profile and pass_id == 0 and batch_id == 10:
profiler.stop_profiler("total", "/tmp/profile_%d" % trainer_id)
if iters == args.skip_batch_num: if iters == args.skip_batch_num:
start_time = time.time() start_time = time.time()
num_samples = 0 num_samples = 0
...@@ -334,7 +338,11 @@ def print_arguments(args): ...@@ -334,7 +338,11 @@ def print_arguments(args):
def main(): def main():
args = parse_args() args = parse_args()
print_arguments(args) print_arguments(args)
nccl_id_var, num_trainers, trainer_id = None, 1, 0
# the unique trainer id, starting from 0, needed by trainer
# only
nccl_id_var, num_trainers, trainer_id = (
None, 1, int(os.getenv("PADDLE_TRAINER_ID", "-1")))
if args.use_cprof: if args.use_cprof:
pr = cProfile.Profile() pr = cProfile.Profile()
...@@ -348,7 +356,7 @@ def main(): ...@@ -348,7 +356,7 @@ def main():
fluid.memory_optimize(fluid.default_main_program()) fluid.memory_optimize(fluid.default_main_program())
if args.update_method == "pserver": if args.update_method == "pserver":
train_prog, startup_prog = dist_transpile() train_prog, startup_prog = dist_transpile(trainer_id)
if not train_prog: if not train_prog:
raise Exception( raise Exception(
"Must configure correct environments to run dist train.") "Must configure correct environments to run dist train.")
...@@ -364,7 +372,7 @@ def main(): ...@@ -364,7 +372,7 @@ def main():
train_args.append(fluid.default_startup_program()) train_args.append(fluid.default_startup_program())
if args.update_method == "nccl2": if args.update_method == "nccl2":
nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare() nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(trainer_id)
if args.gpus == 1: if args.gpus == 1:
# NOTE: parallel executor uses profiler internally # NOTE: parallel executor uses profiler internally
if args.use_nvprof and args.device == 'GPU': if args.use_nvprof and args.device == 'GPU':
......
...@@ -86,7 +86,7 @@ ...@@ -86,7 +86,7 @@
<br> <br>
<p align="center"> <p align="center">
<img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/fluid_compiler.png" width=100%> <img src="https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/doc/fluid/images/fluid-compiler.png" width=100%>
</p> </p>
--- ---
......
...@@ -17,3 +17,4 @@ ...@@ -17,3 +17,4 @@
:maxdepth: 1 :maxdepth: 1
concepts/use_concepts_cn.rst concepts/use_concepts_cn.rst
developer's_guide_to_paddle_fluid.md
...@@ -16,3 +16,4 @@ Here is an example of linear regression. It introduces workflow of PaddlePaddle, ...@@ -16,3 +16,4 @@ Here is an example of linear regression. It introduces workflow of PaddlePaddle,
:maxdepth: 1 :maxdepth: 1
concepts/index_en.rst concepts/index_en.rst
developer's_guide_to_paddle_fluid.md
...@@ -11,7 +11,7 @@ PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14. ...@@ -11,7 +11,7 @@ PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14.
pip install paddlepaddle pip install paddlepaddle
如果需要安装支持GPU的版本(cuda7.5_cudnn5_avx_openblas),需要执行: 如果需要安装支持GPU的版本(cuda8.0_cudnn5_avx_openblas),需要执行:
.. code-block:: bash .. code-block:: bash
......
...@@ -12,7 +12,7 @@ Simply run the following command to install, the version is cpu_avx_openblas: ...@@ -12,7 +12,7 @@ Simply run the following command to install, the version is cpu_avx_openblas:
pip install paddlepaddle pip install paddlepaddle
If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run: If you need to install GPU version (cuda8.0_cudnn5_avx_openblas), run:
.. code-block:: bash .. code-block:: bash
......
...@@ -51,6 +51,8 @@ Paddle 开发人员使用 [pre-commit](http://pre-commit.com/) 工具来管理 G ...@@ -51,6 +51,8 @@ Paddle 开发人员使用 [pre-commit](http://pre-commit.com/) 工具来管理 G
Paddle 使用 `clang-format` 来调整 C/C++ 源代码格式,请确保 `clang-format` 版本在 3.8 以上。 Paddle 使用 `clang-format` 来调整 C/C++ 源代码格式,请确保 `clang-format` 版本在 3.8 以上。
注:通过 `pip install pre-commit` 和 `conda install -c conda-forge pre-commit` 安装的 `yapf` 稍有不同,Paddle 开发人员使用的是 `pip install pre-commit`。
## 开始开发 ## 开始开发
在本例中,我删除了 README.md 中的一行,并创建了一个新文件。 在本例中,我删除了 README.md 中的一行,并创建了一个新文件。
......
...@@ -13,7 +13,11 @@ ...@@ -13,7 +13,11 @@
# limitations under the License. # limitations under the License.
# #
function(inference_api_test TARGET_NAME TEST_SRC DEP_TEST) if(APPLE)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
endif(APPLE)
function(inference_api_test TARGET_NAME TEST_SRC)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs ARGS) set(multiValueArgs ARGS)
...@@ -34,6 +38,8 @@ function(inference_api_test TARGET_NAME TEST_SRC DEP_TEST) ...@@ -34,6 +38,8 @@ function(inference_api_test TARGET_NAME TEST_SRC DEP_TEST)
SRCS ${TEST_SRC} SRCS ${TEST_SRC}
DEPS paddle_fluid_api paddle_inference_api paddle_inference_api_impl DEPS paddle_fluid_api paddle_inference_api paddle_inference_api_impl
ARGS --dirname=${PYTHON_TESTS_DIR}/book/) ARGS --dirname=${PYTHON_TESTS_DIR}/book/)
# TODO(panyx0178): Figure out how to add word2vec and image_classification
# as deps.
# set_tests_properties(${TARGET_NAME} # set_tests_properties(${TARGET_NAME}
# PROPERTIES DEPENDS ${DEP_TEST}) # PROPERTIES DEPENDS ${DEP_TEST})
endforeach() endforeach()
...@@ -53,5 +59,4 @@ cc_test(test_paddle_inference_api ...@@ -53,5 +59,4 @@ cc_test(test_paddle_inference_api
DEPS paddle_inference_api) DEPS paddle_inference_api)
inference_api_test(test_paddle_inference_api_impl inference_api_test(test_paddle_inference_api_impl
test_paddle_inference_api_impl.cc test_paddle_inference_api_impl.cc)
test_word2vec)
...@@ -102,8 +102,8 @@ bool PaddlePredictorImpl::Run(const std::vector<PaddleTensor> &inputs, ...@@ -102,8 +102,8 @@ bool PaddlePredictorImpl::Run(const std::vector<PaddleTensor> &inputs,
Timer timer; Timer timer;
timer.tic(); timer.tic();
// set feed variable // set feed variable
std::map<std::string, const paddle::framework::LoDTensor *> feed_targets; std::map<std::string, const framework::LoDTensor *> feed_targets;
std::vector<paddle::framework::LoDTensor> feeds; std::vector<framework::LoDTensor> feeds;
if (!SetFeed(inputs, &feeds)) { if (!SetFeed(inputs, &feeds)) {
LOG(ERROR) << "fail to set feed"; LOG(ERROR) << "fail to set feed";
return false; return false;
...@@ -112,8 +112,8 @@ bool PaddlePredictorImpl::Run(const std::vector<PaddleTensor> &inputs, ...@@ -112,8 +112,8 @@ bool PaddlePredictorImpl::Run(const std::vector<PaddleTensor> &inputs,
feed_targets[feed_target_names_[i]] = &feeds[i]; feed_targets[feed_target_names_[i]] = &feeds[i];
} }
// get fetch variable // get fetch variable
std::map<std::string, paddle::framework::LoDTensor *> fetch_targets; std::map<std::string, framework::LoDTensor *> fetch_targets;
std::vector<paddle::framework::LoDTensor> fetchs; std::vector<framework::LoDTensor> fetchs;
fetchs.resize(fetch_target_names_.size()); fetchs.resize(fetch_target_names_.size());
for (size_t i = 0; i < fetch_target_names_.size(); ++i) { for (size_t i = 0; i < fetch_target_names_.size(); ++i) {
fetch_targets[fetch_target_names_[i]] = &fetchs[i]; fetch_targets[fetch_target_names_[i]] = &fetchs[i];
...@@ -149,25 +149,24 @@ bool PaddlePredictorImpl::InitShared() { ...@@ -149,25 +149,24 @@ bool PaddlePredictorImpl::InitShared() {
VLOG(3) << "Predictor::init_shared"; VLOG(3) << "Predictor::init_shared";
// 1. Define place, executor, scope // 1. Define place, executor, scope
if (this->config_.device >= 0) { if (this->config_.device >= 0) {
place_ = paddle::platform::CUDAPlace(); place_ = platform::CUDAPlace();
} else { } else {
place_ = paddle::platform::CPUPlace(); place_ = platform::CPUPlace();
} }
this->executor_.reset(new paddle::framework::Executor(this->place_)); this->executor_.reset(new framework::Executor(this->place_));
this->scope_.reset(new paddle::framework::Scope()); this->scope_.reset(new framework::Scope());
// Initialize the inference program // Initialize the inference program
if (!this->config_.model_dir.empty()) { if (!this->config_.model_dir.empty()) {
// Parameters are saved in separate files sited in // Parameters are saved in separate files sited in
// the specified `dirname`. // the specified `dirname`.
this->inference_program_ = paddle::inference::Load( this->inference_program_ = inference::Load(
this->executor_.get(), this->scope_.get(), this->config_.model_dir); this->executor_.get(), this->scope_.get(), this->config_.model_dir);
} else if (!this->config_.prog_file.empty() && } else if (!this->config_.prog_file.empty() &&
!this->config_.param_file.empty()) { !this->config_.param_file.empty()) {
// All parameters are saved in a single file. // All parameters are saved in a single file.
// The file names should be consistent with that used // The file names should be consistent with that used
// in Python API `fluid.io.save_inference_model`. // in Python API `fluid.io.save_inference_model`.
this->inference_program_ = this->inference_program_ = inference::Load(this->executor_.get(),
paddle::inference::Load(this->executor_.get(),
this->scope_.get(), this->scope_.get(),
this->config_.prog_file, this->config_.prog_file,
this->config_.param_file); this->config_.param_file);
...@@ -185,24 +184,21 @@ bool PaddlePredictorImpl::InitShared() { ...@@ -185,24 +184,21 @@ bool PaddlePredictorImpl::InitShared() {
return true; return true;
} }
bool PaddlePredictorImpl::SetFeed( bool PaddlePredictorImpl::SetFeed(const std::vector<PaddleTensor> &inputs,
const std::vector<PaddleTensor> &inputs, std::vector<framework::LoDTensor> *feeds) {
std::vector<paddle::framework::LoDTensor> *feeds) {
VLOG(3) << "Predictor::set_feed"; VLOG(3) << "Predictor::set_feed";
if (inputs.size() != feed_target_names_.size()) { if (inputs.size() != feed_target_names_.size()) {
LOG(ERROR) << "wrong feed input size."; LOG(ERROR) << "wrong feed input size.";
return false; return false;
} }
for (size_t i = 0; i < feed_target_names_.size(); ++i) { for (size_t i = 0; i < feed_target_names_.size(); ++i) {
paddle::framework::LoDTensor input; framework::LoDTensor input;
paddle::framework::DDim ddim = framework::DDim ddim = framework::make_ddim(inputs[i].shape);
paddle::framework::make_ddim(inputs[i].shape);
void *input_ptr; void *input_ptr;
if (inputs[i].dtype == PaddleDType::INT64) { if (inputs[i].dtype == PaddleDType::INT64) {
input_ptr = input_ptr = input.mutable_data<int64_t>(ddim, platform::CPUPlace());
input.mutable_data<int64_t>(ddim, paddle::platform::CPUPlace());
} else if (inputs[i].dtype == PaddleDType::FLOAT32) { } else if (inputs[i].dtype == PaddleDType::FLOAT32) {
input_ptr = input.mutable_data<float>(ddim, paddle::platform::CPUPlace()); input_ptr = input.mutable_data<float>(ddim, platform::CPUPlace());
} else { } else {
LOG(ERROR) << "unsupported feed type " << inputs[i].dtype; LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
return false; return false;
...@@ -213,13 +209,12 @@ bool PaddlePredictorImpl::SetFeed( ...@@ -213,13 +209,12 @@ bool PaddlePredictorImpl::SetFeed(
inputs[i].data.data, inputs[i].data.data,
inputs[i].data.length); inputs[i].data.length);
feeds->push_back(input); feeds->push_back(input);
LOG(ERROR) << "Actual feed type " << feeds->back().type().name();
} }
return true; return true;
} }
bool PaddlePredictorImpl::GetFetch( bool PaddlePredictorImpl::GetFetch(
const std::vector<paddle::framework::LoDTensor> &fetchs, const std::vector<framework::LoDTensor> &fetchs,
std::vector<PaddleTensor> *outputs) { std::vector<PaddleTensor> *outputs) {
VLOG(3) << "Predictor::get_fetch"; VLOG(3) << "Predictor::get_fetch";
outputs->resize(fetchs.size()); outputs->resize(fetchs.size());
...@@ -284,8 +279,9 @@ bool PaddlePredictorImpl::GetFetch( ...@@ -284,8 +279,9 @@ bool PaddlePredictorImpl::GetFetch(
return true; return true;
} }
std::unique_ptr<PaddlePredictorImpl> CreatePaddlePredictorImpl( template <>
const VisConfig &config) { std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(
const ConfigImpl &config) {
VLOG(3) << "create PaddlePredictorImpl"; VLOG(3) << "create PaddlePredictorImpl";
// 1. GPU memory // 1. GPU memory
std::vector<std::string> flags; std::vector<std::string> flags;
...@@ -299,12 +295,11 @@ std::unique_ptr<PaddlePredictorImpl> CreatePaddlePredictorImpl( ...@@ -299,12 +295,11 @@ std::unique_ptr<PaddlePredictorImpl> CreatePaddlePredictorImpl(
framework::InitGflags(flags); framework::InitGflags(flags);
} }
std::unique_ptr<PaddlePredictorImpl> predictor( std::unique_ptr<PaddlePredictor> predictor(new PaddlePredictorImpl(config));
new PaddlePredictorImpl(config)); if (!dynamic_cast<PaddlePredictorImpl *>(predictor.get())->Init()) {
if (!predictor->Init()) {
return nullptr; return nullptr;
} }
return predictor; return std::move(predictor);
} }
} // namespace paddle } // namespace paddle
...@@ -29,7 +29,7 @@ ...@@ -29,7 +29,7 @@
namespace paddle { namespace paddle {
struct VisConfig : public PaddlePredictor::Config { struct ConfigImpl : public PaddlePredictor::Config {
int device; int device;
float fraction_of_gpu_memory; float fraction_of_gpu_memory;
std::string prog_file; std::string prog_file;
...@@ -37,12 +37,9 @@ struct VisConfig : public PaddlePredictor::Config { ...@@ -37,12 +37,9 @@ struct VisConfig : public PaddlePredictor::Config {
bool share_variables; bool share_variables;
}; };
/*
* Do not use this, just a demo indicating how to customize a Predictor.
*/
class PaddlePredictorImpl : public PaddlePredictor { class PaddlePredictorImpl : public PaddlePredictor {
public: public:
explicit PaddlePredictorImpl(const VisConfig &config) : config_(config) {} explicit PaddlePredictorImpl(const ConfigImpl &config) : config_(config) {}
bool Init(); bool Init();
...@@ -56,21 +53,18 @@ class PaddlePredictorImpl : public PaddlePredictor { ...@@ -56,21 +53,18 @@ class PaddlePredictorImpl : public PaddlePredictor {
private: private:
bool InitShared() override; bool InitShared() override;
bool SetFeed(const std::vector<PaddleTensor> &input_datas, bool SetFeed(const std::vector<PaddleTensor> &input_datas,
std::vector<paddle::framework::LoDTensor> *feeds); std::vector<framework::LoDTensor> *feeds);
bool GetFetch(const std::vector<paddle::framework::LoDTensor> &fetchs, bool GetFetch(const std::vector<framework::LoDTensor> &fetchs,
std::vector<PaddleTensor> *output_data); std::vector<PaddleTensor> *output_data);
VisConfig config_; ConfigImpl config_;
paddle::platform::Place place_; platform::Place place_;
std::unique_ptr<paddle::framework::Executor> executor_; std::unique_ptr<framework::Executor> executor_;
std::unique_ptr<paddle::framework::Scope> scope_; std::unique_ptr<framework::Scope> scope_;
std::unique_ptr<paddle::framework::ExecutorPrepareContext> ctx_; std::unique_ptr<framework::ExecutorPrepareContext> ctx_;
std::unique_ptr<paddle::framework::ProgramDesc> inference_program_; std::unique_ptr<framework::ProgramDesc> inference_program_;
std::vector<std::string> feed_target_names_; std::vector<std::string> feed_target_names_;
std::vector<std::string> fetch_target_names_; std::vector<std::string> fetch_target_names_;
}; };
std::unique_ptr<PaddlePredictorImpl> CreatePaddlePredictorImpl(
const VisConfig &config);
} // namespace paddle } // namespace paddle
...@@ -40,16 +40,19 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { ...@@ -40,16 +40,19 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) {
return pt; return pt;
} }
TEST(paddle_inference_api_impl, word2vec) { ConfigImpl GetConfig() {
VisConfig config; ConfigImpl config;
config.model_dir = FLAGS_dirname + "word2vec.inference.model"; config.model_dir = FLAGS_dirname + "word2vec.inference.model";
LOG(INFO) << "dirname " << config.model_dir; LOG(INFO) << "dirname " << config.model_dir;
config.fraction_of_gpu_memory = 0.15; config.fraction_of_gpu_memory = 0.15;
config.device = 0; config.device = 0;
config.share_variables = true; config.share_variables = true;
return config;
}
std::unique_ptr<PaddlePredictorImpl> predictor = TEST(paddle_inference_api_impl, word2vec) {
CreatePaddlePredictorImpl(config); ConfigImpl config = GetConfig();
std::unique_ptr<PaddlePredictor> predictor = CreatePaddlePredictor(config);
framework::LoDTensor first_word, second_word, third_word, fourth_word; framework::LoDTensor first_word, second_word, third_word, fourth_word;
framework::LoD lod{{0, 1}}; framework::LoD lod{{0, 1}};
...@@ -60,24 +63,91 @@ TEST(paddle_inference_api_impl, word2vec) { ...@@ -60,24 +63,91 @@ TEST(paddle_inference_api_impl, word2vec) {
SetupLoDTensor(&third_word, lod, static_cast<int64_t>(0), dict_size - 1); SetupLoDTensor(&third_word, lod, static_cast<int64_t>(0), dict_size - 1);
SetupLoDTensor(&fourth_word, lod, static_cast<int64_t>(0), dict_size - 1); SetupLoDTensor(&fourth_word, lod, static_cast<int64_t>(0), dict_size - 1);
std::vector<PaddleTensor> cpu_feeds; std::vector<PaddleTensor> paddle_tensor_feeds;
cpu_feeds.push_back(LodTensorToPaddleTensor(&first_word)); paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&first_word));
cpu_feeds.push_back(LodTensorToPaddleTensor(&second_word)); paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&second_word));
cpu_feeds.push_back(LodTensorToPaddleTensor(&third_word)); paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&third_word));
cpu_feeds.push_back(LodTensorToPaddleTensor(&fourth_word)); paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&fourth_word));
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
ASSERT_TRUE(predictor->Run(cpu_feeds, &outputs)); ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
ASSERT_EQ(outputs.size(), 1UL); ASSERT_EQ(outputs.size(), 1UL);
for (size_t i = 0; i < outputs.size(); ++i) { size_t len = outputs[0].data.length;
size_t len = outputs[i].data.length; float* data = static_cast<float*>(outputs[0].data.data);
float* data = static_cast<float*>(outputs[i].data.data); for (int j = 0; j < len / sizeof(float); ++j) {
for (size_t j = 0; j < len / sizeof(float); ++j) {
ASSERT_LT(data[j], 1.0); ASSERT_LT(data[j], 1.0);
ASSERT_GT(data[j], -1.0); ASSERT_GT(data[j], -1.0);
} }
free(outputs[i].data.data);
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&first_word);
cpu_feeds.push_back(&second_word);
cpu_feeds.push_back(&third_word);
cpu_feeds.push_back(&fourth_word);
framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
TestInference<platform::CPUPlace>(config.model_dir, cpu_feeds, cpu_fetchs1);
float* lod_data = output1.data<float>();
for (size_t i = 0; i < output1.numel(); ++i) {
EXPECT_LT(lod_data[i] - data[i], 1e-3);
EXPECT_GT(lod_data[i] - data[i], -1e-3);
}
free(outputs[0].data.data);
}
TEST(paddle_inference_api_impl, image_classification) {
int batch_size = 2;
bool use_mkldnn = false;
bool repeat = false;
ConfigImpl config = GetConfig();
config.model_dir =
FLAGS_dirname + "image_classification_resnet.inference.model";
const bool is_combined = false;
std::vector<std::vector<int64_t>> feed_target_shapes =
GetFeedTargetShapes(config.model_dir, is_combined);
framework::LoDTensor input;
// Use normalized image pixels as input data,
// which should be in the range [0.0, 1.0].
feed_target_shapes[0][0] = batch_size;
framework::DDim input_dims = framework::make_ddim(feed_target_shapes[0]);
SetupTensor<float>(
&input, input_dims, static_cast<float>(0), static_cast<float>(1));
std::vector<framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input);
framework::LoDTensor output1;
std::vector<framework::LoDTensor*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
TestInference<platform::CPUPlace, false, true>(config.model_dir,
cpu_feeds,
cpu_fetchs1,
repeat,
is_combined,
use_mkldnn);
std::unique_ptr<PaddlePredictor> predictor = CreatePaddlePredictor(config);
std::vector<PaddleTensor> paddle_tensor_feeds;
paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&input));
std::vector<PaddleTensor> outputs;
ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs));
ASSERT_EQ(outputs.size(), 1UL);
size_t len = outputs[0].data.length;
float* data = static_cast<float*>(outputs[0].data.data);
float* lod_data = output1.data<float>();
for (size_t j = 0; j < len / sizeof(float); ++j) {
EXPECT_LT(lod_data[j] - data[j], 1e-10);
EXPECT_GT(lod_data[j] - data[j], -1e-10);
} }
free(data);
} }
} // namespace paddle } // namespace paddle
...@@ -469,6 +469,7 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -469,6 +469,7 @@ class RuntimeInferShapeContext : public InferShapeContext {
protected: protected:
DDim GetDim(const std::string& name) const override { DDim GetDim(const std::string& name) const override {
Variable* var = scope_.FindVar(name); Variable* var = scope_.FindVar(name);
PADDLE_ENFORCE_NOT_NULL(var);
if (var->IsType<LoDTensor>()) { if (var->IsType<LoDTensor>()) {
return var->Get<LoDTensor>().dims(); return var->Get<LoDTensor>().dims();
} else if (var->IsType<SelectedRows>()) { } else if (var->IsType<SelectedRows>()) {
......
...@@ -18,8 +18,8 @@ namespace paddle { ...@@ -18,8 +18,8 @@ namespace paddle {
namespace framework { namespace framework {
struct ReAllocateVisitor { struct ReAllocateVisitor {
ReAllocateVisitor(framework::Tensor* tensor, const framework::DDim& dims) ReAllocateVisitor(const framework::DDim& dims, framework::Tensor* tensor)
: tensor_(tensor), dims_(dims) {} : dims_(dims), tensor_(tensor) {}
template <typename T> template <typename T>
void operator()() const { void operator()() const {
...@@ -34,8 +34,8 @@ struct ReAllocateVisitor { ...@@ -34,8 +34,8 @@ struct ReAllocateVisitor {
tensor_->ShareDataWith(cpu_tensor); tensor_->ShareDataWith(cpu_tensor);
} }
framework::Tensor* tensor_;
framework::DDim dims_; framework::DDim dims_;
framework::Tensor* tensor_;
}; };
struct TensorCopyVisitor { struct TensorCopyVisitor {
...@@ -158,6 +158,7 @@ bool SelectedRows::Set(int64_t key, const framework::Tensor& value) { ...@@ -158,6 +158,7 @@ bool SelectedRows::Set(int64_t key, const framework::Tensor& value) {
} }
PADDLE_ENFORCE_EQ(value.dims()[0], static_cast<size_t>(1), PADDLE_ENFORCE_EQ(value.dims()[0], static_cast<size_t>(1),
"The first dim of value should be 1."); "The first dim of value should be 1.");
std::lock_guard<std::mutex> lock(*auto_grown_mutex_.get());
auto index = Index(key); auto index = Index(key);
bool is_new_key = false; bool is_new_key = false;
if (index == -1) { if (index == -1) {
...@@ -169,7 +170,7 @@ bool SelectedRows::Set(int64_t key, const framework::Tensor& value) { ...@@ -169,7 +170,7 @@ bool SelectedRows::Set(int64_t key, const framework::Tensor& value) {
auto dims = value_->dims(); auto dims = value_->dims();
dims[0] = (dims[0] + 1) << 1; dims[0] = (dims[0] + 1) << 1;
framework::VisitDataType(framework::ToDataType(value.type()), framework::VisitDataType(framework::ToDataType(value.type()),
ReAllocateVisitor(value_.get(), dims)); ReAllocateVisitor(dims, value_.get()));
} }
} }
......
...@@ -15,6 +15,8 @@ limitations under the License. */ ...@@ -15,6 +15,8 @@ limitations under the License. */
#pragma once #pragma once
#include <algorithm> #include <algorithm>
#include <memory>
#include <mutex> // NOLINT
#include <utility> #include <utility>
#include <vector> #include <vector>
...@@ -46,11 +48,13 @@ class SelectedRows { ...@@ -46,11 +48,13 @@ class SelectedRows {
SelectedRows(const std::vector<int64_t>& rows, const int64_t& height) SelectedRows(const std::vector<int64_t>& rows, const int64_t& height)
: rows_(rows), height_(height) { : rows_(rows), height_(height) {
value_.reset(new Tensor()); value_.reset(new Tensor());
auto_grown_mutex_.reset(new std::mutex);
} }
SelectedRows() { SelectedRows() {
height_ = 0; height_ = 0;
value_.reset(new Tensor()); value_.reset(new Tensor());
auto_grown_mutex_.reset(new std::mutex);
} }
platform::Place place() const { return value_->place(); } platform::Place place() const { return value_->place(); }
...@@ -125,6 +129,7 @@ class SelectedRows { ...@@ -125,6 +129,7 @@ class SelectedRows {
Vector<int64_t> rows_; Vector<int64_t> rows_;
std::unique_ptr<Tensor> value_{nullptr}; std::unique_ptr<Tensor> value_{nullptr};
int64_t height_; int64_t height_;
std::unique_ptr<std::mutex> auto_grown_mutex_{nullptr};
}; };
/* /*
......
...@@ -131,6 +131,20 @@ void* TensorRTEngine::GetOutputInGPU(const std::string& name) { ...@@ -131,6 +131,20 @@ void* TensorRTEngine::GetOutputInGPU(const std::string& name) {
return buffer(name).buffer; return buffer(name).buffer;
} }
void TensorRTEngine::GetOutputInGPU(const std::string& name, void* dst,
size_t max_size) {
// determine data size
auto it = buffer_sizes_.find(name);
PADDLE_ENFORCE(it != buffer_sizes_.end());
PADDLE_ENFORCE_GT(it->second, 0);
PADDLE_ENFORCE_GE(max_size, it->second);
auto& buf = buffer(name);
PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, it->second,
cudaMemcpyDeviceToDevice, *stream_),
0);
}
void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst, void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst,
size_t max_size) { size_t max_size) {
// determine data size // determine data size
...@@ -152,7 +166,7 @@ Buffer& TensorRTEngine::buffer(const std::string& name) { ...@@ -152,7 +166,7 @@ Buffer& TensorRTEngine::buffer(const std::string& name) {
return buffers_[slot_offset]; return buffers_[slot_offset];
} }
void TensorRTEngine::SetInputFromCPU(const std::string& name, void* data, void TensorRTEngine::SetInputFromCPU(const std::string& name, const void* data,
size_t size) { size_t size) {
auto& buf = buffer(name); auto& buf = buffer(name);
PADDLE_ENFORCE_NOT_NULL(buf.buffer); PADDLE_ENFORCE_NOT_NULL(buf.buffer);
...@@ -162,6 +176,16 @@ void TensorRTEngine::SetInputFromCPU(const std::string& name, void* data, ...@@ -162,6 +176,16 @@ void TensorRTEngine::SetInputFromCPU(const std::string& name, void* data,
cudaMemcpyHostToDevice, *stream_)); cudaMemcpyHostToDevice, *stream_));
} }
void TensorRTEngine::SetInputFromGPU(const std::string& name, const void* data,
size_t size) {
auto& buf = buffer(name);
PADDLE_ENFORCE_NOT_NULL(buf.buffer);
PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
PADDLE_ENFORCE(buf.device == DeviceType::GPU);
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
cudaMemcpyDeviceToDevice, *stream_));
}
void TensorRTEngine::SetITensor(const std::string& name, void TensorRTEngine::SetITensor(const std::string& name,
nvinfer1::ITensor* tensor) { nvinfer1::ITensor* tensor) {
PADDLE_ENFORCE(tensor != nullptr); PADDLE_ENFORCE(tensor != nullptr);
......
...@@ -92,13 +92,15 @@ class TensorRTEngine : public EngineBase { ...@@ -92,13 +92,15 @@ class TensorRTEngine : public EngineBase {
cudaStream_t* stream() { return stream_; } cudaStream_t* stream() { return stream_; }
// Fill an input from CPU memory with name and size. // Fill an input from CPU memory with name and size.
void SetInputFromCPU(const std::string& name, void* data, size_t size); void SetInputFromCPU(const std::string& name, const void* data, size_t size);
// TODO(Superjomn) is this method necessary given that buffer(xxx) can be // TODO(Superjomn) is this method necessary given that buffer(xxx) can be
// accessed directly. Fill an input from GPU memory with name and size. // accessed directly. Fill an input from GPU memory with name and size.
void SetInputFromGPU(const std::string& name, void* data, size_t size); void SetInputFromGPU(const std::string& name, const void* data, size_t size);
// Get an output called name, the output of tensorrt is in GPU, so this method // Get an output called name, the output of tensorrt is in GPU, so this method
// will just return the output's GPU memory address. // Return the output's GPU memory address without copy.
void* GetOutputInGPU(const std::string& name); void* GetOutputInGPU(const std::string& name);
// Copy data into dst inside the GPU device.
void GetOutputInGPU(const std::string& name, void* dst, size_t max_size);
// LOW EFFICIENCY! Get output to CPU, this will trigger a memory copy from GPU // LOW EFFICIENCY! Get output to CPU, this will trigger a memory copy from GPU
// to CPU. // to CPU.
void GetOutputInCPU(const std::string& name, void* dst, size_t max_size); void GetOutputInCPU(const std::string& name, void* dst, size_t max_size);
......
...@@ -168,6 +168,8 @@ function(op_library TARGET) ...@@ -168,6 +168,8 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP(relu);\n") file(APPEND ${pybind_file} "USE_OP(relu);\n")
elseif(${TARGET} STREQUAL "reduce") elseif(${TARGET} STREQUAL "reduce")
file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n") file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n")
elseif(${TARGET} STREQUAL "fake_dequantize")
file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
else() else()
file(APPEND ${pybind_file} "USE_OP(${TARGET});\n") file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
endif() endif()
...@@ -223,6 +225,11 @@ op_library(cross_entropy_op DEPS cross_entropy) ...@@ -223,6 +225,11 @@ op_library(cross_entropy_op DEPS cross_entropy)
op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
op_library(softmax_op DEPS softmax) op_library(softmax_op DEPS softmax)
op_library(sequence_softmax_op DEPS softmax) op_library(sequence_softmax_op DEPS softmax)
if (WITH_GPU AND TENSORRT_FOUND)
op_library(tensorrt_engine_op DEPS tensorrt_engine)
else()
set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op)
endif()
op_library(sum_op DEPS selected_rows_functor) op_library(sum_op DEPS selected_rows_functor)
op_library(sgd_op DEPS selected_rows_functor) op_library(sgd_op DEPS selected_rows_functor)
op_library(print_op DEPS lod_tensor) op_library(print_op DEPS lod_tensor)
......
...@@ -89,4 +89,5 @@ REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel<CPU, float>, ...@@ -89,4 +89,5 @@ REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel<CPU, float>,
ops::CastOpKernel<CPU, int>, ops::CastOpKernel<CPU, int>,
ops::CastOpKernel<CPU, int64_t>, ops::CastOpKernel<CPU, int64_t>,
ops::CastOpKernel<CPU, bool>, ops::CastOpKernel<CPU, bool>,
ops::CastOpKernel<CPU, uint8_t>,
ops::CastOpKernel<CPU, paddle::platform::float16>); ops::CastOpKernel<CPU, paddle::platform::float16>);
...@@ -21,5 +21,5 @@ using CastOpKernel = ...@@ -21,5 +21,5 @@ using CastOpKernel =
REGISTER_OP_CUDA_KERNEL(cast, CastOpKernel<float>, CastOpKernel<double>, REGISTER_OP_CUDA_KERNEL(cast, CastOpKernel<float>, CastOpKernel<double>,
CastOpKernel<int>, CastOpKernel<int64_t>, CastOpKernel<int>, CastOpKernel<int64_t>,
CastOpKernel<bool>, CastOpKernel<bool>, CastOpKernel<uint8_t>,
CastOpKernel<paddle::platform::float16>); CastOpKernel<paddle::platform::float16>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fake_dequantize_op.h"
#include <string>
namespace paddle {
namespace operators {
// Operator definition for fake_dequantize_max_abs. It only performs shape
// inference: Out takes the dims and the LoD of X.
class FakeDequantizeMaxAbsOp : public framework::OperatorWithKernel {
 public:
  // Reuse the base-class (type, inputs, outputs, attrs) constructor.
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Checks that X and Out are both wired, then forwards X's shape/LoD to Out.
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of FakeDequantizeMaxAbsOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of FakeDequantizeMaxAbsOp should not be null.");

    const auto input_dims = ctx->GetInputDim("X");
    ctx->SetOutputDim("Out", input_dims);
    ctx->ShareLoD("X", /*->*/ "Out");
  }
};
// Declares the proto (inputs, outputs, attributes and doc) of the
// fake_dequantize_max_abs operator.
class FakeDequantizeMaxAbsOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X",
             "(Tensor) The input with float-32/64 type is the "
             "low precision tensor.");
    AddOutput("Out",
              "(Tensor) The output is the dequantized high "
              "precision tensor.");
    AddAttr<int>("num_bits",
                 "(int) `num_bits` is the quantization level bits, "
                 "such as 2, 5, 8.");
    // Adjacent literals are concatenated by the compiler, so the first one
    // must end with a space to keep the two sentences separated.
    AddAttr<float>("scale",
                   "(float) The maximum absolute value of low precision tensor. "
                   "It is usually calculated by the fake_quantize_max_abs_op.");
    AddComment(R"DOC(
FakeDequantizeMaxAbsOp operator.
This calculation is an opposite operation of FakeQuantizeMaxAbsOp:
$$Out = \frac{scale*X}{2^{num_bits} - 1}$$
)DOC");
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
REGISTER_OPERATOR(fake_dequantize_max_abs, ops::FakeDequantizeMaxAbsOp,
ops::FakeDequantizeMaxAbsOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(fake_dequantize_max_abs,
ops::FakeDequantizeMaxAbsKernel<CPU, float>,
ops::FakeDequantizeMaxAbsKernel<CPU, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fake_dequantize_op.h"
namespace ops = paddle::operators;
using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(fake_dequantize_max_abs,
ops::FakeDequantizeMaxAbsKernel<CUDA, float>,
ops::FakeDequantizeMaxAbsKernel<CUDA, double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
// Kernel of fake_dequantize_max_abs:
//   Out = X * scale / (2^num_bits - 1)
// evaluated element-wise through Eigen on the device selected by
// DeviceContext.
template <typename DeviceContext, typename T>
class FakeDequantizeMaxAbsKernel : public framework::OpKernel<T> {
 public:
  virtual void Compute(const framework::ExecutionContext& ctx) const {
    auto* in = ctx.Input<framework::Tensor>("X");
    auto* out = ctx.Output<framework::Tensor>("Out");
    // Allocate the output on the same place as the input.
    out->mutable_data<T>(in->place());

    int num_bits = ctx.Attr<int>("num_bits");
    T scale = static_cast<T>(ctx.Attr<float>("scale"));
    // Keep the divisor in the floating point type T. Narrowing the result of
    // std::pow to int is undefined once the value no longer fits in an int
    // (num_bits >= 31), while the value converted to T is the same for every
    // num_bits that did fit.
    T range = static_cast<T>(std::pow(2.0, num_bits) - 1.0);

    auto eigen_out = framework::EigenVector<T>::Flatten(*out);
    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
    eigen_out.device(dev) = (scale / range) * eigen_in;
  }
};
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "mkldnn.hpp"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/mul_op.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
namespace paddle {
namespace operators {
using paddle::framework::Tensor;
using paddle::platform::MKLDNNDeviceContext;
// Builds an f32 MKLDNN memory descriptor for `dims` laid out in format `f`.
template <typename Format = mkldnn::memory::format>
mkldnn::memory::desc type(const std::vector<int>& dims, Format&& f) {
  const auto element_type = mkldnn::memory::data_type::f32;
  return platform::MKLDNNMemDesc(dims, element_type, f);
}
// MKLDNN forward kernel of the `mul` operator, implemented with an
// inner_product_forward primitive. The forward primitive descriptor is stored
// in the device context under "<Out>@mul_pd" so the backward kernel can
// retrieve it.
template <typename T>
class MulMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
 public:
  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
                   "It must use CPUPlace.");

    auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
    auto mkldnn_engine = dev_ctx.GetEngine();

    auto input = ctx.Input<Tensor>("X");
    auto weight = ctx.Input<Tensor>("Y");

    // Only rank 2 (NC / OI) and rank 4 (NCHW / OIHW) are supported. Compare
    // the rank explicitly: a bitmask test such as `rank & (2 | 4)` would also
    // accept ranks 3, 5, 6 and 7, and the descriptors below would then index
    // src_tz out of range.
    const int input_rank = input->dims().size();
    const int weight_rank = weight->dims().size();
    PADDLE_ENFORCE(input_rank == 2 || input_rank == 4,
                   "Input must be with 2 or 4 dimensions, i.e. NC or NCHW");
    PADDLE_ENFORCE(weight_rank == 2 || weight_rank == 4,
                   "Weights must be with 2 or 4 dimensions, i.e. OI or OIHW");

    std::vector<int> w_tz = paddle::framework::vectorize2int(weight->dims());
    std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());

    // Source, destination and weight descriptors; the weight layout follows
    // the rank of the source tensor.
    auto src_md =
        src_tz.size() != 2
            ? type(src_tz, mkldnn::memory::format::nchw)
            : type({src_tz[0], src_tz[1]}, mkldnn::memory::format::nc);

    auto dst_md = type({src_tz[0], w_tz[1]}, mkldnn::memory::format::nc);

    auto weights_md =
        src_tz.size() != 2
            ? type({w_tz[1], src_tz[1], src_tz[2], src_tz[3]},
                   mkldnn::memory::format::oihw)
            : type({w_tz[1], src_tz[1]}, mkldnn::memory::format::oi);

    auto output = ctx.Output<Tensor>("Out");
    T* output_data = output->mutable_data<T>(ctx.GetPlace());

    // Key under which the primitive descriptor is cached for the grad kernel.
    const std::string key = ctx.op().Output("Out");
    const std::string key_fc_pd = key + "@mul_pd";

    const T* input_data = input->data<T>();
    const T* w_data = weight->data<T>();

    // Wrap the raw tensor buffers as MKLDNN memory objects.
    auto dst_memory = mkldnn::memory({dst_md, mkldnn_engine}, output_data);

    auto src_memory = mkldnn::memory({src_md, mkldnn_engine},
                                     platform::to_void_cast(input_data));

    auto weights_memory = mkldnn::memory({weights_md, mkldnn_engine},
                                         platform::to_void_cast(w_data));

    auto pd = platform::MKLDNNFwdPrimitiveDesc<mkldnn::inner_product_forward>(
        mkldnn_engine, src_md, weights_md, dst_md);

    dev_ctx.SetBlob(key_fc_pd, pd);

    auto forward = mkldnn::inner_product_forward(*pd, src_memory,
                                                 weights_memory, dst_memory);

    // Run the primitive eagerly and wait for completion.
    std::vector<mkldnn::primitive> pipeline = {forward};
    mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
  }
};
// MKLDNN backward kernel of the `mul` operator. It reuses the forward
// primitive descriptor that the forward kernel stored in the device context
// and runs inner_product_backward_weights / inner_product_backward_data for
// whichever of the Y and X gradients are requested.
template <typename T>
class MulMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
auto mkldnn_engine = dev_ctx.GetEngine();
// Forward inputs and the incoming gradient of Out.
const Tensor* input = ctx.Input<Tensor>("X");
const Tensor* w = ctx.Input<Tensor>("Y");
const Tensor* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
// Gradient outputs; either pointer is tested against null below before use.
Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
Tensor* w_grad = ctx.Output<Tensor>(framework::GradVarName("Y"));
// Same "<Out>@mul_pd" key the forward kernel used to cache its descriptor.
const std::string key = ctx.op().Input("Out");
const std::string key_fc_pd = key + "@mul_pd";
const T* input_data = input->data<T>();
const T* w_data = w->data<T>();
const T* out_grad_data = out_grad->data<T>();
T* input_grad_data = nullptr;
T* w_grad_data = nullptr;
// Allocate gradient buffers only for the gradients that were requested.
if (input_grad) {
input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
}
if (w_grad) {
w_grad_data = w_grad->mutable_data<T>(ctx.GetPlace());
}
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> w_tz = paddle::framework::vectorize2int(w->dims());
// Memory descriptors, built the same way as in the forward kernel; the
// weight layout is chosen from the rank of the source tensor.
auto src_md =
src_tz.size() != 2
? type(src_tz, mkldnn::memory::format::nchw)
: type({src_tz[0], src_tz[1]}, mkldnn::memory::format::nc);
auto dst_md = type({src_tz[0], w_tz[1]}, mkldnn::memory::format::nc);
auto weights_md =
src_tz.size() != 2
? type({w_tz[1], src_tz[1], src_tz[2], src_tz[3]},
mkldnn::memory::format::oihw)
: type({w_tz[1], src_tz[1]}, mkldnn::memory::format::oi);
// Wrap the raw buffers; dst_memory here refers to the gradient of Out.
auto src_memory = mkldnn::memory({src_md, mkldnn_engine},
platform::to_void_cast(input_data));
auto dst_memory = mkldnn::memory({dst_md, mkldnn_engine},
platform::to_void_cast(out_grad_data));
auto weight_memory = mkldnn::memory({weights_md, mkldnn_engine},
platform::to_void_cast(w_data));
// Fetch the forward primitive descriptor cached by the forward kernel.
auto pd =
std::static_pointer_cast<mkldnn::inner_product_forward::primitive_desc>(
dev_ctx.GetBlob(key_fc_pd));
PADDLE_ENFORCE(pd != nullptr, "Fail to find pd in device context");
// Gradient with respect to Y (weights).
if (w_grad) {
auto weights_grad_memory = mkldnn::memory(
{weights_md, mkldnn_engine}, platform::to_void_cast(w_grad_data));
auto bwd_weight_pd = platform::MKLDNNBwdPrimitiveDesc<
mkldnn::inner_product_backward_weights>(mkldnn_engine, *pd, src_md,
weights_md, dst_md);
auto bwd_weights_prim = mkldnn::inner_product_backward_weights(
bwd_weight_pd, src_memory, dst_memory, weights_grad_memory);
std::vector<mkldnn::primitive> pipeline{bwd_weights_prim};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
// Gradient with respect to X (input data).
if (input_grad) {
auto src_grad_memory = mkldnn::memory(
{src_md, mkldnn_engine}, platform::to_void_cast(input_grad_data));
auto bwd_data_pd =
platform::MKLDNNBwdPrimitiveDesc<mkldnn::inner_product_backward_data>(
mkldnn_engine, *pd, src_md, weights_md, dst_md);
auto bwd_data_prim = mkldnn::inner_product_backward_data(
bwd_data_pd, dst_memory, weight_memory, src_grad_memory);
std::vector<mkldnn::primitive> pipeline{bwd_data_prim};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP_KERNEL(mul, MKLDNN, ::paddle::platform::CPUPlace,
paddle::operators::MulMKLDNNOpKernel<float>);
REGISTER_OP_KERNEL(mul_grad, MKLDNN, ::paddle::platform::CPUPlace,
paddle::operators::MulMKLDNNGradOpKernel<float>);
...@@ -16,10 +16,6 @@ limitations under the License. */ ...@@ -16,10 +16,6 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -76,22 +72,6 @@ class MulOp : public framework::OperatorWithKernel { ...@@ -76,22 +72,6 @@ class MulOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("Out", framework::make_ddim(output_dims)); ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
} }
private:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
framework::LibraryType library{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_MKLDNN
if (library == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library = framework::LibraryType::kMKLDNN;
}
#endif
framework::DataLayout layout{framework::DataLayout::kAnyLayout};
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
layout, library);
}
}; };
class MulOpMaker : public framework::OpProtoAndCheckerMaker { class MulOpMaker : public framework::OpProtoAndCheckerMaker {
...@@ -120,9 +100,6 @@ class MulOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -120,9 +100,6 @@ class MulOpMaker : public framework::OpProtoAndCheckerMaker {
)DOC") )DOC")
.SetDefault(1) .SetDefault(1)
.EqualGreaterThan(1); .EqualGreaterThan(1);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<int>( AddAttr<int>(
"y_num_col_dims", "y_num_col_dims",
R"DOC((int, default 1), The mul_op can take tensors with more than two, R"DOC((int, default 1), The mul_op can take tensors with more than two,
...@@ -177,22 +154,6 @@ class MulGradOp : public framework::OperatorWithKernel { ...@@ -177,22 +154,6 @@ class MulGradOp : public framework::OperatorWithKernel {
ctx->SetOutputDim(y_grad_name, y_dims); ctx->SetOutputDim(y_grad_name, y_dims);
} }
} }
private:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
framework::LibraryType library{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_MKLDNN
if (library == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library = framework::LibraryType::kMKLDNN;
}
#endif
framework::DataLayout layout{framework::DataLayout::kAnyLayout};
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
layout, library);
}
}; };
} // namespace operators } // namespace operators
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/random_crop_op.h"
namespace paddle {
namespace operators {
// Operator definition of random_crop. The kernel is selected from the element
// type of input X and the device context the op runs on.
class RandomCropOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    const auto* x_tensor = ctx.Input<framework::LoDTensor>("X");
    const auto data_type = framework::ToDataType(x_tensor->type());
    return framework::OpKernelType(data_type, ctx.device_context());
  }
};
// Declares the proto of the random_crop operator: inputs X and Seed, outputs
// Out and SeedOut (marked dispensable), and the `shape` attribute.
class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "A batch of instances to random crop.");
AddInput("Seed", "The random seed.");
AddOutput("Out", "The cropped instance batch.");
// NOTE(review): SeedOut is declared dispensable here, but the InferShape and
// kernel in this change write to it unconditionally - confirm it is always
// provided by callers.
AddOutput("SeedOut", "The random seed after random cropping.")
.AsDispensable();
AddAttr<std::vector<int>>("shape", "The shape of a cropped instance.");
AddComment(R"DOC(
This operator takes a batch of instance, and do random cropping on each instance.
It means that cropping positions differs on each instance, which is determined
by an uniform random generator. All cropped instances have the same shape, which
is determined by the operator's attribute 'shape'.
)DOC");
}
};
// Shape inference of random_crop. Seed must have dims {1}. Out keeps the
// leading dims of X and replaces its trailing dims with the `shape`
// attribute. SeedOut has dims {1}.
class RandomCropOpInferShape : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext* ctx) const override {
    auto seed_dim = ctx->GetInputDim("Seed");
    PADDLE_ENFORCE(seed_dim.size() == 1 && seed_dim[0] == 1);

    auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
    auto x_dim = ctx->GetInputDim("X");
    // X needs at least one leading dim in addition to the cropped ones.
    PADDLE_ENFORCE_GT(x_dim.size(), static_cast<int64_t>(shape.size()));

    auto out_dim = framework::vectorize2int(x_dim);
    // Number of leading dims of X that are left untouched.
    const size_t leading = x_dim.size() - shape.size();
    // Walk the cropped dims from the innermost one outwards.
    for (size_t shape_i = shape.size(); shape_i-- > 0;) {
      size_t x_i = leading + shape_i;
      PADDLE_ENFORCE_GE(x_dim[x_i], shape[shape_i]);
      out_dim[x_i] = shape[shape_i];
    }

    ctx->SetOutputDim("Out", framework::make_ddim(out_dim));
    ctx->SetOutputDim("SeedOut", framework::make_ddim({1}));
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace f = paddle::framework;

// random_crop has no gradient, hence EmptyGradOpMaker.
REGISTER_OPERATOR(random_crop, ops::RandomCropOp, ops::RandomCropOpMaker,
                  ops::RandomCropOpInferShape, f::EmptyGradOpMaker);

// CPU flavour of the random crop kernel, parameterised on the element type.
template <typename T>
using RandomCropCPUKernel =
    ops::RandomCropKernel<paddle::platform::CPUDeviceContext, T>;

REGISTER_OP_CPU_KERNEL(random_crop, RandomCropCPUKernel<float>,
                       RandomCropCPUKernel<int>, RandomCropCPUKernel<double>,
                       RandomCropCPUKernel<uint8_t>,
                       RandomCropCPUKernel<int16_t>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/random_crop_op.h"
namespace ops = paddle::operators;

// CUDA flavour of the random crop kernel, parameterised on the element type.
template <typename T>
using RandomCropCUDAKernel =
    ops::RandomCropKernel<paddle::platform::CUDADeviceContext, T>;

REGISTER_OP_CUDA_KERNEL(random_crop, RandomCropCUDAKernel<float>,
                        RandomCropCUDAKernel<int>, RandomCropCUDAKernel<double>,
                        RandomCropCUDAKernel<uint8_t>,
                        RandomCropCUDAKernel<int16_t>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/for_range.h"
#ifdef PADDLE_WITH_CUDA
#include <thrust/random.h>
#endif
namespace paddle {
namespace operators {
// Maps a device context to the random engine and the uniform integer
// distribution used on that device. Only the specialisations are defined.
template <typename DeviceContext>
struct Random;

// Host specialisation: standard library generator and distribution.
template <>
struct Random<platform::CPUDeviceContext> {
  typedef std::minstd_rand Engine;

  template <typename T>
  using UniformIntDist = std::uniform_int_distribution<T>;
};

#ifdef PADDLE_WITH_CUDA
// CUDA specialisation: thrust generator and distribution.
template <>
struct Random<platform::CUDADeviceContext> {
  typedef thrust::minstd_rand Engine;

  template <typename T>
  using UniformIntDist = thrust::uniform_int_distribution<T>;
};
#endif
// Recursively copies a cropped window out of `x` into the dense buffer `out`.
//   x_dims / out_dims : extents of the source and of the window per dim
//   i / rank          : dim handled by this call and the total dim count
//   prod_x_remain     : product of x_dims[i..rank), likewise prod_out_remain
//   offsets           : start position of the window in every dim
template <typename T>
HOSTDEVICE inline void StridedMemcpy(const T* x, const size_t* x_dims, T* out,
                                     const size_t* out_dims, int i, int rank,
                                     size_t prod_x_remain,
                                     size_t prod_out_remain,
                                     const size_t* offsets) {
  // Element counts covered by one step along dim i in source / destination.
  const size_t x_stride = prod_x_remain / x_dims[i];
  const size_t out_stride = prod_out_remain / out_dims[i];
  const size_t window = out_dims[i];
  const size_t start = offsets[i];

  if (i == rank - 1) {
    // Innermost dim: elements are contiguous, copy them one by one.
    PADDLE_ASSERT(x_stride == 1 && out_stride == 1);
    const T* src = x + start;
    for (size_t j = 0; j < window; ++j) {
      out[j] = src[j];
    }
    return;
  }

  // Outer dim: skip to the window start, then recurse into each slice.
  const T* src = x + start * x_stride;
  T* dst = out;
  for (size_t j = 0; j < window; ++j, src += x_stride, dst += out_stride) {
    StridedMemcpy<T>(src, x_dims, dst, out_dims, i + 1, rank, x_stride,
                     out_stride, offsets);
  }
}
// Functor that crops one instance of a batch at a random position.
//
// The first `num_batchsize_dims` dims of x are batch dims and must match
// between x and out; the remaining dims are cropped from x_dims to out_dims.
// Calling the functor with an instance index draws one offset per cropped dim
// and copies the window into the output.
template <typename DeviceContext, typename T>
struct RandomCropFunctor {
  const T* x_;
  T* out_;
  size_t x_dims_[9];
  size_t out_dims_[9];
  int num_batchsize_dims_;
  int rank_;
  int64_t seed_;

  size_t prod_batchsize_dims_;  // number of instances
  size_t prod_x_ins_dims_;      // elements per source instance
  size_t prod_out_ins_dims_;    // elements per cropped instance

  RandomCropFunctor(const T* x, T* out, const framework::DDim& x_dims,
                    const framework::DDim& out_dims, int num_batchsize_dims,
                    int64_t seed)
      : x_(x),
        out_(out),
        num_batchsize_dims_(num_batchsize_dims),
        rank_(x_dims.size()),
        seed_(seed) {
    PADDLE_ENFORCE_EQ(x_dims.size(), out_dims.size());
    PADDLE_ENFORCE_GT(rank_, num_batchsize_dims_);
    // The dims are copied into fixed-size arrays; reject ranks that would
    // write past their end instead of silently corrupting memory.
    const int max_rank = static_cast<int>(sizeof(x_dims_) / sizeof(x_dims_[0]));
    PADDLE_ENFORCE_LE(rank_, max_rank);

    prod_batchsize_dims_ = 1;
    prod_x_ins_dims_ = 1;
    prod_out_ins_dims_ = 1;
    for (size_t i = 0; i < static_cast<size_t>(rank_); ++i) {
      size_t x_dim_i = x_dims[i];
      size_t out_dim_i = out_dims[i];
      x_dims_[i] = x_dim_i;
      out_dims_[i] = out_dim_i;
      if (i < static_cast<size_t>(num_batchsize_dims_)) {
        // Batch dims are never cropped.
        PADDLE_ENFORCE_EQ(x_dim_i, out_dim_i);
        prod_batchsize_dims_ *= x_dim_i;
      } else {
        prod_x_ins_dims_ *= x_dim_i;
        prod_out_ins_dims_ *= out_dim_i;
      }
    }
  }

  // Crops instance `ins_idx`.
  HOSTDEVICE void operator()(size_t ins_idx) {
    typename Random<DeviceContext>::Engine engine(seed_);
    // Advance the engine by the number of cropped dims of every preceding
    // instance, so each instance starts from its own engine position.
    engine.discard(ins_idx * (rank_ - num_batchsize_dims_));
    size_t offsets[9];
    for (int i = num_batchsize_dims_; i < rank_; ++i) {
      typename Random<DeviceContext>::template UniformIntDist<size_t> dist(
          0, x_dims_[i] - out_dims_[i]);
      offsets[i - num_batchsize_dims_] = dist(engine);
    }

    const T* x = x_ + ins_idx * prod_x_ins_dims_;
    T* out = out_ + ins_idx * prod_out_ins_dims_;

    StridedMemcpy<T>(x, x_dims_ + num_batchsize_dims_, out,
                     out_dims_ + num_batchsize_dims_, 0,
                     rank_ - num_batchsize_dims_, prod_x_ins_dims_,
                     prod_out_ins_dims_, offsets);
  }
};
// Kernel of random_crop: reads the seed, crops every instance of X with a
// RandomCropFunctor run through ForRange, and writes a new seed to SeedOut.
template <typename DeviceContext, typename T>
class RandomCropKernel : public framework::OpKernel<T> {
public:
virtual void Compute(const framework::ExecutionContext& ctx) const {
auto& seed_tensor = detail::Ref(ctx.Input<framework::LoDTensor>("Seed"));
int64_t seed = 0;
// Read the seed directly when it lives on the CPU; otherwise copy it to the
// CPU first (synchronously) and warn.
if (platform::is_cpu_place(seed_tensor.place())) {
seed = *seed_tensor.data<int64_t>();
} else {
LOG(WARNING) << "It is slow to place seed in GPU memory. Please verify "
"your program";
framework::LoDTensor cpu_seed;
framework::TensorCopySync(seed_tensor, platform::CPUPlace(), &cpu_seed);
seed = *cpu_seed.data<int64_t>();
}
auto shape = ctx.Attr<std::vector<int>>("shape");
auto& x = detail::Ref(ctx.Input<framework::LoDTensor>("X"));
auto& out = detail::Ref(ctx.Output<framework::LoDTensor>("Out"));
// Dims of X that are not covered by `shape` are treated as batch dims.
int num_batchsize_dims = x.dims().size() - shape.size();
RandomCropFunctor<DeviceContext, T> functor(
x.data<T>(), out.mutable_data<T>(ctx.GetPlace()), x.dims(), out.dims(),
num_batchsize_dims, seed);
// Invoke the functor once per instance index in [0, prod_batchsize_dims_).
platform::ForRange<DeviceContext> for_range(
ctx.template device_context<DeviceContext>(),
functor.prod_batchsize_dims_);
for_range(functor);
// Derive the output seed on the host: seed a fresh engine, discard
// (instances * cropped dims) values, and store the next value in SeedOut.
Random<platform::CPUDeviceContext>::Engine engine(seed);
engine.discard(functor.prod_batchsize_dims_ *
(functor.rank_ - functor.num_batchsize_dims_));
*ctx.Output<framework::LoDTensor>("SeedOut")->mutable_data<int64_t>(
platform::CPUPlace()) = engine();
}
};
// TODO(fengjiayi): Backward of random crop op
} // namespace operators
} // namespace paddle
...@@ -23,13 +23,12 @@ namespace reader { ...@@ -23,13 +23,12 @@ namespace reader {
class CustomReader : public framework::DecoratedReader { class CustomReader : public framework::DecoratedReader {
public: public:
CustomReader(ReaderBase* reader, const framework::BlockDesc& sub_block, CustomReader(ReaderBase* reader, const framework::BlockDesc& sub_block,
const platform::Place& dev_place,
const std::vector<std::string>& source_var_names, const std::vector<std::string>& source_var_names,
const std::vector<std::string>& sink_var_names) const std::vector<std::string>& sink_var_names)
: DecoratedReader(reader), : DecoratedReader(reader),
program_(*sub_block.Program()), program_(*sub_block.Program()),
sub_block_id_(sub_block.ID()), sub_block_id_(sub_block.ID()),
exe_(framework::Executor(dev_place)), exe_(framework::Executor(platform::CPUPlace())),
source_var_names_(source_var_names), source_var_names_(source_var_names),
sink_var_names_(sink_var_names) {} sink_var_names_(sink_var_names) {}
...@@ -60,7 +59,7 @@ class CreateCustomReaderOp : public framework::OperatorBase { ...@@ -60,7 +59,7 @@ class CreateCustomReaderOp : public framework::OperatorBase {
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>(); ->Get<framework::ReaderHolder>();
out->Reset( out->Reset(
new CustomReader(underlying_reader.Get(), *sub_block, dev_place, new CustomReader(underlying_reader.Get(), *sub_block,
Attr<std::vector<std::string>>("source_var_names"), Attr<std::vector<std::string>>("source_var_names"),
Attr<std::vector<std::string>>("sink_var_names"))); Attr<std::vector<std::string>>("sink_var_names")));
} }
...@@ -85,9 +84,10 @@ class CreateCustomReaderOpMaker : public DecoratedReaderMakerBase { ...@@ -85,9 +84,10 @@ class CreateCustomReaderOpMaker : public DecoratedReaderMakerBase {
CreateCustomReader Operator CreateCustomReader Operator
A custom reader can be used for input data preprocessing. A custom reader can be used for input data preprocessing.
A custom reader holds its own sub-block, which will be executed in its A custom reader holds its own sub-block, which will be executed in CPU
'ReadNext()' function. Users can configurate their own preprocessing in its 'ReadNext()' function. Users can configurate their own
pipelines by inserting operators into custom reader's sub-block. preprocessing pipelines by inserting operators into custom reader's
sub-block.
)DOC"); )DOC");
} }
}; };
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/operators/tensorrt_engine_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/utils/singleton.h"
namespace paddle {
namespace operators {
// Builds the TensorRT engine for this op: reads the subgraph and the size
// limits from the op attributes, converts the subgraph block into the engine
// and freezes the resulting network.
template <typename DeviceContext, typename T>
void paddle::operators::TensorRTEngineKernel<DeviceContext, T>::Prepare(
    const framework::ExecutionContext &context) const {
  // Block description that is handed to the op converter.
  const auto &subgraph_block =
      context.Attr<framework::proto::BlockDesc>("subgraph");
  max_batch_ = context.Attr<int>("max_batch");
  const auto workspace_size = context.Attr<int>("max_workspace");

  auto *fresh_engine = new inference::tensorrt::TensorRTEngine(
      max_batch_, workspace_size, nullptr);
  engine_.reset(fresh_engine);

  inference::Singleton<inference::tensorrt::OpConverter>::Global().ConvertBlock(
      subgraph_block, engine_.get());
  engine_->FreezeNetwork();
}
// Declares the proto of the tensorrt_engine operator: duplicable inputs Xs,
// duplicable outputs Ys and the `subgraph` attribute.
// NOTE(review): TensorRTEngineKernel::Prepare reads int attributes
// "max_batch" and "max_workspace" and reads "subgraph" as a
// framework::proto::BlockDesc, none of which matches what is declared here -
// confirm which side is intended.
class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Xs", "A list of inputs.").AsDuplicable();
AddOutput("Ys", "A list of outputs").AsDuplicable();
AddAttr<std::string>("subgraph", "the subgraph");
AddComment("TensorRT engine operator.");
}
};
// Variable-type inference hook for the tensorrt_engine operator. It is
// intentionally a no-op: neither the op description nor the block is touched.
class TensorRTEngineInferVarType : public framework::VarTypeInference {
 public:
  void operator()(const framework::OpDesc & /*op_desc*/,
                  framework::BlockDesc * /*block*/) const override {
    // Nothing to infer.
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;

// Register the operator with its proto maker. The maker is listed exactly
// once: passing the same maker twice would make the registrar fill the op
// proto and attribute checker a second time for no benefit.
REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp,
                  ops::TensorRTEngineOpMaker);

// The kernels are registered under the CPU place for every supported element
// type.
REGISTER_OP_CPU_KERNEL(
    tensorrt_engine,
    ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, float>,
    ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, double>,
    ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, int>,
    ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, int64_t>);
#endif // PADDLE_WITH_CUDA
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
namespace paddle {
namespace operators {
// Operator definition of tensorrt_engine. Shape inference is a no-op; the
// kernel type is chosen from the element type of the first variable listed in
// the "Xs" input slot and is always placed on CPUPlace.
class TensorRTEngineOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
  void InferShape(framework::InferShapeContext* ctx) const override {}

  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    // The op declares only the "Xs" input slot, so the data type has to come
    // from there; a slot named "pre_ids" does not exist on this operator.
    const auto& input_names = ctx.Inputs("Xs");
    PADDLE_ENFORCE(!input_names.empty(),
                   "TensorRTEngineOp needs at least one input in Xs.");
    auto* first_input = ctx.scope().FindVar(input_names.front());
    PADDLE_ENFORCE_NOT_NULL(first_input, "no variable called %s",
                            input_names.front());
    framework::OpKernelType kt = framework::OpKernelType(
        framework::ToDataType(
            first_input->Get<framework::LoDTensor>().type()),
        platform::CPUPlace());
    return kt;
  }
};
// Kernel that feeds the operator's inputs to a TensorRT engine, runs the
// engine and copies the engine's outputs back into fluid tensors.
//
// The engine is created lazily on the first Compute() call via Prepare().
template <typename DeviceContext, typename T>
class TensorRTEngineKernel : public framework::OpKernel<T> {
 public:
  // Runs one inference step.
  //
  // Steps: (1) build the engine if needed, (2) derive and validate the batch
  // size from the first input, (3) copy every "Xs" input into the engine,
  // (4) execute, (5) resize every "Ys" output to the engine's output dims and
  // copy the result out.
  void Compute(const framework::ExecutionContext& context) const override {
    if (!engine_) {
      Prepare(context);
    }

    const auto& input_names = context.op().Inputs("Xs");
    PADDLE_ENFORCE(!input_names.empty(), "should pass at least one input");

    // Determine the batch size from the leading dimension of the first input.
    // `input_names` holds variable names, so the tensor is looked up through
    // the scope (the same way the copy loop below does) rather than through
    // context.Input<>(), which expects a slot name.
    auto* var0 = context.scope().FindVar(input_names.front());
    PADDLE_ENFORCE_NOT_NULL(var0, "no variable called %s",
                            input_names.front());
    const auto& tensor0 = var0->Get<framework::LoDTensor>();
    int batch_size = tensor0.dims()[0];
    // Validate the batch size before any data is pushed to the engine.
    PADDLE_ENFORCE_GT(batch_size, 0);
    PADDLE_ENFORCE_LE(batch_size, max_batch_);

    // Convert input tensors from fluid to the engine.
    for (const auto& x : context.Inputs("Xs")) {
      auto* v = context.scope().FindVar(x);
      PADDLE_ENFORCE_NOT_NULL(v, "no variable called %s", x);
      auto& t = v->Get<framework::LoDTensor>();
      // Pick the copy routine that matches where the fluid tensor lives.
      if (platform::is_cpu_place(t.place())) {
        engine_->SetInputFromCPU(x, static_cast<const void*>(t.data<void>()),
                                 t.memory_size());
      } else {
        engine_->SetInputFromGPU(x, static_cast<const void*>(t.data<void>()),
                                 t.memory_size());
      }
    }

    // Execute the engine.
    engine_->Execute(batch_size);

    // Convert output tensors from the engine back to fluid.
    for (const auto& y : context.Outputs("Ys")) {
      nvinfer1::ITensor* trt_t = engine_->GetITensor(y);
      auto dims = trt_t->getDimensions();
      // Use the output ITensor's dims to reshape the fluid tensor.
      std::vector<int> ddim(dims.d, dims.d + dims.nbDims);
      auto* fluid_v = context.scope().FindVar(y);
      PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
      auto* fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
      fluid_t->Resize(framework::make_ddim(ddim));
      // NOTE(review): `size` is the product of the dims (an element count).
      // Confirm whether GetOutputInCPU/GetOutputInGPU expect elements or
      // bytes.
      auto size = inference::analysis::AccuDims(dims.d, dims.nbDims);
      // NOTE(review): the output buffer is always allocated as float,
      // independent of the kernel's template type T -- confirm this is
      // intended for the non-float kernel instantiations.
      if (platform::is_cpu_place(fluid_t->place())) {
        engine_->GetOutputInCPU(
            y, fluid_t->mutable_data<float>(platform::CPUPlace()), size);
      } else {
        engine_->GetOutputInGPU(
            y, fluid_t->mutable_data<float>(platform::CUDAPlace()), size);
      }
    }
  }

 protected:
  // Build the engine. Defined out of line.
  void Prepare(const framework::ExecutionContext& context) const;

 private:
  // Both members are `mutable` because they are set up lazily from the const
  // Compute() path.
  mutable std::unique_ptr<inference::tensorrt::TensorRTEngine> engine_;
  // Upper bound enforced on the batch size in Compute().
  // NOTE(review): presumably assigned by Prepare() -- confirm.
  mutable int max_batch_{0};
};
} // namespace operators
} // namespace paddle
#endif // PADDLE_WITH_CUDA
...@@ -55,6 +55,9 @@ class TopkKernel : public framework::OpKernel<T> { ...@@ -55,6 +55,9 @@ class TopkKernel : public framework::OpKernel<T> {
// NOTE: eigen shape doesn't affect paddle tensor. // NOTE: eigen shape doesn't affect paddle tensor.
eg_input.reshape(flat2dims); eg_input.reshape(flat2dims);
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (size_t i = 0; i < row; i++) { for (size_t i = 0; i < row; i++) {
std::vector<std::pair<T, size_t>> vec; std::vector<std::pair<T, size_t>> vec;
for (size_t j = 0; j < col; j++) { for (size_t j = 0; j < col; j++) {
......
...@@ -38,6 +38,7 @@ struct EventList; ...@@ -38,6 +38,7 @@ struct EventList;
static int64_t profiler_lister_id = 0; static int64_t profiler_lister_id = 0;
static bool should_send_profile_state = false; static bool should_send_profile_state = false;
std::mutex profiler_mu;
// The profiler state, the initial value is ProfilerState::kDisabled // The profiler state, the initial value is ProfilerState::kDisabled
static ProfilerState g_state = ProfilerState::kDisabled; static ProfilerState g_state = ProfilerState::kDisabled;
...@@ -228,6 +229,8 @@ void EnableProfiler(ProfilerState state) { ...@@ -228,6 +229,8 @@ void EnableProfiler(ProfilerState state) {
PADDLE_ENFORCE(state != ProfilerState::kDisabled, PADDLE_ENFORCE(state != ProfilerState::kDisabled,
"Can't enbale profling, since the input state is ", "Can't enbale profling, since the input state is ",
"ProfilerState::kDisabled"); "ProfilerState::kDisabled");
std::lock_guard<std::mutex> l(profiler_mu);
if (state == g_state) { if (state == g_state) {
return; return;
} }
...@@ -295,7 +298,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table, ...@@ -295,7 +298,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
} else if (g_state == ProfilerState::kAll) { } else if (g_state == ProfilerState::kAll) {
place = "All"; place = "All";
} else { } else {
PADDLE_THROW("Invalid profiler state"); PADDLE_THROW("Invalid profiler state", g_state);
} }
std::cout << "Place: " << place << std::endl; std::cout << "Place: " << place << std::endl;
...@@ -443,6 +446,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events, ...@@ -443,6 +446,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
void DisableProfiler(EventSortingKey sorted_key, void DisableProfiler(EventSortingKey sorted_key,
const std::string& profile_path) { const std::string& profile_path) {
std::lock_guard<std::mutex> l(profiler_mu);
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
// Mark the profiling stop. // Mark the profiling stop.
Mark("_stop_profiler_", nullptr); Mark("_stop_profiler_", nullptr);
...@@ -466,7 +470,7 @@ void SetProfileListener() { ...@@ -466,7 +470,7 @@ void SetProfileListener() {
std::mt19937 rng; std::mt19937 rng;
rng.seed(std::random_device()()); rng.seed(std::random_device()());
std::uniform_int_distribution<std::mt19937::result_type> dist6( std::uniform_int_distribution<std::mt19937::result_type> dist6(
1, std::numeric_limits<std::mt19937::result_type>::max()); 1, std::numeric_limits<int>::max());
profiler_lister_id = dist6(rng); profiler_lister_id = dist6(rng);
} }
int64_t ListenerId() { return profiler_lister_id; } int64_t ListenerId() { return profiler_lister_id; }
......
...@@ -117,6 +117,7 @@ PYBIND11_PLUGIN(core) { ...@@ -117,6 +117,7 @@ PYBIND11_PLUGIN(core) {
.def("set", PyCPUTensorSetFromArray<int64_t>) .def("set", PyCPUTensorSetFromArray<int64_t>)
.def("set", PyCPUTensorSetFromArray<bool>) .def("set", PyCPUTensorSetFromArray<bool>)
.def("set", PyCPUTensorSetFromArray<uint16_t>) .def("set", PyCPUTensorSetFromArray<uint16_t>)
.def("set", PyCPUTensorSetFromArray<uint8_t>)
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
.def("set", PyCUDATensorSetFromArray<float>) .def("set", PyCUDATensorSetFromArray<float>)
.def("set", PyCUDATensorSetFromArray<int>) .def("set", PyCUDATensorSetFromArray<int>)
...@@ -124,12 +125,14 @@ PYBIND11_PLUGIN(core) { ...@@ -124,12 +125,14 @@ PYBIND11_PLUGIN(core) {
.def("set", PyCUDATensorSetFromArray<int64_t>) .def("set", PyCUDATensorSetFromArray<int64_t>)
.def("set", PyCUDATensorSetFromArray<bool>) .def("set", PyCUDATensorSetFromArray<bool>)
.def("set", PyCUDATensorSetFromArray<uint16_t>) .def("set", PyCUDATensorSetFromArray<uint16_t>)
.def("set", PyCUDATensorSetFromArray<uint8_t>)
.def("set", PyCUDAPinnedTensorSetFromArray<float>) .def("set", PyCUDAPinnedTensorSetFromArray<float>)
.def("set", PyCUDAPinnedTensorSetFromArray<int>) .def("set", PyCUDAPinnedTensorSetFromArray<int>)
.def("set", PyCUDAPinnedTensorSetFromArray<double>) .def("set", PyCUDAPinnedTensorSetFromArray<double>)
.def("set", PyCUDAPinnedTensorSetFromArray<int64_t>) .def("set", PyCUDAPinnedTensorSetFromArray<int64_t>)
.def("set", PyCUDAPinnedTensorSetFromArray<bool>) .def("set", PyCUDAPinnedTensorSetFromArray<bool>)
.def("set", PyCUDAPinnedTensorSetFromArray<uint16_t>) .def("set", PyCUDAPinnedTensorSetFromArray<uint16_t>)
.def("set", PyCUDAPinnedTensorSetFromArray<uint8_t>)
#endif #endif
.def("shape", [](Tensor &self) { return vectorize(self.dims()); }) .def("shape", [](Tensor &self) { return vectorize(self.dims()); })
.def("set_float_element", TensorSetElement<float>) .def("set_float_element", TensorSetElement<float>)
...@@ -492,6 +495,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -492,6 +495,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("enable_profiler", platform::EnableProfiler); m.def("enable_profiler", platform::EnableProfiler);
m.def("disable_profiler", platform::DisableProfiler); m.def("disable_profiler", platform::DisableProfiler);
m.def("is_profiler_enabled", platform::IsProfileEnabled);
m.def("reset_profiler", platform::ResetProfiler); m.def("reset_profiler", platform::ResetProfiler);
// -- python binds for parallel executor. // -- python binds for parallel executor.
......
...@@ -66,8 +66,9 @@ class NeonDepthwiseConvFunction : public ConvFunctionBase { ...@@ -66,8 +66,9 @@ class NeonDepthwiseConvFunction : public ConvFunctionBase {
float* inputPadding = inputData; float* inputPadding = inputData;
int padInputHeight = inputHeight + 2 * paddingH(); int padInputHeight = inputHeight + 2 * paddingH();
int padInputWidth = inputWidth + 2 * paddingW(); int padInputWidth = inputWidth + 2 * paddingW();
if (paddingH() > 0 || paddingW() > 0) { int newSize =
int newSize = batchSize * inputChannels * padInputHeight * padInputWidth; batchSize * (inputChannels + 1) * padInputHeight * padInputWidth;
resizeBuffer<Device>(newSize); resizeBuffer<Device>(newSize);
inputPadding = reinterpret_cast<float*>(memory_->getBuf()); inputPadding = reinterpret_cast<float*>(memory_->getBuf());
neon::Padding<float>::run(inputData, neon::Padding<float>::run(inputData,
...@@ -77,7 +78,6 @@ class NeonDepthwiseConvFunction : public ConvFunctionBase { ...@@ -77,7 +78,6 @@ class NeonDepthwiseConvFunction : public ConvFunctionBase {
inputWidth, inputWidth,
padInputHeight, padInputHeight,
padInputWidth); padInputWidth);
}
std::function<void( std::function<void(
const float*, const float*, int, int, int, int, int, int, float*)> const float*, const float*, int, int, int, int, int, int, float*)>
......
...@@ -183,7 +183,7 @@ function build() { ...@@ -183,7 +183,7 @@ function build() {
============================================ ============================================
EOF EOF
make clean make clean
make -j `nproc` make install -j `nproc`
} }
function build_android() { function build_android() {
......
...@@ -36,9 +36,11 @@ class DataToLoDTensorConverter(object): ...@@ -36,9 +36,11 @@ class DataToLoDTensorConverter(object):
self.dtype = 'float64' self.dtype = 'float64'
elif dtype == core.VarDesc.VarType.INT32: elif dtype == core.VarDesc.VarType.INT32:
self.dtype = 'int32' self.dtype = 'int32'
elif dtype == core.VarDesc.VarType.UINT8:
self.dtype = 'uint8'
else: else:
raise ValueError("dtype must be any of [int32, float32, int64, " raise ValueError("dtype must be any of [int32, float32, int64, "
"float64]") "float64, uint8]")
self.data = [] self.data = []
self.lod = [] self.lod = []
......
...@@ -82,6 +82,7 @@ __all__ = [ ...@@ -82,6 +82,7 @@ __all__ = [
'roi_pool', 'roi_pool',
'dice_loss', 'dice_loss',
'upsampling_bilinear2d', 'upsampling_bilinear2d',
'random_crop',
] ]
...@@ -154,7 +155,8 @@ def fc(input, ...@@ -154,7 +155,8 @@ def fc(input,
Examples: Examples:
.. code-block:: python .. code-block:: python
data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") data = fluid.layers.data(
name="data", shape=[32, 32], dtype="float32")
fc = fluid.layers.fc(input=data, size=1000, act="tanh") fc = fluid.layers.fc(input=data, size=1000, act="tanh")
""" """
...@@ -177,11 +179,8 @@ def fc(input, ...@@ -177,11 +179,8 @@ def fc(input,
inputs={"X": input_var, inputs={"X": input_var,
"Y": w}, "Y": w},
outputs={"Out": tmp}, outputs={"Out": tmp},
attrs={ attrs={"x_num_col_dims": num_flatten_dims,
"x_num_col_dims": num_flatten_dims, "y_num_col_dims": 1})
"y_num_col_dims": 1,
"use_mkldnn": use_mkldnn
})
mul_results.append(tmp) mul_results.append(tmp)
if len(mul_results) == 1: if len(mul_results) == 1:
...@@ -349,7 +348,8 @@ def dynamic_lstm(input, ...@@ -349,7 +348,8 @@ def dynamic_lstm(input,
cell_activation(str): The activation for cell output. Choices = ["sigmoid", cell_activation(str): The activation for cell output. Choices = ["sigmoid",
"tanh", "relu", "identity"], default "tanh". "tanh", "relu", "identity"], default "tanh".
candidate_activation(str): The activation for candidate hidden state. candidate_activation(str): The activation for candidate hidden state.
Choices = ["sigmoid", "tanh", "relu", "identity"], Choices = ["sigmoid", "tanh",
"relu", "identity"],
default "tanh". default "tanh".
dtype(str): Data type. Choices = ["float32", "float64"], default "float32". dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
name(str|None): A name for this layer(optional). If set None, the layer name(str|None): A name for this layer(optional). If set None, the layer
...@@ -516,10 +516,12 @@ def dynamic_lstmp(input, ...@@ -516,10 +516,12 @@ def dynamic_lstmp(input,
cell_activation(str): The activation for cell output. Choices = ["sigmoid", cell_activation(str): The activation for cell output. Choices = ["sigmoid",
"tanh", "relu", "identity"], default "tanh". "tanh", "relu", "identity"], default "tanh".
candidate_activation(str): The activation for candidate hidden state. candidate_activation(str): The activation for candidate hidden state.
Choices = ["sigmoid", "tanh", "relu", "identity"], Choices = ["sigmoid", "tanh",
"relu", "identity"],
default "tanh". default "tanh".
proj_activation(str): The activation for projection output. proj_activation(str): The activation for projection output.
Choices = ["sigmoid", "tanh", "relu", "identity"], Choices = ["sigmoid", "tanh",
"relu", "identity"],
default "tanh". default "tanh".
dtype(str): Data type. Choices = ["float32", "float64"], default "float32". dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
name(str|None): A name for this layer(optional). If set None, the layer name(str|None): A name for this layer(optional). If set None, the layer
...@@ -855,7 +857,7 @@ def cos_sim(X, Y): ...@@ -855,7 +857,7 @@ def cos_sim(X, Y):
return out return out
def dropout(x, dropout_prob, is_test=False, seed=None): def dropout(x, dropout_prob, is_test=False, seed=None, name=None):
""" """
Computes dropout. Computes dropout.
...@@ -873,6 +875,8 @@ def dropout(x, dropout_prob, is_test=False, seed=None): ...@@ -873,6 +875,8 @@ def dropout(x, dropout_prob, is_test=False, seed=None):
parameter is set to None, a random seed is used. parameter is set to None, a random seed is used.
NOTE: If an integer seed is given, always the same output NOTE: If an integer seed is given, always the same output
units will be dropped. DO NOT use a fixed seed in training. units will be dropped. DO NOT use a fixed seed in training.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns: Returns:
Variable: A tensor variable. Variable: A tensor variable.
...@@ -1117,7 +1121,7 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True): ...@@ -1117,7 +1121,7 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True):
return softmax_out return softmax_out
def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True): def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True, name=None):
helper = LayerHelper('softmax', **locals()) helper = LayerHelper('softmax', **locals())
dtype = helper.input_dtype() dtype = helper.input_dtype()
softmax_out = helper.create_tmp_variable(dtype) softmax_out = helper.create_tmp_variable(dtype)
...@@ -2172,7 +2176,8 @@ def reduce_mean(input, dim=None, keep_dim=False, name=None): ...@@ -2172,7 +2176,8 @@ def reduce_mean(input, dim=None, keep_dim=False, name=None):
fluid.layers.reduce_mean(x) # [0.4375] fluid.layers.reduce_mean(x) # [0.4375]
fluid.layers.reduce_mean(x, dim=0) # [0.15, 0.25, 0.55, 0.8] fluid.layers.reduce_mean(x, dim=0) # [0.15, 0.25, 0.55, 0.8]
fluid.layers.reduce_mean(x, dim=-1) # [0.475, 0.4] fluid.layers.reduce_mean(x, dim=-1) # [0.475, 0.4]
fluid.layers.reduce_mean(x, dim=1, keep_dim=True) # [[0.475], [0.4]] fluid.layers.reduce_mean(
x, dim=1, keep_dim=True) # [[0.475], [0.4]]
# x is a Tensor variable with shape [2, 2, 2] and elements as below: # x is a Tensor variable with shape [2, 2, 2] and elements as below:
# [[[1.0, 2.0], [3.0, 4.0]], # [[[1.0, 2.0], [3.0, 4.0]],
...@@ -2391,7 +2396,8 @@ def split(input, num_or_sections, dim=-1, name=None): ...@@ -2391,7 +2396,8 @@ def split(input, num_or_sections, dim=-1, name=None):
x0.shape # [3, 3, 5] x0.shape # [3, 3, 5]
x1.shape # [3, 3, 5] x1.shape # [3, 3, 5]
x2.shape # [3, 3, 5] x2.shape # [3, 3, 5]
x0, x1, x2 = fluid.layers.split(x, num_or_sections=[2, 3, 4], dim=1) x0, x1, x2 = fluid.layers.split(
x, num_or_sections=[2, 3, 4], dim=1)
x0.shape # [3, 2, 5] x0.shape # [3, 2, 5]
x1.shape # [3, 3, 5] x1.shape # [3, 3, 5]
x2.shape # [3, 4, 5] x2.shape # [3, 4, 5]
...@@ -2610,7 +2616,7 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None): ...@@ -2610,7 +2616,7 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
return out return out
def topk(input, k): def topk(input, k, name=None):
""" """
This operator is used to find values and indices of the k largest entries This operator is used to find values and indices of the k largest entries
for the last dimension. for the last dimension.
...@@ -2626,6 +2632,8 @@ def topk(input, k): ...@@ -2626,6 +2632,8 @@ def topk(input, k):
input(Variable): The input variable which can be a vector or Tensor with input(Variable): The input variable which can be a vector or Tensor with
higher rank. higher rank.
k(int): An integer value to specify the top k largest elements. k(int): An integer value to specify the top k largest elements.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns: Returns:
values(Variable): The k largest elements along each last dimensional values(Variable): The k largest elements along each last dimensional
...@@ -3301,7 +3309,8 @@ def softmax_with_cross_entropy(logits, label, soft_label=False): ...@@ -3301,7 +3309,8 @@ def softmax_with_cross_entropy(logits, label, soft_label=False):
data = fluid.layers.data(name='data', shape=[128], dtype='float32') data = fluid.layers.data(name='data', shape=[128], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64') label = fluid.layers.data(name='label', shape=[1], dtype='int64')
fc = fluid.layers.fc(input=data, size=100) fc = fluid.layers.fc(input=data, size=100)
out = fluid.layers.softmax_with_cross_entropy(logits=fc, label=label) out = fluid.layers.softmax_with_cross_entropy(
logits=fc, label=label)
""" """
helper = LayerHelper('softmax_with_cross_entropy', **locals()) helper = LayerHelper('softmax_with_cross_entropy', **locals())
softmax = helper.create_tmp_variable(dtype=logits.dtype) softmax = helper.create_tmp_variable(dtype=logits.dtype)
...@@ -3348,7 +3357,8 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None): ...@@ -3348,7 +3357,8 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
.. code-block:: python .. code-block:: python
data = fluid.layers.data(name='data', shape=[128], dtype='float32') data = fluid.layers.data(name='data', shape=[128], dtype='float32')
label = fluid.layers.data(name='label', shape=[100], dtype='float32') label = fluid.layers.data(
name='label', shape=[100], dtype='float32')
fc = fluid.layers.fc(input=data, size=100) fc = fluid.layers.fc(input=data, size=100)
out = fluid.layers.smooth_l1(x=fc, y=label) out = fluid.layers.smooth_l1(x=fc, y=label)
""" """
...@@ -3670,7 +3680,8 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None): ...@@ -3670,7 +3680,8 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None):
Examples: Examples:
.. code-block:: python .. code-block:: python
data = fluid.layers.data(name="data", shape=[3, 112, 112], dtype="float32") data = fluid.layers.data(
name="data", shape=[3, 112, 112], dtype="float32")
lrn = fluid.layers.lrn(input=data) lrn = fluid.layers.lrn(input=data)
""" """
helper = LayerHelper('lrn', **locals()) helper = LayerHelper('lrn', **locals())
...@@ -3979,3 +3990,33 @@ def upsampling_bilinear2d(input, out_shape=None, scale=None, name=None): ...@@ -3979,3 +3990,33 @@ def upsampling_bilinear2d(input, out_shape=None, scale=None, name=None):
attrs={"out_h": out_h, attrs={"out_h": out_h,
"out_w": out_w}) "out_w": out_w})
return out return out
def random_crop(input, shape, seed=1):
    """Append a ``random_crop`` op for `input` and return its output.

    Args:
        input: The variable fed to the op as ``X``.
        shape: Forwarded unchanged as the op's ``shape`` attribute.
        seed (int|Variable): The seed fed to the op as ``Seed``. An int is
            first materialised as an int64 variable of shape ``[1]`` through
            a ``fill_constant`` op (forced onto CPU); a Variable is used
            as is.

    Returns:
        Variable: The ``Out`` variable of the ``random_crop`` op. The op's
        ``SeedOut`` variable is created but not returned.

    Raises:
        ValueError: If `seed` is neither an int nor a Variable.
    """
    helper = LayerHelper("random_crop", **locals())
    out = helper.create_tmp_variable(helper.input_dtype())

    if isinstance(seed, int):
        # Wrap the Python int into a one-element int64 variable.
        seed_var = helper.create_tmp_variable(dtype="int64")
        fill_attrs = {
            "dtype": seed_var.dtype,
            "shape": [1],
            "value": float(seed),
            "force_cpu": True
        }
        helper.append_op(
            type="fill_constant",
            inputs={},
            outputs={"Out": seed_var},
            attrs=fill_attrs)
    elif isinstance(seed, Variable):
        seed_var = seed
    else:
        raise ValueError("'seed' must be a Variable or an int.")

    seed_out = helper.create_tmp_variable(dtype="int64")
    crop_inputs = {"X": input, "Seed": seed_var}
    crop_outputs = {"Out": out, "SeedOut": seed_out}
    helper.append_op(
        type="random_crop",
        inputs=crop_inputs,
        outputs=crop_outputs,
        attrs={"shape": shape})
    return out
...@@ -112,7 +112,7 @@ def cast(x, dtype): ...@@ -112,7 +112,7 @@ def cast(x, dtype):
return out return out
def concat(input, axis=0): def concat(input, axis=0, name=None):
""" """
**Concat** **Concat**
...@@ -122,6 +122,8 @@ def concat(input, axis=0): ...@@ -122,6 +122,8 @@ def concat(input, axis=0):
Args: Args:
input(list): List of tensors to be concatenated input(list): List of tensors to be concatenated
axis(int): Integer axis along which the tensors will be concatenated axis(int): Integer axis along which the tensors will be concatenated
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns: Returns:
Variable: Output variable of the concatenation Variable: Output variable of the concatenation
......
...@@ -16,7 +16,10 @@ import core ...@@ -16,7 +16,10 @@ import core
from contextlib import contextmanager from contextlib import contextmanager
import os import os
__all__ = ['cuda_profiler', 'reset_profiler', 'profiler'] __all__ = [
'cuda_profiler', 'reset_profiler', 'profiler', 'start_profiler',
'stop_profiler'
]
NVPROF_CONFIG = [ NVPROF_CONFIG = [
"gpustarttimestamp", "gpustarttimestamp",
...@@ -72,20 +75,31 @@ def reset_profiler(): ...@@ -72,20 +75,31 @@ def reset_profiler():
core.reset_profiler() core.reset_profiler()
@contextmanager def start_profiler(state):
def profiler(state, sorted_key=None, profile_path='/tmp/profile'): """Enable the profiler.
"""The profiler interface.
Different from cuda_profiler, this profiler can be used to profile both CPU Args:
and GPU program. By defalut, it records the CPU and GPU operator kernels, state (string) : The profiling state, which should be 'CPU', 'GPU'
if you want to profile other program, you can refer the profiling tutorial or 'All'. 'CPU' means only profile CPU. 'GPU' means profiling
to add more records. GPU as well. 'All' also generates timeline.
"""
if core.is_profiler_enabled():
return
if state not in ['CPU', 'GPU', "All"]:
raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.")
if state == "GPU":
prof_state = core.ProfilerState.kCUDA
elif state == "CPU":
prof_state = core.ProfilerState.kCPU
else:
prof_state = core.ProfilerState.kAll
core.enable_profiler(prof_state)
def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
"""Stop the profiler.
Args: Args:
state (string) : The profiling state, which should be 'CPU' or 'GPU',
telling the profiler to use CPU timer or GPU timer for profiling.
Although users may have already specified the execution place
(CPUPlace/CUDAPlace) in the begining, for flexibility the profiler
would not inherit this place.
sorted_key (string) : If None, the profiling results will be printed sorted_key (string) : If None, the profiling results will be printed
in the order of first end time of events. Otherwise, the profiling in the order of first end time of events. Otherwise, the profiling
results will be sorted by the this flag. This flag should be one results will be sorted by the this flag. This flag should be one
...@@ -98,17 +112,8 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'): ...@@ -98,17 +112,8 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
profile_path (string) : If state == 'All', it will write a profile profile_path (string) : If state == 'All', it will write a profile
proto output file. proto output file.
""" """
if state not in ['CPU', 'GPU', "All"]: if not core.is_profiler_enabled():
raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.") return
if state == "GPU":
prof_state = core.ProfilerState.kCUDA
elif state == "CPU":
prof_state = core.ProfilerState.kCPU
else:
prof_state = core.ProfilerState.kAll
core.enable_profiler(prof_state)
yield
sorted_key = 'default' if sorted_key is None else sorted_key sorted_key = 'default' if sorted_key is None else sorted_key
if sorted_key not in ['default', 'calls', 'total', 'max', 'min', 'ave']: if sorted_key not in ['default', 'calls', 'total', 'max', 'min', 'ave']:
raise ValueError("The sorted_key must be None or in 'calls', 'total', " raise ValueError("The sorted_key must be None or in 'calls', 'total', "
...@@ -124,3 +129,34 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'): ...@@ -124,3 +129,34 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
# TODO(qingqing) : redirect C++ ostream to Python stream. # TODO(qingqing) : redirect C++ ostream to Python stream.
# with core.ostream_redirect(stdout=True, stderr=True): # with core.ostream_redirect(stdout=True, stderr=True):
core.disable_profiler(key_map[sorted_key], profile_path) core.disable_profiler(key_map[sorted_key], profile_path)
@contextmanager
def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
    """The profiler interface, usable as a ``with`` context manager.

    Different from cuda_profiler, this profiler can be used to profile both
    CPU and GPU programs. By default, it records the CPU and GPU operator
    kernels; if you want to profile other programs, you can refer to the
    profiling tutorial to add more records.

    The profiler is started on entry and stopped on exit. It is stopped even
    when the body of the ``with`` block raises, so a failing block does not
    leave the global profiler enabled.

    Args:
        state (string) : The profiling state, which should be 'CPU', 'GPU'
            or 'All'. It is forwarded to `start_profiler`. Although users
            may have already specified the execution place
            (CPUPlace/CUDAPlace) in the beginning, for flexibility the
            profiler would not inherit this place.
        sorted_key (string) : If None, the profiling results will be printed
            in the order of first end time of events. Otherwise, the
            profiling results will be sorted by this flag. This flag should
            be one of 'calls', 'total', 'max', 'min' or 'ave'.
            The `calls` means sorting by the number of calls.
            The `total` means sorting by the total execution time.
            The `max` means sorting by the maximum execution time.
            The `min` means sorting by the minimum execution time.
            The `ave` means sorting by the average execution time.
        profile_path (string) : If state == 'All', it will write a profile
            proto output file.
    """
    start_profiler(state)
    try:
        yield
    finally:
        # Always stop, so an exception inside the block cannot leak an
        # enabled profiler into later code.
        stop_profiler(sorted_key, profile_path)
...@@ -217,8 +217,6 @@ def infer(use_cuda, inference_program, params_dirname): ...@@ -217,8 +217,6 @@ def infer(use_cuda, inference_program, params_dirname):
# The range of random integers is [low, high] # The range of random integers is [low, high]
word = fluid.create_random_int_lodtensor( word = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
pred = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=PRED_DICT_LEN - 1)
ctx_n2 = fluid.create_random_int_lodtensor( ctx_n2 = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
ctx_n1 = fluid.create_random_int_lodtensor( ctx_n1 = fluid.create_random_int_lodtensor(
...@@ -229,18 +227,20 @@ def infer(use_cuda, inference_program, params_dirname): ...@@ -229,18 +227,20 @@ def infer(use_cuda, inference_program, params_dirname):
lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
ctx_p2 = fluid.create_random_int_lodtensor( ctx_p2 = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1) lod, base_shape, place, low=0, high=WORD_DICT_LEN - 1)
pred = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=PRED_DICT_LEN - 1)
mark = fluid.create_random_int_lodtensor( mark = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=MARK_DICT_LEN - 1) lod, base_shape, place, low=0, high=MARK_DICT_LEN - 1)
results = inferencer.infer( results = inferencer.infer(
{ {
'word_data': word, 'word_data': word,
'verb_data': pred,
'ctx_n2_data': ctx_n2, 'ctx_n2_data': ctx_n2,
'ctx_n1_data': ctx_n1, 'ctx_n1_data': ctx_n1,
'ctx_0_data': ctx_0, 'ctx_0_data': ctx_0,
'ctx_p1_data': ctx_p1, 'ctx_p1_data': ctx_p1,
'ctx_p2_data': ctx_p2, 'ctx_p2_data': ctx_p2,
'verb_data': pred,
'mark_data': mark 'mark_data': mark
}, },
return_numpy=False) return_numpy=False)
......
...@@ -53,7 +53,7 @@ def encoder(is_sparse): ...@@ -53,7 +53,7 @@ def encoder(is_sparse):
return encoder_out return encoder_out
def decoder_train(context, is_sparse): def train_decoder(context, is_sparse):
# decoder # decoder
trg_language_word = pd.data( trg_language_word = pd.data(
name="target_language_word", shape=[1], dtype='int64', lod_level=1) name="target_language_word", shape=[1], dtype='int64', lod_level=1)
...@@ -81,7 +81,7 @@ def decoder_train(context, is_sparse): ...@@ -81,7 +81,7 @@ def decoder_train(context, is_sparse):
return rnn() return rnn()
def decoder_decode(context, is_sparse): def decode(context, is_sparse):
init_state = context init_state = context
array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length) array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True) counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)
...@@ -148,31 +148,9 @@ def decoder_decode(context, is_sparse): ...@@ -148,31 +148,9 @@ def decoder_decode(context, is_sparse):
return translation_ids, translation_scores return translation_ids, translation_scores
def set_init_lod(data, lod, place):
res = fluid.LoDTensor()
res.set(data, place)
res.set_lod(lod)
return res
def to_lodtensor(data, place):
seq_lens = [len(seq) for seq in data]
cur_len = 0
lod = [cur_len]
for l in seq_lens:
cur_len += l
lod.append(cur_len)
flattened_data = np.concatenate(data, axis=0).astype("int64")
flattened_data = flattened_data.reshape([len(flattened_data), 1])
res = fluid.LoDTensor()
res.set(flattened_data, place)
res.set_lod([lod])
return res
def train_program(is_sparse): def train_program(is_sparse):
context = encoder(is_sparse) context = encoder(is_sparse)
rnn_out = decoder_train(context, is_sparse) rnn_out = train_decoder(context, is_sparse)
label = pd.data( label = pd.data(
name="target_language_next_word", shape=[1], dtype='int64', lod_level=1) name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
cost = pd.cross_entropy(input=rnn_out, label=label) cost = pd.cross_entropy(input=rnn_out, label=label)
...@@ -218,13 +196,12 @@ def train(use_cuda, is_sparse, is_local=True): ...@@ -218,13 +196,12 @@ def train(use_cuda, is_sparse, is_local=True):
def decode_main(use_cuda, is_sparse): def decode_main(use_cuda, is_sparse):
if use_cuda and not fluid.core.is_compiled_with_cuda(): if use_cuda and not fluid.core.is_compiled_with_cuda():
return return
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
context = encoder(is_sparse) context = encoder(is_sparse)
translation_ids, translation_scores = decoder_decode(context, is_sparse) translation_ids, translation_scores = decode(context, is_sparse)
exe = Executor(place) exe = Executor(place)
exe.run(framework.default_startup_program()) exe.run(framework.default_startup_program())
...@@ -234,26 +211,32 @@ def decode_main(use_cuda, is_sparse): ...@@ -234,26 +211,32 @@ def decode_main(use_cuda, is_sparse):
[1. for _ in range(batch_size)], dtype='float32') [1. for _ in range(batch_size)], dtype='float32')
init_ids_data = init_ids_data.reshape((batch_size, 1)) init_ids_data = init_ids_data.reshape((batch_size, 1))
init_scores_data = init_scores_data.reshape((batch_size, 1)) init_scores_data = init_scores_data.reshape((batch_size, 1))
init_lod = [i for i in range(batch_size)] + [batch_size] init_lod = [1] * batch_size
init_lod = [init_lod, init_lod] init_lod = [init_lod, init_lod]
init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place)
init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place)
train_data = paddle.batch( train_data = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
paddle.dataset.wmt14.train(dict_size), buf_size=1000), paddle.dataset.wmt14.train(dict_size), buf_size=1000),
batch_size=batch_size) batch_size=batch_size)
for _, data in enumerate(train_data()):
init_ids = set_init_lod(init_ids_data, init_lod, place)
init_scores = set_init_lod(init_scores_data, init_lod, place)
src_word_data = to_lodtensor(map(lambda x: x[0], data), place) feed_order = ['src_word_id']
feed_list = [
framework.default_main_program().global_block().var(var_name)
for var_name in feed_order
]
feeder = fluid.DataFeeder(feed_list, place)
for data in train_data():
feed_dict = feeder.feed(map(lambda x: [x[0]], data))
feed_dict['init_ids'] = init_ids
feed_dict['init_scores'] = init_scores
result_ids, result_scores = exe.run( result_ids, result_scores = exe.run(
framework.default_main_program(), framework.default_main_program(),
feed={ feed=feed_dict,
'src_word_id': src_word_data,
'init_ids': init_ids,
'init_scores': init_scores
},
fetch_list=[translation_ids, translation_scores], fetch_list=[translation_ids, translation_scores],
return_numpy=False) return_numpy=False)
print result_ids.lod() print result_ids.lod()
......
...@@ -147,28 +147,6 @@ def decoder_decode(context, is_sparse): ...@@ -147,28 +147,6 @@ def decoder_decode(context, is_sparse):
return translation_ids, translation_scores return translation_ids, translation_scores
def set_init_lod(data, lod, place):
    """Create a ``fluid.LoDTensor`` from ``data`` and attach ``lod`` to it.

    Args:
        data: Payload handed to ``LoDTensor.set``.
        lod: Level-of-detail info handed unchanged to ``LoDTensor.set_lod``.
        place: Device place handed to ``LoDTensor.set``.

    Returns:
        The newly created tensor.
    """
    lod_tensor = fluid.LoDTensor()
    lod_tensor.set(data, place)
    lod_tensor.set_lod(lod)
    return lod_tensor
def to_lodtensor(data, place):
    """Pack a batch of sequences into a single-level ``fluid.LoDTensor``.

    Args:
        data: Batch of sequences; each item must support ``len`` and the
            batch as a whole must be accepted by ``np.concatenate``.
        place: Device place passed through to ``LoDTensor.set``.

    Returns:
        A LoDTensor whose data is the int64 concatenation of all sequences
        reshaped to ``[total_length, 1]`` and whose only LoD level lists the
        cumulative sequence offsets, starting at 0.
    """
    # Running end offset of each sequence; the leading 0 marks the start of
    # the first one.
    boundaries = [0]
    for sequence in data:
        boundaries.append(boundaries[-1] + len(sequence))

    flat = np.concatenate(data, axis=0).astype("int64")
    flat = flat.reshape([len(flat), 1])

    packed = fluid.LoDTensor()
    packed.set(flat, place)
    packed.set_lod([boundaries])
    return packed
def train_main(use_cuda, is_sparse, is_local=True): def train_main(use_cuda, is_sparse, is_local=True):
if use_cuda and not fluid.core.is_compiled_with_cuda(): if use_cuda and not fluid.core.is_compiled_with_cuda():
return return
...@@ -192,23 +170,25 @@ def train_main(use_cuda, is_sparse, is_local=True): ...@@ -192,23 +170,25 @@ def train_main(use_cuda, is_sparse, is_local=True):
paddle.dataset.wmt14.train(dict_size), buf_size=1000), paddle.dataset.wmt14.train(dict_size), buf_size=1000),
batch_size=batch_size) batch_size=batch_size)
feed_order = [
'src_word_id', 'target_language_word', 'target_language_next_word'
]
exe = Executor(place) exe = Executor(place)
def train_loop(main_program): def train_loop(main_program):
exe.run(framework.default_startup_program()) exe.run(framework.default_startup_program())
feed_list = [
main_program.global_block().var(var_name) for var_name in feed_order
]
feeder = fluid.DataFeeder(feed_list, place)
batch_id = 0 batch_id = 0
for pass_id in xrange(1): for pass_id in xrange(1):
for data in train_data(): for data in train_data():
word_data = to_lodtensor(map(lambda x: x[0], data), place)
trg_word = to_lodtensor(map(lambda x: x[1], data), place)
trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
outs = exe.run(main_program, outs = exe.run(main_program,
feed={ feed=feeder.feed(data),
'src_word_id': word_data,
'target_language_word': trg_word,
'target_language_next_word': trg_word_next
},
fetch_list=[avg_cost]) fetch_list=[avg_cost])
avg_cost_val = np.array(outs[0]) avg_cost_val = np.array(outs[0])
print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) + print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
...@@ -258,26 +238,32 @@ def decode_main(use_cuda, is_sparse): ...@@ -258,26 +238,32 @@ def decode_main(use_cuda, is_sparse):
[1. for _ in range(batch_size)], dtype='float32') [1. for _ in range(batch_size)], dtype='float32')
init_ids_data = init_ids_data.reshape((batch_size, 1)) init_ids_data = init_ids_data.reshape((batch_size, 1))
init_scores_data = init_scores_data.reshape((batch_size, 1)) init_scores_data = init_scores_data.reshape((batch_size, 1))
init_lod = [i for i in range(batch_size)] + [batch_size] init_lod = [1] * batch_size
init_lod = [init_lod, init_lod] init_lod = [init_lod, init_lod]
init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place)
init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place)
train_data = paddle.batch( train_data = paddle.batch(
paddle.reader.shuffle( paddle.reader.shuffle(
paddle.dataset.wmt14.train(dict_size), buf_size=1000), paddle.dataset.wmt14.train(dict_size), buf_size=1000),
batch_size=batch_size) batch_size=batch_size)
for _, data in enumerate(train_data()):
init_ids = set_init_lod(init_ids_data, init_lod, place)
init_scores = set_init_lod(init_scores_data, init_lod, place)
src_word_data = to_lodtensor(map(lambda x: x[0], data), place) feed_order = ['src_word_id']
feed_list = [
framework.default_main_program().global_block().var(var_name)
for var_name in feed_order
]
feeder = fluid.DataFeeder(feed_list, place)
for data in train_data():
feed_dict = feeder.feed(map(lambda x: [x[0]], data))
feed_dict['init_ids'] = init_ids
feed_dict['init_scores'] = init_scores
result_ids, result_scores = exe.run( result_ids, result_scores = exe.run(
framework.default_main_program(), framework.default_main_program(),
feed={ feed=feed_dict,
'src_word_id': src_word_data,
'init_ids': init_ids,
'init_scores': init_scores
},
fetch_list=[translation_ids, translation_scores], fetch_list=[translation_ids, translation_scores],
return_numpy=False) return_numpy=False)
print result_ids.lod() print result_ids.lod()
......
...@@ -152,29 +152,6 @@ def seq_to_seq_net(): ...@@ -152,29 +152,6 @@ def seq_to_seq_net():
return avg_cost, prediction return avg_cost, prediction
def to_lodtensor(data, place):
    """Pack a batch of sequences into a single-level ``core.LoDTensor``.

    Args:
        data: Batch of sequences; each item must support ``len`` and the
            batch as a whole must be accepted by ``np.concatenate``.
        place: Device place passed through to ``LoDTensor.set``.

    Returns:
        A LoDTensor whose data is the int64 concatenation of all sequences
        reshaped to ``[total_length, 1]`` and whose only LoD level lists the
        cumulative sequence offsets, starting at 0.
    """
    # Offset-style LoD: 0 first, then the running total after each sequence.
    offsets = [0]
    for seq in data:
        offsets.append(offsets[-1] + len(seq))

    column = np.concatenate(data, axis=0).astype("int64")
    column = column.reshape([len(column), 1])

    tensor = core.LoDTensor()
    tensor.set(column, place)
    tensor.set_lod([offsets])
    return tensor
def create_random_lodtensor(lod, place, low, high):
    """Build a ``fluid.LoDTensor`` of random int64 values in ``[low, high]``.

    Args:
        lod: Level-of-detail list for the single LoD level; its last entry
            is used as the number of rows to generate.
        place: Device place passed through to ``LoDTensor.set``.
        low: Inclusive lower bound of the random integers.
        high: Inclusive upper bound of the random integers.

    Returns:
        A LoDTensor of shape ``[lod[-1], 1]`` whose LoD is ``[lod]``.
    """
    # np.random.random_integers is deprecated; randint with an exclusive upper
    # bound of int(high) + 1 samples the same closed interval [low, high].
    data = np.random.randint(low, int(high) + 1, [lod[-1], 1]).astype("int64")
    res = fluid.LoDTensor()
    res.set(data, place)
    res.set_lod([lod])
    return res
def train(use_cuda, save_dirname=None): def train(use_cuda, save_dirname=None):
[avg_cost, prediction] = seq_to_seq_net() [avg_cost, prediction] = seq_to_seq_net()
...@@ -188,22 +165,20 @@ def train(use_cuda, save_dirname=None): ...@@ -188,22 +165,20 @@ def train(use_cuda, save_dirname=None):
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = Executor(place) exe = Executor(place)
exe.run(framework.default_startup_program()) exe.run(framework.default_startup_program())
feed_order = ['source_sequence', 'target_sequence', 'label_sequence']
feed_list = [
framework.default_main_program().global_block().var(var_name)
for var_name in feed_order
]
feeder = fluid.DataFeeder(feed_list, place)
batch_id = 0 batch_id = 0
for pass_id in xrange(2): for pass_id in xrange(2):
for data in train_data(): for data in train_data():
word_data = to_lodtensor(map(lambda x: x[0], data), place)
trg_word = to_lodtensor(map(lambda x: x[1], data), place)
trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
outs = exe.run(framework.default_main_program(), outs = exe.run(framework.default_main_program(),
feed={ feed=feeder.feed(data),
'source_sequence': word_data,
'target_sequence': trg_word,
'label_sequence': trg_word_next
},
fetch_list=[avg_cost]) fetch_list=[avg_cost])
avg_cost_val = np.array(outs[0]) avg_cost_val = np.array(outs[0])
...@@ -237,9 +212,23 @@ def infer(use_cuda, save_dirname=None): ...@@ -237,9 +212,23 @@ def infer(use_cuda, save_dirname=None):
[inference_program, feed_target_names, [inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
lod = [0, 4, 10] # Setup input by creating LoDTensor to represent sequence of words.
word_data = create_random_lodtensor(lod, place, low=0, high=1) # Here each word is the basic element of the LoDTensor and the shape of
trg_word = create_random_lodtensor(lod, place, low=0, high=1) # each word (base_shape) should be [1] since it is simply an index to
# look up for the corresponding word vector.
# Suppose the length_based level of detail (lod) info is set to [[4, 6]],
# which has only one lod level. Then the created LoDTensor will have only
# one higher level structure (sequence of words, or sentence) than the basic
# element (word). Hence the LoDTensor will hold data for two sentences of
# length 4 and 6, respectively.
# Note that lod info should be a list of lists.
lod = [[4, 6]]
base_shape = [1]
# The range of random integers is [low, high]
word_data = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=1)
trg_word = fluid.create_random_int_lodtensor(
lod, base_shape, place, low=0, high=1)
# Construct feed as a dictionary of {feed_target_name: feed_target_data} # Construct feed as a dictionary of {feed_target_name: feed_target_data}
# and results will contain a list of data corresponding to fetch_targets. # and results will contain a list of data corresponding to fetch_targets.
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
import unittest import unittest
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
from paddle.fluid import debuger from paddle.fluid import debugger
from paddle.fluid.framework import Program from paddle.fluid.framework import Program
...@@ -51,9 +51,9 @@ class TestDebugger(unittest.TestCase): ...@@ -51,9 +51,9 @@ class TestDebugger(unittest.TestCase):
outputs={"Out": mul_out}, outputs={"Out": mul_out},
attrs={"x_num_col_dims": 1}) attrs={"x_num_col_dims": 1})
print(debuger.pprint_program_codes(p)) print(debugger.pprint_program_codes(p))
debuger.draw_block_graphviz(p.block(0), path="./test.dot") debugger.draw_block_graphviz(p.block(0), path="./test.dot")
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -13,31 +13,47 @@ ...@@ -13,31 +13,47 @@
# limitations under the License. # limitations under the License.
import unittest import unittest
from test_mul_op import TestMulOp, TestMulOp2, TestFP16MulOp1, TestFP16MulOp2 import numpy as np
import math
from op_test import OpTest
class TestMKLDNNMulOp(TestMulOp): def quantize_max_abs(x, num_bits):
def init_op_test(self): range = math.pow(2, num_bits) - 1
super(TestMKLDNNMulOp, self).setUp() scale = np.max(np.abs(x).flatten())
self.attrs = {"use_mkldnn": True} y = np.round(x / scale * range)
return y, scale
class TestMKLDNNMulOp2(TestMulOp2): def dequantize_max_abs(x, num_bits, scale):
def init_op_test(self): range = math.pow(2, num_bits) - 1
super(TestMKLDNNMulOp2, self).setUp() y = (scale / range) * x
self.attrs = {"use_mkldnn": True} return y
class TestMKLDNNFP16MulOp1(TestFP16MulOp1): class TestFakeDequantizeMaxAbsOp(OpTest):
def init_op_test(self): def set_args(self):
super(TestMKLDNNFP16MulOp1, self).setUp() self.num_bits = 8
self.attrs = {"use_mkldnn": True}
def setUp(self):
self.set_args()
self.op_type = "fake_dequantize_max_abs"
x = np.random.randn(31, 65).astype("float32")
yq, scale = quantize_max_abs(x, self.num_bits)
print 'scale ', scale
ydq = dequantize_max_abs(yq, self.num_bits, scale)
class TestMKLDNNFP16MulOp2(TestFP16MulOp2): self.inputs = {'X': yq}
def init_op_test(self): self.attrs = {'num_bits': self.num_bits, 'scale': float(scale)}
super(TestMKLDNNFP16MulOp2, self).setUp() self.outputs = {'Out': ydq}
self.attrs = {"use_mkldnn": True}
def test_check_output(self):
self.check_output()
class TestFakeDequantizeMaxAbsOp5Bits(OpTest):
def set_args(self):
self.num_bits = 5
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -21,12 +21,10 @@ from op_test import OpTest ...@@ -21,12 +21,10 @@ from op_test import OpTest
class TestMulOp(OpTest): class TestMulOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "mul" self.op_type = "mul"
self.use_mkldnn = False
self.inputs = { self.inputs = {
'X': np.random.random((32, 84)).astype("float32"), 'X': np.random.random((32, 84)).astype("float32"),
'Y': np.random.random((84, 100)).astype("float32") 'Y': np.random.random((84, 100)).astype("float32")
} }
self.attrs = {'use_mkldnn': self.use_mkldnn}
self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
def test_check_output(self): def test_check_output(self):
...@@ -47,16 +45,11 @@ class TestMulOp(OpTest): ...@@ -47,16 +45,11 @@ class TestMulOp(OpTest):
class TestMulOp2(OpTest): class TestMulOp2(OpTest):
def setUp(self): def setUp(self):
self.op_type = "mul" self.op_type = "mul"
self.use_mkldnn = False
self.inputs = { self.inputs = {
'X': np.random.random((15, 4, 12, 10)).astype("float32"), 'X': np.random.random((15, 4, 12, 10)).astype("float32"),
'Y': np.random.random((4, 30, 8, 2, 9)).astype("float32") 'Y': np.random.random((4, 30, 8, 2, 9)).astype("float32")
} }
self.attrs = { self.attrs = {'x_num_col_dims': 2, 'y_num_col_dims': 2}
'x_num_col_dims': 2,
'y_num_col_dims': 2,
'use_mkldnn': self.use_mkldnn
}
result = np.dot(self.inputs['X'].reshape(15 * 4, 12 * 10), result = np.dot(self.inputs['X'].reshape(15 * 4, 12 * 10),
self.inputs['Y'].reshape(4 * 30, 8 * 2 * 9)) self.inputs['Y'].reshape(4 * 30, 8 * 2 * 9))
result = result.reshape(15, 4, 8, 2, 9) result = result.reshape(15, 4, 8, 2, 9)
...@@ -80,11 +73,9 @@ class TestMulOp2(OpTest): ...@@ -80,11 +73,9 @@ class TestMulOp2(OpTest):
class TestFP16MulOp1(OpTest): class TestFP16MulOp1(OpTest):
def setUp(self): def setUp(self):
self.op_type = "mul" self.op_type = "mul"
self.use_mkldnn = False
x = np.random.random((32, 84)).astype("float16") x = np.random.random((32, 84)).astype("float16")
y = np.random.random((84, 100)).astype("float16") y = np.random.random((84, 100)).astype("float16")
self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)} self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)}
self.attrs = {'use_mkldnn': self.use_mkldnn}
self.outputs = {'Out': np.dot(x, y)} self.outputs = {'Out': np.dot(x, y)}
def test_check_output(self): def test_check_output(self):
...@@ -97,15 +88,10 @@ class TestFP16MulOp1(OpTest): ...@@ -97,15 +88,10 @@ class TestFP16MulOp1(OpTest):
class TestFP16MulOp2(OpTest): class TestFP16MulOp2(OpTest):
def setUp(self): def setUp(self):
self.op_type = "mul" self.op_type = "mul"
self.use_mkldnn = False
x = np.random.random((15, 4, 12, 10)).astype("float16") x = np.random.random((15, 4, 12, 10)).astype("float16")
y = np.random.random((4, 30, 8, 2, 9)).astype("float16") y = np.random.random((4, 30, 8, 2, 9)).astype("float16")
self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)} self.inputs = {'X': x.view(np.uint16), 'Y': y.view(np.uint16)}
self.attrs = { self.attrs = {'x_num_col_dims': 2, 'y_num_col_dims': 2}
'x_num_col_dims': 2,
'y_num_col_dims': 2,
'use_mkldnn': self.use_mkldnn
}
result = np.dot( result = np.dot(
x.reshape(15 * 4, 12 * 10), y.reshape(4 * 30, 8 * 2 * 9)) x.reshape(15 * 4, 12 * 10), y.reshape(4 * 30, 8 * 2 * 9))
result = result.reshape(15, 4, 8, 2, 9) result = result.reshape(15, 4, 8, 2, 9)
......
...@@ -63,10 +63,7 @@ class TestOperator(unittest.TestCase): ...@@ -63,10 +63,7 @@ class TestOperator(unittest.TestCase):
self.assertEqual(mul_op.output("Out"), ["mul.out"]) self.assertEqual(mul_op.output("Out"), ["mul.out"])
self.assertEqual( self.assertEqual(
set(mul_op.attr_names), set(mul_op.attr_names),
set([ set(["x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var"]))
"x_num_col_dims", "y_num_col_dims", "use_mkldnn", "op_role",
"op_role_var"
]))
self.assertEqual(mul_op.has_attr("x_num_col_dims"), True) self.assertEqual(mul_op.has_attr("x_num_col_dims"), True)
self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT) self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT)
self.assertEqual(mul_op.attr("x_num_col_dims"), 1) self.assertEqual(mul_op.attr("x_num_col_dims"), 1)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle.fluid.core as core
from op_test import OpTest
class TestRandomCropOp(OpTest):
    """Check ``random_crop`` by matching each crop against all valid windows."""

    def setUp(self):
        """Prepare five identical 3x4 images and the list of legal 2x3 crops."""
        image = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
        crop_rows, crop_cols = 2, 3
        # Every 2x3 window of the 3x4 image, enumerated by top-left corner in
        # row-major order: (0, 0), (0, 1), (1, 0), (1, 1).
        self.possible_res = [
            image[top:top + crop_rows, left:left + crop_cols]
            for top in (0, 1) for left in (0, 1)
        ]
        self.op_type = "random_crop"
        batch = np.array([image] * 5).astype("float32")
        self.inputs = {'X': batch, 'Seed': np.array([10])}
        # Empty arrays here; the real results are inspected in verify_output.
        self.outputs = {'Out': np.array([]), 'SeedOut': np.array([])}
        self.attrs = {'shape': [crop_rows, crop_cols]}

    def test_check_output(self):
        """Run the op and delegate result checking to ``verify_output``."""
        self.check_output_customized(self.verify_output)

    def verify_output(self, outs):
        """Assert that every item of ``outs[1]`` equals one candidate crop."""
        # NOTE(review): index 1 is presumably the cropped 'Out' tensor rather
        # than 'SeedOut' -- confirm against the fetch order used by OpTest.
        crops = np.array(outs[1])
        for ins in crops:
            is_equal = [(ins == res).all() for res in self.possible_res]
            self.assertIn(True, is_equal)
# Run the test cases when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册