提交 b07ebb37 编写于 作者: Q Qiao Longfei

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into...

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into dist-table-support-optimizer-regular
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <cstddef> // for size_t
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -26,6 +27,7 @@ struct ExecutionStrategy { ...@@ -26,6 +27,7 @@ struct ExecutionStrategy {
bool allow_op_delay_{false}; bool allow_op_delay_{false};
size_t num_iteration_per_drop_scope_{100}; size_t num_iteration_per_drop_scope_{100};
ExecutorType type_{kDefault}; ExecutorType type_{kDefault};
bool dry_run_{false};
}; };
} // namespace details } // namespace details
......
...@@ -128,7 +128,9 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( ...@@ -128,7 +128,9 @@ void FastThreadedSSAGraphExecutor::RunOpAsync(
size_t complete = 0; size_t complete = 0;
while (op_to_run != nullptr) { while (op_to_run != nullptr) {
try { try {
op_to_run->Run(strategy_.use_cuda_); if (LIKELY(!strategy_.dry_run_)) {
op_to_run->Run(strategy_.use_cuda_);
}
++complete; ++complete;
} catch (...) { } catch (...) {
exception_.Catch(std::current_exception()); exception_.Catch(std::current_exception());
......
...@@ -211,7 +211,9 @@ void ThreadedSSAGraphExecutor::RunOp( ...@@ -211,7 +211,9 @@ void ThreadedSSAGraphExecutor::RunOp(
if (VLOG_IS_ON(10)) { if (VLOG_IS_ON(10)) {
VLOG(10) << op << " " << op->Name() << " : " << op->DebugString(); VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
} }
op->Run(strategy_.use_cuda_); if (LIKELY(!strategy_.dry_run_)) {
op->Run(strategy_.use_cuda_);
}
VLOG(10) << op << " " << op->Name() << " Done "; VLOG(10) << op << " " << op->Name() << " Done ";
running_ops_--; running_ops_--;
ready_var_q->Extend(op->Outputs()); ready_var_q->Extend(op->Outputs());
......
...@@ -48,7 +48,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { ...@@ -48,7 +48,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
// Use topological sort algorithm // Use topological sort algorithm
FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override; FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
~ThreadedSSAGraphExecutor() {} ~ThreadedSSAGraphExecutor() final = default;
private: private:
void RunOp(const std::shared_ptr<BlockingQueue<VarHandleBase *>> &ready_var_q, void RunOp(const std::shared_ptr<BlockingQueue<VarHandleBase *>> &ready_var_q,
......
...@@ -38,9 +38,20 @@ class ParallelExecutorPrivate { ...@@ -38,9 +38,20 @@ class ParallelExecutorPrivate {
explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places) explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
: places_(places) {} : places_(places) {}
~ParallelExecutorPrivate() {
if (own_local_scope_) {
for (size_t i = 1; i < local_scopes_.size(); ++i) {
// Skip the first scope, since it is the global scope.
Scope *local_scope = local_scopes_[i];
if (global_scope_->HasKid(local_scope)) {
global_scope_->DeleteScope(local_scope);
}
}
}
}
std::vector<platform::Place> places_; std::vector<platform::Place> places_;
std::vector<Scope *> local_scopes_; std::vector<Scope *> local_scopes_;
Scope *global_scope_; Scope *global_scope_; // not owned
std::unique_ptr<details::SSAGraphExecutor> executor_; std::unique_ptr<details::SSAGraphExecutor> executor_;
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -306,16 +317,6 @@ ParallelExecutor::~ParallelExecutor() { ...@@ -306,16 +317,6 @@ ParallelExecutor::~ParallelExecutor() {
for (auto &p : member_->places_) { for (auto &p : member_->places_) {
platform::DeviceContextPool::Instance().Get(p)->Wait(); platform::DeviceContextPool::Instance().Get(p)->Wait();
} }
if (member_->own_local_scope_) {
for (size_t i = 1; i < member_->local_scopes_.size(); ++i) {
Scope *local_scope = member_->local_scopes_[i];
if (member_->global_scope_->HasKid(local_scope)) {
member_->global_scope_->DeleteScope(local_scope);
}
}
}
// member_ must be destructed before gcs_ since the destructor of // member_ must be destructed before gcs_ since the destructor of
// ReferenceCountOpHandle use raw pointers of gcs_ inside. // ReferenceCountOpHandle use raw pointers of gcs_ inside.
member_.reset(); member_.reset();
......
if(WITH_TESTING) if(WITH_TESTING)
include(test.cmake) # some generic cmake funtion for inference include(tests/test.cmake) # some generic cmake funtion for inference
endif() endif()
# analysis and tensorrt must be added before creating static library, # analysis and tensorrt must be added before creating static library,
# otherwise, there would be undefined reference to them in static library. # otherwise, there would be undefined reference to them in static library.
......
...@@ -18,6 +18,21 @@ namespace paddle { ...@@ -18,6 +18,21 @@ namespace paddle {
namespace inference { namespace inference {
namespace tensorrt { namespace tensorrt {
bool to_skip_merging_optimize(TensorRTEngine* engine_,
const std::vector<int>& filters,
const std::vector<int>& strides,
const std::vector<int>& paddings,
std::string input_name) {
if (engine_->itensor_quote_num[input_name] > 0) {
return true;
}
if (filters[0] == 1 && filters[1] == 1 && strides[0] == 1 &&
strides[1] == 1 && paddings[0] == 0 && paddings[1] == 0)
engine_->itensor_quote_num[input_name] += 1;
return false;
}
class Conv2dOpConverter : public OpConverter { class Conv2dOpConverter : public OpConverter {
public: public:
void operator()(const framework::proto::OpDesc& op, void operator()(const framework::proto::OpDesc& op,
...@@ -31,6 +46,7 @@ class Conv2dOpConverter : public OpConverter { ...@@ -31,6 +46,7 @@ class Conv2dOpConverter : public OpConverter {
PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1);
auto* X = engine_->GetITensor(op_desc.Input("Input").front()); auto* X = engine_->GetITensor(op_desc.Input("Input").front());
// Declare weights // Declare weights
auto* Y_v = scope.FindVar(op_desc.Input("Filter").front()); auto* Y_v = scope.FindVar(op_desc.Input("Filter").front());
PADDLE_ENFORCE_NOT_NULL(Y_v); PADDLE_ENFORCE_NOT_NULL(Y_v);
...@@ -83,7 +99,10 @@ class Conv2dOpConverter : public OpConverter { ...@@ -83,7 +99,10 @@ class Conv2dOpConverter : public OpConverter {
std::move(weight_tensor); std::move(weight_tensor);
layer->getOutput(0)->setName(output_name.c_str()); layer->getOutput(0)->setName(output_name.c_str());
engine_->SetITensor(output_name, layer->getOutput(0)); engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) {
if (test_mode ||
to_skip_merging_optimize(engine_, {filter_h, filter_w}, strides,
paddings, op_desc.Input("Input").front())) {
engine_->DeclareOutput(output_name); engine_->DeclareOutput(output_name);
} }
} }
......
...@@ -133,6 +133,10 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset, ...@@ -133,6 +133,10 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset,
buffer_sizes_[name] = 0; buffer_sizes_[name] = 0;
} }
bool TensorRTEngine::HasDeclared(const std::string &name) {
return buffer_sizes_.count(name) > 0;
}
void TensorRTEngine::DeclareOutput(const std::string &name) { void TensorRTEngine::DeclareOutput(const std::string &name) {
PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s", PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
name); name);
......
...@@ -91,6 +91,8 @@ class TensorRTEngine : public EngineBase { ...@@ -91,6 +91,8 @@ class TensorRTEngine : public EngineBase {
const std::string& name); const std::string& name);
// Set the itensor_map_[name] as the network's output, and set its name. // Set the itensor_map_[name] as the network's output, and set its name.
void DeclareOutput(const std::string& name); void DeclareOutput(const std::string& name);
// Check if the ITensor has been declared
bool HasDeclared(const std::string& name);
// GPU memory address for an ITensor with specific name. One can operate on // GPU memory address for an ITensor with specific name. One can operate on
// these memory directly for acceleration, for example, output the converted // these memory directly for acceleration, for example, output the converted
...@@ -132,6 +134,16 @@ class TensorRTEngine : public EngineBase { ...@@ -132,6 +134,16 @@ class TensorRTEngine : public EngineBase {
std::unordered_map<std::string /*name*/, std::unique_ptr<framework::Tensor>> std::unordered_map<std::string /*name*/, std::unique_ptr<framework::Tensor>>
weight_map; weight_map;
// TODO: (NHZLX)
// In the normal case, the paddle-trt exists bug when runing the googlenet.
// When there are more than two convolutions of 1 * 1 with the same input, the
// paddle-tensorrt will do the merging optimization, which fuse those conv
// into
// one conv, and then trigger bug. So, We should use strategy to avoid this
// optimization for the time being. This bug will be fixed in the future.
std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
itensor_quote_num;
private: private:
// the max batch size // the max batch size
int max_batch_; int max_batch_;
......
set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor) set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor)
function(download_model install_dir model_name)
if (NOT EXISTS ${install_dir})
inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${model_name})
endif()
endfunction()
function(download_model_and_data install_dir model_name data_name) function(download_model_and_data install_dir model_name data_name)
if (NOT EXISTS ${install_dir}) if (NOT EXISTS ${install_dir})
inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${model_name}) inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${model_name})
...@@ -13,6 +19,13 @@ function(inference_analysis_api_test target install_dir filename) ...@@ -13,6 +19,13 @@ function(inference_analysis_api_test target install_dir filename)
ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt) ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt)
endfunction() endfunction()
function(inference_analysis_api_test_with_fake_data target install_dir filename model_name)
download_model(${install_dir} ${model_name})
inference_analysis_test(${target} SRCS ${filename}
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${install_dir}/model)
endfunction()
# RNN1 # RNN1
if(NOT APPLE) if(NOT APPLE)
set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1") set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1")
...@@ -61,17 +74,13 @@ inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} ana ...@@ -61,17 +74,13 @@ inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} ana
# ocr # ocr
set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr") set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
if (NOT EXISTS ${OCR_INSTALL_DIR}) if (NOT EXISTS ${OCR_INSTALL_DIR})
inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz") inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz")
endif() endif()
inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc) inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc)
# resnet50 # resnet50
set(RESNET50_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/resnet50") inference_analysis_api_test_with_fake_data(test_analyzer_resnet50
if (NOT EXISTS ${RESNET50_INSTALL_DIR}) "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz")
inference_download_and_uncompress(${RESNET50_INSTALL_DIR} ${INFERENCE_URL} "resnet50_model.tar.gz")
endif()
inference_analysis_test(test_analyzer_resnet50 SRCS analyzer_resnet50_tester.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${RESNET50_INSTALL_DIR}/model)
# anakin # anakin
if (WITH_ANAKIN AND WITH_MKL) # only needed in CI if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
......
...@@ -30,25 +30,7 @@ void SetConfig(AnalysisConfig *cfg) { ...@@ -30,25 +30,7 @@ void SetConfig(AnalysisConfig *cfg) {
} }
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data."); SetFakeImageInput(inputs, FLAGS_infer_model);
PaddleTensor input;
// channel=3, height/width=318
std::vector<int> shape({FLAGS_batch_size, 3, 318, 318});
input.shape = shape;
input.dtype = PaddleDType::FLOAT32;
// fill input data, for profile easily, do not use random data here.
size_t size = FLAGS_batch_size * 3 * 318 * 318;
input.data.Resize(size * sizeof(float));
float *input_data = static_cast<float *>(input.data.data());
for (size_t i = 0; i < size; i++) {
*(input_data + i) = static_cast<float>(i) / size;
}
std::vector<PaddleTensor> input_slots;
input_slots.assign({input});
(*inputs).emplace_back(input_slots);
} }
// Easy for profiling independently. // Easy for profiling independently.
...@@ -61,13 +43,6 @@ void profile(bool use_mkldnn = false) { ...@@ -61,13 +43,6 @@ void profile(bool use_mkldnn = false) {
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
size_t size = GetSize(outputs[0]);
// output is a 512-dimension feature
EXPECT_EQ(size, 512 * FLAGS_batch_size);
}
} }
TEST(Analyzer_resnet50, profile) { profile(); } TEST(Analyzer_resnet50, profile) { profile(); }
...@@ -83,8 +58,7 @@ TEST(Analyzer_resnet50, fuse_statis) { ...@@ -83,8 +58,7 @@ TEST(Analyzer_resnet50, fuse_statis) {
auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg); auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
auto fuse_statis = GetFuseStatis( auto fuse_statis = GetFuseStatis(
static_cast<AnalysisPredictor *>(predictor.get()), &num_ops); static_cast<AnalysisPredictor *>(predictor.get()), &num_ops);
ASSERT_TRUE(fuse_statis.count("fc_fuse")); LOG(INFO) << "num_ops: " << num_ops;
EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
} }
// Compare result of NativeConfig and AnalysisConfig // Compare result of NativeConfig and AnalysisConfig
......
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/tests/test_helper.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
DEFINE_string(infer_model, "", "model path"); DEFINE_string(infer_model, "", "model path");
...@@ -105,6 +106,34 @@ std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor, ...@@ -105,6 +106,34 @@ std::unordered_map<std::string, int> GetFuseStatis(PaddlePredictor *predictor,
return fuse_statis; return fuse_statis;
} }
void SetFakeImageInput(std::vector<std::vector<PaddleTensor>> *inputs,
const std::string &dirname) {
// Set fake_image_data
PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data.");
std::vector<std::vector<int64_t>> feed_target_shapes =
GetFeedTargetShapes(dirname, true, "model", "params");
int dim1 = feed_target_shapes[0][1];
int dim2 = feed_target_shapes[0][2];
int dim3 = feed_target_shapes[0][3];
PaddleTensor input;
std::vector<int> shape({FLAGS_batch_size, dim1, dim2, dim3});
input.shape = shape;
input.dtype = PaddleDType::FLOAT32;
// fill input data, for profile easily, do not use random data here.
size_t size = FLAGS_batch_size * dim1 * dim2 * dim3;
input.data.Resize(size * sizeof(float));
float *input_data = static_cast<float *>(input.data.data());
for (size_t i = 0; i < size; i++) {
*(input_data + i) = static_cast<float>(i) / size;
}
std::vector<PaddleTensor> input_slots;
input_slots.assign({input});
(*inputs).emplace_back(input_slots);
}
void TestOneThreadPrediction( void TestOneThreadPrediction(
const AnalysisConfig &config, const AnalysisConfig &config,
const std::vector<std::vector<PaddleTensor>> &inputs, const std::vector<std::vector<PaddleTensor>> &inputs,
......
...@@ -93,11 +93,16 @@ void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) { ...@@ -93,11 +93,16 @@ void CompareTensorRTWithFluid(int batch_size, std::string model_dirname) {
} }
} }
TEST(trt_models_test, main) { TEST(trt_models_test, mobilenet) {
std::vector<std::string> infer_models = {"mobilenet", "resnet50", CompareTensorRTWithFluid(1, FLAGS_dirname + "/mobilenet");
"resnext50"}; }
for (auto &model_dir : infer_models) {
CompareTensorRTWithFluid(1, FLAGS_dirname + "/" + model_dir); TEST(trt_models_test, resnet50) {
} CompareTensorRTWithFluid(1, FLAGS_dirname + "/resnet50");
} }
TEST(trt_models_test, resnext50) {
CompareTensorRTWithFluid(1, FLAGS_dirname + "/resnext50");
}
} // namespace paddle } // namespace paddle
...@@ -18,7 +18,6 @@ limitations under the License. */ ...@@ -18,7 +18,6 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/io.h" #include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -94,15 +93,15 @@ void CheckError(const paddle::framework::LoDTensor& output1, ...@@ -94,15 +93,15 @@ void CheckError(const paddle::framework::LoDTensor& output1,
std::unique_ptr<paddle::framework::ProgramDesc> InitProgram( std::unique_ptr<paddle::framework::ProgramDesc> InitProgram(
paddle::framework::Executor* executor, paddle::framework::Scope* scope, paddle::framework::Executor* executor, paddle::framework::Scope* scope,
const std::string& dirname, const bool is_combined = false) { const std::string& dirname, const bool is_combined = false,
const std::string& prog_filename = "__model_combined__",
const std::string& param_filename = "__params_combined__") {
std::unique_ptr<paddle::framework::ProgramDesc> inference_program; std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
if (is_combined) { if (is_combined) {
// All parameters are saved in a single file. // All parameters are saved in a single file.
// Hard-coding the file names of program and parameters in unittest. // Hard-coding the file names of program and parameters in unittest.
// The file names should be consistent with that used in Python API // The file names should be consistent with that used in Python API
// `fluid.io.save_inference_model`. // `fluid.io.save_inference_model`.
std::string prog_filename = "__model_combined__";
std::string param_filename = "__params_combined__";
inference_program = inference_program =
paddle::inference::Load(executor, scope, dirname + "/" + prog_filename, paddle::inference::Load(executor, scope, dirname + "/" + prog_filename,
dirname + "/" + param_filename); dirname + "/" + param_filename);
...@@ -115,12 +114,15 @@ std::unique_ptr<paddle::framework::ProgramDesc> InitProgram( ...@@ -115,12 +114,15 @@ std::unique_ptr<paddle::framework::ProgramDesc> InitProgram(
} }
std::vector<std::vector<int64_t>> GetFeedTargetShapes( std::vector<std::vector<int64_t>> GetFeedTargetShapes(
const std::string& dirname, const bool is_combined = false) { const std::string& dirname, const bool is_combined = false,
const std::string& prog_filename = "__model_combined__",
const std::string& param_filename = "__params_combined__") {
auto place = paddle::platform::CPUPlace(); auto place = paddle::platform::CPUPlace();
auto executor = paddle::framework::Executor(place); auto executor = paddle::framework::Executor(place);
auto* scope = new paddle::framework::Scope(); auto* scope = new paddle::framework::Scope();
auto inference_program = InitProgram(&executor, scope, dirname, is_combined); auto inference_program = InitProgram(&executor, scope, dirname, is_combined,
prog_filename, param_filename);
auto& global_block = inference_program->Block(0); auto& global_block = inference_program->Block(0);
const std::vector<std::string>& feed_target_names = const std::vector<std::string>& feed_target_names =
...@@ -136,15 +138,6 @@ std::vector<std::vector<int64_t>> GetFeedTargetShapes( ...@@ -136,15 +138,6 @@ std::vector<std::vector<int64_t>> GetFeedTargetShapes(
return feed_target_shapes; return feed_target_shapes;
} }
void Compile(paddle::framework::ProgramDesc* program) {
std::unique_ptr<paddle::framework::ir::Graph> g(
new paddle::framework::ir::Graph(*program));
auto pass = paddle::framework::ir::PassRegistry::Instance().Get(
"graph_to_program_pass");
pass->SetNotOwned<paddle::framework::ProgramDesc>("program", program);
pass->Apply(std::move(g));
}
template <typename Place, bool CreateVars = true, bool PrepareContext = false> template <typename Place, bool CreateVars = true, bool PrepareContext = false>
void TestInference(const std::string& dirname, void TestInference(const std::string& dirname,
const std::vector<paddle::framework::LoDTensor*>& cpu_feeds, const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
...@@ -182,7 +175,6 @@ void TestInference(const std::string& dirname, ...@@ -182,7 +175,6 @@ void TestInference(const std::string& dirname,
paddle::platform::DeviceContextPool::Instance().Get(place)); paddle::platform::DeviceContextPool::Instance().Get(place));
inference_program = InitProgram(&executor, scope, dirname, is_combined); inference_program = InitProgram(&executor, scope, dirname, is_combined);
} }
Compile(inference_program.get());
// Disable the profiler and print the timing information // Disable the profiler and print the timing information
paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault, paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault,
...@@ -261,5 +253,3 @@ void TestInference(const std::string& dirname, ...@@ -261,5 +253,3 @@ void TestInference(const std::string& dirname,
delete scope; delete scope;
} }
USE_PASS(graph_to_program_pass);
...@@ -286,10 +286,10 @@ int GRPCVariableResponse::Parse(Source* source) { ...@@ -286,10 +286,10 @@ int GRPCVariableResponse::Parse(Source* source) {
platform::EnableProfiler(platform::ProfilerState::kCPU); platform::EnableProfiler(platform::ProfilerState::kCPU);
} else if (profiling == platform::kDisableProfiler && } else if (profiling == platform::kDisableProfiler &&
platform::IsProfileEnabled()) { platform::IsProfileEnabled()) {
// TODO(panyx0718): Should we allow to customize file dir.
platform::DisableProfiler( platform::DisableProfiler(
platform::EventSortingKey::kDefault, platform::EventSortingKey::kDefault,
string::Sprintf("/tmp/profile_ps_%lld", listener_id)); string::Sprintf("%s_%lld", FLAGS_rpc_server_profile_path,
listener_id));
} }
break; break;
} }
......
...@@ -51,7 +51,6 @@ bool RequestSendHandler::Handle(const std::string& varname, ...@@ -51,7 +51,6 @@ bool RequestSendHandler::Handle(const std::string& varname,
// Async // Async
if (!sync_mode_) { if (!sync_mode_) {
VLOG(3) << "async process var: " << varname; VLOG(3) << "async process var: " << varname;
rpc_server_->Profiler().OneStep();
try { try {
executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(),
scope); scope);
......
...@@ -20,42 +20,10 @@ ...@@ -20,42 +20,10 @@
#include "paddle/fluid/operators/distributed/rpc_server.h" #include "paddle/fluid/operators/distributed/rpc_server.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
DEFINE_int32(rpc_server_profile_period, 0,
"the period of listen_and_serv to do profile");
DEFINE_string(rpc_server_profile_path, "/dev/null",
"the profile log file path");
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace distributed { namespace distributed {
RPCServerProfiler::RPCServerProfiler(int profile_period,
const std::string& profile_log_path)
: profile_period_(profile_period), profile_log_path_(profile_log_path) {
step_ = 0;
}
void RPCServerProfiler::OneStep() {
PADDLE_ENFORCE_LE(step_, profile_period_,
"step_ should not be larger then "
"profile_period_");
if (profile_period_ <= 0) {
return;
}
if (step_ == 0) {
auto pf_state = paddle::platform::ProfilerState::kCPU;
paddle::platform::EnableProfiler(pf_state);
}
if (step_ == profile_period_) {
paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kTotal,
profile_log_path_);
step_ = 0;
} else {
step_++;
}
}
void RPCServer::ShutDown() { void RPCServer::ShutDown() {
LOG(INFO) << "RPCServer ShutDown "; LOG(INFO) << "RPCServer ShutDown ";
ShutDownImpl(); ShutDownImpl();
......
...@@ -23,30 +23,14 @@ ...@@ -23,30 +23,14 @@
#include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/request_handler.h"
DECLARE_int32(rpc_server_profile_period);
DECLARE_string(rpc_server_profile_path);
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace distributed { namespace distributed {
class RPCServerProfiler {
public:
RPCServerProfiler(int profile_period, const std::string& profile_log_path);
void OneStep();
private:
const int profile_period_;
std::string profile_log_path_;
int step_;
};
class RPCServer { class RPCServer {
public: public:
explicit RPCServer(const std::string& address, int client_num) explicit RPCServer(const std::string& address, int client_num)
: cur_cond_(0), : cur_cond_(0),
profiler_(FLAGS_rpc_server_profile_period,
FLAGS_rpc_server_profile_path),
bind_address_(address), bind_address_(address),
exit_flag_(false), exit_flag_(false),
selected_port_(0), selected_port_(0),
...@@ -86,7 +70,6 @@ class RPCServer { ...@@ -86,7 +70,6 @@ class RPCServer {
void Complete(); void Complete();
void ResetBarrierCounter(); void ResetBarrierCounter();
RPCServerProfiler& Profiler() { return profiler_; }
bool NeedResetAllVars(); bool NeedResetAllVars();
...@@ -101,7 +84,6 @@ class RPCServer { ...@@ -101,7 +84,6 @@ class RPCServer {
std::unordered_map<std::string, int> rpc_cond_map_; std::unordered_map<std::string, int> rpc_cond_map_;
std::atomic<int> cur_cond_; std::atomic<int> cur_cond_;
std::condition_variable rpc_cond_; std::condition_variable rpc_cond_;
RPCServerProfiler profiler_;
protected: protected:
std::string bind_address_; std::string bind_address_;
......
...@@ -16,6 +16,9 @@ ...@@ -16,6 +16,9 @@
#include <vector> #include <vector>
#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h"
DEFINE_string(rpc_server_profile_path, "./profile_ps",
"the profile log file path");
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace distributed { namespace distributed {
......
...@@ -27,6 +27,8 @@ ...@@ -27,6 +27,8 @@
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h"
DECLARE_string(rpc_server_profile_path);
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace distributed { namespace distributed {
......
...@@ -134,7 +134,6 @@ void ListenAndServOp::RunSyncLoop( ...@@ -134,7 +134,6 @@ void ListenAndServOp::RunSyncLoop(
rpc_service_->ResetBarrierCounter(); rpc_service_->ResetBarrierCounter();
while (true) { while (true) {
rpc_service_->Profiler().OneStep();
// Get from multiple trainers, we don't care about the order in which // Get from multiple trainers, we don't care about the order in which
// the gradients arrives, just add suffix 0~n and merge the gradient. // the gradients arrives, just add suffix 0~n and merge the gradient.
rpc_service_->SetCond(distributed::kRequestSend); rpc_service_->SetCond(distributed::kRequestSend);
......
...@@ -28,7 +28,7 @@ __device__ __forceinline__ double real_log(double x) { return log(x); } ...@@ -28,7 +28,7 @@ __device__ __forceinline__ double real_log(double x) { return log(x); }
__device__ __forceinline__ platform::float16 real_log( __device__ __forceinline__ platform::float16 real_log(
const platform::float16& val) { const platform::float16& val) {
return static_cast<platform::float16>(hlog(static_cast<half>(val))); return static_cast<platform::float16>(logf(static_cast<float>(val)));
} }
template <typename T> template <typename T>
......
...@@ -36,7 +36,7 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M, ...@@ -36,7 +36,7 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
.template Get<jitkernel::VAddReluKernel<T>>(N); .template Get<jitkernel::VAddReluKernel<T>>(N);
for (int i = 0; i < M; i++) { for (int i = 0; i < M; i++) {
T* dst = Y + i * N; T* dst = Y + i * N;
vaddrelu->Compute(B, dst, dst); vaddrelu->Compute(B, dst, dst, N);
} }
} else { } else {
const auto& vadd = jitkernel::KernelPool::Instance() const auto& vadd = jitkernel::KernelPool::Instance()
...@@ -47,7 +47,7 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M, ...@@ -47,7 +47,7 @@ inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
#endif #endif
for (int i = 0; i < M; i++) { for (int i = 0; i < M; i++) {
T* dst = Y + i * N; T* dst = Y + i * N;
vadd->Compute(B, dst, dst); vadd->Compute(B, dst, dst, N);
} }
} }
} }
......
...@@ -24,19 +24,29 @@ namespace gen { ...@@ -24,19 +24,29 @@ namespace gen {
using namespace platform::jit; // NOLINT using namespace platform::jit; // NOLINT
bool VMulJitCode::init(int d) { bool VVVJitCode::init(int d) {
// It's not necessary to use avx512 since it would slow down the frequency // It's not necessary to use avx512 since it would slow down the frequency
// and this kernel is not compute bound. // and this kernel is not compute bound.
return MayIUse(avx); return MayIUse(avx);
} }
void VMulJitCode::generate() { void VVVJitCode::generate() {
// do not need push stack, and do not need save avx512reg if do not use avx512 // do not need push stack, and do not need save avx512reg if do not use avx512
int offset = 0; int offset = 0;
if (with_relu_) {
vxorps(ymm_zero, ymm_zero, ymm_zero);
}
for (int i = 0; i < num_ / AVX_FLOAT_BLOCK; ++i) { for (int i = 0; i < num_ / AVX_FLOAT_BLOCK; ++i) {
vmovups(ymm_src1, ptr[param1 + offset]); vmovups(ymm_src1, ptr[param1 + offset]);
vmovups(ymm_src2, ptr[param2 + offset]); vmovups(ymm_src2, ptr[param2 + offset]);
vmulps(ymm_dst, ymm_src1, ymm_src2); if (type_ == operand_type::mul) {
vmulps(ymm_dst, ymm_src1, ymm_src2);
} else if (type_ == operand_type::add) {
vaddps(ymm_dst, ymm_src1, ymm_src2);
}
if (with_relu_) {
vmaxps(ymm_dst, ymm_zero, ymm_dst);
}
vmovups(ptr[param3 + offset], ymm_dst); vmovups(ptr[param3 + offset], ymm_dst);
offset += sizeof(float) * AVX_FLOAT_BLOCK; offset += sizeof(float) * AVX_FLOAT_BLOCK;
} }
...@@ -44,7 +54,14 @@ void VMulJitCode::generate() { ...@@ -44,7 +54,14 @@ void VMulJitCode::generate() {
if (rest >= 4) { if (rest >= 4) {
vmovups(xmm_src1, ptr[param1 + offset]); vmovups(xmm_src1, ptr[param1 + offset]);
vmovups(xmm_src2, ptr[param2 + offset]); vmovups(xmm_src2, ptr[param2 + offset]);
vmulps(xmm_dst, xmm_src1, xmm_src2); if (type_ == operand_type::mul) {
vmulps(xmm_dst, xmm_src1, xmm_src2);
} else if (type_ == operand_type::add) {
vaddps(xmm_dst, xmm_src1, xmm_src2);
}
if (with_relu_) {
vmaxps(xmm_dst, xmm_zero, xmm_dst);
}
vmovups(ptr[param3 + offset], xmm_dst); vmovups(ptr[param3 + offset], xmm_dst);
offset += sizeof(float) * 4; offset += sizeof(float) * 4;
rest -= 4; rest -= 4;
...@@ -52,7 +69,14 @@ void VMulJitCode::generate() { ...@@ -52,7 +69,14 @@ void VMulJitCode::generate() {
if (rest >= 2) { if (rest >= 2) {
vmovq(xmm_src1, ptr[param1 + offset]); vmovq(xmm_src1, ptr[param1 + offset]);
vmovq(xmm_src2, ptr[param2 + offset]); vmovq(xmm_src2, ptr[param2 + offset]);
vmulps(xmm_dst, xmm_src1, xmm_src2); if (type_ == operand_type::mul) {
vmulps(xmm_dst, xmm_src1, xmm_src2);
} else if (type_ == operand_type::add) {
vaddps(xmm_dst, xmm_src1, xmm_src2);
}
if (with_relu_) {
vmaxps(xmm_dst, xmm_zero, xmm_dst);
}
vmovq(ptr[param3 + offset], xmm_dst); vmovq(ptr[param3 + offset], xmm_dst);
offset += sizeof(float) * 2; offset += sizeof(float) * 2;
rest -= 2; rest -= 2;
...@@ -60,12 +84,18 @@ void VMulJitCode::generate() { ...@@ -60,12 +84,18 @@ void VMulJitCode::generate() {
if (rest > 0) { if (rest > 0) {
vmovss(xmm_src1, ptr[param1 + offset]); vmovss(xmm_src1, ptr[param1 + offset]);
vmovss(xmm_src2, ptr[param2 + offset]); vmovss(xmm_src2, ptr[param2 + offset]);
vmulss(xmm_dst, xmm_src1, xmm_src2); if (type_ == operand_type::mul) {
vmulss(xmm_dst, xmm_src1, xmm_src2);
} else if (type_ == operand_type::add) {
vaddss(xmm_dst, xmm_src1, xmm_src2);
}
if (with_relu_) {
vmaxps(xmm_dst, xmm_zero, xmm_dst);
}
vmovss(ptr[param3 + offset], xmm_dst); vmovss(ptr[param3 + offset], xmm_dst);
} }
ret(); ret();
} }
} // namespace gen } // namespace gen
} // namespace jitkernel } // namespace jitkernel
} // namespace math } // namespace math
......
...@@ -14,8 +14,8 @@ limitations under the License. */ ...@@ -14,8 +14,8 @@ limitations under the License. */
#pragma once #pragma once
#include <string>
#include "paddle/fluid/operators/math/jit_gen.h" #include "paddle/fluid/operators/math/jit_gen.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
...@@ -29,28 +29,47 @@ using ymm_t = const Xbyak::Ymm; ...@@ -29,28 +29,47 @@ using ymm_t = const Xbyak::Ymm;
using zmm_t = const Xbyak::Zmm; using zmm_t = const Xbyak::Zmm;
using Label = Xbyak::Label; using Label = Xbyak::Label;
class VMulJitCode : public JitCode { // function: vec = Operand(vec, vec) (maybe with relu)
typedef enum { mul = 0, add } operand_type;
class VVVJitCode : public JitCode {
public: public:
DECLARE_JIT_CODE(VMulJitCode); const char* name() const override {
explicit VMulJitCode(int d, size_t code_size = 256 * 1024, std::string base = "VVVJitCode";
void* code_ptr = nullptr) if (type_ == operand_type::mul) {
: JitCode(code_size, code_ptr), num_(d) {} base += "_Mul";
} else if (type_ == operand_type::add) {
base += "_Add";
}
base += (with_relu_ ? "_relu" : "");
return base.c_str();
}
explicit VVVJitCode(int d, operand_type type, bool with_relu,
size_t code_size = 256 * 1024, void* code_ptr = nullptr)
: JitCode(code_size, code_ptr),
num_(d),
type_(type),
with_relu_(with_relu) {}
static bool init(int d); static bool init(int d);
void generate() override; void generate() override;
private: private:
int num_; int num_;
operand_type type_;
bool with_relu_;
reg64_t param1{abi_param1}; reg64_t param1{abi_param1};
reg64_t param2{abi_param2}; reg64_t param2{abi_param2};
reg64_t param3{abi_param3}; reg64_t param3{abi_param3};
xmm_t xmm_src1 = xmm_t(0); xmm_t xmm_src1 = xmm_t(0);
xmm_t xmm_src2 = xmm_t(1); xmm_t xmm_src2 = xmm_t(1);
xmm_t xmm_dst = xmm_t(2); xmm_t xmm_dst = xmm_t(1);
xmm_t xmm_zero = xmm_t(2);
ymm_t ymm_src1 = ymm_t(0); ymm_t ymm_src1 = ymm_t(0);
ymm_t ymm_src2 = ymm_t(1); ymm_t ymm_src2 = ymm_t(1);
ymm_t ymm_dst = ymm_t(2); ymm_t ymm_dst = ymm_t(1);
ymm_t ymm_zero = ymm_t(2);
}; };
} // namespace gen } // namespace gen
......
...@@ -71,26 +71,26 @@ class VMulKernel : public Kernel { ...@@ -71,26 +71,26 @@ class VMulKernel : public Kernel {
template <typename T> template <typename T>
class VAddKernel : public Kernel { class VAddKernel : public Kernel {
public: public:
virtual void Compute(const T *x, const T *y, T *z) const = 0; void (*Compute)(const T *, const T *, T *, int);
}; };
template <typename T> template <typename T>
class VScalKernel : public Kernel { class VAddReluKernel : public Kernel {
public: public:
virtual void Compute(const T a, const T *x, T *y) const = 0; void (*Compute)(const T *, const T *, T *, int);
virtual void Compute(const T a, T *x) const = 0;
}; };
template <typename T> template <typename T>
class VAddBiasKernel : public Kernel { class VScalKernel : public Kernel {
public: public:
virtual void Compute(const T a, const T *x, T *y) const = 0; virtual void Compute(const T a, const T *x, T *y) const = 0;
virtual void Compute(const T a, T *x) const = 0;
}; };
template <typename T> template <typename T>
class VAddReluKernel : public Kernel { class VAddBiasKernel : public Kernel {
public: public:
virtual void Compute(const T *x, const T *y, T *z) const = 0; virtual void Compute(const T a, const T *x, T *y) const = 0;
}; };
template <typename T> template <typename T>
......
...@@ -42,6 +42,21 @@ void VMulRefer(const T* x, const T* y, T* z, int n) { ...@@ -42,6 +42,21 @@ void VMulRefer(const T* x, const T* y, T* z, int n) {
} }
} }
template <typename T>
void VAddRefer(const T* x, const T* y, T* z, int n) {
for (int i = 0; i < n; ++i) {
z[i] = x[i] + y[i];
}
}
template <typename T>
void VAddReluRefer(const T* x, const T* y, T* z, int n) {
for (int i = 0; i < n; ++i) {
z[i] = x[i] + y[i];
z[i] = z[i] > 0 ? z[i] : 0;
}
}
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
template <typename T> template <typename T>
void VMulMKL(const T* x, const T* y, T* z, int n); void VMulMKL(const T* x, const T* y, T* z, int n);
...@@ -50,28 +65,45 @@ template <> ...@@ -50,28 +65,45 @@ template <>
void VMulMKL<float>(const float* x, const float* y, float* z, int n) { void VMulMKL<float>(const float* x, const float* y, float* z, int n) {
platform::dynload::vsMul(n, x, y, z); platform::dynload::vsMul(n, x, y, z);
} }
template <> template <>
void VMulMKL<double>(const double* x, const double* y, double* z, int n) { void VMulMKL<double>(const double* x, const double* y, double* z, int n) {
platform::dynload::vdMul(n, x, y, z); platform::dynload::vdMul(n, x, y, z);
} }
template <typename T>
void VAddMKL(const T* x, const T* y, T* z, int n);
template <>
void VAddMKL<float>(const float* x, const float* y, float* z, int n) {
platform::dynload::vsAdd(n, x, y, z);
}
template <>
void VAddMKL<double>(const double* x, const double* y, double* z, int n) {
platform::dynload::vdAdd(n, x, y, z);
}
#endif #endif
#define DECLARE_STATIC_FUNC \
static inline std::string name(int d) { \
PADDLE_THROW("DType should be either float or double"); \
} \
static inline bool useJIT(int d) { return false; } \
static inline bool useMKL(int d) { return false; }
/* VMUL JitKernel */ /* VMUL JitKernel */
template <typename T> template <typename T>
class VMulKernelImpl : public VMulKernel<T> { class VMulKernelImpl : public VMulKernel<T> {
public: public:
static inline std::string name(int d) { DECLARE_STATIC_FUNC;
PADDLE_THROW("DType should be either float or double");
}
static inline bool useJIT(int d) { return false; }
static inline bool useMKL(int d) { return false; }
explicit VMulKernelImpl(int d) : VMulKernel<T>() { explicit VMulKernelImpl(int d) : VMulKernel<T>() {
#ifdef PADDLE_WITH_XBYAK #ifdef PADDLE_WITH_XBYAK
if (useJIT(d)) { if (useJIT(d)) {
// roughly estimate the size of code // roughly estimate the size of code
size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8; size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8;
jitcode_.reset(new gen::VMulJitCode(d, sz > 4096 ? sz : 4096)); jitcode_.reset(new gen::VVVJitCode(d, gen::operand_type::mul, false,
sz > 4096 ? sz : 4096));
this->Compute = this->Compute =
jitcode_->getCode<void (*)(const T*, const T*, T*, int)>(); jitcode_->getCode<void (*)(const T*, const T*, T*, int)>();
return; return;
...@@ -89,14 +121,14 @@ class VMulKernelImpl : public VMulKernel<T> { ...@@ -89,14 +121,14 @@ class VMulKernelImpl : public VMulKernel<T> {
#ifdef PADDLE_WITH_XBYAK #ifdef PADDLE_WITH_XBYAK
private: private:
std::unique_ptr<gen::VMulJitCode> jitcode_{nullptr}; std::unique_ptr<gen::VVVJitCode> jitcode_{nullptr};
#endif #endif
}; };
#ifdef PADDLE_WITH_XBYAK #ifdef PADDLE_WITH_XBYAK
template <> template <>
bool VMulKernelImpl<float>::useJIT(int d) { bool VMulKernelImpl<float>::useJIT(int d) {
return gen::VMulJitCode::init(d); return gen::VVVJitCode::init(d);
} }
#endif #endif
...@@ -112,63 +144,89 @@ bool VMulKernelImpl<double>::useMKL(int d) { ...@@ -112,63 +144,89 @@ bool VMulKernelImpl<double>::useMKL(int d) {
} }
#endif #endif
REGISTER_JITKERNEL(vmul, VMulKernel); /* VAdd JitKernel */
template <typename T>
/* VADD JitKernel */
template <typename T, platform::jit::cpu_isa_t isa, jit_block>
class VAddKernelImpl : public VAddKernel<T> { class VAddKernelImpl : public VAddKernel<T> {
public: public:
explicit VAddKernelImpl(int d) : VAddKernel<T>() { this->num_ = d; } DECLARE_STATIC_FUNC;
void Compute(const T* x, const T* y, T* z) const override { explicit VAddKernelImpl(int d) : VAddKernel<T>() {
for (int i = 0; i < this->num_; ++i) { #ifdef PADDLE_WITH_XBYAK
z[i] = x[i] + y[i]; if (useJIT(d)) {
size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8;
jitcode_.reset(new gen::VVVJitCode(d, gen::operand_type::add, false,
sz > 4096 ? sz : 4096));
this->Compute =
jitcode_->getCode<void (*)(const T*, const T*, T*, int)>();
return;
} }
} #endif
};
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
#define MKL_FLOAT(isa, block) \ if (useMKL(d)) {
template <> \ this->Compute = VAddMKL<T>;
void VAddKernelImpl<float, isa, block>::Compute( \ return;
const float* x, const float* y, float* z) const { \ }
platform::dynload::vsAdd(this->num_, x, y, z); \ #endif
this->Compute = VAddRefer<T>;
} }
#define MKL_DOUBLE(isa, block) \ private:
template <> \ std::unique_ptr<gen::VVVJitCode> jitcode_{nullptr};
void VAddKernelImpl<double, isa, block>::Compute( \ };
const double* x, const double* y, double* z) const { \
platform::dynload::vdAdd(this->num_, x, y, z); \
}
FOR_EACH_ISA(MKL_FLOAT, kGT16); #ifdef PADDLE_WITH_XBYAK
FOR_EACH_ISA_BLOCK(MKL_DOUBLE); template <>
bool VAddKernelImpl<float>::useJIT(int d) {
return gen::VVVJitCode::init(d);
}
#endif #endif
#define INTRI8_FLOAT(isa) \ #ifdef PADDLE_WITH_MKLML
template <> \ template <>
void VAddKernelImpl<float, isa, kEQ8>::Compute( \ bool VAddKernelImpl<float>::useMKL(int d) {
const float* x, const float* y, float* z) const { \ return d > 512;
__m256 tmpx, tmpy; \ }
tmpx = _mm256_loadu_ps(x); \
tmpy = _mm256_loadu_ps(y); \ template <>
tmpx = _mm256_add_ps(tmpx, tmpy); \ bool VAddKernelImpl<double>::useMKL(int d) {
_mm256_storeu_ps(z, tmpx); \ return true;
} }
#ifdef __AVX__
INTRI8_FLOAT(jit::avx);
#endif #endif
#ifdef __AVX2__
INTRI8_FLOAT(jit::avx2); /* VAddRelu JitKernel */
template <typename T>
class VAddReluKernelImpl : public VAddReluKernel<T> {
public:
DECLARE_STATIC_FUNC;
explicit VAddReluKernelImpl(int d) : VAddReluKernel<T>() {
#ifdef PADDLE_WITH_XBYAK
if (useJIT(d)) {
size_t sz = 96 + d / AVX_FLOAT_BLOCK * 4 * 8;
jitcode_.reset(new gen::VVVJitCode(d, gen::operand_type::add, true,
sz > 4096 ? sz : 4096));
this->Compute =
jitcode_->getCode<void (*)(const T*, const T*, T*, int)>();
return;
}
#endif #endif
#ifdef __AVX512F__ this->Compute = VAddReluRefer<T>;
INTRI8_FLOAT(jit::avx512f); }
private:
std::unique_ptr<gen::VVVJitCode> jitcode_{nullptr};
};
#ifdef PADDLE_WITH_XBYAK
template <>
bool VAddReluKernelImpl<float>::useJIT(int d) {
return gen::VVVJitCode::init(d);
}
#endif #endif
// TODO(TJ): eq16 test and complete avx512
#undef INTRI8_FLOAT #undef DECLARE_STATIC_FUNC
#undef MKL_FLOAT
#undef MKL_DOUBLE REGISTER_JITKERNEL(vmul, VMulKernel);
REGISTER_JITKERNEL(vadd, VAddKernel);
REGISTER_JITKERNEL(vaddrelu, VAddReluKernel);
/* VSCAL JitKernel */ /* VSCAL JitKernel */
template <typename T, platform::jit::cpu_isa_t isa, jit_block> template <typename T, platform::jit::cpu_isa_t isa, jit_block>
...@@ -405,98 +463,9 @@ class VIdentityKernelImpl : public VIdentityKernel<T> { ...@@ -405,98 +463,9 @@ class VIdentityKernelImpl : public VIdentityKernel<T> {
void Compute(const T* x, T* y) const override {} void Compute(const T* x, T* y) const override {}
}; };
/* VAddRelu JitKernel */
template <typename T, platform::jit::cpu_isa_t isa, jit_block>
class VAddReluKernelImpl : public VAddReluKernel<T> {
public:
explicit VAddReluKernelImpl(int d) : VAddReluKernel<T>() { this->num_ = d; }
void Compute(const T* x, const T* y, T* z) const override {
for (int i = 0; i < this->num_; ++i) {
z[i] = x[i] + y[i];
z[i] = z[i] > 0 ? z[i] : 0;
}
}
};
#define INTRI8_FLOAT(isa) \
template <> \
void VAddReluKernelImpl<float, isa, kEQ8>::Compute( \
const float* x, const float* y, float* z) const { \
__m256 tmpx = _mm256_loadu_ps(x); \
__m256 tmpy = _mm256_loadu_ps(y); \
tmpy = _mm256_add_ps(tmpx, tmpy); \
tmpy = _mm256_max_ps(tmpy, _mm256_setzero_ps()); \
_mm256_storeu_ps(z, tmpy); \
}
#define INTRI16_FLOAT(isa) \
template <> \
void VAddReluKernelImpl<float, isa, kEQ16>::Compute( \
const float* x, const float* y, float* z) const { \
__m256 zeros = _mm256_setzero_ps(); \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(y); \
tmp0 = _mm256_add_ps(tmp0, tmp1); \
tmp0 = _mm256_max_ps(tmp0, zeros); \
tmp1 = _mm256_loadu_ps(x + 8); \
__m256 tmp2 = _mm256_loadu_ps(y + 8); \
tmp1 = _mm256_add_ps(tmp1, tmp2); \
tmp1 = _mm256_max_ps(tmp1, zeros); \
_mm256_storeu_ps(z, tmp0); \
_mm256_storeu_ps(z + 8, tmp1); \
}
#define INTRI_COMMON_FLOAT(isa, block) \
template <> \
VAddReluKernelImpl<float, isa, block>::VAddReluKernelImpl(int d) \
: VAddReluKernel<float>() { \
this->num_ = d; \
this->end_ = d - d % AVX_FLOAT_BLOCK; \
this->rest_ = d - this->end_; \
} \
template <> \
void VAddReluKernelImpl<float, isa, block>::Compute( \
const float* x, const float* y, float* z) const { \
__m256 zeros = _mm256_setzero_ps(); \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
__m256 tmpx = _mm256_loadu_ps(x + i); \
__m256 tmpy = _mm256_loadu_ps(y + i); \
tmpy = _mm256_add_ps(tmpx, tmpy); \
tmpy = _mm256_max_ps(tmpy, zeros); \
_mm256_storeu_ps(z + i, tmpy); \
} \
for (int i = this->end_; i < this->num_; ++i) { \
z[i] = x[i] + y[i]; \
z[i] = z[i] > 0 ? z[i] : 0; \
} \
}
#ifdef __AVX__
INTRI8_FLOAT(jit::avx);
INTRI16_FLOAT(jit::avx);
INTRI_COMMON_FLOAT(jit::avx, kGT16);
#endif
#ifdef __AVX2__
INTRI8_FLOAT(jit::avx2);
INTRI16_FLOAT(jit::avx2);
INTRI_COMMON_FLOAT(jit::avx2, kGT16);
#endif
#ifdef __AVX512F__
// TODO(TJ): refine avx512
INTRI8_FLOAT(jit::avx512f);
INTRI16_FLOAT(jit::avx512f);
INTRI_COMMON_FLOAT(jit::avx512f, kGT16);
#endif
#undef INTRI8_FLOAT
#undef INTRI16_FLOAT
#undef INTRI_COMMON_FLOAT
REGISTER_JITKERNEL_DEPRECATED(vadd, VAddKernel);
REGISTER_JITKERNEL_DEPRECATED(vscal, VScalKernel); REGISTER_JITKERNEL_DEPRECATED(vscal, VScalKernel);
REGISTER_JITKERNEL_DEPRECATED(vaddb, VAddBiasKernel); REGISTER_JITKERNEL_DEPRECATED(vaddb, VAddBiasKernel);
REGISTER_JITKERNEL_DEPRECATED(vrelu, VReluKernel); REGISTER_JITKERNEL_DEPRECATED(vrelu, VReluKernel);
REGISTER_JITKERNEL_DEPRECATED(vaddrelu, VAddReluKernel);
REGISTER_JITKERNEL_DEPRECATED(videntity, VIdentityKernel); REGISTER_JITKERNEL_DEPRECATED(videntity, VIdentityKernel);
} // namespace jitkernel } // namespace jitkernel
......
...@@ -181,7 +181,7 @@ class LSTMKernelImpl : public LSTMKernel<T> { ...@@ -181,7 +181,7 @@ class LSTMKernelImpl : public LSTMKernel<T> {
act_cand_d_->Compute(gates, gates); act_cand_d_->Compute(gates, gates);
vmul_d_->Compute(gates, gates + d_, gates + d_, d_); vmul_d_->Compute(gates, gates + d_, gates + d_, d_);
vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_); vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_);
vadd_d_->Compute(gates + d_, gates + d2_, ct); vadd_d_->Compute(gates + d_, gates + d2_, ct, d_);
/* H_t = act_cell(C_t) * ogated */ /* H_t = act_cell(C_t) * ogated */
act_cell_d_->Compute(ct, gates + d2_); act_cell_d_->Compute(ct, gates + d2_);
...@@ -291,16 +291,16 @@ class PeepholeKernelImpl : public LSTMKernel<T> { ...@@ -291,16 +291,16 @@ class PeepholeKernelImpl : public LSTMKernel<T> {
/* get fgated and igated*/ /* get fgated and igated*/
vmul_d_->Compute(wp_data, ct_1, checked, d_); vmul_d_->Compute(wp_data, ct_1, checked, d_);
vmul_d_->Compute(wp_data + d_, ct_1, checked + d_, d_); vmul_d_->Compute(wp_data + d_, ct_1, checked + d_, d_);
vadd_d2_->Compute(checked, gates + d_, gates + d_); vadd_d2_->Compute(checked, gates + d_, gates + d_, d2_);
act_gate_d2_->Compute(gates + d_, gates + d_); act_gate_d2_->Compute(gates + d_, gates + d_);
/* C_t = C_t-1 * fgated + cand_gated * igated*/ /* C_t = C_t-1 * fgated + cand_gated * igated*/
act_cand_d_->Compute(gates, gates); act_cand_d_->Compute(gates, gates);
vmul_d_->Compute(gates, gates + d_, gates + d_, d_); vmul_d_->Compute(gates, gates + d_, gates + d_, d_);
vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_); vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_);
vadd_d_->Compute(gates + d_, gates + d2_, ct); vadd_d_->Compute(gates + d_, gates + d2_, ct, d_);
/* get ogated*/ /* get ogated*/
vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_); vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_);
vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_); vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_);
act_gate_d_->Compute(gates + d3_, gates + d3_); act_gate_d_->Compute(gates + d3_, gates + d3_);
/* H_t = act_cell(C_t) * ogated */ /* H_t = act_cell(C_t) * ogated */
act_cell_d_->Compute(ct, gates + d2_); act_cell_d_->Compute(ct, gates + d2_);
...@@ -314,7 +314,7 @@ class PeepholeKernelImpl : public LSTMKernel<T> { ...@@ -314,7 +314,7 @@ class PeepholeKernelImpl : public LSTMKernel<T> {
vmul_d_->Compute(gates, gates + d_, ct, d_); vmul_d_->Compute(gates, gates + d_, ct, d_);
/* get outgated, put W_oc * C_t on igated */ /* get outgated, put W_oc * C_t on igated */
vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_); vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_);
vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_); vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_);
/* H_t = act_cell(C_t) * ogated */ /* H_t = act_cell(C_t) * ogated */
act_gate_d_->Compute(gates + d3_, gates + d3_); act_gate_d_->Compute(gates + d3_, gates + d3_);
act_cell_d_->Compute(ct, gates + d2_); act_cell_d_->Compute(ct, gates + d2_);
......
...@@ -371,7 +371,7 @@ void lstm_ctht_better( ...@@ -371,7 +371,7 @@ void lstm_ctht_better(
vtanh_d->Compute(gates, gates); vtanh_d->Compute(gates, gates);
vmul_d->Compute(gates, gates + d, gates + d, d); vmul_d->Compute(gates, gates + d, gates + d, d);
vmul_d->Compute(ct_1, gates + d2, gates + d2, d); vmul_d->Compute(ct_1, gates + d2, gates + d2, d);
vadd_d->Compute(gates + d, gates + d2, ct); vadd_d->Compute(gates + d, gates + d2, ct, d);
/* H_t = act_cell(C_t) * ogated */ /* H_t = act_cell(C_t) * ogated */
vtanh_d->Compute(ct, gates + d2); vtanh_d->Compute(ct, gates + d2);
vmul_d->Compute(gates + d2, gates + d * 3, ht, d); vmul_d->Compute(gates + d2, gates + d * 3, ht, d);
...@@ -695,7 +695,7 @@ TEST(JitKernel, vadd) { ...@@ -695,7 +695,7 @@ TEST(JitKernel, vadd) {
auto ttgts = GetCurrentUS(); auto ttgts = GetCurrentUS();
for (int i = 0; i < repeat; ++i) { for (int i = 0; i < repeat; ++i) {
ker->Compute(x_data, y_data, ztgt_data); ker->Compute(x_data, y_data, ztgt_data, d);
} }
auto ttgte = GetCurrentUS(); auto ttgte = GetCurrentUS();
...@@ -723,8 +723,8 @@ void vaddrelu_better( ...@@ -723,8 +723,8 @@ void vaddrelu_better(
const paddle::operators::math::jitkernel::VAddKernel<float>>& vadd, const paddle::operators::math::jitkernel::VAddKernel<float>>& vadd,
const std::shared_ptr< const std::shared_ptr<
const paddle::operators::math::jitkernel::VReluKernel<float>>& vrelu, const paddle::operators::math::jitkernel::VReluKernel<float>>& vrelu,
const float* x, const float* y, float* z) { const float* x, const float* y, float* z, int d) {
vadd->Compute(x, y, z); vadd->Compute(x, y, z, d);
vrelu->Compute(z, z); vrelu->Compute(z, z);
} }
...@@ -752,12 +752,12 @@ TEST(JitKernel, vaddrelu) { ...@@ -752,12 +752,12 @@ TEST(JitKernel, vaddrelu) {
auto trefe = GetCurrentUS(); auto trefe = GetCurrentUS();
auto tmkls = GetCurrentUS(); auto tmkls = GetCurrentUS();
for (int i = 0; i < repeat; ++i) { for (int i = 0; i < repeat; ++i) {
vaddrelu_better(vadd, vrelu, x_data, y_data, zref_data); vaddrelu_better(vadd, vrelu, x_data, y_data, zref_data, d);
} }
auto tmkle = GetCurrentUS(); auto tmkle = GetCurrentUS();
auto ttgts = GetCurrentUS(); auto ttgts = GetCurrentUS();
for (int i = 0; i < repeat; ++i) { for (int i = 0; i < repeat; ++i) {
ker->Compute(x_data, y_data, ztgt_data); ker->Compute(x_data, y_data, ztgt_data, d);
} }
auto ttgte = GetCurrentUS(); auto ttgte = GetCurrentUS();
VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
...@@ -801,7 +801,11 @@ TEST(JitKernel, pool) { ...@@ -801,7 +801,11 @@ TEST(JitKernel, pool) {
std::dynamic_pointer_cast<const jit::Kernel>(pvmul_d)); std::dynamic_pointer_cast<const jit::Kernel>(pvmul_d));
const auto& pvmul_from_key = jit::KernelPool::Instance().Get("vmulfjit4"); const auto& pvmul_from_key = jit::KernelPool::Instance().Get("vmulfjit4");
EXPECT_EQ(pvmul_f, pvmul_from_key); #if defined(__APPLE__) || defined(__OSX__) || defined(_WIN32)
EXPECT_EQ(pvmul_from_key, nullptr);
#else
EXPECT_EQ(pvmul_from_key, pvmul_f);
#endif
const auto& pvmul_from_key2 = jit::KernelPool::Instance().Get("vmulfjit"); const auto& pvmul_from_key2 = jit::KernelPool::Instance().Get("vmulfjit");
EXPECT_TRUE(pvmul_from_key2 == nullptr); EXPECT_TRUE(pvmul_from_key2 == nullptr);
} }
...@@ -223,7 +223,9 @@ class TensorRTEngineKernel : public framework::OpKernel<T> { ...@@ -223,7 +223,9 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
// Add outputs // Add outputs
for (auto& output : output_maps) { for (auto& output : output_maps) {
engine->DeclareOutput(output); if (!engine->HasDeclared(output)) {
engine->DeclareOutput(output);
}
} }
engine->FreezeNetwork(); engine->FreezeNetwork();
......
...@@ -116,6 +116,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) { ...@@ -116,6 +116,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
platform::SetNumThreads(FLAGS_paddle_num_threads); platform::SetNumThreads(FLAGS_paddle_num_threads);
#endif #endif
#if !defined(_WIN32) && !defined(__APPLE__) && !defined(__OSX__)
if (platform::jit::MayIUse(platform::jit::avx)) { if (platform::jit::MayIUse(platform::jit::avx)) {
#ifndef __AVX__ #ifndef __AVX__
LOG(WARNING) << "AVX is available, Please re-compile on local machine"; LOG(WARNING) << "AVX is available, Please re-compile on local machine";
...@@ -157,8 +158,9 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) { ...@@ -157,8 +158,9 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
AVX_GUIDE(AVX, NonAVX); AVX_GUIDE(AVX, NonAVX);
} }
#endif #endif
#undef AVX_GUIDE #undef AVX_GUIDE
#endif
} }
void InitGLOG(const std::string &prog_name) { void InitGLOG(const std::string &prog_name) {
......
...@@ -226,7 +226,7 @@ RecordBlock::~RecordBlock() { ...@@ -226,7 +226,7 @@ RecordBlock::~RecordBlock() {
void EnableProfiler(ProfilerState state) { void EnableProfiler(ProfilerState state) {
PADDLE_ENFORCE(state != ProfilerState::kDisabled, PADDLE_ENFORCE(state != ProfilerState::kDisabled,
"Can't enbale profling, since the input state is ", "Can't enable profiling, since the input state is ",
"ProfilerState::kDisabled"); "ProfilerState::kDisabled");
std::lock_guard<std::mutex> l(profiler_mu); std::lock_guard<std::mutex> l(profiler_mu);
......
...@@ -742,7 +742,12 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -742,7 +742,12 @@ All parameter, weight, gradient are variables in Paddle.
will clean up the temp variables at the end of the current iteration. will clean up the temp variables at the end of the current iteration.
2. In some NLP model, it may cause the GPU memory is insufficient, 2. In some NLP model, it may cause the GPU memory is insufficient,
in this case, you should reduce `num_iteration_per_drop_scope`. in this case, you should reduce `num_iteration_per_drop_scope`.
)DOC"); )DOC")
.def_property("_dry_run",
[](const ExecutionStrategy &self) { return self.dry_run_; },
[](ExecutionStrategy &self, bool dry_run) {
self.dry_run_ = dry_run;
});
exec_strategy.def_property( exec_strategy.def_property(
"use_experimental_executor", "use_experimental_executor",
......
...@@ -118,7 +118,6 @@ def __bootstrap__(): ...@@ -118,7 +118,6 @@ def __bootstrap__():
] ]
if core.is_compiled_with_dist(): if core.is_compiled_with_dist():
read_env_flags.append('rpc_deadline') read_env_flags.append('rpc_deadline')
read_env_flags.append('rpc_server_profile_period')
read_env_flags.append('rpc_server_profile_path') read_env_flags.append('rpc_server_profile_path')
read_env_flags.append('enable_rpc_profiler') read_env_flags.append('enable_rpc_profiler')
read_env_flags.append('rpc_send_thread_num') read_env_flags.append('rpc_send_thread_num')
......
...@@ -65,7 +65,7 @@ def is_persistable(var): ...@@ -65,7 +65,7 @@ def is_persistable(var):
Examples: Examples:
.. code-block:: python .. code-block:: python
param = fluid.default_main_program().global_block().var('fc.w') param = fluid.default_main_program().global_block().var('fc.b')
res = fluid.io.is_persistable(param) res = fluid.io.is_persistable(param)
""" """
if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
...@@ -625,8 +625,13 @@ def save_inference_model(dirname, ...@@ -625,8 +625,13 @@ def save_inference_model(dirname,
main_program._distributed_lookup_table, main_program._distributed_lookup_table,
main_program._endpoints) main_program._endpoints)
if not os.path.isdir(dirname): # when a pserver and a trainer running on the same machine, mkdir may conflict
try:
os.makedirs(dirname) os.makedirs(dirname)
except OSError as e:
if e.errno != errno.EEXIST:
raise
if model_filename is not None: if model_filename is not None:
model_basename = os.path.basename(model_filename) model_basename = os.path.basename(model_filename)
else: else:
......
...@@ -60,7 +60,7 @@ def data(name, ...@@ -60,7 +60,7 @@ def data(name,
For example if shape=[1], the resulting shape is [-1, 1]. For example if shape=[1], the resulting shape is [-1, 1].
2. If shape contains -1, such as shape=[1, -1], 2. If shape contains -1, such as shape=[1, -1],
append_batch_size will be enforced to be be False (ineffective). append_batch_size will be enforced to be be False (ineffective).
dtype(int|float): The type of data : float32, float_16, int etc dtype(basestring): The type of data : float32, float_16, int etc
type(VarType): The output type. By default it is LOD_TENSOR. type(VarType): The output type. By default it is LOD_TENSOR.
lod_level(int): The LoD Level. 0 means the input data is not a sequence. lod_level(int): The LoD Level. 0 means the input data is not a sequence.
stop_gradient(bool): A boolean that mentions whether gradient should flow. stop_gradient(bool): A boolean that mentions whether gradient should flow.
......
...@@ -41,9 +41,6 @@ def convert_reader_to_recordio_file( ...@@ -41,9 +41,6 @@ def convert_reader_to_recordio_file(
""" """
Convert a Python Reader to a recordio file. Convert a Python Reader to a recordio file.
Please see :ref:`api_guide_python_reader` and :ref:`api_guide_reader_op` for
details.
Examples: Examples:
>>> import paddle.fluid as fluid >>> import paddle.fluid as fluid
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
import unittest
import logging
import six
class TestBase(unittest.TestCase):
def main(self,
network_func,
iter=100,
iter_per_pe=100,
use_gpu=True,
use_experimental_executor=False):
if use_gpu and not fluid.core.is_compiled_with_cuda():
logging.warning(
"Paddle is not compiled with CUDA, skip GPU unittests")
return
main_prog = fluid.Program()
startup_prog = fluid.Program()
scope = fluid.Scope()
with fluid.program_guard(main_prog, startup_prog):
with fluid.scope_guard(scope):
loss = network_func()
fluid.Executor(
fluid.CUDAPlace(0)
if use_gpu else fluid.CPUPlace()).run(startup_prog)
for _ in six.moves.xrange(iter):
exe_strategy = fluid.ExecutionStrategy()
exe_strategy._dry_run = True
exe_strategy.use_experimental_executor = use_experimental_executor
pe = fluid.ParallelExecutor(
use_cuda=True,
loss_name=loss.name,
main_program=main_prog,
exec_strategy=exe_strategy)
for _ in six.moves.xrange(iter_per_pe):
pe.run([])
class TestMNISTDryRun(TestBase):
def test_mnist_dry_run(self):
for use_gpu in (False, True):
for use_experimental_executor in (False, True):
self.main(
network_func=TestMNISTDryRun.network_func,
use_gpu=use_gpu,
use_experimental_executor=use_experimental_executor)
@staticmethod
def network_func():
img = fluid.layers.data(name='img', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
hidden = img
for _ in six.moves.xrange(10):
hidden = fluid.layers.fc(input=img, size=200, act='tanh')
prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
avg_loss = fluid.layers.mean(loss)
fluid.optimizer.Adam().minimize(avg_loss)
return avg_loss
if __name__ == '__main__':
unittest.main()
...@@ -14,30 +14,18 @@ ...@@ -14,30 +14,18 @@
from __future__ import print_function from __future__ import print_function
from parallel_executor_test_base import TestParallelExecutorBase
import paddle.fluid as fluid
import paddle.fluid.core as core
import numpy as np
import paddle
import paddle.dataset.mnist as mnist
import unittest import unittest
import os
MNIST_RECORDIO_FILE = "./mnist_test_pe.recordio" import numpy as np
import paddle.fluid.core as core
import os
import paddle.fluid as fluid
from parallel_executor_test_base import TestParallelExecutorBase
def simple_fc_net(use_feed): def simple_fc_net(use_feed):
if use_feed: img = fluid.layers.data(name='image', shape=[784], dtype='float32')
img = fluid.layers.data(name='image', shape=[784], dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
else:
reader = fluid.layers.open_files(
filenames=[MNIST_RECORDIO_FILE],
shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'])
reader = fluid.layers.io.double_buffer(reader)
img, label = fluid.layers.read_file(reader)
hidden = img hidden = img
for _ in range(4): for _ in range(4):
hidden = fluid.layers.fc( hidden = fluid.layers.fc(
...@@ -53,17 +41,8 @@ def simple_fc_net(use_feed): ...@@ -53,17 +41,8 @@ def simple_fc_net(use_feed):
def fc_with_batchnorm(use_feed): def fc_with_batchnorm(use_feed):
if use_feed: img = fluid.layers.data(name='image', shape=[784], dtype='float32')
img = fluid.layers.data(name='image', shape=[784], dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
else:
reader = fluid.layers.open_files(
filenames=[MNIST_RECORDIO_FILE],
shapes=[[-1, 784], [-1, 1]],
lod_levels=[0, 0],
dtypes=['float32', 'int64'])
reader = fluid.layers.io.double_buffer(reader)
img, label = fluid.layers.read_file(reader)
hidden = img hidden = img
for _ in range(1): for _ in range(1):
...@@ -88,19 +67,6 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -88,19 +67,6 @@ class TestMNIST(TestParallelExecutorBase):
@classmethod @classmethod
def setUpClass(cls): def setUpClass(cls):
os.environ['CPU_NUM'] = str(4) os.environ['CPU_NUM'] = str(4)
# Convert mnist to recordio file
with fluid.program_guard(fluid.Program(), fluid.Program()):
reader = paddle.batch(mnist.train(), batch_size=4)
feeder = fluid.DataFeeder(
feed_list=[ # order is image and label
fluid.layers.data(
name='image', shape=[784]),
fluid.layers.data(
name='label', shape=[1], dtype='int64'),
],
place=fluid.CPUPlace())
fluid.recordio_writer.convert_reader_to_recordio_file(
MNIST_RECORDIO_FILE, reader, feeder)
def _init_data(self): def _init_data(self):
np.random.seed(5) np.random.seed(5)
...@@ -111,10 +77,6 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -111,10 +77,6 @@ class TestMNIST(TestParallelExecutorBase):
def _compare_reduce_and_allreduce(self, model, use_cuda): def _compare_reduce_and_allreduce(self, model, use_cuda):
if use_cuda and not core.is_compiled_with_cuda(): if use_cuda and not core.is_compiled_with_cuda():
return return
self.check_network_convergence(
model, use_cuda=use_cuda, use_reduce=True)
self.check_network_convergence(
model, use_cuda=use_cuda, allow_op_delay=True, use_reduce=True)
img, label = self._init_data() img, label = self._init_data()
...@@ -140,9 +102,6 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -140,9 +102,6 @@ class TestMNIST(TestParallelExecutorBase):
def check_simple_fc_convergence(self, use_cuda, use_reduce=False): def check_simple_fc_convergence(self, use_cuda, use_reduce=False):
if use_cuda and not core.is_compiled_with_cuda(): if use_cuda and not core.is_compiled_with_cuda():
return return
self.check_network_convergence(simple_fc_net, use_cuda=use_cuda)
self.check_network_convergence(
simple_fc_net, use_cuda=use_cuda, allow_op_delay=True)
img, label = self._init_data() img, label = self._init_data()
...@@ -199,8 +158,6 @@ class TestMNIST(TestParallelExecutorBase): ...@@ -199,8 +158,6 @@ class TestMNIST(TestParallelExecutorBase):
if use_cuda and not core.is_compiled_with_cuda(): if use_cuda and not core.is_compiled_with_cuda():
return return
self.check_network_convergence(fc_with_batchnorm, use_cuda=use_cuda)
img, label = self._init_data() img, label = self._init_data()
self.check_network_convergence( self.check_network_convergence(
......
...@@ -14,7 +14,8 @@ RC = 0 ...@@ -14,7 +14,8 @@ RC = 0
def git_commit(): def git_commit():
try: try:
cmd = ['git', 'rev-parse', 'HEAD'] cmd = ['git', 'rev-parse', 'HEAD']
git_commit = subprocess.Popen(cmd, stdout = subprocess.PIPE).communicate()[0].strip() git_commit = subprocess.Popen(cmd, stdout = subprocess.PIPE,
cwd="@PADDLE_SOURCE_DIR@").communicate()[0].strip()
except: except:
git_commit = 'Unknown' git_commit = 'Unknown'
git_commit = git_commit.decode() git_commit = git_commit.decode()
...@@ -44,7 +45,7 @@ def get_patch(): ...@@ -44,7 +45,7 @@ def get_patch():
def is_taged(): def is_taged():
try: try:
cmd = ['git', 'describe', '--exact-match', '--tags', 'HEAD', '2>/dev/null'] cmd = ['git', 'describe', '--exact-match', '--tags', 'HEAD', '2>/dev/null']
git_tag = subprocess.Popen(cmd, stdout = subprocess.PIPE).communicate()[0].strip() git_tag = subprocess.Popen(cmd, stdout = subprocess.PIPE, cwd="@PADDLE_SOURCE_DIR@").communicate()[0].strip()
git_tag = git_tag.decode() git_tag = git_tag.decode()
except: except:
return False return False
...@@ -55,8 +56,7 @@ def is_taged(): ...@@ -55,8 +56,7 @@ def is_taged():
return False return False
def write_version_py(filename='paddle/version.py'): def write_version_py(filename='paddle/version.py'):
cnt = ''' cnt = '''# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY
# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY
# #
full_version = '%(major)d.%(minor)d.%(patch)s' full_version = '%(major)d.%(minor)d.%(patch)s'
major = '%(major)d' major = '%(major)d'
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册