Commit 605907fe authored by Dang Qingqing

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into quantize_transpiler_update

@@ -136,10 +136,6 @@ def parse_args():
         '--no_random',
         action='store_true',
         help='If set, keep the random seed and do not shuffle the data.')
-    parser.add_argument(
-        '--use_lars',
-        action='store_true',
-        help='If set, use lars for optimizers, ONLY support resnet module.')
     parser.add_argument(
         '--reduce_strategy',
         type=str,
......
@@ -200,11 +200,6 @@ def get_model(args, is_train, main_prog, startup_prog):
     # configure optimize
     optimizer = None
     if is_train:
-        if args.use_lars:
-            lars_decay = 1.0
-        else:
-            lars_decay = 0.0
         total_images = 1281167 / trainer_count
         step = int(total_images / (args.batch_size * args.gpus) + 1)
......
@@ -224,11 +224,6 @@ def get_model(args, is_train, main_prog, startup_prog):
     # configure optimize
     optimizer = None
     if is_train:
-        if args.use_lars:
-            lars_decay = 1.0
-        else:
-            lars_decay = 0.0
         total_images = 1281167 / trainer_count
         step = int(total_images / args.batch_size + 1)
......
@@ -244,11 +244,6 @@ def get_model(args, is_train, main_prog, startup_prog):
     optimizer = None
     if is_train:
-        if args.use_lars:
-            lars_decay = 1.0
-        else:
-            lars_decay = 0.0
         total_images = 1281167 / trainer_count
         step = int(total_images / args.batch_size + 1)
@@ -262,8 +257,7 @@ def get_model(args, is_train, main_prog, startup_prog):
             learning_rate=fluid.layers.piecewise_decay(
                 boundaries=bd, values=lr),
             momentum=0.9,
-            regularization=fluid.regularizer.L2Decay(1e-4),
-            LARS_weight_decay=lars_decay)
+            regularization=fluid.regularizer.L2Decay(1e-4))
         optimizer.minimize(avg_cost)
     if args.memory_optimize:
......
@@ -354,25 +354,25 @@ paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'fi
 paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max'))
 paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,))
 paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0))
-paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate'], varargs=None, keywords='kwargs', defaults=None)
+paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.optimizer.SGDOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov'], varargs=None, keywords='kwargs', defaults=(False,))
+paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None))
 paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon'], varargs=None, keywords='kwargs', defaults=(1e-06,))
+paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None))
 paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon'], varargs=None, keywords='kwargs', defaults=(0.001, 0.9, 0.999, 1e-08))
+paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
 paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon'], varargs=None, keywords='kwargs', defaults=(0.001, 0.9, 0.999, 1e-08))
+paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
 paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06))
+paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None))
 paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power'], varargs=None, keywords='kwargs', defaults=(0.0, 0.0, -0.5))
+paddle.fluid.optimizer.FtrlOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None))
 paddle.fluid.optimizer.FtrlOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered'], varargs=None, keywords='kwargs', defaults=(0.95, 1e-06, 0.0, False))
+paddle.fluid.optimizer.RMSPropOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None))
 paddle.fluid.optimizer.RMSPropOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho'], varargs=None, keywords='kwargs', defaults=(1e-06, 0.95))
+paddle.fluid.optimizer.AdadeltaOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None))
 paddle.fluid.optimizer.AdadeltaOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window'], varargs=None, keywords='kwargs', defaults=(10000, 10000))
+paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None))
 paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None)
 paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.ModelAverage.restore ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None)
......
@@ -103,108 +103,74 @@ void GetOneBatch(std::vector<PaddleTensor> *input_slots, DataRecord *data,
   input_slots->assign({input_tensor});
 }
 
-const int64_t lac_ref_data[] = {24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25,
-                                25, 25, 25, 25, 44, 24, 25, 25, 25, 36, 42, 43,
-                                44, 14, 15, 44, 14, 15, 44, 14, 15, 44, 38, 39,
-                                14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23};
-
-void TestLACPrediction(const std::string &model_path,
-                       const std::string &data_file, const int batch_size,
-                       const int repeat, bool use_analysis = false) {
-  AnalysisConfig cfg;
-  cfg.model_dir = model_path;
-  cfg.use_gpu = false;
-  cfg.device = 0;
-  cfg.specify_input_name = true;
-  cfg.enable_ir_optim = true;
-  std::vector<PaddleTensor> input_slots, outputs_slots;
-  DataRecord data(data_file, batch_size);
-  GetOneBatch(&input_slots, &data, batch_size);
-  std::unique_ptr<PaddlePredictor> predictor;
-  if (use_analysis) {
-    predictor =
-        CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
-  } else {
-    predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
-  }
-  for (int i = 0; i < FLAGS_burning; i++) {
-    predictor->Run(input_slots, &outputs_slots);
-  }
-  Timer timer;
-  if (FLAGS_test_all_data) {
-    LOG(INFO) << "test all data";
-    std::vector<std::vector<PaddleTensor>> input_slots_all;
-    for (size_t bid = 0; bid < data.batched_datas.size(); ++bid) {
-      GetOneBatch(&input_slots, &data, batch_size);
-      input_slots_all.emplace_back(input_slots);
-    }
-    LOG(INFO) << "total number of samples: " << data.datasets.size();
-    TestPrediction(cfg, input_slots_all, &outputs_slots, FLAGS_num_threads);
-    return;
-  }
-  timer.tic();
-  for (int i = 0; i < repeat; i++) {
-    predictor->Run(input_slots, &outputs_slots);
-  }
-  PrintTime(batch_size, repeat, 1, 0, timer.toc() / repeat);
-  // check result
-  EXPECT_EQ(outputs_slots.size(), 1UL);
-  auto &out = outputs_slots[0];
-  size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
-                                [](int a, int b) { return a * b; });
-  size_t batch1_size = sizeof(lac_ref_data) / sizeof(int64_t);
-  PADDLE_ENFORCE_GT(size, 0);
-  EXPECT_GE(size, batch1_size);
-  int64_t *pdata = static_cast<int64_t *>(out.data.data());
-  for (size_t i = 0; i < batch1_size; ++i) {
-    EXPECT_EQ(pdata[i], lac_ref_data[i]);
-  }
-  if (use_analysis) {
-    // run once for comparion as reference
-    auto ref_predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
-    std::vector<PaddleTensor> ref_outputs_slots;
-    ref_predictor->Run(input_slots, &ref_outputs_slots);
-    CompareResult(ref_outputs_slots, outputs_slots);
-    AnalysisPredictor *analysis_predictor =
-        dynamic_cast<AnalysisPredictor *>(predictor.get());
-    auto &fuse_statis = analysis_predictor->analysis_argument()
-                            .Get<std::unordered_map<std::string, int>>(
-                                framework::ir::kFuseStatisAttr);
-    for (auto &item : fuse_statis) {
-      LOG(INFO) << "fused " << item.first << " " << item.second;
-    }
-    int num_ops = 0;
-    for (auto &node :
-         analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
-      if (node->IsFunction()) {
-        ++num_ops;
-      }
-    }
-    LOG(INFO) << "has num ops: " << num_ops;
-    ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-    ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
-    EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
-    EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 4);
-    EXPECT_EQ(num_ops, 11);
-  }
-}
-
-TEST(Analyzer_LAC, native) {
-  LOG(INFO) << "LAC with native";
-  TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
-                    FLAGS_repeat);
-}
-
-TEST(Analyzer_LAC, analysis) {
-  LOG(INFO) << "LAC with analysis";
-  TestLACPrediction(FLAGS_infer_model, FLAGS_infer_data, FLAGS_batch_size,
-                    FLAGS_repeat, true);
-}
+void SetConfig(AnalysisConfig *cfg) {
+  cfg->model_dir = FLAGS_infer_model;
+  cfg->use_gpu = false;
+  cfg->device = 0;
+  cfg->specify_input_name = true;
+  cfg->enable_ir_optim = true;
+}
+
+void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
+  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+  std::vector<PaddleTensor> input_slots;
+  int epoch = FLAGS_test_all_data ? data.batched_datas.size() : 1;
+  LOG(INFO) << "number of samples: " << epoch;
+  for (int bid = 0; bid < epoch; ++bid) {
+    GetOneBatch(&input_slots, &data, FLAGS_batch_size);
+    (*inputs).emplace_back(input_slots);
+  }
+}
+
+// Easy for profiling independently.
+TEST(Analyzer_LAC, profile) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  std::vector<PaddleTensor> outputs;
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+
+  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
+    // the first inference result
+    const int64_t lac_ref_data[] = {
+        24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25, 25, 25, 25, 25,
+        44, 24, 25, 25, 25, 36, 42, 43, 44, 14, 15, 44, 14, 15, 44, 14,
+        15, 44, 38, 39, 14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23};
+    PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
+    size_t size = GetSize(outputs[0]);
+    size_t batch1_size = sizeof(lac_ref_data) / sizeof(int64_t);
+    PADDLE_ENFORCE_GE(size, batch1_size);
+    int64_t *pdata = static_cast<int64_t *>(outputs[0].data.data());
+    for (size_t i = 0; i < batch1_size; ++i) {
+      EXPECT_EQ(pdata[i], lac_ref_data[i]);
+    }
+  }
+}
+
+// Check the fuse status
+TEST(Analyzer_LAC, fuse_statis) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  int num_ops;
+  auto fuse_statis = GetFuseStatis(cfg, &num_ops);
+  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
+  ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
+  EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
+  EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 4);
+  EXPECT_EQ(num_ops, 11);
+}
+
+// Compare result of NativeConfig and AnalysisConfig
+TEST(Analyzer_LAC, compare) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareNativeAndAnalysis(cfg, input_slots_all);
+}
 
 }  // namespace analysis
......
@@ -95,97 +95,73 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
   }
 }
 
-// the first inference result
-const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
-                                       48, 39, 38, 16, 25};
-
-void TestChineseNERPrediction(bool use_analysis) {
-  AnalysisConfig cfg;
-  cfg.prog_file = FLAGS_infer_model + "/__model__";
-  cfg.param_file = FLAGS_infer_model + "/param";
-  cfg.use_gpu = false;
-  cfg.device = 0;
-  cfg.specify_input_name = true;
-  cfg.enable_ir_optim = true;
-  std::vector<PaddleTensor> input_slots, outputs;
-  std::unique_ptr<PaddlePredictor> predictor;
-  Timer timer;
-  if (use_analysis) {
-    predictor =
-        CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
-  } else {
-    predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
-  }
-
-  if (FLAGS_test_all_data) {
-    LOG(INFO) << "test all data";
-    DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
-    std::vector<std::vector<PaddleTensor>> input_slots_all;
-    for (size_t bid = 0; bid < data.num_samples / FLAGS_batch_size; ++bid) {
-      PrepareInputs(&input_slots, &data, FLAGS_batch_size);
-      input_slots_all.emplace_back(input_slots);
-    }
-    LOG(INFO) << "total number of samples: " << data.num_samples;
-    TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
-    return;
-  }
-  // Prepare inputs.
+void SetConfig(AnalysisConfig *cfg) {
+  cfg->prog_file = FLAGS_infer_model + "/__model__";
+  cfg->param_file = FLAGS_infer_model + "/param";
+  cfg->use_gpu = false;
+  cfg->device = 0;
+  cfg->specify_input_name = true;
+  cfg->enable_ir_optim = true;
+}
+
+void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
   DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
-  PrepareInputs(&input_slots, &data, FLAGS_batch_size);
-
-  timer.tic();
-  for (int i = 0; i < FLAGS_repeat; i++) {
-    predictor->Run(input_slots, &outputs);
+  std::vector<PaddleTensor> input_slots;
+  int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
+  LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size;
+  for (int bid = 0; bid < epoch; ++bid) {
+    PrepareInputs(&input_slots, &data, FLAGS_batch_size);
+    (*inputs).emplace_back(input_slots);
   }
-  PrintTime(FLAGS_batch_size, FLAGS_repeat, 1, 0, timer.toc() / FLAGS_repeat);
-
-  PADDLE_ENFORCE(outputs.size(), 1UL);
-  auto &out = outputs[0];
-  size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
-                                [](int a, int b) { return a * b; });
-  PADDLE_ENFORCE_GT(size, 0);
-  int64_t *result = static_cast<int64_t *>(out.data.data());
-  for (size_t i = 0; i < std::min(11UL, size); i++) {
-    PADDLE_ENFORCE(result[i], chinese_ner_result_data[i]);
-  }
-  if (use_analysis) {
-    // run once for comparion as reference
-    auto ref_predictor =
-        CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
-    std::vector<PaddleTensor> ref_outputs_slots;
-    ref_predictor->Run(input_slots, &ref_outputs_slots);
-    CompareResult(ref_outputs_slots, outputs);
-    AnalysisPredictor *analysis_predictor =
-        dynamic_cast<AnalysisPredictor *>(predictor.get());
-    auto &fuse_statis = analysis_predictor->analysis_argument()
-                            .Get<std::unordered_map<std::string, int>>(
-                                framework::ir::kFuseStatisAttr);
-    for (auto &item : fuse_statis) {
-      LOG(INFO) << "fused " << item.first << " " << item.second;
-    }
-    int num_ops = 0;
-    for (auto &node :
-         analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
-      if (node->IsFunction()) {
-        ++num_ops;
-      }
-    }
-    LOG(INFO) << "has num ops: " << num_ops;
-    ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-    ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
-    EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
-    EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 2);
-    EXPECT_EQ(num_ops, 14);
-  }
-}
-
-TEST(Analyzer_Chinese_ner, native) { TestChineseNERPrediction(false); }
-
-TEST(Analyzer_Chinese_ner, analysis) { TestChineseNERPrediction(true); }
+}
+
+// Easy for profiling independently.
+TEST(Analyzer_Chinese_ner, profile) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  std::vector<PaddleTensor> outputs;
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+
+  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
+    // the first inference result
+    const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26,
+                                           48, 39, 38, 16, 25};
+    PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
+    size_t size = GetSize(outputs[0]);
+    PADDLE_ENFORCE_GT(size, 0);
+    int64_t *result = static_cast<int64_t *>(outputs[0].data.data());
+    for (size_t i = 0; i < std::min(11UL, size); i++) {
+      EXPECT_EQ(result[i], chinese_ner_result_data[i]);
+    }
+  }
+}
+
+// Check the fuse status
+TEST(Analyzer_Chinese_ner, fuse_statis) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  int num_ops;
+  auto fuse_statis = GetFuseStatis(cfg, &num_ops);
+  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
+  ASSERT_TRUE(fuse_statis.count("fc_gru_fuse"));
+  EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
+  EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 2);
+  EXPECT_EQ(num_ops, 14);
+}
+
+// Compare result of NativeConfig and AnalysisConfig
+TEST(Analyzer_Chinese_ner, compare) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareNativeAndAnalysis(cfg, input_slots_all);
+}
 
 }  // namespace inference
 }  // namespace paddle
@@ -25,6 +25,7 @@ struct DataRecord {
   std::vector<size_t> lod1, lod2, lod3;
   std::vector<std::vector<float>> rnn_link_data, rnn_week_datas,
       rnn_minute_datas;
+  size_t num_samples;  // total number of samples
   size_t batch_iter{0};
   size_t batch_size{1};
   DataRecord() = default;
@@ -97,6 +98,7 @@ struct DataRecord {
       week_data_all.push_back(std::move(week_data));
       minute_data_all.push_back(std::move(minute_data));
     }
+    num_samples = num_lines;
   }
 };
 
 void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
@@ -147,89 +149,72 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
   }
 }
 
-// Test with a really complicate model.
-void TestRNN1Prediction(bool use_analysis, bool activate_ir, int num_threads) {
-  AnalysisConfig config;
-  config.prog_file = FLAGS_infer_model + "/__model__";
-  config.param_file = FLAGS_infer_model + "/param";
-  config.use_gpu = false;
-  config.device = 0;
-  config.specify_input_name = true;
-  config.enable_ir_optim = activate_ir;
-  PADDLE_ENFORCE(config.ir_mode ==
-                 AnalysisConfig::IrPassMode::kExclude);  // default
-  config.ir_passes.clear();  // Do not exclude any pass.
-
-  int batch_size = FLAGS_batch_size;
-
-  auto base_predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
-  auto predictor =
-      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-          config);
-  std::vector<PaddleTensor> input_slots;
-  DataRecord data(FLAGS_infer_data, batch_size);
-  // Prepare inputs.
-  PrepareInputs(&input_slots, &data, batch_size);
-  std::vector<PaddleTensor> outputs, base_outputs;
-
-  base_predictor->Run(input_slots, &base_outputs);
-
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-  input_slots_all.emplace_back(input_slots);
-  if (num_threads == 1) {
-    TestOneThreadPrediction(config, input_slots_all, &outputs);
-    CompareResult(outputs, base_outputs);
-  } else {
-    // only return the output of first thread
-    TestMultiThreadPrediction(config, input_slots_all, &outputs, num_threads);
-  }
-
-  if (use_analysis && activate_ir) {
-    AnalysisPredictor *analysis_predictor =
-        dynamic_cast<AnalysisPredictor *>(predictor.get());
-    auto &fuse_statis = analysis_predictor->analysis_argument()
-                            .Get<std::unordered_map<std::string, int>>(
-                                framework::ir::kFuseStatisAttr);
-    for (auto &item : fuse_statis) {
-      LOG(INFO) << "fused " << item.first << " " << item.second;
-    }
-
-    int num_ops = 0;
-    for (auto &node :
-         analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
-      if (node->IsFunction()) {
-        ++num_ops;
-      }
-    }
-    LOG(INFO) << "has num ops: " << num_ops;
-
-    ASSERT_TRUE(fuse_statis.count("fc_fuse"));
-    EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
-    EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2);  // bi-directional LSTM
-    EXPECT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1);
-    EXPECT_EQ(num_ops,
-              13);  // After graph optimization, only 13 operators exists.
-  }
-}
-
-// Inference with analysis and IR, easy for profiling independently.
-TEST(Analyzer, rnn1) { TestRNN1Prediction(true, true, FLAGS_num_threads); }
-
-// Other unit-tests of RNN1, test different options of use_analysis,
-// activate_ir and multi-threads.
-TEST(Analyzer, RNN_tests) {
-  int num_threads[2] = {1, 4};
-  for (auto i : num_threads) {
-    // Directly infer with the original model.
-    TestRNN1Prediction(false, false, i);
-    // Inference with the original model with the analysis turned on, the
-    // analysis module will transform the program to a data flow graph.
-    TestRNN1Prediction(true, false, i);
-    // Inference with analysis and IR. The IR module will fuse some large
-    // kernels.
-    TestRNN1Prediction(true, true, i);
-  }
-}
+void SetConfig(AnalysisConfig *cfg) {
+  cfg->prog_file = FLAGS_infer_model + "/__model__";
+  cfg->param_file = FLAGS_infer_model + "/param";
+  cfg->use_gpu = false;
+  cfg->device = 0;
+  cfg->specify_input_name = true;
+  cfg->enable_ir_optim = true;
+  cfg->ir_passes.clear();  // Do not exclude any pass.
+}
+
+void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
+  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+  std::vector<PaddleTensor> input_slots;
+  int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
+  LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size;
+  for (int bid = 0; bid < epoch; ++bid) {
+    PrepareInputs(&input_slots, &data, FLAGS_batch_size);
+    (*inputs).emplace_back(input_slots);
+  }
+}
+
+// Easy for profiling independently.
+TEST(Analyzer_rnn1, profile) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  std::vector<PaddleTensor> outputs;
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+}
+
+// Check the fuse status
+TEST(Analyzer_rnn1, fuse_statis) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  int num_ops;
+  auto fuse_statis = GetFuseStatis(cfg, &num_ops);
+  ASSERT_TRUE(fuse_statis.count("fc_fuse"));
+  EXPECT_EQ(fuse_statis.at("fc_fuse"), 1);
+  EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2);  // bi-directional LSTM
+  EXPECT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1);
+  EXPECT_EQ(num_ops,
+            13);  // After graph optimization, only 13 operators exists.
+}
+
+// Compare result of NativeConfig and AnalysisConfig
+TEST(Analyzer_rnn1, compare) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareNativeAndAnalysis(cfg, input_slots_all);
+}
+
+// Test Multi-Thread.
+TEST(Analyzer_rnn1, multi_thread) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  std::vector<PaddleTensor> outputs;
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  TestPrediction(cfg, input_slots_all, &outputs, 4 /* num_threads */);
+}
 
 }  // namespace inference
......
@@ -12,24 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/inference/analysis/analyzer.h"
-#include <google/protobuf/text_format.h>
-#include <gtest/gtest.h>
-#include <thread>  // NOLINT
-#include "paddle/fluid/framework/ir/fuse_pass_base.h"
-#include "paddle/fluid/framework/ir/pass.h"
-#include "paddle/fluid/inference/analysis/ut_helper.h"
-#include "paddle/fluid/inference/api/analysis_predictor.h"
-#include "paddle/fluid/inference/api/helper.h"
-#include "paddle/fluid/inference/api/paddle_inference_api.h"
-#include "paddle/fluid/inference/api/paddle_inference_pass.h"
-
-DEFINE_string(infer_model, "", "model path");
-DEFINE_string(infer_data, "", "data path");
-DEFINE_int32(batch_size, 1, "batch size.");
-DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
-DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
 
 namespace paddle {
 namespace inference {
@@ -41,6 +24,7 @@ struct DataRecord {
   std::vector<size_t> lod;
   std::vector<std::vector<float>> rnn_link_data;
   std::vector<float> result_data;
+  size_t num_samples;  // total number of samples
   size_t batch_iter{0};
   size_t batch_size{1};
   DataRecord() = default;
@@ -100,6 +84,7 @@ struct DataRecord {
         result_data.insert(result_data.end(), tmp.begin(), tmp.end());
       }
     }
+    num_samples = num_lines / 2;
   }
 };
 
 void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
@@ -118,64 +103,58 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
   input_slots->assign({feed_tensor});
 }
 
-void CompareResult(const std::vector<PaddleTensor> &outputs,
-                   const std::vector<float> &base_result) {
-  PADDLE_ENFORCE_GT(outputs.size(), 0);
-  for (size_t i = 0; i < outputs.size(); i++) {
-    auto &out = outputs[i];
-    size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1,
-                                  [](int a, int b) { return a * b; });
-    PADDLE_ENFORCE_GT(size, 0);
-    float *data = static_cast<float *>(out.data.data());
-    for (size_t i = 0; i < size; i++) {
-      EXPECT_NEAR(data[i], base_result[i], 1e-3);
-    }
-  }
-}
-
-// Test with a really complicate model.
-void TestRNN2Prediction() {
-  AnalysisConfig config;
-  config.prog_file = FLAGS_infer_model + "/__model__";
-  config.param_file = FLAGS_infer_model + "/param";
-  config.use_gpu = false;
-  config.device = 0;
-  config.specify_input_name = true;
-  config.enable_ir_optim = true;
-  PADDLE_ENFORCE(config.ir_mode ==
-                 AnalysisConfig::IrPassMode::kExclude);  // default
-
-  int batch_size = FLAGS_batch_size;
-  int num_times = FLAGS_repeat;
-
-  auto base_predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
-  auto predictor =
-      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-          config);
-  std::vector<PaddleTensor> input_slots;
-  DataRecord data(FLAGS_infer_data, batch_size);
-  PrepareInputs(&input_slots, &data, batch_size);
-  std::vector<PaddleTensor> outputs, base_outputs;
-
-  Timer timer1;
-  timer1.tic();
-  for (int i = 0; i < num_times; i++) {
-    base_predictor->Run(input_slots, &base_outputs);
-  }
-  PrintTime(batch_size, num_times, 1, 0, timer1.toc() / num_times);
-
-  Timer timer2;
-  timer2.tic();
-  for (int i = 0; i < num_times; i++) {
-    predictor->Run(input_slots, &outputs);
-  }
-  PrintTime(batch_size, num_times, 1, 0, timer2.toc() / num_times);
-
-  CompareResult(base_outputs, data.result_data);
-  CompareResult(outputs, data.result_data);
-}
-
-TEST(Analyzer, rnn2) { TestRNN2Prediction(); }
+void SetConfig(AnalysisConfig *cfg) {
+  cfg->prog_file = FLAGS_infer_model + "/__model__";
+  cfg->param_file = FLAGS_infer_model + "/param";
+  cfg->use_gpu = false;
+  cfg->device = 0;
+  cfg->specify_input_name = true;
+  cfg->enable_ir_optim = true;
+}
+
+void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
+  DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+  std::vector<PaddleTensor> input_slots;
+  int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1;
+  LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size;
+  for (int bid = 0; bid < epoch; ++bid) {
+    PrepareInputs(&input_slots, &data, FLAGS_batch_size);
+    (*inputs).emplace_back(input_slots);
+  }
+}
+
+// Easy for profiling independently.
+TEST(Analyzer_rnn2, profile) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  std::vector<PaddleTensor> outputs;
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+
+  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
+    // the first inference result
+    DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
+    PADDLE_ENFORCE_GT(outputs.size(), 0);
+    size_t size = GetSize(outputs[0]);
+    PADDLE_ENFORCE_GT(size, 0);
+    float *result = static_cast<float *>(outputs[0].data.data());
+    for (size_t i = 0; i < size; i++) {
+      EXPECT_NEAR(result[i], data.result_data[i], 1e-3);
+    }
+  }
+}
+
+// Compare result of NativeConfig and AnalysisConfig
+TEST(Analyzer_rnn2, compare) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareNativeAndAnalysis(cfg, input_slots_all);
+}
 
 }  // namespace inference
 }  // namespace paddle
@@ -46,54 +46,63 @@ struct DataReader {
   std::unique_ptr<std::ifstream> file;
 };
 
-void Main(int batch_size) {
-  // shape --
-  // Create Predictor --
-  AnalysisConfig config;
-  config.model_dir = FLAGS_infer_model;
-  config.use_gpu = false;
-  config.enable_ir_optim = true;
-
-  std::vector<PaddleTensor> input_slots, output_slots;
+void SetConfig(AnalysisConfig *cfg) {
+  cfg->model_dir = FLAGS_infer_model;
+  cfg->use_gpu = false;
+  cfg->device = 0;
+  cfg->specify_input_name = true;
+  cfg->enable_ir_optim = true;
+}
+
+void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
+  std::vector<PaddleTensor> input_slots;
   DataReader reader(FLAGS_infer_data);
-  std::vector<std::vector<PaddleTensor>> input_slots_all;
-
-  if (FLAGS_test_all_data) {
-    LOG(INFO) << "test all data";
-    int num_batches = 0;
-    while (reader.NextBatch(&input_slots, FLAGS_batch_size)) {
-      input_slots_all.emplace_back(input_slots);
-      ++num_batches;
-    }
-    LOG(INFO) << "total number of samples: " << num_batches * FLAGS_batch_size;
-    TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads);
-    return;
+  int num_batches = 0;
+  while (reader.NextBatch(&input_slots, FLAGS_batch_size)) {
+    (*inputs).emplace_back(input_slots);
+    ++num_batches;
+    if (!FLAGS_test_all_data) return;
   }
-
-  // one batch starts
-  // data --
-  reader.NextBatch(&input_slots, FLAGS_batch_size);
-  input_slots_all.emplace_back(input_slots);
-  TestPrediction(config, input_slots_all, &output_slots, FLAGS_num_threads);
-
-  // Get output
-  LOG(INFO) << "get outputs " << output_slots.size();
-
-  for (auto &output : output_slots) {
-    LOG(INFO) << "output.shape: " << to_string(output.shape);
-    // no lod ?
-    CHECK_EQ(output.lod.size(), 0UL);
-    LOG(INFO) << "output.dtype: " << output.dtype;
-    std::stringstream ss;
-    for (int i = 0; i < 5; i++) {
-      ss << static_cast<float *>(output.data.data())[i] << " ";
+  LOG(INFO) << "total number of samples: " << num_batches * FLAGS_batch_size;
+}
+
+// Easy for profiling independently.
+TEST(Analyzer_Text_Classification, profile) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  std::vector<PaddleTensor> outputs;
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+
+  if (FLAGS_num_threads == 1) {
+    // Get output
+    LOG(INFO) << "get outputs " << outputs.size();
+    for (auto &output : outputs) {
+      LOG(INFO) << "output.shape: " << to_string(output.shape);
+      // no lod ?
+      CHECK_EQ(output.lod.size(), 0UL);
+      LOG(INFO) << "output.dtype: " << output.dtype;
+      std::stringstream ss;
+      for (int i = 0; i < 5; i++) {
+        ss << static_cast<float *>(output.data.data())[i] << " ";
      }
-    LOG(INFO) << "output.data summary: " << ss.str();
-    // one batch ends
+      LOG(INFO) << "output.data summary: " << ss.str();
+      // one batch ends
+    }
   }
 }
 
-TEST(text_classification, basic) { Main(FLAGS_batch_size); }
+// Compare result of NativeConfig and AnalysisConfig
+TEST(Analyzer_Text_Classification, compare) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareNativeAndAnalysis(cfg, input_slots_all);
+}
 
 }  // namespace inference
 }  // namespace paddle
@@ -49,84 +49,83 @@ Record ProcessALine(const std::string &line) {
   return record;
 }
 
-/*
- * Use the native and analysis fluid engine to inference the demo.
- * ocr, mobilenet and se_resnext50
- */
-void TestVisualPrediction(bool use_mkldnn) {
-  std::unique_ptr<PaddlePredictor> predictor;
-  AnalysisConfig cfg;
-  cfg.param_file = FLAGS_infer_model + "/__params__";
-  cfg.prog_file = FLAGS_infer_model + "/__model__";
-  cfg.use_gpu = false;
-  cfg._use_mkldnn = use_mkldnn;
-  cfg.device = 0;
-  cfg.enable_ir_optim = true;
+void SetConfig(AnalysisConfig *cfg) {
+  cfg->param_file = FLAGS_infer_model + "/__params__";
+  cfg->prog_file = FLAGS_infer_model + "/__model__";
+  cfg->use_gpu = false;
+  cfg->device = 0;
+  cfg->enable_ir_optim = true;
+  cfg->specify_input_name = true;
   // TODO(TJ): fix fusion gru
-  cfg.ir_passes.push_back("fc_gru_fuse_pass");
+  cfg->ir_passes.push_back("fc_gru_fuse_pass");
 #ifdef PADDLE_WITH_MKLDNN
+  cfg->_use_mkldnn = true;
   // disable mkldnn fuse since it should have some bugs
-  cfg.ir_passes.push_back("conv_relu_mkldnn_fuse_pass");
+  cfg->ir_passes.push_back("conv_relu_mkldnn_fuse_pass");
 #endif
-  predictor =
-      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(cfg);
+}
 
-  // Only have single batch of data.
+void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
+  PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data.");
   std::string line;
   std::ifstream file(FLAGS_infer_data);
   std::getline(file, line);
   auto record = ProcessALine(line);
-  file.close();
-
-  // Inference.
   PaddleTensor input;
   input.shape = record.shape;
-  input.data =
-      PaddleBuf(record.data.data(), record.data.size() * sizeof(float));
   input.dtype = PaddleDType::FLOAT32;
+  size_t input_size = record.data.size() * sizeof(float);
+  input.data.Resize(input_size);
+  memcpy(input.data.data(), record.data.data(), input_size);
+  std::vector<PaddleTensor> input_slots;
+  input_slots.assign({input});
+  (*inputs).emplace_back(input_slots);
+}
 
-  std::vector<PaddleTensor> outputs_slots;
-  Timer timer;
-  timer.tic();
-  for (int i = 0; i < FLAGS_repeat; i++) {
-    predictor->Run({input}, &outputs_slots);
-  }
-  PrintTime(/*batch size*/ 1, FLAGS_repeat, /*num threads*/ 1, /*thread id*/ 0,
-            timer.toc() / FLAGS_repeat);
-
-  VLOG(3) << "output.size " << outputs_slots.size();
-
-  // run native as reference
-  auto ref_predictor =
-      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(cfg);
-  std::vector<PaddleTensor> ref_outputs_slots;
-  ref_predictor->Run({input}, &ref_outputs_slots);
-  CompareResult(outputs_slots, ref_outputs_slots);
-  // print what are fused
-  AnalysisPredictor *analysis_predictor =
-      dynamic_cast<AnalysisPredictor *>(predictor.get());
-  auto &fuse_statis = analysis_predictor->analysis_argument()
-                          .Get<std::unordered_map<std::string, int>>(
-                              framework::ir::kFuseStatisAttr);
-  for (auto &item : fuse_statis) {
-    LOG(INFO) << "fused " << item.first << " " << item.second;
-  }
-  int num_ops = 0;
-  for (auto &node :
-       analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
-    if (node->IsFunction()) {
-      ++num_ops;
-    }
-  }
-  LOG(INFO) << "has num ops: " << num_ops;
-}
+// Easy for profiling independently.
+// ocr, mobilenet and se_resnext50
+TEST(Analyzer_vis, profile) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  std::vector<PaddleTensor> outputs;
 
-TEST(Analyzer_vis, analysis) { TestVisualPrediction(/*use_mkldnn*/ false); }
-#ifdef PADDLE_WITH_MKLDNN
-TEST(Analyzer_vis, analysis_mkldnn) {
-  TestVisualPrediction(/*use_mkldnn*/ true);
-}
-#endif
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads);
+
+  if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) {
+    const float ocr_result_data[] = {
+        5.273636460856323538e-08, 3.296741795111302054e-07,
+        1.873261190610264748e-08, 3.403730275408634043e-08,
+        3.383312474625199684e-08};
+    PADDLE_ENFORCE_EQ(outputs.size(), 1UL);
+    size_t size = GetSize(outputs[0]);
+    PADDLE_ENFORCE_GT(size, 0);
+    float *result = static_cast<float *>(outputs[0].data.data());
+    for (size_t i = 0; i < std::min(5UL, size); i++) {
+      EXPECT_NEAR(result[i], ocr_result_data[i], 1e-3);
+    }
+  }
+}
+
+// Check the fuse status
+TEST(Analyzer_vis, fuse_statis) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  int num_ops;
+  GetFuseStatis(cfg, &num_ops);
+}
+
+// Compare result of NativeConfig and AnalysisConfig
TEST(Analyzer_vis, compare) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareNativeAndAnalysis(cfg, input_slots_all);
+}
 
 }  // namespace analysis
 }  // namespace inference
......
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <gtest/gtest.h>
+#include <string>
 #include <thread>  // NOLINT
 #include <vector>
 #include "paddle/fluid/framework/ir/fuse_pass_base.h"
@@ -28,17 +29,18 @@
 DEFINE_string(infer_model, "", "model path");
 DEFINE_string(infer_data, "", "data file");
 DEFINE_int32(batch_size, 1, "batch size.");
-DEFINE_int32(burning, 0, "Burning before repeat.");
 DEFINE_int32(repeat, 1, "Running the inference program repeat times.");
 DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
 DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
+DEFINE_bool(use_analysis, true,
+            "Running the inference program in analysis mode.");
 
 namespace paddle {
 namespace inference {
 
 void CompareResult(const std::vector<PaddleTensor> &outputs,
                    const std::vector<PaddleTensor> &ref_outputs) {
-  EXPECT_GT(outputs.size(), 0);
+  EXPECT_GT(outputs.size(), 0UL);
   EXPECT_EQ(outputs.size(), ref_outputs.size());
   for (size_t i = 0; i < outputs.size(); i++) {
     auto &out = outputs[i];
@@ -72,14 +74,50 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
   }
 }
 
+std::unique_ptr<PaddlePredictor> GetPrediction(AnalysisConfig config,
+                                               bool use_analysis = true) {
+  if (use_analysis) {
+    return CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
+        config);
+  } else {
+    return CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(
+        config);
+  }
+}
+
+size_t GetSize(const PaddleTensor &out) {
+  return std::accumulate(out.shape.begin(), out.shape.end(), 1,
+                         [](int a, int b) { return a * b; });
+}
+
+std::unordered_map<std::string, int> GetFuseStatis(AnalysisConfig config,
+                                                   int *num_ops) {
+  auto predictor = GetPrediction(config);
+  AnalysisPredictor *analysis_predictor =
+      dynamic_cast<AnalysisPredictor *>(predictor.get());
+  auto &fuse_statis = analysis_predictor->analysis_argument()
+                          .Get<std::unordered_map<std::string, int>>(
+                              framework::ir::kFuseStatisAttr);
+  for (auto &item : fuse_statis) {
+    LOG(INFO) << "fused " << item.first << " " << item.second;
+  }
+  int num = 0;
+  for (auto &node :
+       analysis_predictor->analysis_argument().main_dfg->nodes.nodes()) {
+    if (node->IsFunction()) {
+      ++num;
+    }
+  }
+  *num_ops = num;
+  return fuse_statis;
+}
+
 void TestOneThreadPrediction(
     AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs,
-    std::vector<PaddleTensor> *outputs) {
+    std::vector<PaddleTensor> *outputs, bool use_analysis = true) {
   int batch_size = FLAGS_batch_size;
   int num_times = FLAGS_repeat;
-  auto predictor =
-      CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-          config);
+  auto predictor = GetPrediction(config, use_analysis);
   Timer timer;
   timer.tic();
   for (int i = 0; i < num_times; i++) {
@@ -93,7 +131,8 @@ void TestOneThreadPrediction(
 
 void TestMultiThreadPrediction(
     AnalysisConfig config, const std::vector<std::vector<PaddleTensor>> inputs,
-    std::vector<PaddleTensor> *outputs, int num_threads) {
+    std::vector<PaddleTensor> *outputs, int num_threads,
+    bool use_analysis = true) {
   int batch_size = FLAGS_batch_size;
   int num_times = FLAGS_repeat;
   std::vector<std::thread> threads;
@@ -101,9 +140,7 @@ void TestMultiThreadPrediction(
   // TODO(yanchunwei): Bug here, the analyzer phase can't be parallelled
   // because AttentionLSTM's hard code nodeid will be damanged.
   for (int tid = 0; tid < num_threads; ++tid) {
-    predictors.emplace_back(
-        CreatePaddlePredictor<AnalysisConfig, PaddleEngineKind::kAnalysis>(
-            config));
+    predictors.emplace_back(GetPrediction(config, use_analysis));
   }
   for (int tid = 0; tid < num_threads; ++tid) {
     threads.emplace_back([&, tid]() {
@@ -129,13 +166,25 @@ void TestMultiThreadPrediction(
 
 void TestPrediction(AnalysisConfig config,
                     const std::vector<std::vector<PaddleTensor>> inputs,
-                    std::vector<PaddleTensor> *outputs, int num_threads) {
+                    std::vector<PaddleTensor> *outputs, int num_threads,
+                    bool use_analysis = FLAGS_use_analysis) {
+  LOG(INFO) << "use_analysis: " << use_analysis;
   if (num_threads == 1) {
-    TestOneThreadPrediction(config, inputs, outputs);
+    TestOneThreadPrediction(config, inputs, outputs, use_analysis);
  } else {
-    TestMultiThreadPrediction(config, inputs, outputs, num_threads);
+    TestMultiThreadPrediction(config, inputs, outputs, num_threads,
+                              use_analysis);
  }
 }
 
+void CompareNativeAndAnalysis(
+    AnalysisConfig config,
+    const std::vector<std::vector<PaddleTensor>> inputs) {
+  std::vector<PaddleTensor> native_outputs, analysis_outputs;
+  TestOneThreadPrediction(config, inputs, &native_outputs, false);
+  TestOneThreadPrediction(config, inputs, &analysis_outputs, true);
+  CompareResult(analysis_outputs, native_outputs);
+}
+
 }  // namespace inference
 }  // namespace paddle
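Side note on the new GetSize helper: it simply folds a tensor's shape into its element count with std::accumulate, and with FLAGS_use_analysis defaulting to true, TestPrediction exercises the analysis path unless a caller passes use_analysis = false, which is exactly what CompareNativeAndAnalysis does for its reference run. A standalone sketch of the same shape folding, with a hypothetical shape value that is not taken from the commit:

// Standalone illustration only: the same folding GetSize() performs.
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  std::vector<int> shape = {1, 30, 1};  // hypothetical output tensor shape
  size_t size = std::accumulate(shape.begin(), shape.end(), 1,
                                [](int a, int b) { return a * b; });
  std::cout << size << std::endl;  // prints 30, the element count
  return 0;
}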
@@ -174,12 +174,13 @@ struct SparseAdamFunctor {
 
   const int64_t* rows_;
   int64_t row_numel_;
+  int64_t row_count_;
 
   SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
                     const T* beta2_pow, const T* mom1, T* mom1_out,
                     const T* mom2, T* mom2_out, const T* lr, const T* grad,
                     const T* param, T* param_out, const int64_t* rows,
-                    int64_t row_numel)
+                    int64_t row_numel, int64_t row_count)
       : beta1_(beta1),
         beta2_(beta2),
         epsilon_(epsilon),
@@ -194,28 +195,47 @@ struct SparseAdamFunctor {
         param_(param),
         param_out_(param_out),
         rows_(rows),
-        row_numel_(row_numel) {}
+        row_numel_(row_numel),
+        row_count_(row_count) {}
+
+  inline HOSTDEVICE int64_t BinarySearchInRows(int64_t row) const {
+    int64_t beg = 0, end = row_count_ - 1;
+    while (beg <= end) {
+      auto mid = ((beg + end) >> 1);
+      if (rows_[mid] == row)
+        return mid;
+      else if (rows_[mid] < row)
+        beg = mid + 1;
+      else
+        end = mid - 1;
+    }
+    return -1;
+  }
 
   inline HOSTDEVICE void operator()(size_t i) const {
+    int64_t row = i / row_numel_;
+    auto row_idx = BinarySearchInRows(row);
+    T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
+
+    // The following code is the same as dense
+    T mom1 = moment1_[i];
+    T mom2 = moment2_[i];
+    T lr = *lr_;
     T beta1_pow = *beta1_pow_;
    T beta2_pow = *beta2_pow_;
-    for (int64_t j = 0; j < row_numel_; ++j) {
-      T g = grad_[i * row_numel_ + j];
-      T mom1 = moment1_[rows_[i] * row_numel_ + j];
-      T mom2 = moment2_[rows_[i] * row_numel_ + j];
-      T lr = *lr_;
-      T p = param_[rows_[i] * row_numel_ + j];
-
-      lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);
-
-      mom1 = beta1_ * mom1 + (1 - beta1_) * g;
-      mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
-      p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
-
-      moment1_out_[rows_[i] * row_numel_ + j] = mom1;
-      moment2_out_[rows_[i] * row_numel_ + j] = mom2;
-      param_out_[rows_[i] * row_numel_ + j] = p;
-    }  // for col id
+    T p = param_[i];
+
+    // Calculation
+    lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);
+
+    mom1 = beta1_ * mom1 + (1 - beta1_) * g;
+    mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
+    p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
+
+    // Write back to global memory
+    moment1_out_[i] = mom1;
+    moment2_out_[i] = mom2;
+    param_out_[i] = p;
   }
 };
@@ -287,9 +307,14 @@ class AdamOpKernel : public framework::OpKernel<T> {
       return;
     }
     // merge duplicated rows if any.
+    // The rows of grad_merge have been sorted inside MergeAdd functor
     scatter::MergeAdd<DeviceContext, T> merge_func;
-    auto grad_merge =
-        merge_func(ctx.template device_context<DeviceContext>(), grad);
+    auto& grad_merge = *(ctx.scope()
+                             .NewScope()
+                             .Var("sparse_adam_grad_merge")
+                             ->GetMutable<framework::SelectedRows>());
+    merge_func(ctx.template device_context<DeviceContext>(), grad,
+               &grad_merge);
     auto& grad_tensor = grad_merge.value();
     const T* grad_data = grad_tensor.template data<T>();
     int64_t* rows = nullptr;
@@ -314,10 +339,11 @@ class AdamOpKernel : public framework::OpKernel<T> {
           mom2.template data<T>(),
          mom2_out.template mutable_data<T>(ctx.GetPlace()),
          lr.template data<T>(), grad_data, param.template data<T>(),
-          param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel);
+          param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel,
+          grad_merge.rows().size());
      platform::ForRange<DeviceContext> for_range(
          static_cast<const DeviceContext&>(ctx.device_context()),
-          grad_merge.rows().size());
+          param.numel());
      for_range(functor);
    } else {
      PADDLE_THROW("Variable type not supported by adam_op");
......
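Why the new functor can iterate over every parameter element: rows_ holds the sorted, de-duplicated row ids produced by MergeAdd (see the comment above merge_func), so BinarySearchInRows either returns the position of that row's gradient block or -1, in which case the update proceeds with g = 0. That is what allows for_range to cover param.numel() instead of only grad_merge.rows().size(). A standalone sketch of the same lookup, with hypothetical row ids that are not taken from the commit:

// Standalone sketch of SparseAdamFunctor::BinarySearchInRows: find the
// position of `row` in a sorted array of unique row ids, or -1 if absent.
#include <cassert>
#include <cstdint>

int64_t BinarySearchInRows(const int64_t* rows, int64_t row_count,
                           int64_t row) {
  int64_t beg = 0, end = row_count - 1;
  while (beg <= end) {
    int64_t mid = (beg + end) >> 1;
    if (rows[mid] == row) return mid;
    if (rows[mid] < row)
      beg = mid + 1;
    else
      end = mid - 1;
  }
  return -1;
}

int main() {
  const int64_t rows[] = {0, 4, 7};  // hypothetical sorted rows from MergeAdd
  assert(BinarySearchInRows(rows, 3, 4) == 1);   // row 4 -> gradient block 1
  assert(BinarySearchInRows(rows, 3, 5) == -1);  // row 5 -> no gradient, g = 0
  return 0;
}

Rows with no gradient still receive the bias-corrected Adam step with g = 0, so their moments decay exactly as in the dense kernel.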
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/transform.h"
 
 namespace paddle {
@@ -61,14 +62,32 @@ class ClipKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& context) const override {
     auto max = context.Attr<T>("max");
     auto min = context.Attr<T>("min");
-    auto* x = context.Input<Tensor>("X");
-    auto* out = context.Output<Tensor>("Out");
-    T* out_data = out->mutable_data<T>(context.GetPlace());
-    const T* x_data = x->data<T>();
-    int64_t numel = x->numel();
-    Transform<DeviceContext> trans;
-    trans(context.template device_context<DeviceContext>(), x_data,
-          x_data + numel, out_data, ClipFunctor<T>(min, max));
+    auto* x_var = context.InputVar("X");
+    if (x_var->IsType<framework::LoDTensor>()) {
+      auto* x = context.Input<framework::LoDTensor>("X");
+      auto* out = context.Output<framework::LoDTensor>("Out");
+      T* out_data = out->mutable_data<T>(context.GetPlace());
+      const T* x_data = x->data<T>();
+      int64_t numel = x->numel();
+      Transform<DeviceContext> trans;
+      trans(context.template device_context<DeviceContext>(), x_data,
+            x_data + numel, out_data, ClipFunctor<T>(min, max));
+    } else if (x_var->IsType<framework::SelectedRows>()) {
+      auto* x = context.Input<framework::SelectedRows>("X");
+      auto* out = context.Output<framework::SelectedRows>("Out");
+      PADDLE_ENFORCE_NE(x, out,
+                        "Inplace clip is not allowed when x is SelectedRows");
+      math::scatter::MergeAdd<DeviceContext, T> merge_func;
+      merge_func(context.template device_context<DeviceContext>(), *x, out);
+      auto* out_tensor = out->mutable_value();
+      auto* out_data = out_tensor->data<T>();
+      int64_t numel = out_tensor->numel();
+      Transform<DeviceContext> trans;
+      trans(context.template device_context<DeviceContext>(), out_data,
+            out_data + numel, out_data, ClipFunctor<T>(min, max));
+    } else {
+      PADDLE_THROW("ClipOp only supports LoDTensor and SelectedRows");
+    }
   }
 };
 
@@ -78,10 +97,12 @@ class ClipGradKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& context) const override {
     auto max = context.Attr<T>("max");
     auto min = context.Attr<T>("min");
-    auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
+    auto* d_out =
+        context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
+    auto* d_x =
+        context.Output<framework::LoDTensor>(framework::GradVarName("X"));
     if (d_x != nullptr) {
-      auto* x = context.Input<Tensor>("X");
+      auto* x = context.Input<framework::LoDTensor>("X");
       int64_t numel = d_out->numel();
      auto* d_x_data = d_x->mutable_data<T>(context.GetPlace());
      const T* d_out_data = d_out->data<T>();
......
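The clip kernel now dispatches on the input's variable type: a LoDTensor input keeps the old element-wise path, while a SelectedRows input is first deduplicated with MergeAdd (clip is non-linear, so duplicate rows must be summed before clipping) and then clipped in place on the merged value tensor. At the Python layer the op is exposed through fluid.layers.clip; a minimal sketch, assuming that wrapper routes to this kernel:

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[8], dtype='float32')
    # Element-wise clamp to [-1, 1]; with a SelectedRows input (e.g. a
    # sparse gradient) the kernel above merges duplicate rows first.
    clipped = fluid.layers.clip(x=x, min=-1.0, max=1.0)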
@@ -199,6 +199,14 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
   framework::SelectedRows operator()(const platform::CPUDeviceContext& context,
                                      const framework::SelectedRows& input) {
     framework::SelectedRows out;
+    (*this)(context, input, &out);
+    return out;
+  }
+
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::SelectedRows& input,
+                  framework::SelectedRows* output) {
+    framework::SelectedRows& out = *output;
     auto input_rows = input.rows();
     std::set<int64_t> row_set(input_rows.begin(), input_rows.end());
     std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());

@@ -223,7 +231,6 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
         out_data[out_i * input_width + j] += input_data[i * input_width + j];
       }
     }
-    return out;
   }
 };
...
@@ -234,7 +234,7 @@ template <typename T, int block_size>
 __global__ void MergeAddKernel(const T* input, const int64_t* input_rows,
                                T* out, const int64_t* out_rows,
                                size_t out_rows_size, int64_t row_numel) {
-  const int ty = blockIdx.y;
+  const int ty = blockIdx.x;
   int tid = threadIdx.x;
   __shared__ size_t out_idx;

@@ -260,6 +260,14 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
   framework::SelectedRows operator()(const platform::CUDADeviceContext& context,
                                      const framework::SelectedRows& input) {
     framework::SelectedRows out;
+    (*this)(context, input, &out);
+    return out;
+  }
+
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::SelectedRows& input,
+                  framework::SelectedRows* output) {
+    framework::SelectedRows& out = *output;
     framework::Vector<int64_t> input_rows(input.rows());
     std::set<int64_t> row_set(input_rows.begin(), input_rows.end());
     std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());

@@ -281,16 +289,12 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
     const int block_size = 256;
     dim3 threads(block_size, 1);
-    dim3 grid1(1, input_rows.size());
+    dim3 grid1(input_rows.size(), 1);
-    MergeAddKernel<
-        T, 256><<<grid1, threads, 0,
-                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                      .stream()>>>(
+    MergeAddKernel<T, 256><<<grid1, threads, 0, context.stream()>>>(
         input_data, input_rows.CUDAData(context.GetPlace()), out_data,
         out.mutable_rows()->CUDAMutableData(context.GetPlace()),
         out.rows().size(), input_width);
-    return out;
   }
 };
...
@@ -65,6 +65,9 @@ struct MergeAdd {
   // the input SelectedRows object.
   framework::SelectedRows operator()(const DeviceContext& context,
                                      const framework::SelectedRows& input);
+  void operator()(const DeviceContext& context,
+                  const framework::SelectedRows& input,
+                  framework::SelectedRows* output);
 };

 template <typename DeviceContext, typename T>
...
@@ -43,11 +43,7 @@ class Optimizer(object):
     but need to use one of it's implementation.
     """

-    def __init__(self,
-                 learning_rate,
-                 regularization=None,
-                 LARS_weight_decay=0.0,
-                 name=None):
+    def __init__(self, learning_rate, regularization=None, name=None):
         if not isinstance(learning_rate, float) and \
                 not isinstance(learning_rate, framework.Variable):
             raise TypeError("learning rate should be float or Variable")

@@ -68,7 +64,6 @@ class Optimizer(object):
         # {accum_name : { paramter_name : accumulator_for_parameter, ...}, ...}
         self._accumulators = defaultdict(lambda: dict())
         self.helper = None
-        self._LARS_weight_decay = LARS_weight_decay

     def _create_global_learning_rate(self):
         lr = self._global_learning_rate()

@@ -109,7 +104,6 @@ class Optimizer(object):
         param = param_and_grad[0]
         param_lr = param.optimize_attr['learning_rate']
         if type(param_lr) == Variable:
-            # param learning rate has been updated (LARS)
             print("returns updated param lr ", param_lr)
             return param_lr
         else:

@@ -227,10 +221,6 @@ class Optimizer(object):
         self._create_accumulators(loss.block,
                                   [p[0] for p in parameters_and_grads])
         self._create_global_learning_rate()
-        if self._LARS_weight_decay > 0.0:
-            layers.append_LARS(parameters_and_grads,
-                               self._global_learning_rate(),
-                               self._LARS_weight_decay)

         optimize_ops = []
         for param_and_grad in parameters_and_grads:
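With **kwargs gone, every optimizer now declares regularization and name explicitly, and LARS support is removed from the base class entirely. A minimal sketch of the new call shape (cost is assumed to be a scalar loss variable defined elsewhere):

    import paddle.fluid as fluid

    # Explicit keywords replace the old **kwargs plumbing; LARS_weight_decay
    # is no longer an accepted argument.
    optimizer = fluid.optimizer.Momentum(
        learning_rate=0.1,
        momentum=0.9,
        regularization=fluid.regularizer.L2Decay(1e-4),
        name="momentum_opt")
    optimizer.minimize(cost)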
@@ -287,6 +277,9 @@ class SGDOptimizer(Optimizer):
     Args:
         learning_rate (float|Variable): the learning rate used to update parameters. \
             Can be a float value or a Variable with one float value as data element.
+        regularization: A Regularizer, such as
+            fluid.regularizer.L2DecayRegularizer.
+        name: A optional name prefix.

     Examples:
         .. code-block:: python

@@ -295,10 +288,12 @@ class SGDOptimizer(Optimizer):
             sgd_optimizer.minimize(cost)
     """

-    def __init__(self, learning_rate, **kwargs):
+    def __init__(self, learning_rate, regularization=None, name=None):
         assert learning_rate is not None
         super(SGDOptimizer, self).__init__(
-            learning_rate=learning_rate, **kwargs)
+            learning_rate=learning_rate,
+            regularization=regularization,
+            name=name)
         self.type = "sgd"

     def _append_optimize_op(self, block, param_and_grad):

@@ -343,6 +338,9 @@ class MomentumOptimizer(Optimizer):
             Can be a float value or a Variable with one float value as data element.
         momentum (float): momentum factor
         use_nesterov (bool): enables Nesterov momentum
+        regularization: A Regularizer, such as
+            fluid.regularizer.L2DecayRegularizer.
+        name: A optional name prefix.

     Examples:
         .. code-block:: python

@@ -352,11 +350,18 @@ class MomentumOptimizer(Optimizer):
     """
     _velocity_acc_str = "velocity"

-    def __init__(self, learning_rate, momentum, use_nesterov=False, **kwargs):
+    def __init__(self,
+                 learning_rate,
+                 momentum,
+                 use_nesterov=False,
+                 regularization=None,
+                 name=None):
         assert learning_rate is not None
         assert momentum is not None
         super(MomentumOptimizer, self).__init__(
-            learning_rate=learning_rate, **kwargs)
+            learning_rate=learning_rate,
+            regularization=regularization,
+            name=name)
         self.type = "momentum"
         self._momentum = momentum
         self._use_nesterov = bool(use_nesterov)

@@ -412,6 +417,9 @@ class AdagradOptimizer(Optimizer):
         learning_rate (float|Variable): the learning rate used to update parameters. \
             Can be a float value or a Variable with one float value as data element.
         epsilon (float): a small float value for numerical stability.
+        regularization: A Regularizer, such as
+            fluid.regularizer.L2DecayRegularizer.
+        name: A optional name prefix.

     Examples:
         .. code-block:: python

@@ -421,11 +429,17 @@ class AdagradOptimizer(Optimizer):
     """
     _moment_acc_str = "moment"

-    def __init__(self, learning_rate, epsilon=1.0e-6, **kwargs):
+    def __init__(self,
+                 learning_rate,
+                 epsilon=1.0e-6,
+                 regularization=None,
+                 name=None):
         assert learning_rate is not None
         assert epsilon is not None
         super(AdagradOptimizer, self).__init__(
-            learning_rate=learning_rate, **kwargs)
+            learning_rate=learning_rate,
+            regularization=regularization,
+            name=name)
         self.type = "adagrad"
         self._epsilon = epsilon

@@ -485,6 +499,9 @@ class AdamOptimizer(Optimizer):
         beta1 (float): The exponential decay rate for the 1st moment estimates.
         beta2 (float): The exponential decay rate for the 2nd moment estimates.
         epsilon (float): a small float value for numerical stability.
+        regularization: A Regularizer, such as
+            fluid.regularizer.L2DecayRegularizer.
+        name: A optional name prefix.

     Examples:
         .. code-block:: python

@@ -503,13 +520,16 @@ class AdamOptimizer(Optimizer):
                  beta1=0.9,
                  beta2=0.999,
                  epsilon=1e-8,
-                 **kwargs):
+                 regularization=None,
+                 name=None):
         assert learning_rate is not None
         assert beta1 is not None
         assert beta2 is not None
         assert epsilon is not None
         super(AdamOptimizer, self).__init__(
-            learning_rate=learning_rate, **kwargs)
+            learning_rate=learning_rate,
+            regularization=regularization,
+            name=name)
         self.type = "adam"
         self._beta1 = beta1
         self._beta2 = beta2

@@ -629,6 +649,9 @@ class AdamaxOptimizer(Optimizer):
         beta1 (float): The exponential decay rate for the 1st moment estimates.
         beta2 (float): The exponential decay rate for the 2nd moment estimates.
         epsilon (float): a small float value for numerical stability.
+        regularization: A Regularizer, such as
+            fluid.regularizer.L2DecayRegularizer.
+        name: A optional name prefix.

     Examples:
         .. code-block:: python

@@ -645,13 +668,16 @@ class AdamaxOptimizer(Optimizer):
                  beta1=0.9,
                  beta2=0.999,
                  epsilon=1e-8,
-                 **kwargs):
+                 regularization=None,
+                 name=None):
         assert learning_rate is not None
         assert beta1 is not None
         assert beta2 is not None
         assert epsilon is not None
         super(AdamaxOptimizer, self).__init__(
-            learning_rate=learning_rate, **kwargs)
+            learning_rate=learning_rate,
+            regularization=regularization,
+            name=name)
         self.type = "adamax"
         self._beta1 = beta1
         self._beta2 = beta2

@@ -742,6 +768,9 @@ class DecayedAdagradOptimizer(Optimizer):
             Can be a float value or a Variable with one float value as data element.
         decay (float): decay rate.
         epsilon (float): a small float value for numerical stability.
+        regularization: A Regularizer, such as
+            fluid.regularizer.L2DecayRegularizer.
+        name: A optional name prefix.

     Examples:
         .. code-block:: python

@@ -751,13 +780,20 @@ class DecayedAdagradOptimizer(Optimizer):
     """
     _moment_acc_str = "moment"

-    def __init__(self, learning_rate, decay=0.95, epsilon=1.0e-6, **kwargs):
+    def __init__(self,
+                 learning_rate,
+                 decay=0.95,
+                 epsilon=1.0e-6,
+                 regularization=None,
+                 name=None):
         assert learning_rate is not None
         assert decay is not None
         assert epsilon is not None
         super(DecayedAdagradOptimizer, self).__init__(
-            learning_rate=learning_rate, **kwargs)
+            learning_rate=learning_rate,
+            regularization=regularization,
+            name=name)
         self.type = "decayed_adagrad"
         self._decay = decay
         self._epsilon = epsilon

@@ -811,6 +847,9 @@ class AdadeltaOptimizer(Optimizer):
         learning_rate(float): global learning rate
         rho(float): rho in equation
         epsilon(float): epsilon in equation
+        regularization: A Regularizer, such as
+            fluid.regularizer.L2DecayRegularizer.
+        name: A optional name prefix.

     Examples:
         .. code-block:: python

@@ -823,7 +862,12 @@ class AdadeltaOptimizer(Optimizer):
     _avg_squared_grad_acc_str = "_avg_squared_grad"
     _avg_squared_update_acc_str = "_avg_squared_update"

-    def __init__(self, learning_rate, epsilon=1.0e-6, rho=0.95, **kwargs):
+    def __init__(self,
+                 learning_rate,
+                 epsilon=1.0e-6,
+                 rho=0.95,
+                 regularization=None,
+                 name=None):
         if learning_rate is None:
             raise ValueError("learning_rate is not set.")
         if epsilon is None:

@@ -831,7 +875,9 @@ class AdadeltaOptimizer(Optimizer):
         if rho is None:
             raise ValueError("rho is not set.")
         super(AdadeltaOptimizer, self).__init__(
-            learning_rate=learning_rate, **kwargs)
+            learning_rate=learning_rate,
+            regularization=regularization,
+            name=name)
         self.type = "adadelta"
         self._epsilon = epsilon
         self._rho = rho

@@ -932,6 +978,9 @@ class RMSPropOptimizer(Optimizer):
             the gradient; if False, by the uncentered second moment. Setting this to
             True may help with training, but is slightly more expensive in terms of
             computation and memory. Defaults to False.
+        regularization: A Regularizer, such as
+            fluid.regularizer.L2DecayRegularizer.
+        name: A optional name prefix.

     Raises:
         ValueError: If learning_rate, rho, epsilon, momentum are None.

@@ -953,9 +1002,12 @@ class RMSPropOptimizer(Optimizer):
                  epsilon=1.0e-6,
                  momentum=0.0,
                  centered=False,
-                 **kwargs):
+                 regularization=None,
+                 name=None):
         super(RMSPropOptimizer, self).__init__(
-            learning_rate=learning_rate, **kwargs)
+            learning_rate=learning_rate,
+            regularization=regularization,
+            name=name)
         if learning_rate is None:
             raise ValueError("learning_rate is not set.")
         if rho is None:

@@ -1061,6 +1113,9 @@ class FtrlOptimizer(Optimizer):
         l1 (float):
         l2 (float):
         lr_power (float):
+        regularization: A Regularizer, such as
+            fluid.regularizer.L2DecayRegularizer.
+        name: A optional name prefix.

     Raises:
         ValueError: If learning_rate, rho, epsilon, momentum are None.

@@ -1075,9 +1130,17 @@ class FtrlOptimizer(Optimizer):
     _squared_acc_str = "squared"
     _linear_acc_str = "linear"

-    def __init__(self, learning_rate, l1=0.0, l2=0.0, lr_power=-0.5, **kwargs):
+    def __init__(self,
+                 learning_rate,
+                 l1=0.0,
+                 l2=0.0,
+                 lr_power=-0.5,
+                 regularization=None,
+                 name=None):
         super(FtrlOptimizer, self).__init__(
-            learning_rate=learning_rate, **kwargs)
+            learning_rate=learning_rate,
+            regularization=regularization,
+            name=name)
         if learning_rate is None:
             raise ValueError("learning_rate is not set.")

@@ -1155,7 +1218,9 @@ class ModelAverage(Optimizer):
         average_window_rate: The rate of average window.
         min_average_window: The minimum size of average window.
         max_average_window: The maximum size of average window.
+        regularization: A Regularizer, such as
+            fluid.regularizer.L2DecayRegularizer.
+        name: A optional name prefix.

     Examples:
       .. code-block:: python

@@ -1178,8 +1243,10 @@ class ModelAverage(Optimizer):
                  average_window_rate,
                  min_average_window=10000,
                  max_average_window=10000,
-                 **kwargs):
-        super(ModelAverage, self).__init__(0.0, **kwargs)
+                 regularization=None,
+                 name=None):
+        super(ModelAverage, self).__init__(
+            0.0, regularization=regularization, name=name)
         self.average_window = average_window_rate
         self.min_average_window = min_average_window
         self.max_average_window = max_average_window
...
@@ -190,14 +190,11 @@ class L1DecayRegularizer(WeightDecayRegularizer):
     Examples:
         .. code-block:: python

-            program = fluid.framework.Program()
-            block = program.global_block()
-            mul_x = block.create_parameter(
-                dtype="float32",
-                shape=[5, 10],
-                lod_level=0,
-                name="mul.x",
-                regularizer=fluid.regularizer.L1DecayRegularizer(0.5))
+            optimizer = fluid.optimizer.Adagrad(
+                learning_rate=1e-4,
+                regularization=fluid.regularizer.L1DecayRegularizer(
+                    regularization_coeff=0.1))
+            optimizer.minimize(avg_cost)
     """

     def __init__(self, regularization_coeff=0.0):
...
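The L1DecayRegularizer docstring example now shows the supported pattern: pass the regularizer to an optimizer rather than attach it to a parameter at creation time. The L2 variant is used the same way; a short sketch, with avg_cost assumed to be a scalar loss as in the docstring:

    import paddle.fluid as fluid

    # Same pattern with L2 weight decay; fluid.regularizer.L2Decay is the
    # short alias for L2DecayRegularizer used elsewhere in this patch.
    optimizer = fluid.optimizer.Adagrad(
        learning_rate=1e-4,
        regularization=fluid.regularizer.L2Decay(regularization_coeff=1e-4))
    optimizer.minimize(avg_cost)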
@@ -99,7 +99,7 @@ def train(nn_type,
     test_program = fluid.default_main_program().clone(for_test=True)

-    optimizer = fluid.optimizer.Adam(learning_rate=0.001, LARS_weight_decay=0.3)
+    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
     optimizer.minimize(avg_loss)

     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
...
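This removes the last in-tree caller that passed LARS_weight_decay. Because the constructors no longer swallow arbitrary **kwargs, any remaining caller of the removed keyword now fails at construction time instead of having it silently ignored; a quick sketch of the expected failure:

    import paddle.fluid as fluid

    # With **kwargs removed from Optimizer.__init__, an unknown keyword
    # such as LARS_weight_decay raises TypeError immediately.
    try:
        fluid.optimizer.Adam(learning_rate=0.001, LARS_weight_decay=0.3)
    except TypeError as e:
        print("rejected:", e)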
@@ -34,12 +34,13 @@ if(APPLE)
         list(REMOVE_ITEM TEST_OPS test_desc_clone)
         list(REMOVE_ITEM TEST_OPS test_program_code)
     endif(NOT WITH_DISTRIBUTE)
-    message(WARNING "These tests has been disabled in OSX before being fixed: \n test_detection_map_op \n test_dist_se_resnext")
+    message(WARNING "These tests has been disabled in OSX before being fixed: \n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext")
     # this op is not support on mac
     list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op)
     # TODO: add the unitest back when it fixed
     list(REMOVE_ITEM TEST_OPS test_detection_map_op)
     list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
+    list(REMOVE_ITEM TEST_OPS test_fuse_elewise_add_act_pass)
 endif()

 function(py_test_modules TARGET_NAME)
...