diff --git a/paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.cc b/paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.cc index 583799f978896f0fdcbe386e4c19f7c5cd36e2dc..79fe2fc5594554a00633b04a53782ddde7437482 100644 --- a/paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.cc +++ b/paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.cc @@ -14,18 +14,18 @@ namespace feed { VLOG(0) << "file_system is not initialized"; return -1; } - + auto fs = _trainer_context->file_system.get(); if (config["donefile"]) { - _done_file_path = _trainer_context->file_system->path_join(_model_root_path, config["donefile"].as()); + _done_file_path = fs->path_join(_model_root_path, config["donefile"].as()); } else { - _done_file_path = _trainer_context->file_system->path_join(_model_root_path, "epoch_donefile.txt"); + _done_file_path = fs->path_join(_model_root_path, "epoch_donefile.txt"); } - if (!_trainer_context->file_system->exists(_done_file_path)) { + if (!fs->exists(_done_file_path)) { VLOG(0) << "missing done file, path:" << _done_file_path; } - std::string done_text = _trainer_context->file_system->tail(_done_file_path); + std::string done_text = fs->tail(_done_file_path); _done_status = paddle::string::split_string(done_text, std::string("\t")); _current_epoch_id = get_status(EpochStatusFiled::EpochIdField); _last_checkpoint_epoch_id = get_status(EpochStatusFiled::CheckpointIdField); diff --git a/paddle/fluid/train/custom_trainer/feed/dataset/data_reader.cc b/paddle/fluid/train/custom_trainer/feed/dataset/data_reader.cc index 29c2d18485c2f63dd638121b698264b95923cd3a..ba0a74f1b4c654870e99b9f6bdc43e86964206f3 100755 --- a/paddle/fluid/train/custom_trainer/feed/dataset/data_reader.cc +++ b/paddle/fluid/train/custom_trainer/feed/dataset/data_reader.cc @@ -534,7 +534,7 @@ public: size_t buffer_size = 0; ssize_t line_len = 0; while ((line_len = getline(&buffer, &buffer_size, fin.get())) != -1) { - // 去掉行位回车 + // 去掉行尾回车 if (line_len > 0 && buffer[line_len - 1] == '\n') { buffer[--line_len] = '\0'; } @@ -547,7 +547,8 @@ public: VLOG(5) << "parse data: " << data_item.id << " " << data_item.data << ", filename: " << filepath << ", thread_num: " << thread_num << ", max_threads: " << max_threads; if (writer == nullptr) { if (!data_channel->Put(std::move(data_item))) { - VLOG(2) << "fail to put data, thread_num: " << thread_num; + LOG(WARNING) << "fail to put data, thread_num: " << thread_num; + is_failed = true; } } else { (*writer) << std::move(data_item); diff --git a/paddle/fluid/train/custom_trainer/feed/io/auto_file_system.cc b/paddle/fluid/train/custom_trainer/feed/io/auto_file_system.cc index eab4d50bbf4e7b935c7b3d98d1d75661cc9e9d23..db41b6d6390fe42e67cf73855430710fece3fec3 100644 --- a/paddle/fluid/train/custom_trainer/feed/io/auto_file_system.cc +++ b/paddle/fluid/train/custom_trainer/feed/io/auto_file_system.cc @@ -77,7 +77,7 @@ public: FileSystem* get_file_system(const std::string& path) { auto pos = path.find_first_of(":"); if (pos != std::string::npos) { - auto substr = path.substr(0, pos + 1); + auto substr = path.substr(0, pos); // example: afs:/xxx -> afs auto fs_it = _file_system.find(substr); if (fs_it != _file_system.end()) { return fs_it->second.get(); diff --git a/paddle/fluid/train/custom_trainer/feed/process/learner_process.cc b/paddle/fluid/train/custom_trainer/feed/process/learner_process.cc index 09f9af80347cd00dcb3482a9b0b2b04b9863b306..cf9ac43b96b50ea4dfa0859d434f6a46992dadf5 100644 --- a/paddle/fluid/train/custom_trainer/feed/process/learner_process.cc +++ b/paddle/fluid/train/custom_trainer/feed/process/learner_process.cc @@ -76,7 +76,7 @@ int LearnerProcess::run() { uint64_t epoch_id = epoch_accessor->current_epoch_id(); environment->log(EnvironmentRole::WORKER, EnvironmentLogType::MASTER_LOG, EnvironmentLogLevel::NOTICE, - "Resume trainer with epoch_id:%d label:%s", epoch_id, _context_ptr->epoch_accessor->text(epoch_id).c_str()); + "Resume training with epoch_id:%d label:%s", epoch_id, _context_ptr->epoch_accessor->text(epoch_id).c_str()); //判断是否先dump出base wait_save_model(epoch_id, ModelSaveWay::ModelSaveInferenceBase); @@ -108,7 +108,7 @@ int LearnerProcess::run() { for (int thread_id = 0; thread_id < _train_thread_num; ++thread_id) { train_threads[i].reset(new std::thread([this](int exe_idx, int thread_idx) { auto* executor = _threads_executor[thread_idx][exe_idx].get(); - run_executor(executor); + run_executor(executor); }, i, thread_id)); } for (int i = 0; i < _train_thread_num; ++i) { diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/create_programs.py b/paddle/fluid/train/custom_trainer/feed/scripts/create_programs.py index e0021741eae73b1f14aa4894e78c0b3cef219633..c9b9ca94bc9d20283a2f5accf7b1470e2f73da30 100644 --- a/paddle/fluid/train/custom_trainer/feed/scripts/create_programs.py +++ b/paddle/fluid/train/custom_trainer/feed/scripts/create_programs.py @@ -119,11 +119,12 @@ class ModelBuilder: 'inputs': [{"name": var.name, "shape": var.shape} for var in inputs], 'outputs': [{"name": var.name, "shape": var.shape} for var in outputs], 'labels': [{"name": var.name, "shape": var.shape} for var in labels], + 'vars': [{"name": var.name, "shape": var.shape} for var in main_program.list_vars() if fluid.io.is_parameter(var)], 'loss': loss.name, } with open(model_desc_path, 'w') as f: - yaml.safe_dump(model_desc, f, encoding='utf-8', allow_unicode=True) + yaml.safe_dump(model_desc, f, encoding='utf-8', allow_unicode=True, default_flow_style=None) def main(argv): diff --git a/paddle/fluid/train/custom_trainer/feed/temp/feed_trainer.cpp b/paddle/fluid/train/custom_trainer/feed/temp/feed_trainer.cpp index c3fc0d3a5807f7173d8b6d3eb328cbe005cbfd0f..1087f5672459506cc7b824127cd822c0df7ba566 100644 --- a/paddle/fluid/train/custom_trainer/feed/temp/feed_trainer.cpp +++ b/paddle/fluid/train/custom_trainer/feed/temp/feed_trainer.cpp @@ -40,7 +40,7 @@ void ReadBinaryFile(const std::string& filename, std::string* contents) { std::unique_ptr Load( paddle::framework::Executor* executor, const std::string& model_filename) { - LOG(DEBUG) << "loading model from " << model_filename; + VLOG(3) << "loading model from " << model_filename; std::string program_desc_str; ReadBinaryFile(model_filename, &program_desc_str); diff --git a/paddle/fluid/train/custom_trainer/feed/unit_test/test_datareader.cc b/paddle/fluid/train/custom_trainer/feed/unit_test/test_datareader.cc index 8f2c0f5ef4c3e1b4bd2ca9af995dab1be165b7a4..8ad66b5df514160516755dbbb363f2e4f98d3457 100644 --- a/paddle/fluid/train/custom_trainer/feed/unit_test/test_datareader.cc +++ b/paddle/fluid/train/custom_trainer/feed/unit_test/test_datareader.cc @@ -193,14 +193,14 @@ TEST_F(DataReaderTest, LineDataReader_FileSystem) { "file_system:\n" " class: AutoFileSystem\n" " file_systems:\n" - " 'afs:': &HDFS \n" + " 'afs': &HDFS \n" " class: HadoopFileSystem\n" " hdfs_command: 'hadoop fs'\n" " ugis:\n" " 'default': 'feed_video,D3a0z8'\n" " 'xingtian.afs.baidu.com:9902': 'feed_video,D3a0z8'\n" " \n" - " 'hdfs:': *HDFS\n"); + " 'hdfs': *HDFS\n"); ASSERT_EQ(0, data_reader->initialize(config, context_ptr)); { auto data_file_list = data_reader->data_file_list(test_data_dir);