From 825e947c89431b93b4b852950d28f0f6f7ac320a Mon Sep 17 00:00:00 2001 From: rensilin Date: Tue, 20 Aug 2019 14:13:50 +0800 Subject: [PATCH] dump params Change-Id: I073a955ac9aa13e4afdfb869121b83f95062cdac --- .../custom_trainer/feed/accessor/epoch_accessor.cc | 10 +++++----- .../train/custom_trainer/feed/dataset/data_reader.cc | 5 +++-- .../train/custom_trainer/feed/io/auto_file_system.cc | 2 +- .../custom_trainer/feed/process/learner_process.cc | 4 ++-- .../custom_trainer/feed/scripts/create_programs.py | 3 ++- .../train/custom_trainer/feed/temp/feed_trainer.cpp | 2 +- .../custom_trainer/feed/unit_test/test_datareader.cc | 4 ++-- 7 files changed, 16 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.cc b/paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.cc index 583799f9..79fe2fc5 100644 --- a/paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.cc +++ b/paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.cc @@ -14,18 +14,18 @@ namespace feed { VLOG(0) << "file_system is not initialized"; return -1; } - + auto fs = _trainer_context->file_system.get(); if (config["donefile"]) { - _done_file_path = _trainer_context->file_system->path_join(_model_root_path, config["donefile"].as()); + _done_file_path = fs->path_join(_model_root_path, config["donefile"].as()); } else { - _done_file_path = _trainer_context->file_system->path_join(_model_root_path, "epoch_donefile.txt"); + _done_file_path = fs->path_join(_model_root_path, "epoch_donefile.txt"); } - if (!_trainer_context->file_system->exists(_done_file_path)) { + if (!fs->exists(_done_file_path)) { VLOG(0) << "missing done file, path:" << _done_file_path; } - std::string done_text = _trainer_context->file_system->tail(_done_file_path); + std::string done_text = fs->tail(_done_file_path); _done_status = paddle::string::split_string(done_text, std::string("\t")); _current_epoch_id = get_status(EpochStatusFiled::EpochIdField); _last_checkpoint_epoch_id = get_status(EpochStatusFiled::CheckpointIdField); diff --git a/paddle/fluid/train/custom_trainer/feed/dataset/data_reader.cc b/paddle/fluid/train/custom_trainer/feed/dataset/data_reader.cc index 29c2d184..ba0a74f1 100755 --- a/paddle/fluid/train/custom_trainer/feed/dataset/data_reader.cc +++ b/paddle/fluid/train/custom_trainer/feed/dataset/data_reader.cc @@ -534,7 +534,7 @@ public: size_t buffer_size = 0; ssize_t line_len = 0; while ((line_len = getline(&buffer, &buffer_size, fin.get())) != -1) { - // 去掉行位回车 + // 去掉行尾回车 if (line_len > 0 && buffer[line_len - 1] == '\n') { buffer[--line_len] = '\0'; } @@ -547,7 +547,8 @@ public: VLOG(5) << "parse data: " << data_item.id << " " << data_item.data << ", filename: " << filepath << ", thread_num: " << thread_num << ", max_threads: " << max_threads; if (writer == nullptr) { if (!data_channel->Put(std::move(data_item))) { - VLOG(2) << "fail to put data, thread_num: " << thread_num; + LOG(WARNING) << "fail to put data, thread_num: " << thread_num; + is_failed = true; } } else { (*writer) << std::move(data_item); diff --git a/paddle/fluid/train/custom_trainer/feed/io/auto_file_system.cc b/paddle/fluid/train/custom_trainer/feed/io/auto_file_system.cc index eab4d50b..db41b6d6 100644 --- a/paddle/fluid/train/custom_trainer/feed/io/auto_file_system.cc +++ b/paddle/fluid/train/custom_trainer/feed/io/auto_file_system.cc @@ -77,7 +77,7 @@ public: FileSystem* get_file_system(const std::string& path) { auto pos = path.find_first_of(":"); if (pos != std::string::npos) { - auto substr = path.substr(0, pos + 1); + auto substr = path.substr(0, pos); // example: afs:/xxx -> afs auto fs_it = _file_system.find(substr); if (fs_it != _file_system.end()) { return fs_it->second.get(); diff --git a/paddle/fluid/train/custom_trainer/feed/process/learner_process.cc b/paddle/fluid/train/custom_trainer/feed/process/learner_process.cc index 09f9af80..cf9ac43b 100644 --- a/paddle/fluid/train/custom_trainer/feed/process/learner_process.cc +++ b/paddle/fluid/train/custom_trainer/feed/process/learner_process.cc @@ -76,7 +76,7 @@ int LearnerProcess::run() { uint64_t epoch_id = epoch_accessor->current_epoch_id(); environment->log(EnvironmentRole::WORKER, EnvironmentLogType::MASTER_LOG, EnvironmentLogLevel::NOTICE, - "Resume trainer with epoch_id:%d label:%s", epoch_id, _context_ptr->epoch_accessor->text(epoch_id).c_str()); + "Resume training with epoch_id:%d label:%s", epoch_id, _context_ptr->epoch_accessor->text(epoch_id).c_str()); //判断是否先dump出base wait_save_model(epoch_id, ModelSaveWay::ModelSaveInferenceBase); @@ -108,7 +108,7 @@ int LearnerProcess::run() { for (int thread_id = 0; thread_id < _train_thread_num; ++thread_id) { train_threads[i].reset(new std::thread([this](int exe_idx, int thread_idx) { auto* executor = _threads_executor[thread_idx][exe_idx].get(); - run_executor(executor); + run_executor(executor); }, i, thread_id)); } for (int i = 0; i < _train_thread_num; ++i) { diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/create_programs.py b/paddle/fluid/train/custom_trainer/feed/scripts/create_programs.py index e0021741..c9b9ca94 100644 --- a/paddle/fluid/train/custom_trainer/feed/scripts/create_programs.py +++ b/paddle/fluid/train/custom_trainer/feed/scripts/create_programs.py @@ -119,11 +119,12 @@ class ModelBuilder: 'inputs': [{"name": var.name, "shape": var.shape} for var in inputs], 'outputs': [{"name": var.name, "shape": var.shape} for var in outputs], 'labels': [{"name": var.name, "shape": var.shape} for var in labels], + 'vars': [{"name": var.name, "shape": var.shape} for var in main_program.list_vars() if fluid.io.is_parameter(var)], 'loss': loss.name, } with open(model_desc_path, 'w') as f: - yaml.safe_dump(model_desc, f, encoding='utf-8', allow_unicode=True) + yaml.safe_dump(model_desc, f, encoding='utf-8', allow_unicode=True, default_flow_style=None) def main(argv): diff --git a/paddle/fluid/train/custom_trainer/feed/temp/feed_trainer.cpp b/paddle/fluid/train/custom_trainer/feed/temp/feed_trainer.cpp index c3fc0d3a..1087f567 100644 --- a/paddle/fluid/train/custom_trainer/feed/temp/feed_trainer.cpp +++ b/paddle/fluid/train/custom_trainer/feed/temp/feed_trainer.cpp @@ -40,7 +40,7 @@ void ReadBinaryFile(const std::string& filename, std::string* contents) { std::unique_ptr Load( paddle::framework::Executor* executor, const std::string& model_filename) { - LOG(DEBUG) << "loading model from " << model_filename; + VLOG(3) << "loading model from " << model_filename; std::string program_desc_str; ReadBinaryFile(model_filename, &program_desc_str); diff --git a/paddle/fluid/train/custom_trainer/feed/unit_test/test_datareader.cc b/paddle/fluid/train/custom_trainer/feed/unit_test/test_datareader.cc index 8f2c0f5e..8ad66b5d 100644 --- a/paddle/fluid/train/custom_trainer/feed/unit_test/test_datareader.cc +++ b/paddle/fluid/train/custom_trainer/feed/unit_test/test_datareader.cc @@ -193,14 +193,14 @@ TEST_F(DataReaderTest, LineDataReader_FileSystem) { "file_system:\n" " class: AutoFileSystem\n" " file_systems:\n" - " 'afs:': &HDFS \n" + " 'afs': &HDFS \n" " class: HadoopFileSystem\n" " hdfs_command: 'hadoop fs'\n" " ugis:\n" " 'default': 'feed_video,D3a0z8'\n" " 'xingtian.afs.baidu.com:9902': 'feed_video,D3a0z8'\n" " \n" - " 'hdfs:': *HDFS\n"); + " 'hdfs': *HDFS\n"); ASSERT_EQ(0, data_reader->initialize(config, context_ptr)); { auto data_file_list = data_reader->data_file_list(test_data_dir); -- GitLab