Commit 825e947c
Authored Aug 20, 2019 by rensilin
dump params
Change-Id: I073a955ac9aa13e4afdfb869121b83f95062cdac
Parent: 79133eae
Showing 7 changed files with 16 additions and 14 deletions (+16 / -14)
paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.cc     +5 -5
paddle/fluid/train/custom_trainer/feed/dataset/data_reader.cc         +3 -2
paddle/fluid/train/custom_trainer/feed/io/auto_file_system.cc         +1 -1
paddle/fluid/train/custom_trainer/feed/process/learner_process.cc     +2 -2
paddle/fluid/train/custom_trainer/feed/scripts/create_programs.py     +2 -1
paddle/fluid/train/custom_trainer/feed/temp/feed_trainer.cpp          +1 -1
paddle/fluid/train/custom_trainer/feed/unit_test/test_datareader.cc   +2 -2
paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.cc

@@ -14,18 +14,18 @@ namespace feed {
             VLOG(0) << "file_system is not initialized";
             return -1;
         }
+        auto fs = _trainer_context->file_system.get();
         if (config["donefile"]) {
-            _done_file_path = _trainer_context->file_system->path_join(_model_root_path, config["donefile"].as<std::string>());
+            _done_file_path = fs->path_join(_model_root_path, config["donefile"].as<std::string>());
         } else {
-            _done_file_path = _trainer_context->file_system->path_join(_model_root_path, "epoch_donefile.txt");
+            _done_file_path = fs->path_join(_model_root_path, "epoch_donefile.txt");
         }
-        if (!_trainer_context->file_system->exists(_done_file_path)) {
+        if (!fs->exists(_done_file_path)) {
             VLOG(0) << "missing done file, path:" << _done_file_path;
         }
-        std::string done_text = _trainer_context->file_system->tail(_done_file_path);
+        std::string done_text = fs->tail(_done_file_path);
         _done_status = paddle::string::split_string(done_text, std::string("\t"));
         _current_epoch_id = get_status<uint64_t>(EpochStatusFiled::EpochIdField);
         _last_checkpoint_epoch_id = get_status<uint64_t>(EpochStatusFiled::CheckpointIdField);
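The functional core of this hunk is the new local fs, which caches the raw pointer owned by the file_system smart-pointer member so the repeated _trainer_context->file_system-> chains shorten to fs->. A minimal standalone sketch of that pattern, using hypothetical FileSystem and TrainerContext stand-ins rather than the repository's real types:

#include <iostream>
#include <memory>
#include <string>

// Hypothetical stand-ins for the repository's FileSystem / TrainerContext.
struct FileSystem {
    std::string path_join(const std::string& a, const std::string& b) const {
        return a + "/" + b;
    }
};

struct TrainerContext {
    std::unique_ptr<FileSystem> file_system = std::make_unique<FileSystem>();
};

int main() {
    TrainerContext context;
    // .get() hands back a non-owning raw pointer; the unique_ptr keeps
    // ownership, so it must outlive every use of fs.
    auto* fs = context.file_system.get();
    std::cout << fs->path_join("/model_root", "epoch_donefile.txt") << "\n";
    return 0;
}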
paddle/fluid/train/custom_trainer/feed/dataset/data_reader.cc

@@ -534,7 +534,7 @@ public:
         size_t buffer_size = 0;
         ssize_t line_len = 0;
         while ((line_len = getline(&buffer, &buffer_size, fin.get())) != -1) {
-            // 去掉行位回车
+            // 去掉行尾回车
            if (line_len > 0 && buffer[line_len - 1] == '\n') {
                buffer[--line_len] = '\0';
            }
@@ -547,7 +547,8 @@ public:
            VLOG(5) << "parse data: " << data_item.id << " " << data_item.data << ", filename: " << filepath << ", thread_num: " << thread_num << ", max_threads: " << max_threads;
            if (writer == nullptr) {
                if (!data_channel->Put(std::move(data_item))) {
-                    VLOG(2) << "fail to put data, thread_num: " << thread_num;
+                    LOG(WARNING) << "fail to put data, thread_num: " << thread_num;
+                    is_failed = true;
                }
            } else {
                (*writer) << std::move(data_item);
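The second hunk upgrades the channel-put failure message from VLOG(2) to LOG(WARNING) and records the failure in is_failed. With the glog-style macros Paddle uses, VLOG output stays suppressed unless the verbosity flag is raised, while WARNING is emitted unconditionally; the feed_trainer.cpp hunk further down makes the reverse trade for a purely informational message. A small self-contained sketch of the difference, assuming glog is available:

#include <glog/logging.h>

int main(int argc, char* argv[]) {
    google::InitGoogleLogging(argv[0]);
    FLAGS_logtostderr = true;

    // Verbose logging: only emitted when the verbosity level is >= 2,
    // e.g. run with --v=2 or set FLAGS_v = 2. Silent by default.
    VLOG(2) << "fail to put data (easy to miss)";

    // Severity logging: always emitted at WARNING level.
    LOG(WARNING) << "fail to put data (always visible)";
    return 0;
}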
paddle/fluid/train/custom_trainer/feed/io/auto_file_system.cc

@@ -77,7 +77,7 @@ public:
    FileSystem* get_file_system(const std::string& path) {
        auto pos = path.find_first_of(":");
        if (pos != std::string::npos) {
-            auto substr = path.substr(0, pos + 1);
+            auto substr = path.substr(0, pos); // example: afs:/xxx -> afs
            auto fs_it = _file_system.find(substr);
            if (fs_it != _file_system.end()) {
                return fs_it->second.get();
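The one-character fix here changes the lookup key from substr(0, pos + 1) to substr(0, pos), so a path such as afs:/xxx now resolves to the registered scheme "afs" rather than "afs:", as the added comment notes; the test_datareader.cc hunk at the end of the commit drops the trailing ':' from the 'afs' and 'hdfs' keys in its YAML config to match. A minimal standalone sketch of the key extraction, with a hypothetical scheme_of helper standing in for get_file_system:

#include <iostream>
#include <string>

// Hypothetical helper mirroring the prefix extraction in get_file_system().
std::string scheme_of(const std::string& path) {
    auto pos = path.find_first_of(":");
    if (pos != std::string::npos) {
        return path.substr(0, pos);  // example: afs:/xxx -> afs
    }
    return "";  // no scheme prefix: would fall back to a default file system
}

int main() {
    std::cout << scheme_of("afs:/user/feed/data") << "\n";  // prints "afs"
    std::cout << scheme_of("hdfs:/some/path") << "\n";      // prints "hdfs"
    std::cout << scheme_of("/local/path") << "\n";          // prints an empty line
    return 0;
}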
paddle/fluid/train/custom_trainer/feed/process/learner_process.cc

@@ -76,7 +76,7 @@ int LearnerProcess::run() {
    uint64_t epoch_id = epoch_accessor->current_epoch_id();
    environment->log(EnvironmentRole::WORKER, EnvironmentLogType::MASTER_LOG, EnvironmentLogLevel::NOTICE,
-        "Resume trainer with epoch_id:%d label:%s", epoch_id, _context_ptr->epoch_accessor->text(epoch_id).c_str());
+        "Resume training with epoch_id:%d label:%s", epoch_id, _context_ptr->epoch_accessor->text(epoch_id).c_str());
    //判断是否先dump出base
    wait_save_model(epoch_id, ModelSaveWay::ModelSaveInferenceBase);
@@ -108,7 +108,7 @@ int LearnerProcess::run() {
    for (int thread_id = 0; thread_id < _train_thread_num; ++thread_id) {
        train_threads[i].reset(new std::thread([this](int exe_idx, int thread_idx) {
            auto* executor = _threads_executor[thread_idx][exe_idx].get();
            run_executor(executor);
        }, i, thread_id));
    }
    for (int i = 0; i < _train_thread_num; ++i) {
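The second hunk's loop starts one std::thread per training thread, handing the executor index and thread id to the lambda as thread arguments rather than capturing the loop variables, so each worker sees stable copies. A minimal standalone sketch of that launch pattern, with a hypothetical worker body and an assumed thread count:

#include <cstdio>
#include <memory>
#include <thread>
#include <vector>

int main() {
    const int train_thread_num = 4;  // assumed value for illustration
    std::vector<std::unique_ptr<std::thread>> train_threads(train_thread_num);

    for (int thread_id = 0; thread_id < train_thread_num; ++thread_id) {
        // Indices are passed as std::thread arguments, not captured by
        // reference, so later iterations cannot change what a worker sees.
        train_threads[thread_id].reset(new std::thread([](int exe_idx, int thread_idx) {
            std::printf("executor %d running on thread %d\n", exe_idx, thread_idx);
        }, /*exe_idx=*/0, thread_id));
    }

    for (auto& t : train_threads) {
        t->join();
    }
    return 0;
}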
paddle/fluid/train/custom_trainer/feed/scripts/create_programs.py

@@ -119,11 +119,12 @@ class ModelBuilder:
            'inputs': [{"name": var.name, "shape": var.shape} for var in inputs],
            'outputs': [{"name": var.name, "shape": var.shape} for var in outputs],
            'labels': [{"name": var.name, "shape": var.shape} for var in labels],
+            'vars': [{"name": var.name, "shape": var.shape} for var in main_program.list_vars() if fluid.io.is_parameter(var)],
            'loss': loss.name,
        }
        with open(model_desc_path, 'w') as f:
-            yaml.safe_dump(model_desc, f, encoding='utf-8', allow_unicode=True)
+            yaml.safe_dump(model_desc, f, encoding='utf-8', allow_unicode=True, default_flow_style=None)

 def main(argv):
paddle/fluid/train/custom_trainer/feed/temp/feed_trainer.cpp

@@ -40,7 +40,7 @@ void ReadBinaryFile(const std::string& filename, std::string* contents) {
 std::unique_ptr<paddle::framework::ProgramDesc> Load(
    paddle::framework::Executor* executor, const std::string& model_filename) {
-    LOG(DEBUG) << "loading model from " << model_filename;
+    VLOG(3) << "loading model from " << model_filename;
    std::string program_desc_str;
    ReadBinaryFile(model_filename, &program_desc_str);
paddle/fluid/train/custom_trainer/feed/unit_test/test_datareader.cc

@@ -193,14 +193,14 @@ TEST_F(DataReaderTest, LineDataReader_FileSystem) {
        "file_system:\n"
        "  class: AutoFileSystem\n"
        "  file_systems:\n"
-        "    'afs:': &HDFS\n"
+        "    'afs': &HDFS\n"
        "      class: HadoopFileSystem\n"
        "      hdfs_command: 'hadoop fs'\n"
        "      ugis:\n"
        "        'default': 'feed_video,D3a0z8'\n"
        "        'xingtian.afs.baidu.com:9902': 'feed_video,D3a0z8'\n"
        "\n"
-        "    'hdfs:': *HDFS\n");
+        "    'hdfs': *HDFS\n");
    ASSERT_EQ(0, data_reader->initialize(config, context_ptr));
    {
        auto data_file_list = data_reader->data_file_list(test_data_dir);