diff --git a/paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.cc b/paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.cc index 7e474eefd6bd9ec48d2f0393e64d088d60d5cfc4..583799f978896f0fdcbe386e4c19f7c5cd36e2dc 100644 --- a/paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.cc +++ b/paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.cc @@ -7,20 +7,25 @@ namespace custom_trainer { namespace feed { int EpochAccessor::initialize(YAML::Node config, std::shared_ptr context_ptr) { - _model_root_path = config["model_root_path"].as() + "/"; + _model_root_path = config["model_root_path"].as(); + + _trainer_context = context_ptr.get(); + if (context_ptr->file_system == nullptr) { + VLOG(0) << "file_system is not initialized"; + return -1; + } - _done_file_path = _model_root_path; if (config["donefile"]) { - _done_file_path.append(config["donefile"].as()); + _done_file_path = _trainer_context->file_system->path_join(_model_root_path, config["donefile"].as()); } else { - _done_file_path.append("epoch_donefile.txt"); + _done_file_path = _trainer_context->file_system->path_join(_model_root_path, "epoch_donefile.txt"); } - if (!context_ptr->file_system->exists(_done_file_path)) { + if (!_trainer_context->file_system->exists(_done_file_path)) { VLOG(0) << "missing done file, path:" << _done_file_path; } - std::string done_text = context_ptr->file_system->tail(_done_file_path); + std::string done_text = _trainer_context->file_system->tail(_done_file_path); _done_status = paddle::string::split_string(done_text, std::string("\t")); _current_epoch_id = get_status(EpochStatusFiled::EpochIdField); _last_checkpoint_epoch_id = get_status(EpochStatusFiled::CheckpointIdField); @@ -67,23 +72,25 @@ namespace feed { if (epoch_id == 0) { return false; } - if (save_way == ModelSaveWay::ModelSaveInferenceDelta) { - return true; - } else if (save_way == ModelSaveWay::ModelSaveInferenceBase) { - return is_last_epoch(epoch_id); - } else if (save_way 
== ModelSaveWay::ModelSaveTrainCheckpoint) { - return ((epoch_id / 3600) % 8) == 0; + switch (save_way) { + case ModelSaveWay::ModelSaveInferenceDelta: + return true; + case ModelSaveWay::ModelSaveInferenceBase: + return is_last_epoch(epoch_id); + case ModelSaveWay::ModelSaveTrainCheckpoint: + return ((epoch_id / 3600) % 8) == 0; } return false; } std::string HourlyEpochAccessor::model_save_path(uint64_t epoch_id, ModelSaveWay save_way) { - if (save_way == ModelSaveWay::ModelSaveInferenceDelta) { - return _model_root_path + "/xbox/delta-" + std::to_string(epoch_id); - } else if (save_way == ModelSaveWay::ModelSaveInferenceBase) { - return _model_root_path + "/xbox/base"; - } else if (save_way == ModelSaveWay::ModelSaveTrainCheckpoint) { - return _model_root_path + "/xbox/checkpoint"; + switch (save_way) { + case ModelSaveWay::ModelSaveInferenceDelta: + return _trainer_context->file_system->path_join(_model_root_path, "xbox/delta-" + std::to_string(epoch_id)); + case ModelSaveWay::ModelSaveInferenceBase: + return _trainer_context->file_system->path_join(_model_root_path, "xbox/base"); + case ModelSaveWay::ModelSaveTrainCheckpoint: + return _trainer_context->file_system->path_join(_model_root_path, "xbox/checkpoint"); } return ""; } diff --git a/paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.h b/paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.h index 247a1fe3d1fab24fb5faae7f00def1adb4eda4e5..8646893c64d4ab9f2701b5b4ee6cc4fde25eaefe 100644 --- a/paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.h +++ b/paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.h @@ -52,6 +52,7 @@ public: virtual bool need_save_model(uint64_t epoch_id, ModelSaveWay save_way) = 0; virtual std::string model_save_path(uint64_t epoch_id, ModelSaveWay save_way) = 0; protected: + TrainerContext* _trainer_context = nullptr; std::string _done_file_path; std::string _model_root_path; uint64_t _current_epoch_id = 0; diff --git 
a/paddle/fluid/train/custom_trainer/feed/conf/gflags.conf b/paddle/fluid/train/custom_trainer/feed/conf/gflags.conf index f7ac928362ce0a2ebb6469fb074c99d6de575595..3a3c0b05a8f1849c4fda9d9a9d94dea89b73bf1e 100644 --- a/paddle/fluid/train/custom_trainer/feed/conf/gflags.conf +++ b/paddle/fluid/train/custom_trainer/feed/conf/gflags.conf @@ -1 +1,2 @@ +-log_dir=log -v=10 diff --git a/paddle/fluid/train/custom_trainer/feed/conf/trainer.yaml b/paddle/fluid/train/custom_trainer/feed/conf/trainer.yaml index 2b97e6e3a42650173661136cd3a8d66328ee4b34..71817d4acc0d5d990df38e0a69813f392e76dfbf 100644 --- a/paddle/fluid/train/custom_trainer/feed/conf/trainer.yaml +++ b/paddle/fluid/train/custom_trainer/feed/conf/trainer.yaml @@ -11,14 +11,14 @@ io : ugis : 'default': 'feed_video,D3a0z8' 'xingtian.afs.baidu.com:9902': 'feed_video,D3a0z8' - local : + default : class : LocalFileSystem buffer_size : 1024000 dataset : data_list : train_sample : prefetch_num : 2 - root_path : ./sample + root_path : [./sample] data_spit_interval : 300 data_path_formater : '%Y%m%d/%H%M' data_reader : LineDataReader diff --git a/paddle/fluid/train/custom_trainer/feed/dataset/dataset.cc b/paddle/fluid/train/custom_trainer/feed/dataset/dataset.cc index fc29f30852245916c89b6a724b5f4aa23c2dd6c6..602fab3c6f50832e8c918f595b825b39eb3e6b03 100644 --- a/paddle/fluid/train/custom_trainer/feed/dataset/dataset.cc +++ b/paddle/fluid/train/custom_trainer/feed/dataset/dataset.cc @@ -7,14 +7,14 @@ namespace feed { int Dataset::initialize( const YAML::Node& config, std::shared_ptr context) { if (config["data_list"].Type() != YAML::NodeType::Map) { - VLOG(0) << "miss data_list config in dataset, or type error please check"; + LOG(FATAL) << "miss data_list config in dataset, or type error please check"; return -1; } for (auto& data_config : config["data_list"]) { std::string name = data_config.first.as(); auto data_ptr = std::make_shared(); if (data_ptr->initialize(data_config.second, context) != 0) { - VLOG(0) << "dataset 
initialize failed, name:" << name; + LOG(FATAL) << "dataset initialize failed, name:" << name; return -1; } _data_containers[name] = data_ptr; diff --git a/paddle/fluid/train/custom_trainer/feed/dataset/dataset_container.cc b/paddle/fluid/train/custom_trainer/feed/dataset/dataset_container.cc index 2f43e735ac525defa58139a6e0e0666c16eaa864..14d3062b4c329844811d6a1bd1a85948f2f10c09 100644 --- a/paddle/fluid/train/custom_trainer/feed/dataset/dataset_container.cc +++ b/paddle/fluid/train/custom_trainer/feed/dataset/dataset_container.cc @@ -6,10 +6,10 @@ #include #include #include -#include "paddle/fluid/framework/io/shell.h" #include "paddle/fluid/string/string_helper.h" #include "paddle/fluid/train/custom_trainer/feed/trainer_context.h" #include "paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.h" +#include "paddle/fluid/train/custom_trainer/feed/io/file_system.h" #include "paddle/fluid/train/custom_trainer/feed/dataset/dataset_container.h" namespace paddle { @@ -27,8 +27,7 @@ int DatasetContainer::initialize( _dataset_list[i].reset(new DatasetInfo); } - _data_root_paths = paddle::string::split_string( - config["root_path"].as(), " "); + _data_root_paths = config["root_path"].as>(); _data_split_interval = config["data_spit_interval"].as(); _data_path_formater = config["data_path_formater"].as(); std::string data_reader_class = config["data_reader"].as(); @@ -66,7 +65,7 @@ void DatasetContainer::pre_detect_data(uint64_t epoch_id) { for (int i = 0; i < _data_root_paths.size() && status == 0; ++i) { for (int j = 0; j < data_num && status == 0; ++j) { std::string path_suffix = format_timestamp(data_timestamp + j * _data_split_interval, _data_path_formater); - std::string data_dir = _data_root_paths[i] + "/" + path_suffix; + std::string data_dir = _trainer_context->file_system->path_join(_data_root_paths[i], path_suffix); status = read_data_list(data_dir, data_path_list); } } diff --git a/paddle/fluid/train/custom_trainer/feed/executor/executor.cc 
b/paddle/fluid/train/custom_trainer/feed/executor/executor.cc index 1e8861de623993f65a0c42aea5af408710821ebb..2692cacabda81fe8af2b27c98c2795e6b5e8f73d 100644 --- a/paddle/fluid/train/custom_trainer/feed/executor/executor.cc +++ b/paddle/fluid/train/custom_trainer/feed/executor/executor.cc @@ -17,7 +17,7 @@ namespace { int ReadBinaryFile(const std::string& filename, std::string* contents) { std::ifstream fin(filename, std::ios::in | std::ios::binary); if (!fin) { - VLOG(2) << "Cannot open file " << filename; + LOG(FATAL) << "Cannot open file " << filename; return -1; } fin.seekg(0, std::ios::end); @@ -31,7 +31,7 @@ int ReadBinaryFile(const std::string& filename, std::string* contents) { std::unique_ptr Load( paddle::framework::Executor* /*executor*/, const std::string& model_filename) { - VLOG(3) << "loading model from " << model_filename; + LOG(INFO) << "loading model from " << model_filename; std::string program_desc_str; if (ReadBinaryFile(model_filename, &program_desc_str) != 0) { return nullptr; diff --git a/paddle/fluid/train/custom_trainer/feed/io/auto_file_system.cc b/paddle/fluid/train/custom_trainer/feed/io/auto_file_system.cc index 1d55cabcf2a6217187d40092494160d16cb05825..eab4d50bbf4e7b935c7b3d98d1d75661cc9e9d23 100644 --- a/paddle/fluid/train/custom_trainer/feed/io/auto_file_system.cc +++ b/paddle/fluid/train/custom_trainer/feed/io/auto_file_system.cc @@ -19,17 +19,18 @@ public: for (auto& prefix_fs: config["file_systems"]) { std::unique_ptr fs(CREATE_CLASS(FileSystem, prefix_fs.second["class"].as(""))); if (fs == nullptr) { - VLOG(2) << "fail to create class: " << prefix_fs.second["class"].as(""); + LOG(FATAL) << "fail to create class: " << prefix_fs.second["class"].as(""); return -1; } if (fs->initialize(prefix_fs.second, context) != 0) { - VLOG(2) << "fail to initialize class: " << prefix_fs.second["class"].as(""); - return 0; + LOG(FATAL) << "fail to initialize class: " << prefix_fs.second["class"].as(""); + return -1; } 
_file_system.emplace(prefix_fs.first.as(""), std::move(fs)); } } if (_file_system.find("default") == _file_system.end()) { + LOG(WARNING) << "miss default file_system, use LocalFileSystem as default"; std::unique_ptr fs(CREATE_CLASS(FileSystem, "LocalFileSystem")); if (fs == nullptr || fs->initialize(YAML::Load(""), context) != 0) { return -1; @@ -82,7 +83,6 @@ public: return fs_it->second.get(); } } - VLOG(5) << "path: " << path << ", select default file system"; return _file_system["default"].get(); } diff --git a/paddle/fluid/train/custom_trainer/feed/io/hadoop_file_system.cc b/paddle/fluid/train/custom_trainer/feed/io/hadoop_file_system.cc index d0be197ba2a16ed4eb645a8c7f048bfa57c04209..7e6d42eba9f39ce37ff6d44a5d9491919e6ac171 100644 --- a/paddle/fluid/train/custom_trainer/feed/io/hadoop_file_system.cc +++ b/paddle/fluid/train/custom_trainer/feed/io/hadoop_file_system.cc @@ -25,7 +25,7 @@ public: } } if (_ugi.find("default") == _ugi.end()) { - VLOG(2) << "fail to load default ugi"; + LOG(FATAL) << "fail to load default ugi"; return -1; } return 0; @@ -62,7 +62,7 @@ public: int64_t file_size(const std::string& path) override { _err_no = -1; - VLOG(2) << "not support"; + LOG(FATAL) << "not support"; return 0; } diff --git a/paddle/fluid/train/custom_trainer/feed/main.cc b/paddle/fluid/train/custom_trainer/feed/main.cc index ea3140c62d348fcbaaf34b650b6323c1a790a0d1..1ce087262b73e5b9dcf91c78649e0f1cb121bb2b 100644 --- a/paddle/fluid/train/custom_trainer/feed/main.cc +++ b/paddle/fluid/train/custom_trainer/feed/main.cc @@ -13,6 +13,7 @@ using namespace paddle::custom_trainer::feed; DEFINE_string(feed_trainer_conf_path, "./conf/trainer.yaml", "path of trainer conf"); int main(int argc, char* argv[]) { + google::InitGoogleLogging(argv[0]); //gflags google::ParseCommandLineFlags(&argc, &argv, true); std::string gflag_conf = "./conf/gflags.conf"; diff --git a/paddle/fluid/train/custom_trainer/feed/process/init_env_process.cc 
b/paddle/fluid/train/custom_trainer/feed/process/init_env_process.cc index 5c6a8cf7bdb0410b6261802505d8d317fb912115..0ec8d011c1407a4afa70ce136bea24feacfd9ffa 100644 --- a/paddle/fluid/train/custom_trainer/feed/process/init_env_process.cc +++ b/paddle/fluid/train/custom_trainer/feed/process/init_env_process.cc @@ -46,7 +46,7 @@ int InitEnvProcess::initialize(std::shared_ptr context_ptr) { return -1; } - VLOG(3) << "Env initialize success"; + VLOG(3) << "Env initialize success"; return 0; } diff --git a/paddle/fluid/train/custom_trainer/feed/process/learner_process.cc b/paddle/fluid/train/custom_trainer/feed/process/learner_process.cc index 146e0277ae7656a7bb7de947406eedb64b2a8888..09f9af80347cd00dcb3482a9b0b2b04b9863b306 100644 --- a/paddle/fluid/train/custom_trainer/feed/process/learner_process.cc +++ b/paddle/fluid/train/custom_trainer/feed/process/learner_process.cc @@ -76,7 +76,7 @@ int LearnerProcess::run() { uint64_t epoch_id = epoch_accessor->current_epoch_id(); environment->log(EnvironmentRole::WORKER, EnvironmentLogType::MASTER_LOG, EnvironmentLogLevel::NOTICE, - "Resume traine with epoch_id:%d label:%s", epoch_id, _context_ptr->epoch_accessor->text(epoch_id).c_str()); + "Resume trainer with epoch_id:%d label:%s", epoch_id, _context_ptr->epoch_accessor->text(epoch_id).c_str()); //判断是否先dump出base wait_save_model(epoch_id, ModelSaveWay::ModelSaveInferenceBase); diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/start_feed_trainer.sh b/paddle/fluid/train/custom_trainer/feed/scripts/start_feed_trainer.sh old mode 100644 new mode 100755 index a0fec52dfbb6d1fb7d864cb79018a2ddd9c3bd96..1d4c21148e05e2c9cba0c3a132d53b2871635963 --- a/paddle/fluid/train/custom_trainer/feed/scripts/start_feed_trainer.sh +++ b/paddle/fluid/train/custom_trainer/feed/scripts/start_feed_trainer.sh @@ -1,3 +1,3 @@ -#!bash +#!/bin/bash export LD_LIBRARY_PATH=LD_LIBRARY_PATH:./so -./bin/feed_trainer +./bin/feed_trainer "$@" diff --git 
a/paddle/fluid/train/custom_trainer/feed/temp/feed_trainer.cpp b/paddle/fluid/train/custom_trainer/feed/temp/feed_trainer.cpp index 1087f5672459506cc7b824127cd822c0df7ba566..c3fc0d3a5807f7173d8b6d3eb328cbe005cbfd0f 100644 --- a/paddle/fluid/train/custom_trainer/feed/temp/feed_trainer.cpp +++ b/paddle/fluid/train/custom_trainer/feed/temp/feed_trainer.cpp @@ -40,7 +40,7 @@ void ReadBinaryFile(const std::string& filename, std::string* contents) { std::unique_ptr Load( paddle::framework::Executor* executor, const std::string& model_filename) { - VLOG(3) << "loading model from " << model_filename; + LOG(INFO) << "loading model from " << model_filename; std::string program_desc_str; ReadBinaryFile(model_filename, &program_desc_str); diff --git a/paddle/fluid/train/custom_trainer/feed/unit_test/test_create_programs.cc b/paddle/fluid/train/custom_trainer/feed/unit_test/test_create_programs.cc index 218f93790c462e729ab29a4c404635d6a085861b..cfd001f0374578bb7319d49563722c5732da1bd8 100644 --- a/paddle/fluid/train/custom_trainer/feed/unit_test/test_create_programs.cc +++ b/paddle/fluid/train/custom_trainer/feed/unit_test/test_create_programs.cc @@ -133,9 +133,9 @@ TEST_F(CreateProgramsTest, example_network) { auto output_var = executor->var<::paddle::framework::LoDTensor>(output_name); auto output = output_var.data()[0]; - VLOG(3) << "loss: " << loss << std::endl; - VLOG(3) << "label: " << label_data[0] << std::endl; - VLOG(3) << "output: " << output << std::endl; + LOG(INFO) << "loss: " << loss << std::endl; + LOG(INFO) << "label: " << label_data[0] << std::endl; + LOG(INFO) << "output: " << output << std::endl; ASSERT_NEAR(loss, pow(output - label_data[0], 2), 1e-8); }