main.cc 3.2 KB
Newer Older
X
xiexionghang 已提交
1 2 3 4
#include <time.h>
#include <fstream>
#include <yaml-cpp/yaml.h>
#include "paddle/fluid/train/custom_trainer/feed/trainer_context.h"
X
xiexionghang 已提交
5
#include "paddle/fluid/platform/init.h"
X
xiexionghang 已提交
6 7
#include "paddle/fluid/train/custom_trainer/feed/process/process.h"
#include "paddle/fluid/train/custom_trainer/feed/process/init_env_process.h"
R
rensilin 已提交
8
#include "paddle/fluid/framework/op_registry.h"
R
rensilin 已提交
9
#include "paddle/fluid/pybind/pybind.h"
X
xiexionghang 已提交
10 11 12 13 14 15

// Bring the custom-trainer feed namespace into scope for this translation unit.
using namespace paddle::custom_trainer::feed;

// gflags string flag holding the path of the trainer YAML configuration;
// main() passes it to YAML::LoadFile. Defaults to ./conf/trainer.yaml.
DEFINE_string(feed_trainer_conf_path, "./conf/trainer.yaml", "path of trainer conf");

int main(int argc, char* argv[]) {
R
rensilin 已提交
16
    google::InitGoogleLogging(argv[0]);
X
xiexionghang 已提交
17 18 19 20 21 22 23
    //gflags
    google::ParseCommandLineFlags(&argc, &argv, true);
    std::string gflag_conf = "./conf/gflags.conf";
    google::SetCommandLineOption("flagfile", gflag_conf.c_str()); 

    //load trainer config
    auto trainer_context_ptr = std::make_shared<TrainerContext>();
X
xiexionghang 已提交
24
    trainer_context_ptr->cache_dict.reset(new SignCacheDict);
X
xiexionghang 已提交
25
    trainer_context_ptr->trainer_config = YAML::LoadFile(FLAGS_feed_trainer_conf_path);    
X
xiexionghang 已提交
26 27 28 29 30 31 32 33 34 35

    //environment
    auto& config = trainer_context_ptr->trainer_config;
    std::string env_class = config["environment"]["environment_class"].as<std::string>();
    trainer_context_ptr->environment.reset(CREATE_INSTANCE(RuntimeEnvironment, env_class));
    if (trainer_context_ptr->environment->initialize(config["environment"]) != 0) {
        return -1;
    }
    auto* environment = trainer_context_ptr->environment.get();
    environment->wireup();
36
    VLOG(2) << "node_num: " << environment->node_num(EnvironmentRole::ALL);
X
xiexionghang 已提交
37 38 39 40 41
    if (environment->node_num(EnvironmentRole::ALL) == 1) {
        environment->add_role(EnvironmentRole::WORKER);
        environment->add_role(EnvironmentRole::PSERVER);
    } else if (environment->rank_id(EnvironmentRole::ALL) % 2 == 0) {
        environment->add_role(EnvironmentRole::WORKER);
X
xiexionghang 已提交
42
    } else {
X
xiexionghang 已提交
43
        environment->add_role(EnvironmentRole::PSERVER);
X
xiexionghang 已提交
44 45 46
    } 
    trainer_context_ptr->pslib.reset(new PSlib());
    std::string ps_config = config["environment"]["ps"].as<std::string>();
47
    trainer_context_ptr->environment->barrier(EnvironmentRole::ALL); 
X
xiexionghang 已提交
48
    trainer_context_ptr->pslib->initialize(ps_config, environment);
X
xiexionghang 已提交
49 50
    //VLOG(3) << "Node Start With Role:" << role;    
     
X
xiexionghang 已提交
51 52 53 54 55 56
    
    if (environment->is_role(EnvironmentRole::WORKER)) {
        std::vector<std::string> process_name_list = {
            "InitEnvProcess",
            "LearnerProcess"
        };
X
xiexionghang 已提交
57 58 59 60 61 62 63 64 65 66 67 68 69 70
        for (const auto& process_name : process_name_list) {
            Process* process = CREATE_INSTANCE(Process, process_name);
            if (process == NULL) {
                VLOG(1) << "Process:" << process_name << " does not exist"; 
                return -1;
            }
            if (process->initialize(trainer_context_ptr) != 0) {
                VLOG(1) << "Process:" << process_name << " initialize failed"; 
                return -1;
            }
            trainer_context_ptr->process_list.push_back(std::shared_ptr<Process>(process));
        } 
        for (auto& process : trainer_context_ptr->process_list) {
            process->run();
X
xiexionghang 已提交
71
        }
X
xiexionghang 已提交
72 73 74 75 76 77 78
     
    }
    
    //TODO exit control
    bool running = true;
    while (running) {
        sleep(10000);
X
xiexionghang 已提交
79 80 81
    }
    return 0;
}