diff --git a/oneflow/core/actor/actor.cpp b/oneflow/core/actor/actor.cpp index 9e08a8e05ffb671b257000b437e9bb11aa7a12b0..4564c264cd3d56c2aed8f9258de3f5daf42a22d8 100644 --- a/oneflow/core/actor/actor.cpp +++ b/oneflow/core/actor/actor.cpp @@ -3,7 +3,7 @@ namespace oneflow { void Actor::LogMsgEvent(ActorMsg& msg) { - if (!Global::Get()->is_experiment_phase()) { + if (Global::Get()->is_experiment_phase()) { if (msg.msg_type() == ActorMsgType::kRegstMsg) { // get nanoseconds, e.g. 1505840189520477525 = 1505840189.520477525 sec int64_t start = diff --git a/oneflow/core/job/act_event_analysis.cpp b/oneflow/core/job/act_event_analysis.cpp index dcc10a732c91323ca6b6e24f9f157ae41e5edd3d..cb3939a5cd95fc8ecf2801bcb37c8794c7b925a5 100644 --- a/oneflow/core/job/act_event_analysis.cpp +++ b/oneflow/core/job/act_event_analysis.cpp @@ -6,44 +6,29 @@ #include "oneflow/core/job/event_report_util.h" namespace oneflow { -std::string GetLastUpRegstTime( - const std::vector& regsts, - const HashMap& regst_events, - const int64_t actor_id, const int64_t act_id) { - // key: regst_desc_id _ actor_id _ act_id +std::string GetLastRegstTime(const std::vector& regsts, + const std::list& msg_events, + const ActEvent& act_event, + const double time_diff) { int64_t id = -1; double time = 0.0; for (int64_t regst_desc_id : regsts) { - std::string key = std::to_string(regst_desc_id) + "_" - + std::to_string(actor_id) + "_" + std::to_string(act_id); - auto re = regst_events.find(key); - if (re == regst_events.end()) return ",,"; - if (re->second.from_producer_time > time) { - time = re->second.from_producer_time; - id = regst_desc_id; + for (auto msg : msg_events) { + if (msg.dst_actor_id() == act_event.actor_id() + && msg.info().find("from") != std::string::npos + && msg.regst_desc_id() == regst_desc_id + && msg.time() < act_event.start_time()) { + if (msg.time() > time) { + time = msg.time(); + id = regst_desc_id; + } + } } } - return std::to_string(id) + "," + Time2String(time) + ","; -} -std::string GetLastDownRegstTime( - const std::vector& regsts, - const HashMap& regst_events, - const int64_t actor_id, const int64_t act_id) { - // key: regst_desc_id _ actor_id _ act_id - int64_t id = -1; - double time = 0.0; - for (int64_t regst_desc_id : regsts) { - std::string key = std::to_string(regst_desc_id) + "_" - + std::to_string(actor_id) + "_" + std::to_string(act_id); - auto re = regst_events.find(key); - if (re == regst_events.end()) return ",,"; - if (re->second.from_consumer_time > time) { - time = re->second.from_consumer_time; - id = regst_desc_id; - } - } - return std::to_string(id) + "," + Time2String(time) + ","; + if (id == -1) return ",,"; + return std::to_string(id) + ",'" + Time2String(time - time_diff) + ","; } + void ActEventAnalysis(const std::string& plan_filepath, const std::string& act_event_filepath, const std::string& time_diff_filepath, @@ -56,10 +41,10 @@ void ActEventAnalysis(const std::string& plan_filepath, actor_id2consumed_regsts); std::vector machine_time_diffs; GetMachineTimeDiff(time_diff_filepath, machine_time_diffs); - HashMap regst_events; - Msg2RegstEvents(msg_event_filepath, regst_events, time_diff_filepath); Plan plan; ParseProtoFromTextFile(plan_filepath, &plan); + auto msg_events = of_make_unique>(); + LoadEvents(msg_event_filepath, msg_events.get()); std::ofstream out_stream(report_filepath); out_stream << "actor,type,machine,thrd,stream,act_id,push_time,start_time," "stop_time," @@ -84,10 +69,10 @@ void ActEventAnalysis(const std::string& plan_filepath, out_stream << Time2String(event.stop_time() - event.start_time()) + ","; out_stream << Time2HumanReadable(event.stop_time() - event.start_time()) + ","; - out_stream << GetLastUpRegstTime(actor_id2produced_regsts[actor_id], - regst_events, actor_id, event.act_id()); - out_stream << GetLastDownRegstTime(actor_id2consumed_regsts[actor_id], - regst_events, actor_id, event.act_id()); + out_stream << GetLastRegstTime(actor_id2consumed_regsts[actor_id], + *msg_events, event, time_diff); + out_stream << GetLastRegstTime(actor_id2produced_regsts[actor_id], + *msg_events, event, time_diff); out_stream << "\n"; } out_stream.close(); diff --git a/oneflow/core/job/runtime.cpp b/oneflow/core/job/runtime.cpp index 0c7cd55bbcd9a43b45fe197bc0198c11d3f55327..8d66fb1bd02d45e162183940085233afe73079d7 100644 --- a/oneflow/core/job/runtime.cpp +++ b/oneflow/core/job/runtime.cpp @@ -75,11 +75,11 @@ Runtime::Runtime(const Plan& plan, bool is_experiment_phase) { void Runtime::NewAllGlobal(const Plan& plan, bool is_experiment_phase) { const JobDesc* job_desc = Global::Get(); int64_t piece_num = 0; - Global::New(); if (is_experiment_phase) { piece_num = job_desc->piece_num_of_experiment_phase(); Global::New(); Global::New(); + Global::New(); } else { if (job_desc->IsTrain()) { piece_num = job_desc->NumOfPiecesInBatch() * job_desc->TotalBatchNum();