From 9b03ed3acb0b75541d099373b1444734e66de507 Mon Sep 17 00:00:00 2001 From: lixinqi Date: Wed, 8 Jan 2020 00:01:38 +0800 Subject: [PATCH] JobBuildAndInferCtx::Complete --- oneflow/core/job/compiler.cpp | 2 - .../core/job/cudnn_conv_ctx_cache_scope.cpp | 18 ---- oneflow/core/job/cudnn_conv_ctx_cache_scope.h | 17 ---- oneflow/core/job/job_build_and_infer_ctx.cpp | 22 +++++ oneflow/core/job/job_build_and_infer_ctx.h | 1 + oneflow/core/job/job_builder.cpp | 1 + oneflow/core/job/oneflow.cpp | 2 - .../core/job/session_global_objects_scope.cpp | 7 ++ oneflow/core/job_completer/job_completer.cpp | 11 --- oneflow/python/framework/c_api_util.py | 5 + oneflow/python/framework/compiler.py | 1 + oneflow/python/framework/job_builder.py | 98 ------------------- oneflow/python/job_build_and_infer_helper.h | 2 + oneflow/python/job_build_and_infer_if.h | 4 + oneflow/python/ops/user_op_builder.py | 4 +- oneflow/python/test/ops/test_activations.py | 1 + 16 files changed, 46 insertions(+), 150 deletions(-) delete mode 100644 oneflow/core/job/cudnn_conv_ctx_cache_scope.cpp delete mode 100644 oneflow/core/job/cudnn_conv_ctx_cache_scope.h delete mode 100644 oneflow/python/framework/job_builder.py diff --git a/oneflow/core/job/compiler.cpp b/oneflow/core/job/compiler.cpp index 6c36603f42..96f1bfed3f 100644 --- a/oneflow/core/job/compiler.cpp +++ b/oneflow/core/job/compiler.cpp @@ -1,6 +1,5 @@ #include "oneflow/core/job/compiler.h" #include "oneflow/core/persistence/tee_persistent_log_stream.h" -#include "oneflow/core/job/cudnn_conv_ctx_cache_scope.h" #include "oneflow/core/graph/op_graph.h" #include "oneflow/core/job_completer/job_completer.h" @@ -48,7 +47,6 @@ void Compiler::GenNetTopo(Plan* plan) const { } void Compiler::Compile(Job* job, Plan* plan, bool need_job_complete) const { - auto cudnn_conv_ctx_cache_scope = std::make_unique(); const JobDesc& job_desc = GlobalJobDesc(); if (need_job_complete) { JobCompleter().Complete(job); } Global::New(*job); diff --git a/oneflow/core/job/cudnn_conv_ctx_cache_scope.cpp b/oneflow/core/job/cudnn_conv_ctx_cache_scope.cpp deleted file mode 100644 index 1e1cc428ee..0000000000 --- a/oneflow/core/job/cudnn_conv_ctx_cache_scope.cpp +++ /dev/null @@ -1,18 +0,0 @@ -#include "oneflow/core/job/cudnn_conv_ctx_cache_scope.h" -#include "oneflow/core/device/cudnn_conv_ctx_cache.h" - -namespace oneflow { - -CudnnConvCtxCacheScope::CudnnConvCtxCacheScope() { -#ifdef WITH_CUDA - Global::New(); -#endif -} - -CudnnConvCtxCacheScope::~CudnnConvCtxCacheScope() { -#ifdef WITH_CUDA - Global::Delete(); -#endif -} - -} // namespace oneflow diff --git a/oneflow/core/job/cudnn_conv_ctx_cache_scope.h b/oneflow/core/job/cudnn_conv_ctx_cache_scope.h deleted file mode 100644 index 4804db1268..0000000000 --- a/oneflow/core/job/cudnn_conv_ctx_cache_scope.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef ONEFLOW_CORE_JOB_CUDNN_CONV_CTX_CACHE_SCOPE_H_ -#define ONEFLOW_CORE_JOB_CUDNN_CONV_CTX_CACHE_SCOPE_H_ - -#include "oneflow/core/common/util.h" - -namespace oneflow { - -class CudnnConvCtxCacheScope final { - public: - OF_DISALLOW_COPY_AND_MOVE(CudnnConvCtxCacheScope); - CudnnConvCtxCacheScope(); - ~CudnnConvCtxCacheScope(); -}; - -} // namespace oneflow - -#endif // ONEFLOW_CORE_JOB_CUDNN_CONV_CTX_CACHE_SCOPE_H_ diff --git a/oneflow/core/job/job_build_and_infer_ctx.cpp b/oneflow/core/job/job_build_and_infer_ctx.cpp index 748f1f084e..8d05e56d01 100644 --- a/oneflow/core/job/job_build_and_infer_ctx.cpp +++ b/oneflow/core/job/job_build_and_infer_ctx.cpp @@ -1,5 +1,7 @@ #include "oneflow/core/job/job_build_and_infer_ctx.h" +#include "oneflow/core/job_completer/op_graph_pass.h" #include "oneflow/core/framework/user_op_conf.h" +#include "oneflow/core/common/protobuf.h" namespace oneflow { @@ -38,6 +40,26 @@ Maybe JobBuildAndInferCtx::SetJobConf(const JobConfigProto& job_conf) { return Maybe::Ok(); } +Maybe JobBuildAndInferCtx::Complete() { + CHECK_NOTNULL(Global::Get()); + Global::Delete(); + auto scope = std::make_unique(job_->job_conf(), job_id_); + auto DoPass = [&](const std::string& pass_name) { FunctionPass(pass_name)(job_); }; + DoPass("CompleteOfrecordDecoder"); + DoPass("SetDefaultVariableConf"); + DoPass("AutoMixedPrecision"); + DoPass("TieUpChainHeadersUnReachableFromAnyVariableOps"); + DoPass("NonDistributedOptimizerPass"); + DoPass("AutoTrainStep"); + DoPass("AutoLearningRate"); + DoPass("GenerateBackwardAndOptimizerOpConfs"); + DoPass("SequentializeNcclTupleBroadcastReducePass"); + DoPass("AddAllReduceGroupPass"); + DoPass("AddLbiDiffWatcherOpConfs"); + DoPass("SequentializeAllReduceGroupPass"); + return Maybe::Ok(); +} + Maybe JobBuildAndInferCtx::AddOpNameParallelConf2Placement( const std::string& op_name, const ParallelConf& parallel_conf) { ParallelDesc parallel_desc(parallel_conf); diff --git a/oneflow/core/job/job_build_and_infer_ctx.h b/oneflow/core/job/job_build_and_infer_ctx.h index fd821eed5c..36fe8fb991 100644 --- a/oneflow/core/job/job_build_and_infer_ctx.h +++ b/oneflow/core/job/job_build_and_infer_ctx.h @@ -20,6 +20,7 @@ class JobBuildAndInferCtx { Maybe CheckAndCompleteUserOpConf(const OperatorConf& op_conf); Maybe SetJobConf(const JobConfigProto& job_conf); + Maybe Complete(); Maybe AddAndInferOp(const OperatorConf& op_conf, const ParallelConf& parallel_conf); Maybe AddAndInferConsistentOp(const OperatorConf& op_conf, const ParallelConf& parallel_conf); diff --git a/oneflow/core/job/job_builder.cpp b/oneflow/core/job/job_builder.cpp index 3d6f8f52f9..f24e5fa015 100644 --- a/oneflow/core/job/job_builder.cpp +++ b/oneflow/core/job/job_builder.cpp @@ -65,6 +65,7 @@ const ParallelConf& JobBuilder::ParallelConf4Lbi(const LogicalBlobId& lbi) const void JobBuilder::AddOps(const ParallelConf& parallel_conf, const std::vector& op_confs) { + if (op_confs.empty()) { return; } auto* placemnt_group = job_->mutable_placement()->add_placement_group(); *placemnt_group->mutable_parallel_conf() = parallel_conf; for (const auto& op_conf : op_confs) { diff --git a/oneflow/core/job/oneflow.cpp b/oneflow/core/job/oneflow.cpp index c66982991f..db1354eb08 100644 --- a/oneflow/core/job/oneflow.cpp +++ b/oneflow/core/job/oneflow.cpp @@ -6,7 +6,6 @@ #include "oneflow/core/job/improver.h" #include "oneflow/core/job/job_desc.h" #include "oneflow/core/job/job_builder.h" -#include "oneflow/core/job_completer/op_graph_pass.h" #include "oneflow/core/job/job_set.pb.h" #include "oneflow/core/job/machine_context.h" #include "oneflow/core/job/profiler.h" @@ -743,7 +742,6 @@ void CompileAndMergePlanOnMaster(const PbRpf& conf_jobs, Plan* plan) { AddJobName2JobId(jobs.at(job_id).job_conf().job_name(), job_id); { auto scope = std::make_unique(jobs.at(job_id).job_conf(), job_id); - FunctionPass("CompleteOfrecordDecoder")(&jobs.at(job_id)); CompileCurJobOnMaster(&jobs.at(job_id), &sub_plans.at(job_id), true); } } diff --git a/oneflow/core/job/session_global_objects_scope.cpp b/oneflow/core/job/session_global_objects_scope.cpp index c80bf2cee9..841665dcc8 100644 --- a/oneflow/core/job/session_global_objects_scope.cpp +++ b/oneflow/core/job/session_global_objects_scope.cpp @@ -17,6 +17,7 @@ #include "oneflow/core/job/lbi_diff_watcher_info.pb.h" #include "oneflow/core/job/job_set_compile_ctx.h" #include "oneflow/core/job/runtime_buffer_managers_scope.h" +#include "oneflow/core/device/cudnn_conv_ctx_cache.h" namespace oneflow { @@ -72,10 +73,16 @@ Maybe SessionGlobalObjectsScope::Init(const ConfigProto& config_proto) { Global::New(); Global::New(); } +#ifdef WITH_CUDA + Global::New(); +#endif return Maybe::Ok(); } SessionGlobalObjectsScope::~SessionGlobalObjectsScope() { +#ifdef WITH_CUDA + Global::Delete(); +#endif if (Global::Get()->IsThisMachineMaster()) { Global::Delete(); Global::Delete(); diff --git a/oneflow/core/job_completer/job_completer.cpp b/oneflow/core/job_completer/job_completer.cpp index b469295b89..a217644a3f 100644 --- a/oneflow/core/job_completer/job_completer.cpp +++ b/oneflow/core/job_completer/job_completer.cpp @@ -86,17 +86,6 @@ void DumpLogicalBlobDescAndSbpSignature(const OpGraph& op_graph, JobBuilder* job } // namespace void JobCompleter::Complete(Job* job) const { - FunctionPass("SetDefaultVariableConf")(job); - FunctionPass("AutoMixedPrecision")(job); - FunctionPass("TieUpChainHeadersUnReachableFromAnyVariableOps")(job); - FunctionPass("NonDistributedOptimizerPass")(job); - FunctionPass("AutoTrainStep")(job); - FunctionPass("AutoLearningRate")(job); - FunctionPass("GenerateBackwardAndOptimizerOpConfs")(job); - FunctionPass("SequentializeNcclTupleBroadcastReducePass")(job); - FunctionPass("AddAllReduceGroupPass")(job); - FunctionPass("AddLbiDiffWatcherOpConfs")(job); - FunctionPass("SequentializeAllReduceGroupPass")(job); WithOpGraphAndMutJobBuilder(job, &DumpLogicalBlobDescAndSbpSignature); WithOpGraphAndMutJobBuilder(job, &GroupBoxingByDstParallel); WithOpGraphAndMutJobBuilder(job, &AddKeepHeaderOnlyOp); diff --git a/oneflow/python/framework/c_api_util.py b/oneflow/python/framework/c_api_util.py index 742f83f867..44eee62e26 100644 --- a/oneflow/python/framework/c_api_util.py +++ b/oneflow/python/framework/c_api_util.py @@ -97,6 +97,11 @@ def CurJobBuildAndInferCtx_SetJobConf(job_config_proto): error = text_format.Parse(error_str, error_util.ErrorProto()) if error.HasField("error_type"): raise JobBuildAndInferError(error) +def CurJobBuildAndInferCtx_Complete(): + error_str = oneflow_internal.CurJobBuildAndInferCtx_Complete() + error = text_format.Parse(error_str, error_util.ErrorProto()) + if error.HasField("error_type"): raise JobBuildAndInferError(error) + def CurJobBuildAndInferCtx_CheckAndCompleteUserOpConf(op_conf_proto): serialized_op_conf = str(text_format.MessageToString(op_conf_proto)) AddDefaultVal = oneflow_internal.CurJobBuildAndInferCtx_CheckAndCompleteUserOpConf diff --git a/oneflow/python/framework/compiler.py b/oneflow/python/framework/compiler.py index 8797b2e6d5..b5c36b1004 100644 --- a/oneflow/python/framework/compiler.py +++ b/oneflow/python/framework/compiler.py @@ -32,6 +32,7 @@ def Compile(function_desc, config_proto): with _JobBuildAndInferCtx(job_conf.job_name), placement_scope, distribute_strategy: c_api_util.CurJobBuildAndInferCtx_SetJobConf(job_conf) _CompileJob(function_desc) + c_api_util.CurJobBuildAndInferCtx_Complete() def _CompileJob(function_desc): func = function_desc.job_func diff --git a/oneflow/python/framework/job_builder.py b/oneflow/python/framework/job_builder.py deleted file mode 100644 index 33f09e5543..0000000000 --- a/oneflow/python/framework/job_builder.py +++ /dev/null @@ -1,98 +0,0 @@ -from __future__ import absolute_import - -import oneflow.python.framework.c_api_util as c_api_util - -class JobBuildAndInferCtx(object): - def __init__(self, job_name): - self.job_name_ = job_name - - def __enter__(self): - _Open(self.job_name_) - - def __exit__(self, *args): - _Close(self.job_name_) - -def GetCurCtxJobName(): - return c_api_util.JobBuildAndInferCtx_GetCurrentJobName() - -def CurCtxCheckJob(): - c_api_util.CurJobBuildAndInferCtx_CheckJob() - -def CurCtxSetJobConfIfNotSet(job_config_proto): - global job_conf_inited - if job_conf_inited == False: - c_api_util.CurJobBuildAndInferCtx_SetJobConf(job_config_proto) - job_conf_inited = True - -def CurCtxCheckAndCompleteUserOpConf(op_conf_proto): - return c_api_util.CurJobBuildAndInferCtx_CheckAndCompleteUserOpConf(op_conf_proto) - -def CurCtxAddAndInferOp(op_conf_proto, parallel_conf_proto): - return c_api_util.CurJobBuildAndInferCtx_AddAndInferOp(op_conf_proto, parallel_conf_proto) - -def CurCtxAddLossLogicalBlobName(lbn): - return c_api_util.CurJobBuildAndInferCtx_AddLossLogicalBlobName(lbn) - -def CurCtxAddLbiAndDiffWatcherUuidPair(lbi_and_uuid): - return c_api_util.CurJobBuildAndInferCtx_AddLbiAndDiffWatcherUuidPair(lbi_and_uuid) - -def CurCtxHasJobConf(): - return c_api_util.CurJobBuildAndInferCtx_HasJobConf() - -def MirroredBlobGetStaticShape(job_name, lbn): - return c_api_util.JobBuildAndInferCtx_MirroredBlobGetStaticShape(job_name, lbn) - -def MirroredBlobGetDataType(job_name, lbn): - return c_api_util.JobBuildAndInferCtx_MirroredBlobGetDataType(job_name, lbn) - -def MirroredBlobIsDynamic(job_name, lbn): - return c_api_util.JobBuildAndInferCtx_MirroredBlobIsDynamic(job_name, lbn) - -def MirroredBlobDisableBoxing(job_name, lbn): - return c_api_util.JobBuildAndInferCtx_MirroredBlobDisableBoxing(job_name, lbn) - -def MirroredBlobIsTensorList(job_name, lbn): - return c_api_util.JobBuildAndInferCtx_MirroredBlobIsTensorList(job_name, lbn) - -def MirroredBlobGetBatchAxis(job_name, lbn): - return c_api_util.JobBuildAndInferCtx_MirroredBlobGetBatchAxis(job_name, lbn) - -def MirroredBlobGetSplitAxisFromProducerView(job_name, lbn): - return c_api_util.JobBuildAndInferCtx_MirroredBlobGetSplitAxisFromProducerView(job_name, lbn) - -def MirroredBlobGetParallelConfFromProducerView(job_name, lbn): - return c_api_util.JobBuildAndInferCtx_MirroredBlobGetParallelConfFromProducerView(job_name, lbn) - -def GetStaticShape(job_name, lbn): - return c_api_util.JobBuildAndInferCtx_GetStaticShape(job_name, lbn) - -def GetDataType(job_name, lbn): - return c_api_util.JobBuildAndInferCtx_GetDataType(job_name, lbn) - -def IsDynamic(job_name, lbn): - return c_api_util.JobBuildAndInferCtx_IsDynamic(job_name, lbn) - -def DisableBoxing(job_name, lbn): - return c_api_util.JobBuildAndInferCtx_DisableBoxing(job_name, lbn) - -def IsTensorList(job_name, lbn): - return c_api_util.JobBuildAndInferCtx_IsTensorList(job_name, lbn) - -def GetBatchAxis(job_name, lbn): - return c_api_util.JobBuildAndInferCtx_GetBatchAxis(job_name, lbn) - -def GetSplitAxisFromProducerView(job_name, lbn): - return c_api_util.JobBuildAndInferCtx_GetSplitAxisFromProducerView(job_name, lbn) - -def GetParallelConfFromProducerView(job_name, lbn): - return c_api_util.JobBuildAndInferCtx_GetParallelConfFromProducerView(job_name, lbn) - -def _Open(job_name): - return c_api_util.JobBuildAndInferCtx_Open(job_name) - -def _Close(job_name): - global job_conf_inited - job_conf_inited = False - return c_api_util.JobBuildAndInferCtx_Close() - -job_conf_inited = False diff --git a/oneflow/python/job_build_and_infer_helper.h b/oneflow/python/job_build_and_infer_helper.h index dfce25dbb1..4d9316f4c8 100644 --- a/oneflow/python/job_build_and_infer_helper.h +++ b/oneflow/python/job_build_and_infer_helper.h @@ -40,6 +40,8 @@ Maybe CurJobBuildAndInferCtx_SetJobConf(const std::string& serialized_job_ return JUST(GetCurInferCtx())->SetJobConf(job_conf); } +Maybe CurJobBuildAndInferCtx_Complete() { return JUST(GetCurInferCtx())->Complete(); } + Maybe CurJobBuildAndInferCtx_HasJobConf() { return JUST(GetCurInferCtx())->HasJobConf(); } Maybe CurJobBuildAndInferCtx_CheckAndCompleteUserOpConf( diff --git a/oneflow/python/job_build_and_infer_if.h b/oneflow/python/job_build_and_infer_if.h index 4873656b9d..062fa538d8 100644 --- a/oneflow/python/job_build_and_infer_if.h +++ b/oneflow/python/job_build_and_infer_if.h @@ -30,6 +30,10 @@ void CurJobBuildAndInferCtx_SetJobConf(const std::string& serialized_job_conf, .GetDataAndSerializedErrorProto(error_str); } +void CurJobBuildAndInferCtx_Complete(std::string* error_str) { + return oneflow::CurJobBuildAndInferCtx_Complete().GetDataAndSerializedErrorProto(error_str); +} + bool CurJobBuildAndInferCtx_HasJobConf(std::string* error_str) { return oneflow::CurJobBuildAndInferCtx_HasJobConf().GetDataAndSerializedErrorProto(error_str, false); diff --git a/oneflow/python/ops/user_op_builder.py b/oneflow/python/ops/user_op_builder.py index e46d4ab7fc..5624175bf6 100644 --- a/oneflow/python/ops/user_op_builder.py +++ b/oneflow/python/ops/user_op_builder.py @@ -4,7 +4,7 @@ import oneflow.python.framework.compile_context as compile_context import oneflow.python.framework.blob_desc as blob_desc import oneflow.python.framework.remote_blob as remote_blob_util import oneflow.python.framework.id_util as id_util -import oneflow.python.framework.job_builder as job_builder +import oneflow.python.framework.c_api_util as c_api_util import oneflow.core.operator.op_conf_pb2 as op_conf_util import oneflow.core.framework.user_op_attr_pb2 as user_op_attr_util import oneflow.core.register.logical_blob_id_pb2 as logical_blob_id_util @@ -35,7 +35,7 @@ class UserOpConfWrapperBuilder(object): def Build(self): assert self.user_op_.op_conf_.user_conf.op_type_name is not "" self.user_op_.op_conf_ = \ - job_builder.CurCtxCheckAndCompleteUserOpConf(self.user_op_.op_conf_) + c_api_util.CurJobBuildAndInferCtx_CheckAndCompleteUserOpConf(self.user_op_.op_conf_) return self.user_op_ def Op(self, op_type_name): diff --git a/oneflow/python/test/ops/test_activations.py b/oneflow/python/test/ops/test_activations.py index a9707e3687..b2aaeacc27 100644 --- a/oneflow/python/test/ops/test_activations.py +++ b/oneflow/python/test/ops/test_activations.py @@ -14,6 +14,7 @@ from test_util import Save def compare_with_tensorflow(device_type, activation_type, shape): assert device_type in ["gpu", "cpu"] flow.clear_default_session() + flow.config.enable_debug_mode(True); func_config = flow.FunctionConfig() func_config.default_data_type(flow.float) func_config.train.primary_lr(1e-4) -- GitLab