diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc
index 7ad1e40c600c6e70cea822fac777ff20163078e6..900303343838c2cda2e7c37c7a135d430dbbc18c 100644
--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@@ -26,17 +26,46 @@ ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope,
       scope_(scope),
       place_(place) {}
 
+struct RecordTime {
+  RecordTime(const std::string &name, const std::string &type)
+      : name_(name), type_(type), start_(std::chrono::system_clock::now()) {}
+
+  ~RecordTime() {
+    if (type_ == "elementwise_add") {
+      end_ = std::chrono::system_clock::now();
+      std::chrono::duration<double> diff = end_ - start_;
+      VLOG(1) << name_ << " " << type_ << " time record: " << diff.count();
+    }
+  }
+
+  std::string name_;
+  std::string type_;
+  std::chrono::system_clock::time_point start_;
+  std::chrono::system_clock::time_point end_;
+};
+
 void ComputationOpHandle::RunImpl() {
-  WaitInputVarGenerated(place_);
+  {
+    RecordTime rt("ComputationOpHandle::RunImpl", "Wait");
+    WaitInputVarGenerated(place_);
+  }
+
+  Scope *scope = nullptr;
+  {
+    RecordTime rt("ComputationOpHandle::RunImpl", "PrepareScope");
+    scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
+  }
+
+  {
+    RecordTime rt("ComputationOpHandle::RunImpl", "ReallyRun " + op_->Type());
 
-  auto run_func = [this]() {
-    op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
-  };
+    auto run_func = [this, scope]() { op_->Run(*scope, place_); };
 
-  if (is_lock_and_record_event_free_) {
-    run_func();
-  } else {
-    this->RunAndRecordEvent(run_func);
+    if (is_lock_and_record_event_free_) {
+      run_func();
+    } else {
+      this->RunAndRecordEvent(run_func);
+    }
   }
 }
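The RecordTime helper introduced above is a plain RAII scoped timer: construction stamps the start time and the destructor logs the elapsed interval when the enclosing block exits. Below is a minimal self-contained sketch of the same pattern for reference (the ScopedTimer name is hypothetical, not part of this patch); it uses std::chrono::steady_clock, which is monotonic and therefore better suited to interval measurement than the system_clock used in the patch.

```cpp
#include <chrono>
#include <iostream>
#include <string>

// Sketch of the RAII scoped-timer pattern used by RecordTime above.
// steady_clock is monotonic, so the measured interval cannot go
// backwards if the wall clock is adjusted mid-measurement.
class ScopedTimer {
 public:
  explicit ScopedTimer(std::string label)
      : label_(std::move(label)), start_(std::chrono::steady_clock::now()) {}

  ~ScopedTimer() {
    auto end = std::chrono::steady_clock::now();
    std::chrono::duration<double> diff = end - start_;
    std::cerr << label_ << " took " << diff.count() << " s\n";
  }

 private:
  std::string label_;
  std::chrono::steady_clock::time_point start_;
};

int main() {
  {
    ScopedTimer t("busy loop");  // destructor prints when the block exits
    volatile long sum = 0;
    for (long i = 0; i < 10000000; ++i) sum += i;
  }
  return 0;
}
```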
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
index 949510e03705a4a0900f1c7b8758a8f7308aa44b..872bc5d654cd66db821e56031d878815b653645c 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -120,6 +120,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run(
   ClearFetchOp(graph_.get(), &fetch_ops);
   return fetches;
 }
+
 void FastThreadedSSAGraphExecutor::RunOpAsync(
     std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
     OpHandleBase *op,
diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index 4822627ac3b65972f41d9a23d9fe3dba3de3f97d..5997f12ffabcfd42ac210c71d2172501ee0fdf6d 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -41,7 +41,7 @@ OpHandleBase::~OpHandleBase() {
 
 void OpHandleBase::Run(bool use_cuda) {
 #ifdef PADDLE_WITH_CUDA
-  if (events_.empty() && use_cuda) {
+  if (events_.empty() && use_cuda && !dev_ctxes_.empty()) {
     for (auto &p : dev_ctxes_) {
       int dev_id = boost::get<platform::CUDAPlace>(p.first).device;
      PADDLE_ENFORCE(cudaSetDevice(dev_id));
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index c6f3254e9f7cedcf47be8ce8c3eecf4aa1b57add..b8adce4edf19c9b9890c1a04685a55e28c982e23 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -701,85 +701,125 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
   this->InferShape(&infer_shape_ctx);
 }
 
+struct RecordTime {
+  RecordTime(const std::string& name, const std::string& type)
+      : name_(name), type_(type),
+        start_(std::chrono::system_clock::now()) {}
+
+  inline void stop() {
+    end_ = std::chrono::system_clock::now();
+    std::chrono::duration<double> diff = end_ - start_;
+    VLOG(1) << name_ << " " << type_ << " time record: " << diff.count();
+  }
+
+  ~RecordTime() {
+    if (type_ == "elementwise_add") {
+      stop();
+    }
+  }
+
+  std::string name_;
+  std::string type_;
+  std::chrono::system_clock::time_point start_;
+  std::chrono::system_clock::time_point end_;
+};
+
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
-  RuntimeInferShapeContext infer_shape_ctx(*this, scope);
-  this->InferShape(&infer_shape_ctx);
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto* dev_ctx = pool.Get(place);
-
-  // check if op[type] has kernel registered.
-  auto& all_op_kernels = AllOpKernels();
-  auto kernels_iter = all_op_kernels.find(type_);
-  if (kernels_iter == all_op_kernels.end()) {
-    PADDLE_THROW(
-        "There are no kernels which are registered in the %s operator.", type_);
+  RecordTime rt("OperatorWithKernel::All", type_);
+  {
+    RecordTime rt("OperatorWithKernel::InferShape", type_);
+    RuntimeInferShapeContext infer_shape_ctx(*this, scope);
+    this->InferShape(&infer_shape_ctx);
   }
 
-  OpKernelMap& kernels = kernels_iter->second;
+  {
+    RecordTime* rt_1 = new RecordTime("OperatorWithKernel::Compute1", type_);
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto* dev_ctx = pool.Get(place);
 
-  // TODO(dzhwinter) : kernel fallback mechanism will be added when all the
-  // transform functions are ready.
+    // check if op[type] has kernel registered.
+    auto& all_op_kernels = AllOpKernels();
+    auto kernels_iter = all_op_kernels.find(type_);
+    if (kernels_iter == all_op_kernels.end()) {
+      PADDLE_THROW(
+          "There are no kernels which are registered in the %s operator.",
+          type_);
+    }
 
-  // for (auto& candidate : kKernelPriority) {
-  //   Do selection
-  // }
+    OpKernelMap& kernels = kernels_iter->second;
 
-  auto expected_kernel_key =
-      this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx));
-  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
+    // TODO(dzhwinter) : kernel fallback mechanism will be added when all the
+    // transform functions are ready.
 
-  auto kernel_iter = kernels.find(expected_kernel_key);
+    // for (auto& candidate : kKernelPriority) {
+    //   Do selection
+    // }
+
+    auto expected_kernel_key =
+        this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx));
+    VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
+
+    auto kernel_iter = kernels.find(expected_kernel_key);
 #ifdef PADDLE_WITH_MKLDNN
-  // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
-  if (kernel_iter == kernels.end() &&
-      expected_kernel_key.library_type_ == LibraryType::kMKLDNN) {
-    VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one";
-    expected_kernel_key.library_type_ = LibraryType::kPlain;
-    expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
-    kernel_iter = kernels.find(expected_kernel_key);
-  }
+    // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
+    if (kernel_iter == kernels.end() &&
+        expected_kernel_key.library_type_ == LibraryType::kMKLDNN) {
+      VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one";
+      expected_kernel_key.library_type_ = LibraryType::kPlain;
+      expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
+      kernel_iter = kernels.find(expected_kernel_key);
+    }
 #endif
-  if (kernel_iter == kernels.end()) {
-    PADDLE_THROW("op %s does not have kernel for %s", type_,
-                 KernelTypeToString(expected_kernel_key));
-  }
+    if (kernel_iter == kernels.end()) {
+      PADDLE_THROW("op %s does not have kernel for %s", type_,
+                   KernelTypeToString(expected_kernel_key));
+    }
 
-  // do data transformScope &transfer_scope;
-  std::vector<std::string> transfered_inplace_vars;
-  auto* transfer_scope =
-      TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars);
+    // do data transformScope &transfer_scope;
+    std::vector<std::string> transfered_inplace_vars;
+    Scope* transfer_scope = nullptr;
+    // auto* transfer_scope =
+    //     TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars);
 
-  // exec scope is the scope that kernel actually executed on.
-  const Scope& exec_scope =
-      (transfer_scope == nullptr ? scope : *transfer_scope);
+    // exec scope is the scope that kernel actually executed on.
+    const Scope& exec_scope = scope;
+    // const Scope& exec_scope =
+    //     (transfer_scope == nullptr ? scope : *transfer_scope);
 
-  if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) {
-    dev_ctx = pool.Get(expected_kernel_key.place_);
-  }
+    if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) {
+      dev_ctx = pool.Get(expected_kernel_key.place_);
+    }
+    delete rt_1;
 
-  kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx));
+    RecordTime* rt_2 = new RecordTime("OperatorWithKernel::Compute2", type_);
+    kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx));
+    delete rt_2;
 
-  if (!transfered_inplace_vars.empty()) {
-    // there is inplace variable has been transfered.
-    TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope);
-  }
+    RecordTime* rt_3 = new RecordTime("OperatorWithKernel::Compute3", type_);
+    if (!transfered_inplace_vars.empty()) {
+      // there is an inplace variable that has been transferred.
+      TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope);
+    }
 
-  /*For profiling/benchmark only*/
-  if (FLAGS_benchmark) {
-    dev_ctx->Wait();
-  }
+    /*For profiling/benchmark only*/
+    if (FLAGS_benchmark) {
+      dev_ctx->Wait();
+    }
 
-  if (FLAGS_check_nan_inf) {
-    for (auto& vname : OutputVars(true)) {
-      auto* var = exec_scope.FindVar(vname);
-      if (var == nullptr) continue;
-      if (var->IsType<framework::LoDTensor>()) {
-        CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
-      } else if (var->IsType<framework::SelectedRows>()) {
-        CheckTensorNANOrInf(vname, var->Get<framework::SelectedRows>().value());
+    if (FLAGS_check_nan_inf) {
+      for (auto& vname : OutputVars(true)) {
+        auto* var = exec_scope.FindVar(vname);
+        if (var == nullptr) continue;
+        if (var->IsType<framework::LoDTensor>()) {
+          CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
+        } else if (var->IsType<framework::SelectedRows>()) {
+          CheckTensorNANOrInf(vname,
+                              var->Get<framework::SelectedRows>().value());
+        }
       }
     }
+    delete rt_3;
   }
 }
 
 void OperatorWithKernel::TransferInplaceVarsBack(
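Note that the Compute1/Compute2/Compute3 segments above are timed with heap-allocated RecordTime objects paired with manual delete calls; if a PADDLE_THROW fires in between, the timer leaks and its measurement is never logged. A sketch of an exception-safe alternative follows, assuming the RecordTime struct from this patch is in scope (the function shown is illustrative, not part of the patch):

```cpp
#include <memory>
#include <string>

// Illustrative only: the same timed segments without manual delete.
void TimedCompute(const std::string& type) {
  {
    // Explicit scope: the destructor logs even if the body throws.
    RecordTime rt("OperatorWithKernel::Compute1", type);
    // ... kernel lookup and device-context setup ...
  }
  // When introducing a new scope is awkward, unique_ptr keeps the RAII
  // guarantee (C++11): reset() ends the segment early, and stack
  // unwinding ends it automatically if an exception escapes.
  std::unique_ptr<RecordTime> rt2(
      new RecordTime("OperatorWithKernel::Compute2", type));
  // ... kernel launch ...
  rt2.reset();
}
```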
diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc
index 0d261dd7ccc323abddd2c3ef13f1874661a8ca75..61416676d631ff84e54388c1e9da21a246a051d1 100644
--- a/paddle/fluid/framework/scope.cc
+++ b/paddle/fluid/framework/scope.cc
@@ -43,9 +43,16 @@ DEFINE_double(
 // the mutex will cause serious performance issue.
 // So the mutex is disabled when `ON_INFER`.
 #ifdef PADDLE_ON_INFERENCE
-#define SCOPE_LOCK_GUARD
+#define SCOPE_READER_LOCK
+#define SCOPE_WRITER_LOCK
 #else
-#define SCOPE_LOCK_GUARD std::lock_guard<std::mutex> lock(mutex_);
+// TODO(minqiyang): use reader lock and writer lock in all platforms
+#define SCOPE_READER_LOCK
+#define SCOPE_WRITER_LOCK
+// #define SCOPE_READER_LOCK boost::shared_lock<boost::shared_mutex>
+// lock(mutex_);
+// #define SCOPE_WRITER_LOCK boost::unique_lock<boost::shared_mutex>
+// lock(mutex_);
 #endif
 
 namespace paddle {
@@ -61,18 +68,18 @@ int64_t GetEagerDeletionThreshold() {
 Scope::~Scope() { DropKids(); }
 
 Scope& Scope::NewScope() const {
-  SCOPE_LOCK_GUARD
+  SCOPE_WRITER_LOCK
   kids_.push_back(new Scope(this));
   return *kids_.back();
 }
 
 Variable* Scope::Var(const std::string& name) {
-  SCOPE_LOCK_GUARD
+  SCOPE_WRITER_LOCK
   return VarInternal(name);
 }
 
 Variable* Scope::Var(std::string* name) {
-  SCOPE_LOCK_GUARD
+  SCOPE_WRITER_LOCK
   auto new_name = string::Sprintf("%p.%d", this, vars_.size());
   if (name != nullptr) {
     *name = new_name;
@@ -81,34 +88,34 @@ Variable* Scope::Var(std::string* name) {
 }
 
 Variable* Scope::FindVar(const std::string& name) const {
-  SCOPE_LOCK_GUARD
+  SCOPE_READER_LOCK
   return FindVarInternal(name);
 }
 
 Variable* Scope::FindLocalVar(const std::string& name) const {
-  SCOPE_LOCK_GUARD
+  SCOPE_READER_LOCK
   return FindVarLocally(name);
 }
 
 const Scope* Scope::FindScope(const Variable* var) const {
-  SCOPE_LOCK_GUARD
+  SCOPE_READER_LOCK
   return FindScopeInternal(var);
 }
 
 void Scope::DropKids() {
-  SCOPE_LOCK_GUARD
+  SCOPE_WRITER_LOCK
   for (Scope* s : kids_) delete s;
   kids_.clear();
 }
 
 bool Scope::HasKid(const Scope* scope) const {
-  SCOPE_LOCK_GUARD
+  SCOPE_READER_LOCK
   auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
   return it != this->kids_.end();
 }
 
 std::vector<std::string> Scope::LocalVarNames() const {
-  SCOPE_LOCK_GUARD
+  SCOPE_READER_LOCK
   std::vector<std::string> known_vars;
   known_vars.reserve(this->vars_.size());
   for (auto& p : vars_) {
@@ -118,7 +125,7 @@ std::vector<std::string> Scope::LocalVarNames() const {
 }
 
 void Scope::DeleteScope(Scope* scope) const {
-  SCOPE_LOCK_GUARD
+  SCOPE_WRITER_LOCK
   auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
   PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope",
                  this, scope);
@@ -132,7 +139,7 @@ void Scope::DeleteScope(Scope* scope) const {
 }
 
 void Scope::EraseVars(const std::vector<std::string>& var_names) {
-  SCOPE_LOCK_GUARD
+  SCOPE_WRITER_LOCK
   std::set<std::string> var_set(var_names.begin(), var_names.end());
   for (auto it = vars_.begin(); it != vars_.end();) {
     if (var_set.find(it->first) != var_set.end()) {
@@ -145,12 +152,12 @@ void Scope::EraseVars(const std::vector<std::string>& var_names) {
 
 void Scope::Rename(const std::string& origin_name,
                    const std::string& new_name) const {
-  SCOPE_LOCK_GUARD
+  SCOPE_WRITER_LOCK
   RenameInternal(origin_name, new_name);
 }
 
 std::string Scope::Rename(const std::string& origin_name) const {
-  SCOPE_LOCK_GUARD
+  SCOPE_WRITER_LOCK
   auto new_name = string::Sprintf("%p.%d", this, vars_.size());
   RenameInternal(origin_name, new_name);
   return new_name;
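This change splits SCOPE_LOCK_GUARD into separate reader/writer macros but defines both as empty on every platform, so scope access is now entirely unlocked; the boost shared/unique lock definitions are left commented out. For reference, here is a minimal sketch of how the two macros could be backed by a standard shared mutex (std::shared_timed_mutex, C++14) instead of boost. This is an assumption about the intended design, not what the patch actually enables:

```cpp
#include <shared_mutex>  // C++14: std::shared_timed_mutex, std::shared_lock

// Sketch: many concurrent readers, exclusive writers.
// mutex_ is assumed to be a mutable member of the class using the macros.
#define SCOPE_READER_LOCK \
  std::shared_lock<std::shared_timed_mutex> lock(mutex_);
#define SCOPE_WRITER_LOCK \
  std::unique_lock<std::shared_timed_mutex> lock(mutex_);

class Scope {
 public:
  int Read() const {
    SCOPE_READER_LOCK  // shared: concurrent FindVar-style reads may proceed
    return value_;
  }
  void Write(int v) {
    SCOPE_WRITER_LOCK  // exclusive: Var/DeleteScope-style mutations serialize
    value_ = v;
  }

 private:
  mutable std::shared_timed_mutex mutex_;
  int value_ = 0;
};
```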
diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h
index 87bf7c6b156f32b8f6a1abc30b0676e1d4711d64..181baac870a6b3dae012fb5a75c2651c6a78239d 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -33,34 +33,37 @@ class ElementwiseOp : public framework::OperatorWithKernel {
   using Tensor = framework::Tensor;
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of elementwise op should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Y"),
-                   "Input(Y) of elementwise op should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of elementwise op should not be null.");
-
-    PADDLE_ENFORCE(
-        ctx->GetInputsVarType("Y").front() ==
-            framework::proto::VarType::LOD_TENSOR,
-        "The input var's type should be LoDTensor, but the received is %s [%s]",
-        ctx->GetInputsVarType("Y").front(), ctx->Inputs("Y").front());
-
-    if (ctx->GetInputsVarType("X").front() ==
-        framework::proto::VarType::LOD_TENSOR) {
-      auto x_dim = ctx->GetInputDim("X");
-      auto y_dim = ctx->GetInputDim("Y");
-      PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
-                        "Rank of first input must >= rank of second input.");
-    } else if (ctx->GetInputsVarType("X").front() ==
-               framework::proto::VarType::SELECTED_ROWS) {
-      PADDLE_ENFORCE((ctx->GetInputDim("Y").size() == 1u) &&
-                         (ctx->GetInputDim("Y")[0] == 1),
-                     "For elementwise_op, if X is Sparse, "
-                     "Y must be scalar.");
-    } else {
-      PADDLE_THROW("X's type[%s] is not supported by elementwise_op.",
-                   ctx->GetInputsVarType("X").front());
+    if (!ctx->IsRuntime()) {
+      PADDLE_ENFORCE(ctx->HasInput("X"),
+                     "Input(X) of elementwise op should not be null.");
+      PADDLE_ENFORCE(ctx->HasInput("Y"),
+                     "Input(Y) of elementwise op should not be null.");
+      PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                     "Output(Out) of elementwise op should not be null.");
+
+      PADDLE_ENFORCE(ctx->GetInputsVarType("Y").front() ==
+                         framework::proto::VarType::LOD_TENSOR,
+                     "The input var's type should be LoDTensor, but the "
+                     "received is %s [%s]",
+                     ctx->GetInputsVarType("Y").front(),
+                     ctx->Inputs("Y").front());
+
+      if (ctx->GetInputsVarType("X").front() ==
+          framework::proto::VarType::LOD_TENSOR) {
+        auto x_dim = ctx->GetInputDim("X");
+        auto y_dim = ctx->GetInputDim("Y");
+        PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
+                          "Rank of first input must >= rank of second input.");
+      } else if (ctx->GetInputsVarType("X").front() ==
+                 framework::proto::VarType::SELECTED_ROWS) {
+        PADDLE_ENFORCE((ctx->GetInputDim("Y").size() == 1u) &&
                           (ctx->GetInputDim("Y")[0] == 1),
+                       "For elementwise_op, if X is Sparse, "
+                       "Y must be scalar.");
+      } else {
+        PADDLE_THROW("X's type[%s] is not supported by elementwise_op.",
+                     ctx->GetInputsVarType("X").front());
+      }
     }
 
     ctx->ShareDim("X", /*->*/ "Out");
@@ -125,7 +128,7 @@ The equation is:
 
 $$%s$$
 
-- $X$: a tensor of any dimension. 
+- $X$: a tensor of any dimension.
 - $Y$: a tensor whose dimensions must be less than or equal to the dimensions of $X$.
 
 There are two cases for this operator:
@@ -135,10 +138,10 @@ There are two cases for this operator:
 
 For case 2:
 
-1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index 
-   for broadcasting $Y$ onto $X$. 
+1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index
+   for broadcasting $Y$ onto $X$.
 2. If $axis$ is -1 (default), $axis = rank(X) - rank(Y)$.
-3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of 
+3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of
    subsequence, such as shape(Y) = (2, 1) => (2).
 
 For example:
@@ -152,7 +155,7 @@ For example:
     shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
     shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0
 
-The inputs $X$ and $Y$ can carry the different LoD information. 
+The inputs $X$ and $Y$ can carry the different LoD information.
 But the output only shares the LoD information with the input $X$.
 
 )DOC",
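Wrapping the PADDLE_ENFORCE checks in `if (!ctx->IsRuntime())` restricts input validation to graph-construction time, so every runtime invocation of InferShape skips straight to shape propagation. A small sketch of this build-time-versus-runtime split in isolation (names are illustrative, not Paddle's API):

```cpp
#include <stdexcept>
#include <vector>

// Sketch: validate invariants once while building the graph,
// skip them on the per-execution hot path.
struct ShapeContext {
  bool is_runtime;                  // true when called per execution
  std::vector<int> x_dims, y_dims;
};

void InferShape(ShapeContext* ctx) {
  if (!ctx->is_runtime) {
    // Redundant-at-runtime checks run only at graph-build time;
    // by execution time the shapes were already validated once.
    if (ctx->x_dims.size() < ctx->y_dims.size()) {
      throw std::invalid_argument("rank of X must be >= rank of Y");
    }
  }
  // Shape propagation itself still runs in both modes:
  // out_dims = x_dims; ...
}
```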
diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc
index 5710cda39acce53e35dfceec675fcd4979a84e31..bc1b20321f1fe32f0028db4d91e2b5ddd069c176 100644
--- a/paddle/fluid/operators/optimizers/adam_op.cc
+++ b/paddle/fluid/operators/optimizers/adam_op.cc
@@ -23,56 +23,57 @@ class AdamOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Param"),
-                   "Input(Param) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(Grad) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Moment1"),
-                   "Input(Moment1) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Moment2"),
-                   "Input(Moment2) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
-                   "Input(LearningRate) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"),
-                   "Input(Beta1Pow) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"),
-                   "Input(Beta2Pow) of AdamOp should not be null.");
-
-    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
-                   "Output(ParamOut) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"),
-                   "Output(Moment1Out) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"),
-                   "Output(Moment2Out) of AdamOp should not be null.");
+    // PADDLE_ENFORCE(ctx->HasInput("Param"),
+    //                "Input(Param) of AdamOp should not be null.");
+    // PADDLE_ENFORCE(ctx->HasInput("Grad"),
+    //                "Input(Grad) of AdamOp should not be null.");
+    // PADDLE_ENFORCE(ctx->HasInput("Moment1"),
+    //                "Input(Moment1) of AdamOp should not be null.");
+    // PADDLE_ENFORCE(ctx->HasInput("Moment2"),
+    //                "Input(Moment2) of AdamOp should not be null.");
+    // PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
+    //                "Input(LearningRate) of AdamOp should not be null.");
+    // PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"),
+    //                "Input(Beta1Pow) of AdamOp should not be null.");
+    // PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"),
+    //                "Input(Beta2Pow) of AdamOp should not be null.");
+
+    // PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+    //                "Output(ParamOut) of AdamOp should not be null.");
+    // PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"),
+    //                "Output(Moment1Out) of AdamOp should not be null.");
+    // PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"),
+    //                "Output(Moment2Out) of AdamOp should not be null.");
 
     auto lr_dims = ctx->GetInputDim("LearningRate");
-    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
-                      "Learning rate should have 1 dimension");
+    // PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
+    //                   "Learning rate should have 1 dimension");
     auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow");
-    PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
-                      "Beta1 power accumulator should have 1 dimension");
+    // PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
+    //                   "Beta1 power accumulator should have 1 dimension");
    auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow");
-    PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1,
-                      "Beta2 power accumulator should have 1 dimension");
+    // PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1,
+    //                   "Beta2 power accumulator should have 1 dimension");
 
     auto param_dims = ctx->GetInputDim("Param");
-    if (ctx->GetInputsVarType("Grad")[0] ==
-        framework::proto::VarType::LOD_TENSOR) {
-      PADDLE_ENFORCE_EQ(
-          param_dims, ctx->GetInputDim("Grad"),
-          "Param and Grad input of AdamOp should have same dimension");
-    }
-    PADDLE_ENFORCE_EQ(
-        param_dims, ctx->GetInputDim("Moment1"),
-        "Param and Moment1 input of AdamOp should have same dimension");
-    PADDLE_ENFORCE_EQ(
-        param_dims, ctx->GetInputDim("Moment2"),
-        "Param and Moment2 input of AdamOp should have same dimension");
+    // if (ctx->GetInputsVarType("Grad")[0] ==
+    //     framework::proto::VarType::LOD_TENSOR) {
+    //   PADDLE_ENFORCE_EQ(
+    //       param_dims, ctx->GetInputDim("Grad"),
+    //       "Param and Grad input of AdamOp should have same dimension");
+    // }
+    // PADDLE_ENFORCE_EQ(
+    //     param_dims, ctx->GetInputDim("Moment1"),
+    //     "Param and Moment1 input of AdamOp should have same dimension");
+    // PADDLE_ENFORCE_EQ(
+    //     param_dims, ctx->GetInputDim("Moment2"),
+    //     "Param and Moment2 input of AdamOp should have same dimension");
 
     ctx->SetOutputDim("ParamOut", param_dims);
     ctx->SetOutputDim("Moment1Out", param_dims);
     ctx->SetOutputDim("Moment2Out", param_dims);
   }
+
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     auto input_data_type =
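Commenting out the AdamOp enforce checks wholesale removes validation with no way to re-enable it short of reverting the patch. A gentler pattern, sketched below under stated assumptions (the environment-variable name and the macro are hypothetical, not part of this patch), keeps the checks compiled in but gates them behind a runtime switch:

```cpp
#include <cstdlib>
#include <stdexcept>

// Sketch: keep validation available behind a runtime switch instead of
// deleting it. The environment-variable name is hypothetical.
inline bool ValidationEnabled() {
  static const bool enabled =
      std::getenv("FLAGS_enforce_infershape") != nullptr;
  return enabled;
}

#define CHECK_IF_ENABLED(cond, msg)       \
  do {                                    \
    if (ValidationEnabled() && !(cond)) { \
      throw std::runtime_error(msg);      \
    }                                     \
  } while (0)

// Usage inside an InferShape-style function:
//   CHECK_IF_ENABLED(ctx->HasInput("Param"),
//                    "Input(Param) of AdamOp should not be null.");
```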
diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py
index e05885f5f5bfc169828c1c6e723dffff098c3c2e..8df2e01b037490e64e00b68f121ed6b1cbb06cd0 100644
--- a/python/paddle/fluid/profiler.py
+++ b/python/paddle/fluid/profiler.py
@@ -92,7 +92,8 @@ def cuda_profiler(output_file, output_mode=None, config=None):
     config_file = 'nvprof_config_file'
     with open(config_file, 'wb') as fp:
         fp.writelines([six.b("%s\n" % item) for item in config])
-    core.nvprof_init(output_file, output_mode, config_file)
+    # Commented out so profiling is driven by nvprof directly.
+    # core.nvprof_init(output_file, output_mode, config_file)
     # Enables profiler collection by the active CUDA profiling tool.
     core.nvprof_start()
     yield