diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 077072f6eadb0c48f4ae32f94828613d89ed01c9..a3e682e54ac496e37ed4a33a7b30d9fdca381d9d 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -18,7 +18,7 @@ function(copy TARGET) set(oneValueArgs "") set(multiValueArgs SRCS DSTS DEPS) cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - set(inference_lib_dist_dep ${TARGET} ${inference_lib_dist_dep} PARENT_SCOPE) + set(fluid_lib_dist_dep ${TARGET} ${fluid_lib_dist_dep} PARENT_SCOPE) list(LENGTH copy_lib_SRCS copy_lib_SRCS_len) list(LENGTH copy_lib_DSTS copy_lib_DSTS_len) @@ -185,7 +185,8 @@ copy(cmake_cache SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt DSTS ${FLUID_INSTALL_DIR}) -add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep}) +# This command generates a complete fluid library for both train and inference +add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep}) # paddle fluid version execute_process( diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index c6dd919a93d119723b389d3a695f0af82d711a06..212724a0c7fc06edf0599ddcaad33aba1fead879 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -127,6 +127,7 @@ paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.margin_rank_loss ArgSpec(args=['label', 'left', 'right', 'margin', 'name'], varargs=None, keywords=None, defaults=(0.1, None)) paddle.fluid.layers.elu ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(1.0, None)) paddle.fluid.layers.relu6 ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(6.0, None)) paddle.fluid.layers.pow ArgSpec(args=['x', 'factor', 'name'], varargs=None, keywords=None, defaults=(1.0, None)) diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index 519a00fb073b08f6c88de8186de187476b548fd3..48b36df6499e59fe742766b5f81fd30a9fb8b900 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -12,6 +12,5 @@ endif(NOT WIN32) if(WITH_INFERENCE) # NOTE: please add subdirectory inference at last. add_subdirectory(inference) + add_subdirectory(train) endif() - -add_subdirectory(train) diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index 9fbefabc841e3f6940860f60d959fee97495e4c9..d09b94a3fd32952985a37cf4246c7640d2db4f56 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -64,7 +64,8 @@ class OpHandleBase { virtual bool IsMultiDeviceTransfer() { return false; } const platform::DeviceContext *DeviceContext(platform::Place place) { - return dev_ctxes_[place]; + auto it = dev_ctxes_.find(place); + return it != dev_ctxes_.end() ? it->second : nullptr; } void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) { diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 70ec6e90a4d0106b7f838e51b8357798daa4b10d..4576999c8ec756ee662ed91df198e1a96f9897fc 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -46,6 +46,41 @@ ExecutorPrepareContext::~ExecutorPrepareContext() { VLOG(5) << "destroy ExecutorPrepareContext"; } +template +static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, + GarbageCollector* gc, + RefCntMap* ref_cnts) { + std::unordered_set erase_tensors; + + auto handler = [&](const VariableNameMap& name_map) { + for (auto& name_pair : name_map) { + for (auto& name : name_pair.second) { + auto it = ref_cnts->find(name); + if (it == ref_cnts->end()) continue; + if ((it->second)-- == 1) { + auto* var = scope.FindVar(name); + if (var != nullptr) { + VLOG(10) << "Erase tensor \'" << name << "\'"; + if (var->IsType()) { + erase_tensors.insert(var->GetMutable()); + } else if (var->IsType()) { + erase_tensors.insert( + var->GetMutable()->mutable_value()); + } + } + } + } + } + }; + + handler(op->Inputs()); + handler(op->Outputs()); + + if (!erase_tensors.empty()) { + gc->Add(erase_tensors); + } +} + Executor::Executor(const platform::Place& place) : place_(place) {} void Executor::Close() { @@ -331,9 +366,13 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, } int64_t max_memory_size = GetEagerDeletionThreshold(); - std::unique_ptr> gc; - if (max_memory_size >= 0) { + // WhileOp would set keep_kids to false + // WhileGradOp would need the scopes created in WhileOp + // Perhaps, we should not perform eager deletion in WhileOp + // The scopes and variables created by WhileOp would be deleted + // in WhileGradOp. + if (max_memory_size >= 0 && !keep_kids) { ctx->ResetReferenceCount(); #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { @@ -352,45 +391,8 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, op->Run(*local_scope, place_); if (gc != nullptr) { - std::vector erase_vars; - for (auto& input : op->Inputs()) { - for (auto& input_name : input.second) { - auto it = ctx->cur_ref_cnts_.find(input_name); - if (it == ctx->cur_ref_cnts_.end()) continue; - if (it->second == 1) { // should delete it - erase_vars.emplace_back(input_name); - ctx->cur_ref_cnts_.erase(input_name); - } else { - --(it->second); - } - } - } - - for (auto& output : op->Outputs()) { - for (auto& output_name : output.second) { - auto it = ctx->cur_ref_cnts_.find(output_name); - if (it == ctx->cur_ref_cnts_.end()) continue; - if (it->second == 1) { - erase_vars.emplace_back(output_name); - ctx->cur_ref_cnts_.erase(output_name); - } else { - --(it->second); - } - } - } - - if (!erase_vars.empty()) { - std::vector erase_tensors; - for (auto& name : erase_vars) { - auto* var = local_scope->FindVar(name); - if (var == nullptr) continue; - if (var->IsType()) { - auto* tensor = var->GetMutable(); - erase_tensors.push_back(tensor); - } - } - if (!erase_tensors.empty()) gc->Add(erase_tensors); - } + DeleteUnusedTensors(*local_scope, op.get(), gc.get(), + &(ctx->cur_ref_cnts_)); } if (FLAGS_benchmark) { diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index f0cc1338a8af50030a70a9797cbcd1b0567272b5..36b36d49c2728dbef93042158dffa26d8f56d529 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -32,38 +32,32 @@ template std::unordered_map GetNonPersistableReferenceCount( const ProgramDesc& prog, size_t block_id) { auto& block = prog.Block(block_id); - std::unordered_set ignored_vars; std::unordered_map ref_cnts; - for (auto var_desc : block.AllVars()) { - auto type = var_desc->Proto()->type().type(); - if (type != proto::VarType::LOD_TENSOR || var_desc->Persistable()) { - ignored_vars.insert(var_desc->Name()); // ignore persistable vars - } - } - - for (auto op_desc : block.AllOps()) { - for (auto& input : op_desc->Inputs()) { - for (auto& input_name : input.second) { - if (!ignored_vars.count(input_name)) { - if (ref_cnts.count(input_name)) - ++ref_cnts[input_name]; - else - ref_cnts[input_name] = 1; + auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) { + for (auto& name_pair : name_map) { + for (auto& name : name_pair.second) { + auto* var_desc = block.FindVar(name); + if (var_desc == nullptr || var_desc->Persistable()) continue; + auto type = var_desc->Proto()->type().type(); + if (type != proto::VarType::LOD_TENSOR && + type != proto::VarType::SELECTED_ROWS) { + continue; } - } - } - for (auto& output : op_desc->Outputs()) { - for (auto output_name : output.second) { - if (!ignored_vars.count(output_name)) { - if (ref_cnts.count(output_name)) - ++ref_cnts[output_name]; - else - ref_cnts[output_name] = 1; + auto it = ref_cnts.find(name); + if (it != ref_cnts.end()) { + ++it->second; + } else { + ref_cnts[name] = 1; } } } + }; + + for (auto op_desc : block.AllOps()) { + update_ref_cnts(op_desc, op_desc->Inputs()); + update_ref_cnts(op_desc, op_desc->Outputs()); } return ref_cnts; } diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc index 95d7138381baec17d4969ef8b5287a1e70f2ac81..04459612a726bcb60f9d752dfd8927b6f5c2500d 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc @@ -44,89 +44,6 @@ namespace ir { GET_IR_NODE_FROM_SUBGRAPH(bn_saved_mean, bn_saved_mean, pattern_name); \ GET_IR_NODE_FROM_SUBGRAPH(bn_saved_variance, bn_saved_variance, pattern_name) -template -LoDTensor tensor_apply(const LoDTensor& vec, UnaryOperation f) { - LoDTensor vec_y; - vec_y.Resize(vec.dims()); - const float* x = vec.data(); - float* y = vec_y.mutable_data(platform::CPUPlace()); - for (int64_t i = 0; i < vec.numel(); i++) { - y[i] = f(x[i]); - } - return vec_y; -} - -void tensor_apply_inplace(LoDTensor* vec, float (*f)(float)) { - float* data = vec->mutable_data(platform::CPUPlace()); - for (int64_t i = 0; i < vec->numel(); i++) { - data[i] = f(data[i]); - } -} - -template -LoDTensor tensor_apply_eltwise(const LoDTensor& vec_a, const LoDTensor& vec_b, - BinaryOperation f) { - PADDLE_ENFORCE_EQ(vec_a.dims(), vec_b.dims()); - LoDTensor vec_y; - vec_y.Resize(vec_a.dims()); - const float* a = vec_a.data(); - const float* b = vec_b.data(); - float* y = vec_y.mutable_data(platform::CPUPlace()); - for (int64_t i = 0; i < vec_a.numel(); i++) { - y[i] = f(a[i], b[i]); - } - return vec_y; -} - -template -LoDTensor tensor_apply_eltwise_broadcast(const LoDTensor& vec_a, - const LoDTensor& vec_b, - BinaryOperation f) { - PADDLE_ENFORCE_EQ(vec_a.dims().size(), 2); - PADDLE_ENFORCE_EQ(vec_b.dims().size(), 2); - PADDLE_ENFORCE_EQ(vec_a.dims()[0], vec_b.dims()[0]); - PADDLE_ENFORCE_EQ(vec_b.dims()[1], 1); - LoDTensor vec_y; - vec_y.Resize(vec_a.dims()); - const float* a = vec_a.data(); - const float* b = vec_b.data(); - float* y = vec_y.mutable_data(platform::CPUPlace()); - size_t a_height = vec_a.dims()[0]; - size_t a_width = vec_a.dims()[1]; - for (size_t h = 0; h < a_height; h++) { - for (size_t w = 0; w < a_width; ++w) { - *(y++) = f(*(a++), b[h]); - } - } - return vec_y; -} - -// reshape to two dimensions {A, B * C * ...} -void make_tensor_2d(LoDTensor* tensor_to_reshape) { - auto dims_count = tensor_to_reshape->dims().size(); - PADDLE_ENFORCE_GT(dims_count, 0); - - int size2 = 1; - for (int i = 1; i < dims_count; i++) { - size2 *= tensor_to_reshape->dims()[i]; - } - tensor_to_reshape->Resize(make_ddim({tensor_to_reshape->dims()[0], size2})); -} - -void recompute_conv_weights(LoDTensor* weights, LoDTensor* tmp) { - // remember the weights tensor shape {A, B, C, ...} - auto weights_shape = weights->dims(); - // reduce the weights to 2d {A, B * C * ...} - make_tensor_2d(weights); - // make tmp tensor 2d by adding 1 as second dim {A, 1} - make_tensor_2d(tmp); - - *weights = - tensor_apply_eltwise_broadcast(*weights, *tmp, std::multiplies()); - // reshape weights to the original dims {A, B, C, ...} - weights->Resize(weights_shape); -} - void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight, // const ir::Node& bn_scale, // @@ -135,6 +52,13 @@ void recompute_bias_and_weights(const Scope* scope, const ir::Node& bn_variance, // LoDTensor* eltwise_y_in_tensor, // float epsilon) { + using EigenVectorArrayMap = + Eigen::Map>; + using ConstEigenVectorArrayMap = + Eigen::Map>; + using EigenMatrixArrayMap = Eigen::Map< + Eigen::Array>; + // Re-compute bias of conv2d from BN PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), bn_bias_tensor.dims()); @@ -143,31 +67,38 @@ void recompute_bias_and_weights(const Scope* scope, scope->FindVar(bn_variance.Name())->GetMutable(); auto* mean_tensor = scope->FindVar(bn_mean.Name())->GetMutable(); - auto std_tensor = LoDTensor(); - std_tensor.Resize(bn_bias_tensor.dims()); - std_tensor = - tensor_apply(*variance_tensor, [&](float x) { return x + epsilon; }); + ConstEigenVectorArrayMap scale_array(scale_tensor->data(), + scale_tensor->numel(), 1); + EigenVectorArrayMap variance_array( + variance_tensor->mutable_data(platform::CPUPlace()), + variance_tensor->numel(), 1); + ConstEigenVectorArrayMap mean_array(mean_tensor->data(), + mean_tensor->numel(), 1); + ConstEigenVectorArrayMap bn_bias_array(bn_bias_tensor.data(), + bn_bias_tensor.numel(), 1); - using EigenVectorArrayMap = - Eigen::Map>; + // variance will not be used anymore, so make it std_array and then tmp_array + variance_array += epsilon; + variance_array = variance_array.sqrt(); + variance_array = scale_array / variance_array; + + EigenVectorArrayMap eltwise_y_in_array( + eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), + eltwise_y_in_tensor->numel(), 1); - EigenVectorArrayMap std_vec( - std_tensor.mutable_data(platform::CPUPlace()), std_tensor.numel(), - 1); - std_vec = std_vec.sqrt(); - auto tmp_tensor = - tensor_apply_eltwise(*scale_tensor, std_tensor, std::divides()); - auto tensor_minus = tensor_apply_eltwise(*eltwise_y_in_tensor, *mean_tensor, - std::minus()); - auto tensor_mul = - tensor_apply_eltwise(tensor_minus, tmp_tensor, std::multiplies()); - *eltwise_y_in_tensor = - tensor_apply_eltwise(tensor_mul, bn_bias_tensor, std::plus()); + eltwise_y_in_array = + ((eltwise_y_in_array - mean_array) * variance_array) + bn_bias_array; // Re-compute weight of conv2d from BN - auto* current_param = - scope->FindVar(conv_weight->Name())->GetMutable(); - recompute_conv_weights(current_param, &tmp_tensor); + auto* weights = scope->FindVar(conv_weight->Name())->GetMutable(); + auto weights_shape = weights->dims(); + auto weights_shape_2d = flatten_to_2d(weights_shape, 1); + + EigenMatrixArrayMap weights_array_2d( + weights->mutable_data(platform::CPUPlace()), weights_shape_2d[0], + weights_shape_2d[1]); + + weights_array_2d.colwise() *= variance_array; } std::unique_ptr ConvBNFusePass::ApplyImpl( diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index f06bad6c78c05804e583f859906b88fb7b500372..e8adabd26540754d5b9206294eeeed79757220bf 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -307,6 +307,10 @@ ParallelExecutor::~ParallelExecutor() { } } } + + // member_ must be destructed before gcs_ since the destructor of + // ReferenceCountOpHandle use raw pointers of gcs_ inside. + member_.reset(); } } // namespace framework diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index fd386a5987f11ff64964e95eb7e9b83572dc790c..ef09b98b2aa91a9d729b94d15dbb676dde4092b6 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -75,7 +75,7 @@ class ParallelExecutor { private: void BCastParamsToDevices(const std::unordered_set &vars) const; - ParallelExecutorPrivate *member_; + std::unique_ptr member_; #ifdef PADDLE_WITH_CUDA // ref_cnts_ is only initialized when ParallelExecutor constructs, and then diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 1a727a2c8c759d010606d5b605823b7252b35c69..a4abd1b1283f08fb8431fbeea0cea17c8439fdd7 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -49,18 +49,18 @@ int64_t GetEagerDeletionThreshold() { Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); kids_.push_back(new Scope(this)); return *kids_.back(); } Variable* Scope::Var(const std::string& name) { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); return VarInternal(name); } Variable* Scope::Var(std::string* name) { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); auto new_name = string::Sprintf("%p.%d", this, vars_.size()); if (name != nullptr) { *name = new_name; @@ -69,29 +69,34 @@ Variable* Scope::Var(std::string* name) { } Variable* Scope::FindVar(const std::string& name) const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); return FindVarInternal(name); } +Variable* Scope::FindLocalVar(const std::string& name) const { + std::lock_guard lock(mutex_); + return FindVarLocally(name); +} + const Scope* Scope::FindScope(const Variable* var) const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); return FindScopeInternal(var); } void Scope::DropKids() { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); for (Scope* s : kids_) delete s; kids_.clear(); } bool Scope::HasKid(const Scope* scope) const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); return it != this->kids_.end(); } std::vector Scope::LocalVarNames() const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); std::vector known_vars; known_vars.reserve(this->vars_.size()); for (auto& p : vars_) { @@ -101,7 +106,7 @@ std::vector Scope::LocalVarNames() const { } void Scope::DeleteScope(Scope* scope) const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); this->kids_.erase(it); @@ -114,7 +119,7 @@ void Scope::DeleteScope(Scope* scope) const { } void Scope::EraseVars(const std::vector& var_names) { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); std::set var_set(var_names.begin(), var_names.end()); for (auto it = vars_.begin(); it != vars_.end();) { if (var_set.find(it->first) != var_set.end()) { @@ -127,12 +132,12 @@ void Scope::EraseVars(const std::vector& var_names) { void Scope::Rename(const std::string& origin_name, const std::string& new_name) const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); RenameInternal(origin_name, new_name); } std::string Scope::Rename(const std::string& origin_name) const { - std::unique_lock lock(mutex_); + std::lock_guard lock(mutex_); auto new_name = string::Sprintf("%p.%d", this, vars_.size()); RenameInternal(origin_name, new_name); return new_name; diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index e42fff1d79d92fb7ed61768a614d8cd98f6775a0..14f9f36812d690fc4a7440f2e7e6a85e9993a535 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -63,6 +63,11 @@ class Scope { /// Caller doesn't own the returned Variable. Variable* FindVar(const std::string& name) const; + /// Find a variable in the current scope. + /// Return nullptr if cannot find. + /// Caller doesn't own the returned Variable. + Variable* FindLocalVar(const std::string& name) const; + const Scope* parent() const { return parent_; } /// Find the scope or an ancestor scope that contains the given variable. diff --git a/paddle/fluid/operators/adadelta_op.cc b/paddle/fluid/operators/adadelta_op.cc index d1970515f58969948b1d2db5847e4344112f77f9..89a7a49e0fa8427826f5d91274912a68f2316b61 100644 --- a/paddle/fluid/operators/adadelta_op.cc +++ b/paddle/fluid/operators/adadelta_op.cc @@ -18,6 +18,7 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; + class AdadeltaOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -31,6 +32,16 @@ class AdadeltaOp : public framework::OperatorWithKernel { "Input(AvgSquaredGrad) of AdadeltaOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("AvgSquaredUpdate"), "Input(AvgSquaredUpdate) of AdadeltaOp should not be null."); + PADDLE_ENFORCE( + ctx->GetInputsVarType("Param").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); + PADDLE_ENFORCE( + ctx->GetInputsVarType("Grad").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front()); PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), "Output(ParamOut) of AdadeltaOp should not be null."); @@ -56,6 +67,7 @@ class AdadeltaOp : public framework::OperatorWithKernel { ctx->SetOutputDim("AvgSquaredGradOut", param_dim); ctx->SetOutputDim("AvgSquaredUpdateOut", param_dim); } + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { auto input_data_type = diff --git a/paddle/fluid/operators/adadelta_op.h b/paddle/fluid/operators/adadelta_op.h index 822458daf663d99bbb38d99205f51163a0df4c4d..6c616aa03d9809e9b7725a700c7edd5ff5d6dc42 100644 --- a/paddle/fluid/operators/adadelta_op.h +++ b/paddle/fluid/operators/adadelta_op.h @@ -23,6 +23,17 @@ template class AdadeltaOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE(param_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Param").front(), param_var->Type().name()); + const auto* grad_var = ctx.InputVar("Grad"); + PADDLE_ENFORCE(grad_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Grad").front(), grad_var->Type().name()); + auto param_out_tensor = ctx.Output("ParamOut"); auto avg_squared_grad_out_tensor = ctx.Output("AvgSquaredGradOut"); diff --git a/paddle/fluid/operators/adagrad_op.h b/paddle/fluid/operators/adagrad_op.h index df520fcc898ff5514927dbdd845ecaecdcf3c147..0a16ce00f71586ef55007c3753e024be29d0ed56 100644 --- a/paddle/fluid/operators/adagrad_op.h +++ b/paddle/fluid/operators/adagrad_op.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -21,25 +22,31 @@ namespace operators { template struct SparseAdagradFunctor { - void operator()(const DeviceContext& context, - const framework::SelectedRows& grad, - const framework::Tensor& learning_rate, T epsilon, - framework::Tensor* moment, framework::Tensor* param); + void operator()(const DeviceContext &context, + const framework::SelectedRows &grad, + const framework::Tensor &learning_rate, T epsilon, + framework::Tensor *moment, framework::Tensor *param); }; template class AdagradOpKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* param_out_tensor = ctx.Output("ParamOut"); - auto* moment_out_tensor = ctx.Output("MomentOut"); + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE(param_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Param").front(), param_var->Type().name()); + + auto *param_out_tensor = ctx.Output("ParamOut"); + auto *moment_out_tensor = ctx.Output("MomentOut"); param_out_tensor->mutable_data(ctx.GetPlace()); moment_out_tensor->mutable_data(ctx.GetPlace()); T epsilon = static_cast(ctx.Attr("epsilon")); - auto* grad_var = ctx.InputVar("Grad"); + auto *grad_var = ctx.InputVar("Grad"); if (grad_var->IsType()) { auto param = framework::EigenVector::Flatten( *ctx.Input("Param")); @@ -47,16 +54,16 @@ class AdagradOpKernel : public framework::OpKernel { *ctx.Input("Grad")); auto moment = framework::EigenVector::Flatten( *ctx.Input("Moment")); - auto* learning_rate = ctx.Input("LearningRate"); + auto *learning_rate = ctx.Input("LearningRate"); auto param_out = framework::EigenVector::Flatten(*param_out_tensor); auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); - auto* place = ctx.template device_context().eigen_device(); + auto *place = ctx.template device_context().eigen_device(); moment_out.device(*place) = moment + grad * grad; Eigen::DSizes m_dsize(moment_out_tensor->numel()); if (platform::is_cpu_place(ctx.GetPlace())) { - auto* lr = learning_rate->data(); + auto *lr = learning_rate->data(); param_out.device(*place) = param - lr[0] * grad / (moment_out.sqrt() + epsilon); } else { @@ -66,10 +73,10 @@ class AdagradOpKernel : public framework::OpKernel { lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); } } else if (grad_var->IsType()) { - auto* param_tensor = ctx.Input("Param"); + auto *param_tensor = ctx.Input("Param"); PADDLE_ENFORCE_EQ(param_tensor, param_out_tensor); - auto* moment_tensor = ctx.Input("Moment"); + auto *moment_tensor = ctx.Input("Moment"); PADDLE_ENFORCE_EQ(moment_tensor, moment_out_tensor); SparseAdagradFunctor functor; diff --git a/paddle/fluid/operators/adam_op.h b/paddle/fluid/operators/adam_op.h index 4cb1f3a80e95bdda79e6451dc3cc87e899b11779..3455d1ee54e8e6e498d0b0e6932ec099af9c0b30 100644 --- a/paddle/fluid/operators/adam_op.h +++ b/paddle/fluid/operators/adam_op.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/operators/math/algorithm.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/platform/for_range.h" @@ -199,23 +200,9 @@ struct SparseAdamFunctor { row_numel_(row_numel), row_count_(row_count) {} - inline HOSTDEVICE int64_t BinarySearchInRows(int64_t row) const { - int64_t beg = 0, end = row_count_ - 1; - while (beg <= end) { - auto mid = ((beg + end) >> 1); - if (rows_[mid] == row) - return mid; - else if (rows_[mid] < row) - beg = mid + 1; - else - end = mid - 1; - } - return -1; - } - inline HOSTDEVICE void operator()(size_t i) const { - int64_t row = i / row_numel_; - auto row_idx = BinarySearchInRows(row); + auto row_idx = + math::BinarySearch(rows_, row_count_, i / row_numel_); T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0; // The following code is the same as dense @@ -244,6 +231,12 @@ template class AdamOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE(param_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Param").front(), param_var->Type().name()); + using paddle::framework::LoDTensor; using paddle::operators::detail::Ref; diff --git a/paddle/fluid/operators/adamax_op.cc b/paddle/fluid/operators/adamax_op.cc index 32062574bcf71ff96e451eaa6865b6bbfc3b1c80..d4aa4d338a2379adf985ba7f89b528bc402eda06 100644 --- a/paddle/fluid/operators/adamax_op.cc +++ b/paddle/fluid/operators/adamax_op.cc @@ -35,6 +35,16 @@ class AdamaxOp : public framework::OperatorWithKernel { "Input(LearningRate) of AdamaxOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"), "Input(Beta1Pow) of AdamaxOp should not be null."); + PADDLE_ENFORCE( + ctx->GetInputsVarType("Param").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); + PADDLE_ENFORCE( + ctx->GetInputsVarType("Grad").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front()); PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), "Output(ParamOut) of AdamaxOp should not be null."); diff --git a/paddle/fluid/operators/adamax_op.h b/paddle/fluid/operators/adamax_op.h index de644676fd9c3fabdbf01d2fd9c69858c2627ed3..7137fbd9651b4523f6d1609a0595b30758aa40df 100644 --- a/paddle/fluid/operators/adamax_op.h +++ b/paddle/fluid/operators/adamax_op.h @@ -23,6 +23,17 @@ template class AdamaxOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE(param_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Param").front(), param_var->Type().name()); + const auto* grad_var = ctx.InputVar("Grad"); + PADDLE_ENFORCE(grad_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Grad").front(), grad_var->Type().name()); + auto param_out_tensor = ctx.Output("ParamOut"); auto moment_out_tensor = ctx.Output("MomentOut"); auto inf_norm_out_tensor = ctx.Output("InfNormOut"); diff --git a/paddle/fluid/operators/decayed_adagrad_op.cc b/paddle/fluid/operators/decayed_adagrad_op.cc index c0f2b49a04d9e88502c4b63bca493cd2b7ad1c5c..d73ae9e2721b388212cb6efa354eb4b480df9cad 100644 --- a/paddle/fluid/operators/decayed_adagrad_op.cc +++ b/paddle/fluid/operators/decayed_adagrad_op.cc @@ -32,6 +32,16 @@ class DecayedAdagradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE( ctx->HasInput("LearningRate"), "Input(LearningRate) of DecayedAdagradOp should not be null."); + PADDLE_ENFORCE( + ctx->GetInputsVarType("Param").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); + PADDLE_ENFORCE( + ctx->GetInputsVarType("Grad").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front()); PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), "Output(ParamOut) of DecayedAdagradOp should not be null."); diff --git a/paddle/fluid/operators/decayed_adagrad_op.h b/paddle/fluid/operators/decayed_adagrad_op.h index a46af078e0c6b4bf1faca0570b6a97b026864f13..5df43d33ef9f720fd20d57c53ff37cc85440b24e 100644 --- a/paddle/fluid/operators/decayed_adagrad_op.h +++ b/paddle/fluid/operators/decayed_adagrad_op.h @@ -23,6 +23,17 @@ template class DecayedAdagradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE(param_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Param").front(), param_var->Type().name()); + const auto* grad_var = ctx.InputVar("Grad"); + PADDLE_ENFORCE(grad_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Grad").front(), grad_var->Type().name()); + auto param_out_tensor = ctx.Output("ParamOut"); auto moment_out_tensor = ctx.Output("MomentOut"); diff --git a/paddle/fluid/operators/ftrl_op.cc b/paddle/fluid/operators/ftrl_op.cc index 70ba25c213046cc934f46be067080d5fdbb42f9e..b77e12d6508eb07ae137b313ca91eac951afbcbe 100644 --- a/paddle/fluid/operators/ftrl_op.cc +++ b/paddle/fluid/operators/ftrl_op.cc @@ -34,6 +34,16 @@ class FTRLOp : public framework::OperatorWithKernel { "Input(Grad) of FTRL should not be null."); PADDLE_ENFORCE(ctx->HasInput("LearningRate"), "Input(LearningRate) of FTRL should not be null."); + PADDLE_ENFORCE( + ctx->GetInputsVarType("Param").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); + PADDLE_ENFORCE( + ctx->GetInputsVarType("Grad").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front()); PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), "Output(ParamOut) of FTRL should not be null."); diff --git a/paddle/fluid/operators/ftrl_op.h b/paddle/fluid/operators/ftrl_op.h index 6f821e7e9944214fc5ebdf6bc7db8789b8ada6b9..8f812c9a037bfac8c1e29e32a5ad5b077c8153d1 100644 --- a/paddle/fluid/operators/ftrl_op.h +++ b/paddle/fluid/operators/ftrl_op.h @@ -28,6 +28,17 @@ template class FTRLOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE(param_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Param").front(), param_var->Type().name()); + const auto* grad_var = ctx.InputVar("Grad"); + PADDLE_ENFORCE(grad_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Grad").front(), grad_var->Type().name()); + auto* param_out = ctx.Output("ParamOut"); auto* sq_accum_out = ctx.Output("SquaredAccumOut"); auto* lin_accum_out = ctx.Output("LinearAccumOut"); diff --git a/paddle/fluid/operators/math/algorithm.h b/paddle/fluid/operators/math/algorithm.h new file mode 100644 index 0000000000000000000000000000000000000000..262469beea7449eb5820b86de1ac4f790a833e79 --- /dev/null +++ b/paddle/fluid/operators/math/algorithm.h @@ -0,0 +1,44 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include // for int64_t +#include + +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { +namespace math { + +template +HOSTDEVICE inline int64_t BinarySearch(const T *x, int64_t num, const T &val) { + int64_t beg = 0, end = num - 1; + while (beg <= end) { + auto mid = ((beg + end) >> 1); + if (x[mid] == val) + return mid; + else if (x[mid] < val) + beg = mid + 1; + else + end = mid - 1; + } + return -1; +} + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index f6fe2bc2f674e333c4c65f3301f9eae55cbb0fe9..1c0e88f0752a9bfe5ced7a012724fc546088d068 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include +#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" namespace paddle { diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h index 8dc17478e6a4e8c60741b72233eee3cd8d33428e..521c53dd0d71707c13c4364c5ee59943a03d4a2d 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.h +++ b/paddle/fluid/operators/math/selected_rows_functor.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include #include "paddle/fluid/framework/eigen.h" diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index 69318a6598c8c69eceab7216df6382537153d34f..235b5405fb7d016f4bd8c738f75b303522183116 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/math/sequence_pooling.h" #include + +#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/sequence_pooling.h" namespace paddle { namespace operators { @@ -180,6 +182,7 @@ class SequencePoolFunctor { } auto lod = input.lod()[0]; auto& place = *context.eigen_device(); + auto blas = math::GetBlas(context); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { Tensor in_t = input.Slice(static_cast(lod[i]), static_cast(lod[i + 1])); @@ -191,7 +194,14 @@ class SequencePoolFunctor { if (pooltype == "AVERAGE") { out_e.device(place) = in_e.mean(Eigen::array({{0}})); } else if (pooltype == "SUM") { - out_e.device(place) = in_e.sum(Eigen::array({{0}})); + if (h > 0) { + const T* in_data = in_t.data(); + T* out_data = out_t.mutable_data(context.GetPlace()); + blas.VCOPY(w, in_data, out_data); + for (int64_t r = 1; r != h; ++r) { + blas.AXPY(w, 1., in_data + r * w, out_data); + } + } } else if (pooltype == "SQRT") { out_e.device(place) = in_e.sum(Eigen::array({{0}})) / std::sqrt(static_cast(h)); @@ -223,6 +233,7 @@ class SequencePoolGradFunctor { } auto lod = in_grad->lod()[0]; auto& place = *context.eigen_device(); + auto blas = math::GetBlas(context); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { auto in_g_t = in_grad->Slice(static_cast(lod[i]), static_cast(lod[i + 1])); @@ -237,7 +248,11 @@ class SequencePoolGradFunctor { if (pooltype == "AVERAGE") { in_g_e.device(place) = (out_g_e / static_cast(h)).broadcast(bcast); } else if (pooltype == "SUM") { - in_g_e.device(place) = (out_g_e).broadcast(bcast); + const T* out_g_data = out_g_t.data(); + T* in_g_data = in_g_t.mutable_data(context.GetPlace()); + for (int r = 0; r != h; ++r) { + blas.VCOPY(w, out_g_data, in_g_data + r * w); + } } else if (pooltype == "SQRT") { in_g_e.device(place) = (out_g_e / std::sqrt(static_cast(h))).broadcast(bcast); diff --git a/paddle/fluid/operators/momentum_op.cc b/paddle/fluid/operators/momentum_op.cc index 5f43c5810812260c4384349bdb709716c9a182f5..c8079a99fb8c8e05144c5390794db7757e74f6ae 100644 --- a/paddle/fluid/operators/momentum_op.cc +++ b/paddle/fluid/operators/momentum_op.cc @@ -33,6 +33,11 @@ class MomentumOp : public framework::OperatorWithKernel { "Input(velocity) of Momentum should not be null."); PADDLE_ENFORCE(ctx->HasInput("LearningRate"), "Input(LearningRate) of Momentum should not be null."); + PADDLE_ENFORCE( + ctx->GetInputsVarType("Param").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), "Output(ParamOut) of Momentum should not be null."); diff --git a/paddle/fluid/operators/momentum_op.cu b/paddle/fluid/operators/momentum_op.cu index a3932db1f3a50305d585cd3d5e86fa1b527df78b..5dc920c70979ad5c3d4fb3acc8d2dacaffe386c0 100644 --- a/paddle/fluid/operators/momentum_op.cu +++ b/paddle/fluid/operators/momentum_op.cu @@ -46,6 +46,17 @@ template class MomentumOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE(param_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Param").front(), param_var->Type().name()); + const auto* grad_var = ctx.InputVar("Grad"); + PADDLE_ENFORCE(grad_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Grad").front(), grad_var->Type().name()); + auto param_out = ctx.Output("ParamOut"); auto velocity_out = ctx.Output("VelocityOut"); auto param = ctx.Input("Param"); diff --git a/paddle/fluid/operators/momentum_op.h b/paddle/fluid/operators/momentum_op.h index 264726040fb566a52b8c0cdee0a1524197d2a675..40073d21b7186ad319e4988e667750c267581300 100644 --- a/paddle/fluid/operators/momentum_op.h +++ b/paddle/fluid/operators/momentum_op.h @@ -23,6 +23,12 @@ template class MomentumOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE(param_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Param").front(), param_var->Type().name()); + auto param_out = ctx.Output("ParamOut"); auto velocity_out = ctx.Output("VelocityOut"); auto param = ctx.Input("Param"); diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index 28cc91a5ed5d74994e5b960a0a4dd3c6a5e6cdcc..51b980acb5a08d431d96a3a92479dec09119c27e 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -31,8 +31,8 @@ class BlockingQueue { // is a workaround and a simplified version of framework::Channel as it // doesn't support GPU and it implements on buffered blocking queue. public: - explicit BlockingQueue(size_t capacity) - : capacity_(capacity), closed_(false) { + explicit BlockingQueue(size_t capacity, bool speed_test_mode = false) + : capacity_(capacity), speed_test_mode_(speed_test_mode), closed_(false) { PADDLE_ENFORCE_GT( capacity_, 0, "The capacity of a reader::BlockingQueue must be greater than 0."); @@ -72,7 +72,9 @@ class BlockingQueue { if (!queue_.empty()) { PADDLE_ENFORCE_NOT_NULL(elem); *elem = queue_.front(); - queue_.pop_front(); + if (LIKELY(!speed_test_mode_)) { + queue_.pop_front(); + } send_cv_.notify_one(); return true; } else { @@ -114,6 +116,7 @@ class BlockingQueue { private: size_t capacity_; + bool speed_test_mode_; bool closed_; std::deque queue_; diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h index 4f7cfc24ec035349f3c85e84d876ad9b5b5493a6..3f041ff7e4e32b407729a22aab25d3aab199fee0 100644 --- a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h +++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h @@ -33,8 +33,9 @@ class LoDTensorBlockingQueue { private: LoDTensorBlockingQueue(size_t capacity, - const std::vector& dims) - : queue_(capacity), dims_(dims) {} + const std::vector& dims, + bool speed_test_mode = false) + : queue_(capacity, speed_test_mode), dims_(dims) {} public: bool Push(const std::vector& lod_tensor_vec) { @@ -69,11 +70,12 @@ class LoDTensorBlockingQueue { class LoDTensorBlockingQueueHolder { public: - void InitOnce(size_t capacity, const std::vector& dims) { + void InitOnce(size_t capacity, const std::vector& dims, + bool speed_test_mode = false) { PADDLE_ENFORCE( queue_ == nullptr, "LoDTensorBlockingQueueHolder::InitOnce() can only be called once"); - queue_.reset(new LoDTensorBlockingQueue(capacity, dims)); + queue_.reset(new LoDTensorBlockingQueue(capacity, dims, speed_test_mode)); } inline const std::shared_ptr& GetQueue() const { diff --git a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc index 7d1b381d56c8cdc1e79e594b18c1a1ed59ab5284..bd7ac64b2fce2452744e4756b149ee7f291d38aa 100644 --- a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc +++ b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc @@ -217,3 +217,27 @@ TEST(BlockingQueue, MyClassTest) { q.Receive(&b); EXPECT_EQ(a.val_, b.val_); } + +TEST(BlockingQueue, speed_test_mode) { + size_t queue_size = 10; + BlockingQueue q1(queue_size, false); + for (size_t i = 0; i < queue_size; ++i) { + q1.Send(i); + } + size_t b; + for (size_t i = 0; i < queue_size; ++i) { + q1.Receive(&b); + EXPECT_EQ(b, i); + } + EXPECT_EQ(q1.Size(), 0); + + BlockingQueue q2(queue_size, true); + for (size_t i = 0; i < queue_size; ++i) { + q2.Send(i); + } + for (size_t i = 0; i < queue_size; ++i) { + q2.Receive(&b); + EXPECT_EQ(b, 0); + } + EXPECT_EQ(q2.Size(), queue_size); +} diff --git a/paddle/fluid/operators/rmsprop_op.cc b/paddle/fluid/operators/rmsprop_op.cc index 2f773f222e50a440801b06a4fd997bf237b34772..f06f87e61d3a4d1fc8b864b9dd84e697fb12a006 100644 --- a/paddle/fluid/operators/rmsprop_op.cc +++ b/paddle/fluid/operators/rmsprop_op.cc @@ -32,6 +32,11 @@ class RmspropOp : public framework::OperatorWithKernel { "Input(Grad) of RmspropOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Moment"), "Input(Moment) of RmspropOp should not be null."); + PADDLE_ENFORCE( + ctx->GetInputsVarType("Param").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), "Output(param_out) of RmspropOp should not be null."); diff --git a/paddle/fluid/operators/rmsprop_op.h b/paddle/fluid/operators/rmsprop_op.h index 25ed32c5ebb2ff5be962ac1e3e38c970623d705c..797cd45fdcdbd5c3567d1676f37e148304ee6e2d 100644 --- a/paddle/fluid/operators/rmsprop_op.h +++ b/paddle/fluid/operators/rmsprop_op.h @@ -13,66 +13,254 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/algorithm.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/for_range.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; template using EigenVector = framework::EigenVector; +template +struct DenseRmspropGradFunctor { + inline explicit DenseRmspropGradFunctor(const T *grad) : grad_(grad) {} + + HOSTDEVICE inline T operator()(int64_t idx) const { return grad_[idx]; } + + const T *grad_; +}; + +template +struct SparseRmspropGradFunctor { + inline SparseRmspropGradFunctor(const T *grad, const int64_t *rows, + int64_t row_numel, int64_t row_count) + : grad_(grad), + rows_(rows), + row_numel_(row_numel), + row_count_(row_count) {} + + HOSTDEVICE inline T operator()(int64_t idx) const { + auto row_idx = math::BinarySearch(rows_, row_count_, idx / row_numel_); + return row_idx >= 0 ? grad_[row_idx * row_numel_ + idx % row_numel_] : 0; + } + + const T *grad_; + const int64_t *rows_; + int64_t row_numel_; + int64_t row_count_; +}; + +template +struct UncenteredRmspropFunctor { + UncenteredRmspropFunctor(T *param, T *ms, T *mom, const T *lr, T rho, + T epsilon, T momentum, + const GradFunctor &grad_functor) + : param_(param), + ms_(ms), + mom_(mom), + lr_(lr), + rho_(rho), + epsilon_(epsilon), + momentum_(momentum), + grad_functor_(grad_functor) {} + + HOSTDEVICE inline void operator()(int64_t idx) const { + T g = grad_functor_(idx); + T ms_out = rho_ * ms_[idx] + (1 - rho_) * g * g; + T mom_out = momentum_ * mom_[idx] + lr_[0] * g / sqrt(ms_out + epsilon_); + param_[idx] -= mom_out; + ms_[idx] = ms_out; + mom_[idx] = mom_out; + } + + T *param_; + T *ms_; + T *mom_; + const T *lr_; + T rho_; + T epsilon_; + T momentum_; + GradFunctor grad_functor_; +}; + +template +struct CenteredRmspropFunctor { + CenteredRmspropFunctor(T *param, T *ms, T *mom, T *mean_grad, const T *lr, + T rho, T epsilon, T momentum, + const GradFunctor &grad_functor) + : param_(param), + ms_(ms), + mom_(mom), + mean_grad_(mean_grad), + lr_(lr), + rho_(rho), + epsilon_(epsilon), + momentum_(momentum), + grad_functor_(grad_functor) {} + + HOSTDEVICE inline void operator()(int64_t idx) const { + T g = grad_functor_(idx); + T ms_out = rho_ * ms_[idx] + (1 - rho_) * g * g; + T mg_out = rho_ * mean_grad_[idx] + (1 - rho_) * g; + T mom_out = momentum_ * mom_[idx] + + lr_[0] * g / sqrt(ms_out - mg_out * mg_out + epsilon_); + param_[idx] -= mom_out; + ms_[idx] = ms_out; + mom_[idx] = mom_out; + mean_grad_[idx] = mg_out; + } + + T *param_; + T *ms_; + T *mom_; + T *mean_grad_; + const T *lr_; + T rho_; + T epsilon_; + T momentum_; + GradFunctor grad_functor_; +}; + template class RmspropOpKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* param_out = ctx.Output("ParamOut"); - auto* moment_out = ctx.Output("MomentOut"); - auto* mean_square_out = ctx.Output("MeanSquareOut"); + void Compute(const framework::ExecutionContext &ctx) const override { + using LoDTensor = framework::LoDTensor; + auto *grad_var = ctx.InputVar("Grad"); + auto *param_out = ctx.Output("ParamOut"); + auto *moment_out = ctx.Output("MomentOut"); + auto *mean_square_out = ctx.Output("MeanSquareOut"); - auto grad = ctx.Input("Grad"); + auto epsilon = static_cast(ctx.Attr("epsilon")); + auto rho = static_cast(ctx.Attr("decay")); + auto momentum = static_cast(ctx.Attr("momentum")); + bool centered = ctx.Attr("centered"); - param_out->mutable_data(ctx.GetPlace()); - moment_out->mutable_data(ctx.GetPlace()); - mean_square_out->mutable_data(ctx.GetPlace()); + auto &p_tensor = *ctx.Input("Param"); + auto &ms_tensor = *ctx.Input("MeanSquare"); + auto &lr_tensor = *ctx.Input("LearningRate"); + auto &mom_tensor = *ctx.Input("Moment"); - float epsilon = ctx.Attr("epsilon"); - float rho = ctx.Attr("decay"); - float momentum = ctx.Attr("momentum"); - bool centered = ctx.Attr("centered"); + PADDLE_ENFORCE_EQ(&p_tensor, param_out, + "Param and ParamOut must be the same Tensor"); + PADDLE_ENFORCE_EQ(&mom_tensor, moment_out, + "Moment and MomentOut must be the same Tensor"); + PADDLE_ENFORCE_EQ(&ms_tensor, mean_square_out, + "MeanSquare and MeanSquareOut must be the same Tensor"); + + auto &dev_ctx = ctx.template device_context(); + size_t limit = static_cast(ms_tensor.numel()); + + if (grad_var->IsType()) { + auto &grad_tensor = grad_var->Get(); + + if (std::is_same::value) { + auto &place = + *ctx.template device_context().eigen_device(); + auto lr_value = lr_tensor.data()[0]; + + auto p = EigenVector::Flatten(p_tensor); + auto ms = EigenVector::Flatten(ms_tensor); + auto g = EigenVector::Flatten(grad_tensor); + auto mom = EigenVector::Flatten(mom_tensor); + + auto p_out = EigenVector::Flatten(*param_out); + auto mom_out = EigenVector::Flatten(*moment_out); + auto ms_out = EigenVector::Flatten(*mean_square_out); + + ms_out.device(place) = rho * ms + (1 - rho) * g * g; + if (centered) { + auto &mg_tensor = *ctx.Input("MeanGrad"); + auto mg = EigenVector::Flatten(mg_tensor); + auto *mean_grad_out = ctx.Output("MeanGradOut"); + PADDLE_ENFORCE(&mg_tensor, mean_grad_out, + "MeanGrad and MeanGradOut must be the same Tensor"); + auto mg_out = EigenVector::Flatten(*mean_grad_out); + + mg_out.device(place) = rho * mg + (1 - rho) * g; + mom_out.device(place) = + momentum * mom + + lr_value * g / (ms_out - mg_out.square() + epsilon).sqrt(); + } else { + mom_out.device(place) = + momentum * mom + lr_value * g / (ms_out + epsilon).sqrt(); + } + p_out.device(place) = p - mom_out; + } else { + DenseRmspropGradFunctor grad_func(grad_tensor.data()); + platform::ForRange for_range(dev_ctx, limit); + if (centered) { + auto &mg_tensor = *ctx.Input("MeanGrad"); + auto *mean_grad_out = ctx.Output("MeanGradOut"); + PADDLE_ENFORCE(&mg_tensor, mean_grad_out, + "MeanGrad and MeanGradOut must be the same Tensor"); + for_range(CenteredRmspropFunctor>( + param_out->mutable_data(ctx.GetPlace()), + mean_square_out->mutable_data(ctx.GetPlace()), + moment_out->mutable_data(ctx.GetPlace()), + mean_grad_out->mutable_data(ctx.GetPlace()), + lr_tensor.data(), rho, epsilon, momentum, grad_func)); + } else { + for_range(UncenteredRmspropFunctor>( + param_out->mutable_data(ctx.GetPlace()), + mean_square_out->mutable_data(ctx.GetPlace()), + moment_out->mutable_data(ctx.GetPlace()), lr_tensor.data(), + rho, epsilon, momentum, grad_func)); + } + } + } else if (grad_var->IsType()) { + auto &grad = grad_var->Get(); + auto *merged_grad = const_cast(ctx.scope()) + .Var() + ->GetMutable(); + + math::scatter::MergeAdd merge_func; + merge_func(dev_ctx, grad, merged_grad); + + platform::ForRange for_range(dev_ctx, limit); + const int64_t *rows; +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(ctx.GetPlace())) { + rows = merged_grad->rows().CUDAData(ctx.GetPlace()); + } else { +#endif + rows = merged_grad->rows().data(); +#ifdef PADDLE_WITH_CUDA + } +#endif + auto &merged_tensor = merged_grad->value(); + int64_t row_count = merged_grad->rows().size(); + int64_t row_numel = merged_tensor.numel() / row_count; + SparseRmspropGradFunctor grad_func(merged_tensor.data(), rows, + row_numel, row_count); - auto p = EigenVector::Flatten(*ctx.Input("Param")); - auto ms = EigenVector::Flatten(*ctx.Input("MeanSquare")); - auto lr = EigenVector::Flatten(*ctx.Input("LearningRate")); - auto g = EigenVector::Flatten(*grad); - auto mom = EigenVector::Flatten(*ctx.Input("Moment")); - - auto p_out = EigenVector::Flatten(*param_out); - auto mom_out = EigenVector::Flatten(*moment_out); - auto ms_out = EigenVector::Flatten(*mean_square_out); - auto& place = *ctx.template device_context().eigen_device(); - - Eigen::DSizes grad_dsize(static_cast(grad->numel())); - - ms_out.device(place) = rho * ms + (1 - rho) * g * g; - if (centered) { - auto mg = EigenVector::Flatten(*ctx.Input("MeanGrad")); - auto* mean_grad_out = ctx.Output("MeanGradOut"); - mean_grad_out->mutable_data(ctx.GetPlace()); - auto mg_out = EigenVector::Flatten(*mean_grad_out); - - mg_out.device(place) = rho * mg + (1 - rho) * g; - mom_out.device(place) = momentum * mom + - lr.broadcast(grad_dsize) * g / - (ms_out - mg_out.square() + epsilon).sqrt(); + if (centered) { + auto &mg_tensor = *ctx.Input("MeanGrad"); + auto *mean_grad_out = ctx.Output("MeanGradOut"); + PADDLE_ENFORCE(&mg_tensor, mean_grad_out, + "MeanGrad and MeanGradOut must be the same Tensor"); + for_range(CenteredRmspropFunctor>( + param_out->mutable_data(ctx.GetPlace()), + mean_square_out->mutable_data(ctx.GetPlace()), + moment_out->mutable_data(ctx.GetPlace()), + mean_grad_out->mutable_data(ctx.GetPlace()), lr_tensor.data(), + rho, epsilon, momentum, grad_func)); + } else { + for_range(UncenteredRmspropFunctor>( + param_out->mutable_data(ctx.GetPlace()), + mean_square_out->mutable_data(ctx.GetPlace()), + moment_out->mutable_data(ctx.GetPlace()), lr_tensor.data(), + rho, epsilon, momentum, grad_func)); + } } else { - mom_out.device(place) = - momentum * mom + - lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt(); + PADDLE_THROW("RMSProp only supports LoDTensor or SelectedRows gradient"); } - p_out.device(place) = p - mom_out; } }; diff --git a/paddle/fluid/operators/sgd_op.cc b/paddle/fluid/operators/sgd_op.cc index fef230e42d07a5ed73b7a7a6ab682694675bb9d2..411a126bc8e2b3a8d25f436489c13970568ccae4 100644 --- a/paddle/fluid/operators/sgd_op.cc +++ b/paddle/fluid/operators/sgd_op.cc @@ -21,7 +21,7 @@ class SGDOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { + void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasInput("Param"), "Input(Param) of SGDOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Grad"), @@ -42,7 +42,7 @@ class SGDOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { + const framework::ExecutionContext &ctx) const override { auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param")); return framework::OpKernelType(data_type, ctx.device_context()); } @@ -50,17 +50,20 @@ class SGDOp : public framework::OperatorWithKernel { class SGDOpInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override { - auto input_var = op_desc.Input("Param")[0]; - for (auto& out_var : op_desc.Output("ParamOut")) { - if (block->FindRecursiveOrCreateVar(input_var).GetType() == - framework::proto::VarType::SELECTED_ROWS) { - block->FindRecursiveOrCreateVar(out_var).SetType( - framework::proto::VarType::SELECTED_ROWS); - } else { - block->FindRecursiveOrCreateVar(out_var).SetType( - framework::proto::VarType::LOD_TENSOR); + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + auto input_var_n = op_desc.Input("Param")[0]; + auto in_var_type = block->FindRecursiveOrCreateVar(input_var_n).GetType(); + PADDLE_ENFORCE(in_var_type == framework::proto::VarType::SELECTED_ROWS || + in_var_type == framework::proto::VarType::LOD_TENSOR, + "The input Var's type should be LoDtensor or SelectedRows," + " but the received var(%s)'s type is %s", + input_var_n, in_var_type); + + for (auto &out_var_n : op_desc.Output("ParamOut")) { + auto &out_var = block->FindRecursiveOrCreateVar(out_var_n); + if (out_var.GetType() != in_var_type) { + out_var.SetType(in_var_type); } } } diff --git a/paddle/fluid/operators/sgd_op.cu b/paddle/fluid/operators/sgd_op.cu index 243609075713305a90dc162991166ba24d54e835..d3f4eba3b24ec1ac0328ef270256cdf3abe499db 100644 --- a/paddle/fluid/operators/sgd_op.cu +++ b/paddle/fluid/operators/sgd_op.cu @@ -56,6 +56,12 @@ template class SGDOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE(param_var->IsType(), + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Param").front(), param_var->Type().name()); + auto* param = ctx.Input("Param"); auto* param_out = ctx.Output("ParamOut"); auto* learning_rate = ctx.Input("LearningRate"); diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index dfc079e986e93c7f02f17b299e5d6293edbedd05..4286242b2a93d7046e7349a99d1d1a09dca09113 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -198,9 +198,9 @@ class CudnnHolder { CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place), cudnn_holder_(nullptr) { SetDeviceId(place_.device); - compute_capability = GetCUDAComputeCapability(place_.device); - multi_process = GetCUDAMultiProcessors(place_.device); - max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device); + compute_capability_ = GetCUDAComputeCapability(place_.device); + multi_process_ = GetCUDAMultiProcessors(place_.device); + max_threads_per_mp_ = GetCUDAMaxThreadsPerMultiProcessor(place_.device); PADDLE_ENFORCE(cudaStreamCreate(&stream_)); eigen_stream_.reset(new EigenCudaStreamDevice()); eigen_stream_->Reinitialize(&stream_, place); @@ -211,6 +211,16 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) cudnn_holder_.reset(new CudnnHolder(&stream_, place)); } + driver_version_ = GetCUDADriverVersion(place_.device); + runtime_version_ = GetCUDARuntimeVersion(place_.device); + + LOG(INFO) << "device: " << place_.device + << ", CUDA Capability: " << compute_capability_ + << ", Driver Version: " << driver_version_ / 1000 << "." + << (driver_version_ % 100) / 10 + << ", Runtime Version: " << runtime_version_ / 1000 << "." + << (runtime_version_ % 100) / 10; + callback_manager_.reset(new StreamCallbackManager(stream_)); } @@ -232,11 +242,11 @@ void CUDADeviceContext::Wait() const { } int CUDADeviceContext::GetComputeCapability() const { - return compute_capability; + return compute_capability_; } int CUDADeviceContext::GetMaxPhysicalThreadCount() const { - return multi_process * max_threads_per_mp; + return multi_process_ * max_threads_per_mp_; } Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 79539195157d74d4d757edee5e008cbb76c93ee2..e1ff1a1746952de5aa4bead361b50af4e99bc9bc 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -135,9 +135,11 @@ class CUDADeviceContext : public DeviceContext { cudaStream_t stream_; cublasHandle_t cublas_handle_; - int compute_capability; - int multi_process; - int max_threads_per_mp; + int compute_capability_; + int runtime_version_; + int driver_version_; + int multi_process_; + int max_threads_per_mp_; mutable std::mutex mtx_; diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index f04395a8ac00f33501008aa12f22773ddda9b138..a251bfcd9914422cb6300adbbcdef3dfa79f441c 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -130,6 +130,13 @@ struct EOFException : public std::exception { #define UNLIKELY(condition) (condition == 0) #endif +#if !defined(_WIN32) +#define LIKELY(condition) __builtin_expect(static_cast(condition), 1) +#else +// there is no equivalent intrinsics in msvc. +#define LIKELY(condition) (condition != 0) +#endif + template inline typename std::enable_if::type throw_on_error( bool stat, const Args&... args) { diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index f599e7fbc886a60394ae4690e4160275b55b8596..8fff9844db738dbd6508569a8aaeed044e445e5f 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -46,6 +46,24 @@ int GetCUDAComputeCapability(int id) { return device_prop.major * 10 + device_prop.minor; } +int GetCUDARuntimeVersion(int id) { + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); + int runtime_version = 0; + PADDLE_ENFORCE(cudaRuntimeGetVersion(&runtime_version), + "cudaRuntimeGetVersion failed in " + "paddle::platform::cudaRuntimeGetVersion"); + return runtime_version; +} + +int GetCUDADriverVersion(int id) { + PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); + int driver_version = 0; + PADDLE_ENFORCE(cudaDriverGetVersion(&driver_version), + "cudaDriverGetVersion failed in " + "paddle::platform::GetCUDADriverVersion"); + return driver_version; +} + int GetCUDAMultiProcessors(int id) { PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); int count; diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h index f4640d3eaa2165c35e8e14690d83e9e7e7168c0b..be44158431ff80a41f7fdf4dfd4d070667f2ac63 100644 --- a/paddle/fluid/platform/gpu_info.h +++ b/paddle/fluid/platform/gpu_info.h @@ -29,6 +29,12 @@ int GetCUDADeviceCount(); //! Get the compute capability of the ith GPU (format: major * 10 + minor) int GetCUDAComputeCapability(int i); +//! Get the runtime version of the ith GPU +int GetCUDARuntimeVersion(int id); + +//! Get the driver version of the ith GPU +int GetCUDADriverVersion(int id); + //! Get the MultiProcessors of the ith GPU. int GetCUDAMultiProcessors(int i); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 8d58e018c345f7c0ca188d4380f675d2bce2a847..339a7c98c6a2bba2cd46790cecc169ef447c63ce 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -57,6 +57,10 @@ limitations under the License. */ #include "pybind11/stl.h" +DEFINE_bool(reader_queue_speed_test_mode, false, + "If set true, the queue.pop will only get data from queue but not " + "remove the data from queue for speed testing"); + // disable auto conversion to list in Python PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); @@ -170,14 +174,14 @@ PYBIND11_PLUGIN(core) { A LoDTensor X can look like the example below. It contains 2 sequences. The first has length 2 and the second has length 3, as described by x.lod. - The first tensor dimension 6=2+3 is calculated from LoD if it's available. + The first tensor dimension 5=2+3 is calculated from LoD if it's available. It means the total number of sequence element. In X, each element has 2 - columns, hence [6, 2]. + columns, hence [5, 2]. x.lod = [[2, 3]] x.data = [[1, 2], [3, 4], - [5, 6], [7, 8], [9, 10], [11, 12]] - x.shape = [6, 2] + [5, 6], [7, 8], [9, 10]] + x.shape = [5, 2] LoD can have multiple levels (for example, a paragraph can have multiple sentences and a sentence can have multiple words). In the following @@ -380,7 +384,8 @@ All parameter, weight, gradient are variables in Paddle. return make_ddim(shape); }); auto *holder = var.GetMutable(); - holder->InitOnce(capacity, dims); + holder->InitOnce(capacity, dims, + FLAGS_reader_queue_speed_test_mode); return holder->GetQueue(); }, py::return_value_policy::copy); diff --git a/paddle/fluid/train/demo/README.md b/paddle/fluid/train/demo/README.md index 41b01d33828f750f67bba5f82cb7ed6fe4d4ea0a..191da20669e185d819ec5eed55427461cc0b10e4 100644 --- a/paddle/fluid/train/demo/README.md +++ b/paddle/fluid/train/demo/README.md @@ -15,7 +15,7 @@ cmake .. -DFLUID_INSTALL_DIR=$PADDLE_LIB \ -DWITH_MKL=OFF \ -DWITH_MKLDNN=OFF make -j8 -make -j8 inference_lib_dist +make -j8 fluid_lib_dist ``` ### step 2. generate program desc diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index b97e63ecc85872ebccb21f37bb4f2f00d49b4599..da6f5ca1586a570fb548d7b987330a8d58156e24 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -648,25 +648,25 @@ function gen_capi_package() { fi } -function gen_fluid_inference_lib() { +function gen_fluid_lib() { mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build if [[ ${WITH_C_API:-OFF} == "OFF" && ${WITH_INFERENCE:-ON} == "ON" ]] ; then cat <