diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 7355b67ab1020f58760f23b1a20ca189591db35e..0b95a780721b0771d55c4dbb2ddce33418612018 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -186,8 +186,7 @@ set(module "inference") copy(inference_lib DEPS ${inference_deps} SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.* ${src_dir}/${module}/api/paddle_*.h - ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} + DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ) set(module "platform") diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 26ecc1071fad70b46e81cd50eaf90c3addd444c3..894d8dda3ddb078e29931705bca4745848d71f91 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -97,8 +97,8 @@ paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_ti paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)) -paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0)) paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'non_leaf_num', 'ptable', 'pcode', 'is_costum', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, False, False)) +paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False)) paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None)) paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index d6b5ad4570c1d8402dedb8596cc75d9eae5a91c7..93288936fea1fae897dc26e6d8850da612960333 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -39,11 +39,12 @@ if (WITH_GPU) endif() cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass) +cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass) cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle) -set(SSA_GRAPH_EXECUTOR_DEPS graph 
framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass)
+set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass)
 if (WITH_GPU)
   list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
 endif()
diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.cc b/paddle/fluid/framework/details/all_reduce_deps_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fe21e21bcfc42bfb3251a7d0d15aa5926f56813f
--- /dev/null
+++ b/paddle/fluid/framework/details/all_reduce_deps_pass.cc
@@ -0,0 +1,125 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/fluid/framework/details/all_reduce_deps_pass.h"
+#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/details/op_graph_view.h"
+#include "paddle/fluid/framework/details/var_handle.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+static constexpr char kAllOpDescs[] = "all_op_descs";
+
+VarHandle* GetValidInput(const OpHandleBase* a) {
+  for (auto p : a->Inputs()) {
+    VarHandle* b = dynamic_cast<VarHandle*>(p);
+    if (b) {
+      return b;
+    }
+  }
+
+  return nullptr;
+}
+
+std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  auto graph_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
+
+  // get vars order
+  int order = 0;
+  std::unordered_map<std::string, int> vars;
+  // TODO(gongwb): use graph topology sort to find the order of operators.
+  // Note that the topology sort must be stable for this to work.
+  auto& ops = Get<const std::vector<OpDesc *>>(kAllOpDescs);
+  for (auto* op_desc : ops) {
+    auto outputs = op_desc->Outputs();
+    for (auto& o_it : outputs) {
+      for (auto& v : o_it.second) {  // values
+        vars[v] = order;
+      }
+    }
+    order++;
+  }
+
+  std::vector<OpHandleBase *> dist_ops;
+  // get allreduce ops.
+  for (auto& op : graph_ops) {
+    // FIXME(gongwb): also handle broadcast ops.
+    if (op->Name() == "all_reduce" || op->Name() == "reduce") {
+      dist_ops.push_back(op);
+    }
+  }
+
+  VLOG(10) << "dist_ops size:" << dist_ops.size() << std::endl;
+
+  std::sort(dist_ops.begin(), dist_ops.end(), [&](OpHandleBase* op1,
+                                                  OpHandleBase* op2) {
+    VarHandle* i0 = dynamic_cast<VarHandle*>(GetValidInput(op1));
+    VarHandle* i1 = dynamic_cast<VarHandle*>(GetValidInput(op2));
+
+    PADDLE_ENFORCE(i0 != nullptr && i1 != nullptr, "%s convert to %s error",
+                   op1->DebugString(), op2->DebugString());
+
+    auto l_it = vars.find(i0->name_);
+    auto r_it = vars.find(i1->name_);
+
+    if (l_it->second < r_it->second) return true;
+
+    if (l_it->second == r_it->second) {
+      return i0->name_ < i1->name_;
+    }
+
+    return false;
+  });
+
+  // add dependency.
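+  // The loop below chains the sorted collective ops with control
+  // dependencies: every all_reduce/reduce op hands a dummy variable to
+  // its successor, so each trainer launches these ops in the same
+  // program order and the underlying NCCL calls cannot interleave
+  // differently across trainers.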
+  auto& sorted_ops = dist_ops;
+  for (size_t i = 1; i < sorted_ops.size(); ++i) {
+    auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar());
+
+    auto* pre_op = sorted_ops[i - 1];
+    auto* op = sorted_ops[i];
+
+    pre_op->AddOutput(dep_var);
+    op->AddInput(dep_var);
+    graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
+
+    VLOG(10) << "add all_reduce sequential dependencies between " << pre_op
+             << " and " << op;
+
+    VLOG(10) << "pre_op:" << pre_op->DebugString()
+             << ", op:" << op->DebugString();
+  }
+
+  return graph;
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(all_reduce_deps_pass,
+              paddle::framework::details::AllReduceDepsPass)
+    .RequirePassAttr(paddle::framework::details::kAllOpDescs);
diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.h b/paddle/fluid/framework/details/all_reduce_deps_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..e8b91089816c71bc56ba7dba0105e85d73eb52ad
--- /dev/null
+++ b/paddle/fluid/framework/details/all_reduce_deps_pass.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+// TODO(gongwb): overlap allreduce with backward computation.
+class AllReduceDepsPass : public ir::Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index 70baced0ada33c23ba05cd2722e607edf847585a..523f9eadf2d7e2e08504c5920372fb7cdb0d7aba 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
 #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
+#include "paddle/fluid/framework/details/reduce_op_handle.h"
 #include "paddle/fluid/framework/details/sequential_execution_pass.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
@@ -24,6 +25,10 @@ namespace paddle {
 namespace framework {
 namespace details {
 
+static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) {
+  return (!strategy.enable_sequential_execution_ && strategy.num_trainers_ > 1);
+}
+
 class ParallelExecutorPassBuilder : public ir::PassBuilder {
  public:
   explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
@@ -70,6 +75,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
     // Verify that the graph is correct for multi-device executor.
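    // (The check runs before all_reduce_deps_pass is appended below, so
    // the control-dependency chain is added to an already-validated graph.)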
AppendPass("multi_devices_check_pass"); + if (SeqOnlyAllReduceOps(strategy)) { + AppendPass("all_reduce_deps_pass"); + } + if (strategy_.remove_unnecessary_lock_) { AppendPass("modify_op_lock_and_record_event_pass"); } @@ -124,6 +133,17 @@ std::unique_ptr BuildStrategy::Apply( pass->SetNotOwned("nccl_ctxs", nctx); #endif } else if (pass->Type() == "sequential_execution_pass") { + VLOG(1) << "set enable_sequential_execution:" + << enable_sequential_execution_; + + pass->Erase(kAllOpDescs); + pass->Set>( + kAllOpDescs, + new std::vector(main_program.Block(0).AllOps())); + } else if (pass->Type() == "all_reduce_deps_pass") { + VLOG(1) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this) + << ", num_trainers:" << num_trainers_; + pass->Erase(kAllOpDescs); pass->Set>( kAllOpDescs, @@ -144,4 +164,5 @@ USE_PASS(multi_devices_pass); USE_PASS(multi_devices_check_pass); USE_PASS(multi_devices_print_pass); USE_PASS(sequential_execution_pass); +USE_PASS(all_reduce_deps_pass); USE_PASS(modify_op_lock_and_record_event_pass); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 3236c35efdbf1175c3d06e531fc551f202ae17ad..9f0a25912886cea7a1f287125cfe8612e4b336eb 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -73,6 +73,7 @@ struct BuildStrategy { bool fuse_broadcast_op_{false}; + int num_trainers_{1}; bool remove_unnecessary_lock_{false}; // NOTE: diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 2c6e337568306502fbaa362015e51f81efc0a5ff..0e907e6fd3114991adcfe36aa13fa5e777bfd972 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/graph.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -54,7 +54,7 @@ class ParallelExecutorPrivate { Scope *global_scope_; // not owned std::unique_ptr executor_; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) std::unique_ptr nccl_ctxs_; #endif bool own_local_scope_; @@ -104,7 +104,7 @@ ParallelExecutor::ParallelExecutor( if (member_->use_cuda_) { // Bcast Parameters to all GPUs -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); ncclUniqueId *nccl_id = nullptr; if (nccl_id_var != nullptr) { @@ -124,7 +124,7 @@ ParallelExecutor::ParallelExecutor( // Step 2. Convert main_program to SSA form and dependency graph. 
Also, insert // ncclOp -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) std::unique_ptr graph = build_strategy.Apply( main_program, member_->places_, loss_var_name, params, member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get()); @@ -213,7 +213,7 @@ void ParallelExecutor::BCastParamsToDevices( } auto &dims = main_tensor.dims(); if (paddle::platform::is_gpu_place(main_tensor.place())) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) std::vector buffers; size_t numel = main_tensor.numel(); ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 7710ed7b613a180a4e24237a2cf6b5f212f677a3..cb88333d1570322fbac7112755bab5e11c97201a 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -76,7 +76,8 @@ void TestWord2vecPrediction(const std::string& model_path) { 0.000932706}; const size_t num_elements = outputs.front().data.length() / sizeof(float); // The outputs' buffers are in CPU memory. - for (size_t i = 0; i < std::min((size_t)5UL, num_elements); i++) { + for (size_t i = 0; i < std::min(static_cast(5UL), num_elements); + i++) { LOG(INFO) << "data: " << static_cast(outputs.front().data.data())[i]; PADDLE_ENFORCE(static_cast(outputs.front().data.data())[i], diff --git a/paddle/fluid/memory/allocation/best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/best_fit_allocator_test.cc index 20748a23a1951383c888d9b8d7a360ec941e50cb..b274b05562b15856276b1c88d3504fda1ecafacc 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/best_fit_allocator_test.cc @@ -99,9 +99,8 @@ TEST(BestFitAllocator, test_concurrent_cpu_allocation) { LockedAllocator locked_allocator(std::move(best_fit_allocator)); - auto th_main = [&] { - std::random_device dev; - std::default_random_engine engine(dev()); + auto th_main = [&](std::random_device::result_type seed) { + std::default_random_engine engine(seed); std::uniform_int_distribution dist(1U, 1024U); for (size_t i = 0; i < 128; ++i) { @@ -125,7 +124,8 @@ TEST(BestFitAllocator, test_concurrent_cpu_allocation) { { std::vector threads; for (size_t i = 0; i < 1024; ++i) { - threads.emplace_back(th_main); + std::random_device dev; + threads.emplace_back(th_main, dev()); } for (auto& th : threads) { th.join(); diff --git a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu index f7f17e1d36e0adef0b0eb7a43715836db4b7927d..fdd5b43ad4aa8024efee314ca949445fefbef067 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu +++ b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu @@ -41,9 +41,8 @@ TEST(BestFitAllocator, concurrent_cuda) { LockedAllocator concurrent_allocator( std::unique_ptr(new BestFitAllocator(cuda_allocation.get()))); - auto th_main = [&] { - std::random_device dev; - std::default_random_engine engine(dev()); + auto th_main = [&](std::random_device::result_type seed) { + std::default_random_engine engine(seed); std::uniform_int_distribution dist(1U, 1024U); platform::CUDAPlace gpu(0); platform::CUDADeviceContext dev_ctx(gpu); @@ -75,7 +74,8 @@ TEST(BestFitAllocator, concurrent_cuda) { { std::vector threads; for (size_t i = 0; i < 1024; ++i) { - threads.emplace_back(th_main); + std::random_device dev; + threads.emplace_back(th_main, dev()); } for 
(auto& th : threads) { th.join(); diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 2019d1a14f6dd5ed09c251f26c6ca352faa594ae..3e8fb83e9d5ba2078bcf37e4a4af74708df9c11c 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -86,7 +86,11 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) { munlock(p, size); #endif } +#ifdef _WIN32 + _aligned_free(p); +#else free(p); +#endif } bool CPUAllocator::UseGpu() const { return false; } diff --git a/paddle/fluid/operators/dropout_op_test.cc b/paddle/fluid/operators/dropout_op_test.cc index 424d273c34b7e8d70c88b591c4fe45db61465f38..3e401d1c4f9f4fa89cbbe04df1ca69d05132eb51 100644 --- a/paddle/fluid/operators/dropout_op_test.cc +++ b/paddle/fluid/operators/dropout_op_test.cc @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifndef _WIN32 #include +#endif #include #include // NOLINT diff --git a/paddle/fluid/operators/math/sampler.cc b/paddle/fluid/operators/math/sampler.cc index 690d6f6baafb33d50c8f2d3606d903634d622d16..2708f3bcd8f1d2cab19c74b57fdf9f903d9dc65d 100644 --- a/paddle/fluid/operators/math/sampler.cc +++ b/paddle/fluid/operators/math/sampler.cc @@ -60,75 +60,30 @@ float LogUniformSampler::Probability(int64_t value) const { return (log((value + 2.0) / (value + 1.0))) / log_range_; } -CustomSampler::CustomSampler(int64_t range, const float* probabilities, +CustomSampler::CustomSampler(int64_t range, const float *probabilities, + const int *alias, const float *alias_probabilities, unsigned int seed) : Sampler(range, seed) { - random_engine_ = std::make_shared(seed_); + random_engine_ = std::make_shared(seed_); real_dist_ = std::make_shared>(0, 1); int_dist_ = std::make_shared>(0, range); - alias_probs_ = std::make_shared>(range + 1); - alias_ = std::make_shared>(range + 1); - probs_ = std::make_shared>(range + 1); - - std::queue> bigs; - std::queue> littles; - for (int64_t i = 0; i <= range; ++i) { - (*probs_)[i] = probabilities[i]; - float normal_prob = probabilities[i] * (range + 1); - if (normal_prob - 1.0 > 1e-4) { - bigs.emplace(i, normal_prob); - } else if (1.0 - normal_prob > 1e-4) { - littles.emplace(i, normal_prob); - } else { - (*alias_probs_)[i] = normal_prob; - (*alias_)[i] = -1; - } - } - - while ((!littles.empty()) && (!bigs.empty())) { - auto big = bigs.front(); - auto little = littles.front(); - bigs.pop(); - littles.pop(); - (*alias_probs_)[little.first] = little.second; - (*alias_)[little.first] = big.first; - auto big_left = big.second - (1 - little.second); - if (big_left - 1.0 > 1e-4) { - bigs.emplace(big.first, big_left); - } else if (1.0 - big_left > 1e-4) { - littles.emplace(big.first, big_left); - } else { - (*alias_probs_)[big.first] = big_left; - (*alias_)[big.first] = -1; - } - } - if (!littles.empty()) { // littles.second is close to 1.0 - auto little = littles.front(); - (*alias_probs_)[little.first] = 1.0; - (*alias_)[little.first] = -1; - } - - if (!bigs.empty()) { // bigs.second is close to 1.0 - auto big = bigs.front(); - (*alias_probs_)[big.first] = 1.0; - (*alias_)[big.first] = -1; - } + alias_probs_ = alias_probabilities; + probs_ = probabilities; + alias_ = alias; } int64_t CustomSampler::Sample() const { auto index = (*int_dist_)(*random_engine_); auto p = (*real_dist_)(*random_engine_); - if (p > (*alias_probs_)[index]) { - return 
(*alias_)[index]; + if (p > alias_probs_[index]) { + return alias_[index]; } else { return index; } } -float CustomSampler::Probability(int64_t value) const { - return (*probs_)[value]; -} +float CustomSampler::Probability(int64_t value) const { return probs_[value]; } } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/sampler.h b/paddle/fluid/operators/math/sampler.h index 836cdad51f17e93f811ba14695bbe1a65156c588..98e0b898a504e3bd6b37c3cc772c179eab6038a4 100644 --- a/paddle/fluid/operators/math/sampler.h +++ b/paddle/fluid/operators/math/sampler.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + #include #include #include @@ -38,9 +39,12 @@ class Sampler { seed_ = seed; } } + virtual ~Sampler(); + // Sample a single value virtual int64_t Sample() const = 0; + // The probability that a single call to Sample() returns the given value. virtual float Probability(int64_t value) const = 0; @@ -99,6 +103,7 @@ class LogUniformSampler : public Sampler { class CustomSampler : public Sampler { public: explicit CustomSampler(int64_t range, const float* probabilities, + const int* alias, const float* alias_probabilities, unsigned int seed = 0UL); ~CustomSampler() override {} @@ -108,10 +113,10 @@ class CustomSampler : public Sampler { float Probability(int64_t value) const override; private: - std::shared_ptr> alias_probs_; - std::shared_ptr> alias_; - std::shared_ptr> probs_; - std::shared_ptr random_engine_; + const float* alias_probs_; + const int* alias_; + const float* probs_; + std::shared_ptr random_engine_; std::shared_ptr> real_dist_; std::shared_ptr> int_dist_; }; diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index 0015fafbc892912424dfa6dbd1778438d384ca19..51da6de26e2a47da2c22a1c2e2e1a9412badc58f 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -16,13 +16,12 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sequence_pooling.h" #include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/macros.h" namespace paddle { namespace operators { namespace math { -#define FLT_MAX __FLT_MAX__ - template struct MaxPoolFunctor { HOSTDEVICE void operator()(const T* input, const size_t start, diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index 9b0d45ae5b9d104c8b7bb1529a9baaaf3d6a736d..655e171e637919a3086eab2438bb995da2d4ca46 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/nce_op.h" +#include #include namespace paddle { @@ -25,7 +26,7 @@ class NCEOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { + void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasInput("Input")); PADDLE_ENFORCE(ctx->HasInput("Label")); PADDLE_ENFORCE(ctx->HasInput("Weight")); @@ -67,7 +68,7 @@ class NCEOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { + const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), platform::CPUPlace()); @@ -101,11 +102,24 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker { .AsDispensable(); AddInput( - "CustomDistribution", + "CustomDistProbs", "(Tensor) It is used in 'CostumDist' sampler. " "It is a tensor with shape [num_total_classes]." "The i-th element is the probsbility of the i-th class being sampled.") .AsDispensable(); + AddInput( + "CustomDistAlias", + "(Tensor) It is used in 'CostumDist' sampler. " + "It is a tensor with shape [num_total_classes]." + "The i-th element is the probsbility of the i-th class being sampled.") + .AsDispensable(); + AddInput( + "CustomDistAliasProbs", + "(Tensor) It is used in 'CostumDist' sampler. " + "It is a tensor with shape [num_total_classes]." + "The i-th element is the probsbility of the i-th class being sampled.") + .AsDispensable(); + AddOutput("Cost", "(Tensor) A tensor of shape [batch_size, 1]. Cost of samples."); AddOutput("SampleLogits", @@ -124,21 +138,22 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker { "kernel to compute grads." "") .AsIntermediate(); + AddAttr("num_total_classes", "Total number of classes in all samples."); AddAttr("num_neg_samples", "The number of negative classes. The default value is 10.") .SetDefault(10); - AddAttr("sampler", "(int) Which sampler to be used to sample negative class." "0: Uniform; 1: LogUniform; 2: CostumDist.") .SetDefault(0); - AddAttr("seed", "(int) The seed used in sampler. If it is 0, " "the sampler will generate a seed randomly.") .SetDefault(0); + AddAttr("is_sparse", "(boolean, default false) Sparse update.") + .SetDefault(false); AddAttr>("custom_neg_classes", "This attribute only be used in unitest. Classes " @@ -156,11 +171,19 @@ By default this operator uses a uniform distribution for sampling. 
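// When 'sampler' is CustomDist, the three CustomDist* inputs added above
// together form a precomputed alias table over the user-supplied class
// distribution, so each negative class is drawn in O(1) time; the table
// itself is built on the Python side in fluid.layers.nce.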
} }; +class NCEOpGradDescMaker : public framework::DefaultGradOpDescMaker { + using ::paddle::framework::DefaultGradOpDescMaker< + true>::DefaultGradOpDescMaker; + + protected: + virtual std::string GradOpType() const { return "nce_grad"; } +}; + class NCEOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { + void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasInput("Input")); PADDLE_ENFORCE(ctx->HasInput("Weight")); PADDLE_ENFORCE(ctx->HasInput("Cost")); @@ -190,20 +213,45 @@ class NCEOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { + const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), platform::CPUPlace()); } }; +class NCEOpGradVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + auto weight_grad = op_desc.Output(framework::GradVarName("Weight")).front(); + auto bias_grad = op_desc.Output(framework::GradVarName("Bias")).front(); + + auto attr = op_desc.GetAttr("is_sparse"); + bool is_sparse = boost::get(attr); + if (is_sparse) { + VLOG(30) << "nce_op_grad op " << weight_grad << " and " << bias_grad + << " is set to SelectedRows"; + block->Var(weight_grad) + ->SetType(framework::proto::VarType::SELECTED_ROWS); + block->Var(bias_grad)->SetType(framework::proto::VarType::SELECTED_ROWS); + } else { + VLOG(30) << "nce_op_grad op " << weight_grad << " and " << bias_grad + << " is set to LoDTensor"; + block->Var(weight_grad)->SetType(framework::proto::VarType::LOD_TENSOR); + block->Var(bias_grad)->SetType(framework::proto::VarType::LOD_TENSOR); + } + block->Var(weight_grad)->SetDataType(block->Var("Input")->GetDataType()); + block->Var(bias_grad)->SetDataType(block->Var("Input")->GetDataType()); + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(nce, ops::NCEOp, ops::NCEOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(nce_grad, ops::NCEOpGrad); +REGISTER_OPERATOR(nce, ops::NCEOp, ops::NCEOpGradDescMaker, ops::NCEOpMaker); +REGISTER_OPERATOR(nce_grad, ops::NCEOpGrad, ops::NCEOpGradVarTypeInference); REGISTER_OP_CPU_KERNEL(nce, ops::NCEKernel, ops::NCEKernel); REGISTER_OP_CPU_KERNEL(nce_grad, diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index e9af8ad4ce8501f464202039d99c36984d7feba9..f2ca6ec247fd1ea09b707c2eaaad0548c8aa5757 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -16,26 +16,32 @@ limitations under the License. 
*/ #include #include +#include #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/operators/math/sampler.h" #include "unsupported/Eigen/CXX11/Tensor" + namespace paddle { namespace operators { using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; using Sampler = math::Sampler; +using DDim = framework::DDim; template using EigenMatrix = framework::EigenMatrix; template -void PrepareSamples(const framework::ExecutionContext& context, - Sampler* sampler) { +void PrepareSamples(const framework::ExecutionContext &context, + Sampler *sampler) { auto label = context.Input("Label"); - const int64_t* label_data = label->data(); + const int64_t *label_data = label->data(); auto label_dims = label->dims(); // int num_total_classes = context.Attr("num_total_classes"); // for unitest @@ -44,7 +50,7 @@ void PrepareSamples(const framework::ExecutionContext& context, auto sample_labels = context.Output("SampleLabels"); auto sample_labels_dims = sample_labels->dims(); - int64_t* sample_labels_data = + int64_t *sample_labels_data = sample_labels->mutable_data(context.GetPlace()); int num_label = label_dims.size() == 2 ? label_dims[1] : 1; @@ -70,13 +76,13 @@ void PrepareSamples(const framework::ExecutionContext& context, template class NCEKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { + void Compute(const framework::ExecutionContext &context) const override { int sampler_type = context.Attr("sampler"); int seed = context.Attr("seed"); int num_total_classes = context.Attr("num_total_classes"); int num_neg_samples = context.Attr("num_neg_samples"); - Sampler* sampler; + Sampler *sampler; switch (sampler_type) { case 0: { sampler = new math::UniformSampler(num_total_classes - 1, seed); @@ -87,11 +93,19 @@ class NCEKernel : public framework::OpKernel { break; } case 2: { - auto custom_dist = context.Input("CustomDistribution"); - const float* custom_dist_data = custom_dist->data(); - PADDLE_ENFORCE_EQ(custom_dist->numel(), num_total_classes); - sampler = new math::CustomSampler(num_total_classes - 1, - custom_dist_data, seed); + auto dist_probs = context.Input("CustomDistProbs"); + auto dist_alias = context.Input("CustomDistAlias"); + auto dist_alias_probs = context.Input("CustomDistAliasProbs"); + + PADDLE_ENFORCE_EQ(dist_probs->numel(), num_total_classes); + PADDLE_ENFORCE_EQ(dist_alias->numel(), num_total_classes); + PADDLE_ENFORCE_EQ(dist_alias_probs->numel(), num_total_classes); + + const float *probs_data = dist_probs->data(); + const int *alias_data = dist_alias->data(); + const float *alias_probs_data = dist_alias_probs->data(); + sampler = new math::CustomSampler(num_total_classes - 1, probs_data, + alias_data, alias_probs_data, seed); break; } default: { PADDLE_THROW("Unsupported SamplerType."); } @@ -99,17 +113,17 @@ class NCEKernel : public framework::OpKernel { PrepareSamples(context, sampler); auto sample_labels = context.Output("SampleLabels"); - const int64_t* sample_labels_data = sample_labels->data(); + const int64_t *sample_labels_data = sample_labels->data(); auto sample_out = context.Output("SampleLogits"); - T* sample_out_data = sample_out->mutable_data(context.GetPlace()); + T *sample_out_data = sample_out->mutable_data(context.GetPlace()); auto label = context.Input("Label"); auto sample_weight = context.Input("SampleWeight"); 
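// (math::CustomSampler keeps raw pointers into the CustomDist* tensors
// instead of copying them, so those inputs must outlive the sampler.)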
- const T* sample_weight_data = nullptr; + const T *sample_weight_data = nullptr; if (sample_weight != nullptr) { sample_weight_data = sample_weight->data(); } auto out = context.Output("Cost"); - T* out_data = out->mutable_data(context.GetPlace()); + T *out_data = out->mutable_data(context.GetPlace()); int64_t num_true_class = 1; if (label != nullptr) { num_true_class = label->dims()[1]; @@ -119,7 +133,7 @@ class NCEKernel : public framework::OpKernel { // forward bias auto bias = context.Input("Bias"); if (bias != nullptr) { - const T* bias_data = bias->data(); + const T *bias_data = bias->data(); for (int64_t i = 0; i < sample_labels->numel(); ++i) { sample_out_data[i] = bias_data[sample_labels_data[i]]; } @@ -158,16 +172,16 @@ class NCEKernel : public framework::OpKernel { template class NCEGradKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { + void Compute(const framework::ExecutionContext &context) const override { auto d_out = context.Input(framework::GradVarName("Cost")); - const T* d_out_data = d_out->data(); + const T *d_out_data = d_out->data(); auto label = context.Input("Label"); auto sample_out = context.Input("SampleLogits"); - const T* sample_out_data = sample_out->data(); + const T *sample_out_data = sample_out->data(); auto sample_labels = context.Input("SampleLabels"); - const int64_t* sample_labels_data = sample_labels->data(); + const int64_t *sample_labels_data = sample_labels->data(); auto sample_weight = context.Input("SampleWeight"); - const T* sample_weight_data = nullptr; + const T *sample_weight_data = nullptr; if (sample_weight != nullptr) { sample_weight_data = sample_weight->data(); } @@ -180,7 +194,7 @@ class NCEGradKernel : public framework::OpKernel { int sampler_type = context.Attr("sampler"); int seed = context.Attr("seed"); - Sampler* sampler; + Sampler *sampler; switch (sampler_type) { case 0: { sampler = new math::UniformSampler(num_total_classes - 1, seed); @@ -191,11 +205,19 @@ class NCEGradKernel : public framework::OpKernel { break; } case 2: { - auto custom_dist = context.Input("CustomDistribution"); - const float* custom_dist_data = custom_dist->data(); - PADDLE_ENFORCE_EQ(custom_dist->numel(), num_total_classes); - sampler = new math::CustomSampler(num_total_classes - 1, - custom_dist_data, seed); + auto dist_probs = context.Input("CustomDistProbs"); + auto dist_alias = context.Input("CustomDistAlias"); + auto dist_alias_probs = context.Input("CustomDistAliasProbs"); + + PADDLE_ENFORCE_EQ(dist_probs->numel(), num_total_classes); + PADDLE_ENFORCE_EQ(dist_alias->numel(), num_total_classes); + PADDLE_ENFORCE_EQ(dist_alias_probs->numel(), num_total_classes); + + const float *probs_data = dist_probs->data(); + const int *alias_data = dist_alias->data(); + const float *alias_probs_data = dist_alias_probs->data(); + sampler = new math::CustomSampler(num_total_classes - 1, probs_data, + alias_data, alias_probs_data, seed); break; } default: { PADDLE_THROW("Unsupported SamplerType."); } @@ -203,7 +225,7 @@ class NCEGradKernel : public framework::OpKernel { // T b = 1. 
/ num_total_classes * num_neg_samples; Tensor sample_grad; // tmp tensor - T* sample_grad_data = + T *sample_grad_data = sample_grad.mutable_data(sample_labels->dims(), context.GetPlace()); // backward cost for (int64_t i = 0; i < sample_labels->numel(); ++i) { @@ -217,32 +239,105 @@ class NCEGradKernel : public framework::OpKernel { : w * (o * (1 - o) / (o + b)); sample_grad_data[i] *= d_out_data[sample_idx]; } - // get d_bias - auto d_bias = context.Output(framework::GradVarName("Bias")); - if (d_bias != nullptr) { - T* d_bias_data = d_bias->mutable_data(context.GetPlace()); - std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0); + + bool is_sparse = context.Attr("is_sparse"); + + if (!is_sparse) { + // get d_bias + auto d_bias = context.Output(framework::GradVarName("Bias")); + if (d_bias != nullptr) { + T *d_bias_data = d_bias->mutable_data(context.GetPlace()); + std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + d_bias_data[sample_labels_data[i]] += sample_grad_data[i]; + } + } + // get d_w + auto d_w = context.Output(framework::GradVarName("Weight")); + if (d_w != nullptr) { + auto d_w_data = d_w->mutable_data(context.GetPlace()); + std::fill(d_w_data, d_w_data + d_w->numel(), 0.0); + auto d_w_matrix = EigenMatrix::From(*d_w); + auto x_matrix = EigenMatrix::From(*(context.Input("Input"))); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + d_w_matrix.chip(sample_labels_data[i], 0) += + x_matrix.chip(static_cast(i / sample_labels->dims()[1]), 0) * + sample_grad_data[i]; + } + } + } else { + std::vector labels; for (int64_t i = 0; i < sample_labels->numel(); ++i) { - d_bias_data[sample_labels_data[i]] += sample_grad_data[i]; + labels.push_back(sample_labels_data[i]); } - } - // get d_w - auto d_w = context.Output(framework::GradVarName("Weight")); - if (d_w != nullptr) { - auto d_w_data = d_w->mutable_data(context.GetPlace()); - std::fill(d_w_data, d_w_data + d_w->numel(), 0.0); - auto d_w_matrix = EigenMatrix::From(*d_w); + std::set st(labels.begin(), labels.end()); + labels.assign(st.begin(), st.end()); + + auto *bias_var = context.InputVar("Bias"); + DDim bias_dim; + if (bias_var->IsType()) { + bias_dim = context.Input("Bias")->dims(); + } else if (bias_var->IsType()) { + auto *table_t = context.Input("Bias"); + bias_dim = table_t->value().dims(); + } else { + PADDLE_THROW( + "The parameter Bias of a NCE_OP " + "must be either LoDTensor or SelectedRows"); + } + + auto d_bias = + context.Output(framework::GradVarName("Bias")); + d_bias->set_rows(labels); + d_bias->set_height(bias_dim[0]); + + d_bias->mutable_value()->Resize( + {static_cast(labels.size()), bias_dim[1]}); + T *d_bias_data = + d_bias->mutable_value()->mutable_data(context.GetPlace()); + std::fill(d_bias_data, d_bias_data + labels.size(), 0.0); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + d_bias_data[d_bias->Index(sample_labels_data[i])] += + sample_grad_data[i]; + } + + auto *table_var = context.InputVar("Weight"); + DDim table_dim; + if (table_var->IsType()) { + table_dim = context.Input("Weight")->dims(); + } else if (table_var->IsType()) { + auto *table_t = context.Input("Weight"); + table_dim = table_t->value().dims(); + } else { + PADDLE_THROW( + "The parameter Weight of a NCE_OP " + "must be either LoDTensor or SelectedRows"); + } + + auto d_w = context.Output(framework::GradVarName("Weight")); + + d_w->set_rows(labels); + d_w->set_height(table_dim[0]); + + auto *d_table_value = d_w->mutable_value(); + 
d_table_value->Resize( + {static_cast(labels.size()), table_dim[1]}); + auto d_w_data = d_table_value->mutable_data(context.GetPlace()); + std::fill(d_w_data, d_w_data + d_table_value->numel(), 0.0); + + auto d_w_matrix = EigenMatrix::From(*d_table_value); auto x_matrix = EigenMatrix::From(*(context.Input("Input"))); for (int64_t i = 0; i < sample_labels->numel(); ++i) { - d_w_matrix.chip(sample_labels_data[i], 0) += + d_w_matrix.chip(d_w->Index(sample_labels_data[i]), 0) += x_matrix.chip(static_cast(i / sample_labels->dims()[1]), 0) * sample_grad_data[i]; } } + // get d_x auto d_x = context.Output(framework::GradVarName("Input")); if (d_x != nullptr) { - auto* d_x_data = d_x->mutable_data(context.GetPlace()); + auto *d_x_data = d_x->mutable_data(context.GetPlace()); std::fill(d_x_data, d_x_data + d_x->numel(), 0.0); auto d_x_matrix = EigenMatrix::From(*d_x); auto w_matrix = EigenMatrix::From(*(context.Input("Weight"))); @@ -251,6 +346,7 @@ class NCEGradKernel : public framework::OpKernel { w_matrix.chip(sample_labels_data[i], 0) * sample_grad_data[i]; } } + delete sampler; } }; diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index 20bd349b94bb211b62a0dee32bcad79759fb500f..2bb36a5d6b3a6064cf9b5657fefdf798ee6279ad 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -20,12 +20,12 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #ifndef _WIN32 -const float fraction_of_gpu_memory_to_use = 0.92f; +constexpr static float fraction_of_gpu_memory_to_use = 0.92f; #else // fraction_of_gpu_memory_to_use cannot be too high on windows, // since the win32 graphic sub-system can occupy some GPU memory // which may lead to insufficient memory left for paddle -const float fraction_of_gpu_memory_to_use = 0.5f; +constexpr static float fraction_of_gpu_memory_to_use = 0.5f; #endif DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use, diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 0443ff3fc3d055c54527240cdf57c195e1c0acf8..8735b658758dfefeedcb1393b82682ad968739de 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -29,8 +29,16 @@ limitations under the License. */ namespace pybind11 { namespace detail { +#if !defined(PYBIND11_HIDDEN) +#ifdef _WIN32 +#define PYBIND11_HIDDEN __declspec(dllexport) +#else +#define PYBIND11_HIDDEN __attribute__((visibility("hidden"))) +#endif +#endif + // Can be replaced by a generic lambda in C++14 -struct __attribute__((visibility("hidden"))) paddle_variant_caster_visitor +struct PYBIND11_HIDDEN paddle_variant_caster_visitor : public boost::static_visitor { return_value_policy policy; handle parent; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index a2a629acdfe65ae250164dad4c2367d525887acf..e31c2f217322be8ef8b131189504b54cf6b4ad80 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -860,6 +860,12 @@ All parameter, weight, gradient are variables in Paddle. self.remove_unnecessary_lock_ = b; }, R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. 
Default False.)DOC") + .def_property( + "num_trainers", + [](const BuildStrategy &self) { return self.num_trainers_; }, + [](BuildStrategy &self, int num_trainers) { + self.num_trainers_ = num_trainers; + }) .def_property( "fuse_elewise_add_act_ops", [](const BuildStrategy &self) { diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index fd02b445e75fea0d37b610c9abce0d090bfdedc4..44116a262cadd079accdd45c88e21e28bc2d8c21 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -4394,7 +4394,8 @@ def nce(input, name=None, sampler="uniform", custom_dist=None, - seed=0): + seed=0, + is_sparse=False): """ ${comment} @@ -4420,11 +4421,12 @@ def nce(input, sampler (str): The sampler used to sample class from negtive classes. It can be 'uniform', 'log_uniform' or 'custom_dist'. default: 'uniform'. - custom_dist (Variable): A tensor with shape [num_total_classes]. + custom_dist (float[]): A float[] with size=num_total_classes. It is used when sampler is set to 'custom_dist'. custom_dist[i] is the probsbility of i-th class to be sampled. default: None. seed (int): The seed used in sampler. default: 0. + is_sparse(bool): The flag indicating whether to use sparse update, the weight@GRAD and bias@GRAD will be changed to SelectedRows. Returns: Variable: The output nce loss. @@ -4476,12 +4478,7 @@ def nce(input, shape=[num_total_classes, dim], is_bias=False, dtype=input.dtype) - inputs = { - 'Input': input, - 'Label': label, - 'Weight': w, - 'SampleWeight': sample_weight if sample_weight is not None else [] - } + inputs = {} if helper.bias_attr: b = helper.create_parameter( attr=helper.bias_attr, @@ -4493,18 +4490,10 @@ def nce(input, sample_logits = helper.create_variable_for_type_inference(dtype=input.dtype) sample_labels = helper.create_variable_for_type_inference(dtype=label.dtype) - if num_neg_samples is None: - num_neg_samples = 10 - else: - num_neg_samples = int(num_neg_samples) - - inputs = { - 'Input': input, - 'Label': label, - 'Weight': w, - 'Bias': b, - 'SampleWeight': sample_weight if sample_weight is not None else [] - } + inputs['Input'] = input + inputs['Label'] = label + inputs['Weight'] = w + inputs['SampleWeight'] = sample_weight if sample_weight is not None else [] if sampler == "uniform": sampler = 0 @@ -4512,17 +4501,73 @@ def nce(input, sampler = 1 elif sampler == "custom_dist": assert custom_dist is not None - assert isinstance(custom_dist, Variable) - inputs['CustomDistribution'] = custom_dist + # assert isinstance(custom_dist, Variable) + + custom_dist_len = len(custom_dist) + alias_probs_ = [0] * custom_dist_len + alias_ = [0] * custom_dist_len + bigs = [] + littles = [] + for i in range(custom_dist_len): + normal_prob = custom_dist[i] * custom_dist_len + if normal_prob - 1.0 > 1e-4: + bigs.append((i, normal_prob)) + elif 1.0 - normal_prob > 1e-4: + littles.append((i, normal_prob)) + else: + alias_probs_[i] = normal_prob + alias_[i] = -1 + + while len(bigs) and len(littles): + big = bigs.pop(0) + little = littles.pop(0) + + big_idx = big[0] + big_prob = big[1] + + alias_probs_[little[0]] = little[1] + alias_[little[0]] = big_idx + big_left = big[1] + little[1] - 1 + if big_left - 1.0 > 1e-4: + bigs.append((big_idx, big_left)) + elif 1.0 - big_left > 1e-4: + littles.append((big_idx, big_left)) + else: + alias_probs_[big_idx] = big_left + alias_[big_idx] = -1 + + if len(bigs): + big = bigs.pop(0) + alias_probs_[big[0]] = 1.0 + alias_[big[0]] = -1 + if len(littles): + little = littles.pop(0) + alias_probs_[little[0]] = 
1.0 + alias_[little[0]] = -1 + + probs = assign(input=np.array(custom_dist).astype('float32')) + custom_alias = assign(input=np.array(alias_).astype('int32')) + custom_alias_probs = assign( + input=np.array(alias_probs_).astype('float32')) + + inputs['CustomDistProbs'] = probs + inputs['CustomDistAlias'] = custom_alias + inputs['CustomDistAliasProbs'] = custom_alias_probs sampler = 2 else: raise Exception("Unsupported sampler type.") + if num_neg_samples is None: + num_neg_samples = 10 + else: + num_neg_samples = int(num_neg_samples) + attrs = { 'num_total_classes': int(num_total_classes), 'num_neg_samples': num_neg_samples, 'seed': seed, - 'sampler': sampler + 'sampler': sampler, + 'is_sparse': is_sparse } helper.append_op( @@ -6525,7 +6570,7 @@ def crop(x, shape=None, offsets=None, name=None): helper = LayerHelper('crop', **locals()) if not (isinstance(shape, list) or isinstance(shape, tuple) or \ - isinstance(shape, Variable)): + isinstance(shape, Variable)): raise ValueError("The shape should be a list, tuple or Variable.") if offsets is None: @@ -6647,7 +6692,7 @@ def affine_grid(theta, out_shape, name=None): helper = LayerHelper('affine_grid') if not (isinstance(out_shape, list) or isinstance(out_shape, tuple) or \ - isinstance(out_shape, Variable)): + isinstance(out_shape, Variable)): raise ValueError("The out_shape should be a list, tuple or Variable.") if not isinstance(theta, Variable): @@ -6888,6 +6933,13 @@ def elu(x, alpha=1.0, name=None): Returns: output(${out_type}): ${out_comment} + + Examples: + + .. code-block:: python + + x = fluid.layers.data(name="x", shape=[3,10,32,32], dtype="float32") + y = fluid.layers.elu(x, alpha=0.2) """ helper = LayerHelper('elu', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -6911,6 +6963,13 @@ def relu6(x, threshold=6.0, name=None): Returns: output(${out_type}): ${out_comment} + + Examples: + + .. code-block:: python + + x = fluid.layers.data(name="x", shape=[3,10,32,32], dtype="float32") + y = fluid.layers.relu6(x, threshold=6.0) """ helper = LayerHelper('relu6', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -6934,6 +6993,13 @@ def pow(x, factor=1.0, name=None): Returns: output(${out_type}): ${out_comment} + + Examples: + + .. code-block:: python + + x = fluid.layers.data(name="x", shape=[3,10,32,32], dtype="float32") + y = fluid.layers.pow(x, factor=2.0) """ helper = LayerHelper('pow', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -6958,6 +7024,13 @@ def stanh(x, scale_a=2.0 / 3.0, scale_b=1.7159, name=None): Returns: output(${out_type}): ${out_comment} + + Examples: + + .. code-block:: python + + x = fluid.layers.data(name="x", shape=[3,10,32,32], dtype="float32") + y = fluid.layers.stanh(x, scale_a=0.67, scale_b=1.72) """ helper = LayerHelper('stanh', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -6983,6 +7056,13 @@ def hard_sigmoid(x, slope=0.2, offset=0.5, name=None): Returns: output(${out_type}): ${out_comment} + + Examples: + + .. code-block:: python + + x = fluid.layers.data(name="x", shape=[3,10,32,32], dtype="float32") + y = fluid.layers.hard_sigmoid(x, slope=0.3, offset=0.8) """ helper = LayerHelper('hard_sigmoid', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -7007,6 +7087,13 @@ def swish(x, beta=1.0, name=None): Returns: output(${out_type}): ${out_comment} + + Examples: + + .. 
code-block:: python + + x = fluid.layers.data(name="x", shape=[3,10,32,32], dtype="float32") + y = fluid.layers.swish(x, beta=2.0) """ helper = LayerHelper('swish', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 3f4dd5eb712e738bbee8f93c062375033b8ab2f6..bdcd045341212d6cf9dbfbc3cebc72f320e37e9d 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -124,16 +124,11 @@ class ParallelExecutor(object): os.environ.get('CPU_NUM', multiprocessing.cpu_count())) exec_strategy.num_threads = cpu_num * 2 - # Set 1 thread num under nccl2 distribute - # env to make sure all gpus run ops in same order. - if num_trainers > 1: - assert (use_cuda) - # FIXME(gongwb): avoid this set. - exec_strategy.num_threads = 1 - if build_strategy is None: build_strategy = BuildStrategy() + build_strategy.num_trainers = num_trainers + main = main_program main = main if main else framework.default_main_program() if scope == None: diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 1006cc568a51fd1d96615018512cd6a18e8295bb..26035f303e72a87b81fdb120fbb92894d78e996b 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -63,7 +63,7 @@ function(py_test_modules TARGET_NAME) set(multiValueArgs MODULES DEPS ENVS) cmake_parse_arguments(py_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_test(NAME ${TARGET_NAME} - COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_modules_ENVS} + COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_modules_ENVS} ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) if (py_test_modules_SERIAL) diff --git a/python/paddle/fluid/tests/unittests/test_nce.py b/python/paddle/fluid/tests/unittests/test_nce.py index c01fdd5dddc139bdefc07b91e9816d62febd7f20..f4f97446746836f4acf42fa662cc30b20af6e4b1 100644 --- a/python/paddle/fluid/tests/unittests/test_nce.py +++ b/python/paddle/fluid/tests/unittests/test_nce.py @@ -14,8 +14,12 @@ from __future__ import print_function -import unittest import numpy as np +import unittest + +import paddle.fluid as fluid +import paddle.fluid.initializer as initializer + from op_test import OpTest @@ -59,7 +63,7 @@ def nce(input, weight, bias, sample_weight, labels, num_classes, class TestNCE(OpTest): def generate_data(self, dim, batch_size, num_classes, num_true_class, - num_neg_samples): + num_neg_samples, is_sparse): input = np.random.randn(batch_size, dim).astype(np.float32) weight = np.random.randn(num_classes, dim).astype(np.float32) bias = np.random.randn(num_classes).astype(np.float32) @@ -70,7 +74,8 @@ class TestNCE(OpTest): 'num_neg_samples': num_neg_samples, 'custom_neg_classes': list(range(num_neg_samples)), 'seed': 0, - 'sampler': 0 + 'sampler': 0, + 'is_sparse': is_sparse } self.inputs = { 'Input': input, @@ -81,7 +86,7 @@ class TestNCE(OpTest): } def set_data(self): - self.generate_data(5, 5, 4, 1, 2) + self.generate_data(5, 5, 4, 1, 2, False) def compute(self): out = nce(self.inputs['Input'], self.inputs['Weight'], @@ -107,9 +112,110 @@ class TestNCE(OpTest): ["Input", "Weight", "Bias"], "Cost", max_relative_error=0.02) -class TestNCECase1(TestNCE): +class TestNCECase1Tensor(TestNCE): def set_data(self): - 
self.generate_data(10, 20, 10, 2, 5) + self.generate_data(10, 20, 10, 2, 5, False) + + +class TestNCECase1SelectedRows(unittest.TestCase): + def setUp(self): + self.base_lr = 0.0001 + self.batch_size = 8 + + @staticmethod + def get_place(): + place = fluid.core.CPUPlace() + return place + + @staticmethod + def get_train_data(batch_size): + batchs = [] + for i in range(batch_size): + input = np.random.randn(batch_size, 10).astype(np.float32) + labels = np.random.randint(0, 20, (batch_size, 1)) + batchs.append([input, labels]) + return batchs + + def get_optimizer(self): + # SGD optimizer + optimizer = fluid.optimizer.SGD(learning_rate=self.base_lr) + return optimizer + + def train_network(self, num_total_classes, num_neg_samples, sampler, + custom_dist, is_sparse): + input = fluid.layers.data(name="input", shape=[10], dtype="float32") + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + w_param = fluid.default_main_program().global_block().create_parameter( + shape=[num_total_classes, 10], + dtype='float32', + name='nce_w', + initializer=initializer.ConstantInitializer()) + b_param = fluid.default_main_program().global_block().create_parameter( + shape=[num_total_classes, 1], + dtype='float32', + name='nce_b', + initializer=initializer.ConstantInitializer()) + + cost = fluid.layers.nce(input=input, + label=label, + num_total_classes=num_total_classes, + sampler=sampler, + custom_dist=custom_dist, + sample_weight=None, + param_attr='nce_w', + bias_attr='nce_b', + seed=1, + num_neg_samples=num_neg_samples, + is_sparse=is_sparse) + avg_cost = fluid.layers.mean(cost) + # optimizer + optimizer = self.get_optimizer() + optimizer.minimize(avg_cost) + + return [avg_cost, [input, label]] + + def test_input_is_selected_rows(self): + place = self.get_place() + exe = fluid.Executor(place) + + data = self.get_train_data(self.batch_size) + nid_freq_arr = np.random.dirichlet(np.ones(20) * 1000).astype('float32') + + rets = [] + # for dense + dense_scope = fluid.core.Scope() + dense_startup_program = fluid.framework.Program() + dense_train_program = fluid.framework.Program() + with fluid.scope_guard(dense_scope): + with fluid.program_guard(dense_train_program, + dense_startup_program): + cost, feeds = self.train_network(20, 5, "custom_dist", + nid_freq_arr.tolist(), False) + feeder = fluid.DataFeeder(feed_list=feeds, place=place) + exe.run(dense_startup_program) + loss_val = exe.run(dense_train_program, + feed=feeder.feed(data), + fetch_list=[cost.name]) + rets.append(np.mean(loss_val)) + + # for sparse + sparse_scope = fluid.core.Scope() + sparse_startup_program = fluid.framework.Program() + sparse_train_program = fluid.framework.Program() + with fluid.scope_guard(sparse_scope): + with fluid.program_guard(sparse_train_program, + sparse_startup_program): + cost, feeds = self.train_network(20, 5, "custom_dist", + nid_freq_arr.tolist(), True) + feeder = fluid.DataFeeder(feed_list=feeds, place=place) + exe.run(sparse_startup_program) + loss_val = exe.run(sparse_train_program, + feed=feeder.feed(data), + fetch_list=[cost.name]) + rets.append(np.mean(loss_val)) + + self.assertEqual(rets[0], rets[1]) if __name__ == '__main__':
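# A worked example of the alias-table construction that fluid.layers.nce now
# performs for sampler="custom_dist" (illustrative numbers, not part of the
# tests above):
#
#   custom_dist  = [0.5, 0.25, 0.125, 0.125]   # 4 classes
#   scaled probs = [2.0, 1.0,  0.5,   0.5]     # prob * num_classes
#
# Class 1 is exact, class 0 is "big", classes 2 and 3 are "little". Pairing
# littles with bigs yields:
#
#   alias_probs_ = [1.0, 1.0, 0.5, 0.5]
#   alias_       = [-1,  -1,  0,   0]
#
# Sampling draws a slot uniformly plus a uniform p in [0, 1) and returns the
# slot itself when p <= alias_probs_[slot], otherwise alias_[slot]; e.g.
# class 0 is produced with 1/4 + 1/4 * 0.5 + 1/4 * 0.5 = 0.5, as required.
# The C++ CustomSampler consumes exactly these two tables through the
# CustomDistAlias and CustomDistAliasProbs inputs, which is why the dense
# and sparse runs above must produce identical first-iteration losses.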