diff --git a/.gitignore b/.gitignore index 90138f996cf9cacc3c1cbff0cf2600eefca3f305..fa0c8882606b76ac71b43dcde7e1df6770c46c31 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,4 @@ third_party/ build_* # clion workspace. cmake-build-* +model_test diff --git a/Dockerfile b/Dockerfile index 738bba9bc2e1ab19709722fe04f1490b1b13bd4f..c8b9eed6d60e5d3b32fc14c0c7af80a785145d1b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -75,14 +75,14 @@ RUN pip3 install -U wheel && \ pip3 install -U docopt PyYAML sphinx==1.5.6 && \ pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \ easy_install -U pip && \ - pip install -U wheel && \ + pip install -U pip setuptools wheel && \ pip install -U docopt PyYAML sphinx==1.5.6 && \ pip install sphinx-rtd-theme==0.1.9 recommonmark -RUN pip3 install pre-commit 'ipython==5.3.0' && \ +RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ pip3 install opencv-python && \ - pip install pre-commit 'ipython==5.3.0' && \ + pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ pip install opencv-python diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 3ac9fe31b4f3a4e23b181a6005ec00077f562e38..d317117bcf95fd68ae2c6136ebd471d2e80017de 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -86,7 +86,7 @@ paddle.fluid.layers.reduce_prod ArgSpec(args=['input', 'dim', 'keep_dim', 'name' paddle.fluid.layers.sequence_first_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.sequence_last_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.sequence_slice ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name'], varargs=None, keywords=None, defaults=(False, None, None)) +paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer')) paddle.fluid.layers.split ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)) paddle.fluid.layers.ctc_greedy_decoder ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.edit_distance ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None)) @@ -107,7 +107,7 @@ paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1)) -paddle.fluid.layers.reshape ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, True, None)) +paddle.fluid.layers.reshape ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, False, None)) paddle.fluid.layers.squeeze ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.unsqueeze ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.lod_reset ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None)) diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 134fcee826715672a6e021e9bf694bb771ebb830..ebd1d644bcce0554904ee0931827ef43a6d52b11 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -252,9 +252,9 @@ std::vector SortOpsAndDelayOptimizeOp(const ir::Graph &graph) { std::vector sorted_ret; for (size_t i = 0; i < ret.size(); ++i) { if (i < last_backward) { - if (boost::get(ret[i]->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) == - static_cast(OpRole::kOptimize)) { + if (static_cast(boost::get(ret[i]->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kOptimize))) { optimize_ops.push_back(ret[i]); } else { sorted_ret.push_back(ret[i]); diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index 77386f4f069489b6ff7b927a281bdc286ff816e0..e1aac6dc5a92fb616f00de5806f044b83c2f503f 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -542,6 +542,33 @@ class CPUVector : public std::vector> { this->reserve(this->size() + size_t(end - begin)); this->insert(this->end(), begin, end); } + + const T *CUDAData(platform::Place place) const { + PADDLE_THROW( + "Vector::CUDAData() method is not supported in CPU-only version"); + } + + T *CUDAMutableData(platform::Place place) { + PADDLE_THROW( + "Vector::CUDAMutableData() method is not supported in CPU-only " + "version"); + } + + const T *Data(platform::Place place) const { + PADDLE_ENFORCE( + platform::is_cpu_place(place), + "Vector::Data() method is not supported when not in CPUPlace"); + return this->data(); + } + + T *MutableData(platform::Place place) { + PADDLE_ENFORCE( + platform::is_cpu_place(place), + "Vector::MutableData() method is not supported when not in CPUPlace"); + return this->data(); + } + + const void *Handle() const { return static_cast(this); } }; template diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index 2840d503f1454271afb309efdd435225ab077dc0..7fb42feb95b4d54aec693228721c052f683f4d80 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -146,22 +146,5 @@ void NaiveExecutor::CleanFeedFetchOps() { ops_.swap(ops); } -void NaiveExecutor::EnableMKLDNN(const ProgramDesc &program) { -#ifdef PADDLE_WITH_MKLDNN - VLOG(3) << "use_mkldnn=True"; - for (size_t block_id = 0; block_id < program.Size(); ++block_id) { - auto *block = const_cast(program).MutableBlock(block_id); - for (auto *op : block->AllOps()) { - if (op->HasAttr("use_mkldnn")) { - op->SetAttr("use_mkldnn", true); - } - } - } -#else - LOG(WARNING) - << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option"; -#endif -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h index 9374f3f4a35cc0f90e5b2d6e8b397784b8eae123..ddfa6e1f4d8b73f594fc381ab505797491cdd378 100644 --- a/paddle/fluid/framework/naive_executor.h +++ b/paddle/fluid/framework/naive_executor.h @@ -48,8 +48,6 @@ class NaiveExecutor { void CleanFeedFetchOps(); - void EnableMKLDNN(const ProgramDesc& program); - protected: void CreateVariables(const ProgramDesc& desc, Scope* scope, int block_id); diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc index 152fc3361a733b906765f3206089c5252658c213..ca31303f77c4a30eb64c43404e214779ea78aeaf 100644 --- a/paddle/fluid/framework/op_proto_maker.cc +++ b/paddle/fluid/framework/op_proto_maker.cc @@ -71,6 +71,8 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto, static_cast(OpRole::kLoss) | static_cast(OpRole::kForward), static_cast(OpRole::kLoss) | static_cast(OpRole::kBackward), + static_cast(OpRole::kOptimize) | + static_cast(OpRole::kLRSched), static_cast(OpRole::kNotSpecified)}) .SetDefault(static_cast(OpRole::kNotSpecified)); AddAttr>(OpRoleVarAttrName(), diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index cd2471dc49503bb83245673ce543364f5f873995..5527783faad6f7ea6e0b9e776cbde3c743277f16 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -20,6 +20,9 @@ limitations under the License. */ namespace paddle { namespace framework { +////////////////////////// +// Don't add more roles to make this too complicated! +////////////////////////// enum class OpRole { kForward = 0x0000, kBackward = 0x0001, diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 3368ae2ee4cf65b85abf1fcd89dee14f43522e1f..7dad872dd04754d2866c361b74ab1236ff743da5 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -156,12 +156,6 @@ ParallelExecutor::ParallelExecutor( params, member_->local_scopes_, member_->use_cuda_); #endif - // If the loss_var_name is given, the number of graph should be only one. - if (loss_var_name.size()) { - PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1, - "The number of graph should be only one"); - } - if (exec_strategy.type_ == ExecutionStrategy::kDefault) { member_->executor_.reset(new details::ThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, places, std::move(graph))); diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc index 18cdca3a658a6a89e6ab637a7f38825756acfea8..a588cb417aebe94bd4aeda02b1bc8ba07a04b960 100644 --- a/paddle/fluid/framework/threadpool.cc +++ b/paddle/fluid/framework/threadpool.cc @@ -25,7 +25,6 @@ DEFINE_int32(dist_threadpool_size, 0, namespace paddle { namespace framework { - std::unique_ptr ThreadPool::threadpool_(nullptr); std::once_flag ThreadPool::init_flag_; @@ -47,8 +46,7 @@ void ThreadPool::Init() { } } -ThreadPool::ThreadPool(int num_threads) - : total_threads_(num_threads), idle_threads_(num_threads), running_(true) { +ThreadPool::ThreadPool(int num_threads) : running_(true) { threads_.resize(num_threads); for (auto& thread : threads_) { // TODO(Yancey1989): binding the thread on the specify CPU number @@ -59,6 +57,7 @@ ThreadPool::ThreadPool(int num_threads) ThreadPool::~ThreadPool() { { // notify all threads to stop running + std::lock_guard l(mutex_); running_ = false; scheduled_.notify_all(); } @@ -69,36 +68,24 @@ ThreadPool::~ThreadPool() { } } -void ThreadPool::Wait() { - std::unique_lock lock(mutex_); - completed_.wait(lock, [=] { return Done() == true; }); -} - void ThreadPool::TaskLoop() { - while (running_) { + while (true) { std::unique_lock lock(mutex_); - scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; }); - if (!running_) { - break; + scheduled_.wait( + lock, [this] { return !this->tasks_.empty() || !this->running_; }); + + if (!running_ || tasks_.empty()) { + return; } + // pop a task from the task queue auto task = std::move(tasks_.front()); tasks_.pop(); - - --idle_threads_; lock.unlock(); // run the task task(); - - { - std::unique_lock lock(mutex_); - ++idle_threads_; - if (Done()) { - completed_.notify_all(); - } - } } } diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h index 94111ee335b1a5df327b3e46d62069b4735c54f6..0687e628aaa4fb7b2e67938fa09a319c8bb35aff 100644 --- a/paddle/fluid/framework/threadpool.h +++ b/paddle/fluid/framework/threadpool.h @@ -57,15 +57,6 @@ class ThreadPool { ~ThreadPool(); - // Returns the number of threads created by the constructor. - size_t Threads() const { return total_threads_; } - - // Returns the number of currently idle threads. - size_t IdleThreads() { - std::unique_lock lock(mutex_); - return idle_threads_; - } - // Run pushes a function to the task queue and returns a std::future // object. To wait for the completion of the task, call // std::future::wait(). @@ -94,25 +85,13 @@ class ThreadPool { }); std::future> f = task.get_future(); tasks_.push(std::move(task)); - lock.unlock(); scheduled_.notify_one(); return f; } - // Wait until all the tasks are completed. - void Wait(); - private: DISABLE_COPY_AND_ASSIGN(ThreadPool); - // If the task queue is empty and avaialbe is equal to the number of - // threads, means that all tasks are completed. Note: this function - // is not thread-safe. Returns true if all tasks are completed. - // Note: don't delete the data member total_threads_ and use - // threads_.size() instead; because you'd need to lock the mutex - // before accessing threads_. - bool Done() { return tasks_.empty() && idle_threads_ == total_threads_; } - // The constructor starts threads to run TaskLoop, which retrieves // and runs tasks from the queue. void TaskLoop(); @@ -125,14 +104,11 @@ class ThreadPool { static std::once_flag init_flag_; std::vector> threads_; - const size_t total_threads_; - size_t idle_threads_; std::queue tasks_; std::mutex mutex_; bool running_; std::condition_variable scheduled_; - std::condition_variable completed_; }; class ThreadPoolIO : ThreadPool { diff --git a/paddle/fluid/framework/threadpool_test.cc b/paddle/fluid/framework/threadpool_test.cc index 27a4ffd4fcbf293a3dea1744b29384d0bee0c137..884d61e23428a0ad758946295ca9c470767e93ef 100644 --- a/paddle/fluid/framework/threadpool_test.cc +++ b/paddle/fluid/framework/threadpool_test.cc @@ -19,10 +19,11 @@ limitations under the License. */ namespace framework = paddle::framework; -void do_sum(framework::ThreadPool* pool, std::atomic* sum, int cnt) { - std::vector> fs; +void do_sum(std::vector>* fs, std::mutex* mu, + std::atomic* sum, int cnt) { for (int i = 0; i < cnt; ++i) { - fs.push_back(framework::Async([sum]() { sum->fetch_add(1); })); + std::lock_guard l(*mu); + fs->push_back(framework::Async([sum]() { sum->fetch_add(1); })); } } @@ -40,18 +41,21 @@ TEST(ThreadPool, ConcurrentInit) { } TEST(ThreadPool, ConcurrentRun) { - framework::ThreadPool* pool = framework::ThreadPool::GetInstance(); std::atomic sum(0); std::vector threads; + std::vector> fs; + std::mutex fs_mu; int n = 50; // sum = (n * (n + 1)) / 2 for (int i = 1; i <= n; ++i) { - std::thread t(do_sum, pool, &sum, i); + std::thread t(do_sum, &fs, &fs_mu, &sum, i); threads.push_back(std::move(t)); } for (auto& t : threads) { t.join(); } - pool->Wait(); + for (auto& t : fs) { + t.wait(); + } EXPECT_EQ(sum, ((n + 1) * n) / 2); } diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index a69d9c9a529f26b3981ca8d1ba226fda71b8820a..709c2dfc4b7c67d7d04074c58ce6da85b6e790fe 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -284,7 +284,7 @@ static inline Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, selected_indices.push_back(idx); ++selected_num; } - sorted_indices.erase(sorted_indices.end()); + sorted_indices.erase(sorted_indices.end() - 1); if (flag && eta < 1 && adaptive_threshold > 0.5) { adaptive_threshold *= eta; } diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc index 07322e720f26213ea777be3cd22f2fead28507f0..3c28ef30922e6d6ba09b96282619eef15867631e 100644 --- a/paddle/fluid/operators/dropout_op.cc +++ b/paddle/fluid/operators/dropout_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/dropout_op.h" +#include namespace paddle { namespace operators { @@ -57,6 +58,29 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker { "will be dropped.") .SetDefault(false); AddAttr("seed", "Dropout random seed.").SetDefault(0); + AddAttr( + "dropout_implementation", + "[\"downgrade_in_infer\"|\"upscale_in_train\"]" + "There are two kinds of ways to implement dropout" + "(the mask below is a tensor have the same shape with input" + "the value of mask is 0 or 1, the ratio of 0 is dropout_prob)" + "1. downgrade_in_infer(default), downgrade the outcome at inference " + "time" + " train: out = input * mask" + " inference: out = input * dropout_prob" + "2. upscale_in_train, upscale the outcome at training time, do nothing " + "in inference" + " train: out = input * mask / ( 1.0 - dropout_prob )" + " inference: out = input" + " dropout op can be removed from the program. the program will be " + "efficient") + .SetDefault("downgrade_in_infer") + .AddCustomChecker([](const std::string& type) { + PADDLE_ENFORCE( + type == "downgrade_in_infer" || type == "upscale_in_train", + "dropout_implementation can only be downgrade_in_infer or " + "upscale_in_train"); + }); AddComment(R"DOC( Dropout Operator. @@ -104,7 +128,9 @@ REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad); REGISTER_OP_CPU_KERNEL( - dropout, ops::CPUDropoutKernel); + dropout, ops::CPUDropoutKernel, + ops::CPUDropoutKernel); REGISTER_OP_CPU_KERNEL( dropout_grad, - ops::DropoutGradKernel); + ops::DropoutGradKernel, + ops::DropoutGradKernel); diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu index 1dd66e0280c46c0624ff70e822cb6fa6f06b7aa9..e011f47e086183a4ef3a3373c17acd6c21b6cf7e 100644 --- a/paddle/fluid/operators/dropout_op.cu +++ b/paddle/fluid/operators/dropout_op.cu @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include #include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/platform/float16.h" @@ -26,7 +27,8 @@ namespace operators { template __global__ void RandomGenerator(const size_t n, const int seed, const float dropout_prob, const T* src, - T* mask_data, T* dst) { + T* mask_data, T* dst, + bool is_upscale_in_train) { thrust::minstd_rand rng; rng.seed(seed); thrust::uniform_real_distribution dist(0, 1); @@ -47,7 +49,11 @@ __global__ void RandomGenerator(const size_t n, const int seed, if (dist(rng) < dropout_prob) { mask = static_cast(0); } else { - mask = static_cast(1); + if (is_upscale_in_train) { + mask = static_cast(1.0f / (1.0f - dropout_prob)); + } else { + mask = static_cast(1); + } } dest = s * mask; mask_data[idx] = mask; @@ -67,6 +73,8 @@ class GPUDropoutKernel : public framework::OpKernel { y->mutable_data(context.GetPlace()); float dropout_prob = context.Attr("dropout_prob"); + auto dropout_implementation = + context.Attr("dropout_implementation"); auto& place = *context.template device_context().eigen_device(); if (!context.Attr("is_test")) { auto* mask = context.Output("Mask"); @@ -83,11 +91,16 @@ class GPUDropoutKernel : public framework::OpKernel { int grid = (x->numel() + threads - 1) / threads; RandomGenerator< T><<>>( - size, seed, dropout_prob, x_data, mask_data, y_data); + size, seed, dropout_prob, x_data, mask_data, y_data, + (dropout_implementation == "upscale_in_train")); } else { auto X = EigenMatrix::Reshape(*x, 1); auto Y = EigenMatrix::Reshape(*y, 1); - Y.device(place) = X * static_cast(1.0f - dropout_prob); + if (dropout_implementation == "upscale_in_train") { + Y.device(place) = X; + } else { + Y.device(place) = X * static_cast(1.0f - dropout_prob); + } } } }; @@ -99,6 +112,8 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( dropout, ops::GPUDropoutKernel, - ops::GPUDropoutKernel); -REGISTER_OP_CUDA_KERNEL(dropout_grad, - ops::DropoutGradKernel); + ops::GPUDropoutKernel, + ops::GPUDropoutKernel); +REGISTER_OP_CUDA_KERNEL( + dropout_grad, ops::DropoutGradKernel, + ops::DropoutGradKernel); diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h index 0628b4b826d2730a8e3fb4842e4ae550b8c00569..6c629b7b6d255828023ed25680675ca104a33e12 100644 --- a/paddle/fluid/operators/dropout_op.h +++ b/paddle/fluid/operators/dropout_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -36,6 +37,8 @@ class CPUDropoutKernel : public framework::OpKernel { auto* y_data = y->mutable_data(context.GetPlace()); float dropout_prob = context.Attr("dropout_prob"); + auto dropout_implementation = + context.Attr("dropout_implementation"); if (!context.Attr("is_test")) { auto* mask = context.Output("Mask"); auto* mask_data = mask->mutable_data(context.GetPlace()); @@ -49,14 +52,20 @@ class CPUDropoutKernel : public framework::OpKernel { engine.seed(seed); std::uniform_real_distribution dist(0, 1); + size_t size = framework::product(mask->dims()); for (size_t i = 0; i < size; ++i) { if (dist(engine) < dropout_prob) { mask_data[i] = 0; y_data[i] = 0; } else { - mask_data[i] = 1; - y_data[i] = x_data[i]; + if (dropout_implementation == "upscale_in_train") { + mask_data[i] = 1.0f / static_cast(1.0f - dropout_prob); + y_data[i] = x_data[i] / static_cast(1.0f - dropout_prob); + } else { + mask_data[i] = 1; + y_data[i] = x_data[i]; + } } } } else { @@ -64,7 +73,11 @@ class CPUDropoutKernel : public framework::OpKernel { auto Y = EigenMatrix::Reshape(*y, 1); auto& place = *context.template device_context().eigen_device(); - Y.device(place) = X * (1.0f - dropout_prob); + if (dropout_implementation == "upscale_in_train") { + Y.device(place) = X; + } else { + Y.device(place) = X * static_cast(1.0f - dropout_prob); + } } } }; diff --git a/paddle/fluid/operators/math/jit_kernel_rnn.cc b/paddle/fluid/operators/math/jit_kernel_rnn.cc index c0847f0bee415393255983468349ad314716bcb3..fab293f7d03eb923995fa4cd99af955a34faa6a4 100644 --- a/paddle/fluid/operators/math/jit_kernel_rnn.cc +++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc @@ -136,6 +136,7 @@ static std::shared_ptr> GetActKernel( return nullptr; } +#ifdef __AVX__ template static std::unique_ptr GetAVXAct(const std::string& type) { if (type == "sigmoid") { @@ -150,6 +151,7 @@ static std::unique_ptr GetAVXAct(const std::string& type) { PADDLE_THROW("Not support type: %s", type); return nullptr; } +#endif /* LSTM JitKernel */ template diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.cc b/paddle/fluid/operators/softmax_cudnn_op.cu.cc index 2bdb23e999621b10799b5163f326bc4b66a437e6..f6e241af0634650f4a32be6a4547617f8ec3ee60 100644 --- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc +++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc @@ -76,6 +76,8 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace, ops::SoftmaxCUDNNKernel, + ops::SoftmaxCUDNNKernel, ops::SoftmaxCUDNNKernel); REGISTER_OP_KERNEL(softmax_grad, CUDNN, plat::CUDAPlace, - ops::SoftmaxGradCUDNNKernel); + ops::SoftmaxGradCUDNNKernel, + ops::SoftmaxGradCUDNNKernel); diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index 6a9fc6611a8f8eaa6749aefac0673ccabaebbcfe..bbd71db6062107f6ba40343c84d942b54b3958e6 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -210,18 +210,21 @@ REGISTER_OPERATOR(transpose, ops::TransposeOp, ops::TransposeOpMaker, REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad); REGISTER_OP_CPU_KERNEL( - transpose, ops::TransposeKernel); + transpose, ops::TransposeKernel, + ops::TransposeKernel); REGISTER_OP_CPU_KERNEL( transpose_grad, - ops::TransposeGradKernel); + ops::TransposeGradKernel, + ops::TransposeGradKernel); REGISTER_OPERATOR(transpose2, ops::Transpose2Op, ops::Transpose2OpMaker, ops::Transpose2GradMaker); REGISTER_OPERATOR(transpose2_grad, ops::Transpose2OpGrad); REGISTER_OP_CPU_KERNEL( - transpose2, - ops::TransposeKernel); + transpose2, ops::TransposeKernel, + ops::TransposeKernel); REGISTER_OP_CPU_KERNEL( transpose2_grad, - ops::TransposeGradKernel); + ops::TransposeGradKernel, + ops::TransposeGradKernel); diff --git a/paddle/fluid/operators/transpose_op.cu.cc b/paddle/fluid/operators/transpose_op.cu.cc index c1b5a8b31be243fab3af06a18c8e51986c953700..b4025350fa9f3610bde43eee91cd059f3063813f 100644 --- a/paddle/fluid/operators/transpose_op.cu.cc +++ b/paddle/fluid/operators/transpose_op.cu.cc @@ -16,15 +16,18 @@ limitations under the License. */ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( - transpose, - ops::TransposeKernel); + transpose, ops::TransposeKernel, + ops::TransposeKernel); REGISTER_OP_CUDA_KERNEL( transpose_grad, - ops::TransposeGradKernel); + ops::TransposeGradKernel, + ops::TransposeGradKernel); REGISTER_OP_CUDA_KERNEL( transpose2, - ops::TransposeKernel); + ops::TransposeKernel, + ops::TransposeKernel); REGISTER_OP_CUDA_KERNEL( transpose2_grad, - ops::TransposeGradKernel); + ops::TransposeGradKernel, + ops::TransposeGradKernel); diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 4c24d0d6a7069c75c7b9b8245f4567ae8bfc2742..1738afe93e99f1de28bec2fb23be8e1a309d9288 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -272,7 +272,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): ) square = grad * grad - local_norm_var = layers.cast(layers.reduce_sum(input=square), 'float64') + local_norm_var = layers.reduce_sum(input=square) context[self.group_name].append(local_norm_var) self.context = context @@ -282,7 +282,6 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): if group_scale_name not in self.context: group_norm_var = layers.sums(input=self.context[self.group_name]) group_norm_var = layers.sqrt(x=group_norm_var) - group_norm_var = layers.cast(group_norm_var, 'float32') clip_var = self.context[self.group_name + "_clip"] group_scale_var = layers.elementwise_div( x=clip_var, @@ -333,7 +332,8 @@ def append_gradient_clip_ops(param_grads): for p, g in param_grads: if g is None: continue - with p.block.program._optimized_guard([p, g]): + with p.block.program._optimized_guard( + [p, g]), framework.name_scope('append_clip'): clip_attr = getattr(p, 'gradient_clip_attr', NullGradientClipAttr()) if clip_attr is None: clip_attr = NullGradientClipAttr() @@ -348,7 +348,8 @@ def append_gradient_clip_ops(param_grads): for p, g in param_grads: if g is None: continue - with p.block.program._optimized_guard([p, g]): + with p.block.program._optimized_guard( + [p, g]), framework.name_scope('append_graident_clip'): res.append(clip_attr._create_operators(param=p, grad=g)) return res diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index b07d0131a32c3f2744854a17b180ae714d532f80..fd03dff386cad21c727ca0f266fa1b37ad65b4ad 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1496,6 +1496,9 @@ class Program(object): >>> with program._optimized_guard([p,g]): >>> p = p - 0.001 * g """ + tmp_role = self._current_role + tmp_var = self._op_role_var + OpRole = core.op_proto_and_checker_maker.OpRole self._current_role = OpRole.Optimize self._op_role_var = [ @@ -1503,11 +1506,11 @@ class Program(object): for var in param_and_grads ] yield - self._op_role_var = [] - self._current_role = OpRole.Forward + self._op_role_var = tmp_var + self._current_role = tmp_role @contextlib.contextmanager - def _lr_schedule_guard(self): + def _lr_schedule_guard(self, is_with_opt=False): """ A with guard to set :code:`LRSched` :code:`OpRole` and :code:`OpRoleVar` automatically. The :code:`OpRoleVar` is @@ -1515,6 +1518,10 @@ class Program(object): Notes: This is a very low level API. Users should not use it directly. + Args: + is_with_opt: Only set to true if these ops a in the middle + of a bunch of optimize ops so that it can be treated + correctly. For example, sgd->lr_op->sgd->lr_op->sgd. Examples: @@ -1528,6 +1535,8 @@ class Program(object): OpRole = core.op_proto_and_checker_maker.OpRole self._current_role = OpRole.LRSched + if is_with_opt: + self._current_role = int(OpRole.LRSched) | int(OpRole.Optimize) # TODO(typhoonzero): how to set target learning rate var self._op_role_var = [] yield diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index d3b5f13b577099afb9545b5177fbf7e07f9a8e45..c762633c601950e06dbd29e20231e943054c27b3 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -981,7 +981,12 @@ def cos_sim(X, Y): return out -def dropout(x, dropout_prob, is_test=False, seed=None, name=None): +def dropout(x, + dropout_prob, + is_test=False, + seed=None, + name=None, + dropout_implementation="downgrade_in_infer"): """ Computes dropout. @@ -1001,6 +1006,21 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None): units will be dropped. DO NOT use a fixed seed in training. name (str|None): A name for this layer(optional). If set None, the layer will be named automatically. + dropout_implementation(string): ['downgrade_in_infer'(defauld)|'upscale_in_train'] + 1. downgrade_in_infer(default), downgrade the outcome at inference + train: out = input * mask + inference: out = input * dropout_prob + (make is a tensor same shape with input, value is 0 or 1 + ratio of 0 is dropout_prob) + 2. upscale_in_train, upscale the outcome at training time + train: out = input * mask / ( 1.0 - dropout_prob ) + inference: out = input + (make is a tensor same shape with input, value is 0 or 1 + ratio of 0 is dropout_prob) + dropout op can be removed from the program. + the program will be efficient + + Returns: Variable: A tensor variable is the shape with `x`. @@ -1030,7 +1050,8 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None): 'dropout_prob': dropout_prob, 'is_test': is_test, 'fix_seed': seed is not None, - 'seed': seed if seed is not None else 0 + 'seed': seed if seed is not None else 0, + 'dropout_implementation': dropout_implementation, }) return out @@ -4845,7 +4866,7 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1): return counter -def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): +def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): """ Gives a new shape to the input Tensor without changing its data. @@ -4893,15 +4914,22 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): :attr:`shape` specifying shape. That is to say :attr:`actual_shape` has a higher priority than :attr:`shape`. - act (str): The non-linear activation to be applied to output variable. - inplace(bool): If this flag is set true, the output - shares data with input without copying, otherwise - a new output tensor is created - whose data is copied from input x. + act (str): The non-linear activation to be applied to the reshaped tensor + variable. + inplace(bool): Must use :attr:`False` if :attr:`x` is used in multiple + operators. If this flag is set :attr:`True`, reuse input + :attr:`x` to reshape, which will change the shape of + tensor variable :attr:`x` and might cause errors when + :attr:`x` is used in multiple operators. If :attr:`False`, + preserve the shape :attr:`x` and create a new output tensor + variable whose data is copied from input x but reshaped. name (str): The name of this layer. It is optional. Returns: - Variable: The output tensor. + Variable: The reshaped tensor variable if :attr:`act` is None. It is a \ + new tensor variable if :attr:`inplace` is :attr:`False`, \ + otherwise it is :attr:`x`. If :attr:`act` is not None, return \ + the activated tensor variable. Raises: TypeError: if actual_shape is neither Variable nor None. @@ -4912,7 +4940,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): data = fluid.layers.data( name='data', shape=[2, 4, 6], dtype='float32') reshaped = fluid.layers.reshape( - x=data, shape=[-1, 0, 3, 2], act='tanh', inplace=True) + x=data, shape=[-1, 0, 3, 2], inplace=True) """ if not (isinstance(shape, list) or isinstance(shape, tuple)): @@ -4939,7 +4967,8 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): "except one unknown dimension.") helper = LayerHelper("reshape2", **locals()) - out = helper.create_variable_for_type_inference(dtype=x.dtype) + out = x if inplace else helper.create_variable_for_type_inference( + dtype=x.dtype) x_shape = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op( type="reshape2", diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 17af44afdde5cdbec082d473457ef01974695bc6..6ea280c733996436a6218cc6f759f2cf7c652ac9 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -111,7 +111,9 @@ class Optimizer(object): if param_lr == 1.0: return self._global_learning_rate() else: - with default_main_program()._lr_schedule_guard(): + with default_main_program()._lr_schedule_guard( + is_with_opt=True), framework.name_scope( + 'scale_with_param_lr'): return self._global_learning_rate() * param_lr def _create_accumulators(self, block, parameters): @@ -602,7 +604,8 @@ class AdamOptimizer(Optimizer): for param, grad in param_and_grads: if grad is None: continue - with param.block.program._optimized_guard([param, grad]): + with param.block.program._optimized_guard( + [param, grad]), name_scope("optimizer"): beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, param) beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, @@ -740,7 +743,8 @@ class AdamaxOptimizer(Optimizer): for param, grad in parameters_and_grads: if grad is None: continue - with param.block.program._optimized_guard([param, grad]): + with param.block.program._optimized_guard( + [param, grad]), name_scope('adamx'): beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, param) main_block.append_op( @@ -1279,7 +1283,8 @@ class ModelAverage(Optimizer): for param, grad in self.params_grads: if grad is None: continue - with param.block.program._optimized_guard([param, grad]): + with param.block.program._optimized_guard( + [param, grad]), name_scope('move_average'): self._append_average_accumulate_op(param) self.apply_program = Program() diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py index c151fbd17208bb6e3104e8d0f6590392c6095987..57185da4d1d38f3848994aae105411cf2844843a 100644 --- a/python/paddle/fluid/regularizer.py +++ b/python/paddle/fluid/regularizer.py @@ -47,7 +47,8 @@ def append_regularization_ops(parameters_and_grads, regularization=None): if grad is None: params_and_grads.append((param, grad)) continue - with param.block.program._optimized_guard([param, grad]): + with param.block.program._optimized_guard( + [param, grad]), framework.name_scope('regularization'): regularization_term = None if param.regularizer is not None: # Add variable for regularization term in grad block diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 68e498c6e82c6d2b389e2e08355b63db9de0bd14..cf54bc2dbe788f3757a7ef93f26156d118a0cd02 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -78,9 +78,9 @@ if(WITH_DISTRIBUTE) set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200) py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext) set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000) - - py_test_modules(test_dist_transformer MODULES test_dist_transformer) - set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000) + # FIXME(typhoonzero): add this back + #py_test_modules(test_dist_transformer MODULES test_dist_transformer) + #set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000) endif(NOT APPLE) py_test_modules(test_dist_transpiler MODULES test_dist_transpiler) endif() diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py index 0296bc2af4e0b79478c34b4cceab32b5a8a50f2f..be3c5f3b9558ec522803ed9a5acedea75cda6ccc 100644 --- a/python/paddle/fluid/tests/unittests/test_dropout_op.py +++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py @@ -85,6 +85,69 @@ class TestDropoutOp5(OpTest): self.check_output() +class TestDropoutOp6(TestDropoutOp): + def setUp(self): + self.op_type = "dropout" + self.inputs = {'X': np.random.random((32, 64)).astype("float32")} + self.attrs = { + 'dropout_prob': 1.0, + 'fix_seed': True, + 'is_test': False, + 'dropout_implementation': 'upscale_in_train' + } + self.outputs = { + 'Out': np.zeros((32, 64)).astype('float32'), + 'Mask': np.zeros((32, 64)).astype('float32') + } + + +class TestDropoutOp7(TestDropoutOp): + def setUp(self): + self.op_type = "dropout" + self.inputs = {'X': np.random.random((32, 64, 2)).astype("float32")} + self.attrs = { + 'dropout_prob': 0.0, + 'fix_seed': True, + 'is_test': False, + 'dropout_implementation': 'upscale_in_train' + } + self.outputs = { + 'Out': self.inputs['X'], + 'Mask': np.ones((32, 64, 2)).astype('float32') + } + + +class TestDropoutOp8(OpTest): + def setUp(self): + self.op_type = "dropout" + self.inputs = {'X': np.random.random((32, 64)).astype("float32")} + self.attrs = { + 'dropout_prob': 0.35, + 'fix_seed': True, + 'is_test': True, + 'dropout_implementation': 'upscale_in_train' + } + self.outputs = {'Out': self.inputs['X']} + + def test_check_output(self): + self.check_output() + + +class TestDropoutOp9(OpTest): + def setUp(self): + self.op_type = "dropout" + self.inputs = {'X': np.random.random((32, 64, 3)).astype("float32")} + self.attrs = { + 'dropout_prob': 0.75, + 'is_test': True, + 'dropout_implementation': 'upscale_in_train' + } + self.outputs = {'Out': self.inputs['X']} + + def test_check_output(self): + self.check_output() + + class TestFP16DropoutOp(OpTest): def setUp(self): self.op_type = "dropout" diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 2192139f8d5950286691a77333dd8ec35505b033..28d7df8e45d4f3209fb6205b76f8d1be8e73392b 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -49,6 +49,7 @@ LOOKUP_TABLE_GRAD_TYPE = "lookup_table_grad" OP_ROLE_VAR_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleVarAttrName() RPC_OP_ROLE_ATTR_NAME = op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName( ) +OPT_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.Optimize RPC_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.RPC DIST_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.Dist LR_SCHED_OP_ROLE_ATTR_VALUE = core.op_proto_and_checker_maker.OpRole.LRSched @@ -1717,8 +1718,10 @@ to transpile() call.") lr_ops = [] block = self.origin_program.global_block() for op in block.ops: - if int(op.attr(RPC_OP_ROLE_ATTR_NAME)) == int( - LR_SCHED_OP_ROLE_ATTR_VALUE): + role_id = int(op.attr(RPC_OP_ROLE_ATTR_NAME)) + if role_id == int(LR_SCHED_OP_ROLE_ATTR_VALUE) or \ + role_id == int(LR_SCHED_OP_ROLE_ATTR_VALUE) | \ + int(OPT_OP_ROLE_ATTR_VALUE): lr_ops.append(op) log("append lr op: ", op.type) return lr_ops