diff --git a/doc/fluid/howto/cluster/fluid_cluster_train_cn.md b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..1b6f767869aaa800c122c8e7a06a1413e48e10e0 --- /dev/null +++ b/doc/fluid/howto/cluster/fluid_cluster_train_cn.md @@ -0,0 +1,145 @@ +# Fluid 分布式版本使用指南 +本篇文章将说明如何在PaddlePaddle Fluid版本下进行分布式训练的配置和执行,以及将单机训练脚本改造成支持集群训练的版本 + +## 准备工作 +* 可用的集群 + + 包含一个或多个计算节点的集群,每一个节点都能够执行PaddlePaddle的训练任务且拥有唯一的IP地址,集群内的所有计算节点可以通过网络相互通信。 +* 安装PaddlePaddle Fluid with Distribution版本 + + 所有的计算节点上均需要按照分布式版本的PaddlePaddle, 在用于GPU等设备的机器上还需要额外安装好相应的驱动程序和CUDA的库。 + + **注意:**当前对外提供的PaddlePaddle版本并不支持分布式,需要通过源码重新编译。编译和安装方法参见[编译和安装指南](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html)。 + cmake编译命令中需要将WITH_DISTRIBUTE设置为ON,下面是一个cmake编译指令示例: +``` bash +cmake .. -DWITH_DOC=OFF -DWITH_GPU=OFF -DWITH_DISTRIBUTE=ON -DWITH_SWIG_PY=ON -DWITH_PYTHON=ON +``` + +## 更新训练脚本 +这里,我们以[Deep Learing 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)课程中的第一章 fit a line 为例,描述如何将单机训练脚本改造成支持集群训练的版本。 +### 单机训练脚本示例 +```python +import paddle.v2 as paddle +import paddle.fluid as fluid + +x = fluid.layers.data(name='x', shape=[13], dtype='float32') +y_predict = fluid.layers.fc(input=x, size=1, act=None) +y = fluid.layers.data(name='y', shape=[1], dtype='float32') + +cost = fluid.layers.square_error_cost(input=y_predict, label=y) +avg_cost = fluid.layers.mean(x=cost) + +sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) +sgd_optimizer.minimize(avg_cost) + +BATCH_SIZE = 20 + +train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.uci_housing.train(), buf_size=500), + batch_size=BATCH_SIZE) + +place = fluid.CPUPlace() +feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) +exe = fluid.Executor(place) + +exe.run(fluid.default_startup_program()) + +PASS_NUM = 100 +for pass_id in range(PASS_NUM): + fluid.io.save_persistables(exe, "./fit_a_line.model/") + fluid.io.load_persistables(exe, "./fit_a_line.model/") + for data in train_reader(): + avg_loss_value, = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost]) + + if avg_loss_value[0] < 10.0: + exit(0) # if avg cost less than 10.0, we think our code is good. +exit(1) +``` + +我们创建了一个简单的全连接神经网络程序,并且通过Fluid的Executor执行了100次迭代,现在我们需要将该单机版本的程序更新为分布式版本的程序。 +### 介绍Parameter Server +在非分布式版本的训练脚本中,只存在Trainer一种角色,它不仅处理常规的计算任务,也处理参数相关的计算、保存和优化任务。在分布式版本的训练过程中,由于存在多个Trainer节点进行同样的数据计算任务,因此需要有一个中心化的节点来统一处理参数相关的保存和分配。在PaddlePaddle中,我们称这样的节点为[Parameter Server](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/design/dist_train/parameter_server.md) + +**因此,在分布式的Fluid环境中,我们有两个角色需要创建,分别是Parameter Server和Trainer。** + +### 分布式训练 +Fliud专门提供了工具[Distributed Transpiler](https://github.com/PaddlePaddle/Paddle/blob/ba65d54d9d3b41cd3c5171b00f476d4e60133ddb/doc/fluid/design/dist_train/distributed_architecture.md#distributed-transpiler)用于将单机版的训练程序转换为分布式版本的训练程序。工具背后的理念是找出程序的优化算子和梯度参数,将他们分隔为两部分,通过send/recv 操作算子进行连接,优化算子和梯度参数可以在优化器的minimize函数的返回值中获取到。 +```python +optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) +``` +将Distributed Transpiler、优化算子和梯度函数放在一个代码中如下: +```python +... #define the program, cost, and create sgd optimizer + +optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) #get optimize OPs and gradient parameters + +t = fluid.DistributeTranspiler() # create the transpiler instance +# slice the program into 2 pieces with optimizer_ops and gradient parameters list, as well as pserver_endpoints, which is a comma separated list of [IP:PORT] and number of trainers +t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2) + +... #create executor + +# in pserver, run this +#current_endpoint here means current pserver IP:PORT you wish to run on +pserver_prog = t.get_pserver_program(current_endpoint) +pserver_startup = t.get_startup_program(current_endpoint, pserver_prog) +exe.run(pserver_startup) +exe.run(pserver_prog) + +# in trainer, run this +... # define data reader +exe.run(fluid.default_startup_program()) +for pass_id in range(100): + for data in train_reader(): + exe.run(t.get_trainer_program()) +``` +### 分布式训练脚本运行说明 +分布式任务的运行需要将表格中说明的多个参数进行赋值: + +| 参数名 | 值类型 | 说明 | 示例 | +|:-------------|:------|:---------------------------------------|:-------------| +| trainer_id | int | 当前训练节点的ID,训练节点ID编号为0 - n-1, n为trainers的值 | 0/1/2/3 | +| pservers | str | parameter server 列表 | 127.0.0.1:6710,127.0.0.1:6711 | +| trainers | int | 训练节点的总个数,>0的数字 | 4 | +| server_endpoint | str | 当前所起的服务节点的IP:PORT | 127.0.0.1:8789 | +| training_role | str | 节点角色, TRAINER/PSERVER | PSERVER | + +**注意:** ```training_role```是用来区分当前所起服务的角色的,用于训练程序中,用户可根据需要自行定义,其他参数为fluid.DistributeTranspiler的transpile函数所需要,需要在调用函数前进行定义,样例如下: + +```python +t = fluid.DistributeTranspiler() +t.transpile( + optimize_ops, + params_grads, + trainer_id, + pservers=pserver, + trainers=trainers) +if training_role == "PSERVER": + pserver_prog = t.get_pserver_program(server_endpoint) + pserver_startup = t.get_startup_program(server_endpoint, pserver_prog) +``` + +### Demo +完整的demo代码位于Fluid的test目录下的[book](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/tests/book/test_fit_a_line.py)中。 + +第一步,进入demo代码所在目录: +```bash +cd /paddle/python/paddle/fluid/tests/book +``` + +第二步,启动Parameter Server: +```bash +PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.2 TRAINERS=2 POD_IP=192.168.1.2 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=PSERVER python test_fit_a_line.py +``` +执行命令后请等待出现提示: ```Server listening on 192.168.1.2:6174 ```, 表示Paramter Server已经正常启动。 + +第三步,启动Trainer: +```bash +PADDLE_INIT_PORT=6174 PADDLE_INIT_PSERVERS=192.168.1.3 TRAINERS=2 POD_IP=192.168.1.3 PADDLE_INIT_TRAINER_ID=1 TRAINING_ROLE=TRAINER python test_fit_a_line.py +``` +由于我们定义的Trainer的数量是2个,因此需要在另外一个计算节点上再启动一个Trainer。 + +现在我们就启动了一个包含一个Parameter Server和两个Trainer的分布式训练任务。 diff --git a/doc/v2/faq/index_en.rst b/doc/v2/faq/index_en.rst index 57df868f760038b25fae30df7ab20a68875ad36a..3fa220792b252617848a1c76bc2be49928e35f64 100644 --- a/doc/v2/faq/index_en.rst +++ b/doc/v2/faq/index_en.rst @@ -1,7 +1,8 @@ FAQ ==== - +This document provides answers to some of the frequently asked questions about PaddlePaddle. If you have a question that is not covered here, please go to `PaddlePaddle Community `_ , to find an answer or submit new `issue `_ , we will reply in time. + .. toctree:: :maxdepth: 1 diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index d74c47b981e51f12d99098818c71f3f6ec455d98..ec637658c03ad94624ee9a4f5def6a84387d293e 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -613,3 +613,14 @@ REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad, ops::grad_functor>); FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL); + +REGISTER_OP_CPU_KERNEL(relu, + ops::ActivationKernel>, + ops::ActivationKernel>); +REGISTER_OP_CPU_KERNEL( + relu_grad, ops::ActivationGradKernel>, + ops::ActivationGradKernel>); diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index b2633d017623c3a6a3bab2b416009d6d7c8fc1d4..7709a551dc155e1f3cd2a19a689999608f497beb 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -14,6 +14,7 @@ limitations under the License. */ #define EIGEN_USE_GPU #include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/platform/float16.h" namespace ops = paddle::operators; @@ -31,3 +32,16 @@ namespace ops = paddle::operators; ops::grad_functor>); FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL); + +REGISTER_OP_CUDA_KERNEL( + relu, ops::ActivationKernel>, + ops::ActivationKernel>, + ops::ActivationKernel>); +REGISTER_OP_CUDA_KERNEL( + relu_grad, ops::ActivationGradKernel>, + ops::ActivationGradKernel>); diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 8f791a6ca81c13a92fd8adf0d1620203bd4cf7d6..b95e793586219b7c413d0c7adb835081874d9363 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -772,7 +772,6 @@ struct SwishGradFunctor : public BaseActivationFunctor { __macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor); \ __macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ __macro(exp, ExpFunctor, ExpGradFunctor); \ - __macro(relu, ReluFunctor, ReluGradFunctor); \ __macro(tanh, TanhFunctor, TanhGradFunctor); \ __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ __macro(sqrt, SqrtFunctor, SqrtGradFunctor); \ diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c95077fcbdb6b6c0da31f30b795dbe4d7d4fe6fe --- /dev/null +++ b/paddle/fluid/operators/average_accumulates_op.cc @@ -0,0 +1,216 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/average_accumulates_op.h" + +namespace paddle { +namespace operators { + +template <> +void GetAccumulators( + const framework::ExecutionContext& ctx, int64_t& num_updates_, + int64_t& num_accumulates_, int64_t& old_num_accumulates_) { + auto* in_old_num_accumulates = ctx.Input("in_old_num_accumulates"); + auto* in_num_accumulates = ctx.Input("in_num_accumulates"); + auto* in_num_updates = ctx.Input("in_num_updates"); + + old_num_accumulates_ = in_old_num_accumulates->data()[0]; + num_accumulates_ = in_num_accumulates->data()[0]; + num_updates_ = in_num_updates->data()[0]; +} + +template <> +void SetAccumulators( + const framework::ExecutionContext& ctx, int64_t num_updates_, + int64_t num_accumulates_, int64_t old_num_accumulates_) { + auto* out_old_num_accumulates = ctx.Output("out_old_num_accumulates"); + auto* out_num_accumulates = ctx.Output("out_num_accumulates"); + auto* out_num_updates = ctx.Output("out_num_updates"); + + out_old_num_accumulates->data()[0] = old_num_accumulates_; + out_num_accumulates->data()[0] = num_accumulates_; + out_num_updates->data()[0] = num_updates_; +} + +class AverageAccumulatesOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE( + ctx->HasInput("param"), + "Input (param) of average_accumulates op should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("in_sum_1"), + "Input (sum_1) of average_accumulates op should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("in_sum_2"), + "Input (sum_2) of average_accumulates op should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("in_sum_3"), + "Input (sum_3) of average_accumulates op should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("in_num_accumulates"), + "Input (in_num_accumulates) of average_accumulates op should " + "not be null."); + PADDLE_ENFORCE(ctx->HasInput("in_old_num_accumulates"), + "Input (old_num_accumulates) of average_accumulates op " + "should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("in_num_updates"), + "Input (num_updates) of average_accumulates op should not be null."); + + PADDLE_ENFORCE( + ctx->HasOutput("out_sum_1"), + "Output (sum_1) of average_accumulates op should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("out_sum_2"), + "Output (sum_2) of average_accumulates op should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("out_sum_3"), + "Output (sum_3) of average_accumulates op should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("out_num_accumulates"), + "Output (num_accumulates) of average_accumulates op should " + "not be null."); + PADDLE_ENFORCE(ctx->HasOutput("out_old_num_accumulates"), + "Output (old_num_accumulates) of average_accumulates op " + "should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("out_num_updates"), + "Output (num_updates) of average_accumulates op should not be null."); + + auto in_dim = ctx->GetInputDim("param"); + + ctx->SetOutputDim("out_sum_1", in_dim); + ctx->SetOutputDim("out_sum_2", in_dim); + ctx->SetOutputDim("out_sum_3", in_dim); + ctx->SetOutputDim("out_num_accumulates", {1}); + ctx->SetOutputDim("out_old_num_accumulates", {1}); + ctx->SetOutputDim("out_num_updates", {1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("param")->type()), + ctx.GetPlace()); + } +}; + +class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker { + public: + AverageAccumulatesOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("param", "(Tensor), The parameter to be accumulated."); + AddInput("in_sum_1", + "(Tensor), A tensor used to store the parameter " + "sums with the same shape as input(param)."); + AddInput("in_sum_2", + "(Tensor), A auxiliary tensor to help " + "accumulating sums of parameter values with the same shape as " + "input(param). It is used to avoid loss of precision due to too " + "many sums."); + AddInput("in_sum_3", + "(Tensor), A auxiliary tensor to help " + "accumulating sums of parameter values with the same shape as " + "input(param)."); + AddInput("in_num_accumulates", + "(Tensor), The accumulating times of current window with " + "shape [1]."); + AddInput( + "in_old_num_accumulates", + "(Tensor), The accumulating times of previous window with " + "shape [1]."); + AddInput("in_num_updates", + "(Tensor), The total number of batches used by trainning " + "before this batch with shape [1]."); + + AddOutput("out_sum_1", + "(Tensor), A tensor used to store the " + "parameter sums with the same shape as input(param)."); + AddOutput("out_sum_2", + "(Tensor), A auxiliary tensor to help " + "accumulating sums of parameter values with the same shape as " + "input(param). It is used to avoid loss of precision due to too " + "many sums."); + AddOutput("out_sum_3", + "(Tensor), A auxiliary tensor to help " + "accumulating sums of parameter values with the same shape as " + "input(param)."); + AddOutput( + "out_num_accumulates", + "(Tensor), The accumulating times of current window with " + "shape [1]."); + AddOutput( + "out_old_num_accumulates", + "(Tensor) The accumulating times of previous window with " + "shape [1]."); + AddOutput( + "out_num_updates", + "(Tensor), The total number of batches used by trainning " + "before this batch with shape [1]."); + + AddAttr("average_window", + "(float, default 0) " + "The rate of average window size relative to num_updates.") + .SetDefault(0); + AddAttr("max_average_window", + "(int64_t) " + "Maximum size of average window. It suggests that the " + "number of mini-batches " + "in one pass is appropriate value to set."); + AddAttr("min_average_window", + "(int64_t, default 10000L) " + "Minimu size of average window.") + .SetDefault(10000L); + + AddComment(R"DOC( +AverageAccumulates Operator. +Accumulate the sum of parameter whtin sliding window. The size of sliding window is +determined by 'average_window', 'max_average_window' and 'min_average_window'. +Memory was shared by Input(in_sum_1) and Output(out_sum_1) which acts as an accumulator 'sum_1'. +'sum_2', 'sum_3', 'num_accumulates', 'old_num_accumulates' and 'num_updates' were the same as 'sum_1'. + +All the accumulators were inited to zero before training. + +And for a mini-batch in training, accumulators were computed as below steps: + num_updates += 1 + num_accumulates += 1 + sum_1 += param + if num_updates % kMaxNumAccumulates == 0: + sum_2 += sum_1 + sum_1 = 0 + if num_accumulates >= min_average_window && num_accumulates >= min(max_average_window, num_updates * average_window): + sum_3 = sum_1 + sum_2 + sum_1 = 0 + sum_2 = 0 + old_num_accumulates = num_accumulates + num_accumulates = 0 + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(average_accumulates, ops::AverageAccumulatesOp, + ops::AverageAccumulatesOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + average_accumulates, + ops::AverageAccumulatesKernel, + ops::AverageAccumulatesKernel); diff --git a/paddle/fluid/operators/average_accumulates_op.cu b/paddle/fluid/operators/average_accumulates_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..270c46984465e5ca62eaa8da3955ce7a3eaa0c57 --- /dev/null +++ b/paddle/fluid/operators/average_accumulates_op.cu @@ -0,0 +1,63 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/average_accumulates_op.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace operators { +template <> +void GetAccumulators( + const framework::ExecutionContext& ctx, int64_t& num_updates_, + int64_t& num_accumulates_, int64_t& old_num_accumulates_) { + auto* in_old_num_accumulates = ctx.Input("in_old_num_accumulates"); + auto* in_num_accumulates = ctx.Input("in_num_accumulates"); + auto* in_num_updates = ctx.Input("in_num_updates"); + auto stream = ctx.cuda_device_context().stream(); + memory::Copy(platform::CPUPlace(), &old_num_accumulates_, + platform::CUDAPlace(), in_old_num_accumulates->data(), + sizeof(int64_t), stream); + memory::Copy(platform::CPUPlace(), &num_accumulates_, platform::CUDAPlace(), + in_num_accumulates->data(), sizeof(int64_t), stream); + memory::Copy(platform::CPUPlace(), &num_updates_, platform::CUDAPlace(), + in_num_updates->data(), sizeof(int64_t), stream); +} + +template <> +void SetAccumulators( + const framework::ExecutionContext& ctx, int64_t num_updates_, + int64_t num_accumulates_, int64_t old_num_accumulates_) { + auto stream = ctx.cuda_device_context().stream(); + auto* out_old_num_accumulates = ctx.Output("out_old_num_accumulates"); + auto* out_num_accumulates = ctx.Output("out_num_accumulates"); + auto* out_num_updates = ctx.Output("out_num_updates"); + + memory::Copy(platform::CUDAPlace(), out_old_num_accumulates->data(), + platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t), + stream); + memory::Copy(platform::CUDAPlace(), out_num_accumulates->data(), + platform::CPUPlace(), &num_accumulates_, sizeof(int64_t), + stream); + memory::Copy(platform::CUDAPlace(), out_num_updates->data(), + platform::CPUPlace(), &num_updates_, sizeof(int64_t), stream); +} + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + average_accumulates, + ops::AverageAccumulatesKernel, + ops::AverageAccumulatesKernel); diff --git a/paddle/fluid/operators/average_accumulates_op.h b/paddle/fluid/operators/average_accumulates_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f858109d1428dc67d94c253e5a39818eb2d4560d --- /dev/null +++ b/paddle/fluid/operators/average_accumulates_op.h @@ -0,0 +1,113 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +using EigenVector = framework::EigenVector; + +template +void GetAccumulators(const framework::ExecutionContext& ctx, + int64_t& num_updates, int64_t& num_accumulates, + int64_t& old_num_accumulates); + +template +void SetAccumulators(const framework::ExecutionContext& ctx, + int64_t num_updates, int64_t num_accumulates, + int64_t old_num_accumulates); + +template +class AverageAccumulatesKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // It is used to avoid loss of precision + static const int64_t kMaxNumAccumulates = 16384; + // Get accumulators from input + int64_t num_updates = 0; + int64_t num_accumulates = 0; + int64_t old_num_accumulates = 0; + GetAccumulators(ctx, num_updates, num_accumulates, + old_num_accumulates); + + // Get attrs + float average_window = ctx.Attr("average_window"); + int64_t max_average_window = ctx.Attr("max_average_window"); + int64_t min_average_window = ctx.Attr("min_average_window"); + min_average_window = + std::min(min_average_window, max_average_window); + + // Get inputs + auto* param = ctx.Input("param"); + auto* in_sum_1 = ctx.Input("in_sum_1"); + auto* in_sum_2 = ctx.Input("in_sum_2"); + auto* in_sum_3 = ctx.Input("in_sum_3"); + auto param_tensor = EigenVector::Flatten(*param); + auto in_sum_1_tensor = EigenVector::Flatten(*in_sum_1); + auto in_sum_2_tensor = EigenVector::Flatten(*in_sum_2); + auto in_sum_3_tensor = EigenVector::Flatten(*in_sum_3); + + // Get outputs + auto* out_sum_1 = ctx.Output("out_sum_1"); + auto* out_sum_2 = ctx.Output("out_sum_2"); + auto* out_sum_3 = ctx.Output("out_sum_3"); + auto out_sum_1_tensor = EigenVector::Flatten(*out_sum_1); + auto out_sum_2_tensor = EigenVector::Flatten(*out_sum_2); + auto out_sum_3_tensor = EigenVector::Flatten(*out_sum_3); + + // Compute + auto& place = *ctx.template device_context().eigen_device(); + math::SetConstant constant_functor; + ++num_updates; + ++num_accumulates; + out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor; + out_sum_2_tensor.device(place) = in_sum_2_tensor; + out_sum_3_tensor.device(place) = in_sum_3_tensor; + if (num_updates % kMaxNumAccumulates == 0) { + // Move the sum to a different buffer to avoid loss of precision due to + // too many sums. + out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor; + constant_functor(ctx.template device_context(), out_sum_1, + 0.0); + } + if (num_accumulates >= min_average_window && + num_accumulates >= std::min(max_average_window, + num_updates * average_window)) { + // Now the average window is too long, discard the old sum. + out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor; + constant_functor(ctx.template device_context(), out_sum_1, + 0.0); + constant_functor(ctx.template device_context(), out_sum_2, + 0.0); + old_num_accumulates = num_accumulates; + num_accumulates = 0; + } + + // Set accumulators to output + SetAccumulators(ctx, num_updates, num_accumulates, + old_num_accumulates); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h index ec315695a68befc2e3de798fdb3fa146a903aaff..6da3a24dc89a85fe432b6350d3af7b0e84337c9d 100644 --- a/paddle/fluid/operators/cross_entropy_op.h +++ b/paddle/fluid/operators/cross_entropy_op.h @@ -78,7 +78,7 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel { for (int64_t i = 0; i < batch_size; ++i) { PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num); int64_t index = i * class_num + label_data[i]; - dx_data[index] = -dy_data[i] / x_data[index]; + dx_data[index] = math::TolerableValue()(-dy_data[i] / x_data[index]); } } } diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index 34ea6a91ce7743462d378cf471a5ec3a12ca51d1..5518ebed3f792a5acdfbb27976bc2c6dbd78069a 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -89,6 +89,7 @@ void SoftmaxGradCUDNNFunctor::operator()( XGrad->mutable_data(context.GetPlace()))); } +template class SoftmaxCUDNNFunctor; template class SoftmaxCUDNNFunctor; template class SoftmaxCUDNNFunctor; template class SoftmaxGradCUDNNFunctor; diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.cc b/paddle/fluid/operators/softmax_cudnn_op.cu.cc index 47cb336d87f8627d86ac33d6ac32c04d5d93f753..5596fa0648ccc151bc0d11de9c556599428a8d71 100644 --- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc +++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc @@ -56,7 +56,9 @@ class SoftmaxGradCUDNNKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_KERNEL(softmax, CUDNN, ::paddle::platform::CUDAPlace, - ops::SoftmaxCUDNNKernel); -REGISTER_OP_KERNEL(softmax_grad, CUDNN, ::paddle::platform::CUDAPlace, +namespace plat = paddle::platform; +REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace, + ops::SoftmaxCUDNNKernel, + ops::SoftmaxCUDNNKernel); +REGISTER_OP_KERNEL(softmax_grad, CUDNN, plat::CUDAPlace, ops::SoftmaxGradCUDNNKernel); diff --git a/paddle/fluid/operators/softmax_mkldnn_op.cc b/paddle/fluid/operators/softmax_mkldnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..cf0244e8662e827a90d8472a097315680579ff6d --- /dev/null +++ b/paddle/fluid/operators/softmax_mkldnn_op.cc @@ -0,0 +1,84 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "mkldnn.hpp" +#include "paddle/fluid/operators/softmax_op.h" +#include "paddle/fluid/platform/mkldnn_helper.h" + +#include + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; +using paddle::platform::MKLDNNDeviceContext; +using paddle::platform::MKLDNNMemDesc; + +using mkldnn::memory; // Note: paddle has also "memory" namespace +using mkldnn::primitive; +using mkldnn::softmax_forward; +using mkldnn::prop_kind; +using mkldnn::stream; + +template +class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + auto& dev_ctx = ctx.template device_context(); + auto mkldnn_engine = dev_ctx.GetEngine(); + const Tensor* input = ctx.Input("X"); + Tensor* output = ctx.Output("Out"); + PADDLE_ENFORCE(input->dims().size() == 2UL, + "The input of softmax op must be a 2D matrix."); + const T* input_data = input->data(); + // allocate memory for output + T* output_data = output->mutable_data(ctx.GetPlace()); + std::vector src_tz = paddle::framework::vectorize2int(input->dims()); + std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + // MKL-DNN does support softmax over selected axis. Having 2D Tensor, + // we will make normalization after final eg. axis: 1 + PADDLE_ENFORCE(((src_tz[0] == dst_tz[0]) && (src_tz[1] == dst_tz[1])), + "Softmax input and output dimensions should match"); + // Same memory descriptor to be used for input and output + memory::dims softmax_tz = {src_tz[0], src_tz[1]}; + // Currently only supports NC data format + // TODO(jczaja-intel): support more formats + auto softmax_md = + MKLDNNMemDesc({softmax_tz}, memory::f32, memory::format::nc); + // Normalization is made after innermost dimension eg. C out of NC + auto softmax_desc = softmax_forward::desc(prop_kind::forward_scoring, + softmax_md, 1 /*dim: C*/); + // create memory primitives + auto softmax_src_memory = + memory({softmax_md, mkldnn_engine}, (void*)input_data); + auto softmax_dst_memory = + memory({softmax_md, mkldnn_engine}, (void*)output_data); + auto softmax_prim_desc = + softmax_forward::primitive_desc(softmax_desc, mkldnn_engine); + auto softmax = softmax_forward(softmax_prim_desc, softmax_src_memory, + softmax_dst_memory); + std::vector pipeline{softmax}; + stream(stream::kind::eager).submit(pipeline).wait(); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(softmax, MKLDNN, ::paddle::platform::CPUPlace, + ops::SoftmaxMKLDNNKernel); diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 1b63f8a499e5d20d2f10c3cd1024d1bcf78764d4..e2c0f915d96b7746191572fa27b725d90cb6e2e5 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -13,7 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/softmax_op.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cudnn_helper.h" +#endif +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif namespace paddle { namespace operators { @@ -38,26 +44,32 @@ class SoftmaxOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { // choose cudnn kernel if the runtime supported. - bool use_cudnn = ctx.Attr("use_cudnn"); - bool runtime_cudnn_support = false; + framework::LibraryType library_{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if (platform::is_gpu_place(ctx.GetPlace())) { - auto& dev_ctx = - ctx.template device_context(); - runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false; + if (platform::CanCUDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kCUDNN; } #endif - framework::LibraryType library_ = framework::LibraryType::kPlain; - if (use_cudnn && runtime_cudnn_support) { - library_ = framework::LibraryType::kCUDNN; +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; } +#endif + + auto input_data_type = + framework::ToDataType(ctx.Input("X")->type()); + if (input_data_type == framework::proto::VarType::FP16) { + PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN, + "float16 can only be used when CUDNN is used"); + } + std::string data_format = ctx.Attr("data_format"); - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), - framework::StringToDataLayout(data_format), library_); + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::StringToDataLayout(data_format), + library_); } }; - class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { public: SoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker) @@ -77,6 +89,9 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { "Defaults to \"NHWC\". Specify the data format of the output data, " "the input will be transformed automatically. ") .SetDefault("AnyLayout"); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddComment(R"DOC( Softmax Operator. @@ -119,19 +134,12 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { // choose cudnn kernel if the runtime supported. - bool use_cudnn = ctx.Attr("use_cudnn"); - bool runtime_cudnn_support = false; + framework::LibraryType library_{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if (platform::is_gpu_place(ctx.GetPlace())) { - auto& dev_ctx = - ctx.template device_context(); - runtime_cudnn_support = dev_ctx.cudnn_handle() != nullptr ? true : false; - } -#endif - framework::LibraryType library_ = framework::LibraryType::kPlain; - if (use_cudnn && runtime_cudnn_support) { + if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; } +#endif std::string data_format = ctx.Attr("data_format"); return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 98b4178177b0a8bafd6fe34a92be2a07a2fbc5a7..59b76a1edb5ec5900520fbccb6a6f8f6e7a70aa4 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -10,43 +10,45 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device_context.h" +#include #include "paddle/fluid/memory/memory.h" - namespace paddle { namespace platform { DeviceContextPool* DeviceContextPool::pool = nullptr; -const platform::DeviceContext* DeviceContextPool::Get( - const platform::Place& place) { +platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) { auto it = device_contexts_.find(place); if (it == device_contexts_.end()) { PADDLE_THROW( "'Place' is not supported, Please re-compile with WITH_GPU " "option"); } - return it->second; + return it->second.get(); } DeviceContextPool::DeviceContextPool( const std::vector& places) { PADDLE_ENFORCE_GT(places.size(), 0); - for (size_t i = 0; i < places.size(); i++) { - if (platform::is_cpu_place(places[i])) { + using PtrType = std::unique_ptr; + std::unordered_set set; + for (auto& p : places) { + set.insert(p); + } + + for (auto& p : set) { + if (platform::is_cpu_place(p)) { #ifdef PADDLE_WITH_MKLDNN - device_contexts_.emplace(places[i], - new platform::MKLDNNDeviceContext( - boost::get(places[i]))); + device_contexts_.emplace( + p, PtrType(new MKLDNNDeviceContext(boost::get(p)))); #else - device_contexts_.emplace(places[i], - new platform::CPUDeviceContext( - boost::get(places[i]))); + device_contexts_.emplace( + p, PtrType(new CPUDeviceContext(boost::get(p)))); #endif - } else if (platform::is_gpu_place(places[i])) { + } else if (platform::is_gpu_place(p)) { #ifdef PADDLE_WITH_CUDA - device_contexts_.emplace(places[i], - new platform::CUDADeviceContext( - boost::get(places[i]))); + device_contexts_.emplace( + p, PtrType(new CUDADeviceContext(boost::get(p)))); #else PADDLE_THROW( "'CUDAPlace' is not supported, Please re-compile with WITH_GPU " @@ -159,6 +161,7 @@ CUDADeviceContext::~CUDADeviceContext() { Place CUDADeviceContext::GetPlace() const { return place_; } void CUDADeviceContext::Wait() const { + std::lock_guard guard(mutex_); PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); PADDLE_ENFORCE(cudaGetLastError()); } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 603b890af13b529c490c29112a73a09cc815d07a..202394c7be7e103a609dd0999fc883c794ef0edd 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -103,6 +103,7 @@ class CUDADeviceContext : public DeviceContext { std::unique_ptr eigen_device_; std::unique_ptr eigen_stream_; + mutable std::mutex mutex_; cudaStream_t stream_; cudnnHandle_t cudnn_handle_; cublasHandle_t cublas_handle_; @@ -159,7 +160,7 @@ class DeviceContextPool { } /*! \brief Return handle of single device context. */ - const platform::DeviceContext* Get(const platform::Place& place); + platform::DeviceContext* Get(const platform::Place& place); template const typename DefaultDeviceContextType::TYPE* GetByPlace( @@ -172,19 +173,8 @@ class DeviceContextPool { private: static DeviceContextPool* pool; - constexpr static int LEFT_SHIFT = 8; - struct Hash { - std::hash hash_; - size_t operator()(const platform::Place& place) const { - int pre_hash = place.which() << LEFT_SHIFT; - if (platform::is_gpu_place(place)) { - pre_hash += boost::get(place).GetDeviceId(); - } - return hash_(pre_hash); - } - }; - std::unordered_map + std::unordered_map, PlaceHash> device_contexts_; DISABLE_COPY_AND_ASSIGN(DeviceContextPool); }; diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h index 501bddfc6ec8b5d0bf554b0911c32e47fd51ec15..4cc8b377b8b671eb5a446ecbae21ba9628fbd2c8 100644 --- a/paddle/fluid/platform/place.h +++ b/paddle/fluid/platform/place.h @@ -65,6 +65,18 @@ bool is_cpu_place(const Place &); bool places_are_same_class(const Place &, const Place &); bool is_same_place(const Place &, const Place &); +struct PlaceHash { + std::size_t operator()(const Place &p) const { + constexpr size_t num_dev_bits = 4; + std::hash ihash; + size_t dev_id = 0; + if (is_gpu_place(p)) { + dev_id = boost::get(p).device; + } + return ihash(dev_id << num_dev_bits | p.which()); + } +}; + std::ostream &operator<<(std::ostream &, const Place &); template diff --git a/python/paddle/fluid/concurrency.py b/python/paddle/fluid/concurrency.py index 0fc4981a8e9da09f15e6d0a5e5c6761e01328876..3e4292d23550b853ea73de787a1c053e1f2c80fd 100644 --- a/python/paddle/fluid/concurrency.py +++ b/python/paddle/fluid/concurrency.py @@ -131,7 +131,7 @@ def make_channel(dtype, capacity=0): return channel -def channel_send(channel, value, copy=False): +def channel_send(channel, value, is_copy=False): """ Sends a value through a channel variable. Used by an unbuffered or buffered channel to pass data from within or to a concurrent Go block, where @@ -141,8 +141,8 @@ def channel_send(channel, value, copy=False): channel (Variable|Channel): Channel variable created using `make_channel`. value (Variable): Value to send to channel - copy (bool): Copy data while channel send. If False, then data - is moved. The input cannot be used after move. + is_copy (bool): Copy data while channel send. If False, then data + is moved. The input cannot be used after move. (default False) Returns: Variable: The boolean status on whether or not the channel successfully sent the passed value. @@ -166,7 +166,7 @@ def channel_send(channel, value, copy=False): X = value - if copy is True: + if is_copy is True: copied_X = helper.create_variable( name=unique_name.generate(value.name + '_copy'), type=value.type, diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 70ecffd910a46570b5a8e576d88039fa5e22e726..3e78788f470556d2196b5104f69a0a3285543ec4 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -918,6 +918,24 @@ class Block(object): name=v.name) self.vars[new_p.name] = new_p + def clone_variable(self, var): + """ + Clone a variable into current block. + Args: + var: the variable to be cloned. + + Returns: + The new variable cloned from 'var' in current block. + """ + assert isinstance(var, Variable) + return self.create_var( + name=var.name, + shape=var.shape, + dtype=var.dtype, + type=var.type, + lod_level=var.lod_level, + persistable=True) + class Program(object): def __init__(self): @@ -960,14 +978,14 @@ class Program(object): """Clone the Program object Set for_test to False when we want to clone the program for training. - Set for_test to True when we want to clone the program for testing. + Set for_test to True when we want to clone the program for testing. Args: for_test(bool): Some operators, such as batch_norm and drop_out ops, behave differently in training and testing. If for_test is True, the is_test attributes in these operators will be set to True for - testing purposes, otherwise, they remain unchanged. - + testing purposes, otherwise, they remain unchanged. + Returns(Program): The cloned Program object. """ diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index da7e74c901e1f5be709c5f9d73f048bfda0c5549..58b668227168c5c5e080f3928035ad98303bbae9 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -399,6 +399,9 @@ class LayerHelper(object): if isinstance(act, basestring): act = {'type': act} tmp = self.create_tmp_variable(dtype=input_var.dtype) + + if 'use_mkldnn' in self.kwargs: + act['use_mkldnn'] = self.kwargs.get('use_mkldnn') act_type = act.pop('type') self.append_op( type=act_type, diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 2ce68f95057f7820d7ab59ba2b41171c7ecd3654..679de6ce2aa67abe1322702fcb371eded0130698 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -82,6 +82,7 @@ def fc(input, num_flatten_dims=1, param_attr=None, bias_attr=None, + use_mkldnn=False, act=None, name=None): """ @@ -163,8 +164,11 @@ def fc(input, inputs={"X": input_var, "Y": w}, outputs={"Out": tmp}, - attrs={"x_num_col_dims": num_flatten_dims, - "y_num_col_dims": 1}) + attrs={ + "x_num_col_dims": num_flatten_dims, + "y_num_col_dims": 1, + 'use_mkldnn': use_mkldnn + }) mul_results.append(tmp) # sum diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index d7bad221c5fa7b18137bf317125195267437a644..f5c6b47d243dcf4ba985cfb41fc23b44d3ed809f 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -69,6 +69,7 @@ __all__ = [ 'gaussian_random_batch_size_like', 'cumsum', 'scatter', + 'sum', ] + __activations__ for _OP in set(__all__): diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index a33760a528f667b7afabafa19762eca7d1ef0635..180575c35dc6e115e11cccf9fff9fb2d3cd7e9a6 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -13,7 +13,7 @@ # limitations under the License. from collections import defaultdict - +from paddle.fluid.framework import Program import framework import layers from backward import append_backward @@ -23,9 +23,11 @@ from initializer import Constant from layer_helper import LayerHelper from regularizer import append_regularization_ops from clip import append_gradient_clip_ops, error_clip_callback +from contextlib import contextmanager __all__ = [ - 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Adadelta' + 'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', + 'Adadelta', 'ModelAverage' ] @@ -121,7 +123,12 @@ class Optimizer(object): """ pass - def _add_accumulator(self, name, param, dtype=None, fill_value=0.0): + def _add_accumulator(self, + name, + param, + dtype=None, + fill_value=0.0, + shape=None): """Utility function to add an accumulator for a parameter Args: @@ -135,17 +142,19 @@ class Optimizer(object): param.name in self._accumulators[name]): raise Exception("Accumulator {} already exists for parameter {}". format(name, param.name)) - + if shape == None: + shape = param.shape assert isinstance(self.helper, LayerHelper) var = self.helper.create_global_variable( name=unique_name.generate(name), persistable=True, dtype=dtype or param.dtype, type=param.type, - shape=param.shape) + shape=shape) self.helper.set_variable_initializer( var, initializer=Constant(value=float(fill_value))) self._accumulators[name][param.name] = var + return var def _get_accumulator(self, name, param): """Utility function to fetch an accumulator for a parameter @@ -797,3 +806,143 @@ Adamax = AdamaxOptimizer DecayedAdagrad = DecayedAdagradOptimizer Adadelta = AdadeltaOptimizer RMSProp = RMSPropOptimizer + + +class ModelAverage(Optimizer): + """Accumulate the average of parameters whtin sliding window. The average + result will be saved in temporary variables which can be applied to + parameter variables of current model by calling 'apply()' method. And the + 'restore()' method is used to restored the parameter values of current model. + + The size of average window is determined by average_window_rate, + min_average_window, max_average_window and current update times. + + Args: + params_grads: A list of parameter-grad variable pairs. + average_window_rate: The rate of average window. + min_average_window: The minimum size of average window. + max_average_window: The maximum size of average window. + + Examples: + ... + optimizer = fluid.optimizer.Momentum() + _, params_grads = optimizer.minimize(cost) + model_average = fluid.optimizer.ModelAverage(params_grads, 0.15, + min_average_window=10000, + max_average_window=20000) + for pass_id in range(args.pass_num): + for data in train_reader(): + exe.run(fluid.default_main_program()...) + + with model_average.apply(exe): + for data in test_reader(): + exe.run(inference_program...) + """ + + def __init__(self, + params_grads, + average_window_rate, + min_average_window=10000, + max_average_window=10000, + **kwargs): + super(ModelAverage, self).__init__(0.0, **kwargs) + self.average_window = average_window_rate + self.min_average_window = min_average_window + self.max_average_window = max_average_window + self.params_grads = params_grads + for param, grad in self.params_grads: + if grad is not None: + self._append_average_accumulate_op(param) + + self.apply_program = Program() + block = self.apply_program.global_block() + with program_guard(main_program=self.apply_program): + for param_grad in self.params_grads: + if param_grad[1] is not None: + self._add_average_apply_op(block, param_grad) + + self.restore_program = Program() + block = self.restore_program.global_block() + with program_guard(main_program=self.restore_program): + for param_grad in self.params_grads: + if param_grad[1] is not None: + self._add_average_restore_op(block, param_grad) + + def _add_average_apply_op(self, block, param_grad): + param = block.clone_variable(param_grad[0]) + grad = block.clone_variable(param_grad[1]) + sum_1 = block.clone_variable(self._get_accumulator('sum_1', param)) + sum_2 = block.clone_variable(self._get_accumulator('sum_2', param)) + sum_3 = block.clone_variable(self._get_accumulator('sum_3', param)) + num_accumulates = block.clone_variable( + self._get_accumulator('num_accumulates', param)) + old_num_accumulates = block.clone_variable( + self._get_accumulator('old_num_accumulates', param)) + num_updates = block.clone_variable( + self._get_accumulator('num_updates', param)) + # backup param value to grad + layers.assign(input=param, output=grad) + # param = (sum_1 + sum_2 + sum_3) / (num_accumulates + old_num_accumulates) + tmp = layers.sum(x=[num_accumulates, old_num_accumulates]) + sum = layers.sum(x=[sum_1, sum_2, sum_3]) + tmp = layers.cast(x=tmp, dtype='float32') + sum = layers.cast(x=sum, dtype='float32') + layers.elementwise_div(x=sum, y=tmp, out=param) + + def _add_average_restore_op(self, block, param_grad): + param = block.clone_variable(param_grad[0]) + grad = block.clone_variable(param_grad[1]) + layers.assign(input=grad, output=param) + + def _append_average_accumulate_op(self, param): + self.helper = LayerHelper("average_accumulate") + sum_1 = self._add_accumulator('sum_1', param) + sum_2 = self._add_accumulator('sum_2', param) + sum_3 = self._add_accumulator('sum_3', param) + num_accumulates = self._add_accumulator( + 'num_accumulates', param, dtype='int64', shape=[1]) + old_num_accumulates = self._add_accumulator( + 'old_num_accumulates', param, dtype='int64', shape=[1]) + num_updates = self._add_accumulator( + 'num_updates', param, dtype='int64', shape=[1]) + + self.helper.append_op( + type='average_accumulates', + inputs={ + "param": param, + "in_sum_1": sum_1, + "in_sum_2": sum_2, + "in_sum_3": sum_3, + "in_num_accumulates": num_accumulates, + "in_old_num_accumulates": old_num_accumulates, + "in_num_updates": num_updates + }, + outputs={ + "out_sum_1": sum_1, + "out_sum_2": sum_2, + "out_sum_3": sum_3, + "out_num_accumulates": num_accumulates, + "out_old_num_accumulates": old_num_accumulates, + "out_num_updates": num_updates, + }, + attrs={ + "average_window": self.average_window, + "min_average_window": self.min_average_window, + "max_average_window": self.max_average_window, + }) + + @contextmanager + def apply(self, executor, need_restore=True): + """Apply average values to parameters of current model. + """ + executor.run(self.apply_program) + try: + yield + finally: + if need_restore: + self.restore(executor) + + def restore(self, executor): + """Restore parameter values of current model. + """ + executor.run(self.restore_program) diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index eab41ebe711bd21bdc3b34ca83ab57388cc35ba2..1e3decfbaf0691e912b96b415b68353e626cf51e 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -14,6 +14,7 @@ import unittest import numpy as np +import paddle.fluid.core as core from op_test import OpTest from scipy.special import expit @@ -212,18 +213,39 @@ class TestRound(OpTest): class TestRelu(OpTest): def setUp(self): self.op_type = "relu" - x = np.random.uniform(-1, 1, [11, 17]).astype("float32") + self.dtype = np.float32 + self.init_dtype() + + x = np.random.uniform(-1, 1, [11, 17]).astype(self.dtype) # The same reason with TestAbs x[np.abs(x) < 0.005] = 0.02 - self.inputs = {'X': x} - self.outputs = {'Out': np.maximum(self.inputs['X'], 0)} + out = np.maximum(x, 0) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} def test_check_output(self): self.check_output() def test_check_grad(self): + if self.dtype == np.float16: + return self.check_grad(['X'], 'Out', max_relative_error=0.007) + def init_dtype(self): + pass + + +class TestFP16Relu(TestRelu): + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=1e-3) + class TestBRelu(OpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py index 4f20da2b926823db9e7ec92c95178b6d3d1feec9..33d60c7e31ce0817ad26ea1c1c974339936052d3 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py @@ -29,15 +29,20 @@ class TestSoftmaxOp(OpTest): def setUp(self): self.op_type = "softmax" self.use_cudnn = False - self.inputs = { - 'X': np.random.uniform(0.1, 1, [10, 10]).astype("float32") - } - self.outputs = { - 'Out': np.apply_along_axis(stable_softmax, 1, self.inputs['X']) + self.use_mkldnn = False + self.dtype = np.float32 + self.init_kernel_type() + + x = np.random.uniform(0.1, 1, [10, 10]).astype(self.dtype) + out = np.apply_along_axis(stable_softmax, 1, x) + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + self.attrs = { + 'use_cudnn': self.use_cudnn, + 'use_mkldnn': self.use_mkldnn } - self.attrs = {'use_cudnn': self.use_cudnn, } - def init_op_type(self): + def init_kernel_type(self): pass def test_check_output(self): @@ -48,6 +53,8 @@ class TestSoftmaxOp(OpTest): self.check_output() def test_check_grad(self): + if self.dtype == np.float16: + return if self.use_cudnn: place = core.CUDAPlace(0) self.check_grad_with_place( @@ -57,8 +64,25 @@ class TestSoftmaxOp(OpTest): class TestSoftmaxCUDNNOp(TestSoftmaxOp): - def init_op_type(self): + def init_kernel_type(self): + self.use_cudnn = True + + +class TestSoftmaxFP16CUDNNOp(TestSoftmaxOp): + def init_kernel_type(self): self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=1e-3) + + +class TestSoftmaxMKLDNNOp(TestSoftmaxOp): + def init_kernel_type(self): + self.use_mkldnn = True if __name__ == "__main__":