提交 8d74782e 编写于 作者: Z Zhen Wang

Enable uniform_random_op and gaussian_random_op to support the float16 data type.

上级 f64c861e
...@@ -11,16 +11,31 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,16 +11,31 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <thrust/random.h> #include <thrust/random.h>
#include <thrust/transform.h> #include <thrust/transform.h>
#include <type_traits>
#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/fill_constant_op.h" #include "paddle/fluid/operators/fill_constant_op.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace details {
// Type trait selecting the arithmetic type in which random numbers are
// sampled for an element type T. GaussianGenerator reads `Type` to choose the
// template argument of thrust::normal_distribution and then casts the sampled
// value back to T.
//
// Primary template: sample directly in T.
template <typename T>
struct RandomDistributionType {
using Type = T;
};
// Specialization for platform::float16: sample in float instead of float16.
template <>
struct RandomDistributionType<platform::float16> {
using Type = float;
};
} // namespace details
template <typename T> template <typename T>
struct GaussianGenerator { struct GaussianGenerator {
T mean_, std_; T mean_, std_;
...@@ -34,12 +49,16 @@ struct GaussianGenerator { ...@@ -34,12 +49,16 @@ struct GaussianGenerator {
: mean_(mean), std_(std), seed_(seed), offset_(offset) {} : mean_(mean), std_(std), seed_(seed), offset_(offset) {}
__host__ __device__ T operator()(const unsigned int n) const { __host__ __device__ T operator()(const unsigned int n) const {
using DataType = typename details::RandomDistributionType<T>::Type;
thrust::minstd_rand rng; thrust::minstd_rand rng;
rng.seed(seed_); rng.seed(seed_);
thrust::normal_distribution<T> dist(mean_, std_); thrust::normal_distribution<DataType> dist(static_cast<DataType>(mean_),
static_cast<DataType>(std_));
unsigned int new_n = n + offset_; unsigned int new_n = n + offset_;
rng.discard(new_n); rng.discard(new_n);
return dist(rng); T out = static_cast<T>(dist(rng));
return out;
} }
}; };
...@@ -122,10 +141,13 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> { ...@@ -122,10 +141,13 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel<T> {
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_OP_CUDA_KERNEL(gaussian_random, REGISTER_OP_CUDA_KERNEL(
paddle::operators::GPUGaussianRandomKernel<float>, gaussian_random, paddle::operators::GPUGaussianRandomKernel<float>,
paddle::operators::GPUGaussianRandomKernel<double>); paddle::operators::GPUGaussianRandomKernel<double>,
paddle::operators::GPUGaussianRandomKernel<paddle::platform::float16>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
gaussian_random_batch_size_like, gaussian_random_batch_size_like,
paddle::operators::GPUGaussianRandomBatchSizeLikeKernel<float>, paddle::operators::GPUGaussianRandomBatchSizeLikeKernel<float>,
paddle::operators::GPUGaussianRandomBatchSizeLikeKernel<double>); paddle::operators::GPUGaussianRandomBatchSizeLikeKernel<double>,
paddle::operators::GPUGaussianRandomBatchSizeLikeKernel<
paddle::platform::float16>);
...@@ -19,11 +19,27 @@ limitations under the License. */ ...@@ -19,11 +19,27 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/algorithm.h" #include "paddle/fluid/operators/math/algorithm.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/platform/for_range.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace details {
// Type trait selecting the element type of the learning-rate tensor for a
// momentum kernel instantiated on parameter type T.
//
// Primary template: the learning rate has the same type as the parameters.
template <typename T>
struct LearningRateType {
using Type = T;
};
// Specialization for platform::float16: the learning rate is read as float
// even though parameters, gradients and velocity are float16.
template <>
struct LearningRateType<platform::float16> {
using Type = float;
};
} // namespace details
// Shorthand for the learning-rate element type matching parameter type T;
// used for the `lr_` pointers of the momentum functors and for
// `learning_rate->data<DataType<T>>()`.
// NOTE(review): `DataType` is a very generic name to introduce at
// paddle::operators scope from a header -- confirm it cannot collide with
// other declarations visible to includers.
template <typename T>
using DataType = typename details::LearningRateType<T>::Type;
using framework::Tensor; using framework::Tensor;
using framework::SelectedRows; using framework::SelectedRows;
struct NoNesterov; struct NoNesterov;
...@@ -124,7 +140,7 @@ class CPUDenseMomentumFunctor { ...@@ -124,7 +140,7 @@ class CPUDenseMomentumFunctor {
auto p = framework::EigenVector<T>::Flatten(*param); auto p = framework::EigenVector<T>::Flatten(*param);
auto v = framework::EigenVector<T>::Flatten(*velocity); auto v = framework::EigenVector<T>::Flatten(*velocity);
auto g = framework::EigenVector<T>::Flatten(*grad); auto g = framework::EigenVector<T>::Flatten(*grad);
const float* lr = learning_rate->data<float>(); const auto* lr = learning_rate->data<DataType<T>>();
v_out = v * mu + g; v_out = v * mu + g;
if (use_nesterov) { if (use_nesterov) {
...@@ -147,7 +163,7 @@ class DenseMomentumFunctor<T, UseNesterov> { ...@@ -147,7 +163,7 @@ class DenseMomentumFunctor<T, UseNesterov> {
const T* p_; const T* p_;
const T* g_; const T* g_;
const T* v_; const T* v_;
const float* lr_; const DataType<T>* lr_;
const T mu_; const T mu_;
const int64_t num_; const int64_t num_;
T* p_out_; T* p_out_;
...@@ -155,7 +171,7 @@ class DenseMomentumFunctor<T, UseNesterov> { ...@@ -155,7 +171,7 @@ class DenseMomentumFunctor<T, UseNesterov> {
public: public:
DenseMomentumFunctor(const T* p, const T* g, const T* v, DenseMomentumFunctor(const T* p, const T* g, const T* v,
const float* learning_rate, const T mu, const DataType<T>* learning_rate, const T mu,
const int64_t num, T* p_out, T* v_out) const int64_t num, T* p_out, T* v_out)
: p_(p), : p_(p),
g_(g), g_(g),
...@@ -169,7 +185,7 @@ class DenseMomentumFunctor<T, UseNesterov> { ...@@ -169,7 +185,7 @@ class DenseMomentumFunctor<T, UseNesterov> {
// put memory access in register // put memory access in register
const T p = p_[i]; const T p = p_[i];
const T g = g_[i]; const T g = g_[i];
const float lr = lr_[0]; const auto lr = lr_[0];
const T v = v_[i]; const T v = v_[i];
T v_out = v * mu_ + g; T v_out = v * mu_ + g;
T p_out = p - (g + v_out * mu_) * static_cast<T>(lr); T p_out = p - (g + v_out * mu_) * static_cast<T>(lr);
...@@ -185,7 +201,7 @@ class DenseMomentumFunctor<T, NoNesterov> { ...@@ -185,7 +201,7 @@ class DenseMomentumFunctor<T, NoNesterov> {
const T* p_; const T* p_;
const T* g_; const T* g_;
const T* v_; const T* v_;
const float* lr_; const DataType<T>* lr_;
const T mu_; const T mu_;
const int64_t num_; const int64_t num_;
T* p_out_; T* p_out_;
...@@ -193,7 +209,7 @@ class DenseMomentumFunctor<T, NoNesterov> { ...@@ -193,7 +209,7 @@ class DenseMomentumFunctor<T, NoNesterov> {
public: public:
DenseMomentumFunctor(const T* p, const T* g, const T* v, DenseMomentumFunctor(const T* p, const T* g, const T* v,
const float* learning_rate, const T mu, const DataType<T>* learning_rate, const T mu,
const int64_t num, T* p_out, T* v_out) const int64_t num, T* p_out, T* v_out)
: p_(p), : p_(p),
g_(g), g_(g),
...@@ -226,7 +242,7 @@ class SparseMomentumFunctor<T, UseNesterov> { ...@@ -226,7 +242,7 @@ class SparseMomentumFunctor<T, UseNesterov> {
const T* p_; const T* p_;
const T* g_; const T* g_;
const T* v_; const T* v_;
const float* lr_; const DataType<T>* lr_;
const T mu_; const T mu_;
const int64_t* rows_; const int64_t* rows_;
const int64_t row_numel_; const int64_t row_numel_;
...@@ -235,9 +251,10 @@ class SparseMomentumFunctor<T, UseNesterov> { ...@@ -235,9 +251,10 @@ class SparseMomentumFunctor<T, UseNesterov> {
T* v_out_; T* v_out_;
public: public:
SparseMomentumFunctor(const T* p, const T* g, const T* v, const float* lr, SparseMomentumFunctor(const T* p, const T* g, const T* v,
const T mu, const int64_t* rows, int64_t row_numel, const DataType<T>* lr, const T mu, const int64_t* rows,
int64_t row_height, T* p_out, T* v_out) int64_t row_numel, int64_t row_height, T* p_out,
T* v_out)
: p_(p), : p_(p),
g_(g), g_(g),
v_(v), v_(v),
...@@ -256,7 +273,7 @@ class SparseMomentumFunctor<T, UseNesterov> { ...@@ -256,7 +273,7 @@ class SparseMomentumFunctor<T, UseNesterov> {
: static_cast<T>(0); : static_cast<T>(0);
// put memory access in register // put memory access in register
const T p = p_[i]; const T p = p_[i];
const float lr = lr_[0]; const auto lr = lr_[0];
const T v = v_[i]; const T v = v_[i];
T v_out = v * mu_ + g; T v_out = v * mu_ + g;
T p_out = p - (g + v_out * mu_) * static_cast<T>(lr); T p_out = p - (g + v_out * mu_) * static_cast<T>(lr);
...@@ -272,7 +289,7 @@ class SparseMomentumFunctor<T, NoNesterov> { ...@@ -272,7 +289,7 @@ class SparseMomentumFunctor<T, NoNesterov> {
const T* p_; const T* p_;
const T* g_; const T* g_;
const T* v_; const T* v_;
const float* lr_; const DataType<T>* lr_;
const T mu_; const T mu_;
const int64_t* rows_; const int64_t* rows_;
const int64_t row_numel_; const int64_t row_numel_;
...@@ -281,9 +298,10 @@ class SparseMomentumFunctor<T, NoNesterov> { ...@@ -281,9 +298,10 @@ class SparseMomentumFunctor<T, NoNesterov> {
T* v_out_; T* v_out_;
public: public:
SparseMomentumFunctor(const T* p, const T* g, const T* v, const float* lr, SparseMomentumFunctor(const T* p, const T* g, const T* v,
const T mu, const int64_t* rows, int64_t row_numel, const DataType<T>* lr, const T mu, const int64_t* rows,
int64_t row_height, T* p_out, T* v_out) int64_t row_numel, int64_t row_height, T* p_out,
T* v_out)
: p_(p), : p_(p),
g_(g), g_(g),
v_(v), v_(v),
...@@ -342,7 +360,7 @@ class MomentumOpKernel : public framework::OpKernel<T> { ...@@ -342,7 +360,7 @@ class MomentumOpKernel : public framework::OpKernel<T> {
if (use_nesterov) { if (use_nesterov) {
DenseMomentumFunctor<T, UseNesterov> functor( DenseMomentumFunctor<T, UseNesterov> functor(
param->data<T>(), grad->data<T>(), velocity->data<T>(), param->data<T>(), grad->data<T>(), velocity->data<T>(),
learning_rate->data<float>(), mu, param->numel(), learning_rate->data<DataType<T>>(), mu, param->numel(),
param_out->mutable_data<T>(ctx.GetPlace()), param_out->mutable_data<T>(ctx.GetPlace()),
velocity_out->mutable_data<T>(ctx.GetPlace())); velocity_out->mutable_data<T>(ctx.GetPlace()));
for_range(functor); for_range(functor);
...@@ -350,7 +368,7 @@ class MomentumOpKernel : public framework::OpKernel<T> { ...@@ -350,7 +368,7 @@ class MomentumOpKernel : public framework::OpKernel<T> {
} else { } else {
DenseMomentumFunctor<T, NoNesterov> functor( DenseMomentumFunctor<T, NoNesterov> functor(
param->data<T>(), grad->data<T>(), velocity->data<T>(), param->data<T>(), grad->data<T>(), velocity->data<T>(),
learning_rate->data<float>(), mu, param->numel(), learning_rate->data<DataType<T>>(), mu, param->numel(),
param_out->mutable_data<T>(ctx.GetPlace()), param_out->mutable_data<T>(ctx.GetPlace()),
velocity_out->mutable_data<T>(ctx.GetPlace())); velocity_out->mutable_data<T>(ctx.GetPlace()));
for_range(functor); for_range(functor);
...@@ -382,7 +400,7 @@ class MomentumOpKernel : public framework::OpKernel<T> { ...@@ -382,7 +400,7 @@ class MomentumOpKernel : public framework::OpKernel<T> {
if (use_nesterov) { if (use_nesterov) {
SparseMomentumFunctor<T, UseNesterov> functor( SparseMomentumFunctor<T, UseNesterov> functor(
param->data<T>(), merged_grad->value().data<T>(), param->data<T>(), merged_grad->value().data<T>(),
velocity->data<T>(), learning_rate->data<float>(), mu, rows, velocity->data<T>(), learning_rate->data<DataType<T>>(), mu, rows,
row_numel, static_cast<int64_t>(merged_grad->rows().size()), row_numel, static_cast<int64_t>(merged_grad->rows().size()),
param_out->mutable_data<T>(ctx.GetPlace()), param_out->mutable_data<T>(ctx.GetPlace()),
velocity_out->mutable_data<T>(ctx.GetPlace())); velocity_out->mutable_data<T>(ctx.GetPlace()));
...@@ -391,7 +409,7 @@ class MomentumOpKernel : public framework::OpKernel<T> { ...@@ -391,7 +409,7 @@ class MomentumOpKernel : public framework::OpKernel<T> {
} else { } else {
SparseMomentumFunctor<T, NoNesterov> functor( SparseMomentumFunctor<T, NoNesterov> functor(
param->data<T>(), merged_grad->value().data<T>(), param->data<T>(), merged_grad->value().data<T>(),
velocity->data<T>(), learning_rate->data<float>(), mu, rows, velocity->data<T>(), learning_rate->data<DataType<T>>(), mu, rows,
row_numel, static_cast<int64_t>(merged_grad->rows().size()), row_numel, static_cast<int64_t>(merged_grad->rows().size()),
param_out->mutable_data<T>(ctx.GetPlace()), param_out->mutable_data<T>(ctx.GetPlace()),
velocity_out->mutable_data<T>(ctx.GetPlace())); velocity_out->mutable_data<T>(ctx.GetPlace()));
......
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/uniform_random_op.h" #include "paddle/fluid/operators/uniform_random_op.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -163,9 +164,12 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> { ...@@ -163,9 +164,12 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_OP_CUDA_KERNEL(uniform_random, REGISTER_OP_CUDA_KERNEL(
uniform_random, paddle::operators::GPUUniformRandomKernel<float>,
paddle::operators::GPUUniformRandomKernel<double>,
paddle::operators::GPUUniformRandomKernel<paddle::platform::float16>);
REGISTER_OP_CUDA_KERNEL(
uniform_random_batch_size_like,
paddle::operators::GPUUniformRandomKernel<float>, paddle::operators::GPUUniformRandomKernel<float>,
paddle::operators::GPUUniformRandomKernel<double>); paddle::operators::GPUUniformRandomKernel<double>,
REGISTER_OP_CUDA_KERNEL(uniform_random_batch_size_like, paddle::operators::GPUUniformRandomKernel<paddle::platform::float16>);
paddle::operators::GPUUniformRandomKernel<float>,
paddle::operators::GPUUniformRandomKernel<double>);
...@@ -131,6 +131,10 @@ struct PADDLE_ALIGN(2) float16 { ...@@ -131,6 +131,10 @@ struct PADDLE_ALIGN(2) float16 {
#endif #endif
} }
// Converting constructor from a signed 32-bit integer: the value is first
// cast to float and construction is delegated to the float constructor.
// Not marked `explicit`, so int32_t converts to float16 implicitly.
HOSTDEVICE inline float16(int32_t val) : float16(static_cast<float>(val)) {}
// Converting constructor from an unsigned 32-bit integer: the value is first
// cast to float and construction is delegated to the float constructor.
// Not marked `explicit`, so uint32_t converts to float16 implicitly.
HOSTDEVICE inline float16(uint32_t val) : float16(static_cast<float>(val)) {}
HOSTDEVICE inline explicit float16(bool b) : x(b ? 0x3c00 : 0) {} HOSTDEVICE inline explicit float16(bool b) : x(b ? 0x3c00 : 0) {}
template <class T> template <class T>
......
...@@ -267,48 +267,35 @@ def cast_net_to_fp16(program): ...@@ -267,48 +267,35 @@ def cast_net_to_fp16(program):
op._set_attr('dtype', core.VarDesc.VarType.FP16) op._set_attr('dtype', core.VarDesc.VarType.FP16)
def cast_parameters_to_fp16(exe, program): def cast_parameters_to_fp16(program):
global_block = program.global_block() global_block = program.global_block()
all_parameters = global_block.all_parameters() all_parameters = global_block.all_parameters()
is_bn_params = lambda param: (param.name.find('bn') != -1 and (param.name.endswith('_offset') or param.name.endswith('_mean') or param.name.endswith('_scale') or param.name.endswith('_variance')))
all_param_names = {p.name for p in all_parameters if not is_bn_params(p)}
ops = global_block.ops
for param in all_parameters: for param in all_parameters:
if not (param.name.find('bn') != -1 and if param.name in all_param_names:
(param.name.endswith('_offset') or param.name.endswith('_mean') param_var = global_block.var(param.name)
or param.name.endswith('_scale') or if param_var.dtype == core.VarDesc.VarType.FP32:
param.name.endswith('_variance'))): param_var.desc.set_dtype(core.VarDesc.VarType.FP16)
param_t = global_scope().find_var(param.name).get_tensor()
data = np.array(param_t) for op in ops:
param_t.set(np.float16(data), exe.place) target_op = False
for out_name in op.output_names:
for out_var_name in op.output(out_name):
# def cast_parameters_to_fp16(program): if out_var_name in all_param_names:
# global_block = program.global_block() target_op = True
# all_parameters = global_block.all_parameters() if target_op:
# is_bn_params = lambda param: (param.name.find('bn') != -1 and (param.name.endswith('_offset') or param.name.endswith('_mean') or param.name.endswith('_scale') or param.name.endswith('_variance'))) if op.has_attr('in_dtype') and op.attr(
# all_param_names = {p.name for p in all_parameters if not is_bn_params(p)} 'in_dtype') == core.VarDesc.VarType.FP32:
# ops = global_block.ops op._set_attr('in_dtype', core.VarDesc.VarType.FP16)
if op.has_attr('out_dtype') and op.attr(
# for param in all_parameters: 'out_dtype') == core.VarDesc.VarType.FP32:
# if param.name in all_param_names: op._set_attr('out_dtype', core.VarDesc.VarType.FP16)
# param_var = global_block.var(param.name) if op.has_attr('dtype') and op.attr(
# if param_var.dtype == core.VarDesc.VarType.FP32: 'dtype') == core.VarDesc.VarType.FP32:
# param_var.desc.set_dtype(core.VarDesc.VarType.FP16) op._set_attr('dtype', core.VarDesc.VarType.FP16)
# for op in ops:
# target_op = False
# for out_name in op.output_names:
# for out_var_name in op.output(out_name):
# if out_var_name in all_param_names:
# target_op = True
# if target_op:
# if op.has_attr('in_dtype') and op.attr(
# 'in_dtype') == core.VarDesc.VarType.FP32:
# op._set_attr('in_dtype', core.VarDesc.VarType.FP16)
# if op.has_attr('out_dtype') and op.attr(
# 'out_dtype') == core.VarDesc.VarType.FP32:
# op._set_attr('out_dtype', core.VarDesc.VarType.FP16)
# if op.has_attr('dtype') and op.attr(
# 'dtype') == core.VarDesc.VarType.FP32:
# op._set_attr('dtype', core.VarDesc.VarType.FP16)
def rewrite_program(main_prog, amp_lists): def rewrite_program(main_prog, amp_lists):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册