Commit d307b5e4 authored by Kexin Zhao

Merge remote-tracking branch 'upstream/develop' into elementwise_add_fp16

@@ -26,7 +26,7 @@ lookup of rows.
The following figure illustrates the multiplication of x with two
non-zero elements, or say, two symbols, and a lookup table W:
-![lookup table](./lookup_table.png)
+![lookup table](./src/lookup_table.png)
### The Backward Algorithm
@@ -42,7 +42,7 @@ or some more sophisticated algorithms that rely on both W' and W:
$$W = f(W, W')$$
The following figure illustrates the backward pass of the lookup
-operator: ![lookup table training](./lookup_table_training.png)
+operator: ![lookup table training](./src/lookup_table_training.png)
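To make the sparse backward pass concrete, here is a minimal NumPy sketch of the idea (illustrative code, not Paddle source; plain SGD is used as one possible choice of f): only the rows of W that were looked up receive gradient rows in W', and only those rows are touched when W' is merged back into W.

```python
import numpy as np

vocab_size, emb_dim = 10, 4
W = np.random.rand(vocab_size, emb_dim).astype("float32")

# Forward: x has two non-zero elements, i.e. two symbols, so the
# lookup selects two rows of W.
ids = np.array([2, 7])
out = W[ids]                      # shape (2, emb_dim)

# Backward: W' is sparse -- it only contains gradient rows for `ids`.
grad_out = np.ones_like(out)      # upstream gradient, for illustration
W_grad_rows = grad_out

# One possible f(W, W'): plain SGD applied only to the selected rows.
lr = 0.1
np.add.at(W, ids, -lr * W_grad_rows)
```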
## Distributed Storage Service
......
@@ -103,7 +103,7 @@ In computability theory, a system of data-manipulation rules, such as a programm
There are two ways to execute a Fluid program. When a program is executed, it creates a protobuf message [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
-There is a C++ class [`Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h), which runs a `ProgramDesc`, similar to how an interpreter runs a Python program.
+There is a C++ class [`Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.h), which runs a `ProgramDesc`, similar to how an interpreter runs a Python program.
Fluid is moving towards the direction of a compiler, which is explained in [fluid_compiler.md](fluid_compiler.md).
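As a rough illustration of the two execution steps described above, the sketch below builds a tiny Fluid program in Python (which populates a `ProgramDesc`) and then runs it with an `Executor`. It assumes the `paddle.fluid` Python API of this period (`fluid.layers.data`, `fluid.layers.fc`, `fluid.Executor`), so treat the exact calls as illustrative rather than authoritative.

```python
import numpy as np
import paddle.fluid as fluid

# Step 1: building the program fills in a ProgramDesc protobuf message,
# conceptually an abstract syntax tree of the computation.
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.fc(input=x, size=1)

# Step 2: the C++ Executor interprets that ProgramDesc, much like an
# interpreter running a Python program.
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
out, = exe.run(fluid.default_main_program(),
               feed={'x': np.random.rand(8, 13).astype('float32')},
               fetch_list=[y])
```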
......
@@ -35,7 +35,6 @@ class DropoutOp : public framework::OperatorWithKernel {
  }
};
-template <typename AttrType>
class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  DropoutOpMaker(OpProto* proto, OpAttrChecker* op_checker)
@@ -73,7 +72,6 @@ are set equal to their corresponding inputs.
  }
};
-template <typename AttrType>
class DropoutOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
@@ -103,11 +101,10 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
}  // namespace paddle
namespace ops = paddle::operators;
-REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker<float>, dropout_grad,
-            ops::DropoutOpGrad<float>);
+REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker, dropout_grad,
+            ops::DropoutOpGrad);
REGISTER_OP_CPU_KERNEL(
-    dropout,
-    ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float, float>);
+    dropout, ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
    dropout_grad,
    ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, float>);
@@ -18,17 +18,18 @@ limitations under the License. */
#include <thrust/random.h>
#include <thrust/transform.h>
#include "paddle/fluid/operators/dropout_op.h"
+#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
-template <typename T, typename AttrType>
+template <typename T>
__global__ void RandomGenerator(const size_t n, const int seed,
-                                const AttrType dropout_prob, const T* src,
+                                const float dropout_prob, const T* src,
                                T* mask_data, T* dst) {
  thrust::minstd_rand rng;
  rng.seed(seed);
-  thrust::uniform_real_distribution<AttrType> dist(0, 1);
+  thrust::uniform_real_distribution<float> dist(0, 1);
  int idx = blockDim.x * blockIdx.x + threadIdx.x;
  for (; idx < n; idx += blockDim.x * gridDim.x) {
@@ -44,14 +45,14 @@ __global__ void RandomGenerator(const size_t n, const int seed,
// It seems that Eigen::Tensor::setRandom in GPU will SEGFAULT.
// Use std::random and thrust::random(thrust is a std library in CUDA) to
// implement uniform random.
-template <typename Place, typename T, typename AttrType>
+template <typename Place, typename T>
class GPUDropoutKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* x = context.Input<Tensor>("X");
    auto* y = context.Output<Tensor>("Out");
    y->mutable_data<T>(context.GetPlace());
-    AttrType dropout_prob = context.Attr<AttrType>("dropout_prob");
+    float dropout_prob = context.Attr<float>("dropout_prob");
    auto X = EigenMatrix<T>::Reshape(*x, 1);
    auto Y = EigenMatrix<T>::Reshape(*y, 1);
@@ -70,11 +71,11 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
      int threads = 512;
      int grid = (x->numel() + threads - 1) / threads;
-      RandomGenerator<T, AttrType><<<grid, threads, 0,
-          context.cuda_device_context().stream()>>>(
+      RandomGenerator<
+          T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
          size, seed, dropout_prob, x_data, mask_data, y_data);
    } else {
-      Y.device(place) = X * (1.0f - dropout_prob);
+      Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
    }
  }
};
@@ -83,9 +84,9 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
}  // namespace paddle
namespace ops = paddle::operators;
+namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
-    dropout,
-    ops::GPUDropoutKernel<paddle::platform::CUDADeviceContext, float, float>);
+    dropout, ops::GPUDropoutKernel<plat::CUDADeviceContext, float>,
+    ops::GPUDropoutKernel<plat::CUDADeviceContext, plat::float16>);
-REGISTER_OP_CUDA_KERNEL(
-    dropout_grad,
-    ops::DropoutGradKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(dropout_grad,
+                        ops::DropoutGradKernel<plat::CUDADeviceContext, float>);
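For reference, the numerical behaviour registered here, and exercised by the float16 test added at the bottom of this diff, can be summarised with a short NumPy sketch. This is only an illustration of the op's semantics as I understand them from the visible branches (the mask direction in the training path is assumed, since the kernel body is elided above), not Paddle code, and the helper name is made up.

```python
import numpy as np

def dropout_forward(x, dropout_prob, is_test=False, seed=0):
    """NumPy sketch of the dropout forward pass (illustrative only)."""
    if is_test:
        # Inference path: no mask, just scale by the keep probability.
        # This is the branch TestFP16DropoutOp checks: out = x * (1 - prob).
        return x * (1.0 - dropout_prob), None
    rng = np.random.RandomState(seed)
    # Training path (assumed): zero out elements whose uniform draw falls
    # below dropout_prob, keep the rest unscaled.
    mask = (rng.uniform(size=x.shape) >= dropout_prob).astype(x.dtype)
    return x * mask, mask

x = np.random.rand(32, 64).astype("float16")
y, _ = dropout_forward(x, dropout_prob=0.35, is_test=True)
```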
@@ -25,7 +25,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-template <typename DeviceContext, typename T, typename AttrType>
+template <typename DeviceContext, typename T>
class CPUDropoutKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
......
@@ -43,7 +43,7 @@ math_library(sequence2batch)
math_library(sequence_padding)
math_library(sequence_pooling DEPS math_function)
math_library(sequence_scale)
-math_library(softmax)
+math_library(softmax DEPS math_function)
math_library(unpooling)
math_library(vol2col)
......
@@ -48,20 +48,24 @@ class DoubleBufferReader : public framework::DecoratedReader {
  void start_thread() {
    buffer_ = framework::MakeChannel<Item>(kDoubleBufferSize);
-    std::thread prefetch([this] { PrefetchThreadFunc(); });
-    prefetch.detach();
+    prefetcher_ = std::thread([this] { PrefetchThreadFunc(); });
  }
  void ReadNext(std::vector<framework::LoDTensor>* out) override;
  void ReInit() override;
-  ~DoubleBufferReader() { buffer_->Close(); }
+  ~DoubleBufferReader() {
+    buffer_->Close();
+    prefetcher_.join();
+    delete buffer_;
+  }
  bool HasNext() const override;
 private:
  void PrefetchThreadFunc();
+  std::thread prefetcher_;
  framework::Channel<Item>* buffer_;
  platform::Place place_;
  std::vector<std::unique_ptr<platform::DeviceContext>> ctxs_;
@@ -134,6 +138,8 @@ void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) {
void DoubleBufferReader::ReInit() {
  reader_->ReInit();
  buffer_->Close();
+  prefetcher_.join();
+  delete buffer_;
  start_thread();
}
@@ -159,11 +165,12 @@ void DoubleBufferReader::PrefetchThreadFunc() {
    if (!buffer_->Send(&batch)) {
      VLOG(5) << "WARNING: The double buffer channel has been closed. The "
-                 "prefetch thread terminates.";
+                 "prefetch thread will terminate.";
      break;
    }
  }
  buffer_->Close();
+  VLOG(5) << "Prefetch thread terminates.";
}
bool DoubleBufferReader::HasNext() const {
......
@@ -34,6 +34,9 @@ class ShuffleReader : public framework::DecoratedReader {
  }
  void ReadNext(std::vector<framework::LoDTensor>* out) override {
+    if (!HasNext()) {
+      PADDLE_THROW("There is no next data!");
+    }
    if (iteration_pos_ >= buffer_.size()) {
      VLOG(10) << "Resetting shuffle buffer";
      ReadIntoBuffers();
@@ -50,7 +53,6 @@ class ShuffleReader : public framework::DecoratedReader {
    buffer_.clear();
    buffer_.reserve(buffer_size_);
    iteration_pos_ = 0;
-    PADDLE_ENFORCE(reader_->HasNext());
    for (size_t i = 0; i < buffer_size_; ++i) {
      if (!reader_->HasNext()) {
        break;
......
@@ -483,9 +483,124 @@ DEVICE inline bool operator>=(const half& a, const half& b) {
#endif  // PADDLE_CUDA_FP16
-// Arithmetic operators on ARMv8.2-A CPU
-#if defined(PADDLE_WITH_NATIVE_FP16)
-HOST inline float16 operator+(const float16& a, const float16& b) {
+// Arithmetic operators for float16 on GPU
+#if defined(PADDLE_CUDA_FP16)
+HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return float16(__hadd(half(a), half(b)));
+#else
+  return float16(float(a) + float(b));
+#endif
+}
+HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return float16(__hsub(half(a), half(b)));
+#else
+  return float16(float(a) - float(b));
+#endif
+}
+HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return float16(__hmul(half(a), half(b)));
+#else
+  return float16(float(a) * float(b));
+#endif
+}
+HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+  // TODO(kexinzhao): check which cuda version starts to support __hdiv
+  float num = __half2float(half(a));
+  float denom = __half2float(half(b));
+  return float16(num / denom);
+#else
+  return float16(float(a) / float(b));
+#endif
+}
+HOSTDEVICE inline float16 operator-(const float16& a) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return float16(__hneg(half(a)));
+#else
+  float16 res;
+  res.x = a.x ^ 0x8000;
+  return res;
+#endif
+}
+HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) {
+  a = a + b;
+  return a;
+}
+HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) {
+  a = a - b;
+  return a;
+}
+HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) {
+  a = a * b;
+  return a;
+}
+HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) {
+  a = a / b;
+  return a;
+}
+HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __heq(half(a), half(b));
+#else
+  return float(a) == float(b);
+#endif
+}
+HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hne(half(a), half(b));
+#else
+  return float(a) != float(b);
+#endif
+}
+HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hlt(half(a), half(b));
+#else
+  return float(a) < float(b);
+#endif
+}
+HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hle(half(a), half(b));
+#else
+  return float(a) <= float(b);
+#endif
+}
+HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hgt(half(a), half(b));
+#else
+  return float(a) > float(b);
+#endif
+}
+HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hge(half(a), half(b));
+#else
+  return float(a) >= float(b);
+#endif
+}
+// Arithmetic operators for float16 on ARMv8.2-A CPU
+#elif defined(PADDLE_WITH_NATIVE_FP16)
+inline float16 operator+(const float16& a, const float16& b) {
  float16 res;
  asm volatile(
      "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -501,7 +616,7 @@ HOST inline float16 operator+(const float16& a, const float16& b) {
  return res;
}
-HOST inline float16 operator-(const float16& a, const float16& b) {
+inline float16 operator-(const float16& a, const float16& b) {
  float16 res;
  asm volatile(
      "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -517,7 +632,7 @@ HOST inline float16 operator-(const float16& a, const float16& b) {
  return res;
}
-HOST inline float16 operator*(const float16& a, const float16& b) {
+inline float16 operator*(const float16& a, const float16& b) {
  float16 res;
  asm volatile(
      "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -533,7 +648,7 @@ HOST inline float16 operator*(const float16& a, const float16& b) {
  return res;
}
-HOST inline float16 operator/(const float16& a, const float16& b) {
+inline float16 operator/(const float16& a, const float16& b) {
  float16 res;
  asm volatile(
      "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -549,7 +664,7 @@ HOST inline float16 operator/(const float16& a, const float16& b) {
  return res;
}
-HOST inline float16 operator-(const float16& a) {
+inline float16 operator-(const float16& a) {
  float16 res;
  asm volatile(
      "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -564,27 +679,27 @@ HOST inline float16 operator-(const float16& a) {
  return res;
}
-HOST inline float16& operator+=(float16& a, const float16& b) {
+inline float16& operator+=(float16& a, const float16& b) {
  a = a + b;
  return a;
}
-HOST inline float16& operator-=(float16& a, const float16& b) {
+inline float16& operator-=(float16& a, const float16& b) {
  a = a - b;
  return a;
}
-HOST inline float16& operator*=(float16& a, const float16& b) {
+inline float16& operator*=(float16& a, const float16& b) {
  a = a * b;
  return a;
}
-HOST inline float16& operator/=(float16& a, const float16& b) {
+inline float16& operator/=(float16& a, const float16& b) {
  a = a / b;
  return a;
}
-HOST inline bool operator==(const float16& a, const float16& b) {
+inline bool operator==(const float16& a, const float16& b) {
  uint16_t res;
  asm volatile(
      "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -600,11 +715,9 @@ HOST inline bool operator==(const float16& a, const float16& b) {
  return (res & 0xffff) != 0;
}
-HOST inline bool operator!=(const float16& a, const float16& b) {
-  return !(a == b);
-}
+inline bool operator!=(const float16& a, const float16& b) { return !(a == b); }
-HOST inline bool operator<(const float16& a, const float16& b) {
+inline bool operator<(const float16& a, const float16& b) {
  uint16_t res;
  asm volatile(
      "ld1 {v1.h}[0], [%[a_ptr]]\n"
@@ -620,7 +733,7 @@ HOST inline bool operator<(const float16& a, const float16& b) {
  return (res & 0xffff) != 0;
}
-HOST inline bool operator<=(const float16& a, const float16& b) {
+inline bool operator<=(const float16& a, const float16& b) {
  uint16_t res;
  asm volatile(
      "ld1 {v1.h}[0], [%[a_ptr]]\n"
@@ -636,7 +749,7 @@ HOST inline bool operator<=(const float16& a, const float16& b) {
  return (res & 0xffff) != 0;
}
-HOST inline bool operator>(const float16& a, const float16& b) {
+inline bool operator>(const float16& a, const float16& b) {
  uint16_t res;
  asm volatile(
      "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -652,7 +765,7 @@ HOST inline bool operator>(const float16& a, const float16& b) {
  return (res & 0xffff) != 0;
}
-HOST inline bool operator>=(const float16& a, const float16& b) {
+inline bool operator>=(const float16& a, const float16& b) {
  uint16_t res;
  asm volatile(
      "ld1 {v0.h}[0], [%[a_ptr]]\n"
@@ -668,71 +781,71 @@ HOST inline bool operator>=(const float16& a, const float16& b) {
  return (res & 0xffff) != 0;
}
-// Arithmetic operators, software emulated on other CPU
+// Arithmetic operators for float16, software emulated on other CPU
#else
-HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
+inline float16 operator+(const float16& a, const float16& b) {
  return float16(float(a) + float(b));
}
-HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
+inline float16 operator-(const float16& a, const float16& b) {
  return float16(float(a) - float(b));
}
-HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
+inline float16 operator*(const float16& a, const float16& b) {
  return float16(float(a) * float(b));
}
-HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
+inline float16 operator/(const float16& a, const float16& b) {
  return float16(float(a) / float(b));
}
-HOSTDEVICE inline float16 operator-(const float16& a) {
+inline float16 operator-(const float16& a) {
  float16 res;
  res.x = a.x ^ 0x8000;
  return res;
}
-HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) {
+inline float16& operator+=(float16& a, const float16& b) {
  a = float16(float(a) + float(b));
  return a;
}
-HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) {
+inline float16& operator-=(float16& a, const float16& b) {
  a = float16(float(a) - float(b));
  return a;
}
-HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) {
+inline float16& operator*=(float16& a, const float16& b) {
  a = float16(float(a) * float(b));
  return a;
}
-HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) {
+inline float16& operator/=(float16& a, const float16& b) {
  a = float16(float(a) / float(b));
  return a;
}
-HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
+inline bool operator==(const float16& a, const float16& b) {
  return float(a) == float(b);
}
-HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
+inline bool operator!=(const float16& a, const float16& b) {
  return float(a) != float(b);
}
-HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
+inline bool operator<(const float16& a, const float16& b) {
  return float(a) < float(b);
}
-HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
+inline bool operator<=(const float16& a, const float16& b) {
  return float(a) <= float(b);
}
-HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
+inline bool operator>(const float16& a, const float16& b) {
  return float(a) > float(b);
}
-HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
+inline bool operator>=(const float16& a, const float16& b) {
  return float(a) >= float(b);
}
#endif
......
@@ -14,6 +14,7 @@
import unittest
import numpy as np
+import paddle.fluid.core as core
from op_test import OpTest
@@ -82,5 +83,37 @@ class TestDropoutOp5(OpTest):
        self.check_output()
+class TestFP16DropoutOp(OpTest):
+    def setUp(self):
+        self.op_type = "dropout"
+        self.init_test_case()
+
+        x = np.random.random(self.input_size).astype("float16")
+        out = x * (1.0 - self.prob)
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.attrs = {
+            'dropout_prob': self.prob,
+            'fix_seed': self.fix_seed,
+            'is_test': True
+        }
+        self.outputs = {'Out': out}
+
+    def init_test_case(self):
+        self.input_size = [32, 64]
+        self.prob = 0.35
+        self.fix_seed = True
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda() and core.op_support_gpu("dropout"):
+            self.check_output_with_place(core.CUDAPlace(0), atol=1e-3)
+
+
+class TestFP16DropoutOp2(TestFP16DropoutOp):
+    def init_test_case(self):
+        self.input_size = [32, 64, 3]
+        self.prob = 0.75
+        self.fix_seed = False
if __name__ == '__main__':
    unittest.main()