diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 6516b861a9c0d43b9bfd6a71503ef10a53fec55d..a20cc6d1b69ce8fe64c5311f2e55ee823b8e8a44 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -10,7 +10,7 @@ if (WITH_AARCH64) elseif(WITH_SUNWAY) SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/sunway/xpu_2020_1227.tar.gz" CACHE STRING "" FORCE) else() - SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_0105.tar.gz" CACHE STRING "" FORCE) + SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE) endif() SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu") diff --git a/paddle/fluid/operators/layer_norm_op_xpu.cc b/paddle/fluid/operators/layer_norm_op_xpu.cc index 5a3c865e26c35d8e30afa4161e23342510c61a33..c55250f27087aea1a332fff301c116f0a740aae8 100644 --- a/paddle/fluid/operators/layer_norm_op_xpu.cc +++ b/paddle/fluid/operators/layer_norm_op_xpu.cc @@ -45,15 +45,13 @@ class LayerNormXPUKernel : public framework::OpKernel { auto* mean_data = mean->mutable_data(ctx.GetPlace()); auto* variance_data = variance->mutable_data(ctx.GetPlace()); auto& dev_ctx = ctx.template device_context(); - int r = xpu::layer_norm(dev_ctx.x_context(), left, right, x_data, y_data, - scale_data, bias_data, epsilon, mean_data, - variance_data, false); - PADDLE_ENFORCE_EQ( - r, XPU_SUCCESS, - platform::errors::External("XPU API(layer_norm) return wrong " - "value[%d], please check whether Baidu " - "Kunlun Card is properly installed.", - r)); + int r = xpu::layer_norm(dev_ctx.x_context(), x_data, y_data, left, right, + epsilon, scale_data, bias_data, mean_data, + variance_data); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU layer_norm kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); } }; @@ -87,15 +85,14 @@ class LayerNormGradXPUKernel : public framework::OpKernel { auto* dx_data = (dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace())); auto& dev_ctx = ctx.template device_context(); - int r = xpu::layer_norm_backward( - dev_ctx.x_context(), left, right, x_data, scale_data, variance_data, - mean_data, dy_data, dx_data, dscale_data, dbias_data, epsilon); + int r = xpu::layer_norm_grad(dev_ctx.x_context(), x_data, dy_data, dx_data, + left, right, epsilon, scale_data, mean_data, + variance_data, dscale_data, dbias_data); PADDLE_ENFORCE_EQ( r, XPU_SUCCESS, - platform::errors::External("XPU API(layer_norm_backward) return wrong " - "value[%d], please check whether Baidu " - "Kunlun Card is properly installed.", - r)); + platform::errors::External( + "XPU layer_norm_grad kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); } }; diff --git a/paddle/fluid/operators/matmul_op_xpu.cc b/paddle/fluid/operators/matmul_op_xpu.cc index 14bef89a71b8b4ea96f15b5bf4664456045ccb90..8834e95758bf2f43c8ccda213b559d04c18556ce 100644 --- a/paddle/fluid/operators/matmul_op_xpu.cc +++ b/paddle/fluid/operators/matmul_op_xpu.cc @@ -24,6 +24,8 @@ limitations under the License. */ namespace paddle { namespace operators { +using framework::Tensor; + static framework::DDim RowMatrixFromVector(const framework::DDim &x_dim) { if (x_dim.size() > 1) { return x_dim; @@ -97,6 +99,86 @@ static void ReshapeXYOutIntoMatrixSequence(framework::Tensor *x, ReshapeTensorIntoMatrixSequence(y, mat_dim_y); } +template +static void MatMulXPUFunction(const Tensor *x, const Tensor *y, Tensor *out, + bool trans_x, bool trans_y, + const paddle::framework::ExecutionContext &ctx) { + const auto &x_dims = x->dims(); + const auto &y_dims = y->dims(); + auto &dev_ctx = + ctx.template device_context(); + + auto mat_dim_a = + math::CreateMatrixDescriptor(RowMatrixFromVector(x_dims), 0, trans_x); + auto mat_dim_b = + math::CreateMatrixDescriptor(ColumnMatrixFromVector(y_dims), 0, trans_y); + + if (x_dims.size() == 3 && y_dims.size() <= 2) { + // if transpose_X is true, the transpose cost much time + if (!trans_x) { + mat_dim_a.height_ *= mat_dim_a.batch_size_; + mat_dim_a.batch_size_ = 0; + } else { + mat_dim_b.batch_size_ = mat_dim_a.batch_size_; + mat_dim_b.height_ = mat_dim_b.height_ / mat_dim_b.batch_size_; + } + } + PADDLE_ENFORCE_EQ( + mat_dim_a.width_, mat_dim_b.height_, + platform::errors::InvalidArgument("Shape mistake in matmul_op, the " + "first tensor width must be same as " + "second tensor height, but received " + "width:%d, height:%d", + mat_dim_a.width_, mat_dim_b.height_)); + PADDLE_ENFORCE_EQ(mat_dim_a.batch_size_, mat_dim_b.batch_size_, + platform::errors::InvalidArgument( + "Shape mistake in matmul_op, the two input" + "tensor batch_size must be same, but received first " + "tensor batch_size:%d, second " + "tensor batch_size:%d", + mat_dim_a.batch_size_, mat_dim_b.batch_size_)); + + T alpha = static_cast(ctx.Attr("alpha")); + + float *data_c = out->data(); + int m = mat_dim_a.height_; + int n = mat_dim_b.width_; + int k = mat_dim_a.width_; + int ldx = mat_dim_a.trans_ ? m : k; + int ldy = mat_dim_b.trans_ ? k : n; + int ldout = n; + int batch_size = mat_dim_a.batch_size_; + + if (batch_size == 0) { + int r = xpu::fc_fusion( + dev_ctx.x_context(), x->data(), y->data(), data_c, m, n, k, + mat_dim_a.trans_, mat_dim_b.trans_, nullptr, nullptr, nullptr, ldx, ldy, + ldout, alpha, 0, nullptr, xpu::Activation_t::LINEAR); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU fc_fusion kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); + } else { + // batch matmul + int x_stride = mat_dim_a.stride_; + int y_stride = mat_dim_b.stride_; + int out_stride = m * n; + for (int i = 0; i < batch_size; ++i) { + const float *x_data = x->data() + x_stride * i; + const float *y_data = y->data() + y_stride * i; + float *out_data = data_c + out_stride * i; + int r = xpu::fc_fusion( + dev_ctx.x_context(), x_data, y_data, out_data, m, n, k, + mat_dim_a.trans_, mat_dim_b.trans_, nullptr, nullptr, nullptr, ldx, + ldy, ldout, alpha, 0, nullptr, xpu::Activation_t::LINEAR); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU fc_fusion kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); + } + } +} + template class MatMulXPUKernel : public framework::OpKernel { public: @@ -105,78 +187,12 @@ class MatMulXPUKernel : public framework::OpKernel { auto *y = context.Input("Y"); auto *out = context.Output("Out"); out->mutable_data(context.GetPlace()); - - auto mat_dim_a = math::CreateMatrixDescriptor( - RowMatrixFromVector(x->dims()), 0, context.Attr("transpose_X")); - auto mat_dim_b = - math::CreateMatrixDescriptor(ColumnMatrixFromVector(y->dims()), 0, - context.Attr("transpose_Y")); - - const auto &x_dims = x->dims(); - const auto &y_dims = y->dims(); - if (x_dims.size() == 3 && y_dims.size() <= 2) { - // if transpose_X is true, the transpose cost much time - if (!context.Attr("transpose_X")) { - mat_dim_a.height_ *= mat_dim_a.batch_size_; - mat_dim_a.batch_size_ = 0; - } else { - mat_dim_b.batch_size_ = mat_dim_a.batch_size_; - mat_dim_b.height_ = mat_dim_b.height_ / mat_dim_b.batch_size_; - } - } - - PADDLE_ENFORCE_EQ( - mat_dim_a.width_, mat_dim_b.height_, - platform::errors::InvalidArgument("Shape mistake in matmul_op, the " - "first tensor width must be same as " - "second tensor height, but received " - "width:%d, height:%d", - mat_dim_a.width_, mat_dim_b.height_)); - PADDLE_ENFORCE_EQ(mat_dim_a.batch_size_, mat_dim_b.batch_size_, - platform::errors::InvalidArgument( - "Shape mistake in matmul_op, the two input" - "tensor batch_size must be same, but received first " - "tensor batch_size:%d, second " - "tensor batch_size:%d", - mat_dim_a.batch_size_, mat_dim_b.batch_size_)); - T alpha = static_cast(context.Attr("alpha")); - - auto &dev_ctx = context.template device_context(); - float *data_c = out->data(); - int m = mat_dim_a.height_; - int n = mat_dim_b.width_; - int k = mat_dim_a.width_; - int ldx = mat_dim_a.trans_ ? m : k; - int ldy = mat_dim_b.trans_ ? k : n; - int ldout = n; - int batch_size = mat_dim_a.batch_size_; - if (batch_size == 0 || batch_size == 1) { - int r = xpu::fc_fusion( - dev_ctx.x_context(), x->data(), y->data(), data_c, m, n, k, - mat_dim_a.trans_, mat_dim_b.trans_, nullptr, nullptr, nullptr, ldx, - ldy, ldout, alpha, 0, nullptr, xpu::Activation_t::LINEAR); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, - platform::errors::External( - "XPU fc_fusion kernel return wrong value[%d %s]", r, - XPUAPIErrorMsg[r])); + bool trans_x = context.Attr("transpose_X"); + bool trans_y = context.Attr("transpose_Y"); + if (std::getenv("XPU_PADDLE_MAT_MUL_FCINT32") != nullptr) { + MatMulXPUFunction(x, y, out, trans_x, trans_y, context); } else { - // batch matmul - int x_stride = mat_dim_a.stride_; - int y_stride = mat_dim_b.stride_; - int out_stride = m * n; - for (int i = 0; i < batch_size; ++i) { - const float *x_data = x->data() + x_stride * i; - const float *y_data = y->data() + y_stride * i; - float *out_data = data_c + out_stride * i; - int r = xpu::fc_fusion( - dev_ctx.x_context(), x_data, y_data, out_data, m, n, k, - mat_dim_a.trans_, mat_dim_b.trans_, nullptr, nullptr, nullptr, ldx, - ldy, ldout, alpha, 0, nullptr, xpu::Activation_t::LINEAR); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, - platform::errors::External( - "XPU fc_fusion kernel return wrong value[%d %s]", - r, XPUAPIErrorMsg[r])); - } + MatMulXPUFunction(x, y, out, trans_x, trans_y, context); } } }; @@ -244,75 +260,10 @@ class MatMulGradXPUKernel : public framework::OpKernel { const framework::Tensor &b, bool trans_b, framework::Tensor *out) const { out->mutable_data(context.GetPlace()); - auto mat_dim_a = math::CreateMatrixDescriptor(a.dims(), 0, trans_a); - auto mat_dim_b = math::CreateMatrixDescriptor(b.dims(), 0, trans_b); - const auto &a_dims = a.dims(); - const auto &b_dims = b.dims(); - if (a_dims.size() == 3 && b_dims.size() <= 2) { - // if transpose_X is true, the transpose cost much time - if (!context.Attr("transpose_X")) { - mat_dim_a.height_ *= mat_dim_a.batch_size_; - mat_dim_a.batch_size_ = 0; - } else { - mat_dim_b.batch_size_ = mat_dim_a.batch_size_; - mat_dim_b.height_ = mat_dim_b.height_ / mat_dim_b.batch_size_; - } - } - - PADDLE_ENFORCE_EQ(mat_dim_a.width_, mat_dim_b.height_, - platform::errors::InvalidArgument( - "Shape mistake in matmul_grad_op, the " - "first tensor width must be same as second tensor " - "height, but received " - "width:%d, height:%d", - mat_dim_a.width_, mat_dim_b.height_)); - PADDLE_ENFORCE_EQ(mat_dim_a.batch_size_, mat_dim_b.batch_size_, - platform::errors::InvalidArgument( - "Shape mistake in matmul_grad_op, the two input" - "tensor batch_size must be same, but received first " - "tensor batch_size:%d, second " - "tensor batch_size:%d", - mat_dim_a.batch_size_, mat_dim_b.batch_size_)); - - T alpha = static_cast(context.Attr("alpha")); - - auto &dev_ctx = context.template device_context(); - float *data_c = out->data(); - - int m = mat_dim_a.height_; - int n = mat_dim_b.width_; - int k = mat_dim_a.width_; - int ldx = mat_dim_a.trans_ ? m : k; - int ldy = mat_dim_b.trans_ ? k : n; - int ldout = n; - int batch_size = mat_dim_a.batch_size_; - if (batch_size == 0 || batch_size == 1) { - int r = xpu::fc_fusion( - dev_ctx.x_context(), a.data(), b.data(), data_c, m, n, k, - mat_dim_a.trans_, mat_dim_b.trans_, nullptr, nullptr, nullptr, ldx, - ldy, ldout, alpha, 0, nullptr, xpu::Activation_t::LINEAR); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, - platform::errors::External( - "XPU fc_fusion kernel return wrong value[%d %s]", r, - XPUAPIErrorMsg[r])); + if (std::getenv("XPU_PADDLE_MAT_MUL_GRAD_FCINT32") != nullptr) { + MatMulXPUFunction(&a, &b, out, trans_a, trans_b, context); } else { - // batch matmul - int x_stride = mat_dim_a.stride_; - int y_stride = mat_dim_b.stride_; - int out_stride = m * n; - for (int i = 0; i < batch_size; ++i) { - const float *x_data = a.data() + x_stride * i; - const float *y_data = b.data() + y_stride * i; - float *out_data = data_c + out_stride * i; - int r = xpu::fc_fusion( - dev_ctx.x_context(), x_data, y_data, out_data, m, n, k, - mat_dim_a.trans_, mat_dim_b.trans_, nullptr, nullptr, nullptr, ldx, - ldy, ldout, alpha, 0, nullptr, xpu::Activation_t::LINEAR); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, - platform::errors::External( - "XPU fc_fusion kernel return wrong value[%d %s]", - r, XPUAPIErrorMsg[r])); - } + MatMulXPUFunction(&a, &b, out, trans_a, trans_b, context); } } diff --git a/paddle/fluid/operators/matmul_v2_op_xpu.cc b/paddle/fluid/operators/matmul_v2_op_xpu.cc index d6f3cc226e65543e8ddc923467d7dc26a5ca4432..765a380c6b84ff4cf0da7a36bf9e1050ff0a9b73 100644 --- a/paddle/fluid/operators/matmul_v2_op_xpu.cc +++ b/paddle/fluid/operators/matmul_v2_op_xpu.cc @@ -21,211 +21,141 @@ namespace paddle { namespace operators { -template -void MatMulXPUFunction(const Tensor* X, const Tensor* Y, - const std::vector& x_dims, - const std::vector& y_dims, Tensor* Out, - bool trans_x, bool trans_y, - const paddle::framework::ExecutionContext& ctx) { - const int x_ndim = x_dims.size(); - const int y_ndim = y_dims.size(); - +template +static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out, + bool trans_x, bool trans_y, + const paddle::framework::ExecutionContext& ctx) { + const auto& x_dims = x->dims(); + const auto& y_dims = y->dims(); auto& dev_ctx = ctx.template device_context(); - // currently only support x_ndim == y_dim and non-broadcast case - PADDLE_ENFORCE_EQ(x_ndim, y_ndim, platform::errors::InvalidArgument( - "Shape mistake in matmul_v2_op")); - for (int i = 0; i < x_ndim - 2; i++) { - PADDLE_ENFORCE_EQ( - x_dims.data()[i], y_dims.data()[i], - platform::errors::InvalidArgument("Shape mistake in matmul_v2_op")); - } - - int ret = 0; - if (x_ndim == 1 && y_ndim == 1) { - PADDLE_ENFORCE_EQ(X->numel(), Y->numel(), - platform::errors::InvalidArgument( - "X's numbers is not equal to Y's numbers," - "when X/Y's dims =1")); - VLOG(3) << "MatMul's case 1"; - Out->Resize({1}); - Out->mutable_data(ctx.GetPlace()); - ret = baidu::xpu::api::fc_int16(dev_ctx.x_context(), false, false, 1, 1, - X->numel(), 1.0f, X->data(), - Y->data(), 0.0f, Out->data()); - PADDLE_ENFORCE_EQ( - ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d] in matmul_v2, please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); - return; - } + auto mat_dim_a = + math::CreateMatrixDescriptor(RowMatrixFromVector(x_dims), 0, trans_x); + auto mat_dim_b = + math::CreateMatrixDescriptor(ColumnMatrixFromVector(y_dims), 0, trans_y); - if (x_ndim == 1) { - const int N = X->numel(); - if (trans_y) { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 1], N, - platform::errors::InvalidArgument("Input(Y) has error dim.")); + if (x_dims.size() == 3 && y_dims.size() <= 2) { + // if transpose_X is true, the transpose cost much time + if (!trans_x) { + mat_dim_a.height_ *= mat_dim_a.batch_size_; + mat_dim_a.batch_size_ = 0; } else { - PADDLE_ENFORCE_EQ( - y_dims[y_ndim - 2], N, - platform::errors::InvalidArgument("Input(Y) has error dim.")); + mat_dim_b.batch_size_ = mat_dim_a.batch_size_; + mat_dim_b.height_ = mat_dim_b.height_ / mat_dim_b.batch_size_; } - std::vector out_dims(y_ndim - 1); - if (trans_y) { - std::copy_n(y_dims.cbegin(), y_ndim - 1, out_dims.begin()); - } else { - std::copy_n(y_dims.cbegin(), y_ndim - 2, out_dims.begin()); - out_dims.back() = y_dims.back(); - } - Out->Resize(framework::make_ddim(out_dims)); - Out->mutable_data(ctx.GetPlace()); - if (trans_y) { - const int M = Y->numel() / N; - VLOG(3) << "MatMul's case 2"; - ret = baidu::xpu::api::fc_int16(dev_ctx.x_context(), false, true, 1, M, N, - 1.0f, X->data(), Y->data(), 0.0f, - Out->data()); - PADDLE_ENFORCE_EQ( - ret, XPU_SUCCESS, - platform::errors::External("XPU API return wrong value[%d] in " - "matmul_v2, please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); - } else { - const int M = y_dims[y_ndim - 1]; - const int batch_size = Y->numel() / (M * N); - for (int i = 0; i < batch_size; i++) { - ret = baidu::xpu::api::fc_int16( - dev_ctx.x_context(), false, false, 1, M, N, 1.0f, X->data(), - Y->data() + i * M * N, 0.0f, Out->data() + i * M); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d] in matmul_v2, " - "please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); - } - } - return; } - if (y_ndim == 1) { - const int N = Y->numel(); - if (trans_x) { - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 2], N, - platform::errors::InvalidArgument("Input(X) has error dim.")); - } else { - PADDLE_ENFORCE_EQ( - x_dims[x_ndim - 1], N, - platform::errors::InvalidArgument("Input(X) has error dim.")); + if (mat_dim_a.width_ == mat_dim_b.height_) { + if (mat_dim_a.batch_size_ == 0 && mat_dim_b.batch_size_ == 1) { + mat_dim_a.batch_size_ = mat_dim_b.batch_size_ = 0; } - std::vector out_dims(x_ndim - 1); - if (trans_x) { - std::copy_n(x_dims.cbegin(), x_ndim - 2, out_dims.begin()); - out_dims.back() = x_dims.back(); - } else { - std::copy_n(x_dims.cbegin(), x_ndim - 1, out_dims.begin()); - } - Out->Resize(framework::make_ddim(out_dims)); - Out->mutable_data(ctx.GetPlace()); - - if (trans_x) { - const int M = x_dims[x_ndim - 1]; - const int batch_size = X->numel() / (M * N); - for (int i = 0; i < batch_size; i++) { - ret = baidu::xpu::api::fc_int16(dev_ctx.x_context(), true, false, M, 1, - N, 1.0f, X->data() + i * M * N, - Y->data(), 0.0f, - Out->data() + i * M); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d] in matmul_v2, " - "please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); - } - } else { - const int M = X->numel() / N; - VLOG(3) << "MatMul's case 7"; - ret = baidu::xpu::api::fc_int16(dev_ctx.x_context(), false, false, M, 1, - N, 1.0f, X->data(), Y->data(), 0.0f, - Out->data()); - PADDLE_ENFORCE_EQ( - ret, XPU_SUCCESS, - platform::errors::External("XPU API return wrong value[%d] in " - "matmul_v2, please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); + if (mat_dim_a.batch_size_ == 1 && mat_dim_b.batch_size_ == 0) { + mat_dim_a.batch_size_ = mat_dim_b.batch_size_ = 0; } - return; } - const int M = trans_x ? x_dims[x_ndim - 1] : x_dims[x_ndim - 2]; - const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; - if (trans_y) { - PADDLE_ENFORCE_EQ(y_dims[y_ndim - 1], K, platform::errors::InvalidArgument( - "Input(X) has error dim.")); + PADDLE_ENFORCE_EQ(mat_dim_a.width_, mat_dim_b.height_, + platform::errors::InvalidArgument( + "Shape mistake in matmul_v2_op xdims = %s ydims = %s", + x_dims.to_str(), y_dims.to_str())); + PADDLE_ENFORCE_EQ(mat_dim_a.batch_size_, mat_dim_b.batch_size_, + platform::errors::InvalidArgument( + "Shape mistake in matmul_v2_op xdims = %s ydims = %s", + x_dims.to_str(), y_dims.to_str())); + + float* data_c = out->data(); + int m = mat_dim_a.height_; + int n = mat_dim_b.width_; + int k = mat_dim_a.width_; + int batch_size = mat_dim_a.batch_size_; + + if (batch_size == 0) { + int r = xpu::fc( + dev_ctx.x_context(), x->data(), y->data(), data_c, m, n, k, + mat_dim_a.trans_, mat_dim_b.trans_, nullptr, nullptr, nullptr); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU fc_fusion kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); } else { - PADDLE_ENFORCE_EQ(y_dims[y_ndim - 2], K, platform::errors::InvalidArgument( - "Input(X) has error dim.")); - } - const int N = trans_y ? y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; - const int ndim = (std::max)(x_ndim, y_ndim); - std::vector out_broadcast_dims(ndim); - int batch_size = 1; - for (int i = 0; i < ndim - 2; i++) { - PADDLE_ENFORCE_EQ( - x_dims.data()[i], y_dims.data()[i], - platform::errors::InvalidArgument("Shape mistake in matmul_v2_op")); - out_broadcast_dims[i] = x_dims.data()[i]; - batch_size *= x_dims.data()[i]; + // batch matmul + int x_stride = mat_dim_a.stride_; + int y_stride = mat_dim_b.stride_; + int out_stride = m * n; + for (int i = 0; i < batch_size; ++i) { + const float* x_data = x->data() + x_stride * i; + const float* y_data = y->data() + y_stride * i; + float* out_data = data_c + out_stride * i; + int r = xpu::fc( + dev_ctx.x_context(), x_data, y_data, out_data, m, n, k, + mat_dim_a.trans_, mat_dim_b.trans_, nullptr, nullptr, nullptr); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU fc_fusion kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); + } } - - out_broadcast_dims[ndim - 2] = M; - out_broadcast_dims[ndim - 1] = N; - - Out->Resize(framework::make_ddim(out_broadcast_dims)); - Out->mutable_data(ctx.GetPlace()); - ret = baidu::xpu::api::batched_gemm_int16( - dev_ctx.x_context(), trans_x, trans_y, batch_size, M, N, K, 1.0f, - X->data(), Y->data(), Out->data(), nullptr, nullptr); - PADDLE_ENFORCE_EQ( - ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d] in matmul_v2, please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); } template class MatMulV2XPUKernel : public framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* Y = ctx.Input("Y"); - auto* Out = ctx.Output("Out"); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); bool trans_x = ctx.Attr("trans_x"); bool trans_y = ctx.Attr("trans_y"); - MatMulXPUFunction(X, Y, vectorize(X->dims()), vectorize(Y->dims()), Out, - trans_x, trans_y, ctx); + out->mutable_data(ctx.GetPlace()); + if (std::getenv("XPU_PADDLE_MAT_MUL_V2_FCINT32") != nullptr) { + MatMulXPUFunction(x, y, out, trans_x, trans_y, ctx); + } else { + MatMulXPUFunction(x, y, out, trans_x, trans_y, ctx); + } } }; +template +static framework::Tensor XPUFoldHeadAndLastDims( + const DeviceContext& context, const framework::Tensor& input) { + auto in_dims = input.dims(); + if (in_dims.size() != 3) { + return input; + } + + framework::Tensor output; + output.Resize({in_dims[1], in_dims[0], in_dims[2]}); + output.mutable_data(context.GetPlace()); + std::vector in_shape_host = {static_cast(in_dims[0]), + static_cast(in_dims[1]), + static_cast(in_dims[2])}; + std::vector axis_host = {1, 0, 2}; + + int r = xpu::transpose(context.x_context(), input.data(), output.data(), + in_shape_host.data(), axis_host.data(), /*ndims=*/3); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU transpose kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); + output.Resize({in_dims[1], in_dims[0] * in_dims[2]}); + + return output; +} + template class MatMulV2XPUGradKernel : public framework::OpKernel { public: - void MatMul(const framework::ExecutionContext& context, + void MatMul(const framework::ExecutionContext& ctx, const framework::Tensor& a, bool trans_a, const framework::Tensor& b, bool trans_b, framework::Tensor* out) const { - out->mutable_data(context.GetPlace()); - MatMulXPUFunction(&a, &b, vectorize(a.dims()), vectorize(b.dims()), out, - trans_a, trans_b, context); + out->mutable_data(ctx.GetPlace()); + if (std::getenv("XPU_PADDLE_MAT_MUL_GRAD_V2_FCINT32") != nullptr) { + MatMulXPUFunction(&a, &b, out, trans_a, trans_b, ctx); + } else { + MatMulXPUFunction(&a, &b, out, trans_a, trans_b, ctx); + } } void CalcInputGrad(const framework::ExecutionContext& context, @@ -239,118 +169,73 @@ class MatMulV2XPUGradKernel : public framework::OpKernel { if (!need_combine) { MatMul(context, a, trans_a, b, trans_b, out); } else { - // currently not support this case + auto& dev_ctx = + context.template device_context(); + MatMul( + context, + is_fold_init_dims_a + ? FoldInitDims(a) + : XPUFoldHeadAndLastDims( + dev_ctx, a), + trans_a, + is_fold_init_dims_b + ? FoldInitDims(b) + : XPUFoldHeadAndLastDims( + dev_ctx, b), + trans_b, out); } } - void Compute(const framework::ExecutionContext& ctx) const override { - bool transpose_x = ctx.Attr("trans_x"); - bool transpose_y = ctx.Attr("trans_y"); - - auto x = *ctx.Input("X"); - auto y = *ctx.Input("Y"); - auto dout = *ctx.Input(framework::GradVarName("Out")); - - // get dims - std::vector x_dims = vectorize(x.dims()); - std::vector y_dims = vectorize(y.dims()); - std::vector dout_dims = vectorize(dout.dims()); - - int x_ndim = x_dims.size(); - int y_ndim = y_dims.size(); - - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - auto& dev_ctx = - ctx.template device_context(); - // Case1 : x's or y's dim = 1 - int ret = 0; - if (x_ndim == 1 && y_ndim == 1) { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - ret = baidu::xpu::api::fc_int16(dev_ctx.x_context(), false, false, - dx->numel(), 1, 1, 1.0f, y.data(), - dout.data(), 0.0f, dx->data()); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d] in " - "matmul_v2_grad, please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); + void Compute(const framework::ExecutionContext& context) const override { + bool transpose_x = context.Attr("trans_x"); + bool transpose_y = context.Attr("trans_y"); + + auto x = *context.Input("X"); + auto y = *context.Input("Y"); + auto dout = + *context.Input(framework::GradVarName("Out")); + auto* dx = context.Output(framework::GradVarName("X")); + auto* dy = context.Output(framework::GradVarName("Y")); + ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, transpose_x, transpose_y); + framework::DDim dx_dims; + if (dx) { + dx_dims = dx->dims(); + if (dx_dims != x.dims()) { + dx->Resize(x.dims()); } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - ret = baidu::xpu::api::fc_int16(dev_ctx.x_context(), false, false, - dy->numel(), 1, 1, 1.0f, x.data(), - dout.data(), 0.0f, dy->data()); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d] in " - "matmul_v2_grad, please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); + } + + framework::DDim dy_dims; + if (dy) { + dy_dims = dy->dims(); + if (dy_dims != y.dims()) { + dy->Resize(y.dims()); } - return; } - bool is_broadcast = true; - if (x_ndim <= 2 || y_ndim <= 2) { - is_broadcast = false; - } else if (x_ndim != y_ndim) { - is_broadcast = true; + if (transpose_x && transpose_y) { + CalcInputGrad(context, y, true, true, dout, true, false, dx); + CalcInputGrad(context, dout, true, true, x, true, false, dy); + } else if (transpose_x) { + CalcInputGrad(context, y, false, false, dout, true, false, dx); + CalcInputGrad(context, x, false, false, dout, false, true, dy); + } else if (transpose_y) { + CalcInputGrad(context, dout, false, false, y, false, true, dx); + CalcInputGrad(context, dout, true, true, x, false, true, dy); } else { - is_broadcast = !std::equal(x_dims.cbegin(), x_dims.cbegin() + x_ndim - 2, - y_dims.cbegin()); + CalcInputGrad(context, dout, false, false, y, true, false, dx); + CalcInputGrad(context, x, true, true, dout, false, true, dy); } - // currently only support non-broadcast case - PADDLE_ENFORCE_EQ( - is_broadcast, false, - platform::errors::InvalidArgument("Shape mistake in matmul_v2_op")); - - // Case2: no broadcast or no batch size, it aims to speed and it is same as - // matmul in old version. - if (!is_broadcast) { - ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, transpose_x, transpose_y); - framework::DDim dx_dims; - if (dx) { - dx_dims = dx->dims(); - if (dx_dims != x.dims()) { - dx->Resize(x.dims()); - } - } - - framework::DDim dy_dims; - if (dy) { - dy_dims = dy->dims(); - if (dy_dims != y.dims()) { - dy->Resize(y.dims()); - } - } - if (transpose_x && transpose_y) { - CalcInputGrad(ctx, y, true, true, dout, true, false, dx); - CalcInputGrad(ctx, dout, true, true, x, true, false, dy); - } else if (transpose_x) { - CalcInputGrad(ctx, y, false, false, dout, true, false, dx); - CalcInputGrad(ctx, x, false, false, dout, false, true, dy); - } else if (transpose_y) { - CalcInputGrad(ctx, dout, false, false, y, false, true, dx); - CalcInputGrad(ctx, dout, true, true, x, false, true, dy); - } else { - CalcInputGrad(ctx, dout, false, false, y, true, false, dx); - CalcInputGrad(ctx, x, true, true, dout, false, true, dy); + if (dx) { + if (dx_dims != x.dims()) { + dx->Resize(dx_dims); } + } - if (dx) { - if (dx_dims != x.dims()) { - dx->Resize(dx_dims); - } - } - if (dy) { - if (dy_dims != y.dims()) { - dy->Resize(dy_dims); - } + if (dy) { + if (dy_dims != y.dims()) { + dy->Resize(dy_dims); } } } diff --git a/paddle/fluid/operators/one_hot_op_xpu.cc b/paddle/fluid/operators/one_hot_op_xpu.cc index 6cb2dd0bcf6d5b90cafbec82cdbb4e84719551bb..14ecd11d114d0b65a65989e38a43fc78dc765c53 100644 --- a/paddle/fluid/operators/one_hot_op_xpu.cc +++ b/paddle/fluid/operators/one_hot_op_xpu.cc @@ -35,7 +35,7 @@ class OneHotXPUKernel : public framework::OpKernel { if (context.HasInput("depth_tensor")) { auto* depth_tensor = context.Input("depth_tensor"); auto* depth_data = depth_tensor->data(); - if (depth_tensor->place() == platform::XPUPlace()) { + if (platform::is_xpu_place(depth_tensor->place())) { xpu_memcpy(static_cast(&depth), static_cast(depth_data), sizeof(int32_t), XPU_DEVICE_TO_HOST); diff --git a/paddle/fluid/operators/one_hot_v2_op_xpu.cc b/paddle/fluid/operators/one_hot_v2_op_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..6fec597db1729076e9e26ddf72a13ed402538905 --- /dev/null +++ b/paddle/fluid/operators/one_hot_v2_op_xpu.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef PADDLE_WITH_XPU +#include +#include + +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/operators/one_hot_op.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +template +class OneHotV2XPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + int depth = context.Attr("depth"); + if (context.HasInput("depth_tensor")) { + auto* depth_tensor = context.Input("depth_tensor"); + auto* depth_data = depth_tensor->data(); + if (platform::is_xpu_place(depth_tensor->place())) { + xpu_memcpy(static_cast(&depth), + static_cast(depth_data), sizeof(int32_t), + XPU_DEVICE_TO_HOST); + } else { + depth = depth_data[0]; + } + auto out_dims = out->dims(); + out_dims[out_dims.size() - 1] = depth; + out->Resize(out_dims); + } + + auto& dev_ctx = context.template device_context(); + int len = in->numel(); + int ret = xpu::one_hot(dev_ctx.x_context(), in->data(), + out->mutable_data(context.GetPlace()), len, + depth, 1.0, 0.0); + + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU one_hot kernel return wrong value[%d %s]", ret, + XPUAPIErrorMsg[ret])); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + one_hot_v2, ops::OneHotV2XPUKernel, + ops::OneHotV2XPUKernel); +#endif diff --git a/paddle/fluid/operators/range_op_xpu.cc b/paddle/fluid/operators/range_op_xpu.cc new file mode 100644 index 0000000000000000000000000000000000000000..f37a8b34a0fd64dadbb96572b4d0691c73e87b01 --- /dev/null +++ b/paddle/fluid/operators/range_op_xpu.cc @@ -0,0 +1,69 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/operators/range_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class XPURangeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* start_t = context.Input("Start"); + auto* end_t = context.Input("End"); + auto* step_t = context.Input("Step"); + auto* out = context.Output("Out"); + + framework::Tensor n; + framework::TensorCopy(*start_t, platform::CPUPlace(), &n); + T start = n.data()[0]; + framework::TensorCopy(*end_t, platform::CPUPlace(), &n); + T end = n.data()[0]; + framework::TensorCopy(*step_t, platform::CPUPlace(), &n); + T step = n.data()[0]; + + int64_t size = 0; + GetSize(start, end, step, &size); + out->Resize(framework::make_ddim({size})); + + T* out_data = out->mutable_data(context.GetPlace()); + + framework::Tensor out_cpu; + T* out_cpu_data_ptr = + out_cpu.mutable_data(platform::CPUPlace(), out->numel() * sizeof(T)); + T value = start; + for (int64_t i = 0; i < size; ++i) { + out_cpu_data_ptr[i] = value; + value += step; + } + int ret = xpu_memcpy(out_data, out_cpu_data_ptr, out->numel() * sizeof(T), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External("XPU xpu_memcpy return wrong " + "value[%d %s]", + ret, XPUAPIErrorMsg[ret])); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL(range, ops::XPURangeKernel, + ops::XPURangeKernel, ops::XPURangeKernel, + ops::XPURangeKernel); +#endif // PADDLE_WITH_XPU diff --git a/paddle/fluid/operators/scale_op_xpu.cc b/paddle/fluid/operators/scale_op_xpu.cc index b778bab8f93087e08709273b3859939a6358b855..fdb90797b69db5b4ca325eeeddf4ccc63353ce64 100644 --- a/paddle/fluid/operators/scale_op_xpu.cc +++ b/paddle/fluid/operators/scale_op_xpu.cc @@ -46,10 +46,13 @@ class ScaleXPUKernel : public framework::OpKernel { in->dims().to_str().c_str(), out->dims().to_str().c_str())); auto& dev_ctx = ctx.template device_context(); - int r = xpu::scale(dev_ctx.x_context(), in->numel(), scale, bias, - bias_after_scale, in->data(), out->data()); - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::Fatal("XPU scale kernel error!")); + int r = + xpu::scale(dev_ctx.x_context(), in->data(), out->data(), + in->numel(), bias_after_scale, scale, bias); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU scale kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); } }; diff --git a/paddle/fluid/operators/softmax_op_xpu.cc b/paddle/fluid/operators/softmax_op_xpu.cc index 312c5d2dde163f2e34110c7761ac71b9dfb363e6..5d190189bf08253e8e1c04830c72b7d0abe1dfbe 100644 --- a/paddle/fluid/operators/softmax_op_xpu.cc +++ b/paddle/fluid/operators/softmax_op_xpu.cc @@ -41,8 +41,21 @@ class SoftmaxXPUKernel : public framework::OpKernel { } auto& dev_ctx = context.template device_context(); - int r = xpu::softmax(dev_ctx.x_context(), x->data(), - out->data(), x_dims, axis); + + int r = XPU_SUCCESS; + Tensor clip_x; + int len = x->numel(); + T* clip_x_data = + clip_x.mutable_data(platform::XPUPlace(), len * sizeof(T)); + r = xpu::clip(dev_ctx.x_context(), x->data(), clip_x_data, len, + -1e30, 1e30); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External("XPU API(clip) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + + r = xpu::softmax(dev_ctx.x_context(), clip_x_data, out->data(), + x_dims, axis); PADDLE_ENFORCE_EQ( r, XPU_SUCCESS, platform::errors::External("XPU API(softmax2d_forward) return wrong " diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py index 1cc9950f9a15bcacee87cae548670647d6afde84..531e9488d602de89360bc4f3e733264371c2eb9f 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py @@ -13,12 +13,11 @@ # limitations under the License. from __future__ import print_function - -import unittest -import numpy as np import sys sys.path.append("..") -from op_test import OpTest +import unittest +import numpy as np +from op_test_xpu import XPUOpTest import paddle.fluid.core as core import paddle @@ -57,9 +56,7 @@ def reference_matmul(X, Y, transpose_X=False, transpose_Y=False): return Out -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestMatMulV2Op(OpTest): +class TestMatMulV2Op(XPUOpTest): """ case 1 """ @@ -74,10 +71,10 @@ class TestMatMulV2Op(OpTest): self.dtype = "float32" def setUp(self): + self.use_xpu = True self.init_kernel_type() self.config() self.op_type = "matmul_v2" - self.use_xpu = True x = np.random.random(self.x_shape).astype(self.dtype) y = np.random.random(self.y_shape).astype(self.dtype) # -0.1 ~ 0.1 @@ -94,31 +91,25 @@ class TestMatMulV2Op(OpTest): def test_check_output(self): place = paddle.XPUPlace(0) - self.check_output_with_place(place, atol=0.01) + self.check_output_with_place(place) def test_check_grad(self): place = paddle.XPUPlace(0) - self.check_grad_with_place( - place, ['X', 'Y'], 'Out', max_relative_error=0.1) + self.check_grad_with_place(place, ['X', 'Y'], 'Out') -''' -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestMatMuklOp2(TestMatMulV2Op): - """ - case 2 - """ +# class TestMatMuklOp2(TestMatMulV2Op): +# """ +# case 2 +# """ - def config(self): - self.x_shape = (100, ) - self.y_shape = (1, 3, 2, 100) - self.trans_x = False - self.trans_y = True +# def config(self): +# self.x_shape = (100, ) +# self.y_shape = (1, 3, 2, 100) +# self.trans_x = False +# self.trans_y = True -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") class TestMatMuklOp3(TestMatMulV2Op): """ case 3 @@ -131,21 +122,18 @@ class TestMatMuklOp3(TestMatMulV2Op): self.trans_y = False -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestMatMuklOp4(TestMatMulV2Op): - """ - case 4 - """ +# class TestMatMuklOp4(TestMatMulV2Op): +# """ +# case 4 +# """ + +# def config(self): +# self.x_shape = (100, ) +# self.y_shape = (1, 2, 100, 2) +# self.trans_x = False +# self.trans_y = False - def config(self): - self.x_shape = (100, ) - self.y_shape = (1, 2, 100, 2) - self.trans_x = False - self.trans_y = False -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") class TestMatMuklOp5(TestMatMulV2Op): """ case 5 @@ -158,37 +146,29 @@ class TestMatMuklOp5(TestMatMulV2Op): self.trans_y = False -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestMatMuklOp6(TestMatMulV2Op): - """ - case 6 - """ - - def config(self): - self.x_shape = (1, 2, 100, 1) - self.y_shape = (100, ) - self.trans_x = True - self.trans_y = False +# class TestMatMuklOp6(TestMatMulV2Op): +# """ +# case 6 +# """ +# def config(self): +# self.x_shape = (1, 2, 102, 1) +# self.y_shape = (102, ) +# self.trans_x = True +# self.trans_y = False -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestMatMuklOp7(TestMatMulV2Op): - """ - case 7 - """ +# class TestMatMuklOp7(TestMatMulV2Op): +# """ +# case 7 +# """ - def config(self): - self.x_shape = (1, 2, 1, 100) - self.y_shape = (100, ) - self.trans_x = False - self.trans_y = False -''' +# def config(self): +# self.x_shape = (1, 2, 1, 100) +# self.y_shape = (100, ) +# self.trans_x = False +# self.trans_y = False -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") class TestMatMuklOp8(TestMatMulV2Op): """ case 8 @@ -201,37 +181,97 @@ class TestMatMuklOp8(TestMatMulV2Op): self.trans_y = False -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") +# class TestMatMuklOp9(TestMatMulV2Op): +# """ +# case 9 +# """ + +# def config(self): +# self.x_shape = (1, 1, 1, 100) +# self.y_shape = (2, 1, 2, 100) +# self.trans_x = False +# self.trans_y = True + +# class TestMatMuklOp10(TestMatMulV2Op): +# """ +# case 10 +# """ + +# def config(self): +# self.x_shape = (1, 1, 25, 4) +# self.y_shape = (1, 2, 4, 25) +# self.trans_x = False +# self.trans_y = False + +# class TestMatMuklOp11(TestMatMulV2Op): +# """ +# case 11 +# """ + +# def config(self): +# self.x_shape = (2, 1, 2, 100) +# self.y_shape = (1, 1, 100, 2) +# self.trans_x = False +# self.trans_y = False + +# class TestMatMuklOp12(TestMatMulV2Op): +# """ +# case 12 +# """ + +# def config(self): +# self.x_shape = (2, 1, 4, 25) +# self.y_shape = (1, 1, 4, 25) +# self.trans_x = True +# self.trans_y = False + + class TestMatMuklOp13(TestMatMulV2Op): """ case 13 """ def config(self): - self.x_shape = (2, 2, 2, 50) - self.y_shape = (2, 2, 2, 50) + self.x_shape = (2, 2, 10, 10) + self.y_shape = (2, 2, 10, 10) self.trans_x = True self.trans_y = False -''' -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestMatMuklOp16(TestMatMulV2Op): - """ - case 16 : to check the gradient for special case - """ +# class TestMatMuklOp14(TestMatMulV2Op): +# """ +# case 14_1 +# """ - def config(self): - self.x_shape = (100) - self.y_shape = (1, 2, 2, 100, 2) - self.trans_x = False - self.trans_y = False +# def config(self): +# self.x_shape = (3, 1, 6, 6) +# self.y_shape = (1, 2, 6, 9) +# self.trans_x = True +# self.trans_y = False + +# class TestMatMuklOp15(TestMatMulV2Op): +# """ +# case 14_2 +# """ + +# def config(self): +# self.x_shape = (3, 1, 6, 6) +# self.y_shape = (1, 2, 6, 9) +# self.trans_x = False +# self.trans_y = False + +# class TestMatMuklOp16(TestMatMulV2Op): +# """ +# case 16 : to check the gradient for special case +# """ + +# def config(self): +# self.x_shape = (100) +# self.y_shape = (1, 2, 2, 100, 2) +# self.trans_x = False +# self.trans_y = False -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") class TestMatMuklOp17(TestMatMulV2Op): """ case 17 : to check the gradient for special case @@ -242,36 +282,30 @@ class TestMatMuklOp17(TestMatMulV2Op): self.y_shape = (100) self.trans_x = False self.trans_y = False -''' - - -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class TestMatMulV2API(unittest.TestCase): - def setUp(self): - self.places = [fluid.CPUPlace()] - self.places.append(fluid.XPUPlace(0)) - - def check_static_result(self, place): - with fluid.program_guard(fluid.Program(), fluid.Program()): - input_x = fluid.data(name="input_x", shape=[4, 3], dtype="float32") - input_y = fluid.data(name="input_y", shape=[3, 4], dtype="float32") - result = paddle.matmul(input_x, input_y) - x_np = np.random.random([4, 3]).astype("float32") - y_np = np.random.random([3, 4]).astype("float32") +# class TestMatMuklOpBroadcast1(TestMatMulV2Op): +# """ +# case 14_3 +# """ - exe = fluid.Executor(place) - fetches = exe.run(fluid.default_main_program(), - feed={"input_x": x_np, - "input_y": y_np}, - fetch_list=[result]) +# def config(self): +# self.x_shape = (3, 1, 10, 10) +# self.y_shape = (1, 2, 10, 10) +# self.trans_x = True +# self.trans_y = True - def test_static(self): - for place in self.places: - self.check_static_result(place=place) +# class TestMatMuklOpBroadcast2(TestMatMulV2Op): +# """ +# case 14_4 +# """ +# def config(self): +# self.x_shape = (3, 1, 10, 10) +# self.y_shape = (1, 2, 10, 10) +# self.trans_x = False +# self.trans_y = True if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_one_hot_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_one_hot_v2_op_xpu.py new file mode 100644 index 0000000000000000000000000000000000000000..9f937caa37ebfe476fbf2ba8efdd4ad2376209ea --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_one_hot_v2_op_xpu.py @@ -0,0 +1,196 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core +import sys +sys.path.append("..") +from op_test_xpu import XPUOpTest +import paddle.fluid as fluid +from paddle.fluid import Program, program_guard +import time + +paddle.enable_static() + + +class TestOneHotOp(XPUOpTest): + def setUp(self): + self.use_xpu = True + self.op_type = 'one_hot_v2' + depth = 10 + depth_np = np.array(10).astype('int32') + # dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0])]) + + out = np.zeros(shape=(np.product(x.shape), depth)).astype('float32') + + for i in range(np.product(x.shape)): + out[i, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np} + self.attrs = {'dtype': int(core.VarDesc.VarType.FP32)} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, check_dygraph=False) + + +class TestOneHotOp_attr(XPUOpTest): + def setUp(self): + self.op_type = 'one_hot_v2' + depth = 10 + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1]) + + out = np.zeros(shape=(np.product(x.shape[:-1]), 1, + depth)).astype('float32') + + for i in range(np.product(x.shape)): + out[i, 0, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod)} + self.attrs = {'dtype': int(core.VarDesc.VarType.FP32), 'depth': depth} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, check_dygraph=False) + + +class TestOneHotOp_default_dtype(XPUOpTest): + def setUp(self): + self.op_type = 'one_hot_v2' + depth = 10 + depth_np = np.array(10).astype('int32') + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0])]) + + out = np.zeros(shape=(np.product(x.shape), depth)).astype('float32') + + for i in range(np.product(x.shape)): + out[i, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np} + self.attrs = {} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, check_dygraph=False) + + +class TestOneHotOp_default_dtype_attr(XPUOpTest): + def setUp(self): + self.op_type = 'one_hot_v2' + depth = 10 + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1]) + + out = np.zeros(shape=(np.product(x.shape[:-1]), 1, + depth)).astype('float32') + + for i in range(np.product(x.shape)): + out[i, 0, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod)} + self.attrs = {'depth': depth} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, check_dygraph=False) + + +class TestOneHotOp_out_of_range(XPUOpTest): + def setUp(self): + self.op_type = 'one_hot_v2' + depth = 10 + x_lod = [[4, 1, 3, 3]] + x = [np.random.choice([-1, depth]) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0])]) + + out = np.zeros(shape=(np.product(x.shape), depth)).astype('float32') + + self.inputs = {'X': (x, x_lod)} + self.attrs = {'depth': depth, 'allow_out_of_range': True} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, check_dygraph=False) + + +class TestOneHotOpApi(unittest.TestCase): + def test_api(self): + depth = 10 + self._run(depth) + + def test_api_with_depthTensor(self): + depth = fluid.layers.assign(input=np.array([10], dtype=np.int32)) + self._run(depth) + + def test_api_with_dygraph(self): + depth = 10 + label = np.array([np.random.randint(0, depth - 1) + for i in range(6)]).reshape([6, 1]) + with fluid.dygraph.guard(): + one_hot_label = fluid.one_hot( + input=fluid.dygraph.to_variable(label), depth=depth) + + def _run(self, depth): + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + one_hot_label = fluid.one_hot(input=label, depth=depth) + + place = fluid.XPUPlace(0) + label_data = np.array([np.random.randint(0, 10 - 1) + for i in range(6)]).reshape([6, 1]) + + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + ret = exe.run(feed={'label': label_data, }, + fetch_list=[one_hot_label], + return_numpy=False) + + +class BadInputTestOnehotV2(unittest.TestCase): + def test_error(self): + with fluid.program_guard(fluid.Program()): + + def test_bad_x(): + label = fluid.layers.data( + name="label", + shape=[4], + append_batch_size=False, + dtype="float32") + one_hot_label = fluid.one_hot(input=label, depth=4) + + self.assertRaises(TypeError, test_bad_x) + + +if __name__ == '__main__': + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_range_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_range_xpu.py new file mode 100644 index 0000000000000000000000000000000000000000..f2a078fcd2db1d9a43a339c5a4261b49e8f6c63b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_range_xpu.py @@ -0,0 +1,76 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import paddle +import numpy as np +import sys +sys.path.append("..") +from op_test_xpu import XPUOpTest + +paddle.enable_static() + + +class TestRangeOp(XPUOpTest): + def setUp(self): + self.op_type = "range" + self.init_config() + self.inputs = { + 'Start': np.array([self.case[0]]).astype(self.dtype), + 'End': np.array([self.case[1]]).astype(self.dtype), + 'Step': np.array([self.case[2]]).astype(self.dtype) + } + + self.outputs = { + 'Out': np.arange(self.case[0], self.case[1], + self.case[2]).astype(self.dtype) + } + + def init_config(self): + self.dtype = np.float32 + self.case = (0, 1, 0.2) + + def test_check_output(self): + place = paddle.XPUPlace(0) + self.check_output_with_place(place, check_dygraph=False) + + +class TestFloatRangeOpCase0(TestRangeOp): + def init_config(self): + self.dtype = np.float32 + self.case = (0, 5, 1) + + +class TestInt32RangeOpCase0(TestRangeOp): + def init_config(self): + self.dtype = np.int32 + self.case = (0, 5, 2) + + +class TestInt32RangeOpCase1(TestRangeOp): + def init_config(self): + self.dtype = np.int32 + self.case = (10, 1, -2) + + +class TestInt32RangeOpCase2(TestRangeOp): + def init_config(self): + self.dtype = np.int32 + self.case = (-1, -10, -2) + + +if __name__ == "__main__": + unittest.main()