Unverified commit 25ffe9c2, authored by zhangyikun02, committed by GitHub

add warpctc kernel and change cast_v2 to cast for xpu, test=kunlun (#48134)

Parent b07e6b45
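Note: the cast_v2-to-cast rename below is mechanical. As a minimal, self-contained sketch of the pattern — the stand-in below only assumes xpu::cast keeps cast_v2's (context, src, dst, length) signature; the real implementation lives in the XDNN SDK:

// Hypothetical stand-in for the XDNN API; assumes xpu::cast keeps the
// (context, src, dst, length) signature that cast_v2 had.
#include <cstdint>
#include <cstdio>

namespace xpu {
template <typename TX, typename TY>
int cast(void* /*ctx*/, const TX* x, TY* y, int64_t len) {
  for (int64_t i = 0; i < len; ++i) y[i] = static_cast<TY>(x[i]);
  return 0;  // 0 stands in for XPU_SUCCESS
}
}  // namespace xpu

int main() {
  int64_t src[3] = {1, 2, 3};
  int32_t dst[3] = {0, 0, 0};
  int r = xpu::cast<int64_t, int32_t>(nullptr, src, dst, 3);
  std::printf("r=%d, dst = {%d, %d, %d}\n", r, dst[0], dst[1], dst[2]);
  return r;
}

Every call site in the hunks below keeps its template arguments and argument order; only the function name and the string passed to PADDLE_ENFORCE_XDNN_SUCCESS change.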
@@ -10,7 +10,7 @@ set(XPU_RT_LIB_NAME "libxpurt.so")
 if(NOT DEFINED XPU_BASE_URL)
   set(XPU_BASE_URL_WITHOUT_DATE
       "https://baidu-kunlun-product.su.bcebos.com/KL-SDK/klsdk-dev")
-  set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20221116")
+  set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20221120")
 else()
   set(XPU_BASE_URL "${XPU_BASE_URL}")
 endif()
......
@@ -40,12 +40,12 @@ static void XPUCastData(const phi::DenseTensor& in,
                         const platform::XPUDeviceContext* dev_ctx) {
   using XPUInTDType = typename XPUTypeTrait<InType>::Type;
   using XPUOutTDType = typename XPUTypeTrait<OutType>::Type;
-  int r = xpu::cast_v2<XPUInTDType, XPUOutTDType>(
+  int r = xpu::cast<XPUInTDType, XPUOutTDType>(
       dev_ctx->x_context(),
       reinterpret_cast<const XPUInTDType*>(in.data<InType>()),
       reinterpret_cast<XPUOutTDType*>(out->mutable_data<OutType>(in.place())),
       in.numel());
-  PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
   dev_ctx->Wait();
 }
......
@@ -161,13 +161,10 @@ TEST(test_add_functor, add_functor) {
                          static_cast<platform::float16>(1.0),
                          static_cast<platform::float16>(2.0));
   EXPECT_EQ(cpu_res, 0);
-  // double
-#ifndef PADDLE_WITH_XPU
-  // does not support double when compiled using xpu
   cpu_res = TensorddTest(
       cpu_place, cpu_place, static_cast<double>(1.0), static_cast<double>(2.0));
   EXPECT_EQ(cpu_res, 0);
-#endif

 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   int gpu_res = 1;
@@ -217,6 +214,9 @@ TEST(test_add_functor, add_functor) {
                          static_cast<platform::float16>(1.0),
                          static_cast<platform::float16>(2.0));
   EXPECT_EQ(xpu_res, 0);
+  xpu_res = TensorddTest(
+      xpu_place, xpu_place, static_cast<double>(1.0), static_cast<double>(2.0));
+  EXPECT_EQ(xpu_res, 0);
   // different places
   xpu_res = TensorddTest(
       cpu_place, xpu_place, static_cast<float>(1.0), static_cast<float>(2.0));
@@ -234,6 +234,12 @@ TEST(test_add_functor, add_functor) {
                          static_cast<platform::float16>(1.0),
                          static_cast<platform::float16>(2.0));
   EXPECT_EQ(xpu_res, 0);
+  xpu_res = TensorddTest(
+      cpu_place, xpu_place, static_cast<double>(1.0), static_cast<double>(2.0));
+  EXPECT_EQ(xpu_res, 0);
+  xpu_res = TensorddTest(
+      xpu_place, cpu_place, static_cast<double>(1.0), static_cast<double>(2.0));
+  EXPECT_EQ(xpu_res, 0);
 #endif
 }
......
@@ -50,13 +50,13 @@ class AccuracyXPUKernel : public framework::OpKernel<T> {
     int* label_int32_ptr = RAII_GUARD.alloc_l3_or_gm<int>(size);
     PADDLE_ENFORCE_XDNN_NOT_NULL(label_int32_ptr);
-    int r = xpu::cast_v2<int64_t, int32_t>(
+    int r = xpu::cast<int64_t, int32_t>(
         dev_ctx.x_context(), indices_data, indices_int32_ptr, size);
-    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
-    r = xpu::cast_v2<int64_t, int32_t>(
+    r = xpu::cast<int64_t, int32_t>(
         dev_ctx.x_context(), label_data, label_int32_ptr, size);
-    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
     r = xpu::accuracy(dev_ctx.x_context(),
                       indices_int32_ptr,
......
@@ -79,11 +79,11 @@ class TopkXPUKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_XDNN_SUCCESS(r, "sorted_topk");

     // cast to int64 as final result
-    r = xpu::cast_v2<int32_t, int64_t>(dev_ctx.x_context(),
+    r = xpu::cast<int32_t, int64_t>(dev_ctx.x_context(),
                                        (const int32_t*)indices_int_data,
                                        indices_data,
                                        indices->numel());
-    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
   }
 };
......
@@ -681,6 +681,9 @@ XPUOpMap& get_kl2_ops() {
                      pOpKernelType(vartype::INT8, XPUPlace()),
                      pOpKernelType(vartype::UINT8, XPUPlace()),
                      pOpKernelType(vartype::FP32, XPUPlace())})},
+    {"warpctc_grad",
+     XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+    {"warpctc", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
     {"where_index",
      XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()),
                    pOpKernelType(vartype::BOOL, XPUPlace()),
......
@@ -233,11 +233,11 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
       dev_ctx.template Alloc<MPDType>(&float_out,
                                       out->numel() * sizeof(MPDType));
-      int r = xpu::cast_v2(dev_ctx.x_context(),
+      int r = xpu::cast(dev_ctx.x_context(),
                            reinterpret_cast<const float16*>(x->data<T>()),
                            float_x.data<MPDType>(),
                            x->numel());
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");

       r = xpu::scale(dev_ctx.x_context(),
                      float_x.data<MPDType>(),
@@ -248,11 +248,11 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
                      0.0);
       PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale");

-      r = xpu::cast_v2(dev_ctx.x_context(),
+      r = xpu::cast(dev_ctx.x_context(),
                        float_out.data<MPDType>(),
                        reinterpret_cast<float16*>(out->data<T>()),
                        out->numel());
-      PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
     } else {
       int r = xpu::scale(dev_ctx.x_context(),
                          reinterpret_cast<const XPUType*>(x->data<T>()),
......
@@ -39,14 +39,14 @@ void CastKernel(const Context& dev_ctx,
   int r = -1;
   switch (out_dtype) {
     case phi::DataType::FLOAT32:
-      r = xpu::cast_v2<XPUInTDType, float>(
+      r = xpu::cast<XPUInTDType, float>(
           dev_ctx.x_context(),
           reinterpret_cast<const XPUInTDType*>(in_data),
           dev_ctx.template Alloc<float>(out),
           numel);
       break;
     case phi::DataType::FLOAT16:
-      r = xpu::cast_v2<XPUInTDType, float16>(
+      r = xpu::cast<XPUInTDType, float16>(
           dev_ctx.x_context(),
           reinterpret_cast<const XPUInTDType*>(in_data),
           reinterpret_cast<float16*>(
@@ -54,35 +54,35 @@ void CastKernel(const Context& dev_ctx,
           numel);
       break;
     case phi::DataType::INT64:
-      r = xpu::cast_v2<XPUInTDType, int64_t>(
+      r = xpu::cast<XPUInTDType, int64_t>(
          dev_ctx.x_context(),
          reinterpret_cast<const XPUInTDType*>(in_data),
          dev_ctx.template Alloc<int64_t>(out),
          numel);
       break;
     case phi::DataType::INT32:
-      r = xpu::cast_v2<XPUInTDType, int32_t>(
+      r = xpu::cast<XPUInTDType, int32_t>(
          dev_ctx.x_context(),
          reinterpret_cast<const XPUInTDType*>(in_data),
          dev_ctx.template Alloc<int>(out),
          numel);
       break;
     case phi::DataType::BOOL:
-      r = xpu::cast_v2<XPUInTDType, bool>(
+      r = xpu::cast<XPUInTDType, bool>(
          dev_ctx.x_context(),
          reinterpret_cast<const XPUInTDType*>(in_data),
          dev_ctx.template Alloc<bool>(out),
          numel);
       break;
     case phi::DataType::UINT8:
-      r = xpu::cast_v2<XPUInTDType, uint8_t>(
+      r = xpu::cast<XPUInTDType, uint8_t>(
          dev_ctx.x_context(),
          reinterpret_cast<const XPUInTDType*>(in_data),
          dev_ctx.template Alloc<uint8_t>(out),
          numel);
       break;
     case phi::DataType::FLOAT64:
-      r = xpu::cast_v2<XPUInTDType, double>(
+      r = xpu::cast<XPUInTDType, double>(
          dev_ctx.x_context(),
          reinterpret_cast<const XPUInTDType*>(in_data),
          dev_ctx.template Alloc<double>(out),
@@ -93,7 +93,7 @@ void CastKernel(const Context& dev_ctx,
           "Not supported cast %d -> %d", x.dtype(), out_dtype));
   }
-  PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
 }
 }  // namespace phi
......
@@ -59,11 +59,11 @@ void CrossEntropyWithSoftmaxGradKernel(const Context& dev_ctx,
         RAII_GUARD.alloc_l3_or_gm<int32_t>(labels.numel());
     PADDLE_ENFORCE_XDNN_NOT_NULL(labels_int_ptr_l3);
-    r = xpu::cast_v2<int64_t, int32_t>(dev_ctx.x_context(),
+    r = xpu::cast<int64_t, int32_t>(dev_ctx.x_context(),
                                     labels.data<int64_t>(),
                                     labels_int_ptr_l3,
                                     labels.numel());
-    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");

     r = xpu::hard_softmax_with_cross_entropy_grad<XPUType, int>(
         dev_ctx.x_context(),
@@ -117,11 +117,11 @@ void CrossEntropyWithSoftmaxGradKernel(const Context& dev_ctx,
         RAII_GUARD.alloc_l3_or_gm<int32_t>(labels.numel());
     PADDLE_ENFORCE_XDNN_NOT_NULL(labels_int_ptr_l3);
-    r = xpu::cast_v2<int64_t, int32_t>(dev_ctx.x_context(),
+    r = xpu::cast<int64_t, int32_t>(dev_ctx.x_context(),
                                     labels.data<int64_t>(),
                                     labels_int_ptr_l3,
                                     labels.numel());
-    PADDLE_ENFORCE_XDNN_SUCCESS(r, "clip_v2");
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
     r = xpu::hard_softmax_with_cross_entropy_grad<XPUType, int>(
         dev_ctx.x_context(),
         reinterpret_cast<const XPUType*>(loss_grad.data<T>()),
......
@@ -132,11 +132,11 @@ void CrossEntropyWithSoftmaxKernel(const Context& dev_ctx,
     int* labels_int_ptr_l3 = RAII_GUARD.alloc_l3_or_gm<int32_t>(labels.numel());
     PADDLE_ENFORCE_XDNN_NOT_NULL(labels_int_ptr_l3);
-    r = xpu::cast_v2<int64_t, int32_t>(dev_ctx.x_context(),
+    r = xpu::cast<int64_t, int32_t>(dev_ctx.x_context(),
                                     labels.data<int64_t>(),
                                     labels_int_ptr_l3,
                                     labels.numel());
-    PADDLE_ENFORCE_XDNN_SUCCESS(r, "clip_v2");
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
     r = xpu::hard_cross_entropy<XPUType, int32_t>(
         dev_ctx.x_context(),
......
@@ -72,16 +72,11 @@ void GatherGradKernel(const Context& dev_ctx,
   } else {
     xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
     int* index_int_ptr_l3 = RAII_GUARD.alloc_l3_or_gm<int32_t>(index.numel());
-    r = xpu::cast_v2<int64_t, int32_t>(dev_ctx.x_context(),
+    r = xpu::cast<int64_t, int32_t>(dev_ctx.x_context(),
                                     index.data<int64_t>(),
                                     index_int_ptr_l3,
                                     index.numel());
-    PADDLE_ENFORCE_EQ(r,
-                      XPU_SUCCESS,
-                      phi::errors::External("XPU API(cast_v2) return wrong "
-                                            "value[%d %s]",
-                                            r,
-                                            XPUAPIErrorMsg[r]));
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");

     r = xpu::gather_grad<XPUType, int>(
         dev_ctx.x_context(),
@@ -93,12 +88,7 @@ void GatherGradKernel(const Context& dev_ctx,
         axis_v,
         overwrite);
   }
-  PADDLE_ENFORCE_EQ(
-      r,
-      xpu::Error_t::SUCCESS,
-      phi::errors::External("XPU gather grad kernel return wrong value[%d %s]",
-                            r,
-                            XPUAPIErrorMsg[r]));
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "gather_grad");
 }
 }  // namespace phi
......
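The gather_grad hunk above also folds two hand-rolled PADDLE_ENFORCE_EQ checks into PADDLE_ENFORCE_XDNN_SUCCESS. As a conceptual sketch of the check such a macro performs — a hypothetical helper, not the actual Paddle macro expansion:

#include <sstream>
#include <stdexcept>
#include <string>

constexpr int kXpuSuccess = 0;  // stand-in for xpu::Error_t::SUCCESS

// Hypothetical helper mirroring the macro's behavior: compare the XDNN
// return code against success and report the failing API by name.
void EnforceXdnnSuccess(int r, const std::string& api) {
  if (r != kXpuSuccess) {
    std::ostringstream msg;
    msg << "XPU API(" << api << ") return wrong value[" << r << "]";
    throw std::runtime_error(msg.str());
  }
}

int main() {
  EnforceXdnnSuccess(0, "cast");            // passes silently
  // EnforceXdnnSuccess(-1, "gather_grad"); // would throw
  return 0;
}

Centralizing the check keeps every kernel's error path identical and makes the API name, rather than a free-form message, the thing each call site supplies.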
@@ -54,12 +54,11 @@ void SGDDenseKernel(const Context &dev_ctx,
   const float *lr = nullptr;
   if (std::is_same<T, dtype::float16>::value) {
     float *lr_float = RAII_GUARD.alloc_l3_or_gm<float>(learning_rate.numel());
-    int r =
-        xpu::cast_v2<XPUType, float>(dev_ctx.x_context(),
+    int r = xpu::cast<XPUType, float>(dev_ctx.x_context(),
                                      reinterpret_cast<const XPUType *>(lr_t),
                                      lr_float,
                                      learning_rate.numel());
-    PADDLE_ENFORCE_XDNN_SUCCESS(r, "clip_v2");
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
     lr = lr_float;
   } else {
     lr = reinterpret_cast<const float *>(lr_t);
......
@@ -68,11 +68,11 @@ void TopkKernel(const Context& dev_ctx,
         k);
     PADDLE_ENFORCE_XDNN_SUCCESS(r, "sorted_topk");

-    r = xpu::cast_v2<int32_t, int64_t>(dev_ctx.x_context(),
+    r = xpu::cast<int32_t, int64_t>(dev_ctx.x_context(),
                                     (const int32_t*)indices_int_data,
                                     indices_data,
                                     indices->numel());
-    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
   } else {
     // do transpose if axis is not the last dim of input
     std::vector<int> trans_axes;
@@ -127,11 +127,11 @@ void TopkKernel(const Context& dev_ctx,
         k);
     PADDLE_ENFORCE_XDNN_SUCCESS(r, "sorted_topk");

-    r = xpu::cast_v2<int32_t, int64_t>(dev_ctx.x_context(),
+    r = xpu::cast<int32_t, int64_t>(dev_ctx.x_context(),
                                     (const int32_t*)trans_idx_int32_data,
                                     trans_idx_data,
                                     indices->numel());
-    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2");
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
     // Transpose back to original dims
     std::vector<int> trans_back_axes;
     for (int i = 0; i < axis; i++) {
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/warpctc_grad_kernel.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void WarpctcGradKernel(const Context& dev_ctx,
const DenseTensor& logits,
const paddle::optional<DenseTensor>& logits_length,
const DenseTensor& warpctcgrad,
const DenseTensor& loss_grad,
int blank,
bool norm_by_times,
DenseTensor* logits_grad) {
dev_ctx.template Alloc<T>(logits_grad);
bool has_logits_length = logits_length.is_initialized();
if (!has_logits_length) {
    PADDLE_THROW(phi::errors::External(
        "The XPU warpctc_grad kernel only supports the case where "
        "logits_length is initialized."));
}
int max_seq_length = warpctcgrad.dims()[0]; // Tmax
int num_sequences = warpctcgrad.dims()[1]; // B
int seq_width = warpctcgrad.dims()[2]; // D
auto* logits_length_ptr = logits_length.get_ptr();
int r = xpu::ctc_loss_grad<T, int64_t>(dev_ctx.x_context(),
loss_grad.data<T>(),
logits_grad->data<T>(),
warpctcgrad.data<T>(),
max_seq_length,
num_sequences,
seq_width,
logits_length_ptr->data<int64_t>(),
norm_by_times);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "ctc_loss_grad");
}
} // namespace phi
PD_REGISTER_KERNEL(
warpctc_grad, XPU, ALL_LAYOUT, phi::WarpctcGradKernel, float) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/warpctc_kernel.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void WarpctcKernel(const Context& dev_ctx,
const DenseTensor& logits,
const DenseTensor& label,
const paddle::optional<DenseTensor>& logits_length,
const paddle::optional<DenseTensor>& labels_length,
int blank,
bool norm_by_times,
DenseTensor* loss,
DenseTensor* warpctcgrad) {
bool has_logits_length = logits_length.is_initialized();
if (!has_logits_length) {
    PADDLE_THROW(phi::errors::External(
        "The XPU warpctc kernel only supports the case where "
        "logits_length is initialized."));
}
bool has_labels_length = labels_length.is_initialized();
if (!has_labels_length) {
    PADDLE_THROW(phi::errors::External(
        "The XPU warpctc kernel only supports the case where "
        "labels_length is initialized."));
}
int max_sequence_length = logits.dims()[0];
int num_sequences = logits.dims()[1];
int sequence_width = logits.dims()[2];
int max_target_seq_length = label.dims()[1];
PADDLE_ENFORCE_GT(max_sequence_length,
0,
phi::errors::InvalidArgument(
"The first dimension of Input(Logits) should be "
"greater than zero "
"but received %d. ",
max_sequence_length));
PADDLE_ENFORCE_GT(num_sequences,
0,
phi::errors::InvalidArgument(
"The second dimension of Input(Logits) should be "
"greater than zero "
"but received %d. ",
num_sequences));
PADDLE_ENFORCE_GT(sequence_width,
0,
phi::errors::InvalidArgument(
"The third dimension of Input(Logits) should be "
"greater than zero "
"but received %d. ",
sequence_width));
loss->Resize(phi::make_ddim({num_sequences, 1}));
dev_ctx.template Alloc<T>(loss);
warpctcgrad->Resize(
phi::make_ddim({max_sequence_length, num_sequences, sequence_width}));
dev_ctx.template Alloc<T>(warpctcgrad);
const T* logits_data = logits.data<T>();
const int* label_data = label.data<int>();
auto logits_length_data = logits_length.get_ptr()->data<int64_t>();
auto labels_length_data = labels_length.get_ptr()->data<int64_t>();
T* loss_data = loss->data<T>();
T* warpctcgrad_data = warpctcgrad->data<T>();
int r = xpu::ctc_loss<T, int64_t>(dev_ctx.x_context(),
logits_data,
label_data,
loss_data,
warpctcgrad_data,
logits_length_data,
labels_length_data,
max_sequence_length,
num_sequences,
sequence_width,
max_target_seq_length,
blank);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "ctc_loss");
}
} // namespace phi
PD_REGISTER_KERNEL(warpctc, XPU, ALL_LAYOUT, phi::WarpctcKernel, float) {}
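Before the test file, a small sketch of the shape contract the two new kernels above establish (illustrative values only): logits arrive as [Tmax, B, D], the loss is resized to [B, 1], and warpctcgrad matches the logits shape.

#include <array>
#include <cstdio>

int main() {
  // Illustrative values; in the kernel these come from logits.dims().
  const int Tmax = 4, B = 2, D = 8;  // max_sequence_length, num_sequences, sequence_width
  std::array<int, 3> logits_dims = {Tmax, B, D};
  std::array<int, 2> loss_dims = {B, 1};       // loss->Resize({num_sequences, 1})
  std::array<int, 3> grad_dims = logits_dims;  // warpctcgrad matches logits
  std::printf("logits [%d,%d,%d] -> loss [%d,%d], grad [%d,%d,%d]\n",
              logits_dims[0], logits_dims[1], logits_dims[2],
              loss_dims[0], loss_dims[1],
              grad_dims[0], grad_dims[1], grad_dims[2]);
  return 0;
}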
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
sys.path.append("..")
import unittest
import numpy as np
from test_softmax_op import stable_softmax
import paddle.fluid as fluid
from paddle.fluid import Program, program_guard
import paddle
import paddle.nn.functional as F
from op_test_xpu import XPUOpTest
from xpu.get_test_cover_info import (
create_test_class,
get_xpu_op_support_types,
XPUOpTestWrapper,
)
paddle.enable_static()
CUDA_BLOCK_SIZE = 32
class CTCForward(object):
def __init__(
self,
softmax,
softmax_lod,
labels,
labels_lod,
num_classes,
batch_size,
blank,
norm_by_times,
):
self.softmax = softmax
self.softmax_lod = softmax_lod
self.labels = labels
self.labels_lod = labels_lod
self.blank = blank
self.norm_by_times = norm_by_times
self.level = 0
self.num_classes = num_classes
self.batch_size = batch_size
self.loss = np.zeros([self.batch_size, 1], dtype=softmax.dtype)
self.gradient = np.zeros(self.softmax.shape, dtype=softmax.dtype)
# float64
self.EXP_MAX = sys.float_info.max
self.EXP_MIN = sys.float_info.min
self.LOG_ZERO = np.log(self.EXP_MIN)
self.LOG_INFINITY = np.log(self.EXP_MAX)
def safe_exp(self, x):
if x <= self.LOG_ZERO:
return 0.0
if x >= self.LOG_INFINITY:
return self.EXP_MAX
return np.exp(x)
def safe_log(self, x):
if x <= self.EXP_MIN:
return self.LOG_ZERO
return np.log(x)
# x = lna and y = lnb are in log scale, ln(a / b) = lna - lnb
def log_div(self, x, y):
res = x - y
if res <= self.LOG_ZERO:
return self.LOG_ZERO
if res >= self.LOG_INFINITY:
return self.LOG_INFINITY
return res
# x = lna and y = lnb are in log scale, ln(a * b) = lna + lnb
def log_mul(self, x, y):
res = x + y
if res <= self.LOG_ZERO:
return self.LOG_ZERO
if res >= self.LOG_INFINITY:
return self.LOG_INFINITY
return res
# x = lna and y = lnb are in log scale,
# ln(a + b) = lna + ln(1 + exp(lnb - lna)), where b > a
def log_add(self, x, y):
if x < y:
t = y
y = x
x = t
return x + self.safe_log(1 + self.safe_exp(y - x))
def segment_range(self, time, total_times, total_segments):
start = max(0, total_segments - (2 * (total_times - time)))
end = min(total_segments, 2 * (time + 1))
return start, end
def forward_a_sequence(self, softmax_a_sequence, labels_a_sequence):
total_times = softmax_a_sequence.shape[0]
total_segments = labels_a_sequence.shape[0] * 2 + 1
required_times = labels_a_sequence.shape[0]
old_label = -1
for i in range(labels_a_sequence.shape[0]):
            # two contiguous labels with the same value
if labels_a_sequence[i, 0] == old_label:
required_times = required_times + 1
old_label = labels_a_sequence[i, 0]
if total_times < required_times:
return 0
        # calculate the forward and backward variables,
        # reference Chapter 7.3 of "Alex Graves, Supervised Sequence
        # Labelling with Recurrent Neural Networks"
log_acts = np.zeros(
[total_times, self.num_classes], dtype=softmax_a_sequence.dtype
)
for i in range(total_times):
for j in range(self.num_classes):
log_acts[i, j] = self.safe_log(softmax_a_sequence[i, j])
# calculate the forward variables
forward_vars = np.zeros(
[total_times, total_segments], dtype=softmax_a_sequence.dtype
)
for i in range(total_times):
for j in range(total_segments):
forward_vars[i, j] = self.LOG_ZERO
for i in range(total_times):
# dp initialization at t0
if i == 0:
forward_vars[i, 0] = log_acts[0, self.blank]
if total_segments > 1:
forward_vars[i, 1] = log_acts[0, labels_a_sequence[i, 0]]
continue
# dp from t1
start, end = self.segment_range(i, total_times, total_segments)
for k in range(end - start):
j = k + start
if j & 1 == 1:
label_idx = j // 2
label_val = labels_a_sequence[label_idx, 0]
fv = self.log_add(
forward_vars[i - 1, j], forward_vars[i - 1, j - 1]
)
if (
j > 1
and label_val != labels_a_sequence[label_idx - 1, 0]
):
fv = self.log_add(fv, forward_vars[i - 1, j - 2])
fv = self.log_mul(fv, log_acts[i, label_val])
else:
fv = forward_vars[i - 1, j]
if j > 0:
fv = self.log_add(fv, forward_vars[i - 1, j - 1])
fv = self.log_mul(fv, log_acts[i, self.blank])
forward_vars[i, j] = fv
# sum the last two value as log_prob
log_prob = forward_vars[total_times - 1, total_segments - 1]
if total_segments > 1:
log_prob = self.log_add(
log_prob, forward_vars[total_times - 1, total_segments - 2]
)
return -log_prob
def forward(self):
softmax_offset = 0
labels_offset = 0
for i in range(self.batch_size):
if self.labels.shape[1] == 1:
softmax_start_i = softmax_offset
softmax_end_i = softmax_offset + self.softmax_lod[self.level][i]
labels_start_i = labels_offset
labels_end_i = labels_offset + self.labels_lod[self.level][i]
softmax_a_sequence = self.softmax[
softmax_start_i:softmax_end_i, :
]
labels_a_sequence = self.labels[labels_start_i:labels_end_i, :]
self.loss[i] = self.forward_a_sequence(
softmax_a_sequence, labels_a_sequence
)
softmax_offset += self.softmax_lod[self.level][i]
labels_offset += self.labels_lod[self.level][i]
else:
softmax_a_sequence = self.softmax[: self.softmax_lod[i], i, :]
labels_a_sequence = self.labels[: self.labels_lod[i], :]
self.loss[i] = self.forward_a_sequence(
softmax_a_sequence, labels_a_sequence
)
return self.loss
def python_api(
logits,
label,
logits_length=None,
labels_length=None,
blank=0,
norm_by_times=False,
):
return paddle.fluid.layers.warpctc(
logits, label, blank, norm_by_times, logits_length, labels_length
)
class XPUTestWarpCTCOp(XPUOpTestWrapper):
def __init__(self):
self.op_name = 'warpctc'
class TestWarpCTCOpWithPadding(XPUOpTest):
def config(self):
self.batch_size = 4
self.num_classes = 8
self.logits_lod = [[4, 1, 3, 3]]
self.labels_lod = [[3, 1, 4, 4]]
self.logits_length = np.array([4, 1, 3, 3], dtype=np.int64)
self.labels_length = np.array([3, 1, 4, 4], dtype=np.int64)
self.blank = self.num_classes - 1
self.norm_by_times = False
def setUp(self):
self.op_type = "warpctc"
self.dtype = self.in_type
self.place = paddle.XPUPlace(0)
self.python_api = python_api
self.python_out_sig = ["Loss"]
self.config()
logits = np.random.uniform(
0.1, 1.0, [sum(self.logits_length), self.num_classes]
).astype(self.dtype)
print("logits.shape = ", logits.shape)
softmax = np.apply_along_axis(stable_softmax, 1, logits)
# labels should not be blank
labels = np.random.randint(
0,
self.num_classes - 1,
[sum(self.labels_length), 1],
dtype="int32",
)
ctc = CTCForward(
softmax,
self.logits_lod,
labels,
self.labels_lod,
self.num_classes,
self.batch_size,
self.blank,
self.norm_by_times,
)
loss = ctc.forward()
max_sequence_length = 0
for i in range(self.batch_size):
max_sequence_length = max(
max_sequence_length, self.logits_length[i]
)
# reshape logits to T*N*S
new_logits = np.zeros(
[max_sequence_length, self.batch_size, self.num_classes],
dtype=logits.dtype,
)
cur = 0
for batch_id in range(self.batch_size):
for i in range(self.logits_length[batch_id]):
for j in range(self.num_classes):
new_logits[i, batch_id, j] = logits[cur + i, j]
cur = cur + self.logits_length[batch_id]
# reshape labels to N*S
max_target_seq_length = 0
for i in range(self.batch_size):
max_target_seq_length = max(
max_target_seq_length, self.labels_length[i]
)
new_labels = np.zeros(
[self.batch_size, max_target_seq_length], dtype="int32"
)
cur = 0
for batch_id in range(self.batch_size):
for i in range(self.labels_length[batch_id]):
new_labels[batch_id, i] = labels[cur + i]
cur = cur + self.labels_length[batch_id]
self.gradient = np.zeros(
[max_sequence_length, self.batch_size, self.num_classes],
dtype=logits.dtype,
)
self.inputs = {
"Logits": new_logits,
"Label": new_labels,
"LogitsLength": self.logits_length,
"LabelLength": self.labels_length,
}
self.outputs = {"Loss": loss}
self.attrs = {
"blank": self.blank,
"norm_by_times": self.norm_by_times,
}
def test_check_output(self):
self.check_output(check_eager=True)
def test_check_grad(self):
self.outputs['WarpCTCGrad'] = self.gradient
place = paddle.XPUPlace(0)
self.check_grad_with_place(
place,
["Logits"],
"Loss",
max_relative_error=0.007,
check_dygraph=False,
)
class TestWarpCTCOpWithPaddingCase1(TestWarpCTCOpWithPadding):
def config(self):
self.batch_size = 4
self.num_classes = CUDA_BLOCK_SIZE + 2
self.logits_lod = [[4, 1, 3, 3]]
self.labels_lod = [[3, 1, 4, 4]]
self.logits_length = np.array([4, 1, 3, 3], dtype=np.int64)
self.labels_length = np.array([3, 1, 4, 4], dtype=np.int64)
self.blank = self.num_classes - 1
self.norm_by_times = False
class TestWarpCTCOpError(unittest.TestCase):
def test_errors(self):
self.dtype = self.in_type
self.place = paddle.XPUPlace(0)
with program_guard(Program(), Program()):
logits = fluid.data(
name='logits', shape=[5, 16, 6], dtype=self.dtype
)
logits_length = fluid.data(
name='logits_length', shape=[None], dtype='int64'
)
label = fluid.data(name='label', shape=[16, 3], dtype='int32')
label_length = fluid.data(
name='labels_length', shape=[None], dtype='int64'
)
def test_logits_Variable():
logits_data = np.random.rand(5, 16, 6).astype(logits.dtype)
fluid.layers.warpctc(
input=logits_data,
label=label,
input_length=logits_length,
label_length=label_length,
)
self.assertRaises(TypeError, test_logits_Variable)
def test_label_Variable():
label_data = np.random.randint(0, 5, [5, 1]).astype("int32")
fluid.layers.warpctc(
input=logits,
label=label_data,
input_length=logits_length,
label_length=label_length,
)
self.assertRaises(TypeError, test_label_Variable)
def test_logits_len_Variable():
logits_length_data = np.array([5] * 16).astype("int64")
fluid.layers.warpctc(
input=logits,
label=label,
input_length=logits_length_data,
label_length=label_length,
)
self.assertRaises(TypeError, test_logits_len_Variable)
def test_label_len_Variable():
label_length_data = np.array([3] * 16).astype("int64")
fluid.layers.warpctc(
input=logits,
label=label,
input_length=logits_length,
label_length=label_length_data,
)
self.assertRaises(TypeError, test_label_len_Variable)
def test_dygraph_errors(self):
def test_dygraph_with_lod():
self.dtype = self.in_type
self.place = paddle.XPUPlace(0)
logits = np.random.uniform(0.1, 1.0, [20, 15]).astype(
self.dtype
)
# labels should not be blank
labels = np.random.randint(0, 15 - 1, [15, 1], dtype="int32")
softmax = paddle.to_tensor(logits)
labels = paddle.to_tensor(labels)
fluid.layers.warpctc(input=softmax, label=labels)
paddle.disable_static()
self.assertRaises(ValueError, test_dygraph_with_lod)
paddle.enable_static()
class TestCTCLossAPICase(unittest.TestCase):
    def test_functional_api(self):
self.dtype = self.in_type
self.place = paddle.XPUPlace(0)
self.batch_size = 4
self.num_classes = CUDA_BLOCK_SIZE + 2
self.logits_length = np.array([4, 1, 3, 3], dtype=np.int64)
self.labels_length = np.array([3, 1, 4, 4], dtype=np.int64)
self.blank = self.num_classes - 1
self.norm_by_times = False
logits = np.random.uniform(
0.1,
1.0,
[max(self.logits_length), self.batch_size, self.num_classes],
).astype(self.dtype)
softmax = np.apply_along_axis(stable_softmax, -1, logits)
# labels should not be blank
labels = np.random.randint(
0,
self.num_classes - 1,
[self.batch_size, max(self.labels_length)],
dtype="int32",
)
ctc = CTCForward(
softmax,
self.logits_length,
labels,
self.labels_length,
self.num_classes,
self.batch_size,
self.blank,
self.norm_by_times,
)
loss_np = ctc.forward()
paddle.disable_static()
softmax = paddle.to_tensor(logits)
labels = paddle.to_tensor(labels)
logits_length = paddle.to_tensor(self.logits_length)
labels_length = paddle.to_tensor(self.labels_length)
loss_pd_mean = F.ctc_loss(
softmax,
labels,
logits_length,
labels_length,
blank=self.blank,
reduction='mean',
)
loss_pd_mean = loss_pd_mean.numpy()
loss_pd_sum = F.ctc_loss(
softmax,
labels,
logits_length,
labels_length,
blank=self.blank,
reduction='sum',
)
loss_pd_sum = loss_pd_sum.numpy()
paddle.enable_static()
loss_np = np.squeeze(loss_np, axis=-1)
loss_np_mean = (loss_np / labels_length.numpy()).mean()
loss_np_sum = loss_np.sum()
np.testing.assert_allclose(
loss_pd_mean, loss_np_mean, rtol=1e-05, atol=1
)
np.testing.assert_allclose(
loss_pd_sum, loss_np_sum, rtol=1e-05, atol=1
)
def test_class_api(self):
self.dtype = self.in_type
self.place = paddle.XPUPlace(0)
self.batch_size = 3
self.num_classes = 15
self.logits_length = np.array([3, 3, 3], dtype=np.int64)
self.labels_length = np.array([0, 1, 2], dtype=np.int64)
self.blank = 0
self.norm_by_times = False
logits = np.random.uniform(
0.1,
1.0,
[max(self.logits_length), self.batch_size, self.num_classes],
).astype(self.dtype)
softmax = np.apply_along_axis(stable_softmax, -1, logits)
# labels should not be blank
labels = np.random.randint(
1,
self.num_classes,
[self.batch_size, max(self.labels_length)],
dtype="int32",
)
ctc = CTCForward(
softmax,
self.logits_length,
labels,
self.labels_length,
self.num_classes,
self.batch_size,
self.blank,
self.norm_by_times,
)
loss_np = ctc.forward()
paddle.disable_static()
softmax = paddle.to_tensor(logits)
labels = paddle.to_tensor(labels)
logits_length = paddle.to_tensor(self.logits_length)
labels_length = paddle.to_tensor(self.labels_length)
loss_pd = paddle.nn.CTCLoss(self.blank, 'none')(
softmax, labels, logits_length, labels_length
)
loss_pd = loss_pd.numpy()
paddle.enable_static()
loss_np = np.squeeze(loss_np, axis=-1)
np.testing.assert_allclose(loss_pd, loss_np, rtol=1e-05, atol=1)
support_types = get_xpu_op_support_types('warpctc')
for stype in support_types:
create_test_class(globals(), XPUTestWarpCTCOp, stype)
if __name__ == "__main__":
unittest.main()