Commit 137f0dfc authored by W wanghaoshuang

1. Fix warpctc grad tensor initialization bug.
2. Remove the num_seq argument.
3. Refine the CUDA kernel of ScaleLoDTensorFunctor.
4. Change max_relative_error of the gradient unit test to 0.007.
Parent fd24e195
@@ -22,10 +22,10 @@ template <typename T>
 class ScaleLoDTensorFunctor<platform::CPUDeviceContext, T> {
  public:
   void operator()(const platform::CPUDeviceContext& context,
-                  framework::LoDTensor& seq, const T* scales,
-                  const size_t num_seq) {
+                  framework::LoDTensor& seq, const T* scales) {
     const size_t level = 0;
     auto lod = seq.lod();
+    const size_t num_seq = lod[level].size() - 1;
     size_t seq_width = seq.dims()[1];
     framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
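The removed num_seq parameter is recoverable from the tensor's own LoD: at level 0 the offset array marks sequence boundaries, so the sequence count is the offset count minus one. A minimal sketch with toy values (plain C++, my own names, not PaddlePaddle code):

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  // Level-0 LoD offsets for three sequences of lengths 3, 1 and 4:
  // rows [0,3), [3,4) and [4,8) of the underlying tensor.
  std::vector<std::size_t> lod0 = {0, 3, 4, 8};
  std::size_t num_seq = lod0.size() - 1;  // == 3, no extra argument needed
  std::cout << "num_seq = " << num_seq << std::endl;
  return 0;
}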
@@ -20,18 +20,10 @@ namespace math {
 template <typename T>
 __global__ void SequenceScaleKernel(T* seq, size_t* lod, const T* scales,
-                                    const size_t num_seq,
                                     const size_t seq_width) {
-  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < lod[num_seq] * seq_width) {
-    size_t i = 0;
-    for (i = 0; i < num_seq; ++i) {
-      if (idx < lod[i + 1] * seq_width) {
-        break;
-      }
-    }
-    seq[idx] *= scales[i];
+  if (threadIdx.x < (lod[blockIdx.x + 1] - lod[blockIdx.x]) * seq_width) {
+    int idx = lod[blockIdx.x] * seq_width + threadIdx.x;
+    seq[idx] *= scales[blockIdx.x];
   }
 }
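The rewritten kernel assigns one thread block per sequence: blockIdx.x selects the sequence, threadIdx.x walks its elements, and the old per-thread linear search over lod is gone. Note that, as committed, a block's threads are the only coverage of its sequence, so the kernel implicitly assumes each sequence holds at most blockDim.x (1024 below) elements. A self-contained toy reproduction of the pattern (my own names and data, not the PaddlePaddle source):

#include <cstdio>
#include <cuda_runtime.h>

// One block per sequence; rows of sequence b occupy [lod[b], lod[b+1]).
__global__ void ScaleBySequence(float* seq, const size_t* lod,
                                const float* scales, size_t seq_width) {
  size_t len = (lod[blockIdx.x + 1] - lod[blockIdx.x]) * seq_width;
  if (threadIdx.x < len) {
    size_t idx = lod[blockIdx.x] * seq_width + threadIdx.x;
    seq[idx] *= scales[blockIdx.x];
  }
}

int main() {
  const size_t lod_h[3] = {0, 2, 5};            // two sequences: 2 and 3 rows
  const float scales_h[2] = {2.f, 10.f};
  float seq_h[5] = {1.f, 1.f, 1.f, 1.f, 1.f};   // seq_width == 1

  float *seq_d, *scales_d;
  size_t* lod_d;
  cudaMalloc(&seq_d, sizeof(seq_h));
  cudaMalloc(&scales_d, sizeof(scales_h));
  cudaMalloc(&lod_d, sizeof(lod_h));
  cudaMemcpy(seq_d, seq_h, sizeof(seq_h), cudaMemcpyHostToDevice);
  cudaMemcpy(scales_d, scales_h, sizeof(scales_h), cudaMemcpyHostToDevice);
  cudaMemcpy(lod_d, lod_h, sizeof(lod_h), cudaMemcpyHostToDevice);

  ScaleBySequence<<<2, 1024>>>(seq_d, lod_d, scales_d, 1);  // grid == num_seq
  cudaMemcpy(seq_h, seq_d, sizeof(seq_h), cudaMemcpyDeviceToHost);
  for (int i = 0; i < 5; ++i) printf("%g ", seq_h[i]);      // 2 2 10 10 10
  printf("\n");
  cudaFree(seq_d); cudaFree(scales_d); cudaFree(lod_d);
  return 0;
}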
@@ -39,18 +31,17 @@ template <typename T>
 class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
-                  framework::LoDTensor& seq, const T* scales,
-                  const size_t num_seq) {
-    auto lod = seq.lod();
-    const size_t seq_width = seq.dims()[1];
+                  framework::LoDTensor& seq, const T* scales) {
     const size_t level = 0;
+    auto lod = seq.lod();
+    const size_t num_seq = lod[level].size() - 1;
+    const size_t seq_width = seq.numel() / seq.dims()[0];
     framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
     T* seq_data = seq.mutable_data<T>(context.GetPlace());
     int threads = 1024;
-    int grid = (seq.numel() * seq_width + threads - 1) / threads;
-    SequenceScaleKernel<T><<<grid, threads, 0, context.stream()>>>(
-        seq_data, abs_offset_lod[level].data(), scales, num_seq, seq_width);
+    SequenceScaleKernel<T><<<num_seq, threads, 0, context.stream()>>>(
+        seq_data, abs_offset_lod[level].data(), scales, seq_width);
   }
 };
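Two things change in the launch: the grid dimension is now num_seq, so blockIdx.x indexes sequences directly, and the old grid computation disappears. That computation also looks oversized: seq.numel() already includes the width dimension, so (seq.numel() * seq_width + threads - 1) / threads launched roughly seq_width times more blocks than there were elements to cover, with the excess merely filtered out by the old kernel's bounds check.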
@@ -47,7 +47,7 @@ template <typename DeviceContext, typename T>
 class ScaleLoDTensorFunctor {
  public:
   void operator()(const DeviceContext& context, framework::LoDTensor& seq,
-                  const T* scales, const size_t num_seq);
+                  const T* scales);
 };

 }  // namespace math
@@ -179,6 +179,10 @@ class WarpCTCKernel : public framework::OpKernel<T> {
     T* warpctc_grad_data =
         warpctc_grad->mutable_data<T>(warpctc_logits.dims(), ctx.GetPlace());

+    math::SetConstant<DeviceContext, T>()(
+        ctx.template device_context<DeviceContext>(), warpctc_grad,
+        static_cast<T>(0));
+
     // warpctc accesses labels in CPU memory
     Tensor warpctc_label;
     Copy(*label, platform::CPUPlace(), ctx.device_context(), &warpctc_label);
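This SetConstant call is the "grad tensor initialization bug" fix from the commit message: mutable_data only allocates, it does not zero, so warpctc_grad otherwise starts with indeterminate contents. A minimal sketch of the hazard (hypothetical helper, not the Paddle API):

#include <cstddef>
#include <cstring>

// Like mutable_data: allocates storage but leaves it uninitialized.
float* AllocateGradBuffer(std::size_t n) { return new float[n]; }

int main() {
  const std::size_t n = 8;
  float* grad = AllocateGradBuffer(n);
  // Without this zero-fill (the SetConstant(0) step above), whatever
  // bytes the allocator returns would leak into the computed gradient.
  std::memset(grad, 0, n * sizeof(float));
  delete[] grad;
  return 0;
}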
@@ -215,10 +219,9 @@ class WarpCTCGradKernel : public framework::OpKernel<T> {
                                                  *warpctc_grad, norm_by_times);

     const T* loss_grad_data = loss_grad->data<T>();
-    const size_t num_seq = loss_grad->dims()[0];
     math::ScaleLoDTensorFunctor<DeviceContext, T>()(
         ctx.template device_context<DeviceContext>(), *logits_grad,
-        loss_grad_data, num_seq);
+        loss_grad_data);
   }
 };
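The scaling completes the chain rule: warpctc returns d(loss_i)/d(logits) per sequence, while loss_grad holds d(total_loss)/d(loss_i) with one entry per sequence (hence the old num_seq = loss_grad->dims()[0]). Multiplying every timestep of sequence i by loss_grad_data[i] yields d(total_loss)/d(logits); the sequence count is now recovered inside the functor instead of being passed in.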
@@ -193,7 +193,7 @@ class TestWarpCTCOp(OpTest):
     def test_check_grad(self):
         self.outputs['WarpCTCGrad'] = self.gradient
-        self.check_grad(["Logits"], "Loss", max_relative_error=0.01)
+        self.check_grad(["Logits"], "Loss", max_relative_error=0.007)

 if __name__ == "__main__":
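Tightening max_relative_error from 0.01 to 0.007 presumably only became safe with the zero-initialization fix above: with garbage no longer leaking into WarpCTCGrad, the analytic gradient tracks the numeric one closely enough for the stricter bound.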