From ec9bc1bd9f2381e95a7933e2157974c2a38d89f7 Mon Sep 17 00:00:00 2001
From: Tao Luo
Date: Wed, 11 Sep 2019 11:46:43 +0800
Subject: [PATCH] paddle::framework::vectorize() templatization (#19730)

remove unused accuracy-diff warpctc-cudnn implementation

test=develop
---
 paddle/fluid/API.spec                         |   2 +-
 paddle/fluid/framework/ddim.cc                |   9 -
 paddle/fluid/framework/ddim.h                 |   1 -
 paddle/fluid/operators/warpctc_cudnn_op.cu.cc | 197 ------------------
 paddle/fluid/operators/warpctc_op.cc          |  18 --
 python/paddle/fluid/layers/nn.py              |   3 -
 .../fluid/tests/unittests/test_warpctc_op.py  |  23 +-
 7 files changed, 2 insertions(+), 251 deletions(-)
 delete mode 100644 paddle/fluid/operators/warpctc_cudnn_op.cu.cc

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 4f50eeb795..50953fe00a 100755
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -159,7 +159,7 @@ paddle.fluid.layers.edit_distance (ArgSpec(args=['input', 'label', 'normalized',
 paddle.fluid.layers.l2_normalize (ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)), ('document', 'c1df110ea65998984f564c5c10abc54a'))
 paddle.fluid.layers.matmul (ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None)), ('document', '3720b4a386585094435993deb028b592'))
 paddle.fluid.layers.topk (ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e50940f3ce5a08cc477b72f517491bf3'))
-paddle.fluid.layers.warpctc (ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'use_cudnn', 'input_length', 'label_length'], varargs=None, keywords=None, defaults=(0, False, False, None, None)), ('document', 'ba27f25141adf24706536d179fabdf17'))
+paddle.fluid.layers.warpctc (ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'input_length', 'label_length'], varargs=None, keywords=None, defaults=(0, False, None, None)), ('document', 'a5be881ada816e47ea7a6ee4396da357'))
 paddle.fluid.layers.sequence_reshape (ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None), ('document', 'f568714a876425004aca4ea2d4a27701'))
 paddle.fluid.layers.transpose (ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '8e72db173d4c082e27cb11f31d8c9bfa'))
 paddle.fluid.layers.im2sequence (ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)), ('document', '33134416fc27dd65a767e5f15116ee16'))
diff --git a/paddle/fluid/framework/ddim.cc b/paddle/fluid/framework/ddim.cc
index 39f9ef04aa..b3aaa01d53 100644
--- a/paddle/fluid/framework/ddim.cc
+++ b/paddle/fluid/framework/ddim.cc
@@ -48,15 +48,6 @@ bool DDim::operator==(const DDim& d) const {
 
 bool DDim::operator!=(const DDim& d) const { return !(*this == d); }
 
-// NOTE: framework::vectorize converts to type int64_t
-// which does not fit cudnn inputs.
-std::vector<int> vectorize2int(const DDim& ddim) {
-  std::vector<int> result(DDim::kMaxRank);
-  dynamic_dim_assign(ddim.Get(), result.data(), ddim.size());
-  result.resize(ddim.size());
-  return result;
-}
-
 struct ProductVisitor {
   template <int D>
   inline int64_t operator()(const Dim<D>& dim) {
diff --git a/paddle/fluid/framework/ddim.h b/paddle/fluid/framework/ddim.h
index 62a9ad36db..14824afbea 100644
--- a/paddle/fluid/framework/ddim.h
+++ b/paddle/fluid/framework/ddim.h
@@ -177,7 +177,6 @@ std::vector<T> vectorize(const DDim& ddim) {
   result.resize(ddim.size());
   return result;
 }
-std::vector<int> vectorize2int(const DDim& ddim);
 
 int64_t product(const DDim& ddim);
 
diff --git a/paddle/fluid/operators/warpctc_cudnn_op.cu.cc b/paddle/fluid/operators/warpctc_cudnn_op.cu.cc
deleted file mode 100644
index 2a744f66f1..0000000000
--- a/paddle/fluid/operators/warpctc_cudnn_op.cu.cc
+++ /dev/null
@@ -1,197 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/mixed_vector.h"
-#include "paddle/fluid/operators/math/softmax.h"
-#include "paddle/fluid/operators/warpctc_op.h"
-#include "paddle/fluid/platform/cudnn_helper.h"
-
-namespace paddle {
-namespace operators {
-
-#if CUDNN_VERSION >= 7001
-using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
-using ScopedCTCLossDescriptor = platform::ScopedCTCLossDescriptor;
-using DataLayout = platform::DataLayout;
-
-template <typename DeviceContext, typename T>
-class CudnnCTCKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    // =====================Copied code from warpctc===========================
-    auto* logits = ctx.Input<LoDTensor>("Logits");
-    auto* label = ctx.Input<LoDTensor>("Label");
-    auto* warpctc_grad = ctx.Output<LoDTensor>("WarpCTCGrad");
-    auto* loss = ctx.Output<LoDTensor>("Loss");
-
-    const size_t level = 0;
-
-    auto logits_lod = framework::ToAbsOffset(logits->lod());
-    auto logits_dims = logits->dims();
-    PADDLE_ENFORCE_EQ(logits_dims[0],
-                      static_cast<int64_t>(logits_lod[level].back()),
-                      "The first dimension of Input(Logits) should be equal to "
-                      "the sum of all sequences' lengths.");
-
-    auto label_lod = framework::ToAbsOffset(label->lod());
-    auto label_dims = label->dims();
-    PADDLE_ENFORCE_EQ(
-        label_dims[0], label->numel(),
-        "The width of each timestep in Input(Label) should be 1.");
-
-    const size_t num_sequences = logits_lod[level].size() - 1;
-    PADDLE_ENFORCE_EQ(num_sequences, label_lod[level].size() - 1,
-                      "The number of sequences of Input(Logits) should be "
-                      "equal to that of Input(Label).");
-    PADDLE_ENFORCE_LE(num_sequences, 256,
-                      "The labelLengths must less than 256 for cudnn call.");
-
-    const size_t sequence_width = logits->numel() / logits_dims[0];
-    auto loss_dims =
-        framework::make_ddim({static_cast<int64_t>(num_sequences), 1});
-
-    // NOTE: cudnn takes softmax input, calculate softmax first, then do padding
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    LoDTensor softmax_logits;
-    softmax_logits.mutable_data<T>(logits->dims(), ctx.GetPlace());
-    softmax_logits.set_lod(logits_lod);
-    int rank = logits->dims().size();
-    int axis_dim = logits->dims()[rank - 1];
-    Tensor in_2d = framework::ReshapeToMatrix(*logits, rank - 1);
-    Tensor out_2d = framework::ReshapeToMatrix(softmax_logits, rank - 1);
-    math::SoftmaxFunctor<DeviceContext, T, false>()(dev_ctx, axis_dim, &in_2d,
-                                                    &out_2d);
-
-    // ctc needs sequences data stored in transposed padding format
-    // logits and grad using padding data of layout 'TNC'
-    // T: max_sequence_length
-    // N: batch_size (num_sequences)
-    // C: width
-    LoDTensor warpctc_logits;
-    const size_t max_sequence_length =
-        math::MaximumSequenceLength(logits_lod[level]);
-    auto warpctc_logits_dims =
-        framework::make_ddim({static_cast<int64_t>(max_sequence_length),
-                              static_cast<int64_t>(num_sequences),
-                              static_cast<int64_t>(sequence_width)});
-    warpctc_logits.mutable_data<T>(warpctc_logits_dims, ctx.GetPlace());
-
-    LoDTensor cpu_pad_value;
-    T* pad_value_data =
-        cpu_pad_value.mutable_data<T>({1}, platform::CPUPlace());
-    *pad_value_data = static_cast<T>(0);
-    LoDTensor pad_value;
-    if (platform::is_cpu_place(ctx.GetPlace())) {
-      pad_value = cpu_pad_value;
-    } else {
-      TensorCopySync(cpu_pad_value, ctx.GetPlace(), &pad_value);
-    }
-
-    math::PaddingLoDTensorFunctor<DeviceContext, T>()(
-        ctx.template device_context<DeviceContext>(), softmax_logits,
-        &warpctc_logits, pad_value, -1, 0, false /* norm_by_times */,
-        math::kLengthBatchWidth);
-    const T* warpctc_logits_data = warpctc_logits.data<T>();
-
-    std::vector<int> warpctc_label_lengths(num_sequences);
-    std::vector<int> warpctc_logits_lengths(num_sequences);
-
-    for (size_t i = 0; i < num_sequences; ++i) {
-      warpctc_label_lengths[i] = label_lod[level][i + 1] - label_lod[level][i];
-      warpctc_logits_lengths[i] =
-          logits_lod[level][i + 1] - logits_lod[level][i];
-    }
-
-    T* warpctc_grad_data =
-        warpctc_grad->mutable_data<T>(warpctc_logits.dims(), ctx.GetPlace());
-
-    math::SetConstant<DeviceContext, T>()(
-        ctx.template device_context<DeviceContext>(), warpctc_grad,
-        static_cast<T>(0));
-
-    Tensor warpctc_label;
-    TensorCopySync(*label, platform::CPUPlace(), &warpctc_label);
-    const int* warpctc_label_data = warpctc_label.data<int>();
-    // ========================================================================
-
-    ScopedTensorDescriptor logits_desc;
-    ScopedTensorDescriptor grad_desc;
-    ScopedCTCLossDescriptor ctcloss_desc;
-    // layout here doesn't have effect.
-    DataLayout layout = DataLayout::kNCHW;
-
-    auto cu_logits_desc = logits_desc.descriptor<T>(
-        layout, framework::vectorize2int(warpctc_logits.dims()));
-    auto cu_grad_desc = grad_desc.descriptor<T>(
-        layout, framework::vectorize2int(warpctc_grad->dims()));
-    auto cu_ctcloss_desc = ctcloss_desc.descriptor<T>();
-
-    auto handle = dev_ctx.cudnn_handle();
-    size_t workspace_size;
-
-    CUDNN_ENFORCE(platform::dynload::cudnnGetCTCLossWorkspaceSize(
-        handle, cu_logits_desc, cu_grad_desc, warpctc_label_data,
-        warpctc_label_lengths.data(), warpctc_logits_lengths.data(),
-        CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, cu_ctcloss_desc, &workspace_size));
-
-    T* loss_data = loss->mutable_data<T>(loss_dims, ctx.GetPlace());
-
-    auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-    auto cudnn_func = [&](void* cudnn_workspace) {
-      CUDNN_ENFORCE(platform::dynload::cudnnCTCLoss(
-          handle, cu_logits_desc, warpctc_logits_data, warpctc_label_data,
-          warpctc_label_lengths.data(), warpctc_logits_lengths.data(),
-          loss_data, cu_grad_desc, warpctc_grad_data,
-          CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, cu_ctcloss_desc, cudnn_workspace,
-          workspace_size));
-    };
-    workspace_handle.RunFunc(cudnn_func, workspace_size);
-  }
-};
-
-template <typename DeviceContext, typename T>
-class CudnnCTCGradKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* warpctc_grad = ctx.Input<LoDTensor>("WarpCTCGrad");
-    auto* logits_grad = ctx.Output<LoDTensor>(framework::GradVarName("Logits"));
-    const Tensor* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
-
-    logits_grad->mutable_data<T>(ctx.GetPlace());
-    bool norm_by_times = ctx.Attr<bool>("norm_by_times");
-    math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
-        ctx.template device_context<DeviceContext>(), *warpctc_grad,
-        logits_grad, -1, 0, norm_by_times, math::kLengthBatchWidth);
-
-    const T* loss_grad_data = loss_grad->data<T>();
-    math::ScaleLoDTensorFunctor<DeviceContext, T>()(
-        ctx.template device_context<DeviceContext>(), loss_grad_data,
-        logits_grad);
-  }
-};
-
-#endif
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-#if CUDNN_VERSION >= 7001
-REGISTER_OP_KERNEL(
-    warpctc, CUDNN, plat::CUDAPlace,
-    ops::CudnnCTCKernel<plat::CUDADeviceContext, float>);
-REGISTER_OP_KERNEL(
-    warpctc_grad, CUDNN, plat::CUDAPlace,
-    ops::CudnnCTCGradKernel<plat::CUDADeviceContext, float>);
-#endif
diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc
index 2eacefc43c..7033d55a53 100644
--- a/paddle/fluid/operators/warpctc_op.cc
+++ b/paddle/fluid/operators/warpctc_op.cc
@@ -59,20 +59,6 @@ class WarpCTCOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     framework::LibraryType library_{framework::LibraryType::kPlain};
-#ifdef PADDLE_WITH_CUDA
-    if (platform::CanCUDNNBeUsed(ctx)) {
-#if CUDA_VERSION >= 9000
-      LOG(WARNING)
-          << "The cudnnCTCLoss of CUDNN7 have some diff between "
-             "CUDA9/CUDA10 and CUDA8. You can close use_cudnn option to "
-             "use "
-             "baidu-research/warp-ctc(https://github.com/baidu-research/"
-             "warp-ctc)";
-#endif
-
-      library_ = framework::LibraryType::kCUDNN;
-    }
-#endif
     framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
     return framework::OpKernelType(ctx.Input<LoDTensor>("Logits")->type(),
                                    ctx.device_context(), layout_, library_);
@@ -129,10 +115,6 @@ class WarpCTCOpMaker : public framework::OpProtoAndCheckerMaker {
                  "normalize the gradients by the number of time-step, "
                  "which is also the sequence's length.")
         .SetDefault(false);
-    AddAttr<bool>("use_cudnn",
-                  "(bool, default: false), whether to "
-                  "use cudnn kernel.")
-        .SetDefault(false);
     AddComment(R"DOC(
 An operator integrating the open-source
 [warp-ctc](https://github.com/baidu-research/warp-ctc) library, which is used in
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 20735aa23b..5c5e9975b3 100755
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -5793,7 +5793,6 @@ def warpctc(input,
             label,
             blank=0,
             norm_by_times=False,
-            use_cudnn=False,
             input_length=None,
             label_length=None):
     """
@@ -5825,7 +5824,6 @@ def warpctc(input,
             by the number of time-step, which is also the sequence's length.
             There is no need to normalize the gradients if warpctc layer was
             follewed by a mean_op.
-        use_cudnn (bool, default false): Whether to use cudnn.
         input_length(Variable): The length for each input sequence if it is
             of Tensor type, it should have shape `[batch_size]` and dtype int64.
         label_length(Variable): The length for each label sequence if it is
@@ -5885,7 +5883,6 @@ def warpctc(input,
         attrs={
             'blank': blank,
             'norm_by_times': norm_by_times,
-            'use_cudnn': use_cudnn
         })
     return loss_out
 
diff --git a/python/paddle/fluid/tests/unittests/test_warpctc_op.py b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
index 1289250134..74bb5ea2b0 100644
--- a/python/paddle/fluid/tests/unittests/test_warpctc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
@@ -183,7 +183,6 @@ class TestWarpCTCOp(OpTest):
         self.labels_lod = [[3, 1, 4, 4]]
         self.blank = self.num_classes - 1
         self.norm_by_times = False
-        self.use_cudnn = False
 
     def setUp(self):
         self.op_type = "warpctc"
@@ -219,7 +218,6 @@
         self.attrs = {
             "blank": self.blank,
             "norm_by_times": self.norm_by_times,
-            "use_cudnn": self.use_cudnn
         }
 
     def test_check_output(self):
@@ -238,7 +236,6 @@ class TestWarpCTCOpCase1(TestWarpCTCOp):
         self.labels_lod = [[3, 1, 4, 4]]
         self.blank = 0
         self.norm_by_times = False
-        self.use_cudnn = False
 
 
 class TestWarpCTCOpWithPadding(OpTest):
@@ -251,7 +248,6 @@ class TestWarpCTCOpWithPadding(OpTest):
         self.labels_length = np.array([3, 1, 4, 4], dtype=np.int64)
         self.blank = self.num_classes - 1
         self.norm_by_times = False
-        self.use_cudnn = False
 
     def setUp(self):
         self.op_type = "warpctc"
@@ -315,7 +311,6 @@
         self.attrs = {
             "blank": self.blank,
             "norm_by_times": self.norm_by_times,
-            "use_cudnn": self.use_cudnn
         }
 
     def test_check_output(self):
@@ -336,23 +331,7 @@ class TestWarpCTCOpWithPaddingCase1(TestWarpCTCOpWithPadding):
         self.labels_length = np.array([3, 1, 4, 4], dtype=np.int64)
         self.blank = 0
         self.norm_by_times = False
-        self.use_cudnn = False
-
-
-# TODO: fix this test failed cuda9/10 manylinux images
-# class TestCudnnCTCOp(TestWarpCTCOp):
-#     def config(self):
-#         self.batch_size = 4
-#         self.num_classes = 8
-#         self.logits_lod = [[4, 1, 3, 3]]
-#         self.labels_lod = [[3, 1, 4, 4]]
-#         self.blank = 0
-#         self.norm_by_times = False
-#         self.use_cudnn = True
-#     def test_check_grad(self):
-#         if sys.version_info < (3, 0):
-#             self.outputs['WarpCTCGrad'] = self.gradient
-#             self.check_grad(["Logits"], "Loss", max_relative_error=0.01)
+
 
 if __name__ == "__main__":
     unittest.main()
-- 
GitLab