remove linear_chain_crf and crf_decoding from fluid (#48996)

* remove linear_chain_crf and crf_decoding

remove linear_chain_crf and crf_decoding from fluid (#48996)
* remove linear_chain_crf and crf_decoding
aaee07a3 · ccrrong · GitHub · 265a54aa · aaee07a3 · aaee07a3
8 changed files
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -65,8 +65,6 @@ from collections.abc import Iterable
 __all__ = [
    'fc',
    'embedding',
-    'linear_chain_crf',
-    'crf_decoding',
    'conv2d',
    'dropout',
    'split',
@@ -752,211 +750,6 @@ def _pull_box_sparse(
    return outs


-@templatedoc()
-def linear_chain_crf(input, label, param_attr=None, length=None):
-    """
-    :api_attr: Static Graph
-
-    Linear Chain CRF.
-
-    ${comment}
-
-    Args:
-        input(${emission_type}): ${emission_comment}
-        label(${label_type}): ${label_comment}
-        Length(${length_type}): ${length_comment}
-        param_attr(ParamAttr): The attribute of the learnable parameter for transition parameter.
-
-    Returns:
-        output(${emission_exps_type}): ${emission_exps_comment} \n
-        output(${transition_exps_type}): ${transition_exps_comment} \n
-        output(${log_likelihood_type}): ${log_likelihood_comment} \n
-
-    Examples:
-        .. code-block:: python
-
-            import paddle.fluid as fluid
-            import numpy as np
-            import paddle
-            paddle.enable_static()
-
-            #define net structure, using LodTensor
-            train_program = fluid.Program()
-            startup_program = fluid.Program()
-            with fluid.program_guard(train_program, startup_program):
-                input_data = fluid.data(name='input_data', shape=[-1,10], dtype='float32')
-                label = fluid.data(name='label', shape=[-1,1], dtype='int')
-                emission= fluid.layers.fc(input=input_data, size=10, act="tanh")
-                crf_cost = fluid.layers.linear_chain_crf(
-                    input=emission,
-                    label=label,
-                    param_attr=fluid.ParamAttr(
-                    name='crfw',
-                    learning_rate=0.01))
-            use_cuda = False
-            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            exe.run(startup_program)
-            #define data, using LoDTensor
-            a = fluid.create_lod_tensor(np.random.rand(12,10).astype('float32'), [[3,3,4,2]], place)
-            b = fluid.create_lod_tensor(np.array([[1],[1],[2],[3],[1],[1],[1],[3],[1],[1],[1],[1]]),[[3,3,4,2]] , place)
-            feed1 = {'input_data':a,'label':b}
-            loss= exe.run(train_program,feed=feed1, fetch_list=[crf_cost])
-            print(loss)
-
-            #define net structure, using padding
-            train_program = fluid.Program()
-            startup_program = fluid.Program()
-            with fluid.program_guard(train_program, startup_program):
-                input_data2 = fluid.data(name='input_data2', shape=[-1,10,10], dtype='float32')
-                label2 = fluid.data(name='label2', shape=[-1,10,1], dtype='int')
-                label_length = fluid.data(name='length', shape=[-1,1], dtype='int')
-                emission2= fluid.layers.fc(input=input_data2, size=10, act="tanh", num_flatten_dims=2)
-                crf_cost2 = fluid.layers.linear_chain_crf(
-                    input=emission2,
-                    label=label2,
-                    length=label_length,
-                    param_attr=fluid.ParamAttr(
-                     name='crfw',
-                     learning_rate=0.01))
-
-            use_cuda = False
-            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            exe.run(startup_program)
-
-            #define data, using padding
-            cc=np.random.rand(4,10,10).astype('float32')
-            dd=np.random.rand(4,10,1).astype('int64')
-            ll=np.array([[3],[3],[4],[2]])
-            feed2 = {'input_data2':cc,'label2':dd,'length':ll}
-            loss2= exe.run(train_program,feed=feed2, fetch_list=[crf_cost2])
-            print(loss2)
-            #[array([[ 7.8902354],
-            #        [ 7.3602567],
-            #        [ 10.004011],
-            #        [ 5.86721  ]], dtype=float32)]
-
-            #you can use find_var to get transition parameter.
-            transition=np.array(fluid.global_scope().find_var('crfw').get_tensor())
-            print(transition)
-
-    """
-    check_variable_and_dtype(
-        input, 'input', ['float32', 'float64'], 'linear_chain_crf'
-    )
-    check_variable_and_dtype(label, 'label', ['int64'], 'linear_chain_crf')
-    helper = LayerHelper('linear_chain_crf', **locals())
-    size = input.shape[2] if length else input.shape[1]
-    transition = helper.create_parameter(
-        attr=helper.param_attr,
-        shape=[size + 2, size],
-        dtype=helper.input_dtype(),
-    )
-    alpha = helper.create_variable_for_type_inference(
-        dtype=helper.input_dtype()
-    )
-    emission_exps = helper.create_variable_for_type_inference(
-        dtype=helper.input_dtype()
-    )
-    transition_exps = helper.create_variable_for_type_inference(
-        dtype=helper.input_dtype()
-    )
-    log_likelihood = helper.create_variable_for_type_inference(
-        dtype=helper.input_dtype()
-    )
-    this_inputs = {
-        "Emission": [input],
-        "Transition": transition,
-        "Label": [label],
-    }
-    if length:
-        this_inputs['Length'] = [length]
-    helper.append_op(
-        type='linear_chain_crf',
-        inputs=this_inputs,
-        outputs={
-            "Alpha": [alpha],
-            "EmissionExps": [emission_exps],
-            "TransitionExps": transition_exps,
-            "LogLikelihood": log_likelihood,
-        },
-    )
-
-    return log_likelihood
-
-
-@templatedoc()
-def crf_decoding(input, param_attr, label=None, length=None):
-    """
-    :api_attr: Static Graph
-
-    ${comment}
-
-    Args:
-        input(Tensor): ${emission_comment}
-
-        param_attr (ParamAttr|None): To specify the weight parameter attribute.
-            Default: None, which means the default weight parameter property is
-            used. See usage for details in :ref:`api_paddle_fluid_param_attr_ParamAttr` .
-
-        label(${label_type}, optional): ${label_comment}
-
-        length(${length_type}, optional): ${length_comment}
-
-    Returns:
-        Tensor: ${viterbi_path_comment}
-
-    Examples:
-        .. code-block:: python
-
-           import paddle
-           paddle.enable_static()
-
-           # LoDTensor-based example
-           num_labels = 10
-           feature = paddle.static.data(name='word_emb', shape=[-1, 784], dtype='float32', lod_level=1)
-           label = paddle.static.data(name='label', shape=[-1, 1], dtype='int64', lod_level=1)
-           emission = paddle.static.nn.fc(feature, size=num_labels)
-
-           crf_cost = paddle.fluid.layers.linear_chain_crf(input=emission, label=label,
-                     param_attr=paddle.ParamAttr(name="crfw"))
-           crf_decode = paddle.static.nn.crf_decoding(input=emission,
-                     param_attr=paddle.ParamAttr(name="crfw"))
-
-           # Common tensor example
-           num_labels, max_len = 10, 20
-           feature = paddle.static.data(name='word_emb_pad', shape=[-1, max_len, 784], dtype='float32')
-           label = paddle.static.data(name='label_pad', shape=[-1, max_len, 1], dtype='int64')
-           length = paddle.static.data(name='length', shape=[-1, 1], dtype='int64')
-           emission = paddle.static.nn.fc(feature, size=num_labels,
-                                      num_flatten_dims=2)
-
-           crf_cost = paddle.fluid.layers.linear_chain_crf(input=emission, label=label, length=length,
-                     param_attr=paddle.ParamAttr(name="crfw_pad"))
-           crf_decode = paddle.static.nn.crf_decoding(input=emission, length=length,
-                     param_attr=paddle.ParamAttr(name="crfw_pad"))
-    """
-    check_variable_and_dtype(
-        input, 'input', ['float32', 'float64'], 'crf_decoding'
-    )
-    helper = LayerHelper('crf_decoding', **locals())
-    transition = helper.get_parameter(param_attr.name)
-    viterbi_path = helper.create_variable_for_type_inference(
-        dtype=core.VarDesc.VarType.INT64
-    )
-    inputs = {"Emission": [input], "Transition": transition, "Label": label}
-    if length:
-        inputs['Length'] = length
-    helper.append_op(
-        type='crf_decoding',
-        inputs=inputs,
-        outputs={"ViterbiPath": [viterbi_path]},
-    )
-
-    return viterbi_path
-
-
 @deprecated(since="2.0.0", update_to="paddle.nn.functional.dropout")
 def dropout(
    x,

--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
@@ -162,12 +162,8 @@ def train(use_cuda, save_dirname=None, is_local=True):
    target = fluid.layers.data(
        name='target', shape=[1], dtype='int64', lod_level=1
    )
-    crf_cost = fluid.layers.linear_chain_crf(
-        input=feature_out,
-        label=target,
-        param_attr=fluid.ParamAttr(name='crfw', learning_rate=mix_hidden_lr),
-    )
-    avg_cost = paddle.mean(crf_cost)
+    cost = fluid.layers.softmax_with_cross_entropy(feature_out, target)
+    avg_cost = paddle.mean(cost)

    # TODO(qiao)
    # check other optimizers and check why out will be NAN
@@ -183,9 +179,6 @@ def train(use_cuda, save_dirname=None, is_local=True):

    # TODO(qiao)
    # add dependency track and move this config before optimizer
-    crf_decode = fluid.layers.crf_decoding(
-        input=feature_out, param_attr=fluid.ParamAttr(name='crfw')
-    )

    train_data = paddle.batch(
        paddle.reader.shuffle(paddle.dataset.conll05.test(), buf_size=8192),

--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -422,7 +422,6 @@ endfunction()
 list(REMOVE_ITEM TEST_OPS test_feed_data_check_shape_type)
 list(REMOVE_ITEM TEST_OPS test_fetch_lod_tensor_array)
 list(REMOVE_ITEM TEST_OPS test_warpctc_op)
-list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_profiler)
 list(REMOVE_ITEM TEST_OPS test_data_norm_op)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed)
@@ -748,7 +747,6 @@ if(WITH_DISTRIBUTE)
  endif()
 endif()

-py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf)
 # profiler will random hang in linux cuda 10.1 or 10.2
 # see https://github.com/PaddlePaddle/Paddle/issues/29082 for details.
 # We guess there are some bugs in linux cuda 10.1 or 10.2,
@@ -916,7 +914,6 @@ set_tests_properties(
  test_buffer_shared_memory_reuse_pass
  PROPERTIES LABELS "RUN_TYPE=DIST")
 set_tests_properties(
-  test_parallel_executor_crf
  test_sync_batch_norm_op
  test_inplace_abn_op
  test_parallel_executor_seresnext_base_gpu
@@ -1053,7 +1050,6 @@ set_tests_properties(test_index_select_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_index_add_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_parallel_ssa_graph_inference_feed_partial_data
                     PROPERTIES TIMEOUT 120)
-set_tests_properties(test_parallel_executor_crf PROPERTIES TIMEOUT 120)
 set_tests_properties(test_tensordot PROPERTIES TIMEOUT 200)
 set_tests_properties(test_imperative_save_load PROPERTIES TIMEOUT 120)
 set_tests_properties(test_partial_eager_deletion_transformer PROPERTIES TIMEOUT

--- a/python/paddle/fluid/tests/unittests/test_directory_migration.py
+++ b/python/paddle/fluid/tests/unittests/test_directory_migration.py
@@ -91,7 +91,6 @@ class TestDirectory(unittest.TestCase):
            'paddle.static.nn.conv3d',
            'paddle.static.nn.conv3d_transpose',
            'paddle.static.nn.create_parameter',
-            'paddle.static.nn.crf_decoding',
            'paddle.static.nn.data_norm',
            'paddle.static.nn.deform_conv2d',
            'paddle.static.nn.group_norm',

--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import unittest
-
-import paddle
-import paddle.dataset.conll05 as conll05
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid import compiler
-
-word_dict, verb_dict, label_dict = conll05.get_dict()
-word_dict_len = len(word_dict)
-label_dict_len = len(label_dict)
-pred_dict_len = len(verb_dict)
-mark_dict_len = 2
-word_dim = 32
-mark_dim = 5
-hidden_dim = 512
-depth = 8
-mix_hidden_lr = 1e-3
-embedding_name = 'emb'
-
-
-def db_lstm(
-    word,
-    predicate,
-    ctx_n2,
-    ctx_n1,
-    ctx_0,
-    ctx_p1,
-    ctx_p2,
-    mark,
-    is_sparse,
-    **ignored
-):
-    # 8 features
-    predicate_embedding = fluid.layers.embedding(
-        input=predicate,
-        is_sparse=is_sparse,
-        size=[pred_dict_len, word_dim],
-        dtype='float32',
-        param_attr='vemb',
-    )
-
-    mark_embedding = fluid.layers.embedding(
-        input=mark,
-        is_sparse=is_sparse,
-        size=[mark_dict_len, mark_dim],
-        dtype='float32',
-    )
-
-    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
-    emb_layers = [
-        fluid.layers.embedding(
-            size=[word_dict_len, word_dim],
-            is_sparse=is_sparse,
-            input=x,
-            param_attr=fluid.ParamAttr(name=embedding_name, trainable=False),
-        )
-        for x in word_input
-    ]
-    # TODO(zcd): if the parameter is not trainable, the
-    #  parameter's gradient should not generated.
-    for emb_layer in emb_layers:
-        emb_layer.stop_gradient = True
-
-    emb_layers.append(predicate_embedding)
-    emb_layers.append(mark_embedding)
-
-    hidden_0_layers = [
-        fluid.layers.fc(input=emb, size=hidden_dim, act='tanh')
-        for emb in emb_layers
-    ]
-
-    hidden_0 = fluid.layers.sums(input=hidden_0_layers)
-
-    lstm_0 = fluid.layers.dynamic_lstm(
-        input=hidden_0,
-        size=hidden_dim,
-        candidate_activation='relu',
-        gate_activation='sigmoid',
-        cell_activation='sigmoid',
-    )
-
-    # stack L-LSTM and R-LSTM with direct edges
-    input_tmp = [hidden_0, lstm_0]
-
-    for i in range(1, depth):
-        mix_hidden = fluid.layers.sums(
-            input=[
-                fluid.layers.fc(
-                    input=input_tmp[0], size=hidden_dim, act='tanh'
-                ),
-                fluid.layers.fc(
-                    input=input_tmp[1], size=hidden_dim, act='tanh'
-                ),
-            ]
-        )
-
-        lstm = fluid.layers.dynamic_lstm(
-            input=mix_hidden,
-            size=hidden_dim,
-            candidate_activation='relu',
-            gate_activation='sigmoid',
-            cell_activation='sigmoid',
-            is_reverse=((i % 2) == 1),
-        )
-
-        input_tmp = [mix_hidden, lstm]
-
-    feature_out = fluid.layers.sums(
-        input=[
-            fluid.layers.fc(
-                input=input_tmp[0], size=label_dict_len, act='tanh'
-            ),
-            fluid.layers.fc(
-                input=input_tmp[1], size=label_dict_len, act='tanh'
-            ),
-        ]
-    )
-
-    return feature_out
-
-
-class TestCRFModel(unittest.TestCase):
-    def check_network_convergence(
-        self, is_sparse, build_strategy=None, use_cuda=True
-    ):
-        os.environ['CPU_NUM'] = str(4)
-        main = fluid.Program()
-        startup = fluid.Program()
-        scope = fluid.Scope()
-        with fluid.scope_guard(scope):
-            with fluid.program_guard(main, startup):
-                word = fluid.layers.data(
-                    name='word_data', shape=[1], dtype='int64', lod_level=1
-                )
-                predicate = fluid.layers.data(
-                    name='verb_data', shape=[1], dtype='int64', lod_level=1
-                )
-                ctx_n2 = fluid.layers.data(
-                    name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1
-                )
-                ctx_n1 = fluid.layers.data(
-                    name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1
-                )
-                ctx_0 = fluid.layers.data(
-                    name='ctx_0_data', shape=[1], dtype='int64', lod_level=1
-                )
-                ctx_p1 = fluid.layers.data(
-                    name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1
-                )
-                ctx_p2 = fluid.layers.data(
-                    name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1
-                )
-                mark = fluid.layers.data(
-                    name='mark_data', shape=[1], dtype='int64', lod_level=1
-                )
-
-                feature_out = db_lstm(**locals())
-                target = fluid.layers.data(
-                    name='target', shape=[1], dtype='int64', lod_level=1
-                )
-                crf_cost = fluid.layers.linear_chain_crf(
-                    input=feature_out,
-                    label=target,
-                    param_attr=fluid.ParamAttr(name='crfw', learning_rate=1e-1),
-                )
-                avg_cost = paddle.mean(crf_cost)
-
-                sgd_optimizer = fluid.optimizer.SGD(
-                    learning_rate=fluid.layers.exponential_decay(
-                        learning_rate=0.01,
-                        decay_steps=100000,
-                        decay_rate=0.5,
-                        staircase=True,
-                    )
-                )
-                sgd_optimizer.minimize(avg_cost)
-
-                train_data = paddle.batch(
-                    paddle.reader.shuffle(
-                        paddle.dataset.conll05.test(), buf_size=8192
-                    ),
-                    batch_size=8,
-                )
-
-                place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-                exe = fluid.Executor(place)
-                exe.run(startup)
-
-                train_cp = compiler.CompiledProgram(main).with_data_parallel(
-                    loss_name=avg_cost.name, build_strategy=build_strategy
-                )
-
-                feeder = fluid.DataFeeder(
-                    feed_list=[
-                        word,
-                        ctx_n2,
-                        ctx_n1,
-                        ctx_0,
-                        ctx_p1,
-                        ctx_p2,
-                        predicate,
-                        mark,
-                        target,
-                    ],
-                    place=fluid.CPUPlace(),
-                )
-
-            data = train_data()
-            for i in range(4):
-                cur_batch = next(data)
-                print(
-                    exe.run(
-                        train_cp,
-                        feed=feeder.feed(cur_batch),
-                        fetch_list=[avg_cost.name],
-                    )[0]
-                )
-
-    def _new_build_strategy(self, use_reduce=False):
-        build_strategy = fluid.BuildStrategy()
-
-        if use_reduce:
-            build_strategy.reduce_strategy = (
-                fluid.BuildStrategy.ReduceStrategy.Reduce
-            )
-        else:
-            build_strategy.reduce_strategy = (
-                fluid.BuildStrategy.ReduceStrategy.AllReduce
-            )
-
-        return build_strategy
-
-    def test_update_sparse_parameter_all_reduce(self):
-        if core.is_compiled_with_cuda():
-            self.check_network_convergence(
-                is_sparse=True,
-                build_strategy=self._new_build_strategy(),
-                use_cuda=True,
-            )
-
-        self.check_network_convergence(
-            is_sparse=True,
-            build_strategy=self._new_build_strategy(),
-            use_cuda=False,
-        )
-
-    def test_update_dense_parameter_all_reduce(self):
-        if core.is_compiled_with_cuda():
-            self.check_network_convergence(
-                is_sparse=False,
-                build_strategy=self._new_build_strategy(),
-                use_cuda=True,
-            )
-
-        self.check_network_convergence(
-            is_sparse=False,
-            build_strategy=self._new_build_strategy(),
-            use_cuda=False,
-        )
-
-    def test_update_sparse_parameter_reduce(self):
-        if core.is_compiled_with_cuda():
-            self.check_network_convergence(
-                is_sparse=True,
-                build_strategy=self._new_build_strategy(use_reduce=True),
-                use_cuda=True,
-            )
-        self.check_network_convergence(
-            is_sparse=True,
-            build_strategy=self._new_build_strategy(use_reduce=True),
-            use_cuda=False,
-        )
-
-    def test_update_dense_parameter_reduce(self):
-        if core.is_compiled_with_cuda():
-            self.check_network_convergence(
-                is_sparse=False,
-                build_strategy=self._new_build_strategy(use_reduce=True),
-                use_cuda=True,
-            )
-        self.check_network_convergence(
-            is_sparse=False,
-            build_strategy=self._new_build_strategy(use_reduce=True),
-            use_cuda=False,
-        )
-
-
-if __name__ == '__main__':
-    unittest.main()
--- a/python/paddle/static/nn/__init__.py
+++ b/python/paddle/static/nn/__init__.py
@@ -31,7 +31,6 @@ from .common import bilinear_tensor_product  # noqa: F401
 from .common import py_func  # noqa: F401
 from ...tensor.creation import create_parameter  # noqa: F401
 from ...fluid.layers import conv2d  # noqa: F401
-from ...fluid.layers import crf_decoding  # noqa: F401
 from ...fluid.layers import layer_norm  # noqa: F401
 from ...fluid.layers import multi_box_head  # noqa: F401
 from .loss import nce  # noqa: F401
@@ -72,7 +71,6 @@ __all__ = [  # noqa
    'conv2d_transpose',
    'conv3d',
    'conv3d_transpose',
-    'crf_decoding',
    'data_norm',
    'deform_conv2d',
    'group_norm',

--- a/tools/parallel_UT_rule.py
+++ b/tools/parallel_UT_rule.py
@@ -1572,7 +1572,6 @@ FOURTH_HIGH_PARALLEL_JOB_NEW = [
 FIFTH_PARALLEL_JOB_NEW = [
    'test_buffer_shared_memory_reuse_pass',
    'test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass',
-    'test_parallel_executor_crf',
    'test_multiprocess_reader_exception',
    'buddy_allocator_test',
    'test_multiprocess_dataloader_dataset',

--- a/tools/static_mode_white_list.py
+++ b/tools/static_mode_white_list.py
@@ -543,7 +543,6 @@ STATIC_MODE_TESTING_LIST = [
    'test_transpiler_ops',
    'test_communicator_sync',
    'test_collective_optimizer',
-    'test_parallel_executor_crf',
    'test_parallel_executor_profiler',
    'test_parallel_executor_transformer',
    'test_parallel_executor_transformer_auto_growth',