s920243400 / PaddleOCR (forked from PaddlePaddle / PaddleOCR)

Commit 91f6f243
Authored Aug 27, 2020 by tink2123
Parent: 5edb619c

formate code
Showing 1 changed file with 9 additions and 568 deletions.

ppocr/modeling/heads/self_attention/model.py  +9 -568
@@ -7,6 +7,11 @@ import paddle.fluid.layers as layers
 # Set seed for CE
 dropout_seed = None
+
+encoder_data_input_fields = (
+    "src_word",
+    "src_pos",
+    "src_slf_attn_bias", )


 def wrap_layer_with_block(layer, block_idx):
     """
@@ -45,25 +50,6 @@ def wrap_layer_with_block(layer, block_idx):
     return layer_wrapper


-def position_encoding_init(n_position, d_pos_vec):
-    """
-    Generate the initial values for the sinusoid position encoding table.
-    """
-    channels = d_pos_vec
-    position = np.arange(n_position)
-    num_timescales = channels // 2
-    log_timescale_increment = (np.log(float(1e4) / float(1)) /
-                               (num_timescales - 1))
-    inv_timescales = np.exp(np.arange(
-        num_timescales)) * -log_timescale_increment
-    scaled_time = np.expand_dims(position, 1) * np.expand_dims(inv_timescales,
-                                                               0)
-    signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)
-    signal = np.pad(signal, [[0, 0], [0, np.mod(channels, 2)]], 'constant')
-    position_enc = signal
-    return position_enc.astype("float32")
-
-
 def multi_head_attention(queries,
                          keys,
                          values,
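For reference, the deleted position_encoding_init helper builds the sinusoid position table from "Attention Is All You Need": each position p and channel pair i holds sin(p * inv_timescale_i) and cos(p * inv_timescale_i), with timescales spaced geometrically up to 10000. The standalone NumPy sketch below illustrates that standard formulation; it is not part of the commit and not a byte-for-byte copy of the removed code.

import numpy as np


def sinusoid_position_encoding(n_position, d_pos_vec):
    """Standard sinusoidal position table, shape [n_position, d_pos_vec].

    Reference sketch of the formulation the removed helper targets:
    sin values in the first half of the channels, cos in the second.
    """
    channels = d_pos_vec
    position = np.arange(n_position, dtype="float64")           # [n_position]
    num_timescales = channels // 2
    log_timescale_increment = np.log(1e4) / (num_timescales - 1)
    # One inverse timescale per channel pair: 1 / 10000^(i / (num_timescales - 1))
    inv_timescales = np.exp(
        np.arange(num_timescales, dtype="float64") * -log_timescale_increment)
    scaled_time = position[:, None] * inv_timescales[None, :]   # [n_position, num_timescales]
    signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)
    # Pad one zero column if d_pos_vec is odd so the table matches the embedding width.
    signal = np.pad(signal, [[0, 0], [0, channels % 2]], "constant")
    return signal.astype("float32")


if __name__ == "__main__":
    table = sinusoid_position_encoding(n_position=256, d_pos_vec=512)
    print(table.shape)  # (256, 512)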
@@ -271,7 +257,7 @@ post_process_layer = pre_post_process_layer


 def prepare_encoder(
-        src_word,  #[b,t,c]
+        src_word,  # [b,t,c]
         src_pos,
         src_vocab_size,
         src_emb_dim,
@@ -286,7 +272,7 @@ def prepare_encoder(
     This module is used at the bottom of the encoder stacks.
     """

-    src_word_emb = src_word  #layers.concat(res,axis=1)
+    src_word_emb = src_word  # layers.concat(res,axis=1)
     src_word_emb = layers.cast(src_word_emb, 'float32')
     # print("src_word_emb",src_word_emb)
@@ -338,12 +324,6 @@ def prepare_decoder(src_word,
         is_test=False) if dropout_rate else enc_input


-# prepare_encoder = partial(
-#     prepare_encoder_decoder, pos_enc_param_name=pos_enc_param_names[0])
-# prepare_decoder = partial(
-#     prepare_encoder_decoder, pos_enc_param_name=pos_enc_param_names[1])
-
-
 def encoder_layer(enc_input,
                   attn_bias,
                   n_head,
@@ -412,234 +392,6 @@ def encoder(enc_input,
     return enc_output


-def decoder_layer(dec_input,
-                  enc_output,
-                  slf_attn_bias,
-                  dec_enc_attn_bias,
-                  n_head,
-                  d_key,
-                  d_value,
-                  d_model,
-                  d_inner_hid,
-                  prepostprocess_dropout,
-                  attention_dropout,
-                  relu_dropout,
-                  preprocess_cmd,
-                  postprocess_cmd,
-                  cache=None,
-                  gather_idx=None):
-    """ The layer to be stacked in decoder part.
-    The structure of this module is similar to that in the encoder part except
-    a multi-head attention is added to implement encoder-decoder attention.
-    """
-    slf_attn_output = multi_head_attention(
-        pre_process_layer(dec_input, preprocess_cmd, prepostprocess_dropout),
-        None,
-        None,
-        slf_attn_bias,
-        d_key,
-        d_value,
-        d_model,
-        n_head,
-        attention_dropout,
-        cache=cache,
-        gather_idx=gather_idx)
-    slf_attn_output = post_process_layer(
-        dec_input,
-        slf_attn_output,
-        postprocess_cmd,
-        prepostprocess_dropout, )
-    enc_attn_output = multi_head_attention(
-        pre_process_layer(slf_attn_output, preprocess_cmd,
-                          prepostprocess_dropout),
-        enc_output,
-        enc_output,
-        dec_enc_attn_bias,
-        d_key,
-        d_value,
-        d_model,
-        n_head,
-        attention_dropout,
-        cache=cache,
-        gather_idx=gather_idx,
-        static_kv=True)
-    enc_attn_output = post_process_layer(
-        slf_attn_output,
-        enc_attn_output,
-        postprocess_cmd,
-        prepostprocess_dropout, )
-    ffd_output = positionwise_feed_forward(
-        pre_process_layer(enc_attn_output, preprocess_cmd,
-                          prepostprocess_dropout),
-        d_inner_hid,
-        d_model,
-        relu_dropout, )
-    dec_output = post_process_layer(
-        enc_attn_output,
-        ffd_output,
-        postprocess_cmd,
-        prepostprocess_dropout, )
-    return dec_output
-
-
-def decoder(dec_input,
-            enc_output,
-            dec_slf_attn_bias,
-            dec_enc_attn_bias,
-            n_layer,
-            n_head,
-            d_key,
-            d_value,
-            d_model,
-            d_inner_hid,
-            prepostprocess_dropout,
-            attention_dropout,
-            relu_dropout,
-            preprocess_cmd,
-            postprocess_cmd,
-            caches=None,
-            gather_idx=None):
-    """
-    The decoder is composed of a stack of identical decoder_layer layers.
-    """
-    for i in range(n_layer):
-        dec_output = decoder_layer(
-            dec_input,
-            enc_output,
-            dec_slf_attn_bias,
-            dec_enc_attn_bias,
-            n_head,
-            d_key,
-            d_value,
-            d_model,
-            d_inner_hid,
-            prepostprocess_dropout,
-            attention_dropout,
-            relu_dropout,
-            preprocess_cmd,
-            postprocess_cmd,
-            cache=None if caches is None else caches[i],
-            gather_idx=gather_idx)
-        dec_input = dec_output
-    dec_output = pre_process_layer(dec_output, preprocess_cmd,
-                                   prepostprocess_dropout)
-    return dec_output
-
-
-def make_all_inputs(input_fields):
-    """
-    Define the input data layers for the transformer model.
-    """
-    inputs = []
-    for input_field in input_fields:
-        input_var = layers.data(
-            name=input_field,
-            shape=input_descs[input_field][0],
-            dtype=input_descs[input_field][1],
-            lod_level=input_descs[input_field][2]
-            if len(input_descs[input_field]) == 3 else 0,
-            append_batch_size=False)
-        inputs.append(input_var)
-    return inputs
-
-
-def make_all_py_reader_inputs(input_fields, is_test=False):
-    reader = layers.py_reader(
-        capacity=20,
-        name="test_reader" if is_test else "train_reader",
-        shapes=[input_descs[input_field][0] for input_field in input_fields],
-        dtypes=[input_descs[input_field][1] for input_field in input_fields],
-        lod_levels=[
-            input_descs[input_field][2]
-            if len(input_descs[input_field]) == 3 else 0
-            for input_field in input_fields
-        ])
-    return layers.read_file(reader), reader
-
-
-def transformer(src_vocab_size,
-                trg_vocab_size,
-                max_length,
-                n_layer,
-                n_head,
-                d_key,
-                d_value,
-                d_model,
-                d_inner_hid,
-                prepostprocess_dropout,
-                attention_dropout,
-                relu_dropout,
-                preprocess_cmd,
-                postprocess_cmd,
-                weight_sharing,
-                label_smooth_eps,
-                bos_idx=0,
-                use_py_reader=False,
-                is_test=False):
-    if weight_sharing:
-        assert src_vocab_size == trg_vocab_size, (
-            "Vocabularies in source and target should be same for weight sharing."
-        )
-
-    data_input_names = encoder_data_input_fields + \
-                decoder_data_input_fields[:-1] + label_data_input_fields
-
-    if use_py_reader:
-        all_inputs, reader = make_all_py_reader_inputs(data_input_names,
-                                                       is_test)
-    else:
-        all_inputs = make_all_inputs(data_input_names)
-    # print("all inputs",all_inputs)
-    enc_inputs_len = len(encoder_data_input_fields)
-    dec_inputs_len = len(decoder_data_input_fields[:-1])
-    enc_inputs = all_inputs[0:enc_inputs_len]
-    dec_inputs = all_inputs[enc_inputs_len:enc_inputs_len + dec_inputs_len]
-    label = all_inputs[-2]
-    weights = all_inputs[-1]
-
-    enc_output = wrap_encoder(
-        src_vocab_size,
-        64,
-        n_layer,
-        n_head,
-        d_key,
-        d_value,
-        d_model,
-        d_inner_hid,
-        prepostprocess_dropout,
-        attention_dropout,
-        relu_dropout,
-        preprocess_cmd,
-        postprocess_cmd,
-        weight_sharing,
-        enc_inputs)
-
-    predict = wrap_decoder(
-        trg_vocab_size,
-        max_length,
-        n_layer,
-        n_head,
-        d_key,
-        d_value,
-        d_model,
-        d_inner_hid,
-        prepostprocess_dropout,
-        attention_dropout,
-        relu_dropout,
-        preprocess_cmd,
-        postprocess_cmd,
-        weight_sharing,
-        dec_inputs,
-        enc_output, )
-
-    # Padding index do not contribute to the total loss. The weights is used to
-    # cancel padding index in calculating the loss.
-    if label_smooth_eps:
-        label = layers.label_smooth(
-            label=layers.one_hot(
-                input=label, depth=trg_vocab_size),
-            epsilon=label_smooth_eps)
-
-    cost = layers.softmax_with_cross_entropy(
-        logits=predict,
-        label=label,
-        soft_label=True if label_smooth_eps else False)
-    weighted_cost = cost * weights
-    sum_cost = layers.reduce_sum(weighted_cost)
-    token_num = layers.reduce_sum(weights)
-    token_num.stop_gradient = True
-    avg_cost = sum_cost / token_num
-    return sum_cost, avg_cost, predict, token_num, reader if use_py_reader else None
-
-
 def wrap_encoder_forFeature(src_vocab_size,
                             max_length,
                             n_layer,
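The removed transformer training graph combines label smoothing with per-token weights so that, as its own comment notes, padding indices do not contribute to the loss. The NumPy sketch below illustrates only that arithmetic (smooth the one-hot targets, take soft-label cross entropy against the softmax, zero out padding via the weights, normalize by the number of real tokens); the function and variable names are illustrative, not taken from the file.

import numpy as np


def smoothed_weighted_ce(logits, labels, weights, vocab_size, eps=0.1):
    """Cross entropy with label smoothing and padding weights.

    logits : float array [num_tokens, vocab_size]
    labels : int array   [num_tokens]   (target token ids)
    weights: float array [num_tokens]   (1.0 for real tokens, 0.0 for padding)
    Returns (sum_cost, avg_cost), mirroring the removed graph's outputs.
    """
    # Label smoothing: move eps of the probability mass onto a uniform prior.
    one_hot = np.eye(vocab_size)[labels]
    smoothed = one_hot * (1.0 - eps) + eps / vocab_size

    # Soft-label cross entropy against the softmax of the logits.
    log_probs = logits - np.log(np.sum(np.exp(logits), axis=-1, keepdims=True))
    cost = -np.sum(smoothed * log_probs, axis=-1)            # [num_tokens]

    # Padding tokens carry weight 0, so they do not contribute to the loss.
    weighted_cost = cost * weights
    sum_cost = weighted_cost.sum()
    token_num = weights.sum()
    return sum_cost, sum_cost / token_num


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    logits = rng.normal(size=(5, 8))
    labels = np.array([1, 3, 0, 0, 2])
    weights = np.array([1.0, 1.0, 1.0, 0.0, 0.0])            # last two are padding
    print(smoothed_weighted_ce(logits, labels, weights, vocab_size=8))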
@@ -662,44 +414,8 @@ def wrap_encoder_forFeature(src_vocab_size,
         img
     """
     if enc_inputs is None:
         # This is used to implement independent encoder program in inference.
         conv_features, src_pos, src_slf_attn_bias = make_all_inputs(encoder_data_input_fields)
     else:
-        conv_features, src_pos, src_slf_attn_bias = enc_inputs #
-    b, t, c = conv_features.shape
-    #"""
-    # insert cnn
-    #"""
-    #import basemodel
-    # feat = basemodel.resnet_50(img)
-
-    # mycrnn = basemodel.CRNN()
-    # feat = mycrnn.ocr_convs(img,use_cudnn=TrainTaskConfig.use_gpu)
-    # b, c, w, h = feat.shape
-    # src_word = layers.reshape(feat, shape=[-1, c, w * h])
-
-    #myconv8 = basemodel.conv8()
-    #feat = myconv8.net(img )
-    #b , c, h, w = feat.shape#h=6
-
-    #print(feat)
-    #layers.Print(feat,message="conv_feat",summarize=10)
-
-    #feat =layers.conv2d(feat,c,filter_size =[4 , 1],act="relu")
-    #feat = layers.pool2d(feat,pool_stride=(3,1),pool_size=(3,1))
-
-    #src_word = layers.squeeze(feat,axes=[2]) #src_word [-1,c,ww]
-
-    #feat = layers.transpose(feat, [0,3,1,2])
-    #src_word = layers.reshape(feat,[-1,w, c*h])
-    #src_word = layers.im2sequence(
-    #    input=feat,
-    #    stride=[1, 1],
-    #    filter_size=[feat.shape[2], 1])
-    #layers.Print(src_word,message="src_word",summarize=10)
-
-    # print('feat',feat)
-    #print("src_word",src_word)
+        conv_features, src_pos, src_slf_attn_bias = enc_inputs  #
+    b, t, c = conv_features.shape

     enc_input = prepare_encoder(
         conv_features,
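The commented-out experiments deleted in this hunk were all attempts to turn a CNN feature map of shape [b, c, h, w] into the [b, t, c] token sequence that prepare_encoder consumes. A minimal NumPy sketch of that reshape/transpose is shown below; the shapes and the row-major flattening order are assumptions for illustration, not something the commit prescribes.

import numpy as np


def feature_map_to_sequence(feat):
    """Flatten a CNN feature map [b, c, h, w] into a sequence [b, h*w, c].

    Each spatial location becomes one "token" whose embedding is the channel
    vector at that location, matching the [b, t, c] layout prepare_encoder expects.
    """
    b, c, h, w = feat.shape
    # [b, c, h, w] -> [b, h, w, c] -> [b, h*w, c]
    return feat.transpose(0, 2, 3, 1).reshape(b, h * w, c)


if __name__ == "__main__":
    feat = np.random.rand(2, 512, 8, 32).astype("float32")   # batch of 2 feature maps
    seq = feature_map_to_sequence(feat)
    print(seq.shape)  # (2, 256, 512): 256 tokens of width 512 per image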
@@ -755,37 +471,7 @@ def wrap_encoder(src_vocab_size,
             encoder_data_input_fields)
     else:
         src_word, src_pos, src_slf_attn_bias = enc_inputs  #
-    #"""
-    # insert cnn
-    #"""
-    #import basemodel
-    # feat = basemodel.resnet_50(img)
-
-    # mycrnn = basemodel.CRNN()
-    # feat = mycrnn.ocr_convs(img,use_cudnn=TrainTaskConfig.use_gpu)
-    # b, c, w, h = feat.shape
-    # src_word = layers.reshape(feat, shape=[-1, c, w * h])
-
-    #myconv8 = basemodel.conv8()
-    #feat = myconv8.net(img )
-    #b , c, h, w = feat.shape#h=6
-
-    #print(feat)
-    #layers.Print(feat,message="conv_feat",summarize=10)
-
-    #feat =layers.conv2d(feat,c,filter_size =[4 , 1],act="relu")
-    #feat = layers.pool2d(feat,pool_stride=(3,1),pool_size=(3,1))
-
-    #src_word = layers.squeeze(feat,axes=[2]) #src_word [-1,c,ww]
-
-    #feat = layers.transpose(feat, [0,3,1,2])
-    #src_word = layers.reshape(feat,[-1,w, c*h])
-    #src_word = layers.im2sequence(
-    #    input=feat,
-    #    stride=[1, 1],
-    #    filter_size=[feat.shape[2], 1])
-    #layers.Print(src_word,message="src_word",summarize=10)
-
-    # print('feat',feat)
-    #print("src_word",src_word)

     enc_input = prepare_decoder(
         src_word,
         src_pos,
@@ -811,248 +497,3 @@ def wrap_encoder(src_vocab_size,
         preprocess_cmd,
         postprocess_cmd, )
     return enc_output
-
-
-def wrap_decoder(trg_vocab_size,
-                 max_length,
-                 n_layer,
-                 n_head,
-                 d_key,
-                 d_value,
-                 d_model,
-                 d_inner_hid,
-                 prepostprocess_dropout,
-                 attention_dropout,
-                 relu_dropout,
-                 preprocess_cmd,
-                 postprocess_cmd,
-                 weight_sharing,
-                 dec_inputs=None,
-                 enc_output=None,
-                 caches=None,
-                 gather_idx=None,
-                 bos_idx=0):
-    """
-    The wrapper assembles together all needed layers for the decoder.
-    """
-    if dec_inputs is None:
-        # This is used to implement independent decoder program in inference.
-        trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, enc_output = \
-            make_all_inputs(decoder_data_input_fields)
-    else:
-        trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs
-
-    dec_input = prepare_decoder(
-        trg_word,
-        trg_pos,
-        trg_vocab_size,
-        d_model,
-        max_length,
-        prepostprocess_dropout,
-        bos_idx=bos_idx,
-        word_emb_param_name="src_word_emb_table"
-        if weight_sharing else "trg_word_emb_table")
-    dec_output = decoder(
-        dec_input,
-        enc_output,
-        trg_slf_attn_bias,
-        trg_src_attn_bias,
-        n_layer,
-        n_head,
-        d_key,
-        d_value,
-        d_model,
-        d_inner_hid,
-        prepostprocess_dropout,
-        attention_dropout,
-        relu_dropout,
-        preprocess_cmd,
-        postprocess_cmd,
-        caches=caches,
-        gather_idx=gather_idx)
-    return dec_output
-    # Reshape to 2D tensor to use GEMM instead of BatchedGEMM
-    dec_output = layers.reshape(
-        dec_output, shape=[-1, dec_output.shape[-1]], inplace=True)
-    if weight_sharing:
-        predict = layers.matmul(
-            x=dec_output,
-            y=fluid.default_main_program().global_block().var(
-                "trg_word_emb_table"),
-            transpose_y=True)
-    else:
-        predict = layers.fc(input=dec_output,
-                            size=trg_vocab_size,
-                            bias_attr=False)
-    if dec_inputs is None:
-        # Return probs for independent decoder program.
-        predict = layers.softmax(predict)
-    return predict
-
-
-def fast_decode(src_vocab_size,
-                trg_vocab_size,
-                max_in_len,
-                n_layer,
-                n_head,
-                d_key,
-                d_value,
-                d_model,
-                d_inner_hid,
-                prepostprocess_dropout,
-                attention_dropout,
-                relu_dropout,
-                preprocess_cmd,
-                postprocess_cmd,
-                weight_sharing,
-                beam_size,
-                max_out_len,
-                bos_idx,
-                eos_idx,
-                use_py_reader=False):
-    """
-    Use beam search to decode. Caches will be used to store states of history
-    steps which can make the decoding faster.
-    """
-    data_input_names = encoder_data_input_fields + fast_decoder_data_input_fields
-
-    if use_py_reader:
-        all_inputs, reader = make_all_py_reader_inputs(data_input_names)
-    else:
-        all_inputs = make_all_inputs(data_input_names)
-
-    enc_inputs_len = len(encoder_data_input_fields)
-    dec_inputs_len = len(fast_decoder_data_input_fields)
-    enc_inputs = all_inputs[0:enc_inputs_len]  #enc_inputs tensor
-    dec_inputs = all_inputs[enc_inputs_len:enc_inputs_len +
-                            dec_inputs_len]  #dec_inputs tensor
-
-    enc_output = wrap_encoder(
-        src_vocab_size,
-        64,  ##to do !!!!!????
-        n_layer,
-        n_head,
-        d_key,
-        d_value,
-        d_model,
-        d_inner_hid,
-        prepostprocess_dropout,
-        attention_dropout,
-        relu_dropout,
-        preprocess_cmd,
-        postprocess_cmd,
-        weight_sharing,
-        enc_inputs,
-        bos_idx=bos_idx)
-    start_tokens, init_scores, parent_idx, trg_src_attn_bias = dec_inputs
-
-    def beam_search():
-        max_len = layers.fill_constant(
-            shape=[1],
-            dtype=start_tokens.dtype,
-            value=max_out_len,
-            force_cpu=True)
-        step_idx = layers.fill_constant(
-            shape=[1], dtype=start_tokens.dtype, value=0, force_cpu=True)
-        cond = layers.less_than(x=step_idx, y=max_len)  # default force_cpu=True
-        while_op = layers.While(cond)
-        # array states will be stored for each step.
-        ids = layers.array_write(
-            layers.reshape(start_tokens, (-1, 1)), step_idx)
-        scores = layers.array_write(init_scores, step_idx)
-        # cell states will be overwrited at each step.
-        # caches contains states of history steps in decoder self-attention
-        # and static encoder output projections in encoder-decoder attention
-        # to reduce redundant computation.
-        caches = [
-            {
-                "k":  # for self attention
-                layers.fill_constant_batch_size_like(
-                    input=start_tokens,
-                    shape=[-1, n_head, 0, d_key],
-                    dtype=enc_output.dtype,
-                    value=0),
-                "v":  # for self attention
-                layers.fill_constant_batch_size_like(
-                    input=start_tokens,
-                    shape=[-1, n_head, 0, d_value],
-                    dtype=enc_output.dtype,
-                    value=0),
-                "static_k":  # for encoder-decoder attention
-                layers.create_tensor(dtype=enc_output.dtype),
-                "static_v":  # for encoder-decoder attention
-                layers.create_tensor(dtype=enc_output.dtype)
-            } for i in range(n_layer)
-        ]
-
-        with while_op.block():
-            pre_ids = layers.array_read(array=ids, i=step_idx)
-            # Since beam_search_op dosen't enforce pre_ids' shape, we can do
-            # inplace reshape here which actually change the shape of pre_ids.
-            pre_ids = layers.reshape(pre_ids, (-1, 1, 1), inplace=True)
-            pre_scores = layers.array_read(array=scores, i=step_idx)
-            # gather cell states corresponding to selected parent
-            pre_src_attn_bias = layers.gather(
-                trg_src_attn_bias, index=parent_idx)
-            pre_pos = layers.elementwise_mul(
-                x=layers.fill_constant_batch_size_like(
-                    input=pre_src_attn_bias,  # cann't use lod tensor here
-                    value=1,
-                    shape=[-1, 1, 1],
-                    dtype=pre_ids.dtype),
-                y=step_idx,
-                axis=0)
-            logits = wrap_decoder(
-                trg_vocab_size,
-                max_in_len,
-                n_layer,
-                n_head,
-                d_key,
-                d_value,
-                d_model,
-                d_inner_hid,
-                prepostprocess_dropout,
-                attention_dropout,
-                relu_dropout,
-                preprocess_cmd,
-                postprocess_cmd,
-                weight_sharing,
-                dec_inputs=(pre_ids, pre_pos, None, pre_src_attn_bias),
-                enc_output=enc_output,
-                caches=caches,
-                gather_idx=parent_idx,
-                bos_idx=bos_idx)
-            # intra-beam topK
-            topk_scores, topk_indices = layers.topk(
-                input=layers.softmax(logits), k=beam_size)
-            accu_scores = layers.elementwise_add(
-                x=layers.log(topk_scores), y=pre_scores, axis=0)
-            # beam_search op uses lod to differentiate branches.
-            accu_scores = layers.lod_reset(accu_scores, pre_ids)
-            # topK reduction across beams, also contain special handle of
-            # end beams and end sentences(batch reduction)
-            selected_ids, selected_scores, gather_idx = layers.beam_search(
-                pre_ids=pre_ids,
-                pre_scores=pre_scores,
-                ids=topk_indices,
-                scores=accu_scores,
-                beam_size=beam_size,
-                end_id=eos_idx,
-                return_parent_idx=True)
-            layers.increment(x=step_idx, value=1.0, in_place=True)
-            # cell states(caches) have been updated in wrap_decoder,
-            # only need to update beam search states here.
-            layers.array_write(selected_ids, i=step_idx, array=ids)
-            layers.array_write(selected_scores, i=step_idx, array=scores)
-            layers.assign(gather_idx, parent_idx)
-            layers.assign(pre_src_attn_bias, trg_src_attn_bias)
-            length_cond = layers.less_than(x=step_idx, y=max_len)
-            finish_cond = layers.logical_not(layers.is_empty(x=selected_ids))
-            layers.logical_and(x=length_cond, y=finish_cond, out=cond)
-
-        finished_ids, finished_scores = layers.beam_search_decode(
-            ids, scores, beam_size=beam_size, end_id=eos_idx)
-        return finished_ids, finished_scores
-
-    finished_ids, finished_scores = beam_search()
-    return finished_ids, finished_scores, reader if use_py_reader else None
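The removed fast_decode builds its beam search from fluid ops: a top-k over the softmax inside each beam, accumulation of log probabilities onto the running hypothesis scores, then layers.beam_search to reduce across beams. The toy NumPy sketch below shows only that score bookkeeping for a single step: add each hypothesis's running score to the log probabilities of its next-token candidates and keep the best k (hypothesis, token) pairs overall. It ignores the k/v caches, LoD handling, and end-of-sentence logic, and every name in it is illustrative.

import numpy as np


def beam_step(prev_scores, next_log_probs, beam_size):
    """One step of beam-search score bookkeeping.

    prev_scores    : [beam]        accumulated log probabilities of live hypotheses
    next_log_probs : [beam, vocab] next-token log probabilities per hypothesis
    Returns (parent_idx, token_ids, new_scores) for the beam_size best extensions.
    """
    beam, vocab = next_log_probs.shape
    # Every (hypothesis, token) pair, scored by accumulated log probability.
    total = prev_scores[:, None] + next_log_probs        # [beam, vocab]
    flat = total.reshape(-1)
    best = np.argsort(-flat)[:beam_size]                 # top-k across all beams
    parent_idx = best // vocab                           # which hypothesis to extend
    token_ids = best % vocab                             # which token extends it
    return parent_idx, token_ids, flat[best]


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    scores = np.log(np.array([0.6, 0.4]))                # two live hypotheses
    step_probs = rng.dirichlet(np.ones(10), size=2)      # fake next-token distributions
    print(beam_step(scores, np.log(step_probs), beam_size=2))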