Unverified commit 2fa3e51a, authored by 0YuanZhang0, committed by GitHub

fix_python3_bug (#3461)

Parent 313d0666
@@ -9,7 +9,7 @@ Model ensemble can improve the generalization of MRC models. However, such appro
- Python >= 2.7
- cuda >= 9.0
- cudnn >= 7.0
- PaddlePaddle >= 1.6 Please refer to the [Installation Guide](http://www.paddlepaddle.org/#quick-start)
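For reference, a GPU build of PaddlePaddle can usually be installed with pip; the exact package and version spec below are only an example and depend on your CUDA/cuDNN setup, so follow the Installation Guide above when in doubt:
```
# example only: pick the 1.6.x build (or .post variant) matching your CUDA/cuDNN versions
pip install paddlepaddle-gpu==1.6.1
```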
### Data and Models Preparation
Users can directly download the data and the trained knowledge_distillation models we provide:
...
@@ -18,6 +18,7 @@ from __future__ import division
from __future__ import print_function
from functools import partial
from functools import reduce
import numpy as np
import paddle.fluid as fluid
...
@@ -15,7 +15,7 @@ PALM user guide: [README.md](https://github.com/PaddlePaddle/PALM/blob/master/RE
- Python >= 2.7
- cuda >= 9.0
- cudnn >= 7.0
- PaddlePaddle >= 1.6 Please refer to the [Installation Guide](http://www.paddlepaddle.org/#quick-start)
### Data Preparation
#### Get data directly:
...
@@ -12,6 +12,17 @@ bash wget_server_inference_model.sh
```
## Start server
We can choose the GPU card for the bert server or the xlnet server by setting the environment variable CUDA_VISIBLE_DEVICES:
```
export CUDA_VISIBLE_DEVICES=1
```
In the main_server.py file we set the server ports for the bert and xlnet models, as shown below. If port 5118 or 5120 is occupied, please switch to an idle port.
```
url_1 = 'http://127.0.0.1:5118' # url for model1
url_2 = 'http://127.0.0.1:5120' # url for model2
```
Start the server:
```
bash start.sh
```
cd bert_server
export CUDA_VISIBLE_DEVICES=1
sh start.sh
cd ../xlnet_server
export CUDA_VISIBLE_DEVICES=2
sh serve.sh
cd ..
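If you prefer to launch the two backend servers by hand, a minimal sketch (assuming the bert_server/ and xlnet_server/ directories above) is to pin each one to its own card via CUDA_VISIBLE_DEVICES; the card ids here are examples:
```
# sketch: launch each backend server on its own GPU card
cd bert_server && CUDA_VISIBLE_DEVICES=0 sh start.sh
cd ../xlnet_server && CUDA_VISIBLE_DEVICES=1 sh serve.sh
cd ..
```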
...
@@ -41,144 +41,148 @@ def positional_embedding(pos_seq, inv_freq, bsz=None):
def positionwise_ffn(inp, d_model, d_inner, dropout_prob, param_initializer=None,
                     act_type='relu', name='ff'):
    """Position-wise Feed-forward Network."""
    if act_type not in ['relu', 'gelu']:
        raise ValueError('Unsupported activation type {}'.format(act_type))

    output = fluid.layers.fc(input=inp, size=d_inner, act=act_type,
                num_flatten_dims=2,
                param_attr=fluid.ParamAttr(
                    name=name+'_layer_1_weight', initializer=param_initializer),
                bias_attr=name+'_layer_1_bias')
    output = fluid.layers.dropout(output, dropout_prob=dropout_prob,
                dropout_implementation="upscale_in_train", is_test=False)
    output = fluid.layers.fc(output, size=d_model,
                num_flatten_dims=2,
                param_attr=fluid.ParamAttr(
                    name=name+'_layer_2_weight', initializer=param_initializer),
                bias_attr=name+'_layer_2_bias')
    output = fluid.layers.dropout(output, dropout_prob=dropout_prob,
                dropout_implementation="upscale_in_train", is_test=False)
    output = fluid.layers.layer_norm(output + inp, begin_norm_axis=len(output.shape)-1,
                epsilon=1e-12,
                param_attr=fluid.ParamAttr(name=name+'_layer_norm_scale',
                    initializer=fluid.initializer.Constant(1.)),
                bias_attr=fluid.ParamAttr(name+'_layer_norm_bias',
                    initializer=fluid.initializer.Constant(0.)))
    return output
def head_projection(h, d_model, n_head, d_head, param_initializer, name=''):
    """Project hidden states to a specific head with a 4D-shape."""
    proj_weight = fluid.layers.create_parameter(
        shape=[d_model, n_head, d_head],
        dtype=h.dtype,
        attr=fluid.ParamAttr(name=name+'_weight', initializer=param_initializer),
        is_bias=False)

    # ibh,hnd->ibnd
    head = fluid.layers.mul(x=h, y=proj_weight, x_num_col_dims=2, y_num_col_dims=1)
    return head
def post_attention(h, attn_vec, d_model, n_head, d_head, dropout,
                   param_initializer, residual=True, name=''):
    """Post-attention processing."""
    # post-attention projection (back to `d_model`)
    proj_o = fluid.layers.create_parameter(
        shape=[d_model, n_head, d_head],
        dtype=h.dtype,
        attr=fluid.ParamAttr(name=name+'_o_weight', initializer=param_initializer),
        is_bias=False)

    # ibnd,hnd->ibh
    proj_o = fluid.layers.transpose(proj_o, perm=[1, 2, 0])
    attn_out = fluid.layers.mul(x=attn_vec, y=proj_o, x_num_col_dims=2, y_num_col_dims=2)

    attn_out = fluid.layers.dropout(attn_out, dropout_prob=dropout,
                dropout_implementation="upscale_in_train", is_test=False)
    if residual:
        output = fluid.layers.layer_norm(attn_out + h, begin_norm_axis=len(attn_out.shape)-1,
                    epsilon=1e-12,
                    param_attr=fluid.ParamAttr(name=name+'_layer_norm_scale',
                        initializer=fluid.initializer.Constant(1.)),
                    bias_attr=fluid.ParamAttr(name+'_layer_norm_bias',
                        initializer=fluid.initializer.Constant(0.)))
    else:
        output = fluid.layers.layer_norm(attn_out, begin_norm_axis=len(attn_out.shape)-1,
                    epsilon=1e-12,
                    param_attr=fluid.ParamAttr(name=name+'_layer_norm_scale',
                        initializer=fluid.initializer.Constant(1.)),
                    bias_attr=fluid.ParamAttr(name+'_layer_norm_bias',
                        initializer=fluid.initializer.Constant(0.)))

    return output
def abs_attn_core(q_head, k_head, v_head, attn_mask, dropatt, scale):
    """Core absolute positional attention operations."""
    attn_score = einsum4x4('ibnd,jbnd->ijbn', q_head, k_head)
    attn_score *= scale
    if attn_mask is not None:
        attn_score = attn_score - 1e30 * attn_mask

    # attention probability
    attn_prob = fluid.layers.softmax(attn_score, axis=1)
    attn_prob = fluid.layers.dropout(attn_prob, dropout_prob=dropatt,
                dropout_implementation="upscale_in_train", is_test=False)

    # attention output
    attn_vec = einsum4x4('ijbn,jbnd->ibnd', attn_prob, v_head)

    return attn_vec
def rel_attn_core(q_head, k_head_h, v_head_h, k_head_r, seg_embed, seg_mat,
                  r_w_bias, r_r_bias, r_s_bias, attn_mask, dropatt,
                  scale):
    """Core relative positional attention operations."""
    ## content based attention score
    ac = einsum4x4('ibnd,jbnd->ijbn', fluid.layers.elementwise_add(q_head, r_w_bias, 2), k_head_h)

    # position based attention score
    bd = einsum4x4('ibnd,jbnd->ijbn', fluid.layers.elementwise_add(q_head, r_r_bias, 2), k_head_r)
    #klen = fluid.layers.slice(fluid.layers.shape(ac), axes=[0], starts=[1], ends=[2])
    bd = rel_shift(bd, klen=ac.shape[1])

    # segment based attention score
    if seg_mat is None:
        ef = 0
    else:
        ef = 0
        """
        bsz = fluid.layers.slice(fluid.layers.shape(q_head), axes=[0], starts=[1], ends=[2])
        bsz.stop_gradient = True
        """
        #seg_embed = fluid.layers.unsqueeze(input=seg_embed, axes=[0])
        seg_embed = fluid.layers.stack([seg_embed]*q_head.shape[0], axis=0)
        ef = einsum4x4('ibnd,isnd->ibns', fluid.layers.elementwise_add(q_head, r_s_bias, 2), seg_embed)
        ef = einsum4x4('ijbs,ibns->ijbn', seg_mat, ef)

    # merge attention scores and perform masking
    attn_score = (ac + bd + ef) * scale
    if attn_mask is not None:
        # attn_score = attn_score * (1 - attn_mask) - 1e30 * attn_mask
        attn_score = attn_score - 1e30 * attn_mask

    # attention probability
    #attn_prob = fluid.layers.softmax(attn_score, axis=1)
    attn_score = fluid.layers.transpose(attn_score, [0, 2, 3, 1])
    attn_prob = fluid.layers.softmax(attn_score)
    attn_prob = fluid.layers.transpose(attn_prob, [0, 3, 1, 2])
    attn_prob = fluid.layers.dropout(attn_prob, dropatt,
                dropout_implementation="upscale_in_train")

    # attention output
    attn_vec = einsum4x4('ijbn,jbnd->ibnd', attn_prob, v_head_h)

    return attn_vec
def rel_shift(x, klen=-1):
    """perform relative shift to form the relative attention score."""
...
@@ -192,67 +196,69 @@ def rel_shift(x, klen=-1):
def _cache_mem(curr_out, prev_mem, mem_len, reuse_len=None):
    """cache hidden states into memory."""
    if mem_len is None or mem_len == 0:
        return None
    else:
        if reuse_len is not None and reuse_len > 0:
            curr_out = curr_out[:reuse_len]

        if prev_mem is None:
            new_mem = curr_out[-mem_len:]
        else:
            # concatenate previous memory with the current output along the time axis
            new_mem = fluid.layers.concat([prev_mem, curr_out], axis=0)[-mem_len:]

        new_mem.stop_gradient = True
        return new_mem
def relative_positional_encoding(qlen, klen, d_model, clamp_len, attn_type,
                                 bi_data, bsz=None, dtype=None):
    """create relative positional encoding."""
    freq_seq = fluid.layers.range(0, d_model, 2.0, 'float32')
    if dtype is not None and dtype != 'float32':
        freq_seq = fluid.layers.cast(freq_seq, dtype=dtype)
    inv_freq = 1 / (10000 ** (freq_seq / d_model))

    if attn_type == 'bi':
        beg, end = klen, -qlen
    elif attn_type == 'uni':
        beg, end = klen, -1
    else:
        raise ValueError('Unknown `attn_type` {}.'.format(attn_type))

    if bi_data:
        fwd_pos_seq = fluid.layers.range(beg, end, -1.0, 'float32')
        bwd_pos_seq = fluid.layers.range(-beg, -end, 1.0, 'float32')

        if dtype is not None and dtype != 'float32':
            fwd_pos_seq = fluid.layers.cast(fwd_pos_seq, dtype='float32')
            bwd_pos_seq = fluid.layers.cast(bwd_pos_seq, dtype='float32')

        if clamp_len > 0:
            fwd_pos_seq = fluid.layers.clip(fwd_pos_seq, -clamp_len, clamp_len)
            bwd_pos_seq = fluid.layers.clip(bwd_pos_seq, -clamp_len, clamp_len)

        if bsz is not None:
            # With bi_data, the batch size should be divisible by 2.
            assert bsz % 2 == 0
            fwd_pos_emb = positional_embedding(fwd_pos_seq, inv_freq, bsz//2)
            bwd_pos_emb = positional_embedding(bwd_pos_seq, inv_freq, bsz//2)
        else:
            fwd_pos_emb = positional_embedding(fwd_pos_seq, inv_freq)
            bwd_pos_emb = positional_embedding(bwd_pos_seq, inv_freq)

        pos_emb = fluid.layers.concat([fwd_pos_emb, bwd_pos_emb], axis=1)
    else:
        fwd_pos_seq = fluid.layers.range(beg, end, -1.0, 'float32')
        if dtype is not None and dtype != 'float32':
            fwd_pos_seq = fluid.layers.cast(fwd_pos_seq, dtype=dtype)
        if clamp_len > 0:
            fwd_pos_seq = fluid.layers.clip(fwd_pos_seq, -clamp_len, clamp_len)
        pos_emb = positional_embedding(fwd_pos_seq, inv_freq, bsz)

    fluid.layers.reshape(pos_emb, [2*qlen, -1, d_model], inplace=True)
    return pos_emb
def rel_multihead_attn(h, r, r_w_bias, r_r_bias, seg_mat, r_s_bias, seg_embed,
                       attn_mask, mems, d_model, n_head, d_head, dropout,
@@ -299,58 +305,58 @@ def transformer_xl(inp_k, n_token, n_layer, d_model, n_head,
                   use_fp16=False, name='', **kwargs):
    """
    Defines a Transformer-XL computation graph with additional
    support for XLNet.

    Args:
    inp_k: int32 Tensor in shape [len, bsz], the input token IDs.
    seg_id: int32 Tensor in shape [len, bsz], the input segment IDs.
    input_mask: float32 Tensor in shape [len, bsz], the input mask.
        0 for real tokens and 1 for padding.
    mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
        from previous batches. The length of the list equals n_layer.
        If None, no memory is used.
    perm_mask: float32 Tensor in shape [len, len, bsz].
        If perm_mask[i, j, k] = 0, i attends to j in batch k;
        if perm_mask[i, j, k] = 1, i does not attend to j in batch k.
        If None, each position attends to all the others.
    target_mapping: float32 Tensor in shape [num_predict, len, bsz].
        If target_mapping[i, j, k] = 1, the i-th prediction in batch k is
        on the j-th token.
        Only used during pretraining for partial prediction.
        Set to None during finetuning.
    inp_q: float32 Tensor in shape [len, bsz].
        1 for tokens with losses and 0 for tokens without losses.
        Only used during pretraining for two-stream attention.
        Set to None during finetuning.
    n_layer: int, the number of layers.
    d_model: int, the hidden size.
    n_head: int, the number of attention heads.
    d_head: int, the dimension size of each attention head.
    d_inner: int, the hidden size in feed-forward layers.
    ff_activation: str, "relu" or "gelu".
    untie_r: bool, whether to untie the biases in attention.
    n_token: int, the vocab size.
    is_training: bool, whether in training mode.
    use_tpu: bool, whether TPUs are used.
    use_fp16: bool, use float16 instead of float32.
    dropout: float, dropout rate.
    dropatt: float, dropout rate on attention probabilities.
    init: str, the initialization scheme, either "normal" or "uniform".
    init_range: float, initialize the parameters with a uniform distribution
        in [-init_range, init_range]. Only effective when init="uniform".
    init_std: float, initialize the parameters with a normal distribution
        with mean 0 and stddev init_std. Only effective when init="normal".
    mem_len: int, the number of tokens to cache.
    reuse_len: int, the number of tokens in the current batch to be cached
        and reused in the future.
    bi_data: bool, whether to use bidirectional input pipeline.
        Usually set to True during pretraining and False during finetuning.
    clamp_len: int, clamp all relative distances larger than clamp_len.
        -1 means no clamping.
    same_length: bool, whether to use the same attention length for each token.
    summary_type: str, "last", "first", "mean", or "attn". The method
        to pool the input to get a vector representation.
    initializer: A tf initializer.
    scope: scope name for the computation graph.
    """
    print('memory input {}'.format(mems))
    data_type = "float16" if use_fp16 else "float32"
@@ -365,7 +371,7 @@ def transformer_xl(inp_k, n_token, n_layer, d_model, n_head,
    # causal attention mask
    if attn_type == 'uni':
        attn_mask = fluid.layers.create_global_var(
            name='attn_mask',
            shape=[qlen, klen, 1, 1],
            value=0.0,
            dtype=data_type, persistable=True)
@@ -413,21 +419,21 @@ def transformer_xl(inp_k, n_token, n_layer, d_model, n_head,
        non_tgt_mask = None

    if untie_r:
        r_w_bias = fluid.layers.create_parameter(shape=[n_layer, n_head, d_head], dtype=data_type,
                        attr=fluid.ParamAttr(name=name+'_r_w_bias', initializer=initializer),
                        is_bias=True)
        r_w_bias = [fluid.layers.slice(r_w_bias, axes=[0], starts=[i], ends=[i+1]) for i in range(n_layer)]
        r_w_bias = [fluid.layers.squeeze(r_w_bias[i], axes=[0]) for i in range(n_layer)]
        r_r_bias = fluid.layers.create_parameter(shape=[n_layer, n_head, d_head], dtype=data_type,
                        attr=fluid.ParamAttr(name=name+'_r_r_bias', initializer=initializer),
                        is_bias=True)
        r_r_bias = [fluid.layers.slice(r_r_bias, axes=[0], starts=[i], ends=[i+1]) for i in range(n_layer)]
        r_r_bias = [fluid.layers.squeeze(r_r_bias[i], axes=[0]) for i in range(n_layer)]
    else:
        r_w_bias = fluid.layers.create_parameter(shape=[n_head, d_head], dtype=data_type,
                        attr=fluid.ParamAttr(name=name+'_r_w_bias', initializer=initializer),
                        is_bias=True)
        r_r_bias = fluid.layers.create_parameter(shape=[n_head, d_head], dtype=data_type,
                        attr=fluid.ParamAttr(name=name+'_r_r_bias', initializer=initializer),
                        is_bias=True)
@@ -442,28 +448,28 @@ def transformer_xl(inp_k, n_token, n_layer, d_model, n_head,
        param_attr=fluid.ParamAttr(name=name+'_word_embedding', initializer=initializer))

    if inp_q is not None:
        pass
    output_h = fluid.layers.dropout(word_emb_k, dropout_prob=dropout,
                    dropout_implementation="upscale_in_train")

    if inp_q is not None:
        pass

    if seg_id is not None:
        if untie_r:
            r_s_bias = fluid.layers.create_parameter(shape=[n_layer, n_head, d_head], dtype=data_type,
                            attr=fluid.ParamAttr(name=name+'_r_s_bias', initializer=initializer),
                            is_bias=True)
            r_s_bias = [fluid.layers.slice(r_s_bias, axes=[0], starts=[i], ends=[i+1]) for i in range(n_layer)]
            r_s_bias = [fluid.layers.squeeze(r_s_bias[i], axes=[0]) for i in range(n_layer)]
        else:
            r_s_bias = fluid.layers.create_parameter(shape=[n_head, d_head], dtype=data_type,
                            attr=fluid.ParamAttr(name=name+'_r_s_bias', initializer=initializer),
                            is_bias=True)

        seg_embed = fluid.layers.create_parameter(shape=[n_layer, 2, n_head, d_head],
                        dtype=data_type, attr=fluid.ParamAttr(name=name+'_seg_embed',
                        initializer=initializer))
        seg_embed = [fluid.layers.slice(seg_embed, axes=[0], starts=[i], ends=[i+1]) for i in range(n_layer)]
        seg_embed = [fluid.layers.squeeze(seg_embed[i], axes=[0]) for i in range(n_layer)]
@@ -497,7 +503,7 @@ def transformer_xl(inp_k, n_token, n_layer, d_model, n_head,
    pos_emb.stop_gradient = True

    ##### Attention layers
    if mems is None:
        mems = [None] * n_layer
    for i in range(n_layer):
        # cache new mems
        #new_mems.append(_cache_mem(output_h, mems[i], mem_len, reuse_len))
...
@@ -548,6 +554,7 @@ def transformer_xl(inp_k, n_token, n_layer, d_model, n_head,
        new_mems = None

    return output, new_mems, lookup_table
def lm_loss(hidden, target, n_token, d_model, initializer, lookup_table=None,
            tie_weight=False, bi_data=True):
@@ -578,53 +585,54 @@ def summarize_sequence(summary_type, hidden, d_model, n_head, d_head, dropout,
                       dropatt, input_mask, is_training, initializer,
                       scope=None, reuse=None, use_proj=True):
    """
    Different classification tasks may or may not share the same parameters
    to summarize the sequence features.

    If shared, one can keep the `scope` to the default value `None`.
    Otherwise, one should specify a different `scope` for each task.
    """

    with tf.variable_scope(scope, 'sequnece_summary', reuse=reuse):
        if summary_type == 'last':
            summary = hidden[-1]
        elif summary_type == 'first':
            summary = hidden[0]
        elif summary_type == 'mean':
            summary = tf.reduce_mean(hidden, axis=0)
        elif summary_type == 'attn':
            bsz = tf.shape(hidden)[1]

            summary_bias = tf.get_variable('summary_bias', [d_model],
                                           dtype=hidden.dtype,
                                           initializer=initializer)
            summary_bias = tf.tile(summary_bias[None, None], [1, bsz, 1])

            if input_mask is not None:
                input_mask = input_mask[None, :, :, None]

            summary = multihead_attn(summary_bias, hidden, hidden, input_mask,
                                     d_model, n_head, d_head, dropout, dropatt,
                                     is_training, initializer, residual=False)
            summary = summary[0]
        else:
            raise ValueError('Unsupported summary type {}'.format(summary_type))

        # use another projection as in BERT
        if use_proj:
            summary = tf.layers.dense(
                summary,
                d_model,
                activation=tf.tanh,
                initializer=initializer,
                name='summary')

        # dropout
        summary = tf.layers.dropout(
            summary, dropout, training=is_training,
            name='dropout')

    return summary
def classification_loss(hidden, labels, n_class, initializer, name, reuse=None,
                        return_logits=False):
...
@@ -641,10 +649,10 @@ def classification_loss(hidden, labels, n_class, initializer, name, reuse=None,
        param_attr=fluid.ParamAttr(name=name+'_logits', initializer=initializer))

    one_hot_target = fluid.layers.one_hot(labels, depth=n_class, dtype=hidden.dtype)
    loss = -fluid.layers.reduce_sum(fluid.layers.log_softmax(logits) * one_hot_target, -1)

    if return_logits:
        return loss, logits

    return loss
@@ -661,6 +669,6 @@ def regression_loss(hidden, labels, initializer, name='transformer',
    loss = fluid.layers.square(logits - labels)

    if return_logits:
        return loss, logits

    return loss
@@ -12,126 +12,126 @@ SPIECE_UNDERLINE = '▁'
def printable_text(text):
    """Returns text encoded in a way suitable for print or `tf.logging`."""

    # These functions want `str` for both Python2 and Python3, but in one case
    # it's a Unicode string and in the other it's a byte string.
    if six.PY3:
        if isinstance(text, str):
            return text
        elif isinstance(text, bytes):
            return text.decode("utf-8", "ignore")
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    elif six.PY2:
        if isinstance(text, str):
            return text
        elif isinstance(text, unicode):
            return text.encode("utf-8")
        else:
            raise ValueError("Unsupported string type: %s" % (type(text)))
    else:
        raise ValueError("Not running on Python2 or Python 3?")
def print_(*args):
    new_args = []
    for arg in args:
        if isinstance(arg, list):
            s = [printable_text(i) for i in arg]
            s = ' '.join(s)
            new_args.append(s)
        else:
            new_args.append(printable_text(arg))
    print(*new_args)
def preprocess_text(inputs, lower=False, remove_space=True, keep_accents=False):
    if remove_space:
        outputs = ' '.join(inputs.strip().split())
    else:
        outputs = inputs
    outputs = outputs.replace("``", '"').replace("''", '"')

    if six.PY2 and isinstance(outputs, str):
        outputs = outputs.decode('utf-8')

    if not keep_accents:
        outputs = unicodedata.normalize('NFKD', outputs)
        outputs = ''.join([c for c in outputs if not unicodedata.combining(c)])
    if lower:
        outputs = outputs.lower()

    return outputs
def encode_pieces(sp_model, text, return_unicode=True, sample=False):
    # return_unicode is used only for py2

    # note(zhiliny): in some systems, sentencepiece only accepts str for py2
    if six.PY2 and isinstance(text, unicode):
        text = text.encode('utf-8')

    if not sample:
        pieces = sp_model.EncodeAsPieces(text)
    else:
        pieces = sp_model.SampleEncodeAsPieces(text, 64, 0.1)
    new_pieces = []
    for piece in pieces:
        if len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit():
            cur_pieces = sp_model.EncodeAsPieces(
                piece[:-1].replace(SPIECE_UNDERLINE, ''))
            if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
                if len(cur_pieces[0]) == 1:
                    cur_pieces = cur_pieces[1:]
                else:
                    cur_pieces[0] = cur_pieces[0][1:]
            cur_pieces.append(piece[-1])
            new_pieces.extend(cur_pieces)
        else:
            new_pieces.append(piece)

    # note(zhiliny): convert back to unicode for py2
    if six.PY2 and return_unicode:
        ret_pieces = []
        for piece in new_pieces:
            if isinstance(piece, str):
                piece = piece.decode('utf-8')
            ret_pieces.append(piece)
        new_pieces = ret_pieces

    return new_pieces
def encode_ids(sp_model, text, sample=False):
    pieces = encode_pieces(sp_model, text, return_unicode=False, sample=sample)
    ids = [sp_model.PieceToId(piece) for piece in pieces]
    return ids
if __name__ == '__main__':
    import sentencepiece as spm

    sp = spm.SentencePieceProcessor()
    sp.load('sp10m.uncased.v3.model')

    print_(u'I was born in 2000, and this is falsé.')
    print_(u'ORIGINAL', sp.EncodeAsPieces(u'I was born in 2000, and this is falsé.'))
    print_(u'OURS', encode_pieces(sp, u'I was born in 2000, and this is falsé.'))
    print(encode_ids(sp, u'I was born in 2000, and this is falsé.'))
    print_('')
    prepro_func = partial(preprocess_text, lower=True)
    print_(prepro_func('I was born in 2000, and this is falsé.'))
    print_('ORIGINAL', sp.EncodeAsPieces(prepro_func('I was born in 2000, and this is falsé.')))
    print_('OURS', encode_pieces(sp, prepro_func('I was born in 2000, and this is falsé.')))
    print(encode_ids(sp, prepro_func('I was born in 2000, and this is falsé.')))
    print_('')
    print_('I was born in 2000, and this is falsé.')
    print_('ORIGINAL', sp.EncodeAsPieces('I was born in 2000, and this is falsé.'))
    print_('OURS', encode_pieces(sp, 'I was born in 2000, and this is falsé.'))
    print(encode_ids(sp, 'I was born in 2000, and this is falsé.'))
    print_('')
    print_('I was born in 92000, and this is falsé.')
    print_('ORIGINAL', sp.EncodeAsPieces('I was born in 92000, and this is falsé.'))
    print_('OURS', encode_pieces(sp, 'I was born in 92000, and this is falsé.'))
    print(encode_ids(sp, 'I was born in 92000, and this is falsé.'))