ds2 offline cer 6p4287

341038b6 · Hui Zhang · 673cc4a0 · 341038b6 · 341038b6 · 341038b6
7 changed files
--- a/deepspeech/models/ds2/conv.py
+++ b/deepspeech/models/ds2/conv.py
@@ -41,13 +41,6 @@ def conv_output_size(I, F, P, S):
    return (I - F + 2 * P - S) // S


-# receptive field calculator
-# https://fomoro.com/research/article/receptive-field-calculator
-# https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters
-# https://distill.pub/2019/computing-receptive-fields/
-# Rl-1 = Sl * Rl + (Kl - Sl) 
-
-
 class ConvBn(nn.Layer):
    """Convolution layer with batch normalization.


--- a/deepspeech/modules/subsampling.py
+++ b/deepspeech/modules/subsampling.py
@@ -108,8 +108,8 @@ class Conv2dSubsampling4(BaseSubsampling):
            nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim))
        self.subsampling_rate = 4
        # The right context for every conv layer is computed by:
-        # (kernel_size - 1) / 2 * stride  * frame_rate_of_this_layer
-        # 6 = (3 - 1) / 2 * 2 * 1 + (3 - 1) / 2 * 2 * 2
+        # (kernel_size - 1) * frame_rate_of_this_layer
+        # 6 = (3 - 1) * 1 + (3 - 1) * 2
        self.right_context = 6

    def forward(self, x: paddle.Tensor, x_mask: paddle.Tensor, offset: int=0
@@ -160,10 +160,10 @@ class Conv2dSubsampling6(BaseSubsampling):
        # when Padding == 0, O = (I - F - S) // S
        self.linear = nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), odim)
        # The right context for every conv layer is computed by:
-        # (kernel_size - 1) / 2 * stride  * frame_rate_of_this_layer
-        # 14 = (3 - 1) / 2 * 2 * 1 + (5 - 1) / 2 * 3 * 2
+        # (kernel_size - 1) * frame_rate_of_this_layer
+        # 10 = (3 - 1) * 1 + (5 - 1) * 2
        self.subsampling_rate = 6
-        self.right_context = 14
+        self.right_context = 10

    def forward(self, x: paddle.Tensor, x_mask: paddle.Tensor, offset: int=0
                ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
@@ -214,8 +214,8 @@ class Conv2dSubsampling8(BaseSubsampling):
                                odim)
        self.subsampling_rate = 8
        # The right context for every conv layer is computed by:
-        # (kernel_size - 1) / 2 * stride  * frame_rate_of_this_layer
-        # 14 = (3 - 1) / 2 * 2 * 1 + (3 - 1) / 2 * 2 * 2 + (3 - 1) / 2 * 2 * 4
+        # (kernel_size - 1) * frame_rate_of_this_layer
+        # 14 = (3 - 1) * 1 + (3 - 1) * 2 + (3 - 1) * 4
        self.right_context = 14

    def forward(self, x: paddle.Tensor, x_mask: paddle.Tensor, offset: int=0

--- a/examples/aishell/s0/README.md
+++ b/examples/aishell/s0/README.md
@@ -10,7 +10,7 @@

 | Model | Params | Release | Config | Test set | Loss | CER |  
 | --- | --- | --- | --- | --- | --- | --- |  
-| DeepSpeech2 | 58.4M | 2.2.0 | conf/deepspeech2.yaml + spec aug + new datapipe | test | 6.396368026733398 | 0.068382 |  
+| DeepSpeech2 | 58.4M | 2.2.0 | conf/deepspeech2.yaml + spec aug | test | 5.71956205368042 | 0.064287 |  
 | DeepSpeech2 | 58.4M | 2.1.0 | conf/deepspeech2.yaml + spec aug | test | 7.483316898345947 | 0.077860 |  
 | DeepSpeech2 | 58.4M | 2.1.0 | conf/deepspeech2.yaml | test | 7.299022197723389 | 0.078671 |
 | DeepSpeech2 | 58.4M | 2.0.0 | conf/deepspeech2.yaml | test | - | 0.078977 |  

--- a/examples/aishell/s0/conf/deepspeech2.yaml
+++ b/examples/aishell/s0/conf/deepspeech2.yaml
@@ -42,7 +42,7 @@ model:
  share_rnn_weights: False

 training:
-  n_epoch: 50
+  n_epoch: 80
  lr: 2e-3
  lr_decay: 0.83
  weight_decay: 1e-06

--- a/examples/aishell/s0/local/train.sh
+++ b/examples/aishell/s0/local/train.sh
@@ -19,7 +19,7 @@ fi

 mkdir -p exp

-seed=1024
+seed=10086
 if [ ${seed} ]; then
    export FLAGS_cudnn_deterministic=True
 fi

--- a/utils/avg.sh
+++ b/utils/avg.sh
 #! /usr/bin/env bash

-if [ $# != 2 ]; then
-    echo "usage: ${0} ckpt_dir avg_num"
+if [ $# != 3 ]; then
+    echo "usage: ${0} ckpt_dir [best|latest] avg_num"
    exit -1
 fi

 ckpt_dir=${1}
-average_num=${2}
+avg_mode=${2} # best,latest
+average_num=${3}
 decode_checkpoint=${ckpt_dir}/avg_${average_num}.pdparams

-avg_model.py \
--dst_model ${decode_checkpoint} \
--ckpt_dir ${ckpt_dir}  \
--num ${average_num} \
--val_best
+if [ $avg_mode == best ];then
+    # best
+    avg_model.py \
+    --dst_model ${decode_checkpoint} \
+    --ckpt_dir ${ckpt_dir}  \
+    --num ${average_num} \
+    --val_best
+else
+    # latest
+    avg_model.py \
+    --dst_model ${decode_checkpoint} \
+    --ckpt_dir ${ckpt_dir}  \
+    --num ${average_num}
+fi

 if [ $? -ne 0 ]; then
    echo "Failed in avg ckpt!"

--- a/utils/tarball.sh
+++ b/utils/tarball.sh
 #!/bin/bash

-if [ $# != 4 ];then
-    echo "usage: $0 ckpt_prefix model_config mean_std vocab"
+if [ $# != 5 ];then
+    echo "usage: $0 ckpt_prefix model_config mean_std vocab pack_name"
    exit -1
 fi

@@ -9,6 +9,7 @@ ckpt_prefix=$1
 model_config=$2
 mean_std=$3
 vocab=$4
+pack_name=$5

 output=release

@@ -27,6 +28,6 @@ cp ${ckpt_prefix}.*  ${output}
 # model config, mean std, vocab
 cp ${model_config} ${mean_std} ${vocab} ${output}

-tar zcvf release.tar.gz ${output}
+tar zcvf ${pack_name}.release.tar.gz ${output}

-echo "tarball done!"
+echo "tarball: ${pack_name}.release.tar.gz done!"