diff --git a/README.md b/README.md
index 5c5dc3a0fee42979ae5eed71634e56b197c4c816..3c60db650d4023a36641c691c64e6c5f7b040c67 100644
--- a/README.md
+++ b/README.md
@@ -180,6 +180,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision
 ### Recent Update
 - 👑 2023.03.09: Add [Wav2vec2ASR-zh](./examples/aishell/asr3).
 - 🎉 2023.03.07: Add [TTS ARM Linux C++ Demo](./demos/TTSArmLinux).
+- 🔥 2023.03.03: Add Voice Conversion [StarGANv2-VC synthesize pipeline](./examples/vctk/vc3).
 - 🎉 2023.02.16: Add [Cantonese TTS](./examples/canton/tts3).
 - 🔥 2023.01.10: Add [code-switch asr CLI and Demos](./demos/speech_recognition).
 - 👑 2023.01.06: Add [code-switch asr tal_cs recipe](./examples/tal_cs/asr1/).
diff --git a/README_cn.md b/README_cn.md
index fa013029ce4480726effec86518222c8a0e10e9a..29ee387c0ceecadd56b597d06d2562c17f7ade03 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -183,8 +183,9 @@
   - 🧩 级联模型应用: 作为传统语音任务的扩展，我们结合了自然语言处理、计算机视觉等任务，实现更接近实际需求的产业级应用。
 
 ### 近期更新
-- 👑 2023.03.09: 新增 [Wav2vec2ASR-zh](./examples/aishell/asr3).
+- 👑 2023.03.09: 新增 [Wav2vec2ASR-zh](./examples/aishell/asr3)。
 - 🎉 2023.03.07: 新增 [TTS ARM Linux C++ 部署示例](./demos/TTSArmLinux)。
+- 🔥 2023.03.03: 新增声音转换模型 [StarGANv2-VC 合成流程](./examples/vctk/vc3)。
 - 🎉 2023.02.16: 新增[粤语语音合成](./examples/canton/tts3)。
 - 🔥 2023.01.10: 新增[中英混合 ASR CLI 和 Demos](./demos/speech_recognition)。
 - 👑 2023.01.06: 新增 [ASR 中英混合 tal_cs 训练推理流程](./examples/tal_cs/asr1/)。
diff --git a/examples/aishell/asr3/README.md b/examples/aishell/asr3/README.md
index f6fa60d7f0f217dc409829bb93754303e1e68e63..6b587e12f47b02ea5bd6aaa0b84a5182bb8d0abd 100644
--- a/examples/aishell/asr3/README.md
+++ b/examples/aishell/asr3/README.md
@@ -190,9 +190,9 @@ tar xzvf wav2vec2ASR-large-aishell1_ckpt_1.4.0.model.tar.gz
 ```
 You can download the audio demo:
 ```bash
-wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/
+wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/
 ```
 You need to prepare an audio file or use the audio demo above, please confirm the sample rate of the audio is 16K. You can get the result of the audio demo by running the script below.
 ```bash
-CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/wav2vec2ASR.yaml conf/tuning/decode.yaml exp/wav2vec2ASR/checkpoints/avg_1 data/demo_002_en.wav
+CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/wav2vec2ASR.yaml conf/tuning/decode.yaml exp/wav2vec2ASR/checkpoints/avg_1 data/demo_01_03.wav
 ```
diff --git a/examples/aishell/asr3/conf/wav2vec2ASR.yaml b/examples/aishell/asr3/conf/wav2vec2ASR.yaml
index cdb04f8c1d7613e863478ae5ff5ad651644d3399..4a127468845c023dbe570a6e3c67f3ff14952927 100755
--- a/examples/aishell/asr3/conf/wav2vec2ASR.yaml
+++ b/examples/aishell/asr3/conf/wav2vec2ASR.yaml
@@ -107,6 +107,7 @@ vocab_filepath: data/lang_char/vocab.txt
 ###########################################
 
 unit_type: 'char'
+tokenizer: bert-base-chinese
 mean_std_filepath: 
 preprocess_config: conf/preprocess.yaml
 sortagrad: -1 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
@@ -139,12 +140,10 @@ n_epoch: 80
 accum_grad: 1
 global_grad_clip: 5.0
 
-model_optim: adadelta
+model_optim: sgd
 model_optim_conf:
   lr: 1.0
   weight_decay: 0.0
-  rho: 0.95
-  epsilon: 1.0e-8
 
 wav2vec2_optim: adam
 wav2vec2_optim_conf:
@@ -165,3 +164,4 @@ log_interval: 1
 checkpoint:
   kbest_n: 50
   latest_n: 5
+
diff --git a/examples/aishell/asr3/conf/wav2vec2ASR_adadelta.yaml b/examples/aishell/asr3/conf/wav2vec2ASR_adadelta.yaml
new file mode 100755
index 0000000000000000000000000000000000000000..ec287f0c623227684aae7c247ae0c2fef69d5d89
--- /dev/null
+++ b/examples/aishell/asr3/conf/wav2vec2ASR_adadelta.yaml
@@ -0,0 +1,168 @@
+############################################
+#          Network Architecture           #
+############################################
+freeze_wav2vec2: False
+normalize_wav: True
+output_norm: True
+init_type: 'kaiming_uniform' # !Warning: required for convergence
+enc:
+  input_shape: 1024
+  dnn_blocks: 3
+  dnn_neurons: 1024
+  activation: True
+  normalization: True
+  dropout_rate: [0.15, 0.15, 0.0]
+ctc:
+  enc_n_units: 1024
+  blank_id: 0
+  dropout_rate: 0.0
+
+audio_augment:
+  speeds: [90, 100, 110]
+
+spec_augment:
+  time_warp: True
+  time_warp_window: 5
+  time_warp_mode: bicubic
+  freq_mask: True
+  n_freq_mask: 2
+  time_mask: True
+  n_time_mask: 2
+  replace_with_zero: False
+  freq_mask_width: 30
+  time_mask_width: 40
+wav2vec2_params_path: exp/wav2vec2/chinese-wav2vec2-large.pdparams
+
+
+############################################
+#               Wav2Vec2.0                 #
+############################################
+# vocab_size: 1000000
+hidden_size: 1024
+num_hidden_layers: 24
+num_attention_heads: 16
+intermediate_size: 4096
+hidden_act: gelu
+hidden_dropout: 0.1
+activation_dropout: 0.0
+attention_dropout: 0.1
+feat_proj_dropout: 0.1
+feat_quantizer_dropout: 0.0
+final_dropout: 0.0
+layerdrop: 0.1
+initializer_range: 0.02
+layer_norm_eps: 1e-5
+feat_extract_norm: layer
+feat_extract_activation: gelu
+conv_dim: [512, 512, 512, 512, 512, 512, 512]
+conv_stride: [5, 2, 2, 2, 2, 2, 2]
+conv_kernel: [10, 3, 3, 3, 3, 2, 2]
+conv_bias: True
+num_conv_pos_embeddings: 128
+num_conv_pos_embedding_groups: 16
+do_stable_layer_norm: True
+apply_spec_augment: False
+mask_channel_length: 10
+mask_channel_min_space: 1
+mask_channel_other: 0.0
+mask_channel_prob: 0.0
+mask_channel_selection: static
+mask_feature_length: 10
+mask_feature_min_masks: 0
+mask_feature_prob: 0.0
+mask_time_length: 10
+mask_time_min_masks: 2
+mask_time_min_space: 1
+mask_time_other: 0.0
+mask_time_prob: 0.075
+mask_time_selection: static
+num_codevectors_per_group: 320
+num_codevector_groups: 2
+contrastive_logits_temperature: 0.1
+num_negatives: 100
+codevector_dim: 256
+proj_codevector_dim: 256
+diversity_loss_weight: 0.1
+use_weighted_layer_sum: False
+# pad_token_id: 0
+# bos_token_id: 1
+# eos_token_id: 2
+add_adapter: False
+adapter_kernel_size: 3
+adapter_stride: 2
+num_adapter_layers: 3
+output_hidden_size: None
+
+###########################################
+#                   Data                  #
+###########################################
+
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test
+vocab_filepath: data/lang_char/vocab.txt 
+
+###########################################
+#              Dataloader                 #
+###########################################
+
+unit_type: 'char'
+tokenizer: bert-base-chinese
+mean_std_filepath: 
+preprocess_config: conf/preprocess.yaml
+sortagrad: -1 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs 
+batch_size: 5  # Different batch_size may cause large differences in results
+maxlen_in: 51200000000  # if input length > maxlen_in, batch size is automatically reduced
+maxlen_out: 1500000  # if output length > maxlen_out, batch size is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0 
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 6
+subsampling_factor: 1
+num_encs: 1
+dist_sampler: True
+shortest_first: True
+return_lens_rate: True
+
+###########################################
+#        use speechbrain dataloader       #
+###########################################
+use_sb_pipeline: True  # whether to use the speechbrain pipeline. Default is True.
+sb_pipeline_conf: conf/train_with_wav2vec.yaml
+
+###########################################
+#                 Training                #
+###########################################
+n_epoch: 80
+accum_grad: 1
+global_grad_clip: 5.0
+
+model_optim: adadelta
+model_optim_conf:
+  lr: 1.0
+  weight_decay: 0.0
+  rho: 0.95
+  epsilon: 1.0e-8
+
+wav2vec2_optim: adam
+wav2vec2_optim_conf:
+  lr: 0.0001
+  weight_decay: 0.0
+
+model_scheduler: newbobscheduler
+model_scheduler_conf:
+  improvement_threshold: 0.0025
+  annealing_factor: 0.8
+  patient: 0
+wav2vec2_scheduler: newbobscheduler
+wav2vec2_scheduler_conf:
+  improvement_threshold: 0.0025
+  annealing_factor: 0.9
+  patient: 0
+log_interval: 1
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
diff --git a/examples/aishell/asr3/local/test.sh b/examples/aishell/asr3/local/test.sh
index 9d4b84291d5862c5a2c8a9d77e458653565e6efc..91e1c54579b6249e15caa20f401bcc46682be25d 100755
--- a/examples/aishell/asr3/local/test.sh
+++ b/examples/aishell/asr3/local/test.sh
@@ -8,9 +8,7 @@ echo "using $ngpu gpus..."
 expdir=exp
 datadir=data
 
-train_set=train_960
-recog_set="test-clean test-other dev-clean dev-other"
-recog_set="test-clean"
+train_set=train
 
 config_path=$1
 decode_config_path=$2
@@ -75,7 +73,7 @@ for type in ctc_prefix_beam_search; do
         --trans_hyp ${ckpt_prefix}.${type}.rsl.text
 
     python3 utils/compute-wer.py --char=1 --v=1 \
-        data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
+        data/manifest.test.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
     echo "decoding ${type} done."
 done
 
diff --git a/examples/aishell/asr3/local/test_wav.sh b/examples/aishell/asr3/local/test_wav.sh
index fdf3589f4ba3e183603f2c9e632d43968de10bfe..7ccef6945b4a63532af08827092761de9b0b8475 100755
--- a/examples/aishell/asr3/local/test_wav.sh
+++ b/examples/aishell/asr3/local/test_wav.sh
@@ -14,7 +14,7 @@ ckpt_prefix=$3
 audio_file=$4
 
 mkdir -p data
-wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/
+wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/
 if [ $? -ne 0 ]; then
    exit 1
 fi
diff --git a/examples/aishell/asr3/run.sh b/examples/aishell/asr3/run.sh
index 9b0a3c47255a803bd2b81c538424df24b9e5c502..557ca0fcdecdd3693a334b47a5a0a64650b7636f 100755
--- a/examples/aishell/asr3/run.sh
+++ b/examples/aishell/asr3/run.sh
@@ -15,11 +15,11 @@ resume=         # xx e.g. 30
 export FLAGS_cudnn_deterministic=1
 . ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
 
-audio_file=data/demo_002_en.wav
+audio_file=data/demo_01_03.wav
 
 avg_ckpt=avg_${avg_num}
 ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
-echo "checkpoint name ${ckpt}"git revert -v 
+echo "checkpoint name ${ckpt}" 
 
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     # prepare data
diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py
index 04df18623f16199c1ba6da10ae4bf433852cfc20..3c5db64bb75d033f262a1bb5939932d05b12178b 100644
--- a/paddlespeech/resource/pretrained_models.py
+++ b/paddlespeech/resource/pretrained_models.py
@@ -102,13 +102,11 @@ ssl_dynamic_pretrained_models = {
             'params':
             'exp/wav2vec2ASR/checkpoints/avg_1.pdparams',
         },
-    },
-    "wav2vec2ASR_aishell1-zh-16k": {
         '1.4': {
             'url':
             'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr3/wav2vec2ASR-large-aishell1_ckpt_1.4.0.model.tar.gz',
             'md5':
-            '9f0bc943adb822789bf61e674b229d17',
+            '150e51b8ea5d255ccce6b395de8d916a',
             'cfg_path':
             'model.yaml',
             'ckpt_path':
diff --git a/paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py b/paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py
index 0d66ac41075781b6bdf84216d255d4fd7c0a1ec0..5efa82e601077be470a470a9c4c70bcd64bba581 100644
--- a/paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py
+++ b/paddlespeech/s2t/exps/wav2vec2/bin/test_wav.py
@@ -18,13 +18,13 @@ from pathlib import Path
 
 import paddle
 import soundfile
-from yacs.config import CfgNode
-
+from paddlenlp.transformers import AutoTokenizer
 from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.models.wav2vec2.wav2vec2_ASR import Wav2vec2ASR
 from paddlespeech.s2t.training.cli import default_argument_parser
 from paddlespeech.s2t.utils.log import Log
 from paddlespeech.s2t.utils.utility import UpdateConfig
+from yacs.config import CfgNode
 logger = Log(__name__).getlog()
 
 
@@ -34,8 +34,13 @@ class Wav2vec2Infer():
         self.config = config
         self.audio_file = args.audio_file
 
-        self.text_feature = TextFeaturizer(
-            unit_type=config.unit_type, vocab=config.vocab_filepath)
+        if self.config.tokenizer:
+            self.text_feature = AutoTokenizer.from_pretrained(
+                self.config.tokenizer)
+        else:
+            self.text_feature = TextFeaturizer(
+                unit_type=config.unit_type, vocab=config.vocab_filepath)
+
         paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu')
 
         # model
@@ -59,14 +64,14 @@ class Wav2vec2Infer():
             audio, _ = soundfile.read(
                 self.audio_file, dtype="int16", always_2d=True)
             logger.info(f"audio shape: {audio.shape}")
-
             xs = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0)
             decode_config = self.config.decode
             result_transcripts, result_tokenids = self.model.decode(
                 xs,
                 text_feature=self.text_feature,
                 decoding_method=decode_config.decoding_method,
-                beam_size=decode_config.beam_size)
+                beam_size=decode_config.beam_size,
+                tokenizer=self.config.tokenizer, )
             rsl = result_transcripts[0]
             utt = Path(self.audio_file).name
             logger.info(f"hyp: {utt} {rsl}")
diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py
index 12b75615e8f8f7dabada50f888627f42a2292877..354636b48daf66bce9ef2e34ff754fac77b232f4 100644
--- a/paddlespeech/t2s/exps/syn_utils.py
+++ b/paddlespeech/t2s/exps/syn_utils.py
@@ -27,8 +27,6 @@ from paddle import inference
 from paddle import jit
 from paddle.io import DataLoader
 from paddle.static import InputSpec
-from yacs.config import CfgNode
-
 from paddlespeech.t2s.datasets.am_batch_fn import *
 from paddlespeech.t2s.datasets.data_table import DataTable
 from paddlespeech.t2s.datasets.vocoder_batch_fn import Clip_static
@@ -38,6 +36,7 @@ from paddlespeech.t2s.frontend.mix_frontend import MixFrontend
 from paddlespeech.t2s.frontend.zh_frontend import Frontend
 from paddlespeech.t2s.modules.normalizer import ZScore
 from paddlespeech.utils.dynamic_import import dynamic_import
+from yacs.config import CfgNode
 
 # remove [W:onnxruntime: xxx] from ort
 ort.set_default_logger_severity(3)
@@ -490,6 +489,7 @@ def get_predictor(
         device: str='cpu',
         # for gpu
         use_trt: bool=False,
+        device_id: int=0,
         # for trt
         use_dynamic_shape: bool=True,
         min_subgraph_size: int=5,
@@ -505,6 +505,7 @@ def get_predictor(
         params_file (os.PathLike): name of params_file.
         device (str): Choose the device you want to run, it can be: cpu/gpu, default is cpu.
         use_trt (bool): whether to use TensorRT or not in GPU.
+        device_id (int): Choose your device id, only valid when the device is gpu, default 0.
         use_dynamic_shape (bool): use dynamic shape or not in TensorRT.
         use_mkldnn (bool): whether to use MKLDNN or not in CPU.
         cpu_threads (int): num of thread when use CPU.
@@ -521,7 +522,7 @@ def get_predictor(
     config.enable_memory_optim()
     config.switch_ir_optim(True)
     if device == "gpu":
-        config.enable_use_gpu(100, 0)
+        config.enable_use_gpu(100, device_id)
     else:
         config.disable_gpu()
         config.set_cpu_math_library_num_threads(cpu_threads)