Unverified commit 8a0045aa, authored by K KP, committed by GitHub

PaddleAudio initial commit (#5299)

Parent ffe11953
.ipynb_checkpoints/**
*.ipynb
nohup.out
__pycache__/
*.wav
*.m4a
- repo: local
hooks:
- id: yapf
name: yapf
entry: yapf
language: system
args: [-i, --style .style.yapf]
files: \.py$
- repo: https://github.com/pre-commit/pre-commit-hooks
sha: a11d9314b22d8f8c7556443875b731ef05965464
hooks:
- id: check-merge-conflict
- id: check-symlinks
- id: end-of-file-fixer
- id: trailing-whitespace
- id: detect-private-key
- id: check-symlinks
- id: check-added-large-files
- repo: https://github.com/pycqa/isort
rev: 5.8.0
hooks:
- id: isort
name: isort (python)
- id: isort
name: isort (cython)
types: [cython]
- id: isort
name: isort (pyi)
types: [pyi]
- repo: local
hooks:
- id: flake8
name: flake8
entry: flake8
language: system
args:
- --count
- --select=E9,F63,F7,F82
- --show-source
- --statistics
files: \.py$
[style]
based_on_style = pep8
column_limit = 120
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
# PaddleAudio
An unofficial audio codebase for PaddlePaddle.
## Install
```bash
git clone https://github.com/ranchlai/PaddleAudio.git
cd PaddleAudio
pip install .
```
## Usage
```python
import paddleaudio as pa
s, r = pa.load(f)
mel = pa.features.mel_spect(s, sample_rate=r)
```
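`pa.load` also exposes resampling, mono-mixing, and normalization options (defined in `paddleaudio/backends/audio.py` in this commit). A minimal sketch, assuming a local file `cat_meow.wav`:

```python
import paddleaudio as pa

# Load at 32 kHz, mixed down to mono by averaging the channels, and
# peak-normalized; these keyword arguments mirror backends/audio.py.
s, r = pa.load('cat_meow.wav', sr=32000, mono=True, mono_type='average',
               normal=True, norm_type='linear')
mel = pa.features.mel_spect(s, sample_rate=r)
```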
## To do
- add sound effects (tempo, mag, etc.) supported by sox
- add dataset support
- add models: DCASE classification, ASD, sound classification
- add demos (audio and video demos)
- add openL3 support
# Audioset Tagging Example
This example uses pretrained PANNs models to tag input audio in real time, and outputs, as text, the top-k classes and their scores at each time step.
For details on the pretrained PANNs models, please refer to the paper [PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition](https://arxiv.org/pdf/1912.10211.pdf).
## Usage
```bash
python audio_tag.py \
--wav ./cat_meow.wav \
--sr 32000 \
--sample_duration 2 \
--hop_duration 0.3 \
--checkpoint ./assets/cnn14.pdparams \
--use_gpu True \
--output_dir ./output_dir
```
Arguments:
```
--wav              # path to the input audio file
--sr               # sample rate
--sample_duration  # duration of each tagged audio segment, in seconds
--hop_duration     # hop between tagged segments, in seconds
--checkpoint       # pretrained model parameters
--use_gpu          # whether to use GPU acceleration
--output_dir       # output directory
```
Example output:
```
[2021-04-06 21:10:36,438] [ INFO] - Loaded CNN14 pretrained parameters from: ./assets/cnn14.pdparams
[2021-04-06 21:10:38,193] [ INFO] - Saved tagging results to ./output_dir/audioset_tagging_sr_32000.npz
```
After execution, the scores are saved to a `.npz` file under `output_dir`.
## Output
```bash
python parse_result.py \
--input_file ./output_dir/audioset_tagging_sr_32000.npz \
--topk 10 \
--smooth True \
--smooth_size 5 \
--output_dir ./output_dir
```
Arguments:
```
--input_file   # tagging score file
--topk         # show the top-k results
--smooth       # apply posterior smoothing across frames
--smooth_size  # smoothing window size
--output_dir   # output directory
```
Example output:
```
[2021-04-06 21:22:00,696] [ INFO] - Posterior smoothing...
[2021-04-06 21:22:00,699] [ INFO] - Saved tagging labels to ./output_dir/audioset_tagging_sr_32000.txt
```
After execution, the text results are saved to a `.txt` file under `output_dir`.
## Labels
The final text output looks like the following.
Top-k results for different tagging windows are separated by blank lines. Within each block, the first line is the time information, where the number is the starting sample index of the tagged segment; the next k lines are the corresponding labels and scores. A minimal parsing sketch follows the example below.
```
0
Cat: 0.80844646692276
Animal: 0.6848719716072083
Meow: 0.6470851898193359
Domestic animals, pets: 0.6392854452133179
Inside, small room: 0.05361200496554375
Purr: 0.02675800956785679
Music: 0.021260583773255348
Speech: 0.0209784135222435
Caterwaul: 0.019929537549614906
Outside, urban or manmade: 0.010916451923549175
9600
Cat: 0.7778594493865967
Meow: 0.6465566158294678
Animal: 0.6342337131500244
Domestic animals, pets: 0.5945377349853516
Inside, small room: 0.04747435823082924
Purr: 0.027785276994109154
Music: 0.022447215393185616
Caterwaul: 0.020785318687558174
Speech: 0.01982543244957924
Vehicle: 0.014558425173163414
19200
Cat: 0.8243843913078308
Animal: 0.6799540519714355
Meow: 0.6794822812080383
Domestic animals, pets: 0.6637188792228699
Caterwaul: 0.09927166253328323
Inside, small room: 0.0378643162548542
Music: 0.02170632779598236
Purr: 0.02035444974899292
Speech: 0.02006830833852291
Vehicle: 0.01234798226505518
28800
Cat: 0.8329735398292542
Animal: 0.6937487125396729
Meow: 0.6766577959060669
Domestic animals, pets: 0.6669812798500061
Caterwaul: 0.08647485077381134
Inside, small room: 0.03593464195728302
Music: 0.022975120693445206
Speech: 0.01964726485311985
Purr: 0.017558127641677856
Vehicle: 0.010926523245871067
38400
Cat: 0.8097503781318665
Animal: 0.6702587604522705
Meow: 0.6487116813659668
Domestic animals, pets: 0.6369225382804871
Caterwaul: 0.07185821980237961
Inside, small room: 0.039198972284793854
Music: 0.02381189912557602
Speech: 0.018534155562520027
Purr: 0.0178740955889225
Outside, urban or manmade: 0.011107126250863075
...
...
```
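Below is a minimal parsing sketch (not part of this commit); `parse_tagging_txt` is an illustrative helper for reading this text format back into Python:

```python
def parse_tagging_txt(path):
    """Parse blank-line-separated blocks of a start-sample line plus 'label: score' lines."""
    results = []
    with open(path) as f:
        for block in f.read().strip().split('\n\n'):
            lines = block.strip().splitlines()
            start = float(lines[0])  # starting sample index of this tagged segment
            labels = [(l.rsplit(': ', 1)[0], float(l.rsplit(': ', 1)[1])) for l in lines[1:]]
            results.append((start, labels))
    return results

# Usage: print the top label of the first tagged segment, e.g. ('Cat', 0.808...)
# blocks = parse_tagging_txt('./output_dir/audioset_tagging_sr_32000.txt')
# print(blocks[0][1][0])
```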
Speech
Male speech, man speaking
Female speech, woman speaking
Child speech, kid speaking
Conversation
Narration, monologue
Babbling
Speech synthesizer
Shout
Bellow
Whoop
Yell
Battle cry
Children shouting
Screaming
Whispering
Laughter
Baby laughter
Giggle
Snicker
Belly laugh
Chuckle, chortle
Crying, sobbing
Baby cry, infant cry
Whimper
Wail, moan
Sigh
Singing
Choir
Yodeling
Chant
Mantra
Male singing
Female singing
Child singing
Synthetic singing
Rapping
Humming
Groan
Grunt
Whistling
Breathing
Wheeze
Snoring
Gasp
Pant
Snort
Cough
Throat clearing
Sneeze
Sniff
Run
Shuffle
Walk, footsteps
Chewing, mastication
Biting
Gargling
Stomach rumble
Burping, eructation
Hiccup
Fart
Hands
Finger snapping
Clapping
Heart sounds, heartbeat
Heart murmur
Cheering
Applause
Chatter
Crowd
Hubbub, speech noise, speech babble
Children playing
Animal
Domestic animals, pets
Dog
Bark
Yip
Howl
Bow-wow
Growling
Whimper (dog)
Cat
Purr
Meow
Hiss
Caterwaul
Livestock, farm animals, working animals
Horse
Clip-clop
Neigh, whinny
Cattle, bovinae
Moo
Cowbell
Pig
Oink
Goat
Bleat
Sheep
Fowl
Chicken, rooster
Cluck
Crowing, cock-a-doodle-doo
Turkey
Gobble
Duck
Quack
Goose
Honk
Wild animals
Roaring cats (lions, tigers)
Roar
Bird
Bird vocalization, bird call, bird song
Chirp, tweet
Squawk
Pigeon, dove
Coo
Crow
Caw
Owl
Hoot
Bird flight, flapping wings
Canidae, dogs, wolves
Rodents, rats, mice
Mouse
Patter
Insect
Cricket
Mosquito
Fly, housefly
Buzz
Bee, wasp, etc.
Frog
Croak
Snake
Rattle
Whale vocalization
Music
Musical instrument
Plucked string instrument
Guitar
Electric guitar
Bass guitar
Acoustic guitar
Steel guitar, slide guitar
Tapping (guitar technique)
Strum
Banjo
Sitar
Mandolin
Zither
Ukulele
Keyboard (musical)
Piano
Electric piano
Organ
Electronic organ
Hammond organ
Synthesizer
Sampler
Harpsichord
Percussion
Drum kit
Drum machine
Drum
Snare drum
Rimshot
Drum roll
Bass drum
Timpani
Tabla
Cymbal
Hi-hat
Wood block
Tambourine
Rattle (instrument)
Maraca
Gong
Tubular bells
Mallet percussion
Marimba, xylophone
Glockenspiel
Vibraphone
Steelpan
Orchestra
Brass instrument
French horn
Trumpet
Trombone
Bowed string instrument
String section
Violin, fiddle
Pizzicato
Cello
Double bass
Wind instrument, woodwind instrument
Flute
Saxophone
Clarinet
Harp
Bell
Church bell
Jingle bell
Bicycle bell
Tuning fork
Chime
Wind chime
Change ringing (campanology)
Harmonica
Accordion
Bagpipes
Didgeridoo
Shofar
Theremin
Singing bowl
Scratching (performance technique)
Pop music
Hip hop music
Beatboxing
Rock music
Heavy metal
Punk rock
Grunge
Progressive rock
Rock and roll
Psychedelic rock
Rhythm and blues
Soul music
Reggae
Country
Swing music
Bluegrass
Funk
Folk music
Middle Eastern music
Jazz
Disco
Classical music
Opera
Electronic music
House music
Techno
Dubstep
Drum and bass
Electronica
Electronic dance music
Ambient music
Trance music
Music of Latin America
Salsa music
Flamenco
Blues
Music for children
New-age music
Vocal music
A capella
Music of Africa
Afrobeat
Christian music
Gospel music
Music of Asia
Carnatic music
Music of Bollywood
Ska
Traditional music
Independent music
Song
Background music
Theme music
Jingle (music)
Soundtrack music
Lullaby
Video game music
Christmas music
Dance music
Wedding music
Happy music
Funny music
Sad music
Tender music
Exciting music
Angry music
Scary music
Wind
Rustling leaves
Wind noise (microphone)
Thunderstorm
Thunder
Water
Rain
Raindrop
Rain on surface
Stream
Waterfall
Ocean
Waves, surf
Steam
Gurgling
Fire
Crackle
Vehicle
Boat, Water vehicle
Sailboat, sailing ship
Rowboat, canoe, kayak
Motorboat, speedboat
Ship
Motor vehicle (road)
Car
Vehicle horn, car horn, honking
Toot
Car alarm
Power windows, electric windows
Skidding
Tire squeal
Car passing by
Race car, auto racing
Truck
Air brake
Air horn, truck horn
Reversing beeps
Ice cream truck, ice cream van
Bus
Emergency vehicle
Police car (siren)
Ambulance (siren)
Fire engine, fire truck (siren)
Motorcycle
Traffic noise, roadway noise
Rail transport
Train
Train whistle
Train horn
Railroad car, train wagon
Train wheels squealing
Subway, metro, underground
Aircraft
Aircraft engine
Jet engine
Propeller, airscrew
Helicopter
Fixed-wing aircraft, airplane
Bicycle
Skateboard
Engine
Light engine (high frequency)
Dental drill, dentist's drill
Lawn mower
Chainsaw
Medium engine (mid frequency)
Heavy engine (low frequency)
Engine knocking
Engine starting
Idling
Accelerating, revving, vroom
Door
Doorbell
Ding-dong
Sliding door
Slam
Knock
Tap
Squeak
Cupboard open or close
Drawer open or close
Dishes, pots, and pans
Cutlery, silverware
Chopping (food)
Frying (food)
Microwave oven
Blender
Water tap, faucet
Sink (filling or washing)
Bathtub (filling or washing)
Hair dryer
Toilet flush
Toothbrush
Electric toothbrush
Vacuum cleaner
Zipper (clothing)
Keys jangling
Coin (dropping)
Scissors
Electric shaver, electric razor
Shuffling cards
Typing
Typewriter
Computer keyboard
Writing
Alarm
Telephone
Telephone bell ringing
Ringtone
Telephone dialing, DTMF
Dial tone
Busy signal
Alarm clock
Siren
Civil defense siren
Buzzer
Smoke detector, smoke alarm
Fire alarm
Foghorn
Whistle
Steam whistle
Mechanisms
Ratchet, pawl
Clock
Tick
Tick-tock
Gears
Pulleys
Sewing machine
Mechanical fan
Air conditioning
Cash register
Printer
Camera
Single-lens reflex camera
Tools
Hammer
Jackhammer
Sawing
Filing (rasp)
Sanding
Power tool
Drill
Explosion
Gunshot, gunfire
Machine gun
Fusillade
Artillery fire
Cap gun
Fireworks
Firecracker
Burst, pop
Eruption
Boom
Wood
Chop
Splinter
Crack
Glass
Chink, clink
Shatter
Liquid
Splash, splatter
Slosh
Squish
Drip
Pour
Trickle, dribble
Gush
Fill (with liquid)
Spray
Pump (liquid)
Stir
Boiling
Sonar
Arrow
Whoosh, swoosh, swish
Thump, thud
Thunk
Electronic tuner
Effects unit
Chorus effect
Basketball bounce
Bang
Slap, smack
Whack, thwack
Smash, crash
Breaking
Bouncing
Whip
Flap
Scratch
Scrape
Rub
Roll
Crushing
Crumpling, crinkling
Tearing
Beep, bleep
Ping
Ding
Clang
Squeal
Creak
Rustle
Whir
Clatter
Sizzle
Clicking
Clickety-clack
Rumble
Plop
Jingle, tinkle
Hum
Zing
Boing
Crunch
Silence
Sine wave
Harmonic
Chirp tone
Sound effect
Pulse
Inside, small room
Inside, large room or hall
Inside, public space
Outside, urban or manmade
Outside, rural or natural
Reverberation
Echo
Noise
Environmental noise
Static
Mains hum
Distortion
Sidetone
Cacophony
White noise
Pink noise
Throbbing
Vibration
Television
Radio
Field recording
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import ast
import os
from typing import List
import librosa
import numpy as np
import paddle
from paddleaudio.features import mel_spect
from paddleaudio.models import CNN14
from paddleaudio.utils.log import logger
parser = argparse.ArgumentParser(__doc__)
# features
parser.add_argument("--sr", type=int, default=32000, help="Sample rate of inference audio.")
parser.add_argument('--window_size', type=int, default=1024)
parser.add_argument('--hop_size', type=int, default=320)
parser.add_argument('--mel_bins', type=int, default=64)
parser.add_argument('--fmin', type=int, default=50)
parser.add_argument('--fmax', type=int, default=14000)
# waveform
parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.")
parser.add_argument('--sample_duration', type=float, default=1.0) # 1s
parser.add_argument('--hop_duration', type=float, default=0.3) # 0.3s
parser.add_argument("--output_dir", type=str, default='./output_dir')
parser.add_argument("--use_gpu",
type=ast.literal_eval,
default=True,
help="Whether use GPU for fine-tuning, input should be True or False")
parser.add_argument("--checkpoint", type=str, default='./assets/cnn14.pdparams', help="Checkpoint of model.")
args = parser.parse_args()
def split(waveform: np.ndarray, win_size: int, hop_size: int):
"""
Split into N audios.
N is decided by win_size and hop_size.
"""
assert isinstance(waveform, np.ndarray)
ret = []
for i in range(0, len(waveform), hop_size):
segment = waveform[i:i + win_size]
if len(segment) < win_size:
segment = np.pad(segment, (0, win_size - len(segment)))
ret.append(segment)
return ret
def batchify(data: List[List[float]], batch_size: int):
"""
Extract features from waveforms and create batches.
"""
examples = []
for waveform in data:
feat = mel_spect(
waveform,
sample_rate=args.sr,
window_size=args.window_size,
hop_length=args.hop_size,
mel_bins=args.mel_bins,
fmin=args.fmin,
fmax=args.fmax,
)
examples.append(np.expand_dims(feat.transpose(), 0)) # (mel_bins, time) -> (1, time, mel_bins)
    # Separate the examples into batches.
one_batch = []
for example in examples:
one_batch.append(example)
if len(one_batch) == batch_size:
yield one_batch
one_batch = []
if one_batch:
yield one_batch
def predict(model, data: List[List[float]], batch_size: int = 1, use_gpu: bool = False):
    paddle.set_device('gpu' if use_gpu else 'cpu')
batches = batchify(data, batch_size)
results = None
model.eval()
for batch in batches:
feats = paddle.to_tensor(batch)
audioset_scores = model(feats)
if results is None:
results = audioset_scores.numpy()
else:
results = np.concatenate((results, audioset_scores.numpy()))
return results
if __name__ == '__main__':
model = CNN14(extract_embedding=False, checkpoint=args.checkpoint)
waveform = librosa.load(args.wav, sr=args.sr)[0]
data = split(waveform, int(args.sample_duration * args.sr), int(args.hop_duration * args.sr))
results = predict(model, data, batch_size=8, use_gpu=args.use_gpu)
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
time = np.arange(0, 1, int(args.hop_duration * args.sr) / len(waveform))
output_file = os.path.join(args.output_dir, f'audioset_tagging_sr_{args.sr}.npz')
np.savez(output_file, time=time, scores=results)
logger.info(f'Saved tagging results to {output_file}')
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import ast
import os
from typing import Dict, List
import numpy as np
from paddleaudio.utils.log import logger
parser = argparse.ArgumentParser(__doc__)
parser.add_argument("--input_file", type=str, required=True)
parser.add_argument("--topk", type=int, default=10, help="Show top k results of audioset labels.")
parser.add_argument("--smooth", type=ast.literal_eval, default=True, help="Posterior smoothing.")
parser.add_argument("--smooth_size", type=int, default=5, help="Window size of smoothing.")
parser.add_argument("--output_dir", type=str, default='./output_dir')
args = parser.parse_args()
def smooth(results: np.ndarray, win_size: int):
"""
Execute posterior smoothing in-place.
"""
for i in range(len(results) - 1, -1, -1):
if i < win_size - 1:
left = 0
else:
left = i + 1 - win_size
results[i] = np.sum(results[left:i + 1], axis=0) / (i - left + 1)
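# An illustrative vectorized equivalent of smooth() (not part of this commit):
#     c = np.cumsum(results, axis=0)
#     out = c.copy()
#     out[win_size:] = c[win_size:] - c[:-win_size]
#     out /= np.minimum(np.arange(1, len(results) + 1), win_size)[:, None]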
def generate_topk_label(k: int, label_map: Dict, result: np.ndarray):
"""
Return top k result.
"""
result = np.asarray(result)
topk_idx = (-result).argsort()[:k]
ret = ''
for idx in topk_idx:
label, score = label_map[idx], result[idx]
ret += f'{label}: {score}\n'
return ret
if __name__ == "__main__":
label_file = './assets/audioset_labels.txt'
label_map = {}
with open(label_file, 'r') as f:
for i, l in enumerate(f.readlines()):
label_map[i] = l.strip()
results = np.load(args.input_file, allow_pickle=True)
times, scores = results['time'], results['scores']
if args.smooth:
logger.info('Posterior smoothing...')
smooth(scores, win_size=args.smooth_size)
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
output_file = os.path.join(args.output_dir, os.path.basename(args.input_file).split('.')[0] + '.txt')
with open(output_file, 'w') as f:
for time, score in zip(times, scores):
f.write(f'{time}\n')
f.write(generate_topk_label(args.topk, label_map, score) + '\n')
logger.info(f'Saved tagging labels to {output_file}')
import argparse
import glob
import os
import h5py
import numpy as np
import tqdm
import paddleaudio as pa
#from pylab import *
parser = argparse.ArgumentParser(description='wave2mel')
parser.add_argument('--wav_file', type=str, required=False, default='')
parser.add_argument('--wav_list', type=str, required=False, default='')
parser.add_argument('--wav_h5_file', type=str, required=False, default='')
parser.add_argument('--wav_h5_list', type=str, required=False, default='')
parser.add_argument('--output_folder', type=str, required=False, default='./')
parser.add_argument('--output_h5', type=bool, required=False, default=True)
parser.add_argument('--sample_rate', type=int, required=False, default=32000)
parser.add_argument('--window_size', type=int, required=False, default=1024)
parser.add_argument('--mel_bins', type=int, required=False, default=128)
parser.add_argument('--hop_length', type=int, required=False, default=640) #20ms
parser.add_argument('--fmin', type=int, required=False, default=50)
parser.add_argument('--fmax', type=int, required=False, default=16000)
args = parser.parse_args()
#args.wav_h5_file = '/ssd2/laiyongquan/audioset/h5/audioset_unblance_group28.h5'
assert not (args.wav_h5_file == '' and args.wav_h5_list == ''
            and args.wav_list == '' and args.wav_file == ''), \
    'one of wav_file, wav_list, wav_h5_file, wav_h5_list needs to be specified'
if args.wav_h5_file != '':
h5_files = [args.wav_h5_file]
if args.wav_h5_list != '':
h5_files = open(args.wav_h5_list).read().split('\n')
h5_files = [h for h in h5_files if len(h.strip()) != 0]
dst_folder = args.output_folder
print(f'{len(h5_files)} h5 files listed')
for f in h5_files:
print(f'processing {f}')
dst_file = os.path.join(dst_folder, f.split('/')[-1])
print(f'target file {dst_file}')
    assert not os.path.exists(dst_file), f'target file {dst_file} already exists'
    src_h5 = h5py.File(f, 'r')
dst_h5 = h5py.File(dst_file, "w")
for key in tqdm.tqdm(src_h5.keys()):
s = src_h5[key][:]
s = pa.depth_convert(s, 'float32')
# s = pa.resample(s,32000,args.sample_rate)
x = pa.features.mel_spect(s,
sample_rate=args.sample_rate,
window_size=args.window_size,
hop_length=args.hop_length,
mel_bins=args.mel_bins,
fmin=args.fmin,
fmax=args.fmax,
window='hann',
center=True,
pad_mode='reflect',
ref=1.0,
amin=1e-10,
top_db=None)
# figure(figsize=(8,8))
# imshow(x)
# show()
# print(x.shape)
dst_h5.create_dataset(key, data=x)
src_h5.close()
dst_h5.close()
import argparse
import glob
import os
import h5py
import numpy as np
import tqdm
import paddleaudio as pa
#from pylab import *
parser = argparse.ArgumentParser(description='wave2mel')
parser.add_argument('--wav_file', type=str, required=False, default='')
parser.add_argument('--wav_list', type=str, required=False, default='')
parser.add_argument('--wav_h5_file', type=str, required=False, default='')
parser.add_argument('--wav_h5_list', type=str, required=False, default='')
parser.add_argument('--output_folder', type=str, required=False, default='./')
parser.add_argument('--output_h5', type=bool, required=False, default=True)
parser.add_argument('--sample_rate', type=int, required=False, default=32000)
parser.add_argument('--window_size', type=int, required=False, default=1024)
parser.add_argument('--mel_bins', type=int, required=False, default=128)
parser.add_argument('--hop_length', type=int, required=False, default=640) #20ms
parser.add_argument('--fmin', type=int, required=False, default=50)
parser.add_argument('--fmax', type=int, required=False, default=16000)
parser.add_argument('--skip_existed', type=int, required=False, default=1)
args = parser.parse_args()
#args.wav_h5_file = '/ssd2/laiyongquan/audioset/h5/audioset_unblance_group28.h5'
assert not (args.wav_h5_file == '' and args.wav_h5_list == ''
            and args.wav_list == '' and args.wav_file == ''), \
    'one of wav_file, wav_list, wav_h5_file, wav_h5_list needs to be specified'
if args.wav_h5_file != '':
h5_files = [args.wav_h5_file]
if args.wav_h5_list != '':
h5_files = open(args.wav_h5_list).read().split('\n')
h5_files = [h for h in h5_files if len(h.strip()) != 0]
dst_folder = args.output_folder
print(f'{len(h5_files)} h5 files listed')
for f in h5_files:
print(f'processing {f}')
dst_file = os.path.join(dst_folder, f.split('/')[-1])
print(f'target file {dst_file}')
if args.skip_existed != 0 and os.path.exists(dst_file):
print(f'skipped file {f}')
continue
    assert not os.path.exists(dst_file), f'target file {dst_file} already exists'
    src_h5 = h5py.File(f, 'r')
dst_h5 = h5py.File(dst_file, "w")
for key in tqdm.tqdm(src_h5.keys()):
s = src_h5[key][:]
s = pa.depth_convert(s, 'float32')
# s = pa.resample(s,32000,args.sample_rate)
x = pa.features.mel_spect(s,
sample_rate=args.sample_rate,
window_size=args.window_size,
hop_length=args.hop_length,
mel_bins=args.mel_bins,
fmin=args.fmin,
fmax=args.fmax,
window='hann',
center=True,
pad_mode='reflect',
ref=1.0,
amin=1e-10,
top_db=None)
# figure(figsize=(8,8))
# imshow(x)
# show()
# print(x.shape)
dst_h5.create_dataset(key, data=x)
src_h5.close()
dst_h5.close()
from .backends import *
from .features import *
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .audio import *
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
import numpy as np
from scipy.io import wavfile
try:
import librosa
has_librosa = True
except:
has_librosa = False
try:
import soundfile as sf
has_snf = True
except:
has_snf = False
try:
import resampy
has_resampy = True
except:
has_resampy = False
__norm_types__ = ['linear', 'gaussian']
__mono_types__ = ['ch0', 'ch1', 'random', 'average']
__all__ = ['resample', 'to_mono', 'depth_convert', 'normalize', 'save', 'load']
def resample(y, src_sr, target_sr):
    warnings.warn(
        f'Resampling {src_sr} Hz => {target_sr} Hz with resampy. This is pretty slow; '
        'we recommend processing audio with ffmpeg instead.')
assert type(y) == np.ndarray, 'currently only numpy data are supported'
assert type(
src_sr) == int and src_sr > 0 and src_sr <= 48000, 'make sure type(sr) == int and sr > 0 and sr <= 48000,'
assert type(
target_sr
) == int and target_sr > 0 and target_sr <= 48000, 'make sure type(sr) == int and sr > 0 and sr <= 48000,'
if has_resampy:
return resampy.resample(y, src_sr, target_sr)
if has_librosa:
return librosa.resample(y, src_sr, target_sr)
assert False, 'requires librosa or resampy to do resampling, pip install resampy'
def to_mono(y, mono_type='average'):
assert type(y) == np.ndarray, 'currently only numpy data are supported'
if mono_type not in __mono_types__:
assert False, 'Unsupported mono_type {}, available types are {}'.format(mono_type, __mono_types__)
if y.ndim == 1:
return y
if y.ndim > 2:
assert False, 'Unsupported audio array, y.ndim > 2, the shape is {}'.format(y.shape)
if mono_type == 'ch0':
return y[0]
if mono_type == 'ch1':
return y[1]
if mono_type == 'random':
return y[np.random.randint(0, 2)]
if y.dtype == 'float32':
return (y[0] + y[1]) * 0.5
if y.dtype == 'int16':
y1 = y.astype('int32')
y1 = (y1[0] + y1[1]) // 2
y1 = np.clip(y1, np.iinfo(y.dtype).min, np.iinfo(y.dtype).max).astype(y.dtype)
return y1
if y.dtype == 'int8':
y1 = y.astype('int16')
y1 = (y1[0] + y1[1]) // 2
y1 = np.clip(y1, np.iinfo(y.dtype).min, np.iinfo(y.dtype).max).astype(y.dtype)
return y1
assert False, 'Unsupported audio array type, y.dtype={}'.format(y.dtype)
def __safe_cast__(y, dtype):
    # np.iinfo only covers integer dtypes; float targets are cast directly.
    if 'float' in str(dtype):
        return y.astype(dtype)
    return np.clip(y, np.iinfo(dtype).min, np.iinfo(dtype).max).astype(dtype)
def depth_convert(y, dtype): # convert audio array to target dtype
assert type(y) == np.ndarray, 'currently only numpy data are supported'
__eps__ = 1e-5
__supported_dtype__ = ['int16', 'int8', 'float32', 'float64']
if y.dtype not in __supported_dtype__:
assert False, 'Unsupported audio dtype, y.dtype is {}, supported dtypes are {}'.format(
y.dtype, __supported_dtype__)
if dtype not in __supported_dtype__:
assert False, 'Unsupported dtype, target dtype is {}, supported dtypes are {}'.format(
dtype, __supported_dtype__)
if dtype == y.dtype:
return y
if dtype == 'float64' and y.dtype == 'float32':
return __safe_cast__(y, dtype)
if dtype == 'float32' and y.dtype == 'float64':
return __safe_cast__(y, dtype)
if dtype == 'int16' or dtype == 'int8':
if y.dtype in ['float64', 'float32']:
factor = np.iinfo(dtype).max
y = np.clip(y * factor, np.iinfo(dtype).min, np.iinfo(dtype).max).astype(dtype)
y = y.astype(dtype)
# figure
# plot(y)
# show()
else:
if dtype == 'int16' and y.dtype == 'int8':
factor = np.iinfo('int16').max / np.iinfo('int8').max - __eps__
y = y.astype('float32') * factor
y = y.astype('int16')
else: #dtype == 'int8' and y.dtype=='int16':
y = y.astype('int32') * np.iinfo('int8').max / np.iinfo('int16').max
y = y.astype('int8')
if dtype in ['float32', 'float64']:
org_dtype = y.dtype
y = y.astype(dtype) / np.iinfo(org_dtype).max
return y
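# Illustrative round trip (not part of this commit):
#     x = np.array([0.5, -0.25], dtype='float32')
#     x16 = depth_convert(x, 'int16')       # ~[16383, -8191]
#     back = depth_convert(x16, 'float32')  # ~[0.5, -0.25]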
def sound_file_load(file, offset=None, dtype='int16', duration=None):
with sf.SoundFile(file) as sf_desc:
sr_native = sf_desc.samplerate
if offset:
sf_desc.seek(int(offset * sr_native))
if duration is not None:
frame_duration = int(duration * sr_native)
else:
frame_duration = -1
y = sf_desc.read(frames=frame_duration, dtype=dtype, always_2d=False).T
return y, sf_desc.samplerate
def normalize(y, norm_type='linear', mul_factor=1.0):
assert type(y) == np.ndarray, 'currently only numpy data are supported'
__eps__ = 1e-8
#set_trace()
if norm_type == 'linear':
# amin = np.min(y)
amax = np.max(np.abs(y))
factor = 1.0 / (amax + __eps__)
y = y * factor * mul_factor
elif norm_type == 'gaussian':
amean = np.mean(y)
mul_factor = max(0.01, min(mul_factor, 0.2))
astd = np.std(y)
y = mul_factor * (y - amean) / (astd + __eps__)
else:
assert False, 'not implemented error, norm_type should be in {}'.format(__norm_types__)
return y
def save(y, sr, file):
assert type(y) == np.ndarray, 'currently only numpy data are supported'
assert type(sr) == int and sr > 0 and sr <= 48000, 'make sure type(sr) == int and sr > 0 and sr <= 48000,'
if y.dtype not in ['int16', 'int8']:
warnings.warn('input data type is {}, saving data to int16 format'.format(y.dtype))
yout = depth_convert(y, 'int16')
else:
yout = y
    wavfile.write(file, sr, yout)
def load(
file,
sr=None,
mono=True,
mono_type='average', # ch0,ch1,random,average
normal=True,
norm_type='linear',
norm_mul_factor=1.0,
offset=0.0,
duration=None,
dtype='float32'):
if has_librosa:
        y, r = librosa.load(file, sr=sr, mono=False, offset=offset, duration=duration,
                            dtype='float32')  # always load as float32, then convert to the target dtype
elif has_snf:
        y, r = sound_file_load(file, offset=offset, dtype=dtype, duration=duration)
else:
assert False, 'not implemented error'
##
assert (y.ndim == 1 and len(y) > 0) or (y.ndim == 2 and len(y[0]) > 0), 'audio file {} looks empty'.format(file)
if mono:
y = to_mono(y, mono_type)
if sr is not None and sr != r:
y = resample(y, r, sr)
r = sr
if normal:
# print('before nom',np.max(y))
y = normalize(y, norm_type, norm_mul_factor)
# print('after norm',np.max(y))
#plot(y)
#show()
    if dtype in ['int8', 'int16'] and (not normal or norm_type == 'gaussian'):
y = normalize(y, 'linear', 1.0) # do normalization before converting to target dtype
y = depth_convert(y, dtype)
#figure
#plot(y)
#show()
return y, r
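# Illustrative usage (not part of this commit):
#     y, r = load('in.wav', sr=32000, mono=True, normal=True)  # float32 in [-1, 1]
#     save(y, r, 'out.wav')  # non-int input is converted to int16 with a warning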
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .dcase import TAUUrbanAcousticScenes_2020_Mobile_DevelopmentSet
from .esc50 import ESC50
from .gtzan import GTZAN
from .urban_sound import UrbanSound8K
__all__ = [
'ESC50',
'UrbanSound8K',
'GTZAN',
'TAUUrbanAcousticScenes_2020_Mobile_DevelopmentSet',
]
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from typing import List, Tuple
import librosa
import numpy as np
import paddle
from tqdm import tqdm
from ..features import linear_spect, log_spect, mel_spect
from ..utils.log import logger
class AudioClassificationDataset(paddle.io.Dataset):
"""
Base class of audio classification dataset.
"""
_feat_func = {
'raw': None,
'mel_spect': mel_spect,
'linear_spect': linear_spect,
'log_spect': log_spect,
}
def __init__(self, files: List[str], labels: List[int], sample_rate: int, feat_type: str = 'raw', **kwargs):
"""
        Args:
            files (:obj:`List[str]`): A list of absolute paths of audio files.
            labels (:obj:`List[int]`): Labels of audio files.
            sample_rate (:obj:`int`): Sample rate of audio files.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that the user wants to extract from an audio file.
"""
super(AudioClassificationDataset, self).__init__()
if feat_type not in self._feat_func.keys():
raise RuntimeError(\
f"Unknown feat_type: {feat_type}, it must be one in {list(self._feat_func.keys())}")
self.feat_type = feat_type
self.files = files
self.labels = labels
self.records = self._convert_to_records(sample_rate, **kwargs)
def _get_data(self, input_file: str):
raise NotImplementedError
def _convert_to_records(self, sample_rate: int, **kwargs) -> List[dict]:
records = []
feat_func = self._feat_func[self.feat_type]
logger.info('Start extracting features from audio files.')
for file, label in tqdm(zip(self.files, self.labels), total=len(self.files)):
record = {}
waveform, _ = librosa.load(file, sr=sample_rate)
record['feat'] = feat_func(waveform, **kwargs) if feat_func else waveform
record['label'] = label
records.append(record)
return records
def __getitem__(self, idx):
record = self.records[idx]
return np.array(record['feat']), np.array(record['label'], dtype=np.int64)
def __len__(self):
return len(self.records)
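# Illustrative subclass sketch (not part of this commit): a concrete dataset
# only needs to supply files/labels and pick a feat_type from _feat_func
# ('raw', 'mel_spect', 'linear_spect', 'log_spect'):
#
#     class MyClips(AudioClassificationDataset):
#         def __init__(self, files, labels, feat_type='mel_spect', **kwargs):
#             super(MyClips, self).__init__(files=files, labels=labels,
#                                           sample_rate=32000, feat_type=feat_type,
#                                           **kwargs)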
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
from typing import List, Tuple
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from ..utils.log import logger
from .dataset import AudioClassificationDataset
__all__ = ['TAUUrbanAcousticScenes_2020_Mobile_DevelopmentSet']
class TAUUrbanAcousticScenes_2020_Mobile_DevelopmentSet(AudioClassificationDataset):
"""
TAU Urban Acoustic Scenes 2020 Mobile Development dataset
This dataset is used in DCASE2020 - Task 1, Acoustic scene classification / Subtask A / Development
"""
source_url = 'https://zenodo.org/record/3819968/files/'
base_name = 'TAU-urban-acoustic-scenes-2020-mobile-development'
archieves = [
{
'url': source_url + base_name + '.meta.zip',
'md5': '6eae9db553ce48e4ea246e34e50a3cf5',
},
{
'url': source_url + base_name + '.audio.1.zip',
'md5': 'b1e85b8a908d3d6a6ab73268f385d5c8',
},
{
'url': source_url + base_name + '.audio.2.zip',
'md5': '4310a13cc2943d6ce3f70eba7ba4c784',
},
{
'url': source_url + base_name + '.audio.3.zip',
'md5': 'ed38956c4246abb56190c1e9b602b7b8',
},
{
'url': source_url + base_name + '.audio.4.zip',
'md5': '97ab8560056b6816808dedc044dcc023',
},
{
'url': source_url + base_name + '.audio.5.zip',
'md5': 'b50f5e0bfed33cd8e52cb3e7f815c6cb',
},
{
'url': source_url + base_name + '.audio.6.zip',
'md5': 'fbf856a3a86fff7520549c899dc94372',
},
{
'url': source_url + base_name + '.audio.7.zip',
'md5': '0dbffe7b6e45564da649378723284062',
},
{
'url': source_url + base_name + '.audio.8.zip',
'md5': 'bb6f77832bf0bd9f786f965beb251b2e',
},
{
'url': source_url + base_name + '.audio.9.zip',
'md5': 'a65596a5372eab10c78e08a0de797c9e',
},
{
'url': source_url + base_name + '.audio.10.zip',
'md5': '2ad595819ffa1d56d2de4c7ed43205a6',
},
{
'url': source_url + base_name + '.audio.11.zip',
'md5': '0ad29f7040a4e6a22cfd639b3a6738e5',
},
{
'url': source_url + base_name + '.audio.12.zip',
'md5': 'e5f4400c6b9697295fab4cf507155a2f',
},
{
'url': source_url + base_name + '.audio.13.zip',
'md5': '8855ab9f9896422746ab4c5d89d8da2f',
},
{
'url': source_url + base_name + '.audio.14.zip',
'md5': '092ad744452cd3e7de78f988a3d13020',
},
{
'url': source_url + base_name + '.audio.15.zip',
'md5': '4b5eb85f6592aebf846088d9df76b420',
},
{
'url': source_url + base_name + '.audio.16.zip',
'md5': '2e0a89723e58a3836be019e6996ae460',
},
]
label_list = ['airport', 'shopping_mall', 'metro_station', 'street_pedestrian', \
'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park']
meta = os.path.join(base_name, 'meta.csv')
meta_info = collections.namedtuple('META_INFO', ('filename', 'scene_label', 'identifier', 'source_label'))
subset_meta = {
'train': os.path.join(base_name, 'evaluation_setup', 'fold1_train.csv'),
'dev': os.path.join(base_name, 'evaluation_setup', 'fold1_evaluate.csv'),
'test': os.path.join(base_name, 'evaluation_setup', 'fold1_test.csv'),
}
subset_meta_info = collections.namedtuple('SUBSET_META_INFO', ('filename', 'scene_label'))
audio_path = os.path.join(base_name, 'audio')
sample_rate = 44100 # 44.1 khz
duration = 10 # 10s
def __init__(self, mode: str = 'train', feat_type: str = 'raw', **kwargs):
"""
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that the user wants to extract from an audio file.
"""
files, labels = self._get_data(mode)
super(TAUUrbanAcousticScenes_2020_Mobile_DevelopmentSet, \
self).__init__(files=files,
labels=labels,
sample_rate=self.sample_rate,
feat_type=feat_type,
**kwargs)
def _get_meta_info(self, subset: str = None, skip_header: bool = True) -> List[collections.namedtuple]:
if subset is None:
meta_file = self.meta
meta_info = self.meta_info
else:
assert subset in self.subset_meta, f'Subset must be one in {list(self.subset_meta.keys())}, but got {subset}.'
meta_file = self.subset_meta[subset]
meta_info = self.subset_meta_info
ret = []
with open(os.path.join(DATA_HOME, meta_file), 'r') as rf:
lines = rf.readlines()[1:] if skip_header else rf.readlines()
for line in lines:
ret.append(meta_info(*line.strip().split('\t')))
return ret
def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
download_and_decompress(self.archieves, DATA_HOME)
meta_info = self._get_meta_info(subset=mode, skip_header=True)
files = []
labels = []
for sample in meta_info:
filename, label = sample
filename = os.path.basename(filename)
target = self.label_list.index(label)
files.append(os.path.join(DATA_HOME, self.audio_path, filename))
labels.append(int(target))
return files, labels
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
from typing import List, Tuple
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from ..utils.log import logger
from .dataset import AudioClassificationDataset
__all__ = ['ESC50']
class ESC50(AudioClassificationDataset):
"""
Environment Sound Classification Dataset
"""
archieves = [
{
'url': 'https://github.com/karoldvl/ESC-50/archive/master.zip',
'md5': '70aba3bada37d2674b8f6cd5afd5f065',
},
]
meta = os.path.join('ESC-50-master', 'meta', 'esc50.csv')
meta_info = collections.namedtuple('META_INFO',
('filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take'))
audio_path = os.path.join('ESC-50-master', 'audio')
sample_rate = 44100 # 44.1 khz
duration = 5 # 5s
def __init__(self, mode: str = 'train', split: int = 1, feat_type: str = 'raw', **kwargs):
"""
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            split (:obj:`int`, `optional`, defaults to 1):
                It specifies the fold of the dev dataset.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that the user wants to extract from an audio file.
"""
files, labels = self._get_data(mode, split)
super(ESC50, self).__init__(files=files,
labels=labels,
sample_rate=self.sample_rate,
feat_type=feat_type,
**kwargs)
def _get_meta_info(self) -> List[collections.namedtuple]:
ret = []
with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
for line in rf.readlines()[1:]:
ret.append(self.meta_info(*line.strip().split(',')))
return ret
def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
download_and_decompress(self.archieves, DATA_HOME)
meta_info = self._get_meta_info()
files = []
labels = []
for sample in meta_info:
filename, fold, target, _, _, _, _ = sample
if mode == 'train' and int(fold) != split:
files.append(os.path.join(DATA_HOME, self.audio_path, filename))
labels.append(int(target))
if mode != 'train' and int(fold) == split:
files.append(os.path.join(DATA_HOME, self.audio_path, filename))
labels.append(int(target))
return files, labels
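# Illustrative usage (not part of this commit): with split=1, folds 2-5 form
# the train set and fold 1 forms the dev set.
#     train_ds = ESC50(mode='train', split=1, feat_type='mel_spect')
#     dev_ds = ESC50(mode='dev', split=1, feat_type='mel_spect')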
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
import random
from typing import List, Tuple
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from ..utils.log import logger
from .dataset import AudioClassificationDataset
__all__ = ['GTZAN']
class GTZAN(AudioClassificationDataset):
"""
GTZAN Dataset
"""
archieves = [
{
'url': 'http://opihi.cs.uvic.ca/sound/genres.tar.gz',
'md5': '5b3d6dddb579ab49814ab86dba69e7c7',
},
]
label_list = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']
meta = os.path.join('genres', 'input.mf')
meta_info = collections.namedtuple('META_INFO', ('file_path', 'label'))
audio_path = 'genres'
    sample_rate = 22050  # 22.05 kHz
    duration = 30  # 30s
def __init__(self, mode='train', seed=0, n_folds=5, split=1, feat_type='raw', **kwargs):
"""
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            seed (:obj:`int`, `optional`, defaults to 0):
                Set the random seed to shuffle samples.
            n_folds (:obj:`int`, `optional`, defaults to 5):
                Split the dataset into n folds: 1 fold for the dev dataset and n-1 for the train dataset.
            split (:obj:`int`, `optional`, defaults to 1):
                It specifies the fold of the dev dataset.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that the user wants to extract from an audio file.
"""
        assert split <= n_folds, f'The selected split should not be larger than n_folds, but got {split} > {n_folds}'
files, labels = self._get_data(mode, seed, n_folds, split)
super(GTZAN, self).__init__(files=files,
labels=labels,
sample_rate=self.sample_rate,
feat_type=feat_type,
**kwargs)
def _get_meta_info(self) -> List[collections.namedtuple]:
ret = []
with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
for line in rf.readlines():
ret.append(self.meta_info(*line.strip().split('\t')))
return ret
def _get_data(self, mode, seed, n_folds, split) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
download_and_decompress(self.archieves, DATA_HOME)
meta_info = self._get_meta_info()
random.seed(seed) # shuffle samples to split data
random.shuffle(meta_info) # make sure using the same seed to create train and dev dataset
files = []
labels = []
n_samples_per_fold = len(meta_info) // n_folds
for idx, sample in enumerate(meta_info):
file_path, label = sample
filename = os.path.basename(file_path)
target = self.label_list.index(label)
fold = idx // n_samples_per_fold + 1
if mode == 'train' and int(fold) != split:
files.append(os.path.join(DATA_HOME, self.audio_path, label, filename))
labels.append(target)
if mode != 'train' and int(fold) == split:
files.append(os.path.join(DATA_HOME, self.audio_path, label, filename))
labels.append(target)
return files, labels
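# Illustrative usage (not part of this commit): pass the same seed to both
# splits so the shuffled fold assignment stays consistent between them.
#     train_ds = GTZAN(mode='train', seed=0, n_folds=5, split=1)
#     dev_ds = GTZAN(mode='dev', seed=0, n_folds=5, split=1)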
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
from typing import List, Tuple
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from ..utils.log import logger
from .dataset import AudioClassificationDataset
__all__ = ['UrbanSound8K']
class UrbanSound8K(AudioClassificationDataset):
"""
UrbanSound8K Dataset
"""
archieves = [
{
'url': 'https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz',
'md5': '9aa69802bbf37fb986f71ec1483a196e',
},
]
meta = os.path.join('UrbanSound8K', 'metadata', 'UrbanSound8K.csv')
meta_info = collections.namedtuple('META_INFO',
('filename', 'fsid', 'start', 'end', 'salience', 'fold', 'class_id', 'label'))
audio_path = os.path.join('UrbanSound8K', 'audio')
sample_rate = 48000 # 48 khz
duration = 4 # 4s
    def __init__(self, mode: str = 'train', split: int = 1, feat_type: str = 'raw', **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            split (:obj:`int`, `optional`, defaults to 1):
                It specifies the fold of the dev dataset.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                It identifies the feature type that the user wants to extract from an audio file.
        """
        files, labels = self._get_data(mode, split)
        super(UrbanSound8K, self).__init__(files=files,
                                           labels=labels,
                                           sample_rate=self.sample_rate,
                                           feat_type=feat_type,
                                           **kwargs)
def _get_meta_info(self):
ret = []
with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
for line in rf.readlines()[1:]:
ret.append(self.meta_info(*line.strip().split(',')))
return ret
def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
download_and_decompress(self.archieves, DATA_HOME)
meta_info = self._get_meta_info()
files = []
labels = []
for sample in meta_info:
filename, _, _, _, _, fold, target, _ = sample
if mode == 'train' and int(fold) != split:
files.append(os.path.join(DATA_HOME, self.audio_path, f'fold{fold}', filename))
labels.append(int(target))
if mode != 'train' and int(fold) == split:
files.append(os.path.join(DATA_HOME, self.audio_path, f'fold{fold}', filename))
labels.append(int(target))
return files, labels
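# Illustrative usage sketch, not part of the original commit (assumes
# AudioClassificationDataset subclasses paddle.io.Dataset): fold `split` is
# held out as the dev set and the remaining nine folds form the train set.
def _demo_urbansound8k():
    # NOTE: instantiating the dataset triggers the full archive download on first use.
    from paddle.io import DataLoader
    train_ds = UrbanSound8K(mode='train', split=1)  # folds 2-10
    dev_ds = UrbanSound8K(mode='dev', split=1)      # fold 1 held out
    return DataLoader(train_ds, batch_size=32, shuffle=True), dev_ds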
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .augmentation import *
from .features import *
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
from ..backends import depth_convert
from .utils import randint, weighted_sampling
__all__ = ['depth_augment', 'spect_augment', 'random_crop1d', 'random_crop2d']
def depth_augment(y, choices=['int8', 'int16'], probs=[0.5, 0.5]):
    """Augment a waveform by converting it to a randomly chosen bit depth and back,
    e.g. y = depth_augment(y, ['int8', 'int16'], [0.8, 0.1]).
    """
    assert len(probs) == len(choices), 'number of choices {} must be equal to size of probs {}'.format(
        len(choices), len(probs))
    k = weighted_sampling(probs)
    src_depth = y.dtype
    y1 = depth_convert(y, choices[k])
    y2 = depth_convert(y1, src_depth)
    return y2
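# Illustrative sketch, not part of the original commit (assumes depth_convert
# accepts float32 numpy input): the waveform is quantized to a randomly chosen
# lower bit depth and converted back, injecting quantization noise while
# preserving the original dtype.
def _demo_depth_augment():
    y = np.random.uniform(-1.0, 1.0, 16000).astype('float32')
    y_aug = depth_augment(y, choices=['int8', 'int16'], probs=[0.8, 0.2])
    assert y_aug.dtype == y.dtype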
def adaptive_spect_augment(spect, tempo_axis=0, level=0.1):
    assert spect.ndim == 2, 'only supports 2d tensor or numpy array'
if tempo_axis == 0:
nt, nf = spect.shape
else:
nf, nt = spect.shape
time_mask_width = int(nt * level * 0.5)
freq_mask_width = int(nf * level * 0.5)
num_time_mask = int(10 * level)
num_freq_mask = int(10 * level)
# num_zeros = num_time_mask*time_mask_width*nf + num_freq_mask*freq_mask_width*nt
# factor = (nt*nf)/(nt*nf-num_zeros)
if tempo_axis == 0:
for i in range(num_time_mask):
start = randint(nt - time_mask_width)
spect[start:start + time_mask_width, :] = 0
for i in range(num_freq_mask):
start = randint(nf - freq_mask_width)
spect[:, start:start + freq_mask_width] = 0
else:
for i in range(num_time_mask):
start = randint(nt - time_mask_width)
spect[:, start:start + time_mask_width] = 0
for i in range(num_freq_mask):
start = randint(nf - freq_mask_width)
spect[start:start + freq_mask_width, :] = 0
return spect
def spect_augment(
spect,
tempo_axis=0,
max_time_mask=3,
max_freq_mask=3,
max_time_mask_width=30,
max_freq_mask_width=20,
):
    assert spect.ndim == 2, 'only supports 2d tensor or numpy array'
if tempo_axis == 0:
nt, nf = spect.shape
else:
nf, nt = spect.shape
num_time_mask = randint(max_time_mask)
num_freq_mask = randint(max_freq_mask)
time_mask_width = randint(max_time_mask_width)
freq_mask_width = randint(max_freq_mask_width)
if tempo_axis == 0:
for i in range(num_time_mask):
start = randint(nt - time_mask_width)
spect[start:start + time_mask_width, :] = 0
for i in range(num_freq_mask):
start = randint(nf - freq_mask_width)
spect[:, start:start + freq_mask_width] = 0
else:
for i in range(num_time_mask):
start = randint(nt - time_mask_width)
spect[:, start:start + time_mask_width] = 0
for i in range(num_freq_mask):
start = randint(nf - freq_mask_width)
spect[start:start + freq_mask_width, :] = 0
return spect
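# Illustrative sketch, not part of the original commit. Note that the masks
# are written into the input array itself, so pass a copy if the original
# spectrogram must be kept intact.
def _demo_spect_augment():
    spect = np.random.randn(100, 64).astype('float32')  # (time, freq)
    augmented = spect_augment(spect.copy(), tempo_axis=0)
    assert augmented.shape == (100, 64)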
def random_crop1d(y, crop_len):
assert y.ndim == 1, 'only accept 1d tensor or numpy array'
n = len(y)
idx = randint(n - crop_len)
return y[idx:idx + crop_len]
def random_crop2d(s, crop_len, tempo_axis=0):  # random crop along the temporal direction
    assert tempo_axis < s.ndim, 'axis out of range'
    n = s.shape[tempo_axis]
    idx = randint(high=n - crop_len)
    if isinstance(s, np.ndarray):
        sli = [slice(None) for i in range(s.ndim)]
        sli[tempo_axis] = slice(idx, idx + crop_len)
        out = s[tuple(sli)]
    else:
        out = paddle.index_select(s, paddle.to_tensor(np.arange(idx, idx + crop_len)), axis=tempo_axis)
    return out
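# Illustrative sketch, not part of the original commit, showing both cropping
# helpers on numpy inputs.
def _demo_random_crop():
    y = np.random.randn(16000).astype('float32')
    assert random_crop1d(y, crop_len=8000).shape == (8000, )
    s = np.random.randn(100, 64).astype('float32')  # (time, freq)
    assert random_crop2d(s, crop_len=50, tempo_axis=0).shape == (50, 64)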
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import librosa
import numpy as np
import paddle
__all__ = ['mel_spect', 'linear_spect', 'log_spect']
#mel
def mel_spect(y,
sample_rate=16000,
window_size=512,
hop_length=320,
mel_bins=64,
fmin=50,
fmax=14000,
window='hann',
center=True,
pad_mode='reflect',
ref=1.0,
amin=1e-10,
top_db=None):
""" compute mel-spectrogram from input waveform y.
Create a Mel filter-bank.
This produces a linear transformation matrix to project
FFT bins onto Mel-frequency bins.
"""
s = librosa.stft(y,
n_fft=window_size,
hop_length=hop_length,
win_length=window_size,
window=window,
center=center,
pad_mode=pad_mode)
power = np.abs(s)**2
melW = librosa.filters.mel(sr=sample_rate, n_fft=window_size, n_mels=mel_bins, fmin=fmin, fmax=fmax)
mel = np.matmul(melW, power)
    db = librosa.power_to_db(mel, ref=ref, amin=amin, top_db=top_db)
return db
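# Illustrative sketch, not part of the original commit: with the defaults and
# center=True, one second of 16 kHz audio yields 1 + 16000 // 320 = 51 frames.
def _demo_mel_spect():
    y = np.zeros(16000, dtype='float32')
    feat = mel_spect(y, sample_rate=16000, window_size=512, hop_length=320, mel_bins=64)
    assert feat.shape == (64, 51)  # (mel_bins, num_frames)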
def linear_spect(y,
sample_rate=16000,
window_size=512,
hop_length=320,
window='hann',
center=True,
pad_mode='reflect',
power=2):
s = librosa.stft(y,
n_fft=window_size,
hop_length=hop_length,
win_length=window_size,
window=window,
center=center,
pad_mode=pad_mode)
return np.abs(s)**power
def log_spect(y,
sample_rate=16000,
window_size=512,
hop_length=320,
window='hann',
center=True,
pad_mode='reflect',
power=2.0,
offset=1.0):
s = librosa.stft(
y,
n_fft=window_size,
hop_length=hop_length,
win_length=window_size,
window=window,
center=center,
pad_mode=pad_mode,
)
s = np.abs(s)**power
    return np.log(offset + s)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
__all__ = ['randint', 'rand', 'weighted_sampling']
def randint(high, use_paddle=True):
if use_paddle:
return int(paddle.randint(0, high=high))
return int(np.random.randint(0, high=high))
def rand(use_paddle=True):
if use_paddle:
return float(paddle.rand((1, )))
return float(np.random.rand(1))
def weighted_sampling(weights):
n = len(weights)
w = np.cumsum(weights)
w = w / w[-1]
flag = rand() < w
return np.argwhere(flag)[0][0]
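# Illustrative sketch, not part of the original commit: with weights
# [0.1, 0.9], index 1 should be drawn roughly nine times as often as index 0.
def _demo_weighted_sampling():
    import collections
    counts = collections.Counter(weighted_sampling([0.1, 0.9]) for _ in range(1000))
    assert counts[1] > counts[0]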
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .cnn6 import CNN6
from .cnn10 import CNN10
from .cnn14 import CNN14
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ...utils.log import logger
from .conv import ConvBlock
class CNN10(nn.Layer):
"""
The CNN10(14-layer CNNs) mainly consist of 4 convolutional blocks while each convolutional
block consists of 2 convolutional layers with a kernel size of 3 × 3.
Reference:
PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition
https://arxiv.org/pdf/1912.10211.pdf
"""
emb_size = 512
def __init__(self, extract_embedding: bool = True, checkpoint: str = None):
super(CNN10, self).__init__()
self.bn0 = nn.BatchNorm2D(64)
self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
self.fc1 = nn.Linear(512, self.emb_size)
self.fc_audioset = nn.Linear(self.emb_size, 527)
        if checkpoint is not None and os.path.isfile(checkpoint):
            state_dict = paddle.load(checkpoint)
            self.set_state_dict(state_dict)
            logger.info(f'Loaded CNN10 pretrained parameters from: {checkpoint}')
        else:
            logger.warning('No valid checkpoints for CNN10. Start training from scratch.')
self.extract_embedding = extract_embedding
def forward(self, x):
x.stop_gradient = False
x = x.transpose([0, 3, 2, 1])
x = self.bn0(x)
x = x.transpose([0, 3, 2, 1])
x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = x.mean(axis=3)
x = x.max(axis=2) + x.mean(axis=2)
x = F.dropout(x, p=0.5, training=self.training)
x = F.relu(self.fc1(x))
if self.extract_embedding:
output = F.dropout(x, p=0.5, training=self.training)
else:
output = F.sigmoid(self.fc_audioset(x))
return output
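# Illustrative smoke test, not part of the original commit. The forward pass
# above implies an input layout of (batch, 1, time_frames, 64 mel bins): the
# two transposes move the 64-bin mel axis into the channel position for bn0
# and back again before the conv blocks.
def _demo_cnn10():
    model = CNN10(extract_embedding=True, checkpoint=None)
    x = paddle.randn([8, 1, 128, 64])
    emb = model(x)  # four (2, 2) poolings: 128x64 -> 8x4, then pooled to [8, 512]
    assert list(emb.shape) == [8, CNN10.emb_size]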
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ...utils.log import logger
from .conv import ConvBlock
class CNN14(nn.Layer):
"""
    The CNN14 (14-layer CNN) mainly consists of 6 convolutional blocks, where each
    convolutional block consists of 2 convolutional layers with a kernel size of 3 × 3.
Reference:
PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition
https://arxiv.org/pdf/1912.10211.pdf
"""
emb_size = 2048
def __init__(self, extract_embedding: bool = True, checkpoint: str = None):
super(CNN14, self).__init__()
self.bn0 = nn.BatchNorm2D(64)
self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)
self.fc1 = nn.Linear(2048, self.emb_size)
self.fc_audioset = nn.Linear(self.emb_size, 527)
if checkpoint is not None and os.path.isfile(checkpoint):
state_dict = paddle.load(checkpoint)
self.set_state_dict(state_dict)
logger.info(f'Loaded CNN14 pretrained parameters from: {checkpoint}')
else:
            logger.warning('No valid checkpoints for CNN14. Start training from scratch.')
self.extract_embedding = extract_embedding
def forward(self, x):
x.stop_gradient = False
x = x.transpose([0, 3, 2, 1])
x = self.bn0(x)
x = x.transpose([0, 3, 2, 1])
x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block5(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block6(x, pool_size=(1, 1), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = x.mean(axis=3)
x = x.max(axis=2) + x.mean(axis=2)
x = F.dropout(x, p=0.5, training=self.training)
x = F.relu(self.fc1(x))
if self.extract_embedding:
output = F.dropout(x, p=0.5, training=self.training)
else:
output = F.sigmoid(self.fc_audioset(x))
return output
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ...utils.log import logger
from .conv import ConvBlock5x5
class CNN6(nn.Layer):
"""
    The CNN6 (6-layer CNN) mainly consists of 4 convolutional blocks, where each
    convolutional block consists of 1 convolutional layer with a kernel size of 5 × 5.
Reference:
PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition
https://arxiv.org/pdf/1912.10211.pdf
"""
emb_size = 512
def __init__(self, extract_embedding: bool = True, checkpoint: str = None):
super(CNN6, self).__init__()
self.bn0 = nn.BatchNorm2D(64)
self.conv_block1 = ConvBlock5x5(in_channels=1, out_channels=64)
self.conv_block2 = ConvBlock5x5(in_channels=64, out_channels=128)
self.conv_block3 = ConvBlock5x5(in_channels=128, out_channels=256)
self.conv_block4 = ConvBlock5x5(in_channels=256, out_channels=512)
self.fc1 = nn.Linear(512, self.emb_size)
self.fc_audioset = nn.Linear(self.emb_size, 527)
        if checkpoint is not None and os.path.isfile(checkpoint):
            state_dict = paddle.load(checkpoint)
            self.set_state_dict(state_dict)
            logger.info(f'Loaded CNN6 pretrained parameters from: {checkpoint}')
        else:
            logger.warning('No valid checkpoints for CNN6. Start training from scratch.')
self.extract_embedding = extract_embedding
def forward(self, x):
x.stop_gradient = False
x = x.transpose([0, 3, 2, 1])
x = self.bn0(x)
x = x.transpose([0, 3, 2, 1])
x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
x = F.dropout(x, p=0.2, training=self.training)
x = x.mean(axis=3)
x = x.max(axis=2) + x.mean(axis=2)
x = F.dropout(x, p=0.5, training=self.training)
x = F.relu(self.fc1(x))
if self.extract_embedding:
output = F.dropout(x, p=0.5, training=self.training)
else:
output = F.sigmoid(self.fc_audioset(x))
return output
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
class ConvBlock(nn.Layer):
def __init__(self, in_channels, out_channels):
super(ConvBlock, self).__init__()
self.conv1 = nn.Conv2D(in_channels=in_channels,
out_channels=out_channels,
kernel_size=(3, 3),
stride=(1, 1),
padding=(1, 1),
bias_attr=False)
self.conv2 = nn.Conv2D(in_channels=out_channels,
out_channels=out_channels,
kernel_size=(3, 3),
stride=(1, 1),
padding=(1, 1),
bias_attr=False)
self.bn1 = nn.BatchNorm2D(out_channels)
self.bn2 = nn.BatchNorm2D(out_channels)
def forward(self, x, pool_size=(2, 2), pool_type='avg'):
x = self.conv1(x)
x = self.bn1(x)
x = F.relu(x)
x = self.conv2(x)
x = self.bn2(x)
x = F.relu(x)
if pool_type == 'max':
x = F.max_pool2d(x, kernel_size=pool_size)
elif pool_type == 'avg':
x = F.avg_pool2d(x, kernel_size=pool_size)
elif pool_type == 'avg+max':
x = F.avg_pool2d(x, kernel_size=pool_size) + F.max_pool2d(x, kernel_size=pool_size)
else:
raise Exception(
f'Pooling type of {pool_type} is not supported. It must be one of "max", "avg" and "avg+max".')
return x
class ConvBlock5x5(nn.Layer):
def __init__(self, in_channels, out_channels):
super(ConvBlock5x5, self).__init__()
self.conv1 = nn.Conv2D(in_channels=in_channels,
out_channels=out_channels,
kernel_size=(5, 5),
stride=(1, 1),
padding=(2, 2),
bias_attr=False)
self.bn1 = nn.BatchNorm2D(out_channels)
def forward(self, x, pool_size=(2, 2), pool_type='avg'):
x = self.conv1(x)
x = self.bn1(x)
x = F.relu(x)
if pool_type == 'max':
x = F.max_pool2d(x, kernel_size=pool_size)
elif pool_type == 'avg':
x = F.avg_pool2d(x, kernel_size=pool_size)
elif pool_type == 'avg+max':
x = F.avg_pool2d(x, kernel_size=pool_size) + F.max_pool2d(x, kernel_size=pool_size)
else:
raise Exception(
f'Pooling type of {pool_type} is not supported. It must be one of "max", "avg" and "avg+max".')
return x
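# Illustrative sketch, not part of the original commit: each ConvBlock keeps
# the spatial size through its 3x3 convs and halves it via the (2, 2) pooling.
def _demo_conv_block():
    block = ConvBlock(in_channels=1, out_channels=64)
    x = paddle.randn([2, 1, 100, 64])
    y = block(x, pool_size=(2, 2), pool_type='avg')
    assert list(y.shape) == [2, 64, 50, 32]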
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .PANNs import CNN14, CNN10, CNN6
__all__ = [
'CNN14',
'CNN10',
'CNN6',
]
import argparse
import glob
import os
import h5py
import numpy as np
import tqdm
import paddleaudio as pa
parser = argparse.ArgumentParser(description='wave2mel')
parser.add_argument('--wav_file', type=str, required=False, default='')
parser.add_argument('--wav_list', type=str, required=False, default='')
parser.add_argument('--wav_h5_file', type=str, required=False, default='')
parser.add_argument('--wav_h5_list', type=str, required=False, default='')
parser.add_argument('--output_folder', type=str, required=False, default='./')
parser.add_argument('--output_h5', type=bool, required=False, default=True)
parser.add_argument('--sample_rate', type=int, required=False, default=32000)
parser.add_argument('--window_size', type=int, required=False, default=1024)
parser.add_argument('--mel_bins', type=int, required=False, default=128)
parser.add_argument('--hop_length', type=int, required=False, default=640)  # 20 ms at 32 kHz
parser.add_argument('--fmin', type=int, required=False, default=50)
parser.add_argument('--fmax', type=int, required=False, default=16000)
parser.add_argument('--skip_existed', type=int, required=False, default=1)
args = parser.parse_args()
assert not (args.wav_h5_file == '' and args.wav_h5_list == '' and args.wav_list == ''
            and args.wav_file == ''), 'one of wav_file, wav_list, wav_h5_file, wav_h5_list must be specified'
if args.wav_h5_file != '':
h5_files = [args.wav_h5_file]
if args.wav_h5_list != '':
h5_files = open(args.wav_h5_list).read().split('\n')
h5_files = [h for h in h5_files if len(h.strip()) != 0]
dst_folder = args.output_folder
print(f'{len(h5_files)} h5 files listed')
for f in h5_files:
print(f'processing {f}')
dst_file = os.path.join(dst_folder, f.split('/')[-1])
print(f'target file {dst_file}')
if args.skip_existed != 0 and os.path.exists(dst_file):
print(f'skipped file {f}')
continue
    assert not os.path.exists(dst_file), f'target file {dst_file} already exists'
    src_h5 = h5py.File(f, 'r')
dst_h5 = h5py.File(dst_file, "w")
for key in tqdm.tqdm(src_h5.keys()):
s = src_h5[key][:]
s = pa.depth_convert(s, 'float32')
# s = pa.resample(s,32000,args.sample_rate)
x = pa.features.mel_spect(s,
sample_rate=args.sample_rate,
window_size=args.window_size,
hop_length=args.hop_length,
mel_bins=args.mel_bins,
fmin=args.fmin,
fmax=args.fmax,
window='hann',
center=True,
pad_mode='reflect',
ref=1.0,
amin=1e-10,
top_db=None)
dst_h5.create_dataset(key, data=x)
src_h5.close()
dst_h5.close()
import argparse
import glob
import os
import h5py
import numpy as np
import tqdm
import paddleaudio as pa
parser = argparse.ArgumentParser(description='wave2mel')
parser.add_argument('--wav_file', type=str, required=False, default='')
parser.add_argument('--wav_list', type=str, required=False, default='')
parser.add_argument('--wav_h5_file', type=str, required=False, default='')
parser.add_argument('--wav_h5_list', type=str, required=False, default='')
parser.add_argument('--output_folder', type=str, required=False, default='./')
parser.add_argument('--output_h5', type=bool, required=False, default=True)
parser.add_argument('--dst_h5_file', type=str, required=False, default='')
parser.add_argument('--sample_rate', type=int, required=False, default=32000)
parser.add_argument('--window_size', type=int, required=False, default=1024)
parser.add_argument('--mel_bins', type=int, required=False, default=128)
parser.add_argument('--hop_length', type=int, required=False, default=640)  # 20 ms at 32 kHz
parser.add_argument('--fmin', type=int, required=False, default=50)
parser.add_argument('--fmax', type=int, required=False, default=16000)
parser.add_argument('--skip_existed', type=int, required=False, default=1)
args = parser.parse_args()
assert not (args.wav_h5_file == '' and args.wav_h5_list == '' and args.wav_list == ''
            and args.wav_file == ''), 'one of wav_file, wav_list, wav_h5_file, wav_h5_list must be specified'
h5_files = []
wav_files = []
if args.wav_h5_file != '':
h5_files = [args.wav_h5_file]
elif args.wav_h5_list != '':
h5_files = open(args.wav_h5_list).read().split('\n')
h5_files = [h for h in h5_files if len(h.strip()) != 0]
elif args.wav_list != '':
wav_files = open(args.wav_list).read().split('\n')
wav_files = [h for h in wav_files if len(h.strip()) != 0]
elif args.wav_file != '':
wav_files = [args.wav_file]
dst_folder = args.output_folder
if len(h5_files) > 0:
print(f'{len(h5_files)} h5 files listed')
for f in h5_files:
print(f'processing {f}')
dst_file = os.path.join(dst_folder, f.split('/')[-1])
print(f'target file {dst_file}')
if args.skip_existed != 0 and os.path.exists(dst_file):
print(f'skipped file {f}')
continue
        assert not os.path.exists(dst_file), f'target file {dst_file} already exists'
        src_h5 = h5py.File(f, 'r')
dst_h5 = h5py.File(dst_file, "w")
for key in tqdm.tqdm(src_h5.keys()):
s = src_h5[key][:]
s = pa.depth_convert(s, 'float32')
# s = pa.resample(s,32000,args.sample_rate)
x = pa.features.mel_spect(s,
sample_rate=args.sample_rate,
window_size=args.window_size,
hop_length=args.hop_length,
mel_bins=args.mel_bins,
fmin=args.fmin,
fmax=args.fmax,
window='hann',
center=True,
pad_mode='reflect',
ref=1.0,
amin=1e-10,
top_db=None)
dst_h5.create_dataset(key, data=x)
src_h5.close()
dst_h5.close()
if len(wav_files) > 0:
assert args.dst_h5_file != '', 'for using wav file or wav list, dst_h5_file must be specified'
dst_file = args.dst_h5_file
    assert not os.path.exists(dst_file), f'target file {dst_file} already exists'
dst_h5 = h5py.File(dst_file, "w")
print(f'{len(wav_files)} wav files listed')
for f in tqdm.tqdm(wav_files):
s, _ = pa.load(f, sr=args.sample_rate)
# s = pa.resample(s,32000,args.sample_rate)
x = pa.features.mel_spect(s,
sample_rate=args.sample_rate,
window_size=args.window_size,
hop_length=args.hop_length,
mel_bins=args.mel_bins,
fmin=args.fmin,
fmax=args.fmax,
window='hann',
center=True,
pad_mode='reflect',
ref=1.0,
amin=1e-10,
top_db=None)
        key = f.split('/')[-1][:11]  # first 11 chars of the filename (e.g. an AudioSet YouTube ID)
dst_h5.create_dataset(key, data=x)
dst_h5.close()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict, List
from paddle.utils import download
from .log import logger
download.logger = logger
def download_and_decompress(archives: List[Dict[str, str]], path: str):
    """
    Download archives and decompress them to the specified path.
    """
    for archive in archives:
        assert 'url' in archive and 'md5' in archive, \
            f'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archive.keys())}'
logger.info(f'Downloading from: {archive["url"]}')
download.get_path_from_url(archive['url'], path, archive['md5'])
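# Illustrative sketch, not part of the original commit, reusing the
# UrbanSound8K archive entry defined earlier in this commit. Each entry must
# carry both a 'url' and an 'md5' key; calling this really downloads the
# full archive.
def _demo_download():
    import os
    target = os.path.join(os.path.expanduser('~'), '.paddleaudio', 'datasets')  # the default DATA_HOME
    archives = [{
        'url': 'https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz',
        'md5': '9aa69802bbf37fb986f71ec1483a196e',
    }]
    download_and_decompress(archives, target)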
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
This module is used to store environmental variables in PaddleAudio.

PPAUDIO_HOME    -->  the root directory for storing PaddleAudio related data. Defaults to
                     ~/.paddleaudio. Users can change the default value through the
                     PPAUDIO_HOME environment variable.
├─ MODEL_HOME   -->  Store model files.
└─ DATA_HOME    -->  Store automatically downloaded datasets.
'''
import os
def _get_user_home():
return os.path.expanduser('~')
def _get_ppaudio_home():
if 'PPAUDIO_HOME' in os.environ:
home_path = os.environ['PPAUDIO_HOME']
if os.path.exists(home_path):
if os.path.isdir(home_path):
return home_path
else:
raise RuntimeError('The environment variable PPAUDIO_HOME {} is not a directory.'.format(home_path))
else:
return home_path
return os.path.join(_get_user_home(), '.paddleaudio')
def _get_sub_home(directory):
home = os.path.join(_get_ppaudio_home(), directory)
if not os.path.exists(home):
os.makedirs(home)
return home
USER_HOME = _get_user_home()
PPAUDIO_HOME = _get_ppaudio_home()
MODEL_HOME = _get_sub_home('models')
DATA_HOME = _get_sub_home('datasets')
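# Illustrative sketch, not part of the original commit: the constants above
# are resolved at import time, so PPAUDIO_HOME must be exported *before*
# paddleaudio is first imported in order to relocate them.
def _demo_env():
    print(f'PPAUDIO_HOME={PPAUDIO_HOME}')
    print(f'MODEL_HOME={MODEL_HOME}')
    print(f'DATA_HOME={DATA_HOME}')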
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import copy
import functools
import logging
import os
import sys
import threading
import time
from typing import List
import colorlog
from colorama import Fore
loggers = {}
log_config = {
'DEBUG': {
'level': 10,
'color': 'purple'
},
'INFO': {
'level': 20,
'color': 'green'
},
'TRAIN': {
'level': 21,
'color': 'cyan'
},
'EVAL': {
'level': 22,
'color': 'blue'
},
'WARNING': {
'level': 30,
'color': 'yellow'
},
'ERROR': {
'level': 40,
'color': 'red'
},
'CRITICAL': {
'level': 50,
'color': 'bold_red'
}
}
class Logger(object):
    '''
    Default logger in PaddleAudio.

    Args:
        name(str): Logger name, default is 'PaddleAudio'.
    '''
def __init__(self, name: str = None):
name = 'PaddleAudio' if not name else name
self.logger = logging.getLogger(name)
for key, conf in log_config.items():
logging.addLevelName(conf['level'], key)
self.__dict__[key] = functools.partial(self.__call__, conf['level'])
self.__dict__[key.lower()] = functools.partial(self.__call__, conf['level'])
self.format = colorlog.ColoredFormatter('%(log_color)s[%(asctime)-15s] [%(levelname)8s]%(reset)s - %(message)s',
log_colors={key: conf['color']
for key, conf in log_config.items()})
self.handler = logging.StreamHandler()
self.handler.setFormatter(self.format)
self.logger.addHandler(self.handler)
self.logLevel = 'DEBUG'
self.logger.setLevel(logging.DEBUG)
self.logger.propagate = False
self._is_enable = True
def disable(self):
self._is_enable = False
def enable(self):
self._is_enable = True
@property
def is_enable(self) -> bool:
return self._is_enable
def __call__(self, log_level: str, msg: str):
if not self.is_enable:
return
self.logger.log(log_level, msg)
@contextlib.contextmanager
def use_terminator(self, terminator: str):
old_terminator = self.handler.terminator
self.handler.terminator = terminator
yield
self.handler.terminator = old_terminator
@contextlib.contextmanager
def processing(self, msg: str, interval: float = 0.1):
'''
Continuously print a progress bar with rotating special effects.
Args:
msg(str): Message to be printed.
interval(float): Rotation interval. Default to 0.1.
'''
end = False
def _printer():
index = 0
flags = ['\\', '|', '/', '-']
while not end:
flag = flags[index % len(flags)]
with self.use_terminator('\r'):
self.info('{}: {}'.format(msg, flag))
time.sleep(interval)
index += 1
        t = threading.Thread(target=_printer)
        t.start()
        yield
        end = True
        t.join()
logger = Logger()
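# Illustrative sketch, not part of the original commit: besides the standard
# levels, the loop in __init__ registers the custom TRAIN/EVAL levels as
# lowercase helpers, and processing() animates a spinner while a long-running
# block executes.
def _demo_logger():
    logger.info('dataset ready')
    logger.train('epoch 1 | loss 0.123')
    with logger.processing('Extracting features'):
        time.sleep(0.5)  # stand-in for real work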
colorama
colorlog
easydict
filelock
gitpython
numpy
packaging
Pillow
pyyaml
pyzmq
rarfile
tqdm
librosa
import setuptools
with open("README.md", "r") as fh:
long_description = fh.read()
setuptools.setup(
name="PaddleAudio",
version="0.0.0",
author="",
author_email="",
description="PaddleAudio, in development",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/ranchlai/PaddleAudio",
packages=setuptools.find_packages(),
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
],
python_requires='>=3.6',
)