From 8a0045aae196442ade6b3c12bb6c6800005a17cd Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Fri, 23 Apr 2021 19:13:46 +0800 Subject: [PATCH] PaddleAudio initial commit (#5299) --- PaddleAudio/.gitignore | 6 + PaddleAudio/.pre-commit-config.yaml | 44 ++ PaddleAudio/.style.yapf | 3 + PaddleAudio/LICENSE | 201 +++++++ PaddleAudio/README.md | 26 + PaddleAudio/examples/audio_tagging/README.md | 136 +++++ .../audio_tagging/assets/audioset_labels.txt | 527 ++++++++++++++++++ .../examples/audio_tagging/audio_tag.py | 122 ++++ .../examples/audio_tagging/parse_result.py | 81 +++ PaddleAudio/examples/tools/wav2mel.py | 71 +++ PaddleAudio/examples/wav2mel.py | 76 +++ PaddleAudio/paddleaudio/__init__.py | 4 + PaddleAudio/paddleaudio/backends/__init__.py | 15 + PaddleAudio/paddleaudio/backends/audio.py | 241 ++++++++ PaddleAudio/paddleaudio/datasets/__init__.py | 25 + PaddleAudio/paddleaudio/datasets/dataset.py | 80 +++ PaddleAudio/paddleaudio/datasets/dcase.py | 168 ++++++ PaddleAudio/paddleaudio/datasets/esc50.py | 88 +++ PaddleAudio/paddleaudio/datasets/gtzan.py | 101 ++++ .../paddleaudio/datasets/urban_sound.py | 88 +++ PaddleAudio/paddleaudio/features/__init__.py | 16 + .../paddleaudio/features/augmentation.py | 130 +++++ PaddleAudio/paddleaudio/features/features.py | 100 ++++ PaddleAudio/paddleaudio/features/utils.py | 38 ++ .../paddleaudio/models/PANNs/__init__.py | 17 + PaddleAudio/paddleaudio/models/PANNs/cnn10.py | 85 +++ PaddleAudio/paddleaudio/models/PANNs/cnn14.py | 93 ++++ PaddleAudio/paddleaudio/models/PANNs/cnn6.py | 85 +++ PaddleAudio/paddleaudio/models/PANNs/conv.py | 86 +++ PaddleAudio/paddleaudio/models/__init__.py | 21 + PaddleAudio/paddleaudio/scripts/wav2mel.py | 76 +++ PaddleAudio/paddleaudio/scripts/wav2mel2.py | 113 ++++ PaddleAudio/paddleaudio/utils/download.py | 33 ++ PaddleAudio/paddleaudio/utils/env.py | 52 ++ PaddleAudio/paddleaudio/utils/log.py | 139 +++++ PaddleAudio/requirements.txt | 13 + PaddleAudio/setup.py | 22 + 37 files 
changed, 3222 insertions(+) create mode 100644 PaddleAudio/.gitignore create mode 100644 PaddleAudio/.pre-commit-config.yaml create mode 100644 PaddleAudio/.style.yapf create mode 100644 PaddleAudio/LICENSE create mode 100644 PaddleAudio/README.md create mode 100644 PaddleAudio/examples/audio_tagging/README.md create mode 100644 PaddleAudio/examples/audio_tagging/assets/audioset_labels.txt create mode 100644 PaddleAudio/examples/audio_tagging/audio_tag.py create mode 100644 PaddleAudio/examples/audio_tagging/parse_result.py create mode 100644 PaddleAudio/examples/tools/wav2mel.py create mode 100644 PaddleAudio/examples/wav2mel.py create mode 100644 PaddleAudio/paddleaudio/__init__.py create mode 100644 PaddleAudio/paddleaudio/backends/__init__.py create mode 100644 PaddleAudio/paddleaudio/backends/audio.py create mode 100644 PaddleAudio/paddleaudio/datasets/__init__.py create mode 100644 PaddleAudio/paddleaudio/datasets/dataset.py create mode 100644 PaddleAudio/paddleaudio/datasets/dcase.py create mode 100644 PaddleAudio/paddleaudio/datasets/esc50.py create mode 100644 PaddleAudio/paddleaudio/datasets/gtzan.py create mode 100644 PaddleAudio/paddleaudio/datasets/urban_sound.py create mode 100644 PaddleAudio/paddleaudio/features/__init__.py create mode 100644 PaddleAudio/paddleaudio/features/augmentation.py create mode 100644 PaddleAudio/paddleaudio/features/features.py create mode 100644 PaddleAudio/paddleaudio/features/utils.py create mode 100644 PaddleAudio/paddleaudio/models/PANNs/__init__.py create mode 100644 PaddleAudio/paddleaudio/models/PANNs/cnn10.py create mode 100644 PaddleAudio/paddleaudio/models/PANNs/cnn14.py create mode 100644 PaddleAudio/paddleaudio/models/PANNs/cnn6.py create mode 100644 PaddleAudio/paddleaudio/models/PANNs/conv.py create mode 100644 PaddleAudio/paddleaudio/models/__init__.py create mode 100644 PaddleAudio/paddleaudio/scripts/wav2mel.py create mode 100644 PaddleAudio/paddleaudio/scripts/wav2mel2.py create mode 100644 
PaddleAudio/paddleaudio/utils/download.py create mode 100644 PaddleAudio/paddleaudio/utils/env.py create mode 100644 PaddleAudio/paddleaudio/utils/log.py create mode 100644 PaddleAudio/requirements.txt create mode 100644 PaddleAudio/setup.py diff --git a/PaddleAudio/.gitignore b/PaddleAudio/.gitignore new file mode 100644 index 00000000..fef8c3c9 --- /dev/null +++ b/PaddleAudio/.gitignore @@ -0,0 +1,6 @@ +.ipynb_checkpoints/** +*.ipynb +nohup.out +__pycache__/ +*.wav +*.m4a diff --git a/PaddleAudio/.pre-commit-config.yaml b/PaddleAudio/.pre-commit-config.yaml new file mode 100644 index 00000000..485cd2e2 --- /dev/null +++ b/PaddleAudio/.pre-commit-config.yaml @@ -0,0 +1,44 @@ +- repo: local + hooks: + - id: yapf + name: yapf + entry: yapf + language: system + args: [-i, --style .style.yapf] + files: \.py$ + +- repo: https://github.com/pre-commit/pre-commit-hooks + sha: a11d9314b22d8f8c7556443875b731ef05965464 + hooks: + - id: check-merge-conflict + - id: check-symlinks + - id: end-of-file-fixer + - id: trailing-whitespace + - id: detect-private-key + - id: check-symlinks + - id: check-added-large-files + +- repo: https://github.com/pycqa/isort + rev: 5.8.0 + hooks: + - id: isort + name: isort (python) + - id: isort + name: isort (cython) + types: [cython] + - id: isort + name: isort (pyi) + types: [pyi] + +- repo: local + hooks: + - id: flake8 + name: flake8 + entry: flake8 + language: system + args: + - --count + - --select=E9,F63,F7,F82 + - --show-source + - --statistics + files: \.py$ diff --git a/PaddleAudio/.style.yapf b/PaddleAudio/.style.yapf new file mode 100644 index 00000000..c9a88d5a --- /dev/null +++ b/PaddleAudio/.style.yapf @@ -0,0 +1,3 @@ +[style] +based_on_style = pep8 +column_limit = 120 diff --git a/PaddleAudio/LICENSE b/PaddleAudio/LICENSE new file mode 100644 index 00000000..261eeb9e --- /dev/null +++ b/PaddleAudio/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS 
FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/PaddleAudio/README.md b/PaddleAudio/README.md new file mode 100644 index 00000000..fe322a67 --- /dev/null +++ b/PaddleAudio/README.md @@ -0,0 +1,26 @@ +# PaddleAudio +Unofficial paddle audio codebase + +## Install +``` +git clone https://github.com/ranchlai/PaddleAudio.git +cd PaddleAudio +pip install . 
+ +``` + +## Usage +``` +import paddleAudio as pa +s,r = pa.load(f) +mel = pa.features.mel_spect(s,r) +``` +## to do + + +- add sound effects(tempo, mag, etc) , sox supports +- add dataset support +- add models DCASE classication ASD,sound classification +- add demos (audio,video demos) +- add openL3 support + diff --git a/PaddleAudio/examples/audio_tagging/README.md b/PaddleAudio/examples/audio_tagging/README.md new file mode 100644 index 00000000..3d2fb6aa --- /dev/null +++ b/PaddleAudio/examples/audio_tagging/README.md @@ -0,0 +1,136 @@ +# Audioset Tagging Example + +本示例采用PANNs预训练模型,对输入音频实时打tag,并最终以文本形式输出对应时刻的topk类别和对应的得分。 + +PANNs预训练模型的详情,请参考论文[PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition](https://arxiv.org/pdf/1912.10211.pdf)。 + +## Usage + +```python +python audio_tag.py \ + --wav ./cat_meow.wav \ + --sr 32000 \ + --sample_duration 2 \ + --hop_duration 0.3 \ + --checkpoint ./assets/cnn14.pdparams \ + --use_gpu True \ + --output_dir ./output_dir +``` + +参数用法: +``` +--wav # 音频路径 +--sr # sample rate +--sample_duration # tagging音频长度,单位为秒 +--hop_duration # tagging音频间步长,单位为秒 +--checkpoint # 预训练模型参数 +--use_gpu # 使用GPU加速 +--output_dir # 输出路径 +``` + +执行结果: +``` +[2021-04-06 21:10:36,438] [ INFO] - Loaded CNN14 pretrained parameters from: ./assets/cnn14.pdparams +[2021-04-06 21:10:38,193] [ INFO] - Saved tagging results to ./output_dir/audioset_tagging_sr_32000.npz +``` + +执行后得分结果保存在`output_dir`的`.npz`文件中。 + + +## Output +```python +python parse_result.py \ + --input_file ./output_dir/audioset_tagging_sr_32000.npz \ + --topk 10 \ + --smooth True \ + --smooth_size 5 \ + --output_dir ./output_dir +``` + +参数用法: +``` +--input_file # tagging得分文件 +--topk # 展示topk结果 +--smooth # 帧间得分平滑 +--smooth_size # 平滑窗口大小 +--output_dir # 输出路径 +``` + +执行结果: +``` +[2021-04-06 21:22:00,696] [ INFO] - Posterior smoothing... 
+[2021-04-06 21:22:00,699] [ INFO] - Saved tagging labels to ./output_dir/audioset_tagging_sr_32000.txt +``` + +执行后文本结果保存在`output_dir`的`.txt`文件中。 + + +## Labels + +最终输出的文本结果如下所示。 +不同tagging的topk结果用空行分隔。每一个结果中,第一行是时间信息,数字表示tagging结果的起始样本点;接下来的k行是对应的标签和得分。 + +``` +0 +Cat: 0.80844646692276 +Animal: 0.6848719716072083 +Meow: 0.6470851898193359 +Domestic animals, pets: 0.6392854452133179 +Inside, small room: 0.05361200496554375 +Purr: 0.02675800956785679 +Music: 0.021260583773255348 +Speech: 0.0209784135222435 +Caterwaul: 0.019929537549614906 +Outside, urban or manmade: 0.010916451923549175 + +9600 +Cat: 0.7778594493865967 +Meow: 0.6465566158294678 +Animal: 0.6342337131500244 +Domestic animals, pets: 0.5945377349853516 +Inside, small room: 0.04747435823082924 +Purr: 0.027785276994109154 +Music: 0.022447215393185616 +Caterwaul: 0.020785318687558174 +Speech: 0.01982543244957924 +Vehicle: 0.014558425173163414 + +19200 +Cat: 0.8243843913078308 +Animal: 0.6799540519714355 +Meow: 0.6794822812080383 +Domestic animals, pets: 0.6637188792228699 +Caterwaul: 0.09927166253328323 +Inside, small room: 0.0378643162548542 +Music: 0.02170632779598236 +Purr: 0.02035444974899292 +Speech: 0.02006830833852291 +Vehicle: 0.01234798226505518 + +28800 +Cat: 0.8329735398292542 +Animal: 0.6937487125396729 +Meow: 0.6766577959060669 +Domestic animals, pets: 0.6669812798500061 +Caterwaul: 0.08647485077381134 +Inside, small room: 0.03593464195728302 +Music: 0.022975120693445206 +Speech: 0.01964726485311985 +Purr: 0.017558127641677856 +Vehicle: 0.010926523245871067 + +38400 +Cat: 0.8097503781318665 +Animal: 0.6702587604522705 +Meow: 0.6487116813659668 +Domestic animals, pets: 0.6369225382804871 +Caterwaul: 0.07185821980237961 +Inside, small room: 0.039198972284793854 +Music: 0.02381189912557602 +Speech: 0.018534155562520027 +Purr: 0.0178740955889225 +Outside, urban or manmade: 0.011107126250863075 + +... +... 
+``` diff --git a/PaddleAudio/examples/audio_tagging/assets/audioset_labels.txt b/PaddleAudio/examples/audio_tagging/assets/audioset_labels.txt new file mode 100644 index 00000000..6fccf56a --- /dev/null +++ b/PaddleAudio/examples/audio_tagging/assets/audioset_labels.txt @@ -0,0 +1,527 @@ +Speech +Male speech, man speaking +Female speech, woman speaking +Child speech, kid speaking +Conversation +Narration, monologue +Babbling +Speech synthesizer +Shout +Bellow +Whoop +Yell +Battle cry +Children shouting +Screaming +Whispering +Laughter +Baby laughter +Giggle +Snicker +Belly laugh +Chuckle, chortle +Crying, sobbing +Baby cry, infant cry +Whimper +Wail, moan +Sigh +Singing +Choir +Yodeling +Chant +Mantra +Male singing +Female singing +Child singing +Synthetic singing +Rapping +Humming +Groan +Grunt +Whistling +Breathing +Wheeze +Snoring +Gasp +Pant +Snort +Cough +Throat clearing +Sneeze +Sniff +Run +Shuffle +Walk, footsteps +Chewing, mastication +Biting +Gargling +Stomach rumble +Burping, eructation +Hiccup +Fart +Hands +Finger snapping +Clapping +Heart sounds, heartbeat +Heart murmur +Cheering +Applause +Chatter +Crowd +Hubbub, speech noise, speech babble +Children playing +Animal +Domestic animals, pets +Dog +Bark +Yip +Howl +Bow-wow +Growling +Whimper (dog) +Cat +Purr +Meow +Hiss +Caterwaul +Livestock, farm animals, working animals +Horse +Clip-clop +Neigh, whinny +Cattle, bovinae +Moo +Cowbell +Pig +Oink +Goat +Bleat +Sheep +Fowl +Chicken, rooster +Cluck +Crowing, cock-a-doodle-doo +Turkey +Gobble +Duck +Quack +Goose +Honk +Wild animals +Roaring cats (lions, tigers) +Roar +Bird +Bird vocalization, bird call, bird song +Chirp, tweet +Squawk +Pigeon, dove +Coo +Crow +Caw +Owl +Hoot +Bird flight, flapping wings +Canidae, dogs, wolves +Rodents, rats, mice +Mouse +Patter +Insect +Cricket +Mosquito +Fly, housefly +Buzz +Bee, wasp, etc. 
+Frog +Croak +Snake +Rattle +Whale vocalization +Music +Musical instrument +Plucked string instrument +Guitar +Electric guitar +Bass guitar +Acoustic guitar +Steel guitar, slide guitar +Tapping (guitar technique) +Strum +Banjo +Sitar +Mandolin +Zither +Ukulele +Keyboard (musical) +Piano +Electric piano +Organ +Electronic organ +Hammond organ +Synthesizer +Sampler +Harpsichord +Percussion +Drum kit +Drum machine +Drum +Snare drum +Rimshot +Drum roll +Bass drum +Timpani +Tabla +Cymbal +Hi-hat +Wood block +Tambourine +Rattle (instrument) +Maraca +Gong +Tubular bells +Mallet percussion +Marimba, xylophone +Glockenspiel +Vibraphone +Steelpan +Orchestra +Brass instrument +French horn +Trumpet +Trombone +Bowed string instrument +String section +Violin, fiddle +Pizzicato +Cello +Double bass +Wind instrument, woodwind instrument +Flute +Saxophone +Clarinet +Harp +Bell +Church bell +Jingle bell +Bicycle bell +Tuning fork +Chime +Wind chime +Change ringing (campanology) +Harmonica +Accordion +Bagpipes +Didgeridoo +Shofar +Theremin +Singing bowl +Scratching (performance technique) +Pop music +Hip hop music +Beatboxing +Rock music +Heavy metal +Punk rock +Grunge +Progressive rock +Rock and roll +Psychedelic rock +Rhythm and blues +Soul music +Reggae +Country +Swing music +Bluegrass +Funk +Folk music +Middle Eastern music +Jazz +Disco +Classical music +Opera +Electronic music +House music +Techno +Dubstep +Drum and bass +Electronica +Electronic dance music +Ambient music +Trance music +Music of Latin America +Salsa music +Flamenco +Blues +Music for children +New-age music +Vocal music +A capella +Music of Africa +Afrobeat +Christian music +Gospel music +Music of Asia +Carnatic music +Music of Bollywood +Ska +Traditional music +Independent music +Song +Background music +Theme music +Jingle (music) +Soundtrack music +Lullaby +Video game music +Christmas music +Dance music +Wedding music +Happy music +Funny music +Sad music +Tender music +Exciting music +Angry music +Scary music 
+Wind +Rustling leaves +Wind noise (microphone) +Thunderstorm +Thunder +Water +Rain +Raindrop +Rain on surface +Stream +Waterfall +Ocean +Waves, surf +Steam +Gurgling +Fire +Crackle +Vehicle +Boat, Water vehicle +Sailboat, sailing ship +Rowboat, canoe, kayak +Motorboat, speedboat +Ship +Motor vehicle (road) +Car +Vehicle horn, car horn, honking +Toot +Car alarm +Power windows, electric windows +Skidding +Tire squeal +Car passing by +Race car, auto racing +Truck +Air brake +Air horn, truck horn +Reversing beeps +Ice cream truck, ice cream van +Bus +Emergency vehicle +Police car (siren) +Ambulance (siren) +Fire engine, fire truck (siren) +Motorcycle +Traffic noise, roadway noise +Rail transport +Train +Train whistle +Train horn +Railroad car, train wagon +Train wheels squealing +Subway, metro, underground +Aircraft +Aircraft engine +Jet engine +Propeller, airscrew +Helicopter +Fixed-wing aircraft, airplane +Bicycle +Skateboard +Engine +Light engine (high frequency) +Dental drill, dentist's drill +Lawn mower +Chainsaw +Medium engine (mid frequency) +Heavy engine (low frequency) +Engine knocking +Engine starting +Idling +Accelerating, revving, vroom +Door +Doorbell +Ding-dong +Sliding door +Slam +Knock +Tap +Squeak +Cupboard open or close +Drawer open or close +Dishes, pots, and pans +Cutlery, silverware +Chopping (food) +Frying (food) +Microwave oven +Blender +Water tap, faucet +Sink (filling or washing) +Bathtub (filling or washing) +Hair dryer +Toilet flush +Toothbrush +Electric toothbrush +Vacuum cleaner +Zipper (clothing) +Keys jangling +Coin (dropping) +Scissors +Electric shaver, electric razor +Shuffling cards +Typing +Typewriter +Computer keyboard +Writing +Alarm +Telephone +Telephone bell ringing +Ringtone +Telephone dialing, DTMF +Dial tone +Busy signal +Alarm clock +Siren +Civil defense siren +Buzzer +Smoke detector, smoke alarm +Fire alarm +Foghorn +Whistle +Steam whistle +Mechanisms +Ratchet, pawl +Clock +Tick +Tick-tock +Gears +Pulleys +Sewing machine 
+Mechanical fan +Air conditioning +Cash register +Printer +Camera +Single-lens reflex camera +Tools +Hammer +Jackhammer +Sawing +Filing (rasp) +Sanding +Power tool +Drill +Explosion +Gunshot, gunfire +Machine gun +Fusillade +Artillery fire +Cap gun +Fireworks +Firecracker +Burst, pop +Eruption +Boom +Wood +Chop +Splinter +Crack +Glass +Chink, clink +Shatter +Liquid +Splash, splatter +Slosh +Squish +Drip +Pour +Trickle, dribble +Gush +Fill (with liquid) +Spray +Pump (liquid) +Stir +Boiling +Sonar +Arrow +Whoosh, swoosh, swish +Thump, thud +Thunk +Electronic tuner +Effects unit +Chorus effect +Basketball bounce +Bang +Slap, smack +Whack, thwack +Smash, crash +Breaking +Bouncing +Whip +Flap +Scratch +Scrape +Rub +Roll +Crushing +Crumpling, crinkling +Tearing +Beep, bleep +Ping +Ding +Clang +Squeal +Creak +Rustle +Whir +Clatter +Sizzle +Clicking +Clickety-clack +Rumble +Plop +Jingle, tinkle +Hum +Zing +Boing +Crunch +Silence +Sine wave +Harmonic +Chirp tone +Sound effect +Pulse +Inside, small room +Inside, large room or hall +Inside, public space +Outside, urban or manmade +Outside, rural or natural +Reverberation +Echo +Noise +Environmental noise +Static +Mains hum +Distortion +Sidetone +Cacophony +White noise +Pink noise +Throbbing +Vibration +Television +Radio +Field recording diff --git a/PaddleAudio/examples/audio_tagging/audio_tag.py b/PaddleAudio/examples/audio_tagging/audio_tag.py new file mode 100644 index 00000000..43764dbe --- /dev/null +++ b/PaddleAudio/examples/audio_tagging/audio_tag.py @@ -0,0 +1,122 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import ast +import os +from typing import List + +import librosa +import numpy as np +import paddle + +from paddleaudio.features import mel_spect +from paddleaudio.models import CNN14 +from paddleaudio.utils.log import logger + +parser = argparse.ArgumentParser(__doc__) +# features +parser.add_argument("--sr", type=int, default=32000, help="Sample rate of inference audio.") +parser.add_argument('--window_size', type=int, default=1024) +parser.add_argument('--hop_size', type=int, default=320) +parser.add_argument('--mel_bins', type=int, default=64) +parser.add_argument('--fmin', type=int, default=50) +parser.add_argument('--fmax', type=int, default=14000) +# waveform +parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.") +parser.add_argument('--sample_duration', type=float, default=1.0) # 1s +parser.add_argument('--hop_duration', type=float, default=0.3) # 0.3s + +parser.add_argument("--output_dir", type=str, default='./output_dir') +parser.add_argument("--use_gpu", + type=ast.literal_eval, + default=True, + help="Whether use GPU for fine-tuning, input should be True or False") +parser.add_argument("--checkpoint", type=str, default='./assets/cnn14.pdparams', help="Checkpoint of model.") +args = parser.parse_args() + + +def split(waveform: np.ndarray, win_size: int, hop_size: int): + """ + Split into N audios. + N is decided by win_size and hop_size. 
+ """ + assert isinstance(waveform, np.ndarray) + ret = [] + for i in range(0, len(waveform), hop_size): + segment = waveform[i:i + win_size] + if len(segment) < win_size: + segment = np.pad(segment, (0, win_size - len(segment))) + ret.append(segment) + return ret + + +def batchify(data: List[List[float]], batch_size: int): + """ + Extract features from waveforms and create batches. + """ + examples = [] + for waveform in data: + feat = mel_spect( + waveform, + sample_rate=args.sr, + window_size=args.window_size, + hop_length=args.hop_size, + mel_bins=args.mel_bins, + fmin=args.fmin, + fmax=args.fmax, + ) + examples.append(np.expand_dims(feat.transpose(), 0)) # (mel_bins, time) -> (1, time, mel_bins) + + # Seperates data into some batches. + one_batch = [] + for example in examples: + one_batch.append(example) + if len(one_batch) == batch_size: + yield one_batch + one_batch = [] + if one_batch: + yield one_batch + + +def predict(model, data: List[List[float]], batch_size: int = 1, use_gpu: bool = False): + + paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') + + batches = batchify(data, batch_size) + results = None + model.eval() + for batch in batches: + feats = paddle.to_tensor(batch) + audioset_scores = model(feats) + if results is None: + results = audioset_scores.numpy() + else: + results = np.concatenate((results, audioset_scores.numpy())) + + return results + + +if __name__ == '__main__': + model = CNN14(extract_embedding=False, checkpoint=args.checkpoint) + waveform = librosa.load(args.wav, sr=args.sr)[0] + data = split(waveform, int(args.sample_duration * args.sr), int(args.hop_duration * args.sr)) + results = predict(model, data, batch_size=8, use_gpu=args.use_gpu) + + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + time = np.arange(0, 1, int(args.hop_duration * args.sr) / len(waveform)) + output_file = os.path.join(args.output_dir, f'audioset_tagging_sr_{args.sr}.npz') + np.savez(output_file, time=time, 
scores=results) + logger.info(f'Saved tagging results to {output_file}') diff --git a/PaddleAudio/examples/audio_tagging/parse_result.py b/PaddleAudio/examples/audio_tagging/parse_result.py new file mode 100644 index 00000000..f0198ea5 --- /dev/null +++ b/PaddleAudio/examples/audio_tagging/parse_result.py @@ -0,0 +1,81 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import ast +import os +from typing import Dict, List + +import numpy as np + +from paddleaudio.utils.log import logger + +parser = argparse.ArgumentParser(__doc__) +parser.add_argument("--input_file", type=str, required=True) +parser.add_argument("--topk", type=int, default=10, help="Show top k results of audioset labels.") +parser.add_argument("--smooth", type=ast.literal_eval, default=True, help="Posterior smoothing.") +parser.add_argument("--smooth_size", type=int, default=5, help="Window size of smoothing.") +parser.add_argument("--output_dir", type=str, default='./output_dir') +args = parser.parse_args() + + +def smooth(results: np.ndarray, win_size: int): + """ + Execute posterior smoothing in-place. + """ + for i in range(len(results) - 1, -1, -1): + if i < win_size - 1: + left = 0 + else: + left = i + 1 - win_size + results[i] = np.sum(results[left:i + 1], axis=0) / (i - left + 1) + + +def generate_topk_label(k: int, label_map: Dict, result: np.ndarray): + """ + Return top k result. 
+ """ + result = np.asarray(result) + topk_idx = (-result).argsort()[:k] + + ret = '' + for idx in topk_idx: + label, score = label_map[idx], result[idx] + ret += f'{label}: {score}\n' + return ret + + +if __name__ == "__main__": + label_file = './assets/audioset_labels.txt' + label_map = {} + with open(label_file, 'r') as f: + for i, l in enumerate(f.readlines()): + label_map[i] = l.strip() + + results = np.load(args.input_file, allow_pickle=True) + times, scores = results['time'], results['scores'] + + if args.smooth: + logger.info('Posterior smoothing...') + smooth(scores, win_size=args.smooth_size) + + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + output_file = os.path.join(args.output_dir, os.path.basename(args.input_file).split('.')[0] + '.txt') + with open(output_file, 'w') as f: + for time, score in zip(times, scores): + f.write(f'{time}\n') + f.write(generate_topk_label(args.topk, label_map, score) + '\n') + + logger.info(f'Saved tagging labels to {output_file}') diff --git a/PaddleAudio/examples/tools/wav2mel.py b/PaddleAudio/examples/tools/wav2mel.py new file mode 100644 index 00000000..1b993bc7 --- /dev/null +++ b/PaddleAudio/examples/tools/wav2mel.py @@ -0,0 +1,71 @@ +import argparse +import glob +import os + +import h5py +import numpy as np +import tqdm + +import paddleaudio as pa + +#from pylab import * +parser = argparse.ArgumentParser(description='wave2mel') +parser.add_argument('--wav_file', type=str, required=False, default='') +parser.add_argument('--wav_list', type=str, required=False, default='') +parser.add_argument('--wav_h5_file', type=str, required=False, default='') +parser.add_argument('--wav_h5_list', type=str, required=False, default='') +parser.add_argument('--output_folder', type=str, required=False, default='./') +parser.add_argument('--output_h5', type=bool, required=False, default=True) +parser.add_argument('--sample_rate', type=int, required=False, default=32000) +parser.add_argument('--window_size', 
type=int, required=False, default=1024) +parser.add_argument('--mel_bins', type=int, required=False, default=128) +parser.add_argument('--hop_length', type=int, required=False, default=640) #20ms +parser.add_argument('--fmin', type=int, required=False, default=50) #25ms +parser.add_argument('--fmax', type=int, required=False, default=16000) #25ms +args = parser.parse_args() +#args.wav_h5_file = '/ssd2/laiyongquan/audioset/h5/audioset_unblance_group28.h5' + +assert not (args.wav_h5_file == '' and args.wav_h5_list == ''\ +and args.wav_list == '' and args.wav_file == ''), 'one of wav_file,wav_list,\ +wav_h5_file,wav_h5_list needs to specify' + +if args.wav_h5_file != '': + h5_files = [args.wav_h5_file] +if args.wav_h5_list != '': + h5_files = open(args.wav_h5_list).read().split('\n') + h5_files = [h for h in h5_files if len(h.strip()) != 0] + +dst_folder = args.output_folder +print(f'{len(h5_files)} h5 files listed') +for f in h5_files: + print(f'processing {f}') + dst_file = os.path.join(dst_folder, f.split('/')[-1]) + print(f'target file {dst_file}') + assert not os.path.exists(dst_file), f'target file {dst_file} existed' + src_h5 = h5py.File(f) + dst_h5 = h5py.File(dst_file, "w") + for key in tqdm.tqdm(src_h5.keys()): + s = src_h5[key][:] + s = pa.depth_convert(s, 'float32') + # s = pa.resample(s,32000,args.sample_rate) + x = pa.features.mel_spect(s, + sample_rate=args.sample_rate, + window_size=args.window_size, + hop_length=args.hop_length, + mel_bins=args.mel_bins, + fmin=args.fmin, + fmax=args.fmax, + window='hann', + center=True, + pad_mode='reflect', + ref=1.0, + amin=1e-10, + top_db=None) + # figure(figsize=(8,8)) + # imshow(x) + # show() + # print(x.shape) + + dst_h5.create_dataset(key, data=x) + src_h5.close() + dst_h5.close() diff --git a/PaddleAudio/examples/wav2mel.py b/PaddleAudio/examples/wav2mel.py new file mode 100644 index 00000000..a1013d36 --- /dev/null +++ b/PaddleAudio/examples/wav2mel.py @@ -0,0 +1,76 @@ +import argparse +import glob +import 
os + +import h5py +import numpy as np +import tqdm + +import paddleaudio as pa + +#from pylab import * +parser = argparse.ArgumentParser(description='wave2mel') +parser.add_argument('--wav_file', type=str, required=False, default='') +parser.add_argument('--wav_list', type=str, required=False, default='') +parser.add_argument('--wav_h5_file', type=str, required=False, default='') +parser.add_argument('--wav_h5_list', type=str, required=False, default='') +parser.add_argument('--output_folder', type=str, required=False, default='./') +parser.add_argument('--output_h5', type=bool, required=False, default=True) +parser.add_argument('--sample_rate', type=int, required=False, default=32000) +parser.add_argument('--window_size', type=int, required=False, default=1024) +parser.add_argument('--mel_bins', type=int, required=False, default=128) +parser.add_argument('--hop_length', type=int, required=False, default=640) #20ms +parser.add_argument('--fmin', type=int, required=False, default=50) #25ms +parser.add_argument('--fmax', type=int, required=False, default=16000) #25ms +parser.add_argument('--skip_existed', type=int, required=False, default=1) #25ms + +args = parser.parse_args() +#args.wav_h5_file = '/ssd2/laiyongquan/audioset/h5/audioset_unblance_group28.h5' + +assert not (args.wav_h5_file == '' and args.wav_h5_list == ''\ +and args.wav_list == '' and args.wav_file == ''), 'one of wav_file,wav_list,\ +wav_h5_file,wav_h5_list needs to specify' + +if args.wav_h5_file != '': + h5_files = [args.wav_h5_file] +if args.wav_h5_list != '': + h5_files = open(args.wav_h5_list).read().split('\n') + h5_files = [h for h in h5_files if len(h.strip()) != 0] + +dst_folder = args.output_folder +print(f'{len(h5_files)} h5 files listed') +for f in h5_files: + print(f'processing {f}') + dst_file = os.path.join(dst_folder, f.split('/')[-1]) + print(f'target file {dst_file}') + if args.skip_existed != 0 and os.path.exists(dst_file): + print(f'skipped file {f}') + continue + assert not 
os.path.exists(dst_file), f'target file {dst_file} existed' + src_h5 = h5py.File(f) + dst_h5 = h5py.File(dst_file, "w") + for key in tqdm.tqdm(src_h5.keys()): + s = src_h5[key][:] + s = pa.depth_convert(s, 'float32') + # s = pa.resample(s,32000,args.sample_rate) + x = pa.features.mel_spect(s, + sample_rate=args.sample_rate, + window_size=args.window_size, + hop_length=args.hop_length, + mel_bins=args.mel_bins, + fmin=args.fmin, + fmax=args.fmax, + window='hann', + center=True, + pad_mode='reflect', + ref=1.0, + amin=1e-10, + top_db=None) + # figure(figsize=(8,8)) + # imshow(x) + # show() + # print(x.shape) + + dst_h5.create_dataset(key, data=x) + src_h5.close() + dst_h5.close() diff --git a/PaddleAudio/paddleaudio/__init__.py b/PaddleAudio/paddleaudio/__init__.py new file mode 100644 index 00000000..49b35bc7 --- /dev/null +++ b/PaddleAudio/paddleaudio/__init__.py @@ -0,0 +1,4 @@ +from .backends import * +from .features import * + + diff --git a/PaddleAudio/paddleaudio/backends/__init__.py b/PaddleAudio/paddleaudio/backends/__init__.py new file mode 100644 index 00000000..5d885295 --- /dev/null +++ b/PaddleAudio/paddleaudio/backends/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# Re-export the audio backend from the package __init__. The guard only
# protects standalone (non-package) imports; inside the package the relative
# import succeeds as before.
try:
    from .audio import *
except ImportError:
    pass


# ==== PaddleAudio/paddleaudio/backends/audio.py ====
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Numpy-based audio I/O helpers: load/save plus dtype, channel and
sample-rate conversion. librosa / soundfile / resampy are optional backends
detected at import time."""

import warnings

import numpy as np
from scipy.io import wavfile

try:
    import librosa
    has_librosa = True
except ImportError:  # was a bare except; only a missing package is expected
    has_librosa = False

try:
    import soundfile as sf
    has_snf = True
except ImportError:
    has_snf = False
try:
    import resampy
    has_resampy = True
except ImportError:
    has_resampy = False

__norm_types__ = ['linear', 'gaussian']
__mono_types__ = ['ch0', 'ch1', 'random', 'average']

__all__ = ['resample', 'to_mono', 'depth_convert', 'normalize', 'save', 'load']


def resample(y, src_sr, target_sr):
    """Resample ``y`` from ``src_sr`` to ``target_sr`` (resampy preferred,
    librosa as fallback)."""
    warnings.warn(
        f'Using resampy to {src_sr}=>{target_sr}. This function is pretty slow, we recommend to process audio using ffmpeg'
    )
    assert type(y) == np.ndarray, 'currently only numpy data are supported'
    assert type(
        src_sr) == int and src_sr > 0 and src_sr <= 48000, 'make sure type(sr) == int and sr > 0 and sr <= 48000,'
    assert type(
        target_sr
    ) == int and target_sr > 0 and target_sr <= 48000, 'make sure type(sr) == int and sr > 0 and sr <= 48000,'

    if has_resampy:
        return resampy.resample(y, src_sr, target_sr)

    if has_librosa:
        return librosa.resample(y, src_sr, target_sr)

    assert False, 'requires librosa or resampy to do resampling, pip install resampy'


def to_mono(y, mono_type='average'):
    """Downmix a (2, n) stereo array to mono using the strategy ``mono_type``
    ('ch0', 'ch1', 'random' or 'average'). 1-d input is returned unchanged."""
    assert type(y) == np.ndarray, 'currently only numpy data are supported'

    if mono_type not in __mono_types__:
        assert False, 'Unsupported mono_type {}, available types are {}'.format(mono_type, __mono_types__)

    if y.ndim == 1:
        return y
    if y.ndim > 2:
        assert False, 'Unsupported audio array, y.ndim > 2, the shape is {}'.format(y.shape)
    if mono_type == 'ch0':
        return y[0]
    if mono_type == 'ch1':
        return y[1]
    if mono_type == 'random':
        return y[np.random.randint(0, 2)]

    # 'average': use a wider intermediate dtype for integer input so the sum
    # cannot overflow before halving.
    if y.dtype == 'float32':
        return (y[0] + y[1]) * 0.5
    if y.dtype == 'int16':
        y1 = y.astype('int32')
        y1 = (y1[0] + y1[1]) // 2
        y1 = np.clip(y1, np.iinfo(y.dtype).min, np.iinfo(y.dtype).max).astype(y.dtype)
        return y1
    if y.dtype == 'int8':
        y1 = y.astype('int16')
        y1 = (y1[0] + y1[1]) // 2
        y1 = np.clip(y1, np.iinfo(y.dtype).min, np.iinfo(y.dtype).max).astype(y.dtype)
        return y1

    assert False, 'Unsupported audio array type, y.dtype={}'.format(y.dtype)


def __safe_cast__(y, dtype):
    """Clip ``y`` to the representable range of ``dtype`` and cast.

    BUG FIX: np.iinfo only accepts integer dtypes, so the original raised a
    ValueError on float targets (e.g. float32 -> float64); use np.finfo for
    floating-point dtypes.
    """
    info = np.finfo(dtype) if np.issubdtype(np.dtype(dtype), np.floating) else np.iinfo(dtype)
    return np.clip(y, info.min, info.max).astype(dtype)


def depth_convert(y, dtype):  # convert audio array to target dtype
    """Convert an audio array between int8/int16/float32/float64, rescaling
    amplitudes so floats stay in [-1, 1] and ints span their full range."""
    assert type(y) == np.ndarray, 'currently only numpy data are supported'

    __eps__ = 1e-5
    __supported_dtype__ = ['int16', 'int8', 'float32', 'float64']
    if y.dtype not in __supported_dtype__:
        assert False, 'Unsupported audio dtype, y.dtype is {}, supported dtypes are {}'.format(
            y.dtype, __supported_dtype__)
    if dtype not in __supported_dtype__:
        assert False, 'Unsupported dtype, target dtype is {}, supported dtypes are {}'.format(
            dtype, __supported_dtype__)

    if dtype == y.dtype:
        return y

    if dtype == 'float64' and y.dtype == 'float32':
        return __safe_cast__(y, dtype)
    if dtype == 'float32' and y.dtype == 'float64':
        return __safe_cast__(y, dtype)

    if dtype == 'int16' or dtype == 'int8':
        if y.dtype in ['float64', 'float32']:
            # float in [-1, 1] -> full integer range, clipped to avoid wrap.
            factor = np.iinfo(dtype).max
            y = np.clip(y * factor, np.iinfo(dtype).min, np.iinfo(dtype).max).astype(dtype)
            y = y.astype(dtype)
        else:
            if dtype == 'int16' and y.dtype == 'int8':
                factor = np.iinfo('int16').max / np.iinfo('int8').max - __eps__
                y = y.astype('float32') * factor
                y = y.astype('int16')

            else:  #dtype == 'int8' and y.dtype=='int16':
                y = y.astype('int32') * np.iinfo('int8').max / np.iinfo('int16').max
                y = y.astype('int8')

    if dtype in ['float32', 'float64']:
        # int -> float: scale by the source integer range into [-1, 1].
        org_dtype = y.dtype
        y = y.astype(dtype) / np.iinfo(org_dtype).max
    return y


def sound_file_load(file, offset=None, dtype='int16', duration=None):
    """Read audio via soundfile; returns (samples, native sample rate)."""
    with sf.SoundFile(file) as sf_desc:
        sr_native = sf_desc.samplerate
        if offset:
            sf_desc.seek(int(offset * sr_native))
        if duration is not None:
            frame_duration = int(duration * sr_native)
        else:
            frame_duration = -1  # read to end of file
        y = sf_desc.read(frames=frame_duration, dtype=dtype, always_2d=False).T

    return y, sf_desc.samplerate


def normalize(y, norm_type='linear', mul_factor=1.0):
    """Normalize ``y`` either to peak amplitude ('linear', scaled by
    ``mul_factor``) or to zero mean / scaled std ('gaussian')."""
    assert type(y) == np.ndarray, 'currently only numpy data are supported'

    __eps__ = 1e-8
    if norm_type == 'linear':
        amax = np.max(np.abs(y))
        factor = 1.0 / (amax + __eps__)
        y = y * factor * mul_factor
    elif norm_type == 'gaussian':
        amean = np.mean(y)
        mul_factor = max(0.01, min(mul_factor, 0.2))  # clamp std scale
        astd = np.std(y)
        y = mul_factor * (y - amean) / (astd + __eps__)
    else:
        assert False, 'not implemented error, norm_type should be in {}'.format(__norm_types__)

    return y


def save(y, sr, file):
    """Write ``y`` to ``file`` as a wav; non-integer data is converted to
    int16 first."""
    assert type(y) == np.ndarray, 'currently only numpy data are supported'
    assert type(sr) == int and sr > 0 and sr <= 48000, 'make sure type(sr) == int and sr > 0 and sr <= 48000,'

    if y.dtype not in ['int16', 'int8']:
        warnings.warn('input data type is {}, saving data to int16 format'.format(y.dtype))
        yout = depth_convert(y, 'int16')
    else:
        yout = y

    # BUG FIX: previously wrote the original ``y``, silently discarding the
    # int16 conversion above.
    wavfile.write(file, sr, yout)


def load(
        file,
        sr=None,
        mono=True,
        mono_type='average',  # ch0,ch1,random,average
        normal=True,
        norm_type='linear',
        norm_mul_factor=1.0,
        offset=0.0,
        duration=None,
        dtype='float32'):
    """Load an audio file, optionally downmixing, resampling, normalizing and
    converting to ``dtype``. Returns (samples, sample rate)."""
    if has_librosa:
        y, r = librosa.load(file, sr=sr, mono=False, offset=offset, duration=duration,
                            dtype='float32')  #alwasy load in float32, then convert to target dtype

    elif has_snf:
        # BUG FIX: keyword was misspelled ``dypte``, raising a TypeError
        # whenever the soundfile fallback was taken.
        y, r = sound_file_load(file, offset=offset, dtype=dtype, duration=duration)

    else:
        assert False, 'not implemented error'

    assert (y.ndim == 1 and len(y) > 0) or (y.ndim == 2 and len(y[0]) > 0), 'audio file {} looks empty'.format(file)

    if mono:
        y = to_mono(y, mono_type)

    if sr is not None and sr != r:
        y = resample(y, r, sr)
        r = sr

    if normal:
        y = normalize(y, norm_type, norm_mul_factor)
    # BUG FIX: the original condition compared the *function* ``normalize`` to
    # booleans (always False) and misspelled 'gaussian', so this linear
    # re-normalization never ran. Intent: data must span [-1, 1] before an
    # integer conversion, i.e. when not normalized or gaussian-normalized.
    if dtype in ['int8', 'int16'] and (not normal or norm_type == 'gaussian'):
        y = normalize(y, 'linear', 1.0)  # do normalization before converting to target dtype

    y = depth_convert(y, dtype)
    return y, r
# ==== PaddleAudio/paddleaudio/datasets/__init__.py ====
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .dcase import TAUUrbanAcousticScenes_2020_Mobile_DevelopmentSet
from .esc50 import ESC50
from .gtzan import GTZAN
from .urban_sound import UrbanSound8K

__all__ = [
    'ESC50',
    'UrbanSound8K',
    'GTZAN',
    'TAUUrbanAcousticScenes_2020_Mobile_DevelopmentSet',
]


# ==== PaddleAudio/paddleaudio/datasets/dataset.py ====
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from typing import List, Tuple

import librosa
import numpy as np
import paddle
from tqdm import tqdm

from ..features import linear_spect, log_spect, mel_spect
from ..utils.log import logger


class AudioClassificationDataset(paddle.io.Dataset):
    """
    Base class of audio classification dataset.

    Subclasses supply the audio file list and labels; this class loads each
    file, optionally extracts a spectral feature, and serves
    (feature, label) pairs through the paddle Dataset protocol.
    """
    # Maps a feat_type name to its extraction function; 'raw' keeps the
    # waveform as-is.
    _feat_func = {
        'raw': None,
        'mel_spect': mel_spect,
        'linear_spect': linear_spect,
        'log_spect': log_spect,
    }

    def __init__(self, files: List[str], labels: List[int], sample_rate: int, feat_type: str = 'raw', **kwargs):
        """
        Args:
            files (:obj:`List[str]`): A list of absolute path of audio files.
            labels (:obj:`List[int]`): Labels of audio files.
            sample_rate (:obj:`int`): Sample rate of audio files.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                The feature type to extract from each audio file.
        """
        super(AudioClassificationDataset, self).__init__()

        if feat_type not in self._feat_func.keys():
            raise RuntimeError(\
                f"Unknown feat_type: {feat_type}, it must be one in {list(self._feat_func.keys())}")
        self.feat_type = feat_type

        self.files = files
        self.labels = labels
        self.records = self._convert_to_records(sample_rate, **kwargs)

    def _get_data(self, input_file: str):
        # Subclasses resolve files and labels from their own metadata.
        raise NotImplementedError

    def _convert_to_records(self, sample_rate: int, **kwargs) -> List[dict]:
        """Load every audio file and precompute its feature (if any)."""
        extract = self._feat_func[self.feat_type]

        logger.info('Start extracting features from audio files.')
        records = []
        for audio_file, label in tqdm(zip(self.files, self.labels), total=len(self.files)):
            waveform, _ = librosa.load(audio_file, sr=sample_rate)
            feat = extract(waveform, **kwargs) if extract else waveform
            records.append({'feat': feat, 'label': label})

        return records

    def __getitem__(self, idx):
        record = self.records[idx]
        feat, label = record['feat'], record['label']
        return np.array(feat), np.array(label, dtype=np.int64)

    def __len__(self):
        return len(self.records)
# ==== PaddleAudio/paddleaudio/datasets/dcase.py ====
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import collections
import os
from typing import List, Tuple

from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from ..utils.log import logger
from .dataset import AudioClassificationDataset

__all__ = ['TAUUrbanAcousticScenes_2020_Mobile_DevelopmentSet']


class TAUUrbanAcousticScenes_2020_Mobile_DevelopmentSet(AudioClassificationDataset):
    """
    TAU Urban Acoustic Scenes 2020 Mobile Development dataset
    This dataset is used in DCASE2020 - Task 1, Acoustic scene classification / Subtask A / Development

    Archives are downloaded into DATA_HOME and verified by md5 on first use.
    """
    source_url = 'https://zenodo.org/record/3819968/files/'
    base_name = 'TAU-urban-acoustic-scenes-2020-mobile-development'
    # Download manifest: meta file plus 16 audio archives, each with its md5.
    archieves = [
        {
            'url': source_url + base_name + '.meta.zip',
            'md5': '6eae9db553ce48e4ea246e34e50a3cf5',
        },
        {
            'url': source_url + base_name + '.audio.1.zip',
            'md5': 'b1e85b8a908d3d6a6ab73268f385d5c8',
        },
        {
            'url': source_url + base_name + '.audio.2.zip',
            'md5': '4310a13cc2943d6ce3f70eba7ba4c784',
        },
        {
            'url': source_url + base_name + '.audio.3.zip',
            'md5': 'ed38956c4246abb56190c1e9b602b7b8',
        },
        {
            'url': source_url + base_name + '.audio.4.zip',
            'md5': '97ab8560056b6816808dedc044dcc023',
        },
        {
            'url': source_url + base_name + '.audio.5.zip',
            'md5': 'b50f5e0bfed33cd8e52cb3e7f815c6cb',
        },
        {
            'url': source_url + base_name + '.audio.6.zip',
            'md5': 'fbf856a3a86fff7520549c899dc94372',
        },
        {
            'url': source_url + base_name + '.audio.7.zip',
            'md5': '0dbffe7b6e45564da649378723284062',
        },
        {
            'url': source_url + base_name + '.audio.8.zip',
            'md5': 'bb6f77832bf0bd9f786f965beb251b2e',
        },
        {
            'url': source_url + base_name + '.audio.9.zip',
            'md5': 'a65596a5372eab10c78e08a0de797c9e',
        },
        {
            'url': source_url + base_name + '.audio.10.zip',
            'md5': '2ad595819ffa1d56d2de4c7ed43205a6',
        },
        {
            'url': source_url + base_name + '.audio.11.zip',
            'md5': '0ad29f7040a4e6a22cfd639b3a6738e5',
        },
        {
            'url': source_url + base_name + '.audio.12.zip',
            'md5': 'e5f4400c6b9697295fab4cf507155a2f',
        },
        {
            'url': source_url + base_name + '.audio.13.zip',
            'md5': '8855ab9f9896422746ab4c5d89d8da2f',
        },
        {
            'url': source_url + base_name + '.audio.14.zip',
            'md5': '092ad744452cd3e7de78f988a3d13020',
        },
        {
            'url': source_url + base_name + '.audio.15.zip',
            'md5': '4b5eb85f6592aebf846088d9df76b420',
        },
        {
            'url': source_url + base_name + '.audio.16.zip',
            'md5': '2e0a89723e58a3836be019e6996ae460',
        },
    ]
    label_list = ['airport', 'shopping_mall', 'metro_station', 'street_pedestrian', \
        'public_square', 'street_traffic', 'tram', 'bus', 'metro', 'park']

    # Full metadata csv plus the official fold-1 train/evaluate/test splits.
    meta = os.path.join(base_name, 'meta.csv')
    meta_info = collections.namedtuple('META_INFO', ('filename', 'scene_label', 'identifier', 'source_label'))
    subset_meta = {
        'train': os.path.join(base_name, 'evaluation_setup', 'fold1_train.csv'),
        'dev': os.path.join(base_name, 'evaluation_setup', 'fold1_evaluate.csv'),
        'test': os.path.join(base_name, 'evaluation_setup', 'fold1_test.csv'),
    }
    subset_meta_info = collections.namedtuple('SUBSET_META_INFO', ('filename', 'scene_label'))
    audio_path = os.path.join(base_name, 'audio')
    sample_rate = 44100  # 44.1 khz
    duration = 10  # 10s

    def __init__(self, mode: str = 'train', feat_type: str = 'raw', **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                The feature type to extract from each audio file.
        """
        files, labels = self._get_data(mode)
        super(TAUUrbanAcousticScenes_2020_Mobile_DevelopmentSet, \
              self).__init__(files=files,
                             labels=labels,
                             sample_rate=self.sample_rate,
                             feat_type=feat_type,
                             **kwargs)

    def _get_meta_info(self, subset: str = None, skip_header: bool = True) -> List[collections.namedtuple]:
        """Parse the full meta csv (subset=None) or one split csv.

        Rows are tab-separated; ``skip_header`` drops the csv header line.
        """
        if subset is None:
            meta_file = self.meta
            meta_info = self.meta_info
        else:
            assert subset in self.subset_meta, f'Subset must be one in {list(self.subset_meta.keys())}, but got {subset}.'
            meta_file = self.subset_meta[subset]
            meta_info = self.subset_meta_info

        ret = []
        with open(os.path.join(DATA_HOME, meta_file), 'r') as rf:
            lines = rf.readlines()[1:] if skip_header else rf.readlines()
            for line in lines:
                ret.append(meta_info(*line.strip().split('\t')))
        return ret

    def _get_data(self, mode: str) -> Tuple[List[str], List[int]]:
        """Return (files, labels) for ``mode``, downloading the dataset first
        if the audio folder or meta file is missing."""
        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
                not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
            download_and_decompress(self.archieves, DATA_HOME)

        meta_info = self._get_meta_info(subset=mode, skip_header=True)

        files = []
        labels = []
        for sample in meta_info:
            filename, label = sample
            filename = os.path.basename(filename)
            # Labels are encoded as indices into label_list.
            target = self.label_list.index(label)

            files.append(os.path.join(DATA_HOME, self.audio_path, filename))
            labels.append(int(target))

        return files, labels
# ==== PaddleAudio/paddleaudio/datasets/esc50.py ====
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import collections
import os
from typing import List, Tuple

from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from ..utils.log import logger
from .dataset import AudioClassificationDataset

__all__ = ['ESC50']


class ESC50(AudioClassificationDataset):
    """
    Environment Sound Classification Dataset (ESC-50).

    Samples are organised into 5 predefined folds; one fold serves as the
    dev split and the remaining four as the train split.
    """

    archieves = [
        {
            'url': 'https://github.com/karoldvl/ESC-50/archive/master.zip',
            'md5': '70aba3bada37d2674b8f6cd5afd5f065',
        },
    ]
    meta = os.path.join('ESC-50-master', 'meta', 'esc50.csv')
    meta_info = collections.namedtuple('META_INFO',
                                       ('filename', 'fold', 'target', 'category', 'esc10', 'src_file', 'take'))
    audio_path = os.path.join('ESC-50-master', 'audio')
    sample_rate = 44100  # 44.1 khz
    duration = 5  # 5s

    def __init__(self, mode: str = 'train', split: int = 1, feat_type: str = 'raw', **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            split (:obj:`int`, `optional`, defaults to 1):
                The fold used as the dev split.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                The feature type to extract from each audio file.
        """
        files, labels = self._get_data(mode, split)
        super(ESC50, self).__init__(files=files,
                                    labels=labels,
                                    sample_rate=self.sample_rate,
                                    feat_type=feat_type,
                                    **kwargs)

    def _get_meta_info(self) -> List[collections.namedtuple]:
        """Parse esc50.csv (comma-separated, header skipped) into namedtuples."""
        with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
            return [self.meta_info(*row.strip().split(',')) for row in rf.readlines()[1:]]

    def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]:
        """Return (files, labels): train keeps folds != split, dev keeps fold == split."""
        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
                not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
            download_and_decompress(self.archieves, DATA_HOME)

        files = []
        labels = []
        for sample in self._get_meta_info():
            in_dev_fold = int(sample.fold) == split
            selected = (not in_dev_fold) if mode == 'train' else in_dev_fold
            if selected:
                files.append(os.path.join(DATA_HOME, self.audio_path, sample.filename))
                labels.append(int(sample.target))

        return files, labels


# ==== PaddleAudio/paddleaudio/datasets/gtzan.py ====
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
import random
from typing import List, Tuple

from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from ..utils.log import logger
from .dataset import AudioClassificationDataset

__all__ = ['GTZAN']


class GTZAN(AudioClassificationDataset):
    """
    GTZAN Dataset (music genre classification, 10 genres).

    Samples are shuffled with a fixed seed and partitioned into n_folds;
    one fold serves as the dev split and the rest as the train split.
    """

    archieves = [
        {
            'url': 'http://opihi.cs.uvic.ca/sound/genres.tar.gz',
            'md5': '5b3d6dddb579ab49814ab86dba69e7c7',
        },
    ]
    label_list = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']
    meta = os.path.join('genres', 'input.mf')
    meta_info = collections.namedtuple('META_INFO', ('file_path', 'label'))
    audio_path = 'genres'
    sample_rate = 22050  # 22.05 kHz (comment previously said 44.1 kHz)
    duration = 30  # 30s (comment previously said 5s)

    def __init__(self, mode='train', seed=0, n_folds=5, split=1, feat_type='raw', **kwargs):
        """
        Args:
            mode (:obj:`str`, `optional`, defaults to `train`):
                It identifies the dataset mode (train or dev).
            seed (:obj:`int`, `optional`, defaults to 0):
                Set the random seed to shuffle samples.
            n_folds (:obj:`int`, `optional`, defaults to 5):
                Split the dataset into n folds. 1 fold for dev dataset and n-1 for train dataset.
            split (:obj:`int`, `optional`, defaults to 1):
                It specify the fold of dev dataset.
            feat_type (:obj:`str`, `optional`, defaults to `raw`):
                The feature type to extract from each audio file.
        """
        assert split <= n_folds, f'The selected split should not be larger than n_fold, but got {split} > {n_folds}'
        files, labels = self._get_data(mode, seed, n_folds, split)
        super(GTZAN, self).__init__(files=files,
                                    labels=labels,
                                    sample_rate=self.sample_rate,
                                    feat_type=feat_type,
                                    **kwargs)

    def _get_meta_info(self) -> List[collections.namedtuple]:
        """Parse input.mf (tab-separated ``file_path\\tlabel`` rows, no header)."""
        ret = []
        with open(os.path.join(DATA_HOME, self.meta), 'r') as rf:
            for line in rf.readlines():
                ret.append(self.meta_info(*line.strip().split('\t')))
        return ret

    def _get_data(self, mode, seed, n_folds, split) -> Tuple[List[str], List[int]]:
        """Return (files, labels) for ``mode``, downloading the dataset first
        if the audio folder or meta file is missing."""
        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \
                not os.path.isfile(os.path.join(DATA_HOME, self.meta)):
            download_and_decompress(self.archieves, DATA_HOME)

        meta_info = self._get_meta_info()
        random.seed(seed)  # shuffle samples to split data
        random.shuffle(meta_info)  # make sure using the same seed to create train and dev dataset

        files = []
        labels = []
        # NOTE(review): with len(meta_info) not divisible by n_folds, the
        # remainder samples get fold index n_folds + 1 and thus never appear
        # in any dev split — confirm this is intended.
        n_samples_per_fold = len(meta_info) // n_folds
        for idx, sample in enumerate(meta_info):
            file_path, label = sample
            filename = os.path.basename(file_path)
            target = self.label_list.index(label)
            fold = idx // n_samples_per_fold + 1

            if mode == 'train' and int(fold) != split:
                files.append(os.path.join(DATA_HOME, self.audio_path, label, filename))
                labels.append(target)

            if mode != 'train' and int(fold) == split:
                files.append(os.path.join(DATA_HOME, self.audio_path, label, filename))
                labels.append(target)

        return files, labels


# ==== PaddleAudio/paddleaudio/datasets/urban_sound.py ====
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import os +from typing import List, Tuple + +from ..utils.download import download_and_decompress +from ..utils.env import DATA_HOME +from ..utils.log import logger +from .dataset import AudioClassificationDataset + +__all__ = ['UrbanSound8K'] + + +class UrbanSound8K(AudioClassificationDataset): + """ + UrbanSound8K Dataset + """ + + archieves = [ + { + 'url': 'https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz', + 'md5': '9aa69802bbf37fb986f71ec1483a196e', + }, + ] + meta = os.path.join('UrbanSound8K', 'metadata', 'UrbanSound8K.csv') + meta_info = collections.namedtuple('META_INFO', + ('filename', 'fsid', 'start', 'end', 'salience', 'fold', 'class_id', 'label')) + audio_path = os.path.join('UrbanSound8K', 'audio') + sample_rate = 48000 # 48 khz + duration = 4 # 4s + + def __init__(self, mode: str = 'train', split: int = 1, feat_type: str = 'raw', **kwargs): + files, labels = self._get_data(mode, split) + super(UrbanSound8K, self).__init__(files=files, + labels=labels, + sample_rate=self.sample_rate, + feat_type=feat_type, + **kwargs) + """ + Ags: + mode (:obj:`str`, `optional`, defaults to `train`): + It identifies the dataset mode (train or dev). + split (:obj:`int`, `optional`, defaults to 1): + It specify the fold of dev dataset. + feat_type (:obj:`str`, `optional`, defaults to `raw`): + It identifies the feature type that user wants to extrace of an audio file. 
+ """ + + def _get_meta_info(self): + ret = [] + with open(os.path.join(DATA_HOME, self.meta), 'r') as rf: + for line in rf.readlines()[1:]: + ret.append(self.meta_info(*line.strip().split(','))) + return ret + + def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]: + if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)) or \ + not os.path.isfile(os.path.join(DATA_HOME, self.meta)): + download_and_decompress(self.archieves, DATA_HOME) + + meta_info = self._get_meta_info() + + files = [] + labels = [] + for sample in meta_info: + filename, _, _, _, _, fold, target, _ = sample + if mode == 'train' and int(fold) != split: + files.append(os.path.join(DATA_HOME, self.audio_path, f'fold{fold}', filename)) + labels.append(int(target)) + + if mode != 'train' and int(fold) == split: + files.append(os.path.join(DATA_HOME, self.audio_path, f'fold{fold}', filename)) + labels.append(int(target)) + + return files, labels diff --git a/PaddleAudio/paddleaudio/features/__init__.py b/PaddleAudio/paddleaudio/features/__init__.py new file mode 100644 index 00000000..0a6429f3 --- /dev/null +++ b/PaddleAudio/paddleaudio/features/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .augmentation import * +from .features import * diff --git a/PaddleAudio/paddleaudio/features/augmentation.py b/PaddleAudio/paddleaudio/features/augmentation.py new file mode 100644 index 00000000..7215c4b5 --- /dev/null +++ b/PaddleAudio/paddleaudio/features/augmentation.py @@ -0,0 +1,130 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle + +from ..backends import depth_convert +from .utils import randint, weighted_sampling + +__all__ = ['depth_augment', 'spect_augment', 'random_crop1d', 'random_crop2d'] + + +# example y = depth_augment(y,['int8','int16'],[0.8,0.1]) +def depth_augment(y, choices=['int8', 'int16'], probs=[0.5, 0.5]): + assert len(probs) == len(choices), 'number of choices {} must be equal to size of probs {}'.format( + len(choices), len(probs)) + k = weighted_sampling(probs) + #k = randint(len(choices)) + src_depth = y.dtype + y1 = depth_convert(y, choices[k]) + y2 = depth_convert(y1, src_depth) + return y2 + + +def adaptive_spect_augment(spect, tempo_axis=0, level=0.1): + + assert spect.ndim == 2., 'only supports 2d tensor or numpy array' + if tempo_axis == 0: + nt, nf = spect.shape + else: + nf, nt = spect.shape + + time_mask_width = int(nt * level * 0.5) + freq_mask_width = int(nf * level * 0.5) + + num_time_mask = int(10 * level) + num_freq_mask = int(10 * level) + + # num_zeros = num_time_mask*time_mask_width*nf + 
num_freq_mask*freq_mask_width*nt + # factor = (nt*nf)/(nt*nf-num_zeros) + + if tempo_axis == 0: + for i in range(num_time_mask): + start = randint(nt - time_mask_width) + spect[start:start + time_mask_width, :] = 0 + for i in range(num_freq_mask): + start = randint(nf - freq_mask_width) + spect[:, start:start + freq_mask_width] = 0 + else: + for i in range(num_time_mask): + start = randint(nt - time_mask_width) + spect[:, start:start + time_mask_width] = 0 + for i in range(num_freq_mask): + start = randint(nf - freq_mask_width) + spect[start:start + freq_mask_width, :] = 0 + + return spect + + +def spect_augment( + spect, + tempo_axis=0, + max_time_mask=3, + max_freq_mask=3, + max_time_mask_width=30, + max_freq_mask_width=20, +): + + assert spect.ndim == 2., 'only supports 2d tensor or numpy array' + if tempo_axis == 0: + nt, nf = spect.shape + else: + nf, nt = spect.shape + + num_time_mask = randint(max_time_mask) + num_freq_mask = randint(max_freq_mask) + + time_mask_width = randint(max_time_mask_width) + freq_mask_width = randint(max_freq_mask_width) + + #print(num_time_mask) + #print(num_freq_mask) + + if tempo_axis == 0: + for i in range(num_time_mask): + start = randint(nt - time_mask_width) + spect[start:start + time_mask_width, :] = 0 + for i in range(num_freq_mask): + start = randint(nf - freq_mask_width) + spect[:, start:start + freq_mask_width] = 0 + else: + for i in range(num_time_mask): + start = randint(nt - time_mask_width) + spect[:, start:start + time_mask_width] = 0 + for i in range(num_freq_mask): + start = randint(nf - freq_mask_width) + spect[start:start + freq_mask_width, :] = 0 + + return spect + + +def random_crop1d(y, crop_len): + assert y.ndim == 1, 'only accept 1d tensor or numpy array' + n = len(y) + idx = randint(n - crop_len) + return y[idx:idx + crop_len] + + +def random_crop2d(s, crop_len, tempo_axis=0): # random crop according to temporal direction + assert tempo_axis < s.ndim, 'axis out of range' + n = s.shape[tempo_axis] + idx = 
randint(high=n - crop_len) + if type(s) == np.ndarray: + sli = [slice(None) for i in range(s.ndim)] + sli[tempo_axis] = slice(idx, idx + crop_len) + out = s[tuple(sli)] + else: + out = paddle.index_select(s, paddle.Tensor(np.array([i for i in range(idx, idx + crop_len)])), axis=tempo_axis) + return out diff --git a/PaddleAudio/paddleaudio/features/features.py b/PaddleAudio/paddleaudio/features/features.py new file mode 100644 index 00000000..ca6f6db2 --- /dev/null +++ b/PaddleAudio/paddleaudio/features/features.py @@ -0,0 +1,100 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import librosa +import numpy as np +import paddle + +__all__ = ['mel_spect', 'linear_spect', 'log_spect'] + + +#mel +def mel_spect(y, + sample_rate=16000, + window_size=512, + hop_length=320, + mel_bins=64, + fmin=50, + fmax=14000, + window='hann', + center=True, + pad_mode='reflect', + ref=1.0, + amin=1e-10, + top_db=None): + """ compute mel-spectrogram from input waveform y. + Create a Mel filter-bank. + This produces a linear transformation matrix to project + FFT bins onto Mel-frequency bins. 
+ + """ + + s = librosa.stft(y, + n_fft=window_size, + hop_length=hop_length, + win_length=window_size, + window=window, + center=center, + pad_mode=pad_mode) + + power = np.abs(s)**2 + melW = librosa.filters.mel(sr=sample_rate, n_fft=window_size, n_mels=mel_bins, fmin=fmin, fmax=fmax) + mel = np.matmul(melW, power) + db = librosa.power_to_db(mel, ref=ref, amin=amin, top_db=None) + return db + + +def linear_spect(y, + sample_rate=16000, + window_size=512, + hop_length=320, + window='hann', + center=True, + pad_mode='reflect', + power=2): + + s = librosa.stft(y, + n_fft=window_size, + hop_length=hop_length, + win_length=window_size, + window=window, + center=center, + pad_mode=pad_mode) + + return np.abs(s)**power + + +def log_spect(y, + sample_rate=16000, + window_size=512, + hop_length=320, + window='hann', + center=True, + pad_mode='reflect', + power=2.0, + offset=1.0): + + s = librosa.stft( + y, + n_fft=window_size, + hop_length=hop_length, + win_length=window_size, + window=window, + center=center, + pad_mode=pad_mode, + ) + + s = np.abs(s)**power + + return np.log(offset + s) # remove diff --git a/PaddleAudio/paddleaudio/features/utils.py b/PaddleAudio/paddleaudio/features/utils.py new file mode 100644 index 00000000..672f0827 --- /dev/null +++ b/PaddleAudio/paddleaudio/features/utils.py @@ -0,0 +1,38 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import paddle + +__all__ = ['randint', 'rand', 'weighted_sampling'] + + +def randint(high, use_paddle=True): + if use_paddle: + return int(paddle.randint(0, high=high)) + return int(np.random.randint(0, high=high)) + + +def rand(use_paddle=True): + if use_paddle: + return float(paddle.rand((1, ))) + return float(np.random.rand(1)) + + +def weighted_sampling(weights): + n = len(weights) + w = np.cumsum(weights) + w = w / w[-1] + flag = rand() < w + return np.argwhere(flag)[0][0] diff --git a/PaddleAudio/paddleaudio/models/PANNs/__init__.py b/PaddleAudio/paddleaudio/models/PANNs/__init__.py new file mode 100644 index 00000000..3292083c --- /dev/null +++ b/PaddleAudio/paddleaudio/models/PANNs/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .cnn6 import CNN6 +from .cnn10 import CNN10 +from .cnn14 import CNN14 diff --git a/PaddleAudio/paddleaudio/models/PANNs/cnn10.py b/PaddleAudio/paddleaudio/models/PANNs/cnn10.py new file mode 100644 index 00000000..e27995fc --- /dev/null +++ b/PaddleAudio/paddleaudio/models/PANNs/cnn10.py @@ -0,0 +1,85 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ...utils.log import logger +from .conv import ConvBlock + + +class CNN10(nn.Layer): + """ + The CNN10(14-layer CNNs) mainly consist of 4 convolutional blocks while each convolutional + block consists of 2 convolutional layers with a kernel size of 3 × 3. + + Reference: + PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition + https://arxiv.org/pdf/1912.10211.pdf + """ + emb_size = 512 + + def __init__(self, extract_embedding: bool = True, checkpoint: str = None): + + super(CNN10, self).__init__() + self.bn0 = nn.BatchNorm2D(64) + self.conv_block1 = ConvBlock(in_channels=1, out_channels=64) + self.conv_block2 = ConvBlock(in_channels=64, out_channels=128) + self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) + self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) + + self.fc1 = nn.Linear(512, self.emb_size) + self.fc_audioset = nn.Linear(self.emb_size, 527) + + if checkpoint is not None and os.path.isfile(checkpoint): + state_dict = paddle.load(checkpoint) + self.set_state_dict(state_dict) + print(f'Loaded CNN10 pretrained parameters from: {checkpoint}') + else: + print('No valid checkpoints for CNN10. 
Start training from scratch.') + + self.extract_embedding = extract_embedding + + def forward(self, x): + x.stop_gradient = False + x = x.transpose([0, 3, 2, 1]) + x = self.bn0(x) + x = x.transpose([0, 3, 2, 1]) + + x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + + x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + + x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + + x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + + x = x.mean(axis=3) + x = x.max(axis=2) + x.mean(axis=2) + + x = F.dropout(x, p=0.5, training=self.training) + x = F.relu(self.fc1(x)) + + if self.extract_embedding: + output = F.dropout(x, p=0.5, training=self.training) + else: + output = F.sigmoid(self.fc_audioset(x)) + return output diff --git a/PaddleAudio/paddleaudio/models/PANNs/cnn14.py b/PaddleAudio/paddleaudio/models/PANNs/cnn14.py new file mode 100644 index 00000000..5844040a --- /dev/null +++ b/PaddleAudio/paddleaudio/models/PANNs/cnn14.py @@ -0,0 +1,93 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ...utils.log import logger +from .conv import ConvBlock + + +class CNN14(nn.Layer): + """ + The CNN14(14-layer CNNs) mainly consist of 6 convolutional blocks while each convolutional + block consists of 2 convolutional layers with a kernel size of 3 × 3. + + Reference: + PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition + https://arxiv.org/pdf/1912.10211.pdf + """ + emb_size = 2048 + + def __init__(self, extract_embedding: bool = True, checkpoint: str = None): + + super(CNN14, self).__init__() + self.bn0 = nn.BatchNorm2D(64) + self.conv_block1 = ConvBlock(in_channels=1, out_channels=64) + self.conv_block2 = ConvBlock(in_channels=64, out_channels=128) + self.conv_block3 = ConvBlock(in_channels=128, out_channels=256) + self.conv_block4 = ConvBlock(in_channels=256, out_channels=512) + self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024) + self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048) + + self.fc1 = nn.Linear(2048, self.emb_size) + self.fc_audioset = nn.Linear(self.emb_size, 527) + + if checkpoint is not None and os.path.isfile(checkpoint): + state_dict = paddle.load(checkpoint) + self.set_state_dict(state_dict) + logger.info(f'Loaded CNN14 pretrained parameters from: {checkpoint}') + else: + logger.error('No valid checkpoints for CNN14. 
Start training from scratch.') + + self.extract_embedding = extract_embedding + + def forward(self, x): + x.stop_gradient = False + x = x.transpose([0, 3, 2, 1]) + x = self.bn0(x) + x = x.transpose([0, 3, 2, 1]) + + x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + + x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + + x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + + x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + + x = self.conv_block5(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + + x = self.conv_block6(x, pool_size=(1, 1), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + + x = x.mean(axis=3) + x = x.max(axis=2) + x.mean(axis=2) + + x = F.dropout(x, p=0.5, training=self.training) + x = F.relu(self.fc1(x)) + + if self.extract_embedding: + output = F.dropout(x, p=0.5, training=self.training) + else: + output = F.sigmoid(self.fc_audioset(x)) + return output diff --git a/PaddleAudio/paddleaudio/models/PANNs/cnn6.py b/PaddleAudio/paddleaudio/models/PANNs/cnn6.py new file mode 100644 index 00000000..537a4edd --- /dev/null +++ b/PaddleAudio/paddleaudio/models/PANNs/cnn6.py @@ -0,0 +1,85 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ...utils.log import logger +from .conv import ConvBlock5x5 + + +class CNN6(nn.Layer): + """ + The CNN14(14-layer CNNs) mainly consist of 4 convolutional blocks while each convolutional + block consists of 1 convolutional layers with a kernel size of 5 × 5. + + Reference: + PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition + https://arxiv.org/pdf/1912.10211.pdf + """ + emb_size = 512 + + def __init__(self, extract_embedding: bool = True, checkpoint: str = None): + + super(CNN6, self).__init__() + self.bn0 = nn.BatchNorm2D(64) + self.conv_block1 = ConvBlock5x5(in_channels=1, out_channels=64) + self.conv_block2 = ConvBlock5x5(in_channels=64, out_channels=128) + self.conv_block3 = ConvBlock5x5(in_channels=128, out_channels=256) + self.conv_block4 = ConvBlock5x5(in_channels=256, out_channels=512) + + self.fc1 = nn.Linear(512, self.emb_size) + self.fc_audioset = nn.Linear(self.emb_size, 527) + + if checkpoint is not None and os.path.isfile(checkpoint): + state_dict = paddle.load(checkpoint) + self.set_state_dict(state_dict) + print(f'Loaded CNN6 pretrained parameters from: {checkpoint}') + else: + print('No valid checkpoints for CNN6. 
Start training from scratch.') + + self.extract_embedding = extract_embedding + + def forward(self, x): + x.stop_gradient = False + x = x.transpose([0, 3, 2, 1]) + x = self.bn0(x) + x = x.transpose([0, 3, 2, 1]) + + x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + + x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + + x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + + x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg') + x = F.dropout(x, p=0.2, training=self.training) + + x = x.mean(axis=3) + x = x.max(axis=2) + x.mean(axis=2) + + x = F.dropout(x, p=0.5, training=self.training) + x = F.relu(self.fc1(x)) + + if self.extract_embedding: + output = F.dropout(x, p=0.5, training=self.training) + else: + output = F.sigmoid(self.fc_audioset(x)) + return output diff --git a/PaddleAudio/paddleaudio/models/PANNs/conv.py b/PaddleAudio/paddleaudio/models/PANNs/conv.py new file mode 100644 index 00000000..775355de --- /dev/null +++ b/PaddleAudio/paddleaudio/models/PANNs/conv.py @@ -0,0 +1,86 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class ConvBlock(nn.Layer): + def __init__(self, in_channels, out_channels): + super(ConvBlock, self).__init__() + + self.conv1 = nn.Conv2D(in_channels=in_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=(1, 1), + padding=(1, 1), + bias_attr=False) + self.conv2 = nn.Conv2D(in_channels=out_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=(1, 1), + padding=(1, 1), + bias_attr=False) + self.bn1 = nn.BatchNorm2D(out_channels) + self.bn2 = nn.BatchNorm2D(out_channels) + + def forward(self, x, pool_size=(2, 2), pool_type='avg'): + x = self.conv1(x) + x = self.bn1(x) + x = F.relu(x) + + x = self.conv2(x) + x = self.bn2(x) + x = F.relu(x) + + if pool_type == 'max': + x = F.max_pool2d(x, kernel_size=pool_size) + elif pool_type == 'avg': + x = F.avg_pool2d(x, kernel_size=pool_size) + elif pool_type == 'avg+max': + x = F.avg_pool2d(x, kernel_size=pool_size) + F.max_pool2d(x, kernel_size=pool_size) + else: + raise Exception( + f'Pooling type of {pool_type} is not supported. It must be one of "max", "avg" and "avg+max".') + return x + + +class ConvBlock5x5(nn.Layer): + def __init__(self, in_channels, out_channels): + super(ConvBlock5x5, self).__init__() + + self.conv1 = nn.Conv2D(in_channels=in_channels, + out_channels=out_channels, + kernel_size=(5, 5), + stride=(1, 1), + padding=(2, 2), + bias_attr=False) + self.bn1 = nn.BatchNorm2D(out_channels) + + def forward(self, x, pool_size=(2, 2), pool_type='avg'): + x = self.conv1(x) + x = self.bn1(x) + x = F.relu(x) + + if pool_type == 'max': + x = F.max_pool2d(x, kernel_size=pool_size) + elif pool_type == 'avg': + x = F.avg_pool2d(x, kernel_size=pool_size) + elif pool_type == 'avg+max': + x = F.avg_pool2d(x, kernel_size=pool_size) + F.max_pool2d(x, kernel_size=pool_size) + else: + raise Exception( + f'Pooling type of {pool_type} is not supported. 
It must be one of "max", "avg" and "avg+max".') + return x diff --git a/PaddleAudio/paddleaudio/models/__init__.py b/PaddleAudio/paddleaudio/models/__init__.py new file mode 100644 index 00000000..0deb9706 --- /dev/null +++ b/PaddleAudio/paddleaudio/models/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .PANNs import CNN14, CNN10, CNN6 + +__all__ = [ + 'CNN14', + 'CNN10', + 'CNN6', +] diff --git a/PaddleAudio/paddleaudio/scripts/wav2mel.py b/PaddleAudio/paddleaudio/scripts/wav2mel.py new file mode 100644 index 00000000..a1013d36 --- /dev/null +++ b/PaddleAudio/paddleaudio/scripts/wav2mel.py @@ -0,0 +1,76 @@ +import argparse +import glob +import os + +import h5py +import numpy as np +import tqdm + +import paddleaudio as pa + +#from pylab import * +parser = argparse.ArgumentParser(description='wave2mel') +parser.add_argument('--wav_file', type=str, required=False, default='') +parser.add_argument('--wav_list', type=str, required=False, default='') +parser.add_argument('--wav_h5_file', type=str, required=False, default='') +parser.add_argument('--wav_h5_list', type=str, required=False, default='') +parser.add_argument('--output_folder', type=str, required=False, default='./') +parser.add_argument('--output_h5', type=bool, required=False, default=True) +parser.add_argument('--sample_rate', type=int, required=False, default=32000) +parser.add_argument('--window_size', 
type=int, required=False, default=1024) +parser.add_argument('--mel_bins', type=int, required=False, default=128) +parser.add_argument('--hop_length', type=int, required=False, default=640) #20ms +parser.add_argument('--fmin', type=int, required=False, default=50) #25ms +parser.add_argument('--fmax', type=int, required=False, default=16000) #25ms +parser.add_argument('--skip_existed', type=int, required=False, default=1) #25ms + +args = parser.parse_args() +#args.wav_h5_file = '/ssd2/laiyongquan/audioset/h5/audioset_unblance_group28.h5' + +assert not (args.wav_h5_file == '' and args.wav_h5_list == ''\ +and args.wav_list == '' and args.wav_file == ''), 'one of wav_file,wav_list,\ +wav_h5_file,wav_h5_list needs to specify' + +if args.wav_h5_file != '': + h5_files = [args.wav_h5_file] +if args.wav_h5_list != '': + h5_files = open(args.wav_h5_list).read().split('\n') + h5_files = [h for h in h5_files if len(h.strip()) != 0] + +dst_folder = args.output_folder +print(f'{len(h5_files)} h5 files listed') +for f in h5_files: + print(f'processing {f}') + dst_file = os.path.join(dst_folder, f.split('/')[-1]) + print(f'target file {dst_file}') + if args.skip_existed != 0 and os.path.exists(dst_file): + print(f'skipped file {f}') + continue + assert not os.path.exists(dst_file), f'target file {dst_file} existed' + src_h5 = h5py.File(f) + dst_h5 = h5py.File(dst_file, "w") + for key in tqdm.tqdm(src_h5.keys()): + s = src_h5[key][:] + s = pa.depth_convert(s, 'float32') + # s = pa.resample(s,32000,args.sample_rate) + x = pa.features.mel_spect(s, + sample_rate=args.sample_rate, + window_size=args.window_size, + hop_length=args.hop_length, + mel_bins=args.mel_bins, + fmin=args.fmin, + fmax=args.fmax, + window='hann', + center=True, + pad_mode='reflect', + ref=1.0, + amin=1e-10, + top_db=None) + # figure(figsize=(8,8)) + # imshow(x) + # show() + # print(x.shape) + + dst_h5.create_dataset(key, data=x) + src_h5.close() + dst_h5.close() diff --git 
a/PaddleAudio/paddleaudio/scripts/wav2mel2.py b/PaddleAudio/paddleaudio/scripts/wav2mel2.py new file mode 100644 index 00000000..3c5188df --- /dev/null +++ b/PaddleAudio/paddleaudio/scripts/wav2mel2.py @@ -0,0 +1,113 @@ +import argparse +import glob +import os + +import h5py +import numpy as np +import tqdm + +import paddleaudio as pa + +parser = argparse.ArgumentParser(description='wave2mel') +parser.add_argument('--wav_file', type=str, required=False, default='') +parser.add_argument('--wav_list', type=str, required=False, default='') +parser.add_argument('--wav_h5_file', type=str, required=False, default='') +parser.add_argument('--wav_h5_list', type=str, required=False, default='') +parser.add_argument('--output_folder', type=str, required=False, default='./') +parser.add_argument('--output_h5', type=bool, required=False, default=True) +parser.add_argument('--dst_h5_file', type=str, required=False, default='') + +parser.add_argument('--sample_rate', type=int, required=False, default=32000) +parser.add_argument('--window_size', type=int, required=False, default=1024) +parser.add_argument('--mel_bins', type=int, required=False, default=128) +parser.add_argument('--hop_length', type=int, required=False, default=640) #20ms +parser.add_argument('--fmin', type=int, required=False, default=50) #25ms +parser.add_argument('--fmax', type=int, required=False, default=16000) #25ms +parser.add_argument('--skip_existed', type=int, required=False, default=1) #25ms + +args = parser.parse_args() + +assert not (args.wav_h5_file == '' and args.wav_h5_list == ''\ +and args.wav_list == '' and args.wav_file == ''), 'one of wav_file,wav_list,\ +wav_h5_file,wav_h5_list needs to specify' + +h5_files = [] +wav_files = [] +if args.wav_h5_file != '': + h5_files = [args.wav_h5_file] +elif args.wav_h5_list != '': + h5_files = open(args.wav_h5_list).read().split('\n') + h5_files = [h for h in h5_files if len(h.strip()) != 0] +elif args.wav_list != '': + wav_files = 
open(args.wav_list).read().split('\n') + wav_files = [h for h in wav_files if len(h.strip()) != 0] + +elif args.wav_file != '': + wav_files = [args.wav_file] + +dst_folder = args.output_folder + +if len(h5_files) > 0: + print(f'{len(h5_files)} h5 files listed') + for f in h5_files: + print(f'processing {f}') + dst_file = os.path.join(dst_folder, f.split('/')[-1]) + print(f'target file {dst_file}') + if args.skip_existed != 0 and os.path.exists(dst_file): + print(f'skipped file {f}') + continue + assert not os.path.exists(dst_file), f'target file {dst_file} existed' + src_h5 = h5py.File(f) + dst_h5 = h5py.File(dst_file, "w") + for key in tqdm.tqdm(src_h5.keys()): + s = src_h5[key][:] + s = pa.depth_convert(s, 'float32') + # s = pa.resample(s,32000,args.sample_rate) + x = pa.features.mel_spect(s, + sample_rate=args.sample_rate, + window_size=args.window_size, + hop_length=args.hop_length, + mel_bins=args.mel_bins, + fmin=args.fmin, + fmax=args.fmax, + window='hann', + center=True, + pad_mode='reflect', + ref=1.0, + amin=1e-10, + top_db=None) + dst_h5.create_dataset(key, data=x) + src_h5.close() + dst_h5.close() + +if len(wav_files) > 0: + + assert args.dst_h5_file != '', 'for using wav file or wav list, dst_h5_file must be specified' + + dst_file = args.dst_h5_file + assert not os.path.exists(dst_file), f'target file {dst_file} existed' + dst_h5 = h5py.File(dst_file, "w") + print(f'{len(wav_files)} wav files listed') + for f in tqdm.tqdm(wav_files): + s, _ = pa.load(f, sr=args.sample_rate) + # s = pa.resample(s,32000,args.sample_rate) + x = pa.features.mel_spect(s, + sample_rate=args.sample_rate, + window_size=args.window_size, + hop_length=args.hop_length, + mel_bins=args.mel_bins, + fmin=args.fmin, + fmax=args.fmax, + window='hann', + center=True, + pad_mode='reflect', + ref=1.0, + amin=1e-10, + top_db=None) + # figure(figsize=(8,8)) + # imshow(x) + # show() + # print(x.shape) + key = f.split('/')[-1][:11] + dst_h5.create_dataset(key, data=x) + dst_h5.close() diff 
--git a/PaddleAudio/paddleaudio/utils/download.py b/PaddleAudio/paddleaudio/utils/download.py new file mode 100644 index 00000000..1a0a4f5d --- /dev/null +++ b/PaddleAudio/paddleaudio/utils/download.py @@ -0,0 +1,33 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict, List + +from paddle.utils import download + +from .log import logger + +download.logger = logger + + +def download_and_decompress(archives: List[Dict[str, str]], path: str): + """ + Download archieves and decompress to specific path. + """ + for archive in archives: + assert 'url' in archive and 'md5' in archive, \ + 'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archieve.keys())}' + + logger.info(f'Downloading from: {archive["url"]}') + download.get_path_from_url(archive['url'], path, archive['md5']) diff --git a/PaddleAudio/paddleaudio/utils/env.py b/PaddleAudio/paddleaudio/utils/env.py new file mode 100644 index 00000000..164e6d15 --- /dev/null +++ b/PaddleAudio/paddleaudio/utils/env.py @@ -0,0 +1,52 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +''' +This module is used to store environmental variables in PaddleAudio. +PPAUDIO_HOME --> the root directory for storing PaddleAudio related data. Default to ~/.paddleaudio. Users can change the +├ default value through the PPAUDIO_HOME environment variable. +├─ MODEL_HOME --> Store model files. +└─ DATA_HOME --> Store automatically downloaded datasets. +''' + +import os + + +def _get_user_home(): + return os.path.expanduser('~') + + +def _get_ppaudio_home(): + if 'PPAUDIO_HOME' in os.environ: + home_path = os.environ['PPAUDIO_HOME'] + if os.path.exists(home_path): + if os.path.isdir(home_path): + return home_path + else: + raise RuntimeError('The environment variable PPAUDIO_HOME {} is not a directory.'.format(home_path)) + else: + return home_path + return os.path.join(_get_user_home(), '.paddleaudio') + + +def _get_sub_home(directory): + home = os.path.join(_get_ppaudio_home(), directory) + if not os.path.exists(home): + os.makedirs(home) + return home + + +USER_HOME = _get_user_home() +PPAUDIO_HOME = _get_ppaudio_home() +MODEL_HOME = _get_sub_home('models') +DATA_HOME = _get_sub_home('datasets') diff --git a/PaddleAudio/paddleaudio/utils/log.py b/PaddleAudio/paddleaudio/utils/log.py new file mode 100644 index 00000000..d81ecdf8 --- /dev/null +++ b/PaddleAudio/paddleaudio/utils/log.py @@ -0,0 +1,139 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. 
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Colored logging utilities for PaddleAudio."""

import contextlib
import functools
import logging
import threading
import time

import colorlog

loggers = {}

# Level name -> numeric logging level and colorlog display color.
# TRAIN (21) and EVAL (22) are custom levels between INFO and WARNING.
log_config = {
    'DEBUG': {
        'level': 10,
        'color': 'purple'
    },
    'INFO': {
        'level': 20,
        'color': 'green'
    },
    'TRAIN': {
        'level': 21,
        'color': 'cyan'
    },
    'EVAL': {
        'level': 22,
        'color': 'blue'
    },
    'WARNING': {
        'level': 30,
        'color': 'yellow'
    },
    'ERROR': {
        'level': 40,
        'color': 'red'
    },
    'CRITICAL': {
        'level': 50,
        'color': 'bold_red'
    }
}


class Logger(object):
    '''
    Default logger in PaddleAudio.

    Args:
        name(str) : Logger name, default is 'PaddleAudio'
    '''
    def __init__(self, name: str = None):
        name = 'PaddleAudio' if not name else name
        self.logger = logging.getLogger(name)

        # Register the custom levels and expose per-level helpers such as
        # logger.TRAIN(msg) / logger.train(msg).
        for key, conf in log_config.items():
            logging.addLevelName(conf['level'], key)
            self.__dict__[key] = functools.partial(self.__call__, conf['level'])
            self.__dict__[key.lower()] = functools.partial(self.__call__, conf['level'])

        self.format = colorlog.ColoredFormatter('%(log_color)s[%(asctime)-15s] [%(levelname)8s]%(reset)s - %(message)s',
                                                log_colors={key: conf['color']
                                                            for key, conf in log_config.items()})

        self.handler = logging.StreamHandler()
        self.handler.setFormatter(self.format)

        self.logger.addHandler(self.handler)
        self.logLevel = 'DEBUG'
        self.logger.setLevel(logging.DEBUG)
        # Avoid double printing through the root logger's handlers.
        self.logger.propagate = False
        self._is_enable = True

    def disable(self):
        """Silence all subsequent logging calls."""
        self._is_enable = False

    def enable(self):
        """Re-enable logging after a disable()."""
        self._is_enable = True

    @property
    def is_enable(self) -> bool:
        return self._is_enable

    def __call__(self, log_level: str, msg: str):
        """Emit `msg` at `log_level` unless the logger is disabled."""
        if not self.is_enable:
            return

        self.logger.log(log_level, msg)

    @contextlib.contextmanager
    def use_terminator(self, terminator: str):
        """Temporarily replace the stream handler's line terminator."""
        old_terminator = self.handler.terminator
        self.handler.terminator = terminator
        try:
            yield
        finally:
            # BUG FIX: restore the terminator even if the body raises;
            # otherwise every later log line would end with e.g. '\r'.
            self.handler.terminator = old_terminator

    @contextlib.contextmanager
    def processing(self, msg: str, interval: float = 0.1):
        '''
        Continuously print a progress bar with rotating special effects.

        Args:
            msg(str): Message to be printed.
            interval(float): Rotation interval. Default to 0.1.
        '''
        end = False

        def _printer():
            index = 0
            flags = ['\\', '|', '/', '-']
            while not end:
                flag = flags[index % len(flags)]
                with self.use_terminator('\r'):
                    self.info('{}: {}'.format(msg, flag))
                time.sleep(interval)
                index += 1

        t = threading.Thread(target=_printer)
        t.start()
        try:
            yield
        finally:
            # BUG FIX: the original set end=True only on the success path and
            # never joined, so an exception in the body left the spinner
            # thread running forever. Always stop and reap it.
            end = True
            t.join()


logger = Logger()
License", + "Operating System :: OS Independent", + ], + python_requires='>=3.6', +) -- GitLab