diff --git a/.gitignore b/.gitignore index a18b6a6009c88f2415c3c1df6c870550332d07f7..30c60377fdaa52a786cfcae2446e0950133375f9 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,7 @@ *.blg *.bbl *.bcf +*.toc _old SV2TTS/encoder/saved_models/*_backups SV2TTS/synthesizer/saved_models/* diff --git a/README.md b/_README.md similarity index 71% rename from README.md rename to _README.md index 9685d5987fd122b3acda199352e78b910e90c69a..e375707fc8da009af4788dcab7a8a70130bd9135 100644 --- a/README.md +++ b/_README.md @@ -18,26 +18,3 @@ |[1702.07825](https://arxiv.org/pdf/1702.07825.pdf) | Deep Voice 1 | Deep Voice: Real-time Neural Text-to-Speech | |[1609.03499](https://arxiv.org/pdf/1609.03499.pdf) | Wavenet | Wavenet: A Generative Model for Raw Audio | |[1506.07503](https://arxiv.org/pdf/1506.07503.pdf) | Attention | Attention-Based Models for Speech Recognition | - -### Task list (for writing) -- [ ] Rewrite the SOTA to be more focused -- [ ] Present SV2TTS -- For the encoder: - - [ ] Present the implementation and that of the authors - - [ ] Present the results -- For the Tacotron 2 implementation: - - [ ] Describe the architecture - - [ ] Present the modifications I've made -- [ ] Make speed tests for every part of the framework - -### Milestones -**For the end of March** -- Write about 8 new pages -- Gather the different parts of the framework in a single demonstration tool -- Settle on the vocoder and provide an implementation - -**For later**: -- Explore the approaches I've proposed for a deeper analysis of the framework - -### Other things -- Migrate repo to github once the baseline is decent diff --git a/documents/references.bib b/documents/references.bib index c466a233e0e9e089a9bf7325e69a389c8d11340a..c2d26b7c060d8d0023391169c885cc546f155690 100644 --- a/documents/references.bib +++ b/documents/references.bib @@ -1,3 +1,56 @@ +@misc{NeuralDiscreteRepresentation, + Author = {Aaron van den Oord and Oriol Vinyals and Koray Kavukcuoglu}, + Title = {Neural Discrete Representation Learning}, + Year = {2017}, + Eprint = {arXiv:1711.00937}, +} +@misc{DeepVoice2, + Author = {Sercan Arik and Gregory Diamos and Andrew Gibiansky and John Miller and Kainan Peng and Wei Ping and Jonathan Raiman and Yanqi Zhou}, + Title = {Deep Voice 2: Multi-Speaker Neural Text-to-Speech}, + Year = {2017}, + Eprint = {arXiv:1705.08947}, +} +@misc{CloningFewSamples, + Author = {Sercan O. Arik and Jitong Chen and Kainan Peng and Wei Ping and Yanqi Zhou}, + Title = {Neural Voice Cloning with a Few Samples}, + Year = {2018}, + Eprint = {arXiv:1802.06006}, +} +@INPROCEEDINGS{MOSNaturalness, + author={S. {Shirali-Shahreza} and G. 
{Penn}}, + booktitle={2018 IEEE Spoken Language Technology Workshop (SLT)}, + title={MOS Naturalness and the Quest for Human-Like Speech}, + year={2018}, + volume={}, + number={}, + pages={346-352}, + keywords={learning (artificial intelligence);natural language processing;speech intelligibility;speech synthesis;MOS naturalness;TTS quality;speech synthesis;speech intelligibility;TTS systems;native North-American speech;Indian speech;deep learning;Testing;Speech coding;Synthesizers;ITU;Protocols;Data models;text-to-speech synthesis;evaluation;naturalness;paired comparison tests}, + doi={10.1109/SLT.2018.8639599}, + ISSN={}, + month={Dec}, +} +@article{EfficientNeuralAudioSynthesis, + author = {Nal Kalchbrenner and + Erich Elsen and + Karen Simonyan and + Seb Noury and + Norman Casagrande and + Edward Lockhart and + Florian Stimberg and + A{\"{a}}ron van den Oord and + Sander Dieleman and + Koray Kavukcuoglu}, + title = {Efficient Neural Audio Synthesis}, + journal = {CoRR}, + volume = {abs/1802.08435}, + year = {2018}, + url = {http://arxiv.org/abs/1802.08435}, + archivePrefix = {arXiv}, + eprint = {1802.08435}, + timestamp = {Mon, 13 Aug 2018 16:47:01 +0200}, + biburl = {https://dblp.org/rec/bib/journals/corr/abs-1802-08435}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} @article{SV2TTS, author = {Ye Jia and Yu Zhang and @@ -380,22 +433,6 @@ bibsource = {dblp computer science bibliography, http://dblp.org} } -@article{sentimentNeuron, - author = {Alec Radford and - Rafal J{\'{o}}zefowicz and - Ilya Sutskever}, - title = {Learning to Generate Reviews and Discovering Sentiment}, - journal = {CoRR}, - volume = {abs/1704.01444}, - year = {2017}, - url = {http://arxiv.org/abs/1704.01444}, - archivePrefix = {arXiv}, - eprint = {1704.01444}, - timestamp = {Wed, 07 Jun 2017 14:43:05 +0200}, - biburl = {http://dblp.org/rec/bib/journals/corr/RadfordJS17}, - bibsource = {dblp computer science bibliography, http://dblp.org} -} - @article{LSTM, author = {Hochreiter, Sepp and Schmidhuber, J\"{u}rgen}, title = {Long Short-Term Memory}, diff --git a/documents/thesis.pdf b/documents/thesis.pdf index 94dd08e6b27a04d46582239a5efe05b96b269039..30071b895ab5eda5a2910e026d65e08012baafa6 100644 Binary files a/documents/thesis.pdf and b/documents/thesis.pdf differ diff --git a/documents/thesis.tex b/documents/thesis.tex index f721252f6d75b026ca03deedd058de02fafa91ac..16a4768ae926a9a08dc84c7004745b562ca3fac4 100644 --- a/documents/thesis.tex +++ b/documents/thesis.tex @@ -2,11 +2,14 @@ \usepackage{geometry} \geometry{ a4paper, - total={150mm,227mm}, - left=30mm, + total={154mm,227mm}, + left=28mm, top=35mm, } +\setlength{\parskip}{0.85em} +\setlength{\parindent}{2.6em} \usepackage{array} +\usepackage{amssymb} \usepackage{enumerate} \usepackage{graphicx} \usepackage{url} @@ -54,7 +57,6 @@ \vfill \end{titlepage} - \setcounter{page}{2} %\color{red} @@ -70,11 +72,26 @@ %\clearpage \section*{Abstract} -Recent advances in deep learning have shown impressive results in the domain of text-to-speech. To this end, a deep neural network is usually trained using a corpus of several hours of professionally recorded speech from a single speaker. Giving a new voice to such a model is highly expensive, as it requires recording a new dataset and retraining the model. A recent research introduced a three-stage pipeline that allows to clone a voice unseen during training from only a few seconds of reference speech, and without retraining the model. 
The authors share impressively natural-sounding results, but provide no implementation. We reproduce this framework and open-source the first public implementation of it. We adapt the framework with a newer vocoder model, so as to make it run in real-time. +Recent advances in deep learning have shown impressive results in the domain of text-to-speech. To this end, a deep neural network is usually trained using a corpus of several hours of professionally recorded speech from a single speaker. Giving a new voice to such a model is highly expensive, as it requires recording a new dataset and retraining the model. Recent research introduced a three-stage pipeline that can clone a voice unseen during training from only a few seconds of reference speech, without retraining the model. The authors share remarkably natural-sounding results, but provide no implementation. We reproduce this framework and open-source the first public implementation of it. We adapt the framework with a newer vocoder model, so as to make it run in real time. \clearpage +\tableofcontents +\clearpage \section{Introduction} +% What is now possible with deep learning in TTS +Deep learning models have become predominant in many fields of applied machine learning. Text-to-speech (TTS), the process of synthesizing artificial speech from a text prompt, is no exception. Deep models that produce more natural-sounding speech than traditional concatenative TTS approaches began appearing in 2016. Much of the research focus has since gathered around making these models more efficient, more natural, or trainable in an end-to-end fashion. Inference has gone from being hundreds of times slower than real time on a GPU \citep{WaveNet} to running in real time on a mobile CPU \citep{EfficientNeuralAudioSynthesis}. As for the quality of the generated speech, \citet{Tacotron2} demonstrate near-human naturalness. Interestingly, speech naturalness is best rated with subjective metrics, and comparison with actual human speech leads to the conclusion that there might be such a thing as ``speech more natural than human speech''. In fact, \citet{MOSNaturalness} argue that the human naturalness threshold has already been crossed. + +% The state of things in voice cloning +Datasets of professionally recorded speech are a scarce resource. Synthesizing a natural voice with correct pronunciation, lively intonation and minimal background noise requires training data with the same qualities. Furthermore, data efficiency often remains one of the shortcomings of deep learning. Training a common text-to-speech model such as Tacotron \citep{Tacotron1} typically requires tens of hours of speech. %It is thus no surprise that there seems to be a theme in speech-related applications of reproducing the voice of whomever is the current president of the United States. %-> kinda funny but not necessary +Yet the ability to generate speech with any voice is attractive for a range of applications, be they useful, merely a matter of customization, or mischievous. Research has led to frameworks for voice conversion and voice cloning. They differ in that voice conversion is a form of style transfer on a speech segment from one voice to another, whereas voice cloning consists of capturing the voice of a speaker in order to perform text-to-speech on arbitrary inputs.
+ +% What we want to achieve (problem definition, what is SV2TTS) +While the complete training of a single-speaker TTS model is technically a form of voice cloning, the interest rather lies in creating a fixed model that is able to incorporate new voices with little data. The common approach is to condition a TTS model trained to generalize to new speakers on an embedding of the voice to clone~\citep{DeepVoice2, CloningFewSamples, SV2TTS}. The embedding is low-dimensional and derived by a speaker encoder model that takes reference speech as input. This approach is typically more data-efficient than training a separate TTS model for each speaker, in addition to being orders of magnitude faster and less computationally expensive. Interestingly, there is a large discrepancy among the different methods in the duration of reference speech needed to clone a voice, ranging from half an hour per speaker to only a few seconds. + +% Our contributions + + \subsection{Problem definition} @@ -277,7 +294,7 @@ synth: - advantages of conditioning on embedding rather than \\ \clearpage -\bibliographystyle{unsrtnat} +\bibliographystyle{plainnat} \bibliography{references}
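To make the embedding-conditioning approach described in the added introduction concrete, below is a minimal sketch of the three-stage pipeline: a speaker encoder derives an embedding from reference speech, a synthesizer conditioned on that embedding turns text into a mel spectrogram, and a vocoder renders the waveform. This is not the repository's code; the function names, dimensions, and numpy stand-ins are illustrative assumptions only, whereas the real stages are trained neural networks.

import numpy as np

def speaker_encoder(reference_wav: np.ndarray, embed_dim: int = 64) -> np.ndarray:
    """Toy stand-in for a trained speaker encoder: reference speech -> fixed-size embedding."""
    usable = reference_wav[: reference_wav.size // embed_dim * embed_dim]
    frames = usable.reshape(-1, embed_dim)
    return np.tanh(frames.mean(axis=0))  # hypothetical 64-dimensional speaker embedding

def synthesizer(text: str, speaker_embedding: np.ndarray, n_mels: int = 80) -> np.ndarray:
    """Toy stand-in for a Tacotron-like synthesizer conditioned on a speaker embedding."""
    n_frames = 10 * len(text)  # toy alignment: a fixed number of frames per character
    mel = np.zeros((n_frames, n_mels))
    # Conditioning step: every synthesized frame sees the same speaker embedding.
    mel += np.resize(speaker_embedding, n_mels)
    return mel

def vocoder(mel: np.ndarray, hop_size: int = 200) -> np.ndarray:
    """Toy stand-in for a neural vocoder: mel spectrogram -> waveform."""
    return np.repeat(mel.mean(axis=1), hop_size)

# Clone a voice from a few seconds of reference speech, then speak arbitrary text with it.
reference = np.random.randn(5 * 16000)          # ~5 s of placeholder reference audio at 16 kHz
embedding = speaker_encoder(reference)          # computed once per speaker
wav = vocoder(synthesizer("an arbitrary sentence to synthesize", embedding))
print(embedding.shape, wav.shape)

The point of the sketch is only the data flow: the embedding is computed once per speaker and reused for any input text, which is why cloning a new voice requires no retraining of the synthesizer or vocoder.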