diff --git a/README.md b/README.md index e08ec0a0673eb3294948eb634ba5ecf135ef9e08..950708ac09704b47b943dc1bda93610c346fb66a 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ |[1712.05884](https://arxiv.org/pdf/1712.05884.pdf) | Tacotron 2 | Natural TTS Synthesis by Conditioning Wavenet on Mel Spectrogram Predictions | |[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E | Generalized End-To-End Loss for Speaker Verification | |[1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron | Tacotron: Towards End-To-End Speech Synthesis | +|[1609.03499](https://arxiv.org/pdf/1609.03499.pdf) | Wavenet | Wavenet: A Generative Model for Raw Audio | |[1509.08062](https://arxiv.org/pdf/1509.08062.pdf) | TE2E | End-to-End Text-Dependent Speaker Verification | |[1409.0473](https://arxiv.org/pdf/1409.0473.pdf) | Attention | Neural Machine Translation by Jointly Learning to Align and Translate | @@ -19,10 +20,11 @@ - [x] Finish the analysis of SV2TTS - Other papers to read: - [x] Tacotron 2 (base for the synthesizer and vocoder of SV2TTS) - - [ ] GE2E (base for the encoder of SV2TTS) + - [ ] GE2E (Encoder of SV2TTS) - [ ] TE2E (base for GE2E) - [ ] Attention (to learn about the attention layer) - - [ ] Tacotron 1 + - [ ] Tacotron 1 (base for Tacotron 2) + - [ ] Wavenet (vocoder of Tacotron) - [ ] Reformat my paper/dataset notes in markdown (?) - [ ] Get started on the SOTA review - [ ] Get started on the description of SV2TTS diff --git a/documents/references.bib b/documents/references.bib new file mode 100644 index 0000000000000000000000000000000000000000..3a76fd1e2d262c3a19f5fe6e0d14972b9622d018 --- /dev/null +++ b/documents/references.bib @@ -0,0 +1,298 @@ + + + + + + + +@inproceedings{HMMSpeakerInterpolation, + title={Speaker interpolation in HMM-based speech synthesis system}, + author={Takayoshi Yoshimura and Takashi Masuko and Keiichi Tokuda and Takao Kobayashi and Tadashi Kitamura}, + booktitle={EUROSPEECH}, + year={1997} +} +@article{STRAIGHT, + title = "Restructuring speech representations using a pitch-adaptive time–frequency smoothing and an instantaneous-frequency-based F0 extraction: Possible role of a repetitive structure in sounds", + journal = "Speech Communication", + volume = "27", + number = "3", + pages = "187 - 207", + year = "1999", + issn = "0167-6393", + doi = "https://doi.org/10.1016/S0167-6393(98)00085-5", + url = "http://www.sciencedirect.com/science/article/pii/S0167639398000855", + author = "Hideki Kawahara and Ikuyo Masuda-Katsuse and Alain de Cheveigné", + keywords = "Speech analysis, Pitch-synchronous, Spline smoothing, Instantaneous frequency, F0 extraction, Speech synthesis, Speech modification" +} +@inbook{TTSSOTA, + place={Cambridge}, + title={Speech Synthesis: State of the Art and Challenges for the Future}, + DOI={10.1017/9781316676202.019}, booktitle={Social Signal Processing}, + publisher={Cambridge University Press}, + author={Georgila, Kallirroi}, + editor={Burgoon, + Judee K. and Magnenat-Thalmann, + Nadia and Pantic, + Maja and Vinciarelli, + Alessandro}, + year={2017}, + pages={257–272} +} +@INPROCEEDINGS{SPSSDNN, + author={H. Ze and A. Senior and M. 
Schuster}, + booktitle={2013 IEEE International Conference on Acoustics, Speech and Signal Processing}, + title={Statistical parametric speech synthesis using deep neural networks}, + year={2013}, + volume={}, + number={}, + pages={7962-7966}, + keywords={hidden Markov models;neural nets;speech synthesis;statistical parametric speech synthesis;deep neural networks;decision tree clustered context dependent hidden Markov models;HMM;probability densities;speech parameters;speech waveform;decision trees;acoustic realizations;Hidden Markov models;Speech;Speech synthesis;Decision trees;Context;Training data;Neural networks;Statistical parametric speech synthesis;Hidden Markov model;Deep neural network}, + doi={10.1109/ICASSP.2013.6639215}, + ISSN={1520-6149}, + month={May}, +} +@INPROCEEDINGS{MLSA, + author={S. Imai}, + booktitle={ICASSP '83. IEEE International Conference on Acoustics, Speech, and Signal Processing}, + title={Cepstral analysis synthesis on the mel frequency scale}, + year={1983}, + volume={8}, + number={}, + pages={93-96}, + keywords={Cepstral analysis;Frequency synthesizers;Speech synthesis;Mel frequency cepstral coefficient;Vocoders;Speech analysis;Nonlinear filters;Fourier transforms;Cepstrum;Quantization}, + doi={10.1109/ICASSP.1983.1172250}, + ISSN={}, + month={April}, +} +@inproceedings{HMMTTS, + title={Simultaneous modeling of spectrum, pitch and duration in HMM-based speech synthesis}, + author={Takayoshi Yoshimura and Keiichi Tokuda and Takashi Masuko and Takao Kobayashi and Tadashi Kitamura}, + booktitle={EUROSPEECH}, + year={1999} +} +@article{dilated, + author = {Fisher Yu and + Vladlen Koltun}, + title = {Multi-Scale Context Aggregation by Dilated Convolutions}, + journal = {CoRR}, + volume = {abs/1511.07122}, + year = {2015}, + url = {http://arxiv.org/abs/1511.07122}, + archivePrefix = {arXiv}, + eprint = {1511.07122}, + timestamp = {Wed, 07 Jun 2017 14:40:43 +0200}, + biburl = {http://dblp.org/rec/bib/journals/corr/YuK15}, + bibsource = {dblp computer science bibliography, http://dblp.org} +} + +@article {scalingPolicy, + author = {LOWE, WILL and BENOIT, KENNETH and MIKHAYLOV, SLAVA and LAVER, MICHAEL}, + title = {Scaling Policy Preferences from Coded Political Texts}, + journal = {Legislative Studies Quarterly}, + volume = {36}, + number = {1}, + publisher = {Blackwell Publishing Inc}, + issn = {1939-9162}, + url = {http://dx.doi.org/10.1111/j.1939-9162.2010.00006.x}, + doi = {10.1111/j.1939-9162.2010.00006.x}, + pages = {123--155}, + year = {2011}, +} + + +@article{depecheMood, + author = {Jacopo Staiano and + Marco Guerini}, + title = {DepecheMood: a Lexicon for Emotion Analysis from Crowd-Annotated News}, + journal = {CoRR}, + volume = {abs/1405.1605}, + year = {2014}, + url = {http://arxiv.org/abs/1405.1605}, + archivePrefix = {arXiv}, + eprint = {1405.1605}, + timestamp = {Wed, 07 Jun 2017 14:41:41 +0200}, + biburl = {http://dblp.org/rec/bib/journals/corr/StaianoG14}, + bibsource = {dblp computer science bibliography, http://dblp.org} +} + +@article{GRULSTMcomp, + author = {Junyoung Chung and + {\c{C}}aglar G{\"{u}}l{\c{c}}ehre and + KyungHyun Cho and + Yoshua Bengio}, + title = {Empirical Evaluation of Gated Recurrent Neural Networks on Sequence + Modeling}, + journal = {CoRR}, + volume = {abs/1412.3555}, + year = {2014}, + url = {http://arxiv.org/abs/1412.3555}, + archivePrefix = {arXiv}, + eprint = {1412.3555}, + timestamp = {Wed, 07 Jun 2017 14:40:04 +0200}, + biburl = {http://dblp.org/rec/bib/journals/corr/ChungGCB14}, + bibsource = {dblp computer 
science bibliography, http://dblp.org} +} + +@article{GRU, + author = {KyungHyun Cho and + Bart van Merrienboer and + Dzmitry Bahdanau and + Yoshua Bengio}, + title = {On the Properties of Neural Machine Translation: Encoder-Decoder Approaches}, + journal = {CoRR}, + volume = {abs/1409.1259}, + year = {2014}, + url = {http://arxiv.org/abs/1409.1259}, + archivePrefix = {arXiv}, + eprint = {1409.1259}, + timestamp = {Wed, 07 Jun 2017 14:42:33 +0200}, + biburl = {http://dblp.org/rec/bib/journals/corr/ChoMBB14}, + bibsource = {dblp computer science bibliography, http://dblp.org} +} + +@article{sentimentNeuron, + author = {Alec Radford and + Rafal J{\'{o}}zefowicz and + Ilya Sutskever}, + title = {Learning to Generate Reviews and Discovering Sentiment}, + journal = {CoRR}, + volume = {abs/1704.01444}, + year = {2017}, + url = {http://arxiv.org/abs/1704.01444}, + archivePrefix = {arXiv}, + eprint = {1704.01444}, + timestamp = {Wed, 07 Jun 2017 14:43:05 +0200}, + biburl = {http://dblp.org/rec/bib/journals/corr/RadfordJS17}, + bibsource = {dblp computer science bibliography, http://dblp.org} +} + +@article{LSTM, + author = {Hochreiter, Sepp and Schmidhuber, J\"{u}rgen}, + title = {Long Short-Term Memory}, + journal = {Neural Comput.}, + issue_date = {November 15, 1997}, + volume = {9}, + number = {8}, + month = nov, + year = {1997}, + issn = {0899-7667}, + pages = {1735--1780}, + numpages = {46}, + url = {http://dx.doi.org/10.1162/neco.1997.9.8.1735}, + doi = {10.1162/neco.1997.9.8.1735}, + acmid = {1246450}, + publisher = {MIT Press}, + address = {Cambridge, MA, USA}, +} + +@article{wavenet, + author = {A{\"{a}}ron van den Oord and + Sander Dieleman and + Heiga Zen and + Karen Simonyan and + Oriol Vinyals and + Alex Graves and + Nal Kalchbrenner and + Andrew W. Senior and + Koray Kavukcuoglu}, + title = {WaveNet: {A} Generative Model for Raw Audio}, + journal = {CoRR}, + volume = {abs/1609.03499}, + year = {2016}, + url = {http://arxiv.org/abs/1609.03499}, + archivePrefix = {arXiv}, + eprint = {1609.03499}, + timestamp = {Wed, 07 Jun 2017 14:42:54 +0200}, + biburl = {http://dblp.org/rec/bib/journals/corr/OordDZSVGKSK16}, + bibsource = {dblp computer science bibliography, http://dblp.org} +} + +@inproceedings{CLDNNs, + title={Learning the speech front-end with raw waveform CLDNNs}, + author={Tara N. Sainath and Ron J. Weiss and Andrew W. Senior and Kevin W. Wilson and Oriol Vinyals}, + booktitle={INTERSPEECH}, + year={2015} +} + +@article{keywordSpotting, + author = {Sercan {\"{O}}mer Arik and + Markus Kliegl and + Rewon Child and + Joel Hestness and + Andrew Gibiansky and + Christopher Fougner and + Ryan Prenger and + Adam Coates}, + title = {Convolutional Recurrent Neural Networks for Small-Footprint Keyword + Spotting}, + journal = {CoRR}, + volume = {abs/1703.05390}, + year = {2017}, + url = {http://arxiv.org/abs/1703.05390}, + archivePrefix = {arXiv}, + eprint = {1703.05390}, + timestamp = {Thu, 20 Jul 2017 09:10:44 +0200}, + biburl = {http://dblp.org/rec/bib/journals/corr/ArikKCHGFPC17}, + bibsource = {dblp computer science bibliography, http://dblp.org} +} + +@article{lenet, + abstract = {{Multilayer neural networks trained with the back-propagation algorithm constitute the best example of a successful gradient based learning technique. Given an appropriate network architecture, gradient-based learning algorithms can be used to synthesize a complex decision surface that can classify high-dimensional patterns, such as handwritten characters, with minimal preprocessing. 
This paper reviews various methods applied to handwritten character recognition and compares them on a standard handwritten digit recognition task. Convolutional neural networks, which are specifically designed to deal with the variability of 2D shapes, are shown to outperform all other techniques. Real-life document recognition systems are composed of multiple modules including field extraction, segmentation recognition, and language modeling. A new learning paradigm, called graph transformer networks (GTN), allows such multimodule systems to be trained globally using gradient-based methods so as to minimize an overall performance measure. Two systems for online handwriting recognition are described. Experiments demonstrate the advantage of global training, and the flexibility of graph transformer networks. A graph transformer network for reading a bank cheque is also described. It uses convolutional neural network character recognizers combined with global training techniques to provide record accuracy on business and personal cheques. It is deployed commercially and reads several million cheques per day}}, + author = {Lecun, Y. and Bottou, L. and Bengio, Y. and Haffner, P.}, + booktitle = {Proceedings of the IEEE}, + citeulike-article-id = {4196818}, + citeulike-linkout-0 = {http://dx.doi.org/10.1109/5.726791}, + citeulike-linkout-1 = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=726791}, + day = {06}, + doi = {10.1109/5.726791}, + institution = {Speech \& Image Process. Services Lab., AT\&T Bell Labs., Red Bank, NJ, USA}, + issn = {00189219}, + journal = {Proceedings of the IEEE}, + keywords = {cnn, lenet-5}, + month = nov, + number = {11}, + pages = {2278--2324}, + posted-at = {2016-06-08 06:38:36}, + priority = {0}, + publisher = {IEEE}, + title = {{Gradient-based learning applied to document recognition}}, + url = {http://dx.doi.org/10.1109/5.726791}, + volume = {86}, + year = {1998} +} + +@incollection{alexnet, +title = {ImageNet Classification with Deep Convolutional Neural Networks}, +author = {Alex Krizhevsky and Sutskever, Ilya and Hinton, Geoffrey E}, +booktitle = {Advances in Neural Information Processing Systems 25}, +editor = {F. Pereira and C. J. C. Burges and L. Bottou and K. Q. Weinberger}, +pages = {1097--1105}, +year = {2012}, +publisher = {Curran Associates, Inc.}, +url = {http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf} +} + +@article{dropout, + author = {Srivastava, Nitish and Hinton, Geoffrey and Krizhevsky, Alex and Sutskever, Ilya and Salakhutdinov, Ruslan}, + title = {Dropout: A Simple Way to Prevent Neural Networks from Overfitting}, + journal = {J. Mach. Learn. 
Res.}, + issue_date = {January 2014}, + volume = {15}, + number = {1}, + month = jan, + year = {2014}, + issn = {1532-4435}, + pages = {1929--1958}, + numpages = {30}, + url = {http://dl.acm.org/citation.cfm?id=2627435.2670313}, + acmid = {2670313}, + publisher = {JMLR.org}, + keywords = {deep learning, model combination, neural networks, regularization}, +} + +@article{hydromodeling, + author = {Lee, Hyojin and Kang, Kwangmin}, + title = {Interpolation of Missing Precipitation Data Using Kernel Estimations for Hydrologic Modeling}, journal = {Advances in Meteorology}, + year = {2015}, + url = {http://dx.doi.org/10.1155/2015/935868}, +} diff --git a/documents/thesis.bbl b/documents/thesis.bbl new file mode 100644 index 0000000000000000000000000000000000000000..1fe35f5e7eecb4a1def5d717cd3953552ad13b85 --- /dev/null +++ b/documents/thesis.bbl @@ -0,0 +1,34 @@ +\begin{thebibliography}{1} + +\bibitem{TTSSOTA} +Kallirroi Georgila. +\newblock {\em Speech Synthesis: State of the Art and Challenges for the + Future}, page 257–272. +\newblock Cambridge University Press, 2017. + +\bibitem{MLSA} +S.~Imai. +\newblock Cepstral analysis synthesis on the mel frequency scale. +\newblock In {\em ICASSP '83. IEEE International Conference on Acoustics, + Speech, and Signal Processing}, volume~8, pages 93--96, April 1983. + +\bibitem{HMMSpeakerInterpolation} +Takayoshi Yoshimura, Takashi Masuko, Keiichi Tokuda, Takao Kobayashi, and + Tadashi Kitamura. +\newblock Speaker interpolation in hmm-based speech synthesis system. +\newblock In {\em EUROSPEECH}, 1997. + +\bibitem{HMMTTS} +Takayoshi Yoshimura, Keiichi Tokuda, Takashi Masuko, Takao Kobayashi, and + Tadashi Kitamura. +\newblock Simultaneous modeling of spectrum, pitch and duration in hmm-based + speech synthesis. +\newblock In {\em EUROSPEECH}, 1999. + +\bibitem{SPSSDNN} +H.~Ze, A.~Senior, and M.~Schuster. +\newblock Statistical parametric speech synthesis using deep neural networks. +\newblock In {\em 2013 IEEE International Conference on Acoustics, Speech and + Signal Processing}, pages 7962--7966, May 2013. + +\end{thebibliography} diff --git a/documents/thesis.blg b/documents/thesis.blg new file mode 100644 index 0000000000000000000000000000000000000000..0dfb65d45a442573a284e875e8a35f01e2845766 --- /dev/null +++ b/documents/thesis.blg @@ -0,0 +1,5 @@ +This is BibTeX, Version 0.99dThe top-level auxiliary file: thesis.aux +The style file: plain.bst +Database file #1: references.bib +Warning--can't use both author and editor fields in TTSSOTA +(There was 1 warning) diff --git a/documents/thesis.pdf b/documents/thesis.pdf index e8944d49122c286870ee954c4c4f7deeee1c0405..b99bd9fac8d3c57bf9200965bf6149453131e99c 100644 Binary files a/documents/thesis.pdf and b/documents/thesis.pdf differ diff --git a/documents/thesis.tex b/documents/thesis.tex index 575a81d95a7a0662674cf78c4669604e91dae525..51525407910c16c196714687da60efe7a2353566 100644 --- a/documents/thesis.tex +++ b/documents/thesis.tex @@ -73,9 +73,32 @@ To do when I'll have a good overview of the project. Try to answer: \color{red} Concise presentation of the problem -Methods before deep learning +*Note that layers will be explained in an upcoming section* -Wavenet +Preprocessing of text into phonemes? + +SOTA ON MULTISPEAKER TTS: + +First SPSS methods [2 - 20] of https://arxiv.org/pdf/1606.06061.pdf + +\color{black} +The previous state of the art in TTS includes hidden Markov model (HMM)-based speech synthesis, which is a statistical parametric speech synthesis (SPSS) method. 
HMMs are trained to synthesize mel-frequency cepstral coefficients (MFCC) together with energy and their delta and delta-delta coefficients \cite{TTSSOTA}. The result is passed through a vocoder\footnote{Specifically in TTS, some authors define a vocoder as a voice encoder that retrieves speech parameters to be used in synthesis. The more common definition, however, is that of a function that generates a raw audio waveform from temporal features such as MFCC. This is the one we will use. \color{red} Review this \color{black}} such as MLSA \cite{MLSA}. The spectral parameters, pitch parameters and state durations of the model are conditioned on the phoneme contexts: different contexts are clustered by a decision tree and a distribution is learned for each cluster \cite{HMMTTS}. It is thus possible to modify the generated voice by conditioning on a speaker or by tuning these parameters with adaptation or interpolation techniques (e.g. \cite{HMMSpeakerInterpolation}), effectively making HMM-based speech synthesis a multispeaker TTS system. \color{red} Compare with concatenative \cite{SPSSDNN} ? \color{black} + +\cite{SPSSDNN} proposed to model + +\color{red} + + + + +Wavenet: + +\color{black} +Breakthrough in TTS with raw waveform gen +https://deepmind.com/blog/wavenet-generative-model-raw-audio/ ?? +Dilated causal convolutions +Condition on a speaker identity +\color{red} Tacotron @@ -88,7 +111,12 @@ Extensions? +\color{red} +\color{black} +\clearpage +\bibliographystyle{plain} +\bibliography{references} @@ -105,11 +133,6 @@ Extensions? - - -\color{red} -\color{black} -
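The SPSS paragraph added to thesis.tex above mentions delta and delta-delta coefficients without defining them. As a reminder, here is the usual linear-regression definition in LaTeX form; the window half-width W is a free parameter (typically 1 or 2), and this is the textbook formula rather than necessarily the exact variant used in the cited systems:

```latex
% Dynamic (delta) features over a window of half-width W, where c_t is the
% static MFCC/energy vector at frame t. The delta-delta (acceleration)
% coefficients are obtained by applying the same operator to the delta sequence.
\begin{equation}
  \Delta c_t = \frac{\sum_{w=1}^{W} w \, (c_{t+w} - c_{t-w})}{2 \sum_{w=1}^{W} w^{2}},
  \qquad
  \Delta^{2} c_t = \frac{\sum_{w=1}^{W} w \, (\Delta c_{t+w} - \Delta c_{t-w})}{2 \sum_{w=1}^{W} w^{2}}
\end{equation}
```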
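The WaveNet notes added above ("Dilated causal convolutions", "Condition on a speaker identity") are still in draft form. Below is a minimal sketch of a stack of dilated causal convolutions, assuming PyTorch and leaving out WaveNet's gated activations, residual/skip connections, µ-law output and the speaker conditioning itself; none of the names come from this repository.

```python
# Illustrative sketch only: a stack of dilated causal 1-D convolutions in the
# spirit of WaveNet (van den Oord et al., 2016). Layer sizes are arbitrary.
import torch
import torch.nn as nn
import torch.nn.functional as F


class DilatedCausalStack(nn.Module):
    def __init__(self, channels: int = 32, n_layers: int = 6, kernel_size: int = 2):
        super().__init__()
        self.kernel_size = kernel_size
        # Dilation doubles at every layer (1, 2, 4, ...), so the receptive field
        # grows exponentially with depth while parameters grow only linearly.
        self.convs = nn.ModuleList(
            nn.Conv1d(channels, channels, kernel_size, dilation=2 ** i)
            for i in range(n_layers)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x has shape (batch, channels, time).
        for i, conv in enumerate(self.convs):
            dilation = 2 ** i
            # Left-pad the time axis so each output sample depends only on the
            # current and past inputs (causality) and the length is preserved.
            pad = (self.kernel_size - 1) * dilation
            x = torch.relu(conv(F.pad(x, (pad, 0))))
        return x


if __name__ == "__main__":
    model = DilatedCausalStack()
    dummy = torch.randn(1, 32, 1000)  # (batch, channels, time)
    print(model(dummy).shape)  # torch.Size([1, 32, 1000])
```

The exponentially growing dilation is the point of the construction: each output sample sees a long window of past audio at a modest depth and parameter count, which is what makes WaveNet practical as a raw-waveform vocoder.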