@misc{NeuralDiscreteRepresentation,
	Author = {Aaron van den Oord and Oriol Vinyals and Koray Kavukcuoglu},
	Title = {Neural Discrete Representation Learning},
	Year = {2017},
	Eprint = {arXiv:1711.00937},
}
@article{UMAP,
	author = {McInnes, Leland and Healy, John},
	year = {2018},
	month = {02},
	pages = {},
	title = {UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction}
}
@article{LibriTTS,
	author    = {Heiga Zen and
	Viet Dang and
	Rob Clark and
	Yu Zhang and
	Ron J. Weiss and
	Ye Jia and
	Zhifeng Chen and
	Yonghui Wu},
	title     = {LibriTTS: {A} Corpus Derived from LibriSpeech for Text-to-Speech},
	journal   = {CoRR},
	volume    = {abs/1904.02882},
	year      = {2019},
	url       = {http://arxiv.org/abs/1904.02882},
	archivePrefix = {arXiv},
	eprint    = {1904.02882},
	timestamp = {Wed, 24 Apr 2019 12:21:25 +0200},
	biburl    = {https://dblp.org/rec/bib/journals/corr/abs-1904-02882},
	bibsource = {dblp computer science bibliography, https://dblp.org}
}
@InProceedings{VoxCeleb2,
	author       = "Chung, J.~S. and Nagrani, A. and Zisserman, A.",
	title        = "VoxCeleb2: Deep Speaker Recognition",
	booktitle    = "INTERSPEECH",
	year         = "2018",
}
@InProceedings{VoxCeleb1,
	author       = "Nagrani, A. and Chung, J.~S. and Zisserman, A.",
	title        = "VoxCeleb: a large-scale speaker identification dataset",
	booktitle    = "INTERSPEECH",
	year         = "2017",
}
@INPROCEEDINGS{LibriSpeech, 
	author={V. {Panayotov} and G. {Chen} and D. {Povey} and S. {Khudanpur}}, 
	booktitle={2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, 
	title={Librispeech: An ASR corpus based on public domain audio books}, 
	year={2015}, 
	volume={}, 
	number={}, 
	pages={5206-5210}, 
	keywords={natural language processing;speech recognition;LibriSpeech corpus;ASR corpus;public domain audio books;read english speech;training speech recognition systems;evaluating speech recognition systems;LibriVox project;language-model training data;pre-built language models;acoustic models;Wall Street Journal;WSJ;Kaldi scripts;frequency 16 kHz;Resource description framework;Genomics;Bioinformatics;Blogs;Information services;Electronic publishing;Speech Recognition;Corpus;LibriVox}, 
	doi={10.1109/ICASSP.2015.7178964}, 
	ISSN={1520-6149}, 
	month={April},}
@misc{DeepVoice2,
	Author = {Sercan Arik and Gregory Diamos and Andrew Gibiansky and John Miller and Kainan Peng and Wei Ping and Jonathan Raiman and Yanqi Zhou},
	Title = {Deep Voice 2: Multi-Speaker Neural Text-to-Speech},
	Year = {2017},
	Eprint = {arXiv:1705.08947},
}
@misc{CloningFewSamples,
	Author = {Sercan O. Arik and Jitong Chen and Kainan Peng and Wei Ping and Yanqi Zhou},
	Title = {Neural Voice Cloning with a Few Samples},
	Year = {2018},
	Eprint = {arXiv:1802.06006},
}
@INPROCEEDINGS{MOSNaturalness, 
	author={S. {Shirali-Shahreza} and G. {Penn}}, 
	booktitle={2018 IEEE Spoken Language Technology Workshop (SLT)}, 
	title={MOS Naturalness and the Quest for Human-Like Speech}, 
	year={2018}, 
	volume={}, 
	number={}, 
	pages={346-352}, 
	keywords={learning (artificial intelligence);natural language processing;speech intelligibility;speech synthesis;MOS naturalness;TTS quality;speech synthesis;speech intelligibility;TTS systems;native North-American speech;Indian speech;deep learning;Testing;Speech coding;Synthesizers;ITU;Protocols;Data models;text-to-speech synthesis;evaluation;naturalness;paired comparison tests}, 
	doi={10.1109/SLT.2018.8639599}, 
	ISSN={}, 
	month={Dec},
}
@article{SV2TTS,
  author    = {Ye Jia and
               Yu Zhang and
               Ron J. Weiss and
               Quan Wang and
               Jonathan Shen and
               Fei Ren and
               Zhifeng Chen and
               Patrick Nguyen and
               Ruoming Pang and
               Ignacio Lopez{-}Moreno and
               Yonghui Wu},
  title     = {Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech
               Synthesis},
  journal   = {CoRR},
  volume    = {abs/1806.04558},
  year      = {2018},
  url       = {http://arxiv.org/abs/1806.04558},
  archivePrefix = {arXiv},
  eprint    = {1806.04558},
  timestamp = {Mon, 13 Aug 2018 16:48:47 +0200},
  biburl    = {https://dblp.org/rec/bib/journals/corr/abs-1806-04558},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@misc{WaveRNN,
	Author = {Nal Kalchbrenner and Erich Elsen and Karen Simonyan and Seb Noury and Norman Casagrande and Edward Lockhart and Florian Stimberg and Aaron van den Oord and Sander Dieleman and Koray Kavukcuoglu},
	Title = {Efficient Neural Audio Synthesis},
	Year = {2018},
	Eprint = {arXiv:1802.08435},
}
@misc{FastWaveNet,
	Author = {Tom Le Paine and Pooya Khorrami and Shiyu Chang and Yang Zhang and Prajit Ramachandran and Mark A. Hasegawa-Johnson and Thomas S. Huang},
	Title = {Fast Wavenet Generation Algorithm},
	Year = {2016},
	Eprint = {arXiv:1611.09482},
}
@article{ParallelWaveNet,
	author    = {A{\"{a}}ron van den Oord and
	Yazhe Li and
	Igor Babuschkin and
	Karen Simonyan and
	Oriol Vinyals and
	Koray Kavukcuoglu and
	George van den Driessche and
	Edward Lockhart and
	Luis C. Cobo and
	Florian Stimberg and
	Norman Casagrande and
	Dominik Grewe and
	Seb Noury and
	Sander Dieleman and
	Erich Elsen and
	Nal Kalchbrenner and
	Heiga Zen and
	Alex Graves and
	Helen King and
	Tom Walters and
	Dan Belov and
	Demis Hassabis},
	title     = {Parallel WaveNet: Fast High-Fidelity Speech Synthesis},
	journal   = {CoRR},
	volume    = {abs/1711.10433},
	year      = {2017},
	url       = {http://arxiv.org/abs/1711.10433},
	archivePrefix = {arXiv},
	eprint    = {1711.10433},
	timestamp = {Mon, 13 Aug 2018 16:49:17 +0200},
	biburl    = {https://dblp.org/rec/bib/journals/corr/abs-1711-10433},
	bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{TE2E,
	author    = {Georg Heigold and
	Ignacio Moreno and
	Samy Bengio and
	Noam Shazeer},
	title     = {End-to-End Text-Dependent Speaker Verification},
	journal   = {CoRR},
	volume    = {abs/1509.08062},
	year      = {2015},
	url       = {http://arxiv.org/abs/1509.08062},
	archivePrefix = {arXiv},
	eprint    = {1509.08062},
	timestamp = {Mon, 13 Aug 2018 16:49:06 +0200},
	biburl    = {https://dblp.org/rec/bib/journals/corr/HeigoldMBS15},
	bibsource = {dblp computer science bibliography, https://dblp.org}
}
@misc{GE2E,
	Author = {Li Wan and Quan Wang and Alan Papir and Ignacio Lopez Moreno},
	Title = {Generalized End-to-End Loss for Speaker Verification},
	Year = {2017},
	Eprint = {arXiv:1710.10467},
}
@inproceedings{ConcatenativeGoogle,
  title={Recent Advances in Google Real-Time HMM-Driven Unit Selection Synthesizer.},
  author={Gonzalvo, Xavi and Tazari, Siamak and Chan, Chun-an and Becker, Markus and Gutkin, Alexander and Silen, Hanna},
  booktitle={Interspeech},
  pages={2238--2242},
  year={2016}
}
@article{LSTM-RNN,
  author    = {Heiga Zen and
               Yannis Agiomyrgiannakis and
               Niels Egberts and
               Fergus Henderson and
               Przemyslaw Szczepaniak},
  title     = {Fast, Compact, and High Quality {LSTM-RNN} Based Statistical Parametric
               Speech Synthesizers for Mobile Devices},
  journal   = {CoRR},
  volume    = {abs/1606.06061},
  year      = {2016},
  url       = {http://arxiv.org/abs/1606.06061},
  archivePrefix = {arXiv},
  eprint    = {1606.06061},
  timestamp = {Mon, 13 Aug 2018 16:47:55 +0200},
  biburl    = {https://dblp.org/rec/bib/journals/corr/ZenAEHS16},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{Attention,
  author    = {Dzmitry Bahdanau and
               Kyunghyun Cho and
               Yoshua Bengio},
  title     = {Neural Machine Translation by Jointly Learning to Align and Translate},
  journal   = {CoRR},
  volume    = {abs/1409.0473},
  year      = {2014},
  url       = {http://arxiv.org/abs/1409.0473},
  archivePrefix = {arXiv},
  eprint    = {1409.0473},
  timestamp = {Mon, 13 Aug 2018 16:46:05 +0200},
  biburl    = {https://dblp.org/rec/bib/journals/corr/BahdanauCB14},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{Tacotron1,
  author    = {Yuxuan Wang and
               R. J. Skerry{-}Ryan and
               Daisy Stanton and
               Yonghui Wu and
               Ron J. Weiss and
               Navdeep Jaitly and
               Zongheng Yang and
               Ying Xiao and
               Zhifeng Chen and
               Samy Bengio and
               Quoc V. Le and
               Yannis Agiomyrgiannakis and
               Rob Clark and
               Rif A. Saurous},
  title     = {Tacotron: {A} Fully End-to-End Text-To-Speech Synthesis Model},
  journal   = {CoRR},
  volume    = {abs/1703.10135},
  year      = {2017},
  url       = {http://arxiv.org/abs/1703.10135},
  archivePrefix = {arXiv},
  eprint    = {1703.10135},
  timestamp = {Mon, 13 Aug 2018 16:46:33 +0200},
  biburl    = {https://dblp.org/rec/bib/journals/corr/WangSSWWJYXCBLA17},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{Tacotron2,
  author    = {Jonathan Shen and
               Ruoming Pang and
               Ron J. Weiss and
               Mike Schuster and
               Navdeep Jaitly and
               Zongheng Yang and
               Zhifeng Chen and
               Yu Zhang and
               Yuxuan Wang and
               R. J. Skerry{-}Ryan and
               Rif A. Saurous and
               Yannis Agiomyrgiannakis and
               Yonghui Wu},
  title     = {Natural {TTS} Synthesis by Conditioning WaveNet on Mel Spectrogram
               Predictions},
  journal   = {CoRR},
  volume    = {abs/1712.05884},
  year      = {2017},
  url       = {http://arxiv.org/abs/1712.05884},
  archivePrefix = {arXiv},
  eprint    = {1712.05884},
  timestamp = {Mon, 13 Aug 2018 16:48:59 +0200},
  biburl    = {https://dblp.org/rec/bib/journals/corr/abs-1712-05884},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DeepVoice1,
  author    = {Sercan {\"{O}}mer Arik and
               Mike Chrzanowski and
               Adam Coates and
               Greg Diamos and
               Andrew Gibiansky and
               Yongguo Kang and
               Xian Li and
               John Miller and
               Jonathan Raiman and
               Shubho Sengupta and
               Mohammad Shoeybi},
  title     = {Deep Voice: Real-time Neural Text-to-Speech},
  journal   = {CoRR},
  volume    = {abs/1702.07825},
  year      = {2017},
  url       = {http://arxiv.org/abs/1702.07825},
  archivePrefix = {arXiv},
  eprint    = {1702.07825},
  timestamp = {Mon, 13 Aug 2018 16:49:17 +0200},
  biburl    = {https://dblp.org/rec/bib/journals/corr/ArikCCDGKLMRSS17},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{BDLSTMTTS,
	author = {Fan, Y and Qian, Yuang and Xie, Feng-Long and Soong, Frank},
	year = {2014},
	month = {01},
	pages = {1964-1968},
	title = {TTS synthesis with bidirectional LSTM based Recurrent Neural Networks},
	booktitle = {Proceedings of the Annual Conference of the International Speech Communication Association, INTERSPEECH}
}
@INPROCEEDINGS{Tokuda-2000, 
	author={K. Tokuda and T. Yoshimura and T. Masuko and T. Kobayashi and T. Kitamura}, 
	booktitle={2000 IEEE International Conference on Acoustics, Speech, and Signal Processing. Proceedings (Cat. No.00CH37100)}, 
	title={Speech parameter generation algorithms for HMM-based speech synthesis}, 
	year={2000}, 
	volume={3}, 
	number={}, 
	pages={1315-1318 vol.3}, 
	keywords={speech synthesis;hidden Markov models;maximum likelihood estimation;speech parameter generation algorithms;HMM-based speech synthesis;speech parameter sequence;observation vector;spectral parameter vector;dynamic feature vector;state sequence;forward-backward algorithm;formant structure;multi-mixture HMM;Speech synthesis;Hidden Markov models;Databases;Context modeling;Computer science;Character generation;Runtime;Interpolation;Cepstral analysis}, 
	doi={10.1109/ICASSP.2000.861820}, 
	ISSN={1520-6149}, 
	month={June},
}
@INPROCEEDINGS{Tokuda-1995, 
	author={K. Tokuda and T. Kobayashi and S. Imai}, 
	booktitle={1995 International Conference on Acoustics, Speech, and Signal Processing}, 
	title={Speech parameter generation from HMM using dynamic features}, 
	year={1995}, 
	volume={1}, 
	number={}, 
	pages={660-663 vol.1}, 
	keywords={speech synthesis;hidden Markov models;parameter estimation;cepstral analysis;speech recognition;dynamic features;speech parameter generation;HMM;speech recognition;speech synthesis by rule;optimum state sequence;linear equation;fast algorithm;RLS algorithm;adaptive filtering;Hidden Markov models;Speech synthesis;Speech recognition;Resonance light scattering;Filtering algorithms;Cepstral analysis;Viterbi algorithm;Laboratories;Equations;Speech enhancement}, 
	doi={10.1109/ICASSP.1995.479684}, 
	ISSN={1520-6149}, 
	month={May},
}
@article{Tokuda-2013,
	doi	= {10.1109/JPROC.2013.2251852},
	title	= {Speech Synthesis Based on Hidden Markov Models},
	author	= {Tokuda, Keiichi and Nankaku, Yoshihiko and Toda, Tomoki and Zen, Heiga and Yamagishi, Junichi and Oura, Keiichiro},
	publisher	= {IEEE},
	journal	= {Proceedings of the IEEE},
	issn	= {0018-9219},
	year	= {2013},
	month	= {05},
	volume	= {101},
	number	= {5},
	pages	= {1234--1252},
}
@MISC{TOBI,
	author = {Mary E. Beckman and Gayle Ayers Elam},
	title = {Guidelines for ToBI Labelling},
	year = {1997},
	month = {03},
}
@MISC{Lu_combininga,
	author = {Heng Lu and Simon King and Oliver Watts},
	title = {Combining a Vector Space Representation of Linguistic Context with a Deep Neural Network for Text-To-Speech Synthesis},
	year = {2013},
	month = {September}
}
@INPROCEEDINGS{Hashimoto-2015, 
	author={K. Hashimoto and K. Oura and Y. Nankaku and K. Tokuda}, 
	booktitle={2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, 
	title={The effect of neural networks in statistical parametric speech synthesis}, 
	year={2015}, 
	volume={}, 
	number={}, 
	pages={4455-4459}, 
	keywords={neural nets;speech synthesis;statistical analysis;statistical parametric speech synthesis;deep neural networks;generative models;acoustic models;parameter generation;Hidden Markov models;Artificial neural networks;Speech;Statistical parametric speech synthesis;deep neural network;hidden Markov model}, 
	doi={10.1109/ICASSP.2015.7178813}, 
	ISSN={1520-6149}, 
	month={April}
}
@inproceedings{Yin2014ModelingDP,
  title={Modeling DCT parameterized F0 trajectory at intonation phrase level with DNN or decision tree},
  author={Xiang Yin and Ming Lei and Zhiliang Hong and Frank K. Soong and Lei He and Zhen-Hua Ling and Li-Rong Dai},
  booktitle={INTERSPEECH},
  year={2014}
}
@INPROCEEDINGS{OnTheTrainingAspects, 
	author={Y. Qian and Y. Fan and W. Hu and F. K. Soong}, 
	booktitle={2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, 
	title={On the training aspects of Deep Neural Network (DNN) for parametric TTS synthesis}, 
	year={2014}, 
	volume={}, 
	number={}, 
	pages={3829-3833}, 
	keywords={backpropagation;feature extraction;neural nets;speech synthesis;DNN training;deep neural network;parametric TTS synthesis;text-to-speech synthesis;text features;acoustic features;objective measure;subjective measure;HMM;hidden Markov model;diagonal Gaussian probability family;layer-wise BP pretraining;backpropagation;hyperbolic tangent activation function;sigmoidal function;Decision support systems;Conferences;Acoustics;Speech;Speech processing;Speech Synthesis;HMM;DNN;TTS}, 
	doi={10.1109/ICASSP.2014.6854318}, 
	ISSN={1520-6149}, 
	month={May},
}
@inproceedings{HMMSpeakerInterpolation,
  title={Speaker interpolation in HMM-based speech synthesis system},
  author={Takayoshi Yoshimura and Takashi Masuko and Keiichi Tokuda and Takao Kobayashi and Tadashi Kitamura},
  booktitle={EUROSPEECH},
  year={1997}
}
@article{STRAIGHT,
	title = "Restructuring speech representations using a pitch-adaptive time–frequency smoothing and an instantaneous-frequency-based F0 extraction: Possible role of a repetitive structure in sounds1Speech files available. See http://www.elsevier.nl/locate/specom1",
	journal = "Speech Communication",
	volume = "27",
	number = "3",
	pages = "187 - 207",
	year = "1999",
	issn = "0167-6393",
	doi = "https://doi.org/10.1016/S0167-6393(98)00085-5",
	url = "http://www.sciencedirect.com/science/article/pii/S0167639398000855",
	author = "Hideki Kawahara and Ikuyo Masuda-Katsuse and Alain de Cheveigné",
	keywords = "Speech analysis, Pitch-synchronous, Spline smoothing, Instantaneous frequency, F0 extraction, Speech synthesis, Speech modification"
}
@inbook{TTSSOTA,
	place={Cambridge},
	title={Speech Synthesis: State of the Art and Challenges for the Future},
	DOI={10.1017/9781316676202.019},
	booktitle={Social Signal Processing},
	publisher={Cambridge University Press},
	author={Georgila, Kallirroi},
	editor={Burgoon, Judee K. and Magnenat-Thalmann, Nadia and Pantic, Maja and Vinciarelli, Alessandro},
	year={2017},
	pages={257--272}
}
@INPROCEEDINGS{SPSSDNN, 
	author={H. Zen and A. Senior and M. Schuster}, 
	booktitle={2013 IEEE International Conference on Acoustics, Speech and Signal Processing}, 
	title={Statistical parametric speech synthesis using deep neural networks}, 
	year={2013}, 
	volume={}, 
	number={}, 
	pages={7962-7966}, 
	keywords={hidden Markov models;neural nets;speech synthesis;statistical parametric speech synthesis;deep neural networks;decision tree clustered context dependent hidden Markov models;HMM;probability densities;speech parameters;speech waveform;decision trees;acoustic realizations;Hidden Markov models;Speech;Speech synthesis;Decision trees;Context;Training data;Neural networks;Statistical parametric speech synthesis;Hidden Markov model;Deep neural network}, 
	doi={10.1109/ICASSP.2013.6639215}, 
	ISSN={1520-6149}, 
	month={May},
}
@INPROCEEDINGS{MLSA, 
	author={S. Imai}, 
	booktitle={ICASSP '83. IEEE International Conference on Acoustics, Speech, and Signal Processing}, 
	title={Cepstral analysis synthesis on the mel frequency scale}, 
	year={1983}, 
	volume={8}, 
	number={}, 
	pages={93-96}, 
	keywords={Cepstral analysis;Frequency synthesizers;Speech synthesis;Mel frequency cepstral coefficient;Vocoders;Speech analysis;Nonlinear filters;Fourier transforms;Cepstrum;Quantization}, 
	doi={10.1109/ICASSP.1983.1172250}, 
	ISSN={}, 
	month={April},
}
@inproceedings{HMMTTS,
  title={Simultaneous modeling of spectrum, pitch and duration in HMM-based speech synthesis},
  author={Takayoshi Yoshimura and Keiichi Tokuda and Takashi Masuko and Takao Kobayashi and Tadashi Kitamura},
  booktitle={EUROSPEECH},
  year={1999}
}

@article{GRU,
  author    = {KyungHyun Cho and
               Bart van Merrienboer and
               Dzmitry Bahdanau and
               Yoshua Bengio},
  title     = {On the Properties of Neural Machine Translation: Encoder-Decoder Approaches},
  journal   = {CoRR},
  volume    = {abs/1409.1259},
  year      = {2014},
  url       = {http://arxiv.org/abs/1409.1259},
  archivePrefix = {arXiv},
  eprint    = {1409.1259},
  timestamp = {Wed, 07 Jun 2017 14:42:33 +0200},
  biburl    = {http://dblp.org/rec/bib/journals/corr/ChoMBB14},
  bibsource = {dblp computer science bibliography, http://dblp.org}
}

@article{LSTM,
 author = {Hochreiter, Sepp and Schmidhuber, J\"{u}rgen},
 title = {Long Short-Term Memory},
 journal = {Neural Comput.},
 issue_date = {November 15, 1997},
 volume = {9},
 number = {8},
 month = nov,
 year = {1997},
 issn = {0899-7667},
 pages = {1735--1780},
 numpages = {46},
 url = {http://dx.doi.org/10.1162/neco.1997.9.8.1735},
 doi = {10.1162/neco.1997.9.8.1735},
 acmid = {1246450},
 publisher = {MIT Press},
 address = {Cambridge, MA, USA},
} 

@article{WaveNet,
  author    = {A{\"{a}}ron van den Oord and
               Sander Dieleman and
               Heiga Zen and
               Karen Simonyan and
               Oriol Vinyals and
               Alex Graves and
               Nal Kalchbrenner and
               Andrew W. Senior and
               Koray Kavukcuoglu},
  title     = {WaveNet: {A} Generative Model for Raw Audio},
  journal   = {CoRR},
  volume    = {abs/1609.03499},
  year      = {2016},
  url       = {http://arxiv.org/abs/1609.03499},
  archivePrefix = {arXiv},
  eprint    = {1609.03499},
  timestamp = {Wed, 07 Jun 2017 14:42:54 +0200},
  biburl    = {http://dblp.org/rec/bib/journals/corr/OordDZSVGKSK16},
  bibsource = {dblp computer science bibliography, http://dblp.org}
}

@inproceedings{CLDNNs,
  title={Learning the speech front-end with raw waveform CLDNNs},
  author={Tara N. Sainath and Ron J. Weiss and Andrew W. Senior and Kevin W. Wilson and Oriol Vinyals},
  booktitle={INTERSPEECH},
  year={2015}
}

@article{lenet,
    abstract = {{Multilayer neural networks trained with the back-propagation algorithm constitute the best example of a successful gradient based learning technique. Given an appropriate network architecture, gradient-based learning algorithms can be used to synthesize a complex decision surface that can classify high-dimensional patterns, such as handwritten characters, with minimal preprocessing. This paper reviews various methods applied to handwritten character recognition and compares them on a standard handwritten digit recognition task. Convolutional neural networks, which are specifically designed to deal with the variability of 2D shapes, are shown to outperform all other techniques. Real-life document recognition systems are composed of multiple modules including field extraction, segmentation recognition, and language modeling. A new learning paradigm, called graph transformer networks (GTN), allows such multimodule systems to be trained globally using gradient-based methods so as to minimize an overall performance measure. Two systems for online handwriting recognition are described. Experiments demonstrate the advantage of global training, and the flexibility of graph transformer networks. A graph transformer network for reading a bank cheque is also described. It uses convolutional neural network character recognizers combined with global training techniques to provide record accuracy on business and personal cheques. It is deployed commercially and reads several million cheques per day}},
    author = {Lecun, Y. and Bottou, L. and Bengio, Y. and Haffner, P.},
    booktitle = {Proceedings of the IEEE},
    citeulike-article-id = {4196818},
    citeulike-linkout-0 = {http://dx.doi.org/10.1109/5.726791},
    citeulike-linkout-1 = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=726791},
    day = {06},
    doi = {10.1109/5.726791},
    institution = {Speech \& Image Process. Services Lab., AT\&T Bell Labs., Red Bank, NJ, USA},
    issn = {00189219},
    journal = {Proceedings of the IEEE},
    keywords = {cnn, lenet-5},
    month = nov,
    number = {11},
    pages = {2278--2324},
    posted-at = {2016-06-08 06:38:36},
    priority = {0},
    publisher = {IEEE},
    title = {{Gradient-based learning applied to document recognition}},
    url = {http://dx.doi.org/10.1109/5.726791},
    volume = {86},
    year = {1998}
}

@incollection{alexnet,
title = {ImageNet Classification with Deep Convolutional Neural Networks},
author = {Alex Krizhevsky and Sutskever, Ilya and Hinton, Geoffrey E},
booktitle = {Advances in Neural Information Processing Systems 25},
editor = {F. Pereira and C. J. C. Burges and L. Bottou and K. Q. Weinberger},
pages = {1097--1105},
year = {2012},
publisher = {Curran Associates, Inc.},
url = {http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf}
}

@article{dropout,
 author = {Srivastava, Nitish and Hinton, Geoffrey and Krizhevsky, Alex and Sutskever, Ilya and Salakhutdinov, Ruslan},
 title = {Dropout: A Simple Way to Prevent Neural Networks from Overfitting},
 journal = {J. Mach. Learn. Res.},
 issue_date = {January 2014},
 volume = {15},
 number = {1},
 month = jan,
 year = {2014},
 issn = {1532-4435},
 pages = {1929--1958},
 numpages = {30},
 url = {http://dl.acm.org/citation.cfm?id=2627435.2670313},
 acmid = {2670313},
 publisher = {JMLR.org},
 keywords = {deep learning, model combination, neural networks, regularization},
}