@misc{NeuralDiscreteRepresentation,
	Author = {Aaron van den Oord and Oriol Vinyals and Koray Kavukcuoglu},
	Title = {Neural Discrete Representation Learning},
	Year = {2017},
	Eprint = {arXiv:1711.00937},
}
@article{UMAP,
	author = {McInnes, Leland and Healy, John},
	year = {2018},
	month = {02},
	pages = {},
	title = {UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction}
}
@article{LibriTTS,
	author    = {Heiga Zen and
	Viet Dang and
	Rob Clark and
	Yu Zhang and
	Ron J. Weiss and
	Ye Jia and
	Zhifeng Chen and
	Yonghui Wu},
	title     = {LibriTTS: {A} Corpus Derived from LibriSpeech for Text-to-Speech},
	journal   = {CoRR},
	volume    = {abs/1904.02882},
	year      = {2019},
	url       = {http://arxiv.org/abs/1904.02882},
	archivePrefix = {arXiv},
	eprint    = {1904.02882},
	timestamp = {Wed, 24 Apr 2019 12:21:25 +0200},
	biburl    = {https://dblp.org/rec/bib/journals/corr/abs-1904-02882},
	bibsource = {dblp computer science bibliography, https://dblp.org}
}
@InProceedings{VoxCeleb2,
	author       = "Chung, J.~S. and Nagrani, A. and Zisserman, A.",
	title        = "VoxCeleb2: Deep Speaker Recognition",
	booktitle    = "INTERSPEECH",
	year         = "2018",
}
@InProceedings{VoxCeleb1,
	author       = "Nagrani, A. and Chung, J.~S. and Zisserman, A.",
	title        = "VoxCeleb: a large-scale speaker identification dataset",
	booktitle    = "INTERSPEECH",
	year         = "2017",
}
@INPROCEEDINGS{LibriSpeech, 
	author={V. {Panayotov} and G. {Chen} and D. {Povey} and S. {Khudanpur}}, 
	booktitle={2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, 
	title={Librispeech: An ASR corpus based on public domain audio books}, 
	year={2015}, 
	volume={}, 
	number={}, 
	pages={5206-5210}, 
	keywords={natural language processing;speech recognition;LibriSpeech corpus;ASR corpus;public domain audio books;read english speech;training speech recognition systems;evaluating speech recognition systems;LibriVox project;language-model training data;pre-built language models;acoustic models;Wall Street Journal;WSJ;Kaldi scripts;frequency 16 kHz;Resource description framework;Genomics;Bioinformatics;Blogs;Information services;Electronic publishing;Speech Recognition;Corpus;LibriVox}, 
	doi={10.1109/ICASSP.2015.7178964}, 
	ISSN={1520-6149}, 
	month={April},}
@misc{DeepVoice2,
	Author = {Sercan Arik and Gregory Diamos and Andrew Gibiansky and John Miller and Kainan Peng and Wei Ping and Jonathan Raiman and Yanqi Zhou},
	Title = {Deep Voice 2: Multi-Speaker Neural Text-to-Speech},
	Year = {2017},
	Eprint = {arXiv:1705.08947},
}
@misc{CloningFewSamples,
	Author = {Sercan O. Arik and Jitong Chen and Kainan Peng and Wei Ping and Yanqi Zhou},
	Title = {Neural Voice Cloning with a Few Samples},
	Year = {2018},
	Eprint = {arXiv:1802.06006},
}
@INPROCEEDINGS{MOSNaturalness, 
	author={S. {Shirali-Shahreza} and G. {Penn}}, 
	booktitle={2018 IEEE Spoken Language Technology Workshop (SLT)}, 
	title={MOS Naturalness and the Quest for Human-Like Speech}, 
	year={2018}, 
	volume={}, 
	number={}, 
	pages={346-352}, 
	keywords={learning (artificial intelligence);natural language processing;speech intelligibility;speech synthesis;MOS naturalness;TTS quality;speech synthesis;speech intelligibility;TTS systems;native North-American speech;Indian speech;deep learning;Testing;Speech coding;Synthesizers;ITU;Protocols;Data models;text-to-speech synthesis;evaluation;naturalness;paired comparison tests}, 
	doi={10.1109/SLT.2018.8639599}, 
	ISSN={}, 
	month={Dec},
}
@article{SV2TTS,
  author    = {Ye Jia and
               Yu Zhang and
               Ron J. Weiss and
               Quan Wang and
               Jonathan Shen and
               Fei Ren and
               Zhifeng Chen and
               Patrick Nguyen and
               Ruoming Pang and
               Ignacio Lopez{-}Moreno and
               Yonghui Wu},
  title     = {Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech
               Synthesis},
  journal   = {CoRR},
  volume    = {abs/1806.04558},
  year      = {2018},
  url       = {http://arxiv.org/abs/1806.04558},
  archivePrefix = {arXiv},
  eprint    = {1806.04558},
  timestamp = {Mon, 13 Aug 2018 16:48:47 +0200},
  biburl    = {https://dblp.org/rec/bib/journals/corr/abs-1806-04558},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@misc{WaveRNN,
	Author = {Nal Kalchbrenner and Erich Elsen and Karen Simonyan and Seb Noury and Norman Casagrande and Edward Lockhart and Florian Stimberg and Aaron van den Oord and Sander Dieleman and Koray Kavukcuoglu},
	Title = {Efficient Neural Audio Synthesis},
	Year = {2018},
	Eprint = {arXiv:1802.08435},
}
@misc{FastWaveNet,
	Author = {Tom Le Paine and Pooya Khorrami and Shiyu Chang and Yang Zhang and Prajit Ramachandran and Mark A. Hasegawa-Johnson and Thomas S. Huang},
	Title = {Fast Wavenet Generation Algorithm},
	Year = {2016},
	Eprint = {arXiv:1611.09482},
}
@article{ParallelWaveNet,
	author    = {A{\"{a}}ron van den Oord and
	Yazhe Li and
	Igor Babuschkin and
	Karen Simonyan and
	Oriol Vinyals and
	Koray Kavukcuoglu and
	George van den Driessche and
	Edward Lockhart and
	Luis C. Cobo and
	Florian Stimberg and
	Norman Casagrande and
	Dominik Grewe and
	Seb Noury and
	Sander Dieleman and
	Erich Elsen and
	Nal Kalchbrenner and
	Heiga Zen and
	Alex Graves and
	Helen King and
	Tom Walters and
	Dan Belov and
	Demis Hassabis},
	title     = {Parallel WaveNet: Fast High-Fidelity Speech Synthesis},
	journal   = {CoRR},
	volume    = {abs/1711.10433},
	year      = {2017},
	url       = {http://arxiv.org/abs/1711.10433},
	archivePrefix = {arXiv},
	eprint    = {1711.10433},
	timestamp = {Mon, 13 Aug 2018 16:49:17 +0200},
	biburl    = {https://dblp.org/rec/bib/journals/corr/abs-1711-10433},
	bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{TE2E,
	author    = {Georg Heigold and
	Ignacio Moreno and
	Samy Bengio and
	Noam Shazeer},
	title     = {End-to-End Text-Dependent Speaker Verification},
	journal   = {CoRR},
	volume    = {abs/1509.08062},
	year      = {2015},
	url       = {http://arxiv.org/abs/1509.08062},
	archivePrefix = {arXiv},
	eprint    = {1509.08062},
	timestamp = {Mon, 13 Aug 2018 16:49:06 +0200},
	biburl    = {https://dblp.org/rec/bib/journals/corr/HeigoldMBS15},
	bibsource = {dblp computer science bibliography, https://dblp.org}
}
@misc{GE2E,
	Author = {Li Wan and Quan Wang and Alan Papir and Ignacio Lopez Moreno},
	Title = {Generalized End-to-End Loss for Speaker Verification},
	Year = {2017},
	Eprint = {arXiv:1710.10467},
}
@inproceedings{ConcatenativeGoogle,
  title={Recent Advances in Google Real-Time HMM-Driven Unit Selection Synthesizer.},
  author={Gonzalvo, Xavi and Tazari, Siamak and Chan, Chun-an and Becker, Markus and Gutkin, Alexander and Silen, Hanna},
  booktitle={Interspeech},
  pages={2238--2242},
  year={2016}
}
@article{LSTM-RNN,
  author    = {Heiga Zen and
               Yannis Agiomyrgiannakis and
               Niels Egberts and
               Fergus Henderson and
               Przemyslaw Szczepaniak},
  title     = {Fast, Compact, and High Quality {LSTM-RNN} Based Statistical Parametric
               Speech Synthesizers for Mobile Devices},
  journal   = {CoRR},
  volume    = {abs/1606.06061},
  year      = {2016},
  url       = {http://arxiv.org/abs/1606.06061},
  archivePrefix = {arXiv},
  eprint    = {1606.06061},
  timestamp = {Mon, 13 Aug 2018 16:47:55 +0200},
  biburl    = {https://dblp.org/rec/bib/journals/corr/ZenAEHS16},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{Attention,
  author    = {Dzmitry Bahdanau and
               Kyunghyun Cho and
               Yoshua Bengio},
  title     = {Neural Machine Translation by Jointly Learning to Align and Translate},
  journal   = {CoRR},
  volume    = {abs/1409.0473},
  year      = {2014},
  url       = {http://arxiv.org/abs/1409.0473},
  archivePrefix = {arXiv},
  eprint    = {1409.0473},
  timestamp = {Mon, 13 Aug 2018 16:46:05 +0200},
  biburl    = {https://dblp.org/rec/bib/journals/corr/BahdanauCB14},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{Tacotron1,
  author    = {Yuxuan Wang and
               R. J. Skerry{-}Ryan and
               Daisy Stanton and
               Yonghui Wu and
               Ron J. Weiss and
               Navdeep Jaitly and
               Zongheng Yang and
               Ying Xiao and
               Zhifeng Chen and
               Samy Bengio and
               Quoc V. Le and
               Yannis Agiomyrgiannakis and
               Rob Clark and
               Rif A. Saurous},
  title     = {Tacotron: {A} Fully End-to-End Text-To-Speech Synthesis Model},
  journal   = {CoRR},
  volume    = {abs/1703.10135},
  year      = {2017},
  url       = {http://arxiv.org/abs/1703.10135},
  archivePrefix = {arXiv},
  eprint    = {1703.10135},
  timestamp = {Mon, 13 Aug 2018 16:46:33 +0200},
  biburl    = {https://dblp.org/rec/bib/journals/corr/WangSSWWJYXCBLA17},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{Tacotron2,
  author    = {Jonathan Shen and
               Ruoming Pang and
               Ron J. Weiss and
               Mike Schuster and
               Navdeep Jaitly and
               Zongheng Yang and
               Zhifeng Chen and
               Yu Zhang and
               Yuxuan Wang and
               R. J. Skerry{-}Ryan and
               Rif A. Saurous and
               Yannis Agiomyrgiannakis and
               Yonghui Wu},
  title     = {Natural {TTS} Synthesis by Conditioning WaveNet on Mel Spectrogram
               Predictions},
  journal   = {CoRR},
  volume    = {abs/1712.05884},
  year      = {2017},
  url       = {http://arxiv.org/abs/1712.05884},
  archivePrefix = {arXiv},
  eprint    = {1712.05884},
  timestamp = {Mon, 13 Aug 2018 16:48:59 +0200},
  biburl    = {https://dblp.org/rec/bib/journals/corr/abs-1712-05884},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DeepVoice1,
  author    = {Sercan {\"{O}}mer Arik and
               Mike Chrzanowski and
               Adam Coates and
               Greg Diamos and
               Andrew Gibiansky and
               Yongguo Kang and
               Xian Li and
               John Miller and
               Jonathan Raiman and
               Shubho Sengupta and
               Mohammad Shoeybi},
  title     = {Deep Voice: Real-time Neural Text-to-Speech},
  journal   = {CoRR},
  volume    = {abs/1702.07825},
  year      = {2017},
  url       = {http://arxiv.org/abs/1702.07825},
  archivePrefix = {arXiv},
  eprint    = {1702.07825},
  timestamp = {Mon, 13 Aug 2018 16:49:17 +0200},
  biburl    = {https://dblp.org/rec/bib/journals/corr/ArikCCDGKLMRSS17},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{BDLSTMTTS,
	author = {Fan, Y and Qian, Yuang and Xie, Feng-Long and Soong, Frank},
	year = {2014},
	month = {01},
	pages = {1964-1968},
	title = {TTS synthesis with bidirectional LSTM based Recurrent Neural Networks},
	booktitle = {Proceedings of the Annual Conference of the International Speech Communication Association, INTERSPEECH}
}
@INPROCEEDINGS{Tokuda-2000, 
	author={K. Tokuda and T. Yoshimura and T. Masuko and T. Kobayashi and T. Kitamura}, 
	booktitle={2000 IEEE International Conference on Acoustics, Speech, and Signal Processing. Proceedings (Cat. No.00CH37100)}, 
	title={Speech parameter generation algorithms for HMM-based speech synthesis}, 
	year={2000}, 
	volume={3}, 
	number={}, 
	pages={1315-1318 vol.3}, 
	keywords={speech synthesis;hidden Markov models;maximum likelihood estimation;speech parameter generation algorithms;HMM-based speech synthesis;speech parameter sequence;observation vector;spectral parameter vector;dynamic feature vector;state sequence;forward-backward algorithm;formant structure;multi-mixture HMM;Speech synthesis;Hidden Markov models;Databases;Context modeling;Computer science;Character generation;Runtime;Interpolation;Cepstral analysis}, 
	doi={10.1109/ICASSP.2000.861820}, 
	ISSN={1520-6149}, 
	month={June},
}
@INPROCEEDINGS{Tokuda-1995, 
	author={K. Tokuda and T. Kobayashi and S. Imai}, 
	booktitle={1995 International Conference on Acoustics, Speech, and Signal Processing}, 
	title={Speech parameter generation from HMM using dynamic features}, 
	year={1995}, 
	volume={1}, 
	number={}, 
	pages={660-663 vol.1}, 
	keywords={speech synthesis;hidden Markov models;parameter estimation;cepstral analysis;speech recognition;dynamic features;speech parameter generation;HMM;speech recognition;speech synthesis by rule;optimum state sequence;linear equation;fast algorithm;RLS algorithm;adaptive filtering;Hidden Markov models;Speech synthesis;Speech recognition;Resonance light scattering;Filtering algorithms;Cepstral analysis;Viterbi algorithm;Laboratories;Equations;Speech enhancement}, 
	doi={10.1109/ICASSP.1995.479684}, 
	ISSN={1520-6149}, 
	month={May},
}
@article{Tokuda-2013,
	doi	= {10.1109/JPROC.2013.2251852},
	title	= {Speech Synthesis Based on Hidden Markov Models},
	author	= {Tokuda, Keiichi and Nankaku, Yoshihiko and Toda, Tomoki and Zen, Heiga and Yamagishi, Junichi and Oura, Keiichiro},
	publisher	= {IEEE},
	journal	= {Proceedings of the IEEE},
	issn	= {0018-9219},
	year	= {2013},
	month	= {05},
	volume	= {101},
	number	= {5},
	pages	= {1234--1252},
}
@MISC{TOBI,
	author = {Mary E. Beckman and Gayle Ayers Elam},
	title = {Guidelines for ToBI Labelling},
	year = {1997},
	month = {03},
}
@MISC{Lu_combininga,
	author = {Heng Lu and Simon King and Oliver Watts},
	title = {Combining a Vector Space Representation of Linguistic Context with a Deep Neural Network for Text-To-Speech Synthesis},
	year = {2013},
	month = {September}
}
@INPROCEEDINGS{Hashimoto-2015, 
	author={K. Hashimoto and K. Oura and Y. Nankaku and K. Tokuda}, 
	booktitle={2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, 
	title={The effect of neural networks in statistical parametric speech synthesis}, 
	year={2015}, 
	volume={}, 
	number={}, 
	pages={4455-4459}, 
	keywords={neural nets;speech synthesis;statistical analysis;statistical parametric speech synthesis;deep neural networks;generative models;acoustic models;parameter generation;Hidden Markov models;Artificial neural networks;Speech;Statistical parametric speech synthesis;deep neural network;hidden Markov model}, 
	doi={10.1109/ICASSP.2015.7178813}, 
	ISSN={1520-6149}, 
	month={April}
}
@inproceedings{Yin2014ModelingDP,
  title={Modeling DCT parameterized F0 trajectory at intonation phrase level with DNN or decision tree},
  author={Xiang Yin and Ming Lei and Zhiliang Hong and Frank K. Soong and Lei He and Zhen-Hua Ling and Li-Rong Dai},
  booktitle={INTERSPEECH},
  year={2014}
}
@INPROCEEDINGS{OnTheTrainingAspects, 
	author={Y. Qian and Y. Fan and W. Hu and F. K. Soong}, 
	booktitle={2014 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, 
	title={On the training aspects of Deep Neural Network (DNN) for parametric TTS synthesis}, 
	year={2014}, 
	volume={}, 
	number={}, 
	pages={3829-3833}, 
	keywords={backpropagation;feature extraction;neural nets;speech synthesis;DNN training;deep neural network;parametric TTS synthesis;text-to-speech synthesis;text features;acoustic features;objective measure;subjective measure;HMM;hidden Markov model;diagonal Gaussian probability family;layer-wise BP pretraining;backpropagation;hyperbolic tangent activation function;sigmoidal function;Decision support systems;Conferences;Acoustics;Speech;Speech processing;Speech Synthesis;HMM;DNN;TTS}, 
	doi={10.1109/ICASSP.2014.6854318}, 
	ISSN={1520-6149}, 
	month={May},
}
@inproceedings{HMMSpeakerInterpolation,
  title={Speaker interpolation in HMM-based speech synthesis system},
  author={Takayoshi Yoshimura and Takashi Masuko and Keiichi Tokuda and Takao Kobayashi and Tadashi Kitamura},
  booktitle={EUROSPEECH},
  year={1997}
}
@article{STRAIGHT,
	title = "Restructuring speech representations using a pitch-adaptive time–frequency smoothing and an instantaneous-frequency-based F0 extraction: Possible role of a repetitive structure in sounds1Speech files available. See http://www.elsevier.nl/locate/specom1",
	journal = "Speech Communication",
	volume = "27",
	number = "3",
	pages = "187 - 207",
	year = "1999",
	issn = "0167-6393",
	doi = "https://doi.org/10.1016/S0167-6393(98)00085-5",
	url = "http://www.sciencedirect.com/science/article/pii/S0167639398000855",
	author = "Hideki Kawahara and Ikuyo Masuda-Katsuse and Alain de Cheveigné",
	keywords = "Speech analysis, Pitch-synchronous, Spline smoothing, Instantaneous frequency, F0 extraction, Speech synthesis, Speech modification"
}
@inbook{TTSSOTA,
	place={Cambridge},
	title={Speech Synthesis: State of the Art and Challenges for the Future},
	DOI={10.1017/9781316676202.019},
	booktitle={Social Signal Processing},
	publisher={Cambridge University Press},
	author={Georgila, Kallirroi},
	editor={Burgoon, Judee K. and Magnenat-Thalmann, Nadia and Pantic, Maja and Vinciarelli, Alessandro},
	year={2017},
	pages={257--272}
}
@INPROCEEDINGS{SPSSDNN, 
	author={H. Zen and A. Senior and M. Schuster}, 
	booktitle={2013 IEEE International Conference on Acoustics, Speech and Signal Processing}, 
	title={Statistical parametric speech synthesis using deep neural networks}, 
	year={2013}, 
	volume={}, 
	number={}, 
	pages={7962-7966}, 
	keywords={hidden Markov models;neural nets;speech synthesis;statistical parametric speech synthesis;deep neural networks;decision tree clustered context dependent hidden Markov models;HMM;probability densities;speech parameters;speech waveform;decision trees;acoustic realizations;Hidden Markov models;Speech;Speech synthesis;Decision trees;Context;Training data;Neural networks;Statistical parametric speech synthesis;Hidden Markov model;Deep neural network}, 
	doi={10.1109/ICASSP.2013.6639215}, 
	ISSN={1520-6149}, 
	month={May},
}
@INPROCEEDINGS{MLSA, 
	author={S. Imai}, 
	booktitle={ICASSP '83. IEEE International Conference on Acoustics, Speech, and Signal Processing}, 
	title={Cepstral analysis synthesis on the mel frequency scale}, 
	year={1983}, 
	volume={8}, 
	number={}, 
	pages={93-96}, 
	keywords={Cepstral analysis;Frequency synthesizers;Speech synthesis;Mel frequency cepstral coefficient;Vocoders;Speech analysis;Nonlinear filters;Fourier transforms;Cepstrum;Quantization}, 
	doi={10.1109/ICASSP.1983.1172250}, 
	ISSN={}, 
	month={April},
}
@inproceedings{HMMTTS,
  title={Simultaneous modeling of spectrum, pitch and duration in HMM-based speech synthesis},
  author={Takayoshi Yoshimura and Keiichi Tokuda and Takashi Masuko and Takao Kobayashi and Tadashi Kitamura},
  booktitle={EUROSPEECH},
  year={1999}
}

@article{GRU,
  author    = {KyungHyun Cho and
               Bart van Merrienboer and
               Dzmitry Bahdanau and
               Yoshua Bengio},
  title     = {On the Properties of Neural Machine Translation: Encoder-Decoder Approaches},
  journal   = {CoRR},
  volume    = {abs/1409.1259},
  year      = {2014},
  url       = {http://arxiv.org/abs/1409.1259},
  archivePrefix = {arXiv},
  eprint    = {1409.1259},
  timestamp = {Wed, 07 Jun 2017 14:42:33 +0200},
  biburl    = {http://dblp.org/rec/bib/journals/corr/ChoMBB14},
  bibsource = {dblp computer science bibliography, http://dblp.org}
}

@article{LSTM,
 author = {Hochreiter, Sepp and Schmidhuber, J\"{u}rgen},
 title = {Long Short-Term Memory},
 journal = {Neural Comput.},
 issue_date = {November 15, 1997},
 volume = {9},
 number = {8},
 month = nov,
 year = {1997},
 issn = {0899-7667},
 pages = {1735--1780},
 numpages = {46},
 url = {http://dx.doi.org/10.1162/neco.1997.9.8.1735},
 doi = {10.1162/neco.1997.9.8.1735},
 acmid = {1246450},
 publisher = {MIT Press},
 address = {Cambridge, MA, USA},
} 

@article{WaveNet,
  author    = {A{\"{a}}ron van den Oord and
               Sander Dieleman and
               Heiga Zen and
               Karen Simonyan and
               Oriol Vinyals and
               Alex Graves and
               Nal Kalchbrenner and
               Andrew W. Senior and
               Koray Kavukcuoglu},
  title     = {WaveNet: {A} Generative Model for Raw Audio},
  journal   = {CoRR},
  volume    = {abs/1609.03499},
  year      = {2016},
  url       = {http://arxiv.org/abs/1609.03499},
  archivePrefix = {arXiv},
  eprint    = {1609.03499},
  timestamp = {Wed, 07 Jun 2017 14:42:54 +0200},
  biburl    = {http://dblp.org/rec/bib/journals/corr/OordDZSVGKSK16},
  bibsource = {dblp computer science bibliography, http://dblp.org}
}

@inproceedings{CLDNNs,
  title={Learning the speech front-end with raw waveform CLDNNs},
  author={Tara N. Sainath and Ron J. Weiss and Andrew W. Senior and Kevin W. Wilson and Oriol Vinyals},
  booktitle={INTERSPEECH},
  year={2015}
}

@article{lenet,
    abstract = {{Multilayer neural networks trained with the back-propagation algorithm constitute the best example of a successful gradient based learning technique. Given an appropriate network architecture, gradient-based learning algorithms can be used to synthesize a complex decision surface that can classify high-dimensional patterns, such as handwritten characters, with minimal preprocessing. This paper reviews various methods applied to handwritten character recognition and compares them on a standard handwritten digit recognition task. Convolutional neural networks, which are specifically designed to deal with the variability of 2D shapes, are shown to outperform all other techniques. Real-life document recognition systems are composed of multiple modules including field extraction, segmentation recognition, and language modeling. A new learning paradigm, called graph transformer networks (GTN), allows such multimodule systems to be trained globally using gradient-based methods so as to minimize an overall performance measure. Two systems for online handwriting recognition are described. Experiments demonstrate the advantage of global training, and the flexibility of graph transformer networks. A graph transformer network for reading a bank cheque is also described. It uses convolutional neural network character recognizers combined with global training techniques to provide record accuracy on business and personal cheques. It is deployed commercially and reads several million cheques per day}},
    author = {Lecun, Y. and Bottou, L. and Bengio, Y. and Haffner, P.},
    booktitle = {Proceedings of the IEEE},
    citeulike-article-id = {4196818},
    citeulike-linkout-0 = {http://dx.doi.org/10.1109/5.726791},
    citeulike-linkout-1 = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=726791},
    day = {06},
    doi = {10.1109/5.726791},
    institution = {Speech \& Image Process. Services Lab., AT\&T Bell Labs., Red Bank, NJ, USA},
    issn = {00189219},
    journal = {Proceedings of the IEEE},
    keywords = {cnn, lenet-5},
    month = nov,
    number = {11},
    pages = {2278--2324},
    posted-at = {2016-06-08 06:38:36},
    priority = {0},
    publisher = {IEEE},
    title = {{Gradient-based learning applied to document recognition}},
    url = {http://dx.doi.org/10.1109/5.726791},
    volume = {86},
    year = {1998}
}

@incollection{alexnet,
title = {ImageNet Classification with Deep Convolutional Neural Networks},
author = {Alex Krizhevsky and Sutskever, Ilya and Hinton, Geoffrey E},
booktitle = {Advances in Neural Information Processing Systems 25},
editor = {F. Pereira and C. J. C. Burges and L. Bottou and K. Q. Weinberger},
pages = {1097--1105},
year = {2012},
publisher = {Curran Associates, Inc.},
url = {http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf}
}

@article{dropout,
 author = {Srivastava, Nitish and Hinton, Geoffrey and Krizhevsky, Alex and Sutskever, Ilya and Salakhutdinov, Ruslan},
 title = {Dropout: A Simple Way to Prevent Neural Networks from Overfitting},
 journal = {J. Mach. Learn. Res.},
 issue_date = {January 2014},
 volume = {15},
 number = {1},
 month = jan,
 year = {2014},
 issn = {1532-4435},
 pages = {1929--1958},
 numpages = {30},
 url = {http://dl.acm.org/citation.cfm?id=2627435.2670313},
 acmid = {2670313},
 publisher = {JMLR.org},
 keywords = {deep learning, model combination, neural networks, regularization},
}