# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the 'License');
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an 'AS IS' BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
Script for downloading training data.
'''
import os
import urllib
import sys

if sys.version_info >= (3, 0):
    import urllib.request
import zipfile

# Module that provides ``urlretrieve``: ``urllib.request`` on Python 3,
# plain ``urllib`` on Python 2.
URLLIB = urllib.request if sys.version_info >= (3, 0) else urllib

# Remote directory the data files are fetched from.
remote_path = 'https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi'
# Local layout: files are stored under <base_path>/en-vi.
base_path = 'data'
trg_path = os.path.join(base_path, 'en-vi')
# Names of the files to download; each is appended to ``remote_path``
# and saved under ``trg_path`` with the same name.
filenames = [
    'train.en', 'train.vi', 'tst2012.en', 'tst2012.vi', 'tst2013.en',
    'tst2013.vi', 'vocab.en', 'vocab.vi'
]


def main(arguments):
    '''
    Download every file listed in ``filenames`` into ``trg_path``.

    Args:
        arguments: Command-line arguments without the program name.
            Currently unused; kept so the entry-point signature is stable.

    Returns:
        None, so ``sys.exit(main(...))`` exits with status 0 on success.

    Raises:
        Any error raised by ``os.makedirs`` or ``urlretrieve`` propagates
        to the caller unchanged.
    '''
    print("Downloading data......")

    # makedirs creates ``base_path`` and ``trg_path`` in a single call.
    if not os.path.isdir(trg_path):
        os.makedirs(trg_path)

    for filename in filenames:
        # URLs always use '/'; os.path.join would insert the OS separator
        # (a backslash on Windows) and produce an invalid URL.
        url = remote_path.rstrip('/') + '/' + filename
        trg_file = os.path.join(trg_path, filename)
        URLLIB.urlretrieve(url, trg_file)
    print("Downloaded success......")


# Script entry point: run the download and use main()'s return value as
# the process exit status.
if __name__ == '__main__':
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)