#!/bin/env python
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Example:
    python preprocess.py -i INPUT [-d DICTSIZE] [-m]

Options:
    -h, --help     show this help message and exit
    -i INPUT       input original dataset path
    -d DICTSIZE    specified word count of dictionary
    -m, --mergeDict merge source and target dictionary
"""
import os
from optparse import OptionParser

from paddle.v2.dataset.wmt14_util import SeqToSeqDatasetCreater


def main():
    """Parse the command line and build the preprocessed dataset.

    Reads ``-i INPUT``, ``-d DICTSIZE`` and ``-m/--mergeDict`` from
    ``sys.argv``. The output directory is a sibling of INPUT named
    ``pre-<basename of INPUT>``. When that directory does not exist yet it
    is created and ``SeqToSeqDatasetCreater`` is run on INPUT; when it
    already exists nothing is done.

    Exits through ``parser.error`` (status 2, usage message on stderr) when
    ``-i`` is missing or empty, or when ``-d`` is not an integer.
    """
    usage = "usage: \n" \
            "python %prog -i INPUT [-d DICTSIZE] [-m]"
    parser = OptionParser(usage)
    parser.add_option(
        "-i", action="store", dest="input", help="input original dataset path")
    parser.add_option(
        "-d",
        action="store",
        dest="dictsize",
        help="specified word count of dictionary")
    parser.add_option(
        "-m",
        "--mergeDict",
        action="store_true",
        dest="mergeDict",
        help="merge source and target dictionary")
    options, _ = parser.parse_args()

    # Without this guard a missing -i fails on None[-1] (TypeError) and an
    # empty one on ""[-1] (IndexError); report a usage error instead.
    if not options.input:
        parser.error("input dataset path is required (-i INPUT)")

    # Drop one trailing separator so basename() yields the directory name
    # rather than an empty string.
    input_path = options.input
    if input_path[-1] == os.path.sep:
        input_path = input_path[:-1]

    # -1 is the value passed on when no dictionary size was requested.
    dictsize = -1
    if options.dictsize:
        try:
            dictsize = int(options.dictsize)
        except ValueError:
            parser.error(
                "-d DICTSIZE must be an integer, got %r" % options.dictsize)

    outname = os.path.basename(input_path)
    output_path = os.path.join(os.path.dirname(input_path), 'pre-' + outname)

    # An existing output directory is left untouched and nothing is rebuilt.
    if not os.path.exists(output_path):
        os.mkdir(output_path)
        data_creator = SeqToSeqDatasetCreater(input_path, output_path)
        data_creator.create_dataset(dictsize, options.mergeDict)

# Run the preprocessing only when executed as a script, not on import.
if __name__ == "__main__":
    main()