#!/usr/bin/env python
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Command-line driver that hands an input dataset path to
SeqToSeqDatasetCreater and writes the result next to the input, in a
sibling directory named 'pre-<input name>'.

Example:
    python preprocess.py -i INPUT [-d DICTSIZE] [-m]

Options:
    -h, --help     show this help message and exit
    -i INPUT       input original dataset path
    -d DICTSIZE    specified word count of dictionary
    -m --mergeDict merge source and target dictionary
"""
import os
from optparse import OptionParser

from paddle.v2.dataset.wmt14_util import SeqToSeqDatasetCreater


def main():
    """Parse the command line and run the dataset creation.

    The output directory is '<parent of INPUT>/pre-<basename of INPUT>'; it
    is created when it does not exist yet. DICTSIZE is forwarded as an int,
    or as -1 when '-d' is not given.

    Exits through ``parser.error`` (usage message, exit status 2) when '-i'
    is missing or empty, or when DICTSIZE is not an integer.
    """
    usage = "usage: \n" \
            "python %prog -i INPUT [-d DICTSIZE] [-m]"
    parser = OptionParser(usage)
    parser.add_option(
        "-i", action="store", dest="input", help="input original dataset path")
    parser.add_option(
        "-d",
        action="store",
        dest="dictsize",
        help="specified word count of dictionary")
    parser.add_option(
        "-m",
        "--mergeDict",
        action="store_true",
        dest="mergeDict",
        help="merge source and target dictionary")
    options, _ = parser.parse_args()

    # '-i' is mandatory; without this check a missing value surfaces as an
    # opaque TypeError/IndexError when the path is inspected below.
    if not options.input:
        parser.error("input dataset path is required (-i INPUT)")

    # Strip every trailing separator so basename() yields the real directory
    # name; with a separator left over, basename() is empty and the output
    # directory would be created inside the input directory as 'pre-'.
    input_path = options.input.rstrip(os.path.sep)
    if not input_path:
        parser.error("invalid input dataset path: %r" % options.input)

    outname = os.path.basename(input_path)
    output_path = os.path.join(os.path.dirname(input_path), 'pre-' + outname)

    # -1 is what gets passed when no size is requested.
    # NOTE(review): presumably means "no limit" -- confirm in create_dataset.
    if options.dictsize:
        try:
            dictsize = int(options.dictsize)
        except ValueError:
            parser.error("DICTSIZE must be an integer, got %r" %
                         options.dictsize)
    else:
        dictsize = -1

    if not os.path.exists(output_path):
        os.mkdir(output_path)

    data_creator = SeqToSeqDatasetCreater(input_path, output_path)
    data_creator.create_dataset(dictsize, options.mergeDict)


if __name__ == "__main__":
    main()