get_vocab.py 2.2 KB
Newer Older
Q
qiuxuezhong 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
# -*- coding:utf8 -*-
# ==============================================================================
# Copyright 2017 Baidu.com, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""
Utility function to generate vocabulary file.
"""

Y
Yibing Liu 已提交
21

Q
qiuxuezhong 已提交
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
import argparse
import io
import json
import sys

from collections import Counter
from itertools import chain


def get_vocab(files, vocab_file):
    """
    Builds vocabulary file from field 'segmented_paragraphs'
    and 'segmented_question'.

    Every input file is read as UTF-8 text holding one JSON object per
    line; blank lines are skipped. For each object, all tokens found in
    obj['documents'][i]['segmented_paragraphs'] (a list of token lists)
    and in obj['segmented_question'] (a list of tokens) are counted.

    The vocabulary is written as UTF-8 text, one 'token<TAB>count' line
    per token, ordered by descending count. Tokens with equal counts are
    ordered by the token itself so the output is reproducible.

    Args:
        files: A list of file names.
        vocab_file: The file that stores the vocabulary.
    """
    vocab = Counter()
    for f in files:
        # io.open with an explicit encoding behaves the same on Python 2
        # and Python 3 and does not depend on the locale's default codec.
        with io.open(f, 'r', encoding='utf8') as fin:
            for line in fin:
                line = line.strip()
                if not line:
                    # Tolerate empty lines (e.g. a trailing newline).
                    continue
                obj = json.loads(line)
                for doc in obj['documents']:
                    # Flatten the paragraphs lazily instead of building
                    # an intermediate token list per record.
                    vocab.update(
                        chain.from_iterable(doc['segmented_paragraphs']))
                vocab.update(obj['segmented_question'])
    # output: most frequent first, ties broken by token for determinism
    sorted_vocab = sorted(vocab.items(), key=lambda item: (-item[1], item[0]))
    with io.open(vocab_file, 'w', encoding='utf8') as outf:
        for w, c in sorted_vocab:
            # Write text and let the file object encode it; this replaces
            # the Python 2 only `print >> outf` statement.
            outf.write(u'{}\t{}\n'.format(w, c))


if __name__ == '__main__':
    # Command-line entry point: collect the input files and the output
    # path from the arguments, then build the vocabulary file.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--files',
        nargs='+',
        required=True,
        help='file list to count vocab from.')
    cli.add_argument(
        '--vocab',
        required=True,
        help='file to store counted vocab.')
    options = cli.parse_args()
    get_vocab(options.files, options.vocab)
Y
Yibing Liu 已提交
67