pretrained_word2vec.py 2.1 KB
Newer Older
M
mapingshuo 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
14 15 16 17
"""
This module provides pretrained word embeddings.
"""

Y
Yibing Liu 已提交
18
from __future__ import print_function, unicode_literals
19
import numpy as np
M
mapingshuo 已提交
20
import time, datetime
M
mapingshuo 已提交
21 22
import os, sys

Y
Yibing Liu 已提交
23

M
mapingshuo 已提交
24
def maybe_open(filepath):
    """
    Open ``filepath`` for reading in text mode.

    On Python 3 the file is opened with UTF-8 decoding, so iterating it
    yields unicode strings. The Python 2 built-in ``open`` has no
    ``encoding`` argument, so there the file is opened plainly and yields
    byte strings that the caller has to decode itself.

    input: the path of the file to read
    return: the opened file object (usable as a context manager)
    """
    if sys.version_info[0] < 3:  # for python2
        return open(filepath, 'r')
    return open(filepath, 'r', encoding="utf-8")
Y
Yibing Liu 已提交
29 30


M
mapingshuo 已提交
31
def Glove840B_300D(filepath, keys=None):
    """
    Load word vectors from the "glove.840B.300d.txt" text file.

    Every line is expected to hold a word followed by 300 space-separated
    numbers. Progress messages and the elapsed loading time are printed.

    input:
        filepath: the "glove.840B.300d.txt" file path
        keys: optional set (or frozenset) of words to keep; lines whose word
            is not in it are skipped. With None every word is loaded.
    return: a dict, key: word (unicode), value: a float32 numpy array with
        shape [300]
    raises:
        TypeError: if keys is neither None nor a set/frozenset
        ValueError: if a selected line does not hold exactly 300 values
    """
    dim = 300
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if keys is not None and not isinstance(keys, (set, frozenset)):
        raise TypeError(
            "keys must be a set or frozenset, got %s" % type(keys).__name__)
    print("loading word2vec from ", filepath)
    print("please wait for a minute.")
    start = time.time()
    # On Python 2 maybe_open() yields byte strings that must be decoded;
    # the interpreter version cannot change mid-loop, so test it once.
    needs_decode = sys.version_info[0] < 3
    word2vec = {}
    with maybe_open(filepath) as f:
        for line_no, line in enumerate(f, 1):
            if needs_decode:
                line = line.decode('utf-8')
            # Split from the right so that the last 300 fields are always the
            # vector, even when the word itself contains spaces.
            info = line.strip("\n").rsplit(" ", dim)
            word = info[0]
            if (keys is not None) and (word not in keys):
                continue
            vector = info[1:]
            if len(vector) != dim:
                raise ValueError(
                    "line %d of %s holds %d values, expected %d" %
                    (line_no, filepath, len(vector), dim))
            word2vec[word] = np.asarray(vector, dtype='float32')

    end = time.time()
    print(
        "Spent ",
        str(datetime.timedelta(seconds=end - start)),
        " on loading word2vec.")
    return word2vec
M
mapingshuo 已提交
60

Y
Yibing Liu 已提交
61

62
if __name__ == '__main__':
    # Manual smoke run: load the full embedding file from the user's home
    # directory cache and exit with status 0 on success.
    home = os.path.expanduser("~")
    embed_dict = Glove840B_300D(
        os.path.join(home, "./.cache/paddle/dataset/glove.840B.300d.txt"))
    # sys.exit instead of the bare `exit` helper, which is only injected by
    # the `site` module and is not meant for use in programs.
    sys.exit(0)