【04.word2vec】Why is the training process so slow
Created by: stonyhu
I spent 20 hours training the CBOW word2vec demo, but it still has not finished a single pass. However, when I used the Google word2vec tool, it took only a few minutes to get the same job done. Please explain why Paddle is so slow.
# edit-mode: -*- python -*-
import os
import logging
import gzip
import math
import paddle.v2 as paddle
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
save_dir = "output_cbow"
dict_file = "../data/dict.txt"
train_file = "../data/train.txt"
test_file = "../data/test.txt"
N = 3
emb_size = 128
hidden_size = 256
#batch_size=512
def load_dict(dict_file):
word_dict = dict()
with open(dict_file, 'r') as f:
for i, line in enumerate(f):
parts = line.strip().split('\t\t')
id = int(parts[0])
w = parts[1]
word_dict[w] = id
word_dict['<unk>'] = len(word_dict)
return word_dict
def reader_creator(filename, word_idx, n):
def reader():
with open(filename, 'r') as f:
UNK = word_idx['<unk>']
for line in f:
assert n > -1, 'Invalid gram length'
parts = line.strip().split('\t\t')
if len(parts) != 3:
continue
feature_types = parts[2].split('\t')
words = []
for feature_type in feature_types:
pos = feature_type.find(':')
extractor_name = feature_type[:pos]
feature_line = feature_type[pos+1:]
if extractor_name in ['basic_wordseg'] and feature_line != '':
features = feature_line.split('\x01')
for feature in features:
words.append(extractor_name + '::' + feature)
break
l = []
for i in range(n):
l += ['<s>']
l += words
for i in range(n):
l += ['<e>']
if len(l) >= n * 2 + 1:
word_vec = [word_idx.get(w, UNK) for w in l]
ll = []
for i in range(n, len(word_vec) - n):
ll.extend(word_vec[i - n:i])
ll.extend(word_vec[i+1:i+n+1])
ll.append(word_vec[i])
yield tuple(ll)
return reader
def wordemb(layer):
wordemb = paddle.layer.table_projection(
input=layer,
size=emb_size,
param_attr=paddle.attr.Param(
name="_proj",
initial_std=0.001,
learning_rate=1,
l2_rate=0,))
return wordemb
def main():
if not os.path.exists(save_dir):
os.mkdir(save_dir)
paddle.init(use_gpu=True, trainer_count=2, gpu_id=0)
word_dict = load_dict(dict_file)
dict_size = len(word_dict)
f_third_w = paddle.layer.data(
name="w_t-3", type=paddle.data_type.integer_value(dict_size))
f_second_w = paddle.layer.data(
name="w_t-2", type=paddle.data_type.integer_value(dict_size))
f_first_w = paddle.layer.data(
name="w_t-1", type=paddle.data_type.integer_value(dict_size))
b_first_w = paddle.layer.data(
name="w_t+1", type=paddle.data_type.integer_value(dict_size))
b_second_w = paddle.layer.data(
name="w_t+2", type=paddle.data_type.integer_value(dict_size))
b_third_w = paddle.layer.data(
name="w_t+3", type=paddle.data_type.integer_value(dict_size))
target_word = paddle.layer.data(
name="w_t", type=paddle.data_type.integer_value(dict_size))
emb_f_third = wordemb(f_third_w)
emb_f_second = wordemb(f_second_w)
emb_f_first = wordemb(f_first_w)
emb_b_first = wordemb(b_first_w)
emb_b_second = wordemb(b_second_w)
emb_b_third = wordemb(b_third_w)
context_emb = paddle.layer.concat(input=[
emb_f_third, emb_f_second, emb_f_first,
emb_b_first, emb_b_second, emb_b_third])
hidden_layer = paddle.layer.fc(
input=context_emb,
size=hidden_size,
act=paddle.activation.Sigmoid(),
layer_attr=paddle.attr.Extra(drop_rate=0.5),
bias_attr=paddle.attr.Param(learning_rate=2),
param_attr=paddle.attr.Param(
initial_std=1. / math.sqrt(emb_size * 8), learning_rate=1))
predict_word = paddle.layer.fc(
input=hidden_layer,
size=dict_size,
bias_attr=paddle.attr.Param(learning_rate=2),
act=paddle.activation.Softmax())
result_list = []
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id and event.batch_id % 10 == 0:
logger.info("Pass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics))
if isinstance(event, paddle.event.EndPass):
# save pass to file
model_name = os.path.join(save_dir, "pass_%05d.tar.gz" %
event.pass_id)
with gzip.open(model_name, "w") as f:
parameters.to_tar(f)
result = trainer.test(paddle.batch(
reader=reader_creator(test_file, word_dict, N), batch_size=512))
logger.info("Test with Pass %d, Cost %f, %s\n" % (
event.pass_id, result.cost, result.metrics))
result_list.append((event.pass_id, result.cost,
result.metrics['classification_error_evaluator']))
cost = paddle.layer.classification_cost(input=predict_word, label=target_word)
parameters = paddle.parameters.create(cost)
adam_optimizer = paddle.optimizer.Adam(
learning_rate=3e-3,
regularization=paddle.optimizer.L2Regularization(8e-4))
trainer = paddle.trainer.SGD(cost, parameters, adam_optimizer)
trainer.train(paddle.batch(
reader=reader_creator(train_file, word_dict, N), batch_size=512),
num_passes=30,
event_handler=event_handler)
# find the best pass
best = sorted(result_list, key=lambda list: float(list[1]))[0]
logger.info("Best pass is %s, testing Avgcost is %s" % (best[0], best[1]))
logger.info("The classification accuracy is %.2f%%" % (100 - float(best[2]) * 100))
if __name__ == "__main__":
main()