Commit 501ce212 authored by caoying03

rewrite the text classification demo.

Parent 4f0d8acf
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
import random
from paddle.v2.image import load_and_transform
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
import gzip
import paddle.v2 as paddle
@@ -51,10 +37,10 @@ def main():
        learning_rate_schedule="discexp", )
    train_reader = paddle.batch(
-        paddle.reader.shuffle(reader.test_reader("train.list"), buf_size=1000),
+        paddle.reader.shuffle(reader.train_reader("train.list"), buf_size=1000),
        batch_size=BATCH_SIZE)
    test_reader = paddle.batch(
-        reader.train_reader("test.list"), batch_size=BATCH_SIZE)
+        reader.test_reader("test.list"), batch_size=BATCH_SIZE)
    # End batch and end pass event handler
    def event_handler(event):
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.v2 as paddle
__all__ = ['vgg13', 'vgg16', 'vgg19']
......
data
*.tar.gz
*.log
*.pyc
This diff is collapsed.
This diff is collapsed.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import os
import gzip
import paddle.v2 as paddle
import network_conf
import reader
from utils import *
def infer(topology, data_dir, model_path, word_dict_path, label_dict_path,
batch_size):
def _infer_a_batch(inferer, test_batch):
probs = inferer.infer(input=test_batch, field=['value'])
assert len(probs) == len(test_batch)
for prob in probs:
lab = prob.argmax()
print("%d\t%s\t%s" %
(lab, label_reverse_dict[lab],
"\t".join(["{:0.4f}".format(p) for p in prob])))
logger.info('begin to predict...')
use_default_data = (data_dir is None)
if use_default_data:
word_dict = paddle.dataset.imdb.word_dict()
label_reverse_dict = {0: "positive", 1: "negative"}
test_reader = paddle.dataset.imdb.test(word_dict)
else:
assert os.path.exists(
word_dict_path), 'the word dictionary file does not exist'
assert os.path.exists(
label_dict_path), 'the label dictionary file does not exist'
word_dict = load_dict(word_dict_path)
label_reverse_dict = load_reverse_dict(label_dict_path)
test_reader = reader.test_reader(data_dir, word_dict)()
dict_dim = len(word_dict)
class_num = len(label_reverse_dict)
prob_layer = topology(dict_dim, class_num, is_infer=True)
# initialize PaddlePaddle
paddle.init(use_gpu=False, trainer_count=1)
# load the trained models
parameters = paddle.parameters.Parameters.from_tar(
gzip.open(model_path, 'r'))
inferer = paddle.inference.Inference(
output_layer=prob_layer, parameters=parameters)
test_batch = []
for idx, item in enumerate(test_reader):
test_batch.append([item[0]])
if len(test_batch) == batch_size:
_infer_a_batch(inferer, test_batch)
test_batch = []
    # infer the last, possibly incomplete batch
    if len(test_batch):
        _infer_a_batch(inferer, test_batch)
        test_batch = []
if __name__ == '__main__':
model_path = 'dnn_params_pass_00000.tar.gz'
assert os.path.exists(model_path), "the trained model does not exist."
nn_type = 'dnn'
test_dir = None
word_dict = None
label_dict = None
if nn_type == 'dnn':
topology = network_conf.fc_net
elif nn_type == 'cnn':
topology = network_conf.convolution_net
infer(
topology=topology,
data_dir=test_dir,
word_dict_path=word_dict,
label_dict_path=label_dict,
model_path=model_path,
batch_size=10)
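
For reference, a minimal sketch (all paths below are hypothetical) of pointing infer() at user-provided data instead of the built-in imdb dataset; the two dictionary files must be the ones produced during training:

infer(
    topology=network_conf.fc_net,
    data_dir="data/test",
    word_dict_path="word_dict.txt",
    label_dict_path="label_dict.txt",
    model_path="dnn_params_pass_00000.tar.gz",
    batch_size=10)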
import sys
import math
import gzip
from paddle.v2.layer import parse_network
import paddle.v2 as paddle
__all__ = ["fc_net", "convolution_net"]
def fc_net(dict_dim,
class_num,
emb_dim=28,
hidden_layer_sizes=[28, 8],
is_infer=False):
"""
define the topology of the dnn network
:param dict_dim: size of word dictionary
:type input_dim: int
:params class_num: number of instance class
:type class_num: int
:params emb_dim: embedding vector dimension
:type emb_dim: int
"""
# define the input layers
data = paddle.layer.data("word",
paddle.data_type.integer_value_sequence(dict_dim))
if not is_infer:
lbl = paddle.layer.data("label",
paddle.data_type.integer_value(class_num))
# define the embedding layer
emb = paddle.layer.embedding(input=data, size=emb_dim)
# max pooling to reduce the input sequence into a vector (non-sequence)
seq_pool = paddle.layer.pooling(
input=emb, pooling_type=paddle.pooling.Max())
for idx, hidden_size in enumerate(hidden_layer_sizes):
hidden_init_std = 1.0 / math.sqrt(hidden_size)
hidden = paddle.layer.fc(
input=hidden if idx else seq_pool,
size=hidden_size,
act=paddle.activation.Tanh(),
param_attr=paddle.attr.Param(initial_std=hidden_init_std))
prob = paddle.layer.fc(
input=hidden,
size=class_num,
act=paddle.activation.Softmax(),
param_attr=paddle.attr.Param(initial_std=1.0 / math.sqrt(class_num)))
if is_infer:
return prob
else:
return paddle.layer.classification_cost(
input=prob, label=lbl), prob, lbl
def convolution_net(dict_dim,
class_dim=2,
emb_dim=28,
hid_dim=128,
is_infer=False):
"""
cnn network definition
:param dict_dim: size of word dictionary
:type input_dim: int
:params class_dim: number of instance class
:type class_dim: int
:params emb_dim: embedding vector dimension
:type emb_dim: int
:params hid_dim: number of same size convolution kernels
:type hid_dim: int
"""
# input layers
data = paddle.layer.data("word",
paddle.data_type.integer_value_sequence(dict_dim))
lbl = paddle.layer.data("label", paddle.data_type.integer_value(class_dim))
# embedding layer
emb = paddle.layer.embedding(input=data, size=emb_dim)
# convolution layers with max pooling
conv_3 = paddle.networks.sequence_conv_pool(
input=emb, context_len=3, hidden_size=hid_dim)
conv_4 = paddle.networks.sequence_conv_pool(
input=emb, context_len=4, hidden_size=hid_dim)
# fc and output layer
prob = paddle.layer.fc(
input=[conv_3, conv_4], size=class_dim, act=paddle.activation.Softmax())
if is_infer:
return prob
else:
cost = paddle.layer.classification_cost(input=prob, label=lbl)
return cost, prob, lbl
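
A quick hedged sketch for inspecting a topology built by this module, using the parse_network helper it already imports (dict_dim=1000 is an arbitrary example value):

paddle.init(use_gpu=False, trainer_count=1)
prob = fc_net(dict_dim=1000, class_num=2, is_infer=True)
print(parse_network(prob))  # dumps the serialized network configuration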
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
def train_reader(data_dir, word_dict, label_dict):
"""
Reader interface for training data
:param data_dir: data directory
:type data_dir: str
:param word_dict: path of word dictionary,
the dictionary must has a "UNK" in it.
:type word_dict: Python dict
:param label_dict: path of label dictionary
:type label_dict: Python dict
"""
def reader():
UNK_ID = word_dict["<UNK>"]
word_col = 1
lbl_col = 0
for file_name in os.listdir(data_dir):
with open(os.path.join(data_dir, file_name), "r") as f:
for line in f:
line_split = line.strip().split("\t")
word_ids = [
word_dict.get(w, UNK_ID)
for w in line_split[word_col].split()
]
yield word_ids, label_dict[line_split[lbl_col]]
return reader
def test_reader(data_dir, word_dict):
"""
Reader interface for testing data
:param data_dir: data directory.
:type data_dir: str
:param word_dict: path of word dictionary,
the dictionary must has a "UNK" in it.
:type word_dict: Python dict
"""
def reader():
UNK_ID = word_dict["<UNK>"]
word_col = 1
for file_name in os.listdir(data_dir):
with open(os.path.join(data_dir, file_name), "r") as f:
for line in f:
line_split = line.strip().split("\t")
                    if len(line_split) <= word_col: continue
word_ids = [
word_dict.get(w, UNK_ID)
for w in line_split[word_col].split()
]
yield word_ids, line_split[word_col]
return reader
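
Both readers expect one tab-separated example per line, with the label in column 0 and the white-space tokenized text in column 1 (e.g. "positive\tthis movie is great"). A minimal sketch of driving train_reader directly, with made-up dictionaries and a hypothetical data directory:

word_dict = {"<UNK>": 0, "this": 1, "movie": 2, "is": 3, "great": 4}
label_dict = {"positive": 0, "negative": 1}
for word_ids, label in train_reader("data/train", word_dict, label_dict)():
    print(word_ids, label)  # e.g. ([1, 2, 3, 4], 0)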
#!/bin/sh
python train.py \
--nn_type="dnn" \
--batch_size=64 \
--num_passes=10 \
2>&1 | tee train.log
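
The script above trains the DNN on the built-in imdb data. A hedged variant (all paths are made up) that trains the CNN on a custom corpus through the flags defined in utils.py:

python train.py \
    --nn_type="cnn" \
    --train_data_dir="data/train" \
    --test_data_dir="data/test" \
    --word_dict="word_dict.txt" \
    --label_dict="label_dict.txt" \
    --batch_size=64 \
    --num_passes=10 \
    2>&1 | tee train_cnn.log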
import sys
import paddle.v2 as paddle
import gzip
def convolution_net(dict_dim, class_dim=2, emb_dim=28, hid_dim=128):
"""
cnn network definition
:param dict_dim: size of word dictionary
:type input_dim: int
:params class_dim: number of instance class
:type class_dim: int
:params emb_dim: embedding vector dimension
:type emb_dim: int
:params hid_dim: number of same size convolution kernels
:type hid_dim: int
"""
# input layers
data = paddle.layer.data("word",
paddle.data_type.integer_value_sequence(dict_dim))
    lbl = paddle.layer.data("label", paddle.data_type.integer_value(class_dim))
#embedding layer
emb = paddle.layer.embedding(input=data, size=emb_dim)
# convolution layers with max pooling
conv_3 = paddle.networks.sequence_conv_pool(
input=emb, context_len=3, hidden_size=hid_dim)
conv_4 = paddle.networks.sequence_conv_pool(
input=emb, context_len=4, hidden_size=hid_dim)
# fc and output layer
output = paddle.layer.fc(
input=[conv_3, conv_4], size=class_dim, act=paddle.activation.Softmax())
cost = paddle.layer.classification_cost(input=output, label=lbl)
return cost, output, lbl
def train_cnn_model(num_pass):
"""
train cnn model
:params num_pass: train pass number
:type num_pass: int
"""
# load word dictionary
print 'load dictionary...'
word_dict = paddle.dataset.imdb.word_dict()
dict_dim = len(word_dict)
class_dim = 2
# define data reader
train_reader = paddle.batch(
paddle.reader.shuffle(
lambda: paddle.dataset.imdb.train(word_dict), buf_size=1000),
batch_size=100)
test_reader = paddle.batch(
lambda: paddle.dataset.imdb.test(word_dict), batch_size=100)
# network config
[cost, output, label] = convolution_net(dict_dim, class_dim=class_dim)
# create parameters
parameters = paddle.parameters.create(cost)
# create optimizer
adam_optimizer = paddle.optimizer.Adam(
learning_rate=1e-3,
regularization=paddle.optimizer.L2Regularization(rate=1e-3),
model_average=paddle.optimizer.ModelAverage(average_window=0.5))
# add auc evaluator
paddle.evaluator.auc(input=output, label=label)
# create trainer
trainer = paddle.trainer.SGD(
cost=cost, parameters=parameters, update_equation=adam_optimizer)
# Define end batch and end pass event handler
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
print "\nPass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics)
else:
sys.stdout.write('.')
sys.stdout.flush()
if isinstance(event, paddle.event.EndPass):
result = trainer.test(reader=test_reader, feeding=feeding)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
with gzip.open("cnn_params_pass" + str(event.pass_id) + ".tar.gz",
'w') as f:
parameters.to_tar(f)
# begin training network
feeding = {'word': 0, 'label': 1}
trainer.train(
reader=train_reader,
event_handler=event_handler,
feeding=feeding,
num_passes=num_pass)
print("Training finished.")
def cnn_infer(file_name):
"""
predict instance labels by cnn network
:params file_name: network parameter file
:type file_name: str
"""
print("Begin to predict...")
word_dict = paddle.dataset.imdb.word_dict()
dict_dim = len(word_dict)
class_dim = 2
[_, output, _] = convolution_net(dict_dim, class_dim=class_dim)
parameters = paddle.parameters.Parameters.from_tar(gzip.open(file_name))
infer_data = []
infer_data_label = []
for item in paddle.dataset.imdb.test(word_dict):
infer_data.append([item[0]])
infer_data_label.append(item[1])
predictions = paddle.infer(
output_layer=output,
parameters=parameters,
input=infer_data,
field=['value'])
for i, prob in enumerate(predictions):
print prob, infer_data_label[i]
if __name__ == "__main__":
paddle.init(use_gpu=False, trainer_count=1)
num_pass = 5
train_cnn_model(num_pass=num_pass)
param_file_name = "cnn_params_pass" + str(num_pass - 1) + ".tar.gz"
cnn_infer(file_name=param_file_name)
import sys
import math
import paddle.v2 as paddle
import gzip
def fc_net(dict_dim, class_dim=2, emb_dim=28):
"""
dnn network definition
:param dict_dim: size of word dictionary
:type input_dim: int
:params class_dim: number of instance class
:type class_dim: int
:params emb_dim: embedding vector dimension
:type emb_dim: int
"""
# input layers
data = paddle.layer.data("word",
paddle.data_type.integer_value_sequence(dict_dim))
lbl = paddle.layer.data("label", paddle.data_type.integer_value(class_dim))
# embedding layer
emb = paddle.layer.embedding(input=data, size=emb_dim)
# max pooling
seq_pool = paddle.layer.pooling(
input=emb, pooling_type=paddle.pooling.Max())
# two hidden layers
hd_layer_size = [28, 8]
hd_layer_init_std = [1.0 / math.sqrt(s) for s in hd_layer_size]
hd1 = paddle.layer.fc(
input=seq_pool,
size=hd_layer_size[0],
act=paddle.activation.Tanh(),
param_attr=paddle.attr.Param(initial_std=hd_layer_init_std[0]))
hd2 = paddle.layer.fc(
input=hd1,
size=hd_layer_size[1],
act=paddle.activation.Tanh(),
param_attr=paddle.attr.Param(initial_std=hd_layer_init_std[1]))
# output layer
output = paddle.layer.fc(
input=hd2,
size=class_dim,
act=paddle.activation.Softmax(),
param_attr=paddle.attr.Param(initial_std=1.0 / math.sqrt(class_dim)))
cost = paddle.layer.classification_cost(input=output, label=lbl)
return cost, output, lbl
def train_dnn_model(num_pass):
"""
train dnn model
:params num_pass: train pass number
:type num_pass: int
"""
# load word dictionary
print 'load dictionary...'
word_dict = paddle.dataset.imdb.word_dict()
dict_dim = len(word_dict)
class_dim = 2
# define data reader
train_reader = paddle.batch(
paddle.reader.shuffle(
lambda: paddle.dataset.imdb.train(word_dict), buf_size=1000),
batch_size=100)
test_reader = paddle.batch(
lambda: paddle.dataset.imdb.test(word_dict), batch_size=100)
# network config
[cost, output, label] = fc_net(dict_dim, class_dim=class_dim)
# create parameters
parameters = paddle.parameters.create(cost)
# create optimizer
adam_optimizer = paddle.optimizer.Adam(
learning_rate=1e-3,
regularization=paddle.optimizer.L2Regularization(rate=1e-3),
model_average=paddle.optimizer.ModelAverage(average_window=0.5))
# add auc evaluator
paddle.evaluator.auc(input=output, label=label)
# create trainer
trainer = paddle.trainer.SGD(
cost=cost, parameters=parameters, update_equation=adam_optimizer)
# Define end batch and end pass event handler
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
print "\nPass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics)
else:
sys.stdout.write('.')
sys.stdout.flush()
if isinstance(event, paddle.event.EndPass):
result = trainer.test(reader=test_reader, feeding=feeding)
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
with gzip.open("dnn_params_pass" + str(event.pass_id) + ".tar.gz",
'w') as f:
parameters.to_tar(f)
# begin training network
feeding = {'word': 0, 'label': 1}
trainer.train(
reader=train_reader,
event_handler=event_handler,
feeding=feeding,
num_passes=num_pass)
print("Training finished.")
def dnn_infer(file_name):
"""
predict instance labels by dnn network
:params file_name: network parameter file
:type file_name: str
"""
print("Begin to predict...")
word_dict = paddle.dataset.imdb.word_dict()
dict_dim = len(word_dict)
class_dim = 2
[_, output, _] = fc_net(dict_dim, class_dim=class_dim)
parameters = paddle.parameters.Parameters.from_tar(gzip.open(file_name))
infer_data = []
infer_data_label = []
for item in paddle.dataset.imdb.test(word_dict):
infer_data.append([item[0]])
infer_data_label.append(item[1])
predictions = paddle.infer(
output_layer=output,
parameters=parameters,
input=infer_data,
field=['value'])
for i, prob in enumerate(predictions):
print prob, infer_data_label[i]
if __name__ == "__main__":
paddle.init(use_gpu=False, trainer_count=1)
num_pass = 5
train_dnn_model(num_pass=num_pass)
param_file_name = "dnn_params_pass" + str(num_pass - 1) + ".tar.gz"
dnn_infer(file_name=param_file_name)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import os
import gzip
import paddle.v2 as paddle
import network_conf
import reader
from utils import *
def train(topology,
train_data_dir=None,
test_data_dir=None,
word_dict_path=None,
label_dict_path=None,
batch_size=32,
num_passes=10):
"""
train dnn model
:params train_data_path: path of training data, if this parameter
is not specified, paddle.dataset.imdb will be used to run this example
:type train_data_path: str
:params test_data_path: path of testing data, if this parameter
is not specified, paddle.dataset.imdb will be used to run this example
:type test_data_path: str
:params word_dict_path: path of training data, if this parameter
is not specified, paddle.dataset.imdb will be used to run this example
:type word_dict_path: str
:params num_pass: train pass number
:type num_pass: int
"""
use_default_data = (train_data_dir is None)
if use_default_data:
logger.info(("No training data are porivided, "
"use paddle.dataset.imdb to train the model."))
logger.info("please wait to build the word dictionary ...")
word_dict = paddle.dataset.imdb.word_dict()
train_reader = paddle.batch(
paddle.reader.shuffle(
lambda: paddle.dataset.imdb.train(word_dict), buf_size=1000),
batch_size=100)
test_reader = paddle.batch(
lambda: paddle.dataset.imdb.test(word_dict), batch_size=100)
class_num = 2
else:
if word_dict_path is None or not os.path.exists(word_dict_path):
logger.info(("word dictionary is not given, the dictionary "
"is automatically built from the training data."))
# build the word dictionary to map the original string-typed
# words into integer-typed index
build_dict(
data_dir=train_data_dir,
save_path=word_dict_path,
use_col=1,
cutoff_fre=5,
insert_extra_words=["<UNK>"])
if not os.path.exists(label_dict_path):
logger.info(("label dictionary is not given, the dictionary "
"is automatically built from the training data."))
# build the label dictionary to map the original string-typed
# label into integer-typed index
build_dict(
data_dir=train_data_dir, save_path=label_dict_path, use_col=0)
word_dict = load_dict(word_dict_path)
lbl_dict = load_dict(label_dict_path)
class_num = len(lbl_dict)
logger.info("class number is : %d." % (len(lbl_dict)))
train_reader = paddle.batch(
paddle.reader.shuffle(
reader.train_reader(train_data_dir, word_dict, lbl_dict),
buf_size=1000),
batch_size=batch_size)
if test_data_dir is not None:
            # because the training and testing data share the same format,
            # we still use reader.train_reader to read the testing data.
test_reader = paddle.batch(
paddle.reader.shuffle(
reader.train_reader(test_data_dir, word_dict, lbl_dict),
buf_size=1000),
batch_size=batch_size)
else:
test_reader = None
dict_dim = len(word_dict)
logger.info("length of word dictionary is : %d." % (dict_dim))
paddle.init(use_gpu=False, trainer_count=1)
# network config
cost, prob, label = topology(dict_dim, class_num)
# create parameters
parameters = paddle.parameters.create(cost)
# create optimizer
adam_optimizer = paddle.optimizer.Adam(
learning_rate=1e-3,
regularization=paddle.optimizer.L2Regularization(rate=1e-3),
model_average=paddle.optimizer.ModelAverage(average_window=0.5))
# create trainer
trainer = paddle.trainer.SGD(
cost=cost,
extra_layers=paddle.evaluator.auc(input=prob, label=label),
parameters=parameters,
update_equation=adam_optimizer)
# begin training network
feeding = {"word": 0, "label": 1}
def _event_handler(event):
"""
Define end batch and end pass event handler
"""
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
logger.info("Pass %d, Batch %d, Cost %f, %s\n" % (
event.pass_id, event.batch_id, event.cost, event.metrics))
if isinstance(event, paddle.event.EndPass):
if test_reader is not None:
result = trainer.test(reader=test_reader, feeding=feeding)
logger.info("Test at Pass %d, %s \n" % (event.pass_id,
result.metrics))
with gzip.open("dnn_params_pass_%05d.tar.gz" % event.pass_id,
"w") as f:
parameters.to_tar(f)
trainer.train(
reader=train_reader,
event_handler=_event_handler,
feeding=feeding,
num_passes=num_passes)
logger.info("Training has finished.")
def main(args):
if args.nn_type == "dnn":
topology = network_conf.fc_net
elif args.nn_type == "cnn":
topology = network_conf.convolution_net
train(
topology=topology,
train_data_dir=args.train_data_dir,
test_data_dir=args.test_data_dir,
word_dict_path=args.word_dict,
label_dict_path=args.label_dict,
batch_size=args.batch_size,
num_passes=args.num_passes)
if __name__ == "__main__":
args = parse_train_cmd()
if args.train_data_dir is not None:
        assert args.word_dict and args.label_dict, (
            "when train_data_dir is set, the word_dict and label_dict "
            "parameters must also be set.")
main(args)
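
Equivalently, train() can be driven from Python without the command line; a minimal sketch with hypothetical paths:

train(
    topology=network_conf.convolution_net,
    train_data_dir="data/train",
    test_data_dir="data/test",
    word_dict_path="word_dict.txt",
    label_dict_path="label_dict.txt",
    batch_size=64,
    num_passes=10)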
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
import os
import argparse
from collections import defaultdict
logger = logging.getLogger("logger")
logger.setLevel(logging.INFO)
def parse_train_cmd():
parser = argparse.ArgumentParser(
description="PaddlePaddle text classification demo")
parser.add_argument(
"--nn_type",
type=str,
help="define which type of network to use, available: [dnn, cnn]",
default="dnn")
parser.add_argument(
"--train_data_dir",
type=str,
required=False,
help=("path of training dataset (default: None). "
"if this parameter is not set, "
"paddle.dataset.imdb will be used."),
default=None)
parser.add_argument(
"--test_data_dir",
type=str,
required=False,
help=("path of testing dataset (default: None). "
"if this parameter is not set, "
"paddle.dataset.imdb will be used."),
default=None)
parser.add_argument(
"--word_dict",
type=str,
required=False,
help=("path of word dictionary (default: None)."
"if this parameter is not set, paddle.dataset.imdb will be used."
"if this parameter is set, but the file does not exist, "
"word dictionay will be built from "
"the training data automatically."),
default=None)
parser.add_argument(
"--label_dict",
type=str,
required=False,
help=("path of label dictionay (default: None)."
"if this parameter is not set, paddle.dataset.imdb will be used."
"if this parameter is set, but the file does not exist, "
"word dictionay will be built from "
"the training data automatically."),
default=None)
parser.add_argument(
"--batch_size",
type=int,
default=32,
help="the number of training examples in one forward/backward pass")
parser.add_argument(
"--num_passes", type=int, default=10, help="number of passes to train")
return parser.parse_args()
def build_dict(data_dir,
save_path,
use_col=0,
cutoff_fre=0,
insert_extra_words=[]):
values = defaultdict(int)
for file_name in os.listdir(data_dir):
file_path = os.path.join(data_dir, file_name)
if not os.path.isfile(file_path):
continue
with open(file_path, "r") as fdata:
for line in fdata:
line_splits = line.strip().split("\t")
                if len(line_splits) <= use_col: continue
for w in line_splits[use_col].split():
values[w] += 1
with open(save_path, "w") as f:
for w in insert_extra_words:
f.write("%s\t-1\n" % (w))
for v, count in sorted(
values.iteritems(), key=lambda x: x[1], reverse=True):
if count < cutoff_fre:
break
f.write("%s\t%d\n" % (v, count))
def load_dict(dict_path):
return dict((line.strip().split("\t")[0], idx)
for idx, line in enumerate(open(dict_path, "r").readlines()))
def load_reverse_dict(dict_path):
return dict((idx, line.strip().split("\t")[0])
for idx, line in enumerate(open(dict_path, "r").readlines()))
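
A minimal sketch (hypothetical paths) of how these helpers fit together: build a word dictionary from column 1 of the training data, keep words that occur at least 5 times, reserve an "<UNK>" entry, then load both mappings:

build_dict(
    data_dir="data/train",
    save_path="word_dict.txt",
    use_col=1,
    cutoff_fre=5,
    insert_extra_words=["<UNK>"])
word_dict = load_dict("word_dict.txt")  # token -> integer id
id_to_word = load_reverse_dict("word_dict.txt")  # integer id -> token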