diff --git a/Senta/nets.py b/Senta/nets.py
index 6333c5b4b4bb4913cb656cff6789aab360e2cb57..b70285f858cacb61213f6a5b904a92e373dcac26 100644
--- a/Senta/nets.py
+++ b/Senta/nets.py
@@ -28,8 +28,8 @@ def bow_net(data,
     fc_2 = fluid.layers.fc(
         input=fc_1, size=hid_dim2, act="tanh", name="bow_fc2")
     # softmax layer
-    prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax",
-                                 name="fc_softmax")
+    prediction = fluid.layers.fc(
+        input=[fc_2], size=class_dim, act="softmax", name="fc_softmax")
     cost = fluid.layers.cross_entropy(input=prediction, label=label)
     avg_cost = fluid.layers.mean(x=cost)
     acc = fluid.layers.accuracy(input=prediction, label=label)
diff --git a/Senta/sentiment_classify.py b/Senta/sentiment_classify.py
index 1c9c41006472f24bf121189b672efd5d605ff83d..636a86497f6e320bf72d8a94f8e7155a5857a525 100644
--- a/Senta/sentiment_classify.py
+++ b/Senta/sentiment_classify.py
@@ -1,6 +1,9 @@
 # coding: utf-8
 import sys
+# NOTE: just a hack for fast local testing
 sys.path.append("../")
+sys.path.append("../paddle_hub/")
+import os
 import time
 import unittest
 import contextlib
@@ -114,7 +117,7 @@ def remove_feed_fetch_op(program):
 
 def train_net(train_reader,
               word_dict,
-              network,
+              network_name,
               use_gpu,
               parallel,
               save_dirname,
@@ -124,39 +127,85 @@ def train_net(train_reader,
     """
     train network
     """
-    if network == "bilstm_net":
+    if network_name == "bilstm_net":
         network = bilstm_net
-    elif network == "bow_net":
+    elif network_name == "bow_net":
         network = bow_net
-    elif network == "cnn_net":
+    elif network_name == "cnn_net":
         network = cnn_net
-    elif network == "lstm_net":
+    elif network_name == "lstm_net":
         network = lstm_net
-    elif network == "gru_net":
+    elif network_name == "gru_net":
+        network = gru_net
+    else:
+        print("unknown network type")
+        return
+
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+    cost, acc, pred, emb = network(data, label, len(word_dict) + 2)
+
+    # set optimizer
+    sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=lr)
+    sgd_optimizer.minimize(cost)
+
+    # set place, executor, datafeeder
+    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    feeder = fluid.DataFeeder(feed_list=["words", "label"], place=place)
+    exe.run(fluid.default_startup_program())
+    # start training...
+
+    for pass_id in range(pass_num):
+        data_size, data_count, total_acc, total_cost = 0, 0, 0.0, 0.0
+        for batch in train_reader():
+            avg_cost_np, avg_acc_np = exe.run(
+                fluid.default_main_program(),
+                feed=feeder.feed(batch),
+                fetch_list=[cost, acc],
+                return_numpy=True)
+            data_size = len(batch)
+            total_acc += data_size * avg_acc_np
+            total_cost += data_size * avg_cost_np
+            data_count += data_size
+        avg_cost = total_cost / data_count
+        avg_acc = total_acc / data_count
+        print("[train info]: pass_id: %d, avg_acc: %f, avg_cost: %f" %
+              (pass_id, avg_acc, avg_cost))
+
+    # save the model
+    module_path = os.path.join(save_dirname, network_name)
+    hub.ModuleDesc.save_module_dict(
+        module_path=module_path, word_dict=word_dict)
+    fluid.io.save_inference_model(module_path, ["words"], emb, exe)
+
+
+def retrain_net(train_reader,
+                word_dict,
+                network_name,
+                use_gpu,
+                parallel,
+                save_dirname,
+                lr=0.002,
+                batch_size=128,
+                pass_num=30):
+    """
+    retrain the network on top of a pretrained module
+    """
+    if network_name == "bilstm_net":
+        network = bilstm_net
+    elif network_name == "bow_net":
+        network = bow_net
+    elif network_name == "cnn_net":
+        network = cnn_net
+    elif network_name == "lstm_net":
+        network = lstm_net
+    elif network_name == "gru_net":
         network = gru_net
     else:
         print("unknown network type")
         return
-    # word seq data
-    # data = fluid.layers.data(
-    #     name="words", shape=[1], dtype="int64", lod_level=1)
-
-    # if not parallel:
-    #     # set network
-    #     cost, acc, pred, emb = network(data, label, len(word_dict) + 2)
-    # else:
-    #     places = fluid.layers.get_places(device_count=2)
-    #     pd = fluid.layers.ParallelDo(places)
-    #     with pd.do():
-    #         # set network
-    #         cost, acc, prediction, emb = network(
-    #             pd.read_input(data), pd.read_input(label),
-    #             len(word_dict) + 2)
-    #         pd.write_output(cost)
-    #         pd.write_output(acc)
-    #     cost, acc = pd()
-    #     cost = fluid.layers.mean(cost)
-    #     acc = fluid.layers.mean(acc)
 
     dict_dim = len(word_dict) + 2
     emb_dim = 128
@@ -164,7 +213,8 @@ def train_net(train_reader,
     hid_dim2 = 96
     class_dim = 2
 
-    module_link = "https://paddlehub.cdn.bcebos.com/senta/bow_module_3.tar.gz"
+    # module_link = "https://paddlehub.cdn.bcebos.com/senta/bow_module_3.tar.gz"
+    module_link = "./models/bow_net/"
     module = hub.Module(module_link)
 
     main_program = fluid.Program()
@@ -178,6 +228,7 @@ def train_net(train_reader,
 
     label = fluid.layers.data(name="label", shape=[1], dtype="int64")
     data = fluid.default_main_program().global_block().var("words")
+    # TODO(ZeyuChen): how to get the output parameter according to the proto config
     emb = module.get_module_output()
 
     # # # embedding layer
@@ -198,20 +249,6 @@ def train_net(train_reader,
         fluid.layers.cross_entropy(input=pred, label=label))
     acc = fluid.layers.accuracy(input=pred, label=label)
 
-    # Original Senta BoW networks
-    # label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-    # data = fluid.layers.data(
-    #     name="words", shape=[1], dtype="int64", lod_level=1)
-    # cost, acc, pred, emb = network(data, label, len(word_dict) + 2)
-
-    # print("new program")
-    # with open("program_senta.prototxt", "w") as fo:
-    #     fo.write(str(fluid.default_main_program()))
-    # print("program_senta", fluid.default_main_program())
-    with open("senta_load_module.prototxt", "w") as fo:
-        fo.write(str(fluid.default_main_program()))
-    print("senta_load_module", fluid.default_main_program())
-
     # set optimizer
     sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=lr)
     sgd_optimizer.minimize(cost)
@@ -246,8 +283,8 @@ def train_net(train_reader,
     # print("senta_load_module", fluid.default_main_program())
 
     # save the model
-    bow_module_path = save_dirname + "/" + "bow_module"
-    fluid.io.save_inference_model(bow_module_path, ["words"], emb, exe)
+    module_path = os.path.join(save_dirname, network_name + "_retrain")
+    fluid.io.save_inference_model(module_path, ["words"], emb, exe)
 
 
 def eval_net(test_reader, use_gpu, model_path=None):
@@ -339,9 +376,12 @@ def main(args):
                                                args.word_dict_path,
                                                args.batch_size, args.mode)
-        train_net(train_reader, word_dict, args.model_type, args.use_gpu,
-                  args.is_parallel, args.model_path, args.lr, args.batch_size,
-                  args.num_passes)
+        # train_net(train_reader, word_dict, args.model_type, args.use_gpu,
+        #           args.is_parallel, args.model_path, args.lr, args.batch_size,
+        #           args.num_passes)
+        retrain_net(train_reader, word_dict, args.model_type, args.use_gpu,
+                    args.is_parallel, args.model_path, args.lr, args.batch_size,
+                    args.num_passes)
 
     # eval mode
     elif args.mode == "eval":
diff --git a/paddle_hub/module.py b/paddle_hub/module.py
index 29e61c72cc51522238eb767f7f61df81bbf6a401..d2a5c61c825a23e321e8d20df7a6e591c831d8cc 100644
--- a/paddle_hub/module.py
+++ b/paddle_hub/module.py
@@ -19,18 +19,33 @@ from __future__ import print_function
 import paddle.fluid as fluid
 import numpy as np
 import tempfile
+import utils
 import os
 from collections import defaultdict
 from downloader import download_and_uncompress
 
 __all__ = ["Module", "ModuleDesc"]
+DICT_NAME = "dict.txt"
+ASSETS_PATH = "assets"
+
+
+def mkdir(path):
+    """ the same as the shell command "mkdir -p"
+    """
+    if not os.path.exists(path):
+        os.makedirs(path)
 
 
 class Module(object):
     def __init__(self, module_url):
         # download module
-        module_dir = download_and_uncompress(module_url)
+        if module_url.startswith("http"):
+            # remote url link: download and uncompress it
+            module_dir = download_and_uncompress(module_url)
+        else:
+            # local path: use the module directory as-is
+            module_dir = module_url
 
         # load paddle inference model
         place = fluid.CPUPlace()
@@ -90,6 +105,7 @@ class Module(object):
     def get_module_output(self):
         for var in self.inference_program.list_vars():
             print(var)
+            # NOTE: just a hack to pick out Senta's embedding output
             if var.name == "embedding_0.tmp_0":
                 return var
 
@@ -128,17 +144,22 @@ class Module(object):
     # load assets folder
     def _load_assets(self, module_dir):
-        assets_dir = os.path.join(module_dir, "assets")
-        tokens_path = os.path.join(assets_dir, "tokens.txt")
+        assets_dir = os.path.join(module_dir, ASSETS_PATH)
+        tokens_path = os.path.join(assets_dir, DICT_NAME)
         word_id = 0
         with open(tokens_path) as fi:
             words = fi.readlines()
-            words = map(str.strip, words)
-            for w in words:
-                self.dict[w] = word_id
-                word_id += 1
-                print(w, word_id)
+            # TODO(ZeyuChen) check whether word ids are duplicated or invalid
+            for line in words:
+                w, w_id = line.split()
+                self.dict[w] = int(w_id)
+
+            # words = map(str.strip, words)
+            # for w in words:
+            #     self.dict[w] = word_id
+            #     word_id += 1
+            #     print(w, word_id)
 
     def add_module_feed_list(self, feed_list):
         self.feed_list = feed_list
@@ -146,35 +167,27 @@ class Module(object):
     def add_module_output_list(self, output_list):
         self.output_list = output_list
 
-    def _mkdir(self, path):
-        if not os.path.exists(path):
-            os.makedirs(path)
-
 
 class ModuleDesc(object):
     def __init__(self):
         pass
 
-    @staticmethod
-    def _mkdir(path):
-        if not os.path.exists(path):
-            os.makedirs(path)
-
     @staticmethod
     def save_dict(path, word_dict, dict_name):
         """ Save dictionary for NLP module
         """
-        ModuleDesc._mkdir(path)
+        mkdir(path)
         with open(os.path.join(path, dict_name), "w") as fo:
print("tokens.txt path", os.path.join(path, "tokens.txt")) - dict_str = "\n".join(word_dict) - fo.write(dict_str) + print("tokens.txt path", os.path.join(path, DICT_NAME)) + for w in word_dict: + w_id = word_dict[w] + fo.write("{}\t{}\n".format(w, w_id)) @staticmethod - def save_module_dict(module_path, word_dict, dict_name="dict.txt"): + def save_module_dict(module_path, word_dict, dict_name=DICT_NAME): """ Save dictionary for NLP module """ - assets_path = os.path.join(module_path, "assets") + assets_path = os.path.join(module_path, ASSETS_PATH) print("save_module_dict", assets_path) ModuleDesc.save_dict(assets_path, word_dict, dict_name) pass