diff --git a/.gitignore b/.gitignore index d6ad30bcdedca78251643619cd173047e59c9e00..2f0db776988e536a42d7c307433d642b5d77bf3e 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ __pycache__/ *$py.class .vscode sftp.json +data # C extensions *.so diff --git a/README.md b/README.md index fe9146dc5b245cc0405c745c1bc69f941d1d47c4..70a87dea35a2d8524f1675be3e3d997ab4f591ec 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Natural Language Processing with PyTorch 中文版 +# PyTorch 自然语言处理(Natural Language Processing with PyTorch 中文版) ![](cover.jpg) @@ -20,8 +20,31 @@ * [Chapter 8.用于自然语言处理的高级 Sequence](docs/8.md) * [Chapter 9.经典, 前沿和后续步骤](docs/9.md) ---- +代码地址: * [在线阅读](https://nlp-pt.apachecn.org) * [ApacheCN 机器学习交流群 629470233](http://shang.qq.com/wpa/qunwpa?idkey=30e5f1123a79867570f665aa3a483ca404b1c3f77737bc01ec520ed5f078ddef) * [ApacheCN 学习资源](http://www.apachecn.org/) + +## 精品推荐 + +> 深度学习必学 + +1. [反向传递](/docs/dl/反向传递.md): +2. [CNN原理](/docs/dl/CNN原理.md): +3. [RNN原理](/docs/dl/RNN原理.md): +4. [LSTM原理](/docs/dl/LSTM原理.md): + +> 自然语言处理 + +* Python 自然语言处理 第二版: +* 推荐一个[liuhuanyong大佬](https://github.com/liuhuanyong)整理的nlp全面知识体系: +* 开源 - 词向量库集合: + * + * + * + * + +--- + +内容由 [ApacheCN](https://github.com/apachecn) 团队提供支持 diff --git a/docs/chatbot.md b/docs/chatbot.md index 70f7deee133f53ed681c567ed7a9feb0da93d621..15711295a5ef705fc3d31a2b35c486939d20e1a6 100644 --- a/docs/chatbot.md +++ b/docs/chatbot.md @@ -102,3 +102,58 @@ u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L207', 'L208'] 4. 打印输出 pairs 5. voc.trim, 过滤掉词频数据(有利于让训练更快收敛的策略是去除词汇表中很少使用的单词。减少特征空间也会降低模型学习目标函数的难度) +``` +['there .', 'where ?'] +['you have my word . as a gentleman', 'you re sweet .'] +['hi .', 'looks like things worked out tonight huh ?'] +['you know chastity ?', 'i believe we share an art instructor'] +['have fun tonight ?', 'tons'] +['well no . . 
.', 'then that s all you had to say .'] +['then that s all you had to say .', 'but'] +['but', 'you always been this selfish ?'] +['do you listen to this crap ?', 'what crap ?'] +['what good stuff ?', 'the real you .'] +``` + +### 为模型格式化数据 + +1. 加速训练,利用GPU并行计算能力,则需要使用小批量 `mini-batches` +2. 为了保证数据长短一致,设置 `(max_length,batch_size)`, 短于 max_length 的句子在 EOS_token 之后进行零填充 `(zero padded)` +3. 矩阵转置(以便跨第一维的索引返回批处理中所有句子的时间步长) + +![](https://pytorch.apachecn.org/docs/1.0/img/b2f1969c698070d055c23fc81ab07b1b.jpg) + +## 定义模型 + +Seq2seq模型的目标是将可变长度序列作为输入,并使用固定大小的模型将可变长度序列作为输出返回。 + +* Seq2Seq模型: + 1. 编码器,其将可变长度输入序列编码为固定长度上下文向量。 + 2. 解码器,它接收输入文字和上下文矢量,并返回序列中下一句文字的概率和在下一次迭代中使用的隐藏状态。 + +![](https://pytorch.apachecn.org/docs/1.0/img/32a87cf8d0353ceb0037776f833b92a7.jpg) + + +* 编码器: + +如果将填充的一批序列传递给RNN模块,我们必须分别使用torch.nn.utils.rnn.pack_padded_sequence和torch.nn.utils.rnn.pad_packed_sequence在RNN传递时分别进行填充和反填充。 + +```py +def forward(self, input_seq, input_lengths, hidden=None): + # Convert word indexes to embeddings + embedded = self.embedding(input_seq) + # Pack padded batch of sequences for RNN module + packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths) + # Forward pass through GRU + outputs, hidden = self.gru(packed, hidden) + # Unpack padding + outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs) + # Sum bidirectional GRU outputs + outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:] + # Return output and final hidden state + return outputs, hidden +``` + +![](https://pytorch.apachecn.org/docs/1.0/img/c653271eb5fb762482bceb5e2464e680.jpg) + +* 解码器: \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..a8924c42403c87fb038c41f19aedf2f59af2b369 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +fire +torch diff --git a/src/Chinese_ChatBot/QQ_ETL.py b/src/Chinese_ChatBot/QQ_ETL.py new file mode 100644 index 
def length_w(text):
    """Return the approximate display width of ``text`` in Windows Notepad.

    Every character counts as one cell, plus one extra cell for every two
    additional bytes the character needs in UTF-8. A typical CJK character
    (3 bytes in UTF-8) therefore counts as 2 cells and an ASCII character
    as 1 cell.

    The estimate is known to be wrong for some characters:

    * ``'°'`` is displayed 2 cells wide but encodes to 2 bytes, so it is
      counted as 1 cell.
    * ``'\ufffd'`` is displayed 1 cell wide but encodes to 3 bytes, so it
      is counted as 2 cells.
    * control characters such as ``'\x01'`` encode to 1 byte and are
      counted as 1 cell although Notepad may render them wider.
    * tabs are counted as 1 cell; replace them with spaces beforehand.

    :param text: the string to measure.
    :return: the estimated width as an ``int``.
    """
    char_count = len(text)                   # 1 per character, CJK or ASCII
    utf8_count = len(text.encode('utf-8'))   # 3 per CJK character, 1 per ASCII
    # Floor division keeps the arithmetic in integers; the difference is
    # never negative, so this equals the truncated float division.
    return char_count + (utf8_count - char_count) // 2
def split_line(line):
    """Extract the message text from one ``content | [header]`` record.

    Square brackets are removed from the whole line, then the line is split
    on the literal separator ``" | "``.

    :param line: one formatted chat line; it is passed through ``str()``
        first, so non-string values (for example NaN cells) are accepted.
    :return: the text before the separator when the line consists of exactly
        two parts, otherwise an empty string.
    """
    parts = re.sub(r"[\[\]]+", "", str(line).strip()).split(" | ")
    # The previous implementation also tried to parse the header with
    # ``if names == 3`` -- a list compared to an int, which is never true --
    # and returned the same content on either path, so only the content
    # extraction is kept.
    if len(parts) != 2:
        return ""
    return parts[0]
def evaluate(encoder, decoder, searcher, voc, sentence, max_length=MAX_LENGTH):
    """Decode a reply for one already-normalized input sentence.

    :param encoder: unused here; kept for interface compatibility.
    :param decoder: unused here; kept for interface compatibility.
    :param searcher: callable invoked as ``searcher(input_batch, lengths,
        max_length)`` and returning ``(tokens, scores)``.
    :param voc: vocabulary providing ``word2index`` and ``index2word``.
    :param sentence: space-separated tokens to answer.
    :param max_length: maximum number of tokens to decode.
    :return: list of decoded words, one per returned token.
    :raises KeyError: if ``sentence`` contains a word missing from ``voc``.
    """
    # Prefer the device the searcher was built with; fall back to the
    # module-level ``device`` global, which only exists when this file is
    # executed as a script.
    target_device = getattr(searcher, 'device', None)
    if target_device is None:
        target_device = device

    # words -> indexes, as a batch containing a single sentence
    indexes_batch = [indexesFromSentence(voc, sentence)]
    # The lengths stay on the CPU: they end up in pack_padded_sequence,
    # which expects a CPU tensor (or a list) for the sequence lengths.
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    # Transpose so that the first dimension indexes time steps
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    input_batch = input_batch.to(target_device)

    # Inference only: do not record an autograd graph while decoding
    with torch.no_grad():
        tokens, scores = searcher(input_batch, lengths, max_length)

    # indexes -> words
    return [voc.index2word[token.item()] for token in tokens]
def binaryMatrix(l, value=PAD_token):
    """Build a 0/1 mask with the same layout as the nested sequence ``l``.

    :param l: iterable of token sequences (for example the rows returned by
        ``zeroPadding``).
    :param value: the token treated as padding; defaults to ``PAD_token``.
    :return: list of lists holding ``0`` where the token equals ``value``
        and ``1`` everywhere else.
    """
    # Compare against ``value`` rather than the PAD_token constant so the
    # parameter is actually honoured; with the default the result is the
    # same as before.
    return [[0 if token == value else 1 for token in seq] for seq in l]
def readVocs(datafile):
    """Read the conversation file and return its normalized sentence pairs.

    Every line of ``datafile`` is split on the literal separator ``" | "``
    and each part is passed through ``normalizeString``.

    :param datafile: path of a UTF-8 text file with one
        ``"input | target"`` record per line.
    :return: list of ``[input, target]`` lists of normalized sentences.
        Lines that do not contain the separator are skipped, because a
        one-element pair makes the later length filter fail on ``p[1]``.
    """
    print("Reading lines...")
    # Use a context manager so the file handle is closed deterministically
    with open(datafile, encoding='utf-8') as fp:
        lines = fp.read().strip().split('\n')

    pairs = []
    for line in lines:
        parts = line.split(' | ')
        if len(parts) < 2:
            # Blank or malformed line (also the case for an empty file)
            continue
        pairs.append([normalizeString(s) for s in parts])
    return pairs
def trimRareWords(voc, pairs, MIN_COUNT):
    """Trim rare words from ``voc`` and drop the pairs that used them.

    :param voc: vocabulary object; ``voc.trim(MIN_COUNT)`` is called first
        and ``voc.word2index`` is then used as the set of surviving words.
    :param pairs: list of ``[input_sentence, output_sentence]`` items whose
        sentences are space-separated tokens.
    :param MIN_COUNT: threshold forwarded to ``voc.trim``.
    :return: the pairs whose input and output sentences consist only of
        words still present in ``voc.word2index``.
    """
    # Trim words used under the MIN_COUNT from the voc
    voc.trim(MIN_COUNT)
    known_words = voc.word2index

    def _all_known(sentence):
        # True when every token of the sentence survived the trim
        return all(word in known_words for word in sentence.split(' '))

    # Only keep pairs that contain no trimmed word on either side
    keep_pairs = [pair for pair in pairs if _all_known(pair[0]) and _all_known(pair[1])]

    # Guard the ratio: an empty input list must not raise ZeroDivisionError
    ratio = len(keep_pairs) / len(pairs) if pairs else 0.0
    print("Trimmed from {} pairs to {}, {:.4f} of total".format(len(pairs), len(keep_pairs), ratio))
    return keep_pairs
final hidden state + decoder_hidden = encoder_hidden[:decoder.n_layers] + + # Determine if we are using teacher forcing this iteration + use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False + + # Forward batch of sequences through decoder one time step at a time + if use_teacher_forcing: + for t in range(max_target_len): + decoder_output, decoder_hidden = decoder( + decoder_input, decoder_hidden, encoder_outputs + ) + # Teacher forcing: next input is current target + decoder_input = target_variable[t].view(1, -1) + # Calculate and accumulate loss + mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t]) + loss += mask_loss + print_losses.append(mask_loss.item() * nTotal) + n_totals += nTotal + else: + for t in range(max_target_len): + decoder_output, decoder_hidden = decoder( + decoder_input, decoder_hidden, encoder_outputs + ) + # No teacher forcing: next input is decoder's own current output + _, topi = decoder_output.topk(1) + decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]]) + decoder_input = decoder_input.to(device) + # Calculate and accumulate loss + mask_loss, nTotal = maskNLLLoss(decoder_output, target_variable[t], mask[t]) + loss += mask_loss + print_losses.append(mask_loss.item() * nTotal) + n_totals += nTotal + + # Perform backpropatation + loss.backward() + + # Clip gradients: gradients are modified in place + _ = torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip) + _ = torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip) + + # Adjust model weights + encoder_optimizer.step() + decoder_optimizer.step() + + return sum(print_losses) / n_totals + + +def trainIters(model_name, cp_start_iteration, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, print_every, save_every, clip, corpus_name): + + # Load batches for each iteration + training_batches = [batch2TrainData(voc, 
def TrainModel():
    """Train the chatbot model, resuming from a checkpoint when one exists.

    Sets the module-level globals ``device``, ``corpus_name``,
    ``teacher_forcing_ratio`` and ``hidden_size`` that the training helpers
    in this module read, prepares the vocabulary and sentence pairs, builds
    (or restores) the models and optimizers through ``load_model`` and then
    runs ``trainIters``.
    """
    global device, corpus_name, teacher_forcing_ratio, hidden_size

    USE_CUDA = torch.cuda.is_available()
    device = torch.device("cuda" if USE_CUDA else "cpu")

    corpus_name = "Chinese_ChatBot"
    corpus = os.path.join("data", corpus_name)
    datafile = os.path.join(corpus, "format_data.csv")
    save_dir = os.path.join("data", "save")

    # Configure models
    model_name = 'cb_model'
    attn_model = 'dot'
    #attn_model = 'general'
    #attn_model = 'concat'
    hidden_size = 500
    encoder_n_layers = 2
    decoder_n_layers = 2
    dropout = 0.1
    cp_start_iteration = 0
    learning_rate = 0.0001
    decoder_learning_ratio = 5.0
    teacher_forcing_ratio = 1.0
    clip = 50.0
    print_every = 1
    batch_size = 64
    save_every = 1000
    n_iteration = 8000

    voc = Voc(corpus_name)
    loadFilename = "data/save/cb_model/%s/2-2_500/5000_checkpoint.tar" % (corpus_name)
    if os.path.exists(loadFilename):
        # map_location lets a checkpoint saved on a GPU be opened on a
        # CPU-only machine as well
        checkpoint = torch.load(loadFilename, map_location=device)
        voc.__dict__ = checkpoint['voc_dict']

    # Load/Assemble voc and pairs
    voc, pairs = loadPrepareData(corpus, corpus_name, datafile, voc, save_dir)
    # Print some pairs to validate
    print("\npairs:")
    for pair in pairs[:10]:
        print(pair)

    # Trim voc and pairs
    pairs = trimRareWords(voc, pairs, MIN_COUNT)

    cp_start_iteration, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding = load_model(loadFilename, voc, cp_start_iteration, attn_model, hidden_size, encoder_n_layers, decoder_n_layers, dropout, learning_rate, decoder_learning_ratio)

    # Use appropriate device
    encoder = encoder.to(device)
    decoder = decoder.to(device)
    # Move restored optimizer state next to the parameters. ``.to(device)``
    # is used instead of ``.cuda()`` so this also works without a GPU.
    for optimizer in (encoder_optimizer, decoder_optimizer):
        for state in optimizer.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.to(device)

    # Ensure dropout layers are in train mode
    encoder.train()
    decoder.train()

    print("Starting Training!")
    trainIters(model_name, cp_start_iteration, voc, pairs, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding, encoder_n_layers, decoder_n_layers, save_dir, n_iteration, batch_size, print_every, save_every, clip, corpus_name)
class EncoderRNN(nn.Module):
    """Bidirectional GRU encoder over embedded word indexes.

    :param hidden_size: size of the GRU input and hidden state; the
        embedding must produce vectors of this size.
    :param embedding: module mapping word indexes to vectors.
    :param n_layers: number of stacked GRU layers.
    :param dropout: dropout between GRU layers; forced to 0 when
        ``n_layers == 1``.
    """

    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding

        # Initialize GRU; the input_size and hidden_size params are both set to 'hidden_size'
        # because our input size is a word embedding with number of features == hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
                          dropout=(0 if n_layers == 1 else dropout), bidirectional=True)

    def forward(self, input_seq, input_lengths, hidden=None):
        """Encode a padded batch of index sequences.

        :param input_seq: tensor of word indexes; the GRU and the packing
            helpers are used with their default ``batch_first=False``.
        :param input_lengths: length of every sequence in the batch, as a
            tensor (on any device) or a list, in the order required by
            ``pack_padded_sequence``.
        :param hidden: optional initial hidden state for the GRU.
        :return: ``(outputs, hidden)`` where ``outputs`` is the sum of the
            forward and backward halves of the GRU output and ``hidden`` is
            the final GRU hidden state.
        """
        # Convert word indexes to embeddings
        embedded = self.embedding(input_seq)
        # pack_padded_sequence needs the lengths on the CPU, while callers
        # may have moved them to the GPU together with the rest of the batch
        lengths_cpu = torch.as_tensor(input_lengths).cpu()
        # Pack padded batch of sequences for RNN module
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, lengths_cpu)
        # Forward pass through GRU
        outputs, hidden = self.gru(packed, hidden)
        # Unpack padding
        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum bidirectional GRU outputs
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, :, self.hidden_size:]
        # Return output and final hidden state
        return outputs, hidden
concat_score(self, hidden, encoder_output): + energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh() + return torch.sum(self.v * energy, dim=2) + + def forward(self, hidden, encoder_outputs): + # Calculate the attention weights (energies) based on the given method + if self.method == 'general': + attn_energies = self.general_score(hidden, encoder_outputs) + elif self.method == 'concat': + attn_energies = self.concat_score(hidden, encoder_outputs) + elif self.method == 'dot': + attn_energies = self.dot_score(hidden, encoder_outputs) + + # Transpose max_length and batch_size dimensions + attn_energies = attn_energies.t() + + # Return the softmax normalized probability scores (with added dimension) + return F.softmax(attn_energies, dim=1).unsqueeze(1) + + +class LuongAttnDecoderRNN(nn.Module): + def __init__(self, attn_model, embedding, hidden_size, output_size, n_layers=1, dropout=0.1): + super(LuongAttnDecoderRNN, self).__init__() + + # Keep for reference + self.attn_model = attn_model + self.hidden_size = hidden_size + self.output_size = output_size + self.n_layers = n_layers + self.dropout = dropout + + # Define layers + self.embedding = embedding + self.embedding_dropout = nn.Dropout(dropout) + self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout)) + self.concat = nn.Linear(hidden_size * 2, hidden_size) + self.out = nn.Linear(hidden_size, output_size) + + self.attn = Attn(attn_model, hidden_size) + + def forward(self, input_step, last_hidden, encoder_outputs): + # Note: we run this one step (word) at a time + # Get embedding of current input word + embedded = self.embedding(input_step) + embedded = self.embedding_dropout(embedded) + # Forward through unidirectional GRU + rnn_output, hidden = self.gru(embedded, last_hidden) + # Calculate attention weights from the current GRU output + attn_weights = self.attn(rnn_output, encoder_outputs) + # Multiply attention weights to 
class GreedySearchDecoder(nn.Module):
    """Greedy decoder: at every step keep the highest-scoring token.

    :param encoder: module called as ``encoder(input_seq, input_length)``
        and returning ``(encoder_outputs, encoder_hidden)``.
    :param decoder: module exposing ``n_layers`` and called as
        ``decoder(decoder_input, decoder_hidden, encoder_outputs)``.
    :param device: device on which the working tensors are created.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, input_seq, input_length, max_length):
        """Decode ``max_length`` tokens for the given input.

        :return: ``(all_tokens, all_scores)``, the chosen token indexes and
            their scores concatenated along dimension 0.
        """
        # Run the encoder once over the whole input
        enc_outputs, enc_hidden = self.encoder(input_seq, input_length)
        # The decoder starts from the first ``n_layers`` encoder hidden states
        dec_hidden = enc_hidden[:self.decoder.n_layers]
        # First decoder input is the SOS token
        dec_input = torch.ones(1, 1, device=self.device, dtype=torch.long) * SOS_token

        # Seed the collections with empty tensors so that the result has the
        # same dtype and device even when no step is executed
        token_chunks = [torch.zeros([0], device=self.device, dtype=torch.long)]
        score_chunks = [torch.zeros([0], device=self.device)]

        for _step in range(max_length):
            # One decoder step
            step_output, dec_hidden = self.decoder(dec_input, dec_hidden, enc_outputs)
            # Highest-scoring token and its score
            best_scores, best_tokens = torch.max(step_output, dim=1)
            token_chunks.append(best_tokens)
            score_chunks.append(best_scores)
            # The chosen token (with an added leading dimension) is the next input
            dec_input = torch.unsqueeze(best_tokens, 0)

        return torch.cat(token_chunks, dim=0), torch.cat(score_chunks, dim=0)
def load_model(loadFilename, voc, cp_start_iteration, attn_model, hidden_size, encoder_n_layers, decoder_n_layers,
               dropout, learning_rate, decoder_learning_ratio):
    """Build the encoder, decoder and optimizers, restoring a checkpoint if present.

    :param loadFilename: checkpoint path; when the file does not exist,
        freshly initialized models and optimizers are returned.
    :param voc: vocabulary; ``voc.num_words`` sizes the embedding and the
        decoder output layer.
    :param cp_start_iteration: iteration to report when no checkpoint is
        loaded; otherwise replaced by the checkpoint's ``'iteration'``.
    :param attn_model: attention method name forwarded to the decoder.
    :param hidden_size: embedding and hidden size.
    :param encoder_n_layers: number of encoder GRU layers.
    :param decoder_n_layers: number of decoder GRU layers.
    :param dropout: dropout value forwarded to both models.
    :param learning_rate: encoder learning rate.
    :param decoder_learning_ratio: multiplier applied to ``learning_rate``
        for the decoder optimizer.
    :return: ``(cp_start_iteration, encoder, decoder, encoder_optimizer,
        decoder_optimizer, embedding)``.
    """
    # Check for the file once and remember the result, so a checkpoint that
    # appears or disappears mid-call cannot leave half of the state undefined
    checkpoint = None
    if os.path.exists(loadFilename):
        # Load onto the CPU: the models below are created on the CPU and the
        # callers move them to their device afterwards. This also makes
        # checkpoints saved on a GPU usable on a CPU-only machine.
        checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))
        cp_start_iteration = checkpoint['iteration']

    print('Building encoder and decoder ...')
    # Initialize word embeddings
    embedding = nn.Embedding(voc.num_words, hidden_size)
    if checkpoint is not None:
        embedding.load_state_dict(checkpoint['embedding'])

    print('Initialize encoder & decoder models')
    encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
    decoder = LuongAttnDecoderRNN(attn_model, embedding, hidden_size, voc.num_words, decoder_n_layers, dropout)

    print('Building optimizers ...')
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
    if checkpoint is not None:
        encoder.load_state_dict(checkpoint['state_dict_en'])
        decoder.load_state_dict(checkpoint['state_dict_de'])
        encoder_optimizer.load_state_dict(checkpoint['state_dict_en_opt'])
        decoder_optimizer.load_state_dict(checkpoint['state_dict_de_opt'])

    print('Models built and ready to go!')
    return cp_start_iteration, encoder, decoder, encoder_optimizer, decoder_optimizer, embedding
from "src/1.\350\257\255\350\250\200\345\244\204\347\220\206\344\270\216Python/test.py" rename to "src/Python\350\207\252\347\204\266\350\257\255\350\250\200\345\244\204\347\220\206/test.py"