From 723ff56409fb00d136eb0c621d54594042012bf7 Mon Sep 17 00:00:00 2001
From: chenlong <chenlong@csdn.net>
Date: Tue, 18 Apr 2023 16:40:50 +0800
Subject: [PATCH] Update README; add BELLE-to-CSV conversion and instruction-format SFT dataset

---
 README.md      | 20 +++++++++++---------
 clean_data.py  | 23 +++++++++++++++++++++--
 src/dataset.py | 22 ++++++++++++++--------
 train_sft.py   |  2 +-
 4 files changed, 47 insertions(+), 20 deletions(-)

diff --git a/README.md b/README.md
index 96054cf..97ea952 100644
--- a/README.md
+++ b/README.md
@@ -1,17 +1,19 @@
 
-## chatCSDN
-源码来源于https://github.com/BlinkDL/RWKV-LM/tree/main/RWKV-v4neo
-主要是在作者提供的1.5B参数的基础之上，使用CSDN的问答数据和博客数据进行再次训练，经过Prompt-tuning和Instruction-Tuning，以及RLHF等微调之后得到拥有IT行业知识体系的chatCSDN。
+## ChatCSDN
+ChatCSDN基于RWKV 1.5B基模型。
+源码来源于：https://github.com/BlinkDL/RWKV-LM/tree/main/RWKV-v4neo
+主要是在RWKV提供的1.5B参数的基础之上，使用CSDN的问答数据和博客数据进行增量预训练，经过指令微调，得到拥有IT行业知识体系的大语言模型。
 原始模型参数地址：https://huggingface.co/BlinkDL/rwkv-4-pile-1b5
 
 ## 预处理数据
-使用项目 https://github.com/EleutherAI/gpt-neox 提供的数据转换工具讲.jsonl文件转换为.bin和.idx文件，目前已经将代码全部移植过来，放在tools文件夹中。词典使用的是20B_tokenizer.json。
+使用项目 https://github.com/EleutherAI/gpt-neox 提供的数据转换工具将.jsonl文件转换为.bin和.idx文件，目前已经将代码全部移植过来，放在tools文件夹中。词典使用的是20B_tokenizer.json。
 jsonl文件格式示例：
 ```
 {"meta": {"ID": 101}, "text": "This is the first document."}
 {"meta": {"ID": 102}, "text": "Hello\nWorld"}
 {"meta": {"ID": 103}, "text": "1+1=2\n1+2=3\n2+2=4"}
 ```
+
 使用clean_data.py中的clean_ask_data和clean_blog_data方法可以将从odps中拉取的数据转换成jsonl文件。
 进入tools文件夹下使用如下命令进行转换：
 ```
@@ -32,10 +34,10 @@ python preprocess_data.py \
             --append-eod
 ```
 
-## 训练示例
+## 预训练示例
 
 ```
-python train.py  --load_model "rwkv-80.pth" --wandb "" --proj_dir "out" \
+python train.py  --load_model "RWKV-4-Pile-1B5-EngChn-test4-20230115.pth" --wandb "" --proj_dir "out" \
 --data_file "data/blog_text_document" --data_type "binidx" --vocab_size 50277 \
 --ctx_len 1024 --epoch_steps 200 --epoch_count 1000 --epoch_begin 0 --epoch_save 10 \
 --micro_bsz 8 --n_layer 24 --n_embd 2048 --pre_ffn 0 --head_qk 0 \
@@ -46,12 +48,12 @@ python train.py  --load_model "rwkv-80.pth" --wandb "" --proj_dir "out" \
 
 ## 接入Instruction Tuning
 
-使用指令数据集进行监督训练，精调语言模型，指令数据集格式为句子对。这部分数据需要由开发人员来进行编写，有的语料需要涉及到推理过程。
+使用指令数据集进行监督训练，精调语言模型，指令数据可以看成是问答对，训练时对答案部分进行loss计算。这部分数据来源于[BELLE](https://github.com/LianjiaTech/BELLE)开源的数据集。
 
 ```
-python train_sft.py  --load_model "rwkv-500.pth" --wandb "" --proj_dir "out_sft" \
+python train_sft.py  --load_model "out/rwkv-790.pth" --wandb "" --proj_dir "out_sft" \
 --data_file "data/prompts.csv" --data_type "utf-8" --vocab_size 50277 \
---ctx_len 1024 --epoch_steps 200 --epoch_count 1000 --epoch_begin 0 --epoch_save 5 \
+--ctx_len 1024 --epoch_steps 200 --epoch_count 1000 --epoch_begin 0 --epoch_save 20 \
 --micro_bsz 8 --n_layer 24 --n_embd 2048 --pre_ffn 0 --head_qk 0 \
 --lr_init 1e-5 --lr_final 1e-5 --warmup_steps 0 --beta1 0.9 --beta2 0.999 --adam_eps 1e-8 \
 --accelerator gpu --devices 1 --precision bf16 --strategy deepspeed_stage_2_offload --grad_cp 1 \
diff --git a/clean_data.py b/clean_data.py
index c7fdadd..143b682 100644
--- a/clean_data.py
+++ b/clean_data.py
@@ -67,7 +67,26 @@ def clean_ask_data():
             meta = {"ID": index}
             ss = json.dumps({"meta": meta, "text": content}, check_circular=False)
             file_w.write(ss + "\n")
-            index +=1
+            index += 1
 
 
-clean_ask_data()
+def belle_to_csv():
+    """Convert the BELLE JSON-lines dump into the input/target CSV that SFT training reads."""
+    import json
+    import pandas as pd
+    belle_data_path = "./data/Belle.train.json"
+    prompts_path = "./data/prompts.csv"
+    data_list = []
+    # Pin UTF-8: open() otherwise uses the locale encoding, which can mis-decode non-ASCII text.
+    with open(belle_data_path, encoding="utf-8") as file:
+        for line in file:
+            line = line.strip()
+            if not line:  # a blank line would make json.loads raise
+                continue
+            json_obj = json.loads(line)
+            data_list.append({"input": json_obj["input"], "target": json_obj["target"]})
+            print(json_obj["input"])
+
+    pf = pd.DataFrame(data_list, columns=["input", "target"])
+    pf.to_csv(prompts_path, index=False)
+
diff --git a/src/dataset.py b/src/dataset.py
index 2a0e78a..993696e 100644
--- a/src/dataset.py
+++ b/src/dataset.py
@@ -237,21 +237,27 @@ class S2SDataset(Dataset):
         data_list = []
 
         for index, row in pf.iterrows():
-            question = row["question"]
-            answer = row["answer"]
-            data_list.append((self.tokenizer.tokenizer.encode(question),
-                              self.tokenizer.tokenizer.encode("\n"),
-                              self.tokenizer.tokenizer.encode(answer)))
+            input = row["input"]
+            target = row["target"]
+            input_instruction = f"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction: {input}\n\n### Response:"
+            input_tokens = self.tokenizer.tokenizer.encode(input_instruction)
+            target_tokens = self.tokenizer.tokenizer.encode(target)
+            if len(input_tokens) + len(target_tokens) > self.args.ctx_len:
+                continue
+            data_list.append((input_tokens, target_tokens))
         self.data = data_list
 
     def __len__(self):
-        return len(self.data)
+        return self.args.epoch_steps * self.args.micro_bsz  # nominal epoch size (steps x micro-batch), not the row count; __getitem__ picks a random row
 
     def __getitem__(self, index):
+
         ctx_len = self.args.ctx_len
         req_len = ctx_len + 1
-        question, sep, answer = self.data[index]
-        text = question + sep + answer
+
+        i = np.random.randint(0, len(self.data))
+        question, answer = self.data[i]
+        text = question + answer
         text = text[:req_len]
 
         text = text + [0] * (req_len - len(text))
diff --git a/train_sft.py b/train_sft.py
index 51275db..f3f2103 100644
--- a/train_sft.py
+++ b/train_sft.py
@@ -266,6 +266,6 @@ if __name__ == "__main__":
         trainer.strategy.config["zero_optimization"]["reduce_bucket_size"] = args.ds_bucket_mb * 1000 * 1000
 
     # must set shuffle=True, persistent_workers=False (because worker is in another thread)
-    data_loader = DataLoader(train_data, shuffle=True, pin_memory=True, batch_size=args.micro_bsz, num_workers=1, persistent_workers=False, drop_last=True)
+    data_loader = DataLoader(train_data, shuffle=False, pin_memory=True, batch_size=args.micro_bsz, num_workers=1, persistent_workers=False, drop_last=True)
 
     trainer.fit(model, data_loader)
-- 
GitLab