From 182e4f318af459ae73de2e26cafb95f30bb3f2db Mon Sep 17 00:00:00 2001
From: Chengmo
Date: Tue, 22 Sep 2020 10:28:47 +0800
Subject: [PATCH] add gen_tree (#214)

Co-authored-by: wuzhihua <35824027+fuyinno4@users.noreply.github.com>
---
 models/treebased/tdm/README.md                |   1 +
 models/treebased/tdm/build_tree.md            |  19 --
 models/treebased/tdm/config.yaml              |  28 +-
 models/treebased/tdm/gen_tree/README.md       | 120 +++++++
 models/treebased/tdm/gen_tree/__init__.py     |  17 +
 models/treebased/tdm/gen_tree/cluster.py      | 311 ++++++++++++++++++
 models/treebased/tdm/gen_tree/emb_util.py     |  73 ++++
 models/treebased/tdm/gen_tree/gen_tree.py     |  52 +++
 models/treebased/tdm/gen_tree/tree_builder.py |  46 +++
 models/treebased/tdm/gen_tree/tree_impl.py    | 122 +++++++
 .../tdm/gen_tree/tree_search_util.py          | 206 ++++++++++++
 11 files changed, 957 insertions(+), 38 deletions(-)
 delete mode 100644 models/treebased/tdm/build_tree.md
 create mode 100644 models/treebased/tdm/gen_tree/README.md
 create mode 100644 models/treebased/tdm/gen_tree/__init__.py
 create mode 100644 models/treebased/tdm/gen_tree/cluster.py
 create mode 100644 models/treebased/tdm/gen_tree/emb_util.py
 create mode 100644 models/treebased/tdm/gen_tree/gen_tree.py
 create mode 100644 models/treebased/tdm/gen_tree/tree_builder.py
 create mode 100644 models/treebased/tdm/gen_tree/tree_impl.py
 create mode 100644 models/treebased/tdm/gen_tree/tree_search_util.py

diff --git a/models/treebased/tdm/README.md b/models/treebased/tdm/README.md
index 14f25323..ea53f44c 100644
--- a/models/treebased/tdm/README.md
+++ b/models/treebased/tdm/README.md
@@ -13,6 +13,7 @@ cd paddle-rec
 python -m paddlerec.run -m models/treebased/tdm/config.yaml
 ```
+3. For the details of tree building and custom training, see [TDM-Demo: tree building and training](./gen_tree/README.md)
 
 ## Preparing the tree structure
 ### Terminology
diff --git a/models/treebased/tdm/build_tree.md b/models/treebased/tdm/build_tree.md
deleted file mode 100644
index 37ecb68f..00000000
--- a/models/treebased/tdm/build_tree.md
+++ /dev/null
@@ -1,19 +0,0 @@
-
-
-wget https://paddlerec.bj.bcebos.com/utils/tree_build_utils.tar.gz --no-check-certificate
-
-# input_path: path to the embedding file
-# emb_shape: dimension of the value in each embedding key-value pair
-# emb format: embedding_id(int64),embedding(float),embedding(float),......,embedding(float)
-# cluster_threads: number of threads used for the clustering step of tree building
-python_172_anytree/bin/python -u main.py --input_path=./gen_emb/item_emb.txt --output_path=./ --emb_shape=24 --cluster_threads=4
-
-The tree-building flow is: 1. read the embeddings -> 2. k-means clustering -> 3. organize the clustering result into a tree -> 4. derive the four files the model needs from the tree structure
-    1 Layer_list: records which nodes sit on each layer. Used for training.
-    2 Travel_list: records the root-to-leaf path of every leaf node. Used for training.
-    3 Tree_Info: records the information of each node, mainly: whether it is an item / its item_id, its layer, its parent, and its children. Used for retrieval.
-    4 Tree_Embedding: records the embeddings of all nodes. Used for training and retrieval.
-
-Note whether the items fed to training are the original item ids used before tree building, tree node ids, or leaf ids; in tdm_reader.py you can load a dictionary to do the mapping.
-The output folder produced by the internal tree-building tool contains a mapping file named id2nodeid.txt, in the format "hash value" + "tree node id" + "leaf node id" (the index of the leaf node, the input required by the tdm_sampler op).
-Another file, id2bidword.txt, holds the mapping "hash value" + "original item id"; it stores only leaf-node information.
diff --git a/models/treebased/tdm/config.yaml b/models/treebased/tdm/config.yaml
index fed727ad..c1e6f6a4 100755
--- a/models/treebased/tdm/config.yaml
+++ b/models/treebased/tdm/config.yaml
@@ -59,49 +59,39 @@ hyper_parameters:
   tree_emb_path: "{workspace}/tree/tree_emb.npy"
 
 # select runner by name
-mode: runner1
-# config of each runner.
-# runner is a kind of paddle training class, which wraps the train/infer process.
+mode: [runner1]
+
 runner:
 - name: runner1
   class: train
   startup_class_path: "{workspace}/tdm_startup.py"
-  # num of epochs
   epochs: 10
-  # device to run training or infer
   device: cpu
   save_checkpoint_interval: 2 # save model interval of epochs
-  save_inference_interval: 4 # save inference
   save_checkpoint_path: "increment" # save checkpoint path
-  save_inference_path: "inference" # save inference path
-  save_inference_feed_varnames: [] # feed vars of save inference
-  save_inference_fetch_varnames: [] # fetch vars of save inference
   init_model_path: "" # load model path
   print_interval: 10
+  phases: [phase1]
 
 - name: runner2
   class: infer
   startup_class_path: "{workspace}/tdm_startup.py"
-  # device to run training or infer
   device: cpu
   init_model_path: "increment/0" # load model path
   print_interval: 1
+  phases: [phase2]
 
 - name: runner3
   class: local_cluster_train
   startup_class_path: "{workspace}/tdm_startup.py"
   fleet_mode: ps
   epochs: 10
-  # device to run training or infer
   device: cpu
   save_checkpoint_interval: 2 # save model interval of epochs
-  save_inference_interval: 4 # save inference
   save_checkpoint_path: "increment" # save checkpoint path
-  save_inference_path: "inference" # save inference path
-  save_inference_feed_varnames: [] # feed vars of save inference
-  save_inference_fetch_varnames: [] # fetch vars of save inference
   init_model_path: "init_model" # load model path
   print_interval: 10
+  phases: [phase1]
 
 # runner will run all the phase in each epoch
 phase:
@@ -109,7 +99,7 @@ phase:
   model: "{workspace}/model.py" # user-defined model
   dataset_name: dataset_train # select dataset by name
   thread_num: 1
-# - name: phase2
-#   model: "{workspace}/model.py"
-#   dataset_name: dataset_infer
-#   thread_num: 2
+- name: phase2
+  model: "{workspace}/model.py"
+  dataset_name: dataset_infer
+  thread_num: 2
diff --git a/models/treebased/tdm/gen_tree/README.md b/models/treebased/tdm/gen_tree/README.md
new file mode 100644
index 00000000..0444c400
--- /dev/null
+++ b/models/treebased/tdm/gen_tree/README.md
@@ -0,0 +1,120 @@
+# TDM-Demo: tree building and training
+
+## Environment for tree building
+Requirements:
+- python >= 2.7
+- paddlepaddle >= 1.7.2 (1.7.2 recommended)
+- paddle-rec (clone the PaddleRec repo from GitHub and run `python setup.py install`)
+- sklearn
+- anytree
+
+
+## Tree-building workflow
+
+### Generate the embeddings needed for tree building
+
+- Generate fake embeddings
+
+```shell
+cd gen_tree
+python -u emb_util.py
+```
+
+The generated embedding matrix has shape [13, 64]: there are 13 items in total, and each item's embedding has 64 dimensions. The generated item_emb is written to `gen_tree/item_emb.txt`.
+
+The format is `emb_value_0(float) <space> emb_value_1(float) ... emb_value_63(float) \t item_id`
+
+In the demo, item ids must start from 0 and lie in the range [0, item_nums-1].
+
+In real-world scenarios this requirement can be met with any suitable hash mapping.
+
+### Cluster the item embeddings into a tree
+
+Run
+
+```shell
+cd gen_tree
+# emd_path: path to item_emb
+# emb_size: the second dimension of item_emb, i.e. the size of each item's embedding (64 in this example)
+# threads: number of threads used for multi-threaded tree building
+# n_clusters: the branching factor of the final tree; set to 2 here for a binary tree
+python gen_tree.py --emd_path item_emb.txt --emb_size 64 --output_dir ./output --threads 1 --n_clusters 2
+```
+
+The generated tree-structure files needed for training are placed in `gen_tree/output`
+```shell
+.
+├── id2item.json        # mapping from tree node id to item id
+├── layer_list.txt      # which nodes sit on each layer of the tree
+├── travel_list.npy     # root-to-leaf path of every item, ordered by item id
+├── travel_list.txt     # plain-text version of the file above
+├── tree_embedding.txt  # embeddings of all nodes, ordered by node id
+├── tree_emb.npy        # .npy version of the file above
+├── tree_info.npy       # per node: associated item (if any) / layer / parent / children, ordered by node id
+├── tree_info.txt       # plain-text version of the file above
+└── tree.pkl            # the tree structure produced by clustering
+```
+
+In the end, the following four files produced by tree building take part in network training; see `models/treebased/tdm/config.yaml`. A quick sanity check for them is sketched right after this list.
+
+1. layer_list.txt
+2. travel_list.npy
+3. tree_info.npy
+4. tree_emb.npy
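+
+To verify the four files before wiring them into training, the snippet below may help. It is an illustrative sketch (not part of the generated toolchain) and assumes the default `./output` directory used above:
+
+```python
+import numpy as np
+
+tree_info = np.load("output/tree_info.npy")      # one row per tree node
+travel_list = np.load("output/travel_list.npy")  # one row per item (leaf)
+with open("output/layer_list.txt") as f:
+    layers = [line.strip().split(",") for line in f if line.strip()]
+
+print("node_nums:", tree_info.shape[0])                  # 26 in this demo
+print("leaf_node_nums:", travel_list.shape[0])           # 13 in this demo
+print("max_layers:", len(layers))                        # root layer is not listed
+print("layer_node_num_list:", [len(l) for l in layers])  # [2, 4, 8, 10] in this demo
+```
+
+These values map directly onto the `node_nums`, `leaf_node_nums`, `max_layers` and `layer_node_num_list` entries of `config.yaml` described in the next section.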
+
+
+### Run training
+
+- Update the settings in `config.yaml`
+
+First change
+```yaml
+hyper_parameters:
+  # ...
+  tree:
+    # For single-machine training it is recommended to load the tree only once,
+    # save it as paddle tensors, and then warm-start from the paddle model.
+    # For distributed training, each trainer needs to load the tree independently.
+    # For inference, also switch to loading from the paddle model.
+    load_tree_from_numpy: True # only once
+    load_paddle_model: False # train & infer need, after load from npy, change it to True
+    tree_layer_path: "{workspace}/tree/layer_list.txt"
+    tree_travel_path: "{workspace}/tree/travel_list.npy"
+    tree_info_path: "{workspace}/tree/tree_info.npy"
+    tree_emb_path: "{workspace}/tree/tree_emb.npy"
+```
+Point each of the paths above to the corresponding file produced by tree building.
+
+Then change
+```yaml
+hyper_parameters:
+  max_layers: 4 # number of tree layers, excluding the root
+  node_nums: 26 # total number of tree nodes, equal to the number of rows in the tree_info file
+  leaf_node_nums: 13 # total number of leaf nodes
+  layer_node_num_list: [2, 4, 8, 10] # number of nodes on each layer
+  child_nums: 2 # maximum number of children per node (the branching factor)
+  neg_sampling_list: [1, 2, 3, 4] # number of negative samples drawn on each layer; a user-defined training parameter
+```
+
+If you do not know the exact values of these parameters, do a trial run: after PaddleRec reads the files produced by tree building, it prints the details to the screen, as shown below:
+```shell
+...
+File_list: ['models/treebased/tdm/data/train/demo_fake_input.txt']
+2020-09-10 15:17:19,259 - INFO - Run TDM Trainer Startup Pass
+2020-09-10 15:17:19,283 - INFO - load tree from numpy
+2020-09-10 15:17:19,284 - INFO - TDM Tree leaf node nums: 13
+2020-09-10 15:17:19,284 - INFO - TDM Tree max layer: 4
+2020-09-10 15:17:19,284 - INFO - TDM Tree layer_node_num_list: [2, 4, 8, 10]
+2020-09-10 15:17:19,285 - INFO - Begin Save Init model.
+2020-09-10 15:17:19,394 - INFO - End Save Init model.
+Running SingleRunner.
+...
+```
+Copy those values into the config.
+
+- Train
+
+Run
+```
+cd /PaddleRec # the root directory of the cloned PaddleRec repo
+python -m paddlerec.run -m models/treebased/tdm/config.yaml
+```
diff --git a/models/treebased/tdm/gen_tree/__init__.py b/models/treebased/tdm/gen_tree/__init__.py
new file mode 100644
index 00000000..2dead222
--- /dev/null
+++ b/models/treebased/tdm/gen_tree/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from . import cluster
+
+__all__ = []
+__all__ += cluster.__all__
diff --git a/models/treebased/tdm/gen_tree/cluster.py b/models/treebased/tdm/gen_tree/cluster.py
new file mode 100644
index 00000000..9b815ad1
--- /dev/null
+++ b/models/treebased/tdm/gen_tree/cluster.py
@@ -0,0 +1,311 @@
+# Copyright (C) 2016-2018 Alibaba Group Holding Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
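+
+# Module overview: hierarchical balanced k-means clustering of item
+# embeddings. Cluster._read() parses "emb \t item_id" lines (averaging the
+# embeddings of duplicated items), then train() fans the item set out to
+# `parall` worker processes that recursively split it into n_clusters
+# groups with sklearn's KMeans. Every item ends up with a heap-style code,
+# and tree_builder.TreeBuilder turns the (id, code) pairs into the tree.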
+
+from __future__ import print_function
+import codecs
+import os
+import time
+import collections
+import argparse
+
+import multiprocessing as mp
+import numpy as np
+
+from sklearn.cluster import KMeans
+import tree_builder
+
+__all__ = ['Cluster']
+
+
+class Cluster:
+    def __init__(self,
+                 filename,
+                 emb_size,
+                 id_offset=None,
+                 parall=16,
+                 prev_result=None,
+                 output_dir='./',
+                 _n_clusters=2):
+        self.filename = filename
+        self.emb_size = emb_size
+        self.mini_batch = 256
+        self.ids = None
+        self.data = None
+        self.items = None
+        self.parall = parall
+        self.queue = None
+        self.timeout = 5
+        self.id_offset = id_offset
+        self.codes = None
+        self.prev_result = prev_result
+        self.output_dir = output_dir
+        self.n_clusters = _n_clusters
+
+    def _read(self):
+        t1 = time.time()
+        ids = list()
+        data = list()
+        items = list()
+        count = 0
+        with codecs.open(self.filename, 'r', encoding='utf-8') as f:
+            for line in f:
+                arr = line.rstrip().split('\t')
+                if not arr:
+                    break
+                elif len(arr) == 1:
+                    # no embedding given: fall back to a random vector
+                    label = arr[0]
+                    emb_vec = (np.random.random_sample(
+                        (self.emb_size, ))).tolist()
+                elif len(arr) == 2:
+                    label = arr[1]
+                    emb_vec = arr[0].split()
+                if len(emb_vec) != self.emb_size:
+                    continue
+                if label in items:
+                    # duplicate item: accumulate the embedding, averaged below
+                    index = items.index(label)
+                    for i in range(0, len(emb_vec)):
+                        data[index][i + 1] += float(emb_vec[i])
+                    data[index][0] += 1
+                else:
+                    items.append(label)
+                    ids.append(count)
+                    count += 1
+                    vector = list()
+                    vector.append(1)  # position 0 holds the sample count
+                    for i in range(0, len(emb_vec)):
+                        vector.append(float(emb_vec[i]))
+                    data.append(vector)
+        # average the accumulated vectors and drop the count column
+        for i in range(len(data)):
+            data_len = len(data[i])
+            for j in range(1, data_len):
+                data[i][j] /= data[i][0]
+            data[i] = data[i][1:]
+        self.ids = np.array(ids)
+        self.data = np.array(data)
+        self.items = np.array(items)
+        t2 = time.time()
+
+        print("Read data done, {} records read, elapsed: {}".format(
+            len(ids), t2 - t1))
+
+    def train(self):
+        ''' Cluster data '''
+        self._read()
+        queue = mp.Queue()
+        self.process_prev_result(queue)
+        processes = []
+        pipes = []
+        for _ in range(self.parall):
+            a, b = mp.Pipe()
+            p = mp.Process(target=self._train, args=(b, queue))
+            processes.append(p)
+            pipes.append(a)
+            p.start()
+
+        # merge the per-worker code arrays; entries a worker did not
+        # touch stay 0
+        self.codes = np.zeros((len(self.ids), ), dtype=np.int64)
+        for pipe in pipes:
+            codes = pipe.recv()
+            for i in range(len(codes)):
+                if codes[i] > 0:
+                    self.codes[i] = codes[i]
+
+        for p in processes:
+            p.join()
+
+        assert queue.empty()
+        builder = tree_builder.TreeBuilder(self.output_dir, self.n_clusters)
+        builder.build(self.ids, self.codes, items=self.items, data=self.data)
+
+    def process_prev_result(self, queue):
+        if not self.prev_result:
+            queue.put((0, np.array(range(len(self.ids)))))
+            return True
+
+        di = dict()
+        for i, node_id in enumerate(self.ids):
+            di[node_id] = i
+
+        indexes = []
+        clusters = []
+        with open(self.prev_result) as f:
+            for line in f:
+                arr = line.split(",")
+                if len(arr) < 2:
+                    break
+                ni = [di[int(m)] for m in arr]
+                clusters.append(ni)
+                indexes += ni
+        assert len(set(indexes)) == len(self.ids), \
+            "ids count: {}, index count: {}".format(len(self.ids),
+                                                    len(set(indexes)))
+        count = len(clusters)
+        assert (count & (count - 1)) == 0, \
+            "Prev cluster count: {}".format(count)
+        for i, ni in enumerate(clusters):
+            queue.put((i + count - 1, np.array(ni)))
+        return True
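+
+    # Worker loop: each process pulls (pcode, index) jobs from the shared
+    # queue, splits `index` into n_clusters groups with k-means, and pushes
+    # one child job per non-singleton group back onto the queue. Child i of
+    # parent code pcode gets code n_clusters * pcode + i + 1, the heap-style
+    # numbering later decoded by tree_impl._ancessors. A worker exits once it
+    # has processed at least one job and the queue has stayed empty across
+    # several timeouts; it then sends its partial code array back via the pipe.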
+    def _train(self, pipe, queue):
+        last_size = -1
+        catch_time = 0
+        processed = False
+        code = np.zeros((len(self.ids), ), dtype=np.int64)
+        while True:
+            # retry a few times before concluding the queue has gone quiet
+            for _ in range(3):
+                try:
+                    pcode, index = queue.get(timeout=self.timeout)
+                except Exception:
+                    index = None
+                if index is not None:
+                    break
+
+            if index is None:
+                if processed and (last_size <= self.mini_batch or
+                                  catch_time >= 3):
+                    print("Process {} exits".format(os.getpid()))
+                    break
+                else:
+                    print("Got empty job, pid: {}, time: {}".format(os.getpid(
+                    ), catch_time))
+                    catch_time += 1
+                    continue
+
+            processed = True
+            catch_time = 0
+            last_size = len(index)
+            if last_size <= self.mini_batch:
+                self._minbatch(pcode, index, code)
+            else:
+                start = time.time()
+                sub_index = self._cluster(index)
+                print("Train iteration done, pcode:{}, "
+                      "data size: {}, elapsed time: {}"
+                      .format(pcode, len(index), time.time() - start))
+                # adapt the queue timeout to the observed iteration time
+                self.timeout = int(0.4 * self.timeout + 0.6 * (time.time() -
+                                                               start))
+                if self.timeout < 5:
+                    self.timeout = 5
+
+                for i in range(self.n_clusters):
+                    if len(sub_index[i]) > 1:
+                        queue.put(
+                            (self.n_clusters * pcode + i + 1, sub_index[i]))
+
+        process_count = 0
+        for c in code:
+            if c > 0:
+                process_count += 1
+        print("Process {} process {} items".format(os.getpid(), process_count))
+        pipe.send(code)
+
+    def _minbatch(self, pcode, index, code):
+        # small jobs are split to completion inside this process
+        dq = collections.deque()
+        dq.append((pcode, index))
+        batch_size = len(index)
+        tstart = time.time()
+        while dq:
+            pcode, index = dq.popleft()
+
+            if len(index) <= self.n_clusters:
+                for i in range(len(index)):
+                    code[index[i]] = self.n_clusters * pcode + i + 1
+                continue
+
+            sub_index = self._cluster(index)
+            for i in range(self.n_clusters):
+                if len(sub_index[i]) > 1:
+                    dq.append((self.n_clusters * pcode + i + 1, sub_index[i]))
+                elif len(sub_index[i]) > 0:
+                    for j in range(len(sub_index[i])):
+                        code[sub_index[i][j]] = self.n_clusters * \
+                            pcode + i + j + 1
+
+        print("Minbatch, batch size: {}, elapsed: {}".format(
+            batch_size, time.time() - tstart))
+
+    def _cluster(self, index):
+        data = self.data[index]
+        kmeans = KMeans(n_clusters=self.n_clusters, random_state=0).fit(data)
+        labels = kmeans.labels_
+        sub_indexes = []
+        remain_index = []
+        # integer division: ave_num is used as a slice bound below
+        ave_num = len(index) // self.n_clusters
+
+        for i in range(self.n_clusters):
+            sub_i = np.where(labels == i)[0]
+            sub_index = index[sub_i]
+            if len(sub_index) <= ave_num:
+                sub_indexes.append(sub_index)
+            else:
+                # keep the ave_num points closest to this centroid and hand
+                # the rest over to under-filled clusters
+                distances = kmeans.transform(data[sub_i])[:, i]
+                sorted_index = sub_index[np.argsort(distances)]
+                sub_indexes.append(sorted_index[:ave_num])
+                remain_index.extend(list(sorted_index[ave_num:]))
+
+        idx = 0
+        while idx < self.n_clusters and len(remain_index) > 0:
+            if len(sub_indexes[idx]) >= ave_num:
+                idx += 1
+            else:
+                diff = min(len(remain_index), ave_num - len(sub_indexes[idx]))
+                sub_indexes[idx] = np.append(sub_indexes[idx],
+                                             np.array(remain_index[0:diff]))
+                remain_index = remain_index[diff:]
+                idx += 1
+        if len(remain_index) > 0:
+            sub_indexes[0] = np.append(sub_indexes[0], np.array(remain_index))
+
+        return sub_indexes
+
+    def _rebalance(self, lindex, rindex, distances):
+        sorted_index = rindex[np.argsort(distances)]
+        idx = np.concatenate((lindex, sorted_index))
+        mid = len(idx) // 2
+        return idx[mid:], idx[:mid]
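+
+# Standalone usage note: gen_tree.py is the intended driver for tree
+# building. When cluster.py is run directly through this entry point,
+# Cluster keeps its defaults output_dir='./' and _n_clusters=2 (a binary
+# tree), since only embed_file/emb_size/id_offset/parall/prev_result
+# are passed.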
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Tree cluster")
+    parser.add_argument(
+        "--embed_file",
+        required=True,
+        help="filename of the embedded vector file")
+    parser.add_argument(
+        "--emb_size",
+        type=int,
+        default=64,
+        help="dimension of input embedded vector")
+    parser.add_argument(
+        "--id_offset",
+        default=None,
+        help="id offset of the generated tree internal node")
+    parser.add_argument(
+        "--parall",
+        type=int,
+        default=16,
+        help="number of parallel clustering processes")
+    parser.add_argument(
+        "--prev_result",
+        default=None,
+        help="filename of the previous cluster result")
+
+    arguments = parser.parse_args()
+    t1 = time.time()
+    cluster = Cluster(arguments.embed_file, arguments.emb_size,
+                      arguments.id_offset, arguments.parall,
+                      arguments.prev_result)
+    cluster.train()
+    t2 = time.time()
+    print("Train complete successfully, elapsed: {}".format(t2 - t1))
diff --git a/models/treebased/tdm/gen_tree/emb_util.py b/models/treebased/tdm/gen_tree/emb_util.py
new file mode 100644
index 00000000..2a4c2009
--- /dev/null
+++ b/models/treebased/tdm/gen_tree/emb_util.py
@@ -0,0 +1,73 @@
+# -*- coding=utf8 -*-
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+
+import numpy as np
+import paddle.fluid as fluid
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--mode",
+    default="create_fake_emb",
+    choices=["create_fake_emb", "save_item_emb"],
+    type=str,
+    help=".")
+parser.add_argument("--emb_id_nums", default=13, type=int, help=".")
+parser.add_argument("--emb_shape", default=64, type=int, help=".")
+parser.add_argument("--emb_path", default='./item_emb.txt', type=str, help='.')
+args = parser.parse_args()
+
+
+def create_fake_emb(emb_id_nums, emb_shape, emb_path):
+    x = fluid.data(name="item", shape=[1], lod_level=1, dtype="int64")
+
+    # use layers.embedding to init emb value
+    item_emb = fluid.layers.embedding(
+        input=x,
+        is_sparse=True,
+        size=[emb_id_nums, emb_shape],
+        param_attr=fluid.ParamAttr(
+            name="Item_Emb",
+            initializer=fluid.initializer.TruncatedNormal(
+                loc=0.0, scale=2.0)))
+
+    # run startup to init emb tensor
+    exe = fluid.Executor(fluid.CPUPlace())
+    exe.run(fluid.default_startup_program())
+
+    # fetch the initialized embedding as np.array
+    print("Get Emb")
+    item_emb_array = np.array(fluid.global_scope().find_var("Item_Emb")
+                              .get_tensor())
+    # one line per item: space-separated floats, a tab, then the item id
+    with open(emb_path, 'w+') as f:
+        emb_str = ""
+        for index, value in enumerate(item_emb_array):
+            line = []
+            for v in value:
+                line.append(str(v))
+            line_str = " ".join(line)
+            line_str += "\t"
+            line_str += str(index)
+            line_str += "\n"
+            emb_str += line_str
+        f.write(emb_str)
+    print("Item Emb write Finish")
+
+
+if __name__ == "__main__":
+    create_fake_emb(args.emb_id_nums, args.emb_shape, args.emb_path)
diff --git a/models/treebased/tdm/gen_tree/gen_tree.py b/models/treebased/tdm/gen_tree/gen_tree.py
new file mode 100644
index 00000000..cb75d7a3
--- /dev/null
+++ b/models/treebased/tdm/gen_tree/gen_tree.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import time
+
+from cluster import Cluster
+from tree_search_util import tree_search_main
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--emd_path", default='', type=str, help=".")
+parser.add_argument("--emb_size", default=64, type=int, help=".")
+parser.add_argument("--threads", default=1, type=int, help=".")
+parser.add_argument("--n_clusters", default=3, type=int, help=".")
+parser.add_argument("--output_dir", default='', type=str, help='.')
+args = parser.parse_args()
+
+
+def main():
+    cur_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
+    if not os.path.exists(args.output_dir):
+        os.system("mkdir -p " + args.output_dir)
+    print('%s start build tree' % cur_time)
+
+    # 1. Tree clustering; writes tree.pkl and id2item.json into output_dir
+    cluster = Cluster(
+        args.emd_path,
+        args.emb_size,
+        parall=args.threads,
+        output_dir=args.output_dir,
+        _n_clusters=args.n_clusters)
+    cluster.train()
+
+    # 2. Tree searching; generates tree_info, travel_list and layer_list
+    #    for the training process
+    tree_search_main(
+        os.path.join(args.output_dir, "tree.pkl"),
+        os.path.join(args.output_dir, "id2item.json"), args.output_dir,
+        args.n_clusters)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/models/treebased/tdm/gen_tree/tree_builder.py b/models/treebased/tdm/gen_tree/tree_builder.py
new file mode 100644
index 00000000..2fd73340
--- /dev/null
+++ b/models/treebased/tdm/gen_tree/tree_builder.py
@@ -0,0 +1,46 @@
+# Copyright (C) 2016-2018 Alibaba Group Holding Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
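+
+# TreeBuilder bridges the clustering stage and the tree construction in
+# tree_impl: build() hands the (ids, codes, data, items) produced by
+# cluster.Cluster over to tree_impl._build, which materializes the anytree
+# structure and writes tree.pkl and id2item.json. Codes use a heap-style
+# layout: the children of code c are n_clusters * c + 1 .. n_clusters * c
+# + n_clusters, so the parent of c is (c - 1) // n_clusters.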
+ +from __future__ import print_function +import numpy as np + +import sys +import os +import codecs +from tree_impl import _build + +_CUR_DIR = os.path.dirname(os.path.realpath(__file__)) +sys.path.append(os.path.join(_CUR_DIR, "..")) + + +class TreeBuilder: + def __init__(self, output_dir='./', n_clusters=2): + self.output_dir = output_dir + self.n_clusters = n_clusters + + def build( + self, + ids, + codes, + data=None, + items=None, + id_offset=None, ): + _build(ids, codes, data, items, self.output_dir, self.n_clusters) + + def _ancessors(self, code): + ancs = [] + while code > 0: + code = int((code - 1) / 2) + ancs.append(code) + return ancs diff --git a/models/treebased/tdm/gen_tree/tree_impl.py b/models/treebased/tdm/gen_tree/tree_impl.py new file mode 100644 index 00000000..39b423d2 --- /dev/null +++ b/models/treebased/tdm/gen_tree/tree_impl.py @@ -0,0 +1,122 @@ +# Copyright (C) 2016-2018 Alibaba Group Holding Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from anytree import NodeMixin, RenderTree +import numpy as np +from anytree.exporter.dictexporter import DictExporter +import pickle +import json +import os +import time + + +class BaseClass(object): + pass + + +class TDMTreeClass(BaseClass, NodeMixin): + def __init__(self, + key_code, + emb_vec, + ids=None, + text=None, + parent=None, + children=None): + super(TDMTreeClass, self).__init__() + self.key_code = key_code + self.ids = ids + self.emb_vec = emb_vec + self.text = text + self.parent = parent + if children: + self.children = children + + def set_parent(self, parent): + self.parent = parent + + def set_children(self, children): + self.children = children + + +def _build(ids, codes, data, items, output_dir, n_clusters=2): + code_list = [0] * 50000000 + node_dict = {} + max_code = 0 + id2item = {} + curtime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) + print('%s start gen code_list' % curtime) + for _id, code, datum, item in zip(ids, codes, data, items): + code_list[code] = [datum, _id] + id2item[str(_id)] = item + max_code = max(code, max_code) + ancessors = _ancessors(code, n_clusters) + for ancessor in ancessors: + code_list[ancessor] = [[]] + + for code in range(max_code, -1, -1): + if code_list[code] == 0: + continue + if len(code_list[code]) > 1: + pass + elif len(code_list[code]) == 1: + code_list[code][0] = np.mean(code_list[code][0], axis=0) + if code > 0: + ancessor = int((code - 1) / n_clusters) + code_list[ancessor][0].append(code_list[code][0]) + + print('start gen node_dict') + for code in range(0, max_code + 1): + if code_list[code] == 0: + continue + if len(code_list[code]) > 1: + [datum, _id] = code_list[code] + node_dict[code] = TDMTreeClass(code, emb_vec=datum, ids=_id) + elif len(code_list[code]) == 1: + [datum] = code_list[code] + node_dict[code] = TDMTreeClass(code, emb_vec=datum) + if code > 0: + ancessor = int((code - 1) / n_clusters) + node_dict[code].set_parent(node_dict[ancessor]) + + save_tree(node_dict[0], os.path.join(output_dir, 'tree.pkl')) + save_dict(id2item, 
os.path.join(output_dir, 'id2item.json')) + + +def render(root): + for row in RenderTree(root, childiter=reversed): + print("%s%s" % (row.pre, row.node.text)) + + +def save_tree(root, path): + print('save tree to %s' % path) + exporter = DictExporter() + data = exporter.export(root) + f = open(path, 'wb') + pickle.dump(data, f) + f.close() + + +def save_dict(dic, filename): + """save dict into json file""" + print('save dict to %s' % filename) + with open(filename, "w") as json_file: + json.dump(dic, json_file, ensure_ascii=False) + + +def _ancessors(code, n_clusters): + ancs = [] + while code > 0: + code = int((code - 1) / n_clusters) + ancs.append(code) + return ancs diff --git a/models/treebased/tdm/gen_tree/tree_search_util.py b/models/treebased/tdm/gen_tree/tree_search_util.py new file mode 100644 index 00000000..9ef612f7 --- /dev/null +++ b/models/treebased/tdm/gen_tree/tree_search_util.py @@ -0,0 +1,206 @@ +# -*- coding=utf8 -*- +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import pickle +import time +import os +import numpy as np + +from anytree import (AsciiStyle, LevelOrderGroupIter, LevelOrderIter, Node, + NodeMixin, RenderTree) +from anytree.importer.dictimporter import DictImporter +from anytree.iterators.abstractiter import AbstractIter +from anytree.walker import Walker +from tree_impl import TDMTreeClass + + +class myLevelOrderIter(AbstractIter): + @staticmethod + def _iter(children, filter_, stop, maxlevel): + level = 1 + while children: + next_children = [] + for child in children: + if filter_(child): + yield child, level + next_children += AbstractIter._get_children(child.children, + stop) + children = next_children + level += 1 + if AbstractIter._abort_at_level(level, maxlevel): + break + + +class Tree_search(object): + def __init__(self, tree_path, id2item_path, child_num=2): + self.root = None + self.id2item = None + self.item2id = None + self.child_num = child_num + + self.load(tree_path) + # self.load_id2item(id2item_path) + + self.level_code = [[]] + self.max_level = 0 + self.keycode_id_dict = {} + # embedding + self.keycode_nodeid_dict = {} + self.tree_info = [] + self.id_node_dict = {} + + self.get_keycode_mapping() + self.travel_tree() + self.get_children() + + def get_keycode_mapping(self): + nodeid = 0 + self.embedding = [] + print("Begin Keycode Mapping") + for node in myLevelOrderIter(self.root): + node, level = node + if level - 1 > self.max_level: + self.max_level = level - 1 + self.level_code.append([]) + if node.ids is not None: + self.keycode_id_dict[node.key_code] = node.ids + self.id_node_dict[node.ids] = node + self.keycode_nodeid_dict[node.key_code] = nodeid + self.level_code[self.max_level].append(nodeid) + + node_infos = [] + if node.ids is not None: # item_id + node_infos.append(node.ids) + else: + node_infos.append(0) + node_infos.append(self.max_level) # layer_id + if node.parent: # ancestor_id + node_infos.append(self.keycode_nodeid_dict[ + node.parent.key_code]) + 
+            else:
+                node_infos.append(0)
+            self.tree_info.append(node_infos)
+            self.embedding.append(node.emb_vec)
+            nodeid += 1
+            if nodeid % 1000 == 0:
+                print("travel node id {}".format(nodeid))
+
+    def load(self, path):
+        print("Begin Load Tree")
+        f = open(path, "rb")
+        data = pickle.load(f)
+        # re-dump with protocol 2 so the pickle stays readable under Python 2
+        pickle.dump(data, open(path, "wb"), protocol=2)
+        importer = DictImporter()
+        self.root = importer.import_(data)
+        f.close()
+
+    def load_id2item(self, path):
+        """load dict from json file"""
+        with open(path, "rb") as json_file:
+            self.id2item = json.load(json_file)
+
+        self.item2id = {value: int(key) for key, value in self.id2item.items()}
+
+    def get_children(self):
+        """get every node children info"""
+        print("Begin Get Children")
+        for node in myLevelOrderIter(self.root):
+            node, level = node
+            node_id = self.keycode_nodeid_dict[node.key_code]
+            child_idx = 0
+            if node.children:
+                for child in node.children:
+                    self.tree_info[node_id].append(self.keycode_nodeid_dict[
+                        child.key_code])
+                    child_idx += 1
+            # pad with 0 so every node gets exactly child_num child slots
+            while child_idx < self.child_num:
+                self.tree_info[node_id].append(0)
+                child_idx += 1
+            if node_id % 1000 == 0:
+                print("get children node id {}".format(node_id))
+
+    def travel_tree(self):
+        self.travel_list = []
+        tree_walker = Walker()
+        print("Begin Travel Tree")
+        for item in sorted(self.id_node_dict.keys()):
+            node = self.id_node_dict[int(item)]
+            paths, _, _ = tree_walker.walk(node, self.root)
+            paths = list(paths)
+            paths.reverse()
+            travel = [self.keycode_nodeid_dict[i.key_code] for i in paths]
+            # pad shorter paths with 0 up to max_level
+            while len(travel) < self.max_level:
+                travel.append(0)
+            self.travel_list.append(travel)
+
+
+def tree_search_main(tree_path, id2item_path, output_dir, n_clusters=2):
+    print("Begin Tree Search")
+    t = Tree_search(tree_path, id2item_path, n_clusters)
+
+    # 1. Walk all leaf nodes, get travel path array
+    travel_list = np.array(t.travel_list)
+    np.save(os.path.join(output_dir, "travel_list.npy"), travel_list)
+    with open(os.path.join(output_dir, "travel_list.txt"), 'w') as fout:
+        for i, travel in enumerate(t.travel_list):
+            travel = map(str, travel)
+            fout.write(','.join(travel))
+            fout.write("\n")
+    print("End Save tree travel")
+
+    # 2. Walk all layers of the tree, get layer array
+    layer_num = 0
+    with open(os.path.join(output_dir, "layer_list.txt"), 'w') as fout:
+        for layer in t.level_code:
+            # exclude layer 0 (the root layer)
+            if layer_num == 0:
+                layer_num += 1
+                continue
+            for idx in range(len(layer) - 1):
+                fout.write(str(layer[idx]) + ',')
+            fout.write(str(layer[-1]) + "\n")
+            print("Layer {} has {} nodes, the first {}, the last {}".format(
+                layer_num, len(layer), layer[0], layer[-1]))
+            layer_num += 1
+    print("End Save tree layer")
+
+    # 3. Walk all nodes of the tree, get tree info
+    tree_info = np.array(t.tree_info)
+    np.save(os.path.join(output_dir, "tree_info.npy"), tree_info)
+    with open(os.path.join(output_dir, "tree_info.txt"), 'w') as fout:
+        for i, node_infos in enumerate(t.tree_info):
+            node_infos = map(str, node_infos)
+            fout.write(','.join(node_infos))
+            fout.write("\n")
+    print("End Save tree info")
+
+    # 4.
save embedding + embedding = np.array(t.embedding) + np.save(os.path.join(output_dir, "tree_emb.npy"), embedding) + with open(os.path.join(output_dir, "tree_embedding.txt"), "w") as fout: + for i, emb in enumerate(t.embedding): + emb = map(str, emb) + fout.write(','.join(emb)) + fout.write("\n") + + +if __name__ == "__main__": + tree_path = "./tree.pkl" + id2item_path = "./id2item.json" + output_dir = "./output" + if not os.path.exists(output_dir): + os.system("mkdir -p " + output_dir) + tree_search_main(tree_path, id2item_path, output_dir) -- GitLab