Merge pull request #238 from leolin49/master

add python maker

Merge pull request #238 from leolin49/master
add python maker
8b32cf6c · Lion · GitHub · 3b22a015 · 890e25e4 · 8b32cf6c
7 changed file
--- a/maker/python/README.md
+++ b/maker/python/README.md
+# ip2region xdb python 生成实现
+
+
+# 脚本执行
+
+```
+# 切换到python maker 根目录
+> python main.py
+ip2region xdb python maker
+main.py [command] [command options]
+Command:
+  gen      generate the binary db file
+```
+
+# `xdb` 数据生成
+
+通过 `python main.py gen` 命令生成 ip2region.xdb 二进制文件:
+```
+➜  python git:(v2.0_xdb) ✗ python main.py gen
+main.py gen [command options]
+options:
+ --src string    source ip text file path
+ --dst string    destination binary xdb file path
+```
+
+例如，使用默认的 data/ip.merge.txt 作为源数据，生成一个 ip2region.xdb 到当前目录：
+```
+➜  python git:(v2.0_xdb) ✗ python main.py gen --src=../../data/ip.merge.txt --dst=./ip2region.xdb
+# 会看到一堆输出，最终会看到类似如下输出表示运行结束
+...
+2022-07-13 19:58:00,540-root-238-INFO - write done, dataBlocks: 13804, indexBlocks: (683591, 720221), indexPtr: (982904, 11065984)
+2022-07-13 19:58:00,540-root-63-INFO - Done, elapsed: 3m3s
+```
+
+
+# `xdb` 数据查询 和 bench 测试
+
+基于xdb格式的查询功能和测试见 [ip2region binding](https://github.com/lionsoul2014/ip2region/tree/master/binding)
\ No newline at end of file
--- a/maker/python/main.py
+++ b/maker/python/main.py
+# Copyright 2022 The Ip2Region Authors. All rights reserved.
+# Use of this source code is governed by a Apache2.0-style
+# license that can be found in the LICENSE file.
+#
+# Author: linyufeng <leolin49@foxmail.com>
+# Date  : 2022/7/14 17:00
+#
+import logging
+import sys
+import time
+
+import xdb.maker as mk
+import xdb.index as idx
+
+# Configure the root logger once at import time: INFO level, with every record
+# prefixed by timestamp, logger name, source line number and level name.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s-%(name)s-%(lineno)s-%(levelname)s - %(message)s",
+)
+# Module-level logger named after this module
+log = logging.getLogger(__name__)
+
+
+def print_help():
+    """Print the top-level usage text (program name and commands) to stdout."""
+    usage = (
+        "ip2region xdb python maker",
+        "{} [command] [command options]".format(sys.argv[0]),
+        "Command: ",
+        "  gen      generate the binary db file",
+    )
+    for text in usage:
+        print(text)
+
+
+def gen_db():
+    """
+    Handle the `gen` command: parse `--key=value` options and build the xdb file.
+
+    Recognized options are `--src`, `--dst` and `--index`. Arguments shorter than
+    5 characters or not starting with `--` are ignored. An option without `=` or
+    with an unknown key prints a message and aborts. The usage text is printed
+    when `--src` or `--dst` is missing.
+    """
+    src_file, dst_file = "", ""
+    index_policy = idx.Vector_Index_Policy
+    # Options follow the program name and the command, hence the slice from 2
+    for r in sys.argv[2:]:
+        if len(r) < 5:
+            continue
+        if not r.startswith("--"):
+            continue
+        # str.find returns -1 when `=` is absent; str.index would raise
+        # ValueError and the message below could never be shown.
+        s_idx = r.find("=")
+        if s_idx < 0:
+            print("missing = for args pair '{}'".format(r))
+            return
+        key, value = r[2:s_idx], r[s_idx + 1:]
+        if key == "src":
+            src_file = value
+        elif key == "dst":
+            dst_file = value
+        elif key == "index":
+            index_policy = idx.index_policy_from_string(value)
+        else:
+            print("undefined option `{}`".format(r))
+            return
+    if src_file == "" or dst_file == "":
+        print("{} gen [command options]".format(sys.argv[0]))
+        print("options:")
+        print(" --src string    source ip text file path")
+        print(" --dst string    destination binary xdb file path")
+        return
+
+    start_time = time.time()
+    # Make the binary file
+    maker = mk.new_maker(index_policy, src_file, dst_file)
+    maker.init()
+    maker.start()
+    maker.end()
+
+    # Split whole elapsed seconds with divmod so minutes are floored and seconds
+    # stay in 0..59 (rounding each part separately could print e.g. `2m50s`
+    # for 110 seconds).
+    minutes, seconds = divmod(int(time.time() - start_time), 60)
+    logging.info("Done, elapsed: {}m{}s".format(minutes, seconds))
+
+
+def main():
+    """Dispatch on the first command line argument; show the help for anything else."""
+    argv = sys.argv
+    # No command at all is treated like an unknown command
+    cmd = argv[1].lower() if len(argv) >= 2 else ""
+    if cmd != "gen":
+        print_help()
+        return
+    gen_db()
+
+
+if __name__ == "__main__":
+    main()
--- a/maker/python/xdb/__init__.py
+++ b/maker/python/xdb/__init__.py
+# Copyright 2022 The Ip2Region Authors. All rights reserved.
+# Use of this source code is governed by a Apache2.0-style
+# license that can be found in the LICENSE file.
+#
+# Author: linyufeng <leolin49@foxmail.com>
+# Date  : 2022/7/14 17:00
+#
--- a/maker/python/xdb/index.py
+++ b/maker/python/xdb/index.py
+# Copyright 2022 The Ip2Region Authors. All rights reserved.
+# Use of this source code is governed by a Apache2.0-style
+# license that can be found in the LICENSE file.
+#
+# Author: linyufeng <leolin49@foxmail.com>
+# Date  : 2022/7/14 17:00
+#
+import struct
+
+# Integer codes identifying the index policy (see index_policy_from_string)
+Vector_Index_Policy = 1
+BTree_Index_Policy = 2
+
+
+def index_policy_from_string(s: str) -> int:
+    """
+    Map a policy name (`vector` or `btree`, case-insensitive) to its policy code.
+    An unknown name prints a notice and falls back to the vector policy.
+    """
+    known = {"vector": Vector_Index_Policy, "btree": BTree_Index_Policy}
+    policy = known.get(s.lower())
+    if policy is None:
+        print("invalid policy `{}`, used default vector index".format(s))
+        return Vector_Index_Policy
+    return policy
+
+
+class VectorIndexBlock:
+    """One vector index cell: the first and last pointer of a run of index blocks."""
+
+    first_ptr = 0
+    last_ptr = 0
+
+    def __init__(self, fp=0, lp=0):
+        """
+        :param fp: first pointer of the cell, 0 by default
+        :param lp: last pointer of the cell, 0 by default
+        """
+        self.first_ptr = fp
+        self.last_ptr = lp
+
+    def __str__(self):
+        # The label used to be misspelled as `LastPrt`
+        return "FirstPtr: {}, LastPtr: {}".format(self.first_ptr, self.last_ptr)
+
+    def encode(self) -> bytes:
+        """Encode the cell as two little-endian unsigned 32-bit integers (8 bytes)."""
+        return struct.pack("<II", self.first_ptr, self.last_ptr)
+
+
+# Encoded size in bytes of one segment index block:
+# start ip (4) + end ip (4) + data length (2) + data ptr (4)
+Segment_Index_Block_Size = 14
+
+
+class SegmentIndexBlock:
+    """One segment index entry: an ip range plus the length and pointer of its data."""
+
+    start_ip = 0
+    end_ip = 0
+    data_len = 0
+    data_ptr = 0
+
+    def __init__(self, sip, eip, dl, dp):
+        """
+        :param sip: start ip of the range as an integer
+        :param eip: end ip of the range as an integer
+        :param dl: length of the data in bytes
+        :param dp: pointer to the data
+        """
+        self.start_ip = sip
+        self.end_ip = eip
+        self.data_len = dl
+        self.data_ptr = dp
+
+    def __str__(self):
+        # Literal braces must be doubled in a str.format template; with single
+        # braces `{sip: ...}` is parsed as a replacement field named `sip` and
+        # formatting raises KeyError.
+        return "{{sip: {}, eip: {}, len: {}, ptr: {}}}".format(
+            self.start_ip, self.end_ip, self.data_len, self.data_ptr
+        )
+
+    def encode(self) -> bytes:
+        """
+        Encode the entry as little-endian start ip (4 bytes), end ip (4 bytes),
+        data length (2 bytes) and data pointer (4 bytes): 14 bytes in total.
+        """
+        return struct.pack(
+            "<IIHI", self.start_ip, self.end_ip, self.data_len, self.data_ptr
+        )
--- a/maker/python/xdb/maker.py
+++ b/maker/python/xdb/maker.py
+# Copyright 2022 The Ip2Region Authors. All rights reserved.
+# Use of this source code is governed by a Apache2.0-style
+# license that can be found in the LICENSE file.
+#
+# Author: linyufeng <leolin49@foxmail.com>
+# Date  : 2022/7/14 17:00
+#
+# ----
+# ip2region database v2.0 structure
+#
+# +----------------+-------------------+---------------+--------------+
+# | header space   | speed up index    |  data payload | block index  |
+# +----------------+-------------------+---------------+--------------+
+# | 256 bytes      | 512 KiB (fixed)   | dynamic size  | dynamic size |
+# +----------------+-------------------+---------------+--------------+
+#
+# 1. header space : reserved for header info such as the version, release date and block index ptrs, or any other temporary needs.
+# -- 2bytes: version number, different version means structure update, it fixed to 2 for now
+# -- 2bytes: index algorithm code.
+# -- 4bytes: generate unix timestamp (version)
+# -- 4bytes: index block start ptr
+# -- 4bytes: index block end ptr
+#
+#
+# 2. data block : region or whatever data info.
+# 3. segment index block : binary index block.
+# 4. vector index block  : fixed index info for block index search speed up.
+# space structure table:
+# -- 0   -> | 1st super block | 2nd super block | 3rd super block | ... | 256th super block
+# -- 1   -> | 1st super block | 2nd super block | 3rd super block | ... | 256th super block
+# -- 2   -> | 1st super block | 2nd super block | 3rd super block | ... | 256th super block
+# -- ...
+# -- 255 -> | 1st super block | 2nd super block | 3rd super block | ... | 256th super block
+#
+#
+# super block structure:
+# +-----------------------+----------------------+
+# | first index block ptr | last index block ptr |
+# +-----------------------+----------------------+
+#
+# data entry structure:
+# +--------------------+-----------------------+
+# | 2bytes (for desc)  |  dynamic length	   |
+# +--------------------+-----------------------+
+#  data length   whatever in bytes
+#
+# index entry structure
+# +------------+-----------+---------------+------------+
+# | 4bytes	   | 4bytes	   | 2bytes		   | 4 bytes    |
+# +------------+-----------+---------------+------------+
+#  start ip 	  end ip	  data length     data ptr
+import logging
+import struct
+import time
+import sys
+
+import xdb.segment as seg
+import xdb.index as idx
+import xdb.util as util
+
+
+# xdb structure version number stored at the start of the header
+Version_No = 2
+# Size in bytes of the header space at the beginning of the file
+Header_Info_Length = 256
+# The vector index is a Rows x Cols table of fixed-size cells
+Vector_Index_Rows = 256
+Vector_Index_Cols = 256
+# Size in bytes of one vector index cell
+Vector_Index_Size = 8
+# Total size in bytes of the vector index area (512 KiB)
+Vector_Index_Length = Vector_Index_Rows * Vector_Index_Cols * Vector_Index_Size
+
+
+class Maker:
+    src_handle = None
+    dst_handle = None
+    index_policy = idx.Vector_Index_Policy
+    segments = None
+    region_pool = None
+    vector_index = None
+
+    def __init__(self, sh, dh, ip, sg, rp, vi):
+        """
+        Store the collaborators of the maker.
+
+        :param sh: source handle, kept as `src_handle`
+        :param dh: destination handle, kept as `dst_handle`
+        :param ip: index policy code, kept as `index_policy`
+        :param sg: segment list, kept as `segments`
+        :param rp: region pool, kept as `region_pool`
+        :param vi: vector index table, kept as `vector_index`
+        """
+        self.src_handle, self.dst_handle = sh, dh
+        self.index_policy = ip
+        self.segments, self.region_pool, self.vector_index = sg, rp, vi
+
+    def init(self):
+        """
+        Prepare for making the `xdb` binary file.
+        1. Write the initial file header to the destination file
+        2. Load all the segments from the source file
+
+        NOTE(review): the value returned by `load_segments` is not checked here.
+        """
+        self.init_db_header()
+        self.load_segments()
+
+    def init_db_header(self):
+        """
+        Build the 256-byte header and write it to the destination xdb file.
+
+        The header starts with five little-endian fields and is zero-filled after
+        them. Both index block pointers are written as 0 at this point.
+        """
+        logging.info("try to init the db header ... ")
+        self.src_handle.seek(0, 0)
+
+        # (value, width in bytes) of each header field, in file order
+        fields = (
+            (Version_No, 2),  # version number
+            (int(self.index_policy), 2),  # index policy code
+            (int(time.time()), 4),  # generation unix timestamp
+            (int(0), 4),  # index block start ptr
+            (int(0), 4),  # index block end ptr
+        )
+        packed = b"".join(
+            value.to_bytes(width, byteorder="little") for value, width in fields
+        )
+        header = bytearray(256)
+        header[0:len(packed)] = packed
+        # Write header buffer to file
+        self.dst_handle.write(header)
+
+    def load_segments(self) -> list:
+        """
+        Load the segments [start ip|end ip|region] from the source ip text file.
+
+        Every line must look like `start_ip|end_ip|region` and every segment must
+        start right after the previous one ends. The parsed segments are added to
+        `self.segments` only when the whole source is valid, so a rejected source
+        never leaves a partial list behind.
+
+        :return: `self.segments` on success, an empty list if any line is invalid
+        """
+        logging.info("try to load the segments ... ")
+        last = None
+        loaded = []
+        s_tm = time.time()
+
+        for line in self.src_handle.read().splitlines():
+            logging.info("load segment: `{}`".format(line))
+            # maxsplit keeps any further `|` characters inside the region text
+            ps = line.split("|", maxsplit=2)
+            if len(ps) != 3:
+                logging.error("invalid ip segment line `{}`".format(line))
+                return []
+            sip = util.check_ip(ps[0])
+            if sip == -1:
+                logging.error(
+                    "invalid ip address `{}` in line `{}`".format(ps[0], line)
+                )
+                return []
+            eip = util.check_ip(ps[1])
+            if eip == -1:
+                logging.error(
+                    "invalid ip address `{}` in line `{}`".format(ps[1], line)
+                )
+                return []
+            if sip > eip:
+                logging.error(
+                    "start ip({}) should not be greater than end ip({})".format(
+                        ps[0], ps[1]
+                    )
+                )
+                return []
+            if len(ps[2]) < 1:
+                logging.error("empty region info in segment line `{}`".format(line))
+                return []
+
+            segment = seg.Segment(sip=sip, eip=eip, reg=ps[2])
+            # Check the continuity of data segment
+            if last is not None and last.end_ip + 1 != segment.start_ip:
+                # Report the two values that were actually compared
+                logging.error(
+                    "discontinuous data segment: last.eip+1({})!=seg.sip({}, {})".format(
+                        last.end_ip + 1, segment.start_ip, ps[0]
+                    )
+                )
+                return []
+            loaded.append(segment)
+            last = segment
+
+        self.segments.extend(loaded)
+        logging.info(
+            "all segments loaded, length: {}, elapsed: {}".format(
+                len(self.segments), time.time() - s_tm
+            )
+        )
+        return self.segments
+
+    def set_vector_index(self, ip, ptr):
+        """
+        Record an index block pointer in the vector index cell selected by the
+        first two bytes of the ip.
+
+        :param ip: start ip of the index block as an integer
+        :param ptr: pointer of the index block
+        """
+        row = (ip >> 24) & 0xFF
+        col = (ip >> 16) & 0xFF
+        cell = self.vector_index[row][col]
+        # A first_ptr of 0 means nothing has been recorded in this cell yet
+        if cell.first_ptr == 0:
+            cell.first_ptr = ptr
+        # last_ptr is always moved to one block size past the given pointer
+        cell.last_ptr = ptr + idx.Segment_Index_Block_Size
+        self.vector_index[row][col] = cell
+
+    def start(self):
+        """
+        Start to make the 'xdb' binary file.
+
+        Steps, in file-writing order:
+        1. write every distinct region string after the header and vector index area
+        2. write one segment index block per split segment and record it in the
+           in-memory vector index
+        3. write the vector index table right after the header
+        4. write the first/last index block pointers into the header at offset 8
+
+        Logs an error and returns early when the segment list is empty, a region
+        is longer than 0xFFFF bytes, or a region has no cached pointer.
+        """
+        if len(self.segments) < 1:
+            logging.error("empty segment list")
+            return
+
+        # 1. Write all the region/data to the binary file
+        # Skip the header and the fixed-size vector index area; both are filled in later
+        self.dst_handle.seek(Header_Info_Length + Vector_Index_Length, 0)
+
+        logging.info("try to write the data block ... ")
+        for s in self.segments:
+            logging.info("try to write region '{}'...".format(s.region))
+            # Each distinct region string is written once and shared through the pool
+            if s.region in self.region_pool:
+                logging.info(
+                    " --[Cached] with ptr={}".format(self.region_pool[s.region])
+                )
+                continue
+            region = bytes(s.region, encoding="utf-8")
+            # The index block stores the data length in 2 bytes
+            if len(region) > 0xFFFF:
+                logging.error(
+                    "too long region info `{}`: should be less than {} bytes".format(
+                        s.region, 0xFFFF
+                    )
+                )
+                return
+            # Get the first ptr of the next region
+            # (seek(0, 1) does not move; it returns the current offset)
+            pos = self.dst_handle.seek(0, 1)
+            logging.info("{} {} {}".format(pos, region, s.region))
+            # Only the raw region bytes are written; their length goes into the index block
+            self.dst_handle.write(region)
+            self.region_pool[s.region] = pos
+            logging.info(" --[Added] with ptr={}".format(pos))
+        # 2. Write the index block and cache the super index block
+        logging.info("try to write the segment index block ... ")
+        # -1 marks "no index block written yet"
+        counter, start_index_ptr, end_index_ptr = 0, -1, -1
+        for sg in self.segments:
+            if sg.region not in self.region_pool:
+                logging.error("missing ptr cache for region `{}`".format(sg.region))
+                return
+            data_len = len(bytes(sg.region, encoding="utf-8"))
+            if data_len < 1:
+                logging.error("empty region info for segment '{}'".format(sg.region))
+                return
+
+            # Split the segment so that each piece maps to one vector index cell
+            seg_list = sg.split()
+            logging.info(
+                "try to index segment({} split) {} ...".format(len(seg_list), sg)
+            )
+            for s in seg_list:
+                # Offset at which this index block is about to be written
+                pos = self.dst_handle.seek(0, 1)
+
+                s_index = idx.SegmentIndexBlock(
+                    sip=s.start_ip,
+                    eip=s.end_ip,
+                    dl=data_len,
+                    dp=self.region_pool[sg.region],
+                )
+                self.dst_handle.write(s_index.encode())
+                logging.info(
+                    "|-segment index: {}, ptr: {}, segment: {}".format(counter, pos, s)
+                )
+                self.set_vector_index(s.start_ip, pos)
+                counter += 1
+
+                # Check and record the start index ptr
+                if start_index_ptr == -1:
+                    start_index_ptr = pos
+                # The end ptr is the offset where the last index block starts
+                end_index_ptr = pos
+
+        # 3. Synchronized the vector index block
+        logging.info("try to write the vector index block ... ")
+        # The vector index table sits right after the header
+        self.dst_handle.seek(Header_Info_Length, 0)
+        for i in range(0, len(self.vector_index)):
+            for j in range(0, len(self.vector_index[i])):
+                vi = self.vector_index[i][j]
+                self.dst_handle.write(vi.encode())
+
+        # 4. Synchronized the segment index info
+        logging.info("try to write the segment index ptr ... ")
+        buff = struct.pack("<II", start_index_ptr, end_index_ptr)
+        # Offset 8 is where the header keeps the index block start/end ptrs
+        self.dst_handle.seek(8, 0)
+        self.dst_handle.write(buff)
+
+        logging.info(
+            "write done, dataBlocks: {}, indexBlocks: ({}, {}), indexPtr: ({}, {})".format(
+                len(self.region_pool),
+                len(self.segments),
+                counter,
+                start_index_ptr,
+                end_index_ptr,
+            )
+        )
+
+    def end(self):
+        """
+        Finish making the 'xdb' binary file by closing both file handles.
+
+        The destination handle is closed even when closing the source handle
+        fails. An IOError is logged and the process exits with status 1.
+        """
+        try:
+            try:
+                self.src_handle.close()
+            finally:
+                # Always release the output file, whatever happened above
+                self.dst_handle.close()
+        except IOError as e:
+            logging.error(e)
+            # A bare sys.exit() would report success (status 0) to the caller
+            sys.exit(1)
+
+
+def new_maker(policy: int, srcfile: str, dstfile: str) -> Maker:
+    """
+    Create a xdb Maker to make the xdb binary file.
+
+    Opens the source file for reading as UTF-8 text and the destination file for
+    binary writing. If either file cannot be opened the error is logged and the
+    process exits with status 1.
+
+    :param policy: index algorithm code 1:vector, 2:b-tree
+    :param srcfile: source ip text file path
+    :param dstfile: destination binary xdb file path
+    :return: the 'xdb' Maker
+    """
+    try:
+        sh = open(srcfile, mode="r", encoding="utf-8")
+        try:
+            dh = open(dstfile, mode="wb")
+        except IOError:
+            # Do not leak the source handle when the destination cannot be opened
+            sh.close()
+            raise
+    except IOError as e:
+        logging.error(e)
+        # A bare sys.exit() would report success (status 0) to the caller
+        sys.exit(1)
+
+    return Maker(
+        sh=sh,
+        dh=dh,
+        ip=policy,
+        sg=[],
+        rp={},
+        # Rows x Cols table of empty cells, indexed as vi[row][col]
+        vi=[
+            [idx.VectorIndexBlock() for _ in range(Vector_Index_Cols)]
+            for _ in range(Vector_Index_Rows)
+        ],
+    )
--- a/maker/python/xdb/segment.py
+++ b/maker/python/xdb/segment.py
+# Copyright 2022 The Ip2Region Authors. All rights reserved.
+# Use of this source code is governed by a Apache2.0-style
+# license that can be found in the LICENSE file.
+#
+# Author: linyufeng <leolin49@foxmail.com>
+# Date  : 2022/7/14 17:00
+#
+import xdb.util as util
+
+
+class Segment:
+    """An ip range [start_ip, end_ip] (as integers) with its region text."""
+
+    start_ip = 0
+    end_ip = 0
+    region = ""
+
+    def __init__(self, sip=0, eip=0, reg=""):
+        """
+        :param sip: start ip as an integer
+        :param eip: end ip as an integer
+        :param reg: region text
+        """
+        self.start_ip = sip
+        self.end_ip = eip
+        self.region = reg
+
+    def __str__(self):
+        dotted_start = util.long2ip(self.start_ip)
+        dotted_end = util.long2ip(self.end_ip)
+        return "{}|{}|{}".format(dotted_start, dotted_end, self.region)
+
+    def split(self) -> list:
+        """
+        Split the segment so that every piece shares the same first two bytes.
+        :return: the list of segments after the split
+        """
+        # Example: splitting "116.31.76.0|117.21.79.49|region" gives
+        #
+        # 116.31.76.0 | 116.31.255.255  | region
+        # 116.32.0.0  | 116.32.255.255  | region
+        # ...         | ...             | region
+        # 116.255.0.0 | 116.255.255.255 | region
+        # 117.0.0.0   | 117.0.255.255   | region
+        # 117.1.0.0   | 117.1.255.255   | region
+        # ...         | ...             | region
+        # 117.21.0.0  | 117.21.79.49    | region
+
+        # Stage 1: one piece per value of the first byte
+        coarse = []
+        first_lo = (self.start_ip >> 24) & 0xFF
+        first_hi = (self.end_ip >> 24) & 0xFF
+        carry = self.start_ip
+        for b1 in range(first_lo, first_hi + 1):
+            piece_start = (b1 << 24) | (carry & 0xFFFFFF)
+            block_end = (b1 << 24) | 0xFFFFFF
+            if block_end < self.end_ip:
+                # The next piece starts at the beginning of the next first byte
+                carry = (b1 + 1) << 24
+            # Never run past the end of the whole segment
+            coarse.append(Segment(piece_start, min(block_end, self.end_ip)))
+
+        # Stage 2: one piece per value of the second byte inside each coarse piece
+        pieces = []
+        for part in coarse:
+            prefix = part.start_ip & 0xFF000000
+            carry = part.start_ip
+            second_lo = (part.start_ip >> 16) & 0xFF
+            second_hi = (part.end_ip >> 16) & 0xFF
+            for b2 in range(second_lo, second_hi + 1):
+                piece_start = prefix | (b2 << 16) | (carry & 0xFFFF)
+                block_end = prefix | (b2 << 16) | 0xFFFF
+                if block_end < self.end_ip:
+                    # Only the first piece keeps the original low two bytes
+                    carry = 0
+                pieces.append(
+                    Segment(piece_start, min(block_end, self.end_ip), self.region)
+                )
+        return pieces
--- a/maker/python/xdb/util.py
+++ b/maker/python/xdb/util.py
+# Copyright 2022 The Ip2Region Authors. All rights reserved.
+# Use of this source code is governed by a Apache2.0-style
+# license that can be found in the LICENSE file.
+#
+# Author: linyufeng <leolin49@foxmail.com>
+# Date  : 2022/7/14 17:00
+#
+# Left-shift amount for each dotted part of an ip, most significant byte first
+_SHIFT_INDEX = (24, 16, 8, 0)
+
+
+def check_ip(ip: str) -> int:
+    """
+    Turn a dotted ipv4 string into its integer value.
+    Gives -1 when the string is not accepted by is_ipv4.
+    """
+    if not is_ipv4(ip):
+        return -1
+    num = 0
+    for pos, part in enumerate(ip.split(".")):
+        # Each part lands in its own byte of the result
+        num |= int(part) << _SHIFT_INDEX[pos]
+    return num
+
+
+def long2ip(num: int) -> str:
+    """
+    Turn an integer into a dotted ipv4 string.
+    Gives an empty string when num is below 0 or above UINT32_MAX.
+    """
+    if not 0 <= num <= 0xFFFFFFFF:
+        return ""
+    # Peel off the four bytes from the most significant one down
+    octets = [(num >> shift) & 0xFF for shift in (24, 16, 8, 0)]
+    return ".".join(str(octet) for octet in octets)
+
+
+def is_ipv4(ip: str) -> bool:
+    """
+    Determine whether it is an ipv4 address.
+
+    The string must consist of exactly four dot-separated parts, each made of
+    1 to 3 ASCII digits with a value of at most 255.
+    """
+    ps = ip.split(".")
+    if len(ps) != 4:
+        return False
+    for p in ps:
+        # str.isdigit alone also accepts non-ASCII digits such as superscripts,
+        # which int() cannot parse, so the part is required to be ASCII too.
+        if not (p.isascii() and p.isdigit()) or len(p) > 3 or int(p) > 255:
+            return False
+    return True