未验证 提交 8b32cf6c 编写于 作者: L Lion 提交者: GitHub

Merge pull request #238 from leolin49/master

add python maker
# ip2region xdb python 生成实现
# 脚本执行
```
# 切换到python maker 根目录
> python main.py
ip2region xdb python maker
main.py [command] [command options]
Command:
gen generate the binary db file
```
# `xdb` 数据生成
通过 `python main.py gen` 命令生成 ip2region.xdb 二进制文件:
```
➜ python git:(v2.0_xdb) ✗ python main.py gen
main.py gen [command options]
options:
--src string source ip text file path
--dst string destination binary xdb file path
```
例如,使用默认的 data/ip.merge.txt 作为源数据,生成一个 ip2region.xdb 到当前目录:
```
➜ python git:(v2.0_xdb) ✗ python main.py gen --src=../../data/ip.merge.txt --dst=./ip2region.xdb
# 会看到一堆输出,最终会看到类似如下输出表示运行结束
...
2022-07-13 19:58:00,540-root-238-INFO - write done, dataBlocks: 13804, indexBlocks: (683591, 720221), indexPtr: (982904, 11065984)
2022-07-13 19:58:00,540-root-63-INFO - Done, elapsed: 3m3s
```
# `xdb` 数据查询 和 bench 测试
基于xdb格式的查询功能和测试见 [ip2region binding](https://github.com/lionsoul2014/ip2region/tree/master/binding)
\ No newline at end of file
# Copyright 2022 The Ip2Region Authors. All rights reserved.
# Use of this source code is governed by a Apache2.0-style
# license that can be found in the LICENSE file.
#
# Author: linyufeng <leolin49@foxmail.com>
# Date : 2022/7/14 17:00
#
import logging
import sys
import time
import xdb.maker as mk
import xdb.index as idx
# Configure the root logger when this module is loaded: INFO level, and a
# record layout of "<time>-<logger name>-<line number>-<level> - <message>".
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s-%(name)s-%(lineno)s-%(levelname)s - %(message)s",
)
# Module-level logger named after this module.
# NOTE(review): gen_db() below logs through the `logging` module functions
# (the root logger) rather than through this object.
log = logging.getLogger(__name__)
def print_help():
    """Print the top-level usage text, using sys.argv[0] as the program name."""
    usage = (
        "ip2region xdb python maker",
        "{} [command] [command options]".format(sys.argv[0]),
        "Command: ",
        " gen generate the binary db file",
    )
    for text in usage:
        print(text)
def gen_db():
    """
    Handle the `gen` command: parse `--key=value` options and build the xdb file.

    Recognised options (read from sys.argv[2:]):
      --src=<path>    source ip text file path (required)
      --dst=<path>    destination binary xdb file path (required)
      --index=<name>  index policy name, resolved by idx.index_policy_from_string

    Arguments shorter than five characters or not starting with "--" are
    ignored. A "--" argument without "=" or with an unknown key prints a
    message and aborts. When --src or --dst is missing, the usage text is
    printed and nothing is generated.
    """
    src_file, dst_file = "", ""
    index_policy = idx.Vector_Index_Policy

    # Check input parameters
    for arg in sys.argv[2:]:
        if len(arg) < 5:
            continue
        if not arg.startswith("--"):
            continue
        # partition() never raises, unlike str.index(), so a missing "=" is
        # reported to the user instead of crashing with ValueError.
        key, sep, value = arg[2:].partition("=")
        if not sep:
            print("missing = for args pair '{}'".format(arg))
            return
        if key == "src":
            src_file = value
        elif key == "dst":
            dst_file = value
        elif key == "index":
            index_policy = idx.index_policy_from_string(value)
        else:
            print("undefined option `{}`".format(arg))
            return

    if src_file == "" or dst_file == "":
        print("{} gen [command options]".format(sys.argv[0]))
        print("options:")
        print(" --src string source ip text file path")
        print(" --dst string destination binary xdb file path")
        return

    start_time = time.time()

    # Make the binary file; always release the file handles, even when
    # loading or writing fails part-way through.
    maker = mk.new_maker(index_policy, src_file, dst_file)
    try:
        maker.init()
        maker.start()
    finally:
        maker.end()

    # Whole minutes and remaining whole seconds. Formatting elapsed / 60 with
    # "{:.0f}" would round the minutes up (e.g. 90s -> "2m30s").
    minutes, seconds = divmod(int(time.time() - start_time), 60)
    logging.info("Done, elapsed: {}m{}s".format(minutes, seconds))
def main():
    """
    Command-line entry point.

    Runs gen_db() when the first argument is "gen" (case-insensitive); with no
    argument or any other command, prints the usage text.
    """
    command = sys.argv[1].lower() if len(sys.argv) >= 2 else None
    if command == "gen":
        gen_db()
        return
    print_help()


if __name__ == "__main__":
    main()
# Copyright 2022 The Ip2Region Authors. All rights reserved.
# Use of this source code is governed by a Apache2.0-style
# license that can be found in the LICENSE file.
#
# Author: linyufeng <leolin49@foxmail.com>
# Date : 2022/7/14 17:00
#
# Copyright 2022 The Ip2Region Authors. All rights reserved.
# Use of this source code is governed by a Apache2.0-style
# license that can be found in the LICENSE file.
#
# Author: linyufeng <leolin49@foxmail.com>
# Date : 2022/7/14 17:00
#
import struct
# Numeric codes identifying the index policy.
Vector_Index_Policy = 1
BTree_Index_Policy = 2


def index_policy_from_string(s: str) -> int:
    """
    Map an index policy name to its numeric code.

    The comparison is case-insensitive. "vector" and "btree" are recognised;
    any other name prints a notice and falls back to the vector policy.

    :param s: policy name
    :return: Vector_Index_Policy or BTree_Index_Policy
    """
    known = {"vector": Vector_Index_Policy, "btree": BTree_Index_Policy}
    policy = known.get(s.lower())
    if policy is None:
        print("invalid policy `{}`, used default vector index".format(s))
        return Vector_Index_Policy
    return policy
class VectorIndexBlock:
    """
    A pair of pointers, `first_ptr` and `last_ptr`, serialised as 8 bytes.

    Both values default to 0.
    """

    first_ptr = 0
    last_ptr = 0

    def __init__(self, fp=0, lp=0):
        """
        :param fp: value stored as first_ptr
        :param lp: value stored as last_ptr
        """
        self.first_ptr = fp
        self.last_ptr = lp

    def __str__(self):
        # The second label used to be misspelled "LastPrt".
        return "FirstPtr: {}, LastPtr: {}".format(self.first_ptr, self.last_ptr)

    def encode(self) -> bytes:
        """
        Serialise the block as two little-endian unsigned 32-bit integers.

        :return: 8 bytes, first_ptr followed by last_ptr
        """
        return struct.pack("<II", self.first_ptr, self.last_ptr)
# Size in bytes of one encoded SegmentIndexBlock ("<IIHI" = 4 + 4 + 2 + 4).
Segment_Index_Block_Size = 14


class SegmentIndexBlock:
    """
    One segment index entry: an IP range plus the length and pointer of its data.
    """

    start_ip = 0
    end_ip = 0
    data_len = 0
    data_ptr = 0

    def __init__(self, sip, eip, dl, dp):
        """
        :param sip: start ip as an integer
        :param eip: end ip as an integer
        :param dl: data length (packed as an unsigned 16-bit value)
        :param dp: data pointer (packed as an unsigned 32-bit value)
        """
        self.start_ip = sip
        self.end_ip = eip
        self.data_len = dl
        self.data_ptr = dp

    def __str__(self):
        # The outer braces are literal text and must be doubled for
        # str.format(); un-doubled they are parsed as a replacement field
        # named "sip" and the call raises KeyError.
        return "{{sip: {}, eip: {}, len: {}, ptr: {}}}".format(
            self.start_ip, self.end_ip, self.data_len, self.data_ptr
        )

    def encode(self) -> bytes:
        """
        Serialise the entry in little-endian order without padding.

        :return: 14 bytes: start ip (4), end ip (4), data length (2), data ptr (4)
        """
        return struct.pack(
            "<IIHI", self.start_ip, self.end_ip, self.data_len, self.data_ptr
        )
# Copyright 2022 The Ip2Region Authors. All rights reserved.
# Use of this source code is governed by a Apache2.0-style
# license that can be found in the LICENSE file.
#
# Author: linyufeng <leolin49@foxmail.com>
# Date : 2022/7/14 17:00
#
# ----
# ip2region database v2.0 structure
#
# +----------------+-------------------+---------------+--------------+
# | header space | speed up index | data payload | block index |
# +----------------+-------------------+---------------+--------------+
# | 256 bytes | 512 KiB (fixed) | dynamic size | dynamic size |
# +----------------+-------------------+---------------+--------------+
#
# 1. padding space : for header info such as the block index ptr, version, release date, or any other temporary needs.
# -- 2bytes: version number, different version means structure update, it is fixed to 2 for now
# -- 2bytes: index algorithm code.
# -- 4bytes: generate unix timestamp (version)
# -- 4bytes: index block start ptr
# -- 4bytes: index block end ptr
#
#
# 2. data block : region or whatever data info.
# 3. segment index block : binary index block.
# 4. vector index block : fixed index info for block index search speed up.
# space structure table:
# -- 0 -> | 1st super block | 2nd super block | 3rd super block | ... | 256th super block
# -- 1 -> | 1st super block | 2nd super block | 3rd super block | ... | 256th super block
# -- 2 -> | 1st super block | 2nd super block | 3rd super block | ... | 256th super block
# -- ...
# -- 255 -> | 1st super block | 2nd super block | 3rd super block | ... | 256th super block
#
#
# super block structure:
# +-----------------------+----------------------+
# | first index block ptr | last index block ptr |
# +-----------------------+----------------------+
#
# data entry structure:
# +--------------------+-----------------------+
# | 2bytes (for desc) | dynamic length |
# +--------------------+-----------------------+
# data length whatever in bytes
#
# index entry structure
# +------------+-----------+---------------+------------+
# | 4bytes | 4bytes | 2bytes | 4 bytes |
# +------------+-----------+---------------+------------+
# start ip end ip data length data ptr
import logging
import struct
import time
import sys
import xdb.segment as seg
import xdb.index as idx
import xdb.util as util
# Format version number written into the first two bytes of the header.
Version_No = 2
# Size in bytes of the header area at the start of the file; the vector index
# is written immediately after it.
Header_Info_Length = 256
# Dimensions of the vector index table (rows x cols).
Vector_Index_Rows = 256
Vector_Index_Cols = 256
# Bytes per vector index entry.
Vector_Index_Size = 8
# Total bytes reserved for the vector index: 256 * 256 * 8 = 524288 (512 KiB).
Vector_Index_Length = Vector_Index_Rows * Vector_Index_Cols * Vector_Index_Size
class Maker:
    """
    Build an `xdb` binary file from a text file of ip segments.

    Call order: `init()` writes a placeholder header and loads the source
    segments, `start()` writes the region data, the segment index and the
    vector index and then patches the header, `end()` closes both handles.
    """

    src_handle = None
    dst_handle = None
    index_policy = idx.Vector_Index_Policy
    segments = None
    region_pool = None
    vector_index = None

    def __init__(self, sh, dh, ip, sg, rp, vi):
        """
        :param sh: readable, seekable text handle of the source segment file
        :param dh: writable, seekable binary handle of the destination file
        :param ip: index policy code stored in the header
        :param sg: list that receives the loaded segments
        :param rp: mapping from region text to the file offset of its data
        :param vi: 2-D table of vector index blocks, indexed as [row][col]
        """
        self.src_handle = sh
        self.dst_handle = dh
        self.index_policy = ip
        self.segments = sg
        self.region_pool = rp
        self.vector_index = vi

    def init(self):
        """
        Init the `xdb` binary file.
        1. Init the file header
        2. Load all the segments
        """
        self.init_db_header()
        self.load_segments()

    def init_db_header(self):
        """
        Init and write the file header to the destination xdb file.

        The index block start/end pointers are written as 0 here and are
        filled in by start() once their values are known.
        """
        logging.info("try to init the db header ... ")
        self.src_handle.seek(0, 0)

        # Make and write the header space (zero filled)
        header = bytearray(Header_Info_Length)
        # 1. Version number
        header[0:2] = Version_No.to_bytes(2, byteorder="little")
        # 2. Index policy code
        header[2:4] = int(self.index_policy).to_bytes(2, byteorder="little")
        # 3. Generate unix timestamp
        header[4:8] = int(time.time()).to_bytes(4, byteorder="little")
        # 4. Index block start ptr
        header[8:12] = int(0).to_bytes(4, byteorder="little")
        # 5. Index block end ptr
        header[12:16] = int(0).to_bytes(4, byteorder="little")

        # Write header buffer to file
        self.dst_handle.write(header)

    def load_segments(self) -> list:
        """
        Load the segments [start ip|end ip|region] from source ip text file.

        Every line must have the form `start_ip|end_ip|region`, and each
        segment must start exactly one address after the previous one ends.
        Loading is all-or-nothing: on the first invalid line an error is
        logged, nothing is added to `self.segments`, and an empty list is
        returned.

        :return: `self.segments` on success, an empty list on failure
        """
        logging.info("try to load the segments ... ")
        last = None
        s_tm = time.time()
        # Collected locally so that a failure half-way through the file does
        # not leave a partial segment list behind for start() to write out.
        loaded = []

        lines = self.src_handle.read().splitlines()
        for line in lines:
            logging.info("load segment: `{}`".format(line))
            ps = line.split("|", maxsplit=2)
            if len(ps) != 3:
                logging.error("invalid ip segment line `{}`".format(line))
                return []

            sip = util.check_ip(ps[0])
            if sip == -1:
                logging.error(
                    "invalid ip address `{}` in line `{}`".format(ps[0], line)
                )
                return []
            eip = util.check_ip(ps[1])
            if eip == -1:
                logging.error(
                    "invalid ip address `{}` in line `{}`".format(ps[1], line)
                )
                return []

            if sip > eip:
                logging.error(
                    "start ip({}) should not be greater than end ip({})".format(
                        ps[0], ps[1]
                    )
                )
                return []
            if len(ps[2]) < 1:
                logging.error("empty region info in segment line `{}`".format(line))
                return []

            segment = seg.Segment(sip=sip, eip=eip, reg=ps[2])

            # Check the continuity of data segment
            if last is not None and last.end_ip + 1 != segment.start_ip:
                logging.error(
                    "discontinuous data segment: last.eip+1({})!=seg.sip({}, {})".format(
                        last.end_ip + 1, segment.start_ip, ps[0]
                    )
                )
                return []

            loaded.append(segment)
            last = segment

        self.segments.extend(loaded)
        logging.info(
            "all segments loaded, length: {}, elapsed: {}".format(
                len(self.segments), time.time() - s_tm
            )
        )
        return self.segments

    def set_vector_index(self, ip, ptr):
        """
        Init and refresh the vector index based on the IP pre-two bytes.

        The first byte of `ip` selects the row and the second byte the column.
        The cell's first_ptr is set only while it is still 0; its last_ptr is
        always moved to the end of the index block located at `ptr`.

        :param ip: start ip of the indexed segment, as an integer
        :param ptr: file offset of the segment index block just written
        """
        row, col = (ip >> 24) & 0xFF, (ip >> 16) & 0xFF
        vi_block = self.vector_index[row][col]
        if vi_block.first_ptr == 0:
            vi_block.first_ptr = ptr
        vi_block.last_ptr = ptr + idx.Segment_Index_Block_Size
        self.vector_index[row][col] = vi_block

    def start(self):
        """
        Start to make the 'xdb' binary file.

        Logs an error and returns early when there are no segments, when a
        region is longer than 0xFFFF bytes, or when a region has no cached
        data pointer.
        """
        if len(self.segments) < 1:
            logging.error("empty segment list")
            return

        # 1. Write all the region/data to the binary file, right after the
        #    header and the space reserved for the vector index. Each distinct
        #    region is written once and its offset cached in region_pool.
        self.dst_handle.seek(Header_Info_Length + Vector_Index_Length, 0)
        logging.info("try to write the data block ... ")
        for s in self.segments:
            logging.info("try to write region '{}'...".format(s.region))
            if s.region in self.region_pool:
                logging.info(
                    " --[Cached] with ptr={}".format(self.region_pool[s.region])
                )
                continue
            region = bytes(s.region, encoding="utf-8")
            # The index entry stores the data length in two bytes.
            if len(region) > 0xFFFF:
                logging.error(
                    "too long region info `{}`: should be less than {} bytes".format(
                        s.region, 0xFFFF
                    )
                )
                return
            # Get the first ptr of the next region
            pos = self.dst_handle.seek(0, 1)
            logging.info("{} {} {}".format(pos, region, s.region))
            self.dst_handle.write(region)
            self.region_pool[s.region] = pos
            logging.info(" --[Added] with ptr={}".format(pos))

        # 2. Write the index block and cache the super index block
        logging.info("try to write the segment index block ... ")
        counter, start_index_ptr, end_index_ptr = 0, -1, -1
        for sg in self.segments:
            if sg.region not in self.region_pool:
                logging.error("missing ptr cache for region `{}`".format(sg.region))
                return
            data_len = len(bytes(sg.region, encoding="utf-8"))
            if data_len < 1:
                logging.error("empty region info for segment '{}'".format(sg.region))
                return

            seg_list = sg.split()
            logging.info(
                "try to index segment({} split) {} ...".format(len(seg_list), sg)
            )
            for s in seg_list:
                pos = self.dst_handle.seek(0, 1)
                s_index = idx.SegmentIndexBlock(
                    sip=s.start_ip,
                    eip=s.end_ip,
                    dl=data_len,
                    dp=self.region_pool[sg.region],
                )
                self.dst_handle.write(s_index.encode())
                logging.info(
                    "|-segment index: {}, ptr: {}, segment: {}".format(counter, pos, s)
                )
                self.set_vector_index(s.start_ip, pos)
                counter += 1

                # Check and record the start index ptr
                if start_index_ptr == -1:
                    start_index_ptr = pos
                # The end ptr is the offset of the last index block written.
                end_index_ptr = pos

        # 3. Synchronized the vector index block
        logging.info("try to write the vector index block ... ")
        self.dst_handle.seek(Header_Info_Length, 0)
        for row in self.vector_index:
            for vi in row:
                self.dst_handle.write(vi.encode())

        # 4. Synchronized the segment index info (header bytes 8..16)
        logging.info("try to write the segment index ptr ... ")
        buff = struct.pack("<II", start_index_ptr, end_index_ptr)
        self.dst_handle.seek(8, 0)
        self.dst_handle.write(buff)

        logging.info(
            "write done, dataBlocks: {}, indexBlocks: ({}, {}), indexPtr: ({}, {})".format(
                len(self.region_pool),
                len(self.segments),
                counter,
                start_index_ptr,
                end_index_ptr,
            )
        )

    def end(self):
        """
        End of make the 'xdb' binary file.

        Closes both handles; the destination is closed even when closing the
        source fails. On an IOError the error is logged and the process exits
        with status 1.
        """
        try:
            try:
                self.src_handle.close()
            finally:
                self.dst_handle.close()
        except IOError as e:
            logging.error(e)
            sys.exit(1)
def new_maker(policy: int, srcfile: str, dstfile: str) -> Maker:
    """
    Create a xdb Maker to make the xdb binary file.

    Opens the source file for reading as UTF-8 text and the destination file
    for binary writing. If either open fails, the error is logged and the
    process exits with status 1.

    :param policy: index algorithm code 1:vector, 2:b-tree
    :param srcfile: source ip text file path
    :param dstfile: destination binary xdb file path
    :return: the 'xdb' Maker
    """
    try:
        sh = open(srcfile, mode="r", encoding="utf-8")
    except IOError as e:
        logging.error(e)
        sys.exit(1)

    try:
        dh = open(dstfile, mode="wb")
    except IOError as e:
        # Do not leave the already opened source handle behind.
        sh.close()
        logging.error(e)
        sys.exit(1)

    # Rows x Cols table of empty blocks, indexed as vector_index[row][col].
    vector_index = [
        [idx.VectorIndexBlock() for _ in range(Vector_Index_Cols)]
        for _ in range(Vector_Index_Rows)
    ]
    return Maker(sh=sh, dh=dh, ip=policy, sg=[], rp={}, vi=vector_index)
# Copyright 2022 The Ip2Region Authors. All rights reserved.
# Use of this source code is governed by a Apache2.0-style
# license that can be found in the LICENSE file.
#
# Author: linyufeng <leolin49@foxmail.com>
# Date : 2022/7/14 17:00
#
import xdb.util as util
class Segment:
    """
    An inclusive ip range [start_ip, end_ip], held as integers, with a region text.
    """

    start_ip = 0
    end_ip = 0
    region = ""

    def __init__(self, sip=0, eip=0, reg=""):
        """
        :param sip: start ip as an integer
        :param eip: end ip as an integer
        :param reg: region text attached to the range
        """
        self.start_ip = sip
        self.end_ip = eip
        self.region = reg

    def __str__(self):
        first = util.long2ip(self.start_ip)
        last = util.long2ip(self.end_ip)
        return "{}|{}|{}".format(first, last, self.region)

    def split(self) -> list:
        """
        Split the segment so that no piece crosses a boundary of the first two
        ip bytes.

        Example: "116.31.76.0|117.21.79.49|region" becomes
            116.31.76.0 | 116.31.255.255 | region
            116.32.0.0  | 116.32.255.255 | region
            ...
            116.255.0.0 | 116.255.255.255 | region
            117.0.0.0   | 117.0.255.255 | region
            ...
            117.21.0.0  | 117.21.79.49 | region

        :return: the list of segments after the split, in ascending order
        """
        limit = self.end_ip

        # Pass 1: cut on the first byte. Only the first piece keeps the
        # original low 24 bits; later pieces start at x.0.0.0.
        per_first_byte = []
        carry = self.start_ip
        first_from = (self.start_ip >> 24) & 0xFF
        first_to = (limit >> 24) & 0xFF
        for b1 in range(first_from, first_to + 1):
            low = (b1 << 24) | (carry & 0xFFFFFF)
            high = (b1 << 24) | 0xFFFFFF
            if high >= limit:
                high = limit
            else:
                carry = (b1 + 1) << 24
            per_first_byte.append(Segment(low, high))

        # Pass 2: cut every pass-1 piece on the second byte and attach the
        # region to the final pieces.
        pieces = []
        for chunk in per_first_byte:
            prefix = chunk.start_ip & 0xFF000000
            carry = chunk.start_ip
            second_from = (chunk.start_ip >> 16) & 0xFF
            second_to = (chunk.end_ip >> 16) & 0xFF
            for b2 in range(second_from, second_to + 1):
                block = prefix | (b2 << 16)
                low = block | (carry & 0xFFFF)
                high = block | 0xFFFF
                if high >= limit:
                    high = limit
                else:
                    carry = 0
                pieces.append(Segment(low, high, self.region))
        return pieces
# Copyright 2022 The Ip2Region Authors. All rights reserved.
# Use of this source code is governed by a Apache2.0-style
# license that can be found in the LICENSE file.
#
# Author: linyufeng <leolin49@foxmail.com>
# Date : 2022/7/14 17:00
#
# Left shift applied to each dotted-quad part, most significant first.
_SHIFT_INDEX = (24, 16, 8, 0)


def check_ip(ip: str) -> int:
    """
    Convert a dotted ipv4 string to its integer value.

    :param ip: ip address text such as "1.2.3.4"
    :return: the integer value, or -1 when is_ipv4() rejects the text
    """
    if not is_ipv4(ip):
        return -1
    val = 0
    for part, shift in zip(ip.split("."), _SHIFT_INDEX):
        val |= int(part) << shift
    return val
def long2ip(num: int) -> str:
    """
    Convert an integer to a dotted ipv4 string.

    :param num: value in the range 0..0xFFFFFFFF
    :return: the dotted-quad text, or an empty string when num is out of range
    """
    if not 0 <= num <= 0xFFFFFFFF:
        return ""
    octets = [str((num >> shift) & 0xFF) for shift in (24, 16, 8, 0)]
    return ".".join(octets)
def is_ipv4(ip: str) -> bool:
    """
    Determine whether it is an ipv4 address.

    The text must consist of exactly four dot-separated parts, each made of
    one to three ASCII digits with a value of at most 255.

    :param ip: candidate ip address text
    :return: True when the text is a valid dotted ipv4 address
    """
    parts = ip.split(".")
    if len(parts) != 4:
        return False
    for part in parts:
        if not 1 <= len(part) <= 3:
            return False
        # str.isdigit() also accepts non-ASCII digits: "²" would make int()
        # raise ValueError, and e.g. Arabic-Indic digits would be accepted.
        # Only the ten ASCII digits are allowed here.
        if any(ch not in "0123456789" for ch in part):
            return False
        if int(part) > 255:
            return False
    return True
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册