diff --git a/cmake/taosadapter_CMakeLists.txt.in b/cmake/taosadapter_CMakeLists.txt.in index 13b247770ea7eef6b64209ca98787ff6d733bf85..d1560574593f613d2f9cf5f486a22d24c32764f9 100644 --- a/cmake/taosadapter_CMakeLists.txt.in +++ b/cmake/taosadapter_CMakeLists.txt.in @@ -2,7 +2,7 @@ # taosadapter ExternalProject_Add(taosadapter GIT_REPOSITORY https://github.com/taosdata/taosadapter.git - GIT_TAG 213f8b3 + GIT_TAG 3e08996 SOURCE_DIR "${TD_SOURCE_DIR}/tools/taosadapter" BINARY_DIR "" #BUILD_IN_SOURCE TRUE diff --git a/cmake/taostools_CMakeLists.txt.in b/cmake/taostools_CMakeLists.txt.in index a69e1578c0acee7fe4bfa526743f8a234c0a27c3..13a81f88eab42c64be7ea0cf759da21ddce7a456 100644 --- a/cmake/taostools_CMakeLists.txt.in +++ b/cmake/taostools_CMakeLists.txt.in @@ -2,7 +2,7 @@ # taos-tools ExternalProject_Add(taos-tools GIT_REPOSITORY https://github.com/taosdata/taos-tools.git - GIT_TAG 7d24ed5 + GIT_TAG 0cd564a SOURCE_DIR "${TD_SOURCE_DIR}/tools/taos-tools" BINARY_DIR "" #BUILD_IN_SOURCE TRUE diff --git a/docs/examples/go/go.mod b/docs/examples/go/go.mod deleted file mode 100644 index 2bc1a74cb6ef14221fa384701773dc73fe3b161d..0000000000000000000000000000000000000000 --- a/docs/examples/go/go.mod +++ /dev/null @@ -1,6 +0,0 @@ -module goexample - -go 1.17 - -require github.com/taosdata/driver-go/v3 3.0 - diff --git a/docs/examples/python/conn_native_pandas.py b/docs/examples/python/conn_native_pandas.py index 56942ef57085766cd128b03cabb7a357587eab16..f3bab15efbe6669a88828fb194682dbfedb382df 100644 --- a/docs/examples/python/conn_native_pandas.py +++ b/docs/examples/python/conn_native_pandas.py @@ -1,8 +1,11 @@ import pandas -from sqlalchemy import create_engine +from sqlalchemy import create_engine, text engine = create_engine("taos://root:taosdata@localhost:6030/power") -df = pandas.read_sql("SELECT * FROM meters", engine) +conn = engine.connect() +df = pandas.read_sql(text("SELECT * FROM power.meters"), conn) +conn.close() + # print index print(df.index) diff --git a/docs/examples/python/conn_rest_pandas.py b/docs/examples/python/conn_rest_pandas.py index 0164080cd5a05e72dce40b1d111ea423623ff9b2..1b207d6ff10a353f3473116ce807cc8daf362ca7 100644 --- a/docs/examples/python/conn_rest_pandas.py +++ b/docs/examples/python/conn_rest_pandas.py @@ -1,8 +1,10 @@ import pandas -from sqlalchemy import create_engine +from sqlalchemy import create_engine, text engine = create_engine("taosrest://root:taosdata@localhost:6041") -df: pandas.DataFrame = pandas.read_sql("SELECT * FROM power.meters", engine) +conn = engine.connect() +df: pandas.DataFrame = pandas.read_sql(text("SELECT * FROM power.meters"), conn) +conn.close() # print index print(df.index) diff --git a/docs/examples/python/connect_rest_examples.py b/docs/examples/python/connect_rest_examples.py index 900ec1022ec81ac2db761d918d1ec11c9bb26852..0f8625ae5387a275f7b84948ad80191b8e443862 100644 --- a/docs/examples/python/connect_rest_examples.py +++ b/docs/examples/python/connect_rest_examples.py @@ -1,24 +1,25 @@ # ANCHOR: connect from taosrest import connect, TaosRestConnection, TaosRestCursor -conn: TaosRestConnection = connect(url="http://localhost:6041", - user="root", - password="taosdata", - timeout=30) +conn = connect(url="http://localhost:6041", + user="root", + password="taosdata", + timeout=30) # ANCHOR_END: connect # ANCHOR: basic # create STable -cursor: TaosRestCursor = conn.cursor() +cursor = conn.cursor() cursor.execute("DROP DATABASE IF EXISTS power") cursor.execute("CREATE DATABASE power") -cursor.execute("CREATE STABLE power.meters (ts TIMESTAMP, current FLOAT, voltage INT, phase FLOAT) TAGS (location BINARY(64), groupId INT)") +cursor.execute( + "CREATE STABLE power.meters (ts TIMESTAMP, current FLOAT, voltage INT, phase FLOAT) TAGS (location BINARY(64), groupId INT)") # insert data -cursor.execute("""INSERT INTO power.d1001 USING power.meters TAGS(California.SanFrancisco, 2) VALUES ('2018-10-03 14:38:05.000', 10.30000, 219, 0.31000) ('2018-10-03 14:38:15.000', 12.60000, 218, 0.33000) ('2018-10-03 14:38:16.800', 12.30000, 221, 0.31000) - power.d1002 USING power.meters TAGS(California.SanFrancisco, 3) VALUES ('2018-10-03 14:38:16.650', 10.30000, 218, 0.25000) - power.d1003 USING power.meters TAGS(California.LosAngeles, 2) VALUES ('2018-10-03 14:38:05.500', 11.80000, 221, 0.28000) ('2018-10-03 14:38:16.600', 13.40000, 223, 0.29000) - power.d1004 USING power.meters TAGS(California.LosAngeles, 3) VALUES ('2018-10-03 14:38:05.000', 10.80000, 223, 0.29000) ('2018-10-03 14:38:06.500', 11.50000, 221, 0.35000)""") +cursor.execute("""INSERT INTO power.d1001 USING power.meters TAGS('California.SanFrancisco', 2) VALUES ('2018-10-03 14:38:05.000', 10.30000, 219, 0.31000) ('2018-10-03 14:38:15.000', 12.60000, 218, 0.33000) ('2018-10-03 14:38:16.800', 12.30000, 221, 0.31000) + power.d1002 USING power.meters TAGS('California.SanFrancisco', 3) VALUES ('2018-10-03 14:38:16.650', 10.30000, 218, 0.25000) + power.d1003 USING power.meters TAGS('California.LosAngeles', 2) VALUES ('2018-10-03 14:38:05.500', 11.80000, 221, 0.28000) ('2018-10-03 14:38:16.600', 13.40000, 223, 0.29000) + power.d1004 USING power.meters TAGS('California.LosAngeles', 3) VALUES ('2018-10-03 14:38:05.000', 10.80000, 223, 0.29000) ('2018-10-03 14:38:06.500', 11.50000, 221, 0.35000)""") print("inserted row count:", cursor.rowcount) # query data @@ -28,7 +29,7 @@ print("queried row count:", cursor.rowcount) # get column names from cursor column_names = [meta[0] for meta in cursor.description] # get rows -data: list[tuple] = cursor.fetchall() +data = cursor.fetchall() print(column_names) for row in data: print(row) diff --git a/docs/examples/python/connection_usage_native_reference.py b/docs/examples/python/connection_usage_native_reference.py index 4803511e427bf4d906fd3a14ff6faf5a000da96c..0a23c5f95b9d0f113e861aae07255c46bb5ae0a5 100644 --- a/docs/examples/python/connection_usage_native_reference.py +++ b/docs/examples/python/connection_usage_native_reference.py @@ -8,7 +8,7 @@ conn.execute("CREATE DATABASE test") # change database. same as execute "USE db" conn.select_db("test") conn.execute("CREATE STABLE weather(ts TIMESTAMP, temperature FLOAT) TAGS (location INT)") -affected_row: int = conn.execute("INSERT INTO t1 USING weather TAGS(1) VALUES (now, 23.5) (now+1m, 23.5) (now+2m 24.4)") +affected_row = conn.execute("INSERT INTO t1 USING weather TAGS(1) VALUES (now, 23.5) (now+1m, 23.5) (now+2m, 24.4)") print("affected_row", affected_row) # output: # affected_row 3 @@ -16,10 +16,10 @@ print("affected_row", affected_row) # ANCHOR: query # Execute a sql and get its result set. It's useful for SELECT statement -result: taos.TaosResult = conn.query("SELECT * from weather") +result = conn.query("SELECT * from weather") # Get fields from result -fields: taos.field.TaosFields = result.fields +fields = result.fields for field in fields: print(field) # {name: ts, type: 9, bytes: 8} @@ -42,4 +42,4 @@ print(data) # ANCHOR_END: query -conn.close() +conn.close() \ No newline at end of file diff --git a/docs/examples/python/fast_write_example.py b/docs/examples/python/fast_write_example.py index c9d606388fdecd85f1468f24cc497ecc5941f035..626e3310b120b9415952614b4b110ed29f787582 100644 --- a/docs/examples/python/fast_write_example.py +++ b/docs/examples/python/fast_write_example.py @@ -1,15 +1,14 @@ # install dependencies: # recommend python >= 3.8 -# pip3 install faster-fifo # import logging import math +import multiprocessing import sys import time import os -from multiprocessing import Process -from faster_fifo import Queue +from multiprocessing import Process, Queue from mockdatasource import MockDataSource from queue import Empty from typing import List @@ -22,8 +21,7 @@ TABLE_COUNT = 1000 QUEUE_SIZE = 1000000 MAX_BATCH_SIZE = 3000 -read_processes = [] -write_processes = [] +_DONE_MESSAGE = '__DONE__' def get_connection(): @@ -44,41 +42,64 @@ def get_connection(): # ANCHOR: read -def run_read_task(task_id: int, task_queues: List[Queue]): +def run_read_task(task_id: int, task_queues: List[Queue], infinity): table_count_per_task = TABLE_COUNT // READ_TASK_COUNT - data_source = MockDataSource(f"tb{task_id}", table_count_per_task) + data_source = MockDataSource(f"tb{task_id}", table_count_per_task, infinity) try: for batch in data_source: + if isinstance(batch, tuple): + batch = [batch] for table_id, rows in batch: # hash data to different queue i = table_id % len(task_queues) # block putting forever when the queue is full - task_queues[i].put_many(rows, block=True, timeout=-1) + for row in rows: + task_queues[i].put(row) + if not infinity: + for queue in task_queues: + queue.put(_DONE_MESSAGE) except KeyboardInterrupt: pass + finally: + logging.info('read task over') # ANCHOR_END: read + # ANCHOR: write -def run_write_task(task_id: int, queue: Queue): +def run_write_task(task_id: int, queue: Queue, done_queue: Queue): from sql_writer import SQLWriter log = logging.getLogger(f"WriteTask-{task_id}") writer = SQLWriter(get_connection) lines = None try: while True: - try: - # get as many as possible - lines = queue.get_many(block=False, max_messages_to_get=MAX_BATCH_SIZE) + over = False + lines = [] + for _ in range(MAX_BATCH_SIZE): + try: + line = queue.get_nowait() + if line == _DONE_MESSAGE: + over = True + break + if line: + lines.append(line) + except Empty: + time.sleep(0.1) + if len(lines) > 0: writer.process_lines(lines) - except Empty: - time.sleep(0.01) + if over: + done_queue.put(_DONE_MESSAGE) + break except KeyboardInterrupt: pass except BaseException as e: log.debug(f"lines={lines}") raise e + finally: + writer.close() + log.debug('write task over') # ANCHOR_END: write @@ -103,47 +124,64 @@ def set_global_config(): # ANCHOR: monitor -def run_monitor_process(): +def run_monitor_process(done_queue: Queue): log = logging.getLogger("DataBaseMonitor") - conn = get_connection() - conn.execute("DROP DATABASE IF EXISTS test") - conn.execute("CREATE DATABASE test") - conn.execute("CREATE STABLE test.meters (ts TIMESTAMP, current FLOAT, voltage INT, phase FLOAT) " - "TAGS (location BINARY(64), groupId INT)") + conn = None + try: + conn = get_connection() - def get_count(): - res = conn.query("SELECT count(*) FROM test.meters") - rows = res.fetch_all() - return rows[0][0] if rows else 0 + def get_count(): + res = conn.query("SELECT count(*) FROM test.meters") + rows = res.fetch_all() + return rows[0][0] if rows else 0 - last_count = 0 - while True: - time.sleep(10) - count = get_count() - log.info(f"count={count} speed={(count - last_count) / 10}") - last_count = count + last_count = 0 + while True: + try: + done = done_queue.get_nowait() + if done == _DONE_MESSAGE: + break + except Empty: + pass + time.sleep(10) + count = get_count() + log.info(f"count={count} speed={(count - last_count) / 10}") + last_count = count + finally: + conn.close() # ANCHOR_END: monitor # ANCHOR: main -def main(): +def main(infinity): set_global_config() logging.info(f"READ_TASK_COUNT={READ_TASK_COUNT}, WRITE_TASK_COUNT={WRITE_TASK_COUNT}, " f"TABLE_COUNT={TABLE_COUNT}, QUEUE_SIZE={QUEUE_SIZE}, MAX_BATCH_SIZE={MAX_BATCH_SIZE}") - monitor_process = Process(target=run_monitor_process) + conn = get_connection() + conn.execute("DROP DATABASE IF EXISTS test") + conn.execute("CREATE DATABASE IF NOT EXISTS test") + conn.execute("CREATE STABLE IF NOT EXISTS test.meters (ts TIMESTAMP, current FLOAT, voltage INT, phase FLOAT) " + "TAGS (location BINARY(64), groupId INT)") + conn.close() + + done_queue = Queue() + monitor_process = Process(target=run_monitor_process, args=(done_queue,)) monitor_process.start() - time.sleep(3) # waiting for database ready. + logging.debug(f"monitor task started with pid {monitor_process.pid}") task_queues: List[Queue] = [] + write_processes = [] + read_processes = [] + # create task queues for i in range(WRITE_TASK_COUNT): - queue = Queue(max_size_bytes=QUEUE_SIZE) + queue = Queue() task_queues.append(queue) # create write processes for i in range(WRITE_TASK_COUNT): - p = Process(target=run_write_task, args=(i, task_queues[i])) + p = Process(target=run_write_task, args=(i, task_queues[i], done_queue)) p.start() logging.debug(f"WriteTask-{i} started with pid {p.pid}") write_processes.append(p) @@ -151,13 +189,19 @@ def main(): # create read processes for i in range(READ_TASK_COUNT): queues = assign_queues(i, task_queues) - p = Process(target=run_read_task, args=(i, queues)) + p = Process(target=run_read_task, args=(i, queues, infinity)) p.start() logging.debug(f"ReadTask-{i} started with pid {p.pid}") read_processes.append(p) try: monitor_process.join() + for p in read_processes: + p.join() + for p in write_processes: + p.join() + time.sleep(1) + return except KeyboardInterrupt: monitor_process.terminate() [p.terminate() for p in read_processes] @@ -176,5 +220,6 @@ def assign_queues(read_task_id, task_queues): if __name__ == '__main__': - main() + multiprocessing.set_start_method('spawn') + main(False) # ANCHOR_END: main diff --git a/docs/examples/python/kafka_example.py b/docs/examples/python/kafka_example.py index 735059eec0f3dcf5094810916e66a39db5682560..a89287d372d505225e2d679e7f205a3857014533 100644 --- a/docs/examples/python/kafka_example.py +++ b/docs/examples/python/kafka_example.py @@ -26,7 +26,8 @@ class Consumer(object): 'bath_consume': True, 'batch_size': 1000, 'async_model': True, - 'workers': 10 + 'workers': 10, + 'testing': False } LOCATIONS = ['California.SanFrancisco', 'California.LosAngles', 'California.SanDiego', 'California.SanJose', @@ -46,11 +47,12 @@ class Consumer(object): def __init__(self, **configs): self.config: dict = self.DEFAULT_CONFIGS self.config.update(configs) - self.consumer = KafkaConsumer( - self.config.get('kafka_topic'), # topic - bootstrap_servers=self.config.get('kafka_brokers'), - group_id=self.config.get('kafka_group_id'), - ) + if not self.config.get('testing'): + self.consumer = KafkaConsumer( + self.config.get('kafka_topic'), # topic + bootstrap_servers=self.config.get('kafka_brokers'), + group_id=self.config.get('kafka_group_id'), + ) self.taos = taos.connect( host=self.config.get('taos_host'), user=self.config.get('taos_user'), @@ -60,7 +62,7 @@ class Consumer(object): ) if self.config.get('async_model'): self.pool = ThreadPoolExecutor(max_workers=self.config.get('workers')) - self.tasks: list[Future] = [] + self.tasks = [] # tags and table mapping # key: {location}_{groupId} value: self.tag_table_mapping = {} i = 0 @@ -104,8 +106,8 @@ class Consumer(object): for task in self.tasks: while not task.done(): pass - if self.pool is not None: - self.pool.shutdown() + if self.pool is not None: + self.pool.shutdown() # clean data if self.config.get('clean_after_testing'): @@ -115,14 +117,14 @@ class Consumer(object): if self.taos is not None: self.taos.close() - def _run(self, f: Callable[[ConsumerRecord], bool]): + def _run(self, f): for message in self.consumer: if self.config.get('async_model'): self.pool.submit(f(message)) else: f(message) - def _run_batch(self, f: Callable[[list[list[ConsumerRecord]]], None]): + def _run_batch(self, f): while True: messages = self.consumer.poll(timeout_ms=500, max_records=self.config.get('batch_size')) if messages: @@ -140,7 +142,7 @@ class Consumer(object): logging.info('## insert sql %s', sql) return self.taos.execute(sql=sql) == 1 - def _to_taos_batch(self, messages: list[list[ConsumerRecord]]): + def _to_taos_batch(self, messages): sql = self._build_sql_batch(messages=messages) if len(sql) == 0: # decode error, skip return @@ -162,7 +164,7 @@ class Consumer(object): table_name = self._get_table_name(location=location, group_id=group_id) return self.INSERT_PART_SQL.format(table_name, ts, current, voltage, phase) - def _build_sql_batch(self, messages: list[list[ConsumerRecord]]) -> str: + def _build_sql_batch(self, messages) -> str: sql_list = [] for partition_messages in messages: for message in partition_messages: @@ -186,7 +188,55 @@ def _get_location_and_group(key: str) -> (str, int): return fields[0], fields[1] +def test_to_taos(consumer: Consumer): + msg = { + 'location': 'California.SanFrancisco', + 'groupId': 1, + 'ts': '2022-12-06 15:13:38.643', + 'current': 3.41, + 'voltage': 105, + 'phase': 0.02027, + } + record = ConsumerRecord(checksum=None, headers=None, offset=1, key=None, value=json.dumps(msg), partition=1, + topic='test', serialized_key_size=None, serialized_header_size=None, + serialized_value_size=None, timestamp=time.time(), timestamp_type=None) + assert consumer._to_taos(message=record) + + +def test_to_taos_batch(consumer: Consumer): + records = [ + [ + ConsumerRecord(checksum=None, headers=None, offset=1, key=None, + value=json.dumps({'location': 'California.SanFrancisco', + 'groupId': 1, + 'ts': '2022-12-06 15:13:38.643', + 'current': 3.41, + 'voltage': 105, + 'phase': 0.02027, }), + partition=1, topic='test', serialized_key_size=None, serialized_header_size=None, + serialized_value_size=None, timestamp=time.time(), timestamp_type=None), + ConsumerRecord(checksum=None, headers=None, offset=1, key=None, + value=json.dumps({'location': 'California.LosAngles', + 'groupId': 2, + 'ts': '2022-12-06 15:13:39.643', + 'current': 3.41, + 'voltage': 102, + 'phase': 0.02027, }), + partition=1, topic='test', serialized_key_size=None, serialized_header_size=None, + serialized_value_size=None, timestamp=time.time(), timestamp_type=None), + ] + ] + + consumer._to_taos_batch(messages=records) + + if __name__ == '__main__': - consumer = Consumer(async_model=True) + consumer = Consumer(async_model=True, testing=True) + # init env consumer.init_env() - consumer.consume() \ No newline at end of file + # consumer.consume() + # test build sql + # test build sql batch + test_to_taos(consumer) + test_to_taos_batch(consumer) + \ No newline at end of file diff --git a/docs/examples/python/mockdatasource.py b/docs/examples/python/mockdatasource.py index 1c516a800e007934f8e6815f82024a53fea70073..15a7d2ff8c5b1febefcf4f81f481220947257b50 100644 --- a/docs/examples/python/mockdatasource.py +++ b/docs/examples/python/mockdatasource.py @@ -10,13 +10,14 @@ class MockDataSource: "9.4,118,0.141,California.SanFrancisco,4" ] - def __init__(self, tb_name_prefix, table_count): + def __init__(self, tb_name_prefix, table_count, infinity=True): self.table_name_prefix = tb_name_prefix + "_" self.table_count = table_count self.max_rows = 10000000 self.current_ts = round(time.time() * 1000) - self.max_rows * 100 # [(tableId, tableName, values),] self.data = self._init_data() + self.infinity = infinity def _init_data(self): lines = self.samples * (self.table_count // 5 + 1) @@ -28,14 +29,19 @@ class MockDataSource: def __iter__(self): self.row = 0 - return self + if not self.infinity: + return iter(self._iter_data()) + else: + return self def __next__(self): """ next 1000 rows for each table. return: {tableId:[row,...]} """ - # generate 1000 timestamps + return self._iter_data() + + def _iter_data(self): ts = [] for _ in range(1000): self.current_ts += 100 @@ -47,3 +53,10 @@ class MockDataSource: rows = [table_name + ',' + t + ',' + values for t in ts] result.append((table_id, rows)) return result + + +if __name__ == '__main__': + datasource = MockDataSource('t', 10, False) + for data in datasource: + print(data) + \ No newline at end of file diff --git a/docs/examples/python/sql_writer.py b/docs/examples/python/sql_writer.py index 758167376b009f21afc701be7d89c1bfbabdeb9f..db51bb71744c89ed130370f611f655086de0ef74 100644 --- a/docs/examples/python/sql_writer.py +++ b/docs/examples/python/sql_writer.py @@ -10,6 +10,7 @@ class SQLWriter: self._tb_tags = {} self._conn = get_connection_func() self._max_sql_length = self.get_max_sql_length() + self._conn.execute("create database if not exists test") self._conn.execute("USE test") def get_max_sql_length(self): @@ -20,7 +21,7 @@ class SQLWriter: return int(r[1]) return 1024 * 1024 - def process_lines(self, lines: str): + def process_lines(self, lines: [str]): """ :param lines: [[tbName,ts,current,voltage,phase,location,groupId]] """ @@ -60,6 +61,7 @@ class SQLWriter: buf.append(q) sql_len += len(q) sql += " ".join(buf) + self.create_tables() self.execute_sql(sql) self._tb_values.clear() @@ -88,3 +90,23 @@ class SQLWriter: except BaseException as e: self.log.error("Execute SQL: %s", sql) raise e + + def close(self): + if self._conn: + self._conn.close() + + +if __name__ == '__main__': + def get_connection_func(): + conn = taos.connect() + return conn + + + writer = SQLWriter(get_connection_func=get_connection_func) + writer.execute_sql( + "create stable if not exists meters (ts timestamp, current float, voltage int, phase float) " + "tags (location binary(64), groupId int)") + writer.execute_sql( + "INSERT INTO d21001 USING meters TAGS ('California.SanFrancisco', 2) " + "VALUES ('2021-07-13 14:06:32.272', 10.2, 219, 0.32)") + \ No newline at end of file diff --git a/docs/examples/python/tmq_example.py b/docs/examples/python/tmq_example.py index a4625ca11accfbf7d263f4c1993f712987a136cb..32778e9f25f4a8b58007f39d4aeac53c30fe6895 100644 --- a/docs/examples/python/tmq_example.py +++ b/docs/examples/python/tmq_example.py @@ -1,58 +1,55 @@ +from taos.tmq import Consumer import taos -from taos.tmq import * -conn = taos.connect() -print("init") -conn.execute("drop topic if exists topic_ctb_column") -conn.execute("drop database if exists py_tmq") -conn.execute("create database if not exists py_tmq vgroups 2") -conn.select_db("py_tmq") -conn.execute( - "create stable if not exists stb1 (ts timestamp, c1 int, c2 float, c3 binary(10)) tags(t1 int)" -) -conn.execute("create table if not exists tb1 using stb1 tags(1)") -conn.execute("create table if not exists tb2 using stb1 tags(2)") -conn.execute("create table if not exists tb3 using stb1 tags(3)") - -print("create topic") -conn.execute( - "create topic if not exists topic_ctb_column as select ts, c1, c2, c3 from stb1" -) - -print("build consumer") -conf = TaosTmqConf() -conf.set("group.id", "tg2") -conf.set("td.connect.user", "root") -conf.set("td.connect.pass", "taosdata") -conf.set("enable.auto.commit", "true") - - -def tmq_commit_cb_print(tmq, resp, offset, param=None): - print(f"commit: {resp}, tmq: {tmq}, offset: {offset}, param: {param}") - - -conf.set_auto_commit_cb(tmq_commit_cb_print, None) -tmq = conf.new_consumer() - -print("build topic list") - -topic_list = TaosTmqList() -topic_list.append("topic_ctb_column") - -print("basic consume loop") -tmq.subscribe(topic_list) - -sub_list = tmq.subscription() - -print("subscribed topics: ", sub_list) - -while 1: - res = tmq.poll(1000) - if res: - topic = res.get_topic_name() - vg = res.get_vgroup_id() - db = res.get_db_name() - print(f"topic: {topic}\nvgroup id: {vg}\ndb: {db}") - for row in res: - print(row) +def init_tmq_env(db, topic): + conn = taos.connect() + conn.execute("drop topic if exists {}".format(topic)) + conn.execute("drop database if exists {}".format(db)) + conn.execute("create database if not exists {}".format(db)) + conn.select_db(db) + conn.execute( + "create stable if not exists stb1 (ts timestamp, c1 int, c2 float, c3 varchar(16)) tags(t1 int, t3 varchar(16))") + conn.execute("create table if not exists tb1 using stb1 tags(1, 't1')") + conn.execute("create table if not exists tb2 using stb1 tags(2, 't2')") + conn.execute("create table if not exists tb3 using stb1 tags(3, 't3')") + conn.execute("create topic if not exists {} as select ts, c1, c2, c3 from stb1".format(topic)) + conn.execute("insert into tb1 values (now, 1, 1.0, 'tmq test')") + conn.execute("insert into tb2 values (now, 2, 2.0, 'tmq test')") + conn.execute("insert into tb3 values (now, 3, 3.0, 'tmq test')") + + +def cleanup(db, topic): + conn = taos.connect() + conn.execute("drop topic if exists {}".format(topic)) + conn.execute("drop database if exists {}".format(db)) + + +if __name__ == '__main__': + init_tmq_env("tmq_test", "tmq_test_topic") # init env + consumer = Consumer( + { + "group.id": "tg2", + "td.connect.user": "root", + "td.connect.pass": "taosdata", + "enable.auto.commit": "true", + } + ) + consumer.subscribe(["tmq_test_topic"]) + + try: + while True: + res = consumer.poll(1) + if not res: + break + err = res.error() + if err is not None: + raise err + val = res.value() + + for block in val: + print(block.fetchall()) + finally: + consumer.unsubscribe() + consumer.close() + cleanup("tmq_test", "tmq_test_topic") \ No newline at end of file diff --git a/include/libs/wal/wal.h b/include/libs/wal/wal.h index a1ae1e429dd5dbb18f6521b263576e7096482327..a0f421212a56603402c61c9bb2763a3d1e7cee1c 100644 --- a/include/libs/wal/wal.h +++ b/include/libs/wal/wal.h @@ -201,6 +201,7 @@ int32_t walFetchHead(SWalReader *pRead, int64_t ver, SWalCkHead *pHead); int32_t walFetchBody(SWalReader *pRead, SWalCkHead **ppHead); int32_t walSkipFetchBody(SWalReader *pRead, const SWalCkHead *pHead); +SWalRef *walRefFirstVer(SWal *, SWalRef *); SWalRef *walRefCommittedVer(SWal *); SWalRef *walOpenRef(SWal *); diff --git a/packaging/tools/install.sh b/packaging/tools/install.sh index 4be179b04d7241895c08b5b1d0e62ebef9603d09..dfdbaa6fdd634679d45b1fa3eeaa5aabdea36a23 100755 --- a/packaging/tools/install.sh +++ b/packaging/tools/install.sh @@ -210,8 +210,8 @@ function install_bin() { [ -x ${install_main_dir}/bin/${serverName} ] && ${csudo}ln -s ${install_main_dir}/bin/${serverName} ${bin_link_dir}/${serverName} || : [ -x ${install_main_dir}/bin/${udfdName} ] && ${csudo}ln -s ${install_main_dir}/bin/${udfdName} ${bin_link_dir}/${udfdName} || : [ -x ${install_main_dir}/bin/${adapterName} ] && ${csudo}ln -s ${install_main_dir}/bin/${adapterName} ${bin_link_dir}/${adapterName} || : - [ -x ${install_main_dir}/bin/${benchmarkName} ] && ${csudo}ln -s ${install_main_dir}/bin/${benchmarkName} ${bin_link_dir}/${demoName} || : - [ -x ${install_main_dir}/bin/${benchmarkName} ] && ${csudo}ln -s ${install_main_dir}/bin/${benchmarkName} ${bin_link_dir}/${benchmarkName} || : + [ -x ${install_main_dir}/bin/${benchmarkName} ] && ${csudo}ln -sf ${install_main_dir}/bin/${benchmarkName} ${bin_link_dir}/${demoName} || : + [ -x ${install_main_dir}/bin/${benchmarkName} ] && ${csudo}ln -sf ${install_main_dir}/bin/${benchmarkName} ${bin_link_dir}/${benchmarkName} || : [ -x ${install_main_dir}/bin/${dumpName} ] && ${csudo}ln -s ${install_main_dir}/bin/${dumpName} ${bin_link_dir}/${dumpName} || : [ -x ${install_main_dir}/bin/${xname} ] && ${csudo}ln -s ${install_main_dir}/bin/${xname} ${bin_link_dir}/${xname} || : [ -x ${install_main_dir}/bin/TDinsight.sh ] && ${csudo}ln -s ${install_main_dir}/bin/TDinsight.sh ${bin_link_dir}/TDinsight.sh || : @@ -746,7 +746,7 @@ function is_version_compatible() { deb_erase() { confirm="" while [ "" == "${confirm}" ]; do - echo -e -n "${RED}Exist tdengine deb detected, do you want to remove it? [yes|no] ${NC}:" + echo -e -n "${RED}Existing TDengine deb is detected, do you want to remove it? [yes|no] ${NC}:" read confirm if [ "yes" == "$confirm" ]; then ${csudo}dpkg --remove tdengine ||: @@ -760,7 +760,7 @@ deb_erase() { rpm_erase() { confirm="" while [ "" == "${confirm}" ]; do - echo -e -n "${RED}Exist tdengine rpm detected, do you want to remove it? [yes|no] ${NC}:" + echo -e -n "${RED}Existing TDengine rpm is detected, do you want to remove it? [yes|no] ${NC}:" read confirm if [ "yes" == "$confirm" ]; then ${csudo}rpm -e tdengine ||: @@ -787,7 +787,7 @@ function updateProduct() { if echo $osinfo | grep -qwi "centos"; then rpm -q tdengine 2>&1 > /dev/null && rpm_erase tdengine ||: elif echo $osinfo | grep -qwi "ubuntu"; then - dpkg -l tdengine 2>&1 > /dev/null && deb_erase tdengine ||: + dpkg -l tdengine 2>&1 | grep ii > /dev/null && deb_erase tdengine ||: fi tar -zxf ${tarName} diff --git a/source/dnode/mgmt/mgmt_vnode/src/vmInt.c b/source/dnode/mgmt/mgmt_vnode/src/vmInt.c index 99ba9b9b3bcc1e0efe9d1a1bb987e01c89786374..951544c76625ab8091cadd6e944cd702ca5e3a44 100644 --- a/source/dnode/mgmt/mgmt_vnode/src/vmInt.c +++ b/source/dnode/mgmt/mgmt_vnode/src/vmInt.c @@ -79,8 +79,6 @@ int32_t vmOpenVnode(SVnodeMgmt *pMgmt, SWrapperCfg *pCfg, SVnode *pImpl) { void vmCloseVnode(SVnodeMgmt *pMgmt, SVnodeObj *pVnode) { char path[TSDB_FILENAME_LEN] = {0}; - vnodeProposeCommitOnNeed(pVnode->pImpl); - taosThreadRwlockWrlock(&pMgmt->lock); taosHashRemove(pMgmt->hash, &pVnode->vgId, sizeof(int32_t)); taosThreadRwlockUnlock(&pMgmt->lock); diff --git a/source/dnode/vnode/src/inc/tsdb.h b/source/dnode/vnode/src/inc/tsdb.h index 5a2e462c8cd69598ee8642c7109983c97c034294..2efb00ae32c510211a0ac0d6c44450fa1b48464f 100644 --- a/source/dnode/vnode/src/inc/tsdb.h +++ b/source/dnode/vnode/src/inc/tsdb.h @@ -202,6 +202,7 @@ int32_t tsdbCmprColData(SColData *pColData, int8_t cmprAlg, SBlockCol *pBlockCol uint8_t **ppBuf); int32_t tsdbDecmprColData(uint8_t *pIn, SBlockCol *pBlockCol, int8_t cmprAlg, int32_t nVal, SColData *pColData, uint8_t **ppBuf); +int32_t tRowInfoCmprFn(const void *p1, const void *p2); // tsdbMemTable ============================================================================================== // SMemTable int32_t tsdbMemTableCreate(STsdb *pTsdb, SMemTable **ppMemTable); diff --git a/source/dnode/vnode/src/inc/vnodeInt.h b/source/dnode/vnode/src/inc/vnodeInt.h index 75367883f1d82edf349bbccbee5552dcf299a003..2501af7f04203f895513cc22aacbe63d0fb64b8d 100644 --- a/source/dnode/vnode/src/inc/vnodeInt.h +++ b/source/dnode/vnode/src/inc/vnodeInt.h @@ -247,7 +247,7 @@ int32_t tsdbSnapReaderClose(STsdbSnapReader** ppReader); int32_t tsdbSnapRead(STsdbSnapReader* pReader, uint8_t** ppData); // STsdbSnapWriter ======================================== int32_t tsdbSnapWriterOpen(STsdb* pTsdb, int64_t sver, int64_t ever, STsdbSnapWriter** ppWriter); -int32_t tsdbSnapWrite(STsdbSnapWriter* pWriter, uint8_t* pData, uint32_t nData); +int32_t tsdbSnapWrite(STsdbSnapWriter* pWriter, SSnapDataHdr* pHdr); int32_t tsdbSnapWriterPrepareClose(STsdbSnapWriter* pWriter); int32_t tsdbSnapWriterClose(STsdbSnapWriter** ppWriter, int8_t rollback); // STqSnapshotReader == diff --git a/source/dnode/vnode/src/sma/smaSnapshot.c b/source/dnode/vnode/src/sma/smaSnapshot.c index 34f884f9f95776be10f1416fdf50f99a8fafce6e..0a6fac0fe7c2d6c724855907e61f498a82abbd57 100644 --- a/source/dnode/vnode/src/sma/smaSnapshot.c +++ b/source/dnode/vnode/src/sma/smaSnapshot.c @@ -423,10 +423,10 @@ int32_t rsmaSnapWrite(SRSmaSnapWriter* pWriter, uint8_t* pData, uint32_t nData) // rsma1/rsma2 if (pHdr->type == SNAP_DATA_RSMA1) { pHdr->type = SNAP_DATA_TSDB; - code = tsdbSnapWrite(pWriter->pDataWriter[0], pData, nData); + code = tsdbSnapWrite(pWriter->pDataWriter[0], pHdr); } else if (pHdr->type == SNAP_DATA_RSMA2) { pHdr->type = SNAP_DATA_TSDB; - code = tsdbSnapWrite(pWriter->pDataWriter[1], pData, nData); + code = tsdbSnapWrite(pWriter->pDataWriter[1], pHdr); } else if (pHdr->type == SNAP_DATA_QTASK) { code = rsmaSnapWriteQTaskInfo(pWriter, pData, nData); } else { diff --git a/source/dnode/vnode/src/tq/tq.c b/source/dnode/vnode/src/tq/tq.c index 1d5fae33eb8049633c8c4b568de98da68ead3d68..b195cfafb00864b353cf2664dc347efeafb7d9fa 100644 --- a/source/dnode/vnode/src/tq/tq.c +++ b/source/dnode/vnode/src/tq/tq.c @@ -521,7 +521,12 @@ int32_t tqProcessPollReq(STQ* pTq, SRpcMsg* pMsg) { tqOffsetResetToData(&fetchOffsetNew, 0, 0); } } else { - tqOffsetResetToLog(&fetchOffsetNew, walGetFirstVer(pTq->pVnode->pWal)); + pHandle->pRef = walRefFirstVer(pTq->pVnode->pWal, pHandle->pRef); + if (pHandle->pRef == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return -1; + } + tqOffsetResetToLog(&fetchOffsetNew, pHandle->pRef->refVer - 1); } } else if (reqOffset.type == TMQ_OFFSET__RESET_LATEST) { if (pHandle->execHandle.subType == TOPIC_SUB_TYPE__COLUMN) { @@ -719,6 +724,8 @@ int32_t tqProcessPollReq(STQ* pTq, SRpcMsg* pMsg) { int32_t tqProcessDeleteSubReq(STQ* pTq, int64_t version, char* msg, int32_t msgLen) { SMqVDeleteReq* pReq = (SMqVDeleteReq*)msg; + tqDebug("vgId:%d, delete sub: %s", pTq->pVnode->config.vgId, pReq->subKey); + taosWLockLatch(&pTq->pushLock); int32_t code = taosHashRemove(pTq->pPushMgr, pReq->subKey, strlen(pReq->subKey)); if (code != 0) { diff --git a/source/dnode/vnode/src/tsdb/tsdbSnapshot.c b/source/dnode/vnode/src/tsdb/tsdbSnapshot.c index 08d52554c6cf7452f201e745afe2a99c632dd77f..98c9c0fddab29327cb239aaa632a972664faef84 100644 --- a/source/dnode/vnode/src/tsdb/tsdbSnapshot.c +++ b/source/dnode/vnode/src/tsdb/tsdbSnapshot.c @@ -15,274 +15,628 @@ #include "tsdb.h" -// STsdbSnapReader ======================================== -typedef enum { SNAP_DATA_FILE_ITER = 0, SNAP_STT_FILE_ITER } EFIterT; +extern int32_t tsdbReadDataBlockEx(SDataFReader* pReader, SDataBlk* pDataBlk, SBlockData* pBlockData); +extern int32_t tsdbUpdateTableSchema(SMeta* pMeta, int64_t suid, int64_t uid, SSkmInfo* pSkmInfo); +extern int32_t tsdbWriteDataBlock(SDataFWriter* pWriter, SBlockData* pBlockData, SMapData* mDataBlk, int8_t cmprAlg); +extern int32_t tsdbWriteSttBlock(SDataFWriter* pWriter, SBlockData* pBlockData, SArray* aSttBlk, int8_t cmprAlg); + +// STsdbDataIter2 ======================================== +#define TSDB_MEM_TABLE_DATA_ITER 0 +#define TSDB_DATA_FILE_DATA_ITER 1 +#define TSDB_STT_FILE_DATA_ITER 2 +#define TSDB_TOMB_FILE_DATA_ITER 3 + +typedef struct STsdbDataIter2 STsdbDataIter2; +typedef struct STsdbFilterInfo STsdbFilterInfo; + typedef struct { - SRBTreeNode n; - SRowInfo rInfo; - EFIterT type; + int64_t suid; + int64_t uid; + SDelData delData; +} SDelInfo; + +struct STsdbDataIter2 { + STsdbDataIter2* next; + SRBTreeNode rbtn; + + int32_t type; + SRowInfo rowInfo; + SDelInfo delInfo; union { + // TSDB_MEM_TABLE_DATA_ITER struct { - SArray* aBlockIdx; - int32_t iBlockIdx; - SBlockIdx* pBlockIdx; - SMapData mBlock; - int32_t iBlock; - }; // .data file + SMemTable* pMemTable; + } mIter; + + // TSDB_DATA_FILE_DATA_ITER struct { - int32_t iStt; - SArray* aSttBlk; - int32_t iSttBlk; - }; // .stt file + SDataFReader* pReader; + SArray* aBlockIdx; // SArray + SMapData mDataBlk; + SBlockData bData; + int32_t iBlockIdx; + int32_t iDataBlk; + int32_t iRow; + } dIter; + + // TSDB_STT_FILE_DATA_ITER + struct { + SDataFReader* pReader; + int32_t iStt; + SArray* aSttBlk; + SBlockData bData; + int32_t iSttBlk; + int32_t iRow; + } sIter; + // TSDB_TOMB_FILE_DATA_ITER + struct { + SDelFReader* pReader; + SArray* aDelIdx; + SArray* aDelData; + int32_t iDelIdx; + int32_t iDelData; + } tIter; }; - SBlockData bData; - int32_t iRow; -} SFDataIter; +}; -struct STsdbSnapReader { - STsdb* pTsdb; +#define TSDB_FILTER_FLAG_BY_VERSION 0x1 +struct STsdbFilterInfo { + int32_t flag; int64_t sver; int64_t ever; - STsdbFS fs; - int8_t type; - // for data file - int8_t dataDone; - int32_t fid; - SDataFReader* pDataFReader; - SFDataIter* pIter; - SRBTree rbt; - SFDataIter aFDataIter[TSDB_MAX_STT_TRIGGER + 1]; - SBlockData bData; - SSkmInfo skmTable; - // for del file - int8_t delDone; - SDelFReader* pDelFReader; - SArray* aDelIdx; // SArray - int32_t iDelIdx; - SArray* aDelData; // SArray - uint8_t* aBuf[5]; }; -extern int32_t tRowInfoCmprFn(const void* p1, const void* p2); -extern int32_t tsdbReadDataBlockEx(SDataFReader* pReader, SDataBlk* pDataBlk, SBlockData* pBlockData); -extern int32_t tsdbUpdateTableSchema(SMeta* pMeta, int64_t suid, int64_t uid, SSkmInfo* pSkmInfo); +#define TSDB_RBTN_TO_DATA_ITER(pNode) ((STsdbDataIter2*)(((char*)pNode) - offsetof(STsdbDataIter2, rbtn))) + +/* open */ +static int32_t tsdbOpenDataFileDataIter(SDataFReader* pReader, STsdbDataIter2** ppIter) { + int32_t code = 0; + int32_t lino = 0; + + // create handle + STsdbDataIter2* pIter = (STsdbDataIter2*)taosMemoryCalloc(1, sizeof(*pIter)); + if (pIter == NULL) { + code = TSDB_CODE_OUT_OF_MEMORY; + TSDB_CHECK_CODE(code, lino, _exit); + } + + pIter->type = TSDB_DATA_FILE_DATA_ITER; + pIter->dIter.pReader = pReader; + if ((pIter->dIter.aBlockIdx = taosArrayInit(0, sizeof(SBlockIdx))) == NULL) { + code = TSDB_CODE_OUT_OF_MEMORY; + TSDB_CHECK_CODE(code, lino, _exit); + } + + code = tBlockDataCreate(&pIter->dIter.bData); + TSDB_CHECK_CODE(code, lino, _exit); + + pIter->dIter.iBlockIdx = 0; + pIter->dIter.iDataBlk = 0; + pIter->dIter.iRow = 0; + + // read data + code = tsdbReadBlockIdx(pReader, pIter->dIter.aBlockIdx); + TSDB_CHECK_CODE(code, lino, _exit); -static int32_t tFDataIterCmprFn(const SRBTreeNode* pNode1, const SRBTreeNode* pNode2) { - SFDataIter* pIter1 = (SFDataIter*)(((uint8_t*)pNode1) - offsetof(SFDataIter, n)); - SFDataIter* pIter2 = (SFDataIter*)(((uint8_t*)pNode2) - offsetof(SFDataIter, n)); + if (taosArrayGetSize(pIter->dIter.aBlockIdx) == 0) goto _clear; - return tRowInfoCmprFn(&pIter1->rInfo, &pIter2->rInfo); +_exit: + if (code) { + if (pIter) { + _clear: + tBlockDataDestroy(&pIter->dIter.bData, 1); + taosArrayDestroy(pIter->dIter.aBlockIdx); + taosMemoryFree(pIter); + pIter = NULL; + } + } + *ppIter = pIter; + return code; } -static int32_t tsdbSnapReadOpenFile(STsdbSnapReader* pReader) { +static int32_t tsdbOpenSttFileDataIter(SDataFReader* pReader, int32_t iStt, STsdbDataIter2** ppIter) { int32_t code = 0; int32_t lino = 0; - SDFileSet dFileSet = {.fid = pReader->fid}; - SDFileSet* pSet = taosArraySearch(pReader->fs.aDFileSet, &dFileSet, tDFileSetCmprFn, TD_GT); - if (pSet == NULL) return code; + // create handle + STsdbDataIter2* pIter = (STsdbDataIter2*)taosMemoryCalloc(1, sizeof(*pIter)); + if (pIter == NULL) { + code = TSDB_CODE_OUT_OF_MEMORY; + TSDB_CHECK_CODE(code, lino, _exit); + } - pReader->fid = pSet->fid; - code = tsdbDataFReaderOpen(&pReader->pDataFReader, pReader->pTsdb, pSet); - TSDB_CHECK_CODE(code, lino, _exit); + pIter->type = TSDB_STT_FILE_DATA_ITER; + pIter->sIter.pReader = pReader; + pIter->sIter.iStt = iStt; + pIter->sIter.aSttBlk = taosArrayInit(0, sizeof(SSttBlk)); + if (pIter->sIter.aSttBlk == NULL) { + code = TSDB_CODE_OUT_OF_MEMORY; + TSDB_CHECK_CODE(code, lino, _exit); + } - pReader->pIter = NULL; - tRBTreeCreate(&pReader->rbt, tFDataIterCmprFn); + code = tBlockDataCreate(&pIter->sIter.bData); + TSDB_CHECK_CODE(code, lino, _exit); - // .data file - SFDataIter* pIter = &pReader->aFDataIter[0]; - pIter->type = SNAP_DATA_FILE_ITER; + pIter->sIter.iSttBlk = 0; + pIter->sIter.iRow = 0; - code = tsdbReadBlockIdx(pReader->pDataFReader, pIter->aBlockIdx); + // read data + code = tsdbReadSttBlk(pReader, iStt, pIter->sIter.aSttBlk); TSDB_CHECK_CODE(code, lino, _exit); - for (pIter->iBlockIdx = 0; pIter->iBlockIdx < taosArrayGetSize(pIter->aBlockIdx); pIter->iBlockIdx++) { - pIter->pBlockIdx = (SBlockIdx*)taosArrayGet(pIter->aBlockIdx, pIter->iBlockIdx); + if (taosArrayGetSize(pIter->sIter.aSttBlk) == 0) goto _clear; - code = tsdbReadDataBlk(pReader->pDataFReader, pIter->pBlockIdx, &pIter->mBlock); +_exit: + if (code) { + if (pIter) { + _clear: + taosArrayDestroy(pIter->sIter.aSttBlk); + tBlockDataDestroy(&pIter->sIter.bData, 1); + taosMemoryFree(pIter); + pIter = NULL; + } + } + *ppIter = pIter; + return code; +} + +static int32_t tsdbOpenTombFileDataIter(SDelFReader* pReader, STsdbDataIter2** ppIter) { + int32_t code = 0; + int32_t lino = 0; + + STsdbDataIter2* pIter = (STsdbDataIter2*)taosMemoryCalloc(1, sizeof(*pIter)); + if (pIter == NULL) { + code = TSDB_CODE_OUT_OF_MEMORY; + TSDB_CHECK_CODE(code, lino, _exit); + } + pIter->type = TSDB_TOMB_FILE_DATA_ITER; + + pIter->tIter.pReader = pReader; + if ((pIter->tIter.aDelIdx = taosArrayInit(0, sizeof(SDelIdx))) == NULL) { + code = TSDB_CODE_OUT_OF_MEMORY; + TSDB_CHECK_CODE(code, lino, _exit); + } + if ((pIter->tIter.aDelData = taosArrayInit(0, sizeof(SDelData))) == NULL) { + code = TSDB_CODE_OUT_OF_MEMORY; TSDB_CHECK_CODE(code, lino, _exit); + } - for (pIter->iBlock = 0; pIter->iBlock < pIter->mBlock.nItem; pIter->iBlock++) { - SDataBlk dataBlk; - tMapDataGetItemByIdx(&pIter->mBlock, pIter->iBlock, &dataBlk, tGetDataBlk); + code = tsdbReadDelIdx(pReader, pIter->tIter.aDelIdx); + TSDB_CHECK_CODE(code, lino, _exit); - if (dataBlk.minVer > pReader->ever || dataBlk.maxVer < pReader->sver) continue; + if (taosArrayGetSize(pIter->tIter.aDelIdx) == 0) goto _clear; - code = tsdbReadDataBlockEx(pReader->pDataFReader, &dataBlk, &pIter->bData); - TSDB_CHECK_CODE(code, lino, _exit); + pIter->tIter.iDelIdx = 0; + pIter->tIter.iDelData = 0; - ASSERT(pIter->pBlockIdx->suid == pIter->bData.suid); - ASSERT(pIter->pBlockIdx->uid == pIter->bData.uid); +_exit: + if (code) { + if (pIter) { + _clear: + taosArrayDestroy(pIter->tIter.aDelIdx); + taosArrayDestroy(pIter->tIter.aDelData); + taosMemoryFree(pIter); + pIter = NULL; + } + } + *ppIter = pIter; + return code; +} + +/* close */ +static void tsdbCloseDataFileDataIter(STsdbDataIter2* pIter) { + tBlockDataDestroy(&pIter->dIter.bData, 1); + tMapDataClear(&pIter->dIter.mDataBlk); + taosArrayDestroy(pIter->dIter.aBlockIdx); + taosMemoryFree(pIter); +} + +static void tsdbCloseSttFileDataIter(STsdbDataIter2* pIter) { + tBlockDataDestroy(&pIter->sIter.bData, 1); + taosArrayDestroy(pIter->sIter.aSttBlk); + taosMemoryFree(pIter); +} - for (pIter->iRow = 0; pIter->iRow < pIter->bData.nRow; pIter->iRow++) { - int64_t rowVer = pIter->bData.aVersion[pIter->iRow]; +static void tsdbCloseTombFileDataIter(STsdbDataIter2* pIter) { + taosArrayDestroy(pIter->tIter.aDelData); + taosArrayDestroy(pIter->tIter.aDelIdx); + taosMemoryFree(pIter); +} - if (rowVer >= pReader->sver && rowVer <= pReader->ever) { - pIter->rInfo.suid = pIter->pBlockIdx->suid; - pIter->rInfo.uid = pIter->pBlockIdx->uid; - pIter->rInfo.row = tsdbRowFromBlockData(&pIter->bData, pIter->iRow); - goto _add_iter_and_break; +static void tsdbCloseDataIter2(STsdbDataIter2* pIter) { + if (pIter->type == TSDB_MEM_TABLE_DATA_ITER) { + ASSERT(0); + } else if (pIter->type == TSDB_DATA_FILE_DATA_ITER) { + tsdbCloseDataFileDataIter(pIter); + } else if (pIter->type == TSDB_STT_FILE_DATA_ITER) { + tsdbCloseSttFileDataIter(pIter); + } else if (pIter->type == TSDB_TOMB_FILE_DATA_ITER) { + tsdbCloseTombFileDataIter(pIter); + } else { + ASSERT(0); + } +} + +/* cmpr */ +static int32_t tsdbDataIterCmprFn(const SRBTreeNode* pNode1, const SRBTreeNode* pNode2) { + STsdbDataIter2* pIter1 = TSDB_RBTN_TO_DATA_ITER(pNode1); + STsdbDataIter2* pIter2 = TSDB_RBTN_TO_DATA_ITER(pNode2); + return tRowInfoCmprFn(&pIter1->rowInfo, &pIter2->rowInfo); +} + +/* seek */ + +/* iter next */ +static int32_t tsdbDataFileDataIterNext(STsdbDataIter2* pIter, STsdbFilterInfo* pFilterInfo) { + int32_t code = 0; + int32_t lino = 0; + + for (;;) { + while (pIter->dIter.iRow < pIter->dIter.bData.nRow) { + if (pFilterInfo) { + if (pFilterInfo->flag & TSDB_FILTER_FLAG_BY_VERSION) { + if (pIter->dIter.bData.aVersion[pIter->dIter.iRow] < pFilterInfo->sver || + pIter->dIter.bData.aVersion[pIter->dIter.iRow] > pFilterInfo->ever) { + pIter->dIter.iRow++; + continue; + } } } + + pIter->rowInfo.suid = pIter->dIter.bData.suid; + pIter->rowInfo.uid = pIter->dIter.bData.uid; + pIter->rowInfo.row = tsdbRowFromBlockData(&pIter->dIter.bData, pIter->dIter.iRow); + pIter->dIter.iRow++; + goto _exit; } - continue; + for (;;) { + while (pIter->dIter.iDataBlk < pIter->dIter.mDataBlk.nItem) { + SDataBlk dataBlk; + tMapDataGetItemByIdx(&pIter->dIter.mDataBlk, pIter->dIter.iDataBlk, &dataBlk, tGetDataBlk); + + // filter + if (pFilterInfo) { + if (pFilterInfo->flag & TSDB_FILTER_FLAG_BY_VERSION) { + if (pFilterInfo->sver > dataBlk.maxVer || pFilterInfo->ever < dataBlk.minVer) { + pIter->dIter.iDataBlk++; + continue; + } + } + } - _add_iter_and_break: - tRBTreePut(&pReader->rbt, (SRBTreeNode*)pIter); - break; - } + code = tsdbReadDataBlockEx(pIter->dIter.pReader, &dataBlk, &pIter->dIter.bData); + TSDB_CHECK_CODE(code, lino, _exit); - // .stt file - pIter = &pReader->aFDataIter[1]; - for (int32_t iStt = 0; iStt < pSet->nSttF; iStt++) { - pIter->type = SNAP_STT_FILE_ITER; - pIter->iStt = iStt; + pIter->dIter.iDataBlk++; + pIter->dIter.iRow = 0; - code = tsdbReadSttBlk(pReader->pDataFReader, iStt, pIter->aSttBlk); - TSDB_CHECK_CODE(code, lino, _exit); + break; + } - for (pIter->iSttBlk = 0; pIter->iSttBlk < taosArrayGetSize(pIter->aSttBlk); pIter->iSttBlk++) { - SSttBlk* pSttBlk = (SSttBlk*)taosArrayGet(pIter->aSttBlk, pIter->iSttBlk); + if (pIter->dIter.iRow < pIter->dIter.bData.nRow) break; - if (pSttBlk->minVer > pReader->ever) continue; - if (pSttBlk->maxVer < pReader->sver) continue; + for (;;) { + if (pIter->dIter.iBlockIdx < taosArrayGetSize(pIter->dIter.aBlockIdx)) { + SBlockIdx* pBlockIdx = taosArrayGet(pIter->dIter.aBlockIdx, pIter->dIter.iBlockIdx); - code = tsdbReadSttBlockEx(pReader->pDataFReader, iStt, pSttBlk, &pIter->bData); - TSDB_CHECK_CODE(code, lino, _exit); + code = tsdbReadDataBlk(pIter->dIter.pReader, pBlockIdx, &pIter->dIter.mDataBlk); + TSDB_CHECK_CODE(code, lino, _exit); - for (pIter->iRow = 0; pIter->iRow < pIter->bData.nRow; pIter->iRow++) { - int64_t rowVer = pIter->bData.aVersion[pIter->iRow]; + pIter->dIter.iBlockIdx++; + pIter->dIter.iDataBlk = 0; - if (rowVer >= pReader->sver && rowVer <= pReader->ever) { - pIter->rInfo.suid = pIter->bData.suid; - pIter->rInfo.uid = pIter->bData.uid ? pIter->bData.uid : pIter->bData.aUid[pIter->iRow]; - pIter->rInfo.row = tsdbRowFromBlockData(&pIter->bData, pIter->iRow); - goto _add_iter; + break; + } else { + pIter->rowInfo = (SRowInfo){0}; + goto _exit; } } } - - continue; - - _add_iter: - tRBTreePut(&pReader->rbt, (SRBTreeNode*)pIter); - pIter++; } _exit: if (code) { - tsdbError("vgId:%d, %s failed since %s", TD_VID(pReader->pTsdb->pVnode), __func__, tstrerror(code)); - } else { - tsdbInfo("vgId:%d, %s done, path:%s, fid:%d", TD_VID(pReader->pTsdb->pVnode), __func__, pReader->pTsdb->path, - pReader->fid); + tsdbError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); } return code; } -static int32_t tsdbSnapNextRow(STsdbSnapReader* pReader) { +static int32_t tsdbSttFileDataIterNext(STsdbDataIter2* pIter, STsdbFilterInfo* pFilterInfo) { int32_t code = 0; + int32_t lino = 0; - if (pReader->pIter) { - SFDataIter* pIter = NULL; - while (true) { - _find_row: - pIter = pReader->pIter; - for (pIter->iRow++; pIter->iRow < pIter->bData.nRow; pIter->iRow++) { - int64_t rowVer = pIter->bData.aVersion[pIter->iRow]; - - if (rowVer >= pReader->sver && rowVer <= pReader->ever) { - pIter->rInfo.suid = pIter->bData.suid; - pIter->rInfo.uid = pIter->bData.uid ? pIter->bData.uid : pIter->bData.aUid[pIter->iRow]; - pIter->rInfo.row = tsdbRowFromBlockData(&pIter->bData, pIter->iRow); - goto _out; + for (;;) { + while (pIter->sIter.iRow < pIter->sIter.bData.nRow) { + if (pFilterInfo) { + if (pFilterInfo->flag & TSDB_FILTER_FLAG_BY_VERSION) { + if (pFilterInfo->sver > pIter->sIter.bData.aVersion[pIter->sIter.iRow] || + pFilterInfo->ever < pIter->sIter.bData.aVersion[pIter->sIter.iRow]) { + pIter->sIter.iRow++; + continue; + } } } - if (pIter->type == SNAP_DATA_FILE_ITER) { - while (true) { - for (pIter->iBlock++; pIter->iBlock < pIter->mBlock.nItem; pIter->iBlock++) { - SDataBlk dataBlk; - tMapDataGetItemByIdx(&pIter->mBlock, pIter->iBlock, &dataBlk, tGetDataBlk); - - if (dataBlk.minVer > pReader->ever || dataBlk.maxVer < pReader->sver) continue; + pIter->rowInfo.suid = pIter->sIter.bData.suid; + pIter->rowInfo.uid = pIter->sIter.bData.uid ? pIter->sIter.bData.uid : pIter->sIter.bData.aUid[pIter->sIter.iRow]; + pIter->rowInfo.row = tsdbRowFromBlockData(&pIter->sIter.bData, pIter->sIter.iRow); + pIter->sIter.iRow++; + goto _exit; + } - code = tsdbReadDataBlockEx(pReader->pDataFReader, &dataBlk, &pIter->bData); - if (code) goto _err; + for (;;) { + if (pIter->sIter.iSttBlk < taosArrayGetSize(pIter->sIter.aSttBlk)) { + SSttBlk* pSttBlk = taosArrayGet(pIter->sIter.aSttBlk, pIter->sIter.iSttBlk); - pIter->iRow = -1; - goto _find_row; + if (pFilterInfo) { + if (pFilterInfo->flag & TSDB_FILTER_FLAG_BY_VERSION) { + if (pFilterInfo->sver > pSttBlk->maxVer || pFilterInfo->ever < pSttBlk->minVer) { + pIter->sIter.iSttBlk++; + continue; + } } - - pIter->iBlockIdx++; - if (pIter->iBlockIdx >= taosArrayGetSize(pIter->aBlockIdx)) break; - - pIter->pBlockIdx = (SBlockIdx*)taosArrayGet(pIter->aBlockIdx, pIter->iBlockIdx); - code = tsdbReadDataBlk(pReader->pDataFReader, pIter->pBlockIdx, &pIter->mBlock); - if (code) goto _err; - pIter->iBlock = -1; } - pReader->pIter = NULL; + code = tsdbReadSttBlockEx(pIter->sIter.pReader, pIter->sIter.iStt, pSttBlk, &pIter->sIter.bData); + TSDB_CHECK_CODE(code, lino, _exit); + + pIter->sIter.iRow = 0; + pIter->sIter.iSttBlk++; break; - } else if (pIter->type == SNAP_STT_FILE_ITER) { - for (pIter->iSttBlk++; pIter->iSttBlk < taosArrayGetSize(pIter->aSttBlk); pIter->iSttBlk++) { - SSttBlk* pSttBlk = (SSttBlk*)taosArrayGet(pIter->aSttBlk, pIter->iSttBlk); + } else { + pIter->rowInfo = (SRowInfo){0}; + goto _exit; + } + } + } + +_exit: + if (code) { + tsdbError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); + } + return code; +} - if (pSttBlk->minVer > pReader->ever || pSttBlk->maxVer < pReader->sver) continue; +static int32_t tsdbTombFileDataIterNext(STsdbDataIter2* pIter, STsdbFilterInfo* pFilterInfo) { + int32_t code = 0; + int32_t lino = 0; - code = tsdbReadSttBlockEx(pReader->pDataFReader, pIter->iStt, pSttBlk, &pIter->bData); - if (code) goto _err; + for (;;) { + while (pIter->tIter.iDelData < taosArrayGetSize(pIter->tIter.aDelData)) { + SDelData* pDelData = taosArrayGet(pIter->tIter.aDelData, pIter->tIter.iDelData); - pIter->iRow = -1; - goto _find_row; + if (pFilterInfo) { + if (pFilterInfo->flag & TSDB_FILTER_FLAG_BY_VERSION) { + if (pFilterInfo->sver > pDelData->version || pFilterInfo->ever < pDelData->version) { + pIter->tIter.iDelData++; + continue; + } } + } + + pIter->delInfo.delData = *pDelData; + pIter->tIter.iDelData++; + goto _exit; + } + + for (;;) { + if (pIter->tIter.iDelIdx < taosArrayGetSize(pIter->tIter.aDelIdx)) { + SDelIdx* pDelIdx = taosArrayGet(pIter->tIter.aDelIdx, pIter->tIter.iDelIdx); + + code = tsdbReadDelData(pIter->tIter.pReader, pDelIdx, pIter->tIter.aDelData); + TSDB_CHECK_CODE(code, lino, _exit); - pReader->pIter = NULL; + pIter->delInfo.suid = pDelIdx->suid; + pIter->delInfo.uid = pDelIdx->uid; + pIter->tIter.iDelData = 0; + pIter->tIter.iDelIdx++; break; } else { - ASSERT(0); + pIter->delInfo = (SDelInfo){0}; + goto _exit; } } + } - _out: - pIter = (SFDataIter*)tRBTreeMin(&pReader->rbt); - if (pReader->pIter && pIter) { - int32_t c = tRowInfoCmprFn(&pReader->pIter->rInfo, &pIter->rInfo); - if (c > 0) { - tRBTreePut(&pReader->rbt, (SRBTreeNode*)pReader->pIter); - pReader->pIter = NULL; - } else { - ASSERT(c); - } +_exit: + if (code) { + tsdbError("%s failed at line %d since %s", __func__, lino, tstrerror(code)); + } + return code; +} + +static int32_t tsdbDataIterNext2(STsdbDataIter2* pIter, STsdbFilterInfo* pFilterInfo) { + int32_t code = 0; + + if (pIter->type == TSDB_MEM_TABLE_DATA_ITER) { + ASSERT(0); + return code; + } else if (pIter->type == TSDB_DATA_FILE_DATA_ITER) { + return tsdbDataFileDataIterNext(pIter, pFilterInfo); + } else if (pIter->type == TSDB_STT_FILE_DATA_ITER) { + return tsdbSttFileDataIterNext(pIter, pFilterInfo); + } else if (pIter->type == TSDB_TOMB_FILE_DATA_ITER) { + return tsdbTombFileDataIterNext(pIter, pFilterInfo); + } else { + ASSERT(0); + return code; + } +} + +/* get */ + +// STsdbSnapReader ======================================== +struct STsdbSnapReader { + STsdb* pTsdb; + int64_t sver; + int64_t ever; + int8_t type; + uint8_t* aBuf[5]; + + STsdbFS fs; + TABLEID tbid; + SSkmInfo skmTable; + + // timeseries data + int8_t dataDone; + int32_t fid; + + SDataFReader* pDataFReader; + STsdbDataIter2* iterList; + STsdbDataIter2* pIter; + SRBTree rbt; + SBlockData bData; + + // tombstone data + int8_t delDone; + SDelFReader* pDelFReader; + STsdbDataIter2* pTIter; + SArray* aDelData; +}; + +static int32_t tsdbSnapReadFileDataStart(STsdbSnapReader* pReader) { + int32_t code = 0; + int32_t lino = 0; + + SDFileSet* pSet = taosArraySearch(pReader->fs.aDFileSet, &(SDFileSet){.fid = pReader->fid}, tDFileSetCmprFn, TD_GT); + if (pSet == NULL) { + pReader->fid = INT32_MAX; + goto _exit; + } + + pReader->fid = pSet->fid; + + tRBTreeCreate(&pReader->rbt, tsdbDataIterCmprFn); + + code = tsdbDataFReaderOpen(&pReader->pDataFReader, pReader->pTsdb, pSet); + TSDB_CHECK_CODE(code, lino, _exit); + + code = tsdbOpenDataFileDataIter(pReader->pDataFReader, &pReader->pIter); + TSDB_CHECK_CODE(code, lino, _exit); + + if (pReader->pIter) { + // iter to next with filter info (sver, ever) + code = tsdbDataIterNext2(pReader->pIter, + &(STsdbFilterInfo){.flag = TSDB_FILTER_FLAG_BY_VERSION, // flag + .sver = pReader->sver, + .ever = pReader->ever}); + TSDB_CHECK_CODE(code, lino, _exit); + + if (pReader->pIter->rowInfo.suid || pReader->pIter->rowInfo.uid) { + // add to rbtree + tRBTreePut(&pReader->rbt, &pReader->pIter->rbtn); + + // add to iterList + pReader->pIter->next = pReader->iterList; + pReader->iterList = pReader->pIter; + } else { + tsdbCloseDataIter2(pReader->pIter); } } - if (pReader->pIter == NULL) { - pReader->pIter = (SFDataIter*)tRBTreeMin(&pReader->rbt); + for (int32_t iStt = 0; iStt < pSet->nSttF; ++iStt) { + code = tsdbOpenSttFileDataIter(pReader->pDataFReader, iStt, &pReader->pIter); + TSDB_CHECK_CODE(code, lino, _exit); + if (pReader->pIter) { - tRBTreeDrop(&pReader->rbt, (SRBTreeNode*)pReader->pIter); + // iter to valid row + code = tsdbDataIterNext2(pReader->pIter, + &(STsdbFilterInfo){.flag = TSDB_FILTER_FLAG_BY_VERSION, // flag + .sver = pReader->sver, + .ever = pReader->ever}); + TSDB_CHECK_CODE(code, lino, _exit); + + if (pReader->pIter->rowInfo.suid || pReader->pIter->rowInfo.uid) { + // add to rbtree + tRBTreePut(&pReader->rbt, &pReader->pIter->rbtn); + + // add to iterList + pReader->pIter->next = pReader->iterList; + pReader->iterList = pReader->pIter; + } else { + tsdbCloseDataIter2(pReader->pIter); + } } } - return code; + pReader->pIter = NULL; -_err: +_exit: + if (code) { + tsdbError("vgId:%d %s failed at line %d since %s", TD_VID(pReader->pTsdb->pVnode), __func__, lino, tstrerror(code)); + } else { + tsdbInfo("vgId:%d %s done, fid:%d", TD_VID(pReader->pTsdb->pVnode), __func__, pReader->fid); + } return code; } -static SRowInfo* tsdbSnapGetRow(STsdbSnapReader* pReader) { +static void tsdbSnapReadFileDataEnd(STsdbSnapReader* pReader) { + while (pReader->iterList) { + STsdbDataIter2* pIter = pReader->iterList; + pReader->iterList = pIter->next; + tsdbCloseDataIter2(pIter); + } + + tsdbDataFReaderClose(&pReader->pDataFReader); +} + +static int32_t tsdbSnapReadNextRow(STsdbSnapReader* pReader, SRowInfo** ppRowInfo) { + int32_t code = 0; + int32_t lino = 0; + if (pReader->pIter) { - return &pReader->pIter->rInfo; - } else { - tsdbSnapNextRow(pReader); + code = tsdbDataIterNext2(pReader->pIter, &(STsdbFilterInfo){.flag = TSDB_FILTER_FLAG_BY_VERSION, // flag + .sver = pReader->sver, + .ever = pReader->ever}); + TSDB_CHECK_CODE(code, lino, _exit); + if (pReader->pIter->rowInfo.suid == 0 && pReader->pIter->rowInfo.uid == 0) { + pReader->pIter = NULL; + } else { + SRBTreeNode* pNode = tRBTreeMin(&pReader->rbt); + if (pNode) { + int32_t c = tsdbDataIterCmprFn(&pReader->pIter->rbtn, pNode); + if (c > 0) { + tRBTreePut(&pReader->rbt, &pReader->pIter->rbtn); + pReader->pIter = NULL; + } else if (c == 0) { + ASSERT(0); + } + } + } + } + + if (pReader->pIter == NULL) { + SRBTreeNode* pNode = tRBTreeMin(&pReader->rbt); + if (pNode) { + tRBTreeDrop(&pReader->rbt, pNode); + pReader->pIter = TSDB_RBTN_TO_DATA_ITER(pNode); + } + } + + if (ppRowInfo) { if (pReader->pIter) { - return &pReader->pIter->rInfo; + *ppRowInfo = &pReader->pIter->rowInfo; } else { - return NULL; + *ppRowInfo = NULL; } } + +_exit: + if (code) { + tsdbError("vgId:%d %s failed at line %d since %s", TD_VID(pReader->pTsdb->pVnode), __func__, lino, tstrerror(code)); + } + return code; +} + +static int32_t tsdbSnapReadGetRow(STsdbSnapReader* pReader, SRowInfo** ppRowInfo) { + if (pReader->pIter) { + *ppRowInfo = &pReader->pIter->rowInfo; + return 0; + } + + return tsdbSnapReadNextRow(pReader, ppRowInfo); } static int32_t tsdbSnapCmprData(STsdbSnapReader* pReader, uint8_t** ppData) { @@ -318,155 +672,213 @@ _exit: return code; } -static int32_t tsdbSnapReadData(STsdbSnapReader* pReader, uint8_t** ppData) { +static int32_t tsdbSnapReadTimeSeriesData(STsdbSnapReader* pReader, uint8_t** ppData) { int32_t code = 0; int32_t lino = 0; STsdb* pTsdb = pReader->pTsdb; - while (true) { + tBlockDataReset(&pReader->bData); + + for (;;) { + // start a new file read if need if (pReader->pDataFReader == NULL) { - code = tsdbSnapReadOpenFile(pReader); + code = tsdbSnapReadFileDataStart(pReader); TSDB_CHECK_CODE(code, lino, _exit); } if (pReader->pDataFReader == NULL) break; - SRowInfo* pRowInfo = tsdbSnapGetRow(pReader); + SRowInfo* pRowInfo; + code = tsdbSnapReadGetRow(pReader, &pRowInfo); + TSDB_CHECK_CODE(code, lino, _exit); + if (pRowInfo == NULL) { - tsdbDataFReaderClose(&pReader->pDataFReader); + tsdbSnapReadFileDataEnd(pReader); continue; } - TABLEID id = {.suid = pRowInfo->suid, .uid = pRowInfo->uid}; - SBlockData* pBlockData = &pReader->bData; - - code = tsdbUpdateTableSchema(pTsdb->pVnode->pMeta, id.suid, id.uid, &pReader->skmTable); + code = tsdbUpdateTableSchema(pTsdb->pVnode->pMeta, pRowInfo->suid, pRowInfo->uid, &pReader->skmTable); TSDB_CHECK_CODE(code, lino, _exit); - code = tBlockDataInit(pBlockData, &id, pReader->skmTable.pTSchema, NULL, 0); + code = tBlockDataInit(&pReader->bData, (TABLEID*)pRowInfo, pReader->skmTable.pTSchema, NULL, 0); TSDB_CHECK_CODE(code, lino, _exit); - while (pRowInfo->suid == id.suid && pRowInfo->uid == id.uid) { - code = tBlockDataAppendRow(pBlockData, &pRowInfo->row, NULL, pRowInfo->uid); + do { + if (!TABLE_SAME_SCHEMA(pReader->bData.suid, pReader->bData.uid, pRowInfo->suid, pRowInfo->uid)) break; + + if (pReader->bData.uid && pReader->bData.uid != pRowInfo->uid) { + code = tRealloc((uint8_t**)&pReader->bData.aUid, sizeof(int64_t) * (pReader->bData.nRow + 1)); + TSDB_CHECK_CODE(code, lino, _exit); + + for (int32_t iRow = 0; iRow < pReader->bData.nRow; ++iRow) { + pReader->bData.aUid[iRow] = pReader->bData.uid; + } + pReader->bData.uid = 0; + } + + code = tBlockDataAppendRow(&pReader->bData, &pRowInfo->row, NULL, pRowInfo->uid); TSDB_CHECK_CODE(code, lino, _exit); - code = tsdbSnapNextRow(pReader); + code = tsdbSnapReadNextRow(pReader, &pRowInfo); TSDB_CHECK_CODE(code, lino, _exit); - pRowInfo = tsdbSnapGetRow(pReader); - if (pRowInfo == NULL) { - tsdbDataFReaderClose(&pReader->pDataFReader); - break; - } + if (pReader->bData.nRow >= 4096) break; + } while (pRowInfo); + + ASSERT(pReader->bData.nRow > 0); + + break; + } + + if (pReader->bData.nRow > 0) { + code = tsdbSnapCmprData(pReader, ppData); + TSDB_CHECK_CODE(code, lino, _exit); + } + +_exit: + if (code) { + tsdbError("vgId:%d %s failed at line %d since %s", TD_VID(pTsdb->pVnode), __func__, lino, tstrerror(code)); + } + return code; +} + +static int32_t tsdbSnapCmprTombData(STsdbSnapReader* pReader, uint8_t** ppData) { + int32_t code = 0; + int32_t lino = 0; + + int64_t size = sizeof(TABLEID); + for (int32_t iDelData = 0; iDelData < taosArrayGetSize(pReader->aDelData); ++iDelData) { + size += tPutDelData(NULL, taosArrayGet(pReader->aDelData, iDelData)); + } + + uint8_t* pData = (uint8_t*)taosMemoryMalloc(sizeof(SSnapDataHdr) + size); + if (pData == NULL) { + code = TSDB_CODE_OUT_OF_MEMORY; + TSDB_CHECK_CODE(code, lino, _exit); + } + + SSnapDataHdr* pHdr = (SSnapDataHdr*)pData; + pHdr->type = SNAP_DATA_DEL; + pHdr->size = size; + + TABLEID* pId = (TABLEID*)(pData + sizeof(SSnapDataHdr)); + *pId = pReader->tbid; + + size = sizeof(SSnapDataHdr) + sizeof(TABLEID); + for (int32_t iDelData = 0; iDelData < taosArrayGetSize(pReader->aDelData); ++iDelData) { + size += tPutDelData(pData + size, taosArrayGet(pReader->aDelData, iDelData)); + } - if (pBlockData->nRow >= 4096) break; +_exit: + if (code) { + tsdbError("vgId:%d %s failed at line %d since %s", TD_VID(pReader->pTsdb->pVnode), __func__, lino, tstrerror(code)); + if (pData) { + taosMemoryFree(pData); + pData = NULL; } + } + *ppData = pData; + return code; +} - code = tsdbSnapCmprData(pReader, ppData); - TSDB_CHECK_CODE(code, lino, _exit); +static void tsdbSnapReadGetTombData(STsdbSnapReader* pReader, SDelInfo** ppDelInfo) { + if (pReader->pTIter == NULL || (pReader->pTIter->delInfo.suid == 0 && pReader->pTIter->delInfo.uid == 0)) { + *ppDelInfo = NULL; + } else { + *ppDelInfo = &pReader->pTIter->delInfo; + } +} - break; +static int32_t tsdbSnapReadNextTombData(STsdbSnapReader* pReader, SDelInfo** ppDelInfo) { + int32_t code = 0; + int32_t lino = 0; + + code = tsdbDataIterNext2( + pReader->pTIter, + &(STsdbFilterInfo){.flag = TSDB_FILTER_FLAG_BY_VERSION, .sver = pReader->sver, .ever = pReader->ever}); + TSDB_CHECK_CODE(code, lino, _exit); + + if (ppDelInfo) { + tsdbSnapReadGetTombData(pReader, ppDelInfo); } _exit: if (code) { - tsdbError("vgId:%d, %s failed since %s, path:%s", TD_VID(pTsdb->pVnode), __func__, tstrerror(code), pTsdb->path); + tsdbError("vgId:%d %s failed at line %d since %s", TD_VID(pReader->pTsdb->pVnode), __func__, lino, tstrerror(code)); } return code; } -static int32_t tsdbSnapReadDel(STsdbSnapReader* pReader, uint8_t** ppData) { +static int32_t tsdbSnapReadTombData(STsdbSnapReader* pReader, uint8_t** ppData) { int32_t code = 0; int32_t lino = 0; - STsdb* pTsdb = pReader->pTsdb; - SDelFile* pDelFile = pReader->fs.pDelFile; + STsdb* pTsdb = pReader->pTsdb; + // open tombstone data iter if need if (pReader->pDelFReader == NULL) { - if (pDelFile == NULL) { - goto _exit; - } + if (pReader->fs.pDelFile == NULL) goto _exit; // open - code = tsdbDelFReaderOpen(&pReader->pDelFReader, pDelFile, pTsdb); + code = tsdbDelFReaderOpen(&pReader->pDelFReader, pReader->fs.pDelFile, pTsdb); TSDB_CHECK_CODE(code, lino, _exit); - // read index - code = tsdbReadDelIdx(pReader->pDelFReader, pReader->aDelIdx); + code = tsdbOpenTombFileDataIter(pReader->pDelFReader, &pReader->pTIter); TSDB_CHECK_CODE(code, lino, _exit); - pReader->iDelIdx = 0; + if (pReader->pTIter) { + code = tsdbSnapReadNextTombData(pReader, NULL); + TSDB_CHECK_CODE(code, lino, _exit); + } } - while (true) { - if (pReader->iDelIdx >= taosArrayGetSize(pReader->aDelIdx)) { - tsdbDelFReaderClose(&pReader->pDelFReader); - break; - } + // loop to get tombstone data + SDelInfo* pDelInfo; + tsdbSnapReadGetTombData(pReader, &pDelInfo); - SDelIdx* pDelIdx = (SDelIdx*)taosArrayGet(pReader->aDelIdx, pReader->iDelIdx); + if (pDelInfo == NULL) goto _exit; - pReader->iDelIdx++; + pReader->tbid = *(TABLEID*)pDelInfo; - code = tsdbReadDelData(pReader->pDelFReader, pDelIdx, pReader->aDelData); + if (pReader->aDelData) { + taosArrayClear(pReader->aDelData); + } else if ((pReader->aDelData = taosArrayInit(16, sizeof(SDelData))) == NULL) { + code = TSDB_CODE_OUT_OF_MEMORY; TSDB_CHECK_CODE(code, lino, _exit); + } - int32_t size = 0; - for (int32_t iDelData = 0; iDelData < taosArrayGetSize(pReader->aDelData); iDelData++) { - SDelData* pDelData = (SDelData*)taosArrayGet(pReader->aDelData, iDelData); - - if (pDelData->version >= pReader->sver && pDelData->version <= pReader->ever) { - size += tPutDelData(NULL, pDelData); - } - } - if (size == 0) continue; - - // org data - size = sizeof(TABLEID) + size; - *ppData = taosMemoryMalloc(sizeof(SSnapDataHdr) + size); - if (*ppData == NULL) { + while (pDelInfo && pDelInfo->suid == pReader->tbid.suid && pDelInfo->uid == pReader->tbid.uid) { + if (taosArrayPush(pReader->aDelData, &pDelInfo->delData) < 0) { code = TSDB_CODE_OUT_OF_MEMORY; TSDB_CHECK_CODE(code, lino, _exit); } - SSnapDataHdr* pHdr = (SSnapDataHdr*)(*ppData); - pHdr->type = SNAP_DATA_DEL; - pHdr->size = size; - - TABLEID* pId = (TABLEID*)(&pHdr[1]); - pId->suid = pDelIdx->suid; - pId->uid = pDelIdx->uid; - int32_t n = sizeof(SSnapDataHdr) + sizeof(TABLEID); - for (int32_t iDelData = 0; iDelData < taosArrayGetSize(pReader->aDelData); iDelData++) { - SDelData* pDelData = (SDelData*)taosArrayGet(pReader->aDelData, iDelData); - - if (pDelData->version < pReader->sver) continue; - if (pDelData->version > pReader->ever) continue; - - n += tPutDelData((*ppData) + n, pDelData); - } - - tsdbInfo("vgId:%d, vnode snapshot tsdb read del data for %s, suid:%" PRId64 " uid:%" PRId64 " size:%d", - TD_VID(pTsdb->pVnode), pTsdb->path, pDelIdx->suid, pDelIdx->uid, size); + code = tsdbSnapReadNextTombData(pReader, &pDelInfo); + TSDB_CHECK_CODE(code, lino, _exit); + } - break; + // encode tombstone data + if (taosArrayGetSize(pReader->aDelData) > 0) { + code = tsdbSnapCmprTombData(pReader, ppData); + TSDB_CHECK_CODE(code, lino, _exit); } _exit: if (code) { - tsdbError("vgId:%d, %s failed since %s, path:%s", TD_VID(pTsdb->pVnode), __func__, tstrerror(code), pTsdb->path); + tsdbError("vgId:%d %s failed at line %d since %s", TD_VID(pTsdb->pVnode), __func__, lino, tstrerror(code)); + } else { + tsdbDebug("vgId:%d %s done", TD_VID(pTsdb->pVnode), __func__); } return code; } int32_t tsdbSnapReaderOpen(STsdb* pTsdb, int64_t sver, int64_t ever, int8_t type, STsdbSnapReader** ppReader) { - int32_t code = 0; - int32_t lino = 0; - STsdbSnapReader* pReader = NULL; + int32_t code = 0; + int32_t lino = 0; // alloc - pReader = (STsdbSnapReader*)taosMemoryCalloc(1, sizeof(*pReader)); + STsdbSnapReader* pReader = (STsdbSnapReader*)taosMemoryCalloc(1, sizeof(*pReader)); if (pReader == NULL) { code = TSDB_CODE_OUT_OF_MEMORY; TSDB_CHECK_CODE(code, lino, _exit); @@ -476,118 +888,80 @@ int32_t tsdbSnapReaderOpen(STsdb* pTsdb, int64_t sver, int64_t ever, int8_t type pReader->ever = ever; pReader->type = type; - code = taosThreadRwlockRdlock(&pTsdb->rwLock); - if (code) { - code = TAOS_SYSTEM_ERROR(code); - TSDB_CHECK_CODE(code, lino, _exit); - } - + taosThreadRwlockRdlock(&pTsdb->rwLock); code = tsdbFSRef(pTsdb, &pReader->fs); if (code) { taosThreadRwlockUnlock(&pTsdb->rwLock); TSDB_CHECK_CODE(code, lino, _exit); } + taosThreadRwlockUnlock(&pTsdb->rwLock); - code = taosThreadRwlockUnlock(&pTsdb->rwLock); - if (code) { - code = TAOS_SYSTEM_ERROR(code); - TSDB_CHECK_CODE(code, lino, _exit); - } - - // data + // init pReader->fid = INT32_MIN; - for (int32_t iIter = 0; iIter < sizeof(pReader->aFDataIter) / sizeof(pReader->aFDataIter[0]); iIter++) { - SFDataIter* pIter = &pReader->aFDataIter[iIter]; - - if (iIter == 0) { - pIter->aBlockIdx = taosArrayInit(0, sizeof(SBlockIdx)); - if (pIter->aBlockIdx == NULL) { - code = TSDB_CODE_OUT_OF_MEMORY; - TSDB_CHECK_CODE(code, lino, _exit); - } - } else { - pIter->aSttBlk = taosArrayInit(0, sizeof(SSttBlk)); - if (pIter->aSttBlk == NULL) { - code = TSDB_CODE_OUT_OF_MEMORY; - TSDB_CHECK_CODE(code, lino, _exit); - } - } - - code = tBlockDataCreate(&pIter->bData); - TSDB_CHECK_CODE(code, lino, _exit); - } code = tBlockDataCreate(&pReader->bData); TSDB_CHECK_CODE(code, lino, _exit); - // del - pReader->aDelIdx = taosArrayInit(0, sizeof(SDelIdx)); - if (pReader->aDelIdx == NULL) { - code = TSDB_CODE_OUT_OF_MEMORY; - TSDB_CHECK_CODE(code, lino, _exit); - } - pReader->aDelData = taosArrayInit(0, sizeof(SDelData)); - if (pReader->aDelData == NULL) { - code = TSDB_CODE_OUT_OF_MEMORY; - TSDB_CHECK_CODE(code, lino, _exit); - } - _exit: if (code) { - tsdbError("vgId:%d, %s failed at line %d since %s, TSDB path: %s", TD_VID(pTsdb->pVnode), __func__, lino, - tstrerror(code), pTsdb->path); - *ppReader = NULL; - + tsdbError("vgId:%d %s failed at line %d since %s, sver:%" PRId64 " ever:%" PRId64 " type:%d", TD_VID(pTsdb->pVnode), + __func__, lino, tstrerror(code), sver, ever, type); if (pReader) { - taosArrayDestroy(pReader->aDelData); - taosArrayDestroy(pReader->aDelIdx); tBlockDataDestroy(&pReader->bData, 1); - tsdbFSDestroy(&pReader->fs); + tsdbFSUnref(pTsdb, &pReader->fs); taosMemoryFree(pReader); + pReader = NULL; } } else { - *ppReader = pReader; - tsdbInfo("vgId:%d, vnode snapshot tsdb reader opened for %s", TD_VID(pTsdb->pVnode), pTsdb->path); + tsdbInfo("vgId:%d %s done, sver:%" PRId64 " ever:%" PRId64 " type:%d", TD_VID(pTsdb->pVnode), __func__, sver, ever, + type); } + *ppReader = pReader; return code; } int32_t tsdbSnapReaderClose(STsdbSnapReader** ppReader) { - int32_t code = 0; - STsdbSnapReader* pReader = *ppReader; - - // data - if (pReader->pDataFReader) tsdbDataFReaderClose(&pReader->pDataFReader); - for (int32_t iIter = 0; iIter < sizeof(pReader->aFDataIter) / sizeof(pReader->aFDataIter[0]); iIter++) { - SFDataIter* pIter = &pReader->aFDataIter[iIter]; + int32_t code = 0; + int32_t lino = 0; - if (iIter == 0) { - taosArrayDestroy(pIter->aBlockIdx); - tMapDataClear(&pIter->mBlock); - } else { - taosArrayDestroy(pIter->aSttBlk); - } + STsdbSnapReader* pReader = *ppReader; + STsdb* pTsdb = pReader->pTsdb; - tBlockDataDestroy(&pIter->bData, 1); + // tombstone + if (pReader->pTIter) { + tsdbCloseDataIter2(pReader->pTIter); + pReader->pTIter = NULL; + } + if (pReader->pDelFReader) { + tsdbDelFReaderClose(&pReader->pDelFReader); } + taosArrayDestroy(pReader->aDelData); + // timeseries + while (pReader->iterList) { + STsdbDataIter2* pIter = pReader->iterList; + pReader->iterList = pIter->next; + tsdbCloseDataIter2(pIter); + } + if (pReader->pDataFReader) { + tsdbDataFReaderClose(&pReader->pDataFReader); + } tBlockDataDestroy(&pReader->bData, 1); - tDestroyTSchema(pReader->skmTable.pTSchema); - - // del - if (pReader->pDelFReader) tsdbDelFReaderClose(&pReader->pDelFReader); - taosArrayDestroy(pReader->aDelIdx); - taosArrayDestroy(pReader->aDelData); + // other + tDestroyTSchema(pReader->skmTable.pTSchema); tsdbFSUnref(pReader->pTsdb, &pReader->fs); - - tsdbInfo("vgId:%d, vnode snapshot tsdb reader closed for %s", TD_VID(pReader->pTsdb->pVnode), pReader->pTsdb->path); - for (int32_t iBuf = 0; iBuf < sizeof(pReader->aBuf) / sizeof(pReader->aBuf[0]); iBuf++) { tFree(pReader->aBuf[iBuf]); } - taosMemoryFree(pReader); + +_exit: + if (code) { + tsdbError("vgId:%d %s failed at line %d since %s", TD_VID(pTsdb->pVnode), __func__, lino, tstrerror(code)); + } else { + tsdbDebug("vgId:%d %s done", TD_VID(pTsdb->pVnode), __func__); + } *ppReader = NULL; return code; } @@ -600,7 +974,7 @@ int32_t tsdbSnapRead(STsdbSnapReader* pReader, uint8_t** ppData) { // read data file if (!pReader->dataDone) { - code = tsdbSnapReadData(pReader, ppData); + code = tsdbSnapReadTimeSeriesData(pReader, ppData); TSDB_CHECK_CODE(code, lino, _exit); if (*ppData) { goto _exit; @@ -611,7 +985,7 @@ int32_t tsdbSnapRead(STsdbSnapReader* pReader, uint8_t** ppData) { // read del file if (!pReader->delDone) { - code = tsdbSnapReadDel(pReader, ppData); + code = tsdbSnapReadTombData(pReader, ppData); TSDB_CHECK_CODE(code, lino, _exit); if (*ppData) { goto _exit; @@ -622,22 +996,18 @@ int32_t tsdbSnapRead(STsdbSnapReader* pReader, uint8_t** ppData) { _exit: if (code) { - tsdbError("vgId:%d, %s failed since %s, path:%s", TD_VID(pReader->pTsdb->pVnode), __func__, tstrerror(code), - pReader->pTsdb->path); + tsdbError("vgId:%d %s failed at line %d since %s", TD_VID(pReader->pTsdb->pVnode), __func__, lino, tstrerror(code)); } else { - tsdbDebug("vgId:%d, %s done, path:%s", TD_VID(pReader->pTsdb->pVnode), __func__, pReader->pTsdb->path); + tsdbDebug("vgId:%d %s done", TD_VID(pReader->pTsdb->pVnode), __func__); } return code; } // STsdbSnapWriter ======================================== struct STsdbSnapWriter { - STsdb* pTsdb; - int64_t sver; - int64_t ever; - STsdbFS fs; - - // config + STsdb* pTsdb; + int64_t sver; + int64_t ever; int32_t minutes; int8_t precision; int32_t minRow; @@ -646,641 +1016,816 @@ struct STsdbSnapWriter { int64_t commitID; uint8_t* aBuf[5]; - // for data file - SBlockData bData; - int32_t fid; - TABLEID id; - SSkmInfo skmTable; - struct { - SDataFReader* pReader; - SArray* aBlockIdx; - int32_t iBlockIdx; - SBlockIdx* pBlockIdx; - SMapData mDataBlk; - int32_t iDataBlk; - SBlockData bData; - int32_t iRow; - } dReader; - struct { - SDataFWriter* pWriter; - SArray* aBlockIdx; - SMapData mDataBlk; - SArray* aSttBlk; - SBlockData bData; - SBlockData sData; - } dWriter; - - // for del file - SDelFReader* pDelFReader; + STsdbFS fs; + TABLEID tbid; + + // time-series data + SBlockData inData; + + int32_t fid; + SSkmInfo skmTable; + + /* reader */ + SDataFReader* pDataFReader; + STsdbDataIter2* iterList; + STsdbDataIter2* pDIter; + STsdbDataIter2* pSIter; + SRBTree rbt; // SRBTree + + /* writer */ + SDataFWriter* pDataFWriter; + SArray* aBlockIdx; + SMapData mDataBlk; // SMapData + SArray* aSttBlk; // SArray + SBlockData bData; + SBlockData sData; + + // tombstone data + /* reader */ + SDelFReader* pDelFReader; + STsdbDataIter2* pTIter; + + /* writer */ SDelFWriter* pDelFWriter; - int32_t iDelIdx; - SArray* aDelIdxR; + SArray* aDelIdx; SArray* aDelData; - SArray* aDelIdxW; }; // SNAP_DATA_TSDB -extern int32_t tsdbWriteDataBlock(SDataFWriter* pWriter, SBlockData* pBlockData, SMapData* mDataBlk, int8_t cmprAlg); -extern int32_t tsdbWriteSttBlock(SDataFWriter* pWriter, SBlockData* pBlockData, SArray* aSttBlk, int8_t cmprAlg); - -static int32_t tsdbSnapNextTableData(STsdbSnapWriter* pWriter) { +static int32_t tsdbSnapWriteTableDataStart(STsdbSnapWriter* pWriter, TABLEID* pId) { int32_t code = 0; + int32_t lino = 0; + + if (pId) { + pWriter->tbid = *pId; + } else { + pWriter->tbid = (TABLEID){INT64_MAX, INT64_MAX}; + } + + if (pWriter->pDIter) { + STsdbDataIter2* pIter = pWriter->pDIter; + + // assert last table data end + ASSERT(pIter->dIter.iRow >= pIter->dIter.bData.nRow); + ASSERT(pIter->dIter.iDataBlk >= pIter->dIter.mDataBlk.nItem); + + for (;;) { + if (pIter->dIter.iBlockIdx >= taosArrayGetSize(pIter->dIter.aBlockIdx)) { + pWriter->pDIter = NULL; + break; + } + + SBlockIdx* pBlockIdx = (SBlockIdx*)taosArrayGet(pIter->dIter.aBlockIdx, pIter->dIter.iBlockIdx); + + int32_t c = tTABLEIDCmprFn(pBlockIdx, &pWriter->tbid); + if (c < 0) { + code = tsdbReadDataBlk(pIter->dIter.pReader, pBlockIdx, &pIter->dIter.mDataBlk); + TSDB_CHECK_CODE(code, lino, _exit); + + SBlockIdx* pNewBlockIdx = taosArrayReserve(pWriter->aBlockIdx, 1); + if (pNewBlockIdx == NULL) { + code = TSDB_CODE_OUT_OF_MEMORY; + TSDB_CHECK_CODE(code, lino, _exit); + } + + pNewBlockIdx->suid = pBlockIdx->suid; + pNewBlockIdx->uid = pBlockIdx->uid; + + code = tsdbWriteDataBlk(pWriter->pDataFWriter, &pIter->dIter.mDataBlk, pNewBlockIdx); + TSDB_CHECK_CODE(code, lino, _exit); + + pIter->dIter.iBlockIdx++; + } else if (c == 0) { + code = tsdbReadDataBlk(pIter->dIter.pReader, pBlockIdx, &pIter->dIter.mDataBlk); + TSDB_CHECK_CODE(code, lino, _exit); + + pIter->dIter.iDataBlk = 0; + pIter->dIter.iBlockIdx++; + + break; + } else { + pIter->dIter.iDataBlk = pIter->dIter.mDataBlk.nItem; + break; + } + } + } + + if (pId) { + code = tsdbUpdateTableSchema(pWriter->pTsdb->pVnode->pMeta, pId->suid, pId->uid, &pWriter->skmTable); + TSDB_CHECK_CODE(code, lino, _exit); + + tMapDataReset(&pWriter->mDataBlk); - ASSERT(pWriter->dReader.iRow >= pWriter->dReader.bData.nRow); + code = tBlockDataInit(&pWriter->bData, pId, pWriter->skmTable.pTSchema, NULL, 0); + TSDB_CHECK_CODE(code, lino, _exit); + } - if (pWriter->dReader.iBlockIdx < taosArrayGetSize(pWriter->dReader.aBlockIdx)) { - pWriter->dReader.pBlockIdx = (SBlockIdx*)taosArrayGet(pWriter->dReader.aBlockIdx, pWriter->dReader.iBlockIdx); + if (!TABLE_SAME_SCHEMA(pWriter->tbid.suid, pWriter->tbid.uid, pWriter->sData.suid, pWriter->sData.uid)) { + if ((pWriter->sData.nRow > 0)) { + code = tsdbWriteSttBlock(pWriter->pDataFWriter, &pWriter->sData, pWriter->aSttBlk, pWriter->cmprAlg); + TSDB_CHECK_CODE(code, lino, _exit); + } - code = tsdbReadDataBlk(pWriter->dReader.pReader, pWriter->dReader.pBlockIdx, &pWriter->dReader.mDataBlk); - if (code) goto _exit; + if (pId) { + TABLEID id = {.suid = pWriter->tbid.suid, .uid = pWriter->tbid.suid ? 0 : pWriter->tbid.uid}; + code = tBlockDataInit(&pWriter->sData, &id, pWriter->skmTable.pTSchema, NULL, 0); + TSDB_CHECK_CODE(code, lino, _exit); + } + } - pWriter->dReader.iBlockIdx++; +_exit: + if (code) { + tsdbError("vgId:%d %s failed at line %d since %s", TD_VID(pWriter->pTsdb->pVnode), __func__, lino, tstrerror(code)); } else { - pWriter->dReader.pBlockIdx = NULL; - tMapDataReset(&pWriter->dReader.mDataBlk); + tsdbTrace("vgId:%d %s done, suid:%" PRId64 " uid:%" PRId64, TD_VID(pWriter->pTsdb->pVnode), __func__, + pWriter->tbid.suid, pWriter->tbid.uid); + } + return code; +} + +static int32_t tsdbSnapWriteTableRowImpl(STsdbSnapWriter* pWriter, TSDBROW* pRow) { + int32_t code = 0; + int32_t lino = 0; + + code = tBlockDataAppendRow(&pWriter->bData, pRow, pWriter->skmTable.pTSchema, pWriter->tbid.uid); + TSDB_CHECK_CODE(code, lino, _exit); + + if (pWriter->bData.nRow >= pWriter->maxRow) { + code = tsdbWriteDataBlock(pWriter->pDataFWriter, &pWriter->bData, &pWriter->mDataBlk, pWriter->cmprAlg); + TSDB_CHECK_CODE(code, lino, _exit); } - pWriter->dReader.iDataBlk = 0; // point to the next one - tBlockDataReset(&pWriter->dReader.bData); - pWriter->dReader.iRow = 0; _exit: + if (code) { + tsdbError("vgId:%d %s failed at line %d since %s", TD_VID(pWriter->pTsdb->pVnode), __func__, lino, tstrerror(code)); + } return code; } -static int32_t tsdbSnapWriteCopyData(STsdbSnapWriter* pWriter, TABLEID* pId) { +static int32_t tsdbSnapWriteTableRow(STsdbSnapWriter* pWriter, TSDBROW* pRow) { int32_t code = 0; + int32_t lino = 0; - while (true) { - if (pWriter->dReader.pBlockIdx == NULL) break; - if (tTABLEIDCmprFn(pWriter->dReader.pBlockIdx, pId) >= 0) break; + TSDBKEY inKey = pRow ? TSDBROW_KEY(pRow) : TSDBKEY_MAX; + + if (pWriter->pDIter == NULL || (pWriter->pDIter->dIter.iRow >= pWriter->pDIter->dIter.bData.nRow && + pWriter->pDIter->dIter.iDataBlk >= pWriter->pDIter->dIter.mDataBlk.nItem)) { + goto _write_row; + } else { + for (;;) { + while (pWriter->pDIter->dIter.iRow < pWriter->pDIter->dIter.bData.nRow) { + TSDBROW row = tsdbRowFromBlockData(&pWriter->pDIter->dIter.bData, pWriter->pDIter->dIter.iRow); + + int32_t c = tsdbKeyCmprFn(&inKey, &TSDBROW_KEY(&row)); + if (c < 0) { + goto _write_row; + } else if (c > 0) { + code = tsdbSnapWriteTableRowImpl(pWriter, &row); + TSDB_CHECK_CODE(code, lino, _exit); + + pWriter->pDIter->dIter.iRow++; + } else { + ASSERT(0); + } + } - SBlockIdx blkIdx = *pWriter->dReader.pBlockIdx; - code = tsdbWriteDataBlk(pWriter->dWriter.pWriter, &pWriter->dReader.mDataBlk, &blkIdx); - if (code) goto _exit; + for (;;) { + if (pWriter->pDIter->dIter.iDataBlk >= pWriter->pDIter->dIter.mDataBlk.nItem) goto _write_row; - if (taosArrayPush(pWriter->dWriter.aBlockIdx, &blkIdx) == NULL) { - code = TSDB_CODE_OUT_OF_MEMORY; - goto _exit; + // FIXME: Here can be slow, use array instead + SDataBlk dataBlk; + tMapDataGetItemByIdx(&pWriter->pDIter->dIter.mDataBlk, pWriter->pDIter->dIter.iDataBlk, &dataBlk, tGetDataBlk); + + int32_t c = tDataBlkCmprFn(&dataBlk, &(SDataBlk){.minKey = inKey, .maxKey = inKey}); + if (c > 0) { + goto _write_row; + } else if (c < 0) { + if (pWriter->bData.nRow > 0) { + code = tsdbWriteDataBlock(pWriter->pDataFWriter, &pWriter->bData, &pWriter->mDataBlk, pWriter->cmprAlg); + TSDB_CHECK_CODE(code, lino, _exit); + } + + tMapDataPutItem(&pWriter->pDIter->dIter.mDataBlk, &dataBlk, tPutDataBlk); + pWriter->pDIter->dIter.iDataBlk++; + } else { + code = tsdbReadDataBlockEx(pWriter->pDataFReader, &dataBlk, &pWriter->pDIter->dIter.bData); + TSDB_CHECK_CODE(code, lino, _exit); + + pWriter->pDIter->dIter.iRow = 0; + pWriter->pDIter->dIter.iDataBlk++; + break; + } + } } + } - code = tsdbSnapNextTableData(pWriter); - if (code) goto _exit; +_write_row: + if (pRow) { + code = tsdbSnapWriteTableRowImpl(pWriter, pRow); + TSDB_CHECK_CODE(code, lino, _exit); } _exit: + if (code) { + tsdbError("vgId:%d %s failed at line %d since %s", TD_VID(pWriter->pTsdb->pVnode), __func__, lino, tstrerror(code)); + } return code; } -static int32_t tsdbSnapWriteTableDataStart(STsdbSnapWriter* pWriter, TABLEID* pId) { +static int32_t tsdbSnapWriteTableDataEnd(STsdbSnapWriter* pWriter) { int32_t code = 0; + int32_t lino = 0; + + // write a NULL row to end current table data write + code = tsdbSnapWriteTableRow(pWriter, NULL); + TSDB_CHECK_CODE(code, lino, _exit); + + if (pWriter->bData.nRow > 0) { + if (pWriter->bData.nRow < pWriter->minRow) { + ASSERT(TABLE_SAME_SCHEMA(pWriter->sData.suid, pWriter->sData.uid, pWriter->tbid.suid, pWriter->tbid.uid)); + for (int32_t iRow = 0; iRow < pWriter->bData.nRow; iRow++) { + code = + tBlockDataAppendRow(&pWriter->sData, &tsdbRowFromBlockData(&pWriter->bData, iRow), NULL, pWriter->tbid.uid); + TSDB_CHECK_CODE(code, lino, _exit); - code = tsdbSnapWriteCopyData(pWriter, pId); - if (code) goto _err; + if (pWriter->sData.nRow >= pWriter->maxRow) { + code = tsdbWriteSttBlock(pWriter->pDataFWriter, &pWriter->sData, pWriter->aSttBlk, pWriter->cmprAlg); + TSDB_CHECK_CODE(code, lino, _exit); + } + } - pWriter->id.suid = pId->suid; - pWriter->id.uid = pId->uid; + tBlockDataClear(&pWriter->bData); + } else { + code = tsdbWriteDataBlock(pWriter->pDataFWriter, &pWriter->bData, &pWriter->mDataBlk, pWriter->cmprAlg); + TSDB_CHECK_CODE(code, lino, _exit); + } + } - code = tsdbUpdateTableSchema(pWriter->pTsdb->pVnode->pMeta, pId->suid, pId->uid, &pWriter->skmTable); - if (code) goto _err; + if (pWriter->mDataBlk.nItem) { + SBlockIdx* pBlockIdx = taosArrayReserve(pWriter->aBlockIdx, 1); + if (pBlockIdx == NULL) { + code = TSDB_CODE_OUT_OF_MEMORY; + TSDB_CHECK_CODE(code, lino, _exit); + } - tMapDataReset(&pWriter->dWriter.mDataBlk); - code = tBlockDataInit(&pWriter->dWriter.bData, pId, pWriter->skmTable.pTSchema, NULL, 0); - if (code) goto _err; + pBlockIdx->suid = pWriter->tbid.suid; + pBlockIdx->uid = pWriter->tbid.uid; - return code; + code = tsdbWriteDataBlk(pWriter->pDataFWriter, &pWriter->mDataBlk, pBlockIdx); + TSDB_CHECK_CODE(code, lino, _exit); + } -_err: - tsdbError("vgId:%d, %s failed since %s", TD_VID(pWriter->pTsdb->pVnode), __func__, tstrerror(code)); +_exit: + if (code) { + tsdbError("vgId:%d %s failed at line %d since %s", TD_VID(pWriter->pTsdb->pVnode), __func__, lino, tstrerror(code)); + } return code; } -static int32_t tsdbSnapWriteTableDataEnd(STsdbSnapWriter* pWriter) { +static int32_t tsdbSnapWriteFileDataStart(STsdbSnapWriter* pWriter, int32_t fid) { int32_t code = 0; + int32_t lino = 0; - if (pWriter->id.suid == 0 && pWriter->id.uid == 0) return code; + ASSERT(pWriter->pDataFWriter == NULL && pWriter->fid < fid); - int32_t c = 1; - if (pWriter->dReader.pBlockIdx) { - c = tTABLEIDCmprFn(pWriter->dReader.pBlockIdx, &pWriter->id); - ASSERT(c >= 0); - } + STsdb* pTsdb = pWriter->pTsdb; + + pWriter->fid = fid; + pWriter->tbid = (TABLEID){0}; + SDFileSet* pSet = taosArraySearch(pWriter->fs.aDFileSet, &(SDFileSet){.fid = fid}, tDFileSetCmprFn, TD_EQ); + + // open reader + pWriter->pDataFReader = NULL; + pWriter->iterList = NULL; + pWriter->pDIter = NULL; + pWriter->pSIter = NULL; + tRBTreeCreate(&pWriter->rbt, tsdbDataIterCmprFn); + if (pSet) { + code = tsdbDataFReaderOpen(&pWriter->pDataFReader, pTsdb, pSet); + TSDB_CHECK_CODE(code, lino, _exit); + + code = tsdbOpenDataFileDataIter(pWriter->pDataFReader, &pWriter->pDIter); + TSDB_CHECK_CODE(code, lino, _exit); + if (pWriter->pDIter) { + pWriter->pDIter->next = pWriter->iterList; + pWriter->iterList = pWriter->pDIter; + } - if (c == 0) { - SBlockData* pBData = &pWriter->dWriter.bData; + for (int32_t iStt = 0; iStt < pSet->nSttF; iStt++) { + code = tsdbOpenSttFileDataIter(pWriter->pDataFReader, iStt, &pWriter->pSIter); + TSDB_CHECK_CODE(code, lino, _exit); - for (; pWriter->dReader.iRow < pWriter->dReader.bData.nRow; pWriter->dReader.iRow++) { - TSDBROW row = tsdbRowFromBlockData(&pWriter->dReader.bData, pWriter->dReader.iRow); + if (pWriter->pSIter) { + code = tsdbSttFileDataIterNext(pWriter->pSIter, NULL); + TSDB_CHECK_CODE(code, lino, _exit); - code = tBlockDataAppendRow(pBData, &row, NULL, pWriter->id.uid); - if (code) goto _err; + // add to tree + tRBTreePut(&pWriter->rbt, &pWriter->pSIter->rbtn); - if (pBData->nRow >= pWriter->maxRow) { - code = tsdbWriteDataBlock(pWriter->dWriter.pWriter, pBData, &pWriter->dWriter.mDataBlk, pWriter->cmprAlg); - if (code) goto _err; + // add to list + pWriter->pSIter->next = pWriter->iterList; + pWriter->iterList = pWriter->pSIter; } } - code = tsdbWriteDataBlock(pWriter->dWriter.pWriter, pBData, &pWriter->dWriter.mDataBlk, pWriter->cmprAlg); - if (code) goto _err; + pWriter->pSIter = NULL; + } + + // open writer + SDiskID diskId; + if (pSet) { + diskId = pSet->diskId; + } else { + tfsAllocDisk(pTsdb->pVnode->pTfs, 0 /*TODO*/, &diskId); + tfsMkdirRecurAt(pTsdb->pVnode->pTfs, pTsdb->path, diskId); + } + SDFileSet wSet = {.diskId = diskId, + .fid = fid, + .pHeadF = &(SHeadFile){.commitID = pWriter->commitID}, + .pDataF = (pSet) ? pSet->pDataF : &(SDataFile){.commitID = pWriter->commitID}, + .pSmaF = (pSet) ? pSet->pSmaF : &(SSmaFile){.commitID = pWriter->commitID}, + .nSttF = 1, + .aSttF = {&(SSttFile){.commitID = pWriter->commitID}}}; + code = tsdbDataFWriterOpen(&pWriter->pDataFWriter, pTsdb, &wSet); + TSDB_CHECK_CODE(code, lino, _exit); - for (; pWriter->dReader.iDataBlk < pWriter->dReader.mDataBlk.nItem; pWriter->dReader.iDataBlk++) { - SDataBlk dataBlk; - tMapDataGetItemByIdx(&pWriter->dReader.mDataBlk, pWriter->dReader.iDataBlk, &dataBlk, tGetDataBlk); + if (pWriter->aBlockIdx) { + taosArrayClear(pWriter->aBlockIdx); + } else if ((pWriter->aBlockIdx = taosArrayInit(0, sizeof(SBlockIdx))) == NULL) { + code = TSDB_CODE_OUT_OF_MEMORY; + TSDB_CHECK_CODE(code, lino, _exit); + } - code = tMapDataPutItem(&pWriter->dWriter.mDataBlk, &dataBlk, tPutDataBlk); - if (code) goto _err; - } + tMapDataReset(&pWriter->mDataBlk); - code = tsdbSnapNextTableData(pWriter); - if (code) goto _err; + if (pWriter->aSttBlk) { + taosArrayClear(pWriter->aSttBlk); + } else if ((pWriter->aSttBlk = taosArrayInit(0, sizeof(SSttBlk))) == NULL) { + code = TSDB_CODE_OUT_OF_MEMORY; + TSDB_CHECK_CODE(code, lino, _exit); } - if (pWriter->dWriter.mDataBlk.nItem) { - SBlockIdx blockIdx = {.suid = pWriter->id.suid, .uid = pWriter->id.uid}; - code = tsdbWriteDataBlk(pWriter->dWriter.pWriter, &pWriter->dWriter.mDataBlk, &blockIdx); + tBlockDataReset(&pWriter->bData); + tBlockDataReset(&pWriter->sData); - if (taosArrayPush(pWriter->dWriter.aBlockIdx, &blockIdx) == NULL) { - code = TSDB_CODE_OUT_OF_MEMORY; - goto _err; +_exit: + if (code) { + tsdbError("vgId:%d %s failed at line %d since %s, fid:%d", TD_VID(pTsdb->pVnode), __func__, lino, tstrerror(code), + fid); + } else { + tsdbDebug("vgId:%d %s done, fid:%d", TD_VID(pTsdb->pVnode), __func__, fid); + } + return code; +} + +static int32_t tsdbSnapWriteTableData(STsdbSnapWriter* pWriter, SRowInfo* pRowInfo) { + int32_t code = 0; + int32_t lino = 0; + + // switch to new table if need + if (pRowInfo == NULL || pRowInfo->uid != pWriter->tbid.uid) { + if (pWriter->tbid.uid) { + code = tsdbSnapWriteTableDataEnd(pWriter); + TSDB_CHECK_CODE(code, lino, _exit); } + + code = tsdbSnapWriteTableDataStart(pWriter, (TABLEID*)pRowInfo); + TSDB_CHECK_CODE(code, lino, _exit); } - pWriter->id.suid = 0; - pWriter->id.uid = 0; + if (pRowInfo == NULL) goto _exit; - return code; + code = tsdbSnapWriteTableRow(pWriter, &pRowInfo->row); + TSDB_CHECK_CODE(code, lino, _exit); -_err: +_exit: + if (code) { + tsdbError("vgId:%d %s failed at line %d since %s", TD_VID(pWriter->pTsdb->pVnode), __func__, lino, tstrerror(code)); + } return code; } -static int32_t tsdbSnapWriteOpenFile(STsdbSnapWriter* pWriter, int32_t fid) { +static int32_t tsdbSnapWriteNextRow(STsdbSnapWriter* pWriter, SRowInfo** ppRowInfo) { int32_t code = 0; - STsdb* pTsdb = pWriter->pTsdb; - - ASSERT(pWriter->dWriter.pWriter == NULL); + int32_t lino = 0; - pWriter->fid = fid; - pWriter->id = (TABLEID){0}; - SDFileSet* pSet = taosArraySearch(pWriter->fs.aDFileSet, &(SDFileSet){.fid = fid}, tDFileSetCmprFn, TD_EQ); + if (pWriter->pSIter) { + code = tsdbDataIterNext2(pWriter->pSIter, NULL); + TSDB_CHECK_CODE(code, lino, _exit); - // Reader - if (pSet) { - code = tsdbDataFReaderOpen(&pWriter->dReader.pReader, pWriter->pTsdb, pSet); - if (code) goto _err; + if (pWriter->pSIter->rowInfo.suid == 0 && pWriter->pSIter->rowInfo.uid == 0) { + pWriter->pSIter = NULL; + } else { + SRBTreeNode* pNode = tRBTreeMin(&pWriter->rbt); + if (pNode) { + int32_t c = tsdbDataIterCmprFn(&pWriter->pSIter->rbtn, pNode); + if (c > 0) { + tRBTreePut(&pWriter->rbt, &pWriter->pSIter->rbtn); + pWriter->pSIter = NULL; + } else if (c == 0) { + ASSERT(0); + } + } + } + } - code = tsdbReadBlockIdx(pWriter->dReader.pReader, pWriter->dReader.aBlockIdx); - if (code) goto _err; - } else { - ASSERT(pWriter->dReader.pReader == NULL); - taosArrayClear(pWriter->dReader.aBlockIdx); - } - pWriter->dReader.iBlockIdx = 0; // point to the next one - code = tsdbSnapNextTableData(pWriter); - if (code) goto _err; - - // Writer - SHeadFile fHead = {.commitID = pWriter->commitID}; - SDataFile fData = {.commitID = pWriter->commitID}; - SSmaFile fSma = {.commitID = pWriter->commitID}; - SSttFile fStt = {.commitID = pWriter->commitID}; - SDFileSet wSet = {.fid = pWriter->fid, .pHeadF = &fHead, .pDataF = &fData, .pSmaF = &fSma}; - if (pSet) { - wSet.diskId = pSet->diskId; - fData = *pSet->pDataF; - fSma = *pSet->pSmaF; - for (int32_t iStt = 0; iStt < pSet->nSttF; iStt++) { - wSet.aSttF[iStt] = pSet->aSttF[iStt]; + if (pWriter->pSIter == NULL) { + SRBTreeNode* pNode = tRBTreeMin(&pWriter->rbt); + if (pNode) { + tRBTreeDrop(&pWriter->rbt, pNode); + pWriter->pSIter = TSDB_RBTN_TO_DATA_ITER(pNode); } - wSet.nSttF = pSet->nSttF + 1; // TODO: fix pSet->nSttF == pTsdb->maxFile - } else { - SDiskID did = {0}; - tfsAllocDisk(pTsdb->pVnode->pTfs, 0, &did); - tfsMkdirRecurAt(pTsdb->pVnode->pTfs, pTsdb->path, did); - wSet.diskId = did; - wSet.nSttF = 1; - } - wSet.aSttF[wSet.nSttF - 1] = &fStt; - - code = tsdbDataFWriterOpen(&pWriter->dWriter.pWriter, pWriter->pTsdb, &wSet); - if (code) goto _err; - taosArrayClear(pWriter->dWriter.aBlockIdx); - tMapDataReset(&pWriter->dWriter.mDataBlk); - taosArrayClear(pWriter->dWriter.aSttBlk); - tBlockDataReset(&pWriter->dWriter.bData); - tBlockDataReset(&pWriter->dWriter.sData); + } - return code; + if (ppRowInfo) { + if (pWriter->pSIter) { + *ppRowInfo = &pWriter->pSIter->rowInfo; + } else { + *ppRowInfo = NULL; + } + } -_err: +_exit: + if (code) { + tsdbError("vgId:%d %s failed at line %d since %s", TD_VID(pWriter->pTsdb->pVnode), __func__, lino, tstrerror(code)); + } return code; } -static int32_t tsdbSnapWriteCloseFile(STsdbSnapWriter* pWriter) { +static int32_t tsdbSnapWriteGetRow(STsdbSnapWriter* pWriter, SRowInfo** ppRowInfo) { int32_t code = 0; + int32_t lino = 0; - ASSERT(pWriter->dWriter.pWriter); - - code = tsdbSnapWriteTableDataEnd(pWriter); - if (code) goto _err; - - // copy remain table data - TABLEID id = {.suid = INT64_MAX, .uid = INT64_MAX}; - code = tsdbSnapWriteCopyData(pWriter, &id); - if (code) goto _err; - - code = - tsdbWriteSttBlock(pWriter->dWriter.pWriter, &pWriter->dWriter.sData, pWriter->dWriter.aSttBlk, pWriter->cmprAlg); - if (code) goto _err; - - // Indices - code = tsdbWriteBlockIdx(pWriter->dWriter.pWriter, pWriter->dWriter.aBlockIdx); - if (code) goto _err; - - code = tsdbWriteSttBlk(pWriter->dWriter.pWriter, pWriter->dWriter.aSttBlk); - if (code) goto _err; - - code = tsdbUpdateDFileSetHeader(pWriter->dWriter.pWriter); - if (code) goto _err; - - code = tsdbFSUpsertFSet(&pWriter->fs, &pWriter->dWriter.pWriter->wSet); - if (code) goto _err; - - code = tsdbDataFWriterClose(&pWriter->dWriter.pWriter, 1); - if (code) goto _err; - - if (pWriter->dReader.pReader) { - code = tsdbDataFReaderClose(&pWriter->dReader.pReader); - if (code) goto _err; + if (pWriter->pSIter) { + *ppRowInfo = &pWriter->pSIter->rowInfo; + goto _exit; } -_exit: - return code; + code = tsdbSnapWriteNextRow(pWriter, ppRowInfo); + TSDB_CHECK_CODE(code, lino, _exit); -_err: +_exit: + if (code) { + tsdbError("vgId:%d %s failed at line %d since %s", TD_VID(pWriter->pTsdb->pVnode), __func__, lino, tstrerror(code)); + } return code; } -static int32_t tsdbSnapWriteToDataFile(STsdbSnapWriter* pWriter, int32_t iRow, int8_t* done) { +static int32_t tsdbSnapWriteFileDataEnd(STsdbSnapWriter* pWriter) { int32_t code = 0; + int32_t lino = 0; - SBlockData* pBData = &pWriter->bData; - TABLEID id = {.suid = pBData->suid, .uid = pBData->uid ? pBData->uid : pBData->aUid[iRow]}; - TSDBROW row = tsdbRowFromBlockData(pBData, iRow); - TSDBKEY key = TSDBROW_KEY(&row); + ASSERT(pWriter->pDataFWriter); - *done = 0; - while (pWriter->dReader.iRow < pWriter->dReader.bData.nRow || - pWriter->dReader.iDataBlk < pWriter->dReader.mDataBlk.nItem) { - // Merge row by row - for (; pWriter->dReader.iRow < pWriter->dReader.bData.nRow; pWriter->dReader.iRow++) { - TSDBROW trow = tsdbRowFromBlockData(&pWriter->dReader.bData, pWriter->dReader.iRow); - TSDBKEY tKey = TSDBROW_KEY(&trow); + // consume remain data and end with a NULL table row + SRowInfo* pRowInfo; + code = tsdbSnapWriteGetRow(pWriter, &pRowInfo); + TSDB_CHECK_CODE(code, lino, _exit); + for (;;) { + code = tsdbSnapWriteTableData(pWriter, pRowInfo); + TSDB_CHECK_CODE(code, lino, _exit); - ASSERT(pWriter->dReader.bData.suid == id.suid && pWriter->dReader.bData.uid == id.uid); + if (pRowInfo == NULL) break; - int32_t c = tsdbKeyCmprFn(&key, &tKey); - if (c < 0) { - code = tBlockDataAppendRow(&pWriter->dWriter.bData, &row, NULL, id.uid); - if (code) goto _err; - } else if (c > 0) { - code = tBlockDataAppendRow(&pWriter->dWriter.bData, &trow, NULL, id.uid); - if (code) goto _err; - } else { - ASSERT(0); - } + code = tsdbSnapWriteNextRow(pWriter, &pRowInfo); + TSDB_CHECK_CODE(code, lino, _exit); + } - if (pWriter->dWriter.bData.nRow >= pWriter->maxRow) { - code = tsdbWriteDataBlock(pWriter->dWriter.pWriter, &pWriter->dWriter.bData, &pWriter->dWriter.mDataBlk, - pWriter->cmprAlg); - if (code) goto _err; - } + // do file-level updates + code = tsdbWriteSttBlk(pWriter->pDataFWriter, pWriter->aSttBlk); + TSDB_CHECK_CODE(code, lino, _exit); - if (c < 0) { - *done = 1; - goto _exit; - } - } + code = tsdbWriteBlockIdx(pWriter->pDataFWriter, pWriter->aBlockIdx); + TSDB_CHECK_CODE(code, lino, _exit); - // Merge row by block - SDataBlk tDataBlk = {.minKey = key, .maxKey = key}; - for (; pWriter->dReader.iDataBlk < pWriter->dReader.mDataBlk.nItem; pWriter->dReader.iDataBlk++) { - SDataBlk dataBlk; - tMapDataGetItemByIdx(&pWriter->dReader.mDataBlk, pWriter->dReader.iDataBlk, &dataBlk, tGetDataBlk); + code = tsdbUpdateDFileSetHeader(pWriter->pDataFWriter); + TSDB_CHECK_CODE(code, lino, _exit); - int32_t c = tDataBlkCmprFn(&dataBlk, &tDataBlk); - if (c < 0) { - code = tsdbWriteDataBlock(pWriter->dWriter.pWriter, &pWriter->dWriter.bData, &pWriter->dWriter.mDataBlk, - pWriter->cmprAlg); - if (code) goto _err; - - code = tMapDataPutItem(&pWriter->dWriter.mDataBlk, &dataBlk, tPutDataBlk); - if (code) goto _err; - } else if (c > 0) { - code = tBlockDataAppendRow(&pWriter->dWriter.bData, &row, NULL, id.uid); - if (code) goto _err; - - if (pWriter->dWriter.bData.nRow >= pWriter->maxRow) { - code = tsdbWriteDataBlock(pWriter->dWriter.pWriter, &pWriter->dWriter.bData, &pWriter->dWriter.mDataBlk, - pWriter->cmprAlg); - if (code) goto _err; - } + code = tsdbFSUpsertFSet(&pWriter->fs, &pWriter->pDataFWriter->wSet); + TSDB_CHECK_CODE(code, lino, _exit); - *done = 1; - goto _exit; - } else { - code = tsdbReadDataBlockEx(pWriter->dReader.pReader, &dataBlk, &pWriter->dReader.bData); - if (code) goto _err; - pWriter->dReader.iRow = 0; + code = tsdbDataFWriterClose(&pWriter->pDataFWriter, 1); + TSDB_CHECK_CODE(code, lino, _exit); - pWriter->dReader.iDataBlk++; - break; - } - } + if (pWriter->pDataFReader) { + code = tsdbDataFReaderClose(&pWriter->pDataFReader); + TSDB_CHECK_CODE(code, lino, _exit); } -_exit: - return code; + // clear sources + while (pWriter->iterList) { + STsdbDataIter2* pIter = pWriter->iterList; + pWriter->iterList = pIter->next; + tsdbCloseDataIter2(pIter); + } -_err: - tsdbError("vgId:%d, %s failed since %s", TD_VID(pWriter->pTsdb->pVnode), __func__, tstrerror(code)); +_exit: + if (code) { + tsdbError("vgId:%d %s failed since %s", TD_VID(pWriter->pTsdb->pVnode), __func__, tstrerror(code)); + } else { + tsdbDebug("vgId:%d %s is done", TD_VID(pWriter->pTsdb->pVnode), __func__); + } return code; } -static int32_t tsdbSnapWriteToSttFile(STsdbSnapWriter* pWriter, int32_t iRow) { +static int32_t tsdbSnapWriteTimeSeriesData(STsdbSnapWriter* pWriter, SSnapDataHdr* pHdr) { int32_t code = 0; + int32_t lino = 0; - TABLEID id = {.suid = pWriter->bData.suid, - .uid = pWriter->bData.uid ? pWriter->bData.uid : pWriter->bData.aUid[iRow]}; - TSDBROW row = tsdbRowFromBlockData(&pWriter->bData, iRow); - SBlockData* pBData = &pWriter->dWriter.sData; + code = tDecmprBlockData(pHdr->data, pHdr->size, &pWriter->inData, pWriter->aBuf); + TSDB_CHECK_CODE(code, lino, _exit); - if (pBData->suid || pBData->uid) { - if (!TABLE_SAME_SCHEMA(pBData->suid, pBData->uid, id.suid, id.uid)) { - code = tsdbWriteSttBlock(pWriter->dWriter.pWriter, pBData, pWriter->dWriter.aSttBlk, pWriter->cmprAlg); - if (code) goto _err; + ASSERT(pWriter->inData.nRow > 0); - pBData->suid = 0; - pBData->uid = 0; + // switch to new data file if need + int32_t fid = tsdbKeyFid(pWriter->inData.aTSKEY[0], pWriter->minutes, pWriter->precision); + if (pWriter->fid != fid) { + if (pWriter->pDataFWriter) { + code = tsdbSnapWriteFileDataEnd(pWriter); + TSDB_CHECK_CODE(code, lino, _exit); } - } - if (pBData->suid == 0 && pBData->uid == 0) { - code = tsdbUpdateTableSchema(pWriter->pTsdb->pVnode->pMeta, pWriter->id.suid, pWriter->id.uid, &pWriter->skmTable); - if (code) goto _err; - - TABLEID tid = {.suid = pWriter->id.suid, .uid = pWriter->id.suid ? 0 : pWriter->id.uid}; - code = tBlockDataInit(pBData, &tid, pWriter->skmTable.pTSchema, NULL, 0); - if (code) goto _err; + code = tsdbSnapWriteFileDataStart(pWriter, fid); + TSDB_CHECK_CODE(code, lino, _exit); } - code = tBlockDataAppendRow(pBData, &row, NULL, id.uid); - if (code) goto _err; + // loop write each row + SRowInfo* pRowInfo; + code = tsdbSnapWriteGetRow(pWriter, &pRowInfo); + TSDB_CHECK_CODE(code, lino, _exit); + for (int32_t iRow = 0; iRow < pWriter->inData.nRow; ++iRow) { + SRowInfo rInfo = {.suid = pWriter->inData.suid, + .uid = pWriter->inData.uid ? pWriter->inData.uid : pWriter->inData.aUid[iRow], + .row = tsdbRowFromBlockData(&pWriter->inData, iRow)}; - if (pBData->nRow >= pWriter->maxRow) { - code = tsdbWriteSttBlock(pWriter->dWriter.pWriter, pBData, pWriter->dWriter.aSttBlk, pWriter->cmprAlg); - if (code) goto _err; + for (;;) { + if (pRowInfo == NULL) { + code = tsdbSnapWriteTableData(pWriter, &rInfo); + TSDB_CHECK_CODE(code, lino, _exit); + break; + } else { + int32_t c = tRowInfoCmprFn(&rInfo, pRowInfo); + if (c < 0) { + code = tsdbSnapWriteTableData(pWriter, &rInfo); + TSDB_CHECK_CODE(code, lino, _exit); + break; + } else if (c > 0) { + code = tsdbSnapWriteTableData(pWriter, pRowInfo); + TSDB_CHECK_CODE(code, lino, _exit); + + code = tsdbSnapWriteNextRow(pWriter, &pRowInfo); + TSDB_CHECK_CODE(code, lino, _exit); + } else { + ASSERT(0); + } + } + } } _exit: - return code; - -_err: + if (code) { + tsdbError("vgId:%d %s failed at line %d since %s", TD_VID(pWriter->pTsdb->pVnode), __func__, lino, tstrerror(code)); + } else { + tsdbDebug("vgId:%d %s done, suid:%" PRId64 " uid:%" PRId64 " nRow:%d", TD_VID(pWriter->pTsdb->pVnode), __func__, + pWriter->inData.suid, pWriter->inData.uid, pWriter->inData.nRow); + } return code; } -static int32_t tsdbSnapWriteRowData(STsdbSnapWriter* pWriter, int32_t iRow) { +// SNAP_DATA_DEL +static int32_t tsdbSnapWriteDelTableDataStart(STsdbSnapWriter* pWriter, TABLEID* pId) { int32_t code = 0; + int32_t lino = 0; - SBlockData* pBlockData = &pWriter->bData; - TABLEID id = {.suid = pBlockData->suid, .uid = pBlockData->uid ? pBlockData->uid : pBlockData->aUid[iRow]}; - - // End last table data write if need - if (tTABLEIDCmprFn(&pWriter->id, &id) != 0) { - code = tsdbSnapWriteTableDataEnd(pWriter); - if (code) goto _err; - } - - // Start new table data write if need - if (pWriter->id.suid == 0 && pWriter->id.uid == 0) { - code = tsdbSnapWriteTableDataStart(pWriter, &id); - if (code) goto _err; - } - - // Merge with .data file data - int8_t done = 0; - if (pWriter->dReader.pBlockIdx && tTABLEIDCmprFn(pWriter->dReader.pBlockIdx, &id) == 0) { - code = tsdbSnapWriteToDataFile(pWriter, iRow, &done); - if (code) goto _err; - } - - // Append to the .stt data block (todo: check if need to set/reload sst block) - if (!done) { - code = tsdbSnapWriteToSttFile(pWriter, iRow); - if (code) goto _err; + if (pId) { + pWriter->tbid = *pId; + } else { + pWriter->tbid = (TABLEID){.suid = INT64_MAX, .uid = INT64_MAX}; } -_exit: - return code; + taosArrayClear(pWriter->aDelData); -_err: - tsdbError("vgId:%d, %s failed since %s", TD_VID(pWriter->pTsdb->pVnode), __func__, tstrerror(code)); - return code; -} + if (pWriter->pTIter) { + while (pWriter->pTIter->tIter.iDelIdx < taosArrayGetSize(pWriter->pTIter->tIter.aDelIdx)) { + SDelIdx* pDelIdx = taosArrayGet(pWriter->pTIter->tIter.aDelIdx, pWriter->pTIter->tIter.iDelIdx); -static int32_t tsdbSnapWriteData(STsdbSnapWriter* pWriter, uint8_t* pData, uint32_t nData) { - int32_t code = 0; - STsdb* pTsdb = pWriter->pTsdb; - SBlockData* pBlockData = &pWriter->bData; + int32_t c = tTABLEIDCmprFn(pDelIdx, &pWriter->tbid); + if (c < 0) { + code = tsdbReadDelData(pWriter->pDelFReader, pDelIdx, pWriter->pTIter->tIter.aDelData); + TSDB_CHECK_CODE(code, lino, _exit); - // Decode data - SSnapDataHdr* pHdr = (SSnapDataHdr*)pData; - code = tDecmprBlockData(pHdr->data, pHdr->size, pBlockData, pWriter->aBuf); - if (code) goto _err; + SDelIdx* pDelIdxNew = taosArrayReserve(pWriter->aDelIdx, 1); + if (pDelIdxNew == NULL) { + code = TSDB_CODE_OUT_OF_MEMORY; + TSDB_CHECK_CODE(code, lino, _exit); + } - ASSERT(pBlockData->nRow > 0); + pDelIdxNew->suid = pDelIdx->suid; + pDelIdxNew->uid = pDelIdx->uid; - // Loop to handle each row - for (int32_t iRow = 0; iRow < pBlockData->nRow; iRow++) { - TSKEY ts = pBlockData->aTSKEY[iRow]; - int32_t fid = tsdbKeyFid(ts, pWriter->minutes, pWriter->precision); + code = tsdbWriteDelData(pWriter->pDelFWriter, pWriter->pTIter->tIter.aDelData, pDelIdxNew); + TSDB_CHECK_CODE(code, lino, _exit); - if (pWriter->dWriter.pWriter == NULL || pWriter->fid != fid) { - if (pWriter->dWriter.pWriter) { - // ASSERT(fid > pWriter->fid); + pWriter->pTIter->tIter.iDelIdx++; + } else if (c == 0) { + code = tsdbReadDelData(pWriter->pDelFReader, pDelIdx, pWriter->aDelData); + TSDB_CHECK_CODE(code, lino, _exit); - code = tsdbSnapWriteCloseFile(pWriter); - if (code) goto _err; + pWriter->pTIter->tIter.iDelIdx++; + break; + } else { + break; } - - code = tsdbSnapWriteOpenFile(pWriter, fid); - if (code) goto _err; } - - code = tsdbSnapWriteRowData(pWriter, iRow); - if (code) goto _err; } - return code; - -_err: - tsdbError("vgId:%d, vnode snapshot tsdb write data for %s failed since %s", TD_VID(pTsdb->pVnode), pTsdb->path, - tstrerror(code)); +_exit: + if (code) { + tsdbError("vgId:%d %s failed at line %d since %s", TD_VID(pWriter->pTsdb->pVnode), __func__, lino, tstrerror(code)); + } else { + tsdbTrace("vgId:%d %s done, suid:%" PRId64 " uid:%" PRId64, TD_VID(pWriter->pTsdb->pVnode), __func__, + pWriter->tbid.suid, pWriter->tbid.uid); + } return code; } -// SNAP_DATA_DEL -static int32_t tsdbSnapMoveWriteDelData(STsdbSnapWriter* pWriter, TABLEID* pId) { +static int32_t tsdbSnapWriteDelTableDataEnd(STsdbSnapWriter* pWriter) { int32_t code = 0; + int32_t lino = 0; - while (true) { - if (pWriter->iDelIdx >= taosArrayGetSize(pWriter->aDelIdxR)) break; - - SDelIdx* pDelIdx = (SDelIdx*)taosArrayGet(pWriter->aDelIdxR, pWriter->iDelIdx); - - if (tTABLEIDCmprFn(pDelIdx, pId) >= 0) break; - - code = tsdbReadDelData(pWriter->pDelFReader, pDelIdx, pWriter->aDelData); - if (code) goto _exit; - - SDelIdx delIdx = *pDelIdx; - code = tsdbWriteDelData(pWriter->pDelFWriter, pWriter->aDelData, &delIdx); - if (code) goto _exit; - - if (taosArrayPush(pWriter->aDelIdxW, &delIdx) == NULL) { + if (taosArrayGetSize(pWriter->aDelData) > 0) { + SDelIdx* pDelIdx = taosArrayReserve(pWriter->aDelIdx, 1); + if (pDelIdx == NULL) { code = TSDB_CODE_OUT_OF_MEMORY; - goto _exit; + TSDB_CHECK_CODE(code, lino, _exit); } - pWriter->iDelIdx++; + pDelIdx->suid = pWriter->tbid.suid; + pDelIdx->uid = pWriter->tbid.uid; + + code = tsdbWriteDelData(pWriter->pDelFWriter, pWriter->aDelData, pDelIdx); + TSDB_CHECK_CODE(code, lino, _exit); } _exit: + if (code) { + tsdbError("vgId:%d %s failed at line %d since %s", TD_VID(pWriter->pTsdb->pVnode), __func__, lino, tstrerror(code)); + } else { + tsdbTrace("vgId:%d %s done", TD_VID(pWriter->pTsdb->pVnode), __func__); + } return code; } -static int32_t tsdbSnapWriteDel(STsdbSnapWriter* pWriter, uint8_t* pData, uint32_t nData) { +static int32_t tsdbSnapWriteDelTableData(STsdbSnapWriter* pWriter, TABLEID* pId, uint8_t* pData, int64_t size) { int32_t code = 0; - STsdb* pTsdb = pWriter->pTsdb; - - // Open del file if not opened yet - if (pWriter->pDelFWriter == NULL) { - SDelFile* pDelFile = pWriter->fs.pDelFile; - - // reader - if (pDelFile) { - code = tsdbDelFReaderOpen(&pWriter->pDelFReader, pDelFile, pTsdb); - if (code) goto _err; + int32_t lino = 0; - code = tsdbReadDelIdx(pWriter->pDelFReader, pWriter->aDelIdxR); - if (code) goto _err; - } else { - taosArrayClear(pWriter->aDelIdxR); + if (pId == NULL || pId->uid != pWriter->tbid.uid) { + if (pWriter->tbid.uid) { + code = tsdbSnapWriteDelTableDataEnd(pWriter); + TSDB_CHECK_CODE(code, lino, _exit); } - pWriter->iDelIdx = 0; - // writer - SDelFile delFile = {.commitID = pWriter->commitID}; - code = tsdbDelFWriterOpen(&pWriter->pDelFWriter, &delFile, pTsdb); - if (code) goto _err; - taosArrayClear(pWriter->aDelIdxW); + code = tsdbSnapWriteDelTableDataStart(pWriter, pId); + TSDB_CHECK_CODE(code, lino, _exit); } - SSnapDataHdr* pHdr = (SSnapDataHdr*)pData; - TABLEID id = *(TABLEID*)pHdr->data; + if (pId == NULL) goto _exit; - ASSERT(pHdr->size + sizeof(SSnapDataHdr) == nData); + int64_t n = 0; + while (n < size) { + SDelData delData; + n += tGetDelData(pData + n, &delData); - // Move write data < id - code = tsdbSnapMoveWriteDelData(pWriter, &id); - if (code) goto _err; + if (taosArrayPush(pWriter->aDelData, &delData) < 0) { + code = TSDB_CODE_OUT_OF_MEMORY; + TSDB_CHECK_CODE(code, lino, _exit); + } + } + ASSERT(n == size); - // Merge incoming data with current - if (pWriter->iDelIdx < taosArrayGetSize(pWriter->aDelIdxR) && - tTABLEIDCmprFn(taosArrayGet(pWriter->aDelIdxR, pWriter->iDelIdx), &id) == 0) { - SDelIdx* pDelIdx = (SDelIdx*)taosArrayGet(pWriter->aDelIdxR, pWriter->iDelIdx); +_exit: + if (code) { + tsdbError("vgId:%d %s failed at line %d since %s", TD_VID(pWriter->pTsdb->pVnode), __func__, lino, tstrerror(code)); + } + return code; +} - code = tsdbReadDelData(pWriter->pDelFReader, pDelIdx, pWriter->aDelData); - if (code) goto _err; +static int32_t tsdbSnapWriteDelDataStart(STsdbSnapWriter* pWriter) { + int32_t code = 0; + int32_t lino = 0; - pWriter->iDelIdx++; - } else { - taosArrayClear(pWriter->aDelData); - } + STsdb* pTsdb = pWriter->pTsdb; + SDelFile* pDelFile = pWriter->fs.pDelFile; - int64_t n = sizeof(SSnapDataHdr) + sizeof(TABLEID); - while (n < nData) { - SDelData delData; + pWriter->tbid = (TABLEID){0}; - n += tGetDelData(pData + n, &delData); + // reader + if (pDelFile) { + code = tsdbDelFReaderOpen(&pWriter->pDelFReader, pDelFile, pTsdb); + TSDB_CHECK_CODE(code, lino, _exit); - if (taosArrayPush(pWriter->aDelData, &delData) == NULL) { - code = TSDB_CODE_OUT_OF_MEMORY; - goto _err; - } + code = tsdbOpenTombFileDataIter(pWriter->pDelFReader, &pWriter->pTIter); + TSDB_CHECK_CODE(code, lino, _exit); } - SDelIdx delIdx = {.suid = id.suid, .uid = id.uid}; - code = tsdbWriteDelData(pWriter->pDelFWriter, pWriter->aDelData, &delIdx); - if (code) goto _err; + // writer + code = tsdbDelFWriterOpen(&pWriter->pDelFWriter, &(SDelFile){.commitID = pWriter->commitID}, pTsdb); + TSDB_CHECK_CODE(code, lino, _exit); - if (taosArrayPush(pWriter->aDelIdxW, &delIdx) == NULL) { + if ((pWriter->aDelIdx = taosArrayInit(0, sizeof(SDelIdx))) == NULL) { code = TSDB_CODE_OUT_OF_MEMORY; - goto _err; + TSDB_CHECK_CODE(code, lino, _exit); + } + if ((pWriter->aDelData = taosArrayInit(0, sizeof(SDelData))) == NULL) { + code = TSDB_CODE_OUT_OF_MEMORY; + TSDB_CHECK_CODE(code, lino, _exit); } - return code; - -_err: - tsdbError("vgId:%d, vnode snapshot tsdb write del for %s failed since %s", TD_VID(pTsdb->pVnode), pTsdb->path, - tstrerror(code)); +_exit: + if (code) { + tsdbError("vgId:%d %s failed at line %d since %s", TD_VID(pTsdb->pVnode), __func__, lino, tstrerror(code)); + } else { + tsdbDebug("vgId:%d %s done", TD_VID(pTsdb->pVnode), __func__); + } return code; } -static int32_t tsdbSnapWriteDelEnd(STsdbSnapWriter* pWriter) { +static int32_t tsdbSnapWriteDelDataEnd(STsdbSnapWriter* pWriter) { int32_t code = 0; - STsdb* pTsdb = pWriter->pTsdb; + int32_t lino = 0; - if (pWriter->pDelFWriter == NULL) return code; + STsdb* pTsdb = pWriter->pTsdb; - TABLEID id = {.suid = INT64_MAX, .uid = INT64_MAX}; - code = tsdbSnapMoveWriteDelData(pWriter, &id); - if (code) goto _err; + // end remaining table with NULL data + code = tsdbSnapWriteDelTableData(pWriter, NULL, NULL, 0); + TSDB_CHECK_CODE(code, lino, _exit); - code = tsdbWriteDelIdx(pWriter->pDelFWriter, pWriter->aDelIdxW); - if (code) goto _err; + // update file-level info + code = tsdbWriteDelIdx(pWriter->pDelFWriter, pWriter->aDelIdx); + TSDB_CHECK_CODE(code, lino, _exit); code = tsdbUpdateDelFileHdr(pWriter->pDelFWriter); - if (code) goto _err; + TSDB_CHECK_CODE(code, lino, _exit); code = tsdbFSUpsertDelFile(&pWriter->fs, &pWriter->pDelFWriter->fDel); - if (code) goto _err; + TSDB_CHECK_CODE(code, lino, _exit); code = tsdbDelFWriterClose(&pWriter->pDelFWriter, 1); - if (code) goto _err; + TSDB_CHECK_CODE(code, lino, _exit); if (pWriter->pDelFReader) { code = tsdbDelFReaderClose(&pWriter->pDelFReader); - if (code) goto _err; + TSDB_CHECK_CODE(code, lino, _exit); + } + + if (pWriter->pTIter) { + tsdbCloseDataIter2(pWriter->pTIter); + pWriter->pTIter = NULL; } - tsdbInfo("vgId:%d, vnode snapshot tsdb write del for %s end", TD_VID(pTsdb->pVnode), pTsdb->path); +_exit: + if (code) { + tsdbError("vgId:%d %s failed at line %d since %s", TD_VID(pTsdb->pVnode), __func__, lino, tstrerror(code)); + } else { + tsdbInfo("vgId:%d %s done", TD_VID(pTsdb->pVnode), __func__); + } return code; +} + +static int32_t tsdbSnapWriteDelData(STsdbSnapWriter* pWriter, SSnapDataHdr* pHdr) { + int32_t code = 0; + int32_t lino = 0; + + STsdb* pTsdb = pWriter->pTsdb; -_err: - tsdbError("vgId:%d, vnode snapshot tsdb write del end for %s failed since %s", TD_VID(pTsdb->pVnode), pTsdb->path, - tstrerror(code)); + // start to write del data if need + if (pWriter->pDelFWriter == NULL) { + code = tsdbSnapWriteDelDataStart(pWriter); + TSDB_CHECK_CODE(code, lino, _exit); + } + + // do write del data + code = tsdbSnapWriteDelTableData(pWriter, (TABLEID*)pHdr->data, pHdr->data + sizeof(TABLEID), + pHdr->size - sizeof(TABLEID)); + TSDB_CHECK_CODE(code, lino, _exit); + +_exit: + if (code) { + tsdbError("vgId:%d %s failed since %s", TD_VID(pTsdb->pVnode), __func__, tstrerror(code)); + } else { + tsdbTrace("vgId:%d %s done", TD_VID(pTsdb->pVnode), __func__); + } return code; } // APIs int32_t tsdbSnapWriterOpen(STsdb* pTsdb, int64_t sver, int64_t ever, STsdbSnapWriter** ppWriter) { - int32_t code = 0; - int32_t lino = 0; - STsdbSnapWriter* pWriter = NULL; + int32_t code = 0; + int32_t lino = 0; // alloc - pWriter = (STsdbSnapWriter*)taosMemoryCalloc(1, sizeof(*pWriter)); + STsdbSnapWriter* pWriter = (STsdbSnapWriter*)taosMemoryCalloc(1, sizeof(*pWriter)); if (pWriter == NULL) { code = TSDB_CODE_OUT_OF_MEMORY; TSDB_CHECK_CODE(code, lino, _exit); @@ -1288,11 +1833,6 @@ int32_t tsdbSnapWriterOpen(STsdb* pTsdb, int64_t sver, int64_t ever, STsdbSnapWr pWriter->pTsdb = pTsdb; pWriter->sver = sver; pWriter->ever = ever; - - code = tsdbFSCopy(pTsdb, &pWriter->fs); - TSDB_CHECK_CODE(code, lino, _exit); - - // config pWriter->minutes = pTsdb->keepCfg.days; pWriter->precision = pTsdb->keepCfg.precision; pWriter->minRow = pTsdb->pVnode->config.tsdbCfg.minRows; @@ -1300,102 +1840,70 @@ int32_t tsdbSnapWriterOpen(STsdb* pTsdb, int64_t sver, int64_t ever, STsdbSnapWr pWriter->cmprAlg = pTsdb->pVnode->config.tsdbCfg.compression; pWriter->commitID = pTsdb->pVnode->state.commitID; + code = tsdbFSCopy(pTsdb, &pWriter->fs); + TSDB_CHECK_CODE(code, lino, _exit); + // SNAP_DATA_TSDB - code = tBlockDataCreate(&pWriter->bData); + code = tBlockDataCreate(&pWriter->inData); TSDB_CHECK_CODE(code, lino, _exit); pWriter->fid = INT32_MIN; - pWriter->id = (TABLEID){0}; - // Reader - pWriter->dReader.aBlockIdx = taosArrayInit(0, sizeof(SBlockIdx)); - if (pWriter->dReader.aBlockIdx == NULL) { - code = TSDB_CODE_OUT_OF_MEMORY; - TSDB_CHECK_CODE(code, lino, _exit); - } - code = tBlockDataCreate(&pWriter->dReader.bData); - TSDB_CHECK_CODE(code, lino, _exit); - // Writer - pWriter->dWriter.aBlockIdx = taosArrayInit(0, sizeof(SBlockIdx)); - if (pWriter->dWriter.aBlockIdx == NULL) { - code = TSDB_CODE_OUT_OF_MEMORY; - TSDB_CHECK_CODE(code, lino, _exit); - } - pWriter->dWriter.aSttBlk = taosArrayInit(0, sizeof(SSttBlk)); - if (pWriter->dWriter.aSttBlk == NULL) { - code = TSDB_CODE_OUT_OF_MEMORY; - TSDB_CHECK_CODE(code, lino, _exit); - } - code = tBlockDataCreate(&pWriter->dWriter.bData); + code = tBlockDataCreate(&pWriter->bData); TSDB_CHECK_CODE(code, lino, _exit); - code = tBlockDataCreate(&pWriter->dWriter.sData); + + code = tBlockDataCreate(&pWriter->sData); TSDB_CHECK_CODE(code, lino, _exit); // SNAP_DATA_DEL - pWriter->aDelIdxR = taosArrayInit(0, sizeof(SDelIdx)); - if (pWriter->aDelIdxR == NULL) { - code = TSDB_CODE_OUT_OF_MEMORY; - TSDB_CHECK_CODE(code, lino, _exit); - } - pWriter->aDelData = taosArrayInit(0, sizeof(SDelData)); - if (pWriter->aDelData == NULL) { - code = TSDB_CODE_OUT_OF_MEMORY; - TSDB_CHECK_CODE(code, lino, _exit); - } - pWriter->aDelIdxW = taosArrayInit(0, sizeof(SDelIdx)); - if (pWriter->aDelIdxW == NULL) { - code = TSDB_CODE_OUT_OF_MEMORY; - TSDB_CHECK_CODE(code, lino, _exit); - } _exit: if (code) { - tsdbError("vgId:%d, %s failed at line %d since %s", TD_VID(pTsdb->pVnode), __func__, lino, tstrerror(code)); - *ppWriter = NULL; - + tsdbError("vgId:%d %s failed at line %d since %s", TD_VID(pTsdb->pVnode), __func__, lino, tstrerror(code)); if (pWriter) { - if (pWriter->aDelIdxW) taosArrayDestroy(pWriter->aDelIdxW); - if (pWriter->aDelData) taosArrayDestroy(pWriter->aDelData); - if (pWriter->aDelIdxR) taosArrayDestroy(pWriter->aDelIdxR); - tBlockDataDestroy(&pWriter->dWriter.sData, 1); - tBlockDataDestroy(&pWriter->dWriter.bData, 1); - if (pWriter->dWriter.aSttBlk) taosArrayDestroy(pWriter->dWriter.aSttBlk); - if (pWriter->dWriter.aBlockIdx) taosArrayDestroy(pWriter->dWriter.aBlockIdx); - tBlockDataDestroy(&pWriter->dReader.bData, 1); - if (pWriter->dReader.aBlockIdx) taosArrayDestroy(pWriter->dReader.aBlockIdx); + tBlockDataDestroy(&pWriter->sData, 1); tBlockDataDestroy(&pWriter->bData, 1); + tBlockDataDestroy(&pWriter->inData, 1); tsdbFSDestroy(&pWriter->fs); - taosMemoryFree(pWriter); + pWriter = NULL; } } else { - tsdbInfo("vgId:%d, %s done", TD_VID(pTsdb->pVnode), __func__); - *ppWriter = pWriter; + tsdbInfo("vgId:%d %s done, sver:%" PRId64 " ever:%" PRId64, TD_VID(pTsdb->pVnode), __func__, sver, ever); } + *ppWriter = pWriter; return code; } int32_t tsdbSnapWriterPrepareClose(STsdbSnapWriter* pWriter) { int32_t code = 0; - if (pWriter->dWriter.pWriter) { - code = tsdbSnapWriteCloseFile(pWriter); - if (code) goto _exit; + int32_t lino = 0; + + if (pWriter->pDataFWriter) { + code = tsdbSnapWriteFileDataEnd(pWriter); + TSDB_CHECK_CODE(code, lino, _exit); } - code = tsdbSnapWriteDelEnd(pWriter); - if (code) goto _exit; + if (pWriter->pDelFWriter) { + code = tsdbSnapWriteDelDataEnd(pWriter); + TSDB_CHECK_CODE(code, lino, _exit); + } code = tsdbFSPrepareCommit(pWriter->pTsdb, &pWriter->fs); - if (code) goto _exit; + TSDB_CHECK_CODE(code, lino, _exit); _exit: if (code) { - tsdbError("vgId:%d, %s failed since %s", TD_VID(pWriter->pTsdb->pVnode), __func__, tstrerror(code)); + tsdbError("vgId:%d %s failed at line %d since %s", TD_VID(pWriter->pTsdb->pVnode), __func__, lino, tstrerror(code)); + } else { + tsdbDebug("vgId:%d %s done", TD_VID(pWriter->pTsdb->pVnode), __func__); } return code; } int32_t tsdbSnapWriterClose(STsdbSnapWriter** ppWriter, int8_t rollback) { - int32_t code = 0; + int32_t code = 0; + int32_t lino = 0; + STsdbSnapWriter* pWriter = *ppWriter; STsdb* pTsdb = pWriter->pTsdb; @@ -1408,7 +1916,7 @@ int32_t tsdbSnapWriterClose(STsdbSnapWriter** ppWriter, int8_t rollback) { code = tsdbFSCommit(pWriter->pTsdb); if (code) { taosThreadRwlockUnlock(&pTsdb->rwLock); - goto _err; + TSDB_CHECK_CODE(code, lino, _exit); } // unlock @@ -1416,72 +1924,60 @@ int32_t tsdbSnapWriterClose(STsdbSnapWriter** ppWriter, int8_t rollback) { } // SNAP_DATA_DEL - taosArrayDestroy(pWriter->aDelIdxW); taosArrayDestroy(pWriter->aDelData); - taosArrayDestroy(pWriter->aDelIdxR); + taosArrayDestroy(pWriter->aDelIdx); // SNAP_DATA_TSDB - - // Writer - tBlockDataDestroy(&pWriter->dWriter.sData, 1); - tBlockDataDestroy(&pWriter->dWriter.bData, 1); - taosArrayDestroy(pWriter->dWriter.aSttBlk); - tMapDataClear(&pWriter->dWriter.mDataBlk); - taosArrayDestroy(pWriter->dWriter.aBlockIdx); - - // Reader - tBlockDataDestroy(&pWriter->dReader.bData, 1); - tMapDataClear(&pWriter->dReader.mDataBlk); - taosArrayDestroy(pWriter->dReader.aBlockIdx); - + tBlockDataDestroy(&pWriter->sData, 1); tBlockDataDestroy(&pWriter->bData, 1); + taosArrayDestroy(pWriter->aSttBlk); + tMapDataClear(&pWriter->mDataBlk); + taosArrayDestroy(pWriter->aBlockIdx); tDestroyTSchema(pWriter->skmTable.pTSchema); + tBlockDataDestroy(&pWriter->inData, 1); for (int32_t iBuf = 0; iBuf < sizeof(pWriter->aBuf) / sizeof(uint8_t*); iBuf++) { tFree(pWriter->aBuf[iBuf]); } - tsdbInfo("vgId:%d, %s done", TD_VID(pWriter->pTsdb->pVnode), __func__); + tsdbFSDestroy(&pWriter->fs); taosMemoryFree(pWriter); *ppWriter = NULL; - return code; -_err: - tsdbError("vgId:%d, vnode snapshot tsdb writer close for %s failed since %s", TD_VID(pWriter->pTsdb->pVnode), - pWriter->pTsdb->path, tstrerror(code)); - taosMemoryFree(pWriter); - *ppWriter = NULL; +_exit: + if (code) { + tsdbError("vgId:%d %s failed at line %d since %s", TD_VID(pTsdb->pVnode), __func__, lino, tstrerror(code)); + } else { + tsdbInfo("vgId:%d %s done", TD_VID(pTsdb->pVnode), __func__); + } return code; } -int32_t tsdbSnapWrite(STsdbSnapWriter* pWriter, uint8_t* pData, uint32_t nData) { - int32_t code = 0; - SSnapDataHdr* pHdr = (SSnapDataHdr*)pData; +int32_t tsdbSnapWrite(STsdbSnapWriter* pWriter, SSnapDataHdr* pHdr) { + int32_t code = 0; + int32_t lino = 0; - // ts data if (pHdr->type == SNAP_DATA_TSDB) { - code = tsdbSnapWriteData(pWriter, pData, nData); - if (code) goto _err; - + code = tsdbSnapWriteTimeSeriesData(pWriter, pHdr); + TSDB_CHECK_CODE(code, lino, _exit); goto _exit; - } else { - if (pWriter->dWriter.pWriter) { - code = tsdbSnapWriteCloseFile(pWriter); - if (code) goto _err; - } + } else if (pWriter->pDataFWriter) { + code = tsdbSnapWriteFileDataEnd(pWriter); + TSDB_CHECK_CODE(code, lino, _exit); } - // del data if (pHdr->type == SNAP_DATA_DEL) { - code = tsdbSnapWriteDel(pWriter, pData, nData); - if (code) goto _err; + code = tsdbSnapWriteDelData(pWriter, pHdr); + TSDB_CHECK_CODE(code, lino, _exit); + goto _exit; } _exit: - tsdbDebug("vgId:%d, tsdb snapshot write for %s succeed", TD_VID(pWriter->pTsdb->pVnode), pWriter->pTsdb->path); - return code; - -_err: - tsdbError("vgId:%d, tsdb snapshot write for %s failed since %s", TD_VID(pWriter->pTsdb->pVnode), pWriter->pTsdb->path, - tstrerror(code)); + if (code) { + tsdbError("vgId:%d %s failed at line %d since %s, type:%d index:%" PRId64 " size:%" PRId64, + TD_VID(pWriter->pTsdb->pVnode), __func__, lino, tstrerror(code), pHdr->type, pHdr->index, pHdr->size); + } else { + tsdbDebug("vgId:%d %s done, type:%d index:%" PRId64 " size:%" PRId64, TD_VID(pWriter->pTsdb->pVnode), __func__, + pHdr->type, pHdr->index, pHdr->size); + } return code; } diff --git a/source/dnode/vnode/src/tsdb/tsdbUtil.c b/source/dnode/vnode/src/tsdb/tsdbUtil.c index ab1096097014bc6f07a9c2f27871bd0d57581445..e2d4b92836d8f6c25cdd4eb5838553fdef167ba5 100644 --- a/source/dnode/vnode/src/tsdb/tsdbUtil.c +++ b/source/dnode/vnode/src/tsdb/tsdbUtil.c @@ -684,7 +684,7 @@ int32_t tRowMergerInit2(SRowMerger *pMerger, STSchema *pResTSchema, TSDBROW *pRo tsdbRowGetColVal(pRow, pTSchema, jCol++, pColVal); if ((!COL_VAL_IS_NONE(pColVal)) && (!COL_VAL_IS_NULL(pColVal)) && IS_VAR_DATA_TYPE(pColVal->type)) { uint8_t *pVal = pColVal->value.pData; - + pColVal->value.pData = NULL; code = tRealloc(&pColVal->value.pData, pColVal->value.nData); if (code) goto _exit; @@ -757,7 +757,7 @@ int32_t tRowMergerAdd(SRowMerger *pMerger, TSDBROW *pRow, STSchema *pTSchema) { pTColVal->value.nData = pColVal->value.nData; if (pTColVal->value.nData) { - memcpy(pTColVal->value.pData, pColVal->value.pData, pTColVal->value.nData); + memcpy(pTColVal->value.pData, pColVal->value.pData, pTColVal->value.nData); } pTColVal->flag = 0; } else { @@ -776,7 +776,7 @@ int32_t tRowMergerAdd(SRowMerger *pMerger, TSDBROW *pRow, STSchema *pTSchema) { code = tRealloc(&tColVal->value.pData, pColVal->value.nData); if (code) return code; - tColVal->value.nData = pColVal->value.nData; + tColVal->value.nData = pColVal->value.nData; if (pColVal->value.nData) { memcpy(tColVal->value.pData, pColVal->value.pData, pColVal->value.nData); } @@ -825,7 +825,7 @@ int32_t tRowMergerInit(SRowMerger *pMerger, TSDBROW *pRow, STSchema *pTSchema) { tsdbRowGetColVal(pRow, pTSchema, iCol, pColVal); if ((!COL_VAL_IS_NONE(pColVal)) && (!COL_VAL_IS_NULL(pColVal)) && IS_VAR_DATA_TYPE(pColVal->type)) { uint8_t *pVal = pColVal->value.pData; - + pColVal->value.pData = NULL; code = tRealloc(&pColVal->value.pData, pColVal->value.nData); if (code) goto _exit; @@ -834,7 +834,7 @@ int32_t tRowMergerInit(SRowMerger *pMerger, TSDBROW *pRow, STSchema *pTSchema) { memcpy(pColVal->value.pData, pVal, pColVal->value.nData); } } - + if (taosArrayPush(pMerger->pArray, pColVal) == NULL) { code = TSDB_CODE_OUT_OF_MEMORY; goto _exit; @@ -845,7 +845,7 @@ _exit: return code; } -void tRowMergerClear(SRowMerger *pMerger) { +void tRowMergerClear(SRowMerger *pMerger) { for (int32_t iCol = 1; iCol < pMerger->pTSchema->numOfCols; iCol++) { SColVal *pTColVal = taosArrayGet(pMerger->pArray, iCol); if (IS_VAR_DATA_TYPE(pTColVal->type)) { @@ -853,7 +853,7 @@ void tRowMergerClear(SRowMerger *pMerger) { } } - taosArrayDestroy(pMerger->pArray); + taosArrayDestroy(pMerger->pArray); } int32_t tRowMerge(SRowMerger *pMerger, TSDBROW *pRow) { @@ -876,7 +876,7 @@ int32_t tRowMerge(SRowMerger *pMerger, TSDBROW *pRow) { pTColVal->value.nData = pColVal->value.nData; if (pTColVal->value.nData) { - memcpy(pTColVal->value.pData, pColVal->value.pData, pTColVal->value.nData); + memcpy(pTColVal->value.pData, pColVal->value.pData, pTColVal->value.nData); } pTColVal->flag = 0; } else { @@ -898,7 +898,7 @@ int32_t tRowMerge(SRowMerger *pMerger, TSDBROW *pRow) { tColVal->value.nData = pColVal->value.nData; if (tColVal->value.nData) { - memcpy(tColVal->value.pData, pColVal->value.pData, tColVal->value.nData); + memcpy(tColVal->value.pData, pColVal->value.pData, tColVal->value.nData); } tColVal->flag = 0; } else { diff --git a/source/dnode/vnode/src/vnd/vnodeSnapshot.c b/source/dnode/vnode/src/vnd/vnodeSnapshot.c index e75dc24329c03ade45e12242fd70a62b963f74ff..43f903dc4867178919e3d3b519b899afcca6d835 100644 --- a/source/dnode/vnode/src/vnd/vnodeSnapshot.c +++ b/source/dnode/vnode/src/vnd/vnodeSnapshot.c @@ -455,7 +455,7 @@ int32_t vnodeSnapWrite(SVSnapWriter *pWriter, uint8_t *pData, uint32_t nData) { if (code) goto _err; } - code = tsdbSnapWrite(pWriter->pTsdbSnapWriter, pData, nData); + code = tsdbSnapWrite(pWriter->pTsdbSnapWriter, pHdr); if (code) goto _err; } break; case SNAP_DATA_TQ_HANDLE: { diff --git a/source/libs/executor/src/timewindowoperator.c b/source/libs/executor/src/timewindowoperator.c index d78e9c4edf06329ccdb6fe1817a67c318ee3041d..d320ef6e9e1cd4525886bda90f5dd764b5e91488 100644 --- a/source/libs/executor/src/timewindowoperator.c +++ b/source/libs/executor/src/timewindowoperator.c @@ -2477,7 +2477,19 @@ static void doStreamIntervalAggImpl(SOperatorInfo* pOperatorInfo, SSDataBlock* p pInfo->delKey = key; } int32_t prevEndPos = (forwardRows - 1) * step + startPos; - ASSERT(pSDataBlock->info.window.skey > 0 && pSDataBlock->info.window.ekey > 0); + if (pSDataBlock->info.window.skey <= 0 || pSDataBlock->info.window.ekey <= 0) { + qError("table uid %" PRIu64 " data block timestamp range may not be calculated! minKey %" PRId64 + ",maxKey %" PRId64, + pSDataBlock->info.id.uid, pSDataBlock->info.window.skey, pSDataBlock->info.window.ekey); + blockDataUpdateTsWindow(pSDataBlock, 0); + + // timestamp of the data is incorrect + if (pSDataBlock->info.window.skey <= 0 || pSDataBlock->info.window.ekey <= 0) { + qError("table uid %" PRIu64 " data block timestamp is out of range! minKey %" PRId64 ",maxKey %" PRId64, + pSDataBlock->info.id.uid, pSDataBlock->info.window.skey, pSDataBlock->info.window.ekey); + } + } + if (IS_FINAL_OP(pInfo)) { startPos = getNextQualifiedFinalWindow(&pInfo->interval, &nextWin, &pSDataBlock->info, tsCols, prevEndPos); } else { diff --git a/source/libs/qworker/inc/qwInt.h b/source/libs/qworker/inc/qwInt.h index aa1ce8090369dbfef9dbabf7f2ff47e41a7af73b..bde05d41164c0ed36fc662f21e145a8967052ab0 100644 --- a/source/libs/qworker/inc/qwInt.h +++ b/source/libs/qworker/inc/qwInt.h @@ -194,6 +194,8 @@ typedef struct SQWorker { SMsgCb msgCb; SQWStat stat; int32_t *destroyed; + + int8_t nodeStopped; } SQWorker; typedef struct SQWorkerMgmt { diff --git a/source/libs/qworker/src/qwUtil.c b/source/libs/qworker/src/qwUtil.c index fdd2775daacbbadecd600f117ea3928c162e8f08..7ee7c50c9687d26f94825a5b0888caba9371f66d 100644 --- a/source/libs/qworker/src/qwUtil.c +++ b/source/libs/qworker/src/qwUtil.c @@ -213,9 +213,15 @@ int32_t qwAcquireTaskCtx(QW_FPARAMS_DEF, SQWTaskCtx **ctx) { QW_SET_QTID(id, qId, tId, eId); *ctx = taosHashAcquire(mgmt->ctxHash, id, sizeof(id)); + int8_t nodeStopped = atomic_load_8(&mgmt->nodeStopped); if (NULL == (*ctx)) { - QW_TASK_DLOG_E("task ctx not exist, may be dropped"); - QW_ERR_RET(TSDB_CODE_QRY_TASK_CTX_NOT_EXIST); + if (!nodeStopped) { + QW_TASK_DLOG_E("task ctx not exist, may be dropped"); + QW_ERR_RET(TSDB_CODE_QRY_TASK_CTX_NOT_EXIST); + } else { + QW_TASK_DLOG_E("node stopped"); + QW_ERR_RET(TSDB_CODE_VND_STOPPED); + } } return TSDB_CODE_SUCCESS; @@ -226,9 +232,16 @@ int32_t qwGetTaskCtx(QW_FPARAMS_DEF, SQWTaskCtx **ctx) { QW_SET_QTID(id, qId, tId, eId); *ctx = taosHashGet(mgmt->ctxHash, id, sizeof(id)); + int8_t nodeStopped = atomic_load_8(&mgmt->nodeStopped); + if (NULL == (*ctx)) { - QW_TASK_DLOG_E("task ctx not exist, may be dropped"); - QW_ERR_RET(TSDB_CODE_QRY_TASK_CTX_NOT_EXIST); + if (!nodeStopped) { + QW_TASK_DLOG_E("task ctx not exist, may be dropped"); + QW_ERR_RET(TSDB_CODE_QRY_TASK_CTX_NOT_EXIST); + } else { + QW_TASK_DLOG_E("node stopped"); + QW_ERR_RET(TSDB_CODE_VND_STOPPED); + } } return TSDB_CODE_SUCCESS; diff --git a/source/libs/qworker/src/qworker.c b/source/libs/qworker/src/qworker.c index e38361d87f52d914e2412a42ab222bdc7896ca3b..fedaa96ed9b0d91eb8b67a7f2656f813ee1f2acb 100644 --- a/source/libs/qworker/src/qworker.c +++ b/source/libs/qworker/src/qworker.c @@ -1188,6 +1188,9 @@ void qWorkerStopAllTasks(void *qWorkerMgmt) { uint64_t qId, tId, sId; int32_t eId; int64_t rId = 0; + + atomic_store_8(&mgmt->nodeStopped, 1); + void *pIter = taosHashIterate(mgmt->ctxHash, NULL); while (pIter) { SQWTaskCtx *ctx = (SQWTaskCtx *)pIter; diff --git a/source/libs/stream/src/streamMeta.c b/source/libs/stream/src/streamMeta.c index 6f77769decdf2649b66e5cac0c8bbf823cce85e1..2f991288ffd0201be79ed3392befcd5da669294e 100644 --- a/source/libs/stream/src/streamMeta.c +++ b/source/libs/stream/src/streamMeta.c @@ -207,6 +207,7 @@ void streamMetaRemoveTask(SStreamMeta* pMeta, int32_t taskId) { if (ppTask) { SStreamTask* pTask = *ppTask; taosHashRemove(pMeta->pTasks, &taskId, sizeof(int32_t)); + tdbTbDelete(pMeta->pTaskDb, &taskId, sizeof(int32_t), pMeta->txn); /*if (pTask->timer) { * taosTmrStop(pTask->timer);*/ /*pTask->timer = NULL;*/ diff --git a/source/libs/sync/src/syncAppendEntries.c b/source/libs/sync/src/syncAppendEntries.c index 948d2f53e7510d52a574c7c977615178fd080c3d..835e5d248e345cbbb3206e35d67ddd20717009db 100644 --- a/source/libs/sync/src/syncAppendEntries.c +++ b/source/libs/sync/src/syncAppendEntries.c @@ -89,45 +89,6 @@ // /\ UNCHANGED <> // -int32_t syncNodeFollowerCommit(SSyncNode* ths, SyncIndex newCommitIndex) { - ASSERT(false && "deprecated"); - if (ths->state != TAOS_SYNC_STATE_FOLLOWER) { - sNTrace(ths, "can not do follower commit"); - return -1; - } - - // maybe update commit index, leader notice me - if (newCommitIndex > ths->commitIndex) { - // has commit entry in local - if (newCommitIndex <= ths->pLogStore->syncLogLastIndex(ths->pLogStore)) { - // advance commit index to sanpshot first - SSnapshot snapshot; - ths->pFsm->FpGetSnapshotInfo(ths->pFsm, &snapshot); - if (snapshot.lastApplyIndex >= 0 && snapshot.lastApplyIndex > ths->commitIndex) { - SyncIndex commitBegin = ths->commitIndex; - SyncIndex commitEnd = snapshot.lastApplyIndex; - ths->commitIndex = snapshot.lastApplyIndex; - sNTrace(ths, "commit by snapshot from index:%" PRId64 " to index:%" PRId64, commitBegin, commitEnd); - } - - SyncIndex beginIndex = ths->commitIndex + 1; - SyncIndex endIndex = newCommitIndex; - - // update commit index - ths->commitIndex = newCommitIndex; - - // call back Wal - int32_t code = ths->pLogStore->syncLogUpdateCommitIndex(ths->pLogStore, ths->commitIndex); - ASSERT(code == 0); - - code = syncNodeDoCommit(ths, beginIndex, endIndex, ths->state); - ASSERT(code == 0); - } - } - - return 0; -} - SSyncRaftEntry* syncBuildRaftEntryFromAppendEntries(const SyncAppendEntries* pMsg) { SSyncRaftEntry* pEntry = taosMemoryMalloc(pMsg->dataLen); if (pEntry == NULL) { @@ -232,256 +193,3 @@ _IGNORE: rpcFreeCont(rpcRsp.pCont); return 0; } - -int32_t syncNodeOnAppendEntriesOld(SSyncNode* ths, const SRpcMsg* pRpcMsg) { - SyncAppendEntries* pMsg = pRpcMsg->pCont; - SRpcMsg rpcRsp = {0}; - - // if already drop replica, do not process - if (!syncNodeInRaftGroup(ths, &(pMsg->srcId))) { - syncLogRecvAppendEntries(ths, pMsg, "not in my config"); - goto _IGNORE; - } - - // prepare response msg - int32_t code = syncBuildAppendEntriesReply(&rpcRsp, ths->vgId); - if (code != 0) { - syncLogRecvAppendEntries(ths, pMsg, "build rsp error"); - goto _IGNORE; - } - - SyncAppendEntriesReply* pReply = rpcRsp.pCont; - pReply->srcId = ths->myRaftId; - pReply->destId = pMsg->srcId; - pReply->term = ths->raftStore.currentTerm; - pReply->success = false; - // pReply->matchIndex = ths->pLogStore->syncLogLastIndex(ths->pLogStore); - pReply->matchIndex = SYNC_INDEX_INVALID; - pReply->lastSendIndex = pMsg->prevLogIndex + 1; - pReply->startTime = ths->startTime; - - if (pMsg->term < ths->raftStore.currentTerm) { - syncLogRecvAppendEntries(ths, pMsg, "reject, small term"); - goto _SEND_RESPONSE; - } - - if (pMsg->term > ths->raftStore.currentTerm) { - pReply->term = pMsg->term; - } - - syncNodeStepDown(ths, pMsg->term); - syncNodeResetElectTimer(ths); - - SyncIndex startIndex = ths->pLogStore->syncLogBeginIndex(ths->pLogStore); - SyncIndex lastIndex = ths->pLogStore->syncLogLastIndex(ths->pLogStore); - - if (pMsg->prevLogIndex > lastIndex) { - syncLogRecvAppendEntries(ths, pMsg, "reject, index not match"); - goto _SEND_RESPONSE; - } - - if (pMsg->prevLogIndex >= startIndex) { - SyncTerm myPreLogTerm = syncNodeGetPreTerm(ths, pMsg->prevLogIndex + 1); - // ASSERT(myPreLogTerm != SYNC_TERM_INVALID); - if (myPreLogTerm == SYNC_TERM_INVALID) { - syncLogRecvAppendEntries(ths, pMsg, "reject, pre-term invalid"); - goto _SEND_RESPONSE; - } - - if (myPreLogTerm != pMsg->prevLogTerm) { - syncLogRecvAppendEntries(ths, pMsg, "reject, pre-term not match"); - goto _SEND_RESPONSE; - } - } - - // accept - pReply->success = true; - bool hasAppendEntries = pMsg->dataLen > 0; - if (hasAppendEntries) { - SSyncRaftEntry* pAppendEntry = syncEntryBuildFromAppendEntries(pMsg); - ASSERT(pAppendEntry != NULL); - - SyncIndex appendIndex = pMsg->prevLogIndex + 1; - - LRUHandle* hLocal = NULL; - LRUHandle* hAppend = NULL; - - int32_t code = 0; - SSyncRaftEntry* pLocalEntry = NULL; - SLRUCache* pCache = ths->pLogStore->pCache; - hLocal = taosLRUCacheLookup(pCache, &appendIndex, sizeof(appendIndex)); - if (hLocal) { - pLocalEntry = (SSyncRaftEntry*)taosLRUCacheValue(pCache, hLocal); - code = 0; - - ths->pLogStore->cacheHit++; - sNTrace(ths, "hit cache index:%" PRId64 ", bytes:%u, %p", appendIndex, pLocalEntry->bytes, pLocalEntry); - - } else { - ths->pLogStore->cacheMiss++; - sNTrace(ths, "miss cache index:%" PRId64, appendIndex); - - code = ths->pLogStore->syncLogGetEntry(ths->pLogStore, appendIndex, &pLocalEntry); - } - - if (code == 0) { - // get local entry success - - if (pLocalEntry->term == pAppendEntry->term) { - // do nothing - sNTrace(ths, "log match, do nothing, index:%" PRId64, appendIndex); - - } else { - // truncate - code = ths->pLogStore->syncLogTruncate(ths->pLogStore, appendIndex); - if (code != 0) { - char logBuf[128]; - snprintf(logBuf, sizeof(logBuf), "ignore, truncate error, append-index:%" PRId64, appendIndex); - syncLogRecvAppendEntries(ths, pMsg, logBuf); - - if (hLocal) { - taosLRUCacheRelease(ths->pLogStore->pCache, hLocal, false); - } else { - syncEntryDestroy(pLocalEntry); - } - - if (hAppend) { - taosLRUCacheRelease(ths->pLogStore->pCache, hAppend, false); - } else { - syncEntryDestroy(pAppendEntry); - } - - goto _IGNORE; - } - - ASSERT(pAppendEntry->index == appendIndex); - - // append - code = ths->pLogStore->syncLogAppendEntry(ths->pLogStore, pAppendEntry, false); - if (code != 0) { - char logBuf[128]; - snprintf(logBuf, sizeof(logBuf), "ignore, append error, append-index:%" PRId64, appendIndex); - syncLogRecvAppendEntries(ths, pMsg, logBuf); - - if (hLocal) { - taosLRUCacheRelease(ths->pLogStore->pCache, hLocal, false); - } else { - syncEntryDestroy(pLocalEntry); - } - - if (hAppend) { - taosLRUCacheRelease(ths->pLogStore->pCache, hAppend, false); - } else { - syncEntryDestroy(pAppendEntry); - } - - goto _IGNORE; - } - - syncCacheEntry(ths->pLogStore, pAppendEntry, &hAppend); - } - - } else { - if (terrno == TSDB_CODE_WAL_LOG_NOT_EXIST) { - // log not exist - - // truncate - code = ths->pLogStore->syncLogTruncate(ths->pLogStore, appendIndex); - if (code != 0) { - char logBuf[128]; - snprintf(logBuf, sizeof(logBuf), "ignore, log not exist, truncate error, append-index:%" PRId64, appendIndex); - syncLogRecvAppendEntries(ths, pMsg, logBuf); - - syncEntryDestroy(pLocalEntry); - syncEntryDestroy(pAppendEntry); - goto _IGNORE; - } - - // append - code = ths->pLogStore->syncLogAppendEntry(ths->pLogStore, pAppendEntry, false); - if (code != 0) { - char logBuf[128]; - snprintf(logBuf, sizeof(logBuf), "ignore, log not exist, append error, append-index:%" PRId64, appendIndex); - syncLogRecvAppendEntries(ths, pMsg, logBuf); - - if (hLocal) { - taosLRUCacheRelease(ths->pLogStore->pCache, hLocal, false); - } else { - syncEntryDestroy(pLocalEntry); - } - - if (hAppend) { - taosLRUCacheRelease(ths->pLogStore->pCache, hAppend, false); - } else { - syncEntryDestroy(pAppendEntry); - } - - goto _IGNORE; - } - - syncCacheEntry(ths->pLogStore, pAppendEntry, &hAppend); - - } else { - // get local entry success - char logBuf[128]; - snprintf(logBuf, sizeof(logBuf), "ignore, get local entry error, append-index:%" PRId64 " err:%d", appendIndex, - terrno); - syncLogRecvAppendEntries(ths, pMsg, logBuf); - - if (hLocal) { - taosLRUCacheRelease(ths->pLogStore->pCache, hLocal, false); - } else { - syncEntryDestroy(pLocalEntry); - } - - if (hAppend) { - taosLRUCacheRelease(ths->pLogStore->pCache, hAppend, false); - } else { - syncEntryDestroy(pAppendEntry); - } - - goto _IGNORE; - } - } - - // update match index - pReply->matchIndex = pAppendEntry->index; - - if (hLocal) { - taosLRUCacheRelease(ths->pLogStore->pCache, hLocal, false); - } else { - syncEntryDestroy(pLocalEntry); - } - - if (hAppend) { - taosLRUCacheRelease(ths->pLogStore->pCache, hAppend, false); - } else { - syncEntryDestroy(pAppendEntry); - } - - } else { - // no append entries, do nothing - // maybe has extra entries, no harm - - // update match index - pReply->matchIndex = pMsg->prevLogIndex; - } - - // maybe update commit index, leader notice me - syncNodeFollowerCommit(ths, pMsg->commitIndex); - - syncLogRecvAppendEntries(ths, pMsg, "accept"); - goto _SEND_RESPONSE; - -_IGNORE: - rpcFreeCont(rpcRsp.pCont); - return 0; - -_SEND_RESPONSE: - // msg event log - syncLogSendAppendEntriesReply(ths, pReply, ""); - - // send response - syncNodeSendMsgById(&pReply->destId, ths, &rpcRsp); - return 0; -} diff --git a/source/libs/sync/src/syncAppendEntriesReply.c b/source/libs/sync/src/syncAppendEntriesReply.c index 8157a5a14f9c5275036d14f8917aebf3dfe05a26..44a29da3ea0e54d4e9932183a67d298a9c6239ed 100644 --- a/source/libs/sync/src/syncAppendEntriesReply.c +++ b/source/libs/sync/src/syncAppendEntriesReply.c @@ -89,63 +89,3 @@ int32_t syncNodeOnAppendEntriesReply(SSyncNode* ths, const SRpcMsg* pRpcMsg) { } return 0; } - -int32_t syncNodeOnAppendEntriesReplyOld(SSyncNode* ths, SyncAppendEntriesReply* pMsg) { - int32_t ret = 0; - - // if already drop replica, do not process - if (!syncNodeInRaftGroup(ths, &(pMsg->srcId))) { - syncLogRecvAppendEntriesReply(ths, pMsg, "not in my config"); - return 0; - } - - // drop stale response - if (pMsg->term < ths->raftStore.currentTerm) { - syncLogRecvAppendEntriesReply(ths, pMsg, "drop stale response"); - return 0; - } - - if (ths->state == TAOS_SYNC_STATE_LEADER) { - if (pMsg->term > ths->raftStore.currentTerm) { - syncLogRecvAppendEntriesReply(ths, pMsg, "error term"); - syncNodeStepDown(ths, pMsg->term); - return -1; - } - - ASSERT(pMsg->term == ths->raftStore.currentTerm); - - if (pMsg->success) { - SyncIndex oldMatchIndex = syncIndexMgrGetIndex(ths->pMatchIndex, &(pMsg->srcId)); - if (pMsg->matchIndex > oldMatchIndex) { - syncIndexMgrSetIndex(ths->pMatchIndex, &(pMsg->srcId), pMsg->matchIndex); - syncMaybeAdvanceCommitIndex(ths); - - // maybe update minMatchIndex - ths->minMatchIndex = syncMinMatchIndex(ths); - } - syncIndexMgrSetIndex(ths->pNextIndex, &(pMsg->srcId), pMsg->matchIndex + 1); - - } else { - SyncIndex nextIndex = syncIndexMgrGetIndex(ths->pNextIndex, &(pMsg->srcId)); - if (nextIndex > SYNC_INDEX_BEGIN) { - --nextIndex; - } - syncIndexMgrSetIndex(ths->pNextIndex, &(pMsg->srcId), nextIndex); - } - - // send next append entries - SPeerState* pState = syncNodeGetPeerState(ths, &(pMsg->srcId)); - ASSERT(pState != NULL); - - if (pMsg->lastSendIndex == pState->lastSendIndex) { - int64_t timeNow = taosGetTimestampMs(); - int64_t elapsed = timeNow - pState->lastSendTime; - sNTrace(ths, "sync-append-entries rtt elapsed:%" PRId64 ", index:%" PRId64, elapsed, pState->lastSendIndex); - - syncNodeReplicateOne(ths, &(pMsg->srcId), true); - } - } - - syncLogRecvAppendEntriesReply(ths, pMsg, "process"); - return 0; -} diff --git a/source/libs/sync/src/syncCommit.c b/source/libs/sync/src/syncCommit.c index 286cf4daf52a0ef206ff4d2bf285b673dd1a7b1a..67ed1e0701eebefff06870af66611acdbd3bb681 100644 --- a/source/libs/sync/src/syncCommit.c +++ b/source/libs/sync/src/syncCommit.c @@ -43,148 +43,6 @@ // IN commitIndex' = [commitIndex EXCEPT ![i] = newCommitIndex] // /\ UNCHANGED <> // -void syncOneReplicaAdvance(SSyncNode* pSyncNode) { - ASSERT(false && "deprecated"); - if (pSyncNode == NULL) { - sError("pSyncNode is NULL"); - return; - } - - if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) { - sNError(pSyncNode, "not leader, can not advance commit index"); - return; - } - - if (pSyncNode->replicaNum != 1) { - sNError(pSyncNode, "not one replica, can not advance commit index"); - return; - } - - // advance commit index to snapshot first - SSnapshot snapshot; - pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot); - if (snapshot.lastApplyIndex > 0 && snapshot.lastApplyIndex > pSyncNode->commitIndex) { - SyncIndex commitBegin = pSyncNode->commitIndex; - SyncIndex commitEnd = snapshot.lastApplyIndex; - pSyncNode->commitIndex = snapshot.lastApplyIndex; - sNTrace(pSyncNode, "commit by snapshot from index:%" PRId64 " to index:%" PRId64, commitBegin, commitEnd); - } - - // advance commit index as large as possible - SyncIndex lastIndex = syncNodeGetLastIndex(pSyncNode); - if (lastIndex > pSyncNode->commitIndex) { - sNTrace(pSyncNode, "commit by wal from index:%" PRId64 " to index:%" PRId64, pSyncNode->commitIndex + 1, lastIndex); - pSyncNode->commitIndex = lastIndex; - } - - // call back Wal - SyncIndex walCommitVer = logStoreWalCommitVer(pSyncNode->pLogStore); - if (pSyncNode->commitIndex > walCommitVer) { - pSyncNode->pLogStore->syncLogUpdateCommitIndex(pSyncNode->pLogStore, pSyncNode->commitIndex); - } -} - -void syncMaybeAdvanceCommitIndex(SSyncNode* pSyncNode) { - ASSERTS(false, "deprecated"); - if (pSyncNode == NULL) { - sError("pSyncNode is NULL"); - return; - } - - if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) { - sNError(pSyncNode, "not leader, can not advance commit index"); - return; - } - - // advance commit index to sanpshot first - SSnapshot snapshot; - pSyncNode->pFsm->FpGetSnapshotInfo(pSyncNode->pFsm, &snapshot); - if (snapshot.lastApplyIndex > 0 && snapshot.lastApplyIndex > pSyncNode->commitIndex) { - SyncIndex commitBegin = pSyncNode->commitIndex; - SyncIndex commitEnd = snapshot.lastApplyIndex; - pSyncNode->commitIndex = snapshot.lastApplyIndex; - sNTrace(pSyncNode, "commit by snapshot from index:%" PRId64 " to index:%" PRId64, commitBegin, commitEnd); - } - - // update commit index - SyncIndex newCommitIndex = pSyncNode->commitIndex; - for (SyncIndex index = syncNodeGetLastIndex(pSyncNode); index > pSyncNode->commitIndex; --index) { - bool agree = syncAgree(pSyncNode, index); - - if (agree) { - // term - SSyncRaftEntry* pEntry = NULL; - SLRUCache* pCache = pSyncNode->pLogStore->pCache; - LRUHandle* h = taosLRUCacheLookup(pCache, &index, sizeof(index)); - if (h) { - pEntry = (SSyncRaftEntry*)taosLRUCacheValue(pCache, h); - - pSyncNode->pLogStore->cacheHit++; - sNTrace(pSyncNode, "hit cache index:%" PRId64 ", bytes:%u, %p", index, pEntry->bytes, pEntry); - - } else { - pSyncNode->pLogStore->cacheMiss++; - sNTrace(pSyncNode, "miss cache index:%" PRId64, index); - - int32_t code = pSyncNode->pLogStore->syncLogGetEntry(pSyncNode->pLogStore, index, &pEntry); - if (code != 0) { - sNError(pSyncNode, "advance commit index error, read wal index:%" PRId64, index); - return; - } - } - // cannot commit, even if quorum agree. need check term! - if (pEntry->term <= pSyncNode->raftStore.currentTerm) { - // update commit index - newCommitIndex = index; - - if (h) { - taosLRUCacheRelease(pCache, h, false); - } else { - syncEntryDestroy(pEntry); - } - - break; - } else { - sNTrace(pSyncNode, "can not commit due to term not equal, index:%" PRId64 ", term:%" PRIu64, pEntry->index, - pEntry->term); - } - - if (h) { - taosLRUCacheRelease(pCache, h, false); - } else { - syncEntryDestroy(pEntry); - } - } - } - - // advance commit index as large as possible - SyncIndex walCommitVer = logStoreWalCommitVer(pSyncNode->pLogStore); - if (walCommitVer > newCommitIndex) { - newCommitIndex = walCommitVer; - } - - // maybe execute fsm - if (newCommitIndex > pSyncNode->commitIndex) { - SyncIndex beginIndex = pSyncNode->commitIndex + 1; - SyncIndex endIndex = newCommitIndex; - - // update commit index - pSyncNode->commitIndex = newCommitIndex; - - // call back Wal - pSyncNode->pLogStore->syncLogUpdateCommitIndex(pSyncNode->pLogStore, pSyncNode->commitIndex); - - // execute fsm - if (pSyncNode != NULL && pSyncNode->pFsm != NULL) { - int32_t code = syncNodeDoCommit(pSyncNode, beginIndex, endIndex, pSyncNode->state); - if (code != 0) { - sNError(pSyncNode, "advance commit index error, do commit begin:%" PRId64 ", end:%" PRId64, beginIndex, - endIndex); - return; - } - } - } -} bool syncAgreeIndex(SSyncNode* pSyncNode, SRaftId* pRaftId, SyncIndex index) { // I am leader, I agree @@ -210,83 +68,7 @@ static inline int64_t syncNodeAbs64(int64_t a, int64_t b) { return c; } -int32_t syncNodeDynamicQuorum(const SSyncNode* pSyncNode) { - return pSyncNode->quorum; - -#if 0 - int32_t quorum = 1; // self - - int64_t timeNow = taosGetTimestampMs(); - for (int i = 0; i < pSyncNode->peersNum; ++i) { - int64_t peerStartTime = syncIndexMgrGetStartTime(pSyncNode->pNextIndex, &(pSyncNode->peersId)[i]); - int64_t peerRecvTime = syncIndexMgrGetRecvTime(pSyncNode->pNextIndex, &(pSyncNode->peersId)[i]); - SyncIndex peerMatchIndex = syncIndexMgrGetIndex(pSyncNode->pMatchIndex, &(pSyncNode->peersId)[i]); - - int64_t recvTimeDiff = TABS(peerRecvTime - timeNow); - int64_t startTimeDiff = TABS(peerStartTime - pSyncNode->startTime); - int64_t logDiff = TABS(peerMatchIndex - syncNodeGetLastIndex(pSyncNode)); - - /* - int64_t recvTimeDiff = syncNodeAbs64(peerRecvTime, timeNow); - int64_t startTimeDiff = syncNodeAbs64(peerStartTime, pSyncNode->startTime); - int64_t logDiff = syncNodeAbs64(peerMatchIndex, syncNodeGetLastIndex(pSyncNode)); - */ - - int32_t addQuorum = 0; - - if (recvTimeDiff < SYNC_MAX_RECV_TIME_RANGE_MS) { - if (startTimeDiff < SYNC_MAX_START_TIME_RANGE_MS) { - addQuorum = 1; - } else { - if (logDiff < SYNC_ADD_QUORUM_COUNT) { - addQuorum = 1; - } else { - addQuorum = 0; - } - } - } else { - addQuorum = 0; - } - - /* - if (recvTimeDiff < SYNC_MAX_RECV_TIME_RANGE_MS) { - addQuorum = 1; - } else { - addQuorum = 0; - } - - if (startTimeDiff > SYNC_MAX_START_TIME_RANGE_MS) { - addQuorum = 0; - } - */ - - quorum += addQuorum; - } - - ASSERT(quorum <= pSyncNode->replicaNum); - - if (quorum < pSyncNode->quorum) { - quorum = pSyncNode->quorum; - } - - return quorum; -#endif -} - -/* -bool syncAgree(SSyncNode* pSyncNode, SyncIndex index) { - int agreeCount = 0; - for (int i = 0; i < pSyncNode->replicaNum; ++i) { - if (syncAgreeIndex(pSyncNode, &(pSyncNode->replicasId[i]), index)) { - ++agreeCount; - } - if (agreeCount >= syncNodeDynamicQuorum(pSyncNode)) { - return true; - } - } - return false; -} -*/ +int32_t syncNodeDynamicQuorum(const SSyncNode* pSyncNode) { return pSyncNode->quorum; } bool syncNodeAgreedUpon(SSyncNode* pNode, SyncIndex index) { int count = 0; diff --git a/source/libs/sync/src/syncElection.c b/source/libs/sync/src/syncElection.c index cd3ffc33e34fef0047f9e92f319a5fadaf271419..682ace83ecfa99e4781f70915048cf62a5e2d76f 100644 --- a/source/libs/sync/src/syncElection.c +++ b/source/libs/sync/src/syncElection.c @@ -43,7 +43,10 @@ static int32_t syncNodeRequestVotePeers(SSyncNode* pNode) { for (int i = 0; i < pNode->peersNum; ++i) { SRpcMsg rpcMsg = {0}; ret = syncBuildRequestVote(&rpcMsg, pNode->vgId); - ASSERT(ret == 0); + if (ret < 0) { + sError("vgId:%d, failed to build request-vote msg since %s", pNode->vgId, terrstr()); + continue; + } SyncRequestVote* pMsg = rpcMsg.pCont; pMsg->srcId = pNode->myRaftId; @@ -51,13 +54,18 @@ static int32_t syncNodeRequestVotePeers(SSyncNode* pNode) { pMsg->term = pNode->raftStore.currentTerm; ret = syncNodeGetLastIndexTerm(pNode, &pMsg->lastLogIndex, &pMsg->lastLogTerm); - ASSERT(ret == 0); + if (ret < 0) { + sError("vgId:%d, failed to get index and term of last log since %s", pNode->vgId, terrstr()); + continue; + } ret = syncNodeSendMsgById(&pNode->peersId[i], pNode, &rpcMsg); - ASSERT(ret == 0); + if (ret < 0) { + sError("vgId:%d, failed to send msg to peerId:%" PRId64, pNode->vgId, pNode->peersId[i].addr); + continue; + } } - - return ret; + return 0; } int32_t syncNodeElect(SSyncNode* pSyncNode) { diff --git a/source/libs/sync/src/syncMain.c b/source/libs/sync/src/syncMain.c index eeda612bcdfc2a7ff28d33105e1e5deeb77c7aab..ea22ac7bb57deda837bdb4eee295cd07536cc8f1 100644 --- a/source/libs/sync/src/syncMain.c +++ b/source/libs/sync/src/syncMain.c @@ -292,8 +292,6 @@ int32_t syncBeginSnapshot(int64_t rid, int64_t lastApplyIndex) { goto _DEL_WAL; } else { - lastApplyIndex -= SYNC_VNODE_LOG_RETENTION; - SyncIndex beginIndex = pSyncNode->pLogStore->syncLogBeginIndex(pSyncNode->pLogStore); SyncIndex endIndex = pSyncNode->pLogStore->syncLogEndIndex(pSyncNode->pLogStore); bool isEmpty = pSyncNode->pLogStore->syncLogIsEmpty(pSyncNode->pLogStore); @@ -308,6 +306,8 @@ int32_t syncBeginSnapshot(int64_t rid, int64_t lastApplyIndex) { if (pSyncNode->replicaNum > 1) { // multi replicas + lastApplyIndex = TMAX(lastApplyIndex - SYNC_VNODE_LOG_RETENTION, beginIndex - 1); + if (pSyncNode->state == TAOS_SYNC_STATE_LEADER) { pSyncNode->minMatchIndex = syncMinMatchIndex(pSyncNode); @@ -586,78 +586,6 @@ SSyncState syncGetState(int64_t rid) { return state; } -#if 0 -int32_t syncGetSnapshotByIndex(int64_t rid, SyncIndex index, SSnapshot* pSnapshot) { - if (index < SYNC_INDEX_BEGIN) { - return -1; - } - - SSyncNode* pSyncNode = syncNodeAcquire(rid); - if (pSyncNode == NULL) { - return -1; - } - ASSERT(rid == pSyncNode->rid); - - SSyncRaftEntry* pEntry = NULL; - int32_t code = pSyncNode->pLogStore->syncLogGetEntry(pSyncNode->pLogStore, index, &pEntry); - if (code != 0) { - if (pEntry != NULL) { - syncEntryDestroy(pEntry); - } - syncNodeRelease(pSyncNode); - return -1; - } - ASSERT(pEntry != NULL); - - pSnapshot->data = NULL; - pSnapshot->lastApplyIndex = index; - pSnapshot->lastApplyTerm = pEntry->term; - pSnapshot->lastConfigIndex = syncNodeGetSnapshotConfigIndex(pSyncNode, index); - - syncEntryDestroy(pEntry); - syncNodeRelease(pSyncNode); - return 0; -} - -int32_t syncGetSnapshotMeta(int64_t rid, struct SSnapshotMeta* sMeta) { - SSyncNode* pSyncNode = syncNodeAcquire(rid); - if (pSyncNode == NULL) { - return -1; - } - ASSERT(rid == pSyncNode->rid); - sMeta->lastConfigIndex = pSyncNode->raftCfg.lastConfigIndex; - - sTrace("vgId:%d, get snapshot meta, lastConfigIndex:%" PRId64, pSyncNode->vgId, pSyncNode->raftCfg.lastConfigIndex); - - syncNodeRelease(pSyncNode); - return 0; -} - -int32_t syncGetSnapshotMetaByIndex(int64_t rid, SyncIndex snapshotIndex, struct SSnapshotMeta* sMeta) { - SSyncNode* pSyncNode = syncNodeAcquire(rid); - if (pSyncNode == NULL) { - return -1; - } - ASSERT(rid == pSyncNode->rid); - - ASSERT(pSyncNode->raftCfg.configIndexCount >= 1); - SyncIndex lastIndex = (pSyncNode->raftCfg.configIndexArr)[0]; - - for (int32_t i = 0; i < pSyncNode->raftCfg.configIndexCount; ++i) { - if ((pSyncNode->raftCfg.configIndexArr)[i] > lastIndex && - (pSyncNode->raftCfg.configIndexArr)[i] <= snapshotIndex) { - lastIndex = (pSyncNode->raftCfg.configIndexArr)[i]; - } - } - sMeta->lastConfigIndex = lastIndex; - sTrace("vgId:%d, get snapshot meta by index:%" PRId64 " lcindex:%" PRId64, pSyncNode->vgId, snapshotIndex, - sMeta->lastConfigIndex); - - syncNodeRelease(pSyncNode); - return 0; -} -#endif - SyncIndex syncNodeGetSnapshotConfigIndex(SSyncNode* pSyncNode, SyncIndex snapshotLastApplyIndex) { ASSERT(pSyncNode->raftCfg.configIndexCount >= 1); SyncIndex lastIndex = (pSyncNode->raftCfg.configIndexArr)[0]; @@ -1031,9 +959,12 @@ SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo) { pSyncNode->commitIndex = commitIndex; sInfo("vgId:%d, sync node commitIndex initialized as %" PRId64, pSyncNode->vgId, pSyncNode->commitIndex); + // restore log store on need if (syncNodeLogStoreRestoreOnNeed(pSyncNode) < 0) { + sError("vgId:%d, failed to restore log store since %s.", pSyncNode->vgId, terrstr()); goto _error; } + // timer ms init pSyncNode->pingBaseLine = PING_TIMER_MS; pSyncNode->electBaseLine = tsElectInterval; @@ -1096,10 +1027,16 @@ SSyncNode* syncNodeOpen(SSyncInfo* pSyncInfo) { pSyncNode->changing = false; // replication mgr - syncNodeLogReplMgrInit(pSyncNode); + if (syncNodeLogReplMgrInit(pSyncNode) < 0) { + sError("vgId:%d, failed to init repl mgr since %s.", pSyncNode->vgId, terrstr()); + goto _error; + } // peer state - syncNodePeerStateInit(pSyncNode); + if (syncNodePeerStateInit(pSyncNode) < 0) { + sError("vgId:%d, failed to init peer stat since %s.", pSyncNode->vgId, terrstr()); + goto _error; + } // // min match index @@ -1194,27 +1131,10 @@ int32_t syncNodeStart(SSyncNode* pSyncNode) { int32_t ret = 0; ret = syncNodeStartPingTimer(pSyncNode); - ASSERT(ret == 0); - return ret; -} - -void syncNodeStartOld(SSyncNode* pSyncNode) { - // start raft - if (pSyncNode->replicaNum == 1) { - raftStoreNextTerm(pSyncNode); - syncNodeBecomeLeader(pSyncNode, "one replica start"); - - // Raft 3.6.2 Committing entries from previous terms - syncNodeAppendNoop(pSyncNode); - syncMaybeAdvanceCommitIndex(pSyncNode); - - } else { - syncNodeBecomeFollower(pSyncNode, "first start"); + if (ret != 0) { + sError("vgId:%d, failed to start ping timer since %s", pSyncNode->vgId, terrstr()); } - - int32_t ret = 0; - ret = syncNodeStartPingTimer(pSyncNode); - ASSERT(ret == 0); + return ret; } int32_t syncNodeStartStandBy(SSyncNode* pSyncNode) { @@ -1225,11 +1145,16 @@ int32_t syncNodeStartStandBy(SSyncNode* pSyncNode) { // reset elect timer, long enough int32_t electMS = TIMER_MAX_MS; int32_t ret = syncNodeRestartElectTimer(pSyncNode, electMS); - ASSERT(ret == 0); + if (ret < 0) { + sError("vgId:%d, failed to restart elect timer since %s", pSyncNode->vgId, terrstr()); + return -1; + } - ret = 0; ret = syncNodeStartPingTimer(pSyncNode); - ASSERT(ret == 0); + if (ret < 0) { + sError("vgId:%d, failed to start ping timer since %s", pSyncNode->vgId, terrstr()); + return -1; + } return ret; } @@ -1818,12 +1743,6 @@ void syncNodeBecomeLeader(SSyncNode* pSyncNode, const char* debugStr) { pSyncNode->leaderCache = pSyncNode->myRaftId; for (int32_t i = 0; i < pSyncNode->pNextIndex->replicaNum; ++i) { - // maybe overwrite myself, no harm - // just do it! - - // pSyncNode->pNextIndex->index[i] = pSyncNode->pLogStore->getLastIndex(pSyncNode->pLogStore) + 1; - - // maybe wal is deleted SyncIndex lastIndex; SyncTerm lastTerm; int32_t code = syncNodeGetLastIndexTerm(pSyncNode, &lastIndex, &lastTerm); @@ -1885,7 +1804,11 @@ void syncNodeBecomeLeader(SSyncNode* pSyncNode, const char* debugStr) { void syncNodeCandidate2Leader(SSyncNode* pSyncNode) { ASSERT(pSyncNode->state == TAOS_SYNC_STATE_CANDIDATE); - ASSERT(voteGrantedMajority(pSyncNode->pVotesGranted)); + bool granted = voteGrantedMajority(pSyncNode->pVotesGranted); + if (!granted) { + sError("vgId:%d, not granted by majority.", pSyncNode->vgId); + return; + } syncNodeBecomeLeader(pSyncNode, "candidate to leader"); sNTrace(pSyncNode, "state change syncNodeCandidate2Leader"); @@ -1901,20 +1824,6 @@ void syncNodeCandidate2Leader(SSyncNode* pSyncNode) { pSyncNode->vgId, pSyncNode->raftStore.currentTerm, pSyncNode->commitIndex, lastIndex); } -void syncNodeCandidate2LeaderOld(SSyncNode* pSyncNode) { - ASSERT(pSyncNode->state == TAOS_SYNC_STATE_CANDIDATE); - ASSERT(voteGrantedMajority(pSyncNode->pVotesGranted)); - syncNodeBecomeLeader(pSyncNode, "candidate to leader"); - - // Raft 3.6.2 Committing entries from previous terms - syncNodeAppendNoop(pSyncNode); - syncMaybeAdvanceCommitIndex(pSyncNode); - - if (pSyncNode->replicaNum > 1) { - syncNodeReplicate(pSyncNode); - } -} - bool syncNodeIsMnode(SSyncNode* pSyncNode) { return (pSyncNode->vgId == 1); } int32_t syncNodePeerStateInit(SSyncNode* pSyncNode) { @@ -1960,7 +1869,8 @@ void syncNodeCandidate2Follower(SSyncNode* pSyncNode) { // need assert void syncNodeVoteForTerm(SSyncNode* pSyncNode, SyncTerm term, SRaftId* pRaftId) { ASSERT(term == pSyncNode->raftStore.currentTerm); - ASSERT(!raftStoreHasVoted(pSyncNode)); + bool voted = raftStoreHasVoted(pSyncNode); + ASSERT(!voted); raftStoreVote(pSyncNode, pRaftId); } @@ -2638,24 +2548,6 @@ int32_t syncNodeOnLocalCmd(SSyncNode* ths, const SRpcMsg* pRpcMsg) { return 0; } -int32_t syncNodeOnLocalCmdOld(SSyncNode* ths, const SRpcMsg* pRpcMsg) { - ASSERT(false && "deprecated"); - SyncLocalCmd* pMsg = pRpcMsg->pCont; - syncLogRecvLocalCmd(ths, pMsg, ""); - - if (pMsg->cmd == SYNC_LOCAL_CMD_STEP_DOWN) { - syncNodeStepDown(ths, pMsg->currentTerm); - - } else if (pMsg->cmd == SYNC_LOCAL_CMD_FOLLOWER_CMT) { - syncNodeFollowerCommit(ths, pMsg->commitIndex); - - } else { - sError("error local cmd"); - } - - return 0; -} - // TLA+ Spec // ClientRequest(i, v) == // /\ state[i] = Leader @@ -2700,96 +2592,6 @@ int32_t syncNodeOnClientRequest(SSyncNode* ths, SRpcMsg* pMsg, SyncIndex* pRetIn } } -int32_t syncNodeOnClientRequestOld(SSyncNode* ths, SRpcMsg* pMsg, SyncIndex* pRetIndex) { - sNTrace(ths, "on client request"); - - int32_t ret = 0; - int32_t code = 0; - - SyncIndex index = ths->pLogStore->syncLogWriteIndex(ths->pLogStore); - SyncTerm term = ths->raftStore.currentTerm; - SSyncRaftEntry* pEntry; - - if (pMsg->msgType == TDMT_SYNC_CLIENT_REQUEST) { - pEntry = syncEntryBuildFromClientRequest(pMsg->pCont, term, index); - } else { - pEntry = syncEntryBuildFromRpcMsg(pMsg, term, index); - } - - LRUHandle* h = NULL; - - if (ths->state == TAOS_SYNC_STATE_LEADER) { - // append entry - code = ths->pLogStore->syncLogAppendEntry(ths->pLogStore, pEntry, false); - if (code != 0) { - if (ths->replicaNum == 1) { - if (h) { - taosLRUCacheRelease(ths->pLogStore->pCache, h, false); - } else { - syncEntryDestroy(pEntry); - } - - return -1; - - } else { - // del resp mgr, call FpCommitCb - SFsmCbMeta cbMeta = { - .index = pEntry->index, - .lastConfigIndex = SYNC_INDEX_INVALID, - .isWeak = pEntry->isWeak, - .code = -1, - .state = ths->state, - .seqNum = pEntry->seqNum, - .term = pEntry->term, - .currentTerm = ths->raftStore.currentTerm, - .flag = 0, - }; - ths->pFsm->FpCommitCb(ths->pFsm, pMsg, &cbMeta); - - if (h) { - taosLRUCacheRelease(ths->pLogStore->pCache, h, false); - } else { - syncEntryDestroy(pEntry); - } - - return -1; - } - } - - syncCacheEntry(ths->pLogStore, pEntry, &h); - - // if mulit replica, start replicate right now - if (ths->replicaNum > 1) { - syncNodeReplicate(ths); - } - - // if only myself, maybe commit right now - if (ths->replicaNum == 1) { - if (syncNodeIsMnode(ths)) { - syncMaybeAdvanceCommitIndex(ths); - } else { - syncOneReplicaAdvance(ths); - } - } - } - - if (pRetIndex != NULL) { - if (ret == 0 && pEntry != NULL) { - *pRetIndex = pEntry->index; - } else { - *pRetIndex = SYNC_INDEX_INVALID; - } - } - - if (h) { - taosLRUCacheRelease(ths->pLogStore->pCache, h, false); - } else { - syncEntryDestroy(pEntry); - } - - return ret; -} - const char* syncStr(ESyncState state) { switch (state) { case TAOS_SYNC_STATE_FOLLOWER: @@ -2894,129 +2696,6 @@ bool syncNodeIsOptimizedOneReplica(SSyncNode* ths, SRpcMsg* pMsg) { return (ths->replicaNum == 1 && syncUtilUserCommit(pMsg->msgType) && ths->vgId != 1); } -int32_t syncNodeDoCommit(SSyncNode* ths, SyncIndex beginIndex, SyncIndex endIndex, uint64_t flag) { - ASSERT(false); - if (beginIndex > endIndex) { - return 0; - } - - if (ths == NULL) { - return -1; - } - - if (ths->pFsm != NULL && ths->pFsm->FpGetSnapshotInfo != NULL) { - // advance commit index to sanpshot first - SSnapshot snapshot = {0}; - ths->pFsm->FpGetSnapshotInfo(ths->pFsm, &snapshot); - if (snapshot.lastApplyIndex >= 0 && snapshot.lastApplyIndex >= beginIndex) { - sNTrace(ths, "commit by snapshot from index:%" PRId64 " to index:%" PRId64, beginIndex, snapshot.lastApplyIndex); - - // update begin index - beginIndex = snapshot.lastApplyIndex + 1; - } - } - - int32_t code = 0; - ESyncState state = flag; - - sNTrace(ths, "commit by wal from index:%" PRId64 " to index:%" PRId64, beginIndex, endIndex); - - // execute fsm - if (ths->pFsm != NULL) { - for (SyncIndex i = beginIndex; i <= endIndex; ++i) { - if (i != SYNC_INDEX_INVALID) { - SSyncRaftEntry* pEntry; - SLRUCache* pCache = ths->pLogStore->pCache; - LRUHandle* h = taosLRUCacheLookup(pCache, &i, sizeof(i)); - if (h) { - pEntry = (SSyncRaftEntry*)taosLRUCacheValue(pCache, h); - - ths->pLogStore->cacheHit++; - sNTrace(ths, "hit cache index:%" PRId64 ", bytes:%u, %p", i, pEntry->bytes, pEntry); - - } else { - ths->pLogStore->cacheMiss++; - sNTrace(ths, "miss cache index:%" PRId64, i); - - code = ths->pLogStore->syncLogGetEntry(ths->pLogStore, i, &pEntry); - // ASSERT(code == 0); - // ASSERT(pEntry != NULL); - if (code != 0 || pEntry == NULL) { - sNError(ths, "get log entry error"); - sFatal("vgId:%d, get log entry %" PRId64 " error when commit since %s", ths->vgId, i, terrstr()); - continue; - } - } - - SRpcMsg rpcMsg = {0}; - syncEntry2OriginalRpc(pEntry, &rpcMsg); - - sTrace("do commit index:%" PRId64 ", type:%s", i, TMSG_INFO(pEntry->msgType)); - - // user commit - if ((ths->pFsm->FpCommitCb != NULL) && syncUtilUserCommit(pEntry->originalRpcType)) { - bool internalExecute = true; - if ((ths->replicaNum == 1) && ths->restoreFinish && ths->vgId != 1) { - internalExecute = false; - } - - sNTrace(ths, "user commit index:%" PRId64 ", internal:%d, type:%s", i, internalExecute, - TMSG_INFO(pEntry->msgType)); - - // execute fsm in apply thread, or execute outside syncPropose - if (internalExecute) { - SFsmCbMeta cbMeta = { - .index = pEntry->index, - .lastConfigIndex = syncNodeGetSnapshotConfigIndex(ths, pEntry->index), - .isWeak = pEntry->isWeak, - .code = 0, - .state = ths->state, - .seqNum = pEntry->seqNum, - .term = pEntry->term, - .currentTerm = ths->raftStore.currentTerm, - .flag = flag, - }; - - syncRespMgrGetAndDel(ths->pSyncRespMgr, cbMeta.seqNum, &rpcMsg.info); - ths->pFsm->FpCommitCb(ths->pFsm, &rpcMsg, &cbMeta); - } - } - -#if 0 - // execute in pre-commit - // leader transfer - if (pEntry->originalRpcType == TDMT_SYNC_LEADER_TRANSFER) { - code = syncDoLeaderTransfer(ths, &rpcMsg, pEntry); - ASSERT(code == 0); - } -#endif - - // restore finish - // if only snapshot, a noop entry will be append, so syncLogLastIndex is always ok - if (pEntry->index == ths->pLogStore->syncLogLastIndex(ths->pLogStore)) { - if (ths->restoreFinish == false) { - if (ths->pFsm->FpRestoreFinishCb != NULL) { - ths->pFsm->FpRestoreFinishCb(ths->pFsm); - } - ths->restoreFinish = true; - - int64_t restoreDelay = taosGetTimestampMs() - ths->leaderTime; - sNTrace(ths, "restore finish, index:%" PRId64 ", elapsed:%" PRId64 " ms", pEntry->index, restoreDelay); - } - } - - rpcFreeCont(rpcMsg.pCont); - if (h) { - taosLRUCacheRelease(pCache, h, false); - } else { - syncEntryDestroy(pEntry); - } - } - } - } - return 0; -} - bool syncNodeInRaftGroup(SSyncNode* ths, SRaftId* pRaftId) { for (int32_t i = 0; i < ths->replicaNum; ++i) { if (syncUtilSameId(&((ths->replicasId)[i]), pRaftId)) { diff --git a/source/libs/sync/src/syncPipeline.c b/source/libs/sync/src/syncPipeline.c index 1d2b0b57fed915ff5a2731f936b91a72140c6142..6cc517fda00c67d6d89dcb2b49d40049a05ea18e 100644 --- a/source/libs/sync/src/syncPipeline.c +++ b/source/libs/sync/src/syncPipeline.c @@ -945,8 +945,11 @@ int32_t syncNodeLogReplMgrInit(SSyncNode* pNode) { for (int i = 0; i < TSDB_MAX_REPLICA; i++) { ASSERT(pNode->logReplMgrs[i] == NULL); pNode->logReplMgrs[i] = syncLogReplMgrCreate(); + if (pNode->logReplMgrs[i] == NULL) { + terrno = TSDB_CODE_OUT_OF_MEMORY; + return -1; + } pNode->logReplMgrs[i]->peerId = i; - ASSERTS(pNode->logReplMgrs[i] != NULL, "Out of memory."); } return 0; } diff --git a/source/libs/sync/src/syncReplication.c b/source/libs/sync/src/syncReplication.c index 1aa476e84e03ab46a925085ff7792ca88b0af5b4..3df203221b88bbd0b9d804613e2ea50e881be149 100644 --- a/source/libs/sync/src/syncReplication.c +++ b/source/libs/sync/src/syncReplication.c @@ -48,92 +48,6 @@ int32_t syncNodeMaybeSendAppendEntries(SSyncNode* pSyncNode, const SRaftId* destRaftId, SRpcMsg* pRpcMsg); -int32_t syncNodeReplicateOne(SSyncNode* pSyncNode, SRaftId* pDestId, bool snapshot) { - ASSERT(false && "deprecated"); - // next index - SyncIndex nextIndex = syncIndexMgrGetIndex(pSyncNode->pNextIndex, pDestId); - - if (snapshot) { - // maybe start snapshot - SyncIndex logStartIndex = pSyncNode->pLogStore->syncLogBeginIndex(pSyncNode->pLogStore); - SyncIndex logEndIndex = pSyncNode->pLogStore->syncLogEndIndex(pSyncNode->pLogStore); - if (nextIndex < logStartIndex || nextIndex - 1 > logEndIndex) { - sNTrace(pSyncNode, "maybe start snapshot for next-index:%" PRId64 ", start:%" PRId64 ", end:%" PRId64, nextIndex, - logStartIndex, logEndIndex); - // start snapshot - int32_t code = syncNodeStartSnapshot(pSyncNode, pDestId); - } - } - - // pre index, pre term - SyncIndex preLogIndex = syncNodeGetPreIndex(pSyncNode, nextIndex); - SyncTerm preLogTerm = syncNodeGetPreTerm(pSyncNode, nextIndex); - - // prepare entry - SRpcMsg rpcMsg = {0}; - SyncAppendEntries* pMsg = NULL; - - SSyncRaftEntry* pEntry = NULL; - SLRUCache* pCache = pSyncNode->pLogStore->pCache; - LRUHandle* h = taosLRUCacheLookup(pCache, &nextIndex, sizeof(nextIndex)); - int32_t code = 0; - if (h) { - pEntry = (SSyncRaftEntry*)taosLRUCacheValue(pCache, h); - code = 0; - - pSyncNode->pLogStore->cacheHit++; - sNTrace(pSyncNode, "hit cache index:%" PRId64 ", bytes:%u, %p", nextIndex, pEntry->bytes, pEntry); - - } else { - pSyncNode->pLogStore->cacheMiss++; - sNTrace(pSyncNode, "miss cache index:%" PRId64, nextIndex); - - code = pSyncNode->pLogStore->syncLogGetEntry(pSyncNode->pLogStore, nextIndex, &pEntry); - } - - if (code == 0) { - ASSERT(pEntry != NULL); - - code = syncBuildAppendEntries(&rpcMsg, (int32_t)(pEntry->bytes), pSyncNode->vgId); - ASSERT(code == 0); - - pMsg = rpcMsg.pCont; - memcpy(pMsg->data, pEntry, pEntry->bytes); - } else { - if (terrno == TSDB_CODE_WAL_LOG_NOT_EXIST) { - // no entry in log - code = syncBuildAppendEntries(&rpcMsg, 0, pSyncNode->vgId); - ASSERT(code == 0); - - pMsg = rpcMsg.pCont; - } else { - sNError(pSyncNode, "replicate to dnode:%d error, next-index:%" PRId64, DID(pDestId), nextIndex); - return -1; - } - } - - if (h) { - taosLRUCacheRelease(pCache, h, false); - } else { - syncEntryDestroy(pEntry); - } - - // prepare msg - ASSERT(pMsg != NULL); - pMsg->srcId = pSyncNode->myRaftId; - pMsg->destId = *pDestId; - pMsg->term = pSyncNode->raftStore.currentTerm; - pMsg->prevLogIndex = preLogIndex; - pMsg->prevLogTerm = preLogTerm; - pMsg->commitIndex = pSyncNode->commitIndex; - pMsg->privateTerm = 0; - // pMsg->privateTerm = syncIndexMgrGetTerm(pSyncNode->pNextIndex, pDestId); - - // send msg - syncNodeMaybeSendAppendEntries(pSyncNode, pDestId, &rpcMsg); - return 0; -} - int32_t syncNodeReplicate(SSyncNode* pNode) { SSyncLogBuffer* pBuf = pNode->pLogBuf; taosThreadMutexLock(&pBuf->mutex); @@ -156,25 +70,6 @@ int32_t syncNodeReplicateWithoutLock(SSyncNode* pNode) { return 0; } -int32_t syncNodeReplicateOld(SSyncNode* pSyncNode) { - if (pSyncNode->state != TAOS_SYNC_STATE_LEADER) { - return -1; - } - - sNTrace(pSyncNode, "do replicate"); - - int32_t ret = 0; - for (int i = 0; i < pSyncNode->peersNum; ++i) { - SRaftId* pDestId = &(pSyncNode->peersId[i]); - ret = syncNodeReplicateOne(pSyncNode, pDestId, true); - if (ret != 0) { - sError("vgId:%d, do append entries error for dnode:%d", pSyncNode->vgId, DID(pDestId)); - } - } - - return 0; -} - int32_t syncNodeSendAppendEntries(SSyncNode* pSyncNode, const SRaftId* destRaftId, SRpcMsg* pRpcMsg) { SyncAppendEntries* pMsg = pRpcMsg->pCont; pMsg->destId = *destRaftId; @@ -182,39 +77,6 @@ int32_t syncNodeSendAppendEntries(SSyncNode* pSyncNode, const SRaftId* destRaftI return 0; } -int32_t syncNodeSendAppendEntriesOld(SSyncNode* pSyncNode, const SRaftId* destRaftId, SRpcMsg* pRpcMsg) { - int32_t ret = 0; - SyncAppendEntries* pMsg = pRpcMsg->pCont; - if (pMsg == NULL) { - sError("vgId:%d, sync-append-entries msg is NULL", pSyncNode->vgId); - return 0; - } - - SPeerState* pState = syncNodeGetPeerState(pSyncNode, destRaftId); - if (pState == NULL) { - sError("vgId:%d, replica maybe dropped", pSyncNode->vgId); - return 0; - } - - // save index, otherwise pMsg will be free by rpc - SyncIndex saveLastSendIndex = pState->lastSendIndex; - bool update = false; - if (pMsg->dataLen > 0) { - saveLastSendIndex = pMsg->prevLogIndex + 1; - update = true; - } - - syncLogSendAppendEntries(pSyncNode, pMsg, ""); - syncNodeSendMsgById(destRaftId, pSyncNode, pRpcMsg); - - if (update) { - pState->lastSendIndex = saveLastSendIndex; - pState->lastSendTime = taosGetTimestampMs(); - } - - return ret; -} - int32_t syncNodeMaybeSendAppendEntries(SSyncNode* pSyncNode, const SRaftId* destRaftId, SRpcMsg* pRpcMsg) { int32_t ret = 0; SyncAppendEntries* pMsg = pRpcMsg->pCont; diff --git a/source/libs/wal/src/walMeta.c b/source/libs/wal/src/walMeta.c index 44e88a4dcc425a728789a5e95b9fbf19bcaae415..a5473789670f47ca9e562f624a1d6c2dd6bd350a 100644 --- a/source/libs/wal/src/walMeta.c +++ b/source/libs/wal/src/walMeta.c @@ -325,6 +325,35 @@ bool walLogEntriesComplete(const SWal* pWal) { return complete; } +int walTrimIdxFile(SWal* pWal, int32_t fileIdx) { + SWalFileInfo* pFileInfo = taosArrayGet(pWal->fileInfoSet, fileIdx); + ASSERT(pFileInfo != NULL); + char fnameStr[WAL_FILE_LEN]; + walBuildIdxName(pWal, pFileInfo->firstVer, fnameStr); + + int64_t fileSize = 0; + taosStatFile(fnameStr, &fileSize, NULL); + int64_t records = TMAX(0, pFileInfo->lastVer - pFileInfo->firstVer + 1); + int64_t lastEndOffset = records * sizeof(SWalIdxEntry); + + if (fileSize <= lastEndOffset) { + return 0; + } + + TdFilePtr pFile = taosOpenFile(fnameStr, TD_FILE_READ | TD_FILE_WRITE); + if (pFile == NULL) { + terrno = TAOS_SYSTEM_ERROR(errno); + return -1; + } + + wInfo("vgId:%d, trim idx file. file: %s, size: %" PRId64 ", offset: %" PRId64, pWal->cfg.vgId, fnameStr, fileSize, + lastEndOffset); + + taosFtruncateFile(pFile, lastEndOffset); + taosCloseFile(&pFile); + return 0; +} + int walCheckAndRepairMeta(SWal* pWal) { // load log files, get first/snapshot/last version info const char* logPattern = "^[0-9]+.log$"; @@ -402,6 +431,8 @@ int walCheckAndRepairMeta(SWal* pWal) { } updateMeta = true; + (void)walTrimIdxFile(pWal, fileIdx); + int64_t lastVer = walScanLogGetLastVer(pWal, fileIdx); if (lastVer < 0) { if (terrno != TSDB_CODE_WAL_LOG_NOT_EXIST) { @@ -567,6 +598,7 @@ int walCheckAndRepairIdxFile(SWal* pWal, int32_t fileIdx) { goto _err; } + int64_t count = 0; while (idxEntry.ver < pFileInfo->lastVer) { ASSERT(idxEntry.ver == ckHead.head.version); @@ -578,11 +610,11 @@ int walCheckAndRepairIdxFile(SWal* pWal, int32_t fileIdx) { idxEntry.offset, fLogNameStr); goto _err; } - wWarn("vgId:%d, wal idx append new entry %" PRId64 " %" PRId64, pWal->cfg.vgId, idxEntry.ver, idxEntry.offset); if (taosWriteFile(pIdxFile, &idxEntry, sizeof(SWalIdxEntry)) < 0) { wError("vgId:%d, failed to append file since %s. file:%s", pWal->cfg.vgId, terrstr(), fnameStr); goto _err; } + count++; } if (taosFsyncFile(pIdxFile) < 0) { @@ -590,6 +622,11 @@ int walCheckAndRepairIdxFile(SWal* pWal, int32_t fileIdx) { goto _err; } + if (count > 0) { + wInfo("vgId:%d, rebuilt %" PRId64 " wal idx entries until lastVer: %" PRId64, pWal->cfg.vgId, count, + pFileInfo->lastVer); + } + (void)taosCloseFile(&pLogFile); (void)taosCloseFile(&pIdxFile); return 0; diff --git a/source/libs/wal/src/walRef.c b/source/libs/wal/src/walRef.c index e86111109ce4fc9768be8b662670ae711f935e39..43470f4c82ab1cb35d37ae9ace9a78b938e3cbd9 100644 --- a/source/libs/wal/src/walRef.c +++ b/source/libs/wal/src/walRef.c @@ -77,6 +77,31 @@ void walUnrefVer(SWalRef *pRef) { } #endif +SWalRef *walRefFirstVer(SWal *pWal, SWalRef *pRef) { + if (pRef == NULL) { + pRef = walOpenRef(pWal); + if (pRef == NULL) { + return NULL; + } + } + taosThreadMutexLock(&pWal->mutex); + + int64_t ver = walGetFirstVer(pWal); + + wDebug("vgId:%d, wal ref version %" PRId64 " for first", pWal->cfg.vgId, ver); + + pRef->refVer = ver; + // bsearch in fileSet + SWalFileInfo tmpInfo; + tmpInfo.firstVer = ver; + SWalFileInfo *pRet = taosArraySearch(pWal->fileInfoSet, &tmpInfo, compareWalFileInfo, TD_LE); + ASSERT(pRet != NULL); + pRef->refFile = pRet->firstVer; + + taosThreadMutexUnlock(&pWal->mutex); + return pRef; +} + SWalRef *walRefCommittedVer(SWal *pWal) { SWalRef *pRef = walOpenRef(pWal); if (pRef == NULL) { @@ -86,6 +111,8 @@ SWalRef *walRefCommittedVer(SWal *pWal) { int64_t ver = walGetCommittedVer(pWal); + wDebug("vgId:%d, wal ref version %" PRId64 " for committed", pWal->cfg.vgId, ver); + pRef->refVer = ver; // bsearch in fileSet SWalFileInfo tmpInfo; diff --git a/source/os/src/osMath.c b/source/os/src/osMath.c index dddadd5ff6c187621a1b37efb24882ef84448cb9..41c8c9257a95fb57213c33042ac7c1339295553b 100644 --- a/source/os/src/osMath.c +++ b/source/os/src/osMath.c @@ -32,7 +32,18 @@ void swapStr(char* j, char* J, int width) { } #endif +int qsortHelper(const void* p1, const void* p2, const void* param) { + __compar_fn_t comparFn = param; + return comparFn(p1, p2); +} + // todo refactor: 1) move away; 2) use merge sort instead; 3) qsort is not a stable sort actually. -void taosSort(void* arr, int64_t sz, int64_t width, __compar_fn_t compar) { - qsort(arr, sz, width, compar); +void taosSort(void* base, int64_t sz, int64_t width, __compar_fn_t compar) { +#ifdef _ALPINE + void* param = compar; + taosqsort(base, width, sz, param, qsortHelper); +#else + qsort(base, sz, width, compar); +#endif } + diff --git a/source/os/src/osTime.c b/source/os/src/osTime.c index cd4324a5928e11497d3f98276a8cff800e369395..685693a709f4dd14efc3ffd5f277946a3293b38a 100644 --- a/source/os/src/osTime.c +++ b/source/os/src/osTime.c @@ -33,6 +33,11 @@ #include //#define TM_YEAR_BASE 1970 //origin #define TM_YEAR_BASE 1900 // slguan + +// This magic number is the number of 100 nanosecond intervals since January 1, 1601 (UTC) +// until 00:00:00 January 1, 1970 +static const uint64_t TIMEEPOCH = ((uint64_t)116444736000000000ULL); + /* * We do not implement alternate representations. However, we always * check whether a given modifier is allowed for a certain conversion. @@ -341,15 +346,17 @@ char *taosStrpTime(const char *buf, const char *fmt, struct tm *tm) { int32_t taosGetTimeOfDay(struct timeval *tv) { #ifdef WINDOWS - time_t t; - t = taosGetTimestampSec(); - SYSTEMTIME st; - GetLocalTime(&st); + LARGE_INTEGER t; + FILETIME f; - tv->tv_sec = (long)t; - tv->tv_usec = st.wMilliseconds * 1000; + GetSystemTimeAsFileTime(&f); + t.QuadPart = f.dwHighDateTime; + t.QuadPart <<= 32; + t.QuadPart |= f.dwLowDateTime; - return 0; + t.QuadPart -= TIMEEPOCH; + tv->tv_sec = t.QuadPart / 10000000; + tv->tv_usec = (t.QuadPart % 10000000) / 10; #else return gettimeofday(tv, NULL); #endif @@ -550,37 +557,13 @@ int32_t taosClockGetTime(int clock_id, struct timespec *pTS) { #ifdef WINDOWS LARGE_INTEGER t; FILETIME f; - static FILETIME ff; - static SYSTEMTIME ss; - static LARGE_INTEGER offset; - - static int8_t offsetInit = 0; - static volatile bool offsetInitFinished = false; - int8_t old = atomic_val_compare_exchange_8(&offsetInit, 0, 1); - if (0 == old) { - ss.wYear = 1970; - ss.wMonth = 1; - ss.wDay = 1; - ss.wHour = 0; - ss.wMinute = 0; - ss.wSecond = 0; - ss.wMilliseconds = 0; - SystemTimeToFileTime(&ss, &ff); - offset.QuadPart = ff.dwHighDateTime; - offset.QuadPart <<= 32; - offset.QuadPart |= ff.dwLowDateTime; - offsetInitFinished = true; - } else { - while (!offsetInitFinished) - ; // Ensure initialization is completed. - } GetSystemTimeAsFileTime(&f); t.QuadPart = f.dwHighDateTime; t.QuadPart <<= 32; t.QuadPart |= f.dwLowDateTime; - t.QuadPart -= offset.QuadPart; + t.QuadPart -= TIMEEPOCH; pTS->tv_sec = t.QuadPart / 10000000; pTS->tv_nsec = (t.QuadPart % 10000000) * 100; return (0); diff --git a/source/util/src/talgo.c b/source/util/src/talgo.c index d9319485b7c3bbed717c054f6d63f91ca2220063..a06aac6afe3a64dcb9e53a7580c797f4d90a06a9 100644 --- a/source/util/src/talgo.c +++ b/source/util/src/talgo.c @@ -41,12 +41,6 @@ static void median(void *src, int64_t size, int64_t s, int64_t e, const void *pa ASSERT(comparFn(elePtrAt(src, size, mid), elePtrAt(src, size, s), param) <= 0 && comparFn(elePtrAt(src, size, s), elePtrAt(src, size, e), param) <= 0); - -#ifdef _DEBUG_VIEW -// tTagsPrints(src[s], pOrderDesc->pColumnModel, &pOrderDesc->orderIdx); -// tTagsPrints(src[mid], pOrderDesc->pColumnModel, &pOrderDesc->orderIdx); -// tTagsPrints(src[e], pOrderDesc->pColumnModel, &pOrderDesc->orderIdx); -#endif } static void tInsertSort(void *src, int64_t size, int32_t s, int32_t e, const void *param, __ext_compar_fn_t comparFn, @@ -278,14 +272,4 @@ void taosheapsort(void *base, int32_t size, int32_t len, const void *parcompar, } taosMemoryFree(buf); - /* - char *buf = taosMemoryCalloc(1, size); - - for (i = len - 1; i > 0; i--) { - doswap(elePtrAt(base, size, 0), elePtrAt(base, size, i)); - taosheapadjust(base, size, 0, i - 1, parcompar, compar, parswap, swap, maxroot); - } - - taosMemoryFreeClear(buf); - */ } diff --git a/tests/docs-examples-test/python.sh b/tests/docs-examples-test/python.sh index 140d05395bdf6c32bbded25bb53ffaab523e3434..ccb391b7527fbf2490911d868d08c87436221162 100644 --- a/tests/docs-examples-test/python.sh +++ b/tests/docs-examples-test/python.sh @@ -23,7 +23,7 @@ python3 bind_param_example.py # 4 taos -s "drop database power" -python3 multi_bind_example.py +python3 multi_bind_example.py # 5 python3 query_example.py @@ -44,4 +44,43 @@ taos -s "drop database test" python3 json_protocol_example.py # 10 -# python3 subscribe_demo.py +pip install SQLAlchemy +pip install pandas +taosBenchmark -y -d power -t 10 -n 10 +python3 conn_native_pandas.py +python3 conn_rest_pandas.py +taos -s "drop database if exists power" + +# 11 +taos -s "create database if not exists test" +python3 connect_native_reference.py + +# 12 +python3 connect_rest_examples.py + +# 13 +python3 handle_exception.py + +# 14 +taosBenchmark -y -d power -t 2 -n 10 +python3 rest_client_example.py +taos -s "drop database if exists power" + +# 15 +python3 result_set_examples.py + +# 16 +python3 tmq_example.py + +# 17 +python3 sql_writer.py + +# 18 +python3 mockdatasource.py + +# 19 +python3 fast_write_example.py + +# 20 +pip3 install kafka-python +python3 kafka_example.py diff --git a/tests/parallel_test/container_build.sh b/tests/parallel_test/container_build.sh index 221e5490560252989b81264247ca2b8bf6eda7b1..ff854449bb913bea8bf5dd1b6477d7d9c7a7b70e 100755 --- a/tests/parallel_test/container_build.sh +++ b/tests/parallel_test/container_build.sh @@ -55,7 +55,7 @@ fi date docker run \ -v $REP_MOUNT_PARAM \ - --rm --ulimit core=-1 taos_test:v1.0 sh -c "cd $REP_DIR;rm -rf debug;mkdir -p debug;cd debug;cmake .. -DBUILD_HTTP=false -DBUILD_TOOLS=true -DBUILD_TEST=true -DWEBSOCKET=true;make -j || exit 1" + --rm --ulimit core=-1 taos_test:v1.0 sh -c "cd $REP_DIR;rm -rf debug;mkdir -p debug;cd debug;cmake .. -DBUILD_HTTP=false -DBUILD_TOOLS=true -DBUILD_TEST=true -DWEBSOCKET=true -DBUILD_TAOSX=true;make -j || exit 1" if [[ -d ${WORKDIR}/debugNoSan ]] ;then echo "delete ${WORKDIR}/debugNoSan" @@ -70,7 +70,7 @@ mv ${REP_REAL_PATH}/debug ${WORKDIR}/debugNoSan date docker run \ -v $REP_MOUNT_PARAM \ - --rm --ulimit core=-1 taos_test:v1.0 sh -c "cd $REP_DIR;rm -rf debug;mkdir -p debug;cd debug;cmake .. -DBUILD_HTTP=false -DBUILD_TOOLS=true -DBUILD_TEST=true -DWEBSOCKET=true -DBUILD_SANITIZER=1 -DTOOLS_SANITIZE=true -DTOOLS_BUILD_TYPE=Debug;make -j || exit 1 " + --rm --ulimit core=-1 taos_test:v1.0 sh -c "cd $REP_DIR;rm -rf debug;mkdir -p debug;cd debug;cmake .. -DBUILD_HTTP=false -DBUILD_TOOLS=true -DBUILD_TEST=true -DWEBSOCKET=true -DBUILD_SANITIZER=1 -DTOOLS_SANITIZE=true -DTOOLS_BUILD_TYPE=Debug -DBUILD_TAOSX=true;make -j || exit 1 " mv ${REP_REAL_PATH}/debug ${WORKDIR}/debugSan diff --git a/tests/script/tsim/db/alter_replica_13.sim b/tests/script/tsim/db/alter_replica_13.sim index d75acb50ad087383fd2e5aabadfb0e2c1165204e..a9dc1741a1681a25da42c842e5c82078bad40ae7 100644 --- a/tests/script/tsim/db/alter_replica_13.sim +++ b/tests/script/tsim/db/alter_replica_13.sim @@ -79,6 +79,7 @@ sql insert into db.ctb6 values(now, 6, "6") sql insert into db.ctb7 values(now, 7, "7") sql insert into db.ctb8 values(now, 8, "8") sql insert into db.ctb9 values(now, 9, "9") +sql flush database db; print =============== step3: create dnodes sql create dnode $hostname port 7300 diff --git a/tests/script/tsim/stream/basic1.sim b/tests/script/tsim/stream/basic1.sim index 7bf10df6372fac03bdfc2cae6a9eef2d0c01b6a9..c61c7667f8fef94ea84b7379a4a6fc12cc5b2141 100644 --- a/tests/script/tsim/stream/basic1.sim +++ b/tests/script/tsim/stream/basic1.sim @@ -834,4 +834,57 @@ endi print ====== test _wstart end +print insert into ts1 values(-1648791211000,1,2,3) + +sql create database test7 vgroups 1; +sql use test7; +sql create stable st(ts timestamp, a int, b int , c int) tags(ta int,tb int,tc int); +sql create table ts1 using st tags(1,1,1); +sql create stream streams7 trigger at_once into streamt7 as select _wstart, count(*) from ts1 interval(10s) ; + +sql insert into ts1 values(1648791211000,1,2,3); +sql_error insert into ts1 values(-1648791211000,1,2,3); + +loop18: + +sleep 200 +sql select * from streamt7; + +$loop_count = $loop_count + 1 +if $loop_count == 10 then + return -1 +endi + +if $rows != 1 then + print =====rows=$rows + goto loop18 +endi + +if $data01 != 1 then + print =====data01=$data01 + goto loop18 +endi + +sql_error insert into ts1 values(-1648791211001,1,2,3) (1648791211001,1,2,3); + +sql select _wstart, count(*) from ts1 interval(10s) ; + +print $data00 $data01 +print $data10 $data11 + +loop19: + +sleep 200 +sql select * from streamt7; + +$loop_count = $loop_count + 1 +if $loop_count == 10 then + return -1 +endi + +if $rows != 1 then + print =====rows=$rows + goto loop19 +endi + system sh/exec.sh -n dnode1 -s stop -x SIGINT diff --git a/tests/script/tsim/stream/triggerInterval0.sim b/tests/script/tsim/stream/triggerInterval0.sim index 7353f026bb01a824fe6935d4e44c870029555e97..b522dcf035f66c439e8c59a8190d9228ea8dca5c 100644 --- a/tests/script/tsim/stream/triggerInterval0.sim +++ b/tests/script/tsim/stream/triggerInterval0.sim @@ -29,69 +29,119 @@ sql insert into t1 values(1648791223001,2,2,3,1.1); sql insert into t1 values(1648791223002,2,2,3,1.1); sql insert into t1 values(1648791223003,2,2,3,1.1); sql insert into t1 values(1648791223001,2,2,3,1.1); + +print step 0 + +$loop_count = 0 + +loop0: sleep 300 + +$loop_count = $loop_count + 1 +if $loop_count == 10 then + return -1 +endi + sql select * from streamt; + if $rows != 1 then print ======$rows - return -1 + goto loop0 endi if $data01 != 1 then print ======$data01 - return -1 + goto loop0 endi sql insert into t1 values(1648791233001,2,2,3,1.1); + +print step 1 + +$loop_count = 0 + +loop1: sleep 300 + +$loop_count = $loop_count + 1 +if $loop_count == 10 then + return -1 +endi + sql select * from streamt; if $rows != 2 then print ======$rows - return -1 + goto loop1 endi if $data01 != 1 then print ======$data01 - return -1 + goto loop1 endi if $data11 != 3 then print ======$data11 - return -1 + goto loop1 endi sql insert into t1 values(1648791223004,2,2,3,1.1); sql insert into t1 values(1648791223004,2,2,3,1.1); sql insert into t1 values(1648791223005,2,2,3,1.1); + +print step 2 + +$loop_count = 0 + +loop2: sleep 300 + +$loop_count = $loop_count + 1 +if $loop_count == 10 then + return -1 +endi + sql select * from streamt; if $rows != 2 then print ======$rows - return -1 + goto loop2 endi + if $data01 != 1 then print ======$data01 - return -1 + goto loop2 endi if $data11 != 5 then print ======$data11 - return -1 + goto loop2 endi sql insert into t1 values(1648791233002,3,2,3,2.1); sql insert into t1 values(1648791213002,4,2,3,3.1) sql insert into t1 values(1648791213002,4,2,3,4.1); + +print step 3 + +$loop_count = 0 + +loop3: sleep 300 + +$loop_count = $loop_count + 1 +if $loop_count == 10 then + return -1 +endi + sql select * from streamt; if $rows != 2 then print ======$rows - return -1 + goto loop3 endi if $data01 != 2 then print ======$data01 - return -1 + goto loop3 endi if $data11 != 5 then print ======$data11 - return -1 + goto loop3 endi system sh/exec.sh -n dnode1 -s stop -x SIGINT \ No newline at end of file diff --git a/tests/system-test/7-tmq/tmqUpdate-1ctb.py b/tests/system-test/7-tmq/tmqUpdate-1ctb.py index b974e4a41a2c60d4c882b2006400f500ec799efc..db2ec3285dd87de532a29021142e4285c05718db 100644 --- a/tests/system-test/7-tmq/tmqUpdate-1ctb.py +++ b/tests/system-test/7-tmq/tmqUpdate-1ctb.py @@ -206,7 +206,7 @@ class TDTestCase: paraDict['rowsPerTbl'] = self.rowsPerTbl consumerId = 1 if self.snapshot == 0: - expectrowcnt = int(paraDict["rowsPerTbl"] * paraDict["ctbNum"] * (2)) + expectrowcnt = int(paraDict["rowsPerTbl"] * paraDict["ctbNum"] * (1/2)) elif self.snapshot == 1: expectrowcnt = int(paraDict["rowsPerTbl"] * paraDict["ctbNum"] * (1)) diff --git a/tests/system-test/7-tmq/tmqUpdate-multiCtb-snapshot0.py b/tests/system-test/7-tmq/tmqUpdate-multiCtb-snapshot0.py index d5df88cf43b1e207f3856807bb9b0bcf55b4b8c6..daffff44c1cf0dda7c4ecf5ac2dfd3dadfdf5504 100644 --- a/tests/system-test/7-tmq/tmqUpdate-multiCtb-snapshot0.py +++ b/tests/system-test/7-tmq/tmqUpdate-multiCtb-snapshot0.py @@ -213,9 +213,9 @@ class TDTestCase: paraDict['rowsPerTbl'] = self.rowsPerTbl consumerId = 1 if self.snapshot == 0: - expectrowcnt = int(paraDict["rowsPerTbl"] * paraDict["ctbNum"] * (2 + 1/2*1/2*2 + 1/2*1/2)) + expectrowcnt = int(paraDict["rowsPerTbl"] * paraDict["ctbNum"] * (1/2) * (1/2*3)) elif self.snapshot == 1: - expectrowcnt = int(paraDict["rowsPerTbl"] * paraDict["ctbNum"] * (2 + 1/2*1/2)) + expectrowcnt = int(paraDict["rowsPerTbl"] * paraDict["ctbNum"] * (1 + 1/2)) topicList = topicFromStb1 ifcheckdata = 1