Merge branch '3.0' into docs/daniel-3.0

fd90e9ce · wade zhang · GitHub · c5d41019 · 4bc8e086 · fd90e9ce
152 changed file
--- a/Jenkinsfile2
+++ b/Jenkinsfile2
@@ -43,6 +43,7 @@ def pre_test(){
        cd ${WKC}
        git reset --hard
        git clean -fxd
+        rm -rf examples/rust/
        git remote prune origin
        git fetch
    '''

--- a/cmake/cmake.install
+++ b/cmake/cmake.install
-IF (EXISTS /var/lib/taos/dnode/dnodeCfg.json)
+IF (TD_LINUX)
-  INSTALL(CODE "MESSAGE(\"The default data directory /var/lib/taos contains old data of tdengine 2.x, please clear it before installing!\")")
-ELSEIF (EXISTS C:/TDengine/data/dnode/dnodeCfg.json)
-  INSTALL(CODE "MESSAGE(\"The default data directory C:/TDengine/data contains old data of tdengine 2.x, please clear it before installing!\")")
-ELSEIF (TD_LINUX)
  SET(TD_MAKE_INSTALL_SH "${TD_SOURCE_DIR}/packaging/tools/make_install.sh")
  INSTALL(CODE "MESSAGE(\"make install script: ${TD_MAKE_INSTALL_SH}\")")
  INSTALL(CODE "execute_process(COMMAND bash ${TD_MAKE_INSTALL_SH} ${TD_SOURCE_DIR} ${PROJECT_BINARY_DIR} Linux ${TD_VER_NUMBER})")
 ELSEIF (TD_WINDOWS)
-  SET(CMAKE_INSTALL_PREFIX C:/TDengine)
-  # INSTALL(DIRECTORY ${TD_SOURCE_DIR}/src/connector/go     DESTINATION connector)
-  # INSTALL(DIRECTORY ${TD_SOURCE_DIR}/src/connector/nodejs DESTINATION connector)
-  # INSTALL(DIRECTORY ${TD_SOURCE_DIR}/src/connector/python DESTINATION connector)
-  # INSTALL(DIRECTORY ${TD_SOURCE_DIR}/src/connector/C\# DESTINATION connector)
-  # INSTALL(DIRECTORY ${TD_SOURCE_DIR}/examples DESTINATION .)
-  INSTALL(CODE "IF (NOT EXISTS ${CMAKE_INSTALL_PREFIX}/cfg/taos.cfg)
-    execute_process(COMMAND ${CMAKE_COMMAND} -E copy ${TD_SOURCE_DIR}/packaging/cfg/taos.cfg ${CMAKE_INSTALL_PREFIX}/cfg/taos.cfg)
-  ENDIF ()")
-  INSTALL(FILES ${TD_SOURCE_DIR}/include/client/taos.h DESTINATION include)
-  INSTALL(FILES ${TD_SOURCE_DIR}/include/util/taoserror.h DESTINATION include)
-  INSTALL(FILES ${TD_SOURCE_DIR}/include/libs/function/taosudf.h DESTINATION include)
-  INSTALL(FILES ${LIBRARY_OUTPUT_PATH}/taos.lib DESTINATION driver)
-  INSTALL(FILES ${LIBRARY_OUTPUT_PATH}/taos_static.lib DESTINATION driver)
-  INSTALL(FILES ${LIBRARY_OUTPUT_PATH}/taos.dll DESTINATION driver)
-  INSTALL(FILES ${EXECUTABLE_OUTPUT_PATH}/taos.exe DESTINATION .)
-  INSTALL(FILES ${EXECUTABLE_OUTPUT_PATH}/taosd.exe DESTINATION .)
-  INSTALL(FILES ${EXECUTABLE_OUTPUT_PATH}/udfd.exe DESTINATION .)
-  IF (BUILD_TOOLS)
-    INSTALL(FILES ${EXECUTABLE_OUTPUT_PATH}/taosBenchmark.exe DESTINATION .)
-  ENDIF ()
-  IF (TD_MVN_INSTALLED)
-    INSTALL(FILES ${LIBRARY_OUTPUT_PATH}/taos-jdbcdriver-2.0.38-dist.jar DESTINATION connector/jdbc)
-  ENDIF ()
  SET(TD_MAKE_INSTALL_SH "${TD_SOURCE_DIR}/packaging/tools/make_install.bat")
  INSTALL(CODE "MESSAGE(\"make install script: ${TD_MAKE_INSTALL_SH}\")")
  INSTALL(CODE "execute_process(COMMAND ${TD_MAKE_INSTALL_SH} :needAdmin ${TD_SOURCE_DIR} ${PROJECT_BINARY_DIR} Windows ${TD_VER_NUMBER})")

--- a/cmake/taosadapter_CMakeLists.txt.in
+++ b/cmake/taosadapter_CMakeLists.txt.in
@@ -2,7 +2,7 @@
 # taosadapter
 ExternalProject_Add(taosadapter
        GIT_REPOSITORY https://github.com/taosdata/taosadapter.git
-        GIT_TAG 3d21433
+        GIT_TAG abed566
        SOURCE_DIR "${TD_SOURCE_DIR}/tools/taosadapter"
        BINARY_DIR ""
        #BUILD_IN_SOURCE TRUE

--- a/contrib/CMakeLists.txt
+++ b/contrib/CMakeLists.txt
@@ -135,24 +135,6 @@ execute_process(COMMAND "${CMAKE_COMMAND}" -G "${CMAKE_GENERATOR}" .
        WORKING_DIRECTORY "${TD_CONTRIB_DIR}/deps-download")
 execute_process(COMMAND "${CMAKE_COMMAND}" --build .
        WORKING_DIRECTORY "${TD_CONTRIB_DIR}/deps-download")
-# clear submodule
-execute_process(COMMAND git submodule deinit -f tools/taos-tools
-        WORKING_DIRECTORY "${TD_SOURCE_DIR}")
-execute_process(COMMAND git rm --cached tools/taos-tools
-        WORKING_DIRECTORY "${TD_SOURCE_DIR}")
-execute_process(COMMAND git submodule deinit -f tools/taosadapter
-        WORKING_DIRECTORY "${TD_SOURCE_DIR}")
-execute_process(COMMAND git rm --cached tools/taosadapter
-        WORKING_DIRECTORY "${TD_SOURCE_DIR}")
-execute_process(COMMAND git submodule deinit -f tools/taosws-rs
-        WORKING_DIRECTORY "${TD_SOURCE_DIR}")
-execute_process(COMMAND git rm --cached tools/taosws-rs
-        WORKING_DIRECTORY "${TD_SOURCE_DIR}")
-execute_process(COMMAND git submodule deinit -f examples/rust
-        WORKING_DIRECTORY "${TD_SOURCE_DIR}")
-execute_process(COMMAND git rm --cached examples/rust
-        WORKING_DIRECTORY "${TD_SOURCE_DIR}")
 # ================================================================================================
 # Build
@@ -349,9 +331,11 @@ endif(${BUILD_WITH_TRAFT})
 # LIBUV
 if(${BUILD_WITH_UV})
-    if (NOT ${CMAKE_SYSTEM_NAME} MATCHES "Windows")
+    if (TD_WINDOWS)
-        MESSAGE("Windows need set no-sign-compare")
+        # There is no GetHostNameW function on win7.
-        add_compile_options(-Wno-sign-compare)
+        file(READ "libuv/src/win/util.c" LIBUV_WIN_UTIL_CONTENT)
+        string(REPLACE "if (GetHostNameW(buf, UV_MAXHOSTNAMESIZE" "DWORD  nSize = UV_MAXHOSTNAMESIZE;\n  if (GetComputerNameW(buf, &nSize" LIBUV_WIN_UTIL_CONTENT "${LIBUV_WIN_UTIL_CONTENT}")
+        file(WRITE "libuv/src/win/util.c" "${LIBUV_WIN_UTIL_CONTENT}")
    endif ()
    add_subdirectory(libuv EXCLUDE_FROM_ALL)
 endif(${BUILD_WITH_UV})

--- a/docs/en/01-index.md
+++ b/docs/en/01-index.md
@@ -4,7 +4,8 @@ sidebar_label: Documentation Home
 slug: /
 ---
-TDengine is a [high-performance](https://tdengine.com/fast), [scalable](https://tdengine.com/scalable) time series database with [SQL support](https://tdengine.com/sql-support). This document is the TDengine user manual. It introduces the basic, as well as novel concepts, in TDengine, and also talks in detail about installation, features, SQL, APIs, operation, maintenance, kernel design and other topics. It’s written mainly for architects, developers and system administrators.
+TDengine is an open source, cloud native time-series database optimized for Internet of Things (IoT), Connected Cars, and Industrial IoT. It enables efficient, real-time data ingestion, processing, and monitoring of TB and even PB scale data per day, generated by billions of sensors and data collectors. This document is the TDengine user manual. It introduces both the basic and the novel concepts in TDengine, and also covers in detail installation, features, SQL, APIs, operation, maintenance, kernel design and other topics. It’s written mainly for architects, developers and system administrators.
 To get an overview of TDengine, such as a feature list, benchmarks, and competitive advantages, please browse through the [Introduction](./intro) section.

--- a/docs/en/02-intro/index.md
+++ b/docs/en/02-intro/index.md
@@ -3,7 +3,7 @@ title: Introduction
 toc_max_heading_level: 2
 ---
-TDengine is a high-performance, scalable time-series database with SQL support. Its code, including its cluster feature is open source under GNU AGPL v3.0. Besides the database engine, it provides [caching](/develop/cache), [stream processing](/develop/continuous-query), [data subscription](/develop/subscribe)  and other functionalities to reduce the complexity and cost of development and operation.
+TDengine is an open source, high-performance, cloud native time-series database optimized for Internet of Things (IoT), Connected Cars, and Industrial IoT. Its code, including its cluster feature, is open source under GNU AGPL v3.0. Besides the database engine, it provides [caching](/develop/cache), [stream processing](/develop/continuous-query), [data subscription](/develop/subscribe) and other functionalities to reduce the system complexity and cost of development and operation.
 This section introduces the major features, competitive advantages, typical use-cases and benchmarks to help you get a high level overview of TDengine.
@@ -31,25 +31,21 @@ For more details on features, please read through the entire documentation.
 ## Competitive Advantages
-Time-series data is structured, not transactional, and is rarely deleted or updated. TDengine makes full use of [these characteristics of time series data](https://tdengine.com/2019/07/09/86.html) to build its own innovative storage engine and computing engine to differentiate itself from other time series databases, with the following advantages.
+By making full use of [characteristics of time series data](https://tdengine.com/tsdb/characteristics-of-time-series-data/), TDengine differentiates itself from other time series databases, with the following advantages.
- **[High Performance](https://tdengine.com/fast)**: With an innovatively designed and purpose-built storage engine, TDengine outperforms other time series databases in data ingestion and querying while significantly reducing storage costs and compute costs.
+- **High-Performance**: TDengine is the only time-series database to solve the high-cardinality issue to support billions of data collection points while outperforming other time-series databases for data ingestion, querying and data compression.
- **[Scalable](https://tdengine.com/scalable)**: TDengine provides out-of-box scalability and high-availability through its native distributed design. Nodes can be added through simple configuration to achieve greater data processing power. In addition, this feature is open source.
+- **Simplified Solution**: Through built-in caching, stream processing and data subscription features, TDengine provides a simplified solution for time-series data processing. It reduces system design complexity and operation costs significantly.
- **[SQL Support](https://tdengine.com/sql-support)**: TDengine uses SQL as the query language, thereby reducing learning and migration costs, while adding SQL extensions to better handle time-series. Keeping NoSQL developers in mind, TDengine also supports convenient and flexible, schemaless data ingestion.
+- **Cloud Native**: Through native distributed design, sharding and partitioning, separation of compute and storage, RAFT, support for kubernetes deployment and full observability, TDengine is a cloud native Time-Series Database and can be deployed on public, private or hybrid clouds.
- **All in One**: TDengine has built-in caching, stream processing and data subscription functions. It is no longer necessary to integrate Kafka/Redis/HBase/Spark or other software in some scenarios. It makes the system architecture much simpler, cost-effective and easier to maintain.
+- **Ease of Use**: For administrators, TDengine significantly reduces the effort to deploy and maintain. For developers, it provides a simple interface, simplified solution and seamless integrations for third party tools. For data users, it gives easy data access. 
- **Seamless Integration**: Without a single line of code, TDengine provide seamless, configurable integration with third-party tools such as Telegraf, Grafana, EMQX, Prometheus, StatsD, collectd, etc. More third-party tools are being integrated.
+- **Easy Data Analytics**: Through super tables, storage and compute separation, data partitioning by time interval, pre-computation and other means, TDengine makes it easy to explore, format, and get access to data in a highly efficient way. 
- **Zero Management**: Installation and cluster setup can be done in seconds. Data partitioning and sharding are executed automatically. TDengine’s running status can be monitored via Grafana or other DevOps tools.
+- **Open Source**: TDengine’s core modules, including cluster feature, are all available under open source licenses. It has gathered 18.8k stars on GitHub. There is an active developer community, and over 139k running instances worldwide.
- **Zero Learning Costs**: With SQL as the query language and support for ubiquitous tools like Python, Java, C/C++, Go, Rust, and Node.js connectors, and a REST API, there are zero learning costs.
+With TDengine, the total cost of ownership of your time-series data platform can be greatly reduced. 1: With its superior performance, the computing and storage resources are reduced significantly; 2: With SQL support, it can be seamlessly integrated with many third party tools, and learning costs/migration costs are reduced significantly; 3: With its simplified solution and nearly zero management, the operation and maintenance costs are reduced significantly.
- **Interactive Console**: TDengine provides convenient console access to the database, through a CLI, to run ad hoc queries, maintain the database, or manage the cluster, without any programming.
-With TDengine, the total cost of ownership of your time-series data platform can be greatly reduced. 1: With its superior performance, the computing and storage resources are reduced significantly 2: With SQL support, it can be seamlessly integrated with many third party tools, and learning costs/migration costs are reduced significantly 3: With its simple architecture and zero management, the operation and maintenance costs are reduced. 
 ## Technical Ecosystem
 This is how TDengine would be situated, in a typical time-series data processing platform:

--- a/docs/en/04-concept/index.md
+++ b/docs/en/04-concept/index.md
@@ -2,7 +2,7 @@
 title: Concepts
 ---
-In order to explain the basic concepts and provide some sample code, the TDengine documentation smart meters as a typical time series use case. We assume the following: 1. Each smart meter collects three metrics i.e. current, voltage, and phase 2. There are multiple smart meters, and 3. Each meter has static attributes like location and group ID. Based on this, collected data will look similar to the following table:
+In order to explain the basic concepts and provide some sample code, the TDengine documentation uses smart meters as a typical time series use case. We assume the following: 1. Each smart meter collects three metrics i.e. current, voltage, and phase; 2. There are multiple smart meters; 3. Each meter has static attributes like location and group ID. Based on this, collected data will look similar to the following table:
 <div className="center-table">
 <table>

--- a/docs/en/07-develop/02-model/index.mdx
+++ b/docs/en/07-develop/02-model/index.mdx
@@ -2,7 +2,9 @@
 title: Data Model
 ---
-The data model employed by TDengine is similar to that of a relational database. You have to create databases and tables. You must design the data model based on your own business and application requirements. You should design the STable (an abbreviation for super table) schema to fit your data. This chapter will explain the big picture without getting into syntactical details.
+The data model employed by TDengine is similar to that of a relational database. You have to create databases and tables. You must design the data model based on your own business and application requirements. You should design the [STable](/concept/#super-table-stable) (an abbreviation for super table) schema to fit your data. This chapter will explain the big picture without getting into syntactical details. 
+Note: before you read this chapter, please make sure you have already read through [Key Concepts](/concept/), since TDengine introduces new concepts like "one table for one [data collection point](/concept/#data-collection-point)" and "[super table](/concept/#super-table-stable)".
@@ -11,7 +13,7 @@ The data model employed by TDengine is similar to that of a relational database.
 The characteristics of time-series data from different data collection points may be different. Characteristics include collection frequency, retention policy and others which determine how you create and configure the database. For example, days to keep, number of replicas, data block size, whether data updates are allowed and other configurable parameters would be determined by the characteristics of your data and your business requirements. For TDengine to operate with the best performance, we strongly recommend that you create and configure different databases for data with different characteristics. This allows you, for example, to set up different storage and retention policies. When creating a database, there are a lot of parameters that can be configured such as the days to keep data, the number of replicas, the size of the cache, time precision, the minimum and maximum number of rows in each data block, whether compression is enabled, the time range of the data in a single data file and so on. An example is shown as follows:
 ```sql
-CREATE DATABASE power KEEP 365 DURATION 10 BUFFER 16 VGROUPS 100 WAL 1;
+CREATE DATABASE power KEEP 365 DURATION 10 BUFFER 16 WAL_LEVEL 1;
 ```
 In the above SQL statement:

--- a/docs/en/07-develop/04-query-data/index.mdx
+++ b/docs/en/07-develop/04-query-data/index.mdx
@@ -54,21 +54,21 @@ In most use cases, there are always multiple kinds of data collection points. A
 In TDengine CLI `taos`, use the SQL below to get the average voltage of all the meters in California grouped by location.
 ```
-taos> SELECT AVG(voltage) FROM meters GROUP BY location;
+taos> SELECT AVG(voltage), location FROM meters GROUP BY location;
-       avg(voltage)        |            location            |
+       avg(voltage)        |                             location                             |
-=============================================================
+===============================================================================================
-             222.000000000 | California.LosAngeles                |
+             219.200000000 | California.SanFrancisco                                          |
-             219.200000000 | California.SanFrancisco               |
+             221.666666667 | California.LosAngeles                                            |
-Query OK, 2 row(s) in set (0.002136s)
+Query OK, 2 rows in database (0.005995s)
 ```
 ### Example 2
-In TDengine CLI `taos`, use the SQL below to get the number of rows and the maximum current in the past 24 hours from meters whose groupId is 2.
+In TDengine CLI `taos`, use the SQL below to get the number of rows and the maximum current from meters whose groupId is 2.
 ```
-taos> SELECT count(*), max(current) FROM meters where groupId = 2 and ts > now - 24h;
+taos> SELECT count(*), max(current) FROM meters where groupId = 2;
-     cunt(*)  |    max(current)  |
+     count(*)  |    max(current)  |
 ==================================
            5 |             13.4 |
 Query OK, 1 row(s) in set (0.002136s)
@@ -81,40 +81,41 @@ In [Select](../../taos-sql././select), all query operations are marked as to whe
 In IoT use cases, down sampling is widely used to aggregate data by time range. The `INTERVAL` keyword in TDengine can be used to simplify the query by time window. For example, the SQL statement below can be used to get the sum of current every 10 seconds from meters table d1001.
 ```
-taos> SELECT sum(current) FROM d1001 INTERVAL(10s);
+taos> SELECT _wstart, sum(current) FROM d1001 INTERVAL(10s);
-           ts            |       sum(current)        |
+         _wstart         |       sum(current)        |
 ======================================================
 2018-10-03 14:38:00.000 |              10.300000191 |
 2018-10-03 14:38:10.000 |              24.900000572 |
-Query OK, 2 row(s) in set (0.000883s)
+Query OK, 2 rows in database (0.003139s)
 ```
 Down sampling can also be used for STable. For example, the below SQL statement can be used to get the sum of current from all meters in California.
 ```
-taos> SELECT SUM(current) FROM meters where location like "California%" INTERVAL(1s);
+taos> SELECT _wstart, SUM(current) FROM meters where location like "California%" INTERVAL(1s);
-           ts            |       sum(current)        |
+         _wstart         |       sum(current)        |
 ======================================================
 2018-10-03 14:38:04.000 |              10.199999809 |
- 2018-10-03 14:38:05.000 |              32.900000572 |
+ 2018-10-03 14:38:05.000 |              23.699999809 |
 2018-10-03 14:38:06.000 |              11.500000000 |
 2018-10-03 14:38:15.000 |              12.600000381 |
- 2018-10-03 14:38:16.000 |              36.000000000 |
+ 2018-10-03 14:38:16.000 |              34.400000572 |
-Query OK, 5 row(s) in set (0.001538s)
+Query OK, 5 rows in database (0.007413s)
 ```
 Down sampling also supports time offset. For example, the below SQL statement can be used to get the sum of current from all meters but each time window must start at the boundary of 500 milliseconds.
 ```
-taos> SELECT SUM(current) FROM meters INTERVAL(1s, 500a);
+taos> SELECT _wstart, SUM(current) FROM meters INTERVAL(1s, 500a);
-           ts            |       sum(current)        |
+         _wstart         |       sum(current)        |
 ======================================================
- 2018-10-03 14:38:04.500 |              11.189999809 |
+ 2018-10-03 14:38:03.500 |              10.199999809 |
- 2018-10-03 14:38:05.500 |              31.900000572 |
+ 2018-10-03 14:38:04.500 |              10.300000191 |
- 2018-10-03 14:38:06.500 |              11.600000000 |
+ 2018-10-03 14:38:05.500 |              13.399999619 |
- 2018-10-03 14:38:15.500 |              12.300000381 |
+ 2018-10-03 14:38:06.500 |              11.500000000 |
- 2018-10-03 14:38:16.500 |              35.000000000 |
+ 2018-10-03 14:38:14.500 |              12.600000381 |
-Query OK, 5 row(s) in set (0.001521s)
+ 2018-10-03 14:38:16.500 |              34.400000572 |
+Query OK, 6 rows in database (0.005515s)
 ```
 In many use cases, it's hard to align the timestamp of the data collected by each collection point. However, a lot of algorithms like FFT require the data to be aligned with the same time interval and application programs have to handle this by themselves. In TDengine, it's easy to achieve the alignment using down sampling.

--- a/docs/examples/node/nativeexample/subscribe_demo.js
+++ b/docs/examples/node/nativeexample/subscribe_demo.js
@@ -28,8 +28,7 @@ function runConsumer() {
        console.log(msg.topicPartition);
        console.log(msg.block);
        console.log(msg.fields)
-        // fixme(@xiaolei): commented temp, should be fixed.
+        consumer.commit(msg);
-        //consumer.commit(msg);
        console.log(`=======consumer ${i} done`)
    }

--- a/docs/examples/node/package.json
+++ b/docs/examples/node/package.json
@@ -4,7 +4,7 @@
  "main": "index.js",
  "license": "MIT",
  "dependencies": {
-    "@tdengine/client": "^3.0.0",
+    "@tdengine/client": "^3.0.1",
    "@tdengine/rest": "^3.0.0"
  }
 }
--- a/docs/zh/07-develop/02-model/index.mdx
+++ b/docs/zh/07-develop/02-model/index.mdx
@@ -11,10 +11,10 @@ TDengine 采用类关系型数据模型，需要建库、建表。因此对于
 不同类型的数据采集点往往具有不同的数据特征，包括数据采集频率的高低，数据保留时间的长短，副本的数目，数据块的大小，是否允许更新数据等等。为了在各种场景下 TDengine 都能最大效率的工作，TDengine 建议将不同数据特征的表创建在不同的库里，因为每个库可以配置不同的存储策略。创建一个库时，除 SQL 标准的选项外，还可以指定保留时长、副本数、缓存大小、时间精度、文件块里最大最小记录条数、是否压缩、一个数据文件覆盖的天数等多种参数。比如：
 ```sql
-CREATE DATABASE power KEEP 365 DURATION 10 BUFFER 16 VGROUPS 100 WAL 1;
+CREATE DATABASE power KEEP 365 DURATION 10 BUFFER 16 WAL_LEVEL 1;
 ```
-上述语句将创建一个名为 power 的库，这个库的数据将保留 365 天（超过 365 天将被自动删除），每 10 天一个数据文件，每个 VNODE 的写入内存池的大小为 16 MB，数据库的 VGROUPS 数量，对该数据库入会写 WAL 但不执行 FSYNC。详细的语法及参数请见 [数据库管理](/taos-sql/database) 章节。
+上述语句将创建一个名为 power 的库，这个库的数据将保留 365 天（超过 365 天将被自动删除），每 10 天一个数据文件，每个 VNODE 的写入内存池的大小为 16 MB，对该数据库的写入会写 WAL 但不执行 FSYNC。详细的语法及参数请见 [数据库管理](/taos-sql/database) 章节。
 创建库之后，需要使用 SQL 命令 `USE` 将当前库切换过来，例如：

--- a/docs/zh/07-develop/04-query-data/index.mdx
+++ b/docs/zh/07-develop/04-query-data/index.mdx
@@ -54,20 +54,20 @@ Query OK, 2 row(s) in set (0.001100s)
 在 TAOS Shell，查找加利福尼亚州所有智能电表采集的电压平均值，并按照 location 分组。
 ```
-taos> SELECT AVG(voltage) FROM meters GROUP BY location;
+taos> SELECT AVG(voltage), location FROM meters GROUP BY location;
-       avg(voltage)        |            location            |
+       avg(voltage)        |                             location                             |
-=============================================================
+===============================================================================================
-             222.000000000 | California.LosAngeles                |
+             219.200000000 | California.SanFrancisco                                          |
-             219.200000000 | California.SanFrancisco               |
+             221.666666667 | California.LosAngeles                                            |
-Query OK, 2 row(s) in set (0.002136s)
+Query OK, 2 rows in database (0.005995s)
 ```
 ### 示例二
-在 TAOS shell, 查找 groupId 为 2 的所有智能电表过去 24 小时的记录条数，电流的最大值。
+在 TAOS shell, 查找 groupId 为 2 的所有智能电表的记录条数，电流的最大值。
 ```
-taos> SELECT count(*), max(current) FROM meters where groupId = 2 and ts > now - 24h;
+taos> SELECT count(*), max(current) FROM meters where groupId = 2;
     count(*)  |    max(current)  |
 ==================================
            5 |             13.4 |
@@ -81,40 +81,41 @@ Query OK, 1 row(s) in set (0.002136s)
 物联网场景里，经常需要通过降采样（down sampling）将采集的数据按时间段进行聚合。TDengine 提供了一个简便的关键词 interval 让按照时间窗口的查询操作变得极为简单。比如，将智能电表 d1001 采集的电流值每 10 秒钟求和
 ```
-taos> SELECT sum(current) FROM d1001 INTERVAL(10s);
+taos> SELECT _wstart, sum(current) FROM d1001 INTERVAL(10s);
-           ts            |       sum(current)        |
+         _wstart         |       sum(current)        |
 ======================================================
 2018-10-03 14:38:00.000 |              10.300000191 |
 2018-10-03 14:38:10.000 |              24.900000572 |
-Query OK, 2 row(s) in set (0.000883s)
+Query OK, 2 rows in database (0.003139s)
 ```
 降采样操作也适用于超级表，比如：将加利福尼亚州所有智能电表采集的电流值每秒钟求和
 ```
-taos> SELECT SUM(current) FROM meters where location like "California%" INTERVAL(1s);
+taos> SELECT _wstart, SUM(current) FROM meters where location like "California%" INTERVAL(1s);
-           ts            |       sum(current)        |
+         _wstart         |       sum(current)        |
 ======================================================
 2018-10-03 14:38:04.000 |              10.199999809 |
- 2018-10-03 14:38:05.000 |              32.900000572 |
+ 2018-10-03 14:38:05.000 |              23.699999809 |
 2018-10-03 14:38:06.000 |              11.500000000 |
 2018-10-03 14:38:15.000 |              12.600000381 |
- 2018-10-03 14:38:16.000 |              36.000000000 |
+ 2018-10-03 14:38:16.000 |              34.400000572 |
-Query OK, 5 row(s) in set (0.001538s)
+Query OK, 5 rows in database (0.007413s)
 ```
 降采样操作也支持时间偏移，比如：将所有智能电表采集的电流值每秒钟求和，但要求每个时间窗口从 500 毫秒开始
 ```
-taos> SELECT SUM(current) FROM meters INTERVAL(1s, 500a);
+taos> SELECT _wstart, SUM(current) FROM meters INTERVAL(1s, 500a);
-           ts            |       sum(current)        |
+         _wstart         |       sum(current)        |
 ======================================================
- 2018-10-03 14:38:04.500 |              11.189999809 |
+ 2018-10-03 14:38:03.500 |              10.199999809 |
- 2018-10-03 14:38:05.500 |              31.900000572 |
+ 2018-10-03 14:38:04.500 |              10.300000191 |
- 2018-10-03 14:38:06.500 |              11.600000000 |
+ 2018-10-03 14:38:05.500 |              13.399999619 |
- 2018-10-03 14:38:15.500 |              12.300000381 |
+ 2018-10-03 14:38:06.500 |              11.500000000 |
- 2018-10-03 14:38:16.500 |              35.000000000 |
+ 2018-10-03 14:38:14.500 |              12.600000381 |
-Query OK, 5 row(s) in set (0.001521s)
+ 2018-10-03 14:38:16.500 |              34.400000572 |
+Query OK, 6 rows in database (0.005515s)
 ```
 物联网场景里，每个数据采集点采集数据的时间是难同步的，但很多分析算法(比如 FFT)需要把采集的数据严格按照时间等间隔的对齐，在很多系统里，需要应用自己写程序来处理，但使用 TDengine 的降采样操作就轻松解决。

--- a/docs/zh/12-taos-sql/03-table.md
+++ b/docs/zh/12-taos-sql/03-table.md
@@ -110,7 +110,7 @@ alter_table_option: {
 对普通表可以进行如下修改操作
 1. ADD COLUMN：添加列。
 2. DROP COLUMN：删除列。
-3. ODIFY COLUMN：修改列定义，如果数据列的类型是可变长类型，那么可以使用此指令修改其宽度，只能改大，不能改小。
+3. MODIFY COLUMN：修改列定义，如果数据列的类型是可变长类型，那么可以使用此指令修改其宽度，只能改大，不能改小。
 4. RENAME COLUMN：修改列名称。
 ### 增加列
@@ -195,4 +195,4 @@ SHOW CREATE TABLE tb_name;
 ```
 DESCRIBE [db_name.]tb_name;
 ```
\ No newline at end of file
--- a/docs/zh/14-reference/11-docker/index.md
+++ b/docs/zh/14-reference/11-docker/index.md
--- a/docs/zh/17-operation/03-tolerance.md
+++ b/docs/zh/17-operation/03-tolerance.md
@@ -26,5 +26,3 @@ TDengine 集群中的时序数据的副本数是与数据库关联的，一个
 TDengine 集群的节点数必须大于等于副本数，否则创建表时将报错。
 当 TDengine 集群中的节点部署在不同的物理机上，并设置多个副本数时，就实现了系统的高可靠性，无需再使用其他软件或工具。TDengine 企业版还可以将副本部署在不同机房，从而实现异地容灾。
-另外一种灾备方式是通过 `taosX` 将一个 TDengine 集群的数据同步复制到物理上位于不同数据中心的另一个 TDengine 集群。其详细使用方法请参考 [taosX 参考手册](../../reference/taosX)
--- a/examples/c/stream_demo.c
+++ b/examples/c/stream_demo.c
@@ -98,10 +98,9 @@ int32_t create_stream() {
  /*const char* sql = "select min(k), max(k), sum(k) as sum_of_k from st1";*/
  /*const char* sql = "select sum(k) from tu1 interval(10m)";*/
  /*pRes = tmq_create_stream(pConn, "stream1", "out1", sql);*/
-  pRes =
+  pRes = taos_query(pConn,
-      taos_query(pConn,
+                    "create stream stream1 trigger max_delay 10s watermark 10s into outstb as select _wstart start, "
-                 "create stream stream1 trigger max_delay 10s into outstb as select _wstart, sum(k) from st1 partition "
+                    "count(k) from st1 partition by tbname interval(20s) ");
-                 "by tbname session(ts, 10s) ");
  if (taos_errno(pRes) != 0) {
    printf("failed to create stream stream1, reason:%s\n", taos_errstr(pRes));
    return -1;

--- a/include/common/tcommon.h
+++ b/include/common/tcommon.h
@@ -60,6 +60,7 @@ enum {
  STREAM_INPUT__DATA_RETRIEVE,
  STREAM_INPUT__GET_RES,
  STREAM_INPUT__CHECKPOINT,
+  STREAM_INPUT__DESTROY,
 };
 typedef enum EStreamType {

--- a/include/common/tdataformat.h
+++ b/include/common/tdataformat.h
@@ -38,22 +38,18 @@ typedef struct STagVal       STagVal;
 typedef struct STag          STag;
 // bitmap
-#define N1(n)        ((1 << (n)) - 1)
+const static uint8_t BIT2_MAP[4][4] = {{0b00000000, 0b00000001, 0b00000010, 0},
-#define BIT1_SIZE(n) (((n)-1) / 8 + 1)
+                                       {0b00000000, 0b00000100, 0b00001000, 2},
-#define BIT2_SIZE(n) (((n)-1) / 4 + 1)
+                                       {0b00000000, 0b00010000, 0b00100000, 4},
-#define SET_BIT1(p, i, v)                            \
+                                       {0b00000000, 0b01000000, 0b10000000, 6}};
-  do {                                               \
-    (p)[(i) / 8] &= N1((i) % 8);                     \
+#define N1(n)             ((((uint8_t)1) << (n)) - 1)
-    (p)[(i) / 8] |= (((uint8_t)(v)) << (((i) % 8))); \
+#define BIT1_SIZE(n)      ((((n)-1) >> 3) + 1)
-  } while (0)
+#define BIT2_SIZE(n)      ((((n)-1) >> 2) + 1)
+#define SET_BIT1(p, i, v) ((p)[(i) >> 3] = (p)[(i) >> 3] & N1((i)&7) | (((uint8_t)(v)) << ((i)&7)))
-#define GET_BIT1(p, i) (((p)[(i) / 8] >> ((i) % 8)) & ((uint8_t)1))
+#define GET_BIT1(p, i)    (((p)[(i) >> 3] >> ((i)&7)) & ((uint8_t)1))
-#define SET_BIT2(p, i, v)                                \
+#define SET_BIT2(p, i, v) ((p)[(i) >> 2] = (p)[(i) >> 2] & N1(BIT2_MAP[(i)&3][3]) | BIT2_MAP[(i)&3][(v)])
-  do {                                                   \
+#define GET_BIT2(p, i)    (((p)[(i) >> 2] >> BIT2_MAP[(i)&3][3]) & ((uint8_t)3))
-    p[(i) / 4] &= N1((i) % 4 * 2);                       \
-    (p)[(i) / 4] |= (((uint8_t)(v)) << (((i) % 4) * 2)); \
-  } while (0)
-#define GET_BIT2(p, i) (((p)[(i) / 4] >> (((i) % 4) * 2)) & ((uint8_t)3))
 // STSchema
 int32_t tTSchemaCreate(int32_t sver, SSchema *pSchema, int32_t nCols, STSchema **ppTSchema);
@@ -171,7 +167,7 @@ struct SColVal {
 #pragma pack(push, 1)
 struct STagVal {
-//  char colName[TSDB_COL_NAME_LEN]; // only used for tmq_get_meta
+  //  char colName[TSDB_COL_NAME_LEN]; // only used for tmq_get_meta
  union {
    int16_t cid;
    char   *pKey;

--- a/include/common/tmsg.h
+++ b/include/common/tmsg.h
@@ -2664,29 +2664,8 @@ typedef struct {
 } SVgEpSet;
 typedef struct {
-  int64_t suid;
+  int32_t padding;
-  int8_t  level;
+} SRSmaExecMsg;
-} SRSmaFetchMsg;
-static FORCE_INLINE int32_t tEncodeSRSmaFetchMsg(SEncoder* pCoder, const SRSmaFetchMsg* pReq) {
-  if (tStartEncode(pCoder) < 0) return -1;
-  if (tEncodeI64(pCoder, pReq->suid) < 0) return -1;
-  if (tEncodeI8(pCoder, pReq->level) < 0) return -1;
-  tEndEncode(pCoder);
-  return 0;
-}
-static FORCE_INLINE int32_t tDecodeSRSmaFetchMsg(SDecoder* pCoder, SRSmaFetchMsg* pReq) {
-  if (tStartDecode(pCoder) < 0) return -1;
-  if (tDecodeI64(pCoder, &pReq->suid) < 0) return -1;
-  if (tDecodeI8(pCoder, &pReq->level) < 0) return -1;
-  tEndDecode(pCoder);
-  return 0;
-}
 typedef struct {
  int8_t         version;       // for compatibility(default 0)

--- a/include/common/tmsgdef.h
+++ b/include/common/tmsgdef.h
@@ -201,7 +201,8 @@ enum {
  TD_DEF_MSG_TYPE(TDMT_VND_CANCEL_SMA, "vnode-cancel-sma", NULL, NULL)
  TD_DEF_MSG_TYPE(TDMT_VND_DROP_SMA, "vnode-drop-sma", NULL, NULL)
  TD_DEF_MSG_TYPE(TDMT_VND_SUBMIT_RSMA, "vnode-submit-rsma", SSubmitReq, SSubmitRsp)
-  TD_DEF_MSG_TYPE(TDMT_VND_FETCH_RSMA, "vnode-fetch-rsma", SRSmaFetchMsg, NULL)
+  TD_DEF_MSG_TYPE(TDMT_VND_FETCH_RSMA, "vnode-fetch-rsma", NULL, NULL)
+  TD_DEF_MSG_TYPE(TDMT_VND_EXEC_RSMA, "vnode-exec-rsma", NULL, NULL)
  TD_DEF_MSG_TYPE(TDMT_VND_DELETE, "delete-data", SVDeleteReq, SVDeleteRsp)
  TD_DEF_MSG_TYPE(TDMT_VND_BATCH_DEL, "batch-delete", SBatchDeleteReq, NULL)
  TD_DEF_MSG_TYPE(TDMT_VND_ALTER_CONFIG, "alter-config", NULL, NULL)

--- a/include/libs/nodes/nodes.h
+++ b/include/libs/nodes/nodes.h
@@ -105,7 +105,7 @@ typedef enum ENodeType {
  QUERY_NODE_COLUMN_REF,
  // Statement nodes are used in parser and planner module.
-  QUERY_NODE_SET_OPERATOR,
+  QUERY_NODE_SET_OPERATOR = 100,
  QUERY_NODE_SELECT_STMT,
  QUERY_NODE_VNODE_MODIF_STMT,
  QUERY_NODE_CREATE_DATABASE_STMT,
@@ -198,7 +198,7 @@ typedef enum ENodeType {
  QUERY_NODE_QUERY,
  // logic plan node
-  QUERY_NODE_LOGIC_PLAN_SCAN,
+  QUERY_NODE_LOGIC_PLAN_SCAN = 1000,
  QUERY_NODE_LOGIC_PLAN_JOIN,
  QUERY_NODE_LOGIC_PLAN_AGG,
  QUERY_NODE_LOGIC_PLAN_PROJECT,
@@ -215,7 +215,7 @@ typedef enum ENodeType {
  QUERY_NODE_LOGIC_PLAN,
  // physical plan node
-  QUERY_NODE_PHYSICAL_PLAN_TAG_SCAN,
+  QUERY_NODE_PHYSICAL_PLAN_TAG_SCAN = 1100,
  QUERY_NODE_PHYSICAL_PLAN_TABLE_SCAN,
  QUERY_NODE_PHYSICAL_PLAN_TABLE_SEQ_SCAN,
  QUERY_NODE_PHYSICAL_PLAN_TABLE_MERGE_SCAN,

--- a/include/libs/nodes/querynodes.h
+++ b/include/libs/nodes/querynodes.h
@@ -428,6 +428,9 @@ void    nodesValueNodeToVariant(const SValueNode* pNode, SVariant* pVal);
 char*   nodesGetFillModeString(EFillMode mode);
 int32_t nodesMergeConds(SNode** pDst, SNodeList** pSrc);
+const char* operatorTypeStr(EOperatorType type);
+const char* logicConditionTypeStr(ELogicConditionType type);
 #ifdef __cplusplus
 }
 #endif

--- a/include/libs/stream/tstream.h
+++ b/include/libs/stream/tstream.h
@@ -53,6 +53,7 @@ enum {
  TASK_SCHED_STATUS__WAITING,
  TASK_SCHED_STATUS__ACTIVE,
  TASK_SCHED_STATUS__FAILED,
+  TASK_SCHED_STATUS__DROPPING,
 };
 enum {
@@ -127,6 +128,10 @@ typedef struct {
  int8_t type;
 } SStreamCheckpoint;
+typedef struct {
+  int8_t type;
+} SStreamTaskDestroy;
 typedef struct {
  int8_t       type;
  SSDataBlock* pBlock;
@@ -211,7 +216,6 @@ typedef struct {
  void*     vnode;
  FTbSink*  tbSinkFunc;
  STSchema* pTSchema;
-  SHashObj* pHash;  // groupId to tbuid
 } STaskSinkTb;
 typedef void FSmaSink(void* vnode, int64_t smaId, const SArray* data);

--- a/include/libs/sync/sync.h
+++ b/include/libs/sync/sync.h
@@ -33,7 +33,8 @@ extern bool gRaftDetailLog;
 #define SYNC_MAX_READ_RANGE          2
 #define SYNC_MAX_PROGRESS_WAIT_MS    4000
 #define SYNC_MAX_START_TIME_RANGE_MS (1000 * 20)
-#define SYNC_MAX_RECV_TIME_RANGE_MS  1000
+#define SYNC_MAX_RECV_TIME_RANGE_MS  1200
+#define SYNC_ADD_QUORUM_COUNT        3
 #define SYNC_MAX_BATCH_SIZE 1
 #define SYNC_INDEX_BEGIN    0

--- a/include/util/taoserror.h
+++ b/include/util/taoserror.h
@@ -291,6 +291,7 @@ int32_t* taosGetErrno();
 #define TSDB_CODE_MND_STREAM_NOT_EXIST          TAOS_DEF_ERROR_CODE(0, 0x03F1)
 #define TSDB_CODE_MND_INVALID_STREAM_OPTION     TAOS_DEF_ERROR_CODE(0, 0x03F2)
 #define TSDB_CODE_MND_STREAM_MUST_BE_DELETED    TAOS_DEF_ERROR_CODE(0, 0x03F3)
+#define TSDB_CODE_MND_STREAM_TASK_DROPPED       TAOS_DEF_ERROR_CODE(0, 0x03F4)
 // mnode-sma
 #define TSDB_CODE_MND_SMA_ALREADY_EXIST         TAOS_DEF_ERROR_CODE(0, 0x0480)
@@ -614,6 +615,7 @@ int32_t* taosGetErrno();
 #define TSDB_CODE_RSMA_REMOVE_EXISTS             TAOS_DEF_ERROR_CODE(0, 0x3154)
 #define TSDB_CODE_RSMA_FETCH_MSG_MSSED_UP        TAOS_DEF_ERROR_CODE(0, 0x3155)
 #define TSDB_CODE_RSMA_EMPTY_INFO                TAOS_DEF_ERROR_CODE(0, 0x3156)
+#define TSDB_CODE_RSMA_INVALID_SCHEMA            TAOS_DEF_ERROR_CODE(0, 0x3157)
 //index
 #define TSDB_CODE_INDEX_REBUILDING               TAOS_DEF_ERROR_CODE(0, 0x3200)

--- a/include/util/tdef.h
+++ b/include/util/tdef.h
@@ -132,15 +132,14 @@ typedef enum EOperatorType {
  OP_TYPE_DIV,
  OP_TYPE_REM,
  // unary arithmetic operator
-  OP_TYPE_MINUS,
+  OP_TYPE_MINUS = 20,
-  OP_TYPE_ASSIGN,
  // bitwise operator
-  OP_TYPE_BIT_AND,
+  OP_TYPE_BIT_AND = 30,
  OP_TYPE_BIT_OR,
  // binary comparison operator
-  OP_TYPE_GREATER_THAN,
+  OP_TYPE_GREATER_THAN = 40,
  OP_TYPE_GREATER_EQUAL,
  OP_TYPE_LOWER_THAN,
  OP_TYPE_LOWER_EQUAL,
@@ -153,7 +152,7 @@ typedef enum EOperatorType {
  OP_TYPE_MATCH,
  OP_TYPE_NMATCH,
  // unary comparison operator
-  OP_TYPE_IS_NULL,
+  OP_TYPE_IS_NULL = 100,
  OP_TYPE_IS_NOT_NULL,
  OP_TYPE_IS_TRUE,
  OP_TYPE_IS_FALSE,
@@ -163,8 +162,11 @@ typedef enum EOperatorType {
  OP_TYPE_IS_NOT_UNKNOWN,
  // json operator
-  OP_TYPE_JSON_GET_VALUE,
+  OP_TYPE_JSON_GET_VALUE = 150,
-  OP_TYPE_JSON_CONTAINS
+  OP_TYPE_JSON_CONTAINS,
+  // internal operator
+  OP_TYPE_ASSIGN = 200
 } EOperatorType;
 #define OP_TYPE_CALC_MAX OP_TYPE_BIT_OR

--- a/include/util/tqueue.h
+++ b/include/util/tqueue.h
@@ -76,6 +76,7 @@ void       taosFreeQall(STaosQall *qall);
 int32_t    taosReadAllQitems(STaosQueue *queue, STaosQall *qall);
 int32_t    taosGetQitem(STaosQall *qall, void **ppItem);
 void       taosResetQitems(STaosQall *qall);
+int32_t    taosQallItemSize(STaosQall *qall);
 STaosQset *taosOpenQset();
 void       taosCloseQset(STaosQset *qset);

--- a/packaging/tools/make_install.bat
+++ b/packaging/tools/make_install.bat
 @echo off
 goto %1
 :needAdmin
+rem Warn when the default data directory still holds TDengine 2.x data.
+if exist C:\\TDengine\\data\\dnode\\dnodeCfg.json (
+  echo The default data directory C:/TDengine/data contains old data of tdengine 2.x, please clear it before installing!
+)
+rem Arguments: %2 = source dir, %3 = binary dir, %4 = OS type, %5 = version number.
+rem Forward slashes in the two directories are rewritten to backslashes.
+set source_dir=%2
+set source_dir=%source_dir:/=\\%
+set binary_dir=%3
+set binary_dir=%binary_dir:/=\\%
+set osType=%4
+set verNumber=%5
+rem Single definition of the install root used by every copy below.
+set target_dir=C:\\TDengine
+if not exist %target_dir% (
+    mkdir %target_dir%
+)
+if not exist %target_dir%\\cfg (
+    mkdir %target_dir%\\cfg
+)
+if not exist %target_dir%\\include (
+    mkdir %target_dir%\\include
+)
+if not exist %target_dir%\\driver (
+    mkdir %target_dir%\\driver
+)
+rem Existing configuration files are kept; defaults are copied only when missing.
+if not exist %target_dir%\\cfg\\taos.cfg (
+    copy %source_dir%\\packaging\\cfg\\taos.cfg %target_dir%\\cfg\\taos.cfg > nul
+)
+if exist %binary_dir%\\test\\cfg\\taosadapter.toml (
+    if not exist %target_dir%\\cfg\\taosadapter.toml (
+        copy %binary_dir%\\test\\cfg\\taosadapter.toml %target_dir%\\cfg\\taosadapter.toml > nul
+    )
+)
+rem Headers, libraries and executables.
+copy %source_dir%\\include\\client\\taos.h %target_dir%\\include > nul
+copy %source_dir%\\include\\util\\taoserror.h %target_dir%\\include > nul
+copy %source_dir%\\include\\libs\\function\\taosudf.h %target_dir%\\include > nul
+copy %binary_dir%\\build\\lib\\taos.lib %target_dir%\\driver > nul
+copy %binary_dir%\\build\\lib\\taos_static.lib %target_dir%\\driver > nul
+copy %binary_dir%\\build\\lib\\taos.dll %target_dir%\\driver > nul
+copy %binary_dir%\\build\\bin\\taos.exe %target_dir% > nul
+copy %binary_dir%\\build\\bin\\taosd.exe %target_dir% > nul
+copy %binary_dir%\\build\\bin\\udfd.exe %target_dir% > nul
+rem Optional tools are installed only when they were built.
+if exist %binary_dir%\\build\\bin\\taosBenchmark.exe (
+    copy %binary_dir%\\build\\bin\\taosBenchmark.exe %target_dir% > nul
+)
+if exist %binary_dir%\\build\\bin\\taosadapter.exe (
+    copy %binary_dir%\\build\\bin\\taosadapter.exe %target_dir% > nul
+)
 mshta vbscript:createobject("shell.application").shellexecute("%~s0",":hasAdmin","","runas",1)(window.close)&& echo To start/stop TDengine with administrator privileges: sc start/stop taosd &goto :eof
 :hasAdmin
-cp -f C:\\TDengine\\driver\\taos.dll C:\\Windows\\System32
+copy /y C:\\TDengine\\driver\\taos.dll C:\\Windows\\System32 > nul
 sc query "taosd" >nul || sc create "taosd" binPath= "C:\\TDengine\\taosd.exe --win_service" start= DEMAND
--- a/packaging/tools/make_install.sh
+++ b/packaging/tools/make_install.sh
@@ -664,7 +664,10 @@ function install_TDengine() {
 ## ==============================Main program starts from here============================
 echo source directory: $1
 echo binary directory: $2
-if [ "$osType" != "Darwin" ]; then
+# A leftover dnodeCfg.json is a plain data file, so test for existence (-f), not the executable bit (-x).
+if [ -f "${data_dir}/dnode/dnodeCfg.json" ]; then
+  echo -e "\033[44;31;5mThe default data directory ${data_dir} contains old data of tdengine 2.x, please clear it before installing!\033[0m"
+elif [ "$osType" != "Darwin" ]; then
  if [ -x ${bin_dir}/${clientName} ]; then
    update_TDengine
  else

--- a/source/client/test/clientTests.cpp
+++ b/source/client/test/clientTests.cpp
@@ -123,7 +123,7 @@ void createNewTable(TAOS* pConn, int32_t index) {
  }
  taos_free_result(pRes);
-  for(int32_t i = 0; i < 100000; i += 20) {
+  for(int32_t i = 0; i < 3280; i += 20) {
    char sql[1024] = {0};
    sprintf(sql,
            "insert into tu%d values(now+%da, %d)(now+%da, %d)(now+%da, %d)(now+%da, %d)"
@@ -679,30 +679,28 @@ TEST(testCase, projection_query_tables) {
  TAOS_RES* pRes = taos_query(pConn, "use abc1");
  taos_free_result(pRes);
-  pRes = taos_query(pConn, "explain verbose true select _wstart,count(*),a from st1 partition by a interval(1s)");
+  pRes = taos_query(pConn, "create stable st1 (ts timestamp, k int) tags(a int)");
-  printResult(pRes);
+  if (taos_errno(pRes) != 0) {
-//  pRes = taos_query(pConn, "create stable st1 (ts timestamp, k int) tags(a int)");
+    printf("failed to create table tu, reason:%s\n", taos_errstr(pRes));
-//  if (taos_errno(pRes) != 0) {
+  }
-//    printf("failed to create table tu, reason:%s\n", taos_errstr(pRes));
+  taos_free_result(pRes);
-//  }
-//  taos_free_result(pRes);
+  pRes = taos_query(pConn, "create stable st2 (ts timestamp, k int) tags(a int)");
-//
+  if (taos_errno(pRes) != 0) {
-//  pRes = taos_query(pConn, "create stable st2 (ts timestamp, k int) tags(a int)");
+    printf("failed to create table tu, reason:%s\n", taos_errstr(pRes));
-//  if (taos_errno(pRes) != 0) {
+  }
-//    printf("failed to create table tu, reason:%s\n", taos_errstr(pRes));
+  taos_free_result(pRes);
-//  }
-//  taos_free_result(pRes);
+  pRes = taos_query(pConn, "create table tu using st1 tags(1)");
-//
+  if (taos_errno(pRes) != 0) {
-//  pRes = taos_query(pConn, "create table tu using st1 tags(1)");
+    printf("failed to create table tu, reason:%s\n", taos_errstr(pRes));
-//  if (taos_errno(pRes) != 0) {
+  }
-//    printf("failed to create table tu, reason:%s\n", taos_errstr(pRes));
+  taos_free_result(pRes);
-//  }
-//  taos_free_result(pRes);
+  for(int32_t i = 0; i < 2; ++i) {
-//
+    printf("create table :%d\n", i);
-//  for(int32_t i = 0; i < 1; ++i) {
+    createNewTable(pConn, i);
-//    printf("create table :%d\n", i);
+  }
-//    createNewTable(pConn, i);
-//  }
 //
 //  pRes = taos_query(pConn, "select * from tu");
 //  if (taos_errno(pRes) != 0) {

--- a/source/common/src/systable.c
+++ b/source/common/src/systable.c
@@ -88,7 +88,7 @@ static const SSysDbTableSchema userDBSchema[] = {
    {.name = "comp", .bytes = 1, .type = TSDB_DATA_TYPE_TINYINT},
    {.name = "precision", .bytes = 2 + VARSTR_HEADER_SIZE, .type = TSDB_DATA_TYPE_VARCHAR},
    {.name = "status", .bytes = 10 + VARSTR_HEADER_SIZE, .type = TSDB_DATA_TYPE_VARCHAR},
-    {.name = "retention", .bytes = 60 + VARSTR_HEADER_SIZE, .type = TSDB_DATA_TYPE_VARCHAR},
+    {.name = "retentions", .bytes = 60 + VARSTR_HEADER_SIZE, .type = TSDB_DATA_TYPE_VARCHAR},
    {.name = "single_stable", .bytes = 1, .type = TSDB_DATA_TYPE_BOOL},
    {.name = "cachemodel", .bytes = TSDB_CACHE_MODEL_STR_LEN + VARSTR_HEADER_SIZE, .type = TSDB_DATA_TYPE_VARCHAR},
    {.name = "cachesize", .bytes = 4, .type = TSDB_DATA_TYPE_INT},

--- a/source/common/src/tglobal.c
+++ b/source/common/src/tglobal.c
@@ -75,7 +75,7 @@ int32_t  tsMonitorMaxLogs = 100;
 bool     tsMonitorComp = false;
 // telem
-bool     tsEnableTelem = false;
+bool     tsEnableTelem = true;
 int32_t  tsTelemInterval = 86400;
 char     tsTelemServer[TSDB_FQDN_LEN] = "telemetry.taosdata.com";
 uint16_t tsTelemPort = 80;
@@ -166,7 +166,7 @@ int32_t tsTtlPushInterval = 86400;
 int32_t tsGrantHBInterval = 60;
 #ifndef _STORAGE
-int32_t taosSetTfsCfg(SConfig *pCfg) { 
+int32_t taosSetTfsCfg(SConfig *pCfg) {
  SConfigItem *pItem = cfgGetItem(pCfg, "dataDir");
  memset(tsDataDir, 0, PATH_MAX);
@@ -180,7 +180,7 @@ int32_t taosSetTfsCfg(SConfig *pCfg) {
    uError("failed to create dataDir:%s", tsDataDir);
    return -1;
  }
-  return 0; 
+  return 0;
 }
 #else
 int32_t taosSetTfsCfg(SConfig *pCfg);

--- a/source/common/src/ttypes.c
+++ b/source/common/src/ttypes.c
@@ -392,10 +392,10 @@ tDataTypeDescriptor tDataTypes[TSDB_DATA_TYPE_MAX] = {
     getStatics_i64},
    {TSDB_DATA_TYPE_FLOAT, 5, FLOAT_BYTES, "FLOAT", 0, 0, tsCompressFloat, tsDecompressFloat, getStatics_f},
    {TSDB_DATA_TYPE_DOUBLE, 6, DOUBLE_BYTES, "DOUBLE", 0, 0, tsCompressDouble, tsDecompressDouble, getStatics_d},
-    {TSDB_DATA_TYPE_VARCHAR, 6, 0, "VARCHAR", 0, 0, tsCompressString, tsDecompressString, getStatics_bin},
+    {TSDB_DATA_TYPE_VARCHAR, 6, 1, "VARCHAR", 0, 0, tsCompressString, tsDecompressString, getStatics_bin},
    {TSDB_DATA_TYPE_TIMESTAMP, 9, LONG_BYTES, "TIMESTAMP", INT64_MIN, INT64_MAX, tsCompressTimestamp,
     tsDecompressTimestamp, getStatics_i64},
-    {TSDB_DATA_TYPE_NCHAR, 5, 8, "NCHAR", 0, 0, tsCompressString, tsDecompressString, getStatics_nchr},
+    {TSDB_DATA_TYPE_NCHAR, 5, 1, "NCHAR", 0, 0, tsCompressString, tsDecompressString, getStatics_nchr},
    {TSDB_DATA_TYPE_UTINYINT, 16, CHAR_BYTES, "TINYINT UNSIGNED", 0, UINT8_MAX, tsCompressTinyint, tsDecompressTinyint,
     getStatics_u8},
    {TSDB_DATA_TYPE_USMALLINT, 17, SHORT_BYTES, "SMALLINT UNSIGNED", 0, UINT16_MAX, tsCompressSmallint,

--- a/source/dnode/mgmt/mgmt_vnode/src/vmHandle.c
+++ b/source/dnode/mgmt/mgmt_vnode/src/vmHandle.c
@@ -338,6 +338,7 @@ SArray *vmGetMsgHandles() {
  if (dmSetMgmtHandle(pArray, TDMT_SCH_MERGE_QUERY, vmPutMsgToQueryQueue, 0) == NULL) goto _OVER;
  if (dmSetMgmtHandle(pArray, TDMT_SCH_QUERY_CONTINUE, vmPutMsgToQueryQueue, 0) == NULL) goto _OVER;
  if (dmSetMgmtHandle(pArray, TDMT_VND_FETCH_RSMA, vmPutMsgToQueryQueue, 0) == NULL) goto _OVER;
+  if (dmSetMgmtHandle(pArray, TDMT_VND_EXEC_RSMA, vmPutMsgToQueryQueue, 0) == NULL) goto _OVER;
  if (dmSetMgmtHandle(pArray, TDMT_SCH_FETCH, vmPutMsgToFetchQueue, 0) == NULL) goto _OVER;
  if (dmSetMgmtHandle(pArray, TDMT_SCH_MERGE_FETCH, vmPutMsgToFetchQueue, 0) == NULL) goto _OVER;
  if (dmSetMgmtHandle(pArray, TDMT_VND_ALTER_TABLE, vmPutMsgToWriteQueue, 0) == NULL) goto _OVER;

--- a/source/dnode/mnode/impl/inc/mndDef.h
+++ b/source/dnode/mnode/impl/inc/mndDef.h
@@ -636,6 +636,7 @@ typedef struct {
 int32_t tEncodeSStreamObj(SEncoder* pEncoder, const SStreamObj* pObj);
 int32_t tDecodeSStreamObj(SDecoder* pDecoder, SStreamObj* pObj);
+void    tFreeStreamObj(SStreamObj* pObj);
 typedef struct {
  char    streamName[TSDB_STREAM_FNAME_LEN];

--- a/source/dnode/mnode/impl/inc/mndStb.h
+++ b/source/dnode/mnode/impl/inc/mndStb.h
@@ -34,6 +34,7 @@ int32_t mndCheckCreateStbReq(SMCreateStbReq *pCreate);
 SDbObj *mndAcquireDbByStb(SMnode *pMnode, const char *stbName);
 int32_t mndBuildStbFromReq(SMnode *pMnode, SStbObj *pDst, SMCreateStbReq *pCreate, SDbObj *pDb);
 int32_t mndAddStbToTrans(SMnode *pMnode, STrans *pTrans, SDbObj *pDb, SStbObj *pStb);
+void    mndFreeStb(SStbObj *pStb);
 void mndExtractDbNameFromStbFullName(const char *stbFullName, char *dst);
 void mndExtractTbNameFromStbFullName(const char *stbFullName, char *dst, int32_t dstSize);

--- a/source/dnode/mnode/impl/src/mndDef.c
+++ b/source/dnode/mnode/impl/src/mndDef.c
@@ -116,6 +116,25 @@ int32_t tDecodeSStreamObj(SDecoder *pDecoder, SStreamObj *pObj) {
  return 0;
 }
+void tFreeStreamObj(SStreamObj *pStream) {
+  taosMemoryFree(pStream->sql);
+  taosMemoryFree(pStream->ast);
+  taosMemoryFree(pStream->physicalPlan);
+  if (pStream->outputSchema.nCols) taosMemoryFree(pStream->outputSchema.pSchema);
+  int32_t sz = taosArrayGetSize(pStream->tasks);
+  for (int32_t i = 0; i < sz; i++) {
+    SArray *pLevel = taosArrayGetP(pStream->tasks, i);
+    int32_t taskSz = taosArrayGetSize(pLevel);
+    for (int32_t j = 0; j < taskSz; j++) {
+      SStreamTask *pTask = taosArrayGetP(pLevel, j);
+      tFreeSStreamTask(pTask);
+    }
+    taosArrayDestroy(pLevel);
+  }
+  taosArrayDestroy(pStream->tasks);
+}
 SMqVgEp *tCloneSMqVgEp(const SMqVgEp *pVgEp) {
  SMqVgEp *pVgEpNew = taosMemoryMalloc(sizeof(SMqVgEp));
  if (pVgEpNew == NULL) return NULL;

--- a/source/dnode/mnode/impl/src/mndScheduler.c
+++ b/source/dnode/mnode/impl/src/mndScheduler.c
@@ -424,6 +424,8 @@ int32_t mndScheduleStream(SMnode* pMnode, SStreamObj* pStream) {
      }
      mndAddTaskToTaskSet(taskSourceLevel, pTask);
+      pTask->triggerParam = 0;
      // source
      pTask->taskLevel = TASK_LEVEL__SOURCE;

--- a/source/dnode/mnode/impl/src/mndSma.c
+++ b/source/dnode/mnode/impl/src/mndSma.c
@@ -489,7 +489,7 @@ static int32_t mndCreateSma(SMnode *pMnode, SRpcMsg *pReq, SMCreateSmaReq *pCrea
  smaObj.uid = mndGenerateUid(pCreate->name, TSDB_TABLE_FNAME_LEN);
  ASSERT(smaObj.uid != 0);
  char resultTbName[TSDB_TABLE_FNAME_LEN + 16] = {0};
-  snprintf(resultTbName, TSDB_TABLE_FNAME_LEN + 16, "%s_td_tsma_rst_tb",pCreate->name);
+  snprintf(resultTbName, TSDB_TABLE_FNAME_LEN + 16, "%s_td_tsma_rst_tb", pCreate->name);
  memcpy(smaObj.dstTbName, resultTbName, TSDB_TABLE_FNAME_LEN);
  smaObj.dstTbUid = mndGenerateUid(smaObj.dstTbName, TSDB_TABLE_FNAME_LEN);
  smaObj.stbUid = pStb->uid;
@@ -530,7 +530,7 @@ static int32_t mndCreateSma(SMnode *pMnode, SRpcMsg *pReq, SMCreateSmaReq *pCrea
  streamObj.sourceDbUid = pDb->uid;
  streamObj.targetDbUid = pDb->uid;
  streamObj.version = 1;
-  streamObj.sql = pCreate->sql;
+  streamObj.sql = strdup(pCreate->sql);
  streamObj.smaId = smaObj.uid;
  streamObj.watermark = pCreate->watermark;
  streamObj.trigger = STREAM_TRIGGER_WINDOW_CLOSE;
@@ -585,6 +585,7 @@ static int32_t mndCreateSma(SMnode *pMnode, SRpcMsg *pReq, SMCreateSmaReq *pCrea
    return -1;
  }
  if (pAst != NULL) nodesDestroyNode(pAst);
+  nodesDestroyNode((SNode *)pPlan);
  int32_t code = -1;
  STrans *pTrans = mndTransCreate(pMnode, TRN_POLICY_RETRY, TRN_CONFLICT_DB, pReq);
@@ -609,6 +610,7 @@ static int32_t mndCreateSma(SMnode *pMnode, SRpcMsg *pReq, SMCreateSmaReq *pCrea
  code = 0;
 _OVER:
+  tFreeStreamObj(&streamObj);
  mndDestroySmaObj(&smaObj);
  mndTransDrop(pTrans);
  return code;

--- a/source/dnode/mnode/impl/src/mndStb.c
+++ b/source/dnode/mnode/impl/src/mndStb.c
@@ -266,6 +266,15 @@ _OVER:
  return pRow;
 }
+void mndFreeStb(SStbObj *pStb) {
+  taosArrayDestroy(pStb->pFuncs);
+  taosMemoryFreeClear(pStb->pColumns);
+  taosMemoryFreeClear(pStb->pTags);
+  taosMemoryFreeClear(pStb->comment);
+  taosMemoryFreeClear(pStb->pAst1);
+  taosMemoryFreeClear(pStb->pAst2);
+}
 static int32_t mndStbActionInsert(SSdb *pSdb, SStbObj *pStb) {
  mTrace("stb:%s, perform insert action, row:%p", pStb->name, pStb);
  return 0;
@@ -273,12 +282,7 @@ static int32_t mndStbActionInsert(SSdb *pSdb, SStbObj *pStb) {
 static int32_t mndStbActionDelete(SSdb *pSdb, SStbObj *pStb) {
  mTrace("stb:%s, perform delete action, row:%p", pStb->name, pStb);
-  taosArrayDestroy(pStb->pFuncs);
+  mndFreeStb(pStb);
-  taosMemoryFreeClear(pStb->pColumns);
-  taosMemoryFreeClear(pStb->pTags);
-  taosMemoryFreeClear(pStb->comment);
-  taosMemoryFreeClear(pStb->pAst1);
-  taosMemoryFreeClear(pStb->pAst2);
  return 0;
 }
@@ -438,6 +442,8 @@ static void *mndBuildVCreateStbReq(SMnode *pMnode, SVgObj *pVgroup, SStbObj *pSt
  if (req.rollup) {
    req.rsmaParam.maxdelay[0] = pStb->maxdelay[0];
    req.rsmaParam.maxdelay[1] = pStb->maxdelay[1];
+    req.rsmaParam.watermark[0] = pStb->watermark[0];
+    req.rsmaParam.watermark[1] = pStb->watermark[1];
    if (pStb->ast1Len > 0) {
      if (mndConvertRsmaTask(&req.rsmaParam.qmsg[0], &req.rsmaParam.qmsgLen[0], pStb->pAst1, pStb->uid,
                             STREAM_TRIGGER_WINDOW_CLOSE, req.rsmaParam.watermark[0]) < 0) {
@@ -2021,8 +2027,7 @@ static int32_t mndCheckDropStbForTopic(SMnode *pMnode, const char *stbFullName,
    FOREACH(pNode, pNodeList) {
      SColumnNode *pCol = (SColumnNode *)pNode;
-      if (pCol->tableId != suid) {
+      if (pCol->tableId == suid) {
-        mDebug("topic:%s, check colId:%d passed", pTopic->name, pCol->colId);
        sdbRelease(pSdb, pTopic);
        nodesDestroyNode(pAst);
        return -1;
@@ -2045,6 +2050,16 @@ static int32_t mndCheckDropStbForStream(SMnode *pMnode, const char *stbFullName,
    pIter = sdbFetch(pSdb, SDB_STREAM, pIter, (void **)&pStream);
    if (pIter == NULL) break;
+    if (pStream->smaId != 0) {
+      sdbRelease(pSdb, pStream);
+      continue;
+    }
+    if (pStream->targetStbUid == suid) {
+      sdbRelease(pSdb, pStream);
+      return -1;
+    }
    SNode *pAst = NULL;
    if (nodesStringToNode(pStream->ast, &pAst) != 0) {
      ASSERT(0);
@@ -2057,8 +2072,7 @@ static int32_t mndCheckDropStbForStream(SMnode *pMnode, const char *stbFullName,
    FOREACH(pNode, pNodeList) {
      SColumnNode *pCol = (SColumnNode *)pNode;
-      if (pCol->tableId != suid) {
+      if (pCol->tableId == suid) {
-        mDebug("stream:%s, check colId:%d passed", pStream->name, pCol->colId);
        sdbRelease(pSdb, pStream);
        nodesDestroyNode(pAst);
        return -1;

--- a/source/dnode/mnode/impl/src/mndStream.c
+++ b/source/dnode/mnode/impl/src/mndStream.c
@@ -167,6 +167,9 @@ static int32_t mndStreamActionInsert(SSdb *pSdb, SStreamObj *pStream) {
 static int32_t mndStreamActionDelete(SSdb *pSdb, SStreamObj *pStream) {
  mTrace("stream:%s, perform delete action", pStream->name);
+  taosWLockLatch(&pStream->lock);
+  tFreeStreamObj(pStream);
+  taosWUnLockLatch(&pStream->lock);
  return 0;
 }
@@ -493,10 +496,17 @@ static int32_t mndCreateStbForStream(SMnode *pMnode, STrans *pTrans, const SStre
  stbObj.uid = pStream->targetStbUid;
-  if (mndAddStbToTrans(pMnode, pTrans, pDb, &stbObj) < 0) goto _OVER;
+  if (mndAddStbToTrans(pMnode, pTrans, pDb, &stbObj) < 0) {
+    mndFreeStb(&stbObj);
+    goto _OVER;
+  }
+  tFreeSMCreateStbReq(&createReq);
+  mndFreeStb(&stbObj);
  return 0;
 _OVER:
+  tFreeSMCreateStbReq(&createReq);
  mndReleaseStb(pMnode, pStb);
  mndReleaseDb(pMnode, pDb);
  return -1;
@@ -715,6 +725,7 @@ _OVER:
  mndReleaseDb(pMnode, pDb);
  tFreeSCMCreateStreamReq(&createStreamReq);
+  tFreeStreamObj(&streamObj);
  return code;
 }

--- a/source/dnode/mnode/impl/src/mndVgroup.c
+++ b/source/dnode/mnode/impl/src/mndVgroup.c
@@ -509,6 +509,11 @@ int32_t mndAllocSmaVgroup(SMnode *pMnode, SDbObj *pDb, SVgObj *pVgroup) {
  pVgroup->replica = 1;
-  if (mndGetAvailableDnode(pMnode, pDb, pVgroup, pArray) != 0) return -1;
+  if (mndGetAvailableDnode(pMnode, pDb, pVgroup, pArray) != 0) {
+    // free the dnode array on the failure path too; otherwise it leaks on every failed allocation
+    taosArrayDestroy(pArray);
+    return -1;
+  }
+  taosArrayDestroy(pArray);
  mInfo("db:%s, sma vgId:%d is alloced", pDb->name, pVgroup->vgId);
  return 0;
@@ -1862,4 +1863,4 @@ _OVER:
 #endif
 }
 bool mndVgroupInDb(SVgObj *pVgroup, int64_t dbUid) { return !pVgroup->isTsma && pVgroup->dbUid == dbUid; }
\ No newline at end of file
--- a/source/dnode/vnode/inc/vnode.h
+++ b/source/dnode/vnode/inc/vnode.h
@@ -63,6 +63,7 @@ void    vnodeGetInfo(SVnode *pVnode, const char **dbname, int32_t *vgId);
 int32_t vnodeProcessCreateTSma(SVnode *pVnode, void *pCont, uint32_t contLen);
 int32_t vnodeGetAllTableList(SVnode *pVnode, uint64_t uid, SArray *list);
 int32_t vnodeGetCtbIdList(SVnode *pVnode, int64_t suid, SArray *list);
+int32_t vnodeGetStbIdList(SVnode *pVnode, int64_t suid, SArray* list);
 void   *vnodeGetIdx(SVnode *pVnode);
 void   *vnodeGetIvtIdx(SVnode *pVnode);
@@ -95,6 +96,7 @@ int32_t     metaGetTableTags(SMeta *pMeta, uint64_t suid, SArray *uidList, SHash
 int32_t     metaReadNext(SMetaReader *pReader);
 const void *metaGetTableTagVal(void *tag, int16_t type, STagVal *tagVal);
 int         metaGetTableNameByUid(void *meta, uint64_t uid, char *tbName);
+bool        metaIsTableExist(SMeta  *pMeta, tb_uid_t uid);
 typedef struct SMetaFltParam {
  tb_uid_t suid;

--- a/source/dnode/vnode/src/inc/meta.h
+++ b/source/dnode/vnode/src/inc/meta.h
@@ -66,7 +66,6 @@ int32_t metaCacheOpen(SMeta* pMeta);
 void    metaCacheClose(SMeta* pMeta);
 int32_t metaCacheUpsert(SMeta* pMeta, SMetaInfo* pInfo);
 int32_t metaCacheDrop(SMeta* pMeta, int64_t uid);
-int32_t metaCacheGet(SMeta* pMeta, int64_t uid, SMetaInfo* pInfo);
 struct SMeta {
  TdThreadRwlock lock;

--- a/source/dnode/vnode/src/inc/sma.h
+++ b/source/dnode/vnode/src/inc/sma.h
@@ -32,7 +32,8 @@ extern "C" {
 #define smaTrace(...) do { if (smaDebugFlag & DEBUG_TRACE) { taosPrintLog("SMA ", DEBUG_TRACE, tsdbDebugFlag, __VA_ARGS__); }} while(0)
 // clang-format on
-#define RSMA_TASK_INFO_HASH_SLOT 8
+#define RSMA_TASK_INFO_HASH_SLOT (8)
+#define RSMA_EXECUTOR_MAX        (1)
 typedef struct SSmaEnv       SSmaEnv;
 typedef struct SSmaStat      SSmaStat;
@@ -57,9 +58,10 @@ typedef struct {
  void   *tmrHandle;  // shared by all fetch tasks
 } SSmaMgmt;
-#define SMA_ENV_LOCK(env) (&(env)->lock)
+#define SMA_ENV_LOCK(env)  (&(env)->lock)
-#define SMA_ENV_TYPE(env) ((env)->type)
+#define SMA_ENV_TYPE(env)  ((env)->type)
-#define SMA_ENV_STAT(env) ((env)->pStat)
+#define SMA_ENV_STAT(env)  ((env)->pStat)
+#define SMA_RSMA_STAT(sma) ((SRSmaStat *)SMA_ENV_STAT((SSmaEnv *)(sma)->pRSmaEnv))
 struct STSmaStat {
  int8_t    state;  // ETsdbSmaStat
@@ -86,15 +88,17 @@ struct SQTaskFWriter {
 };
 struct SRSmaStat {
-  SSma     *pSma;
+  SSma            *pSma;
-  int64_t   commitAppliedVer;  // vnode applied version for async commit
+  int64_t          commitAppliedVer;  // vnode applied version for async commit
-  int64_t   refId;             // shared by fetch tasks
+  int64_t          refId;             // shared by fetch tasks
-  SRWLatch  lock;              // r/w lock for rsma fs(e.g. qtaskinfo)
+  volatile int64_t nBufItems;         // number of items in queue buffer
-  int8_t    triggerStat;       // shared by fetch tasks
+  SRWLatch         lock;              // r/w lock for rsma fs(e.g. qtaskinfo)
-  int8_t    commitStat;        // 0 not in committing, 1 in committing
+  volatile int8_t  nExecutor;         // [1, max(half of query threads, 4)]
-  SArray   *aTaskFile;         // qTaskFiles committed recently(for recovery/snapshot r/w)
+  int8_t           triggerStat;       // shared by fetch tasks
-  SHashObj *rsmaInfoHash;      // key: stbUid, value: SRSmaInfo;
+  int8_t           commitStat;        // 0 not in committing, 1 in committing
-  SHashObj *iRsmaInfoHash;     // key: stbUid, value: SRSmaInfo; immutable rsmaInfoHash
+  SArray          *aTaskFile;         // qTaskFiles committed recently(for recovery/snapshot r/w)
+  SHashObj        *infoHash;          // key: suid, value: SRSmaInfo
+  tsem_t           notEmpty;          // has items in queue buffer
 };
 struct SSmaStat {
@@ -105,34 +109,42 @@ struct SSmaStat {
  T_REF_DECLARE()
 };
-#define SMA_TSMA_STAT(s)      (&(s)->tsmaStat)
+#define SMA_STAT_TSMA(s)     (&(s)->tsmaStat)
-#define SMA_RSMA_STAT(s)      (&(s)->rsmaStat)
+#define SMA_STAT_RSMA(s)     (&(s)->rsmaStat)
-#define RSMA_INFO_HASH(r)     ((r)->rsmaInfoHash)
+#define RSMA_INFO_HASH(r)    ((r)->infoHash)
-#define RSMA_IMU_INFO_HASH(r) ((r)->iRsmaInfoHash)
+#define RSMA_TRIGGER_STAT(r) (&(r)->triggerStat)
-#define RSMA_TRIGGER_STAT(r)  (&(r)->triggerStat)
+#define RSMA_COMMIT_STAT(r)  (&(r)->commitStat)
-#define RSMA_COMMIT_STAT(r)   (&(r)->commitStat)
+#define RSMA_REF_ID(r)       ((r)->refId)
-#define RSMA_REF_ID(r)        ((r)->refId)
+#define RSMA_FS_LOCK(r)      (&(r)->lock)
-#define RSMA_FS_LOCK(r)       (&(r)->lock)
 struct SRSmaInfoItem {
-  int8_t  level;
+  int8_t   level : 4;
-  int8_t  triggerStat;
+  int8_t   fetchLevel : 4;
-  int32_t maxDelay;
+  int8_t   triggerStat;
-  tmr_h   tmrId;
+  uint16_t nSkipped;
+  int32_t  maxDelay;  // ms
+  tmr_h    tmrId;
 };
 struct SRSmaInfo {
  STSchema *pTSchema;
  int64_t   suid;
-  int64_t   refId;  // refId of SRSmaStat
+  int64_t   refId;     // refId of SRSmaStat
+  int64_t   lastRecv;  // ms
+  int8_t    assigned;  // 0 idle, 1 assigned for exec
  int8_t    delFlag;
+  int16_t   padding;
  T_REF_DECLARE()
  SRSmaInfoItem items[TSDB_RETENTION_L2];
  void         *taskInfo[TSDB_RETENTION_L2];   // qTaskInfo_t
-  void         *iTaskInfo[TSDB_RETENTION_L2];  // immutable
+  STaosQueue   *queue;                         // buffer queue of SubmitReq
+  STaosQall    *qall;                          // buffer qall of SubmitReq
+  void         *iTaskInfo[TSDB_RETENTION_L2];  // immutable qTaskInfo_t
+  STaosQueue   *iQueue;                        // immutable buffer queue of SubmitReq
+  STaosQall    *iQall;                         // immutable buffer qall of SubmitReq
 };
-#define RSMA_INFO_HEAD_LEN     32
+#define RSMA_INFO_HEAD_LEN     offsetof(SRSmaInfo, items)
 #define RSMA_INFO_IS_DEL(r)    ((r)->delFlag == 1)
 #define RSMA_INFO_SET_DEL(r)   ((r)->delFlag = 1)
 #define RSMA_INFO_QTASK(r, i)  ((r)->taskInfo[i])
@@ -161,6 +173,12 @@ enum {
  RSMA_RESTORE_SYNC = 2,
 };
+typedef enum {
+  RSMA_EXEC_OVERFLOW = 1,  // triggered by queue buf overflow
+  RSMA_EXEC_TIMEOUT = 2,   // triggered by timer
+  RSMA_EXEC_COMMIT = 3,    // triggered by commit
+} ERsmaExecType;
 void  tdDestroySmaEnv(SSmaEnv *pSmaEnv);
 void *tdFreeSmaEnv(SSmaEnv *pSmaEnv);
@@ -228,12 +246,13 @@ static FORCE_INLINE void tdSmaStatSetDropped(STSmaStat *pTStat) {
 void           tdRSmaQTaskInfoGetFileName(int32_t vid, int64_t version, char *outputName);
 void           tdRSmaQTaskInfoGetFullName(int32_t vid, int64_t version, const char *path, char *outputName);
-int32_t        tdCloneRSmaInfo(SSma *pSma, SRSmaInfo **pDest, SRSmaInfo *pSrc);
+int32_t        tdCloneRSmaInfo(SSma *pSma, SRSmaInfo *pInfo);
 void           tdFreeQTaskInfo(qTaskInfo_t *taskHandle, int32_t vgId, int32_t level);
 static int32_t tdDestroySmaState(SSmaStat *pSmaStat, int8_t smaType);
 void          *tdFreeSmaState(SSmaStat *pSmaStat, int8_t smaType);
 void          *tdFreeRSmaInfo(SSma *pSma, SRSmaInfo *pInfo, bool isDeepFree);
 int32_t        tdRSmaPersistExecImpl(SRSmaStat *pRSmaStat, SHashObj *pInfoHash);
+int32_t        tdRSmaProcessExecImpl(SSma *pSma, ERsmaExecType type);
 int32_t tdProcessRSmaCreateImpl(SSma *pSma, SRSmaParam *param, int64_t suid, const char *tbName);
 int32_t tdProcessRSmaRestoreImpl(SSma *pSma, int8_t type, int64_t qtaskFileVer);

--- a/source/dnode/vnode/src/inc/tsdb.h
+++ b/source/dnode/vnode/src/inc/tsdb.h
@@ -45,7 +45,7 @@ typedef struct SBlockIdx     SBlockIdx;
 typedef struct SBlock        SBlock;
 typedef struct SBlockL       SBlockL;
 typedef struct SColData      SColData;
-typedef struct SBlockDataHdr SBlockDataHdr;
+typedef struct SDiskDataHdr  SDiskDataHdr;
 typedef struct SBlockData    SBlockData;
 typedef struct SDelFile      SDelFile;
 typedef struct SHeadFile     SHeadFile;
@@ -61,7 +61,11 @@ typedef struct SRowIter      SRowIter;
 typedef struct STsdbFS       STsdbFS;
 typedef struct SRowMerger    SRowMerger;
 typedef struct STsdbReadSnap STsdbReadSnap;
+typedef struct SBlockInfo    SBlockInfo;
+typedef struct SSmaInfo      SSmaInfo;
+typedef struct SBlockCol     SBlockCol;
+#define TSDB_FILE_DLMT     ((uint32_t)0xF00AFA0F)
 #define TSDB_MAX_SUBBLOCKS 8
 #define TSDB_FHDR_SIZE     512
@@ -113,10 +117,14 @@ int32_t tPutBlock(uint8_t *p, void *ph);
 int32_t tGetBlock(uint8_t *p, void *ph);
 int32_t tBlockCmprFn(const void *p1, const void *p2);
 bool    tBlockHasSma(SBlock *pBlock);
+// SBlockL
+int32_t tPutBlockL(uint8_t *p, void *ph);
+int32_t tGetBlockL(uint8_t *p, void *ph);
 // SBlockIdx
 int32_t tPutBlockIdx(uint8_t *p, void *ph);
 int32_t tGetBlockIdx(uint8_t *p, void *ph);
 int32_t tCmprBlockIdx(void const *lhs, void const *rhs);
+int32_t tCmprBlockL(void const *lhs, void const *rhs);
 // SColdata
 void    tColDataInit(SColData *pColData, int16_t cid, int8_t type, int8_t smaOn);
 void    tColDataReset(SColData *pColData);
@@ -131,20 +139,25 @@ int32_t tGetColData(uint8_t *p, SColData *pColData);
 #define tBlockDataLastRow(PBLOCKDATA)  tsdbRowFromBlockData(PBLOCKDATA, (PBLOCKDATA)->nRow - 1)
 #define tBlockDataFirstKey(PBLOCKDATA) TSDBROW_KEY(&tBlockDataFirstRow(PBLOCKDATA))
 #define tBlockDataLastKey(PBLOCKDATA)  TSDBROW_KEY(&tBlockDataLastRow(PBLOCKDATA))
-int32_t   tBlockDataInit(SBlockData *pBlockData);
+int32_t   tBlockDataCreate(SBlockData *pBlockData);
+void      tBlockDataDestroy(SBlockData *pBlockData, int8_t deepClear);
+int32_t   tBlockDataInit(SBlockData *pBlockData, int64_t suid, int64_t uid, STSchema *pTSchema);
+int32_t   tBlockDataInitEx(SBlockData *pBlockData, SBlockData *pBlockDataFrom);
 void      tBlockDataReset(SBlockData *pBlockData);
-int32_t   tBlockDataSetSchema(SBlockData *pBlockData, STSchema *pTSchema);
+int32_t   tBlockDataAppendRow(SBlockData *pBlockData, TSDBROW *pRow, STSchema *pTSchema, int64_t uid);
-int32_t   tBlockDataCorrectSchema(SBlockData *pBlockData, SBlockData *pBlockDataFrom);
+void      tBlockDataClear(SBlockData *pBlockData);
-void      tBlockDataClearData(SBlockData *pBlockData);
-void      tBlockDataClear(SBlockData *pBlockData, int8_t deepClear);
-int32_t   tBlockDataAddColData(SBlockData *pBlockData, int32_t iColData, SColData **ppColData);
-int32_t   tBlockDataAppendRow(SBlockData *pBlockData, TSDBROW *pRow, STSchema *pTSchema);
-int32_t   tBlockDataMerge(SBlockData *pBlockData1, SBlockData *pBlockData2, SBlockData *pBlockData);
-int32_t   tBlockDataCopy(SBlockData *pBlockDataSrc, SBlockData *pBlockDataDest);
 SColData *tBlockDataGetColDataByIdx(SBlockData *pBlockData, int32_t idx);
 void      tBlockDataGetColData(SBlockData *pBlockData, int16_t cid, SColData **ppColData);
-int32_t   tPutBlockData(uint8_t *p, SBlockData *pBlockData);
+int32_t   tBlockDataCopy(SBlockData *pBlockDataSrc, SBlockData *pBlockDataDest);
-int32_t   tGetBlockData(uint8_t *p, SBlockData *pBlockData);
+int32_t   tBlockDataMerge(SBlockData *pBlockData1, SBlockData *pBlockData2, SBlockData *pBlockData);
+int32_t   tBlockDataAddColData(SBlockData *pBlockData, int32_t iColData, SColData **ppColData);
+int32_t   tCmprBlockData(SBlockData *pBlockData, int8_t cmprAlg, uint8_t **ppOut, int32_t *szOut, uint8_t *aBuf[],
+                         int32_t aBufN[]);
+int32_t   tDecmprBlockData(uint8_t *pIn, int32_t szIn, SBlockData *pBlockData, uint8_t *aBuf[]);
+// SDiskDataHdr
+int32_t tPutDiskDataHdr(uint8_t *p, void *ph);
+int32_t tGetDiskDataHdr(uint8_t *p, void *ph);
 // SDelIdx
 int32_t tPutDelIdx(uint8_t *p, void *ph);
 int32_t tGetDelIdx(uint8_t *p, void *ph);
@@ -168,13 +181,25 @@ void    tsdbFidKeyRange(int32_t fid, int32_t minutes, int8_t precision, TSKEY *m
 int32_t tsdbFidLevel(int32_t fid, STsdbKeepCfg *pKeepCfg, int64_t now);
 int32_t tsdbBuildDeleteSkyline(SArray *aDelData, int32_t sidx, int32_t eidx, SArray *aSkyline);
 void    tsdbCalcColDataSMA(SColData *pColData, SColumnDataAgg *pColAgg);
+int32_t tPutColumnDataAgg(uint8_t *p, SColumnDataAgg *pColAgg);
+int32_t tGetColumnDataAgg(uint8_t *p, SColumnDataAgg *pColAgg);
+int32_t tsdbCmprData(uint8_t *pIn, int32_t szIn, int8_t type, int8_t cmprAlg, uint8_t **ppOut, int32_t nOut,
+                     int32_t *szOut, uint8_t **ppBuf);
+int32_t tsdbDecmprData(uint8_t *pIn, int32_t szIn, int8_t type, int8_t cmprAlg, uint8_t **ppOut, int32_t szOut,
+                       uint8_t **ppBuf);
+int32_t tsdbCmprColData(SColData *pColData, int8_t cmprAlg, SBlockCol *pBlockCol, uint8_t **ppOut, int32_t nOut,
+                        uint8_t **ppBuf);
+int32_t tsdbDecmprColData(uint8_t *pIn, SBlockCol *pBlockCol, int8_t cmprAlg, int32_t nVal, SColData *pColData,
+                          uint8_t **ppBuf);
+int32_t tsdbReadAndCheck(TdFilePtr pFD, int64_t offset, uint8_t **ppOut, int32_t size, int8_t toCheck);
 // tsdbMemTable ==============================================================================================
 // SMemTable
-int32_t tsdbMemTableCreate(STsdb *pTsdb, SMemTable **ppMemTable);
+int32_t  tsdbMemTableCreate(STsdb *pTsdb, SMemTable **ppMemTable);
-void    tsdbMemTableDestroy(SMemTable *pMemTable);
+void     tsdbMemTableDestroy(SMemTable *pMemTable);
-void    tsdbGetTbDataFromMemTable(SMemTable *pMemTable, tb_uid_t suid, tb_uid_t uid, STbData **ppTbData);
+STbData *tsdbGetTbDataFromMemTable(SMemTable *pMemTable, tb_uid_t suid, tb_uid_t uid);
-void    tsdbRefMemTable(SMemTable *pMemTable);
+void     tsdbRefMemTable(SMemTable *pMemTable);
-void    tsdbUnrefMemTable(SMemTable *pMemTable);
+void     tsdbUnrefMemTable(SMemTable *pMemTable);
+SArray  *tsdbMemTableGetTbDataArray(SMemTable *pMemTable);
 // STbDataIter
 int32_t  tsdbTbDataIterCreate(STbData *pTbData, TSDBKEY *pFrom, int8_t backward, STbDataIter **ppIter);
 void    *tsdbTbDataIterDestroy(STbDataIter *pIter);
@@ -223,33 +248,33 @@ int32_t tsdbFSUpsertDelFile(STsdbFS *pFS, SDelFile *pDelFile);
 int32_t tsdbDataFWriterOpen(SDataFWriter **ppWriter, STsdb *pTsdb, SDFileSet *pSet);
 int32_t tsdbDataFWriterClose(SDataFWriter **ppWriter, int8_t sync);
 int32_t tsdbUpdateDFileSetHeader(SDataFWriter *pWriter);
-int32_t tsdbWriteBlockIdx(SDataFWriter *pWriter, SArray *aBlockIdx, uint8_t **ppBuf);
+int32_t tsdbWriteBlockIdx(SDataFWriter *pWriter, SArray *aBlockIdx);
-int32_t tsdbWriteBlock(SDataFWriter *pWriter, SMapData *pMapData, uint8_t **ppBuf, SBlockIdx *pBlockIdx);
+int32_t tsdbWriteBlock(SDataFWriter *pWriter, SMapData *pMapData, SBlockIdx *pBlockIdx);
-int32_t tsdbWriteBlockData(SDataFWriter *pWriter, SBlockData *pBlockData, uint8_t **ppBuf1, uint8_t **ppBuf2,
+int32_t tsdbWriteBlockL(SDataFWriter *pWriter, SArray *aBlockL);
-                           SBlockIdx *pBlockIdx, SBlock *pBlock, int8_t cmprAlg);
+int32_t tsdbWriteBlockData(SDataFWriter *pWriter, SBlockData *pBlockData, SBlockInfo *pBlkInfo, SSmaInfo *pSmaInfo,
+                           int8_t cmprAlg, int8_t toLast);
 int32_t tsdbDFileSetCopy(STsdb *pTsdb, SDFileSet *pSetFrom, SDFileSet *pSetTo);
 // SDataFReader
 int32_t tsdbDataFReaderOpen(SDataFReader **ppReader, STsdb *pTsdb, SDFileSet *pSet);
 int32_t tsdbDataFReaderClose(SDataFReader **ppReader);
-int32_t tsdbReadBlockIdx(SDataFReader *pReader, SArray *aBlockIdx, uint8_t **ppBuf);
+int32_t tsdbReadBlockIdx(SDataFReader *pReader, SArray *aBlockIdx);
-int32_t tsdbReadBlock(SDataFReader *pReader, SBlockIdx *pBlockIdx, SMapData *pMapData, uint8_t **ppBuf);
+int32_t tsdbReadBlock(SDataFReader *pReader, SBlockIdx *pBlockIdx, SMapData *pMapData);
-int32_t tsdbReadColData(SDataFReader *pReader, SBlockIdx *pBlockIdx, SBlock *pBlock, int16_t *aColId, int32_t nCol,
+int32_t tsdbReadBlockL(SDataFReader *pReader, SArray *aBlockL);
-                        SBlockData *pBlockData, uint8_t **ppBuf1, uint8_t **ppBuf2);
+int32_t tsdbReadBlockSma(SDataFReader *pReader, SBlock *pBlock, SArray *aColumnDataAgg);
-int32_t tsdbReadBlockData(SDataFReader *pReader, SBlockIdx *pBlockIdx, SBlock *pBlock, SBlockData *pBlockData,
+int32_t tsdbReadDataBlock(SDataFReader *pReader, SBlock *pBlock, SBlockData *pBlockData);
-                          uint8_t **ppBuf1, uint8_t **ppBuf2);
+int32_t tsdbReadLastBlock(SDataFReader *pReader, SBlockL *pBlockL, SBlockData *pBlockData);
-int32_t tsdbReadBlockSma(SDataFReader *pReader, SBlock *pBlock, SArray *aColumnDataAgg, uint8_t **ppBuf);
 // SDelFWriter
 int32_t tsdbDelFWriterOpen(SDelFWriter **ppWriter, SDelFile *pFile, STsdb *pTsdb);
 int32_t tsdbDelFWriterClose(SDelFWriter **ppWriter, int8_t sync);
-int32_t tsdbWriteDelData(SDelFWriter *pWriter, SArray *aDelData, uint8_t **ppBuf, SDelIdx *pDelIdx);
+int32_t tsdbWriteDelData(SDelFWriter *pWriter, SArray *aDelData, SDelIdx *pDelIdx);
-int32_t tsdbWriteDelIdx(SDelFWriter *pWriter, SArray *aDelIdx, uint8_t **ppBuf);
+int32_t tsdbWriteDelIdx(SDelFWriter *pWriter, SArray *aDelIdx);
 int32_t tsdbUpdateDelFileHdr(SDelFWriter *pWriter);
 // SDelFReader
-int32_t tsdbDelFReaderOpen(SDelFReader **ppReader, SDelFile *pFile, STsdb *pTsdb, uint8_t **ppBuf);
+int32_t tsdbDelFReaderOpen(SDelFReader **ppReader, SDelFile *pFile, STsdb *pTsdb);
 int32_t tsdbDelFReaderClose(SDelFReader **ppReader);
-int32_t tsdbReadDelData(SDelFReader *pReader, SDelIdx *pDelIdx, SArray *aDelData, uint8_t **ppBuf);
+int32_t tsdbReadDelData(SDelFReader *pReader, SDelIdx *pDelIdx, SArray *aDelData);
-int32_t tsdbReadDelIdx(SDelFReader *pReader, SArray *aDelIdx, uint8_t **ppBuf);
+int32_t tsdbReadDelIdx(SDelFReader *pReader, SArray *aDelIdx);
 // tsdbRead.c ==============================================================================================
 int32_t tsdbTakeReadSnap(STsdb *pTsdb, STsdbReadSnap **ppSnap);
 void    tsdbUntakeReadSnap(STsdb *pTsdb, STsdbReadSnap *pSnap);
@@ -260,7 +285,7 @@ void    tsdbUntakeReadSnap(STsdb *pTsdb, STsdbReadSnap *pSnap);
 // tsdbCache
 int32_t tsdbOpenCache(STsdb *pTsdb);
-void    tsdbCloseCache(SLRUCache *pCache);
+void    tsdbCloseCache(STsdb *pTsdb);
 int32_t tsdbCacheInsertLast(SLRUCache *pCache, tb_uid_t uid, STSRow *row, STsdb *pTsdb);
 int32_t tsdbCacheInsertLastrow(SLRUCache *pCache, STsdb *pTsdb, tb_uid_t uid, STSRow *row, bool dup);
 int32_t tsdbCacheGetLastH(SLRUCache *pCache, tb_uid_t uid, STsdb *pTsdb, LRUHandle **h);
@@ -277,13 +302,6 @@ size_t tsdbCacheGetCapacity(SVnode *pVnode);
 int32_t tsdbCacheLastArray2Row(SArray *pLastArray, STSRow **ppRow, STSchema *pSchema);
 // structs =======================
-typedef struct {
-  int   minFid;
-  int   midFid;
-  int   maxFid;
-  TSKEY minKey;
-} SRtn;
 struct STsdbFS {
  SDelFile *pDelFile;
  SArray   *aDFileSet;  // SArray<SDFileSet>
@@ -298,6 +316,7 @@ struct STsdb {
  SMemTable     *imem;
  STsdbFS        fs;
  SLRUCache     *lruCache;
+  TdThreadMutex  lruMutex;
 };
 struct TSDBKEY {
@@ -311,30 +330,23 @@ struct SMemSkipListNode {
  SMemSkipListNode *forwards[0];
 };
 typedef struct SMemSkipList {
-  uint32_t          seed;
  int64_t           size;
+  uint32_t          seed;
  int8_t            maxLevel;
  int8_t            level;
  SMemSkipListNode *pHead;
  SMemSkipListNode *pTail;
 } SMemSkipList;
-struct SDelDataInfo {
-  tb_uid_t suid;
-  tb_uid_t uid;
-};
 struct STbData {
  tb_uid_t     suid;
  tb_uid_t     uid;
  TSKEY        minKey;
  TSKEY        maxKey;
-  int64_t      minVersion;
-  int64_t      maxVersion;
-  int32_t      maxSkmVer;
  SDelData    *pHead;
  SDelData    *pTail;
  SMemSkipList sl;
+  STbData     *next;
 };
 struct SMemTable {
@@ -344,11 +356,13 @@ struct SMemTable {
  volatile int32_t nRef;
  TSKEY            minKey;
  TSKEY            maxKey;
-  int64_t          minVersion;
-  int64_t          maxVersion;
  int64_t          nRow;
  int64_t          nDel;
-  SArray          *aTbData;  // SArray<STbData*>
+  struct {
+    int32_t   nTbData;
+    int32_t   nBucket;
+    STbData **aBucket;
+  };
 };
 struct TSDBROW {
@@ -379,63 +393,51 @@ struct SMapData {
  uint8_t *pData;
 };
-typedef struct {
+struct SBlockCol {
  int16_t cid;
  int8_t  type;
  int8_t  smaOn;
-  int8_t  flag;  // HAS_NONE|HAS_NULL|HAS_VALUE
+  int8_t  flag;      // HAS_NONE|HAS_NULL|HAS_VALUE
-  int32_t offset;
-  int32_t szBitmap;  // bitmap size
-  int32_t szOffset;  // size of offset, only for variant-length data type
-  int32_t szValue;   // compressed column value size
  int32_t szOrigin;  // original column value size (only save for variant data type)
-} SBlockCol;
+  int32_t szBitmap;  // bitmap size, 0 only for flag == HAS_VAL
+  int32_t szOffset;  // offset size, 0 only for non-variant-length type
-typedef struct {
+  int32_t szValue;   // value size, 0 when flag == (HAS_NULL | HAS_NONE)
-  int32_t nRow;
+  int32_t offset;
-  int8_t  cmprAlg;
+};
-  int64_t offset;      // block data offset
-  int32_t szBlockCol;  // SBlockCol size
+struct SBlockInfo {
-  int32_t szVersion;   // VERSION size
+  int64_t offset;  // block data offset
-  int32_t szTSKEY;     // TSKEY size
+  int32_t szBlock;
-  int32_t szBlock;     // total block size
+  int32_t szKey;
-  int64_t sOffset;     // sma offset
+};
-  int32_t nSma;        // sma size
-} SSubBlock;
+struct SSmaInfo {
+  int64_t offset;
+  int32_t size;
+};
 struct SBlock {
-  TSDBKEY   minKey;
+  TSDBKEY    minKey;
-  TSDBKEY   maxKey;
+  TSDBKEY    maxKey;
-  int64_t   minVersion;
+  int64_t    minVer;
-  int64_t   maxVersion;
+  int64_t    maxVer;
-  int32_t   nRow;
+  int32_t    nRow;
-  int8_t    last;
+  int8_t     hasDup;
-  int8_t    hasDup;
+  int8_t     nSubBlock;
-  int8_t    nSubBlock;
+  SBlockInfo aSubBlock[TSDB_MAX_SUBBLOCKS];
-  SSubBlock aSubBlock[TSDB_MAX_SUBBLOCKS];
+  SSmaInfo   smaInfo;
 };
 struct SBlockL {
-  struct {
+  int64_t    suid;
-    int64_t uid;
+  int64_t    minUid;
-    int64_t version;
+  int64_t    maxUid;
-    TSKEY   ts;
+  TSKEY      minKey;
-  } minKey;
+  TSKEY      maxKey;
-  struct {
+  int64_t    minVer;
-    int64_t uid;
+  int64_t    maxVer;
-    int64_t version;
+  int32_t    nRow;
-    TSKEY   ts;
+  SBlockInfo bInfo;
-  } maxKey;
-  int64_t minVer;
-  int64_t maxVer;
-  int32_t nRow;
-  int8_t  cmprAlg;
-  int64_t offset;
-  int32_t szBlock;
-  int32_t szBlockCol;
-  int32_t szUid;
-  int32_t szVer;
-  int32_t szTSKEY;
 };
 struct SColData {
@@ -450,10 +452,17 @@ struct SColData {
  uint8_t *pData;
 };
+// (SBlockData){.suid = 0, .uid = 0}: block data not initialized
+// (SBlockData){.suid = suid, .uid = uid}: block data for ONE child table in .data file
+// (SBlockData){.suid = suid, .uid = 0}: block data for N child tables in .last file
+// (SBlockData){.suid = 0, .uid = uid}: block data for 1 normal table in .last/.data file
 struct SBlockData {
-  int32_t  nRow;
+  int64_t  suid;      // 0 means normal table block data, otherwise child table block data
-  int64_t *aVersion;
+  int64_t  uid;       // 0 means block data in .last file, otherwise in .data file
-  TSKEY   *aTSKEY;
+  int32_t  nRow;      // number of rows
+  int64_t *aUid;      // uids of each row, only exist in block data in .last file (uid == 0)
+  int64_t *aVersion;  // versions of each row
+  TSKEY   *aTSKEY;    // timestamp of each row
  SArray  *aIdx;      // SArray<int32_t>
  SArray  *aColData;  // SArray<SColData>
 };
@@ -491,13 +500,18 @@ struct SDelIdx {
  int64_t  size;
 };
-#pragma pack(push, 1)
+struct SDiskDataHdr {
-struct SBlockDataHdr {
  uint32_t delimiter;
+  uint32_t fmtVer;
  int64_t  suid;
  int64_t  uid;
+  int32_t  szUid;
+  int32_t  szVer;
+  int32_t  szKey;
+  int32_t  szBlkCol;
+  int32_t  nRow;
+  int8_t   cmprAlg;
 };
-#pragma pack(pop)
 struct SDelFile {
  volatile int32_t nRef;
@@ -527,6 +541,7 @@ struct SLastFile {
  int64_t commitID;
  int64_t size;
+  int64_t offset;
 };
 struct SSmaFile {
@@ -561,6 +576,8 @@ struct SDelFWriter {
  STsdb    *pTsdb;
  SDelFile  fDel;
  TdFilePtr pWriteH;
+  uint8_t *aBuf[1];
 };
 struct SDataFWriter {
@@ -576,6 +593,8 @@ struct SDataFWriter {
  SDataFile fData;
  SLastFile fLast;
  SSmaFile  fSma;
+  uint8_t *aBuf[4];
 };
 struct STsdbReadSnap {

--- a/source/dnode/vnode/src/inc/vnd.h
+++ b/source/dnode/vnode/src/inc/vnd.h
@@ -65,6 +65,7 @@ struct SVBufPool {
  SVBufPool*       next;
  SVnode*          pVnode;
  volatile int32_t nRef;
+  TdThreadSpinlock lock;
  int64_t          size;
  uint8_t*         ptr;
  SVBufPoolNode*   pTail;

--- a/source/dnode/vnode/src/inc/vnodeInt.h
+++ b/source/dnode/vnode/src/inc/vnodeInt.h
@@ -189,6 +189,7 @@ SSubmitReq* tqBlockToSubmit(SVnode* pVnode, const SArray* pBlocks, const STSchem
 int32_t smaInit();
 void    smaCleanUp();
 int32_t smaOpen(SVnode* pVnode);
+int32_t smaPreClose(SSma* pSma);
 int32_t smaClose(SSma* pSma);
 int32_t smaBegin(SSma* pSma);
 int32_t smaSyncPreCommit(SSma* pSma);
@@ -198,7 +199,7 @@ int32_t smaAsyncPreCommit(SSma* pSma);
 int32_t smaAsyncCommit(SSma* pSma);
 int32_t smaAsyncPostCommit(SSma* pSma);
 int32_t smaDoRetention(SSma* pSma, int64_t now);
-int32_t smaProcessFetch(SSma* pSma, void* pMsg);
+int32_t smaProcessExec(SSma* pSma, void* pMsg);
 int32_t tdProcessTSmaCreate(SSma* pSma, int64_t version, const char* msg);
 int32_t tdProcessTSmaInsert(SSma* pSma, int64_t indexUid, const char* msg);
@@ -322,6 +323,7 @@ struct SVnode {
  TdThreadMutex lock;
  bool          blocked;
  bool          restored;
+  bool          inClose;
  tsem_t        syncSem;
  SQHandle*     pQuery;
 };
@@ -368,6 +370,7 @@ struct SSma {
 void smaHandleRes(void* pVnode, int64_t smaId, const SArray* data);
 enum {
+  SNAP_DATA_CFG = 0,
  SNAP_DATA_META = 1,
  SNAP_DATA_TSDB = 2,
  SNAP_DATA_DEL = 3,

--- a/source/dnode/vnode/src/meta/metaQuery.c
+++ b/source/dnode/vnode/src/meta/metaQuery.c
@@ -127,6 +127,15 @@ _err:
 //   return 0;
 // }
+/**
+ * @brief Check whether a table uid has an entry in the uid index (uid.idx).
+ *
+ * @param pMeta meta handle whose pUidIdx table is probed
+ * @param uid   table uid used as the lookup key
+ * @return true when the tdbTbGet() lookup returns a non-negative result, false otherwise
+ *
+ * NOTE(review): no metaRLock()/metaULock() pair is taken around the lookup here, unlike the
+ * tdbTbGet() call in metaGetTbTSchemaEx() -- confirm whether callers are expected to hold it.
+ */
+bool metaIsTableExist(SMeta *pMeta, tb_uid_t uid) {
+  // Presence-only probe: both value output pointers are NULL.
+  int ret = tdbTbGet(pMeta->pUidIdx, &uid, sizeof(uid), NULL, NULL);
+  return ret >= 0;
+}
 int metaGetTableEntryByUid(SMetaReader *pReader, tb_uid_t uid) {
  SMeta  *pMeta = pReader->pMeta;
  int64_t version;
@@ -503,18 +512,65 @@ STSchema *metaGetTbTSchema(SMeta *pMeta, tb_uid_t uid, int32_t sver) {
 }
 int32_t metaGetTbTSchemaEx(SMeta *pMeta, tb_uid_t suid, tb_uid_t uid, int32_t sver, STSchema **ppTSchema) {
-  int32_t   code = 0;
+  int32_t code = 0;
-  STSchema *pTSchema = NULL;
-  SSkmDbKey skmDbKey = {.uid = suid ? suid : uid, .sver = sver};
  void     *pData = NULL;
  int       nData = 0;
+  SSkmDbKey skmDbKey;
+  if (sver <= 0) {
+    SMetaInfo info;
+    if (metaGetInfo(pMeta, suid ? suid : uid, &info) == 0) {
+      sver = info.skmVer;
+    } else {
+      TBC *pSkmDbC = NULL;
+      int  c;
+      skmDbKey.uid = suid ? suid : uid;
+      skmDbKey.sver = INT32_MAX;
+      tdbTbcOpen(pMeta->pSkmDb, &pSkmDbC, NULL);
+      metaRLock(pMeta);
+      if (tdbTbcMoveTo(pSkmDbC, &skmDbKey, sizeof(skmDbKey), &c) < 0) {
+        metaULock(pMeta);
+        tdbTbcClose(pSkmDbC);
+        code = TSDB_CODE_NOT_FOUND;
+        goto _exit;
+      }
+      ASSERT(c);
+      if (c < 0) {
+        tdbTbcMoveToPrev(pSkmDbC);
+      }
+      const void *pKey = NULL;
+      int32_t     nKey = 0;
+      tdbTbcGet(pSkmDbC, &pKey, &nKey, NULL, NULL);
-  // query
+      if (((SSkmDbKey *)pKey)->uid != skmDbKey.uid) {
+        metaULock(pMeta);
+        tdbTbcClose(pSkmDbC);
+        code = TSDB_CODE_NOT_FOUND;
+        goto _exit;
+      }
+      sver = ((SSkmDbKey *)pKey)->sver;
+      metaULock(pMeta);
+      tdbTbcClose(pSkmDbC);
+    }
+  }
+  ASSERT(sver > 0);
+  skmDbKey.uid = suid ? suid : uid;
+  skmDbKey.sver = sver;
  metaRLock(pMeta);
-  if (tdbTbGet(pMeta->pSkmDb, &skmDbKey, sizeof(skmDbKey), &pData, &nData) < 0) {
+  if (tdbTbGet(pMeta->pSkmDb, &skmDbKey, sizeof(SSkmDbKey), &pData, &nData) < 0) {
-    code = TSDB_CODE_NOT_FOUND;
    metaULock(pMeta);
-    goto _err;
+    code = TSDB_CODE_NOT_FOUND;
+    goto _exit;
  }
  metaULock(pMeta);
@@ -536,15 +592,13 @@ int32_t metaGetTbTSchemaEx(SMeta *pMeta, tb_uid_t suid, tb_uid_t uid, int32_t sv
    SSchema *pSchema = pSchemaWrapper->pSchema + i;
    tdAddColToSchema(&sb, pSchema->type, pSchema->flags, pSchema->colId, pSchema->bytes);
  }
-  pTSchema = tdGetSchemaFromBuilder(&sb);
+  STSchema *pTSchema = tdGetSchemaFromBuilder(&sb);
  tdDestroyTSchemaBuilder(&sb);
  *ppTSchema = pTSchema;
  taosMemoryFree(pSchemaWrapper->pSchema);
-  return code;
-_err:
+_exit:
-  *ppTSchema = NULL;
  return code;
 }
@@ -997,6 +1051,8 @@ int32_t metaGetTableTags(SMeta *pMeta, uint64_t suid, SArray *uidList, SHashObj
  return TSDB_CODE_SUCCESS;
 }
+int32_t metaCacheGet(SMeta *pMeta, int64_t uid, SMetaInfo *pInfo);
 int32_t metaGetInfo(SMeta *pMeta, int64_t uid, SMetaInfo *pInfo) {
  int32_t code = 0;
  void   *pData = NULL;

--- a/source/dnode/vnode/src/meta/metaTable.c
+++ b/source/dnode/vnode/src/meta/metaTable.c
@@ -357,10 +357,7 @@ int metaAlterSTable(SMeta *pMeta, int64_t version, SVCreateStbReq *pReq) {
  metaSaveToTbDb(pMeta, &nStbEntry);
  // update uid index
-  SMetaInfo info;
+  metaUpdateUidIdx(pMeta, &nStbEntry);
-  metaGetEntryInfo(&nStbEntry, &info);
-  tdbTbcUpsert(pUidIdxc, &pReq->suid, sizeof(tb_uid_t),
-               &(SUidIdxVal){.suid = info.suid, .version = info.version, .skmVer = info.skmVer}, sizeof(SUidIdxVal), 0);
  if (oStbEntry.pBuf) taosMemoryFree(oStbEntry.pBuf);
  metaULock(pMeta);
@@ -884,7 +881,8 @@ static int metaUpdateTableTagVal(SMeta *pMeta, int64_t version, SVAlterTbReq *pA
  }
  SCtbIdxKey ctbIdxKey = {.suid = ctbEntry.ctbEntry.suid, .uid = uid};
-  tdbTbUpsert(pMeta->pCtbIdx, &ctbIdxKey, sizeof(ctbIdxKey), ctbEntry.ctbEntry.pTags, ((STag*)(ctbEntry.ctbEntry.pTags))->len, &pMeta->txn);
+  tdbTbUpsert(pMeta->pCtbIdx, &ctbIdxKey, sizeof(ctbIdxKey), ctbEntry.ctbEntry.pTags,
+              ((STag *)(ctbEntry.ctbEntry.pTags))->len, &pMeta->txn);
  tDecoderClear(&dc1);
  tDecoderClear(&dc2);
@@ -1091,7 +1089,8 @@ static int metaUpdateTtlIdx(SMeta *pMeta, const SMetaEntry *pME) {
 static int metaUpdateCtbIdx(SMeta *pMeta, const SMetaEntry *pME) {
  SCtbIdxKey ctbIdxKey = {.suid = pME->ctbEntry.suid, .uid = pME->uid};
-  return tdbTbInsert(pMeta->pCtbIdx, &ctbIdxKey, sizeof(ctbIdxKey), pME->ctbEntry.pTags, ((STag*)(pME->ctbEntry.pTags))->len, &pMeta->txn);
+  return tdbTbInsert(pMeta->pCtbIdx, &ctbIdxKey, sizeof(ctbIdxKey), pME->ctbEntry.pTags,
+                     ((STag *)(pME->ctbEntry.pTags))->len, &pMeta->txn);
 }
 int metaCreateTagIdxKey(tb_uid_t suid, int32_t cid, const void *pTagData, int32_t nTagData, int8_t type, tb_uid_t uid,

--- a/source/dnode/vnode/src/sma/smaCommit.c
+++ b/source/dnode/vnode/src/sma/smaCommit.c
@@ -83,8 +83,7 @@ int32_t smaBegin(SSma *pSma) {
    return TSDB_CODE_SUCCESS;
  }
-  SSmaStat  *pStat = SMA_ENV_STAT(pSmaEnv);
+  SRSmaStat *pRSmaStat = (SRSmaStat *)SMA_ENV_STAT(pSmaEnv);
-  SRSmaStat *pRSmaStat = SMA_RSMA_STAT(pStat);
  int8_t rsmaTriggerStat =
      atomic_val_compare_exchange_8(RSMA_TRIGGER_STAT(pRSmaStat), TASK_TRIGGER_STAT_PAUSED, TASK_TRIGGER_STAT_ACTIVE);
@@ -110,7 +109,7 @@ int32_t smaBegin(SSma *pSma) {
 /**
 * @brief pre-commit for rollup sma(sync commit).
 *  1) set trigger stat of rsma timer TASK_TRIGGER_STAT_PAUSED.
- *  2) wait all triggered fetch tasks finished
+ *  2) wait for all triggered fetch tasks to finish
 *  3) perform persist task for qTaskInfo
 *
 * @param pSma
@@ -123,19 +122,19 @@ static int32_t tdProcessRSmaSyncPreCommitImpl(SSma *pSma) {
  }
  SSmaStat  *pStat = SMA_ENV_STAT(pSmaEnv);
-  SRSmaStat *pRSmaStat = SMA_RSMA_STAT(pStat);
+  SRSmaStat *pRSmaStat = SMA_STAT_RSMA(pStat);
  // step 1: set rsma stat paused
  atomic_store_8(RSMA_TRIGGER_STAT(pRSmaStat), TASK_TRIGGER_STAT_PAUSED);
-  // step 2: wait all triggered fetch tasks finished
+  // step 2: wait for all triggered fetch tasks to finish
  int32_t nLoops = 0;
  while (1) {
    if (T_REF_VAL_GET(pStat) == 0) {
-      smaDebug("vgId:%d, rsma fetch tasks all finished", SMA_VID(pSma));
+      smaDebug("vgId:%d, rsma fetch tasks are all finished", SMA_VID(pSma));
      break;
    } else {
-      smaDebug("vgId:%d, rsma fetch tasks not all finished yet", SMA_VID(pSma));
+      smaDebug("vgId:%d, rsma fetch tasks are not all finished yet", SMA_VID(pSma));
    }
    ++nLoops;
    if (nLoops > 1000) {
@@ -289,8 +288,7 @@ static int32_t tdProcessRSmaSyncPostCommitImpl(SSma *pSma) {
    return TSDB_CODE_SUCCESS;
  }
-  SSmaEnv   *pSmaEnv = SMA_RSMA_ENV(pSma);
+  SRSmaStat *pRSmaStat = SMA_RSMA_STAT(pSma);
-  SRSmaStat *pRSmaStat = SMA_RSMA_STAT(SMA_ENV_STAT(pSmaEnv));
  // cleanup outdated qtaskinfo files
  tdCleanupQTaskInfoFiles(pSma, pRSmaStat);
@@ -299,10 +297,9 @@ static int32_t tdProcessRSmaSyncPostCommitImpl(SSma *pSma) {
 }
 /**
- * @brief Rsma async commit implementation
+ * @brief Rsma async commit implementation (only does the necessary lightweight tasks)
 *  1) set rsma stat TASK_TRIGGER_STAT_PAUSED
 *  2) Wait all running fetch task finish to fetch and put submitMsg into level 2/3 wQueue(blocking level 1 write)
- *  3)
 *
 * @param pSma
 * @return int32_t
@@ -314,51 +311,75 @@ static int32_t tdProcessRSmaAsyncPreCommitImpl(SSma *pSma) {
  }
  SSmaStat  *pStat = SMA_ENV_STAT(pEnv);
-  SRSmaStat *pRSmaStat = SMA_RSMA_STAT(pStat);
+  SRSmaStat *pRSmaStat = SMA_STAT_RSMA(pStat);
  // step 1: set rsma stat
  atomic_store_8(RSMA_TRIGGER_STAT(pRSmaStat), TASK_TRIGGER_STAT_PAUSED);
  atomic_store_8(RSMA_COMMIT_STAT(pRSmaStat), 1);
+  pRSmaStat->commitAppliedVer = pSma->pVnode->state.applied;
+  ASSERT(pRSmaStat->commitAppliedVer > 0);
-  // step 2: wait all triggered fetch tasks finished
+  // step 2: wait for all triggered fetch tasks to finish
  int32_t nLoops = 0;
  while (1) {
    if (T_REF_VAL_GET(pStat) == 0) {
-      smaDebug("vgId:%d, rsma fetch tasks all finished", SMA_VID(pSma));
+      smaDebug("vgId:%d, rsma commit, fetch tasks are all finished", SMA_VID(pSma));
      break;
    } else {
-      smaDebug("vgId:%d, rsma fetch tasks not all finished yet", SMA_VID(pSma));
+      smaDebug("vgId:%d, rsma commit, fetch tasks are not all finished yet", SMA_VID(pSma));
+    }
+    ++nLoops;
+    if (nLoops > 1000) {
+      sched_yield();
+      nLoops = 0;
    }
+  }
+  /**
+   * @brief step 3: consume the SubmitReq in buffer
+   *  1) This is high cost task and should not put in asyncPreCommit originally.
+   *  2) But, if put in asyncCommit, would trigger taskInfo cloning frequently.
+   */
+  if (tdRSmaProcessExecImpl(pSma, RSMA_EXEC_COMMIT) < 0) {
+    return TSDB_CODE_FAILED;
+  }
+  smaInfo("vgId:%d, rsma commit, wait for all items to be consumed, TID:%p", SMA_VID(pSma), (void*)taosGetSelfPthreadId());
+  nLoops = 0;
+  while (atomic_load_64(&pRSmaStat->nBufItems) > 0) {
    ++nLoops;
    if (nLoops > 1000) {
      sched_yield();
      nLoops = 0;
    }
  }
+  smaInfo("vgId:%d, rsma commit, all items are consumed, TID:%p", SMA_VID(pSma), (void *)taosGetSelfPthreadId());
+  if (tdRSmaPersistExecImpl(pRSmaStat, RSMA_INFO_HASH(pRSmaStat)) < 0) {
+    return TSDB_CODE_FAILED;
+  }
+  smaInfo("vgId:%d, rsma commit, operator state commited, TID:%p", SMA_VID(pSma), (void *)taosGetSelfPthreadId());
-  // step 3:  swap rsmaInfoHash and iRsmaInfoHash
+#if 0 // consuming task of qTaskInfo clone 
+  // step 4:  swap queue/qall and iQueue/iQall
  // lock
-  taosWLockLatch(SMA_ENV_LOCK(pEnv));
+  // taosWLockLatch(SMA_ENV_LOCK(pEnv));
  ASSERT(RSMA_INFO_HASH(pRSmaStat));
-  ASSERT(!RSMA_IMU_INFO_HASH(pRSmaStat));
-  RSMA_IMU_INFO_HASH(pRSmaStat) = RSMA_INFO_HASH(pRSmaStat);
+  void *pIter = taosHashIterate(RSMA_INFO_HASH(pRSmaStat), NULL);
-  RSMA_INFO_HASH(pRSmaStat) =
-      taosHashInit(RSMA_TASK_INFO_HASH_SLOT, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BIGINT), true, HASH_ENTRY_LOCK);
-  if (!RSMA_INFO_HASH(pRSmaStat)) {
+  while (pIter) {
-    // unlock
+    SRSmaInfo *pInfo = *(SRSmaInfo **)pIter;
-    taosWUnLockLatch(SMA_ENV_LOCK(pEnv));
+    TSWAP(pInfo->iQall, pInfo->qall);
-    smaError("vgId:%d, rsma async commit failed since %s", SMA_VID(pSma), terrstr());
+    TSWAP(pInfo->iQueue, pInfo->queue);
-    return TSDB_CODE_FAILED;
+    TSWAP(pInfo->iTaskInfo[0], pInfo->taskInfo[0]);
+    TSWAP(pInfo->iTaskInfo[1], pInfo->taskInfo[1]);
+    pIter = taosHashIterate(RSMA_INFO_HASH(pRSmaStat), pIter);
  }
  // unlock
-  taosWUnLockLatch(SMA_ENV_LOCK(pEnv));
+  // taosWUnLockLatch(SMA_ENV_LOCK(pEnv));
+#endif
-  // step 4: others
-  pRSmaStat->commitAppliedVer = pSma->pVnode->state.applied;
  return TSDB_CODE_SUCCESS;
 }
@@ -374,18 +395,20 @@ static int32_t tdProcessRSmaAsyncCommitImpl(SSma *pSma) {
  if (!pSmaEnv) {
    return TSDB_CODE_SUCCESS;
  }
+#if 0
+  SRSmaStat *pRSmaStat = (SRSmaStat *)SMA_ENV_STAT(pSmaEnv);
-  SSmaStat  *pStat = SMA_ENV_STAT(pSmaEnv);
+  // perform persist task for qTaskInfo operator
-  SRSmaStat *pRSmaStat = SMA_RSMA_STAT(pStat);
+  if (tdRSmaPersistExecImpl(pRSmaStat, RSMA_INFO_HASH(pRSmaStat)) < 0) {
+    return TSDB_CODE_FAILED;
-  // perform persist task for qTaskInfo
+  }
-  tdRSmaPersistExecImpl(pRSmaStat, RSMA_IMU_INFO_HASH(pRSmaStat));
+#endif
  return TSDB_CODE_SUCCESS;
 }
 /**
- * @brief Migrate rsmaInfo from iRsmaInfo to rsmaInfo if rsmaInfoHash not empty.
+ * @brief Migrate rsmaInfo from iRsmaInfo to rsmaInfo if rsma infoHash not empty.
 *
 * @param pSma
 * @return int32_t
@@ -396,68 +419,66 @@ static int32_t tdProcessRSmaAsyncPostCommitImpl(SSma *pSma) {
    return TSDB_CODE_SUCCESS;
  }
-  SSmaStat  *pStat = SMA_ENV_STAT(pEnv);
+  SRSmaStat *pRSmaStat = (SRSmaStat *)SMA_ENV_STAT(pEnv);
-  SRSmaStat *pRSmaStat = SMA_RSMA_STAT(pStat);
+  SArray    *rsmaDeleted = NULL;
-  // step 1: merge rsmaInfoHash and iRsmaInfoHash
+  // step 1: merge qTaskInfo and iQTaskInfo
  // lock
-  taosWLockLatch(SMA_ENV_LOCK(pEnv));
+  // taosWLockLatch(SMA_ENV_LOCK(pEnv));
-#if 0
-  if (taosHashGetSize(RSMA_INFO_HASH(pRSmaStat)) <= 0) {
+  void *pIter = NULL;
-    // just switch the hash pointer if rsmaInfoHash is empty
+  while ((pIter = taosHashIterate(RSMA_INFO_HASH(pRSmaStat), pIter))) {
-    if (taosHashGetSize(RSMA_IMU_INFO_HASH(pRSmaStat)) > 0) {
+    tb_uid_t  *pSuid = (tb_uid_t *)taosHashGetKey(pIter, NULL);
-      SHashObj *infoHash = RSMA_INFO_HASH(pRSmaStat);
+    SRSmaInfo *pRSmaInfo = *(SRSmaInfo **)pIter;
-      RSMA_INFO_HASH(pRSmaStat) = RSMA_IMU_INFO_HASH(pRSmaStat);
+    if (RSMA_INFO_IS_DEL(pRSmaInfo)) {
-      RSMA_IMU_INFO_HASH(pRSmaStat) = infoHash;
+      int32_t refVal = T_REF_VAL_GET(pRSmaInfo);
-    }
+      if (refVal == 0) {
-  } else {
+        // Create the list lazily, then record EVERY deleted suid (not only the first one).
-#endif
+        if (!rsmaDeleted) rsmaDeleted = taosArrayInit(1, sizeof(tb_uid_t));
-#if 1
+        if (rsmaDeleted) {
-  void *pIter = taosHashIterate(RSMA_IMU_INFO_HASH(pRSmaStat), NULL);
+          taosArrayPush(rsmaDeleted, pSuid);
-  while (pIter) {
-    tb_uid_t *pSuid = (tb_uid_t *)taosHashGetKey(pIter, NULL);
-    if (!taosHashGet(RSMA_INFO_HASH(pRSmaStat), pSuid, sizeof(tb_uid_t))) {
-      SRSmaInfo *pRSmaInfo = *(SRSmaInfo **)pIter;
-      if (RSMA_INFO_IS_DEL(pRSmaInfo)) {
-        int32_t refVal = T_REF_VAL_GET(pRSmaInfo);
-        if (refVal == 0) {
-          tdFreeRSmaInfo(pSma, pRSmaInfo, true);
-          smaDebug(
-              "vgId:%d, rsma async post commit, free rsma info since already deleted and ref is 0 for "
-              "table:%" PRIi64,
-              SMA_VID(pSma), *pSuid);
-        } else {
-          smaDebug(
-              "vgId:%d, rsma async post commit, not free rsma info since ref is %d although already deleted for "
-              "table:%" PRIi64,
-              SMA_VID(pSma), refVal, *pSuid);
        }
+      } else {
+        smaDebug(
+            "vgId:%d, rsma async post commit, not free rsma info since ref is %d although already deleted for "
+            "table:%" PRIi64,
+            SMA_VID(pSma), refVal, *pSuid);
+      }
-        pIter = taosHashIterate(RSMA_IMU_INFO_HASH(pRSmaStat), pIter);
+      continue;
-        continue;
+    }
+#if 0
+    if (pRSmaInfo->taskInfo[0]) {
+      if (pRSmaInfo->iTaskInfo[0]) {
+        SRSmaInfo *pRSmaInfo = *(SRSmaInfo **)pRSmaInfo->iTaskInfo[0];
+        tdFreeRSmaInfo(pSma, pRSmaInfo, false);
+        pRSmaInfo->iTaskInfo[0] = NULL;
      }
-      taosHashPut(RSMA_INFO_HASH(pRSmaStat), pSuid, sizeof(tb_uid_t), pIter, sizeof(pIter));
-      smaDebug("vgId:%d, rsma async post commit, migrated from iRsmaInfoHash for table:%" PRIi64, SMA_VID(pSma),
-               *pSuid);
    } else {
-      // free the resources
+      TSWAP(pRSmaInfo->taskInfo[0], pRSmaInfo->iTaskInfo[0]);
-      SRSmaInfo *pRSmaInfo = *(SRSmaInfo **)pIter;
-      tdFreeRSmaInfo(pSma, pRSmaInfo, false);
-      smaDebug("vgId:%d, rsma async post commit, free rsma info since already COW for table:%" PRIi64, SMA_VID(pSma),
-               *pSuid);
    }
-    pIter = taosHashIterate(RSMA_IMU_INFO_HASH(pRSmaStat), pIter);
+    taosHashPut(RSMA_INFO_HASH(pRSmaStat), pSuid, sizeof(tb_uid_t), pIter, sizeof(pIter));
-  }
+    smaDebug("vgId:%d, rsma async post commit, migrated from iRsmaInfoHash for table:%" PRIi64, SMA_VID(pSma), *pSuid);
 #endif
-  // }
+  }
-  taosHashCleanup(RSMA_IMU_INFO_HASH(pRSmaStat));
+  for (int32_t i = 0; i < taosArrayGetSize(rsmaDeleted); ++i) {
-  RSMA_IMU_INFO_HASH(pRSmaStat) = NULL;
+    tb_uid_t *pSuid = taosArrayGet(rsmaDeleted, i);
+    void     *pRSmaInfo = taosHashGet(RSMA_INFO_HASH(pRSmaStat), pSuid, sizeof(tb_uid_t));
+    if (pRSmaInfo && (pRSmaInfo = *(SRSmaInfo **)pRSmaInfo)) {  // guard: taosHashGet() may return NULL
+      tdFreeRSmaInfo(pSma, pRSmaInfo, true);
+      smaDebug(
+          "vgId:%d, rsma async post commit, free rsma info since already deleted and ref is 0 for "
+          "table:%" PRIi64,
+          SMA_VID(pSma), *pSuid);
+    }
+    taosHashRemove(RSMA_INFO_HASH(pRSmaStat), pSuid, sizeof(tb_uid_t));
+  }
+  taosArrayDestroy(rsmaDeleted);
  // unlock
-  taosWUnLockLatch(SMA_ENV_LOCK(pEnv));
+  // taosWUnLockLatch(SMA_ENV_LOCK(pEnv));
  // step 2: cleanup outdated qtaskinfo files
  tdCleanupQTaskInfoFiles(pSma, pRSmaStat);

--- a/source/dnode/vnode/src/sma/smaEnv.c
+++ b/source/dnode/vnode/src/sma/smaEnv.c
@@ -171,7 +171,7 @@ int32_t tdUnRefSmaStat(SSma *pSma, SSmaStat *pStat) {
 int32_t tdRefRSmaInfo(SSma *pSma, SRSmaInfo *pRSmaInfo) {
  if (!pRSmaInfo) return 0;
  int ref = T_REF_INC(pRSmaInfo);
  smaDebug("vgId:%d, ref rsma info:%p, val:%d", SMA_VID(pSma), pRSmaInfo, ref);
  return 0;
@@ -209,6 +209,7 @@ static int32_t tdInitSmaStat(SSmaStat **pSmaStat, int8_t smaType, const SSma *pS
      SRSmaStat *pRSmaStat = (SRSmaStat *)(*pSmaStat);
      pRSmaStat->pSma = (SSma *)pSma;
      atomic_store_8(RSMA_TRIGGER_STAT(pRSmaStat), TASK_TRIGGER_STAT_INIT);
+      tsem_init(&pRSmaStat->notEmpty, 0, 0);
      // init smaMgmt
      smaInit();
@@ -228,7 +229,6 @@ static int32_t tdInitSmaStat(SSmaStat **pSmaStat, int8_t smaType, const SSma *pS
      RSMA_INFO_HASH(pRSmaStat) = taosHashInit(
          RSMA_TASK_INFO_HASH_SLOT, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BIGINT), true, HASH_ENTRY_LOCK);
      if (!RSMA_INFO_HASH(pRSmaStat)) {
-        taosMemoryFreeClear(*pSmaStat);
        return TSDB_CODE_FAILED;
      }
    } else if (smaType == TSDB_SMA_TYPE_TIME_RANGE) {
@@ -262,10 +262,9 @@ static void tdDestroyRSmaStat(void *pRSmaStat) {
    smaDebug("vgId:%d, destroy rsma stat %p", SMA_VID(pSma), pRSmaStat);
    // step 1: set rsma trigger stat cancelled
    atomic_store_8(RSMA_TRIGGER_STAT(pStat), TASK_TRIGGER_STAT_CANCELLED);
+    tsem_destroy(&(pStat->notEmpty));
    // step 2: destroy the rsma info and associated fetch tasks
-    // TODO: use taosHashSetFreeFp when taosHashSetFreeFp is ready.
-#if 1
    if (taosHashGetSize(RSMA_INFO_HASH(pStat)) > 0) {
      void *infoHash = taosHashIterate(RSMA_INFO_HASH(pStat), NULL);
      while (infoHash) {
@@ -274,17 +273,16 @@ static void tdDestroyRSmaStat(void *pRSmaStat) {
        infoHash = taosHashIterate(RSMA_INFO_HASH(pStat), infoHash);
      }
    }
-#endif
    taosHashCleanup(RSMA_INFO_HASH(pStat));
-    // step 3: wait all triggered fetch tasks finished
+    // step 3: wait for all triggered fetch tasks to finish
    int32_t nLoops = 0;
    while (1) {
      if (T_REF_VAL_GET((SSmaStat *)pStat) == 0) {
-        smaDebug("vgId:%d, rsma fetch tasks all finished", SMA_VID(pSma));
+        smaDebug("vgId:%d, rsma fetch tasks are all finished", SMA_VID(pSma));
        break;
      } else {
-        smaDebug("vgId:%d, rsma fetch tasks not all finished yet", SMA_VID(pSma));
+        smaDebug("vgId:%d, rsma fetch tasks are not all finished yet", SMA_VID(pSma));
      }
      ++nLoops;
      if (nLoops > 1000) {
@@ -293,7 +291,7 @@ static void tdDestroyRSmaStat(void *pRSmaStat) {
      }
    }
-    // step 4: free pStat
+    // step 5: free pStat
    taosMemoryFreeClear(pStat);
  }
 }
@@ -318,9 +316,9 @@ void *tdFreeSmaState(SSmaStat *pSmaStat, int8_t smaType) {
 int32_t tdDestroySmaState(SSmaStat *pSmaStat, int8_t smaType) {
  if (pSmaStat) {
    if (smaType == TSDB_SMA_TYPE_TIME_RANGE) {
-      tdDestroyTSmaStat(SMA_TSMA_STAT(pSmaStat));
+      tdDestroyTSmaStat(SMA_STAT_TSMA(pSmaStat));
    } else if (smaType == TSDB_SMA_TYPE_ROLLUP) {
-      SRSmaStat *pRSmaStat = SMA_RSMA_STAT(pSmaStat);
+      SRSmaStat *pRSmaStat = &pSmaStat->rsmaStat;
      int32_t    vid = SMA_VID(pRSmaStat->pSma);
      int64_t    refId = RSMA_REF_ID(pRSmaStat);
      if (taosRemoveRef(smaMgmt.rsetId, RSMA_REF_ID(pRSmaStat)) < 0) {

--- a/source/dnode/vnode/src/sma/smaOpen.c
+++ b/source/dnode/vnode/src/sma/smaOpen.c
@@ -146,6 +146,20 @@ int32_t smaClose(SSma *pSma) {
  return 0;
 }
+/**
+ * @brief Pre-close hook for the rsma part of a vnode.
+ *
+ * Does nothing unless pSma is non-NULL, its vnode is an rsma vnode, and both the rsma env and
+ * its stat exist. Otherwise posts the stat's `notEmpty` semaphore RSMA_EXECUTOR_MAX times.
+ * NOTE(review): presumably one post per executor, so that none stays blocked on the semaphore
+ * while the vnode shuts down - confirm against the code that waits on `notEmpty`.
+ *
+ * @param pSma sma handle, may be NULL
+ * @return int32_t always 0
+ */
+int32_t smaPreClose(SSma *pSma) {
+  if (!pSma || !VND_IS_RSMA(pSma->pVnode)) {
+    return 0;
+  }
+  SSmaEnv *pRSmaEnv = SMA_RSMA_ENV(pSma);
+  if (!pRSmaEnv) {
+    return 0;
+  }
+  SRSmaStat *pRSmaStat = (SRSmaStat *)SMA_ENV_STAT(pRSmaEnv);
+  if (!pRSmaStat) {
+    return 0;
+  }
+  for (int32_t nLeft = RSMA_EXECUTOR_MAX; nLeft > 0; --nLeft) {
+    tsem_post(&(pRSmaStat->notEmpty));
+  }
+  return 0;
+}
 /**
 * @brief rsma env restore
 * 

--- a/source/dnode/vnode/src/sma/smaRollup.c
+++ b/source/dnode/vnode/src/sma/smaRollup.c
--- a/source/dnode/vnode/src/sma/smaSnapshot.c
+++ b/source/dnode/vnode/src/sma/smaSnapshot.c
@@ -139,7 +139,6 @@ static int32_t rsmaSnapReadQTaskInfo(SRsmaSnapReader* pReader, uint8_t** ppBuf)
  smaInfo("vgId:%d, vnode snapshot rsma read qtaskinfo, size:%" PRIi64, SMA_VID(pSma), size);
  SSnapDataHdr* pHdr = (SSnapDataHdr*)(*ppBuf);
  pHdr->type = SNAP_DATA_QTASK;
  pHdr->size = size;
@@ -279,7 +278,8 @@ int32_t rsmaSnapWriterOpen(SSma* pSma, int64_t sver, int64_t ever, SRsmaSnapWrit
  TdFilePtr qTaskF = taosCreateFile(qTaskInfoFullName, TD_FILE_CREATE | TD_FILE_WRITE | TD_FILE_TRUNC);
  if (!qTaskF) {
    code = TAOS_SYSTEM_ERROR(errno);
-    smaError("vgId:%d, rsma snapshot writer open %s failed since %s", TD_VID(pSma->pVnode), qTaskInfoFullName, tstrerror(code));
+    smaError("vgId:%d, rsma snapshot writer open %s failed since %s", TD_VID(pSma->pVnode), qTaskInfoFullName,
+             tstrerror(code));
    goto _err;
  }
  qWriter->pWriteH = qTaskF;
@@ -309,7 +309,7 @@ int32_t rsmaSnapWriterClose(SRsmaSnapWriter** ppWriter, int8_t rollback) {
  if (rollback) {
    // TODO: rsma1/rsma2
    // qtaskinfo
-    if(pWriter->pQTaskFWriter) {
+    if (pWriter->pQTaskFWriter) {
      taosRemoveFile(pWriter->pQTaskFWriter->fname);
    }
  } else {

--- a/source/dnode/vnode/src/sma/smaTimeRange.c
+++ b/source/dnode/vnode/src/sma/smaTimeRange.c
@@ -175,7 +175,7 @@ int32_t tdProcessTSmaInsertImpl(SSma *pSma, int64_t indexUid, const char *msg) {
  }
  tdRefSmaStat(pSma, pStat);
-  pTsmaStat = SMA_TSMA_STAT(pStat);
+  pTsmaStat = SMA_STAT_TSMA(pStat);
  if (!pTsmaStat->pTSma) {
    STSma *pTSma = metaGetSmaInfoByIndex(SMA_META(pSma), indexUid);

--- a/source/dnode/vnode/src/sma/smaUtil.c
+++ b/source/dnode/vnode/src/sma/smaUtil.c
@@ -350,49 +350,48 @@ _err:
 }
 /**
- * @brief pTSchema is shared
+ * @brief Clone qTaskInfo of SRSmaInfo
 *
 * @param pSma
- * @param pDest
+ * @param pInfo
- * @param pSrc
 * @return int32_t
 */
-int32_t tdCloneRSmaInfo(SSma *pSma, SRSmaInfo **pDest, SRSmaInfo *pSrc) {
+int32_t tdCloneRSmaInfo(SSma *pSma, SRSmaInfo *pInfo) {  // clones qTaskInfo per retention level of pInfo, in place
-  SVnode     *pVnode = pSma->pVnode;
  SRSmaParam *param = NULL;
-  if (!pSrc) {
+  if (!pInfo) {  // nothing to clone: reported as success
-    *pDest = NULL;
    return TSDB_CODE_SUCCESS;
  }
  SMetaReader mr = {0};
  metaReaderInit(&mr, SMA_META(pSma), 0);
-  smaDebug("vgId:%d, rsma clone, suid is %" PRIi64, TD_VID(pVnode), pSrc->suid);
+  smaDebug("vgId:%d, rsma clone qTaskInfo for suid:%" PRIi64, SMA_VID(pSma), pInfo->suid);
-  if (metaGetTableEntryByUid(&mr, pSrc->suid) < 0) {
+  if (metaGetTableEntryByUid(&mr, pInfo->suid) < 0) {  // load the table entry of pInfo->suid into mr.me
-    smaError("vgId:%d, rsma clone, failed to get table meta for %" PRIi64 " since %s", TD_VID(pVnode), pSrc->suid,
+    smaError("vgId:%d, rsma clone, failed to get table meta for %" PRIi64 " since %s", SMA_VID(pSma), pInfo->suid,
             terrstr());
    goto _err;
  }
  ASSERT(mr.me.type == TSDB_SUPER_TABLE);
-  ASSERT(mr.me.uid == pSrc->suid);
+  ASSERT(mr.me.uid == pInfo->suid);
  if (TABLE_IS_ROLLUP(mr.me.flags)) {
    param = &mr.me.stbEntry.rsmaParam;
    for (int32_t i = 0; i < TSDB_RETENTION_L2; ++i) {
-      if (tdCloneQTaskInfo(pSma, pSrc->iTaskInfo[i], pSrc->taskInfo[i], param, pSrc->suid, i) < 0) {
+      if (!pInfo->iTaskInfo[i]) {  // levels without an iTaskInfo are skipped
+        continue;
+      }
+      if (tdCloneQTaskInfo(pSma, pInfo->taskInfo[i], pInfo->iTaskInfo[i], param, pInfo->suid, i) < 0) {  // NOTE(review): taskInfo/iTaskInfo order is swapped relative to the removed call - confirm which one tdCloneQTaskInfo treats as destination
        goto _err;
      }
    }
-    smaDebug("vgId:%d, rsma clone env success for %" PRIi64, TD_VID(pVnode), pSrc->suid);
+    smaDebug("vgId:%d, rsma clone env success for %" PRIi64, SMA_VID(pSma), pInfo->suid);
+  } else {  // a table without the rollup flag is now an error instead of a silent success
+    terrno = TSDB_CODE_RSMA_INVALID_SCHEMA;
+    goto _err;
  }
  metaReaderClear(&mr);
-  *pDest = pSrc;  // pointer copy
  return TSDB_CODE_SUCCESS;
 _err:
-  *pDest = NULL;
  metaReaderClear(&mr);
-  smaError("vgId:%d, rsma clone env failed for %" PRIi64 " since %s", TD_VID(pVnode), pSrc->suid, terrstr());
+  smaError("vgId:%d, rsma clone env failed for %" PRIi64 " since %s", SMA_VID(pSma), pInfo->suid, terrstr());
  return TSDB_CODE_FAILED;
 }
\ No newline at end of file
--- a/source/dnode/vnode/src/tq/tq.c
+++ b/source/dnode/vnode/src/tq/tq.c
@@ -628,8 +628,6 @@ int32_t tqProcessVgChangeReq(STQ* pTq, int64_t version, char* msg, int32_t msgLe
 }
 int32_t tqExpandTask(STQ* pTq, SStreamTask* pTask) {
-  int32_t code = 0;
  if (pTask->taskLevel == TASK_LEVEL__AGG) {
    ASSERT(taosArrayGetSize(pTask->childEpInfo) != 0);
  }
@@ -640,8 +638,7 @@ int32_t tqExpandTask(STQ* pTq, SStreamTask* pTask) {
  pTask->outputQueue = streamQueueOpen();
  if (pTask->inputQueue == NULL || pTask->outputQueue == NULL) {
-    code = -1;
+    return -1;
-    goto FAIL;
  }
  pTask->inputStatus = TASK_INPUT_STATUS__NORMAL;
@@ -686,14 +683,9 @@ int32_t tqExpandTask(STQ* pTq, SStreamTask* pTask) {
  streamSetupTrigger(pTask);
-  tqInfo("deploy stream task on vg %d, task id %d, child id %d", TD_VID(pTq->pVnode), pTask->taskId,
+  tqInfo("expand stream task on vg %d, task id %d, child id %d", TD_VID(pTq->pVnode), pTask->taskId,
         pTask->selfChildId);
+  return 0;
-FAIL:
-  if (pTask->inputQueue) streamQueueClose(pTask->inputQueue);
-  if (pTask->outputQueue) streamQueueClose(pTask->outputQueue);
-  // TODO free executor
-  return code;
 }
 int32_t tqProcessTaskDeployReq(STQ* pTq, int64_t version, char* msg, int32_t msgLen) {

--- a/source/dnode/vnode/src/tq/tqRead.c
+++ b/source/dnode/vnode/src/tq/tqRead.c
@@ -341,7 +341,7 @@ FAIL:
  return -1;
 }
-void tqReaderSetColIdList(STqReader* pReadHandle, SArray* pColIdList) { pReadHandle->pColIdList = pColIdList; }
+void tqReaderSetColIdList(STqReader* pReader, SArray* pColIdList) { pReader->pColIdList = pColIdList; }  // stores the pointer as-is; the array itself is not copied here
 int tqReaderSetTbUidList(STqReader* pReader, const SArray* tbUidList) {
  if (pReader->tbIdHash) {

--- a/source/dnode/vnode/src/tq/tqSink.c
+++ b/source/dnode/vnode/src/tq/tqSink.c
@@ -231,34 +231,35 @@ void tqTableSink(SStreamTask* pTask, void* vnode, int64_t ver, void* data) {
  ASSERT(pTask->tbSink.pTSchema);
  deleteReq.deleteReqs = taosArrayInit(0, sizeof(SSingleDeleteReq));
-  SSubmitReq* pReq = tqBlockToSubmit(pVnode, pRes, pTask->tbSink.pTSchema, true, pTask->tbSink.stbUid,
+  SSubmitReq* submitReq = tqBlockToSubmit(pVnode, pRes, pTask->tbSink.pTSchema, true, pTask->tbSink.stbUid,
-                                     pTask->tbSink.stbFullName, &deleteReq);
+                                          pTask->tbSink.stbFullName, &deleteReq);
  tqDebug("vgId:%d, task %d convert blocks over, put into write-queue", TD_VID(pVnode), pTask->taskId);
-  int32_t code;
+  if (taosArrayGetSize(deleteReq.deleteReqs) != 0) {
-  int32_t len;
+    int32_t code;
-  tEncodeSize(tEncodeSBatchDeleteReq, &deleteReq, len, code);
+    int32_t len;
-  if (code < 0) {
+    tEncodeSize(tEncodeSBatchDeleteReq, &deleteReq, len, code);
-    //
+    if (code < 0) {
-    ASSERT(0);
+      //
-  }
+      ASSERT(0);
-  SEncoder encoder;
+    }
-  void*    buf = rpcMallocCont(len + sizeof(SMsgHead));
+    SEncoder encoder;
-  void*    abuf = POINTER_SHIFT(buf, sizeof(SMsgHead));
+    void*    serializedDeleteReq = rpcMallocCont(len + sizeof(SMsgHead));
-  tEncoderInit(&encoder, abuf, len);
+    void*    abuf = POINTER_SHIFT(serializedDeleteReq, sizeof(SMsgHead));
-  tEncodeSBatchDeleteReq(&encoder, &deleteReq);
+    tEncoderInit(&encoder, abuf, len);
-  tEncoderClear(&encoder);
+    tEncodeSBatchDeleteReq(&encoder, &deleteReq);
+    tEncoderClear(&encoder);
-  ((SMsgHead*)buf)->vgId = pVnode->config.vgId;
+    ((SMsgHead*)serializedDeleteReq)->vgId = pVnode->config.vgId;
-  if (taosArrayGetSize(deleteReq.deleteReqs) != 0) {
    SRpcMsg msg = {
        .msgType = TDMT_VND_BATCH_DEL,
-        .pCont = buf,
+        .pCont = serializedDeleteReq,
        .contLen = len + sizeof(SMsgHead),
    };
    if (tmsgPutToQueue(&pVnode->msgCb, WRITE_QUEUE, &msg) != 0) {
+      rpcFreeCont(serializedDeleteReq);
      tqDebug("failed to put into write-queue since %s", terrstr());
    }
  }
@@ -268,11 +269,12 @@ void tqTableSink(SStreamTask* pTask, void* vnode, int64_t ver, void* data) {
  // build write msg
  SRpcMsg msg = {
      .msgType = TDMT_VND_SUBMIT,
-      .pCont = pReq,
+      .pCont = submitReq,
-      .contLen = ntohl(pReq->length),
+      .contLen = ntohl(submitReq->length),
  };
  if (tmsgPutToQueue(&pVnode->msgCb, WRITE_QUEUE, &msg) != 0) {
+    rpcFreeCont(submitReq);
    tqDebug("failed to put into write-queue since %s", terrstr());
  }
 }
--- a/source/dnode/vnode/src/tsdb/tsdbCache.c
+++ b/source/dnode/vnode/src/tsdb/tsdbCache.c
@@ -33,16 +33,21 @@ int32_t tsdbOpenCache(STsdb *pTsdb) {
  taosLRUCacheSetStrictCapacity(pCache, true);
+  taosThreadMutexInit(&pTsdb->lruMutex, NULL);
 _err:
  pTsdb->lruCache = pCache;
  return code;
 }
-void tsdbCloseCache(SLRUCache *pCache) {
+void tsdbCloseCache(STsdb *pTsdb) {  // tears down pTsdb->lruCache and its mutex; does nothing when no cache was created
+  SLRUCache *pCache = pTsdb->lruCache;
  if (pCache) {
    taosLRUCacheEraseUnrefEntries(pCache);
    taosLRUCacheCleanup(pCache);
+    taosThreadMutexDestroy(&pTsdb->lruMutex);  // counterpart of taosThreadMutexInit() in tsdbOpenCache(); only reached when a cache exists
  }
 }
@@ -261,14 +266,14 @@ int32_t tsdbCacheInsertLast(SLRUCache *pCache, tb_uid_t uid, STSRow *row, STsdb
    }
    for (++iCol; iCol < nCol; ++iCol) {
-      SLastCol *tTsVal = (SLastCol *)taosArrayGet(pLast, iCol);
+      SLastCol *tTsVal1 = (SLastCol *)taosArrayGet(pLast, iCol);
-      if (keyTs >= tTsVal->ts) {
+      if (keyTs >= tTsVal1->ts) {
-        SColVal *tColVal = &tTsVal->colVal;
+        SColVal *tColVal = &tTsVal1->colVal;
        SColVal colVal = {0};
        tTSRowGetVal(row, pTSchema, iCol, &colVal);
        if (colVal.isNone || colVal.isNull) {
-          if (keyTs == tTsVal->ts && !tColVal->isNone && !tColVal->isNull) {
+          if (keyTs == tTsVal1->ts && !tColVal->isNone && !tColVal->isNull) {
            invalidate = true;
            break;
@@ -279,6 +284,7 @@ int32_t tsdbCacheInsertLast(SLRUCache *pCache, tb_uid_t uid, STSRow *row, STsdb
      }
    }
+  _invalidate:
    taosMemoryFreeClear(pTSchema);
    taosLRUCacheRelease(pCache, h, invalidate);
@@ -317,7 +323,7 @@ static int32_t getTableDelDataFromDelIdx(SDelFReader *pDelReader, SDelIdx *pDelI
  int32_t code = 0;
  if (pDelIdx) {
-    code = tsdbReadDelData(pDelReader, pDelIdx, aDelData, NULL);
+    code = tsdbReadDelData(pDelReader, pDelIdx, aDelData);
  }
  return code;
@@ -388,8 +394,7 @@ static int32_t getTableDelIdx(SDelFReader *pDelFReader, tb_uid_t suid, tb_uid_t
  SDelIdx idx = {.suid = suid, .uid = uid};
  // tMapDataReset(&delIdxMap);
-  //  code = tsdbReadDelIdx(pDelFReader, &delIdxMap, NULL);
+  code = tsdbReadDelIdx(pDelFReader, pDelIdxArray);
-  code = tsdbReadDelIdx(pDelFReader, pDelIdxArray, NULL);
  if (code) goto _err;
  // code = tMapDataSearch(&delIdxMap, &idx, tGetDelIdx, tCmprDelIdx, pDelIdx);
@@ -405,6 +410,178 @@ _err:
  return code;
 }
+typedef enum {  // states of the resumable state machine in getNextRowFromFSLast()
+  SFSLASTNEXTROW_FS,         // first call: read the file set count and position after the last file set
+  SFSLASTNEXTROW_FILESET,    // step back to the previous file set, open its reader and look up the matching SBlockL
+  SFSLASTNEXTROW_BLOCKDATA,  // read the last block and position on its highest candidate row
+  SFSLASTNEXTROW_BLOCKROW    // hand out one row per call, from higher to lower row index
+} SFSLASTNEXTROWSTATES;
+typedef struct {  // iterator state for getNextRowFromFSLast(); callers fill the [input] fields (plus aDFileSet) and zero the rest
+  SFSLASTNEXTROWSTATES state;         // [input] current state of the state machine
+  STsdb               *pTsdb;         // [input] passed to tsdbDataFReaderOpen()
+  SBlockIdx           *pBlockIdxExp;  // [input] search key; its uid identifies the table whose rows are wanted
+  STSchema            *pTSchema;      // [input] passed to tBlockDataInit()
+  int32_t              nFileSet;      // number of entries in aDFileSet
+  int32_t              iFileSet;      // index of the file set being visited; counts down from nFileSet
+  SArray              *aDFileSet;     // array of SDFileSet to walk; set by the caller, not owned here
+  SDataFReader        *pDataFReader;  // reader of the current file set, NULL when none is open
+  SArray              *aBlockL;       // block records filled by tsdbReadBlockL(); created lazily and reused
+  SBlockL             *pBlockL;       // entry of aBlockL matching pBlockIdxExp, or NULL
+  SBlockData          *pBlockDataL;   // NULL until first use, then points at blockDataL
+  SBlockData           blockDataL;    // storage behind pBlockDataL
+  int32_t              nRow;          // row count of the block read last
+  int32_t              iRow;          // index of the next row to hand out; counts down
+  TSDBROW              row;           // storage for the row returned through ppRow
+  /*
+  SArray    *aBlockIdx;
+  SBlockIdx *pBlockIdx;
+  SMapData   blockMap;
+  int32_t    nBlock;
+  int32_t    iBlock;
+  SBlock     block;
+  */
+} SFSLastNextRowIter;
+/**
+ * @brief Produce the next row of the target table from the file sets' last blocks, walking backwards.
+ *
+ * Resumable state machine driven by state->state. File sets are visited from the end of aDFileSet
+ * towards its start, and rows inside a last block from the highest row index downwards. For every
+ * file set the SBlockL matching pBlockIdxExp is looked up; file sets without a match, or whose last
+ * block holds no row of the table, are skipped.
+ *
+ * @param iter  SFSLastNextRowIter (passed as void *)
+ * @param ppRow [out] points at state->row when a row is produced; NULL once all file sets are
+ *              consumed or on error
+ * @return int32_t 0 on success or exhaustion, otherwise the code of the failing call
+ */
+static int32_t getNextRowFromFSLast(void *iter, TSDBROW **ppRow) {
+  SFSLastNextRowIter *state = (SFSLastNextRowIter *)iter;
+  int32_t             code = 0;
+  switch (state->state) {
+    case SFSLASTNEXTROW_FS:
+      state->nFileSet = taosArrayGetSize(state->aDFileSet);
+      state->iFileSet = state->nFileSet;
+      state->pBlockDataL = NULL;
+      // fall through
+    case SFSLASTNEXTROW_FILESET: {
+      SDFileSet *pFileSet = NULL;
+    _next_fileset:
+      if (--state->iFileSet >= 0) {
+        pFileSet = (SDFileSet *)taosArrayGet(state->aDFileSet, state->iFileSet);
+      } else {
+        // all file sets consumed
+        if (state->pBlockDataL) {
+          tBlockDataDestroy(state->pBlockDataL, 1);
+          state->pBlockDataL = NULL;
+        }
+        *ppRow = NULL;
+        return code;
+      }
+      // close the reader of the previously visited file set, otherwise it is overwritten and leaked
+      if (state->pDataFReader) {
+        tsdbDataFReaderClose(&state->pDataFReader);
+        state->pDataFReader = NULL;
+      }
+      code = tsdbDataFReaderOpen(&state->pDataFReader, state->pTsdb, pFileSet);
+      if (code) goto _err;
+      if (!state->aBlockL) {
+        // elements are SBlockL records (see pBlockL / tCmprBlockL), so size the array accordingly
+        state->aBlockL = taosArrayInit(0, sizeof(SBlockL));
+        if (!state->aBlockL) {
+          code = TSDB_CODE_OUT_OF_MEMORY;
+          goto _err;
+        }
+      } else {
+        taosArrayClear(state->aBlockL);
+      }
+      code = tsdbReadBlockL(state->pDataFReader, state->aBlockL);
+      if (code) goto _err;
+      state->pBlockL = taosArraySearch(state->aBlockL, state->pBlockIdxExp, tCmprBlockL, TD_EQ);
+      if (!state->pBlockL) {
+        goto _next_fileset;
+      }
+      int64_t suid = state->pBlockL->suid;
+      int64_t uid = state->pBlockL->maxUid;
+      if (!state->pBlockDataL) {
+        state->pBlockDataL = &state->blockDataL;
+      }
+      code = tBlockDataInit(state->pBlockDataL, suid, suid ? 0 : uid, state->pTSchema);
+      if (code) goto _err;
+    }
+      // fall through
+    case SFSLASTNEXTROW_BLOCKDATA:
+      code = tsdbReadLastBlock(state->pDataFReader, state->pBlockL, state->pBlockDataL);
+      if (code) goto _err;
+      state->nRow = state->blockDataL.nRow;
+      state->iRow = state->nRow - 1;
+      if (!state->pBlockDataL->uid) {
+        // block carries per-row uids: scan back to the last row of the wanted table,
+        // stopping at the start of the block instead of reading aUid[-1]
+        while (state->iRow >= 0 && state->pBlockIdxExp->uid != state->pBlockDataL->aUid[state->iRow]) {
+          --state->iRow;
+        }
+      }
+      if (state->iRow < 0) {
+        // no row of this table in this block: continue with the previous file set rather than
+        // returning without touching *ppRow
+        state->state = SFSLASTNEXTROW_FILESET;
+        goto _next_fileset;
+      }
+      state->state = SFSLASTNEXTROW_BLOCKROW;
+      // fall through
+    case SFSLASTNEXTROW_BLOCKROW:
+      if (state->pBlockDataL->uid) {
+        if (state->iRow >= 0) {
+          state->row = tsdbRowFromBlockData(state->pBlockDataL, state->iRow);
+          *ppRow = &state->row;
+          if (--state->iRow < 0) {
+            state->state = SFSLASTNEXTROW_FILESET;
+          }
+        }
+      } else {
+        if (state->iRow >= 0 && state->pBlockIdxExp->uid == state->pBlockDataL->aUid[state->iRow]) {
+          state->row = tsdbRowFromBlockData(state->pBlockDataL, state->iRow);
+          *ppRow = &state->row;
+          // the next call moves to the previous file set once the table's rows are used up
+          if (--state->iRow < 0 || state->pBlockIdxExp->uid != state->pBlockDataL->aUid[state->iRow]) {
+            state->state = SFSLASTNEXTROW_FILESET;
+          }
+        }
+      }
+      return code;
+    default:
+      ASSERT(0);
+      break;
+  }
+_err:
+  // release everything and report "no row"; code carries the failure, if any
+  if (state->pDataFReader) {
+    tsdbDataFReaderClose(&state->pDataFReader);
+    state->pDataFReader = NULL;
+  }
+  if (state->aBlockL) {
+    taosArrayDestroy(state->aBlockL);
+    state->aBlockL = NULL;
+  }
+  if (state->pBlockDataL) {
+    tBlockDataDestroy(state->pBlockDataL, 1);
+    state->pBlockDataL = NULL;
+  }
+  *ppRow = NULL;
+  return code;
+}
+/**
+ * @brief Release whatever a SFSLastNextRowIter still holds: the open data file reader, the
+ *        block record array and the block data. Each released member is reset to NULL.
+ *
+ * @param iter SFSLastNextRowIter (passed as void *); NULL is accepted and ignored
+ * @return int32_t always 0
+ */
+int32_t clearNextRowFromFSLast(void *iter) {
+  SFSLastNextRowIter *pLastIter = (SFSLastNextRowIter *)iter;
+  if (pLastIter != NULL) {
+    if (pLastIter->pDataFReader != NULL) {
+      tsdbDataFReaderClose(&pLastIter->pDataFReader);
+      pLastIter->pDataFReader = NULL;
+    }
+    if (pLastIter->aBlockL != NULL) {
+      taosArrayDestroy(pLastIter->aBlockL);
+      pLastIter->aBlockL = NULL;
+    }
+    if (pLastIter->pBlockDataL != NULL) {
+      tBlockDataDestroy(pLastIter->pBlockDataL, 1);
+      pLastIter->pBlockDataL = NULL;
+    }
+  }
+  return 0;
+}
 typedef enum SFSNEXTROWSTATES {
  SFSNEXTROW_FS,
  SFSNEXTROW_FILESET,
@@ -451,9 +628,9 @@ static int32_t getNextRowFromFS(void *iter, TSDBROW **ppRow) {
      if (--state->iFileSet >= 0) {
        pFileSet = (SDFileSet *)taosArrayGet(state->aDFileSet, state->iFileSet);
      } else {
-        // tBlockDataClear(&state->blockData, 1);
+        // tBlockDataDestroy(&state->blockData, 1);
        if (state->pBlockData) {
-          tBlockDataClear(state->pBlockData, 1);
+          tBlockDataDestroy(state->pBlockData, 1);
          state->pBlockData = NULL;
        }
@@ -465,13 +642,12 @@ static int32_t getNextRowFromFS(void *iter, TSDBROW **ppRow) {
      if (code) goto _err;
      // tMapDataReset(&state->blockIdxMap);
-      // code = tsdbReadBlockIdx(state->pDataFReader, &state->blockIdxMap, NULL);
      if (!state->aBlockIdx) {
        state->aBlockIdx = taosArrayInit(0, sizeof(SBlockIdx));
      } else {
        taosArrayClear(state->aBlockIdx);
      }
-      code = tsdbReadBlockIdx(state->pDataFReader, state->aBlockIdx, NULL);
+      code = tsdbReadBlockIdx(state->pDataFReader, state->aBlockIdx);
      if (code) goto _err;
      /* if (state->pBlockIdx) { */
@@ -487,8 +663,7 @@ static int32_t getNextRowFromFS(void *iter, TSDBROW **ppRow) {
      }
      tMapDataReset(&state->blockMap);
-      code = tsdbReadBlock(state->pDataFReader, state->pBlockIdx, &state->blockMap, NULL);
+      code = tsdbReadBlock(state->pDataFReader, state->pBlockIdx, &state->blockMap);
-      /* code = tsdbReadBlock(state->pDataFReader, &state->blockIdx, &state->blockMap, NULL); */
      if (code) goto _err;
      state->nBlock = state->blockMap.nItem;
@@ -497,7 +672,7 @@ static int32_t getNextRowFromFS(void *iter, TSDBROW **ppRow) {
      if (!state->pBlockData) {
        state->pBlockData = &state->blockData;
-        tBlockDataInit(&state->blockData);
+        tBlockDataCreate(&state->blockData);
      }
    }
    case SFSNEXTROW_BLOCKDATA:
@@ -510,7 +685,7 @@ static int32_t getNextRowFromFS(void *iter, TSDBROW **ppRow) {
        tMapDataGetItemByIdx(&state->blockMap, state->iBlock, &block, tGetBlock);
        /* code = tsdbReadBlockData(state->pDataFReader, &state->blockIdx, &block, &state->blockData, NULL, NULL); */
-        code = tsdbReadBlockData(state->pDataFReader, state->pBlockIdx, &block, state->pBlockData, NULL, NULL);
+        code = tsdbReadDataBlock(state->pDataFReader, &block, state->pBlockData);
        if (code) goto _err;
        state->nRow = state->blockData.nRow;
@@ -555,8 +730,8 @@ _err:
    state->aBlockIdx = NULL;
  }
  if (state->pBlockData) {
-    // tBlockDataClear(&state->blockData, 1);
+    // tBlockDataDestroy(&state->blockData, 1);
-    tBlockDataClear(state->pBlockData, 1);
+    tBlockDataDestroy(state->pBlockData, 1);
    state->pBlockData = NULL;
  }
@@ -582,8 +757,8 @@ int32_t clearNextRowFromFS(void *iter) {
    state->aBlockIdx = NULL;
  }
  if (state->pBlockData) {
-    // tBlockDataClear(&state->blockData, 1);
+    // tBlockDataDestroy(&state->blockData, 1);
-    tBlockDataClear(state->pBlockData, 1);
+    tBlockDataDestroy(state->pBlockData, 1);
    state->pBlockData = NULL;
  }
@@ -725,18 +900,19 @@ typedef struct {
  SArray *pSkyline;
  int64_t iSkyline;
-  SBlockIdx       idx;
+  SBlockIdx          idx;
-  SMemNextRowIter memState;
+  SMemNextRowIter    memState;
-  SMemNextRowIter imemState;
+  SMemNextRowIter    imemState;
-  SFSNextRowIter  fsState;
+  SFSLastNextRowIter fsLastState;
-  TSDBROW         memRow, imemRow, fsRow;
+  SFSNextRowIter     fsState;
+  TSDBROW            memRow, imemRow, fsLastRow, fsRow;
-  TsdbNextRowState input[3];
+  TsdbNextRowState input[4];
  STsdbReadSnap   *pReadSnap;
  STsdb           *pTsdb;
 } CacheNextRowIter;
-static int32_t nextRowIterOpen(CacheNextRowIter *pIter, tb_uid_t uid, STsdb *pTsdb) {
+static int32_t nextRowIterOpen(CacheNextRowIter *pIter, tb_uid_t uid, STsdb *pTsdb, STSchema *pTSchema) {
  int code = 0;
  tb_uid_t suid = getTableSuidByUid(uid, pTsdb);
@@ -745,12 +921,12 @@ static int32_t nextRowIterOpen(CacheNextRowIter *pIter, tb_uid_t uid, STsdb *pTs
  STbData *pMem = NULL;
  if (pIter->pReadSnap->pMem) {
-    tsdbGetTbDataFromMemTable(pIter->pReadSnap->pMem, suid, uid, &pMem);
+    pMem = tsdbGetTbDataFromMemTable(pIter->pReadSnap->pMem, suid, uid);
  }
  STbData *pIMem = NULL;
  if (pIter->pReadSnap->pIMem) {
-    tsdbGetTbDataFromMemTable(pIter->pReadSnap->pIMem, suid, uid, &pIMem);
+    pIMem = tsdbGetTbDataFromMemTable(pIter->pReadSnap->pIMem, suid, uid);
  }
  pIter->pTsdb = pTsdb;
@@ -763,7 +939,7 @@ static int32_t nextRowIterOpen(CacheNextRowIter *pIter, tb_uid_t uid, STsdb *pTs
  if (pDelFile) {
    SDelFReader *pDelFReader;
-    code = tsdbDelFReaderOpen(&pDelFReader, pDelFile, pTsdb, NULL);
+    code = tsdbDelFReaderOpen(&pDelFReader, pDelFile, pTsdb);
    if (code) goto _err;
    code = getTableDelIdx(pDelFReader, suid, uid, &delIdx);
@@ -782,6 +958,12 @@ static int32_t nextRowIterOpen(CacheNextRowIter *pIter, tb_uid_t uid, STsdb *pTs
  pIter->idx = (SBlockIdx){.suid = suid, .uid = uid};
+  pIter->fsLastState.state = (SFSLASTNEXTROWSTATES) SFSNEXTROW_FS;
+  pIter->fsLastState.pTsdb = pTsdb;
+  pIter->fsLastState.aDFileSet = pIter->pReadSnap->fs.aDFileSet;
+  pIter->fsLastState.pBlockIdxExp = &pIter->idx;
+  pIter->fsLastState.pTSchema = pTSchema;
  pIter->fsState.state = SFSNEXTROW_FS;
  pIter->fsState.pTsdb = pTsdb;
  pIter->fsState.aDFileSet = pIter->pReadSnap->fs.aDFileSet;
@@ -789,7 +971,9 @@ static int32_t nextRowIterOpen(CacheNextRowIter *pIter, tb_uid_t uid, STsdb *pTs
  pIter->input[0] = (TsdbNextRowState){&pIter->memRow, true, false, &pIter->memState, getNextRowFromMem, NULL};
  pIter->input[1] = (TsdbNextRowState){&pIter->imemRow, true, false, &pIter->imemState, getNextRowFromMem, NULL};
-  pIter->input[2] =
+  pIter->input[2] = (TsdbNextRowState){&pIter->fsLastRow,     false, true, &pIter->fsLastState, getNextRowFromFSLast,
+                                       clearNextRowFromFSLast};
+  pIter->input[3] =
      (TsdbNextRowState){&pIter->fsRow, false, true, &pIter->fsState, getNextRowFromFS, clearNextRowFromFS};
  if (pMem) {
@@ -814,7 +998,7 @@ _err:
 static int32_t nextRowIterClose(CacheNextRowIter *pIter) {
  int code = 0;
-  for (int i = 0; i < 3; ++i) {
+  for (int i = 0; i < 4; ++i) {
    if (pIter->input[i].nextRowClearFn) {
      pIter->input[i].nextRowClearFn(pIter->input[i].iter);
    }
@@ -826,7 +1010,6 @@ static int32_t nextRowIterClose(CacheNextRowIter *pIter) {
  tsdbUntakeReadSnap(pIter->pTsdb, pIter->pReadSnap);
-  return code;
 _err:
  return code;
 }
@@ -835,7 +1018,7 @@ _err:
 static int32_t nextRowIterGet(CacheNextRowIter *pIter, TSDBROW **ppRow) {
  int code = 0;
-  for (int i = 0; i < 3; ++i) {
+  for (int i = 0; i < 4; ++i) {
    if (pIter->input[i].next && !pIter->input[i].stop) {
      code = pIter->input[i].nextRowFn(pIter->input[i].iter, &pIter->input[i].pRow);
      if (code) goto _err;
@@ -847,18 +1030,18 @@ static int32_t nextRowIterGet(CacheNextRowIter *pIter, TSDBROW **ppRow) {
    }
  }
-  if (pIter->input[0].stop && pIter->input[1].stop && pIter->input[2].stop) {
+  if (pIter->input[0].stop && pIter->input[1].stop && pIter->input[2].stop && pIter->input[3].stop) {
    *ppRow = NULL;
    return code;
  }
-  // select maxpoint(s) from mem, imem, fs
+  // select maxpoint(s) from mem, imem, fs and last
-  TSDBROW *max[3] = {0};
+  TSDBROW *max[4] = {0};
-  int      iMax[3] = {-1, -1, -1};
+  int      iMax[4] = {-1, -1, -1, -1};
  int      nMax = 0;
  TSKEY    maxKey = TSKEY_MIN;
-  for (int i = 0; i < 3; ++i) {
+  for (int i = 0; i < 4; ++i) {
    if (!pIter->input[i].stop && pIter->input[i].pRow != NULL) {
      TSDBKEY key = TSDBROW_KEY(pIter->input[i].pRow);
@@ -876,13 +1059,13 @@ static int32_t nextRowIterGet(CacheNextRowIter *pIter, TSDBROW **ppRow) {
  }
  // delete detection
-  TSDBROW *merge[3] = {0};
+  TSDBROW *merge[4] = {0};
-  int      iMerge[3] = {-1, -1, -1};
+  int      iMerge[4] = {-1, -1, -1, -1};
  int      nMerge = 0;
  for (int i = 0; i < nMax; ++i) {
-    TSDBKEY maxKey = TSDBROW_KEY(max[i]);
+    TSDBKEY maxKey1 = TSDBROW_KEY(max[i]);
-    bool deleted = tsdbKeyDeleted(&maxKey, pIter->pSkyline, &pIter->iSkyline);
+    bool deleted = tsdbKeyDeleted(&maxKey1, pIter->pSkyline, &pIter->iSkyline);
    if (!deleted) {
      iMerge[nMerge] = iMax[i];
      merge[nMerge++] = max[i];
@@ -918,7 +1101,7 @@ static int32_t mergeLastRow(tb_uid_t uid, STsdb *pTsdb, bool *dup, STSRow **ppRo
  TSKEY lastRowTs = TSKEY_MAX;
  CacheNextRowIter iter = {0};
-  nextRowIterOpen(&iter, uid, pTsdb);
+  nextRowIterOpen(&iter, uid, pTsdb, pTSchema);
  do {
    TSDBROW *pRow = NULL;
@@ -1015,7 +1198,7 @@ static int32_t mergeLast(tb_uid_t uid, STsdb *pTsdb, SArray **ppLastArray) {
  TSKEY lastRowTs = TSKEY_MAX;
  CacheNextRowIter iter = {0};
-  nextRowIterOpen(&iter, uid, pTsdb);
+  nextRowIterOpen(&iter, uid, pTsdb, pTSchema);
  do {
    TSDBROW *pRow = NULL;
@@ -1100,29 +1283,40 @@ int32_t tsdbCacheGetLastrowH(SLRUCache *pCache, tb_uid_t uid, STsdb *pTsdb, LRUH
  //  getTableCacheKeyS(uid, "lr", key, &keyLen);
  getTableCacheKey(uid, 0, key, &keyLen);
  LRUHandle *h = taosLRUCacheLookup(pCache, key, keyLen);
-  if (h) {
+  if (!h) {
-  } else {
+    taosThreadMutexLock(&pTsdb->lruMutex);
-    STSRow *pRow = NULL;
-    bool    dup = false;  // which is always false for now
+    h = taosLRUCacheLookup(pCache, key, keyLen);
-    code = mergeLastRow(uid, pTsdb, &dup, &pRow);
+    if (!h) {
-    // if table's empty or error, return code of -1
+      STSRow *pRow = NULL;
-    if (code < 0 || pRow == NULL) {
+      bool    dup = false;  // which is always false for now
-      if (!dup && pRow) {
+      code = mergeLastRow(uid, pTsdb, &dup, &pRow);
-        taosMemoryFree(pRow);
+      // if table's empty or error, return code of -1
+      if (code < 0 || pRow == NULL) {
+        if (!dup && pRow) {
+          taosMemoryFree(pRow);
+        }
+        taosThreadMutexUnlock(&pTsdb->lruMutex);
+        *handle = NULL;
+        return 0;
      }
-      *handle = NULL;
+      _taos_lru_deleter_t deleter = deleteTableCacheLastrow;
-      return 0;
+      LRUStatus           status =
-    }
+          taosLRUCacheInsert(pCache, key, keyLen, pRow, TD_ROW_LEN(pRow), deleter, NULL, TAOS_LRU_PRIORITY_LOW);
+      if (status != TAOS_LRU_STATUS_OK) {
+        code = -1;
+      }
-    _taos_lru_deleter_t deleter = deleteTableCacheLastrow;
+      taosThreadMutexUnlock(&pTsdb->lruMutex);
-    LRUStatus           status =
-        taosLRUCacheInsert(pCache, key, keyLen, pRow, TD_ROW_LEN(pRow), deleter, NULL, TAOS_LRU_PRIORITY_LOW);
-    if (status != TAOS_LRU_STATUS_OK) {
-      code = -1;
-    }
-    h = taosLRUCacheLookup(pCache, key, keyLen);
+      h = taosLRUCacheLookup(pCache, key, keyLen);
+    } else {
+      taosThreadMutexUnlock(&pTsdb->lruMutex);
+    }
  }
  *handle = h;

--- a/source/dnode/vnode/src/tsdb/tsdbCommit.c
+++ b/source/dnode/vnode/src/tsdb/tsdbCommit.c
--- a/source/dnode/vnode/src/tsdb/tsdbFS.c
+++ b/source/dnode/vnode/src/tsdb/tsdbFS.c
@@ -576,10 +576,7 @@ int32_t tsdbFSCopy(STsdb *pTsdb, STsdbFS *pFS) {
      code = TSDB_CODE_OUT_OF_MEMORY;
      goto _exit;
    }
-    fSet.pHeadF->nRef = 0;
+    *fSet.pHeadF = *pSet->pHeadF;
-    fSet.pHeadF->commitID = pSet->pHeadF->commitID;
-    fSet.pHeadF->size = pSet->pHeadF->size;
-    fSet.pHeadF->offset = pSet->pHeadF->offset;
    // data
    fSet.pDataF = (SDataFile *)taosMemoryMalloc(sizeof(SDataFile));
@@ -587,9 +584,7 @@ int32_t tsdbFSCopy(STsdb *pTsdb, STsdbFS *pFS) {
      code = TSDB_CODE_OUT_OF_MEMORY;
      goto _exit;
    }
-    fSet.pDataF->nRef = 0;
+    *fSet.pDataF = *pSet->pDataF;
-    fSet.pDataF->commitID = pSet->pDataF->commitID;
-    fSet.pDataF->size = pSet->pDataF->size;
    // data
    fSet.pLastF = (SLastFile *)taosMemoryMalloc(sizeof(SLastFile));
@@ -597,9 +592,7 @@ int32_t tsdbFSCopy(STsdb *pTsdb, STsdbFS *pFS) {
      code = TSDB_CODE_OUT_OF_MEMORY;
      goto _exit;
    }
-    fSet.pLastF->nRef = 0;
+    *fSet.pLastF = *pSet->pLastF;
-    fSet.pLastF->commitID = pSet->pLastF->commitID;
-    fSet.pLastF->size = pSet->pLastF->size;
    // last
    fSet.pSmaF = (SSmaFile *)taosMemoryMalloc(sizeof(SSmaFile));
@@ -607,9 +600,7 @@ int32_t tsdbFSCopy(STsdb *pTsdb, STsdbFS *pFS) {
      code = TSDB_CODE_OUT_OF_MEMORY;
      goto _exit;
    }
-    fSet.pSmaF->nRef = 0;
+    *fSet.pSmaF = *pSet->pSmaF;
-    fSet.pSmaF->commitID = pSet->pSmaF->commitID;
-    fSet.pSmaF->size = pSet->pSmaF->size;
    if (taosArrayPush(pFS->aDFileSet, &fSet) == NULL) {
      code = TSDB_CODE_OUT_OF_MEMORY;

--- a/source/dnode/vnode/src/tsdb/tsdbFile.c
+++ b/source/dnode/vnode/src/tsdb/tsdbFile.c
@@ -58,6 +58,7 @@ int32_t tPutLastFile(uint8_t *p, SLastFile *pLastFile) {
  n += tPutI64v(p ? p + n : p, pLastFile->commitID);
  n += tPutI64v(p ? p + n : p, pLastFile->size);
+  n += tPutI64v(p ? p + n : p, pLastFile->offset);
  return n;
 }
@@ -67,6 +68,7 @@ static int32_t tGetLastFile(uint8_t *p, SLastFile *pLastFile) {
  n += tGetI64v(p + n, &pLastFile->commitID);
  n += tGetI64v(p + n, &pLastFile->size);
+  n += tGetI64v(p + n, &pLastFile->offset);
  return n;
 }
@@ -186,11 +188,16 @@ int32_t tPutDFileSet(uint8_t *p, SDFileSet *pSet) {
  n += tPutI32v(p ? p + n : p, pSet->diskId.level);
  n += tPutI32v(p ? p + n : p, pSet->diskId.id);
  n += tPutI32v(p ? p + n : p, pSet->fid);
+  // data
  n += tPutHeadFile(p ? p + n : p, pSet->pHeadF);
  n += tPutDataFile(p ? p + n : p, pSet->pDataF);
-  n += tPutLastFile(p ? p + n : p, pSet->pLastF);
  n += tPutSmaFile(p ? p + n : p, pSet->pSmaF);
+  // last
+  n += tPutU8(p ? p + n : p, 1);  // for future compatibility
+  n += tPutLastFile(p ? p + n : p, pSet->pLastF);
  return n;
 }
@@ -200,11 +207,17 @@ int32_t tGetDFileSet(uint8_t *p, SDFileSet *pSet) {
  n += tGetI32v(p + n, &pSet->diskId.level);
  n += tGetI32v(p + n, &pSet->diskId.id);
  n += tGetI32v(p + n, &pSet->fid);
+  // data
  n += tGetHeadFile(p + n, pSet->pHeadF);
  n += tGetDataFile(p + n, pSet->pDataF);
-  n += tGetLastFile(p + n, pSet->pLastF);
  n += tGetSmaFile(p + n, pSet->pSmaF);
+  // last
+  uint8_t nLast;
+  n += tGetU8(p + n, &nLast);
+  n += tGetLastFile(p + n, pSet->pLastF);
  return n;
 }

--- a/source/dnode/vnode/src/tsdb/tsdbMemTable.c
+++ b/source/dnode/vnode/src/tsdb/tsdbMemTable.c
@@ -15,6 +15,7 @@
 #include "tsdb.h"
+#define MEM_MIN_HASH 1024
 #define SL_MAX_LEVEL 5
 #define SL_NODE_SIZE(l)        (sizeof(SMemSkipListNode) + sizeof(SMemSkipListNode *) * (l)*2)
@@ -45,12 +46,12 @@ int32_t tsdbMemTableCreate(STsdb *pTsdb, SMemTable **ppMemTable) {
  pMemTable->nRef = 1;
  pMemTable->minKey = TSKEY_MAX;
  pMemTable->maxKey = TSKEY_MIN;
-  pMemTable->minVersion = VERSION_MAX;
-  pMemTable->maxVersion = VERSION_MIN;
  pMemTable->nRow = 0;
  pMemTable->nDel = 0;
-  pMemTable->aTbData = taosArrayInit(128, sizeof(STbData *));
+  pMemTable->nTbData = 0;
-  if (pMemTable->aTbData == NULL) {
+  pMemTable->nBucket = MEM_MIN_HASH;
+  pMemTable->aBucket = (STbData **)taosMemoryCalloc(pMemTable->nBucket, sizeof(STbData *));
+  if (pMemTable->aBucket == NULL) {
    code = TSDB_CODE_OUT_OF_MEMORY;
    taosMemoryFree(pMemTable);
    goto _err;
@@ -68,37 +69,30 @@ _err:
 void tsdbMemTableDestroy(SMemTable *pMemTable) {
  if (pMemTable) {
    vnodeBufPoolUnRef(pMemTable->pPool);
-    taosArrayDestroy(pMemTable->aTbData);
+    taosMemoryFree(pMemTable->aBucket);
    taosMemoryFree(pMemTable);
  }
 }
-static int32_t tbDataPCmprFn(const void *p1, const void *p2) {
+static FORCE_INLINE STbData *tsdbGetTbDataFromMemTableImpl(SMemTable *pMemTable, tb_uid_t suid, tb_uid_t uid) {
-  STbData *pTbData1 = *(STbData **)p1;
+  STbData *pTbData = pMemTable->aBucket[TABS(uid) % pMemTable->nBucket];
-  STbData *pTbData2 = *(STbData **)p2;
-  if (pTbData1->suid < pTbData2->suid) {
+  while (pTbData) {
-    return -1;
+    if (pTbData->uid == uid) break;
-  } else if (pTbData1->suid > pTbData2->suid) {
+    pTbData = pTbData->next;
-    return 1;
-  }
-  if (pTbData1->uid < pTbData2->uid) {
-    return -1;
-  } else if (pTbData1->uid > pTbData2->uid) {
-    return 1;
  }
-  return 0;
+  return pTbData;
 }
-void tsdbGetTbDataFromMemTable(SMemTable *pMemTable, tb_uid_t suid, tb_uid_t uid, STbData **ppTbData) {
-  STbData *pTbData = &(STbData){.suid = suid, .uid = uid};
+STbData *tsdbGetTbDataFromMemTable(SMemTable *pMemTable, tb_uid_t suid, tb_uid_t uid) {
+  STbData *pTbData;
  taosRLockLatch(&pMemTable->latch);
-  void *p = taosArraySearch(pMemTable->aTbData, &pTbData, tbDataPCmprFn, TD_EQ);
+  pTbData = tsdbGetTbDataFromMemTableImpl(pMemTable, suid, uid);
  taosRUnLockLatch(&pMemTable->latch);
-  *ppTbData = p ? *(STbData **)p : NULL;
+  return pTbData;
 }
 int32_t tsdbInsertTableData(STsdb *pTsdb, int64_t version, SSubmitMsgIter *pMsgIter, SSubmitBlk *pBlock,
@@ -184,10 +178,6 @@ int32_t tsdbDeleteTableData(STsdb *pTsdb, int64_t version, tb_uid_t suid, tb_uid
    pTbData->pTail = pDelData;
  }
-  // update the state of pMemTable and other (todo)
-  pMemTable->minVersion = TMIN(pMemTable->minVersion, version);
-  pMemTable->maxVersion = TMAX(pMemTable->maxVersion, version);
  pMemTable->nDel++;
  if (TSDB_CACHE_LAST_ROW(pMemTable->pTsdb->pVnode->config) && tsdbKeyCmprFn(&lastKey, &pTbData->maxKey) >= 0) {
@@ -320,18 +310,44 @@ _exit:
  return pIter->pRow;
 }
+static int32_t tsdbMemTableRehash(SMemTable *pMemTable) {
+  int32_t code = 0;
+  int32_t   nBucket = pMemTable->nBucket * 2;
+  STbData **aBucket = (STbData **)taosMemoryCalloc(nBucket, sizeof(STbData *));
+  if (aBucket == NULL) {
+    code = TSDB_CODE_OUT_OF_MEMORY;
+    goto _exit;
+  }
+  for (int32_t iBucket = 0; iBucket < pMemTable->nBucket; iBucket++) {
+    STbData *pTbData = pMemTable->aBucket[iBucket];
+    while (pTbData) {
+      STbData *pNext = pTbData->next;
+      int32_t idx = TABS(pTbData->uid) % nBucket;
+      pTbData->next = aBucket[idx];
+      aBucket[idx] = pTbData;
+      pTbData = pNext;
+    }
+  }
+  taosMemoryFree(pMemTable->aBucket);
+  pMemTable->nBucket = nBucket;
+  pMemTable->aBucket = aBucket;
+_exit:
+  return code;
+}
 static int32_t tsdbGetOrCreateTbData(SMemTable *pMemTable, tb_uid_t suid, tb_uid_t uid, STbData **ppTbData) {
-  int32_t  code = 0;
+  int32_t code = 0;
-  int32_t  idx = 0;
-  STbData *pTbData = NULL;
-  STbData *pTbDataT = &(STbData){.suid = suid, .uid = uid};
  // get
-  idx = taosArraySearchIdx(pMemTable->aTbData, &pTbDataT, tbDataPCmprFn, TD_GE);
+  STbData *pTbData = tsdbGetTbDataFromMemTableImpl(pMemTable, suid, uid);
-  if (idx >= 0) {
+  if (pTbData) goto _exit;
-    pTbData = (STbData *)taosArrayGetP(pMemTable->aTbData, idx);
-    if (tbDataPCmprFn(&pTbDataT, &pTbData) == 0) goto _exit;
-  }
  // create
  SVBufPool *pPool = pMemTable->pTsdb->pVnode->inUse;
@@ -346,9 +362,6 @@ static int32_t tsdbGetOrCreateTbData(SMemTable *pMemTable, tb_uid_t suid, tb_uid
  pTbData->uid = uid;
  pTbData->minKey = TSKEY_MAX;
  pTbData->maxKey = TSKEY_MIN;
-  pTbData->minVersion = VERSION_MAX;
-  pTbData->maxVersion = VERSION_MIN;
-  pTbData->maxSkmVer = -1;
  pTbData->pHead = NULL;
  pTbData->pTail = NULL;
  pTbData->sl.seed = taosRand();
@@ -367,21 +380,23 @@ static int32_t tsdbGetOrCreateTbData(SMemTable *pMemTable, tb_uid_t suid, tb_uid
    SL_NODE_FORWARD(pTbData->sl.pTail, iLevel) = NULL;
  }
-  void *p;
+  taosWLockLatch(&pMemTable->latch);
-  if (idx < 0) {
-    idx = taosArrayGetSize(pMemTable->aTbData);
+  if (pMemTable->nTbData >= pMemTable->nBucket) {
+    code = tsdbMemTableRehash(pMemTable);
+    if (code) {
+      taosWUnLockLatch(&pMemTable->latch);
+      goto _err;
+    }
  }
-  taosWLockLatch(&pMemTable->latch);
+  int32_t idx = TABS(uid) % pMemTable->nBucket;
-  p = taosArrayInsert(pMemTable->aTbData, idx, &pTbData);
+  pTbData->next = pMemTable->aBucket[idx];
-  taosWUnLockLatch(&pMemTable->latch);
+  pMemTable->aBucket[idx] = pTbData;
+  pMemTable->nTbData++;
-  tsdbDebug("vgId:%d, add table data %p at idx:%d", TD_VID(pMemTable->pTsdb->pVnode), pTbData, idx);
+  taosWUnLockLatch(&pMemTable->latch);
-  if (p == NULL) {
-    code = TSDB_CODE_OUT_OF_MEMORY;
-    goto _err;
-  }
 _exit:
  *ppTbData = pTbData;
  return code;
@@ -591,15 +606,9 @@ static int32_t tsdbInsertTableDataImpl(SMemTable *pMemTable, STbData *pTbData, i
    tsdbCacheInsertLast(pMemTable->pTsdb->lruCache, pTbData->uid, pLastRow, pMemTable->pTsdb);
  }
-  pTbData->minVersion = TMIN(pTbData->minVersion, version);
-  pTbData->maxVersion = TMAX(pTbData->maxVersion, version);
-  pTbData->maxSkmVer = TMAX(pTbData->maxSkmVer, pMsgIter->sversion);
  // SMemTable
  pMemTable->minKey = TMIN(pMemTable->minKey, pTbData->minKey);
  pMemTable->maxKey = TMAX(pMemTable->maxKey, pTbData->maxKey);
-  pMemTable->minVersion = TMIN(pMemTable->minVersion, pTbData->minVersion);
-  pMemTable->maxVersion = TMAX(pMemTable->maxVersion, pTbData->maxVersion);
  pMemTable->nRow += nRow;
  pRsp->numOfRows = nRow;
@@ -624,3 +633,41 @@ void tsdbUnrefMemTable(SMemTable *pMemTable) {
    tsdbMemTableDestroy(pMemTable);
  }
 }
+static FORCE_INLINE int32_t tbDataPCmprFn(const void *p1, const void *p2) {
+  STbData *pTbData1 = *(STbData **)p1;
+  STbData *pTbData2 = *(STbData **)p2;
+  if (pTbData1->suid < pTbData2->suid) {
+    return -1;
+  } else if (pTbData1->suid > pTbData2->suid) {
+    return 1;
+  }
+  if (pTbData1->uid < pTbData2->uid) {
+    return -1;
+  } else if (pTbData1->uid > pTbData2->uid) {
+    return 1;
+  }
+  return 0;
+}
+SArray *tsdbMemTableGetTbDataArray(SMemTable *pMemTable) {
+  SArray *aTbDataP = taosArrayInit(pMemTable->nTbData, sizeof(STbData *));
+  if (aTbDataP == NULL) goto _exit;
+  for (int32_t iBucket = 0; iBucket < pMemTable->nBucket; iBucket++) {
+    STbData *pTbData = pMemTable->aBucket[iBucket];
+    while (pTbData) {
+      taosArrayPush(aTbDataP, &pTbData);
+      pTbData = pTbData->next;
+    }
+  }
+  taosArraySort(aTbDataP, tbDataPCmprFn);
+_exit:
+  return aTbDataP;
+}
--- a/source/dnode/vnode/src/tsdb/tsdbOpen.c
+++ b/source/dnode/vnode/src/tsdb/tsdbOpen.c
@@ -86,7 +86,7 @@ int tsdbClose(STsdb **pTsdb) {
  if (*pTsdb) {
    taosThreadRwlockDestroy(&(*pTsdb)->rwLock);
    tsdbFSClose(*pTsdb);
-    tsdbCloseCache((*pTsdb)->lruCache);
+    tsdbCloseCache(*pTsdb);
    taosMemoryFreeClear(*pTsdb);
  }
  return 0;

--- a/source/dnode/vnode/src/tsdb/tsdbRead.c
+++ b/source/dnode/vnode/src/tsdb/tsdbRead.c
--- a/source/dnode/vnode/src/tsdb/tsdbReaderWriter.c
+++ b/source/dnode/vnode/src/tsdb/tsdbReaderWriter.c
--- a/source/dnode/vnode/src/tsdb/tsdbSnapshot.c
+++ b/source/dnode/vnode/src/tsdb/tsdbSnapshot.c
--- a/source/dnode/vnode/src/tsdb/tsdbUtil.c
+++ b/source/dnode/vnode/src/tsdb/tsdbUtil.c
--- a/source/dnode/vnode/src/vnd/vnodeBufPool.c
+++ b/source/dnode/vnode/src/vnd/vnodeBufPool.c
@@ -78,7 +78,7 @@ void vnodeBufPoolReset(SVBufPool *pPool) {
 void *vnodeBufPoolMalloc(SVBufPool *pPool, int size) {
  SVBufPoolNode *pNode;
  void          *p;
+  taosThreadSpinLock(&pPool->lock);
  if (pPool->node.size >= pPool->ptr - pPool->node.data + size) {
    // allocate from the anchor node
    p = pPool->ptr;
@@ -89,6 +89,7 @@ void *vnodeBufPoolMalloc(SVBufPool *pPool, int size) {
    pNode = taosMemoryMalloc(sizeof(*pNode) + size);
    if (pNode == NULL) {
      terrno = TSDB_CODE_OUT_OF_MEMORY;
+      taosThreadSpinUnlock(&pPool->lock);
      return NULL;
    }
@@ -101,7 +102,7 @@ void *vnodeBufPoolMalloc(SVBufPool *pPool, int size) {
    pPool->size = pPool->size + sizeof(*pNode) + size;
  }
+  taosThreadSpinUnlock(&pPool->lock);
  return p;
 }
@@ -129,6 +130,12 @@ static int vnodeBufPoolCreate(SVnode *pVnode, int64_t size, SVBufPool **ppPool)
    return -1;
  }
+  if (taosThreadSpinInit(&pPool->lock, 0) != 0) {
+    taosMemoryFree(pPool);
+    terrno = TAOS_SYSTEM_ERROR(errno);
+    return -1;
+  }
  pPool->next = NULL;
  pPool->pVnode = pVnode;
  pPool->nRef = 0;
@@ -145,6 +152,7 @@ static int vnodeBufPoolCreate(SVnode *pVnode, int64_t size, SVBufPool **ppPool)
 static int vnodeBufPoolDestroy(SVBufPool *pPool) {
  vnodeBufPoolReset(pPool);
+  taosThreadSpinDestroy(&pPool->lock);
  taosMemoryFree(pPool);
  return 0;
 }

--- a/source/dnode/vnode/src/vnd/vnodeCommit.c
+++ b/source/dnode/vnode/src/vnd/vnodeCommit.c
@@ -220,9 +220,6 @@ int vnodeCommit(SVnode *pVnode) {
  vInfo("vgId:%d, start to commit, commit ID:%" PRId64 " version:%" PRId64, TD_VID(pVnode), pVnode->state.commitID,
        pVnode->state.applied);
-  vnodeBufPoolUnRef(pVnode->inUse);
-  pVnode->inUse = NULL;
  pVnode->state.commitTerm = pVnode->state.applyTerm;
  // save info
@@ -239,7 +236,13 @@ int vnodeCommit(SVnode *pVnode) {
  // preCommit
  // smaSyncPreCommit(pVnode->pSma);
-  smaAsyncPreCommit(pVnode->pSma);
+  if(smaAsyncPreCommit(pVnode->pSma) < 0){
+    ASSERT(0);
+    return -1;
+  }
+  vnodeBufPoolUnRef(pVnode->inUse);
+  pVnode->inUse = NULL;
  // commit each sub-system
  if (metaCommit(pVnode->pMeta) < 0) {
@@ -248,7 +251,10 @@ int vnodeCommit(SVnode *pVnode) {
  }
  if (VND_IS_RSMA(pVnode)) {
-    smaAsyncCommit(pVnode->pSma);
+    if (smaAsyncCommit(pVnode->pSma) < 0) {
+      ASSERT(0);
+      return -1;
+    }
    if (tsdbCommit(VND_RSMA0(pVnode)) < 0) {
      ASSERT(0);
@@ -285,7 +291,10 @@ int vnodeCommit(SVnode *pVnode) {
  // postCommit
  // smaSyncPostCommit(pVnode->pSma);
-  smaAsyncPostCommit(pVnode->pSma);
+  if (smaAsyncPostCommit(pVnode->pSma) < 0) {
+    ASSERT(0);
+    return -1;
+  }
  // apply the commit (TODO)
  walEndSnapshot(pVnode->pWal);

--- a/source/dnode/vnode/src/vnd/vnodeOpen.c
+++ b/source/dnode/vnode/src/vnd/vnodeOpen.c
@@ -87,6 +87,7 @@ SVnode *vnodeOpen(const char *path, STfs *pTfs, SMsgCb msgCb) {
  pVnode->msgCb = msgCb;
  taosThreadMutexInit(&pVnode->lock, NULL);
  pVnode->blocked = false;
+  pVnode->inClose = false;
  tsem_init(&pVnode->syncSem, 0, 0);
  tsem_init(&(pVnode->canCommit), 0, 1);
@@ -181,6 +182,8 @@ _err:
 void vnodePreClose(SVnode *pVnode) {
  if (pVnode) {
    syncLeaderTransfer(pVnode->sync);
+    pVnode->inClose = true;
+    smaPreClose(pVnode->pSma);
  }
 }

--- a/source/dnode/vnode/src/vnd/vnodeQuery.c
+++ b/source/dnode/vnode/src/vnd/vnodeQuery.c
@@ -424,6 +424,25 @@ int32_t vnodeGetCtbIdList(SVnode *pVnode, int64_t suid, SArray *list) {
  return TSDB_CODE_SUCCESS;
 }
+int32_t vnodeGetStbIdList(SVnode* pVnode, int64_t suid, SArray* list) {
+  SMStbCursor* pCur = metaOpenStbCursor(pVnode->pMeta, suid);
+  if (!pCur) {
+    return TSDB_CODE_FAILED;
+  }
+  while (1) {
+    tb_uid_t id = metaStbCursorNext(pCur);
+    if (id == 0) {
+      break;
+    }
+    taosArrayPush(list, &id);
+  }
+  metaCloseStbCursor(pCur);
+  return TSDB_CODE_SUCCESS;
+}
 int32_t vnodeGetCtbNum(SVnode *pVnode, int64_t suid, int64_t *num) {
  SMCtbCursor *pCur = metaOpenCtbCursor(pVnode->pMeta, suid);
  if (!pCur) {

--- a/source/dnode/vnode/src/vnd/vnodeSvr.c
+++ b/source/dnode/vnode/src/vnd/vnodeSvr.c
@@ -301,8 +301,8 @@ int32_t vnodeProcessQueryMsg(SVnode *pVnode, SRpcMsg *pMsg) {
      return qWorkerProcessQueryMsg(&handle, pVnode->pQuery, pMsg, 0);
    case TDMT_SCH_QUERY_CONTINUE:
      return qWorkerProcessCQueryMsg(&handle, pVnode->pQuery, pMsg, 0);
-    case TDMT_VND_FETCH_RSMA:
+    case TDMT_VND_EXEC_RSMA:
-      return smaProcessFetch(pVnode->pSma, pMsg);
+      return smaProcessExec(pVnode->pSma, pMsg);
    default:
      vError("unknown msg type:%d in query queue", pMsg->msgType);
      return TSDB_CODE_VND_APP_ERROR;
@@ -530,7 +530,9 @@ static int32_t vnodeProcessCreateTbReq(SVnode *pVnode, int64_t version, void *pR
  }
  tqUpdateTbUidList(pVnode->pTq, tbUids, true);
-  tdUpdateTbUidList(pVnode->pSma, pStore);
+  if (tdUpdateTbUidList(pVnode->pSma, pStore) < 0) {
+    goto _exit;
+  }
  tdUidStoreFree(pStore);
  // prepare rsp

--- a/source/libs/executor/inc/executorimpl.h
+++ b/source/libs/executor/inc/executorimpl.h
@@ -205,7 +205,7 @@ typedef struct SExprSupp {
 } SExprSupp;
 typedef struct SOperatorInfo {
-  uint8_t                operatorType;
+  uint16_t               operatorType;
  bool                   blocking;  // block operator or not
  uint8_t                status;    // denote if current operator is completed
  char*                  name;      // name, for debug purpose
@@ -434,7 +434,7 @@ typedef struct SStreamAggSupporter {
 typedef struct SessionWindowSupporter {
  SStreamAggSupporter* pStreamAggSup;
  int64_t              gap;
-  uint8_t              parentType;
+  uint16_t             parentType;
  SAggSupporter*       pIntervalAggSup;
 } SessionWindowSupporter;
@@ -924,9 +924,6 @@ SOperatorInfo* createMergeAlignedIntervalOperatorInfo(SOperatorInfo* downstream,
 SOperatorInfo* createStreamFinalIntervalOperatorInfo(SOperatorInfo* downstream, SPhysiNode* pPhyNode,
                                                     SExecTaskInfo* pTaskInfo, int32_t numOfChild);
-SOperatorInfo* createStreamIntervalOperatorInfo(SOperatorInfo* downstream, SExprInfo* pExprInfo, int32_t numOfCols,
-                                                SSDataBlock* pResBlock, SInterval* pInterval, int32_t primaryTsSlotId,
-                                                STimeWindowAggSupp *pTwAggSupp, SExecTaskInfo* pTaskInfo);
 SOperatorInfo* createSessionAggOperatorInfo(SOperatorInfo* downstream, SSessionWinodwPhysiNode* pSessionNode,
                                            SExecTaskInfo* pTaskInfo);
 SOperatorInfo* createGroupOperatorInfo(SOperatorInfo* downstream, SExprInfo* pExprInfo, int32_t numOfCols,

--- a/source/libs/executor/inc/tsimplehash.h
+++ b/source/libs/executor/inc/tsimplehash.h
@@ -17,7 +17,6 @@
 #define TDENGINE_TSIMPLEHASH_H
 #include "tarray.h"
-#include "tlockfree.h"
 #ifdef __cplusplus
 extern "C" {
@@ -27,6 +26,10 @@ typedef uint32_t (*_hash_fn_t)(const char *, uint32_t);
 typedef int32_t (*_equal_fn_t)(const void *, const void *, size_t len);
 typedef void (*_hash_free_fn_t)(void *);
+/**
+ * @brief single thread hash
+ * 
+ */
 typedef struct SSHashObj SSHashObj;
 /**
@@ -36,7 +39,7 @@ typedef struct SSHashObj SSHashObj;
 * @param fn          hash function to generate the hash value
 * @return
 */
-SSHashObj *tSimpleHashInit(size_t capacity, _hash_fn_t fn, size_t keyLen, size_t dataLen);
+SSHashObj *tSimpleHashInit(size_t capacity, _hash_fn_t fn);
 /**
 * return the size of hash table
@@ -48,22 +51,26 @@ int32_t tSimpleHashGetSize(const SSHashObj *pHashObj);
 int32_t tSimpleHashPrint(const SSHashObj *pHashObj);
 /**
- * put element into hash table, if the element with the same key exists, update it
+ * @brief put element into hash table, if the element with the same key exists, update it
- * @param pHashObj
+ * 
- * @param key
+ * @param pHashObj 
- * @param data
+ * @param key 
- * @return
+ * @param keyLen 
+ * @param data 
+ * @param dataLen 
+ * @return int32_t 
 */
-int32_t tSimpleHashPut(SSHashObj *pHashObj, const void *key, const void *data);
+int32_t tSimpleHashPut(SSHashObj *pHashObj, const void *key, size_t keyLen, const void *data, size_t dataLen);
 /**
 * return the payload data with the specified key
 *
 * @param pHashObj
 * @param key
+ * @param keyLen
 * @return
 */
-void *tSimpleHashGet(SSHashObj *pHashObj, const void *key);
+void *tSimpleHashGet(SSHashObj *pHashObj, const void *key, size_t keyLen);
 /**
 * remove item with the specified key
@@ -71,7 +78,7 @@ void *tSimpleHashGet(SSHashObj *pHashObj, const void *key);
 * @param key
 * @param keyLen
 */
-int32_t tSimpleHashRemove(SSHashObj *pHashObj, const void *key);
+int32_t tSimpleHashRemove(SSHashObj *pHashObj, const void *key, size_t keyLen);
 /**
 * Clear the hash table.
@@ -98,7 +105,7 @@ size_t tSimpleHashGetMemSize(const SSHashObj *pHashObj);
 * @param keyLen
 * @return
 */
-void *tSimpleHashGetKey(const SSHashObj* pHashObj, void *data, size_t* keyLen);
+void *tSimpleHashGetKey(void *data, size_t* keyLen);
 /**
 * Create the hash table iterator
@@ -109,7 +116,18 @@ void *tSimpleHashGetKey(const SSHashObj* pHashObj, void *data, size_t* keyLen);
 */
 void *tSimpleHashIterate(const SSHashObj *pHashObj, void *data, int32_t *iter);
+/**
+ * Create the hash table iterator
+ * 
+ * @param pHashObj 
+ * @param data 
+ * @param key 
+ * @param iter 
+ * @return void* 
+ */
+void *tSimpleHashIterateKV(const SSHashObj *pHashObj, void *data, void **key, int32_t *iter);
 #ifdef __cplusplus
 }
 #endif
 #endif  // TDENGINE_TSIMPLEHASH_H
\ No newline at end of file
--- a/source/libs/executor/src/executil.c
+++ b/source/libs/executor/src/executil.c
@@ -429,7 +429,9 @@ static SColumnInfoData* getColInfoResult(void* metaHandle, uint64_t suid, SArray
  for (int32_t i = 0; i < rows; i++) {
    int64_t* uid = taosArrayGet(uidList, i);
    void* tag = taosHashGet(tags, uid, sizeof(int64_t));
-    ASSERT(tag);
+    if (suid != 0) {
+      ASSERT(tag);
+    }
    for(int32_t j = 0; j < taosArrayGetSize(pResBlock->pDataBlock); j++){
      SColumnInfoData* pColInfo = (SColumnInfoData*)taosArrayGet(pResBlock->pDataBlock, j);
@@ -533,7 +535,9 @@ int32_t getTableList(void* metaHandle, void* pVnode, SScanPhysiNode* pScanNode,
      vnodeGetCtbIdList(pVnode, pScanNode->suid, res);
    }
  } else {  // Create one table group.
-    taosArrayPush(res, &tableUid);
+    if(metaIsTableExist(metaHandle, tableUid)){
+      taosArrayPush(res, &tableUid);
+    }
  }
  if (pTagCond) {
@@ -599,7 +603,10 @@ size_t getTableTagsBufLen(const SNodeList* pGroups) {
 int32_t getGroupIdFromTagsVal(void* pMeta, uint64_t uid, SNodeList* pGroupNode, char* keyBuf, uint64_t* pGroupId) {
  SMetaReader mr = {0};
  metaReaderInit(&mr, pMeta, 0);
-  metaGetTableEntryByUid(&mr, uid);
+  if(metaGetTableEntryByUid(&mr, uid) != 0){    // table not exist
+    metaReaderClear(&mr);
+    return TSDB_CODE_PAR_TABLE_NOT_EXIST;
+  }
  SNodeList* groupNew = nodesCloneList(pGroupNode);

--- a/source/libs/executor/src/executor.c
+++ b/source/libs/executor/src/executor.c
--- a/source/libs/executor/src/executorimpl.c
+++ b/source/libs/executor/src/executorimpl.c
--- a/source/libs/executor/src/scanoperator.c
+++ b/source/libs/executor/src/scanoperator.c
--- a/source/libs/executor/src/timewindowoperator.c
+++ b/source/libs/executor/src/timewindowoperator.c
--- a/source/libs/executor/src/tsimplehash.c
+++ b/source/libs/executor/src/tsimplehash.c
--- a/source/libs/executor/test/tSimpleHashTests.cpp
+++ b/source/libs/executor/test/tSimpleHashTests.cpp
--- a/source/libs/function/src/builtins.c
+++ b/source/libs/function/src/builtins.c
--- a/source/libs/function/src/builtinsimpl.c
+++ b/source/libs/function/src/builtinsimpl.c
--- a/source/libs/nodes/src/nodesCodeFuncs.c
+++ b/source/libs/nodes/src/nodesCodeFuncs.c
@@ -4673,7 +4673,6 @@ static int32_t jsonToNode(const SJson* pJson, void* pObj) {
  int32_t code;
  tjsonGetNumberValue(pJson, jkNodeType, pNode->type, code);
-  ;
  if (TSDB_CODE_SUCCESS == code) {
    code = tjsonToObject(pJson, nodesNodeName(pNode->type), jsonToSpecificNode, pNode);
    if (TSDB_CODE_SUCCESS != code) {

--- a/source/libs/nodes/src/nodesToSQLFuncs.c
+++ b/source/libs/nodes/src/nodesToSQLFuncs.c
--- a/source/libs/parser/src/parInsert.c
+++ b/source/libs/parser/src/parInsert.c
@@ -143,9 +143,9 @@ static int32_t createSName(SName* pName, SToken* pTableName, int32_t acctId, con
    }
    char name[TSDB_DB_FNAME_LEN] = {0};
    strncpy(name, pTableName->z, dbLen);
-    dbLen = strdequote(name);
+    int32_t actualDbLen = strdequote(name);
-    code = tNameSetDbName(pName, acctId, name, dbLen);
+    code = tNameSetDbName(pName, acctId, name, actualDbLen);
    if (code != TSDB_CODE_SUCCESS) {
      return buildInvalidOperationMsg(pMsgBuf, msg1);
    }

--- a/source/libs/parser/src/parTranslater.c
+++ b/source/libs/parser/src/parTranslater.c
--- a/source/libs/scalar/inc/filterInt.h
+++ b/source/libs/scalar/inc/filterInt.h
@@ -350,7 +350,6 @@ struct SFilterInfo {
 extern bool filterDoCompare(__compar_fn_t func, uint8_t optr, void *left, void *right);
 extern __compar_fn_t filterGetCompFunc(int32_t type, int32_t optr);
-extern OptrStr gOptrStr[];
 #ifdef __cplusplus
 }

--- a/source/libs/scalar/src/filter.c
+++ b/source/libs/scalar/src/filter.c
--- a/source/libs/scalar/test/scalar/scalarTests.cpp
+++ b/source/libs/scalar/test/scalar/scalarTests.cpp
--- a/source/libs/scheduler/src/schRemote.c
+++ b/source/libs/scheduler/src/schRemote.c
--- a/source/libs/stream/inc/streamInc.h
+++ b/source/libs/stream/inc/streamInc.h
--- a/source/libs/stream/src/stream.c
+++ b/source/libs/stream/src/stream.c
--- a/source/libs/stream/src/streamMeta.c
+++ b/source/libs/stream/src/streamMeta.c
--- a/source/libs/stream/src/streamQueue.c
+++ b/source/libs/stream/src/streamQueue.c
--- a/source/libs/stream/src/streamTask.c
+++ b/source/libs/stream/src/streamTask.c
--- a/source/libs/sync/inc/syncInt.h
+++ b/source/libs/sync/inc/syncInt.h
--- a/source/libs/sync/src/syncCommit.c
+++ b/source/libs/sync/src/syncCommit.c
--- a/source/libs/sync/src/syncMain.c
+++ b/source/libs/sync/src/syncMain.c
--- a/source/libs/tdb/src/db/tdbPCache.c
+++ b/source/libs/tdb/src/db/tdbPCache.c
--- a/source/libs/transport/inc/transComm.h
+++ b/source/libs/transport/inc/transComm.h
--- a/source/libs/transport/src/thttp.c
+++ b/source/libs/transport/src/thttp.c
--- a/source/libs/transport/src/transCli.c
+++ b/source/libs/transport/src/transCli.c
--- a/source/libs/transport/src/transComm.c
+++ b/source/libs/transport/src/transComm.c
--- a/source/libs/transport/src/transSvr.c
+++ b/source/libs/transport/src/transSvr.c
--- a/source/util/src/tconfig.c
+++ b/source/util/src/tconfig.c
--- a/source/util/src/terror.c
+++ b/source/util/src/terror.c
--- a/source/util/src/tqueue.c
+++ b/source/util/src/tqueue.c
--- a/source/util/src/trbtree.c
+++ b/source/util/src/trbtree.c
--- a/tests/script/tsim/alter/cached_schema_after_alter.sim
+++ b/tests/script/tsim/alter/cached_schema_after_alter.sim
--- a/tests/script/tsim/alter/dnode.sim
+++ b/tests/script/tsim/alter/dnode.sim
--- a/tests/script/tsim/alter/table.sim
+++ b/tests/script/tsim/alter/table.sim
--- a/tests/script/tsim/bnode/basic1.sim
+++ b/tests/script/tsim/bnode/basic1.sim
--- a/tests/script/tsim/compress/commitlog.sim
+++ b/tests/script/tsim/compress/commitlog.sim
--- a/tests/script/tsim/compress/compress.sim
+++ b/tests/script/tsim/compress/compress.sim
--- a/tests/script/tsim/compress/compress2.sim
+++ b/tests/script/tsim/compress/compress2.sim
--- a/tests/script/tsim/compress/uncompress.sim
+++ b/tests/script/tsim/compress/uncompress.sim
--- a/tests/script/tsim/db/alter_option.sim
+++ b/tests/script/tsim/db/alter_option.sim
--- a/tests/script/tsim/db/alter_replica_13.sim
+++ b/tests/script/tsim/db/alter_replica_13.sim
--- a/tests/script/tsim/db/alter_replica_31.sim
+++ b/tests/script/tsim/db/alter_replica_31.sim
--- a/tests/script/tsim/db/back_insert.sim
+++ b/tests/script/tsim/db/back_insert.sim
--- a/tests/script/tsim/db/basic1.sim
+++ b/tests/script/tsim/db/basic1.sim
--- a/tests/script/tsim/db/basic2.sim
+++ b/tests/script/tsim/db/basic2.sim
--- a/tests/script/tsim/db/basic3.sim
+++ b/tests/script/tsim/db/basic3.sim
--- a/tests/script/tsim/db/basic4.sim
+++ b/tests/script/tsim/db/basic4.sim
--- a/tests/script/tsim/db/basic5.sim
+++ b/tests/script/tsim/db/basic5.sim
--- a/tests/script/tsim/db/basic6.sim
+++ b/tests/script/tsim/db/basic6.sim
--- a/tests/script/tsim/db/commit.sim
+++ b/tests/script/tsim/db/commit.sim
--- a/tests/script/tsim/db/delete_reuse1.sim
+++ b/tests/script/tsim/db/delete_reuse1.sim
--- a/tests/script/tsim/db/delete_reuse2.sim
+++ b/tests/script/tsim/db/delete_reuse2.sim
--- a/tests/script/tsim/db/delete_reusevnode.sim
+++ b/tests/script/tsim/db/delete_reusevnode.sim
--- a/tests/script/tsim/db/delete_reusevnode2.sim
+++ b/tests/script/tsim/db/delete_reusevnode2.sim
--- a/tests/script/tsim/db/delete_writing1.sim
+++ b/tests/script/tsim/db/delete_writing1.sim
--- a/tests/script/tsim/db/delete_writing2.sim
+++ b/tests/script/tsim/db/delete_writing2.sim
--- a/tests/script/tsim/db/dropdnodes.sim
+++ b/tests/script/tsim/db/dropdnodes.sim
--- a/tests/script/tsim/db/keep.sim
+++ b/tests/script/tsim/db/keep.sim
--- a/tests/script/tsim/db/len.sim
+++ b/tests/script/tsim/db/len.sim
--- a/tests/script/tsim/db/repeat.sim
+++ b/tests/script/tsim/db/repeat.sim
--- a/tests/script/tsim/db/show_create_db.sim
+++ b/tests/script/tsim/db/show_create_db.sim
--- a/tests/script/tsim/db/show_create_table.sim
+++ b/tests/script/tsim/db/show_create_table.sim
--- a/tests/script/tsim/db/tables.sim
+++ b/tests/script/tsim/db/tables.sim
--- a/tests/script/tsim/parser/fill_stb.sim
+++ b/tests/script/tsim/parser/fill_stb.sim
--- a/tests/script/tsim/parser/function.sim
+++ b/tests/script/tsim/parser/function.sim
--- a/tests/script/tsim/sma/rsmaPersistenceRecovery.sim
+++ b/tests/script/tsim/sma/rsmaPersistenceRecovery.sim
--- a/tests/script/tsim/stream/basic0.sim
+++ b/tests/script/tsim/stream/basic0.sim
--- a/tests/system-test/1-insert/time_range_wise.py
+++ b/tests/system-test/1-insert/time_range_wise.py
--- a/tests/system-test/2-query/interp.py
+++ b/tests/system-test/2-query/interp.py
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt