提交 1a7b4c83 编写于 作者: D Denis Smirnov 提交者: Chris Hajas

Replace list() with set() validation in analyzedb

After testing analyzedb on a huge database with 170k tables, we
found a bottleneck while printing the list of candidate tables to analyze.
It took about 45 minutes to print all tables. The bottleneck was
the O(n^2) complexity of validating candidates in a loop against
a list() instead of a set(). The same O(n^2) validation is performed while
running analyze commands on the executor pool.
This commit changes the candidate type from list() to set() to reduce
the complexity from O(n^2) to O(n).
上级 c18bb772
......@@ -24,6 +24,7 @@ from datetime import datetime
from contextlib import closing
import pipes # for shell-quoting, pipes.quote()
import fcntl
import itertools
try:
import pg
......@@ -616,12 +617,12 @@ class AnalyzeDb(Operation):
3. skip views and external tables
"""
qresult = run_sql(self.conn, GET_MID_LEVEL_PARTITIONS_SQL)
mid_level_partitions = []
mid_level_partitions = set()
for schema_tbl in qresult:
tup = (schema_tbl[0], schema_tbl[1])
mid_level_partitions.append(tup)
mid_level_partitions.add(tup)
ret = []
ret = set()
for can in candidates:
schema = can[0]
table = can[1]
......@@ -632,16 +633,16 @@ class AnalyzeDb(Operation):
if can in mid_level_partitions:
logger.warning("Skipping mid-level partition %s.%s" % (schema, table))
else:
ret.append(can)
ret.add(can)
if self.config_file is not None or self.single_table is not None:
valid_tables = []
valid_tables = set()
if len(ret) > 0:
oid_str = get_oid_str(ret)
qresult = run_sql(self.conn, GET_VALID_DATA_TABLES_SQL % oid_str)
for schema_tbl in qresult:
tup = (schema_tbl[0], schema_tbl[1])
valid_tables.append(tup)
valid_tables.add(tup)
return valid_tables
return ret
......@@ -856,7 +857,7 @@ class AnalyzeDb(Operation):
2. The leaf partitions (if range partitioned, especially by date) will be ordered in descending
order of the partition key, so that newer partitions can be analyzed first.
"""
candidate_regclass_str = get_oid_str(candidates + list(root_partition_col_dict.keys()))
candidate_regclass_str = get_oid_str(itertools.chain(candidates, root_partition_col_dict.keys()))
qresult = run_sql(self.conn, ORDER_CANDIDATES_BY_OID_SQL % candidate_regclass_str)
ordered_candidates = []
for schema_tbl in qresult:
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册