From 3ba11d3df2115b04171a8eda8e0389e02578d8d0 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 7 Oct 2010 20:00:28 -0400 Subject: [PATCH] Teach CLUSTER to use seqscan-and-sort when it's faster than indexscan. ... or at least, when the planner's cost estimates say it will be faster. Leonardo Francalanci, reviewed by Itagaki Takahiro and Tom Lane --- doc/src/sgml/ref/cluster.sgml | 67 +++-- src/backend/commands/cluster.c | 161 +++++++++--- src/backend/optimizer/path/costsize.c | 45 ++-- src/backend/optimizer/plan/createplan.c | 3 + src/backend/optimizer/plan/planmain.c | 3 +- src/backend/optimizer/plan/planner.c | 123 +++++++++- src/backend/optimizer/prep/prepunion.c | 3 +- src/backend/optimizer/util/pathnode.c | 2 + src/backend/optimizer/util/plancat.c | 86 +++++-- src/backend/utils/sort/tuplesort.c | 310 ++++++++++++++++++++++++ src/include/optimizer/cost.h | 1 + src/include/optimizer/plancat.h | 2 + src/include/optimizer/planner.h | 2 + src/include/utils/tuplesort.h | 43 ++-- 14 files changed, 713 insertions(+), 138 deletions(-) diff --git a/doc/src/sgml/ref/cluster.sgml b/doc/src/sgml/ref/cluster.sgml index 4b641954ef..adba267863 100644 --- a/doc/src/sgml/ref/cluster.sgml +++ b/doc/src/sgml/ref/cluster.sgml @@ -128,18 +128,33 @@ CLUSTER [VERBOSE] - During the cluster operation, a temporary copy of the table is created - that contains the table data in the index order. Temporary copies of - each index on the table are created as well. Therefore, you need free - space on disk at least equal to the sum of the table size and the index - sizes. + CLUSTER can re-sort the table using either an indexscan + on the specified index, or (if the index is a b-tree) a sequential + scan followed by sorting. It will attempt to choose the method that + will be faster, based on planner cost parameters and available statistical + information. - Because CLUSTER remembers the clustering information, - one can cluster the tables one wants clustered manually the first time, and - setup a timed event similar to VACUUM so that the tables - are periodically reclustered. + When an indexscan is used, a temporary copy of the table is created that + contains the table data in the index order. Temporary copies of each + index on the table are created as well. Therefore, you need free space on + disk at least equal to the sum of the table size and the index sizes. + + + + When a sequential scan and sort is used, a temporary sort file is + also created, so that the peak temporary space requirement is as much + as double the table size, plus the index sizes. This method is often + faster than the indexscan method, but if the disk space requirement is + intolerable, you can disable this choice by temporarily setting to off. + + + + It is advisable to set to + a reasonably large value (but not more than the amount of RAM you can + dedicate to the CLUSTER operation) before clustering. @@ -150,35 +165,13 @@ CLUSTER [VERBOSE] - There is another way to cluster data. The - CLUSTER command reorders the original table by - scanning it using the index you specify. This can be slow - on large tables because the rows are fetched from the table - in index order, and if the table is disordered, the - entries are on random pages, so there is one disk page - retrieved for every row moved. (PostgreSQL has - a cache, but the majority of a big table will not fit in the cache.) 
- The other way to cluster a table is to use: - - -CREATE TABLE newtable AS - SELECT * FROM table ORDER BY columnlist; - - - which uses the PostgreSQL sorting code - to produce the desired order; - this is usually much faster than an index scan for disordered data. - Then you drop the old table, use - ALTER TABLE ... RENAME - to rename newtable to the - old name, and recreate the table's indexes. - The big disadvantage of this approach is that it does not preserve - OIDs, constraints, foreign key relationships, granted privileges, and - other ancillary properties of the table — all such items must be - manually recreated. Another disadvantage is that this way requires a sort - temporary file about the same size as the table itself, so peak disk usage - is about three times the table size instead of twice the table size. + Because CLUSTER remembers which indexes are clustered, + one can cluster the tables one wants clustered manually the first time, + then set up a periodic maintenance script that executes + CLUSTER without any parameters, so that the desired tables + are periodically reclustered. + diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c index a2a2bbfa75..f52e39fc36 100644 --- a/src/backend/commands/cluster.c +++ b/src/backend/commands/cluster.c @@ -36,6 +36,7 @@ #include "commands/trigger.h" #include "commands/vacuum.h" #include "miscadmin.h" +#include "optimizer/planner.h" #include "storage/bufmgr.h" #include "storage/procarray.h" #include "storage/smgr.h" @@ -49,6 +50,7 @@ #include "utils/snapmgr.h" #include "utils/syscache.h" #include "utils/tqual.h" +#include "utils/tuplesort.h" /* @@ -69,7 +71,10 @@ static void copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, int freeze_min_age, int freeze_table_age, bool *pSwapToastByContent, TransactionId *pFreezeXid); static List *get_tables_to_cluster(MemoryContext cluster_context); - +static void reform_and_rewrite_tuple(HeapTuple tuple, + TupleDesc oldTupDesc, TupleDesc newTupDesc, + Datum *values, bool *isnull, + bool newRelHasOids, RewriteState rwstate); /*--------------------------------------------------------------------------- @@ -759,6 +764,8 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, TransactionId OldestXmin; TransactionId FreezeXid; RewriteState rwstate; + bool use_sort; + Tuplesortstate *tuplesort; /* * Open the relations we need. @@ -845,12 +852,30 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, rwstate = begin_heap_rewrite(NewHeap, OldestXmin, FreezeXid, use_wal); /* - * Scan through the OldHeap, either in OldIndex order or sequentially, and - * copy each tuple into the NewHeap. To ensure we see recently-dead - * tuples that still need to be copied, we scan with SnapshotAny and use + * Decide whether to use an indexscan or seqscan-and-optional-sort to + * scan the OldHeap. We know how to use a sort to duplicate the ordering + * of a btree index, and will use seqscan-and-sort for that case if the + * planner tells us it's cheaper. Otherwise, always indexscan if an + * index is provided, else plain seqscan. + */ + if (OldIndex != NULL && OldIndex->rd_rel->relam == BTREE_AM_OID) + use_sort = plan_cluster_use_sort(OIDOldHeap, OIDOldIndex); + else + use_sort = false; + + /* Set up sorting if wanted */ + if (use_sort) + tuplesort = tuplesort_begin_cluster(oldTupDesc, OldIndex, + maintenance_work_mem, false); + else + tuplesort = NULL; + + /* + * Prepare to scan the OldHeap. 
To ensure we see recently-dead tuples + * that still need to be copied, we scan with SnapshotAny and use * HeapTupleSatisfiesVacuum for the visibility test. */ - if (OldIndex != NULL) + if (OldIndex != NULL && !use_sort) { heapScan = NULL; indexScan = index_beginscan(OldHeap, OldIndex, @@ -862,17 +887,21 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, indexScan = NULL; } + /* + * Scan through the OldHeap, either in OldIndex order or sequentially; + * copy each tuple into the NewHeap, or transiently to the tuplesort + * module. Note that we don't bother sorting dead tuples (they won't + * get to the new table anyway). + */ for (;;) { HeapTuple tuple; - HeapTuple copiedTuple; Buffer buf; bool isdead; - int i; CHECK_FOR_INTERRUPTS(); - if (OldIndex != NULL) + if (indexScan != NULL) { tuple = index_getnext(indexScan, ForwardScanDirection); if (tuple == NULL) @@ -951,45 +980,50 @@ copy_heap_data(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex, continue; } - /* - * We cannot simply copy the tuple as-is, for several reasons: - * - * 1. We'd like to squeeze out the values of any dropped columns, both - * to save space and to ensure we have no corner-case failures. (It's - * possible for example that the new table hasn't got a TOAST table - * and so is unable to store any large values of dropped cols.) - * - * 2. The tuple might not even be legal for the new table; this is - * currently only known to happen as an after-effect of ALTER TABLE - * SET WITHOUT OIDS. - * - * So, we must reconstruct the tuple from component Datums. - */ - heap_deform_tuple(tuple, oldTupDesc, values, isnull); + if (tuplesort != NULL) + tuplesort_putheaptuple(tuplesort, tuple); + else + reform_and_rewrite_tuple(tuple, + oldTupDesc, newTupDesc, + values, isnull, + NewHeap->rd_rel->relhasoids, rwstate); + } - /* Be sure to null out any dropped columns */ - for (i = 0; i < natts; i++) + if (indexScan != NULL) + index_endscan(indexScan); + if (heapScan != NULL) + heap_endscan(heapScan); + + /* + * In scan-and-sort mode, complete the sort, then read out all live + * tuples from the tuplestore and write them to the new relation. + */ + if (tuplesort != NULL) + { + tuplesort_performsort(tuplesort); + + for (;;) { - if (newTupDesc->attrs[i]->attisdropped) - isnull[i] = true; - } + HeapTuple tuple; + bool shouldfree; - copiedTuple = heap_form_tuple(newTupDesc, values, isnull); + CHECK_FOR_INTERRUPTS(); - /* Preserve OID, if any */ - if (NewHeap->rd_rel->relhasoids) - HeapTupleSetOid(copiedTuple, HeapTupleGetOid(tuple)); + tuple = tuplesort_getheaptuple(tuplesort, true, &shouldfree); + if (tuple == NULL) + break; - /* The heap rewrite module does the rest */ - rewrite_heap_tuple(rwstate, tuple, copiedTuple); + reform_and_rewrite_tuple(tuple, + oldTupDesc, newTupDesc, + values, isnull, + NewHeap->rd_rel->relhasoids, rwstate); - heap_freetuple(copiedTuple); - } + if (shouldfree) + heap_freetuple(tuple); + } - if (OldIndex != NULL) - index_endscan(indexScan); - else - heap_endscan(heapScan); + tuplesort_end(tuplesort); + } /* Write out any remaining tuples, and fsync if needed */ end_heap_rewrite(rwstate); @@ -1488,3 +1522,50 @@ get_tables_to_cluster(MemoryContext cluster_context) return rvs; } + + +/* + * Reconstruct and rewrite the given tuple + * + * We cannot simply copy the tuple as-is, for several reasons: + * + * 1. We'd like to squeeze out the values of any dropped columns, both + * to save space and to ensure we have no corner-case failures. 
(It's + * possible for example that the new table hasn't got a TOAST table + * and so is unable to store any large values of dropped cols.) + * + * 2. The tuple might not even be legal for the new table; this is + * currently only known to happen as an after-effect of ALTER TABLE + * SET WITHOUT OIDS. + * + * So, we must reconstruct the tuple from component Datums. + */ +static void +reform_and_rewrite_tuple(HeapTuple tuple, + TupleDesc oldTupDesc, TupleDesc newTupDesc, + Datum *values, bool *isnull, + bool newRelHasOids, RewriteState rwstate) +{ + HeapTuple copiedTuple; + int i; + + heap_deform_tuple(tuple, oldTupDesc, values, isnull); + + /* Be sure to null out any dropped columns */ + for (i = 0; i < newTupDesc->natts; i++) + { + if (newTupDesc->attrs[i]->attisdropped) + isnull[i] = true; + } + + copiedTuple = heap_form_tuple(newTupDesc, values, isnull); + + /* Preserve OID, if any */ + if (newRelHasOids) + HeapTupleSetOid(copiedTuple, HeapTupleGetOid(tuple)); + + /* The heap rewrite module does the rest */ + rewrite_heap_tuple(rwstate, tuple, copiedTuple); + + heap_freetuple(copiedTuple); +} diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 53aa62fb81..b27dc53fef 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -1071,33 +1071,37 @@ cost_recursive_union(Plan *runion, Plan *nrterm, Plan *rterm) * Determines and returns the cost of sorting a relation, including * the cost of reading the input data. * - * If the total volume of data to sort is less than work_mem, we will do + * If the total volume of data to sort is less than sort_mem, we will do * an in-memory sort, which requires no I/O and about t*log2(t) tuple * comparisons for t tuples. * - * If the total volume exceeds work_mem, we switch to a tape-style merge + * If the total volume exceeds sort_mem, we switch to a tape-style merge * algorithm. There will still be about t*log2(t) tuple comparisons in * total, but we will also need to write and read each tuple once per * merge pass. We expect about ceil(logM(r)) merge passes where r is the * number of initial runs formed and M is the merge order used by tuplesort.c. - * Since the average initial run should be about twice work_mem, we have - * disk traffic = 2 * relsize * ceil(logM(p / (2*work_mem))) + * Since the average initial run should be about twice sort_mem, we have + * disk traffic = 2 * relsize * ceil(logM(p / (2*sort_mem))) * cpu = comparison_cost * t * log2(t) * * If the sort is bounded (i.e., only the first k result tuples are needed) - * and k tuples can fit into work_mem, we use a heap method that keeps only + * and k tuples can fit into sort_mem, we use a heap method that keeps only * k tuples in the heap; this will require about t*log2(k) tuple comparisons. * * The disk traffic is assumed to be 3/4ths sequential and 1/4th random * accesses (XXX can't we refine that guess?) * - * We charge two operator evals per tuple comparison, which should be in - * the right ballpark in most cases. + * By default, we charge two operator evals per tuple comparison, which should + * be in the right ballpark in most cases. The caller can tweak this by + * specifying nonzero comparison_cost; typically that's used for any extra + * work that has to be done to prepare the inputs to the comparison operators. 
* * 'pathkeys' is a list of sort keys * 'input_cost' is the total cost for reading the input data * 'tuples' is the number of tuples in the relation * 'width' is the average tuple width in bytes + * 'comparison_cost' is the extra cost per comparison, if any + * 'sort_mem' is the number of kilobytes of work memory allowed for the sort * 'limit_tuples' is the bound on the number of output tuples; -1 if no bound * * NOTE: some callers currently pass NIL for pathkeys because they @@ -1110,6 +1114,7 @@ cost_recursive_union(Plan *runion, Plan *nrterm, Plan *rterm) void cost_sort(Path *path, PlannerInfo *root, List *pathkeys, Cost input_cost, double tuples, int width, + Cost comparison_cost, int sort_mem, double limit_tuples) { Cost startup_cost = input_cost; @@ -1117,7 +1122,7 @@ cost_sort(Path *path, PlannerInfo *root, double input_bytes = relation_byte_size(tuples, width); double output_bytes; double output_tuples; - long work_mem_bytes = work_mem * 1024L; + long sort_mem_bytes = sort_mem * 1024L; if (!enable_sort) startup_cost += disable_cost; @@ -1129,6 +1134,9 @@ cost_sort(Path *path, PlannerInfo *root, if (tuples < 2.0) tuples = 2.0; + /* Include the default cost-per-comparison */ + comparison_cost += 2.0 * cpu_operator_cost; + /* Do we have a useful LIMIT? */ if (limit_tuples > 0 && limit_tuples < tuples) { @@ -1141,24 +1149,23 @@ cost_sort(Path *path, PlannerInfo *root, output_bytes = input_bytes; } - if (output_bytes > work_mem_bytes) + if (output_bytes > sort_mem_bytes) { /* * We'll have to use a disk-based sort of all the tuples */ double npages = ceil(input_bytes / BLCKSZ); - double nruns = (input_bytes / work_mem_bytes) * 0.5; - double mergeorder = tuplesort_merge_order(work_mem_bytes); + double nruns = (input_bytes / sort_mem_bytes) * 0.5; + double mergeorder = tuplesort_merge_order(sort_mem_bytes); double log_runs; double npageaccesses; /* * CPU costs * - * Assume about two operator evals per tuple comparison and N log2 N - * comparisons + * Assume about N log2 N comparisons */ - startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples); + startup_cost += comparison_cost * tuples * LOG2(tuples); /* Disk costs */ @@ -1172,7 +1179,7 @@ cost_sort(Path *path, PlannerInfo *root, startup_cost += npageaccesses * (seq_page_cost * 0.75 + random_page_cost * 0.25); } - else if (tuples > 2 * output_tuples || input_bytes > work_mem_bytes) + else if (tuples > 2 * output_tuples || input_bytes > sort_mem_bytes) { /* * We'll use a bounded heap-sort keeping just K tuples in memory, for @@ -1180,12 +1187,12 @@ cost_sort(Path *path, PlannerInfo *root, * factor is a bit higher than for quicksort. Tweak it so that the * cost curve is continuous at the crossover point. 
*/ - startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(2.0 * output_tuples); + startup_cost += comparison_cost * tuples * LOG2(2.0 * output_tuples); } else { /* We'll use plain quicksort on all the input tuples */ - startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples); + startup_cost += comparison_cost * tuples * LOG2(tuples); } /* @@ -1786,6 +1793,8 @@ cost_mergejoin(MergePath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo) outer_path->total_cost, outer_path_rows, outer_path->parent->width, + 0.0, + work_mem, -1.0); startup_cost += sort_path.startup_cost; startup_cost += (sort_path.total_cost - sort_path.startup_cost) @@ -1810,6 +1819,8 @@ cost_mergejoin(MergePath *path, PlannerInfo *root, SpecialJoinInfo *sjinfo) inner_path->total_cost, inner_path_rows, inner_path->parent->width, + 0.0, + work_mem, -1.0); startup_cost += sort_path.startup_cost; startup_cost += (sort_path.total_cost - sort_path.startup_cost) diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index fa7b29f7d4..2c398d2eed 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -20,6 +20,7 @@ #include #include "access/skey.h" +#include "miscadmin.h" #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" #include "optimizer/clauses.h" @@ -3041,6 +3042,8 @@ make_sort(PlannerInfo *root, Plan *lefttree, int numCols, lefttree->total_cost, lefttree->plan_rows, lefttree->plan_width, + 0.0, + work_mem, limit_tuples); plan->startup_cost = sort_path.startup_cost; plan->total_cost = sort_path.total_cost; diff --git a/src/backend/optimizer/plan/planmain.c b/src/backend/optimizer/plan/planmain.c index 9e884cbb3c..fd4c6f54d0 100644 --- a/src/backend/optimizer/plan/planmain.c +++ b/src/backend/optimizer/plan/planmain.c @@ -20,6 +20,7 @@ */ #include "postgres.h" +#include "miscadmin.h" #include "optimizer/cost.h" #include "optimizer/pathnode.h" #include "optimizer/paths.h" @@ -415,7 +416,7 @@ query_planner(PlannerInfo *root, List *tlist, cost_sort(&sort_path, root, root->query_pathkeys, cheapestpath->total_cost, final_rel->rows, final_rel->width, - limit_tuples); + 0.0, work_mem, limit_tuples); } if (compare_fractional_path_costs(sortedpath, &sort_path, diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 9cf5995ce3..93daedc706 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -26,6 +26,7 @@ #include "optimizer/cost.h" #include "optimizer/pathnode.h" #include "optimizer/paths.h" +#include "optimizer/plancat.h" #include "optimizer/planmain.h" #include "optimizer/planner.h" #include "optimizer/prep.h" @@ -2276,7 +2277,8 @@ choose_hashed_grouping(PlannerInfo *root, /* Result of hashed agg is always unsorted */ if (target_pathkeys) cost_sort(&hashed_p, root, target_pathkeys, hashed_p.total_cost, - dNumGroups, path_width, limit_tuples); + dNumGroups, path_width, + 0.0, work_mem, limit_tuples); if (sorted_path) { @@ -2293,7 +2295,8 @@ choose_hashed_grouping(PlannerInfo *root, if (!pathkeys_contained_in(root->group_pathkeys, current_pathkeys)) { cost_sort(&sorted_p, root, root->group_pathkeys, sorted_p.total_cost, - path_rows, path_width, -1.0); + path_rows, path_width, + 0.0, work_mem, -1.0); current_pathkeys = root->group_pathkeys; } @@ -2310,7 +2313,8 @@ choose_hashed_grouping(PlannerInfo *root, if (target_pathkeys && !pathkeys_contained_in(target_pathkeys, current_pathkeys)) cost_sort(&sorted_p, root, target_pathkeys, sorted_p.total_cost, - 
dNumGroups, path_width, limit_tuples); + dNumGroups, path_width, + 0.0, work_mem, limit_tuples); /* * Now make the decision using the top-level tuple fraction. First we @@ -2427,7 +2431,8 @@ choose_hashed_distinct(PlannerInfo *root, */ if (parse->sortClause) cost_sort(&hashed_p, root, root->sort_pathkeys, hashed_p.total_cost, - dNumDistinctRows, path_width, limit_tuples); + dNumDistinctRows, path_width, + 0.0, work_mem, limit_tuples); /* * Now for the GROUP case. See comments in grouping_planner about the @@ -2450,7 +2455,8 @@ choose_hashed_distinct(PlannerInfo *root, else current_pathkeys = root->sort_pathkeys; cost_sort(&sorted_p, root, current_pathkeys, sorted_p.total_cost, - path_rows, path_width, -1.0); + path_rows, path_width, + 0.0, work_mem, -1.0); } cost_group(&sorted_p, root, numDistinctCols, dNumDistinctRows, sorted_p.startup_cost, sorted_p.total_cost, @@ -2458,7 +2464,8 @@ choose_hashed_distinct(PlannerInfo *root, if (parse->sortClause && !pathkeys_contained_in(root->sort_pathkeys, current_pathkeys)) cost_sort(&sorted_p, root, root->sort_pathkeys, sorted_p.total_cost, - dNumDistinctRows, path_width, limit_tuples); + dNumDistinctRows, path_width, + 0.0, work_mem, limit_tuples); /* * Now make the decision using the top-level tuple fraction. First we @@ -2997,3 +3004,107 @@ expression_planner(Expr *expr) return (Expr *) result; } + + +/* + * plan_cluster_use_sort + * Use the planner to decide how CLUSTER should implement sorting + * + * tableOid is the OID of a table to be clustered on its index indexOid + * (which is already known to be a btree index). Decide whether it's + * cheaper to do an indexscan or a seqscan-plus-sort to execute the CLUSTER. + * Return TRUE to use sorting, FALSE to use an indexscan. + * + * Note: caller had better already hold some type of lock on the table. + */ +bool +plan_cluster_use_sort(Oid tableOid, Oid indexOid) +{ + PlannerInfo *root; + Query *query; + PlannerGlobal *glob; + RangeTblEntry *rte; + RelOptInfo *rel; + IndexOptInfo *indexInfo; + QualCost indexExprCost; + Cost comparisonCost; + Path *seqScanPath; + Path seqScanAndSortPath; + IndexPath *indexScanPath; + ListCell *lc; + + /* Set up mostly-dummy planner state */ + query = makeNode(Query); + query->commandType = CMD_SELECT; + + glob = makeNode(PlannerGlobal); + + root = makeNode(PlannerInfo); + root->parse = query; + root->glob = glob; + root->query_level = 1; + root->planner_cxt = CurrentMemoryContext; + root->wt_param_id = -1; + + /* Build a minimal RTE for the rel */ + rte = makeNode(RangeTblEntry); + rte->rtekind = RTE_RELATION; + rte->relid = tableOid; + rte->inh = false; + rte->inFromCl = true; + query->rtable = list_make1(rte); + + /* ... and insert it into PlannerInfo */ + root->simple_rel_array_size = 2; + root->simple_rel_array = (RelOptInfo **) + palloc0(root->simple_rel_array_size * sizeof(RelOptInfo *)); + root->simple_rte_array = (RangeTblEntry **) + palloc0(root->simple_rel_array_size * sizeof(RangeTblEntry *)); + root->simple_rte_array[1] = rte; + + /* Build RelOptInfo */ + rel = build_simple_rel(root, 1, RELOPT_BASEREL); + + /* + * Rather than doing all the pushups that would be needed to use + * set_baserel_size_estimates, just do a quick hack for rows and width. 
+ */ + rel->rows = rel->tuples; + rel->width = get_relation_data_width(tableOid); + + root->total_table_pages = rel->pages; + + /* Locate IndexOptInfo for the target index */ + indexInfo = NULL; + foreach(lc, rel->indexlist) + { + indexInfo = (IndexOptInfo *) lfirst(lc); + if (indexInfo->indexoid == indexOid) + break; + } + if (lc == NULL) /* not in the list? */ + elog(ERROR, "index %u does not belong to table %u", + indexOid, tableOid); + + /* + * Determine eval cost of the index expressions, if any. We need to + * charge twice that amount for each tuple comparison that happens + * during the sort, since tuplesort.c will have to re-evaluate the + * index expressions each time. (XXX that's pretty inefficient...) + */ + cost_qual_eval(&indexExprCost, indexInfo->indexprs, root); + comparisonCost = 2.0 * (indexExprCost.startup + indexExprCost.per_tuple); + + /* Estimate the cost of seq scan + sort */ + seqScanPath = create_seqscan_path(root, rel); + cost_sort(&seqScanAndSortPath, root, NIL, + seqScanPath->total_cost, rel->tuples, rel->width, + comparisonCost, maintenance_work_mem, -1.0); + + /* Estimate the cost of index scan */ + indexScanPath = create_index_path(root, indexInfo, + NIL, NIL, + ForwardScanDirection, NULL); + + return (seqScanAndSortPath.total_cost < indexScanPath->path.total_cost); +} diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index f904258280..0d3a739175 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -805,7 +805,8 @@ choose_hashed_setop(PlannerInfo *root, List *groupClauses, sorted_p.total_cost = input_plan->total_cost; /* XXX cost_sort doesn't actually look at pathkeys, so just pass NIL */ cost_sort(&sorted_p, root, NIL, sorted_p.total_cost, - input_plan->plan_rows, input_plan->plan_width, -1.0); + input_plan->plan_rows, input_plan->plan_width, + 0.0, work_mem, -1.0); cost_group(&sorted_p, root, numGroupCols, dNumGroups, sorted_p.startup_cost, sorted_p.total_cost, input_plan->plan_rows); diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index f8aa745fef..71e0e75a56 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -969,6 +969,8 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, subpath->total_cost, rel->rows, rel->width, + 0.0, + work_mem, -1.0); /* diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index ad71d3a4f9..7ffa11588d 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -46,6 +46,7 @@ int constraint_exclusion = CONSTRAINT_EXCLUSION_PARTITION; get_relation_info_hook_type get_relation_info_hook = NULL; +static int32 get_rel_data_width(Relation rel, int32 *attr_widths); static List *get_relation_constraints(PlannerInfo *root, Oid relationObjectId, RelOptInfo *rel, bool include_notnull); @@ -406,28 +407,9 @@ estimate_rel_size(Relation rel, int32 *attr_widths, * platform dependencies in the default plans which are kind * of a headache for regression testing. 
*/ - int32 tuple_width = 0; - int i; + int32 tuple_width; - for (i = 1; i <= RelationGetNumberOfAttributes(rel); i++) - { - Form_pg_attribute att = rel->rd_att->attrs[i - 1]; - int32 item_width; - - if (att->attisdropped) - continue; - /* This should match set_rel_width() in costsize.c */ - item_width = get_attavgwidth(RelationGetRelid(rel), i); - if (item_width <= 0) - { - item_width = get_typavgwidth(att->atttypid, - att->atttypmod); - Assert(item_width > 0); - } - if (attr_widths != NULL) - attr_widths[i] = item_width; - tuple_width += item_width; - } + tuple_width = get_rel_data_width(rel, attr_widths); tuple_width += sizeof(HeapTupleHeaderData); tuple_width += sizeof(ItemPointerData); /* note: integer division is intentional here */ @@ -449,6 +431,68 @@ estimate_rel_size(Relation rel, int32 *attr_widths, } +/* + * get_rel_data_width + * + * Estimate the average width of (the data part of) the relation's tuples. + * If attr_widths isn't NULL, also store per-column width estimates into + * that array. + * + * Currently we ignore dropped columns. Ideally those should be included + * in the result, but we haven't got any way to get info about them; and + * since they might be mostly NULLs, treating them as zero-width is not + * necessarily the wrong thing anyway. + */ +static int32 +get_rel_data_width(Relation rel, int32 *attr_widths) +{ + int32 tuple_width = 0; + int i; + + for (i = 1; i <= RelationGetNumberOfAttributes(rel); i++) + { + Form_pg_attribute att = rel->rd_att->attrs[i - 1]; + int32 item_width; + + if (att->attisdropped) + continue; + /* This should match set_rel_width() in costsize.c */ + item_width = get_attavgwidth(RelationGetRelid(rel), i); + if (item_width <= 0) + { + item_width = get_typavgwidth(att->atttypid, att->atttypmod); + Assert(item_width > 0); + } + if (attr_widths != NULL) + attr_widths[i] = item_width; + tuple_width += item_width; + } + + return tuple_width; +} + +/* + * get_relation_data_width + * + * External API for get_rel_data_width + */ +int32 +get_relation_data_width(Oid relid) +{ + int32 result; + Relation relation; + + /* As above, assume relation is already locked */ + relation = heap_open(relid, NoLock); + + result = get_rel_data_width(relation, NULL); + + heap_close(relation, NoLock); + + return result; +} + + /* * get_relation_constraints * diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index cf0a583f5c..0013703250 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -102,9 +102,11 @@ #include "access/genam.h" #include "access/nbtree.h" +#include "catalog/index.h" #include "catalog/pg_amop.h" #include "catalog/pg_operator.h" #include "commands/tablespace.h" +#include "executor/executor.h" #include "miscadmin.h" #include "pg_trace.h" #include "utils/datum.h" @@ -121,6 +123,7 @@ #define HEAP_SORT 0 #define INDEX_SORT 1 #define DATUM_SORT 2 +#define CLUSTER_SORT 3 /* GUC variables */ #ifdef TRACE_SORT @@ -342,6 +345,14 @@ struct Tuplesortstate TupleDesc tupDesc; ScanKey scanKeys; /* array of length nKeys */ + /* + * These variables are specific to the CLUSTER case; they are set by + * tuplesort_begin_cluster. Note CLUSTER also uses tupDesc and + * indexScanKey. + */ + IndexInfo *indexInfo; /* info about index being used for reference */ + EState *estate; /* for evaluating index expressions */ + /* * These variables are specific to the IndexTuple case; they are set by * tuplesort_begin_index_xxx and used only by the IndexTuple routines. 
@@ -450,6 +461,13 @@ static void writetup_heap(Tuplesortstate *state, int tapenum, static void readtup_heap(Tuplesortstate *state, SortTuple *stup, int tapenum, unsigned int len); static void reversedirection_heap(Tuplesortstate *state); +static int comparetup_cluster(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static void copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup); +static void writetup_cluster(Tuplesortstate *state, int tapenum, + SortTuple *stup); +static void readtup_cluster(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int len); static int comparetup_index_btree(const SortTuple *a, const SortTuple *b, Tuplesortstate *state); static int comparetup_index_hash(const SortTuple *a, const SortTuple *b, @@ -627,6 +645,67 @@ tuplesort_begin_heap(TupleDesc tupDesc, return state; } +Tuplesortstate * +tuplesort_begin_cluster(TupleDesc tupDesc, + Relation indexRel, + int workMem, bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(workMem, randomAccess); + MemoryContext oldcontext; + + Assert(indexRel->rd_rel->relam == BTREE_AM_OID); + + oldcontext = MemoryContextSwitchTo(state->sortcontext); + +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, + "begin tuple sort: nkeys = %d, workMem = %d, randomAccess = %c", + RelationGetNumberOfAttributes(indexRel), + workMem, randomAccess ? 't' : 'f'); +#endif + + state->nKeys = RelationGetNumberOfAttributes(indexRel); + + TRACE_POSTGRESQL_SORT_START(CLUSTER_SORT, + false, /* no unique check */ + state->nKeys, + workMem, + randomAccess); + + state->comparetup = comparetup_cluster; + state->copytup = copytup_cluster; + state->writetup = writetup_cluster; + state->readtup = readtup_cluster; + state->reversedirection = reversedirection_index_btree; + + state->indexInfo = BuildIndexInfo(indexRel); + state->indexScanKey = _bt_mkscankey_nodata(indexRel); + + state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ + + if (state->indexInfo->ii_Expressions != NULL) + { + TupleTableSlot *slot; + ExprContext *econtext; + + /* + * We will need to use FormIndexDatum to evaluate the index + * expressions. To do that, we need an EState, as well as a + * TupleTableSlot to put the table tuples into. The econtext's + * scantuple has to point to that slot, too. + */ + state->estate = CreateExecutorState(); + slot = MakeSingleTupleTableSlot(tupDesc); + econtext = GetPerTupleExprContext(state->estate); + econtext->ecxt_scantuple = slot; + } + + MemoryContextSwitchTo(oldcontext); + + return state; +} + Tuplesortstate * tuplesort_begin_index_btree(Relation indexRel, bool enforceUnique, @@ -850,6 +929,15 @@ tuplesort_end(Tuplesortstate *state) TRACE_POSTGRESQL_SORT_DONE(state->tapeset != NULL, 0L); #endif + /* Free any execution state created for CLUSTER case */ + if (state->estate != NULL) + { + ExprContext *econtext = GetPerTupleExprContext(state->estate); + + ExecDropSingleTupleTableSlot(econtext->ecxt_scantuple); + FreeExecutorState(state->estate); + } + MemoryContextSwitchTo(oldcontext); /* @@ -923,6 +1011,28 @@ tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot) MemoryContextSwitchTo(oldcontext); } +/* + * Accept one tuple while collecting input data for sort. + * + * Note that the input data is always copied; the caller need not save it. 
+ */ +void +tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + /* + * Copy the given tuple into memory we control, and decrease availMem. + * Then call the common code. + */ + COPYTUP(state, &stup, (void *) tup); + + puttuple_common(state, &stup); + + MemoryContextSwitchTo(oldcontext); +} + /* * Accept one index tuple while collecting input data for sort. * @@ -1421,6 +1531,25 @@ tuplesort_gettupleslot(Tuplesortstate *state, bool forward, } } +/* + * Fetch the next tuple in either forward or back direction. + * Returns NULL if no more tuples. If *should_free is set, the + * caller must pfree the returned tuple when done with it. + */ +HeapTuple +tuplesort_getheaptuple(Tuplesortstate *state, bool forward, bool *should_free) +{ + MemoryContext oldcontext = MemoryContextSwitchTo(state->sortcontext); + SortTuple stup; + + if (!tuplesort_gettuple_common(state, forward, &stup, should_free)) + stup.tuple = NULL; + + MemoryContextSwitchTo(oldcontext); + + return stup.tuple; +} + /* * Fetch the next index tuple in either forward or back direction. * Returns NULL if no more tuples. If *should_free is set, the @@ -2712,6 +2841,187 @@ reversedirection_heap(Tuplesortstate *state) } +/* + * Routines specialized for the CLUSTER case (HeapTuple data, with + * comparisons per a btree index definition) + */ + +static int +comparetup_cluster(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + ScanKey scanKey = state->indexScanKey; + HeapTuple ltup; + HeapTuple rtup; + TupleDesc tupDesc; + int nkey; + int32 compare; + + /* Allow interrupting long sorts */ + CHECK_FOR_INTERRUPTS(); + + /* Compare the leading sort key, if it's simple */ + if (state->indexInfo->ii_KeyAttrNumbers[0] != 0) + { + compare = inlineApplySortFunction(&scanKey->sk_func, scanKey->sk_flags, + a->datum1, a->isnull1, + b->datum1, b->isnull1); + if (compare != 0 || state->nKeys == 1) + return compare; + /* Compare additional columns the hard way */ + scanKey++; + nkey = 1; + } + else + { + /* Must compare all keys the hard way */ + nkey = 0; + } + + /* Compare additional sort keys */ + ltup = (HeapTuple) a->tuple; + rtup = (HeapTuple) b->tuple; + + if (state->indexInfo->ii_Expressions == NULL) + { + /* If not expression index, just compare the proper heap attrs */ + tupDesc = state->tupDesc; + + for (; nkey < state->nKeys; nkey++, scanKey++) + { + AttrNumber attno = state->indexInfo->ii_KeyAttrNumbers[nkey]; + Datum datum1, + datum2; + bool isnull1, + isnull2; + + datum1 = heap_getattr(ltup, attno, tupDesc, &isnull1); + datum2 = heap_getattr(rtup, attno, tupDesc, &isnull2); + + compare = inlineApplySortFunction(&scanKey->sk_func, + scanKey->sk_flags, + datum1, isnull1, + datum2, isnull2); + if (compare != 0) + return compare; + } + } + else + { + /* + * In the expression index case, compute the whole index tuple and + * then compare values. It would perhaps be faster to compute only as + * many columns as we need to compare, but that would require + * duplicating all the logic in FormIndexDatum. 
+ */ + Datum l_index_values[INDEX_MAX_KEYS]; + bool l_index_isnull[INDEX_MAX_KEYS]; + Datum r_index_values[INDEX_MAX_KEYS]; + bool r_index_isnull[INDEX_MAX_KEYS]; + TupleTableSlot *ecxt_scantuple; + + /* Reset context each time to prevent memory leakage */ + ResetPerTupleExprContext(state->estate); + + ecxt_scantuple = GetPerTupleExprContext(state->estate)->ecxt_scantuple; + + ExecStoreTuple(ltup, ecxt_scantuple, InvalidBuffer, false); + FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, + l_index_values, l_index_isnull); + + ExecStoreTuple(rtup, ecxt_scantuple, InvalidBuffer, false); + FormIndexDatum(state->indexInfo, ecxt_scantuple, state->estate, + r_index_values, r_index_isnull); + + for (; nkey < state->nKeys; nkey++, scanKey++) + { + compare = inlineApplySortFunction(&scanKey->sk_func, + scanKey->sk_flags, + l_index_values[nkey], + l_index_isnull[nkey], + r_index_values[nkey], + r_index_isnull[nkey]); + if (compare != 0) + return compare; + } + } + + return 0; +} + +static void +copytup_cluster(Tuplesortstate *state, SortTuple *stup, void *tup) +{ + HeapTuple tuple = (HeapTuple) tup; + + /* copy the tuple into sort storage */ + tuple = heap_copytuple(tuple); + stup->tuple = (void *) tuple; + USEMEM(state, GetMemoryChunkSpace(tuple)); + /* set up first-column key value, if it's a simple column */ + if (state->indexInfo->ii_KeyAttrNumbers[0] != 0) + stup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_KeyAttrNumbers[0], + state->tupDesc, + &stup->isnull1); +} + +static void +writetup_cluster(Tuplesortstate *state, int tapenum, SortTuple *stup) +{ + HeapTuple tuple = (HeapTuple) stup->tuple; + unsigned int tuplen = tuple->t_len + sizeof(ItemPointerData) + sizeof(int); + + /* We need to store t_self, but not other fields of HeapTupleData */ + LogicalTapeWrite(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + &tuple->t_self, sizeof(ItemPointerData)); + LogicalTapeWrite(state->tapeset, tapenum, + tuple->t_data, tuple->t_len); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + &tuplen, sizeof(tuplen)); + + FREEMEM(state, GetMemoryChunkSpace(tuple)); + heap_freetuple(tuple); +} + +static void +readtup_cluster(Tuplesortstate *state, SortTuple *stup, + int tapenum, unsigned int tuplen) +{ + unsigned int t_len = tuplen - sizeof(ItemPointerData) - sizeof(int); + HeapTuple tuple = (HeapTuple) palloc(t_len + HEAPTUPLESIZE); + + USEMEM(state, GetMemoryChunkSpace(tuple)); + /* Reconstruct the HeapTupleData header */ + tuple->t_data = (HeapTupleHeader) ((char *) tuple + HEAPTUPLESIZE); + tuple->t_len = t_len; + if (LogicalTapeRead(state->tapeset, tapenum, + &tuple->t_self, + sizeof(ItemPointerData)) != sizeof(ItemPointerData)) + elog(ERROR, "unexpected end of data"); + /* We don't currently bother to reconstruct t_tableOid */ + tuple->t_tableOid = InvalidOid; + /* Read in the tuple body */ + if (LogicalTapeRead(state->tapeset, tapenum, + tuple->t_data, tuple->t_len) != tuple->t_len) + elog(ERROR, "unexpected end of data"); + if (state->randomAccess) /* need trailing length word? 
*/ + if (LogicalTapeRead(state->tapeset, tapenum, &tuplen, + sizeof(tuplen)) != sizeof(tuplen)) + elog(ERROR, "unexpected end of data"); + stup->tuple = (void *) tuple; + /* set up first-column key value, if it's a simple column */ + if (state->indexInfo->ii_KeyAttrNumbers[0] != 0) + stup->datum1 = heap_getattr(tuple, + state->indexInfo->ii_KeyAttrNumbers[0], + state->tupDesc, + &stup->isnull1); +} + + /* * Routines specialized for IndexTuple case * diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 63641b9cc8..5a4b33f2b1 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -84,6 +84,7 @@ extern void cost_ctescan(Path *path, PlannerInfo *root, RelOptInfo *baserel); extern void cost_recursive_union(Plan *runion, Plan *nrterm, Plan *rterm); extern void cost_sort(Path *path, PlannerInfo *root, List *pathkeys, Cost input_cost, double tuples, int width, + Cost comparison_cost, int sort_mem, double limit_tuples); extern void cost_material(Path *path, Cost input_startup_cost, Cost input_total_cost, diff --git a/src/include/optimizer/plancat.h b/src/include/optimizer/plancat.h index 0b84295854..de7de84cb3 100644 --- a/src/include/optimizer/plancat.h +++ b/src/include/optimizer/plancat.h @@ -31,6 +31,8 @@ extern void get_relation_info(PlannerInfo *root, Oid relationObjectId, extern void estimate_rel_size(Relation rel, int32 *attr_widths, BlockNumber *pages, double *tuples); +extern int32 get_relation_data_width(Oid relid); + extern bool relation_excluded_by_constraints(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte); diff --git a/src/include/optimizer/planner.h b/src/include/optimizer/planner.h index 8552d6eeb0..ef7287d8b5 100644 --- a/src/include/optimizer/planner.h +++ b/src/include/optimizer/planner.h @@ -37,4 +37,6 @@ extern Plan *subquery_planner(PlannerGlobal *glob, Query *parse, extern Expr *expression_planner(Expr *expr); +extern bool plan_cluster_use_sort(Oid tableOid, Oid indexOid); + #endif /* PLANNER_H */ diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h index d879ff081d..8a31dff34c 100644 --- a/src/include/utils/tuplesort.h +++ b/src/include/utils/tuplesort.h @@ -32,29 +32,39 @@ typedef struct Tuplesortstate Tuplesortstate; /* - * We provide two different interfaces to what is essentially the same - * code: one for sorting HeapTuples and one for sorting IndexTuples. - * They differ primarily in the way that the sort key information is - * supplied. Also, the HeapTuple case actually stores MinimalTuples, - * which means it doesn't preserve the "system columns" (tuple identity and - * transaction visibility info). The IndexTuple case does preserve all - * the header fields of an index entry. In the HeapTuple case we can - * save some cycles by passing and returning the tuples in TupleTableSlots, - * rather than forming actual HeapTuples (which'd have to be converted to - * MinimalTuples). + * We provide multiple interfaces to what is essentially the same code, + * since different callers have different data to be sorted and want to + * specify the sort key information differently. There are two APIs for + * sorting HeapTuples and two more for sorting IndexTuples. Yet another + * API supports sorting bare Datums. * - * The IndexTuple case is itself broken into two subcases, one for btree - * indexes and one for hash indexes; the latter variant actually sorts - * the tuples by hash code. The API is the same except for the "begin" - * routine. 
+ * The "heap" API actually stores/sorts MinimalTuples, which means it doesn't + * preserve the system columns (tuple identity and transaction visibility + * info). The sort keys are specified by column numbers within the tuples + * and sort operator OIDs. We save some cycles by passing and returning the + * tuples in TupleTableSlots, rather than forming actual HeapTuples (which'd + * have to be converted to MinimalTuples). This API works well for sorts + * executed as parts of plan trees. * - * Yet another slightly different interface supports sorting bare Datums. + * The "cluster" API stores/sorts full HeapTuples including all visibility + * info. The sort keys are specified by reference to a btree index that is + * defined on the relation to be sorted. Note that putheaptuple/getheaptuple + * go with this API, not the "begin_heap" one! + * + * The "index_btree" API stores/sorts IndexTuples (preserving all their + * header fields). The sort keys are specified by a btree index definition. + * + * The "index_hash" API is similar to index_btree, but the tuples are + * actually sorted by their hash codes not the raw data. */ extern Tuplesortstate *tuplesort_begin_heap(TupleDesc tupDesc, int nkeys, AttrNumber *attNums, Oid *sortOperators, bool *nullsFirstFlags, int workMem, bool randomAccess); +extern Tuplesortstate *tuplesort_begin_cluster(TupleDesc tupDesc, + Relation indexRel, + int workMem, bool randomAccess); extern Tuplesortstate *tuplesort_begin_index_btree(Relation indexRel, bool enforceUnique, int workMem, bool randomAccess); @@ -69,6 +79,7 @@ extern void tuplesort_set_bound(Tuplesortstate *state, int64 bound); extern void tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot); +extern void tuplesort_putheaptuple(Tuplesortstate *state, HeapTuple tup); extern void tuplesort_putindextuple(Tuplesortstate *state, IndexTuple tuple); extern void tuplesort_putdatum(Tuplesortstate *state, Datum val, bool isNull); @@ -77,6 +88,8 @@ extern void tuplesort_performsort(Tuplesortstate *state); extern bool tuplesort_gettupleslot(Tuplesortstate *state, bool forward, TupleTableSlot *slot); +extern HeapTuple tuplesort_getheaptuple(Tuplesortstate *state, bool forward, + bool *should_free); extern IndexTuple tuplesort_getindextuple(Tuplesortstate *state, bool forward, bool *should_free); extern bool tuplesort_getdatum(Tuplesortstate *state, bool forward, -- GitLab