Order active window clauses for greater reuse of Sort nodes.

This is a backport of the below commit from postgres 12dev, which in turn is a patch that was influenced by an optimization from the previous version of the Greenplum Window code. The idea is to order the Sort nodes based on sort prefixes, such that sorts can be reused by subsequent nodes. As this uses EXPLAIN in the test output, a new expected file is added for ORCA output even though the patch only touches the postgres planner. commit 728202b6 Author: Andrew Gierth <rhodiumtoad@postgresql.org> Date: Fri Sep 14 17:35:42 2018 +0100 Order active window clauses for greater reuse of Sort nodes. By sorting the active window list lexicographically by the sort clause list but putting longer clauses before shorter prefixes, we generate more chances to elide Sort nodes when building the path. Author: Daniel Gustafsson (with some editorialization by me) Reviewed-by: Alexander Kuzmenkov, Masahiko Sawada, Tom Lane Discussion: https://postgr.es/m/124A7F69-84CD-435B-BA0E-2695BE21E5C2%40yesql.se

Order active window clauses for greater reuse of Sort nodes.
This is a backport of the below commit from postgres 12dev, which in turn is a patch that was influenced by an optimization from the previous version of the Greenplum Window code. The idea is to order the Sort nodes based on sort prefixes, such that sorts can be reused by subsequent nodes. As this uses EXPLAIN in the test output, a new expected file is added for ORCA output even though the patch only touches the postgres planner. commit 728202b6 Author: Andrew Gierth <rhodiumtoad@postgresql.org> Date: Fri Sep 14 17:35:42 2018 +0100 Order active window clauses for greater reuse of Sort nodes. By sorting the active window list lexicographically by the sort clause list but putting longer clauses before shorter prefixes, we generate more chances to elide Sort nodes when building the path. Author: Daniel Gustafsson (with some editorialization by me) Reviewed-by: Alexander Kuzmenkov, Masahiko Sawada, Tom Lane Discussion: https://postgr.es/m/124A7F69-84CD-435B-BA0E-2695BE21E5C2%40yesql.se
3f0d46f7 · Daniel Gustafsson · 3f6273e1 · 3f0d46f7 · 3f0d46f7 · 3f0d46f7
6 changed files
--- a/src/backend/nodes/list.c
+++ b/src/backend/nodes/list.c
@@ -1003,8 +1003,11 @@ list_append_unique_oid(List *list, Oid datum)
 * via equal().
 *
 * This is almost the same functionality as list_union(), but list1 is
- * modified in-place rather than being copied.	Note also that list2's cells
- * are not inserted in list1, so the analogy to list_concat() isn't perfect.
+ * modified in-place rather than being copied. However, callers of this
+ * function may have strict ordering expectations -- i.e. that the relative
+ * order of those list2 elements that are not duplicates is preserved. Note
+ * also that list2's cells are not inserted in list1, so the analogy to
+ * list_concat() isn't perfect.
 */
 List *
 list_concat_unique(List *list1, List *list2)

--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -89,6 +89,17 @@ typedef struct
 	List	   *activeWindows;	/* active windows, if any */
 } standard_qp_extra;

+/*
+ * Temporary structure for use during WindowClause reordering in order to be
+ * able to sort WindowClauses on partitioning/ordering prefix.
+ */
+typedef struct
+{
+	WindowClause *wc;
+	List	   *uniqueOrder;	/* A List of unique ordering/partitioning
+								 * clauses per Window */
+} WindowClauseSortData;
+
 /* Local functions */
 static Node *preprocess_expression(PlannerInfo *root, Node *expr, int kind);
 static void preprocess_qual_conditions(PlannerInfo *root, Node *jtnode);
@@ -129,6 +140,7 @@ static void get_column_info_for_window(PlannerInfo *root, WindowClause *wc,
 						   int *ordNumCols,
 						   AttrNumber **ordColIdx,
 						   Oid **ordOperators);
+static int	common_prefix_cmp(const void *a, const void *b);

 static Bitmapset *canonicalize_colref_list(Node *node);
 static List *canonicalize_gs_list(List *gsl, bool ordinary);
@@ -4366,65 +4378,121 @@ postprocess_setop_tlist(List *new_tlist, List *orig_tlist)
 static List *
 select_active_windows(PlannerInfo *root, WindowFuncLists *wflists)
 {
-	List	   *result;
-	List	   *actives;
+	List	   *windowClause = root->parse->windowClause;
+	List	   *result = NIL;
 	ListCell   *lc;
+	int			nActive = 0;
+	WindowClauseSortData *actives = palloc(sizeof(WindowClauseSortData)
+										   * list_length(windowClause));

-	/* First, make a list of the active windows */
-	actives = NIL;
-	foreach(lc, root->parse->windowClause)
+	/* First, construct an array of the active windows */
+	foreach(lc, windowClause)
 	{
 		WindowClause *wc = (WindowClause *) lfirst(lc);

 		/* It's only active if wflists shows some related WindowFuncs */
 		Assert(wc->winref <= wflists->maxWinRef);
-		if (wflists->windowFuncs[wc->winref] != NIL)
-			actives = lappend(actives, wc);
+		if (wflists->windowFuncs[wc->winref] == NIL)
+			continue;
+
+		actives[nActive].wc = wc;	/* original clause */
+
+		/*
+		 * For sorting, we want the list of partition keys followed by the
+		 * list of sort keys. But pathkeys construction will remove duplicates
+		 * between the two, so we can as well (even though we can't detect all
+		 * of the duplicates, since some may come from ECs - that might mean
+		 * we miss optimization chances here). We must, however, ensure that
+		 * the order of entries is preserved with respect to the ones we do
+		 * keep.
+		 *
+		 * partitionClause and orderClause had their own duplicates removed in
+		 * parse analysis, so we're only concerned here with removing
+		 * orderClause entries that also appear in partitionClause.
+		 */
+		actives[nActive].uniqueOrder =
+			list_concat_unique(list_copy(wc->partitionClause),
+							   wc->orderClause);
+		nActive++;
 	}

 	/*
-	 * Now, ensure that windows with identical partitioning/ordering clauses
-	 * are adjacent in the list.  This is required by the SQL standard, which
-	 * says that only one sort is to be used for such windows, even if they
-	 * are otherwise distinct (eg, different names or framing clauses).
+	 * Sort active windows by their partitioning/ordering clauses, ignoring
+	 * any framing clauses, so that the windows that need the same sorting are
+	 * adjacent in the list. When we come to generate paths, this will avoid
+	 * inserting additional Sort nodes.
+	 *
+	 * This is how we implement a specific requirement from the SQL standard,
+	 * which says that when two or more windows are order-equivalent (i.e.
+	 * have matching partition and order clauses, even if their names or
+	 * framing clauses differ), then all peer rows must be presented in the
+	 * same order in all of them. If we allowed multiple sort nodes for such
+	 * cases, we'd risk having the peer rows end up in different orders in
+	 * equivalent windows due to sort instability. (See General Rule 4 of
+	 * <window clause> in SQL2008 - SQL2016.)
 	 *
-	 * There is room to be much smarter here, for example detecting whether
-	 * one window's sort keys are a prefix of another's (so that sorting for
-	 * the latter would do for the former), or putting windows first that
-	 * match a sort order available for the underlying query.  For the moment
-	 * we are content with meeting the spec.
+	 * Additionally, if the entire list of clauses of one window is a prefix
+	 * of another, put the window with stronger sorting requirements first.
+	 * This way we first sort for the stronger window, and won't have to sort
+	 * again for the weaker one.
 	 */
-	result = NIL;
-	while (actives != NIL)
-	{
-		WindowClause *wc = (WindowClause *) linitial(actives);
-		ListCell   *prev;
-		ListCell   *next;
+	qsort(actives, nActive, sizeof(WindowClauseSortData), common_prefix_cmp);

-		/* Move wc from actives to result */
-		actives = list_delete_first(actives);
-		result = lappend(result, wc);
+	/* build ordered list of the original WindowClause nodes */
+	for (int i = 0; i < nActive; i++)
+		result = lappend(result, actives[i].wc);

-		/* Now move any matching windows from actives to result */
-		prev = NULL;
-		for (lc = list_head(actives); lc; lc = next)
-		{
-			WindowClause *wc2 = (WindowClause *) lfirst(lc);
+	pfree(actives);

-			next = lnext(lc);
-			/* framing options are NOT to be compared here! */
-			if (equal(wc->partitionClause, wc2->partitionClause) &&
-				equal(wc->orderClause, wc2->orderClause))
-			{
-				actives = list_delete_cell(actives, lc, prev);
-				result = lappend(result, wc2);
-			}
-			else
-				prev = lc;
-		}
+	return result;
+}
+
+/*
+ * common_prefix_cmp
+ *	  QSort comparison function for WindowClauseSortData
+ *
+ * Sort the windows by the required sorting clauses. First, compare the sort
+ * clauses themselves. Second, if one window's clauses are a prefix of another
+ * one's clauses, put the window with more sort clauses first.
+ */
+static int
+common_prefix_cmp(const void *a, const void *b)
+{
+	const WindowClauseSortData *wcsa = a;
+	const WindowClauseSortData *wcsb = b;
+	ListCell   *item_a;
+	ListCell   *item_b;
+
+	forboth(item_a, wcsa->uniqueOrder, item_b, wcsb->uniqueOrder)
+	{
+		/*
+		 * GPDB_100_MERGE_FIXME: replace with lfirst_node() calls when commit
+		 * 8f0530f58061b185dc385df42e62d78a18d4ae3e is merged.
+		 */
+		SortGroupClause *sca = (SortGroupClause *) lfirst(item_a);
+		SortGroupClause *scb = (SortGroupClause *) lfirst(item_b);
+
+		if (sca->tleSortGroupRef > scb->tleSortGroupRef)
+			return -1;
+		else if (sca->tleSortGroupRef < scb->tleSortGroupRef)
+			return 1;
+		else if (sca->sortop > scb->sortop)
+			return -1;
+		else if (sca->sortop < scb->sortop)
+			return 1;
+		else if (sca->nulls_first && !scb->nulls_first)
+			return -1;
+		else if (!sca->nulls_first && scb->nulls_first)
+			return 1;
+		/* no need to compare eqop, since it is fully determined by sortop */
 	}

-	return result;
+	if (list_length(wcsa->uniqueOrder) > list_length(wcsb->uniqueOrder))
+		return -1;
+	else if (list_length(wcsa->uniqueOrder) < list_length(wcsb->uniqueOrder))
+		return 1;
+
+	return 0;
 }

 /*

--- a/src/test/regress/expected/olap_window_seq.out
+++ b/src/test/regress/expected/olap_window_seq.out
@@ -7889,23 +7889,20 @@ select count(*) over (partition by 1 order by cn rows between 1 preceding and 1
 -- MPP-13710
 create table redundant_sort_check (i int, j int, k int) distributed by (i);
 explain select count(*) over (order by i), count(*) over (partition by i order by j) from redundant_sort_check;
-                                               QUERY PLAN                                               
--------------------------------------------------------------------------------------------------------
- WindowAgg  (cost=14900.48..16458.48 rows=77900 width=8)
-   Partition By: i
-   Order By: j
-   ->  Sort  (cost=14900.48..15095.23 rows=77900 width=8)
-         Sort Key: i, j
-         ->  WindowAgg  (cost=7208.12..8571.37 rows=77900 width=8)
-               Order By: i
-               ->  Gather Motion 3:1  (slice1; segments: 3)  (cost=7208.12..7402.87 rows=77900 width=8)
-                     Merge Key: i
-                     ->  Sort  (cost=7208.12..7402.87 rows=25967 width=8)
-                           Sort Key: i
-                           ->  Seq Scan on redundant_sort_check  (cost=0.00..879.00 rows=25967 width=8)
- Settings:  optimizer=off
- Optimizer status: legacy query optimizer
-(14 rows)
+                                            QUERY PLAN                                            
+--------------------------------------------------------------------------------------------------
+ WindowAgg  (cost=7208.12..9934.62 rows=77900 width=8)
+   Order By: i
+   ->  Gather Motion 3:1  (slice1; segments: 3)  (cost=7208.12..8766.12 rows=77900 width=8)
+         Merge Key: i, j
+         ->  WindowAgg  (cost=7208.12..8766.12 rows=25967 width=8)
+               Partition By: i
+               Order By: j
+               ->  Sort  (cost=7208.12..7402.87 rows=25967 width=8)
+                     Sort Key: i, j
+                     ->  Seq Scan on redundant_sort_check  (cost=0.00..879.00 rows=25967 width=8)
+ Optimizer: legacy query optimizer
+(11 rows)

 -- End of MPP-13710
 -- MPP-13879
@@ -8151,39 +8148,34 @@ EXPLAIN SELECT count(*) over (PARTITION BY a ORDER BY b, c, d) as count1,
       count(*) over (PARTITION BY a ORDER BY c, b) as count2,
       count(*) over (PARTITION BY a ORDER BY c, b, d) as count3
 FROM foo;
-                                                      QUERY PLAN                                                      
----------------------------------------------------------------------------------------------------------------------
- Gather Motion 3:1  (slice1; segments: 3)  (cost=4.81..5.06 rows=10 width=16)
-   ->  WindowAgg  (cost=4.81..5.06 rows=4 width=16)
+                                                QUERY PLAN                                                
+----------------------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)  (cost=4.06..4.68 rows=10 width=16)
+   ->  WindowAgg  (cost=4.06..4.68 rows=4 width=16)
         Partition By: a
-         Order By: c, b, d
-         ->  Sort  (cost=4.81..4.84 rows=4 width=16)
-               Sort Key: a, c, b, d
-               ->  WindowAgg  (cost=4.42..4.65 rows=4 width=16)
+         Order By: b
+         ->  WindowAgg  (cost=4.06..4.51 rows=4 width=16)
+               Partition By: a
+               Order By: b, c
+               ->  WindowAgg  (cost=4.06..4.31 rows=4 width=16)
                     Partition By: a
-                     Order By: c, b
-                     ->  Sort  (cost=4.42..4.45 rows=4 width=16)
-                           Sort Key: a, c, b
-                           ->  WindowAgg  (cost=4.06..4.26 rows=4 width=16)
+                     Order By: b, c, d
+                     ->  Sort  (cost=4.06..4.08 rows=4 width=16)
+                           Sort Key: a, b, c, d
+                           ->  WindowAgg  (cost=3.27..3.89 rows=4 width=16)
                                 Partition By: a
                                 Order By: c
-                                 ->  Sort  (cost=4.06..4.08 rows=4 width=16)
-                                       Sort Key: a, c
-                                       ->  WindowAgg  (cost=3.27..3.89 rows=4 width=16)
+                                 ->  WindowAgg  (cost=3.27..3.72 rows=4 width=16)
+                                       Partition By: a
+                                       Order By: c, b
+                                       ->  WindowAgg  (cost=3.27..3.52 rows=4 width=16)
                                             Partition By: a
-                                             Order By: b
-                                             ->  WindowAgg  (cost=3.27..3.72 rows=4 width=16)
-                                                   Partition By: a
-                                                   Order By: b, c
-                                                   ->  WindowAgg  (cost=3.27..3.52 rows=4 width=16)
-                                                         Partition By: a
-                                                         Order By: b, c, d
-                                                         ->  Sort  (cost=3.27..3.29 rows=4 width=16)
-                                                               Sort Key: a, b, c, d
-                                                               ->  Seq Scan on foo  (cost=0.00..3.10 rows=4 width=16)
- Settings:  optimizer=off
- Optimizer status: legacy query optimizer
-(30 rows)
+                                             Order By: c, b, d
+                                             ->  Sort  (cost=3.27..3.29 rows=4 width=16)
+                                                   Sort Key: a, c, b, d
+                                                   ->  Seq Scan on foo  (cost=0.00..3.10 rows=4 width=16)
+ Optimizer: legacy query optimizer
+(25 rows)

 drop table foo;
 -- test predicate push down in subqueries for quals containing windowref nodes

--- a/src/test/regress/expected/window.out
+++ b/src/test/regress/expected/window.out
@@ -511,9 +511,9 @@ SELECT sum(salary),
 FROM empsalary GROUP BY depname;
  sum  | row_number |  sum  
 -------+------------+-------
- 14600 |          3 | 14600
-  7400 |          2 | 22000
 25100 |          1 | 47100
+  7400 |          2 | 22000
+ 14600 |          3 | 14600
 (3 rows)

 -- identical windows with different names
@@ -1057,6 +1057,59 @@ SELECT ntile(0) OVER (ORDER BY ten), ten, four FROM tenk1;
 ERROR:  argument of ntile must be greater than zero
 SELECT nth_value(four, 0) OVER (ORDER BY ten), ten, four FROM tenk1;
 ERROR:  argument of nth_value must be greater than zero
+-- Test Sort node collapsing
+EXPLAIN (COSTS OFF)
+SELECT * FROM
+  (SELECT depname,
+          sum(salary) OVER (PARTITION BY depname order by empno) depsalary,
+          min(salary) OVER (PARTITION BY depname, empno order by enroll_date) depminsalary
+   FROM empsalary) emp
+WHERE depname = 'sales';
+                                        QUERY PLAN                                        
+------------------------------------------------------------------------------------------
+ Gather Motion 3:1  (slice3; segments: 3)
+   ->  Subquery Scan on emp
+         ->  WindowAgg
+               Order By: empsalary.empno
+               ->  Sort
+                     Sort Key: empsalary.empno
+                     ->  Redistribute Motion 3:3  (slice2; segments: 3)
+                           Hash Key: empsalary.depname
+                           ->  WindowAgg
+                                 Partition By: empsalary.empno
+                                 Order By: empsalary.enroll_date
+                                 ->  Sort
+                                       Sort Key: empsalary.empno, empsalary.enroll_date
+                                       ->  Redistribute Motion 3:3  (slice1; segments: 3)
+                                             Hash Key: empsalary.depname, empsalary.empno
+                                             ->  Seq Scan on empsalary
+                                                   Filter: ((depname)::text = 'sales'::text)
+ Optimizer: legacy query optimizer
+(18 rows)
+
+-- Test Sort node reordering
+EXPLAIN (COSTS OFF)
+SELECT
+  lead(1) OVER (PARTITION BY depname ORDER BY salary, enroll_date),
+  lag(1) OVER (PARTITION BY depname ORDER BY salary,enroll_date,empno)
+FROM empsalary;
+                               QUERY PLAN                               
+------------------------------------------------------------------------
+ Gather Motion 3:1  (slice2; segments: 3)
+   ->  WindowAgg
+         Partition By: depname
+         Order By: salary, enroll_date
+         ->  WindowAgg
+               Partition By: depname
+               Order By: salary, enroll_date, empno
+               ->  Sort
+                     Sort Key: depname, salary, enroll_date, empno
+                     ->  Redistribute Motion 3:3  (slice1; segments: 3)
+                           Hash Key: depname
+                           ->  Seq Scan on empsalary
+ Optimizer: legacy query optimizer
+(13 rows)
+
 -- cleanup
 DROP TABLE empsalary;
 --

--- a/src/test/regress/expected/window_optimizer.out
+++ b/src/test/regress/expected/window_optimizer.out
@@ -511,9 +511,9 @@ SELECT sum(salary),
 FROM empsalary GROUP BY depname;
  sum  | row_number |  sum  
 -------+------------+-------
- 14600 |          3 | 14600
-  7400 |          2 | 22000
 25100 |          1 | 47100
+  7400 |          2 | 22000
+ 14600 |          3 | 14600
 (3 rows)

 -- identical windows with different names
@@ -1059,6 +1059,55 @@ SELECT ntile(0) OVER (ORDER BY ten), ten, four FROM tenk1;
 ERROR:  argument of ntile must be greater than zero
 SELECT nth_value(four, 0) OVER (ORDER BY ten), ten, four FROM tenk1;
 ERROR:  argument of nth_value must be greater than zero
+-- Test Sort node collapsing
+EXPLAIN (COSTS OFF)
+SELECT * FROM
+  (SELECT depname,
+          sum(salary) OVER (PARTITION BY depname order by empno) depsalary,
+          min(salary) OVER (PARTITION BY depname, empno order by enroll_date) depminsalary
+   FROM empsalary) emp
+WHERE depname = 'sales';
+                                  QUERY PLAN
+-----------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)
+   ->  Result
+         ->  WindowAgg
+               Partition By: depname, empno
+               Order By: enroll_date
+               ->  Sort
+                     Sort Key: depname, empno, enroll_date
+                     ->  WindowAgg
+                           Partition By: depname
+                           Order By: empno
+                           ->  Sort
+                                 Sort Key: depname, empno
+                                 ->  Table Scan on empsalary
+                                       Filter: ((depname)::text = 'sales'::text)
+(15 rows)
+
+-- Test Sort node reordering
+EXPLAIN (COSTS OFF)
+SELECT
+  lead(1) OVER (PARTITION BY depname ORDER BY salary, enroll_date),
+  lag(1) OVER (PARTITION BY depname ORDER BY salary,enroll_date,empno)
+FROM empsalary;
+                               QUERY PLAN                               
+------------------------------------------------------------------------
+ Gather Motion 3:1  (slice1; segments: 3)
+   ->  Result
+         ->  WindowAgg
+               Partition By: depname
+               Order By: salary, enroll_date, empno
+               ->  Sort
+                     Sort Key: depname, salary, enroll_date, empno
+                     ->  WindowAgg
+                           Partition By: depname
+                           Order By: salary, enroll_date
+                           ->  Sort
+                                 Sort Key: depname, salary, enroll_date
+                                 ->  Table Scan on empsalary
+(14 rows)
+
 -- cleanup
 DROP TABLE empsalary;
 --

--- a/src/test/regress/sql/window.sql
+++ b/src/test/regress/sql/window.sql
@@ -276,6 +276,22 @@ SELECT ntile(0) OVER (ORDER BY ten), ten, four FROM tenk1;

 SELECT nth_value(four, 0) OVER (ORDER BY ten), ten, four FROM tenk1;

+-- Test Sort node collapsing
+EXPLAIN (COSTS OFF)
+SELECT * FROM
+  (SELECT depname,
+          sum(salary) OVER (PARTITION BY depname order by empno) depsalary,
+          min(salary) OVER (PARTITION BY depname, empno order by enroll_date) depminsalary
+   FROM empsalary) emp
+WHERE depname = 'sales';
+
+-- Test Sort node reordering
+EXPLAIN (COSTS OFF)
+SELECT
+  lead(1) OVER (PARTITION BY depname ORDER BY salary, enroll_date),
+  lag(1) OVER (PARTITION BY depname ORDER BY salary,enroll_date,empno)
+FROM empsalary;
+
 -- cleanup
 DROP TABLE empsalary;