From ba1cc6501e98aee7d43098b032d6656f243bca5b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 25 Jan 2013 16:49:13 +0200 Subject: [PATCH] Add some randomness to the choice of which GiST page to insert to. When descending the tree for an insert, and there are multiple equally good pages we could insert to, make the choice in random. Previously, we would always choose the tuple with lowest offset number. That meant that when two non-leaf pages overlap - in the extreme case they might have exactly the same key - all but the first such page went unused. That wasn't optimal for space usage; if you deleted some tuples from the non-first pages, the space would never be reused. With this patch, the other pages are sometimes chosen too, although there's still a heavy bias towards low-offset tuples, so that we don't lose cache locality when doing a lot of inserts with similar keys. Original idea by Alexander Korotkov, although this patch version was written by me and copy-edited by Tom Lane. --- src/backend/access/gist/gistutil.c | 66 ++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 4 deletions(-) diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index 8b60774f50..a127627334 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -379,6 +379,7 @@ gistchoose(Relation r, Page p, IndexTuple it, /* it has compressed entry */ GISTENTRY entry, identry[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; + int keep_current_best; Assert(!GistPageIsLeaf(p)); @@ -401,6 +402,31 @@ gistchoose(Relation r, Page p, IndexTuple it, /* it has compressed entry */ */ best_penalty[0] = -1; + /* + * If we find a tuple that's exactly as good as the currently best one, we + * could use either one. When inserting a lot of tuples with the same or + * similar keys, it's preferable to descend down the same path when + * possible, as that's more cache-friendly. On the other hand, if all + * inserts land on the same leaf page after a split, we're never going to + * insert anything to the other half of the split, and will end up using + * only 50% of the available space. Distributing the inserts evenly would + * lead to better space usage, but that hurts cache-locality during + * insertion. To get the best of both worlds, when we find a tuple that's + * exactly as good as the previous best, choose randomly whether to stick + * to the old best, or use the new one. Once we decide to stick to the + * old best, we keep sticking to it for any subsequent equally good tuples + * we might find. This favors tuples with low offsets, but still allows + * some inserts to go to other equally-good subtrees. + * + * keep_current_best is -1 if we haven't yet had to make a random choice + * whether to keep the current best tuple. If we have done so, and + * decided to keep it, keep_current_best is 1; if we've decided to + * replace, keep_current_best is 0. (This state will be reset to -1 as + * soon as we've made the replacement, but sometimes we make the choice in + * advance of actually finding a replacement best tuple.) + */ + keep_current_best = -1; + /* * Loop over tuples on page. */ @@ -446,6 +472,9 @@ gistchoose(Relation r, Page p, IndexTuple it, /* it has compressed entry */ if (j < r->rd_att->natts - 1) best_penalty[j + 1] = -1; + + /* we have new best, so reset keep-it decision */ + keep_current_best = -1; } else if (best_penalty[j] == usize) { @@ -468,12 +497,41 @@ gistchoose(Relation r, Page p, IndexTuple it, /* it has compressed entry */ } /* - * If we find a tuple with zero penalty for all columns, there's no - * need to examine remaining tuples; just break out of the loop and - * return it. + * If we looped past the last column, and did not update "result", + * then this tuple is exactly as good as the prior best tuple. + */ + if (j == r->rd_att->natts && result != i) + { + if (keep_current_best == -1) + { + /* we didn't make the random choice yet for this old best */ + keep_current_best = (random() <= (MAX_RANDOM_VALUE / 2)) ? 1 : 0; + } + if (keep_current_best == 0) + { + /* we choose to use the new tuple */ + result = i; + /* choose again if there are even more exactly-as-good ones */ + keep_current_best = -1; + } + } + + /* + * If we find a tuple with zero penalty for all columns, and we've + * decided we don't want to search for another tuple with equal + * penalty, there's no need to examine remaining tuples; just break + * out of the loop and return it. */ if (zero_penalty) - break; + { + if (keep_current_best == -1) + { + /* we didn't make the random choice yet for this old best */ + keep_current_best = (random() <= (MAX_RANDOM_VALUE / 2)) ? 1 : 0; + } + if (keep_current_best == 1) + break; + } } return result; -- GitLab