Commit da56e576 authored by Tom Lane

Modify tidbitmap.c to avoid creating a hash table until there is more
than one heap page represented in the bitmap.  This is a bit ugly but
it cuts overhead fairly effectively in simple join cases.  Per example
from Sergey Koposov.
Parent e90df662
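
The core of the change is a small state machine that defers hash-table creation until a second heap page actually needs an entry. The stand-alone C sketch below illustrates the pattern only: the Bitmap, Entry, and fixed-size table names are invented for this example, and the real code uses dynahash and PagetableEntry rather than the toy linear table shown here.

#include <stdio.h>
#include <string.h>

typedef unsigned int BlockNumber;   /* stand-in for PostgreSQL's BlockNumber */

typedef struct Entry
{
    BlockNumber blockno;
    unsigned int bits;              /* toy payload instead of a bitmap word array */
} Entry;

typedef enum
{
    EMPTY,                          /* no table, no entries */
    ONE_PAGE,                       /* the single entry lives in entry1 */
    HASHED                          /* entries live in the (toy) table */
} Status;

typedef struct Bitmap
{
    Status      status;
    Entry       entry1;             /* used while status == ONE_PAGE */
    Entry       table[64];          /* toy stand-in for a dynahash table */
    int         nentries;
} Bitmap;

/* Find or create the entry for blockno, building the table lazily. */
static Entry *
get_entry(Bitmap *bm, BlockNumber blockno)
{
    if (bm->status == EMPTY)
    {
        /* First page: use the fixed slot, no table needed at all. */
        bm->status = ONE_PAGE;
        bm->entry1.blockno = blockno;
        bm->nentries = 1;
        return &bm->entry1;
    }
    if (bm->status == ONE_PAGE)
    {
        if (bm->entry1.blockno == blockno)
            return &bm->entry1;
        /* Second distinct page: migrate entry1, then switch modes. */
        bm->table[0] = bm->entry1;
        bm->status = HASHED;
    }
    /* Linear scan of the toy table (no overflow handling in this sketch). */
    for (int i = 0; i < bm->nentries; i++)
    {
        if (bm->table[i].blockno == blockno)
            return &bm->table[i];
    }
    bm->table[bm->nentries].blockno = blockno;
    return &bm->table[bm->nentries++];
}

int
main(void)
{
    Bitmap      bm;

    memset(&bm, 0, sizeof(bm));
    get_entry(&bm, 42)->bits |= 1;  /* stays in ONE_PAGE state */
    get_entry(&bm, 42)->bits |= 2;  /* same page, still no table */
    get_entry(&bm, 43)->bits |= 1;  /* second page forces the switch */
    printf("status=%d nentries=%d\n", (int) bm.status, bm.nentries);
    return 0;
}

The full diff of tidbitmap.c follows; tbm_get_pageentry and tbm_create_pagetable implement the same idea against the real hash table.
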
@@ -23,7 +23,7 @@
* Copyright (c) 2003-2005, PostgreSQL Global Development Group
*
* IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/nodes/tidbitmap.c,v 1.2 2005/04/19 22:35:15 tgl Exp $
* $PostgreSQL: pgsql/src/backend/nodes/tidbitmap.c,v 1.3 2005/05/17 00:43:47 tgl Exp $
*
*-------------------------------------------------------------------------
*/
@@ -90,6 +90,24 @@ typedef struct PagetableEntry
bitmapword words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
} PagetableEntry;
/*
* dynahash.c is optimized for relatively large, long-lived hash tables.
* This is not ideal for TIDBitMap, particularly when we are using a bitmap
* scan on the inside of a nestloop join: a bitmap may well live only long
* enough to accumulate one entry in such cases. We therefore avoid creating
* an actual hashtable until we need two pagetable entries. When just one
* pagetable entry is needed, we store it in a fixed field of TIDBitMap.
* (NOTE: we don't get rid of the hashtable if the bitmap later shrinks down
* to zero or one page again. So, status can be TBM_HASH even when nentries
* is zero or one.)
*/
typedef enum
{
TBM_EMPTY, /* no hashtable, nentries == 0 */
TBM_ONE_PAGE, /* entry1 contains the single entry */
TBM_HASH /* pagetable is valid, entry1 is not */
} TBMStatus;
/*
* Here is the representation for a whole TIDBitMap:
*/
@@ -97,25 +115,29 @@ struct TIDBitmap
{
NodeTag type; /* to make it a valid Node */
MemoryContext mcxt; /* memory context containing me */
TBMStatus status; /* see codes above */
HTAB *pagetable; /* hash table of PagetableEntry's */
int nentries; /* number of entries in pagetable */
int maxentries; /* limit on same to meet maxbytes */
int npages; /* number of exact entries in pagetable */
int nchunks; /* number of lossy entries in pagetable */
bool iterating; /* tbm_begin_iterate called? */
PagetableEntry entry1; /* used when status == TBM_ONE_PAGE */
/* the remaining fields are used while producing sorted output: */
TBMIterateResult *output; /* NULL if not yet created */
PagetableEntry **spages; /* sorted exact-page list, or NULL */
PagetableEntry **schunks; /* sorted lossy-chunk list, or NULL */
int spageptr; /* next spages index */
int schunkptr; /* next schunks index */
int schunkbit; /* next bit to check in current schunk */
TBMIterateResult output; /* MUST BE LAST (because variable-size) */
};
/* Local function prototypes */
static PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
BlockNumber pageno);
static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
static bool tbm_intersect_page(PagetableEntry *apage, const TIDBitmap *b);
static const PagetableEntry *tbm_find_pageentry(const TIDBitmap *tbm,
BlockNumber pageno);
static PagetableEntry *tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno);
static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
@@ -134,37 +156,79 @@ TIDBitmap *
tbm_create(long maxbytes)
{
TIDBitmap *tbm;
HASHCTL hash_ctl;
long nbuckets;
tbm = makeNode(TIDBitmap);
/* we rely on makeNode to have zeroed all the fields */
/*
* Create the TIDBitmap struct, with enough trailing space to serve
* the needs of the TBMIterateResult sub-struct.
*/
tbm = (TIDBitmap *) palloc(sizeof(TIDBitmap) +
MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
/* Zero all the fixed fields */
MemSetAligned(tbm, 0, sizeof(TIDBitmap));
tbm->type = T_TIDBitmap; /* Set NodeTag */
tbm->mcxt = CurrentMemoryContext;
tbm->status = TBM_EMPTY;
/*
* Estimate number of hashtable entries we can have within maxbytes.
* This estimates the hash overhead at MAXALIGN(sizeof(HASHELEMENT))
* plus a pointer per hash entry, which is crude but good enough for
* our purpose. (NOTE: this does not count the space for data
* structures created during iteration readout.)
* our purpose. Also count an extra Pointer per entry for the arrays
* created during iteration readout.
*/
nbuckets = maxbytes /
(MAXALIGN(sizeof(HASHELEMENT)) + MAXALIGN(sizeof(PagetableEntry))
+ sizeof(Pointer));
+ sizeof(Pointer) + sizeof(Pointer));
nbuckets = Min(nbuckets, INT_MAX-1); /* safety limit */
nbuckets = Max(nbuckets, 16); /* sanity limit */
tbm->maxentries = (int) nbuckets;
return tbm;
}
/*
* Actually create the hashtable. Since this is a moderately expensive
* proposition, we don't do it until we have to.
*/
static void
tbm_create_pagetable(TIDBitmap *tbm)
{
HASHCTL hash_ctl;
Assert(tbm->status != TBM_HASH);
Assert(tbm->pagetable == NULL);
/* Create the hashtable proper */
MemSet(&hash_ctl, 0, sizeof(hash_ctl));
hash_ctl.keysize = sizeof(BlockNumber);
hash_ctl.entrysize = sizeof(PagetableEntry);
hash_ctl.hash = tag_hash;
hash_ctl.hcxt = CurrentMemoryContext;
hash_ctl.hcxt = tbm->mcxt;
tbm->pagetable = hash_create("TIDBitmap",
128, /* start small and extend */
&hash_ctl,
HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
return tbm;
/* If entry1 is valid, push it into the hashtable */
if (tbm->status == TBM_ONE_PAGE)
{
PagetableEntry *page;
bool found;
page = (PagetableEntry *) hash_search(tbm->pagetable,
(void *) &tbm->entry1.blockno,
HASH_ENTER, &found);
if (page == NULL)
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
Assert(!found);
memcpy(page, &tbm->entry1, sizeof(PagetableEntry));
}
tbm->status = TBM_HASH;
}
/*
@@ -173,9 +237,8 @@ tbm_create(long maxbytes)
void
tbm_free(TIDBitmap *tbm)
{
hash_destroy(tbm->pagetable);
if (tbm->output)
pfree(tbm->output);
if (tbm->pagetable)
hash_destroy(tbm->pagetable);
if (tbm->spages)
pfree(tbm->spages);
if (tbm->schunks)
@@ -235,62 +298,77 @@ tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids)
void
tbm_union(TIDBitmap *a, const TIDBitmap *b)
{
HASH_SEQ_STATUS status;
Assert(!a->iterating);
/* Nothing to do if b is empty */
if (b->nentries == 0)
return;
/* Scan through chunks and pages in b, merge into a */
if (b->status == TBM_ONE_PAGE)
tbm_union_page(a, &b->entry1);
else
{
HASH_SEQ_STATUS status;
PagetableEntry *bpage;
Assert(b->status == TBM_HASH);
hash_seq_init(&status, b->pagetable);
while ((bpage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
tbm_union_page(a, bpage);
}
}
/* Process one page of b during a union op */
static void
tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage)
{
PagetableEntry *apage;
PagetableEntry *bpage;
int wordnum;
Assert(!a->iterating);
/* Scan through chunks and pages in b, merge into a */
hash_seq_init(&status, b->pagetable);
while ((bpage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
if (bpage->ischunk)
{
if (bpage->ischunk)
/* Scan b's chunk, mark each indicated page lossy in a */
for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
{
/* Scan b's chunk, mark each indicated page lossy in a */
for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
bitmapword w = bpage->words[wordnum];
if (w != 0)
{
bitmapword w = bpage->words[wordnum];
BlockNumber pg;
if (w != 0)
pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
while (w != 0)
{
BlockNumber pg;
pg = bpage->blockno + (wordnum * BITS_PER_BITMAPWORD);
while (w != 0)
{
if (w & 1)
tbm_mark_page_lossy(a, pg);
pg++;
w >>= 1;
}
if (w & 1)
tbm_mark_page_lossy(a, pg);
pg++;
w >>= 1;
}
}
}
else if (tbm_page_is_lossy(a, bpage->blockno))
}
else if (tbm_page_is_lossy(a, bpage->blockno))
{
/* page is already lossy in a, nothing to do */
return;
}
else
{
apage = tbm_get_pageentry(a, bpage->blockno);
if (apage->ischunk)
{
/* page is already lossy in a, nothing to do */
continue;
/* The page is a lossy chunk header, set bit for itself */
apage->words[0] |= ((bitmapword) 1 << 0);
}
else
{
apage = tbm_get_pageentry(a, bpage->blockno);
if (apage->ischunk)
{
/* The page is a lossy chunk header, set bit for itself */
apage->words[0] |= ((bitmapword) 1 << 0);
}
else
{
/* Both pages are exact, merge at the bit level */
for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
apage->words[wordnum] |= bpage->words[wordnum];
}
/* Both pages are exact, merge at the bit level */
for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
apage->words[wordnum] |= bpage->words[wordnum];
}
if (a->nentries > a->maxentries)
tbm_lossify(a);
}
if (a->nentries > a->maxentries)
tbm_lossify(a);
}
/*
@@ -301,96 +379,121 @@ tbm_union(TIDBitmap *a, const TIDBitmap *b)
void
tbm_intersect(TIDBitmap *a, const TIDBitmap *b)
{
HASH_SEQ_STATUS status;
PagetableEntry *apage;
PagetableEntry *bpage;
int wordnum;
Assert(!a->iterating);
/* Nothing to do if a is empty */
if (a->nentries == 0)
return;
/* Scan through chunks and pages in a, try to match to b */
hash_seq_init(&status, a->pagetable);
while ((apage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
if (a->status == TBM_ONE_PAGE)
{
if (apage->ischunk)
if (tbm_intersect_page(&a->entry1, b))
{
/* Scan each bit in chunk, try to clear */
bool candelete = true;
for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
{
bitmapword w = apage->words[wordnum];
if (w != 0)
{
bitmapword neww = w;
BlockNumber pg;
int bitnum;
/* Page is now empty, remove it from a */
Assert(!a->entry1.ischunk);
a->npages--;
a->nentries--;
Assert(a->nentries == 0);
a->status = TBM_EMPTY;
}
}
else
{
HASH_SEQ_STATUS status;
PagetableEntry *apage;
pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
bitnum = 0;
while (w != 0)
{
if (w & 1)
{
if (!tbm_page_is_lossy(b, pg) &&
tbm_find_pageentry(b, pg) == NULL)
{
/* Page is not in b at all, lose lossy bit */
neww &= ~((bitmapword) 1 << bitnum);
}
}
pg++;
bitnum++;
w >>= 1;
}
apage->words[wordnum] = neww;
if (neww != 0)
candelete = false;
}
}
if (candelete)
Assert(a->status == TBM_HASH);
hash_seq_init(&status, a->pagetable);
while ((apage = (PagetableEntry *) hash_seq_search(&status)) != NULL)
{
if (tbm_intersect_page(apage, b))
{
/* Chunk is now empty, remove it from a */
/* Page or chunk is now empty, remove it from a */
if (apage->ischunk)
a->nchunks--;
else
a->npages--;
a->nentries--;
if (hash_search(a->pagetable,
(void *) &apage->blockno,
HASH_REMOVE, NULL) == NULL)
elog(ERROR, "hash table corrupted");
a->nentries--;
a->nchunks--;
}
}
else if (tbm_page_is_lossy(b, apage->blockno))
{
/* page is lossy in b, cannot clear any bits */
continue;
}
else
}
}
/*
* Process one page of a during an intersection op
*
* Returns TRUE if apage is now empty and should be deleted from a
*/
static bool
tbm_intersect_page(PagetableEntry *apage, const TIDBitmap *b)
{
const PagetableEntry *bpage;
int wordnum;
if (apage->ischunk)
{
/* Scan each bit in chunk, try to clear */
bool candelete = true;
for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
{
bool candelete = true;
bitmapword w = apage->words[wordnum];
bpage = tbm_find_pageentry(b, apage->blockno);
if (bpage != NULL)
if (w != 0)
{
/* Both pages are exact, merge at the bit level */
Assert(!bpage->ischunk);
for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
bitmapword neww = w;
BlockNumber pg;
int bitnum;
pg = apage->blockno + (wordnum * BITS_PER_BITMAPWORD);
bitnum = 0;
while (w != 0)
{
apage->words[wordnum] &= bpage->words[wordnum];
if (apage->words[wordnum] != 0)
candelete = false;
if (w & 1)
{
if (!tbm_page_is_lossy(b, pg) &&
tbm_find_pageentry(b, pg) == NULL)
{
/* Page is not in b at all, lose lossy bit */
neww &= ~((bitmapword) 1 << bitnum);
}
}
pg++;
bitnum++;
w >>= 1;
}
apage->words[wordnum] = neww;
if (neww != 0)
candelete = false;
}
if (candelete)
}
return candelete;
}
else if (tbm_page_is_lossy(b, apage->blockno))
{
/* page is lossy in b, cannot clear any bits */
return false;
}
else
{
bool candelete = true;
bpage = tbm_find_pageentry(b, apage->blockno);
if (bpage != NULL)
{
/* Both pages are exact, merge at the bit level */
Assert(!bpage->ischunk);
for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
{
/* Page is now empty, remove it from a */
if (hash_search(a->pagetable,
(void *) &apage->blockno,
HASH_REMOVE, NULL) == NULL)
elog(ERROR, "hash table corrupted");
a->nentries--;
a->npages--;
apage->words[wordnum] &= bpage->words[wordnum];
if (apage->words[wordnum] != 0)
candelete = false;
}
}
return candelete;
}
}
@@ -411,15 +514,16 @@ tbm_begin_iterate(TIDBitmap *tbm)
tbm->iterating = true;
/*
* Allocate the output data structure if we didn't already.
* (We don't do this during tbm_create since it's entirely possible
* that a TIDBitmap will live and die without ever being iterated.)
* Reset iteration pointers.
*/
tbm->spageptr = 0;
tbm->schunkptr = 0;
tbm->schunkbit = 0;
/*
* Nothing else to do if no entries, nor if we don't have a hashtable.
*/
if (!tbm->output)
tbm->output = (TBMIterateResult *)
MemoryContextAllocZero(tbm->mcxt,
sizeof(TBMIterateResult) +
MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
if (tbm->nentries == 0 || tbm->status != TBM_HASH)
return;
/*
* Create and fill the sorted page lists if we didn't already.
*/
@@ -447,12 +551,6 @@ tbm_begin_iterate(TIDBitmap *tbm)
qsort(tbm->spages, npages, sizeof(PagetableEntry *), tbm_comparator);
if (nchunks > 1)
qsort(tbm->schunks, nchunks, sizeof(PagetableEntry *), tbm_comparator);
/*
* Reset iteration pointers.
*/
tbm->spageptr = 0;
tbm->schunkptr = 0;
tbm->schunkbit = 0;
}
/*
@@ -468,7 +566,7 @@ tbm_begin_iterate(TIDBitmap *tbm)
TBMIterateResult *
tbm_iterate(TIDBitmap *tbm)
{
TBMIterateResult *output = tbm->output;
TBMIterateResult *output = &(tbm->output);
Assert(tbm->iterating);
/*
@@ -521,10 +619,16 @@ tbm_iterate(TIDBitmap *tbm)
if (tbm->spageptr < tbm->npages)
{
PagetableEntry *page = tbm->spages[tbm->spageptr];
PagetableEntry *page;
int ntuples;
int wordnum;
/* In ONE_PAGE state, we don't allocate an spages[] array */
if (tbm->status == TBM_ONE_PAGE)
page = &tbm->entry1;
else
page = tbm->spages[tbm->spageptr];
/* scan bitmap to extract individual offset numbers */
ntuples = 0;
for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
@@ -559,10 +663,22 @@ tbm_iterate(TIDBitmap *tbm)
*
* Returns NULL if there is no non-lossy entry for the pageno.
*/
static PagetableEntry *
static const PagetableEntry *
tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno)
{
PagetableEntry *page;
const PagetableEntry *page;
if (tbm->nentries == 0) /* in case pagetable doesn't exist */
return NULL;
if (tbm->status == TBM_ONE_PAGE)
{
page = &tbm->entry1;
if (page->blockno != pageno)
return NULL;
Assert(!page->ischunk);
return page;
}
page = (PagetableEntry *) hash_search(tbm->pagetable,
(void *) &pageno,
@@ -588,14 +704,33 @@ tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno)
PagetableEntry *page;
bool found;
/* Look up or create an entry */
page = (PagetableEntry *) hash_search(tbm->pagetable,
(void *) &pageno,
HASH_ENTER, &found);
if (page == NULL)
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
if (tbm->status == TBM_EMPTY)
{
/* Use the fixed slot */
page = &tbm->entry1;
found = false;
tbm->status = TBM_ONE_PAGE;
}
else
{
if (tbm->status == TBM_ONE_PAGE)
{
page = &tbm->entry1;
if (page->blockno == pageno)
return page;
/* Time to switch from one page to a hashtable */
tbm_create_pagetable(tbm);
}
/* Look up or create an entry */
page = (PagetableEntry *) hash_search(tbm->pagetable,
(void *) &pageno,
HASH_ENTER, &found);
if (page == NULL)
ereport(ERROR,
(errcode(ERRCODE_OUT_OF_MEMORY),
errmsg("out of memory")));
}
/* Initialize it if not present before */
if (!found)
@@ -623,6 +758,7 @@ tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno)
/* we can skip the lookup if there are no lossy chunks */
if (tbm->nchunks == 0)
return false;
Assert(tbm->status == TBM_HASH);
bitno = pageno % PAGES_PER_CHUNK;
chunk_pageno = pageno - bitno;
@@ -656,6 +792,10 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno)
int wordnum;
int bitnum;
/* We force the bitmap into hashtable mode whenever it's lossy */
if (tbm->status != TBM_HASH)
tbm_create_pagetable(tbm);
bitno = pageno % PAGES_PER_CHUNK;
chunk_pageno = pageno - bitno;
@@ -731,6 +871,8 @@ tbm_lossify(TIDBitmap *tbm)
* during each call.
*/
Assert(!tbm->iterating);
Assert(tbm->status == TBM_HASH);
hash_seq_init(&status, tbm->pagetable);
while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL)
{
......
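
One further detail worth noting: the patch drops the separately palloc'd TBMIterateResult and instead declares it as the last field of TIDBitmap, with tbm_create over-allocating the struct so the trailing offsets array has room for MAX_TUPLES_PER_PAGE items. Below is a rough sketch of that single-allocation idiom, with invented names (IterResult, Bitmap, MAX_OFFSETS) and plain malloc standing in for palloc/MemSetAligned.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_OFFSETS 8               /* stand-in for MAX_TUPLES_PER_PAGE */

typedef struct IterResult
{
    unsigned int blockno;
    int          ntuples;
    unsigned short offsets[1];      /* pre-C99 "struct hack": really MAX_OFFSETS long */
} IterResult;

typedef struct Bitmap
{
    int         nentries;
    /* ... other fixed fields ... */
    IterResult  output;             /* MUST BE LAST (variable size) */
} Bitmap;

static Bitmap *
bitmap_create(void)
{
    /* One allocation covers the fixed fields plus the extra offsets slots. */
    Bitmap *bm = malloc(sizeof(Bitmap) + MAX_OFFSETS * sizeof(unsigned short));

    if (bm == NULL)
    {
        fprintf(stderr, "out of memory\n");
        exit(1);
    }
    memset(bm, 0, sizeof(Bitmap));  /* zero only the fixed-size part */
    return bm;
}

int
main(void)
{
    Bitmap *bm = bitmap_create();

    bm->output.blockno = 7;
    bm->output.ntuples = 2;
    bm->output.offsets[0] = 1;
    bm->output.offsets[1] = 5;      /* writes into the over-allocated tail */
    printf("block %u has %d tuples\n", bm->output.blockno, bm->output.ntuples);
    free(bm);
    return 0;
}

Embedding the result this way saves an allocation per bitmap and lets tbm_iterate hand back a pointer into the bitmap itself (output = &tbm->output), which is why tbm_free no longer needs to pfree a separate output object.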