Commit 3beef665 authored by Hongze Cheng

more TDB

Parent 031d84e7
...
@@ -22,16 +22,6 @@ target_link_libraries(
     PUBLIC util
 )
 
-# for tdb_sqlite
-add_library(tdb_sqlite "")
-target_sources(tdb_sqlite
-    PRIVATE
-        "src/sqlite/pcache.c"
-        "src/sqlite/pcache1.c"
-        "src/sqlite/pager.c"
-)
-target_include_directories(tdb_sqlite PUBLIC "src/sqliteinc")
-
 # for test
 if(${BUILD_TEST})
     add_subdirectory(test)
...
/*
** 2007 August 27
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
**
** This file contains code used to implement mutexes on Btree objects.
** This code really belongs in btree.c. But btree.c is getting too
** big and we want to break it down some. This package seemed like
** a good breakout.
*/
#include "btreeInt.h"
#ifndef SQLITE_OMIT_SHARED_CACHE
#if SQLITE_THREADSAFE
/*
** Obtain the BtShared mutex associated with B-Tree handle p. Also,
** set BtShared.db to the database handle associated with p and the
** p->locked boolean to true.
*/
static void lockBtreeMutex(Btree *p){
assert( p->locked==0 );
assert( sqlite3_mutex_notheld(p->pBt->mutex) );
assert( sqlite3_mutex_held(p->db->mutex) );
sqlite3_mutex_enter(p->pBt->mutex);
p->pBt->db = p->db;
p->locked = 1;
}
/*
** Release the BtShared mutex associated with B-Tree handle p and
** clear the p->locked boolean.
*/
static void SQLITE_NOINLINE unlockBtreeMutex(Btree *p){
BtShared *pBt = p->pBt;
assert( p->locked==1 );
assert( sqlite3_mutex_held(pBt->mutex) );
assert( sqlite3_mutex_held(p->db->mutex) );
assert( p->db==pBt->db );
sqlite3_mutex_leave(pBt->mutex);
p->locked = 0;
}
/* Forward reference */
static void SQLITE_NOINLINE btreeLockCarefully(Btree *p);
/*
** Enter a mutex on the given BTree object.
**
** If the object is not sharable, then no mutex is ever required
** and this routine is a no-op. The underlying mutex is non-recursive.
** But we keep a reference count in Btree.wantToLock so the behavior
** of this interface is recursive.
**
** To avoid deadlocks, multiple Btrees are locked in the same order
** by all database connections. The p->pNext is a list of other
** Btrees belonging to the same database connection as the p Btree
** which need to be locked after p. If we cannot get a lock on
** p, then first unlock all of the others on p->pNext, then wait
** for the lock to become available on p, then relock all of the
** subsequent Btrees that desire a lock.
*/
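/*
** Editor's illustration (not part of the original source): suppose a
** connection has three sharable Btrees A, B, and C with BtShared
** addresses A<B<C, linked in that order via pNext.  If a thread already
** holds B and C but cannot immediately lock A, btreeLockCarefully()
** releases B and C, blocks until A becomes available, then re-acquires
** B and C in ascending address order.  Because every thread acquires in
** the same ascending order, no cycle of waiters can form.
*/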
void sqlite3BtreeEnter(Btree *p){
/* Some basic sanity checking on the Btree. The list of Btrees
** connected by pNext and pPrev should be in sorted order by
** Btree.pBt value. All elements of the list should belong to
** the same connection. Only shared Btrees are on the list. */
assert( p->pNext==0 || p->pNext->pBt>p->pBt );
assert( p->pPrev==0 || p->pPrev->pBt<p->pBt );
assert( p->pNext==0 || p->pNext->db==p->db );
assert( p->pPrev==0 || p->pPrev->db==p->db );
assert( p->sharable || (p->pNext==0 && p->pPrev==0) );
/* Check for locking consistency */
assert( !p->locked || p->wantToLock>0 );
assert( p->sharable || p->wantToLock==0 );
/* We should already hold a lock on the database connection */
assert( sqlite3_mutex_held(p->db->mutex) );
/* Unless the database is sharable and unlocked, then BtShared.db
** should already be set correctly. */
assert( (p->locked==0 && p->sharable) || p->pBt->db==p->db );
if( !p->sharable ) return;
p->wantToLock++;
if( p->locked ) return;
btreeLockCarefully(p);
}
/* This is a helper function for sqlite3BtreeEnter(). By moving
** complex, but seldom used logic, out of sqlite3BtreeEnter() and
** into this routine, we avoid unnecessary stack pointer changes
** and thus help the sqlite3BtreeEnter() routine to run much faster
** in the common case.
*/
static void SQLITE_NOINLINE btreeLockCarefully(Btree *p){
Btree *pLater;
/* In most cases, we should be able to acquire the lock we
** want without having to go through the ascending lock
** procedure that follows. Just be sure not to block.
*/
if( sqlite3_mutex_try(p->pBt->mutex)==SQLITE_OK ){
p->pBt->db = p->db;
p->locked = 1;
return;
}
/* To avoid deadlock, first release all locks with a larger
** BtShared address. Then acquire our lock. Then reacquire
** the other BtShared locks that we used to hold in ascending
** order.
*/
for(pLater=p->pNext; pLater; pLater=pLater->pNext){
assert( pLater->sharable );
assert( pLater->pNext==0 || pLater->pNext->pBt>pLater->pBt );
assert( !pLater->locked || pLater->wantToLock>0 );
if( pLater->locked ){
unlockBtreeMutex(pLater);
}
}
lockBtreeMutex(p);
for(pLater=p->pNext; pLater; pLater=pLater->pNext){
if( pLater->wantToLock ){
lockBtreeMutex(pLater);
}
}
}
/*
** Exit the recursive mutex on a Btree.
*/
void sqlite3BtreeLeave(Btree *p){
assert( sqlite3_mutex_held(p->db->mutex) );
if( p->sharable ){
assert( p->wantToLock>0 );
p->wantToLock--;
if( p->wantToLock==0 ){
unlockBtreeMutex(p);
}
}
}
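/*
** Editor's sketch (not part of the original source): because
** Btree.wantToLock is a counter, nested enter/leave pairs on a sharable
** Btree behave recursively even though the BtShared mutex itself is not
** recursive:
**
**   sqlite3BtreeEnter(p);      wantToLock 0->1, mutex acquired
**   sqlite3BtreeEnter(p);      wantToLock 1->2, already locked, no-op
**   sqlite3BtreeLeave(p);      wantToLock 2->1, mutex still held
**   sqlite3BtreeLeave(p);      wantToLock 1->0, mutex released
*/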
#ifndef NDEBUG
/*
** Return true if the BtShared mutex is held on the btree, or if the
** B-Tree is not marked as sharable.
**
** This routine is used only from within assert() statements.
*/
int sqlite3BtreeHoldsMutex(Btree *p){
assert( p->sharable==0 || p->locked==0 || p->wantToLock>0 );
assert( p->sharable==0 || p->locked==0 || p->db==p->pBt->db );
assert( p->sharable==0 || p->locked==0 || sqlite3_mutex_held(p->pBt->mutex) );
assert( p->sharable==0 || p->locked==0 || sqlite3_mutex_held(p->db->mutex) );
return (p->sharable==0 || p->locked);
}
#endif
/*
** Enter the mutex on every Btree associated with a database
** connection. This is needed (for example) prior to parsing
** a statement since we will be comparing table and column names
** against all schemas and we do not want those schemas being
** reset out from under us.
**
** There is a corresponding leave-all procedure.
**
** Enter the mutexes in ascending order by BtShared pointer address
** to avoid the possibility of deadlock when two threads with
** two or more btrees in common both try to lock all their btrees
** at the same instant.
*/
static void SQLITE_NOINLINE btreeEnterAll(sqlite3 *db){
int i;
int skipOk = 1;
Btree *p;
assert( sqlite3_mutex_held(db->mutex) );
for(i=0; i<db->nDb; i++){
p = db->aDb[i].pBt;
if( p && p->sharable ){
sqlite3BtreeEnter(p);
skipOk = 0;
}
}
db->noSharedCache = skipOk;
}
void sqlite3BtreeEnterAll(sqlite3 *db){
if( db->noSharedCache==0 ) btreeEnterAll(db);
}
static void SQLITE_NOINLINE btreeLeaveAll(sqlite3 *db){
int i;
Btree *p;
assert( sqlite3_mutex_held(db->mutex) );
for(i=0; i<db->nDb; i++){
p = db->aDb[i].pBt;
if( p ) sqlite3BtreeLeave(p);
}
}
void sqlite3BtreeLeaveAll(sqlite3 *db){
if( db->noSharedCache==0 ) btreeLeaveAll(db);
}
#ifndef NDEBUG
/*
** Return true if the current thread holds the database connection
** mutex and all required BtShared mutexes.
**
** This routine is used inside assert() statements only.
*/
int sqlite3BtreeHoldsAllMutexes(sqlite3 *db){
int i;
if( !sqlite3_mutex_held(db->mutex) ){
return 0;
}
for(i=0; i<db->nDb; i++){
Btree *p;
p = db->aDb[i].pBt;
if( p && p->sharable &&
(p->wantToLock==0 || !sqlite3_mutex_held(p->pBt->mutex)) ){
return 0;
}
}
return 1;
}
#endif /* NDEBUG */
#ifndef NDEBUG
/*
** Return true if the correct mutexes are held for accessing the
** db->aDb[iDb].pSchema structure. The mutexes required for schema
** access are:
**
** (1) The mutex on db
** (2) if iDb!=1, then the mutex on db->aDb[iDb].pBt.
**
** If pSchema is not NULL, then iDb is computed from pSchema and
** db using sqlite3SchemaToIndex().
*/
int sqlite3SchemaMutexHeld(sqlite3 *db, int iDb, Schema *pSchema){
Btree *p;
assert( db!=0 );
if( pSchema ) iDb = sqlite3SchemaToIndex(db, pSchema);
assert( iDb>=0 && iDb<db->nDb );
if( !sqlite3_mutex_held(db->mutex) ) return 0;
if( iDb==1 ) return 1;
p = db->aDb[iDb].pBt;
assert( p!=0 );
return p->sharable==0 || p->locked==1;
}
#endif /* NDEBUG */
#else /* SQLITE_THREADSAFE>0 above. SQLITE_THREADSAFE==0 below */
/*
** The following are special cases for mutex enter routines for use
** in single threaded applications that use shared cache. Except for
** these two routines, all mutex operations are no-ops in that case and
** are null #defines in btree.h.
**
** If shared cache is disabled, then all btree mutex routines, including
** the ones below, are no-ops and are null #defines in btree.h.
*/
void sqlite3BtreeEnter(Btree *p){
p->pBt->db = p->db;
}
void sqlite3BtreeEnterAll(sqlite3 *db){
int i;
for(i=0; i<db->nDb; i++){
Btree *p = db->aDb[i].pBt;
if( p ){
p->pBt->db = p->db;
}
}
}
#endif /* if SQLITE_THREADSAFE */
#ifndef SQLITE_OMIT_INCRBLOB
/*
** Enter a mutex on a Btree given a cursor owned by that Btree.
**
** These entry points are used by incremental I/O only. Enter() is required
** any time OMIT_SHARED_CACHE is not defined, regardless of whether or not
** the build is threadsafe. Leave() is only required by threadsafe builds.
*/
void sqlite3BtreeEnterCursor(BtCursor *pCur){
sqlite3BtreeEnter(pCur->pBtree);
}
# if SQLITE_THREADSAFE
void sqlite3BtreeLeaveCursor(BtCursor *pCur){
sqlite3BtreeLeave(pCur->pBtree);
}
# endif
#endif /* ifndef SQLITE_OMIT_INCRBLOB */
#endif /* ifndef SQLITE_OMIT_SHARED_CACHE */
[Two source diffs omitted because they are too large to display; view the blobs instead.]
/*
** 2008 August 05
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
** This file implements the page cache.
*/
#include "sqliteInt.h"
/*
** A complete page cache is an instance of this structure. Every
** entry in the cache holds a single page of the database file. The
** btree layer only operates on the cached copy of the database pages.
**
** A page cache entry is "clean" if it exactly matches what is currently
** on disk. A page is "dirty" if it has been modified and needs to be
** persisted to disk.
**
** pDirty, pDirtyTail, pSynced:
** All dirty pages are linked into the doubly linked list using
** PgHdr.pDirtyNext and pDirtyPrev. The list is maintained in LRU order
** such that p was added to the list more recently than p->pDirtyNext.
** PCache.pDirty points to the first (newest) element in the list and
** pDirtyTail to the last (oldest).
**
** The PCache.pSynced variable is used to optimize searching for a dirty
** page to eject from the cache mid-transaction. It is better to eject
** a page that does not require a journal sync than one that does.
** Therefore, pSynced is maintained so that it *almost* always points
** to either the oldest page in the pDirty/pDirtyTail list that has a
** clear PGHDR_NEED_SYNC flag or to a page that is older than this one
** (so that the right page to eject can be found by following pDirtyPrev
** pointers).
*/
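/*
** Editor's diagram of the dirty list described above (not part of the
** original source):
**
**   PCache.pDirty --> [newest] --pDirtyNext--> ... --> [oldest] <-- PCache.pDirtyTail
**                              <--pDirtyPrev--
**
** pSynced points at the oldest page known to have PGHDR_NEED_SYNC clear,
** or at some page even older than that; sqlite3PcacheFetchStress() then
** walks pDirtyPrev (toward newer pages) from pSynced looking for a page
** that can be ejected without a journal sync.
*/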
struct PCache {
PgHdr *pDirty, *pDirtyTail; /* List of dirty pages in LRU order */
PgHdr *pSynced; /* Last synced page in dirty page list */
int nRefSum; /* Sum of ref counts over all pages */
int szCache; /* Configured cache size */
int szSpill; /* Size before spilling occurs */
int szPage; /* Size of every page in this cache */
int szExtra; /* Size of extra space for each page */
u8 bPurgeable; /* True if pages are on backing store */
  u8 eCreate; /* eCreate value for xFetch() */
int (*xStress)(void *, PgHdr *); /* Call to try make a page clean */
void * pStress; /* Argument to xStress */
sqlite3_pcache *pCache; /* Pluggable cache module */
};
/********************************** Test and Debug Logic **********************/
/*
** Debug tracing macros. Enable by changing the "0" to "1" and
** recompiling.
**
** When sqlite3PcacheTrace is 1, single line trace messages are issued.
** When sqlite3PcacheTrace is 2, a dump of the pcache showing all cache entries
** is displayed for many operations, resulting in a lot of output.
*/
#if defined(SQLITE_DEBUG) && 0
int sqlite3PcacheTrace = 2; /* 0: off 1: simple 2: cache dumps */
int sqlite3PcacheMxDump = 9999; /* Max cache entries for pcacheDump() */
#define pcacheTrace(X) \
if (sqlite3PcacheTrace) { \
sqlite3DebugPrintf X; \
}
void pcacheDump(PCache *pCache) {
int N;
int i, j;
sqlite3_pcache_page *pLower;
PgHdr * pPg;
unsigned char * a;
if (sqlite3PcacheTrace < 2) return;
if (pCache->pCache == 0) return;
N = sqlite3PcachePagecount(pCache);
if (N > sqlite3PcacheMxDump) N = sqlite3PcacheMxDump;
for (i = 1; i <= N; i++) {
pLower = pcache2.xFetch(pCache->pCache, i, 0);
if (pLower == 0) continue;
pPg = (PgHdr *)pLower->pExtra;
printf("%3d: nRef %2d flgs %02x data ", i, pPg->nRef, pPg->flags);
a = (unsigned char *)pLower->pBuf;
for (j = 0; j < 12; j++) printf("%02x", a[j]);
printf("\n");
if (pPg->pPage == 0) {
pcache2.xUnpin(pCache->pCache, pLower, 0);
}
}
}
#else
#define pcacheTrace(X)
#define pcacheDump(X)
#endif
// /*
// ** Check invariants on a PgHdr entry. Return true if everything is OK.
// ** Return false if any invariant is violated.
// **
// ** This routine is for use inside of assert() statements only. For
// ** example:
// **
// ** assert( sqlite3PcachePageSanity(pPg) );
// */
// #ifdef SQLITE_DEBUG
// int sqlite3PcachePageSanity(PgHdr *pPg) {
// PCache *pCache;
// assert(pPg != 0);
// assert(pPg->pgno > 0 || pPg->pPager == 0); /* Page number is 1 or more */
// pCache = pPg->pCache;
// assert(pCache != 0); /* Every page has an associated PCache */
// if (pPg->flags & PGHDR_CLEAN) {
// assert((pPg->flags & PGHDR_DIRTY) == 0); /* Cannot be both CLEAN and DIRTY */
// assert(pCache->pDirty != pPg); /* CLEAN pages not on dirty list */
// assert(pCache->pDirtyTail != pPg);
// }
// /* WRITEABLE pages must also be DIRTY */
// if (pPg->flags & PGHDR_WRITEABLE) {
// assert(pPg->flags & PGHDR_DIRTY); /* WRITEABLE implies DIRTY */
// }
// /* NEED_SYNC can be set independently of WRITEABLE. This can happen,
// ** for example, when using the sqlite3PagerDontWrite() optimization:
// ** (1) Page X is journalled, and gets WRITEABLE and NEED_SYNC.
// ** (2) Page X moved to freelist, WRITEABLE is cleared
// ** (3) Page X reused, WRITEABLE is set again
// ** If NEED_SYNC had been cleared in step 2, then it would not be reset
// ** in step 3, and page might be written into the database without first
// ** syncing the rollback journal, which might cause corruption on a power
// ** loss.
// **
// ** Another example is when the database page size is smaller than the
// ** disk sector size. When any page of a sector is journalled, all pages
// ** in that sector are marked NEED_SYNC even if they are still CLEAN, just
// ** in case they are later modified, since all pages in the same sector
// ** must be journalled and synced before any of those pages can be safely
// ** written.
// */
// return 1;
// }
// #endif /* SQLITE_DEBUG */
/********************************** Linked List Management ********************/
/* Allowed values for second argument to pcacheManageDirtyList() */
#define PCACHE_DIRTYLIST_REMOVE 1 /* Remove pPage from dirty list */
#define PCACHE_DIRTYLIST_ADD 2 /* Add pPage to the dirty list */
#define PCACHE_DIRTYLIST_FRONT 3 /* Move pPage to the front of the list */
/*
** Manage pPage's participation on the dirty list. Bits of the addRemove
** argument determines what operation to do. The 0x01 bit means first
** remove pPage from the dirty list. The 0x02 means add pPage back to
** the dirty list. Doing both moves pPage to the front of the dirty list.
*/
static void pcacheManageDirtyList(PgHdr *pPage, u8 addRemove) {
PCache *p = pPage->pCache;
pcacheTrace(("%p.DIRTYLIST.%s %d\n", p, addRemove == 1 ? "REMOVE" : addRemove == 2 ? "ADD" : "FRONT", pPage->pgno));
if (addRemove & PCACHE_DIRTYLIST_REMOVE) {
assert(pPage->pDirtyNext || pPage == p->pDirtyTail);
assert(pPage->pDirtyPrev || pPage == p->pDirty);
    /* Update the PCache.pSynced variable if necessary. */
if (p->pSynced == pPage) {
p->pSynced = pPage->pDirtyPrev;
}
if (pPage->pDirtyNext) {
pPage->pDirtyNext->pDirtyPrev = pPage->pDirtyPrev;
} else {
assert(pPage == p->pDirtyTail);
p->pDirtyTail = pPage->pDirtyPrev;
}
if (pPage->pDirtyPrev) {
pPage->pDirtyPrev->pDirtyNext = pPage->pDirtyNext;
} else {
/* If there are now no dirty pages in the cache, set eCreate to 2.
** This is an optimization that allows sqlite3PcacheFetch() to skip
** searching for a dirty page to eject from the cache when it might
** otherwise have to. */
assert(pPage == p->pDirty);
p->pDirty = pPage->pDirtyNext;
assert(p->bPurgeable || p->eCreate == 2);
if (p->pDirty == 0) { /*OPTIMIZATION-IF-TRUE*/
assert(p->bPurgeable == 0 || p->eCreate == 1);
p->eCreate = 2;
}
}
}
if (addRemove & PCACHE_DIRTYLIST_ADD) {
pPage->pDirtyPrev = 0;
pPage->pDirtyNext = p->pDirty;
if (pPage->pDirtyNext) {
assert(pPage->pDirtyNext->pDirtyPrev == 0);
pPage->pDirtyNext->pDirtyPrev = pPage;
} else {
p->pDirtyTail = pPage;
if (p->bPurgeable) {
assert(p->eCreate == 2);
p->eCreate = 1;
}
}
p->pDirty = pPage;
/* If pSynced is NULL and this page has a clear NEED_SYNC flag, set
** pSynced to point to it. Checking the NEED_SYNC flag is an
** optimization, as if pSynced points to a page with the NEED_SYNC
** flag set sqlite3PcacheFetchStress() searches through all newer
** entries of the dirty-list for a page with NEED_SYNC clear anyway. */
if (!p->pSynced && 0 == (pPage->flags & PGHDR_NEED_SYNC) /*OPTIMIZATION-IF-FALSE*/
) {
p->pSynced = pPage;
}
}
pcacheDump(p);
}
/*
** Wrapper around the pluggable caches xUnpin method. If the cache is
** being used for an in-memory database, this function is a no-op.
*/
static void pcacheUnpin(PgHdr *p) {
if (p->pCache->bPurgeable) {
pcacheTrace(("%p.UNPIN %d\n", p->pCache, p->pgno));
pcache2.xUnpin(p->pCache->pCache, p->pPage, 0);
pcacheDump(p->pCache);
}
}
/*
** Compute the number of pages of cache requested. p->szCache is the
** cache size requested by the "PRAGMA cache_size" statement.
*/
static int numberOfCachePages(PCache *p) {
if (p->szCache >= 0) {
/* IMPLEMENTATION-OF: R-42059-47211 If the argument N is positive then the
** suggested cache size is set to N. */
return p->szCache;
} else {
i64 n;
    /* IMPLEMENTATION-OF: R-59858-46238 If the argument N is negative, then the
** number of cache pages is adjusted to be a number of pages that would
** use approximately abs(N*1024) bytes of memory based on the current
** page size. */
n = ((-1024 * (i64)p->szCache) / (p->szPage + p->szExtra));
if (n > 1000000000) n = 1000000000;
return (int)n;
}
}
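/*
** Editor's worked example (values assumed, not from the original source):
** after "PRAGMA cache_size=-2000", szCache is -2000, so with szPage==4096
** and szExtra==136 the routine above yields
**
**   n = (-1024 * -2000) / (4096 + 136) = 2048000 / 4232 = 483 pages,
**
** i.e. roughly 2000 KiB of cache memory regardless of the page size.
*/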
/*************************************************** General Interfaces ******
**
** Initialize and shutdown the page cache subsystem. Neither of these
** functions is threadsafe.
*/
int sqlite3PcacheInitialize(void) { return pcache2.xInit(pcache2.pArg); }
void sqlite3PcacheShutdown(void) {
if (pcache2.xShutdown) {
/* IMPLEMENTATION-OF: R-26000-56589 The xShutdown() method may be NULL. */
pcache2.xShutdown(pcache2.pArg);
}
}
/*
** Return the size in bytes of a PCache object.
*/
int sqlite3PcacheSize(void) { return sizeof(PCache); }
/*
** Create a new PCache object. Storage space to hold the object
** has already been allocated and is passed in as the p pointer.
** The caller discovers how much space needs to be allocated by
** calling sqlite3PcacheSize().
**
** szExtra is some extra space allocated for each page. The first
** 8 bytes of the extra space will be zeroed as the page is allocated,
** but remaining content will be uninitialized. Though it is opaque
** to this module, the extra space really ends up being the MemPage
** structure in the pager.
*/
int sqlite3PcacheOpen(int szPage, /* Size of every page */
int szExtra, /* Extra space associated with each page */
int bPurgeable, /* True if pages are on backing store */
int (*xStress)(void *, PgHdr *), /* Call to try to make pages clean */
void * pStress, /* Argument to xStress */
PCache *p /* Preallocated space for the PCache */
) {
memset(p, 0, sizeof(PCache));
p->szPage = 1;
p->szExtra = szExtra;
assert(szExtra >= 8); /* First 8 bytes will be zeroed */
p->bPurgeable = bPurgeable;
p->eCreate = 2;
p->xStress = xStress;
p->pStress = pStress;
p->szCache = 100;
p->szSpill = 1;
pcacheTrace(("%p.OPEN szPage %d bPurgeable %d\n", p, szPage, bPurgeable));
return sqlite3PcacheSetPageSize(p, szPage);
}
/*
** Change the page size for PCache object. The caller must ensure that there
** are no outstanding page references when this function is called.
*/
int sqlite3PcacheSetPageSize(PCache *pCache, int szPage) {
assert(pCache->nRefSum == 0 && pCache->pDirty == 0);
if (pCache->szPage) {
sqlite3_pcache *pNew;
pNew = pcache2.xCreate(szPage, pCache->szExtra + ROUND8(sizeof(PgHdr)), pCache->bPurgeable);
if (pNew == 0) return SQLITE_NOMEM;
pcache2.xCachesize(pNew, numberOfCachePages(pCache));
if (pCache->pCache) {
pcache2.xDestroy(pCache->pCache);
}
pCache->pCache = pNew;
pCache->szPage = szPage;
pcacheTrace(("%p.PAGESIZE %d\n", pCache, szPage));
}
return 0;
}
/*
** Try to obtain a page from the cache.
**
** This routine returns a pointer to an sqlite3_pcache_page object if
** such an object is already in cache, or if a new one is created.
** This routine returns a NULL pointer if the object was not in cache
** and could not be created.
**
** The createFlag should be 0 to check for existing pages and should
** be 3 (not 1, but 3) to try to create a new page.
**
** If the createFlag is 0, then NULL is always returned if the page
** is not already in the cache. If createFlag is 1, then a new page
** is created only if that can be done without spilling dirty pages
** and without exceeding the cache size limit.
**
** The caller needs to invoke sqlite3PcacheFetchFinish() to properly
** initialize the sqlite3_pcache_page object and convert it into a
** PgHdr object. The sqlite3PcacheFetch() and sqlite3PcacheFetchFinish()
** routines are split this way for performance reasons. When separated
** they can both (usually) operate without having to push values to
** the stack on entry and pop them back off on exit, which saves a
** lot of pushing and popping.
*/
sqlite3_pcache_page *sqlite3PcacheFetch(PCache *pCache, /* Obtain the page from this cache */
Pgno pgno, /* Page number to obtain */
int createFlag /* If true, create page if it does not exist already */
) {
int eCreate;
sqlite3_pcache_page *pRes;
assert(pCache != 0);
assert(pCache->pCache != 0);
assert(createFlag == 3 || createFlag == 0);
assert(pCache->eCreate == ((pCache->bPurgeable && pCache->pDirty) ? 1 : 2));
/* eCreate defines what to do if the page does not exist.
** 0 Do not allocate a new page. (createFlag==0)
** 1 Allocate a new page if doing so is inexpensive.
** (createFlag==1 AND bPurgeable AND pDirty)
** 2 Allocate a new page even if doing so is difficult.
** (createFlag==1 AND !(bPurgeable AND pDirty))
*/
eCreate = createFlag & pCache->eCreate;
assert(eCreate == 0 || eCreate == 1 || eCreate == 2);
assert(createFlag == 0 || pCache->eCreate == eCreate);
assert(createFlag == 0 || eCreate == 1 + (!pCache->bPurgeable || !pCache->pDirty));
pRes = pcache2.xFetch(pCache->pCache, pgno, eCreate);
pcacheTrace(("%p.FETCH %d%s (result: %p)\n", pCache, pgno, createFlag ? " create" : "", pRes));
return pRes;
}
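/*
** Editor's usage sketch (an assumed caller pattern, not part of the
** original source): a pager-style caller combines the three fetch
** routines roughly as follows:
**
**   sqlite3_pcache_page *pBase = sqlite3PcacheFetch(pCache, pgno, 3);
**   if( pBase==0 ){
**     rc = sqlite3PcacheFetchStress(pCache, pgno, &pBase);
**     if( pBase==0 ) return rc;
**   }
**   pPg = sqlite3PcacheFetchFinish(pCache, pgno, pBase);
*/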
/*
** If the sqlite3PcacheFetch() routine is unable to allocate a new
** page because no clean pages are available for reuse and the cache
** size limit has been reached, then this routine can be invoked to
** try harder to allocate a page. This routine might invoke the stress
** callback to spill dirty pages to the journal. It will then try to
** allocate the new page and will only fail to allocate a new page on
** an OOM error.
**
** This routine should be invoked only after sqlite3PcacheFetch() fails.
*/
int sqlite3PcacheFetchStress(PCache * pCache, /* Obtain the page from this cache */
Pgno pgno, /* Page number to obtain */
sqlite3_pcache_page **ppPage /* Write result here */
) {
PgHdr *pPg;
if (pCache->eCreate == 2) return 0;
if (sqlite3PcachePagecount(pCache) > pCache->szSpill) {
/* Find a dirty page to write-out and recycle. First try to find a
** page that does not require a journal-sync (one with PGHDR_NEED_SYNC
** cleared), but if that is not possible settle for any other
** unreferenced dirty page.
**
** If the LRU page in the dirty list that has a clear PGHDR_NEED_SYNC
** flag is currently referenced, then the following may leave pSynced
** set incorrectly (pointing to other than the LRU page with NEED_SYNC
** cleared). This is Ok, as pSynced is just an optimization. */
for (pPg = pCache->pSynced; pPg && (pPg->nRef || (pPg->flags & PGHDR_NEED_SYNC)); pPg = pPg->pDirtyPrev)
;
pCache->pSynced = pPg;
if (!pPg) {
for (pPg = pCache->pDirtyTail; pPg && pPg->nRef; pPg = pPg->pDirtyPrev)
;
}
if (pPg) {
int rc;
#ifdef SQLITE_LOG_CACHE_SPILL
sqlite3_log(SQLITE_FULL, "spill page %d making room for %d - cache used: %d/%d", pPg->pgno, pgno,
pcache2.xPagecount(pCache->pCache), numberOfCachePages(pCache));
#endif
pcacheTrace(("%p.SPILL %d\n", pCache, pPg->pgno));
rc = pCache->xStress(pCache->pStress, pPg);
pcacheDump(pCache);
if (rc != 0 && rc != SQLITE_BUSY) {
return rc;
}
}
}
*ppPage = pcache2.xFetch(pCache->pCache, pgno, 2);
return *ppPage == 0 ? SQLITE_NOMEM : 0;
}
/*
** This is a helper routine for sqlite3PcacheFetchFinish()
**
** In the uncommon case where the page being fetched has not been
** initialized, this routine is invoked to do the initialization.
** This routine is broken out into a separate function since it
** requires extra stack manipulation that can be avoided in the common
** case.
*/
static PgHdr *pcacheFetchFinishWithInit(PCache * pCache, /* Obtain the page from this cache */
Pgno pgno, /* Page number obtained */
sqlite3_pcache_page *pPage /* Page obtained by prior PcacheFetch() call */
) {
PgHdr *pPgHdr;
assert(pPage != 0);
pPgHdr = (PgHdr *)pPage->pExtra;
assert(pPgHdr->pPage == 0);
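  /* Editor's note (added comment): the memset below zeroes every PgHdr
  ** field from pDirty through the end of the structure in a single call;
  ** the fields that need non-zero values are assigned individually after. */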
memset(&pPgHdr->pDirty, 0, sizeof(PgHdr) - offsetof(PgHdr, pDirty));
pPgHdr->pPage = pPage;
pPgHdr->pData = pPage->pBuf;
pPgHdr->pExtra = (void *)&pPgHdr[1];
memset(pPgHdr->pExtra, 0, 8);
pPgHdr->pCache = pCache;
pPgHdr->pgno = pgno;
pPgHdr->flags = PGHDR_CLEAN;
return sqlite3PcacheFetchFinish(pCache, pgno, pPage);
}
/*
** This routine converts the sqlite3_pcache_page object returned by
** sqlite3PcacheFetch() into an initialized PgHdr object. This routine
** must be called after sqlite3PcacheFetch() in order to get a usable
** result.
*/
PgHdr *sqlite3PcacheFetchFinish(PCache * pCache, /* Obtain the page from this cache */
Pgno pgno, /* Page number obtained */
sqlite3_pcache_page *pPage /* Page obtained by prior PcacheFetch() call */
) {
PgHdr *pPgHdr;
assert(pPage != 0);
pPgHdr = (PgHdr *)pPage->pExtra;
if (!pPgHdr->pPage) {
return pcacheFetchFinishWithInit(pCache, pgno, pPage);
}
pCache->nRefSum++;
pPgHdr->nRef++;
// assert(sqlite3PcachePageSanity(pPgHdr));
return pPgHdr;
}
/*
** Decrement the reference count on a page. If the page is clean and the
** reference count drops to 0, then it is made eligible for recycling.
*/
void sqlite3PcacheRelease(PgHdr *p) {
assert(p->nRef > 0);
p->pCache->nRefSum--;
if ((--p->nRef) == 0) {
if (p->flags & PGHDR_CLEAN) {
pcacheUnpin(p);
} else {
pcacheManageDirtyList(p, PCACHE_DIRTYLIST_FRONT);
}
}
}
/*
** Increase the reference count of a supplied page by 1.
*/
void sqlite3PcacheRef(PgHdr *p) {
assert(p->nRef > 0);
// assert(sqlite3PcachePageSanity(p));
p->nRef++;
p->pCache->nRefSum++;
}
/*
** Drop a page from the cache. There must be exactly one reference to the
** page. This function deletes that reference, so after it returns the
** page pointed to by p is invalid.
*/
void sqlite3PcacheDrop(PgHdr *p) {
assert(p->nRef == 1);
// assert(sqlite3PcachePageSanity(p));
if (p->flags & PGHDR_DIRTY) {
pcacheManageDirtyList(p, PCACHE_DIRTYLIST_REMOVE);
}
p->pCache->nRefSum--;
pcache2.xUnpin(p->pCache->pCache, p->pPage, 1);
}
/*
** Make sure the page is marked as dirty. If it isn't dirty already,
** make it so.
*/
void sqlite3PcacheMakeDirty(PgHdr *p) {
assert(p->nRef > 0);
// assert(sqlite3PcachePageSanity(p));
if (p->flags & (PGHDR_CLEAN | PGHDR_DONT_WRITE)) { /*OPTIMIZATION-IF-FALSE*/
p->flags &= ~PGHDR_DONT_WRITE;
if (p->flags & PGHDR_CLEAN) {
p->flags ^= (PGHDR_DIRTY | PGHDR_CLEAN);
pcacheTrace(("%p.DIRTY %d\n", p->pCache, p->pgno));
assert((p->flags & (PGHDR_DIRTY | PGHDR_CLEAN)) == PGHDR_DIRTY);
pcacheManageDirtyList(p, PCACHE_DIRTYLIST_ADD);
}
// assert(sqlite3PcachePageSanity(p));
}
}
/*
** Make sure the page is marked as clean. If it isn't clean already,
** make it so.
*/
void sqlite3PcacheMakeClean(PgHdr *p) {
// assert(sqlite3PcachePageSanity(p));
assert((p->flags & PGHDR_DIRTY) != 0);
assert((p->flags & PGHDR_CLEAN) == 0);
pcacheManageDirtyList(p, PCACHE_DIRTYLIST_REMOVE);
p->flags &= ~(PGHDR_DIRTY | PGHDR_NEED_SYNC | PGHDR_WRITEABLE);
p->flags |= PGHDR_CLEAN;
pcacheTrace(("%p.CLEAN %d\n", p->pCache, p->pgno));
// assert(sqlite3PcachePageSanity(p));
if (p->nRef == 0) {
pcacheUnpin(p);
}
}
/*
** Make every page in the cache clean.
*/
void sqlite3PcacheCleanAll(PCache *pCache) {
PgHdr *p;
pcacheTrace(("%p.CLEAN-ALL\n", pCache));
while ((p = pCache->pDirty) != 0) {
sqlite3PcacheMakeClean(p);
}
}
/*
** Clear the PGHDR_NEED_SYNC and PGHDR_WRITEABLE flag from all dirty pages.
*/
void sqlite3PcacheClearWritable(PCache *pCache) {
PgHdr *p;
pcacheTrace(("%p.CLEAR-WRITEABLE\n", pCache));
for (p = pCache->pDirty; p; p = p->pDirtyNext) {
p->flags &= ~(PGHDR_NEED_SYNC | PGHDR_WRITEABLE);
}
pCache->pSynced = pCache->pDirtyTail;
}
/*
** Clear the PGHDR_NEED_SYNC flag from all dirty pages.
*/
void sqlite3PcacheClearSyncFlags(PCache *pCache) {
PgHdr *p;
for (p = pCache->pDirty; p; p = p->pDirtyNext) {
p->flags &= ~PGHDR_NEED_SYNC;
}
pCache->pSynced = pCache->pDirtyTail;
}
/*
** Change the page number of page p to newPgno.
*/
void sqlite3PcacheMove(PgHdr *p, Pgno newPgno) {
PCache *pCache = p->pCache;
assert(p->nRef > 0);
assert(newPgno > 0);
// assert(sqlite3PcachePageSanity(p));
pcacheTrace(("%p.MOVE %d -> %d\n", pCache, p->pgno, newPgno));
pcache2.xRekey(pCache->pCache, p->pPage, p->pgno, newPgno);
p->pgno = newPgno;
if ((p->flags & PGHDR_DIRTY) && (p->flags & PGHDR_NEED_SYNC)) {
pcacheManageDirtyList(p, PCACHE_DIRTYLIST_FRONT);
}
}
/*
** Drop every cache entry whose page number is greater than "pgno". The
** caller must ensure that there are no outstanding references to any pages
** other than page 1 with a page number greater than pgno.
**
** If there is a reference to page 1 and the pgno parameter passed to this
** function is 0, then the data area associated with page 1 is zeroed, but
** the page object is not dropped.
*/
void sqlite3PcacheTruncate(PCache *pCache, Pgno pgno) {
if (pCache->pCache) {
PgHdr *p;
PgHdr *pNext;
pcacheTrace(("%p.TRUNCATE %d\n", pCache, pgno));
for (p = pCache->pDirty; p; p = pNext) {
pNext = p->pDirtyNext;
      /* This routine never gets called with a positive pgno except right
** after sqlite3PcacheCleanAll(). So if there are dirty pages,
** it must be that pgno==0.
*/
assert(p->pgno > 0);
if (p->pgno > pgno) {
assert(p->flags & PGHDR_DIRTY);
sqlite3PcacheMakeClean(p);
}
}
if (pgno == 0 && pCache->nRefSum) {
sqlite3_pcache_page *pPage1;
pPage1 = pcache2.xFetch(pCache->pCache, 1, 0);
if (pPage1) { /* Page 1 is always available in cache, because
** pCache->nRefSum>0 */
memset(pPage1->pBuf, 0, pCache->szPage);
pgno = 1;
}
}
pcache2.xTruncate(pCache->pCache, pgno + 1);
}
}
/*
** Close a cache.
*/
void sqlite3PcacheClose(PCache *pCache) {
assert(pCache->pCache != 0);
pcacheTrace(("%p.CLOSE\n", pCache));
pcache2.xDestroy(pCache->pCache);
}
/*
** Discard the contents of the cache.
*/
void sqlite3PcacheClear(PCache *pCache) { sqlite3PcacheTruncate(pCache, 0); }
/*
** Merge two lists of pages connected by pDirty and in pgno order.
** Do not bother fixing the pDirtyPrev pointers.
*/
static PgHdr *pcacheMergeDirtyList(PgHdr *pA, PgHdr *pB) {
PgHdr result, *pTail;
pTail = &result;
assert(pA != 0 && pB != 0);
for (;;) {
if (pA->pgno < pB->pgno) {
pTail->pDirty = pA;
pTail = pA;
pA = pA->pDirty;
if (pA == 0) {
pTail->pDirty = pB;
break;
}
} else {
pTail->pDirty = pB;
pTail = pB;
pB = pB->pDirty;
if (pB == 0) {
pTail->pDirty = pA;
break;
}
}
}
return result.pDirty;
}
/*
** Sort the list of pages in ascending order by pgno. Pages are
** connected by pDirty pointers. The pDirtyPrev pointers are
** corrupted by this sort.
**
** Since there cannot be more than 2^31 distinct pages in a database,
** there cannot be more than 31 buckets required by the merge sorter.
** One extra bucket is added to catch overflow in case something
** ever changes to make the previous sentence incorrect.
*/
#define N_SORT_BUCKET 32
static PgHdr *pcacheSortDirtyList(PgHdr *pIn) {
PgHdr *a[N_SORT_BUCKET], *p;
int i;
memset(a, 0, sizeof(a));
while (pIn) {
p = pIn;
pIn = p->pDirty;
p->pDirty = 0;
for (i = 0; i < N_SORT_BUCKET - 1; i++) {
if (a[i] == 0) {
a[i] = p;
break;
} else {
p = pcacheMergeDirtyList(a[i], p);
a[i] = 0;
}
}
if (i == N_SORT_BUCKET - 1) {
/* To get here, there need to be 2^(N_SORT_BUCKET) elements in
** the input list. But that is impossible.
*/
a[i] = pcacheMergeDirtyList(a[i], p);
}
}
p = a[0];
for (i = 1; i < N_SORT_BUCKET; i++) {
if (a[i] == 0) continue;
p = p ? pcacheMergeDirtyList(p, a[i]) : a[i];
}
return p;
}
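/*
** Editor's trace (not part of the original source): sorting the pages
** 3, 1, 2 with the buckets above proceeds as:
**
**   insert 3:  a[0] = (3)
**   insert 1:  a[0] busy -> merge((3),(1)) = (1,3) -> a[1], a[0] = 0
**   insert 2:  a[0] = (2)
**   final:     merge((2),(1,3)) = (1,2,3)
**
** Each occupied a[i] holds a sorted run of 2^i pages, so 32 buckets
** suffice for any page count below 2^32.
*/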
/*
** Return a list of all dirty pages in the cache, sorted by page number.
*/
PgHdr *sqlite3PcacheDirtyList(PCache *pCache) {
PgHdr *p;
for (p = pCache->pDirty; p; p = p->pDirtyNext) {
p->pDirty = p->pDirtyNext;
}
return pcacheSortDirtyList(pCache->pDirty);
}
/*
** Return the total number of references to all pages held by the cache.
**
** This is not the total number of pages referenced, but the sum of the
** reference count for all pages.
*/
int sqlite3PcacheRefCount(PCache *pCache) { return pCache->nRefSum; }
/*
** Return the number of references to the page supplied as an argument.
*/
int sqlite3PcachePageRefcount(PgHdr *p) { return p->nRef; }
/*
** Return the total number of pages in the cache.
*/
int sqlite3PcachePagecount(PCache *pCache) {
assert(pCache->pCache != 0);
return pcache2.xPagecount(pCache->pCache);
}
#ifdef SQLITE_TEST
/*
** Get the suggested cache-size value.
*/
int sqlite3PcacheGetCachesize(PCache *pCache) { return numberOfCachePages(pCache); }
#endif
/*
** Set the suggested cache-size value.
*/
void sqlite3PcacheSetCachesize(PCache *pCache, int mxPage) {
assert(pCache->pCache != 0);
pCache->szCache = mxPage;
pcache2.xCachesize(pCache->pCache, numberOfCachePages(pCache));
}
/*
** Set the suggested cache-spill value. Make no changes if the
** argument is zero. Return the effective cache-spill size, which will
** be the larger of the szSpill and szCache.
*/
int sqlite3PcacheSetSpillsize(PCache *p, int mxPage) {
int res;
assert(p->pCache != 0);
if (mxPage) {
if (mxPage < 0) {
mxPage = (int)((-1024 * (i64)mxPage) / (p->szPage + p->szExtra));
}
p->szSpill = mxPage;
}
res = numberOfCachePages(p);
if (res < p->szSpill) res = p->szSpill;
return res;
}
/*
** Free up as much memory as possible from the page cache.
*/
void sqlite3PcacheShrink(PCache *pCache) {
assert(pCache->pCache != 0);
pcache2.xShrink(pCache->pCache);
}
/*
** Return the size of the header added by this middleware layer
** in the page-cache hierarchy.
*/
int sqlite3HeaderSizePcache(void) { return ROUND8(sizeof(PgHdr)); }
/*
** Return the number of dirty pages currently in the cache, as a percentage
** of the configured cache size.
*/
int sqlite3PCachePercentDirty(PCache *pCache) {
PgHdr *pDirty;
int nDirty = 0;
int nCache = numberOfCachePages(pCache);
for (pDirty = pCache->pDirty; pDirty; pDirty = pDirty->pDirtyNext) nDirty++;
return nCache ? (int)(((i64)nDirty * 100) / nCache) : 0;
}
#ifdef SQLITE_DIRECT_OVERFLOW_READ
/*
** Return true if there are one or more dirty pages in the cache. Else false.
*/
int sqlite3PCacheIsDirty(PCache *pCache) { return (pCache->pDirty != 0); }
#endif
#if defined(SQLITE_CHECK_PAGES) || defined(SQLITE_DEBUG)
/*
** For all dirty pages currently in the cache, invoke the specified
** callback. This is only used if the SQLITE_CHECK_PAGES macro is
** defined.
*/
void sqlite3PcacheIterateDirty(PCache *pCache, void (*xIter)(PgHdr *)) {
PgHdr *pDirty;
for (pDirty = pCache->pDirty; pDirty; pDirty = pDirty->pDirtyNext) {
xIter(pDirty);
}
}
#endif
/*
** 2008 November 05
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
**
** This file implements the default page cache implementation (the
** sqlite3_pcache interface). It also contains part of the implementation
** of the SQLITE_CONFIG_PAGECACHE and sqlite3_release_memory() features.
** If the default page cache implementation is overridden, then neither of
** these two features are available.
**
** A page cache line looks like this:
**
** -------------------------------------------------------------
** | database page content | PgHdr1 | MemPage | PgHdr |
** -------------------------------------------------------------
**
** The database page content is up front (so that buffer overreads tend to
** flow harmlessly into the PgHdr1, MemPage, and PgHdr extensions). MemPage
** is the extension added by the btree.c module containing information such
** as the database page number and how that database page is used. PgHdr
** is added by the pcache.c layer and contains information used to keep track
** of which pages are "dirty". PgHdr1 is an extension added by this
** module (pcache1.c). The PgHdr1 header is a subclass of sqlite3_pcache_page.
** PgHdr1 contains information needed to look up a page by its page number.
** The superclass sqlite3_pcache_page.pBuf points to the start of the
** database page content and sqlite3_pcache_page.pExtra points to PgHdr.
**
** The size of the extension (MemPage+PgHdr+PgHdr1) can be determined at
** runtime using sqlite3_config(SQLITE_CONFIG_PCACHE_HDRSZ, &size). The
** sizes of the extensions sum to 272 bytes on x64 for 3.8.10, but this
** size can vary according to architecture, compile-time options, and
** SQLite library version number.
**
** If SQLITE_PCACHE_SEPARATE_HEADER is defined, then the extension is obtained
** using a separate memory allocation from the database page content. This
** seeks to overcome the "clownshoe" problem (also called "internal
** fragmentation" in academic literature) of allocating a few bytes more
** than a power of two with the memory allocator rounding up to the next
** power of two, and leaving the rounded-up space unused.
**
** This module tracks pointers to PgHdr1 objects. Only pcache.c communicates
** with this module. Information is passed back and forth as PgHdr1 pointers.
**
** The pcache.c and pager.c modules deal with pointers to PgHdr objects.
** The btree.c module deals with pointers to MemPage objects.
**
** SOURCE OF PAGE CACHE MEMORY:
**
** Memory for a page might come from any of three sources:
**
** (1) The general-purpose memory allocator - sqlite3Malloc()
** (2) Global page-cache memory provided using sqlite3_config() with
** SQLITE_CONFIG_PAGECACHE.
** (3) PCache-local bulk allocation.
**
** The third case is a chunk of heap memory (defaulting to 100 pages worth)
** that is allocated when the page cache is created. The size of the local
** bulk allocation can be adjusted using
**
** sqlite3_config(SQLITE_CONFIG_PAGECACHE, (void*)0, 0, N).
**
** If N is positive, then N pages worth of memory are allocated using a single
** sqlite3Malloc() call and that memory is used for the first N pages allocated.
** Or if N is negative, then -1024*N bytes of memory are allocated and used
** for as many pages as can be accommodated.
**
** Only one of (2) or (3) can be used. Once the memory available to (2) or
** (3) is exhausted, subsequent allocations fail over to the general-purpose
** memory allocator (1).
**
** Earlier versions of SQLite used only methods (1) and (2). But experiments
** show that method (3) with N==100 provides about a 5% performance boost for
** common workloads.
*/
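/*
** Editor's sketch of the configuration described above, as it works in
** stock SQLite (buffer and page counts are assumed values):
**
**   static char aBuf[100*4360];
**   sqlite3_config(SQLITE_CONFIG_PAGECACHE, aBuf, 4360, 100);    memory source (2)
**
** or, to request a 50-page local bulk allocation, source (3):
**
**   sqlite3_config(SQLITE_CONFIG_PAGECACHE, (void*)0, 0, 50);
**
** Either call must be made before sqlite3_initialize().
*/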
#include "sqliteInt.h"
typedef struct PCache1 PCache1;
typedef struct PgHdr1 PgHdr1;
typedef struct PgFreeslot PgFreeslot;
typedef struct PGroup PGroup;
/*
** Each cache entry is represented by an instance of the following
** structure. Unless SQLITE_PCACHE_SEPARATE_HEADER is defined, a buffer of
** PgHdr1.pCache->szPage bytes is allocated directly before this structure
** in memory.
**
** Note: Variables isBulkLocal and isAnchor were once type "u8". That works,
** but causes a 2-byte gap in the structure for most architectures (since
** pointers must be either 4 or 8-byte aligned). As this structure is located
** in memory directly after the associated page data, if the database is
** corrupt, code at the b-tree layer may overread the page buffer and
** read part of this structure before the corruption is detected. This
** can cause a valgrind error if the uninitialized gap is accessed. Using u16
** ensures there is no such gap, and therefore no bytes of uninitialized memory
** in the structure.
*/
struct PgHdr1 {
sqlite3_pcache_page page; /* Base class. Must be first. pBuf & pExtra */
unsigned int iKey; /* Key value (page number) */
u16 isBulkLocal; /* This page from bulk local storage */
u16 isAnchor; /* This is the PGroup.lru element */
PgHdr1 * pNext; /* Next in hash table chain */
PCache1 * pCache; /* Cache that currently owns this page */
PgHdr1 * pLruNext; /* Next in LRU list of unpinned pages */
PgHdr1 * pLruPrev; /* Previous in LRU list of unpinned pages */
/* NB: pLruPrev is only valid if pLruNext!=0 */
};
/*
** A page is pinned if it is not on the LRU list. To be "pinned" means
** that the page is in active use and must not be deallocated.
*/
#define PAGE_IS_PINNED(p) ((p)->pLruNext == 0)
#define PAGE_IS_UNPINNED(p) ((p)->pLruNext != 0)
/* Each page cache (or PCache) belongs to a PGroup. A PGroup is a set
** of one or more PCaches that are able to recycle each other's unpinned
** pages when they are under memory pressure. A PGroup is an instance of
** the following object.
**
** This page cache implementation works in one of two modes:
**
** (1) Every PCache is the sole member of its own PGroup. There is
** one PGroup per PCache.
**
** (2) There is a single global PGroup that all PCaches are a member
** of.
**
** Mode 1 uses more memory (since PCache instances are not able to rob
** unused pages from other PCaches) but it also operates without a mutex,
** and is therefore often faster. Mode 2 requires a mutex in order to be
** threadsafe, but recycles pages more efficiently.
**
** For mode (1), PGroup.mutex is NULL. For mode (2) there is only a single
** PGroup which is the pcache1.grp global variable and its mutex is
** SQLITE_MUTEX_STATIC_LRU.
*/
struct PGroup {
pthread_mutex_t mutex; /* MUTEX_STATIC_LRU or NULL */
unsigned int nMaxPage; /* Sum of nMax for purgeable caches */
unsigned int nMinPage; /* Sum of nMin for purgeable caches */
  unsigned int mxPinned; /* nMaxPage + 10 - nMinPage */
unsigned int nPurgeable; /* Number of purgeable pages allocated */
PgHdr1 lru; /* The beginning and end of the LRU list */
};
/* Each page cache is an instance of the following object. Every
** open database file (including each in-memory database and each
** temporary or transient database) has a single page cache which
** is an instance of this object.
**
** Pointers to structures of this type are cast and returned as
** opaque sqlite3_pcache* handles.
*/
struct PCache1 {
/* Cache configuration parameters. Page size (szPage) and the purgeable
** flag (bPurgeable) and the pnPurgeable pointer are all set when the
** cache is created and are never changed thereafter. nMax may be
** modified at any time by a call to the pcache1Cachesize() method.
** The PGroup mutex must be held when accessing nMax.
*/
PGroup * pGroup; /* PGroup this cache belongs to */
unsigned int *pnPurgeable; /* Pointer to pGroup->nPurgeable */
int szPage; /* Size of database content section */
int szExtra; /* sizeof(MemPage)+sizeof(PgHdr) */
int szAlloc; /* Total size of one pcache line */
int bPurgeable; /* True if cache is purgeable */
unsigned int nMin; /* Minimum number of pages reserved */
unsigned int nMax; /* Configured "cache_size" value */
unsigned int n90pct; /* nMax*9/10 */
unsigned int iMaxKey; /* Largest key seen since xTruncate() */
unsigned int nPurgeableDummy; /* pnPurgeable points here when not used*/
/* Hash table of all pages. The following variables may only be accessed
** when the accessor is holding the PGroup mutex.
*/
unsigned int nRecyclable; /* Number of pages in the LRU list */
unsigned int nPage; /* Total number of pages in apHash */
unsigned int nHash; /* Number of slots in apHash[] */
PgHdr1 ** apHash; /* Hash table for fast lookup by key */
PgHdr1 * pFree; /* List of unused pcache-local pages */
void * pBulk; /* Bulk memory used by pcache-local */
};
/*
** Free slots in the allocator used to divide up the global page cache
** buffer provided using the SQLITE_CONFIG_PAGECACHE mechanism.
*/
struct PgFreeslot {
PgFreeslot *pNext; /* Next free slot */
};
/*
** Global data used by this cache.
*/
static struct PCacheGlobal {
PGroup grp; /* The global PGroup for mode (2) */
/* Variables related to SQLITE_CONFIG_PAGECACHE settings. The
** szSlot, nSlot, pStart, pEnd, nReserve, and isInit values are all
** fixed at sqlite3_initialize() time and do not require mutex protection.
** The nFreeSlot and pFree values do require mutex protection.
*/
int isInit; /* True if initialized */
int separateCache; /* Use a new PGroup for each PCache */
int nInitPage; /* Initial bulk allocation size */
int szSlot; /* Size of each free slot */
int nSlot; /* The number of pcache slots */
int nReserve; /* Try to keep nFreeSlot above this */
void *pStart, *pEnd; /* Bounds of global page cache memory */
  /* Above requires no mutex. Use the mutex below for the variables that follow. */
pthread_mutex_t mutex; /* Mutex for accessing the following: */
PgFreeslot * pFree; /* Free page blocks */
int nFreeSlot; /* Number of unused pcache slots */
/* The following value requires a mutex to change. We skip the mutex on
** reading because (1) most platforms read a 32-bit integer atomically and
** (2) even if an incorrect value is read, no great harm is done since this
** is really just an optimization. */
int bUnderPressure; /* True if low on PAGECACHE memory */
} pcache1;
#define pcache1EnterMutex(X) pthread_mutex_lock(&((X)->mutex))
#define pcache1LeaveMutex(X) pthread_mutex_unlock(&((X)->mutex))
#define PCACHE1_MIGHT_USE_GROUP_MUTEX 1
/******************************************************************************/
/******** Page Allocation/SQLITE_CONFIG_PCACHE Related Functions **************/
/*
** This function is called during initialization if a static buffer is
** supplied to use for the page-cache by passing the SQLITE_CONFIG_PAGECACHE
** verb to sqlite3_config(). Parameter pBuf points to an allocation large
** enough to contain 'n' buffers of 'sz' bytes each.
**
** This routine is called from sqlite3_initialize() and so it is guaranteed
** to be serialized already. There is no need for further mutexing.
*/
void sqlite3PCacheBufferSetup(void *pBuf, int sz, int n) {
if (pcache1.isInit) {
PgFreeslot *p;
if (pBuf == 0) sz = n = 0;
if (n == 0) sz = 0;
sz = ROUNDDOWN8(sz);
pcache1.szSlot = sz;
pcache1.nSlot = pcache1.nFreeSlot = n;
pcache1.nReserve = n > 90 ? 10 : (n / 10 + 1);
pcache1.pStart = pBuf;
pcache1.pFree = 0;
pcache1.bUnderPressure = 0;
while (n--) {
p = (PgFreeslot *)pBuf;
p->pNext = pcache1.pFree;
pcache1.pFree = p;
pBuf = (void *)&((char *)pBuf)[sz];
}
pcache1.pEnd = pBuf;
}
}
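/*
** Editor's worked example (values assumed): sqlite3PCacheBufferSetup(pBuf,
** 4360, 3) carves the buffer into three 4360-byte slots at pBuf,
** pBuf+4360, and pBuf+8720; pcache1.pEnd becomes pBuf+13080 and the free
** list holds the slots in LIFO order: pFree -> slot2 -> slot1 -> slot0.
*/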
/*
** Try to initialize the pCache->pFree and pCache->pBulk fields. Return
** true if pCache->pFree ends up containing one or more free pages.
*/
static int pcache1InitBulk(PCache1 *pCache) {
i64 szBulk;
char *zBulk;
if (pcache1.nInitPage == 0) return 0;
  /* Do not bother with a bulk allocation if the cache size is very small */
if (pCache->nMax < 3) return 0;
// sqlite3BeginBenignMalloc();
if (pcache1.nInitPage > 0) {
szBulk = pCache->szAlloc * (i64)pcache1.nInitPage;
} else {
szBulk = -1024 * (i64)pcache1.nInitPage;
}
if (szBulk > pCache->szAlloc * (i64)pCache->nMax) {
szBulk = pCache->szAlloc * (i64)pCache->nMax;
}
zBulk = pCache->pBulk = malloc(szBulk);
// sqlite3EndBenignMalloc();
if (zBulk) {
int nBulk = szBulk / pCache->szAlloc;
do {
PgHdr1 *pX = (PgHdr1 *)(&zBulk[pCache->szPage]);
pX->page.pBuf = zBulk;
pX->page.pExtra = &pX[1];
pX->isBulkLocal = 1;
pX->isAnchor = 0;
pX->pNext = pCache->pFree;
pX->pLruPrev = 0; /* Initializing this saves a valgrind error */
pCache->pFree = pX;
zBulk += pCache->szAlloc;
} while (--nBulk);
}
return pCache->pFree != 0;
}
/*
** Malloc function used within this file to allocate space from the buffer
** configured using sqlite3_config(SQLITE_CONFIG_PAGECACHE) option. If no
** such buffer exists or there is no space left in it, this function falls
** back to sqlite3Malloc().
**
** Multiple threads can run this routine at the same time. Global variables
** in pcache1 need to be protected via mutex.
*/
static void *pcache1Alloc(int nByte) {
void *p = 0;
// assert(sqlite3_mutex_notheld(pcache1.grp.mutex));
if (nByte <= pcache1.szSlot) {
pthread_mutex_lock(&(pcache1.mutex));
p = (PgHdr1 *)pcache1.pFree;
if (p) {
pcache1.pFree = pcache1.pFree->pNext;
pcache1.nFreeSlot--;
pcache1.bUnderPressure = pcache1.nFreeSlot < pcache1.nReserve;
assert(pcache1.nFreeSlot >= 0);
// sqlite3StatusHighwater(SQLITE_STATUS_PAGECACHE_SIZE, nByte);
// sqlite3StatusUp(SQLITE_STATUS_PAGECACHE_USED, 1);
}
pthread_mutex_unlock(&pcache1.mutex);
}
if (p == 0) {
/* Memory is not available in the SQLITE_CONFIG_PAGECACHE pool. Get
** it from sqlite3Malloc instead.
*/
p = malloc(nByte);
#ifndef SQLITE_DISABLE_PAGECACHE_OVERFLOW_STATS
if (p) {
int sz = nByte;
pthread_mutex_lock(&pcache1.mutex);
// sqlite3StatusHighwater(SQLITE_STATUS_PAGECACHE_SIZE, nByte);
// sqlite3StatusUp(SQLITE_STATUS_PAGECACHE_OVERFLOW, sz);
pthread_mutex_unlock(&pcache1.mutex);
}
#endif
// sqlite3MemdebugSetType(p, MEMTYPE_PCACHE);
}
return p;
}
/*
** Free an allocated buffer obtained from pcache1Alloc().
*/
static void pcache1Free(void *p) {
if (p == 0) return;
// if (SQLITE_WITHIN(p, pcache1.pStart, pcache1.pEnd)) {
if (p >= pcache1.pStart && p < pcache1.pEnd) {
PgFreeslot *pSlot;
pthread_mutex_lock(&pcache1.mutex);
// sqlite3StatusDown(SQLITE_STATUS_PAGECACHE_USED, 1);
pSlot = (PgFreeslot *)p;
pSlot->pNext = pcache1.pFree;
pcache1.pFree = pSlot;
pcache1.nFreeSlot++;
pcache1.bUnderPressure = pcache1.nFreeSlot < pcache1.nReserve;
assert(pcache1.nFreeSlot <= pcache1.nSlot);
pthread_mutex_unlock(&pcache1.mutex);
} else {
// assert(sqlite3MemdebugHasType(p, MEMTYPE_PCACHE));
// sqlite3MemdebugSetType(p, MEMTYPE_HEAP);
#ifndef SQLITE_DISABLE_PAGECACHE_OVERFLOW_STATS
{
int nFreed = 0;
// nFreed = sqlite3MallocSize(p);
pthread_mutex_lock(&pcache1.mutex);
// sqlite3StatusDown(SQLITE_STATUS_PAGECACHE_OVERFLOW, nFreed);
pthread_mutex_unlock(&pcache1.mutex);
}
#endif
free(p);
}
}
#ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
/*
** Return the size of a pcache allocation
*/
static int pcache1MemSize(void *p) {
if (p >= pcache1.pStart && p < pcache1.pEnd) {
return pcache1.szSlot;
} else {
int iSize;
assert(sqlite3MemdebugHasType(p, MEMTYPE_PCACHE));
sqlite3MemdebugSetType(p, MEMTYPE_HEAP);
iSize = sqlite3MallocSize(p);
sqlite3MemdebugSetType(p, MEMTYPE_PCACHE);
return iSize;
}
}
#endif /* SQLITE_ENABLE_MEMORY_MANAGEMENT */
/*
** Allocate a new page object initially associated with cache pCache.
*/
static PgHdr1 *pcache1AllocPage(PCache1 *pCache, int benignMalloc) {
PgHdr1 *p = 0;
void * pPg;
// assert(sqlite3_mutex_held(pCache->pGroup->mutex));
if (pCache->pFree || (pCache->nPage == 0 && pcache1InitBulk(pCache))) {
assert(pCache->pFree != 0);
p = pCache->pFree;
pCache->pFree = p->pNext;
p->pNext = 0;
} else {
#ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
/* The group mutex must be released before pcache1Alloc() is called. This
** is because it might call sqlite3_release_memory(), which assumes that
** this mutex is not held. */
assert(pcache1.separateCache == 0);
assert(pCache->pGroup == &pcache1.grp);
pcache1LeaveMutex(pCache->pGroup);
#endif
if (benignMalloc) {
// sqlite3BeginBenignMalloc();
}
#ifdef SQLITE_PCACHE_SEPARATE_HEADER
pPg = pcache1Alloc(pCache->szPage);
p = sqlite3Malloc(sizeof(PgHdr1) + pCache->szExtra);
if (!pPg || !p) {
pcache1Free(pPg);
sqlite3_free(p);
pPg = 0;
}
#else
pPg = pcache1Alloc(pCache->szAlloc);
#endif
if (benignMalloc) {
// sqlite3EndBenignMalloc();
}
#ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
pcache1EnterMutex(pCache->pGroup);
#endif
if (pPg == 0) return 0;
#ifndef SQLITE_PCACHE_SEPARATE_HEADER
p = (PgHdr1 *)&((u8 *)pPg)[pCache->szPage];
#endif
p->page.pBuf = pPg;
p->page.pExtra = &p[1];
p->isBulkLocal = 0;
p->isAnchor = 0;
p->pLruPrev = 0; /* Initializing this saves a valgrind error */
}
(*pCache->pnPurgeable)++;
return p;
}
/*
** Free a page object allocated by pcache1AllocPage().
*/
static void pcache1FreePage(PgHdr1 *p) {
PCache1 *pCache;
assert(p != 0);
pCache = p->pCache;
// assert(sqlite3_mutex_held(p->pCache->pGroup->mutex));
if (p->isBulkLocal) {
p->pNext = pCache->pFree;
pCache->pFree = p;
} else {
pcache1Free(p->page.pBuf);
#ifdef SQLITE_PCACHE_SEPARATE_HEADER
sqlite3_free(p);
#endif
}
(*pCache->pnPurgeable)--;
}
/*
** Malloc function used by SQLite to obtain space from the buffer configured
** using sqlite3_config(SQLITE_CONFIG_PAGECACHE) option. If no such buffer
** exists, this function falls back to sqlite3Malloc().
*/
void *sqlite3PageMalloc(int sz) {
assert(sz <= 65536 + 8); /* These allocations are never very large */
return pcache1Alloc(sz);
}
/*
** Free an allocated buffer obtained from sqlite3PageMalloc().
*/
void sqlite3PageFree(void *p) { pcache1Free(p); }
/*
** Return true if it desirable to avoid allocating a new page cache
** entry.
**
** If memory was allocated specifically to the page cache using
** SQLITE_CONFIG_PAGECACHE but that memory has all been used, then
** it is desirable to avoid allocating a new page cache entry because
** presumably SQLITE_CONFIG_PAGECACHE was supposed to be sufficient
** for all page cache needs and we should not need to spill the
** allocation onto the heap.
**
** Or, if the heap is used for all page cache memory and the heap is
** under memory pressure, then again it is desirable to avoid
** allocating a new page cache entry in order to avoid stressing
** the heap even further.
*/
static int pcache1UnderMemoryPressure(PCache1 *pCache) {
// if (pcache1.nSlot && (pCache->szPage + pCache->szExtra) <= pcache1.szSlot) {
return pcache1.bUnderPressure;
// } else {
// return sqlite3HeapNearlyFull();
// }
}
/******************************************************************************/
/******** General Implementation Functions ************************************/
/*
** This function is used to resize the hash table used by the cache passed
** as the first argument.
**
** The PCache mutex must be held when this function is called.
*/
static void pcache1ResizeHash(PCache1 *p) {
PgHdr1 ** apNew;
unsigned int nNew;
unsigned int i;
// assert(sqlite3_mutex_held(p->pGroup->mutex));
nNew = p->nHash * 2;
if (nNew < 256) {
nNew = 256;
}
pcache1LeaveMutex(p->pGroup);
if (p->nHash) {
// sqlite3BeginBenignMalloc();
}
apNew = (PgHdr1 **)calloc(nNew, sizeof(PgHdr1 *));
if (p->nHash) {
// sqlite3EndBenignMalloc();
}
pcache1EnterMutex(p->pGroup);
if (apNew) {
for (i = 0; i < p->nHash; i++) {
PgHdr1 *pPage;
PgHdr1 *pNext = p->apHash[i];
while ((pPage = pNext) != 0) {
unsigned int h = pPage->iKey % nNew;
pNext = pPage->pNext;
pPage->pNext = apNew[h];
apNew[h] = pPage;
}
}
free(p->apHash);
p->apHash = apNew;
p->nHash = nNew;
}
}
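/*
** Editor's note (not part of the original source): rehashing moves each
** page to bucket iKey % nNew.  Growing from nHash==256 to nNew==512, a
** page with iKey==300 moves from bucket 300%256 == 44 to bucket
** 300%512 == 300, while a page with iKey==44 stays in bucket 44.
*/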
/*
** This function is used internally to remove the page pPage from the
** PGroup LRU list, if it is part of it. If pPage is not part of the PGroup
** LRU list, then this function is a no-op.
**
** The PGroup mutex must be held when this function is called.
*/
static PgHdr1 *pcache1PinPage(PgHdr1 *pPage) {
assert(pPage != 0);
assert(PAGE_IS_UNPINNED(pPage));
assert(pPage->pLruNext);
assert(pPage->pLruPrev);
// assert(sqlite3_mutex_held(pPage->pCache->pGroup->mutex));
pPage->pLruPrev->pLruNext = pPage->pLruNext;
pPage->pLruNext->pLruPrev = pPage->pLruPrev;
pPage->pLruNext = 0;
/* pPage->pLruPrev = 0;
** No need to clear pLruPrev as it is never accessed if pLruNext is 0 */
assert(pPage->isAnchor == 0);
assert(pPage->pCache->pGroup->lru.isAnchor == 1);
pPage->pCache->nRecyclable--;
return pPage;
}
/*
** Remove the page supplied as an argument from the hash table
** (PCache1.apHash structure) that it is currently stored in.
** Also free the page if freeFlag is true.
**
** The PGroup mutex must be held when this function is called.
*/
static void pcache1RemoveFromHash(PgHdr1 *pPage, int freeFlag) {
unsigned int h;
PCache1 * pCache = pPage->pCache;
PgHdr1 ** pp;
// assert(sqlite3_mutex_held(pCache->pGroup->mutex));
h = pPage->iKey % pCache->nHash;
for (pp = &pCache->apHash[h]; (*pp) != pPage; pp = &(*pp)->pNext)
;
*pp = (*pp)->pNext;
pCache->nPage--;
if (freeFlag) pcache1FreePage(pPage);
}
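/*
** The removal loop above uses the pointer-to-pointer unlink idiom: pp
** walks the chain holding the address of each link, so when *pp finally
** equals pPage a single assignment splices it out, whether pPage is the
** head of the chain or an interior element. A minimal standalone sketch
** of the same idiom, with hypothetical names for illustration only:
**
**   void unlink_node(Node **pp, Node *pTarget){
**     while( *pp!=pTarget ) pp = &(*pp)->pNext;
**     *pp = pTarget->pNext;
**   }
*/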
/*
** If there are currently more than nMaxPage pages allocated, try
** to recycle pages to reduce the number allocated to nMaxPage.
*/
static void pcache1EnforceMaxPage(PCache1 *pCache) {
PGroup *pGroup = pCache->pGroup;
PgHdr1 *p;
// assert(sqlite3_mutex_held(pGroup->mutex));
while (pGroup->nPurgeable > pGroup->nMaxPage && (p = pGroup->lru.pLruPrev)->isAnchor == 0) {
assert(p->pCache->pGroup == pGroup);
assert(PAGE_IS_UNPINNED(p));
pcache1PinPage(p);
pcache1RemoveFromHash(p, 1);
}
if (pCache->nPage == 0 && pCache->pBulk) {
free(pCache->pBulk);
pCache->pBulk = pCache->pFree = 0;
}
}
/*
** Discard all pages from cache pCache with a page number (key value)
** greater than or equal to iLimit. Any pinned pages that meet this
** criterion are unpinned before they are discarded.
**
** The PCache mutex must be held when this function is called.
*/
static void pcache1TruncateUnsafe(PCache1 * pCache, /* The cache to truncate */
unsigned int iLimit /* Drop pages with this pgno or larger */
) {
int nPage = 0; /* To assert pCache->nPage is correct */
unsigned int h, iStop;
// assert(sqlite3_mutex_held(pCache->pGroup->mutex));
assert(pCache->iMaxKey >= iLimit);
assert(pCache->nHash > 0);
if (pCache->iMaxKey - iLimit < pCache->nHash) {
/* If we are just shaving the last few pages off the end of the
** cache, then there is no point in scanning the entire hash table.
** Only scan those hash slots that might contain pages that need to
** be removed. */
h = iLimit % pCache->nHash;
iStop = pCache->iMaxKey % pCache->nHash;
nPage = -10; /* Disable the pCache->nPage validity check */
} else {
/* This is the general case where many pages are being removed.
** It is necessary to scan the entire hash table */
h = pCache->nHash / 2;
iStop = h - 1;
}
for (;;) {
PgHdr1 **pp;
PgHdr1 * pPage;
assert(h < pCache->nHash);
pp = &pCache->apHash[h];
while ((pPage = *pp) != 0) {
if (pPage->iKey >= iLimit) {
pCache->nPage--;
*pp = pPage->pNext;
if (PAGE_IS_UNPINNED(pPage)) pcache1PinPage(pPage);
pcache1FreePage(pPage);
} else {
pp = &pPage->pNext;
if (nPage >= 0) nPage++;
}
}
if (h == iStop) break;
h = (h + 1) % pCache->nHash;
}
assert(nPage < 0 || pCache->nPage == (unsigned)nPage);
}
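/*
** A worked example of the short-scan case above (illustrative numbers
** only): with nHash==256, iMaxKey==1000 and iLimit==990, every key in the
** range 990..1000 must hash into slots 990%256==222 through 1000%256==232,
** so only those 11 slots are scanned instead of all 256. The nPage
** validity check is disabled in that mode because slots outside the
** scanned range are never counted.
*/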
/******************************************************************************/
/******** sqlite3_pcache Methods **********************************************/
/*
** Implementation of the sqlite3_pcache.xInit method.
*/
static int pcache1Init(void *NotUsed) {
assert(pcache1.isInit == 0);
memset(&pcache1, 0, sizeof(pcache1));
// /*
// ** The pcache1.separateCache variable is true if each PCache has its own
// ** private PGroup (mode-1). pcache1.separateCache is false if the single
// ** PGroup in pcache1.grp is used for all page caches (mode-2).
// **
// ** * Always use a unified cache (mode-2) if ENABLE_MEMORY_MANAGEMENT
// **
// ** * Use a unified cache in single-threaded applications that have
// ** configured a start-time buffer for use as page-cache memory using
// ** sqlite3_config(SQLITE_CONFIG_PAGECACHE, pBuf, sz, N) with non-NULL
// ** pBuf argument.
// **
// ** * Otherwise use separate caches (mode-1)
// */
// #if defined(SQLITE_ENABLE_MEMORY_MANAGEMENT)
// pcache1.separateCache = 0;
// #elif SQLITE_THREADSAFE
// pcache1.separateCache = sqlite3GlobalConfig.pPage==0
// || sqlite3GlobalConfig.bCoreMutex>0;
// #else
// pcache1.separateCache = sqlite3GlobalConfig.pPage==0;
// #endif
pcache1.separateCache = 1;
pthread_mutex_init(&pcache1.grp.mutex, NULL);
pthread_mutex_init(&pcache1.mutex, NULL);
// if (pcache1.separateCache && sqlite3GlobalConfig.nPage != 0 && sqlite3GlobalConfig.pPage == 0) {
// pcache1.nInitPage = sqlite3GlobalConfig.nPage;
// } else {
pcache1.nInitPage = 0;
// }
pcache1.grp.mxPinned = 10;
pcache1.isInit = 1;
return 0;
}
/*
** Implementation of the sqlite3_pcache.xShutdown method.
** Note that the static mutex allocated in xInit does
** not need to be freed.
*/
static void pcache1Shutdown(void *NotUsed) {
assert(pcache1.isInit != 0);
memset(&pcache1, 0, sizeof(pcache1));
}
/* forward declaration */
static void pcache1Destroy(sqlite3_pcache *p);
/*
** Implementation of the sqlite3_pcache.xCreate method.
**
** Allocate a new cache.
*/
static sqlite3_pcache *pcache1Create(int szPage, int szExtra, int bPurgeable) {
PCache1 *pCache; /* The newly created page cache */
PGroup * pGroup; /* The group the new page cache will belong to */
int sz; /* Bytes of memory required to allocate the new cache */
assert((szPage & (szPage - 1)) == 0 && szPage >= 512 && szPage <= 65536);
assert(szExtra < 300);
sz = sizeof(PCache1) + sizeof(PGroup) * pcache1.separateCache;
pCache = (PCache1 *)calloc(1, sz);
if (pCache) {
if (pcache1.separateCache) {
pGroup = (PGroup *)&pCache[1];
pGroup->mxPinned = 10;
} else {
pGroup = &pcache1.grp;
}
pcache1EnterMutex(pGroup);
if (pGroup->lru.isAnchor == 0) {
pGroup->lru.isAnchor = 1;
pGroup->lru.pLruPrev = pGroup->lru.pLruNext = &pGroup->lru;
}
pCache->pGroup = pGroup;
pCache->szPage = szPage;
pCache->szExtra = szExtra;
pCache->szAlloc = szPage + szExtra + ROUND8(sizeof(PgHdr1));
pCache->bPurgeable = (bPurgeable ? 1 : 0);
pcache1ResizeHash(pCache);
if (bPurgeable) {
pCache->nMin = 10;
pGroup->nMinPage += pCache->nMin;
pGroup->mxPinned = pGroup->nMaxPage + 10 - pGroup->nMinPage;
pCache->pnPurgeable = &pGroup->nPurgeable;
} else {
pCache->pnPurgeable = &pCache->nPurgeableDummy;
}
pcache1LeaveMutex(pGroup);
if (pCache->nHash == 0) {
pcache1Destroy((sqlite3_pcache *)pCache);
pCache = 0;
}
}
return (sqlite3_pcache *)pCache;
}
/*
** Implementation of the sqlite3_pcache.xCachesize method.
**
** Configure the cache_size limit for a cache.
*/
static void pcache1Cachesize(sqlite3_pcache *p, int nMax) {
PCache1 *pCache = (PCache1 *)p;
u32 n;
assert(nMax >= 0);
if (pCache->bPurgeable) {
PGroup *pGroup = pCache->pGroup;
pcache1EnterMutex(pGroup);
n = (u32)nMax;
if (n > 0x7fff0000 - pGroup->nMaxPage + pCache->nMax) {
n = 0x7fff0000 - pGroup->nMaxPage + pCache->nMax;
}
pGroup->nMaxPage += (n - pCache->nMax);
pGroup->mxPinned = pGroup->nMaxPage + 10 - pGroup->nMinPage;
pCache->nMax = n;
pCache->n90pct = pCache->nMax * 9 / 10;
pcache1EnforceMaxPage(pCache);
pcache1LeaveMutex(pGroup);
}
}
/*
** Implementation of the sqlite3_pcache.xShrink method.
**
** Free up as much memory as possible.
*/
static void pcache1Shrink(sqlite3_pcache *p) {
PCache1 *pCache = (PCache1 *)p;
if (pCache->bPurgeable) {
PGroup * pGroup = pCache->pGroup;
unsigned int savedMaxPage;
pcache1EnterMutex(pGroup);
savedMaxPage = pGroup->nMaxPage;
pGroup->nMaxPage = 0;
pcache1EnforceMaxPage(pCache);
pGroup->nMaxPage = savedMaxPage;
pcache1LeaveMutex(pGroup);
}
}
/*
** Implementation of the sqlite3_pcache.xPagecount method.
*/
static int pcache1Pagecount(sqlite3_pcache *p) {
int n;
PCache1 *pCache = (PCache1 *)p;
pcache1EnterMutex(pCache->pGroup);
n = pCache->nPage;
pcache1LeaveMutex(pCache->pGroup);
return n;
}
/*
** Implement steps 3, 4, and 5 of the pcache1Fetch() algorithm described
** in the header of the pcache1Fetch() procedure.
**
** These steps are broken out into a separate procedure because they are
** usually not needed, and by avoiding the stack initialization required
** for these steps, the main pcache1Fetch() procedure can run faster.
*/
static PgHdr1 *pcache1FetchStage2(PCache1 *pCache, unsigned int iKey, int createFlag) {
unsigned int nPinned;
PGroup * pGroup = pCache->pGroup;
PgHdr1 * pPage = 0;
/* Step 3: Abort if createFlag is 1 but the cache is nearly full */
assert(pCache->nPage >= pCache->nRecyclable);
nPinned = pCache->nPage - pCache->nRecyclable;
assert(pGroup->mxPinned == pGroup->nMaxPage + 10 - pGroup->nMinPage);
assert(pCache->n90pct == pCache->nMax * 9 / 10);
if (createFlag == 1 && (nPinned >= pGroup->mxPinned || nPinned >= pCache->n90pct ||
(pcache1UnderMemoryPressure(pCache) && pCache->nRecyclable < nPinned))) {
return 0;
}
if (pCache->nPage >= pCache->nHash) pcache1ResizeHash(pCache);
assert(pCache->nHash > 0 && pCache->apHash);
/* Step 4. Try to recycle a page. */
if (pCache->bPurgeable && !pGroup->lru.pLruPrev->isAnchor &&
((pCache->nPage + 1 >= pCache->nMax) || pcache1UnderMemoryPressure(pCache))) {
PCache1 *pOther;
pPage = pGroup->lru.pLruPrev;
assert(PAGE_IS_UNPINNED(pPage));
pcache1RemoveFromHash(pPage, 0);
pcache1PinPage(pPage);
pOther = pPage->pCache;
if (pOther->szAlloc != pCache->szAlloc) {
pcache1FreePage(pPage);
pPage = 0;
} else {
pGroup->nPurgeable -= (pOther->bPurgeable - pCache->bPurgeable);
}
}
/* Step 5. If a usable page buffer has still not been found,
** attempt to allocate a new one.
*/
if (!pPage) {
pPage = pcache1AllocPage(pCache, createFlag == 1);
}
if (pPage) {
unsigned int h = iKey % pCache->nHash;
pCache->nPage++;
pPage->iKey = iKey;
pPage->pNext = pCache->apHash[h];
pPage->pCache = pCache;
pPage->pLruNext = 0;
/* pPage->pLruPrev = 0;
** No need to clear pLruPrev since it is not accessed when pLruNext==0 */
*(void **)pPage->page.pExtra = 0;
pCache->apHash[h] = pPage;
if (iKey > pCache->iMaxKey) {
pCache->iMaxKey = iKey;
}
}
return pPage;
}
/*
** Implementation of the sqlite3_pcache.xFetch method.
**
** Fetch a page by key value.
**
** Whether or not a new page may be allocated by this function depends on
** the value of the createFlag argument. 0 means do not allocate a new
** page. 1 means allocate a new page if space is easily available. 2
** means to try really hard to allocate a new page.
**
** For a non-purgeable cache (a cache used as the storage for an in-memory
** database) there is really no difference between createFlag 1 and 2. So
** the calling function (pcache.c) will never have a createFlag of 1 on
** a non-purgeable cache.
**
** There are three different approaches to obtaining space for a page,
** depending on the value of parameter createFlag (which may be 0, 1 or 2).
**
** 1. Regardless of the value of createFlag, the cache is searched for a
** copy of the requested page. If one is found, it is returned.
**
** 2. If createFlag==0 and the page is not already in the cache, NULL is
** returned.
**
** 3. If createFlag is 1, and the page is not already in the cache, then
** return NULL (do not allocate a new page) if any of the following
** conditions are true:
**
** (a) the number of pages pinned by the cache is greater than
** PCache1.nMax, or
**
** (b) the number of pages pinned by the cache is greater than
** the sum of nMax for all purgeable caches, less the sum of
** nMin for all other purgeable caches, or
**
** 4. If none of the first three conditions apply and the cache is marked
** as purgeable, and if one of the following is true:
**
** (a) The number of pages allocated for the cache is already
** PCache1.nMax, or
**
** (b) The number of pages allocated for all purgeable caches is
** already equal to or greater than the sum of nMax for all
** purgeable caches,
**
** (c) The system is under memory pressure and wants to avoid
**           unnecessary page cache entry allocations
**
** then attempt to recycle a page from the LRU list. If it is the right
** size, return the recycled buffer. Otherwise, free the buffer and
** proceed to step 5.
**
** 5. Otherwise, allocate and return a new page buffer.
**
** There are two versions of this routine. pcache1FetchWithMutex() is
** the general case. pcache1FetchNoMutex() is a faster implementation for
** the common case where pGroup->mutex is NULL. The pcache1Fetch() wrapper
** invokes the appropriate routine.
*/
static PgHdr1 *pcache1FetchNoMutex(sqlite3_pcache *p, unsigned int iKey, int createFlag) {
PCache1 *pCache = (PCache1 *)p;
PgHdr1 * pPage = 0;
/* Step 1: Search the hash table for an existing entry. */
pPage = pCache->apHash[iKey % pCache->nHash];
while (pPage && pPage->iKey != iKey) {
pPage = pPage->pNext;
}
/* Step 2: If the page was found in the hash table, then return it.
** If the page was not in the hash table and createFlag is 0, abort.
** Otherwise (page not in hash and createFlag!=0) continue with
** subsequent steps to try to create the page. */
if (pPage) {
if (PAGE_IS_UNPINNED(pPage)) {
return pcache1PinPage(pPage);
} else {
return pPage;
}
} else if (createFlag) {
/* Steps 3, 4, and 5 implemented by this subroutine */
return pcache1FetchStage2(pCache, iKey, createFlag);
} else {
return 0;
}
}
#if PCACHE1_MIGHT_USE_GROUP_MUTEX
static PgHdr1 *pcache1FetchWithMutex(sqlite3_pcache *p, unsigned int iKey, int createFlag) {
PCache1 *pCache = (PCache1 *)p;
PgHdr1 * pPage;
pcache1EnterMutex(pCache->pGroup);
pPage = pcache1FetchNoMutex(p, iKey, createFlag);
assert(pPage == 0 || pCache->iMaxKey >= iKey);
pcache1LeaveMutex(pCache->pGroup);
return pPage;
}
#endif
static sqlite3_pcache_page *pcache1Fetch(sqlite3_pcache *p, unsigned int iKey, int createFlag) {
#if PCACHE1_MIGHT_USE_GROUP_MUTEX || defined(SQLITE_DEBUG)
PCache1 *pCache = (PCache1 *)p;
#endif
assert(offsetof(PgHdr1, page) == 0);
assert(pCache->bPurgeable || createFlag != 1);
assert(pCache->bPurgeable || pCache->nMin == 0);
assert(pCache->bPurgeable == 0 || pCache->nMin == 10);
assert(pCache->nMin == 0 || pCache->bPurgeable);
assert(pCache->nHash > 0);
#if PCACHE1_MIGHT_USE_GROUP_MUTEX
  return (sqlite3_pcache_page *)pcache1FetchWithMutex(p, iKey, createFlag);
#else
  return (sqlite3_pcache_page *)pcache1FetchNoMutex(p, iKey, createFlag);
#endif
}
/*
** Implementation of the sqlite3_pcache.xUnpin method.
**
** Mark a page as unpinned (eligible for asynchronous recycling).
*/
static void pcache1Unpin(sqlite3_pcache *p, sqlite3_pcache_page *pPg, int reuseUnlikely) {
PCache1 *pCache = (PCache1 *)p;
PgHdr1 * pPage = (PgHdr1 *)pPg;
PGroup * pGroup = pCache->pGroup;
assert(pPage->pCache == pCache);
pcache1EnterMutex(pGroup);
/* It is an error to call this function if the page is already
** part of the PGroup LRU list.
*/
assert(pPage->pLruNext == 0);
assert(PAGE_IS_PINNED(pPage));
if (reuseUnlikely || pGroup->nPurgeable > pGroup->nMaxPage) {
pcache1RemoveFromHash(pPage, 1);
} else {
/* Add the page to the PGroup LRU list. */
PgHdr1 **ppFirst = &pGroup->lru.pLruNext;
pPage->pLruPrev = &pGroup->lru;
(pPage->pLruNext = *ppFirst)->pLruPrev = pPage;
*ppFirst = pPage;
pCache->nRecyclable++;
}
pcache1LeaveMutex(pCache->pGroup);
}
/*
** Implementation of the sqlite3_pcache.xRekey method.
*/
static void pcache1Rekey(sqlite3_pcache *p, sqlite3_pcache_page *pPg, unsigned int iOld, unsigned int iNew) {
PCache1 * pCache = (PCache1 *)p;
PgHdr1 * pPage = (PgHdr1 *)pPg;
PgHdr1 ** pp;
unsigned int h;
assert(pPage->iKey == iOld);
assert(pPage->pCache == pCache);
pcache1EnterMutex(pCache->pGroup);
h = iOld % pCache->nHash;
pp = &pCache->apHash[h];
while ((*pp) != pPage) {
pp = &(*pp)->pNext;
}
*pp = pPage->pNext;
h = iNew % pCache->nHash;
pPage->iKey = iNew;
pPage->pNext = pCache->apHash[h];
pCache->apHash[h] = pPage;
if (iNew > pCache->iMaxKey) {
pCache->iMaxKey = iNew;
}
pcache1LeaveMutex(pCache->pGroup);
}
/*
** Implementation of the sqlite3_pcache.xTruncate method.
**
** Discard all unpinned pages in the cache with a page number equal to
** or greater than parameter iLimit. Any pinned pages with a page number
** equal to or greater than iLimit are implicitly unpinned.
*/
static void pcache1Truncate(sqlite3_pcache *p, unsigned int iLimit) {
PCache1 *pCache = (PCache1 *)p;
pcache1EnterMutex(pCache->pGroup);
if (iLimit <= pCache->iMaxKey) {
pcache1TruncateUnsafe(pCache, iLimit);
pCache->iMaxKey = iLimit - 1;
}
pcache1LeaveMutex(pCache->pGroup);
}
/*
** Implementation of the sqlite3_pcache.xDestroy method.
**
** Destroy a cache allocated using pcache1Create().
*/
static void pcache1Destroy(sqlite3_pcache *p) {
PCache1 *pCache = (PCache1 *)p;
PGroup * pGroup = pCache->pGroup;
assert(pCache->bPurgeable || (pCache->nMax == 0 && pCache->nMin == 0));
pcache1EnterMutex(pGroup);
if (pCache->nPage) pcache1TruncateUnsafe(pCache, 0);
assert(pGroup->nMaxPage >= pCache->nMax);
pGroup->nMaxPage -= pCache->nMax;
assert(pGroup->nMinPage >= pCache->nMin);
pGroup->nMinPage -= pCache->nMin;
pGroup->mxPinned = pGroup->nMaxPage + 10 - pGroup->nMinPage;
pcache1EnforceMaxPage(pCache);
pcache1LeaveMutex(pGroup);
free(pCache->pBulk);
free(pCache->apHash);
free(pCache);
}
/*
** Return the size of the header on each page of this PCACHE implementation.
*/
int sqlite3HeaderSizePcache1(void) { return ROUND8(sizeof(PgHdr1)); }
// /*
// ** Return the global mutex used by this PCACHE implementation. The
// ** sqlite3_status() routine needs access to this mutex.
// */
// sqlite3_mutex *sqlite3Pcache1Mutex(void) { return pcache1.mutex; }
#ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
/*
** This function is called to free superfluous dynamically allocated memory
** held by the pager system. Memory in use by any SQLite pager allocated
** by the current thread may be sqlite3_free()ed.
**
** nReq is the number of bytes of memory required. Once this much has
** been released, the function returns. The return value is the total number
** of bytes of memory released.
*/
int sqlite3PcacheReleaseMemory(int nReq) {
int nFree = 0;
// assert(sqlite3_mutex_notheld(pcache1.grp.mutex));
// assert(sqlite3_mutex_notheld(pcache1.mutex));
if (sqlite3GlobalConfig.pPage == 0) {
PgHdr1 *p;
pcache1EnterMutex(&pcache1.grp);
while ((nReq < 0 || nFree < nReq) && (p = pcache1.grp.lru.pLruPrev) != 0 && p->isAnchor == 0) {
nFree += pcache1MemSize(p->page.pBuf);
#ifdef SQLITE_PCACHE_SEPARATE_HEADER
nFree += sqlite3MemSize(p);
#endif
assert(PAGE_IS_UNPINNED(p));
pcache1PinPage(p);
pcache1RemoveFromHash(p, 1);
}
pcache1LeaveMutex(&pcache1.grp);
}
return nFree;
}
#endif /* SQLITE_ENABLE_MEMORY_MANAGEMENT */
#ifdef SQLITE_TEST
/*
** This function is used by test procedures to inspect the internal state
** of the global cache.
*/
void sqlite3PcacheStats(int *pnCurrent, /* OUT: Total number of pages cached */
int *pnMax, /* OUT: Global maximum cache size */
int *pnMin, /* OUT: Sum of PCache1.nMin for purgeable caches */
int *pnRecyclable /* OUT: Total number of pages available for recycling */
) {
PgHdr1 *p;
int nRecyclable = 0;
for (p = pcache1.grp.lru.pLruNext; p && !p->isAnchor; p = p->pLruNext) {
assert(PAGE_IS_UNPINNED(p));
nRecyclable++;
}
*pnCurrent = pcache1.grp.nPurgeable;
*pnMax = (int)pcache1.grp.nMaxPage;
*pnMin = (int)pcache1.grp.nMinPage;
*pnRecyclable = nRecyclable;
}
#endif
sqlite3_pcache_methods2 pcache2 = {
1, /* iVersion */
0, /* pArg */
pcache1Init, /* xInit */
pcache1Shutdown, /* xShutdown */
pcache1Create, /* xCreate */
pcache1Cachesize, /* xCachesize */
pcache1Pagecount, /* xPagecount */
pcache1Fetch, /* xFetch */
pcache1Unpin, /* xUnpin */
pcache1Rekey, /* xRekey */
pcache1Truncate, /* xTruncate */
pcache1Destroy, /* xDestroy */
pcache1Shrink /* xShrink */
};
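/*
** A hypothetical end-to-end use of the method table above. This is a
** sketch only, not part of the original source: error handling is
** omitted, and the page size (4096), extra-byte count (0), purgeable
** flag (1) and cache_size limit (100) are made-up example values:
**
**   sqlite3_pcache *pC;
**   sqlite3_pcache_page *pPg;
**
**   pcache2.xInit(pcache2.pArg);
**   pC = pcache2.xCreate(4096, 0, 1);
**   pcache2.xCachesize(pC, 100);
**   pPg = pcache2.xFetch(pC, 1, 2);
**   if( pPg ){
**     memset(pPg->pBuf, 0, 4096);
**     pcache2.xUnpin(pC, pPg, 0);
**   }
**   pcache2.xDestroy(pC);
**   pcache2.xShutdown(pcache2.pArg);
*/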
/*
** 2010 February 1
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
**
** This file contains the implementation of a write-ahead log (WAL) used in
** "journal_mode=WAL" mode.
**
** WRITE-AHEAD LOG (WAL) FILE FORMAT
**
** A WAL file consists of a header followed by zero or more "frames".
** Each frame records the revised content of a single page from the
** database file. All changes to the database are recorded by writing
** frames into the WAL. Transactions commit when a frame is written that
** contains a commit marker. A single WAL can and usually does record
** multiple transactions. Periodically, the content of the WAL is
** transferred back into the database file in an operation called a
** "checkpoint".
**
** A single WAL file can be used multiple times. In other words, the
** WAL can fill up with frames and then be checkpointed and then new
** frames can overwrite the old ones. A WAL always grows from beginning
** toward the end. Checksums and counters attached to each frame are
** used to determine which frames within the WAL are valid and which
** are leftovers from prior checkpoints.
**
** The WAL header is 32 bytes in size and consists of the following eight
** big-endian 32-bit unsigned integer values:
**
** 0: Magic number. 0x377f0682 or 0x377f0683
** 4: File format version. Currently 3007000
** 8: Database page size. Example: 1024
** 12: Checkpoint sequence number
** 16: Salt-1, random integer incremented with each checkpoint
** 20: Salt-2, a different random integer changing with each ckpt
** 24: Checksum-1 (first part of checksum for first 24 bytes of header).
** 28: Checksum-2 (second part of checksum for first 24 bytes of header).
**
** Immediately following the wal-header are zero or more frames. Each
** frame consists of a 24-byte frame-header followed by a <page-size> bytes
** of page data. The frame-header is six big-endian 32-bit unsigned
** integer values, as follows:
**
** 0: Page number.
** 4: For commit records, the size of the database image in pages
** after the commit. For all other records, zero.
** 8: Salt-1 (copied from the header)
** 12: Salt-2 (copied from the header)
** 16: Checksum-1.
** 20: Checksum-2.
**
** A frame is considered valid if and only if the following conditions are
** true:
**
** (1) The salt-1 and salt-2 values in the frame-header match
** salt values in the wal-header
**
** (2) The checksum values in the final 8 bytes of the frame-header
**        exactly match the checksum computed consecutively over the
**        first 24 bytes of the WAL header and then, for each frame up
**        to and including the current frame, the first 8 bytes of the
**        frame-header and the frame's page content.
**
** The checksum is computed using 32-bit big-endian integers if the
** magic number in the first 4 bytes of the WAL is 0x377f0683 and it
** is computed using little-endian if the magic number is 0x377f0682.
** The checksum values are always stored in the frame header in a
** big-endian format regardless of which byte order is used to compute
** the checksum. The checksum is computed by interpreting the input as
** an even number of unsigned 32-bit integers: x[0] through x[N]. The
** algorithm used for the checksum is as follows:
**
** for i from 0 to n-1 step 2:
** s0 += x[i] + s1;
** s1 += x[i+1] + s0;
** endfor
**
** Note that s0 and s1 are both weighted checksums using fibonacci weights
** in reverse order (the largest fibonacci weight occurs on the first element
** of the sequence being summed.) The s1 value spans all 32-bit
** terms of the sequence whereas s0 omits the final term.
**
** On a checkpoint, the WAL is first VFS.xSync-ed, then valid content of the
** WAL is transferred into the database, then the database is VFS.xSync-ed.
** The VFS.xSync operations serve as write barriers - all writes launched
** before the xSync must complete before any write that launches after the
** xSync begins.
**
** After each checkpoint, the salt-1 value is incremented and the salt-2
** value is randomized. This prevents old and new frames in the WAL from
** being considered valid at the same time and being checkpointed together
** following a crash.
**
** READER ALGORITHM
**
** To read a page from the database (call it page number P), a reader
** first checks the WAL to see if it contains page P. If so, then the
** last valid instance of page P that is followed by a commit frame
** or is a commit frame itself becomes the value read. If the WAL
** contains no copies of page P that are valid and which are a commit
** frame or are followed by a commit frame, then page P is read from
** the database file.
**
** To start a read transaction, the reader records the index of the last
** valid frame in the WAL. The reader uses this recorded "mxFrame" value
** for all subsequent read operations. New transactions can be appended
** to the WAL, but as long as the reader uses its original mxFrame value
** and ignores the newly appended content, it will see a consistent snapshot
** of the database from a single point in time. This technique allows
** multiple concurrent readers to view different versions of the database
** content simultaneously.
**
** The reader algorithm in the previous paragraphs works correctly, but
** because frames for page P can appear anywhere within the WAL, the
** reader has to scan the entire WAL looking for page P frames. If the
** WAL is large (multiple megabytes is typical) that scan can be slow,
** and read performance suffers. To overcome this problem, a separate
** data structure called the wal-index is maintained to expedite the
** search for frames of a particular page.
**
** WAL-INDEX FORMAT
**
** Conceptually, the wal-index is shared memory, though VFS implementations
** might choose to implement the wal-index using a mmapped file. Because
** the wal-index is shared memory, SQLite does not support journal_mode=WAL
** on a network filesystem. All users of the database must be able to
** share memory.
**
** In the default unix and windows implementation, the wal-index is a mmapped
** file whose name is the database name with a "-shm" suffix added. For that
** reason, the wal-index is sometimes called the "shm" file.
**
** The wal-index is transient. After a crash, the wal-index can (and
** should) be reconstructed from the original WAL file. In fact, the VFS is required
** to either truncate or zero the header of the wal-index when the last
** connection to it closes. Because the wal-index is transient, it can
** use an architecture-specific format; it does not have to be cross-platform.
** Hence, unlike the database and WAL file formats which store all values
** as big endian, the wal-index can store multi-byte values in the native
** byte order of the host computer.
**
** The purpose of the wal-index is to answer this question quickly: Given
** a page number P and a maximum frame index M, return the index of the
** last frame in the wal before frame M for page P in the WAL, or return
** NULL if there are no frames for page P in the WAL prior to M.
**
** The wal-index consists of a header region, followed by one or
** more index blocks.
**
** The wal-index header contains the total number of frames within the WAL
** in the mxFrame field.
**
** Each index block except for the first contains information on
** HASHTABLE_NPAGE frames. The first index block contains information on
** HASHTABLE_NPAGE_ONE frames. The values of HASHTABLE_NPAGE_ONE and
** HASHTABLE_NPAGE are selected so that together the wal-index header and
** first index block are the same size as all other index blocks in the
** wal-index. The values are:
**
** HASHTABLE_NPAGE 4096
** HASHTABLE_NPAGE_ONE 4062
**
** Each index block contains two sections, a page-mapping that contains the
** database page number associated with each wal frame, and a hash-table
** that allows readers to query an index block for a specific page number.
** The page-mapping is an array of HASHTABLE_NPAGE (or HASHTABLE_NPAGE_ONE
** for the first index block) 32-bit page numbers. The first entry in the
** first index-block contains the database page number corresponding to the
** first frame in the WAL file. The first entry in the second index block
** in the WAL file corresponds to the (HASHTABLE_NPAGE_ONE+1)th frame in
** the log, and so on.
**
** The last index block in a wal-index usually contains less than the full
** complement of HASHTABLE_NPAGE (or HASHTABLE_NPAGE_ONE) page-numbers,
** depending on the contents of the WAL file. This does not change the
** allocated size of the page-mapping array - the page-mapping array merely
** contains unused entries.
**
** Even without using the hash table, the last frame for page P
** can be found by scanning the page-mapping sections of each index block
** starting with the last index block and moving toward the first, and
** within each index block, starting at the end and moving toward the
** beginning. The first entry that equals P corresponds to the frame
** holding the content for that page.
**
** The hash table consists of HASHTABLE_NSLOT 16-bit unsigned integers.
** HASHTABLE_NSLOT = 2*HASHTABLE_NPAGE, and there is one entry in the
** hash table for each page number in the mapping section, so the hash
** table is never more than half full. The expected number of collisions
** prior to finding a match is 1. Each entry of the hash table is a
** 1-based index of an entry in the mapping section of the same
** index block. Let K be the 1-based index of the largest entry in
** the mapping section. (For index blocks other than the last, K will
** always be exactly HASHTABLE_NPAGE (4096) and for the last index block
** K will be (mxFrame%HASHTABLE_NPAGE).) Unused slots of the hash table
** contain a value of 0.
**
** To look for page P in the hash table, first compute a hash iKey on
** P as follows:
**
** iKey = (P * 383) % HASHTABLE_NSLOT
**
** Then start scanning entries of the hash table, starting with iKey
** (wrapping around to the beginning when the end of the hash table is
** reached) until an unused hash slot is found. Let the first unused slot
** be at index iUnused. (iUnused might be less than iKey if there was
** wrap-around.) Because the hash table is never more than half full,
** the search is guaranteed to eventually hit an unused entry. Let
** iMax be the value between iKey and iUnused, closest to iUnused,
** where aHash[iMax]==P. If there is no iMax entry (if there exists
** no hash slot such that aHash[i]==P) then page P is not in the
** current index block. Otherwise the iMax-th mapping entry of the
** current index block corresponds to the last entry that references
** page P.
**
** A hash search begins with the last index block and moves toward the
** first index block, looking for entries corresponding to page P. On
** average, only two or three slots in each index block need to be
** examined in order to either find the last entry for page P, or to
** establish that no such entry exists in the block. Each index block
** holds over 4000 entries. So two or three index blocks are sufficient
** to cover a typical 10 megabyte WAL file, assuming 1K pages. 8 or 10
** comparisons (on average) suffice to either locate a frame in the
** WAL or to establish that the frame does not exist in the WAL. This
** is much faster than scanning the entire 10MB WAL.
**
** Note that entries are added in order of increasing K. Hence, one
** reader might be using some value K0 and a second reader that started
** at a later time (after additional transactions were added to the WAL
** and to the wal-index) might be using a different value K1, where K1>K0.
** Both readers can use the same hash table and mapping section to get
** the correct result. There may be entries in the hash table with
** K>K0 but to the first reader, those entries will appear to be unused
** slots in the hash table and so the first reader will get an answer as
** if no values greater than K0 had ever been inserted into the hash table
** in the first place - which is what reader one wants. Meanwhile, the
** second reader using K1 will see additional values that were inserted
** later, which is exactly what reader two wants.
**
** When a rollback occurs, the value of K is decreased. Hash table entries
** that correspond to frames greater than the new K value are removed
** from the hash table at this point.
*/
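/*
** A worked example of the hash lookup described above (illustrative
** numbers only): to find page P==1, compute iKey = (1*383)%8192 == 383
** and probe slots 383, 384, 385, ... until either a slot whose mapping
** entry equals 1 is remembered or an unused (zero) slot terminates the
** scan. Because each table has 8192 slots but at most 4096 entries, an
** unused slot is always reached.
*/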
#ifndef SQLITE_OMIT_WAL
#include "wal.h"
/*
** Trace output macros
*/
#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
int sqlite3WalTrace = 0;
# define WALTRACE(X) if(sqlite3WalTrace) sqlite3DebugPrintf X
#else
# define WALTRACE(X)
#endif
/*
** The maximum (and only) versions of the wal and wal-index formats
** that may be interpreted by this version of SQLite.
**
** If a client begins recovering a WAL file and finds that (a) the checksum
** values in the wal-header are correct and (b) the version field is not
** WAL_MAX_VERSION, recovery fails and SQLite returns SQLITE_CANTOPEN.
**
** Similarly, if a client successfully reads a wal-index header (i.e. the
** checksum test is successful) and finds that the version field is not
** WALINDEX_MAX_VERSION, then no read-transaction is opened and SQLite
** returns SQLITE_CANTOPEN.
*/
#define WAL_MAX_VERSION 3007000
#define WALINDEX_MAX_VERSION 3007000
/*
** Index numbers for various locking bytes. WAL_NREADER is the number
** of available reader locks and should be at least 3. The default
** is SQLITE_SHM_NLOCK==8 and WAL_NREADER==5.
**
** Technically, the various VFSes are free to implement these locks however
** they see fit. However, compatibility is encouraged so that VFSes can
** interoperate. The standard implementation used on both unix and windows
** is for the index number to indicate a byte offset into the
** WalCkptInfo.aLock[] array in the wal-index header. In other words, all
** locks are on the shm file. The WALINDEX_LOCK_OFFSET constant (which
** should be 120) is the location in the shm file for the first locking
** byte.
*/
#define WAL_WRITE_LOCK 0
#define WAL_ALL_BUT_WRITE 1
#define WAL_CKPT_LOCK 1
#define WAL_RECOVER_LOCK 2
#define WAL_READ_LOCK(I) (3+(I))
#define WAL_NREADER (SQLITE_SHM_NLOCK-3)
/* Object declarations */
typedef struct WalIndexHdr WalIndexHdr;
typedef struct WalIterator WalIterator;
typedef struct WalCkptInfo WalCkptInfo;
/*
** The following object holds a copy of the wal-index header content.
**
** The actual header in the wal-index consists of two copies of this
** object followed by one instance of the WalCkptInfo object.
** For all versions of SQLite through 3.10.0 and probably beyond,
** the locking bytes (WalCkptInfo.aLock) start at offset 120 and
** the total header size is 136 bytes.
**
** The szPage value can be any power of 2 between 512 and 32768, inclusive.
** Or it can be 1 to represent a 65536-byte page. The latter case was
** added in 3.7.1 when support for 64K pages was added.
*/
struct WalIndexHdr {
u32 iVersion; /* Wal-index version */
u32 unused; /* Unused (padding) field */
u32 iChange; /* Counter incremented each transaction */
u8 isInit; /* 1 when initialized */
u8 bigEndCksum; /* True if checksums in WAL are big-endian */
u16 szPage; /* Database page size in bytes. 1==64K */
u32 mxFrame; /* Index of last valid frame in the WAL */
u32 nPage; /* Size of database in pages */
u32 aFrameCksum[2]; /* Checksum of last frame in log */
u32 aSalt[2]; /* Two salt values copied from WAL header */
u32 aCksum[2]; /* Checksum over all prior fields */
};
/*
** A copy of the following object occurs in the wal-index immediately
** following the second copy of the WalIndexHdr. This object stores
** information used by checkpoint.
**
** nBackfill is the number of frames in the WAL that have been written
** back into the database. (We call the act of moving content from WAL to
** database "backfilling".) The nBackfill number is never greater than
** WalIndexHdr.mxFrame. nBackfill can only be increased by threads
** holding the WAL_CKPT_LOCK lock (which includes a recovery thread).
** However, a WAL_WRITE_LOCK thread can move the value of nBackfill from
** mxFrame back to zero when the WAL is reset.
**
** nBackfillAttempted is the largest value of nBackfill that a checkpoint
** has attempted to achieve. Normally nBackfill==nBackfillAttempted, however
** the nBackfillAttempted is set before any backfilling is done and the
** nBackfill is only set after all backfilling completes. So if a checkpoint
** crashes, nBackfillAttempted might be larger than nBackfill. The
** WalIndexHdr.mxFrame must never be less than nBackfillAttempted.
**
** The aLock[] field is a set of bytes used for locking. These bytes should
** never be read or written.
**
** There is one entry in aReadMark[] for each reader lock. If a reader
** holds read-lock K, then the value in aReadMark[K] is no greater than
** the mxFrame for that reader. The value READMARK_NOT_USED (0xffffffff)
** for any aReadMark[] means that entry is unused. aReadMark[0] is
** a special case; its value is never used and it exists as a place-holder
** to avoid having to offset aReadMark[] indexes by one. Readers holding
** WAL_READ_LOCK(0) always ignore the entire WAL and read all content
** directly from the database.
**
** The value of aReadMark[K] may only be changed by a thread that
** is holding an exclusive lock on WAL_READ_LOCK(K). Thus, the value of
** aReadMark[K] cannot be changed while a reader is using that mark
** since the reader will be holding a shared lock on WAL_READ_LOCK(K).
**
** The checkpointer may only transfer frames from WAL to database where
** the frame numbers are less than or equal to every aReadMark[] that is
** in use (that is, every aReadMark[j] for which there is a corresponding
** WAL_READ_LOCK(j)). New readers (usually) pick the aReadMark[] with the
** largest value and will increase an unused aReadMark[] to mxFrame if there
** is not already an aReadMark[] equal to mxFrame. The exception to the
** previous sentence is when nBackfill equals mxFrame (meaning that everything
** in the WAL has been backfilled into the database) then new readers
** will choose aReadMark[0] which has value 0 and hence such readers will
** get all of their content directly from the database file and ignore
** the WAL.
**
** Writers normally append new frames to the end of the WAL. However,
** if nBackfill equals mxFrame (meaning that all WAL content has been
** written back into the database) and if no readers are using the WAL
** (in other words, if there are no WAL_READ_LOCK(i) where i>0) then
** the writer will first "reset" the WAL back to the beginning and start
** writing new content beginning at frame 1.
**
** We assume that 32-bit loads are atomic and so no locks are needed in
** order to read from any aReadMark[] entries.
*/
struct WalCkptInfo {
u32 nBackfill; /* Number of WAL frames backfilled into DB */
u32 aReadMark[WAL_NREADER]; /* Reader marks */
u8 aLock[SQLITE_SHM_NLOCK]; /* Reserved space for locks */
u32 nBackfillAttempted; /* WAL frames perhaps written, or maybe not */
u32 notUsed0; /* Available for future enhancements */
};
#define READMARK_NOT_USED 0xffffffff
/*
** This is a schematic view of the complete 136-byte header of the
** wal-index file (also known as the -shm file):
**
** +-----------------------------+
** 0: | iVersion | \
** +-----------------------------+ |
** 4: | (unused padding) | |
** +-----------------------------+ |
** 8: | iChange | |
** +-------+-------+-------------+ |
** 12: | bInit | bBig | szPage | |
** +-------+-------+-------------+ |
** 16: | mxFrame | | First copy of the
** +-----------------------------+ | WalIndexHdr object
** 20: | nPage | |
** +-----------------------------+ |
** 24: | aFrameCksum | |
** | | |
** +-----------------------------+ |
** 32: | aSalt | |
** | | |
** +-----------------------------+ |
** 40: | aCksum | |
** | | /
** +-----------------------------+
** 48: | iVersion | \
** +-----------------------------+ |
** 52: | (unused padding) | |
** +-----------------------------+ |
** 56: | iChange | |
** +-------+-------+-------------+ |
** 60: | bInit | bBig | szPage | |
** +-------+-------+-------------+ | Second copy of the
** 64: | mxFrame | | WalIndexHdr
** +-----------------------------+ |
** 68: | nPage | |
** +-----------------------------+ |
** 72: | aFrameCksum | |
** | | |
** +-----------------------------+ |
** 80: | aSalt | |
** | | |
** +-----------------------------+ |
** 88: | aCksum | |
** | | /
** +-----------------------------+
** 96: | nBackfill |
** +-----------------------------+
** 100: | 5 read marks |
** | |
** | |
** | |
** | |
** +-------+-------+------+------+
** 120: | Write | Ckpt | Rcvr | Rd0 | \
** +-------+-------+------+------+ ) 8 lock bytes
** | Read1 | Read2 | Rd3 | Rd4 | /
** +-------+-------+------+------+
** 128: | nBackfillAttempted |
** +-----------------------------+
** 132: | (unused padding) |
** +-----------------------------+
*/
/* A block of WALINDEX_LOCK_RESERVED bytes beginning at
** WALINDEX_LOCK_OFFSET is reserved for locks. Since some systems
** only support mandatory file-locks, we do not read or write data
** from the region of the file on which locks are applied.
*/
#define WALINDEX_LOCK_OFFSET (sizeof(WalIndexHdr)*2+offsetof(WalCkptInfo,aLock))
#define WALINDEX_HDR_SIZE (sizeof(WalIndexHdr)*2+sizeof(WalCkptInfo))
/* Size of header before each frame in wal */
#define WAL_FRAME_HDRSIZE 24
/* Size of write ahead log header, including checksum. */
#define WAL_HDRSIZE 32
/* WAL magic value. Either this value, or the same value with the least
** significant bit also set (WAL_MAGIC | 0x00000001) is stored in 32-bit
** big-endian format in the first 4 bytes of a WAL file.
**
** If the LSB is set, then the checksums for each frame within the WAL
** file are calculated by treating all data as an array of 32-bit
** big-endian words. Otherwise, they are calculated by interpreting
** all data as 32-bit little-endian words.
*/
#define WAL_MAGIC 0x377f0682
/*
** Return the offset of frame iFrame in the write-ahead log file,
** assuming a database page size of szPage bytes. The offset returned
** is to the start of the write-ahead log frame-header.
*/
#define walFrameOffset(iFrame, szPage) ( \
WAL_HDRSIZE + ((iFrame)-1)*(i64)((szPage)+WAL_FRAME_HDRSIZE) \
)
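/* A worked example (illustrative numbers only): with szPage==1024 each
** frame occupies 24+1024==1048 bytes, so walFrameOffset(1,1024)==32,
** immediately after the 32-byte WAL header, and
** walFrameOffset(2,1024)==32+1048==1080.
*/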
/*
** An open write-ahead log file is represented by an instance of the
** following object.
*/
struct Wal {
sqlite3_vfs *pVfs; /* The VFS used to create pDbFd */
sqlite3_file *pDbFd; /* File handle for the database file */
sqlite3_file *pWalFd; /* File handle for WAL file */
u32 iCallback; /* Value to pass to log callback (or 0) */
i64 mxWalSize; /* Truncate WAL to this size upon reset */
int nWiData; /* Size of array apWiData */
int szFirstBlock; /* Size of first block written to WAL file */
volatile u32 **apWiData; /* Pointer to wal-index content in memory */
u32 szPage; /* Database page size */
i16 readLock; /* Which read lock is being held. -1 for none */
u8 syncFlags; /* Flags to use to sync header writes */
u8 exclusiveMode; /* Non-zero if connection is in exclusive mode */
u8 writeLock; /* True if in a write transaction */
u8 ckptLock; /* True if holding a checkpoint lock */
u8 readOnly; /* WAL_RDWR, WAL_RDONLY, or WAL_SHM_RDONLY */
u8 truncateOnCommit; /* True to truncate WAL file on commit */
u8 syncHeader; /* Fsync the WAL header if true */
u8 padToSectorBoundary; /* Pad transactions out to the next sector */
u8 bShmUnreliable; /* SHM content is read-only and unreliable */
WalIndexHdr hdr; /* Wal-index header for current transaction */
u32 minFrame; /* Ignore wal frames before this one */
u32 iReCksum; /* On commit, recalculate checksums from here */
const char *zWalName; /* Name of WAL file */
u32 nCkpt; /* Checkpoint sequence counter in the wal-header */
#ifdef SQLITE_DEBUG
u8 lockError; /* True if a locking error has occurred */
#endif
#ifdef SQLITE_ENABLE_SNAPSHOT
WalIndexHdr *pSnapshot; /* Start transaction here if not NULL */
#endif
#ifdef SQLITE_ENABLE_SETLK_TIMEOUT
sqlite3 *db;
#endif
};
/*
** Candidate values for Wal.exclusiveMode.
*/
#define WAL_NORMAL_MODE 0
#define WAL_EXCLUSIVE_MODE 1
#define WAL_HEAPMEMORY_MODE 2
/*
** Possible values for WAL.readOnly
*/
#define WAL_RDWR 0 /* Normal read/write connection */
#define WAL_RDONLY 1 /* The WAL file is readonly */
#define WAL_SHM_RDONLY 2 /* The SHM file is readonly */
/*
** Each page of the wal-index mapping contains a hash-table made up of
** an array of HASHTABLE_NSLOT elements of the following type.
*/
typedef u16 ht_slot;
/*
** This structure is used to implement an iterator that loops through
** all frames in the WAL in database page order. Where two or more frames
** correspond to the same database page, the iterator visits only the
** frame most recently written to the WAL (in other words, the frame with
** the largest index).
**
** The internals of this structure are only accessed by:
**
** walIteratorInit() - Create a new iterator,
** walIteratorNext() - Step an iterator,
** walIteratorFree() - Free an iterator.
**
** This functionality is used by the checkpoint code (see walCheckpoint()).
*/
struct WalIterator {
u32 iPrior; /* Last result returned from the iterator */
int nSegment; /* Number of entries in aSegment[] */
struct WalSegment {
int iNext; /* Next slot in aIndex[] not yet returned */
ht_slot *aIndex; /* i0, i1, i2... such that aPgno[iN] ascend */
u32 *aPgno; /* Array of page numbers. */
int nEntry; /* Nr. of entries in aPgno[] and aIndex[] */
int iZero; /* Frame number associated with aPgno[0] */
} aSegment[1]; /* One for every 32KB page in the wal-index */
};
/*
** Define the parameters of the hash tables in the wal-index file. There
** is a hash-table following every HASHTABLE_NPAGE page numbers in the
** wal-index.
**
** Changing any of these constants will alter the wal-index format and
** create incompatibilities.
*/
#define HASHTABLE_NPAGE 4096 /* Must be power of 2 */
#define HASHTABLE_HASH_1 383 /* Should be prime */
#define HASHTABLE_NSLOT (HASHTABLE_NPAGE*2) /* Must be a power of 2 */
/*
** The block of page numbers associated with the first hash-table in a
** wal-index is smaller than usual. This is so that there is a complete
** hash-table on each aligned 32KB page of the wal-index.
*/
#define HASHTABLE_NPAGE_ONE (HASHTABLE_NPAGE - (WALINDEX_HDR_SIZE/sizeof(u32)))
/* The wal-index is divided into pages of WALINDEX_PGSZ bytes each. */
#define WALINDEX_PGSZ ( \
sizeof(ht_slot)*HASHTABLE_NSLOT + HASHTABLE_NPAGE*sizeof(u32) \
)
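/* A sanity check on the constants above (informational only): with
** 2-byte ht_slot values, WALINDEX_PGSZ == 2*8192 + 4096*4
** == 16384 + 16384 == 32768 bytes, exactly the aligned 32KB wal-index
** page described earlier in this file.
*/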
/*
** Obtain a pointer to the iPage'th page of the wal-index. The wal-index
** is broken into pages of WALINDEX_PGSZ bytes. Wal-index pages are
** numbered from zero.
**
** If the wal-index is currently smaller than iPage pages then the size
** of the wal-index might be increased, but only if it is safe to do
** so. It is safe to enlarge the wal-index if pWal->writeLock is true
** or pWal->exclusiveMode==WAL_HEAPMEMORY_MODE.
**
** Three possible result scenarios:
**
** (1) rc==SQLITE_OK and *ppPage==Requested-Wal-Index-Page
** (2) rc>=SQLITE_ERROR and *ppPage==NULL
** (3) rc==SQLITE_OK and *ppPage==NULL // only if iPage==0
**
** Scenario (3) can only occur when pWal->writeLock is false and iPage==0
*/
static SQLITE_NOINLINE int walIndexPageRealloc(
Wal *pWal, /* The WAL context */
int iPage, /* The page we seek */
volatile u32 **ppPage /* Write the page pointer here */
){
int rc = SQLITE_OK;
/* Enlarge the pWal->apWiData[] array if required */
if( pWal->nWiData<=iPage ){
sqlite3_int64 nByte = sizeof(u32*)*(iPage+1);
volatile u32 **apNew;
apNew = (volatile u32 **)sqlite3Realloc((void *)pWal->apWiData, nByte);
if( !apNew ){
*ppPage = 0;
return SQLITE_NOMEM;
}
memset((void*)&apNew[pWal->nWiData], 0,
sizeof(u32*)*(iPage+1-pWal->nWiData));
pWal->apWiData = apNew;
pWal->nWiData = iPage+1;
}
/* Request a pointer to the required page from the VFS */
assert( pWal->apWiData[iPage]==0 );
if( pWal->exclusiveMode==WAL_HEAPMEMORY_MODE ){
pWal->apWiData[iPage] = (u32 volatile *)sqlite3MallocZero(WALINDEX_PGSZ);
if( !pWal->apWiData[iPage] ) rc = SQLITE_NOMEM;
}else{
rc = sqlite3OsShmMap(pWal->pDbFd, iPage, WALINDEX_PGSZ,
pWal->writeLock, (void volatile **)&pWal->apWiData[iPage]
);
assert( pWal->apWiData[iPage]!=0
|| rc!=SQLITE_OK
|| (pWal->writeLock==0 && iPage==0) );
testcase( pWal->apWiData[iPage]==0 && rc==SQLITE_OK );
if( rc==SQLITE_OK ){
if( iPage>0 && sqlite3FaultSim(600) ) rc = SQLITE_NOMEM;
}else if( (rc&0xff)==SQLITE_READONLY ){
pWal->readOnly |= WAL_SHM_RDONLY;
if( rc==SQLITE_READONLY ){
rc = SQLITE_OK;
}
}
}
*ppPage = pWal->apWiData[iPage];
assert( iPage==0 || *ppPage || rc!=SQLITE_OK );
return rc;
}
static int walIndexPage(
Wal *pWal, /* The WAL context */
int iPage, /* The page we seek */
volatile u32 **ppPage /* Write the page pointer here */
){
if( pWal->nWiData<=iPage || (*ppPage = pWal->apWiData[iPage])==0 ){
return walIndexPageRealloc(pWal, iPage, ppPage);
}
return SQLITE_OK;
}
/*
** Return a pointer to the WalCkptInfo structure in the wal-index.
*/
static volatile WalCkptInfo *walCkptInfo(Wal *pWal){
assert( pWal->nWiData>0 && pWal->apWiData[0] );
return (volatile WalCkptInfo*)&(pWal->apWiData[0][sizeof(WalIndexHdr)/2]);
}
/*
** Return a pointer to the WalIndexHdr structure in the wal-index.
*/
static volatile WalIndexHdr *walIndexHdr(Wal *pWal){
assert( pWal->nWiData>0 && pWal->apWiData[0] );
return (volatile WalIndexHdr*)pWal->apWiData[0];
}
/*
** The argument to this macro must be of type u32. On a little-endian
** architecture, it returns the u32 value that results from interpreting
** the 4 bytes as a big-endian value. On a big-endian architecture, it
** returns the value that would be produced by interpreting the 4 bytes
** of the input value as a little-endian integer.
*/
#define BYTESWAP32(x) ( \
(((x)&0x000000FF)<<24) + (((x)&0x0000FF00)<<8) \
+ (((x)&0x00FF0000)>>8) + (((x)&0xFF000000)>>24) \
)
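/* Example (informational only): BYTESWAP32(0x11223344)==0x44332211.
** The macro is its own inverse, so applying it twice returns the
** original value.
*/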
/*
** Generate or extend an 8 byte checksum based on the data in
** array aByte[] and the initial values of aIn[0] and aIn[1] (or
** initial values of 0 and 0 if aIn==NULL).
**
** The checksum is written back into aOut[] before returning.
**
** nByte must be a positive multiple of 8.
*/
static void walChecksumBytes(
int nativeCksum, /* True for native byte-order, false for non-native */
u8 *a, /* Content to be checksummed */
int nByte, /* Bytes of content in a[]. Must be a multiple of 8. */
const u32 *aIn, /* Initial checksum value input */
u32 *aOut /* OUT: Final checksum value output */
){
u32 s1, s2;
u32 *aData = (u32 *)a;
u32 *aEnd = (u32 *)&a[nByte];
if( aIn ){
s1 = aIn[0];
s2 = aIn[1];
}else{
s1 = s2 = 0;
}
assert( nByte>=8 );
assert( (nByte&0x00000007)==0 );
assert( nByte<=65536 );
if( nativeCksum ){
do {
s1 += *aData++ + s2;
s2 += *aData++ + s1;
}while( aData<aEnd );
}else{
do {
s1 += BYTESWAP32(aData[0]) + s2;
s2 += BYTESWAP32(aData[1]) + s1;
aData += 2;
}while( aData<aEnd );
}
aOut[0] = s1;
aOut[1] = s2;
}
/*
** If there is the possibility of concurrent access to the SHM file
** from multiple threads and/or processes, then do a memory barrier.
*/
static void walShmBarrier(Wal *pWal){
if( pWal->exclusiveMode!=WAL_HEAPMEMORY_MODE ){
sqlite3OsShmBarrier(pWal->pDbFd);
}
}
/*
** Add the SQLITE_NO_TSAN as part of the return-type of a function
** definition as a hint that the function contains constructs that
** might give false-positive TSAN warnings.
**
** See tag-20200519-1.
*/
#if defined(__clang__) && !defined(SQLITE_NO_TSAN)
# define SQLITE_NO_TSAN __attribute__((no_sanitize_thread))
#else
# define SQLITE_NO_TSAN
#endif
/*
** Write the header information in pWal->hdr into the wal-index.
**
** The checksum on pWal->hdr is updated before it is written.
*/
static SQLITE_NO_TSAN void walIndexWriteHdr(Wal *pWal){
volatile WalIndexHdr *aHdr = walIndexHdr(pWal);
const int nCksum = offsetof(WalIndexHdr, aCksum);
assert( pWal->writeLock );
pWal->hdr.isInit = 1;
pWal->hdr.iVersion = WALINDEX_MAX_VERSION;
walChecksumBytes(1, (u8*)&pWal->hdr, nCksum, 0, pWal->hdr.aCksum);
/* Possible TSAN false-positive. See tag-20200519-1 */
memcpy((void*)&aHdr[1], (const void*)&pWal->hdr, sizeof(WalIndexHdr));
walShmBarrier(pWal);
memcpy((void*)&aHdr[0], (const void*)&pWal->hdr, sizeof(WalIndexHdr));
}
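/*
** The matching read path is not shown in this file; the following is a
** hedged sketch of the counterpart logic. A reader copies aHdr[0] first
** and aHdr[1] second, the reverse of the write order above, with the
** same barrier in between, and retries when the copies differ, since a
** mismatch means a writer was caught between the two memcpy() calls in
** walIndexWriteHdr() (the real reader also verifies aCksum):
**
**   WalIndexHdr h1, h2;
**   memcpy(&h1, (const void*)&aHdr[0], sizeof(h1));
**   walShmBarrier(pWal);
**   memcpy(&h2, (const void*)&aHdr[1], sizeof(h2));
**   if( memcmp(&h1, &h2, sizeof(h1))!=0 ) return 1;
*/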
/*
** This function encodes a single frame header and writes it to a buffer
** supplied by the caller. A frame-header is made up of a series of
** 4-byte big-endian integers, as follows:
**
** 0: Page number.
** 4: For commit records, the size of the database image in pages
** after the commit. For all other records, zero.
** 8: Salt-1 (copied from the wal-header)
** 12: Salt-2 (copied from the wal-header)
** 16: Checksum-1.
** 20: Checksum-2.
*/
static void walEncodeFrame(
Wal *pWal, /* The write-ahead log */
u32 iPage, /* Database page number for frame */
u32 nTruncate, /* New db size (or 0 for non-commit frames) */
u8 *aData, /* Pointer to page data */
u8 *aFrame /* OUT: Write encoded frame here */
){
int nativeCksum; /* True for native byte-order checksums */
u32 *aCksum = pWal->hdr.aFrameCksum;
assert( WAL_FRAME_HDRSIZE==24 );
sqlite3Put4byte(&aFrame[0], iPage);
sqlite3Put4byte(&aFrame[4], nTruncate);
if( pWal->iReCksum==0 ){
memcpy(&aFrame[8], pWal->hdr.aSalt, 8);
nativeCksum = (pWal->hdr.bigEndCksum==SQLITE_BIGENDIAN);
walChecksumBytes(nativeCksum, aFrame, 8, aCksum, aCksum);
walChecksumBytes(nativeCksum, aData, pWal->szPage, aCksum, aCksum);
sqlite3Put4byte(&aFrame[16], aCksum[0]);
sqlite3Put4byte(&aFrame[20], aCksum[1]);
}else{
memset(&aFrame[8], 0, 16);
}
}
/*
** Check to see if the frame with header in aFrame[] and content
** in aData[] is valid. If it is a valid frame, fill *piPage and
** *pnTruncate and return true. Return false if the frame is not valid.
*/
static int walDecodeFrame(
Wal *pWal, /* The write-ahead log */
u32 *piPage, /* OUT: Database page number for frame */
u32 *pnTruncate, /* OUT: New db size (or 0 if not commit) */
u8 *aData, /* Pointer to page data (for checksum) */
u8 *aFrame /* Frame data */
){
int nativeCksum; /* True for native byte-order checksums */
u32 *aCksum = pWal->hdr.aFrameCksum;
u32 pgno; /* Page number of the frame */
assert( WAL_FRAME_HDRSIZE==24 );
/* A frame is only valid if the salt values in the frame-header
** match the salt values in the wal-header.
*/
if( memcmp(&pWal->hdr.aSalt, &aFrame[8], 8)!=0 ){
return 0;
}
  /* A frame is only valid if the page number is greater than zero.
*/
pgno = sqlite3Get4byte(&aFrame[0]);
if( pgno==0 ){
return 0;
}
/* A frame is only valid if a checksum of the WAL header,
  ** all prior frames, the first 8 bytes of this frame-header,
** and the frame-data matches the checksum in the last 8
** bytes of this frame-header.
*/
nativeCksum = (pWal->hdr.bigEndCksum==SQLITE_BIGENDIAN);
walChecksumBytes(nativeCksum, aFrame, 8, aCksum, aCksum);
walChecksumBytes(nativeCksum, aData, pWal->szPage, aCksum, aCksum);
if( aCksum[0]!=sqlite3Get4byte(&aFrame[16])
|| aCksum[1]!=sqlite3Get4byte(&aFrame[20])
){
/* Checksum failed. */
return 0;
}
/* If we reach this point, the frame is valid. Return the page number
** and the new database size.
*/
*piPage = pgno;
*pnTruncate = sqlite3Get4byte(&aFrame[4]);
return 1;
}
#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
/*
** Names of locks. This routine is used to provide debugging output and is not
** a part of an ordinary build.
*/
static const char *walLockName(int lockIdx){
if( lockIdx==WAL_WRITE_LOCK ){
return "WRITE-LOCK";
}else if( lockIdx==WAL_CKPT_LOCK ){
return "CKPT-LOCK";
}else if( lockIdx==WAL_RECOVER_LOCK ){
return "RECOVER-LOCK";
}else{
static char zName[15];
sqlite3_snprintf(sizeof(zName), zName, "READ-LOCK[%d]",
lockIdx-WAL_READ_LOCK(0));
return zName;
}
}
#endif /*defined(SQLITE_TEST) && defined(SQLITE_DEBUG) */
/*
** Set or release locks on the WAL. Locks are either shared or exclusive.
** A lock cannot be moved directly between shared and exclusive - it must go
** through the unlocked state first.
**
** In locking_mode=EXCLUSIVE, all of these routines become no-ops.
*/
static int walLockShared(Wal *pWal, int lockIdx){
int rc;
if( pWal->exclusiveMode ) return SQLITE_OK;
rc = sqlite3OsShmLock(pWal->pDbFd, lockIdx, 1,
SQLITE_SHM_LOCK | SQLITE_SHM_SHARED);
WALTRACE(("WAL%p: acquire SHARED-%s %s\n", pWal,
walLockName(lockIdx), rc ? "failed" : "ok"));
VVA_ONLY( pWal->lockError = (u8)(rc!=SQLITE_OK && (rc&0xFF)!=SQLITE_BUSY); )
return rc;
}
static void walUnlockShared(Wal *pWal, int lockIdx){
if( pWal->exclusiveMode ) return;
(void)sqlite3OsShmLock(pWal->pDbFd, lockIdx, 1,
SQLITE_SHM_UNLOCK | SQLITE_SHM_SHARED);
WALTRACE(("WAL%p: release SHARED-%s\n", pWal, walLockName(lockIdx)));
}
static int walLockExclusive(Wal *pWal, int lockIdx, int n){
int rc;
if( pWal->exclusiveMode ) return SQLITE_OK;
rc = sqlite3OsShmLock(pWal->pDbFd, lockIdx, n,
SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE);
WALTRACE(("WAL%p: acquire EXCLUSIVE-%s cnt=%d %s\n", pWal,
walLockName(lockIdx), n, rc ? "failed" : "ok"));
VVA_ONLY( pWal->lockError = (u8)(rc!=SQLITE_OK && (rc&0xFF)!=SQLITE_BUSY); )
return rc;
}
static void walUnlockExclusive(Wal *pWal, int lockIdx, int n){
if( pWal->exclusiveMode ) return;
(void)sqlite3OsShmLock(pWal->pDbFd, lockIdx, n,
SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE);
WALTRACE(("WAL%p: release EXCLUSIVE-%s cnt=%d\n", pWal,
walLockName(lockIdx), n));
}
/*
** Compute a hash on a page number. The resulting hash value must land
** between 0 and (HASHTABLE_NSLOT-1). The walHashNext() function advances
** the hash to the next value in the event of a collision.
*/
static int walHash(u32 iPage){
assert( iPage>0 );
assert( (HASHTABLE_NSLOT & (HASHTABLE_NSLOT-1))==0 );
return (iPage*HASHTABLE_HASH_1) & (HASHTABLE_NSLOT-1);
}
static int walNextHash(int iPriorHash){
return (iPriorHash+1)&(HASHTABLE_NSLOT-1);
}
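/*
** A typical probe loop built from the two functions above (a hypothetical
** sketch; sLoc is a WalHashLoc as defined below, and the real lookup also
** filters candidate frames against the current read snapshot). The last
** match wins, yielding the most recent frame for page P in this block:
**
**   int iH;
**   u32 iFrame = 0;
**   for(iH=walHash(P); sLoc.aHash[iH]; iH=walNextHash(iH)){
**     if( sLoc.aPgno[sLoc.aHash[iH]-1]==P ){
**       iFrame = sLoc.iZero + sLoc.aHash[iH];
**     }
**   }
*/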
/*
** An instance of the WalHashLoc object is used to describe the location
** of a page hash table in the wal-index. This becomes the return value
** from walHashGet().
*/
typedef struct WalHashLoc WalHashLoc;
struct WalHashLoc {
volatile ht_slot *aHash; /* Start of the wal-index hash table */
  volatile u32 *aPgno;        /* aPgno[0] is the page of first frame indexed */
  u32 iZero;                  /* One less than the frame number of first frame indexed */
};
/*
** Return pointers to the hash table and page number array stored on
** page iHash of the wal-index. The wal-index is broken into 32KB pages
** numbered starting from 0.
**
** Set output variable pLoc->aHash to point to the start of the hash table
** in the wal-index file. Set pLoc->iZero to one less than the frame
** number of the first frame indexed by this hash table. If a
** slot in the hash table is set to N, it refers to frame number
** (pLoc->iZero+N) in the log.
**
** Finally, set pLoc->aPgno so that pLoc->aPgno[0] is the page number of the
** first frame indexed by the hash table, frame (pLoc->iZero+1).
*/
static int walHashGet(
Wal *pWal, /* WAL handle */
int iHash, /* Find the iHash'th table */
WalHashLoc *pLoc /* OUT: Hash table location */
){
int rc; /* Return code */
rc = walIndexPage(pWal, iHash, &pLoc->aPgno);
assert( rc==SQLITE_OK || iHash>0 );
if( pLoc->aPgno ){
pLoc->aHash = (volatile ht_slot *)&pLoc->aPgno[HASHTABLE_NPAGE];
if( iHash==0 ){
pLoc->aPgno = &pLoc->aPgno[WALINDEX_HDR_SIZE/sizeof(u32)];
pLoc->iZero = 0;
}else{
pLoc->iZero = HASHTABLE_NPAGE_ONE + (iHash-1)*HASHTABLE_NPAGE;
}
}else if( NEVER(rc==SQLITE_OK) ){
rc = SQLITE_ERROR;
}
return rc;
}
/*
** Return the number of the wal-index page that contains the hash-table
** and page-number array that contain entries corresponding to WAL frame
** iFrame. The wal-index is broken up into 32KB pages. Wal-index pages
** are numbered starting from 0.
*/
static int walFramePage(u32 iFrame){
int iHash = (iFrame+HASHTABLE_NPAGE-HASHTABLE_NPAGE_ONE-1) / HASHTABLE_NPAGE;
assert( (iHash==0 || iFrame>HASHTABLE_NPAGE_ONE)
&& (iHash>=1 || iFrame<=HASHTABLE_NPAGE_ONE)
&& (iHash<=1 || iFrame>(HASHTABLE_NPAGE_ONE+HASHTABLE_NPAGE))
&& (iHash>=2 || iFrame<=HASHTABLE_NPAGE_ONE+HASHTABLE_NPAGE)
&& (iHash<=2 || iFrame>(HASHTABLE_NPAGE_ONE+2*HASHTABLE_NPAGE))
);
assert( iHash>=0 );
return iHash;
}
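/*
** A worked example, using the standard constants HASHTABLE_NPAGE_ONE==4062
** and HASHTABLE_NPAGE==4096:
**
**     walFramePage(1)    == 0      frames 1..4062 use hash table 0
**     walFramePage(4062) == 0
**     walFramePage(4063) == 1      frames 4063..8158 use hash table 1
**     walFramePage(8159) == 2      and so forth
**
** The first hash table covers fewer frames than the others because the
** wal-index header occupies the start of wal-index page 0.
*/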
/*
** Return the page number associated with frame iFrame in this WAL.
*/
static u32 walFramePgno(Wal *pWal, u32 iFrame){
int iHash = walFramePage(iFrame);
if( iHash==0 ){
return pWal->apWiData[0][WALINDEX_HDR_SIZE/sizeof(u32) + iFrame - 1];
}
return pWal->apWiData[iHash][(iFrame-1-HASHTABLE_NPAGE_ONE)%HASHTABLE_NPAGE];
}
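/*
** Continuing the example above: the page number for frame 1 is read from
** wal-index page 0 at 32-bit offset WALINDEX_HDR_SIZE/sizeof(u32),
** immediately after the wal-index header, while the page number for
** frame 4063 (the first frame of hash table 1) is read from
** apWiData[1][(4063-1-4062)%4096], which is apWiData[1][0].
*/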
/*
** Remove entries from the hash table that point to WAL slots greater
** than pWal->hdr.mxFrame.
**
** This function is called whenever pWal->hdr.mxFrame is decreased due
** to a rollback or savepoint.
**
** At most only the hash table containing pWal->hdr.mxFrame needs to be
** updated. Any later hash tables will be automatically cleared when
** pWal->hdr.mxFrame advances to the point where those hash tables are
** actually needed.
*/
static void walCleanupHash(Wal *pWal){
WalHashLoc sLoc; /* Hash table location */
int iLimit = 0; /* Zero values greater than this */
int nByte; /* Number of bytes to zero in aPgno[] */
int i; /* Used to iterate through aHash[] */
assert( pWal->writeLock );
testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE_ONE-1 );
testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE_ONE );
testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE_ONE+1 );
if( pWal->hdr.mxFrame==0 ) return;
/* Obtain pointers to the hash-table and page-number array containing
** the entry that corresponds to frame pWal->hdr.mxFrame. It is guaranteed
  ** that the page on which said hash-table and array reside is already mapped.(1)
*/
assert( pWal->nWiData>walFramePage(pWal->hdr.mxFrame) );
assert( pWal->apWiData[walFramePage(pWal->hdr.mxFrame)] );
i = walHashGet(pWal, walFramePage(pWal->hdr.mxFrame), &sLoc);
if( NEVER(i) ) return; /* Defense-in-depth, in case (1) above is wrong */
/* Zero all hash-table entries that correspond to frame numbers greater
** than pWal->hdr.mxFrame.
*/
iLimit = pWal->hdr.mxFrame - sLoc.iZero;
assert( iLimit>0 );
for(i=0; i<HASHTABLE_NSLOT; i++){
if( sLoc.aHash[i]>iLimit ){
sLoc.aHash[i] = 0;
}
}
/* Zero the entries in the aPgno array that correspond to frames with
** frame numbers greater than pWal->hdr.mxFrame.
*/
nByte = (int)((char *)sLoc.aHash - (char *)&sLoc.aPgno[iLimit]);
assert( nByte>=0 );
memset((void *)&sLoc.aPgno[iLimit], 0, nByte);
#ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT
  /* Verify that every entry in the mapping region is still reachable
** via the hash table even after the cleanup.
*/
if( iLimit ){
int j; /* Loop counter */
int iKey; /* Hash key */
for(j=0; j<iLimit; j++){
for(iKey=walHash(sLoc.aPgno[j]);sLoc.aHash[iKey];iKey=walNextHash(iKey)){
if( sLoc.aHash[iKey]==j+1 ) break;
}
assert( sLoc.aHash[iKey]==j+1 );
}
}
#endif /* SQLITE_ENABLE_EXPENSIVE_ASSERT */
}
/*
** Set an entry in the wal-index that will map database page number
** pPage into WAL frame iFrame.
*/
static int walIndexAppend(Wal *pWal, u32 iFrame, u32 iPage){
int rc; /* Return code */
WalHashLoc sLoc; /* Wal-index hash table location */
rc = walHashGet(pWal, walFramePage(iFrame), &sLoc);
/* Assuming the wal-index file was successfully mapped, populate the
** page number array and hash table entry.
*/
if( rc==SQLITE_OK ){
int iKey; /* Hash table key */
int idx; /* Value to write to hash-table slot */
int nCollide; /* Number of hash collisions */
idx = iFrame - sLoc.iZero;
assert( idx <= HASHTABLE_NSLOT/2 + 1 );
/* If this is the first entry to be added to this hash-table, zero the
** entire hash table and aPgno[] array before proceeding.
*/
if( idx==1 ){
int nByte = (int)((u8*)&sLoc.aHash[HASHTABLE_NSLOT] - (u8*)sLoc.aPgno);
assert( nByte>=0 );
memset((void*)sLoc.aPgno, 0, nByte);
}
/* If the entry in aPgno[] is already set, then the previous writer
** must have exited unexpectedly in the middle of a transaction (after
** writing one or more dirty pages to the WAL to free up memory).
    ** Remove the remnants of that writer's uncommitted transaction from
** the hash-table before writing any new entries.
*/
if( sLoc.aPgno[idx-1] ){
walCleanupHash(pWal);
assert( !sLoc.aPgno[idx-1] );
}
/* Write the aPgno[] array entry and the hash-table slot. */
nCollide = idx;
for(iKey=walHash(iPage); sLoc.aHash[iKey]; iKey=walNextHash(iKey)){
if( (nCollide--)==0 ) return SQLITE_CORRUPT_BKPT;
}
sLoc.aPgno[idx-1] = iPage;
AtomicStore(&sLoc.aHash[iKey], (ht_slot)idx);
#ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT
/* Verify that the number of entries in the hash table exactly equals
** the number of entries in the mapping region.
*/
{
int i; /* Loop counter */
int nEntry = 0; /* Number of entries in the hash table */
for(i=0; i<HASHTABLE_NSLOT; i++){ if( sLoc.aHash[i] ) nEntry++; }
assert( nEntry==idx );
}
    /* Verify that every entry in the mapping region is reachable
** via the hash table. This turns out to be a really, really expensive
** thing to check, so only do this occasionally - not on every
** iteration.
*/
if( (idx&0x3ff)==0 ){
int i; /* Loop counter */
for(i=0; i<idx; i++){
for(iKey=walHash(sLoc.aPgno[i]);
sLoc.aHash[iKey];
iKey=walNextHash(iKey)){
if( sLoc.aHash[iKey]==i+1 ) break;
}
assert( sLoc.aHash[iKey]==i+1 );
}
}
#endif /* SQLITE_ENABLE_EXPENSIVE_ASSERT */
}
return rc;
}
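/*
** To make the indexing above concrete: when frame 4063 (the first frame
** covered by hash table 1) contains database page 7, walHashGet() yields
** sLoc.iZero==4062, so idx==1, the table is zeroed, sLoc.aPgno[0] is set
** to 7, and hash slot walHash(7)==(7*383)&8191==2681 receives the value 1.
** The nCollide counter bounds the probe loop: a well-formed table holds
** only idx-1 prior entries, so encountering more than idx occupied slots
** along one probe sequence can only mean the hash table is corrupt.
*/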
/*
** Recover the wal-index by reading the write-ahead log file.
**
** This routine first tries to establish an exclusive lock on the
** wal-index to prevent other threads/processes from doing anything
** with the WAL or wal-index while recovery is running. The
** WAL_RECOVER_LOCK is also held so that other threads will know
** that this thread is running recovery. If unable to establish
** the necessary locks, this routine returns SQLITE_BUSY.
*/
static int walIndexRecover(Wal *pWal){
int rc; /* Return Code */
i64 nSize; /* Size of log file */
u32 aFrameCksum[2] = {0, 0};
int iLock; /* Lock offset to lock for checkpoint */
/* Obtain an exclusive lock on all byte in the locking range not already
** locked by the caller. The caller is guaranteed to have locked the
** WAL_WRITE_LOCK byte, and may have also locked the WAL_CKPT_LOCK byte.
** If successful, the same bytes that are locked here are unlocked before
** this function returns.
*/
assert( pWal->ckptLock==1 || pWal->ckptLock==0 );
assert( WAL_ALL_BUT_WRITE==WAL_WRITE_LOCK+1 );
assert( WAL_CKPT_LOCK==WAL_ALL_BUT_WRITE );
assert( pWal->writeLock );
iLock = WAL_ALL_BUT_WRITE + pWal->ckptLock;
rc = walLockExclusive(pWal, iLock, WAL_READ_LOCK(0)-iLock);
if( rc ){
return rc;
}
WALTRACE(("WAL%p: recovery begin...\n", pWal));
memset(&pWal->hdr, 0, sizeof(WalIndexHdr));
rc = sqlite3OsFileSize(pWal->pWalFd, &nSize);
if( rc!=SQLITE_OK ){
goto recovery_error;
}
if( nSize>WAL_HDRSIZE ){
u8 aBuf[WAL_HDRSIZE]; /* Buffer to load WAL header into */
u32 *aPrivate = 0; /* Heap copy of *-shm hash being populated */
u8 *aFrame = 0; /* Malloc'd buffer to load entire frame */
int szFrame; /* Number of bytes in buffer aFrame[] */
u8 *aData; /* Pointer to data part of aFrame buffer */
int szPage; /* Page size according to the log */
u32 magic; /* Magic value read from WAL header */
    u32 version;                /* Version number read from WAL header */
int isValid; /* True if this frame is valid */
u32 iPg; /* Current 32KB wal-index page */
u32 iLastFrame; /* Last frame in wal, based on nSize alone */
/* Read in the WAL header. */
rc = sqlite3OsRead(pWal->pWalFd, aBuf, WAL_HDRSIZE, 0);
if( rc!=SQLITE_OK ){
goto recovery_error;
}
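    /* For reference, the 32-byte WAL header just read into aBuf[] is
    ** laid out as follows (all values big-endian):
    **
    **      0-3   Magic number (0x377f0682 or 0x377f0683; the low bit
    **            selects big-endian frame checksums)
    **      4-7   File format version (currently 3007000)
    **      8-11  Database page size
    **     12-15  Checkpoint sequence number
    **     16-23  Salt values
    **     24-31  Header checksum
    **
    ** The code below decodes and validates these fields in order.
    */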
/* If the database page size is not a power of two, or is greater than
** SQLITE_MAX_PAGE_SIZE, conclude that the WAL file contains no valid
** data. Similarly, if the 'magic' value is invalid, ignore the whole
** WAL file.
*/
magic = sqlite3Get4byte(&aBuf[0]);
szPage = sqlite3Get4byte(&aBuf[8]);
if( (magic&0xFFFFFFFE)!=WAL_MAGIC
|| szPage&(szPage-1)
|| szPage>SQLITE_MAX_PAGE_SIZE
|| szPage<512
){
goto finished;
}
pWal->hdr.bigEndCksum = (u8)(magic&0x00000001);
pWal->szPage = szPage;
pWal->nCkpt = sqlite3Get4byte(&aBuf[12]);
memcpy(&pWal->hdr.aSalt, &aBuf[16], 8);
/* Verify that the WAL header checksum is correct */
walChecksumBytes(pWal->hdr.bigEndCksum==SQLITE_BIGENDIAN,
aBuf, WAL_HDRSIZE-2*4, 0, pWal->hdr.aFrameCksum
);
if( pWal->hdr.aFrameCksum[0]!=sqlite3Get4byte(&aBuf[24])
|| pWal->hdr.aFrameCksum[1]!=sqlite3Get4byte(&aBuf[28])
){
goto finished;
}
    /* Verify that the version number of the WAL format is one that
    ** we are able to understand */
version = sqlite3Get4byte(&aBuf[4]);
if( version!=WAL_MAX_VERSION ){
rc = SQLITE_CANTOPEN_BKPT;
goto finished;
}
/* Malloc a buffer to read frames into. */
szFrame = szPage + WAL_FRAME_HDRSIZE;
aFrame = (u8 *)sqlite3_malloc64(szFrame + WALINDEX_PGSZ);
if( !aFrame ){
rc = SQLITE_NOMEM;
goto recovery_error;
}
aData = &aFrame[WAL_FRAME_HDRSIZE];
aPrivate = (u32*)&aData[szPage];
/* Read all frames from the log file. */
iLastFrame = (nSize - WAL_HDRSIZE) / szFrame;
for(iPg=0; iPg<=(u32)walFramePage(iLastFrame); iPg++){
u32 *aShare;
u32 iFrame; /* Index of last frame read */
u32 iLast = MIN(iLastFrame, HASHTABLE_NPAGE_ONE+iPg*HASHTABLE_NPAGE);
u32 iFirst = 1 + (iPg==0?0:HASHTABLE_NPAGE_ONE+(iPg-1)*HASHTABLE_NPAGE);
u32 nHdr, nHdr32;
rc = walIndexPage(pWal, iPg, (volatile u32**)&aShare);
assert( aShare!=0 || rc!=SQLITE_OK );
if( aShare==0 ) break;
pWal->apWiData[iPg] = aPrivate;
for(iFrame=iFirst; iFrame<=iLast; iFrame++){
i64 iOffset = walFrameOffset(iFrame, szPage);
u32 pgno; /* Database page number for frame */
u32 nTruncate; /* dbsize field from frame header */
/* Read and decode the next log frame. */
rc = sqlite3OsRead(pWal->pWalFd, aFrame, szFrame, iOffset);
if( rc!=SQLITE_OK ) break;
isValid = walDecodeFrame(pWal, &pgno, &nTruncate, aData, aFrame);
if( !isValid ) break;
rc = walIndexAppend(pWal, iFrame, pgno);
if( NEVER(rc!=SQLITE_OK) ) break;
/* If nTruncate is non-zero, this is a commit record. */
if( nTruncate ){
pWal->hdr.mxFrame = iFrame;
pWal->hdr.nPage = nTruncate;
pWal->hdr.szPage = (u16)((szPage&0xff00) | (szPage>>16));
testcase( szPage<=32768 );
testcase( szPage>=65536 );
aFrameCksum[0] = pWal->hdr.aFrameCksum[0];
aFrameCksum[1] = pWal->hdr.aFrameCksum[1];
}
}
pWal->apWiData[iPg] = aShare;
nHdr = (iPg==0 ? WALINDEX_HDR_SIZE : 0);
nHdr32 = nHdr / sizeof(u32);
#ifndef SQLITE_SAFER_WALINDEX_RECOVERY
/* Memcpy() should work fine here, on all reasonable implementations.
** Technically, memcpy() might change the destination to some
** intermediate value before setting to the final value, and that might
** cause a concurrent reader to malfunction. Memcpy() is allowed to
** do that, according to the spec, but no memcpy() implementation that
** we know of actually does that, which is why we say that memcpy()
** is safe for this. Memcpy() is certainly a lot faster.
*/
memcpy(&aShare[nHdr32], &aPrivate[nHdr32], WALINDEX_PGSZ-nHdr);
#else
/* In the event that some platform is found for which memcpy()
** changes the destination to some intermediate value before
** setting the final value, this alternative copy routine is
** provided.
*/
{
int i;
for(i=nHdr32; i<WALINDEX_PGSZ/sizeof(u32); i++){
if( aShare[i]!=aPrivate[i] ){
/* Atomic memory operations are not required here because if
** the value needs to be changed, that means it is not being
** accessed concurrently. */
aShare[i] = aPrivate[i];
}
}
}
#endif
if( iFrame<=iLast ) break;
}
sqlite3_free(aFrame);
}
finished:
if( rc==SQLITE_OK ){
volatile WalCkptInfo *pInfo;
int i;
pWal->hdr.aFrameCksum[0] = aFrameCksum[0];
pWal->hdr.aFrameCksum[1] = aFrameCksum[1];
walIndexWriteHdr(pWal);
/* Reset the checkpoint-header. This is safe because this thread is
** currently holding locks that exclude all other writers and
** checkpointers. Then set the values of read-mark slots 1 through N.
*/
pInfo = walCkptInfo(pWal);
pInfo->nBackfill = 0;
pInfo->nBackfillAttempted = pWal->hdr.mxFrame;
pInfo->aReadMark[0] = 0;
for(i=1; i<WAL_NREADER; i++){
rc = walLockExclusive(pWal, WAL_READ_LOCK(i), 1);
if( rc==SQLITE_OK ){
if( i==1 && pWal->hdr.mxFrame ){
pInfo->aReadMark[i] = pWal->hdr.mxFrame;
}else{
pInfo->aReadMark[i] = READMARK_NOT_USED;
}
walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1);
}else if( rc!=SQLITE_BUSY ){
goto recovery_error;
}
}
/* If more than one frame was recovered from the log file, report an
** event via sqlite3_log(). This is to help with identifying performance
** problems caused by applications routinely shutting down without
** checkpointing the log file.
*/
if( pWal->hdr.nPage ){
sqlite3_log(SQLITE_NOTICE_RECOVER_WAL,
"recovered %d frames from WAL file %s",
pWal->hdr.mxFrame, pWal->zWalName
);
}
}
recovery_error:
WALTRACE(("WAL%p: recovery %s\n", pWal, rc ? "failed" : "ok"));
walUnlockExclusive(pWal, iLock, WAL_READ_LOCK(0)-iLock);
return rc;
}
/*
** Close an open wal-index.
*/
static void walIndexClose(Wal *pWal, int isDelete){
if( pWal->exclusiveMode==WAL_HEAPMEMORY_MODE || pWal->bShmUnreliable ){
int i;
for(i=0; i<pWal->nWiData; i++){
sqlite3_free((void *)pWal->apWiData[i]);
pWal->apWiData[i] = 0;
}
}
if( pWal->exclusiveMode!=WAL_HEAPMEMORY_MODE ){
sqlite3OsShmUnmap(pWal->pDbFd, isDelete);
}
}
/*
** Open a connection to the WAL file zWalName. The database file must
** already be opened on connection pDbFd. The buffer that zWalName points
** to must remain valid for the lifetime of the returned Wal* handle.
**
** A SHARED lock should be held on the database file when this function
** is called. The purpose of this SHARED lock is to prevent any other
** client from unlinking the WAL or wal-index file. If another process
** were to do this just after this client opened one of these files, the
** system would be badly broken.
**
** If the log file is successfully opened, SQLITE_OK is returned and
** *ppWal is set to point to a new WAL handle. If an error occurs,
** an SQLite error code is returned and *ppWal is left unmodified.
*/
int sqlite3WalOpen(
sqlite3_vfs *pVfs, /* vfs module to open wal and wal-index */
sqlite3_file *pDbFd, /* The open database file */
const char *zWalName, /* Name of the WAL file */
int bNoShm, /* True to run in heap-memory mode */
i64 mxWalSize, /* Truncate WAL to this size on reset */
Wal **ppWal /* OUT: Allocated Wal handle */
){
int rc; /* Return Code */
Wal *pRet; /* Object to allocate and return */
int flags; /* Flags passed to OsOpen() */
assert( zWalName && zWalName[0] );
assert( pDbFd );
/* Verify the values of various constants. Any changes to the values
** of these constants would result in an incompatible on-disk format
** for the -shm file. Any change that causes one of these asserts to
** fail is a backward compatibility problem, even if the change otherwise
** works.
**
** This table also serves as a helpful cross-reference when trying to
** interpret hex dumps of the -shm file.
*/
assert( 48 == sizeof(WalIndexHdr) );
assert( 40 == sizeof(WalCkptInfo) );
assert( 120 == WALINDEX_LOCK_OFFSET );
assert( 136 == WALINDEX_HDR_SIZE );
assert( 4096 == HASHTABLE_NPAGE );
assert( 4062 == HASHTABLE_NPAGE_ONE );
assert( 8192 == HASHTABLE_NSLOT );
assert( 383 == HASHTABLE_HASH_1 );
assert( 32768 == WALINDEX_PGSZ );
assert( 8 == SQLITE_SHM_NLOCK );
assert( 5 == WAL_NREADER );
assert( 24 == WAL_FRAME_HDRSIZE );
assert( 32 == WAL_HDRSIZE );
assert( 120 == WALINDEX_LOCK_OFFSET + WAL_WRITE_LOCK );
assert( 121 == WALINDEX_LOCK_OFFSET + WAL_CKPT_LOCK );
assert( 122 == WALINDEX_LOCK_OFFSET + WAL_RECOVER_LOCK );
assert( 123 == WALINDEX_LOCK_OFFSET + WAL_READ_LOCK(0) );
assert( 124 == WALINDEX_LOCK_OFFSET + WAL_READ_LOCK(1) );
assert( 125 == WALINDEX_LOCK_OFFSET + WAL_READ_LOCK(2) );
assert( 126 == WALINDEX_LOCK_OFFSET + WAL_READ_LOCK(3) );
assert( 127 == WALINDEX_LOCK_OFFSET + WAL_READ_LOCK(4) );
/* In the amalgamation, the os_unix.c and os_win.c source files come before
** this source file. Verify that the #defines of the locking byte offsets
** in os_unix.c and os_win.c agree with the WALINDEX_LOCK_OFFSET value.
** For that matter, if the lock offset ever changes from its initial design
** value of 120, we need to know that so there is an assert() to check it.
*/
#ifdef WIN_SHM_BASE
assert( WIN_SHM_BASE==WALINDEX_LOCK_OFFSET );
#endif
#ifdef UNIX_SHM_BASE
assert( UNIX_SHM_BASE==WALINDEX_LOCK_OFFSET );
#endif
/* Allocate an instance of struct Wal to return. */
*ppWal = 0;
pRet = (Wal*)sqlite3MallocZero(sizeof(Wal) + pVfs->szOsFile);
if( !pRet ){
return SQLITE_NOMEM;
}
pRet->pVfs = pVfs;
pRet->pWalFd = (sqlite3_file *)&pRet[1];
pRet->pDbFd = pDbFd;
pRet->readLock = -1;
pRet->mxWalSize = mxWalSize;
pRet->zWalName = zWalName;
pRet->syncHeader = 1;
pRet->padToSectorBoundary = 1;
pRet->exclusiveMode = (bNoShm ? WAL_HEAPMEMORY_MODE: WAL_NORMAL_MODE);
/* Open file handle on the write-ahead log file. */
flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_WAL);
rc = sqlite3OsOpen(pVfs, zWalName, pRet->pWalFd, flags, &flags);
if( rc==SQLITE_OK && flags&SQLITE_OPEN_READONLY ){
pRet->readOnly = WAL_RDONLY;
}
if( rc!=SQLITE_OK ){
walIndexClose(pRet, 0);
sqlite3OsClose(pRet->pWalFd);
sqlite3_free(pRet);
}else{
int iDC = sqlite3OsDeviceCharacteristics(pDbFd);
if( iDC & SQLITE_IOCAP_SEQUENTIAL ){ pRet->syncHeader = 0; }
if( iDC & SQLITE_IOCAP_POWERSAFE_OVERWRITE ){
pRet->padToSectorBoundary = 0;
}
*ppWal = pRet;
WALTRACE(("WAL%d: opened\n", pRet));
}
return rc;
}
/*
** Change the size to which the WAL file is truncated on each reset.
*/
void sqlite3WalLimit(Wal *pWal, i64 iLimit){
if( pWal ) pWal->mxWalSize = iLimit;
}
/*
** Find the smallest page number out of all pages held in the WAL that
** has not been returned by any prior invocation of this method on the
** same WalIterator object. Write into *piFrame the frame index where
** that page was last written into the WAL. Write into *piPage the page
** number.
**
** Return 0 on success. If there are no pages in the WAL with a page
** number larger than *piPage, then return 1.
*/
static int walIteratorNext(
WalIterator *p, /* Iterator */
u32 *piPage, /* OUT: The page number of the next page */
u32 *piFrame /* OUT: Wal frame index of next page */
){
u32 iMin; /* Result pgno must be greater than iMin */
u32 iRet = 0xFFFFFFFF; /* 0xffffffff is never a valid page number */
int i; /* For looping through segments */
iMin = p->iPrior;
assert( iMin<0xffffffff );
for(i=p->nSegment-1; i>=0; i--){
struct WalSegment *pSegment = &p->aSegment[i];
while( pSegment->iNext<pSegment->nEntry ){
u32 iPg = pSegment->aPgno[pSegment->aIndex[pSegment->iNext]];
if( iPg>iMin ){
if( iPg<iRet ){
iRet = iPg;
*piFrame = pSegment->iZero + pSegment->aIndex[pSegment->iNext];
}
break;
}
pSegment->iNext++;
}
}
*piPage = p->iPrior = iRet;
return (iRet==0xFFFFFFFF);
}
/*
** This function merges two sorted lists into a single sorted list.
**
** aLeft[] and aRight[] are arrays of indices. The sort key is
** aContent[aLeft[]] and aContent[aRight[]]. Upon entry, the following
** is guaranteed for all J<K:
**
** aContent[aLeft[J]] < aContent[aLeft[K]]
** aContent[aRight[J]] < aContent[aRight[K]]
**
** This routine overwrites aRight[] with a new (probably longer) sequence
** of indices such that the aRight[] contains every index that appears in
** either aLeft[] or the old aRight[] and such that the second condition
** above is still met.
**
** The aContent[aLeft[X]] values will be unique for all X. And the
** aContent[aRight[X]] values will be unique too. But there might be
** one or more combinations of X and Y such that
**
** aLeft[X]!=aRight[Y] && aContent[aLeft[X]] == aContent[aRight[Y]]
**
** When that happens, omit the aLeft[X] and use the aRight[Y] index.
*/
static void walMerge(
const u32 *aContent, /* Pages in wal - keys for the sort */
ht_slot *aLeft, /* IN: Left hand input list */
int nLeft, /* IN: Elements in array *paLeft */
ht_slot **paRight, /* IN/OUT: Right hand input list */
int *pnRight, /* IN/OUT: Elements in *paRight */
ht_slot *aTmp /* Temporary buffer */
){
int iLeft = 0; /* Current index in aLeft */
int iRight = 0; /* Current index in aRight */
int iOut = 0; /* Current index in output buffer */
int nRight = *pnRight;
ht_slot *aRight = *paRight;
assert( nLeft>0 && nRight>0 );
while( iRight<nRight || iLeft<nLeft ){
ht_slot logpage;
Pgno dbpage;
if( (iLeft<nLeft)
&& (iRight>=nRight || aContent[aLeft[iLeft]]<aContent[aRight[iRight]])
){
logpage = aLeft[iLeft++];
}else{
logpage = aRight[iRight++];
}
dbpage = aContent[logpage];
aTmp[iOut++] = logpage;
if( iLeft<nLeft && aContent[aLeft[iLeft]]==dbpage ) iLeft++;
assert( iLeft>=nLeft || aContent[aLeft[iLeft]]>dbpage );
assert( iRight>=nRight || aContent[aRight[iRight]]>dbpage );
}
*paRight = aLeft;
*pnRight = iOut;
memcpy(aLeft, aTmp, sizeof(aTmp[0])*iOut);
}
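/*
** A small worked example of walMerge(): let aContent[0..3] hold page
** numbers {5, 7, 5, 9}, with aLeft=={0,1} (pages 5 and 7) and
** aRight=={2,3} (pages 5 and 9).  Page 5 appears on both sides, so the
** merge keeps the aRight index 2 (the later frame) and drops index 0,
** producing the output list {2, 1, 3}: pages {5, 7, 9} in sorted order
** with the duplicate removed.
*/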
/*
** Sort the elements in list aList using aContent[] as the sort key.
** Remove elements with duplicate keys, preferring to keep the
** larger aList[] values.
**
** The aList[] entries are indices into aContent[]. The values in
** aList[] are to be sorted so that for all J<K:
**
** aContent[aList[J]] < aContent[aList[K]]
**
** For any X and Y such that
**
** aContent[aList[X]] == aContent[aList[Y]]
**
** Keep the larger of the two values aList[X] and aList[Y] and discard
** the smaller.
*/
static void walMergesort(
const u32 *aContent, /* Pages in wal */
ht_slot *aBuffer, /* Buffer of at least *pnList items to use */
ht_slot *aList, /* IN/OUT: List to sort */
int *pnList /* IN/OUT: Number of elements in aList[] */
){
struct Sublist {
int nList; /* Number of elements in aList */
ht_slot *aList; /* Pointer to sub-list content */
};
const int nList = *pnList; /* Size of input list */
int nMerge = 0; /* Number of elements in list aMerge */
ht_slot *aMerge = 0; /* List to be merged */
int iList; /* Index into input list */
u32 iSub = 0; /* Index into aSub array */
struct Sublist aSub[13]; /* Array of sub-lists */
memset(aSub, 0, sizeof(aSub));
assert( nList<=HASHTABLE_NPAGE && nList>0 );
assert( HASHTABLE_NPAGE==(1<<(ArraySize(aSub)-1)) );
for(iList=0; iList<nList; iList++){
nMerge = 1;
aMerge = &aList[iList];
for(iSub=0; iList & (1<<iSub); iSub++){
struct Sublist *p;
assert( iSub<ArraySize(aSub) );
p = &aSub[iSub];
assert( p->aList && p->nList<=(1<<iSub) );
assert( p->aList==&aList[iList&~((2<<iSub)-1)] );
walMerge(aContent, p->aList, p->nList, &aMerge, &nMerge, aBuffer);
}
aSub[iSub].aList = aMerge;
aSub[iSub].nList = nMerge;
}
for(iSub++; iSub<ArraySize(aSub); iSub++){
if( nList & (1<<iSub) ){
struct Sublist *p;
assert( iSub<ArraySize(aSub) );
p = &aSub[iSub];
assert( p->nList<=(1<<iSub) );
assert( p->aList==&aList[nList&~((2<<iSub)-1)] );
walMerge(aContent, p->aList, p->nList, &aMerge, &nMerge, aBuffer);
}
}
assert( aMerge==aList );
*pnList = nMerge;
#ifdef SQLITE_DEBUG
{
int i;
for(i=1; i<*pnList; i++){
assert( aContent[aList[i]] > aContent[aList[i-1]] );
}
}
#endif
}
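/*
** The aSub[] array above implements a binary-counter merge: after the
** main loop has consumed iList input elements, aSub[k] holds a sorted
** run of up to 2^k elements for each set bit k of iList.  For example,
** with nList==5 the main loop leaves one element in aSub[0] and four in
** aSub[2], and the cleanup loop then merges those two runs into a single
** sorted list of at most five elements (fewer if duplicate page numbers
** were removed along the way).
*/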
/*
** Free an iterator allocated by walIteratorInit().
*/
static void walIteratorFree(WalIterator *p){
sqlite3_free(p);
}
/*
** Construct a WalIterator object that can be used to loop over all
** pages in the WAL following frame nBackfill in ascending order. Frames
** nBackfill or earlier may be included - excluding them is an optimization
** only. The caller must hold the checkpoint lock.
**
** On success, make *pp point to the newly allocated WalIterator object
** and return SQLITE_OK. Otherwise, return an error code. If this routine
** returns an error, the value of *pp is undefined.
**
** The calling routine should invoke walIteratorFree() to destroy the
** WalIterator object when it has finished with it.
*/
static int walIteratorInit(Wal *pWal, u32 nBackfill, WalIterator **pp){
WalIterator *p; /* Return value */
int nSegment; /* Number of segments to merge */
u32 iLast; /* Last frame in log */
sqlite3_int64 nByte; /* Number of bytes to allocate */
int i; /* Iterator variable */
ht_slot *aTmp; /* Temp space used by merge-sort */
int rc = SQLITE_OK; /* Return Code */
/* This routine only runs while holding the checkpoint lock. And
** it only runs if there is actually content in the log (mxFrame>0).
*/
assert( pWal->ckptLock && pWal->hdr.mxFrame>0 );
iLast = pWal->hdr.mxFrame;
/* Allocate space for the WalIterator object. */
nSegment = walFramePage(iLast) + 1;
nByte = sizeof(WalIterator)
+ (nSegment-1)*sizeof(struct WalSegment)
+ iLast*sizeof(ht_slot);
p = (WalIterator *)sqlite3_malloc64(nByte);
if( !p ){
return SQLITE_NOMEM;
}
memset(p, 0, nByte);
p->nSegment = nSegment;
/* Allocate temporary space used by the merge-sort routine. This block
** of memory will be freed before this function returns.
*/
aTmp = (ht_slot *)sqlite3_malloc64(
sizeof(ht_slot) * (iLast>HASHTABLE_NPAGE?HASHTABLE_NPAGE:iLast)
);
if( !aTmp ){
rc = SQLITE_NOMEM;
}
for(i=walFramePage(nBackfill+1); rc==SQLITE_OK && i<nSegment; i++){
WalHashLoc sLoc;
rc = walHashGet(pWal, i, &sLoc);
if( rc==SQLITE_OK ){
int j; /* Counter variable */
int nEntry; /* Number of entries in this segment */
ht_slot *aIndex; /* Sorted index for this segment */
if( (i+1)==nSegment ){
nEntry = (int)(iLast - sLoc.iZero);
}else{
nEntry = (int)((u32*)sLoc.aHash - (u32*)sLoc.aPgno);
}
aIndex = &((ht_slot *)&p->aSegment[p->nSegment])[sLoc.iZero];
sLoc.iZero++;
for(j=0; j<nEntry; j++){
aIndex[j] = (ht_slot)j;
}
walMergesort((u32 *)sLoc.aPgno, aTmp, aIndex, &nEntry);
p->aSegment[i].iZero = sLoc.iZero;
p->aSegment[i].nEntry = nEntry;
p->aSegment[i].aIndex = aIndex;
p->aSegment[i].aPgno = (u32 *)sLoc.aPgno;
}
}
sqlite3_free(aTmp);
if( rc!=SQLITE_OK ){
walIteratorFree(p);
p = 0;
}
*pp = p;
return rc;
}
#ifdef SQLITE_ENABLE_SETLK_TIMEOUT
/*
** Attempt to enable blocking locks. Blocking locks are enabled only if (a)
** they are supported by the VFS, and (b) the database handle is configured
** with a busy-timeout. Return 1 if blocking locks are successfully enabled,
** or 0 otherwise.
*/
static int walEnableBlocking(Wal *pWal){
int res = 0;
if( pWal->db ){
int tmout = pWal->db->busyTimeout;
if( tmout ){
int rc;
rc = sqlite3OsFileControl(
pWal->pDbFd, SQLITE_FCNTL_LOCK_TIMEOUT, (void*)&tmout
);
res = (rc==SQLITE_OK);
}
}
return res;
}
/*
** Disable blocking locks.
*/
static void walDisableBlocking(Wal *pWal){
int tmout = 0;
sqlite3OsFileControl(pWal->pDbFd, SQLITE_FCNTL_LOCK_TIMEOUT, (void*)&tmout);
}
/*
** If parameter bLock is true, attempt to enable blocking locks, take
** the WRITER lock, and then disable blocking locks. If blocking locks
** cannot be enabled, no attempt to obtain the WRITER lock is made. Return
** an SQLite error code if an error occurs, or SQLITE_OK otherwise. It is not
** an error if blocking locks can not be enabled.
**
** If the bLock parameter is false and the WRITER lock is held, release it.
*/
int sqlite3WalWriteLock(Wal *pWal, int bLock){
int rc = SQLITE_OK;
assert( pWal->readLock<0 || bLock==0 );
if( bLock ){
assert( pWal->db );
if( walEnableBlocking(pWal) ){
rc = walLockExclusive(pWal, WAL_WRITE_LOCK, 1);
if( rc==SQLITE_OK ){
pWal->writeLock = 1;
}
walDisableBlocking(pWal);
}
}else if( pWal->writeLock ){
walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);
pWal->writeLock = 0;
}
return rc;
}
/*
** Set the database handle used to determine if blocking locks are required.
*/
void sqlite3WalDb(Wal *pWal, sqlite3 *db){
pWal->db = db;
}
/*
** Take an exclusive WRITE lock. Blocking if so configured.
*/
static int walLockWriter(Wal *pWal){
int rc;
walEnableBlocking(pWal);
rc = walLockExclusive(pWal, WAL_WRITE_LOCK, 1);
walDisableBlocking(pWal);
return rc;
}
#else
# define walEnableBlocking(x) 0
# define walDisableBlocking(x)
# define walLockWriter(pWal) walLockExclusive((pWal), WAL_WRITE_LOCK, 1)
# define sqlite3WalDb(pWal, db)
#endif /* ifdef SQLITE_ENABLE_SETLK_TIMEOUT */
/*
** Attempt to obtain the exclusive WAL lock defined by parameters lockIdx and
** n. If the attempt fails and parameter xBusy is not NULL, then it is a
** busy-handler function. Invoke it and retry the lock until either the
** lock is successfully obtained or the busy-handler returns 0.
*/
static int walBusyLock(
Wal *pWal, /* WAL connection */
int (*xBusy)(void*), /* Function to call when busy */
void *pBusyArg, /* Context argument for xBusyHandler */
int lockIdx, /* Offset of first byte to lock */
int n /* Number of bytes to lock */
){
int rc;
do {
rc = walLockExclusive(pWal, lockIdx, n);
}while( xBusy && rc==SQLITE_BUSY && xBusy(pBusyArg) );
#ifdef SQLITE_ENABLE_SETLK_TIMEOUT
if( rc==SQLITE_BUSY_TIMEOUT ){
walDisableBlocking(pWal);
rc = SQLITE_BUSY;
}
#endif
return rc;
}
/*
** The cache of the wal-index header must be valid to call this function.
** Return the page-size in bytes used by the database.
*/
static int walPagesize(Wal *pWal){
return (pWal->hdr.szPage&0xfe00) + ((pWal->hdr.szPage&0x0001)<<16);
}
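/*
** This encoding squeezes the 17-bit legal page-size range 512..65536
** into the 16-bit hdr.szPage field: 65536 (0x10000) is stored as 0x0001,
** while every smaller power-of-two page size is stored as-is.  For
** example:
**
**     stored 0x1000 (4096)  ->  (0x1000 & 0xfe00) + (0<<16) == 4096
**     stored 0x0001         ->  (0x0001 & 0xfe00) + (1<<16) == 65536
*/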
/*
** The following is guaranteed when this function is called:
**
** a) the WRITER lock is held,
** b) the entire log file has been checkpointed, and
** c) any existing readers are reading exclusively from the database
** file - there are no readers that may attempt to read a frame from
** the log file.
**
** This function updates the shared-memory structures so that the next
** client to write to the database (which may be this one) does so by
** writing frames into the start of the log file.
**
** The value of parameter salt1 is used as the aSalt[1] value in the
** new wal-index header. It should be passed a pseudo-random value (i.e.
** one obtained from sqlite3_randomness()).
*/
static void walRestartHdr(Wal *pWal, u32 salt1){
volatile WalCkptInfo *pInfo = walCkptInfo(pWal);
int i; /* Loop counter */
u32 *aSalt = pWal->hdr.aSalt; /* Big-endian salt values */
pWal->nCkpt++;
pWal->hdr.mxFrame = 0;
sqlite3Put4byte((u8*)&aSalt[0], 1 + sqlite3Get4byte((u8*)&aSalt[0]));
memcpy(&pWal->hdr.aSalt[1], &salt1, 4);
walIndexWriteHdr(pWal);
AtomicStore(&pInfo->nBackfill, 0);
pInfo->nBackfillAttempted = 0;
pInfo->aReadMark[1] = 0;
for(i=2; i<WAL_NREADER; i++) pInfo->aReadMark[i] = READMARK_NOT_USED;
assert( pInfo->aReadMark[0]==0 );
}
/*
** Copy as much content as we can from the WAL back into the database file
** in response to an sqlite3_wal_checkpoint() request or the equivalent.
**
** The amount of information copied from WAL to database might be limited
** by active readers. This routine will never overwrite a database page
** that a concurrent reader might be using.
**
** All I/O barrier operations (a.k.a fsyncs) occur in this routine when
** SQLite is in WAL-mode in synchronous=NORMAL. That means that if
** checkpoints are always run by a background thread or background
** process, foreground threads will never block on a lengthy fsync call.
**
** Fsync is called on the WAL before writing content out of the WAL and
** into the database. This ensures that the new content is persistent
** in the WAL and can be recovered following a power-loss or hard reset.
**
** Fsync is also called on the database file if (and only if) the entire
** WAL content is copied into the database file. This second fsync makes
** it safe to delete the WAL since the new content will persist in the
** database file.
**
** This routine uses and updates the nBackfill field of the wal-index header.
** This is the only routine that will increase the value of nBackfill.
** (A WAL reset or recovery will revert nBackfill to zero, but not increase
** its value.)
**
** The caller must be holding sufficient locks to ensure that no other
** checkpoint is running (in any other thread or process) at the same
** time.
*/
static int walCheckpoint(
Wal *pWal, /* Wal connection */
sqlite3 *db, /* Check for interrupts on this handle */
int eMode, /* One of PASSIVE, FULL or RESTART */
int (*xBusy)(void*), /* Function to call when busy */
void *pBusyArg, /* Context argument for xBusyHandler */
int sync_flags, /* Flags for OsSync() (or 0) */
u8 *zBuf /* Temporary buffer to use */
){
int rc = SQLITE_OK; /* Return code */
int szPage; /* Database page-size */
WalIterator *pIter = 0; /* Wal iterator context */
u32 iDbpage = 0; /* Next database page to write */
u32 iFrame = 0; /* Wal frame containing data for iDbpage */
u32 mxSafeFrame; /* Max frame that can be backfilled */
u32 mxPage; /* Max database page to write */
int i; /* Loop counter */
volatile WalCkptInfo *pInfo; /* The checkpoint status information */
szPage = walPagesize(pWal);
testcase( szPage<=32768 );
testcase( szPage>=65536 );
pInfo = walCkptInfo(pWal);
if( pInfo->nBackfill<pWal->hdr.mxFrame ){
/* EVIDENCE-OF: R-62920-47450 The busy-handler callback is never invoked
** in the SQLITE_CHECKPOINT_PASSIVE mode. */
assert( eMode!=SQLITE_CHECKPOINT_PASSIVE || xBusy==0 );
/* Compute in mxSafeFrame the index of the last frame of the WAL that is
** safe to write into the database. Frames beyond mxSafeFrame might
** overwrite database pages that are in use by active readers and thus
** cannot be backfilled from the WAL.
*/
mxSafeFrame = pWal->hdr.mxFrame;
mxPage = pWal->hdr.nPage;
for(i=1; i<WAL_NREADER; i++){
u32 y = AtomicLoad(pInfo->aReadMark+i);
if( mxSafeFrame>y ){
assert( y<=pWal->hdr.mxFrame );
rc = walBusyLock(pWal, xBusy, pBusyArg, WAL_READ_LOCK(i), 1);
if( rc==SQLITE_OK ){
u32 iMark = (i==1 ? mxSafeFrame : READMARK_NOT_USED);
AtomicStore(pInfo->aReadMark+i, iMark);
walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1);
}else if( rc==SQLITE_BUSY ){
mxSafeFrame = y;
xBusy = 0;
}else{
goto walcheckpoint_out;
}
}
}
/* Allocate the iterator */
if( pInfo->nBackfill<mxSafeFrame ){
rc = walIteratorInit(pWal, pInfo->nBackfill, &pIter);
assert( rc==SQLITE_OK || pIter==0 );
}
if( pIter
&& (rc = walBusyLock(pWal,xBusy,pBusyArg,WAL_READ_LOCK(0),1))==SQLITE_OK
){
u32 nBackfill = pInfo->nBackfill;
pInfo->nBackfillAttempted = mxSafeFrame;
/* Sync the WAL to disk */
rc = sqlite3OsSync(pWal->pWalFd, CKPT_SYNC_FLAGS(sync_flags));
/* If the database may grow as a result of this checkpoint, hint
** about the eventual size of the db file to the VFS layer.
*/
if( rc==SQLITE_OK ){
i64 nReq = ((i64)mxPage * szPage);
i64 nSize; /* Current size of database file */
sqlite3OsFileControl(pWal->pDbFd, SQLITE_FCNTL_CKPT_START, 0);
rc = sqlite3OsFileSize(pWal->pDbFd, &nSize);
if( rc==SQLITE_OK && nSize<nReq ){
if( (nSize+65536+(i64)pWal->hdr.mxFrame*szPage)<nReq ){
/* If the size of the final database is larger than the current
** database plus the amount of data in the wal file, plus the
** maximum size of the pending-byte page (65536 bytes), then
          ** there must be corruption somewhere. */
rc = SQLITE_CORRUPT_BKPT;
}else{
sqlite3OsFileControlHint(pWal->pDbFd, SQLITE_FCNTL_SIZE_HINT,&nReq);
}
}
}
/* Iterate through the contents of the WAL, copying data to the db file */
while( rc==SQLITE_OK && 0==walIteratorNext(pIter, &iDbpage, &iFrame) ){
i64 iOffset;
assert( walFramePgno(pWal, iFrame)==iDbpage );
if( AtomicLoad(&db->u1.isInterrupted) ){
rc = db->mallocFailed ? SQLITE_NOMEM : SQLITE_INTERRUPT;
break;
}
if( iFrame<=nBackfill || iFrame>mxSafeFrame || iDbpage>mxPage ){
continue;
}
iOffset = walFrameOffset(iFrame, szPage) + WAL_FRAME_HDRSIZE;
/* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL file */
rc = sqlite3OsRead(pWal->pWalFd, zBuf, szPage, iOffset);
if( rc!=SQLITE_OK ) break;
iOffset = (iDbpage-1)*(i64)szPage;
testcase( IS_BIG_INT(iOffset) );
rc = sqlite3OsWrite(pWal->pDbFd, zBuf, szPage, iOffset);
if( rc!=SQLITE_OK ) break;
}
sqlite3OsFileControl(pWal->pDbFd, SQLITE_FCNTL_CKPT_DONE, 0);
/* If work was actually accomplished... */
if( rc==SQLITE_OK ){
if( mxSafeFrame==walIndexHdr(pWal)->mxFrame ){
i64 szDb = pWal->hdr.nPage*(i64)szPage;
testcase( IS_BIG_INT(szDb) );
rc = sqlite3OsTruncate(pWal->pDbFd, szDb);
if( rc==SQLITE_OK ){
rc = sqlite3OsSync(pWal->pDbFd, CKPT_SYNC_FLAGS(sync_flags));
}
}
if( rc==SQLITE_OK ){
AtomicStore(&pInfo->nBackfill, mxSafeFrame);
}
}
/* Release the reader lock held while backfilling */
walUnlockExclusive(pWal, WAL_READ_LOCK(0), 1);
}
if( rc==SQLITE_BUSY ){
/* Reset the return code so as not to report a checkpoint failure
** just because there are active readers. */
rc = SQLITE_OK;
}
}
/* If this is an SQLITE_CHECKPOINT_RESTART or TRUNCATE operation, and the
** entire wal file has been copied into the database file, then block
** until all readers have finished using the wal file. This ensures that
** the next process to write to the database restarts the wal file.
*/
if( rc==SQLITE_OK && eMode!=SQLITE_CHECKPOINT_PASSIVE ){
assert( pWal->writeLock );
if( pInfo->nBackfill<pWal->hdr.mxFrame ){
rc = SQLITE_BUSY;
}else if( eMode>=SQLITE_CHECKPOINT_RESTART ){
u32 salt1;
sqlite3_randomness(4, &salt1);
assert( pInfo->nBackfill==pWal->hdr.mxFrame );
rc = walBusyLock(pWal, xBusy, pBusyArg, WAL_READ_LOCK(1), WAL_NREADER-1);
if( rc==SQLITE_OK ){
if( eMode==SQLITE_CHECKPOINT_TRUNCATE ){
/* IMPLEMENTATION-OF: R-44699-57140 This mode works the same way as
** SQLITE_CHECKPOINT_RESTART with the addition that it also
** truncates the log file to zero bytes just prior to a
** successful return.
**
** In theory, it might be safe to do this without updating the
** wal-index header in shared memory, as all subsequent reader or
** writer clients should see that the entire log file has been
** checkpointed and behave accordingly. This seems unsafe though,
** as it would leave the system in a state where the contents of
** the wal-index header do not match the contents of the
** file-system. To avoid this, update the wal-index header to
** indicate that the log file contains zero valid frames. */
walRestartHdr(pWal, salt1);
rc = sqlite3OsTruncate(pWal->pWalFd, 0);
}
walUnlockExclusive(pWal, WAL_READ_LOCK(1), WAL_NREADER-1);
}
}
}
walcheckpoint_out:
walIteratorFree(pIter);
return rc;
}
/*
** If the WAL file is currently larger than nMax bytes in size, truncate
** it to exactly nMax bytes. If an error occurs while doing so, ignore it.
*/
static void walLimitSize(Wal *pWal, i64 nMax){
i64 sz;
int rx;
sqlite3BeginBenignMalloc();
rx = sqlite3OsFileSize(pWal->pWalFd, &sz);
if( rx==SQLITE_OK && (sz > nMax ) ){
rx = sqlite3OsTruncate(pWal->pWalFd, nMax);
}
sqlite3EndBenignMalloc();
if( rx ){
sqlite3_log(rx, "cannot limit WAL size: %s", pWal->zWalName);
}
}
/*
** Close a connection to a log file.
*/
int sqlite3WalClose(
Wal *pWal, /* Wal to close */
sqlite3 *db, /* For interrupt flag */
int sync_flags, /* Flags to pass to OsSync() (or 0) */
int nBuf,
u8 *zBuf /* Buffer of at least nBuf bytes */
){
int rc = SQLITE_OK;
if( pWal ){
int isDelete = 0; /* True to unlink wal and wal-index files */
/* If an EXCLUSIVE lock can be obtained on the database file (using the
    ** ordinary, rollback-mode locking methods), this guarantees that the
** connection associated with this log file is the only connection to
** the database. In this case checkpoint the database and unlink both
** the wal and wal-index files.
**
** The EXCLUSIVE lock is not released before returning.
*/
if( zBuf!=0
&& SQLITE_OK==(rc = sqlite3OsLock(pWal->pDbFd, SQLITE_LOCK_EXCLUSIVE))
){
if( pWal->exclusiveMode==WAL_NORMAL_MODE ){
pWal->exclusiveMode = WAL_EXCLUSIVE_MODE;
}
rc = sqlite3WalCheckpoint(pWal, db,
SQLITE_CHECKPOINT_PASSIVE, 0, 0, sync_flags, nBuf, zBuf, 0, 0
);
if( rc==SQLITE_OK ){
int bPersist = -1;
sqlite3OsFileControlHint(
pWal->pDbFd, SQLITE_FCNTL_PERSIST_WAL, &bPersist
);
if( bPersist!=1 ){
/* Try to delete the WAL file if the checkpoint completed and
        ** fsynced (rc==SQLITE_OK) and if we are not in persistent-wal
** mode (!bPersist) */
isDelete = 1;
}else if( pWal->mxWalSize>=0 ){
/* Try to truncate the WAL file to zero bytes if the checkpoint
** completed and fsynced (rc==SQLITE_OK) and we are in persistent
** WAL mode (bPersist) and if the PRAGMA journal_size_limit is a
** non-negative value (pWal->mxWalSize>=0). Note that we truncate
** to zero bytes as truncating to the journal_size_limit might
** leave a corrupt WAL file on disk. */
walLimitSize(pWal, 0);
}
}
}
walIndexClose(pWal, isDelete);
sqlite3OsClose(pWal->pWalFd);
if( isDelete ){
sqlite3BeginBenignMalloc();
sqlite3OsDelete(pWal->pVfs, pWal->zWalName, 0);
sqlite3EndBenignMalloc();
}
WALTRACE(("WAL%p: closed\n", pWal));
sqlite3_free((void *)pWal->apWiData);
sqlite3_free(pWal);
}
return rc;
}
/*
** Try to read the wal-index header. Return 0 on success and 1 if
** there is a problem.
**
** The wal-index is in shared memory. Another thread or process might
** be writing the header at the same time this procedure is trying to
** read it, which might result in inconsistency. A dirty read is detected
** by verifying that both copies of the header are the same and also by
** a checksum on the header.
**
** If and only if the read is consistent and the header is different from
** pWal->hdr, then pWal->hdr is updated to the content of the new header
** and *pChanged is set to 1.
**
** If the checksum cannot be verified return non-zero. If the header
** is read successfully and the checksum verified, return zero.
*/
static SQLITE_NO_TSAN int walIndexTryHdr(Wal *pWal, int *pChanged){
u32 aCksum[2]; /* Checksum on the header content */
WalIndexHdr h1, h2; /* Two copies of the header content */
WalIndexHdr volatile *aHdr; /* Header in shared memory */
/* The first page of the wal-index must be mapped at this point. */
assert( pWal->nWiData>0 && pWal->apWiData[0] );
/* Read the header. This might happen concurrently with a write to the
** same area of shared memory on a different CPU in a SMP,
** meaning it is possible that an inconsistent snapshot is read
** from the file. If this happens, return non-zero.
**
** tag-20200519-1:
** There are two copies of the header at the beginning of the wal-index.
** When reading, read [0] first then [1]. Writes are in the reverse order.
** Memory barriers are used to prevent the compiler or the hardware from
** reordering the reads and writes. TSAN and similar tools can sometimes
** give false-positive warnings about these accesses because the tools do not
** account for the double-read and the memory barrier. The use of mutexes
** here would be problematic as the memory being accessed is potentially
  ** shared among multiple processes and not all mutex implementations work
** reliably in that environment.
*/
aHdr = walIndexHdr(pWal);
memcpy(&h1, (void *)&aHdr[0], sizeof(h1)); /* Possible TSAN false-positive */
walShmBarrier(pWal);
memcpy(&h2, (void *)&aHdr[1], sizeof(h2));
if( memcmp(&h1, &h2, sizeof(h1))!=0 ){
return 1; /* Dirty read */
}
if( h1.isInit==0 ){
return 1; /* Malformed header - probably all zeros */
}
walChecksumBytes(1, (u8*)&h1, sizeof(h1)-sizeof(h1.aCksum), 0, aCksum);
if( aCksum[0]!=h1.aCksum[0] || aCksum[1]!=h1.aCksum[1] ){
return 1; /* Checksum does not match */
}
if( memcmp(&pWal->hdr, &h1, sizeof(WalIndexHdr)) ){
*pChanged = 1;
memcpy(&pWal->hdr, &h1, sizeof(WalIndexHdr));
pWal->szPage = (pWal->hdr.szPage&0xfe00) + ((pWal->hdr.szPage&0x0001)<<16);
testcase( pWal->szPage<=32768 );
testcase( pWal->szPage>=65536 );
}
/* The header was successfully read. Return zero. */
return 0;
}
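/*
** A sketch of why the double read above detects dirty reads.  The writer
** side (walIndexWriteHdr(), not shown in this excerpt) updates the two
** header copies in the opposite order, roughly:
**
**     memcpy((void*)&aHdr[1], &pWal->hdr, sizeof(WalIndexHdr));
**     walShmBarrier(pWal);
**     memcpy((void*)&aHdr[0], &pWal->hdr, sizeof(WalIndexHdr));
**
** Because the reader reads copy [0] before copy [1] while the writer
** updates [1] before [0], a reader that sees the new value in [0] is
** guaranteed the write of [1] already completed, so the copies match.
** A read that overlaps the write sees mismatched copies and is rejected
** as a dirty read; a torn read within a single copy is caught by the
** header checksum check.
*/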
/*
** This is the value that walTryBeginRead returns when it needs to
** be retried.
*/
#define WAL_RETRY (-1)
/*
** Read the wal-index header from the wal-index and into pWal->hdr.
** If the wal-header appears to be corrupt, try to reconstruct the
** wal-index from the WAL before returning.
**
** Set *pChanged to 1 if the wal-index header value in pWal->hdr is
** changed by this operation. If pWal->hdr is unchanged, set *pChanged
** to 0.
**
** If the wal-index header is successfully read, return SQLITE_OK.
** Otherwise an SQLite error code.
*/
static int walIndexReadHdr(Wal *pWal, int *pChanged){
int rc; /* Return code */
int badHdr; /* True if a header read failed */
volatile u32 *page0; /* Chunk of wal-index containing header */
/* Ensure that page 0 of the wal-index (the page that contains the
** wal-index header) is mapped. Return early if an error occurs here.
*/
assert( pChanged );
rc = walIndexPage(pWal, 0, &page0);
if( rc!=SQLITE_OK ){
assert( rc!=SQLITE_READONLY ); /* READONLY changed to OK in walIndexPage */
if( rc==SQLITE_READONLY_CANTINIT ){
/* The SQLITE_READONLY_CANTINIT return means that the shared-memory
** was openable but is not writable, and this thread is unable to
** confirm that another write-capable connection has the shared-memory
** open, and hence the content of the shared-memory is unreliable,
** since the shared-memory might be inconsistent with the WAL file
** and there is no writer on hand to fix it. */
assert( page0==0 );
assert( pWal->writeLock==0 );
assert( pWal->readOnly & WAL_SHM_RDONLY );
pWal->bShmUnreliable = 1;
pWal->exclusiveMode = WAL_HEAPMEMORY_MODE;
*pChanged = 1;
}else{
return rc; /* Any other non-OK return is just an error */
}
}else{
/* page0 can be NULL if the SHM is zero bytes in size and pWal->writeLock
** is zero, which prevents the SHM from growing */
testcase( page0!=0 );
}
assert( page0!=0 || pWal->writeLock==0 );
/* If the first page of the wal-index has been mapped, try to read the
** wal-index header immediately, without holding any lock. This usually
** works, but may fail if the wal-index header is corrupt or currently
** being modified by another thread or process.
*/
badHdr = (page0 ? walIndexTryHdr(pWal, pChanged) : 1);
/* If the first attempt failed, it might have been due to a race
** with a writer. So get a WRITE lock and try again.
*/
if( badHdr ){
if( pWal->bShmUnreliable==0 && (pWal->readOnly & WAL_SHM_RDONLY) ){
if( SQLITE_OK==(rc = walLockShared(pWal, WAL_WRITE_LOCK)) ){
walUnlockShared(pWal, WAL_WRITE_LOCK);
rc = SQLITE_READONLY_RECOVERY;
}
}else{
int bWriteLock = pWal->writeLock;
if( bWriteLock || SQLITE_OK==(rc = walLockWriter(pWal)) ){
pWal->writeLock = 1;
if( SQLITE_OK==(rc = walIndexPage(pWal, 0, &page0)) ){
badHdr = walIndexTryHdr(pWal, pChanged);
if( badHdr ){
/* If the wal-index header is still malformed even while holding
** a WRITE lock, it can only mean that the header is corrupted and
** needs to be reconstructed. So run recovery to do exactly that.
*/
rc = walIndexRecover(pWal);
*pChanged = 1;
}
}
if( bWriteLock==0 ){
pWal->writeLock = 0;
walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);
}
}
}
}
/* If the header is read successfully, check the version number to make
** sure the wal-index was not constructed with some future format that
** this version of SQLite cannot understand.
*/
if( badHdr==0 && pWal->hdr.iVersion!=WALINDEX_MAX_VERSION ){
rc = SQLITE_CANTOPEN_BKPT;
}
if( pWal->bShmUnreliable ){
if( rc!=SQLITE_OK ){
walIndexClose(pWal, 0);
pWal->bShmUnreliable = 0;
assert( pWal->nWiData>0 && pWal->apWiData[0]==0 );
/* walIndexRecover() might have returned SHORT_READ if a concurrent
** writer truncated the WAL out from under it. If that happens, it
** indicates that a writer has fixed the SHM file for us, so retry */
if( rc==SQLITE_IOERR_SHORT_READ ) rc = WAL_RETRY;
}
pWal->exclusiveMode = WAL_NORMAL_MODE;
}
return rc;
}
/*
** Open a transaction in a connection where the shared-memory is read-only
** and where we cannot verify that there is a separate write-capable connection
** on hand to keep the shared-memory up-to-date with the WAL file.
**
** This can happen, for example, when the shared-memory is implemented by
** memory-mapping a *-shm file, where a prior writer has shut down and
** left the *-shm file on disk, and now the present connection is trying
** to use that database but lacks write permission on the *-shm file.
** Other scenarios are also possible, depending on the VFS implementation.
**
** Precondition:
**
** The *-wal file has been read and an appropriate wal-index has been
** constructed in pWal->apWiData[] using heap memory instead of shared
** memory.
**
** If this function returns SQLITE_OK, then the read transaction has
** been successfully opened. In this case output variable (*pChanged)
** is set to true before returning if the caller should discard the
** contents of the page cache before proceeding. Or, if it returns
** WAL_RETRY, then the heap memory wal-index has been discarded and
** the caller should retry opening the read transaction from the
** beginning (including attempting to map the *-shm file).
**
** If an error occurs, an SQLite error code is returned.
*/
static int walBeginShmUnreliable(Wal *pWal, int *pChanged){
i64 szWal; /* Size of wal file on disk in bytes */
i64 iOffset; /* Current offset when reading wal file */
u8 aBuf[WAL_HDRSIZE]; /* Buffer to load WAL header into */
u8 *aFrame = 0; /* Malloc'd buffer to load entire frame */
int szFrame; /* Number of bytes in buffer aFrame[] */
u8 *aData; /* Pointer to data part of aFrame buffer */
volatile void *pDummy; /* Dummy argument for xShmMap */
int rc; /* Return code */
u32 aSaveCksum[2]; /* Saved copy of pWal->hdr.aFrameCksum */
assert( pWal->bShmUnreliable );
assert( pWal->readOnly & WAL_SHM_RDONLY );
assert( pWal->nWiData>0 && pWal->apWiData[0] );
/* Take WAL_READ_LOCK(0). This has the effect of preventing any
** writers from running a checkpoint, but does not stop them
** from running recovery. */
rc = walLockShared(pWal, WAL_READ_LOCK(0));
if( rc!=SQLITE_OK ){
if( rc==SQLITE_BUSY ) rc = WAL_RETRY;
goto begin_unreliable_shm_out;
}
pWal->readLock = 0;
/* Check to see if a separate writer has attached to the shared-memory area,
** thus making the shared-memory "reliable" again. Do this by invoking
** the xShmMap() routine of the VFS and looking to see if the return
** is SQLITE_READONLY instead of SQLITE_READONLY_CANTINIT.
**
** If the shared-memory is now "reliable" return WAL_RETRY, which will
** cause the heap-memory WAL-index to be discarded and the actual
** shared memory to be used in its place.
**
** This step is important because, even though this connection is holding
** the WAL_READ_LOCK(0) which prevents a checkpoint, a writer might
  ** have already checkpointed the WAL file and, while the current read
  ** transaction is active, wrap the WAL and start overwriting frames
  ** that this process wants to use.
**
** Once sqlite3OsShmMap() has been called for an sqlite3_file and has
** returned any SQLITE_READONLY value, it must return only SQLITE_READONLY
** or SQLITE_READONLY_CANTINIT or some error for all subsequent invocations,
** even if some external agent does a "chmod" to make the shared-memory
** writable by us, until sqlite3OsShmUnmap() has been called.
** This is a requirement on the VFS implementation.
*/
rc = sqlite3OsShmMap(pWal->pDbFd, 0, WALINDEX_PGSZ, 0, &pDummy);
assert( rc!=SQLITE_OK ); /* SQLITE_OK not possible for read-only connection */
if( rc!=SQLITE_READONLY_CANTINIT ){
rc = (rc==SQLITE_READONLY ? WAL_RETRY : rc);
goto begin_unreliable_shm_out;
}
/* We reach this point only if the real shared-memory is still unreliable.
** Assume the in-memory WAL-index substitute is correct and load it
** into pWal->hdr.
*/
memcpy(&pWal->hdr, (void*)walIndexHdr(pWal), sizeof(WalIndexHdr));
/* Make sure some writer hasn't come in and changed the WAL file out
** from under us, then disconnected, while we were not looking.
*/
rc = sqlite3OsFileSize(pWal->pWalFd, &szWal);
if( rc!=SQLITE_OK ){
goto begin_unreliable_shm_out;
}
if( szWal<WAL_HDRSIZE ){
/* If the wal file is too small to contain a wal-header and the
** wal-index header has mxFrame==0, then it must be safe to proceed
** reading the database file only. However, the page cache cannot
** be trusted, as a read/write connection may have connected, written
** the db, run a checkpoint, truncated the wal file and disconnected
** since this client's last read transaction. */
*pChanged = 1;
rc = (pWal->hdr.mxFrame==0 ? SQLITE_OK : WAL_RETRY);
goto begin_unreliable_shm_out;
}
/* Check the salt keys at the start of the wal file still match. */
rc = sqlite3OsRead(pWal->pWalFd, aBuf, WAL_HDRSIZE, 0);
if( rc!=SQLITE_OK ){
goto begin_unreliable_shm_out;
}
if( memcmp(&pWal->hdr.aSalt, &aBuf[16], 8) ){
/* Some writer has wrapped the WAL file while we were not looking.
** Return WAL_RETRY which will cause the in-memory WAL-index to be
** rebuilt. */
rc = WAL_RETRY;
goto begin_unreliable_shm_out;
}
/* Allocate a buffer to read frames into */
szFrame = pWal->hdr.szPage + WAL_FRAME_HDRSIZE;
aFrame = (u8 *)sqlite3_malloc64(szFrame);
if( aFrame==0 ){
rc = SQLITE_NOMEM;
goto begin_unreliable_shm_out;
}
aData = &aFrame[WAL_FRAME_HDRSIZE];
/* Check to see if a complete transaction has been appended to the
** wal file since the heap-memory wal-index was created. If so, the
** heap-memory wal-index is discarded and WAL_RETRY returned to
** the caller. */
aSaveCksum[0] = pWal->hdr.aFrameCksum[0];
aSaveCksum[1] = pWal->hdr.aFrameCksum[1];
for(iOffset=walFrameOffset(pWal->hdr.mxFrame+1, pWal->hdr.szPage);
iOffset+szFrame<=szWal;
iOffset+=szFrame
){
u32 pgno; /* Database page number for frame */
u32 nTruncate; /* dbsize field from frame header */
/* Read and decode the next log frame. */
rc = sqlite3OsRead(pWal->pWalFd, aFrame, szFrame, iOffset);
if( rc!=SQLITE_OK ) break;
if( !walDecodeFrame(pWal, &pgno, &nTruncate, aData, aFrame) ) break;
/* If nTruncate is non-zero, then a complete transaction has been
** appended to this wal file. Set rc to WAL_RETRY and break out of
** the loop. */
if( nTruncate ){
rc = WAL_RETRY;
break;
}
}
pWal->hdr.aFrameCksum[0] = aSaveCksum[0];
pWal->hdr.aFrameCksum[1] = aSaveCksum[1];
begin_unreliable_shm_out:
sqlite3_free(aFrame);
if( rc!=SQLITE_OK ){
int i;
for(i=0; i<pWal->nWiData; i++){
sqlite3_free((void*)pWal->apWiData[i]);
pWal->apWiData[i] = 0;
}
pWal->bShmUnreliable = 0;
sqlite3WalEndReadTransaction(pWal);
*pChanged = 1;
}
return rc;
}
/*
** Attempt to start a read transaction. This might fail due to a race or
** other transient condition. When that happens, it returns WAL_RETRY to
** indicate to the caller that it is safe to retry immediately.
**
** On success return SQLITE_OK. On a permanent failure (such as an
** I/O error or an SQLITE_BUSY because another process is running
** recovery) return a positive error code.
**
** The useWal parameter is true to force the use of the WAL and disable
** the case where the WAL is bypassed because it has been completely
** checkpointed. If useWal==0 then this routine calls walIndexReadHdr()
** to make a copy of the wal-index header into pWal->hdr. If the
** wal-index header has changed, *pChanged is set to 1 (as an indication
** to the caller that the local page cache is obsolete and needs to be
** flushed.) When useWal==1, the wal-index header is assumed to already
** be loaded and the pChanged parameter is unused.
**
** The caller must set the cnt parameter to the number of prior calls to
** this routine during the current read attempt that returned WAL_RETRY.
** This routine will start taking more aggressive measures to clear the
** race conditions after multiple WAL_RETRY returns, and after an excessive
** number of errors will ultimately return SQLITE_PROTOCOL. The
** SQLITE_PROTOCOL return indicates that some other process has gone rogue
** and is not honoring the locking protocol. There is a vanishingly small
** chance that SQLITE_PROTOCOL could be returned because of a run of really
** bad luck when there is lots of contention for the wal-index, but that
** possibility is so small that it can be safely neglected, we believe.
**
** On success, this routine obtains a read lock on
** WAL_READ_LOCK(pWal->readLock). The pWal->readLock integer is
** in the range 0 <= pWal->readLock < WAL_NREADER. If pWal->readLock==(-1)
** that means the Wal does not hold any read lock. The reader must not
** access any database page that is modified by a WAL frame up to and
** including frame number aReadMark[pWal->readLock]. The reader will
** use WAL frames up to and including pWal->hdr.mxFrame if pWal->readLock>0.
** Or if pWal->readLock==0, then the reader will ignore the WAL
** completely and get all content directly from the database file.
** If the useWal parameter is 1 then the WAL will never be ignored and
** this routine will always set pWal->readLock>0 on success.
** When the read transaction is completed, the caller must release the
** lock on WAL_READ_LOCK(pWal->readLock) and set pWal->readLock to -1.
**
** This routine uses the nBackfill and aReadMark[] fields of the header
** to select a particular WAL_READ_LOCK() that strives to let the
** checkpoint process do as much work as possible. This routine might
** update values of the aReadMark[] array in the header, but if it does
** so it takes care to hold an exclusive lock on the corresponding
** WAL_READ_LOCK() while changing values.
*/
static int walTryBeginRead(Wal *pWal, int *pChanged, int useWal, int cnt){
volatile WalCkptInfo *pInfo; /* Checkpoint information in wal-index */
u32 mxReadMark; /* Largest aReadMark[] value */
int mxI; /* Index of largest aReadMark[] value */
int i; /* Loop counter */
int rc = SQLITE_OK; /* Return code */
u32 mxFrame; /* Wal frame to lock to */
assert( pWal->readLock<0 ); /* Not currently locked */
/* useWal may only be set for read/write connections */
assert( (pWal->readOnly & WAL_SHM_RDONLY)==0 || useWal==0 );
/* Take steps to avoid spinning forever if there is a protocol error.
**
** Circumstances that cause a RETRY should only last for the briefest
** instances of time. No I/O or other system calls are done while the
** locks are held, so the locks should not be held for very long. But
** if we are unlucky, another process that is holding a lock might get
** paged out or take a page-fault that is time-consuming to resolve,
** during the few nanoseconds that it is holding the lock. In that case,
** it might take longer than normal for the lock to free.
**
** After 5 RETRYs, we begin calling sqlite3OsSleep(). The first few
** calls to sqlite3OsSleep() have a delay of 1 microsecond. Really this
** is more of a scheduler yield than an actual delay. But on the 10th
** and subsequent retries, the delays start becoming longer and longer,
** so that on the 100th (and last) RETRY we delay for 323 milliseconds.
** The total delay time before giving up is less than 10 seconds.
*/
if( cnt>5 ){
int nDelay = 1; /* Pause time in microseconds */
if( cnt>100 ){
VVA_ONLY( pWal->lockError = 1; )
return SQLITE_PROTOCOL;
}
if( cnt>=10 ) nDelay = (cnt-9)*(cnt-9)*39;
sqlite3OsSleep(pWal->pVfs, nDelay);
}
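/* Worked example of the schedule above (numbers for illustration only):
**
**   cnt=10:  nDelay = (10-9)^2  * 39 =      39 microseconds
**   cnt=50:  nDelay = (50-9)^2  * 39 =  65,559 microseconds (~66 ms)
**   cnt=100: nDelay = (100-9)^2 * 39 = 322,959 microseconds (~323 ms)
**
** Summing 39*(1^2+2^2+...+91^2) plus the handful of 1-microsecond
** pauses gives roughly 9.96 seconds of total sleeping before the
** routine gives up and returns SQLITE_PROTOCOL.
*/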
if( !useWal ){
assert( rc==SQLITE_OK );
if( pWal->bShmUnreliable==0 ){
rc = walIndexReadHdr(pWal, pChanged);
}
if( rc==SQLITE_BUSY ){
/* If there is not a recovery running in another thread or process
** then convert BUSY errors to WAL_RETRY. If recovery is known to
** be running, convert BUSY to BUSY_RECOVERY. There is a race here
** which might cause WAL_RETRY to be returned even if BUSY_RECOVERY
** would be technically correct. But the race is benign since with
** WAL_RETRY this routine will be called again and will probably be
** right on the second iteration.
*/
if( pWal->apWiData[0]==0 ){
/* This branch is taken when the xShmMap() method returns SQLITE_BUSY.
** We assume this is a transient condition, so return WAL_RETRY. The
** xShmMap() implementation used by the default unix and win32 VFS
** modules may return SQLITE_BUSY due to a race condition in the
** code that determines whether or not the shared-memory region
** must be zeroed before the requested page is returned.
*/
rc = WAL_RETRY;
}else if( SQLITE_OK==(rc = walLockShared(pWal, WAL_RECOVER_LOCK)) ){
walUnlockShared(pWal, WAL_RECOVER_LOCK);
rc = WAL_RETRY;
}else if( rc==SQLITE_BUSY ){
rc = SQLITE_BUSY_RECOVERY;
}
}
if( rc!=SQLITE_OK ){
return rc;
}
else if( pWal->bShmUnreliable ){
return walBeginShmUnreliable(pWal, pChanged);
}
}
assert( pWal->nWiData>0 );
assert( pWal->apWiData[0]!=0 );
pInfo = walCkptInfo(pWal);
if( !useWal && AtomicLoad(&pInfo->nBackfill)==pWal->hdr.mxFrame
#ifdef SQLITE_ENABLE_SNAPSHOT
&& (pWal->pSnapshot==0 || pWal->hdr.mxFrame==0)
#endif
){
/* The WAL has been completely backfilled (or it is empty)
** and can be safely ignored.
*/
rc = walLockShared(pWal, WAL_READ_LOCK(0));
walShmBarrier(pWal);
if( rc==SQLITE_OK ){
if( memcmp((void *)walIndexHdr(pWal), &pWal->hdr, sizeof(WalIndexHdr)) ){
/* It is not safe to allow the reader to continue here if frames
** may have been appended to the log before READ_LOCK(0) was obtained.
** When holding READ_LOCK(0), the reader ignores the entire log file,
** which implies that the database file contains a trustworthy
** snapshot. Since holding READ_LOCK(0) prevents a checkpoint from
** happening, this is usually correct.
**
** However, if frames have been appended to the log (or if the log
** is wrapped and written for that matter) before the READ_LOCK(0)
** is obtained, that is not necessarily true. A checkpointer may
** have started to backfill the appended frames but crashed before
** it finished, leaving a corrupt image in the database file.
*/
walUnlockShared(pWal, WAL_READ_LOCK(0));
return WAL_RETRY;
}
pWal->readLock = 0;
return SQLITE_OK;
}else if( rc!=SQLITE_BUSY ){
return rc;
}
}
/* If we get this far, it means that the reader will want to use
** the WAL to get at content from recent commits. The job now is
** to select one of the aReadMark[] entries that is closest to
** but not exceeding pWal->hdr.mxFrame and lock that entry.
*/
mxReadMark = 0;
mxI = 0;
mxFrame = pWal->hdr.mxFrame;
#ifdef SQLITE_ENABLE_SNAPSHOT
if( pWal->pSnapshot && pWal->pSnapshot->mxFrame<mxFrame ){
mxFrame = pWal->pSnapshot->mxFrame;
}
#endif
for(i=1; i<WAL_NREADER; i++){
u32 thisMark = AtomicLoad(pInfo->aReadMark+i);
if( mxReadMark<=thisMark && thisMark<=mxFrame ){
assert( thisMark!=READMARK_NOT_USED );
mxReadMark = thisMark;
mxI = i;
}
}
if( (pWal->readOnly & WAL_SHM_RDONLY)==0
&& (mxReadMark<mxFrame || mxI==0)
){
for(i=1; i<WAL_NREADER; i++){
rc = walLockExclusive(pWal, WAL_READ_LOCK(i), 1);
if( rc==SQLITE_OK ){
AtomicStore(pInfo->aReadMark+i,mxFrame);
mxReadMark = mxFrame;
mxI = i;
walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1);
break;
}else if( rc!=SQLITE_BUSY ){
return rc;
}
}
}
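/* Worked example (hypothetical values, for illustration only): suppose
** mxFrame==60 and aReadMark[] holds {0, 17, 50, unused...}. The scan
** above picks mxI==2, mxReadMark==50. Because 50<mxFrame, a read/write
** connection then tries to upgrade a slot: if it wins the exclusive
** lock on WAL_READ_LOCK(1), it stores 60 in aReadMark[1] and uses that
** slot, so the reader sees every frame up to mxFrame. */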
if( mxI==0 ){
assert( rc==SQLITE_BUSY || (pWal->readOnly & WAL_SHM_RDONLY)!=0 );
return rc==SQLITE_BUSY ? WAL_RETRY : SQLITE_READONLY_CANTINIT;
}
rc = walLockShared(pWal, WAL_READ_LOCK(mxI));
if( rc ){
return rc==SQLITE_BUSY ? WAL_RETRY : rc;
}
/* Now that the read-lock has been obtained, check that neither the
** value in the aReadMark[] array or the contents of the wal-index
** header have changed.
**
** It is necessary to check that the wal-index header did not change
** between the time it was read and when the shared-lock on
** WAL_READ_LOCK(mxI) was obtained, to account for the possibility
** that the log file may have been wrapped by a writer, or that frames
** that occur later in the log than pWal->hdr.mxFrame may have been
** copied into the database by a checkpointer. If either of these things
** happened, then reading the database with the current value of
** pWal->hdr.mxFrame risks reading a corrupted snapshot. So, retry
** instead.
**
** Before checking that the live wal-index header has not changed
** since it was read, set Wal.minFrame to the first frame in the wal
** file that has not yet been checkpointed. This client will not need
** to read any frames earlier than minFrame from the wal file - they
** can be safely read directly from the database file.
**
** Because a ShmBarrier() call is made between taking the copy of
** nBackfill and checking that the wal-header in shared-memory still
** matches the one cached in pWal->hdr, it is guaranteed that the
** checkpointer that set nBackfill was not working with a wal-index
** header newer than that cached in pWal->hdr. If it were, that could
** cause a problem. The checkpointer could omit to checkpoint
** a version of page X that lies before pWal->minFrame (call that version
** A) on the basis that there is a newer version (version B) of the same
** page later in the wal file. But if version B happens to lie past
** frame pWal->hdr.mxFrame - then the client would incorrectly assume
** that it can read version A from the database file. However, since
** we can guarantee that the checkpointer that set nBackfill could not
** see any pages past pWal->hdr.mxFrame, this problem does not come up.
*/
pWal->minFrame = AtomicLoad(&pInfo->nBackfill)+1;
walShmBarrier(pWal);
if( AtomicLoad(pInfo->aReadMark+mxI)!=mxReadMark
|| memcmp((void *)walIndexHdr(pWal), &pWal->hdr, sizeof(WalIndexHdr))
){
walUnlockShared(pWal, WAL_READ_LOCK(mxI));
return WAL_RETRY;
}else{
assert( mxReadMark<=pWal->hdr.mxFrame );
pWal->readLock = (i16)mxI;
}
return rc;
}
#ifdef SQLITE_ENABLE_SNAPSHOT
/*
** Attempt to reduce the value of the WalCkptInfo.nBackfillAttempted
** variable so that older snapshots can be accessed. To do this, loop
** through all wal frames from nBackfillAttempted to (nBackfill+1),
** comparing their content to the corresponding page of the database
** file, if any. Set nBackfillAttempted to the frame number of the
** first frame for which the wal file content matches the db file.
**
** This is only really safe if the file-system is such that any page
** writes made by earlier checkpointers were atomic operations, which
** is not always true. It is also possible that nBackfillAttempted
** may be left set to a value larger than expected, if a wal frame
** contains content that is a duplicate of an earlier version of the same
** page.
**
** SQLITE_OK is returned if successful, or an SQLite error code if an
** error occurs. It is not an error if nBackfillAttempted cannot be
** decreased at all.
*/
int sqlite3WalSnapshotRecover(Wal *pWal){
int rc;
assert( pWal->readLock>=0 );
rc = walLockExclusive(pWal, WAL_CKPT_LOCK, 1);
if( rc==SQLITE_OK ){
volatile WalCkptInfo *pInfo = walCkptInfo(pWal);
int szPage = (int)pWal->szPage;
i64 szDb; /* Size of db file in bytes */
rc = sqlite3OsFileSize(pWal->pDbFd, &szDb);
if( rc==SQLITE_OK ){
void *pBuf1 = sqlite3_malloc(szPage);
void *pBuf2 = sqlite3_malloc(szPage);
if( pBuf1==0 || pBuf2==0 ){
rc = SQLITE_NOMEM;
}else{
u32 i = pInfo->nBackfillAttempted;
for(i=pInfo->nBackfillAttempted; i>AtomicLoad(&pInfo->nBackfill); i--){
WalHashLoc sLoc; /* Hash table location */
u32 pgno; /* Page number in db file */
i64 iDbOff; /* Offset of db file entry */
i64 iWalOff; /* Offset of wal file entry */
rc = walHashGet(pWal, walFramePage(i), &sLoc);
if( rc!=SQLITE_OK ) break;
assert( i - sLoc.iZero - 1 >=0 );
pgno = sLoc.aPgno[i-sLoc.iZero-1];
iDbOff = (i64)(pgno-1) * szPage;
if( iDbOff+szPage<=szDb ){
iWalOff = walFrameOffset(i, szPage) + WAL_FRAME_HDRSIZE;
rc = sqlite3OsRead(pWal->pWalFd, pBuf1, szPage, iWalOff);
if( rc==SQLITE_OK ){
rc = sqlite3OsRead(pWal->pDbFd, pBuf2, szPage, iDbOff);
}
if( rc!=SQLITE_OK || 0==memcmp(pBuf1, pBuf2, szPage) ){
break;
}
}
pInfo->nBackfillAttempted = i-1;
}
}
sqlite3_free(pBuf1);
sqlite3_free(pBuf2);
}
walUnlockExclusive(pWal, WAL_CKPT_LOCK, 1);
}
return rc;
}
#endif /* SQLITE_ENABLE_SNAPSHOT */
/*
** Begin a read transaction on the database.
**
** This routine used to be called sqlite3OpenSnapshot() and with good reason:
** it takes a snapshot of the state of the WAL and wal-index for the current
** instant in time. The current thread will continue to use this snapshot.
** Other threads might append new content to the WAL and wal-index but
** that extra content is ignored by the current thread.
**
** If the database contents have changed since the previous read
** transaction, then *pChanged is set to 1 before returning. The
** Pager layer will use this to know that its cache is stale and
** needs to be flushed.
*/
int sqlite3WalBeginReadTransaction(Wal *pWal, int *pChanged){
int rc; /* Return code */
int cnt = 0; /* Number of TryBeginRead attempts */
#ifdef SQLITE_ENABLE_SNAPSHOT
int bChanged = 0;
WalIndexHdr *pSnapshot = pWal->pSnapshot;
#endif
assert( pWal->ckptLock==0 );
#ifdef SQLITE_ENABLE_SNAPSHOT
if( pSnapshot ){
if( memcmp(pSnapshot, &pWal->hdr, sizeof(WalIndexHdr))!=0 ){
bChanged = 1;
}
/* It is possible that there is a checkpointer thread running
** concurrent with this code. If this is the case, it may be that the
** checkpointer has already determined that it will checkpoint
** snapshot X, where X is later in the wal file than pSnapshot, but
** has not yet set the pInfo->nBackfillAttempted variable to indicate
** its intent. To avoid the race condition this leads to, ensure that
** there is no checkpointer process by taking a shared CKPT lock
** before checking pInfo->nBackfillAttempted. */
(void)walEnableBlocking(pWal);
rc = walLockShared(pWal, WAL_CKPT_LOCK);
walDisableBlocking(pWal);
if( rc!=SQLITE_OK ){
return rc;
}
pWal->ckptLock = 1;
}
#endif
do{
rc = walTryBeginRead(pWal, pChanged, 0, ++cnt);
}while( rc==WAL_RETRY );
testcase( (rc&0xff)==SQLITE_BUSY );
testcase( (rc&0xff)==SQLITE_IOERR );
testcase( rc==SQLITE_PROTOCOL );
testcase( rc==SQLITE_OK );
#ifdef SQLITE_ENABLE_SNAPSHOT
if( rc==SQLITE_OK ){
if( pSnapshot && memcmp(pSnapshot, &pWal->hdr, sizeof(WalIndexHdr))!=0 ){
/* At this point the client has a lock on an aReadMark[] slot holding
** a value equal to or smaller than pSnapshot->mxFrame, but pWal->hdr
** is populated with the wal-index header corresponding to the head
** of the wal file. Verify that pSnapshot is still valid before
** continuing. Reasons why pSnapshot might no longer be valid:
**
** (1) The WAL file has been reset since the snapshot was taken.
** In this case, the salt will have changed.
**
** (2) A checkpoint has been attempted that wrote frames past
** pSnapshot->mxFrame into the database file. Note that the
** checkpoint need not have completed for this to cause problems.
*/
volatile WalCkptInfo *pInfo = walCkptInfo(pWal);
assert( pWal->readLock>0 || pWal->hdr.mxFrame==0 );
assert( pInfo->aReadMark[pWal->readLock]<=pSnapshot->mxFrame );
/* Check that the wal file has not been wrapped. Assuming that it has
** not, also check that no checkpointer has attempted to checkpoint any
** frames beyond pSnapshot->mxFrame. If either of these conditions are
** true, return SQLITE_ERROR_SNAPSHOT. Otherwise, overwrite pWal->hdr
** with *pSnapshot and set *pChanged as appropriate for opening the
** snapshot. */
if( !memcmp(pSnapshot->aSalt, pWal->hdr.aSalt, sizeof(pWal->hdr.aSalt))
&& pSnapshot->mxFrame>=pInfo->nBackfillAttempted
){
assert( pWal->readLock>0 );
memcpy(&pWal->hdr, pSnapshot, sizeof(WalIndexHdr));
*pChanged = bChanged;
}else{
rc = SQLITE_ERROR_SNAPSHOT;
}
/* A client using a non-current snapshot may not ignore any frames
** from the start of the wal file. This is because, for a system
** where (minFrame < iSnapshot < maxFrame), a checkpointer may
** have omitted to checkpoint a frame earlier than minFrame in
** the file because there exists a frame after iSnapshot that
** is the same database page. */
pWal->minFrame = 1;
if( rc!=SQLITE_OK ){
sqlite3WalEndReadTransaction(pWal);
}
}
}
/* Release the shared CKPT lock obtained above. */
if( pWal->ckptLock ){
assert( pSnapshot );
walUnlockShared(pWal, WAL_CKPT_LOCK);
pWal->ckptLock = 0;
}
#endif
return rc;
}
/*
** Finish with a read transaction. All this does is release the
** read-lock.
*/
void sqlite3WalEndReadTransaction(Wal *pWal){
sqlite3WalEndWriteTransaction(pWal);
if( pWal->readLock>=0 ){
walUnlockShared(pWal, WAL_READ_LOCK(pWal->readLock));
pWal->readLock = -1;
}
}
/*
** Search the wal file for page pgno. If found, set *piRead to the frame that
** contains the page. Otherwise, if pgno is not in the wal file, set *piRead
** to zero.
**
** Return SQLITE_OK if successful, or an error code if an error occurs. If an
** error does occur, the final value of *piRead is undefined.
*/
int sqlite3WalFindFrame(
Wal *pWal, /* WAL handle */
Pgno pgno, /* Database page number to read data for */
u32 *piRead /* OUT: Frame number (or zero) */
){
u32 iRead = 0; /* If !=0, WAL frame to return data from */
u32 iLast = pWal->hdr.mxFrame; /* Last page in WAL for this reader */
int iHash; /* Used to loop through N hash tables */
int iMinHash;
/* This routine is only called from within a read transaction. */
assert( pWal->readLock>=0 || pWal->lockError );
/* If the "last page" field of the wal-index header snapshot is 0, then
** no data will be read from the wal under any circumstances. Return early
** in this case as an optimization. Likewise, if pWal->readLock==0,
** then the WAL is ignored by the reader so return early, as if the
** WAL were empty.
*/
if( iLast==0 || (pWal->readLock==0 && pWal->bShmUnreliable==0) ){
*piRead = 0;
return SQLITE_OK;
}
/* Search the hash table or tables for an entry matching page number
** pgno. Each iteration of the following for() loop searches one
** hash table (each hash table indexes up to HASHTABLE_NPAGE frames).
**
** This code might run concurrently with the code in walIndexAppend()
** that adds entries to the wal-index (and possibly to this hash
** table). This means the value just read from the hash
** slot (aHash[iKey]) may have been added before or after the
** current read transaction was opened. Values added after the
** read transaction was opened may have been written incorrectly -
** i.e. these slots may contain garbage data. However, we assume
** that any slots written before the current read transaction was
** opened remain unmodified.
**
** For the reasons above, the if(...) condition featured in the inner
** loop of the following block is more stringent than would be required
** if we had exclusive access to the hash-table:
**
** (aPgno[iFrame]==pgno):
** This condition filters out normal hash-table collisions.
**
** (iFrame<=iLast):
** This condition filters out entries that were added to the hash
** table after the current read-transaction had started.
*/
iMinHash = walFramePage(pWal->minFrame);
for(iHash=walFramePage(iLast); iHash>=iMinHash; iHash--){
WalHashLoc sLoc; /* Hash table location */
int iKey; /* Hash slot index */
int nCollide; /* Number of hash collisions remaining */
int rc; /* Error code */
u32 iH;
rc = walHashGet(pWal, iHash, &sLoc);
if( rc!=SQLITE_OK ){
return rc;
}
nCollide = HASHTABLE_NSLOT;
iKey = walHash(pgno);
while( (iH = AtomicLoad(&sLoc.aHash[iKey]))!=0 ){
u32 iFrame = iH + sLoc.iZero;
if( iFrame<=iLast && iFrame>=pWal->minFrame && sLoc.aPgno[iH-1]==pgno ){
assert( iFrame>iRead || CORRUPT_DB );
iRead = iFrame;
}
if( (nCollide--)==0 ){
return SQLITE_CORRUPT_BKPT;
}
iKey = walNextHash(iKey);
}
if( iRead ) break;
}
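/* Note on the loop above (illustrative): the hash tables are searched
** from the most recent back towards iMinHash, and within one table each
** probe that matches pgno overwrites iRead. Because entries for the
** same page are appended in frame order, iRead finishes holding the
** latest frame for pgno that is <=iLast - exactly the version this
** reader must use. */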
#ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT
/* If expensive assert() statements are available, do a linear search
** of the wal-index file content. Make sure the results agree with the
** result obtained using the hash indexes above. */
{
u32 iRead2 = 0;
u32 iTest;
assert( pWal->bShmUnreliable || pWal->minFrame>0 );
for(iTest=iLast; iTest>=pWal->minFrame && iTest>0; iTest--){
if( walFramePgno(pWal, iTest)==pgno ){
iRead2 = iTest;
break;
}
}
assert( iRead==iRead2 );
}
#endif
*piRead = iRead;
return SQLITE_OK;
}
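/* A minimal usage sketch (illustrative; assumes an open read
** transaction, with nBuf/aBuf being the caller's buffer): this is how a
** caller such as the pager layer would be expected to combine
** sqlite3WalFindFrame() with sqlite3WalReadFrame() below - page content
** comes from the WAL when a frame exists and from the database file
** otherwise:
**
**   u32 iFrame = 0;
**   rc = sqlite3WalFindFrame(pWal, pgno, &iFrame);
**   if( rc==SQLITE_OK ){
**     if( iFrame ){
**       rc = sqlite3WalReadFrame(pWal, iFrame, nBuf, aBuf);
**     }else{
**       ... read page pgno directly from the database file ...
**     }
**   }
*/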
/*
** Read the contents of frame iRead from the wal file into buffer pOut
** (which is nOut bytes in size). Return SQLITE_OK if successful, or an
** error code otherwise.
*/
int sqlite3WalReadFrame(
Wal *pWal, /* WAL handle */
u32 iRead, /* Frame to read */
int nOut, /* Size of buffer pOut in bytes */
u8 *pOut /* Buffer to write page data to */
){
int sz;
i64 iOffset;
sz = pWal->hdr.szPage;
sz = (sz&0xfe00) + ((sz&0x0001)<<16);
testcase( sz<=32768 );
testcase( sz>=65536 );
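/* Illustrative note: pWal->hdr.szPage holds the page size in the 16-bit
** encoded form produced in sqlite3WalFrames() below -
** (szPage&0xff00)|(szPage>>16) - so 4096 is stored as 0x1000 and 65536
** as 0x0001. The decode above inverts that encoding:
**
**   4096:  (0x1000&0xfe00) + ((0x1000&0x0001)<<16) = 0x1000  = 4096
**   65536: (0x0001&0xfe00) + ((0x0001&0x0001)<<16) = 0x10000 = 65536
**
** The frame then lives at walFrameOffset(iRead, sz) - assuming the
** usual layout of a 32-byte wal header followed by frames of
** (sz + 24-byte frame header) bytes each - plus WAL_FRAME_HDRSIZE to
** skip to the page data. */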
iOffset = walFrameOffset(iRead, sz) + WAL_FRAME_HDRSIZE;
/* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */
return sqlite3OsRead(pWal->pWalFd, pOut, (nOut>sz ? sz : nOut), iOffset);
}
/*
** Return the size of the database in pages (or zero, if unknown).
*/
Pgno sqlite3WalDbsize(Wal *pWal){
if( pWal && ALWAYS(pWal->readLock>=0) ){
return pWal->hdr.nPage;
}
return 0;
}
/*
** This function starts a write transaction on the WAL.
**
** A read transaction must have already been started by a prior call
** to sqlite3WalBeginReadTransaction().
**
** If another thread or process has written into the database since
** the read transaction was started, then it is not possible for this
** thread to write as doing so would cause a fork. So this routine
** returns SQLITE_BUSY_SNAPSHOT in that case and no write transaction is started.
**
** There can only be a single writer active at a time.
*/
int sqlite3WalBeginWriteTransaction(Wal *pWal){
int rc;
#ifdef SQLITE_ENABLE_SETLK_TIMEOUT
/* If the write-lock is already held, then it was obtained before the
** read-transaction was even opened, making this call a no-op.
** Return early. */
if( pWal->writeLock ){
assert( !memcmp(&pWal->hdr,(void *)walIndexHdr(pWal),sizeof(WalIndexHdr)) );
return SQLITE_OK;
}
#endif
/* Cannot start a write transaction without first holding a read
** transaction. */
assert( pWal->readLock>=0 );
assert( pWal->writeLock==0 && pWal->iReCksum==0 );
if( pWal->readOnly ){
return SQLITE_READONLY;
}
/* Only one writer allowed at a time. Get the write lock. Return
** SQLITE_BUSY if unable.
*/
rc = walLockExclusive(pWal, WAL_WRITE_LOCK, 1);
if( rc ){
return rc;
}
pWal->writeLock = 1;
/* If another connection has written to the database file since the
** time the read transaction on this connection was started, then
** the write is disallowed.
*/
if( memcmp(&pWal->hdr, (void *)walIndexHdr(pWal), sizeof(WalIndexHdr))!=0 ){
walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);
pWal->writeLock = 0;
rc = SQLITE_BUSY_SNAPSHOT;
}
return rc;
}
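/* A minimal calling sketch (for illustration; error handling elided):
** the caller always opens a read transaction first, then upgrades, and
** must be prepared to back off on SQLITE_BUSY_SNAPSHOT:
**
**   rc = sqlite3WalBeginReadTransaction(pWal, &changed);
**   if( rc==SQLITE_OK ) rc = sqlite3WalBeginWriteTransaction(pWal);
**   if( rc==SQLITE_BUSY_SNAPSHOT ){
**     ... end the read transaction, reopen it to refresh the
**     ... snapshot, then retry the upgrade ...
**   }
*/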
/*
** End a write transaction. The commit has already been done. This
** routine merely releases the lock.
*/
int sqlite3WalEndWriteTransaction(Wal *pWal){
if( pWal->writeLock ){
walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);
pWal->writeLock = 0;
pWal->iReCksum = 0;
pWal->truncateOnCommit = 0;
}
return SQLITE_OK;
}
/*
** If any data has been written (but not committed) to the log file, this
** function moves the write-pointer back to the start of the transaction.
**
** Additionally, the callback function is invoked for each frame written
** to the WAL since the start of the transaction. If the callback returns
** other than SQLITE_OK, it is not invoked again and the error code is
** returned to the caller.
**
** Otherwise, if the callback function does not return an error, this
** function returns SQLITE_OK.
*/
int sqlite3WalUndo(Wal *pWal, int (*xUndo)(void *, Pgno), void *pUndoCtx){
int rc = SQLITE_OK;
if( ALWAYS(pWal->writeLock) ){
Pgno iMax = pWal->hdr.mxFrame;
Pgno iFrame;
/* Restore the client's cache of the wal-index header to the state it
** was in before the client began writing to the database.
*/
memcpy(&pWal->hdr, (void *)walIndexHdr(pWal), sizeof(WalIndexHdr));
for(iFrame=pWal->hdr.mxFrame+1;
ALWAYS(rc==SQLITE_OK) && iFrame<=iMax;
iFrame++
){
/* This call cannot fail. Unless the page for which the page number
** is passed as the second argument is both (a) in the cache and
** (b) the holder of an outstanding reference, xUndo is either a no-op
** (if (a) is false) or simply expels the page from the cache (if (b)
** is false).
**
** If the upper layer is doing a rollback, it is guaranteed that there
** are no outstanding references to any page other than page 1. And
** page 1 is never written to the log until the transaction is
** committed. As a result, the call to xUndo may not fail.
*/
assert( walFramePgno(pWal, iFrame)!=1 );
rc = xUndo(pUndoCtx, walFramePgno(pWal, iFrame));
}
if( iMax!=pWal->hdr.mxFrame ) walCleanupHash(pWal);
}
return rc;
}
/*
** Argument aWalData must point to an array of WAL_SAVEPOINT_NDATA u32
** values. This function populates the array with values required to
** "rollback" the write position of the WAL handle back to the current
** point in the event of a savepoint rollback (via WalSavepointUndo()).
*/
void sqlite3WalSavepoint(Wal *pWal, u32 *aWalData){
assert( pWal->writeLock );
aWalData[0] = pWal->hdr.mxFrame;
aWalData[1] = pWal->hdr.aFrameCksum[0];
aWalData[2] = pWal->hdr.aFrameCksum[1];
aWalData[3] = pWal->nCkpt;
}
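/* Illustrative pairing (a sketch, not part of the build): a savepoint
** captures the current write position so a later partial rollback can
** restore it:
**
**   u32 aWalData[WAL_SAVEPOINT_NDATA];
**   sqlite3WalSavepoint(pWal, aWalData);      -- mark current position
**   ... write more frames via sqlite3WalFrames() ...
**   sqlite3WalSavepointUndo(pWal, aWalData);  -- discard those frames
*/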
/*
** Move the write position of the WAL back to the point identified by
** the values in the aWalData[] array. aWalData must point to an array
** of WAL_SAVEPOINT_NDATA u32 values that has been previously populated
** by a call to WalSavepoint().
*/
int sqlite3WalSavepointUndo(Wal *pWal, u32 *aWalData){
int rc = SQLITE_OK;
assert( pWal->writeLock );
assert( aWalData[3]!=pWal->nCkpt || aWalData[0]<=pWal->hdr.mxFrame );
if( aWalData[3]!=pWal->nCkpt ){
/* This savepoint was opened immediately after the write-transaction
** was started. Right after that, the writer decided to wrap around
** to the start of the log. Update the savepoint values to match.
*/
aWalData[0] = 0;
aWalData[3] = pWal->nCkpt;
}
if( aWalData[0]<pWal->hdr.mxFrame ){
pWal->hdr.mxFrame = aWalData[0];
pWal->hdr.aFrameCksum[0] = aWalData[1];
pWal->hdr.aFrameCksum[1] = aWalData[2];
walCleanupHash(pWal);
}
return rc;
}
/*
** This function is called just before writing a set of frames to the log
** file (see sqlite3WalFrames()). It checks to see if, instead of appending
** to the current log file, it is possible to overwrite the start of the
** existing log file with the new frames (i.e. "reset" the log). If so,
** it sets pWal->hdr.mxFrame to 0. Otherwise, pWal->hdr.mxFrame is left
** unchanged.
**
** SQLITE_OK is returned if no error is encountered (regardless of whether
** or not pWal->hdr.mxFrame is modified). An SQLite error code is returned
** if an error occurs.
*/
static int walRestartLog(Wal *pWal){
int rc = SQLITE_OK;
int cnt;
if( pWal->readLock==0 ){
volatile WalCkptInfo *pInfo = walCkptInfo(pWal);
assert( pInfo->nBackfill==pWal->hdr.mxFrame );
if( pInfo->nBackfill>0 ){
u32 salt1;
sqlite3_randomness(4, &salt1);
rc = walLockExclusive(pWal, WAL_READ_LOCK(1), WAL_NREADER-1);
if( rc==SQLITE_OK ){
/* If all readers are using WAL_READ_LOCK(0) (in other words if no
** readers are currently using the WAL), then the transaction's
** frames will overwrite the start of the existing log. Update the
** wal-index header to reflect this.
**
** In theory it would be OK to update the cache of the header only
** at this point. But updating the actual wal-index header is also
** safe and means there is no special case for sqlite3WalUndo()
** to handle if this transaction is rolled back. */
walRestartHdr(pWal, salt1);
walUnlockExclusive(pWal, WAL_READ_LOCK(1), WAL_NREADER-1);
}else if( rc!=SQLITE_BUSY ){
return rc;
}
}
walUnlockShared(pWal, WAL_READ_LOCK(0));
pWal->readLock = -1;
cnt = 0;
do{
int notUsed;
rc = walTryBeginRead(pWal, &notUsed, 1, ++cnt);
}while( rc==WAL_RETRY );
assert( (rc&0xff)!=SQLITE_BUSY ); /* BUSY not possible when useWal==1 */
testcase( (rc&0xff)==SQLITE_IOERR );
testcase( rc==SQLITE_PROTOCOL );
testcase( rc==SQLITE_OK );
}
return rc;
}
/*
** Information about the current state of the WAL file and where
** the next fsync should occur - passed from sqlite3WalFrames() into
** walWriteToLog().
*/
typedef struct WalWriter {
Wal *pWal; /* The complete WAL information */
sqlite3_file *pFd; /* The WAL file to which we write */
sqlite3_int64 iSyncPoint; /* Fsync at this offset */
int syncFlags; /* Flags for the fsync */
int szPage; /* Size of one page */
} WalWriter;
/*
** Write iAmt bytes of content into the WAL file beginning at iOffset.
** Do a sync when crossing the p->iSyncPoint boundary.
**
** In other words, if iSyncPoint is in between iOffset and iOffset+iAmt,
** first write the part before iSyncPoint, then sync, then write the
** rest.
*/
static int walWriteToLog(
WalWriter *p, /* WAL to write to */
void *pContent, /* Content to be written */
int iAmt, /* Number of bytes to write */
sqlite3_int64 iOffset /* Start writing at this offset */
){
int rc;
if( iOffset<p->iSyncPoint && iOffset+iAmt>=p->iSyncPoint ){
int iFirstAmt = (int)(p->iSyncPoint - iOffset);
rc = sqlite3OsWrite(p->pFd, pContent, iFirstAmt, iOffset);
if( rc ) return rc;
iOffset += iFirstAmt;
iAmt -= iFirstAmt;
pContent = (void*)(iFirstAmt + (char*)pContent);
assert( WAL_SYNC_FLAGS(p->syncFlags)!=0 );
rc = sqlite3OsSync(p->pFd, WAL_SYNC_FLAGS(p->syncFlags));
if( iAmt==0 || rc ) return rc;
}
rc = sqlite3OsWrite(p->pFd, pContent, iAmt, iOffset);
return rc;
}
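/* Worked example (hypothetical numbers): with p->iSyncPoint==8192, a
** call with iOffset==8000 and iAmt==400 straddles the boundary. The
** routine writes the first 192 bytes (offsets 8000..8191), syncs, then
** writes the remaining 208 bytes starting at offset 8192. A call that
** lies entirely on one side of iSyncPoint is a single plain write. */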
/*
** Write out a single frame of the WAL
*/
static int walWriteOneFrame(
WalWriter *p, /* Where to write the frame */
PgHdr *pPage, /* The page of the frame to be written */
int nTruncate, /* The commit flag. Usually 0. >0 for commit */
sqlite3_int64 iOffset /* Byte offset at which to write */
){
int rc; /* Result code from subfunctions */
void *pData; /* Data actually written */
u8 aFrame[WAL_FRAME_HDRSIZE]; /* Buffer to assemble frame-header in */
pData = pPage->pData;
walEncodeFrame(p->pWal, pPage->pgno, nTruncate, pData, aFrame);
rc = walWriteToLog(p, aFrame, sizeof(aFrame), iOffset);
if( rc ) return rc;
/* Write the page data */
rc = walWriteToLog(p, pData, p->szPage, iOffset+sizeof(aFrame));
return rc;
}
/*
** This function is called as part of committing a transaction within which
** one or more frames have been overwritten. It updates the checksums for
** all frames written to the wal file by the current transaction starting
** with the earliest to have been overwritten.
**
** SQLITE_OK is returned if successful, or an SQLite error code otherwise.
*/
static int walRewriteChecksums(Wal *pWal, u32 iLast){
const int szPage = pWal->szPage;/* Database page size */
int rc = SQLITE_OK; /* Return code */
u8 *aBuf; /* Buffer to load data from wal file into */
u8 aFrame[WAL_FRAME_HDRSIZE]; /* Buffer to assemble frame-headers in */
u32 iRead; /* Next frame to read from wal file */
i64 iCksumOff;
aBuf = sqlite3_malloc(szPage + WAL_FRAME_HDRSIZE);
if( aBuf==0 ) return SQLITE_NOMEM;
/* Find the checksum values to use as input for recalculating the
** first checksum. If the first frame is frame 1 (implying that the current
** transaction restarted the wal file), these values must be read from the
** wal-file header. Otherwise, read them from the frame header of the
** previous frame. */
assert( pWal->iReCksum>0 );
if( pWal->iReCksum==1 ){
iCksumOff = 24;
}else{
iCksumOff = walFrameOffset(pWal->iReCksum-1, szPage) + 16;
}
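/* For reference: bytes 24..31 of the wal header hold the header
** checksum (see the aWalHdr assembly in sqlite3WalFrames() below), and
** bytes 16..23 of each 24-byte frame header hold that frame's checksum
** - hence the offsets 24 and +16 computed above. */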
rc = sqlite3OsRead(pWal->pWalFd, aBuf, sizeof(u32)*2, iCksumOff);
pWal->hdr.aFrameCksum[0] = sqlite3Get4byte(aBuf);
pWal->hdr.aFrameCksum[1] = sqlite3Get4byte(&aBuf[sizeof(u32)]);
iRead = pWal->iReCksum;
pWal->iReCksum = 0;
for(; rc==SQLITE_OK && iRead<=iLast; iRead++){
i64 iOff = walFrameOffset(iRead, szPage);
rc = sqlite3OsRead(pWal->pWalFd, aBuf, szPage+WAL_FRAME_HDRSIZE, iOff);
if( rc==SQLITE_OK ){
u32 iPgno, nDbSize;
iPgno = sqlite3Get4byte(aBuf);
nDbSize = sqlite3Get4byte(&aBuf[4]);
walEncodeFrame(pWal, iPgno, nDbSize, &aBuf[WAL_FRAME_HDRSIZE], aFrame);
rc = sqlite3OsWrite(pWal->pWalFd, aFrame, sizeof(aFrame), iOff);
}
}
sqlite3_free(aBuf);
return rc;
}
/*
** Write a set of frames to the log. The caller must hold the write-lock
** on the log file (obtained using sqlite3WalBeginWriteTransaction()).
*/
int sqlite3WalFrames(
Wal *pWal, /* Wal handle to write to */
int szPage, /* Database page-size in bytes */
PgHdr *pList, /* List of dirty pages to write */
Pgno nTruncate, /* Database size after this commit */
int isCommit, /* True if this is a commit */
int sync_flags /* Flags to pass to OsSync() (or 0) */
){
int rc; /* Used to catch return codes */
u32 iFrame; /* Next frame address */
PgHdr *p; /* Iterator to run through pList with. */
PgHdr *pLast = 0; /* Last frame in list */
int nExtra = 0; /* Number of extra copies of last page */
int szFrame; /* The size of a single frame */
i64 iOffset; /* Next byte to write in WAL file */
WalWriter w; /* The writer */
u32 iFirst = 0; /* First frame that may be overwritten */
WalIndexHdr *pLive; /* Pointer to shared header */
assert( pList );
assert( pWal->writeLock );
/* If this frame set completes a transaction, then nTruncate>0. If
** nTruncate==0 then this frame set does not complete the transaction. */
assert( (isCommit!=0)==(nTruncate!=0) );
#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
{ int cnt; for(cnt=0, p=pList; p; p=p->pDirty, cnt++){}
WALTRACE(("WAL%p: frame write begin. %d frames. mxFrame=%d. %s\n",
pWal, cnt, pWal->hdr.mxFrame, isCommit ? "Commit" : "Spill"));
}
#endif
pLive = (WalIndexHdr*)walIndexHdr(pWal);
if( memcmp(&pWal->hdr, (void *)pLive, sizeof(WalIndexHdr))!=0 ){
iFirst = pLive->mxFrame+1;
}
/* See if it is possible to write these frames into the start of the
** log file, instead of appending to it at pWal->hdr.mxFrame.
*/
if( SQLITE_OK!=(rc = walRestartLog(pWal)) ){
return rc;
}
/* If this is the first frame written into the log, write the WAL
** header to the start of the WAL file. See comments at the top of
** this source file for a description of the WAL header format.
*/
iFrame = pWal->hdr.mxFrame;
if( iFrame==0 ){
u8 aWalHdr[WAL_HDRSIZE]; /* Buffer to assemble wal-header in */
u32 aCksum[2]; /* Checksum for wal-header */
sqlite3Put4byte(&aWalHdr[0], (WAL_MAGIC | SQLITE_BIGENDIAN));
sqlite3Put4byte(&aWalHdr[4], WAL_MAX_VERSION);
sqlite3Put4byte(&aWalHdr[8], szPage);
sqlite3Put4byte(&aWalHdr[12], pWal->nCkpt);
if( pWal->nCkpt==0 ) sqlite3_randomness(8, pWal->hdr.aSalt);
memcpy(&aWalHdr[16], pWal->hdr.aSalt, 8);
walChecksumBytes(1, aWalHdr, WAL_HDRSIZE-2*4, 0, aCksum);
sqlite3Put4byte(&aWalHdr[24], aCksum[0]);
sqlite3Put4byte(&aWalHdr[28], aCksum[1]);
pWal->szPage = szPage;
pWal->hdr.bigEndCksum = SQLITE_BIGENDIAN;
pWal->hdr.aFrameCksum[0] = aCksum[0];
pWal->hdr.aFrameCksum[1] = aCksum[1];
pWal->truncateOnCommit = 1;
rc = sqlite3OsWrite(pWal->pWalFd, aWalHdr, sizeof(aWalHdr), 0);
WALTRACE(("WAL%p: wal-header write %s\n", pWal, rc ? "failed" : "ok"));
if( rc!=SQLITE_OK ){
return rc;
}
/* Sync the header (unless SQLITE_IOCAP_SEQUENTIAL is true or unless
** all syncing is turned off by PRAGMA synchronous=OFF). Otherwise
** an out-of-order write following a WAL restart could result in
** database corruption. See the ticket:
**
** https://sqlite.org/src/info/ff5be73dee
*/
if( pWal->syncHeader ){
rc = sqlite3OsSync(pWal->pWalFd, CKPT_SYNC_FLAGS(sync_flags));
if( rc ) return rc;
}
}
assert( (int)pWal->szPage==szPage );
/* Setup information needed to write frames into the WAL */
w.pWal = pWal;
w.pFd = pWal->pWalFd;
w.iSyncPoint = 0;
w.syncFlags = sync_flags;
w.szPage = szPage;
iOffset = walFrameOffset(iFrame+1, szPage);
szFrame = szPage + WAL_FRAME_HDRSIZE;
/* Write all frames into the log file exactly once */
for(p=pList; p; p=p->pDirty){
int nDbSize; /* 0 normally. Positive == commit flag */
/* Check if this page has already been written into the wal file by
** the current transaction. If so, overwrite the existing frame and
** record the frame number in Wal.iReCksum - indicating that checksums
** for all frames from that point on must be recomputed when the
** transaction is committed (see walRewriteChecksums() above). */
if( iFirst && (p->pDirty || isCommit==0) ){
u32 iWrite = 0;
VVA_ONLY(rc =) sqlite3WalFindFrame(pWal, p->pgno, &iWrite);
assert( rc==SQLITE_OK || iWrite==0 );
if( iWrite>=iFirst ){
i64 iOff = walFrameOffset(iWrite, szPage) + WAL_FRAME_HDRSIZE;
void *pData;
if( pWal->iReCksum==0 || iWrite<pWal->iReCksum ){
pWal->iReCksum = iWrite;
}
pData = p->pData;
rc = sqlite3OsWrite(pWal->pWalFd, pData, szPage, iOff);
if( rc ) return rc;
p->flags &= ~PGHDR_WAL_APPEND;
continue;
}
}
iFrame++;
assert( iOffset==walFrameOffset(iFrame, szPage) );
nDbSize = (isCommit && p->pDirty==0) ? nTruncate : 0;
rc = walWriteOneFrame(&w, p, nDbSize, iOffset);
if( rc ) return rc;
pLast = p;
iOffset += szFrame;
p->flags |= PGHDR_WAL_APPEND;
}
/* Recalculate checksums within the wal file if required. */
if( isCommit && pWal->iReCksum ){
rc = walRewriteChecksums(pWal, iFrame);
if( rc ) return rc;
}
/* If this is the end of a transaction, then we might need to pad
** the transaction and/or sync the WAL file.
**
** Padding and syncing only occur if this set of frames complete a
** transaction and if PRAGMA synchronous=FULL. If synchronous==NORMAL
** or synchronous==OFF, then no padding or syncing are needed.
**
** If SQLITE_IOCAP_POWERSAFE_OVERWRITE is defined, then padding is not
** needed and only the sync is done. If padding is needed, then the
** final frame is repeated (with its commit mark) until the next sector
** boundary is crossed. Only the part of the WAL prior to the last
** sector boundary is synced; the part of the last frame that extends
** past the sector boundary is written after the sync.
*/
if( isCommit && WAL_SYNC_FLAGS(sync_flags)!=0 ){
int bSync = 1;
if( pWal->padToSectorBoundary ){
int sectorSize = sqlite3SectorSize(pWal->pWalFd);
w.iSyncPoint = ((iOffset+sectorSize-1)/sectorSize)*sectorSize;
bSync = (w.iSyncPoint==iOffset);
testcase( bSync );
while( iOffset<w.iSyncPoint ){
rc = walWriteOneFrame(&w, pLast, nTruncate, iOffset);
if( rc ) return rc;
iOffset += szFrame;
nExtra++;
assert( pLast!=0 );
}
}
if( bSync ){
assert( rc==SQLITE_OK );
rc = sqlite3OsSync(w.pFd, WAL_SYNC_FLAGS(sync_flags));
}
}
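/* Worked example of the padding rule (hypothetical numbers, assuming
** the standard 32-byte wal header and 24-byte frame headers): with
** szPage==4096 each frame occupies 4120 bytes, so after two frames
** iOffset = 32 + 2*4120 = 8272. With a 4096-byte sector, w.iSyncPoint
** rounds up to 12288, so the commit frame is repeated once (nExtra==1);
** walWriteToLog() syncs as that write crosses the 12288 boundary, and
** the final 104 bytes of the repeated frame land after the sync. */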
/* If this frame set completes the first transaction in the WAL and
** if PRAGMA journal_size_limit is set, then truncate the WAL to the
** journal size limit, if possible.
*/
if( isCommit && pWal->truncateOnCommit && pWal->mxWalSize>=0 ){
i64 sz = pWal->mxWalSize;
if( walFrameOffset(iFrame+nExtra+1, szPage)>pWal->mxWalSize ){
sz = walFrameOffset(iFrame+nExtra+1, szPage);
}
walLimitSize(pWal, sz);
pWal->truncateOnCommit = 0;
}
/* Append data to the wal-index. It is not necessary to lock the
** wal-index to do this as the SQLITE_SHM_WRITE lock held on the wal-index
** guarantees that there are no other writers, and no data that may
** be in use by existing readers is being overwritten.
*/
iFrame = pWal->hdr.mxFrame;
for(p=pList; p && rc==SQLITE_OK; p=p->pDirty){
if( (p->flags & PGHDR_WAL_APPEND)==0 ) continue;
iFrame++;
rc = walIndexAppend(pWal, iFrame, p->pgno);
}
assert( pLast!=0 || nExtra==0 );
while( rc==SQLITE_OK && nExtra>0 ){
iFrame++;
nExtra--;
rc = walIndexAppend(pWal, iFrame, pLast->pgno);
}
if( rc==SQLITE_OK ){
/* Update the private copy of the header. */
pWal->hdr.szPage = (u16)((szPage&0xff00) | (szPage>>16));
testcase( szPage<=32768 );
testcase( szPage>=65536 );
pWal->hdr.mxFrame = iFrame;
if( isCommit ){
pWal->hdr.iChange++;
pWal->hdr.nPage = nTruncate;
}
/* If this is a commit, update the wal-index header too. */
if( isCommit ){
walIndexWriteHdr(pWal);
pWal->iCallback = iFrame;
}
}
WALTRACE(("WAL%p: frame write %s\n", pWal, rc ? "failed" : "ok"));
return rc;
}
/*
** This routine is called to implement sqlite3_wal_checkpoint() and
** related interfaces.
**
** Obtain a CHECKPOINT lock and then backfill as much information as
** we can from WAL into the database.
**
** If parameter xBusy is not NULL, it is a pointer to a busy-handler
** callback. In this case this function runs a blocking checkpoint.
*/
int sqlite3WalCheckpoint(
Wal *pWal, /* Wal connection */
sqlite3 *db, /* Check this handle's interrupt flag */
int eMode, /* PASSIVE, FULL, RESTART, or TRUNCATE */
int (*xBusy)(void*), /* Function to call when busy */
void *pBusyArg, /* Context argument for xBusyHandler */
int sync_flags, /* Flags to sync db file with (or 0) */
int nBuf, /* Size of temporary buffer */
u8 *zBuf, /* Temporary buffer to use */
int *pnLog, /* OUT: Number of frames in WAL */
int *pnCkpt /* OUT: Number of backfilled frames in WAL */
){
int rc; /* Return code */
int isChanged = 0; /* True if a new wal-index header is loaded */
int eMode2 = eMode; /* Mode to pass to walCheckpoint() */
int (*xBusy2)(void*) = xBusy; /* Busy handler for eMode2 */
assert( pWal->ckptLock==0 );
assert( pWal->writeLock==0 );
/* EVIDENCE-OF: R-62920-47450 The busy-handler callback is never invoked
** in the SQLITE_CHECKPOINT_PASSIVE mode. */
assert( eMode!=SQLITE_CHECKPOINT_PASSIVE || xBusy==0 );
if( pWal->readOnly ) return SQLITE_READONLY;
WALTRACE(("WAL%p: checkpoint begins\n", pWal));
/* Enable blocking locks, if possible. If blocking locks are successfully
** enabled, set xBusy2=0 so that the busy-handler is never invoked. */
sqlite3WalDb(pWal, db);
(void)walEnableBlocking(pWal);
/* IMPLEMENTATION-OF: R-62028-47212 All calls obtain an exclusive
** "checkpoint" lock on the database file.
** EVIDENCE-OF: R-10421-19736 If any other process is running a
** checkpoint operation at the same time, the lock cannot be obtained and
** SQLITE_BUSY is returned.
** EVIDENCE-OF: R-53820-33897 Even if there is a busy-handler configured,
** it will not be invoked in this case.
*/
rc = walLockExclusive(pWal, WAL_CKPT_LOCK, 1);
testcase( rc==SQLITE_BUSY );
testcase( rc!=SQLITE_OK && xBusy2!=0 );
if( rc==SQLITE_OK ){
pWal->ckptLock = 1;
/* IMPLEMENTATION-OF: R-59782-36818 The SQLITE_CHECKPOINT_FULL, RESTART and
** TRUNCATE modes also obtain the exclusive "writer" lock on the database
** file.
**
** EVIDENCE-OF: R-60642-04082 If the writer lock cannot be obtained
** immediately, and a busy-handler is configured, it is invoked and the
** writer lock retried until either the busy-handler returns 0 or the
** lock is successfully obtained.
*/
if( eMode!=SQLITE_CHECKPOINT_PASSIVE ){
rc = walBusyLock(pWal, xBusy2, pBusyArg, WAL_WRITE_LOCK, 1);
if( rc==SQLITE_OK ){
pWal->writeLock = 1;
}else if( rc==SQLITE_BUSY ){
eMode2 = SQLITE_CHECKPOINT_PASSIVE;
xBusy2 = 0;
rc = SQLITE_OK;
}
}
}
/* Read the wal-index header. */
if( rc==SQLITE_OK ){
walDisableBlocking(pWal);
rc = walIndexReadHdr(pWal, &isChanged);
(void)walEnableBlocking(pWal);
if( isChanged && pWal->pDbFd->pMethods->iVersion>=3 ){
sqlite3OsUnfetch(pWal->pDbFd, 0, 0);
}
}
/* Copy data from the log to the database file. */
if( rc==SQLITE_OK ){
if( pWal->hdr.mxFrame && walPagesize(pWal)!=nBuf ){
rc = SQLITE_CORRUPT_BKPT;
}else{
rc = walCheckpoint(pWal, db, eMode2, xBusy2, pBusyArg, sync_flags, zBuf);
}
/* If no error occurred, set the output variables. */
if( rc==SQLITE_OK || rc==SQLITE_BUSY ){
if( pnLog ) *pnLog = (int)pWal->hdr.mxFrame;
if( pnCkpt ) *pnCkpt = (int)(walCkptInfo(pWal)->nBackfill);
}
}
if( isChanged ){
/* If a new wal-index header was loaded before the checkpoint was
** performed, then the pager-cache associated with pWal is now
** out of date. So zero the cached wal-index header to ensure that
** next time the pager opens a snapshot on this database it knows that
** the cache needs to be reset.
*/
memset(&pWal->hdr, 0, sizeof(WalIndexHdr));
}
walDisableBlocking(pWal);
sqlite3WalDb(pWal, 0);
/* Release the locks. */
sqlite3WalEndWriteTransaction(pWal);
if( pWal->ckptLock ){
walUnlockExclusive(pWal, WAL_CKPT_LOCK, 1);
pWal->ckptLock = 0;
}
WALTRACE(("WAL%p: checkpoint %s\n", pWal, rc ? "failed" : "ok"));
#ifdef SQLITE_ENABLE_SETLK_TIMEOUT
if( rc==SQLITE_BUSY_TIMEOUT ) rc = SQLITE_BUSY;
#endif
return (rc==SQLITE_OK && eMode!=eMode2 ? SQLITE_BUSY : rc);
}
/* Return the value to pass to a sqlite3_wal_hook callback, the
** number of frames in the WAL at the point of the last commit since
** sqlite3WalCallback() was called. If no commits have occurred since
** the last call, then return 0.
*/
int sqlite3WalCallback(Wal *pWal){
u32 ret = 0;
if( pWal ){
ret = pWal->iCallback;
pWal->iCallback = 0;
}
return (int)ret;
}
/*
** This function is called to change the WAL subsystem into or out
** of locking_mode=EXCLUSIVE.
**
** If op is zero, then attempt to change from locking_mode=EXCLUSIVE
** into locking_mode=NORMAL. This means that we must acquire a lock
** on the pWal->readLock byte. If the WAL is already in locking_mode=NORMAL
** or if the acquisition of the lock fails, then return 0. If the
** transition out of exclusive-mode is successful, return 1. This
** operation must occur while the pager is still holding the exclusive
** lock on the main database file.
**
** If op is one, then change from locking_mode=NORMAL into
** locking_mode=EXCLUSIVE. This means that the pWal->readLock must
** be released. Return 1 if the transition is made and 0 if the
** WAL is already in exclusive-locking mode - meaning that this
** routine is a no-op. The pager must already hold the exclusive lock
** on the main database file before invoking this operation.
**
** If op is negative, then do a dry-run of the op==1 case but do
** not actually change anything. The pager uses this to see if it
** should acquire the database exclusive lock prior to invoking
** the op==1 case.
*/
int sqlite3WalExclusiveMode(Wal *pWal, int op){
int rc;
assert( pWal->writeLock==0 );
assert( pWal->exclusiveMode!=WAL_HEAPMEMORY_MODE || op==-1 );
/* pWal->readLock is usually set, but might be -1 if there was a
** prior error while attempting to acquire a read-lock. This cannot
** happen if the connection is actually in exclusive mode (as no xShmLock
** locks are taken in this case). Nor should the pager attempt to
** upgrade to exclusive-mode following such an error.
*/
assert( pWal->readLock>=0 || pWal->lockError );
assert( pWal->readLock>=0 || (op<=0 && pWal->exclusiveMode==0) );
if( op==0 ){
if( pWal->exclusiveMode!=WAL_NORMAL_MODE ){
pWal->exclusiveMode = WAL_NORMAL_MODE;
if( walLockShared(pWal, WAL_READ_LOCK(pWal->readLock))!=SQLITE_OK ){
pWal->exclusiveMode = WAL_EXCLUSIVE_MODE;
}
rc = pWal->exclusiveMode==WAL_NORMAL_MODE;
}else{
/* Already in locking_mode=NORMAL */
rc = 0;
}
}else if( op>0 ){
assert( pWal->exclusiveMode==WAL_NORMAL_MODE );
assert( pWal->readLock>=0 );
walUnlockShared(pWal, WAL_READ_LOCK(pWal->readLock));
pWal->exclusiveMode = WAL_EXCLUSIVE_MODE;
rc = 1;
}else{
rc = pWal->exclusiveMode==WAL_NORMAL_MODE;
}
return rc;
}
/*
** Return true if the argument is non-NULL and the WAL module is using
** heap-memory for the wal-index. Otherwise, if the argument is NULL or the
** WAL module is using shared-memory, return false.
*/
int sqlite3WalHeapMemory(Wal *pWal){
return (pWal && pWal->exclusiveMode==WAL_HEAPMEMORY_MODE );
}
#ifdef SQLITE_ENABLE_SNAPSHOT
/* Create a snapshot object. The content of a snapshot is opaque to
** every other subsystem, so the WAL module can put whatever it needs
** in the object.
*/
int sqlite3WalSnapshotGet(Wal *pWal, sqlite3_snapshot **ppSnapshot){
int rc = SQLITE_OK;
WalIndexHdr *pRet;
static const u32 aZero[4] = { 0, 0, 0, 0 };
assert( pWal->readLock>=0 && pWal->writeLock==0 );
if( memcmp(&pWal->hdr.aFrameCksum[0],aZero,16)==0 ){
*ppSnapshot = 0;
return SQLITE_ERROR;
}
pRet = (WalIndexHdr*)sqlite3_malloc(sizeof(WalIndexHdr));
if( pRet==0 ){
rc = SQLITE_NOMEM;
}else{
memcpy(pRet, &pWal->hdr, sizeof(WalIndexHdr));
*ppSnapshot = (sqlite3_snapshot*)pRet;
}
return rc;
}
/* Try to open pSnapshot when the next read-transaction starts
*/
void sqlite3WalSnapshotOpen(
Wal *pWal,
sqlite3_snapshot *pSnapshot
){
pWal->pSnapshot = (WalIndexHdr*)pSnapshot;
}
/*
** Return a +ve value if snapshot p1 is newer than p2. A -ve value if
** p1 is older than p2 and zero if p1 and p2 are the same snapshot.
*/
int sqlite3_snapshot_cmp(sqlite3_snapshot *p1, sqlite3_snapshot *p2){
WalIndexHdr *pHdr1 = (WalIndexHdr*)p1;
WalIndexHdr *pHdr2 = (WalIndexHdr*)p2;
/* aSalt[0] is a copy of the value stored in the wal file header. It
** is incremented each time the wal file is restarted. */
if( pHdr1->aSalt[0]<pHdr2->aSalt[0] ) return -1;
if( pHdr1->aSalt[0]>pHdr2->aSalt[0] ) return +1;
if( pHdr1->mxFrame<pHdr2->mxFrame ) return -1;
if( pHdr1->mxFrame>pHdr2->mxFrame ) return +1;
return 0;
}
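/* A minimal usage sketch of the snapshot interfaces above (for
** illustration; error handling elided):
**
**   sqlite3_snapshot *pSnap = 0;
**   rc = sqlite3WalSnapshotGet(pWal, &pSnap);   -- inside a read txn
**   ...
**   sqlite3WalSnapshotOpen(pWal, pSnap);        -- request the snapshot
**   rc = sqlite3WalBeginReadTransaction(pWal, &changed);
**   ...
**   sqlite3_free(pSnap);
**
** SQLITE_ERROR_SNAPSHOT from the begin call means the snapshot has been
** checkpointed over or the wal file has been reset since it was taken. */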
/*
** The caller currently has a read transaction open on the database.
** This function takes a SHARED lock on the CHECKPOINTER slot and then
** checks if the snapshot passed as the second argument is still
** available. If so, SQLITE_OK is returned.
**
** If the snapshot is not available, SQLITE_ERROR_SNAPSHOT is returned. Or, if
** the CHECKPOINTER lock cannot be obtained, SQLITE_BUSY. If any error
** occurs (any value other than SQLITE_OK is returned), the CHECKPOINTER
** lock is released before returning.
*/
int sqlite3WalSnapshotCheck(Wal *pWal, sqlite3_snapshot *pSnapshot){
int rc;
rc = walLockShared(pWal, WAL_CKPT_LOCK);
if( rc==SQLITE_OK ){
WalIndexHdr *pNew = (WalIndexHdr*)pSnapshot;
if( memcmp(pNew->aSalt, pWal->hdr.aSalt, sizeof(pWal->hdr.aSalt))
|| pNew->mxFrame<walCkptInfo(pWal)->nBackfillAttempted
){
rc = SQLITE_ERROR_SNAPSHOT;
walUnlockShared(pWal, WAL_CKPT_LOCK);
}
}
return rc;
}
/*
** Release a lock obtained by an earlier successful call to
** sqlite3WalSnapshotCheck().
*/
void sqlite3WalSnapshotUnlock(Wal *pWal){
assert( pWal );
walUnlockShared(pWal, WAL_CKPT_LOCK);
}
#endif /* SQLITE_ENABLE_SNAPSHOT */
#ifdef SQLITE_ENABLE_ZIPVFS
/*
** If the argument is not NULL, it points to a Wal object that holds a
** read-lock. This function returns the database page-size if it is known,
** or zero if it is not (or if pWal is NULL).
*/
int sqlite3WalFramesize(Wal *pWal){
assert( pWal==0 || pWal->readLock>=0 );
return (pWal ? pWal->szPage : 0);
}
#endif
/* Return the sqlite3_file object for the WAL file
*/
sqlite3_file *sqlite3WalFile(Wal *pWal){
return pWal->pWalFd;
}
#endif /* #ifndef SQLITE_OMIT_WAL */
/*
** 2001 September 15
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
** This header file defines the interface to the sqlite B-Tree file
** subsystem. See comments in the source code for a detailed description
** of what each interface routine does.
*/
#ifndef SQLITE_BTREE_H
#define SQLITE_BTREE_H
/* TODO: This definition is just included so other modules compile. It
** needs to be revisited.
*/
#define SQLITE_N_BTREE_META 16
/*
** If defined as non-zero, auto-vacuum is enabled by default. Otherwise
** it must be turned on for each database using "PRAGMA auto_vacuum = 1".
*/
#ifndef SQLITE_DEFAULT_AUTOVACUUM
#define SQLITE_DEFAULT_AUTOVACUUM 0
#endif
#define BTREE_AUTOVACUUM_NONE 0 /* Do not do auto-vacuum */
#define BTREE_AUTOVACUUM_FULL 1 /* Do full auto-vacuum */
#define BTREE_AUTOVACUUM_INCR 2 /* Incremental vacuum */
/*
** Forward declarations of structures
*/
typedef struct Btree Btree;
typedef struct BtCursor BtCursor;
typedef struct BtShared BtShared;
typedef struct BtreePayload BtreePayload;
int sqlite3BtreeOpen(
sqlite3_vfs *pVfs, /* VFS to use with this b-tree */
const char *zFilename, /* Name of database file to open */
sqlite3 *db, /* Associated database connection */
Btree **ppBtree, /* Return open Btree* here */
int flags, /* Flags */
int vfsFlags /* Flags passed through to VFS open */
);
/* The flags parameter to sqlite3BtreeOpen can be the bitwise or of the
** following values.
**
** NOTE: These values must match the corresponding PAGER_ values in
** pager.h.
*/
#define BTREE_OMIT_JOURNAL 1 /* Do not create or use a rollback journal */
#define BTREE_MEMORY 2 /* This is an in-memory DB */
#define BTREE_SINGLE 4 /* The file contains at most 1 b-tree */
#define BTREE_UNORDERED 8 /* Use of a hash implementation is OK */
int sqlite3BtreeClose(Btree*);
int sqlite3BtreeSetCacheSize(Btree*,int);
int sqlite3BtreeSetSpillSize(Btree*,int);
#if SQLITE_MAX_MMAP_SIZE>0
int sqlite3BtreeSetMmapLimit(Btree*,sqlite3_int64);
#endif
int sqlite3BtreeSetPagerFlags(Btree*,unsigned);
int sqlite3BtreeSetPageSize(Btree *p, int nPagesize, int nReserve, int eFix);
int sqlite3BtreeGetPageSize(Btree*);
Pgno sqlite3BtreeMaxPageCount(Btree*,Pgno);
Pgno sqlite3BtreeLastPage(Btree*);
int sqlite3BtreeSecureDelete(Btree*,int);
int sqlite3BtreeGetRequestedReserve(Btree*);
int sqlite3BtreeGetReserveNoMutex(Btree *p);
int sqlite3BtreeSetAutoVacuum(Btree *, int);
int sqlite3BtreeGetAutoVacuum(Btree *);
int sqlite3BtreeBeginTrans(Btree*,int,int*);
int sqlite3BtreeCommitPhaseOne(Btree*, const char*);
int sqlite3BtreeCommitPhaseTwo(Btree*, int);
int sqlite3BtreeCommit(Btree*);
int sqlite3BtreeRollback(Btree*,int,int);
int sqlite3BtreeBeginStmt(Btree*,int);
int sqlite3BtreeCreateTable(Btree*, Pgno*, int flags);
int sqlite3BtreeTxnState(Btree*);
int sqlite3BtreeIsInBackup(Btree*);
void *sqlite3BtreeSchema(Btree *, int, void(*)(void *));
int sqlite3BtreeSchemaLocked(Btree *pBtree);
#ifndef SQLITE_OMIT_SHARED_CACHE
int sqlite3BtreeLockTable(Btree *pBtree, int iTab, u8 isWriteLock);
#endif
/* Savepoints are named, nestable SQL transactions mostly implemented */
/* in vdbe.c and pager.c. See https://sqlite.org/lang_savepoint.html */
int sqlite3BtreeSavepoint(Btree *, int, int);
/* "Checkpoint" only refers to WAL. See https://sqlite.org/wal.html#ckpt */
#ifndef SQLITE_OMIT_WAL
int sqlite3BtreeCheckpoint(Btree*, int, int *, int *);
#endif
const char *sqlite3BtreeGetFilename(Btree *);
const char *sqlite3BtreeGetJournalname(Btree *);
int sqlite3BtreeCopyFile(Btree *, Btree *);
int sqlite3BtreeIncrVacuum(Btree *);
/* The flags parameter to sqlite3BtreeCreateTable can be the bitwise OR
** of the flags shown below.
**
** Every SQLite table must have either BTREE_INTKEY or BTREE_BLOBKEY set.
** With BTREE_INTKEY, the table key is a 64-bit integer and arbitrary data
** is stored in the leaves. (BTREE_INTKEY is used for SQL tables.) With
** BTREE_BLOBKEY, the key is an arbitrary BLOB and no content is stored
** anywhere - the key is the content. (BTREE_BLOBKEY is used for SQL
** indices.)
*/
#define BTREE_INTKEY 1 /* Table has only 64-bit signed integer keys */
#define BTREE_BLOBKEY 2 /* Table has keys only - no data */
int sqlite3BtreeDropTable(Btree*, int, int*);
int sqlite3BtreeClearTable(Btree*, int, i64*);
int sqlite3BtreeClearTableOfCursor(BtCursor*);
int sqlite3BtreeTripAllCursors(Btree*, int, int);
void sqlite3BtreeGetMeta(Btree *pBtree, int idx, u32 *pValue);
int sqlite3BtreeUpdateMeta(Btree*, int idx, u32 value);
int sqlite3BtreeNewDb(Btree *p);
/*
** The second parameter to sqlite3BtreeGetMeta or sqlite3BtreeUpdateMeta
** should be one of the following values. The integer values are assigned
** to constants so that the offset of the corresponding field in an
** SQLite database header may be found using the following formula:
**
** offset = 36 + (idx * 4)
**
** For example, the free-page-count field is located at byte offset 36 of
** the database file header. The incr-vacuum-flag field is located at
** byte offset 64 (== 36+4*7).
**
** The BTREE_DATA_VERSION value is not really a value stored in the header.
** It is a read-only number computed by the pager. But we merge it with
** the header value access routines since its access pattern is the same.
** Call it a "virtual meta value".
*/
#define BTREE_FREE_PAGE_COUNT 0
#define BTREE_SCHEMA_VERSION 1
#define BTREE_FILE_FORMAT 2
#define BTREE_DEFAULT_CACHE_SIZE 3
#define BTREE_LARGEST_ROOT_PAGE 4
#define BTREE_TEXT_ENCODING 5
#define BTREE_USER_VERSION 6
#define BTREE_INCR_VACUUM 7
#define BTREE_APPLICATION_ID 8
#define BTREE_DATA_VERSION 15 /* A virtual meta-value */
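/* Example (a sketch assuming an open transaction on Btree *pBt): read
** the schema-version meta value with sqlite3BtreeGetMeta() declared
** above:
**
**   u32 schemaVersion;
**   sqlite3BtreeGetMeta(pBt, BTREE_SCHEMA_VERSION, &schemaVersion);
**
** BTREE_SCHEMA_VERSION has idx==1, so this reads the 4-byte big-endian
** value at byte offset 36 + 1*4 == 40 of the database file header. */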
/*
** Kinds of hints that can be passed into the sqlite3BtreeCursorHint()
** interface.
**
** BTREE_HINT_RANGE (arguments: Expr*, Mem*)
**
** The first argument is an Expr* (which is guaranteed to be constant for
** the lifetime of the cursor) that defines constraints on which rows
** might be fetched with this cursor. The Expr* tree may contain
** TK_REGISTER nodes that refer to values stored in the array of registers
** passed as the second parameter. In other words, if Expr.op==TK_REGISTER
** then the value of the node is the value in Mem[pExpr.iTable]. Any
** TK_COLUMN node in the expression tree refers to the Expr.iColumn-th
** column of the b-tree of the cursor. The Expr tree will not contain
** any function calls nor subqueries nor references to b-trees other than
** the cursor being hinted.
**
** The design of the _RANGE hint is to aid b-tree implementations that try
** to prefetch content from remote machines - to provide those
** implementations with limits on what needs to be prefetched and thereby
** reduce network bandwidth.
**
** Note that BTREE_HINT_FLAGS with BTREE_BULKLOAD is the only hint used by
** standard SQLite. The other hints are provided for extensions that use
** the SQLite parser and code generator but substitute their own storage
** engine.
*/
#define BTREE_HINT_RANGE 0 /* Range constraints on queries */
/*
** Values that may be OR'd together to form the argument to the
** BTREE_HINT_FLAGS hint for sqlite3BtreeCursorHint():
**
** The BTREE_BULKLOAD flag is set on index cursors when the index is going
** to be filled with content that is already in sorted order.
**
** The BTREE_SEEK_EQ flag is set on cursors that will get OP_SeekGE or
** OP_SeekLE opcodes for a range search, but where the range of entries
** selected will all have the same key. In other words, the cursor will
** be used only for equality key searches.
**
*/
#define BTREE_BULKLOAD 0x00000001 /* Used to fill index in sorted order */
#define BTREE_SEEK_EQ 0x00000002 /* EQ seeks only - no range seeks */
/*
** Flags passed as the third argument to sqlite3BtreeCursor().
**
** For read-only cursors the wrFlag argument is always zero. For read-write
** cursors it may be set to either (BTREE_WRCSR|BTREE_FORDELETE) or just
** (BTREE_WRCSR). If the BTREE_FORDELETE bit is set, then the cursor will
** only be used by SQLite for the following:
**
** * to seek to and then delete specific entries, and/or
**
** * to read values that will be used to create keys that other
** BTREE_FORDELETE cursors will seek to and delete.
**
** The BTREE_FORDELETE flag is an optimization hint. It is not used
** by this, the native b-tree engine of SQLite, but it is available to
** alternative storage engines that might be substituted in place of this
** b-tree system. For alternative storage engines in which a delete of
** the main table row automatically deletes corresponding index rows,
** the FORDELETE flag hint allows those alternative storage engines to
** skip a lot of work. Namely: FORDELETE cursors may treat all SEEK
** and DELETE operations as no-ops, and any READ operation against a
** FORDELETE cursor may return a null row: 0x01 0x00.
*/
#define BTREE_WRCSR 0x00000004 /* read-write cursor */
#define BTREE_FORDELETE 0x00000008 /* Cursor is for seek/delete only */
int sqlite3BtreeCursor(
Btree*, /* BTree containing table to open */
Pgno iTable, /* Index of root page */
int wrFlag, /* 1 for writing. 0 for read-only */
struct KeyInfo*, /* First argument to compare function */
BtCursor *pCursor /* Space to write cursor structure */
);
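/*
** Illustrative sketch (hypothetical helper, not part of this interface):
** opening a cursor that will only be used to seek to and delete entries,
** per the BTREE_FORDELETE discussion above. The KeyInfo argument is zero
** because a table (intkey) b-tree is assumed.
*/
#if 0 /* example only; never compiled */
static int exampleOpenDeleteCursor(Btree *pBtree, Pgno iTable, BtCursor *pCur){
  sqlite3BtreeCursorZero(pCur);  /* cursor memory must be zeroed first */
  return sqlite3BtreeCursor(pBtree, iTable,
                            BTREE_WRCSR|BTREE_FORDELETE, 0, pCur);
}
#endif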
BtCursor *sqlite3BtreeFakeValidCursor(void);
int sqlite3BtreeCursorSize(void);
void sqlite3BtreeCursorZero(BtCursor*);
void sqlite3BtreeCursorHintFlags(BtCursor*, unsigned);
#ifdef SQLITE_ENABLE_CURSOR_HINTS
void sqlite3BtreeCursorHint(BtCursor*, int, ...);
#endif
int sqlite3BtreeCloseCursor(BtCursor*);
int sqlite3BtreeTableMoveto(
BtCursor*,
i64 intKey,
int bias,
int *pRes
);
int sqlite3BtreeIndexMoveto(
BtCursor*,
UnpackedRecord *pUnKey,
int *pRes
);
int sqlite3BtreeCursorHasMoved(BtCursor*);
int sqlite3BtreeCursorRestore(BtCursor*, int*);
int sqlite3BtreeDelete(BtCursor*, u8 flags);
/* Allowed flags for sqlite3BtreeDelete() and sqlite3BtreeInsert() */
#define BTREE_SAVEPOSITION 0x02 /* Leave cursor pointing at NEXT or PREV */
#define BTREE_AUXDELETE 0x04 /* not the primary delete operation */
#define BTREE_APPEND 0x08 /* Insert is likely an append */
#define BTREE_PREFORMAT 0x80 /* Inserted data is a preformatted cell */
/* An instance of the BtreePayload object describes the content of a single
** entry in either an index or table btree.
**
** Index btrees (used for indexes and also WITHOUT ROWID tables) contain
** an arbitrary key and no data. These btrees have pKey,nKey set to the
** key and the pData,nData,nZero fields are uninitialized. The aMem,nMem
** fields give an array of Mem objects that are a decomposition of the key.
** The nMem field might be zero, indicating that no decomposition is available.
**
** Table btrees (used for rowid tables) contain an integer rowid used as
** the key and passed in the nKey field. The pKey field is zero.
** pData,nData hold the content of the new entry. nZero extra zero bytes
** are appended to the end of the content when constructing the entry.
** The aMem,nMem fields are uninitialized for table btrees.
**
** Field usage summary:
**
** Table BTrees Index Btrees
**
** pKey always NULL encoded key
** nKey the ROWID length of pKey
** pData data not used
** aMem not used decomposed key value
** nMem not used entries in aMem
** nData length of pData not used
** nZero extra zeros after pData not used
**
** This object is used to pass information into sqlite3BtreeInsert(). The
** same information used to be passed as five separate parameters. But placing
** the information into this object helps to keep the interface more
** organized and understandable, and it also helps the resulting code to
** run a little faster by using fewer registers for parameter passing.
*/
struct BtreePayload {
const void *pKey; /* Key content for indexes. NULL for tables */
  sqlite3_int64 nKey; /* Size of pKey for indexes. PRIMARY KEY for tables */
const void *pData; /* Data for tables. */
  sqlite3_value *aMem; /* First of nMem values in the unpacked pKey */
  u16 nMem;            /* Number of aMem[] values. Might be zero */
int nData; /* Size of pData. 0 if none. */
int nZero; /* Extra zero data appended after pData,nData */
};
int sqlite3BtreeInsert(BtCursor*, const BtreePayload *pPayload,
int flags, int seekResult);
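/*
** Illustrative sketch (hypothetical helper, not part of this interface):
** filling in a BtreePayload for a rowid-table insert per the field-usage
** table above. pKey stays NULL, nKey carries the ROWID, and pData/nData
** hold the row content; memset() covers the unused fields.
*/
#if 0 /* example only; never compiled */
static int exampleTableInsert(BtCursor *pCur, i64 iRowid,
                              const void *pData, int nData){
  BtreePayload x;
  memset(&x, 0, sizeof(x));  /* pKey==0, nZero==0, aMem/nMem unused */
  x.nKey = iRowid;           /* integer key for a table b-tree */
  x.pData = pData;
  x.nData = nData;
  return sqlite3BtreeInsert(pCur, &x, 0, 0);
}
#endif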
int sqlite3BtreeFirst(BtCursor*, int *pRes);
int sqlite3BtreeLast(BtCursor*, int *pRes);
int sqlite3BtreeNext(BtCursor*, int flags);
int sqlite3BtreeEof(BtCursor*);
int sqlite3BtreePrevious(BtCursor*, int flags);
i64 sqlite3BtreeIntegerKey(BtCursor*);
void sqlite3BtreeCursorPin(BtCursor*);
void sqlite3BtreeCursorUnpin(BtCursor*);
#ifdef SQLITE_ENABLE_OFFSET_SQL_FUNC
i64 sqlite3BtreeOffset(BtCursor*);
#endif
int sqlite3BtreePayload(BtCursor*, u32 offset, u32 amt, void*);
const void *sqlite3BtreePayloadFetch(BtCursor*, u32 *pAmt);
u32 sqlite3BtreePayloadSize(BtCursor*);
sqlite3_int64 sqlite3BtreeMaxRecordSize(BtCursor*);
char *sqlite3BtreeIntegrityCheck(sqlite3*,Btree*,Pgno*aRoot,int nRoot,int,int*);
struct Pager *sqlite3BtreePager(Btree*);
i64 sqlite3BtreeRowCountEst(BtCursor*);
#ifndef SQLITE_OMIT_INCRBLOB
int sqlite3BtreePayloadChecked(BtCursor*, u32 offset, u32 amt, void*);
int sqlite3BtreePutData(BtCursor*, u32 offset, u32 amt, void*);
void sqlite3BtreeIncrblobCursor(BtCursor *);
#endif
void sqlite3BtreeClearCursor(BtCursor *);
int sqlite3BtreeSetVersion(Btree *pBt, int iVersion);
int sqlite3BtreeCursorHasHint(BtCursor*, unsigned int mask);
int sqlite3BtreeIsReadonly(Btree *pBt);
int sqlite3HeaderSizeBtree(void);
#ifdef SQLITE_DEBUG
sqlite3_uint64 sqlite3BtreeSeekCount(Btree*);
#else
# define sqlite3BtreeSeekCount(X) 0
#endif
#ifndef NDEBUG
int sqlite3BtreeCursorIsValid(BtCursor*);
#endif
int sqlite3BtreeCursorIsValidNN(BtCursor*);
int sqlite3BtreeCount(sqlite3*, BtCursor*, i64*);
#ifdef SQLITE_TEST
int sqlite3BtreeCursorInfo(BtCursor*, int*, int);
void sqlite3BtreeCursorList(Btree*);
#endif
#ifndef SQLITE_OMIT_WAL
int sqlite3BtreeCheckpoint(Btree*, int, int *, int *);
#endif
int sqlite3BtreeTransferRow(BtCursor*, BtCursor*, i64);
/*
** If we are not using shared cache, then there is no need to
** use mutexes to access the BtShared structures. So make the
** Enter and Leave procedures no-ops.
*/
#ifndef SQLITE_OMIT_SHARED_CACHE
void sqlite3BtreeEnter(Btree*);
void sqlite3BtreeEnterAll(sqlite3*);
int sqlite3BtreeSharable(Btree*);
void sqlite3BtreeEnterCursor(BtCursor*);
int sqlite3BtreeConnectionCount(Btree*);
#else
# define sqlite3BtreeEnter(X)
# define sqlite3BtreeEnterAll(X)
# define sqlite3BtreeSharable(X) 0
# define sqlite3BtreeEnterCursor(X)
# define sqlite3BtreeConnectionCount(X) 1
#endif
#if !defined(SQLITE_OMIT_SHARED_CACHE) && SQLITE_THREADSAFE
void sqlite3BtreeLeave(Btree*);
void sqlite3BtreeLeaveCursor(BtCursor*);
void sqlite3BtreeLeaveAll(sqlite3*);
#ifndef NDEBUG
/* These routines are used inside assert() statements only. */
int sqlite3BtreeHoldsMutex(Btree*);
int sqlite3BtreeHoldsAllMutexes(sqlite3*);
int sqlite3SchemaMutexHeld(sqlite3*,int,Schema*);
#endif
#else
# define sqlite3BtreeLeave(X)
# define sqlite3BtreeLeaveCursor(X)
# define sqlite3BtreeLeaveAll(X)
# define sqlite3BtreeHoldsMutex(X) 1
# define sqlite3BtreeHoldsAllMutexes(X) 1
# define sqlite3SchemaMutexHeld(X,Y,Z) 1
#endif
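/*
** Illustrative sketch (hypothetical helper, not part of this interface):
** Btree-level calls are bracketed by the Enter/Leave pair above so that
** the BtShared mutex is held throughout when shared cache is in use; in
** non-shared builds both calls compile away to no-ops.
*/
#if 0 /* example only; never compiled */
static u32 exampleReadSchemaCookie(Btree *p){
  u32 v;
  sqlite3BtreeEnter(p);                              /* take BtShared.mutex */
  sqlite3BtreeGetMeta(p, BTREE_SCHEMA_VERSION, &v);  /* safe under mutex */
  sqlite3BtreeLeave(p);
  return v;
}
#endif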
#endif /* SQLITE_BTREE_H */
/*
** 2004 April 6
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
** This file implements an external (disk-based) database using BTrees.
** For a detailed discussion of BTrees, refer to
**
** Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3:
** "Sorting And Searching", pages 473-480. Addison-Wesley
** Publishing Company, Reading, Massachusetts.
**
** The basic idea is that each page of the file contains N database
** entries and N+1 pointers to subpages.
**
** ----------------------------------------------------------------
** | Ptr(0) | Key(0) | Ptr(1) | Key(1) | ... | Key(N-1) | Ptr(N) |
** ----------------------------------------------------------------
**
** All of the keys on the page that Ptr(0) points to have values less
** than Key(0). All of the keys on page Ptr(1) and its subpages have
** values greater than Key(0) and less than Key(1). All of the keys
** on Ptr(N) and its subpages have values greater than Key(N-1). And
** so forth.
**
** Finding a particular key requires reading O(log(M)) pages from the
** disk where M is the number of entries in the tree.
**
** In this implementation, a single file can hold one or more separate
** BTrees. Each BTree is identified by the index of its root page. The
** key and data for any entry are combined to form the "payload". A
** fixed amount of payload can be carried directly on the database
** page. If the payload is larger than the preset amount then surplus
** bytes are stored on overflow pages. The payload for an entry
** and the preceding pointer are combined to form a "Cell". Each
** page has a small header which contains the Ptr(N) pointer and other
** information such as the size of key and data.
**
** FORMAT DETAILS
**
** The file is divided into pages. The first page is called page 1,
** the second is page 2, and so forth. A page number of zero indicates
** "no such page". The page size can be any power of 2 between 512 and 65536.
** Each page can be either a btree page, a freelist page, an overflow
** page, or a pointer-map page.
**
** The first page is always a btree page. The first 100 bytes of the first
** page contain a special header (the "file header") that describes the file.
** The format of the file header is as follows:
**
** OFFSET SIZE DESCRIPTION
** 0 16 Header string: "SQLite format 3\000"
** 16 2 Page size in bytes. (1 means 65536)
** 18 1 File format write version
** 19 1 File format read version
** 20 1 Bytes of unused space at the end of each page
** 21 1 Max embedded payload fraction (must be 64)
** 22 1 Min embedded payload fraction (must be 32)
** 23 1 Min leaf payload fraction (must be 32)
** 24 4 File change counter
** 28 4 Reserved for future use
** 32 4 First freelist page
** 36 4 Number of freelist pages in the file
** 40 60 15 4-byte meta values passed to higher layers
**
** 40 4 Schema cookie
** 44 4 File format of schema layer
** 48 4 Size of page cache
** 52 4 Largest root-page (auto/incr_vacuum)
** 56 4 1=UTF-8 2=UTF16le 3=UTF16be
** 60 4 User version
** 64 4 Incremental vacuum mode
** 68 4 Application-ID
** 72 20 unused
** 92 4 The version-valid-for number
** 96 4 SQLITE_VERSION_NUMBER
**
** All of the integer values are big-endian (most significant byte first).
**
** The file change counter is incremented when the database is changed.
** This counter allows other processes to know when the file has changed
** and thus when they need to flush their cache.
**
** The max embedded payload fraction is the amount of the total usable
** space in a page that can be consumed by a single cell for standard
** B-tree (non-LEAFDATA) tables. A value of 255 means 100%. The default
** is to limit the maximum cell size so that at least 4 cells will fit
** on one page. Thus the default max embedded payload fraction is 64.
**
** If the payload for a cell is larger than the max payload, then extra
** payload is spilled to overflow pages. Once an overflow page is allocated,
** as many bytes as possible are moved into the overflow pages without letting
** the cell size drop below the min embedded payload fraction.
**
** The min leaf payload fraction is like the min embedded payload fraction
** except that it applies to leaf nodes in a LEAFDATA tree. The maximum
** payload fraction for a LEAFDATA tree is always 100% (or 255) and it
** is not specified in the header.
**
** Each btree page is divided into three sections: The header, the
** cell pointer array, and the cell content area. Page 1 also has a 100-byte
** file header that occurs before the page header.
**
** |----------------|
** | file header | 100 bytes. Page 1 only.
** |----------------|
** | page header | 8 bytes for leaves. 12 bytes for interior nodes
** |----------------|
** | cell pointer | | 2 bytes per cell. Sorted order.
** | array | | Grows downward
** | | v
** |----------------|
** | unallocated |
** | space |
** |----------------| ^ Grows upwards
** | cell content | | Arbitrary order interspersed with freeblocks,
** | area | | and free space fragments.
** |----------------|
**
** The page header looks like this:
**
** OFFSET SIZE DESCRIPTION
** 0 1 Flags. 1: intkey, 2: zerodata, 4: leafdata, 8: leaf
** 1 2 byte offset to the first freeblock
** 3 2 number of cells on this page
** 5 2 first byte of the cell content area
** 7 1 number of fragmented free bytes
** 8 4 Right child (the Ptr(N) value). Omitted on leaves.
**
** The flags define the format of this btree page. The leaf flag means that
** this page has no children. The zerodata flag means that this page carries
** only keys and no data. The intkey flag means that the key is an integer
** which is stored in the key size entry of the cell header rather than in
** the payload area.
**
** The cell pointer array begins on the first byte after the page header.
** The cell pointer array contains zero or more 2-byte numbers which are
** offsets from the beginning of the page to the cell content in the cell
** content area. The cell pointers occur in sorted order. The system strives
** to keep free space after the last cell pointer so that new cells can
** be easily added without having to defragment the page.
**
** Cell content is stored at the very end of the page and grows toward the
** beginning of the page.
**
** Unused space within the cell content area is collected into a linked list of
** freeblocks. Each freeblock is at least 4 bytes in size. The byte offset
** to the first freeblock is given in the header. Freeblocks occur in
** increasing order. Because a freeblock must be at least 4 bytes in size,
** any group of 3 or fewer unused bytes in the cell content area cannot
** exist on the freeblock chain. A group of 3 or fewer free bytes is called
** a fragment. The total number of bytes in all fragments is recorded
** in the page header at offset 7.
**
** The first 4 bytes of each freeblock form a header:
**
** SIZE DESCRIPTION
** 2 Byte offset of the next freeblock
** 2 Bytes in this freeblock
**
** Cells are of variable length. Cells are stored in the cell content area at
** the end of the page. Pointers to the cells are in the cell pointer array
** that immediately follows the page header. Cells are not necessarily
** contiguous or in order, but cell pointers are contiguous and in order.
**
** Cell content makes use of variable length integers. A variable
** length integer is 1 to 9 bytes where the lower 7 bits of each
** byte are used. The integer consists of all bytes that have bit 8 set and
** the first byte with bit 8 clear. The most significant byte of the integer
** appears first. A variable-length integer may not be more than 9 bytes long.
** As a special case, all 8 bits of the 9th byte are used as data. This
** allows a 64-bit integer to be encoded in 9 bytes. (A decoding sketch
** appears below, after the include.)
**
** 0x00 becomes 0x00000000
** 0x7f becomes 0x0000007f
** 0x81 0x00 becomes 0x00000080
** 0x82 0x00 becomes 0x00000100
** 0x80 0x7f becomes 0x0000007f
** 0x81 0x91 0xd1 0xac 0x78 becomes 0x12345678
** 0x81 0x81 0x81 0x81 0x01 becomes 0x10204081
**
** Variable length integers are used for rowids and to hold the number of
** bytes of key and data in a btree cell.
**
** The content of a cell looks like this:
**
** SIZE DESCRIPTION
** 4 Page number of the left child. Omitted if leaf flag is set.
** var Number of bytes of data. Omitted if the zerodata flag is set.
** var Number of bytes of key. Or the key itself if intkey flag is set.
** * Payload
** 4 First page of the overflow chain. Omitted if no overflow
**
** Overflow pages form a linked list. Each page except the last is completely
** filled with data (pagesize - 4 bytes). The last page can have as little
** as 1 byte of data.
**
** SIZE DESCRIPTION
** 4 Page number of next overflow page
** * Data
**
** Freelist pages come in two subtypes: trunk pages and leaf pages. The
** file header points to the first in a linked list of trunk pages. Each trunk
** page points to multiple leaf pages. The content of a leaf page is
** unspecified. A trunk page looks like this:
**
** SIZE DESCRIPTION
** 4 Page number of next trunk page
** 4 Number of leaf pointers on this page
** * zero or more page numbers of leaves
*/
#include "sqliteInt.h"
/* The following value is the maximum cell size assuming a maximum page
** size given above.
*/
#define MX_CELL_SIZE(pBt) ((int)(pBt->pageSize-8))
/* The maximum number of cells on a single page of the database. This
** assumes a minimum cell size of 6 bytes (4 bytes for the cell itself
** plus 2 bytes for the index to the cell in the page header). Such
** small cells will be rare, but they are possible.
*/
#define MX_CELL(pBt) ((pBt->pageSize-8)/6)
/* Forward declarations */
typedef struct MemPage MemPage;
typedef struct BtLock BtLock;
typedef struct CellInfo CellInfo;
/*
** This is a magic string that appears at the beginning of every
** SQLite database in order to identify the file as a real database.
**
** You can change this value at compile-time by specifying a
** -DSQLITE_FILE_HEADER="..." on the compiler command-line. The
** header must be exactly 16 bytes including the zero-terminator so
** the string itself should be 15 characters long. If you change
** the header, then your custom library will not be able to read
** databases generated by the standard tools and the standard tools
** will not be able to read databases created by your custom library.
*/
#ifndef SQLITE_FILE_HEADER /* 123456789 123456 */
# define SQLITE_FILE_HEADER "SQLite format 3"
#endif
/*
** Page type flags. An ORed combination of these flags appears as the
** first byte of the on-disk image of every BTree page.
*/
#define PTF_INTKEY 0x01
#define PTF_ZERODATA 0x02
#define PTF_LEAFDATA 0x04
#define PTF_LEAF 0x08
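/*
** Illustrative sketch (hypothetical helpers, not part of this file):
** decoding the 8/12-byte page header laid out in the FORMAT DETAILS
** comment above. Multi-byte fields are big-endian, and page 1 carries
** the 100-byte file header before its page header.
*/
#if 0 /* example only; never compiled */
static u16 exampleGet2(const u8 *p){ return (u16)((p[0]<<8) | p[1]); }
static void exampleParsePageHeader(const u8 *aData, int isPage1){
  const u8 *aHdr = &aData[isPage1 ? 100 : 0];
  u8  flags      = aHdr[0];                /* ORed PTF_* values above */
  u16 iFreeblock = exampleGet2(&aHdr[1]);  /* first freeblock, 0 if none */
  u16 nCell      = exampleGet2(&aHdr[3]);  /* cells on this page */
  u16 iContent   = exampleGet2(&aHdr[5]);  /* start of cell content area */
  u8  nFrag      = aHdr[7];                /* fragmented free bytes */
  if( (flags & PTF_LEAF)==0 ){
    /* Interior pages append the 4-byte right-child (Ptr(N)) pointer */
    u32 pgnoRight = ((u32)aHdr[8]<<24) | ((u32)aHdr[9]<<16)
                  | ((u32)aHdr[10]<<8) | (u32)aHdr[11];
    (void)pgnoRight;
  }
  (void)iFreeblock; (void)nCell; (void)iContent; (void)nFrag;
}
#endif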
/*
** An instance of this object stores information about a single database
** page that has been loaded into memory. The information in this object
** is derived from the raw on-disk page content.
**
** As each database page is loaded into memory, the pager allocates an
** instance of this object and zeros the first 8 bytes. (This is the
** "extra" information associated with each page of the pager.)
**
** Access to all fields of this structure is controlled by the mutex
** stored in MemPage.pBt->mutex.
*/
struct MemPage {
u8 isInit; /* True if previously initialized. MUST BE FIRST! */
u8 intKey; /* True if table b-trees. False for index b-trees */
u8 intKeyLeaf; /* True if the leaf of an intKey table */
Pgno pgno; /* Page number for this page */
/* Only the first 8 bytes (above) are zeroed by pager.c when a new page
** is allocated. All fields that follow must be initialized before use */
u8 leaf; /* True if a leaf page */
u8 hdrOffset; /* 100 for page 1. 0 otherwise */
u8 childPtrSize; /* 0 if leaf==1. 4 if leaf==0 */
u8 max1bytePayload; /* min(maxLocal,127) */
u8 nOverflow; /* Number of overflow cell bodies in aCell[] */
u16 maxLocal; /* Copy of BtShared.maxLocal or BtShared.maxLeaf */
u16 minLocal; /* Copy of BtShared.minLocal or BtShared.minLeaf */
u16 cellOffset; /* Index in aData of first cell pointer */
int nFree; /* Number of free bytes on the page. -1 for unknown */
u16 nCell; /* Number of cells on this page, local and ovfl */
u16 maskPage; /* Mask for page offset */
u16 aiOvfl[4]; /* Insert the i-th overflow cell before the aiOvfl-th
** non-overflow cell */
u8 *apOvfl[4]; /* Pointers to the body of overflow cells */
BtShared *pBt; /* Pointer to BtShared that this page is part of */
u8 *aData; /* Pointer to disk image of the page data */
u8 *aDataEnd; /* One byte past the end of usable data */
u8 *aCellIdx; /* The cell index area */
u8 *aDataOfst; /* Same as aData for leaves. aData+4 for interior */
DbPage *pDbPage; /* Pager page handle */
u16 (*xCellSize)(MemPage*,u8*); /* cellSizePtr method */
void (*xParseCell)(MemPage*,u8*,CellInfo*); /* btreeParseCell method */
};
/*
** A linked list of the following structures is stored at BtShared.pLock.
** Locks are added (or upgraded from READ_LOCK to WRITE_LOCK) when a cursor
** is opened on the table with root page BtShared.iTable. Locks are removed
** from this list when a transaction is committed or rolled back, or when
** a btree handle is closed.
*/
struct BtLock {
Btree *pBtree; /* Btree handle holding this lock */
Pgno iTable; /* Root page of table */
u8 eLock; /* READ_LOCK or WRITE_LOCK */
BtLock *pNext; /* Next in BtShared.pLock list */
};
/* Candidate values for BtLock.eLock */
#define READ_LOCK 1
#define WRITE_LOCK 2
/* A Btree handle
**
** A database connection contains a pointer to an instance of
** this object for every database file that it has open. This structure
** is opaque to the database connection. The database connection cannot
** see the internals of this structure and only deals with pointers to
** this structure.
**
** For some database files, the same underlying database cache might be
** shared between multiple connections. In that case, each connection
** has its own instance of this object. But each instance of this object
** points to the same BtShared object. The database cache and the
** schema associated with the database file are all contained within
** the BtShared object.
**
** All fields in this structure are accessed under sqlite3.mutex.
** The pBt pointer itself may not be changed while there exist cursors
** in the referenced BtShared that point back to this Btree since those
** cursors have to go through this Btree to find their BtShared and
** they often do so without holding sqlite3.mutex.
*/
struct Btree {
sqlite3 *db; /* The database connection holding this btree */
BtShared *pBt; /* Sharable content of this btree */
u8 inTrans; /* TRANS_NONE, TRANS_READ or TRANS_WRITE */
u8 sharable; /* True if we can share pBt with another db */
u8 locked; /* True if db currently has pBt locked */
u8 hasIncrblobCur; /* True if there are one or more Incrblob cursors */
int wantToLock; /* Number of nested calls to sqlite3BtreeEnter() */
int nBackup; /* Number of backup operations reading this btree */
u32 iBDataVersion; /* Combines with pBt->pPager->iDataVersion */
Btree *pNext; /* List of other sharable Btrees from the same db */
Btree *pPrev; /* Back pointer of the same list */
#ifdef SQLITE_DEBUG
u64 nSeek; /* Calls to sqlite3BtreeMovetoUnpacked() */
#endif
#ifndef SQLITE_OMIT_SHARED_CACHE
BtLock lock; /* Object used to lock page 1 */
#endif
};
/*
** Btree.inTrans may take one of the following values.
**
** If the shared-data extension is enabled, there may be multiple users
** of the Btree structure. At most one of these may open a write transaction,
** but any number may have active read transactions.
**
** These values must match SQLITE_TXN_NONE, SQLITE_TXN_READ, and
** SQLITE_TXN_WRITE
*/
#define TRANS_NONE 0
#define TRANS_READ 1
#define TRANS_WRITE 2
#if TRANS_NONE!=SQLITE_TXN_NONE
# error wrong numeric code for no-transaction
#endif
#if TRANS_READ!=SQLITE_TXN_READ
# error wrong numeric code for read-transaction
#endif
#if TRANS_WRITE!=SQLITE_TXN_WRITE
# error wrong numeric code for write-transaction
#endif
/*
** An instance of this object represents a single database file.
**
** A single database file can be in use at the same time by two
** or more database connections. When two or more connections are
** sharing the same database file, each connection has its own
** private Btree object for the file and each of those Btrees points
** to this one BtShared object. BtShared.nRef is the number of
** connections currently sharing this database file.
**
** Fields in this structure are accessed under the BtShared.mutex
** mutex, except for nRef and pNext which are accessed under the
** global SQLITE_MUTEX_STATIC_MAIN mutex. The pPager field
** may not be modified once it is initially set as long as nRef>0.
** The pSchema field may be set once under BtShared.mutex and
** thereafter is unchanged as long as nRef>0.
**
** isPending:
**
** If a BtShared client fails to obtain a write-lock on a database
** table (because there exists one or more read-locks on the table),
** the shared-cache enters 'pending-lock' state and isPending is
** set to true.
**
** The shared-cache leaves the 'pending lock' state when either of
** the following occur:
**
** 1) The current writer (BtShared.pWriter) concludes its transaction, OR
** 2) The number of locks held by other connections drops to zero.
**
** While in the 'pending-lock' state, no connection may start a new
** transaction.
**
** This feature is included to help prevent writer-starvation.
*/
struct BtShared {
Pager *pPager; /* The page cache */
sqlite3 *db; /* Database connection currently using this Btree */
BtCursor *pCursor; /* A list of all open cursors */
MemPage *pPage1; /* First page of the database */
u8 openFlags; /* Flags to sqlite3BtreeOpen() */
#ifndef SQLITE_OMIT_AUTOVACUUM
u8 autoVacuum; /* True if auto-vacuum is enabled */
u8 incrVacuum; /* True if incr-vacuum is enabled */
u8 bDoTruncate; /* True to truncate db on commit */
#endif
u8 inTransaction; /* Transaction state */
u8 max1bytePayload; /* Maximum first byte of cell for a 1-byte payload */
u8 nReserveWanted; /* Desired number of extra bytes per page */
u16 btsFlags; /* Boolean parameters. See BTS_* macros below */
u16 maxLocal; /* Maximum local payload in non-LEAFDATA tables */
u16 minLocal; /* Minimum local payload in non-LEAFDATA tables */
u16 maxLeaf; /* Maximum local payload in a LEAFDATA table */
u16 minLeaf; /* Minimum local payload in a LEAFDATA table */
u32 pageSize; /* Total number of bytes on a page */
u32 usableSize; /* Number of usable bytes on each page */
int nTransaction; /* Number of open transactions (read + write) */
u32 nPage; /* Number of pages in the database */
void *pSchema; /* Pointer to space allocated by sqlite3BtreeSchema() */
void (*xFreeSchema)(void*); /* Destructor for BtShared.pSchema */
sqlite3_mutex *mutex; /* Non-recursive mutex required to access this object */
Bitvec *pHasContent; /* Set of pages moved to free-list this transaction */
#ifndef SQLITE_OMIT_SHARED_CACHE
int nRef; /* Number of references to this structure */
BtShared *pNext; /* Next on a list of sharable BtShared structs */
BtLock *pLock; /* List of locks held on this shared-btree struct */
Btree *pWriter; /* Btree with currently open write transaction */
#endif
u8 *pTmpSpace; /* Temp space sufficient to hold a single cell */
int nPreformatSize; /* Size of last cell written by TransferRow() */
};
/*
** Allowed values for BtShared.btsFlags
*/
#define BTS_READ_ONLY 0x0001 /* Underlying file is readonly */
#define BTS_PAGESIZE_FIXED 0x0002 /* Page size can no longer be changed */
#define BTS_SECURE_DELETE 0x0004 /* PRAGMA secure_delete is enabled */
#define BTS_OVERWRITE 0x0008 /* Overwrite deleted content with zeros */
#define BTS_FAST_SECURE 0x000c /* Combination of the previous two */
#define BTS_INITIALLY_EMPTY 0x0010 /* Database was empty at trans start */
#define BTS_NO_WAL 0x0020 /* Do not open write-ahead-log files */
#define BTS_EXCLUSIVE 0x0040 /* pWriter has an exclusive lock */
#define BTS_PENDING 0x0080 /* Waiting for read-locks to clear */
/*
** An instance of the following structure is used to hold information
** about a cell. The parseCellPtr() function fills in this structure
** based on information extracted from the raw disk page.
*/
struct CellInfo {
i64 nKey; /* The key for INTKEY tables, or nPayload otherwise */
u8 *pPayload; /* Pointer to the start of payload */
u32 nPayload; /* Bytes of payload */
u16 nLocal; /* Amount of payload held locally, not on overflow */
u16 nSize; /* Size of the cell content on the main b-tree page */
};
/*
** Maximum depth of an SQLite B-Tree structure. Any B-Tree deeper than
** this will be declared corrupt. This value is calculated based on a
** maximum database size of 2^31 pages and a minimum fanout of 2 for a
** root-node and 3 for all other internal nodes.
**
** If a tree that appears to be taller than this is encountered, it is
** assumed that the database is corrupt.
*/
#define BTCURSOR_MAX_DEPTH 20
/*
** A cursor is a pointer to a particular entry within a particular
** b-tree within a database file.
**
** The entry is identified by its MemPage and the index in
** MemPage.aCell[] of the entry.
**
** A single database file can be shared by two or more database connections,
** but cursors cannot be shared. Each cursor is associated with a
** particular database connection identified by BtCursor.pBtree.db.
**
** Fields in this structure are accessed under the BtShared.mutex
** found at self->pBt->mutex.
**
** skipNext meaning:
** The meaning of skipNext depends on the value of eState:
**
** eState Meaning of skipNext
** VALID skipNext is meaningless and is ignored
** INVALID skipNext is meaningless and is ignored
** SKIPNEXT sqlite3BtreeNext() is a no-op if skipNext>0 and
** sqlite3BtreePrevious() is a no-op if skipNext<0.
** REQUIRESEEK restoreCursorPosition() restores the cursor to
** eState=SKIPNEXT if skipNext!=0
** FAULT skipNext holds the cursor fault error code.
*/
struct BtCursor {
u8 eState; /* One of the CURSOR_XXX constants (see below) */
u8 curFlags; /* zero or more BTCF_* flags defined below */
u8 curPagerFlags; /* Flags to send to sqlite3PagerGet() */
u8 hints; /* As configured by CursorSetHints() */
int skipNext; /* Prev() is noop if negative. Next() is noop if positive.
** Error code if eState==CURSOR_FAULT */
Btree *pBtree; /* The Btree to which this cursor belongs */
Pgno *aOverflow; /* Cache of overflow page locations */
  void *pKey; /* Saved key that was the cursor's last known position */
/* All fields above are zeroed when the cursor is allocated. See
** sqlite3BtreeCursorZero(). Fields that follow must be manually
** initialized. */
#define BTCURSOR_FIRST_UNINIT pBt /* Name of first uninitialized field */
BtShared *pBt; /* The BtShared this cursor points to */
BtCursor *pNext; /* Forms a linked list of all cursors */
CellInfo info; /* A parse of the cell we are pointing at */
i64 nKey; /* Size of pKey, or last integer key */
Pgno pgnoRoot; /* The root page of this tree */
i8 iPage; /* Index of current page in apPage */
u8 curIntKey; /* Value of apPage[0]->intKey */
u16 ix; /* Current index for apPage[iPage] */
u16 aiIdx[BTCURSOR_MAX_DEPTH-1]; /* Current index in apPage[i] */
struct KeyInfo *pKeyInfo; /* Arg passed to comparison function */
MemPage *pPage; /* Current page */
MemPage *apPage[BTCURSOR_MAX_DEPTH-1]; /* Stack of parents of current page */
};
/*
** Legal values for BtCursor.curFlags
*/
#define BTCF_WriteFlag 0x01 /* True if a write cursor */
#define BTCF_ValidNKey 0x02 /* True if info.nKey is valid */
#define BTCF_ValidOvfl 0x04 /* True if aOverflow is valid */
#define BTCF_AtLast 0x08 /* Cursor is pointing at the last entry */
#define BTCF_Incrblob 0x10 /* True if an incremental I/O handle */
#define BTCF_Multiple 0x20 /* Maybe another cursor on the same btree */
#define BTCF_Pinned 0x40 /* Cursor is busy and cannot be moved */
/*
** Potential values for BtCursor.eState.
**
** CURSOR_INVALID:
** Cursor does not point to a valid entry. This can happen (for example)
** because the table is empty or because BtreeCursorFirst() has not been
** called.
**
** CURSOR_VALID:
** Cursor points to a valid entry. getPayload() etc. may be called.
**
** CURSOR_SKIPNEXT:
** Cursor is valid except that the Cursor.skipNext field is non-zero
** indicating that the next sqlite3BtreeNext() or sqlite3BtreePrevious()
** operation should be a no-op.
**
** CURSOR_REQUIRESEEK:
** The table that this cursor was opened on still exists, but has been
** modified since the cursor was last used. The cursor position is saved
** in variables BtCursor.pKey and BtCursor.nKey. When a cursor is in
** this state, restoreCursorPosition() can be called to attempt to
** seek the cursor to the saved position.
**
** CURSOR_FAULT:
** An unrecoverable error (an I/O error or a malloc failure) has occurred
** on a different connection that shares the BtShared cache with this
** cursor. The error has left the cache in an inconsistent state.
** Do nothing else with this cursor. Any attempt to use the cursor
** should return the error code stored in BtCursor.skipNext
*/
#define CURSOR_VALID 0
#define CURSOR_INVALID 1
#define CURSOR_SKIPNEXT 2
#define CURSOR_REQUIRESEEK 3
#define CURSOR_FAULT 4
/*
** The database page the PENDING_BYTE occupies. This page is never used.
*/
# define PENDING_BYTE_PAGE(pBt) PAGER_MJ_PGNO(pBt)
/*
** These macros define the location of the pointer-map entry for a
** database page. The first argument to each is the number of usable
** bytes on each page of the database (often 1024). The second is the
** page number to look up in the pointer map.
**
** PTRMAP_PAGENO returns the database page number of the pointer-map
** page that stores the required pointer. PTRMAP_PTROFFSET returns
** the offset of the requested map entry.
**
** If the pgno argument passed to PTRMAP_PAGENO is a pointer-map page,
** then pgno is returned. So (pgno==PTRMAP_PAGENO(pgsz, pgno)) can be
** used to test if pgno is a pointer-map page. PTRMAP_ISPAGE implements
** this test.
*/
#define PTRMAP_PAGENO(pBt, pgno) ptrmapPageno(pBt, pgno)
#define PTRMAP_PTROFFSET(pgptrmap, pgno) (5*(pgno-pgptrmap-1))
#define PTRMAP_ISPAGE(pBt, pgno) (PTRMAP_PAGENO((pBt),(pgno))==(pgno))
/*
** The pointer map is a lookup table that identifies the parent page for
** each child page in the database file. The parent page is the page that
** contains a pointer to the child. Every page in the database contains
** 0 or 1 parent pages. (In this context 'database page' refers
** to any page that is not part of the pointer map itself.) Each pointer map
** entry consists of a single byte 'type' and a 4 byte parent page number.
** The PTRMAP_XXX identifiers below are the valid types.
**
** The purpose of the pointer map is to facilitate moving pages from one
** position in the file to another as part of autovacuum. When a page
** is moved, the pointer in its parent must be updated to point to the
** new location. The pointer map is used to locate the parent page quickly.
**
** PTRMAP_ROOTPAGE: The database page is a root-page. The page-number is not
** used in this case.
**
** PTRMAP_FREEPAGE: The database page is an unused (free) page. The page-number
** is not used in this case.
**
** PTRMAP_OVERFLOW1: The database page is the first page in a list of
** overflow pages. The page number identifies the page that
** contains the cell with a pointer to this overflow page.
**
** PTRMAP_OVERFLOW2: The database page is the second or later page in a list of
** overflow pages. The page-number identifies the previous
** page in the overflow page list.
**
** PTRMAP_BTREE: The database page is a non-root btree page. The page number
** identifies the parent page in the btree.
*/
#define PTRMAP_ROOTPAGE 1
#define PTRMAP_FREEPAGE 2
#define PTRMAP_OVERFLOW1 3
#define PTRMAP_OVERFLOW2 4
#define PTRMAP_BTREE 5
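/*
** Illustrative sketch (hypothetical helper, not part of this file): a
** simplified version of the ptrmapPageno() logic behind PTRMAP_PAGENO
** above. Each pointer-map page holds usableSize/5 five-byte entries, so
** map pages recur at fixed intervals starting at page 2. The real routine
** also steps over the PENDING_BYTE page; that adjustment is omitted here.
*/
#if 0 /* example only; never compiled */
static Pgno examplePtrmapPageno(u32 usableSize, Pgno pgno){
  Pgno nPagesPerMapPage, iPtrMap;
  if( pgno<2 ) return 0;                  /* page 1 has no ptrmap entry */
  nPagesPerMapPage = (usableSize/5)+1;    /* the map page plus its entries */
  iPtrMap = (pgno-2)/nPagesPerMapPage;    /* which map group pgno is in */
  return (iPtrMap*nPagesPerMapPage) + 2;  /* first page of that group */
}
#endif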
/* A bunch of assert() statements to check the transaction state variables
** of handle p (type Btree*) are internally consistent.
*/
#define btreeIntegrity(p) \
assert( p->pBt->inTransaction!=TRANS_NONE || p->pBt->nTransaction==0 ); \
assert( p->pBt->inTransaction>=p->inTrans );
/*
** The ISAUTOVACUUM macro is used within balance_nonroot() to determine
** if the database supports auto-vacuum or not. Because it is used
** within an expression that is an argument to another macro
** (sqliteMallocRaw), it is not possible to use conditional compilation.
** So, this macro is defined instead.
*/
#ifndef SQLITE_OMIT_AUTOVACUUM
#define ISAUTOVACUUM (pBt->autoVacuum)
#else
#define ISAUTOVACUUM 0
#endif
/*
** This structure is passed around through all the sanity checking routines
** in order to keep track of some global state information.
**
** The aRef[] array is allocated so that there is 1 bit for each page in
** the database. As the integrity-check proceeds, for each page used in
** the database the corresponding bit is set. This allows integrity-check to
** detect pages that are used twice and orphaned pages (both of which
** indicate corruption).
*/
typedef struct IntegrityCk IntegrityCk;
struct IntegrityCk {
BtShared *pBt; /* The tree being checked out */
Pager *pPager; /* The associated pager. Also accessible by pBt->pPager */
u8 *aPgRef; /* 1 bit per page in the db (see above) */
Pgno nPage; /* Number of pages in the database */
int mxErr; /* Stop accumulating errors when this reaches zero */
int nErr; /* Number of messages written to zErrMsg so far */
int bOomFault; /* A memory allocation error has occurred */
const char *zPfx; /* Error message prefix */
Pgno v1; /* Value for first %u substitution in zPfx */
int v2; /* Value for second %d substitution in zPfx */
StrAccum errMsg; /* Accumulate the error message text here */
u32 *heap; /* Min-heap used for analyzing cell coverage */
sqlite3 *db; /* Database connection running the check */
};
/*
** Routines to read or write two- and four-byte big-endian integer values.
*/
#define get2byte(x) ((x)[0]<<8 | (x)[1])
#define put2byte(p,v) ((p)[0] = (u8)((v)>>8), (p)[1] = (u8)(v))
#define get4byte sqlite3Get4byte
#define put4byte sqlite3Put4byte
/*
** get2byteAligned(), unlike get2byte(), requires that its argument point to a
** two-byte aligned address. get2byteAligned() is only used for accessing the
** cell addresses in a btree header.
*/
#if SQLITE_BYTEORDER==4321
# define get2byteAligned(x) (*(u16*)(x))
#elif SQLITE_BYTEORDER==1234 && GCC_VERSION>=4008000
# define get2byteAligned(x) __builtin_bswap16(*(u16*)(x))
#elif SQLITE_BYTEORDER==1234 && MSVC_VERSION>=1300
# define get2byteAligned(x) _byteswap_ushort(*(u16*)(x))
#else
# define get2byteAligned(x) ((x)[0]<<8 | (x)[1])
#endif
/*
** 2001 September 15
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
** This header file defines the interface to the sqlite page cache
** subsystem. The page cache subsystem reads and writes a file a page
** at a time and provides a journal for rollback.
*/
#ifndef SQLITE_PAGER_H
#define SQLITE_PAGER_H
/*
** Default maximum size for persistent journal files. A negative
** value means no limit. This value may be overridden using the
** sqlite3PagerJournalSizeLimit() API. See also "PRAGMA journal_size_limit".
*/
#ifndef SQLITE_DEFAULT_JOURNAL_SIZE_LIMIT
#define SQLITE_DEFAULT_JOURNAL_SIZE_LIMIT -1
#endif
/*
** The type used to represent a page number. The first page in a file
** is called page 1. 0 is used to represent "not a page".
*/
typedef u32 Pgno;
/*
** Each open file is managed by a separate instance of the "Pager" structure.
*/
typedef struct Pager Pager;
/*
** Handle type for pages.
*/
typedef struct PgHdr DbPage;
// /*
// ** Page number PAGER_MJ_PGNO is never used in an SQLite database (it is
// ** reserved for working around a windows/posix incompatibility). It is
// ** used in the journal to signify that the remainder of the journal file
// ** is devoted to storing a super-journal name - there are no more pages to
// ** roll back. See comments for function writeSuperJournal() in pager.c
// ** for details.
// */
// #define PAGER_MJ_PGNO(x) ((Pgno)((PENDING_BYTE/((x)->pageSize))+1))
/*
** Allowed values for the flags parameter to sqlite3PagerOpen().
**
** NOTE: These values must match the corresponding BTREE_ values in btree.h.
*/
#define PAGER_OMIT_JOURNAL 0x0001 /* Do not use a rollback journal */
#define PAGER_MEMORY 0x0002 /* In-memory database */
/*
** Valid values for the second argument to sqlite3PagerLockingMode().
*/
#define PAGER_LOCKINGMODE_QUERY -1
#define PAGER_LOCKINGMODE_NORMAL 0
#define PAGER_LOCKINGMODE_EXCLUSIVE 1
/*
** Numeric constants that encode the journalmode.
**
** The numeric values encoded here (other than PAGER_JOURNALMODE_QUERY)
** are exposed in the API via the "PRAGMA journal_mode" command and
** therefore cannot be changed without a compatibility break.
*/
#define PAGER_JOURNALMODE_QUERY (-1) /* Query the value of journalmode */
#define PAGER_JOURNALMODE_DELETE 0 /* Commit by deleting journal file */
#define PAGER_JOURNALMODE_PERSIST 1 /* Commit by zeroing journal header */
#define PAGER_JOURNALMODE_OFF 2 /* Journal omitted. */
#define PAGER_JOURNALMODE_TRUNCATE 3 /* Commit by truncating journal */
#define PAGER_JOURNALMODE_MEMORY 4 /* In-memory journal file */
#define PAGER_JOURNALMODE_WAL 5 /* Use write-ahead logging */
/*
** Flags that make up the mask passed to sqlite3PagerGet().
*/
#define PAGER_GET_NOCONTENT 0x01 /* Do not load data from disk */
#define PAGER_GET_READONLY 0x02 /* Read-only page is acceptable */
/*
** Flags for sqlite3PagerSetFlags()
**
** Value constraints (enforced via assert()):
** PAGER_FULLFSYNC == SQLITE_FullFSync
** PAGER_CKPT_FULLFSYNC == SQLITE_CkptFullFSync
** PAGER_CACHE_SPILL == SQLITE_CacheSpill
*/
#define PAGER_SYNCHRONOUS_OFF 0x01 /* PRAGMA synchronous=OFF */
#define PAGER_SYNCHRONOUS_NORMAL 0x02 /* PRAGMA synchronous=NORMAL */
#define PAGER_SYNCHRONOUS_FULL 0x03 /* PRAGMA synchronous=FULL */
#define PAGER_SYNCHRONOUS_EXTRA 0x04 /* PRAGMA synchronous=EXTRA */
#define PAGER_SYNCHRONOUS_MASK 0x07 /* Mask for four values above */
#define PAGER_FULLFSYNC 0x08 /* PRAGMA fullfsync=ON */
#define PAGER_CKPT_FULLFSYNC 0x10 /* PRAGMA checkpoint_fullfsync=ON */
#define PAGER_CACHESPILL 0x20 /* PRAGMA cache_spill=ON */
#define PAGER_FLAGS_MASK 0x38 /* All above except SYNCHRONOUS */
/*
** The remainder of this file contains the declarations of the functions
** that make up the Pager sub-system API. See source code comments for
** a detailed description of each routine.
*/
/* Open and close a Pager connection. */
int sqlite3PagerOpen(
Pager **ppPager,
const char*,
int,
int,
int,
void(*)(DbPage*)
);
int sqlite3PagerClose(Pager *pPager, sqlite3*);
// int sqlite3PagerReadFileheader(Pager*, int, unsigned char*);
// /* Functions used to configure a Pager object. */
// void sqlite3PagerSetBusyHandler(Pager*, int(*)(void *), void *);
// int sqlite3PagerSetPagesize(Pager*, u32*, int);
// Pgno sqlite3PagerMaxPageCount(Pager*, Pgno);
// void sqlite3PagerSetCachesize(Pager*, int);
// int sqlite3PagerSetSpillsize(Pager*, int);
// void sqlite3PagerSetMmapLimit(Pager *, sqlite3_int64);
// void sqlite3PagerShrink(Pager*);
// void sqlite3PagerSetFlags(Pager*,unsigned);
// int sqlite3PagerLockingMode(Pager *, int);
// int sqlite3PagerSetJournalMode(Pager *, int);
// int sqlite3PagerGetJournalMode(Pager*);
// int sqlite3PagerOkToChangeJournalMode(Pager*);
// i64 sqlite3PagerJournalSizeLimit(Pager *, i64);
// sqlite3_backup **sqlite3PagerBackupPtr(Pager*);
// int sqlite3PagerFlush(Pager*);
// /* Functions used to obtain and release page references. */
// int sqlite3PagerGet(Pager *pPager, Pgno pgno, DbPage **ppPage, int clrFlag);
// DbPage *sqlite3PagerLookup(Pager *pPager, Pgno pgno);
// void sqlite3PagerRef(DbPage*);
// void sqlite3PagerUnref(DbPage*);
// void sqlite3PagerUnrefNotNull(DbPage*);
// void sqlite3PagerUnrefPageOne(DbPage*);
// /* Operations on page references. */
// int sqlite3PagerWrite(DbPage*);
// void sqlite3PagerDontWrite(DbPage*);
// int sqlite3PagerMovepage(Pager*,DbPage*,Pgno,int);
// int sqlite3PagerPageRefcount(DbPage*);
// void *sqlite3PagerGetData(DbPage *);
// void *sqlite3PagerGetExtra(DbPage *);
// /* Functions used to manage pager transactions and savepoints. */
// void sqlite3PagerPagecount(Pager*, int*);
// int sqlite3PagerBegin(Pager*, int exFlag, int);
// int sqlite3PagerCommitPhaseOne(Pager*,const char *zSuper, int);
// int sqlite3PagerExclusiveLock(Pager*);
// int sqlite3PagerSync(Pager *pPager, const char *zSuper);
// int sqlite3PagerCommitPhaseTwo(Pager*);
// int sqlite3PagerRollback(Pager*);
// int sqlite3PagerOpenSavepoint(Pager *pPager, int n);
// int sqlite3PagerSavepoint(Pager *pPager, int op, int iSavepoint);
// int sqlite3PagerSharedLock(Pager *pPager);
// #ifndef SQLITE_OMIT_WAL
// int sqlite3PagerCheckpoint(Pager *pPager, sqlite3*, int, int*, int*);
// int sqlite3PagerWalSupported(Pager *pPager);
// int sqlite3PagerWalCallback(Pager *pPager);
// int sqlite3PagerOpenWal(Pager *pPager, int *pisOpen);
// int sqlite3PagerCloseWal(Pager *pPager, sqlite3*);
// # ifdef SQLITE_ENABLE_SNAPSHOT
// int sqlite3PagerSnapshotGet(Pager*, sqlite3_snapshot **ppSnapshot);
// int sqlite3PagerSnapshotOpen(Pager*, sqlite3_snapshot *pSnapshot);
// int sqlite3PagerSnapshotRecover(Pager *pPager);
// int sqlite3PagerSnapshotCheck(Pager *pPager, sqlite3_snapshot *pSnapshot);
// void sqlite3PagerSnapshotUnlock(Pager *pPager);
// # endif
// #endif
// #if !defined(SQLITE_OMIT_WAL) && defined(SQLITE_ENABLE_SETLK_TIMEOUT)
// int sqlite3PagerWalWriteLock(Pager*, int);
// void sqlite3PagerWalDb(Pager*, sqlite3*);
// #else
// # define sqlite3PagerWalWriteLock(y,z) SQLITE_OK
// # define sqlite3PagerWalDb(x,y)
// #endif
// #ifdef SQLITE_DIRECT_OVERFLOW_READ
// int sqlite3PagerDirectReadOk(Pager *pPager, Pgno pgno);
// #endif
// #ifdef SQLITE_ENABLE_ZIPVFS
// int sqlite3PagerWalFramesize(Pager *pPager);
// #endif
// /* Functions used to query pager state and configuration. */
// u8 sqlite3PagerIsreadonly(Pager*);
// u32 sqlite3PagerDataVersion(Pager*);
// #ifdef SQLITE_DEBUG
// int sqlite3PagerRefcount(Pager*);
// #endif
// int sqlite3PagerMemUsed(Pager*);
// const char *sqlite3PagerFilename(const Pager*, int);
// sqlite3_vfs *sqlite3PagerVfs(Pager*);
// sqlite3_file *sqlite3PagerFile(Pager*);
// sqlite3_file *sqlite3PagerJrnlFile(Pager*);
// const char *sqlite3PagerJournalname(Pager*);
// void *sqlite3PagerTempSpace(Pager*);
// int sqlite3PagerIsMemdb(Pager*);
// void sqlite3PagerCacheStat(Pager *, int, int, int *);
// void sqlite3PagerClearCache(Pager*);
// int sqlite3SectorSize(sqlite3_file *);
// /* Functions used to truncate the database file. */
// void sqlite3PagerTruncateImage(Pager*,Pgno);
// void sqlite3PagerRekey(DbPage*, Pgno, u16);
// /* Functions to support testing and debugging. */
// #if !defined(NDEBUG) || defined(SQLITE_TEST)
// Pgno sqlite3PagerPagenumber(DbPage*);
// int sqlite3PagerIswriteable(DbPage*);
// #endif
// #ifdef SQLITE_TEST
// int *sqlite3PagerStats(Pager*);
// void sqlite3PagerRefdump(Pager*);
// void disable_simulated_io_errors(void);
// void enable_simulated_io_errors(void);
// #else
// # define disable_simulated_io_errors()
// # define enable_simulated_io_errors()
// #endif
#endif /* SQLITE_PAGER_H */
/*
** 2008 August 05
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
** This header file defines the interface to the sqlite page cache
** subsystem.
*/
#ifndef _PCACHE_H_
#define _PCACHE_H_
typedef struct PgHdr PgHdr;
typedef struct PCache PCache;
/*
** Every page in the cache is controlled by an instance of the following
** structure.
*/
struct PgHdr {
sqlite3_pcache_page *pPage; /* Pcache object page handle */
void * pData; /* Page data */
void * pExtra; /* Extra content */
PCache * pCache; /* PRIVATE: Cache that owns this page */
PgHdr * pDirty; /* Transient list of dirty sorted by pgno */
Pager * pPager; /* The pager this page is part of */
Pgno pgno; /* Page number for this page */
#ifdef SQLITE_CHECK_PAGES
u32 pageHash; /* Hash of page content */
#endif
u16 flags; /* PGHDR flags defined below */
/**********************************************************************
** Elements above, except pCache, are public. All that follow are
** private to pcache.c and should not be accessed by other modules.
** pCache is grouped with the public elements for efficiency.
*/
i16 nRef; /* Number of users of this page */
PgHdr *pDirtyNext; /* Next element in list of dirty pages */
PgHdr *pDirtyPrev; /* Previous element in list of dirty pages */
/* NB: pDirtyNext and pDirtyPrev are undefined if the
** PgHdr object is not dirty */
};
/* Bit values for PgHdr.flags */
#define PGHDR_CLEAN 0x001 /* Page not on the PCache.pDirty list */
#define PGHDR_DIRTY 0x002 /* Page is on the PCache.pDirty list */
#define PGHDR_WRITEABLE 0x004 /* Journaled and ready to modify */
#define PGHDR_NEED_SYNC 0x008 /* Fsync the rollback journal before
                              ** writing this page to the database */
#define PGHDR_DONT_WRITE 0x010 /* Do not write content to disk */
#define PGHDR_MMAP 0x020 /* This is an mmap page object */
#define PGHDR_WAL_APPEND 0x040 /* Appended to wal file */
/* Initialize and shutdown the page cache subsystem */
int sqlite3PcacheInitialize(void);
void sqlite3PcacheShutdown(void);
/* Page cache buffer management:
** These routines implement SQLITE_CONFIG_PAGECACHE.
*/
void sqlite3PCacheBufferSetup(void *, int sz, int n);
/* Create a new pager cache.
** Under memory stress, invoke xStress to try to make pages clean.
** Only clean and unpinned pages can be reclaimed.
*/
int sqlite3PcacheOpen(int szPage, /* Size of every page */
int szExtra, /* Extra space associated with each page */
int bPurgeable, /* True if pages are on backing store */
int (*xStress)(void *, PgHdr *), /* Call to try to make pages clean */
void * pStress, /* Argument to xStress */
PCache *pToInit /* Preallocated space for the PCache */
);
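/*
** Illustrative sketch (hypothetical helpers, not part of this interface):
** allocating and opening a PCache. sqlite3PcacheSize() reports how many
** bytes to preallocate. The xStress callback below declines to clean
** anything, which is legal but means dirty pages cannot be reclaimed
** under memory pressure.
*/
#if 0 /* example only; never compiled */
static int exampleStress(void *pArg, PgHdr *pPg){
  (void)pArg; (void)pPg;
  return SQLITE_OK;  /* decline: leave the page dirty */
}
static PCache *exampleOpenCache(int szPage, int szExtra){
  PCache *p = (PCache *)malloc(sqlite3PcacheSize());
  if( p && sqlite3PcacheOpen(szPage, szExtra, 1, exampleStress, 0, p) ){
    free(p);  /* open failed; discard the preallocated space */
    p = 0;
  }
  return p;
}
#endif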
/* Modify the page-size after the cache has been created. */
int sqlite3PcacheSetPageSize(PCache *, int);
/* Return the size in bytes of a PCache object. Used to preallocate
** storage space.
*/
int sqlite3PcacheSize(void);
/* One release per successful fetch. Page is pinned until released.
** Reference counted.
*/
sqlite3_pcache_page *sqlite3PcacheFetch(PCache *, Pgno, int createFlag);
int sqlite3PcacheFetchStress(PCache *, Pgno, sqlite3_pcache_page **);
PgHdr * sqlite3PcacheFetchFinish(PCache *, Pgno, sqlite3_pcache_page *pPage);
void sqlite3PcacheRelease(PgHdr *);
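/*
** Illustrative sketch (hypothetical helper, not part of this interface):
** the fetch protocol above is two steps. Fetch() finds or creates the
** underlying cache entry; FetchFinish() converts it into a pinned PgHdr
** that must later be handed back to sqlite3PcacheRelease().
*/
#if 0 /* example only; never compiled */
static PgHdr *exampleFetchPage(PCache *pCache, Pgno pgno){
  sqlite3_pcache_page *pPage = sqlite3PcacheFetch(pCache, pgno, 1);
  if( pPage==0 ) return 0;  /* cache entry could not be allocated */
  return sqlite3PcacheFetchFinish(pCache, pgno, pPage);
}
#endif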
void sqlite3PcacheDrop(PgHdr *); /* Remove page from cache */
void sqlite3PcacheMakeDirty(PgHdr *); /* Make sure page is marked dirty */
void sqlite3PcacheMakeClean(PgHdr *); /* Mark a single page as clean */
void sqlite3PcacheCleanAll(PCache *); /* Mark all dirty list pages as clean */
void sqlite3PcacheClearWritable(PCache *);
/* Change a page number. Used by incr-vacuum. */
void sqlite3PcacheMove(PgHdr *, Pgno);
/* Remove all pages with pgno>x. Reset the cache if x==0 */
void sqlite3PcacheTruncate(PCache *, Pgno x);
/* Get a list of all dirty pages in the cache, sorted by page number */
PgHdr *sqlite3PcacheDirtyList(PCache *);
/* Reset and close the cache object */
void sqlite3PcacheClose(PCache *);
/* Clear flags from pages of the page cache */
void sqlite3PcacheClearSyncFlags(PCache *);
/* Discard the contents of the cache */
void sqlite3PcacheClear(PCache *);
/* Return the total number of outstanding page references */
int sqlite3PcacheRefCount(PCache *);
/* Increment the reference count of an existing page */
void sqlite3PcacheRef(PgHdr *);
int sqlite3PcachePageRefcount(PgHdr *);
/* Return the total number of pages stored in the cache */
int sqlite3PcachePagecount(PCache *);
#if defined(SQLITE_CHECK_PAGES) || defined(SQLITE_DEBUG)
/* Iterate through all dirty pages currently stored in the cache. This
** interface is only available if SQLITE_CHECK_PAGES is defined when the
** library is built.
*/
void sqlite3PcacheIterateDirty(PCache *pCache, void (*xIter)(PgHdr *));
#endif
#if defined(SQLITE_DEBUG)
/* Check invariants on a PgHdr object */
int sqlite3PcachePageSanity(PgHdr *);
#endif
/* Set and get the suggested cache-size for the specified pager-cache.
**
** If no global maximum is configured, then the system attempts to limit
** the total number of pages cached by purgeable pager-caches to the sum
** of the suggested cache-sizes.
*/
void sqlite3PcacheSetCachesize(PCache *, int);
#ifdef SQLITE_TEST
int sqlite3PcacheGetCachesize(PCache *);
#endif
/* Set or get the suggested spill-size for the specified pager-cache.
**
** The spill-size is the minimum number of pages in cache before the cache
** will attempt to spill dirty pages by calling xStress.
*/
int sqlite3PcacheSetSpillsize(PCache *, int);
/* Free up as much memory as possible from the page cache */
void sqlite3PcacheShrink(PCache *);
#ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
/* Try to return memory used by the pcache module to the main memory heap */
int sqlite3PcacheReleaseMemory(int);
#endif
#ifdef SQLITE_TEST
void sqlite3PcacheStats(int *, int *, int *, int *);
#endif
void sqlite3PCacheSetDefault(void);
/* Return the header size */
int sqlite3HeaderSizePcache(void);
int sqlite3HeaderSizePcache1(void);
/* Number of dirty pages as a percentage of the configured cache size */
int sqlite3PCachePercentDirty(PCache *);
#ifdef SQLITE_DIRECT_OVERFLOW_READ
int sqlite3PCacheIsDirty(PCache *pCache);
#endif
// For the real implementation of sqlite3_pcache ========================================
typedef struct sqlite3_pcache sqlite3_pcache;
typedef struct sqlite3_pcache_methods2 {
int iVersion;
void *pArg;
int (*xInit)(void *);
void (*xShutdown)(void *);
sqlite3_pcache *(*xCreate)(int szPage, int szExtra, int bPurgeable);
void (*xCachesize)(sqlite3_pcache *, int nCachesize);
int (*xPagecount)(sqlite3_pcache *);
sqlite3_pcache_page *(*xFetch)(sqlite3_pcache *, unsigned key, int createFlag);
void (*xUnpin)(sqlite3_pcache *, sqlite3_pcache_page *, int discard);
void (*xRekey)(sqlite3_pcache *, sqlite3_pcache_page *, unsigned oldKey, unsigned newKey);
void (*xTruncate)(sqlite3_pcache *, unsigned iLimit);
void (*xDestroy)(sqlite3_pcache *);
void (*xShrink)(sqlite3_pcache *);
} sqlite3_pcache_methods2;
extern sqlite3_pcache_methods2 pcache2;
#endif /* _PCACHE_H_ */
/*
** 2001-09-15
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
** This header file defines the interface that the SQLite library
** presents to client programs. If a C-function, structure, datatype,
** or constant definition does not appear in this file, then it is
** not a published API of SQLite, is subject to change without
** notice, and should not be referenced by programs that use SQLite.
**
** Some of the definitions that are in this file are marked as
** "experimental". Experimental interfaces are normally new
** features recently added to SQLite. We do not anticipate changes
** to experimental interfaces but reserve the right to make minor changes
** if experience from use "in the wild" suggests such changes are prudent.
**
** The official C-language API documentation for SQLite is derived
** from comments in this file. This file is the authoritative source
** on how SQLite interfaces are supposed to operate.
**
** The name of this file under configuration management is "sqlite.h.in".
** The makefile makes some minor changes to this file (such as inserting
** the version number) and changes its name to "sqlite3.h" as
** part of the build process.
*/
#ifndef SQLITE3_H
#define SQLITE3_H
#include <stdarg.h> /* Needed for the definition of va_list */
/*
** Make sure we can call this stuff from C++.
*/
#ifdef __cplusplus
extern "C" {
#endif
/*
** CAPI3REF: Result Codes
** KEYWORDS: {result code definitions}
**
** Many SQLite functions return an integer result code from the set shown
** here in order to indicate success or failure.
**
** New error codes may be added in future versions of SQLite.
**
** See also: [extended result code definitions]
*/
#define SQLITE_OK 0 /* Successful result */
/* beginning-of-error-codes */
#define SQLITE_ERROR 1 /* Generic error */
#define SQLITE_INTERNAL 2 /* Internal logic error in SQLite */
#define SQLITE_PERM 3 /* Access permission denied */
#define SQLITE_ABORT 4 /* Callback routine requested an abort */
#define SQLITE_BUSY 5 /* The database file is locked */
#define SQLITE_LOCKED 6 /* A table in the database is locked */
#define SQLITE_NOMEM 7 /* A malloc() failed */
#define SQLITE_READONLY 8 /* Attempt to write a readonly database */
#define SQLITE_INTERRUPT 9 /* Operation terminated by sqlite3_interrupt()*/
#define SQLITE_IOERR 10 /* Some kind of disk I/O error occurred */
#define SQLITE_CORRUPT 11 /* The database disk image is malformed */
#define SQLITE_NOTFOUND 12 /* Unknown opcode in sqlite3_file_control() */
#define SQLITE_FULL 13 /* Insertion failed because database is full */
#define SQLITE_CANTOPEN 14 /* Unable to open the database file */
#define SQLITE_PROTOCOL 15 /* Database lock protocol error */
#define SQLITE_EMPTY 16 /* Internal use only */
#define SQLITE_SCHEMA 17 /* The database schema changed */
#define SQLITE_TOOBIG 18 /* String or BLOB exceeds size limit */
#define SQLITE_CONSTRAINT 19 /* Abort due to constraint violation */
#define SQLITE_MISMATCH 20 /* Data type mismatch */
#define SQLITE_MISUSE 21 /* Library used incorrectly */
#define SQLITE_NOLFS 22 /* Uses OS features not supported on host */
#define SQLITE_AUTH 23 /* Authorization denied */
#define SQLITE_FORMAT 24 /* Not used */
#define SQLITE_RANGE 25 /* 2nd parameter to sqlite3_bind out of range */
#define SQLITE_NOTADB 26 /* File opened that is not a database file */
#define SQLITE_NOTICE 27 /* Notifications from sqlite3_log() */
#define SQLITE_WARNING 28 /* Warnings from sqlite3_log() */
#define SQLITE_ROW 100 /* sqlite3_step() has another row ready */
#define SQLITE_DONE 101 /* sqlite3_step() has finished executing */
/* end-of-error-codes */
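/* Illustrative use of these codes (sqlite3_stmt and sqlite3_step() are
** part of the full SQLite API and are not declared in this trimmed
** header): a statement loop runs until something other than SQLITE_ROW
** is returned, then checks for SQLITE_DONE.
**
**   int rc;
**   while( (rc = sqlite3_step(pStmt))==SQLITE_ROW ){
**     ... read one row of output ...
**   }
**   if( rc!=SQLITE_DONE ){
**     ... rc is one of the error codes above, e.g. SQLITE_BUSY ...
**   }
*/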
#ifdef __cplusplus
} /* end of the 'extern "C"' block */
#endif
#endif /* SQLITE3_H */
/*
** 2001 September 15
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
** Internal interface definitions for SQLite.
**
*/
#include <assert.h>
#include <fcntl.h>
#include <pthread.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#ifndef SQLITEINT_H
#define SQLITEINT_H
#include "sqlite3.h"
typedef int8_t i8;
typedef int16_t i16;
typedef int32_t i32;
typedef int64_t i64;
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;
typedef struct sqlite3_pcache_page {
void *pBuf; /* The content of the page */
void *pExtra; /* Extra information associated with the page */
} sqlite3_pcache_page;
#define ROUNDDOWN8(x) ((x) & ~7)
#define ROUND8(x) (((x) + 7) & ~7)
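/* Example: both macros align sizes to 8-byte boundaries, e.g.
**   ROUND8(13)     == 16    ((13+7) & ~7 rounds up)
**   ROUNDDOWN8(13) ==  8    (13 & ~7 rounds down)
**   ROUND8(16)     == 16    (already-aligned values are unchanged)
*/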
typedef struct sqlite3_vfs sqlite3_vfs;
typedef struct sqlite3 sqlite3;
#define SQLITE_DEFAULT_PAGE_SIZE 4096
#include "pager.h"
#include "pcache.h"
#endif /* SQLITEINT_H */
/*
** 2010 February 1
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
** This header file defines the interface to the write-ahead logging
** system. Refer to the comments below and the header comment attached to
** the implementation of each function in log.c for further details.
*/
#ifndef SQLITE_WAL_H
#define SQLITE_WAL_H
#include "sqliteInt.h"
/* Macros for extracting appropriate sync flags for either transaction
** commits (WAL_SYNC_FLAGS(X)) or for checkpoint ops (CKPT_SYNC_FLAGS(X)):
*/
#define WAL_SYNC_FLAGS(X) ((X)&0x03)
#define CKPT_SYNC_FLAGS(X) (((X)>>2)&0x03)
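/* Example: both flag sets travel in one int. Assuming the usual sqlite3.h
** values SQLITE_SYNC_NORMAL==2 and SQLITE_SYNC_FULL==3 (not defined in
** this trimmed header), a pager that syncs NORMAL on commit and FULL on
** checkpoint would pass X = 2 | (3<<2), so that
**   WAL_SYNC_FLAGS(X) == 2   and   CKPT_SYNC_FLAGS(X) == 3.
*/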
#ifdef SQLITE_OMIT_WAL
# define sqlite3WalOpen(x,y,z) 0
# define sqlite3WalLimit(x,y)
# define sqlite3WalClose(v,w,x,y,z) 0
# define sqlite3WalBeginReadTransaction(y,z) 0
# define sqlite3WalEndReadTransaction(z)
# define sqlite3WalDbsize(y) 0
# define sqlite3WalBeginWriteTransaction(y) 0
# define sqlite3WalEndWriteTransaction(x) 0
# define sqlite3WalUndo(x,y,z) 0
# define sqlite3WalSavepoint(y,z)
# define sqlite3WalSavepointUndo(y,z) 0
# define sqlite3WalFrames(u,v,w,x,y,z) 0
# define sqlite3WalCheckpoint(q,r,s,t,u,v,w,x,y,z) 0
# define sqlite3WalCallback(z) 0
# define sqlite3WalExclusiveMode(y,z) 0
# define sqlite3WalHeapMemory(z) 0
# define sqlite3WalFramesize(z) 0
# define sqlite3WalFindFrame(x,y,z) 0
# define sqlite3WalFile(x) 0
#else
#define WAL_SAVEPOINT_NDATA 4
/* Connection to a write-ahead log (WAL) file.
** There is one object of this type for each pager.
*/
typedef struct Wal Wal;
/* Open and close a connection to a write-ahead log. */
int sqlite3WalOpen(sqlite3_vfs*, sqlite3_file*, const char *, int, i64, Wal**);
int sqlite3WalClose(Wal *pWal, sqlite3*, int sync_flags, int, u8 *);
/* Set the limiting size of a WAL file. */
void sqlite3WalLimit(Wal*, i64);
/* Used by readers to open (lock) and close (unlock) a snapshot. A
** snapshot is like a read-transaction. It is the state of the database
** at an instant in time. sqlite3WalBeginReadTransaction() gets a read
** lock and preserves the current state even if other threads or
** processes write to or checkpoint the WAL.
** sqlite3WalEndReadTransaction() closes the transaction and releases
** the lock.
*/
int sqlite3WalBeginReadTransaction(Wal *pWal, int *);
void sqlite3WalEndReadTransaction(Wal *pWal);
/* Read a page from the write-ahead log, if it is present. */
int sqlite3WalFindFrame(Wal *, Pgno, u32 *);
int sqlite3WalReadFrame(Wal *, u32, int, u8 *);
/* If the WAL is not empty, return the size of the database. */
Pgno sqlite3WalDbsize(Wal *pWal);
/* Obtain or release the WRITER lock. */
int sqlite3WalBeginWriteTransaction(Wal *pWal);
int sqlite3WalEndWriteTransaction(Wal *pWal);
/* Undo any frames written (but not committed) to the log */
int sqlite3WalUndo(Wal *pWal, int (*xUndo)(void *, Pgno), void *pUndoCtx);
/* Record the current (uncommitted) write position of the WAL in the
** WAL_SAVEPOINT_NDATA words pointed to by aWalData */
void sqlite3WalSavepoint(Wal *pWal, u32 *aWalData);
/* Move the write position of the WAL back to iFrame. Called in
** response to a ROLLBACK TO command. */
int sqlite3WalSavepointUndo(Wal *pWal, u32 *aWalData);
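/* Assumed calling pattern (sketch): the pager snapshots the write
** position into WAL_SAVEPOINT_NDATA words before a SAVEPOINT and hands
** the same words back to rewind on ROLLBACK TO.
**
**   u32 aWalData[WAL_SAVEPOINT_NDATA];
**   sqlite3WalSavepoint(pWal, aWalData);            ... record position ...
**   ... frames are written ...
**   rc = sqlite3WalSavepointUndo(pWal, aWalData);   ... rewind ...
*/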
/* Write a frame or frames to the log. */
int sqlite3WalFrames(Wal *pWal, int, PgHdr *, Pgno, int, int);
/* Copy pages from the log to the database file */
int sqlite3WalCheckpoint(
Wal *pWal, /* Write-ahead log connection */
sqlite3 *db, /* Check this handle's interrupt flag */
int eMode, /* One of PASSIVE, FULL and RESTART */
int (*xBusy)(void*), /* Function to call when busy */
void *pBusyArg, /* Context argument for xBusyHandler */
int sync_flags, /* Flags to sync db file with (or 0) */
int nBuf, /* Size of zBuf in bytes */
u8 *zBuf, /* Temporary buffer to use */
int *pnLog, /* OUT: Number of frames in WAL */
int *pnCkpt /* OUT: Number of backfilled frames in WAL */
);
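/* Usage sketch (assumed caller; the SQLITE_CHECKPOINT_* mode constants
** come from the full sqlite3.h and are not defined in this trimmed
** header):
**
**   int nLog = 0, nCkpt = 0;
**   rc = sqlite3WalCheckpoint(pWal, db, SQLITE_CHECKPOINT_PASSIVE,
**                             0, 0, syncFlags, nBuf, zBuf, &nLog, &nCkpt);
*/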
/* Return the value to pass to a sqlite3_wal_hook callback, the
** number of frames in the WAL at the point of the last commit since
** sqlite3WalCallback() was called. If no commits have occurred since
** the last call, then return 0.
*/
int sqlite3WalCallback(Wal *pWal);
/* Tell the wal layer that an EXCLUSIVE lock has been obtained (or released)
** by the pager layer on the database file.
*/
int sqlite3WalExclusiveMode(Wal *pWal, int op);
/* Return true if the argument is non-NULL and the WAL module is using
** heap-memory for the wal-index. Otherwise, if the argument is NULL or the
** WAL module is using shared-memory, return false.
*/
int sqlite3WalHeapMemory(Wal *pWal);
#ifdef SQLITE_ENABLE_SNAPSHOT
int sqlite3WalSnapshotGet(Wal *pWal, sqlite3_snapshot **ppSnapshot);
void sqlite3WalSnapshotOpen(Wal *pWal, sqlite3_snapshot *pSnapshot);
int sqlite3WalSnapshotRecover(Wal *pWal);
int sqlite3WalSnapshotCheck(Wal *pWal, sqlite3_snapshot *pSnapshot);
void sqlite3WalSnapshotUnlock(Wal *pWal);
#endif
#ifdef SQLITE_ENABLE_ZIPVFS
/* If the WAL file is not empty, return the number of bytes of content
** stored in each frame (i.e. the db page-size when the WAL was created).
*/
int sqlite3WalFramesize(Wal *pWal);
#endif
/* Return the sqlite3_file object for the WAL file */
sqlite3_file *sqlite3WalFile(Wal *pWal);
#ifdef SQLITE_ENABLE_SETLK_TIMEOUT
int sqlite3WalWriteLock(Wal *pWal, int bLock);
void sqlite3WalDb(Wal *pWal, sqlite3 *db);
#endif
#endif /* ifndef SQLITE_OMIT_WAL */
#endif /* SQLITE_WAL_H */