taosdata / TDengine
Commit 3beef665
Authored on Mar 18, 2022 by Hongze Cheng

more TDB

Parent 031d84e7
Showing 14 changed files with 0 additions and 26153 deletions (+0 −26153)
+0 −10     source/libs/tdb/CMakeLists.txt
+0 −308    source/libs/tdb/src/sqlite/btmutex.c
+0 −10869  source/libs/tdb/src/sqlite/btree.c
+0 −6851   source/libs/tdb/src/sqlite/pager.c
+0 −851    source/libs/tdb/src/sqlite/pcache.c
+0 −1211   source/libs/tdb/src/sqlite/pcache1.c
+0 −4153   source/libs/tdb/src/sqlite/wal.c
+0 −412    source/libs/tdb/src/sqliteinc/btree.h
+0 −729    source/libs/tdb/src/sqliteinc/btreeInt.h
+0 −241    source/libs/tdb/src/sqliteinc/pager.h
+0 −210    source/libs/tdb/src/sqliteinc/pcache.h
+0 −95     source/libs/tdb/src/sqliteinc/sqlite3.h
+0 −58     source/libs/tdb/src/sqliteinc/sqliteInt.h
+0 −155    source/libs/tdb/src/sqliteinc/wal.h
source/libs/tdb/CMakeLists.txt
@@ -22,16 +22,6 @@ target_link_libraries(
     PUBLIC util
 )
-
-# for tdb_sqlite
-add_library(tdb_sqlite "")
-target_sources(tdb_sqlite
-    PRIVATE
-    "src/sqlite/pcache.c"
-    "src/sqlite/pcache1.c"
-    "src/sqlite/pager.c"
-)
-target_include_directories(tdb_sqlite PUBLIC "src/sqliteinc")
 # for test
 if(${BUILD_TEST})
     add_subdirectory(test)
source/libs/tdb/src/sqlite/btmutex.c (deleted, 100644 → 0)
/*
** 2007 August 27
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
**
** This file contains code used to implement mutexes on Btree objects.
** This code really belongs in btree.c. But btree.c is getting too
** big and we want to break it down some. This package seemed like
** a good breakout.
*/
#include "btreeInt.h"
#ifndef SQLITE_OMIT_SHARED_CACHE
#if SQLITE_THREADSAFE
/*
** Obtain the BtShared mutex associated with B-Tree handle p. Also,
** set BtShared.db to the database handle associated with p and the
** p->locked boolean to true.
*/
static void lockBtreeMutex(Btree *p){
  assert( p->locked==0 );
  assert( sqlite3_mutex_notheld(p->pBt->mutex) );
  assert( sqlite3_mutex_held(p->db->mutex) );

  sqlite3_mutex_enter(p->pBt->mutex);
  p->pBt->db = p->db;
  p->locked = 1;
}

/*
** Release the BtShared mutex associated with B-Tree handle p and
** clear the p->locked boolean.
*/
static void SQLITE_NOINLINE unlockBtreeMutex(Btree *p){
  BtShared *pBt = p->pBt;
  assert( p->locked==1 );
  assert( sqlite3_mutex_held(pBt->mutex) );
  assert( sqlite3_mutex_held(p->db->mutex) );
  assert( p->db==pBt->db );

  sqlite3_mutex_leave(pBt->mutex);
  p->locked = 0;
}

/* Forward reference */
static void SQLITE_NOINLINE btreeLockCarefully(Btree *p);

/*
** Enter a mutex on the given BTree object.
**
** If the object is not sharable, then no mutex is ever required
** and this routine is a no-op.  The underlying mutex is non-recursive.
** But we keep a reference count in Btree.wantToLock so the behavior
** of this interface is recursive.
**
** To avoid deadlocks, multiple Btrees are locked in the same order
** by all database connections.  The p->pNext is a list of other
** Btrees belonging to the same database connection as the p Btree
** which need to be locked after p.  If we cannot get a lock on
** p, then first unlock all of the others on p->pNext, then wait
** for the lock to become available on p, then relock all of the
** subsequent Btrees that desire a lock.
*/
void sqlite3BtreeEnter(Btree *p){
  /* Some basic sanity checking on the Btree.  The list of Btrees
  ** connected by pNext and pPrev should be in sorted order by
  ** Btree.pBt value. All elements of the list should belong to
  ** the same connection. Only shared Btrees are on the list. */
  assert( p->pNext==0 || p->pNext->pBt>p->pBt );
  assert( p->pPrev==0 || p->pPrev->pBt<p->pBt );
  assert( p->pNext==0 || p->pNext->db==p->db );
  assert( p->pPrev==0 || p->pPrev->db==p->db );
  assert( p->sharable || (p->pNext==0 && p->pPrev==0) );

  /* Check for locking consistency */
  assert( !p->locked || p->wantToLock>0 );
  assert( p->sharable || p->wantToLock==0 );

  /* We should already hold a lock on the database connection */
  assert( sqlite3_mutex_held(p->db->mutex) );

  /* Unless the database is sharable and unlocked, then BtShared.db
  ** should already be set correctly. */
  assert( (p->locked==0 && p->sharable) || p->pBt->db==p->db );

  if( !p->sharable ) return;
  p->wantToLock++;
  if( p->locked ) return;
  btreeLockCarefully(p);
}

/* This is a helper function for sqlite3BtreeLock(). By moving
** complex, but seldom used logic, out of sqlite3BtreeLock() and
** into this routine, we avoid unnecessary stack pointer changes
** and thus help the sqlite3BtreeLock() routine to run much faster
** in the common case.
*/
static void SQLITE_NOINLINE btreeLockCarefully(Btree *p){
  Btree *pLater;

  /* In most cases, we should be able to acquire the lock we
  ** want without having to go through the ascending lock
  ** procedure that follows.  Just be sure not to block.
  */
  if( sqlite3_mutex_try(p->pBt->mutex)==SQLITE_OK ){
    p->pBt->db = p->db;
    p->locked = 1;
    return;
  }

  /* To avoid deadlock, first release all locks with a larger
  ** BtShared address.  Then acquire our lock.  Then reacquire
  ** the other BtShared locks that we used to hold in ascending
  ** order.
  */
  for(pLater=p->pNext; pLater; pLater=pLater->pNext){
    assert( pLater->sharable );
    assert( pLater->pNext==0 || pLater->pNext->pBt>pLater->pBt );
    assert( !pLater->locked || pLater->wantToLock>0 );
    if( pLater->locked ){
      unlockBtreeMutex(pLater);
    }
  }
  lockBtreeMutex(p);
  for(pLater=p->pNext; pLater; pLater=pLater->pNext){
    if( pLater->wantToLock ){
      lockBtreeMutex(pLater);
    }
  }
}

/*
** Exit the recursive mutex on a Btree.
*/
void sqlite3BtreeLeave(Btree *p){
  assert( sqlite3_mutex_held(p->db->mutex) );
  if( p->sharable ){
    assert( p->wantToLock>0 );
    p->wantToLock--;
    if( p->wantToLock==0 ){
      unlockBtreeMutex(p);
    }
  }
}

#ifndef NDEBUG
/*
** Return true if the BtShared mutex is held on the btree, or if the
** B-Tree is not marked as sharable.
**
** This routine is used only from within assert() statements.
*/
int sqlite3BtreeHoldsMutex(Btree *p){
  assert( p->sharable==0 || p->locked==0 || p->wantToLock>0 );
  assert( p->sharable==0 || p->locked==0 || p->db==p->pBt->db );
  assert( p->sharable==0 || p->locked==0 || sqlite3_mutex_held(p->pBt->mutex) );
  assert( p->sharable==0 || p->locked==0 || sqlite3_mutex_held(p->db->mutex) );
  return (p->sharable==0 || p->locked);
}
#endif

/*
** Enter the mutex on every Btree associated with a database
** connection.  This is needed (for example) prior to parsing
** a statement since we will be comparing table and column names
** against all schemas and we do not want those schemas being
** reset out from under us.
**
** There are corresponding leave-all procedures.
**
** Enter the mutexes in ascending order by BtShared pointer address
** to avoid the possibility of deadlock when two threads with
** two or more btrees in common both try to lock all their btrees
** at the same instant.
*/
static void SQLITE_NOINLINE btreeEnterAll(sqlite3 *db){
  int i;
  int skipOk = 1;
  Btree *p;
  assert( sqlite3_mutex_held(db->mutex) );
  for(i=0; i<db->nDb; i++){
    p = db->aDb[i].pBt;
    if( p && p->sharable ){
      sqlite3BtreeEnter(p);
      skipOk = 0;
    }
  }
  db->noSharedCache = skipOk;
}
void sqlite3BtreeEnterAll(sqlite3 *db){
  if( db->noSharedCache==0 ) btreeEnterAll(db);
}
static void SQLITE_NOINLINE btreeLeaveAll(sqlite3 *db){
  int i;
  Btree *p;
  assert( sqlite3_mutex_held(db->mutex) );
  for(i=0; i<db->nDb; i++){
    p = db->aDb[i].pBt;
    if( p ) sqlite3BtreeLeave(p);
  }
}
void sqlite3BtreeLeaveAll(sqlite3 *db){
  if( db->noSharedCache==0 ) btreeLeaveAll(db);
}

#ifndef NDEBUG
/*
** Return true if the current thread holds the database connection
** mutex and all required BtShared mutexes.
**
** This routine is used inside assert() statements only.
*/
int sqlite3BtreeHoldsAllMutexes(sqlite3 *db){
  int i;
  if( !sqlite3_mutex_held(db->mutex) ){
    return 0;
  }
  for(i=0; i<db->nDb; i++){
    Btree *p;
    p = db->aDb[i].pBt;
    if( p && p->sharable &&
        (p->wantToLock==0 || !sqlite3_mutex_held(p->pBt->mutex)) ){
      return 0;
    }
  }
  return 1;
}
#endif /* NDEBUG */

#ifndef NDEBUG
/*
** Return true if the correct mutexes are held for accessing the
** db->aDb[iDb].pSchema structure.  The mutexes required for schema
** access are:
**
**   (1) The mutex on db
**   (2) if iDb!=1, then the mutex on db->aDb[iDb].pBt.
**
** If pSchema is not NULL, then iDb is computed from pSchema and
** db using sqlite3SchemaToIndex().
*/
int sqlite3SchemaMutexHeld(sqlite3 *db, int iDb, Schema *pSchema){
  Btree *p;
  assert( db!=0 );
  if( pSchema ) iDb = sqlite3SchemaToIndex(db, pSchema);
  assert( iDb>=0 && iDb<db->nDb );
  if( !sqlite3_mutex_held(db->mutex) ) return 0;
  if( iDb==1 ) return 1;
  p = db->aDb[iDb].pBt;
  assert( p!=0 );
  return p->sharable==0 || p->locked==1;
}
#endif /* NDEBUG */

#else /* SQLITE_THREADSAFE>0 above.  SQLITE_THREADSAFE==0 below */
/*
** The following are special cases for mutex enter routines for use
** in single threaded applications that use shared cache.  Except for
** these two routines, all mutex operations are no-ops in that case and
** are null #defines in btree.h.
**
** If shared cache is disabled, then all btree mutex routines, including
** the ones below, are no-ops and are null #defines in btree.h.
*/
void sqlite3BtreeEnter(Btree *p){
  p->pBt->db = p->db;
}
void sqlite3BtreeEnterAll(sqlite3 *db){
  int i;
  for(i=0; i<db->nDb; i++){
    Btree *p = db->aDb[i].pBt;
    if( p ){
      p->pBt->db = p->db;
    }
  }
}
#endif /* if SQLITE_THREADSAFE */

#ifndef SQLITE_OMIT_INCRBLOB
/*
** Enter a mutex on a Btree given a cursor owned by that Btree.
**
** These entry points are used by incremental I/O only. Enter() is required
** any time OMIT_SHARED_CACHE is not defined, regardless of whether or not
** the build is threadsafe.  Leave() is only required by threadsafe builds.
*/
void sqlite3BtreeEnterCursor(BtCursor *pCur){
  sqlite3BtreeEnter(pCur->pBtree);
}
# if SQLITE_THREADSAFE
void sqlite3BtreeLeaveCursor(BtCursor *pCur){
  sqlite3BtreeLeave(pCur->pBtree);
}
# endif
#endif /* ifndef SQLITE_OMIT_INCRBLOB */

#endif /* ifndef SQLITE_OMIT_SHARED_CACHE */
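The key idea in btmutex.c above is making a non-recursive mutex behave
recursively by keeping a per-handle reference count (Btree.wantToLock): only
the outermost enter takes the lock, and only the outermost leave releases it.
A minimal self-contained sketch of that pattern follows; the Handle type and
names are hypothetical, not the TDengine/SQLite API, and (as in the original)
it assumes all enter/leave calls for a given handle are serialized by an
outer connection lock.

#include <assert.h>
#include <pthread.h>

typedef struct Handle {
  pthread_mutex_t *mutex; /* shared, non-recursive mutex */
  int wantToLock;         /* nesting depth for this handle */
} Handle;

static void handleEnter(Handle *h) {
  h->wantToLock++;
  if (h->wantToLock == 1) {
    pthread_mutex_lock(h->mutex); /* outermost enter takes the lock */
  }
}

static void handleLeave(Handle *h) {
  assert(h->wantToLock > 0);
  h->wantToLock--;
  if (h->wantToLock == 0) {
    pthread_mutex_unlock(h->mutex); /* outermost leave releases it */
  }
}

int main(void) {
  pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
  Handle h = {&m, 0};
  handleEnter(&h);
  handleEnter(&h); /* nested enter: no second lock attempt, no deadlock */
  handleLeave(&h);
  handleLeave(&h); /* mutex is released only here */
  return 0;
}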
source/libs/tdb/src/sqlite/btree.c (deleted, 100644 → 0)
The source diff is too large to display; view the blob instead.
source/libs/tdb/src/sqlite/pager.c (deleted, 100644 → 0)
The source diff is too large to display; view the blob instead.
source/libs/tdb/src/sqlite/pcache.c (deleted, 100644 → 0)
/*
** 2008 August 05
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
** This file implements the page cache.
*/
#include "sqliteInt.h"
/*
** A complete page cache is an instance of this structure. Every
** entry in the cache holds a single page of the database file. The
** btree layer only operates on the cached copy of the database pages.
**
** A page cache entry is "clean" if it exactly matches what is currently
** on disk. A page is "dirty" if it has been modified and needs to be
** persisted to disk.
**
** pDirty, pDirtyTail, pSynced:
** All dirty pages are linked into the doubly linked list using
** PgHdr.pDirtyNext and pDirtyPrev. The list is maintained in LRU order
** such that p was added to the list more recently than p->pDirtyNext.
** PCache.pDirty points to the first (newest) element in the list and
** pDirtyTail to the last (oldest).
**
** The PCache.pSynced variable is used to optimize searching for a dirty
** page to eject from the cache mid-transaction. It is better to eject
** a page that does not require a journal sync than one that does.
** Therefore, pSynced is maintained so that it *almost* always points
** to either the oldest page in the pDirty/pDirtyTail list that has a
** clear PGHDR_NEED_SYNC flag or to a page that is older than this one
** (so that the right page to eject can be found by following pDirtyPrev
** pointers).
*/
struct PCache {
  PgHdr *pDirty, *pDirtyTail;      /* List of dirty pages in LRU order */
  PgHdr *pSynced;                  /* Last synced page in dirty page list */
  int    nRefSum;                  /* Sum of ref counts over all pages */
  int    szCache;                  /* Configured cache size */
  int    szSpill;                  /* Size before spilling occurs */
  int    szPage;                   /* Size of every page in this cache */
  int    szExtra;                  /* Size of extra space for each page */
  u8     bPurgeable;               /* True if pages are on backing store */
  u8     eCreate;                  /* eCreate value for xFetch() */
  int (*xStress)(void *, PgHdr *); /* Call to try make a page clean */
  void *pStress;                   /* Argument to xStress */
  sqlite3_pcache *pCache;          /* Pluggable cache module */
};

/********************************** Test and Debug Logic **********************/
/*
** Debug tracing macros.  Enable by changing the "0" to "1" and
** recompiling.
**
** When sqlite3PcacheTrace is 1, single line trace messages are issued.
** When sqlite3PcacheTrace is 2, a dump of the pcache showing all cache entries
** is displayed for many operations, resulting in a lot of output.
*/
#if defined(SQLITE_DEBUG) && 0
int sqlite3PcacheTrace = 2;     /* 0: off  1: simple  2: cache dumps */
int sqlite3PcacheMxDump = 9999; /* Max cache entries for pcacheDump() */
#define pcacheTrace(X) \
  if (sqlite3PcacheTrace) { \
    sqlite3DebugPrintf X; \
  }
void pcacheDump(PCache *pCache) {
  int N;
  int i, j;
  sqlite3_pcache_page *pLower;
  PgHdr *pPg;
  unsigned char *a;

  if (sqlite3PcacheTrace < 2) return;
  if (pCache->pCache == 0) return;
  N = sqlite3PcachePagecount(pCache);
  if (N > sqlite3PcacheMxDump) N = sqlite3PcacheMxDump;
  for (i = 1; i <= N; i++) {
    pLower = pcache2.xFetch(pCache->pCache, i, 0);
    if (pLower == 0) continue;
    pPg = (PgHdr *)pLower->pExtra;
    printf("%3d: nRef %2d flgs %02x data ", i, pPg->nRef, pPg->flags);
    a = (unsigned char *)pLower->pBuf;
    for (j = 0; j < 12; j++) printf("%02x", a[j]);
    printf("\n");
    if (pPg->pPage == 0) {
      pcache2.xUnpin(pCache->pCache, pLower, 0);
    }
  }
}
#else
#define pcacheTrace(X)
#define pcacheDump(X)
#endif
// /*
// ** Check invariants on a PgHdr entry. Return true if everything is OK.
// ** Return false if any invariant is violated.
// **
// ** This routine is for use inside of assert() statements only. For
// ** example:
// **
// ** assert( sqlite3PcachePageSanity(pPg) );
// */
// #ifdef SQLITE_DEBUG
// int sqlite3PcachePageSanity(PgHdr *pPg) {
// PCache *pCache;
// assert(pPg != 0);
// assert(pPg->pgno > 0 || pPg->pPager == 0); /* Page number is 1 or more */
// pCache = pPg->pCache;
// assert(pCache != 0); /* Every page has an associated PCache */
// if (pPg->flags & PGHDR_CLEAN) {
// assert((pPg->flags & PGHDR_DIRTY) == 0); /* Cannot be both CLEAN and DIRTY */
// assert(pCache->pDirty != pPg); /* CLEAN pages not on dirty list */
// assert(pCache->pDirtyTail != pPg);
// }
// /* WRITEABLE pages must also be DIRTY */
// if (pPg->flags & PGHDR_WRITEABLE) {
// assert(pPg->flags & PGHDR_DIRTY); /* WRITEABLE implies DIRTY */
// }
// /* NEED_SYNC can be set independently of WRITEABLE. This can happen,
// ** for example, when using the sqlite3PagerDontWrite() optimization:
// ** (1) Page X is journalled, and gets WRITEABLE and NEED_SEEK.
// ** (2) Page X moved to freelist, WRITEABLE is cleared
// ** (3) Page X reused, WRITEABLE is set again
// ** If NEED_SYNC had been cleared in step 2, then it would not be reset
// ** in step 3, and page might be written into the database without first
// ** syncing the rollback journal, which might cause corruption on a power
// ** loss.
// **
// ** Another example is when the database page size is smaller than the
// ** disk sector size. When any page of a sector is journalled, all pages
// ** in that sector are marked NEED_SYNC even if they are still CLEAN, just
// ** in case they are later modified, since all pages in the same sector
// ** must be journalled and synced before any of those pages can be safely
// ** written.
// */
// return 1;
// }
// #endif /* SQLITE_DEBUG */
/********************************** Linked List Management ********************/

/* Allowed values for second argument to pcacheManageDirtyList() */
#define PCACHE_DIRTYLIST_REMOVE 1 /* Remove pPage from dirty list */
#define PCACHE_DIRTYLIST_ADD    2 /* Add pPage to the dirty list */
#define PCACHE_DIRTYLIST_FRONT  3 /* Move pPage to the front of the list */

/*
** Manage pPage's participation on the dirty list.  Bits of the addRemove
** argument determine what operation to do.  The 0x01 bit means first
** remove pPage from the dirty list.  The 0x02 means add pPage back to
** the dirty list.  Doing both moves pPage to the front of the dirty list.
*/
static void pcacheManageDirtyList(PgHdr *pPage, u8 addRemove) {
  PCache *p = pPage->pCache;

  pcacheTrace(("%p.DIRTYLIST.%s %d\n", p,
               addRemove == 1 ? "REMOVE" : addRemove == 2 ? "ADD" : "FRONT",
               pPage->pgno));
  if (addRemove & PCACHE_DIRTYLIST_REMOVE) {
    assert(pPage->pDirtyNext || pPage == p->pDirtyTail);
    assert(pPage->pDirtyPrev || pPage == p->pDirty);

    /* Update the PCache1.pSynced variable if necessary. */
    if (p->pSynced == pPage) {
      p->pSynced = pPage->pDirtyPrev;
    }

    if (pPage->pDirtyNext) {
      pPage->pDirtyNext->pDirtyPrev = pPage->pDirtyPrev;
    } else {
      assert(pPage == p->pDirtyTail);
      p->pDirtyTail = pPage->pDirtyPrev;
    }
    if (pPage->pDirtyPrev) {
      pPage->pDirtyPrev->pDirtyNext = pPage->pDirtyNext;
    } else {
      /* If there are now no dirty pages in the cache, set eCreate to 2.
      ** This is an optimization that allows sqlite3PcacheFetch() to skip
      ** searching for a dirty page to eject from the cache when it might
      ** otherwise have to. */
      assert(pPage == p->pDirty);
      p->pDirty = pPage->pDirtyNext;
      assert(p->bPurgeable || p->eCreate == 2);
      if (p->pDirty == 0) { /*OPTIMIZATION-IF-TRUE*/
        assert(p->bPurgeable == 0 || p->eCreate == 1);
        p->eCreate = 2;
      }
    }
  }
  if (addRemove & PCACHE_DIRTYLIST_ADD) {
    pPage->pDirtyPrev = 0;
    pPage->pDirtyNext = p->pDirty;
    if (pPage->pDirtyNext) {
      assert(pPage->pDirtyNext->pDirtyPrev == 0);
      pPage->pDirtyNext->pDirtyPrev = pPage;
    } else {
      p->pDirtyTail = pPage;
      if (p->bPurgeable) {
        assert(p->eCreate == 2);
        p->eCreate = 1;
      }
    }
    p->pDirty = pPage;

    /* If pSynced is NULL and this page has a clear NEED_SYNC flag, set
    ** pSynced to point to it. Checking the NEED_SYNC flag is an
    ** optimization, as if pSynced points to a page with the NEED_SYNC
    ** flag set sqlite3PcacheFetchStress() searches through all newer
    ** entries of the dirty-list for a page with NEED_SYNC clear anyway. */
    if (!p->pSynced && 0 == (pPage->flags & PGHDR_NEED_SYNC) /*OPTIMIZATION-IF-FALSE*/) {
      p->pSynced = pPage;
    }
  }
  pcacheDump(p);
}

/*
** Wrapper around the pluggable cache's xUnpin method. If the cache is
** being used for an in-memory database, this function is a no-op.
*/
static void pcacheUnpin(PgHdr *p) {
  if (p->pCache->bPurgeable) {
    pcacheTrace(("%p.UNPIN %d\n", p->pCache, p->pgno));
    pcache2.xUnpin(p->pCache->pCache, p->pPage, 0);
    pcacheDump(p->pCache);
  }
}

/*
** Compute the number of pages of cache requested.  p->szCache is the
** cache size requested by the "PRAGMA cache_size" statement.
*/
static int numberOfCachePages(PCache *p) {
  if (p->szCache >= 0) {
    /* IMPLEMENTATION-OF: R-42059-47211 If the argument N is positive then the
    ** suggested cache size is set to N. */
    return p->szCache;
  } else {
    i64 n;
    /* IMPLEMENTATION-OF: R-59858-46238 If the argument N is negative, then the
    ** number of cache pages is adjusted to be a number of pages that would
    ** use approximately abs(N*1024) bytes of memory based on the current
    ** page size. */
    n = ((-1024 * (i64)p->szCache) / (p->szPage + p->szExtra));
    if (n > 1000000000) n = 1000000000;
    return (int)n;
  }
}
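/* Worked example of the negative-N branch above (illustrative sizes, not
** taken from this commit): with "PRAGMA cache_size=-2000" (szCache == -2000)
** and szPage+szExtra == 4096+136 == 4232 bytes per page,
**
**     n = (-1024 * -2000) / 4232 = 2048000 / 4232 = 483 pages,
**
** i.e. roughly the requested 2000 KiB of cache. */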
/*************************************************** General Interfaces ******
**
** Initialize and shutdown the page cache subsystem. Neither of these
** functions is threadsafe.
*/
int sqlite3PcacheInitialize(void) {
  return pcache2.xInit(pcache2.pArg);
}
void sqlite3PcacheShutdown(void) {
  if (pcache2.xShutdown) {
    /* IMPLEMENTATION-OF: R-26000-56589 The xShutdown() method may be NULL. */
    pcache2.xShutdown(pcache2.pArg);
  }
}

/*
** Return the size in bytes of a PCache object.
*/
int sqlite3PcacheSize(void) { return sizeof(PCache); }

/*
** Create a new PCache object. Storage space to hold the object
** has already been allocated and is passed in as the p pointer.
** The caller discovers how much space needs to be allocated by
** calling sqlite3PcacheSize().
**
** szExtra is some extra space allocated for each page.  The first
** 8 bytes of the extra space will be zeroed as the page is allocated,
** but remaining content will be uninitialized.  Though it is opaque
** to this module, the extra space really ends up being the MemPage
** structure in the pager.
*/
int sqlite3PcacheOpen(
    int szPage,                      /* Size of every page */
    int szExtra,                     /* Extra space associated with each page */
    int bPurgeable,                  /* True if pages are on backing store */
    int (*xStress)(void *, PgHdr *), /* Call to try to make pages clean */
    void *pStress,                   /* Argument to xStress */
    PCache *p                        /* Preallocated space for the PCache */
) {
  memset(p, 0, sizeof(PCache));
  p->szPage = 1;
  p->szExtra = szExtra;
  assert(szExtra >= 8); /* First 8 bytes will be zeroed */
  p->bPurgeable = bPurgeable;
  p->eCreate = 2;
  p->xStress = xStress;
  p->pStress = pStress;
  p->szCache = 100;
  p->szSpill = 1;

  pcacheTrace(("%p.OPEN szPage %d bPurgeable %d\n", p, szPage, bPurgeable));
  return sqlite3PcacheSetPageSize(p, szPage);
}

/*
** Change the page size for PCache object. The caller must ensure that there
** are no outstanding page references when this function is called.
*/
int sqlite3PcacheSetPageSize(PCache *pCache, int szPage) {
  assert(pCache->nRefSum == 0 && pCache->pDirty == 0);
  if (pCache->szPage) {
    sqlite3_pcache *pNew;
    pNew = pcache2.xCreate(szPage, pCache->szExtra + ROUND8(sizeof(PgHdr)), pCache->bPurgeable);
    if (pNew == 0) return SQLITE_NOMEM;
    pcache2.xCachesize(pNew, numberOfCachePages(pCache));
    if (pCache->pCache) {
      pcache2.xDestroy(pCache->pCache);
    }
    pCache->pCache = pNew;
    pCache->szPage = szPage;
    pcacheTrace(("%p.PAGESIZE %d\n", pCache, szPage));
  }
  return 0;
}

/*
** Try to obtain a page from the cache.
**
** This routine returns a pointer to an sqlite3_pcache_page object if
** such an object is already in cache, or if a new one is created.
** This routine returns a NULL pointer if the object was not in cache
** and could not be created.
**
** The createFlag should be 0 to check for existing pages and should
** be 3 (not 1, but 3) to try to create a new page.
**
** If the createFlag is 0, then NULL is always returned if the page
** is not already in the cache.  If createFlag is 1, then a new page
** is created only if that can be done without spilling dirty pages
** and without exceeding the cache size limit.
**
** The caller needs to invoke sqlite3PcacheFetchFinish() to properly
** initialize the sqlite3_pcache_page object and convert it into a
** PgHdr object.  The sqlite3PcacheFetch() and sqlite3PcacheFetchFinish()
** routines are split this way for performance reasons.  When separated
** they can both (usually) operate without having to push values to
** the stack on entry and pop them back off on exit, which saves a
** lot of pushing and popping.
*/
sqlite3_pcache_page *sqlite3PcacheFetch(
    PCache *pCache, /* Obtain the page from this cache */
    Pgno pgno,      /* Page number to obtain */
    int createFlag  /* If true, create page if it does not exist already */
) {
  int eCreate;
  sqlite3_pcache_page *pRes;

  assert(pCache != 0);
  assert(pCache->pCache != 0);
  assert(createFlag == 3 || createFlag == 0);
  assert(pCache->eCreate == ((pCache->bPurgeable && pCache->pDirty) ? 1 : 2));

  /* eCreate defines what to do if the page does not exist.
  **    0     Do not allocate a new page.  (createFlag==0)
  **    1     Allocate a new page if doing so is inexpensive.
  **          (createFlag==1 AND bPurgeable AND pDirty)
  **    2     Allocate a new page even if doing so is difficult.
  **          (createFlag==1 AND !(bPurgeable AND pDirty))
  */
  eCreate = createFlag & pCache->eCreate;
  assert(eCreate == 0 || eCreate == 1 || eCreate == 2);
  assert(createFlag == 0 || pCache->eCreate == eCreate);
  assert(createFlag == 0 || eCreate == 1 + (!pCache->bPurgeable || !pCache->pDirty));
  pRes = pcache2.xFetch(pCache->pCache, pgno, eCreate);
  pcacheTrace(("%p.FETCH %d%s (result: %p)\n", pCache, pgno, createFlag ? " create" : "", pRes));
  return pRes;
}

/*
** If the sqlite3PcacheFetch() routine is unable to allocate a new
** page because no clean pages are available for reuse and the cache
** size limit has been reached, then this routine can be invoked to
** try harder to allocate a page.  This routine might invoke the stress
** callback to spill dirty pages to the journal.  It will then try to
** allocate the new page and will only fail to allocate a new page on
** an OOM error.
**
** This routine should be invoked only after sqlite3PcacheFetch() fails.
*/
int sqlite3PcacheFetchStress(
    PCache *pCache,              /* Obtain the page from this cache */
    Pgno pgno,                   /* Page number to obtain */
    sqlite3_pcache_page **ppPage /* Write result here */
) {
  PgHdr *pPg;
  if (pCache->eCreate == 2) return 0;

  if (sqlite3PcachePagecount(pCache) > pCache->szSpill) {
    /* Find a dirty page to write-out and recycle. First try to find a
    ** page that does not require a journal-sync (one with PGHDR_NEED_SYNC
    ** cleared), but if that is not possible settle for any other
    ** unreferenced dirty page.
    **
    ** If the LRU page in the dirty list that has a clear PGHDR_NEED_SYNC
    ** flag is currently referenced, then the following may leave pSynced
    ** set incorrectly (pointing to other than the LRU page with NEED_SYNC
    ** cleared). This is Ok, as pSynced is just an optimization. */
    for (pPg = pCache->pSynced; pPg && (pPg->nRef || (pPg->flags & PGHDR_NEED_SYNC)); pPg = pPg->pDirtyPrev)
      ;
    pCache->pSynced = pPg;
    if (!pPg) {
      for (pPg = pCache->pDirtyTail; pPg && pPg->nRef; pPg = pPg->pDirtyPrev)
        ;
    }
    if (pPg) {
      int rc;
#ifdef SQLITE_LOG_CACHE_SPILL
      sqlite3_log(SQLITE_FULL, "spill page %d making room for %d - cache used: %d/%d", pPg->pgno, pgno,
                  pcache2.xPagecount(pCache->pCache), numberOfCachePages(pCache));
#endif
      pcacheTrace(("%p.SPILL %d\n", pCache, pPg->pgno));
      rc = pCache->xStress(pCache->pStress, pPg);
      pcacheDump(pCache);
      if (rc != 0 && rc != SQLITE_BUSY) {
        return rc;
      }
    }
  }
  *ppPage = pcache2.xFetch(pCache->pCache, pgno, 2);
  return *ppPage == 0 ? SQLITE_NOMEM : 0;
}

/*
** This is a helper routine for sqlite3PcacheFetchFinish().
**
** In the uncommon case where the page being fetched has not been
** initialized, this routine is invoked to do the initialization.
** This routine is broken out into a separate function since it
** requires extra stack manipulation that can be avoided in the common
** case.
*/
static PgHdr *pcacheFetchFinishWithInit(
    PCache *pCache,            /* Obtain the page from this cache */
    Pgno pgno,                 /* Page number obtained */
    sqlite3_pcache_page *pPage /* Page obtained by prior PcacheFetch() call */
) {
  PgHdr *pPgHdr;
  assert(pPage != 0);
  pPgHdr = (PgHdr *)pPage->pExtra;
  assert(pPgHdr->pPage == 0);
  memset(&pPgHdr->pDirty, 0, sizeof(PgHdr) - offsetof(PgHdr, pDirty));
  pPgHdr->pPage = pPage;
  pPgHdr->pData = pPage->pBuf;
  pPgHdr->pExtra = (void *)&pPgHdr[1];
  memset(pPgHdr->pExtra, 0, 8);
  pPgHdr->pCache = pCache;
  pPgHdr->pgno = pgno;
  pPgHdr->flags = PGHDR_CLEAN;
  return sqlite3PcacheFetchFinish(pCache, pgno, pPage);
}

/*
** This routine converts the sqlite3_pcache_page object returned by
** sqlite3PcacheFetch() into an initialized PgHdr object.  This routine
** must be called after sqlite3PcacheFetch() in order to get a usable
** result.
*/
PgHdr *sqlite3PcacheFetchFinish(
    PCache *pCache,            /* Obtain the page from this cache */
    Pgno pgno,                 /* Page number obtained */
    sqlite3_pcache_page *pPage /* Page obtained by prior PcacheFetch() call */
) {
  PgHdr *pPgHdr;

  assert(pPage != 0);
  pPgHdr = (PgHdr *)pPage->pExtra;

  if (!pPgHdr->pPage) {
    return pcacheFetchFinishWithInit(pCache, pgno, pPage);
  }
  pCache->nRefSum++;
  pPgHdr->nRef++;
  // assert(sqlite3PcachePageSanity(pPgHdr));
  return pPgHdr;
}

/*
** Decrement the reference count on a page.  If the page is clean and the
** reference count drops to 0, then it is made eligible for recycling.
*/
void sqlite3PcacheRelease(PgHdr *p) {
  assert(p->nRef > 0);
  p->pCache->nRefSum--;
  if ((--p->nRef) == 0) {
    if (p->flags & PGHDR_CLEAN) {
      pcacheUnpin(p);
    } else {
      pcacheManageDirtyList(p, PCACHE_DIRTYLIST_FRONT);
    }
  }
}

/*
** Increase the reference count of a supplied page by 1.
*/
void sqlite3PcacheRef(PgHdr *p) {
  assert(p->nRef > 0);
  // assert(sqlite3PcachePageSanity(p));
  p->nRef++;
  p->pCache->nRefSum++;
}

/*
** Drop a page from the cache. There must be exactly one reference to the
** page. This function deletes that reference, so after it returns the
** page pointed to by p is invalid.
*/
void sqlite3PcacheDrop(PgHdr *p) {
  assert(p->nRef == 1);
  // assert(sqlite3PcachePageSanity(p));
  if (p->flags & PGHDR_DIRTY) {
    pcacheManageDirtyList(p, PCACHE_DIRTYLIST_REMOVE);
  }
  p->pCache->nRefSum--;
  pcache2.xUnpin(p->pCache->pCache, p->pPage, 1);
}

/*
** Make sure the page is marked as dirty. If it isn't dirty already,
** make it so.
*/
void sqlite3PcacheMakeDirty(PgHdr *p) {
  assert(p->nRef > 0);
  // assert(sqlite3PcachePageSanity(p));
  if (p->flags & (PGHDR_CLEAN | PGHDR_DONT_WRITE)) { /*OPTIMIZATION-IF-FALSE*/
    p->flags &= ~PGHDR_DONT_WRITE;
    if (p->flags & PGHDR_CLEAN) {
      p->flags ^= (PGHDR_DIRTY | PGHDR_CLEAN);
      pcacheTrace(("%p.DIRTY %d\n", p->pCache, p->pgno));
      assert((p->flags & (PGHDR_DIRTY | PGHDR_CLEAN)) == PGHDR_DIRTY);
      pcacheManageDirtyList(p, PCACHE_DIRTYLIST_ADD);
    }
    // assert(sqlite3PcachePageSanity(p));
  }
}

/*
** Make sure the page is marked as clean. If it isn't clean already,
** make it so.
*/
void sqlite3PcacheMakeClean(PgHdr *p) {
  // assert(sqlite3PcachePageSanity(p));
  assert((p->flags & PGHDR_DIRTY) != 0);
  assert((p->flags & PGHDR_CLEAN) == 0);
  pcacheManageDirtyList(p, PCACHE_DIRTYLIST_REMOVE);
  p->flags &= ~(PGHDR_DIRTY | PGHDR_NEED_SYNC | PGHDR_WRITEABLE);
  p->flags |= PGHDR_CLEAN;
  pcacheTrace(("%p.CLEAN %d\n", p->pCache, p->pgno));
  // assert(sqlite3PcachePageSanity(p));
  if (p->nRef == 0) {
    pcacheUnpin(p);
  }
}

/*
** Make every page in the cache clean.
*/
void sqlite3PcacheCleanAll(PCache *pCache) {
  PgHdr *p;
  pcacheTrace(("%p.CLEAN-ALL\n", pCache));
  while ((p = pCache->pDirty) != 0) {
    sqlite3PcacheMakeClean(p);
  }
}

/*
** Clear the PGHDR_NEED_SYNC and PGHDR_WRITEABLE flag from all dirty pages.
*/
void sqlite3PcacheClearWritable(PCache *pCache) {
  PgHdr *p;
  pcacheTrace(("%p.CLEAR-WRITEABLE\n", pCache));
  for (p = pCache->pDirty; p; p = p->pDirtyNext) {
    p->flags &= ~(PGHDR_NEED_SYNC | PGHDR_WRITEABLE);
  }
  pCache->pSynced = pCache->pDirtyTail;
}

/*
** Clear the PGHDR_NEED_SYNC flag from all dirty pages.
*/
void sqlite3PcacheClearSyncFlags(PCache *pCache) {
  PgHdr *p;
  for (p = pCache->pDirty; p; p = p->pDirtyNext) {
    p->flags &= ~PGHDR_NEED_SYNC;
  }
  pCache->pSynced = pCache->pDirtyTail;
}

/*
** Change the page number of page p to newPgno.
*/
void sqlite3PcacheMove(PgHdr *p, Pgno newPgno) {
  PCache *pCache = p->pCache;
  assert(p->nRef > 0);
  assert(newPgno > 0);
  // assert(sqlite3PcachePageSanity(p));
  pcacheTrace(("%p.MOVE %d -> %d\n", pCache, p->pgno, newPgno));
  pcache2.xRekey(pCache->pCache, p->pPage, p->pgno, newPgno);
  p->pgno = newPgno;
  if ((p->flags & PGHDR_DIRTY) && (p->flags & PGHDR_NEED_SYNC)) {
    pcacheManageDirtyList(p, PCACHE_DIRTYLIST_FRONT);
  }
}

/*
** Drop every cache entry whose page number is greater than "pgno". The
** caller must ensure that there are no outstanding references to any pages
** other than page 1 with a page number greater than pgno.
**
** If there is a reference to page 1 and the pgno parameter passed to this
** function is 0, then the data area associated with page 1 is zeroed, but
** the page object is not dropped.
*/
void sqlite3PcacheTruncate(PCache *pCache, Pgno pgno) {
  if (pCache->pCache) {
    PgHdr *p;
    PgHdr *pNext;
    pcacheTrace(("%p.TRUNCATE %d\n", pCache, pgno));
    for (p = pCache->pDirty; p; p = pNext) {
      pNext = p->pDirtyNext;
      /* This routine never gets called with a positive pgno except right
      ** after sqlite3PcacheCleanAll().  So if there are dirty pages,
      ** it must be that pgno==0.
      */
      assert(p->pgno > 0);
      if (p->pgno > pgno) {
        assert(p->flags & PGHDR_DIRTY);
        sqlite3PcacheMakeClean(p);
      }
    }
    if (pgno == 0 && pCache->nRefSum) {
      sqlite3_pcache_page *pPage1;
      pPage1 = pcache2.xFetch(pCache->pCache, 1, 0);
      if (pPage1) { /* Page 1 is always available in cache, because
                    ** pCache->nRefSum>0 */
        memset(pPage1->pBuf, 0, pCache->szPage);
        pgno = 1;
      }
    }
    pcache2.xTruncate(pCache->pCache, pgno + 1);
  }
}

/*
** Close a cache.
*/
void sqlite3PcacheClose(PCache *pCache) {
  assert(pCache->pCache != 0);
  pcacheTrace(("%p.CLOSE\n", pCache));
  pcache2.xDestroy(pCache->pCache);
}

/*
** Discard the contents of the cache.
*/
void sqlite3PcacheClear(PCache *pCache) { sqlite3PcacheTruncate(pCache, 0); }

/*
** Merge two lists of pages connected by pDirty and in pgno order.
** Do not bother fixing the pDirtyPrev pointers.
*/
static PgHdr *pcacheMergeDirtyList(PgHdr *pA, PgHdr *pB) {
  PgHdr result, *pTail;
  pTail = &result;
  assert(pA != 0 && pB != 0);
  for (;;) {
    if (pA->pgno < pB->pgno) {
      pTail->pDirty = pA;
      pTail = pA;
      pA = pA->pDirty;
      if (pA == 0) {
        pTail->pDirty = pB;
        break;
      }
    } else {
      pTail->pDirty = pB;
      pTail = pB;
      pB = pB->pDirty;
      if (pB == 0) {
        pTail->pDirty = pA;
        break;
      }
    }
  }
  return result.pDirty;
}

/*
** Sort the list of pages in ascending order by pgno.  Pages are
** connected by pDirty pointers.  The pDirtyPrev pointers are
** corrupted by this sort.
**
** Since there cannot be more than 2^31 distinct pages in a database,
** there cannot be more than 31 buckets required by the merge sorter.
** One extra bucket is added to catch overflow in case something
** ever changes to make the previous sentence incorrect.
*/
#define N_SORT_BUCKET 32
static PgHdr *pcacheSortDirtyList(PgHdr *pIn) {
  PgHdr *a[N_SORT_BUCKET], *p;
  int i;
  memset(a, 0, sizeof(a));
  while (pIn) {
    p = pIn;
    pIn = p->pDirty;
    p->pDirty = 0;
    for (i = 0; i < N_SORT_BUCKET - 1; i++) {
      if (a[i] == 0) {
        a[i] = p;
        break;
      } else {
        p = pcacheMergeDirtyList(a[i], p);
        a[i] = 0;
      }
    }
    if (i == N_SORT_BUCKET - 1) {
      /* To get here, there need to be 2^(N_SORT_BUCKET) elements in
      ** the input list.  But that is impossible.
      */
      a[i] = pcacheMergeDirtyList(a[i], p);
    }
  }
  p = a[0];
  for (i = 1; i < N_SORT_BUCKET; i++) {
    if (a[i] == 0) continue;
    p = p ? pcacheMergeDirtyList(p, a[i]) : a[i];
  }
  return p;
}

/*
** Return a list of all dirty pages in the cache, sorted by page number.
*/
PgHdr *sqlite3PcacheDirtyList(PCache *pCache) {
  PgHdr *p;
  for (p = pCache->pDirty; p; p = p->pDirtyNext) {
    p->pDirty = p->pDirtyNext;
  }
  return pcacheSortDirtyList(pCache->pDirty);
}

/*
** Return the total number of references to all pages held by the cache.
**
** This is not the total number of pages referenced, but the sum of the
** reference count for all pages.
*/
int sqlite3PcacheRefCount(PCache *pCache) { return pCache->nRefSum; }

/*
** Return the number of references to the page supplied as an argument.
*/
int sqlite3PcachePageRefcount(PgHdr *p) { return p->nRef; }

/*
** Return the total number of pages in the cache.
*/
int sqlite3PcachePagecount(PCache *pCache) {
  assert(pCache->pCache != 0);
  return pcache2.xPagecount(pCache->pCache);
}

#ifdef SQLITE_TEST
/*
** Get the suggested cache-size value.
*/
int sqlite3PcacheGetCachesize(PCache *pCache) { return numberOfCachePages(pCache); }
#endif

/*
** Set the suggested cache-size value.
*/
void sqlite3PcacheSetCachesize(PCache *pCache, int mxPage) {
  assert(pCache->pCache != 0);
  pCache->szCache = mxPage;
  pcache2.xCachesize(pCache->pCache, numberOfCachePages(pCache));
}

/*
** Set the suggested cache-spill value.  Make no changes if the
** argument is zero.  Return the effective cache-spill size, which will
** be the larger of the szSpill and szCache.
*/
int sqlite3PcacheSetSpillsize(PCache *p, int mxPage) {
  int res;
  assert(p->pCache != 0);
  if (mxPage) {
    if (mxPage < 0) {
      mxPage = (int)((-1024 * (i64)mxPage) / (p->szPage + p->szExtra));
    }
    p->szSpill = mxPage;
  }
  res = numberOfCachePages(p);
  if (res < p->szSpill) res = p->szSpill;
  return res;
}

/*
** Free up as much memory as possible from the page cache.
*/
void sqlite3PcacheShrink(PCache *pCache) {
  assert(pCache->pCache != 0);
  pcache2.xShrink(pCache->pCache);
}

/*
** Return the size of the header added by this middleware layer
** in the page-cache hierarchy.
*/
int sqlite3HeaderSizePcache(void) { return ROUND8(sizeof(PgHdr)); }

/*
** Return the number of dirty pages currently in the cache, as a percentage
** of the configured cache size.
*/
int sqlite3PCachePercentDirty(PCache *pCache) {
  PgHdr *pDirty;
  int nDirty = 0;
  int nCache = numberOfCachePages(pCache);
  for (pDirty = pCache->pDirty; pDirty; pDirty = pDirty->pDirtyNext) nDirty++;
  return nCache ? (int)(((i64)nDirty * 100) / nCache) : 0;
}

#ifdef SQLITE_DIRECT_OVERFLOW_READ
/*
** Return true if there are one or more dirty pages in the cache. Else false.
*/
int sqlite3PCacheIsDirty(PCache *pCache) { return (pCache->pDirty != 0); }
#endif

#if defined(SQLITE_CHECK_PAGES) || defined(SQLITE_DEBUG)
/*
** For all dirty pages currently in the cache, invoke the specified
** callback. This is only used if the SQLITE_CHECK_PAGES macro is
** defined.
*/
void sqlite3PcacheIterateDirty(PCache *pCache, void (*xIter)(PgHdr *)) {
  PgHdr *pDirty;
  for (pDirty = pCache->pDirty; pDirty; pDirty = pDirty->pDirtyNext) {
    xIter(pDirty);
  }
}
#endif
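pcacheSortDirtyList() above is a textbook bucket merge sort for linked lists:
each incoming node is merged upward through up to log2(N) sorted sublists, and
the buckets are merged together at the end. A self-contained sketch of the
same algorithm on a toy node type follows; Node and key are hypothetical
names, while the deleted code applies this to PgHdr.pDirty and pgno.

#include <stdio.h>
#include <string.h>

#define N_BUCKET 32

typedef struct Node {
  int key;
  struct Node *pNext;
} Node;

/* Merge two already-sorted lists into one sorted list. */
static Node *mergeSorted(Node *pA, Node *pB) {
  Node head, *pTail = &head;
  while (pA && pB) {
    if (pA->key < pB->key) { pTail->pNext = pA; pTail = pA; pA = pA->pNext; }
    else                   { pTail->pNext = pB; pTail = pB; pB = pB->pNext; }
  }
  pTail->pNext = pA ? pA : pB;
  return head.pNext;
}

/* Bucket merge sort: bucket a[i] holds a sorted sublist of 2^i nodes. */
static Node *sortList(Node *pIn) {
  Node *a[N_BUCKET], *p;
  int i;
  memset(a, 0, sizeof(a));
  while (pIn) {
    p = pIn;
    pIn = p->pNext;
    p->pNext = 0;
    /* Carry p upward, merging with each occupied bucket on the way. */
    for (i = 0; i < N_BUCKET - 1 && a[i]; i++) {
      p = mergeSorted(a[i], p);
      a[i] = 0;
    }
    a[i] = a[i] ? mergeSorted(a[i], p) : p;
  }
  p = 0;
  for (i = 0; i < N_BUCKET; i++) {
    if (a[i]) p = p ? mergeSorted(p, a[i]) : a[i];
  }
  return p;
}

int main(void) {
  Node n[5] = {{3, 0}, {1, 0}, {4, 0}, {1, 0}, {5, 0}};
  Node *list = 0, *q;
  int i;
  for (i = 0; i < 5; i++) { n[i].pNext = list; list = &n[i]; }
  for (q = sortList(list); q; q = q->pNext) printf("%d ", q->key);
  printf("\n"); /* prints: 1 1 3 4 5 */
  return 0;
}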
source/libs/tdb/src/sqlite/pcache1.c (deleted, 100644 → 0)
/*
** 2008 November 05
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
**
** This file implements the default page cache implementation (the
** sqlite3_pcache interface). It also contains part of the implementation
** of the SQLITE_CONFIG_PAGECACHE and sqlite3_release_memory() features.
** If the default page cache implementation is overridden, then neither of
** these two features are available.
**
** A Page cache line looks like this:
**
** -------------------------------------------------------------
** | database page content | PgHdr1 | MemPage | PgHdr |
** -------------------------------------------------------------
**
** The database page content is up front (so that buffer overreads tend to
** flow harmlessly into the PgHdr1, MemPage, and PgHdr extensions). MemPage
** is the extension added by the btree.c module containing information such
** as the database page number and how that database page is used. PgHdr
** is added by the pcache.c layer and contains information used to keep track
** of which pages are "dirty". PgHdr1 is an extension added by this
** module (pcache1.c). The PgHdr1 header is a subclass of sqlite3_pcache_page.
** PgHdr1 contains information needed to look up a page by its page number.
** The superclass sqlite3_pcache_page.pBuf points to the start of the
** database page content and sqlite3_pcache_page.pExtra points to PgHdr.
**
** The size of the extension (MemPage+PgHdr+PgHdr1) can be determined at
** runtime using sqlite3_config(SQLITE_CONFIG_PCACHE_HDRSZ, &size). The
** sizes of the extensions sum to 272 bytes on x64 for 3.8.10, but this
** size can vary according to architecture, compile-time options, and
** SQLite library version number.
**
** If SQLITE_PCACHE_SEPARATE_HEADER is defined, then the extension is obtained
** using a separate memory allocation from the database page content. This
** seeks to overcome the "clownshoe" problem (also called "internal
** fragmentation" in academic literature) of allocating a few bytes more
** than a power of two with the memory allocator rounding up to the next
** power of two, and leaving the rounded-up space unused.
**
** This module tracks pointers to PgHdr1 objects. Only pcache.c communicates
** with this module. Information is passed back and forth as PgHdr1 pointers.
**
** The pcache.c and pager.c modules deal with pointers to PgHdr objects.
** The btree.c module deals with pointers to MemPage objects.
**
** SOURCE OF PAGE CACHE MEMORY:
**
** Memory for a page might come from any of three sources:
**
** (1) The general-purpose memory allocator - sqlite3Malloc()
** (2) Global page-cache memory provided using sqlite3_config() with
** SQLITE_CONFIG_PAGECACHE.
** (3) PCache-local bulk allocation.
**
** The third case is a chunk of heap memory (defaulting to 100 pages worth)
** that is allocated when the page cache is created. The size of the local
** bulk allocation can be adjusted using
**
** sqlite3_config(SQLITE_CONFIG_PAGECACHE, (void*)0, 0, N).
**
** If N is positive, then N pages worth of memory are allocated using a single
** sqlite3Malloc() call and that memory is used for the first N pages allocated.
** Or if N is negative, then -1024*N bytes of memory are allocated and used
** for as many pages as can be accommodated.
**
** Only one of (2) or (3) can be used. Once the memory available to (2) or
** (3) is exhausted, subsequent allocations fail over to the general-purpose
** memory allocator (1).
**
** Earlier versions of SQLite used only methods (1) and (2). But experiments
** show that method (3) with N==100 provides about a 5% performance boost for
** common workloads.
*/
#include "sqliteInt.h"
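/* Illustration of the cache-line picture above, as it is realized by
** pcache1InitBulk() further down in this file: each line is szAlloc bytes,
** the PgHdr1 header sits at offset szPage, and the extra space (MemPage,
** then PgHdr) follows immediately after it:
**
**    zBulk --> | page content (szPage bytes) | PgHdr1 | MemPage | PgHdr |
**              pX = (PgHdr1 *)&zBulk[szPage];
**              pX->page.pBuf   = zBulk;
**              pX->page.pExtra = &pX[1];
**
** Exact offsets vary with architecture and compile-time options, as the
** header comment notes; this sketch only shows the ordering. */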
typedef struct PCache1 PCache1;
typedef struct PgHdr1 PgHdr1;
typedef struct PgFreeslot PgFreeslot;
typedef struct PGroup PGroup;

/*
** Each cache entry is represented by an instance of the following
** structure. Unless SQLITE_PCACHE_SEPARATE_HEADER is defined, a buffer of
** PgHdr1.pCache->szPage bytes is allocated directly before this structure
** in memory.
**
** Note: Variables isBulkLocal and isAnchor were once type "u8". That works,
** but causes a 2-byte gap in the structure for most architectures (since
** pointers must be either 4 or 8-byte aligned). As this structure is located
** in memory directly after the associated page data, if the database is
** corrupt, code at the b-tree layer may overread the page buffer and
** read part of this structure before the corruption is detected. This
** can cause a valgrind error if the uninitialized gap is accessed. Using u16
** ensures there is no such gap, and therefore no bytes of uninitialized memory
** in the structure.
*/
struct PgHdr1 {
  sqlite3_pcache_page page; /* Base class. Must be first. pBuf & pExtra */
  unsigned int iKey;        /* Key value (page number) */
  u16 isBulkLocal;          /* This page from bulk local storage */
  u16 isAnchor;             /* This is the PGroup.lru element */
  PgHdr1 *pNext;            /* Next in hash table chain */
  PCache1 *pCache;          /* Cache that currently owns this page */
  PgHdr1 *pLruNext;         /* Next in LRU list of unpinned pages */
  PgHdr1 *pLruPrev;         /* Previous in LRU list of unpinned pages */
                            /* NB: pLruPrev is only valid if pLruNext!=0 */
};

/*
** A page is pinned if it is not on the LRU list. To be "pinned" means
** that the page is in active use and must not be deallocated.
*/
#define PAGE_IS_PINNED(p) ((p)->pLruNext == 0)
#define PAGE_IS_UNPINNED(p) ((p)->pLruNext != 0)

/* Each page cache (or PCache) belongs to a PGroup.  A PGroup is a set
** of one or more PCaches that are able to recycle each other's unpinned
** pages when they are under memory pressure.  A PGroup is an instance of
** the following object.
**
** This page cache implementation works in one of two modes:
**
**   (1)  Every PCache is the sole member of its own PGroup.  There is
**        one PGroup per PCache.
**
**   (2)  There is a single global PGroup that all PCaches are a member
**        of.
**
** Mode 1 uses more memory (since PCache instances are not able to rob
** unused pages from other PCaches) but it also operates without a mutex,
** and is therefore often faster.  Mode 2 requires a mutex in order to be
** threadsafe, but recycles pages more efficiently.
**
** For mode (1), PGroup.mutex is NULL.  For mode (2) there is only a single
** PGroup which is the pcache1.grp global variable and its mutex is
** SQLITE_MUTEX_STATIC_LRU.
*/
struct PGroup {
  pthread_mutex_t mutex;   /* MUTEX_STATIC_LRU or NULL */
  unsigned int nMaxPage;   /* Sum of nMax for purgeable caches */
  unsigned int nMinPage;   /* Sum of nMin for purgeable caches */
  unsigned int mxPinned;   /* nMaxpage + 10 - nMinPage */
  unsigned int nPurgeable; /* Number of purgeable pages allocated */
  PgHdr1 lru;              /* The beginning and end of the LRU list */
};

/* Each page cache is an instance of the following object.  Every
** open database file (including each in-memory database and each
** temporary or transient database) has a single page cache which
** is an instance of this object.
**
** Pointers to structures of this type are cast and returned as
** opaque sqlite3_pcache* handles.
*/
struct PCache1 {
  /* Cache configuration parameters. Page size (szPage) and the purgeable
  ** flag (bPurgeable) and the pnPurgeable pointer are all set when the
  ** cache is created and are never changed thereafter. nMax may be
  ** modified at any time by a call to the pcache1Cachesize() method.
  ** The PGroup mutex must be held when accessing nMax.
  */
  PGroup *pGroup;                /* PGroup this cache belongs to */
  unsigned int *pnPurgeable;     /* Pointer to pGroup->nPurgeable */
  int szPage;                    /* Size of database content section */
  int szExtra;                   /* sizeof(MemPage)+sizeof(PgHdr) */
  int szAlloc;                   /* Total size of one pcache line */
  int bPurgeable;                /* True if cache is purgeable */
  unsigned int nMin;             /* Minimum number of pages reserved */
  unsigned int nMax;             /* Configured "cache_size" value */
  unsigned int n90pct;           /* nMax*9/10 */
  unsigned int iMaxKey;          /* Largest key seen since xTruncate() */
  unsigned int nPurgeableDummy;  /* pnPurgeable points here when not used */

  /* Hash table of all pages. The following variables may only be accessed
  ** when the accessor is holding the PGroup mutex.
  */
  unsigned int nRecyclable; /* Number of pages in the LRU list */
  unsigned int nPage;       /* Total number of pages in apHash */
  unsigned int nHash;       /* Number of slots in apHash[] */
  PgHdr1 **apHash;          /* Hash table for fast lookup by key */
  PgHdr1 *pFree;            /* List of unused pcache-local pages */
  void *pBulk;              /* Bulk memory used by pcache-local */
};

/*
** Free slots in the allocator used to divide up the global page cache
** buffer provided using the SQLITE_CONFIG_PAGECACHE mechanism.
*/
struct PgFreeslot {
  PgFreeslot *pNext; /* Next free slot */
};

/*
** Global data used by this cache.
*/
static struct PCacheGlobal {
  PGroup grp; /* The global PGroup for mode (2) */

  /* Variables related to SQLITE_CONFIG_PAGECACHE settings.  The
  ** szSlot, nSlot, pStart, pEnd, nReserve, and isInit values are all
  ** fixed at sqlite3_initialize() time and do not require mutex protection.
  ** The nFreeSlot and pFree values do require mutex protection.
  */
  int isInit;           /* True if initialized */
  int separateCache;    /* Use a new PGroup for each PCache */
  int nInitPage;        /* Initial bulk allocation size */
  int szSlot;           /* Size of each free slot */
  int nSlot;            /* The number of pcache slots */
  int nReserve;         /* Try to keep nFreeSlot above this */
  void *pStart, *pEnd;  /* Bounds of global page cache memory */
  /* Above requires no mutex.  Use mutex below for variables that follow. */
  pthread_mutex_t mutex; /* Mutex for accessing the following: */
  PgFreeslot *pFree;     /* Free page blocks */
  int nFreeSlot;         /* Number of unused pcache slots */
  /* The following value requires a mutex to change.  We skip the mutex on
  ** reading because (1) most platforms read a 32-bit integer atomically and
  ** (2) even if an incorrect value is read, no great harm is done since this
  ** is really just an optimization. */
  int bUnderPressure; /* True if low on PAGECACHE memory */
} pcache1;

#define pcache1EnterMutex(X) pthread_mutex_lock(&((X)->mutex))
#define pcache1LeaveMutex(X) pthread_mutex_unlock(&((X)->mutex))
#define PCACHE1_MIGHT_USE_GROUP_MUTEX 1

/******************************************************************************/
/******** Page Allocation/SQLITE_CONFIG_PCACHE Related Functions **************/

/*
** This function is called during initialization if a static buffer is
** supplied to use for the page-cache by passing the SQLITE_CONFIG_PAGECACHE
** verb to sqlite3_config(). Parameter pBuf points to an allocation large
** enough to contain 'n' buffers of 'sz' bytes each.
**
** This routine is called from sqlite3_initialize() and so it is guaranteed
** to be serialized already. There is no need for further mutexing.
*/
void sqlite3PCacheBufferSetup(void *pBuf, int sz, int n) {
  if (pcache1.isInit) {
    PgFreeslot *p;
    if (pBuf == 0) sz = n = 0;
    if (n == 0) sz = 0;
    sz = ROUNDDOWN8(sz);
    pcache1.szSlot = sz;
    pcache1.nSlot = pcache1.nFreeSlot = n;
    pcache1.nReserve = n > 90 ? 10 : (n / 10 + 1);
    pcache1.pStart = pBuf;
    pcache1.pFree = 0;
    pcache1.bUnderPressure = 0;
    while (n--) {
      p = (PgFreeslot *)pBuf;
      p->pNext = pcache1.pFree;
      pcache1.pFree = p;
      pBuf = (void *)&((char *)pBuf)[sz];
    }
    pcache1.pEnd = pBuf;
  }
}
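/* Worked example of the carving above (illustrative numbers, not from this
** commit): sqlite3PCacheBufferSetup(pBuf, 4360, 100) keeps sz = 4360 (already
** a multiple of 8), threads 100 slots of 4360 bytes each onto pcache1.pFree,
** and sets pcache1.pEnd = (char *)pBuf + 100 * 4360. pcache1Alloc() and
** pcache1Free() below can then test pStart <= p < pEnd to tell slot memory
** apart from heap memory. */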
/*
** Try to initialize the pCache->pFree and pCache->pBulk fields. Return
** true if pCache->pFree ends up containing one or more free pages.
*/
static
int
pcache1InitBulk
(
PCache1
*
pCache
)
{
i64
szBulk
;
char
*
zBulk
;
if
(
pcache1
.
nInitPage
==
0
)
return
0
;
/* Do not bother with a bulk allocation if the cache size very small */
if
(
pCache
->
nMax
<
3
)
return
0
;
// sqlite3BeginBenignMalloc();
if
(
pcache1
.
nInitPage
>
0
)
{
szBulk
=
pCache
->
szAlloc
*
(
i64
)
pcache1
.
nInitPage
;
}
else
{
szBulk
=
-
1024
*
(
i64
)
pcache1
.
nInitPage
;
}
if
(
szBulk
>
pCache
->
szAlloc
*
(
i64
)
pCache
->
nMax
)
{
szBulk
=
pCache
->
szAlloc
*
(
i64
)
pCache
->
nMax
;
}
zBulk
=
pCache
->
pBulk
=
malloc
(
szBulk
);
// sqlite3EndBenignMalloc();
if
(
zBulk
)
{
int
nBulk
=
szBulk
/
pCache
->
szAlloc
;
do
{
PgHdr1
*
pX
=
(
PgHdr1
*
)(
&
zBulk
[
pCache
->
szPage
]);
pX
->
page
.
pBuf
=
zBulk
;
pX
->
page
.
pExtra
=
&
pX
[
1
];
pX
->
isBulkLocal
=
1
;
pX
->
isAnchor
=
0
;
pX
->
pNext
=
pCache
->
pFree
;
pX
->
pLruPrev
=
0
;
/* Initializing this saves a valgrind error */
pCache
->
pFree
=
pX
;
zBulk
+=
pCache
->
szAlloc
;
}
while
(
--
nBulk
);
}
return
pCache
->
pFree
!=
0
;
}
/*
** Malloc function used within this file to allocate space from the buffer
** configured using sqlite3_config(SQLITE_CONFIG_PAGECACHE) option. If no
** such buffer exists or there is no space left in it, this function falls
** back to sqlite3Malloc().
**
** Multiple threads can run this routine at the same time. Global variables
** in pcache1 need to be protected via mutex.
*/
static
void
*
pcache1Alloc
(
int
nByte
)
{
void
*
p
=
0
;
// assert(sqlite3_mutex_notheld(pcache1.grp.mutex));
if
(
nByte
<=
pcache1
.
szSlot
)
{
pthread_mutex_lock
(
&
(
pcache1
.
mutex
));
p
=
(
PgHdr1
*
)
pcache1
.
pFree
;
if
(
p
)
{
pcache1
.
pFree
=
pcache1
.
pFree
->
pNext
;
pcache1
.
nFreeSlot
--
;
pcache1
.
bUnderPressure
=
pcache1
.
nFreeSlot
<
pcache1
.
nReserve
;
assert
(
pcache1
.
nFreeSlot
>=
0
);
// sqlite3StatusHighwater(SQLITE_STATUS_PAGECACHE_SIZE, nByte);
// sqlite3StatusUp(SQLITE_STATUS_PAGECACHE_USED, 1);
}
pthread_mutex_unlock
(
&
pcache1
.
mutex
);
}
if
(
p
==
0
)
{
/* Memory is not available in the SQLITE_CONFIG_PAGECACHE pool. Get
** it from sqlite3Malloc instead.
*/
p
=
malloc
(
nByte
);
#ifndef SQLITE_DISABLE_PAGECACHE_OVERFLOW_STATS
if
(
p
)
{
int
sz
=
nByte
;
pthread_mutex_lock
(
&
pcache1
.
mutex
);
// sqlite3StatusHighwater(SQLITE_STATUS_PAGECACHE_SIZE, nByte);
// sqlite3StatusUp(SQLITE_STATUS_PAGECACHE_OVERFLOW, sz);
pthread_mutex_unlock
(
&
pcache1
.
mutex
);
}
#endif
// sqlite3MemdebugSetType(p, MEMTYPE_PCACHE);
}
return
p
;
}
/*
** Free an allocated buffer obtained from pcache1Alloc().
*/
static
void
pcache1Free
(
void
*
p
)
{
if
(
p
==
0
)
return
;
// if (SQLITE_WITHIN(p, pcache1.pStart, pcache1.pEnd)) {
if
(
p
>=
pcache1
.
pStart
&&
p
<
pcache1
.
pEnd
)
{
PgFreeslot
*
pSlot
;
pthread_mutex_lock
(
&
pcache1
.
mutex
);
// sqlite3StatusDown(SQLITE_STATUS_PAGECACHE_USED, 1);
pSlot
=
(
PgFreeslot
*
)
p
;
pSlot
->
pNext
=
pcache1
.
pFree
;
pcache1
.
pFree
=
pSlot
;
pcache1
.
nFreeSlot
++
;
pcache1
.
bUnderPressure
=
pcache1
.
nFreeSlot
<
pcache1
.
nReserve
;
assert
(
pcache1
.
nFreeSlot
<=
pcache1
.
nSlot
);
pthread_mutex_unlock
(
&
pcache1
.
mutex
);
}
else
{
// assert(sqlite3MemdebugHasType(p, MEMTYPE_PCACHE));
// sqlite3MemdebugSetType(p, MEMTYPE_HEAP);
#ifndef SQLITE_DISABLE_PAGECACHE_OVERFLOW_STATS
{
int
nFreed
=
0
;
// nFreed = sqlite3MallocSize(p);
pthread_mutex_lock
(
&
pcache1
.
mutex
);
// sqlite3StatusDown(SQLITE_STATUS_PAGECACHE_OVERFLOW, nFreed);
pthread_mutex_unlock
(
&
pcache1
.
mutex
);
}
#endif
free
(
p
);
}
}
#ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
/*
** Return the size of a pcache allocation
*/
static
int
pcache1MemSize
(
void
*
p
)
{
if
(
p
>=
pcache1
.
pStart
&&
p
<
pcache1
.
pEnd
)
{
return
pcache1
.
szSlot
;
}
else
{
int
iSize
;
assert
(
sqlite3MemdebugHasType
(
p
,
MEMTYPE_PCACHE
));
sqlite3MemdebugSetType
(
p
,
MEMTYPE_HEAP
);
iSize
=
sqlite3MallocSize
(
p
);
sqlite3MemdebugSetType
(
p
,
MEMTYPE_PCACHE
);
return
iSize
;
}
}
#endif
/* SQLITE_ENABLE_MEMORY_MANAGEMENT */
/*
** Allocate a new page object initially associated with cache pCache.
*/
static PgHdr1 *pcache1AllocPage(PCache1 *pCache, int benignMalloc) {
  PgHdr1 *p = 0;
  void   *pPg;

  // assert(sqlite3_mutex_held(pCache->pGroup->mutex));
  if (pCache->pFree || (pCache->nPage == 0 && pcache1InitBulk(pCache))) {
    assert(pCache->pFree != 0);
    p = pCache->pFree;
    pCache->pFree = p->pNext;
    p->pNext = 0;
  } else {
#ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
    /* The group mutex must be released before pcache1Alloc() is called. This
    ** is because it might call sqlite3_release_memory(), which assumes that
    ** this mutex is not held. */
    assert(pcache1.separateCache == 0);
    assert(pCache->pGroup == &pcache1.grp);
    pcache1LeaveMutex(pCache->pGroup);
#endif
    if (benignMalloc) {
      // sqlite3BeginBenignMalloc();
    }
#ifdef SQLITE_PCACHE_SEPARATE_HEADER
    pPg = pcache1Alloc(pCache->szPage);
    p = sqlite3Malloc(sizeof(PgHdr1) + pCache->szExtra);
    if (!pPg || !p) {
      pcache1Free(pPg);
      sqlite3_free(p);
      pPg = 0;
    }
#else
    pPg = pcache1Alloc(pCache->szAlloc);
#endif
    if (benignMalloc) {
      // sqlite3EndBenignMalloc();
    }
#ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
    pcache1EnterMutex(pCache->pGroup);
#endif
    if (pPg == 0) return 0;
#ifndef SQLITE_PCACHE_SEPARATE_HEADER
    p = (PgHdr1 *)&((u8 *)pPg)[pCache->szPage];
#endif
    p->page.pBuf = pPg;
    p->page.pExtra = &p[1];
    p->isBulkLocal = 0;
    p->isAnchor = 0;
    p->pLruPrev = 0; /* Initializing this saves a valgrind error */
  }
  (*pCache->pnPurgeable)++;
  return p;
}
/*
** Free a page object allocated by pcache1AllocPage().
*/
static void pcache1FreePage(PgHdr1 *p) {
  PCache1 *pCache;
  assert(p != 0);
  pCache = p->pCache;
  // assert(sqlite3_mutex_held(p->pCache->pGroup->mutex));
  if (p->isBulkLocal) {
    p->pNext = pCache->pFree;
    pCache->pFree = p;
  } else {
    pcache1Free(p->page.pBuf);
#ifdef SQLITE_PCACHE_SEPARATE_HEADER
    sqlite3_free(p);
#endif
  }
  (*pCache->pnPurgeable)--;
}
/*
** Malloc function used by SQLite to obtain space from the buffer configured
** using the sqlite3_config(SQLITE_CONFIG_PAGECACHE) option. If no such buffer
** exists, this function falls back to sqlite3Malloc().
*/
void *sqlite3PageMalloc(int sz) {
  assert(sz <= 65536 + 8); /* These allocations are never very large */
  return pcache1Alloc(sz);
}
/*
** Free an allocated buffer obtained from sqlite3PageMalloc().
*/
void sqlite3PageFree(void *p) { pcache1Free(p); }
/*
** Return true if it desirable to avoid allocating a new page cache
** entry.
**
** If memory was allocated specifically to the page cache using
** SQLITE_CONFIG_PAGECACHE but that memory has all been used, then
** it is desirable to avoid allocating a new page cache entry because
** presumably SQLITE_CONFIG_PAGECACHE was supposed to be sufficient
** for all page cache needs and we should not need to spill the
** allocation onto the heap.
**
** Or, if the heap is used for all page cache memory but the heap is
** under memory pressure, then again it is desirable to avoid
** allocating a new page cache entry in order to avoid stressing
** the heap even further.
*/
static int pcache1UnderMemoryPressure(PCache1 *pCache) {
  // if (pcache1.nSlot && (pCache->szPage + pCache->szExtra) <= pcache1.szSlot) {
  return pcache1.bUnderPressure;
  // } else {
  //   return sqlite3HeapNearlyFull();
  // }
}
/******************************************************************************/
/******** General Implementation Functions ************************************/
/*
** This function is used to resize the hash table used by the cache passed
** as the first argument.
**
** The PCache mutex must be held when this function is called.
*/
static void pcache1ResizeHash(PCache1 *p) {
  PgHdr1     **apNew;
  unsigned int nNew;
  unsigned int i;

  // assert(sqlite3_mutex_held(p->pGroup->mutex));

  nNew = p->nHash * 2;
  if (nNew < 256) {
    nNew = 256;
  }

  pcache1LeaveMutex(p->pGroup);
  if (p->nHash) {
    // sqlite3BeginBenignMalloc();
  }
  apNew = (PgHdr1 **)calloc(nNew, sizeof(PgHdr1 *));
  if (p->nHash) {
    // sqlite3EndBenignMalloc();
  }
  pcache1EnterMutex(p->pGroup);
  if (apNew) {
    for (i = 0; i < p->nHash; i++) {
      PgHdr1 *pPage;
      PgHdr1 *pNext = p->apHash[i];
      while ((pPage = pNext) != 0) {
        unsigned int h = pPage->iKey % nNew;
        pNext = pPage->pNext;
        pPage->pNext = apNew[h];
        apNew[h] = pPage;
      }
    }
    free(p->apHash);
    p->apHash = apNew;
    p->nHash = nNew;
  }
}
/*
** This function is used internally to remove the page pPage from the
** PGroup LRU list, if it is part of it. If pPage is not part of the PGroup
** LRU list, then this function is a no-op.
**
** The PGroup mutex must be held when this function is called.
*/
static PgHdr1 *pcache1PinPage(PgHdr1 *pPage) {
  assert(pPage != 0);
  assert(PAGE_IS_UNPINNED(pPage));
  assert(pPage->pLruNext);
  assert(pPage->pLruPrev);
  // assert(sqlite3_mutex_held(pPage->pCache->pGroup->mutex));
  pPage->pLruPrev->pLruNext = pPage->pLruNext;
  pPage->pLruNext->pLruPrev = pPage->pLruPrev;
  pPage->pLruNext = 0;
  /* pPage->pLruPrev = 0;
  ** No need to clear pLruPrev as it is never accessed if pLruNext is 0 */
  assert(pPage->isAnchor == 0);
  assert(pPage->pCache->pGroup->lru.isAnchor == 1);
  pPage->pCache->nRecyclable--;
  return pPage;
}
/*
** Remove the page supplied as an argument from the hash table
** (PCache1.apHash structure) that it is currently stored in.
** Also free the page if freePage is true.
**
** The PGroup mutex must be held when this function is called.
*/
static void pcache1RemoveFromHash(PgHdr1 *pPage, int freeFlag) {
  unsigned int h;
  PCache1     *pCache = pPage->pCache;
  PgHdr1     **pp;

  // assert(sqlite3_mutex_held(pCache->pGroup->mutex));
  h = pPage->iKey % pCache->nHash;
  for (pp = &pCache->apHash[h]; (*pp) != pPage; pp = &(*pp)->pNext)
    ;
  *pp = (*pp)->pNext;

  pCache->nPage--;
  if (freeFlag) pcache1FreePage(pPage);
}
/*
** If there are currently more than nMaxPage pages allocated, try
** to recycle pages to reduce the number allocated to nMaxPage.
*/
static void pcache1EnforceMaxPage(PCache1 *pCache) {
  PGroup *pGroup = pCache->pGroup;
  PgHdr1 *p;
  // assert(sqlite3_mutex_held(pGroup->mutex));
  while (pGroup->nPurgeable > pGroup->nMaxPage && (p = pGroup->lru.pLruPrev)->isAnchor == 0) {
    assert(p->pCache->pGroup == pGroup);
    assert(PAGE_IS_UNPINNED(p));
    pcache1PinPage(p);
    pcache1RemoveFromHash(p, 1);
  }
  if (pCache->nPage == 0 && pCache->pBulk) {
    free(pCache->pBulk);
    pCache->pBulk = pCache->pFree = 0;
  }
}
/*
** Discard all pages from cache pCache with a page number (key value)
** greater than or equal to iLimit. Any pinned pages that meet this
** criteria are unpinned before they are discarded.
**
** The PCache mutex must be held when this function is called.
*/
static void pcache1TruncateUnsafe(
    PCache1     *pCache,  /* The cache to truncate */
    unsigned int iLimit   /* Drop pages with this pgno or larger */
) {
  int          nPage = 0; /* To assert pCache->nPage is correct */
  unsigned int h, iStop;
  // assert(sqlite3_mutex_held(pCache->pGroup->mutex));
  assert(pCache->iMaxKey >= iLimit);
  assert(pCache->nHash > 0);
  if (pCache->iMaxKey - iLimit < pCache->nHash) {
    /* If we are just shaving the last few pages off the end of the
    ** cache, then there is no point in scanning the entire hash table.
    ** Only scan those hash slots that might contain pages that need to
    ** be removed. */
    h = iLimit % pCache->nHash;
    iStop = pCache->iMaxKey % pCache->nHash;
    nPage = -10; /* Disable the pCache->nPage validity check */
  } else {
    /* This is the general case where many pages are being removed.
    ** It is necessary to scan the entire hash table */
    h = pCache->nHash / 2;
    iStop = h - 1;
  }
  for (;;) {
    PgHdr1 **pp;
    PgHdr1  *pPage;
    assert(h < pCache->nHash);
    pp = &pCache->apHash[h];
    while ((pPage = *pp) != 0) {
      if (pPage->iKey >= iLimit) {
        pCache->nPage--;
        *pp = pPage->pNext;
        if (PAGE_IS_UNPINNED(pPage)) pcache1PinPage(pPage);
        pcache1FreePage(pPage);
      } else {
        pp = &pPage->pNext;
        if (nPage >= 0) nPage++;
      }
    }
    if (h == iStop) break;
    h = (h + 1) % pCache->nHash;
  }
  assert(nPage < 0 || pCache->nPage == (unsigned)nPage);
}
/******************************************************************************/
/******** sqlite3_pcache Methods **********************************************/
/*
** Implementation of the sqlite3_pcache.xInit method.
*/
static int pcache1Init(void *NotUsed) {
  assert(pcache1.isInit == 0);
  memset(&pcache1, 0, sizeof(pcache1));

  // /*
  // ** The pcache1.separateCache variable is true if each PCache has its own
  // ** private PGroup (mode-1). pcache1.separateCache is false if the single
  // ** PGroup in pcache1.grp is used for all page caches (mode-2).
  // **
  // ** * Always use a unified cache (mode-2) if ENABLE_MEMORY_MANAGEMENT
  // **
  // ** * Use a unified cache in single-threaded applications that have
  // ** configured a start-time buffer for use as page-cache memory using
  // ** sqlite3_config(SQLITE_CONFIG_PAGECACHE, pBuf, sz, N) with non-NULL
  // ** pBuf argument.
  // **
  // ** * Otherwise use separate caches (mode-1)
  // */
  // #if defined(SQLITE_ENABLE_MEMORY_MANAGEMENT)
  // pcache1.separateCache = 0;
  // #elif SQLITE_THREADSAFE
  // pcache1.separateCache = sqlite3GlobalConfig.pPage==0
  // || sqlite3GlobalConfig.bCoreMutex>0;
  // #else
  // pcache1.separateCache = sqlite3GlobalConfig.pPage==0;
  // #endif
  pcache1.separateCache = 1;

  pthread_mutex_init(&pcache1.grp.mutex, NULL);
  pthread_mutex_init(&pcache1.mutex, NULL);

  // if (pcache1.separateCache && sqlite3GlobalConfig.nPage != 0 && sqlite3GlobalConfig.pPage == 0) {
  //   pcache1.nInitPage = sqlite3GlobalConfig.nPage;
  // } else {
  pcache1.nInitPage = 0;
  // }

  pcache1.grp.mxPinned = 10;
  pcache1.isInit = 1;
  return 0;
}
/*
** Implementation of the sqlite3_pcache.xShutdown method.
** Note that the static mutex allocated in xInit does
** not need to be freed.
*/
static void pcache1Shutdown(void *NotUsed) {
  assert(pcache1.isInit != 0);
  memset(&pcache1, 0, sizeof(pcache1));
}
/* forward declaration */
static void pcache1Destroy(sqlite3_pcache *p);
/*
** Implementation of the sqlite3_pcache.xCreate method.
**
** Allocate a new cache.
*/
static sqlite3_pcache *pcache1Create(int szPage, int szExtra, int bPurgeable) {
  PCache1 *pCache; /* The newly created page cache */
  PGroup  *pGroup; /* The group the new page cache will belong to */
  int      sz;     /* Bytes of memory required to allocate the new cache */

  assert((szPage & (szPage - 1)) == 0 && szPage >= 512 && szPage <= 65536);
  assert(szExtra < 300);

  sz = sizeof(PCache1) + sizeof(PGroup) * pcache1.separateCache;
  pCache = (PCache1 *)calloc(1, sz);
  if (pCache) {
    if (pcache1.separateCache) {
      pGroup = (PGroup *)&pCache[1];
      pGroup->mxPinned = 10;
    } else {
      pGroup = &pcache1.grp;
    }

    pcache1EnterMutex(pGroup);
    if (pGroup->lru.isAnchor == 0) {
      pGroup->lru.isAnchor = 1;
      pGroup->lru.pLruPrev = pGroup->lru.pLruNext = &pGroup->lru;
    }
    pCache->pGroup = pGroup;
    pCache->szPage = szPage;
    pCache->szExtra = szExtra;
    pCache->szAlloc = szPage + szExtra + ROUND8(sizeof(PgHdr1));
    pCache->bPurgeable = (bPurgeable ? 1 : 0);
    pcache1ResizeHash(pCache);
    if (bPurgeable) {
      pCache->nMin = 10;
      pGroup->nMinPage += pCache->nMin;
      pGroup->mxPinned = pGroup->nMaxPage + 10 - pGroup->nMinPage;
      pCache->pnPurgeable = &pGroup->nPurgeable;
    } else {
      pCache->pnPurgeable = &pCache->nPurgeableDummy;
    }
    pcache1LeaveMutex(pGroup);
    if (pCache->nHash == 0) {
      pcache1Destroy((sqlite3_pcache *)pCache);
      pCache = 0;
    }
  }
  return (sqlite3_pcache *)pCache;
}
/*
** Implementation of the sqlite3_pcache.xCachesize method.
**
** Configure the cache_size limit for a cache.
*/
static void pcache1Cachesize(sqlite3_pcache *p, int nMax) {
  PCache1 *pCache = (PCache1 *)p;
  u32      n;
  assert(nMax >= 0);
  if (pCache->bPurgeable) {
    PGroup *pGroup = pCache->pGroup;
    pcache1EnterMutex(pGroup);
    n = (u32)nMax;
    if (n > 0x7fff0000 - pGroup->nMaxPage + pCache->nMax) {
      n = 0x7fff0000 - pGroup->nMaxPage + pCache->nMax;
    }
    pGroup->nMaxPage += (n - pCache->nMax);
    pGroup->mxPinned = pGroup->nMaxPage + 10 - pGroup->nMinPage;
    pCache->nMax = n;
    pCache->n90pct = pCache->nMax * 9 / 10;
    pcache1EnforceMaxPage(pCache);
    pcache1LeaveMutex(pGroup);
  }
}
/*
** Implementation of the sqlite3_pcache.xShrink method.
**
** Free up as much memory as possible.
*/
static void pcache1Shrink(sqlite3_pcache *p) {
  PCache1 *pCache = (PCache1 *)p;
  if (pCache->bPurgeable) {
    PGroup      *pGroup = pCache->pGroup;
    unsigned int savedMaxPage;
    pcache1EnterMutex(pGroup);
    savedMaxPage = pGroup->nMaxPage;
    pGroup->nMaxPage = 0;
    pcache1EnforceMaxPage(pCache);
    pGroup->nMaxPage = savedMaxPage;
    pcache1LeaveMutex(pGroup);
  }
}
/*
** Implementation of the sqlite3_pcache.xPagecount method.
*/
static int pcache1Pagecount(sqlite3_pcache *p) {
  int      n;
  PCache1 *pCache = (PCache1 *)p;
  pcache1EnterMutex(pCache->pGroup);
  n = pCache->nPage;
  pcache1LeaveMutex(pCache->pGroup);
  return n;
}
/*
** Implement steps 3, 4, and 5 of the pcache1Fetch() algorithm described
** in the header of the pcache1Fetch() procedure.
**
** These steps are broken out into a separate procedure because they are
** usually not needed, and by avoiding the stack initialization required
** for these steps, the main pcache1Fetch() procedure can run faster.
*/
static PgHdr1 *pcache1FetchStage2(PCache1 *pCache, unsigned int iKey, int createFlag) {
  unsigned int nPinned;
  PGroup      *pGroup = pCache->pGroup;
  PgHdr1      *pPage = 0;

  /* Step 3: Abort if createFlag is 1 but the cache is nearly full */
  assert(pCache->nPage >= pCache->nRecyclable);
  nPinned = pCache->nPage - pCache->nRecyclable;
  assert(pGroup->mxPinned == pGroup->nMaxPage + 10 - pGroup->nMinPage);
  assert(pCache->n90pct == pCache->nMax * 9 / 10);
  if (createFlag == 1 && (nPinned >= pGroup->mxPinned || nPinned >= pCache->n90pct ||
                          (pcache1UnderMemoryPressure(pCache) && pCache->nRecyclable < nPinned))) {
    return 0;
  }

  if (pCache->nPage >= pCache->nHash) pcache1ResizeHash(pCache);
  assert(pCache->nHash > 0 && pCache->apHash);

  /* Step 4. Try to recycle a page. */
  if (pCache->bPurgeable && !pGroup->lru.pLruPrev->isAnchor &&
      ((pCache->nPage + 1 >= pCache->nMax) || pcache1UnderMemoryPressure(pCache))) {
    PCache1 *pOther;
    pPage = pGroup->lru.pLruPrev;
    assert(PAGE_IS_UNPINNED(pPage));
    pcache1RemoveFromHash(pPage, 0);
    pcache1PinPage(pPage);
    pOther = pPage->pCache;
    if (pOther->szAlloc != pCache->szAlloc) {
      pcache1FreePage(pPage);
      pPage = 0;
    } else {
      pGroup->nPurgeable -= (pOther->bPurgeable - pCache->bPurgeable);
    }
  }

  /* Step 5. If a usable page buffer has still not been found,
  ** attempt to allocate a new one.
  */
  if (!pPage) {
    pPage = pcache1AllocPage(pCache, createFlag == 1);
  }

  if (pPage) {
    unsigned int h = iKey % pCache->nHash;
    pCache->nPage++;
    pPage->iKey = iKey;
    pPage->pNext = pCache->apHash[h];
    pPage->pCache = pCache;
    pPage->pLruNext = 0;
    /* pPage->pLruPrev = 0;
    ** No need to clear pLruPrev since it is not accessed when pLruNext==0 */
    *(void **)pPage->page.pExtra = 0;
    pCache->apHash[h] = pPage;
    if (iKey > pCache->iMaxKey) {
      pCache->iMaxKey = iKey;
    }
  }
  return pPage;
}
/*
** Implementation of the sqlite3_pcache.xFetch method.
**
** Fetch a page by key value.
**
** Whether or not a new page may be allocated by this function depends on
** the value of the createFlag argument. 0 means do not allocate a new
** page. 1 means allocate a new page if space is easily available. 2
** means to try really hard to allocate a new page.
**
** For a non-purgeable cache (a cache used as the storage for an in-memory
** database) there is really no difference between createFlag 1 and 2. So
** the calling function (pcache.c) will never have a createFlag of 1 on
** a non-purgeable cache.
**
** There are three different approaches to obtaining space for a page,
** depending on the value of parameter createFlag (which may be 0, 1 or 2).
**
** 1. Regardless of the value of createFlag, the cache is searched for a
** copy of the requested page. If one is found, it is returned.
**
** 2. If createFlag==0 and the page is not already in the cache, NULL is
** returned.
**
** 3. If createFlag is 1, and the page is not already in the cache, then
** return NULL (do not allocate a new page) if any of the following
** conditions are true:
**
** (a) the number of pages pinned by the cache is greater than
** PCache1.nMax, or
**
** (b) the number of pages pinned by the cache is greater than
** the sum of nMax for all purgeable caches, less the sum of
** nMin for all other purgeable caches, or
**
** 4. If none of the first three conditions apply and the cache is marked
** as purgeable, and if one of the following is true:
**
** (a) The number of pages allocated for the cache is already
** PCache1.nMax, or
**
** (b) The number of pages allocated for all purgeable caches is
** already equal to or greater than the sum of nMax for all
** purgeable caches,
**
** (c) The system is under memory pressure and wants to avoid
** unnecessary page cache entry allocations
**
** then attempt to recycle a page from the LRU list. If it is the right
** size, return the recycled buffer. Otherwise, free the buffer and
** proceed to step 5.
**
** 5. Otherwise, allocate and return a new page buffer.
**
** There are two versions of this routine. pcache1FetchWithMutex() is
** the general case. pcache1FetchNoMutex() is a faster implementation for
** the common case where pGroup->mutex is NULL. The pcache1Fetch() wrapper
** invokes the appropriate routine.
*/
static PgHdr1 *pcache1FetchNoMutex(sqlite3_pcache *p, unsigned int iKey, int createFlag) {
  PCache1 *pCache = (PCache1 *)p;
  PgHdr1  *pPage = 0;

  /* Step 1: Search the hash table for an existing entry. */
  pPage = pCache->apHash[iKey % pCache->nHash];
  while (pPage && pPage->iKey != iKey) {
    pPage = pPage->pNext;
  }

  /* Step 2: If the page was found in the hash table, then return it.
  ** If the page was not in the hash table and createFlag is 0, abort.
  ** Otherwise (page not in hash and createFlag!=0) continue with
  ** subsequent steps to try to create the page. */
  if (pPage) {
    if (PAGE_IS_UNPINNED(pPage)) {
      return pcache1PinPage(pPage);
    } else {
      return pPage;
    }
  } else if (createFlag) {
    /* Steps 3, 4, and 5 implemented by this subroutine */
    return pcache1FetchStage2(pCache, iKey, createFlag);
  } else {
    return 0;
  }
}
#if PCACHE1_MIGHT_USE_GROUP_MUTEX
static PgHdr1 *pcache1FetchWithMutex(sqlite3_pcache *p, unsigned int iKey, int createFlag) {
  PCache1 *pCache = (PCache1 *)p;
  PgHdr1  *pPage;

  pcache1EnterMutex(pCache->pGroup);
  pPage = pcache1FetchNoMutex(p, iKey, createFlag);
  assert(pPage == 0 || pCache->iMaxKey >= iKey);
  pcache1LeaveMutex(pCache->pGroup);
  return pPage;
}
#endif
static sqlite3_pcache_page *pcache1Fetch(sqlite3_pcache *p, unsigned int iKey, int createFlag) {
#if PCACHE1_MIGHT_USE_GROUP_MUTEX || defined(SQLITE_DEBUG)
  PCache1 *pCache = (PCache1 *)p;
#endif

  assert(offsetof(PgHdr1, page) == 0);
  assert(pCache->bPurgeable || createFlag != 1);
  assert(pCache->bPurgeable || pCache->nMin == 0);
  assert(pCache->bPurgeable == 0 || pCache->nMin == 10);
  assert(pCache->nMin == 0 || pCache->bPurgeable);
  assert(pCache->nHash > 0);
  return (sqlite3_pcache_page *)pcache1FetchWithMutex(p, iKey, createFlag);
}
/*
** Implementation of the sqlite3_pcache.xUnpin method.
**
** Mark a page as unpinned (eligible for asynchronous recycling).
*/
static void pcache1Unpin(sqlite3_pcache *p, sqlite3_pcache_page *pPg, int reuseUnlikely) {
  PCache1 *pCache = (PCache1 *)p;
  PgHdr1  *pPage = (PgHdr1 *)pPg;
  PGroup  *pGroup = pCache->pGroup;

  assert(pPage->pCache == pCache);
  pcache1EnterMutex(pGroup);

  /* It is an error to call this function if the page is already
  ** part of the PGroup LRU list.
  */
  assert(pPage->pLruNext == 0);
  assert(PAGE_IS_PINNED(pPage));

  if (reuseUnlikely || pGroup->nPurgeable > pGroup->nMaxPage) {
    pcache1RemoveFromHash(pPage, 1);
  } else {
    /* Add the page to the PGroup LRU list. */
    PgHdr1 **ppFirst = &pGroup->lru.pLruNext;
    pPage->pLruPrev = &pGroup->lru;
    (pPage->pLruNext = *ppFirst)->pLruPrev = pPage;
    *ppFirst = pPage;
    pCache->nRecyclable++;
  }

  pcache1LeaveMutex(pCache->pGroup);
}
/*
** Implementation of the sqlite3_pcache.xRekey method.
*/
static void pcache1Rekey(sqlite3_pcache *p, sqlite3_pcache_page *pPg, unsigned int iOld, unsigned int iNew) {
  PCache1     *pCache = (PCache1 *)p;
  PgHdr1      *pPage = (PgHdr1 *)pPg;
  PgHdr1     **pp;
  unsigned int h;
  assert(pPage->iKey == iOld);
  assert(pPage->pCache == pCache);

  pcache1EnterMutex(pCache->pGroup);

  h = iOld % pCache->nHash;
  pp = &pCache->apHash[h];
  while ((*pp) != pPage) {
    pp = &(*pp)->pNext;
  }
  *pp = pPage->pNext;

  h = iNew % pCache->nHash;
  pPage->iKey = iNew;
  pPage->pNext = pCache->apHash[h];
  pCache->apHash[h] = pPage;
  if (iNew > pCache->iMaxKey) {
    pCache->iMaxKey = iNew;
  }

  pcache1LeaveMutex(pCache->pGroup);
}
/*
** Implementation of the sqlite3_pcache.xTruncate method.
**
** Discard all unpinned pages in the cache with a page number equal to
** or greater than parameter iLimit. Any pinned pages with a page number
** equal to or greater than iLimit are implicitly unpinned.
*/
static void pcache1Truncate(sqlite3_pcache *p, unsigned int iLimit) {
  PCache1 *pCache = (PCache1 *)p;
  pcache1EnterMutex(pCache->pGroup);
  if (iLimit <= pCache->iMaxKey) {
    pcache1TruncateUnsafe(pCache, iLimit);
    pCache->iMaxKey = iLimit - 1;
  }
  pcache1LeaveMutex(pCache->pGroup);
}
/*
** Implementation of the sqlite3_pcache.xDestroy method.
**
** Destroy a cache allocated using pcache1Create().
*/
static void pcache1Destroy(sqlite3_pcache *p) {
  PCache1 *pCache = (PCache1 *)p;
  PGroup  *pGroup = pCache->pGroup;
  assert(pCache->bPurgeable || (pCache->nMax == 0 && pCache->nMin == 0));
  pcache1EnterMutex(pGroup);
  if (pCache->nPage) pcache1TruncateUnsafe(pCache, 0);
  assert(pGroup->nMaxPage >= pCache->nMax);
  pGroup->nMaxPage -= pCache->nMax;
  assert(pGroup->nMinPage >= pCache->nMin);
  pGroup->nMinPage -= pCache->nMin;
  pGroup->mxPinned = pGroup->nMaxPage + 10 - pGroup->nMinPage;
  pcache1EnforceMaxPage(pCache);
  pcache1LeaveMutex(pGroup);
  free(pCache->pBulk);
  free(pCache->apHash);
  free(pCache);
}
/*
** Return the size of the header on each page of this PCACHE implementation.
*/
int sqlite3HeaderSizePcache1(void) { return ROUND8(sizeof(PgHdr1)); }
// /*
// ** Return the global mutex used by this PCACHE implementation. The
// ** sqlite3_status() routine needs access to this mutex.
// */
// sqlite3_mutex *sqlite3Pcache1Mutex(void) { return pcache1.mutex; }
#ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
/*
** This function is called to free superfluous dynamically allocated memory
** held by the pager system. Memory in use by any SQLite pager allocated
** by the current thread may be sqlite3_free()ed.
**
** nReq is the number of bytes of memory required. Once this much has
** been released, the function returns. The return value is the total number
** of bytes of memory released.
*/
int sqlite3PcacheReleaseMemory(int nReq) {
  int nFree = 0;
  // assert(sqlite3_mutex_notheld(pcache1.grp.mutex));
  // assert(sqlite3_mutex_notheld(pcache1.mutex));
  if (sqlite3GlobalConfig.pPage == 0) {
    PgHdr1 *p;
    pcache1EnterMutex(&pcache1.grp);
    while ((nReq < 0 || nFree < nReq) && (p = pcache1.grp.lru.pLruPrev) != 0 && p->isAnchor == 0) {
      nFree += pcache1MemSize(p->page.pBuf);
#ifdef SQLITE_PCACHE_SEPARATE_HEADER
      nFree += sqlite3MemSize(p);
#endif
      assert(PAGE_IS_UNPINNED(p));
      pcache1PinPage(p);
      pcache1RemoveFromHash(p, 1);
    }
    pcache1LeaveMutex(&pcache1.grp);
  }
  return nFree;
}
#endif /* SQLITE_ENABLE_MEMORY_MANAGEMENT */
#ifdef SQLITE_TEST
/*
** This function is used by test procedures to inspect the internal state
** of the global cache.
*/
void sqlite3PcacheStats(
    int *pnCurrent,    /* OUT: Total number of pages cached */
    int *pnMax,        /* OUT: Global maximum cache size */
    int *pnMin,        /* OUT: Sum of PCache1.nMin for purgeable caches */
    int *pnRecyclable  /* OUT: Total number of pages available for recycling */
) {
  PgHdr1 *p;
  int     nRecyclable = 0;
  for (p = pcache1.grp.lru.pLruNext; p && !p->isAnchor; p = p->pLruNext) {
    assert(PAGE_IS_UNPINNED(p));
    nRecyclable++;
  }
  *pnCurrent = pcache1.grp.nPurgeable;
  *pnMax = (int)pcache1.grp.nMaxPage;
  *pnMin = (int)pcache1.grp.nMinPage;
  *pnRecyclable = nRecyclable;
}
#endif
sqlite3_pcache_methods2 pcache2 = {
    1,                 /* iVersion */
    0,                 /* pArg */
    pcache1Init,       /* xInit */
    pcache1Shutdown,   /* xShutdown */
    pcache1Create,     /* xCreate */
    pcache1Cachesize,  /* xCachesize */
    pcache1Pagecount,  /* xPagecount */
    pcache1Fetch,      /* xFetch */
    pcache1Unpin,      /* xUnpin */
    pcache1Rekey,      /* xRekey */
    pcache1Truncate,   /* xTruncate */
    pcache1Destroy,    /* xDestroy */
    pcache1Shrink      /* xShrink */
};
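
/*
** Illustrative usage sketch (compiled out): one way a caller might drive
** the pcache2 method table above end-to-end. The page size (4096), extra
** size (8), and cache limit (100) are arbitrary example values, and
** error handling is elided.
*/
#if 0
static void pcache2UsageSketch(void) {
  sqlite3_pcache      *pC;
  sqlite3_pcache_page *pPg;

  pcache2.xInit(pcache2.pArg);
  pC = pcache2.xCreate(4096, 8, 1);  /* 4K pages, 8 extra bytes, purgeable */
  pcache2.xCachesize(pC, 100);       /* cap the cache at 100 pages */
  pPg = pcache2.xFetch(pC, 1, 2);    /* page 1; 2 == try hard to allocate */
  /* ... use pPg->pBuf (page image) and pPg->pExtra here ... */
  pcache2.xUnpin(pC, pPg, 0);        /* put the page back on the LRU list */
  pcache2.xDestroy(pC);
  pcache2.xShutdown(pcache2.pArg);
}
#endif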
source/libs/tdb/src/sqlite/wal.c
deleted
100644 → 0
View file @
031d84e7
/*
** 2010 February 1
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
**
** This file contains the implementation of a write-ahead log (WAL) used in
** "journal_mode=WAL" mode.
**
** WRITE-AHEAD LOG (WAL) FILE FORMAT
**
** A WAL file consists of a header followed by zero or more "frames".
** Each frame records the revised content of a single page from the
** database file. All changes to the database are recorded by writing
** frames into the WAL. Transactions commit when a frame is written that
** contains a commit marker. A single WAL can and usually does record
** multiple transactions. Periodically, the content of the WAL is
** transferred back into the database file in an operation called a
** "checkpoint".
**
** A single WAL file can be used multiple times. In other words, the
** WAL can fill up with frames and then be checkpointed and then new
** frames can overwrite the old ones. A WAL always grows from beginning
** toward the end. Checksums and counters attached to each frame are
** used to determine which frames within the WAL are valid and which
** are leftovers from prior checkpoints.
**
** The WAL header is 32 bytes in size and consists of the following eight
** big-endian 32-bit unsigned integer values:
**
** 0: Magic number. 0x377f0682 or 0x377f0683
** 4: File format version. Currently 3007000
** 8: Database page size. Example: 1024
** 12: Checkpoint sequence number
** 16: Salt-1, random integer incremented with each checkpoint
** 20: Salt-2, a different random integer changing with each ckpt
** 24: Checksum-1 (first part of checksum for first 24 bytes of header).
** 28: Checksum-2 (second part of checksum for first 24 bytes of header).
**
** Immediately following the wal-header are zero or more frames. Each
** frame consists of a 24-byte frame-header followed by a <page-size> bytes
** of page data. The frame-header is six big-endian 32-bit unsigned
** integer values, as follows:
**
** 0: Page number.
** 4: For commit records, the size of the database image in pages
** after the commit. For all other records, zero.
** 8: Salt-1 (copied from the header)
** 12: Salt-2 (copied from the header)
** 16: Checksum-1.
** 20: Checksum-2.
**
** A frame is considered valid if and only if the following conditions are
** true:
**
** (1) The salt-1 and salt-2 values in the frame-header match
** salt values in the wal-header
**
** (2) The checksum values in the final 8 bytes of the frame-header
** exactly match the checksum computed consecutively on the
** WAL header and the first 8 bytes and the content of all frames
** up to and including the current frame.
**
** The checksum is computed using 32-bit big-endian integers if the
** magic number in the first 4 bytes of the WAL is 0x377f0683 and it
** is computed using little-endian if the magic number is 0x377f0682.
** The checksum values are always stored in the frame header in a
** big-endian format regardless of which byte order is used to compute
** the checksum. The checksum is computed by interpreting the input as
** an even number of unsigned 32-bit integers: x[0] through x[N]. The
** algorithm used for the checksum is as follows:
**
** for i from 0 to n-1 step 2:
** s0 += x[i] + s1;
** s1 += x[i+1] + s0;
** endfor
**
** Note that s0 and s1 are both weighted checksums using fibonacci weights
** in reverse order (the largest fibonacci weight occurs on the first element
** of the sequence being summed.) The s1 value spans all 32-bit
** terms of the sequence whereas s0 omits the final term.
**
** On a checkpoint, the WAL is first VFS.xSync-ed, then valid content of the
** WAL is transferred into the database, then the database is VFS.xSync-ed.
** The VFS.xSync operations serve as write barriers - all writes launched
** before the xSync must complete before any write that launches after the
** xSync begins.
**
** After each checkpoint, the salt-1 value is incremented and the salt-2
** value is randomized. This prevents old and new frames in the WAL from
** being considered valid at the same time and being checkpointed together
** following a crash.
**
** READER ALGORITHM
**
** To read a page from the database (call it page number P), a reader
** first checks the WAL to see if it contains page P. If so, then the
** last valid instance of page P that is followed by a commit frame
** or is a commit frame itself becomes the value read. If the WAL
** contains no copies of page P that are valid and which are a commit
** frame or are followed by a commit frame, then page P is read from
** the database file.
**
** To start a read transaction, the reader records the index of the last
** valid frame in the WAL. The reader uses this recorded "mxFrame" value
** for all subsequent read operations. New transactions can be appended
** to the WAL, but as long as the reader uses its original mxFrame value
** and ignores the newly appended content, it will see a consistent snapshot
** of the database from a single point in time. This technique allows
** multiple concurrent readers to view different versions of the database
** content simultaneously.
**
** The reader algorithm in the previous paragraphs works correctly, but
** because frames for page P can appear anywhere within the WAL, the
** reader has to scan the entire WAL looking for page P frames. If the
** WAL is large (multiple megabytes is typical) that scan can be slow,
** and read performance suffers. To overcome this problem, a separate
** data structure called the wal-index is maintained to expedite the
** search for frames of a particular page.
**
** WAL-INDEX FORMAT
**
** Conceptually, the wal-index is shared memory, though VFS implementations
** might choose to implement the wal-index using a mmapped file. Because
** the wal-index is shared memory, SQLite does not support journal_mode=WAL
** on a network filesystem. All users of the database must be able to
** share memory.
**
** In the default unix and windows implementation, the wal-index is a mmapped
** file whose name is the database name with a "-shm" suffix added. For that
** reason, the wal-index is sometimes called the "shm" file.
**
** The wal-index is transient. After a crash, the wal-index can (and
** should) be reconstructed from the original WAL file. In fact, the VFS is required
** to either truncate or zero the header of the wal-index when the last
** connection to it closes. Because the wal-index is transient, it can
** use an architecture-specific format; it does not have to be cross-platform.
** Hence, unlike the database and WAL file formats which store all values
** as big endian, the wal-index can store multi-byte values in the native
** byte order of the host computer.
**
** The purpose of the wal-index is to answer this question quickly: Given
** a page number P and a maximum frame index M, return the index of the
** last frame in the wal before frame M for page P in the WAL, or return
** NULL if there are no frames for page P in the WAL prior to M.
**
** The wal-index consists of a header region, followed by one or
** more index blocks.
**
** The wal-index header contains the total number of frames within the WAL
** in the mxFrame field.
**
** Each index block except for the first contains information on
** HASHTABLE_NPAGE frames. The first index block contains information on
** HASHTABLE_NPAGE_ONE frames. The values of HASHTABLE_NPAGE_ONE and
** HASHTABLE_NPAGE are selected so that together the wal-index header and
** first index block are the same size as all other index blocks in the
** wal-index. The values are:
**
** HASHTABLE_NPAGE 4096
** HASHTABLE_NPAGE_ONE 4062
**
** Each index block contains two sections, a page-mapping that contains the
** database page number associated with each wal frame, and a hash-table
** that allows readers to query an index block for a specific page number.
** The page-mapping is an array of HASHTABLE_NPAGE (or HASHTABLE_NPAGE_ONE
** for the first index block) 32-bit page numbers. The first entry in the
** first index-block contains the database page number corresponding to the
** first frame in the WAL file. The first entry in the second index block
** in the WAL file corresponds to the (HASHTABLE_NPAGE_ONE+1)th frame in
** the log, and so on.
**
** The last index block in a wal-index usually contains less than the full
** complement of HASHTABLE_NPAGE (or HASHTABLE_NPAGE_ONE) page-numbers,
** depending on the contents of the WAL file. This does not change the
** allocated size of the page-mapping array - the page-mapping array merely
** contains unused entries.
**
** Even without using the hash table, the last frame for page P
** can be found by scanning the page-mapping sections of each index block
** starting with the last index block and moving toward the first, and
** within each index block, starting at the end and moving toward the
** beginning. The first entry that equals P corresponds to the frame
** holding the content for that page.
**
** The hash table consists of HASHTABLE_NSLOT 16-bit unsigned integers.
** HASHTABLE_NSLOT = 2*HASHTABLE_NPAGE, and there is one entry in the
** hash table for each page number in the mapping section, so the hash
** table is never more than half full. The expected number of collisions
** prior to finding a match is 1. Each entry of the hash table is a
** 1-based index of an entry in the mapping section of the same
** index block. Let K be the 1-based index of the largest entry in
** the mapping section. (For index blocks other than the last, K will
** always be exactly HASHTABLE_NPAGE (4096) and for the last index block
** K will be (mxFrame%HASHTABLE_NPAGE).) Unused slots of the hash table
** contain a value of 0.
**
** To look for page P in the hash table, first compute a hash iKey on
** P as follows:
**
** iKey = (P * 383) % HASHTABLE_NSLOT
**
** Then start scanning entries of the hash table, starting with iKey
** (wrapping around to the beginning when the end of the hash table is
** reached) until an unused hash slot is found. Let the first unused slot
** be at index iUnused. (iUnused might be less than iKey if there was
** wrap-around.) Because the hash table is never more than half full,
** the search is guaranteed to eventually hit an unused entry. Let
** iMax be the value between iKey and iUnused, closest to iUnused,
** where aHash[iMax]==P. If there is no iMax entry (if there exists
** no hash slot such that aHash[i]==p) then page P is not in the
** current index block. Otherwise the iMax-th mapping entry of the
** current index block corresponds to the last entry that references
** page P.
**
** A hash search begins with the last index block and moves toward the
** first index block, looking for entries corresponding to page P. On
** average, only two or three slots in each index block need to be
** examined in order to either find the last entry for page P, or to
** establish that no such entry exists in the block. Each index block
** holds over 4000 entries. So two or three index blocks are sufficient
** to cover a typical 10 megabyte WAL file, assuming 1K pages. 8 or 10
** comparisons (on average) suffice to either locate a frame in the
** WAL or to establish that the frame does not exist in the WAL. This
** is much faster than scanning the entire 10MB WAL.
**
** Note that entries are added in order of increasing K. Hence, one
** reader might be using some value K0 and a second reader that started
** at a later time (after additional transactions were added to the WAL
** and to the wal-index) might be using a different value K1, where K1>K0.
** Both readers can use the same hash table and mapping section to get
** the correct result. There may be entries in the hash table with
** K>K0 but to the first reader, those entries will appear to be unused
** slots in the hash table and so the first reader will get an answer as
** if no values greater than K0 had ever been inserted into the hash table
** in the first place - which is what reader one wants. Meanwhile, the
** second reader using K1 will see additional values that were inserted
** later, which is exactly what reader two wants.
**
** When a rollback occurs, the value of K is decreased. Hash table entries
** that correspond to frames greater than the new K value are removed
** from the hash table at this point.
*/
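
/*
** Illustrative sketch (compiled out) of the hash-table probe described
** above. aHash[] holds 1-based indexes into the block's page-mapping
** array, nSlot is HASHTABLE_NSLOT, and 0 marks an unused slot. For
** clarity the mapping array aPgno[] is treated as 1-based here, matching
** the 1-based hash entries. Returns the mapping index of the last frame
** for page P in this index block, or 0 if the block has none.
*/
#if 0
static int walHashProbeSketch(const u16 *aHash, const u32 *aPgno, int nSlot, u32 P) {
  int iMax = 0;
  int i = (P * 383) % nSlot;      /* iKey: initial probe position */
  while (aHash[i]) {              /* stop at the first unused slot */
    if (aPgno[aHash[i]] == P) iMax = aHash[i];
    i = (i + 1) % nSlot;          /* wrap around past the end */
  }
  return iMax;                    /* last match == closest to iUnused */
}
#endif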
#ifndef SQLITE_OMIT_WAL
#include "wal.h"
/*
** Trace output macros
*/
#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
int sqlite3WalTrace = 0;
# define WALTRACE(X) if(sqlite3WalTrace) sqlite3DebugPrintf X
#else
# define WALTRACE(X)
#endif
/*
** The maximum (and only) versions of the wal and wal-index formats
** that may be interpreted by this version of SQLite.
**
** If a client begins recovering a WAL file and finds that (a) the checksum
** values in the wal-header are correct and (b) the version field is not
** WAL_MAX_VERSION, recovery fails and SQLite returns SQLITE_CANTOPEN.
**
** Similarly, if a client successfully reads a wal-index header (i.e. the
** checksum test is successful) and finds that the version field is not
** WALINDEX_MAX_VERSION, then no read-transaction is opened and SQLite
** returns SQLITE_CANTOPEN.
*/
#define WAL_MAX_VERSION 3007000
#define WALINDEX_MAX_VERSION 3007000
/*
** Index numbers for various locking bytes. WAL_NREADER is the number
** of available reader locks and should be at least 3. The default
** is SQLITE_SHM_NLOCK==8 and WAL_NREADER==5.
**
** Technically, the various VFSes are free to implement these locks however
** they see fit. However, compatibility is encouraged so that VFSes can
** interoperate. The standard implementation used on both unix and windows
** is for the index number to indicate a byte offset into the
** WalCkptInfo.aLock[] array in the wal-index header. In other words, all
** locks are on the shm file. The WALINDEX_LOCK_OFFSET constant (which
** should be 120) is the location in the shm file for the first locking
** byte.
*/
#define WAL_WRITE_LOCK 0
#define WAL_ALL_BUT_WRITE 1
#define WAL_CKPT_LOCK 1
#define WAL_RECOVER_LOCK 2
#define WAL_READ_LOCK(I) (3+(I))
#define WAL_NREADER (SQLITE_SHM_NLOCK-3)
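
/*
** Worked example (illustrative): with the default SQLITE_SHM_NLOCK==8,
** WAL_NREADER==5, so the lock-byte indexes are 0:WRITE, 1:CKPT,
** 2:RECOVER, and 3..7:READ(0)..READ(4) - the same 8 lock bytes pictured
** at offset 120 in the wal-index schematic further below.
*/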
/* Object declarations */
typedef struct WalIndexHdr WalIndexHdr;
typedef struct WalIterator WalIterator;
typedef struct WalCkptInfo WalCkptInfo;
/*
** The following object holds a copy of the wal-index header content.
**
** The actual header in the wal-index consists of two copies of this
** object followed by one instance of the WalCkptInfo object.
** For all versions of SQLite through 3.10.0 and probably beyond,
** the locking bytes (WalCkptInfo.aLock) start at offset 120 and
** the total header size is 136 bytes.
**
** The szPage value can be any power of 2 between 512 and 32768, inclusive.
** Or it can be 1 to represent a 65536-byte page. The latter case was
** added in 3.7.1 when support for 64K pages was added.
*/
struct WalIndexHdr {
  u32 iVersion;        /* Wal-index version */
  u32 unused;          /* Unused (padding) field */
  u32 iChange;         /* Counter incremented each transaction */
  u8  isInit;          /* 1 when initialized */
  u8  bigEndCksum;     /* True if checksums in WAL are big-endian */
  u16 szPage;          /* Database page size in bytes. 1==64K */
  u32 mxFrame;         /* Index of last valid frame in the WAL */
  u32 nPage;           /* Size of database in pages */
  u32 aFrameCksum[2];  /* Checksum of last frame in log */
  u32 aSalt[2];        /* Two salt values copied from WAL header */
  u32 aCksum[2];       /* Checksum over all prior fields */
};
/*
** A copy of the following object occurs in the wal-index immediately
** following the second copy of the WalIndexHdr. This object stores
** information used by checkpoint.
**
** nBackfill is the number of frames in the WAL that have been written
** back into the database. (We call the act of moving content from WAL to
** database "backfilling".) The nBackfill number is never greater than
** WalIndexHdr.mxFrame. nBackfill can only be increased by threads
** holding the WAL_CKPT_LOCK lock (which includes a recovery thread).
** However, a WAL_WRITE_LOCK thread can move the value of nBackfill from
** mxFrame back to zero when the WAL is reset.
**
** nBackfillAttempted is the largest value of nBackfill that a checkpoint
** has attempted to achieve. Normally nBackfill==nBackfillAttempted, however
** the nBackfillAttempted is set before any backfilling is done and the
** nBackfill is only set after all backfilling completes. So if a checkpoint
** crashes, nBackfillAttempted might be larger than nBackfill. The
** WalIndexHdr.mxFrame must never be less than nBackfillAttempted.
**
** The aLock[] field is a set of bytes used for locking. These bytes should
** never be read or written.
**
** There is one entry in aReadMark[] for each reader lock. If a reader
** holds read-lock K, then the value in aReadMark[K] is no greater than
** the mxFrame for that reader. The value READMARK_NOT_USED (0xffffffff)
** for any aReadMark[] means that entry is unused. aReadMark[0] is
** a special case; its value is never used and it exists as a place-holder
** to avoid having to offset aReadMark[] indexs by one. Readers holding
** WAL_READ_LOCK(0) always ignore the entire WAL and read all content
** directly from the database.
**
** The value of aReadMark[K] may only be changed by a thread that
** is holding an exclusive lock on WAL_READ_LOCK(K). Thus, the value of
** aReadMark[K] cannot be changed while a reader is using that mark
** since the reader will be holding a shared lock on WAL_READ_LOCK(K).
**
** The checkpointer may only transfer frames from WAL to database where
** the frame numbers are less than or equal to every aReadMark[] that is
** in use (that is, every aReadMark[j] for which there is a corresponding
** WAL_READ_LOCK(j)). New readers (usually) pick the aReadMark[] with the
** largest value and will increase an unused aReadMark[] to mxFrame if there
** is not already an aReadMark[] equal to mxFrame. The exception to the
** previous sentence is when nBackfill equals mxFrame (meaning that everything
** in the WAL has been backfilled into the database) then new readers
** will choose aReadMark[0] which has value 0 and hence such reader will
** get all their content directly from the database file and ignore
** the WAL.
**
** Writers normally append new frames to the end of the WAL. However,
** if nBackfill equals mxFrame (meaning that all WAL content has been
** written back into the database) and if no readers are using the WAL
** (in other words, if there are no WAL_READ_LOCK(i) where i>0) then
** the writer will first "reset" the WAL back to the beginning and start
** writing new content beginning at frame 1.
**
** We assume that 32-bit loads are atomic and so no locks are needed in
** order to read from any aReadMark[] entries.
*/
struct WalCkptInfo {
  u32 nBackfill;                /* Number of WAL frames backfilled into DB */
  u32 aReadMark[WAL_NREADER];   /* Reader marks */
  u8  aLock[SQLITE_SHM_NLOCK];  /* Reserved space for locks */
  u32 nBackfillAttempted;       /* WAL frames perhaps written, or maybe not */
  u32 notUsed0;                 /* Available for future enhancements */
};
#define READMARK_NOT_USED 0xffffffff
/*
** This is a schematic view of the complete 136-byte header of the
** wal-index file (also known as the -shm file):
**
** +-----------------------------+
** 0: | iVersion | \
** +-----------------------------+ |
** 4: | (unused padding) | |
** +-----------------------------+ |
** 8: | iChange | |
** +-------+-------+-------------+ |
** 12: | bInit | bBig | szPage | |
** +-------+-------+-------------+ |
** 16: | mxFrame | | First copy of the
** +-----------------------------+ | WalIndexHdr object
** 20: | nPage | |
** +-----------------------------+ |
** 24: | aFrameCksum | |
** | | |
** +-----------------------------+ |
** 32: | aSalt | |
** | | |
** +-----------------------------+ |
** 40: | aCksum | |
** | | /
** +-----------------------------+
** 48: | iVersion | \
** +-----------------------------+ |
** 52: | (unused padding) | |
** +-----------------------------+ |
** 56: | iChange | |
** +-------+-------+-------------+ |
** 60: | bInit | bBig | szPage | |
** +-------+-------+-------------+ | Second copy of the
** 64: | mxFrame | | WalIndexHdr
** +-----------------------------+ |
** 68: | nPage | |
** +-----------------------------+ |
** 72: | aFrameCksum | |
** | | |
** +-----------------------------+ |
** 80: | aSalt | |
** | | |
** +-----------------------------+ |
** 88: | aCksum | |
** | | /
** +-----------------------------+
** 96: | nBackfill |
** +-----------------------------+
** 100: | 5 read marks |
** | |
** | |
** | |
** | |
** +-------+-------+------+------+
** 120: | Write | Ckpt | Rcvr | Rd0 | \
** +-------+-------+------+------+ ) 8 lock bytes
** | Read1 | Read2 | Rd3 | Rd4 | /
** +-------+-------+------+------+
** 128: | nBackfillAttempted |
** +-----------------------------+
** 132: | (unused padding) |
** +-----------------------------+
*/
/* A block of WALINDEX_LOCK_RESERVED bytes beginning at
** WALINDEX_LOCK_OFFSET is reserved for locks. Since some systems
** only support mandatory file-locks, we do not read or write data
** from the region of the file on which locks are applied.
*/
#define WALINDEX_LOCK_OFFSET (sizeof(WalIndexHdr)*2+offsetof(WalCkptInfo,aLock))
#define WALINDEX_HDR_SIZE (sizeof(WalIndexHdr)*2+sizeof(WalCkptInfo))
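
/*
** Worked check (illustrative): WalIndexHdr occupies 48 bytes and
** offsetof(WalCkptInfo,aLock) is 24 (nBackfill plus five read marks),
** so WALINDEX_LOCK_OFFSET is 2*48 + 24 == 120; WalCkptInfo occupies
** 40 bytes, so WALINDEX_HDR_SIZE is 2*48 + 40 == 136, matching the
** 136-byte schematic above.
*/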
/* Size of header before each frame in wal */
#define WAL_FRAME_HDRSIZE 24
/* Size of write ahead log header, including checksum. */
#define WAL_HDRSIZE 32
/* WAL magic value. Either this value, or the same value with the least
** significant bit also set (WAL_MAGIC | 0x00000001) is stored in 32-bit
** big-endian format in the first 4 bytes of a WAL file.
**
** If the LSB is set, then the checksums for each frame within the WAL
** file are calculated by treating all data as an array of 32-bit
** big-endian words. Otherwise, they are calculated by interpreting
** all data as 32-bit little-endian words.
*/
#define WAL_MAGIC 0x377f0682
/*
** Return the offset of frame iFrame in the write-ahead log file,
** assuming a database page size of szPage bytes. The offset returned
** is to the start of the write-ahead log frame-header.
*/
#define walFrameOffset(iFrame, szPage) ( \
WAL_HDRSIZE + ((iFrame)-1)*(i64)((szPage)+WAL_FRAME_HDRSIZE) \
)
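
/*
** Worked example (illustrative): with szPage==1024, frame 1 starts
** immediately after the 32-byte WAL header and each subsequent frame is
** 24+1024 bytes further on:
**
**   walFrameOffset(1, 1024) == 32
**   walFrameOffset(2, 1024) == 32 + (24 + 1024) == 1080
*/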
/*
** An open write-ahead log file is represented by an instance of the
** following object.
*/
struct Wal {
  sqlite3_vfs   *pVfs;                /* The VFS used to create pDbFd */
  sqlite3_file  *pDbFd;               /* File handle for the database file */
  sqlite3_file  *pWalFd;              /* File handle for WAL file */
  u32            iCallback;           /* Value to pass to log callback (or 0) */
  i64            mxWalSize;           /* Truncate WAL to this size upon reset */
  int            nWiData;             /* Size of array apWiData */
  int            szFirstBlock;        /* Size of first block written to WAL file */
  volatile u32 **apWiData;            /* Pointer to wal-index content in memory */
  u32            szPage;              /* Database page size */
  i16            readLock;            /* Which read lock is being held.  -1 for none */
  u8             syncFlags;           /* Flags to use to sync header writes */
  u8             exclusiveMode;       /* Non-zero if connection is in exclusive mode */
  u8             writeLock;           /* True if in a write transaction */
  u8             ckptLock;            /* True if holding a checkpoint lock */
  u8             readOnly;            /* WAL_RDWR, WAL_RDONLY, or WAL_SHM_RDONLY */
  u8             truncateOnCommit;    /* True to truncate WAL file on commit */
  u8             syncHeader;          /* Fsync the WAL header if true */
  u8             padToSectorBoundary; /* Pad transactions out to the next sector */
  u8             bShmUnreliable;      /* SHM content is read-only and unreliable */
  WalIndexHdr    hdr;                 /* Wal-index header for current transaction */
  u32            minFrame;            /* Ignore wal frames before this one */
  u32            iReCksum;            /* On commit, recalculate checksums from here */
  const char    *zWalName;            /* Name of WAL file */
  u32            nCkpt;               /* Checkpoint sequence counter in the wal-header */
#ifdef SQLITE_DEBUG
  u8 lockError;                       /* True if a locking error has occurred */
#endif
#ifdef SQLITE_ENABLE_SNAPSHOT
  WalIndexHdr *pSnapshot;             /* Start transaction here if not NULL */
#endif
#ifdef SQLITE_ENABLE_SETLK_TIMEOUT
  sqlite3 *db;
#endif
};
/*
** Candidate values for Wal.exclusiveMode.
*/
#define WAL_NORMAL_MODE 0
#define WAL_EXCLUSIVE_MODE 1
#define WAL_HEAPMEMORY_MODE 2
/*
** Possible values for WAL.readOnly
*/
#define WAL_RDWR       0  /* Normal read/write connection */
#define WAL_RDONLY     1  /* The WAL file is readonly */
#define WAL_SHM_RDONLY 2  /* The SHM file is readonly */
/*
** Each page of the wal-index mapping contains a hash-table made up of
** an array of HASHTABLE_NSLOT elements of the following type.
*/
typedef u16 ht_slot;
/*
** This structure is used to implement an iterator that loops through
** all frames in the WAL in database page order. Where two or more frames
** correspond to the same database page, the iterator visits only the
** frame most recently written to the WAL (in other words, the frame with
** the largest index).
**
** The internals of this structure are only accessed by:
**
** walIteratorInit() - Create a new iterator,
** walIteratorNext() - Step an iterator,
** walIteratorFree() - Free an iterator.
**
** This functionality is used by the checkpoint code (see walCheckpoint()).
*/
struct WalIterator {
  u32 iPrior;            /* Last result returned from the iterator */
  int nSegment;          /* Number of entries in aSegment[] */
  struct WalSegment {
    int      iNext;      /* Next slot in aIndex[] not yet returned */
    ht_slot *aIndex;     /* i0, i1, i2... such that aPgno[iN] ascend */
    u32     *aPgno;      /* Array of page numbers. */
    int      nEntry;     /* Nr. of entries in aPgno[] and aIndex[] */
    int      iZero;      /* Frame number associated with aPgno[0] */
  } aSegment[1];         /* One for every 32KB page in the wal-index */
};
/*
** Define the parameters of the hash tables in the wal-index file. There
** is a hash-table following every HASHTABLE_NPAGE page numbers in the
** wal-index.
**
** Changing any of these constants will alter the wal-index format and
** create incompatibilities.
*/
#define HASHTABLE_NPAGE  4096                 /* Must be power of 2 */
#define HASHTABLE_HASH_1 383                  /* Should be prime */
#define HASHTABLE_NSLOT  (HASHTABLE_NPAGE*2)  /* Must be a power of 2 */
/*
** The block of page numbers associated with the first hash-table in a
** wal-index is smaller than usual. This is so that there is a complete
** hash-table on each aligned 32KB page of the wal-index.
*/
#define HASHTABLE_NPAGE_ONE (HASHTABLE_NPAGE - (WALINDEX_HDR_SIZE/sizeof(u32)))
/* The wal-index is divided into pages of WALINDEX_PGSZ bytes each. */
#define WALINDEX_PGSZ ( \
sizeof(ht_slot)*HASHTABLE_NSLOT + HASHTABLE_NPAGE*sizeof(u32) \
)
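
/*
** Worked check (illustrative): ht_slot is 2 bytes, so
** WALINDEX_PGSZ == 2*HASHTABLE_NSLOT + 4*HASHTABLE_NPAGE
**               == 2*8192 + 4*4096 == 32768,
** i.e. exactly one aligned 32KB wal-index page per hash table.
*/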
/*
** Obtain a pointer to the iPage'th page of the wal-index. The wal-index
** is broken into pages of WALINDEX_PGSZ bytes. Wal-index pages are
** numbered from zero.
**
** If the wal-index is currently smaller than iPage pages then the size
** of the wal-index might be increased, but only if it is safe to do
** so. It is safe to enlarge the wal-index if pWal->writeLock is true
** or pWal->exclusiveMode==WAL_HEAPMEMORY_MODE.
**
** Three possible result scenarios:
**
** (1) rc==SQLITE_OK and *ppPage==Requested-Wal-Index-Page
** (2) rc>=SQLITE_ERROR and *ppPage==NULL
** (3) rc==SQLITE_OK and *ppPage==NULL // only if iPage==0
**
** Scenario (3) can only occur when pWal->writeLock is false and iPage==0
*/
static SQLITE_NOINLINE int walIndexPageRealloc(
    Wal *pWal,             /* The WAL context */
    int iPage,             /* The page we seek */
    volatile u32 **ppPage  /* Write the page pointer here */
) {
  int rc = SQLITE_OK;

  /* Enlarge the pWal->apWiData[] array if required */
  if (pWal->nWiData <= iPage) {
    sqlite3_int64  nByte = sizeof(u32 *) * (iPage + 1);
    volatile u32 **apNew;
    apNew = (volatile u32 **)sqlite3Realloc((void *)pWal->apWiData, nByte);
    if (!apNew) {
      *ppPage = 0;
      return SQLITE_NOMEM;
    }
    memset((void *)&apNew[pWal->nWiData], 0, sizeof(u32 *) * (iPage + 1 - pWal->nWiData));
    pWal->apWiData = apNew;
    pWal->nWiData = iPage + 1;
  }

  /* Request a pointer to the required page from the VFS */
  assert(pWal->apWiData[iPage] == 0);
  if (pWal->exclusiveMode == WAL_HEAPMEMORY_MODE) {
    pWal->apWiData[iPage] = (u32 volatile *)sqlite3MallocZero(WALINDEX_PGSZ);
    if (!pWal->apWiData[iPage]) rc = SQLITE_NOMEM;
  } else {
    rc = sqlite3OsShmMap(pWal->pDbFd, iPage, WALINDEX_PGSZ, pWal->writeLock,
                         (void volatile **)&pWal->apWiData[iPage]);
    assert(pWal->apWiData[iPage] != 0 || rc != SQLITE_OK || (pWal->writeLock == 0 && iPage == 0));
    testcase(pWal->apWiData[iPage] == 0 && rc == SQLITE_OK);
    if (rc == SQLITE_OK) {
      if (iPage > 0 && sqlite3FaultSim(600)) rc = SQLITE_NOMEM;
    } else if ((rc & 0xff) == SQLITE_READONLY) {
      pWal->readOnly |= WAL_SHM_RDONLY;
      if (rc == SQLITE_READONLY) {
        rc = SQLITE_OK;
      }
    }
  }

  *ppPage = pWal->apWiData[iPage];
  assert(iPage == 0 || *ppPage || rc != SQLITE_OK);
  return rc;
}
static int walIndexPage(
    Wal *pWal,             /* The WAL context */
    int iPage,             /* The page we seek */
    volatile u32 **ppPage  /* Write the page pointer here */
) {
  if (pWal->nWiData <= iPage || (*ppPage = pWal->apWiData[iPage]) == 0) {
    return walIndexPageRealloc(pWal, iPage, ppPage);
  }
  return SQLITE_OK;
}
/*
** Return a pointer to the WalCkptInfo structure in the wal-index.
*/
static volatile WalCkptInfo *walCkptInfo(Wal *pWal) {
  assert(pWal->nWiData > 0 && pWal->apWiData[0]);
  return (volatile WalCkptInfo *)&(pWal->apWiData[0][sizeof(WalIndexHdr) / 2]);
}
/*
** Return a pointer to the WalIndexHdr structure in the wal-index.
*/
static volatile WalIndexHdr *walIndexHdr(Wal *pWal) {
  assert(pWal->nWiData > 0 && pWal->apWiData[0]);
  return (volatile WalIndexHdr *)pWal->apWiData[0];
}
/*
** The argument to this macro must be of type u32. On a little-endian
** architecture, it returns the u32 value that results from interpreting
** the 4 bytes as a big-endian value. On a big-endian architecture, it
** returns the value that would be produced by interpreting the 4 bytes
** of the input value as a little-endian integer.
*/
#define BYTESWAP32(x) ( \
(((x)&0x000000FF)<<24) + (((x)&0x0000FF00)<<8) \
+ (((x)&0x00FF0000)>>8) + (((x)&0xFF000000)>>24) \
)
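
/* Worked example (illustrative): BYTESWAP32(0x11223344) == 0x44332211. */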
/*
** Generate or extend an 8 byte checksum based on the data in
** array aByte[] and the initial values of aIn[0] and aIn[1] (or
** initial values of 0 and 0 if aIn==NULL).
**
** The checksum is written back into aOut[] before returning.
**
** nByte must be a positive multiple of 8.
*/
static void walChecksumBytes(
  int nativeCksum, /* True for native byte-order, false for non-native */
  u8 *a,           /* Content to be checksummed */
  int nByte,       /* Bytes of content in a[].  Must be a multiple of 8. */
  const u32 *aIn,  /* Initial checksum value input */
  u32 *aOut        /* OUT: Final checksum value output */
){
  u32 s1, s2;
  u32 *aData = (u32 *)a;
  u32 *aEnd = (u32 *)&a[nByte];

  if( aIn ){
    s1 = aIn[0];
    s2 = aIn[1];
  }else{
    s1 = s2 = 0;
  }

  assert( nByte>=8 );
  assert( (nByte&0x00000007)==0 );
  assert( nByte<=65536 );

  if( nativeCksum ){
    do {
      s1 += *aData++ + s2;
      s2 += *aData++ + s1;
    }while( aData<aEnd );
  }else{
    do {
      s1 += BYTESWAP32(aData[0]) + s2;
      s2 += BYTESWAP32(aData[1]) + s1;
      aData += 2;
    }while( aData<aEnd );
  }

  aOut[0] = s1;
  aOut[1] = s2;
}
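
/*
** The checksum above is a Fibonacci-weighted pair of running sums: s1
** accumulates data plus s2, and s2 accumulates data plus s1, so the
** result is sensitive to both the values and the order of the input
** words.  The standalone sketch below (compiled out with #if 0;
** illustration only) reproduces the native-order branch on a small
** buffer and shows how an initial checksum is chained into a second call.
*/
#if 0
#include <stdio.h>
typedef unsigned int u32;

/* Native byte-order variant of the checksum; nWord must be even. */
static void cksum(const u32 *a, int nWord, const u32 *aIn, u32 *aOut){
  u32 s1 = aIn ? aIn[0] : 0;
  u32 s2 = aIn ? aIn[1] : 0;
  const u32 *aEnd = &a[nWord];
  do{
    s1 += *a++ + s2;
    s2 += *a++ + s1;
  }while( a<aEnd );
  aOut[0] = s1;
  aOut[1] = s2;
}

int main(void){
  u32 data[4] = {1, 2, 3, 4};
  u32 c[2];
  cksum(data, 2, 0, c);      /* checksum of the first 8 bytes */
  cksum(&data[2], 2, c, c);  /* chain the remaining 8 bytes */
  printf("s1=%u s2=%u\n", c[0], c[1]);   /* s1=7 s2=14 */
  return 0;
}
#endif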
/*
** If there is the possibility of concurrent access to the SHM file
** from multiple threads and/or processes, then do a memory barrier.
*/
static void walShmBarrier(Wal *pWal){
  if( pWal->exclusiveMode!=WAL_HEAPMEMORY_MODE ){
    sqlite3OsShmBarrier(pWal->pDbFd);
  }
}
/*
** Add the SQLITE_NO_TSAN as part of the return-type of a function
** definition as a hint that the function contains constructs that
** might give false-positive TSAN warnings.
**
** See tag-20200519-1.
*/
#if defined(__clang__) && !defined(SQLITE_NO_TSAN)
# define SQLITE_NO_TSAN __attribute__((no_sanitize_thread))
#else
# define SQLITE_NO_TSAN
#endif
/*
** Write the header information in pWal->hdr into the wal-index.
**
** The checksum on pWal->hdr is updated before it is written.
*/
static SQLITE_NO_TSAN void walIndexWriteHdr(Wal *pWal){
  volatile WalIndexHdr *aHdr = walIndexHdr(pWal);
  const int nCksum = offsetof(WalIndexHdr, aCksum);

  assert( pWal->writeLock );
  pWal->hdr.isInit = 1;
  pWal->hdr.iVersion = WALINDEX_MAX_VERSION;
  walChecksumBytes(1, (u8*)&pWal->hdr, nCksum, 0, pWal->hdr.aCksum);
  /* Possible TSAN false-positive.  See tag-20200519-1 */
  memcpy((void*)&aHdr[1], (const void*)&pWal->hdr, sizeof(WalIndexHdr));
  walShmBarrier(pWal);
  memcpy((void*)&aHdr[0], (const void*)&pWal->hdr, sizeof(WalIndexHdr));
}
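
/*
** The header is stored twice so that lock-free readers can detect a torn
** read: the writer updates copy [1], issues a barrier, then updates copy
** [0]; a reader loads [0], issues a barrier, loads [1], and accepts the
** result only if the two copies match.  The standalone sketch below
** (compiled out with #if 0; illustration only) shows the same publish
** protocol using C11 atomic fences in place of the VFS shm-barrier.
*/
#if 0
#include <stdatomic.h>
#include <stdio.h>
#include <string.h>

typedef struct Hdr { unsigned seq; unsigned nPage; } Hdr;
static Hdr aHdr[2];             /* Two copies, as in the wal-index */

static void writeHdr(const Hdr *pNew){
  memcpy(&aHdr[1], pNew, sizeof(Hdr));
  atomic_thread_fence(memory_order_seq_cst);   /* cf. walShmBarrier() */
  memcpy(&aHdr[0], pNew, sizeof(Hdr));
}

/* Return 0 and fill *pOut on a consistent read; return 1 on a torn read. */
static int tryReadHdr(Hdr *pOut){
  Hdr h1, h2;
  memcpy(&h1, &aHdr[0], sizeof(Hdr));
  atomic_thread_fence(memory_order_seq_cst);
  memcpy(&h2, &aHdr[1], sizeof(Hdr));
  if( memcmp(&h1, &h2, sizeof(Hdr))!=0 ) return 1;
  *pOut = h1;
  return 0;
}

int main(void){
  Hdr h = {7, 100}, out;
  writeHdr(&h);
  printf("dirty=%d nPage=%u\n", tryReadHdr(&out), out.nPage);
  return 0;
}
#endif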
/*
** This function encodes a single frame header and writes it to a buffer
** supplied by the caller. A frame-header is made up of a series of
** 4-byte big-endian integers, as follows:
**
** 0: Page number.
** 4: For commit records, the size of the database image in pages
** after the commit. For all other records, zero.
** 8: Salt-1 (copied from the wal-header)
** 12: Salt-2 (copied from the wal-header)
** 16: Checksum-1.
** 20: Checksum-2.
*/
static void walEncodeFrame(
  Wal *pWal,                      /* The write-ahead log */
  u32 iPage,                      /* Database page number for frame */
  u32 nTruncate,                  /* New db size (or 0 for non-commit frames) */
  u8 *aData,                      /* Pointer to page data */
  u8 *aFrame                      /* OUT: Write encoded frame here */
){
  int nativeCksum;                /* True for native byte-order checksums */
  u32 *aCksum = pWal->hdr.aFrameCksum;
  assert( WAL_FRAME_HDRSIZE==24 );
  sqlite3Put4byte(&aFrame[0], iPage);
  sqlite3Put4byte(&aFrame[4], nTruncate);
  if( pWal->iReCksum==0 ){
    memcpy(&aFrame[8], pWal->hdr.aSalt, 8);

    nativeCksum = (pWal->hdr.bigEndCksum==SQLITE_BIGENDIAN);
    walChecksumBytes(nativeCksum, aFrame, 8, aCksum, aCksum);
    walChecksumBytes(nativeCksum, aData, pWal->szPage, aCksum, aCksum);

    sqlite3Put4byte(&aFrame[16], aCksum[0]);
    sqlite3Put4byte(&aFrame[20], aCksum[1]);
  }else{
    memset(&aFrame[8], 0, 16);
  }
}
/*
** Check to see if the frame with header in aFrame[] and content
** in aData[] is valid.  If it is a valid frame, fill *piPage and
** *pnTruncate and return true.  Return zero if the frame is not valid.
*/
static int walDecodeFrame(
  Wal *pWal,                      /* The write-ahead log */
  u32 *piPage,                    /* OUT: Database page number for frame */
  u32 *pnTruncate,                /* OUT: New db size (or 0 if not commit) */
  u8 *aData,                      /* Pointer to page data (for checksum) */
  u8 *aFrame                      /* Frame data */
){
  int nativeCksum;                /* True for native byte-order checksums */
  u32 *aCksum = pWal->hdr.aFrameCksum;
  u32 pgno;                       /* Page number of the frame */
  assert( WAL_FRAME_HDRSIZE==24 );

  /* A frame is only valid if the salt values in the frame-header
  ** match the salt values in the wal-header.
  */
  if( memcmp(&pWal->hdr.aSalt, &aFrame[8], 8)!=0 ){
    return 0;
  }

  /* A frame is only valid if the page number is greater than zero.
  */
  pgno = sqlite3Get4byte(&aFrame[0]);
  if( pgno==0 ){
    return 0;
  }

  /* A frame is only valid if a checksum of the WAL header,
  ** all prior frames, the first 16 bytes of this frame-header,
  ** and the frame-data matches the checksum in the last 8
  ** bytes of this frame-header.
  */
  nativeCksum = (pWal->hdr.bigEndCksum==SQLITE_BIGENDIAN);
  walChecksumBytes(nativeCksum, aFrame, 8, aCksum, aCksum);
  walChecksumBytes(nativeCksum, aData, pWal->szPage, aCksum, aCksum);
  if( aCksum[0]!=sqlite3Get4byte(&aFrame[16])
   || aCksum[1]!=sqlite3Get4byte(&aFrame[20])
  ){
    /* Checksum failed. */
    return 0;
  }

  /* If we reach this point, the frame is valid.  Return the page number
  ** and the new database size.
  */
  *piPage = pgno;
  *pnTruncate = sqlite3Get4byte(&aFrame[4]);
  return 1;
}
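
/*
** A standalone round-trip of the 24-byte frame-header layout documented
** above walEncodeFrame() (compiled out with #if 0; illustration only;
** the checksum fields are left zero here so that only the big-endian
** field packing is exercised).
*/
#if 0
#include <stdio.h>
#include <string.h>
typedef unsigned int u32;
typedef unsigned char u8;

static void put4(u8 *p, u32 v){
  p[0] = (u8)(v>>24); p[1] = (u8)(v>>16); p[2] = (u8)(v>>8); p[3] = (u8)v;
}
static u32 get4(const u8 *p){
  return ((u32)p[0]<<24) | ((u32)p[1]<<16) | ((u32)p[2]<<8) | (u32)p[3];
}

int main(void){
  u8 aFrame[24];
  memset(aFrame, 0, sizeof(aFrame));
  put4(&aFrame[0], 5);        /* page number */
  put4(&aFrame[4], 12);       /* db size in pages => commit frame */
  put4(&aFrame[8], 0xdead);   /* salt-1 */
  put4(&aFrame[12], 0xbeef);  /* salt-2 */
  printf("pgno=%u nTruncate=%u commit=%d\n",
         get4(&aFrame[0]), get4(&aFrame[4]), get4(&aFrame[4])!=0);
  return 0;
}
#endif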
#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
/*
** Names of locks. This routine is used to provide debugging output and is not
** a part of an ordinary build.
*/
static const char *walLockName(int lockIdx){
  if( lockIdx==WAL_WRITE_LOCK ){
    return "WRITE-LOCK";
  }else if( lockIdx==WAL_CKPT_LOCK ){
    return "CKPT-LOCK";
  }else if( lockIdx==WAL_RECOVER_LOCK ){
    return "RECOVER-LOCK";
  }else{
    static char zName[15];
    sqlite3_snprintf(sizeof(zName), zName, "READ-LOCK[%d]",
                     lockIdx-WAL_READ_LOCK(0));
    return zName;
  }
}
#endif
/*defined(SQLITE_TEST) || defined(SQLITE_DEBUG) */
/*
** Set or release locks on the WAL. Locks are either shared or exclusive.
** A lock cannot be moved directly between shared and exclusive - it must go
** through the unlocked state first.
**
** In locking_mode=EXCLUSIVE, all of these routines become no-ops.
*/
static int walLockShared(Wal *pWal, int lockIdx){
  int rc;
  if( pWal->exclusiveMode ) return SQLITE_OK;
  rc = sqlite3OsShmLock(pWal->pDbFd, lockIdx, 1,
                        SQLITE_SHM_LOCK | SQLITE_SHM_SHARED);
  WALTRACE(("WAL%p: acquire SHARED-%s %s\n", pWal,
            walLockName(lockIdx), rc ? "failed" : "ok"));
  VVA_ONLY( pWal->lockError = (u8)(rc!=SQLITE_OK && (rc&0xFF)!=SQLITE_BUSY); )
  return rc;
}
static void walUnlockShared(Wal *pWal, int lockIdx){
  if( pWal->exclusiveMode ) return;
  (void)sqlite3OsShmLock(pWal->pDbFd, lockIdx, 1,
                         SQLITE_SHM_UNLOCK | SQLITE_SHM_SHARED);
  WALTRACE(("WAL%p: release SHARED-%s\n", pWal, walLockName(lockIdx)));
}
static int walLockExclusive(Wal *pWal, int lockIdx, int n){
  int rc;
  if( pWal->exclusiveMode ) return SQLITE_OK;
  rc = sqlite3OsShmLock(pWal->pDbFd, lockIdx, n,
                        SQLITE_SHM_LOCK | SQLITE_SHM_EXCLUSIVE);
  WALTRACE(("WAL%p: acquire EXCLUSIVE-%s cnt=%d %s\n", pWal,
            walLockName(lockIdx), n, rc ? "failed" : "ok"));
  VVA_ONLY( pWal->lockError = (u8)(rc!=SQLITE_OK && (rc&0xFF)!=SQLITE_BUSY); )
  return rc;
}
static void walUnlockExclusive(Wal *pWal, int lockIdx, int n){
  if( pWal->exclusiveMode ) return;
  (void)sqlite3OsShmLock(pWal->pDbFd, lockIdx, n,
                         SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE);
  WALTRACE(("WAL%p: release EXCLUSIVE-%s cnt=%d\n", pWal,
            walLockName(lockIdx), n));
}
/*
** Compute a hash on a page number. The resulting hash value must land
** between 0 and (HASHTABLE_NSLOT-1). The walHashNext() function advances
** the hash to the next value in the event of a collision.
*/
static int walHash(u32 iPage){
  assert( iPage>0 );
  assert( (HASHTABLE_NSLOT & (HASHTABLE_NSLOT-1))==0 );
  return (iPage*HASHTABLE_HASH_1) & (HASHTABLE_NSLOT-1);
}
static int walNextHash(int iPriorHash){
  return (iPriorHash+1)&(HASHTABLE_NSLOT-1);
}
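
/*
** walHash() spreads page numbers over a power-of-two table with a
** multiplicative hash, and walNextHash() resolves collisions by linear
** probing with wrap-around.  The standalone sketch below (compiled out
** with #if 0; illustration only) uses the same constants on a toy table
** to show insertion and lookup under probing.
*/
#if 0
#include <stdio.h>
typedef unsigned int u32;
#define NSLOT 8192   /* must be a power of two; cf. HASHTABLE_NSLOT */
#define HASH_1 383   /* cf. HASHTABLE_HASH_1 */

static int slot[NSLOT];   /* 0 means empty; otherwise a 1-based value */

static int hash(u32 iPage){ return (iPage*HASH_1) & (NSLOT-1); }
static int nextHash(int h){ return (h+1) & (NSLOT-1); }

static void insert(u32 iPage, int idx){
  int iKey = hash(iPage);
  while( slot[iKey] ) iKey = nextHash(iKey);   /* probe to a free slot */
  slot[iKey] = idx;
}

int main(void){
  int h = hash(100);
  insert(100, 1);
  insert(100+NSLOT, 2);   /* same hash as 100, so it probes one slot on */
  printf("slot[%d]=%d slot[%d]=%d\n",
         h, slot[h], nextHash(h), slot[nextHash(h)]);
  return 0;
}
#endif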
/*
** An instance of the WalHashLoc object is used to describe the location
** of a page hash table in the wal-index. This becomes the return value
** from walHashGet().
*/
typedef struct WalHashLoc WalHashLoc;
struct WalHashLoc {
  volatile ht_slot *aHash;  /* Start of the wal-index hash table */
  volatile u32 *aPgno;      /* aPgno[1] is the page of first frame indexed */
  u32 iZero;                /* One less than the frame number of first indexed */
};
/*
** Return pointers to the hash table and page number array stored on
** page iHash of the wal-index. The wal-index is broken into 32KB pages
** numbered starting from 0.
**
** Set output variable pLoc->aHash to point to the start of the hash table
** in the wal-index file. Set pLoc->iZero to one less than the frame
** number of the first frame indexed by this hash table. If a
** slot in the hash table is set to N, it refers to frame number
** (pLoc->iZero+N) in the log.
**
** Finally, set pLoc->aPgno so that pLoc->aPgno[0] is the page number of the
** first frame indexed by the hash table, frame (pLoc->iZero).
*/
static int walHashGet(
  Wal *pWal,                      /* WAL handle */
  int iHash,                      /* Find the iHash'th table */
  WalHashLoc *pLoc                /* OUT: Hash table location */
){
  int rc;                         /* Return code */

  rc = walIndexPage(pWal, iHash, &pLoc->aPgno);
  assert( rc==SQLITE_OK || iHash>0 );

  if( pLoc->aPgno ){
    pLoc->aHash = (volatile ht_slot *)&pLoc->aPgno[HASHTABLE_NPAGE];
    if( iHash==0 ){
      pLoc->aPgno = &pLoc->aPgno[WALINDEX_HDR_SIZE/sizeof(u32)];
      pLoc->iZero = 0;
    }else{
      pLoc->iZero = HASHTABLE_NPAGE_ONE + (iHash-1)*HASHTABLE_NPAGE;
    }
  }else if( NEVER(rc==SQLITE_OK) ){
    rc = SQLITE_ERROR;
  }
  return rc;
}
/*
** Return the number of the wal-index page that contains the hash-table
** and page-number array that contain entries corresponding to WAL frame
** iFrame. The wal-index is broken up into 32KB pages. Wal-index pages
** are numbered starting from 0.
*/
static int walFramePage(u32 iFrame){
  int iHash = (iFrame+HASHTABLE_NPAGE-HASHTABLE_NPAGE_ONE-1) / HASHTABLE_NPAGE;
  assert( (iHash==0 || iFrame>HASHTABLE_NPAGE_ONE)
       && (iHash>=1 || iFrame<=HASHTABLE_NPAGE_ONE)
       && (iHash<=1 || iFrame>(HASHTABLE_NPAGE_ONE+HASHTABLE_NPAGE))
       && (iHash>=2 || iFrame<=HASHTABLE_NPAGE_ONE+HASHTABLE_NPAGE)
       && (iHash<=2 || iFrame>(HASHTABLE_NPAGE_ONE+2*HASHTABLE_NPAGE))
  );
  assert( iHash>=0 );
  return iHash;
}
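
/*
** The first wal-index page holds HASHTABLE_NPAGE_ONE (4062) frames and
** every later page holds HASHTABLE_NPAGE (4096), which is what the
** division in walFramePage() encodes.  The standalone sketch below
** (compiled out with #if 0; illustration only) evaluates the same
** formula at the page boundaries.
*/
#if 0
#include <stdio.h>
typedef unsigned int u32;
#define NPAGE      4096   /* cf. HASHTABLE_NPAGE */
#define NPAGE_ONE  4062   /* cf. HASHTABLE_NPAGE_ONE */

static int framePage(u32 iFrame){
  return (iFrame+NPAGE-NPAGE_ONE-1) / NPAGE;
}

int main(void){
  /* Last frame on page 0, first frame on page 1, and so on. */
  printf("frame %d -> page %d\n", NPAGE_ONE,         framePage(NPAGE_ONE));
  printf("frame %d -> page %d\n", NPAGE_ONE+1,       framePage(NPAGE_ONE+1));
  printf("frame %d -> page %d\n", NPAGE_ONE+NPAGE,   framePage(NPAGE_ONE+NPAGE));
  printf("frame %d -> page %d\n", NPAGE_ONE+NPAGE+1, framePage(NPAGE_ONE+NPAGE+1));
  return 0;
}
#endif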
/*
** Return the page number associated with frame iFrame in this WAL.
*/
static u32 walFramePgno(Wal *pWal, u32 iFrame){
  int iHash = walFramePage(iFrame);
  if( iHash==0 ){
    return pWal->apWiData[0][WALINDEX_HDR_SIZE/sizeof(u32) + iFrame - 1];
  }
  return pWal->apWiData[iHash][(iFrame-1-HASHTABLE_NPAGE_ONE)%HASHTABLE_NPAGE];
}
/*
** Remove entries from the hash table that point to WAL slots greater
** than pWal->hdr.mxFrame.
**
** This function is called whenever pWal->hdr.mxFrame is decreased due
** to a rollback or savepoint.
**
** At most only the hash table containing pWal->hdr.mxFrame needs to be
** updated. Any later hash tables will be automatically cleared when
** pWal->hdr.mxFrame advances to the point where those hash tables are
** actually needed.
*/
static void walCleanupHash(Wal *pWal){
  WalHashLoc sLoc;                /* Hash table location */
  int iLimit = 0;                 /* Zero values greater than this */
  int nByte;                      /* Number of bytes to zero in aPgno[] */
  int i;                          /* Used to iterate through aHash[] */

  assert( pWal->writeLock );
  testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE_ONE-1 );
  testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE_ONE );
  testcase( pWal->hdr.mxFrame==HASHTABLE_NPAGE_ONE+1 );

  if( pWal->hdr.mxFrame==0 ) return;

  /* Obtain pointers to the hash-table and page-number array containing
  ** the entry that corresponds to frame pWal->hdr.mxFrame. It is guaranteed
  ** that the page said hash-table and array reside on is already mapped.(1)
  */
  assert( pWal->nWiData>walFramePage(pWal->hdr.mxFrame) );
  assert( pWal->apWiData[walFramePage(pWal->hdr.mxFrame)] );
  i = walHashGet(pWal, walFramePage(pWal->hdr.mxFrame), &sLoc);
  if( NEVER(i) ) return; /* Defense-in-depth, in case (1) above is wrong */

  /* Zero all hash-table entries that correspond to frame numbers greater
  ** than pWal->hdr.mxFrame.
  */
  iLimit = pWal->hdr.mxFrame - sLoc.iZero;
  assert( iLimit>0 );
  for(i=0; i<HASHTABLE_NSLOT; i++){
    if( sLoc.aHash[i]>iLimit ){
      sLoc.aHash[i] = 0;
    }
  }

  /* Zero the entries in the aPgno array that correspond to frames with
  ** frame numbers greater than pWal->hdr.mxFrame.
  */
  nByte = (int)((char *)sLoc.aHash - (char *)&sLoc.aPgno[iLimit]);
  assert( nByte>=0 );
  memset((void *)&sLoc.aPgno[iLimit], 0, nByte);

#ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT
  /* Verify that every entry in the mapping region is still reachable
  ** via the hash table even after the cleanup.
  */
  if( iLimit ){
    int j;           /* Loop counter */
    int iKey;        /* Hash key */
    for(j=0; j<iLimit; j++){
      for(iKey=walHash(sLoc.aPgno[j]);sLoc.aHash[iKey];iKey=walNextHash(iKey)){
        if( sLoc.aHash[iKey]==j+1 ) break;
      }
      assert( sLoc.aHash[iKey]==j+1 );
    }
  }
#endif /* SQLITE_ENABLE_EXPENSIVE_ASSERT */
}
/*
** Set an entry in the wal-index that will map database page number
** pPage into WAL frame iFrame.
*/
static int walIndexAppend(Wal *pWal, u32 iFrame, u32 iPage){
  int rc;                         /* Return code */
  WalHashLoc sLoc;                /* Wal-index hash table location */

  rc = walHashGet(pWal, walFramePage(iFrame), &sLoc);

  /* Assuming the wal-index file was successfully mapped, populate the
  ** page number array and hash table entry.
  */
  if( rc==SQLITE_OK ){
    int iKey;                     /* Hash table key */
    int idx;                      /* Value to write to hash-table slot */
    int nCollide;                 /* Number of hash collisions */

    idx = iFrame - sLoc.iZero;
    assert( idx <= HASHTABLE_NSLOT/2 + 1 );

    /* If this is the first entry to be added to this hash-table, zero the
    ** entire hash table and aPgno[] array before proceeding.
    */
    if( idx==1 ){
      int nByte = (int)((u8*)&sLoc.aHash[HASHTABLE_NSLOT] - (u8*)sLoc.aPgno);
      assert( nByte>=0 );
      memset((void*)sLoc.aPgno, 0, nByte);
    }

    /* If the entry in aPgno[] is already set, then the previous writer
    ** must have exited unexpectedly in the middle of a transaction (after
    ** writing one or more dirty pages to the WAL to free up memory).
    ** Remove the remnants of that writer's uncommitted transaction from
    ** the hash-table before writing any new entries.
    */
    if( sLoc.aPgno[idx-1] ){
      walCleanupHash(pWal);
      assert( !sLoc.aPgno[idx-1] );
    }

    /* Write the aPgno[] array entry and the hash-table slot. */
    nCollide = idx;
    for(iKey=walHash(iPage); sLoc.aHash[iKey]; iKey=walNextHash(iKey)){
      if( (nCollide--)==0 ) return SQLITE_CORRUPT_BKPT;
    }
    sLoc.aPgno[idx-1] = iPage;
    AtomicStore(&sLoc.aHash[iKey], (ht_slot)idx);

#ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT
    /* Verify that the number of entries in the hash table exactly equals
    ** the number of entries in the mapping region.
    */
    {
      int i;           /* Loop counter */
      int nEntry = 0;  /* Number of entries in the hash table */
      for(i=0; i<HASHTABLE_NSLOT; i++){ if( sLoc.aHash[i] ) nEntry++; }
      assert( nEntry==idx );
    }

    /* Verify that every entry in the mapping region is reachable
    ** via the hash table.  This turns out to be a really, really expensive
    ** thing to check, so only do this occasionally - not on every
    ** iteration.
    */
    if( (idx&0x3ff)==0 ){
      int i;           /* Loop counter */
      for(i=0; i<idx; i++){
        for(iKey=walHash(sLoc.aPgno[i]);
            sLoc.aHash[iKey];
            iKey=walNextHash(iKey)){
          if( sLoc.aHash[iKey]==i+1 ) break;
        }
        assert( sLoc.aHash[iKey]==i+1 );
      }
    }
#endif /* SQLITE_ENABLE_EXPENSIVE_ASSERT */
  }

  return rc;
}
/*
** Recover the wal-index by reading the write-ahead log file.
**
** This routine first tries to establish an exclusive lock on the
** wal-index to prevent other threads/processes from doing anything
** with the WAL or wal-index while recovery is running. The
** WAL_RECOVER_LOCK is also held so that other threads will know
** that this thread is running recovery. If unable to establish
** the necessary locks, this routine returns SQLITE_BUSY.
*/
static int walIndexRecover(Wal *pWal){
  int rc;                         /* Return Code */
  i64 nSize;                      /* Size of log file */
  u32 aFrameCksum[2] = {0, 0};
  int iLock;                      /* Lock offset to lock for checkpoint */

  /* Obtain an exclusive lock on all bytes in the locking range not already
  ** locked by the caller. The caller is guaranteed to have locked the
  ** WAL_WRITE_LOCK byte, and may have also locked the WAL_CKPT_LOCK byte.
  ** If successful, the same bytes that are locked here are unlocked before
  ** this function returns.
  */
  assert( pWal->ckptLock==1 || pWal->ckptLock==0 );
  assert( WAL_ALL_BUT_WRITE==WAL_WRITE_LOCK+1 );
  assert( WAL_CKPT_LOCK==WAL_ALL_BUT_WRITE );
  assert( pWal->writeLock );
  iLock = WAL_ALL_BUT_WRITE + pWal->ckptLock;
  rc = walLockExclusive(pWal, iLock, WAL_READ_LOCK(0)-iLock);
  if( rc ){
    return rc;
  }

  WALTRACE(("WAL%p: recovery begin...\n", pWal));

  memset(&pWal->hdr, 0, sizeof(WalIndexHdr));

  rc = sqlite3OsFileSize(pWal->pWalFd, &nSize);
  if( rc!=SQLITE_OK ){
    goto recovery_error;
  }

  if( nSize>WAL_HDRSIZE ){
    u8 aBuf[WAL_HDRSIZE];         /* Buffer to load WAL header into */
    u32 *aPrivate = 0;            /* Heap copy of *-shm hash being populated */
    u8 *aFrame = 0;               /* Malloc'd buffer to load entire frame */
    int szFrame;                  /* Number of bytes in buffer aFrame[] */
    u8 *aData;                    /* Pointer to data part of aFrame buffer */
    int szPage;                   /* Page size according to the log */
    u32 magic;                    /* Magic value read from WAL header */
    u32 version;                  /* Magic value read from WAL header */
    int isValid;                  /* True if this frame is valid */
    u32 iPg;                      /* Current 32KB wal-index page */
    u32 iLastFrame;               /* Last frame in wal, based on nSize alone */

    /* Read in the WAL header. */
    rc = sqlite3OsRead(pWal->pWalFd, aBuf, WAL_HDRSIZE, 0);
    if( rc!=SQLITE_OK ){
      goto recovery_error;
    }

    /* If the database page size is not a power of two, or is greater than
    ** SQLITE_MAX_PAGE_SIZE, conclude that the WAL file contains no valid
    ** data. Similarly, if the 'magic' value is invalid, ignore the whole
    ** WAL file.
    */
    magic = sqlite3Get4byte(&aBuf[0]);
    szPage = sqlite3Get4byte(&aBuf[8]);
    if( (magic&0xFFFFFFFE)!=WAL_MAGIC
     || szPage&(szPage-1)
     || szPage>SQLITE_MAX_PAGE_SIZE
     || szPage<512
    ){
      goto finished;
    }
    pWal->hdr.bigEndCksum = (u8)(magic&0x00000001);
    pWal->szPage = szPage;
    pWal->nCkpt = sqlite3Get4byte(&aBuf[12]);
    memcpy(&pWal->hdr.aSalt, &aBuf[16], 8);

    /* Verify that the WAL header checksum is correct */
    walChecksumBytes(pWal->hdr.bigEndCksum==SQLITE_BIGENDIAN,
        aBuf, WAL_HDRSIZE-2*4, 0, pWal->hdr.aFrameCksum
    );
    if( pWal->hdr.aFrameCksum[0]!=sqlite3Get4byte(&aBuf[24])
     || pWal->hdr.aFrameCksum[1]!=sqlite3Get4byte(&aBuf[28])
    ){
      goto finished;
    }

    /* Verify that the version number on the WAL format is one that
    ** we are able to understand */
    version = sqlite3Get4byte(&aBuf[4]);
    if( version!=WAL_MAX_VERSION ){
      rc = SQLITE_CANTOPEN_BKPT;
      goto finished;
    }

    /* Malloc a buffer to read frames into. */
    szFrame = szPage + WAL_FRAME_HDRSIZE;
    aFrame = (u8 *)sqlite3_malloc64(szFrame + WALINDEX_PGSZ);
    if( !aFrame ){
      rc = SQLITE_NOMEM;
      goto recovery_error;
    }
    aData = &aFrame[WAL_FRAME_HDRSIZE];
    aPrivate = (u32*)&aData[szPage];

    /* Read all frames from the log file. */
    iLastFrame = (nSize - WAL_HDRSIZE) / szFrame;
    for(iPg=0; iPg<=(u32)walFramePage(iLastFrame); iPg++){
      u32 *aShare;
      u32 iFrame;                 /* Index of last frame read */
      u32 iLast = MIN(iLastFrame, HASHTABLE_NPAGE_ONE+iPg*HASHTABLE_NPAGE);
      u32 iFirst = 1 + (iPg==0?0:HASHTABLE_NPAGE_ONE+(iPg-1)*HASHTABLE_NPAGE);
      u32 nHdr, nHdr32;
      rc = walIndexPage(pWal, iPg, (volatile u32**)&aShare);
      assert( aShare!=0 || rc!=SQLITE_OK );
      if( aShare==0 ) break;
      pWal->apWiData[iPg] = aPrivate;

      for(iFrame=iFirst; iFrame<=iLast; iFrame++){
        i64 iOffset = walFrameOffset(iFrame, szPage);
        u32 pgno;                 /* Database page number for frame */
        u32 nTruncate;            /* dbsize field from frame header */

        /* Read and decode the next log frame. */
        rc = sqlite3OsRead(pWal->pWalFd, aFrame, szFrame, iOffset);
        if( rc!=SQLITE_OK ) break;
        isValid = walDecodeFrame(pWal, &pgno, &nTruncate, aData, aFrame);
        if( !isValid ) break;
        rc = walIndexAppend(pWal, iFrame, pgno);
        if( NEVER(rc!=SQLITE_OK) ) break;

        /* If nTruncate is non-zero, this is a commit record. */
        if( nTruncate ){
          pWal->hdr.mxFrame = iFrame;
          pWal->hdr.nPage = nTruncate;
          pWal->hdr.szPage = (u16)((szPage&0xff00) | (szPage>>16));
          testcase( szPage<=32768 );
          testcase( szPage>=65536 );
          aFrameCksum[0] = pWal->hdr.aFrameCksum[0];
          aFrameCksum[1] = pWal->hdr.aFrameCksum[1];
        }
      }
      pWal->apWiData[iPg] = aShare;
      nHdr = (iPg==0 ? WALINDEX_HDR_SIZE : 0);
      nHdr32 = nHdr / sizeof(u32);
#ifndef SQLITE_SAFER_WALINDEX_RECOVERY
      /* Memcpy() should work fine here, on all reasonable implementations.
      ** Technically, memcpy() might change the destination to some
      ** intermediate value before setting to the final value, and that might
      ** cause a concurrent reader to malfunction.  Memcpy() is allowed to
      ** do that, according to the spec, but no memcpy() implementation that
      ** we know of actually does that, which is why we say that memcpy()
      ** is safe for this.  Memcpy() is certainly a lot faster.
      */
      memcpy(&aShare[nHdr32], &aPrivate[nHdr32], WALINDEX_PGSZ-nHdr);
#else
      /* In the event that some platform is found for which memcpy()
      ** changes the destination to some intermediate value before
      ** setting the final value, this alternative copy routine is
      ** provided.
      */
      {
        int i;
        for(i=nHdr32; i<WALINDEX_PGSZ/sizeof(u32); i++){
          if( aShare[i]!=aPrivate[i] ){
            /* Atomic memory operations are not required here because if
            ** the value needs to be changed, that means it is not being
            ** accessed concurrently. */
            aShare[i] = aPrivate[i];
          }
        }
      }
#endif
      if( iFrame<=iLast ) break;
    }

    sqlite3_free(aFrame);
  }

finished:
  if( rc==SQLITE_OK ){
    volatile WalCkptInfo *pInfo;
    int i;
    pWal->hdr.aFrameCksum[0] = aFrameCksum[0];
    pWal->hdr.aFrameCksum[1] = aFrameCksum[1];
    walIndexWriteHdr(pWal);

    /* Reset the checkpoint-header. This is safe because this thread is
    ** currently holding locks that exclude all other writers and
    ** checkpointers. Then set the values of read-mark slots 1 through N.
    */
    pInfo = walCkptInfo(pWal);
    pInfo->nBackfill = 0;
    pInfo->nBackfillAttempted = pWal->hdr.mxFrame;
    pInfo->aReadMark[0] = 0;
    for(i=1; i<WAL_NREADER; i++){
      rc = walLockExclusive(pWal, WAL_READ_LOCK(i), 1);
      if( rc==SQLITE_OK ){
        if( i==1 && pWal->hdr.mxFrame ){
          pInfo->aReadMark[i] = pWal->hdr.mxFrame;
        }else{
          pInfo->aReadMark[i] = READMARK_NOT_USED;
        }
        walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1);
      }else if( rc!=SQLITE_BUSY ){
        goto recovery_error;
      }
    }

    /* If more than one frame was recovered from the log file, report an
    ** event via sqlite3_log(). This is to help with identifying performance
    ** problems caused by applications routinely shutting down without
    ** checkpointing the log file.
    */
    if( pWal->hdr.nPage ){
      sqlite3_log(SQLITE_NOTICE_RECOVER_WAL,
          "recovered %d frames from WAL file %s",
          pWal->hdr.mxFrame, pWal->zWalName
      );
    }
  }

recovery_error:
  WALTRACE(("WAL%p: recovery %s\n", pWal, rc ? "failed" : "ok"));
  walUnlockExclusive(pWal, iLock, WAL_READ_LOCK(0)-iLock);
  return rc;
}
/*
** Close an open wal-index.
*/
static void walIndexClose(Wal *pWal, int isDelete){
  if( pWal->exclusiveMode==WAL_HEAPMEMORY_MODE || pWal->bShmUnreliable ){
    int i;
    for(i=0; i<pWal->nWiData; i++){
      sqlite3_free((void *)pWal->apWiData[i]);
      pWal->apWiData[i] = 0;
    }
  }
  if( pWal->exclusiveMode!=WAL_HEAPMEMORY_MODE ){
    sqlite3OsShmUnmap(pWal->pDbFd, isDelete);
  }
}
/*
** Open a connection to the WAL file zWalName. The database file must
** already be opened on connection pDbFd. The buffer that zWalName points
** to must remain valid for the lifetime of the returned Wal* handle.
**
** A SHARED lock should be held on the database file when this function
** is called. The purpose of this SHARED lock is to prevent any other
** client from unlinking the WAL or wal-index file. If another process
** were to do this just after this client opened one of these files, the
** system would be badly broken.
**
** If the log file is successfully opened, SQLITE_OK is returned and
** *ppWal is set to point to a new WAL handle. If an error occurs,
** an SQLite error code is returned and *ppWal is left unmodified.
*/
int sqlite3WalOpen(
  sqlite3_vfs *pVfs,              /* vfs module to open wal and wal-index */
  sqlite3_file *pDbFd,            /* The open database file */
  const char *zWalName,           /* Name of the WAL file */
  int bNoShm,                     /* True to run in heap-memory mode */
  i64 mxWalSize,                  /* Truncate WAL to this size on reset */
  Wal **ppWal                     /* OUT: Allocated Wal handle */
){
  int rc;                         /* Return Code */
  Wal *pRet;                      /* Object to allocate and return */
  int flags;                      /* Flags passed to OsOpen() */

  assert( zWalName && zWalName[0] );
  assert( pDbFd );

  /* Verify the values of various constants.  Any changes to the values
  ** of these constants would result in an incompatible on-disk format
  ** for the -shm file.  Any change that causes one of these asserts to
  ** fail is a backward compatibility problem, even if the change otherwise
  ** works.
  **
  ** This table also serves as a helpful cross-reference when trying to
  ** interpret hex dumps of the -shm file.
  */
  assert( 48   == sizeof(WalIndexHdr) );
  assert( 40   == sizeof(WalCkptInfo) );
  assert( 120  == WALINDEX_LOCK_OFFSET );
  assert( 136  == WALINDEX_HDR_SIZE );
  assert( 4096 == HASHTABLE_NPAGE );
  assert( 4062 == HASHTABLE_NPAGE_ONE );
  assert( 8192 == HASHTABLE_NSLOT );
  assert( 383  == HASHTABLE_HASH_1 );
  assert( 32768 == WALINDEX_PGSZ );
  assert( 8    == SQLITE_SHM_NLOCK );
  assert( 5    == WAL_NREADER );
  assert( 24   == WAL_FRAME_HDRSIZE );
  assert( 32   == WAL_HDRSIZE );
  assert( 120  == WALINDEX_LOCK_OFFSET + WAL_WRITE_LOCK );
  assert( 121  == WALINDEX_LOCK_OFFSET + WAL_CKPT_LOCK );
  assert( 122  == WALINDEX_LOCK_OFFSET + WAL_RECOVER_LOCK );
  assert( 123  == WALINDEX_LOCK_OFFSET + WAL_READ_LOCK(0) );
  assert( 124  == WALINDEX_LOCK_OFFSET + WAL_READ_LOCK(1) );
  assert( 125  == WALINDEX_LOCK_OFFSET + WAL_READ_LOCK(2) );
  assert( 126  == WALINDEX_LOCK_OFFSET + WAL_READ_LOCK(3) );
  assert( 127  == WALINDEX_LOCK_OFFSET + WAL_READ_LOCK(4) );

  /* In the amalgamation, the os_unix.c and os_win.c source files come before
  ** this source file.  Verify that the #defines of the locking byte offsets
  ** in os_unix.c and os_win.c agree with the WALINDEX_LOCK_OFFSET value.
  ** For that matter, if the lock offset ever changes from its initial design
  ** value of 120, we need to know that so there is an assert() to check it.
  */
#ifdef WIN_SHM_BASE
  assert( WIN_SHM_BASE==WALINDEX_LOCK_OFFSET );
#endif
#ifdef UNIX_SHM_BASE
  assert( UNIX_SHM_BASE==WALINDEX_LOCK_OFFSET );
#endif

  /* Allocate an instance of struct Wal to return. */
  *ppWal = 0;
  pRet = (Wal*)sqlite3MallocZero(sizeof(Wal) + pVfs->szOsFile);
  if( !pRet ){
    return SQLITE_NOMEM;
  }

  pRet->pVfs = pVfs;
  pRet->pWalFd = (sqlite3_file *)&pRet[1];
  pRet->pDbFd = pDbFd;
  pRet->readLock = -1;
  pRet->mxWalSize = mxWalSize;
  pRet->zWalName = zWalName;
  pRet->syncHeader = 1;
  pRet->padToSectorBoundary = 1;
  pRet->exclusiveMode = (bNoShm ? WAL_HEAPMEMORY_MODE : WAL_NORMAL_MODE);

  /* Open file handle on the write-ahead log file. */
  flags = (SQLITE_OPEN_READWRITE|SQLITE_OPEN_CREATE|SQLITE_OPEN_WAL);
  rc = sqlite3OsOpen(pVfs, zWalName, pRet->pWalFd, flags, &flags);
  if( rc==SQLITE_OK && flags&SQLITE_OPEN_READONLY ){
    pRet->readOnly = WAL_RDONLY;
  }

  if( rc!=SQLITE_OK ){
    walIndexClose(pRet, 0);
    sqlite3OsClose(pRet->pWalFd);
    sqlite3_free(pRet);
  }else{
    int iDC = sqlite3OsDeviceCharacteristics(pDbFd);
    if( iDC & SQLITE_IOCAP_SEQUENTIAL ){ pRet->syncHeader = 0; }
    if( iDC & SQLITE_IOCAP_POWERSAFE_OVERWRITE ){
      pRet->padToSectorBoundary = 0;
    }
    *ppWal = pRet;
    WALTRACE(("WAL%d: opened\n", pRet));
  }
  return rc;
}
/*
** Change the size to which the WAL file is truncated on each reset.
*/
void sqlite3WalLimit(Wal *pWal, i64 iLimit){
  if( pWal ) pWal->mxWalSize = iLimit;
}
/*
** Find the smallest page number out of all pages held in the WAL that
** has not been returned by any prior invocation of this method on the
** same WalIterator object. Write into *piFrame the frame index where
** that page was last written into the WAL. Write into *piPage the page
** number.
**
** Return 0 on success. If there are no pages in the WAL with a page
** number larger than *piPage, then return 1.
*/
static int walIteratorNext(
  WalIterator *p,               /* Iterator */
  u32 *piPage,                  /* OUT: The page number of the next page */
  u32 *piFrame                  /* OUT: Wal frame index of next page */
){
  u32 iMin;                     /* Result pgno must be greater than iMin */
  u32 iRet = 0xFFFFFFFF;        /* 0xffffffff is never a valid page number */
  int i;                        /* For looping through segments */

  iMin = p->iPrior;
  assert( iMin<0xffffffff );
  for(i=p->nSegment-1; i>=0; i--){
    struct WalSegment *pSegment = &p->aSegment[i];
    while( pSegment->iNext<pSegment->nEntry ){
      u32 iPg = pSegment->aPgno[pSegment->aIndex[pSegment->iNext]];
      if( iPg>iMin ){
        if( iPg<iRet ){
          iRet = iPg;
          *piFrame = pSegment->iZero + pSegment->aIndex[pSegment->iNext];
        }
        break;
      }
      pSegment->iNext++;
    }
  }

  *piPage = p->iPrior = iRet;
  return (iRet==0xFFFFFFFF);
}
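
/*
** walIteratorNext() performs an on-the-fly merge: each segment holds its
** frames sorted by page number, and every call returns the smallest page
** number greater than the last result, preferring the newest segment when
** a page appears in several segments.  The standalone sketch below
** (compiled out with #if 0; illustration only) merges two sorted arrays
** the same way, scanning segments from newest to oldest.
*/
#if 0
#include <stdio.h>
typedef unsigned int u32;

typedef struct Seg { const u32 *aPgno; int n; int iNext; } Seg;

/* Return 1 when exhausted, else 0 with the next page/segment in *piPage,
** *piSeg.  *piPrior carries the last page returned (0 initially). */
static int next(Seg *aSeg, int nSeg, u32 *piPrior, u32 *piPage, int *piSeg){
  u32 iRet = 0xFFFFFFFF;
  int i;
  for(i=nSeg-1; i>=0; i--){        /* newest segment first */
    Seg *p = &aSeg[i];
    while( p->iNext<p->n ){
      u32 iPg = p->aPgno[p->iNext];
      if( iPg>*piPrior ){
        if( iPg<iRet ){ iRet = iPg; *piSeg = i; }
        break;
      }
      p->iNext++;
    }
  }
  *piPage = *piPrior = iRet;
  return iRet==0xFFFFFFFF;
}

int main(void){
  const u32 a0[] = {1, 3, 5};      /* older frames */
  const u32 a1[] = {3, 4};         /* newer frames: page 3 comes from here */
  Seg aSeg[2] = { {a0, 3, 0}, {a1, 2, 0} };
  u32 iPrior = 0, iPage;
  int iSeg;
  while( next(aSeg, 2, &iPrior, &iPage, &iSeg)==0 ){
    printf("page %u from segment %d\n", iPage, iSeg);
  }
  return 0;
}
#endif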
/*
** This function merges two sorted lists into a single sorted list.
**
** aLeft[] and aRight[] are arrays of indices. The sort key is
** aContent[aLeft[]] and aContent[aRight[]]. Upon entry, the following
** is guaranteed for all J<K:
**
** aContent[aLeft[J]] < aContent[aLeft[K]]
** aContent[aRight[J]] < aContent[aRight[K]]
**
** This routine overwrites aRight[] with a new (probably longer) sequence
** of indices such that the aRight[] contains every index that appears in
** either aLeft[] or the old aRight[] and such that the second condition
** above is still met.
**
** The aContent[aLeft[X]] values will be unique for all X. And the
** aContent[aRight[X]] values will be unique too. But there might be
** one or more combinations of X and Y such that
**
** aLeft[X]!=aRight[Y] && aContent[aLeft[X]] == aContent[aRight[Y]]
**
** When that happens, omit the aLeft[X] and use the aRight[Y] index.
*/
static void walMerge(
  const u32 *aContent,            /* Pages in wal - keys for the sort */
  ht_slot *aLeft,                 /* IN: Left hand input list */
  int nLeft,                      /* IN: Elements in array *paLeft */
  ht_slot **paRight,              /* IN/OUT: Right hand input list */
  int *pnRight,                   /* IN/OUT: Elements in *paRight */
  ht_slot *aTmp                   /* Temporary buffer */
){
  int iLeft = 0;                  /* Current index in aLeft */
  int iRight = 0;                 /* Current index in aRight */
  int iOut = 0;                   /* Current index in output buffer */
  int nRight = *pnRight;
  ht_slot *aRight = *paRight;

  assert( nLeft>0 && nRight>0 );
  while( iRight<nRight || iLeft<nLeft ){
    ht_slot logpage;
    Pgno dbpage;

    if( (iLeft<nLeft)
     && (iRight>=nRight || aContent[aLeft[iLeft]]<aContent[aRight[iRight]])
    ){
      logpage = aLeft[iLeft++];
    }else{
      logpage = aRight[iRight++];
    }
    dbpage = aContent[logpage];

    aTmp[iOut++] = logpage;
    if( iLeft<nLeft && aContent[aLeft[iLeft]]==dbpage ) iLeft++;

    assert( iLeft>=nLeft || aContent[aLeft[iLeft]]>dbpage );
    assert( iRight>=nRight || aContent[aRight[iRight]]>dbpage );
  }

  *paRight = aLeft;
  *pnRight = iOut;
  memcpy(aLeft, aTmp, sizeof(aTmp[0])*iOut);
}
/*
** Sort the elements in list aList using aContent[] as the sort key.
** Remove elements with duplicate keys, preferring to keep the
** larger aList[] values.
**
** The aList[] entries are indices into aContent[]. The values in
** aList[] are to be sorted so that for all J<K:
**
** aContent[aList[J]] < aContent[aList[K]]
**
** For any X and Y such that
**
** aContent[aList[X]] == aContent[aList[Y]]
**
** Keep the larger of the two values aList[X] and aList[Y] and discard
** the smaller.
*/
static void walMergesort(
  const u32 *aContent,            /* Pages in wal */
  ht_slot *aBuffer,               /* Buffer of at least *pnList items to use */
  ht_slot *aList,                 /* IN/OUT: List to sort */
  int *pnList                     /* IN/OUT: Number of elements in aList[] */
){
  struct Sublist {
    int nList;                    /* Number of elements in aList */
    ht_slot *aList;               /* Pointer to sub-list content */
  };

  const int nList = *pnList;      /* Size of input list */
  int nMerge = 0;                 /* Number of elements in list aMerge */
  ht_slot *aMerge = 0;            /* List to be merged */
  int iList;                      /* Index into input list */
  u32 iSub = 0;                   /* Index into aSub array */
  struct Sublist aSub[13];        /* Array of sub-lists */

  memset(aSub, 0, sizeof(aSub));
  assert( nList<=HASHTABLE_NPAGE && nList>0 );
  assert( HASHTABLE_NPAGE==(1<<(ArraySize(aSub)-1)) );

  for(iList=0; iList<nList; iList++){
    nMerge = 1;
    aMerge = &aList[iList];
    for(iSub=0; iList & (1<<iSub); iSub++){
      struct Sublist *p;
      assert( iSub<ArraySize(aSub) );
      p = &aSub[iSub];
      assert( p->aList && p->nList<=(1<<iSub) );
      assert( p->aList==&aList[iList&~((2<<iSub)-1)] );
      walMerge(aContent, p->aList, p->nList, &aMerge, &nMerge, aBuffer);
    }
    aSub[iSub].aList = aMerge;
    aSub[iSub].nList = nMerge;
  }

  for(iSub++; iSub<ArraySize(aSub); iSub++){
    if( nList & (1<<iSub) ){
      struct Sublist *p;
      assert( iSub<ArraySize(aSub) );
      p = &aSub[iSub];
      assert( p->nList<=(1<<iSub) );
      assert( p->aList==&aList[nList&~((2<<iSub)-1)] );
      walMerge(aContent, p->aList, p->nList, &aMerge, &nMerge, aBuffer);
    }
  }
  assert( aMerge==aList );
  *pnList = nMerge;

#ifdef SQLITE_DEBUG
  {
    int i;
    for(i=1; i<*pnList; i++){
      assert( aContent[aList[i]] > aContent[aList[i-1]] );
    }
  }
#endif
}
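
/*
** walMergesort() is an iterative bottom-up merge sort over an array of
** indices, keyed by aContent[], in which duplicate keys collapse to the
** entry with the larger index (i.e. the later WAL frame for that page).
** The standalone sketch below (compiled out with #if 0; illustration
** only) shows the intended post-condition with a deliberately simple
** stable insertion sort plus dedup rather than the sub-list machinery
** above.
*/
#if 0
#include <stdio.h>
typedef unsigned int u32;
typedef unsigned short ht_slot;

/* Sort aList[] by aContent[aList[]]; on ties keep the larger aList value. */
static void sortIndex(const u32 *aContent, ht_slot *aList, int *pnList){
  int i, j, n = *pnList;
  for(i=1; i<n; i++){              /* stable insertion sort on the keys */
    ht_slot x = aList[i];
    for(j=i; j>0 && aContent[aList[j-1]]>aContent[x]; j--) aList[j] = aList[j-1];
    aList[j] = x;
  }
  for(i=j=0; i<n; i++){            /* dedup: the last occurrence wins */
    if( i+1<n && aContent[aList[i]]==aContent[aList[i+1]] ) continue;
    aList[j++] = aList[i];
  }
  *pnList = j;
}

int main(void){
  u32 aContent[] = {30, 10, 20, 10};  /* page numbers, indexed by frame */
  ht_slot aList[] = {0, 1, 2, 3};
  int n = 4, i;
  sortIndex(aContent, aList, &n);
  for(i=0; i<n; i++) printf("frame %d -> page %u\n", aList[i], aContent[aList[i]]);
  return 0;
}
#endif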
/*
** Free an iterator allocated by walIteratorInit().
*/
static void walIteratorFree(WalIterator *p){
  sqlite3_free(p);
}
/*
** Construct a WalIterator object that can be used to loop over all
** pages in the WAL following frame nBackfill in ascending order. Frames
** nBackfill or earlier may be included - excluding them is an optimization
** only. The caller must hold the checkpoint lock.
**
** On success, make *pp point to the newly allocated WalIterator object
** and return SQLITE_OK. Otherwise, return an error code. If this routine
** returns an error, the value of *pp is undefined.
**
** The calling routine should invoke walIteratorFree() to destroy the
** WalIterator object when it has finished with it.
*/
static int walIteratorInit(Wal *pWal, u32 nBackfill, WalIterator **pp){
  WalIterator *p;                 /* Return value */
  int nSegment;                   /* Number of segments to merge */
  u32 iLast;                      /* Last frame in log */
  sqlite3_int64 nByte;            /* Number of bytes to allocate */
  int i;                          /* Iterator variable */
  ht_slot *aTmp;                  /* Temp space used by merge-sort */
  int rc = SQLITE_OK;             /* Return Code */

  /* This routine only runs while holding the checkpoint lock. And
  ** it only runs if there is actually content in the log (mxFrame>0).
  */
  assert( pWal->ckptLock && pWal->hdr.mxFrame>0 );
  iLast = pWal->hdr.mxFrame;

  /* Allocate space for the WalIterator object. */
  nSegment = walFramePage(iLast) + 1;
  nByte = sizeof(WalIterator)
        + (nSegment-1)*sizeof(struct WalSegment)
        + iLast*sizeof(ht_slot);
  p = (WalIterator *)sqlite3_malloc64(nByte);
  if( !p ){
    return SQLITE_NOMEM;
  }
  memset(p, 0, nByte);
  p->nSegment = nSegment;

  /* Allocate temporary space used by the merge-sort routine. This block
  ** of memory will be freed before this function returns.
  */
  aTmp = (ht_slot *)sqlite3_malloc64(
      sizeof(ht_slot) * (iLast>HASHTABLE_NPAGE?HASHTABLE_NPAGE:iLast)
  );
  if( !aTmp ){
    rc = SQLITE_NOMEM;
  }

  for(i=walFramePage(nBackfill+1); rc==SQLITE_OK && i<nSegment; i++){
    WalHashLoc sLoc;

    rc = walHashGet(pWal, i, &sLoc);
    if( rc==SQLITE_OK ){
      int j;                      /* Counter variable */
      int nEntry;                 /* Number of entries in this segment */
      ht_slot *aIndex;            /* Sorted index for this segment */

      if( (i+1)==nSegment ){
        nEntry = (int)(iLast - sLoc.iZero);
      }else{
        nEntry = (int)((u32*)sLoc.aHash - (u32*)sLoc.aPgno);
      }
      aIndex = &((ht_slot *)&p->aSegment[p->nSegment])[sLoc.iZero];
      sLoc.iZero++;

      for(j=0; j<nEntry; j++){
        aIndex[j] = (ht_slot)j;
      }
      walMergesort((u32 *)sLoc.aPgno, aTmp, aIndex, &nEntry);
      p->aSegment[i].iZero = sLoc.iZero;
      p->aSegment[i].nEntry = nEntry;
      p->aSegment[i].aIndex = aIndex;
      p->aSegment[i].aPgno = (u32 *)sLoc.aPgno;
    }
  }
  sqlite3_free(aTmp);

  if( rc!=SQLITE_OK ){
    walIteratorFree(p);
    p = 0;
  }
  *pp = p;
  return rc;
}
#ifdef SQLITE_ENABLE_SETLK_TIMEOUT
/*
** Attempt to enable blocking locks. Blocking locks are enabled only if (a)
** they are supported by the VFS, and (b) the database handle is configured
** with a busy-timeout. Return 1 if blocking locks are successfully enabled,
** or 0 otherwise.
*/
static int walEnableBlocking(Wal *pWal){
  int res = 0;
  if( pWal->db ){
    int tmout = pWal->db->busyTimeout;
    if( tmout ){
      int rc;
      rc = sqlite3OsFileControl(
          pWal->pDbFd, SQLITE_FCNTL_LOCK_TIMEOUT, (void*)&tmout
      );
      res = (rc==SQLITE_OK);
    }
  }
  return res;
}
/*
** Disable blocking locks.
*/
static void walDisableBlocking(Wal *pWal){
  int tmout = 0;
  sqlite3OsFileControl(pWal->pDbFd, SQLITE_FCNTL_LOCK_TIMEOUT, (void*)&tmout);
}
/*
** If parameter bLock is true, attempt to enable blocking locks, take
** the WRITER lock, and then disable blocking locks. If blocking locks
** cannot be enabled, no attempt to obtain the WRITER lock is made. Return
** an SQLite error code if an error occurs, or SQLITE_OK otherwise. It is not
** an error if blocking locks can not be enabled.
**
** If the bLock parameter is false and the WRITER lock is held, release it.
*/
int sqlite3WalWriteLock(Wal *pWal, int bLock){
  int rc = SQLITE_OK;
  assert( pWal->readLock<0 || bLock==0 );
  if( bLock ){
    assert( pWal->db );
    if( walEnableBlocking(pWal) ){
      rc = walLockExclusive(pWal, WAL_WRITE_LOCK, 1);
      if( rc==SQLITE_OK ){
        pWal->writeLock = 1;
      }
      walDisableBlocking(pWal);
    }
  }else if( pWal->writeLock ){
    walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);
    pWal->writeLock = 0;
  }
  return rc;
}
/*
** Set the database handle used to determine if blocking locks are required.
*/
void sqlite3WalDb(Wal *pWal, sqlite3 *db){
  pWal->db = db;
}
/*
** Take an exclusive WRITE lock. Blocking if so configured.
*/
static int walLockWriter(Wal *pWal){
  int rc;
  walEnableBlocking(pWal);
  rc = walLockExclusive(pWal, WAL_WRITE_LOCK, 1);
  walDisableBlocking(pWal);
  return rc;
}
#else
# define walEnableBlocking(x) 0
# define walDisableBlocking(x)
# define walLockWriter(pWal) walLockExclusive((pWal), WAL_WRITE_LOCK, 1)
# define sqlite3WalDb(pWal, db)
#endif
/* ifdef SQLITE_ENABLE_SETLK_TIMEOUT */
/*
** Attempt to obtain the exclusive WAL lock defined by parameters lockIdx and
** n. If the attempt fails and parameter xBusy is not NULL, then it is a
** busy-handler function. Invoke it and retry the lock until either the
** lock is successfully obtained or the busy-handler returns 0.
*/
static int walBusyLock(
  Wal *pWal,                      /* WAL connection */
  int (*xBusy)(void*),            /* Function to call when busy */
  void *pBusyArg,                 /* Context argument for xBusyHandler */
  int lockIdx,                    /* Offset of first byte to lock */
  int n                           /* Number of bytes to lock */
){
  int rc;
  do {
    rc = walLockExclusive(pWal, lockIdx, n);
  }while( xBusy && rc==SQLITE_BUSY && xBusy(pBusyArg) );
#ifdef SQLITE_ENABLE_SETLK_TIMEOUT
  if( rc==SQLITE_BUSY_TIMEOUT ){
    walDisableBlocking(pWal);
    rc = SQLITE_BUSY;
  }
#endif
  return rc;
}
/*
** The cache of the wal-index header must be valid to call this function.
** Return the page-size in bytes used by the database.
*/
static int walPagesize(Wal *pWal){
  return (pWal->hdr.szPage&0xfe00) + ((pWal->hdr.szPage&0x0001)<<16);
}
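
/*
** The page size is stored in a u16 by keeping the top byte and moving
** bit 16 (needed only for the 65536-byte page size) into bit 0; the
** expression above inverts that encoding.  The standalone sketch below
** (compiled out with #if 0; illustration only) round-trips the two
** interesting cases.
*/
#if 0
#include <stdio.h>
typedef unsigned int u32;
typedef unsigned short u16;

static u16 encode(u32 szPage){ return (u16)((szPage&0xff00) | (szPage>>16)); }
static u32 decode(u16 enc){ return (enc&0xfe00) + ((enc&0x0001)<<16); }

int main(void){
  printf("%u -> 0x%04x -> %u\n", 4096u, encode(4096), decode(encode(4096)));
  printf("%u -> 0x%04x -> %u\n", 65536u, encode(65536), decode(encode(65536)));
  return 0;
}
#endif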
/*
** The following is guaranteed when this function is called:
**
** a) the WRITER lock is held,
** b) the entire log file has been checkpointed, and
** c) any existing readers are reading exclusively from the database
** file - there are no readers that may attempt to read a frame from
** the log file.
**
** This function updates the shared-memory structures so that the next
** client to write to the database (which may be this one) does so by
** writing frames into the start of the log file.
**
** The value of parameter salt1 is used as the aSalt[1] value in the
** new wal-index header. It should be passed a pseudo-random value (i.e.
** one obtained from sqlite3_randomness()).
*/
static void walRestartHdr(Wal *pWal, u32 salt1){
  volatile WalCkptInfo *pInfo = walCkptInfo(pWal);
  int i;                          /* Loop counter */
  u32 *aSalt = pWal->hdr.aSalt;   /* Big-endian salt values */
  pWal->nCkpt++;
  pWal->hdr.mxFrame = 0;
  sqlite3Put4byte((u8*)&aSalt[0], 1 + sqlite3Get4byte((u8*)&aSalt[0]));
  memcpy(&pWal->hdr.aSalt[1], &salt1, 4);
  walIndexWriteHdr(pWal);
  AtomicStore(&pInfo->nBackfill, 0);
  pInfo->nBackfillAttempted = 0;
  pInfo->aReadMark[1] = 0;
  for(i=2; i<WAL_NREADER; i++) pInfo->aReadMark[i] = READMARK_NOT_USED;
  assert( pInfo->aReadMark[0]==0 );
}
/*
** Copy as much content as we can from the WAL back into the database file
** in response to an sqlite3_wal_checkpoint() request or the equivalent.
**
** The amount of information copied from WAL to database might be limited
** by active readers.  This routine will never overwrite a database page
** that a concurrent reader might be using.
**
** All I/O barrier operations (a.k.a fsyncs) occur in this routine when
** SQLite is in WAL-mode in synchronous=NORMAL.  That means that if
** checkpoints are always run by a background thread or background
** process, foreground threads will never block on a lengthy fsync call.
**
** Fsync is called on the WAL before writing content out of the WAL and
** into the database.  This ensures that the new content is persistent
** in the WAL and can be recovered following a power-loss or hard reset.
**
** Fsync is also called on the database file if (and only if) the entire
** WAL content is copied into the database file.  This second fsync makes
** it safe to delete the WAL since the new content will persist in the
** database file.
**
** This routine uses and updates the nBackfill field of the wal-index header.
** This is the only routine that will increase the value of nBackfill.
** (A WAL reset or recovery will revert nBackfill to zero, but not increase
** its value.)
**
** The caller must be holding sufficient locks to ensure that no other
** checkpoint is running (in any other thread or process) at the same
** time.
*/
static int walCheckpoint(
  Wal *pWal,                      /* Wal connection */
  sqlite3 *db,                    /* Check for interrupts on this handle */
  int eMode,                      /* One of PASSIVE, FULL or RESTART */
  int (*xBusy)(void*),            /* Function to call when busy */
  void *pBusyArg,                 /* Context argument for xBusyHandler */
  int sync_flags,                 /* Flags for OsSync() (or 0) */
  u8 *zBuf                        /* Temporary buffer to use */
){
  int rc = SQLITE_OK;             /* Return code */
  int szPage;                     /* Database page-size */
  WalIterator *pIter = 0;         /* Wal iterator context */
  u32 iDbpage = 0;                /* Next database page to write */
  u32 iFrame = 0;                 /* Wal frame containing data for iDbpage */
  u32 mxSafeFrame;                /* Max frame that can be backfilled */
  u32 mxPage;                     /* Max database page to write */
  int i;                          /* Loop counter */
  volatile WalCkptInfo *pInfo;    /* The checkpoint status information */

  szPage = walPagesize(pWal);
  testcase( szPage<=32768 );
  testcase( szPage>=65536 );
  pInfo = walCkptInfo(pWal);
  if( pInfo->nBackfill<pWal->hdr.mxFrame ){

    /* EVIDENCE-OF: R-62920-47450 The busy-handler callback is never invoked
    ** in the SQLITE_CHECKPOINT_PASSIVE mode. */
    assert( eMode!=SQLITE_CHECKPOINT_PASSIVE || xBusy==0 );

    /* Compute in mxSafeFrame the index of the last frame of the WAL that is
    ** safe to write into the database.  Frames beyond mxSafeFrame might
    ** overwrite database pages that are in use by active readers and thus
    ** cannot be backfilled from the WAL.
    */
    mxSafeFrame = pWal->hdr.mxFrame;
    mxPage = pWal->hdr.nPage;
    for(i=1; i<WAL_NREADER; i++){
      u32 y = AtomicLoad(pInfo->aReadMark+i);
      if( mxSafeFrame>y ){
        assert( y<=pWal->hdr.mxFrame );
        rc = walBusyLock(pWal, xBusy, pBusyArg, WAL_READ_LOCK(i), 1);
        if( rc==SQLITE_OK ){
          u32 iMark = (i==1 ? mxSafeFrame : READMARK_NOT_USED);
          AtomicStore(pInfo->aReadMark+i, iMark);
          walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1);
        }else if( rc==SQLITE_BUSY ){
          mxSafeFrame = y;
          xBusy = 0;
        }else{
          goto walcheckpoint_out;
        }
      }
    }

    /* Allocate the iterator */
    if( pInfo->nBackfill<mxSafeFrame ){
      rc = walIteratorInit(pWal, pInfo->nBackfill, &pIter);
      assert( rc==SQLITE_OK || pIter==0 );
    }

    if( pIter
     && (rc = walBusyLock(pWal,xBusy,pBusyArg,WAL_READ_LOCK(0),1))==SQLITE_OK
    ){
      u32 nBackfill = pInfo->nBackfill;

      pInfo->nBackfillAttempted = mxSafeFrame;

      /* Sync the WAL to disk */
      rc = sqlite3OsSync(pWal->pWalFd, CKPT_SYNC_FLAGS(sync_flags));

      /* If the database may grow as a result of this checkpoint, hint
      ** about the eventual size of the db file to the VFS layer.
      */
      if( rc==SQLITE_OK ){
        i64 nReq = ((i64)mxPage * szPage);
        i64 nSize;                    /* Current size of database file */
        sqlite3OsFileControl(pWal->pDbFd, SQLITE_FCNTL_CKPT_START, 0);
        rc = sqlite3OsFileSize(pWal->pDbFd, &nSize);
        if( rc==SQLITE_OK && nSize<nReq ){
          if( (nSize+65536+(i64)pWal->hdr.mxFrame*szPage)<nReq ){
            /* If the size of the final database is larger than the current
            ** database plus the amount of data in the wal file, plus the
            ** maximum size of the pending-byte page (65536 bytes), then
            ** there must be corruption somewhere. */
            rc = SQLITE_CORRUPT_BKPT;
          }else{
            sqlite3OsFileControlHint(pWal->pDbFd, SQLITE_FCNTL_SIZE_HINT,&nReq);
          }
        }
      }

      /* Iterate through the contents of the WAL, copying data to the db file */
      while( rc==SQLITE_OK && 0==walIteratorNext(pIter, &iDbpage, &iFrame) ){
        i64 iOffset;
        assert( walFramePgno(pWal, iFrame)==iDbpage );
        if( AtomicLoad(&db->u1.isInterrupted) ){
          rc = db->mallocFailed ? SQLITE_NOMEM : SQLITE_INTERRUPT;
          break;
        }
        if( iFrame<=nBackfill || iFrame>mxSafeFrame || iDbpage>mxPage ){
          continue;
        }
        iOffset = walFrameOffset(iFrame, szPage) + WAL_FRAME_HDRSIZE;
        /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL file */
        rc = sqlite3OsRead(pWal->pWalFd, zBuf, szPage, iOffset);
        if( rc!=SQLITE_OK ) break;
        iOffset = (iDbpage-1)*(i64)szPage;
        testcase( IS_BIG_INT(iOffset) );
        rc = sqlite3OsWrite(pWal->pDbFd, zBuf, szPage, iOffset);
        if( rc!=SQLITE_OK ) break;
      }
      sqlite3OsFileControl(pWal->pDbFd, SQLITE_FCNTL_CKPT_DONE, 0);

      /* If work was actually accomplished... */
      if( rc==SQLITE_OK ){
        if( mxSafeFrame==walIndexHdr(pWal)->mxFrame ){
          i64 szDb = pWal->hdr.nPage*(i64)szPage;
          testcase( IS_BIG_INT(szDb) );
          rc = sqlite3OsTruncate(pWal->pDbFd, szDb);
          if( rc==SQLITE_OK ){
            rc = sqlite3OsSync(pWal->pDbFd, CKPT_SYNC_FLAGS(sync_flags));
          }
        }
        if( rc==SQLITE_OK ){
          AtomicStore(&pInfo->nBackfill, mxSafeFrame);
        }
      }

      /* Release the reader lock held while backfilling */
      walUnlockExclusive(pWal, WAL_READ_LOCK(0), 1);
    }

    if( rc==SQLITE_BUSY ){
      /* Reset the return code so as not to report a checkpoint failure
      ** just because there are active readers. */
      rc = SQLITE_OK;
    }
  }

  /* If this is an SQLITE_CHECKPOINT_RESTART or TRUNCATE operation, and the
  ** entire wal file has been copied into the database file, then block
  ** until all readers have finished using the wal file. This ensures that
  ** the next process to write to the database restarts the wal file.
  */
  if( rc==SQLITE_OK && eMode!=SQLITE_CHECKPOINT_PASSIVE ){
    assert( pWal->writeLock );
    if( pInfo->nBackfill<pWal->hdr.mxFrame ){
      rc = SQLITE_BUSY;
    }else if( eMode>=SQLITE_CHECKPOINT_RESTART ){
      u32 salt1;
      sqlite3_randomness(4, &salt1);
      assert( pInfo->nBackfill==pWal->hdr.mxFrame );
      rc = walBusyLock(pWal, xBusy, pBusyArg, WAL_READ_LOCK(1), WAL_NREADER-1);
      if( rc==SQLITE_OK ){
        if( eMode==SQLITE_CHECKPOINT_TRUNCATE ){
          /* IMPLEMENTATION-OF: R-44699-57140 This mode works the same way as
          ** SQLITE_CHECKPOINT_RESTART with the addition that it also
          ** truncates the log file to zero bytes just prior to a
          ** successful return.
          **
          ** In theory, it might be safe to do this without updating the
          ** wal-index header in shared memory, as all subsequent reader or
          ** writer clients should see that the entire log file has been
          ** checkpointed and behave accordingly.  This seems unsafe though,
          ** as it would leave the system in a state where the contents of
          ** the wal-index header do not match the contents of the
          ** file-system.  To avoid this, update the wal-index header to
          ** indicate that the log file contains zero valid frames. */
          walRestartHdr(pWal, salt1);
          rc = sqlite3OsTruncate(pWal->pWalFd, 0);
        }
        walUnlockExclusive(pWal, WAL_READ_LOCK(1), WAL_NREADER-1);
      }
    }
  }

 walcheckpoint_out:
  walIteratorFree(pIter);
  return rc;
}
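
/*
** mxSafeFrame above is the largest frame not still needed by any active
** reader: each read-mark pins the snapshot a reader is using, and the
** checkpoint may only backfill frames at or below the smallest pinned
** mark it cannot advance.  The standalone sketch below (compiled out
** with #if 0; illustration only) computes that bound for a fixed set of
** read-marks, simplifying by treating every used mark as held by a
** reader that cannot be moved.
*/
#if 0
#include <stdio.h>
typedef unsigned int u32;
#define NREADER   5
#define NOT_USED  0xffffffffu   /* cf. READMARK_NOT_USED */

int main(void){
  u32 aReadMark[NREADER] = {0, 40, NOT_USED, 25, NOT_USED};
  u32 mxFrame = 100;        /* last frame in the WAL */
  u32 mxSafe = mxFrame;
  int i;
  for(i=1; i<NREADER; i++){
    u32 y = aReadMark[i];
    /* A busy (unmovable) reader at mark y limits the checkpoint to y. */
    if( y!=NOT_USED && mxSafe>y ) mxSafe = y;
  }
  printf("mxSafeFrame = %u\n", mxSafe);   /* 25 */
  return 0;
}
#endif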
/*
** If the WAL file is currently larger than nMax bytes in size, truncate
** it to exactly nMax bytes. If an error occurs while doing so, ignore it.
*/
static void walLimitSize(Wal *pWal, i64 nMax){
  i64 sz;
  int rx;
  sqlite3BeginBenignMalloc();
  rx = sqlite3OsFileSize(pWal->pWalFd, &sz);
  if( rx==SQLITE_OK && (sz > nMax) ){
    rx = sqlite3OsTruncate(pWal->pWalFd, nMax);
  }
  sqlite3EndBenignMalloc();
  if( rx ){
    sqlite3_log(rx, "cannot limit WAL size: %s", pWal->zWalName);
  }
}
/*
** Close a connection to a log file.
*/
int sqlite3WalClose(
  Wal *pWal,                      /* Wal to close */
  sqlite3 *db,                    /* For interrupt flag */
  int sync_flags,                 /* Flags to pass to OsSync() (or 0) */
  int nBuf,
  u8 *zBuf                        /* Buffer of at least nBuf bytes */
){
  int rc = SQLITE_OK;
  if( pWal ){
    int isDelete = 0;             /* True to unlink wal and wal-index files */

    /* If an EXCLUSIVE lock can be obtained on the database file (using the
    ** ordinary, rollback-mode locking methods), this guarantees that the
    ** connection associated with this log file is the only connection to
    ** the database. In this case checkpoint the database and unlink both
    ** the wal and wal-index files.
    **
    ** The EXCLUSIVE lock is not released before returning.
    */
    if( zBuf!=0
     && SQLITE_OK==(rc = sqlite3OsLock(pWal->pDbFd, SQLITE_LOCK_EXCLUSIVE))
    ){
      if( pWal->exclusiveMode==WAL_NORMAL_MODE ){
        pWal->exclusiveMode = WAL_EXCLUSIVE_MODE;
      }
      rc = sqlite3WalCheckpoint(pWal, db,
          SQLITE_CHECKPOINT_PASSIVE, 0, 0, sync_flags, nBuf, zBuf, 0, 0
      );
      if( rc==SQLITE_OK ){
        int bPersist = -1;
        sqlite3OsFileControlHint(
            pWal->pDbFd, SQLITE_FCNTL_PERSIST_WAL, &bPersist
        );
        if( bPersist!=1 ){
          /* Try to delete the WAL file if the checkpoint completed and
          ** fsynced (rc==SQLITE_OK) and if we are not in persistent-wal
          ** mode (!bPersist) */
          isDelete = 1;
        }else if( pWal->mxWalSize>=0 ){
          /* Try to truncate the WAL file to zero bytes if the checkpoint
          ** completed and fsynced (rc==SQLITE_OK) and we are in persistent
          ** WAL mode (bPersist) and if the PRAGMA journal_size_limit is a
          ** non-negative value (pWal->mxWalSize>=0).  Note that we truncate
          ** to zero bytes as truncating to the journal_size_limit might
          ** leave a corrupt WAL file on disk. */
          walLimitSize(pWal, 0);
        }
      }
    }

    walIndexClose(pWal, isDelete);
    sqlite3OsClose(pWal->pWalFd);
    if( isDelete ){
      sqlite3BeginBenignMalloc();
      sqlite3OsDelete(pWal->pVfs, pWal->zWalName, 0);
      sqlite3EndBenignMalloc();
    }
    WALTRACE(("WAL%p: closed\n", pWal));
    sqlite3_free((void *)pWal->apWiData);
    sqlite3_free(pWal);
  }
  return rc;
}
/*
** Try to read the wal-index header. Return 0 on success and 1 if
** there is a problem.
**
** The wal-index is in shared memory. Another thread or process might
** be writing the header at the same time this procedure is trying to
** read it, which might result in inconsistency. A dirty read is detected
** by verifying that both copies of the header are the same and also by
** a checksum on the header.
**
** If and only if the read is consistent and the header is different from
** pWal->hdr, then pWal->hdr is updated to the content of the new header
** and *pChanged is set to 1.
**
** If the checksum cannot be verified return non-zero. If the header
** is read successfully and the checksum verified, return zero.
*/
static SQLITE_NO_TSAN int walIndexTryHdr(Wal *pWal, int *pChanged){
  u32 aCksum[2];                  /* Checksum on the header content */
  WalIndexHdr h1, h2;             /* Two copies of the header content */
  WalIndexHdr volatile *aHdr;     /* Header in shared memory */

  /* The first page of the wal-index must be mapped at this point. */
  assert( pWal->nWiData>0 && pWal->apWiData[0] );

  /* Read the header. This might happen concurrently with a write to the
  ** same area of shared memory on a different CPU in a SMP,
  ** meaning it is possible that an inconsistent snapshot is read
  ** from the file. If this happens, return non-zero.
  **
  ** tag-20200519-1:
  ** There are two copies of the header at the beginning of the wal-index.
  ** When reading, read [0] first then [1].  Writes are in the reverse order.
  ** Memory barriers are used to prevent the compiler or the hardware from
  ** reordering the reads and writes.  TSAN and similar tools can sometimes
  ** give false-positive warnings about these accesses because the tools do
  ** not account for the double-read and the memory barrier.  The use of
  ** mutexes here would be problematic as the memory being accessed is
  ** potentially shared among multiple processes and not all mutex
  ** implementations work reliably in that environment.
  */
  aHdr = walIndexHdr(pWal);
  memcpy(&h1, (void *)&aHdr[0], sizeof(h1)); /* Possible TSAN false-positive */
  walShmBarrier(pWal);
  memcpy(&h2, (void *)&aHdr[1], sizeof(h2));

  if( memcmp(&h1, &h2, sizeof(h1))!=0 ){
    return 1;   /* Dirty read */
  }
  if( h1.isInit==0 ){
    return 1;   /* Malformed header - probably all zeros */
  }
  walChecksumBytes(1, (u8*)&h1, sizeof(h1)-sizeof(h1.aCksum), 0, aCksum);
  if( aCksum[0]!=h1.aCksum[0] || aCksum[1]!=h1.aCksum[1] ){
    return 1;   /* Checksum does not match */
  }

  if( memcmp(&pWal->hdr, &h1, sizeof(WalIndexHdr)) ){
    *pChanged = 1;
    memcpy(&pWal->hdr, &h1, sizeof(WalIndexHdr));
    pWal->szPage = (pWal->hdr.szPage&0xfe00) + ((pWal->hdr.szPage&0x0001)<<16);
    testcase( pWal->szPage<=32768 );
    testcase( pWal->szPage>=65536 );
  }

  /* The header was successfully read.  Return zero. */
  return 0;
}
/*
** This is the value that walTryBeginRead returns when it needs to
** be retried.
*/
#define WAL_RETRY  (-1)

/*
** Read the wal-index header from the wal-index and into pWal->hdr.
** If the wal-header appears to be corrupt, try to reconstruct the
** wal-index from the WAL before returning.
**
** Set *pChanged to 1 if the wal-index header value in pWal->hdr is
** changed by this operation.  If pWal->hdr is unchanged, set *pChanged
** to 0.
**
** If the wal-index header is successfully read, return SQLITE_OK.
** Otherwise an SQLite error code.
*/
static int walIndexReadHdr(Wal *pWal, int *pChanged){
  int rc;                         /* Return code */
  int badHdr;                     /* True if a header read failed */
  volatile u32 *page0;            /* Chunk of wal-index containing header */

  /* Ensure that page 0 of the wal-index (the page that contains the
  ** wal-index header) is mapped. Return early if an error occurs here.
  */
  assert( pChanged );
  rc = walIndexPage(pWal, 0, &page0);
  if( rc!=SQLITE_OK ){
    assert( rc!=SQLITE_READONLY ); /* READONLY changed to OK in walIndexPage */
    if( rc==SQLITE_READONLY_CANTINIT ){
      /* The SQLITE_READONLY_CANTINIT return means that the shared-memory
      ** was openable but is not writable, and this thread is unable to
      ** confirm that another write-capable connection has the shared-memory
      ** open, and hence the content of the shared-memory is unreliable,
      ** since the shared-memory might be inconsistent with the WAL file
      ** and there is no writer on hand to fix it. */
      assert( page0==0 );
      assert( pWal->writeLock==0 );
      assert( pWal->readOnly & WAL_SHM_RDONLY );
      pWal->bShmUnreliable = 1;
      pWal->exclusiveMode = WAL_HEAPMEMORY_MODE;
      *pChanged = 1;
    }else{
      return rc; /* Any other non-OK return is just an error */
    }
  }else{
    /* page0 can be NULL if the SHM is zero bytes in size and pWal->writeLock
    ** is zero, which prevents the SHM from growing */
    testcase( page0!=0 );
  }
  assert( page0!=0 || pWal->writeLock==0 );

  /* If the first page of the wal-index has been mapped, try to read the
  ** wal-index header immediately, without holding any lock.  This usually
  ** works, but may fail if the wal-index header is corrupt or currently
  ** being modified by another thread or process.
  */
  badHdr = (page0 ? walIndexTryHdr(pWal, pChanged) : 1);

  /* If the first attempt failed, it might have been due to a race
  ** with a writer.  So get a WRITE lock and try again.
  */
  if( badHdr ){
    if( pWal->bShmUnreliable==0 && (pWal->readOnly & WAL_SHM_RDONLY) ){
      if( SQLITE_OK==(rc = walLockShared(pWal, WAL_WRITE_LOCK)) ){
        walUnlockShared(pWal, WAL_WRITE_LOCK);
        rc = SQLITE_READONLY_RECOVERY;
      }
    }else{
      int bWriteLock = pWal->writeLock;
      if( bWriteLock || SQLITE_OK==(rc = walLockWriter(pWal)) ){
        pWal->writeLock = 1;
        if( SQLITE_OK==(rc = walIndexPage(pWal, 0, &page0)) ){
          badHdr = walIndexTryHdr(pWal, pChanged);
          if( badHdr ){
            /* If the wal-index header is still malformed even while holding
            ** a WRITE lock, it can only mean that the header is corrupted and
            ** needs to be reconstructed.  So run recovery to do exactly that.
            */
            rc = walIndexRecover(pWal);
            *pChanged = 1;
          }
        }
        if( bWriteLock==0 ){
          pWal->writeLock = 0;
          walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);
        }
      }
    }
  }

  /* If the header is read successfully, check the version number to make
  ** sure the wal-index was not constructed with some future format that
  ** this version of SQLite cannot understand.
  */
  if( badHdr==0 && pWal->hdr.iVersion!=WALINDEX_MAX_VERSION ){
    rc = SQLITE_CANTOPEN_BKPT;
  }
  if( pWal->bShmUnreliable ){
    if( rc!=SQLITE_OK ){
      walIndexClose(pWal, 0);
      pWal->bShmUnreliable = 0;
      assert( pWal->nWiData>0 && pWal->apWiData[0]==0 );
      /* walIndexRecover() might have returned SHORT_READ if a concurrent
      ** writer truncated the WAL out from under it.  If that happens, it
      ** indicates that a writer has fixed the SHM file for us, so retry */
      if( rc==SQLITE_IOERR_SHORT_READ ) rc = WAL_RETRY;
    }
    pWal->exclusiveMode = WAL_NORMAL_MODE;
  }

  return rc;
}
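
/* In summary, walIndexReadHdr() makes up to three attempts to obtain a
** consistent header: (1) a lock-free double-read via walIndexTryHdr(),
** (2) the same read repeated while holding the WRITE lock so that no
** writer can be mid-update, and (3) if the header is malformed even under
** the lock, a full reconstruction of the wal-index via walIndexRecover().
*/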
/*
** Open a transaction in a connection where the shared-memory is read-only
** and where we cannot verify that there is a separate write-capable connection
** on hand to keep the shared-memory up-to-date with the WAL file.
**
** This can happen, for example, when the shared-memory is implemented by
** memory-mapping a *-shm file, where a prior writer has shut down and
** left the *-shm file on disk, and now the present connection is trying
** to use that database but lacks write permission on the *-shm file.
** Other scenarios are also possible, depending on the VFS implementation.
**
** Precondition:
**
**    The *-wal file has been read and an appropriate wal-index has been
**    constructed in pWal->apWiData[] using heap memory instead of shared
**    memory.
**
** If this function returns SQLITE_OK, then the read transaction has
** been successfully opened.  In this case output variable (*pChanged)
** is set to true before returning if the caller should discard the
** contents of the page cache before proceeding.  Or, if it returns
** WAL_RETRY, then the heap memory wal-index has been discarded and
** the caller should retry opening the read transaction from the
** beginning (including attempting to map the *-shm file).
**
** If an error occurs, an SQLite error code is returned.
*/
static int walBeginShmUnreliable(Wal *pWal, int *pChanged){
  i64 szWal;                      /* Size of wal file on disk in bytes */
  i64 iOffset;                    /* Current offset when reading wal file */
  u8 aBuf[WAL_HDRSIZE];           /* Buffer to load WAL header into */
  u8 *aFrame = 0;                 /* Malloc'd buffer to load entire frame */
  int szFrame;                    /* Number of bytes in buffer aFrame[] */
  u8 *aData;                      /* Pointer to data part of aFrame buffer */
  volatile void *pDummy;          /* Dummy argument for xShmMap */
  int rc;                         /* Return code */
  u32 aSaveCksum[2];              /* Saved copy of pWal->hdr.aFrameCksum */

  assert( pWal->bShmUnreliable );
  assert( pWal->readOnly & WAL_SHM_RDONLY );
  assert( pWal->nWiData>0 && pWal->apWiData[0] );

  /* Take WAL_READ_LOCK(0). This has the effect of preventing any
  ** writers from running a checkpoint, but does not stop them
  ** from running recovery.  */
  rc = walLockShared(pWal, WAL_READ_LOCK(0));
  if( rc!=SQLITE_OK ){
    if( rc==SQLITE_BUSY ) rc = WAL_RETRY;
    goto begin_unreliable_shm_out;
  }
  pWal->readLock = 0;

  /* Check to see if a separate writer has attached to the shared-memory area,
  ** thus making the shared-memory "reliable" again.  Do this by invoking
  ** the xShmMap() routine of the VFS and looking to see if the return
  ** is SQLITE_READONLY instead of SQLITE_READONLY_CANTINIT.
  **
  ** If the shared-memory is now "reliable" return WAL_RETRY, which will
  ** cause the heap-memory WAL-index to be discarded and the actual
  ** shared memory to be used in its place.
  **
  ** This step is important because, even though this connection is holding
  ** the WAL_READ_LOCK(0) which prevents a checkpoint, a writer might
  ** have already checkpointed the WAL file and, while the current read
  ** transaction is active, wrap the WAL and start overwriting frames that
  ** this process wants to use.
  **
  ** Once sqlite3OsShmMap() has been called for an sqlite3_file and has
  ** returned any SQLITE_READONLY value, it must return only SQLITE_READONLY
  ** or SQLITE_READONLY_CANTINIT or some error for all subsequent invocations,
  ** even if some external agent does a "chmod" to make the shared-memory
  ** writable by us, until sqlite3OsShmUnmap() has been called.
  ** This is a requirement on the VFS implementation.
  */
  rc = sqlite3OsShmMap(pWal->pDbFd, 0, WALINDEX_PGSZ, 0, &pDummy);
  assert( rc!=SQLITE_OK ); /* SQLITE_OK not possible for read-only connection */
  if( rc!=SQLITE_READONLY_CANTINIT ){
    rc = (rc==SQLITE_READONLY ? WAL_RETRY : rc);
    goto begin_unreliable_shm_out;
  }

  /* We reach this point only if the real shared-memory is still unreliable.
  ** Assume the in-memory WAL-index substitute is correct and load it
  ** into pWal->hdr.
  */
  memcpy(&pWal->hdr, (void*)walIndexHdr(pWal), sizeof(WalIndexHdr));

  /* Make sure some writer hasn't come in and changed the WAL file out
  ** from under us, then disconnected, while we were not looking.
  */
  rc = sqlite3OsFileSize(pWal->pWalFd, &szWal);
  if( rc!=SQLITE_OK ){
    goto begin_unreliable_shm_out;
  }
  if( szWal<WAL_HDRSIZE ){
    /* If the wal file is too small to contain a wal-header and the
    ** wal-index header has mxFrame==0, then it must be safe to proceed
    ** reading the database file only.  However, the page cache cannot
    ** be trusted, as a read/write connection may have connected, written
    ** the db, run a checkpoint, truncated the wal file and disconnected
    ** since this client's last read transaction. */
    *pChanged = 1;
    rc = (pWal->hdr.mxFrame==0 ? SQLITE_OK : WAL_RETRY);
    goto begin_unreliable_shm_out;
  }

  /* Check the salt keys at the start of the wal file still match. */
  rc = sqlite3OsRead(pWal->pWalFd, aBuf, WAL_HDRSIZE, 0);
  if( rc!=SQLITE_OK ){
    goto begin_unreliable_shm_out;
  }
  if( memcmp(&pWal->hdr.aSalt, &aBuf[16], 8) ){
    /* Some writer has wrapped the WAL file while we were not looking.
    ** Return WAL_RETRY which will cause the in-memory WAL-index to be
    ** rebuilt. */
    rc = WAL_RETRY;
    goto begin_unreliable_shm_out;
  }

  /* Allocate a buffer to read frames into */
  szFrame = pWal->hdr.szPage + WAL_FRAME_HDRSIZE;
  aFrame = (u8 *)sqlite3_malloc64(szFrame);
  if( aFrame==0 ){
    rc = SQLITE_NOMEM;
    goto begin_unreliable_shm_out;
  }
  aData = &aFrame[WAL_FRAME_HDRSIZE];

  /* Check to see if a complete transaction has been appended to the
  ** wal file since the heap-memory wal-index was created. If so, the
  ** heap-memory wal-index is discarded and WAL_RETRY returned to
  ** the caller.  */
  aSaveCksum[0] = pWal->hdr.aFrameCksum[0];
  aSaveCksum[1] = pWal->hdr.aFrameCksum[1];
  for(iOffset=walFrameOffset(pWal->hdr.mxFrame+1, pWal->hdr.szPage);
      iOffset+szFrame<=szWal;
      iOffset+=szFrame
  ){
    u32 pgno;                   /* Database page number for frame */
    u32 nTruncate;              /* dbsize field from frame header */

    /* Read and decode the next log frame. */
    rc = sqlite3OsRead(pWal->pWalFd, aFrame, szFrame, iOffset);
    if( rc!=SQLITE_OK ) break;
    if( !walDecodeFrame(pWal, &pgno, &nTruncate, aData, aFrame) ) break;

    /* If nTruncate is non-zero, then a complete transaction has been
    ** appended to this wal file.  Set rc to WAL_RETRY and break out of
    ** the loop.  */
    if( nTruncate ){
      rc = WAL_RETRY;
      break;
    }
  }
  pWal->hdr.aFrameCksum[0] = aSaveCksum[0];
  pWal->hdr.aFrameCksum[1] = aSaveCksum[1];

 begin_unreliable_shm_out:
  sqlite3_free(aFrame);
  if( rc!=SQLITE_OK ){
    int i;
    for(i=0; i<pWal->nWiData; i++){
      sqlite3_free((void*)pWal->apWiData[i]);
      pWal->apWiData[i] = 0;
    }
    pWal->bShmUnreliable = 0;
    sqlite3WalEndReadTransaction(pWal);
    *pChanged = 1;
  }
  return rc;
}
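
/* The salt comparison against &aBuf[16] in walBeginShmUnreliable() above
** relies on the WAL file format: bytes 16..23 of the 32-byte wal header
** hold the two salt values, and the salts change every time the WAL is
** reset. A mismatch therefore proves that a writer wrapped the WAL after
** the heap-memory wal-index was built, so that index must be rebuilt.
*/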
/*
** Attempt to start a read transaction.  This might fail due to a race or
** other transient condition.  When that happens, it returns WAL_RETRY to
** indicate to the caller that it is safe to retry immediately.
**
** On success return SQLITE_OK.  On a permanent failure (such as an
** I/O error or an SQLITE_BUSY because another process is running
** recovery) return a positive error code.
**
** The useWal parameter is true to force the use of the WAL and disable
** the case where the WAL is bypassed because it has been completely
** checkpointed.  If useWal==0 then this routine calls walIndexReadHdr()
** to make a copy of the wal-index header into pWal->hdr.  If the
** wal-index header has changed, *pChanged is set to 1 (as an indication
** to the caller that the local page cache is obsolete and needs to be
** flushed.)  When useWal==1, the wal-index header is assumed to already
** be loaded and the pChanged parameter is unused.
**
** The caller must set the cnt parameter to the number of prior calls to
** this routine during the current read attempt that returned WAL_RETRY.
** This routine will start taking more aggressive measures to clear the
** race conditions after multiple WAL_RETRY returns, and after an excessive
** number of errors will ultimately return SQLITE_PROTOCOL.  The
** SQLITE_PROTOCOL return indicates that some other process has gone rogue
** and is not honoring the locking protocol.  There is a vanishingly small
** chance that SQLITE_PROTOCOL could be returned because of a run of really
** bad luck when there is lots of contention for the wal-index, but that
** possibility is so small that it can be safely neglected, we believe.
**
** On success, this routine obtains a read lock on
** WAL_READ_LOCK(pWal->readLock).  The pWal->readLock integer is
** in the range 0 <= pWal->readLock < WAL_NREADER.  If pWal->readLock==(-1)
** that means the Wal does not hold any read lock.  The reader must not
** access any database page that is modified by a WAL frame up to and
** including frame number aReadMark[pWal->readLock].  The reader will
** use WAL frames up to and including pWal->hdr.mxFrame if pWal->readLock>0.
** Or if pWal->readLock==0, then the reader will ignore the WAL
** completely and get all content directly from the database file.
** If the useWal parameter is 1 then the WAL will never be ignored and
** this routine will always set pWal->readLock>0 on success.
** When the read transaction is completed, the caller must release the
** lock on WAL_READ_LOCK(pWal->readLock) and set pWal->readLock to -1.
**
** This routine uses the nBackfill and aReadMark[] fields of the header
** to select a particular WAL_READ_LOCK() that strives to let the
** checkpoint process do as much work as possible.  This routine might
** update values of the aReadMark[] array in the header, but if it does
** so it takes care to hold an exclusive lock on the corresponding
** WAL_READ_LOCK() while changing values.
*/
static int walTryBeginRead(Wal *pWal, int *pChanged, int useWal, int cnt){
  volatile WalCkptInfo *pInfo;    /* Checkpoint information in wal-index */
  u32 mxReadMark;                 /* Largest aReadMark[] value */
  int mxI;                        /* Index of largest aReadMark[] value */
  int i;                          /* Loop counter */
  int rc = SQLITE_OK;             /* Return code */
  u32 mxFrame;                    /* Wal frame to lock to */

  assert( pWal->readLock<0 );     /* Not currently locked */

  /* useWal may only be set for read/write connections */
  assert( (pWal->readOnly & WAL_SHM_RDONLY)==0 || useWal==0 );

  /* Take steps to avoid spinning forever if there is a protocol error.
  **
  ** Circumstances that cause a RETRY should only last for the briefest
  ** instances of time.  No I/O or other system calls are done while the
  ** locks are held, so the locks should not be held for very long.  But
  ** if we are unlucky, another process that is holding a lock might get
  ** paged out or take a page-fault that is time-consuming to resolve,
  ** during the few nanoseconds that it is holding the lock.  In that case,
  ** it might take longer than normal for the lock to free.
  **
  ** After 5 RETRYs, we begin calling sqlite3OsSleep().  The first few
  ** calls to sqlite3OsSleep() have a delay of 1 microsecond.  Really this
  ** is more of a scheduler yield than an actual delay.  But on the 10th
  ** and subsequent retries, the delays start becoming longer and longer,
  ** so that on the 100th (and last) RETRY we delay for 323 milliseconds.
  ** The total delay time before giving up is less than 10 seconds.
  */
  if( cnt>5 ){
    int nDelay = 1;                      /* Pause time in microseconds */
    if( cnt>100 ){
      VVA_ONLY( pWal->lockError = 1; )
      return SQLITE_PROTOCOL;
    }
    if( cnt>=10 ) nDelay = (cnt-9)*(cnt-9)*39;
    sqlite3OsSleep(pWal->pVfs, nDelay);
  }
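
  /* To make the schedule above concrete: on the 10th retry the pause is
  ** (10-9)*(10-9)*39 = 39 microseconds, and on the 100th (and last) retry
  ** it is 91*91*39 = 322,959 microseconds, or roughly 323 milliseconds.
  ** Summing 39*k*k microseconds for k=1..91 comes to a little under 10
  ** seconds in total, matching the figures quoted in the comment above.
  */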
  if( !useWal ){
    assert( rc==SQLITE_OK );
    if( pWal->bShmUnreliable==0 ){
      rc = walIndexReadHdr(pWal, pChanged);
    }
    if( rc==SQLITE_BUSY ){
      /* If there is not a recovery running in another thread or process
      ** then convert BUSY errors to WAL_RETRY.  If recovery is known to
      ** be running, convert BUSY to BUSY_RECOVERY.  There is a race here
      ** which might cause WAL_RETRY to be returned even if BUSY_RECOVERY
      ** would be technically correct.  But the race is benign since with
      ** WAL_RETRY this routine will be called again and will probably be
      ** right on the second iteration.
      */
      if( pWal->apWiData[0]==0 ){
        /* This branch is taken when the xShmMap() method returns SQLITE_BUSY.
        ** We assume this is a transient condition, so return WAL_RETRY.  The
        ** xShmMap() implementation used by the default unix and win32 VFS
        ** modules may return SQLITE_BUSY due to a race condition in the
        ** code that determines whether or not the shared-memory region
        ** must be zeroed before the requested page is returned.
        */
        rc = WAL_RETRY;
      }else if( SQLITE_OK==(rc = walLockShared(pWal, WAL_RECOVER_LOCK)) ){
        walUnlockShared(pWal, WAL_RECOVER_LOCK);
        rc = WAL_RETRY;
      }else if( rc==SQLITE_BUSY ){
        rc = SQLITE_BUSY_RECOVERY;
      }
    }
    if( rc!=SQLITE_OK ){
      return rc;
    }else if( pWal->bShmUnreliable ){
      return walBeginShmUnreliable(pWal, pChanged);
    }
  }

  assert( pWal->nWiData>0 );
  assert( pWal->apWiData[0]!=0 );
  pInfo = walCkptInfo(pWal);
  if( !useWal && AtomicLoad(&pInfo->nBackfill)==pWal->hdr.mxFrame
#ifdef SQLITE_ENABLE_SNAPSHOT
   && (pWal->pSnapshot==0 || pWal->hdr.mxFrame==0)
#endif
  ){
    /* The WAL has been completely backfilled (or it is empty)
    ** and can be safely ignored.
    */
    rc = walLockShared(pWal, WAL_READ_LOCK(0));
    walShmBarrier(pWal);
    if( rc==SQLITE_OK ){
      if( memcmp((void *)walIndexHdr(pWal), &pWal->hdr, sizeof(WalIndexHdr)) ){
        /* It is not safe to allow the reader to continue here if frames
        ** may have been appended to the log before READ_LOCK(0) was obtained.
        ** When holding READ_LOCK(0), the reader ignores the entire log file,
        ** which implies that the database file contains a trustworthy
        ** snapshot.  Since holding READ_LOCK(0) prevents a checkpoint from
        ** happening, this is usually correct.
        **
        ** However, if frames have been appended to the log (or if the log
        ** is wrapped and written for that matter) before the READ_LOCK(0)
        ** is obtained, that is not necessarily true.  A checkpointer may
        ** have started to backfill the appended frames but crashed before
        ** it finished.  Leaving a corrupt image in the database file.
        */
        walUnlockShared(pWal, WAL_READ_LOCK(0));
        return WAL_RETRY;
      }
      pWal->readLock = 0;
      return SQLITE_OK;
    }else if( rc!=SQLITE_BUSY ){
      return rc;
    }
  }

  /* If we get this far, it means that the reader will want to use
  ** the WAL to get at content from recent commits.  The job now is
  ** to select one of the aReadMark[] entries that is closest to
  ** but not exceeding pWal->hdr.mxFrame and lock that entry.
  */
  mxReadMark = 0;
  mxI = 0;
  mxFrame = pWal->hdr.mxFrame;
#ifdef SQLITE_ENABLE_SNAPSHOT
  if( pWal->pSnapshot && pWal->pSnapshot->mxFrame<mxFrame ){
    mxFrame = pWal->pSnapshot->mxFrame;
  }
#endif
  for(i=1; i<WAL_NREADER; i++){
    u32 thisMark = AtomicLoad(pInfo->aReadMark+i);
    if( mxReadMark<=thisMark && thisMark<=mxFrame ){
      assert( thisMark!=READMARK_NOT_USED );
      mxReadMark = thisMark;
      mxI = i;
    }
  }
  if( (pWal->readOnly & WAL_SHM_RDONLY)==0
   && (mxReadMark<mxFrame || mxI==0)
  ){
    for(i=1; i<WAL_NREADER; i++){
      rc = walLockExclusive(pWal, WAL_READ_LOCK(i), 1);
      if( rc==SQLITE_OK ){
        AtomicStore(pInfo->aReadMark+i, mxFrame);
        mxReadMark = mxFrame;
        mxI = i;
        walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1);
        break;
      }else if( rc!=SQLITE_BUSY ){
        return rc;
      }
    }
  }
  if( mxI==0 ){
    assert( rc==SQLITE_BUSY || (pWal->readOnly & WAL_SHM_RDONLY)!=0 );
    return rc==SQLITE_BUSY ? WAL_RETRY : SQLITE_READONLY_CANTINIT;
  }

  rc = walLockShared(pWal, WAL_READ_LOCK(mxI));
  if( rc ){
    return rc==SQLITE_BUSY ? WAL_RETRY : rc;
  }
  /* Now that the read-lock has been obtained, check that neither the
  ** value in the aReadMark[] array or the contents of the wal-index
  ** header have changed.
  **
  ** It is necessary to check that the wal-index header did not change
  ** between the time it was read and when the shared-lock was obtained
  ** on WAL_READ_LOCK(mxI) to account for the possibility
  ** that the log file may have been wrapped by a writer, or that frames
  ** that occur later in the log than pWal->hdr.mxFrame may have been
  ** copied into the database by a checkpointer.  If either of these things
  ** happened, then reading the database with the current value of
  ** pWal->hdr.mxFrame risks reading a corrupted snapshot.  So, retry
  ** instead.
  **
  ** Before checking that the live wal-index header has not changed
  ** since it was read, set Wal.minFrame to the first frame in the wal
  ** file that has not yet been checkpointed.  This client will not need
  ** to read any frames earlier than minFrame from the wal file - they
  ** can be safely read directly from the database file.
  **
  ** Because a ShmBarrier() call is made between taking the copy of
  ** nBackfill and checking that the wal-header in shared-memory still
  ** matches the one cached in pWal->hdr, it is guaranteed that the
  ** checkpointer that set nBackfill was not working with a wal-index
  ** header newer than that cached in pWal->hdr.  If it were, that could
  ** cause a problem.  The checkpointer could omit to checkpoint
  ** a version of page X that lies before pWal->minFrame (call that version
  ** A) on the basis that there is a newer version (version B) of the same
  ** page later in the wal file.  But if version B happens to lie past
  ** frame pWal->hdr.mxFrame - then the client would incorrectly assume
  ** that it can read version A from the database file.  However, since
  ** we can guarantee that the checkpointer that set nBackfill could not
  ** see any pages past pWal->hdr.mxFrame, this problem does not come up.
  */
  pWal->minFrame = AtomicLoad(&pInfo->nBackfill)+1;
  walShmBarrier(pWal);
  if( AtomicLoad(pInfo->aReadMark+mxI)!=mxReadMark
   || memcmp((void *)walIndexHdr(pWal), &pWal->hdr, sizeof(WalIndexHdr))
  ){
    walUnlockShared(pWal, WAL_READ_LOCK(mxI));
    return WAL_RETRY;
  }else{
    assert( mxReadMark<=pWal->hdr.mxFrame );
    pWal->readLock = (i16)mxI;
  }
  return rc;
}
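
/* To illustrate the aReadMark[] selection in walTryBeginRead(): with
** mxFrame==20 and shared marks of, say, {0, 17, 20, READMARK_NOT_USED, ...}
** (illustrative values only), the scan over slots 1..WAL_NREADER-1 settles
** on mxI==2 with mxReadMark==20 and takes WAL_READ_LOCK(2).  Slot 0 is
** special: a reader holding WAL_READ_LOCK(0) ignores the WAL entirely and
** reads all content directly from the database file.
*/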
#ifdef SQLITE_ENABLE_SNAPSHOT
/*
** Attempt to reduce the value of the WalCkptInfo.nBackfillAttempted
** variable so that older snapshots can be accessed.  To do this, loop
** through all wal frames from nBackfillAttempted to (nBackfill+1),
** comparing their content to the corresponding page with the database
** file, if any.  Set nBackfillAttempted to the frame number of the
** first frame for which the wal file content matches the db file.
**
** This is only really safe if the file-system is such that any page
** writes made by earlier checkpointers were atomic operations, which
** is not always true.  It is also possible that nBackfillAttempted
** may be left set to a value larger than expected, if a wal frame
** contains content that duplicates an earlier version of the same
** page.
**
** SQLITE_OK is returned if successful, or an SQLite error code if an
** error occurs.  It is not an error if nBackfillAttempted cannot be
** decreased at all.
*/
int sqlite3WalSnapshotRecover(Wal *pWal){
  int rc;

  assert( pWal->readLock>=0 );
  rc = walLockExclusive(pWal, WAL_CKPT_LOCK, 1);
  if( rc==SQLITE_OK ){
    volatile WalCkptInfo *pInfo = walCkptInfo(pWal);
    int szPage = (int)pWal->szPage;
    i64 szDb;                     /* Size of db file in bytes */

    rc = sqlite3OsFileSize(pWal->pDbFd, &szDb);
    if( rc==SQLITE_OK ){
      void *pBuf1 = sqlite3_malloc(szPage);
      void *pBuf2 = sqlite3_malloc(szPage);
      if( pBuf1==0 || pBuf2==0 ){
        rc = SQLITE_NOMEM;
      }else{
        u32 i = pInfo->nBackfillAttempted;
        for(i=pInfo->nBackfillAttempted; i>AtomicLoad(&pInfo->nBackfill); i--){
          WalHashLoc sLoc;          /* Hash table location */
          u32 pgno;                 /* Page number in db file */
          i64 iDbOff;               /* Offset of db file entry */
          i64 iWalOff;              /* Offset of wal file entry */

          rc = walHashGet(pWal, walFramePage(i), &sLoc);
          if( rc!=SQLITE_OK ) break;
          assert( i - sLoc.iZero - 1 >= 0 );
          pgno = sLoc.aPgno[i-sLoc.iZero-1];
          iDbOff = (i64)(pgno-1)*szPage;

          if( iDbOff+szPage<=szDb ){
            iWalOff = walFrameOffset(i, szPage) + WAL_FRAME_HDRSIZE;
            rc = sqlite3OsRead(pWal->pWalFd, pBuf1, szPage, iWalOff);

            if( rc==SQLITE_OK ){
              rc = sqlite3OsRead(pWal->pDbFd, pBuf2, szPage, iDbOff);
            }

            if( rc!=SQLITE_OK || 0==memcmp(pBuf1, pBuf2, szPage) ){
              break;
            }
          }

          pInfo->nBackfillAttempted = i-1;
        }
      }

      sqlite3_free(pBuf1);
      sqlite3_free(pBuf2);
    }
    walUnlockExclusive(pWal, WAL_CKPT_LOCK, 1);
  }

  return rc;
}
#endif /* SQLITE_ENABLE_SNAPSHOT */
/*
** Begin a read transaction on the database.
**
** This routine used to be called sqlite3OpenSnapshot() and with good reason:
** it takes a snapshot of the state of the WAL and wal-index for the current
** instant in time.  The current thread will continue to use this snapshot.
** Other threads might append new content to the WAL and wal-index but
** that extra content is ignored by the current thread.
**
** If the database contents have changed since the previous read
** transaction, then *pChanged is set to 1 before returning.  The
** Pager layer will use this to know that its cache is stale and
** needs to be flushed.
*/
int sqlite3WalBeginReadTransaction(Wal *pWal, int *pChanged){
  int rc;                         /* Return code */
  int cnt = 0;                    /* Number of TryBeginRead attempts */
#ifdef SQLITE_ENABLE_SNAPSHOT
  int bChanged = 0;
  WalIndexHdr *pSnapshot = pWal->pSnapshot;
#endif

  assert( pWal->ckptLock==0 );

#ifdef SQLITE_ENABLE_SNAPSHOT
  if( pSnapshot ){
    if( memcmp(pSnapshot, &pWal->hdr, sizeof(WalIndexHdr))!=0 ){
      bChanged = 1;
    }

    /* It is possible that there is a checkpointer thread running
    ** concurrent with this code.  If this is the case, it may be that the
    ** checkpointer has already determined that it will checkpoint
    ** snapshot X, where X is later in the wal file than pSnapshot, but
    ** has not yet set the pInfo->nBackfillAttempted variable to indicate
    ** its intent.  To avoid the race condition this leads to, ensure that
    ** there is no checkpointer process by taking a shared CKPT lock
    ** before checking pInfo->nBackfillAttempted.  */
    (void)walEnableBlocking(pWal);
    rc = walLockShared(pWal, WAL_CKPT_LOCK);
    walDisableBlocking(pWal);

    if( rc!=SQLITE_OK ){
      return rc;
    }
    pWal->ckptLock = 1;
  }
#endif

  do{
    rc = walTryBeginRead(pWal, pChanged, 0, ++cnt);
  }while( rc==WAL_RETRY );
  testcase( (rc&0xff)==SQLITE_BUSY );
  testcase( (rc&0xff)==SQLITE_IOERR );
  testcase( rc==SQLITE_PROTOCOL );
  testcase( rc==SQLITE_OK );

#ifdef SQLITE_ENABLE_SNAPSHOT
  if( rc==SQLITE_OK ){
    if( pSnapshot && memcmp(pSnapshot, &pWal->hdr, sizeof(WalIndexHdr))!=0 ){
      /* At this point the client has a lock on an aReadMark[] slot holding
      ** a value equal to or smaller than pSnapshot->mxFrame, but pWal->hdr
      ** is populated with the wal-index header corresponding to the head
      ** of the wal file.  Verify that pSnapshot is still valid before
      ** continuing.  Reasons why pSnapshot might no longer be valid:
      **
      **    (1)  The WAL file has been reset since the snapshot was taken.
      **         In this case, the salt will have changed.
      **
      **    (2)  A checkpoint has been attempted that wrote frames past
      **         pSnapshot->mxFrame into the database file.  Note that the
      **         checkpoint need not have completed for this to cause problems.
      */
      volatile WalCkptInfo *pInfo = walCkptInfo(pWal);

      assert( pWal->readLock>0 || pWal->hdr.mxFrame==0 );
      assert( pInfo->aReadMark[pWal->readLock]<=pSnapshot->mxFrame );

      /* Check that the wal file has not been wrapped.  Assuming that it has
      ** not, also check that no checkpointer has attempted to checkpoint any
      ** frames beyond pSnapshot->mxFrame.  If either of these conditions are
      ** true, return SQLITE_ERROR_SNAPSHOT.  Otherwise, overwrite pWal->hdr
      ** with *pSnapshot and set *pChanged as appropriate for opening the
      ** snapshot.  */
      if( !memcmp(pSnapshot->aSalt, pWal->hdr.aSalt, sizeof(pWal->hdr.aSalt))
       && pSnapshot->mxFrame>=pInfo->nBackfillAttempted
      ){
        assert( pWal->readLock>0 );
        memcpy(&pWal->hdr, pSnapshot, sizeof(WalIndexHdr));
        *pChanged = bChanged;
      }else{
        rc = SQLITE_ERROR_SNAPSHOT;
      }

      /* A client using a non-current snapshot may not ignore any frames
      ** from the start of the wal file.  This is because, for a system
      ** where (minFrame < iSnapshot < maxFrame), a checkpointer may
      ** have omitted to checkpoint a frame earlier than minFrame in
      ** the file because there exists a frame after iSnapshot that
      ** is the same database page.  */
      pWal->minFrame = 1;

      if( rc!=SQLITE_OK ){
        sqlite3WalEndReadTransaction(pWal);
      }
    }
  }

  /* Release the shared CKPT lock obtained above. */
  if( pWal->ckptLock ){
    assert( pSnapshot );
    walUnlockShared(pWal, WAL_CKPT_LOCK);
    pWal->ckptLock = 0;
  }
#endif
  return rc;
}

/*
** Finish with a read transaction.  All this does is release the
** read-lock.
*/
void sqlite3WalEndReadTransaction(Wal *pWal){
  sqlite3WalEndWriteTransaction(pWal);
  if( pWal->readLock>=0 ){
    walUnlockShared(pWal, WAL_READ_LOCK(pWal->readLock));
    pWal->readLock = -1;
  }
}
/*
** Search the wal file for page pgno.  If found, set *piRead to the frame that
** contains the page.  Otherwise, if pgno is not in the wal file, set *piRead
** to zero.
**
** Return SQLITE_OK if successful, or an error code if an error occurs.  If an
** error does occur, the final value of *piRead is undefined.
*/
int sqlite3WalFindFrame(
  Wal *pWal,                      /* WAL handle */
  Pgno pgno,                      /* Database page number to read data for */
  u32 *piRead                     /* OUT: Frame number (or zero) */
){
  u32 iRead = 0;                  /* If !=0, WAL frame to return data from */
  u32 iLast = pWal->hdr.mxFrame;  /* Last page in WAL for this reader */
  int iHash;                      /* Used to loop through N hash tables */
  int iMinHash;

  /* This routine is only ever called from within a read transaction. */
  assert( pWal->readLock>=0 || pWal->lockError );

  /* If the "last page" field of the wal-index header snapshot is 0, then
  ** no data will be read from the wal under any circumstances.  Return early
  ** in this case as an optimization.  Likewise, if pWal->readLock==0,
  ** then the WAL is ignored by the reader so return early, as if the
  ** WAL were empty.
  */
  if( iLast==0 || (pWal->readLock==0 && pWal->bShmUnreliable==0) ){
    *piRead = 0;
    return SQLITE_OK;
  }

  /* Search the hash table or tables for an entry matching page number
  ** pgno.  Each iteration of the following for() loop searches one
  ** hash table (each hash table indexes up to HASHTABLE_NPAGE frames).
  **
  ** This code might run concurrently to the code in walIndexAppend()
  ** that adds entries to the wal-index (and possibly to this hash
  ** table).  This means the value just read from the hash
  ** slot (aHash[iKey]) may have been added before or after the
  ** current read transaction was opened.  Values added after the
  ** read transaction was opened may have been written incorrectly -
  ** i.e. these slots may contain garbage data.  However, we assume
  ** that any slots written before the current read transaction was
  ** opened remain unmodified.
  **
  ** For the reasons above, the if(...) condition featured in the inner
  ** loop of the following block is more stringent than would be required
  ** if we had exclusive access to the hash-table:
  **
  **   (aPgno[iFrame]==pgno):
  **     This condition filters out normal hash-table collisions.
  **
  **   (iFrame<=iLast):
  **     This condition filters out entries that were added to the hash
  **     table after the current read-transaction had started.
  */
  iMinHash = walFramePage(pWal->minFrame);
  for(iHash=walFramePage(iLast); iHash>=iMinHash; iHash--){
    WalHashLoc sLoc;              /* Hash table location */
    int iKey;                     /* Hash slot index */
    int nCollide;                 /* Number of hash collisions remaining */
    int rc;                       /* Error code */
    u32 iH;

    rc = walHashGet(pWal, iHash, &sLoc);
    if( rc!=SQLITE_OK ){
      return rc;
    }
    nCollide = HASHTABLE_NSLOT;
    iKey = walHash(pgno);
    while( (iH = AtomicLoad(&sLoc.aHash[iKey]))!=0 ){
      u32 iFrame = iH + sLoc.iZero;
      if( iFrame<=iLast && iFrame>=pWal->minFrame && sLoc.aPgno[iH-1]==pgno ){
        assert( iFrame>iRead || CORRUPT_DB );
        iRead = iFrame;
      }
      if( (nCollide--)==0 ){
        return SQLITE_CORRUPT_BKPT;
      }
      iKey = walNextHash(iKey);
    }
    if( iRead ) break;
  }

#ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT
  /* If expensive assert() statements are available, do a linear search
  ** of the wal-index file content.  Make sure the results agree with the
  ** result obtained using the hash indexes above.  */
  {
    u32 iRead2 = 0;
    u32 iTest;
    assert( pWal->bShmUnreliable || pWal->minFrame>0 );
    for(iTest=iLast; iTest>=pWal->minFrame && iTest>0; iTest--){
      if( walFramePgno(pWal, iTest)==pgno ){
        iRead2 = iTest;
        break;
      }
    }
    assert( iRead==iRead2 );
  }
#endif

  *piRead = iRead;
  return SQLITE_OK;
}
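
/* As an illustration of the lookup above: for a hypothetical pgno whose
** walHash() value is 5, the scan probes sLoc.aHash[5] and then follows
** walNextHash() on each collision, stopping at the first empty slot.  Each
** non-zero entry iH names frame sLoc.iZero+iH, and because entries are
** appended in frame order along the probe chain, the last qualifying match
** (the newest frame not exceeding iLast) is the one left in iRead.
*/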
/*
** Read the contents of frame iRead from the wal file into buffer pOut
** (which is nOut bytes in size).  Return SQLITE_OK if successful, or an
** error code otherwise.
*/
int sqlite3WalReadFrame(
  Wal *pWal,                      /* WAL handle */
  u32 iRead,                      /* Frame to read */
  int nOut,                       /* Size of buffer pOut in bytes */
  u8 *pOut                        /* Buffer to write page data to */
){
  int sz;
  i64 iOffset;
  sz = pWal->hdr.szPage;
  sz = (sz&0xfe00) + ((sz&0x0001)<<16);
  testcase( sz<=32768 );
  testcase( sz>=65536 );
  iOffset = walFrameOffset(iRead, sz) + WAL_FRAME_HDRSIZE;
  /* testcase( IS_BIG_INT(iOffset) ); // requires a 4GiB WAL */
  return sqlite3OsRead(pWal->pWalFd, pOut, (nOut>sz ? sz : nOut), iOffset);
}
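
/* For reference, walFrameOffset(iFrame, szPage) expands (per its
** definition earlier in this file) to
** WAL_HDRSIZE + (iFrame-1)*(szPage+WAL_FRAME_HDRSIZE).  With 4096-byte
** pages, WAL_HDRSIZE==32 and WAL_FRAME_HDRSIZE==24, the page data of
** frame 1 therefore starts at byte 32+24==56 and that of frame 2 at
** 32+4120+24==4176.
*/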
/*
** Return the size of the database in pages (or zero, if unknown).
*/
Pgno sqlite3WalDbsize(Wal *pWal){
  if( pWal && ALWAYS(pWal->readLock>=0) ){
    return pWal->hdr.nPage;
  }
  return 0;
}
/*
** This function starts a write transaction on the WAL.
**
** A read transaction must have already been started by a prior call
** to sqlite3WalBeginReadTransaction().
**
** If another thread or process has written into the database since
** the read transaction was started, then it is not possible for this
** thread to write as doing so would cause a fork.  So this routine
** returns SQLITE_BUSY in that case and no write transaction is started.
**
** There can only be a single writer active at a time.
*/
int sqlite3WalBeginWriteTransaction(Wal *pWal){
  int rc;

#ifdef SQLITE_ENABLE_SETLK_TIMEOUT
  /* If the write-lock is already held, then it was obtained before the
  ** read-transaction was even opened, making this call a no-op.
  ** Return early. */
  if( pWal->writeLock ){
    assert( !memcmp(&pWal->hdr,(void *)walIndexHdr(pWal),sizeof(WalIndexHdr)) );
    return SQLITE_OK;
  }
#endif

  /* Cannot start a write transaction without first holding a read
  ** transaction. */
  assert( pWal->readLock>=0 );
  assert( pWal->writeLock==0 && pWal->iReCksum==0 );

  if( pWal->readOnly ){
    return SQLITE_READONLY;
  }

  /* Only one writer allowed at a time.  Get the write lock.  Return
  ** SQLITE_BUSY if unable.
  */
  rc = walLockExclusive(pWal, WAL_WRITE_LOCK, 1);
  if( rc ){
    return rc;
  }
  pWal->writeLock = 1;

  /* If another connection has written to the database file since the
  ** time the read transaction on this connection was started, then
  ** the write is disallowed.
  */
  if( memcmp(&pWal->hdr, (void *)walIndexHdr(pWal), sizeof(WalIndexHdr))!=0 ){
    walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);
    pWal->writeLock = 0;
    rc = SQLITE_BUSY_SNAPSHOT;
  }

  return rc;
}

/*
** End a write transaction.  The commit has already been done.  This
** routine merely releases the lock.
*/
int sqlite3WalEndWriteTransaction(Wal *pWal){
  if( pWal->writeLock ){
    walUnlockExclusive(pWal, WAL_WRITE_LOCK, 1);
    pWal->writeLock = 0;
    pWal->iReCksum = 0;
    pWal->truncateOnCommit = 0;
  }
  return SQLITE_OK;
}
/*
** If any data has been written (but not committed) to the log file, this
** function moves the write-pointer back to the start of the transaction.
**
** Additionally, the callback function is invoked for each frame written
** to the WAL since the start of the transaction.  If the callback returns
** other than SQLITE_OK, it is not invoked again and the error code is
** returned to the caller.
**
** Otherwise, if the callback function does not return an error, this
** function returns SQLITE_OK.
*/
int sqlite3WalUndo(Wal *pWal, int (*xUndo)(void *, Pgno), void *pUndoCtx){
  int rc = SQLITE_OK;
  if( ALWAYS(pWal->writeLock) ){
    Pgno iMax = pWal->hdr.mxFrame;
    Pgno iFrame;

    /* Restore the client's cache of the wal-index header to the state it
    ** was in before the client began writing to the database.
    */
    memcpy(&pWal->hdr, (void *)walIndexHdr(pWal), sizeof(WalIndexHdr));

    for(iFrame=pWal->hdr.mxFrame+1;
        ALWAYS(rc==SQLITE_OK) && iFrame<=iMax;
        iFrame++
    ){
      /* This call cannot fail.  Unless the page for which the page number
      ** is passed as the second argument is (a) in the cache and
      ** (b) has an outstanding reference, then xUndo is either a no-op
      ** (if (a) is false) or simply expels the page from the cache (if (b)
      ** is false).
      **
      ** If the upper layer is doing a rollback, it is guaranteed that there
      ** are no outstanding references to any page other than page 1.  And
      ** page 1 is never written to the log until the transaction is
      ** committed.  As a result, the call to xUndo may not fail.
      */
      assert( walFramePgno(pWal, iFrame)!=1 );
      rc = xUndo(pUndoCtx, walFramePgno(pWal, iFrame));
    }
    if( iMax!=pWal->hdr.mxFrame ) walCleanupHash(pWal);
  }
  return rc;
}

/*
** Argument aWalData must point to an array of WAL_SAVEPOINT_NDATA u32
** values.  This function populates the array with values required to
** "rollback" the write position of the WAL handle back to the current
** point in the event of a savepoint rollback (via WalSavepointUndo()).
*/
void sqlite3WalSavepoint(Wal *pWal, u32 *aWalData){
  assert( pWal->writeLock );
  aWalData[0] = pWal->hdr.mxFrame;
  aWalData[1] = pWal->hdr.aFrameCksum[0];
  aWalData[2] = pWal->hdr.aFrameCksum[1];
  aWalData[3] = pWal->nCkpt;
}

/*
** Move the write position of the WAL back to the point identified by
** the values in the aWalData[] array.  aWalData must point to an array
** of WAL_SAVEPOINT_NDATA u32 values that has been previously populated
** by a call to WalSavepoint().
*/
int sqlite3WalSavepointUndo(Wal *pWal, u32 *aWalData){
  int rc = SQLITE_OK;

  assert( pWal->writeLock );
  assert( aWalData[3]!=pWal->nCkpt || aWalData[0]<=pWal->hdr.mxFrame );

  if( aWalData[3]!=pWal->nCkpt ){
    /* This savepoint was opened immediately after the write-transaction
    ** was started.  Right after that, the writer decided to wrap around
    ** to the start of the log.  Update the savepoint values to match.
    */
    aWalData[0] = 0;
    aWalData[3] = pWal->nCkpt;
  }

  if( aWalData[0]<pWal->hdr.mxFrame ){
    pWal->hdr.mxFrame = aWalData[0];
    pWal->hdr.aFrameCksum[0] = aWalData[1];
    pWal->hdr.aFrameCksum[1] = aWalData[2];
    walCleanupHash(pWal);
  }

  return rc;
}
/*
** This function is called just before writing a set of frames to the log
** file (see sqlite3WalFrames()).  It checks to see if, instead of appending
** to the current log file, it is possible to overwrite the start of the
** existing log file with the new frames (i.e. "reset" the log).  If so,
** it sets pWal->hdr.mxFrame to 0.  Otherwise, pWal->hdr.mxFrame is left
** unchanged.
**
** SQLITE_OK is returned if no error is encountered (regardless of whether
** or not pWal->hdr.mxFrame is modified).  An SQLite error code is returned
** if an error occurs.
*/
static int walRestartLog(Wal *pWal){
  int rc = SQLITE_OK;
  int cnt;

  if( pWal->readLock==0 ){
    volatile WalCkptInfo *pInfo = walCkptInfo(pWal);
    assert( pInfo->nBackfill==pWal->hdr.mxFrame );
    if( pInfo->nBackfill>0 ){
      u32 salt1;
      sqlite3_randomness(4, &salt1);
      rc = walLockExclusive(pWal, WAL_READ_LOCK(1), WAL_NREADER-1);
      if( rc==SQLITE_OK ){
        /* If all readers are using WAL_READ_LOCK(0) (in other words if no
        ** readers are currently using the WAL), then the transaction's
        ** frames will overwrite the start of the existing log.  Update the
        ** wal-index header to reflect this.
        **
        ** In theory it would be Ok to update the cache of the header only
        ** at this point.  But updating the actual wal-index header is also
        ** safe and means there is no special case for sqlite3WalUndo()
        ** to handle if this transaction is rolled back.  */
        walRestartHdr(pWal, salt1);
        walUnlockExclusive(pWal, WAL_READ_LOCK(1), WAL_NREADER-1);
      }else if( rc!=SQLITE_BUSY ){
        return rc;
      }
    }
    walUnlockShared(pWal, WAL_READ_LOCK(0));
    pWal->readLock = -1;
    cnt = 0;
    do{
      int notUsed;
      rc = walTryBeginRead(pWal, &notUsed, 1, ++cnt);
    }while( rc==WAL_RETRY );
    assert( (rc&0xff)!=SQLITE_BUSY ); /* BUSY not possible when useWal==1 */
    testcase( (rc&0xff)==SQLITE_IOERR );
    testcase( rc==SQLITE_PROTOCOL );
    testcase( rc==SQLITE_OK );
  }
  return rc;
}
/*
** Information about the current state of the WAL file and where
** the next fsync should occur - passed from sqlite3WalFrames() into
** walWriteToLog().
*/
typedef struct WalWriter {
  Wal *pWal;                      /* The complete WAL information */
  sqlite3_file *pFd;              /* The WAL file to which we write */
  sqlite3_int64 iSyncPoint;       /* Fsync at this offset */
  int syncFlags;                  /* Flags for the fsync */
  int szPage;                     /* Size of one page */
} WalWriter;

/*
** Write iAmt bytes of content into the WAL file beginning at iOffset.
** Do a sync when crossing the p->iSyncPoint boundary.
**
** In other words, if iSyncPoint is in between iOffset and iOffset+iAmt,
** first write the part before iSyncPoint, then sync, then write the
** rest.
*/
static int walWriteToLog(
  WalWriter *p,              /* WAL to write to */
  void *pContent,            /* Content to be written */
  int iAmt,                  /* Number of bytes to write */
  sqlite3_int64 iOffset      /* Start writing at this offset */
){
  int rc;
  if( iOffset<p->iSyncPoint && iOffset+iAmt>=p->iSyncPoint ){
    int iFirstAmt = (int)(p->iSyncPoint - iOffset);
    rc = sqlite3OsWrite(p->pFd, pContent, iFirstAmt, iOffset);
    if( rc ) return rc;
    iOffset += iFirstAmt;
    iAmt -= iFirstAmt;
    pContent = (void*)(iFirstAmt + (char*)pContent);
    assert( WAL_SYNC_FLAGS(p->syncFlags)!=0 );
    rc = sqlite3OsSync(p->pFd, WAL_SYNC_FLAGS(p->syncFlags));
    if( iAmt==0 || rc ) return rc;
  }
  rc = sqlite3OsWrite(p->pFd, pContent, iAmt, iOffset);
  return rc;
}

/*
** Write out a single frame of the WAL
*/
static int walWriteOneFrame(
  WalWriter *p,               /* Where to write the frame */
  PgHdr *pPage,               /* The page of the frame to be written */
  int nTruncate,              /* The commit flag.  Usually 0.  >0 for commit */
  sqlite3_int64 iOffset       /* Byte offset at which to write */
){
  int rc;                         /* Result code from subfunctions */
  void *pData;                    /* Data actually written */
  u8 aFrame[WAL_FRAME_HDRSIZE];   /* Buffer to assemble frame-header in */
  pData = pPage->pData;
  walEncodeFrame(p->pWal, pPage->pgno, nTruncate, pData, aFrame);
  rc = walWriteToLog(p, aFrame, sizeof(aFrame), iOffset);
  if( rc ) return rc;
  /* Write the page data */
  rc = walWriteToLog(p, pData, p->szPage, iOffset+sizeof(aFrame));
  return rc;
}
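
/* Each frame written above consists of a 24-byte header assembled by
** walEncodeFrame(), holding the page number (4 bytes), the
** nTruncate/commit field (4 bytes), the two salts copied from the wal
** header (8 bytes) and a running checksum (8 bytes), followed by exactly
** szPage bytes of page data.
*/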
/*
** This function is called as part of committing a transaction within which
** one or more frames have been overwritten.  It updates the checksums for
** all frames written to the wal file by the current transaction starting
** with the earliest to have been overwritten.
**
** SQLITE_OK is returned if successful, or an SQLite error code otherwise.
*/
static int walRewriteChecksums(Wal *pWal, u32 iLast){
  const int szPage = pWal->szPage;/* Database page size */
  int rc = SQLITE_OK;             /* Return code */
  u8 *aBuf;                       /* Buffer to load data from wal file into */
  u8 aFrame[WAL_FRAME_HDRSIZE];   /* Buffer to assemble frame-headers in */
  u32 iRead;                      /* Next frame to read from wal file */
  i64 iCksumOff;

  aBuf = sqlite3_malloc(szPage + WAL_FRAME_HDRSIZE);
  if( aBuf==0 ) return SQLITE_NOMEM;

  /* Find the checksum values to use as input for recalculating the
  ** first checksum.  If the first frame is frame 1 (implying that the current
  ** transaction restarted the wal file), these values must be read from the
  ** wal-file header.  Otherwise, read them from the frame header of the
  ** previous frame.  */
  assert( pWal->iReCksum>0 );
  if( pWal->iReCksum==1 ){
    iCksumOff = 24;
  }else{
    iCksumOff = walFrameOffset(pWal->iReCksum-1, szPage) + 16;
  }
  rc = sqlite3OsRead(pWal->pWalFd, aBuf, sizeof(u32)*2, iCksumOff);
  pWal->hdr.aFrameCksum[0] = sqlite3Get4byte(aBuf);
  pWal->hdr.aFrameCksum[1] = sqlite3Get4byte(&aBuf[sizeof(u32)]);

  iRead = pWal->iReCksum;
  pWal->iReCksum = 0;
  for(; rc==SQLITE_OK && iRead<=iLast; iRead++){
    i64 iOff = walFrameOffset(iRead, szPage);
    rc = sqlite3OsRead(pWal->pWalFd, aBuf, szPage+WAL_FRAME_HDRSIZE, iOff);
    if( rc==SQLITE_OK ){
      u32 iPgno, nDbSize;
      iPgno = sqlite3Get4byte(aBuf);
      nDbSize = sqlite3Get4byte(&aBuf[4]);

      walEncodeFrame(pWal, iPgno, nDbSize, &aBuf[WAL_FRAME_HDRSIZE], aFrame);
      rc = sqlite3OsWrite(pWal->pWalFd, aFrame, sizeof(aFrame), iOff);
    }
  }

  sqlite3_free(aBuf);
  return rc;
}
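
/* The magic offsets above follow from the file format: 24 is the offset
** of the checksum within the 32-byte wal header, and walFrameOffset(x)+16
** is the offset of the 8-byte running checksum within frame x's 24-byte
** frame header.  Reading 8 bytes from either location yields the checksum
** seed needed to re-encode the frames that follow.
*/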
/*
** Write a set of frames to the log.  The caller must hold the write-lock
** on the log file (obtained using sqlite3WalBeginWriteTransaction()).
*/
int sqlite3WalFrames(
  Wal *pWal,                      /* Wal handle to write to */
  int szPage,                     /* Database page-size in bytes */
  PgHdr *pList,                   /* List of dirty pages to write */
  Pgno nTruncate,                 /* Database size after this commit */
  int isCommit,                   /* True if this is a commit */
  int sync_flags                  /* Flags to pass to OsSync() (or 0) */
){
  int rc;                         /* Used to catch return codes */
  u32 iFrame;                     /* Next frame address */
  PgHdr *p;                       /* Iterator to run through pList with. */
  PgHdr *pLast = 0;               /* Last frame in list */
  int nExtra = 0;                 /* Number of extra copies of last page */
  int szFrame;                    /* The size of a single frame */
  i64 iOffset;                    /* Next byte to write in WAL file */
  WalWriter w;                    /* The writer */
  u32 iFirst = 0;                 /* First frame that may be overwritten */
  WalIndexHdr *pLive;             /* Pointer to shared header */

  assert( pList );
  assert( pWal->writeLock );

  /* If this frame set completes a transaction, then nTruncate>0.  If
  ** nTruncate==0 then this frame set does not complete the transaction. */
  assert( (isCommit!=0)==(nTruncate!=0) );

#if defined(SQLITE_TEST) && defined(SQLITE_DEBUG)
  { int cnt; for(cnt=0, p=pList; p; p=p->pDirty, cnt++){}
    WALTRACE(("WAL%p: frame write begin. %d frames. mxFrame=%d. %s\n",
              pWal, cnt, pWal->hdr.mxFrame, isCommit ? "Commit" : "Spill"));
  }
#endif

  pLive = (WalIndexHdr*)walIndexHdr(pWal);
  if( memcmp(&pWal->hdr, (void *)pLive, sizeof(WalIndexHdr))!=0 ){
    iFirst = pLive->mxFrame+1;
  }

  /* See if it is possible to write these frames into the start of the
  ** log file, instead of appending to it at pWal->hdr.mxFrame.
  */
  if( SQLITE_OK!=(rc = walRestartLog(pWal)) ){
    return rc;
  }

  /* If this is the first frame written into the log, write the WAL
  ** header to the start of the WAL file.  See comments at the top of
  ** this source file for a description of the WAL header format.
  */
  iFrame = pWal->hdr.mxFrame;
  if( iFrame==0 ){
    u8 aWalHdr[WAL_HDRSIZE];      /* Buffer to assemble wal-header in */
    u32 aCksum[2];                /* Checksum for wal-header */

    sqlite3Put4byte(&aWalHdr[0], (WAL_MAGIC | SQLITE_BIGENDIAN));
    sqlite3Put4byte(&aWalHdr[4], WAL_MAX_VERSION);
    sqlite3Put4byte(&aWalHdr[8], szPage);
    sqlite3Put4byte(&aWalHdr[12], pWal->nCkpt);
    if( pWal->nCkpt==0 ) sqlite3_randomness(8, pWal->hdr.aSalt);
    memcpy(&aWalHdr[16], pWal->hdr.aSalt, 8);
    walChecksumBytes(1, aWalHdr, WAL_HDRSIZE-2*4, 0, aCksum);
    sqlite3Put4byte(&aWalHdr[24], aCksum[0]);
    sqlite3Put4byte(&aWalHdr[28], aCksum[1]);

    pWal->szPage = szPage;
    pWal->hdr.bigEndCksum = SQLITE_BIGENDIAN;
    pWal->hdr.aFrameCksum[0] = aCksum[0];
    pWal->hdr.aFrameCksum[1] = aCksum[1];
    pWal->truncateOnCommit = 1;

    rc = sqlite3OsWrite(pWal->pWalFd, aWalHdr, sizeof(aWalHdr), 0);
    WALTRACE(("WAL%p: wal-header write %s\n", pWal, rc ? "failed" : "ok"));
    if( rc!=SQLITE_OK ){
      return rc;
    }

    /* Sync the header (unless SQLITE_IOCAP_SEQUENTIAL is true or unless
    ** all syncing is turned off by PRAGMA synchronous=OFF).  Otherwise
    ** an out-of-order write following a WAL restart could result in
    ** database corruption.  See the ticket:
    **
    **   https://sqlite.org/src/info/ff5be73dee
    */
    if( pWal->syncHeader ){
      rc = sqlite3OsSync(pWal->pWalFd, CKPT_SYNC_FLAGS(sync_flags));
      if( rc ) return rc;
    }
  }
  assert( (int)pWal->szPage==szPage );

  /* Setup information needed to write frames into the WAL */
  w.pWal = pWal;
  w.pFd = pWal->pWalFd;
  w.iSyncPoint = 0;
  w.syncFlags = sync_flags;
  w.szPage = szPage;
  iOffset = walFrameOffset(iFrame+1, szPage);
  szFrame = szPage + WAL_FRAME_HDRSIZE;

  /* Write all frames into the log file exactly once */
  for(p=pList; p; p=p->pDirty){
    int nDbSize;   /* 0 normally.  Positive == commit flag */

    /* Check if this page has already been written into the wal file by
    ** the current transaction.  If so, overwrite the existing frame and
    ** set Wal.writeLock to WAL_WRITELOCK_RECKSUM - indicating that
    ** checksums must be recomputed when the transaction is committed.  */
    if( iFirst && (p->pDirty || isCommit==0) ){
      u32 iWrite = 0;
      VVA_ONLY(rc =) sqlite3WalFindFrame(pWal, p->pgno, &iWrite);
      assert( rc==SQLITE_OK || iWrite==0 );
      if( iWrite>=iFirst ){
        i64 iOff = walFrameOffset(iWrite, szPage) + WAL_FRAME_HDRSIZE;
        void *pData;
        if( pWal->iReCksum==0 || iWrite<pWal->iReCksum ){
          pWal->iReCksum = iWrite;
        }
        pData = p->pData;
        rc = sqlite3OsWrite(pWal->pWalFd, pData, szPage, iOff);
        if( rc ) return rc;
        p->flags &= ~PGHDR_WAL_APPEND;
        continue;
      }
    }

    iFrame++;
    assert( iOffset==walFrameOffset(iFrame, szPage) );
    nDbSize = (isCommit && p->pDirty==0) ? nTruncate : 0;
    rc = walWriteOneFrame(&w, p, nDbSize, iOffset);
    if( rc ) return rc;
    pLast = p;
    iOffset += szFrame;
    p->flags |= PGHDR_WAL_APPEND;
  }

  /* Recalculate checksums within the wal file if required. */
  if( isCommit && pWal->iReCksum ){
    rc = walRewriteChecksums(pWal, iFrame);
    if( rc ) return rc;
  }

  /* If this is the end of a transaction, then we might need to pad
  ** the transaction and/or sync the WAL file.
  **
  ** Padding and syncing only occur if this set of frames complete a
  ** transaction and if PRAGMA synchronous=FULL.  If synchronous==NORMAL
  ** or synchronous==OFF, then no padding or syncing are needed.
  **
  ** If SQLITE_IOCAP_POWERSAFE_OVERWRITE is defined, then padding is not
  ** needed and only the sync is done.  If padding is needed, then the
  ** final frame is repeated (with its commit mark) until the next sector
  ** boundary is crossed.  Only the part of the WAL prior to the last
  ** sector boundary is synced; the part of the last frame that extends
  ** past the sector boundary is written after the sync.
  */
  if( isCommit && WAL_SYNC_FLAGS(sync_flags)!=0 ){
    int bSync = 1;
    if( pWal->padToSectorBoundary ){
      int sectorSize = sqlite3SectorSize(pWal->pWalFd);
      w.iSyncPoint = ((iOffset+sectorSize-1)/sectorSize)*sectorSize;
      bSync = (w.iSyncPoint==iOffset);
      testcase( bSync );
      while( iOffset<w.iSyncPoint ){
        rc = walWriteOneFrame(&w, pLast, nTruncate, iOffset);
        if( rc ) return rc;
        iOffset += szFrame;
        nExtra++;
        assert( pLast!=0 );
      }
    }
    if( bSync ){
      assert( rc==SQLITE_OK );
      rc = sqlite3OsSync(w.pFd, WAL_SYNC_FLAGS(sync_flags));
    }
  }

  /* If this frame set completes the first transaction in the WAL and
  ** if PRAGMA journal_size_limit is set, then truncate the WAL to the
  ** journal size limit, if possible.
  */
  if( isCommit && pWal->truncateOnCommit && pWal->mxWalSize>=0 ){
    i64 sz = pWal->mxWalSize;
    if( walFrameOffset(iFrame+nExtra+1, szPage)>pWal->mxWalSize ){
      sz = walFrameOffset(iFrame+nExtra+1, szPage);
    }
    walLimitSize(pWal, sz);
    pWal->truncateOnCommit = 0;
  }

  /* Append data to the wal-index.  It is not necessary to lock the
  ** wal-index to do this as the SQLITE_SHM_WRITE lock held on the wal-index
  ** guarantees that there are no other writers, and no data that may
  ** be in use by existing readers is being overwritten.
  */
  iFrame = pWal->hdr.mxFrame;
  for(p=pList; p && rc==SQLITE_OK; p=p->pDirty){
    if( (p->flags & PGHDR_WAL_APPEND)==0 ) continue;
    iFrame++;
    rc = walIndexAppend(pWal, iFrame, p->pgno);
  }
  assert( pLast!=0 || nExtra==0 );
  while( rc==SQLITE_OK && nExtra>0 ){
    iFrame++;
    nExtra--;
    rc = walIndexAppend(pWal, iFrame, pLast->pgno);
  }

  if( rc==SQLITE_OK ){
    /* Update the private copy of the header. */
    pWal->hdr.szPage = (u16)((szPage&0xff00) | (szPage>>16));
    testcase( szPage<=32768 );
    testcase( szPage>=65536 );
    pWal->hdr.mxFrame = iFrame;
    if( isCommit ){
      pWal->hdr.iChange++;
      pWal->hdr.nPage = nTruncate;
    }
    /* If this is a commit, update the wal-index header too. */
    if( isCommit ){
      walIndexWriteHdr(pWal);
      pWal->iCallback = iFrame;
    }
  }

  WALTRACE(("WAL%p: frame write %s\n", pWal, rc ? "failed" : "ok"));
  return rc;
}
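
/* A worked example of the padding logic above: with a 4096-byte sector
** and a commit whose last frame ends at offset 10000, w.iSyncPoint is
** rounded up to 12288 and the final frame is repeated, commit mark and
** all, until iOffset reaches at least 12288; walWriteToLog() then issues
** the fsync as its write crosses that boundary.  If the commit already
** ends exactly on a sector boundary, bSync stays true and a single plain
** sync is performed instead.
*/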
/*
** This routine is called to implement sqlite3_wal_checkpoint() and
** related interfaces.
**
** Obtain a CHECKPOINT lock and then backfill as much information as
** we can from WAL into the database.
**
** If parameter xBusy is not NULL, it is a pointer to a busy-handler
** callback.  In this case this function runs a blocking checkpoint.
*/
int sqlite3WalCheckpoint(
  Wal *pWal,                      /* Wal connection */
  sqlite3 *db,                    /* Check this handle's interrupt flag */
  int eMode,                      /* PASSIVE, FULL, RESTART, or TRUNCATE */
  int (*xBusy)(void*),            /* Function to call when busy */
  void *pBusyArg,                 /* Context argument for xBusyHandler */
  int sync_flags,                 /* Flags to sync db file with (or 0) */
  int nBuf,                       /* Size of temporary buffer */
  u8 *zBuf,                       /* Temporary buffer to use */
  int *pnLog,                     /* OUT: Number of frames in WAL */
  int *pnCkpt                     /* OUT: Number of backfilled frames in WAL */
){
  int rc;                         /* Return code */
  int isChanged = 0;              /* True if a new wal-index header is loaded */
  int eMode2 = eMode;             /* Mode to pass to walCheckpoint() */
  int (*xBusy2)(void*) = xBusy;   /* Busy handler for eMode2 */

  assert( pWal->ckptLock==0 );
  assert( pWal->writeLock==0 );

  /* EVIDENCE-OF: R-62920-47450 The busy-handler callback is never invoked
  ** in the SQLITE_CHECKPOINT_PASSIVE mode. */
  assert( eMode!=SQLITE_CHECKPOINT_PASSIVE || xBusy==0 );

  if( pWal->readOnly ) return SQLITE_READONLY;
  WALTRACE(("WAL%p: checkpoint begins\n", pWal));

  /* Enable blocking locks, if possible.  If blocking locks are successfully
  ** enabled, set xBusy2=0 so that the busy-handler is never invoked. */
  sqlite3WalDb(pWal, db);
  (void)walEnableBlocking(pWal);

  /* IMPLEMENTATION-OF: R-62028-47212 All calls obtain an exclusive
  ** "checkpoint" lock on the database file.
  ** EVIDENCE-OF: R-10421-19736 If any other process is running a
  ** checkpoint operation at the same time, the lock cannot be obtained and
  ** SQLITE_BUSY is returned.
  ** EVIDENCE-OF: R-53820-33897 Even if there is a busy-handler configured,
  ** it will not be invoked in this case.
  */
  rc = walLockExclusive(pWal, WAL_CKPT_LOCK, 1);
  testcase( rc==SQLITE_BUSY );
  testcase( rc!=SQLITE_OK && xBusy2!=0 );
  if( rc==SQLITE_OK ){
    pWal->ckptLock = 1;

    /* IMPLEMENTATION-OF: R-59782-36818 The SQLITE_CHECKPOINT_FULL, RESTART and
    ** TRUNCATE modes also obtain the exclusive "writer" lock on the database
    ** file.
    **
    ** EVIDENCE-OF: R-60642-04082 If the writer lock cannot be obtained
    ** immediately, and a busy-handler is configured, it is invoked and the
    ** writer lock retried until either the busy-handler returns 0 or the
    ** lock is successfully obtained.
    */
    if( eMode!=SQLITE_CHECKPOINT_PASSIVE ){
      rc = walBusyLock(pWal, xBusy2, pBusyArg, WAL_WRITE_LOCK, 1);
      if( rc==SQLITE_OK ){
        pWal->writeLock = 1;
      }else if( rc==SQLITE_BUSY ){
        eMode2 = SQLITE_CHECKPOINT_PASSIVE;
        xBusy2 = 0;
        rc = SQLITE_OK;
      }
    }
  }

  /* Read the wal-index header. */
  if( rc==SQLITE_OK ){
    walDisableBlocking(pWal);
    rc = walIndexReadHdr(pWal, &isChanged);
    (void)walEnableBlocking(pWal);
    if( isChanged && pWal->pDbFd->pMethods->iVersion>=3 ){
      sqlite3OsUnfetch(pWal->pDbFd, 0, 0);
    }
  }

  /* Copy data from the log to the database file. */
  if( rc==SQLITE_OK ){
    if( pWal->hdr.mxFrame && walPagesize(pWal)!=nBuf ){
      rc = SQLITE_CORRUPT_BKPT;
    }else{
      rc = walCheckpoint(pWal, db, eMode2, xBusy2, pBusyArg, sync_flags, zBuf);
    }

    /* If no error occurred, set the output variables. */
    if( rc==SQLITE_OK || rc==SQLITE_BUSY ){
      if( pnLog ) *pnLog = (int)pWal->hdr.mxFrame;
      if( pnCkpt ) *pnCkpt = (int)(walCkptInfo(pWal)->nBackfill);
    }
  }

  if( isChanged ){
    /* If a new wal-index header was loaded before the checkpoint was
    ** performed, then the pager-cache associated with pWal is now
    ** out of date.  So zero the cached wal-index header to ensure that
    ** next time the pager opens a snapshot on this database it knows that
    ** the cache needs to be reset.
    */
    memset(&pWal->hdr, 0, sizeof(WalIndexHdr));
  }

  walDisableBlocking(pWal);
  sqlite3WalDb(pWal, 0);

  /* Release the locks. */
  sqlite3WalEndWriteTransaction(pWal);
  if( pWal->ckptLock ){
    walUnlockExclusive(pWal, WAL_CKPT_LOCK, 1);
    pWal->ckptLock = 0;
  }
  WALTRACE(("WAL%p: checkpoint %s\n", pWal, rc ? "failed" : "ok"));
#ifdef SQLITE_ENABLE_SETLK_TIMEOUT
  if( rc==SQLITE_BUSY_TIMEOUT ) rc = SQLITE_BUSY;
#endif
  return (rc==SQLITE_OK && eMode!=eMode2 ? SQLITE_BUSY : rc);
}
/* Return the value to pass to a sqlite3_wal_hook callback, the
** number of frames in the WAL at the point of the last commit since
** sqlite3WalCallback() was called.  If no commits have occurred since
** the last call, then return 0.
*/
int sqlite3WalCallback(Wal *pWal){
  u32 ret = 0;
  if( pWal ){
    ret = pWal->iCallback;
    pWal->iCallback = 0;
  }
  return (int)ret;
}
/*
** This function is called to change the WAL subsystem into or out
** of locking_mode=EXCLUSIVE.
**
** If op is zero, then attempt to change from locking_mode=EXCLUSIVE
** into locking_mode=NORMAL.  This means that we must acquire a lock
** on the pWal->readLock byte. If the WAL is already in locking_mode=NORMAL
** or if the acquisition of the lock fails, then return 0. If the
** transition out of exclusive-mode is successful, return 1.  This
** operation must occur while the pager is still holding the exclusive
** lock on the main database file.
**
** If op is one, then change from locking_mode=NORMAL into
** locking_mode=EXCLUSIVE.  This means that the pWal->readLock must
** be released.  Return 1 if the transition is made and 0 if the
** WAL is already in exclusive-locking mode - meaning that this
** routine is a no-op.  The pager must already hold the exclusive lock
** on the main database file before invoking this operation.
**
** If op is negative, then do a dry-run of the op==1 case but do
** not actually change anything.  The pager uses this to see if it
** should acquire the database exclusive lock prior to invoking
** the op==1 case.
*/
int sqlite3WalExclusiveMode(Wal *pWal, int op){
  int rc;
  assert( pWal->writeLock==0 );
  assert( pWal->exclusiveMode!=WAL_HEAPMEMORY_MODE || op==-1 );

  /* pWal->readLock is usually set, but might be -1 if there was a
  ** prior error while attempting to acquire a read-lock.  This cannot
  ** happen if the connection is actually in exclusive mode (as no xShmLock
  ** locks are taken in this case).  Nor should the pager attempt to
  ** upgrade to exclusive-mode following such an error.
  */
  assert( pWal->readLock>=0 || pWal->lockError );
  assert( pWal->readLock>=0 || (op<=0 && pWal->exclusiveMode==0) );

  if( op==0 ){
    if( pWal->exclusiveMode!=WAL_NORMAL_MODE ){
      pWal->exclusiveMode = WAL_NORMAL_MODE;
      if( walLockShared(pWal, WAL_READ_LOCK(pWal->readLock))!=SQLITE_OK ){
        pWal->exclusiveMode = WAL_EXCLUSIVE_MODE;
      }
      rc = pWal->exclusiveMode==WAL_NORMAL_MODE;
    }else{
      /* Already in locking_mode=NORMAL */
      rc = 0;
    }
  }else if( op>0 ){
    assert( pWal->exclusiveMode==WAL_NORMAL_MODE );
    assert( pWal->readLock>=0 );
    walUnlockShared(pWal, WAL_READ_LOCK(pWal->readLock));
    pWal->exclusiveMode = WAL_EXCLUSIVE_MODE;
    rc = 1;
  }else{
    rc = pWal->exclusiveMode==WAL_NORMAL_MODE;
  }
  return rc;
}
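/*
** A minimal usage sketch (not part of the original file): the op==1/op==0
** transitions above are driven from user code via the locking_mode pragma;
** the WAL-level lock change happens on the next database access.
*/
static int lockingModeExample(sqlite3 *db){
  /* Request exclusive locking; the next transaction performs op==1. */
  int rc = sqlite3_exec(db, "PRAGMA locking_mode=EXCLUSIVE", 0, 0, 0);
  if( rc==SQLITE_OK ){
    /* Back to normal; op==0 re-acquires a shared WAL read-lock. */
    rc = sqlite3_exec(db, "PRAGMA locking_mode=NORMAL", 0, 0, 0);
  }
  return rc;
}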
/*
** Return true if the argument is non-NULL and the WAL module is using
** heap-memory for the wal-index. Otherwise, if the argument is NULL or the
** WAL module is using shared-memory, return false.
*/
int sqlite3WalHeapMemory(Wal *pWal){
  return (pWal && pWal->exclusiveMode==WAL_HEAPMEMORY_MODE);
}
#ifdef SQLITE_ENABLE_SNAPSHOT
/* Create a snapshot object.  The content of a snapshot is opaque to
** every other subsystem, so the WAL module can put whatever it needs
** in the object.
*/
int sqlite3WalSnapshotGet(Wal *pWal, sqlite3_snapshot **ppSnapshot){
  int rc = SQLITE_OK;
  WalIndexHdr *pRet;
  static const u32 aZero[4] = { 0, 0, 0, 0 };

  assert( pWal->readLock>=0 && pWal->writeLock==0 );

  if( memcmp(&pWal->hdr.aFrameCksum[0], aZero, 16)==0 ){
    *ppSnapshot = 0;
    return SQLITE_ERROR;
  }
  pRet = (WalIndexHdr*)sqlite3_malloc(sizeof(WalIndexHdr));
  if( pRet==0 ){
    rc = SQLITE_NOMEM;
  }else{
    memcpy(pRet, &pWal->hdr, sizeof(WalIndexHdr));
    *ppSnapshot = (sqlite3_snapshot*)pRet;
  }

  return rc;
}

/* Try to open on pSnapshot when the next read-transaction starts
*/
void sqlite3WalSnapshotOpen(Wal *pWal, sqlite3_snapshot *pSnapshot){
  pWal->pSnapshot = (WalIndexHdr*)pSnapshot;
}
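/*
** A minimal usage sketch (not part of the original file): the two routines
** above sit behind the public sqlite3_snapshot_get()/sqlite3_snapshot_open()
** API (SQLITE_ENABLE_SNAPSHOT builds, WAL databases only).  The exact
** preconditions are documented with the public API; "main" is illustrative.
*/
static int snapshotExample(sqlite3 *db){
  sqlite3_snapshot *pSnap = 0;
  int rc = sqlite3_exec(db, "BEGIN", 0, 0, 0);        /* open a read txn */
  if( rc==SQLITE_OK ) rc = sqlite3_snapshot_get(db, "main", &pSnap);
  sqlite3_exec(db, "COMMIT", 0, 0, 0);
  if( rc==SQLITE_OK ){
    rc = sqlite3_exec(db, "BEGIN", 0, 0, 0);          /* new read txn ... */
    if( rc==SQLITE_OK ){
      rc = sqlite3_snapshot_open(db, "main", pSnap);  /* ... at the snapshot */
      sqlite3_exec(db, "COMMIT", 0, 0, 0);
    }
    sqlite3_snapshot_free(pSnap);
  }
  return rc;
}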
/*
** Return a +ve value if snapshot p1 is newer than p2. A -ve value if
** p1 is older than p2 and zero if p1 and p2 are the same snapshot.
*/
int sqlite3_snapshot_cmp(sqlite3_snapshot *p1, sqlite3_snapshot *p2){
  WalIndexHdr *pHdr1 = (WalIndexHdr*)p1;
  WalIndexHdr *pHdr2 = (WalIndexHdr*)p2;

  /* aSalt[0] is a copy of the value stored in the wal file header. It
  ** is incremented each time the wal file is restarted.  */
  if( pHdr1->aSalt[0]<pHdr2->aSalt[0] ) return -1;
  if( pHdr1->aSalt[0]>pHdr2->aSalt[0] ) return +1;
  if( pHdr1->mxFrame<pHdr2->mxFrame ) return -1;
  if( pHdr1->mxFrame>pHdr2->mxFrame ) return +1;
  return 0;
}
/*
** The caller currently has a read transaction open on the database.
** This function takes a SHARED lock on the CHECKPOINTER slot and then
** checks if the snapshot passed as the second argument is still
** available. If so, SQLITE_OK is returned.
**
** If the snapshot is not available, SQLITE_ERROR is returned. Or, if
** the CHECKPOINTER lock cannot be obtained, SQLITE_BUSY. If any error
** occurs (any value other than SQLITE_OK is returned), the CHECKPOINTER
** lock is released before returning.
*/
int sqlite3WalSnapshotCheck(Wal *pWal, sqlite3_snapshot *pSnapshot){
  int rc;
  rc = walLockShared(pWal, WAL_CKPT_LOCK);
  if( rc==SQLITE_OK ){
    WalIndexHdr *pNew = (WalIndexHdr*)pSnapshot;
    if( memcmp(pNew->aSalt, pWal->hdr.aSalt, sizeof(pWal->hdr.aSalt))
     || pNew->mxFrame<walCkptInfo(pWal)->nBackfillAttempted
    ){
      rc = SQLITE_ERROR_SNAPSHOT;
      walUnlockShared(pWal, WAL_CKPT_LOCK);
    }
  }
  return rc;
}

/*
** Release a lock obtained by an earlier successful call to
** sqlite3WalSnapshotCheck().
*/
void sqlite3WalSnapshotUnlock(Wal *pWal){
  assert( pWal );
  walUnlockShared(pWal, WAL_CKPT_LOCK);
}

#endif /* SQLITE_ENABLE_SNAPSHOT */
#ifdef SQLITE_ENABLE_ZIPVFS
/*
** If the argument is not NULL, it points to a Wal object that holds a
** read-lock. This function returns the database page-size if it is known,
** or zero if it is not (or if pWal is NULL).
*/
int sqlite3WalFramesize(Wal *pWal){
  assert( pWal==0 || pWal->readLock>=0 );
  return (pWal ? pWal->szPage : 0);
}
#endif

/* Return the sqlite3_file object for the WAL file
*/
sqlite3_file *sqlite3WalFile(Wal *pWal){
  return pWal->pWalFd;
}

#endif /* #ifndef SQLITE_OMIT_WAL */
source/libs/tdb/src/sqliteinc/btree.h (deleted, 100644 → 0)
/*
** 2001 September 15
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** This header file defines the interface to the sqlite B-Tree file
** subsystem.  See comments in the source code for a detailed description
** of what each interface routine does.
*/
#ifndef SQLITE_BTREE_H
#define SQLITE_BTREE_H

/* TODO: This definition is just included so other modules compile. It
** needs to be revisited.
*/
#define SQLITE_N_BTREE_META 16

/*
** If defined as non-zero, auto-vacuum is enabled by default. Otherwise
** it must be turned on for each database using "PRAGMA auto_vacuum = 1".
*/
#ifndef SQLITE_DEFAULT_AUTOVACUUM
  #define SQLITE_DEFAULT_AUTOVACUUM 0
#endif

#define BTREE_AUTOVACUUM_NONE 0        /* Do not do auto-vacuum */
#define BTREE_AUTOVACUUM_FULL 1        /* Do full auto-vacuum */
#define BTREE_AUTOVACUUM_INCR 2        /* Incremental vacuum */

/*
** Forward declarations of structure
*/
typedef struct Btree Btree;
typedef struct BtCursor BtCursor;
typedef struct BtShared BtShared;
typedef struct BtreePayload BtreePayload;
int sqlite3BtreeOpen(
  sqlite3_vfs *pVfs,       /* VFS to use with this b-tree */
  const char *zFilename,   /* Name of database file to open */
  sqlite3 *db,             /* Associated database connection */
  Btree **ppBtree,         /* Return open Btree* here */
  int flags,               /* Flags */
  int vfsFlags             /* Flags passed through to VFS open */
);

/* The flags parameter to sqlite3BtreeOpen can be the bitwise or of the
** following values.
**
** NOTE: These values must match the corresponding PAGER_ values in
** pager.h.
*/
#define BTREE_OMIT_JOURNAL  1  /* Do not create or use a rollback journal */
#define BTREE_MEMORY        2  /* This is an in-memory DB */
#define BTREE_SINGLE        4  /* The file contains at most 1 b-tree */
#define BTREE_UNORDERED     8  /* Use of a hash implementation is OK */
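/*
** A minimal sketch (not part of the original header): opening a one-table,
** in-memory b-tree with the flags above.  The zero vfsFlags value and NULL
** filename are placeholders; real callers pass SQLITE_OPEN_* bits through
** to the VFS.
*/
static int openMemBtree(sqlite3_vfs *pVfs, sqlite3 *db, Btree **ppBt){
  return sqlite3BtreeOpen(pVfs, 0, db, ppBt, BTREE_MEMORY|BTREE_SINGLE, 0);
}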
int sqlite3BtreeClose(Btree*);
int sqlite3BtreeSetCacheSize(Btree*,int);
int sqlite3BtreeSetSpillSize(Btree*,int);
#if SQLITE_MAX_MMAP_SIZE>0
  int sqlite3BtreeSetMmapLimit(Btree*,sqlite3_int64);
#endif
int sqlite3BtreeSetPagerFlags(Btree*,unsigned);
int sqlite3BtreeSetPageSize(Btree *p, int nPagesize, int nReserve, int eFix);
int sqlite3BtreeGetPageSize(Btree*);
Pgno sqlite3BtreeMaxPageCount(Btree*,Pgno);
Pgno sqlite3BtreeLastPage(Btree*);
int sqlite3BtreeSecureDelete(Btree*,int);
int sqlite3BtreeGetRequestedReserve(Btree*);
int sqlite3BtreeGetReserveNoMutex(Btree *p);
int sqlite3BtreeSetAutoVacuum(Btree *, int);
int sqlite3BtreeGetAutoVacuum(Btree *);
int sqlite3BtreeBeginTrans(Btree*,int,int*);
int sqlite3BtreeCommitPhaseOne(Btree*, const char*);
int sqlite3BtreeCommitPhaseTwo(Btree*, int);
int sqlite3BtreeCommit(Btree*);
int sqlite3BtreeRollback(Btree*,int,int);
int sqlite3BtreeBeginStmt(Btree*,int);
int sqlite3BtreeCreateTable(Btree*, Pgno*, int flags);
int sqlite3BtreeTxnState(Btree*);
int sqlite3BtreeIsInBackup(Btree*);

void *sqlite3BtreeSchema(Btree *, int, void(*)(void *));
int sqlite3BtreeSchemaLocked(Btree *pBtree);
#ifndef SQLITE_OMIT_SHARED_CACHE
int sqlite3BtreeLockTable(Btree *pBtree, int iTab, u8 isWriteLock);
#endif
/* Savepoints are named, nestable SQL transactions mostly implemented */
/* in vdbe.c and pager.c.  See https://sqlite.org/lang_savepoint.html */
int sqlite3BtreeSavepoint(Btree *, int, int);

/* "Checkpoint" only refers to WAL. See https://sqlite.org/wal.html#ckpt */
#ifndef SQLITE_OMIT_WAL
  int sqlite3BtreeCheckpoint(Btree*, int, int *, int *);
#endif

const char *sqlite3BtreeGetFilename(Btree *);
const char *sqlite3BtreeGetJournalname(Btree *);
int sqlite3BtreeCopyFile(Btree *, Btree *);

int sqlite3BtreeIncrVacuum(Btree *);

/* The flags parameter to sqlite3BtreeCreateTable can be the bitwise OR
** of the flags shown below.
**
** Every SQLite table must have either BTREE_INTKEY or BTREE_BLOBKEY set.
** With BTREE_INTKEY, the table key is a 64-bit integer and arbitrary data
** is stored in the leaves. (BTREE_INTKEY is used for SQL tables.) With
** BTREE_BLOBKEY, the key is an arbitrary BLOB and no content is stored
** anywhere - the key is the content.  (BTREE_BLOBKEY is used for SQL
** indices.)
*/
#define BTREE_INTKEY  1    /* Table has only 64-bit signed integer keys */
#define BTREE_BLOBKEY 2    /* Table has keys only - no data */

int sqlite3BtreeDropTable(Btree*, int, int*);
int sqlite3BtreeClearTable(Btree*, int, i64*);
int sqlite3BtreeClearTableOfCursor(BtCursor*);
int sqlite3BtreeTripAllCursors(Btree*, int, int);

void sqlite3BtreeGetMeta(Btree *pBtree, int idx, u32 *pValue);
int sqlite3BtreeUpdateMeta(Btree*, int idx, u32 value);

int sqlite3BtreeNewDb(Btree *p);

/*
** The second parameter to sqlite3BtreeGetMeta or sqlite3BtreeUpdateMeta
** should be one of the following values.  The integer values are assigned
** to constants so that the offset of the corresponding field in an
** SQLite database header may be found using the following formula:
**
**   offset = 36 + (idx * 4)
**
** For example, the free-page-count field is located at byte offset 36 of
** the database file header.  The incr-vacuum-flag field is located at
** byte offset 64 (== 36+4*7).
**
** The BTREE_DATA_VERSION value is not really a value stored in the header.
** It is a read-only number computed by the pager.  But we merge it with
** the header value access routines since its access pattern is the same.
** Call it a "virtual meta value".
*/
#define BTREE_FREE_PAGE_COUNT     0
#define BTREE_SCHEMA_VERSION      1
#define BTREE_FILE_FORMAT         2
#define BTREE_DEFAULT_CACHE_SIZE  3
#define BTREE_LARGEST_ROOT_PAGE   4
#define BTREE_TEXT_ENCODING       5
#define BTREE_USER_VERSION        6
#define BTREE_INCR_VACUUM         7
#define BTREE_APPLICATION_ID      8
#define BTREE_DATA_VERSION        15  /* A virtual meta-value */
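/*
** A minimal sketch (not part of the original header): the offset formula
** above, written out.  btreeMetaOffset() is a hypothetical helper.
*/
static int btreeMetaOffset(int idx){
  return 36 + idx*4;   /* e.g. BTREE_INCR_VACUUM (7) -> byte offset 64 */
}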
/*
** Kinds of hints that can be passed into the sqlite3BtreeCursorHint()
** interface.
**
** BTREE_HINT_RANGE  (arguments: Expr*, Mem*)
**
**     The first argument is an Expr* (which is guaranteed to be constant for
**     the lifetime of the cursor) that defines constraints on which rows
**     might be fetched with this cursor.  The Expr* tree may contain
**     TK_REGISTER nodes that refer to values stored in the array of registers
**     passed as the second parameter.  In other words, if Expr.op==TK_REGISTER
**     then the value of the node is the value in Mem[pExpr.iTable].  Any
**     TK_COLUMN node in the expression tree refers to the Expr.iColumn-th
**     column of the b-tree of the cursor.  The Expr tree will not contain
**     any function calls nor subqueries nor references to b-trees other than
**     the cursor being hinted.
**
**     The design of the _RANGE hint is to aid b-tree implementations that try
**     to prefetch content from remote machines - to provide those
**     implementations with limits on what needs to be prefetched and thereby
**     reduce network bandwidth.
**
** Note that BTREE_HINT_FLAGS with BTREE_BULKLOAD is the only hint used by
** standard SQLite.  The other hints are provided for extensions that use
** the SQLite parser and code generator but substitute their own storage
** engine.
*/
#define BTREE_HINT_RANGE 0       /* Range constraints on queries */

/*
** Values that may be OR'd together to form the argument to the
** BTREE_HINT_FLAGS hint for sqlite3BtreeCursorHint():
**
** The BTREE_BULKLOAD flag is set on index cursors when the index is going
** to be filled with content that is already in sorted order.
**
** The BTREE_SEEK_EQ flag is set on cursors that will get OP_SeekGE or
** OP_SeekLE opcodes for a range search, but where the range of entries
** selected will all have the same key.  In other words, the cursor will
** be used only for equality key searches.
**
*/
#define BTREE_BULKLOAD 0x00000001  /* Used to fill index in sorted order */
#define BTREE_SEEK_EQ  0x00000002  /* EQ seeks only - no range seeks */

/*
** Flags passed as the third argument to sqlite3BtreeCursor().
**
** For read-only cursors the wrFlag argument is always zero. For read-write
** cursors it may be set to either (BTREE_WRCSR|BTREE_FORDELETE) or just
** (BTREE_WRCSR). If the BTREE_FORDELETE bit is set, then the cursor will
** only be used by SQLite for the following:
**
**   * to seek to and then delete specific entries, and/or
**
**   * to read values that will be used to create keys that other
**     BTREE_FORDELETE cursors will seek to and delete.
**
** The BTREE_FORDELETE flag is an optimization hint.  It is not used by
** this, the native b-tree engine of SQLite, but it is available to
** alternative storage engines that might be substituted in place of this
** b-tree system.  For alternative storage engines in which a delete of
** the main table row automatically deletes corresponding index rows,
** the FORDELETE flag hint allows those alternative storage engines to
** skip a lot of work.  Namely:  FORDELETE cursors may treat all SEEK
** and DELETE operations as no-ops, and any READ operation against a
** FORDELETE cursor may return a null row: 0x01 0x00.
*/
#define BTREE_WRCSR     0x00000004     /* read-write cursor */
#define BTREE_FORDELETE 0x00000008     /* Cursor is for seek/delete only */
int sqlite3BtreeCursor(
  Btree*,                              /* BTree containing table to open */
  Pgno iTable,                         /* Index of root page */
  int wrFlag,                          /* 1 for writing.  0 for read-only */
  struct KeyInfo*,                     /* First argument to compare function */
  BtCursor *pCursor                    /* Space to write cursor structure */
);
BtCursor *sqlite3BtreeFakeValidCursor(void);
int sqlite3BtreeCursorSize(void);
void sqlite3BtreeCursorZero(BtCursor*);
void sqlite3BtreeCursorHintFlags(BtCursor*, unsigned);
#ifdef SQLITE_ENABLE_CURSOR_HINTS
void sqlite3BtreeCursorHint(BtCursor*, int, ...);
#endif

int sqlite3BtreeCloseCursor(BtCursor*);
int sqlite3BtreeTableMoveto(BtCursor*, i64 intKey, int bias, int *pRes);
int sqlite3BtreeIndexMoveto(BtCursor*, UnpackedRecord *pUnKey, int *pRes);
int sqlite3BtreeCursorHasMoved(BtCursor*);
int sqlite3BtreeCursorRestore(BtCursor*, int*);
int sqlite3BtreeDelete(BtCursor*, u8 flags);

/* Allowed flags for sqlite3BtreeDelete() and sqlite3BtreeInsert() */
#define BTREE_SAVEPOSITION 0x02  /* Leave cursor pointing at NEXT or PREV */
#define BTREE_AUXDELETE    0x04  /* not the primary delete operation */
#define BTREE_APPEND       0x08  /* Insert is likely an append */
#define BTREE_PREFORMAT    0x80  /* Inserted data is a preformatted cell */
/* An instance of the BtreePayload object describes the content of a single
** entry in either an index or table btree.
**
** Index btrees (used for indexes and also WITHOUT ROWID tables) contain
** an arbitrary key and no data.  These btrees have pKey,nKey set to the
** key and the pData,nData,nZero fields are uninitialized.  The aMem,nMem
** fields give an array of Mem objects that are a decomposition of the key.
** The nMem field might be zero, indicating that no decomposition is available.
**
** Table btrees (used for rowid tables) contain an integer rowid used as
** the key and passed in the nKey field.  The pKey field is zero.
** pData,nData hold the content of the new entry.  nZero extra zero bytes
** are appended to the end of the content when constructing the entry.
** The aMem,nMem fields are uninitialized for table btrees.
**
** Field usage summary:
**
**               Table BTrees                   Index Btrees
**
**   pKey        always NULL                    encoded key
**   nKey        the ROWID                      length of pKey
**   pData       data                           not used
**   aMem        not used                       decomposed key value
**   nMem        not used                       entries in aMem
**   nData       length of pData                not used
**   nZero       extra zeros after pData        not used
**
** This object is used to pass information into sqlite3BtreeInsert().  The
** same information used to be passed as five separate parameters.  But placing
** the information into this object helps to keep the interface more
** organized and understandable, and it also helps the resulting code to
** run a little faster by using fewer registers for parameter passing.
*/
struct BtreePayload {
  const void *pKey;     /* Key content for indexes.  NULL for tables */
  sqlite3_int64 nKey;   /* Size of pKey for indexes.  PRIMARY KEY for tabs */
  const void *pData;    /* Data for tables. */
  sqlite3_value *aMem;  /* First of nMem value in the unpacked pKey */
  u16 nMem;             /* Number of aMem[] values.  Might be zero */
  int nData;            /* Size of pData.  0 if none. */
  int nZero;            /* Extra zero data appended after pData,nData */
};
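/*
** A minimal sketch (not part of the original header): populating a
** BtreePayload for a table (rowid) b-tree and handing it to
** sqlite3BtreeInsert(), declared below.  pCur is assumed to be an open
** write cursor; memset() comes from <string.h> via sqliteInt.h.
*/
static int insertRowExample(BtCursor *pCur, const void *pRow, int nRow){
  BtreePayload x;
  memset(&x, 0, sizeof(x));
  x.nKey  = 42;      /* rowid of the new entry (pKey stays NULL for tables) */
  x.pData = pRow;    /* row content */
  x.nData = nRow;
  return sqlite3BtreeInsert(pCur, &x, 0, 0);
}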
int sqlite3BtreeInsert(BtCursor*, const BtreePayload *pPayload,
                       int flags, int seekResult);
int sqlite3BtreeFirst(BtCursor*, int *pRes);
int sqlite3BtreeLast(BtCursor*, int *pRes);
int sqlite3BtreeNext(BtCursor*, int flags);
int sqlite3BtreeEof(BtCursor*);
int sqlite3BtreePrevious(BtCursor*, int flags);
i64 sqlite3BtreeIntegerKey(BtCursor*);
void sqlite3BtreeCursorPin(BtCursor*);
void sqlite3BtreeCursorUnpin(BtCursor*);
#ifdef SQLITE_ENABLE_OFFSET_SQL_FUNC
i64 sqlite3BtreeOffset(BtCursor*);
#endif
int sqlite3BtreePayload(BtCursor*, u32 offset, u32 amt, void*);
const void *sqlite3BtreePayloadFetch(BtCursor*, u32 *pAmt);
u32 sqlite3BtreePayloadSize(BtCursor*);
sqlite3_int64 sqlite3BtreeMaxRecordSize(BtCursor*);
char *sqlite3BtreeIntegrityCheck(sqlite3*,Btree*,Pgno *aRoot,int nRoot,int,int*);
struct Pager *sqlite3BtreePager(Btree*);
i64 sqlite3BtreeRowCountEst(BtCursor*);

#ifndef SQLITE_OMIT_INCRBLOB
int sqlite3BtreePayloadChecked(BtCursor*, u32 offset, u32 amt, void*);
int sqlite3BtreePutData(BtCursor*, u32 offset, u32 amt, void*);
void sqlite3BtreeIncrblobCursor(BtCursor *);
#endif
void sqlite3BtreeClearCursor(BtCursor *);
int sqlite3BtreeSetVersion(Btree *pBt, int iVersion);
int sqlite3BtreeCursorHasHint(BtCursor*, unsigned int mask);
int sqlite3BtreeIsReadonly(Btree *pBt);
int sqlite3HeaderSizeBtree(void);

#ifdef SQLITE_DEBUG
sqlite3_uint64 sqlite3BtreeSeekCount(Btree*);
#else
# define sqlite3BtreeSeekCount(X) 0
#endif

#ifndef NDEBUG
int sqlite3BtreeCursorIsValid(BtCursor*);
#endif
int sqlite3BtreeCursorIsValidNN(BtCursor*);

int sqlite3BtreeCount(sqlite3*, BtCursor*, i64*);

#ifdef SQLITE_TEST
int sqlite3BtreeCursorInfo(BtCursor*, int*, int);
void sqlite3BtreeCursorList(Btree*);
#endif

#ifndef SQLITE_OMIT_WAL
  int sqlite3BtreeCheckpoint(Btree*, int, int *, int *);
#endif

int sqlite3BtreeTransferRow(BtCursor*, BtCursor*, i64);
/*
** If we are not using shared cache, then there is no need to
** use mutexes to access the BtShared structures.  So make the
** Enter and Leave procedures no-ops.
*/
#ifndef SQLITE_OMIT_SHARED_CACHE
  void sqlite3BtreeEnter(Btree*);
  void sqlite3BtreeEnterAll(sqlite3*);
  int sqlite3BtreeSharable(Btree*);
  void sqlite3BtreeEnterCursor(BtCursor*);
  int sqlite3BtreeConnectionCount(Btree*);
#else
# define sqlite3BtreeEnter(X)
# define sqlite3BtreeEnterAll(X)
# define sqlite3BtreeSharable(X) 0
# define sqlite3BtreeEnterCursor(X)
# define sqlite3BtreeConnectionCount(X) 1
#endif

#if !defined(SQLITE_OMIT_SHARED_CACHE) && SQLITE_THREADSAFE
  void sqlite3BtreeLeave(Btree*);
  void sqlite3BtreeLeaveCursor(BtCursor*);
  void sqlite3BtreeLeaveAll(sqlite3*);
#ifndef NDEBUG
  /* These routines are used inside assert() statements only. */
  int sqlite3BtreeHoldsMutex(Btree*);
  int sqlite3BtreeHoldsAllMutexes(sqlite3*);
  int sqlite3SchemaMutexHeld(sqlite3*,int,Schema*);
#endif
#else

# define sqlite3BtreeLeave(X)
# define sqlite3BtreeLeaveCursor(X)
# define sqlite3BtreeLeaveAll(X)

# define sqlite3BtreeHoldsMutex(X) 1
# define sqlite3BtreeHoldsAllMutexes(X) 1
# define sqlite3SchemaMutexHeld(X,Y,Z) 1
#endif

#endif /* SQLITE_BTREE_H */
source/libs/tdb/src/sqliteinc/btreeInt.h (deleted, 100644 → 0)
/*
** 2004 April 6
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** This file implements an external (disk-based) database using BTrees.
** For a detailed discussion of BTrees, refer to
**
**     Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3:
**     "Sorting And Searching", pages 473-480. Addison-Wesley
**     Publishing Company, Reading, Massachusetts.
**
** The basic idea is that each page of the file contains N database
** entries and N+1 pointers to subpages.
**
**   ----------------------------------------------------------------
**   |  Ptr(0) | Key(0) | Ptr(1) | Key(1) | ... | Key(N-1) | Ptr(N) |
**   ----------------------------------------------------------------
**
** All of the keys on the page that Ptr(0) points to have values less
** than Key(0).  All of the keys on page Ptr(1) and its subpages have
** values greater than Key(0) and less than Key(1).  All of the keys
** on Ptr(N) and its subpages have values greater than Key(N-1).  And
** so forth.
**
** Finding a particular key requires reading O(log(M)) pages from the
** disk where M is the number of entries in the tree.
**
** In this implementation, a single file can hold one or more separate
** BTrees.  Each BTree is identified by the index of its root page.  The
** key and data for any entry are combined to form the "payload".  A
** fixed amount of payload can be carried directly on the database
** page.  If the payload is larger than the preset amount then surplus
** bytes are stored on overflow pages.  The payload for an entry
** and the preceding pointer are combined to form a "Cell".  Each
** page has a small header which contains the Ptr(N) pointer and other
** information such as the size of key and data.
**
** FORMAT DETAILS
**
** The file is divided into pages.  The first page is called page 1,
** the second is page 2, and so forth.  A page number of zero indicates
** "no such page".  The page size can be any power of 2 between 512 and 65536.
** Each page can be either a btree page, a freelist page, an overflow
** page, or a pointer-map page.
**
** The first page is always a btree page.  The first 100 bytes of the first
** page contain a special header (the "file header") that describes the file.
** The format of the file header is as follows:
**
**   OFFSET   SIZE    DESCRIPTION
**      0      16     Header string: "SQLite format 3\000"
**     16       2     Page size in bytes.  (1 means 65536)
**     18       1     File format write version
**     19       1     File format read version
**     20       1     Bytes of unused space at the end of each page
**     21       1     Max embedded payload fraction (must be 64)
**     22       1     Min embedded payload fraction (must be 32)
**     23       1     Min leaf payload fraction (must be 32)
**     24       4     File change counter
**     28       4     Reserved for future use
**     32       4     First freelist page
**     36       4     Number of freelist pages in the file
**     40      60     15 4-byte meta values passed to higher layers
**
**     40       4     Schema cookie
**     44       4     File format of schema layer
**     48       4     Size of page cache
**     52       4     Largest root-page (auto/incr_vacuum)
**     56       4     1=UTF-8 2=UTF16le 3=UTF16be
**     60       4     User version
**     64       4     Incremental vacuum mode
**     68       4     Application-ID
**     72      20     unused
**     92       4     The version-valid-for number
**     96       4     SQLITE_VERSION_NUMBER
**
** All of the integer values are big-endian (most significant byte first).
**
** The file change counter is incremented when the database is changed.
** This counter allows other processes to know when the file has changed
** and thus when they need to flush their cache.
**
** The max embedded payload fraction is the amount of the total usable
** space in a page that can be consumed by a single cell for standard
** B-tree (non-LEAFDATA) tables.  A value of 255 means 100%.  The default
** is to limit the maximum cell size so that at least 4 cells will fit
** on one page.  Thus the default max embedded payload fraction is 64.
**
** If the payload for a cell is larger than the max payload, then extra
** payload is spilled to overflow pages.  Once an overflow page is allocated,
** as many bytes as possible are moved into the overflow pages without letting
** the cell size drop below the min embedded payload fraction.
**
** The min leaf payload fraction is like the min embedded payload fraction
** except that it applies to leaf nodes in a LEAFDATA tree.  The maximum
** payload fraction for a LEAFDATA tree is always 100% (or 255) and it
** is not specified in the header.
**
** Each btree pages is divided into three sections: The header, the
** cell pointer array, and the cell content area. Page 1 also has a 100-byte
** file header that occurs before the page header.
**
** |----------------|
** | file header | 100 bytes. Page 1 only.
** |----------------|
** | page header | 8 bytes for leaves. 12 bytes for interior nodes
** |----------------|
** | cell pointer | | 2 bytes per cell. Sorted order.
** | array | | Grows downward
** | | v
** |----------------|
** | unallocated |
** | space |
** |----------------| ^ Grows upwards
** | cell content | | Arbitrary order interspersed with freeblocks.
** | area | | and free space fragments.
** |----------------|
**
** The page headers looks like this:
**
** OFFSET SIZE DESCRIPTION
** 0 1 Flags. 1: intkey, 2: zerodata, 4: leafdata, 8: leaf
** 1 2 byte offset to the first freeblock
** 3 2 number of cells on this page
** 5 2 first byte of the cell content area
** 7 1 number of fragmented free bytes
** 8 4 Right child (the Ptr(N) value). Omitted on leaves.
**
** The flags define the format of this btree page. The leaf flag means that
** this page has no children. The zerodata flag means that this page carries
** only keys and no data. The intkey flag means that the key is an integer
** which is stored in the key size entry of the cell header rather than in
** the payload area.
**
** The cell pointer array begins on the first byte after the page header.
** The cell pointer array contains zero or more 2-byte numbers which are
** offsets from the beginning of the page to the cell content in the cell
** content area. The cell pointers occur in sorted order. The system strives
** to keep free space after the last cell pointer so that new cells can
** be easily added without having to defragment the page.
**
** Cell content is stored at the very end of the page and grows toward the
** beginning of the page.
**
** Unused space within the cell content area is collected into a linked list of
** freeblocks. Each freeblock is at least 4 bytes in size. The byte offset
** to the first freeblock is given in the header. Freeblocks occur in
** increasing order. Because a freeblock must be at least 4 bytes in size,
** any group of 3 or fewer unused bytes in the cell content area cannot
** exist on the freeblock chain. A group of 3 or fewer free bytes is called
** a fragment. The total number of bytes in all fragments is recorded.
** in the page header at offset 7.
**
** SIZE DESCRIPTION
** 2 Byte offset of the next freeblock
** 2 Bytes in this freeblock
**
** Cells are of variable length. Cells are stored in the cell content area at
** the end of the page. Pointers to the cells are in the cell pointer array
** that immediately follows the page header. Cells is not necessarily
** contiguous or in order, but cell pointers are contiguous and in order.
**
** Cell content makes use of variable length integers. A variable
** length integer is 1 to 9 bytes where the lower 7 bits of each
** byte are used. The integer consists of all bytes that have bit 8 set and
** the first byte with bit 8 clear. The most significant byte of the integer
** appears first. A variable-length integer may not be more than 9 bytes long.
** As a special case, all 8 bytes of the 9th byte are used as data. This
** allows a 64-bit integer to be encoded in 9 bytes.
**
** 0x00 becomes 0x00000000
** 0x7f becomes 0x0000007f
** 0x81 0x00 becomes 0x00000080
** 0x82 0x00 becomes 0x00000100
** 0x80 0x7f becomes 0x0000007f
** 0x8a 0x91 0xd1 0xac 0x78 becomes 0x12345678
** 0x81 0x81 0x81 0x81 0x01 becomes 0x10204081
**
** Variable length integers are used for rowids and to hold the number of
** bytes of key and data in a btree cell.
**
** The content of a cell looks like this:
**
** SIZE DESCRIPTION
** 4 Page number of the left child. Omitted if leaf flag is set.
** var Number of bytes of data. Omitted if the zerodata flag is set.
** var Number of bytes of key. Or the key itself if intkey flag is set.
** * Payload
** 4 First page of the overflow chain. Omitted if no overflow
**
** Overflow pages form a linked list. Each page except the last is completely
** filled with data (pagesize - 4 bytes). The last page can have as little
** as 1 byte of data.
**
** SIZE DESCRIPTION
** 4 Page number of next overflow page
** * Data
**
** Freelist pages come in two subtypes: trunk pages and leaf pages. The
** file header points to the first in a linked list of trunk page. Each trunk
** page points to multiple leaf pages. The content of a leaf page is
** unspecified. A trunk page looks like this:
**
** SIZE DESCRIPTION
** 4 Page number of next trunk page
** 4 Number of leaf pointers on this page
** * zero or more pages numbers of leaves
*/
#include "sqliteInt.h"
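
/*
** A minimal decoding sketch (not part of the original file) for the
** variable-length integer format described in the header comment above:
** bit 8 set means "another byte follows"; a 9th byte, if reached,
** contributes all 8 of its bits.  Returns the number of bytes consumed.
** The u8/u64 integer typedefs are assumed to come from sqliteInt.h.
*/
static int exampleGetVarint(const u8 *p, u64 *pVal){
  u64 v = 0;
  int i;
  for(i=0; i<8; i++){
    v = (v<<7) | (u64)(p[i] & 0x7f);
    if( (p[i] & 0x80)==0 ){  /* bit 8 clear: this is the last byte */
      *pVal = v;
      return i+1;
    }
  }
  *pVal = (v<<8) | p[8];     /* 9th byte: all 8 bits are data */
  return 9;
}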
/* The following value is the maximum cell size assuming a maximum page
** size given above.
*/
#define MX_CELL_SIZE(pBt)  ((int)(pBt->pageSize-8))

/* The maximum number of cells on a single page of the database.  This
** assumes a minimum cell size of 6 bytes  (4 bytes for the cell itself
** plus 2 bytes for the index to the cell in the page header).  Such
** small cells will be rare, but they are possible.
*/
#define MX_CELL(pBt) ((pBt->pageSize-8)/6)

/* Forward declarations */
typedef struct MemPage MemPage;
typedef struct BtLock BtLock;
typedef struct CellInfo CellInfo;

/*
** This is a magic string that appears at the beginning of every
** SQLite database in order to identify the file as a real database.
**
** You can change this value at compile-time by specifying a
** -DSQLITE_FILE_HEADER="..." on the compiler command-line.  The
** header must be exactly 16 bytes including the zero-terminator so
** the string itself should be 15 characters long.  If you change
** the header, then your custom library will not be able to read
** databases generated by the standard tools and the standard tools
** will not be able to read databases created by your custom library.
*/
#ifndef SQLITE_FILE_HEADER /* 123456789 123456 */
#  define SQLITE_FILE_HEADER "SQLite format 3"
#endif
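
/*
** A minimal sketch (not part of the original file): checking the magic
** string above against the first 16 bytes of a database image.  memcmp()
** is assumed to be visible via sqliteInt.h.
*/
static int exampleIsDbHeader(const unsigned char *aPage1){
  /* 16 bytes compared, so the zero-terminator must match as well */
  return memcmp(aPage1, SQLITE_FILE_HEADER, 16)==0;
}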
/*
** Page type flags.  An ORed combination of these flags appears as the
** first byte of on-disk image of every BTree page.
*/
#define PTF_INTKEY    0x01
#define PTF_ZERODATA  0x02
#define PTF_LEAFDATA  0x04
#define PTF_LEAF      0x08

/*
** An instance of this object stores information about a single database
** page that has been loaded into memory.  The information in this object
** is derived from the raw on-disk page content.
**
** As each database page is loaded into memory, the pager allocates an
** instance of this object and zeros the first 8 bytes.  (This is the
** "extra" information associated with each page of the pager.)
**
** Access to all fields of this structure is controlled by the mutex
** stored in MemPage.pBt->mutex.
*/
struct MemPage {
  u8 isInit;           /* True if previously initialized. MUST BE FIRST! */
  u8 intKey;           /* True if table b-trees.  False for index b-trees */
  u8 intKeyLeaf;       /* True if the leaf of an intKey table */
  Pgno pgno;           /* Page number for this page */
  /* Only the first 8 bytes (above) are zeroed by pager.c when a new page
  ** is allocated. All fields that follow must be initialized before use */
  u8 leaf;             /* True if a leaf page */
  u8 hdrOffset;        /* 100 for page 1.  0 otherwise */
  u8 childPtrSize;     /* 0 if leaf==1.  4 if leaf==0 */
  u8 max1bytePayload;  /* min(maxLocal,127) */
  u8 nOverflow;        /* Number of overflow cell bodies in aCell[] */
  u16 maxLocal;        /* Copy of BtShared.maxLocal or BtShared.maxLeaf */
  u16 minLocal;        /* Copy of BtShared.minLocal or BtShared.minLeaf */
  u16 cellOffset;      /* Index in aData of first cell pointer */
  int nFree;           /* Number of free bytes on the page. -1 for unknown */
  u16 nCell;           /* Number of cells on this page, local and ovfl */
  u16 maskPage;        /* Mask for page offset */
  u16 aiOvfl[4];       /* Insert the i-th overflow cell before the aiOvfl-th
                       ** non-overflow cell */
  u8 *apOvfl[4];       /* Pointers to the body of overflow cells */
  BtShared *pBt;       /* Pointer to BtShared that this page is part of */
  u8 *aData;           /* Pointer to disk image of the page data */
  u8 *aDataEnd;        /* One byte past the end of usable data */
  u8 *aCellIdx;        /* The cell index area */
  u8 *aDataOfst;       /* Same as aData for leaves.  aData+4 for interior */
  DbPage *pDbPage;     /* Pager page handle */
  u16 (*xCellSize)(MemPage*,u8*);             /* cellSizePtr method */
  void (*xParseCell)(MemPage*,u8*,CellInfo*); /* btreeParseCell method */
};
/*
** A linked list of the following structures is stored at BtShared.pLock.
** Locks are added (or upgraded from READ_LOCK to WRITE_LOCK) when a cursor
** is opened on the table with root page BtShared.iTable. Locks are removed
** from this list when a transaction is committed or rolled back, or when
** a btree handle is closed.
*/
struct BtLock {
  Btree *pBtree;        /* Btree handle holding this lock */
  Pgno iTable;          /* Root page of table */
  u8 eLock;             /* READ_LOCK or WRITE_LOCK */
  BtLock *pNext;        /* Next in BtShared.pLock list */
};

/* Candidate values for BtLock.eLock */
#define READ_LOCK   1
#define WRITE_LOCK  2
/* A Btree handle
**
** A database connection contains a pointer to an instance of
** this object for every database file that it has open.  This structure
** is opaque to the database connection.  The database connection cannot
** see the internals of this structure and only deals with pointers to
** this structure.
**
** For some database files, the same underlying database cache might be
** shared between multiple connections.  In that case, each connection
** has its own instance of this object.  But each instance of this object
** points to the same BtShared object.  The database cache and the
** schema associated with the database file are all contained within
** the BtShared object.
**
** All fields in this structure are accessed under sqlite3.mutex.
** The pBt pointer itself may not be changed while there exists cursors
** in the referenced BtShared that point back to this Btree since those
** cursors have to go through this Btree to find their BtShared and
** they often do so without holding sqlite3.mutex.
*/
struct Btree {
  sqlite3 *db;       /* The database connection holding this btree */
  BtShared *pBt;     /* Sharable content of this btree */
  u8 inTrans;        /* TRANS_NONE, TRANS_READ or TRANS_WRITE */
  u8 sharable;       /* True if we can share pBt with another db */
  u8 locked;         /* True if db currently has pBt locked */
  u8 hasIncrblobCur; /* True if there are one or more Incrblob cursors */
  int wantToLock;    /* Number of nested calls to sqlite3BtreeEnter() */
  int nBackup;       /* Number of backup operations reading this btree */
  u32 iBDataVersion; /* Combines with pBt->pPager->iDataVersion */
  Btree *pNext;      /* List of other sharable Btrees from the same db */
  Btree *pPrev;      /* Back pointer of the same list */
#ifdef SQLITE_DEBUG
  u64 nSeek;         /* Calls to sqlite3BtreeMovetoUnpacked() */
#endif
#ifndef SQLITE_OMIT_SHARED_CACHE
  BtLock lock;       /* Object used to lock page 1 */
#endif
};

/*
** Btree.inTrans may take one of the following values.
**
** If the shared-data extension is enabled, there may be multiple users
** of the Btree structure. At most one of these may open a write transaction,
** but any number may have active read transactions.
**
** These values must match SQLITE_TXN_NONE, SQLITE_TXN_READ, and
** SQLITE_TXN_WRITE
*/
#define TRANS_NONE  0
#define TRANS_READ  1
#define TRANS_WRITE 2

#if TRANS_NONE!=SQLITE_TXN_NONE
# error wrong numeric code for no-transaction
#endif
#if TRANS_READ!=SQLITE_TXN_READ
# error wrong numeric code for read-transaction
#endif
#if TRANS_WRITE!=SQLITE_TXN_WRITE
# error wrong numeric code for write-transaction
#endif
/*
** An instance of this object represents a single database file.
**
** A single database file can be in use at the same time by two
** or more database connections.  When two or more connections are
** sharing the same database file, each connection has its own
** private Btree object for the file and each of those Btrees points
** to this one BtShared object.  BtShared.nRef is the number of
** connections currently sharing this database file.
**
** Fields in this structure are accessed under the BtShared.mutex
** mutex, except for nRef and pNext which are accessed under the
** global SQLITE_MUTEX_STATIC_MAIN mutex.  The pPager field
** may not be modified once it is initially set as long as nRef>0.
** The pSchema field may be set once under BtShared.mutex and
** thereafter is unchanged as long as nRef>0.
**
** isPending:
**
**   If a BtShared client fails to obtain a write-lock on a database
**   table (because there exists one or more read-locks on the table),
**   the shared-cache enters 'pending-lock' state and isPending is
**   set to true.
**
**   The shared-cache leaves the 'pending lock' state when either of
**   the following occur:
**
**     1) The current writer (BtShared.pWriter) concludes its transaction, OR
**     2) The number of locks held by other connections drops to zero.
**
**   While in the 'pending-lock' state, no connection may start a new
**   transaction.
**
**   This feature is included to help prevent writer-starvation.
*/
struct BtShared {
  Pager *pPager;        /* The page cache */
  sqlite3 *db;          /* Database connection currently using this Btree */
  BtCursor *pCursor;    /* A list of all open cursors */
  MemPage *pPage1;      /* First page of the database */
  u8 openFlags;         /* Flags to sqlite3BtreeOpen() */
#ifndef SQLITE_OMIT_AUTOVACUUM
  u8 autoVacuum;        /* True if auto-vacuum is enabled */
  u8 incrVacuum;        /* True if incr-vacuum is enabled */
  u8 bDoTruncate;       /* True to truncate db on commit */
#endif
  u8 inTransaction;     /* Transaction state */
  u8 max1bytePayload;   /* Maximum first byte of cell for a 1-byte payload */
  u8 nReserveWanted;    /* Desired number of extra bytes per page */
  u16 btsFlags;         /* Boolean parameters.  See BTS_* macros below */
  u16 maxLocal;         /* Maximum local payload in non-LEAFDATA tables */
  u16 minLocal;         /* Minimum local payload in non-LEAFDATA tables */
  u16 maxLeaf;          /* Maximum local payload in a LEAFDATA table */
  u16 minLeaf;          /* Minimum local payload in a LEAFDATA table */
  u32 pageSize;         /* Total number of bytes on a page */
  u32 usableSize;       /* Number of usable bytes on each page */
  int nTransaction;     /* Number of open transactions (read + write) */
  u32 nPage;            /* Number of pages in the database */
  void *pSchema;        /* Pointer to space allocated by sqlite3BtreeSchema() */
  void (*xFreeSchema)(void*);  /* Destructor for BtShared.pSchema */
  sqlite3_mutex *mutex; /* Non-recursive mutex required to access this object */
  Bitvec *pHasContent;  /* Set of pages moved to free-list this transaction */
#ifndef SQLITE_OMIT_SHARED_CACHE
  int nRef;             /* Number of references to this structure */
  BtShared *pNext;      /* Next on a list of sharable BtShared structs */
  BtLock *pLock;        /* List of locks held on this shared-btree struct */
  Btree *pWriter;       /* Btree with currently open write transaction */
#endif
  u8 *pTmpSpace;        /* Temp space sufficient to hold a single cell */
  int nPreformatSize;   /* Size of last cell written by TransferRow() */
};

/*
** Allowed values for BtShared.btsFlags
*/
#define BTS_READ_ONLY        0x0001   /* Underlying file is readonly */
#define BTS_PAGESIZE_FIXED   0x0002   /* Page size can no longer be changed */
#define BTS_SECURE_DELETE    0x0004   /* PRAGMA secure_delete is enabled */
#define BTS_OVERWRITE        0x0008   /* Overwrite deleted content with zeros */
#define BTS_FAST_SECURE      0x000c   /* Combination of the previous two */
#define BTS_INITIALLY_EMPTY  0x0010   /* Database was empty at trans start */
#define BTS_NO_WAL           0x0020   /* Do not open write-ahead-log files */
#define BTS_EXCLUSIVE        0x0040   /* pWriter has an exclusive lock */
#define BTS_PENDING          0x0080   /* Waiting for read-locks to clear */
/*
** An instance of the following structure is used to hold information
** about a cell.  The parseCellPtr() function fills in this structure
** based on information extracted from the raw disk page.
*/
struct CellInfo {
  i64 nKey;      /* The key for INTKEY tables, or nPayload otherwise */
  u8 *pPayload;  /* Pointer to the start of payload */
  u32 nPayload;  /* Bytes of payload */
  u16 nLocal;    /* Amount of payload held locally, not on overflow */
  u16 nSize;     /* Size of the cell content on the main b-tree page */
};
/*
** Maximum depth of an SQLite B-Tree structure. Any B-Tree deeper than
** this will be declared corrupt. This value is calculated based on a
** maximum database size of 2^31 pages, a minimum fanout of 2 for a
** root-node and 3 for all other internal nodes.
**
** If a tree that appears to be taller than this is encountered, it is
** assumed that the database is corrupt.
*/
#define BTCURSOR_MAX_DEPTH 20

/*
** A cursor is a pointer to a particular entry within a particular
** b-tree within a database file.
**
** The entry is identified by its MemPage and the index in
** MemPage.aCell[] of the entry.
**
** A single database file can be shared by two or more database connections,
** but cursors cannot be shared.  Each cursor is associated with a
** particular database connection identified by BtCursor.pBtree.db.
**
** Fields in this structure are accessed under the BtShared.mutex
** found at self->pBt->mutex.
**
** skipNext meaning:
** The meaning of skipNext depends on the value of eState:
**
**   eState            Meaning of skipNext
**   VALID             skipNext is meaningless and is ignored
**   INVALID           skipNext is meaningless and is ignored
**   SKIPNEXT          sqlite3BtreeNext() is a no-op if skipNext>0 and
**                     sqlite3BtreePrevious() is a no-op if skipNext<0.
**   REQUIRESEEK       restoreCursorPosition() restores the cursor to
**                     eState=SKIPNEXT if skipNext!=0
**   FAULT             skipNext holds the cursor fault error code.
*/
struct BtCursor {
  u8 eState;          /* One of the CURSOR_XXX constants (see below) */
  u8 curFlags;        /* zero or more BTCF_* flags defined below */
  u8 curPagerFlags;   /* Flags to send to sqlite3PagerGet() */
  u8 hints;           /* As configured by CursorSetHints() */
  int skipNext;       /* Prev() is noop if negative. Next() is noop if positive.
                      ** Error code if eState==CURSOR_FAULT */
  Btree *pBtree;      /* The Btree to which this cursor belongs */
  Pgno *aOverflow;    /* Cache of overflow page locations */
  void *pKey;         /* Saved key identifying the cursor's last known position */
  /* All fields above are zeroed when the cursor is allocated.  See
  ** sqlite3BtreeCursorZero().  Fields that follow must be manually
  ** initialized. */
#define BTCURSOR_FIRST_UNINIT pBt   /* Name of first uninitialized field */
  BtShared *pBt;      /* The BtShared this cursor points to */
  BtCursor *pNext;    /* Forms a linked list of all cursors */
  CellInfo info;      /* A parse of the cell we are pointing at */
  i64 nKey;           /* Size of pKey, or last integer key */
  Pgno pgnoRoot;      /* The root page of this tree */
  i8 iPage;           /* Index of current page in apPage */
  u8 curIntKey;       /* Value of apPage[0]->intKey */
  u16 ix;             /* Current index for apPage[iPage] */
  u16 aiIdx[BTCURSOR_MAX_DEPTH-1];        /* Current index in apPage[i] */
  struct KeyInfo *pKeyInfo;               /* Arg passed to comparison function */
  MemPage *pPage;                         /* Current page */
  MemPage *apPage[BTCURSOR_MAX_DEPTH-1];  /* Stack of parents of current page */
};

/*
** Legal values for BtCursor.curFlags
*/
#define BTCF_WriteFlag  0x01   /* True if a write cursor */
#define BTCF_ValidNKey  0x02   /* True if info.nKey is valid */
#define BTCF_ValidOvfl  0x04   /* True if aOverflow is valid */
#define BTCF_AtLast     0x08   /* Cursor is pointing at the last entry */
#define BTCF_Incrblob   0x10   /* True if an incremental I/O handle */
#define BTCF_Multiple   0x20   /* Maybe another cursor on the same btree */
#define BTCF_Pinned     0x40   /* Cursor is busy and cannot be moved */

/*
** Potential values for BtCursor.eState.
**
** CURSOR_INVALID:
**   Cursor does not point to a valid entry. This can happen (for example)
**   because the table is empty or because BtreeCursorFirst() has not been
**   called.
**
** CURSOR_VALID:
**   Cursor points to a valid entry. getPayload() etc. may be called.
**
** CURSOR_SKIPNEXT:
**   Cursor is valid except that the Cursor.skipNext field is non-zero
**   indicating that the next sqlite3BtreeNext() or sqlite3BtreePrevious()
**   operation should be a no-op.
**
** CURSOR_REQUIRESEEK:
**   The table that this cursor was opened on still exists, but has been
**   modified since the cursor was last used. The cursor position is saved
**   in variables BtCursor.pKey and BtCursor.nKey. When a cursor is in
**   this state, restoreCursorPosition() can be called to attempt to
**   seek the cursor to the saved position.
**
** CURSOR_FAULT:
**   An unrecoverable error (an I/O error or a malloc failure) has occurred
**   on a different connection that shares the BtShared cache with this
**   cursor.  The error has left the cache in an inconsistent state.
**   Do nothing else with this cursor.  Any attempt to use the cursor
**   should return the error code stored in BtCursor.skipNext
*/
#define CURSOR_VALID       0
#define CURSOR_INVALID     1
#define CURSOR_SKIPNEXT    2
#define CURSOR_REQUIRESEEK 3
#define CURSOR_FAULT       4
/*
** The database page the PENDING_BYTE occupies. This page is never used.
*/
# define PENDING_BYTE_PAGE(pBt) PAGER_MJ_PGNO(pBt)

/*
** These macros define the location of the pointer-map entry for a
** database page. The first argument to each is the number of usable
** bytes on each page of the database (often 1024). The second is the
** page number to look up in the pointer map.
**
** PTRMAP_PAGENO returns the database page number of the pointer-map
** page that stores the required pointer. PTRMAP_PTROFFSET returns
** the offset of the requested map entry.
**
** If the pgno argument passed to PTRMAP_PAGENO is a pointer-map page,
** then pgno is returned. So (pgno==PTRMAP_PAGENO(pgsz, pgno)) can be
** used to test if pgno is a pointer-map page. PTRMAP_ISPAGE implements
** this test.
*/
#define PTRMAP_PAGENO(pBt, pgno) ptrmapPageno(pBt, pgno)
#define PTRMAP_PTROFFSET(pgptrmap, pgno) (5*(pgno-pgptrmap-1))
#define PTRMAP_ISPAGE(pBt, pgno) (PTRMAP_PAGENO((pBt),(pgno))==(pgno))
/*
** The pointer map is a lookup table that identifies the parent page for
** each child page in the database file.  The parent page is the page that
** contains a pointer to the child.  Every page in the database contains
** 0 or 1 parent pages.  (In this context 'database page' refers
** to any page that is not part of the pointer map itself.)  Each pointer map
** entry consists of a single byte 'type' and a 4 byte parent page number.
** The PTRMAP_XXX identifiers below are the valid types.
**
** The purpose of the pointer map is to facilitate moving pages from one
** position in the file to another as part of autovacuum.  When a page
** is moved, the pointer in its parent must be updated to point to the
** new location.  The pointer map is used to locate the parent page quickly.
**
** PTRMAP_ROOTPAGE: The database page is a root-page. The page-number is not
**                  used in this case.
**
** PTRMAP_FREEPAGE: The database page is an unused (free) page. The page-number
**                  is not used in this case.
**
** PTRMAP_OVERFLOW1: The database page is the first page in a list of
**                   overflow pages. The page number identifies the page that
**                   contains the cell with a pointer to this overflow page.
**
** PTRMAP_OVERFLOW2: The database page is the second or later page in a list of
**                   overflow pages. The page-number identifies the previous
**                   page in the overflow page list.
**
** PTRMAP_BTREE: The database page is a non-root btree page. The page number
**               identifies the parent page in the btree.
*/
#define PTRMAP_ROOTPAGE 1
#define PTRMAP_FREEPAGE 2
#define PTRMAP_OVERFLOW1 3
#define PTRMAP_OVERFLOW2 4
#define PTRMAP_BTREE 5
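
/*
** A minimal sketch (not part of the original header): decoding one 5-byte
** pointer-map entry (1 type byte + a 4-byte parent page number) from the
** raw image of a ptrmap page, using the PTRMAP_PTROFFSET macro above.
** get4byte is defined later in this file.
*/
static void readPtrmapEntry(
  const u8 *aData,  /* Raw content of pointer-map page iPtrmap */
  Pgno iPtrmap,     /* Page number of the ptrmap page itself */
  Pgno pgno,        /* Page whose entry is wanted (pgno > iPtrmap) */
  u8 *pType,        /* OUT: one of the PTRMAP_* codes above */
  Pgno *pParent     /* OUT: parent page number */
){
  int offset = PTRMAP_PTROFFSET(iPtrmap, pgno);
  *pType = aData[offset];
  *pParent = get4byte(&aData[offset+1]);
}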
/* A bunch of assert() statements to check the transaction state variables
** of handle p (type Btree*) are internally consistent.
*/
#define btreeIntegrity(p) \
  assert( p->pBt->inTransaction!=TRANS_NONE || p->pBt->nTransaction==0 ); \
  assert( p->pBt->inTransaction>=p->inTrans );

/*
** The ISAUTOVACUUM macro is used within balance_nonroot() to determine
** if the database supports auto-vacuum or not. Because it is used
** within an expression that is an argument to another macro
** (sqliteMallocRaw), it is not possible to use conditional compilation.
** So, this macro is defined instead.
*/
#ifndef SQLITE_OMIT_AUTOVACUUM
#define ISAUTOVACUUM (pBt->autoVacuum)
#else
#define ISAUTOVACUUM 0
#endif
/*
** This structure is passed around through all the sanity checking routines
** in order to keep track of some global state information.
**
** The aRef[] array is allocated so that there is 1 bit for each page in
** the database. As the integrity-check proceeds, for each page used in
** the database the corresponding bit is set. This allows integrity-check to
** detect pages that are used twice and orphaned pages (both of which
** indicate corruption).
*/
typedef struct IntegrityCk IntegrityCk;
struct IntegrityCk {
  BtShared *pBt;    /* The tree being checked out */
  Pager *pPager;    /* The associated pager.  Also accessible by pBt->pPager */
  u8 *aPgRef;       /* 1 bit per page in the db (see above) */
  Pgno nPage;       /* Number of pages in the database */
  int mxErr;        /* Stop accumulating errors when this reaches zero */
  int nErr;         /* Number of messages written to zErrMsg so far */
  int bOomFault;    /* A memory allocation error has occurred */
  const char *zPfx; /* Error message prefix */
  Pgno v1;          /* Value for first %u substitution in zPfx */
  int v2;           /* Value for second %d substitution in zPfx */
  StrAccum errMsg;  /* Accumulate the error message text here */
  u32 *heap;        /* Min-heap used for analyzing cell coverage */
  sqlite3 *db;      /* Database connection running the check */
};
/*
** Routines to read or write two- and four-byte big-endian integer values.
*/
#define get2byte(x)   ((x)[0]<<8 | (x)[1])
#define put2byte(p,v) ((p)[0] = (u8)((v)>>8), (p)[1] = (u8)(v))
#define get4byte sqlite3Get4byte
#define put4byte sqlite3Put4byte
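
/*
** A minimal sketch (not part of the original header): using get2byte() to
** read two fields of a b-tree page header (layout documented at the top of
** this file).  hdrOffset is 100 for page 1 and 0 otherwise.
*/
static void readPageHeaderExample(const u8 *aData, int hdrOffset,
                                  int *pnCell, int *piContent){
  *pnCell    = get2byte(&aData[hdrOffset+3]);  /* number of cells */
  *piContent = get2byte(&aData[hdrOffset+5]);  /* start of cell content area */
}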
/*
** get2byteAligned(), unlike get2byte(), requires that its argument point to a
** two-byte aligned address.  get2byteAligned() is only used for accessing the
** cell addresses in a btree header.
*/
#if SQLITE_BYTEORDER==4321
# define get2byteAligned(x)  (*(u16*)(x))
#elif SQLITE_BYTEORDER==1234 && GCC_VERSION>=4008000
# define get2byteAligned(x)  __builtin_bswap16(*(u16*)(x))
#elif SQLITE_BYTEORDER==1234 && MSVC_VERSION>=1300
# define get2byteAligned(x)  _byteswap_ushort(*(u16*)(x))
#else
# define get2byteAligned(x)  ((x)[0]<<8 | (x)[1])
#endif
source/libs/tdb/src/sqliteinc/pager.h (deleted, 100644 → 0)
/*
** 2001 September 15
**
** The author disclaims copyright to this source code.  In place of
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** This header file defines the interface to the sqlite page cache
** subsystem.  The page cache subsystem reads and writes a file a page
** at a time and provides a journal for rollback.
*/
#ifndef SQLITE_PAGER_H
#define SQLITE_PAGER_H

/*
** Default maximum size for persistent journal files. A negative
** value means no limit. This value may be overridden using the
** sqlite3PagerJournalSizeLimit() API. See also "PRAGMA journal_size_limit".
*/
#ifndef SQLITE_DEFAULT_JOURNAL_SIZE_LIMIT
  #define SQLITE_DEFAULT_JOURNAL_SIZE_LIMIT -1
#endif

/*
** The type used to represent a page number.  The first page in a file
** is called page 1.  0 is used to represent "not a page".
*/
typedef u32 Pgno;

/*
** Each open file is managed by a separate instance of the "Pager" structure.
*/
typedef struct Pager Pager;

/*
** Handle type for pages.
*/
typedef struct PgHdr DbPage;
// /*
// ** Page number PAGER_MJ_PGNO is never used in an SQLite database (it is
// ** reserved for working around a windows/posix incompatibility). It is
// ** used in the journal to signify that the remainder of the journal file
// ** is devoted to storing a super-journal name - there are no more pages to
// ** roll back. See comments for function writeSuperJournal() in pager.c
// ** for details.
// */
// #define PAGER_MJ_PGNO(x) ((Pgno)((PENDING_BYTE/((x)->pageSize))+1))
/*
** Allowed values for the flags parameter to sqlite3PagerOpen().
**
** NOTE: These values must match the corresponding BTREE_ values in btree.h.
*/
#define PAGER_OMIT_JOURNAL 0x0001
/* Do not use a rollback journal */
#define PAGER_MEMORY 0x0002
/* In-memory database */
/*
** Valid values for the second argument to sqlite3PagerLockingMode().
*/
#define PAGER_LOCKINGMODE_QUERY -1
#define PAGER_LOCKINGMODE_NORMAL 0
#define PAGER_LOCKINGMODE_EXCLUSIVE 1
/*
** Numeric constants that encode the journalmode.
**
** The numeric values encoded here (other than PAGER_JOURNALMODE_QUERY)
** are exposed in the API via the "PRAGMA journal_mode" command and
** therefore cannot be changed without a compatibility break.
*/
#define PAGER_JOURNALMODE_QUERY (-1)
/* Query the value of journalmode */
#define PAGER_JOURNALMODE_DELETE 0
/* Commit by deleting journal file */
#define PAGER_JOURNALMODE_PERSIST 1
/* Commit by zeroing journal header */
#define PAGER_JOURNALMODE_OFF 2
/* Journal omitted. */
#define PAGER_JOURNALMODE_TRUNCATE 3
/* Commit by truncating journal */
#define PAGER_JOURNALMODE_MEMORY 4
/* In-memory journal file */
#define PAGER_JOURNALMODE_WAL 5
/* Use write-ahead logging */
/*
** Flags that make up the mask passed to sqlite3PagerGet().
*/
#define PAGER_GET_NOCONTENT 0x01
/* Do not load data from disk */
#define PAGER_GET_READONLY 0x02
/* Read-only page is acceptable */
/*
** Flags for sqlite3PagerSetFlags()
**
** Value constraints (enforced via assert()):
** PAGER_FULLFSYNC == SQLITE_FullFSync
** PAGER_CKPT_FULLFSYNC == SQLITE_CkptFullFSync
** PAGER_CACHE_SPILL == SQLITE_CacheSpill
*/
#define PAGER_SYNCHRONOUS_OFF     0x01  /* PRAGMA synchronous=OFF */
#define PAGER_SYNCHRONOUS_NORMAL  0x02  /* PRAGMA synchronous=NORMAL */
#define PAGER_SYNCHRONOUS_FULL    0x03  /* PRAGMA synchronous=FULL */
#define PAGER_SYNCHRONOUS_EXTRA   0x04  /* PRAGMA synchronous=EXTRA */
#define PAGER_SYNCHRONOUS_MASK    0x07  /* Mask for four values above */
#define PAGER_FULLFSYNC           0x08  /* PRAGMA fullfsync=ON */
#define PAGER_CKPT_FULLFSYNC      0x10  /* PRAGMA checkpoint_fullfsync=ON */
#define PAGER_CACHESPILL          0x20  /* PRAGMA cache_spill=ON */
#define PAGER_FLAGS_MASK          0x38  /* All above except SYNCHRONOUS */
/*
** The remainder of this file contains the declarations of the functions
** that make up the Pager sub-system API. See source code comments for
** a detailed description of each routine.
*/
/* Open and close a Pager connection. */
int sqlite3PagerOpen(
  Pager **ppPager,
  const char*,
  int,
  int,
  int,
  void(*)(DbPage*)
);
int sqlite3PagerClose(Pager *pPager, sqlite3*);
// int sqlite3PagerReadFileheader(Pager*, int, unsigned char*);
// /* Functions used to configure a Pager object. */
// void sqlite3PagerSetBusyHandler(Pager*, int(*)(void *), void *);
// int sqlite3PagerSetPagesize(Pager*, u32*, int);
// Pgno sqlite3PagerMaxPageCount(Pager*, Pgno);
// void sqlite3PagerSetCachesize(Pager*, int);
// int sqlite3PagerSetSpillsize(Pager*, int);
// void sqlite3PagerSetMmapLimit(Pager *, sqlite3_int64);
// void sqlite3PagerShrink(Pager*);
// void sqlite3PagerSetFlags(Pager*,unsigned);
// int sqlite3PagerLockingMode(Pager *, int);
// int sqlite3PagerSetJournalMode(Pager *, int);
// int sqlite3PagerGetJournalMode(Pager*);
// int sqlite3PagerOkToChangeJournalMode(Pager*);
// i64 sqlite3PagerJournalSizeLimit(Pager *, i64);
// sqlite3_backup **sqlite3PagerBackupPtr(Pager*);
// int sqlite3PagerFlush(Pager*);
// /* Functions used to obtain and release page references. */
// int sqlite3PagerGet(Pager *pPager, Pgno pgno, DbPage **ppPage, int clrFlag);
// DbPage *sqlite3PagerLookup(Pager *pPager, Pgno pgno);
// void sqlite3PagerRef(DbPage*);
// void sqlite3PagerUnref(DbPage*);
// void sqlite3PagerUnrefNotNull(DbPage*);
// void sqlite3PagerUnrefPageOne(DbPage*);
// /* Operations on page references. */
// int sqlite3PagerWrite(DbPage*);
// void sqlite3PagerDontWrite(DbPage*);
// int sqlite3PagerMovepage(Pager*,DbPage*,Pgno,int);
// int sqlite3PagerPageRefcount(DbPage*);
// void *sqlite3PagerGetData(DbPage *);
// void *sqlite3PagerGetExtra(DbPage *);
// /* Functions used to manage pager transactions and savepoints. */
// void sqlite3PagerPagecount(Pager*, int*);
// int sqlite3PagerBegin(Pager*, int exFlag, int);
// int sqlite3PagerCommitPhaseOne(Pager*,const char *zSuper, int);
// int sqlite3PagerExclusiveLock(Pager*);
// int sqlite3PagerSync(Pager *pPager, const char *zSuper);
// int sqlite3PagerCommitPhaseTwo(Pager*);
// int sqlite3PagerRollback(Pager*);
// int sqlite3PagerOpenSavepoint(Pager *pPager, int n);
// int sqlite3PagerSavepoint(Pager *pPager, int op, int iSavepoint);
// int sqlite3PagerSharedLock(Pager *pPager);
// #ifndef SQLITE_OMIT_WAL
// int sqlite3PagerCheckpoint(Pager *pPager, sqlite3*, int, int*, int*);
// int sqlite3PagerWalSupported(Pager *pPager);
// int sqlite3PagerWalCallback(Pager *pPager);
// int sqlite3PagerOpenWal(Pager *pPager, int *pisOpen);
// int sqlite3PagerCloseWal(Pager *pPager, sqlite3*);
// # ifdef SQLITE_ENABLE_SNAPSHOT
// int sqlite3PagerSnapshotGet(Pager*, sqlite3_snapshot **ppSnapshot);
// int sqlite3PagerSnapshotOpen(Pager*, sqlite3_snapshot *pSnapshot);
// int sqlite3PagerSnapshotRecover(Pager *pPager);
// int sqlite3PagerSnapshotCheck(Pager *pPager, sqlite3_snapshot *pSnapshot);
// void sqlite3PagerSnapshotUnlock(Pager *pPager);
// # endif
// #endif
// #if !defined(SQLITE_OMIT_WAL) && defined(SQLITE_ENABLE_SETLK_TIMEOUT)
// int sqlite3PagerWalWriteLock(Pager*, int);
// void sqlite3PagerWalDb(Pager*, sqlite3*);
// #else
// # define sqlite3PagerWalWriteLock(y,z) SQLITE_OK
// # define sqlite3PagerWalDb(x,y)
// #endif
// #ifdef SQLITE_DIRECT_OVERFLOW_READ
// int sqlite3PagerDirectReadOk(Pager *pPager, Pgno pgno);
// #endif
// #ifdef SQLITE_ENABLE_ZIPVFS
// int sqlite3PagerWalFramesize(Pager *pPager);
// #endif
// /* Functions used to query pager state and configuration. */
// u8 sqlite3PagerIsreadonly(Pager*);
// u32 sqlite3PagerDataVersion(Pager*);
// #ifdef SQLITE_DEBUG
// int sqlite3PagerRefcount(Pager*);
// #endif
// int sqlite3PagerMemUsed(Pager*);
// const char *sqlite3PagerFilename(const Pager*, int);
// sqlite3_vfs *sqlite3PagerVfs(Pager*);
// sqlite3_file *sqlite3PagerFile(Pager*);
// sqlite3_file *sqlite3PagerJrnlFile(Pager*);
// const char *sqlite3PagerJournalname(Pager*);
// void *sqlite3PagerTempSpace(Pager*);
// int sqlite3PagerIsMemdb(Pager*);
// void sqlite3PagerCacheStat(Pager *, int, int, int *);
// void sqlite3PagerClearCache(Pager*);
// int sqlite3SectorSize(sqlite3_file *);
// /* Functions used to truncate the database file. */
// void sqlite3PagerTruncateImage(Pager*,Pgno);
// void sqlite3PagerRekey(DbPage*, Pgno, u16);
// /* Functions to support testing and debugging. */
// #if !defined(NDEBUG) || defined(SQLITE_TEST)
// Pgno sqlite3PagerPagenumber(DbPage*);
// int sqlite3PagerIswriteable(DbPage*);
// #endif
// #ifdef SQLITE_TEST
// int *sqlite3PagerStats(Pager*);
// void sqlite3PagerRefdump(Pager*);
// void disable_simulated_io_errors(void);
// void enable_simulated_io_errors(void);
// #else
// # define disable_simulated_io_errors()
// # define enable_simulated_io_errors()
// #endif
#endif /* SQLITE_PAGER_H */
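A minimal usage sketch of the two live entry points above (not part of the original header). The parameter meanings are assumed to follow upstream SQLite — filename, bytes of per-page extra space, PAGER_* flags, VFS flags, and a page-reinit callback — and the file name "test.db" is illustrative only:

#include "sqliteInt.h"

static void pageReinit(DbPage *pPg){
  (void)pPg;  /* nothing to re-initialize in this sketch */
}

int openAndClosePager(sqlite3 *db){
  Pager *pPager = 0;
  int rc = sqlite3PagerOpen(&pPager, "test.db", 0, 0, 0, pageReinit);
  if( rc!=SQLITE_OK ) return rc;
  /* ... acquire and modify pages here ... */
  return sqlite3PagerClose(pPager, db);
}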
source/libs/tdb/src/sqliteinc/pcache.h
deleted 100644 → 0
/*
** 2008 August 05
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
** This header file defines the interface to the SQLite page cache
** subsystem.
*/
#ifndef _PCACHE_H_
#define _PCACHE_H_

typedef struct PgHdr PgHdr;
typedef struct PCache PCache;
/*
** Every page in the cache is controlled by an instance of the following
** structure.
*/
struct PgHdr {
  sqlite3_pcache_page *pPage;   /* Pcache object page handle */
  void *pData;                  /* Page data */
  void *pExtra;                 /* Extra content */
  PCache *pCache;               /* PRIVATE: Cache that owns this page */
  PgHdr *pDirty;                /* Transient list of dirty pages sorted by pgno */
  Pager *pPager;                /* The pager this page is part of */
  Pgno pgno;                    /* Page number for this page */
#ifdef SQLITE_CHECK_PAGES
  u32 pageHash;                 /* Hash of page content */
#endif
  u16 flags;                    /* PGHDR flags defined below */

  /**********************************************************************
  ** Elements above, except pCache, are public.  All that follow are
  ** private to pcache.c and should not be accessed by other modules.
  ** pCache is grouped with the public elements for efficiency.
  */
  i16 nRef;                     /* Number of users of this page */
  PgHdr *pDirtyNext;            /* Next element in list of dirty pages */
  PgHdr *pDirtyPrev;            /* Previous element in list of dirty pages */
                                /* NB: pDirtyNext and pDirtyPrev are undefined
                                ** if the PgHdr object is not dirty */
};
/* Bit values for PgHdr.flags */
#define PGHDR_CLEAN        0x001  /* Page not on the PCache.pDirty list */
#define PGHDR_DIRTY        0x002  /* Page is on the PCache.pDirty list */
#define PGHDR_WRITEABLE    0x004  /* Journaled and ready to modify */
#define PGHDR_NEED_SYNC    0x008  /* Fsync the rollback journal before
                                  ** writing this page to the database */
#define PGHDR_DONT_WRITE   0x010  /* Do not write content to disk */
#define PGHDR_MMAP         0x020  /* This is an mmap page object */
#define PGHDR_WAL_APPEND   0x040  /* Appended to wal file */
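As a small illustration (not from the header), the flags field is meant to be tested with bitwise masks; for example, a page that must be journal-synced before it can be written back is both dirty and marked PGHDR_NEED_SYNC:

static int pageNeedsJournalSync(const PgHdr *pPg){
  const u16 m = PGHDR_DIRTY|PGHDR_NEED_SYNC;
  return (pPg->flags & m)==m;   /* both bits must be set */
}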
/* Initialize and shutdown the page cache subsystem */
int sqlite3PcacheInitialize(void);
void sqlite3PcacheShutdown(void);

/* Page cache buffer management:
** These routines implement SQLITE_CONFIG_PAGECACHE.
*/
void sqlite3PCacheBufferSetup(void*, int sz, int n);

/* Create a new pager cache.
** Under memory stress, invoke xStress to try to make pages clean.
** Only clean and unpinned pages can be reclaimed.
*/
int sqlite3PcacheOpen(
  int szPage,                      /* Size of every page */
  int szExtra,                     /* Extra space associated with each page */
  int bPurgeable,                  /* True if pages are on backing store */
  int (*xStress)(void*, PgHdr*),   /* Call to try to make pages clean */
  void *pStress,                   /* Argument to xStress */
  PCache *pToInit                  /* Preallocated space for the PCache */
);
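A hedged sketch (not from the header) of allocating and opening a cache; the no-op stress callback and the 4096-byte page with 8 bytes of extra space are illustrative choices, and the PCache is sized with sqlite3PcacheSize() as the declarations above suggest:

#include <stdlib.h>   /* for malloc()/free() */

static int noStress(void *pArg, PgHdr *pPg){
  (void)pArg; (void)pPg;
  return SQLITE_OK;             /* decline to spill any pages */
}

int openCache(PCache **ppCache){
  PCache *p = (PCache *)malloc(sqlite3PcacheSize());
  if( p==0 ) return SQLITE_NOMEM;
  int rc = sqlite3PcacheOpen(4096, 8, 1, noStress, 0, p);
  if( rc!=SQLITE_OK ){ free(p); }else{ *ppCache = p; }
  return rc;
}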
/* Modify the page-size after the cache has been created. */
int sqlite3PcacheSetPageSize(PCache*, int);

/* Return the size in bytes of a PCache object.  Used to preallocate
** storage space.
*/
int sqlite3PcacheSize(void);

/* One release per successful fetch.  Page is pinned until released.
** Reference counted.
*/
sqlite3_pcache_page *sqlite3PcacheFetch(PCache*, Pgno, int createFlag);
int sqlite3PcacheFetchStress(PCache*, Pgno, sqlite3_pcache_page**);
PgHdr *sqlite3PcacheFetchFinish(PCache*, Pgno, sqlite3_pcache_page *pPage);
void sqlite3PcacheRelease(PgHdr*);
void sqlite3PcacheDrop(PgHdr*);          /* Remove page from cache */
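The three fetch routines form a two-step protocol. A sketch (not from the header) of the expected call sequence, falling back to the stress path when the plain fetch fails under memory pressure:

int fetchPage(PCache *pCache, Pgno pgno, PgHdr **ppPg){
  sqlite3_pcache_page *pBase;
  int rc = SQLITE_OK;
  pBase = sqlite3PcacheFetch(pCache, pgno, 1);
  if( pBase==0 ){
    rc = sqlite3PcacheFetchStress(pCache, pgno, &pBase);
    if( pBase==0 ) return rc ? rc : SQLITE_NOMEM;
  }
  *ppPg = sqlite3PcacheFetchFinish(pCache, pgno, pBase);
  return SQLITE_OK;
}

Each successful fetch pins the page; the caller is expected to balance it with one sqlite3PcacheRelease().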
void sqlite3PcacheMakeDirty(PgHdr*);    /* Make sure page is marked dirty */
void sqlite3PcacheMakeClean(PgHdr*);    /* Mark a single page as clean */
void sqlite3PcacheCleanAll(PCache*);    /* Mark all dirty list pages as clean */
void sqlite3PcacheClearWritable(PCache*);

/* Change a page number.  Used by incr-vacuum. */
void sqlite3PcacheMove(PgHdr*, Pgno);

/* Remove all pages with pgno>x.  Reset the cache if x==0 */
void sqlite3PcacheTruncate(PCache*, Pgno x);

/* Get a list of all dirty pages in the cache, sorted by page number */
PgHdr *sqlite3PcacheDirtyList(PCache*);

/* Reset and close the cache object */
void sqlite3PcacheClose(PCache*);

/* Clear flags from pages of the page cache */
void sqlite3PcacheClearSyncFlags(PCache*);

/* Discard the contents of the cache */
void sqlite3PcacheClear(PCache*);

/* Return the total number of outstanding page references */
int sqlite3PcacheRefCount(PCache*);

/* Increment the reference count of an existing page */
void sqlite3PcacheRef(PgHdr*);

int sqlite3PcachePageRefcount(PgHdr*);

/* Return the total number of pages stored in the cache */
int sqlite3PcachePagecount(PCache*);
#if defined(SQLITE_CHECK_PAGES) || defined(SQLITE_DEBUG)
/* Iterate through all dirty pages currently stored in the cache. This
** interface is only available if SQLITE_CHECK_PAGES is defined when the
** library is built.
*/
void sqlite3PcacheIterateDirty(PCache *pCache, void (*xIter)(PgHdr *));
#endif
#if defined(SQLITE_DEBUG)
/* Check invariants on a PgHdr object */
int sqlite3PcachePageSanity(PgHdr*);
#endif
/* Set and get the suggested cache-size for the specified pager-cache.
**
** If no global maximum is configured, then the system attempts to limit
** the total number of pages cached by purgeable pager-caches to the sum
** of the suggested cache-sizes.
*/
void sqlite3PcacheSetCachesize(PCache*, int);
#ifdef SQLITE_TEST
int sqlite3PcacheGetCachesize(PCache*);
#endif

/* Set or get the suggested spill-size for the specified pager-cache.
**
** The spill-size is the minimum number of pages in cache before the cache
** will attempt to spill dirty pages by calling xStress.
*/
int sqlite3PcacheSetSpillsize(PCache*, int);

/* Free up as much memory as possible from the page cache */
void sqlite3PcacheShrink(PCache*);

#ifdef SQLITE_ENABLE_MEMORY_MANAGEMENT
/* Try to return memory used by the pcache module to the main memory heap */
int sqlite3PcacheReleaseMemory(int);
#endif

#ifdef SQLITE_TEST
void sqlite3PcacheStats(int*, int*, int*, int*);
#endif

void sqlite3PCacheSetDefault(void);

/* Return the header size */
int sqlite3HeaderSizePcache(void);
int sqlite3HeaderSizePcache1(void);

/* Number of dirty pages as a percentage of the configured cache size */
int sqlite3PCachePercentDirty(PCache*);

#ifdef SQLITE_DIRECT_OVERFLOW_READ
int sqlite3PCacheIsDirty(PCache *pCache);
#endif
// For real implementation of sqlite3_pcache ========================================
typedef struct sqlite3_pcache sqlite3_pcache;

typedef struct sqlite3_pcache_methods2 {
  int iVersion;
  void *pArg;
  int (*xInit)(void*);
  void (*xShutdown)(void*);
  sqlite3_pcache *(*xCreate)(int szPage, int szExtra, int bPurgeable);
  void (*xCachesize)(sqlite3_pcache*, int nCachesize);
  int (*xPagecount)(sqlite3_pcache*);
  sqlite3_pcache_page *(*xFetch)(sqlite3_pcache*, unsigned key, int createFlag);
  void (*xUnpin)(sqlite3_pcache*, sqlite3_pcache_page*, int discard);
  void (*xRekey)(sqlite3_pcache*, sqlite3_pcache_page*, unsigned oldKey, unsigned newKey);
  void (*xTruncate)(sqlite3_pcache*, unsigned iLimit);
  void (*xDestroy)(sqlite3_pcache*);
  void (*xShrink)(sqlite3_pcache*);
} sqlite3_pcache_methods2;

extern sqlite3_pcache_methods2 pcache2;
#endif /* _PCACHE_H_ */
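A sketch (not from the header) of driving a cache through this virtual method table; it assumes pcache1.c supplies the global pcache2 object declared above and that xInit is safe to call at least once before xCreate:

sqlite3_pcache *createDefaultCache(int szPage, int szExtra){
  if( pcache2.xInit(pcache2.pArg)!=SQLITE_OK ) return 0;
  return pcache2.xCreate(szPage, szExtra, 1 /* purgeable */);
}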
source/libs/tdb/src/sqliteinc/sqlite3.h
deleted 100644 → 0
/*
** 2001-09-15
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
** This header file defines the interface that the SQLite library
** presents to client programs. If a C-function, structure, datatype,
** or constant definition does not appear in this file, then it is
** not a published API of SQLite, is subject to change without
** notice, and should not be referenced by programs that use SQLite.
**
** Some of the definitions that are in this file are marked as
** "experimental". Experimental interfaces are normally new
** features recently added to SQLite. We do not anticipate changes
** to experimental interfaces but reserve the right to make minor changes
** if experience from use "in the wild" suggests such changes are prudent.
**
** The official C-language API documentation for SQLite is derived
** from comments in this file. This file is the authoritative source
** on how SQLite interfaces are supposed to operate.
**
** The name of this file under configuration management is "sqlite.h.in".
** The makefile makes some minor changes to this file (such as inserting
** the version number) and changes its name to "sqlite3.h" as
** part of the build process.
*/
#ifndef SQLITE3_H
#define SQLITE3_H
#include <stdarg.h>   /* Needed for the definition of va_list */
/*
** Make sure we can call this stuff from C++.
*/
#ifdef __cplusplus
extern "C" {
#endif
/*
** CAPI3REF: Result Codes
** KEYWORDS: {result code definitions}
**
** Many SQLite functions return an integer result code from the set shown
** here in order to indicate success or failure.
**
** New error codes may be added in future versions of SQLite.
**
** See also: [extended result code definitions]
*/
#define SQLITE_OK           0   /* Successful result */
/* beginning-of-error-codes */
#define SQLITE_ERROR        1   /* Generic error */
#define SQLITE_INTERNAL     2   /* Internal logic error in SQLite */
#define SQLITE_PERM         3   /* Access permission denied */
#define SQLITE_ABORT        4   /* Callback routine requested an abort */
#define SQLITE_BUSY         5   /* The database file is locked */
#define SQLITE_LOCKED       6   /* A table in the database is locked */
#define SQLITE_NOMEM        7   /* A malloc() failed */
#define SQLITE_READONLY     8   /* Attempt to write a readonly database */
#define SQLITE_INTERRUPT    9   /* Operation terminated by sqlite3_interrupt()*/
#define SQLITE_IOERR       10   /* Some kind of disk I/O error occurred */
#define SQLITE_CORRUPT     11   /* The database disk image is malformed */
#define SQLITE_NOTFOUND    12   /* Unknown opcode in sqlite3_file_control() */
#define SQLITE_FULL        13   /* Insertion failed because database is full */
#define SQLITE_CANTOPEN    14   /* Unable to open the database file */
#define SQLITE_PROTOCOL    15   /* Database lock protocol error */
#define SQLITE_EMPTY       16   /* Internal use only */
#define SQLITE_SCHEMA      17   /* The database schema changed */
#define SQLITE_TOOBIG      18   /* String or BLOB exceeds size limit */
#define SQLITE_CONSTRAINT  19   /* Abort due to constraint violation */
#define SQLITE_MISMATCH    20   /* Data type mismatch */
#define SQLITE_MISUSE      21   /* Library used incorrectly */
#define SQLITE_NOLFS       22   /* Uses OS features not supported on host */
#define SQLITE_AUTH        23   /* Authorization denied */
#define SQLITE_FORMAT      24   /* Not used */
#define SQLITE_RANGE       25   /* 2nd parameter to sqlite3_bind out of range */
#define SQLITE_NOTADB      26   /* File opened that is not a database file */
#define SQLITE_NOTICE      27   /* Notifications from sqlite3_log() */
#define SQLITE_WARNING     28   /* Warnings from sqlite3_log() */
#define SQLITE_ROW         100  /* sqlite3_step() has another row ready */
#define SQLITE_DONE        101  /* sqlite3_step() has finished executing */
/* end-of-error-codes */
#ifdef __cplusplus
}  /* end of the 'extern "C"' block */
#endif

#endif /* SQLITE3_H */
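For illustration only (not part of the header): SQLITE_ROW and SQLITE_DONE are terminal codes rather than failures, so error checks typically exclude all three of the codes below:

static int rcIsError(int rc){
  return rc!=SQLITE_OK && rc!=SQLITE_ROW && rc!=SQLITE_DONE;
}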
source/libs/tdb/src/sqliteinc/sqliteInt.h
deleted 100644 → 0
/*
** 2001 September 15
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
** Internal interface definitions for SQLite.
**
*/
#include <assert.h>
#include <fcntl.h>
#include <pthread.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#ifndef SQLITEINT_H
#define SQLITEINT_H
#include "sqlite3.h"
typedef int8_t   i8;
typedef int16_t  i16;
typedef int32_t  i32;
typedef int64_t  i64;
typedef uint8_t  u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;
typedef struct sqlite3_pcache_page {
  void *pBuf;     /* The content of the page */
  void *pExtra;   /* Extra information associated with the page */
} sqlite3_pcache_page;
#define ROUNDDOWN8(x) ((x) & ~7)
#define ROUND8(x) (((x) + 7) & ~7)
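A standalone sanity check of the two alignment macros (example code, with the macros copied from above so it compiles on its own):

#include <assert.h>

#define ROUNDDOWN8(x) ((x) & ~7)
#define ROUND8(x) (((x) + 7) & ~7)

int main(void){
  assert(ROUND8(13)==16 && ROUND8(16)==16);        /* round up to a multiple of 8 */
  assert(ROUNDDOWN8(13)==8 && ROUNDDOWN8(16)==16); /* round down to a multiple of 8 */
  return 0;
}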
typedef struct sqlite3_vfs sqlite3_vfs;
typedef struct sqlite3 sqlite3;
#define SQLITE_DEFAULT_PAGE_SIZE 4096
#include "pager.h"
#include "pcache.h"
#endif /* SQLITEINT_H */
source/libs/tdb/src/sqliteinc/wal.h
deleted 100644 → 0
/*
** 2010 February 1
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
*************************************************************************
** This header file defines the interface to the write-ahead logging
** system. Refer to the comments below and the header comment attached to
** the implementation of each function in wal.c for further details.
*/
#ifndef SQLITE_WAL_H
#define SQLITE_WAL_H
#include "sqliteInt.h"
/* Macros for extracting appropriate sync flags for either transaction
** commits (WAL_SYNC_FLAGS(X)) or for checkpoint ops (CKPT_SYNC_FLAGS(X)):
*/
#define WAL_SYNC_FLAGS(X) ((X)&0x03)
#define CKPT_SYNC_FLAGS(X) (((X)>>2)&0x03)
#ifdef SQLITE_OMIT_WAL
# define sqlite3WalOpen(x,y,z) 0
# define sqlite3WalLimit(x,y)
# define sqlite3WalClose(v,w,x,y,z) 0
# define sqlite3WalBeginReadTransaction(y,z) 0
# define sqlite3WalEndReadTransaction(z)
# define sqlite3WalDbsize(y) 0
# define sqlite3WalBeginWriteTransaction(y) 0
# define sqlite3WalEndWriteTransaction(x) 0
# define sqlite3WalUndo(x,y,z) 0
# define sqlite3WalSavepoint(y,z)
# define sqlite3WalSavepointUndo(y,z) 0
# define sqlite3WalFrames(u,v,w,x,y,z) 0
# define sqlite3WalCheckpoint(q,r,s,t,u,v,w,x,y,z) 0
# define sqlite3WalCallback(z) 0
# define sqlite3WalExclusiveMode(y,z) 0
# define sqlite3WalHeapMemory(z) 0
# define sqlite3WalFramesize(z) 0
# define sqlite3WalFindFrame(x,y,z) 0
# define sqlite3WalFile(x) 0
#else
#define WAL_SAVEPOINT_NDATA 4
/* Connection to a write-ahead log (WAL) file.
** There is one object of this type for each pager.
*/
typedef struct Wal Wal;
/* Open and close a connection to a write-ahead log. */
int sqlite3WalOpen(sqlite3_vfs*, sqlite3_file*, const char*, int, i64, Wal**);
int sqlite3WalClose(Wal *pWal, sqlite3*, int sync_flags, int, u8*);

/* Set the limiting size of a WAL file. */
void sqlite3WalLimit(Wal*, i64);
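A hedged open-and-limit sketch (not from the header). The trailing arguments of sqlite3WalOpen are assumed, as in upstream SQLite, to be a no-shared-memory flag and a maximum WAL size; the file name and the 4096-byte limit are illustrative only:

int openWal(sqlite3_vfs *pVfs, sqlite3_file *pDbFd, Wal **ppWal){
  int rc = sqlite3WalOpen(pVfs, pDbFd, "test.db-wal", 0, -1, ppWal);
  if( rc==SQLITE_OK ){
    sqlite3WalLimit(*ppWal, 4096);  /* shrink the WAL toward 4KB after reset */
  }
  return rc;
}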
/* Used by readers to open (lock) and close (unlock) a snapshot.  A
** snapshot is like a read-transaction.  It is the state of the database
** at an instant in time.  sqlite3WalBeginReadTransaction() gets a read lock
** and preserves the current state even if other threads or processes
** write to or checkpoint the WAL.  sqlite3WalEndReadTransaction() closes
** the transaction and releases the lock.
*/
int sqlite3WalBeginReadTransaction(Wal *pWal, int*);
void sqlite3WalEndReadTransaction(Wal *pWal);

/* Read a page from the write-ahead log, if it is present. */
int sqlite3WalFindFrame(Wal*, Pgno, u32*);
int sqlite3WalReadFrame(Wal*, u32, int, u8*);

/* If the WAL is not empty, return the size of the database. */
Pgno sqlite3WalDbsize(Wal *pWal);
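A read-path sketch (not from the header): a reader first asks whether the WAL holds a newer copy of the page and only then reads that frame; otherwise it would fall back to the database file:

int readPageFromWal(Wal *pWal, Pgno pgno, int szPage, u8 *aBuf, int *pInWal){
  u32 iFrame = 0;
  int rc = sqlite3WalFindFrame(pWal, pgno, &iFrame);
  *pInWal = (rc==SQLITE_OK && iFrame!=0);
  if( *pInWal ) rc = sqlite3WalReadFrame(pWal, iFrame, szPage, aBuf);
  return rc;
}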
/* Obtain or release the WRITER lock. */
int sqlite3WalBeginWriteTransaction(Wal *pWal);
int sqlite3WalEndWriteTransaction(Wal *pWal);

/* Undo any frames written (but not committed) to the log */
int sqlite3WalUndo(Wal *pWal, int (*xUndo)(void*, Pgno), void *pUndoCtx);

/* Return an integer that records the current (uncommitted) write
** position in the WAL */
void sqlite3WalSavepoint(Wal *pWal, u32 *aWalData);

/* Move the write position of the WAL back to iFrame.  Called in
** response to a ROLLBACK TO command. */
int sqlite3WalSavepointUndo(Wal *pWal, u32 *aWalData);

/* Write a frame or frames to the log. */
int sqlite3WalFrames(Wal *pWal, int, PgHdr*, Pgno, int, int);
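A commit-path sketch (not from the header), assuming the unnamed sqlite3WalFrames parameters keep their upstream SQLite meaning: page size, dirty-page list, database size in pages after commit, commit flag, and sync flags:

int commitToWal(Wal *pWal, int szPage, PgHdr *pList, Pgno nTruncate){
  int rc = sqlite3WalBeginWriteTransaction(pWal);
  if( rc==SQLITE_OK ){
    rc = sqlite3WalFrames(pWal, szPage, pList, nTruncate, 1, 0);
    sqlite3WalEndWriteTransaction(pWal);
  }
  return rc;
}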
/* Copy pages from the log to the database file */
int sqlite3WalCheckpoint(
  Wal *pWal,                     /* Write-ahead log connection */
  sqlite3 *db,                   /* Check this handle's interrupt flag */
  int eMode,                     /* One of PASSIVE, FULL and RESTART */
  int (*xBusy)(void*),           /* Function to call when busy */
  void *pBusyArg,                /* Context argument for xBusyHandler */
  int sync_flags,                /* Flags to sync db file with (or 0) */
  int nBuf,                      /* Size of buffer nBuf */
  u8 *zBuf,                      /* Temporary buffer to use */
  int *pnLog,                    /* OUT: Number of frames in WAL */
  int *pnCkpt                    /* OUT: Number of backfilled frames in WAL */
);
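A passive-checkpoint sketch (not from the header). The eMode constants are not declared in this trimmed tree, so the upstream value 0 (SQLITE_CHECKPOINT_PASSIVE) is used literally; zBuf must be at least one page in size:

int passiveCheckpoint(Wal *pWal, sqlite3 *db, int szPage, u8 *zBuf){
  int nLog = 0, nCkpt = 0;
  return sqlite3WalCheckpoint(pWal, db, 0 /* passive */, 0, 0,
                              0, szPage, zBuf, &nLog, &nCkpt);
}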
/* Return the value to pass to a sqlite3_wal_hook callback, the
** number of frames in the WAL at the point of the last commit since
** sqlite3WalCallback() was called. If no commits have occurred since
** the last call, then return 0.
*/
int sqlite3WalCallback(Wal *pWal);

/* Tell the wal layer that an EXCLUSIVE lock has been obtained (or released)
** by the pager layer on the database file.
*/
int sqlite3WalExclusiveMode(Wal *pWal, int op);

/* Return true if the argument is non-NULL and the WAL module is using
** heap-memory for the wal-index. Otherwise, if the argument is NULL or the
** WAL module is using shared-memory, return false.
*/
int sqlite3WalHeapMemory(Wal *pWal);

#ifdef SQLITE_ENABLE_SNAPSHOT
int sqlite3WalSnapshotGet(Wal *pWal, sqlite3_snapshot **ppSnapshot);
void sqlite3WalSnapshotOpen(Wal *pWal, sqlite3_snapshot *pSnapshot);
int sqlite3WalSnapshotRecover(Wal *pWal);
int sqlite3WalSnapshotCheck(Wal *pWal, sqlite3_snapshot *pSnapshot);
void sqlite3WalSnapshotUnlock(Wal *pWal);
#endif

#ifdef SQLITE_ENABLE_ZIPVFS
/* If the WAL file is not empty, return the number of bytes of content
** stored in each frame (i.e. the db page-size when the WAL was created).
*/
int sqlite3WalFramesize(Wal *pWal);
#endif

/* Return the sqlite3_file object for the WAL file */
sqlite3_file *sqlite3WalFile(Wal *pWal);

#ifdef SQLITE_ENABLE_SETLK_TIMEOUT
int sqlite3WalWriteLock(Wal *pWal, int bLock);
void sqlite3WalDb(Wal *pWal, sqlite3 *db);
#endif

#endif /* ifndef SQLITE_OMIT_WAL */
#endif /* SQLITE_WAL_H */