tdbInt.h 13.2 KB
Newer Older
H
more  
Hongze Cheng 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
/*
 * Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
 *
 * This program is free software: you can use, redistribute, and/or modify
 * it under the terms of the GNU Affero General Public License, version 3
 * or later ("AGPL"), as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

H
Hongze Cheng 已提交
16 17
#ifndef _TD_TDB_INTERNAL_H_
#define _TD_TDB_INTERNAL_H_
H
Hongze Cheng 已提交
18

H
Hongze Cheng 已提交
19
#include "tdb.h"
H
refact  
Hongze Cheng 已提交
20

H
Hongze Cheng 已提交
21
#include "tlog.h"
22
#include "trbtree.h"
H
Hongze Cheng 已提交
23

H
more  
Hongze Cheng 已提交
24 25 26 27
#ifdef __cplusplus
extern "C" {
#endif

H
Hongze Cheng 已提交
28 29 30 31 32 33 34 35 36 37 38
// clang-format off
// Bitmask selecting which TDB log levels are emitted; only declared here.
extern int32_t tdbDebugFlag;

// Level-gated logging helpers. Each macro tests its DEBUG_* bit in tdbDebugFlag and,
// when the bit is set, forwards its variadic arguments to taosPrintLog() with a "TDB ..." prefix.
// FATAL/ERROR/WARN/INFO pass the constant 255 as the third argument; DEBUG/TRACE pass tdbDebugFlag.
// The do { ... } while(0) wrapper makes each macro usable as a single statement.
#define tdbFatal(...) do { if (tdbDebugFlag & DEBUG_FATAL) { taosPrintLog("TDB FATAL ", DEBUG_FATAL, 255, __VA_ARGS__); }}     while(0)
#define tdbError(...) do { if (tdbDebugFlag & DEBUG_ERROR) { taosPrintLog("TDB ERROR ", DEBUG_ERROR, 255, __VA_ARGS__); }}     while(0)
#define tdbWarn(...)  do { if (tdbDebugFlag & DEBUG_WARN)  { taosPrintLog("TDB WARN ", DEBUG_WARN, 255, __VA_ARGS__); }}       while(0)
#define tdbInfo(...)  do { if (tdbDebugFlag & DEBUG_INFO)  { taosPrintLog("TDB ", DEBUG_INFO, 255, __VA_ARGS__); }}            while(0)
#define tdbDebug(...) do { if (tdbDebugFlag & DEBUG_DEBUG) { taosPrintLog("TDB ", DEBUG_DEBUG, tdbDebugFlag, __VA_ARGS__); }} while(0)
#define tdbTrace(...) do { if (tdbDebugFlag & DEBUG_TRACE) { taosPrintLog("TDB ", DEBUG_TRACE, tdbDebugFlag, __VA_ARGS__); }} while(0)
// clang-format on

H
more  
Hongze Cheng 已提交
39 40 41 42 43 44 45 46 47
// Short aliases for the fixed-width integer types, used throughout this header.
typedef int8_t   i8;
typedef int16_t  i16;
typedef int32_t  i32;
typedef int64_t  i64;
typedef uint8_t  u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;

H
Hongze Cheng 已提交
48 49 50 51 52 53
// SPgno: page number type, an unsigned 32-bit integer.
typedef u32 SPgno;

#include "tdbOs.h"
#include "tdbUtil.h"

H
Hongze Cheng 已提交
54 55 56 57 58
// 24-bit unsigned integer stored in three bytes. p must be u8 *.
// Byte layout shared by both macros:
//   p[0] = bits 16..23, p[1] = bits 0..7, p[2] = bits 8..15.
//
// TDB_GET_U24 reads the three bytes individually, so it is valid for any
// alignment of p and gives the same result on every host byte order.
// The result has type int. p is evaluated more than once, so pass an
// expression without side effects.
#define TDB_GET_U24(p) (((p)[0] << 16) | ((p)[2] << 8) | (p)[1])

// TDB_PUT_U24 stores the low 24 bits of v at p. v is evaluated exactly once.
// The temporary has a reserved-looking name so that a caller argument called
// `tv` (or similar) cannot be captured by the macro's own local.
#define TDB_PUT_U24(p, v)                    \
  do {                                       \
    int tdbPutU24Tmp_ = (v);                 \
    (p)[1] = tdbPutU24Tmp_ & 0xff;           \
    (p)[2] = (tdbPutU24Tmp_ >> 8) & 0xff;    \
    (p)[0] = (tdbPutU24Tmp_ >> 16) & 0xff;   \
  } while (0)

H
Hongze Cheng 已提交
64
// fileid
H
Hongze Cheng 已提交
65
#define TDB_FILE_ID_LEN 24
H
Hongze Cheng 已提交
66

H
Hongze Cheng 已提交
67
// SPgid
H
Hongze Cheng 已提交
68
typedef struct {
H
Hongze Cheng 已提交
69
  uint8_t fileid[TDB_FILE_ID_LEN];
H
more  
Hongze Cheng 已提交
70
  SPgno   pgno;
H
Hongze Cheng 已提交
71
} SPgid;
H
Hongze Cheng 已提交
72

H
Hongze Cheng 已提交
73
// pgsz_t
H
Hongze Cheng 已提交
74 75
#define TDB_MIN_PGSIZE       512       // 512B
#define TDB_MAX_PGSIZE       16777216  // 16M
#define TDB_DEFAULT_PGSIZE   4096
// Non-zero when s lies in [TDB_MIN_PGSIZE, TDB_MAX_PGSIZE] (bounds inclusive).
// Only the range is checked. s is evaluated twice.
#define TDB_IS_PGSIZE_VLD(s) (((s) >= TDB_MIN_PGSIZE) && ((s) <= TDB_MAX_PGSIZE))

H
Hongze Cheng 已提交
79 80 81
// dbname
#define TDB_MAX_DBNAME_LEN 24

H
Hongze Cheng 已提交
82
#define TDB_VARIANT_LEN ((int)-1)
H
Hongze Cheng 已提交
83

H
Hongze Cheng 已提交
84
#define TDB_JOURNAL_NAME "tdb.journal"
H
Hongze Cheng 已提交
85

H
Hongze Cheng 已提交
86
#define TDB_FILENAME_LEN 128
H
Hongze Cheng 已提交
87

H
Hongze Cheng 已提交
88 89
#define BTREE_MAX_DEPTH 20

H
Hongze Cheng 已提交
90
// Flag-word helpers. None of them assigns; ADD/REMOVE yield the new value.
//   TDB_FLAG_IS     - flags equals flag exactly
//   TDB_FLAG_HAS    - at least one bit of flag is set in flags
//   TDB_FLAG_NO     - no bit of flag is set in flags
//   TDB_FLAG_ADD    - flags with the bits of flag set
//   TDB_FLAG_REMOVE - flags with the bits of flag cleared
#define TDB_FLAG_IS(flags, flag)     ((flags) == (flag))
#define TDB_FLAG_HAS(flags, flag)    (((flags) & (flag)) != 0)
// `==` binds tighter than `&`, so the mask has to be parenthesized before the comparison.
#define TDB_FLAG_NO(flags, flag)     (((flags) & (flag)) == 0)
#define TDB_FLAG_ADD(flags, flag)    ((flags) | (flag))
#define TDB_FLAG_REMOVE(flags, flag) ((flags) & (~(flag)))
H
Hongze Cheng 已提交
95

H
refact  
Hongze Cheng 已提交
96 97
typedef struct SPager  SPager;
typedef struct SPCache SPCache;
H
Hongze Cheng 已提交
98
typedef struct SPage   SPage;
H
refact  
Hongze Cheng 已提交
99

H
Hongze Cheng 已提交
100
// transaction
H
Hongze Cheng 已提交
101

H
Hongze Cheng 已提交
102 103 104 105
// Transaction flag tests. PTXN is a pointer to an object with a `flags` member.
// TDB_TXN_IS_WRITE and TDB_TXN_IS_READ_UNCOMMITTED yield the masked bit (not a
// normalized 0/1); TDB_TXN_IS_READ is the logical negation of TDB_TXN_IS_WRITE.
#define TDB_TXN_IS_WRITE(PTXN)            ((PTXN)->flags & TDB_TXN_WRITE)
#define TDB_TXN_IS_READ(PTXN)             (!TDB_TXN_IS_WRITE(PTXN))
#define TDB_TXN_IS_READ_UNCOMMITTED(PTXN) ((PTXN)->flags & TDB_TXN_READ_UNCOMMITTED)

H
Hongze Cheng 已提交
106
// tdbEnv.c ====================================
H
Hongze Cheng 已提交
107 108 109
// Pager bookkeeping on a TDB environment.
// NOTE(review): presumably add/remove maintain the environment's pager list and hash,
// and tdbEnvGetPager looks a pager up by file name — confirm in tdbEnv.c.
void    tdbEnvAddPager(TDB *pEnv, SPager *pPager);
void    tdbEnvRemovePager(TDB *pEnv, SPager *pPager);
SPager *tdbEnvGetPager(TDB *pEnv, const char *fname);
H
Hongze Cheng 已提交
110 111 112 113 114 115 116 117 118 119

// tdbBtree.c ====================================
// SBTree is opaque in this header; struct SBTC is defined further down.
typedef struct SBTree SBTree;
typedef struct SBTC   SBTC;
// Summary record for a B-tree.
// NOTE(review): presumably root page number, number of levels and number of entries — confirm in tdbBtree.c.
typedef struct SBtInfo {
  SPgno root;
  int   nLevel;
  int   nData;
} SBtInfo;

120 121 122 123 124 125 126 127 128 129 130
// Bit values stored in the `freeKV` member of a cell decoder.
#define TDB_CELLD_F_NIL 0x0
#define TDB_CELLD_F_KEY 0x1
#define TDB_CELLD_F_VAL 0x2

// Setters: SET_FREE_NIL clears freeKV; SET_FREE_KEY / SET_FREE_VAL OR in the respective bit.
// NOTE(review): the bits presumably mark key/value buffers the decoder must free — confirm in tdbBtree.c.
#define TDB_CELLDECODER_SET_FREE_NIL(pCellDecoder) ((pCellDecoder)->freeKV = TDB_CELLD_F_NIL)
#define TDB_CELLDECODER_SET_FREE_KEY(pCellDecoder) ((pCellDecoder)->freeKV |= TDB_CELLD_F_KEY)
#define TDB_CELLDECODER_SET_FREE_VAL(pCellDecoder) ((pCellDecoder)->freeKV |= TDB_CELLD_F_VAL)

// Tests: non-zero when the respective bit is set in freeKV.
#define TDB_CELLDECODER_FREE_KEY(pCellDecoder) ((pCellDecoder)->freeKV & TDB_CELLD_F_KEY)
#define TDB_CELLDECODER_FREE_VAL(pCellDecoder) ((pCellDecoder)->freeKV & TDB_CELLD_F_VAL)

H
Hongze Cheng 已提交
131
typedef struct {
132 133 134 135 136 137 138
  int   kLen;
  u8   *pKey;
  int   vLen;
  u8   *pVal;
  SPgno pgno;
  u8   *pBuf;
  u8    freeKV;
H
Hongze Cheng 已提交
139 140
} SCellDecoder;

H
Hongze Cheng 已提交
141
struct SBTC {
H
Hongze Cheng 已提交
142 143 144 145 146 147 148 149
  SBTree      *pBt;
  i8           iPage;
  SPage       *pPage;
  int          idx;
  int          idxStack[BTREE_MAX_DEPTH + 1];
  SPage       *pgStack[BTREE_MAX_DEPTH + 1];
  SCellDecoder coder;
  TXN         *pTxn;
150
  i8           freeTxn;
H
Hongze Cheng 已提交
151 152 153
};

// SBTree
154
int tdbBtreeOpen(int keyLen, int valLen, SPager *pFile, char const *tbname, SPgno pgno, tdb_cmpr_fn_t kcmpr, TDB *pEnv,
155
                 SBTree **ppBt);
H
Hongze Cheng 已提交
156 157
int tdbBtreeClose(SBTree *pBt);
int tdbBtreeInsert(SBTree *pBt, const void *pKey, int kLen, const void *pVal, int vLen, TXN *pTxn);
H
Hongze Cheng 已提交
158
int tdbBtreeDelete(SBTree *pBt, const void *pKey, int kLen, TXN *pTxn);
H
Hongze Cheng 已提交
159
int tdbBtreeUpsert(SBTree *pBt, const void *pKey, int nKey, const void *pData, int nData, TXN *pTxn);
H
Hongze Cheng 已提交
160 161 162
int tdbBtreeGet(SBTree *pBt, const void *pKey, int kLen, void **ppVal, int *vLen);
int tdbBtreePGet(SBTree *pBt, const void *pKey, int kLen, void **ppKey, int *pkLen, void **ppVal, int *vLen);

163 164 165 166 167 168 169
// Argument bundle for tdbBtreeInitPage (passed through its `void *arg` parameter — TODO confirm).
typedef struct {
  u8      flags;
  SBTree *pBt;
} SBtreeInitPageArg;

// Page initializer; its signature matches the `initPage` callback taken by tdbPagerFetchPage.
int tdbBtreeInitPage(SPage *pPage, void *arg, int init);

H
Hongze Cheng 已提交
170 171
// SBTC
// Cursor API over an SBTree. Semantics are not visible in this header; see tdbBtree.c.
int tdbBtcOpen(SBTC *pBtc, SBTree *pBt, TXN *pTxn);
int tdbBtcClose(SBTC *pBtc);
int tdbBtcIsValid(SBTC *pBtc);
int tdbBtcMoveTo(SBTC *pBtc, const void *pKey, int kLen, int *pCRst);
int tdbBtcMoveToFirst(SBTC *pBtc);
int tdbBtcMoveToLast(SBTC *pBtc);
int tdbBtcMoveToNext(SBTC *pBtc);
int tdbBtcMoveToPrev(SBTC *pBtc);
int tdbBtreeNext(SBTC *pBtc, void **ppKey, int *kLen, void **ppVal, int *vLen);
int tdbBtreePrev(SBTC *pBtc, void **ppKey, int *kLen, void **ppVal, int *vLen);
int tdbBtcGet(SBTC *pBtc, const void **ppKey, int *kLen, const void **ppVal, int *vLen);
int tdbBtcDelete(SBTC *pBtc);
int tdbBtcUpsert(SBTC *pBtc, const void *pKey, int kLen, const void *pData, int nData, int insert);
H
Hongze Cheng 已提交
184 185 186 187 188

// tdbPager.c ====================================

int  tdbPagerOpen(SPCache *pCache, const char *fileName, SPager **ppPager);
int  tdbPagerClose(SPager *pPager);
189
int  tdbPagerOpenDB(SPager *pPager, SPgno *ppgno, bool toCreate, SBTree *pBt);
H
Hongze Cheng 已提交
190 191 192
int  tdbPagerWrite(SPager *pPager, SPage *pPage);
int  tdbPagerBegin(SPager *pPager, TXN *pTxn);
int  tdbPagerCommit(SPager *pPager, TXN *pTxn);
193
int  tdbPagerPostCommit(SPager *pPager, TXN *pTxn);
194
int  tdbPagerPrepareAsyncCommit(SPager *pPager, TXN *pTxn);
195
int  tdbPagerAbort(SPager *pPager, TXN *pTxn);
H
Hongze Cheng 已提交
196 197 198 199
int  tdbPagerFetchPage(SPager *pPager, SPgno *ppgno, SPage **ppPage, int (*initPage)(SPage *, void *, int), void *arg,
                       TXN *pTxn);
void tdbPagerReturnPage(SPager *pPager, SPage *pPage, TXN *pTxn);
int  tdbPagerAllocPage(SPager *pPager, SPgno *ppgno);
200
int  tdbPagerRestoreJournals(SPager *pPager);
201
int  tdbPagerRollback(SPager *pPager);
H
Hongze Cheng 已提交
202 203

// tdbPCache.c ====================================
// Member list that struct SPage embeds for the page cache.
// The expansion already ends with ';', so the use site inside the struct
// body is written without a trailing semicolon.
// nRef is the counter updated by tdbRefPage()/tdbUnrefPage(); pgid.pgno is
// what TDB_PAGE_PGNO() reads.
#define TDB_PCACHE_PAGE    \
  u8           isAnchor;   \
  u8           isLocal;    \
  u8           isDirty;    \
  u8           isFree;     \
  volatile i32 nRef;       \
  i32          id;         \
  SPage       *pFreeNext;  \
  SPage       *pHashNext;  \
  SPage       *pLruNext;   \
  SPage       *pLruPrev;   \
  SPage       *pDirtyNext; \
  SPager      *pPager;     \
  SPgid        pgid;
H
Hongze Cheng 已提交
218 219 220 221 222

// For page ref

int    tdbPCacheOpen(int pageSize, int cacheSize, SPCache **ppCache);
int    tdbPCacheClose(SPCache *pCache);
H
Hongze Cheng 已提交
223
int    tdbPCacheAlter(SPCache *pCache, int32_t nPage);
H
Hongze Cheng 已提交
224 225
SPage *tdbPCacheFetch(SPCache *pCache, const SPgid *pPgid, TXN *pTxn);
void   tdbPCacheRelease(SPCache *pCache, SPage *pPage, TXN *pTxn);
M
Minglei Jin 已提交
226
void   tdbPCacheMarkFree(SPCache *pCache, SPage *pPage);
227
void   tdbPCacheInvalidatePage(SPCache *pCache, SPager *pPager, SPgno pgno);
H
Hongze Cheng 已提交
228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266
int    tdbPCacheGetPageSize(SPCache *pCache);

// tdbPage.c ====================================
// A cell is addressed as a pointer to its first byte.
typedef u8 SCell;

// PAGE APIS implemented
// Table of size constants and accessor callbacks for a page layout.
// struct SPage points at one of these through its pPageMethods member.
typedef struct {
  int szOffset;    // size added per cell by TDB_BYTES_CELL_TAKEN (presumably the cell-offset slot size — confirm)
  int szPageHdr;
  int szFreeCell;
  // cell number
  int (*getCellNum)(SPage *);
  void (*setCellNum)(SPage *, int);
  // cell content offset
  int (*getCellBody)(SPage *);
  void (*setCellBody)(SPage *, int);
  // first free cell offset (0 means no free cells)
  int (*getCellFree)(SPage *);
  void (*setCellFree)(SPage *, int);
  // total free bytes
  int (*getFreeBytes)(SPage *);
  void (*setFreeBytes)(SPage *, int);
  // cell offset at idx
  int (*getCellOffset)(SPage *, int);
  void (*setCellOffset)(SPage *, int, int);
  // free cell info
  void (*getFreeCellInfo)(SCell *pCell, int *szCell, int *nxOffset);
  void (*setFreeCellInfo)(SCell *pCell, int szCell, int nxOffset);
} SPageMethods;

// Pack to 1-byte alignment for the footer definition; the previous setting is restored by the pop below.
#pragma pack(push, 1)

// Page footer
typedef struct {
  u8 cksm[4];  // 4 bytes (presumably a page checksum, going by the name — confirm)
} SPageFtr;
#pragma pack(pop)

struct SPage {
267
  SRBTreeNode    node;  // must be the first field for pageCmpFn to work
H
Hongze Cheng 已提交
268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284
  tdb_spinlock_t lock;
  int            pageSize;
  u8            *pData;
  SPageMethods  *pPageMethods;
  // Fields below used by pager and am
  u8       *pPageHdr;
  u8       *pCellIdx;
  u8       *pFreeStart;
  u8       *pFreeEnd;
  SPageFtr *pPageFtr;
  int       nOverflow;
  SCell    *apOvfl[4];
  int       aiOvfl[4];
  int       kLen;  // key length of the page, -1 for unknown
  int       vLen;  // value length of the page, -1 for unknown
  int       maxLocal;
  int       minLocal;
285
  int (*xCellSize)(const SPage *, SCell *, int, TXN *pTxn, SBTree *pBt);
H
Hongze Cheng 已提交
286 287 288 289
  // Fields used by SPCache
  TDB_PCACHE_PAGE
};

H
Hongze Cheng 已提交
290 291
// Adds one reference to pPage through atomic_add_fetch_32() and returns that
// call's result (the updated count, going by the helper's add-then-fetch name).
static inline i32 tdbRefPage(SPage *pPage) {
  return atomic_add_fetch_32(&pPage->nRef, 1);
}

// Drops one reference from pPage through atomic_sub_fetch_32() and returns that
// call's result (the updated count, going by the helper's sub-then-fetch name).
static inline i32 tdbUnrefPage(SPage *pPage) {
  return atomic_sub_fetch_32(&pPage->nRef, 1);
}

// Reads the page's nRef counter through atomic_load_32().
#define tdbGetPageRef(pPage) atomic_load_32(&((pPage)->nRef))

H
Hongze Cheng 已提交
304 305 306 307 308 309
// For page lock
// Result codes returned by tdbTryLockPage().
#define P_LOCK_SUCC 0
#define P_LOCK_BUSY 1
#define P_LOCK_FAIL -1

// Makes a single tdbSpinlockTrylock() attempt on pLock and maps its result:
//   0      -> P_LOCK_SUCC
//   EBUSY  -> P_LOCK_BUSY
//   other  -> ASSERT(0); P_LOCK_FAIL is returned if execution continues past the assert.
static inline int tdbTryLockPage(tdb_spinlock_t *pLock) {
  switch (tdbSpinlockTrylock(pLock)) {
    case 0:
      return P_LOCK_SUCC;
    case EBUSY:
      return P_LOCK_BUSY;
    default:
      ASSERT(0);
      return P_LOCK_FAIL;
  }
}
H
Hongze Cheng 已提交
320

H
Hongze Cheng 已提交
321 322 323 324 325 326 327
// Page lock helpers: each forwards the address of the page's `lock` member to the
// matching tdbSpinlock* call. TDB_TRY_LOCK_PAGE goes through tdbTryLockPage() and
// therefore yields one of the P_LOCK_* codes.
#define TDB_INIT_PAGE_LOCK(pPage)    tdbSpinlockInit(&((pPage)->lock), 0)
#define TDB_DESTROY_PAGE_LOCK(pPage) tdbSpinlockDestroy(&((pPage)->lock))
#define TDB_LOCK_PAGE(pPage)         tdbSpinlockLock(&((pPage)->lock))
#define TDB_UNLOCK_PAGE(pPage)       tdbSpinlockUnlock(&((pPage)->lock))
#define TDB_TRY_LOCK_PAGE(pPage)     tdbTryLockPage(&((pPage)->lock))

// APIs
328 329 330 331 332 333 334
// Page accessors. pPage is evaluated more than once in several of these.
// TDB_PAGE_TOTAL_CELLS: overflow cells plus the count reported by getCellNum().
#define TDB_PAGE_TOTAL_CELLS(pPage) ((pPage)->nOverflow + (pPage)->pPageMethods->getCellNum(pPage))
// TDB_PAGE_USABLE_SIZE: byte distance from pCellIdx to the page footer.
#define TDB_PAGE_USABLE_SIZE(pPage) ((u8 *)(pPage)->pPageFtr - (pPage)->pCellIdx)
// TDB_PAGE_FREE_SIZE: value reported by the getFreeBytes() callback.
#define TDB_PAGE_FREE_SIZE(pPage)   (*(pPage)->pPageMethods->getFreeBytes)(pPage)
// TDB_PAGE_PGNO: page number taken from the page's pgid.
#define TDB_PAGE_PGNO(pPage)        ((pPage)->pgid.pgno)
// TDB_BYTES_CELL_TAKEN: xCellSize() of the cell (called with 0, NULL, NULL) plus szOffset.
#define TDB_BYTES_CELL_TAKEN(pPage, pCell) \
  ((*(pPage)->xCellSize)(pPage, pCell, 0, NULL, NULL) + (pPage)->pPageMethods->szOffset)
// TDB_PAGE_OFFSET_SIZE: szOffset of the page's method table.
#define TDB_PAGE_OFFSET_SIZE(pPage) ((pPage)->pPageMethods->szOffset)
H
Hongze Cheng 已提交
335 336 337

// Page API. Semantics are not visible in this header; see tdbPage.c.
// tdbPageCreate / tdbPageDestroy take the allocator and deallocator as callbacks,
// each with an opaque `arg` pointer.
int  tdbPageCreate(int pageSize, SPage **ppPage, void *(*xMalloc)(void *, size_t), void *arg);
int  tdbPageDestroy(SPage *pPage, void (*xFree)(void *arg, void *ptr), void *arg);
void tdbPageZero(SPage *pPage, u8 szAmHdr, int (*xCellSize)(const SPage *, SCell *, int, TXN *, SBTree *pBt));
void tdbPageInit(SPage *pPage, u8 szAmHdr, int (*xCellSize)(const SPage *, SCell *, int, TXN *, SBTree *pBt));
int  tdbPageInsertCell(SPage *pPage, int idx, SCell *pCell, int szCell, u8 asOvfl);
int  tdbPageDropCell(SPage *pPage, int idx, TXN *pTxn, SBTree *pBt);
int  tdbPageUpdateCell(SPage *pPage, int idx, SCell *pCell, int szCell, TXN *pTxn, SBTree *pBt);
void tdbPageCopy(SPage *pFromPage, SPage *pToPage, int copyOvflCells);
int  tdbPageCapacity(int pageSize, int amHdrSize);

// Returns a pointer to the cell at logical index idx of pPage.
// idx counts overflow cells and on-page cells together and must satisfy
// 0 <= idx < TDB_PAGE_TOTAL_CELLS(pPage); this is asserted.
// An overflow cell is returned from apOvfl[]; any other cell is located inside
// pData through the getCellOffset() callback.
static inline SCell *tdbPageGetCell(SPage *pPage, int idx) {
  ASSERT(idx >= 0 && idx < TDB_PAGE_TOTAL_CELLS(pPage));

  // aiOvfl[] holds the logical index of each overflow cell. The scan stops at
  // the first entry greater than idx (this presumes aiOvfl[] is ascending).
  int iOvfl = 0;
  for (; iOvfl < pPage->nOverflow; iOvfl++) {
    int ovflIdx = pPage->aiOvfl[iOvfl];
    if (ovflIdx == idx) {
      return pPage->apOvfl[iOvfl];
    }
    if (ovflIdx > idx) {
      break;
    }
  }

  // iOvfl overflow cells were passed before idx; subtract them to get the on-page slot.
  int lidx = idx - iOvfl;
  ASSERT(lidx >= 0 && lidx < pPage->pPageMethods->getCellNum(pPage));

  return pPage->pData + pPage->pPageMethods->getCellOffset(pPage, lidx);
}
H
Hongze Cheng 已提交
369

370 371 372 373 374 375
// Build switch: when defined, struct STDB gains a pMainDb member and struct SPager
// gains a pEnv back-pointer (see the #ifdef USE_MAINDB sections in those structs).
#define USE_MAINDB

#ifdef USE_MAINDB
// Name used for the main database (presumably its file name — confirm against the users of this macro).
#define TDB_MAINDB_NAME "main.tdb"
#endif

H
Hongze Cheng 已提交
376
struct STDB {
H
Hongze Cheng 已提交
377 378
  char    *dbName;
  char    *jnName;
H
Hongze Cheng 已提交
379 380 381 382 383 384
  int      jfd;
  SPCache *pCache;
  SPager  *pgrList;
  int      nPager;
  int      nPgrHash;
  SPager **pgrHash;
385 386 387
#ifdef USE_MAINDB
  TTB *pMainDb;
#endif
M
Minglei Jin 已提交
388
  int64_t txnId;
H
Hongze Cheng 已提交
389
};
H
Hongze Cheng 已提交
390

H
Hongze Cheng 已提交
391 392 393 394 395 396 397 398 399
struct SPager {
  char    *dbFileName;
  char    *jFileName;
  int      pageSize;
  uint8_t  fid[TDB_FILE_ID_LEN];
  tdb_fd_t fd;
  SPCache *pCache;
  SPgno    dbFileSize;
  SPgno    dbOrigSize;
400
  // SPage   *pDirty;
M
Minglei Jin 已提交
401 402 403 404 405
  SRBTree rbt;
  // u8        inTran;
  TXN    *pActiveTxn;
  SPager *pNext;      // used by TDB
  SPager *pHashNext;  // used by TDB
406 407 408
#ifdef USE_MAINDB
  TDB *pEnv;
#endif
H
Hongze Cheng 已提交
409 410
};

H
more  
Hongze Cheng 已提交
411 412 413 414
#ifdef __cplusplus
}
#endif

H
Hongze Cheng 已提交
415
#endif /*_TD_TDB_INTERNAL_H_*/