tdbBtree.c 35.2 KB
Newer Older
H
Hongze Cheng 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13
/*
 * Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
 *
 * This program is free software: you can use, redistribute, and/or modify
 * it under the terms of the GNU Affero General Public License, version 3
 * or later ("AGPL"), as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
H
refact  
Hongze Cheng 已提交
14 15
 */

H
Hongze Cheng 已提交
16 17
#include "tdbInt.h"

H
Hongze Cheng 已提交
18 19 20
#define TDB_BTREE_ROOT 0x1
#define TDB_BTREE_LEAF 0x2

H
Hongze Cheng 已提交
21
struct SBTree {
H
more  
Hongze Cheng 已提交
22 23 24
  SPgno          root;
  int            keyLen;
  int            valLen;
H
Hongze Cheng 已提交
25
  SPager        *pPager;
H
more  
Hongze Cheng 已提交
26
  FKeyComparator kcmpr;
H
Hongze Cheng 已提交
27
  int            pageSize;
H
more  
Hongze Cheng 已提交
28 29 30 31
  int            maxLocal;
  int            minLocal;
  int            maxLeaf;
  int            minLeaf;
H
Hongze Cheng 已提交
32
  void          *pBuf;
H
Hongze Cheng 已提交
33 34
};

H
Hongze Cheng 已提交
35 36
#define TDB_BTREE_PAGE_COMMON_HDR u8 flags;

H
Hongze Cheng 已提交
37
#define TDB_BTREE_PAGE_GET_FLAGS(PAGE)        (PAGE)->pData[0]
H
Hongze Cheng 已提交
38
#define TDB_BTREE_PAGE_SET_FLAGS(PAGE, flags) ((PAGE)->pData[0] = (flags))
H
Hongze Cheng 已提交
39 40
#define TDB_BTREE_PAGE_IS_ROOT(PAGE)          (TDB_BTREE_PAGE_GET_FLAGS(PAGE) & TDB_BTREE_ROOT)
#define TDB_BTREE_PAGE_IS_LEAF(PAGE)          (TDB_BTREE_PAGE_GET_FLAGS(PAGE) & TDB_BTREE_LEAF)
H
refact  
Hongze Cheng 已提交
41 42 43
#define TDB_BTREE_ASSERT_FLAG(flags)                                                 \
  ASSERT(TDB_FLAG_IS(flags, TDB_BTREE_ROOT) || TDB_FLAG_IS(flags, TDB_BTREE_LEAF) || \
         TDB_FLAG_IS(flags, TDB_BTREE_ROOT | TDB_BTREE_LEAF) || TDB_FLAG_IS(flags, 0))
H
Hongze Cheng 已提交
44 45 46 47 48

typedef struct __attribute__((__packed__)) {
  TDB_BTREE_PAGE_COMMON_HDR
} SLeafHdr;

H
Hongze Cheng 已提交
49
typedef struct __attribute__((__packed__)) {
H
Hongze Cheng 已提交
50 51 52
  TDB_BTREE_PAGE_COMMON_HDR;
  SPgno pgno;  // right-most child
} SIntHdr;
H
more  
Hongze Cheng 已提交
53

H
Hongze Cheng 已提交
54
typedef struct {
H
Hongze Cheng 已提交
55
  u8      flags;
H
Hongze Cheng 已提交
56
  SBTree *pBt;
H
Hongze Cheng 已提交
57
} SBtreeInitPageArg;
H
Hongze Cheng 已提交
58

H
Hongze Cheng 已提交
59
typedef struct {
H
Hongze Cheng 已提交
60 61 62 63 64 65
  int       kLen;
  const u8 *pKey;
  int       vLen;
  const u8 *pVal;
  SPgno     pgno;
  u8       *pBuf;
H
Hongze Cheng 已提交
66 67
} SCellDecoder;

H
refact  
Hongze Cheng 已提交
68
static int tdbBtcMoveTo(SBTC *pBtc, const void *pKey, int kLen, int *pCRst);
H
Hongze Cheng 已提交
69
static int tdbDefaultKeyCmprFn(const void *pKey1, int keyLen1, const void *pKey2, int keyLen2);
H
Hongze Cheng 已提交
70
static int tdbBtreeOpenImpl(SBTree *pBt);
H
Hongze Cheng 已提交
71 72
static int tdbBtreeZeroPage(SPage *pPage, void *arg);
static int tdbBtreeInitPage(SPage *pPage, void *arg);
H
Hongze Cheng 已提交
73 74
static int tdbBtreeEncodeCell(SPage *pPage, const void *pKey, int kLen, const void *pVal, int vLen, SCell *pCell,
                              int *szCell);
H
Hongze Cheng 已提交
75
static int tdbBtreeDecodeCell(SPage *pPage, const SCell *pCell, SCellDecoder *pDecoder);
H
Hongze Cheng 已提交
76
static int tdbBtreeBalance(SBTC *pBtc);
H
Hongze Cheng 已提交
77
static int tdbBtreeCellSize(const SPage *pPage, SCell *pCell);
H
Hongze Cheng 已提交
78
static int tdbBtcMoveToNext(SBTC *pBtc);
H
Hongze Cheng 已提交
79
static int tdbBtcMoveDownward(SBTC *pBtc);
H
Hongze Cheng 已提交
80
static int tdbBtcMoveUpward(SBTC *pBtc);
H
Hongze Cheng 已提交
81

H
Hongze Cheng 已提交
82
int tdbBtreeOpen(int keyLen, int valLen, SPager *pPager, FKeyComparator kcmpr, SBTree **ppBt) {
H
Hongze Cheng 已提交
83
  SBTree *pBt;
H
Hongze Cheng 已提交
84
  int     ret;
H
more  
Hongze Cheng 已提交
85

H
Hongze Cheng 已提交
86 87
  ASSERT(keyLen != 0);

H
Hongze Cheng 已提交
88
  *ppBt = NULL;
H
Hongze Cheng 已提交
89

H
Hongze Cheng 已提交
90
  pBt = (SBTree *)tdbOsCalloc(1, sizeof(*pBt));
H
Hongze Cheng 已提交
91 92 93 94 95
  if (pBt == NULL) {
    return -1;
  }

  // pBt->keyLen
H
Hongze Cheng 已提交
96
  pBt->keyLen = keyLen < 0 ? TDB_VARIANT_LEN : keyLen;
H
Hongze Cheng 已提交
97
  // pBt->valLen
H
Hongze Cheng 已提交
98
  pBt->valLen = valLen < 0 ? TDB_VARIANT_LEN : valLen;
H
refact  
Hongze Cheng 已提交
99 100
  // pBt->pPager
  pBt->pPager = pPager;
H
Hongze Cheng 已提交
101 102
  // pBt->kcmpr
  pBt->kcmpr = kcmpr ? kcmpr : tdbDefaultKeyCmprFn;
H
Hongze Cheng 已提交
103
  // pBt->pageSize
H
Hongze Cheng 已提交
104
  pBt->pageSize = pPager->pageSize;
H
Hongze Cheng 已提交
105
  // pBt->maxLocal
H
Hongze Cheng 已提交
106
  pBt->maxLocal = tdbPageCapacity(pBt->pageSize, sizeof(SIntHdr)) / 4;
H
Hongze Cheng 已提交
107
  // pBt->minLocal: Should not be allowed smaller than 15, which is [nPayload][nKey][nData]
H
Hongze Cheng 已提交
108
  pBt->minLocal = pBt->maxLocal / 2;
H
Hongze Cheng 已提交
109
  // pBt->maxLeaf
H
Hongze Cheng 已提交
110
  pBt->maxLeaf = tdbPageCapacity(pBt->pageSize, sizeof(SLeafHdr));
H
Hongze Cheng 已提交
111 112
  // pBt->minLeaf
  pBt->minLeaf = pBt->minLocal;
H
Hongze Cheng 已提交
113

H
Hongze Cheng 已提交
114 115 116
  // TODO: pBt->root
  ret = tdbBtreeOpenImpl(pBt);
  if (ret < 0) {
H
Hongze Cheng 已提交
117
    tdbOsFree(pBt);
H
Hongze Cheng 已提交
118 119 120
    return -1;
  }

H
Hongze Cheng 已提交
121
  *ppBt = pBt;
H
Hongze Cheng 已提交
122 123 124 125 126 127 128 129
  return 0;
}

int tdbBtreeClose(SBTree *pBt) {
  // TODO
  return 0;
}

H
Hongze Cheng 已提交
130
int tdbBtreeInsert(SBTree *pBt, const void *pKey, int kLen, const void *pVal, int vLen, TXN *pTxn) {
H
Hongze Cheng 已提交
131 132 133 134 135 136 137 138 139
  SBTC   btc;
  SCell *pCell;
  void  *pBuf;
  int    szCell;
  int    szBuf;
  int    ret;
  int    idx;
  int    c;

H
Hongze Cheng 已提交
140
  tdbBtcOpen(&btc, pBt, pTxn);
H
Hongze Cheng 已提交
141

H
Hongze Cheng 已提交
142 143
  // move to the position to insert
  ret = tdbBtcMoveTo(&btc, pKey, kLen, &c);
H
Hongze Cheng 已提交
144
  if (ret < 0) {
H
Hongze Cheng 已提交
145 146
    tdbBtcClose(&btc);
    ASSERT(0);
H
Hongze Cheng 已提交
147 148 149
    return -1;
  }

H
Hongze Cheng 已提交
150
  if (btc.idx == -1) {
H
Hongze Cheng 已提交
151 152
    idx = 0;
  } else {
H
Hongze Cheng 已提交
153 154 155 156
    if (c > 0) {
      idx = btc.idx + 1;
    } else if (c < 0) {
      idx = btc.idx;
H
Hongze Cheng 已提交
157
    } else {
H
Hongze Cheng 已提交
158 159
      // TDB does NOT allow same key
      tdbBtcClose(&btc);
H
Hongze Cheng 已提交
160
      ASSERT(0);
H
Hongze Cheng 已提交
161 162 163 164
      return -1;
    }
  }

H
more  
Hongze Cheng 已提交
165
  // make sure enough space to hold the cell
H
Hongze Cheng 已提交
166 167 168 169 170 171 172 173 174
  szBuf = kLen + vLen + 14;
  pBuf = TDB_REALLOC(pBt->pBuf, pBt->pageSize > szBuf ? szBuf : pBt->pageSize);
  if (pBuf == NULL) {
    tdbBtcClose(&btc);
    ASSERT(0);
    return -1;
  }
  pBt->pBuf = pBuf;
  pCell = (SCell *)pBt->pBuf;
H
Hongze Cheng 已提交
175

H
Hongze Cheng 已提交
176 177
  // encode cell
  ret = tdbBtreeEncodeCell(btc.pPage, pKey, kLen, pVal, vLen, pCell, &szCell);
H
Hongze Cheng 已提交
178
  if (ret < 0) {
H
Hongze Cheng 已提交
179 180
    tdbBtcClose(&btc);
    ASSERT(0);
H
more  
Hongze Cheng 已提交
181 182 183 184 185 186 187 188
    return -1;
  }

  // mark the page dirty
  ret = tdbPagerWrite(pBt->pPager, btc.pPage);
  if (ret < 0) {
    tdbBtcClose(&btc);
    ASSERT(0);
H
Hongze Cheng 已提交
189 190
    return -1;
  }
H
Hongze Cheng 已提交
191

H
Hongze Cheng 已提交
192 193
  // insert the cell
  ret = tdbPageInsertCell(btc.pPage, idx, pCell, szCell, 0);
H
Hongze Cheng 已提交
194
  if (ret < 0) {
H
Hongze Cheng 已提交
195 196
    tdbBtcClose(&btc);
    ASSERT(0);
H
Hongze Cheng 已提交
197
    return -1;
H
Hongze Cheng 已提交
198 199
  }

H
Hongze Cheng 已提交
200 201 202
  // check if need balance
  if (btc.pPage->nOverflow > 0) {
    ret = tdbBtreeBalance(&btc);
H
Hongze Cheng 已提交
203
    if (ret < 0) {
H
Hongze Cheng 已提交
204 205
      tdbBtcClose(&btc);
      ASSERT(0);
H
Hongze Cheng 已提交
206 207 208 209
      return -1;
    }
  }

H
Hongze Cheng 已提交
210 211
  tdbBtcClose(&btc);

H
more  
Hongze Cheng 已提交
212 213 214
  return 0;
}

H
Hongze Cheng 已提交
215
int tdbBtreeGet(SBTree *pBt, const void *pKey, int kLen, void **ppVal, int *vLen) {
H
Hongze Cheng 已提交
216 217 218 219
  return tdbBtreePGet(pBt, pKey, kLen, NULL, NULL, ppVal, vLen);
}

int tdbBtreePGet(SBTree *pBt, const void *pKey, int kLen, void **ppKey, int *pkLen, void **ppVal, int *vLen) {
H
refact  
Hongze Cheng 已提交
220
  SBTC         btc;
H
Hongze Cheng 已提交
221 222
  SCell       *pCell;
  int          cret;
H
Hongze Cheng 已提交
223
  int          ret;
H
Hongze Cheng 已提交
224 225
  void        *pTKey = NULL;
  void        *pTVal = NULL;
H
Hongze Cheng 已提交
226 227
  SCellDecoder cd;

H
Hongze Cheng 已提交
228
  tdbBtcOpen(&btc, pBt, NULL);
H
Hongze Cheng 已提交
229

H
Hongze Cheng 已提交
230 231 232 233 234
  ret = tdbBtcMoveTo(&btc, pKey, kLen, &cret);
  if (ret < 0) {
    tdbBtcClose(&btc);
    ASSERT(0);
  }
H
Hongze Cheng 已提交
235

H
Hongze Cheng 已提交
236
  if (btc.idx < 0 || cret) {
H
Hongze Cheng 已提交
237 238
    tdbBtcClose(&btc);
    return -1;
H
Hongze Cheng 已提交
239 240 241 242 243
  }

  pCell = tdbPageGetCell(btc.pPage, btc.idx);
  tdbBtreeDecodeCell(btc.pPage, pCell, &cd);

H
Hongze Cheng 已提交
244 245 246 247 248 249 250 251 252 253
  if (ppKey) {
    pTKey = TDB_REALLOC(*ppKey, cd.kLen);
    if (pTKey == NULL) {
      tdbBtcClose(&btc);
      ASSERT(0);
      return -1;
    }
    *ppKey = pTKey;
    *pkLen = cd.kLen;
    memcpy(*ppKey, cd.pKey, cd.kLen);
254 255
  }

H
Hongze Cheng 已提交
256 257 258 259 260 261 262 263 264 265
  if (ppVal) {
    pTVal = TDB_REALLOC(*ppVal, cd.vLen);
    if (pTVal == NULL) {
      tdbBtcClose(&btc);
      ASSERT(0);
      return -1;
    }
    *ppVal = pTVal;
    *vLen = cd.vLen;
    memcpy(*ppVal, cd.pVal, cd.vLen);
266
  }
H
Hongze Cheng 已提交
267

H
Hongze Cheng 已提交
268 269
  tdbBtcClose(&btc);

270 271 272
  return 0;
}

H
Hongze Cheng 已提交
273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290
static int tdbDefaultKeyCmprFn(const void *pKey1, int keyLen1, const void *pKey2, int keyLen2) {
  int mlen;
  int cret;

  ASSERT(keyLen1 > 0 && keyLen2 > 0 && pKey1 != NULL && pKey2 != NULL);

  mlen = keyLen1 < keyLen2 ? keyLen1 : keyLen2;
  cret = memcmp(pKey1, pKey2, mlen);
  if (cret == 0) {
    if (keyLen1 < keyLen2) {
      cret = -1;
    } else if (keyLen1 > keyLen2) {
      cret = 1;
    } else {
      cret = 0;
    }
  }
  return cret;
H
Hongze Cheng 已提交
291 292 293 294 295 296 297 298 299 300
}

static int tdbBtreeOpenImpl(SBTree *pBt) {
  // Try to get the root page of the an existing btree

  SPgno  pgno;
  SPage *pPage;
  int    ret;

  {
H
Hongze Cheng 已提交
301
    // 1. TODO: Search the main DB to check if the DB exists
H
Hongze Cheng 已提交
302 303
    ret = tdbPagerOpenDB(pBt->pPager, &pgno, true);
    ASSERT(ret == 0);
H
Hongze Cheng 已提交
304 305 306 307 308 309 310 311
  }

  if (pgno != 0) {
    pBt->root = pgno;
    return 0;
  }

  // Try to create a new database
H
Hongze Cheng 已提交
312
  SBtreeInitPageArg zArg = {.flags = TDB_BTREE_ROOT | TDB_BTREE_LEAF, .pBt = pBt};
H
Hongze Cheng 已提交
313
  ret = tdbPagerNewPage(pBt->pPager, &pgno, &pPage, tdbBtreeZeroPage, &zArg, NULL);
H
Hongze Cheng 已提交
314 315 316 317
  if (ret < 0) {
    return -1;
  }

H
more  
Hongze Cheng 已提交
318
  // TODO: here still has problem
H
Hongze Cheng 已提交
319
  tdbPagerReturnPage(pBt->pPager, pPage, NULL);
H
Hongze Cheng 已提交
320

H
Hongze Cheng 已提交
321 322
  ASSERT(pgno != 0);
  pBt->root = pgno;
H
Hongze Cheng 已提交
323

H
Hongze Cheng 已提交
324 325 326
  return 0;
}

H
Hongze Cheng 已提交
327
static int tdbBtreeInitPage(SPage *pPage, void *arg) {
H
Hongze Cheng 已提交
328
  SBTree *pBt;
H
Hongze Cheng 已提交
329
  u8      flags;
H
Hongze Cheng 已提交
330
  u8      isLeaf;
H
Hongze Cheng 已提交
331

H
Hongze Cheng 已提交
332 333
  pBt = (SBTree *)arg;
  flags = TDB_BTREE_PAGE_GET_FLAGS(pPage);
H
refact  
Hongze Cheng 已提交
334
  isLeaf = TDB_BTREE_PAGE_IS_LEAF(pPage);
H
Hongze Cheng 已提交
335

H
Hongze Cheng 已提交
336
  ASSERT(flags == TDB_BTREE_PAGE_GET_FLAGS(pPage));
H
Hongze Cheng 已提交
337

H
Hongze Cheng 已提交
338
  tdbPageInit(pPage, isLeaf ? sizeof(SLeafHdr) : sizeof(SIntHdr), tdbBtreeCellSize);
H
Hongze Cheng 已提交
339

H
Hongze Cheng 已提交
340 341
  TDB_BTREE_ASSERT_FLAG(flags);

H
Hongze Cheng 已提交
342
  if (isLeaf) {
H
Hongze Cheng 已提交
343 344 345 346 347 348 349 350 351 352
    pPage->kLen = pBt->keyLen;
    pPage->vLen = pBt->valLen;
    pPage->maxLocal = pBt->maxLeaf;
    pPage->minLocal = pBt->minLeaf;
  } else {
    pPage->kLen = pBt->keyLen;
    pPage->vLen = sizeof(SPgno);
    pPage->maxLocal = pBt->maxLocal;
    pPage->minLocal = pBt->minLocal;
  }
H
Hongze Cheng 已提交
353 354 355 356

  return 0;
}

H
Hongze Cheng 已提交
357
static int tdbBtreeZeroPage(SPage *pPage, void *arg) {
H
Hongze Cheng 已提交
358
  u8      flags;
H
Hongze Cheng 已提交
359
  SBTree *pBt;
H
refact  
Hongze Cheng 已提交
360
  u8      leaf;
H
Hongze Cheng 已提交
361

H
Hongze Cheng 已提交
362 363
  flags = ((SBtreeInitPageArg *)arg)->flags;
  pBt = ((SBtreeInitPageArg *)arg)->pBt;
H
refact  
Hongze Cheng 已提交
364
  leaf = flags & TDB_BTREE_LEAF;
H
Hongze Cheng 已提交
365

H
refact  
Hongze Cheng 已提交
366
  tdbPageZero(pPage, leaf ? sizeof(SLeafHdr) : sizeof(SIntHdr), tdbBtreeCellSize);
H
Hongze Cheng 已提交
367

H
refact  
Hongze Cheng 已提交
368
  if (leaf) {
H
Hongze Cheng 已提交
369
    SLeafHdr *pLeafHdr = (SLeafHdr *)(pPage->pData);
H
Hongze Cheng 已提交
370 371
    pLeafHdr->flags = flags;

H
Hongze Cheng 已提交
372 373 374 375 376
    pPage->kLen = pBt->keyLen;
    pPage->vLen = pBt->valLen;
    pPage->maxLocal = pBt->maxLeaf;
    pPage->minLocal = pBt->minLeaf;
  } else {
H
Hongze Cheng 已提交
377
    SIntHdr *pIntHdr = (SIntHdr *)(pPage->pData);
H
Hongze Cheng 已提交
378 379 380
    pIntHdr->flags = flags;
    pIntHdr->pgno = 0;

H
Hongze Cheng 已提交
381 382 383 384 385
    pPage->kLen = pBt->keyLen;
    pPage->vLen = sizeof(SPgno);
    pPage->maxLocal = pBt->maxLocal;
    pPage->minLocal = pBt->minLocal;
  }
H
Hongze Cheng 已提交
386

H
Hongze Cheng 已提交
387
  return 0;
H
Hongze Cheng 已提交
388 389
}

H
Hongze Cheng 已提交
390
// TDB_BTREE_BALANCE =====================
H
Hongze Cheng 已提交
391
static int tdbBtreeBalanceDeeper(SBTree *pBt, SPage *pRoot, SPage **ppChild, TXN *pTxn) {
H
Hongze Cheng 已提交
392 393 394 395
  SPager           *pPager;
  SPage            *pChild;
  SPgno             pgnoChild;
  int               ret;
H
Hongze Cheng 已提交
396
  u8                flags;
H
Hongze Cheng 已提交
397
  SIntHdr          *pIntHdr;
H
Hongze Cheng 已提交
398
  SBtreeInitPageArg zArg;
H
Hongze Cheng 已提交
399
  u8                leaf;
H
Hongze Cheng 已提交
400 401

  pPager = pRoot->pPager;
H
Hongze Cheng 已提交
402
  flags = TDB_BTREE_PAGE_GET_FLAGS(pRoot);
H
refact  
Hongze Cheng 已提交
403
  leaf = TDB_BTREE_PAGE_IS_LEAF(pRoot);
H
Hongze Cheng 已提交
404 405

  // Allocate a new child page
H
Hongze Cheng 已提交
406
  zArg.flags = TDB_FLAG_REMOVE(flags, TDB_BTREE_ROOT);
H
Hongze Cheng 已提交
407
  zArg.pBt = pBt;
H
Hongze Cheng 已提交
408
  ret = tdbPagerNewPage(pPager, &pgnoChild, &pChild, tdbBtreeZeroPage, &zArg, pTxn);
H
Hongze Cheng 已提交
409 410 411 412
  if (ret < 0) {
    return -1;
  }

H
Hongze Cheng 已提交
413 414 415 416
  if (!leaf) {
    ((SIntHdr *)pChild->pData)->pgno = ((SIntHdr *)(pRoot->pData))->pgno;
  }

H
Hongze Cheng 已提交
417 418 419 420 421 422 423
  ret = tdbPagerWrite(pPager, pChild);
  if (ret < 0) {
    // TODO
    ASSERT(0);
    return 0;
  }

H
Hongze Cheng 已提交
424
  // Copy the root page content to the child page
H
Hongze Cheng 已提交
425
  tdbPageCopy(pRoot, pChild);
H
Hongze Cheng 已提交
426 427 428 429 430 431 432 433 434

  // Reinitialize the root page
  zArg.flags = TDB_BTREE_ROOT;
  zArg.pBt = pBt;
  ret = tdbBtreeZeroPage(pRoot, &zArg);
  if (ret < 0) {
    return -1;
  }

H
Hongze Cheng 已提交
435
  pIntHdr = (SIntHdr *)(pRoot->pData);
H
Hongze Cheng 已提交
436
  pIntHdr->pgno = pgnoChild;
H
Hongze Cheng 已提交
437

H
Hongze Cheng 已提交
438 439 440 441
  *ppChild = pChild;
  return 0;
}

H
Hongze Cheng 已提交
442
static int tdbBtreeBalanceNonRoot(SBTree *pBt, SPage *pParent, int idx, TXN *pTxn) {
H
Hongze Cheng 已提交
443 444 445
  int ret;

  int    nOlds;
H
Hongze Cheng 已提交
446
  SPage *pOlds[3] = {0};
H
Hongze Cheng 已提交
447 448
  SCell *pDivCell[3] = {0};
  int    szDivCell[3];
H
Hongze Cheng 已提交
449
  int    sIdx;
H
Hongze Cheng 已提交
450
  u8     childNotLeaf;
H
Hongze Cheng 已提交
451
  SPgno  rPgno;
H
Hongze Cheng 已提交
452

H
Hongze Cheng 已提交
453
  {  // Find 3 child pages at most to do balance
H
Hongze Cheng 已提交
454 455 456
    int    nCells = TDB_PAGE_TOTAL_CELLS(pParent);
    SCell *pCell;

H
Hongze Cheng 已提交
457 458 459 460 461 462 463 464 465 466 467 468 469 470
    if (nCells <= 2) {
      sIdx = 0;
      nOlds = nCells + 1;
    } else {
      // has more than three child pages
      if (idx == 0) {
        sIdx = 0;
      } else if (idx == nCells) {
        sIdx = idx - 2;
      } else {
        sIdx = idx - 1;
      }
      nOlds = 3;
    }
H
Hongze Cheng 已提交
471 472
    for (int i = 0; i < nOlds; i++) {
      ASSERT(sIdx + i <= nCells);
H
Hongze Cheng 已提交
473

H
Hongze Cheng 已提交
474
      SPgno pgno;
H
Hongze Cheng 已提交
475
      if (sIdx + i == nCells) {
H
refact  
Hongze Cheng 已提交
476
        ASSERT(!TDB_BTREE_PAGE_IS_LEAF(pParent));
H
Hongze Cheng 已提交
477 478
        pgno = ((SIntHdr *)(pParent->pData))->pgno;
      } else {
H
Hongze Cheng 已提交
479
        pCell = tdbPageGetCell(pParent, sIdx + i);
H
Hongze Cheng 已提交
480 481 482
        pgno = *(SPgno *)pCell;
      }

H
Hongze Cheng 已提交
483
      ret = tdbPagerFetchPage(pBt->pPager, pgno, pOlds + i, tdbBtreeInitPage, pBt, pTxn);
H
Hongze Cheng 已提交
484 485 486 487
      if (ret < 0) {
        ASSERT(0);
        return -1;
      }
H
Hongze Cheng 已提交
488 489 490 491 492 493 494

      ret = tdbPagerWrite(pBt->pPager, pOlds[i]);
      if (ret < 0) {
        // TODO
        ASSERT(0);
        return -1;
      }
H
Hongze Cheng 已提交
495
    }
H
Hongze Cheng 已提交
496
    // copy the parent key out if child pages are not leaf page
H
refact  
Hongze Cheng 已提交
497
    childNotLeaf = !TDB_BTREE_PAGE_IS_LEAF(pOlds[0]);
H
Hongze Cheng 已提交
498
    if (childNotLeaf) {
H
Hongze Cheng 已提交
499 500 501 502
      for (int i = 0; i < nOlds; i++) {
        if (sIdx + i < TDB_PAGE_TOTAL_CELLS(pParent)) {
          pCell = tdbPageGetCell(pParent, sIdx + i);
          szDivCell[i] = tdbBtreeCellSize(pParent, pCell);
H
Hongze Cheng 已提交
503
          pDivCell[i] = tdbOsMalloc(szDivCell[i]);
H
Hongze Cheng 已提交
504 505
          memcpy(pDivCell[i], pCell, szDivCell[i]);
        }
H
Hongze Cheng 已提交
506

H
Hongze Cheng 已提交
507 508 509
        if (i < nOlds - 1) {
          ((SPgno *)pDivCell[i])[0] = ((SIntHdr *)pOlds[i]->pData)->pgno;
          ((SIntHdr *)pOlds[i]->pData)->pgno = 0;
H
Hongze Cheng 已提交
510
          tdbPageInsertCell(pOlds[i], TDB_PAGE_TOTAL_CELLS(pOlds[i]), pDivCell[i], szDivCell[i], 1);
H
Hongze Cheng 已提交
511
        }
H
Hongze Cheng 已提交
512
      }
H
Hongze Cheng 已提交
513
      rPgno = ((SIntHdr *)pOlds[nOlds - 1]->pData)->pgno;
H
Hongze Cheng 已提交
514
    }
H
Hongze Cheng 已提交
515 516 517 518 519 520 521 522

    ret = tdbPagerWrite(pBt->pPager, pParent);
    if (ret < 0) {
      // TODO
      ASSERT(0);
      return -1;
    }

H
Hongze Cheng 已提交
523
    // drop the cells on parent page
H
Hongze Cheng 已提交
524 525 526 527 528 529 530 531
    for (int i = 0; i < nOlds; i++) {
      nCells = TDB_PAGE_TOTAL_CELLS(pParent);
      if (sIdx < nCells) {
        tdbPageDropCell(pParent, sIdx);
      } else {
        ((SIntHdr *)pParent->pData)->pgno = 0;
      }
    }
H
Hongze Cheng 已提交
532 533
  }

H
Hongze Cheng 已提交
534
  int nNews = 0;
H
Hongze Cheng 已提交
535
  struct {
H
Hongze Cheng 已提交
536 537 538 539
    int cnt;
    int size;
    int iPage;
    int oIdx;
H
Hongze Cheng 已提交
540
  } infoNews[5] = {0};
H
Hongze Cheng 已提交
541 542 543

  {  // Get how many new pages are needed and the new distribution

H
Hongze Cheng 已提交
544 545 546
    // first loop to find minimum number of pages needed
    for (int oPage = 0; oPage < nOlds; oPage++) {
      SPage *pPage = pOlds[oPage];
H
Hongze Cheng 已提交
547 548
      SCell *pCell;
      int    cellBytes;
H
Hongze Cheng 已提交
549
      int    oIdx;
H
Hongze Cheng 已提交
550

H
Hongze Cheng 已提交
551
      for (oIdx = 0; oIdx < TDB_PAGE_TOTAL_CELLS(pPage); oIdx++) {
H
Hongze Cheng 已提交
552
        pCell = tdbPageGetCell(pPage, oIdx);
H
Hongze Cheng 已提交
553 554
        cellBytes = TDB_BYTES_CELL_TAKEN(pPage, pCell);

H
Hongze Cheng 已提交
555 556
        if (infoNews[nNews].size + cellBytes > TDB_PAGE_USABLE_SIZE(pPage)) {
          // page is full, use a new page
H
Hongze Cheng 已提交
557
          nNews++;
H
Hongze Cheng 已提交
558

H
Hongze Cheng 已提交
559 560 561 562 563 564 565
          ASSERT(infoNews[nNews].size + cellBytes <= TDB_PAGE_USABLE_SIZE(pPage));

          if (childNotLeaf) {
            // for non-child page, this cell is used as the right-most child,
            // the divider cell to parent as well
            continue;
          }
H
Hongze Cheng 已提交
566
        }
H
Hongze Cheng 已提交
567 568
        infoNews[nNews].cnt++;
        infoNews[nNews].size += cellBytes;
H
Hongze Cheng 已提交
569
        infoNews[nNews].iPage = oPage;
H
Hongze Cheng 已提交
570
        infoNews[nNews].oIdx = oIdx;
H
Hongze Cheng 已提交
571 572 573
      }
    }

H
Hongze Cheng 已提交
574
    nNews++;
H
Hongze Cheng 已提交
575 576 577

    // back loop to make the distribution even
    for (int iNew = nNews - 1; iNew > 0; iNew--) {
H
Hongze Cheng 已提交
578
      SCell *pCell;
H
Hongze Cheng 已提交
579 580 581
      int    szLCell, szRCell;

      for (;;) {
H
Hongze Cheng 已提交
582
        pCell = tdbPageGetCell(pOlds[infoNews[iNew - 1].iPage], infoNews[iNew - 1].oIdx);
H
Hongze Cheng 已提交
583

H
Hongze Cheng 已提交
584
        if (childNotLeaf) {
H
Hongze Cheng 已提交
585
          szLCell = szRCell = tdbBtreeCellSize(pOlds[infoNews[iNew - 1].iPage], pCell);
H
Hongze Cheng 已提交
586
        } else {
H
Hongze Cheng 已提交
587
          szLCell = tdbBtreeCellSize(pOlds[infoNews[iNew - 1].iPage], pCell);
H
Hongze Cheng 已提交
588

H
Hongze Cheng 已提交
589
          int    iPage = infoNews[iNew - 1].iPage;
H
Hongze Cheng 已提交
590
          int    oIdx = infoNews[iNew - 1].oIdx + 1;
H
Hongze Cheng 已提交
591
          SPage *pPage;
H
Hongze Cheng 已提交
592
          for (;;) {
H
Hongze Cheng 已提交
593
            pPage = pOlds[iPage];
H
Hongze Cheng 已提交
594 595 596 597
            if (oIdx < TDB_PAGE_TOTAL_CELLS(pPage)) {
              break;
            }

H
Hongze Cheng 已提交
598
            iPage++;
H
Hongze Cheng 已提交
599 600 601 602 603
            oIdx = 0;
          }

          pCell = tdbPageGetCell(pPage, oIdx);
          szRCell = tdbBtreeCellSize(pPage, pCell);
H
Hongze Cheng 已提交
604 605 606 607 608 609 610 611
        }

        ASSERT(infoNews[iNew - 1].cnt > 0);

        if (infoNews[iNew].size + szRCell >= infoNews[iNew - 1].size - szRCell) {
          break;
        }

H
Hongze Cheng 已提交
612
        // Move a cell right forward
H
Hongze Cheng 已提交
613 614
        infoNews[iNew - 1].cnt--;
        infoNews[iNew - 1].size -= szLCell;
H
Hongze Cheng 已提交
615
        infoNews[iNew - 1].oIdx--;
H
Hongze Cheng 已提交
616
        for (;;) {
H
Hongze Cheng 已提交
617
          if (infoNews[iNew - 1].oIdx >= 0) {
H
Hongze Cheng 已提交
618 619
            break;
          }
H
Hongze Cheng 已提交
620

H
Hongze Cheng 已提交
621 622
          infoNews[iNew - 1].iPage--;
          infoNews[iNew - 1].oIdx = TDB_PAGE_TOTAL_CELLS(pOlds[infoNews[iNew - 1].iPage]) - 1;
H
Hongze Cheng 已提交
623 624 625
        }

        infoNews[iNew].cnt++;
H
Hongze Cheng 已提交
626
        infoNews[iNew].size += szRCell;
H
Hongze Cheng 已提交
627
      }
H
Hongze Cheng 已提交
628
    }
H
Hongze Cheng 已提交
629 630
  }

H
Hongze Cheng 已提交
631 632 633 634 635 636 637 638 639 640 641 642 643 644 645
  SPage *pNews[5] = {0};
  {  // Allocate new pages, reuse the old page when possible

    SPgno             pgno;
    SBtreeInitPageArg iarg;
    u8                flags;

    flags = TDB_BTREE_PAGE_GET_FLAGS(pOlds[0]);

    for (int iNew = 0; iNew < nNews; iNew++) {
      if (iNew < nOlds) {
        pNews[iNew] = pOlds[iNew];
      } else {
        iarg.pBt = pBt;
        iarg.flags = flags;
H
Hongze Cheng 已提交
646
        ret = tdbPagerNewPage(pBt->pPager, &pgno, pNews + iNew, tdbBtreeZeroPage, &iarg, pTxn);
H
Hongze Cheng 已提交
647 648 649
        if (ret < 0) {
          ASSERT(0);
        }
H
Hongze Cheng 已提交
650 651 652 653 654 655 656

        ret = tdbPagerWrite(pBt->pPager, pNews[iNew]);
        if (ret < 0) {
          // TODO
          ASSERT(0);
          return -1;
        }
H
Hongze Cheng 已提交
657 658
      }
    }
H
Hongze Cheng 已提交
659 660

    // TODO: sort the page according to the page number
H
Hongze Cheng 已提交
661 662
  }

H
Hongze Cheng 已提交
663
  {  // Do the real cell distribution
H
Hongze Cheng 已提交
664
    SPage            *pOldsCopy[3] = {0};
H
Hongze Cheng 已提交
665 666 667 668
    SCell            *pCell;
    int               szCell;
    SBtreeInitPageArg iarg;
    int               iNew, nNewCells;
H
Hongze Cheng 已提交
669
    SCellDecoder      cd;
H
Hongze Cheng 已提交
670 671 672 673

    iarg.pBt = pBt;
    iarg.flags = TDB_BTREE_PAGE_GET_FLAGS(pOlds[0]);
    for (int i = 0; i < nOlds; i++) {
H
Hongze Cheng 已提交
674
      tdbPageCreate(pOlds[0]->pageSize, &pOldsCopy[i], NULL, NULL);
H
Hongze Cheng 已提交
675 676 677 678 679
      tdbBtreeZeroPage(pOldsCopy[i], &iarg);
      tdbPageCopy(pOlds[i], pOldsCopy[i]);
    }
    iNew = 0;
    nNewCells = 0;
H
Hongze Cheng 已提交
680
    tdbBtreeZeroPage(pNews[iNew], &iarg);
H
Hongze Cheng 已提交
681 682 683 684 685 686 687 688 689 690 691

    for (int iOld = 0; iOld < nOlds; iOld++) {
      SPage *pPage;

      pPage = pOldsCopy[iOld];

      for (int oIdx = 0; oIdx < TDB_PAGE_TOTAL_CELLS(pPage); oIdx++) {
        pCell = tdbPageGetCell(pPage, oIdx);
        szCell = tdbBtreeCellSize(pPage, pCell);

        ASSERT(nNewCells <= infoNews[iNew].cnt);
H
Hongze Cheng 已提交
692
        ASSERT(iNew < nNews);
H
Hongze Cheng 已提交
693 694 695 696

        if (nNewCells < infoNews[iNew].cnt) {
          tdbPageInsertCell(pNews[iNew], nNewCells, pCell, szCell, 0);
          nNewCells++;
H
Hongze Cheng 已提交
697

H
Hongze Cheng 已提交
698 699 700 701 702 703 704
          // insert parent page
          if (!childNotLeaf && nNewCells == infoNews[iNew].cnt) {
            SIntHdr *pIntHdr = (SIntHdr *)pParent->pData;

            if (iNew == nNews - 1 && pIntHdr->pgno == 0) {
              pIntHdr->pgno = TDB_PAGE_PGNO(pNews[iNew]);
            } else {
H
Hongze Cheng 已提交
705 706
              tdbBtreeDecodeCell(pPage, pCell, &cd);

H
Hongze Cheng 已提交
707
              // TODO: pCell here may be inserted as an overflow cell, handle it
H
Hongze Cheng 已提交
708
              SCell *pNewCell = tdbOsMalloc(cd.kLen + 9);
H
Hongze Cheng 已提交
709 710 711
              int    szNewCell;
              SPgno  pgno;
              pgno = TDB_PAGE_PGNO(pNews[iNew]);
H
Hongze Cheng 已提交
712
              tdbBtreeEncodeCell(pParent, cd.pKey, cd.kLen, (void *)&pgno, sizeof(SPgno), pNewCell, &szNewCell);
H
Hongze Cheng 已提交
713
              tdbPageInsertCell(pParent, sIdx++, pNewCell, szNewCell, 0);
H
Hongze Cheng 已提交
714
              tdbOsFree(pNewCell);
H
Hongze Cheng 已提交
715
            }
H
Hongze Cheng 已提交
716 717

            // move to next new page
H
Hongze Cheng 已提交
718
            iNew++;
H
Hongze Cheng 已提交
719 720 721 722
            nNewCells = 0;
            if (iNew < nNews) {
              tdbBtreeZeroPage(pNews[iNew], &iarg);
            }
H
Hongze Cheng 已提交
723 724 725
          }
        } else {
          ASSERT(childNotLeaf);
H
Hongze Cheng 已提交
726
          ASSERT(iNew < nNews - 1);
H
Hongze Cheng 已提交
727

H
Hongze Cheng 已提交
728 729 730 731
          // set current new page right-most child
          ((SIntHdr *)pNews[iNew]->pData)->pgno = ((SPgno *)pCell)[0];

          // insert to parent as divider cell
H
Hongze Cheng 已提交
732 733 734
          ASSERT(iNew < nNews - 1);
          ((SPgno *)pCell)[0] = TDB_PAGE_PGNO(pNews[iNew]);
          tdbPageInsertCell(pParent, sIdx++, pCell, szCell, 0);
H
Hongze Cheng 已提交
735 736 737 738

          // move to next new page
          iNew++;
          nNewCells = 0;
H
Hongze Cheng 已提交
739 740 741
          if (iNew < nNews) {
            tdbBtreeZeroPage(pNews[iNew], &iarg);
          }
H
Hongze Cheng 已提交
742 743
        }
      }
H
Hongze Cheng 已提交
744
    }
H
Hongze Cheng 已提交
745

H
Hongze Cheng 已提交
746 747 748
    if (childNotLeaf) {
      ASSERT(TDB_PAGE_TOTAL_CELLS(pNews[nNews - 1]) == infoNews[nNews - 1].cnt);
      ((SIntHdr *)(pNews[nNews - 1]->pData))->pgno = rPgno;
H
Hongze Cheng 已提交
749

H
Hongze Cheng 已提交
750 751 752 753 754 755
      SIntHdr *pIntHdr = (SIntHdr *)pParent->pData;
      if (pIntHdr->pgno == 0) {
        pIntHdr->pgno = TDB_PAGE_PGNO(pNews[nNews - 1]);
      } else {
        ((SPgno *)pDivCell[nOlds - 1])[0] = TDB_PAGE_PGNO(pNews[nNews - 1]);
        tdbPageInsertCell(pParent, sIdx, pDivCell[nOlds - 1], szDivCell[nOlds - 1], 0);
H
Hongze Cheng 已提交
756
      }
H
Hongze Cheng 已提交
757 758 759 760 761 762 763
    }

    for (int i = 0; i < nOlds; i++) {
      tdbPageDestroy(pOldsCopy[i], NULL, NULL);
    }
  }

H
Hongze Cheng 已提交
764 765
  for (int i = 0; i < 3; i++) {
    if (pDivCell[i]) {
H
Hongze Cheng 已提交
766
      tdbOsFree(pDivCell[i]);
H
Hongze Cheng 已提交
767 768
    }
  }
H
Hongze Cheng 已提交
769

H
more  
Hongze Cheng 已提交
770 771 772
  // TODO: here is not corrent for drop case
  for (int i = 0; i < nNews; i++) {
    if (i < nOlds) {
H
Hongze Cheng 已提交
773
      tdbPagerReturnPage(pBt->pPager, pOlds[i], pTxn);
H
more  
Hongze Cheng 已提交
774
    } else {
H
Hongze Cheng 已提交
775
      tdbPagerReturnPage(pBt->pPager, pNews[i], pTxn);
H
more  
Hongze Cheng 已提交
776 777 778
    }
  }

H
Hongze Cheng 已提交
779
  return 0;
H
Hongze Cheng 已提交
780 781
}

H
Hongze Cheng 已提交
782
static int tdbBtreeBalance(SBTC *pBtc) {
H
Hongze Cheng 已提交
783
  int    iPage;
H
Hongze Cheng 已提交
784 785
  int    ret;
  int    nFree;
H
Hongze Cheng 已提交
786
  SPage *pParent;
H
Hongze Cheng 已提交
787
  SPage *pPage;
H
Hongze Cheng 已提交
788
  u8     flags;
H
Hongze Cheng 已提交
789 790
  u8     leaf;
  u8     root;
H
Hongze Cheng 已提交
791 792 793

  // Main loop to balance the BTree
  for (;;) {
H
Hongze Cheng 已提交
794 795
    iPage = pBtc->iPage;
    pPage = pBtc->pPage;
H
refact  
Hongze Cheng 已提交
796 797
    leaf = TDB_BTREE_PAGE_IS_LEAF(pPage);
    root = TDB_BTREE_PAGE_IS_ROOT(pPage);
H
Hongze Cheng 已提交
798
    nFree = TDB_PAGE_FREE_SIZE(pPage);
H
Hongze Cheng 已提交
799

H
Hongze Cheng 已提交
800 801
    // when the page is not overflow and not too empty, the balance work
    // is finished. Just break out the balance loop.
H
Hongze Cheng 已提交
802
    if (pPage->nOverflow == 0 && nFree < TDB_PAGE_USABLE_SIZE(pPage) * 2 / 3) {
H
Hongze Cheng 已提交
803 804 805 806
      break;
    }

    if (iPage == 0) {
H
Hongze Cheng 已提交
807 808 809
      // For the root page, only balance when the page is overfull,
      // ignore the case of empty
      if (pPage->nOverflow == 0) break;
H
Hongze Cheng 已提交
810

H
Hongze Cheng 已提交
811
      ret = tdbBtreeBalanceDeeper(pBtc->pBt, pPage, &(pBtc->pgStack[1]), pBtc->pTxn);
H
Hongze Cheng 已提交
812 813 814 815
      if (ret < 0) {
        return -1;
      }

H
Hongze Cheng 已提交
816 817 818 819 820
      pBtc->idx = 0;
      pBtc->idxStack[0] = 0;
      pBtc->pgStack[0] = pBtc->pPage;
      pBtc->iPage = 1;
      pBtc->pPage = pBtc->pgStack[1];
H
Hongze Cheng 已提交
821
    } else {
H
Hongze Cheng 已提交
822
      // Generalized balance step
H
Hongze Cheng 已提交
823
      pParent = pBtc->pgStack[iPage - 1];
H
Hongze Cheng 已提交
824

H
Hongze Cheng 已提交
825
      ret = tdbBtreeBalanceNonRoot(pBtc->pBt, pParent, pBtc->idxStack[pBtc->iPage - 1], pBtc->pTxn);
H
Hongze Cheng 已提交
826 827 828 829
      if (ret < 0) {
        return -1;
      }

H
Hongze Cheng 已提交
830
      tdbPagerReturnPage(pBtc->pBt->pPager, pBtc->pPage, pBtc->pTxn);
H
more  
Hongze Cheng 已提交
831

H
Hongze Cheng 已提交
832 833
      pBtc->iPage--;
      pBtc->pPage = pBtc->pgStack[pBtc->iPage];
H
Hongze Cheng 已提交
834 835 836 837 838
    }
  }

  return 0;
}
H
Hongze Cheng 已提交
839
// TDB_BTREE_BALANCE
H
Hongze Cheng 已提交
840

H
Hongze Cheng 已提交
841
// TDB_BTREE_CELL =====================
H
Hongze Cheng 已提交
842 843
static int tdbBtreeEncodePayload(SPage *pPage, SCell *pCell, int nHeader, const void *pKey, int kLen, const void *pVal,
                                 int vLen, int *szPayload) {
H
Hongze Cheng 已提交
844 845 846
  int nPayload;

  nPayload = kLen + vLen;
H
Hongze Cheng 已提交
847 848 849
  if (nPayload + nHeader <= pPage->maxLocal) {
    // no overflow page is needed
    memcpy(pCell + nHeader, pKey, kLen);
H
Hongze Cheng 已提交
850
    if (pVal) {
H
Hongze Cheng 已提交
851
      memcpy(pCell + nHeader + kLen, pVal, vLen);
H
Hongze Cheng 已提交
852 853 854 855 856 857 858 859 860 861 862 863 864 865
    }

    *szPayload = nPayload;
    return 0;
  }

  {
    // TODO: handle overflow case
    ASSERT(0);
  }

  return 0;
}

H
Hongze Cheng 已提交
866 867
static int tdbBtreeEncodeCell(SPage *pPage, const void *pKey, int kLen, const void *pVal, int vLen, SCell *pCell,
                              int *szCell) {
H
Hongze Cheng 已提交
868 869 870 871
  u8  leaf;
  int nHeader;
  int nPayload;
  int ret;
H
Hongze Cheng 已提交
872 873 874

  ASSERT(pPage->kLen == TDB_VARIANT_LEN || pPage->kLen == kLen);
  ASSERT(pPage->vLen == TDB_VARIANT_LEN || pPage->vLen == vLen);
H
Hongze Cheng 已提交
875
  ASSERT(pKey != NULL && kLen > 0);
H
Hongze Cheng 已提交
876

H
Hongze Cheng 已提交
877 878
  nPayload = 0;
  nHeader = 0;
H
refact  
Hongze Cheng 已提交
879
  leaf = TDB_BTREE_PAGE_IS_LEAF(pPage);
H
Hongze Cheng 已提交
880

H
Hongze Cheng 已提交
881
  // 1. Encode Header part
H
Hongze Cheng 已提交
882 883 884 885 886 887 888 889
  /* Encode SPgno if interior page */
  if (!leaf) {
    ASSERT(pPage->vLen == sizeof(SPgno));

    ((SPgno *)(pCell + nHeader))[0] = ((SPgno *)pVal)[0];
    nHeader = nHeader + sizeof(SPgno);
  }

H
Hongze Cheng 已提交
890
  /* Encode kLen if need */
H
Hongze Cheng 已提交
891
  if (pPage->kLen == TDB_VARIANT_LEN) {
H
Hongze Cheng 已提交
892
    nHeader += tdbPutVarInt(pCell + nHeader, kLen);
H
Hongze Cheng 已提交
893 894
  }

H
Hongze Cheng 已提交
895
  /* Encode vLen if need */
H
Hongze Cheng 已提交
896
  if (pPage->vLen == TDB_VARIANT_LEN) {
H
Hongze Cheng 已提交
897
    nHeader += tdbPutVarInt(pCell + nHeader, vLen);
H
Hongze Cheng 已提交
898 899
  }

H
Hongze Cheng 已提交
900
  // 2. Encode payload part
H
Hongze Cheng 已提交
901 902 903
  if ((!leaf) || pPage->vLen == 0) {
    pVal = NULL;
    vLen = 0;
H
Hongze Cheng 已提交
904
  }
H
Hongze Cheng 已提交
905 906

  ret = tdbBtreeEncodePayload(pPage, pCell, nHeader, pKey, kLen, pVal, vLen, &nPayload);
H
Hongze Cheng 已提交
907
  if (ret < 0) {
H
Hongze Cheng 已提交
908 909 910
    // TODO
    ASSERT(0);
    return 0;
H
Hongze Cheng 已提交
911 912
  }

H
Hongze Cheng 已提交
913
  *szCell = nHeader + nPayload;
H
Hongze Cheng 已提交
914 915 916
  return 0;
}

H
Hongze Cheng 已提交
917
static int tdbBtreeDecodePayload(SPage *pPage, const SCell *pCell, int nHeader, SCellDecoder *pDecoder) {
H
Hongze Cheng 已提交
918 919 920
  int nPayload;

  if (pDecoder->pVal) {
H
Hongze Cheng 已提交
921
    ASSERT(!TDB_BTREE_PAGE_IS_LEAF(pPage));
H
Hongze Cheng 已提交
922
    nPayload = pDecoder->kLen;
H
Hongze Cheng 已提交
923 924
  } else {
    nPayload = pDecoder->kLen + pDecoder->vLen;
H
Hongze Cheng 已提交
925 926
  }

H
Hongze Cheng 已提交
927 928 929 930 931
  if (nHeader + nPayload <= pPage->maxLocal) {
    // no over flow case
    pDecoder->pKey = pCell + nHeader;
    if (pDecoder->pVal == NULL && pDecoder->vLen > 0) {
      pDecoder->pVal = pCell + nHeader + pDecoder->kLen;
H
Hongze Cheng 已提交
932
    }
H
Hongze Cheng 已提交
933 934 935 936
    return 0;
  }

  {
H
Hongze Cheng 已提交
937 938
    // TODO: handle overflow case
    ASSERT(0);
H
Hongze Cheng 已提交
939 940
  }

H
Hongze Cheng 已提交
941 942 943 944 945 946 947 948 949
  return 0;
}

static int tdbBtreeDecodeCell(SPage *pPage, const SCell *pCell, SCellDecoder *pDecoder) {
  u8  leaf;
  int nHeader;
  int ret;

  nHeader = 0;
H
refact  
Hongze Cheng 已提交
950
  leaf = TDB_BTREE_PAGE_IS_LEAF(pPage);
H
Hongze Cheng 已提交
951

H
Hongze Cheng 已提交
952 953 954 955 956 957 958
  // Clear the state of decoder
  pDecoder->kLen = -1;
  pDecoder->pKey = NULL;
  pDecoder->vLen = -1;
  pDecoder->pVal = NULL;
  pDecoder->pgno = 0;

H
Hongze Cheng 已提交
959
  // 1. Decode header part
H
Hongze Cheng 已提交
960 961 962 963 964 965 966 967
  if (!leaf) {
    ASSERT(pPage->vLen == sizeof(SPgno));

    pDecoder->pgno = ((SPgno *)(pCell + nHeader))[0];
    pDecoder->pVal = (u8 *)(&(pDecoder->pgno));
    nHeader = nHeader + sizeof(SPgno);
  }

H
Hongze Cheng 已提交
968 969 970 971 972 973 974
  if (pPage->kLen == TDB_VARIANT_LEN) {
    nHeader += tdbGetVarInt(pCell + nHeader, &(pDecoder->kLen));
  } else {
    pDecoder->kLen = pPage->kLen;
  }

  if (pPage->vLen == TDB_VARIANT_LEN) {
H
Hongze Cheng 已提交
975
    ASSERT(leaf);
H
Hongze Cheng 已提交
976
    nHeader += tdbGetVarInt(pCell + nHeader, &(pDecoder->vLen));
H
Hongze Cheng 已提交
977
  } else {
H
Hongze Cheng 已提交
978 979
    pDecoder->vLen = pPage->vLen;
  }
H
Hongze Cheng 已提交
980

H
Hongze Cheng 已提交
981
  // 2. Decode payload part
H
Hongze Cheng 已提交
982
  ret = tdbBtreeDecodePayload(pPage, pCell, nHeader, pDecoder);
H
Hongze Cheng 已提交
983 984
  if (ret < 0) {
    return -1;
H
Hongze Cheng 已提交
985 986 987 988 989
  }

  return 0;
}

H
Hongze Cheng 已提交
990
static int tdbBtreeCellSize(const SPage *pPage, SCell *pCell) {
H
refact  
Hongze Cheng 已提交
991
  u8  leaf;
H
Hongze Cheng 已提交
992 993 994
  int szCell;
  int kLen = 0, vLen = 0;

H
refact  
Hongze Cheng 已提交
995
  leaf = TDB_BTREE_PAGE_IS_LEAF(pPage);
H
Hongze Cheng 已提交
996 997
  szCell = 0;

H
refact  
Hongze Cheng 已提交
998
  if (!leaf) {
H
Hongze Cheng 已提交
999 1000 1001 1002 1003 1004 1005 1006 1007
    szCell += sizeof(SPgno);
  }

  if (pPage->kLen == TDB_VARIANT_LEN) {
    szCell += tdbGetVarInt(pCell + szCell, &kLen);
  } else {
    kLen = pPage->kLen;
  }

H
refact  
Hongze Cheng 已提交
1008 1009 1010 1011 1012
  if (pPage->vLen == TDB_VARIANT_LEN) {
    ASSERT(leaf);
    szCell += tdbGetVarInt(pCell + szCell, &vLen);
  } else if (leaf) {
    vLen = pPage->vLen;
H
Hongze Cheng 已提交
1013 1014 1015 1016
  }

  szCell = szCell + kLen + vLen;

H
refact  
Hongze Cheng 已提交
1017 1018 1019 1020 1021 1022 1023 1024 1025
  if (szCell <= pPage->maxLocal) {
    return szCell;
  }

  {
    // TODO
    ASSERT(0);
    return 0;
  }
H
Hongze Cheng 已提交
1026
}
H
Hongze Cheng 已提交
1027
// TDB_BTREE_CELL
H
Hongze Cheng 已提交
1028

H
refact  
Hongze Cheng 已提交
1029
// TDB_BTREE_CURSOR =====================
H
Hongze Cheng 已提交
1030
int tdbBtcOpen(SBTC *pBtc, SBTree *pBt, TXN *pTxn) {
H
Hongze Cheng 已提交
1031 1032 1033 1034
  pBtc->pBt = pBt;
  pBtc->iPage = -1;
  pBtc->pPage = NULL;
  pBtc->idx = -1;
H
Hongze Cheng 已提交
1035
  pBtc->pTxn = pTxn;
H
Hongze Cheng 已提交
1036 1037 1038 1039

  return 0;
}

H
Hongze Cheng 已提交
1040
int tdbBtcMoveToFirst(SBTC *pBtc) {
H
refact  
Hongze Cheng 已提交
1041 1042 1043
  int     ret;
  SBTree *pBt;
  SPager *pPager;
H
Hongze Cheng 已提交
1044 1045
  SCell  *pCell;
  SPgno   pgno;
H
refact  
Hongze Cheng 已提交
1046 1047 1048 1049 1050

  pBt = pBtc->pBt;
  pPager = pBt->pPager;

  if (pBtc->iPage < 0) {
H
Hongze Cheng 已提交
1051
    // move a clean cursor
H
Hongze Cheng 已提交
1052
    ret = tdbPagerFetchPage(pPager, pBt->root, &(pBtc->pPage), tdbBtreeInitPage, pBt, pBtc->pTxn);
H
refact  
Hongze Cheng 已提交
1053 1054 1055 1056 1057
    if (ret < 0) {
      ASSERT(0);
      return -1;
    }

H
Hongze Cheng 已提交
1058
    ASSERT(TDB_BTREE_PAGE_IS_ROOT(pBtc->pPage));
H
refact  
Hongze Cheng 已提交
1059

H
Hongze Cheng 已提交
1060 1061 1062 1063
    pBtc->iPage = 0;
    if (TDB_PAGE_TOTAL_CELLS(pBtc->pPage) > 0) {
      pBtc->idx = 0;
    } else {
H
more  
Hongze Cheng 已提交
1064
      // no any data, point to an invalid position
H
Hongze Cheng 已提交
1065 1066
      ASSERT(TDB_BTREE_PAGE_IS_LEAF(pBtc->pPage));
      pBtc->idx = -1;
H
refact  
Hongze Cheng 已提交
1067 1068
      return 0;
    }
H
Hongze Cheng 已提交
1069 1070 1071
  } else {
    // move from a position
    int iPage = 0;
H
refact  
Hongze Cheng 已提交
1072

H
Hongze Cheng 已提交
1073 1074 1075 1076
    for (; iPage < pBtc->iPage; iPage++) {
      ASSERT(pBtc->idxStack[iPage] >= 0);
      if (pBtc->idxStack[iPage]) break;
    }
H
refact  
Hongze Cheng 已提交
1077

H
Hongze Cheng 已提交
1078 1079
    // move upward
    for (;;) {
H
Hongze Cheng 已提交
1080
      if (pBtc->iPage == iPage) {
H
Hongze Cheng 已提交
1081
        pBtc->idx = 0;
H
refact  
Hongze Cheng 已提交
1082 1083
        break;
      }
H
Hongze Cheng 已提交
1084

H
Hongze Cheng 已提交
1085
      tdbBtcMoveUpward(pBtc);
H
Hongze Cheng 已提交
1086 1087 1088 1089 1090
    }
  }

  // move downward
  for (;;) {
H
refact  
Hongze Cheng 已提交
1091
    if (TDB_BTREE_PAGE_IS_LEAF(pBtc->pPage)) break;
H
Hongze Cheng 已提交
1092

H
Hongze Cheng 已提交
1093
    ret = tdbBtcMoveDownward(pBtc);
H
Hongze Cheng 已提交
1094 1095 1096 1097 1098 1099 1100 1101
    if (ret < 0) {
      ASSERT(0);
      return -1;
    }

    pBtc->idx = 0;
  }

H
Hongze Cheng 已提交
1102 1103 1104 1105
  return 0;
}

int tdbBtcMoveToLast(SBTC *pBtc) {
H
Hongze Cheng 已提交
1106
  int     ret;
H
Hongze Cheng 已提交
1107
  int     nCells;
H
Hongze Cheng 已提交
1108 1109 1110 1111 1112 1113 1114 1115 1116
  SBTree *pBt;
  SPager *pPager;
  SPgno   pgno;

  pBt = pBtc->pBt;
  pPager = pBt->pPager;

  if (pBtc->iPage < 0) {
    // move a clean cursor
H
Hongze Cheng 已提交
1117
    ret = tdbPagerFetchPage(pPager, pBt->root, &(pBtc->pPage), tdbBtreeInitPage, pBt, pBtc->pTxn);
H
Hongze Cheng 已提交
1118 1119 1120 1121 1122
    if (ret < 0) {
      ASSERT(0);
      return -1;
    }

H
Hongze Cheng 已提交
1123
    nCells = TDB_PAGE_TOTAL_CELLS(pBtc->pPage);
H
Hongze Cheng 已提交
1124
    pBtc->iPage = 0;
H
Hongze Cheng 已提交
1125 1126 1127 1128 1129 1130 1131 1132
    if (nCells > 0) {
      pBtc->idx = TDB_BTREE_PAGE_IS_LEAF(pBtc->pPage) ? nCells - 1 : nCells;
    } else {
      // no data at all, point to an invalid position
      ASSERT(TDB_BTREE_PAGE_IS_LEAF(pBtc->pPage));
      pBtc->idx = -1;
      return 0;
    }
H
Hongze Cheng 已提交
1133
  } else {
H
Hongze Cheng 已提交
1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144
    int iPage = 0;

    // downward search
    for (; iPage < pBtc->iPage; iPage++) {
      ASSERT(!TDB_BTREE_PAGE_IS_LEAF(pBtc->pgStack[iPage]));
      nCells = TDB_PAGE_TOTAL_CELLS(pBtc->pgStack[iPage]);
      if (pBtc->idxStack[iPage] != nCells) break;
    }

    // move upward
    for (;;) {
H
Hongze Cheng 已提交
1145
      if (pBtc->iPage == iPage) {
H
Hongze Cheng 已提交
1146 1147 1148 1149 1150
        if (TDB_BTREE_PAGE_IS_LEAF(pBtc->pPage)) {
          pBtc->idx = TDB_PAGE_TOTAL_CELLS(pBtc->pPage) - 1;
        } else {
          pBtc->idx = TDB_PAGE_TOTAL_CELLS(pBtc->pPage);
        }
H
Hongze Cheng 已提交
1151
        break;
H
Hongze Cheng 已提交
1152 1153 1154 1155
      }

      tdbBtcMoveUpward(pBtc);
    }
H
Hongze Cheng 已提交
1156 1157 1158 1159
  }

  // move downward
  for (;;) {
H
Hongze Cheng 已提交
1160 1161 1162 1163 1164 1165 1166 1167 1168
    if (TDB_BTREE_PAGE_IS_LEAF(pBtc->pPage)) break;

    ret = tdbBtcMoveDownward(pBtc);
    if (ret < 0) {
      ASSERT(0);
      return -1;
    }

    nCells = TDB_PAGE_TOTAL_CELLS(pBtc->pPage);
H
refact  
Hongze Cheng 已提交
1169
    if (TDB_BTREE_PAGE_IS_LEAF(pBtc->pPage)) {
H
Hongze Cheng 已提交
1170
      pBtc->idx = nCells - 1;
H
Hongze Cheng 已提交
1171
    } else {
H
Hongze Cheng 已提交
1172
      pBtc->idx = nCells;
H
Hongze Cheng 已提交
1173 1174 1175
    }
  }

H
Hongze Cheng 已提交
1176 1177 1178 1179
  return 0;
}

int tdbBtreeNext(SBTC *pBtc, void **ppKey, int *kLen, void **ppVal, int *vLen) {
H
Hongze Cheng 已提交
1180 1181 1182 1183 1184
  SCell       *pCell;
  SCellDecoder cd;
  void        *pKey, *pVal;
  int          ret;

H
Hongze Cheng 已提交
1185
  // current cursor points to an invalid position
H
Hongze Cheng 已提交
1186
  if (pBtc->idx < 0) {
H
Hongze Cheng 已提交
1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201
    return -1;
  }

  pCell = tdbPageGetCell(pBtc->pPage, pBtc->idx);

  tdbBtreeDecodeCell(pBtc->pPage, pCell, &cd);

  pKey = TDB_REALLOC(*ppKey, cd.kLen);
  if (pKey == NULL) {
    return -1;
  }

  *ppKey = pKey;
  *kLen = cd.kLen;
  memcpy(pKey, cd.pKey, cd.kLen);
C
Cary Xu 已提交
1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214

  if (ppVal) {
    // TODO: vLen may be zero
    pVal = TDB_REALLOC(*ppVal, cd.vLen);
    if (pVal == NULL) {
      TDB_FREE(pKey);
      return -1;
    }

    *ppVal = pVal;
    *vLen = cd.vLen;
    memcpy(pVal, cd.pVal, cd.vLen);
  }
H
Hongze Cheng 已提交
1215 1216

  ret = tdbBtcMoveToNext(pBtc);
H
Hongze Cheng 已提交
1217 1218 1219 1220
  if (ret < 0) {
    ASSERT(0);
    return -1;
  }
H
Hongze Cheng 已提交
1221 1222 1223 1224 1225

  return 0;
}

static int tdbBtcMoveToNext(SBTC *pBtc) {
H
Hongze Cheng 已提交
1226
  int    nCells;
H
Hongze Cheng 已提交
1227
  int    ret;
H
Hongze Cheng 已提交
1228 1229
  SCell *pCell;

H
refact  
Hongze Cheng 已提交
1230
  ASSERT(TDB_BTREE_PAGE_IS_LEAF(pBtc->pPage));
H
Hongze Cheng 已提交
1231 1232 1233

  if (pBtc->idx < 0) return -1;

H
Hongze Cheng 已提交
1234
  pBtc->idx++;
H
Hongze Cheng 已提交
1235 1236 1237 1238
  if (pBtc->idx < TDB_PAGE_TOTAL_CELLS(pBtc->pPage)) {
    return 0;
  }

H
Hongze Cheng 已提交
1239
  // move upward
H
Hongze Cheng 已提交
1240
  for (;;) {
H
Hongze Cheng 已提交
1241 1242 1243 1244 1245
    if (pBtc->iPage == 0) {
      pBtc->idx = -1;
      return 0;
    }

H
Hongze Cheng 已提交
1246 1247 1248
    tdbBtcMoveUpward(pBtc);
    pBtc->idx++;

H
Hongze Cheng 已提交
1249 1250
    ASSERT(!TDB_BTREE_PAGE_IS_LEAF(pBtc->pPage));
    if (pBtc->idx <= TDB_PAGE_TOTAL_CELLS(pBtc->pPage)) {
H
Hongze Cheng 已提交
1251 1252
      break;
    }
H
Hongze Cheng 已提交
1253 1254
  }

H
Hongze Cheng 已提交
1255
  // move downward
H
Hongze Cheng 已提交
1256
  for (;;) {
H
Hongze Cheng 已提交
1257
    if (TDB_BTREE_PAGE_IS_LEAF(pBtc->pPage)) break;
H
Hongze Cheng 已提交
1258

H
Hongze Cheng 已提交
1259 1260 1261 1262
    ret = tdbBtcMoveDownward(pBtc);
    if (ret < 0) {
      ASSERT(0);
      return -1;
H
Hongze Cheng 已提交
1263
    }
H
Hongze Cheng 已提交
1264 1265

    pBtc->idx = 0;
H
Hongze Cheng 已提交
1266 1267
  }

H
Hongze Cheng 已提交
1268 1269 1270
  return 0;
}

H
Hongze Cheng 已提交
1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284
static int tdbBtcMoveDownward(SBTC *pBtc) {
  int    ret;
  SPgno  pgno;
  SCell *pCell;

  ASSERT(pBtc->idx >= 0);
  ASSERT(!TDB_BTREE_PAGE_IS_LEAF(pBtc->pPage));

  if (pBtc->idx < TDB_PAGE_TOTAL_CELLS(pBtc->pPage)) {
    pCell = tdbPageGetCell(pBtc->pPage, pBtc->idx);
    pgno = ((SPgno *)pCell)[0];
  } else {
    pgno = ((SIntHdr *)pBtc->pPage->pData)->pgno;
  }
H
Hongze Cheng 已提交
1285

H
Hongze Cheng 已提交
1286 1287 1288 1289 1290
  pBtc->pgStack[pBtc->iPage] = pBtc->pPage;
  pBtc->idxStack[pBtc->iPage] = pBtc->idx;
  pBtc->iPage++;
  pBtc->pPage = NULL;
  pBtc->idx = -1;
H
Hongze Cheng 已提交
1291

H
Hongze Cheng 已提交
1292
  ret = tdbPagerFetchPage(pBtc->pBt->pPager, pgno, &pBtc->pPage, tdbBtreeInitPage, pBtc->pBt, pBtc->pTxn);
H
Hongze Cheng 已提交
1293 1294
  if (ret < 0) {
    ASSERT(0);
H
Hongze Cheng 已提交
1295
    return -1;
H
Hongze Cheng 已提交
1296 1297 1298 1299 1300 1301
  }

  return 0;
}

static int tdbBtcMoveUpward(SBTC *pBtc) {
H
Hongze Cheng 已提交
1302 1303
  if (pBtc->iPage == 0) return -1;

H
Hongze Cheng 已提交
1304
  tdbPagerReturnPage(pBtc->pBt->pPager, pBtc->pPage, pBtc->pTxn);
H
Hongze Cheng 已提交
1305 1306 1307 1308 1309

  pBtc->iPage--;
  pBtc->pPage = pBtc->pgStack[pBtc->iPage];
  pBtc->idx = pBtc->idxStack[pBtc->iPage];

H
Hongze Cheng 已提交
1310
  return 0;
H
Hongze Cheng 已提交
1311
}
H
refact  
Hongze Cheng 已提交
1312

H
Hongze Cheng 已提交
1313
static int tdbBtcMoveTo(SBTC *pBtc, const void *pKey, int kLen, int *pCRst) {
H
Hongze Cheng 已提交
1314
  int          ret;
H
Hongze Cheng 已提交
1315 1316
  int          nCells;
  int          c;
H
Hongze Cheng 已提交
1317 1318 1319 1320
  SBTree      *pBt;
  SCell       *pCell;
  SPager      *pPager;
  SCellDecoder cd = {0};
H
Hongze Cheng 已提交
1321 1322 1323 1324 1325

  pBt = pBtc->pBt;
  pPager = pBt->pPager;

  if (pBtc->iPage < 0) {
H
Hongze Cheng 已提交
1326
    // move from a clear cursor
H
Hongze Cheng 已提交
1327
    ret = tdbPagerFetchPage(pPager, pBt->root, &(pBtc->pPage), tdbBtreeInitPage, pBt, pBtc->pTxn);
H
Hongze Cheng 已提交
1328
    if (ret < 0) {
H
Hongze Cheng 已提交
1329
      // TODO
H
Hongze Cheng 已提交
1330
      ASSERT(0);
H
Hongze Cheng 已提交
1331
      return 0;
H
Hongze Cheng 已提交
1332 1333 1334
    }

    pBtc->iPage = 0;
H
Hongze Cheng 已提交
1335 1336 1337 1338
    pBtc->idx = -1;
    // for empty tree, just return with an invalid position
    if (TDB_PAGE_TOTAL_CELLS(pBtc->pPage) == 0) return 0;
  } else {
H
Hongze Cheng 已提交
1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372
    SPage *pPage;
    int    idx;
    int    iPage = 0;

    // downward search
    for (; iPage < pBtc->iPage; iPage++) {
      pPage = pBtc->pgStack[iPage];
      idx = pBtc->idxStack[iPage];
      nCells = TDB_PAGE_TOTAL_CELLS(pPage);

      ASSERT(!TDB_BTREE_PAGE_IS_LEAF(pPage));

      // check if key <= current position
      if (idx < nCells) {
        pCell = tdbPageGetCell(pPage, idx);
        tdbBtreeDecodeCell(pPage, pCell, &cd);
        c = pBt->kcmpr(pKey, kLen, cd.pKey, cd.kLen);
        if (c > 0) break;
      }

      // check if key > current - 1 position
      if (idx > 0) {
        pCell = tdbPageGetCell(pPage, idx - 1);
        tdbBtreeDecodeCell(pPage, pCell, &cd);
        c = pBt->kcmpr(pKey, kLen, cd.pKey, cd.kLen);
        if (c <= 0) break;
      }
    }

    // move upward
    for (;;) {
      if (pBtc->iPage == iPage) break;
      tdbBtcMoveUpward(pBtc);
    }
H
Hongze Cheng 已提交
1373
  }
H
Hongze Cheng 已提交
1374

H
Hongze Cheng 已提交
1375 1376
  // search downward to the leaf
  for (;;) {
H
Hongze Cheng 已提交
1377
    int    lidx, ridx, midx;
H
Hongze Cheng 已提交
1378
    SPage *pPage;
H
Hongze Cheng 已提交
1379

H
Hongze Cheng 已提交
1380 1381 1382 1383
    pPage = pBtc->pPage;
    nCells = TDB_PAGE_TOTAL_CELLS(pPage);
    lidx = 0;
    ridx = nCells - 1;
H
Hongze Cheng 已提交
1384

H
Hongze Cheng 已提交
1385 1386
    ASSERT(nCells > 0);
    ASSERT(pBtc->idx == -1);
H
Hongze Cheng 已提交
1387

H
Hongze Cheng 已提交
1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411
    // compare first cell
    midx = lidx;
    pCell = tdbPageGetCell(pPage, midx);
    tdbBtreeDecodeCell(pPage, pCell, &cd);
    c = pBt->kcmpr(pKey, kLen, cd.pKey, cd.kLen);
    if (c <= 0) {
      ridx = lidx - 1;
    } else {
      lidx = lidx + 1;
    }

    // compare last cell
    if (lidx <= ridx) {
      midx = ridx;
      pCell = tdbPageGetCell(pPage, midx);
      tdbBtreeDecodeCell(pPage, pCell, &cd);
      c = pBt->kcmpr(pKey, kLen, cd.pKey, cd.kLen);
      if (c >= 0) {
        lidx = ridx + 1;
      } else {
        ridx = ridx - 1;
      }
    }

H
Hongze Cheng 已提交
1412 1413 1414
    // binary search
    for (;;) {
      if (lidx > ridx) break;
H
Hongze Cheng 已提交
1415

H
Hongze Cheng 已提交
1416
      midx = (lidx + ridx) >> 1;
H
Hongze Cheng 已提交
1417

H
Hongze Cheng 已提交
1418 1419 1420 1421 1422 1423 1424
      pCell = tdbPageGetCell(pPage, midx);
      ret = tdbBtreeDecodeCell(pPage, pCell, &cd);
      if (ret < 0) {
        // TODO: handle error
        ASSERT(0);
        return -1;
      }
H
Hongze Cheng 已提交
1425

H
Hongze Cheng 已提交
1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436
      // Compare the key values
      c = pBt->kcmpr(pKey, kLen, cd.pKey, cd.kLen);
      if (c < 0) {
        // pKey < cd.pKey
        ridx = midx - 1;
      } else if (c > 0) {
        // pKey > cd.pKey
        lidx = midx + 1;
      } else {
        // pKey == cd.pKey
        break;
H
Hongze Cheng 已提交
1437
      }
H
Hongze Cheng 已提交
1438
    }
H
Hongze Cheng 已提交
1439

H
Hongze Cheng 已提交
1440 1441 1442 1443 1444 1445 1446
    // keep search downward or break
    if (TDB_BTREE_PAGE_IS_LEAF(pPage)) {
      pBtc->idx = midx;
      *pCRst = c;
      break;
    } else {
      if (c <= 0) {
H
Hongze Cheng 已提交
1447 1448
        pBtc->idx = midx;
      } else {
H
Hongze Cheng 已提交
1449
        pBtc->idx = midx + 1;
H
Hongze Cheng 已提交
1450
      }
H
Hongze Cheng 已提交
1451
      tdbBtcMoveDownward(pBtc);
H
Hongze Cheng 已提交
1452 1453 1454 1455 1456 1457
    }
  }

  return 0;
}

H
refact  
Hongze Cheng 已提交
1458 1459 1460 1461 1462 1463
int tdbBtcClose(SBTC *pBtc) {
  if (pBtc->iPage < 0) return 0;

  for (;;) {
    ASSERT(pBtc->pPage);

H
Hongze Cheng 已提交
1464
    tdbPagerReturnPage(pBtc->pBt->pPager, pBtc->pPage, pBtc->pTxn);
H
refact  
Hongze Cheng 已提交
1465 1466 1467 1468 1469 1470 1471 1472 1473 1474

    pBtc->iPage--;
    if (pBtc->iPage < 0) break;

    pBtc->pPage = pBtc->pgStack[pBtc->iPage];
    pBtc->idx = pBtc->idxStack[pBtc->iPage];
  }

  return 0;
}
H
refact  
Hongze Cheng 已提交
1475
// TDB_BTREE_CURSOR
H
Hongze Cheng 已提交
1476

H
refact  
Hongze Cheng 已提交
1477
// TDB_BTREE_DEBUG =====================
H
Hongze Cheng 已提交
1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496
#ifndef NODEBUG
typedef struct {
  SPgno pgno;
  u8    root;
  u8    leaf;
  SPgno rChild;
  int   nCells;
  int   nOvfl;
} SBtPageInfo;

SBtPageInfo btPageInfos[20];

void tdbBtPageInfo(SPage *pPage, int idx) {
  SBtPageInfo *pBtPageInfo;

  pBtPageInfo = btPageInfos + idx;

  pBtPageInfo->pgno = TDB_PAGE_PGNO(pPage);

H
refact  
Hongze Cheng 已提交
1497 1498
  pBtPageInfo->root = TDB_BTREE_PAGE_IS_ROOT(pPage);
  pBtPageInfo->leaf = TDB_BTREE_PAGE_IS_LEAF(pPage);
H
Hongze Cheng 已提交
1499 1500 1501 1502 1503 1504 1505 1506 1507

  pBtPageInfo->rChild = 0;
  if (!pBtPageInfo->leaf) {
    pBtPageInfo->rChild = *(SPgno *)(pPage->pData + 1);
  }

  pBtPageInfo->nCells = TDB_PAGE_TOTAL_CELLS(pPage) - pPage->nOverflow;
  pBtPageInfo->nOvfl = pPage->nOverflow;
}
H
refact  
Hongze Cheng 已提交
1508 1509
#endif
// TDB_BTREE_DEBUG