提交 7fb827de 编写于 作者: B Benguang Zhao

fix: refactor walScanLogGetLastVer, walCheckAndRepairMeta, and walCheckAndRepairIdx.

    * search for the last entry in the contiguous range of valid WAL from the last pos fsynced as firstTrial
    * find the last entry before the last pos fsynced backwards as second trial
    * reserve sufficient space for computing CRC32 checksum, esp. of WAL record body
    * rebuild meta info to resolve potential misalignment between lists of meta and actual log files
    * retract commitIndex and appliedIndex to lastLogIndex if needed
    * put an upper size limit on possibly corrupted WAL range to be recovered
上级 3af05070
......@@ -43,6 +43,7 @@ extern "C" {
#define WAL_FILE_LEN (WAL_PATH_LEN + 32)
#define WAL_MAGIC 0xFAFBFCFDF4F3F2F1ULL
#define WAL_SCAN_BUF_SIZE (1024 * 1024 * 3)
#define WAL_RECOV_SIZE_LIMIT (100 * WAL_SCAN_BUF_SIZE)
typedef enum {
TAOS_WAL_WRITE = 1,
......
......@@ -450,6 +450,7 @@ int32_t* taosGetErrno();
#define TSDB_CODE_WAL_INVALID_VER TAOS_DEF_ERROR_CODE(0, 0x1003)
#define TSDB_CODE_WAL_OUT_OF_MEMORY TAOS_DEF_ERROR_CODE(0, 0x1004)
#define TSDB_CODE_WAL_LOG_NOT_EXIST TAOS_DEF_ERROR_CODE(0, 0x1005)
#define TSDB_CODE_WAL_CHKSUM_MISMATCH TAOS_DEF_ERROR_CODE(0, 0x1006)
// tfs
#define TSDB_CODE_FS_INVLD_CFG TAOS_DEF_ERROR_CODE(0, 0x2201)
......
......@@ -146,7 +146,7 @@ void taos_close(TAOS *taos) {
int taos_errno(TAOS_RES *res) {
if (res == NULL || TD_RES_TMQ_META(res)) {
if (terrno == TSDB_CODE_RPC_REDIRECT) terrno = TSDB_CODE_RPC_NETWORK_UNAVAIL;
if (terrno == TSDB_CODE_RPC_REDIRECT) terrno = TSDB_CODE_QRY_NOT_READY;
return terrno;
}
......@@ -154,13 +154,12 @@ int taos_errno(TAOS_RES *res) {
return 0;
}
return ((SRequestObj *)res)->code == TSDB_CODE_RPC_REDIRECT ? TSDB_CODE_RPC_NETWORK_UNAVAIL
: ((SRequestObj *)res)->code;
return ((SRequestObj *)res)->code == TSDB_CODE_RPC_REDIRECT ? TSDB_CODE_QRY_NOT_READY : ((SRequestObj *)res)->code;
}
const char *taos_errstr(TAOS_RES *res) {
if (res == NULL || TD_RES_TMQ_META(res)) {
if (terrno == TSDB_CODE_RPC_REDIRECT) terrno = TSDB_CODE_RPC_NETWORK_UNAVAIL;
if (terrno == TSDB_CODE_RPC_REDIRECT) terrno = TSDB_CODE_QRY_NOT_READY;
return (const char *)tstrerror(terrno);
}
......@@ -172,7 +171,7 @@ const char *taos_errstr(TAOS_RES *res) {
if (NULL != pRequest->msgBuf && (strlen(pRequest->msgBuf) > 0 || pRequest->code == TSDB_CODE_RPC_FQDN_ERROR)) {
return pRequest->msgBuf;
} else {
return pRequest->code == TSDB_CODE_RPC_REDIRECT ? (const char *)tstrerror(TSDB_CODE_RPC_NETWORK_UNAVAIL)
return pRequest->code == TSDB_CODE_RPC_REDIRECT ? (const char *)tstrerror(TSDB_CODE_QRY_NOT_READY)
: (const char *)tstrerror(pRequest->code);
}
}
......
此差异已折叠。
......@@ -110,8 +110,8 @@ SWal *walOpen(const char *path, SWalCfg *pCfg) {
// init ref
pWal->pRefHash = taosHashInit(64, taosGetDefaultHashFunction(TSDB_DATA_TYPE_BIGINT), true, HASH_ENTRY_LOCK);
if (pWal->pRefHash == NULL) {
wError("failed to init hash since %s", tstrerror(terrno));
goto _err;
wError("failed to init hash since %s", tstrerror(terrno));
goto _err;
}
// open meta
......@@ -149,8 +149,8 @@ SWal *walOpen(const char *path, SWalCfg *pCfg) {
// add ref
pWal->refId = taosAddRef(tsWal.refSetId, pWal);
if (pWal->refId < 0) {
wError("failed to add ref for Wal since %s", tstrerror(terrno));
goto _err;
wError("failed to add ref for Wal since %s", tstrerror(terrno));
goto _err;
}
wDebug("vgId:%d, wal:%p is opened, level:%d fsyncPeriod:%d", pWal->cfg.vgId, pWal, pWal->cfg.level,
......
......@@ -476,7 +476,8 @@ int32_t walReadVer(SWalReader *pReader, int64_t ver) {
} else {
terrno = TSDB_CODE_WAL_FILE_CORRUPTED;
}
wError("vgId:%d, failed to read WAL record head from log file since %s", pReader->pWal->cfg.vgId, terrstr());
wError("vgId:%d, failed to read WAL record head, index:%" PRId64 ", from log file since %s",
pReader->pWal->cfg.vgId, ver, terrstr());
taosThreadMutexUnlock(&pReader->mutex);
return -1;
}
......@@ -509,7 +510,8 @@ int32_t walReadVer(SWalReader *pReader, int64_t ver) {
else {
terrno = TSDB_CODE_WAL_FILE_CORRUPTED;
}
wError("vgId:%d, failed to read WAL record body from log file since %s", pReader->pWal->cfg.vgId, terrstr());
wError("vgId:%d, failed to read WAL record body, index:%" PRId64 ", from log file since %s",
pReader->pWal->cfg.vgId, ver, terrstr());
taosThreadMutexUnlock(&pReader->mutex);
return -1;
}
......
......@@ -79,6 +79,11 @@ int64_t walChangeWrite(SWal* pWal, int64_t ver) {
TdFilePtr pIdxTFile, pLogTFile;
char fnameStr[WAL_FILE_LEN];
if (pWal->pLogFile != NULL) {
code = taosFsyncFile(pWal->pLogFile);
if (code != 0) {
terrno = TAOS_SYSTEM_ERROR(errno);
return -1;
}
code = taosCloseFile(&pWal->pLogFile);
if (code != 0) {
terrno = TAOS_SYSTEM_ERROR(errno);
......@@ -86,6 +91,11 @@ int64_t walChangeWrite(SWal* pWal, int64_t ver) {
}
}
if (pWal->pIdxFile != NULL) {
code = taosFsyncFile(pWal->pIdxFile);
if (code != 0) {
terrno = TAOS_SYSTEM_ERROR(errno);
return -1;
}
code = taosCloseFile(&pWal->pIdxFile);
if (code != 0) {
terrno = TAOS_SYSTEM_ERROR(errno);
......
......@@ -540,6 +540,11 @@ int32_t walWrite(SWal *pWal, int64_t index, tmsg_t msgType, const void *body, in
void walFsync(SWal *pWal, bool forceFsync) {
if (forceFsync || (pWal->cfg.level == TAOS_WAL_FSYNC && pWal->cfg.fsyncPeriod == 0)) {
wTrace("vgId:%d, fileId:%" PRId64 ".idx, do fsync", pWal->cfg.vgId, walGetCurFileFirstVer(pWal));
if (taosFsyncFile(pWal->pIdxFile) < 0) {
wError("vgId:%d, file:%" PRId64 ".idx, fsync failed since %s", pWal->cfg.vgId, walGetCurFileFirstVer(pWal),
strerror(errno));
}
wTrace("vgId:%d, fileId:%" PRId64 ".log, do fsync", pWal->cfg.vgId, walGetCurFileFirstVer(pWal));
if (taosFsyncFile(pWal->pLogFile) < 0) {
wError("vgId:%d, file:%" PRId64 ".log, fsync failed since %s", pWal->cfg.vgId, walGetCurFileFirstVer(pWal),
......
......@@ -447,12 +447,13 @@ TAOS_DEFINE_ERROR(TSDB_CODE_TQ_TABLE_SCHEMA_NOT_FOUND, "TQ table schema not f
TAOS_DEFINE_ERROR(TSDB_CODE_TQ_NO_COMMITTED_OFFSET, "TQ no commited offset")
// wal
TAOS_DEFINE_ERROR(TSDB_CODE_WAL_APP_ERROR, "Wal unexpected generic error")
TAOS_DEFINE_ERROR(TSDB_CODE_WAL_APP_ERROR, "WAL unexpected generic error")
TAOS_DEFINE_ERROR(TSDB_CODE_WAL_FILE_CORRUPTED, "WAL file is corrupted")
TAOS_DEFINE_ERROR(TSDB_CODE_WAL_SIZE_LIMIT, "WAL size exceeds limit")
TAOS_DEFINE_ERROR(TSDB_CODE_WAL_INVALID_VER, "WAL use invalid version")
TAOS_DEFINE_ERROR(TSDB_CODE_WAL_OUT_OF_MEMORY, "WAL out of memory")
TAOS_DEFINE_ERROR(TSDB_CODE_WAL_LOG_NOT_EXIST, "WAL log not exist")
TAOS_DEFINE_ERROR(TSDB_CODE_WAL_CHKSUM_MISMATCH, "WAL checksum mismatch")
// tfs
TAOS_DEFINE_ERROR(TSDB_CODE_FS_INVLD_CFG, "tfs invalid mount config")
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册