未验证 提交 68267b1b 编写于 作者: dengyihao's avatar dengyihao 提交者: GitHub

Merge pull request #8829 from taosdata/origin/3.0/fst

Origin/3.0/fst
...@@ -80,6 +80,9 @@ void fstBuilderInsertOutput(FstBuilder *b, FstSlice bs, Output in); ...@@ -80,6 +80,9 @@ void fstBuilderInsertOutput(FstBuilder *b, FstSlice bs, Output in);
OrderType fstBuilderCheckLastKey(FstBuilder *b, FstSlice bs, bool ckDup); OrderType fstBuilderCheckLastKey(FstBuilder *b, FstSlice bs, bool ckDup);
void fstBuilderCompileFrom(FstBuilder *b, uint64_t istate); void fstBuilderCompileFrom(FstBuilder *b, uint64_t istate);
CompiledAddr fstBuilderCompile(FstBuilder *b, FstBuilderNode *bn); CompiledAddr fstBuilderCompile(FstBuilder *b, FstBuilderNode *bn);
void* fstBuilerIntoInner(FstBuilder *b);
void fstBuilderFinish(FstBuilder *b);
...@@ -216,6 +219,15 @@ bool fstNodeFindInput(FstNode *node, uint8_t b, uint64_t *res); ...@@ -216,6 +219,15 @@ bool fstNodeFindInput(FstNode *node, uint8_t b, uint64_t *res);
bool fstNodeCompile(FstNode *node, void *w, CompiledAddr lastAddr, CompiledAddr addr, FstBuilderNode *builderNode); bool fstNodeCompile(FstNode *node, void *w, CompiledAddr lastAddr, CompiledAddr addr, FstBuilderNode *builderNode);
FstSlice fstNodeAsSlice(FstNode *node); FstSlice fstNodeAsSlice(FstNode *node);
// ops
typedef struct FstIndexedValue {
uint64_t index;
uint64_t value;
} FstIndexedValue;
FstLastTransition *fstLastTransitionCreate(uint8_t inp, Output out);
void fstLastTransitionDestroy(FstLastTransition *trn);
typedef struct FstMeta { typedef struct FstMeta {
...@@ -227,20 +239,20 @@ typedef struct FstMeta { ...@@ -227,20 +239,20 @@ typedef struct FstMeta {
} FstMeta; } FstMeta;
typedef struct Fst { typedef struct Fst {
FstMeta meta; FstMeta *meta;
void *data; // FstSlice *data; //
FstNode *root; //
} Fst; } Fst;
// ops // refactor simple function
typedef struct FstIndexedValue {
uint64_t index;
uint64_t value;
} FstIndexedValue;
FstLastTransition *fstLastTransitionCreate(uint8_t inp, Output out);
void fstLastTransitionDestroy(FstLastTransition *trn);
Fst* fstCreate(FstSlice *data);
void fstDestroy(Fst *fst);
bool fstGet(Fst *fst, FstSlice *b, Output *out);
FstNode* fstGetNode(Fst *fst, CompiledAddr);
FstType fstGetType(Fst *fst);
CompiledAddr fstGetRootAddr(Fst *fst);
Output fstEmptyFinalOutput(Fst *fst, bool *null);
bool fstVerify(Fst *fst);
#endif #endif
...@@ -27,9 +27,11 @@ typedef struct FstCountingWriter { ...@@ -27,9 +27,11 @@ typedef struct FstCountingWriter {
uint64_t fstCountingWriterWrite(FstCountingWriter *write, uint8_t *buf, uint32_t bufLen); uint64_t fstCountingWriterWrite(FstCountingWriter *write, uint8_t *buf, uint32_t bufLen);
int FstCountingWriterFlush(FstCountingWriter *write); int fstCountingWriterFlush(FstCountingWriter *write);
uint32_t fstCountingWriterMaskedCheckSum(FstCountingWriter *write);
FstCountingWriter *fstCountingWriterCreate(void *wtr); FstCountingWriter *fstCountingWriterCreate(void *wtr);
void fstCountingWriterDestroy(FstCountingWriter *w); void fstCountingWriterDestroy(FstCountingWriter *w);
......
...@@ -32,9 +32,9 @@ extern const CompiledAddr EMPTY_ADDRESS; ...@@ -32,9 +32,9 @@ extern const CompiledAddr EMPTY_ADDRESS;
extern const CompiledAddr NONE_ADDRESS; extern const CompiledAddr NONE_ADDRESS;
// This version number is written to every finite state transducer created by // This version number is written to every finite state transducer created by
// this crate. When a finite state transducer is read, its version number is // this version When a finite state transducer is read, its version number is
// checked against this value. // checked against this value.
extern const uint64_t version; extern const uint64_t VERSION;
// The threshold (in number of transitions) at which an index is created for // The threshold (in number of transitions) at which an index is created for
// a node's transitions. This speeds up lookup time at the expense of FST size // a node's transitions. This speeds up lookup time at the expense of FST size
......
...@@ -14,7 +14,8 @@ ...@@ -14,7 +14,8 @@
*/ */
#include "index_fst.h" #include "index_fst.h"
#include "tcoding.h"
#include "tchecksum.h"
static void fstPackDeltaIn(FstCountingWriter *wrt, CompiledAddr nodeAddr, CompiledAddr transAddr, uint8_t nBytes) { static void fstPackDeltaIn(FstCountingWriter *wrt, CompiledAddr nodeAddr, CompiledAddr transAddr, uint8_t nBytes) {
...@@ -98,7 +99,7 @@ void fstUnFinishedNodesAddSuffix(FstUnFinishedNodes *nodes, FstSlice bs, Output ...@@ -98,7 +99,7 @@ void fstUnFinishedNodesAddSuffix(FstUnFinishedNodes *nodes, FstSlice bs, Output
FstBuilderNodeUnfinished *un = taosArrayGet(nodes->stack, sz); FstBuilderNodeUnfinished *un = taosArrayGet(nodes->stack, sz);
assert(un->last == NULL); assert(un->last == NULL);
//FstLastTransition *trn = malloc(sizeof(FstLastTransition)); //FstLastTransition *trn = malloc(sizeof(FstLastTransition));
//trn->inp = s->data[s->start]; //trn->inp = s->data[s->start];
...@@ -146,24 +147,27 @@ uint64_t fstUnFinishedNodesFindCommPrefixAndSetOutput(FstUnFinishedNodes *node, ...@@ -146,24 +147,27 @@ uint64_t fstUnFinishedNodesFindCommPrefixAndSetOutput(FstUnFinishedNodes *node,
size_t lsz = (size_t)(s->end - s->start + 1); // data len size_t lsz = (size_t)(s->end - s->start + 1); // data len
size_t ssz = taosArrayGetSize(node->stack); // stack size size_t ssz = taosArrayGetSize(node->stack); // stack size
uint64_t res = 0; uint64_t i = 0;
for (size_t i = 0; i < lsz && i < ssz; i++) { for (i = 0; i < lsz && i < ssz; i++) {
FstBuilderNodeUnfinished *un = taosArrayGet(node->stack, i); FstBuilderNodeUnfinished *un = taosArrayGet(node->stack, i);
FstLastTransition *last = un->last; FstLastTransition *t = un->last;
if (last->inp == s->data[s->start + i]) { uint64_t addPrefix = 0;
uint64_t commPrefix = last->out; if (t && t->inp == s->data[s->start + i]) {
uint64_t addPrefix = last->out - commPrefix; uint64_t commPrefix = MIN(t->out, *out);
out = out - commPrefix; uint64_t tAddPrefix = t->out - commPrefix;
last->out = commPrefix; (*out) = (*out) - commPrefix;
if (addPrefix != 0) { t->out = commPrefix;
fstBuilderNodeUnfinishedAddOutputPrefix(un, addPrefix); addPrefix = tAddPrefix;
}
} else { } else {
break; break;
}
if (addPrefix != 0) {
fstBuilderNodeUnfinishedAddOutputPrefix(un, addPrefix);
} }
} }
return res; return i;
} }
...@@ -771,16 +775,16 @@ void fstBuilderInsertOutput(FstBuilder *b, FstSlice bs, Output in) { ...@@ -771,16 +775,16 @@ void fstBuilderInsertOutput(FstBuilder *b, FstSlice bs, Output in) {
return; return;
} }
Output out; Output out;
uint64_t prefixLen; //if (in != 0) { //if let Some(in) = in
if (in != 0) { //if let Some(in) = in // prefixLen = fstUnFinishedNodesFindCommPrefixAndSetOutput(b->unfinished, bs, in, &out);
prefixLen = fstUnFinishedNodesFindCommPrefixAndSetOutput(b->unfinished, bs, in, &out); //} else {
} else { // prefixLen = fstUnFinishedNodesFindCommPrefix(b->unfinished, bs);
prefixLen = fstUnFinishedNodesFindCommPrefix(b->unfinished, bs); // out = 0;
out = 0; //}
} uint64_t prefixLen = fstUnFinishedNodesFindCommPrefixAndSetOutput(b->unfinished, bs, in, &out);
if (prefixLen == FST_SLICE_LEN(s)) { if (prefixLen == FST_SLICE_LEN(s)) {
assert(out != 0); assert(out == 0);
return; return;
} }
...@@ -849,6 +853,31 @@ CompiledAddr fstBuilderCompile(FstBuilder *b, FstBuilderNode *bn) { ...@@ -849,6 +853,31 @@ CompiledAddr fstBuilderCompile(FstBuilder *b, FstBuilderNode *bn) {
return b->lastAddr; return b->lastAddr;
} }
void* fstBuilderInsertInner(FstBuilder *b) {
fstBuilderCompileFrom(b, 0);
FstBuilderNode *rootNode = fstUnFinishedNodesPopRoot(b->unfinished);
CompiledAddr rootAddr = fstBuilderCompile(b, rootNode);
uint8_t buf64[8] = {0};
taosEncodeFixedU64((void **)&buf64, b->len);
fstCountingWriterWrite(b->wrt, buf64, sizeof(buf64));
taosEncodeFixedU64((void **)&buf64, rootAddr);
fstCountingWriterWrite(b->wrt, buf64, sizeof(buf64));
uint8_t buf32[4] = {0};
uint32_t sum = fstCountingWriterMaskedCheckSum(b->wrt);
taosEncodeFixedU32((void **)&buf32, sum);
fstCountingWriterWrite(b->wrt, buf32, sizeof(buf32));
fstCountingWriterFlush(b->wrt);
return b->wrt;
}
void fstBuilderFinish(FstBuilder *b) {
fstBuilderInsertInner(b);
}
...@@ -894,4 +923,108 @@ void fstBuilderNodeUnfinishedAddOutputPrefix(FstBuilderNodeUnfinished *unNode, O ...@@ -894,4 +923,108 @@ void fstBuilderNodeUnfinishedAddOutputPrefix(FstBuilderNodeUnfinished *unNode, O
return; return;
} }
Fst* fstCreate(FstSlice *slice) {
char *buf = slice->data;
uint64_t skip = 0;
uint64_t len = slice->dLen;
if (len < 36) {
return NULL;
}
uint64_t version;
taosDecodeFixedU64(buf, &version);
skip += sizeof(version);
if (version == 0 || version > VERSION) {
return NULL;
}
uint64_t type;
taosDecodeFixedU64(buf + skip, &type);
skip += sizeof(type);
uint32_t checkSum = 0;
len -= sizeof(checkSum);
taosDecodeFixedU32(buf + len, &checkSum);
CompiledAddr rootAddr;
len -= sizeof(rootAddr);
taosDecodeFixedU64(buf + len, &rootAddr);
uint64_t fstLen;
len -= sizeof(fstLen);
taosDecodeFixedU64(buf + len, &fstLen);
//TODO(validat root addr)
//
Fst *fst= (Fst *)calloc(1, sizeof(Fst));
if (fst == NULL) { return NULL; }
fst->meta = (FstMeta *)malloc(sizeof(FstMeta));
if (NULL == fst->meta) {
goto FST_CREAT_FAILED;
}
fst->meta->version = version;
fst->meta->rootAddr = rootAddr;
fst->meta->ty = type;
fst->meta->len = fstLen;
fst->meta->checkSum = checkSum;
fst->data = slice;
return fst;
FST_CREAT_FAILED:
free(fst->meta);
free(fst);
}
void fstDestroy(Fst *fst) {
if (fst) {
free(fst->meta);
fstNodeDestroy(fst->root);
}
free(fst);
}
bool fstGet(Fst *fst, FstSlice *b, Output *out) {
return false;
}
FstNode* fstGetNode(Fst *fst, CompiledAddr addr) {
if (fst->root != NULL) {
return fst->root;
}
fst->root = fstNodeCreate(fst->meta->version, addr, fst->data);
return fst->root;
}
FstType fstGetType(Fst *fst) {
return fst->meta->ty;
}
CompiledAddr fstGetRootAddr(Fst *fst) {
return fst->meta->rootAddr;
}
Output fstEmptyFinalOutput(Fst *fst, bool *null) {
Output res = 0;
FstNode *node = fst->root;
if (FST_NODE_IS_FINAL(node)) {
*null = false;
res = FST_NODE_FINAL_OUTPUT(node);
} else {
*null = true;
}
return res;
}
bool fstVerify(Fst *fst) {
uint32_t checkSum = fst->meta->checkSum;
FstSlice *data = fst->data;
TSCKSUM initSum = 0;
if (taosCheckChecksumWhole(data->data, data->dLen)) {
return false;
}
}
...@@ -37,6 +37,9 @@ uint64_t fstCountingWriterWrite(FstCountingWriter *write, uint8_t *buf, uint32_t ...@@ -37,6 +37,9 @@ uint64_t fstCountingWriterWrite(FstCountingWriter *write, uint8_t *buf, uint32_t
return bufLen; return bufLen;
} }
uint32_t fstCountingWriterMaskedCheckSum(FstCountingWriter *write) {
return 0;
}
int fstCountingWriterFlush(FstCountingWriter *write) { int fstCountingWriterFlush(FstCountingWriter *write) {
//write->wtr->flush //write->wtr->flush
return 1; return 1;
......
...@@ -25,7 +25,7 @@ const CompiledAddr NONE_ADDRESS = 1; ...@@ -25,7 +25,7 @@ const CompiledAddr NONE_ADDRESS = 1;
// This version number is written to every finite state transducer created by // This version number is written to every finite state transducer created by
// this crate. When a finite state transducer is read, its version number is // this crate. When a finite state transducer is read, its version number is
// checked against this value. // checked against this value.
const uint64_t version = 3; const uint64_t VERSION = 3;
// The threshold (in number of transitions) at which an index is created for // The threshold (in number of transitions) at which an index is created for
// a node's transitions. This speeds up lookup time at the expense of FST size // a node's transitions. This speeds up lookup time at the expense of FST size
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册