未验证 提交 6ce25563 编写于 作者: dengyihao's avatar dengyihao 提交者: GitHub

Merge pull request #11137 from taosdata/feature/fst_update_query

add fuzzy search
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef __INDEX_FST_DFA_H__
#define __INDEX_FST_DFA_H__
#include "indexFstRegex.h"
#include "indexFstSparse.h"
#include "tarray.h"
#include "thash.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct FstDfa FstDfa;
typedef struct {
SArray * insts;
uint32_t next[256];
bool isMatch;
} State;
/*
* dfa builder related func
**/
typedef struct FstDfaBuilder {
FstDfa * dfa;
SHashObj *cache;
} FstDfaBuilder;
FstDfaBuilder *dfaBuilderCreate(SArray *insts);
void dfaBuilderDestroy(FstDfaBuilder *builder);
FstDfa *dfaBuilderBuild(FstDfaBuilder *builder);
bool dfaBuilderRunState(FstDfaBuilder *builder, FstSparseSet *cur, FstSparseSet *next, uint32_t state, uint8_t bytes,
uint32_t *result);
bool dfaBuilderCachedState(FstDfaBuilder *builder, FstSparseSet *set, uint32_t *result);
/*
* dfa related func
**/
typedef struct FstDfa {
SArray *insts;
SArray *states;
} FstDfa;
FstDfa *dfaCreate(SArray *insts, SArray *states);
bool dfaIsMatch(FstDfa *dfa, uint32_t si);
bool dfaAccept(FstDfa *dfa, uint32_t si, uint8_t byte, uint32_t *result);
void dfaAdd(FstDfa *dfa, FstSparseSet *set, uint32_t ip);
bool dfaRun(FstDfa *dfa, FstSparseSet *from, FstSparseSet *to, uint8_t byte);
#ifdef __cplusplus
}
#endif
#endif
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _TD_INDEX_FST_REGEX_H_
#define _TD_INDEX_FST_REGEX_H_
//#include "indexFstDfa.h"
#include "taos.h"
#include "tarray.h"
#include "tchecksum.h"
#include "thash.h"
#include "tlog.h"
#include "tutil.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef enum { MATCH, JUMP, SPLIT, RANGE } InstType;
typedef struct MatchValue {
} MatchValue;
typedef struct JumpValue {
uint32_t step;
} JumpValue;
typedef struct SplitValue {
uint32_t len1;
uint32_t len2;
} SplitValue;
typedef struct RangeValue {
uint8_t start;
uint8_t end;
} RangeValue;
typedef struct {
InstType ty;
union {
MatchValue mv;
JumpValue jv;
SplitValue sv;
RangeValue rv;
};
} Inst;
typedef struct {
char *orig;
void *dfa;
} FstRegex;
FstRegex *regexCreate(const char *str);
void regexSetup(FstRegex *regex, uint32_t size, const char *str);
// uint32_t regexStart()
#ifdef __cplusplus
}
#endif
#endif
......@@ -13,8 +13,8 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef _TD_INDEX_SPARSE_H_
#define _TD_INDEX_SPARSE_H_
#ifndef _TD_INDEX_FST_SPARSE_H_
#define _TD_INDEX_FST_SPARSE_H_
#include "tarray.h"
......
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "indexFstDfa.h"
#include "thash.h"
const static uint32_t STATE_LIMIT = 1000;
static int dfaInstsEqual(const void *a, const void *b, size_t size) {
SArray *ar = (SArray *)a;
SArray *br = (SArray *)b;
size_t al = ar != NULL ? taosArrayGetSize(ar) : 0;
size_t bl = br != NULL ? taosArrayGetSize(br) : 0;
if (al != bl) {
return -1;
}
for (int i = 0; i < al; i++) {
uint32_t v1 = *(uint32_t *)taosArrayGet(ar, i);
uint32_t v2 = *(uint32_t *)taosArrayGet(br, i);
if (v1 != v2) {
return -1;
}
}
return 0;
}
FstDfaBuilder *dfaBuilderCreate(SArray *insts) {
FstDfaBuilder *builder = taosMemoryCalloc(1, sizeof(FstDfaBuilder));
if (builder == NULL) {
return NULL;
}
SArray *states = taosArrayInit(4, sizeof(State));
builder->dfa = dfaCreate(insts, states);
builder->cache = taosHashInit(
4, taosGetDefaultHashFunction(POINTER_BYTES == sizeof(int64_t) ? TSDB_DATA_TYPE_BIGINT : TSDB_DATA_TYPE_INT),
false, HASH_NO_LOCK);
taosHashSetEqualFp(builder->cache, dfaInstsEqual);
return builder;
}
void dfaBuilderDestroy(FstDfaBuilder *builder) {
if (builder == NULL) {
return;
}
void *pIter = builder->cache != NULL ? taosHashIterate(builder->cache, NULL) : NULL;
while (pIter) {
SArray **key = pIter;
taosArrayDestroy(*key);
pIter = taosHashIterate(builder->cache, pIter);
}
taosHashCleanup(builder->cache);
}
FstDfa *dfaBuilderBuild(FstDfaBuilder *builder) {
uint32_t sz = taosArrayGetSize(builder->dfa->insts);
FstSparseSet *cur = sparSetCreate(sz);
FstSparseSet *nxt = sparSetCreate(sz);
dfaAdd(builder->dfa, cur, 0);
SArray * states = taosArrayInit(0, sizeof(uint32_t));
uint32_t result;
if (dfaBuilderCachedState(builder, cur, &result)) {
taosArrayPush(states, &result);
}
SHashObj *seen = taosHashInit(12, taosGetDefaultHashFunction(TSDB_DATA_TYPE_INT), false, HASH_NO_LOCK);
while (taosArrayGetSize(states) != 0) {
result = *(uint32_t *)taosArrayPop(states);
for (int i = 0; i < 256; i++) {
uint32_t ns, dummpy = 0;
if (dfaBuilderRunState(builder, cur, nxt, result, i, &ns)) {
if (taosHashGet(seen, &ns, sizeof(ns)) == NULL) {
taosHashPut(seen, &ns, sizeof(ns), &dummpy, sizeof(dummpy));
taosArrayPush(states, &ns);
}
}
if (taosArrayGetSize(builder->dfa->states) > STATE_LIMIT) {
// Too many state;
//
}
}
}
taosArrayDestroy(states);
taosHashCleanup(seen);
return builder->dfa;
}
bool dfaBuilderRunState(FstDfaBuilder *builder, FstSparseSet *cur, FstSparseSet *next, uint32_t state, uint8_t byte,
uint32_t *result) {
sparSetClear(cur);
State *t = taosArrayGet(builder->dfa->states, state);
for (int i = 0; i < taosArrayGetSize(t->insts); i++) {
uint32_t ip = *(int32_t *)taosArrayGet(t->insts, i);
sparSetAdd(cur, ip);
}
dfaRun(builder->dfa, cur, next, byte);
t = taosArrayGet(builder->dfa->states, state);
uint32_t nxtState;
if (dfaBuilderCachedState(builder, next, &nxtState)) {
t->next[byte] = nxtState;
*result = nxtState;
return true;
}
return false;
}
bool dfaBuilderCachedState(FstDfaBuilder *builder, FstSparseSet *set, uint32_t *result) {
SArray *tinsts = taosArrayInit(4, sizeof(uint32_t));
bool isMatch = false;
for (int i = 0; i < sparSetLen(set); i++) {
uint32_t ip = sparSetGet(set, i);
Inst *inst = taosArrayGet(builder->dfa->insts, ip);
if (inst->ty == JUMP || inst->ty == SPLIT) {
continue;
} else if (inst->ty == RANGE) {
taosArrayPush(tinsts, &ip);
} else if (inst->ty == MATCH) {
isMatch = true;
taosArrayPush(tinsts, &ip);
}
}
if (taosArrayGetSize(tinsts) == 0) {
return false;
}
uint32_t *v = taosHashGet(builder->cache, &tinsts, sizeof(POINTER_BYTES));
if (v != NULL) {
*result = *v;
taosArrayDestroy(tinsts);
} else {
State st;
st.insts = tinsts;
st.isMatch = isMatch;
taosArrayPush(builder->dfa->states, &st);
int32_t sz = taosArrayGetSize(builder->dfa->states) - 1;
taosHashPut(builder->cache, &tinsts, sizeof(POINTER_BYTES), &sz, sizeof(sz));
*result = sz;
}
return true;
}
FstDfa *dfaCreate(SArray *insts, SArray *states) {
FstDfa *dfa = taosMemoryCalloc(1, sizeof(FstDfa));
if (dfa == NULL) {
return NULL;
}
dfa->insts = insts;
dfa->states = states;
return dfa;
}
bool dfaIsMatch(FstDfa *dfa, uint32_t si) {
if (dfa->states == NULL || si < taosArrayGetSize(dfa->states)) {
return false;
}
State *st = taosArrayGet(dfa->states, si);
return st != NULL ? st->isMatch : false;
}
bool dfaAccept(FstDfa *dfa, uint32_t si, uint8_t byte, uint32_t *result) {
if (dfa->states == NULL || si < taosArrayGetSize(dfa->states)) {
return false;
}
State *st = taosArrayGet(dfa->states, si);
*result = st->next[byte];
return true;
}
void dfaAdd(FstDfa *dfa, FstSparseSet *set, uint32_t ip) {
if (sparSetContains(set, ip)) {
return;
}
sparSetAdd(set, ip);
Inst *inst = taosArrayGet(dfa->insts, ip);
if (inst->ty == MATCH || inst->ty == RANGE) {
// do nothing
} else if (inst->ty == JUMP) {
dfaAdd(dfa, set, inst->jv.step);
} else if (inst->ty == SPLIT) {
dfaAdd(dfa, set, inst->sv.len1);
dfaAdd(dfa, set, inst->sv.len2);
}
return;
}
bool dfaRun(FstDfa *dfa, FstSparseSet *from, FstSparseSet *to, uint8_t byte) {
bool isMatch = false;
sparSetClear(to);
for (int i = 0; i < sparSetLen(from); i++) {
uint32_t ip = sparSetGet(from, i);
Inst *inst = taosArrayGet(dfa->insts, ip);
if (inst->ty == JUMP || inst->ty == SPLIT) {
continue;
} else if (inst->ty == MATCH) {
isMatch = true;
} else if (inst->ty == RANGE) {
if (inst->rv.start <= byte && byte <= inst->rv.end) {
dfaAdd(dfa, to, ip + 1);
}
}
}
return isMatch;
}
/*
* Copyright (c) 2019 TAOS Data, Inc. <jhtao@taosdata.com>
*
* This program is free software: you can use, redistribute, and/or modify
* it under the terms of the GNU Affero General Public License, version 3
* or later ("AGPL"), as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "indexFstRegex.h"
#include "indexFstSparse.h"
FstRegex *regexCreate(const char *str) {
FstRegex *regex = taosMemoryCalloc(1, sizeof(FstRegex));
if (regex == NULL) {
return NULL;
}
int32_t sz = (int32_t)strlen(str);
char * orig = taosMemoryCalloc(1, sz);
memcpy(orig, str, sz);
regex->orig = orig;
}
void regexSetup(FstRegex *regex, uint32_t size, const char *str) {
// return
// return;
}
......@@ -13,7 +13,7 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include "indexSparse.h"
#include "indexFstSparse.h"
FstSparseSet *sparSetCreate(int32_t sz) {
FstSparseSet *ss = taosMemoryCalloc(1, sizeof(FstSparseSet));
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册