未验证 提交 e138d297 编写于 作者: E Evan 提交者: GitHub

add text analyzer for es (#6249)

上级 c52aa023
......@@ -76,6 +76,7 @@ Release Notes.
* OAL supports multiple values when as numeric
* Add node information from the Openensus proto to the labels of the samples, to support the identification of the source of the Metric data.
* Fix bug that the same sample name in one MAL expression caused `IllegalArgumentException` in `Analyzer.analyse`.
* Add the text analyzer for querying log in the es storage.
#### UI
* Fix un-removed tags in trace query.
......
......@@ -71,6 +71,8 @@ storage:
metadataQueryMaxSize: ${SW_STORAGE_ES_QUERY_MAX_SIZE:5000}
segmentQueryMaxSize: ${SW_STORAGE_ES_QUERY_SEGMENT_SIZE:200}
profileTaskQueryMaxSize: ${SW_STORAGE_ES_QUERY_PROFILE_TASK_SIZE:200}
oapAnalyzer: ${SW_STORAGE_ES_OAP_ANALYZER:"{\"analyzer\":{\"oap_analyzer\":{\"type\":\"stop\"}}}"} # the oap analyzer.
oapLogAnalyzer: ${SW_STORAGE_ES_OAP_LOG_ANALYZER:"{\"analyzer\":{\"oap_log_analyzer\":{\"type\":\"standard\"}}}"} # the oap log analyzer. It could be customized by the ES analyzer configuration to support more language log formats, such as Chinese log, Japanese log and etc.
advanced: ${SW_STORAGE_ES_ADVANCED:""}
```
......
......@@ -128,6 +128,8 @@ storage:
metadataQueryMaxSize: ${SW_STORAGE_ES_QUERY_MAX_SIZE:5000}
segmentQueryMaxSize: ${SW_STORAGE_ES_QUERY_SEGMENT_SIZE:200}
profileTaskQueryMaxSize: ${SW_STORAGE_ES_QUERY_PROFILE_TASK_SIZE:200}
oapAnalyzer: ${SW_STORAGE_ES_OAP_ANALYZER:"{\"analyzer\":{\"oap_analyzer\":{\"type\":\"stop\"}}}"} # the oap analyzer.
oapLogAnalyzer: ${SW_STORAGE_ES_OAP_LOG_ANALYZER:"{\"analyzer\":{\"oap_log_analyzer\":{\"type\":\"standard\"}}}"} # the oap log analyzer. It could be customized by the ES analyzer configuration to support more language log formats, such as Chinese log, Japanese log and etc.
advanced: ${SW_STORAGE_ES_ADVANCED:""}
elasticsearch7:
nameSpace: ${SW_NAMESPACE:""}
......@@ -153,6 +155,8 @@ storage:
metadataQueryMaxSize: ${SW_STORAGE_ES_QUERY_MAX_SIZE:5000}
segmentQueryMaxSize: ${SW_STORAGE_ES_QUERY_SEGMENT_SIZE:200}
profileTaskQueryMaxSize: ${SW_STORAGE_ES_QUERY_PROFILE_TASK_SIZE:200}
oapAnalyzer: ${SW_STORAGE_ES_OAP_ANALYZER:"{\"analyzer\":{\"oap_analyzer\":{\"type\":\"stop\"}}}"} # the oap analyzer.
oapLogAnalyzer: ${SW_STORAGE_ES_OAP_LOG_ANALYZER:"{\"analyzer\":{\"oap_log_analyzer\":{\"type\":\"standard\"}}}"} # the oap log analyzer. It could be customized by the ES analyzer configuration to support more language log formats, such as Chinese log, Japanese log and etc.
advanced: ${SW_STORAGE_ES_ADVANCED:""}
h2:
driver: ${SW_STORAGE_H2_DRIVER:org.h2.jdbcx.JdbcDataSource}
......
......@@ -87,7 +87,7 @@ public abstract class AbstractLogRecord extends Record {
private int contentType = ContentType.NONE.value();
@Setter
@Getter
@Column(columnName = CONTENT, length = 1_000_000, matchQuery = true)
@Column(columnName = CONTENT, length = 1_000_000, matchQuery = true, analyzer = Column.AnalyzerType.OAP_LOG_ANALYZER)
private String content;
@Setter
@Getter
......@@ -107,8 +107,8 @@ public abstract class AbstractLogRecord extends Record {
private List<String> tagsInString;
/**
* tags is a duplicate field of {@link #tagsInString}. Some storage don't support array values in a single
* column. Then, those implementations could use this raw data to generate necessary data structures.
* tags is a duplicate field of {@link #tagsInString}. Some storage don't support array values in a single column.
* Then, those implementations could use this raw data to generate necessary data structures.
*/
@Setter
@Getter
......
......@@ -87,6 +87,34 @@ public @interface Column {
*/
ValueDataType dataType() default ValueDataType.NOT_VALUE;
/**
* The storage analyzer mode.
*
* @since 8.4.0
*/
AnalyzerType analyzer() default AnalyzerType.OAP_ANALYZER;
/**
* The analyzer declares the text analysis mode.
*/
enum AnalyzerType {
/**
* The default analyzer.
*/
OAP_ANALYZER("oap_analyzer"),
/**
* The log analyzer.
*/
OAP_LOG_ANALYZER("oap_log_analyzer");
@Getter
private final String name;
AnalyzerType(final String name) {
this.name = name;
}
}
/**
* ValueDataType represents the data structure of value column. The persistent way of the value column determine the
* available ways to query the data.
......
......@@ -21,6 +21,7 @@ package org.apache.skywalking.oap.server.core.storage.model;
import java.lang.reflect.Type;
import lombok.Getter;
import org.apache.skywalking.oap.server.core.analysis.metrics.DataTable;
import org.apache.skywalking.oap.server.core.storage.annotation.Column;
@Getter
public class ModelColumn {
......@@ -30,6 +31,7 @@ public class ModelColumn {
private final boolean matchQuery;
private final boolean storageOnly;
private final int length;
private final Column.AnalyzerType analyzer;
public ModelColumn(ColumnName columnName,
Class<?> type,
......@@ -37,12 +39,14 @@ public class ModelColumn {
boolean matchQuery,
boolean storageOnly,
boolean isValue,
int length) {
int length,
Column.AnalyzerType analyzer) {
this.columnName = columnName;
this.type = type;
this.genericType = genericType;
this.matchQuery = matchQuery;
this.length = length;
this.analyzer = analyzer;
/*
* byte[] and {@link IntKeyLongValueHashMap} could never be query.
*/
......
......@@ -124,7 +124,8 @@ public class StorageModels implements IModelManager, ModelCreator, ModelManipula
modelColumns.add(
new ModelColumn(
new ColumnName(modelName, column.columnName()), field.getType(), field.getGenericType(),
column.matchQuery(), column.storageOnly(), column.dataType().isValue(), columnLength
column.matchQuery(), column.storageOnly(), column.dataType().isValue(), columnLength,
column.analyzer()
));
if (log.isDebugEnabled()) {
log.debug("The field named {} with the {} type", column.columnName(), field.getType());
......
......@@ -19,6 +19,7 @@
package org.apache.skywalking.oap.server.core.storage.model;
import org.apache.skywalking.oap.server.core.analysis.metrics.DataTable;
import org.apache.skywalking.oap.server.core.storage.annotation.Column;
import org.junit.Assert;
import org.junit.Test;
......@@ -26,20 +27,23 @@ public class ModelColumnTest {
@Test
public void testColumnDefine() {
ModelColumn column = new ModelColumn(new ColumnName("", "abc"), byte[].class, byte[].class, true,
false, true, 0
false, true, 0,
Column.AnalyzerType.OAP_ANALYZER
);
Assert.assertEquals(true, column.isStorageOnly());
Assert.assertEquals("abc", column.getColumnName().getName());
column = new ModelColumn(new ColumnName("", "abc"), DataTable.class, DataTable.class, true,
false, true, 200
false, true, 200,
Column.AnalyzerType.OAP_ANALYZER
);
Assert.assertEquals(true, column.isStorageOnly());
Assert.assertEquals("abc", column.getColumnName().getName());
Assert.assertEquals(200, column.getLength());
column = new ModelColumn(new ColumnName("", "abc"), String.class, String.class, true,
false, true, 200
false, true, 200,
Column.AnalyzerType.OAP_ANALYZER
);
Assert.assertEquals(false, column.isStorageOnly());
Assert.assertEquals("abc", column.getColumnName().getName());
......@@ -48,7 +52,8 @@ public class ModelColumnTest {
@Test(expected = IllegalArgumentException.class)
public void testConflictDefinition() {
ModelColumn column = new ModelColumn(new ColumnName("", "abc"), String.class, String.class,
true, true, true, 200
true, true, true, 200,
Column.AnalyzerType.OAP_ANALYZER
);
}
}
......@@ -41,7 +41,8 @@ public class StorageModuleElasticsearchConfig extends ModuleConfig {
private int indexReplicasNumber = 0;
private int indexShardsNumber = 1;
/**
* @since 8.2.0, the record day step is for super size dataset record index rolling when the value of it is greater than 0
* @since 8.2.0, the record day step is for super size dataset record index rolling when the value of it is greater
* than 0
*/
private int superDatasetDayStep = -1;
/**
......@@ -76,5 +77,17 @@ public class StorageModuleElasticsearchConfig extends ModuleConfig {
private int metadataQueryMaxSize = 5000;
private int segmentQueryMaxSize = 200;
private int profileTaskQueryMaxSize = 200;
/**
* The default analyzer for match query field. {@link org.apache.skywalking.oap.server.core.storage.annotation.Column.AnalyzerType#OAP_ANALYZER}
*
* @since 8.4.0
*/
private String oapAnalyzer = "{\"analyzer\":{\"oap_analyzer\":{\"type\":\"stop\"}}}";
/**
* The log analyzer for match query field. {@link org.apache.skywalking.oap.server.core.storage.annotation.Column.AnalyzerType#OAP_LOG_ANALYZER}
*
* @since 8.4.0
*/
private String oapLogAnalyzer = "{\"analyzer\":{\"oap_log_analyzer\":{\"type\":\"standard\"}}}";
private String advanced;
}
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.skywalking.oap.server.storage.plugin.elasticsearch.base;
import com.google.gson.Gson;
import com.google.gson.annotations.SerializedName;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
import org.apache.skywalking.oap.server.core.storage.StorageException;
import org.apache.skywalking.oap.server.core.storage.annotation.Column;
import org.apache.skywalking.oap.server.storage.plugin.elasticsearch.StorageModuleElasticsearchConfig;
@Getter
@Setter
@Slf4j
public class AnalyzerSetting {
/**
* A built-in or customised tokenizer.
*/
private Map<String, Object> tokenizer = new HashMap<>();
/**
* An optional array of built-in or customised character filters.
*/
@SerializedName("char_filter")
private Map<String, Object> charFilter = new HashMap<>();
/**
* An optional array of built-in or customised token filters.
*/
private Map<String, Object> filter = new HashMap<>();
/**
* The custom analyzers.
*/
private Map<String, Object> analyzer = new HashMap<>();
public void combine(AnalyzerSetting analyzerSetting) {
this.analyzer.putAll(analyzerSetting.getAnalyzer());
this.tokenizer.putAll(analyzerSetting.tokenizer);
this.filter.putAll(analyzerSetting.filter);
this.charFilter.putAll(analyzerSetting.charFilter);
}
@Override
public boolean equals(final Object o) {
if (this == o)
return true;
if (!(o instanceof AnalyzerSetting))
return false;
final AnalyzerSetting that = (AnalyzerSetting) o;
return getTokenizer().equals(that.getTokenizer()) &&
getCharFilter().equals(that.getCharFilter()) &&
getFilter().equals(that.getFilter()) &&
getAnalyzer().equals(that.getAnalyzer());
}
@Override
public int hashCode() {
return Objects.hash(getTokenizer(), getCharFilter(), getFilter(), getAnalyzer());
}
public enum Generator {
OAP_ANALYZER_SETTING_GENERATOR(
Column.AnalyzerType.OAP_ANALYZER,
config -> new Gson().fromJson(config.getOapAnalyzer(), AnalyzerSetting.class)
),
OAP_LOG_ANALYZER_SETTING_GENERATOR(
Column.AnalyzerType.OAP_LOG_ANALYZER,
config -> new Gson().fromJson(config.getOapLogAnalyzer(), AnalyzerSetting.class)
);
private final Column.AnalyzerType type;
private final GenerateAnalyzerSettingFunc func;
Generator(final Column.AnalyzerType type,
final GenerateAnalyzerSettingFunc func) {
this.type = type;
this.func = func;
}
public GenerateAnalyzerSettingFunc getGenerateFunc() {
return this.func;
}
public static Generator getGenerator(Column.AnalyzerType type) throws StorageException {
for (final Generator value : Generator.values()) {
if (value.type == type) {
return value;
}
}
throw new StorageException("cannot found the AnalyzerSettingGenerator for the " + type.getName() + " type");
}
}
@FunctionalInterface
public interface GenerateAnalyzerSettingFunc {
AnalyzerSetting generate(StorageModuleElasticsearchConfig config);
}
}
......@@ -21,6 +21,7 @@ package org.apache.skywalking.oap.server.storage.plugin.elasticsearch.base;
import com.google.gson.Gson;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import lombok.extern.slf4j.Slf4j;
import org.apache.skywalking.apm.util.StringUtil;
......@@ -101,7 +102,7 @@ public class StorageEsInstaller extends ModelInstaller {
}
}
protected Map<String, Object> createSetting(Model model) {
protected Map<String, Object> createSetting(Model model) throws StorageException {
Map<String, Object> setting = new HashMap<>();
setting.put("index.number_of_replicas", model.isSuperDataset()
......@@ -113,7 +114,7 @@ public class StorageEsInstaller extends ModelInstaller {
setting.put("index.refresh_interval", model.isRecord()
? TimeValue.timeValueSeconds(10).toString()
: TimeValue.timeValueSeconds(config.getFlushInterval()).toString());
setting.put("analysis.analyzer.oap_analyzer.type", "stop");
setting.put("analysis", getAnalyzerSetting(model.getColumns()));
if (!StringUtil.isEmpty(config.getAdvanced())) {
Map<String, Object> advancedSettings = gson.fromJson(config.getAdvanced(), Map.class);
advancedSettings.forEach(setting::put);
......@@ -121,6 +122,17 @@ public class StorageEsInstaller extends ModelInstaller {
return setting;
}
private Map getAnalyzerSetting(List<ModelColumn> analyzerTypes) throws StorageException {
AnalyzerSetting analyzerSetting = new AnalyzerSetting();
for (final ModelColumn column : analyzerTypes) {
AnalyzerSetting setting = AnalyzerSetting.Generator.getGenerator(column.getAnalyzer())
.getGenerateFunc()
.generate(config);
analyzerSetting.combine(setting);
}
return gson.fromJson(gson.toJson(analyzerSetting), Map.class);
}
protected Map<String, Object> createMapping(Model model) {
Map<String, Object> mapping = new HashMap<>();
Map<String, Object> type = new HashMap<>();
......@@ -135,17 +147,19 @@ public class StorageEsInstaller extends ModelInstaller {
String matchCName = MatchCNameBuilder.INSTANCE.build(columnDefine.getColumnName().getName());
Map<String, Object> originalColumn = new HashMap<>();
originalColumn.put("type", columnTypeEsMapping.transform(columnDefine.getType(), columnDefine.getGenericType()));
originalColumn.put(
"type", columnTypeEsMapping.transform(columnDefine.getType(), columnDefine.getGenericType()));
originalColumn.put("copy_to", matchCName);
properties.put(columnDefine.getColumnName().getName(), originalColumn);
Map<String, Object> matchColumn = new HashMap<>();
matchColumn.put("type", "text");
matchColumn.put("analyzer", "oap_analyzer");
matchColumn.put("analyzer", columnDefine.getAnalyzer().getName());
properties.put(matchCName, matchColumn);
} else {
Map<String, Object> column = new HashMap<>();
column.put("type", columnTypeEsMapping.transform(columnDefine.getType(), columnDefine.getGenericType()));
column.put(
"type", columnTypeEsMapping.transform(columnDefine.getType(), columnDefine.getGenericType()));
if (columnDefine.isStorageOnly()) {
column.put("index", false);
}
......
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.skywalking.oap.server.storage.plugin.elasticsearch.base;
import com.google.gson.Gson;
import java.util.Arrays;
import java.util.HashMap;
import org.apache.skywalking.oap.server.storage.plugin.elasticsearch.StorageModuleElasticsearchConfig;
import org.junit.Assert;
import org.junit.Test;
public class AnalyzerSettingTest {
private final Gson gson = new Gson();
private static final String ANALYZER_JSON = "{\"analyzer\":{\"my_custom_analyzer\":{\"type\":\"custom\",\"char_filter\":[\"emoticons\"],\"tokenizer\":\"punctuation\",\"filter\":[\"lowercase\",\"english_stop\"]}},\"tokenizer\":{\"punctuation\":{\"type\":\"pattern\",\"pattern\":\"[ .,!?]\"}},\"char_filter\":{\"emoticons\":{\"type\":\"mapping\",\"mappings\":[\":) => _happy_\",\":( => _sad_\"]}},\"filter\":{\"english_stop\":{\"type\":\"stop\",\"stopwords\":\"_english_\"}}}";
@Test
public void combine() {
StorageModuleElasticsearchConfig elasticsearchConfig = new StorageModuleElasticsearchConfig();
AnalyzerSetting oapAnalyzerSetting = gson.fromJson(elasticsearchConfig.getOapAnalyzer(), AnalyzerSetting.class);
Assert.assertEquals(oapAnalyzerSetting, getDefaultOapAnalyzer());
AnalyzerSetting oapLogAnalyzerSetting = gson.fromJson(
elasticsearchConfig.getOapLogAnalyzer(), AnalyzerSetting.class);
Assert.assertEquals(oapLogAnalyzerSetting, getDefaultOapLogAnalyzer());
AnalyzerSetting testAnalyzerSetting = gson.fromJson(ANALYZER_JSON, AnalyzerSetting.class);
Assert.assertEquals(testAnalyzerSetting, getTestOapAnalyzerSetting());
oapAnalyzerSetting.combine(oapLogAnalyzerSetting);
oapAnalyzerSetting.combine(testAnalyzerSetting);
Assert.assertEquals(oapAnalyzerSetting, getMergedAnalyzerSetting());
}
private AnalyzerSetting getMergedAnalyzerSetting() {
AnalyzerSetting analyzerSetting = new AnalyzerSetting();
analyzerSetting.setTokenizer(new HashMap<String, Object>() {
{
put("punctuation", new HashMap<String, Object>() {
{
put("type", "pattern");
put("pattern", "[ .,!?]");
}
});
}
});
analyzerSetting.setCharFilter(new HashMap<String, Object>() {
{
put("emoticons", new HashMap<String, Object>() {
{
put("type", "mapping");
put("mappings", Arrays.asList(":) => _happy_", ":( => _sad_"));
}
});
}
});
analyzerSetting.setFilter(new HashMap<String, Object>() {
{
put("english_stop", new HashMap<String, Object>() {
{
put("type", "stop");
put("stopwords", "_english_");
}
});
}
});
analyzerSetting.setAnalyzer(new HashMap<String, Object>() {
{
put("my_custom_analyzer", new HashMap<String, Object>() {
{
put("type", "custom");
put("char_filter", Arrays.asList("emoticons"));
put("tokenizer", "punctuation");
put("filter", Arrays.asList("lowercase", "english_stop"));
}
});
put("oap_log_analyzer", new HashMap<String, Object>() {
{
put("type", "standard");
}
});
put("oap_analyzer", new HashMap<String, Object>() {
{
put("type", "stop");
}
});
}
});
return analyzerSetting;
}
private AnalyzerSetting getTestOapAnalyzerSetting() {
AnalyzerSetting analyzerSetting = new AnalyzerSetting();
analyzerSetting.setTokenizer(new HashMap<String, Object>() {
{
put("punctuation", new HashMap<String, Object>() {
{
put("type", "pattern");
put("pattern", "[ .,!?]");
}
});
}
});
analyzerSetting.setCharFilter(new HashMap<String, Object>() {
{
put("emoticons", new HashMap<String, Object>() {
{
put("type", "mapping");
put("mappings", Arrays.asList(":) => _happy_", ":( => _sad_"));
}
});
}
});
analyzerSetting.setFilter(new HashMap<String, Object>() {
{
put("english_stop", new HashMap<String, Object>() {
{
put("type", "stop");
put("stopwords", "_english_");
}
});
}
});
analyzerSetting.setAnalyzer(new HashMap<String, Object>() {
{
put("my_custom_analyzer", new HashMap<String, Object>() {
{
put("type", "custom");
put("char_filter", Arrays.asList("emoticons"));
put("tokenizer", "punctuation");
put("filter", Arrays.asList("lowercase", "english_stop"));
}
});
}
});
return analyzerSetting;
}
private AnalyzerSetting getDefaultOapAnalyzer() {
AnalyzerSetting analyzerSetting = new AnalyzerSetting();
HashMap<String, Object> map = new HashMap<>();
map.put("oap_analyzer", new HashMap<String, Object>() {
{
put("type", "stop");
}
});
analyzerSetting.setAnalyzer(map);
return analyzerSetting;
}
private AnalyzerSetting getDefaultOapLogAnalyzer() {
AnalyzerSetting analyzerSetting = new AnalyzerSetting();
HashMap<String, Object> analyzerMap = new HashMap<String, Object>() {
{
put("oap_log_analyzer", new HashMap<String, Object>() {
{
put("type", "standard");
}
});
}
};
analyzerSetting.setAnalyzer(analyzerMap);
return analyzerSetting;
}
}
\ No newline at end of file
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册