未验证 提交 622a9c41 编写于 作者: J J.J. Liu 提交者: GitHub

[IOTDB-1403] Dictionary encoding for TEXT (#3218)

上级 e19ad057
......@@ -638,7 +638,7 @@ dateExpression
;
encoding
: PLAIN | PLAIN_DICTIONARY | RLE | DIFF | TS_2DIFF | GORILLA | REGULAR
: PLAIN | DICTIONARY | RLE | DIFF | TS_2DIFF | GORILLA | REGULAR
;
realLiteral
......@@ -864,8 +864,8 @@ PLAIN
: P L A I N
;
PLAIN_DICTIONARY
: P L A I N '_' D I C T I O N A R Y
DICTIONARY
: D I C T I O N A R Y
;
RLE
......@@ -884,7 +884,6 @@ GORILLA
: G O R I L L A
;
REGULAR
: R E G U L A R
;
......
......@@ -105,14 +105,14 @@ namespace TSDataType{
namespace TSEncoding {
enum TSEncoding {
PLAIN = 0,
PLAIN_DICTIONARY = 1,
DICTIONARY = 1,
RLE = 2,
DIFF = 3,
TS_2DIFF = 4,
BITMAP = 5,
GORILLA_V1 = 6,
REGULAR = 7,
GORILLA = 8
GORILLA = 8,
};
}
namespace TSStatusCode {
......
......@@ -32,7 +32,7 @@ class TSDataType(Enum):
@unique
class TSEncoding(Enum):
PLAIN = 0
PLAIN_DICTIONARY = 1
DICTIONARY = 1
RLE = 2
DIFF = 3
TS_2DIFF = 4
......@@ -41,7 +41,6 @@ class TSEncoding(Enum):
REGULAR = 7
GORILLA = 8
@unique
class Compressor(Enum):
UNCOMPRESSED = 0
......
......@@ -45,7 +45,7 @@
- 5: TEXT (`String`)
- **Encoding Type Hardcode**
- 0: PLAIN
- 1: PLAIN_DICTIONARY
- 1: DICTIONARY
- 2: RLE
- 3: DIFF
- 4: TS_2DIFF
......
......@@ -53,9 +53,12 @@ Regular data encoding is more suitable for time encoding regular sequence increa
Regular data encoding method is not suitable for the data with fluctuations (irregular data), and TS_2DIFF is recommended to deal with it.
* DICTIONARY
DICTIONARY encoding is lossless. It is suitable for TEXT data with low cardinality (i.e. low number of distinct values). It is not recommended to use it for high-cardinality data.
* Correspondence between data type and encoding
The four encodings described in the previous sections are applicable to different data types. If the correspondence is wrong, the time series cannot be created correctly. The correspondence between the data type and its supported encodings is summarized in the Table below.
The five encodings described in the previous sections are applicable to different data types. If the correspondence is wrong, the time series cannot be created correctly. The correspondence between the data type and its supported encodings is summarized in the Table below.
<center> **The correspondence between the data type and its supported encodings**
......@@ -66,6 +69,6 @@ The four encodings described in the previous sections are applicable to differen
|INT64 |PLAIN, RLE, TS_2DIFF, GORILLA|
|FLOAT |PLAIN, RLE, TS_2DIFF, GORILLA|
|DOUBLE |PLAIN, RLE, TS_2DIFF, GORILLA|
|TEXT |PLAIN|
|TEXT |PLAIN, DICTIONARY|
</center>
......@@ -44,7 +44,7 @@
- 5: TEXT (`String`)
- **编码类型**
- 0: PLAIN
- 1: PLAIN_DICTIONARY
- 1: DICTIONARY
- 2: RLE
- 3: DIFF
- 4: TS_2DIFF
......
......@@ -53,9 +53,13 @@ GORILLA编码是一种无损编码,它比较适合编码前后值比较接近
定频数据编码无法用于非定频数据,建议使用二阶差分编码(TS_2DIFF)进行处理。
* 字典编码 (DICTIONARY)
字典编码是一种无损编码。它适合编码基数小的数据(即数据去重后唯一值数量小)。不推荐用于基数大的数据。
* 数据类型与编码的对应关系
前文介绍的种编码适用于不同的数据类型,若对应关系错误,则无法正确创建时间序列。数据类型与支持其编码的编码方式对应关系总结如表格2-3。
前文介绍的种编码适用于不同的数据类型,若对应关系错误,则无法正确创建时间序列。数据类型与支持其编码的编码方式对应关系总结如表格2-3。
<div style="text-align: center;"> **表格2-3 数据类型与支持其编码的对应关系**
......@@ -66,6 +70,6 @@ GORILLA编码是一种无损编码,它比较适合编码前后值比较接近
|INT64 |PLAIN, RLE, TS_2DIFF, GORILLA|
|FLOAT |PLAIN, RLE, TS_2DIFF, GORILLA|
|DOUBLE |PLAIN, RLE, TS_2DIFF, GORILLA|
|TEXT |PLAIN|
|TEXT |PLAIN, DICTIONARY|
</div>
......@@ -80,6 +80,7 @@ public class SchemaUtils {
Set<TSEncoding> textSet = new HashSet<>();
textSet.add(TSEncoding.PLAIN);
textSet.add(TSEncoding.DICTIONARY);
schemaChecker.put(TSDataType.TEXT, textSet);
}
......
......@@ -241,6 +241,62 @@ public class IoTDBEncodingIT {
}
}
@Test
public void testSetTimeEncoderRegularAndValueEncoderDictionary() {
try (Connection connection =
DriverManager.getConnection(
Config.IOTDB_URL_PREFIX + "127.0.0.1:6667/", "root", "root");
Statement statement = connection.createStatement()) {
statement.execute(
"CREATE TIMESERIES root.db_0.tab0.city WITH DATATYPE=TEXT,ENCODING=DICTIONARY");
statement.execute("insert into root.db_0.tab0(time,city) values(1,\"Nanjing\")");
statement.execute("insert into root.db_0.tab0(time,city) values(2,\"Nanjing\")");
statement.execute("insert into root.db_0.tab0(time,city) values(3,\"Beijing\")");
statement.execute("insert into root.db_0.tab0(time,city) values(4,\"Shanghai\")");
statement.execute("flush");
String[] result = new String[] {"Nanjing", "Nanjing", "Beijing", "Shanghai"};
try (ResultSet resultSet = statement.executeQuery("select * from root.db_0.tab0")) {
int index = 0;
while (resultSet.next()) {
String city = resultSet.getString("root.db_0.tab0.city");
assertEquals(result[index], city);
index++;
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
@Test
public void testSetTimeEncoderRegularAndValueEncoderDictionaryOutOfOrder() {
try (Connection connection =
DriverManager.getConnection(
Config.IOTDB_URL_PREFIX + "127.0.0.1:6667/", "root", "root");
Statement statement = connection.createStatement()) {
statement.execute(
"CREATE TIMESERIES root.db_0.tab0.city WITH DATATYPE=TEXT,ENCODING=DICTIONARY");
statement.execute("insert into root.db_0.tab0(time,city) values(1,\"Nanjing\")");
statement.execute("insert into root.db_0.tab0(time,city) values(2,\"Nanjing\")");
statement.execute("insert into root.db_0.tab0(time,city) values(4,\"Beijing\")");
statement.execute("insert into root.db_0.tab0(time,city) values(3,\"Shanghai\")");
statement.execute("flush");
String[] result = new String[] {"Nanjing", "Nanjing", "Shanghai", "Beijing"};
try (ResultSet resultSet = statement.executeQuery("select * from root.db_0.tab0")) {
int index = 0;
while (resultSet.next()) {
String city = resultSet.getString("root.db_0.tab0.city");
assertEquals(result[index], city);
index++;
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
private static void insertData() throws ClassNotFoundException {
List<String> sqls =
new ArrayList<>(
......
......@@ -133,6 +133,31 @@ public abstract class Cases {
Assert.assertEquals(25.0, last, 0.1);
resultSet.close();
}
// test dictionary encoding
writeStatement.execute(
"create timeseries root.ln.wf01.wt01.city WITH DATATYPE=TEXT, ENCODING=DICTIONARY");
initDataArray =
new String[] {
"INSERT INTO root.ln.wf01.wt01(timestamp, city) values(250, \"Nanjing\")",
"INSERT INTO root.ln.wf01.wt01(timestamp, city) values(300, \"Nanjing\")",
"INSERT INTO root.ln.wf01.wt01(timestamp, city) values(350, \"Singapore\")",
"INSERT INTO root.ln.wf01.wt01(timestamp, city) values(350, \"Shanghai\")"
};
for (String initData : initDataArray) {
writeStatement.execute(initData);
}
String[] results = new String[] {"Nanjing", "Nanjing", "Singapore", "Shanghai"};
for (Statement readStatement : readStatements) {
resultSet = readStatement.executeQuery("select * from root.ln.wf01.wt01");
int i = 0;
while (resultSet.next()) {
Assert.assertEquals(results[i++], resultSet.getString("root.ln.wf01.wt01.city"));
}
Assert.assertFalse(resultSet.next());
resultSet.close();
}
}
// test https://issues.apache.org/jira/browse/IOTDB-1266
......
......@@ -110,6 +110,8 @@ public abstract class Decoder {
default:
throw new TsFileDecodingException(String.format(ERROR_MSG, encoding, dataType));
}
case DICTIONARY:
return new DictionaryDecoder();
default:
throw new TsFileDecodingException(String.format(ERROR_MSG, encoding, dataType));
}
......
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iotdb.tsfile.encoding.decoder;
import org.apache.iotdb.tsfile.file.metadata.enums.TSEncoding;
import org.apache.iotdb.tsfile.utils.Binary;
import org.apache.iotdb.tsfile.utils.ReadWriteForEncodingUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
public class DictionaryDecoder extends Decoder {
private static final Logger logger = LoggerFactory.getLogger(DictionaryDecoder.class);
private List<Binary> entryIndex;
private IntRleDecoder valueDecoder;
public DictionaryDecoder() {
super(TSEncoding.DICTIONARY);
valueDecoder = new IntRleDecoder();
}
@Override
public boolean hasNext(ByteBuffer buffer) {
if (entryIndex == null) {
initMap(buffer);
}
try {
return valueDecoder.hasNext(buffer);
} catch (IOException e) {
logger.error("tsfile-decoding DictionaryDecoder: error occurs when decoding", e);
}
return false;
}
@Override
public Binary readBinary(ByteBuffer buffer) {
if (entryIndex == null) {
initMap(buffer);
}
int code = valueDecoder.readInt(buffer);
return entryIndex.get(code);
}
private void initMap(ByteBuffer buffer) {
int length = ReadWriteForEncodingUtils.readVarInt(buffer);
entryIndex = new ArrayList<>(length);
for (int i = 0; i < length; i++) {
int binaryLength = ReadWriteForEncodingUtils.readVarInt(buffer);
byte[] buf = new byte[binaryLength];
buffer.get(buf, 0, binaryLength);
entryIndex.add(new Binary(buf));
}
}
@Override
public void reset() {
entryIndex = null;
valueDecoder.reset();
}
}
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iotdb.tsfile.encoding.encoder;
import org.apache.iotdb.tsfile.file.metadata.enums.TSEncoding;
import org.apache.iotdb.tsfile.utils.Binary;
import org.apache.iotdb.tsfile.utils.ReadWriteForEncodingUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
/**
* An encoder implementing dictionary encoding.
*
* <pre>Encoding format: {@code
* <map> <indexes>
* <map> := <map length> <map data>
* <map data> := [<entry size><entry data>]...
* <indexes> := [<index>]...
* }</pre>
*/
public class DictionaryEncoder extends Encoder {
private static final Logger logger = LoggerFactory.getLogger(DictionaryEncoder.class);
private HashMap<Binary, Integer> entryIndex;
private List<Binary> indexEntry;
private IntRleEncoder valuesEncoder;
private long mapSize;
public DictionaryEncoder() {
super(TSEncoding.DICTIONARY);
entryIndex = new HashMap<>();
indexEntry = new ArrayList<>();
valuesEncoder = new IntRleEncoder();
mapSize = 0;
}
@Override
public void encode(Binary value, ByteArrayOutputStream out) {
entryIndex.computeIfAbsent(
value,
(v) -> {
indexEntry.add(v);
mapSize += v.getLength();
return entryIndex.size();
});
valuesEncoder.encode(entryIndex.get(value), out);
}
@Override
public void flush(ByteArrayOutputStream out) {
try {
writeMap(out);
writeEncodedData(out);
} catch (IOException e) {
logger.error("tsfile-encoding DictionaryEncoder: error occurs when flushing", e);
}
reset();
}
@Override
public int getOneItemMaxSize() {
// map + one encoded value = (map size + map value) + one encoded value = (4 + 4) + 4
return 12;
}
@Override
public long getMaxByteSize() {
// has max size when when all points are unique
return 4 + mapSize + valuesEncoder.getMaxByteSize();
}
private void writeMap(ByteArrayOutputStream out) throws IOException {
ReadWriteForEncodingUtils.writeVarInt(indexEntry.size(), out);
for (Binary value : indexEntry) {
ReadWriteForEncodingUtils.writeVarInt(value.getLength(), out);
out.write(value.getValues());
}
}
private void writeEncodedData(ByteArrayOutputStream out) throws IOException {
valuesEncoder.flush(out);
}
private void reset() {
entryIndex.clear();
indexEntry.clear();
valuesEncoder.reset();
mapSize = 0;
}
}
......@@ -68,6 +68,8 @@ public abstract class TSEncodingBuilder {
return new Regular();
case GORILLA:
return new GorillaV2();
case DICTIONARY:
return new Dictionary();
default:
throw new UnsupportedOperationException(type.toString());
}
......@@ -296,4 +298,20 @@ public abstract class TSEncodingBuilder {
// allowed do nothing
}
}
public static class Dictionary extends TSEncodingBuilder {
@Override
public Encoder getEncoder(TSDataType type) {
if (type == TSDataType.TEXT) {
return new DictionaryEncoder();
}
throw new UnSupportedDataTypeException("DICTIONARY doesn't support data type: " + type);
}
@Override
public void initFromProps(Map<String, String> props) {
// do nothing
}
}
}
......@@ -20,7 +20,7 @@ package org.apache.iotdb.tsfile.file.metadata.enums;
public enum TSEncoding {
PLAIN((byte) 0),
PLAIN_DICTIONARY((byte) 1),
DICTIONARY((byte) 1),
RLE((byte) 2),
DIFF((byte) 3),
TS_2DIFF((byte) 4),
......@@ -50,7 +50,7 @@ public enum TSEncoding {
case 0:
return TSEncoding.PLAIN;
case 1:
return TSEncoding.PLAIN_DICTIONARY;
return TSEncoding.DICTIONARY;
case 2:
return TSEncoding.RLE;
case 3:
......
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.iotdb.tsfile.encoding.decoder;
import org.apache.iotdb.tsfile.encoding.encoder.DictionaryEncoder;
import org.apache.iotdb.tsfile.utils.Binary;
import org.junit.Test;
import java.io.ByteArrayOutputStream;
import java.nio.ByteBuffer;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
public class DictionaryDecoderTest {
private DictionaryEncoder encoder = new DictionaryEncoder();
private DictionaryDecoder decoder = new DictionaryDecoder();
private ByteArrayOutputStream baos = new ByteArrayOutputStream();
@Test
public void testSingle() {
testAll("a");
testAll("b");
testAll("c");
}
@Test
public void testAllUnique() {
testAll("a", "b", "c");
testAll("x", "o", "q");
testAll(",", ".", "c", "b", "e");
}
@Test
public void testAllSame() {
testAll("a", "a", "a");
testAll("b", "b", "b");
}
@Test
public void testMixed() {
// all characters
String[] allChars = new String[256];
allChars[0] = "" + (char) ('a' + 1);
for (int i = 0; i < 256; i++) {
allChars[i] = "" + (char) (i) + (char) (i) + (char) (i);
}
testAll(allChars);
}
private void testAll(String... all) {
for (String s : all) {
encoder.encode(new Binary(s), baos);
}
encoder.flush(baos);
ByteBuffer out = ByteBuffer.wrap(baos.toByteArray());
for (String s : all) {
assertTrue(decoder.hasNext(out));
assertEquals(s, decoder.readBinary(out).getStringValue());
}
decoder.reset();
baos.reset();
}
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册