提交 e2d80d97 编写于 作者: S ShadelessFox

#10702 Allow customizing a column's minimal length and samples count


Former-commit-id: 9e62680f
上级 7ae502c9
......@@ -150,6 +150,10 @@ dataTransfer.producer.stream.processor.csv.property.timestampFormat.name = Date/
dataTransfer.producer.stream.processor.csv.property.timestampFormat.description = Date/time format pattern. Use this to clarify the date format in CSV file, not to change output data.\nSearch for 'java DateTimeFormatter' for format details.
dataTransfer.producer.stream.processor.csv.property.timestampZone.name = Timezone ID
dataTransfer.producer.stream.processor.csv.property.timestampZone.description = Timezone ID. By default local machine timezone is used.\n3 ways to specify zone:\n\t-Local zone offset (+3, -04:30)\n\t-Specific zone offset (GMT+2, UTC+01:00)\n\t-Region based (UTC, ECT, PST, etc)
dataTransfer.producer.stream.processor.csv.property.columnTypeSamplesCount.name = Column samples count
dataTransfer.producer.stream.processor.csv.property.columnTypeSamplesCount.description = Maximum number of sample rows used to guess the length and data type of imported columns.
dataTransfer.producer.stream.processor.csv.property.columnTypeMinimalLength.name = Column minimal length
dataTransfer.producer.stream.processor.csv.property.columnTypeMinimalLength.description = Minimum length assigned to imported columns. Sample data longer than this value increases the column length accordingly.
task.category.name.common = Common
......
......@@ -59,6 +59,10 @@ dataTransfer.producer.stream.processor.csv.property.delimiter.description = \u04
dataTransfer.producer.stream.processor.csv.property.delimiter.name = \u0420\u0430\u0437\u0434\u0435\u043B\u0438\u0442\u0435\u043B\u044C \u0441\u0442\u043E\u043B\u0431\u0446\u043E\u0432
dataTransfer.producer.stream.processor.csv.property.encoding.label = \u041A\u043E\u0434\u0438\u0440\u043E\u0432\u043A\u0430
dataTransfer.producer.stream.processor.csv.property.extension.label = \u0420\u0430\u0441\u0448\u0438\u0440\u0435\u043D\u0438\u0435
dataTransfer.producer.stream.processor.csv.property.columnTypeSamplesCount.name = \u041A\u043E\u043B\u0438\u0447\u0435\u0441\u0442\u0432\u043E \u043F\u0440\u043E\u0431 \u043A\u043E\u043B\u043E\u043D\u043A\u0438
dataTransfer.producer.stream.processor.csv.property.columnTypeSamplesCount.description = \u0423\u0441\u0442\u0430\u043D\u0430\u0432\u043B\u0438\u0432\u0430\u0435\u0442 \u043C\u0430\u043A\u0441\u0438\u043C\u0430\u043B\u044C\u043D\u043E\u0435 \u043A\u043E\u043B\u0438\u0447\u0435\u0441\u0442\u0432\u043E \u043F\u0440\u043E\u0431, \u0438\u0441\u043F\u043E\u043B\u044C\u0437\u0443\u0435\u043C\u044B\u0445 \u0434\u043B\u044F \u0443\u0433\u0430\u0434\u044B\u0432\u0430\u043D\u0438\u044F \u0434\u043B\u0438\u043D\u044B \u0438 \u0442\u0438\u043F\u0430 \u0438\u043C\u043F\u043E\u0440\u0442\u0438\u0440\u0443\u0435\u043C\u044B\u0445 \u0434\u0430\u043D\u043D\u044B\u0445.
dataTransfer.producer.stream.processor.csv.property.columnTypeMinimalLength.name = \u041C\u0438\u043D\u0438\u043C\u0430\u043B\u044C\u043D\u0430\u044F \u0434\u043B\u0438\u043D\u0430 \u043A\u043E\u043B\u043E\u043D\u043A\u0438
dataTransfer.producer.stream.processor.csv.property.columnTypeMinimalLength.description = \u0423\u0441\u0442\u0430\u043D\u0430\u0432\u043B\u0438\u0432\u0430\u0435\u0442 \u043C\u0438\u043D\u0438\u043C\u0430\u043B\u044C\u043D\u0443\u044E \u0434\u043B\u0438\u043D\u0443 \u043A\u043E\u043B\u043E\u043D\u043A\u0438.
dataTransfer.processor.json.property.printTableName.label = \u041D\u0430\u043F\u0435\u0447\u0430\u0442\u0430\u0442\u044C \u0438\u043C\u044F \u0442\u0430\u0431\u043B\u0438\u0446\u044B
dataTransfer.processor.json.property.formatDateISO.label = \u0424\u043E\u0440\u043C\u0430\u0442 \u0434\u0430\u0442\u044B \u0432 ISO 8601
dataTransfer.processor.json.property.extension.label = \u0420\u0430\u0441\u0448\u0438\u0440\u0435\u043D\u0438\u0435 \u0444\u0430\u0439\u043B\u0430
......
......@@ -52,6 +52,8 @@
<property id="emptyStringNull" label="%dataTransfer.producer.stream.processor.csv.property.emptyStringNull.name" type="boolean" description="%dataTransfer.producer.stream.processor.csv.property.emptyStringNull.description" defaultValue="" required="false"/>
<property id="timestampFormat" label="%dataTransfer.producer.stream.processor.csv.property.timestampFormat.name" type="string" description="%dataTransfer.producer.stream.processor.csv.property.timestampFormat.description" defaultValue="yyyy-MM-dd[ HH:mm:ss[.SSS]]" required="false"/>
<property id="timestampZone" label="%dataTransfer.producer.stream.processor.csv.property.timestampZone.name" type="string" description="%dataTransfer.producer.stream.processor.csv.property.timestampZone.description" defaultValue="" required="false"/>
<property id="columnTypeSamplesCount" label="%dataTransfer.producer.stream.processor.csv.property.columnTypeSamplesCount.name" type="integer" description="%dataTransfer.producer.stream.processor.csv.property.columnTypeSamplesCount.description" defaultValue="1000" required="false"/>
<property id="columnTypeMinimalLength" label="%dataTransfer.producer.stream.processor.csv.property.columnTypeMinimalLength.name" type="integer" description="%dataTransfer.producer.stream.processor.csv.property.columnTypeMinimalLength.description" defaultValue="1" required="false"/>
</propertyGroup>
</processor>
</node>
......
......@@ -54,11 +54,6 @@ public class DataImporterCSV extends StreamImporterAbstract {
private static final String PROP_EMPTY_STRING_NULL = "emptyStringNull";
private static final String PROP_ESCAPE_CHAR = "escapeChar";
// Default length for new column. This is a "lower" bound, so sample data could be longer than this threshold
private static final int DEFAULT_COLUMN_LENGTH = 1024;
// Amount of sample rows used to determine approximate type and data length of the column
private static final int MAX_COLUMN_SAMPLES = 1000;
public enum HeaderPosition {
none,
top,
......@@ -74,6 +69,9 @@ public class DataImporterCSV extends StreamImporterAbstract {
Map<String, Object> processorProperties = getSite().getProcessorProperties();
HeaderPosition headerPosition = getHeaderPosition(processorProperties);
final int columnSamplesCount = Math.max(CommonUtils.toInt(processorProperties.get(PROP_COLUMN_TYPE_SAMPLES), 1000), 0);
final int columnMinimalLength = Math.max(CommonUtils.toInt(processorProperties.get(PROP_COLUMN_TYPE_LENGTH), 1), 1);
try (Reader reader = openStreamReader(inputStream, processorProperties)) {
try (CSVReader csvReader = openCSVReader(reader, processorProperties)) {
String[] header = getNextLine(csvReader);
......@@ -89,12 +87,12 @@ public class DataImporterCSV extends StreamImporterAbstract {
if (CommonUtils.isEmptyTrimmed(column)) {
column = "Column" + (i + 1);
}
StreamDataImporterColumnInfo columnInfo = new StreamDataImporterColumnInfo(entityMapping, i, column, null, DEFAULT_COLUMN_LENGTH, DBPDataKind.UNKNOWN);
StreamDataImporterColumnInfo columnInfo = new StreamDataImporterColumnInfo(entityMapping, i, column, null, columnMinimalLength, DBPDataKind.UNKNOWN);
columnInfo.setMappingMetadataPresent(headerPosition != HeaderPosition.none);
columnsInfo.add(columnInfo);
}
for (int sample = 0; sample < MAX_COLUMN_SAMPLES; sample++) {
for (int sample = 0; sample < columnSamplesCount; sample++) {
String[] line;
if (sample == 0 && headerPosition == HeaderPosition.none) {
......
......@@ -44,6 +44,8 @@ public abstract class StreamImporterAbstract implements IStreamDataImporter {
protected static final String PROP_TIMESTAMP_FORMAT = "timestampFormat";
protected static final String PROP_TIMESTAMP_ZONE = "timestampZone";
protected static final String PROP_COLUMN_TYPE_SAMPLES = "columnTypeSamplesCount";
protected static final String PROP_COLUMN_TYPE_LENGTH = "columnTypeMinimalLength";
private IStreamDataImporterSite site;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册