[FLINK-2567] [core] Allow quoted strings in CSV fields to contain quotation...

[FLINK-2567] [core] Allow quoted strings in CSV fields to contain the quotation character inside the field, as long as it is escaped. Ex: 'Hi my name is \'Flink\''. This closes #1059

[FLINK-2567] [core] Allow quoted strings in CSV fields to contain quotation...
[FLINK-2567] [core] Allow quoted strings in CSV fields to contain the quotation character inside the field, as long as it is escaped. Ex: 'Hi my name is \'Flink\''. This closes #1059
948b6e05 · tammymendt · Stephan Ewen · 97ad55f1 · 948b6e05 · 948b6e05
8 changed files
--- a/docs/apis/programming_guide.md
+++ b/docs/apis/programming_guide.md
@@ -1767,7 +1767,7 @@ Flink offers a number of configuration options for CSV parsing:

 - `includeFields(boolean ... flag)`, `includeFields(String mask)`, or `includeFields(long bitMask)` defines which fields to read from the input file (and which to ignore). By default the first *n* fields (as defined by the number of types in the `types()` call) are parsed.

- `parseQuotedStrings(char quoteChar)` enables quoted string parsing. Strings are parsed as quoted strings if the first character of the string field is the quote character (leading or tailing whitespaces are *not* trimmed). Field delimiters within quoted strings are ignored. Quoted string parsing fails if the last character of a quoted string field is not the quote character. If quoted string parsing is enabled and the first character of the field is *not* the quoting string, the string is parsed as unquoted string. By default, quoted string parsing is disabled.
+- `parseQuotedStrings(char quoteChar)` enables quoted string parsing. Strings are parsed as quoted strings if the first character of the string field is the quote character (leading or trailing whitespace is *not* trimmed). Field delimiters within quoted strings are ignored. Quoted string parsing fails if the last character of a quoted string field is not the quote character or if the quote character appears at some point which is not the start or the end of the quoted string field (unless the quote character is escaped using '\'). An escaping '\' is not removed; it remains part of the parsed string. If quoted string parsing is enabled and the first character of the field is *not* the quote character, the string is parsed as an unquoted string. By default, quoted string parsing is disabled.

 - `ignoreComments(String commentPrefix)` specifies a comment prefix. All lines that start with the specified comment prefix are not parsed and ignored. By default, no lines are ignored.


--- a/flink-core/src/main/java/org/apache/flink/api/common/io/GenericCsvInputFormat.java
+++ b/flink-core/src/main/java/org/apache/flink/api/common/io/GenericCsvInputFormat.java
@@ -53,6 +53,8 @@ public abstract class GenericCsvInputFormat<OT> extends DelimitedInputFormat<OT>
 	
 	private static final byte[] DEFAULT_FIELD_DELIMITER = new byte[] {','};

+	private static final byte BACKSLASH = 92;
+
 	// --------------------------------------------------------------------------------------------
 	//  Variables for internal operation.
 	//  They are all transient, because we do not want them so be serialized 
@@ -443,9 +445,10 @@ public abstract class GenericCsvInputFormat<OT> extends DelimitedInputFormat<OT>
 		if(quotedStringParsing == true && bytes[i] == quoteCharacter) {

 			// quoted string parsing enabled and field is quoted
-			// search for ending quote character
+			// search for ending quote character, continue when it is escaped
 			i++;
-			while(i < limit && bytes[i] != quoteCharacter) {
+
+			while (i < limit && (bytes[i] != quoteCharacter || bytes[i-1] == BACKSLASH)){
 				i++;
 			}
 			i++;

--- a/flink-core/src/main/java/org/apache/flink/types/parser/StringParser.java
+++ b/flink-core/src/main/java/org/apache/flink/types/parser/StringParser.java
@@ -27,6 +27,7 @@ public class StringParser extends FieldParser<String> {

 	private boolean quotedStringParsing = false;
 	private byte quoteCharacter;
+	private static final byte BACKSLASH = 92;

 	private String result;

@@ -46,8 +47,8 @@ public class StringParser extends FieldParser<String> {
 			// quoted string parsing enabled and first character Vis a quote
 			i++;

-			// search for ending quote character
-			while(i < limit && bytes[i] != quoteCharacter) {
+			// search for ending quote character, continue when it is escaped
+			while (i < limit && (bytes[i] != quoteCharacter || bytes[i-1] == BACKSLASH)){
 				i++;
 			}


--- a/flink-core/src/main/java/org/apache/flink/types/parser/StringValueParser.java
+++ b/flink-core/src/main/java/org/apache/flink/types/parser/StringValueParser.java
@@ -31,6 +31,7 @@ public class StringValueParser extends FieldParser<StringValue> {

 	private boolean quotedStringParsing = false;
 	private byte quoteCharacter;
+	private static final byte BACKSLASH = 92;

 	private StringValue result;

@@ -51,8 +52,8 @@ public class StringValueParser extends FieldParser<StringValue> {
 			// quoted string parsing enabled and first character is a quote
 			i++;

-			// search for ending quote character
-			while(i < limit && bytes[i] != quoteCharacter) {
+			// search for ending quote character, continue when it is escaped
+			while (i < limit && (bytes[i] != quoteCharacter || bytes[i-1] == BACKSLASH)){
 				i++;
 			}


--- a/flink-core/src/test/java/org/apache/flink/api/common/io/GenericCsvInputFormatTest.java
+++ b/flink-core/src/test/java/org/apache/flink/api/common/io/GenericCsvInputFormatTest.java
@@ -587,6 +587,39 @@ public class GenericCsvInputFormatTest {
 			fail("Test failed due to a " + ex.getClass().getSimpleName() + ": " + ex.getMessage());
 		}
 	}
+
+	@Test // checks that a backslash-escaped quote inside a quoted field does not terminate the field
+	public void readWithParseQuotedStrings() {
+		try {
+			final String fileContent = "\"ab\\\"c\"|\"def\"\n\"ghijk\"|\"abc\""; // two records of two quoted fields; the first field is "ab\"c"
+			final FileInputSplit split = createTempFile(fileContent);
+
+			final Configuration parameters = new Configuration();
+
+			format.setFieldDelimiter("|");
+			format.setFieldTypesGeneric(StringValue.class, StringValue.class);
+			format.enableQuotedStringParsing('"'); // '"' is the quote character for this test
+
+			format.configure(parameters);
+			format.open(split);
+
+			Value[] values = new Value[] { new StringValue(), new StringValue()};
+
+			values = format.nextRecord(values); // first record
+			assertNotNull(values);
+			assertEquals("ab\\\"c", ((StringValue) values[0]).getValue()); // surrounding quotes are dropped, the escaping backslash is kept
+			assertEquals("def", ((StringValue) values[1]).getValue());
+
+			values = format.nextRecord(values); // second record, no escaped quotes
+			assertNotNull(values);
+			assertEquals("ghijk", ((StringValue) values[0]).getValue());
+			assertEquals("abc", ((StringValue) values[1]).getValue());
+
+		}
+		catch (Exception ex) {
+			fail("Test failed due to a " + ex.getClass().getSimpleName() + ": " + ex.getMessage()); // any exception is reported as a test failure
+		}
+	}
 	
 	@Test
 	public void readWithHeaderLine() {

--- a/flink-core/src/test/java/org/apache/flink/types/parser/QuotedStringParserTest.java
+++ b/flink-core/src/test/java/org/apache/flink/types/parser/QuotedStringParserTest.java
@@ -25,6 +25,7 @@ public class QuotedStringParserTest extends ParserTestBase<String> {
    @Override
    public String[] getValidTestValues() {
        return new String[] {
+                "\"\\\"Hello World\\\"\"",
                "\"abcdefgh\"", "\"i\"", "\"jklmno\"", "\"abc|de|fgh\"",
                "\"abc&&&&def&&&&ghij\"", "\"i\"", "\"Hello9\"",
                "abcdefgh", "i", "jklmno", "Hello9"
@@ -34,6 +35,7 @@ public class QuotedStringParserTest extends ParserTestBase<String> {
    @Override
    public String[] getValidTestResults() {
        return new String[] {
+                "\\\"Hello World\\\"",
                "abcdefgh", "i", "jklmno", "abc|de|fgh",
                "abc&&&&def&&&&ghij", "i", "Hello9",
                "abcdefgh", "i", "jklmno", "Hello9"

--- a/flink-core/src/test/java/org/apache/flink/types/parser/QuotedStringValueParserTest.java
+++ b/flink-core/src/test/java/org/apache/flink/types/parser/QuotedStringValueParserTest.java
@@ -29,6 +29,7 @@ public class QuotedStringValueParserTest extends ParserTestBase<StringValue> {
    @Override
    public String[] getValidTestValues() {
        return new String[] {
+                "\\\"Hello \\\"World\\\"",
                "\"abcdefgh\"", "\"i\"", "\"jklmno\"", "\"abc|de|fgh\"",
                "\"abc&&&&def&&&&ghij\"", "\"i\"", "\"Hello9\"",
                "abcdefgh", "i", "jklmno", "Hello9"
@@ -38,6 +39,7 @@ public class QuotedStringValueParserTest extends ParserTestBase<StringValue> {
    @Override
    public StringValue[] getValidTestResults() {
        return new StringValue[] {
+                new StringValue("\\\"Hello \\\"World\\\""),
                new StringValue("abcdefgh"), new StringValue("i"), new StringValue("jklmno"), new StringValue("abc|de|fgh"),
                new StringValue("abc&&&&def&&&&ghij"), new StringValue("i"), new StringValue("Hello9"),
                new StringValue("abcdefgh"), new StringValue("i"), new StringValue("jklmno"), new StringValue("Hello9"),

--- a/flink-java/src/test/java/org/apache/flink/api/java/io/CsvInputFormatTest.java
+++ b/flink-java/src/test/java/org/apache/flink/api/java/io/CsvInputFormatTest.java
@@ -984,7 +984,7 @@ public class CsvInputFormatTest {
 	@Test
 	public void testQuotedStringParsingWithIncludeFields() throws Exception {
 		final String fileContent = "\"20:41:52-1-3-2015\"|\"Re: Taskmanager memory error in Eclipse\"|" +
-				"\"Blahblah <blah@blahblah.org>\"|\"bla\"|\"blubb\"";
+				"\"Blahblah <blah@blahblah.org>\"|\"blaaa|\"blubb\"";

 		final File tempFile = File.createTempFile("CsvReaderQuotedString", "tmp");
 		tempFile.deleteOnExit();
@@ -1013,6 +1013,36 @@ public class CsvInputFormatTest {
 		assertEquals("Blahblah <blah@blahblah.org>", record.f1);
 	}

+	@Test // checks that quoted fields containing backslash-escaped quotes are read as single fields
+	public void testQuotedStringParsingWithEscapedQuotes() throws Exception {
+		final String fileContent = "\"\\\"Hello\\\" World\"|\"We are\\\" young\""; // one record: "\"Hello\" World"|"We are\" young"
+
+		final File tempFile = File.createTempFile("CsvReaderQuotedString", "tmp");
+		tempFile.deleteOnExit();
+		tempFile.setWritable(true);
+
+		OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(tempFile));
+		writer.write(fileContent);
+		writer.close();
+
+		TypeInformation<Tuple2<String, String>> typeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class, String.class);
+		CsvInputFormat<Tuple2<String, String>> inputFormat = new CsvInputFormat<>(new Path(tempFile.toURI().toString()), typeInfo);
+
+		inputFormat.enableQuotedStringParsing('"'); // '"' is the quote character for this test
+		inputFormat.setFieldDelimiter('|');
+		inputFormat.setDelimiter('\n');
+
+		inputFormat.configure(new Configuration());
+		FileInputSplit[] splits = inputFormat.createInputSplits(1);
+
+		inputFormat.open(splits[0]);
+
+		Tuple2<String, String> record = inputFormat.nextRecord(new Tuple2<String, String>());
+
+		assertEquals("\\\"Hello\\\" World", record.f0); // surrounding quotes are dropped, escaping backslashes are kept
+		assertEquals("We are\\\" young", record.f1); // same for a single escaped quote in the middle of the field
+	}
+
 	// --------------------------------------------------------------------------------------------
 	// Custom types for testing
 	// --------------------------------------------------------------------------------------------