Fix #26708 - use StringDecoder to handle data chunks that split multibyte characters

042217fc · Rob Lourens · fe69f9ac · 042217fc · 042217fc
2 changed files
--- a/src/vs/workbench/services/search/node/ripgrepTextSearch.ts
+++ b/src/vs/workbench/services/search/node/ripgrepTextSearch.ts
@@ -6,6 +6,7 @@

 import { EventEmitter } from 'events';
 import * as path from 'path';
+import { StringDecoder, NodeStringDecoder } from 'string_decoder';

 import * as cp from 'child_process';
 import { rgPath } from 'vscode-ripgrep';
@@ -174,11 +175,13 @@ export class RipgrepParser extends EventEmitter {
 	private fileMatch: FileMatch;
 	private remainder: string;
 	private isDone: boolean;
+	private stringDecoder: NodeStringDecoder;

 	private numResults = 0;

 	constructor(private maxResults: number, private rootFolder: string) {
 		super();
+		this.stringDecoder = new StringDecoder();
 	}

 	public cancel(): void {
@@ -186,16 +189,23 @@ export class RipgrepParser extends EventEmitter {
 	}

 	public flush(): void {
+		this.handleDecodedData(this.stringDecoder.end());
+
 		if (this.fileMatch) {
 			this.onResult();
 		}
 	}

-	public handleData(data: string | Buffer): void {
+	public handleData(data: Buffer | string): void {
+		const dataStr = typeof data === 'string' ? data : this.stringDecoder.write(data);
+		this.handleDecodedData(dataStr);
+	}
+
+	private handleDecodedData(decodedData: string): void {
 		// If the previous data chunk didn't end in a newline, prepend it to this chunk
 		const dataStr = this.remainder ?
-			this.remainder + data.toString() :
-			data.toString();
+			this.remainder + decodedData :
+			decodedData;

 		const dataLines: string[] = dataStr.split(/\r\n|\n/);
 		this.remainder = dataLines[dataLines.length - 1] ? dataLines.pop() : null;

--- a/src/vs/workbench/services/search/test/node/ripgrepTextSearch.test.ts
+++ b/src/vs/workbench/services/search/test/node/ripgrepTextSearch.test.ts
@@ -33,7 +33,11 @@ suite('RipgrepParser', () => {
 		return matchLine;
 	}

-	function parseInput(inputChunks: string[]): ISerializedFileMatch[] {
+	function parseInputStrings(inputChunks: string[]): ISerializedFileMatch[] {
+		return parseInput(inputChunks.map(chunk => new Buffer(chunk)));
+	}
+
+	function parseInput(inputChunks: Buffer[]): ISerializedFileMatch[] {
 		const matches: ISerializedFileMatch[] = [];
 		const rgp = new RipgrepParser(1e6, rootFolder);
 		rgp.on('result', (match: ISerializedFileMatch) => {
@@ -65,7 +69,7 @@ suite('RipgrepParser', () => {
 			[getFileLine('a.txt'), getMatchLine(1, ['before', 'match', 'after']), getMatchLine(2, ['before', 'match', 'after']), fileSectionEnd].join('\n')
 		];

-		const results = parseInput(input);
+		const results = parseInputStrings(input);
 		assert.equal(results.length, 1);
 		assert.deepEqual(results[0],
 			<ISerializedFileMatch>{
@@ -93,7 +97,7 @@ suite('RipgrepParser', () => {
 			[getFileLine('c.txt'), getMatchLine(1, ['before', 'match', 'after']), getMatchLine(2, ['before', 'match', 'after']), fileSectionEnd].join('\n')
 		];

-		const results = parseInput(input);
+		const results = parseInputStrings(input);
 		assert.equal(results.length, 3);
 		results.forEach(fileResult => assert.equal(fileResult.numMatches, 2));
 	});
@@ -116,7 +120,7 @@ suite('RipgrepParser', () => {
 	test('Parses multiple chunks broken at each line', () => {
 		const input = singleLineChunks.map(chunk => chunk + '\n');

-		const results = parseInput(input);
+		const results = parseInputStrings(input);
 		assert.equal(results.length, 3);
 		results.forEach(fileResult => assert.equal(fileResult.numMatches, 2));
 	});
@@ -126,7 +130,7 @@ suite('RipgrepParser', () => {
 			.map(chunk => chunk + '\n')
 			.map(halve));

-		const results = parseInput(input);
+		const results = parseInputStrings(input);
 		assert.equal(results.length, 3);
 		results.forEach(fileResult => assert.equal(fileResult.numMatches, 2));
 	});
@@ -136,7 +140,7 @@ suite('RipgrepParser', () => {
 			.map(chunk => chunk + '\n')
 			.map(arrayOfChars));

-		const results = parseInput(input);
+		const results = parseInputStrings(input);
 		assert.equal(results.length, 3);
 		results.forEach(fileResult => assert.equal(fileResult.numMatches, 2));
 	});
@@ -145,8 +149,26 @@ suite('RipgrepParser', () => {
 		const input = singleLineChunks
 			.map(chunk => '\n' + chunk);

-		const results = parseInput(input);
+		const results = parseInputStrings(input);
 		assert.equal(results.length, 3);
 		results.forEach(fileResult => assert.equal(fileResult.numMatches, 2));
 	});
+
+	test('Parses chunks broken in the middle of a multibyte character', () => {
+		const multibyteStr = '漢';
+		const multibyteBuf = new Buffer(multibyteStr);
+		const text = getFileLine('foo/bar') + '\n' + getMatchLine(0, ['before', 'match', 'after']) + '\n';
+
+		// Split the multibyte char into two pieces and divide between the two buffers
+		const beforeIndex = 24;
+		const inputBufs = [
+			Buffer.concat([new Buffer(text.substr(0, beforeIndex)), multibyteBuf.slice(0, 2)]),
+			Buffer.concat([multibyteBuf.slice(2), new Buffer(text.substr(beforeIndex))])
+		];
+
+		const results = parseInput(inputBufs);
+		assert.equal(results.length, 1);
+		assert.equal(results[0].lineMatches.length, 1);
+		assert.deepEqual(results[0].lineMatches[0].offsetAndLengths, [[7, 5]]);
+	});
 });
\ No newline at end of file