提交 a8f66c5a 编写于 作者: R Robert Metzger

FileInputFormat.acceptFile() to filter out files and directories while...

Add FileInputFormat.acceptFile() to filter out files and directories (such as _log or _SUCCESS) while creating input splits.
上级 389bb0ce
......@@ -321,11 +321,14 @@ public abstract class FileInputFormat<OT> implements InputFormat<OT, FileInputSp
final FileSystem fs = path.getFileSystem();
final FileStatus pathFile = fs.getFileStatus(path);
if(!acceptFile(pathFile)) {
throw new IOException("The given file does not pass the file-filter");
}
if (pathFile.isDir()) {
// input is directory. list all contained files
final FileStatus[] dir = fs.listStatus(path);
for (int i = 0; i < dir.length; i++) {
if (!dir[i].isDir()) {
if (!dir[i].isDir() && acceptFile(dir[i])) {
files.add(dir[i]);
totalLength += dir[i].getLen();
}
......@@ -409,6 +412,19 @@ public abstract class FileInputFormat<OT> implements InputFormat<OT, FileInputSp
return inputSplits.toArray(new FileInputSplit[inputSplits.size()]);
}
/**
 * Decides whether a file or directory is included in the input.
 * An entry is rejected when its name begins with an underscore or with a dot; every other
 * entry is accepted. Subclasses may override this method to apply a different filter.
 * Hadoop's FileInputFormat has a similar mechanism and applies the same filters by default.
 *
 * @param fileStatus the status of the file or directory to check; its path name is inspected
 * @return true, if the given file or directory is accepted
 */
public boolean acceptFile(FileStatus fileStatus) {
	final String fileName = fileStatus.getPath().getName();
	// names with a leading underscore (e.g. _log, _SUCCESS) are filtered out
	if (fileName.startsWith("_")) {
		return false;
	}
	// names with a leading dot are filtered out as well
	return !fileName.startsWith(".");
}
/**
* Retrieves the index of the <tt>BlockLocation</tt> that contains the part of the file described by the given
* offset.
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册