From 6bc10a756350a54f998bba7ab018ac08acc6fa77 Mon Sep 17 00:00:00 2001 From: Jack <498607067@qq.com> Date: Sun, 24 Nov 2019 13:46:15 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20FileUtils.getFileCharsetSimple()?= =?UTF-8?q?=E5=8C=BA=E5=88=86UTF-8=E6=97=A0BOM=E5=92=8CGBK=E7=BC=96?= =?UTF-8?q?=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../com/blankj/utilcode/util/FileUtils.java | 173 ++++++++++-------- lib/utilcode/src/test/res/file/GBK.txt | 3 +- 2 files changed, 96 insertions(+), 80 deletions(-) diff --git a/lib/utilcode/src/main/java/com/blankj/utilcode/util/FileUtils.java b/lib/utilcode/src/main/java/com/blankj/utilcode/util/FileUtils.java index c73b994b..02dd200a 100644 --- a/lib/utilcode/src/main/java/com/blankj/utilcode/util/FileUtils.java +++ b/lib/utilcode/src/main/java/com/blankj/utilcode/util/FileUtils.java @@ -18,6 +18,7 @@ import java.security.DigestInputStream; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.ArrayList; +import java.util.BitSet; import java.util.Collections; import java.util.Comparator; import java.util.List; @@ -37,6 +38,8 @@ public final class FileUtils { private static final String LINE_SEP = System.getProperty("line.separator"); + private static final int BYTE_SIZE = 8; + private FileUtils() { throw new UnsupportedOperationException("u can't instantiate me..."); } @@ -903,109 +906,121 @@ public final class FileUtils { case 0xfeff: return "UTF-16BE"; default: - return "GBK"; + try { + if (isUtf8(file)) { + return "UTF-8"; + } else { + return "GBK"; + } + } catch (Exception e) { + e.printStackTrace(); + return "GBK"; + } } } /** * Return whether the charset of file is utf8. * - * @param filePath The path of file. + * @param file The file. * @return {@code true}: yes
{@code false}: no */ - public static boolean isUtf8(final String filePath) { - return isUtf8(getFileByPath(filePath)); + private static boolean isUtf8(File file) throws Exception { + BufferedInputStream bis = new BufferedInputStream(new FileInputStream(file)); + // 读取第一个字节 + int code = bis.read(); + do { + BitSet bitSet = convert2BitSet(code); + if (bitSet.get(0)) { + // 多字节时,再读取N个字节 + if (!checkMultiByte(bis, bitSet)) { + bis.close(); + return false; + } + } + // 单字节时什么都不用做,再次读取字节 + code = bis.read(); + } while (code != -1); + bis.close(); + return true; } + /** - * Return whether the charset of file is utf8. - * - * @param file The file. - * @return {@code true}: yes
{@code false}: no + * 检测多字节,判断是否符合utf8编码 */ - public static boolean isUtf8(final File file) { - if (file == null) return false; - InputStream is = null; - try { - byte[] bytes = new byte[24]; - is = new BufferedInputStream(new FileInputStream(file)); - int read = is.read(bytes); - if (read != -1) { - byte[] readArr = new byte[read]; - System.arraycopy(bytes, 0, readArr, 0, read); - return isUtf8(readArr) == 100; - } else { + private static boolean checkMultiByte(BufferedInputStream bis, BitSet bitSet) throws Exception { + int count = getCountOfSequential(bitSet); + // 已经读取了一个字节,不能再读取 + byte[] bytes = new byte[count - 1]; + bis.read(bytes); + for (byte b : bytes) { + if (!checkUtf8Byte(b)) { return false; } - } catch (IOException e) { - e.printStackTrace(); - } finally { - try { - if (is != null) { - is.close(); - } - } catch (IOException e) { - e.printStackTrace(); - } } - return false; + return true; } - private static int isUtf8(byte[] raw) { - int i, len; - int utf8 = 0, ascii = 0; - if (raw.length > 3) { - if ((raw[0] == (byte) 0xEF) && (raw[1] == (byte) 0xBB) && (raw[2] == (byte) 0xBF)) { - return 100; + + /** + * 检测bitSet中从开始有多少个连续的1 + */ + private static int getCountOfSequential(BitSet bitSet) { + int count = 0; + for (int i = 0; i < BYTE_SIZE; i++) { + if (bitSet.get(i)) { + count++; + } else { + break; } } - len = raw.length; - int child = 0; - for (i = 0; i < len; ) { - if ((raw[i] & (byte) 0xFF) == (byte) 0xFF || (raw[i] & (byte) 0xFE) == (byte) 0xFE) { - return 0; - } - if (child == 0) { - if ((raw[i] & (byte) 0x7F) == raw[i] && raw[i] != 0) { - ascii++; - } else if ((raw[i] & (byte) 0xC0) == (byte) 0xC0) { - for (int bit = 0; bit < 8; bit++) { - if ((((byte) (0x80 >> bit)) & raw[i]) == ((byte) (0x80 >> bit))) { - child = bit; - } else { - break; - } - } - utf8++; - } - i++; - } else { - child = (raw.length - i > child) ? child : (raw.length - i); - boolean currentNotUtf8 = false; - for (int children = 0; children < child; children++) { - if ((raw[i + children] & ((byte) 0x80)) != ((byte) 0x80)) { - if ((raw[i + children] & (byte) 0x7F) == raw[i + children] && raw[i] != 0) { - ascii++; - } - currentNotUtf8 = true; - } - } - if (currentNotUtf8) { - utf8--; - i++; - } else { - utf8 += child; - i += child; - } - child = 0; + return count; + } + + + /** + * 检测单字节,判断是否为utf8 + */ + private static boolean checkUtf8Byte(byte b) throws Exception { + BitSet bitSet = convert2BitSet(b); + return bitSet.get(0) && !bitSet.get(1); + } + + + /** + * 将整形转为BitSet + */ + private static BitSet convert2BitSet(int code) { + BitSet bitSet = new BitSet(BYTE_SIZE); + + for (int i = 0; i < BYTE_SIZE; i++) { + int tmp3 = code >> (BYTE_SIZE - i - 1); + int tmp2 = 0x1 & tmp3; + if (tmp2 == 1) { + bitSet.set(i); } } - if (ascii == len) { - return 100; + return bitSet; + } + + /** + * Return whether the charset of file is utf8. + * + * @param filePath The path of file. + * @return {@code true}: yes
{@code false}: no + */ + public static boolean isUtf8(final String filePath) { + try { + return isUtf8(getFileByPath(filePath)); + } catch (Exception e) { + e.printStackTrace(); + return false; } - return (int) (100 * ((float) (utf8 + ascii) / (float) len)); } + + /** * Return the number of lines of file. * diff --git a/lib/utilcode/src/test/res/file/GBK.txt b/lib/utilcode/src/test/res/file/GBK.txt index 88ef6b94..3d8088b4 100644 --- a/lib/utilcode/src/test/res/file/GBK.txt +++ b/lib/utilcode/src/test/res/file/GBK.txt @@ -1 +1,2 @@ -GBK \ No newline at end of file +GBK +Ұй \ No newline at end of file -- GitLab