feat: FileUtils.getFileCharsetSimple()区分UTF-8无BOM和GBK编码

6bc10a75 · Jack---Jiao · 3513c74c · 6bc10a75 · 6bc10a75
Showing with 96 addition and 80 deletion

lib/utilcode/src/main/java/com/blankj/utilcode/util/FileUtils.java ...ode/src/main/java/com/blankj/utilcode/util/FileUtils.java +94 -79

lib/utilcode/src/test/res/file/GBK.txt lib/utilcode/src/test/res/file/GBK.txt +2 -1

未找到文件。
--- a/lib/utilcode/src/main/java/com/blankj/utilcode/util/FileUtils.java
+++ b/lib/utilcode/src/main/java/com/blankj/utilcode/util/FileUtils.java
@@ -18,6 +18,7 @@ import java.security.DigestInputStream;
 import java.security.MessageDigest;
 import java.security.NoSuchAlgorithmException;
 import java.util.ArrayList;
+import java.util.BitSet;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.List;
@@ -37,6 +38,8 @@ public final class FileUtils {

    private static final String LINE_SEP = System.getProperty("line.separator");

+    private static final int BYTE_SIZE = 8;
+
    private FileUtils() {
        throw new UnsupportedOperationException("u can't instantiate me...");
    }
@@ -903,109 +906,121 @@ public final class FileUtils {
            case 0xfeff:
                return "UTF-16BE";
            default:
-                return "GBK";
+                try {
+                    if (isUtf8(file)) {
+                        return "UTF-8";
+                    } else {
+                        return "GBK";
+                    }
+                } catch (Exception e) {
+                    e.printStackTrace();
+                    return "GBK";
+                }
        }
    }

    /**
     * Return whether the charset of file is utf8.
     *
-     * @param filePath The path of file.
+     * @param file The file.
     * @return {@code true}: yes<br>{@code false}: no
     */
-    public static boolean isUtf8(final String filePath) {
-        return isUtf8(getFileByPath(filePath));
+    private static boolean isUtf8(File file) throws Exception {
+        BufferedInputStream bis = new BufferedInputStream(new FileInputStream(file));
+        // 读取第一个字节
+        int code = bis.read();
+        do {
+            BitSet bitSet = convert2BitSet(code);
+            if (bitSet.get(0)) {
+                // 多字节时，再读取N个字节
+                if (!checkMultiByte(bis, bitSet)) {
+                    bis.close();
+                    return false;
+                }
+            }
+            // 单字节时什么都不用做，再次读取字节
+            code = bis.read();
+        } while (code != -1);
+        bis.close();
+        return true;
    }

+
    /**
-     * Return whether the charset of file is utf8.
-     *
-     * @param file The file.
-     * @return {@code true}: yes<br>{@code false}: no
+     * 检测多字节，判断是否符合utf8编码
     */
-    public static boolean isUtf8(final File file) {
-        if (file == null) return false;
-        InputStream is = null;
-        try {
-            byte[] bytes = new byte[24];
-            is = new BufferedInputStream(new FileInputStream(file));
-            int read = is.read(bytes);
-            if (read != -1) {
-                byte[] readArr = new byte[read];
-                System.arraycopy(bytes, 0, readArr, 0, read);
-                return isUtf8(readArr) == 100;
-            } else {
+    private static boolean checkMultiByte(BufferedInputStream bis, BitSet bitSet) throws Exception {
+        int count = getCountOfSequential(bitSet);
+        // 已经读取了一个字节，不能再读取
+        byte[] bytes = new byte[count - 1];
+        bis.read(bytes);
+        for (byte b : bytes) {
+            if (!checkUtf8Byte(b)) {
                return false;
            }
-        } catch (IOException e) {
-            e.printStackTrace();
-        } finally {
-            try {
-                if (is != null) {
-                    is.close();
-                }
-            } catch (IOException e) {
-                e.printStackTrace();
-            }
        }
-        return false;
+        return true;
    }

-    private static int isUtf8(byte[] raw) {
-        int i, len;
-        int utf8 = 0, ascii = 0;
-        if (raw.length > 3) {
-            if ((raw[0] == (byte) 0xEF) && (raw[1] == (byte) 0xBB) && (raw[2] == (byte) 0xBF)) {
-                return 100;
+
+    /**
+     * 检测bitSet中从开始有多少个连续的1
+     */
+    private static int getCountOfSequential(BitSet bitSet) {
+        int count = 0;
+        for (int i = 0; i < BYTE_SIZE; i++) {
+            if (bitSet.get(i)) {
+                count++;
+            } else {
+                break;
            }
        }
-        len = raw.length;
-        int child = 0;
-        for (i = 0; i < len; ) {
-            if ((raw[i] & (byte) 0xFF) == (byte) 0xFF || (raw[i] & (byte) 0xFE) == (byte) 0xFE) {
-                return 0;
-            }
-            if (child == 0) {
-                if ((raw[i] & (byte) 0x7F) == raw[i] && raw[i] != 0) {
-                    ascii++;
-                } else if ((raw[i] & (byte) 0xC0) == (byte) 0xC0) {
-                    for (int bit = 0; bit < 8; bit++) {
-                        if ((((byte) (0x80 >> bit)) & raw[i]) == ((byte) (0x80 >> bit))) {
-                            child = bit;
-                        } else {
-                            break;
-                        }
-                    }
-                    utf8++;
-                }
-                i++;
-            } else {
-                child = (raw.length - i > child) ? child : (raw.length - i);
-                boolean currentNotUtf8 = false;
-                for (int children = 0; children < child; children++) {
-                    if ((raw[i + children] & ((byte) 0x80)) != ((byte) 0x80)) {
-                        if ((raw[i + children] & (byte) 0x7F) == raw[i + children] && raw[i] != 0) {
-                            ascii++;
-                        }
-                        currentNotUtf8 = true;
-                    }
-                }
-                if (currentNotUtf8) {
-                    utf8--;
-                    i++;
-                } else {
-                    utf8 += child;
-                    i += child;
-                }
-                child = 0;
+        return count;
+    }
+
+
+    /**
+     * 检测单字节，判断是否为utf8
+     */
+    private static boolean checkUtf8Byte(byte b) throws Exception {
+        BitSet bitSet = convert2BitSet(b);
+        return bitSet.get(0) && !bitSet.get(1);
+    }
+
+
+    /**
+     * 将整形转为BitSet
+     */
+    private static BitSet convert2BitSet(int code) {
+        BitSet bitSet = new BitSet(BYTE_SIZE);
+
+        for (int i = 0; i < BYTE_SIZE; i++) {
+            int tmp3 = code >> (BYTE_SIZE - i - 1);
+            int tmp2 = 0x1 & tmp3;
+            if (tmp2 == 1) {
+                bitSet.set(i);
            }
        }
-        if (ascii == len) {
-            return 100;
+        return bitSet;
+    }
+
+    /**
+     * Return whether the charset of file is utf8.
+     *
+     * @param filePath The path of file.
+     * @return {@code true}: yes<br>{@code false}: no
+     */
+    public static boolean isUtf8(final String filePath) {
+        try {
+            return isUtf8(getFileByPath(filePath));
+        } catch (Exception e) {
+            e.printStackTrace();
+            return false;
        }
-        return (int) (100 * ((float) (utf8 + ascii) / (float) len));
    }

+
+
    /**
     * Return the number of lines of file.
     *

--- a/lib/utilcode/src/test/res/file/GBK.txt
+++ b/lib/utilcode/src/test/res/file/GBK.txt
-GBK
\ No newline at end of file
+GBK
+Ұй
\ No newline at end of file