From 6bc10a756350a54f998bba7ab018ac08acc6fa77 Mon Sep 17 00:00:00 2001
From: Jack <498607067@qq.com>
Date: Sun, 24 Nov 2019 13:46:15 +0800
Subject: [PATCH] =?UTF-8?q?feat:=20FileUtils.getFileCharsetSimple()?=
=?UTF-8?q?=E5=8C=BA=E5=88=86UTF-8=E6=97=A0BOM=E5=92=8CGBK=E7=BC=96?=
=?UTF-8?q?=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.../com/blankj/utilcode/util/FileUtils.java | 173 ++++++++++--------
lib/utilcode/src/test/res/file/GBK.txt | 3 +-
2 files changed, 96 insertions(+), 80 deletions(-)
diff --git a/lib/utilcode/src/main/java/com/blankj/utilcode/util/FileUtils.java b/lib/utilcode/src/main/java/com/blankj/utilcode/util/FileUtils.java
index c73b994b..02dd200a 100644
--- a/lib/utilcode/src/main/java/com/blankj/utilcode/util/FileUtils.java
+++ b/lib/utilcode/src/main/java/com/blankj/utilcode/util/FileUtils.java
@@ -18,6 +18,7 @@ import java.security.DigestInputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
+import java.util.BitSet;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
@@ -37,6 +38,8 @@ public final class FileUtils {
private static final String LINE_SEP = System.getProperty("line.separator");
+ private static final int BYTE_SIZE = 8;
+
private FileUtils() {
throw new UnsupportedOperationException("u can't instantiate me...");
}
@@ -903,109 +906,121 @@ public final class FileUtils {
case 0xfeff:
return "UTF-16BE";
default:
- return "GBK";
+ try {
+ if (isUtf8(file)) {
+ return "UTF-8";
+ } else {
+ return "GBK";
+ }
+ } catch (Exception e) {
+ e.printStackTrace();
+ return "GBK";
+ }
}
}
/**
* Return whether the charset of file is utf8.
*
- * @param filePath The path of file.
+ * @param file The file.
* @return {@code true}: yes
{@code false}: no
*/
- public static boolean isUtf8(final String filePath) {
- return isUtf8(getFileByPath(filePath));
+ private static boolean isUtf8(File file) throws Exception {
+ BufferedInputStream bis = new BufferedInputStream(new FileInputStream(file));
+ // 读取第一个字节
+ int code = bis.read();
+ do {
+ BitSet bitSet = convert2BitSet(code);
+ if (bitSet.get(0)) {
+ // 多字节时,再读取N个字节
+ if (!checkMultiByte(bis, bitSet)) {
+ bis.close();
+ return false;
+ }
+ }
+ // 单字节时什么都不用做,再次读取字节
+ code = bis.read();
+ } while (code != -1);
+ bis.close();
+ return true;
}
+
/**
- * Return whether the charset of file is utf8.
- *
- * @param file The file.
- * @return {@code true}: yes
{@code false}: no
+ * 检测多字节,判断是否符合utf8编码
*/
- public static boolean isUtf8(final File file) {
- if (file == null) return false;
- InputStream is = null;
- try {
- byte[] bytes = new byte[24];
- is = new BufferedInputStream(new FileInputStream(file));
- int read = is.read(bytes);
- if (read != -1) {
- byte[] readArr = new byte[read];
- System.arraycopy(bytes, 0, readArr, 0, read);
- return isUtf8(readArr) == 100;
- } else {
+ private static boolean checkMultiByte(BufferedInputStream bis, BitSet bitSet) throws Exception {
+ int count = getCountOfSequential(bitSet);
+ // 已经读取了一个字节,不能再读取
+ byte[] bytes = new byte[count - 1];
+ bis.read(bytes);
+ for (byte b : bytes) {
+ if (!checkUtf8Byte(b)) {
return false;
}
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- try {
- if (is != null) {
- is.close();
- }
- } catch (IOException e) {
- e.printStackTrace();
- }
}
- return false;
+ return true;
}
- private static int isUtf8(byte[] raw) {
- int i, len;
- int utf8 = 0, ascii = 0;
- if (raw.length > 3) {
- if ((raw[0] == (byte) 0xEF) && (raw[1] == (byte) 0xBB) && (raw[2] == (byte) 0xBF)) {
- return 100;
+
+ /**
+ * 检测bitSet中从开始有多少个连续的1
+ */
+ private static int getCountOfSequential(BitSet bitSet) {
+ int count = 0;
+ for (int i = 0; i < BYTE_SIZE; i++) {
+ if (bitSet.get(i)) {
+ count++;
+ } else {
+ break;
}
}
- len = raw.length;
- int child = 0;
- for (i = 0; i < len; ) {
- if ((raw[i] & (byte) 0xFF) == (byte) 0xFF || (raw[i] & (byte) 0xFE) == (byte) 0xFE) {
- return 0;
- }
- if (child == 0) {
- if ((raw[i] & (byte) 0x7F) == raw[i] && raw[i] != 0) {
- ascii++;
- } else if ((raw[i] & (byte) 0xC0) == (byte) 0xC0) {
- for (int bit = 0; bit < 8; bit++) {
- if ((((byte) (0x80 >> bit)) & raw[i]) == ((byte) (0x80 >> bit))) {
- child = bit;
- } else {
- break;
- }
- }
- utf8++;
- }
- i++;
- } else {
- child = (raw.length - i > child) ? child : (raw.length - i);
- boolean currentNotUtf8 = false;
- for (int children = 0; children < child; children++) {
- if ((raw[i + children] & ((byte) 0x80)) != ((byte) 0x80)) {
- if ((raw[i + children] & (byte) 0x7F) == raw[i + children] && raw[i] != 0) {
- ascii++;
- }
- currentNotUtf8 = true;
- }
- }
- if (currentNotUtf8) {
- utf8--;
- i++;
- } else {
- utf8 += child;
- i += child;
- }
- child = 0;
+ return count;
+ }
+
+
+ /**
+ * 检测单字节,判断是否为utf8
+ */
+ private static boolean checkUtf8Byte(byte b) throws Exception {
+ BitSet bitSet = convert2BitSet(b);
+ return bitSet.get(0) && !bitSet.get(1);
+ }
+
+
+ /**
+ * 将整形转为BitSet
+ */
+ private static BitSet convert2BitSet(int code) {
+ BitSet bitSet = new BitSet(BYTE_SIZE);
+
+ for (int i = 0; i < BYTE_SIZE; i++) {
+ int tmp3 = code >> (BYTE_SIZE - i - 1);
+ int tmp2 = 0x1 & tmp3;
+ if (tmp2 == 1) {
+ bitSet.set(i);
}
}
- if (ascii == len) {
- return 100;
+ return bitSet;
+ }
+
+ /**
+ * Return whether the charset of file is utf8.
+ *
+ * @param filePath The path of file.
+ * @return {@code true}: yes
{@code false}: no
+ */
+ public static boolean isUtf8(final String filePath) {
+ try {
+ return isUtf8(getFileByPath(filePath));
+ } catch (Exception e) {
+ e.printStackTrace();
+ return false;
}
- return (int) (100 * ((float) (utf8 + ascii) / (float) len));
}
+
+
/**
* Return the number of lines of file.
*
diff --git a/lib/utilcode/src/test/res/file/GBK.txt b/lib/utilcode/src/test/res/file/GBK.txt
index 88ef6b94..3d8088b4 100644
--- a/lib/utilcode/src/test/res/file/GBK.txt
+++ b/lib/utilcode/src/test/res/file/GBK.txt
@@ -1 +1,2 @@
-GBK
\ No newline at end of file
+GBK
+Ұй
\ No newline at end of file
--
GitLab