#
# File      : formatting.py
# This file is part of RT-Thread RTOS
# COPYRIGHT (C) 2006 - 2018, RT-Thread Development Team
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
#
# Change Logs:
# Date           Author       Notes
# 2021-03-02     Meco Man     The first version
# 2021-03-04     Meco Man     增加统一转换成UTF-8编码格式功能


#本文件会自动对指定路径下的所有文件包括子文件夹的文件(仅针对.c.h)进行扫描
#   1)将源文件编码统一为UTF-8;
#   2)将TAB键替换为空格;
#   3)将每行末尾多余的空格删除,并统一换行符为'\n'; 
#使用时只需要双击本文件,输入要扫描的文件夹路径即可
#不能保证100%全部成功转换为UTF-8,有一些编码特殊或识别不准确会在终端打印信息,需人工转换

#欢迎对本文件的功能继续做出补充,欢迎提交PR

import os
import chardet

#用空格代替TAB键
#这里并不是简单的将TAB替换成4个空格
#空格个数到底是多少需要计算,因为TAB制表本身有自动对齐的功能
def tab2spaces(line):
    """Expand every TAB in *line* to spaces, honouring 4-column tab stops.

    A TAB is not blindly replaced by four spaces: it is replaced by however
    many spaces (1 to 4) are needed to reach the next column that is a
    multiple of 4, which is what an editor's tab alignment does.

    Columns are counted in characters from the start of the string.

    Args:
        line: The text to process (normally one line of a source file).

    Returns:
        A copy of *line* that contains no TAB characters.
    """
    tab_width = 4
    pieces = []
    column = 0  # number of characters already emitted
    for ch in line:
        if ch == '\t':
            # Pad up to the next tab stop; a TAB sitting exactly on a stop
            # still advances a full tab width.
            padding = tab_width - (column % tab_width)
            pieces.append(' ' * padding)
            column += padding
        else:
            pieces.append(ch)
            column += 1
    return ''.join(pieces)

#删除每行末尾多余的空格 统一使用\n作为结尾
def formattail(line):
    """Return *line* with trailing whitespace removed and a single '\\n' appended.

    Any existing line terminator is trailing whitespace too, so it is
    stripped and replaced by exactly one '\\n'.
    """
    stripped = line.rstrip()
    return f"{stripped}\n"

#对单个文件进行格式整理
def format_codes(filename):
    """Rewrite *filename* with tabs expanded and trailing whitespace removed.

    The file is read as UTF-8 and every line is passed through
    ``tab2spaces`` and ``formattail``. The result is written to a scratch
    file next to the original, which then replaces the original in one
    step. An interrupted run therefore never leaves a half-written source
    file behind.

    If the file is not valid UTF-8, a message is printed and the original
    file is left untouched.

    Args:
        filename: Path of the file to format in place.
    """
    # Keep the scratch file in the same directory as the target so the final
    # replace never has to cross a filesystem boundary.
    temp_path = filename + '.formatting.tmp'
    try:
        # newline='\n' stops text mode from turning '\n' back into CRLF on
        # Windows, so the output really uses '\n' everywhere.
        with open(filename, 'r', encoding='utf-8') as source, \
                open(temp_path, 'w', encoding='utf-8', newline='\n') as target:
            for line in source:
                target.write(formattail(tab2spaces(line)))
        os.replace(temp_path, filename)
    except UnicodeDecodeError:
        print("UnicodeDecodeError未处理,需人工确认" + filename)
    finally:
        # Still present only when something failed before the replace.
        if os.path.exists(temp_path):
            os.remove(temp_path)

def get_encode_info(file):
    """Detect the encoding of *file* and return it if it is safe to convert.

    The raw bytes of the file are handed to ``chardet.detect``. Two known
    misdetections are corrected to GB2312, and only a small allow-list of
    encodings is accepted. Any other detected encoding is reported on the
    console for manual review.

    Args:
        file: Path of the file to inspect.

    Returns:
        One of 'ascii', 'utf-8', 'GB2312' or 'Windows-1252', or ``None`` when
        no encoding was detected or the detected one is not on the allow-list.
    """
    with open(file, 'rb') as f:
        code = chardet.detect(f.read())['encoding']

    # chardet sometimes misidentifies the encoding:
    #   * mostly-English text with a little Chinese tends to be reported as
    #     the Japanese encoding EUC-JP;
    #   * some GB2312 files are reported as ISO-8859-1.
    if code in ('EUC-JP', 'ISO-8859-1'):
        code = 'GB2312'

    if code is None:
        return None

    # Windows-1252 is accepted because of vendor sources (the original note
    # cites STMicroelectronics) whose text contains a non-ASCII apostrophe.
    if code not in ('ascii', 'utf-8', 'GB2312', 'Windows-1252'):
        print('未处理,需人工确认:'+code+':'+file)  # needs manual confirmation
        return None

    return code

#将单个文件转为UTF-8编码
def conver_to_utf_8(path):
    """Re-encode the file at *path* as UTF-8, in place.

    The source encoding comes from ``get_encode_info``. The file content is
    decoded with that encoding and written back as UTF-8 only when the bytes
    actually change.

    Args:
        path: Path of the file to convert.

    Returns:
        1 on success (the file is now UTF-8). 0 when the encoding could not
        be determined or the content could not be decoded/encoded; in that
        case a message is printed and the file is left for manual handling.
    """
    try:
        info = get_encode_info(path)
        if info is None:
            return 0  # 0: failure

        # The context manager closes the handle even when decoding raises.
        with open(path, 'rb+') as file:
            data = file.read()
            utf = data.decode(info).encode('utf-8')
            if utf != data:
                file.seek(0)
                file.write(utf)
                # Drop leftover bytes in case the new content is shorter.
                file.truncate()
        return 1  # 1: success
    except UnicodeDecodeError:
        print("UnicodeDecodeError未处理,需人工确认"+path)
        return 0
    except UnicodeEncodeError:
        print("UnicodeEncodeError未处理,需人工确认"+path)
        return 0

# 递归扫描目录下的所有文件
def traversalallfile(path):
    """Recursively process every .c and .h file under the directory *path*.

    Sub-directories are descended into. Each regular file whose path ends in
    ".c" or ".h" is first converted to UTF-8, and is formatted only if that
    conversion reports success (1).
    """
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)

        if os.path.isdir(full_path):
            traversalallfile(full_path)
            continue

        if not os.path.isfile(full_path):
            continue

        # Only C sources and headers are handled.
        if not full_path.endswith((".c", ".h")):
            continue

        # Convert to UTF-8 first; format only when that succeeded.
        if conver_to_utf_8(full_path) == 1:
            format_codes(full_path)

def formatfiles():
    """Ask for a directory on the console and process everything beneath it."""
    traversalallfile(input('enter work path: '))

# Start the interactive formatter only when this file is run as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    formatfiles()