__init__.py 4.0 KB
Newer Older
W
init  
wizardforcel 已提交
1 2 3 4 5
# coding: utf-8

import re
import time
from os import path
6
from pyquery import PyQuery as pq
W
init  
wizardforcel 已提交
7 8 9 10 11 12
from .api import GoTransApi
from . import config

__author__ = "ApacheCN"
__email__ = "apachecn@163.com"
__license__ = "SATA"
W
wizardforcel 已提交
13
__version__ = "2020.08.01"
W
init  
wizardforcel 已提交
14

W
wizardforcel 已提交
15
# Whole code-like elements (<pre>, <code>, <tt>, <var>, <kbd>) together with
# their content; the back-reference makes the closing tag match the opener.
RE_CODE = r'<(pre|code|tt|var|kbd)[^>]*?>[\s\S]*?</\1>'
# Any single tag.
RE_TAG = r'<[^>]*?>'
# Character references: named (&amp;), decimal (&#160;) and hexadecimal
# (&#xA0;).  Hex digits a-f must be accepted, otherwise references such as
# &#x1F4A9; are left in the text that gets sent for translation.
RE_ENTITY = r'&(\w+|#\d+|#[xX][0-9a-fA-F]+);'

api = GoTransApi()

def tags_preprocess(html):
    """Swap markup in *html* for numbered placeholders.

    Every match of RE_CODE, RE_TAG and RE_ENTITY is removed from the text,
    stored in a list, and replaced by a space-padded ``[HTG<n>]`` token,
    where ``<n>`` is the position of the stored match in that list.

    Returns a ``(text, tags)`` tuple: the placeholder-bearing text with
    line breaks flattened, and the list of stored matches.
    """
    tags = []

    def stash(match):
        # Remember the matched markup and hand back its placeholder.
        tags.append(match.group())
        return f' [HTG{len(tags) - 1}] '

    # Order matters: code-like elements are lifted out as a whole first,
    # then the remaining tags, then character references.
    for pattern in (RE_CODE, RE_TAG, RE_ENTITY):
        html = re.sub(pattern, stash, html)

    # Line feeds become spaces and carriage returns are dropped, which
    # covers both Unix and Windows line endings.
    html = html.replace('\n', ' ').replace('\r', '')
    return html, tags

def tags_recover(html, tags):
    """Put the stored markup back in place of its ``[HTG<n>]`` placeholders.

    *tags* is the list of stored markup; the placeholder ``[HTG<n>]`` maps
    to ``tags[n]``.  Placeholders whose index has no entry in *tags* are
    left untouched.

    The substitution is done in a single pass over *html*, so markup that
    has already been restored is never scanned again.  Replacing the
    placeholders one after another would corrupt restored content that
    itself contains placeholder-looking text (e.g. a code block holding
    the literal string ``[HTG1]``).
    """
    if not tags:
        return html

    # Keyed by the exact placeholder text so only canonical tokens match.
    by_token = {f'[HTG{i}]': t for i, t in enumerate(tags)}
    return re.sub(
        r'\[HTG\d+\]',
        lambda m: by_token.get(m.group(), m.group()),
        html,
    )

def trans_real(src):
    """Translate *src* through the module-level ``api``, retrying on failure.

    Up to ``config.retry`` attempts are made.  The input and each result
    are printed; an exception raised during an attempt is printed as well.
    After an attempt that raised or produced a falsy result, the function
    sleeps for ``config.wait_sec`` before trying again.

    Returns the translated text with its ``[HTG<n>]`` placeholders
    normalised, or ``None`` when no attempt produced a truthy result.
    """
    translated = None
    for _ in range(config.retry):
        try:
            print(src)
            translated = api.translate(src, src=config.src, dst=config.dst)
            print(translated)
            if translated:
                break
            time.sleep(config.wait_sec)
        except Exception as err:
            print(err)
            time.sleep(config.wait_sec)

    if translated:
        # Repair placeholders that came back lower-cased or with extra
        # whitespace inside the brackets.
        return re.sub(r'\[\s*(?:htg|HTG)\s*(\d+)\s*\]', r'[HTG\1]', translated)
    return None

def trans_one(html):
    """Translate one HTML fragment while keeping its markup intact.

    Returns ``''`` when *html* is ``None`` or blank, ``None`` when the
    translation step yields nothing, and the translated fragment with its
    markup restored otherwise.
    """
    if html is None or html.strip() == '':
        return ''

    # Swap markup for placeholders before translating.
    html, tokens = tags_preprocess(html)

    # Translate the placeholder-bearing text.
    html = trans_real(html)
    if not html:
        return None

    # Put the original markup back.
    return tags_recover(html, tokens)

def _translate_elem(elem):
    """Translate the inner HTML of a PyQuery-wrapped element in place.

    The element is left unchanged when ``trans_one`` returns a falsy
    value (blank content or a failed translation).
    """
    translated = trans_one(elem.html())
    if translated:
        elem.html(translated)


def trans_html(html):
    """Translate the text-bearing elements of an HTML document.

    The document is run through ``preprocess`` and parsed with PyQuery.
    Paragraphs and headings are translated first, then ``<blockquote>``,
    ``<td>`` and ``<th>`` elements, then ``<li>`` elements.  In the last
    two groups, elements that have ``<p>`` children are skipped.

    Returns the serialised document as a string.
    """
    root = pq(preprocess(html))

    # <p> and <h1>..<h6>
    for node in root('p, h1, h2, h3, h4, h5, h6'):
        _translate_elem(pq(node))

    # <blockquote>, <td>, <th>
    for node in root('blockquote, td, th'):
        elem = pq(node)
        if elem.children('p'):
            continue
        _translate_elem(elem)

    # <li>
    for node in root('li'):
        elem = pq(node)
        if elem.children('p'):
            continue

        # Detach nested lists (both <ul> and <ol>) so only this item's own
        # content is sent for translation.
        sub_lists = elem.children('ul, ol')
        if sub_lists:
            sub_lists.remove()

        _translate_elem(elem)

        # Always re-attach the nested lists, including when the translation
        # produced nothing; otherwise they would vanish from the document.
        if sub_lists:
            elem.append(sub_lists)

    return str(root)
W
init  
wizardforcel 已提交
143

W
wizardforcel 已提交
144 145 146 147 148 149
def _retag_as_text(root, selector, tag):
    """Replace each element matching *selector* with ``<tag>`` holding its text."""
    for node in root(selector):
        old = pq(node)
        new = pq(f'<{tag}></{tag}>')
        # Set the content as text rather than appending it as markup, so
        # characters such as '<' and '&' in code are not re-parsed as HTML.
        new.text(old.text())
        old.replace_with(new)


def preprocess(html):
    """Normalise an HTML document before translation.

    Strips the XML declaration and ``xmlns="..."`` attributes, turns
    non-breaking-space references into plain spaces, and rewrites code
    containers as standard elements: ``div.code`` / ``div.Code`` become
    ``<pre>`` and ``span.inline-code`` / ``span.CodeInline`` become
    ``<code>``, each holding the text of the element it replaces.

    Returns the serialised document as a string.
    """
    html = re.sub(r'<\?xml[^>]*\?>', '', html)
    html = re.sub(r'xmlns=".+?"', '', html)
    html = html.replace('&#160;', ' ') \
               .replace('&nbsp;', ' ')

    root = pq(html)

    _retag_as_text(root, 'div.code, div.Code', 'pre')
    # The inline case previously passed the bound method ``c.text`` instead
    # of calling it; the shared helper calls ``text()`` for both cases.
    _retag_as_text(root, 'span.inline-code, span.CodeInline', 'code')

    return str(root)