Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleOCR
提交
e6de4b1e
P
PaddleOCR
项目概览
PaddlePaddle
/
PaddleOCR
大约 1 年 前同步成功
通知
1525
Star
32962
Fork
6643
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
108
列表
看板
标记
里程碑
合并请求
7
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleOCR
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
108
Issue
108
列表
看板
标记
里程碑
合并请求
7
合并请求
7
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
e6de4b1e
编写于
6月 08, 2021
作者:
W
WenmuZhou
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add tablepyxl
上级
dfba983c
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
414 addition
and
0 deletion
+414
-0
ppstructure/table/tablepyxl/__init__.py
ppstructure/table/tablepyxl/__init__.py
+13
-0
ppstructure/table/tablepyxl/style.py
ppstructure/table/tablepyxl/style.py
+283
-0
ppstructure/table/tablepyxl/tablepyxl.py
ppstructure/table/tablepyxl/tablepyxl.py
+118
-0
未找到文件。
ppstructure/table/tablepyxl/__init__.py
0 → 100644
浏览文件 @
e6de4b1e
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
\ No newline at end of file
ppstructure/table/tablepyxl/style.py
0 → 100644
浏览文件 @
e6de4b1e
# This is where we handle translating css styles into openpyxl styles
# and cascading those from parent to child in the dom.
from
openpyxl.cell
import
cell
from
openpyxl.styles
import
Font
,
Alignment
,
PatternFill
,
NamedStyle
,
Border
,
Side
,
Color
from
openpyxl.styles.fills
import
FILL_SOLID
from
openpyxl.styles.numbers
import
FORMAT_CURRENCY_USD_SIMPLE
,
FORMAT_PERCENTAGE
from
openpyxl.styles.colors
import
BLACK
# Number format returned by TableCell.get_number_format for cells whose
# class attribute contains TYPE_DATE.
FORMAT_DATE_MMDDYYYY = 'mm/dd/yyyy'
def colormap(color):
    """
    Translate a known color name into its openpyxl color constant.

    Values that have no entry in the lookup table (including None) are
    returned unchanged.
    """
    known_colors = {'black': BLACK}
    if color in known_colors:
        return known_colors[color]
    return color
def style_string_to_dict(style):
    """
    Convert css style string to a python dictionary

    Declarations are separated by ``;``. Each declaration is split on its
    first ``:`` only, so a value that itself contains colons (for example
    ``url(http://host/img.png)``) is kept intact instead of raising a
    ValueError. Fragments without any ``:`` are ignored. Property names and
    values are stripped of surrounding whitespace; when a property appears
    more than once the last declaration wins.
    """
    styles = {}
    for declaration in style.split(";"):
        if ":" not in declaration:
            continue
        # maxsplit=1: everything after the first colon belongs to the value.
        prop, value = declaration.split(":", 1)
        styles[prop.strip()] = value.strip()
    return styles
def get_side(style, name):
    """
    Build the keyword arguments for an openpyxl ``Side`` from css border styles.

    Looks up ``border-<name>-style`` and ``border-<name>-color`` in *style*
    (missing entries become None) and passes the color through ``colormap``.
    """
    style_key = 'border-{}-style'.format(name)
    color_key = 'border-{}-color'.format(name)
    side_kwargs = {}
    side_kwargs['border_style'] = style.get(style_key)
    side_kwargs['color'] = colormap(style.get(color_key))
    return side_kwargs
# Module-level cache of NamedStyle objects, keyed by the string that
# style_dict_to_named_style derives from a style dict and number format.
known_styles = {}
def style_dict_to_named_style(style_dict, number_format=None):
    """
    Change css style (stored in a python dictionary) to openpyxl NamedStyle

    Results are memoised in the module-level ``known_styles`` dict, so an
    identical style/number-format combination reuses the same NamedStyle.

    :param style_dict: a StyleDict (needs ``get``, ``get_color`` and ``_keys``)
    :param number_format: number format forwarded to the NamedStyle
    :return: the cached or newly created NamedStyle
    """
    # Key the cache on the *resolved* cascade (every key reachable through the
    # parent chain with its effective value). Keying only on the element's own
    # dict and its direct parent would let two elements that differ solely in
    # a grandparent's styles wrongly share one NamedStyle.
    resolved_style = sorted((key, style_dict.get(key)) for key in style_dict._keys())
    style_and_format_string = str({
        'style_dict': resolved_style,
        'number_format': number_format,
    })

    if style_and_format_string in known_styles:
        return known_styles[style_and_format_string]

    # Font
    font = Font(bold=style_dict.get('font-weight') == 'bold',
                color=style_dict.get_color('color', None),
                size=style_dict.get('font-size'))

    # Alignment
    alignment = Alignment(horizontal=style_dict.get('text-align', 'general'),
                          vertical=style_dict.get('vertical-align'),
                          wrap_text=style_dict.get('white-space', 'nowrap') == 'normal')

    # Fill
    bg_color = style_dict.get_color('background-color')
    fg_color = style_dict.get_color('foreground-color', Color())
    fill_type = style_dict.get('fill-type')
    if bg_color and bg_color != 'transparent':
        fill = PatternFill(fill_type=fill_type or FILL_SOLID,
                           start_color=bg_color,
                           end_color=fg_color)
    else:
        fill = PatternFill()

    # Border
    border = Border(left=Side(**get_side(style_dict, 'left')),
                    right=Side(**get_side(style_dict, 'right')),
                    top=Side(**get_side(style_dict, 'top')),
                    bottom=Side(**get_side(style_dict, 'bottom')),
                    diagonal=Side(**get_side(style_dict, 'diagonal')),
                    diagonal_direction=None,
                    outline=Side(**get_side(style_dict, 'outline')),
                    vertical=None,
                    horizontal=None)

    # Style names are numbered in creation order.
    name = 'Style {}'.format(len(known_styles) + 1)

    pyxl_style = NamedStyle(name=name, font=font, fill=fill, alignment=alignment, border=border,
                            number_format=number_format)

    known_styles[style_and_format_string] = pyxl_style
    return pyxl_style
class StyleDict(dict):
    """
    It's like a dictionary, but it looks for items in the parent dictionary

    The optional ``parent`` keyword argument is another StyleDict that is
    consulted whenever a key is not set on this instance, which gives
    css-like cascading from ancestor to descendant.
    """
    def __init__(self, *args, **kwargs):
        self.parent = kwargs.pop('parent', None)
        super(StyleDict, self).__init__(*args, **kwargs)

    def __getitem__(self, item):
        if item in self:
            return super(StyleDict, self).__getitem__(item)
        # Compare against None explicitly: a parent without declarations of
        # its own is an empty (falsy) dict but may still have ancestors that
        # define the item, so a truthiness test would cut the cascade short.
        if self.parent is not None:
            return self.parent[item]
        raise KeyError('{} not found'.format(item))

    def __hash__(self):
        return hash(tuple([(k, self.get(k)) for k in self._keys()]))

    # Yielding the keys avoids creating unnecessary data structures
    # and happily works with both python2 and python3 where the
    # .keys() method is a dictionary_view in python3 and a list in python2.
    def _keys(self):
        """Yield each key once: own keys first, then keys inherited from ancestors."""
        yielded = set()
        for k in self.keys():
            yielded.add(k)
            yield k
        # Same explicit None check as in __getitem__ so an empty parent does
        # not hide the keys of its own ancestors.
        if self.parent is not None:
            for k in self.parent._keys():
                if k not in yielded:
                    yielded.add(k)
                    yield k

    def get(self, k, d=None):
        """Return the (possibly inherited) value for *k*, or *d* when it is not found."""
        try:
            return self[k]
        except KeyError:
            return d

    def get_color(self, k, d=None):
        """
        Strip leading # off colors if necessary

        Three-digit hex colors are expanded to six digits. Values that are
        not strings, or do not start with ``#``, are returned unchanged.
        """
        color = self.get(k, d)
        if hasattr(color, 'startswith') and color.startswith('#'):
            color = color[1:]
            if len(color) == 3:
                # Premailer reduces colors like #00ff00 to #0f0, openpyxl doesn't like that
                color = ''.join(2 * c for c in color)
        return color
class Element(object):
    """
    Our base class for representing an html element along with a cascading style.
    The element is created along with a parent so that the StyleDict that we store
    can point to the parent's StyleDict.
    """
    def __init__(self, element, parent=None):
        self.element = element
        self.number_format = None
        parent_style = parent.style_dict if parent else None
        # Parse the inline ``style`` attribute; a missing attribute yields an empty style.
        self.style_dict = StyleDict(style_string_to_dict(element.get('style', '')), parent=parent_style)
        self._style_cache = None

    def style(self):
        """
        Turn the css styles for this element into an openpyxl NamedStyle.

        The NamedStyle is computed on first use and then kept on the instance.
        """
        if not self._style_cache:
            self._style_cache = style_dict_to_named_style(self.style_dict, number_format=self.number_format)
        return self._style_cache

    def get_dimension(self, dimension_key):
        """
        Extracts the dimension from the style dict of the Element and returns it as a float.

        A trailing ``px``/``em``/``pt``/``in``/``cm`` unit is dropped before
        conversion. A missing or empty value is returned as is (None or '').
        Values that cannot be converted to a number (``100%``, ``auto``, ...)
        yield None instead of raising, so an unsupported css length is simply
        treated as "no dimension given".
        """
        dimension = self.style_dict.get(dimension_key)
        if not dimension:
            return dimension
        if dimension[-2:] in ['px', 'em', 'pt', 'in', 'cm']:
            dimension = dimension[:-2]
        try:
            return float(dimension)
        except ValueError:
            return None
class Table(Element):
    """
    Root of the fixed element tree used for html tables: a Table owns an
    optional TableHead and a TableBody, which in turn own rows and cells.
    The tree shape is deliberately concrete rather than an abstract element
    tree with an arbitrary number of children.
    """
    def __init__(self, table):
        """
        takes an html table object (from lxml)

        ``head`` is None when the table has no ``<thead>``. When there is no
        ``<tbody>`` the table element itself is used as the body.
        """
        super(Table, self).__init__(table)

        thead = table.find('thead')
        if thead is None:
            self.head = None
        else:
            self.head = TableHead(thead, parent=self)

        tbody = table.find('tbody')
        body_source = table if tbody is None else tbody
        self.body = TableBody(body_source, parent=self)
class TableHead(Element):
    """
    This class maps to the `<thead>` element of the html table.
    """
    def __init__(self, head, parent=None):
        super(TableHead, self).__init__(head, parent=parent)
        # One TableRow per direct <tr> child, in document order.
        head_rows = []
        for tr in head.findall('tr'):
            head_rows.append(TableRow(tr, parent=self))
        self.rows = head_rows
class TableBody(Element):
    """
    This class maps to the `<tbody>` element of the html table.
    """
    def __init__(self, body, parent=None):
        super(TableBody, self).__init__(body, parent=parent)
        # One TableRow per direct <tr> child, in document order.
        body_rows = []
        for tr in body.findall('tr'):
            body_rows.append(TableRow(tr, parent=self))
        self.rows = body_rows
class TableRow(Element):
    """
    This class maps to the `<tr>` element of the html table.
    """
    def __init__(self, tr, parent=None):
        super(TableRow, self).__init__(tr, parent=parent)
        # Walk the direct children once so <th> and <td> cells keep their
        # document order. Collecting all <th> first and then all <td> would
        # move header cells to the front of a row that mixes both kinds.
        self.cells = [TableCell(child, parent=self)
                      for child in tr
                      if child.tag in ('th', 'td')]
def element_to_string(el):
    """Return the text of *el* and its descendants, stripped of surrounding whitespace."""
    raw_text = _element_to_string(el)
    return raw_text.strip()
def
_element_to_string
(
el
):
string
=
''
for
x
in
el
.
iterchildren
():
string
+=
'
\n
'
+
_element_to_string
(
x
)
text
=
el
.
text
.
strip
()
if
el
.
text
else
''
tail
=
el
.
tail
.
strip
()
if
el
.
tail
else
''
return
text
+
string
+
'
\n
'
+
tail
class TableCell(Element):
    """
    Represents a single table cell (a `<th>` or `<td>` element) together with
    its text value and the number format derived from its css classes.
    """
    # Class names on the html element that are recognised as cell type markers.
    CELL_TYPES = {'TYPE_STRING', 'TYPE_FORMULA', 'TYPE_NUMERIC', 'TYPE_BOOL', 'TYPE_CURRENCY', 'TYPE_PERCENTAGE',
                  'TYPE_NULL', 'TYPE_INLINE', 'TYPE_ERROR', 'TYPE_FORMULA_CACHE_STRING', 'TYPE_INTEGER'}

    def __init__(self, cell, parent=None):
        super(TableCell, self).__init__(cell, parent=parent)
        self.value = element_to_string(cell)
        self.number_format = self.get_number_format()

    def data_type(self):
        """
        Return the openpyxl cell type constant selected by the element's classes.

        TYPE_FORMULA wins over every other marker; currency, integer and
        percentage markers map to TYPE_NUMERIC; with no marker at all the
        cell is a TYPE_STRING.
        """
        element_classes = set(self.element.get('class', '').split())
        matched = self.CELL_TYPES & element_classes
        if not matched:
            type_name = 'TYPE_STRING'
        elif 'TYPE_FORMULA' in matched:
            # Make sure TYPE_FORMULA takes precedence over the other classes in the set.
            type_name = 'TYPE_FORMULA'
        elif matched & {'TYPE_CURRENCY', 'TYPE_INTEGER', 'TYPE_PERCENTAGE'}:
            type_name = 'TYPE_NUMERIC'
        else:
            type_name = matched.pop()
        # ``cell`` is the openpyxl.cell.cell module, which holds the TYPE_* constants.
        return getattr(cell, type_name)

    def get_number_format(self):
        """
        Return the number format implied by the element's classes, or None.

        Explicit markers are checked in a fixed order (currency, integer,
        percentage, date). Otherwise a numeric cell gets an integer format
        when its value parses as an int and a decimal format when it does not.
        """
        element_classes = self.element.get('class', '').split()
        marker_formats = (
            ('TYPE_CURRENCY', FORMAT_CURRENCY_USD_SIMPLE),
            ('TYPE_INTEGER', '#,##0'),
            ('TYPE_PERCENTAGE', FORMAT_PERCENTAGE),
            ('TYPE_DATE', FORMAT_DATE_MMDDYYYY),
        )
        for marker, number_format in marker_formats:
            if marker in element_classes:
                return number_format

        if self.data_type() != cell.TYPE_NUMERIC:
            return None
        try:
            int(self.value)
        except ValueError:
            return '#,##0.##'
        return '#,##0'

    def format(self, cell):
        """Apply this cell's NamedStyle and, when truthy, its data type to the openpyxl *cell*."""
        cell.style = self.style()
        resolved_type = self.data_type()
        if resolved_type:
            cell.data_type = resolved_type
\ No newline at end of file
ppstructure/table/tablepyxl/tablepyxl.py
0 → 100644
浏览文件 @
e6de4b1e
# Do imports like python3 so our package works for 2 and 3
from
__future__
import
absolute_import
from
lxml
import
html
from
openpyxl
import
Workbook
from
openpyxl.utils
import
get_column_letter
from
premailer
import
Premailer
from
tablepyxl.style
import
Table
def string_to_int(s):
    """
    Convert a string of digits to an int; return 0 for anything else.

    ``str.isdigit`` also accepts characters such as superscript digits that
    ``int`` cannot parse, so the conversion itself is guarded as well and
    such input yields 0 instead of raising ValueError.
    """
    if not s.isdigit():
        return 0
    try:
        return int(s)
    except ValueError:
        return 0
def get_Tables(doc):
    """
    Parse the html string *doc* and return a Table for every ``<table>`` found.

    Comment nodes are dropped from the parsed tree before the tables are wrapped.
    """
    tree = html.fromstring(doc)
    for comment_node in tree.xpath('//comment()'):
        comment_node.drop_tag()
    table_elements = tree.xpath('//table')
    return [Table(table_element) for table_element in table_elements]
def write_rows(worksheet, elem, row, column=1):
    """
    Writes every tr child element of elem to a row in the worksheet
    returns the next row after all rows are written

    :param worksheet: openpyxl worksheet to write into
    :param elem: object with a ``rows`` list (each row has ``cells``)
    :param row: worksheet row at which to start writing
    :param column: worksheet column at which every row starts
    :return: index of the first row after the written block
    """
    from openpyxl.cell.cell import MergedCell

    initial_column = column
    for table_row in elem.rows:
        for table_cell in table_row.cells:
            cell = worksheet.cell(row=row, column=column)
            # Skip positions already covered by a merge from an earlier cell.
            while isinstance(cell, MergedCell):
                column += 1
                cell = worksheet.cell(row=row, column=column)

            # string_to_int yields 0 for "0" and for unparseable attributes.
            # A span below 1 would leave the column un-advanced (so the next
            # cell overwrites this one) or describe an inverted merge range,
            # so every cell is treated as spanning at least one row/column.
            colspan = max(1, string_to_int(table_cell.element.get("colspan", "1")))
            rowspan = max(1, string_to_int(table_cell.element.get("rowspan", "1")))
            if rowspan > 1 or colspan > 1:
                worksheet.merge_cells(start_row=row, start_column=column,
                                      end_row=row + rowspan - 1, end_column=column + colspan - 1)

            cell.value = table_cell.value
            table_cell.format(cell)
            min_width = table_cell.get_dimension('min-width')
            max_width = table_cell.get_dimension('max-width')

            if colspan == 1:
                # Initially, when iterating for the first time through the loop, the width of all the cells is None.
                # As we start filling in contents, the initial width of the cell (which can be retrieved by:
                # worksheet.column_dimensions[get_column_letter(column)].width) is equal to the width of the previous
                # cell in the same column (i.e. width of A2 = width of A1)
                column_dimension = worksheet.column_dimensions[get_column_letter(column)]
                width = max(column_dimension.width or 0, len(table_cell.value) + 2)
                if max_width and width > max_width:
                    width = max_width
                elif min_width and width < min_width:
                    width = min_width
                column_dimension.width = width
            column += colspan
        row += 1
        column = initial_column
    return row
def table_to_sheet(table, wb):
    """
    Write *table* to a newly created sheet of the workbook *wb*.

    The sheet title is taken from the table element's ``name`` attribute,
    and the table is inserted starting at column 1, row 1.
    """
    sheet_title = table.element.get('name')
    ws = wb.create_sheet(title=sheet_title)
    insert_table(table, ws, 1, 1)
def document_to_workbook(doc, wb=None, base_url=None):
    """
    Convert the html document string *doc* into a workbook with one sheet
    per table found in the document.

    When no workbook is passed a new one is created and its initial active
    sheet is removed. Styles are inlined with Premailer (keeping the class
    attributes) before the tables are extracted.
    The workbook is returned
    """
    if not wb:
        wb = Workbook()
        wb.remove(wb.active)

    premailer = Premailer(doc, base_url=base_url, remove_classes=False)
    inline_styles_doc = premailer.transform()

    for table in get_Tables(inline_styles_doc):
        table_to_sheet(table, wb)

    return wb
def document_to_xl(doc, filename, base_url=None):
    """
    Convert the html document string *doc* into a workbook (one sheet per
    table) and save that workbook to the file called *filename*.
    """
    workbook = document_to_workbook(doc, base_url=base_url)
    workbook.save(filename)
def insert_table(table, worksheet, column, row):
    """
    Write the head and then the body of *table* into *worksheet*.

    Writing starts at the given column and row; the body begins on the row
    that follows the last head row. A section that is falsy is skipped.
    """
    for section_name in ('head', 'body'):
        section = getattr(table, section_name)
        if section:
            row = write_rows(worksheet, section, row, column)
def insert_table_at_cell(table, cell):
    """
    Inserts a table at the location of an openpyxl Cell object.

    The worksheet is taken from ``cell.parent`` and the top-left position
    from ``cell.column`` and ``cell.row``.
    """
    worksheet = cell.parent
    start_column = cell.column
    start_row = cell.row
    insert_table(table, worksheet, start_column, start_row)
\ No newline at end of file
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录