Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
fuyue82
o2oa
提交
29ff9339
o2oa
项目概览
fuyue82
/
o2oa
与 Fork 源项目一致
Fork自
浙江兰德纵横网络技术股份有限公司 / o2oa
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
DevOps
流水线
流水线任务
计划
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
o2oa
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
DevOps
DevOps
流水线
流水线任务
计划
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
流水线任务
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
29ff9339
编写于
3月 01, 2019
作者:
R
roo00
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
仅支持windows
上级
d3ad7e02
变更
3
隐藏空白更改
内联
并排
Showing
3 changed files
with
182 additions
and
4 deletions
+182
-4
o2server/x_base_core_project/src/main/java/com/x/base/core/project/config/Person.java
.../src/main/java/com/x/base/core/project/config/Person.java
+1
-1
o2server/x_base_core_project/src/main/java/com/x/base/core/project/config/Query.java
...t/src/main/java/com/x/base/core/project/config/Query.java
+2
-3
o2server/x_base_core_project/src/main/java/com/x/base/core/project/tools/ExtractTextTools.java
.../java/com/x/base/core/project/tools/ExtractTextTools.java
+179
-0
未找到文件。
o2server/x_base_core_project/src/main/java/com/x/base/core/project/config/Person.java
浏览文件 @
29ff9339
...
...
@@ -31,7 +31,7 @@ public class Person extends ConfigObject {
public
static
final
String
ICON_UNKOWN
=
"iVBORw0KGgoAAAANSUhEUgAAAJAAAACQCAYAAADnRuK4AAAAAXNSR0IArs4c6QAAFlZJREFUeAHtXXuMFdUZH5aF5bHLYxFcXgXchV0kuzxNqqVaMca2VBNNFFNTommjVm0Tq8YYTayvNDGFmiY+mqKNYLRR/6hWErEWGjRaElgEsuCyGNgHCKwswrIisEB/v+md69y78zgzc+bMOcueZO6ZO3Me3/fNb77zne88ZpB1gYWWlpaynp6emnPnztWdP3++FuxX4xiD84pBgwZVMMZ/93kvrnfjerc7RppuHJ241oyymktKSprnz5/fhv/ncf2CCYP6M6fbt28fe+bMmR/h4S/Gg52NuBbxdMQlKfF9EuW3oPxmlL8DxwYcmxYtWnQmpfoyL7ZfAaipqan89OnTV549e3YJJHs1Hua8FMEi9PBAQw8Sfox4A4718+bNa0R8ViizAYmMBxBAU3ny5MllkPXPcXwfR6nmcj8GAL0PYK9ZuHDhOpz3ak5vIHlGAgigGQrQ/BTCXw7uluJhDA3kUtOboP8wSHsD9K9GM9eoKZmBZBkFoM2bN9dD6HdD4NQ44wI5M+9mE3hbPWzYsFVz5szpMoV8IwC0devWy2DXPAoB3wDwGEFzAgCwt/fi0KFDV9bX1x9KUI6SrFo/jMbGxivRRX4MkrhWiTT0qoQ9upcBpGcBpHa9SPuOGi0BtGXLliXQNE+AzMXfkXrBntEFsBrHk7CT2nSTglYA+uyzzyajqVoJ8Nyim6A0oOcb0PAUjhU6+ZW0ABAAUwo757dorn4PAdETPBB8JIBmbRfkdS9ARCdl5iFzAKFnxWbqBRz1mUvDLAJeHz58+APosR3MkuzMALR3795hR44c+RPeqLvwRmVGR5bCl1D3cYzB3bdgwYI1EsqKVUQmD27btm21vb29bwI4DbGoHshUIAG8hK9MmDDhvqlTp54suKHgj3IAoYd1G/h6CeApV8DfhVQFHZE3Y3hkl0qmlQGovb19+KFDh/4M5n6lksELqS4AqCdnYL+qim8lAIKh/D0w9B6OAUNZzZNdBU30awAq9YHa1AEE384c+HbW4c2YrEZ2A7XkJLAW8S3o7tN/lFpIa2KVTTB8O1cAPB8NgCe15xdU8FJooA853SUoUdJ7qWkgGMs/A3DeBIHDkxI5kD++BACinaWlpdfNnTu3I34p/jlTARDAczvA81dUq/vkLn/J9K877QDSdWn00KQDKAeev/Uv+ZvPDQDEyWs/AIj2yORGqg2Ua7aoeQaCZhJAizABxwewiapkkiZNA+UM5g9B3IDNI/MJSS4LmmgbgHQVemfHZBQtRQOxq46RdPp5BsAj46mkWAbAMxcgeofr42RUk9jIpZMw5+cZK4OgrMoYPHiwhfnIFmYAWhigtA8I2wJv9oHlQtapU6csXjM9UAMdP36ck/lvBpgSLTFK1ITlhic2QaDGeZgJloqKCmvUqFFWeXm5ha5uKC4IHoLoxIkTVnd3t4WHYGFQODSfrgkAnj/CqH4oCX2JAATtswqV/zIJASrzUrNUVlZa48ePt0aMGJG4agLq66+/tr766isbTIkLVFwAAAQWzl8Pe4he61ghNoDQ47oNlb8Wq1bFmQicqqoqC1MeLDZVaQRqJsxvssGE5dRpVJFWmUfKysrmx524HwtAufk8mwEg7adkEDQTJ04UaqJkPCHaTF9++aV1+PBhY+wlaKJPMCntKsSR2+PIAOJMwq6urk0Aj9aTwWjjTJ8+3Ro5cqQMXEQu49tvv7Xa2tpsWyly5gwyADzPwh56OGrVkQEEu+dFVHJ31IpUpqfWmTx5st2TUlmvV1142WwgUTPpHGgPgb6fAETrotAZCUAAD7dJ2QjtEylfFIKSpKWtQ60zdqxeHgWs47f27Nlj0RWgc8CzbcWzvTTKFBBhRyIKZj/3BV3BM2TIEKuurk478BAwWD1h05ZVcyoKWjzbaQDRo6LpmU5Yk2CZ8e/gbV4RpXBVadGLsGb
OnGkx1jlAfta+ffuso0ePaksmAHQaPrEGTP/gJlmhQUgDccVobtFfaIGqE9BzPGvWLO3BQ7mwiZ0xY4Y1evRo1WISrg9aaCico8+LZhACEAzAlSiwQrRQVeno06mpqbGHH1TVmbQevOHWJZdcIsWRmZQWv/wA0TVocW71u+++HgogOAy50YGWa9X5NtO+MC04moixrgEtzkr4+0J9IKEcADxP6Mgku+o6NwVhMqOfatq0aWHJsrw/ES3PXWEEBAKI+/OggMVhhai+zx4X/TymB47L8dA1QAs9GDbtIxBAKOAxHZnTxUkoQzbkhXaRpmEiZh3cEUSbL4C4rRwyarczGLvqOr+1QcL2usde5EUXXeR1S4trMGEexuE718UXQGj/IjmUVHFL20fjNzaWGHR+IQCe6Zx54ceYJ4AwZMHdUG/wy5TldZ2FHVcu9FDTrtM4PAIgeWLFUzUBPNxKV7uGmbMHRWYOJnkQnG1ITzG+p2FxXg9nHNLfRHcBe33jxo2TPqeIGnXMmDFWZ2dnEtLTzFsLk+ZqVPDv4kr6ACi3ifey4oQ6/OfU07QCZxYeOHDA4sBncUBnwgYTp7Du37/fnpjGOUYy/TgEp8YAsiADbureB0B91BJ3gEdCLTfxTsNpSA3zxRdf2IcXeLzAdPDgQWvnzp2eYCtOL/o/Dd5E6xZJBy15k5djsQ+AkJBI0zLIFjInfe3atcue1xyVYU5hbW5utpu6qHm90rM3ltZ0W6/6ol6DSVOOl+3G4nwFAMrt5LC0OJEO/9lcyBxtJ3gIgCRzdDhJDI62RGW4ZUvvtM4BIOqjXAoABBW+DIm0/HCJbOMZU3OlLMkhiFpbW6U8d90BhNaJg6yT3MwWAAg3+MkkLYNMg5XG6jffyNt3icY1e21Jg+ZdeS4SIF4KRunzAPr88885XYPf29IysKsrI0AI9qoJGWW5yzh2LPlSc51tIIdXyK9gdCIPIDRfP0SiPt16J2PWsSwNxO562Lot1kWHJcepROdX03+UNJgAIPC4GI7mvNczDxi05UuSCiDN/PTFyAhc/BcU2BviJDV3j49NHpfoBAURF0BQft4zAUDQQOWgk+Okn5DmvAbCudYACtMaZEYkBNkqbCarq6sLwMMyuRRaZg/Qj05ZWtavfFnXAaI8VmwAsfsO4c2VVUEa5dDhR/slaQjSZFz+7LdmPmxFhYyHL4O/pPIRyQ8ZFgIIPpGrQLwNJpECskrDLnPS4AcENlkcnvALQcBjHt274H58xbx+OVcoM68DGhrQ2ockTj+HOXxPos+ALJsnTnT36+kRPEFNH8uWMU5nigYCu/xQziLy7RjRdfyje+DwgV8TI0o7Nc3s2bPtgUunPE7oCnJU0lEYZoNxND1pMAhAfNlmg9+PbQCB8NqkzKvIz+EHGYE9LZE51QQNlySHOR05zcTda4tLo4wmOm7dUfM5mCnhpGmgaXrUArJIT42hMoiAh83epEkF3v3YJJoEIPBtK50SvF0zgSbHForNvIqMsjSQCK2cVBameVgOe25+hrlIPe407GkaFGwAlQL1RjRfFKxKAIU5HKl52AxefPHF0p65SQCC0plBj3Sp05ZJk0KKBVHFU8hBBq+M6tnr4iaafoHgobeato/MEGaoy6xLQlm0n6vZdNVIKExZETKGDMKIZZc9yO/DlSGywUOaZLgpwniTeR8vUg0BlLz/KZOqkLJUNGNBtg+1j8xmy82uYRqIXfmxBJB2u264hVp8rgJAQVqOe0unMW+H4AnSesVy0OE/6C0vgQ1kFICCHq4soQZpIAIojaDaRSGDB2igihL+yChMVRkqduQIakpkddmL5cWpHGl3DorrTPqfyocaKL3FVkkpLMpP2yMt+8OpCvIInCstw+Ps1OWOWS57drSxTAlGaSC+obI8vkEPKEj7cMpGmlqC2k3njRY85FZBP5ARTRibLhlzbjyE0OeS30NMEzwOEVw6rfMKVYfOXFxOZxD95/agatFNrf4mHYU
XZYYDrVnuHEY+2YyxKdU9gMZeGtH+LleNOEij66wRe3lSCB5TeCV2aEQbAaC8hC+AE4LIhADsnCglikxQl0HGbRrC5jovfnWHjktOV+V0VxUuBPKimte48jNKA6l0tBE8nAvEMTEO4DLmfxmLB8MeFgeLTfFI2wDiTxhTOtxX4YF2+KTm8Qp+173Sxr2mks+4NDr5aP5wLMwIAHF4QVVT6zfe5nfdEaiMOGzyvow6ZJVhayAUpu2+am5GqdaD5ui40yY991ui43c9aX3u/NyowZSAF/owu/FCX2XRgSl+3FZF8Fsf5nddFk1svlS9JDJohmN1dwnebGMAxHnKKgTM3hbHpTi0wCEUxvyfZi+MzXPY+nsZD11WGVA8PQ0NDftLMTzQbIrVT+bZG+JHVmSswwoSJsGSJmDcdbOnx++Iydjhw11umucA/G6A6HzJ/Pnz21BR361J06w9QdkEOzfF3L17t9RNLhOQFDsrtQ6/7rxjx45Y+zTGrlhCRsf0oQ10HkeLhDKVFsGmDJtiGQ2ijo4Oq7293fY1KRWenMps04fdeHaPjbGD3LxTG6nwzbjrlHVOxyi1j6khr4FyDOwwlREVRnUasjGpu+7FP2zn7bxuayDEG7wSmXCNrn/TlsNQrkHzrnWXO7RPJz7Ku5N0OgDaRJ50J9yPPpO8tw4PJtLs0I54A21n/rcBhA/Nn8H5x7xgYjCp+0v5UmuaNOblgYn1zjVHA3G6aP6ic9OU2DQ7yDR6i3HgxkoeQFBJxgKIb7PK6R7FAo36n1sNGxw64DvMu33yAJo3b14jmEq+W3ZGkunq6sqo5mjV0vVgMoCgaAo6XHkA4cZZHO9HE4c+qTnQSs+u7oFAN2noyEOea93X8gDiRTyANe6bJp2zK6+7FiLA+a0xg8Nx7Ezyrpv+AgAtXLhwHbSQse5ReqV1frsJcJNsNTdQcudvYZfbgnHTAgABPFwj9oZHRiMu8eHo+oZzxJ1jXyYH9L76tFAFACJzULOrTWaSANLRy8tBU/p/TA1QLvvQ+9pYTH8fAMGpyN5YU3FCU/7TzuDcGsa6BI57he25qAutfnRAnq8BRH2E2gdALAAJjdZC9AvpZFCb3nQRE5i+2qf54nVPAGHy+CrcM2K1BpnwCpz+qkNgc2r4sAUVynvwE+72kqcngObMmdOFTC96ZTDlmi7jY6YPW+Se9zN+z90TQEyMXSpWIirosvkVouN19np06NKbskzZ7xlCkayHe+e/fvd9AVRfX38ImV/2y2jCdYIo66ADDUlkAOP56aD8vgBiJmihZxFxqoeRAX6LzOnGS5g5DXEJAO2foldeMPZVXFaghKGF2pFhdXEmU/5zTVfWwWQAQfv42j6OXAMBlEv0JGLjZivqAB7KTxc6cs9SOALwN0L7FAycemUOBRAKaUPGp7wy63xNxX6GIvxzyzwDQy+Af48I3aEAyhWyAojcJVKgLmnS2o43Kn8mAgjP+jn4fYRGI4QABC10Bu3hvVGFl2V6VZtyhvGoCx1hdLrud8CR/ITrf+CpEIBYQs4afz2wNI1u6vLg2JSq2BZGlujRc70fjuQTouUJA4gFoll4AJH2G9jQcE3rmxaignWnU7VJg7vOOOdoutYtWLDg7Sh5IwEIyDwIhN4XpYIs0nKjcB18QA7v3DzcgNAFM+XOqHRGAhALB0LXAKmvRK1IZfrx48errC60Lhr0OmlEL4LxTG/P9bi9bvteiwwgloR5sdRCQla6b80p3eDXBMvKylIqPX6xU6ZMiZ85/ZwrMN71zzjVxAIQ58UCsTfj6IlTaVp5aKyKfA8+rfqDyqVRT3BrGLis/ZG4dMUCECsDYnfp1LUHmO2dy3SyfYofCrVQeXl58eXM/kNmRyGvZXTTxCUiNoBYISp+FREnn2Ua2OuaNWuWpUvX3U8YBHl1dbUWdIKWc6BzOWzaVj96Ra4nHiqGFirdsmXLP1DZUpEKZaehn2XmzJlaPBRR3ji53tkJXzSP7HQ
A0N1oRf6StNxEGoiVgxAuNbgF8adJiYmanxqnrq7OKPCQR4KeGjNDm+hxGeAhL4k1EAthaGpqqsRO7h9BI136/yvp/QKsVlVVlf0BFJ6bHLhOntv7qpq5CHm9APBIG5aSKv1t27ZNgSA+wQOdmtZDHTVqlIVeoFHDA2Gy4KzFAwcOqNgz8W2AZxlARPtHSpAKIFIEe2g2ov9AE0nts3Kzb3bRdXfIJXkq/BYHgZTSipJ/YUjletiLp5LQWJxXOoBYAUBUAwB9gNMZxRVG/U+Nw+aqPwOnWCZcBsQVtgQS5Fh8O87/twCeX8gGDwlJBUAsGDZRFd6o9yGAufwfJdCXw/EjDknoMq8nCv2y0nLHEW4FzK1r4k7Op82DrvpvZDZbbv5SAxAr2bx582gQ/g5AdJW7Ur9z9qo4EFpZWWnsVFA/3pJc5/IkaiMCKeJ6t8fhq3sySd1heVMFECtvaWkpw9rwNwCiG4OIYRNFJ5upc4iDeJN5j702fuohKOS0zT2yuuqBdQXdlHUP4Bnc2Nj4B5T3IM59QcuvFfOT26bMn5ElH5FyIDfbLuIeSDz3CwAP13QvB3je80sj87rvw5RZiVMWmjR6q1/FEThBhvYPx410mRjv0J9VzPX1ra2tItvWbOLYVtLhiSh8KgUQCcOXaabCOPw73qIrgggleCZNmmTbRHirgpL223sc8qDG6ezsDNQ6OQGsQPxIkoHROILM5MkAPKVo0rho7SGcB9LAKRrURhdSs0ajmb0vduUFel/cCON2NFmx5vPEAY07T+DDcydM4xz+oh+j3JcAomlh5dPI5icn+7M/CHKw9zWiM1Hk+x8AzjrkuRNapy1MfmndzxRAZAp20QgI4lGc0sAOXYXHrj4di/xiYX9p2qhl2ExR6wiOiXXA1rkftk6kCfBpgChzADlMYRytFm3+8wDRNc61oJjTVjmaTZ+RqcY2hy4IHPp3BLei6cVL8xzXbUVZehMkx6T3tAGQwwhso1sBohU4JjnXgmJqIdpHBBJjerF1DtQwdApyz8Qom4GCz41cbiy6YlSVDLQDEBmHNhoJtX4X3soH8XeiqDDohGTTxvEzHrpoJm4/zI026QSM+qE5AOdTvEzPwM5ZKyoHlem0BJAjAHqxsUXcHRDgwzimO9dFY47gE0ich0zbSRWgaADze2Dc3o6AibO5OICzHjw/DeBsEOU3i3RaA8gRCATJabO34T9XD9Q616PGtJsIJB4858YHjOMACzTZBi/BwoMj6GySeCTZDxrAoQf5GXTLfbeVi8p3mumNAJAjADy0kq1bt16Npm05BH0T/ktZ4kC7iWBiE8hz98G6aeA6B3tMBIhIN9uhOywGL/vAy2sA8hq/3VDDysjqvlEAcguJdhIe5I0QPMF0DcHlvm/AOfcYeAtgXcMd4MGD/wCXxswYCyC3TNFzY4+NvbdrES9GLEUzueuQdN4BoNCmWQsXxLvFHy6RVIfSYvoFgNwSg2NyCJqiywCiJWh2luDe5TiGudOoOgdYOlEXAbMemma9+0t/qmhIu55+B6Bige3du3cYfC6L8DBnA1S1iGmE1+J8BuLS4vRx/qPMHpS3G3Ez8jczBmC289PYODeyaRKVQ78HkJ8gqKlwrxoPuAbHWGircsQVAEIFY9zjUY7//JJjN+Juxq5zbsJ0GNquuaGhYT+u92uggFfP8D85spfwanUgIQAAAABJRU5ErkJggg=="
;
public
static
final
String
REGULAREXPRESSION_SCRIPT
=
"^\\((.+?)\\)$"
;
public
static
final
String
DEFAULT_PASSWORD
=
"(person.getMobile())"
;
public
static
final
String
DEFAULT_PASSWORD
=
"(
return
person.getMobile())"
;
public
static
final
Integer
DEFAULT_PASSWORDPERIOD
=
0
;
public
Person
()
{
...
...
o2server/x_base_core_project/src/main/java/com/x/base/core/project/config/Query.java
浏览文件 @
29ff9339
package
com.x.base.core.project.config
;
import
java.io.File
;
import
java.util.ArrayList
;
import
java.util.List
;
import
org.apache.commons.io.FileUtils
;
import
org.apache.commons.lang3.BooleanUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.tika.utils.SystemUtils
;
import
org.quartz.CronExpression
;
import
com.x.base.core.project.annotation.FieldDescribe
;
...
...
@@ -82,7 +81,7 @@ public class Query extends ConfigObject {
}
/**
 * Reports whether text extraction from images is enabled.
 *
 * The configured {@code extractImage} flag is only honoured when
 * {@code SystemUtils.IS_OS_WINDOWS} is true; otherwise this returns false
 * regardless of the configuration. A null flag is treated as false.
 *
 * @return true only when running on Windows and {@code extractImage} is true
 */
public Boolean getExtractImage() {
	// The earlier unconditional "return BooleanUtils.isTrue(extractImage);"
	// made this statement unreachable; only the gated form is kept.
	return SystemUtils.IS_OS_WINDOWS && BooleanUtils.isTrue(extractImage);
}
public
String
getTessLanguage
()
{
...
...
o2server/x_base_core_project/src/main/java/com/x/base/core/project/tools/ExtractTextTools.java
0 → 100644
浏览文件 @
29ff9339
package
com.x.base.core.project.tools
;
import
java.awt.image.BufferedImage
;
import
java.io.ByteArrayInputStream
;
import
java.io.File
;
import
java.util.List
;
import
javax.imageio.ImageIO
;
import
org.apache.commons.collections4.list.UnmodifiableList
;
import
org.apache.commons.io.FileUtils
;
import
org.apache.commons.lang3.StringUtils
;
import
org.apache.pdfbox.cos.COSDocument
;
import
org.apache.pdfbox.io.RandomAccessBuffer
;
import
org.apache.pdfbox.pdfparser.PDFParser
;
import
org.apache.pdfbox.pdmodel.PDDocument
;
import
org.apache.pdfbox.text.PDFTextStripper
;
import
org.apache.tika.Tika
;
import
org.junit.Test
;
import
com.x.base.core.project.config.Config
;
import
com.x.base.core.project.logger.Logger
;
import
com.x.base.core.project.logger.LoggerFactory
;
import
com.x.base.core.project.tools.DefaultCharset
;
import
com.x.base.core.project.tools.ListTools
;
import
net.sourceforge.tess4j.Tesseract
;
/**
 * Static helpers that turn the raw bytes of a file into plain text.
 *
 * Word and Excel content is parsed with Tika, PDF content with PDFBox, text
 * files are decoded with {@code DefaultCharset.charset}, and images are passed
 * to Tesseract for OCR. The Tika and Tesseract instances are created lazily on
 * first use and then reused.
 */
public class ExtractTextTools {

	private static final Logger logger = LoggerFactory.getLogger(ExtractTextTools.class);

	// Both fields are read outside the lock in the lazy accessors below, so they
	// must be volatile for the double-checked locking to publish them safely.
	private static volatile Tesseract tesseract = null;

	private static volatile Tika tika = null;

	/** Upper bound, in bytes (32 MiB), accepted by {@link #available(byte[])}. */
	public static final Integer MAXLENGTH = 1024 * 1024 * 32;

	/**
	 * Lower-case file extensions, each with its leading dot, accepted by
	 * {@link #support(String)}.
	 */
	public static final List<String> SUPPORT_TYPES = UnmodifiableList.unmodifiableList(ListTools.toList(".doc",
			".docx", ".pdf", ".xls", ".xlsx", ".txt", ".bmp", ".jpg", ".png", ".gif", ".jpeg", ".jpe"));

	/**
	 * Lower-case image file extensions, each with its leading dot, accepted by
	 * {@link #supportImage(String)}.
	 */
	public static final List<String> SUPPORT_IMAGE_TYPES = UnmodifiableList
			.unmodifiableList(ListTools.toList(".bmp", ".jpg", ".png", ".gif", ".jpeg", ".jpe"));

	/**
	 * Tells whether the extension of a file name is one of {@link #SUPPORT_TYPES}.
	 *
	 * @param name file name; the text after the last '.' is used as the extension
	 * @return true when the lower-cased extension is listed, false when it is not
	 *         or when the name has no extension
	 */
	public static boolean support(String name) {
		return hasExtensionIn(name, SUPPORT_TYPES);
	}

	/**
	 * Tells whether the extension of a file name is one of
	 * {@link #SUPPORT_IMAGE_TYPES}.
	 *
	 * @param name file name; the text after the last '.' is used as the extension
	 * @return true when the lower-cased extension is listed, false when it is not
	 *         or when the name has no extension
	 */
	public static boolean supportImage(String name) {
		return hasExtensionIn(name, SUPPORT_IMAGE_TYPES);
	}

	/** Checks the lower-cased, dot-prefixed extension of name against types. */
	private static boolean hasExtensionIn(String name, List<String> types) {
		String ext = StringUtils.substringAfterLast(name, ".");
		if (StringUtils.isNotEmpty(ext)) {
			// The lists store extensions with the leading dot, so add it back.
			return types.contains("." + StringUtils.lowerCase(ext));
		}
		return false;
	}

	/**
	 * Tells whether the content is present and no larger than {@link #MAXLENGTH}.
	 *
	 * @param bytes file content, may be null
	 * @return false for null, empty or oversized content, true otherwise
	 */
	public static boolean available(byte[] bytes) {
		return (null != bytes) && (bytes.length > 0) && (bytes.length <= MAXLENGTH);
	}

	/**
	 * Extracts text from file content, choosing the extractor from the file name
	 * suffix (compared case-insensitively) and the enabled categories.
	 *
	 * Content that is null, empty, or 10 MiB or larger is not processed.
	 * NOTE(review): this limit is smaller than {@link #MAXLENGTH}; confirm whether
	 * the difference is intended.
	 *
	 * @param bytes  file content
	 * @param name   file name used to pick the extractor
	 * @param office enables .doc/.docx/.xls/.xlsx; null is treated as false
	 * @param pdf    enables .pdf; null is treated as false
	 * @param txt    enables .txt; null is treated as false
	 * @param image  enables .jpg/.png/.gif/.bmp/.jpeg/.jpe; null is treated as false
	 * @return the extracted text, or null when the content is not processed, no
	 *         enabled category matches the name, or the extractor failed
	 */
	public static String extract(byte[] bytes, String name, Boolean office, Boolean pdf, Boolean txt,
			Boolean image) {
		if ((null == bytes) || (bytes.length == 0) || (bytes.length >= 1024 * 1024 * 10)) {
			return null;
		}
		// Boolean.TRUE.equals(...) avoids a NullPointerException from unboxing a
		// null flag.
		if (Boolean.TRUE.equals(office)) {
			if (StringUtils.endsWithIgnoreCase(name, ".doc") || StringUtils.endsWithIgnoreCase(name, ".docx")) {
				return word(bytes);
			}
			if (StringUtils.endsWithIgnoreCase(name, ".xls") || StringUtils.endsWithIgnoreCase(name, ".xlsx")) {
				return excel(bytes);
			}
		}
		if (Boolean.TRUE.equals(pdf) && StringUtils.endsWithIgnoreCase(name, ".pdf")) {
			return pdf(bytes);
		}
		if (Boolean.TRUE.equals(txt) && StringUtils.endsWithIgnoreCase(name, ".txt")) {
			return text(bytes);
		}
		if (Boolean.TRUE.equals(image)) {
			if (StringUtils.endsWithIgnoreCase(name, ".jpg") || StringUtils.endsWithIgnoreCase(name, ".png")
					|| StringUtils.endsWithIgnoreCase(name, ".gif") || StringUtils.endsWithIgnoreCase(name, ".bmp")
					|| StringUtils.endsWithIgnoreCase(name, ".jpeg")
					|| StringUtils.endsWithIgnoreCase(name, ".jpe")) {
				return image(bytes);
			}
		}
		return null;
	}

	/**
	 * Extracts the text of every page of a PDF document.
	 *
	 * @param bytes PDF file content
	 * @return the document text, or null when parsing fails (the exception is
	 *         logged)
	 */
	public static String pdf(byte[] bytes) {
		try {
			PDFParser parser = new PDFParser(new RandomAccessBuffer(bytes));
			parser.parse();
			try (COSDocument cos = parser.getDocument(); PDDocument pd = new PDDocument(cos)) {
				PDFTextStripper stripper = new PDFTextStripper();
				stripper.setStartPage(1);
				stripper.setEndPage(pd.getNumberOfPages());
				return stripper.getText(pd);
			}
		} catch (Exception e) {
			logger.error(e);
		}
		return null;
	}

	/**
	 * Extracts the text of a Word document with Tika.
	 *
	 * @param bytes document content
	 * @return the extracted text, or null when parsing fails (the exception is
	 *         logged)
	 */
	public static String word(byte[] bytes) {
		return parseWithTika(bytes);
	}

	/**
	 * Extracts the text of an Excel workbook with Tika.
	 *
	 * @param bytes workbook content
	 * @return the extracted text, or null when parsing fails (the exception is
	 *         logged)
	 */
	public static String excel(byte[] bytes) {
		return parseWithTika(bytes);
	}

	/** Runs the shared Tika instance over the content; logs and returns null on failure. */
	private static String parseWithTika(byte[] bytes) {
		try (ByteArrayInputStream in = new ByteArrayInputStream(bytes)) {
			return tikaInstance().parseToString(in);
		} catch (Exception e) {
			logger.error(e);
		}
		return null;
	}

	/**
	 * Decodes the content as text using {@code DefaultCharset.charset}.
	 *
	 * @param bytes text file content
	 * @return the decoded string
	 */
	public static String text(byte[] bytes) {
		return new String(bytes, DefaultCharset.charset);
	}

	/**
	 * Runs OCR over an image with Tesseract.
	 *
	 * @param bytes image file content
	 * @return the recognised text, or null when reading or recognition fails (the
	 *         exception is logged)
	 */
	public static String image(byte[] bytes) {
		try (ByteArrayInputStream in = new ByteArrayInputStream(bytes)) {
			BufferedImage decoded = ImageIO.read(in);
			return tesseractInstance().doOCR(decoded);
		} catch (Exception e) {
			logger.error(e);
		}
		return null;
	}

	/**
	 * Returns the shared Tesseract instance, creating and configuring it on first
	 * use.
	 */
	private static Tesseract tesseractInstance() throws Exception {
		Tesseract instance = tesseract;
		if (null == instance) {
			synchronized (ExtractTextTools.class) {
				instance = tesseract;
				if (null == instance) {
					// Configure a local instance first and publish it last, so that no
					// thread can observe an engine without data path and language, and
					// a failure in Config does not leave a half-configured instance
					// cached.
					instance = new Tesseract();
					// Location of the trained data files.
					instance.setDatapath(Config.dir_commons_tess4j_tessdata().getAbsolutePath());
					// Recognition language taken from the query configuration
					// (the original note here reads "Chinese recognition").
					instance.setLanguage(Config.query().getTessLanguage());
					tesseract = instance;
				}
			}
		}
		return instance;
	}

	/** Returns the shared Tika instance, creating it on first use. */
	private static Tika tikaInstance() throws Exception {
		Tika instance = tika;
		if (null == instance) {
			synchronized (ExtractTextTools.class) {
				instance = tika;
				if (null == instance) {
					instance = new Tika();
					tika = instance;
				}
			}
		}
		return instance;
	}

	/**
	 * Ad-hoc manual check that prints the text Tika extracts from a local file.
	 *
	 * NOTE(review): this is a JUnit test living in a production class and it reads
	 * a hard-coded local path; consider moving it to the test sources.
	 */
	@Test
	public void test1() throws Exception {
		System.out.println(word(FileUtils.readFileToByteArray(new File("d:/1.html"))));
	}

}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录