怎樣提取pdf表格的内容?pdf文件内容格式比較特殊,目前根據實際觀察發現pdf内容只是把源目标文件,一行一行讀到pdf中,通過定位方式實現同版面展示,所以表格在pdf中表現形式比較特殊,接下來我們就來聊聊關于怎樣提取pdf表格的内容?以下内容大家不妨參考一二希望能幫到您!
pdf文件内容格式比較特殊,目前根據實際觀察發現pdf内容只是把源目标文件,一行一行讀到pdf中,通過定位方式實現同版面展示,所以表格在pdf中表現形式比較特殊。
實現思路:
通過pdf内容識别,找到表格所屬的頁(只是提高一些速度,減少其他内容),然後将表格所在的頁面截取到新的pdf文件中,然後将新生成的pdf轉換為html文件,通過算法重新組裝表格,此方法可識别空白列以及一個表格中存在多行數據的情況。
用的技術框架:
jsoup,itextpdf,pdfbox
/**
 * Reads the table contained in a PDF file and returns it as a list of rows,
 * each row being a list of cell texts.
 *
 * <p>Steps: locate the table pages with {@code readPdf}, copy them into a
 * temporary PDF with {@code partitionPdfFile}, render that PDF to a temporary
 * HTML file with {@code PDFDomTree}, then rebuild the rows with
 * {@code ParseHtml}. Both temporary files are deleted before returning.
 *
 * @param pdfPath path of the source PDF file
 * @return the extracted rows; an empty list when the conversion or parsing fails
 */
public static List<List<String>> getDataFromPdf(String pdfPath){
    List<List<String>> datas = new ArrayList<>();
    // Derive the temp-file names from the path without its ".pdf" suffix. A plain
    // String.replace would leave the path unchanged when the suffix is absent, and the
    // cleanup below would then delete the source file itself.
    String basePath = pdfPath.endsWith(".pdf")
            ? pdfPath.substring(0, pdfPath.length() - ".pdf".length())
            : pdfPath;
    String newPdfPath = basePath + "_01.pdf";
    String htmlPath = basePath + "_01.html";
    // Find the page range that holds the table attachment.
    int[] pageNums = readPdf(pdfPath);
    // Copy only those pages into the temporary PDF.
    partitionPdfFile(pdfPath, newPdfPath, pageNums[0], pageNums[1]);
    byte[] bytes = getBytes(newPdfPath);
    try {
        // getBytes returns null when the temporary PDF could not be read.
        if (bytes != null) {
            try (PDDocument document = PDDocument.load(bytes);
                 BufferedWriter out = new BufferedWriter(
                         new OutputStreamWriter(new FileOutputStream(new File(htmlPath)), "UTF-8"))) {
                PDFDomTree pdfDomTree = new PDFDomTree();
                pdfDomTree.writeText(document, out);
            }
            // Parse only after the writer has been closed, so the HTML file is fully flushed.
            datas = ParseHtml(htmlPath);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Remove the temporary files.
        File tempPdf = new File(newPdfPath);
        if (tempPdf.exists()) {
            tempPdf.delete();
        }
        File tempHtml = new File(htmlPath);
        if (tempHtml.exists()) {
            tempHtml.delete();
        }
    }
    return datas;
}
/***
* 讀取pdf 确定内容所在頁
* @param pdfPath
*/
private static int[] readPdf(String pdfPath){
int[] pageNums=new int[2];
try {
PdfReader reader = new PdfReader(pdfPath);
int pageNum = reader.getNumberOfPages();
boolean isGo=false;
for(int i=1;i<=pageNum;i ){
String pageContent = PdfTextExtractor.getTextFromPage(reader, i);//讀取第i頁的文檔内容
if((pageContent.trim().length()>0&&pageContent.startsWith("附件"))){
pageNums[0]=i;
isGo=true;
}
if(isGo&&pageContent.trim().length()<50){
pageNums[1]=i-1;
//break;
}
}
} catch (Exception e) {
e.printStackTrace();
}finally{
}
return pageNums;
}
/**
* pdf 轉換為html
* @param html
* @return
* @throws IOException
*/
private static List<List<String>> ParseHtml(String html) throws IOException {
org.jsoup.nodes.Document document = Jsoup.parse(new File(html), "utf-8");
Elements postItems = document.select("div.page");
//循環處理每頁
List<List<String>> datas=new ArrayList<>();
for (int i=0;i<postItems.size()-1;i ) {
//border-bottom
Elements table_row= postItems.get(i).select("[style*=border-bottom:]");
if(table_row.size()==0)
continue;
//輸出表格第一行
String css=table_row.first().attr("style");
String width=(process(css,"width"));
//獲取除标題部分内容區域
table_row=postItems.get(i).select(String.format("[style*=border-bottom:][style*=width:%s]",width));
Elements table_col= postItems.get(i).select("[style*=border-right:]");
for (int iw=(i==0?1:0);iw<table_row.size()-1;iw ) {
datas.add(getRow(postItems.get(i), table_row, table_col, iw));
}
}
return datas;
}
/**
* 讀取pdf指定頁内容
* @param pdfFile
* @param newFile
* @param from
* @param end
*/
private static void partitionPdfFile(String pdfFile,String newFile, int from, int end) {
Document document = null;
Pdfcopy copy = null;
try {
PdfReader reader = new PdfReader(pdfFile);
int n = reader.getNumberOfPages();
if (end == 0) {
end = n;
}
document = new Document(reader.getPageSize(1));
copy = new PdfCopy(document, new FileOutputStream(newFile));
document.open();
for (int j = from; j <= end; j ) {
document.newPage();
PdfImportedPage page = copy.getImportedPage(reader, j);
copy.addPage(page);
}
document.close();
} catch (Exception e) {
e.printStackTrace();
}
}
/**
 * Reads a whole file into a byte array.
 *
 * @param filePath path of the file to read
 * @return the file content, or {@code null} when the file does not exist or cannot be
 *         read (the exception is printed to standard error)
 */
private static byte[] getBytes(String filePath){
    byte[] buffer = null;
    // try-with-resources closes the input stream even when read() fails part-way.
    try (FileInputStream fis = new FileInputStream(new File(filePath));
         ByteArrayOutputStream bos = new ByteArrayOutputStream(1000)) {
        byte[] chunk = new byte[1000];
        int n;
        while ((n = fis.read(chunk)) != -1) {
            bos.write(chunk, 0, n);
        }
        buffer = bos.toByteArray();
    } catch (IOException e) {
        // FileNotFoundException is an IOException, so a missing file ends up here too.
        e.printStackTrace();
    }
    return buffer;
}
下面是html的解析方式,通過邊框定位,找到每一行每一列所處的位置以及在該位置所屬的元素。
/**
* 從第二行開始(去除标題行)
* @param postItem
* @param table_col
* @param index
* @return
*/
private static List<String> getRow(Element postItem,Elements postItems,Elements table_col,int index) {
String top = (process(postItems.get(index).attr("style"), "top"));
String bottom = (process(postItems.get(index 1).attr("style"), "top"));
Elements tables = postItem.select("[style*=top:]");
List<String> data = new ArrayList<>();
double dbottom = Double.parseDouble(bottom);
double dtop = Double.parseDouble(top);
boolean isGo = false;
for (int iiy = 0; iiy < table_col.size() - 1; iiy ) {
StringBuilder sbs = new StringBuilder();
for (Element spostItem : tables) {
String top2 = (process(spostItem.attr("style"), "top"));
double top2s = Double.parseDouble(top2);
if (top2s > dtop && top2s < dbottom) {
String left2 = (process(spostItem.attr("style"), "left"));
double[] cols = getRowCol(table_col, iiy);
double left2s = Double.parseDouble(left2);
if (left2s > cols[0] && left2s < cols[1]) {
sbs.append(spostItem.text());
}
}
}
if(sbs.length()==0) {
data.add("-");
}else{
data.add(sbs.toString());
}
}
return data;
}
/**
* 定位列的位置
* @param table_col
* @param index
* @return
*/
private static double[] getRowCol(Elements table_col,int index){
StringBuilder sbd=new StringBuilder();
String left=(process(table_col.get(index).attr("style"),"left"));
String right=(process(table_col.get(index 1).attr("style"),"left"));
return new double[]{Double.parseDouble(left),Double.parseDouble(right)};
}
/**
* 讀取html中樣式的指定屬性
* @param style
* @param extract
* @return
*/
private static String process(String style,String extract) {
if (style.contains(extract)) {
style = style.substring(style.indexOf(extract ":"));
style = style.substring(0, style.indexOf(";"));
String attr = style.substring(style.indexOf(":") 1);
return (attr.substring(0,attr.length()-2));
}
return null;
}
pom配置
<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>itextpdf</artifactId>
<version>5.5.13</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.12.1</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.5</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>fontbox</artifactId>
<version>2.0.0</version>
</dependency>
<dependency>
<groupId>com.itextpdf.tool</groupId>
<artifactId>xmlworker</artifactId>
<version>5.5.11</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>ooxml-schemas</artifactId>
<version>1.1</version>
</dependency>
更多精彩资讯请关注tft每日頭條,我们将持续为您更新最新资讯!