tft每日頭條

 > 科技

 > 怎樣提取pdf表格的内容

怎樣提取pdf表格的内容

科技 更新时间:2025-01-09 15:31:41

怎樣提取pdf表格的内容?pdf文件内容格式比較特殊,目前根據實際觀察發現pdf内容只是把源目標文件一行一行讀到pdf中,通過定位方式實現同版面展示,所以表格在pdf中的表現形式比較特殊,接下來我們就來聊聊關於怎樣提取pdf表格的内容。以下内容大家不妨參考一二,希望能幫到您!

怎樣提取pdf表格的内容(關于PDF文件表格提取實現)1

怎樣提取pdf表格的内容

pdf文件内容格式比較特殊,目前根據實際觀察發現pdf内容只是把源目標文件一行一行讀到pdf中,通過定位方式實現同版面展示,所以表格在pdf中的表現形式比較特殊。

實現思路:

通過pdf内容識别,找到表格所屬的頁(只是為了提高一些速度,減少其他内容的干擾),然後將表格所在的頁面截取到新的pdf文件中,再將新生成的pdf轉換為html文件,最後通過算法重新組裝表格。此方法可識别空白列以及一個表格中存在多行數據的情況。

用的技術框架:

jsoup,itextpdf,pdfbox

/** * 讀取pdf文件轉為list集合 * @param pdfPath * @return */ public static List<List<String>> getDataFromPdf(String pdfPath){ List<List<String>> datas=new ArrayList<>(); String newPdfPath=pdfPath.replace(".pdf","_01.pdf"); String htmlPath=pdfPath.replace(".pdf","_01.html"); //确認附件表格所在的頁面,返回頁碼 int[] pageNums=readPdf(pdfPath); //讀取存在表格附件的頁面 partitionPdfFile(pdfPath,newPdfPath,pageNums[0],pageNums[1]); byte[] bytes = getBytes(newPdfPath); try (BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(htmlPath)),"UTF-8"));){ //加載PDF文檔 PDDocument document = PDDocument.load(bytes); PDFDomTree pdfDomTree = new PDFDomTree(); pdfDomTree.writeText(document,out); datas=ParseHtml(htmlPath); } catch (Exception e) { e.printStackTrace(); }finally { //删除緩存文件 File pdf_01=new File(newPdfPath); if(pdf_01.exists()){ pdf_01.delete(); } File html_01=new File(htmlPath); if(html_01.exists()){ html_01.delete(); } } return datas; } /*** * 讀取pdf 确定内容所在頁 * @param pdfPath */ private static int[] readPdf(String pdfPath){ int[] pageNums=new int[2]; try { PdfReader reader = new PdfReader(pdfPath); int pageNum = reader.getNumberOfPages(); boolean isGo=false; for(int i=1;i<=pageNum;i ){ String pageContent = PdfTextExtractor.getTextFromPage(reader, i);//讀取第i頁的文檔内容 if((pageContent.trim().length()>0&&pageContent.startsWith("附件"))){ pageNums[0]=i; isGo=true; } if(isGo&&pageContent.trim().length()<50){ pageNums[1]=i-1; //break; } } } catch (Exception e) { e.printStackTrace(); }finally{ } return pageNums; }

/** * pdf 轉換為html * @param html * @return * @throws IOException */ private static List<List<String>> ParseHtml(String html) throws IOException { org.jsoup.nodes.Document document = Jsoup.parse(new File(html), "utf-8"); Elements postItems = document.select("div.page"); //循環處理每頁 List<List<String>> datas=new ArrayList<>(); for (int i=0;i<postItems.size()-1;i ) { //border-bottom Elements table_row= postItems.get(i).select("[style*=border-bottom:]"); if(table_row.size()==0) continue; //輸出表格第一行 String css=table_row.first().attr("style"); String width=(process(css,"width")); //獲取除标題部分内容區域 table_row=postItems.get(i).select(String.format("[style*=border-bottom:][style*=width:%s]",width)); Elements table_col= postItems.get(i).select("[style*=border-right:]"); for (int iw=(i==0?1:0);iw<table_row.size()-1;iw ) { datas.add(getRow(postItems.get(i), table_row, table_col, iw)); } } return datas; } /** * 讀取pdf指定頁内容 * @param pdfFile * @param newFile * @param from * @param end */ private static void partitionPdfFile(String pdfFile,String newFile, int from, int end) { Document document = null; Pdfcopy copy = null; try { PdfReader reader = new PdfReader(pdfFile); int n = reader.getNumberOfPages(); if (end == 0) { end = n; } document = new Document(reader.getPageSize(1)); copy = new PdfCopy(document, new FileOutputStream(newFile)); document.open(); for (int j = from; j <= end; j ) { document.newPage(); PdfImportedPage page = copy.getImportedPage(reader, j); copy.addPage(page); } document.close(); } catch (Exception e) { e.printStackTrace(); } } /* 将文件轉換為byte數組 */ private static byte[] getBytes(String filePath){ byte[] buffer = null; try { File file = new File(filePath); FileInputStream fis = new FileInputStream(file); ByteArrayOutputStream bos = new ByteArrayOutputStream(1000); byte[] b = new byte[1000]; int n; while ((n = fis.read(b)) != -1) { bos.write(b, 0, n); } fis.close(); bos.close(); buffer = bos.toByteArray(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch 
(IOException e) { e.printStackTrace(); } return buffer; }

下面是html的解析方式,通過邊框定位,找到每一行每一列所處的位置以及在該位置所屬的元素。

/** * 從第二行開始(去除标題行) * @param postItem * @param table_col * @param index * @return */ private static List<String> getRow(Element postItem,Elements postItems,Elements table_col,int index) { String top = (process(postItems.get(index).attr("style"), "top")); String bottom = (process(postItems.get(index 1).attr("style"), "top")); Elements tables = postItem.select("[style*=top:]"); List<String> data = new ArrayList<>(); double dbottom = Double.parseDouble(bottom); double dtop = Double.parseDouble(top); boolean isGo = false; for (int iiy = 0; iiy < table_col.size() - 1; iiy ) { StringBuilder sbs = new StringBuilder(); for (Element spostItem : tables) { String top2 = (process(spostItem.attr("style"), "top")); double top2s = Double.parseDouble(top2); if (top2s > dtop && top2s < dbottom) { String left2 = (process(spostItem.attr("style"), "left")); double[] cols = getRowCol(table_col, iiy); double left2s = Double.parseDouble(left2); if (left2s > cols[0] && left2s < cols[1]) { sbs.append(spostItem.text()); } } } if(sbs.length()==0) { data.add("-"); }else{ data.add(sbs.toString()); } } return data; } /** * 定位列的位置 * @param table_col * @param index * @return */ private static double[] getRowCol(Elements table_col,int index){ StringBuilder sbd=new StringBuilder(); String left=(process(table_col.get(index).attr("style"),"left")); String right=(process(table_col.get(index 1).attr("style"),"left")); return new double[]{Double.parseDouble(left),Double.parseDouble(right)}; } /** * 讀取html中樣式的指定屬性 * @param style * @param extract * @return */ private static String process(String style,String extract) { if (style.contains(extract)) { style = style.substring(style.indexOf(extract ":")); style = style.substring(0, style.indexOf(";")); String attr = style.substring(style.indexOf(":") 1); return (attr.substring(0,attr.length()-2)); } return null; }

pom配置

<!-- Maven dependencies for the PDF table extraction example. -->
<!-- NOTE(review): the Java code references PDDocument and PDFDomTree, but no pdfbox or
     pdf2dom artifact is declared below; confirm they are supplied elsewhere. -->
<dependency>
    <groupId>com.itextpdf</groupId>
    <artifactId>itextpdf</artifactId>
    <version>5.5.13</version>
</dependency>
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.12.1</version>
</dependency>
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.5</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>fontbox</artifactId>
    <version>2.0.0</version>
</dependency>
<dependency>
    <groupId>com.itextpdf.tool</groupId>
    <artifactId>xmlworker</artifactId>
    <version>5.5.11</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>ooxml-schemas</artifactId>
    <version>1.1</version>
</dependency>

,

更多精彩资讯请关注tft每日頭條,我们将持续为您更新最新资讯!

查看全部

相关科技资讯推荐

热门科技资讯推荐

网友关注

Copyright 2023-2025 - www.tftnews.com All Rights Reserved