/**
 * Extracts text from a PDF document, producing one {@link Page} per PDF page.
 *
 * <p>For each page a fresh {@code Page} is created, stored in {@code currentPage}
 * and appended to the result list before the page's content stream is run through
 * a {@code PdfContentStreamProcessor} backed by an {@code InternalListener}.
 *
 * @param src the original PDF document
 * @return the list of pages, in document order
 * @throws java.io.IOException if the document cannot be read or its content cannot be processed
 */
public List<Page> extractText(InputStream src) throws IOException {
    List<Page> pages = Lists.newArrayList();
    PdfReader reader = new PdfReader(src);
    try {
        RenderListener listener = new InternalListener();
        PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener);

        // PDF page numbers are 1-based.
        int pageCount = reader.getNumberOfPages();
        for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
            // currentPage must be set before processing the content stream.
            // NOTE(review): presumably InternalListener writes into currentPage - confirm.
            currentPage = new Page();
            pages.add(currentPage);

            PdfDictionary pageDic = reader.getPageN(pageNumber);
            PdfDictionary resourcesDic = pageDic.getAsDict(PdfName.RESOURCES);
            processor.processContent(
                    ContentByteUtils.getContentBytesForPage(reader, pageNumber), resourcesDic);
        }
    } finally {
        // Release the reader even when processing a page throws.
        reader.close();
    }
    return pages;
}
public static float[] getKeyWords(String filePath) { try { PdfReader pdfReader = new PdfReader(filePath); int pageNum = pdfReader.getNumberOfPages(); PdfReaderContentParser pdfReaderContentParser = new PdfReaderContentParser(pdfReader); // 下标从1开始 for (i = 1; i <= pageNum; i++) { pdfReaderContentParser.processContent(i, new RenderListener() { public void renderText(TextRenderInfo textRenderInfo) { String text = textRenderInfo.getText(); System.out.println(i + ":" +text); if (null != text && text.contains(KEY_WORD)) { Float boundingRectange = textRenderInfo.getBaseline().getBoundingRectange(); resu = new float[3]; resu[0] = boundingRectange.x; resu[1] = boundingRectange.y; resu[2] = i; } } public void renderImage(ImageRenderInfo arg0) { } public void endTextBlock() { } public void beginTextBlock() { } }); } } catch (IOException e) { e.printStackTrace(); } return resu; }