/**
 * Extracts the text of every page of a PDF and returns it as one string.
 * <p>
 * Pages are processed in order with a {@link SimpleTextExtractionStrategy};
 * the per-page results are concatenated without any separator.
 * </p>
 *
 * @param pdfFile the file name handed to {@link PdfReader}
 * @return the concatenated text of all pages
 * @throws IllegalArgumentException if opening or parsing the file fails with an {@link IOException}
 */
private String getPdfContent(String pdfFile) {
    try {
        PdfReader reader = new PdfReader(pdfFile);
        try {
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            // Local, single-threaded accumulation: StringBuilder is sufficient.
            StringBuilder sb = new StringBuilder();
            for (int i = 1; i <= reader.getNumberOfPages(); i++) {
                TextExtractionStrategy strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
                sb.append(strategy.getResultantText());
            }
            return sb.toString();
        } finally {
            // Release the reader even when extraction of a page fails.
            reader.close();
        }
    } catch (IOException e) {
        throw new IllegalArgumentException("Not able to read file " + pdfFile, e);
    }
}
/**
 * Imports one source page and places its text area below the content already
 * written to the current output page.
 * <p>
 * The text area is measured with a {@link TextMarginFinder}. If it is taller than the
 * usable height of an output page (page height minus top and bottom margin) the call
 * is rejected. If it does not fit into the space left above the bottom margin,
 * {@code newPage()} is called first; otherwise, when the current page already has
 * content, {@code gap} is added as spacing. State such as {@code yPosition},
 * {@code pageSize}, {@code gap}, the margins and {@code writer} comes from the
 * enclosing class.
 * </p>
 *
 * @param reader the source document
 * @param parser a content parser for {@code reader}
 * @param page   the 1-based number of the page to import
 * @throws IOException              if the page content cannot be parsed
 * @throws IllegalArgumentException if the text area is taller than the usable page height
 */
void merge(PdfReader reader, PdfReaderContentParser parser, int page) throws IOException {
    final TextMarginFinder margins = parser.processContent(page, new TextMarginFinder());
    final Rectangle sourcePageSize = reader.getPageSize(page);
    final float usableHeight = pageSize.getHeight() - topMargin - bottomMargin;
    float requiredHeight = margins.getHeight();

    if (requiredHeight > usableHeight) {
        throw new IllegalArgumentException(
                String.format("Page %s content too large; height: %s, limit: %s.", page, requiredHeight, usableHeight));
    }

    final boolean overflowsCurrentPage = requiredHeight > yPosition - pageSize.getBottom(bottomMargin);
    if (overflowsCurrentPage) {
        newPage();
    } else if (!writer.isPageEmpty()) {
        // Separate this block from the content placed before it.
        requiredHeight += gap;
    }

    yPosition -= requiredHeight;
    final PdfImportedPage imported = writer.getImportedPage(reader, page);
    // Shift the imported page so that the bottom of its text area lands on yPosition.
    final float verticalShift = yPosition - (margins.getLly() - sourcePageSize.getBottom());
    writer.getDirectContent().addTemplate(imported, 0, verticalShift);
}
@Test public void testCertifiedSchoolList_9_16_2015() throws IOException { try ( Writer data = new OutputStreamWriter(new FileOutputStream(new File(RESULT_FOLDER, "data.txt")), "UTF-8"); Writer nonData = new OutputStreamWriter(new FileOutputStream(new File(RESULT_FOLDER, "non-data.txt")), "UTF-8"); InputStream resource = getClass().getResourceAsStream("certified-school-list-9-16-2015.pdf") ) { CertifiedSchoolListExtractionStrategy strategy = new CertifiedSchoolListExtractionStrategy(data, nonData); PdfReader reader = new PdfReader(resource); PdfReaderContentParser parser = new PdfReaderContentParser(reader); for (int page = 1; page <= reader.getNumberOfPages(); page++) parser.processContent(page, strategy); // parser.processContent(28, strategy); strategy.close(); } }
/**
 * <a href="http://stackoverflow.com/questions/40549977/reading-legacy-word-forms-checkboxes-converted-to-pdf">
 * Reading legacy Word forms checkboxes converted to PDF
 * </a>
 * <br>
 * <a href="https://www.dropbox.com/s/4z7ky3yy2yaj53i/Doc1.pdf?dl=0">
 * Doc1.pdf
 * </a>
 * <p>
 * Demonstrates extracting the drawn "checkboxes" from the OP's sample PDF: for each
 * page, the start point of every detected box's diagonal is printed together with
 * its checked state.
 * </p>
 */
@Test
public void testExtractDoc1() throws IOException {
    try (InputStream resource = getClass().getResourceAsStream("Doc1.pdf")) {
        PdfReader pdfReader = new PdfReader(resource);
        int page = 1;
        while (page <= pdfReader.getNumberOfPages()) {
            System.out.printf("\nPage %s\n====\n", page);

            // A fresh strategy per page so boxes of different pages are not mixed.
            CheckBoxExtractionStrategy strategy = new CheckBoxExtractionStrategy();
            PdfReaderContentParser parser = new PdfReaderContentParser(pdfReader);
            parser.processContent(page, strategy);

            for (Box box : strategy.getBoxes()) {
                Vector origin = box.getDiagonal().getStartPoint();
                float x = origin.get(Vector.I1);
                float y = origin.get(Vector.I2);
                System.out.printf("at %s, %s - %s\n", x, y, box.isChecked() ? "checked" : "unchecked");
            }
            page++;
        }
    }
}
/**
 * Splits the pages {@code from}..{@code to} (inclusive) into sections using a
 * {@link DividerAndColorAwareTextExtractionStrategy}, writes each section's text to its
 * own file, and returns all section texts joined into one string.
 * <p>
 * The file for a section is named {@code String.format(format, page, index)}, where
 * {@code index} counts the sections of that page from 0. In the returned string each
 * section is preceded by a {@code "--"} line and each page is followed by two newlines.
 * </p>
 *
 * @param reader      the document to read
 * @param format      format string for the section file paths; receives page number and section index
 * @param from        first page to process (inclusive)
 * @param to          last page to process (inclusive)
 * @param headerColor color passed on to the extraction strategy
 * @return the combined text of all sections
 * @throws IOException if parsing a page or writing a section file fails
 */
String extractAndStore(PdfReader reader, String format, int from, int to, BaseColor headerColor) throws IOException {
    final StringBuilder combined = new StringBuilder();
    for (int page = from; page <= to; page++) {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        DividerAwareTextExtrationStrategy strategy = parser.processContent(page,
                new DividerAndColorAwareTextExtractionStrategy(810, 30, 20, 575, headerColor));

        int sectionIndex = 0;
        for (Section section : strategy.getSections()) {
            String text = strategy.getResultantText(section);
            String fileName = String.format(format, page, sectionIndex);
            Files.write(Paths.get(fileName), text.getBytes("UTF8"));
            combined.append("--\n");
            combined.append(text);
            combined.append('\n');
            sectionIndex++;
        }
        combined.append("\n\n");
    }
    return combined.toString();
}
/**
 * Splits the pages {@code from}..{@code to} (inclusive) into sections using a
 * {@link DividerAwareTextExtrationStrategy}, writes each section's text to its own
 * file, and returns all section texts joined into one string.
 * <p>
 * The file for a section is named {@code String.format(format, page, index)}, where
 * {@code index} counts the sections of that page from 0. In the returned string each
 * section is preceded by a {@code "--"} line and each page is followed by two newlines.
 * </p>
 *
 * @param reader the document to read
 * @param format format string for the section file paths; receives page number and section index
 * @param from   first page to process (inclusive)
 * @param to     last page to process (inclusive)
 * @return the combined text of all sections
 * @throws IOException if parsing a page or writing a section file fails
 */
String extractAndStore(PdfReader reader, String format, int from, int to) throws IOException {
    final StringBuilder combined = new StringBuilder();
    for (int page = from; page <= to; page++) {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        DividerAwareTextExtrationStrategy strategy = parser.processContent(page,
                new DividerAwareTextExtrationStrategy(810, 30, 20, 575));

        int sectionIndex = 0;
        for (Section section : strategy.getSections()) {
            String text = strategy.getResultantText(section);
            String fileName = String.format(format, page, sectionIndex);
            Files.write(Paths.get(fileName), text.getBytes("UTF8"));
            combined.append("--\n");
            combined.append(text);
            combined.append('\n');
            sectionIndex++;
        }
        combined.append("\n\n");
    }
    return combined.toString();
}
/**
 * Extracts the text of a PDF and returns it split into lines.
 * <p>
 * Every page is read with a {@link SimpleTextExtractionStrategy}; a newline is appended
 * after each page and the combined text is split on {@code "\n"}.
 * </p>
 * <p>
 * This method is best-effort: any failure (including a failure to close the reader)
 * results in {@code null} instead of an exception, so callers must null-check the
 * result. The {@code throws IOException} clause is kept for source compatibility.
 * </p>
 *
 * @param PdfFile the file name handed to {@link PdfReader}
 * @return the lines of the document, or {@code null} if the document could not be read
 * @throws IOException declared for compatibility; failures are reported as {@code null}
 */
public static String[] extractsPdfLines(String PdfFile) throws IOException {
    try {
        PdfReader reader = new PdfReader(PdfFile);
        try {
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            StringBuilder buff = new StringBuilder();
            for (int i = 1; i <= reader.getNumberOfPages(); i++) {
                TextExtractionStrategy strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
                buff.append(strategy.getResultantText()).append('\n');
            }
            return buff.toString().split("\n");
        } finally {
            // Previously the reader stayed open whenever a page failed to parse.
            reader.close();
        }
    } catch (Exception e) {
        // Deliberate: the existing contract signals every failure by returning null.
        return null;
    }
}
/**
 * Extracts the text of every page of {@code KLEE.pdf} (located in
 * {@code RESOURCES_DIR}) with a {@link LocationTextExtractionStrategy} and writes it,
 * one page per {@code println}, to the file {@code "extracted text"} in the same
 * directory.
 * <p>
 * NOTE(review): despite its name and return type this method does not collect any
 * text chunks; it always returns {@code null}.
 * </p>
 *
 * @return always {@code null}
 * @throws IOException if the PDF cannot be read or the output file cannot be opened
 */
private static List<LocationTextExtractionStrategy.TextChunk> getTextChunks() throws IOException {
    PdfReader reader = new PdfReader(RESOURCES_DIR + "KLEE.pdf");
    // try-with-resources flushes and closes the writer even if a page fails to parse.
    try (PrintWriter out = new PrintWriter(new FileOutputStream(RESOURCES_DIR + "extracted text"))) {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        for (int i = 1; i <= reader.getNumberOfPages(); i++) {
            TextExtractionStrategy strategy = parser.processContent(i, new LocationTextExtractionStrategy());
            out.println(strategy.getResultantText());
        }
    } finally {
        // The reader was never closed before.
        reader.close();
    }
    return null;
}
/**
 * Reads a PDF from the given stream and stores the text of each page.
 * <p>
 * Each page is processed with its own {@code ImportRenderListener}; the listener's
 * {@code text} is appended to {@code pageText}, so afterwards {@code pageText} holds
 * one entry per page in page order. This constructor does not itself close
 * {@code is}.
 * </p>
 *
 * @param is stream containing the PDF document
 * @throws IOException if the document cannot be read or parsed
 */
public TextExtractor(InputStream is) throws IOException {
    PdfReader pdfReader = new PdfReader(is);
    try {
        PdfReaderContentParser parser = new PdfReaderContentParser(pdfReader);
        int pages = pdfReader.getNumberOfPages();
        for (int i = 1; i <= pages; i++) {
            ImportRenderListener renderListener = new ImportRenderListener();
            parser.processContent(i, renderListener);
            pageText.add(renderListener.text);
        }
    } finally {
        // Release the reader once all pages are extracted (or extraction failed).
        pdfReader.close();
    }
}
/**
 * Scans all pages of a PDF for text containing {@code KEY_WORD} and reports where it
 * was found.
 * <p>
 * Every rendered text piece is printed to standard output as {@code page:text}. When a
 * piece contains {@code KEY_WORD}, {@code resu} is replaced by a new three-element
 * array: the {@code x} and {@code y} of the bounding rectangle of the text's baseline,
 * and the current page number. Later matches overwrite earlier ones, so the last match
 * in the document wins.
 * </p>
 * <p>
 * NOTE(review): {@code i} and {@code resu} are not declared in this method, so they
 * are state shared with code outside it. {@code resu} is not reset here; if nothing
 * matches, whatever value it held before the call is returned. The reader is never
 * closed. An {@link IOException} is only printed, after which {@code resu} is returned
 * as is.
 * </p>
 *
 * @param filePath the file name handed to {@link PdfReader}
 * @return the current value of {@code resu}: {@code {x, y, page}} of the last match
 *         found by this call, otherwise whatever {@code resu} held beforehand
 */
public static float[] getKeyWords(String filePath) {
    try {
        PdfReader pdfReader = new PdfReader(filePath);
        int pageNum = pdfReader.getNumberOfPages();
        PdfReaderContentParser pdfReaderContentParser = new PdfReaderContentParser(pdfReader);
        // Page indices start at 1.
        for (i = 1; i <= pageNum; i++) {
            pdfReaderContentParser.processContent(i, new RenderListener() {
                // Called for each piece of rendered text on the page being processed.
                public void renderText(TextRenderInfo textRenderInfo) {
                    String text = textRenderInfo.getText();
                    System.out.println(i + ":" +text);
                    if (null != text && text.contains(KEY_WORD)) {
                        Float boundingRectange = textRenderInfo.getBaseline().getBoundingRectange();
                        // Record position and page of this match, replacing any earlier one.
                        resu = new float[3];
                        resu[0] = boundingRectange.x;
                        resu[1] = boundingRectange.y;
                        resu[2] = i;
                    }
                }
                // Images and text-block boundaries are irrelevant for the keyword search.
                public void renderImage(ImageRenderInfo arg0) { }
                public void endTextBlock() { }
                public void beginTextBlock() { }
            });
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return resu;
}
/**
 * Merges every page of the given document, in page order, by delegating to
 * {@code merge(PdfReader, PdfReaderContentParser, int)} with a single shared parser.
 *
 * @param reader the source document
 * @throws IOException if a page cannot be processed
 */
void merge(PdfReader reader) throws IOException {
    final PdfReaderContentParser contentParser = new PdfReaderContentParser(reader);
    int pageNumber = 1;
    while (pageNumber <= reader.getNumberOfPages()) {
        merge(reader, contentParser, pageNumber);
        pageNumber++;
    }
}
/**
 * Runs a {@link PageVerticalAnalyzer} over every page of a PDF and reports the
 * vertical flips it found.
 * <p>
 * For each page a {@code "Page n"} line is emitted, followed by the flips in
 * consecutive pairs formatted as {@code "%3.3f - %3.3f"}, or by {@code "No content"}
 * when the analyzer recorded no flips. The report is printed to standard output and
 * written to {@code target} using the platform default charset.
 * </p>
 *
 * @param pdf    stream containing the PDF document
 * @param target file receiving the report
 * @throws IOException if the document cannot be read or the report cannot be written
 */
void analyzeVertically(InputStream pdf, File target) throws IOException {
    final PdfReader reader = new PdfReader(pdf);
    try {
        final PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        final StringBuilder report = new StringBuilder();

        for (int page = 1; page <= reader.getNumberOfPages(); page++) {
            PageVerticalAnalyzer analyzer = parser.processContent(page, new PageVerticalAnalyzer());
            report.append("Page ").append(page).append('\n');

            if (analyzer.verticalFlips.size() == 0) {
                report.append("No content\n\n");
                continue;
            }

            // Walk the flips pairwise; a trailing unpaired flip is ignored.
            for (int lower = 0, upper = 1; upper < analyzer.verticalFlips.size(); lower += 2, upper += 2) {
                report.append(String.format("%3.3f - %3.3f\n",
                        analyzer.verticalFlips.get(lower), analyzer.verticalFlips.get(upper)));
            }
            report.append('\n');
        }

        final String sections = report.toString();
        System.out.print(sections);
        Files.write(target.toPath(), sections.getBytes());
    } finally {
        reader.close();
    }
}
/**
 * Copies a PDF from {@code input} to {@code output} and outlines every text location
 * reported by a {@link SearchTextLocationExtractionStrategy} for {@code pattern}.
 * <p>
 * Each page is parsed with a fresh strategy. Pages with at least one location get one
 * rectangle per location, stroked in RGB (255, 255, 0) on the page's over-content;
 * pages without locations are left untouched.
 * </p>
 *
 * @param input   stream containing the source PDF
 * @param output  stream receiving the stamped PDF
 * @param pattern the pattern handed to the search strategy
 * @throws DocumentException if the stamper cannot be created or closed
 * @throws IOException       if the document cannot be read or written
 */
void mark(InputStream input, OutputStream output, Pattern pattern) throws DocumentException, IOException {
    final PdfReader reader = new PdfReader(input);
    final PdfStamper stamper = new PdfStamper(reader, output);
    try {
        final PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        for (int pageNr = 1; pageNr <= reader.getNumberOfPages(); pageNr++) {
            SearchTextLocationExtractionStrategy strategy = new SearchTextLocationExtractionStrategy(pattern);
            parser.processContent(pageNr, strategy, Collections.emptyMap()).getResultantText();

            Collection<TextRectangle> locations = strategy.getLocations(null);
            if (!locations.isEmpty()) {
                PdfContentByte canvas = stamper.getOverContent(pageNr);
                canvas.setRGBColorStroke(255, 255, 0);
                for (TextRectangle hit : locations) {
                    canvas.rectangle(hit.getMinX(), hit.getMinY(), hit.getWidth(), hit.getHeight());
                }
                canvas.stroke();
            }
        }
        stamper.close();
    } finally {
        reader.close();
    }
}
/**
 * Runs a {@link FreeSpaceFinder} over one page and returns the free spaces it
 * collected.
 * <p>
 * The finder is initialised with the page's crop box and the requested minimum
 * dimensions, then fed the page content.
 * </p>
 *
 * @param reader    the document to inspect
 * @param minWidth  minimum width passed to the finder
 * @param minHeight minimum height passed to the finder
 * @param page      the 1-based page number
 * @return the finder's {@code freeSpaces} collection
 * @throws IOException if the page content cannot be parsed
 */
public Collection<Rectangle2D> find(PdfReader reader, float minWidth, float minHeight, int page) throws IOException {
    final Rectangle box = reader.getCropBox(page);
    final Rectangle2D cropArea =
            new Rectangle2D.Float(box.getLeft(), box.getBottom(), box.getWidth(), box.getHeight());

    final FreeSpaceFinder spaceFinder = new FreeSpaceFinder(cropArea, minWidth, minHeight);
    new PdfReaderContentParser(reader).processContent(page, spaceFinder);
    return spaceFinder.freeSpaces;
}
/**
 * Runs a {@link FreeSpaceFinderExt} over one page and returns the free spaces it
 * collected.
 * <p>
 * The finder is initialised with the page's crop box and the requested minimum
 * dimensions, then fed the page content.
 * </p>
 *
 * @param reader    the document to inspect
 * @param minWidth  minimum width passed to the finder
 * @param minHeight minimum height passed to the finder
 * @param page      the 1-based page number
 * @return the finder's {@code freeSpaces} collection
 * @throws IOException if the page content cannot be parsed
 */
public Collection<Rectangle2D> findExt(PdfReader reader, float minWidth, float minHeight, int page) throws IOException {
    final Rectangle box = reader.getCropBox(page);
    final Rectangle2D cropArea =
            new Rectangle2D.Float(box.getLeft(), box.getBottom(), box.getWidth(), box.getHeight());

    final FreeSpaceFinder spaceFinder = new FreeSpaceFinderExt(cropArea, minWidth, minHeight);
    new PdfReaderContentParser(reader).processContent(page, spaceFinder);
    return spaceFinder.freeSpaces;
}
/**
 * Reads all text chunks of a PDF file.
 * <p>
 * Each page is processed with its own {@link TextChunkExtractionStrategy} (constructed
 * with the page number); the chunks of all pages are collected in page order.
 * </p>
 *
 * @param file the PDF file to read
 * @return a {@link PDFReadResult} for {@code file} holding the collected chunks
 * @throws IOException if the file cannot be opened or parsed
 */
public PDFReadResult readPDFFromFile(File file) throws IOException {
    ArrayList<TextChunk> textChunks = new ArrayList<TextChunk>();
    PdfReader reader = new PdfReader(file.getAbsolutePath());
    try {
        PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        for (int i = 1; i <= reader.getNumberOfPages(); i++) {
            TextChunkExtractionStrategy strategy = parser.processContent(i, new TextChunkExtractionStrategy(i));
            textChunks.addAll(strategy.getResultantTextChunks());
        }
    } finally {
        // Previously the reader stayed open when a page failed to parse.
        reader.close();
    }
    return new PDFReadResult(file, textChunks);
}
@Override public BookReadingResult open(@NonNull File file, @NonNull PercentSender percentSender, @NonNull Runnable readingEndSender) { try { PdfReader pdfReader = new PdfReader(file.getAbsolutePath()); PdfReaderContentParser parser = new PdfReaderContentParser(pdfReader); int numberOfPages = pdfReader.getNumberOfPages(); int oldPercent = 0, newPercent; StringBuffer stringBuffer = new StringBuffer(); for (int i = 1; i <= pdfReader.getNumberOfPages(); i++) { TextExtractionStrategy strategy = parser.processContent(i, new SimpleTextExtractionStrategy()); String pageText = strategy.getResultantText(); stringBuffer.append(pageText); if (pageText.endsWith("\\.") || pageText.endsWith("!") || pageText.endsWith("?") || pageText.endsWith(":")) { stringBuffer.append(" "); } else { stringBuffer.append(". "); } newPercent = 100 * i / numberOfPages; if (newPercent != oldPercent) { percentSender.refreshPercents(oldPercent, newPercent); oldPercent = newPercent; } } pdfReader.close(); String resultText = new String(stringBuffer); resultText = resultText.trim(); // delete first and last space (if exist) resultText = resultText.replaceAll("\\s+", " "); // delete all duplicate white spaces resultText = resultText.replaceAll("(\\.)+", "\\."); // delete all duplicate dots if (resultText.length() < 1) { return null; } readingEndSender.run(); return new BookReadingResult(resultText, InternalStorageFileHelper.fileNameWithoutExtension(file), ""); } catch (IOException e) { e.printStackTrace(); return null; } }
/**
 * Imports one source page in horizontal strips, placing as many strips as fit below the
 * content already written and continuing on new output pages for the rest.
 * <p>
 * The strip boundaries come from {@code PageVerticalAnalyzer.verticalFlips}. The code
 * walks that list from its last index towards index 0 in steps of two and treats the
 * difference of two entries as a height, so it presumably holds ascending y coordinates
 * alternating between the bottom and top of content sections — TODO confirm against
 * {@code PageVerticalAnalyzer}. Pages with fewer than two flips are skipped.
 * </p>
 *
 * @param reader the source document
 * @param parser a content parser for {@code reader}
 * @param page   the 1-based number of the page to import
 * @throws IOException              if the page content cannot be parsed
 * @throws IllegalArgumentException if, after starting a new output page, still not even
 *                                  one further section fits
 */
void merge(PdfReader reader, PdfReaderContentParser parser, int page) throws IOException {
    PdfImportedPage importedPage = writer.getImportedPage(reader, page);
    PdfContentByte directContent = writer.getDirectContent();
    PageVerticalAnalyzer finder = parser.processContent(page, new PageVerticalAnalyzer());
    // Fewer than two flips: nothing to import from this page.
    if (finder.verticalFlips.size() < 2)
        return;
    Rectangle pageSizeToImport = reader.getPageSize(page);
    // startFlip indexes the flip at which the next strip to place begins.
    int startFlip = finder.verticalFlips.size() - 1;
    boolean first = true;
    while (startFlip > 0) {
        // Every pass after the first continues on a new output page.
        if (!first)
            newPage();
        float freeSpace = yPosition - pageSize.getBottom(bottomMargin);
        // Move endFlip down two flips at a time while the span from startFlip to the
        // next candidate still fits into the free space.
        int endFlip = startFlip + 1;
        while ((endFlip > 1) && (finder.verticalFlips.get(startFlip) - finder.verticalFlips.get(endFlip - 2) < freeSpace))
            endFlip -=2;
        if (endFlip < startFlip) {
            // At least one section fits: clip to a strip of that height directly below
            // yPosition and draw the imported page shifted so that the start flip lands
            // on yPosition.
            float height = finder.verticalFlips.get(startFlip) - finder.verticalFlips.get(endFlip);
            directContent.saveState();
            directContent.rectangle(0, yPosition - height, pageSizeToImport.getWidth(), height);
            directContent.clip();
            directContent.newPath();
            writer.getDirectContent().addTemplate(importedPage, 0, yPosition - (finder.verticalFlips.get(startFlip) - pageSizeToImport.getBottom()));
            directContent.restoreState();
            // Advance the write position past the strip plus the gap, and continue with
            // the flip just below the placed strip.
            yPosition -= height + gap;
            startFlip = endFlip - 1;
        } else if (!first)
            // Nothing fit even after newPage() was called in this pass.
            throw new IllegalArgumentException(String.format("Page %s content sections too large.", page));
        // On the first pass a non-fit is tolerated; the next pass retries after newPage().
        first = false;
    }
}
/**
 * Draws a horizontal line at every vertical flip reported by a {@link TextLineFinder}
 * and stores the result in {@code RESULT_FOLDER}.
 * <p>
 * Pages from {@code startPage} up to but not including {@code endPage} are processed.
 * Each flip is printed to standard output and drawn across the full width of the
 * page size on the page's over-content. The output file is named
 * {@code <name>-lines-<startPage>-<endPage>.pdf}, where {@code name} is the last path
 * segment of {@code resource}.
 * </p>
 *
 * @param resource  class-path resource name of the PDF
 * @param startPage first page to process (inclusive)
 * @param endPage   page at which processing stops (exclusive)
 * @throws IOException       if the resource cannot be read or the result cannot be written
 * @throws DocumentException if the stamper cannot be created or closed
 */
void markLineBoundaries(String resource, int startPage, int endPage) throws IOException, DocumentException {
    final String name = new File(resource).getName();
    final String target = String.format("%s-lines-%s-%s.pdf", name, startPage, endPage);
    final InputStream resourceStream = getClass().getResourceAsStream(resource);
    try {
        final PdfReader reader = new PdfReader(resourceStream);
        final PdfReaderContentParser parser = new PdfReaderContentParser(reader);
        System.out.printf("\nLine boundaries in %s\n", name);

        final File targetFile = new File(RESULT_FOLDER, target);
        final PdfStamper stamper = new PdfStamper(reader, new FileOutputStream(targetFile));

        int page = startPage;
        while (page < endPage) {
            System.out.printf("\n Page %s\n ", page);

            TextLineFinder lineFinder = new TextLineFinder();
            parser.processContent(page, lineFinder);

            PdfContentByte canvas = stamper.getOverContent(page);
            Rectangle mediaBox = reader.getPageSize(page);
            for (float flip : lineFinder.verticalFlips) {
                System.out.printf(" %s", flip);
                canvas.moveTo(mediaBox.getLeft(), flip);
                canvas.lineTo(mediaBox.getRight(), flip);
            }
            System.out.println();
            canvas.stroke();

            page++;
        }
        stamper.close();
    } finally {
        if (resourceStream != null) {
            resourceStream.close();
        }
    }
}
/**
 * Searches the text of a PDF file for {@code searchedString} (case-insensitively) and
 * writes the hits to {@code outputfile}.
 * <p>
 * The file name is always written. If {@code PdfAnalysis.testPdfOk} accepts the file,
 * every page is extracted with a {@link SimpleTextExtractionStrategy} and split into
 * lines; for each matching line the page number and the line are written. The search
 * stops producing hits once the shared counter {@code stringfound} reaches
 * {@code MAXIMAL_HITS}. Afterwards the number of hits in this file is written. Any
 * exception is reported as a {@code <Fehlermeldung>} entry instead of being thrown.
 * </p>
 * <p>
 * Each page's lines are searched exactly once. Previously the accumulated text of all
 * pages read so far was re-searched for every page, so a hit on an early page was
 * counted again, and reported under the wrong page number, for every later page.
 * </p>
 *
 * @param file the PDF file to search
 * @throws IOException declared for compatibility; failures are written to {@code outputfile}
 */
public static void searchforStringinPdfFiles(File file) throws IOException {
    outputfile.println("<Dateiname>" + (file.getName()) + "</Dateiname>");
    if (!filetools.pdf.PdfAnalysis.testPdfOk(file)) {
        return;
    }
    try {
        PdfReader reader = new PdfReader(file.toString());
        try {
            int pagesPdf = reader.getNumberOfPages();
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            // Loop-invariant: lower-case the search term once instead of once per line.
            String searchStringLowerCase = searchedString.toLowerCase();
            int trefferinDatei = 0;
            for (int i = 1; i <= pagesPdf; i++) {
                TextExtractionStrategy strategy = parser.processContent(i, new SimpleTextExtractionStrategy());
                // Only the lines of the current page, so hits carry the right page number.
                String[] pageLines = strategy.getResultantText().split("\n");
                for (int j = 0; j < pageLines.length && stringfound < MAXIMAL_HITS; j++) {
                    if (pageLines[j].toLowerCase().contains(searchStringLowerCase)) {
                        trefferinDatei++;
                        stringfound++;
                        outputfile.println("<Seitenzahl>" + i + "</Seitenzahl>");
                        outputfile.println("<GanzeZeile>" + (pageLines[j]) + "</GanzeZeile>");
                    }
                }
            }
            outputfile.println("<TextinDatei>" + trefferinDatei + "</TextinDatei>");
            outputfile.println("<Suchergebnis>" + trefferinDatei + " x " + "</Suchergebnis>");
        } finally {
            // Release the reader even when a page fails to parse.
            reader.close();
        }
    } catch (Exception e) {
        // Deliberate: failures are reported in the output rather than propagated.
        outputfile.println("<Fehlermeldung>" + e + "</Fehlermeldung>");
    }
}
/**
 * Determines the rectangle enclosing the text of a page, i.e. the page size without
 * the surrounding whitespace.
 * <p>
 * The OP's code: the result is exactly the box reported by a {@link TextMarginFinder}.
 * The box is also printed to standard output.
 * </p>
 *
 * @param pageSize the original page size (not used by this variant)
 * @param reader   the pdf reader
 * @param page     the 1-based page number
 * @return a new page size which cuts out the whitespace
 * @throws IOException if the page content cannot be parsed
 */
private Rectangle getOutputPageSize(Rectangle pageSize, PdfReader reader, int page) throws IOException {
    final TextMarginFinder margins =
            new PdfReaderContentParser(reader).processContent(page, new TextMarginFinder());
    final float llx = margins.getLlx();
    final float lly = margins.getLly();
    final float urx = margins.getUrx();
    final float ury = margins.getUry();

    final Rectangle trimmed = new Rectangle(llx, lly, urx, ury);
    System.out.printf("Actual boundary: (%f;%f) to (%f;%f)\n", llx, lly, urx, ury);
    return trimmed;
}
/**
 * Determines a page size that cuts away the whitespace above and below the text while
 * keeping the full page width.
 * <p>
 * The OP's code revised to use the whole page width: left and right come from
 * {@code pageSize}, bottom and top from a {@link TextMarginFinder}. The text box found
 * is also printed to standard output.
 * </p>
 *
 * @param pageSize the original page size; supplies the left and right edge
 * @param reader   the pdf reader
 * @param page     the 1-based page number
 * @return a new page size which cuts out the vertical whitespace
 * @throws IOException if the page content cannot be parsed
 */
private Rectangle getOutputPageSize2(Rectangle pageSize, PdfReader reader, int page) throws IOException {
    final TextMarginFinder margins =
            new PdfReaderContentParser(reader).processContent(page, new TextMarginFinder());
    final float lly = margins.getLly();
    final float ury = margins.getUry();

    final Rectangle trimmed = new Rectangle(pageSize.getLeft(), lly, pageSize.getRight(), ury);
    System.out.printf("Actual boundary: (%f;%f) to (%f;%f)\n", margins.getLlx(), lly, margins.getUrx(), ury);
    return trimmed;
}
/**
 * Determines a page size based on the text box of a page, with a widened right edge.
 * <p>
 * The OP's code revised to use a width with equal margins left and right. The left,
 * bottom and top edges are those reported by a {@link TextMarginFinder}; the right
 * edge is computed as {@code 2 * urx - llx}. The text box found is also printed to
 * standard output.
 * </p>
 *
 * @param pageSize the original page size (not used by this variant)
 * @param reader   the pdf reader
 * @param page     the 1-based page number
 * @return a new page size which cuts out the whitespace
 * @throws IOException if the page content cannot be parsed
 */
private Rectangle getOutputPageSize3(Rectangle pageSize, PdfReader reader, int page) throws IOException {
    final TextMarginFinder margins =
            new PdfReaderContentParser(reader).processContent(page, new TextMarginFinder());
    final float llx = margins.getLlx();
    final float lly = margins.getLly();
    final float urx = margins.getUrx();
    final float ury = margins.getUry();

    final float right = 2 * urx - llx;
    final Rectangle trimmed = new Rectangle(llx, lly, right, ury);
    System.out.printf("Actual boundary: (%f;%f) to (%f;%f)\n", llx, lly, urx, ury);
    return trimmed;
}
/**
 * Determines the rectangle enclosing the content of a page, i.e. the page size without
 * the surrounding whitespace.
 * <p>
 * The OP's code revised to use {@link MarginFinder} instead of
 * {@code TextMarginFinder}: the result is exactly the box that finder reports. The box
 * is also printed to standard output.
 * </p>
 *
 * @param pageSize the original page size (not used by this variant)
 * @param reader   the pdf reader
 * @param page     the 1-based page number
 * @return a new page size which cuts out the whitespace
 * @throws IOException if the page content cannot be parsed
 */
private Rectangle getOutputPageSize4(Rectangle pageSize, PdfReader reader, int page) throws IOException {
    final MarginFinder margins =
            new PdfReaderContentParser(reader).processContent(page, new MarginFinder());
    final float llx = margins.getLlx();
    final float lly = margins.getLly();
    final float urx = margins.getUrx();
    final float ury = margins.getUry();

    final Rectangle trimmed = new Rectangle(llx, lly, urx, ury);
    System.out.printf("Actual boundary: (%f;%f) to (%f;%f)\n", llx, lly, urx, ury);
    return trimmed;
}
/**
 * <p>
 * Restricts the media box of every page in the given {@link PdfReader} to the content
 * box determined by the {@link MarginFinder} extended render listener.
 * </p>
 * <p>
 * The code is essentially taken from the {@link TestTrimPdfPage} methods
 * {@link TestTrimPdfPage#testWithStamperExtFinder()} and
 * {@link TestTrimPdfPage#getOutputPageSize4(Rectangle, PdfReader, int)}.
 * Unlike the code there, this method manipulates the media box because that is the
 * only box respected by {@link PdfWriter#getImportedPage(PdfReader, int)}.
 * </p>
 *
 * @param reader the document whose page dictionaries are modified in place
 * @throws IOException if the content of a page cannot be parsed
 */
static void cropPdf(PdfReader reader) throws IOException {
    final int pageCount = reader.getNumberOfPages();
    int pageNumber = 1;
    while (pageNumber <= pageCount) {
        MarginFinder margins =
                new PdfReaderContentParser(reader).processContent(pageNumber, new MarginFinder());
        Rectangle contentBox =
                new Rectangle(margins.getLlx(), margins.getLly(), margins.getUrx(), margins.getUry());

        float[] mediaBox = {
            contentBox.getLeft(), contentBox.getBottom(), contentBox.getRight(), contentBox.getTop()
        };
        PdfDictionary pageDictionary = reader.getPageN(pageNumber);
        pageDictionary.put(PdfName.MEDIABOX, new PdfArray(mediaBox));

        pageNumber++;
    }
}