public static float[] getKeyWords(String filePath) { try { PdfReader pdfReader = new PdfReader(filePath); int pageNum = pdfReader.getNumberOfPages(); PdfReaderContentParser pdfReaderContentParser = new PdfReaderContentParser(pdfReader); // 下标从1开始 for (i = 1; i <= pageNum; i++) { pdfReaderContentParser.processContent(i, new RenderListener() { public void renderText(TextRenderInfo textRenderInfo) { String text = textRenderInfo.getText(); System.out.println(i + ":" +text); if (null != text && text.contains(KEY_WORD)) { Float boundingRectange = textRenderInfo.getBaseline().getBoundingRectange(); resu = new float[3]; resu[0] = boundingRectange.x; resu[1] = boundingRectange.y; resu[2] = i; } } public void renderImage(ImageRenderInfo arg0) { } public void endTextBlock() { } public void beginTextBlock() { } }); } } catch (IOException e) { e.printStackTrace(); } return resu; }
/**
 * Called by the PdfContentStreamProcessor for every text chunk it encounters.
 * Grows {@code textRectangle} so that it also covers the bounding rectangles of
 * this chunk's descent line and ascent line; the first chunk seen initialises
 * the rectangle from its descent line.
 *
 * @see RenderListener#renderText(TextRenderInfo)
 */
public void renderText(TextRenderInfo renderInfo) {
    if (textRectangle != null) {
        textRectangle.add(renderInfo.getDescentLine().getBoundingRectange());
    } else {
        textRectangle = renderInfo.getDescentLine().getBoundingRectange();
    }
    // The ascent line is merged in on every call, including the first one.
    textRectangle.add(renderInfo.getAscentLine().getBoundingRectange());
}
/**
 * Reports the vertical extent of a text chunk: collects the y coordinates of
 * the start and end points of the ascent and descent lines and passes the
 * smallest and largest of them to {@code addVerticalUseSection}.
 */
@Override
public void renderText(TextRenderInfo renderInfo) {
    final LineSegment top = renderInfo.getAscentLine();
    final LineSegment bottom = renderInfo.getDescentLine();

    final float[] ys = new float[4];
    ys[0] = top.getStartPoint().get(Vector.I2);
    ys[1] = top.getEndPoint().get(Vector.I2);
    ys[2] = bottom.getStartPoint().get(Vector.I2);
    ys[3] = bottom.getEndPoint().get(Vector.I2);

    // After sorting, the extremes sit at both ends of the array.
    Arrays.sort(ys);
    final float lowest = ys[0];
    final float highest = ys[ys.length - 1];
    addVerticalUseSection(lowest, highest);
}
/**
 * Handles one text chunk depending on its fill color.
 *
 * <p>A chunk filled with a {@code GrayColor} of gray level 0 is passed character
 * by character to {@code renderCharacter}. Any other chunk first finishes the
 * entry in progress (when {@code currentField > -1}) and is then appended as a
 * whole to {@code entryBuilder}. With {@code debug} set, a line describing the
 * chunk is appended to {@code data} or {@code nonData} respectively.
 *
 * <p>An {@link IOException} is printed and otherwise ignored; the {@code chunk}
 * counter is incremented in every case.
 */
@Override
public void renderText(TextRenderInfo renderInfo) {
    try {
        final Vector origin = renderInfo.getBaseline().getStartPoint();
        final BaseColor fill = renderInfo.getFillColor();
        final boolean grayLevelZero = fill instanceof GrayColor && ((GrayColor) fill).getGray() == 0;

        if (!grayLevelZero) {
            if (debug) {
                nonData.append(String.format("%4d\t%3.3f %3.3f\t%s\n", chunk, origin.get(I1), origin.get(I2), renderInfo.getText()));
            }
            if (currentField > -1) {
                finishEntry();
            }
            entryBuilder.append(renderInfo.getText());
            return;
        }

        if (debug) {
            data.append(String.format("%4d\t%3.3f %3.3f\t%s\n", chunk, origin.get(I1), origin.get(I2), renderInfo.getText()));
        }
        for (TextRenderInfo glyph : renderInfo.getCharacterRenderInfos()) {
            renderCharacter(glyph);
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Runs on every path, including the early return above.
        chunk++;
    }
}
public void renderCharacter(TextRenderInfo renderInfo) throws IOException { Vector startPoint = renderInfo.getBaseline().getStartPoint(); float x = startPoint.get(I1); if (currentField > -1) { if (isInCurrentField(x)) { entryBuilder.append(renderInfo.getText()); return; } if (isInNextField(x)) { currentField++; entryBuilder.append('\t').append(renderInfo.getText()); return; } finishEntry(); // nonData.append(String.format("%4d\t%3.3f %3.3f\t%s\n", chunk, startPoint.get(I1), startPoint.get(I2), renderInfo.getText())); } if (isInNextField(x)) { finishEntry(); currentField = 0; } entryBuilder.append(renderInfo.getText()); }
/**
 * Creates a strategy that searches the extracted text for {@code pattern}.
 * Text chunk locations are created as {@code AscentDescentTextChunkLocation}
 * instances, which carry the ascent and descent lines in addition to the baseline.
 *
 * @param pattern the pattern stored in this strategy
 */
public SearchTextLocationExtractionStrategy(Pattern pattern) {
    super(new TextChunkLocationStrategy() {
        public TextChunkLocation createLocation(TextRenderInfo renderInfo, LineSegment baseline) {
            // Per the original author's note: the baseline handed in here may have been
            // adjusted with respect to text rise, whereas the ascent and descent lines
            // are deliberately taken unmodified because the actual positions are wanted.
            final LineSegment ascent = renderInfo.getAscentLine();
            final LineSegment descent = renderInfo.getDescentLine();
            final float spaceWidth = renderInfo.getSingleSpaceWidth();
            return new AscentDescentTextChunkLocation(baseline, ascent, descent, spaceWidth);
        }
    });
    this.pattern = pattern;
}
/**
 * Replaces the text of a chunk by a string derived from the glyph names in the
 * font's {@code /Encoding /Differences} array, then forwards the chunk to the
 * wrapped strategy.
 *
 * <p>For every byte of the chunk's PDF string, the name found at that index of
 * the differences array is taken, its first two characters (from
 * {@code PdfName.toString()}) are dropped, and the remainder is parsed as a
 * hexadecimal code point. The resulting string is written into the render info
 * through the reflective {@code stringField}.
 *
 * <p>When the font has no {@code /Encoding} dictionary or no {@code /Differences}
 * array, the chunk is forwarded unmodified instead of failing with a
 * {@link NullPointerException}.
 *
 * <p>NOTE(review): the differences array is indexed directly by byte value;
 * this presumably matches only the specific documents this filter was written
 * for - confirm before reusing it on other files.
 */
@Override
public void renderText(TextRenderInfo renderInfo) {
    DocumentFont font = renderInfo.getFont();
    PdfDictionary dict = font.getFontDictionary();
    PdfDictionary encoding = dict.getAsDict(PdfName.ENCODING);
    PdfArray diffs = encoding == null ? null : encoding.getAsArray(PdfName.DIFFERENCES);

    if (diffs != null) {
        StringBuilder builder = new StringBuilder();
        for (byte b : renderInfo.getPdfString().getBytes()) {
            // Mask to an unsigned value: a plain cast sign-extends bytes >= 0x80
            // and would yield an index far outside the array.
            PdfName name = diffs.getAsName(b & 0xFF);
            String s = name.toString().substring(2);
            int i = Integer.parseUnsignedInt(s, 16);
            builder.append((char) i);
        }

        try {
            stringField.set(renderInfo, builder.toString());
        } catch (IllegalArgumentException | IllegalAccessException e) {
            e.printStackTrace();
        }
    }

    strategy.renderText(renderInfo);
}
/** * * @see com.itextpdf.text.pdf.parser.RenderListener#renderText(com.itextpdf.text.pdf.parser.TextRenderInfo) */ public void renderText(TextRenderInfo renderInfo) { LineSegment segment = renderInfo.getBaseline(); if (renderInfo.getRise() != 0){ // remove the rise from the baseline - we do this because the text from a super/subscript render operations should probably be considered as part of the baseline of the text the super/sub is relative to Matrix riseOffsetTransform = new Matrix(0, -renderInfo.getRise()); segment = segment.transformBy(riseOffsetTransform); } TextChunk location = new TextChunk(renderInfo.getText(), segment.getStartPoint(), segment.getEndPoint(), renderInfo.getSingleSpaceWidth()); locationalResult.add(location); }
@Override public void renderText(TextRenderInfo renderInfo) { textLineFinder.renderText(renderInfo); LineSegment segment = renderInfo.getBaseline(); if (renderInfo.getRise() != 0){ // remove the rise from the baseline - we do this because the text from a super/subscript render operations should probably be considered as part of the baseline of the text the super/sub is relative to Matrix riseOffsetTransform = new Matrix(0, -renderInfo.getRise()); segment = segment.transformBy(riseOffsetTransform); } TextChunk location = new HorizontalTextChunk(renderInfo.getText(), segment.getStartPoint(), segment.getEndPoint(), renderInfo.getSingleSpaceWidth()); getLocationalResult().add(location); }
/**
 * Computes the rectangle covered by a text chunk - the union of the bounding
 * rectangles of its ascent and descent lines - and hands it to {@code remove}.
 * An {@link ArrayIndexOutOfBoundsException} raised along the way is reported on
 * {@code System.err} and the chunk is skipped.
 */
@Override
public void renderText(TextRenderInfo renderInfo) {
    try {
        final Rectangle2D occupied = renderInfo.getAscentLine().getBoundingRectange();
        final Rectangle2D descentBounds = renderInfo.getDescentLine().getBoundingRectange();
        occupied.add(descentBounds);
        remove(occupied);
    } catch (ArrayIndexOutOfBoundsException translationFailure) {
        System.err.printf("!!! Ignoring text render info due to translation problem: %s\n", renderInfo);
        translationFailure.printStackTrace();
    }
}
/**
 * Rejects text that lies COMPLETELY inside one of the filter {@code rectangles}.
 *
 * <p>The text is represented by four corner points: the start and end points of
 * its ascent and descent lines. Borders count as inside.
 *
 * @return {@code false} when all four corners lie within a single rectangle,
 *         {@code true} otherwise (including when there are no rectangles)
 */
@Override
public boolean allowText(TextRenderInfo renderInfo) {
    final LineSegment ascent = renderInfo.getAscentLine();
    final LineSegment descent = renderInfo.getDescentLine();

    final Point2D[] corners = new Point2D[4];
    corners[0] = new Point2D.Float(ascent.getStartPoint().get(0), ascent.getStartPoint().get(1));
    corners[1] = new Point2D.Float(ascent.getEndPoint().get(0), ascent.getEndPoint().get(1));
    corners[2] = new Point2D.Float(descent.getEndPoint().get(0), descent.getEndPoint().get(1));
    corners[3] = new Point2D.Float(descent.getStartPoint().get(0), descent.getStartPoint().get(1));

    regions:
    for (Rectangle region : rectangles) {
        for (Point2D corner : corners) {
            final boolean withinX = region.getLeft() <= corner.getX() && corner.getX() <= region.getRight();
            final boolean withinY = region.getBottom() <= corner.getY() && corner.getY() <= region.getTop();
            if (!(withinX && withinY)) {
                // One corner sticks out, so this region does not contain the text.
                continue regions;
            }
        }
        return false;
    }
    return true;
}
/**
 * <a href="http://stackoverflow.com/questions/37262087/avoid-reading-hidden-text-from-pdf">
 * Avoid reading hidden text from PDF
 * </a>
 * <br/>
 * <a href="https://drive.google.com/file/d/0B-JlUfbplwmhUjN3QWExeUVNclU/view?usp=sharing">
 * demo.pdf
 * </a>
 *
 * <p>
 * The extra, invisible text in the document turns out to be drawn in text
 * rendering mode 3 at the origin. This test extracts the text twice - once
 * unfiltered and once with a filter rejecting rendering mode 3 - and prints
 * both results for comparison.
 * </p>
 */
@Test
public void testDemo() throws Exception {
    InputStream resourceStream = getClass().getResourceAsStream("demo.pdf");
    try {
        PdfReader reader = new PdfReader(resourceStream);

        String plainTarget = new File(RESULT_FOLDER, "demo.%s.txt").toString();
        String plainText = extractAndStore(reader, plainTarget);

        // Only text that is not drawn in rendering mode 3 passes this filter.
        RenderFilter visibleTextOnly = new RenderFilter() {
            public boolean allowText(TextRenderInfo renderInfo) {
                return renderInfo.getTextRenderMode() != 3;
            }
        };
        String filteredTarget = new File(RESULT_FOLDER, "demo.filtered.%s.txt").toString();
        String filteredText = extractAndStore(reader, filteredTarget, visibleTextOnly);

        System.out.println("\nText demo.pdf\n************************");
        System.out.println(plainText);
        System.out.println("\n*filtered");
        System.out.println(filteredText);
        System.out.println("************************");
    } finally {
        if (resourceStream != null) {
            resourceStream.close();
        }
    }
}
/**
 * Extracts the text of one page with a {@code SimpleTextExtractionStrategy}
 * that additionally marks text blocks in the output.
 *
 * <p>Markers are only written once some text has been rendered: from then on
 * each block start appends {@code "<BLOCK>"} and each block end appends
 * {@code "</BLOCK>"} plus a line break. If any text was rendered, the final
 * result is prefixed with {@code "<BLOCK>"}.
 *
 * @param reader the PDF to read from
 * @param pageNo the page number passed to {@code PdfTextExtractor.getTextFromPage}
 * @return the extracted text including block markers
 * @throws IOException propagated from {@code PdfTextExtractor.getTextFromPage}
 */
String extractSimple(PdfReader reader, int pageNo) throws IOException {
    SimpleTextExtractionStrategy blockMarkingStrategy = new SimpleTextExtractionStrategy() {
        /** Becomes true as soon as the first text chunk has been rendered. */
        boolean textSeen = false;

        @Override
        public void renderText(TextRenderInfo renderInfo) {
            textSeen = true;
            super.renderText(renderInfo);
        }

        @Override
        public void beginTextBlock() {
            if (textSeen) {
                appendTextChunk("<BLOCK>");
            }
            super.beginTextBlock();
        }

        @Override
        public void endTextBlock() {
            if (textSeen) {
                appendTextChunk("</BLOCK>\n");
            }
            super.endTextBlock();
        }

        @Override
        public String getResultantText() {
            return textSeen ? "<BLOCK>" + super.getResultantText() : super.getResultantText();
        }
    };
    return PdfTextExtractor.getTextFromPage(reader, pageNo, blockMarkingStrategy);
}
/**
 * Places a text chunk on the current page: the row is selected by the y
 * coordinate of the baseline start point, and the text is added to that row
 * with the baseline's start x coordinate and length. The values are also
 * logged at debug level.
 *
 * @see com.itextpdf.text.pdf.parser.RenderListener#renderText(
 *com.itextpdf.text.pdf.parser.TextRenderInfo)
 */
public void renderText(TextRenderInfo renderInfo) {
    final LineSegment baseline = renderInfo.getBaseline();
    final float startX = baseline.getStartPoint().get(0);
    final float startY = baseline.getStartPoint().get(1);
    final float width = baseline.getLength();
    final String content = renderInfo.getText();

    log.debug("Text: @({}, {}) width: {} '{}'", startX, startY, width, content);
    currentPage().rowAt(startY).text(startX, content, width);
}
/**
 * Stores the chunk's text in {@code text}, keyed by the start point of the
 * chunk's baseline.
 */
@Override
public void renderText(TextRenderInfo info) {
    final String content = info.getText();
    text.put(info.getBaseline().getStartPoint(), content);
}
/** No-op: this implementation does nothing with the text chunk it receives. */
@Override public void renderText(TextRenderInfo renderInfo) { }
/**
 * Splits the chunk into its per-character render infos and forwards each of
 * them separately to the superclass implementation.
 */
@Override
public void renderText(TextRenderInfo renderInfo) {
    for (TextRenderInfo singleCharacter : renderInfo.getCharacterRenderInfos()) {
        super.renderText(singleCharacter);
    }
}
/** No-op: this implementation does nothing with the text chunk it receives. */
@Override public void renderText(TextRenderInfo arg0) { }
/**
 * Builds a {@code HorizontalTextChunkLocation} from the start and end point of
 * the given baseline and the single-space width of the render info.
 *
 * @param renderInfo source of the single-space width
 * @param baseline   baseline whose end points define the location
 * @return the newly created location
 */
@Override
public TextChunkLocation createLocation(TextRenderInfo renderInfo, LineSegment baseline) {
    final float spaceWidth = renderInfo.getSingleSpaceWidth();
    return new HorizontalTextChunkLocation(
            baseline.getStartPoint(),
            baseline.getEndPoint(),
            spaceWidth);
}
/**
 * Passes the text chunk to {@code textLineFinder} first and then to the
 * superclass implementation.
 */
@Override public void renderText(TextRenderInfo renderInfo) { textLineFinder.renderText(renderInfo); super.renderText(renderInfo); }
/**
 * Creates a filter around the given strategy and obtains reflective access to
 * the {@code text} field declared in {@code TextRenderInfo}.
 *
 * @param strategy the strategy stored in this filter
 * @throws NoSuchFieldException if {@code TextRenderInfo} declares no field named {@code text}
 * @throws SecurityException    if reflective access to the field is denied
 */
public RemappingExtractionFilter(TextExtractionStrategy strategy) throws NoSuchFieldException, SecurityException {
    // Look the field up first; construction fails as a whole if this throws.
    stringField = TextRenderInfo.class.getDeclaredField("text");
    stringField.setAccessible(true);
    this.strategy = strategy;
}
/**
 * <p>
 * Adds the ascender line of every text chunk whose fill color approximates
 * the header color to the divider lines, then delegates to the superclass.
 * </p>
 * <p>
 * Beware: the ascender line of each individual chunk in that color is added.
 * Strictly, the ascender lines of all chunks forming one header line should be
 * joined. As the blue header lines in the sample document consist of a single
 * chunk each, this sample code does not need to do so.
 * </p>
 */
@Override
public void renderText(TextRenderInfo renderInfo) {
    final boolean headerColored = approximates(renderInfo.getFillColor(), headerColor);
    if (headerColored) {
        lines.add(renderInfo.getAscentLine());
    }

    super.renderText(renderInfo);
}