public TextWalker(Iterable<Entry<Vector, String>> pdfText) throws IOException { for (Entry<Vector, String> entry : pdfText) { x = entry.getKey().get(Vector.I1); if (entry.getKey().get(Vector.I2) != y || isPastRightEdge(x)) { prefix.setLength(0); y = entry.getKey().get(Vector.I2); } List<ImportField> fieldCandidates = getMatches(); if (fieldCandidates.size() == 1) { ImportField field = fieldCandidates.get(0); if (field.hasLabel(prefix.toString().trim()) && field.isValueRegion(x)) { appendValue(field, entry.getValue()); } else if (field.isLabelRegion(x)) { appendPrefix(entry.getValue()); } } else if (! fieldCandidates.isEmpty()) { appendPrefix(entry.getValue()); } } }
/**
 * Grows {@link #textRectangle} so that it also covers the rendered image.
 * The unit square is mapped through the image matrix; the bounding
 * rectangles of its bottom and top edges are then merged into the rectangle.
 *
 * @see RenderListener#renderImage(ImageRenderInfo)
 */
public void renderImage(ImageRenderInfo renderInfo) {
    Matrix ctm = renderInfo.getImageCTM();

    // corners of the unit square after applying the image matrix
    Vector lowerLeft = new Vector(0, 0, 1).cross(ctm);
    Vector lowerRight = new Vector(1, 0, 1).cross(ctm);
    Vector upperLeft = new Vector(0, 1, 1).cross(ctm);
    Vector upperRight = new Vector(1, 1, 1).cross(ctm);

    LineSegment bottomEdge = new LineSegment(lowerLeft, lowerRight);
    LineSegment topEdge = new LineSegment(upperLeft, upperRight);

    if (textRectangle != null) {
        textRectangle.add(bottomEdge.getBoundingRectange());
    } else {
        textRectangle = bottomEdge.getBoundingRectange();
    }
    textRectangle.add(topEdge.getBoundingRectange());
}
/**
 * Adds the current path to {@link #lines} if it consists of a single line,
 * the painting operation is not a no-op, and the line is approximately
 * horizontal (its horizontal extent exceeds twenty times its vertical
 * extent). Stored segments always run in the direction of increasing x.
 * The remembered move-to and line-to points are cleared in any case.
 *
 * @return always {@code null}
 * @see ExtRenderListener#renderPath(PathPaintingRenderInfo)
 */
@Override
public Path renderPath(PathPaintingRenderInfo renderInfo) {
    boolean singleLine = moveToVector != null && lineToVector != null;
    if (singleLine && renderInfo.getOperation() != PathPaintingRenderInfo.NO_OP) {
        Vector start = moveToVector.cross(renderInfo.getCtm());
        Vector end = lineToVector.cross(renderInfo.getCtm());
        Vector delta = end.subtract(start);
        float dx = delta.get(Vector.I1);
        float dy = delta.get(Vector.I2);

        if (Math.abs(20 * dy) < Math.abs(dx)) {
            // normalize direction: left end first
            lines.add(dx >= 0 ? new LineSegment(start, end) : new LineSegment(end, start));
        }
    }

    moveToVector = null;
    lineToVector = null;
    return null;
}
/**
 * Derives a line number for this chunk from the y coordinate of its start
 * location and the vertical flip positions of the text line finder.
 *
 * @return {@code 0} if there are no flips or the y coordinate is below none
 *         of the inspected flips; otherwise a number computed from the index
 *         of the first inspected flip lying above the y coordinate
 */
public int getLineNumber() {
    float y = getStartLocation().get(Vector.I2);
    List<Float> flips = textLineFinder.verticalFlips;
    if (flips == null || flips.isEmpty()) {
        return 0;
    }

    int flipCount = flips.size();
    if (y < flips.get(0)) {
        return flipCount / 2 + 1;
    }

    // only the odd-indexed flips are inspected from here on
    int index = 1;
    while (index < flipCount) {
        if (y < flips.get(index)) {
            return (1 + flipCount - index) / 2;
        }
        index += 2;
    }
    return 0;
}
/**
 * Computes the axis-parallel rectangle enclosing the four reference corners
 * after transformation by the image matrix and passes it to
 * {@code remove}.
 */
@Override
public void renderImage(ImageRenderInfo renderInfo) {
    Matrix imageMatrix = renderInfo.getImageCTM();
    Vector[] corners = {
        rect00.cross(imageMatrix),
        rect01.cross(imageMatrix),
        rect10.cross(imageMatrix),
        rect11.cross(imageMatrix)
    };

    // start with a degenerate rectangle at the first corner, then grow it
    Rectangle2D usedSpace = new Rectangle2D.Float(corners[0].get(Vector.I1), corners[0].get(Vector.I2), 0, 0);
    for (int i = 1; i < corners.length; i++) {
        usedSpace.add(corners[i].get(Vector.I1), corners[i].get(Vector.I2));
    }

    remove(usedSpace);
}
/**
 * <p>
 * Sorts the {@link DividerAwareTextExtrationStrategy#lines} before
 * delegating to the parent implementation: they are not properly sorted
 * anymore because the additional lines come after all divider lines of
 * the same column.
 * </p>
 * <p>
 * Be aware that the {@link Comparator} used here is not a proper one: start
 * x coordinates differing by less than 2 are treated as equal, which makes
 * it not really transitive. It only works if the individual lines of the
 * same column have approximately the same starting x coordinate, differing
 * clearly from those of different columns.
 * </p>
 */
@Override
public List<Section> getSections() {
    Collections.sort(lines, new Comparator<LineSegment>() {
        @Override
        public int compare(LineSegment o1, LineSegment o2) {
            Vector first = o1.getStartPoint();
            Vector second = o2.getStartPoint();
            float x1 = first.get(Vector.I1);
            float x2 = second.get(Vector.I1);

            if (Math.abs(x1 - x2) < 2) {
                // same column: order by descending y
                return Float.compare(second.get(Vector.I2), first.get(Vector.I2));
            }
            // different columns: order by ascending x
            return Float.compare(x1, x2);
        }
    });
    return super.getSections();
}
/**
 * <a href="http://stackoverflow.com/questions/40549977/reading-legacy-word-forms-checkboxes-converted-to-pdf">
 * Reading legacy Word forms checkboxes converted to PDF
 * </a>
 * <br>
 * <a href="https://www.dropbox.com/s/4z7ky3yy2yaj53i/Doc1.pdf?dl=0">
 * Doc1.pdf
 * </a>
 * <p>
 * Shows how the drawn "checkboxes" can be extracted from the sample PDF
 * provided by the OP: for every page, the position and the checked state
 * of each detected box are printed to standard output.
 * </p>
 */
@Test
public void testExtractDoc1() throws IOException {
    try (InputStream resource = getClass().getResourceAsStream("Doc1.pdf")) {
        PdfReader pdfReader = new PdfReader(resource);
        int pageCount = pdfReader.getNumberOfPages();

        for (int page = 1; page <= pageCount; page++) {
            System.out.printf("\nPage %s\n====\n", page);

            CheckBoxExtractionStrategy strategy = new CheckBoxExtractionStrategy();
            PdfReaderContentParser parser = new PdfReaderContentParser(pdfReader);
            parser.processContent(page, strategy);

            for (Box box : strategy.getBoxes()) {
                Vector basePoint = box.getDiagonal().getStartPoint();
                String state;
                if (box.isChecked()) {
                    state = "checked";
                } else {
                    state = "unchecked";
                }
                System.out.printf("at %s, %s - %s\n", basePoint.get(Vector.I1), basePoint.get(Vector.I2), state);
            }
        }
    }
}
/**
 * Command line entry point: opens the file named by the first argument,
 * extracts its positioned text and prints every entry to standard output
 * as {@code <position>:<text>}.
 * <p>
 * Any exception (including a missing argument) is caught and its stack
 * trace printed.
 * </p>
 *
 * @param args command line arguments; {@code args[0]} is the path of the file to read
 */
public static void main(String[] args) {
    // try-with-resources guarantees the file handle is released even if
    // extraction or printing fails; the stream stays open while the text
    // is consumed inside the block.
    try (FileInputStream stream = new FileInputStream(args[0])) {
        Stream<Entry<Vector, String>> text = new TextExtractor(stream).getText();
        text.forEach(entry -> System.out.println(entry.getKey().toString() + ":" + entry.getValue()));
    } catch (Exception ex) {
        ex.printStackTrace();
    }
}
/**
 * Splits the text at single spaces and registers each resulting word in
 * {@code pdfTextBuilder} at successive x positions one unit apart, all on
 * the given y coordinate.
 *
 * @param text the text to add
 * @param x    the x coordinate of the first word
 * @param y    the y coordinate shared by all words
 * @return the x coordinate following the last word added
 */
private float addPdfText(String text, float x, float y) {
    String[] words = text.split(" ");
    float nextX = x;
    for (int i = 0; i < words.length; i++) {
        pdfTextBuilder.put(new Vector(nextX, y, 0f), words[i]);
        nextX += 1f;
    }
    return nextX;
}
/**
 * Extends {@code currentPathRectangle} by the points of the given path
 * construction operation, after transforming them with the current
 * transformation matrix.
 * <p>
 * For a rectangle operation the four corners are used; for any other
 * operation with segment data, the data is read as consecutive (x, y) pairs.
 * </p>
 */
@Override
public void modifyPath(PathConstructionRenderInfo renderInfo) {
    List<Float> segmentData = renderInfo.getSegmentData();
    List<Vector> points = new ArrayList<Vector>();

    if (renderInfo.getOperation() == PathConstructionRenderInfo.RECT) {
        float x = segmentData.get(0);
        float y = segmentData.get(1);
        float w = segmentData.get(2);
        float h = segmentData.get(3);
        points.add(new Vector(x, y, 1));
        points.add(new Vector(x + w, y, 1));
        points.add(new Vector(x, y + h, 1));
        points.add(new Vector(x + w, y + h, 1));
    } else if (segmentData != null) {
        int lastPairStart = segmentData.size() - 1;
        for (int i = 0; i < lastPairStart; i += 2) {
            points.add(new Vector(segmentData.get(i), segmentData.get(i + 1), 1));
        }
    }

    for (Vector point : points) {
        Vector transformed = point.cross(renderInfo.getCtm());
        // a zero-sized rectangle stands in for the single point
        Rectangle2D.Float pointRectangle =
                new Rectangle2D.Float(transformed.get(Vector.I1), transformed.get(Vector.I2), 0, 0);
        if (currentPathRectangle != null) {
            currentPathRectangle.add(pointRectangle);
        } else {
            currentPathRectangle = pointRectangle;
        }
    }
}
/**
 * Registers the vertical range covered by the text: the smallest and the
 * largest y coordinate among the end points of its ascent and descent lines.
 */
@Override
public void renderText(TextRenderInfo renderInfo) {
    LineSegment ascentLine = renderInfo.getAscentLine();
    LineSegment descentLine = renderInfo.getDescentLine();

    float[] ys = new float[4];
    int next = 0;
    for (LineSegment line : new LineSegment[] { ascentLine, descentLine }) {
        ys[next++] = line.getStartPoint().get(Vector.I2);
        ys[next++] = line.getEndPoint().get(Vector.I2);
    }

    // after sorting, the extremes sit at both ends of the array
    Arrays.sort(ys);
    addVerticalUseSection(ys[0], ys[ys.length - 1]);
}
/**
 * Registers the vertical range covered by the image: the smallest and the
 * largest y coordinate among the four corners of the unit square after
 * transformation by the image matrix.
 */
@Override
public void renderImage(ImageRenderInfo renderInfo) {
    Matrix ctm = renderInfo.getImageCTM();

    float[] ys = new float[4];
    for (int corner = 0; corner < ys.length; corner++) {
        // corner index encodes (x, y) as 2 * x + y with x, y in {0, 1}
        Vector transformed = new Vector(corner / 2, corner % 2, 1).cross(ctm);
        ys[corner] = transformed.get(Vector.I2);
    }

    Arrays.sort(ys);
    addVerticalUseSection(ys[0], ys[ys.length - 1]);
}
/** * <p> * This method returns a {@link List} of {@link Section} instances each representing * a section of the page delimited by a divider line above and/or below. The topmost * and bottommost sections of each text column are open at the top or the bottom, * implicitly delimited by the matching margin line. * </p> * <p> * {@link Section} implements {@link TextChunkFilter}. Thus, these section objects can be * used as argument of the parent class method {@link #getResultantText(TextChunkFilter)}. * </p> */ public List<Section> getSections() { List<Section> result = new ArrayList<Section>(); // TODO: Sort the array columnwise. In case of the OP's document, the lines already appear in the // correct order, so there was no need for sorting in the POC. LineSegment previous = null; for (LineSegment line : lines) { if (previous == null) { result.add(new Section(null, line)); } else if (Math.abs(previous.getStartPoint().get(Vector.I1) - line.getStartPoint().get(Vector.I1)) < 2) // 2 is a magic number... { result.add(new Section(previous, line)); } else { result.add(new Section(previous, null)); result.add(new Section(null, line)); } previous = line; } return result; }
Section(LineSegment topLine, LineSegment bottomLine) { float left, right, top, bottom; if (topLine != null) { this.topLine = topLine; top = Math.max(topLine.getStartPoint().get(Vector.I2), topLine.getEndPoint().get(Vector.I2)); right = Math.max(topLine.getStartPoint().get(Vector.I1), topLine.getEndPoint().get(Vector.I1)); left = Math.min(topLine.getStartPoint().get(Vector.I1), topLine.getEndPoint().get(Vector.I1)); } else { top = topMargin; left = leftMargin; right = rightMargin; } if (bottomLine != null) { this.bottomLine = bottomLine; bottom = Math.min(bottomLine.getStartPoint().get(Vector.I2), bottomLine.getEndPoint().get(Vector.I2)); right = Math.max(bottomLine.getStartPoint().get(Vector.I1), bottomLine.getEndPoint().get(Vector.I1)); left = Math.min(bottomLine.getStartPoint().get(Vector.I1), bottomLine.getEndPoint().get(Vector.I1)); } else { bottom = bottomMargin; } this.top = top; this.bottom = bottom; this.left = left; this.right = right; }
@Override public boolean accept(TextChunk textChunk) { // TODO: This code only checks the text chunk starting point. One should take the // whole chunk into consideration Vector startlocation = textChunk.getStartLocation(); float x = startlocation.get(Vector.I1); float y = startlocation.get(Vector.I2); return (left <= x) && (x <= right) && (bottom <= y) && (y <= top); }
/**
 * Handles a chunk of text depending on its fill color.
 * <p>
 * Text filled with a {@link GrayColor} of gray value 0 is forwarded
 * character by character to {@link #renderCharacter(TextRenderInfo)}. Any
 * other text first finishes the entry of a currently active field and is
 * then appended to the entry builder as a whole. In debug mode a
 * description of the chunk is recorded as well. The chunk counter is
 * incremented in any case; an {@link IOException} is only printed.
 * </p>
 */
@Override
public void renderText(TextRenderInfo renderInfo) {
    try {
        Vector startPoint = renderInfo.getBaseline().getStartPoint();
        BaseColor fillColor = renderInfo.getFillColor();
        boolean isData = fillColor instanceof GrayColor && ((GrayColor) fillColor).getGray() == 0;

        String debugLine = null;
        if (debug) {
            debugLine = String.format("%4d\t%3.3f %3.3f\t%s\n", chunk, startPoint.get(I1), startPoint.get(I2), renderInfo.getText());
        }

        if (isData) {
            if (debugLine != null) {
                data.append(debugLine);
            }
            for (TextRenderInfo info : renderInfo.getCharacterRenderInfos()) {
                renderCharacter(info);
            }
        } else {
            if (debugLine != null) {
                nonData.append(debugLine);
            }
            if (currentField > -1) {
                finishEntry();
            }
            entryBuilder.append(renderInfo.getText());
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        chunk++;
    }
}
public void renderCharacter(TextRenderInfo renderInfo) throws IOException { Vector startPoint = renderInfo.getBaseline().getStartPoint(); float x = startPoint.get(I1); if (currentField > -1) { if (isInCurrentField(x)) { entryBuilder.append(renderInfo.getText()); return; } if (isInNextField(x)) { currentField++; entryBuilder.append('\t').append(renderInfo.getText()); return; } finishEntry(); // nonData.append(String.format("%4d\t%3.3f %3.3f\t%s\n", chunk, startPoint.get(I1), startPoint.get(I2), renderInfo.getText())); } if (isInNextField(x)) { finishEntry(); currentField = 0; } entryBuilder.append(renderInfo.getText()); }
/**
 * Evaluates the path collected so far when it is painted.
 * <p>
 * Unless the operation is a no-op, a remembered rectangle becomes a new
 * {@link Box} built from its two transformed diagonals, and a remembered
 * single line is offered to the most recently added box as a diagonal
 * candidate. The remembered path data is cleared in any case.
 * </p>
 *
 * @return always {@code null}
 */
@Override
public Path renderPath(PathPaintingRenderInfo renderInfo) {
    boolean painted = renderInfo.getOperation() != PathPaintingRenderInfo.NO_OP;

    if (painted && rectangle != null) {
        Vector lowerLeft = new Vector(rectangle.getLeft(), rectangle.getBottom(), 1).cross(renderInfo.getCtm());
        Vector lowerRight = new Vector(rectangle.getRight(), rectangle.getBottom(), 1).cross(renderInfo.getCtm());
        Vector upperRight = new Vector(rectangle.getRight(), rectangle.getTop(), 1).cross(renderInfo.getCtm());
        Vector upperLeft = new Vector(rectangle.getLeft(), rectangle.getTop(), 1).cross(renderInfo.getCtm());

        LineSegment risingDiagonal = new LineSegment(lowerLeft, upperRight);
        LineSegment fallingDiagonal = new LineSegment(lowerRight, upperLeft);
        boxes.add(new Box(risingDiagonal, fallingDiagonal));
    }

    if (painted && moveToVector != null && lineToVector != null && !boxes.isEmpty()) {
        Vector from = moveToVector.cross(renderInfo.getCtm());
        Vector to = lineToVector.cross(renderInfo.getCtm());
        Box latest = boxes.get(boxes.size() - 1);
        latest.selectDiagonal(new LineSegment(from, to));
    }

    moveToVector = null;
    lineToVector = null;
    rectangle = null;
    return null;
}
/**
 * Orders horizontal text chunks by line number first and by the x
 * coordinate of their start location second. Comparison with any other
 * kind of chunk is delegated to the parent implementation.
 */
@Override
public int compareTo(TextChunk rhs) {
    if (!(rhs instanceof HorizontalTextChunk)) {
        return super.compareTo(rhs);
    }

    HorizontalTextChunk other = (HorizontalTextChunk) rhs;
    int byLine = Integer.compare(getLineNumber(), other.getLineNumber());
    return byLine != 0
            ? byLine
            : Float.compare(getStartLocation().get(Vector.I1), rhs.getStartLocation().get(Vector.I1));
}
/**
 * Creates a text item, keeping only the first coordinate of its start and
 * end locations.
 *
 * @param text                  the PDF string holding the text
 * @param startLocation         where the text starts; component 0 is stored as {@code startX}
 * @param endLocation           where the text ends; component 0 is stored as {@code endX}
 * @param visible               visibility flag handed to the parent constructor
 * @param numOfStrTextBelongsTo number of the string this text belongs to
 */
public Text(PdfString text, Vector startLocation, Vector endLocation, boolean visible,
        int numOfStrTextBelongsTo) {
    super(visible);
    this.numOfStrTextBelongsTo = numOfStrTextBelongsTo;
    this.text = text;
    this.endX = endLocation.get(0);
    this.startX = startLocation.get(0);
}
/**
 * Creates a text chunk and precomputes the values derived from its geometry:
 * the unit orientation vector from start to end, the orientation angle (in
 * thousandths of a radian, truncated), the perpendicular distance term of
 * the baseline relative to the origin, and the positions of start and end
 * along the orientation.
 * <p>
 * A chunk whose start and end coincide has no direction; it is treated as
 * running along the positive x axis. Without this fallback, normalizing the
 * zero-length vector would divide by zero and the derived values would be
 * computed from NaN components.
 * </p>
 *
 * @param string         the text of the chunk
 * @param startLocation  where the chunk starts
 * @param endLocation    where the chunk ends
 * @param charSpaceWidth the width of a space character in this chunk
 * @param pageNumber     the number of the page the chunk is on
 */
public TextChunk(String string, Vector startLocation, Vector endLocation, float charSpaceWidth, int pageNumber) {
    this.text = string;
    this.startLocation = startLocation;
    this.endLocation = endLocation;
    this.charSpaceWidth = charSpaceWidth;

    Vector direction = endLocation.subtract(startLocation);
    if (direction.length() == 0) {
        // degenerate chunk: fall back to a horizontal orientation
        direction = new Vector(1.0F, 0.0F, 0.0F);
    }
    this.orientationVector = direction.normalize();
    this.orientationMagnitude =
            (int) (Math.atan2((double) this.orientationVector.get(1), (double) this.orientationVector.get(0)) * 1000.0D);

    Vector origin = new Vector(0.0F, 0.0F, 1.0F);
    this.distPerpendicular = (int) startLocation.subtract(origin).cross(this.orientationVector).get(2);
    this.distParallelStart = this.orientationVector.dot(startLocation);
    this.distParallelEnd = this.orientationVector.dot(endLocation);
    this.pageNumber = pageNumber;
}
/**
 * Feeds a single education-related text chunk into a freshly reset parser
 * bound to a new CV and runs {@code parseAndSave}. The test makes no
 * assertions; it passes as long as no exception is thrown.
 */
@Test
public void testParse() throws Exception {
    CV cv = new CV();
    parser.reset();
    parser.setCV(cv);

    Vector start = new Vector(0, 0, 1);
    Vector end = new Vector(1, 0, 1);
    TextChunk chunk = new TextChunk(
            "National University of Singapore Bachelor of Computing (Computer Science), Honours",
            start, end, 0, 1);
    parser.appendTextChunk(chunk);

    parser.parseAndSave();
}
/**
 * Runs a {@link TextWalker} over the positioned text and returns the field
 * values it collected, converted to strings, with empty values removed.
 *
 * @param pdfText the positioned text pieces
 * @return the non-empty values per import field
 * @throws IOException if the text walker fails
 */
protected ListMultimap<ImportField, String> getFieldValues(Stream<Entry<Vector, String>> pdfText) throws IOException {
    TextWalker walker = new TextWalker(pdfText::iterator);
    ListMultimap<ImportField, String> fieldValues = transformValues(walker.fieldValues, StringBuilder::toString);
    fieldValues.values().removeIf(value -> value.isEmpty());
    return fieldValues;
}
/**
 * Streams the entries of all page maps in {@code pageText}, one page after
 * the other.
 *
 * @return a stream of the (position, text) entries of every page
 */
public Stream<Entry<Vector, String>> getText() {
    return pageText.stream()
            .flatMap(page -> page.entrySet().stream());
}
/**
 * Transforms the point (x, y) with the given matrix and returns the y
 * coordinate of the result.
 *
 * @param x the x coordinate of the point
 * @param y the y coordinate of the point
 * @param m the transformation matrix
 * @return the y coordinate of the transformed point
 */
float getTransformedY(float x, float y, Matrix m) {
    Vector point = new Vector(x, y, 1);
    Vector transformed = point.cross(m);
    return transformed.get(Vector.I2);
}
/**
 * Checks whether two vectors are closer to each other than the given
 * tolerance.
 *
 * @param a              the first vector
 * @param b              the second vector
 * @param permissiveness the exclusive upper bound for the length of {@code a - b}
 * @return {@code true} if the length of the difference is below the bound
 */
boolean approximatelyEquals(Vector a, Vector b, float permissiveness) {
    Vector difference = a.subtract(b);
    return permissiveness > difference.length();
}
/**
 * Returns the location at which the text starts.
 *
 * @return the start location of the text
 */
public Vector getStartLocation() {
    return this.startLocation;
}
/**
 * Returns the location at which the text ends.
 *
 * @return the end location of the text
 */
public Vector getEndLocation() {
    return this.endLocation;
}
/**
 * Creates a horizontal text chunk; all arguments are passed on unchanged
 * to the parent constructor.
 *
 * @param string         the text of the chunk
 * @param startLocation  where the chunk starts
 * @param endLocation    where the chunk ends
 * @param charSpaceWidth the width of a space character in this chunk
 */
public HorizontalTextChunk(
        String string,
        Vector startLocation,
        Vector endLocation,
        float charSpaceWidth) {
    super(string, startLocation, endLocation, charSpaceWidth);
}
/**
 * Returns the stored start location.
 *
 * @return the start location
 */
public Vector getStartLocation() {
    return this.startLocation;
}
/**
 * Returns the stored end location.
 *
 * @return the end location
 */
public Vector getEndLocation() {
    return this.endLocation;
}