/**
 * Extracts the text of every page of a PDF and returns it as one string.
 *
 * <p>Pages are processed in order (1-based) with a fresh
 * {@code SimpleTextExtractionStrategy} per page; the per-page results are
 * concatenated without any separator.
 *
 * @param pdfFile location of the PDF, passed straight to {@code PdfReader}
 * @return the concatenated text of all pages
 * @throws IllegalArgumentException if reading the file fails with an {@link IOException}
 */
private String getPdfContent(String pdfFile) {
    try {
        PdfReader reader = new PdfReader(pdfFile);
        try {
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            StringBuilder text = new StringBuilder();
            int pageCount = reader.getNumberOfPages();
            for (int page = 1; page <= pageCount; page++) {
                text.append(parser.processContent(page, new SimpleTextExtractionStrategy()).getResultantText());
            }
            return text.toString();
        } finally {
            // Release the reader even when a page fails to parse.
            reader.close();
        }
    } catch (IOException e) {
        throw new IllegalArgumentException("Not able to read file " + pdfFile, e);
    }
}
/**
 * Extracts the text of every page of a PDF and splits it into lines.
 *
 * <p>The text of each page is followed by a {@code "\n"} and the combined
 * text is split on {@code "\n"}.
 *
 * @param PdfFile location of the PDF, passed straight to {@code PdfReader}
 * @return the extracted lines, or {@code null} if anything goes wrong while
 *         opening or parsing the file (best-effort contract kept for callers)
 * @throws IOException declared for interface compatibility; failures are
 *         reported through the {@code null} return value instead
 */
public static String[] extractsPdfLines(String PdfFile) throws IOException {
    try {
        PdfReader reader = new PdfReader(PdfFile);
        try {
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            StringBuilder text = new StringBuilder();
            int pageCount = reader.getNumberOfPages();
            for (int page = 1; page <= pageCount; page++) {
                String pageText = parser.processContent(page, new SimpleTextExtractionStrategy()).getResultantText();
                text.append(pageText).append('\n');
            }
            return text.toString().split("\n");
        } finally {
            // Release the reader even when extraction fails part-way through.
            reader.close();
        }
    } catch (Exception e) {
        // Callers treat null as "could not extract"; keep that contract.
        return null;
    }
}
/**
 * Extracts the text of a single page, wrapping text blocks in
 * {@code <BLOCK>} / {@code </BLOCK>} markers.
 *
 * <p>Markers are only emitted once some text has been rendered: block
 * begin/end events seen before the first rendered text add nothing, and the
 * final result is prefixed with {@code <BLOCK>} only if any text was rendered.
 *
 * @param reader the open PDF to read from
 * @param pageNo the page number handed to {@code PdfTextExtractor.getTextFromPage}
 * @return the page text including the block markers
 * @throws IOException if the extractor fails to read the page
 */
String extractSimple(PdfReader reader, int pageNo) throws IOException {
    SimpleTextExtractionStrategy blockTaggingStrategy = new SimpleTextExtractionStrategy() {
        /** Becomes true as soon as the first text is rendered. */
        private boolean textSeen = false;

        @Override
        public void renderText(TextRenderInfo renderInfo) {
            textSeen = true;
            super.renderText(renderInfo);
        }

        @Override
        public void beginTextBlock() {
            if (textSeen) {
                appendTextChunk("<BLOCK>");
            }
            super.beginTextBlock();
        }

        @Override
        public void endTextBlock() {
            if (textSeen) {
                appendTextChunk("</BLOCK>\n");
            }
            super.endTextBlock();
        }

        @Override
        public String getResultantText() {
            // The opening marker of the first block was suppressed in
            // beginTextBlock(), so it is added in front of the result here.
            return textSeen ? "<BLOCK>" + super.getResultantText() : super.getResultantText();
        }
    };
    return PdfTextExtractor.getTextFromPage(reader, pageNo, blockTaggingStrategy);
}
@Override public BookReadingResult open(@NonNull File file, @NonNull PercentSender percentSender, @NonNull Runnable readingEndSender) { try { PdfReader pdfReader = new PdfReader(file.getAbsolutePath()); PdfReaderContentParser parser = new PdfReaderContentParser(pdfReader); int numberOfPages = pdfReader.getNumberOfPages(); int oldPercent = 0, newPercent; StringBuffer stringBuffer = new StringBuffer(); for (int i = 1; i <= pdfReader.getNumberOfPages(); i++) { TextExtractionStrategy strategy = parser.processContent(i, new SimpleTextExtractionStrategy()); String pageText = strategy.getResultantText(); stringBuffer.append(pageText); if (pageText.endsWith("\\.") || pageText.endsWith("!") || pageText.endsWith("?") || pageText.endsWith(":")) { stringBuffer.append(" "); } else { stringBuffer.append(". "); } newPercent = 100 * i / numberOfPages; if (newPercent != oldPercent) { percentSender.refreshPercents(oldPercent, newPercent); oldPercent = newPercent; } } pdfReader.close(); String resultText = new String(stringBuffer); resultText = resultText.trim(); // delete first and last space (if exist) resultText = resultText.replaceAll("\\s+", " "); // delete all duplicate white spaces resultText = resultText.replaceAll("(\\.)+", "\\."); // delete all duplicate dots if (resultText.length() < 1) { return null; } readingEndSender.run(); return new BookReadingResult(resultText, InternalStorageFileHelper.fileNameWithoutExtension(file), ""); } catch (IOException e) { e.printStackTrace(); return null; } }
/**
 * Searches one PDF file for {@code searchedString} (case-insensitively) and
 * writes the hits to {@code outputfile} as tagged lines.
 *
 * <p>The file name is always written first. If
 * {@code filetools.pdf.PdfAnalysis.testPdfOk(file)} returns false nothing else
 * is written. Otherwise every page is extracted and each of its lines is
 * checked; for every matching line the page number and the full line are
 * written, and both the per-file hit count and the shared {@code stringfound}
 * counter are incremented. Matching stops once {@code stringfound} reaches
 * {@code MAXIMAL_HITS}. Finally the per-file hit count is written.
 *
 * <p>Only the lines of the page currently being processed are searched, so a
 * hit is reported once, under the page it was found on.
 *
 * @param file the PDF file to search
 * @throws IOException declared for interface compatibility; exceptions raised
 *         while reading the PDF are written to {@code outputfile} instead
 */
public static void searchforStringinPdfFiles(File file) throws IOException {
    outputfile.println("<Dateiname>" + (file.getName()) + "</Dateiname>");
    if (!filetools.pdf.PdfAnalysis.testPdfOk(file)) {
        return;
    }
    try {
        PdfReader reader = new PdfReader(file.toString());
        try {
            int pagesPdf = reader.getNumberOfPages();
            PdfReaderContentParser parser = new PdfReaderContentParser(reader);
            // Loop-invariant: lowercase the search term once, not once per line.
            String searchStringlowerCase = searchedString.toLowerCase();
            int trefferinDatei = 0;
            for (int i = 1; i <= pagesPdf; i++) {
                String pageText = parser.processContent(i, new SimpleTextExtractionStrategy()).getResultantText();
                // Split only this page's text so earlier pages are not searched again.
                String[] pageLines = pageText.split("\n");
                for (int j = 0; j < pageLines.length && stringfound < MAXIMAL_HITS; j++) {
                    if (pageLines[j].toLowerCase().contains(searchStringlowerCase)) {
                        trefferinDatei++;
                        stringfound++;
                        outputfile.println("<Seitenzahl>" + i + "</Seitenzahl>");
                        outputfile.println("<GanzeZeile>" + (pageLines[j]) + "</GanzeZeile>");
                    }
                }
            }
            outputfile.println("<TextinDatei>" + trefferinDatei + "</TextinDatei>");
            outputfile.println("<Suchergebnis>" + trefferinDatei + " x " + "</Suchergebnis>");
        } finally {
            // Release the reader even when a page fails to parse.
            reader.close();
        }
    } catch (Exception e) {
        outputfile.println("<Fehlermeldung>" + e + "</Fehlermeldung>");
    }
}