private String filter(String value) { StringBuilder out = new StringBuilder(); StringReader strReader = new StringReader(value); try { HTMLStripCharFilter html = new HTMLStripCharFilter(new BufferedReader(strReader)); char[] cbuf = new char[1024 * 10]; while (true) { int count = html.read(cbuf); if (count == -1) break; // end of stream mark is -1 if (count > 0) out.append(cbuf, 0, count); } html.close(); } catch (IOException e) { throw new RuntimeException("Failed stripping HTML for value: " + value, e); } return out.toString(); }
private Object stripHTML(String value, String column) { StringBuilder out = new StringBuilder(); StringReader strReader = new StringReader(value); try { HTMLStripCharFilter html = new HTMLStripCharFilter(strReader.markSupported() ? strReader : new BufferedReader(strReader)); char[] cbuf = new char[1024 * 10]; while (true) { int count = html.read(cbuf); if (count == -1) break; // end of stream mark is -1 if (count > 0) out.append(cbuf, 0, count); } html.close(); } catch (IOException e) { throw new DataImportHandlerException(DataImportHandlerException.SEVERE, "Failed stripping HTML for column: " + column, e); } return out.toString(); }
/**
 * Feeds {@code docText} through an {@link HTMLStripCharFilter} (constructed with the singleton
 * set {@code "unescaped"} as its second argument) into a {@link WhitespaceTokenizer} and
 * collects the text of every token produced.
 *
 * @param docText the document text to analyze
 * @return the token strings in the order the tokenizer emitted them
 * @throws RuntimeException wrapping any {@link IOException} raised during tokenization
 */
private String[] analyzeReturnTokens(String docText) {
  final Reader stripped =
      new HTMLStripCharFilter(new StringReader(docText), Collections.singleton("unescaped"));
  final WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  final CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
  final List<String> tokens = new ArrayList<>();
  try {
    tokenizer.setReader(stripped);
    tokenizer.reset();
    for (boolean more = tokenizer.incrementToken(); more; more = tokenizer.incrementToken()) {
      tokens.add(term.toString());
    }
    tokenizer.end();
  } catch (IOException e) {
    throw new RuntimeException(e);
  } finally {
    // best-effort close: errors while closing are deliberately ignored
    IOUtils.closeQuietly(tokenizer);
  }
  return tokens.toArray(new String[0]);
}
/**
 * Reads {@code source} to exhaustion through an {@link HTMLStripCharFilter} and returns the
 * characters the filter produced.
 *
 * <p>Neither the filter nor {@code source} is closed by this method.
 *
 * @param source the reader to consume; may be {@code null}
 * @return the filtered text, or {@code null} when {@code source} is {@code null}
 * @throws IOException if reading from the filter fails
 */
public static String filterHTML(Reader source) throws IOException {
  if (source == null) {
    return null;
  }
  final HTMLStripCharFilter stripper = new HTMLStripCharFilter(source);
  final StringBuilder text = new StringBuilder();
  // read() yields one character at a time and -1 at end of stream
  for (int next = stripper.read(); next != -1; next = stripper.read()) {
    text.append((char) next);
  }
  return text.toString();
}
/**
 * Tokenizes {@code docText} (passed through an {@link HTMLStripCharFilter}) on whitespace and
 * reports the offsets of the {@code start} and {@code end} tokens.
 *
 * <p>Element 0 receives the start offset of the most recent token equal to {@code start};
 * element 1 receives the end offset of the first token equal to {@code end}, at which point
 * scanning stops. Either element stays {@code -1} if its token was not seen.
 *
 * @param docText the document text to analyze
 * @param start token whose start offset is recorded
 * @param end token whose end offset is recorded and which terminates the scan
 * @return a two-element array {@code {startOffset, endOffset}}
 * @throws RuntimeException wrapping any {@link IOException} raised during tokenization
 */
private int[] analyzeTagOne(String docText, String start, String end) {
  final int[] offsets = {-1, -1};
  final Reader stripped = new HTMLStripCharFilter(new StringReader(docText));
  final WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  final CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
  final OffsetAttribute offset = tokenizer.addAttribute(OffsetAttribute.class);
  try {
    tokenizer.setReader(stripped);
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      final String token = term.toString();
      if (token.equals(start)) {
        offsets[0] = offset.startOffset();
      }
      if (!token.equals(end)) {
        continue;
      }
      // end token found: record it and stop immediately (the finally block still closes)
      offsets[1] = offset.endOffset();
      return offsets;
    }
    tokenizer.end();
  } catch (IOException e) {
    throw new RuntimeException(e);
  } finally {
    // best-effort close: errors while closing are deliberately ignored
    IOUtils.closeQuietly(tokenizer);
  }
  return offsets;
}
/**
 * Wraps the given reader in a new {@link HTMLStripCharFilter} constructed with this
 * instance's {@code escapedTags}.
 *
 * @param tokenStream the reader to wrap
 * @return the wrapping filter
 */
@Override
public Reader create(Reader tokenStream) {
  final Reader stripping = new HTMLStripCharFilter(tokenStream, escapedTags);
  return stripping;
}