/** * Cleans the html content leaving only the following tags: b, em, i, strong, u, br, cite, em, i, p, strong, img, li, ul, ol, sup, sub, s * @param content html content * @param extraTags any other tags that you may want to keep, e. g. "a" * @return */ public String cleanContent(String content, String ... extraTags) { Whitelist allowedTags = Whitelist.simpleText(); // This whitelist allows only simple text formatting: b, em, i, strong, u. All other HTML (tags and attributes) will be removed. allowedTags.addTags("br", "cite", "em", "i", "p", "strong", "img", "li", "ul", "ol", "sup", "sub", "s"); allowedTags.addTags(extraTags); allowedTags.addAttributes("p", "style"); // Serve per l'allineamento a destra e sinistra allowedTags.addAttributes("img", "src", "style", "class"); if (Arrays.asList(extraTags).contains("a")) { allowedTags.addAttributes("a", "href", "target"); } Document dirty = Jsoup.parseBodyFragment(content, ""); Cleaner cleaner = new Cleaner(allowedTags); Document clean = cleaner.clean(dirty); clean.outputSettings().escapeMode(EscapeMode.xhtml); // Non fa l'escape dei caratteri utf-8 String safe = clean.body().html(); return safe; }
public String extractPackSection() { mDoc = Jsoup.parse(mHtmlStr); mDoc.outputSettings().escapeMode(EscapeMode.xhtml); String pack_section = ""; // Find all information between X and X+1 Element start_elem = mDoc.select("p:contains(Packungen)").first(); Element stop_elem = mDoc.select("p:contains(Zulassungsinhaberin)").first(); // Alternative: /* Element start_elem = mDoc.select("p[id=section18]").first(); Element stop_elem = mDoc.select("p[id=section19]").first(); */ Element pe = start_elem.nextElementSibling(); if (pe!=null && start_elem!=null && stop_elem!=null) { while (pe!=stop_elem) { System.out.println(pe.text()); pe = pe.nextElementSibling(); } } return pack_section; }
/**
 * Cleans some html text by stripping all tags but <code>br</code> and then
 * unescapes named entities like {@code &quot;}. brs will be replaced by
 * newlines ({@code \r\n}).
 *
 * @param htmlText the HTML text to convert
 * @return the plain-text rendering of {@code htmlText}
 */
String htmlTextToPlainText(final String htmlText) {
    final Whitelist whitelist = Whitelist.none();
    whitelist.addTags("br");

    final Document cleanedDocument = new Cleaner(whitelist).clean(Jsoup.parse(htmlText));
    cleanedDocument
            .outputSettings()
            .prettyPrint(false)
            .escapeMode(EscapeMode.xhtml)
            .charset(StandardCharsets.UTF_8);

    // Replace the real <br> tags while the text content is still entity-escaped.
    // Unescaping first would turn literal text such as "&lt;br&gt;" into "<br>"
    // and wrongly convert it into a line break as well.
    final String withLineBreaks =
            cleanedDocument.body().html().trim().replaceAll("<br(?: ?/)?>", "\r\n");
    return Parser.unescapeEntities(withLineBreaks, true);
}
/** * Strips any potential XSS threats out of the value * * @param value * @return */ public String stripXSS(String value) { if (StringUtils.isBlank(value)) { return null; } // try { // value = ESAPI.encoder().encodeForHTML(value); // } catch (Exception e) { // logger.warn(e.getMessage(),e); // // } // Use the ESAPI library to avoid encoded attacks. value = ESAPI.encoder().canonicalize(value); // // // Avoid null characters value = value.replaceAll("\0", ""); value = value.replaceAll("<", "& lt;").replaceAll(">", "& gt;"); value = value.replaceAll("\\(", "& #40;").replaceAll("\\)", "& #41;"); value = value.replaceAll("'", "& #39;"); value = value.replaceAll("eval\\((.*)\\)", ""); value = value.replaceAll("[\\\"\\\'][\\s]*javascript:(.*)[\\\"\\\']", "\"\""); value = value.replaceAll("script", ""); // // // Clean out HTML Document.OutputSettings outputSettings = new Document.OutputSettings(); outputSettings.escapeMode(EscapeMode.xhtml); outputSettings.prettyPrint(false); value = Jsoup.clean(value, "", Whitelist.none(), outputSettings); return value; }
Document parseContent(final String content) { Document document = Jsoup.parse(content); document.outputSettings().escapeMode(EscapeMode.xhtml); document.outputSettings().syntax(Document.OutputSettings.Syntax.xml); // remove script tags, they are not supported in pdf and can lead to // not well formed document (e.g. <\/script> - escaped script tag) document.select("script").remove(); return document; }
/**
 * Parses the given string with jsoup and wraps the result in a {@code Document}.
 *
 * <p>When {@code NOT_USE_HTML_ENCODE} is set, the parsed document's escape mode is
 * switched to xhtml before it is wrapped.
 *
 * @param s the markup to parse
 * @return the wrapper around the parsed jsoup document
 */
public static IDocument parse(String s) {
    final org.jsoup.nodes.Document parsed = Jsoup.parse(s);
    if (NOT_USE_HTML_ENCODE) {
        parsed.outputSettings().escapeMode(EscapeMode.xhtml);
    }
    return new Document(parsed);
}
/** * Clean HTML string and return the cleaner version. * * @param html Input HTML string. * @return Cleaned version of the HTML as string. */ public String clean(String html) { // Parser str into a Document Document doc = Jsoup.parse(html); // Clean the document doc = new Cleaner(wl).clean(doc); // Adjust escape mode doc.outputSettings().escapeMode(EscapeMode.xhtml); // Get back the string of the Document return doc.html(); }
/** * Clean the HTML string and return a document. * * @param html Input HTML string. * @param baseUri Base URI of the document. * @return Cleaned version of the HTML as document. */ public Document clean(String html, String baseUri) { // Parser str into a Document Document doc = Jsoup.parse(html, baseUri); // Clean the document doc = new Cleaner(wl).clean(doc); // Adjust escape mode doc.outputSettings().escapeMode(EscapeMode.xhtml); // Get back the string of the Document return doc; }
/**
 * Creates an empty shell document with xhtml escaping, an {@code html} doctype and a
 * fixed {@code class} attribute on the {@code html} element.
 */
public ScheduleDocument() {
    doc = Document.createShell("");
    doc.outputSettings().escapeMode(EscapeMode.xhtml);

    // Put the doctype in front of everything else in the document.
    doc.prependChild(new DocumentType("html", "", "", ""));

    doc.select("html").attr(
            "class",
            "js no-touch geolocation backgroundsize csstransforms csstransforms3d audio localstorage inlinesvg pointerevents webaudio mediaqueries getusermedia");
}
static String addHeaderToXml(String xml_str) { Document mDoc = Jsoup.parse("<kompendium>\n" + xml_str + "</kompendium>"); mDoc.outputSettings().escapeMode(EscapeMode.xhtml); mDoc.outputSettings().prettyPrint(true); mDoc.outputSettings().indentAmount(4); // Add date Date df = new Date(); String date_str = df.toString(); mDoc.select("kompendium").first().prependElement("date"); mDoc.select("date").first().text(date_str); // Add language mDoc.select("date").after("<lang></lang>"); if (DB_LANGUAGE.equals("de")) mDoc.select("lang").first().text("DE"); else if (DB_LANGUAGE.equals("fr")) mDoc.select("lang").first().text("FR"); // Fool jsoup.parse which seems to have its own "life" mDoc.select("tbody").unwrap(); Elements img_elems = mDoc.select("img"); for (Element img_e : img_elems) { if (!img_e.hasAttr("src")) img_e.unwrap(); } mDoc.select("img").tagName("image"); String final_xml_str = mDoc.select("kompendium").first().outerHtml(); return final_xml_str; }
static String[] extractHtmlSection(MedicalInformations.MedicalInformation m) { // Extract section titles and section ids MedicalInformations.MedicalInformation.Sections med_sections = m.getSections(); List<MedicalInformations.MedicalInformation.Sections.Section> med_section_list = med_sections.getSection(); Document doc = Jsoup.parse(m.getContent()); doc.outputSettings().escapeMode(EscapeMode.xhtml); // Clean html code HtmlUtils html_utils = new HtmlUtils(m.getContent()); html_utils.clean(); // Extract registration number (swissmedic no5) String regnr_str = ""; if (DB_LANGUAGE.equals("de")) regnr_str = html_utils.extractRegNrDE(m.getTitle()); else if (DB_LANGUAGE.equals("fr")) regnr_str = html_utils.extractRegNrFR(m.getTitle()); // Sanitize html String html_sanitized = ""; // First check for bad boys (version=1! but actually version>1!) if (!m.getVersion().equals("1") || m.getContent().substring(0, 20).contains("xml")) { for (int i=1; i<22; ++i) { html_sanitized += html_utils.sanitizeSection(i, m.getTitle(), DB_LANGUAGE); } html_sanitized = "<div id=\"monographie\">" + html_sanitized + "</div>" ; } else { html_sanitized = m.getContent(); } // Update "Packungen" section and extract therapeutisches index List<String> mTyIndex_list = new ArrayList<String>(); String mContent_str = updateSectionPackungen(m.getTitle(), package_info, regnr_str, html_sanitized, mTyIndex_list); // Add meta-tag and link mContent_str = mContent_str.replaceAll("<head>", "<head>" + "<link href=\"amiko_stylesheet.css\" rel=\"stylesheet\" type=\"text/css\"></>" + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">"); m.setContent(mContent_str); // Fix problem with wrong div class in original Swissmedic file if (DB_LANGUAGE.equals("de")) { m.setStyle(m.getStyle().replaceAll("untertitel", "untertitle")); m.setStyle(m.getStyle().replaceAll("untertitel1", "untertitle1")); } // Correct formatting error introduced by Swissmedic 
m.setAuthHolder(m.getAuthHolder().replaceAll("&","&")); // Extracts only *first* registration number /* List<String> swissmedicno5_list = Arrays.asList(regnr_str.split("\\s*,\\s*")); String[] swno5_content_map = {swissmedicno5_list.get(0), mContent_str}; */ // Extract *all* registration numbers String[] swno5_content_map = {regnr_str, mContent_str}; return swno5_content_map; //mContent_str; }