Java 类org.jsoup.nodes.Entities.EscapeMode 实例源码

项目:yadaframework    文件:YadaWebUtil.java   
/**
 * Sanitizes HTML content so that only a fixed set of tags survives:
 * b, em, i, strong, u, br, cite, p, img, li, ul, ol, sup, sub, s.
 * The {@code style} attribute is kept on {@code p}; {@code src}, {@code style}
 * and {@code class} are kept on {@code img}.
 *
 * @param content html content to sanitize
 * @param extraTags any other tags that should be kept, e.g. "a"; when "a" is
 *        listed, its {@code href} and {@code target} attributes are kept too
 * @return the sanitized body markup
 */
public String cleanContent(String content, String ... extraTags) {
    // simpleText() permits only b, em, i, strong and u; every other tag and
    // attribute is removed unless it is explicitly added below.
    Whitelist whitelist = Whitelist.simpleText()
            .addTags("br", "cite", "em", "i", "p", "strong", "img", "li", "ul", "ol", "sup", "sub", "s")
            .addTags(extraTags)
            .addAttributes("p", "style") // needed for left/right alignment
            .addAttributes("img", "src", "style", "class");

    // Links are only useful with their destination, so keep href/target when "a" was requested.
    boolean anchorsRequested = false;
    for (String tag : extraTags) {
        if ("a".equals(tag)) {
            anchorsRequested = true;
            break;
        }
    }
    if (anchorsRequested) {
        whitelist.addAttributes("a", "href", "target");
    }

    Document parsed = Jsoup.parseBodyFragment(content, "");
    Document sanitized = new Cleaner(whitelist).clean(parsed);
    // xhtml escape mode: per the original author's note, UTF-8 characters are not escaped.
    sanitized.outputSettings().escapeMode(EscapeMode.xhtml);
    return sanitized.body().html();
}
项目:aips2xml    文件:HtmlUtils.java   
/**
 * Parses the stored HTML and walks the sibling elements that follow the first
 * paragraph containing "Packungen", up to (but excluding) the first paragraph
 * containing "Zulassungsinhaberin", printing each element's text to standard
 * output.
 *
 * NOTE(review): the section text is only printed; {@code pack_section} is never
 * filled, so the method always returns the empty string.
 *
 * @return the empty string
 */
public String extractPackSection() {
    mDoc = Jsoup.parse(mHtmlStr);
    mDoc.outputSettings().escapeMode(EscapeMode.xhtml);

    String pack_section = "";
    // Find all information between X and X+1
    Element start_elem = mDoc.select("p:contains(Packungen)").first();
    Element stop_elem = mDoc.select("p:contains(Zulassungsinhaberin)").first();
    // Alternative: select by id instead, i.e. "p[id=section18]" and "p[id=section19]".

    // Both markers must exist; first() returns null when nothing matches, and
    // dereferencing start_elem before this check used to throw.
    if (start_elem == null || stop_elem == null) {
        return pack_section;
    }

    // Stop at the end marker, or when the siblings run out (the end marker is
    // not guaranteed to be a sibling of the start marker).
    Element pe = start_elem.nextElementSibling();
    while (pe != null && pe != stop_elem) {
        System.out.println(pe.text());
        pe = pe.nextElementSibling();
    }
    return pack_section;
}
项目:site    文件:RegistrationService.java   
/**
 * Converts some html text to plain text: all tags except <code>br</code> are
 * stripped, every <code>br</code> becomes a line break ("\r\n"), and entities
 * (named ones like '&amp;quot;' included) are unescaped.
 *
 * The <code>br</code> tags are replaced before the entities are unescaped.
 * Doing it the other way round would turn text that merely spells out an
 * escaped br tag (e.g. '&amp;lt;br&amp;gt;') into a line break as well.
 *
 * @param htmlText the html markup to convert
 * @return the plain text, with surrounding whitespace of the cleaned markup trimmed
 */
String htmlTextToPlainText(final String htmlText) {
    final Whitelist whitelist = Whitelist.none();
    whitelist.addTags("br");
    final Cleaner cleaner = new Cleaner(whitelist);
    final Document cleanedDocument = cleaner.clean(Jsoup.parse(htmlText));
    cleanedDocument
            .outputSettings()
            .prettyPrint(false)
            .escapeMode(EscapeMode.xhtml)
            .charset(StandardCharsets.UTF_8);

    // At this point every '<' in the markup belongs to a real br tag, because
    // '<' inside text is still escaped.
    final String markup = cleanedDocument.body().html().trim();
    final String withLineBreaks = markup.replaceAll("<br(?: ?/)?>", "\r\n");
    return Parser.unescapeEntities(withLineBreaks, true);
}
项目:rabbitframework    文件:XSSFilter.java   
/**
 * Strips any potential XSS threats out of the value by canonicalizing it,
 * applying a fixed sequence of textual replacements and finally running it
 * through jsoup with an empty whitelist.
 *
 * The order of the replacements matters: each one operates on the result of
 * the previous one.
 *
 * @param value the raw input to sanitize
 * @return {@code null} when {@code value} is blank according to
 *         {@code StringUtils.isBlank}; otherwise the sanitized string
 */
public String stripXSS(String value) {
    if (StringUtils.isBlank(value)) {
        return null;
    }
    // Disabled alternative: HTML-encode the whole value through ESAPI.
    // try {
    // value = ESAPI.encoder().encodeForHTML(value);
    // } catch (Exception e) {
    // logger.warn(e.getMessage(),e); //
    // }

    // Use the ESAPI library to avoid encoded attacks.
    value = ESAPI.encoder().canonicalize(value);
    //
    // // Avoid null characters
    value = value.replaceAll("\0", "");
    // Replace angle brackets, parentheses and single quotes.
    // NOTE(review): the replacement strings contain a space after '&'
    // ("& lt;" rather than an entity) - confirm this is intended.
    value = value.replaceAll("<", "& lt;").replaceAll(">", "& gt;");
    value = value.replaceAll("\\(", "& #40;").replaceAll("\\)", "& #41;");
    value = value.replaceAll("'", "& #39;");
    // NOTE(review): every '(' has already been replaced above, so this
    // pattern (which requires a literal '(') can no longer match.
    value = value.replaceAll("eval\\((.*)\\)", "");
    // Single quotes were replaced above, so only the double-quote
    // alternatives of this pattern can still match.
    value = value.replaceAll("[\\\"\\\'][\\s]*javascript:(.*)[\\\"\\\']", "\"\"");
    // Removes every occurrence of the substring "script", wherever it appears.
    value = value.replaceAll("script", "");
    //
    // // Clean out HTML
    // Whitelist.none() permits no tags; output is not pretty-printed and uses
    // the xhtml escape mode.
    Document.OutputSettings outputSettings = new Document.OutputSettings();
    outputSettings.escapeMode(EscapeMode.xhtml);
    outputSettings.prettyPrint(false);
    value = Jsoup.clean(value, "", Whitelist.none(), outputSettings);
    return value;
}
项目:site    文件:RegistrationService.java   
/**
 * Converts some html text to plain text: all tags except <code>br</code> are
 * stripped, every <code>br</code> becomes a line break ("\r\n"), and entities
 * (named ones like '&amp;quot;' included) are unescaped.
 *
 * The <code>br</code> tags are replaced before the entities are unescaped.
 * Doing it the other way round would turn text that merely spells out an
 * escaped br tag (e.g. '&amp;lt;br&amp;gt;') into a line break as well.
 *
 * @param htmlText the html markup to convert
 * @return the plain text, with surrounding whitespace of the cleaned markup trimmed
 */
String htmlTextToPlainText(final String htmlText) {
    final Whitelist whitelist = Whitelist.none();
    whitelist.addTags("br");
    final Cleaner cleaner = new Cleaner(whitelist);
    final Document cleanedDocument = cleaner.clean(Jsoup.parse(htmlText));
    cleanedDocument
            .outputSettings()
            .prettyPrint(false)
            .escapeMode(EscapeMode.xhtml)
            .charset(StandardCharsets.UTF_8);

    // At this point every '<' in the markup belongs to a real br tag, because
    // '<' inside text is still escaped.
    final String markup = cleanedDocument.body().html().trim();
    final String withLineBreaks = markup.replaceAll("<br(?: ?/)?>", "\r\n");
    return Parser.unescapeEntities(withLineBreaks, true);
}
项目:struts2-pdfstream    文件:PdfStreamResult.java   
/**
 * Parses the given HTML into a jsoup document configured for xhtml escaping
 * and XML output syntax, with all {@code script} elements removed.
 *
 * @param content the HTML markup to parse
 * @return the parsed, script-free document
 */
Document parseContent(final String content) {
    final Document parsed = Jsoup.parse(content);
    parsed.outputSettings()
            .escapeMode(EscapeMode.xhtml)
            .syntax(Document.OutputSettings.Syntax.xml);

    // Script tags are not supported in pdf and can make the document
    // not well formed (e.g. <\/script> - an escaped script tag), so drop them.
    parsed.select("script").remove();

    return parsed;
}
项目:taulukko-commons-parsers    文件:HTMLParser.java   
/**
 * Parses the given markup with jsoup and wraps the result in a {@code Document}.
 * When {@code NOT_USE_HTML_ENCODE} is set, the jsoup document is switched to
 * the xhtml escape mode before being wrapped.
 *
 * @param s the markup to parse
 * @return the wrapped document
 */
public static IDocument parse(String s) {
    final org.jsoup.nodes.Document parsed = Jsoup.parse(s);

    if (NOT_USE_HTML_ENCODE) {
        org.jsoup.nodes.Document.OutputSettings settings = parsed.outputSettings();
        settings.escapeMode(EscapeMode.xhtml);
    }

    return new Document(parsed);
}
项目:uraptor    文件:HtmlCleaner.java   
/**
 * Cleans an HTML string against the whitelist {@code wl} and returns the
 * cleaned markup.
 *
 * @param html Input HTML string.
 * @return Cleaned version of the HTML as string, rendered with the xhtml escape mode.
 */
public String clean(String html)
{
    // Parse, then keep only what the whitelist allows.
    Document cleaned = new Cleaner(wl).clean(Jsoup.parse(html));

    // The escape mode must be set on the cleaned document, since that is the one rendered.
    cleaned.outputSettings().escapeMode(EscapeMode.xhtml);

    return cleaned.html();
}
项目:uraptor    文件:HtmlCleaner.java   
/**
 * Cleans an HTML string against the whitelist {@code wl} and returns the
 * cleaned document.
 *
 * @param html Input HTML string.
 * @param baseUri Base URI of the document.
 * @return Cleaned version of the HTML as document, set to the xhtml escape mode.
 */
public Document clean(String html, String baseUri)
{
    // Parse relative to the base URI, then keep only what the whitelist allows.
    Document cleaned = new Cleaner(wl).clean(Jsoup.parse(html, baseUri));

    // Configure escaping on the document that is handed back to the caller.
    cleaned.outputSettings().escapeMode(EscapeMode.xhtml);

    return cleaned;
}
项目:adan    文件:ScheduleDocument.java   
/**
 * Creates an empty document shell with the xhtml escape mode, an
 * {@code html} doctype prepended, and a fixed set of feature classes on the
 * {@code html} element.
 */
public ScheduleDocument(){
    doc = Document.createShell("");
    doc.outputSettings().escapeMode(EscapeMode.xhtml);

    // The doctype has to be the first child of the document.
    doc.prependChild(new DocumentType("html", "", "", ""));

    doc.select("html").attr(
            "class",
            "js no-touch geolocation backgroundsize csstransforms csstransforms3d audio localstorage inlinesvg pointerevents webaudio mediaqueries getusermedia");
}
项目:struts2-pdfstream    文件:PdfStreamResult.java   
/**
 * Parses the given HTML into a jsoup document configured for xhtml escaping
 * and XML output syntax, with all {@code script} elements removed.
 *
 * @param content the HTML markup to parse
 * @return the parsed, script-free document
 */
Document parseContent(final String content) {
    final Document parsed = Jsoup.parse(content);
    parsed.outputSettings()
            .escapeMode(EscapeMode.xhtml)
            .syntax(Document.OutputSettings.Syntax.xml);

    // Script tags are not supported in pdf and can make the document
    // not well formed (e.g. <\/script> - an escaped script tag), so drop them.
    parsed.select("script").remove();

    return parsed;
}
项目:aips2xml    文件:Aips2Xml.java   
/**
 * Wraps the given XML fragment in a {@code kompendium} element, prepends a
 * {@code date} element holding the current date and a {@code lang} element
 * derived from {@code DB_LANGUAGE}, then post-processes the tree that jsoup
 * built and returns the {@code kompendium} element as markup.
 *
 * @param xml_str the XML fragment to wrap
 * @return the outer markup of the {@code kompendium} element, pretty-printed
 *         with an indent of 4
 */
static String addHeaderToXml(String xml_str) {
    final Document wrapped = Jsoup.parse("<kompendium>\n" + xml_str + "</kompendium>");
    wrapped.outputSettings()
            .escapeMode(EscapeMode.xhtml)
            .prettyPrint(true)
            .indentAmount(4);

    // Date header: created first inside <kompendium>, then filled with the current date.
    final String timestamp = new Date().toString();
    wrapped.select("kompendium").first().prependElement("date");
    wrapped.select("date").first().text(timestamp);

    // Language header, placed right after the date; left empty for unknown languages.
    wrapped.select("date").after("<lang></lang>");
    if (DB_LANGUAGE.equals("de")) {
        wrapped.select("lang").first().text("DE");
    } else if (DB_LANGUAGE.equals("fr")) {
        wrapped.select("lang").first().text("FR");
    }

    // Undo structure that jsoup's parser adds or keeps on its own:
    // tbody wrappers are removed and img elements without a src are unwrapped.
    wrapped.select("tbody").unwrap();
    for (Element image : wrapped.select("img")) {
        if (!image.hasAttr("src")) {
            image.unwrap();
        }
    }
    // Remaining img elements are renamed to <image>.
    wrapped.select("img").tagName("image");

    return wrapped.select("kompendium").first().outerHtml();
}
项目:aips2xml    文件:Aips2Xml.java   
/**
 * Builds the final html content for one medical information entry and stores
 * it back on the entry. Along the way the entry's style and authorization
 * holder are corrected.
 *
 * @param m the medical information entry; its content, style (for "de") and
 *        authorization holder are modified
 * @return a two-element array: the registration number string (all numbers,
 *         as produced by the extractor) followed by the updated content
 */
static String[] extractHtmlSection(MedicalInformations.MedicalInformation m) {
    final String content = m.getContent();

    // Clean html code
    HtmlUtils html_utils = new HtmlUtils(content);
    html_utils.clean();

    // Extract registration number (swissmedic no5)
    String regnr_str = "";
    if (DB_LANGUAGE.equals("de"))
        regnr_str = html_utils.extractRegNrDE(m.getTitle());
    else if (DB_LANGUAGE.equals("fr"))
        regnr_str = html_utils.extractRegNrFR(m.getTitle());

    // Sanitize html
    String html_sanitized;
    // First check for bad boys (version=1! but actually version>1!).
    // Only the first 20 characters are inspected; shorter content is inspected
    // as a whole instead of failing with an index-out-of-bounds error.
    if (!m.getVersion().equals("1")
            || content.substring(0, Math.min(20, content.length())).contains("xml")) {
        StringBuilder sections = new StringBuilder();
        for (int i=1; i<22; ++i) {
            sections.append(html_utils.sanitizeSection(i, m.getTitle(), DB_LANGUAGE));
        }
        html_sanitized = "<div id=\"monographie\">" + sections + "</div>" ;
    } else {
        html_sanitized = content;
    }

    // Update "Packungen" section and extract therapeutisches index
    List<String> mTyIndex_list = new ArrayList<String>();
    String mContent_str = updateSectionPackungen(m.getTitle(), package_info, regnr_str, html_sanitized, mTyIndex_list);

    // Add meta-tag and link
    mContent_str = mContent_str.replaceAll("<head>", "<head>" +
            "<link href=\"amiko_stylesheet.css\" rel=\"stylesheet\" type=\"text/css\"></>" +
            "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\">");

    m.setContent(mContent_str);

    // Fix problem with wrong div class in original Swissmedic file.
    // A single replacement is enough: it also rewrites "untertitel1", because
    // that name starts with "untertitel".
    if (DB_LANGUAGE.equals("de")) {
        m.setStyle(m.getStyle().replaceAll("untertitel", "untertitle"));
    }

    // Correct formatting error introduced by Swissmedic
    m.setAuthHolder(m.getAuthHolder().replaceAll("&#038;","&"));

    // All registration numbers are returned, not only the first one.
    String[] swno5_content_map = {regnr_str, mContent_str};

    return swno5_content_map;
}