/**
 * Reports whether every child node of the element is a comment, an XML declaration,
 * or a document type. An element with no child nodes therefore matches.
 *
 * @param root    root element supplied by the caller; not consulted by this implementation
 * @param element element whose direct child nodes are inspected
 * @return {@code true} if no child node is anything other than a {@code Comment},
 *         {@code XmlDeclaration} or {@code DocumentType}; {@code false} otherwise
 */
@Override
public boolean matches(Element root, Element element) {
    for (Node child : element.childNodes()) {
        boolean ignorable = child instanceof Comment
            || child instanceof XmlDeclaration
            || child instanceof DocumentType;
        if (!ignorable) {
            // first child of any other node type settles the answer
            return false;
        }
    }
    return true;
}
/**
 * Parses a document that starts with an XML declaration carrying three attributes and
 * checks that each attribute is readable from the resulting {@code XmlDeclaration}, and
 * that the declaration is re-serialised with double-quoted attribute values.
 */
@Test
public void testParseDeclarationAttributes() {
    final String source = "<?xml version='1' encoding='UTF-8' something='else'?><val>One</val>";
    final Document parsed = Jsoup.parse(source, "", Parser.xmlParser());

    // the declaration is expected to be the very first child node of the document
    final XmlDeclaration declaration = (XmlDeclaration) parsed.childNode(0);

    // individual attribute access
    assertEquals("1", declaration.attr("version"));
    assertEquals("UTF-8", declaration.attr("encoding"));
    assertEquals("else", declaration.attr("something"));

    // serialised forms: attribute list only, then the full declaration markup
    assertEquals("version=\"1\" encoding=\"UTF-8\" something=\"else\"", declaration.getWholeDeclaration());
    assertEquals("<?xml version=\"1\" encoding=\"UTF-8\" something=\"else\"?>", declaration.outerHtml());
}
/**
 * Checks that the element has no child node other than comments, XML declarations and
 * document types. Returns {@code true} for an element without any child nodes.
 *
 * @param root    root element supplied by the caller; unused here
 * @param element element whose direct child nodes are examined, in order
 * @return {@code false} as soon as a child of any other node type is found, otherwise {@code true}
 */
@Override
public boolean matches(Element root, Element element) {
    final List<Node> children = element.childNodes();
    // index-based walk over the child list; the size is re-read on every pass
    for (int idx = 0; idx < children.size(); idx++) {
        final Node child = children.get(idx);
        if (child instanceof Comment) {
            continue;
        }
        if (child instanceof XmlDeclaration) {
            continue;
        }
        if (child instanceof DocumentType) {
            continue;
        }
        return false;
    }
    return true;
}
static Document parseByteData(ByteBuffer byteData, String charsetName, String baseUri, Parser parser) { String docData; Document doc = null; // look for BOM - overrides any other header or input charsetName = detectCharsetFromBom(byteData, charsetName); if (charsetName == null) { // determine from meta. safe first parse as UTF-8 // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312"> docData = Charset.forName(defaultCharset).decode(byteData).toString(); doc = parser.parseInput(docData, baseUri); Element meta = doc.select("meta[http-equiv=content-type], meta[charset]").first(); String foundCharset = null; // if not found, will keep utf-8 as best attempt if (meta != null) { if (meta.hasAttr("http-equiv")) { foundCharset = getCharsetFromContentType(meta.attr("content")); } if (foundCharset == null && meta.hasAttr("charset")) { foundCharset = meta.attr("charset"); } } // look for <?xml encoding='ISO-8859-1'?> if (foundCharset == null && doc.childNode(0) instanceof XmlDeclaration) { XmlDeclaration prolog = (XmlDeclaration) doc.childNode(0); if (prolog.name().equals("xml")) { foundCharset = prolog.attr("encoding"); } } foundCharset = validateCharset(foundCharset); if (foundCharset != null && !foundCharset.equals(defaultCharset)) { // need to re-decode foundCharset = foundCharset.trim().replaceAll("[\"']", ""); charsetName = foundCharset; byteData.rewind(); docData = Charset.forName(foundCharset).decode(byteData).toString(); doc = null; } } else { // specified by content type header (or by user on file load) Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML"); docData = Charset.forName(charsetName).decode(byteData).toString(); } if (doc == null) { doc = parser.parseInput(docData, baseUri); doc.outputSettings().charset(charsetName); } return doc; }
static Document parseInputStream(InputStream input, String charsetName, String baseUri, Parser parser) throws IOException { if (input == null) // empty body return new Document(baseUri); if (!(input instanceof ConstrainableInputStream)) input = new ConstrainableInputStream(input, bufferSize, 0); Document doc = null; boolean fullyRead = false; // read the start of the stream and look for a BOM or meta charset input.mark(firstReadBufferSize); ByteBuffer firstBytes = readToByteBuffer(input, firstReadBufferSize - 1); // -1 because we read one more to see if completed fullyRead = input.read() == -1; input.reset(); // look for BOM - overrides any other header or input BomCharset bomCharset = detectCharsetFromBom(firstBytes, charsetName); if (bomCharset != null) { charsetName = bomCharset.charset; input.skip(bomCharset.offset); } if (charsetName == null) { // determine from meta. safe first parse as UTF-8 String docData = Charset.forName(defaultCharset).decode(firstBytes).toString(); doc = parser.parseInput(docData, baseUri); // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312"> Elements metaElements = doc.select("meta[http-equiv=content-type], meta[charset]"); String foundCharset = null; // if not found, will keep utf-8 as best attempt for (Element meta : metaElements) { if (meta.hasAttr("http-equiv")) foundCharset = getCharsetFromContentType(meta.attr("content")); if (foundCharset == null && meta.hasAttr("charset")) foundCharset = meta.attr("charset"); if (foundCharset != null) break; } // look for <?xml encoding='ISO-8859-1'?> if (foundCharset == null && doc.childNodeSize() > 0 && doc.childNode(0) instanceof XmlDeclaration) { XmlDeclaration prolog = (XmlDeclaration) doc.childNode(0); if (prolog.name().equals("xml")) foundCharset = prolog.attr("encoding"); } foundCharset = validateCharset(foundCharset); if (foundCharset != null && !foundCharset.equalsIgnoreCase(defaultCharset)) { // need to re-decode. 
(case insensitive check here to match how validate works) foundCharset = foundCharset.trim().replaceAll("[\"']", ""); charsetName = foundCharset; doc = null; } else if (!fullyRead) { doc = null; } } else { // specified by content type header (or by user on file load) Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML"); } if (doc == null) { if (charsetName == null) charsetName = defaultCharset; BufferedReader reader = new BufferedReader(new InputStreamReader(input, charsetName), bufferSize); doc = parser.parseInput(reader, baseUri); doc.outputSettings().charset(charsetName); } input.close(); return doc; }