/**
 * Walks every {@code MaryXML.TOKEN} element of the document and, when a token's
 * {@code ph} transcription starts with {@code "' a "} and the token visited
 * immediately before it ended in a vowel symbol, removes that leading
 * {@code a} (and an optional following {@code -}) from the transcription.
 *
 * A transcription "ends in a vowel" when it matches {@code .*[AUIaui][01]?$}.
 *
 * @param doc the document whose token transcriptions are rewritten in place
 * @throws DOMException if updating the {@code ph} attribute fails
 */
private void dropInitialAlef(Document doc) throws DOMException {
    TreeWalker tw = ((DocumentTraversal) doc).createTreeWalker(doc, NodeFilter.SHOW_ELEMENT,
            new NameNodeFilter(MaryXML.TOKEN), false);
    Element m = null;
    boolean prevEndsWithVowel = false;
    while ((m = (Element) tw.nextNode()) != null) {
        System.out.println("Element: " + m.getTagName());
        if (!m.getTagName().equals(MaryXML.TOKEN)) {
            // Not a token: leave the vowel state of the last token untouched.
            continue;
        }
        String transcription = m.getAttribute("ph");
        System.out.println("transcription: " + transcription + ", prevEndsWithVowel: " + prevEndsWithVowel);
        if (transcription.startsWith("' a ") && prevEndsWithVowel) {
            System.out.println("Removing A");
            m.setAttribute("ph", transcription.replaceAll("^' a -?", "' "));
        }
        // Recompute the flag for every token. Previously it was only ever set to
        // true, so one vowel-final token made all later tokens lose their initial a.
        prevEndsWithVowel = transcription.matches(".*[AUIaui][01]?$");
    }
}
protected void filterPunctuation(MaryData d) { Document doc = d.getDocument(); NodeIterator ni = ((DocumentTraversal) doc).createNodeIterator(doc, NodeFilter.SHOW_ELEMENT, new NameNodeFilter( MaryXML.PARAGRAPH), false); Element p = null; while ((p = (Element) ni.nextNode()) != null) { Node textNode = p.getFirstChild(); String s = textNode.getNodeValue(); System.err.println("FilterPunctuation"); System.err.println(s); s = s.replaceAll("،", ","); //s = s.replaceAll("XX", "YY"); System.err.println(s); textNode.setNodeValue(s); } }
protected void vocaliseDoc(Document doc) throws Exception { TreeWalker tw = ((DocumentTraversal) doc).createTreeWalker(doc, NodeFilter.SHOW_ELEMENT, new NameNodeFilter(MaryXML.TOKEN), false); Element t = null; StringBuilder origText = new StringBuilder(); while ((t = (Element) tw.nextNode()) != null) { //if (MaryDomUtils.hasAncestor(t, MaryXML.SAYAS) || t.hasAttribute("ph") || t.hasAttribute("sounds_like")) { // ignore token //continue; origText.append(" " + MaryDomUtils.tokenText(t)); } String vocText = vocaliseText(origText.toString()); String[] vocTextList = vocText.split(" "); TreeWalker tw2 = ((DocumentTraversal) doc).createTreeWalker(doc, NodeFilter.SHOW_ELEMENT, new NameNodeFilter(MaryXML.TOKEN), false); Element t2 = null; int i = 0; while ((t2 = (Element) tw2.nextNode()) != null && i < vocTextList.length) { MaryDomUtils.setTokenText(t2, vocTextList[i]); i++; } }
/**
 * Concatenates the trimmed values of all text nodes at or below the given node,
 * in document order. A single space is appended after every text node, so a
 * non-empty result always ends with a space.
 *
 * @param n
 *            the node to start from; may be a document
 * @return the concatenated text, or null if n is null
 */
public static String getPlainTextBelow(Node n) {
    if (n == null) {
        return null;
    }
    // A Document has no owner document, so it has to serve as its own factory.
    Document owner = (n.getNodeType() == Node.DOCUMENT_NODE) ? (Document) n : n.getOwnerDocument();
    NodeIterator texts = ((DocumentTraversal) owner).createNodeIterator(n, NodeFilter.SHOW_TEXT, null, true);
    StringBuilder out = new StringBuilder();
    for (Node cur = texts.nextNode(); cur != null; cur = texts.nextNode()) {
        out.append(((Text) cur).getData().trim()).append(" ");
    }
    return out.toString();
}
@BeforeClass public static void init() throws IOException, CSSException, SAXException { log.info("\n\n\n == AnalyzerTest test at {} == \n\n\n", new Date()); DOMSource ds = new DOMSource(AnalyzerTest.class.getResourceAsStream("/simple/data.html")); doc = ds.parse(); sheet = CSSFactory.parse(AnalyzerTest.class.getResource("/simple/data.css"), null); analyzer = new Analyzer(sheet); NodeList list = doc.getElementsByTagName("body"); assertEquals("There is one <body> element", 1, list.getLength()); //walker = new TidyTreeWalker(list.item(0), NodeFilter.SHOW_ELEMENT); DocumentTraversal traversal = (DocumentTraversal) doc; walker = traversal.createTreeWalker(list.item(0), NodeFilter.SHOW_ELEMENT, null, false); elements = new ElementMap(doc); }
/**
 * Prints, for debugging, every node below the document element together with
 * its match partner (or "unmatched") to {@code System.err}. Does nothing
 * unless {@code DiffFactory.isDebug()} is true.
 *
 * @param matchSet The set of matching Nodes.
 * @param doc The first document being differenced
 */
private static void outputDebug(final NodePairs matchSet, final Document doc) {
    if (!DiffFactory.isDebug()) {
        return;
    }

    NodeIterator nodes = ((DocumentTraversal) doc).createNodeIterator(
            doc.getDocumentElement(), NodeFilter.SHOW_ALL, null, false);

    for (Node cur = nodes.nextNode(); cur != null; cur = nodes.nextNode()) {
        System.err.print(DOMOps.getNodeAsString(cur));
        String status = matchSet.isMatched(cur)
                ? " matches " + DOMOps.getNodeAsString(matchSet.getPartner(cur))
                : " unmatched";
        System.err.println(status);
    }

    nodes.detach();
    System.err.println();
}
/**
 * Collects every node of the document into a list and sorts that list with a
 * {@code NodeDepthComparator}.
 *
 * The document node itself, the document element and any document type node
 * are *NOT* included.
 *
 * @param doc The document to be initialised and ordered.
 * @return The nodes of the doc wrapped as {@code NodeDepth}, in comparator order.
 */
private static List<NodeDepth> initialiseAndOrderNodes(
        final Document doc) {

    List<NodeDepth> ordered = new ArrayList<NodeDepth>();
    NodeIterator all = ((DocumentTraversal) doc).createNodeIterator(
            doc, NodeFilter.SHOW_ALL, null, false);

    for (Node cur = all.nextNode(); cur != null; cur = all.nextNode()) {
        boolean excluded = NodeOps.checkIfSameNode(doc, cur)
                || NodeOps.checkIfSameNode(doc.getDocumentElement(), cur)
                || cur.getNodeType() == Node.DOCUMENT_TYPE_NODE;
        if (excluded) {
            continue;
        }
        ordered.add(new NodeDepth(cur));
    }

    all.detach();
    Collections.sort(ordered, new NodeDepthComparator());
    return ordered;
}
/**
 * Returns, in document order, every element of {@code doc} whose node name,
 * lower-cased, is contained in {@code lowerCaseNames}.
 *
 * Lower-casing uses {@code Locale.ROOT} so that matching does not depend on
 * the JVM default locale (under a Turkish locale "TITLE" would otherwise
 * lower-case to a dotless-i form and never match "title").
 *
 * @param doc the document to search
 * @param lowerCaseNames the element names to match, already in lower case
 * @return a new mutable list of the matching elements; empty if none match
 */
public static List<Element> getElementsByTagNameCaseInsensitive(Document doc, final Set<String> lowerCaseNames) {
    final List<Element> result = new java.util.ArrayList<Element>();
    NodeIterator nodeIterator = ((DocumentTraversal) doc).createNodeIterator(doc, NodeFilter.SHOW_ELEMENT,
            new NodeFilter() {
                public short acceptNode(Node n) {
                    String name = n.getNodeName().toLowerCase(java.util.Locale.ROOT);
                    // For a NodeIterator, FILTER_REJECT only skips this node, not its subtree.
                    return lowerCaseNames.contains(name) ? NodeFilter.FILTER_ACCEPT : NodeFilter.FILTER_REJECT;
                }
            }, false);
    try {
        for (Node n = nodeIterator.nextNode(); n != null; n = nodeIterator.nextNode()) {
            result.add((Element) n);
        }
    } finally {
        // Release the iterator so the document stops tracking it.
        nodeIterator.detach();
    }
    return result;
}
/**
 * Re-synchronises this object's cached state with the panel's current document.
 *
 * When the panel's document differs from the cached one, or no text-inline
 * map exists, all cached selection state is cleared, the document is re-read
 * as a {@code DocumentRange} / {@code DocumentTraversal}, and the maps are
 * rebuilt via {@code createMaps()}.
 *
 * NOTE(review): the trailing {@code return true} means the {@code while (true)}
 * body runs at most once, and the method returns true after the 10 ms sleep
 * even when the document was null or {@code createMaps()} returned false.
 * That control flow is left as it was; confirm whether a retry was intended.
 *
 * @return false if the document does not support ranges/traversal or the
 *         thread was interrupted while sleeping; true otherwise
 */
private boolean checkDocument() {
    while (true) {
        if (this.document != panel.getDocument() || textInlineMap == null) {
            this.document = panel.getDocument();
            textInlineMap = null;
            this.dotInfo = null;
            this.markInfo = null;
            this.lastSelectionRange = null;
            try {
                this.docRange = (DocumentRange) panel.getDocument();
                this.docTraversal = (DocumentTraversal) panel.getDocument();
                if (this.document != null && this.createMaps()) {
                    return true;
                }
                try {
                    Thread.sleep(10);
                } catch (InterruptedException e) {
                    // Restore the interrupt flag so callers can still observe the interruption.
                    Thread.currentThread().interrupt();
                    return false;
                }
            } catch (ClassCastException cce) {
                XRLog.layout(Level.WARNING, "Document instance cannot create ranges: no selection possible");
                return false;
            }
        }
        return true;
    }
}
/**
 * Returns the given document viewed as a {@code DocumentTraversal}.
 *
 * @param doc the document to convert
 * @return the same object, cast to {@code DocumentTraversal}
 * @throws XMLStreamException if the document's DOM implementation does not
 *         report the "Traversal" 2.0 feature
 */
DocumentTraversal getDocumentTraversal(Document doc) throws XMLStreamException {
    boolean traversalSupported = doc.getImplementation().hasFeature("Traversal", "2.0");
    if (traversalSupported) {
        return (DocumentTraversal) doc;
    }
    throw new XMLStreamException("Traversal not supported");
}
/**
 * Walks the elements of a DAS document, starting at its document element, and
 * hands the walker to {@code parseTree} to collect "FEATURE" entries.
 *
 * @param dasDoc the parsed DAS response
 * @param chr passed through to {@code parseTree}
 * @param features passed through to {@code parseTree}
 * @throws DataLoadException if anything goes wrong while walking or parsing;
 *         the failure is logged first
 */
private void parseDocument(Document dasDoc, String chr, List<Feature> features) {
    try {
        TreeWalker elementWalker = ((DocumentTraversal) dasDoc).createTreeWalker(
                dasDoc.getDocumentElement(), NodeFilter.SHOW_ELEMENT, null, true);
        parseTree(elementWalker, "FEATURE", chr, features);
    } catch (Exception ex) {
        log.error(ex);
        String message = "Error loading DAS resource (" + ex.toString() + ")";
        throw new DataLoadException(message, getPath());
    }
}
/** * * @param filePath * @param defaultPackageName * @throws Exception */ public XmiDomTree(String filePath, String defaultPackageName) throws Exception { Document document = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(filePath); xmiTree = ((DocumentTraversal) document).createTreeWalker(document.getDocumentElement(), NodeFilter.SHOW_ALL, new XmiNodeFilter(), true); initializeCollections(defaultPackageName); checkXmiVersion(document); }
/**
 * Creates a traversal over the document element of {@code doc}.
 *
 * Uses the document's own tree walker when the document implements
 * {@code DocumentTraversal}; otherwise falls back to a
 * {@code GenericTreeWalker}.
 *
 * @param doc the document to walk
 * @param source stored as given
 * @param whatToShow node-type mask passed to the walker
 */
public Traversal(Document doc, Object source, int whatToShow) {
    this.source = source;
    this.walker = (doc instanceof DocumentTraversal)
            ? ((DocumentTraversal) doc).createTreeWalker(doc.getDocumentElement(), whatToShow, null, false)
            : new GenericTreeWalker(doc.getDocumentElement(), whatToShow);
}
@Test public void canProcessPunctuatedArabicToTokens() throws Exception { // setup MaryInterface mary = new LocalMaryInterface(); mary.setInputType(MaryDataType.TEXT.name()); mary.setOutputType(MaryDataType.TOKENS.name()); mary.setLocale(new Locale("ar")); String example = "مدينة شِبام الأثريَّة التاريخيَّة، إحدى أقدم مُدن"; // exercise Document doc = mary.generateXML(example); // verify assertNotNull(doc); //We should get 8 tokens from this example, 7 words and one comma TreeWalker tw = ((DocumentTraversal) doc).createTreeWalker(doc, NodeFilter.SHOW_ELEMENT, new NameNodeFilter(MaryXML.TOKEN), false); Element t = null; int count = 0; while ((t = (Element) tw.nextNode()) != null) { count++; } assertEquals(8, count); }
/**
 * Creates an element-only tree walker over the whole document and passes it
 * to {@code traverseLevel} starting at level 0, which is expected to give ids
 * to the elements that lack them.
 *
 * @param document the document to augment
 */
public static void augmentDocument(Document document) {
    final TreeWalker elementWalker = ((DocumentTraversal) document)
            .createTreeWalker(document, NodeFilter.SHOW_ELEMENT, null, true);
    traverseLevel(elementWalker, 0);
}
/**
 * Builds the mapping between pieces of text in the article content and their
 * xpaths by walking its text and element nodes.
 *
 * @return the mapping, or null when there is no article content
 */
public List<MappingNode> getArticleTextMapping() {
    if (articleContent == null) {
        return null;
    }

    final List<MappingNode> mapping = new ArrayList<MappingNode>();
    final int shown = NodeFilter.SHOW_TEXT | NodeFilter.SHOW_ELEMENT;
    final DocumentTraversal traversal = (DocumentTraversal) document;

    getArticleTextMapping(traversal.createTreeWalker(articleContent, shown, null, true), mapping);
    return mapping;
}
/** * Get the last descendant element with the given tag name, or <code>null</code> if there is no such element. * * @param e * e * @param name * name * @return previous */ public static Element getLastElementByTagName(Element e, String name) { // This implementation is certainly inefficient, but I have // no better idea at the moment. TreeWalker tw = ((DocumentTraversal) e.getOwnerDocument()).createTreeWalker(e, NodeFilter.SHOW_ELEMENT, new NameNodeFilter(name), true); Node current = null; Node previous = null; while ((current = tw.nextNode()) != null) previous = current; return (Element) previous; }
public static boolean isFirstOfItsKindIn(Node node, Node ancestor) { if (ancestor == null) return false; Document doc = node.getOwnerDocument(); if (doc == null) return false; TreeWalker tw = ((DocumentTraversal) doc).createTreeWalker(doc, NodeFilter.SHOW_ALL, new NameNodeFilter(node.getNodeName()), true); tw.setCurrentNode(node); Node prev = tw.previousNode(); return prev == null || // no node with same name before this one !isAncestor(ancestor, prev); // prev is not in the same ancestor }
public static boolean isLastOfItsKindIn(Node node, Node ancestor) { if (node == null || ancestor == null) return false; Document doc = node.getOwnerDocument(); if (doc == null) return false; TreeWalker tw = ((DocumentTraversal) doc).createTreeWalker(doc, NodeFilter.SHOW_ALL, new NameNodeFilter(node.getNodeName()), true); tw.setCurrentNode(node); Node next = tw.nextNode(); return next == null || // no node with same name after this one !isAncestor(ancestor, next); // next is not in the same ancestor }
/**
 * Finds the closest Element before <code>element</code>, within the subtree under <code>root</code>, that has the
 * same tag name as <code>element</code>. Precondition: <code>root</code> must be an ancestor of <code>element</code>.
 *
 * @param element
 *            the element to start from
 * @param root
 *            the root of the subtree to search
 * @return that Element, or <code>null</code> if there is none or either argument is <code>null</code>.
 */
public static Element getPreviousOfItsKindIn(Element element, Element root) {
    if (element == null || root == null) {
        return null;
    }
    String wanted = element.getTagName();
    DocumentTraversal traversal = (DocumentTraversal) element.getOwnerDocument();
    TreeWalker walker = traversal.createTreeWalker(root, NodeFilter.SHOW_ELEMENT, new NameNodeFilter(wanted), true);
    walker.setCurrentNode(element);

    Node candidate = walker.previousNode();
    while (candidate != null) {
        if (candidate.getNodeName().equals(wanted)) {
            return (Element) candidate;
        }
        candidate = walker.previousNode();
    }
    return null;
}
/**
 * Finds the closest Element after <code>element</code>, within the subtree under <code>root</code>, that has the
 * same tag name as <code>element</code>. Precondition: <code>root</code> must be an ancestor of <code>element</code>.
 *
 * @param element
 *            the element to start from
 * @param root
 *            the root of the subtree to search
 * @return that Element, or <code>null</code> if there is none or either argument is <code>null</code>.
 */
public static Element getNextOfItsKindIn(Element element, Element root) {
    if (element == null || root == null) {
        return null;
    }
    String wanted = element.getTagName();
    DocumentTraversal traversal = (DocumentTraversal) element.getOwnerDocument();
    TreeWalker walker = traversal.createTreeWalker(root, NodeFilter.SHOW_ELEMENT, new NameNodeFilter(wanted), true);
    walker.setCurrentNode(element);

    Node candidate = walker.nextNode();
    while (candidate != null) {
        if (candidate.getNodeName().equals(wanted)) {
            return (Element) candidate;
        }
        candidate = walker.nextNode();
    }
    return null;
}
/**
 * Replaces the data of every text node at or below the given node with its trimmed form. This modifies the DOM
 * document in place.
 *
 * @param root
 *            the node (possibly a document) whose text nodes are trimmed
 */
public static void trimAllTextNodes(Node root) {
    Document owner;
    if (root.getNodeType() == Node.DOCUMENT_NODE) {
        owner = (Document) root;
    } else {
        owner = root.getOwnerDocument();
    }
    NodeIterator texts = ((DocumentTraversal) owner).createNodeIterator(root, NodeFilter.SHOW_TEXT, null, false);
    for (Text cur = (Text) texts.nextNode(); cur != null; cur = (Text) texts.nextNode()) {
        cur.setData(cur.getData().trim());
    }
}
/**
 * Views a Document as a DocumentTraversal.
 *
 * @param document the document to convert; a null document is returned as null
 * @return the same object as a DocumentTraversal, if the DOM implementation supports it
 * @throws IllegalArgumentException if the document does not implement DocumentTraversal
 */
private static DocumentTraversal getDocumentTraversal(Document document) {
    // A null reference casts cleanly, so it passes straight through.
    if (document == null || document instanceof DocumentTraversal) {
        return (DocumentTraversal) document;
    }
    String implementationName = document.getImplementation().getClass().getName();
    throw new IllegalArgumentException("DOM Traversal not supported by "
            + implementationName
            + ". To use this class you will need to switch to a DOM implementation that supports Traversal.");
}
/**
 * Indexes the elements of a document by lower-cased node name and by id.
 *
 * When several elements share a name (or an id), the one visited last in
 * document order wins. Elements without an {@code id} attribute are not
 * added to the id index.
 *
 * @param doc the document whose elements are indexed
 */
public ElementMap(Document doc) {
    elementIDs = new HashMap<String, Element>();
    elementNames = new HashMap<String, Element>();
    DocumentTraversal traversal = (DocumentTraversal) doc;
    TreeWalker w = traversal.createTreeWalker(doc, NodeFilter.SHOW_ELEMENT, null, false);
    Element current;
    while ((current = (Element) w.nextNode()) != null) {
        elementNames.put(current.getNodeName().toLowerCase(), current);
        String id = current.getAttribute("id");
        // getAttribute returns "" (not null) for a missing attribute; without the
        // emptiness check every id-less element was stored under the key "".
        if (id != null && id.length() > 0) {
            elementIDs.put(id, current);
        }
    }
}
public static List<String> searchGoogle(String query) throws FailingHttpStatusCodeException, MalformedURLException, IOException, InterruptedException { java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF); System.setProperty("org.apache.commons.logging.Log", "org.apache.commons.logging.impl.NoOpLog"); List<String> URLList=new ArrayList<String>(); //java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(Level.OFF); // BrowserVersionFeatures[] bvf = new BrowserVersionFeatures[1]; // bvf[0] = BrowserVersionFeatures.HTMLIFRAME_IGNORE_SELFCLOSING; // BrowserVersion bv = new BrowserVersion( // BrowserVersion.CHROME.getApplicationName(), // "5.0 (Windows; en-US)","Chrome", // (float) 3.6, bvf); WebClient webClient = new WebClient(BrowserVersion.CHROME); webClient.getOptions().setThrowExceptionOnScriptError(false); //webClient.setCssErrorHandler(new SilentCssErrorHandler()); String finalQuery="https://www.google.com/search?num="+numResults + "&q="+query; System.out.println(finalQuery); HtmlPage page = webClient.getPage(finalQuery); Thread.sleep(2000); String xmlString=page.asXml(); File URLMapper=new File("searchResult.xml"); URLMapper.createNewFile(); BufferedWriter bw2=new BufferedWriter(new OutputStreamWriter (new FileOutputStream(URLMapper.getAbsolutePath()),"UTF-8")); bw2.write(xmlString); bw2.close(); Document domDoc =null; try { DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance(); DocumentBuilder docBuilder = docFactory.newDocumentBuilder(); //ByteArrayInputStream bis = new ByteArrayInputStream(str.getBytes()); domDoc = docBuilder.parse(URLMapper); } catch (Exception e) { e.printStackTrace(); } DocumentTraversal traversal = (DocumentTraversal) domDoc; NodeIterator iterator = traversal.createNodeIterator(domDoc.getDocumentElement(), NodeFilter.SHOW_ELEMENT, null, true); for (Node n = iterator.nextNode(); n != null; n = iterator.nextNode()) { //String tagname = ((Element) n).getAttribute("sectionName"); 
//System.out.println(n.getNodeName()); // String title=""; if(n.getNodeName().contentEquals("h3")) { //if(n.getAttributes().getNamedItem("class").getNodeValue().contentEquals("class=r") String j=((Element)n).getAttribute("class"); //System.out.println(j); if(j.contentEquals("r")) { Node a=n.getChildNodes().item(1); String url=((Element) a).getAttribute("href"); if(!(url.contains("google")||url.contains("wikipedia")||url.contains("amazon") ||url.contains("amazon")) && url.trim().length()>0) { URLList.add(url); } } //System.out.println(n.getAttributes().getNamedItem("class")); //System.out.println(n.getChildNodes().item(1).getAttributes().getNamedItem("href")); } } System.out.println(URLList); webClient.closeAllWindows(); return URLList; }
public String resolve(String infileAddress, String content, String mimeType) throws AddressException{ try{ // get the start and end of the text to find String pieces[] = infileAddress.split("\\.\\.\\."); if(pieces == null || pieces.length != 2){ throw new AddressException("Invalid quote address: " + infileAddress); } String start = pieces[0]; String end = pieces[1]; //System.out.println("start="+start); //System.out.println("end="+end); // parse content into a DOM Document document = parseDocument(content); // now find the beginning and end of the ranges that // match this text Node body = document.getElementsByTagName("body").item(0); Range range = ((DocumentRange)document).createRange(); DocumentTraversal traverse = (DocumentTraversal)document; NodeFilter filter = null; NodeIterator nodes = traverse.createNodeIterator(body, NodeFilter.SHOW_CDATA_SECTION | NodeFilter.SHOW_TEXT, filter, true); boolean results = findString(range, START, nodes, start); if(results == false){ throw new AddressException("Unable to find start of quote range: " + start); } // jump the node iterator backwards one; this is for cases // where the start and end text are in the same node nodes.previousNode(); //System.out.println("nodes="+nodes); results = findString(range, END, nodes, end); if(results == false){ throw new AddressException("Unable to find end of quote range: " + end); } // get the fragment represented by this range DocumentFragment fragment = range.cloneContents(); // serialize fragment into string //System.out.println("fragment="+fragment); return serialize((Node)fragment); }catch(Exception e){ e.printStackTrace(); throw new AddressException(e); } }
/**
 * Iterates over the XML document held by {@code vxmlDoc}, showing every node
 * type except comments and filtering through an {@code EmptyTextNodeFilter},
 * and passes the first node the iterator yields to {@code walk}.
 *
 * @param vxmlDoc wrapper supplying the XML document
 * @throws Event propagated from {@code walk}
 */
public void parse(VxmlDoc vxmlDoc) throws Event {
    final Document xml = vxmlDoc.getXmlDoc();
    final DocumentTraversal traversal = (DocumentTraversal) xml;
    final int everythingButComments = NodeFilter.SHOW_ALL & ~NodeFilter.SHOW_COMMENT;

    final NodeIterator nodes = traversal.createNodeIterator(xml.getDocumentElement(),
            everythingButComments, new EmptyTextNodeFilter(), true);
    walk(nodes.nextNode());
}
/**
 * Creates a TreeModel backed by an unfiltered TreeWalker that is rooted at the
 * given document and shows every node type.
 **/
public DOMTreeWalkerTreeModel(Document document) {
    walker = ((DocumentTraversal) document).createTreeWalker(document, NodeFilter.SHOW_ALL, null, false);
}
/**
 * Creates a TreeModel backed by an unfiltered TreeWalker that is rooted at the
 * given element, so the model covers that element and all of its descendants.
 **/
public DOMTreeWalkerTreeModel(Element element) {
    walker = ((DocumentTraversal) element.getOwnerDocument())
            .createTreeWalker(element, NodeFilter.SHOW_ALL, null, false);
}
/** * This main() method demonstrates the use of this class, the use of the * Xerces DOM parser, and the creation of a DOM Level 2 TreeWalker object. **/ public static void main(String[] args) throws IOException, SAXException { // Obtain an instance of a Xerces parser to build a DOM tree. // Note that we are not using the JAXP API here, so this // code uses Apache Xerces APIs that are not standards DOMParser parser = new org.apache.xerces.parsers.DOMParser(); // Get a java.io.Reader for the input XML file and // wrap the input file in a SAX input source Reader in = new BufferedReader(new FileReader(args[0])); InputSource input = new org.xml.sax.InputSource(in); // Tell the Xerces parser to parse the input source parser.parse(input); // Ask the parser to give us our DOM Document. Once we've got the DOM // tree, we don't have to use the Apache Xerces APIs any more; from // here on, we use the standard DOM APIs Document document = parser.getDocument(); // If we're using a DOM Level 2 implementation, then our Document // object ought to implement DocumentTraversal DocumentTraversal traversal = (DocumentTraversal) document; // For this demonstration, we create a NodeFilter that filters out // Text nodes containing only space; these just clutter up the tree NodeFilter filter = new NodeFilter() { public short acceptNode(Node n) { if (n.getNodeType() == Node.TEXT_NODE) { // Use trim() to strip off leading and trailing space. 
// If nothing is left, then reject the node if (((Text) n).getData().trim().length() == 0) return NodeFilter.FILTER_REJECT; } return NodeFilter.FILTER_ACCEPT; } }; // This set of flags says to "show" all node types except comments int whatToShow = NodeFilter.SHOW_ALL & ~NodeFilter.SHOW_COMMENT; // Create a TreeWalker using the filter and the flags TreeWalker walker = traversal.createTreeWalker(document, whatToShow, filter, false); // Instantiate a TreeModel and a JTree to display it JTree tree = new JTree(new DOMTreeWalkerTreeModel(walker)); // Create a frame and a scrollpane to display the tree, and pop them up JFrame frame = new JFrame("DOMTreeWalkerTreeModel Demo"); frame.getContentPane().add(new JScrollPane(tree)); frame.setSize(500, 250); frame.setVisible(true); }
/**
 * Creates an element-only TreeWalker rooted at {@code root} whose nodes are
 * filtered by a {@code NameNodeFilter} built from the given tag names.
 * Entity references are not expanded.
 *
 * @param doc the document that creates the walker; must implement DocumentTraversal
 * @param root the node the walker starts from
 * @param tagNames the tag names handed to the filter
 * @return the new walker
 */
public static TreeWalker createTreeWalker(Document doc, Node root, String... tagNames) {
    DocumentTraversal traversal = (DocumentTraversal) doc;
    NodeFilter byName = new NameNodeFilter(tagNames);
    return traversal.createTreeWalker(root, NodeFilter.SHOW_ELEMENT, byName, false);
}
/**
 * Creates an element-only NodeIterator rooted at {@code root} whose nodes are
 * filtered by a {@code NameNodeFilter} built from the given tag names.
 * Entity references are not expanded.
 *
 * @param doc the document that creates the iterator; must implement DocumentTraversal
 * @param root the node the iterator starts from
 * @param tagNames the tag names handed to the filter
 * @return the new iterator
 */
public static NodeIterator createNodeIterator(Document doc, Node root, String... tagNames) {
    DocumentTraversal traversal = (DocumentTraversal) doc;
    NodeFilter byName = new NameNodeFilter(tagNames);
    return traversal.createNodeIterator(root, NodeFilter.SHOW_ELEMENT, byName, false);
}
/**
 * Creates a NodeTest that remembers the traversal factory and the root node
 * it should start from.
 *
 * @param documentTraversal the DocumentTraversal to use
 * @param rootNode the node at which traversal starts
 */
public NodeTest(DocumentTraversal documentTraversal, Node rootNode) {
    this.rootNode = rootNode;
    this.documentTraversal = documentTraversal;
}
/**
 * Get the first element at or below the given node that a {@code NameNodeFilter} for the given tag name lets
 * through, or <code>null</code> if there is no such element.
 *
 * @param n
 *            the node (possibly a document) whose subtree is searched
 * @param name
 *            the tag name handed to the filter
 * @return the first node the walker yields, or <code>null</code>
 */
public static Element getFirstElementByTagName(Node n, String name) {
    Document owner;
    if (n instanceof Document) {
        owner = (Document) n;
    } else {
        owner = n.getOwnerDocument();
    }
    DocumentTraversal traversal = (DocumentTraversal) owner;
    TreeWalker matches = traversal.createTreeWalker(n, NodeFilter.SHOW_ELEMENT, new NameNodeFilter(name), true);
    return (Element) matches.nextNode();
}