View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.w3c.dom.Document;
21  import org.w3c.dom.NamedNodeMap;
22  import org.w3c.dom.Node;
23  import org.w3c.dom.NodeList;
24  import org.w3c.dom.traversal.DocumentTraversal;
25  import org.w3c.dom.traversal.NodeFilter;
26  import org.w3c.dom.traversal.NodeIterator;
27  
28  import javax.xml.transform.OutputKeys;
29  import javax.xml.transform.Result;
30  import javax.xml.transform.Transformer;
31  import javax.xml.transform.TransformerConfigurationException;
32  import javax.xml.transform.TransformerException;
33  import javax.xml.transform.TransformerFactory;
34  import javax.xml.transform.TransformerFactoryConfigurationError;
35  import javax.xml.transform.dom.DOMSource;
36  import javax.xml.transform.stream.StreamResult;
37  import javax.xml.xpath.XPath;
38  import javax.xml.xpath.XPathConstants;
39  import javax.xml.xpath.XPathExpressionException;
40  import javax.xml.xpath.XPathFactory;
41  
42  import java.io.ByteArrayInputStream;
43  import java.io.ByteArrayOutputStream;
44  import java.io.IOException;
45  import java.io.InputStream;
46  import java.io.StringWriter;
47  import java.io.UnsupportedEncodingException;
48  import java.util.ArrayList;
49  import java.util.List;
50  import java.util.Locale;
51  import java.util.regex.Pattern;
52  
53  /**
54   * This class provides utility methods for DOM manipulation. It is separated from {@link HTMLDocument} so that its
55   * methods can be run on single DOM nodes without having to wrap them into an HTMLDocument.
56   * <p>
57   * We use a mix of XPath and DOM manipulation.
58   * </p>
59   * This is likely to be a performance bottleneck but at least everything is localized here.
60   */
61  public class DomUtils {
62  
63      private static final String[] EMPTY_STRING_ARRAY = new String[0];
64  
65      private final static XPath xPathEngine = XPathFactory.newInstance().newXPath();
66  
67      private DomUtils() {
68      }
69  
70      /**
71       * Given a node this method returns the index corresponding to such node within the list of the children of its
72       * parent node.
73       *
74       * @param n
75       *            the node of which returning the index.
76       * 
77       * @return a non negative number.
78       */
79      public static int getIndexInParent(Node n) {
80          Node parent = n.getParentNode();
81          if (parent == null) {
82              return 0;
83          }
84          NodeList nodes = parent.getChildNodes();
85          int counter = -1;
86          for (int i = 0; i < nodes.getLength(); i++) {
87              Node current = nodes.item(i);
88              if (current.getNodeType() == n.getNodeType() && current.getNodeName().equals(n.getNodeName())) {
89                  counter++;
90              }
91              if (current.equals(n)) {
92                  return counter;
93              }
94          }
95          throw new IllegalStateException("Cannot find a child within its parent node list.");
96      }
97  
98      /**
99       * Does a reverse walking of the DOM tree to generate a unique XPath expression leading to this node. The XPath
100      * generated is the canonical one based on sibling index: /html[1]/body[1]/div[2]/span[3] etc..
101      *
102      * @param node
103      *            the input node.
104      * 
105      * @return the XPath location of node as String.
106      */
107     public static String getXPathForNode(Node node) {
108         final StringBuilder sb = new StringBuilder();
109         Node parent = node;
110         while (parent != null && parent.getNodeType() != Node.DOCUMENT_NODE) {
111             sb.insert(0, "]");
112             sb.insert(0, getIndexInParent(parent) + 1);
113             sb.insert(0, "[");
114             sb.insert(0, parent.getNodeName());
115             sb.insert(0, "/");
116             parent = parent.getParentNode();
117         }
118         return sb.toString();
119     }
120 
121     /**
122      * Returns a list of tag names representing the path from the document root to the given node <i>n</i>.
123      *
124      * @param n
125      *            the node for which retrieve the path.
126      * 
127      * @return a sequence of HTML tag names.
128      */
129     public static String[] getXPathListForNode(Node n) {
130         if (n == null) {
131             return EMPTY_STRING_ARRAY;
132         }
133         List<String> ancestors = new ArrayList<String>();
134         ancestors.add(String.format(Locale.ROOT, "%s[%s]", n.getNodeName(), getIndexInParent(n)));
135         Node parent = n.getParentNode();
136         while (parent != null) {
137             ancestors.add(0, String.format(Locale.ROOT, "%s[%s]", parent.getNodeName(), getIndexInParent(parent)));
138             parent = parent.getParentNode();
139         }
140         return ancestors.toArray(new String[ancestors.size()]);
141     }
142 
143     /**
144      * Returns the row/col location of the given node.
145      *
146      * @param n
147      *            input node.
148      * 
149      * @return an array of two elements of type
150      *         <code>[&lt;begin-row&gt;, &lt;begin-col&gt;, &lt;end-row&gt; &lt;end-col&gt;]</code> or <code>null</code>
151      *         if not possible to extract such data.
152      */
153     public static int[] getNodeLocation(Node n) {
154         if (n == null)
155             throw new NullPointerException("node cannot be null.");
156         final TagSoupParser.ElementLocation elementLocation = (TagSoupParser.ElementLocation) n
157                 .getUserData(TagSoupParser.ELEMENT_LOCATION);
158         if (elementLocation == null)
159             return null;
160         return new int[] { elementLocation.getBeginLineNumber(), elementLocation.getBeginColumnNumber(),
161                 elementLocation.getEndLineNumber(), elementLocation.getEndColumnNumber() };
162     }
163 
164     /**
165      * Checks whether a node is ancestor or same of another node.
166      *
167      * @param candidateAncestor
168      *            the candidate ancestor node.
169      * @param candidateSibling
170      *            the candidate sibling node.
171      * @param strict
172      *            if <code>true</code> is not allowed that the ancestor and sibling can be the same node.
173      * 
174      * @return <code>true</code> if <code>candidateSibling</code> is ancestor of <code>candidateSibling</code>,
175      *         <code>false</code> otherwise.
176      */
177     public static boolean isAncestorOf(Node candidateAncestor, Node candidateSibling, boolean strict) {
178         if (candidateAncestor == null)
179             throw new NullPointerException("candidate ancestor cannot be null null.");
180         if (candidateSibling == null)
181             throw new NullPointerException("candidate sibling cannot be null null.");
182         if (strict && candidateAncestor.equals(candidateSibling))
183             return false;
184         Node parent = candidateSibling;
185         while (parent != null) {
186             if (parent.equals(candidateAncestor))
187                 return true;
188             parent = parent.getParentNode();
189         }
190         return false;
191     }
192 
193     /**
194      * Checks whether a node is ancestor or same of another node. As
195      * {@link #isAncestorOf(org.w3c.dom.Node, org.w3c.dom.Node, boolean)} with <code>strict=false</code>.
196      *
197      * @param candidateAncestor
198      *            the candidate ancestor node.
199      * @param candidateSibling
200      *            the candidate sibling node.
201      * 
202      * @return <code>true</code> if <code>candidateSibling</code> is ancestor of <code>candidateSibling</code>,
203      *         <code>false</code> otherwise.
204      */
205     public static boolean isAncestorOf(Node candidateAncestor, Node candidateSibling) {
206         return isAncestorOf(candidateAncestor, candidateSibling, false);
207     }
208 
209     /**
210      * Finds all nodes that have a declared class. Note that the className is transformed to lower case before being
211      * matched against the DOM.
212      * 
213      * @param root
214      *            the root node from which start searching.
215      * @param className
216      *            the name of the filtered class.
217      * 
218      * @return list of matching nodes or an empty list.
219      */
220     public static List<Node> findAllByClassName(Node root, String className) {
221         return findAllBy(root, null, "class", className.toLowerCase(Locale.ROOT));
222     }
223 
224     /**
225      * Finds all nodes that have a declared attribute. Note that the className is transformed to lower case before being
226      * matched against the DOM.
227      * 
228      * @param root
229      *            the root node from which start searching.
230      * @param attrName
231      *            the name of the filtered attribue.
232      * 
233      * @return list of matching nodes or an empty list.
234      */
235     public static List<Node> findAllByAttributeName(Node root, String attrName) {
236         return findAllBy(root, null, attrName, null);
237     }
238 
239     public static List<Node> findAllByAttributeContains(Node node, String attrName, String attrContains) {
240         return findAllBy(node, null, attrName, attrContains);
241     }
242 
243     public static List<Node> findAllByTag(Node root, String tagName) {
244         return findAllBy(root, tagName, null, null);
245     }
246 
247     public static List<Node> findAllByTagAndClassName(Node root, final String tagName, final String className) {
248         return findAllBy(root, tagName, "class", className);
249     }
250 
251     /**
252      * Mimics the JS DOM API, or prototype's $()
253      * 
254      * @param root
255      *            the node to locate
256      * @param id
257      *            the id of the node to locate
258      * 
259      * @return the {@link org.w3c.dom.Node} if one exists
260      */
261     public static Node findNodeById(Node root, String id) {
262         Node node;
263         try {
264             String xpath = "//*[@id='" + id + "']";
265             node = (Node) xPathEngine.evaluate(xpath, root, XPathConstants.NODE);
266         } catch (XPathExpressionException ex) {
267             throw new RuntimeException("Should not happen", ex);
268         }
269         return node;
270     }
271 
272     /**
273      * Returns a NodeList composed of all the nodes that match an XPath expression, which must be valid.
274      * 
275      * @param node
276      *            the node object to locate
277      * @param xpath
278      *            an xpath expression
279      * 
280      * @return a list of {@link org.w3c.dom.Node}'s if they exists
281      */
282     public static List<Node> findAll(Node node, String xpath) {
283         if (node == null) {
284             throw new NullPointerException("node cannot be null.");
285         }
286         try {
287             NodeList nodes = (NodeList) xPathEngine.evaluate(xpath, node, XPathConstants.NODESET);
288             List<Node> result = new ArrayList<Node>(nodes.getLength());
289             for (int i = 0; i < nodes.getLength(); i++) {
290                 result.add(nodes.item(i));
291             }
292             return result;
293         } catch (XPathExpressionException ex) {
294             throw new IllegalArgumentException("Illegal XPath expression: " + xpath, ex);
295         }
296     }
297 
298     /**
299      * Gets the string value of an XPath expression.
300      * 
301      * @param node
302      *            the node object to locate
303      * @param xpath
304      *            an xpath expression
305      * 
306      * @return a string xpath value
307      */
308     public static String find(Node node, String xpath) {
309         try {
310             String val = (String) xPathEngine.evaluate(xpath, node, XPathConstants.STRING);
311             if (null == val)
312                 return "";
313             return val;
314         } catch (XPathExpressionException ex) {
315             throw new IllegalArgumentException("Illegal XPath expression: " + xpath, ex);
316         }
317     }
318 
319     /**
320      * Tells if an element has a class name <b>not checking the parents in the hierarchy</b> mimicking the <i>CSS</i>
321      * .foo match.
322      * 
323      * @param node
324      *            the node object to locate
325      * @param className
326      *            the CSS class name
327      * 
328      * @return true if the class name exists
329      */
330     public static boolean hasClassName(Node node, String className) {
331         return hasAttribute(node, "class", className);
332     }
333 
334     /**
335      * Checks the presence of an attribute value in attributes that contain whitespace-separated lists of values. The
336      * semantic is the CSS classes' ones: "foo" matches "bar foo", "foo" but not "foob"
337      * 
338      * @param node
339      *            the node object to locate
340      * @param attributeName
341      *            attribute value
342      * @param className
343      *            the CSS class name
344      * 
345      * @return true if the class has the attribute name
346      */
347     public static boolean hasAttribute(Node node, String attributeName, String className) {
348         // regex love, maybe faster but less easy to understand
349         // Pattern pattern = Pattern.compile("(^|\\s+)"+className+"(\\s+|$)");
350         String attr = readAttribute(node, attributeName);
351         for (String c : attr.split("\\s+"))
352             if (c.equalsIgnoreCase(className))
353                 return true;
354         return false;
355     }
356 
357     /**
358      * Checks the presence of an attribute in the given <code>node</code>.
359      *
360      * @param node
361      *            the node container.
362      * @param attributeName
363      *            the name of the attribute.
364      * 
365      * @return true if the attribute is present
366      */
367     public static boolean hasAttribute(Node node, String attributeName) {
368         return readAttribute(node, attributeName, null) != null;
369     }
370 
371     /**
372      * Verifies if the given target node is an element.
373      *
374      * @param target
375      *            target node to check
376      * 
377      * @return <code>true</code> if the element the node is an element, <code>false</code> otherwise.
378      */
379     public static boolean isElementNode(Node target) {
380         return Node.ELEMENT_NODE == target.getNodeType();
381     }
382 
383     /**
384      * Reads the value of the specified <code>attribute</code>, returning the <code>defaultValue</code> string if not
385      * present.
386      *
387      * @param node
388      *            node to read the attribute.
389      * @param attribute
390      *            attribute name.
391      * @param defaultValue
392      *            the default value to return if attribute is not found.
393      * 
394      * @return the attribute value or <code>defaultValue</code> if not found.
395      */
396     public static String readAttribute(Node node, String attribute, String defaultValue) {
397         NamedNodeMap attributes = node.getAttributes();
398         if (null == attributes)
399             return defaultValue;
400         Node attr = attributes.getNamedItem(attribute);
401         if (null == attr)
402             return defaultValue;
403         return attr.getNodeValue();
404     }
405 
406     /**
407      * Reads the value of the first <i>attribute</i> which name matches with the specified <code>attributePrefix</code>.
408      * Returns the <code>defaultValue</code> if not found.
409      *
410      * @param node
411      *            node to look for attributes.
412      * @param attributePrefix
413      *            attribute prefix.
414      * @param defaultValue
415      *            default returned value.
416      * 
417      * @return the value found or default.
418      */
419     public static String readAttributeWithPrefix(Node node, String attributePrefix, String defaultValue) {
420         final NamedNodeMap attributes = node.getAttributes();
421         if (null == attributes) {
422             return defaultValue;
423         }
424         Node attribute;
425         for (int a = 0; a < attributes.getLength(); a++) {
426             attribute = attributes.item(a);
427             if (attribute.getNodeName().startsWith(attributePrefix)) {
428                 return attribute.getNodeValue();
429             }
430         }
431         return defaultValue;
432     }
433 
434     /**
435      * Reads the value of an <code>attribute</code>, returning the empty string if not present.
436      *
437      * @param node
438      *            node to read the attribute.
439      * @param attribute
440      *            attribute name.
441      * 
442      * @return the attribute value or <code>""</code> if not found.
443      */
444     public static String readAttribute(Node node, String attribute) {
445         return readAttribute(node, attribute, "");
446     }
447 
448     /**
449      * Given a <i>DOM</i> {@link Node} produces the <i>XML</i> serialization omitting the <i>XML declaration</i>.
450      *
451      * @param node
452      *            node to be serialized.
453      * @param indent
454      *            if <code>true</code> the output is indented.
455      * 
456      * @return the XML serialization.
457      * 
458      * @throws TransformerException
459      *             if an error occurs during the serializator initialization and activation.
460      * @throws java.io.IOException
461      *             if there is an error locating the node
462      */
463     public static String serializeToXML(Node node, boolean indent) throws TransformerException, IOException {
464         final DOMSource domSource = new DOMSource(node);
465         final Transformer transformer = TransformerFactory.newInstance().newTransformer();
466         transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
467         transformer.setOutputProperty(OutputKeys.METHOD, "xml");
468         transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
469         if (indent) {
470             transformer.setOutputProperty(OutputKeys.INDENT, "yes");
471             transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");
472         }
473         final StringWriter sw = new StringWriter();
474         final StreamResult sr = new StreamResult(sw);
475         transformer.transform(domSource, sr);
476         sw.close();
477         return sw.toString();
478     }
479 
480     /**
481      * High performance implementation of {@link #findAll(org.w3c.dom.Node, String)}.
482      *
483      * @param root
484      *            root node to start search.
485      * @param tagName
486      *            name of target tag.
487      * @param attrName
488      *            name of attribute filter.
489      * @param attrContains
490      *            expected content for attribute.
491      * 
492      * @return a {@link java.util.List} of {@link org.w3c.dom.Node}'s
493      */
494     private static List<Node> findAllBy(Node root, final String tagName, final String attrName, String attrContains) {
495         DocumentTraversal documentTraversal = (DocumentTraversal) root.getOwnerDocument();
496         if (documentTraversal == null) {
497             documentTraversal = (DocumentTraversal) root;
498         }
499 
500         final Pattern attrContainsPattern;
501         if (attrContains != null && !attrContains.equals("*")) {
502             attrContainsPattern = Pattern.compile("(^|\\s)" + attrContains + "(\\s|$)", Pattern.CASE_INSENSITIVE);
503         } else {
504             attrContainsPattern = null;
505         }
506 
507         final List<Node> result = new ArrayList<Node>();
508         NodeIterator nodeIterator = documentTraversal.createNodeIterator(root, NodeFilter.SHOW_ELEMENT,
509                 new NodeFilter() {
510                     @Override
511                     public short acceptNode(Node node) {
512                         if (node.getNodeType() == Node.ELEMENT_NODE) {
513                             if (tagName != null && !tagName.equals("*") && !tagName.equals(node.getNodeName())) {
514                                 // tagName given but doesn't match.
515                                 return FILTER_ACCEPT;
516                             }
517 
518                             if (attrName != null) {
519                                 Node attrNameNode = node.getAttributes().getNamedItem(attrName);
520                                 if (attrNameNode == null) {
521                                     // attrName given but doesn't match
522                                     return FILTER_ACCEPT;
523                                 }
524 
525                                 if (attrContainsPattern != null
526                                         && !attrContainsPattern.matcher(attrNameNode.getNodeValue()).find()) {
527                                     // attrContains given but doesn't match
528                                     return FILTER_ACCEPT;
529                                 }
530                             }
531                             result.add(node);
532                         }
533                         return FILTER_ACCEPT;
534                     }
535                 }, false);
536 
537         // To populate result we only need to iterate...
538         while (nodeIterator.nextNode() != null)
539             ;
540 
541         // We have to explicitly declare we are done with this nodeIterator to free it's resources.
542         nodeIterator.detach();
543 
544         return result;
545     }
546 
547     /**
548      * Given a {@link org.w3c.dom.Document} this method will return an input stream representing that document.
549      * 
550      * @param doc
551      *            the input {@link org.w3c.dom.Document}
552      * 
553      * @return an {@link java.io.InputStream}
554      */
555     public static InputStream documentToInputStream(Document doc) {
556         DOMSource source = new DOMSource(doc);
557         StringWriter xmlAsWriter = new StringWriter();
558         StreamResult result = new StreamResult(xmlAsWriter);
559         try {
560             TransformerFactory.newInstance().newTransformer().transform(source, result);
561         } catch (TransformerConfigurationException e) {
562             throw new RuntimeException("Error within Document to InputStream transformation configuration!");
563         } catch (TransformerException e) {
564             throw new RuntimeException("Error whilst transforming the Document to InputStream!");
565         } catch (TransformerFactoryConfigurationError e) {
566             throw new RuntimeException("Error within Document to InputStream transformation configuration factory!");
567         }
568 
569         InputStream is = null;
570         try {
571             is = new ByteArrayInputStream(xmlAsWriter.toString().getBytes("UTF-8"));
572         } catch (UnsupportedEncodingException e) {
573             throw new RuntimeException("Error obtaining data with \"UTF-8\" encoding!", e);
574         }
575         return is;
576     }
577 
578     /**
579      * Convert a w3c dom node to a InputStream
580      * 
581      * @param node
582      *            {@link org.w3c.dom.Node} to convert
583      * 
584      * @return the converted {@link java.io.InputStream}
585      */
586     public static InputStream nodeToInputStream(Node node) {
587         ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
588         Result outputTarget = new StreamResult(outputStream);
589         Transformer t = null;
590         try {
591             t = TransformerFactory.newInstance().newTransformer();
592         } catch (TransformerConfigurationException e) {
593             throw new RuntimeException("Serious configuration error.", e);
594         } catch (TransformerFactoryConfigurationError e) {
595             throw new RuntimeException("Serious configuration error.", e);
596         }
597         t.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
598         try {
599             t.transform(new DOMSource(node), outputTarget);
600         } catch (TransformerException e) {
601             throw new RuntimeException("Error whilst transforming the Node to InputStream!");
602         }
603         return new ByteArrayInputStream(outputStream.toByteArray());
604     }
605 
606 }