View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.w3c.dom.Document;
21  import org.w3c.dom.NamedNodeMap;
22  import org.w3c.dom.Node;
23  import org.w3c.dom.NodeList;
24  import org.w3c.dom.traversal.DocumentTraversal;
25  import org.w3c.dom.traversal.NodeFilter;
26  import org.w3c.dom.traversal.NodeIterator;
27  
28  import javax.xml.transform.OutputKeys;
29  import javax.xml.transform.Result;
30  import javax.xml.transform.Transformer;
31  import javax.xml.transform.TransformerConfigurationException;
32  import javax.xml.transform.TransformerException;
33  import javax.xml.transform.TransformerFactory;
34  import javax.xml.transform.TransformerFactoryConfigurationError;
35  import javax.xml.transform.dom.DOMSource;
36  import javax.xml.transform.stream.StreamResult;
37  import javax.xml.xpath.XPath;
38  import javax.xml.xpath.XPathConstants;
39  import javax.xml.xpath.XPathExpressionException;
40  import javax.xml.xpath.XPathFactory;
41  
42  import java.io.ByteArrayInputStream;
43  import java.io.ByteArrayOutputStream;
44  import java.io.IOException;
45  import java.io.InputStream;
46  import java.io.StringWriter;
47  import java.io.UnsupportedEncodingException;
48  import java.util.ArrayList;
49  import java.util.List;
50  import java.util.regex.Pattern;
51  
52  /**
53   * This class provides utility methods for DOM manipulation.
54   * It is separated from {@link HTMLDocument} so that its methods
55   * can be run on single DOM nodes without having to wrap them
56   * into an HTMLDocument.
57   * <p>
58   * We use a mix of XPath and DOM manipulation.
59   * </p>
60   * This is likely to be a performance bottleneck but at least
61   * everything is localized here.
62   */
63  public class DomUtils {
64  
65      private static final String[] EMPTY_STRING_ARRAY = new String[0];
66          
67      private final static XPath xPathEngine = XPathFactory.newInstance().newXPath();
68  
69      private DomUtils(){}
70  
71      /**
72       * Given a node this method returns the index corresponding to such node
73       * within the list of the children of its parent node.
74       *
75       * @param n the node of which returning the index.
76       * @return a non negative number.
77       */
78      public static int getIndexInParent(Node n) {
79          Node parent = n.getParentNode();
80          if(parent == null) {
81              return 0;
82          }
83          NodeList nodes = parent.getChildNodes();
84          int counter = -1;
85          for(int i = 0; i < nodes.getLength(); i++) {
86              Node current = nodes.item(i);
87              if ( current.getNodeType() == n.getNodeType() && current.getNodeName().equals( n.getNodeName() ) ) {
88                  counter++;
89              }
90              if( current.equals(n) ) {
91                  return counter;
92              }
93          }
94          throw new IllegalStateException("Cannot find a child within its parent node list.");
95      }
96  
97      /**
98       * Does a reverse walking of the DOM tree to generate a unique XPath
99       * expression leading to this node. The XPath generated is the canonical
100      * one based on sibling index: /html[1]/body[1]/div[2]/span[3] etc..
101      *
102      * @param node the input node.
103      * @return the XPath location of node as String.
104      */
105     public static String getXPathForNode(Node node) {
106         final StringBuilder sb = new StringBuilder();
107         Node parent = node;
108         while(parent != null && parent.getNodeType() != Node.DOCUMENT_NODE) {
109             sb.insert(0, "]");
110             sb.insert(0, getIndexInParent(parent) + 1);
111             sb.insert(0, "[");
112             sb.insert(0, parent.getNodeName());
113             sb.insert(0, "/");
114             parent = parent.getParentNode();
115         }
116         return sb.toString();
117     }
118 
119     /**
120      * Returns a list of tag names representing the path from
121      * the document root to the given node <i>n</i>.
122      *
123      * @param n the node for which retrieve the path.
124      * @return a sequence of HTML tag names.
125      */
126     public static String[] getXPathListForNode(Node n) {
127         if(n == null) {
128             return EMPTY_STRING_ARRAY;
129         }
130         List<String> ancestors = new ArrayList<String>();
131         ancestors.add( String.format("%s[%s]", n.getNodeName(), getIndexInParent(n) ) );
132         Node parent = n.getParentNode();
133         while(parent != null) {
134             ancestors.add(0, String.format("%s[%s]", parent.getNodeName(), getIndexInParent(parent) ) );
135             parent = parent.getParentNode();
136         }
137         return ancestors.toArray( new String[ancestors.size()] );
138     }
139 
140     /**
141      * Returns the row/col location of the given node.
142      *
143      * @param n input node.
144      * @return an array of two elements of type
145      *         <code>[&lt;begin-row&gt;, &lt;begin-col&gt;, &lt;end-row&gt; &lt;end-col&gt;]</code>
146      *         or <code>null</code> if not possible to extract such data.
147      */
148     public static int[] getNodeLocation(Node n) {
149         if(n == null) throw new NullPointerException("node cannot be null.");
150         final TagSoupParser.ElementLocation elementLocation =
151             (TagSoupParser.ElementLocation) n.getUserData( TagSoupParser.ELEMENT_LOCATION );
152         if(elementLocation == null) return null;
153         return new int[]{
154                 elementLocation.getBeginLineNumber(),
155                 elementLocation.getBeginColumnNumber(),
156                 elementLocation.getEndLineNumber(),
157                 elementLocation.getEndColumnNumber()
158         };
159     }
160 
161     /**
162      * Checks whether a node is ancestor or same of another node.
163      *
164      * @param candidateAncestor the candidate ancestor node.
165      * @param candidateSibling the candidate sibling node.
166      * @param strict if <code>true</code> is not allowed that the ancestor and sibling can be the same node.
167      * @return <code>true</code> if <code>candidateSibling</code> is ancestor of <code>candidateSibling</code>,
168      *         <code>false</code> otherwise.
169      */
170     public static boolean isAncestorOf(Node candidateAncestor, Node candidateSibling, boolean strict) {
171         if(candidateAncestor == null) throw new NullPointerException("candidate ancestor cannot be null null.");
172         if(candidateSibling  == null) throw new NullPointerException("candidate sibling cannot be null null." );
173         if(strict && candidateAncestor.equals(candidateSibling)) return false;
174         Node parent = candidateSibling;
175         while(parent != null) {
176             if(parent.equals(candidateAncestor)) return true;
177             parent = parent.getParentNode();
178         }
179         return false;
180     }
181 
182     /**
183      * Checks whether a node is ancestor or same of another node. As
184      * {@link #isAncestorOf(org.w3c.dom.Node, org.w3c.dom.Node, boolean)} with <code>strict=false</code>.
185      *
186      * @param candidateAncestor the candidate ancestor node.
187      * @param candidateSibling the candidate sibling node.
188      * @return <code>true</code> if <code>candidateSibling</code> is ancestor of <code>candidateSibling</code>,
189      *         <code>false</code> otherwise.
190      */
191     public static boolean isAncestorOf(Node candidateAncestor, Node candidateSibling) {
192         return isAncestorOf(candidateAncestor, candidateSibling, false);
193     }
194 
195     /**
196      * Finds all nodes that have a declared class.
197      * Note that the className is transformed to lower case before being
198      * matched against the DOM.
199      * @param root the root node from which start searching.
200      * @param className the name of the filtered class.
201      * @return list of matching nodes or an empty list.
202      */
203     public static List<Node> findAllByClassName(Node root, String className) {
204         return findAllBy(root, null, "class", className.toLowerCase());
205     }
206 
207     /**
208      * Finds all nodes that have a declared attribute.
209      * Note that the className is transformed to lower case before being
210      * matched against the DOM.
211      * @param root the root node from which start searching.
212      * @param attrName the name of the filtered attribue.
213      * @return list of matching nodes or an empty list.
214      */
215     public static List<Node> findAllByAttributeName(Node root, String attrName) {
216         return findAllBy(root, null, attrName, null);
217     }
218     
219    public static List<Node> findAllByAttributeContains(Node node, String attrName, String attrContains) {
220        return findAllBy(node, null, attrName, attrContains);
221    }
222 
223     public static List<Node> findAllByTag(Node root, String tagName) {
224            return findAllBy(root, tagName, null, null);
225     }
226     
227     public static List<Node> findAllByTagAndClassName(Node root, final String tagName, final String className) {
228        return findAllBy(root, tagName, "class", className);
229     }
230 
231     /**
232      * Mimics the JS DOM API, or prototype's $()
233      * @param root the node to locate
234      * @param id the id of the node to locate
235      * @return the {@link org.w3c.dom.Node} if one exists
236      */
237     public static Node findNodeById(Node root, String id) {
238         Node node;
239         try {
240             String xpath = "//*[@id='" + id + "']";
241             node = (Node) xPathEngine.evaluate(xpath, root, XPathConstants.NODE);
242         } catch (XPathExpressionException ex) {
243             throw new RuntimeException("Should not happen", ex);
244         }
245         return node;
246     }
247 
248     /**
249      * Returns a NodeList composed of all the nodes that match an XPath
250      * expression, which must be valid.
251      * @param node the node object to locate
252      * @param xpath an xpath expression
253      * @return a list of {@link org.w3c.dom.Node}'s if they exists
254      */
255     public static List<Node> findAll(Node node, String xpath) {
256         if(node == null) {
257             throw new NullPointerException("node cannot be null.");
258         }
259         try {
260             NodeList nodes = (NodeList) xPathEngine.evaluate(xpath, node, XPathConstants.NODESET);
261             List<Node> result = new ArrayList<Node>(nodes.getLength());
262             for (int i = 0; i < nodes.getLength(); i++) {
263                 result.add(nodes.item(i));
264             }
265             return result;
266         } catch (XPathExpressionException ex) {
267             throw new IllegalArgumentException("Illegal XPath expression: " + xpath, ex);
268         }
269     }
270 
271     /**
272      * Gets the string value of an XPath expression.
273      * @param node the node object to locate
274      * @param xpath an xpath expression
275      * @return a string xpath value
276      */
277     public static String find(Node node, String xpath) {
278         try {
279             String val = (String) xPathEngine.evaluate(xpath, node, XPathConstants.STRING);
280             if (null == val)
281                 return "";
282             return val;
283         } catch (XPathExpressionException ex) {
284             throw new IllegalArgumentException("Illegal XPath expression: " + xpath, ex);
285         }
286     }
287 
288     /**
289      * Tells if an element has a class name <b>not checking the parents
290      * in the hierarchy</b> mimicking the <i>CSS</i> .foo match.
291      * @param node the node object to locate
292      * @param className the CSS class name
293      * @return true if the class name exists
294      */
295     public static boolean hasClassName(Node node, String className) {
296         return hasAttribute(node, "class", className);
297     }
298 
299     /**
300      * Checks the presence of an attribute value in attributes that
301      * contain whitespace-separated lists of values. The semantic is the
302      * CSS classes' ones: "foo" matches "bar foo", "foo" but not "foob"
303      * @param node the node object to locate
304      * @param attributeName attribute value
305      * @param className the CSS class name
306      * @return true if the class has the attribute name
307      */
308     public static boolean hasAttribute(Node node, String attributeName, String className) {
309         // regex love, maybe faster but less easy to understand
310         // Pattern pattern = Pattern.compile("(^|\\s+)"+className+"(\\s+|$)");
311         String attr = readAttribute(node, attributeName);
312         for (String c : attr.split("\\s+"))
313             if (c.equalsIgnoreCase(className))
314                 return true;
315         return false;
316     }
317 
318      /**
319      * Checks the presence of an attribute in the given <code>node</code>.
320       *
321       * @param node the node container.
322       * @param attributeName the name of the attribute.
323       * @return true if the attribute is present
324       */
325     public static boolean hasAttribute(Node node, String attributeName) {
326         return readAttribute(node, attributeName, null) != null;
327     }
328 
329     /**
330      * Verifies if the given target node is an element.
331      *
332      * @param target target node to check
333      * @return <code>true</code> if the element the node is an element,
334      *         <code>false</code> otherwise.
335      */
336     public static boolean isElementNode(Node target) {
337         return Node.ELEMENT_NODE == target.getNodeType();
338     }
339 
340     /**
341      * Reads the value of the specified <code>attribute</code>, returning the
342      * <code>defaultValue</code> string if not present.
343      *
344      * @param node node to read the attribute.
345      * @param attribute attribute name.
346      * @param defaultValue the default value to return if attribute is not found.
347      * @return the attribute value or <code>defaultValue</code> if not found.
348      */
349     public static String readAttribute(Node node, String attribute, String defaultValue) {
350         NamedNodeMap attributes = node.getAttributes();
351         if (null == attributes)
352             return defaultValue;
353         Node attr = attributes.getNamedItem(attribute);
354         if (null==attr)
355             return defaultValue;
356         return attr.getNodeValue();
357     }
358 
359     /**
360      * Reads the value of the first <i>attribute</i> which name matches with the specified <code>attributePrefix</code>.
361      * Returns the <code>defaultValue</code> if not found.
362      *
363      * @param node node to look for attributes.
364      * @param attributePrefix attribute prefix.
365      * @param defaultValue default returned value.
366      * @return the value found or default.
367      */
368     public static String readAttributeWithPrefix(Node node, String attributePrefix, String defaultValue) {
369         final NamedNodeMap attributes = node.getAttributes();
370         if (null == attributes) {
371             return defaultValue;
372         }
373         Node attribute;
374         for (int a = 0; a < attributes.getLength(); a++) {
375             attribute = attributes.item(a);
376             if (attribute.getNodeName().startsWith(attributePrefix)) {
377                 return attribute.getNodeValue();
378             }
379         }
380         return defaultValue;
381     }
382 
383     /**
384      * Reads the value of an <code>attribute</code>, returning the
385      * empty string if not present.
386      *
387      * @param node node to read the attribute.
388      * @param attribute attribute name.
389      * @return the attribute value or <code>""</code> if not found.
390      */
391     public static String readAttribute(Node node, String attribute) {
392         return readAttribute(node, attribute, "");
393     }
394 
395     /**
396      * Given a <i>DOM</i> {@link Node} produces the <i>XML</i> serialization
397      * omitting the <i>XML declaration</i>.
398      *
399      * @param node node to be serialized.
400      * @param indent if <code>true</code> the output is indented.
401      * @return the XML serialization.
402      * @throws TransformerException if an error occurs during the
403      *         serializator initialization and activation.
404      * @throws java.io.IOException if there is an error locating the node
405      */
406     public static String serializeToXML(Node node, boolean indent) throws TransformerException, IOException {
407         final DOMSource domSource = new DOMSource(node);
408         final Transformer transformer = TransformerFactory.newInstance().newTransformer();
409         transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
410         transformer.setOutputProperty(OutputKeys.METHOD, "xml");
411         transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
412         if(indent) {
413             transformer.setOutputProperty(OutputKeys.INDENT, "yes");
414             transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");
415         }
416         final StringWriter sw = new StringWriter();
417         final StreamResult sr = new StreamResult(sw);
418         transformer.transform(domSource, sr);
419         sw.close();
420         return sw.toString();
421     }
422 
423     /**
424      * High performance implementation of {@link #findAll(org.w3c.dom.Node, String)}.
425      *
426      * @param root root node to start search.
427      * @param tagName name of target tag.
428      * @param attrName name of attribute filter.
429      * @param attrContains expected content for attribute.
430      * @return a {@link java.util.List} of {@link org.w3c.dom.Node}'s
431      */
432     private static List<Node> findAllBy(Node root, final String tagName, final String attrName, String attrContains) {
433         DocumentTraversal documentTraversal = (DocumentTraversal) root.getOwnerDocument();
434         if (documentTraversal == null) {
435             documentTraversal = (DocumentTraversal) root;
436         }
437 
438         final Pattern attrContainsPattern;
439         if (attrContains != null && !attrContains.equals("*")) {
440             attrContainsPattern = Pattern.compile("(^|\\s)" + attrContains + "(\\s|$)", Pattern.CASE_INSENSITIVE);
441         } else {
442             attrContainsPattern = null;
443         }
444 
445         final List<Node> result = new ArrayList<Node>();
446         NodeIterator nodeIterator = documentTraversal.createNodeIterator(
447                 root,
448                 NodeFilter.SHOW_ELEMENT,
449                 new NodeFilter() {
450                     @Override
451                     public short acceptNode(Node node) {
452                         if (node.getNodeType() == Node.ELEMENT_NODE) {
453                             if (tagName != null && !tagName.equals("*") && !tagName.equals(node.getNodeName())) {
454                                 // tagName given but doesn't match.
455                                 return FILTER_ACCEPT;
456                             }
457 
458                             if (attrName != null) {
459                                 Node attrNameNode = node.getAttributes().getNamedItem(attrName);
460                                 if (attrNameNode == null) {
461                                     // attrName given but doesn't match
462                                     return FILTER_ACCEPT;
463                                 }
464 
465                                 if (
466                                         attrContainsPattern != null
467                                                 &&
468                                                 !attrContainsPattern.matcher(attrNameNode.getNodeValue()).find()
469                                         ) {
470                                     // attrContains given but doesn't match
471                                     return FILTER_ACCEPT;
472                                 }
473                             }
474                             result.add(node);
475                         }
476                         return FILTER_ACCEPT;
477                     }
478                 }, false);
479 
480         // To populate result we only need to iterate...
481         while (nodeIterator.nextNode() != null) ;
482 
483         // We have to explicitly declare we are done with this nodeIterator to free it's resources.
484         nodeIterator.detach();
485 
486         return result;
487     }
488     
489     /**
490      * Given a {@link org.w3c.dom.Document} this method will return an
491      * input stream representing that document.
492      * @param doc the input {@link org.w3c.dom.Document}
493      * @return an {@link java.io.InputStream}
494      */
495     public static InputStream documentToInputStream(Document doc) {
496       DOMSource source = new DOMSource(doc);
497       StringWriter xmlAsWriter = new StringWriter();
498       StreamResult result = new StreamResult(xmlAsWriter);
499       try {
500         TransformerFactory.newInstance().newTransformer().transform(source, result);
501       } catch (TransformerConfigurationException e) {
502         throw new RuntimeException("Error within Document to InputStream transformation configuration!");
503       } catch (TransformerException e) {
504         throw new RuntimeException("Error whilst transforming the Document to InputStream!");
505       } catch (TransformerFactoryConfigurationError e) {
506         throw new RuntimeException("Error within Document to InputStream transformation configuration!");
507       }
508        
509       InputStream is = null;
510       try {
511         is = new ByteArrayInputStream(xmlAsWriter.toString().getBytes("UTF-8"));
512       } catch (UnsupportedEncodingException e) {
513         e.printStackTrace();
514       }
515       return is;
516     }
517     
518 
519     /**
520      * Convert a w3c dom node to a InputStream
521      * @param node {@link org.w3c.dom.Node} to convert
522      * @return the converted {@link java.io.InputStream}
523      */
524     public static InputStream nodeToInputStream(Node node) {
525         ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
526         Result outputTarget = new StreamResult(outputStream);
527         Transformer t = null;
528         try {
529           t = TransformerFactory.newInstance().newTransformer();
530         } catch (TransformerConfigurationException e) {
531           e.printStackTrace();
532         } catch (TransformerFactoryConfigurationError e) {
533           e.printStackTrace();
534         }
535         t.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
536         try {
537           t.transform(new DOMSource(node), outputTarget);
538         } catch (TransformerException e) {
539           e.printStackTrace();
540         }
541         return new ByteArrayInputStream(outputStream.toByteArray());
542     }
543 
544 
545 
546 }