View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.rdf.Any23ValueFactoryWrapper;
22  import org.apache.any23.rdf.RDFUtils;
23  import org.eclipse.rdf4j.model.IRI;
24  import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
25  import org.slf4j.Logger;
26  import org.slf4j.LoggerFactory;
27  import org.w3c.dom.Document;
28  import org.w3c.dom.NamedNodeMap;
29  import org.w3c.dom.Node;
30  import org.w3c.dom.NodeList;
31  import org.w3c.dom.Text;
32  
33  import javax.xml.xpath.XPath;
34  import javax.xml.xpath.XPathConstants;
35  import javax.xml.xpath.XPathExpressionException;
36  import javax.xml.xpath.XPathFactory;
37  import java.net.URISyntaxException;
38  import java.util.ArrayList;
39  import java.util.List;
40  
41  /**
42   * A wrapper around the DOM representation of an HTML document.
43   * Provides convenience access to various parts of the document.
44   *
45   * @author Gabriele Renzi
46   * @author Michele Mostarda
47   */
48  public class HTMLDocument {
49  
50      private final static XPath xPathEngine = XPathFactory.newInstance().newXPath();
51      private final static Logger log        = LoggerFactory.getLogger(HTMLDocument.class);
52  
53      private Node         document;
54      private java.net.URI baseIRI;
55  
56      private final Any23ValueFactoryWrapper valueFactory =
57              new Any23ValueFactoryWrapper(SimpleValueFactory.getInstance());
58  
59      /**
60       * Reads a text field from the given node adding the content to the given <i>res</i> list.
61       *
62       * @param node the node from which read the content.
63       * @return a valid TextField
64       */
65      public static TextField readTextField(Node node) {
66          TextField result;
67          final String name = node.getNodeName();
68          final NamedNodeMap attributes = node.getAttributes();
69          // excess of safety check, should be impossible
70          if (attributes == null ) {
71              return new TextField( node.getTextContent(), node);
72          }
73          // first check if there are values inside
74          List<Node> values = DomUtils.findAllByClassName(node, "value");
75          if (!values.isEmpty()) {
76              String val = "";
77              for (Node n : values)
78                  val += n.getTextContent();
79              return new TextField( val.trim(), node);
80          }
81          if ("ABBR".equals(name) && (null != attributes.getNamedItem("title"))) {
82              result = new TextField(attributes.getNamedItem("title").getNodeValue(), node);
83          } else if ("A".equals(name)) {
84              if (DomUtils.hasAttribute(node, "rel", "tag")) {
85                  String href = extractRelTag(attributes);
86                  result = new TextField(href, node);
87              } else
88                  result = new TextField(node.getTextContent(), node);
89          } else if (("IMG".equals(name) || "AREA".equals(name)) && (null != attributes.getNamedItem("alt"))) {
90              result = new TextField(attributes.getNamedItem("alt").getNodeValue(), node);
91          } else {
92              result = new TextField(node.getTextContent(), node);
93          }
94          return result;
95      }
96  
97      /**
98       * Reads an URL field from the given node adding the content to the given <i>res</i> list.
99       *
100      * @param res {@link java.util.List} of 
101      * {@link org.apache.any23.extractor.html.HTMLDocument.TextField}
102      * @param node the node to read
103      */
104     public static void readUrlField(List<TextField> res, Node node) {
105         String name = node.getNodeName();
106         NamedNodeMap attributes = node.getAttributes();
107         if (null == attributes) {
108             res.add( new TextField(node.getTextContent(), node) );
109             return;
110         }
111         if ("A".equals(name) || "AREA".equals(name)) {
112             Node n = attributes.getNamedItem("href");
113             if (n != null) {
114                 res.add(new TextField(n.getNodeValue(), n));
115             }
116         } else if ("ABBR".equals(name)) {
117             Node n = attributes.getNamedItem("title");
118             if (n != null) {
119                 res.add(new TextField(n.getNodeValue(), n));
120             }
121         } else if ("IMG".equals(name)) {
122             Node n = attributes.getNamedItem("src");
123             if (n != null) {
124                 res.add(new TextField(n.getNodeValue(), n));
125             } else {
126                 n = attributes.getNamedItem("srcset");
127                 if (n != null) {
128                     res.add(new TextField(n.getNodeValue().split("[\\s,]+")[0], n));
129                 }
130             }
131         } else if ("OBJECT".equals(name)) {
132             Node n = attributes.getNamedItem("data");
133             if (n != null) {
134                 res.add(new TextField(n.getNodeValue(), n));
135             }
136         } else {
137             res.add( new TextField(extractHCardTextContent(node), node) );
138         }
139     }
140 
141     private static String extractHCardTextContent(Node node) {
142         StringBuilder sb = new StringBuilder();
143         NodeList nodes = node.getChildNodes();
144         //if at least one element with 'value' class, concatenate all text in value
145         if (extractTextInValue(nodes, sb) == 0) {
146             //otherwise, concatenate all text not in elements with 'type' class
147             extractTextNotInType(nodes, sb);
148         }
149         return sb.toString();
150     }
151 
152     private static int extractTextInValue(NodeList nodes, StringBuilder b) {
153         int count = 0;
154         for (int i = 0, len = nodes.getLength(); i < len; i++) {
155             Node n = nodes.item(i);
156             if (DomUtils.hasClassName(n, "value")) {
157                 count++;
158                 b.append(n.getTextContent().trim());
159             } else {
160                 count += extractTextInValue(n.getChildNodes(), b);
161             }
162         }
163         return count;
164     }
165 
166     private static void extractTextNotInType(NodeList nodes, StringBuilder b) {
167         for (int i = 0, len = nodes.getLength(); i < len; i++) {
168             Node n = nodes.item(i);
169             if (n.getNodeType() == Node.TEXT_NODE) {
170                 b.append(n.getNodeValue().trim());
171             } else if (!DomUtils.hasClassName(n, "type")) {
172                 extractTextNotInType(n.getChildNodes(), b);
173             }
174         }
175     }
176 
177     /**
178      * Extracts the href specific rel-tag string.
179      * See the <a href="http://microformats.org/wiki/rel-tag">rel-tag</a> specification.
180      *
181      * @param hrefAttributeContent the content of the <i>href</i> attribute.
182      * @return the rel-tag specification.
183      */
184     public static String extractRelTag(String hrefAttributeContent) {
185         String[] all = hrefAttributeContent.split("[#?]");
186         // Cleanup spurious segments.
187         String path = all[0];
188         int pathLenghtMin1 = path.length() - 1;
189         if( '/' == path.charAt(pathLenghtMin1) ) {
190             path = path.substring(0, pathLenghtMin1);
191         }
192         return path;
193     }
194 
195     /**
196      * Extracts the href specific rel-tag string.
197      * See the <a href="http://microformats.org/wiki/rel-tag">rel-tag</a> specification.
198      *
199      * @param attributes the list of attributes of a node.
200      * @return the rel-tag specification.
201      */
202     public static String extractRelTag(NamedNodeMap attributes) {
203         return extractRelTag(attributes.getNamedItem("href").getNodeValue());
204     }
205 
206     /**
207      * Reads the text content of the given node and returns it.
208      * If the <code>prettify</code> flag is <code>true</code>
209      * the text is cleaned up.
210      *
211      * @param node node to read content.
212      * @param prettify if <code>true</code> blank chars will be removed.
213      * @return the read text.
214      */
215     public static String readNodeContent(Node node, boolean prettify) {
216         final String content = node.getTextContent();
217         return prettify ? content.trim().replaceAll("\\n", " ").replaceAll(" +", " ") : content;
218     }
219 
220     /**
221      * Constructor accepting the root node.
222      * 
223      * @param document a {@link org.w3c.dom.Node}
224      */
225     public HTMLDocument(Node document) {
226         if (null == document)
227             throw new IllegalArgumentException("node cannot be null when constructing an HTMLDocument");
228         this.document = document;
229     }
230 
231     /**
232      * @param uri string to resolve to {@link org.eclipse.rdf4j.model.IRI}
233      * @return An absolute IRI, or null if the IRI is not fixable
234      * @throws org.apache.any23.extractor.ExtractionException If the base IRI is invalid
235      */
236     public IRI resolveIRI(String uri) throws ExtractionException {
237         return valueFactory.resolveIRI(uri, getBaseIRI());
238     }
239 
240     public String find(String xpath) {
241         return DomUtils.find(getDocument(), xpath);
242     }
243 
244     public Node findNodeById(String id) {
245         return DomUtils.findNodeById(getDocument(), id);
246     }
247 
248     public List<Node> findAll(String xpath) {
249         return DomUtils.findAll(getDocument(), xpath);
250     }
251 
252     public String findMicroformattedValue(
253             String objectTag,
254             String object,
255             String fieldTag,
256             String field,
257             String key
258     ) {
259         Node node = findMicroformattedObjectNode(objectTag, object);
260         if (null == node)
261             return "";
262         // try to check if it is inline
263         if (DomUtils.hasClassName(node, field))
264             return node.getTextContent();
265 
266         // failed, try to find it in a child
267         try {
268             String xpath = ".//" + fieldTag + "[contains(@class, '" + field + "')]/" + key;
269             String value = (String) xPathEngine.evaluate(xpath, node, XPathConstants.STRING);
270             if (null == value) {
271                 return "";
272             }
273             return value;
274         } catch (XPathExpressionException ex) {
275             throw new RuntimeException("Should not happen, XPath expression is built locally", ex);
276         }
277 
278     }
279 
280     public Node getDocument() {
281         return document;
282     }
283 
284     /**
285      * Returns a singular text field. 
286      *
287      * @param className name of class containing text.
288      * @return if multiple values are found just the first is returned,
289      * if we want to check that there are no n-ary values use plural finder
290      */
291     public TextField getSingularTextField(String className) {
292         TextField[] res = getPluralTextField(className);
293         if (res.length == 0)
294             return new TextField("", null);
295         return res[0];
296     }
297 
298     /**
299      * Returns a plural text field.
300      * 
301      * @param className name of class node containing text.
302      * @return list of fields.
303      */
304     public TextField[] getPluralTextField(String className) {
305         List<TextField> res = new ArrayList<TextField>();
306         List<Node> nodes = DomUtils.findAllByClassName(getDocument(), className);
307         for (Node node : nodes) {
308             res.add( readTextField(node) );
309         }
310         return res.toArray( new TextField[res.size()] );
311     }
312 
313     /**
314      * Returns the URL associated to the field marked with class <i>className</i>.
315      *
316      * @param className name of node class containing the URL field.
317      * @return if multiple values are found just the first is returned,
318      *  if we want to check that there are no n-ary values use plural finder
319      */
320     public TextField getSingularUrlField(String className) {
321         TextField[] res = getPluralUrlField(className);
322         if (res.length < 1)
323             return new TextField("", null);
324         return res[0];
325     }
326 
327     /**
328      * Returns the list of URLs associated to the fields marked with class <i>className</i>.
329      *
330      * @param className name of node class containing the URL field.
331      * @return the list of {@link HTMLDocument.TextField} found.
332      */
333     public TextField[] getPluralUrlField(String className) {
334         List<TextField> res = new ArrayList<TextField>();
335         List<Node> nodes = DomUtils.findAllByClassName(getDocument(), className);
336         for (Node node : nodes)
337             readUrlField(res, node);
338         return res.toArray( new TextField[res.size()] );
339     }
340 
341     public Node findMicroformattedObjectNode(String objectTag, String name) {
342         List<Node> nodes = DomUtils.findAllByTagAndClassName(getDocument(), objectTag, name);
343         if (nodes.isEmpty())
344             return null;
345         return nodes.get(0);
346     }
347 
348     /**
349      * Read an attribute avoiding NullPointerExceptions, if the attr is
350      * missing it just returns an empty string.
351      *
352      * @param attribute the attribute name.
353      * @return the string representing the attribute.
354      */
355     public String readAttribute(String attribute) {
356         return DomUtils.readAttribute(getDocument(), attribute);
357     }
358 
359     /**
360      * Finds all the nodes by class name.
361      *
362      * @param clazz the class name.
363      * @return list of matching nodes.
364      */
365     public List<Node> findAllByClassName(String clazz) {
366         return DomUtils.findAllByClassName(getDocument(), clazz);
367     }
368 
369     /**
370      * Returns the text contained inside a node if leaf,
371      * <code>null</code> otherwise.
372      *
373      * @return the text of a leaf node.
374      */
375     public String getText() {
376         NodeList children = getDocument().getChildNodes();
377         if(children.getLength() == 1 && children.item(0) instanceof Text) {
378             return children.item(0).getTextContent();
379         }
380         return null;
381     }
382 
383     /**
384      * Returns the document default language.
385      *
386      * @return default language if any, <code>null</code> otherwise.
387      */
388     public String getDefaultLanguage() {
389         final String xpathLanguageSelector = "/HTML";
390         Node html;
391         try {
392             html = (Node) xPathEngine.evaluate(xpathLanguageSelector, document, XPathConstants.NODE);
393         } catch (XPathExpressionException xpeee) {
394             throw new IllegalStateException();
395         }
396         if (html == null) {
397             return null;
398         }
399         Node langAttribute = html.getAttributes().getNamedItem("xml:lang");
400         return langAttribute == null ? null : langAttribute.getTextContent();
401     }
402 
403     /**
404      * Returns the sequence of ancestors from the document root to the local root (document).
405      *
406      * @return a sequence of node names.
407      */
408     public String[] getPathToLocalRoot() {
409         return DomUtils.getXPathListForNode(document);
410     }
411 
412     /**
413      * Extracts all the <code>rel</code> tag nodes.
414      *
415      * @return list of rel tag nodes.
416      */
417     public TextField[] extractRelTagNodes() {
418         final List<Node> relTagNodes = DomUtils.findAllByAttributeName(getDocument(), "rel");
419         final List<TextField> result = new ArrayList<TextField>();
420         for(Node relTagNode : relTagNodes) {
421             readUrlField(result, relTagNode);
422         }
423         return result.toArray( new TextField[result.size()] );
424     }
425 
426     private java.net.URI getBaseIRI() throws ExtractionException {
427         if (baseIRI == null) {
428             // document.getBaseURI() returns null for document URIs with
429             // special characters, e.g., http://semanticweb.org/wiki/Knud_Möller
430             // It also does *not* take html "base" elements into account.
431             // (But it does take into account urls specified by the attribute "xml:base".)
432 
433             // So, for now, let's use getDocumentURI() instead.
434             // TODO: Make this approach better.
435 
436             Document doc = document instanceof Document ? (Document)document : document.getOwnerDocument();
437 
438             if (doc == null) {
439                 throw new ExtractionException("Node " + document.getNodeName() + " was not associated with a document.");
440             }
441 
442             String uri = doc.getDocumentURI();
443 
444             if (uri == null) {
445                 throw new ExtractionException("document URI is null, this should not happen");
446             }
447 
448             try {
449                 baseIRI = new java.net.URI(RDFUtils.fixAbsoluteIRI(uri));
450             } catch (IllegalArgumentException ex) {
451                 throw new ExtractionException("Error in base IRI: " + uri, ex);
452             } catch (URISyntaxException ex) {
453                 throw new ExtractionException("Error in base IRI: " + uri, ex);
454             }
455         }
456         return baseIRI;
457     }
458 
459     /**
460      * This class represents a text extracted from the <i>HTML</i> DOM related
461      * to the node from which such test has been retrieved.
462      */
463     public static class TextField {
464         private String value;
465         private Node   source;
466 
467         public TextField(String value, Node source) {
468             this.value = value;
469             this.source = source;
470         }
471 
472         public String value() {
473             return value;
474         }
475 
476         public Node source() {
477             return source;
478         }
479     }
480 
481 }