View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.rdf.Any23ValueFactoryWrapper;
22  import org.apache.any23.rdf.RDFUtils;
23  import org.eclipse.rdf4j.model.IRI;
24  import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
25  import org.slf4j.Logger;
26  import org.slf4j.LoggerFactory;
27  import org.w3c.dom.Document;
28  import org.w3c.dom.NamedNodeMap;
29  import org.w3c.dom.Node;
30  import org.w3c.dom.NodeList;
31  import org.w3c.dom.Text;
32  
33  import javax.xml.xpath.XPath;
34  import javax.xml.xpath.XPathConstants;
35  import javax.xml.xpath.XPathExpressionException;
36  import javax.xml.xpath.XPathFactory;
37  import java.net.URISyntaxException;
38  import java.util.ArrayList;
39  import java.util.List;
40  
41  /**
42   * A wrapper around the DOM representation of an HTML document. Provides convenience access to various parts of the
43   * document.
44   *
45   * @author Gabriele Renzi
46   * @author Michele Mostarda
47   */
48  public class HTMLDocument {
49  
50      private final static XPath xPathEngine = XPathFactory.newInstance().newXPath();
51      private final static Logger log = LoggerFactory.getLogger(HTMLDocument.class);
52  
53      private Node document;
54      private java.net.URI baseIRI;
55  
56      private final Any23ValueFactoryWrapperl#Any23ValueFactoryWrapper">Any23ValueFactoryWrapper valueFactory = new Any23ValueFactoryWrapper(
57              SimpleValueFactory.getInstance());
58  
59      /**
60       * Reads a text field from the given node adding the content to the given <i>res</i> list.
61       *
62       * @param node
63       *            the node from which read the content.
64       * 
65       * @return a valid TextField
66       */
67      public static TextField readTextField(Node node) {
68          TextField result;
69          final String name = node.getNodeName();
70          final NamedNodeMap attributes = node.getAttributes();
71          // excess of safety check, should be impossible
72          if (attributes == null) {
73              return new TextField(node.getTextContent(), node);
74          }
75          // first check if there are values inside
76          List<Node> values = DomUtils.findAllByClassName(node, "value");
77          if (!values.isEmpty()) {
78              StringBuilder val = new StringBuilder();
79              for (Node n : values)
80                  val.append(n.getTextContent());
81              return new TextField(val.toString().trim(), node);
82          }
83          if ("ABBR".equals(name) && (null != attributes.getNamedItem("title"))) {
84              result = new TextField(attributes.getNamedItem("title").getNodeValue(), node);
85          } else if ("A".equals(name)) {
86              if (DomUtils.hasAttribute(node, "rel", "tag")) {
87                  String href = extractRelTag(attributes);
88                  result = new TextField(href, node);
89              } else
90                  result = new TextField(node.getTextContent(), node);
91          } else if (("IMG".equals(name) || "AREA".equals(name)) && (null != attributes.getNamedItem("alt"))) {
92              result = new TextField(attributes.getNamedItem("alt").getNodeValue(), node);
93          } else {
94              result = new TextField(node.getTextContent(), node);
95          }
96          return result;
97      }
98  
99      /**
100      * Reads an URL field from the given node adding the content to the given <i>res</i> list.
101      *
102      * @param res
103      *            {@link java.util.List} of {@link org.apache.any23.extractor.html.HTMLDocument.TextField}
104      * @param node
105      *            the node to read
106      */
107     public static void readUrlField(List<TextField> res, Node node) {
108         String name = node.getNodeName();
109         NamedNodeMap attributes = node.getAttributes();
110         if (null == attributes) {
111             res.add(new TextField(node.getTextContent(), node));
112             return;
113         }
114         if ("A".equals(name) || "AREA".equals(name)) {
115             Node n = attributes.getNamedItem("href");
116             if (n != null) {
117                 res.add(new TextField(n.getNodeValue(), n));
118             }
119         } else if ("ABBR".equals(name)) {
120             Node n = attributes.getNamedItem("title");
121             if (n != null) {
122                 res.add(new TextField(n.getNodeValue(), n));
123             }
124         } else if ("IMG".equals(name)) {
125             Node n = attributes.getNamedItem("src");
126             if (n != null) {
127                 res.add(new TextField(n.getNodeValue(), n));
128             } else {
129                 n = attributes.getNamedItem("srcset");
130                 if (n != null) {
131                     res.add(new TextField(n.getNodeValue().split("[\\s,]+")[0], n));
132                 }
133             }
134         } else if ("OBJECT".equals(name)) {
135             Node n = attributes.getNamedItem("data");
136             if (n != null) {
137                 res.add(new TextField(n.getNodeValue(), n));
138             }
139         } else {
140             res.add(new TextField(extractHCardTextContent(node), node));
141         }
142     }
143 
144     private static String extractHCardTextContent(Node node) {
145         StringBuilder sb = new StringBuilder();
146         NodeList nodes = node.getChildNodes();
147         // if at least one element with 'value' class, concatenate all text in value
148         if (extractTextInValue(nodes, sb) == 0) {
149             // otherwise, concatenate all text not in elements with 'type' class
150             extractTextNotInType(nodes, sb);
151         }
152         return sb.toString();
153     }
154 
155     private static int extractTextInValue(NodeList nodes, StringBuilder b) {
156         int count = 0;
157         for (int i = 0, len = nodes.getLength(); i < len; i++) {
158             Node n = nodes.item(i);
159             if (DomUtils.hasClassName(n, "value")) {
160                 count++;
161                 b.append(n.getTextContent().trim());
162             } else {
163                 count += extractTextInValue(n.getChildNodes(), b);
164             }
165         }
166         return count;
167     }
168 
169     private static void extractTextNotInType(NodeList nodes, StringBuilder b) {
170         for (int i = 0, len = nodes.getLength(); i < len; i++) {
171             Node n = nodes.item(i);
172             if (n.getNodeType() == Node.TEXT_NODE) {
173                 b.append(n.getNodeValue().trim());
174             } else if (!DomUtils.hasClassName(n, "type")) {
175                 extractTextNotInType(n.getChildNodes(), b);
176             }
177         }
178     }
179 
180     /**
181      * Extracts the href specific rel-tag string. See the <a href="http://microformats.org/wiki/rel-tag">rel-tag</a>
182      * specification.
183      *
184      * @param hrefAttributeContent
185      *            the content of the <i>href</i> attribute.
186      * 
187      * @return the rel-tag specification.
188      */
189     public static String extractRelTag(String hrefAttributeContent) {
190         String[] all = hrefAttributeContent.split("[#?]");
191         // Cleanup spurious segments.
192         String path = all[0];
193         int pathLenghtMin1 = path.length() - 1;
194         if ('/' == path.charAt(pathLenghtMin1)) {
195             path = path.substring(0, pathLenghtMin1);
196         }
197         return path;
198     }
199 
200     /**
201      * Extracts the href specific rel-tag string. See the <a href="http://microformats.org/wiki/rel-tag">rel-tag</a>
202      * specification.
203      *
204      * @param attributes
205      *            the list of attributes of a node.
206      * 
207      * @return the rel-tag specification.
208      */
209     public static String extractRelTag(NamedNodeMap attributes) {
210         return extractRelTag(attributes.getNamedItem("href").getNodeValue());
211     }
212 
213     /**
214      * Reads the text content of the given node and returns it. If the <code>prettify</code> flag is <code>true</code>
215      * the text is cleaned up.
216      *
217      * @param node
218      *            node to read content.
219      * @param prettify
220      *            if <code>true</code> blank chars will be removed.
221      * 
222      * @return the read text.
223      */
224     public static String readNodeContent(Node node, boolean prettify) {
225         final String content = node.getTextContent();
226         return prettify ? content.trim().replaceAll("\\n", " ").replaceAll(" +", " ") : content;
227     }
228 
229     /**
230      * Constructor accepting the root node.
231      * 
232      * @param document
233      *            a {@link org.w3c.dom.Node}
234      */
235     public HTMLDocument(Node document) {
236         if (null == document)
237             throw new IllegalArgumentException("node cannot be null when constructing an HTMLDocument");
238         this.document = document;
239     }
240 
241     /**
242      * @param uri
243      *            string to resolve to {@link org.eclipse.rdf4j.model.IRI}
244      * 
245      * @return An absolute IRI, or null if the IRI is not fixable
246      * 
247      * @throws org.apache.any23.extractor.ExtractionException
248      *             If the base IRI is invalid
249      */
250     public IRI resolveIRI(String uri) throws ExtractionException {
251         return valueFactory.resolveIRI(uri, getBaseIRI());
252     }
253 
254     public String find(String xpath) {
255         return DomUtils.find(getDocument(), xpath);
256     }
257 
258     public Node findNodeById(String id) {
259         return DomUtils.findNodeById(getDocument(), id);
260     }
261 
262     public List<Node> findAll(String xpath) {
263         return DomUtils.findAll(getDocument(), xpath);
264     }
265 
266     public String findMicroformattedValue(String objectTag, String object, String fieldTag, String field, String key) {
267         Node node = findMicroformattedObjectNode(objectTag, object);
268         if (null == node)
269             return "";
270         // try to check if it is inline
271         if (DomUtils.hasClassName(node, field))
272             return node.getTextContent();
273 
274         // failed, try to find it in a child
275         try {
276             String xpath = ".//" + fieldTag + "[contains(@class, '" + field + "')]/" + key;
277             String value = (String) xPathEngine.evaluate(xpath, node, XPathConstants.STRING);
278             if (null == value) {
279                 return "";
280             }
281             return value;
282         } catch (XPathExpressionException ex) {
283             throw new RuntimeException("Should not happen, XPath expression is built locally", ex);
284         }
285 
286     }
287 
288     public Node getDocument() {
289         return document;
290     }
291 
292     /**
293      * Returns a singular text field.
294      *
295      * @param className
296      *            name of class containing text.
297      * 
298      * @return if multiple values are found just the first is returned, if we want to check that there are no n-ary
299      *         values use plural finder
300      */
301     public TextField getSingularTextField(String className) {
302         TextField[] res = getPluralTextField(className);
303         if (res.length == 0)
304             return new TextField("", null);
305         return res[0];
306     }
307 
308     /**
309      * Returns a plural text field.
310      * 
311      * @param className
312      *            name of class node containing text.
313      * 
314      * @return list of fields.
315      */
316     public TextField[] getPluralTextField(String className) {
317         List<TextField> res = new ArrayList<TextField>();
318         List<Node> nodes = DomUtils.findAllByClassName(getDocument(), className);
319         for (Node node : nodes) {
320             res.add(readTextField(node));
321         }
322         return res.toArray(new TextField[res.size()]);
323     }
324 
325     /**
326      * Returns the URL associated to the field marked with class <i>className</i>.
327      *
328      * @param className
329      *            name of node class containing the URL field.
330      * 
331      * @return if multiple values are found just the first is returned, if we want to check that there are no n-ary
332      *         values use plural finder
333      */
334     public TextField getSingularUrlField(String className) {
335         TextField[] res = getPluralUrlField(className);
336         if (res.length < 1)
337             return new TextField("", null);
338         return res[0];
339     }
340 
341     /**
342      * Returns the list of URLs associated to the fields marked with class <i>className</i>.
343      *
344      * @param className
345      *            name of node class containing the URL field.
346      * 
347      * @return the list of {@link HTMLDocument.TextField} found.
348      */
349     public TextField[] getPluralUrlField(String className) {
350         List<TextField> res = new ArrayList<TextField>();
351         List<Node> nodes = DomUtils.findAllByClassName(getDocument(), className);
352         for (Node node : nodes)
353             readUrlField(res, node);
354         return res.toArray(new TextField[res.size()]);
355     }
356 
357     public Node findMicroformattedObjectNode(String objectTag, String name) {
358         List<Node> nodes = DomUtils.findAllByTagAndClassName(getDocument(), objectTag, name);
359         if (nodes.isEmpty())
360             return null;
361         return nodes.get(0);
362     }
363 
364     /**
365      * Read an attribute avoiding NullPointerExceptions, if the attr is missing it just returns an empty string.
366      *
367      * @param attribute
368      *            the attribute name.
369      * 
370      * @return the string representing the attribute.
371      */
372     public String readAttribute(String attribute) {
373         return DomUtils.readAttribute(getDocument(), attribute);
374     }
375 
376     /**
377      * Finds all the nodes by class name.
378      *
379      * @param clazz
380      *            the class name.
381      * 
382      * @return list of matching nodes.
383      */
384     public List<Node> findAllByClassName(String clazz) {
385         return DomUtils.findAllByClassName(getDocument(), clazz);
386     }
387 
388     /**
389      * Returns the text contained inside a node if leaf, <code>null</code> otherwise.
390      *
391      * @return the text of a leaf node.
392      */
393     public String getText() {
394         NodeList children = getDocument().getChildNodes();
395         if (children.getLength() == 1 && children.item(0) instanceof Text) {
396             return children.item(0).getTextContent();
397         }
398         return null;
399     }
400 
401     /**
402      * Returns the document default language.
403      *
404      * @return default language if any, <code>null</code> otherwise.
405      */
406     public String getDefaultLanguage() {
407         final String xpathLanguageSelector = "/HTML";
408         Node html;
409         try {
410             html = (Node) xPathEngine.evaluate(xpathLanguageSelector, document, XPathConstants.NODE);
411         } catch (XPathExpressionException xpeee) {
412             throw new IllegalStateException();
413         }
414         if (html == null) {
415             return null;
416         }
417         Node langAttribute = html.getAttributes().getNamedItem("xml:lang");
418         return langAttribute == null ? null : langAttribute.getTextContent();
419     }
420 
421     /**
422      * Returns the sequence of ancestors from the document root to the local root (document).
423      *
424      * @return a sequence of node names.
425      */
426     public String[] getPathToLocalRoot() {
427         return DomUtils.getXPathListForNode(document);
428     }
429 
430     /**
431      * Extracts all the <code>rel</code> tag nodes.
432      *
433      * @return list of rel tag nodes.
434      */
435     public TextField[] extractRelTagNodes() {
436         final List<Node> relTagNodes = DomUtils.findAllByAttributeName(getDocument(), "rel");
437         final List<TextField> result = new ArrayList<TextField>();
438         for (Node relTagNode : relTagNodes) {
439             readUrlField(result, relTagNode);
440         }
441         return result.toArray(new TextField[result.size()]);
442     }
443 
444     private java.net.URI getBaseIRI() throws ExtractionException {
445         if (baseIRI == null) {
446             // document.getBaseURI() returns null for document URIs with
447             // special characters, e.g., http://semanticweb.org/wiki/Knud_Möller
448             // It also does *not* take html "base" elements into account.
449             // (But it does take into account urls specified by the attribute "xml:base".)
450 
451             // So, for now, let's use getDocumentURI() instead.
452             // TODO: Make this approach better.
453 
454             Document doc = document instanceof Document ? (Document) document : document.getOwnerDocument();
455 
456             if (doc == null) {
457                 throw new ExtractionException(
458                         "Node " + document.getNodeName() + " was not associated with a document.");
459             }
460 
461             String uri = doc.getDocumentURI();
462 
463             if (uri == null) {
464                 throw new ExtractionException("document URI is null, this should not happen");
465             }
466 
467             try {
468                 baseIRI = new java.net.URI(RDFUtils.fixAbsoluteIRI(uri));
469             } catch (IllegalArgumentException ex) {
470                 throw new ExtractionException("Error in base IRI: " + uri, ex);
471             } catch (URISyntaxException ex) {
472                 throw new ExtractionException("Error in base IRI: " + uri, ex);
473             }
474         }
475         return baseIRI;
476     }
477 
478     /**
479      * This class represents a text extracted from the <i>HTML</i> DOM related to the node from which such test has been
480      * retrieved.
481      */
482     public static class TextField {
483         private String value;
484         private Node source;
485 
486         public TextField(String value, Node source) {
487             this.value = value;
488             this.source = source;
489         }
490 
491         public String value() {
492             return value;
493         }
494 
495         public Node source() {
496             return source;
497         }
498     }
499 
500 }