View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.rdf.Any23ValueFactoryWrapper;
22  import org.apache.any23.rdf.RDFUtils;
23  import org.openrdf.model.URI;
24  import org.openrdf.model.impl.ValueFactoryImpl;
25  import org.slf4j.Logger;
26  import org.slf4j.LoggerFactory;
27  import org.w3c.dom.NamedNodeMap;
28  import org.w3c.dom.Node;
29  import org.w3c.dom.NodeList;
30  import org.w3c.dom.Text;
31  
32  import javax.xml.xpath.XPath;
33  import javax.xml.xpath.XPathConstants;
34  import javax.xml.xpath.XPathExpressionException;
35  import javax.xml.xpath.XPathFactory;
36  import java.net.URISyntaxException;
37  import java.util.ArrayList;
38  import java.util.List;
39  
40  /**
41   * A wrapper around the DOM representation of an HTML document.
42   * Provides convenience access to various parts of the document.
43   *
44   * @author Gabriele Renzi
45   * @author Michele Mostarda
46   */
47  public class HTMLDocument {
48  
49      private final static XPath xPathEngine = XPathFactory.newInstance().newXPath();
50      private final static Logger log        = LoggerFactory.getLogger(HTMLDocument.class);
51  
52      private Node         document;
53      private java.net.URI baseURI;
54  
55      private final Any23ValueFactoryWrapper valueFactory =
56              new Any23ValueFactoryWrapper(ValueFactoryImpl.getInstance());
57  
58      /**
59       * Reads a text field from the given node adding the content to the given <i>res</i> list.
60       *
61       * @param node the node from which read the content.
62       * @return a valid TextField
63       */
64      public static TextField readTextField(Node node) {
65          TextField result;
66          final String name = node.getNodeName();
67          final NamedNodeMap attributes = node.getAttributes();
68          // excess of safety check, should be impossible
69          if (attributes == null ) {
70              return new TextField( node.getTextContent(), node);
71          }
72          // first check if there are values inside
73          List<Node> values = DomUtils.findAllByClassName(node, "value");
74          if (!values.isEmpty()) {
75              String val = "";
76              for (Node n : values)
77                  val += n.getTextContent();
78              return new TextField( val.trim(), node);
79          }
80          if ("ABBR".equals(name) && (null != attributes.getNamedItem("title"))) {
81              result = new TextField(attributes.getNamedItem("title").getNodeValue(), node);
82          } else if ("A".equals(name)) {
83              if (DomUtils.hasAttribute(node, "rel", "tag")) {
84                  String href = extractRelTag(attributes);
85                  result = new TextField(href, node);
86              } else
87                  result = new TextField(node.getTextContent(), node);
88          } else if ("IMG".equals(name) || "AREA".equals(name)) {
89              result = new TextField(attributes.getNamedItem("alt").getNodeValue(), node);
90          } else {
91              result = new TextField(node.getTextContent(), node);
92          }
93          return result;
94      }
95  
96      /**
97       * Reads an URL field from the given node adding the content to the given <i>res</i> list.
98       *
99       * @param res
100      * @param node
101      */
102     public static void readUrlField(List<TextField> res, Node node) {
103         String name = node.getNodeName();
104         NamedNodeMap attributes = node.getAttributes();
105         if (null == attributes) {
106             res.add( new TextField(node.getTextContent(), node) );
107             return;
108         }
109         if ("A".equals(name) || "AREA".equals(name)) {
110             Node n = attributes.getNamedItem("href");
111             res.add( new TextField(n.getNodeValue(), n) );
112         } else if ("ABBR".equals(name)) {
113             Node n = attributes.getNamedItem("title");
114             res.add( new TextField(n.getNodeValue(), n) );
115         } else if ("IMG".equals(name)) {
116             Node n = attributes.getNamedItem("src");
117             res.add( new TextField(n.getNodeValue(), n) );
118         } else if ("OBJECT".equals(name)) {
119             Node n = attributes.getNamedItem("data");
120             res.add( new TextField(n.getNodeValue(), n) );
121         } else {
122             res.add( new TextField(node.getTextContent().trim(), node) );
123         }
124     }
125 
126     /**
127      * Extracts the href specific rel-tag string.
128      * See the <a href="http://microformats.org/wiki/rel-tag">rel-tag</a> specification.
129      *
130      * @param hrefAttributeContent the content of the <i>href</i> attribute.
131      * @return the rel-tag specification.
132      */
133     public static String extractRelTag(String hrefAttributeContent) {
134         String[] all = hrefAttributeContent.split("[#?]");
135         // Cleanup spurious segments.
136         String path = all[0];
137         int pathLenghtMin1 = path.length() - 1;
138         if( '/' == path.charAt(pathLenghtMin1) ) {
139             path = path.substring(0, pathLenghtMin1);
140         }
141         return path;
142     }
143 
144     /**
145      * Extracts the href specific rel-tag string.
146      * See the <a href="http://microformats.org/wiki/rel-tag">rel-tag</a> specification.
147      *
148      * @param attributes the list of attributes of a node.
149      * @return the rel-tag specification.
150      */
151     public static String extractRelTag(NamedNodeMap attributes) {
152         return extractRelTag(attributes.getNamedItem("href").getNodeValue());
153     }
154 
155     /**
156      * Reads the text content of the given node and returns it.
157      * If the <code>prettify</code> flag is <code>true</code>
158      * the text is cleaned up.
159      *
160      * @param node node to read content.
161      * @param prettify if <code>true</code> blank chars will be removed.
162      * @return the read text.
163      */
164     public static String readNodeContent(Node node, boolean prettify) {
165         final String content = node.getTextContent();
166         return prettify ? content.trim().replaceAll("\\n", " ").replaceAll(" +", " ") : content;
167     }
168 
169     /**
170      * Constructor accepting the root node.
171      * 
172      * @param document
173      */
174     public HTMLDocument(Node document) {
175         if (null == document)
176             throw new IllegalArgumentException("node cannot be null when constructing an HTMLDocument");
177         this.document = document;
178     }
179 
180     /**
181      * @return An absolute URI, or null if the URI is not fixable
182      * @throws org.apache.any23.extractor.ExtractionException If the base URI is invalid
183      */
184     public URI resolveURI(String uri) throws ExtractionException {
185         return valueFactory.resolveURI(uri, getBaseURI());
186     }
187 
188     public String find(String xpath) {
189         return DomUtils.find(getDocument(), xpath);
190     }
191 
192     public Node findNodeById(String id) {
193         return DomUtils.findNodeById(getDocument(), id);
194     }
195 
196     public List<Node> findAll(String xpath) {
197         return DomUtils.findAll(getDocument(), xpath);
198     }
199 
200     public String findMicroformattedValue(
201             String objectTag,
202             String object,
203             String fieldTag,
204             String field,
205             String key
206     ) {
207         Node node = findMicroformattedObjectNode(objectTag, object);
208         if (null == node)
209             return "";
210         // try to check if it is inline
211         if (DomUtils.hasClassName(node, field))
212             return node.getTextContent();
213 
214         // failed, try to find it in a child
215         try {
216             String xpath = ".//" + fieldTag + "[contains(@class, '" + field + "')]/" + key;
217             String value = (String) xPathEngine.evaluate(xpath, node, XPathConstants.STRING);
218             if (null == value) {
219                 return "";
220             }
221             return value;
222         } catch (XPathExpressionException ex) {
223             throw new RuntimeException("Should not happen, XPath expression is built locally", ex);
224         }
225 
226     }
227 
228     public Node getDocument() {
229         return document;
230     }
231 
232     /**
233      * Returns a singular text field. 
234      *
235      * @param className name of class containing text.
236      * @return if multiple values are found just the first is returned,
237      * if we want to check that there are no n-ary values use plural finder
238      */
239     public TextField getSingularTextField(String className) {
240         TextField[] res = getPluralTextField(className);
241         if (res.length == 0)
242             return new TextField("", null);
243         return res[0];
244     }
245 
246     /**
247      * Returns a plural text field.
248      * 
249      * @param className name of class node containing text.
250      * @return list of fields.
251      */
252     public TextField[] getPluralTextField(String className) {
253         List<TextField> res = new ArrayList<TextField>();
254         List<Node> nodes = DomUtils.findAllByClassName(getDocument(), className);
255         for (Node node : nodes) {
256             res.add( readTextField(node) );
257         }
258         return res.toArray( new TextField[res.size()] );
259     }
260 
261     /**
262      * Returns the URL associated to the field marked with class <i>className</i>.
263      *
264      * @param className name of node class containing the URL field.
265      * @return if multiple values are found just the first is returned,
266      *  if we want to check that there are no n-ary values use plural finder
267      */
268     public TextField getSingularUrlField(String className) {
269         TextField[] res = getPluralUrlField(className);
270         if (res.length < 1)
271             return new TextField("", null);
272         return res[0];
273     }
274 
275     /**
276      * Returns the list of URLs associated to the fields marked with class <i>className</i>.
277      *
278      * @param className name of node class containing the URL field.
279      * @return the list of {@link HTMLDocument.TextField} found.
280      */
281     public TextField[] getPluralUrlField(String className) {
282         List<TextField> res = new ArrayList<TextField>();
283         List<Node> nodes = DomUtils.findAllByClassName(getDocument(), className);
284         for (Node node : nodes)
285             readUrlField(res, node);
286         return res.toArray( new TextField[res.size()] );
287     }
288 
289     public Node findMicroformattedObjectNode(String objectTag, String name) {
290         List<Node> nodes = DomUtils.findAllByTagAndClassName(getDocument(), objectTag, name);
291         if (nodes.isEmpty())
292             return null;
293         return nodes.get(0);
294     }
295 
296     /**
297      * Read an attribute avoiding NullPointerExceptions, if the attr is
298      * missing it just returns an empty string.
299      *
300      * @param attribute the attribute name.
301      * @return the string representing the attribute.
302      */
303     public String readAttribute(String attribute) {
304         return DomUtils.readAttribute(getDocument(), attribute);
305     }
306 
307     /**
308      * Finds all the nodes by class name.
309      *
310      * @param clazz the class name.
311      * @return list of matching nodes.
312      */
313     public List<Node> findAllByClassName(String clazz) {
314         return DomUtils.findAllByClassName(getDocument(), clazz);
315     }
316 
317     /**
318      * Returns the text contained inside a node if leaf,
319      * <code>null</code> otherwise.
320      *
321      * @return the text of a leaf node.
322      */
323     public String getText() {
324         NodeList children = getDocument().getChildNodes();
325         if(children.getLength() == 1 && children.item(0) instanceof Text) {
326             return children.item(0).getTextContent();
327         }
328         return null;
329     }
330 
331     /**
332      * Returns the document default language.
333      *
334      * @return default language if any, <code>null</code> otherwise.
335      */
336     public String getDefaultLanguage() {
337         final String xpathLanguageSelector = "/HTML";
338         Node html;
339         try {
340             html = (Node) xPathEngine.evaluate(xpathLanguageSelector, document, XPathConstants.NODE);
341         } catch (XPathExpressionException xpeee) {
342             throw new IllegalStateException();
343         }
344         if (html == null) {
345             return null;
346         }
347         Node langAttribute = html.getAttributes().getNamedItem("xml:lang");
348         return langAttribute == null ? null : langAttribute.getTextContent();
349     }
350 
351     /**
352      * Returns the sequence of ancestors from the document root to the local root (document).
353      *
354      * @return a sequence of node names.
355      */
356     public String[] getPathToLocalRoot() {
357         return DomUtils.getXPathListForNode(document);
358     }
359 
360     /**
361      * Extracts all the <code>rel</code> tag nodes.
362      *
363      * @return list of rel tag nodes.
364      */
365     public TextField[] extractRelTagNodes() {
366         final List<Node> relTagNodes = DomUtils.findAllByAttributeName(getDocument(), "rel");
367         final List<TextField> result = new ArrayList<TextField>();
368         for(Node relTagNode : relTagNodes) {
369             readUrlField(result, relTagNode);
370         }
371         return result.toArray( new TextField[result.size()] );
372     }
373 
374     private java.net.URI getBaseURI() throws ExtractionException {
375         if (baseURI == null) {
376             try {
377                 if (document.getBaseURI() == null) {
378                     log.warn("document.getBaseURI() is null, this should not happen");
379                 }
380                 baseURI = new java.net.URI(RDFUtils.fixAbsoluteURI(document.getBaseURI()));
381             } catch (IllegalArgumentException ex) {
382                 throw new ExtractionException("Error in base URI: " + document.getBaseURI(), ex);
383             } catch (URISyntaxException ex) {
384                 throw new ExtractionException("Error in base URI: " + document.getBaseURI(), ex);
385             }
386         }
387         return baseURI;
388     }
389 
390     /**
391      * This class represents a text extracted from the <i>HTML</i> DOM related
392      * to the node from which such test has been retrieved.
393      */
394     public static class TextField {
395         private String value;
396         private Node   source;
397 
398         public TextField(String value, Node source) {
399             this.value = value;
400             this.source = source;
401         }
402 
403         public String value() {
404             return value;
405         }
406 
407         public Node source() {
408             return source;
409         }
410     }
411 
412 }