View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.validator.DefaultValidator;
21  import org.apache.any23.validator.Validator;
22  import org.apache.any23.validator.ValidatorException;
23  import org.slf4j.Logger;
24  import org.slf4j.LoggerFactory;
25  import org.w3c.dom.Document;
26  
27  import java.io.IOException;
28  import java.io.InputStream;
29  import java.net.URI;
30  import java.net.URISyntaxException;
31  import java.nio.charset.Charset;
32  import java.nio.charset.UnsupportedCharsetException;
33  
34  /**
35   * <p>Parses an {@link java.io.InputStream}
36   * into an <i>HTML DOM</i> tree.
37   * </p>
38   * <p><strong>Note:</strong> The resulting <i>DOM</i> tree will not be namespace
39   * aware, and all element names will be upper case, while attributes
40   * will be lower case. This is because the HTML parser
41   * uses the <a href="http://xerces.apache.org/xerces2-j/dom.html">Xerces HTML DOM</a>
42   * implementation, which doesn't support namespaces and forces uppercase element names. This works
43   * with the <i>RDFa XSLT Converter</i> and with <i>XPath</i>, so we left it this way.</p>
44   *
45   * @author Richard Cyganiak (richard at cyganiak dot de)
46   * @author Michele Mostarda (mostarda@fbk.eu)
47   * @author Davide Palmisano (palmisano@fbk.eu)
48   */
49  
50  public class TagSoupParser {
51  
52      public static final String ELEMENT_LOCATION = "Element-Location";
53  
54      private final static Logger logger = LoggerFactory.getLogger(TagSoupParser.class);
55  
56      private final InputStream input;
57  
58      private final String documentIRI;
59  
60      private final String encoding;
61  
62      private final TagSoupParsingConfiguration config;
63  
64      private Document result = null;
65  
66  
67      public TagSoupParser(InputStream input, String documentIRI) {
68          this.input = input;
69          this.documentIRI = documentIRI;
70          this.encoding = null;
71  
72          config = TagSoupParsingConfiguration.getDefault();
73      }
74  
75      public TagSoupParser(InputStream input, String documentIRI, String encoding) {
76          if (encoding != null && !Charset.isSupported(encoding))
77              throw new UnsupportedCharsetException(String.format("Charset %s is not supported", encoding));
78  
79          this.input = input;
80          this.documentIRI = documentIRI;
81          this.encoding = encoding;
82  
83          config = TagSoupParsingConfiguration.getDefault();
84      }
85  
86  
87      /**
88       * Returns the DOM of the given document IRI. 
89       *
90       * @return the <i>HTML</i> DOM.
91       * @throws IOException if there is an error whilst accessing the DOM
92       */
93      public Document getDOM() throws IOException {
94          if (result == null) {
95              long startTime = System.currentTimeMillis();
96              try {
97                  result = config.parse(input, documentIRI, encoding);
98              } finally {
99                  long elapsed = System.currentTimeMillis() - startTime;
100                 logger.debug("Parsed " + documentIRI + " with " + config.name() + ", " + elapsed + "ms");
101             }
102         }
103         result.setDocumentURI(documentIRI);
104         return result;
105     }
106 
107     /**
108      * Returns the validated DOM and applies fixes on it if <i>applyFix</i>
109      * is set to <code>true</code>.
110      *
111      * @param applyFix whether to apply fixes to the DOM
112      * @return a report containing the <i>HTML</i> DOM that has been validated and fixed if <i>applyFix</i>
113      *         if <code>true</code>. The reports contains also information about the activated rules and the
114      *         the detected issues.
115      * @throws IOException if there is an error accessing the DOM
116      * @throws org.apache.any23.validator.ValidatorException if there is an error validating the DOM
117      */
118     public DocumentReport getValidatedDOM(boolean applyFix) throws IOException, ValidatorException {
119         final URI dIRI;
120         try {
121             dIRI = new URI(documentIRI);
122         } catch (IllegalArgumentException | URISyntaxException urise) {
123             throw new ValidatorException("Error while performing validation, invalid document IRI.", urise);
124         }
125         Validator validator = new DefaultValidator();
126         Document document = getDOM();
127         return new DocumentReport( validator.validate(dIRI, document, applyFix), document );
128     }
129 
130     /**
131      * Describes a <i>DOM Element</i> location.
132      */
133     public static class ElementLocation {
134 
135         private int beginLineNumber;
136         private int beginColumnNumber;
137         private int endLineNumber;
138         private int endColumnNumber;
139 
140         private ElementLocation(
141                 int beginLineNumber, int beginColumnNumber, int endLineNumber, int endColumnNumber
142         ) {
143             this.beginLineNumber = beginLineNumber;
144             this.beginColumnNumber = beginColumnNumber;
145             this.endLineNumber = endLineNumber;
146             this.endColumnNumber = endColumnNumber;
147         }
148 
149         public int getBeginLineNumber() {
150             return beginLineNumber;
151         }
152 
153         public int getBeginColumnNumber() {
154             return beginColumnNumber;
155         }
156 
157         public int getEndLineNumber() {
158             return endLineNumber;
159         }
160 
161         public int getEndColumnNumber() {
162             return endColumnNumber;
163         }
164     }
165     
166 }