View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.validator.DefaultValidator;
21  import org.apache.any23.validator.Validator;
22  import org.apache.any23.validator.ValidatorException;
23  import org.slf4j.Logger;
24  import org.slf4j.LoggerFactory;
25  import org.w3c.dom.Document;
26  
27  import java.io.IOException;
28  import java.io.InputStream;
29  import java.net.URI;
30  import java.net.URISyntaxException;
31  import java.nio.charset.Charset;
32  import java.nio.charset.UnsupportedCharsetException;
33  import java.util.Locale;
34  
35  /**
36   * <p>
37   * Parses an {@link java.io.InputStream} into an <i>HTML DOM</i> tree.
38   * </p>
39   * <p>
40   * <strong>Note:</strong> The resulting <i>DOM</i> tree will not be namespace aware, and all element names will be upper
41   * case, while attributes will be lower case. This is because the HTML parser uses the
42   * <a href="http://xerces.apache.org/xerces2-j/dom.html">Xerces HTML DOM</a> implementation, which doesn't support
43   * namespaces and forces uppercase element names. This works with the <i>RDFa XSLT Converter</i> and with <i>XPath</i>,
44   * so we left it this way.
45   * </p>
46   *
47   * @author Richard Cyganiak (richard at cyganiak dot de)
48   * @author Michele Mostarda (mostarda@fbk.eu)
49   * @author Davide Palmisano (palmisano@fbk.eu)
50   */
51  
52  public class TagSoupParser {
53  
54      public static final String ELEMENT_LOCATION = "Element-Location";
55  
56      private final static Logger logger = LoggerFactory.getLogger(TagSoupParser.class);
57  
58      private final InputStream input;
59  
60      private final String documentIRI;
61  
62      private final String encoding;
63  
64      private final TagSoupParsingConfiguration config;
65  
66      private Document result = null;
67  
68      public TagSoupParser(InputStream input, String documentIRI) {
69          this.input = input;
70          this.documentIRI = documentIRI;
71          this.encoding = null;
72  
73          config = TagSoupParsingConfiguration.getDefault();
74      }
75  
76      public TagSoupParser(InputStream input, String documentIRI, String encoding) {
77          if (encoding != null && !Charset.isSupported(encoding))
78              throw new UnsupportedCharsetException(String.format(Locale.ROOT, "Charset %s is not supported", encoding));
79  
80          this.input = input;
81          this.documentIRI = documentIRI;
82          this.encoding = encoding;
83  
84          config = TagSoupParsingConfiguration.getDefault();
85      }
86  
87      /**
88       * Returns the DOM of the given document IRI.
89       *
90       * @return the <i>HTML</i> DOM.
91       * 
92       * @throws IOException
93       *             if there is an error whilst accessing the DOM
94       */
95      public Document getDOM() throws IOException {
96          if (result == null) {
97              long startTime = System.currentTimeMillis();
98              try {
99                  result = config.parse(input, documentIRI, encoding);
100             } finally {
101                 long elapsed = System.currentTimeMillis() - startTime;
102                 logger.debug("Parsed " + documentIRI + " with " + config.name() + ", " + elapsed + "ms");
103             }
104         }
105         result.setDocumentURI(documentIRI);
106         return result;
107     }
108 
109     /**
110      * Returns the validated DOM and applies fixes on it if <i>applyFix</i> is set to <code>true</code>.
111      *
112      * @param applyFix
113      *            whether to apply fixes to the DOM
114      * 
115      * @return a report containing the <i>HTML</i> DOM that has been validated and fixed if <i>applyFix</i> if
116      *         <code>true</code>. The reports contains also information about the activated rules and the the detected
117      *         issues.
118      * 
119      * @throws IOException
120      *             if there is an error accessing the DOM
121      * @throws org.apache.any23.validator.ValidatorException
122      *             if there is an error validating the DOM
123      */
124     public DocumentReport getValidatedDOM(boolean applyFix) throws IOException, ValidatorException {
125         final URI dIRI;
126         try {
127             dIRI = new URI(documentIRI);
128         } catch (IllegalArgumentException | URISyntaxException urise) {
129             throw new ValidatorException("Error while performing validation, invalid document IRI.", urise);
130         }
131         Validator validator = new DefaultValidator();
132         Document document = getDOM();
133         return new DocumentReport(validator.validate(dIRI, document, applyFix), document);
134     }
135 
136     /**
137      * Describes a <i>DOM Element</i> location.
138      */
139     public static class ElementLocation {
140 
141         private int beginLineNumber;
142         private int beginColumnNumber;
143         private int endLineNumber;
144         private int endColumnNumber;
145 
146         private ElementLocation(int beginLineNumber, int beginColumnNumber, int endLineNumber, int endColumnNumber) {
147             this.beginLineNumber = beginLineNumber;
148             this.beginColumnNumber = beginColumnNumber;
149             this.endLineNumber = endLineNumber;
150             this.endColumnNumber = endColumnNumber;
151         }
152 
153         public int getBeginLineNumber() {
154             return beginLineNumber;
155         }
156 
157         public int getBeginColumnNumber() {
158             return beginColumnNumber;
159         }
160 
161         public int getEndLineNumber() {
162             return endLineNumber;
163         }
164 
165         public int getEndColumnNumber() {
166             return endColumnNumber;
167         }
168     }
169 
170 }