View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.encoding;
19  
20  import org.apache.tika.detect.TextStatistics;
21  import org.apache.tika.parser.txt.CharsetDetector;
22  import org.apache.tika.parser.txt.CharsetMatch;
23  import org.jsoup.nodes.Comment;
24  import org.jsoup.nodes.DataNode;
25  import org.jsoup.nodes.Document;
26  import org.jsoup.nodes.DocumentType;
27  import org.jsoup.nodes.Element;
28  import org.jsoup.nodes.Node;
29  import org.jsoup.nodes.PseudoTextElement;
30  import org.jsoup.nodes.TextNode;
31  import org.jsoup.parser.ParseError;
32  import org.jsoup.parser.ParseErrorList;
33  import org.jsoup.parser.Parser;
34  import org.jsoup.select.NodeTraversor;
35  import org.jsoup.select.NodeVisitor;
36  
37  import java.io.BufferedInputStream;
38  import java.io.IOException;
39  import java.io.InputStream;
40  import java.nio.charset.Charset;
41  
42  import static java.nio.charset.StandardCharsets.UTF_8;
43  import static java.nio.charset.StandardCharsets.ISO_8859_1;
44  
45  /**
46   * An implementation of {@link EncodingDetector} based on
47   * <a href="http://tika.apache.org/">Apache Tika</a>.
48   *
49   * @author Michele Mostarda ( michele.mostarda@gmail.com )
50   * @author Davide Palmisano ( dpalmisano@gmail.com )
51   * @author Hans Brende (hansbrende@apache.org)
52   * @version $Id$
53   */
54  public class TikaEncodingDetector implements EncodingDetector {
55  
56      @Override
57      public String guessEncoding(InputStream input) throws IOException {
58          return guessEncoding(input, (String)null);
59      }
60  
61      private static final String TAG_CHARS = "< />";
62      private static final byte[] TAG_BYTES = TAG_CHARS.getBytes(UTF_8);
63      private static final Node[] EMPTY_NODES = new Node[0];
64  
65      private static Charset guessEncoding(InputStream is, Charset declared) throws IOException {
66          if (!is.markSupported()) {
67              is = new BufferedInputStream(is);
68          }
69  
70          TextStatistics stats = computeAndReset(is, EncodingUtils::stats);
71  
72          //we've overridden the looksLikeUTF8() method to be 100% precise, as in jchardet
73          if (stats.looksLikeUTF8()) {
74              // > 92% of the web is UTF-8. Do not risk false positives from other charsets.
75              // See https://issues.apache.org/jira/browse/TIKA-2771
76              // and https://issues.apache.org/jira/browse/TIKA-539
77              return UTF_8;
78          }
79  
80          declared = EncodingUtils.correctVariant(stats, declared);
81          if (declared != null) {
82              return declared;
83          }
84  
85          // ISO-8859-1 is Java's only "standard charset" which maps 1-to-1 onto the first 256 unicode characters;
86          // use ISO-8859-1 for round-tripping of bytes after stripping html/xml tags from input
87          String iso_8859_1 = computeAndReset(is, EncodingUtils::iso_8859_1);
88  
89          Charset xmlCharset = EncodingUtils.xmlCharset(stats, iso_8859_1);
90          if (xmlCharset != null) {
91              return xmlCharset;
92          }
93  
94          ParseErrorList htmlErrors = ParseErrorList.tracking(Integer.MAX_VALUE);
95          Document doc = parseFragment(iso_8859_1, htmlErrors);
96  
97          Charset htmlCharset = EncodingUtils.htmlCharset(stats, doc);
98  
99          if (htmlCharset != null) {
100             return htmlCharset;
101         }
102 
103         if (stats.countEightBit() == 0) {
104             // All characters are plain ASCII, so it doesn't matter what we choose.
105             return UTF_8;
106         }
107 
108         //HTML & XML tag-stripping is vital for accurate n-gram detection, so use Jsoup instead of icu4j's
109         // "quick and dirty, not 100% accurate" tag-stripping implementation for more accurate results.
110         // Cf. https://issues.apache.org/jira/browse/TIKA-2038
111         long openTags = countTags(doc);
112         long badTags = htmlErrors.stream().map(ParseError::getErrorMessage)
113                 .filter(err -> err != null && err.matches(".*'[</>]'.*")).count();
114 
115         //condition for filtering input adapted from icu4j's CharsetDetector#MungeInput()
116         boolean filterInput = true;
117         if (openTags < 5 || openTags / 5 < badTags) {
118             filterInput = false;
119         } else {
120             String wholeText = wholeText(doc);
121             if (wholeText.length() < 100 && iso_8859_1.length() > 600) {
122                 filterInput = false;
123             } else {
124                 iso_8859_1 = wholeText;
125             }
126         }
127         byte[] text = iso_8859_1.getBytes(ISO_8859_1);
128 
129         CharsetDetector icu4j = new CharsetDetector(text.length);
130         icu4j.setText(text);
131 
132         for (CharsetMatch match : icu4j.detectAll()) {
133             try {
134                 Charset charset = EncodingUtils.forName(match.getName());
135 
136                 // If we successfully filtered input based on 0x3C and 0x3E, then this must be an ascii-compatible charset
137                 // See https://issues.apache.org/jira/browse/TIKA-2771
138                 if (filterInput && !TAG_CHARS.equals(new String(TAG_BYTES, charset))) {
139                     continue;
140                 }
141 
142                 charset = EncodingUtils.correctVariant(stats, charset);
143                 if (charset != null) {
144                     return charset;
145                 }
146             } catch (Exception e) {
147                 //ignore; if this charset isn't supported by this platform, it's probably not correct anyway.
148             }
149         }
150 
151         // No bytes are invalid in ISO-8859-1, so this one is always possible if there are no options left.
152         // Also, has second-highest popularity on the web behind UTF-8.
153         return EncodingUtils.correctVariant(stats, ISO_8859_1);
154     }
155 
156     @Override
157     public String guessEncoding(InputStream is, String contentType) throws IOException {
158         Charset charset = EncodingUtils.contentTypeCharset(contentType);
159         return guessEncoding(is, charset).name();
160     }
161 
162 
163 
164     ////////////////////
165     // STATIC HELPERS //
166     ////////////////////
167 
168     @FunctionalInterface
169     private interface InputStreamFunction<E> {
170         E compute(InputStream is) throws IOException;
171     }
172 
173     private static <E> E computeAndReset(InputStream is, InputStreamFunction<E> function) throws IOException {
174         is.mark(Integer.MAX_VALUE);
175         try {
176             return function.compute(is);
177         } finally {
178             is.reset();
179         }
180     }
181 
182     private static Document parseFragment(String html, ParseErrorList errors) {
183         Document doc = new Document("");
184         Node[] childNodes = Parser.parseFragment(html, null, "", errors).toArray(EMPTY_NODES);
185         for (Node node : childNodes) {
186             if (node.parentNode() != null) {
187                 node.remove();
188             }
189             doc.appendChild(node);
190         }
191         return doc;
192     }
193 
194     private static long countTags(Node node) {
195         long[] ret = {0};
196         NodeTraversor.traverse(new NodeVisitor() {
197             @Override
198             public void head(Node node, int depth) {
199                 if (node instanceof Document || node instanceof PseudoTextElement) {
200                     //subclasses of Element that don't have start/end tags
201                     return;
202                 }
203                 if (node instanceof Element || node instanceof DocumentType || node instanceof Comment) {
204                     ret[0] += node.childNodeSize() == 0 ? 1 : 2;
205                 }
206             }
207             @Override
208             public void tail(Node node, int depth) {
209             }
210         }, node);
211         return ret[0];
212     }
213 
214     private static String wholeText(Node node) {
215         StringBuilder sb = new StringBuilder();
216         NodeTraversor.traverse(new NodeVisitor() {
217             @Override
218             public void head(Node node, int depth) {
219                 if (node instanceof TextNode) {
220                     sb.append(((TextNode) node).getWholeText());
221                 } else if (node instanceof DataNode) {
222                     String data = ((DataNode) node).getWholeData();
223                     do {
224                         //make sure json-ld data is included in text stats
225                         //otherwise, ignore css & javascript
226                         if ("script".equalsIgnoreCase(node.nodeName())) {
227                             if (node.attr("type").toLowerCase().contains("json")) {
228                                 sb.append(data);
229                             }
230                             break;
231                         } else if ("style".equalsIgnoreCase(node.nodeName())) {
232                             break;
233                         }
234                         node = node.parentNode();
235                     } while (node != null);
236                 } else if (node instanceof Comment) {
237                     String data = ((Comment) node).getData();
238                     //avoid comments that are actually processing instructions or xml declarations
239                     if (!data.contains("<!") && !data.contains("<?")) {
240                         sb.append(data);
241                     }
242                 } else if (node instanceof Element) {
243                     //make sure all microdata itemprop "content" values are taken into consideration
244                     sb.append(node.attr("content"));
245                 }
246             }
247             @Override
248             public void tail(Node node, int depth) {
249             }
250         }, node);
251         return sb.toString();
252     }
253 
254 }