View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.encoding;
19  
20  import org.apache.tika.detect.TextStatistics;
21  import org.apache.tika.parser.txt.CharsetDetector;
22  import org.apache.tika.parser.txt.CharsetMatch;
23  import org.jsoup.nodes.Comment;
24  import org.jsoup.nodes.DataNode;
25  import org.jsoup.nodes.Document;
26  import org.jsoup.nodes.DocumentType;
27  import org.jsoup.nodes.Element;
28  import org.jsoup.nodes.Node;
29  import org.jsoup.nodes.PseudoTextElement;
30  import org.jsoup.nodes.TextNode;
31  import org.jsoup.parser.ParseError;
32  import org.jsoup.parser.ParseErrorList;
33  import org.jsoup.parser.Parser;
34  import org.jsoup.select.NodeTraversor;
35  import org.jsoup.select.NodeVisitor;
36  
37  import java.io.BufferedInputStream;
38  import java.io.IOException;
39  import java.io.InputStream;
40  import java.nio.charset.Charset;
41  
42  import static java.nio.charset.StandardCharsets.UTF_8;
43  import static java.nio.charset.StandardCharsets.ISO_8859_1;
44  
45  /**
46   * An implementation of {@link EncodingDetector} based on <a href="http://tika.apache.org/">Apache Tika</a>.
47   *
48   * @author Michele Mostarda ( michele.mostarda@gmail.com )
49   * @author Davide Palmisano ( dpalmisano@gmail.com )
50   * @author Hans Brende (hansbrende@apache.org)
51   * 
52   * @version $Id$
53   */
54  public class TikaEncodingDetector implements EncodingDetector {
55  
56      @Override
57      public String guessEncoding(InputStream input) throws IOException {
58          return guessEncoding(input, (String) null);
59      }
60  
61      private static final String TAG_CHARS = "< />";
62      private static final byte[] TAG_BYTES = TAG_CHARS.getBytes(UTF_8);
63      private static final Node[] EMPTY_NODES = new Node[0];
64  
65      private static Charset guessEncoding(InputStream is, Charset declared) throws IOException {
66          if (!is.markSupported()) {
67              is = new BufferedInputStream(is);
68          }
69  
70          TextStatistics stats = computeAndReset(is, EncodingUtils::stats);
71  
72          // we've overridden the looksLikeUTF8() method to be 100% precise, as in jchardet
73          if (stats.looksLikeUTF8()) {
74              // > 92% of the web is UTF-8. Do not risk false positives from other charsets.
75              // See https://issues.apache.org/jira/browse/TIKA-2771
76              // and https://issues.apache.org/jira/browse/TIKA-539
77              return UTF_8;
78          }
79  
80          declared = EncodingUtils.correctVariant(stats, declared);
81          if (declared != null) {
82              return declared;
83          }
84  
85          // ISO-8859-1 is Java's only "standard charset" which maps 1-to-1 onto the first 256 unicode characters;
86          // use ISO-8859-1 for round-tripping of bytes after stripping html/xml tags from input
87          String iso_8859_1 = computeAndReset(is, EncodingUtils::iso_8859_1);
88  
89          Charset xmlCharset = EncodingUtils.xmlCharset(stats, iso_8859_1);
90          if (xmlCharset != null) {
91              return xmlCharset;
92          }
93  
94          ParseErrorList htmlErrors = ParseErrorList.tracking(Integer.MAX_VALUE);
95          Document doc = parseFragment(iso_8859_1, htmlErrors);
96  
97          Charset htmlCharset = EncodingUtils.htmlCharset(stats, doc);
98  
99          if (htmlCharset != null) {
100             return htmlCharset;
101         }
102 
103         if (stats.countEightBit() == 0) {
104             // All characters are plain ASCII, so it doesn't matter what we choose.
105             return UTF_8;
106         }
107 
108         // HTML & XML tag-stripping is vital for accurate n-gram detection, so use Jsoup instead of icu4j's
109         // "quick and dirty, not 100% accurate" tag-stripping implementation for more accurate results.
110         // Cf. https://issues.apache.org/jira/browse/TIKA-2038
111         long openTags = countTags(doc);
112         long badTags = htmlErrors.stream().map(ParseError::getErrorMessage)
113                 .filter(err -> err != null && err.matches(".*'[</>]'.*")).count();
114 
115         // condition for filtering input adapted from icu4j's CharsetDetector#MungeInput()
116         boolean filterInput = true;
117         if (openTags < 5 || openTags / 5 < badTags) {
118             filterInput = false;
119         } else {
120             String wholeText = wholeText(doc);
121             if (wholeText.length() < 100 && iso_8859_1.length() > 600) {
122                 filterInput = false;
123             } else {
124                 iso_8859_1 = wholeText;
125             }
126         }
127         byte[] text = iso_8859_1.getBytes(ISO_8859_1);
128 
129         CharsetDetector icu4j = new CharsetDetector(text.length);
130         icu4j.setText(text);
131 
132         for (CharsetMatch match : icu4j.detectAll()) {
133             try {
134                 Charset charset = EncodingUtils.forName(match.getName());
135 
136                 // If we successfully filtered input based on 0x3C and 0x3E, then this must be an ascii-compatible
137                 // charset
138                 // See https://issues.apache.org/jira/browse/TIKA-2771
139                 if (filterInput && !TAG_CHARS.equals(new String(TAG_BYTES, charset))) {
140                     continue;
141                 }
142 
143                 charset = EncodingUtils.correctVariant(stats, charset);
144                 if (charset != null) {
145                     return charset;
146                 }
147             } catch (Exception e) {
148                 // ignore; if this charset isn't supported by this platform, it's probably not correct anyway.
149             }
150         }
151 
152         // No bytes are invalid in ISO-8859-1, so this one is always possible if there are no options left.
153         // Also, has second-highest popularity on the web behind UTF-8.
154         return EncodingUtils.correctVariant(stats, ISO_8859_1);
155     }
156 
157     @Override
158     public String guessEncoding(InputStream is, String contentType) throws IOException {
159         Charset charset = EncodingUtils.contentTypeCharset(contentType);
160         return guessEncoding(is, charset).name();
161     }
162 
163     ////////////////////
164     // STATIC HELPERS //
165     ////////////////////
166 
167     @FunctionalInterface
168     private interface InputStreamFunction<E> {
169         E compute(InputStream is) throws IOException;
170     }
171 
172     private static <E> E computeAndReset(InputStream is, InputStreamFunction<E> function) throws IOException {
173         is.mark(Integer.MAX_VALUE);
174         try {
175             return function.compute(is);
176         } finally {
177             is.reset();
178         }
179     }
180 
181     private static Document parseFragment(String html, ParseErrorList errors) {
182         Document doc = new Document("");
183         Node[] childNodes = Parser.parseFragment(html, null, "", errors).toArray(EMPTY_NODES);
184         for (Node node : childNodes) {
185             if (node.parentNode() != null) {
186                 node.remove();
187             }
188             doc.appendChild(node);
189         }
190         return doc;
191     }
192 
193     private static long countTags(Node node) {
194         long[] ret = { 0 };
195         NodeTraversor.traverse(new NodeVisitor() {
196             @Override
197             public void head(Node node, int depth) {
198                 if (node instanceof Document || node instanceof PseudoTextElement) {
199                     // subclasses of Element that don't have start/end tags
200                     return;
201                 }
202                 if (node instanceof Element || node instanceof DocumentType || node instanceof Comment) {
203                     ret[0] += node.childNodeSize() == 0 ? 1 : 2;
204                 }
205             }
206 
207             @Override
208             public void tail(Node node, int depth) {
209             }
210         }, node);
211         return ret[0];
212     }
213 
214     private static String wholeText(Node node) {
215         StringBuilder sb = new StringBuilder();
216         NodeTraversor.traverse(new NodeVisitor() {
217             @Override
218             public void head(Node node, int depth) {
219                 if (node instanceof TextNode) {
220                     sb.append(((TextNode) node).getWholeText());
221                 } else if (node instanceof DataNode) {
222                     String data = ((DataNode) node).getWholeData();
223                     do {
224                         // make sure json-ld data is included in text stats
225                         // otherwise, ignore css & javascript
226                         if ("script".equalsIgnoreCase(node.nodeName())) {
227                             if (node.attr("type").toLowerCase(java.util.Locale.ROOT).contains("json")) {
228                                 sb.append(data);
229                             }
230                             break;
231                         } else if ("style".equalsIgnoreCase(node.nodeName())) {
232                             break;
233                         }
234                         node = node.parentNode();
235                     } while (node != null);
236                 } else if (node instanceof Comment) {
237                     String data = ((Comment) node).getData();
238                     // avoid comments that are actually processing instructions or xml declarations
239                     if (!data.contains("<!") && !data.contains("<?")) {
240                         sb.append(data);
241                     }
242                 } else if (node instanceof Element) {
243                     // make sure all microdata itemprop "content" values are taken into consideration
244                     sb.append(node.attr("content"));
245                 }
246             }
247 
248             @Override
249             public void tail(Node node, int depth) {
250             }
251         }, node);
252         return sb.toString();
253     }
254 
255 }