/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17  
18  package org.apache.any23.encoding;
19  
20  import org.apache.tika.detect.TextStatistics;
21  import org.apache.tika.utils.CharsetUtils;
22  import org.jsoup.nodes.Element;
23  import org.jsoup.select.Evaluator;
24  import org.jsoup.select.QueryParser;
25  import org.jsoup.select.Selector;
26  import org.rypt.f8.Utf8Statistics;
27  
28  import java.io.IOException;
29  import java.io.InputStream;
30  import java.nio.charset.Charset;
31  import java.nio.charset.StandardCharsets;
32  import java.util.regex.Matcher;
33  import java.util.regex.Pattern;
34  
35  /**
36   * @author Hans Brende
37   */
38  class EncodingUtils {
39  
40      /**
41       * Very efficient method to convert an input stream directly to an ISO-8859-1 encoded string
42       */
43      static String iso_8859_1(InputStream is) throws IOException {
44          StringBuilder chars = new StringBuilder(Math.max(is.available(), 8192));
45          byte[] buffer = new byte[8192];
46          int n;
47          while ((n = is.read(buffer)) != -1) {
48              for (int i = 0; i < n; i++) {
49                  chars.append((char) (buffer[i] & 0xFF));
50              }
51          }
52          return chars.toString();
53      }
54  
55      // get correct variant, or null if charset is incompatible with stats
56      static Charset correctVariant(TextStatistics stats, Charset charset) {
57          if (charset == null) {
58              return null;
59          }
60          switch (charset.name()) {
61          // ISO-8859-1 variants
62          case "ISO-8859-1":
63              // Take a hint from icu4j's CharsetRecog_8859_1 and Tika's UniversalEncodingListener:
64              // return windows-1252 before ISO-8859-1 if:
65              // (1) C1 ctrl chars are used (as in icu4j), or
66              // (2) '\r' is used (as in Tika)
67              if ((stats.count('\r') != 0 || hasC1Control(stats)) && hasNoneOf(stats, windows1252Illegals)) {
68                  try {
69                      return forName("windows-1252");
70                  } catch (Exception e) {
71                      // ignore
72                  }
73              }
74              return iso_8859_1_or_15(stats);
75          case "windows-1252":
76              return hasNoneOf(stats, windows1252Illegals) ? charset : iso_8859_1_or_15(stats);
77  
78          // ISO-8859-2 variants
79          case "ISO-8859-2":
80              // Take a hint from icu4j's CharsetRecog_8859_2 class:
81              // return windows-1250 before ISO-8859-2 if has valid C1 chars
82              if (hasC1Control(stats) && hasNoneOf(stats, windows1250Illegals)) {
83                  try {
84                      return forName("windows-1250");
85                  } catch (Exception e) {
86                      // ignore
87                  }
88              }
89              return charset;
90          case "windows-1250":
91              return hasNoneOf(stats, windows1250Illegals) ? charset : charset("ISO-8859-2");
92  
93          // ISO-8859-7 variants
94          case "ISO-8859-7":
95              // Take a hint from icu4j's CharsetRecog_8859_7 class:
96              // return windows-1253 before ISO-8859-7 if has valid C1 chars
97              if (hasC1Control(stats) && hasNoneOf(stats, windows1253Illegals)) {
98                  try {
99                      return forName("windows-1253");
100                 } catch (Exception e) {
101                     // ignore
102                 }
103             }
104             return hasNoneOf(stats, iso_8859_7Illegals) ? charset : null;
105         case "windows-1253":
106             return hasNoneOf(stats, windows1253Illegals) ? charset
107                     : hasNoneOf(stats, iso_8859_7Illegals) ? charset("ISO-8859-7") : null;
108 
109         // ISO-8859-8 variants
110         case "ISO-8859-8":
111         case "ISO-8859-8-I":
112             // Take a hint from icu4j's CharsetRecog_8859_8 class:
113             // return windows-1255 before ISO-8859-8 if has valid C1 chars
114             if (hasC1Control(stats) && hasNoneOf(stats, windows1255Illegals)) {
115                 try {
116                     return forName("windows-1255");
117                 } catch (Exception e) {
118                     // ignore
119                 }
120             }
121             return hasNoneOf(stats, iso_8859_8Illegals) ? charset : null;
122         case "windows-1255":
123             return hasNoneOf(stats, windows1255Illegals) ? charset
124                     : hasNoneOf(stats, iso_8859_8Illegals) ? charset("ISO-8859-8") : null;
125 
126         // ISO-8859-9 variants
127         case "ISO-8859-9":
128             // Take a hint from icu4j's CharsetRecog_8859_9 class:
129             // return windows-1254 before ISO-8859-9 if has valid C1 chars
130             if (hasC1Control(stats) && hasNoneOf(stats, windows1254Illegals)) {
131                 try {
132                     return forName("windows-1254");
133                 } catch (Exception e) {
134                     // ignore
135                 }
136             }
137             return charset;
138         case "windows-1254":
139             return hasNoneOf(stats, windows1254Illegals) ? charset : charset("ISO-8859-9");
140 
141         // Others: just make sure no illegal characters are present
142         case "windows-1251":
143             return hasNoneOf(stats, windows1251Illegals) ? charset : null;
144         case "ISO-8859-6":
145             return hasNoneOf(stats, iso_8859_6Illegals) ? charset : null;
146         default:
147             return charset;
148         }
149     }
150 
151     private static Charset iso_8859_1_or_15(TextStatistics stats) {
152         // Take a hint from Tika's UniversalEncodingListener:
153         // return ISO-8859-15 before ISO-8859-1 if currency/euro symbol is used
154         if (stats.count(0xa4) != 0) {
155             try {
156                 return forName("ISO-8859-15");
157             } catch (Exception e) {
158                 // ignore
159             }
160         }
161         return StandardCharsets.ISO_8859_1;
162     }
163 
164     private static final int[] windows1252Illegals = { 0x81, 0x8D, 0x8F, 0x90, 0x9D };
165     private static final int[] windows1250Illegals = { 0x81, 0x83, 0x88, 0x90, 0x98 };
166     private static final int[] iso_8859_7Illegals = { 0xAE, 0xD2, 0xFF };
167     private static final int[] windows1253Illegals = { 0x81, 0x88, 0x8A, 0x8C, 0x8D, 0x8E, 0x8F, 0x90, 0x98, 0x9A, 0x9C,
168             0x9D, 0x9E, 0x9F, 0xAA, 0xD2, 0xFF };
169 
170     private static final int[] windows1255Illegals = { 0x81, 0x8A, 0x8C, 0x8D, 0x8E, 0x8F, 0x90, 0x9A, 0x9C, 0x9D, 0x9E,
171             0x9F, 0xCA, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, 0xFB, 0xFC, 0xFF };
172 
173     private static final int[] iso_8859_8Illegals = { 0xA1, 0xBF, 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8,
174             0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA,
175             0xDB, 0xDC, 0xDD, 0xDE, 0xFB, 0xFC, 0xFF };
176 
177     private static final int[] windows1254Illegals = { 0x81, 0x8D, 0x8E, 0x8F, 0x90, 0x9D, 0x9E };
178 
179     private static final int[] windows1251Illegals = { 0x98 };
180 
181     private static final int[] iso_8859_6Illegals = { 0xA1, 0xA2, 0xA3, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAE,
182             0xAF, 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBC, 0xBD, 0xBE, 0xC0, 0xDB, 0xDC,
183             0xDD, 0xDE, 0xDF, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF };
184 
185     private static boolean hasNoneOf(TextStatistics stats, int[] illegals) {
186         for (int i : illegals) {
187             if (stats.count(i) != 0) {
188                 return false;
189             }
190         }
191         return true;
192     }
193 
194     private static boolean hasC1Control(TextStatistics ts) {
195         for (int i = 0x80; i < 0xA0; i++) {
196             if (ts.count(i) != 0) {
197                 return true;
198             }
199         }
200         return false;
201     }
202 
203     private static class TextStatisticsOptimizedForUtf8 extends TextStatistics {
204 
205         private final Utf8Statistics utf8Stats = new Utf8Statistics();
206 
207         @Override
208         public void addData(byte[] buffer, int offset, int length) {
209             super.addData(buffer, offset, length);
210             utf8Stats.write(buffer, offset, length);
211         }
212 
213         @Override
214         public boolean looksLikeUTF8() {
215             return utf8Stats.looksLikeUtf8();
216         }
217     }
218 
219     /*
220      * Returns a custom implementation of Tika's TextStatistics class for an input stream
221      */
222     static TextStatistics stats(InputStream stream) throws IOException {
223         TextStatisticsOptimizedForUtf8 stats = new TextStatisticsOptimizedForUtf8();
224         byte[] buffer = new byte[8192];
225         int n;
226         while ((n = stream.read(buffer)) != -1) {
227             stats.addData(buffer, 0, n);
228         }
229         return stats;
230     }
231 
232     static Charset forName(String charset) throws Exception {
233         try {
234             return CharsetUtils.forName(charset);
235         } catch (Exception e) {
236             // ICU4j sometimes returns 'ISO-8859-8-I', which is unsupported!
237             // Cf. https://en.wikipedia.org/wiki/ISO/IEC_8859-8
238             // "Nominally ISO-8859-8 (code page 28598) is for 'visual order',
239             // and ISO-8859-8-I (code page 38598) is for logical order.
240             // But usually in practice, and required for HTML and XML
241             // documents, ISO-8859-8 also stands for logical order text."
242             charset = charset.replaceAll("(?i)-I\\b", "");
243             try {
244                 return CharsetUtils.forName(charset);
245             } catch (Exception ignored) {
246                 throw e;
247             }
248         }
249     }
250 
251     private static Charset charset(String charset) {
252         try {
253             return forName(charset);
254         } catch (Exception e) {
255             return null;
256         }
257     }
258 
259     private static final Evaluator charsetMetas = QueryParser.parse("meta[http-equiv=content-type], meta[charset]");
260 
261     static Charset htmlCharset(TextStatistics stats, Element root) {
262         for (Element meta : Selector.select(charsetMetas, root)) {
263             Charset foundCharset = correctVariant(stats, charset(meta.attr("charset")));
264             if (foundCharset != null) {
265                 return foundCharset;
266             }
267             foundCharset = correctVariant(stats, contentTypeCharset(meta.attr("content")));
268             if (foundCharset != null) {
269                 return foundCharset;
270             }
271         }
272         return null;
273     }
274 
275     private static final Pattern contentTypeCharsetPattern = Pattern
276             .compile("(?i)\\bcharset\\s*=[\\s\"']*([^\\s,;\"']+)");
277 
278     static Charset contentTypeCharset(CharSequence contentType) {
279         if (contentType == null)
280             return null;
281         Matcher m = contentTypeCharsetPattern.matcher(contentType);
282         if (m.find()) {
283             try {
284                 return forName(m.group(1));
285             } catch (Exception e) {
286                 return null;
287             }
288         }
289         return null;
290     }
291 
292     private static final Pattern xmlEncoding = Pattern
293             .compile("(?is)\\A\\s*<\\?\\s*xml\\s+[^<>]*encoding\\s*=\\s*(?:['\"]\\s*)?([-_:.a-z0-9]+)");
294 
295     static Charset xmlCharset(TextStatistics stats, CharSequence str) {
296         Matcher matcher = xmlEncoding.matcher(str);
297         if (matcher.find()) {
298             return correctVariant(stats, charset(matcher.group(1)));
299         } else {
300             return null;
301         }
302     }
303 
304     // uncomment this handy function to print out invalid bytes for a charset
305     // public static void main(String[] args) throws Exception {
306     // String[] cs = {
307     // "ISO-8859-15",
308     // "windows-1252", "ISO-8859-1",
309     // "windows-1250", "ISO-8859-2",
310     // "windows-1253", "ISO-8859-7",
311     // "windows-1255", "ISO-8859-8",
312     // "windows-1254", "ISO-8859-9",
313     // "windows-1251",
314     // "windows-1256",
315     // "ISO-8859-5",
316     // "ISO-8859-6"
317     // };
318     //
319     // for (String name : cs) {
320     // Charset c = EncodingUtils.forName(name);
321     // if (c.newEncoder().maxBytesPerChar() > 1) {
322     // throw new IllegalArgumentException("this method doesn't support " + c);
323     // }
324     // String line = java.util.stream.IntStream
325     // .range(0, 256)
326     // .filter(i -> new String(new byte[]{(byte) i}, c).getBytes(c)[0] != (byte)i)
327     // .mapToObj(i -> "0x" + Integer.toHexString(i).toUpperCase())
328     // .collect(java.util.stream.Collectors.joining(", ", "undefined " + name + " = {", "};"));
329     //
330     // System.out.println(line);
331     // }
332     // }
333 
334 }