View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.encoding;
19  
20  import org.apache.tika.detect.TextStatistics;
21  import org.apache.tika.utils.CharsetUtils;
22  import org.jsoup.nodes.Element;
23  import org.jsoup.select.Evaluator;
24  import org.jsoup.select.QueryParser;
25  import org.jsoup.select.Selector;
26  import org.rypt.f8.Utf8Statistics;
27  
28  import java.io.IOException;
29  import java.io.InputStream;
30  import java.nio.charset.Charset;
31  import java.nio.charset.StandardCharsets;
32  import java.util.regex.Matcher;
33  import java.util.regex.Pattern;
34  
35  /**
36   * @author Hans Brende
37   */
38  class EncodingUtils {
39  
40      /**
41       * Very efficient method to convert an input stream directly to an ISO-8859-1 encoded string
42       */
43      static String iso_8859_1(InputStream is) throws IOException {
44          StringBuilder chars = new StringBuilder(Math.max(is.available(), 8192));
45          byte[] buffer = new byte[8192];
46          int n;
47          while ((n = is.read(buffer)) != -1) {
48              for (int i = 0; i < n; i++) {
49                  chars.append((char)(buffer[i] & 0xFF));
50              }
51          }
52          return chars.toString();
53      }
54  
55  
56      //get correct variant, or null if charset is incompatible with stats
57      static Charset correctVariant(TextStatistics stats, Charset charset) {
58          if (charset == null) {
59              return null;
60          }
61          switch (charset.name()) {
62              //ISO-8859-1 variants
63              case "ISO-8859-1":
64                  //Take a hint from icu4j's CharsetRecog_8859_1 and Tika's UniversalEncodingListener:
65                  // return windows-1252 before ISO-8859-1 if:
66                  // (1) C1 ctrl chars are used (as in icu4j), or
67                  // (2) '\r' is used (as in Tika)
68                  if ((stats.count('\r') != 0 || hasC1Control(stats)) && hasNoneOf(stats, windows1252Illegals)) {
69                      try {
70                          return forName("windows-1252");
71                      } catch (Exception e) {
72                          //ignore
73                      }
74                  }
75                  return iso_8859_1_or_15(stats);
76              case "windows-1252":
77                  return hasNoneOf(stats, windows1252Illegals) ? charset : iso_8859_1_or_15(stats);
78  
79              //ISO-8859-2 variants
80              case "ISO-8859-2":
81                  //Take a hint from icu4j's CharsetRecog_8859_2 class:
82                  // return windows-1250 before ISO-8859-2 if has valid C1 chars
83                  if (hasC1Control(stats) && hasNoneOf(stats, windows1250Illegals)) {
84                      try {
85                          return forName("windows-1250");
86                      } catch (Exception e) {
87                          //ignore
88                      }
89                  }
90                  return charset;
91              case "windows-1250":
92                  return hasNoneOf(stats, windows1250Illegals) ? charset : charset("ISO-8859-2");
93  
94              //ISO-8859-7 variants
95              case "ISO-8859-7":
96                  //Take a hint from icu4j's CharsetRecog_8859_7 class:
97                  // return windows-1253 before ISO-8859-7 if has valid C1 chars
98                  if (hasC1Control(stats) && hasNoneOf(stats, windows1253Illegals)) {
99                      try {
100                         return forName("windows-1253");
101                     } catch (Exception e) {
102                         //ignore
103                     }
104                 }
105                 return hasNoneOf(stats, iso_8859_7Illegals) ? charset : null;
106             case "windows-1253":
107                 return hasNoneOf(stats, windows1253Illegals) ? charset :
108                         hasNoneOf(stats, iso_8859_7Illegals) ? charset("ISO-8859-7") : null;
109 
110             //ISO-8859-8 variants
111             case "ISO-8859-8":
112             case "ISO-8859-8-I":
113                 //Take a hint from icu4j's CharsetRecog_8859_8 class:
114                 // return windows-1255 before ISO-8859-8 if has valid C1 chars
115                 if (hasC1Control(stats) && hasNoneOf(stats, windows1255Illegals)) {
116                     try {
117                         return forName("windows-1255");
118                     } catch (Exception e) {
119                         //ignore
120                     }
121                 }
122                 return hasNoneOf(stats, iso_8859_8Illegals) ? charset : null;
123             case "windows-1255":
124                 return hasNoneOf(stats, windows1255Illegals) ? charset :
125                         hasNoneOf(stats, iso_8859_8Illegals) ? charset("ISO-8859-8") : null;
126 
127             //ISO-8859-9 variants
128             case "ISO-8859-9":
129                 //Take a hint from icu4j's CharsetRecog_8859_9 class:
130                 // return windows-1254 before ISO-8859-9 if has valid C1 chars
131                 if (hasC1Control(stats) && hasNoneOf(stats, windows1254Illegals)) {
132                     try {
133                         return forName("windows-1254");
134                     } catch (Exception e) {
135                         //ignore
136                     }
137                 }
138                 return charset;
139             case "windows-1254":
140                 return hasNoneOf(stats, windows1254Illegals) ? charset : charset("ISO-8859-9");
141 
142             //Others: just make sure no illegal characters are present
143             case "windows-1251":
144                 return hasNoneOf(stats, windows1251Illegals) ? charset : null;
145             case "ISO-8859-6":
146                 return hasNoneOf(stats, iso_8859_6Illegals) ? charset : null;
147             default:
148                 return charset;
149         }
150     }
151 
152     private static Charset iso_8859_1_or_15(TextStatistics stats) {
153         //Take a hint from Tika's UniversalEncodingListener:
154         // return ISO-8859-15 before ISO-8859-1 if currency/euro symbol is used
155         if (stats.count(0xa4) != 0) {
156             try {
157                 return forName("ISO-8859-15");
158             } catch (Exception e) {
159                 //ignore
160             }
161         }
162         return StandardCharsets.ISO_8859_1;
163     }
164 
165     private static final int[] windows1252Illegals = {0x81, 0x8D, 0x8F, 0x90, 0x9D};
166     private static final int[] windows1250Illegals = {0x81, 0x83, 0x88, 0x90, 0x98};
167     private static final int[] iso_8859_7Illegals = {0xAE, 0xD2, 0xFF};
168     private static final int[] windows1253Illegals = {0x81, 0x88, 0x8A, 0x8C, 0x8D,
169             0x8E, 0x8F, 0x90, 0x98, 0x9A, 0x9C, 0x9D, 0x9E, 0x9F, 0xAA, 0xD2, 0xFF};
170 
171     private static final int[] windows1255Illegals = {0x81, 0x8A, 0x8C, 0x8D, 0x8E, 0x8F, 0x90, 0x9A,
172             0x9C, 0x9D, 0x9E, 0x9F, 0xCA, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF, 0xFB, 0xFC, 0xFF};
173 
174     private static final int[] iso_8859_8Illegals = {0xA1, 0xBF, 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5,
175             0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4,
176             0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xFB, 0xFC, 0xFF};
177 
178     private static final int[] windows1254Illegals = {0x81, 0x8D, 0x8E, 0x8F, 0x90, 0x9D, 0x9E};
179 
180     private static final int[] windows1251Illegals = {0x98};
181 
182     private static final int[] iso_8859_6Illegals = {0xA1, 0xA2, 0xA3, 0xA5, 0xA6, 0xA7,
183             0xA8, 0xA9, 0xAA, 0xAB, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6,
184             0xB7, 0xB8, 0xB9, 0xBA, 0xBC, 0xBD, 0xBE, 0xC0, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
185             0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF};
186 
187 
188     private static boolean hasNoneOf(TextStatistics stats, int[] illegals) {
189         for (int i : illegals) {
190             if (stats.count(i) != 0) {
191                 return false;
192             }
193         }
194         return true;
195     }
196 
197     private static boolean hasC1Control(TextStatistics ts) {
198         for (int i = 0x80; i < 0xA0; i++) {
199             if (ts.count(i) != 0) {
200                 return true;
201             }
202         }
203         return false;
204     }
205 
206     private static class TextStatisticsOptimizedForUtf8 extends TextStatistics {
207 
208         private final Utf8Statistics utf8Stats = new Utf8Statistics();
209 
210         @Override
211         public void addData(byte[] buffer, int offset, int length) {
212             super.addData(buffer, offset, length);
213             utf8Stats.write(buffer, offset, length);
214         }
215 
216         @Override
217         public boolean looksLikeUTF8() {
218             return utf8Stats.looksLikeUtf8();
219         }
220     }
221 
222     /*
223      * Returns a custom implementation of Tika's TextStatistics class for an input stream
224      */
225     static TextStatistics stats(InputStream stream) throws IOException {
226         TextStatisticsOptimizedForUtf8 stats = new TextStatisticsOptimizedForUtf8();
227         byte[] buffer = new byte[8192];
228         int n;
229         while ((n = stream.read(buffer)) != -1) {
230             stats.addData(buffer, 0, n);
231         }
232         return stats;
233     }
234 
235     static Charset forName(String charset) throws Exception {
236         try {
237             return CharsetUtils.forName(charset);
238         } catch (Exception e) {
239             //ICU4j sometimes returns 'ISO-8859-8-I', which is unsupported!
240             // Cf. https://en.wikipedia.org/wiki/ISO/IEC_8859-8
241             // "Nominally ISO-8859-8 (code page 28598) is for 'visual order',
242             // and ISO-8859-8-I (code page 38598) is for logical order.
243             // But usually in practice, and required for HTML and XML
244             // documents, ISO-8859-8 also stands for logical order text."
245             charset = charset.replaceAll("(?i)-I\\b", "");
246             try {
247                 return CharsetUtils.forName(charset);
248             } catch (Exception ignored) {
249                 throw e;
250             }
251         }
252     }
253 
254     private static Charset charset(String charset) {
255         try {
256             return forName(charset);
257         } catch (Exception e) {
258             return null;
259         }
260     }
261 
262     private static final Evaluator charsetMetas = QueryParser
263             .parse("meta[http-equiv=content-type], meta[charset]");
264 
265     static Charset htmlCharset(TextStatistics stats, Element root) {
266         for (Element meta : Selector.select(charsetMetas, root)) {
267             Charset foundCharset = correctVariant(stats, charset(meta.attr("charset")));
268             if (foundCharset != null) {
269                 return foundCharset;
270             }
271             foundCharset = correctVariant(stats, contentTypeCharset(meta.attr("content")));
272             if (foundCharset != null) {
273                 return foundCharset;
274             }
275         }
276         return null;
277     }
278 
279 
280     private static final Pattern contentTypeCharsetPattern =
281             Pattern.compile("(?i)\\bcharset\\s*=[\\s\"']*([^\\s,;\"']+)");
282 
283     static Charset contentTypeCharset(CharSequence contentType) {
284         if (contentType == null)
285             return null;
286         Matcher m = contentTypeCharsetPattern.matcher(contentType);
287         if (m.find()) {
288             try {
289                 return forName(m.group(1));
290             } catch (Exception e) {
291                 return null;
292             }
293         }
294         return null;
295     }
296 
297     private static final Pattern xmlEncoding = Pattern.compile(
298             "(?is)\\A\\s*<\\?\\s*xml\\s+[^<>]*encoding\\s*=\\s*(?:['\"]\\s*)?([-_:.a-z0-9]+)");
299 
300     static Charset xmlCharset(TextStatistics stats, CharSequence str) {
301         Matcher matcher = xmlEncoding.matcher(str);
302         if (matcher.find()) {
303             return correctVariant(stats, charset(matcher.group(1)));
304         } else {
305             return null;
306         }
307     }
308 
309 
310     //uncomment this handy function to print out invalid bytes for a charset
311 //    public static void main(String[] args) throws Exception {
312 //        String[] cs = {
313 //                "ISO-8859-15",
314 //                "windows-1252", "ISO-8859-1",
315 //                "windows-1250", "ISO-8859-2",
316 //                "windows-1253", "ISO-8859-7",
317 //                "windows-1255", "ISO-8859-8",
318 //                "windows-1254", "ISO-8859-9",
319 //                "windows-1251",
320 //                "windows-1256",
321 //                "ISO-8859-5",
322 //                "ISO-8859-6"
323 //        };
324 //
325 //        for (String name : cs) {
326 //            Charset c = EncodingUtils.forName(name);
327 //            if (c.newEncoder().maxBytesPerChar() > 1) {
328 //                throw new IllegalArgumentException("this method doesn't support " + c);
329 //            }
330 //            String line = java.util.stream.IntStream
331 //                    .range(0, 256)
332 //                    .filter(i -> new String(new byte[]{(byte) i}, c).getBytes(c)[0] != (byte)i)
333 //                    .mapToObj(i -> "0x" + Integer.toHexString(i).toUpperCase())
334 //                    .collect(java.util.stream.Collectors.joining(", ", "undefined " + name + " = {", "};"));
335 //
336 //            System.out.println(line);
337 //        }
338 //    }
339 
340 }