This project has retired. For details please refer to its Attic page.
TikaEncodingDetectorTest xref
View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.encoding;
19  
20  import org.apache.tika.detect.TextStatistics;
21  import org.junit.After;
22  import org.junit.Before;
23  import org.junit.Test;
24  
25  import java.io.ByteArrayInputStream;
26  import java.io.IOException;
27  import java.io.InputStream;
28  import java.nio.charset.Charset;
29  
30  import static java.nio.charset.StandardCharsets.ISO_8859_1;
31  import static java.nio.charset.StandardCharsets.UTF_8;
32  import static org.junit.Assert.assertEquals;
33  
34  /**
35   * Test case for {@link TikaEncodingDetector}.
36   *
37   * @author Michele Mostarda ( michele.mostarda@gmail.com )
38   * @author Davide Palmisano ( dpalmisano@gmail.com )
39   * 
40   * @version $Id$
41   */
42  public class TikaEncodingDetectorTest {
43  
44      private TikaEncodingDetector detector;
45  
46      @Before
47      public void setUp() {
48          detector = new TikaEncodingDetector();
49      }
50  
51      @After
52      public void tearDown() {
53          detector = null;
54      }
55  
56      @Test
57      public void testISO8859HTML() throws IOException {
58          assertEncoding("ISO-8859-1", "/microformats/xfn/encoding-iso-8859-1.html");
59      }
60  
61      @Test
62      public void testISO8859XHTML() throws IOException {
63          assertEncoding("ISO-8859-1", "/microformats/xfn/encoding-iso-8859-1.xhtml");
64      }
65  
66      @Test
67      public void testUTF8AfterTitle() throws IOException {
68          assertEncoding("UTF-8", "/microformats/xfn/encoding-utf-8-after-title.html");
69      }
70  
71      @Test
72      public void testUTF8HTML() throws IOException {
73          assertEncoding("UTF-8", "/microformats/xfn/encoding-utf-8.html");
74      }
75  
76      @Test
77      public void testUTF8XHTML() throws IOException {
78          assertEncoding("UTF-8", "/microformats/xfn/encoding-utf-8.xhtml");
79      }
80  
81      @Test
82      public void testEncodingHTML() throws IOException {
83          assertEncoding("UTF-8", "/html/encoding-test.html");
84      }
85  
86      @Test
87      public void testXMLEncodingPattern() throws IOException {
88          String[] strings = { "<?xml encoding=\"UTF-8\"?>", " \n<?xMl encoding   = 'utf-8'?>",
89                  "\n <?Xml enCoding=Utf8?>" };
90          for (String s : strings) {
91              Charset detected = EncodingUtils.xmlCharset(new TextStatistics(), s);
92              assertEquals(detected, UTF_8);
93          }
94      }
95  
96      private static ByteArrayInputStream bytes(String string, Charset encoding) {
97          return new ByteArrayInputStream(string.getBytes(encoding));
98      }
99  
100     @Test
101     public void testUtf8Simple() throws IOException {
102         assertEquals("UTF-8", detector.guessEncoding(bytes("Hellö Wörld!", UTF_8)));
103     }
104 
105     @Test
106     public void testIso88591Simple() throws IOException {
107         assertEquals("ISO-8859-1", detector.guessEncoding(bytes("Hellö Wörld!", ISO_8859_1)));
108     }
109 
110     @Test
111     public void testTikaIssue771() throws IOException {
112         assertEquals("UTF-8", detector.guessEncoding(bytes("Hello, World!", UTF_8)));
113     }
114 
115     @Test
116     public void testTikaIssue868() throws IOException {
117         assertEquals("UTF-8", detector.guessEncoding(bytes("Indanyl", UTF_8)));
118     }
119 
120     @Test
121     public void testTikaIssue2771() throws IOException {
122         assertEquals("UTF-8", detector.guessEncoding(bytes("Name: Amanda\nJazz Band", UTF_8)));
123     }
124 
125     private void assertEncoding(final String expected, final String resource) throws IOException {
126         try (InputStream fis = getClass().getResourceAsStream(resource)) {
127             String encoding = detector.guessEncoding(fis);
128             assertEquals("Unexpected encoding", expected, encoding);
129         }
130     }
131 
132 }