This project has been retired. For details, please refer to its
Attic page.
TikaEncodingDetectorTest xref
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.encoding;
19
20 import org.apache.tika.detect.TextStatistics;
21 import org.junit.After;
22 import org.junit.Before;
23 import org.junit.Test;
24
25 import java.io.ByteArrayInputStream;
26 import java.io.IOException;
27 import java.io.InputStream;
28 import java.nio.charset.Charset;
29
30 import static java.nio.charset.StandardCharsets.ISO_8859_1;
31 import static java.nio.charset.StandardCharsets.UTF_8;
32 import static org.junit.Assert.assertEquals;
33
34
35
36
37
38
39
40
41
42 public class TikaEncodingDetectorTest {
43
44 private TikaEncodingDetector detector;
45
46 @Before
47 public void setUp() {
48 detector = new TikaEncodingDetector();
49 }
50
51 @After
52 public void tearDown() {
53 detector = null;
54 }
55
56 @Test
57 public void testISO8859HTML() throws IOException {
58 assertEncoding("ISO-8859-1", "/microformats/xfn/encoding-iso-8859-1.html");
59 }
60
61 @Test
62 public void testISO8859XHTML() throws IOException {
63 assertEncoding("ISO-8859-1", "/microformats/xfn/encoding-iso-8859-1.xhtml");
64 }
65
66 @Test
67 public void testUTF8AfterTitle() throws IOException {
68 assertEncoding("UTF-8", "/microformats/xfn/encoding-utf-8-after-title.html");
69 }
70
71 @Test
72 public void testUTF8HTML() throws IOException {
73 assertEncoding("UTF-8", "/microformats/xfn/encoding-utf-8.html");
74 }
75
76 @Test
77 public void testUTF8XHTML() throws IOException {
78 assertEncoding("UTF-8", "/microformats/xfn/encoding-utf-8.xhtml");
79 }
80
81 @Test
82 public void testEncodingHTML() throws IOException {
83 assertEncoding("UTF-8", "/html/encoding-test.html");
84 }
85
86 @Test
87 public void testXMLEncodingPattern() throws IOException {
88 String[] strings = { "<?xml encoding=\"UTF-8\"?>", " \n<?xMl encoding = 'utf-8'?>",
89 "\n <?Xml enCoding=Utf8?>" };
90 for (String s : strings) {
91 Charset detected = EncodingUtils.xmlCharset(new TextStatistics(), s);
92 assertEquals(detected, UTF_8);
93 }
94 }
95
96 private static ByteArrayInputStream bytes(String string, Charset encoding) {
97 return new ByteArrayInputStream(string.getBytes(encoding));
98 }
99
100 @Test
101 public void testUtf8Simple() throws IOException {
102 assertEquals("UTF-8", detector.guessEncoding(bytes("Hellö Wörld!", UTF_8)));
103 }
104
105 @Test
106 public void testIso88591Simple() throws IOException {
107 assertEquals("ISO-8859-1", detector.guessEncoding(bytes("Hellö Wörld!", ISO_8859_1)));
108 }
109
110 @Test
111 public void testTikaIssue771() throws IOException {
112 assertEquals("UTF-8", detector.guessEncoding(bytes("Hello, World!", UTF_8)));
113 }
114
115 @Test
116 public void testTikaIssue868() throws IOException {
117 assertEquals("UTF-8", detector.guessEncoding(bytes("Indanyl", UTF_8)));
118 }
119
120 @Test
121 public void testTikaIssue2771() throws IOException {
122 assertEquals("UTF-8", detector.guessEncoding(bytes("Name: Amanda\nJazz Band", UTF_8)));
123 }
124
125 private void assertEncoding(final String expected, final String resource) throws IOException {
126 try (InputStream fis = getClass().getResourceAsStream(resource)) {
127 String encoding = detector.guessEncoding(fis);
128 assertEquals("Unexpected encoding", expected, encoding);
129 }
130 }
131
132 }