This project has retired. For details please refer to its
Attic page.
TikaMIMETypeDetectorTest xref
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.any23.mime;
18
19 import org.junit.Assert;
20 import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
21 import org.junit.After;
22 import org.junit.Before;
23 import org.junit.Test;
24 import org.eclipse.rdf4j.rio.RDFFormat;
25
26 import java.io.BufferedInputStream;
27 import java.io.ByteArrayInputStream;
28 import java.io.IOException;
29 import java.io.InputStream;
30 import java.nio.charset.StandardCharsets;
31 import java.util.Arrays;
32 import java.util.Collection;
33 import java.util.List;
34
35
36
37
38
39
40
41 public class TikaMIMETypeDetectorTest {
42
43 private static final String PLAIN = "text/plain";
44 private static final String HTML = "text/html";
45 private static final String XML = "application/xml";
46 private static final String TRIX = RDFFormat.TRIX.getDefaultMIMEType();
47 private static final String XHTML = "application/xhtml+xml";
48 private static final String RDFXML = RDFFormat.RDFXML.getDefaultMIMEType();
49 private static final String TURTLE = RDFFormat.TURTLE.getDefaultMIMEType();
50 private static final String N3 = RDFFormat.N3.getDefaultMIMEType();
51 private static final String NQUADS = RDFFormat.NQUADS.getDefaultMIMEType();
52 private static final String CSV = "text/csv";
53 private static final String RSS = "application/rss+xml";
54 private static final String ATOM = "application/atom+xml";
55 private static final String YAML = "text/x-yaml";
56
57 private TikaMIMETypeDetector detector;
58
59 @Before
60 public void setUp() throws Exception {
61 detector = new TikaMIMETypeDetector(new WhiteSpacesPurifier());
62 }
63
64 @After
65 public void tearDown() throws Exception {
66 detector = null;
67 }
68
69 @Test
70 public void testN3Detection() throws IOException {
71 assertN3Detection("<http://example.org/path> <http://foo.com> <http://example.org/Document/foo#> .");
72 assertN3Detection("_:bnode1 <http://foo.com> _:bnode2 .");
73 assertN3Detection("<http://www.example.com> <http://purl.org/dc/elements/1.1/title> \"x\" .");
74 assertN3Detection("<http://www.example.com> <http://purl.org/dc/elements/1.1/title> \"x\"@it .");
75 assertN3Detection("<http://www.example.com> <http://purl.org/dc/elements/1.1/title> \"x\"^^<http://xxx.net> .");
76 assertN3Detection("<http://www.example.com> <http://purl.org/dc/elements/1.1/title> \"x\"^^xsd:integer .");
77
78
79 assertN3DetectionFail(
80 "" + "<http://wrong.example.org/path> <http://wrong.foo.com> . <http://wrong.org/Document/foo#>");
81
82 assertN3DetectionFail(
83 "<http://example.org/path> <http://foo.com> <http://dom.org/Document/foo#> <http://path/to/graph> .");
84 }
85
86 @Test
87 public void testNQuadsDetection() throws IOException {
88 assertNQuadsDetection(
89 "<http://www.ex.eu> <http://foo.com> <http://example.org/Document/foo#> <http://path.to.graph> .");
90 assertNQuadsDetection("_:bnode1 <http://foo.com> _:bnode2 <http://path.to.graph> .");
91 assertNQuadsDetection(
92 "<http://www.ex.eu> <http://purl.org/dc/elements/1.1/title> \"x\" <http://path.to.graph> .");
93 assertNQuadsDetection(
94 "<http://www.ex.eu> <http://purl.org/dc/elements/1.1/title> \"x\"@it <http://path.to.graph> .");
95 assertNQuadsDetection(
96 "<http://www.ex.eu> <http://dd.cc.org/1.1/p> \"xxx\"^^<http://www.sp.net/a#tt> <http://path.to.graph> .");
97 assertNQuadsDetection(
98 "<http://www.ex.eu> <http://purlo.org/1.1/title> \"yyy\"^^xsd:datetime <http://path.to.graph> .");
99
100
101 assertNQuadsDetectionFail(
102 "<http://www.wrong.com> <http://wrong.com/1.1/tt> \"x\"^^<http://xxx.net/int> . <http://path.to.graph>");
103
104 assertNQuadsDetectionFail("<http://example.org/path> <http://foo.com> <http://example.org/Document/foo#> .");
105 }
106
107
108 @Test
109 public void testDetectRSS1ByContent() throws Exception {
110 detectMIMEtypeByContent(RDFXML, manifestRss1());
111 }
112
113 private List<String> manifestRss1() {
114 return Arrays.asList("/application/rss1/test1");
115 }
116
117 @Test
118 public void testDetectRSS2ByContent() throws Exception {
119 detectMIMEtypeByContent(RSS, manifestRss2());
120 }
121
122 private List<String> manifestRss2() {
123 return Arrays.asList("/application/rss2/index.html", "/application/rss2/rss2sample.xml",
124 "/application/rss2/test1");
125 }
126
127 @Test
128 public void testDetectRDFN3ByContent() throws Exception {
129 detectMIMEtypeByContent(N3, manifestN3());
130 }
131
132 private List<String> manifestN3() {
133 return Arrays.asList("/application/rdfn3/test1", "/application/rdfn3/test2", "/application/rdfn3/test3");
134 }
135
136 @Test
137 public void testDetectRDFNQuadsByContent() throws Exception {
138 detectMIMEtypeByContent(NQUADS, manifestNQuads());
139 }
140
141 private List<String> manifestNQuads() {
142 return Arrays.asList("/application/nquads/test1.nq", "/application/nquads/test2.nq");
143 }
144
145 @Test
146 public void testDetectRDFXMLByContent() throws Exception {
147 detectMIMEtypeByContent(RDFXML, manifestRdfXml());
148 }
149
150 private List<String> manifestRdfXml() {
151 return Arrays.asList("/application/rdfxml/error.rdf", "/application/rdfxml/foaf",
152 "/application/rdfxml/physics.owl", "/application/rdfxml/test1", "/application/rdfxml/test2",
153 "/application/rdfxml/test3");
154 }
155
156 @Test
157 public void testDetectTriXByContent() throws Exception {
158 detectMIMEtypeByContent(TRIX, manifestTrix());
159 }
160
161 private List<String> manifestTrix() {
162 return Arrays.asList("/application/trix/test1.trx");
163 }
164
165 @Test
166 public void testDetectAtomByContent() throws Exception {
167 detectMIMEtypeByContent(ATOM, manifestAtom());
168 }
169
170 private List<String> manifestAtom() {
171 return Arrays.asList("/application/atom/atom.xml");
172 }
173
174 @Test
175 public void testDetectHTMLByContent() throws Exception {
176 detectMIMEtypeByContent(HTML, manifestHtml());
177 }
178
179 private List<String> manifestHtml() {
180 return Arrays.asList("/text/html/test1");
181 }
182
183 @Test
184 public void testDetectRDFaByContent() throws Exception {
185 detectMIMEtypeByContent(XHTML, manifestRdfa());
186 }
187
188 private List<String> manifestRdfa() {
189 return Arrays.asList("/application/rdfa/false.test", "/application/rdfa/london-gazette.html",
190 "/application/rdfa/mic.xhtml", "/application/rdfa/test1.html");
191 }
192
193 @Test
194 public void testDetectXHTMLByContent() throws Exception {
195 detectMIMEtypeByContent(XHTML, manifestXHtml());
196 }
197
198 private List<String> manifestXHtml() {
199 return Arrays.asList("/application/xhtml/blank-file-header.xhtml", "/application/xhtml/index.html",
200 "/application/xhtml/test1");
201 }
202
203 @Test
204 public void testDetectWSDLByContent() throws Exception {
205 detectMIMEtypeByContent("application/x-wsdl", manifestWsdl());
206 }
207
208 private List<String> manifestWsdl() {
209 return Arrays.asList("/application/wsdl/error.wsdl", "/application/wsdl/test1");
210 }
211
212 @Test
213 public void testDetectZIPByContent() throws Exception {
214 detectMIMEtypeByContent("application/zip", manifestZip());
215 }
216
217 private List<String> manifestZip() {
218 return Arrays.asList("/application/zip/4_entries.zip", "/application/zip/test1.zip", "/application/zip/test2");
219 }
220
221 @Test
222 public void testDetectCSVByContent() throws Exception {
223 detectMIMEtypeByContent(CSV, manifestCsv());
224 }
225
226 private List<String> manifestCsv() {
227 return Arrays.asList("/org/apache/any23/extractor/csv/test-comma.csv",
228 "/org/apache/any23/extractor/csv/test-semicolon.csv", "/org/apache/any23/extractor/csv/test-tab.csv",
229 "/org/apache/any23/extractor/csv/test-type.csv");
230 }
231
232
233
234
235 @Test
236 public void testDetectContentPlainByMeta() throws IOException {
237 detectMIMETypeByMimeTypeHint(PLAIN, "text/plain");
238 }
239
240 @Test
241 public void testDetectTextRDFByMeta() throws IOException {
242 detectMIMETypeByMimeTypeHint(RDFXML, "text/rdf");
243 }
244
245 @Test
246 public void testDetectTextN3ByMeta() throws IOException {
247 detectMIMETypeByMimeTypeHint(N3, "text/rdf+n3");
248 }
249
250 @Test
251 public void testDetectTextNQuadsByMeta() throws IOException {
252 detectMIMETypeByMimeTypeHint(NQUADS, "application/n-quads");
253 }
254
255 @Test
256 public void testDetectTextTurtleByMeta() throws IOException {
257 detectMIMETypeByMimeTypeHint(TURTLE, "text/turtle");
258 }
259
260 @Test
261 public void testDetectRDFXMLByMeta() throws IOException {
262 detectMIMETypeByMimeTypeHint(RDFXML, "application/rdf+xml");
263 }
264
265 @Test
266 public void testDetectXMLByMeta() throws IOException {
267 detectMIMETypeByMimeTypeHint(XML, "application/xml");
268 }
269
270 @Test
271 public void testDetectTriXByMeta() throws IOException {
272 detectMIMETypeByMimeTypeHint(TRIX, "application/trix");
273 }
274
275 @Test
276 public void testDetectExtensionN3ByMeta() throws IOException {
277 detectMIMETypeByMimeTypeHint(PLAIN, "text/plain");
278 }
279
280 @Test
281 public void testDetectXHTMLByMeta() throws IOException {
282 detectMIMETypeByMimeTypeHint(XHTML, "application/xhtml+xml");
283 }
284
285 @Test
286 public void testDetectTextHTMLByMeta() throws IOException {
287 detectMIMETypeByMimeTypeHint(HTML, "text/html");
288 }
289
290 @Test
291 public void testDetectTextPlainByMeta() throws IOException {
292 detectMIMETypeByMimeTypeHint(PLAIN, "text/plain");
293 }
294
295 @Test
296 public void testDetectApplicationXMLByMeta() throws IOException {
297 detectMIMETypeByMimeTypeHint(XML, "application/xml");
298 }
299
300 @Test
301 public void testDetectApplicationCSVByMeta() throws IOException {
302 detectMIMETypeByMimeTypeHint(CSV, "text/csv");
303 }
304
305 @Test
306 public void testDetectApplicationYAMLByMeta() throws IOException {
307 detectMIMETypeByMimeTypeHint(YAML, "text/x-yaml");
308 }
309
310
311
312
313 @Test
314 public void testRDFXMLByContentAndName() throws Exception {
315 detectMIMETypeByContentAndName(RDFXML, manifestRdfXml());
316 }
317
318 @Test
319 public void testTriXByContentAndName() throws Exception {
320 detectMIMETypeByContentAndName(TRIX, manifestTrix());
321 }
322
323 @Test
324 public void testRSS1ByContentAndName() throws Exception {
325 detectMIMETypeByContentAndName(RDFXML, manifestRss1());
326 }
327
328 @Test
329 public void testRSS2ByContentAndName() throws Exception {
330 detectMIMETypeByContentAndName(RSS, manifestRss2());
331 }
332
333 @Test
334 public void testDetectRDFN3ByContentAndName() throws Exception {
335 detectMIMETypeByContentAndName(N3, manifestN3());
336 }
337
338 @Test
339 public void testDetectRDFNQuadsByContentAndName() throws Exception {
340 detectMIMETypeByContentAndName(NQUADS, manifestNQuads());
341 }
342
343 @Test
344 public void testAtomByContentAndName() throws Exception {
345 detectMIMETypeByContentAndName(ATOM, manifestAtom());
346 }
347
348 @Test
349 public void testHTMLByContentAndName() throws Exception {
350 detectMIMETypeByContentAndName(HTML, manifestHtml());
351 }
352
353 @Test
354 public void testXHTMLByContentAndName() throws Exception {
355 detectMIMETypeByContentAndName(XHTML, manifestXHtml());
356 }
357
358 @Test
359 public void testWSDLByContentAndName() throws Exception {
360 detectMIMETypeByContentAndName("application/x-wsdl", manifestWsdl());
361 }
362
363 @Test
364 public void testZipByContentAndName() throws Exception {
365 detectMIMETypeByContentAndName("application/zip", manifestZip());
366 }
367
368 @Test
369 public void testRDFaByContentAndName() throws Exception {
370 detectMIMETypeByContentAndName(XHTML, manifestRdfa());
371 }
372
373 @Test
374 public void testCSVByContentAndName() throws Exception {
375 detectMIMETypeByContentAndName(CSV, manifestCsv());
376 }
377
378
379
380
381
382
383
384 @Test
385 public void testYAMLByContentAndName() throws Exception {
386 detectMIMETypeByContentAndName(YAML, manifestYAML());
387 }
388
389 private List<String> manifestYAML() {
390 return Arrays.asList("/org/apache/any23/extractor/yaml/simple-load.yml",
391 "/org/apache/any23/extractor/yaml/simple-load_no_head.yml",
392 "/org/apache/any23/extractor/yaml/simple-load_yaml.yaml");
393 }
394
395
396 private void assertN3Detection(String n3Exp) throws IOException {
397 ByteArrayInputStream bais = new ByteArrayInputStream(n3Exp.getBytes(StandardCharsets.UTF_8));
398 Assert.assertTrue(TikaMIMETypeDetector.checkN3Format(bais));
399 }
400
401 private void assertN3DetectionFail(String n3Exp) throws IOException {
402 ByteArrayInputStream bais = new ByteArrayInputStream(n3Exp.getBytes(StandardCharsets.UTF_8));
403 Assert.assertFalse(TikaMIMETypeDetector.checkN3Format(bais));
404 }
405
406 private void assertNQuadsDetection(String n4Exp) throws IOException {
407 ByteArrayInputStream bais = new ByteArrayInputStream(n4Exp.getBytes(StandardCharsets.UTF_8));
408 Assert.assertTrue(TikaMIMETypeDetector.checkNQuadsFormat(bais));
409 }
410
411 private void assertNQuadsDetectionFail(String n4Exp) throws IOException {
412 ByteArrayInputStream bais = new ByteArrayInputStream(n4Exp.getBytes(StandardCharsets.UTF_8));
413 Assert.assertFalse(TikaMIMETypeDetector.checkNQuadsFormat(bais));
414 }
415
416
417
418
419
420
421
422
423
424
425
426 private void detectMIMEtypeByContent(String expectedMimeType, Collection<String> manifest) throws IOException {
427 String detectedMimeType;
428 for (String test : manifest) {
429 InputStream is = new BufferedInputStream(this.getClass().getResourceAsStream(test));
430 detectedMimeType = detector.guessMIMEType(null, is, null).toString();
431 if (test.contains("error")) {
432 Assert.assertNotSame(expectedMimeType, detectedMimeType);
433 } else {
434 Assert.assertEquals(
435 String.format(java.util.Locale.ROOT, "Error in mimetype detection for file %s", test),
436 expectedMimeType, detectedMimeType);
437 }
438 is.close();
439 }
440 }
441
442
443
444
445
446
447
448
449
450 private void detectMIMETypeByMimeTypeHint(String expectedMimeType, String contentTypeHeader) throws IOException {
451 String detectedMimeType = detector.guessMIMEType(null, null, MIMEType.parse(contentTypeHeader)).toString();
452 Assert.assertEquals(expectedMimeType, detectedMimeType);
453 }
454
455
456
457
458
459
460
461
462
463 private void detectMIMETypeByContentAndName(String expectedMimeType, Collection<String> manifest)
464 throws IOException {
465 String detectedMimeType;
466 for (String test : manifest) {
467 InputStream is = new BufferedInputStream(this.getClass().getResourceAsStream(test));
468 detectedMimeType = detector.guessMIMEType(test, is, null).toString();
469 if (test.contains("error")) {
470 Assert.assertNotSame(expectedMimeType, detectedMimeType);
471 } else {
472 Assert.assertEquals(
473 String.format(java.util.Locale.ROOT, "Error while detecting mimetype in file %s", test),
474 expectedMimeType, detectedMimeType);
475 }
476 is.close();
477 }
478 }
479
480 }