This project has retired. For details please refer to its Attic page.
TikaMIMETypeDetectorTest xref
View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.any23.mime;
18  
19  import org.junit.Assert;
20  import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
21  import org.junit.After;
22  import org.junit.Before;
23  import org.junit.Test;
24  import org.eclipse.rdf4j.rio.RDFFormat;
25  
26  import java.io.BufferedInputStream;
27  import java.io.ByteArrayInputStream;
28  import java.io.IOException;
29  import java.io.InputStream;
30  import java.nio.charset.StandardCharsets;
31  import java.util.Arrays;
32  import java.util.Collection;
33  import java.util.List;
34  
35  /**
36   * Test case for {@link TikaMIMETypeDetector} class.
37   *
38   * @author juergen
39   * @author Michele Mostarda (michele.mostarda@gmail.com)
40   */
41  public class TikaMIMETypeDetectorTest {
42  
43      private static final String PLAIN = "text/plain";
44      private static final String HTML = "text/html";
45      private static final String XML = "application/xml";
46      private static final String TRIX = RDFFormat.TRIX.getDefaultMIMEType();
47      private static final String XHTML = "application/xhtml+xml";
48      private static final String RDFXML = RDFFormat.RDFXML.getDefaultMIMEType();
49      private static final String TURTLE = RDFFormat.TURTLE.getDefaultMIMEType();
50      private static final String N3 = RDFFormat.N3.getDefaultMIMEType();
51      private static final String NQUADS = RDFFormat.NQUADS.getDefaultMIMEType();
52      private static final String CSV = "text/csv";
53      private static final String RSS = "application/rss+xml";
54      private static final String ATOM = "application/atom+xml";
55      private static final String YAML = "text/x-yaml";
56  
57      private TikaMIMETypeDetector detector;
58  
59      @Before
60      public void setUp() throws Exception {
61          detector = new TikaMIMETypeDetector(new WhiteSpacesPurifier());
62      }
63  
64      @After
65      public void tearDown() throws Exception {
66          detector = null;
67      }
68  
69      @Test
70      public void testN3Detection() throws IOException {
71          assertN3Detection("<http://example.org/path> <http://foo.com> <http://example.org/Document/foo#> .");
72          assertN3Detection("_:bnode1 <http://foo.com> _:bnode2 .");
73          assertN3Detection("<http://www.example.com> <http://purl.org/dc/elements/1.1/title> \"x\" .");
74          assertN3Detection("<http://www.example.com> <http://purl.org/dc/elements/1.1/title> \"x\"@it .");
75          assertN3Detection("<http://www.example.com> <http://purl.org/dc/elements/1.1/title> \"x\"^^<http://xxx.net> .");
76          assertN3Detection("<http://www.example.com> <http://purl.org/dc/elements/1.1/title> \"x\"^^xsd:integer .");
77  
78          // Wrong N3 line '.'
79          assertN3DetectionFail(
80                  "" + "<http://wrong.example.org/path> <http://wrong.foo.com> . <http://wrong.org/Document/foo#>");
81          // NQuads is not mislead with N3.
82          assertN3DetectionFail(
83                  "<http://example.org/path> <http://foo.com> <http://dom.org/Document/foo#> <http://path/to/graph> .");
84      }
85  
86      @Test
87      public void testNQuadsDetection() throws IOException {
88          assertNQuadsDetection(
89                  "<http://www.ex.eu> <http://foo.com> <http://example.org/Document/foo#> <http://path.to.graph> .");
90          assertNQuadsDetection("_:bnode1 <http://foo.com> _:bnode2 <http://path.to.graph> .");
91          assertNQuadsDetection(
92                  "<http://www.ex.eu> <http://purl.org/dc/elements/1.1/title> \"x\" <http://path.to.graph> .");
93          assertNQuadsDetection(
94                  "<http://www.ex.eu> <http://purl.org/dc/elements/1.1/title> \"x\"@it <http://path.to.graph> .");
95          assertNQuadsDetection(
96                  "<http://www.ex.eu> <http://dd.cc.org/1.1/p> \"xxx\"^^<http://www.sp.net/a#tt> <http://path.to.graph> .");
97          assertNQuadsDetection(
98                  "<http://www.ex.eu> <http://purlo.org/1.1/title> \"yyy\"^^xsd:datetime <http://path.to.graph> .");
99  
100         // Wrong NQuads line.
101         assertNQuadsDetectionFail(
102                 "<http://www.wrong.com> <http://wrong.com/1.1/tt> \"x\"^^<http://xxx.net/int> . <http://path.to.graph>");
103         // N3 is not mislead with NQuads.
104         assertNQuadsDetectionFail("<http://example.org/path> <http://foo.com> <http://example.org/Document/foo#> .");
105     }
106 
107     /* BEGIN: by content. */
108     @Test
109     public void testDetectRSS1ByContent() throws Exception {
110         detectMIMEtypeByContent(RDFXML, manifestRss1());
111     }
112 
113     private List<String> manifestRss1() {
114         return Arrays.asList("/application/rss1/test1");
115     }
116 
117     @Test
118     public void testDetectRSS2ByContent() throws Exception {
119         detectMIMEtypeByContent(RSS, manifestRss2());
120     }
121 
122     private List<String> manifestRss2() {
123         return Arrays.asList("/application/rss2/index.html", "/application/rss2/rss2sample.xml",
124                 "/application/rss2/test1");
125     }
126 
127     @Test
128     public void testDetectRDFN3ByContent() throws Exception {
129         detectMIMEtypeByContent(N3, manifestN3());
130     }
131 
132     private List<String> manifestN3() {
133         return Arrays.asList("/application/rdfn3/test1", "/application/rdfn3/test2", "/application/rdfn3/test3");
134     }
135 
136     @Test
137     public void testDetectRDFNQuadsByContent() throws Exception {
138         detectMIMEtypeByContent(NQUADS, manifestNQuads());
139     }
140 
141     private List<String> manifestNQuads() {
142         return Arrays.asList("/application/nquads/test1.nq", "/application/nquads/test2.nq");
143     }
144 
145     @Test
146     public void testDetectRDFXMLByContent() throws Exception {
147         detectMIMEtypeByContent(RDFXML, manifestRdfXml());
148     }
149 
150     private List<String> manifestRdfXml() {
151         return Arrays.asList("/application/rdfxml/error.rdf", "/application/rdfxml/foaf",
152                 "/application/rdfxml/physics.owl", "/application/rdfxml/test1", "/application/rdfxml/test2",
153                 "/application/rdfxml/test3");
154     }
155 
156     @Test
157     public void testDetectTriXByContent() throws Exception {
158         detectMIMEtypeByContent(TRIX, manifestTrix());
159     }
160 
161     private List<String> manifestTrix() {
162         return Arrays.asList("/application/trix/test1.trx");
163     }
164 
165     @Test
166     public void testDetectAtomByContent() throws Exception {
167         detectMIMEtypeByContent(ATOM, manifestAtom());
168     }
169 
170     private List<String> manifestAtom() {
171         return Arrays.asList("/application/atom/atom.xml");
172     }
173 
174     @Test
175     public void testDetectHTMLByContent() throws Exception {
176         detectMIMEtypeByContent(HTML, manifestHtml());
177     }
178 
179     private List<String> manifestHtml() {
180         return Arrays.asList("/text/html/test1");
181     }
182 
183     @Test
184     public void testDetectRDFaByContent() throws Exception {
185         detectMIMEtypeByContent(XHTML, manifestRdfa());
186     }
187 
188     private List<String> manifestRdfa() {
189         return Arrays.asList("/application/rdfa/false.test", "/application/rdfa/london-gazette.html",
190                 "/application/rdfa/mic.xhtml", "/application/rdfa/test1.html");
191     }
192 
193     @Test
194     public void testDetectXHTMLByContent() throws Exception {
195         detectMIMEtypeByContent(XHTML, manifestXHtml());
196     }
197 
198     private List<String> manifestXHtml() {
199         return Arrays.asList("/application/xhtml/blank-file-header.xhtml", "/application/xhtml/index.html",
200                 "/application/xhtml/test1");
201     }
202 
203     @Test
204     public void testDetectWSDLByContent() throws Exception {
205         detectMIMEtypeByContent("application/x-wsdl", manifestWsdl());
206     }
207 
208     private List<String> manifestWsdl() {
209         return Arrays.asList("/application/wsdl/error.wsdl", "/application/wsdl/test1");
210     }
211 
212     @Test
213     public void testDetectZIPByContent() throws Exception {
214         detectMIMEtypeByContent("application/zip", manifestZip());
215     }
216 
217     private List<String> manifestZip() {
218         return Arrays.asList("/application/zip/4_entries.zip", "/application/zip/test1.zip", "/application/zip/test2");
219     }
220 
221     @Test
222     public void testDetectCSVByContent() throws Exception {
223         detectMIMEtypeByContent(CSV, manifestCsv());
224     }
225 
226     private List<String> manifestCsv() {
227         return Arrays.asList("/org/apache/any23/extractor/csv/test-comma.csv",
228                 "/org/apache/any23/extractor/csv/test-semicolon.csv", "/org/apache/any23/extractor/csv/test-tab.csv",
229                 "/org/apache/any23/extractor/csv/test-type.csv");
230     }
231 
232     /* END: by content. */
233 
234     /* BEGIN: by content metadata. */
235     @Test
236     public void testDetectContentPlainByMeta() throws IOException {
237         detectMIMETypeByMimeTypeHint(PLAIN, "text/plain");
238     }
239 
240     @Test
241     public void testDetectTextRDFByMeta() throws IOException {
242         detectMIMETypeByMimeTypeHint(RDFXML, "text/rdf");
243     }
244 
245     @Test
246     public void testDetectTextN3ByMeta() throws IOException {
247         detectMIMETypeByMimeTypeHint(N3, "text/rdf+n3");
248     }
249 
250     @Test
251     public void testDetectTextNQuadsByMeta() throws IOException {
252         detectMIMETypeByMimeTypeHint(NQUADS, "application/n-quads");
253     }
254 
255     @Test
256     public void testDetectTextTurtleByMeta() throws IOException {
257         detectMIMETypeByMimeTypeHint(TURTLE, "text/turtle");
258     }
259 
260     @Test
261     public void testDetectRDFXMLByMeta() throws IOException {
262         detectMIMETypeByMimeTypeHint(RDFXML, "application/rdf+xml");
263     }
264 
265     @Test
266     public void testDetectXMLByMeta() throws IOException {
267         detectMIMETypeByMimeTypeHint(XML, "application/xml");
268     }
269 
270     @Test
271     public void testDetectTriXByMeta() throws IOException {
272         detectMIMETypeByMimeTypeHint(TRIX, "application/trix");
273     }
274 
275     @Test
276     public void testDetectExtensionN3ByMeta() throws IOException {
277         detectMIMETypeByMimeTypeHint(PLAIN, "text/plain");
278     }
279 
280     @Test
281     public void testDetectXHTMLByMeta() throws IOException {
282         detectMIMETypeByMimeTypeHint(XHTML, "application/xhtml+xml");
283     }
284 
285     @Test
286     public void testDetectTextHTMLByMeta() throws IOException {
287         detectMIMETypeByMimeTypeHint(HTML, "text/html");
288     }
289 
290     @Test
291     public void testDetectTextPlainByMeta() throws IOException {
292         detectMIMETypeByMimeTypeHint(PLAIN, "text/plain");
293     }
294 
295     @Test
296     public void testDetectApplicationXMLByMeta() throws IOException {
297         detectMIMETypeByMimeTypeHint(XML, "application/xml");
298     }
299 
300     @Test
301     public void testDetectApplicationCSVByMeta() throws IOException {
302         detectMIMETypeByMimeTypeHint(CSV, "text/csv");
303     }
304 
305     @Test
306     public void testDetectApplicationYAMLByMeta() throws IOException {
307         detectMIMETypeByMimeTypeHint(YAML, "text/x-yaml");
308     }
309 
310     /* END: by content metadata. */
311 
312     /* BEGIN: by content and name. */
313     @Test
314     public void testRDFXMLByContentAndName() throws Exception {
315         detectMIMETypeByContentAndName(RDFXML, manifestRdfXml());
316     }
317 
318     @Test
319     public void testTriXByContentAndName() throws Exception {
320         detectMIMETypeByContentAndName(TRIX, manifestTrix());
321     }
322 
323     @Test
324     public void testRSS1ByContentAndName() throws Exception {
325         detectMIMETypeByContentAndName(RDFXML, manifestRss1());
326     }
327 
328     @Test
329     public void testRSS2ByContentAndName() throws Exception {
330         detectMIMETypeByContentAndName(RSS, manifestRss2());
331     }
332 
333     @Test
334     public void testDetectRDFN3ByContentAndName() throws Exception {
335         detectMIMETypeByContentAndName(N3, manifestN3());
336     }
337 
338     @Test
339     public void testDetectRDFNQuadsByContentAndName() throws Exception {
340         detectMIMETypeByContentAndName(NQUADS, manifestNQuads());
341     }
342 
343     @Test
344     public void testAtomByContentAndName() throws Exception {
345         detectMIMETypeByContentAndName(ATOM, manifestAtom());
346     }
347 
348     @Test
349     public void testHTMLByContentAndName() throws Exception {
350         detectMIMETypeByContentAndName(HTML, manifestHtml());
351     }
352 
353     @Test
354     public void testXHTMLByContentAndName() throws Exception {
355         detectMIMETypeByContentAndName(XHTML, manifestXHtml());
356     }
357 
358     @Test
359     public void testWSDLByContentAndName() throws Exception {
360         detectMIMETypeByContentAndName("application/x-wsdl", manifestWsdl());
361     }
362 
363     @Test
364     public void testZipByContentAndName() throws Exception {
365         detectMIMETypeByContentAndName("application/zip", manifestZip());
366     }
367 
368     @Test
369     public void testRDFaByContentAndName() throws Exception {
370         detectMIMETypeByContentAndName(XHTML, manifestRdfa());
371     }
372 
373     @Test
374     public void testCSVByContentAndName() throws Exception {
375         detectMIMETypeByContentAndName(CSV, manifestCsv());
376     }
377 
378     /**
379      * Test done only based on content is failed because the standard does not require to have "%YAML" header.
380      * 
381      * @throws Exception
382      *             if there is an error detecting the mime type from the content and name
383      */
384     @Test
385     public void testYAMLByContentAndName() throws Exception {
386         detectMIMETypeByContentAndName(YAML, manifestYAML());
387     }
388 
389     private List<String> manifestYAML() {
390         return Arrays.asList("/org/apache/any23/extractor/yaml/simple-load.yml",
391                 "/org/apache/any23/extractor/yaml/simple-load_no_head.yml",
392                 "/org/apache/any23/extractor/yaml/simple-load_yaml.yaml");
393     }
394 
395     /* END: by content and name. */
396     private void assertN3Detection(String n3Exp) throws IOException {
397         ByteArrayInputStream bais = new ByteArrayInputStream(n3Exp.getBytes(StandardCharsets.UTF_8));
398         Assert.assertTrue(TikaMIMETypeDetector.checkN3Format(bais));
399     }
400 
401     private void assertN3DetectionFail(String n3Exp) throws IOException {
402         ByteArrayInputStream bais = new ByteArrayInputStream(n3Exp.getBytes(StandardCharsets.UTF_8));
403         Assert.assertFalse(TikaMIMETypeDetector.checkN3Format(bais));
404     }
405 
406     private void assertNQuadsDetection(String n4Exp) throws IOException {
407         ByteArrayInputStream bais = new ByteArrayInputStream(n4Exp.getBytes(StandardCharsets.UTF_8));
408         Assert.assertTrue(TikaMIMETypeDetector.checkNQuadsFormat(bais));
409     }
410 
411     private void assertNQuadsDetectionFail(String n4Exp) throws IOException {
412         ByteArrayInputStream bais = new ByteArrayInputStream(n4Exp.getBytes(StandardCharsets.UTF_8));
413         Assert.assertFalse(TikaMIMETypeDetector.checkNQuadsFormat(bais));
414     }
415 
416     /**
417      * Checks the detection of a specific MIME based on content analysis.
418      *
419      * @param expectedMimeType
420      *            the expected mime type.
421      * @param testDir
422      *            the target file.
423      * 
424      * @throws IOException
425      */
426     private void detectMIMEtypeByContent(String expectedMimeType, Collection<String> manifest) throws IOException {
427         String detectedMimeType;
428         for (String test : manifest) {
429             InputStream is = new BufferedInputStream(this.getClass().getResourceAsStream(test));
430             detectedMimeType = detector.guessMIMEType(null, is, null).toString();
431             if (test.contains("error")) {
432                 Assert.assertNotSame(expectedMimeType, detectedMimeType);
433             } else {
434                 Assert.assertEquals(
435                         String.format(java.util.Locale.ROOT, "Error in mimetype detection for file %s", test),
436                         expectedMimeType, detectedMimeType);
437             }
438             is.close();
439         }
440     }
441 
442     /**
443      * Verifies the detection of a specific MIME based on content, filename and metadata MIME type.
444      *
445      * @param expectedMimeType
446      * @param contentTypeHeader
447      * 
448      * @throws IOException
449      */
450     private void detectMIMETypeByMimeTypeHint(String expectedMimeType, String contentTypeHeader) throws IOException {
451         String detectedMimeType = detector.guessMIMEType(null, null, MIMEType.parse(contentTypeHeader)).toString();
452         Assert.assertEquals(expectedMimeType, detectedMimeType);
453     }
454 
455     /**
456      * Verifies the detection of a specific MIME based on content and filename.
457      *
458      * @param expectedMimeType
459      * @param testDir
460      * 
461      * @throws IOException
462      */
463     private void detectMIMETypeByContentAndName(String expectedMimeType, Collection<String> manifest)
464             throws IOException {
465         String detectedMimeType;
466         for (String test : manifest) {
467             InputStream is = new BufferedInputStream(this.getClass().getResourceAsStream(test));
468             detectedMimeType = detector.guessMIMEType(test, is, null).toString();
469             if (test.contains("error")) {
470                 Assert.assertNotSame(expectedMimeType, detectedMimeType);
471             } else {
472                 Assert.assertEquals(
473                         String.format(java.util.Locale.ROOT, "Error while detecting mimetype in file %s", test),
474                         expectedMimeType, detectedMimeType);
475             }
476             is.close();
477         }
478     }
479 
480 }