View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.mime;
19  
20  import org.apache.any23.extractor.csv.CSVReaderBuilder;
21  import org.apache.any23.mime.purifier.Purifier;
22  import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
23  import org.apache.tika.Tika;
24  import org.apache.tika.config.TikaConfig;
25  import org.apache.tika.metadata.Metadata;
26  import org.apache.tika.mime.MimeType;
27  import org.apache.tika.mime.MimeTypeException;
28  import org.apache.tika.mime.MimeTypes;
29  import org.eclipse.rdf4j.rio.RDFFormat;
30  import org.eclipse.rdf4j.rio.RDFParser;
31  import org.eclipse.rdf4j.rio.Rio;
32  import org.eclipse.rdf4j.rio.helpers.BasicParserSettings;
33  
34  import java.io.BufferedReader;
35  import java.io.ByteArrayInputStream;
36  import java.io.IOException;
37  import java.io.InputStream;
38  import java.io.InputStreamReader;
39  import java.nio.charset.StandardCharsets;
40  import java.util.regex.Pattern;
41  
42  /**
43   * Implementation of {@link MIMETypeDetector} based on <a href="http://tika.apache.org/">Apache Tika</a>.
44   *
45   * @author Michele Mostarda (michele.mostarda@gmail.com)
46   * @author Davide Palmisano (dpalmisano@gmail.com)
47   */
48  public class TikaMIMETypeDetector implements MIMETypeDetector {
49  
    /** Purifier applied to the input stream before any detection is attempted. */
    private Purifier purifier;

    /** MIME type reported when the content is recognised as CSV. */
    public static final String CSV_MIMETYPE = "text/csv";

    /** Classpath name of the Tika configuration file loaded by this detector. */
    public static final String RESOURCE_NAME = "/org/apache/any23/mime/tika-config.xml";

    /**
     * N3 patterns.
     *
     * Each expression is anchored at the start of the sample and describes one statement shape: any
     * non-whitespace subject token, an IRI predicate in angle brackets, an object and a closing dot. The trailing
     * comment on each entry names the object kind (IRI, blank node, language-tagged literal, typed literal).
     */
    private static final Pattern[] N3_PATTERNS = { Pattern.compile("^\\S+\\s*<\\S+>\\s*<\\S+>\\s*\\."), // * IRI IRI .
            Pattern.compile("^\\S+\\s*<\\S+>\\s*_:\\S+\\s*\\."), // * IRI BNODE .
            Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(@\\S+)?\\s*\\."), // * IRI LLITERAL .
            Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(\\^\\^\\S+)?\\s*\\.") // * IRI TLITERAL .
    };

    /**
     * N-Quads patterns.
     *
     * Same statement shapes as the N3 patterns, with one additional IRI in angle brackets between the object and
     * the closing dot.
     */
    private static final Pattern[] NQUADS_PATTERNS = { Pattern.compile("^\\S+\\s*<\\S+>\\s*<\\S+>\\s*\\<\\S+>\\s*\\."), // * IRI IRI IRI .
            Pattern.compile("^\\S+\\s*<\\S+>\\s*_:\\S+\\s*\\<\\S+>\\s*\\."), // * IRI BNODE IRI .
            Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(@\\S+)?\\s*\\<\\S+>\\s*\\."), // * IRI LLITERAL IRI .
            Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(\\^\\^\\S+)?\\s*\\<\\S+>\\s*\\.") // * IRI TLITERAL IRI .
    };

    /** Tika configuration shared by all instances; loaded from {@link #RESOURCE_NAME} by the constructor when still unset. */
    private static volatile TikaConfig config;

    /** Tika facade shared by all instances; created from {@link #config} by the constructor when still unset. */
    private static volatile Tika tika;

    /** MIME type repository shared by all instances; taken from {@link #config} by the constructor when still unset. */
    private static volatile MimeTypes types;
83  
84      /**
85       * Checks if the stream contains the <i>N3</i> triple patterns.
86       *
87       * @param is
88       *            input stream to be verified.
89       * 
90       * @return <code>true</code> if <i>N3</i> patterns are detected, <code>false</code> otherwise.
91       * 
92       * @throws IOException
93       *             if there is an error checking the {@link java.io.InputStream}
94       */
95      public static boolean checkN3Format(InputStream is) throws IOException {
96          return findPattern(N3_PATTERNS, '.', is);
97      }
98  
99      /**
100      * Checks if the stream contains the <i>NQuads</i> patterns.
101      *
102      * @param is
103      *            input stream to be verified.
104      * 
105      * @return <code>true</code> if <i>N3</i> patterns are detected, <code>false</code> otherwise.
106      * 
107      * @throws IOException
108      *             if there is an error checking the {@link java.io.InputStream}
109      */
110     public static boolean checkNQuadsFormat(InputStream is) throws IOException {
111         return findPattern(NQUADS_PATTERNS, '.', is);
112     }
113 
114     /**
115      * Checks if the stream contains <i>Turtle</i> triple patterns.
116      *
117      * @param is
118      *            input stream to be verified.
119      * 
120      * @return <code>true</code> if <i>Turtle</i> patterns are detected, <code>false</code> otherwise.
121      * 
122      * @throws IOException
123      *             if there is an error checking the {@link java.io.InputStream}
124      */
125     public static boolean checkTurtleFormat(InputStream is) throws IOException {
126         String sample = extractDataSample(is, '.');
127         RDFParser turtleParser = Rio.createParser(RDFFormat.TURTLE);
128         turtleParser.set(BasicParserSettings.VERIFY_DATATYPE_VALUES, true);
129         ByteArrayInputStream bais = new ByteArrayInputStream(sample.getBytes(StandardCharsets.UTF_8));
130         try {
131             turtleParser.parse(bais, "");
132             return true;
133         } catch (Exception e) {
134             return false;
135         }
136     }
137 
138     /**
139      * Checks if the stream contains a valid <i>CSV</i> pattern.
140      *
141      * @param is
142      *            input stream to be verified.
143      * 
144      * @return <code>true</code> if <i>CSV</i> patterns are detected, <code>false</code> otherwise.
145      * 
146      * @throws IOException
147      *             if there is an error checking the {@link java.io.InputStream}
148      */
149     public static boolean checkCSVFormat(InputStream is) throws IOException {
150         return CSVReaderBuilder.isCSV(is);
151     }
152 
153     /**
154      * Tries to apply one of the given patterns on a sample of the input stream.
155      *
156      * @param patterns
157      *            the patterns to apply.
158      * @param delimiterChar
159      *            the delimiter of the sample.
160      * @param is
161      *            the input stream to sample.
162      * 
163      * @return <code>true</code> if a pattern has been applied, <code>false</code> otherwise.
164      * 
165      * @throws IOException
166      *             if there is an error finding the pattern within the {@link java.io.InputStream}
167      */
168     private static boolean findPattern(Pattern[] patterns, char delimiterChar, InputStream is) throws IOException {
169         String sample = extractDataSample(is, delimiterChar);
170         for (Pattern pattern : patterns) {
171             if (pattern.matcher(sample).find()) {
172                 return true;
173             }
174         }
175         return false;
176     }
177 
178     /**
179      * Extracts a sample data from the input stream, from the current mark to the first <i>breakChar</i> char.
180      *
181      * @param is
182      *            the input stream to sample.
183      * @param breakChar
184      *            the char to break to sample.
185      * 
186      * @return the sample string.
187      * 
188      * @throws IOException
189      *             if an error occurs during sampling.
190      */
191     private static String extractDataSample(InputStream is, char breakChar) throws IOException {
192         BufferedReader br = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
193         StringBuilder sb = new StringBuilder();
194         final int MAX_SIZE = 1024 * 2;
195         int c;
196         boolean insideBlock = false;
197         int read = 0;
198         br.mark(MAX_SIZE);
199         try {
200             while ((c = br.read()) != -1) {
201                 read++;
202                 if (read > MAX_SIZE) {
203                     break;
204                 }
205                 if ('<' == c) {
206                     insideBlock = true;
207                 } else if ('>' == c) {
208                     insideBlock = false;
209                 } else if ('"' == c) {
210                     insideBlock = !insideBlock;
211                 }
212                 sb.append((char) c);
213                 if (!insideBlock && breakChar == c) {
214                     break;
215                 }
216             }
217         } finally {
218             is.reset();
219             br.reset();
220         }
221         return sb.toString();
222     }
223 
224     public TikaMIMETypeDetector(Purifier purifier) {
225         this.purifier = purifier;
226         if (config == null || types == null || tika == null) {
227             synchronized (TikaMIMETypeDetector.class) {
228                 if (config == null) {
229                     InputStream is = getResourceAsStream();
230                     try {
231                         config = new TikaConfig(is);
232                     } catch (Exception e) {
233                         throw new RuntimeException("Error while loading Tika configuration.", e);
234                     }
235                 }
236                 if (types == null) {
237                     types = config.getMimeRepository();
238                 }
239                 if (tika == null) {
240                     tika = new Tika(config);
241                 }
242             }
243         }
244     }
245 
246     public TikaMIMETypeDetector() {
247         this(new WhiteSpacesPurifier());
248     }
249 
250     /**
251      * Estimates the <code>MIME</code> type of the content of input file. The <i>input</i> stream must be resettable.
252      *
253      * @param fileName
254      *            name of the data source.
255      * @param input
256      *            <code>null</code> or a <i>resettable</i> input stream containing data.
257      * @param mimeTypeFromMetadata
258      *            mimetype declared in metadata.
259      * 
260      * @return the supposed mime type or <code>null</code> if nothing appropriate found.
261      * 
262      * @throws IllegalArgumentException
263      *             if <i>input</i> is not <code>null</code> and is not resettable.
264      */
265     public MIMEType.html#MIMEType">MIMEType guessMIMEType(String fileName, InputStream input, MIMEType mimeTypeFromMetadata) {
266         if (input != null) {
267             try {
268                 this.purifier.purify(input);
269             } catch (IOException e) {
270                 throw new RuntimeException("Error while purifying the provided input", e);
271             }
272         }
273 
274         final Metadata meta = new Metadata();
275         if (mimeTypeFromMetadata != null)
276             meta.set(Metadata.CONTENT_TYPE, mimeTypeFromMetadata.getFullType());
277         if (fileName != null)
278             meta.set(Metadata.RESOURCE_NAME_KEY, fileName);
279 
280         String type;
281         try {
282             final String mt = guessMimeTypeByInputAndMeta(input, meta);
283             if (input == null || !MimeTypes.OCTET_STREAM.equals(mt)) {
284                 type = mt;
285             } else {
286                 if (checkN3Format(input)) {
287                     type = RDFFormat.N3.getDefaultMIMEType();
288                 } else if (checkNQuadsFormat(input)) {
289                     type = RDFFormat.NQUADS.getDefaultMIMEType();
290                 } else if (checkTurtleFormat(input)) {
291                     type = RDFFormat.TURTLE.getDefaultMIMEType();
292                 } else if (checkCSVFormat(input)) {
293                     type = CSV_MIMETYPE;
294                 } else {
295                     type = MimeTypes.OCTET_STREAM;
296                 }
297             }
298         } catch (IOException ioe) {
299             throw new RuntimeException("Error while retrieving mime type.", ioe);
300         }
301         return MIMEType.parse(type);
302     }
303 
304     /**
305      * Loads the <code>Tika</code> configuration file.
306      *
307      * @return the input stream containing the configuration.
308      */
309     private InputStream getResourceAsStream() {
310         InputStream result;
311         result = TikaMIMETypeDetector.class.getResourceAsStream(RESOURCE_NAME);
312         if (result == null) {
313             try {
314                 result = TikaMIMETypeDetector.class.getClassLoader().getResourceAsStream(RESOURCE_NAME);
315             } catch (SecurityException e) {
316                 // fall through
317             }
318             if (result == null) {
319                 result = ClassLoader.getSystemResourceAsStream(RESOURCE_NAME);
320             }
321         }
322         return result;
323     }
324 
325     /**
326      * Automatically detects the MIME type of a document based on magic markers in the stream prefix and any given
327      * metadata hints.
328      * <p/>
329      * The given stream is expected to support marks, so that this method can reset the stream to the position it was in
330      * before this method was called.
331      *
332      * @param stream
333      *            document stream
334      * @param metadata
335      *            metadata hints
336      * 
337      * @return MIME type of the document
338      * 
339      * @throws IOException
340      *             if the document stream could not be read
341      */
342     private String guessMimeTypeByInputAndMeta(InputStream stream, final Metadata metadata) throws IOException {
343         if (stream != null) {
344             final String type = tika.detect(stream);
345             if (type != null && !isGenericMIMEType(type)) {
346                 return type;
347             }
348         }
349 
350         // Determines the MIMEType based on Content-Type hint if available.
351         final String contentType = metadata.get(Metadata.CONTENT_TYPE);
352         String candidateMIMEType = null;
353         if (contentType != null) {
354             try {
355                 MimeType type = types.forName(contentType);
356                 if (type != null) {
357                     candidateMIMEType = type.getName();
358                     if (!isPlainMIMEType(candidateMIMEType)) {
359                         return candidateMIMEType;
360                     }
361                 }
362             } catch (MimeTypeException mte) {
363                 // Malformed ocntent-type value, ignore.
364             }
365         }
366 
367         // Determines the MIMEType based on resource name hint if available.
368         final String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
369         if (resourceName != null) {
370             String type = tika.detect(resourceName);
371             if (type != null && !type.equals(MimeTypes.OCTET_STREAM)) {
372                 return type;
373             }
374         }
375 
376         // Finally, use the default type if no matches found
377         if (candidateMIMEType != null) {
378             return candidateMIMEType;
379         } else {
380             return MimeTypes.OCTET_STREAM;
381         }
382     }
383 
384     private boolean isPlainMIMEType(String type) {
385         return type.equals(MimeTypes.OCTET_STREAM) || type.equals(MimeTypes.PLAIN_TEXT);
386     }
387 
388     private boolean isGenericMIMEType(String type) {
389         return isPlainMIMEType(type) || type.equals(MimeTypes.XML);
390     }
391 
392 }