View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.mime;
19  
import org.apache.any23.extractor.csv.CSVReaderBuilder;
import org.apache.any23.mime.purifier.Purifier;
import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
import org.eclipse.rdf4j.rio.RDFFormat;
import org.eclipse.rdf4j.rio.RDFParser;
import org.eclipse.rdf4j.rio.Rio;
import org.eclipse.rdf4j.rio.helpers.BasicParserSettings;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.regex.Pattern;
40  
41  /**
42   * Implementation of {@link MIMETypeDetector} based on
43   * <a href="http://tika.apache.org/">Apache Tika</a>.
44   *
45   * @author Michele Mostarda (michele.mostarda@gmail.com)
46   * @author Davide Palmisano (dpalmisano@gmail.com)
47   */
48  public class TikaMIMETypeDetector implements MIMETypeDetector {
49  
50      private Purifier purifier;
51  
52      public static final String CSV_MIMETYPE = "text/csv";
53  
54      public static final String RESOURCE_NAME = "/org/apache/any23/mime/tika-config.xml";
55  
56      /**
57       * N3 patterns.
58       */
59      private static final Pattern[] N3_PATTERNS = {
60              Pattern.compile("^\\S+\\s*<\\S+>\\s*<\\S+>\\s*\\."             ), // * IRI IRI .
61              Pattern.compile("^\\S+\\s*<\\S+>\\s*_:\\S+\\s*\\."             ), // * IRI BNODE .
62              Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(@\\S+)?\\s*\\."     ), // * IRI LLITERAL .
63              Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(\\^\\^\\S+)?\\s*\\.")  // * IRI TLITERAL .
64      };
65  
66      /**
67       * N-Quads patterns.
68       */
69      private static final Pattern[] NQUADS_PATTERNS = {
70              Pattern.compile("^\\S+\\s*<\\S+>\\s*<\\S+>\\s*\\<\\S+>\\s*\\."             ), // * IRI IRI      IRI .
71              Pattern.compile("^\\S+\\s*<\\S+>\\s*_:\\S+\\s*\\<\\S+>\\s*\\."             ), // * IRI BNODE    IRI .
72              Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(@\\S+)?\\s*\\<\\S+>\\s*\\."     ), // * IRI LLITERAL IRI .
73              Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(\\^\\^\\S+)?\\s*\\<\\S+>\\s*\\.")  // * IRI TLITERAL IRI .
74      };
75  
76      private static volatile TikaConfig config;
77  
78      private static volatile Tika tika;
79  
80      private static volatile MimeTypes types;
81  
82      /**
83       * Checks if the stream contains the <i>N3</i> triple patterns.
84       *
85       * @param is input stream to be verified.
86       * @return <code>true</code> if <i>N3</i> patterns are detected, <code>false</code> otherwise.
87       * @throws IOException if there is an error checking the {@link java.io.InputStream}
88       */
89      public static boolean checkN3Format(InputStream is) throws IOException {
90          return findPattern(N3_PATTERNS, '.', is);
91      }
92  
93      /**
94       * Checks if the stream contains the <i>NQuads</i> patterns.
95       *
96       * @param is input stream to be verified.
97       * @return <code>true</code> if <i>N3</i> patterns are detected, <code>false</code> otherwise.
98       * @throws IOException if there is an error checking the {@link java.io.InputStream}
99       */
100     public static boolean checkNQuadsFormat(InputStream is) throws IOException {
101         return findPattern(NQUADS_PATTERNS, '.', is);
102     }
103 
104     /**
105      * Checks if the stream contains <i>Turtle</i> triple patterns.
106      *
107      * @param is input stream to be verified.
108      * @return <code>true</code> if <i>Turtle</i> patterns are detected, <code>false</code> otherwise.
109      * @throws IOException if there is an error checking the {@link java.io.InputStream}
110      */
111     public static boolean checkTurtleFormat(InputStream is) throws IOException {
112         String sample = extractDataSample(is, '.');
113         RDFParser turtleParser = Rio.createParser(RDFFormat.TURTLE);
114         turtleParser.set(BasicParserSettings.VERIFY_DATATYPE_VALUES, true);
115         ByteArrayInputStream bais = new ByteArrayInputStream(sample.getBytes());
116         try {
117             turtleParser.parse(bais, "");
118             return true;
119         } catch (Exception e) {
120             return false;
121         }
122     }
123 
124     /**
125      * Checks if the stream contains a valid <i>CSV</i> pattern.
126      *
127      * @param is input stream to be verified.
128      * @return <code>true</code> if <i>CSV</i> patterns are detected, <code>false</code> otherwise.
129      * @throws IOException if there is an error checking the {@link java.io.InputStream}
130      */
131     public static boolean checkCSVFormat(InputStream is) throws IOException {
132         return CSVReaderBuilder.isCSV(is);
133     }
134 
135     /**
136      * Tries to apply one of the given patterns on a sample of the input stream.
137      *
138      * @param patterns the patterns to apply.
139      * @param delimiterChar the delimiter of the sample.
140      * @param is the input stream to sample.
141      * @return <code>true</code> if a pattern has been applied, <code>false</code> otherwise.
142      * @throws IOException if there is an error finding the pattern within
143      * the {@link java.io.InputStream}
144      */
145     private static boolean findPattern(Pattern[] patterns, char delimiterChar, InputStream is)
146     throws IOException {
147         String sample = extractDataSample(is, delimiterChar);
148         for (Pattern pattern : patterns) {
149             if (pattern.matcher(sample).find()) {
150                 return true;
151             }
152         }
153         return false;
154     }
155 
156     /**
157      * Extracts a sample data from the input stream, from the current
158      * mark to the first <i>breakChar</i> char.
159      *
160      * @param is the input stream to sample.
161      * @param breakChar the char to break to sample.
162      * @return the sample string.
163      * @throws IOException if an error occurs during sampling.
164      */
165     private static String extractDataSample(InputStream is, char breakChar) throws IOException {
166         BufferedReader br = new BufferedReader(new InputStreamReader(is));
167         StringBuilder sb = new StringBuilder();
168         final int MAX_SIZE = 1024 * 2;
169         int c;
170         boolean insideBlock = false;
171         int read = 0;
172         br.mark(MAX_SIZE);
173         try {
174             while ((c = br.read()) != -1) {
175                 read++;
176                 if (read > MAX_SIZE) {
177                     break;
178                 }
179                 if ('<' == c) {
180                     insideBlock = true;
181                 } else if ('>' == c) {
182                     insideBlock = false;
183                 } else if ('"' == c) {
184                     insideBlock = !insideBlock;
185                 }
186                 sb.append((char) c);
187                 if (!insideBlock && breakChar == c) {
188                     break;
189                 }
190             }
191         } finally {
192             is.reset();
193             br.reset();
194         }
195         return sb.toString();
196     }
197 
198     public TikaMIMETypeDetector(Purifier purifier) {
199         this.purifier = purifier;
200         if (config == null || types == null || tika == null) {
201             synchronized (TikaMIMETypeDetector.class) {
202                 if (config == null) {
203                     InputStream is = getResourceAsStream();
204                     try {
205                         config = new TikaConfig(is);
206                     } catch (Exception e) {
207                         throw new RuntimeException("Error while loading Tika configuration.", e);
208                     }
209                 }
210                 if (types == null) {
211                     types = config.getMimeRepository();
212                 }
213                 if (tika == null) {
214                     tika = new Tika(config);
215                 }
216             }
217         }
218     }
219 
220     public TikaMIMETypeDetector() {
221         this(new WhiteSpacesPurifier());
222     }
223 
224     /**
225      * Estimates the <code>MIME</code> type of the content of input file.
226      * The <i>input</i> stream must be resettable.
227      *
228      * @param fileName name of the data source.
229      * @param input <code>null</code> or a <i>resettable</i> input stream containing data.
230      * @param mimeTypeFromMetadata mimetype declared in metadata.
231      * @return the supposed mime type or <code>null</code> if nothing appropriate found.
232      * @throws IllegalArgumentException if <i>input</i> is not <code>null</code> and is not resettable.
233      */
234     public MIMEType guessMIMEType(
235             String fileName,
236             InputStream input,
237             MIMEType mimeTypeFromMetadata
238     ) {
239         if (input != null) {
240             try {
241                 this.purifier.purify(input);
242             } catch (IOException e) {
243                 throw new RuntimeException("Error while purifying the provided input", e);
244             }
245         }
246 
247         final Metadata meta = new Metadata();
248         if (mimeTypeFromMetadata != null)
249             meta.set(Metadata.CONTENT_TYPE, mimeTypeFromMetadata.getFullType());
250         if (fileName != null)
251             meta.set(Metadata.RESOURCE_NAME_KEY, fileName);
252 
253         String type;
254         try {
255             final String mt = guessMimeTypeByInputAndMeta(input, meta);
256             if (input == null || !MimeTypes.OCTET_STREAM.equals(mt)) {
257                 type = mt;
258             } else {
259                 if (checkN3Format(input)) {
260                     type = RDFFormat.N3.getDefaultMIMEType();
261                 } else if (checkNQuadsFormat(input)) {
262                     type = RDFFormat.NQUADS.getDefaultMIMEType();
263                 } else if (checkTurtleFormat(input)) {
264                     type = RDFFormat.TURTLE.getDefaultMIMEType();
265                 } else if (checkCSVFormat(input)) {
266                     type = CSV_MIMETYPE;
267                 } else {
268                     type = MimeTypes.OCTET_STREAM; 
269                 }
270             }
271         } catch (IOException ioe) {
272             throw new RuntimeException("Error while retrieving mime type.", ioe);
273         }
274         return MIMEType.parse(type);
275     }
276 
277      /**
278       * Loads the <code>Tika</code> configuration file.
279       *
280       * @return the input stream containing the configuration.
281       */
282      private InputStream getResourceAsStream() {
283          InputStream result;
284          result = TikaMIMETypeDetector.class.getResourceAsStream(RESOURCE_NAME);
285          if (result == null) {
286              try {
287                  result = TikaMIMETypeDetector.class.getClassLoader().getResourceAsStream(RESOURCE_NAME);
288              } catch (SecurityException e) {
289                  //fall through
290              }
291              if (result == null) {
292                  result = ClassLoader.getSystemResourceAsStream(RESOURCE_NAME);
293              }
294          }
295          return result;
296      }
297 
298     /**
299      * Automatically detects the MIME type of a document based on magic
300      * markers in the stream prefix and any given metadata hints.
301      * <p/>
302      * The given stream is expected to support marks, so that this method
303      * can reset the stream to the position it was in before this method
304      * was called.
305      *
306      * @param stream   document stream
307      * @param metadata metadata hints
308      * @return MIME type of the document
309      * @throws IOException if the document stream could not be read
310      */
311     private String guessMimeTypeByInputAndMeta(InputStream stream, final Metadata metadata)
312     throws IOException {
313         if (stream != null) {
314             final String type = tika.detect(stream);
315             if (type != null && !isGenericMIMEType(type)) {
316                 return type;
317             }
318         }
319 
320         // Determines the MIMEType based on Content-Type hint if available.
321         final String contentType = metadata.get(Metadata.CONTENT_TYPE);
322         String candidateMIMEType = null;
323         if (contentType != null) {
324             try {
325                 MimeType type = types.forName(contentType);
326                 if (type != null) {
327                     candidateMIMEType = type.getName();
328                     if (!isPlainMIMEType(candidateMIMEType)) {
329                         return candidateMIMEType;
330                     }
331                 }
332             } catch (MimeTypeException mte) {
333                 // Malformed ocntent-type value, ignore.
334             }
335         }
336 
337         // Determines the MIMEType based on resource name hint if available.
338         final String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
339         if (resourceName != null) {
340             String type = tika.detect(resourceName);
341             if (type != null && !type.equals(MimeTypes.OCTET_STREAM)) {
342                 return type;
343             }
344         }
345 
346         // Finally, use the default type if no matches found
347         if (candidateMIMEType != null) {
348             return candidateMIMEType;
349         } else {
350             return MimeTypes.OCTET_STREAM;
351         }
352     }
353 
354     private boolean isPlainMIMEType(String type) {
355         return
356             type.equals(MimeTypes.OCTET_STREAM)
357                 ||
358             type.equals(MimeTypes.PLAIN_TEXT);
359     }
360 
361     private boolean isGenericMIMEType(String type) {
362         return
363             isPlainMIMEType(type)
364                 ||
365             type.equals(MimeTypes.XML);
366     }
367 
368 }
369