View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.cli;
19  
20  import com.beust.jcommander.IStringConverter;
21  import com.beust.jcommander.Parameter;
22  import com.beust.jcommander.Parameters;
23  import org.apache.any23.http.DefaultHTTPClient;
24  import org.apache.any23.http.DefaultHTTPClientConfiguration;
25  import org.apache.any23.http.HTTPClient;
26  import org.apache.any23.mime.MIMEType;
27  import org.apache.any23.mime.MIMETypeDetector;
28  import org.apache.any23.mime.TikaMIMETypeDetector;
29  import org.apache.any23.source.DocumentSource;
30  import org.apache.any23.source.FileDocumentSource;
31  import org.apache.any23.source.HTTPDocumentSource;
32  import org.apache.any23.source.StringDocumentSource;
33  
34  import java.io.File;
35  import java.io.PrintStream;
36  import java.net.URISyntaxException;
37  import java.util.LinkedList;
38  import java.util.List;
39  
40  /**
41   * Commandline tool to detect <b>MIME Type</b>s from
42   * file, HTTP and direct input sources.
43   * The implementation of this tool is based on {@link org.apache.any23.mime.TikaMIMETypeDetector}.
44   *
45   * @author Michele Mostarda (mostarda@fbk.eu)
46   */
47  @Parameters(commandNames = { "mimes" }, commandDescription = "MIME Type Detector Tool.")
48  public class MimeDetector extends BaseTool {
49  
50      public static final String FILE_DOCUMENT_PREFIX   = "file://";
51  
52      public static final String INLINE_DOCUMENT_PREFIX = "inline://";
53  
54      public static final String URL_DOCUMENT_RE        = "^https?://.*";
55  
56      @Parameter(
57         arity = 1,
58         description = "Input document URL, {http://path/to/resource.html|file:///path/to/local.file|inline:// some inline content}",
59         converter = MimeDetectorDocumentSourceConverter.class
60      )
61      private List<DocumentSource> document = new LinkedList<DocumentSource>();
62  
63      private PrintStream out = System.out;
64  
65      @Override
66      PrintStream getOut() {
67          return out;
68      }
69  
70      @Override
71      void setOut(PrintStream out) {
72          this.out = out;
73      }
74  
75      public void run() throws Exception {
76          if (document.isEmpty()) {
77              throw new IllegalArgumentException("No input document URL specified");
78          }
79  
80          final DocumentSource documentSource = document.get(0);
81          final MIMETypeDetector detector = new TikaMIMETypeDetector();
82          final MIMEType mimeType = detector.guessMIMEType(
83                  documentSource.getDocumentIRI(),
84                  documentSource.openInputStream(),
85                  MIMEType.parse(documentSource.getContentType())
86          );
87          out.println(mimeType);
88      }
89  
90      public static final class MimeDetectorDocumentSourceConverter implements IStringConverter<DocumentSource> {
91  
92          @Override
93          public DocumentSource convert( String document ) {
94              if (document.startsWith(FILE_DOCUMENT_PREFIX)) {
95                  return new FileDocumentSource( new File( document.substring(FILE_DOCUMENT_PREFIX.length()) ) );
96              }
97              if (document.startsWith(INLINE_DOCUMENT_PREFIX)) {
98                  return new StringDocumentSource( document.substring(INLINE_DOCUMENT_PREFIX.length()), "" );
99              }
100             if (document.matches(URL_DOCUMENT_RE)) {
101                 final HTTPClient client = new DefaultHTTPClient();
102                 client.init( DefaultHTTPClientConfiguration.singleton() );
103                 try {
104                     return new HTTPDocumentSource(client, document);
105                 } catch ( URISyntaxException e ) {
106                     throw new IllegalArgumentException("Invalid source IRI: '" + document + "'");
107                 }
108             }
109             throw new IllegalArgumentException("Unsupported protocol for document " + document);
110         }
111 
112     }
113 
114 }