/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17  
18  package org.apache.any23;
19  
20  import org.apache.any23.configuration.Configuration;
21  import org.apache.any23.configuration.DefaultConfiguration;
22  import org.apache.any23.extractor.ExtractionException;
23  import org.apache.any23.extractor.ExtractionParameters;
24  import org.apache.any23.extractor.ExtractorFactory;
25  import org.apache.any23.extractor.ExtractorGroup;
26  import org.apache.any23.extractor.ExtractorRegistryImpl;
27  import org.apache.any23.extractor.SingleDocumentExtraction;
28  import org.apache.any23.extractor.SingleDocumentExtractionReport;
29  import org.apache.any23.http.AcceptHeaderBuilder;
30  import org.apache.any23.http.DefaultHTTPClient;
31  import org.apache.any23.http.DefaultHTTPClientConfiguration;
32  import org.apache.any23.http.HTTPClient;
33  import org.apache.any23.mime.MIMEType;
34  import org.apache.any23.mime.MIMETypeDetector;
35  import org.apache.any23.mime.TikaMIMETypeDetector;
36  import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
37  import org.apache.any23.source.DocumentSource;
38  import org.apache.any23.source.FileDocumentSource;
39  import org.apache.any23.source.HTTPDocumentSource;
40  import org.apache.any23.source.LocalCopyFactory;
41  import org.apache.any23.source.MemCopyFactory;
42  import org.apache.any23.source.StringDocumentSource;
43  import org.apache.any23.writer.TripleHandler;
44  import org.slf4j.Logger;
45  import org.slf4j.LoggerFactory;
46  
47  import java.io.File;
48  import java.io.IOException;
49  import java.net.URI;
50  import java.net.URISyntaxException;
51  import java.util.ArrayList;
52  import java.util.Arrays;
53  import java.util.Collection;
54  
55  
56  /**
57   * A facade with convenience methods for typical <i>Any23</i> extraction
58   * operations.
59   *
60   * @author Richard Cyganiak (richard@cyganiak.de)
61   * @author Michele Mostarda (michele.mostarda@gmail.com)
62   */
63  public class Any23 {
64  
65      /**
66       * Any23 core library version.
67       * NOTE: there's also a version string in pom.xml, they should match.
68       */
69      public static final String VERSION = DefaultConfiguration.singleton().getPropertyOrFail("any23.core.version");
70  
71      /**
72       * Default HTTP User Agent defined in default configuration.
73       */
74      public static final String DEFAULT_HTTP_CLIENT_USER_AGENT = DefaultConfiguration.singleton().getPropertyOrFail(
75              "any23.http.user.agent.default"
76      );
77  
78      protected static final Logger logger = LoggerFactory.getLogger(Any23.class);
79  
80      private final Configuration configuration;
81      private final String        defaultUserAgent;
82  
83      private MIMETypeDetector mimeTypeDetector = new TikaMIMETypeDetector(new WhiteSpacesPurifier());
84  
85      private HTTPClient httpClient = new DefaultHTTPClient();
86  
87      private boolean httpClientInitialized = false;
88  
89      private final ExtractorGroup factories;
90      private LocalCopyFactory     streamCache;
91      private String               userAgent;
92  
93      /**
94       * Constructor that allows the specification of a
95       * custom configuration and of a list of extractors.
96       *
97       * @param configuration configuration used to build the <i>Any23</i> instance.
98       * @param extractorGroup the group of extractors to be applied.
99       */
100     public Any23(Configuration configuration, ExtractorGroup extractorGroup) {
101         if (configuration == null)
102             throw new NullPointerException("configuration must be not null.");
103         this.configuration = configuration;
104         if (logger.isDebugEnabled()) {
105             logger.debug(configuration.getConfigurationDump());
106         }
107 
108         this.defaultUserAgent = configuration.getPropertyOrFail("any23.http.user.agent.default");
109 
110         this.factories = (extractorGroup == null)
111                 ? ExtractorRegistryImpl.getInstance().getExtractorGroup()
112                 : extractorGroup;
113         setCacheFactory(new MemCopyFactory());
114     }
115 
116     /**
117      * Constructor that allows the specification of a list of extractors.
118      *
119      * @param extractorGroup the group of extractors to be applied.
120      */
121     public Any23(ExtractorGroup extractorGroup) {
122         this(DefaultConfiguration.singleton(), extractorGroup);
123     }
124 
125     /**
126      * Constructor that allows the specification of a
127      * custom configuration and of list of extractor names.
128      *
129      * @param configuration a {@link Configuration} object
130      * @param extractorNames list of extractor's names.
131      */
132     public Any23(Configuration configuration, String... extractorNames) {
133         this(configuration, extractorNames == null ? null :
134                 ExtractorRegistryImpl.getInstance().getExtractorGroup(Arrays.asList(extractorNames))
135         );
136     }
137 
138     /**
139      * Constructor that allows the specification of a list of extractor names.
140      *
141      * @param extractorNames list of extractor's names.
142      */
143     public Any23(String... extractorNames) {
144         this(DefaultConfiguration.singleton(), extractorNames);
145     }
146 
147     /**
148      * Constructor accepting {@link Configuration}.
149      * @param configuration a {@link Configuration} object
150      */
151     public Any23(Configuration configuration) {
152         this(configuration, (String[]) null);
153     }
154 
155     /**
156      * Constructor with default configuration.
157      */
158     public Any23() {
159         this(DefaultConfiguration.singleton());
160     }
161 
162     /**
163      * Sets the <i>HTTP Header User Agent</i>,
164      * see <i>RFC 2616-14.43</i>.
165      *
166      * @param userAgent text describing the user agent.
167      */
168     public void setHTTPUserAgent(String userAgent) {
169         if (httpClientInitialized) {
170             throw new IllegalStateException("Cannot change HTTP configuration after client has been initialized");
171         }
172         if (userAgent == null) {
173             userAgent = defaultUserAgent;
174         }
175         if (userAgent.trim().length() == 0) {
176             throw new IllegalArgumentException(String.format("Invalid user agent: '%s'", userAgent));
177         }
178         this.userAgent = userAgent;
179     }
180 
181     /**
182      * Returns the <i>HTTP Header User Agent</i>,
183      * see <i>RFC 2616-14.43</i>.
184      *
185      * @return text describing the user agent.
186      */
187     public String getHTTPUserAgent() {
188         return this.userAgent;
189     }
190 
191     /**
192      * Allows to set the {@link org.apache.any23.http.HTTPClient} implementation
193      * used to retrieve contents. The default instance is {@link org.apache.any23.http.DefaultHTTPClient}.
194      *
195      * @param httpClient a valid client instance.
196      * @throws IllegalStateException if invoked after client has been initialized.
197      */
198     public void setHTTPClient(HTTPClient httpClient) {
199         if (httpClient == null) {
200             throw new NullPointerException("httpClient cannot be null.");
201         }
202         if (httpClientInitialized) {
203             throw new IllegalStateException("Cannot change HTTP configuration after client has been initialized");
204         }
205         this.httpClient = httpClient;
206     }
207 
208     /**
209      * Returns the current {@link org.apache.any23.http.HTTPClient} implementation.
210      *
211      * @return instance of HTTPClient.
212      * @throws IOException if the HTTP client has not initialized.
213      */
214     public HTTPClient getHTTPClient() throws IOException {
215         if (!httpClientInitialized) {
216             if (userAgent == null) {
217                 throw new IOException("Must call " + Any23.class.getSimpleName() +
218                         ".setHTTPUserAgent(String) before extracting from HTTP IRI");
219             }
220             httpClient.init(new DefaultHTTPClientConfiguration(this.getAcceptHeader()));
221             httpClientInitialized = true;
222         }
223         return httpClient;
224     }
225 
226     /**
227      * Allows to set a {@link org.apache.any23.source.LocalCopyFactory} instance.
228      *
229      * @param cache valid cache instance.
230      */
231     public void setCacheFactory(LocalCopyFactory cache) {
232         if (cache == null) {
233             throw new NullPointerException("cache cannot be null.");
234         }
235         this.streamCache = cache;
236     }
237 
238     /**
239      * Allows to set an instance of {@link org.apache.any23.mime.MIMETypeDetector}.
240      *
241      * @param detector a valid detector instance, if <code>null</code> all the detectors
242      *        will be used.
243      */
244     public void setMIMETypeDetector(MIMETypeDetector detector) {
245         this.mimeTypeDetector = detector;
246     }
247 
248     /**
249      * <p>Returns the most appropriate {@link DocumentSource} for the given<code>documentIRI</code>.</p>
250      * <p><b>N.B.</b> <code>documentIRI's</code> <i>should</i> contain a protocol.
251      * E.g. <b>http:</b>, <b>https:</b>, <b>file:</b>
252      * </p>
253      *
254      * @param documentIRI the document <i>IRI</i>.
255      * @return a new instance of DocumentSource.
256      * @throws URISyntaxException if an error occurs while parsing the <code>documentIRI</code> as a <i>IRI</i>.
257      * @throws IOException if an error occurs while initializing the internal {@link org.apache.any23.http.HTTPClient}.
258      */
259     public DocumentSource createDocumentSource(String documentIRI) throws URISyntaxException, IOException {
260         if (documentIRI == null)
261             throw new NullPointerException("documentIRI cannot be null.");
262         if (documentIRI.toLowerCase().startsWith("file:")) {
263             return new FileDocumentSource(new File(new URI(documentIRI)));
264         }
265         if (documentIRI.toLowerCase().startsWith("http:") || documentIRI.toLowerCase().startsWith("https:")) {
266             return new HTTPDocumentSource(getHTTPClient(), documentIRI);
267         }
268         throw new IllegalArgumentException(
269                 String.format("Unsupported protocol for document IRI: '%s' . "
270                     + "Check that document IRI contains a protocol.", documentIRI)
271         );
272     }
273 
274 
275     /**
276      * Performs metadata extraction from the content of the given
277      * <code>in</code> document source, sending the generated events
278      * to the specified <code>outputHandler</code>.
279      *
280      * @param eps the extraction parameters to be applied.
281      * @param in the input document source.
282      * @param outputHandler handler responsible for collecting of the extracted metadata.
283      * @param encoding explicit encoding see
284      *        <a href="http://www.iana.org/assignments/character-sets">available encodings</a>.
285      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
286      * @throws IOException if there is an error reading the {@link org.apache.any23.source.DocumentSource}
287      * @throws org.apache.any23.extractor.ExtractionException if there is an error during extraction
288      */
289     public ExtractionReport extract(
290             ExtractionParameters eps,
291             DocumentSource in,
292             TripleHandler outputHandler,
293             String encoding
294     ) throws IOException, ExtractionException {
295         final SingleDocumentExtractionaction.html#SingleDocumentExtraction">SingleDocumentExtraction ex = new SingleDocumentExtraction(configuration, in, factories, outputHandler);
296         ex.setMIMETypeDetector(mimeTypeDetector);
297         ex.setLocalCopyFactory(streamCache);
298         ex.setParserEncoding(encoding);
299         final SingleDocumentExtractionReport sder = ex.run(eps);
300         return new ExtractionReport(
301                 ex.getMatchingExtractors(),
302                 ex.getParserEncoding(),
303                 ex.getDetectedMIMEType(),
304                 sder.getValidationReport(),
305                 sder.getExtractorToIssues()
306         );
307     }
308 
309     /**
310      * Performs metadata extraction on the <code>in</code> string
311      * associated to the <code>documentIRI</code> IRI, declaring
312      * <code>contentType</code> and <code>encoding</code>.
313      * The generated events are sent to the specified <code>outputHandler</code>.
314      *
315      * @param in raw data to be analyzed.
316      * @param documentIRI IRI from which the raw data has been extracted.
317      * @param contentType declared data content type.
318      * @param encoding declared data encoding.
319      * @param outputHandler handler responsible for collecting of the extracted metadata.
320      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
321      * @throws IOException if there is an error reading the {@link org.apache.any23.source.DocumentSource}
322      * @throws org.apache.any23.extractor.ExtractionException if there is an error during extraction
323      */
324     public ExtractionReport extract(
325             String in,
326             String documentIRI,
327             String contentType,
328             String encoding,
329             TripleHandler outputHandler
330     ) throws IOException, ExtractionException {
331         return extract(new StringDocumentSource(in, documentIRI, contentType, encoding), outputHandler);
332     }
333 
334     /**
335      * Performs metadata extraction on the <code>in</code> string
336      * associated to the <code>documentIRI</code> IRI, sending the generated
337      * events to the specified <code>outputHandler</code>.
338      *
339      * @param in raw data to be analyzed.
340      * @param documentIRI IRI from which the raw data has been extracted.
341      * @param outputHandler handler responsible for collecting of the extracted metadata.
342      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
343      * @throws IOException if there is an error reading the {@link org.apache.any23.source.DocumentSource}
344      * @throws org.apache.any23.extractor.ExtractionException if there is an error during extraction
345      */
346     public ExtractionReport extract(String in, String documentIRI, TripleHandler outputHandler)
347     throws IOException, ExtractionException {
348         return extract(new StringDocumentSource(in, documentIRI), outputHandler);
349     }
350 
351     /**
352      * Performs metadata extraction from the content of the given <code>file</code>
353      * sending the generated events to the specified <code>outputHandler</code>.
354      *
355      * @param file file containing raw data.
356      * @param outputHandler handler responsible for collecting of the extracted metadata.
357      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
358      * @throws IOException if there is an error reading the {@link org.apache.any23.source.DocumentSource}
359      * @throws org.apache.any23.extractor.ExtractionException if there is an error during extraction
360      */
361     public ExtractionReport extract(File file, TripleHandler outputHandler)
362     throws IOException, ExtractionException {
363         return extract(new FileDocumentSource(file), outputHandler);
364     }
365 
366     /**
367      * Performs metadata extraction from the content of the given <code>documentIRI</code>
368      * sending the generated events to the specified <code>outputHandler</code>.
369      * If the <i>IRI</i> is replied with a redirect, the last will be followed.
370      *
371      * @param eps the parameters to be applied to the extraction.
372      * @param documentIRI the IRI from which retrieve document.
373      * @param outputHandler handler responsible for collecting of the extracted metadata.
374      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
375      * @throws IOException if there is an error reading the {@link org.apache.any23.source.DocumentSource}
376      * @throws org.apache.any23.extractor.ExtractionException if there is an error during extraction
377      */
378     public ExtractionReport extract(ExtractionParameters eps, String documentIRI, TripleHandler outputHandler)
379     throws IOException, ExtractionException {
380         try {
381             return extract(eps, createDocumentSource(documentIRI), outputHandler);
382         } catch (URISyntaxException ex) {
383             throw new ExtractionException("Error while extracting data from document IRI.", ex);
384         }
385     }
386 
387     /**
388      * Performs metadata extraction from the content of the given <code>documentIRI</code>
389      * sending the generated events to the specified <code>outputHandler</code>.
390      * If the <i>IRI</i> is replied with a redirect, the last will be followed.
391      *
392      * @param documentIRI the IRI from which retrieve document.
393      * @param outputHandler handler responsible for collecting of the extracted metadata.
394      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
395      * @throws IOException if there is an error reading the {@link org.apache.any23.source.DocumentSource}
396      * @throws org.apache.any23.extractor.ExtractionException if there is an error during extraction
397      */
398     public ExtractionReport extract(String documentIRI, TripleHandler outputHandler)
399     throws IOException, ExtractionException {
400         return extract((ExtractionParameters) null, documentIRI, outputHandler);
401     }
402 
403     /**
404      * Performs metadata extraction from the content of the given
405      * <code>in</code> document source, sending the generated events
406      * to the specified <code>outputHandler</code>.
407      *
408      * @param in the input document source.
409      * @param outputHandler handler responsible for collecting of the extracted metadata.
410      * @param encoding explicit encoding see
411      *        <a href="http://www.iana.org/assignments/character-sets">available encodings</a>.
412      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
413      * @throws IOException if there is an error reading the {@link org.apache.any23.source.DocumentSource}
414      * @throws org.apache.any23.extractor.ExtractionException if there is an error during extraction
415      */
416     public ExtractionReport extract(DocumentSource in, TripleHandler outputHandler, String encoding)
417     throws IOException, ExtractionException {
418         return extract(null, in, outputHandler, encoding);
419     }
420 
421     /**
422      * Performs metadata extraction from the content of the given
423      * <code>in</code> document source, sending the generated events
424      * to the specified <code>outputHandler</code>.
425      *
426      * @param in the input document source.
427      * @param outputHandler handler responsible for collecting of the extracted metadata.
428      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
429      * @throws IOException if there is an error reading the {@link org.apache.any23.source.DocumentSource}
430      * @throws org.apache.any23.extractor.ExtractionException if there is an error during extraction
431      */
432     public ExtractionReport extract(DocumentSource in, TripleHandler outputHandler)
433     throws IOException, ExtractionException {
434         return extract(null, in, outputHandler, null);
435     }
436 
437     /**
438      * Performs metadata extraction from the content of the given
439      * <code>in</code> document source, sending the generated events
440      * to the specified <code>outputHandler</code>.
441      *
442      * @param eps the parameters to be applied for the extraction phase.
443      * @param in the input document source.
444      * @param outputHandler handler responsible for collecting of the extracted metadata.
445      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
446      * @throws IOException if there is an error reading the {@link org.apache.any23.source.DocumentSource}
447      * @throws org.apache.any23.extractor.ExtractionException if there is an error during extraction
448      */
449     public ExtractionReport extract(ExtractionParameters eps, DocumentSource in, TripleHandler outputHandler)
450     throws IOException, ExtractionException {
451         return extract(eps, in, outputHandler, null);
452     }
453 
454     private String getAcceptHeader() {
455         Collection<MIMEType> mimeTypes = new ArrayList<>();
456         for (ExtractorFactory<?> factory : factories) {
457             mimeTypes.addAll(factory.getSupportedMIMETypes());
458         }
459         return new AcceptHeaderBuilder(mimeTypes).getAcceptHeader();
460     }
461     
462 }