View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23;
19  
20  import org.apache.any23.configuration.Configuration;
21  import org.apache.any23.configuration.DefaultConfiguration;
22  import org.apache.any23.extractor.ExtractionException;
23  import org.apache.any23.extractor.ExtractionParameters;
24  import org.apache.any23.extractor.ExtractorFactory;
25  import org.apache.any23.extractor.ExtractorGroup;
26  import org.apache.any23.extractor.ExtractorRegistryImpl;
27  import org.apache.any23.extractor.SingleDocumentExtraction;
28  import org.apache.any23.extractor.SingleDocumentExtractionReport;
29  import org.apache.any23.http.AcceptHeaderBuilder;
30  import org.apache.any23.http.DefaultHTTPClient;
31  import org.apache.any23.http.DefaultHTTPClientConfiguration;
32  import org.apache.any23.http.HTTPClient;
33  import org.apache.any23.mime.MIMEType;
34  import org.apache.any23.mime.MIMETypeDetector;
35  import org.apache.any23.mime.TikaMIMETypeDetector;
36  import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
37  import org.apache.any23.source.DocumentSource;
38  import org.apache.any23.source.FileDocumentSource;
39  import org.apache.any23.source.HTTPDocumentSource;
40  import org.apache.any23.source.LocalCopyFactory;
41  import org.apache.any23.source.MemCopyFactory;
42  import org.apache.any23.source.StringDocumentSource;
43  import org.apache.any23.writer.TripleHandler;
44  import org.slf4j.Logger;
45  import org.slf4j.LoggerFactory;
46  
47  import java.io.File;
48  import java.io.IOException;
49  import java.net.URI;
50  import java.net.URISyntaxException;
51  import java.util.ArrayList;
52  import java.util.Arrays;
53  import java.util.Collection;
54  import java.util.Locale;
55  
56  /**
57   * A facade with convenience methods for typical <i>Any23</i> extraction operations.
58   *
59   * @author Richard Cyganiak (richard@cyganiak.de)
60   * @author Michele Mostarda (michele.mostarda@gmail.com)
61   */
62  public class Any23 {
63  
64      /**
65       * Any23 core library version. NOTE: there's also a version string in pom.xml, they should match.
66       */
67      public static final String VERSION = DefaultConfiguration.singleton().getPropertyOrFail("any23.core.version");
68  
69      /**
70       * Default HTTP User Agent defined in default configuration.
71       */
72      public static final String DEFAULT_HTTP_CLIENT_USER_AGENT = DefaultConfiguration.singleton()
73              .getPropertyOrFail("any23.http.user.agent.default");
74  
75      protected static final Logger logger = LoggerFactory.getLogger(Any23.class);
76  
77      private final Configuration configuration;
78      private final String defaultUserAgent;
79  
80      private MIMETypeDetector mimeTypeDetector = new TikaMIMETypeDetector(new WhiteSpacesPurifier());
81  
82      private HTTPClient httpClient = new DefaultHTTPClient();
83  
84      private boolean httpClientInitialized = false;
85  
86      private final ExtractorGroup factories;
87      private LocalCopyFactory streamCache;
88      private String userAgent;
89  
90      /**
91       * Constructor that allows the specification of a custom configuration and of a list of extractors.
92       *
93       * @param configuration
94       *            configuration used to build the <i>Any23</i> instance.
95       * @param extractorGroup
96       *            the group of extractors to be applied.
97       */
98      public Any23(Configuration configuration, ExtractorGroup extractorGroup) {
99          if (configuration == null)
100             throw new NullPointerException("configuration must be not null.");
101         this.configuration = configuration;
102         if (logger.isDebugEnabled()) {
103             logger.debug(configuration.getConfigurationDump());
104         }
105 
106         this.defaultUserAgent = configuration.getPropertyOrFail("any23.http.user.agent.default");
107 
108         this.factories = (extractorGroup == null) ? ExtractorRegistryImpl.getInstance().getExtractorGroup()
109                 : extractorGroup;
110         setCacheFactory(new MemCopyFactory());
111     }
112 
113     /**
114      * Constructor that allows the specification of a list of extractors.
115      *
116      * @param extractorGroup
117      *            the group of extractors to be applied.
118      */
119     public Any23(ExtractorGroup extractorGroup) {
120         this(DefaultConfiguration.singleton(), extractorGroup);
121     }
122 
123     /**
124      * Constructor that allows the specification of a custom configuration and of list of extractor names.
125      *
126      * @param configuration
127      *            a {@link Configuration} object
128      * @param extractorNames
129      *            list of extractor's names.
130      */
131     public Any23(Configuration configuration, String... extractorNames) {
132         this(configuration, extractorNames == null ? null
133                 : ExtractorRegistryImpl.getInstance().getExtractorGroup(Arrays.asList(extractorNames)));
134     }
135 
136     /**
137      * Constructor that allows the specification of a list of extractor names.
138      *
139      * @param extractorNames
140      *            list of extractor's names.
141      */
142     public Any23(String... extractorNames) {
143         this(DefaultConfiguration.singleton(), extractorNames);
144     }
145 
146     /**
147      * Constructor accepting {@link Configuration}.
148      * 
149      * @param configuration
150      *            a {@link Configuration} object
151      */
152     public Any23(Configuration configuration) {
153         this(configuration, (String[]) null);
154     }
155 
156     /**
157      * Constructor with default configuration.
158      */
159     public Any23() {
160         this(DefaultConfiguration.singleton());
161     }
162 
163     /**
164      * Sets the <i>HTTP Header User Agent</i>, see <i>RFC 2616-14.43</i>.
165      *
166      * @param userAgent
167      *            text describing the user agent.
168      */
169     public void setHTTPUserAgent(String userAgent) {
170         if (httpClientInitialized) {
171             throw new IllegalStateException("Cannot change HTTP configuration after client has been initialized");
172         }
173         if (userAgent == null) {
174             userAgent = defaultUserAgent;
175         }
176         if (userAgent.trim().length() == 0) {
177             throw new IllegalArgumentException(String.format(Locale.ROOT, "Invalid user agent: '%s'", userAgent));
178         }
179         this.userAgent = userAgent;
180     }
181 
182     /**
183      * Returns the <i>HTTP Header User Agent</i>, see <i>RFC 2616-14.43</i>.
184      *
185      * @return text describing the user agent.
186      */
187     public String getHTTPUserAgent() {
188         return this.userAgent;
189     }
190 
191     /**
192      * Allows to set the {@link org.apache.any23.http.HTTPClient} implementation used to retrieve contents. The default
193      * instance is {@link org.apache.any23.http.DefaultHTTPClient}.
194      *
195      * @param httpClient
196      *            a valid client instance.
197      * 
198      * @throws IllegalStateException
199      *             if invoked after client has been initialized.
200      */
201     public void setHTTPClient(HTTPClient httpClient) {
202         if (httpClient == null) {
203             throw new NullPointerException("httpClient cannot be null.");
204         }
205         if (httpClientInitialized) {
206             throw new IllegalStateException("Cannot change HTTP configuration after client has been initialized");
207         }
208         this.httpClient = httpClient;
209     }
210 
211     /**
212      * Returns the current {@link org.apache.any23.http.HTTPClient} implementation.
213      *
214      * @return instance of HTTPClient.
215      * 
216      * @throws IOException
217      *             if the HTTP client has not initialized.
218      */
219     public HTTPClient getHTTPClient() throws IOException {
220         if (!httpClientInitialized) {
221             if (userAgent == null) {
222                 throw new IOException("Must call " + Any23.class.getSimpleName()
223                         + ".setHTTPUserAgent(String) before extracting from HTTP IRI");
224             }
225             httpClient.init(new DefaultHTTPClientConfiguration(this.getAcceptHeader()));
226             httpClientInitialized = true;
227         }
228         return httpClient;
229     }
230 
231     /**
232      * Allows to set a {@link org.apache.any23.source.LocalCopyFactory} instance.
233      *
234      * @param cache
235      *            valid cache instance.
236      */
237     public void setCacheFactory(LocalCopyFactory cache) {
238         if (cache == null) {
239             throw new NullPointerException("cache cannot be null.");
240         }
241         this.streamCache = cache;
242     }
243 
244     /**
245      * Allows to set an instance of {@link org.apache.any23.mime.MIMETypeDetector}.
246      *
247      * @param detector
248      *            a valid detector instance, if <code>null</code> all the detectors will be used.
249      */
250     public void setMIMETypeDetector(MIMETypeDetector detector) {
251         this.mimeTypeDetector = detector;
252     }
253 
254     /**
255      * <p>
256      * Returns the most appropriate {@link DocumentSource} for the given<code>documentIRI</code>.
257      * </p>
258      * <p>
259      * <b>N.B.</b> <code>documentIRI's</code> <i>should</i> contain a protocol. E.g. <b>http:</b>, <b>https:</b>,
260      * <b>file:</b>
261      * </p>
262      *
263      * @param documentIRI
264      *            the document <i>IRI</i>.
265      * 
266      * @return a new instance of DocumentSource.
267      * 
268      * @throws URISyntaxException
269      *             if an error occurs while parsing the <code>documentIRI</code> as a <i>IRI</i>.
270      * @throws IOException
271      *             if an error occurs while initializing the internal {@link org.apache.any23.http.HTTPClient}.
272      */
273     public DocumentSource createDocumentSource(String documentIRI) throws URISyntaxException, IOException {
274         if (documentIRI == null)
275             throw new NullPointerException("documentIRI cannot be null.");
276         if (documentIRI.toLowerCase(Locale.ROOT).startsWith("file:")) {
277             return new FileDocumentSource(new File(new URI(documentIRI)));
278         }
279         if (documentIRI.toLowerCase(Locale.ROOT).startsWith("http:")
280                 || documentIRI.toLowerCase(Locale.ROOT).startsWith("https:")) {
281             return new HTTPDocumentSource(getHTTPClient(), documentIRI);
282         }
283         throw new IllegalArgumentException(String.format(Locale.ROOT,
284                 "Unsupported protocol for document IRI: '%s' . " + "Check that document IRI contains a protocol.",
285                 documentIRI));
286     }
287 
288     /**
289      * Performs metadata extraction from the content of the given <code>in</code> document source, sending the generated
290      * events to the specified <code>outputHandler</code>.
291      *
292      * @param eps
293      *            the extraction parameters to be applied.
294      * @param in
295      *            the input document source.
296      * @param outputHandler
297      *            handler responsible for collecting of the extracted metadata.
298      * @param encoding
299      *            explicit encoding see <a href="http://www.iana.org/assignments/character-sets">available
300      *            encodings</a>.
301      * 
302      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
303      * 
304      * @throws IOException
305      *             if there is an error reading the {@link org.apache.any23.source.DocumentSource}
306      * @throws org.apache.any23.extractor.ExtractionException
307      *             if there is an error during extraction
308      */
309     public ExtractionReport extract(ExtractionParameters eps, DocumentSource in, TripleHandler outputHandler,
310             String encoding) throws IOException, ExtractionException {
311         final SingleDocumentExtractionaction.html#SingleDocumentExtraction">SingleDocumentExtraction ex = new SingleDocumentExtraction(configuration, in, factories, outputHandler);
312         ex.setMIMETypeDetector(mimeTypeDetector);
313         ex.setLocalCopyFactory(streamCache);
314         ex.setParserEncoding(encoding);
315         final SingleDocumentExtractionReport sder = ex.run(eps);
316         return new ExtractionReport(ex.getMatchingExtractors(), ex.getParserEncoding(), ex.getDetectedMIMEType(),
317                 sder.getValidationReport(), sder.getExtractorToIssues());
318     }
319 
320     /**
321      * Performs metadata extraction on the <code>in</code> string associated to the <code>documentIRI</code> IRI,
322      * declaring <code>contentType</code> and <code>encoding</code>. The generated events are sent to the specified
323      * <code>outputHandler</code>.
324      *
325      * @param in
326      *            raw data to be analyzed.
327      * @param documentIRI
328      *            IRI from which the raw data has been extracted.
329      * @param contentType
330      *            declared data content type.
331      * @param encoding
332      *            declared data encoding.
333      * @param outputHandler
334      *            handler responsible for collecting of the extracted metadata.
335      * 
336      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
337      * 
338      * @throws IOException
339      *             if there is an error reading the {@link org.apache.any23.source.DocumentSource}
340      * @throws org.apache.any23.extractor.ExtractionException
341      *             if there is an error during extraction
342      */
343     public ExtractionReport extract(String in, String documentIRI, String contentType, String encoding,
344             TripleHandler outputHandler) throws IOException, ExtractionException {
345         return extract(new StringDocumentSource(in, documentIRI, contentType, encoding), outputHandler);
346     }
347 
348     /**
349      * Performs metadata extraction on the <code>in</code> string associated to the <code>documentIRI</code> IRI,
350      * sending the generated events to the specified <code>outputHandler</code>.
351      *
352      * @param in
353      *            raw data to be analyzed.
354      * @param documentIRI
355      *            IRI from which the raw data has been extracted.
356      * @param outputHandler
357      *            handler responsible for collecting of the extracted metadata.
358      * 
359      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
360      * 
361      * @throws IOException
362      *             if there is an error reading the {@link org.apache.any23.source.DocumentSource}
363      * @throws org.apache.any23.extractor.ExtractionException
364      *             if there is an error during extraction
365      */
366     public ExtractionReport extract(String in, String documentIRI, TripleHandler outputHandler)
367             throws IOException, ExtractionException {
368         return extract(new StringDocumentSource(in, documentIRI), outputHandler);
369     }
370 
371     /**
372      * Performs metadata extraction from the content of the given <code>file</code> sending the generated events to the
373      * specified <code>outputHandler</code>.
374      *
375      * @param file
376      *            file containing raw data.
377      * @param outputHandler
378      *            handler responsible for collecting of the extracted metadata.
379      * 
380      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
381      * 
382      * @throws IOException
383      *             if there is an error reading the {@link org.apache.any23.source.DocumentSource}
384      * @throws org.apache.any23.extractor.ExtractionException
385      *             if there is an error during extraction
386      */
387     public ExtractionReport extract(File file, TripleHandler outputHandler) throws IOException, ExtractionException {
388         return extract(new FileDocumentSource(file), outputHandler);
389     }
390 
391     /**
392      * Performs metadata extraction from the content of the given <code>documentIRI</code> sending the generated events
393      * to the specified <code>outputHandler</code>. If the <i>IRI</i> is replied with a redirect, the last will be
394      * followed.
395      *
396      * @param eps
397      *            the parameters to be applied to the extraction.
398      * @param documentIRI
399      *            the IRI from which retrieve document.
400      * @param outputHandler
401      *            handler responsible for collecting of the extracted metadata.
402      * 
403      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
404      * 
405      * @throws IOException
406      *             if there is an error reading the {@link org.apache.any23.source.DocumentSource}
407      * @throws org.apache.any23.extractor.ExtractionException
408      *             if there is an error during extraction
409      */
410     public ExtractionReport extract(ExtractionParameters eps, String documentIRI, TripleHandler outputHandler)
411             throws IOException, ExtractionException {
412         try {
413             return extract(eps, createDocumentSource(documentIRI), outputHandler);
414         } catch (URISyntaxException ex) {
415             throw new ExtractionException("Error while extracting data from document IRI.", ex);
416         }
417     }
418 
419     /**
420      * Performs metadata extraction from the content of the given <code>documentIRI</code> sending the generated events
421      * to the specified <code>outputHandler</code>. If the <i>IRI</i> is replied with a redirect, the last will be
422      * followed.
423      *
424      * @param documentIRI
425      *            the IRI from which retrieve document.
426      * @param outputHandler
427      *            handler responsible for collecting of the extracted metadata.
428      * 
429      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
430      * 
431      * @throws IOException
432      *             if there is an error reading the {@link org.apache.any23.source.DocumentSource}
433      * @throws org.apache.any23.extractor.ExtractionException
434      *             if there is an error during extraction
435      */
436     public ExtractionReport extract(String documentIRI, TripleHandler outputHandler)
437             throws IOException, ExtractionException {
438         return extract((ExtractionParameters) null, documentIRI, outputHandler);
439     }
440 
441     /**
442      * Performs metadata extraction from the content of the given <code>in</code> document source, sending the generated
443      * events to the specified <code>outputHandler</code>.
444      *
445      * @param in
446      *            the input document source.
447      * @param outputHandler
448      *            handler responsible for collecting of the extracted metadata.
449      * @param encoding
450      *            explicit encoding see <a href="http://www.iana.org/assignments/character-sets">available
451      *            encodings</a>.
452      * 
453      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
454      * 
455      * @throws IOException
456      *             if there is an error reading the {@link org.apache.any23.source.DocumentSource}
457      * @throws org.apache.any23.extractor.ExtractionException
458      *             if there is an error during extraction
459      */
460     public ExtractionReport extract(DocumentSource in, TripleHandler outputHandler, String encoding)
461             throws IOException, ExtractionException {
462         return extract(null, in, outputHandler, encoding);
463     }
464 
465     /**
466      * Performs metadata extraction from the content of the given <code>in</code> document source, sending the generated
467      * events to the specified <code>outputHandler</code>.
468      *
469      * @param in
470      *            the input document source.
471      * @param outputHandler
472      *            handler responsible for collecting of the extracted metadata.
473      * 
474      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
475      * 
476      * @throws IOException
477      *             if there is an error reading the {@link org.apache.any23.source.DocumentSource}
478      * @throws org.apache.any23.extractor.ExtractionException
479      *             if there is an error during extraction
480      */
481     public ExtractionReport extract(DocumentSource in, TripleHandler outputHandler)
482             throws IOException, ExtractionException {
483         return extract(null, in, outputHandler, null);
484     }
485 
486     /**
487      * Performs metadata extraction from the content of the given <code>in</code> document source, sending the generated
488      * events to the specified <code>outputHandler</code>.
489      *
490      * @param eps
491      *            the parameters to be applied for the extraction phase.
492      * @param in
493      *            the input document source.
494      * @param outputHandler
495      *            handler responsible for collecting of the extracted metadata.
496      * 
497      * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
498      * 
499      * @throws IOException
500      *             if there is an error reading the {@link org.apache.any23.source.DocumentSource}
501      * @throws org.apache.any23.extractor.ExtractionException
502      *             if there is an error during extraction
503      */
504     public ExtractionReport extract(ExtractionParameters eps, DocumentSource in, TripleHandler outputHandler)
505             throws IOException, ExtractionException {
506         return extract(eps, in, outputHandler, null);
507     }
508 
509     private String getAcceptHeader() {
510         Collection<MIMEType> mimeTypes = new ArrayList<>();
511         for (ExtractorFactory<?> factory : factories) {
512             mimeTypes.addAll(factory.getSupportedMIMETypes());
513         }
514         return new AcceptHeaderBuilder(mimeTypes).getAcceptHeader();
515     }
516 
517 }