View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.cli;
19  
20  import com.beust.jcommander.IStringConverter;
21  import com.beust.jcommander.Parameter;
22  import com.beust.jcommander.ParameterException;
23  import com.beust.jcommander.Parameters;
24  import com.beust.jcommander.converters.FileConverter;
25  import org.apache.any23.Any23;
26  import org.apache.any23.configuration.Configuration;
27  import org.apache.any23.configuration.DefaultConfiguration;
28  import org.apache.any23.extractor.ExtractionParameters;
29  import org.apache.any23.extractor.ExtractionParameters.ValidationMode;
30  import org.apache.any23.filter.IgnoreAccidentalRDFa;
31  import org.apache.any23.filter.IgnoreTitlesOfEmptyDocuments;
32  import org.apache.any23.source.DocumentSource;
33  import org.apache.any23.writer.BenchmarkTripleHandler;
34  import org.apache.any23.writer.LoggingTripleHandler;
35  import org.apache.any23.writer.ReportingTripleHandler;
36  import org.apache.any23.writer.TripleHandler;
37  import org.apache.any23.writer.TripleHandlerException;
38  import org.apache.any23.writer.WriterFactoryRegistry;
39  import org.kohsuke.MetaInfServices;
40  import org.slf4j.Logger;
41  import org.slf4j.LoggerFactory;
42  
43  import java.io.File;
44  import java.io.FileNotFoundException;
45  import java.io.PrintStream;
46  import java.io.PrintWriter;
47  import java.net.MalformedURLException;
48  import java.net.URL;
49  import java.util.LinkedList;
50  import java.util.List;
51  
52  import static java.lang.String.format;
53  
54  /**
55   * A default rover implementation. Goes and fetches a URL using an hint
56   * as to what format should require, then tries to convert it to RDF.
57   *
58   * @author Michele Mostarda (mostarda@fbk.eu)
59   * @author Richard Cyganiak (richard@cyganiak.de)
60   * @author Gabriele Renzi
61   */
62  @MetaInfServices
63  @Parameters(commandNames = { "rover" }, commandDescription = "Any23 Command Line Tool.")
64  public class Rover implements Tool {
65  
66      private static final List<String> FORMATS = WriterFactoryRegistry.getInstance().getIdentifiers();
67  
68      private static final int DEFAULT_FORMAT_INDEX = 0;
69  
70      private static final Logger logger = LoggerFactory.getLogger(Rover.class);
71  
72      @Parameter(
73         names = { "-o", "--output" },
74         description = "Specify Output file (defaults to standard output)",
75         converter = PrintStreamConverter.class
76      )
77      private PrintStream outputStream = System.out;
78  
79      @Parameter(description = "input URIs {<url>|<file>}+", converter = ArgumentToURIConverter.class)
80      protected List<String> inputURIs = new LinkedList<String>();
81  
82      @Parameter(names = { "-e", "--extractors" }, description = "a comma-separated list of extractors, e.g. rdf-xml,rdf-turtle")
83      private List<String> extractors = new LinkedList<String>();
84  
85      @Parameter(names = { "-f", "--format" }, description = "the output format")
86      private String format = FORMATS.get(DEFAULT_FORMAT_INDEX);
87  
88      @Parameter(
89         names = { "-l", "--log" },
90         description = "Produce log within a file.",
91         converter = FileConverter.class
92      )
93      private File logFile = null;
94  
95      @Parameter(names = { "-s", "--stats" }, description = "Print out extraction statistics.")
96      private boolean statistics;
97  
98      @Parameter(names = { "-t", "--notrivial" }, description = "Filter trivial statements (e.g. CSS related ones).")
99      private boolean noTrivial;
100 
101     @Parameter(names = { "-p", "--pedantic" }, description = "Validate and fixes HTML content detecting commons issues.")
102     private boolean pedantic;
103 
104     @Parameter(names = { "-n", "--nesting" }, description = "Disable production of nesting triples.")
105     private boolean nestingDisabled;
106 
107     @Parameter(names = { "-d", "--defaultns" }, description = "Override the default namespace used to produce statements.")
108     private String defaultns;
109 
110     // non parameters
111 
112     private TripleHandler tripleHandler;
113 
114     private ReportingTripleHandler reportingTripleHandler;
115 
116     private BenchmarkTripleHandler benchmarkTripleHandler;
117 
118     private Any23 any23;
119 
120     private ExtractionParameters extractionParameters;
121 
122     protected void configure() {
123         try {
124             tripleHandler = WriterFactoryRegistry.getInstance().getWriterInstanceByIdentifier(format, outputStream);
125         } catch (Exception e) {
126             throw new NullPointerException(
127                     format("Invalid output format '%s', admitted values: %s",
128                         format,
129                         FORMATS
130                     )
131             );
132         }
133 
134         if (logFile != null) {
135             try {
136                 tripleHandler = new LoggingTripleHandler(tripleHandler, new PrintWriter(logFile));
137             } catch (FileNotFoundException fnfe) {
138                 throw new IllegalArgumentException( format("Can not write to log file [%s]", logFile), fnfe );
139             }
140         }
141 
142         if (statistics) {
143             benchmarkTripleHandler = new BenchmarkTripleHandler(tripleHandler);
144             tripleHandler = benchmarkTripleHandler;
145         }
146 
147         if (noTrivial) {
148             tripleHandler = new IgnoreAccidentalRDFa(new IgnoreTitlesOfEmptyDocuments(tripleHandler),
149                                                      true    // suppress stylesheet triples.
150                                                      );
151         }
152 
153         reportingTripleHandler = new ReportingTripleHandler(tripleHandler);
154 
155         final Configuration configuration = DefaultConfiguration.singleton();
156         extractionParameters =
157                 pedantic
158                         ?
159                 new ExtractionParameters(configuration, ValidationMode.ValidateAndFix, nestingDisabled)
160                         :
161                 new ExtractionParameters(configuration, ValidationMode.None          , nestingDisabled);
162         if (defaultns != null) {
163             extractionParameters.setProperty(ExtractionParameters.EXTRACTION_CONTEXT_URI_PROPERTY,
164                                              defaultns);
165         }
166 
167         any23 = (extractors.isEmpty()) ? new Any23()
168                                                    : new Any23(extractors.toArray(new String[extractors.size()]));
169         any23.setHTTPUserAgent(Any23.DEFAULT_HTTP_CLIENT_USER_AGENT + "/" + Any23.VERSION);
170     }
171 
172     protected String printReports() {
173         final StringBuilder sb = new StringBuilder();
174         if (benchmarkTripleHandler != null) sb.append( benchmarkTripleHandler.report() ).append('\n');
175         if (reportingTripleHandler != null) sb.append( reportingTripleHandler.printReport() ).append('\n');
176         return sb.toString();
177     }
178 
179     protected void performExtraction(DocumentSource documentSource) throws Exception {
180         if (!any23.extract(extractionParameters, documentSource, tripleHandler).hasMatchingExtractors()) {
181             throw new IllegalStateException(format("No suitable extractors found for source %s", documentSource));
182         }
183     }
184 
185     protected void close() {
186         if (tripleHandler != null) {
187             try {
188                 tripleHandler.close();
189             } catch (TripleHandlerException the) {
190                 throw new RuntimeException("Error while closing TripleHandler", the);
191             }
192         }
193 
194         if (outputStream != null && outputStream != System.out) { // TODO: low - find better solution to avoid closing system out.
195             outputStream.close();
196         }
197     }
198 
199     public void run() throws Exception {
200         if (inputURIs.isEmpty()) {
201             throw new IllegalArgumentException("Expected at least 1 argument.");
202         }
203 
204         configure();
205 
206         // perform conversions
207 
208         try {
209             final long start = System.currentTimeMillis();
210             for (String inputURI : inputURIs) {
211                 DocumentSource source = any23.createDocumentSource(inputURI);
212 
213                 performExtraction( source );
214             }
215             final long elapsed = System.currentTimeMillis() - start;
216 
217             if (benchmarkTripleHandler != null) {
218                 System.err.println(benchmarkTripleHandler.report());
219             }
220 
221             logger.info("Extractors used: " + reportingTripleHandler.getExtractorNames());
222             logger.info(reportingTripleHandler.getTotalTriples() + " triples, " + elapsed + "ms");
223         } finally {
224             close();
225         }
226     }
227 
228     public static final class ArgumentToURIConverter implements IStringConverter<String> {
229 
230         @Override
231         public String convert(String uri) {
232             uri = uri.trim();
233             if (uri.toLowerCase().startsWith("http:") || uri.toLowerCase().startsWith("https:")) {
234                 try {
235                     return new URL(uri).toString();
236                 } catch (MalformedURLException murle) {
237                     throw new ParameterException(format("Invalid URI: '%s': %s", uri, murle.getMessage()));
238                 }
239             }
240 
241             final File f = new File(uri);
242             if (!f.exists()) {
243                 throw new ParameterException(format("No such file: [%s]", f.getAbsolutePath()));
244             }
245             if (f.isDirectory()) {
246                 throw new ParameterException(format("Found a directory: [%s]", f.getAbsolutePath()));
247             }
248             return f.toURI().toString();
249         }
250 
251     }
252 
253     public static final class PrintStreamConverter implements IStringConverter<PrintStream> {
254 
255         @Override
256         public PrintStream convert( String value ) {
257             final File file = new File(value);
258             try {
259                 return new PrintStream(file);
260             } catch (FileNotFoundException fnfe) {
261                 throw new ParameterException(format("Cannot open file '%s': %s", file, fnfe.getMessage()));
262             }
263         }
264 
265     }
266 
267 }