/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17  
18  package org.apache.any23.cli;
19  
import com.beust.jcommander.IStringConverter;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.ParameterException;
import com.beust.jcommander.Parameters;
import com.beust.jcommander.converters.FileConverter;
import org.apache.any23.Any23;
import org.apache.any23.configuration.Configuration;
import org.apache.any23.configuration.DefaultConfiguration;
import org.apache.any23.configuration.Setting;
import org.apache.any23.configuration.Settings;
import org.apache.any23.extractor.ExtractionParameters;
import org.apache.any23.extractor.ExtractionParameters.ValidationMode;
import org.apache.any23.filter.IgnoreAccidentalRDFa;
import org.apache.any23.filter.IgnoreTitlesOfEmptyDocuments;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.writer.BenchmarkTripleHandler;
import org.apache.any23.writer.DecoratingWriterFactory;
import org.apache.any23.writer.LoggingTripleHandler;
import org.apache.any23.writer.NTriplesWriterFactory;
import org.apache.any23.writer.ReportingTripleHandler;
import org.apache.any23.writer.TripleHandler;
import org.apache.any23.writer.TripleHandlerException;
import org.apache.any23.writer.TripleWriterFactory;
import org.apache.any23.writer.WriterFactoryRegistry;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.Locale;
import java.util.Objects;

import static java.lang.String.format;
61  
62  /**
63   * A default rover implementation. Goes and fetches a URL using an hint
64   * as to what format should require, then tries to convert it to RDF.
65   *
66   * @author Michele Mostarda (mostarda@fbk.eu)
67   * @author Richard Cyganiak (richard@cyganiak.de)
68   * @author Gabriele Renzi
69   * @author Hans Brende (hansbrende@apache.org)
70   */
71  @Parameters(commandNames = { "rover" }, commandDescription = "Any23 Command Line Tool.")
72  public class Rover extends BaseTool {
73  
74      private static final Logger logger = LoggerFactory.getLogger(Rover.class);
75  
76      private static final WriterFactoryRegistry registry = WriterFactoryRegistry.getInstance();
77      private static final String DEFAULT_WRITER_IDENTIFIER = NTriplesWriterFactory.IDENTIFIER;
78  
79      static {
80          final Setting<Boolean> ALWAYS_SUPPRESS_CSS_TRIPLES = Setting.create(
81                  "alwayssuppresscsstriples", Boolean.TRUE);
82          final Settings supportedSettings = Settings.of(ALWAYS_SUPPRESS_CSS_TRIPLES);
83  
84          registry.register(new DecoratingWriterFactory() {
85  
86              @Override
87              public TripleHandlerrg/apache/any23/writer/TripleHandler.html#TripleHandler">TripleHandler getTripleWriter(TripleHandler delegate, Settings settings) {
88                  boolean always = settings.get(ALWAYS_SUPPRESS_CSS_TRIPLES);
89                  return new IgnoreAccidentalRDFa(new IgnoreTitlesOfEmptyDocuments(delegate), always);
90              }
91  
92              @Override
93              public Settings getSupportedSettings() {
94                  return supportedSettings;
95              }
96  
97              @Override
98              public String getIdentifier() {
99                  return "notrivial";
100             }
101         });
102     }
103 
104 
105     @Parameter(
106        names = { "-o", "--output" },
107        description = "Specify Output file (defaults to standard output)",
108        converter = PrintStreamConverter.class
109     )
110     private PrintStream outputStream = System.out;
111 
112     @Parameter(description = "input IRIs {<url>|<file>}+", converter = ArgumentToIRIConverter.class)
113     protected List<String> inputIRIs = new LinkedList<>();
114 
115     @Parameter(names = { "-e", "--extractors" }, description = "a comma-separated list of extractors, e.g. rdf-xml,rdf-turtle")
116     private List<String> extractors = new LinkedList<>();
117 
118     @Parameter(names = { "-f", "--format" }, description = "a comma-separated list of writer factories, e.g. notrivial,nquads")
119     private List<String> formats = new LinkedList<String>() {{
120         add(DEFAULT_WRITER_IDENTIFIER);
121     }};
122 
123     @Parameter(
124        names = { "-l", "--log" },
125        description = "Produce log within a file.",
126        converter = FileConverter.class
127     )
128     private File logFile = null;
129 
130     @Parameter(names = { "-s", "--stats" }, description = "Print out extraction statistics.")
131     private boolean statistics;
132 
133     @Parameter(names = { "-t", "--notrivial" }, description = "Filter trivial statements (e.g. CSS related ones). [DEPRECATED: As of version 2.3, use --format instead.]")
134     private boolean noTrivial;
135 
136     @Parameter(names = { "-p", "--pedantic" }, description = "Validate and fixes HTML content detecting commons issues.")
137     private boolean pedantic;
138 
139     @Parameter(names = { "-n", "--nesting" }, description = "Disable production of nesting triples.")
140     private boolean nestingDisabled;
141 
142     @Parameter(names = { "-d", "--defaultns" }, description = "Override the default namespace used to produce statements.")
143     private String defaultns;
144 
145     // non parameters
146 
147     private TripleHandler tripleHandler;
148 
149     private ReportingTripleHandler reportingTripleHandler;
150 
151     private BenchmarkTripleHandler benchmarkTripleHandler;
152 
153     private Any23 any23;
154 
155     private ExtractionParameters extractionParameters;
156 
157     @Override
158     PrintStream getOut() {
159         return outputStream;
160     }
161 
162     @Override
163     void setOut(PrintStream out) {
164         outputStream = out;
165     }
166 
167     private static TripleHandler getWriter(String id, OutputStream os) {
168         TripleWriterFactory/../org/apache/any23/writer/TripleWriterFactory.html#TripleWriterFactory">TripleWriterFactory f = (TripleWriterFactory)registry.getWriterByIdentifier(id);
169         Objects.requireNonNull(f, () -> "Invalid writer id '" + id + "'; admitted values: " + registry.getIdentifiers());
170         return f.getTripleWriter(os, Settings.of()); //TODO parse TripleWriter settings from format list
171     }
172 
173     private static TripleHandlerache/any23/writer/TripleHandler.html#TripleHandler">TripleHandler getWriter(String id, TripleHandler delegate) {
174         DecoratingWriterFactoryorg/apache/any23/writer/DecoratingWriterFactory.html#DecoratingWriterFactory">DecoratingWriterFactory f = (DecoratingWriterFactory)registry.getWriterByIdentifier(id);
175         Objects.requireNonNull(f, () -> "Invalid writer id '" + id + "'; admitted values: " + registry.getIdentifiers());
176         return f.getTripleWriter(delegate, Settings.of()); //TODO parse delegate settings from format list
177     }
178 
179     protected void configure() {
180         List<String> formats = this.formats;
181         if (formats.isEmpty()) {
182             formats = Collections.singletonList(DEFAULT_WRITER_IDENTIFIER);
183         }
184         ListIterator<String> l = formats.listIterator(formats.size());
185         tripleHandler = getWriter(l.previous(), outputStream);
186 
187         while (l.hasPrevious()) {
188             tripleHandler = getWriter(l.previous(), tripleHandler);
189         }
190 
191         if (logFile != null) {
192             try {
193                 tripleHandler = new LoggingTripleHandler(tripleHandler, new PrintWriter(logFile));
194             } catch (FileNotFoundException fnfe) {
195                 throw new IllegalArgumentException( format("Can not write to log file [%s]", logFile), fnfe );
196             }
197         }
198 
199         if (statistics) {
200             benchmarkTripleHandler = new BenchmarkTripleHandler(tripleHandler);
201             tripleHandler = benchmarkTripleHandler;
202         }
203 
204         if (noTrivial) {
205             tripleHandler = new IgnoreAccidentalRDFa(new IgnoreTitlesOfEmptyDocuments(tripleHandler),
206                                                      true    // suppress stylesheet triples.
207                                                      );
208         }
209 
210         reportingTripleHandler = new ReportingTripleHandler(tripleHandler);
211 
212         final Configuration configuration = DefaultConfiguration.singleton();
213         extractionParameters =
214                 pedantic
215                         ?
216                 new ExtractionParameters(configuration, ValidationMode.VALIDATE_AND_FIX, nestingDisabled)
217                         :
218                 new ExtractionParameters(configuration, ValidationMode.NONE          , nestingDisabled);
219         if (defaultns != null) {
220             extractionParameters.setProperty(ExtractionParameters.EXTRACTION_CONTEXT_IRI_PROPERTY,
221                                              defaultns);
222         }
223 
224         any23 = (extractors.isEmpty()) ? new Any23()
225                                                    : new Any23(extractors.toArray(new String[extractors.size()]));
226         any23.setHTTPUserAgent(Any23.DEFAULT_HTTP_CLIENT_USER_AGENT + "/" + Any23.VERSION);
227     }
228 
229     protected String printReports() {
230         final StringBuilder sb = new StringBuilder();
231         if (benchmarkTripleHandler != null)
232             sb.append( benchmarkTripleHandler.report() ).append('\n');
233         if (reportingTripleHandler != null)
234             sb.append( reportingTripleHandler.printReport() ).append('\n');
235         return sb.toString();
236     }
237 
238     protected void performExtraction(DocumentSource documentSource) throws Exception {
239         if (!any23.extract(extractionParameters, documentSource, reportingTripleHandler).hasMatchingExtractors()) {
240             throw new IllegalStateException(format("No suitable extractors found for source %s", documentSource.getDocumentIRI()));
241         }
242     }
243 
244     protected void close() {
245         if (tripleHandler != null) {
246             try {
247                 tripleHandler.close();
248             } catch (TripleHandlerException the) {
249                 throw new RuntimeException("Error while closing TripleHandler", the);
250             }
251         }
252 
253         if (outputStream != null && outputStream != System.out) { // TODO: low - find better solution to avoid closing system out.
254             outputStream.close();
255         }
256     }
257 
258     public void run() throws Exception {
259         if (inputIRIs.isEmpty()) {
260             throw new IllegalArgumentException("Expected at least 1 argument.");
261         }
262 
263         configure();
264 
265         // perform conversions
266 
267         try {
268             final long start = System.currentTimeMillis();
269             for (String inputIRI : inputIRIs) {
270                 DocumentSource source = any23.createDocumentSource(inputIRI);
271 
272                 performExtraction( source );
273             }
274             final long elapsed = System.currentTimeMillis() - start;
275 
276             if (benchmarkTripleHandler != null) {
277                 System.err.println(benchmarkTripleHandler.report());
278             }
279 
280             logger.info("Extractors used: " + reportingTripleHandler.getExtractorNames());
281             logger.info(reportingTripleHandler.getTotalTriples() + " triples, " + elapsed + "ms");
282         } finally {
283             close();
284         }
285     }
286 
287     public static final class ArgumentToIRIConverter implements IStringConverter<String> {
288 
289         @Override
290         public String convert(String uri) {
291             uri = uri.trim();
292             if (uri.toLowerCase().startsWith("http:") || uri.toLowerCase().startsWith("https:")) {
293                 try {
294                     return new URL(uri).toString();
295                 } catch (MalformedURLException murle) {
296                     throw new ParameterException(format("Invalid IRI: '%s': %s", uri, murle.getMessage()));
297                 }
298             }
299 
300             final File f = new File(uri);
301             if (!f.exists()) {
302                 throw new ParameterException(format("No such file: [%s]", f.getAbsolutePath()));
303             }
304             if (f.isDirectory()) {
305                 throw new ParameterException(format("Found a directory: [%s]", f.getAbsolutePath()));
306             }
307             return f.toURI().toString();
308         }
309 
310     }
311 
312     public static final class PrintStreamConverter implements IStringConverter<PrintStream> {
313 
314         @Override
315         public PrintStream convert( String value ) {
316             final File file = new File(value);
317             try {
318                 return new PrintStream(file);
319             } catch (FileNotFoundException fnfe) {
320                 throw new ParameterException(format("Cannot open file '%s': %s", file, fnfe.getMessage()));
321             }
322         }
323 
324     }
325 
326 }