1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.cli;
19
20 import com.beust.jcommander.IStringConverter;
21 import com.beust.jcommander.Parameter;
22 import com.beust.jcommander.ParameterException;
23 import com.beust.jcommander.Parameters;
24 import com.beust.jcommander.converters.FileConverter;
25 import org.apache.any23.Any23;
26 import org.apache.any23.configuration.Configuration;
27 import org.apache.any23.configuration.DefaultConfiguration;
28 import org.apache.any23.extractor.ExtractionParameters;
29 import org.apache.any23.extractor.ExtractionParameters.ValidationMode;
30 import org.apache.any23.filter.IgnoreAccidentalRDFa;
31 import org.apache.any23.filter.IgnoreTitlesOfEmptyDocuments;
32 import org.apache.any23.source.DocumentSource;
33 import org.apache.any23.writer.BenchmarkTripleHandler;
34 import org.apache.any23.writer.LoggingTripleHandler;
35 import org.apache.any23.writer.ReportingTripleHandler;
36 import org.apache.any23.writer.TripleHandler;
37 import org.apache.any23.writer.TripleHandlerException;
38 import org.apache.any23.writer.WriterFactoryRegistry;
39 import org.kohsuke.MetaInfServices;
40 import org.slf4j.Logger;
41 import org.slf4j.LoggerFactory;
42
43 import java.io.File;
44 import java.io.FileNotFoundException;
45 import java.io.PrintStream;
46 import java.io.PrintWriter;
47 import java.net.MalformedURLException;
48 import java.net.URL;
49 import java.util.LinkedList;
50 import java.util.List;
51
52 import static java.lang.String.format;
53
54
55
56
57
58
59
60
61
62 @MetaInfServices
63 @Parameters(commandNames = { "rover" }, commandDescription = "Any23 Command Line Tool.")
64 public class Rover implements Tool {
65
66 private static final List<String> FORMATS = WriterFactoryRegistry.getInstance().getIdentifiers();
67
68 private static final int DEFAULT_FORMAT_INDEX = 0;
69
70 private static final Logger logger = LoggerFactory.getLogger(Rover.class);
71
72 @Parameter(
73 names = { "-o", "--output" },
74 description = "Specify Output file (defaults to standard output)",
75 converter = PrintStreamConverter.class
76 )
77 private PrintStream outputStream = System.out;
78
79 @Parameter(description = "input URIs {<url>|<file>}+", converter = ArgumentToURIConverter.class)
80 protected List<String> inputURIs = new LinkedList<String>();
81
82 @Parameter(names = { "-e", "--extractors" }, description = "a comma-separated list of extractors, e.g. rdf-xml,rdf-turtle")
83 private List<String> extractors = new LinkedList<String>();
84
85 @Parameter(names = { "-f", "--format" }, description = "the output format")
86 private String format = FORMATS.get(DEFAULT_FORMAT_INDEX);
87
88 @Parameter(
89 names = { "-l", "--log" },
90 description = "Produce log within a file.",
91 converter = FileConverter.class
92 )
93 private File logFile = null;
94
95 @Parameter(names = { "-s", "--stats" }, description = "Print out extraction statistics.")
96 private boolean statistics;
97
98 @Parameter(names = { "-t", "--notrivial" }, description = "Filter trivial statements (e.g. CSS related ones).")
99 private boolean noTrivial;
100
101 @Parameter(names = { "-p", "--pedantic" }, description = "Validate and fixes HTML content detecting commons issues.")
102 private boolean pedantic;
103
104 @Parameter(names = { "-n", "--nesting" }, description = "Disable production of nesting triples.")
105 private boolean nestingDisabled;
106
107 @Parameter(names = { "-d", "--defaultns" }, description = "Override the default namespace used to produce statements.")
108 private String defaultns;
109
110
111
112 private TripleHandler tripleHandler;
113
114 private ReportingTripleHandler reportingTripleHandler;
115
116 private BenchmarkTripleHandler benchmarkTripleHandler;
117
118 private Any23 any23;
119
120 private ExtractionParameters extractionParameters;
121
122 protected void configure() {
123 try {
124 tripleHandler = WriterFactoryRegistry.getInstance().getWriterInstanceByIdentifier(format, outputStream);
125 } catch (Exception e) {
126 throw new NullPointerException(
127 format("Invalid output format '%s', admitted values: %s",
128 format,
129 FORMATS
130 )
131 );
132 }
133
134 if (logFile != null) {
135 try {
136 tripleHandler = new LoggingTripleHandler(tripleHandler, new PrintWriter(logFile));
137 } catch (FileNotFoundException fnfe) {
138 throw new IllegalArgumentException( format("Can not write to log file [%s]", logFile), fnfe );
139 }
140 }
141
142 if (statistics) {
143 benchmarkTripleHandler = new BenchmarkTripleHandler(tripleHandler);
144 tripleHandler = benchmarkTripleHandler;
145 }
146
147 if (noTrivial) {
148 tripleHandler = new IgnoreAccidentalRDFa(new IgnoreTitlesOfEmptyDocuments(tripleHandler),
149 true
150 );
151 }
152
153 reportingTripleHandler = new ReportingTripleHandler(tripleHandler);
154
155 final Configuration configuration = DefaultConfiguration.singleton();
156 extractionParameters =
157 pedantic
158 ?
159 new ExtractionParameters(configuration, ValidationMode.ValidateAndFix, nestingDisabled)
160 :
161 new ExtractionParameters(configuration, ValidationMode.None , nestingDisabled);
162 if (defaultns != null) {
163 extractionParameters.setProperty(ExtractionParameters.EXTRACTION_CONTEXT_URI_PROPERTY,
164 defaultns);
165 }
166
167 any23 = (extractors.isEmpty()) ? new Any23()
168 : new Any23(extractors.toArray(new String[extractors.size()]));
169 any23.setHTTPUserAgent(Any23.DEFAULT_HTTP_CLIENT_USER_AGENT + "/" + Any23.VERSION);
170 }
171
172 protected String printReports() {
173 final StringBuilder sb = new StringBuilder();
174 if (benchmarkTripleHandler != null) sb.append( benchmarkTripleHandler.report() ).append('\n');
175 if (reportingTripleHandler != null) sb.append( reportingTripleHandler.printReport() ).append('\n');
176 return sb.toString();
177 }
178
179 protected void performExtraction(DocumentSource documentSource) throws Exception {
180 if (!any23.extract(extractionParameters, documentSource, tripleHandler).hasMatchingExtractors()) {
181 throw new IllegalStateException(format("No suitable extractors found for source %s", documentSource));
182 }
183 }
184
185 protected void close() {
186 if (tripleHandler != null) {
187 try {
188 tripleHandler.close();
189 } catch (TripleHandlerException the) {
190 throw new RuntimeException("Error while closing TripleHandler", the);
191 }
192 }
193
194 if (outputStream != null && outputStream != System.out) {
195 outputStream.close();
196 }
197 }
198
199 public void run() throws Exception {
200 if (inputURIs.isEmpty()) {
201 throw new IllegalArgumentException("Expected at least 1 argument.");
202 }
203
204 configure();
205
206
207
208 try {
209 final long start = System.currentTimeMillis();
210 for (String inputURI : inputURIs) {
211 DocumentSource source = any23.createDocumentSource(inputURI);
212
213 performExtraction( source );
214 }
215 final long elapsed = System.currentTimeMillis() - start;
216
217 if (benchmarkTripleHandler != null) {
218 System.err.println(benchmarkTripleHandler.report());
219 }
220
221 logger.info("Extractors used: " + reportingTripleHandler.getExtractorNames());
222 logger.info(reportingTripleHandler.getTotalTriples() + " triples, " + elapsed + "ms");
223 } finally {
224 close();
225 }
226 }
227
228 public static final class ArgumentToURIConverter implements IStringConverter<String> {
229
230 @Override
231 public String convert(String uri) {
232 uri = uri.trim();
233 if (uri.toLowerCase().startsWith("http:") || uri.toLowerCase().startsWith("https:")) {
234 try {
235 return new URL(uri).toString();
236 } catch (MalformedURLException murle) {
237 throw new ParameterException(format("Invalid URI: '%s': %s", uri, murle.getMessage()));
238 }
239 }
240
241 final File f = new File(uri);
242 if (!f.exists()) {
243 throw new ParameterException(format("No such file: [%s]", f.getAbsolutePath()));
244 }
245 if (f.isDirectory()) {
246 throw new ParameterException(format("Found a directory: [%s]", f.getAbsolutePath()));
247 }
248 return f.toURI().toString();
249 }
250
251 }
252
253 public static final class PrintStreamConverter implements IStringConverter<PrintStream> {
254
255 @Override
256 public PrintStream convert( String value ) {
257 final File file = new File(value);
258 try {
259 return new PrintStream(file);
260 } catch (FileNotFoundException fnfe) {
261 throw new ParameterException(format("Cannot open file '%s': %s", file, fnfe.getMessage()));
262 }
263 }
264
265 }
266
267 }