View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.cli;
19  
20  import com.beust.jcommander.IStringConverter;
21  import com.beust.jcommander.Parameter;
22  import com.beust.jcommander.ParameterException;
23  import com.beust.jcommander.Parameters;
24  import org.apache.any23.extractor.html.TagSoupParser;
25  import org.apache.any23.http.DefaultHTTPClient;
26  import org.apache.any23.source.DocumentSource;
27  import org.apache.any23.source.FileDocumentSource;
28  import org.apache.any23.source.HTTPDocumentSource;
29  import org.apache.any23.util.StreamUtils;
30  
31  import java.io.File;
32  import java.io.InputStream;
33  import java.io.PrintStream;
34  import java.net.URISyntaxException;
35  import java.util.LinkedList;
36  import java.util.List;
37  import java.util.regex.Matcher;
38  import java.util.regex.Pattern;
39  
40  /**
41   * Command line <i>Microdata</i> parser, accepting both files and URLs and
42   * returing a <i>JSON</i> representation of the extracted metadata as described at
43   * <a href="http://www.w3.org/TR/microdata/#json">Microdata JSON Specification</a>.
44   *
45   * @author Michele Mostarda (mostarda@fbk.eu)
46   */
47  @Parameters( commandNames = { "microdata" },  commandDescription = "Commandline Tool for extracting Microdata from file/HTTP source.")
48  public class MicrodataParser extends BaseTool {
49  
50      private static final Pattern HTTP_DOCUMENT_PATTERN = Pattern.compile("^https?://.*");
51  
52      private static final Pattern FILE_DOCUMENT_PATTERN = Pattern.compile("^file:(.*)$");
53  
54      @Parameter(
55         arity = 1,
56         description = "Input document URL, {http://path/to/resource.html|file:/path/to/localFile.html}",
57         converter = MicrodataParserDocumentSourceConverter.class
58      )
59      private List<DocumentSource> document = new LinkedList<DocumentSource>();
60  
61      private PrintStream out = System.out;
62  
63      @Override
64      PrintStream getOut() {
65          return out;
66      }
67  
68      @Override
69      void setOut(PrintStream out) {
70          this.out = out;
71      }
72  
73      public void run() throws Exception {
74          if (document.isEmpty()) {
75              throw new IllegalArgumentException("No input document URL specified");
76          }
77          InputStream documentInputInputStream = null;
78          try {
79              final DocumentSource documentSource = document.get(0);
80              documentInputInputStream = documentSource.openInputStream();
81              final TagSoupParserser.html#TagSoupParser">TagSoupParser tagSoupParser = new TagSoupParser(
82                      documentInputInputStream,
83                      documentSource.getDocumentIRI()
84              );
85              org.apache.any23.extractor.microdata.MicrodataParser.getMicrodataAsJSON(tagSoupParser.getDOM(), out);
86          } finally {
87              if (documentInputInputStream != null) StreamUtils.closeGracefully(documentInputInputStream);
88          }
89      }
90  
91      public static final class MicrodataParserDocumentSourceConverter implements IStringConverter<DocumentSource> {
92  
93          @Override
94          public DocumentSource convert( String value ) {
95              final Matcher httpMatcher = HTTP_DOCUMENT_PATTERN.matcher(value);
96              if (httpMatcher.find()) {
97                  try {
98                      return new HTTPDocumentSource(DefaultHTTPClient.createInitializedHTTPClient(), value);
99                  } catch ( URISyntaxException e ) {
100                     throw new ParameterException("Invalid source IRI: '" + value + "'");
101                 }
102             }
103             final Matcher fileMatcher = FILE_DOCUMENT_PATTERN.matcher(value);
104             if (fileMatcher.find()) {
105                 return new FileDocumentSource( new File( fileMatcher.group(1) ) );
106             }
107             throw new ParameterException("Invalid source protocol: '" + value + "'");
108         }
109 
110     }
111 
112 }