View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.cli;
19  
20  import com.beust.jcommander.IStringConverter;
21  import com.beust.jcommander.Parameter;
22  import com.beust.jcommander.ParameterException;
23  import com.beust.jcommander.Parameters;
24  import com.beust.jcommander.converters.FileConverter;
25  import edu.uci.ics.crawler4j.crawler.Page;
26  import edu.uci.ics.crawler4j.parser.HtmlParseData;
27  import edu.uci.ics.crawler4j.parser.ParseData;
28  import org.apache.any23.plugin.crawler.CrawlerListener;
29  import org.apache.any23.plugin.crawler.SiteCrawler;
30  import org.apache.any23.source.StringDocumentSource;
31  import org.kohsuke.MetaInfServices;
32  
33  import java.io.File;
34  import java.net.URL;
35  import java.util.UUID;
36  import java.util.regex.Pattern;
37  import java.util.regex.PatternSyntaxException;
38  
39  import static java.lang.String.format;
40  
41  /**
42   * Implementation of a <b>CLI crawler</b> based on
43   * {@link Rover}.
44   *
45   * @author Michele Mostarda (mostarda@fbk.eu)
46   */
47  @MetaInfServices( value = Tool.class )
48  @Parameters(commandNames = "crawler", commandDescription = "Any23 Crawler Command Line Tool.")
49  public class Crawler extends Rover {
50  
51      private final Object roverLock = new Object();
52  
53      @Parameter(
54         names = { "-pf", "--pagefilter" },
55         description = "Regex used to filter out page URLs during crawling.",
56         converter = PatterConverter.class
57      )
58      private Pattern pageFilter = Pattern.compile( SiteCrawler.DEFAULT_PAGE_FILTER_RE );
59  
60      @Parameter(
61         names = { "-sf", "--storagefolder" },
62         description = "Folder used to store crawler temporary data.",
63         converter = FileConverter.class
64      )
65      private File storageFolder = new File(System.getProperty("java.io.tmpdir"), "crawler-metadata-" + UUID.randomUUID().toString());
66  
67      @Parameter(names = { "-nc", "--numcrawlers" }, description = "Sets the number of crawlers.")
68      private int numCrawlers = SiteCrawler.DEFAULT_NUM_OF_CRAWLERS;
69  
70      @Parameter(names = { "-mp", "--maxpages" }, description = "Max number of pages before interrupting crawl.")
71      private int maxPages = Integer.MAX_VALUE;
72  
73      @Parameter(names = { "-md", "--maxdepth" }, description = "Max allowed crawler depth.")
74      private int maxDepth = Integer.MAX_VALUE;
75  
76      @Parameter(names = { "-pd", "--politenessdelay" }, description = "Politeness delay in milliseconds.")
77      private int politenessDelay = Integer.MAX_VALUE;
78  
79      @Override
80      public void run() throws Exception {
81          super.configure();
82  
83          if (inputURIs.size() != 1) {
84              throw new IllegalArgumentException("Expected just one seed.");
85          }
86          final URL seed = new URL(inputURIs.get( 0 ));
87  
88          if ( storageFolder.isFile() ) {
89              throw new IllegalStateException( format( "Storage folder %s can not be a file, must be a directory",
90                                                       storageFolder ) );
91          }
92  
93          if ( !storageFolder.exists() ) {
94              if ( !storageFolder.mkdirs() ) {
95                  throw new IllegalStateException(
96                          format( "Storage folder %s can not be created, please verify you have enough permissions",
97                                                           storageFolder ) );
98              }
99          }
100 
101         final SiteCrawler siteCrawler = new SiteCrawler( storageFolder );
102         siteCrawler.setNumOfCrawlers( numCrawlers );
103         siteCrawler.setMaxPages( maxPages );
104         siteCrawler.setMaxDepth( maxDepth );
105         siteCrawler.setPolitenessDelay(politenessDelay);
106 
107         siteCrawler.addListener(new CrawlerListener() {
108             @Override
109             public void visitedPage(Page page) {
110                 final String pageURL = page.getWebURL().getURL();
111                 System.err.println( format("Processing page: [%s]", pageURL) );
112 
113                 final ParseData parseData = page.getParseData();
114                 if (parseData instanceof HtmlParseData) {
115                     final HtmlParseData htmlParseData = (HtmlParseData) parseData;
116                     try {
117                         synchronized (roverLock) {
118                             Crawler.super.performExtraction(
119                                     new StringDocumentSource(
120                                             htmlParseData.getHtml(),
121                                             pageURL
122 
123                                     )
124                             );
125                         }
126                     } catch (Exception e) {
127                         System.err.println(format("Error while processing page [%s], error: %s .",
128                                                   pageURL, e.getMessage())
129                         );
130                     }
131                 }
132             }
133         });
134 
135         Runtime.getRuntime().addShutdownHook( new Thread() {
136             @Override
137             public void run() {
138                 try {
139                     System.err.println( Crawler.super.printReports() );
140                     // siteCrawler.stop(); // TODO: cause shutdown hanging.
141                 } catch (Exception e) {
142                     e.printStackTrace(System.err);
143                 }
144             }
145         });
146         siteCrawler.start(seed, pageFilter, true);
147     }
148 
149     public static final class PatterConverter implements IStringConverter<Pattern> {
150 
151         @Override
152         public Pattern convert( String value ) {
153             try {
154                 return Pattern.compile( value );
155             } catch (PatternSyntaxException pse) {
156                 throw new ParameterException( format("Invalid page filter, '%s' must be a regular expression.", value) );
157             }
158         }
159 
160     }
161 
162 }