/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.any23.cli;

import com.beust.jcommander.IStringConverter;
import com.beust.jcommander.Parameter;
import com.beust.jcommander.ParameterException;
import com.beust.jcommander.Parameters;
import com.beust.jcommander.converters.FileConverter;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.parser.ParseData;
import org.apache.any23.plugin.crawler.CrawlerListener;
import org.apache.any23.plugin.crawler.SiteCrawler;
import org.apache.any23.source.StringDocumentSource;

import java.io.File;
import java.net.URL;
import java.util.UUID;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import static java.lang.String.format;

/**
 * Implementation of a <b>CLI crawler</b> based on
 * {@link Rover}.
 *
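 * <p>
 * Illustrative invocation sketch (the launcher script name is an assumption;
 * the options mirror the parameters declared below, and the page filter
 * excludes matching URLs from the crawl):
 * <pre>
 *   any23tools crawler -pf ".*(\.(css|js|jpg|png))$" -sf /tmp/crawler-data -nc 4 -mp 100 -md 3 http://www.example.org/
 * </pre>
 *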
 * @author Michele Mostarda (mostarda@fbk.eu)
 */
@Parameters(commandNames = "crawler", commandDescription = "Any23 Crawler Command Line Tool.")
public class Crawler extends Rover {

    private final Object roverLock = new Object();

    @Parameter(
       names = { "-pf", "--pagefilter" },
       description = "Regex used to filter out page URLs during crawling.",
       converter = PatternConverter.class
    )
    private Pattern pageFilter = Pattern.compile( SiteCrawler.DEFAULT_PAGE_FILTER_RE );

    @Parameter(
       names = { "-sf", "--storagefolder" },
       description = "Folder used to store crawler temporary data.",
       converter = FileConverter.class
    )
    private File storageFolder = new File(System.getProperty("java.io.tmpdir"), "crawler-metadata-" + UUID.randomUUID());

    @Parameter(names = { "-nc", "--numcrawlers" }, description = "Sets the number of crawlers.")
    private int numCrawlers = SiteCrawler.DEFAULT_NUM_OF_CRAWLERS;

    @Parameter(names = { "-mp", "--maxpages" }, description = "Max number of pages before interrupting crawl.")
    private int maxPages = Integer.MAX_VALUE;

    @Parameter(names = { "-md", "--maxdepth" }, description = "Max allowed crawler depth.")
    private int maxDepth = Integer.MAX_VALUE;

    @Parameter(names = { "-pd", "--politenessdelay" }, description = "Politeness delay in milliseconds.")
    private int politenessDelay = Integer.MAX_VALUE;

    @Override
    public void run() throws Exception {
        super.configure();

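        // A single seed URL is required; it is read from the input IRI list inherited from Rover.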
        if (inputIRIs.size() != 1) {
            throw new IllegalArgumentException("Expected just one seed.");
        }
        final URL seed = new URL(inputIRIs.get( 0 ));

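        // Validate the folder used by the crawler for its temporary data, creating it if needed.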
        if ( storageFolder.isFile() ) {
            throw new IllegalStateException( format( "Storage folder %s cannot be a file, it must be a directory",
                                                     storageFolder ) );
        }

        if ( !storageFolder.exists() ) {
            if ( !storageFolder.mkdirs() ) {
                throw new IllegalStateException(
                        format( "Storage folder %s cannot be created, please verify you have sufficient permissions",
                                storageFolder ) );
            }
        }

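        // Configure the crawler4j-based SiteCrawler with the values parsed from the command line.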
        final SiteCrawler siteCrawler = new SiteCrawler( storageFolder );
        siteCrawler.setNumOfCrawlers( numCrawlers );
        siteCrawler.setMaxPages( maxPages );
        siteCrawler.setMaxDepth( maxDepth );
        siteCrawler.setPolitenessDelay(politenessDelay);

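        // On every visited page, feed the fetched HTML to the Rover extraction.
        // Extraction is serialized through roverLock, since crawler4j may invoke
        // this listener from multiple crawler threads concurrently.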
        siteCrawler.addListener(new CrawlerListener() {
            @Override
            public void visitedPage(Page page) {
                final String pageURL = page.getWebURL().getURL();
                System.err.println( format("Processing page: [%s]", pageURL) );

                final ParseData parseData = page.getParseData();
                if (parseData instanceof HtmlParseData) {
                    final HtmlParseData htmlParseData = (HtmlParseData) parseData;
                    try {
                        synchronized (roverLock) {
                            Crawler.super.performExtraction(
                                    new StringDocumentSource(
                                            htmlParseData.getHtml(),
                                            pageURL
                                    )
                            );
                        }
                    } catch (Exception e) {
                        System.err.println(format("Error while processing page [%s], error: %s.",
                                                  pageURL, e.getMessage())
                        );
                    }
                }
            }
        });

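        // Print the extraction reports when the JVM exits, since the crawl may be
        // interrupted (e.g. with CTRL+C) before it terminates on its own.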
        Runtime.getRuntime().addShutdownHook( new Thread() {
            @Override
            public void run() {
                try {
                    System.err.println( Crawler.super.printReports() );
                    // siteCrawler.stop(); // TODO: causes the shutdown to hang.
                } catch (Exception e) {
                    e.printStackTrace(System.err);
                }
            }
        });
        siteCrawler.start(seed, pageFilter, true);
    }

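    /**
     * Converts the value of the <i>--pagefilter</i> option into a compiled {@link Pattern},
     * rejecting invalid regular expressions with a {@link ParameterException}.
     */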
    public static final class PatternConverter implements IStringConverter<Pattern> {

        @Override
        public Pattern convert( String value ) {
            try {
                return Pattern.compile( value );
            } catch (PatternSyntaxException pse) {
                throw new ParameterException( format("Invalid page filter, '%s' must be a valid regular expression.", value) );
            }
        }

    }

}