/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.any23.plugin.crawler;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

import java.io.File;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Pattern;

/**
 * A basic <em>site crawler</em> to extract semantic content
 * from small/medium sized sites.
 *
 * @author Michele Mostarda (mostarda@fbk.eu)
 */
public class SiteCrawler {

    public static final String DEFAULT_PAGE_FILTER_RE =
        ".*(\\.(" +
                    "css|js"                            +
                    "|bmp|gif|jpe?g|png|tiff?"          +
                    "|mid|mp2|mp3|mp4|wav|wma"          +
                    "|avi|mov|mpeg|ram|m4v|wmv|rm|smil" +
                    "|pdf"        +
                    "|swf"        +
                    "|zip|rar|gz" +
                    "|xml|txt"    +
        "))$";
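
    /*
     * Illustration (not part of the original source): the expression above matches URLs
     * by file extension so that binary and non-semantic resources are skipped, e.g.
     *
     *   Pattern.compile(DEFAULT_PAGE_FILTER_RE).matcher("http://example.org/logo.png").matches()   // true  -> skipped
     *   Pattern.compile(DEFAULT_PAGE_FILTER_RE).matcher("http://example.org/index.html").matches() // false -> crawled
     *
     * The example URLs are illustrative assumptions.
     */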

    /**
     * Default number of crawler instances.
     */
    public static final int DEFAULT_NUM_OF_CRAWLERS = 10;

    /**
     * Default crawler implementation.
     */
    public static final Class<? extends WebCrawler> DEFAULT_WEB_CRAWLER = DefaultWebCrawler.class;

    /**
     * Default filter applied to skip unwanted content.
     */
    public final Pattern defaultFilters = Pattern.compile(DEFAULT_PAGE_FILTER_RE);

    /**
     * The crawler threads controller.
     */
    private final CrawlController controller;

    /**
     * Crawler listeners.
     */
    private final List<CrawlerListener> listeners = new ArrayList<CrawlerListener>();

    /**
     * Actual number of crawler instances.
     */
    private int numOfCrawlers = DEFAULT_NUM_OF_CRAWLERS;

    /**
     * Actual web crawler.
     */
    private Class<? extends WebCrawler> webCrawler = DEFAULT_WEB_CRAWLER;

    /**
     * Internal crawler configuration.
     */
    private final CrawlConfig crawlConfig;

    /**
     * Internal executor service.
     */
    private ExecutorService service;

    /**
     * Constructor.
     *
     * @param storageFolder location used to store the temporary data structures used by the crawler.
     */
    public SiteCrawler(File storageFolder) {
        try {
            crawlConfig = new CrawlConfig();
            crawlConfig.setCrawlStorageFolder( storageFolder.getAbsolutePath() );
            crawlConfig.setUserAgentString("Apache Any23 Web Crawler");

            // Page fetcher shared by all crawler instances.
            final PageFetcher pageFetcher = new PageFetcher(crawlConfig);

            // robots.txt handling, backed by the shared page fetcher.
            RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
            final RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);

            controller = new CrawlController(crawlConfig, pageFetcher, robotstxtServer);
        } catch (Exception e) {
            throw new IllegalArgumentException("Error while initializing crawler controller.", e);
        }
    }

    /**
     * @return number of crawler instances.
     */
    public int getNumOfCrawlers() {
        return numOfCrawlers;
    }

    /**
     * Sets the number of crawler instances.
     *
     * @param n an integer &gt; 0.
     */
    public void setNumOfCrawlers(int n) {
        if(n <= 0) throw new IllegalArgumentException("Invalid number of crawlers, must be > 0.");
        this.numOfCrawlers = n;
    }

    /**
     * @return the actual crawler class.
     */
    public Class<? extends WebCrawler> getWebCrawler() {
        return webCrawler;
    }

    /**
     * Sets the actual crawler class.
     *
     * @param c a not <code>null</code> crawler class.
     */
    public void setWebCrawler(Class<? extends WebCrawler> c) {
        if(c == null) throw new NullPointerException("c cannot be null.");
        this.webCrawler = c;
    }

    /**
     * @return the max allowed crawl depth, <code>-1</code> means no limit.
     */
    public int getMaxDepth() {
        return crawlConfig.getMaxDepthOfCrawling();
    }

    /**
     * Sets the maximum depth.
     *
     * @param maxDepth maximum allowed depth. <code>-1</code> means no limit.
     */
    public void setMaxDepth(int maxDepth) {
        if(maxDepth < -1 || maxDepth == 0) throw new IllegalArgumentException("Invalid maxDepth, must be -1 or > 0");
        crawlConfig.setMaxDepthOfCrawling(maxDepth);
    }

    /**
     * @return the max number of allowed pages.
     */
    public int getMaxPages() {
        return crawlConfig.getMaxPagesToFetch();
    }

    /**
     * Sets the maximum number of pages to collect.
     *
     * @param maxPages maximum allowed pages. <code>-1</code> means no limit.
     */
    public void setMaxPages(int maxPages) {
        if(maxPages < -1 || maxPages == 0) throw new IllegalArgumentException("Invalid maxPages, must be -1 or > 0");
        crawlConfig.setMaxPagesToFetch(maxPages);
    }

    /**
     * @return the politeness delay in milliseconds.
     */
    public int getPolitenessDelay() {
        return crawlConfig.getPolitenessDelay();
    }

    /**
     * Sets the politeness delay.
     *
     * @param millis delay in milliseconds; negative values are ignored.
     */
    public void setPolitenessDelay(int millis) {
        if(millis >= 0) crawlConfig.setPolitenessDelay(millis);
    }

    /**
     * Registers a {@link CrawlerListener} to this crawler.
     *
     * @param listener the listener to register.
     */
    public void addListener(CrawlerListener listener) {
        listeners.add(listener);
    }

    /**
     * Deregisters a {@link CrawlerListener} from this crawler.
     *
     * @param listener the listener to deregister.
     */
    public void removeListener(CrawlerListener listener) {
        listeners.remove(listener);
    }

    /**
     * Starts the crawling process.
     *
     * @param seed the starting URL for the crawler process.
     * @param filters filters to be applied to the crawler process. Can be <code>null</code>.
     * @param wait if <code>true</code> the process will wait for the crawler termination.
     * @throws Exception if an error occurs while starting the crawling process.
     */
    public synchronized void start(
            final URL seed, final Pattern filters, final boolean wait
    ) throws Exception {
        SharedData.setCrawlData(seed.toExternalForm(), filters, Collections.synchronizedList(listeners));
        controller.addSeed(seed.toExternalForm());
        final Runnable internalRunnable = new Runnable() {
            @Override
            public void run() {
                controller.start(getWebCrawler(), getNumOfCrawlers());
            }
        };
        if(wait) {
            internalRunnable.run();
        } else {
            if(service != null) throw new IllegalStateException("Another service seems to be running.");
            service = Executors.newSingleThreadExecutor();
            service.execute(internalRunnable);
        }
    }

    /**
     * Starts the crawler process with the {@link #defaultFilters}.
     *
     * @param seed the starting URL for the crawler process.
     * @param wait if <code>true</code> the process will wait for the crawler termination.
     * @throws Exception if an error occurs while starting the crawling process.
     */
    public void start(final URL seed, final boolean wait) throws Exception {
        start(seed, defaultFilters, wait);
    }
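
    /*
     * Illustration (not part of the original source): the wait flag controls threading.
     *
     *   crawler.start(seed, null, true);   // blocking: runs in the calling thread until the crawl ends
     *   crawler.start(seed, null, false);  // non-blocking: runs on an internal single-thread executor,
     *                                      // interruptible via stop()
     *
     * The crawler and seed variables are illustrative assumptions.
     */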

    /**
     * Interrupts the crawler process, if it was started with the <code>wait</code> flag
     * set to <code>false</code>.
     */
    public synchronized void stop() {
        // Guard against invocations when no background service has been started.
        if(service != null) {
            service.shutdownNow();
        }
    }

}
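
Usage sketch (not part of the original source): a minimal driver showing how SiteCrawler could be configured and started in blocking mode. The storage folder, seed URL, and limits below are illustrative assumptions; the methods called are the ones defined in the class above.

import java.io.File;
import java.net.URL;

import org.apache.any23.plugin.crawler.SiteCrawler;

public class SiteCrawlerUsageSketch {

    public static void main(String[] args) throws Exception {
        // Folder used by the crawler to persist its temporary data structures (illustrative path).
        SiteCrawler crawler = new SiteCrawler(new File("/tmp/any23-crawl-storage"));

        // Keep the crawl small and polite (illustrative limits).
        crawler.setNumOfCrawlers(2);
        crawler.setMaxDepth(2);
        crawler.setMaxPages(100);
        crawler.setPolitenessDelay(500); // milliseconds between requests

        // A CrawlerListener implementation could be registered here via
        // crawler.addListener(...) to observe visited pages.

        // Blocking start (wait == true) with the default page filters.
        crawler.start(new URL("http://example.org/"), true);
    }
}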