View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.plugin.crawler;
19  
20  import edu.uci.ics.crawler4j.crawler.Page;
21  import edu.uci.ics.crawler4j.crawler.WebCrawler;
22  import edu.uci.ics.crawler4j.url.WebURL;
23  import org.slf4j.Logger;
24  import org.slf4j.LoggerFactory;
25  
26  import java.util.regex.Pattern;
27  
28  /**
29   * Default {@link WebCrawler} implementation.
30   *
31   * @author Michele Mostarda (mostarda@fbk.eu)
32   */
33  public class DefaultWebCrawler extends WebCrawler {
34  
35      private static final Logger logger = LoggerFactory.getLogger(DefaultWebCrawler.class);
36  
37      /**
38       * Shared data reference.
39       */
40      private final SharedData sharedData = SharedData.getInstance();
41  
42      /**
43       * Page filter pattern.
44       */
45      private final Pattern pattern = sharedData.getPattern();
46  
47      /**
48       * Override this method to specify whether the given URL should be visited or not.
49       */
50  
51      @Override
52      public boolean shouldVisit(Page referringPage, WebURL url) {
53          if (!super.shouldVisit(referringPage, url))
54              return false;
55          if (url.getURL() == null)
56              return false;
57          final String href = url.getURL().toLowerCase();
58          if (!href.startsWith(sharedData.getSeed()))
59              return false;
60          return pattern == null || !pattern.matcher(href).matches();
61      }
62  
63      /**
64       * Override this method to implement the single page processing logic.
65       */
66      @Override
67      public void visit(Page page) {
68          logger.trace("Visiting page: " + page.getWebURL().getURL());
69          sharedData.notifyPage(page);
70      }
71  
72  }
73