View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.plugin.crawler;
19  
20  import edu.uci.ics.crawler4j.crawler.Page;
21  
22  import java.util.List;
23  import java.util.regex.Pattern;
24  
25  /**
26   * This class hosts shared data structures accessible
27   * to all the {@link DefaultWebCrawler} instances
28   * run by the {@link SiteCrawler}.
29   *
30   * @author Michele Mostarda (mostarda@fbk.eu)
31   */
32  public class SharedData {
33  
34      /**
35       * Singleton instance.
36       */
37      private static SharedData instance;
38  
39      /**
40       * Crawl seed.
41       */
42      private final String seed;
43  
44      /**
45       * Crawl page filter pattern.
46       */
47      private final Pattern pattern;
48  
49      /**
50       * List of crawler listeners.
51       */
52      private final List<CrawlerListener> listeners;
53  
54  //    /**
55  //     * Output triple handler.
56  //     */
57  //    private final TripleHandler tripleHandler;
58  
59      /**
60       * @return the singleton instance.
61       */
62      protected static SharedData getInstance() {
63          if(instance == null) throw new IllegalStateException("The configuration has not yet initialized.");
64          return instance;
65      }
66  
67      /**
68       * Initializes the crawler data.
69       *
70       * @param seed crawler seed.
71       * @param regex page filter regex.
72       * @param listeners the listeners to be notified of the crawler activity.
73       */
74      protected static void setCrawlData(String seed, Pattern regex, List<CrawlerListener> listeners) {
75          instance = new SharedData(seed, regex, listeners);
76      }
77  
78      /**
79       * Internal constructor.
80       *
81       * @param seed
82       * @param pattern
83       * @param listeners
84       */
85      private SharedData(String seed, Pattern pattern, List<CrawlerListener> listeners) {
86          if(seed == null || seed.trim().length() == 0)
87              throw new IllegalArgumentException(
88                  String.format("Invalid seed '%s'", seed)
89              );
90  
91          this.seed      = seed;
92          this.pattern   = pattern;
93          this.listeners = listeners;
94      }
95  
96      /**
97       * @return crawl seed.
98       */
99      protected String getSeed() {
100         return seed;
101     }
102 
103     /**
104      * @return page filter pattern.
105      */
106     protected Pattern getPattern() {
107         return pattern;
108     }
109 
110     /**
111      * Notifies all listeners that a page has been discovered.
112      *
113      * @param page the discovered page.
114      */
115     protected void notifyPage(Page page) {
116         for(CrawlerListener listener : listeners) {
117             listener.visitedPage(page);
118         }
119     }
120 
121 }