1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.cli;
19
20 import com.beust.jcommander.IStringConverter;
21 import com.beust.jcommander.Parameter;
22 import com.beust.jcommander.ParameterException;
23 import com.beust.jcommander.Parameters;
24 import com.beust.jcommander.converters.FileConverter;
25 import edu.uci.ics.crawler4j.crawler.Page;
26 import edu.uci.ics.crawler4j.parser.HtmlParseData;
27 import edu.uci.ics.crawler4j.parser.ParseData;
28 import org.apache.any23.plugin.crawler.CrawlerListener;
29 import org.apache.any23.plugin.crawler.SiteCrawler;
30 import org.apache.any23.source.StringDocumentSource;
31 import org.kohsuke.MetaInfServices;
32
33 import java.io.File;
34 import java.net.URL;
35 import java.util.UUID;
36 import java.util.regex.Pattern;
37 import java.util.regex.PatternSyntaxException;
38
39 import static java.lang.String.format;
40
41
42
43
44
45
46
47 @MetaInfServices( value = Tool.class )
48 @Parameters(commandNames = "crawler", commandDescription = "Any23 Crawler Command Line Tool.")
49 public class Crawler extends Rover {
50
51 private final Object roverLock = new Object();
52
53 @Parameter(
54 names = { "-pf", "--pagefilter" },
55 description = "Regex used to filter out page URLs during crawling.",
56 converter = PatterConverter.class
57 )
58 private Pattern pageFilter = Pattern.compile( SiteCrawler.DEFAULT_PAGE_FILTER_RE );
59
60 @Parameter(
61 names = { "-sf", "--storagefolder" },
62 description = "Folder used to store crawler temporary data.",
63 converter = FileConverter.class
64 )
65 private File storageFolder = new File(System.getProperty("java.io.tmpdir"), "crawler-metadata-" + UUID.randomUUID().toString());
66
67 @Parameter(names = { "-nc", "--numcrawlers" }, description = "Sets the number of crawlers.")
68 private int numCrawlers = SiteCrawler.DEFAULT_NUM_OF_CRAWLERS;
69
70 @Parameter(names = { "-mp", "--maxpages" }, description = "Max number of pages before interrupting crawl.")
71 private int maxPages = Integer.MAX_VALUE;
72
73 @Parameter(names = { "-md", "--maxdepth" }, description = "Max allowed crawler depth.")
74 private int maxDepth = Integer.MAX_VALUE;
75
76 @Parameter(names = { "-pd", "--politenessdelay" }, description = "Politeness delay in milliseconds.")
77 private int politenessDelay = Integer.MAX_VALUE;
78
79 @Override
80 public void run() throws Exception {
81 super.configure();
82
83 if (inputURIs.size() != 1) {
84 throw new IllegalArgumentException("Expected just one seed.");
85 }
86 final URL seed = new URL(inputURIs.get( 0 ));
87
88 if ( storageFolder.isFile() ) {
89 throw new IllegalStateException( format( "Storage folder %s can not be a file, must be a directory",
90 storageFolder ) );
91 }
92
93 if ( !storageFolder.exists() ) {
94 if ( !storageFolder.mkdirs() ) {
95 throw new IllegalStateException(
96 format( "Storage folder %s can not be created, please verify you have enough permissions",
97 storageFolder ) );
98 }
99 }
100
101 final SiteCrawler siteCrawler = new SiteCrawler( storageFolder );
102 siteCrawler.setNumOfCrawlers( numCrawlers );
103 siteCrawler.setMaxPages( maxPages );
104 siteCrawler.setMaxDepth( maxDepth );
105 siteCrawler.setPolitenessDelay(politenessDelay);
106
107 siteCrawler.addListener(new CrawlerListener() {
108 @Override
109 public void visitedPage(Page page) {
110 final String pageURL = page.getWebURL().getURL();
111 System.err.println( format("Processing page: [%s]", pageURL) );
112
113 final ParseData parseData = page.getParseData();
114 if (parseData instanceof HtmlParseData) {
115 final HtmlParseData htmlParseData = (HtmlParseData) parseData;
116 try {
117 synchronized (roverLock) {
118 Crawler.super.performExtraction(
119 new StringDocumentSource(
120 htmlParseData.getHtml(),
121 pageURL
122
123 )
124 );
125 }
126 } catch (Exception e) {
127 System.err.println(format("Error while processing page [%s], error: %s .",
128 pageURL, e.getMessage())
129 );
130 }
131 }
132 }
133 });
134
135 Runtime.getRuntime().addShutdownHook( new Thread() {
136 @Override
137 public void run() {
138 try {
139 System.err.println( Crawler.super.printReports() );
140
141 } catch (Exception e) {
142 e.printStackTrace(System.err);
143 }
144 }
145 });
146 siteCrawler.start(seed, pageFilter, true);
147 }
148
149 public static final class PatterConverter implements IStringConverter<Pattern> {
150
151 @Override
152 public Pattern convert( String value ) {
153 try {
154 return Pattern.compile( value );
155 } catch (PatternSyntaxException pse) {
156 throw new ParameterException( format("Invalid page filter, '%s' must be a regular expression.", value) );
157 }
158 }
159
160 }
161
162 }