View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.plugin.htmlscraper;
19  
20  import de.l3s.boilerpipe.BoilerpipeExtractor;
21  import de.l3s.boilerpipe.BoilerpipeProcessingException;
22  import de.l3s.boilerpipe.extractors.ArticleExtractor;
23  import de.l3s.boilerpipe.extractors.CanolaExtractor;
24  import de.l3s.boilerpipe.extractors.DefaultExtractor;
25  import de.l3s.boilerpipe.extractors.LargestContentExtractor;
26  import org.apache.any23.extractor.ExtractionContext;
27  import org.apache.any23.extractor.ExtractionException;
28  import org.apache.any23.extractor.ExtractionParameters;
29  import org.apache.any23.extractor.ExtractionResult;
30  import org.apache.any23.extractor.Extractor;
31  import org.apache.any23.extractor.ExtractorDescription;
32  import org.apache.any23.vocab.SINDICE;
33  import org.eclipse.rdf4j.model.IRI;
34  import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
35  
36  import java.io.IOException;
37  import java.io.InputStream;
38  import java.io.InputStreamReader;
39  import java.util.ArrayList;
40  import java.util.List;
41  
42  /**
43   * Implementation of content extractor for performing <i>HTML</i> scraping.
44   *
45   * @author Michele Mostarda (mostarda@fbk.eu)
46   */
47  public class HTMLScraperExtractor implements Extractor.ContentExtractor {
48  
49      public static final IRI PAGE_CONTENT_DE_PROPERTY  =
50              SimpleValueFactory.getInstance().createIRI(SINDICE.NS + "pagecontent/de");
51      public static final IRI PAGE_CONTENT_AE_PROPERTY  =
52              SimpleValueFactory.getInstance().createIRI(SINDICE.NS + "pagecontent/ae");
53      public static final IRI PAGE_CONTENT_LCE_PROPERTY =
54              SimpleValueFactory.getInstance().createIRI(SINDICE.NS + "pagecontent/lce");
55      public static final IRI PAGE_CONTENT_CE_PROPERTY  =
56              SimpleValueFactory.getInstance().createIRI(SINDICE.NS + "pagecontent/ce");
57  
58      private final List<ExtractionRule> extractionRules = new ArrayList<>();
59  
60      public HTMLScraperExtractor() {
61          loadDefaultRules();
62      }
63  
64      public void addTextExtractor(String name, IRI property, BoilerpipeExtractor extractor) {
65          extractionRules.add( new ExtractionRule(name, property, extractor) );
66      }
67  
68      public String[] getTextExtractors() {
69          final List<String> extractors = new ArrayList<>();
70          for(ExtractionRule er : extractionRules) {
71              extractors.add(er.name);
72          }
73          return extractors.toArray( new String[extractors.size()] );
74      }
75  
76      @Override
77      public void run(
78              ExtractionParameters extractionParameters,
79              ExtractionContext extractionContext,
80              InputStream inputStream,
81              ExtractionResult extractionResult
82      ) throws IOException, ExtractionException {
83          try {
84              final IRI documentIRI = extractionContext.getDocumentIRI();
85              for (ExtractionRule extractionRule : extractionRules) {
86                  final String content = extractionRule.boilerpipeExtractor.getText(new InputStreamReader(inputStream));
87                  extractionResult.writeTriple(
88                          documentIRI,
89                          extractionRule.property,
90                          SimpleValueFactory.getInstance().createLiteral(content)
91                  );
92              }
93          } catch (BoilerpipeProcessingException bpe) {
94              throw new ExtractionException("Error while applying text processor " + ArticleExtractor.class, bpe);
95          }
96      }
97  
98      @Override
99      public ExtractorDescription getDescription() {
100         return HTMLScraperExtractorFactory.getDescriptionInstance();
101     }
102 
103     @Override
104     public void setStopAtFirstError(boolean b) {
105         // Ignored.
106     }
107 
108     private void loadDefaultRules() {
109         addTextExtractor("default-extractor"      , PAGE_CONTENT_DE_PROPERTY , DefaultExtractor.getInstance());
110         addTextExtractor("article-extractor"      , PAGE_CONTENT_AE_PROPERTY , ArticleExtractor.getInstance());
111         addTextExtractor("large-content-extractor", PAGE_CONTENT_LCE_PROPERTY, LargestContentExtractor.getInstance());
112         addTextExtractor("canola-extractor"       , PAGE_CONTENT_CE_PROPERTY , CanolaExtractor.getInstance());
113     }
114 
115     /**
116      * This class associates a <i>BoilerPipe</i> extractor with the property going to host the extracted content.
117      */
118     class ExtractionRule {
119 
120         public final String name;
121         public final IRI property;
122         public final BoilerpipeExtractor boilerpipeExtractor;
123 
124         ExtractionRule(String name, IRI property, BoilerpipeExtractor boilerpipeExtractor) {
125             if(name == null) {
126                 throw new NullPointerException("name cannot be null.");
127             }
128             if(property == null) {
129                 throw new NullPointerException("property cannot be null.");
130             }
131             if(boilerpipeExtractor == null) {
132                 throw new NullPointerException("extractor cannot be null.");
133             }
134             this.name = name;
135             this.property = property;
136             this.boilerpipeExtractor = boilerpipeExtractor;
137         }
138 
139     }
140 }