View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.xpath;
19  
20  import org.apache.any23.extractor.ExtractionContext;
21  import org.apache.any23.extractor.ExtractionException;
22  import org.apache.any23.extractor.ExtractionParameters;
23  import org.apache.any23.extractor.ExtractionResult;
24  import org.apache.any23.extractor.Extractor;
25  import org.apache.any23.extractor.ExtractorDescription;
26  import org.apache.any23.extractor.ExtractorFactory;
27  import org.apache.any23.extractor.SimpleExtractorFactory;
28  import org.openrdf.model.URI;
29  import org.w3c.dom.Document;
30  
31  import java.io.IOException;
32  import java.util.ArrayList;
33  import java.util.Arrays;
34  import java.util.List;
35  
36  /**
37   * Implementation of an {@link org.apache.any23.extractor.Extractor.TagSoupDOMExtractor} able to
38   * apply {@link XPathExtractionRule}s and generate <i>quads</i>.
39   *
40   * @see XPathExtractionRule
41   * @author Michele Mostarda (mostarda@fbk.eu)
42   */
43  public class XPathExtractor implements Extractor.TagSoupDOMExtractor {
44  
45      public final static String NAME = "html-xpath";
46  
47      public final static ExtractorFactory<XPathExtractor> factory =
48              SimpleExtractorFactory.create(
49                      NAME,
50                      null,
51                      Arrays.asList("text/html;q=0.02", "application/xhtml+xml;q=0.02"),
52                      null,
53                      XPathExtractor.class
54              );
55  
56      private final List<XPathExtractionRule> xPathExtractionRules = new ArrayList<XPathExtractionRule>();
57  
58      public XPathExtractor(List<XPathExtractionRule> rules) {
59          xPathExtractionRules.addAll(rules);
60      }
61  
62      public void add(XPathExtractionRule rule) {
63          xPathExtractionRules.add(rule);
64      }
65  
66      public void remove(XPathExtractionRule rule) {
67          xPathExtractionRules.remove(rule);
68      }
69  
70      public boolean contains(XPathExtractionRule rule) {
71          return xPathExtractionRules.contains(rule);
72      }
73  
74      public void run(
75              ExtractionParameters extractionParameters,
76              ExtractionContext extractionContext,
77              Document in,
78              ExtractionResult out
79      )
80      throws IOException, ExtractionException {
81          final URI documentURI = extractionContext.getDocumentURI();
82          for(XPathExtractionRule rule : xPathExtractionRules) {
83              if(rule.acceptURI(documentURI)) {
84                  rule.process(in, out);
85              }
86          }
87      }
88  
89      public ExtractorDescription getDescription() {
90          return factory;
91      }
92  
93  }