View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionContext;
21  import org.apache.any23.extractor.ExtractionException;
22  import org.apache.any23.extractor.ExtractionParameters;
23  import org.apache.any23.extractor.ExtractionResult;
24  import org.apache.any23.extractor.ExtractorDescription;
25  import org.apache.any23.extractor.ExtractorFactory;
26  import org.apache.any23.extractor.IssueReport;
27  import org.apache.any23.extractor.SimpleExtractorFactory;
28  import org.apache.any23.rdf.PopularPrefixes;
29  import org.apache.any23.vocab.XHTML;
30  import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
31  import org.openrdf.model.URI;
32  import org.w3c.dom.Document;
33  import org.w3c.dom.Node;
34  
35  import java.io.IOException;
36  import java.util.Arrays;
37  
38  /**
39   * Extractor for the <a href="http://microformats.org/wiki/rel-license">rel-license</a>
40   * microformat.
41   * <p/>
42   *
43   * @author Gabriele Renzi
44   * @author Richard Cyganiak
45   */
46  public class LicenseExtractor implements TagSoupDOMExtractor {
47  
48      private static final XHTML vXHTML = XHTML.getInstance();
49  
50      public final static ExtractorFactory<LicenseExtractor> factory =
51              SimpleExtractorFactory.create(
52                      "html-mf-license",
53                      PopularPrefixes.createSubset("xhtml"),
54                      Arrays.asList("text/html;q=0.01", "application/xhtml+xml;q=0.01"),
55                      "example-mf-license.html",
56                      LicenseExtractor.class
57              );
58  
59      public void run(
60              ExtractionParameters extractionParameters,
61              ExtractionContext extractionContext,
62              Document in,
63              ExtractionResult out
64      ) throws IOException, ExtractionException {
65          HTMLDocument document = new HTMLDocument(in);
66          final URI documentURI = extractionContext.getDocumentURI();
67          for (Node node : DomUtils.findAll(in, "//A[@rel='license']/@href")) {
68              String link = node.getNodeValue();
69              if ("".equals(link)) {
70                  out.notifyIssue(
71                          IssueReport.IssueLevel.Warning,
72                          String.format(
73                                  "Invalid license link detected within document %s.",
74                                  documentURI.toString()
75                          ),
76                          0, 0
77                  );
78                  continue;
79              }
80              out.writeTriple(documentURI, vXHTML.license, document.resolveURI(link));
81          }
82      }
83  
84      public ExtractorDescription getDescription() {
85          return factory;
86      }
87      
88  }