1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.extractor.ExtractionContext;
21 import org.apache.any23.extractor.ExtractionException;
22 import org.apache.any23.extractor.ExtractionParameters;
23 import org.apache.any23.extractor.ExtractionResult;
24 import org.apache.any23.extractor.ExtractorDescription;
25 import org.apache.any23.extractor.ExtractorFactory;
26 import org.apache.any23.extractor.IssueReport;
27 import org.apache.any23.extractor.SimpleExtractorFactory;
28 import org.apache.any23.rdf.PopularPrefixes;
29 import org.apache.any23.vocab.XHTML;
30 import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
31 import org.openrdf.model.URI;
32 import org.w3c.dom.Document;
33 import org.w3c.dom.Node;
34
35 import java.io.IOException;
36 import java.util.Arrays;
37
38
39
40
41
42
43
44
45
46 public class LicenseExtractor implements TagSoupDOMExtractor {
47
48 private static final XHTML vXHTML = XHTML.getInstance();
49
50 public final static ExtractorFactory<LicenseExtractor> factory =
51 SimpleExtractorFactory.create(
52 "html-mf-license",
53 PopularPrefixes.createSubset("xhtml"),
54 Arrays.asList("text/html;q=0.01", "application/xhtml+xml;q=0.01"),
55 "example-mf-license.html",
56 LicenseExtractor.class
57 );
58
59 public void run(
60 ExtractionParameters extractionParameters,
61 ExtractionContext extractionContext,
62 Document in,
63 ExtractionResult out
64 ) throws IOException, ExtractionException {
65 HTMLDocument document = new HTMLDocument(in);
66 final URI documentURI = extractionContext.getDocumentURI();
67 for (Node node : DomUtils.findAll(in, "//A[@rel='license']/@href")) {
68 String link = node.getNodeValue();
69 if ("".equals(link)) {
70 out.notifyIssue(
71 IssueReport.IssueLevel.Warning,
72 String.format(
73 "Invalid license link detected within document %s.",
74 documentURI.toString()
75 ),
76 0, 0
77 );
78 continue;
79 }
80 out.writeTriple(documentURI, vXHTML.license, document.resolveURI(link));
81 }
82 }
83
84 public ExtractorDescription getDescription() {
85 return factory;
86 }
87
88 }