1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.extractor.ExtractionContext;
21 import org.apache.any23.extractor.ExtractionException;
22 import org.apache.any23.extractor.ExtractionParameters;
23 import org.apache.any23.extractor.ExtractionResult;
24 import org.apache.any23.extractor.ExtractorDescription;
25 import org.apache.any23.extractor.ExtractorFactory;
26 import org.apache.any23.extractor.SimpleExtractorFactory;
27 import org.apache.any23.rdf.PopularPrefixes;
28 import org.apache.any23.vocab.XHTML;
29 import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
30 import org.openrdf.model.URI;
31 import org.openrdf.model.ValueFactory;
32 import org.openrdf.model.impl.ValueFactoryImpl;
33 import org.w3c.dom.Document;
34 import org.w3c.dom.Node;
35
36 import java.io.IOException;
37 import java.util.Arrays;
38 import java.util.List;
39
40
41
42
43
44 public class HeadLinkExtractor implements TagSoupDOMExtractor {
45
46 public void run(
47 ExtractionParameters extractionParameters,
48 ExtractionContext extractionContext,
49 Document in,
50 ExtractionResult out
51 ) throws IOException, ExtractionException {
52 HTMLDocument html = new HTMLDocument(in);
53 ValueFactory vf = ValueFactoryImpl.getInstance();
54
55 final List<Node> headLinkNodes = DomUtils.findAll(
56 in,
57 "/HTML/HEAD/LINK[(" +
58 "@type='application/rdf+xml' or " +
59 "@type='text/rdf' or " +
60 "@type='application/x-turtle' or " +
61 "@type='application/turtle' or " +
62 "@type='text/turtle' or " +
63 "@type='text/rdf+n3'" +
64 ") and @href and @rel]"
65 );
66 for (Node node : headLinkNodes) {
67 final URI href = html.resolveURI(DomUtils.find(node, "@href"));
68 final String rel = DomUtils.find(node, "@rel");
69 out.writeTriple(
70 extractionContext.getDocumentURI(),
71 vf.createURI(XHTML.NS + rel),
72 href
73 );
74 final String title = DomUtils.find(node, "@title");
75 if (title != null && !"".equals(title)) {
76 out.writeTriple(
77 href,
78 factory.getPrefixes().expand("dcterms:title"),
79 vf.createLiteral(title)
80 );
81 }
82 final String type = DomUtils.find(node, "@type");
83 if (type != null && !"".equals(type)) {
84 out.writeTriple(
85 href,
86 factory.getPrefixes().expand("dcterms:format"),
87 vf.createLiteral(type)
88 );
89 }
90 }
91 }
92
93 public ExtractorDescription getDescription() {
94 return factory;
95 }
96
97 public final static ExtractorFactory<HeadLinkExtractor> factory =
98 SimpleExtractorFactory.create(
99 "html-head-links",
100 PopularPrefixes.createSubset("xhtml", "dcterms"),
101 Arrays.asList("text/html;q=0.05", "application/xhtml+xml;q=0.05"),
102 "example-head-link.html",
103 HeadLinkExtractor.class);
104 }