1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.extractor.ExtractionContext;
21 import org.apache.any23.extractor.ExtractionException;
22 import org.apache.any23.extractor.ExtractionParameters;
23 import org.apache.any23.extractor.ExtractionResult;
24 import org.apache.any23.extractor.ExtractorDescription;
25 import org.apache.any23.extractor.ExtractorFactory;
26 import org.apache.any23.extractor.SimpleExtractorFactory;
27 import org.apache.any23.rdf.Any23ValueFactoryWrapper;
28 import org.apache.any23.rdf.PopularPrefixes;
29 import org.apache.any23.vocab.FOAF;
30 import org.apache.any23.vocab.XFN;
31 import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
32 import org.openrdf.model.BNode;
33 import org.openrdf.model.URI;
34 import org.openrdf.model.impl.ValueFactoryImpl;
35 import org.openrdf.model.vocabulary.RDF;
36 import org.w3c.dom.Document;
37 import org.w3c.dom.Node;
38
39 import java.io.IOException;
40 import java.util.Arrays;
41
42
43
44
45
46
47
48 public class XFNExtractor implements TagSoupDOMExtractor {
49
50 private static final FOAF vFOAF = FOAF.getInstance();
51 private static final XFN vXFN = XFN.getInstance();
52
53 private final static Any23ValueFactoryWrapper factoryWrapper =
54 new Any23ValueFactoryWrapper(ValueFactoryImpl.getInstance());
55
56 private HTMLDocument document;
57 private ExtractionResult out;
58
59 public final static ExtractorFactory<XFNExtractor> factory =
60 SimpleExtractorFactory.create(
61 "html-mf-xfn",
62 PopularPrefixes.createSubset("rdf", "foaf", "xfn"),
63 Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
64 "example-mf-xfn.html",
65 XFNExtractor.class
66 );
67
68 public ExtractorDescription getDescription() {
69 return factory;
70 }
71
72 public void run(
73 ExtractionParameters extractionParameters,
74 ExtractionContext extractionContext,
75 Document in,
76 ExtractionResult out
77 ) throws IOException, ExtractionException {
78 factoryWrapper.setIssueReport(out);
79 try {
80 document = new HTMLDocument(in);
81 this.out = out;
82
83 BNode subject = factoryWrapper.createBNode();
84 boolean foundAnyXFN = false;
85 final URI documentURI = extractionContext.getDocumentURI();
86 for (Node link : document.findAll("//A[@rel][@href]")) {
87 foundAnyXFN |= extractLink(link, subject, documentURI);
88 }
89 if (!foundAnyXFN) return;
90 out.writeTriple(subject, RDF.TYPE, vFOAF.Person);
91 out.writeTriple(subject, vXFN.mePage, documentURI);
92 } finally {
93 factoryWrapper.setIssueReport(null);
94 }
95 }
96
97 private boolean extractLink(Node firstLink, BNode subject, URI documentURI)
98 throws ExtractionException {
99 String href = firstLink.getAttributes().getNamedItem("href").getNodeValue();
100 String rel = firstLink.getAttributes().getNamedItem("rel").getNodeValue();
101
102 String[] rels = rel.split("\\s+");
103 URI link = document.resolveURI(href);
104 if (containsRelMe(rels)) {
105 if (containsXFNRelExceptMe(rels)) {
106 return false;
107 }
108 out.writeTriple(subject, vXFN.mePage, link);
109 out.writeTriple(documentURI, vXFN.getExtendedProperty("me"), link);
110 } else {
111 BNode person2 = factoryWrapper.createBNode();
112 boolean foundAnyXFNRel = false;
113 for (String aRel : rels) {
114 foundAnyXFNRel |= extractRel(aRel, subject, documentURI, person2, link);
115 }
116 if (!foundAnyXFNRel) {
117 return false;
118 }
119 out.writeTriple(person2, RDF.TYPE, vFOAF.Person);
120 out.writeTriple(person2, vXFN.mePage, link);
121 }
122 return true;
123 }
124
125 private boolean containsRelMe(String[] rels) {
126 for (String rel : rels) {
127 if ("me".equals(rel.toLowerCase())) {
128 return true;
129 }
130 }
131 return false;
132 }
133
134 private boolean containsXFNRelExceptMe(String[] rels) {
135 for (String rel : rels) {
136 if (!"me".equals(rel.toLowerCase()) && vXFN.isXFNLocalName(rel)) {
137 return true;
138 }
139 }
140 return false;
141 }
142
143 private boolean extractRel(String rel, BNode person1, URI uri1, BNode person2, URI uri2) {
144 URI peopleProp = vXFN.getPropertyByLocalName(rel);
145 URI hyperlinkProp = vXFN.getExtendedProperty(rel);
146 if (peopleProp == null) {
147 return false;
148 }
149 out.writeTriple(person1, peopleProp, person2);
150 out.writeTriple(uri1, hyperlinkProp, uri2);
151 return true;
152 }
153
154 }