1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.extractor.ExtractionContext;
21 import org.apache.any23.extractor.ExtractionException;
22 import org.apache.any23.extractor.ExtractionParameters;
23 import org.apache.any23.extractor.ExtractionResult;
24 import org.apache.any23.extractor.Extractor;
25 import org.apache.any23.extractor.ExtractorDescription;
26 import org.apache.any23.extractor.ExtractorFactory;
27 import org.apache.any23.extractor.SimpleExtractorFactory;
28 import org.apache.any23.rdf.PopularPrefixes;
29 import org.apache.any23.rdf.RDFUtils;
30 import org.apache.any23.vocab.SINDICE;
31 import org.openrdf.model.URI;
32 import org.openrdf.model.impl.LiteralImpl;
33 import org.openrdf.model.impl.URIImpl;
34 import org.w3c.dom.Document;
35 import org.w3c.dom.NamedNodeMap;
36 import org.w3c.dom.Node;
37
38 import java.io.IOException;
39 import java.util.Arrays;
40 import java.util.HashMap;
41 import java.util.HashSet;
42 import java.util.List;
43 import java.util.Map;
44 import java.util.Set;
45
46
47
48
49
50
51
52 public class HTMLMetaExtractor implements Extractor.TagSoupDOMExtractor {
53
54 public static final String NAME = "html-head-meta";
55
56 private static final SINDICE vSINDICE = SINDICE.getInstance();
57
58 private URI profile;
59
60 private Map<String, URI> prefixes = new HashMap<String, URI>();
61
62 private String documentLang;
63
64 public final static ExtractorFactory<HTMLMetaExtractor> factory =
65 SimpleExtractorFactory.create(
66 NAME,
67 PopularPrefixes.createSubset("sindice"),
68 Arrays.asList("text/html;q=0.02", "application/xhtml+xml;q=0.02"),
69 "example-meta.html",
70 HTMLMetaExtractor.class
71 );
72
73
74
75
76 public void run(
77 ExtractionParameters extractionParameters,
78 ExtractionContext extractionContext,
79 Document in,
80 ExtractionResult out
81 ) throws IOException, ExtractionException {
82 profile = extractProfile(in);
83 documentLang = getDocumentLanguage(in);
84 extractLinkDefinedPrefixes(in);
85
86 String baseProfile = vSINDICE.NS;
87 if(profile != null) {
88 baseProfile = profile.toString();
89 }
90
91 final URI documentURI = extractionContext.getDocumentURI();
92 Set<Meta> metas = extractMetaElement(in, baseProfile);
93 for(Meta meta : metas) {
94 String lang = documentLang;
95 if(meta.getLang() != null) {
96 lang = meta.getLang();
97 }
98 out.writeTriple(
99 documentURI,
100 meta.getName(),
101 new LiteralImpl(meta.getContent(), lang)
102 );
103 }
104 }
105
106
107
108
109
110
111
112 private String getDocumentLanguage(Document in) {
113 String lang = DomUtils.find(in, "string(/HTML/@lang)");
114 if (lang.equals("")) {
115 return null;
116 }
117 return lang;
118 }
119
120 private URI extractProfile(Document in) {
121 String profile = DomUtils.find(in, "string(/HTML/@profile)");
122 if (profile.equals("")) {
123 return null;
124 }
125 return new URIImpl(profile);
126 }
127
128
129
130
131
132
133 private void extractLinkDefinedPrefixes(Document in) {
134 List<Node> linkNodes = DomUtils.findAll(in, "/HTML/HEAD/LINK");
135 for(Node linkNode : linkNodes) {
136 NamedNodeMap attributes = linkNode.getAttributes();
137 String rel = attributes.getNamedItem("rel").getTextContent();
138 String href = attributes.getNamedItem("href").getTextContent();
139 if(rel != null && href !=null && RDFUtils.isAbsoluteURI(href)) {
140 prefixes.put(rel, new URIImpl(href));
141 }
142 }
143 }
144
145 private Set<Meta> extractMetaElement(Document in, String baseProfile) {
146 List<Node> metaNodes = DomUtils.findAll(in, "/HTML/HEAD/META");
147 Set<Meta> result = new HashSet<Meta>();
148 for (Node metaNode : metaNodes) {
149 NamedNodeMap attributes = metaNode.getAttributes();
150 Node nameAttribute = attributes.getNamedItem("name");
151 Node contentAttribute = attributes.getNamedItem("content");
152 if (nameAttribute == null || contentAttribute == null) {
153 continue;
154 }
155 String name = nameAttribute.getTextContent();
156 String content = contentAttribute.getTextContent();
157 String xpath = DomUtils.getXPathForNode(metaNode);
158 URI nameAsURI = getPrefixIfExists(name);
159 if (nameAsURI == null) {
160 nameAsURI = new URIImpl(baseProfile + name);
161 }
162 Meta meta = new Meta(xpath, nameAsURI, content);
163 result.add(meta);
164 }
165 return result;
166 }
167
168 private URI getPrefixIfExists(String name) {
169 String[] split = name.split("\\.");
170 if(split.length == 2 && prefixes.containsKey(split[0])) {
171 return new URIImpl(prefixes.get(split[0]) + split[1]);
172 }
173 return null;
174 }
175
176 public ExtractorDescription getDescription() {
177 return factory;
178 }
179
180 private class Meta {
181
182 private String xpath;
183
184 private URI name;
185
186 private String lang;
187
188 private String content;
189
190 public Meta(String xpath, URI name, String content) {
191 this.xpath = xpath;
192 this.name = name;
193 this.content = content;
194 }
195
196 public Meta(String xpath, URI name, String content, String lang) {
197 this(xpath, name, content);
198 this.lang = lang;
199 }
200
201 public URI getName() {
202 return name;
203 }
204
205 public void setName(URI name) {
206 this.name = name;
207 }
208
209 public String getLang() {
210 return lang;
211 }
212
213 public void setLang(String lang) {
214 this.lang = lang;
215 }
216
217 public String getContent() {
218 return content;
219 }
220
221 public void setContent(String content) {
222 this.content = content;
223 }
224
225 @Override
226 public boolean equals(Object o) {
227 if (this == o) return true;
228 if (o == null || getClass() != o.getClass()) return false;
229
230 Meta meta = (Meta) o;
231
232 if (xpath != null ? !xpath.equals(meta.xpath) : meta.xpath != null) return false;
233
234 return true;
235 }
236
237 @Override
238 public int hashCode() {
239 return xpath != null ? xpath.hashCode() : 0;
240 }
241 }
242
243 }