1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.microdata;
19
import org.apache.any23.extractor.ExtractionContext;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.extractor.ExtractionParameters;
import org.apache.any23.extractor.ExtractionResult;
import org.apache.any23.extractor.Extractor;
import org.apache.any23.extractor.ExtractorDescription;
import org.apache.any23.extractor.ExtractorFactory;
import org.apache.any23.extractor.IssueReport;
import org.apache.any23.extractor.SimpleExtractorFactory;
import org.apache.any23.extractor.html.DomUtils;
import org.apache.any23.rdf.PopularPrefixes;
import org.apache.any23.rdf.RDFUtils;
import org.apache.any23.vocab.DCTERMS;
import org.apache.any23.vocab.XHTML;
import org.openrdf.model.Literal;
import org.openrdf.model.Resource;
import org.openrdf.model.URI;
import org.openrdf.model.Value;
import org.openrdf.model.vocabulary.RDF;
import org.openrdf.model.vocabulary.XMLSchema;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
54
55
56
57
58
59
60
61
62 public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
63
64 private static final URI MICRODATA_ITEM
65 = RDFUtils.uri("http://www.w3.org/1999/xhtml/microdata#item");
66
67 public final static ExtractorFactory<MicrodataExtractor> factory =
68 SimpleExtractorFactory.create(
69 "html-microdata",
70 PopularPrefixes.createSubset("rdf", "doac", "foaf"),
71 Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
72 "example-microdata.html",
73 MicrodataExtractor.class
74 );
75
76 private String documentLanguage;
77
78 private boolean isStrict;
79
80 private String defaultNamespace;
81
82 public ExtractorDescription getDescription() {
83 return factory;
84 }
85
86
87
88
89
90
91
92
93 public void run(
94 ExtractionParameters extractionParameters,
95 ExtractionContext extractionContext,
96 Document in,
97 ExtractionResult out
98 ) throws IOException, ExtractionException {
99
100 final MicrodataParserReport parserReport = MicrodataParser.getMicrodata(in);
101 if(parserReport.getErrors().length > 0) {
102 notifyError(parserReport.getErrors(), out);
103 }
104 final ItemScope[] itemScopes = parserReport.getDetectedItemScopes();
105 if (itemScopes.length == 0) {
106 return;
107 }
108
109 isStrict = extractionParameters.getFlag("any23.microdata.strict");
110 if (!isStrict) {
111 defaultNamespace = extractionParameters.getProperty("any23.microdata.ns.default");
112 }
113
114 documentLanguage = getDocumentLanguage(in);
115
116
117
118
119 final URI documentURI = extractionContext.getDocumentURI();
120 final Map<ItemScope, Resource> mappings = new HashMap<ItemScope, Resource>();
121 for (ItemScope itemScope : itemScopes) {
122 Resource subject = processType(itemScope, documentURI, out, mappings);
123 out.writeTriple(
124 documentURI,
125 MICRODATA_ITEM,
126 subject
127 );
128 }
129
130
131
132
133 processTitle(in, documentURI, out);
134
135
136
137 processHREFElements(in, documentURI, out);
138
139
140
141 processMetaElements(in, documentURI, out);
142
143
144
145
146 processCiteElements(in, documentURI, out);
147 }
148
149
150
151
152
153
154
155 private String getDocumentLanguage(Document in) {
156 String lang = DomUtils.find(in, "string(/HTML/@lang)");
157 if (lang.equals("")) {
158 return null;
159 }
160 return lang;
161 }
162
163
164
165
166
167
168
169
170 private String getLanguage(Node node) {
171 Node nodeLang = node.getAttributes().getNamedItem("lang");
172 if (nodeLang == null) {
173
174 return documentLanguage;
175 }
176 return nodeLang.getTextContent();
177 }
178
179
180
181
182
183
184
185
186
187 private void processTitle(Document in, URI documentURI, ExtractionResult out) {
188 NodeList titles = in.getElementsByTagName("title");
189
190 if (titles.getLength() == 1) {
191 Node title = titles.item(0);
192 String titleValue = title.getTextContent();
193 Literal object;
194 String lang = getLanguage(title);
195 if (lang == null) {
196
197 object = RDFUtils.literal(titleValue);
198 } else {
199 object = RDFUtils.literal(titleValue, lang);
200 }
201 out.writeTriple(
202 documentURI,
203 DCTERMS.getInstance().title,
204 object
205 );
206 }
207 }
208
209
210
211
212
213
214
215
216
217 private void processHREFElements(Document in, URI documentURI, ExtractionResult out) {
218 NodeList anchors = in.getElementsByTagName("a");
219 for (int i = 0; i < anchors.getLength(); i++) {
220 processHREFElement(anchors.item(i), documentURI, out);
221 }
222 NodeList areas = in.getElementsByTagName("area");
223 for (int i = 0; i < areas.getLength(); i++) {
224 processHREFElement(areas.item(i), documentURI, out);
225 }
226 NodeList links = in.getElementsByTagName("link");
227 for (int i = 0; i < links.getLength(); i++) {
228 processHREFElement(links.item(i), documentURI, out);
229 }
230 }
231
232
233
234
235
236
237
238
239
240 private void processHREFElement(Node item, URI documentURI, ExtractionResult out) {
241 Node rel = item.getAttributes().getNamedItem("rel");
242 if (rel == null) {
243 return;
244 }
245 Node href = item.getAttributes().getNamedItem("href");
246 if (href == null) {
247 return;
248 }
249 URL absoluteURL;
250 if (!isAbsoluteURL(href.getTextContent())) {
251 try {
252 absoluteURL = toAbsoluteURL(
253 documentURI.toString(),
254 href.getTextContent(),
255 '/'
256 );
257 } catch (MalformedURLException e) {
258
259 return;
260 }
261 } else {
262 try {
263 absoluteURL = new URL(href.getTextContent());
264 } catch (MalformedURLException e) {
265
266 return;
267 }
268 }
269 String[] relTokens = rel.getTextContent().split(" ");
270 Set<String> tokensWithNoDuplicates = new HashSet<String>();
271 for (String relToken : relTokens) {
272 if (relToken.contains(":")) {
273
274 continue;
275 }
276 if (relToken.equals("alternate") || relToken.equals("stylesheet")) {
277 tokensWithNoDuplicates.add("ALTERNATE-STYLESHEET");
278 continue;
279 }
280 tokensWithNoDuplicates.add(relToken.toLowerCase());
281 }
282 for (String token : tokensWithNoDuplicates) {
283 URI predicate;
284 if (isAbsoluteURL(token)) {
285 predicate = RDFUtils.uri(token);
286 } else {
287 predicate = RDFUtils.uri(XHTML.NS + token);
288 }
289 out.writeTriple(
290 documentURI,
291 predicate,
292 RDFUtils.uri(absoluteURL.toString())
293 );
294 }
295 }
296
297
298
299
300
301
302
303
304
305 private void processMetaElements(Document in, URI documentURI, ExtractionResult out) {
306 NodeList metas = in.getElementsByTagName("meta");
307 for (int i = 0; i < metas.getLength(); i++) {
308 Node meta = metas.item(i);
309 String name = DomUtils.readAttribute(meta, "name" , null);
310 String content = DomUtils.readAttribute(meta, "content", null);
311 if (name != null && content != null) {
312 if (isAbsoluteURL(name)) {
313 processMetaElement(
314 RDFUtils.uri(name),
315 content,
316 getLanguage(meta),
317 documentURI,
318 out
319 );
320 } else {
321 processMetaElement(
322 name,
323 content,
324 getLanguage(meta),
325 documentURI,
326 out
327 );
328 }
329 }
330 }
331 }
332
333
334
335
336
337
338
339
340
341
342
343 private void processMetaElement(
344 URI uri,
345 String content,
346 String language,
347 URI documentURI,
348 ExtractionResult out
349 ) {
350 if (content.contains(":")) {
351
352 return;
353 }
354 Literal subject;
355 if (language == null) {
356
357 subject = RDFUtils.literal(content);
358 } else {
359 subject = RDFUtils.literal(content, language);
360 }
361 out.writeTriple(
362 documentURI,
363 uri,
364 subject
365 );
366 }
367
368
369
370
371
372
373
374
375
376
377
378 private void processMetaElement(
379 String name,
380 String content,
381 String language,
382 URI documentURI,
383 ExtractionResult out) {
384 Literal subject;
385 if (language == null) {
386
387 subject = RDFUtils.literal(content);
388 } else {
389 subject = RDFUtils.literal(content, language);
390 }
391 out.writeTriple(
392 documentURI,
393 RDFUtils.uri(XHTML.NS + name.toLowerCase()),
394 subject
395 );
396 }
397
398
399
400
401
402
403
404
405
406 private void processCiteElements(Document in, URI documentURI, ExtractionResult out) {
407 NodeList blockQuotes = in.getElementsByTagName("blockquote");
408 for (int i = 0; i < blockQuotes.getLength(); i++) {
409 processCiteElement(blockQuotes.item(i), documentURI, out);
410 }
411 NodeList quotes = in.getElementsByTagName("q");
412 for (int i = 0; i < quotes.getLength(); i++) {
413 processCiteElement(quotes.item(i), documentURI, out);
414 }
415 }
416
417 private void processCiteElement(Node item, URI documentURI, ExtractionResult out) {
418 if (item.getAttributes().getNamedItem("cite") != null) {
419 out.writeTriple(
420 documentURI,
421 DCTERMS.getInstance().source,
422 RDFUtils.uri(item.getAttributes().getNamedItem("cite").getTextContent())
423 );
424 }
425 }
426
427
428
429
430
431
432
433
434
435
436
437
438
439 private Resource processType(
440 ItemScope itemScope,
441 URI documentURI, ExtractionResult out,
442 Map<ItemScope, Resource> mappings
443 ) throws ExtractionException {
444 Resource subject;
445 if (mappings.containsKey(itemScope)) {
446 subject = mappings.get(itemScope);
447 } else if (isAbsoluteURL(itemScope.getItemId())) {
448 subject = RDFUtils.uri(itemScope.getItemId());
449 } else {
450 subject = RDFUtils.getBNode(Integer.toString(itemScope.hashCode()));
451 }
452 mappings.put(itemScope, subject);
453
454
455 String itemScopeType = "";
456 if (itemScope.getType() != null) {
457 String itemType;
458 itemType = itemScope.getType().toString();
459 out.writeTriple(subject, RDF.TYPE, RDFUtils.uri(itemType));
460 itemScopeType = itemScope.getType().toString();
461 }
462 for (String propName : itemScope.getProperties().keySet()) {
463 List<ItemProp> itemProps = itemScope.getProperties().get(propName);
464 for (ItemProp itemProp : itemProps) {
465 try {
466 processProperty(
467 subject,
468 propName,
469 itemProp,
470 itemScopeType,
471 documentURI,
472 mappings,
473 out
474 );
475 } catch (MalformedURLException e) {
476 throw new ExtractionException(
477 "Error while processing on subject '" + subject +
478 "' the itemProp: '" + itemProp + "' "
479 );
480 }
481 }
482 }
483 return subject;
484 }
485
486 private void processProperty(
487 Resource subject,
488 String propName,
489 ItemProp itemProp,
490 String itemScopeType,
491 URI documentURI,
492 Map<ItemScope, Resource> mappings,
493 ExtractionResult out
494 ) throws MalformedURLException, ExtractionException {
495 URI predicate;
496 if (!isAbsoluteURL(propName) && itemScopeType.equals("") && isStrict) {
497 return;
498 } else if (!isAbsoluteURL(propName) && itemScopeType.equals("") && !isStrict) {
499 predicate = RDFUtils.uri(
500 toAbsoluteURL(
501 defaultNamespace,
502 propName,
503 '/'
504 ).toString()
505 );
506 } else {
507 predicate = RDFUtils.uri(
508 toAbsoluteURL(
509 itemScopeType,
510 propName,
511 '/'
512 ).toString());
513 }
514 Value value;
515 Object propValue = itemProp.getValue().getContent();
516 ItemPropValue.Type propType = itemProp.getValue().getType();
517 if (propType.equals(ItemPropValue.Type.Nested)) {
518 value = processType((ItemScope) propValue, documentURI, out, mappings);
519 } else if (propType.equals(ItemPropValue.Type.Plain)) {
520 value = RDFUtils.literal((String) propValue, documentLanguage);
521 } else if (propType.equals(ItemPropValue.Type.Link)) {
522 value = RDFUtils.uri(
523 toAbsoluteURL(
524 documentURI.toString(),
525 (String) propValue,
526 '/'
527 ).toString()
528 );
529 } else if (propType.equals(ItemPropValue.Type.Date)) {
530 value = RDFUtils.literal(ItemPropValue.formatDateTime((Date) propValue), XMLSchema.DATE);
531 } else {
532 throw new RuntimeException("Invalid Type '" +
533 propType + "' for ItemPropValue with name: '" + propName + "'");
534 }
535 out.writeTriple(subject, predicate, value);
536 }
537
538 private boolean isAbsoluteURL(String urlString) {
539 boolean result = false;
540 try {
541 URL url = new URL(urlString);
542 String protocol = url.getProtocol();
543 if (protocol != null && protocol.trim().length() > 0)
544 result = true;
545 } catch (MalformedURLException e) {
546 return false;
547 }
548 return result;
549 }
550
551 private URL toAbsoluteURL(String ns, String part, char trailing)
552 throws MalformedURLException {
553 if (isAbsoluteURL(part)) {
554 return new URL(part);
555 }
556 char lastChar = ns.charAt(ns.length() - 1);
557 if (lastChar == '#' || lastChar == '/')
558 return new URL(ns + part);
559 return new URL(ns + trailing + part);
560 }
561
562 private void notifyError(MicrodataParserException[] errors, ExtractionResult out) {
563 for(MicrodataParserException mpe : errors) {
564 out.notifyIssue(
565 IssueReport.IssueLevel.Error,
566 mpe.toJSON(),
567 mpe.getErrorLocationBeginRow(),
568 mpe.getErrorLocationBeginCol()
569 );
570 }
571 }
572
573 }