1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.extractor.ExtractionResult;
21 import org.apache.any23.extractor.ExtractorDescription;
22 import org.apache.any23.extractor.ExtractorFactory;
23 import org.apache.any23.extractor.SimpleExtractorFactory;
24 import org.apache.any23.extractor.TagSoupExtractionResult;
25 import org.apache.any23.rdf.PopularPrefixes;
26 import org.apache.any23.vocab.VCARD;
27 import org.openrdf.model.BNode;
28 import org.openrdf.model.vocabulary.RDF;
29 import org.w3c.dom.Node;
30
31 import java.util.Arrays;
32
33
34
35
36
37
38
39 public class AdrExtractor extends EntityBasedMicroformatExtractor {
40
41 private static final VCARD vVCARD = VCARD.getInstance();
42
43 private static final String[] addressFields = {
44 "post-office-box",
45 "extended-address",
46 "street-address",
47 "locality",
48 "region",
49 "country-name",
50 "postal-code"
51 };
52
53 protected String getBaseClassName() {
54 return "adr";
55 }
56
57 @Override
58 protected void resetExtractor() {
59
60 }
61
62 protected boolean extractEntity(Node node, ExtractionResult out) {
63 if (null == node) return false;
64
65 final HTMLDocument document = new HTMLDocument(node);
66 BNode adr = getBlankNodeFor(node);
67 out.writeTriple(adr, RDF.TYPE, vVCARD.Address);
68 final String extractorName = getDescription().getExtractorName();
69 for (String field : addressFields) {
70 HTMLDocument.TextField[] values = document.getPluralTextField(field);
71 for (HTMLDocument.TextField val : values) {
72 conditionallyAddStringProperty(
73 val.source(),
74 adr, vVCARD.getProperty(field), val.value()
75 );
76 }
77 }
78 HTMLDocument.TextField[] types = document.getPluralTextField("type");
79 for (HTMLDocument.TextField val : types) {
80 conditionallyAddStringProperty(
81 val.source(),
82 adr, vVCARD.addressType, val.value()
83 );
84 }
85
86 final TagSoupExtractionResult tser = (TagSoupExtractionResult) getCurrentExtractionResult();
87 tser.addResourceRoot( document.getPathToLocalRoot(), adr, this.getClass() );
88
89 return true;
90 }
91
92 public ExtractorDescription getDescription() {
93 return factory;
94 }
95
96 public final static ExtractorFactory<AdrExtractor> factory =
97 SimpleExtractorFactory.create(
98 "html-mf-adr",
99 PopularPrefixes.createSubset("rdf", "vcard"),
100 Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
101 "example-mf-adr.html",
102 AdrExtractor.class
103 );
104 }