1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.extractor.ExtractionException;
21 import org.apache.any23.extractor.ExtractionResult;
22 import org.apache.any23.extractor.ExtractorDescription;
23 import org.apache.any23.extractor.ExtractorFactory;
24 import org.apache.any23.extractor.SimpleExtractorFactory;
25 import org.apache.any23.extractor.TagSoupExtractionResult;
26 import org.apache.any23.rdf.PopularPrefixes;
27 import org.apache.any23.vocab.DCTERMS;
28 import org.apache.any23.vocab.REVIEW;
29 import org.apache.any23.vocab.VCARD;
30 import org.openrdf.model.BNode;
31 import org.openrdf.model.Resource;
32 import org.openrdf.model.vocabulary.RDF;
33 import org.w3c.dom.Node;
34
35 import java.util.Arrays;
36 import java.util.List;
37
38 import static org.apache.any23.extractor.html.HTMLDocument.TextField;
39
40
41
42
43
44
45
46 public class HReviewExtractor extends EntityBasedMicroformatExtractor {
47
48 private static final REVIEW vREVIEW = REVIEW.getInstance();
49 private static final VCARD vVCARD = VCARD.getInstance();
50 private static final DCTERMS vDCTERMS = DCTERMS.getInstance();
51
52 public final static ExtractorFactory<HReviewExtractor> factory =
53 SimpleExtractorFactory.create(
54 "html-mf-hreview",
55 PopularPrefixes.createSubset("rdf", "vcard", "rev"),
56 Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
57 "example-mf-hreview.html",
58 HReviewExtractor.class
59 );
60
61 public ExtractorDescription getDescription() {
62 return factory;
63 }
64
65 protected String getBaseClassName() {
66 return "hreview";
67 }
68
69 @Override
70 protected void resetExtractor() {
71
72 }
73
74 protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
75 BNode rev = getBlankNodeFor(node);
76 out.writeTriple(rev, RDF.TYPE, vREVIEW.Review);
77 final HTMLDocument fragment = new HTMLDocument(node);
78 addRating(fragment, rev);
79 addSummary(fragment, rev);
80 addTime(fragment, rev);
81 addType(fragment, rev);
82 addDescription(fragment, rev);
83 addItem(fragment, rev);
84 addReviewer(fragment, rev);
85
86 final TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
87 tser.addResourceRoot(
88 DomUtils.getXPathListForNode(node),
89 rev,
90 this.getClass()
91 );
92
93 return true;
94 }
95
96 private void addType(HTMLDocument doc, Resource rev) {
97 TextField value = doc.getSingularTextField("type");
98 conditionallyAddStringProperty(
99 value.source(),
100 rev, vREVIEW.type, value.value()
101 );
102 }
103
104 private void addReviewer(HTMLDocument doc, Resource rev) {
105 List<Node> nodes = doc.findAllByClassName("reviewer");
106 if (nodes.size() > 0) {
107 Node node0 = nodes.get(0);
108 addBNodeProperty(
109 node0,
110 rev, vREVIEW.reviewer, getBlankNodeFor(node0)
111 );
112 }
113 }
114
115 private void addItem(HTMLDocument root, BNode rev) throws ExtractionException {
116 List<Node> nodes = root.findAllByClassName("item");
117 for (Node node : nodes) {
118 Resource item = findDummy(new HTMLDocument(node));
119 addBNodeProperty(
120 node,
121 item, vREVIEW.hasReview, rev
122 );
123 }
124 }
125
126 private Resource findDummy(HTMLDocument item) throws ExtractionException {
127 Resource blank = getBlankNodeFor(item.getDocument());
128 TextField val = item.getSingularTextField("fn");
129 conditionallyAddStringProperty(
130 val.source(),
131 blank, vVCARD.fn, val.value()
132 );
133 final TextField url = item.getSingularUrlField("url");
134 conditionallyAddResourceProperty(blank, vVCARD.url, getHTMLDocument().resolveURI(url.value()));
135 TextField pics[] = item.getPluralUrlField("photo");
136 for (TextField pic : pics) {
137 addURIProperty(blank, vVCARD.photo, getHTMLDocument().resolveURI(pic.value()));
138 }
139 return blank;
140 }
141
142 private void addRating(HTMLDocument doc, Resource rev) {
143 HTMLDocument.TextField value = doc.getSingularTextField("rating");
144 conditionallyAddStringProperty(
145 value.source(), rev, vREVIEW.rating, value.value()
146 );
147 }
148
149 private void addSummary(HTMLDocument doc, Resource rev) {
150 TextField value = doc.getSingularTextField("summary");
151 conditionallyAddStringProperty(
152 value.source(),
153 rev, vREVIEW.title, value.value()
154 );
155 }
156
157 private void addTime(HTMLDocument doc, Resource rev) {
158 TextField value = doc.getSingularTextField("dtreviewed");
159 conditionallyAddStringProperty(
160 value.source(),
161 rev, vDCTERMS.date, value.value()
162 );
163 }
164
165 private void addDescription(HTMLDocument doc, Resource rev) {
166 TextField value = doc.getSingularTextField("description");
167 conditionallyAddStringProperty(
168 value.source(),
169 rev, vREVIEW.text, value.value()
170 );
171 }
172
173 }