1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.extractor.ExtractionException;
21 import org.apache.any23.extractor.ExtractionResult;
22 import org.apache.any23.extractor.ExtractorDescription;
23 import org.apache.any23.extractor.IssueReport;
24 import org.apache.any23.extractor.SimpleExtractorFactory;
25 import org.apache.any23.extractor.TagSoupExtractionResult;
26 import org.apache.any23.extractor.html.annotations.Includes;
27 import org.apache.any23.rdf.PopularPrefixes;
28 import org.apache.any23.vocab.VCARD;
29 import org.apache.commons.lang.StringUtils;
30 import org.apache.any23.extractor.ExtractorFactory;
31 import org.openrdf.model.BNode;
32 import org.openrdf.model.Resource;
33 import org.openrdf.model.URI;
34 import org.openrdf.model.vocabulary.RDF;
35 import org.w3c.dom.NamedNodeMap;
36 import org.w3c.dom.Node;
37
38 import java.util.ArrayList;
39 import java.util.Arrays;
40 import java.util.Collection;
41 import java.util.List;
42
43 import static org.apache.any23.extractor.html.HTMLDocument.TextField;
44
45
46
47
48
49
50
51
52 @Includes( extractors = AdrExtractor.class )
53 public class HCardExtractor extends EntityBasedMicroformatExtractor {
54
55 private static final VCARD vCARD = VCARD.getInstance();
56
57 private HCardName name = new HCardName();
58
59 private HTMLDocument fragment;
60
61 public final static ExtractorFactory<HCardExtractor> factory =
62 SimpleExtractorFactory.create(
63 "html-mf-hcard",
64 PopularPrefixes.createSubset("rdf", "vcard"),
65 Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
66 "example-mf-hcard.html",
67 HCardExtractor.class
68 );
69
70 public ExtractorDescription getDescription() {
71 return factory;
72 }
73
74 @Override
75 protected String getBaseClassName() {
76 return "vcard";
77 }
78
79 @Override
80 protected void resetExtractor() {
81 name.reset();
82 }
83
84 private void fixIncludes(HTMLDocument document, Node node, IssueReport report) {
85 NamedNodeMap attributes = node.getAttributes();
86
87 if ("TD".equals(node.getNodeName()) && (null != attributes.getNamedItem("headers"))) {
88 String id = attributes.getNamedItem("headers").getNodeValue();
89 Node header = document.findNodeById(id);
90 if (null != header) {
91 node.appendChild(header.cloneNode(true));
92 attributes.removeNamedItem("headers");
93 }
94 }
95
96
97 for (Node current : DomUtils.findAllByAttributeName(document.getDocument(), "class")) {
98 if (!DomUtils.hasClassName(current, "include")) continue;
99
100
101 current.getAttributes().removeNamedItem("class");
102 ArrayList<TextField> res = new ArrayList<TextField>();
103 HTMLDocument.readUrlField(res, current);
104 TextField id = res.get(0);
105 if (null == id)
106 continue;
107 TextField refId = new TextField( StringUtils.substringAfter(id.value(), "#"), id.source() );
108 Node included = document.findNodeById(refId.value());
109 if (null == included)
110 continue;
111 if( DomUtils.isAncestorOf(included, current) ) {
112 final int[] nodeLocation = DomUtils.getNodeLocation(current);
113 report.notifyIssue(
114 IssueReport.IssueLevel.Warning,
115 "Current node tries to include an ancestor node.",
116 nodeLocation[0], nodeLocation[1]
117 );
118 continue;
119 }
120 current.appendChild(included.cloneNode(true));
121 }
122 }
123
124 @Override
125 protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
126 this.fragment = new HTMLDocument(node);
127 fixIncludes(getHTMLDocument(), node, out);
128 final BNode card = getBlankNodeFor(node);
129 boolean foundSomething = false;
130
131 readFn();
132 readNames();
133 readOrganization();
134 foundSomething |= addFn(card);
135 foundSomething |= addNames(card);
136 foundSomething |= addOrganizationName(card);
137 foundSomething |= addStringProperty("sort-string", card, vCARD.sort_string);
138 foundSomething |= addUrl(card);
139 foundSomething |= addEmail(card);
140 foundSomething |= addPhoto(card);
141 foundSomething |= addLogo(card);
142 foundSomething |= addUid(card);
143 foundSomething |= addClass(card);
144 foundSomething |= addStringProperty("bday", card, vCARD.bday);
145 foundSomething |= addStringProperty("rev", card, vCARD.rev);
146 foundSomething |= addStringProperty("tz", card, vCARD.tz);
147 foundSomething |= addCategory(card);
148 foundSomething |= addStringProperty("card", card, vCARD.class_);
149 foundSomething |= addSubMicroformat("adr", card, vCARD.adr);
150 foundSomething |= addTelephones(card);
151 foundSomething |= addStringProperty("title", card, vCARD.title);
152 foundSomething |= addStringProperty("role", card, vCARD.role);
153 foundSomething |= addStringMultiProperty("note", card, vCARD.note);
154 foundSomething |= addSubMicroformat("geo", card, vCARD.geo);
155
156 if (!foundSomething) return false;
157 out.writeTriple(card, RDF.TYPE, vCARD.VCard);
158
159 final TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
160 tser.addResourceRoot( DomUtils.getXPathListForNode(node), card, this.getClass() );
161
162 return true;
163 }
164
165 private boolean addTelephones(Resource card) {
166 boolean found = false;
167 for (Node node : DomUtils.findAllByAttributeContains(fragment.getDocument(), "class", "tel")) {
168 HTMLDocument telFragment = new HTMLDocument(node);
169 TextField[] values = telFragment.getPluralUrlField("value");
170 if (values.length == 0) {
171
172 String[] typeAndValue = telFragment.getSingularUrlField("tel").value().split(":");
173
174 if (typeAndValue.length > 1) {
175 found |= addTel(card, "tel", typeAndValue[1]);
176 } else {
177 found |= addTel(card, "tel", typeAndValue[0]);
178 }
179 } else {
180 final String[] valuesStr = new String[values.length];
181 for(int i = 0; i < values.length; i++) {
182 valuesStr[i] = values[i].value();
183 }
184 HTMLDocument.TextField[] types = telFragment.getPluralTextField("type");
185 if (types.length == 0) {
186 found |= addTel(card, "tel", StringUtils.join(valuesStr));
187 }
188 for (HTMLDocument.TextField type : types) {
189 found |= addTel(card, type.value(), StringUtils.join(valuesStr));
190 }
191 }
192 }
193 return found;
194 }
195
196 private boolean addTel(Resource card, String type, String value) {
197 URI tel = super.fixLink(value, "tel");
198 URI composed = vCARD.getProperty(type + "Tel", null);
199 if (composed == null) {
200 URI simple = vCARD.getProperty(type, null);
201 if (simple == null) {
202 return conditionallyAddResourceProperty(card, vCARD.tel, tel);
203 }
204 return conditionallyAddResourceProperty(card, simple, tel);
205 }
206 return conditionallyAddResourceProperty(card, composed, tel);
207 }
208
209 private boolean addSubMicroformat(String className, Resource resource, URI property) {
210 List<Node> nodes = fragment.findAllByClassName(className);
211 if (nodes.isEmpty()) return false;
212 for (Node node : nodes) {
213 addBNodeProperty(
214 node,
215 resource, property, getBlankNodeFor(node)
216 );
217 }
218 return true;
219 }
220
221 private boolean addStringProperty(String className, Resource resource, URI property) {
222 final HTMLDocument.TextField textField = fragment.getSingularTextField(className);
223 return conditionallyAddStringProperty(
224 textField.source(),
225 resource, property, textField.value()
226 );
227 }
228
229
230
231
232
233
234
235
236
237 private boolean addStringMultiProperty(String className, Resource resource, URI property) {
238 HTMLDocument.TextField[] fields = fragment.getPluralTextField(className);
239 boolean found = false;
240 for(HTMLDocument.TextField field : fields) {
241 found |= conditionallyAddStringProperty(
242 field.source(),
243 resource, property, field.value()
244 );
245 }
246 return found;
247 }
248
249 private boolean addCategory(Resource card) {
250 HTMLDocument.TextField[] categories = fragment.getPluralTextField("category");
251 boolean found = false;
252 for (HTMLDocument.TextField category : categories) {
253 found |= conditionallyAddStringProperty(
254 category.source(),
255 card, vCARD.category, category.value()
256 );
257 }
258 return found;
259 }
260
261 private boolean addUid(Resource card) {
262 TextField uid = fragment.getSingularUrlField("uid");
263 return conditionallyAddStringProperty(
264 fragment.getDocument(),
265 card, vCARD.uid, uid.value()
266 );
267 }
268
269 private boolean addClass(Resource card) {
270 TextField class_ = fragment.getSingularUrlField("class");
271 return conditionallyAddStringProperty(
272 fragment.getDocument(),
273 card, vCARD.class_, class_.value()
274 );
275 }
276
277 private boolean addLogo(Resource card) throws ExtractionException {
278 TextField[] links = fragment.getPluralUrlField("logo");
279 boolean found = false;
280 for (TextField link : links) {
281 found |= conditionallyAddResourceProperty(
282 card, vCARD.logo, getHTMLDocument().resolveURI(link.value())
283 );
284 }
285 return found;
286 }
287
288 private boolean addPhoto(Resource card) throws ExtractionException {
289 TextField[] links = fragment.getPluralUrlField("photo");
290 boolean found = false;
291 for (TextField link : links) {
292 found |= conditionallyAddResourceProperty(
293 card, vCARD.photo, getHTMLDocument().resolveURI(link.value())
294 );
295 }
296 return found;
297 }
298
299 private boolean addEmail(Resource card) {
300 String email = dropSubject(fragment.getSingularUrlField("email").value());
301 return conditionallyAddResourceProperty(
302 card,
303 vCARD.email,
304 fixLink(email, "mailto")
305 );
306 }
307
308 private String dropSubject(String mail) {
309 if (mail == null) return null;
310 return mail.split("\\?")[0];
311 }
312
313 private void readNames() {
314 for (String field : HCardName.FIELDS) {
315 HTMLDocument.TextField[] values = fragment.getPluralTextField(field);
316 for (HTMLDocument.TextField text : values) {
317 if ("".equals(text.value())) continue;
318 name.setField(field, text);
319 }
320 }
321 }
322
323 private void addFieldTriple(Node n, BNode bn, String fieldName, String fieldValue) {
324 conditionallyAddLiteralProperty(
325 n, bn, vCARD.getProperty(fieldName), valueFactory.createLiteral(fieldValue)
326 );
327 }
328
329 private boolean addNames(Resource card) {
330 BNode n = valueFactory.createBNode();
331 addBNodeProperty(
332 this.fragment.getDocument(),
333 card, vCARD.n, n
334 );
335 addURIProperty(n, RDF.TYPE, vCARD.Name);
336
337 for (String fieldName : HCardName.FIELDS) {
338 if (!name.containsField(fieldName)) {
339 continue;
340 }
341 if (name.isMultiField(fieldName)) {
342 Collection<HTMLDocument.TextField> values = name.getFields(fieldName);
343 for(TextField value : values) {
344 addFieldTriple(
345 value.source(),
346 n, fieldName, value.value()
347 );
348 }
349 } else {
350 TextField value = name.getField(fieldName);
351 if(value == null) { continue; }
352 addFieldTriple(
353 value.source(),
354 n, fieldName, value.value()
355 );
356 }
357 }
358 return true;
359 }
360
361 private void readFn() {
362 name.setFullName(fragment.getSingularTextField("fn"));
363 }
364
365 private boolean addFn(Resource card) {
366 final TextField fullNameTextField = name.getFullName();
367 if(fullNameTextField == null) {
368 return false;
369 }
370 return conditionallyAddStringProperty(
371 fullNameTextField.source(),
372 card, vCARD.fn, fullNameTextField.value()
373 );
374 }
375
376 private void readOrganization() {
377 Node node = fragment.findMicroformattedObjectNode("*", "org");
378 if (node == null) return;
379 HTMLDocument doc = new HTMLDocument(node);
380 String nodeText = doc.getText();
381 if(nodeText != null) {
382 name.setOrganization( new HTMLDocument.TextField(nodeText, node) );
383 }
384 nodeText = doc.getSingularTextField("organization-name").value();
385 if(nodeText == null || "".equals(nodeText) ) {
386 nodeText = HTMLDocument.readTextField(node).value();
387 }
388 name.setOrganization( new TextField(nodeText, node) );
389
390 name.setOrganizationUnit(doc.getSingularTextField("organization-unit"));
391 }
392
393 private boolean addOrganizationName(Resource card) {
394 if (name.getOrganization() == null) return false;
395 BNode org = valueFactory.createBNode();
396 addBNodeProperty(
397 this.fragment.getDocument(),
398 card, vCARD.org, org
399 );
400 addURIProperty(org, RDF.TYPE, vCARD.Organization);
401 final TextField organizationTextField = name.getOrganization();
402 conditionallyAddLiteralProperty(
403 organizationTextField.source(),
404 org, vCARD.organization_name, valueFactory.createLiteral( organizationTextField.value() )
405 );
406 final TextField organizationUnitTextField = name.getOrganizationUnit();
407 if(organizationUnitTextField != null) {
408 conditionallyAddStringProperty(
409 organizationUnitTextField.source(),
410 org, vCARD.organization_unit, organizationUnitTextField.value()
411 );
412 }
413 return true;
414 }
415
416 private boolean addUrl(Resource card) throws ExtractionException {
417 TextField[] links = fragment.getPluralUrlField("url");
418 boolean found = false;
419 for (TextField link : links) {
420 found |= conditionallyAddResourceProperty(card, vCARD.url, getHTMLDocument().resolveURI(link.value()));
421 }
422 return found;
423 }
424
425 }