View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.extractor.ExtractionResult;
22  import org.apache.any23.extractor.ExtractorDescription;
23  import org.apache.any23.extractor.IssueReport;
24  import org.apache.any23.extractor.TagSoupExtractionResult;
25  import org.apache.any23.extractor.html.annotations.Includes;
26  import org.apache.any23.vocab.VCard;
27  import org.apache.commons.lang3.StringUtils;
28  import org.eclipse.rdf4j.model.BNode;
29  import org.eclipse.rdf4j.model.Resource;
30  import org.eclipse.rdf4j.model.IRI;
31  import org.eclipse.rdf4j.model.vocabulary.RDF;
32  import org.w3c.dom.NamedNodeMap;
33  import org.w3c.dom.Node;
34  
35  import java.util.ArrayList;
36  import java.util.Collection;
37  import java.util.List;
38  
39  import static org.apache.any23.extractor.html.HTMLDocument.TextField;
40  
41  /**
42   * Extractor for the <a href="http://microformats.org/wiki/hcard">hCard</a> microformat.
43   *
44   * @author Gabriele Renzi
45   */
46  @Includes(extractors = AdrExtractor.class)
47  public class HCardExtractor extends EntityBasedMicroformatExtractor {
48  
49      private static final VCard vCARD = VCard.getInstance();
50  
51      private HCardName/html/HCardName.html#HCardName">HCardName name = new HCardName();
52  
53      private HTMLDocument fragment;
54  
55      @Override
56      public ExtractorDescription getDescription() {
57          return HCardExtractorFactory.getDescriptionInstance();
58      }
59  
60      @Override
61      protected String getBaseClassName() {
62          return "vcard";
63      }
64  
65      @Override
66      protected void resetExtractor() {
67          name.reset(); // Cleanup of the HCardName content.
68      }
69  
70      private void fixIncludes(HTMLDocument document, Node node, IssueReport report) {
71          NamedNodeMap attributes = node.getAttributes();
72          // header case test 32
73          if ("TD".equals(node.getNodeName()) && (null != attributes.getNamedItem("headers"))) {
74              String id = attributes.getNamedItem("headers").getNodeValue();
75              Node header = document.findNodeById(id);
76              if (null != header) {
77                  node.appendChild(header.cloneNode(true));
78                  attributes.removeNamedItem("headers");
79              }
80          }
81  
82          // include pattern, test 31
83          for (Node current : DomUtils.findAllByAttributeName(document.getDocument(), "class")) {
84              if (!DomUtils.hasClassName(current, "include"))
85                  continue;
86              // we have to remove the field soon to avoid infinite loops
87              // no null check, we know it's there or we won't be in the loop
88              current.getAttributes().removeNamedItem("class");
89              ArrayList<TextField> res = new ArrayList<TextField>();
90              HTMLDocument.readUrlField(res, current);
91              if (res.isEmpty())
92                  continue;
93              TextField id = res.get(0);
94              if (null == id)
95                  continue;
96              TextField refId = new TextField(StringUtils.substringAfter(id.value(), "#"), id.source());
97              Node included = document.findNodeById(refId.value());
98              if (null == included)
99                  continue;
100             if (DomUtils.isAncestorOf(included, current)) {
101                 final int[] nodeLocation = DomUtils.getNodeLocation(current);
102                 report.notifyIssue(IssueReport.IssueLevel.WARNING, "Current node tries to include an ancestor node.",
103                         nodeLocation == null ? -1 : nodeLocation[0], nodeLocation == null ? -1 : nodeLocation[1]);
104                 continue;
105             }
106             current.appendChild(included.cloneNode(true));
107         }
108     }
109 
110     @Override
111     protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
112         this.fragment = new HTMLDocument(node);
113         fixIncludes(getHTMLDocument(), node, out);
114         final BNode card = getBlankNodeFor(node);
115         boolean foundSomething = false;
116 
117         readFn();
118         readNames();
119         readOrganization();
120         foundSomething |= addFn(card);
121         foundSomething |= addNames(card);
122         foundSomething |= addOrganizationName(card);
123         foundSomething |= addStringProperty("sort-string", card, vCARD.sort_string);
124         foundSomething |= addUrl(card);
125         foundSomething |= addEmail(card);
126         foundSomething |= addPhoto(card);
127         foundSomething |= addLogo(card);
128         foundSomething |= addUid(card);
129         foundSomething |= addClass(card);
130         foundSomething |= addStringProperty("bday", card, vCARD.bday);
131         foundSomething |= addStringProperty("rev", card, vCARD.rev);
132         foundSomething |= addStringProperty("tz", card, vCARD.tz);
133         foundSomething |= addCategory(card);
134         foundSomething |= addStringProperty("card", card, vCARD.class_);
135         foundSomething |= addSubMicroformat("adr", card, vCARD.adr);
136         foundSomething |= addTelephones(card);
137         foundSomething |= addStringProperty("title", card, vCARD.title);
138         foundSomething |= addStringProperty("role", card, vCARD.role);
139         foundSomething |= addStringMultiProperty("note", card, vCARD.note);
140         foundSomething |= addSubMicroformat("geo", card, vCARD.geo);
141 
142         if (!foundSomething)
143             return false;
144         out.writeTriple(card, RDF.TYPE, vCARD.VCard);
145 
146         final TagSoupExtractionResultorg/apache/any23/extractor/TagSoupExtractionResult.html#TagSoupExtractionResult">TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
147         tser.addResourceRoot(DomUtils.getXPathListForNode(node), card, this.getClass());
148 
149         return true;
150     }
151 
152     private boolean addTelephones(Resource card) {
153         boolean found = false;
154         for (Node node : DomUtils.findAllByAttributeContains(fragment.getDocument(), "class", "tel")) {
155             HTMLDocumentDocument.html#HTMLDocument">HTMLDocument telFragment = new HTMLDocument(node);
156             TextField[] values = telFragment.getPluralUrlField("value");
157             if (values.length == 0) {
158                 // no sub values
159                 String[] typeAndValue = telFragment.getSingularUrlField("tel").value().split(":");
160                 // modem:goo fax:foo tel:bar
161                 if (typeAndValue.length > 1) {
162                     found |= addTel(card, "tel", typeAndValue[1]);
163                 } else {
164                     found |= addTel(card, "tel", typeAndValue[0]);
165                 }
166             } else {
167                 final String[] valuesStr = new String[values.length];
168                 for (int i = 0; i < values.length; i++) {
169                     valuesStr[i] = values[i].value();
170                 }
171                 HTMLDocument.TextField[] types = telFragment.getPluralTextField("type");
172                 if (types.length == 0) {
173                     found |= addTel(card, "tel", StringUtils.join(valuesStr));
174                 }
175                 for (HTMLDocument.TextField type : types) {
176                     found |= addTel(card, type.value(), StringUtils.join(valuesStr));
177                 }
178             }
179         }
180         return found;
181     }
182 
183     private boolean addTel(Resource card, String type, String value) {
184         IRI tel = super.fixLink(value, "tel");
185         IRI composed = vCARD.getProperty(type + "Tel", null);
186         if (composed == null) {
187             IRI simple = vCARD.getProperty(type, null);
188             if (simple == null) {
189                 return conditionallyAddResourceProperty(card, vCARD.tel, tel);
190             }
191             return conditionallyAddResourceProperty(card, simple, tel);
192         }
193         return conditionallyAddResourceProperty(card, composed, tel);
194     }
195 
196     private boolean addSubMicroformat(String className, Resource resource, IRI property) {
197         List<Node> nodes = fragment.findAllByClassName(className);
198         if (nodes.isEmpty())
199             return false;
200         for (Node node : nodes) {
201             addBNodeProperty(node, resource, property, getBlankNodeFor(node));
202         }
203         return true;
204     }
205 
206     private boolean addStringProperty(String className, Resource resource, IRI property) {
207         final HTMLDocument.TextField textField = fragment.getSingularTextField(className);
208         return conditionallyAddStringProperty(textField.source(), resource, property, textField.value());
209     }
210 
211     /**
212      * Adds a property that can be associated to multiple values.
213      *
214      * @param className
215      * @param resource
216      * @param property
217      * 
218      * @return <code>true</code> if the multi property has been added, <code>false</code> otherwise.
219      */
220     private boolean addStringMultiProperty(String className, Resource resource, IRI property) {
221         HTMLDocument.TextField[] fields = fragment.getPluralTextField(className);
222         boolean found = false;
223         for (HTMLDocument.TextField field : fields) {
224             found |= conditionallyAddStringProperty(field.source(), resource, property, field.value());
225         }
226         return found;
227     }
228 
229     private boolean addCategory(Resource card) {
230         HTMLDocument.TextField[] categories = fragment.getPluralTextField("category");
231         boolean found = false;
232         for (HTMLDocument.TextField category : categories) {
233             found |= conditionallyAddStringProperty(category.source(), card, vCARD.category, category.value());
234         }
235         return found;
236     }
237 
238     private boolean addUid(Resource card) {
239         TextField uid = fragment.getSingularUrlField("uid");
240         return conditionallyAddStringProperty(fragment.getDocument(), card, vCARD.uid, uid.value());
241     }
242 
243     private boolean addClass(Resource card) {
244         TextField class_ = fragment.getSingularUrlField("class");
245         return conditionallyAddStringProperty(fragment.getDocument(), card, vCARD.class_, class_.value());
246     }
247 
248     private boolean addLogo(Resource card) throws ExtractionException {
249         TextField[] links = fragment.getPluralUrlField("logo");
250         boolean found = false;
251         for (TextField link : links) {
252             found |= conditionallyAddResourceProperty(card, vCARD.logo, getHTMLDocument().resolveIRI(link.value()));
253         }
254         return found;
255     }
256 
257     private boolean addPhoto(Resource card) throws ExtractionException {
258         TextField[] links = fragment.getPluralUrlField("photo");
259         boolean found = false;
260         for (TextField link : links) {
261             found |= conditionallyAddResourceProperty(card, vCARD.photo, getHTMLDocument().resolveIRI(link.value()));
262         }
263         return found;
264     }
265 
266     private boolean addEmail(Resource card) {
267         String email = dropSubject(fragment.getSingularUrlField("email").value());
268         return conditionallyAddResourceProperty(card, vCARD.email, fixLink(email, "mailto"));
269     }
270 
271     private String dropSubject(String mail) {
272         if (mail == null)
273             return null;
274         return mail.split("\\?")[0];
275     }
276 
277     private void readNames() {
278         for (String field : HCardName.FIELDS) {
279             HTMLDocument.TextField[] values = fragment.getPluralTextField(field);
280             for (HTMLDocument.TextField text : values) {
281                 if ("".equals(text.value()))
282                     continue;
283                 name.setField(field, text);
284             }
285         }
286     }
287 
288     private void addFieldTriple(Node n, BNode bn, String fieldName, String fieldValue) {
289         conditionallyAddLiteralProperty(n, bn, vCARD.getProperty(fieldName), valueFactory.createLiteral(fieldValue));
290     }
291 
292     private boolean addNames(Resource card) {
293         BNode n = valueFactory.createBNode();
294         addBNodeProperty(this.fragment.getDocument(), card, vCARD.n, n);
295         addIRIProperty(n, RDF.TYPE, vCARD.Name);
296 
297         for (String fieldName : HCardName.FIELDS) {
298             if (!name.containsField(fieldName)) {
299                 continue;
300             }
301             if (name.isMultiField(fieldName)) {
302                 Collection<HTMLDocument.TextField> values = name.getFields(fieldName);
303                 for (TextField value : values) {
304                     addFieldTriple(value.source(), n, fieldName, value.value());
305                 }
306             } else {
307                 TextField value = name.getField(fieldName);
308                 if (value == null) {
309                     continue;
310                 }
311                 addFieldTriple(value.source(), n, fieldName, value.value());
312             }
313         }
314         return true;
315     }
316 
317     private void readFn() {
318         name.setFullName(fragment.getSingularTextField("fn"));
319     }
320 
321     private boolean addFn(Resource card) {
322         final TextField fullNameTextField = name.getFullName();
323         if (fullNameTextField == null) {
324             return false;
325         }
326         return conditionallyAddStringProperty(fullNameTextField.source(), card, vCARD.fn, fullNameTextField.value());
327     }
328 
329     private void readOrganization() {
330         Node node = fragment.findMicroformattedObjectNode("*", "org");
331         if (node == null)
332             return;
333         HTMLDocumenttml/HTMLDocument.html#HTMLDocument">HTMLDocument doc = new HTMLDocument(node);
334         String nodeText = doc.getText();
335         if (nodeText != null) {
336             name.setOrganization(new HTMLDocument.TextField(nodeText, node));
337         }
338         nodeText = doc.getSingularTextField("organization-name").value();
339         if (nodeText == null || "".equals(nodeText)) {
340             nodeText = HTMLDocument.readTextField(node).value();
341         }
342         name.setOrganization(new TextField(nodeText, node));
343 
344         name.setOrganizationUnit(doc.getSingularTextField("organization-unit"));
345     }
346 
347     private boolean addOrganizationName(Resource card) {
348         if (name.getOrganization() == null)
349             return false;
350         BNode org = valueFactory.createBNode();
351         addBNodeProperty(this.fragment.getDocument(), card, vCARD.org, org);
352         addIRIProperty(org, RDF.TYPE, vCARD.Organization);
353         final TextField organizationTextField = name.getOrganization();
354         conditionallyAddLiteralProperty(organizationTextField.source(), org, vCARD.organization_name,
355                 valueFactory.createLiteral(organizationTextField.value()));
356         final TextField organizationUnitTextField = name.getOrganizationUnit();
357         if (organizationUnitTextField != null) {
358             conditionallyAddStringProperty(organizationUnitTextField.source(), org, vCARD.organization_unit,
359                     organizationUnitTextField.value());
360         }
361         return true;
362     }
363 
364     private boolean addUrl(Resource card) throws ExtractionException {
365         TextField[] links = fragment.getPluralUrlField("url");
366         boolean found = false;
367         for (TextField link : links) {
368             found |= conditionallyAddResourceProperty(card, vCARD.url, getHTMLDocument().resolveIRI(link.value()));
369         }
370         return found;
371     }
372 
373 }