View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.extractor.ExtractionResult;
22  import org.apache.any23.extractor.ExtractorDescription;
23  import org.apache.any23.extractor.IssueReport;
24  import org.apache.any23.extractor.TagSoupExtractionResult;
25  import org.apache.any23.extractor.html.annotations.Includes;
26  import org.apache.any23.vocab.VCard;
27  import org.apache.commons.lang.StringUtils;
28  import org.eclipse.rdf4j.model.BNode;
29  import org.eclipse.rdf4j.model.Resource;
30  import org.eclipse.rdf4j.model.IRI;
31  import org.eclipse.rdf4j.model.vocabulary.RDF;
32  import org.w3c.dom.NamedNodeMap;
33  import org.w3c.dom.Node;
34  
35  import java.util.ArrayList;
36  import java.util.Collection;
37  import java.util.List;
38  
39  import static org.apache.any23.extractor.html.HTMLDocument.TextField;
40  
41  
42  /**
43   * Extractor for the <a href="http://microformats.org/wiki/hcard">hCard</a>
44   * microformat.
45   *
46   * @author Gabriele Renzi
47   */
48  @Includes( extractors = AdrExtractor.class )
49  public class HCardExtractor extends EntityBasedMicroformatExtractor {
50  
51      private static final VCard vCARD = VCard.getInstance();
52  
53      private HCardName/html/HCardName.html#HCardName">HCardName name = new HCardName();
54  
55      private HTMLDocument fragment;
56  
57      @Override
58      public ExtractorDescription getDescription() {
59          return HCardExtractorFactory.getDescriptionInstance();
60      }
61  
62      @Override
63      protected String getBaseClassName() {
64          return "vcard";
65      }
66  
67      @Override
68      protected void resetExtractor() {
69          name.reset(); // Cleanup of the HCardName content.
70      }
71  
72      private void fixIncludes(HTMLDocument document, Node node, IssueReport report) {
73          NamedNodeMap attributes = node.getAttributes();
74          // header case test 32
75          if ("TD".equals(node.getNodeName()) && (null != attributes.getNamedItem("headers"))) {
76              String id = attributes.getNamedItem("headers").getNodeValue();
77              Node header = document.findNodeById(id);
78              if (null != header) {
79                  node.appendChild(header.cloneNode(true));
80                  attributes.removeNamedItem("headers");
81              }
82          }
83  
84          // include pattern, test 31
85          for (Node current : DomUtils.findAllByAttributeName(document.getDocument(), "class")) {
86              if (!DomUtils.hasClassName(current, "include")) continue;
87              // we have to remove the field soon to avoid infinite loops
88              // no null check, we know it's there or we won't be in the loop
89              current.getAttributes().removeNamedItem("class");
90              ArrayList<TextField> res = new ArrayList<TextField>();
91              HTMLDocument.readUrlField(res, current);
92              if (res.isEmpty())
93                  continue;
94              TextField id = res.get(0);
95              if (null == id)
96                  continue;
97              TextField refId = new TextField( StringUtils.substringAfter(id.value(), "#"), id.source() );
98              Node included = document.findNodeById(refId.value());
99              if (null == included)
100                 continue;
101             if( DomUtils.isAncestorOf(included, current) )  {
102                 final int[] nodeLocation = DomUtils.getNodeLocation(current);
103                 report.notifyIssue(
104                         IssueReport.IssueLevel.WARNING,
105                         "Current node tries to include an ancestor node.",
106                         nodeLocation == null ? -1 : nodeLocation[0],
107                         nodeLocation == null ? -1 : nodeLocation[1]
108                 );
109                 continue;
110             }
111             current.appendChild(included.cloneNode(true));
112         }
113     }
114 
115     @Override
116     protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
117         this.fragment = new HTMLDocument(node);
118         fixIncludes(getHTMLDocument(), node, out);
119         final BNode card = getBlankNodeFor(node);
120         boolean foundSomething = false;
121 
122         readFn();
123         readNames();
124         readOrganization();
125         foundSomething |= addFn(card);
126         foundSomething |= addNames(card);
127         foundSomething |= addOrganizationName(card);
128         foundSomething |= addStringProperty("sort-string", card, vCARD.sort_string);
129         foundSomething |= addUrl(card);
130         foundSomething |= addEmail(card);
131         foundSomething |= addPhoto(card);
132         foundSomething |= addLogo(card);
133         foundSomething |= addUid(card);
134         foundSomething |= addClass(card);
135         foundSomething |= addStringProperty("bday", card, vCARD.bday);
136         foundSomething |= addStringProperty("rev", card, vCARD.rev);
137         foundSomething |= addStringProperty("tz", card, vCARD.tz);
138         foundSomething |= addCategory(card);
139         foundSomething |= addStringProperty("card", card, vCARD.class_);
140         foundSomething |= addSubMicroformat("adr", card, vCARD.adr);
141         foundSomething |= addTelephones(card);
142         foundSomething |= addStringProperty("title", card, vCARD.title);
143         foundSomething |= addStringProperty("role", card, vCARD.role);
144         foundSomething |= addStringMultiProperty("note", card, vCARD.note);
145         foundSomething |= addSubMicroformat("geo", card, vCARD.geo);
146 
147         if (!foundSomething) return false;
148         out.writeTriple(card, RDF.TYPE, vCARD.VCard);
149 
150         final TagSoupExtractionResultorg/apache/any23/extractor/TagSoupExtractionResult.html#TagSoupExtractionResult">TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
151         tser.addResourceRoot( DomUtils.getXPathListForNode(node), card, this.getClass() );
152 
153         return true;
154     }
155 
156     private boolean addTelephones(Resource card) {
157         boolean found = false;
158         for (Node node : DomUtils.findAllByAttributeContains(fragment.getDocument(), "class", "tel")) {
159             HTMLDocumentDocument.html#HTMLDocument">HTMLDocument telFragment = new HTMLDocument(node);
160             TextField[] values = telFragment.getPluralUrlField("value");
161             if (values.length == 0) {
162                 //no sub values
163                 String[] typeAndValue = telFragment.getSingularUrlField("tel").value().split(":");
164                 //modem:goo fax:foo tel:bar
165                 if (typeAndValue.length > 1) {
166                     found |= addTel(card, "tel", typeAndValue[1]);
167                 } else {
168                     found |= addTel(card, "tel", typeAndValue[0]);
169                 }
170             } else {
171                 final String[] valuesStr = new String[values.length];
172                 for(int i = 0; i < values.length; i++) {
173                     valuesStr[i] = values[i].value();
174                 }
175                 HTMLDocument.TextField[] types = telFragment.getPluralTextField("type");
176                 if (types.length == 0) {
177                     found |= addTel(card, "tel", StringUtils.join(valuesStr));
178                 }
179                 for (HTMLDocument.TextField type : types) {
180                     found |= addTel(card, type.value(), StringUtils.join(valuesStr));
181                 }
182             }
183         }
184         return found;
185     }
186 
187     private boolean addTel(Resource card, String type, String value) {
188         IRI tel = super.fixLink(value, "tel");
189         IRI composed = vCARD.getProperty(type + "Tel", null);
190         if (composed == null) {
191             IRI simple = vCARD.getProperty(type, null);
192             if (simple == null) {
193                 return conditionallyAddResourceProperty(card, vCARD.tel, tel);
194             }
195             return conditionallyAddResourceProperty(card, simple, tel);
196         }
197         return conditionallyAddResourceProperty(card, composed, tel);
198     }
199 
200     private boolean addSubMicroformat(String className, Resource resource, IRI property) {
201         List<Node> nodes = fragment.findAllByClassName(className);
202         if (nodes.isEmpty()) return false;
203         for (Node node : nodes) {
204             addBNodeProperty(
205                     node,
206                     resource, property, getBlankNodeFor(node)
207             );
208         }
209         return true;
210     }
211 
212     private boolean addStringProperty(String className, Resource resource, IRI property) {
213         final HTMLDocument.TextField textField = fragment.getSingularTextField(className);
214         return conditionallyAddStringProperty(
215                 textField.source(),
216                 resource, property, textField.value()
217         );
218     }
219 
220     /**
221      * Adds a property that can be associated to multiple values.
222      *
223      * @param className
224      * @param resource
225      * @param property
226      * @return <code>true</code> if the multi property has been added, <code>false</code> otherwise.
227      */
228     private boolean addStringMultiProperty(String className, Resource resource, IRI property) {
229         HTMLDocument.TextField[] fields = fragment.getPluralTextField(className);
230         boolean found = false;
231         for(HTMLDocument.TextField field : fields) {
232             found |= conditionallyAddStringProperty(
233                     field.source(),
234                     resource, property, field.value()
235             );
236         }
237         return found;
238     }
239 
240     private boolean addCategory(Resource card) {
241         HTMLDocument.TextField[] categories = fragment.getPluralTextField("category");
242         boolean found = false;
243         for (HTMLDocument.TextField category : categories) {
244             found |= conditionallyAddStringProperty(
245                     category.source(),
246                     card, vCARD.category, category.value()
247             );
248         }
249         return found;
250     }
251 
252     private boolean addUid(Resource card) {
253         TextField uid = fragment.getSingularUrlField("uid");
254         return conditionallyAddStringProperty(
255                 fragment.getDocument(),
256                 card, vCARD.uid, uid.value()
257         );
258     }
259 
260     private boolean addClass(Resource card) {
261         TextField class_ = fragment.getSingularUrlField("class");
262         return conditionallyAddStringProperty(
263                 fragment.getDocument(),
264                 card, vCARD.class_, class_.value()
265         );
266     }
267 
268     private boolean addLogo(Resource card) throws ExtractionException {
269         TextField[] links = fragment.getPluralUrlField("logo");
270         boolean found = false;
271         for (TextField link : links) {
272             found |= conditionallyAddResourceProperty(
273                     card, vCARD.logo, getHTMLDocument().resolveIRI(link.value())
274             );
275         }
276         return found;
277     }
278 
279     private boolean addPhoto(Resource card) throws ExtractionException {
280         TextField[] links = fragment.getPluralUrlField("photo");
281         boolean found = false;
282         for (TextField link : links) {
283             found |= conditionallyAddResourceProperty(
284                     card, vCARD.photo, getHTMLDocument().resolveIRI(link.value())
285             );
286         }
287         return found;
288     }
289 
290     private boolean addEmail(Resource card) {
291         String email = dropSubject(fragment.getSingularUrlField("email").value());
292         return conditionallyAddResourceProperty(
293                 card,
294                 vCARD.email,
295                 fixLink(email, "mailto")
296         );
297     }
298 
299     private String dropSubject(String mail) {
300         if (mail == null) return null;
301         return mail.split("\\?")[0];
302     }
303 
304     private void readNames() {
305         for (String field : HCardName.FIELDS) {
306             HTMLDocument.TextField[] values = fragment.getPluralTextField(field);
307             for (HTMLDocument.TextField text : values) {
308                 if ("".equals(text.value())) continue;
309                 name.setField(field, text);
310             }
311         }
312     }
313 
314     private void addFieldTriple(Node n, BNode bn, String fieldName, String fieldValue) {
315         conditionallyAddLiteralProperty(
316                 n, bn, vCARD.getProperty(fieldName), valueFactory.createLiteral(fieldValue)
317         );
318     }
319 
320     private boolean addNames(Resource card) {
321         BNode n = valueFactory.createBNode();
322         addBNodeProperty(
323                 this.fragment.getDocument(),
324                 card, vCARD.n, n
325         );
326         addIRIProperty(n, RDF.TYPE, vCARD.Name);
327 
328         for (String fieldName : HCardName.FIELDS) {
329             if (!name.containsField(fieldName)) {
330                 continue;
331             }
332             if (name.isMultiField(fieldName)) {
333                 Collection<HTMLDocument.TextField> values = name.getFields(fieldName);
334                 for(TextField value : values) {
335                     addFieldTriple(
336                             value.source(),
337                             n, fieldName, value.value()
338                     );
339                 }
340             } else {
341                 TextField value =  name.getField(fieldName);
342                 if(value == null) { continue; }
343                 addFieldTriple(
344                         value.source(),
345                         n, fieldName, value.value()
346                 );
347             }
348         }
349         return true;
350     }
351 
352     private void readFn() {
353         name.setFullName(fragment.getSingularTextField("fn"));
354     }
355 
356     private boolean addFn(Resource card) {
357         final TextField fullNameTextField = name.getFullName();
358         if(fullNameTextField == null) {
359             return false;
360         }
361         return conditionallyAddStringProperty(
362                 fullNameTextField.source(),
363                 card, vCARD.fn, fullNameTextField.value()
364         );
365     }
366 
367     private void readOrganization() {
368         Node node = fragment.findMicroformattedObjectNode("*", "org");
369         if (node == null) return;
370         HTMLDocumenttml/HTMLDocument.html#HTMLDocument">HTMLDocument doc = new HTMLDocument(node);
371         String nodeText = doc.getText();
372         if(nodeText != null) {
373             name.setOrganization( new HTMLDocument.TextField(nodeText, node) );
374         }
375         nodeText = doc.getSingularTextField("organization-name").value();
376         if(nodeText == null || "".equals(nodeText) ) {
377             nodeText = HTMLDocument.readTextField(node).value();
378         }
379         name.setOrganization( new TextField(nodeText, node) );
380 
381         name.setOrganizationUnit(doc.getSingularTextField("organization-unit"));
382     }
383 
384     private boolean addOrganizationName(Resource card) {
385         if (name.getOrganization() == null) return false;
386         BNode org = valueFactory.createBNode();
387         addBNodeProperty(
388                 this.fragment.getDocument(),
389                 card, vCARD.org, org
390         );
391         addIRIProperty(org, RDF.TYPE, vCARD.Organization);
392         final TextField organizationTextField = name.getOrganization();
393         conditionallyAddLiteralProperty(
394                 organizationTextField.source(),
395                 org, vCARD.organization_name, valueFactory.createLiteral( organizationTextField.value() )
396         );
397         final TextField organizationUnitTextField = name.getOrganizationUnit();
398         if(organizationUnitTextField != null) {
399             conditionallyAddStringProperty(
400                     organizationUnitTextField.source(),
401                     org, vCARD.organization_unit, organizationUnitTextField.value()
402             );
403         }
404         return true;
405     }
406 
407     private boolean addUrl(Resource card) throws ExtractionException {
408         TextField[] links = fragment.getPluralUrlField("url");
409         boolean found = false;
410         for (TextField link : links) {
411             found |= conditionallyAddResourceProperty(card, vCARD.url, getHTMLDocument().resolveIRI(link.value()));
412         }
413         return found;
414     }
415 
416 }