View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.extractor.ExtractionResult;
22  import org.apache.any23.extractor.ExtractorDescription;
23  import org.apache.any23.extractor.IssueReport;
24  import org.apache.any23.extractor.SimpleExtractorFactory;
25  import org.apache.any23.extractor.TagSoupExtractionResult;
26  import org.apache.any23.extractor.html.annotations.Includes;
27  import org.apache.any23.rdf.PopularPrefixes;
28  import org.apache.any23.vocab.VCARD;
29  import org.apache.commons.lang.StringUtils;
30  import org.apache.any23.extractor.ExtractorFactory;
31  import org.openrdf.model.BNode;
32  import org.openrdf.model.Resource;
33  import org.openrdf.model.URI;
34  import org.openrdf.model.vocabulary.RDF;
35  import org.w3c.dom.NamedNodeMap;
36  import org.w3c.dom.Node;
37  
38  import java.util.ArrayList;
39  import java.util.Arrays;
40  import java.util.Collection;
41  import java.util.List;
42  
43  import static org.apache.any23.extractor.html.HTMLDocument.TextField;
44  
45  
46  /**
47   * Extractor for the <a href="http://microformats.org/wiki/hcard">hCard</a>
48   * microformat.
49   *
50   * @author Gabriele Renzi
51   */
52  @Includes( extractors = AdrExtractor.class )
53  public class HCardExtractor extends EntityBasedMicroformatExtractor {
54  
55      private static final VCARD vCARD = VCARD.getInstance();
56  
57      private HCardName name = new HCardName();
58  
59      private HTMLDocument fragment;
60  
61      public final static ExtractorFactory<HCardExtractor> factory =
62              SimpleExtractorFactory.create(
63                      "html-mf-hcard",
64                      PopularPrefixes.createSubset("rdf", "vcard"),
65                      Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
66                      "example-mf-hcard.html",
67                      HCardExtractor.class
68              );
69  
70      public ExtractorDescription getDescription() {
71          return factory;
72      }
73  
74      @Override
75      protected String getBaseClassName() {
76          return "vcard";
77      }
78  
79      @Override
80      protected void resetExtractor() {
81          name.reset(); // Cleanup of the HCardName content.
82      }
83  
84      private void fixIncludes(HTMLDocument document, Node node, IssueReport report) {
85          NamedNodeMap attributes = node.getAttributes();
86          // header case test 32
87          if ("TD".equals(node.getNodeName()) && (null != attributes.getNamedItem("headers"))) {
88              String id = attributes.getNamedItem("headers").getNodeValue();
89              Node header = document.findNodeById(id);
90              if (null != header) {
91                  node.appendChild(header.cloneNode(true));
92                  attributes.removeNamedItem("headers");
93              }
94          }
95  
96          // include pattern, test 31
97          for (Node current : DomUtils.findAllByAttributeName(document.getDocument(), "class")) {
98              if (!DomUtils.hasClassName(current, "include")) continue;
99              // we have to remove the field soon to avoid infinite loops
100             // no null check, we know it's there or we won't be in the loop
101             current.getAttributes().removeNamedItem("class");
102             ArrayList<TextField> res = new ArrayList<TextField>();
103             HTMLDocument.readUrlField(res, current);
104             TextField id = res.get(0);
105             if (null == id)
106                 continue;
107             TextField refId = new TextField( StringUtils.substringAfter(id.value(), "#"), id.source() );
108             Node included = document.findNodeById(refId.value());
109             if (null == included)
110                 continue;
111             if( DomUtils.isAncestorOf(included, current) )  {
112                 final int[] nodeLocation = DomUtils.getNodeLocation(current);
113                 report.notifyIssue(
114                         IssueReport.IssueLevel.Warning,
115                         "Current node tries to include an ancestor node.",
116                         nodeLocation[0], nodeLocation[1]
117                 );
118                 continue;
119             }
120             current.appendChild(included.cloneNode(true));
121         }
122     }
123 
124     @Override
125     protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
126         this.fragment = new HTMLDocument(node);
127         fixIncludes(getHTMLDocument(), node, out);
128         final BNode card = getBlankNodeFor(node);
129         boolean foundSomething = false;
130 
131         readFn();
132         readNames();
133         readOrganization();
134         foundSomething |= addFn(card);
135         foundSomething |= addNames(card);
136         foundSomething |= addOrganizationName(card);
137         foundSomething |= addStringProperty("sort-string", card, vCARD.sort_string);
138         foundSomething |= addUrl(card);
139         foundSomething |= addEmail(card);
140         foundSomething |= addPhoto(card);
141         foundSomething |= addLogo(card);
142         foundSomething |= addUid(card);
143         foundSomething |= addClass(card);
144         foundSomething |= addStringProperty("bday", card, vCARD.bday);
145         foundSomething |= addStringProperty("rev", card, vCARD.rev);
146         foundSomething |= addStringProperty("tz", card, vCARD.tz);
147         foundSomething |= addCategory(card);
148         foundSomething |= addStringProperty("card", card, vCARD.class_);
149         foundSomething |= addSubMicroformat("adr", card, vCARD.adr);
150         foundSomething |= addTelephones(card);
151         foundSomething |= addStringProperty("title", card, vCARD.title);
152         foundSomething |= addStringProperty("role", card, vCARD.role);
153         foundSomething |= addStringMultiProperty("note", card, vCARD.note);
154         foundSomething |= addSubMicroformat("geo", card, vCARD.geo);
155 
156         if (!foundSomething) return false;
157         out.writeTriple(card, RDF.TYPE, vCARD.VCard);
158 
159         final TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
160         tser.addResourceRoot( DomUtils.getXPathListForNode(node), card, this.getClass() );
161 
162         return true;
163     }
164 
165     private boolean addTelephones(Resource card) {
166         boolean found = false;
167         for (Node node : DomUtils.findAllByAttributeContains(fragment.getDocument(), "class", "tel")) {
168             HTMLDocument telFragment = new HTMLDocument(node);
169             TextField[] values = telFragment.getPluralUrlField("value");
170             if (values.length == 0) {
171                 //no sub values
172                 String[] typeAndValue = telFragment.getSingularUrlField("tel").value().split(":");
173                 //modem:goo fax:foo tel:bar
174                 if (typeAndValue.length > 1) {
175                     found |= addTel(card, "tel", typeAndValue[1]);
176                 } else {
177                     found |= addTel(card, "tel", typeAndValue[0]);
178                 }
179             } else {
180                 final String[] valuesStr = new String[values.length];
181                 for(int i = 0; i < values.length; i++) {
182                     valuesStr[i] = values[i].value();
183                 }
184                 HTMLDocument.TextField[] types = telFragment.getPluralTextField("type");
185                 if (types.length == 0) {
186                     found |= addTel(card, "tel", StringUtils.join(valuesStr));
187                 }
188                 for (HTMLDocument.TextField type : types) {
189                     found |= addTel(card, type.value(), StringUtils.join(valuesStr));
190                 }
191             }
192         }
193         return found;
194     }
195 
196     private boolean addTel(Resource card, String type, String value) {
197         URI tel = super.fixLink(value, "tel");
198         URI composed = vCARD.getProperty(type + "Tel", null);
199         if (composed == null) {
200             URI simple = vCARD.getProperty(type, null);
201             if (simple == null) {
202                 return conditionallyAddResourceProperty(card, vCARD.tel, tel);
203             }
204             return conditionallyAddResourceProperty(card, simple, tel);
205         }
206         return conditionallyAddResourceProperty(card, composed, tel);
207     }
208 
209     private boolean addSubMicroformat(String className, Resource resource, URI property) {
210         List<Node> nodes = fragment.findAllByClassName(className);
211         if (nodes.isEmpty()) return false;
212         for (Node node : nodes) {
213             addBNodeProperty(
214                     node,
215                     resource, property, getBlankNodeFor(node)
216             );
217         }
218         return true;
219     }
220 
221     private boolean addStringProperty(String className, Resource resource, URI property) {
222         final HTMLDocument.TextField textField = fragment.getSingularTextField(className);
223         return conditionallyAddStringProperty(
224                 textField.source(),
225                 resource, property, textField.value()
226         );
227     }
228 
229     /**
230      * Adds a property that can be associated to multiple values.
231      *
232      * @param className
233      * @param resource
234      * @param property
235      * @return <code>true</code> if the multi property has been added, <code>false</code> otherwise.
236      */
237     private boolean addStringMultiProperty(String className, Resource resource, URI property) {
238         HTMLDocument.TextField[] fields = fragment.getPluralTextField(className);
239         boolean found = false;
240         for(HTMLDocument.TextField field : fields) {
241             found |= conditionallyAddStringProperty(
242                     field.source(),
243                     resource, property, field.value()
244             );
245         }
246         return found;
247     }
248 
249     private boolean addCategory(Resource card) {
250         HTMLDocument.TextField[] categories = fragment.getPluralTextField("category");
251         boolean found = false;
252         for (HTMLDocument.TextField category : categories) {
253             found |= conditionallyAddStringProperty(
254                     category.source(),
255                     card, vCARD.category, category.value()
256             );
257         }
258         return found;
259     }
260 
261     private boolean addUid(Resource card) {
262         TextField uid = fragment.getSingularUrlField("uid");
263         return conditionallyAddStringProperty(
264                 fragment.getDocument(),
265                 card, vCARD.uid, uid.value()
266         );
267     }
268 
269     private boolean addClass(Resource card) {
270         TextField class_ = fragment.getSingularUrlField("class");
271         return conditionallyAddStringProperty(
272                 fragment.getDocument(),
273                 card, vCARD.class_, class_.value()
274         );
275     }
276 
277     private boolean addLogo(Resource card) throws ExtractionException {
278         TextField[] links = fragment.getPluralUrlField("logo");
279         boolean found = false;
280         for (TextField link : links) {
281             found |= conditionallyAddResourceProperty(
282                     card, vCARD.logo, getHTMLDocument().resolveURI(link.value())
283             );
284         }
285         return found;
286     }
287 
288     private boolean addPhoto(Resource card) throws ExtractionException {
289         TextField[] links = fragment.getPluralUrlField("photo");
290         boolean found = false;
291         for (TextField link : links) {
292             found |= conditionallyAddResourceProperty(
293                     card, vCARD.photo, getHTMLDocument().resolveURI(link.value())
294             );
295         }
296         return found;
297     }
298 
299     private boolean addEmail(Resource card) {
300         String email = dropSubject(fragment.getSingularUrlField("email").value());
301         return conditionallyAddResourceProperty(
302                 card,
303                 vCARD.email,
304                 fixLink(email, "mailto")
305         );
306     }
307 
308     private String dropSubject(String mail) {
309         if (mail == null) return null;
310         return mail.split("\\?")[0];
311     }
312 
313     private void readNames() {
314         for (String field : HCardName.FIELDS) {
315             HTMLDocument.TextField[] values = fragment.getPluralTextField(field);
316             for (HTMLDocument.TextField text : values) {
317                 if ("".equals(text.value())) continue;
318                 name.setField(field, text);
319             }
320         }
321     }
322 
323     private void addFieldTriple(Node n, BNode bn, String fieldName, String fieldValue) {
324         conditionallyAddLiteralProperty(
325                 n, bn, vCARD.getProperty(fieldName), valueFactory.createLiteral(fieldValue)
326         );
327     }
328 
329     private boolean addNames(Resource card) {
330         BNode n = valueFactory.createBNode();
331         addBNodeProperty(
332                 this.fragment.getDocument(),
333                 card, vCARD.n, n
334         );
335         addURIProperty(n, RDF.TYPE, vCARD.Name);
336 
337         for (String fieldName : HCardName.FIELDS) {
338             if (!name.containsField(fieldName)) {
339                 continue;
340             }
341             if (name.isMultiField(fieldName)) {
342                 Collection<HTMLDocument.TextField> values = name.getFields(fieldName);
343                 for(TextField value : values) {
344                     addFieldTriple(
345                             value.source(),
346                             n, fieldName, value.value()
347                     );
348                 }
349             } else {
350                 TextField value =  name.getField(fieldName);
351                 if(value == null) { continue; }
352                 addFieldTriple(
353                         value.source(),
354                         n, fieldName, value.value()
355                 );
356             }
357         }
358         return true;
359     }
360 
361     private void readFn() {
362         name.setFullName(fragment.getSingularTextField("fn"));
363     }
364 
365     private boolean addFn(Resource card) {
366         final TextField fullNameTextField = name.getFullName();
367         if(fullNameTextField == null) {
368             return false;
369         }
370         return conditionallyAddStringProperty(
371                 fullNameTextField.source(),
372                 card, vCARD.fn, fullNameTextField.value()
373         );
374     }
375 
376     private void readOrganization() {
377         Node node = fragment.findMicroformattedObjectNode("*", "org");
378         if (node == null) return;
379         HTMLDocument doc = new HTMLDocument(node);
380         String nodeText = doc.getText();
381         if(nodeText != null) {
382             name.setOrganization( new HTMLDocument.TextField(nodeText, node) );
383         }
384         nodeText = doc.getSingularTextField("organization-name").value();
385         if(nodeText == null || "".equals(nodeText) ) {
386             nodeText = HTMLDocument.readTextField(node).value();
387         }
388         name.setOrganization( new TextField(nodeText, node) );
389 
390         name.setOrganizationUnit(doc.getSingularTextField("organization-unit"));
391     }
392 
393     private boolean addOrganizationName(Resource card) {
394         if (name.getOrganization() == null) return false;
395         BNode org = valueFactory.createBNode();
396         addBNodeProperty(
397                 this.fragment.getDocument(),
398                 card, vCARD.org, org
399         );
400         addURIProperty(org, RDF.TYPE, vCARD.Organization);
401         final TextField organizationTextField = name.getOrganization();
402         conditionallyAddLiteralProperty(
403                 organizationTextField.source(),
404                 org, vCARD.organization_name, valueFactory.createLiteral( organizationTextField.value() )
405         );
406         final TextField organizationUnitTextField = name.getOrganizationUnit();
407         if(organizationUnitTextField != null) {
408             conditionallyAddStringProperty(
409                     organizationUnitTextField.source(),
410                     org, vCARD.organization_unit, organizationUnitTextField.value()
411             );
412         }
413         return true;
414     }
415 
416     private boolean addUrl(Resource card) throws ExtractionException {
417         TextField[] links = fragment.getPluralUrlField("url");
418         boolean found = false;
419         for (TextField link : links) {
420             found |= conditionallyAddResourceProperty(card, vCARD.url, getHTMLDocument().resolveURI(link.value()));
421         }
422         return found;
423     }
424 
425 }