View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html.microformats2;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.extractor.ExtractionResult;
22  import org.apache.any23.extractor.ExtractorDescription;
23  import org.apache.any23.extractor.TagSoupExtractionResult;
24  import org.apache.any23.extractor.html.microformats2.annotations.Includes;
25  import org.apache.any23.vocab.VCard;
26  import org.eclipse.rdf4j.model.BNode;
27  import org.eclipse.rdf4j.model.Resource;
28  import org.eclipse.rdf4j.model.vocabulary.RDF;
29  import org.w3c.dom.Node;
30  import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor;
31  import org.apache.any23.extractor.html.HTMLDocument;
32  
33  /**
34   * Extractor for the <a href="http://microformats.org/wiki/h-adr">h-adr</a> microformat.
35   *
36   * @author Nisala Nirmana
37   */
38  @Includes(extractors = HGeoExtractor.class)
39  public class HAdrExtractor extends EntityBasedMicroformatExtractor {
40  
41      private static final VCard vVCARD = VCard.getInstance();
42  
43      private static final String[] addressFields = { "street-address", "extended-address", "locality", "region",
44              "postal-code", "country-name", "geo" };
45  
46      private static final String[] geoFields = { "latitude", "longitude", "altitude" };
47  
48      protected String getBaseClassName() {
49          return Microformats2Prefixes.CLASS_PREFIX + "adr";
50      }
51  
52      @Override
53      protected void resetExtractor() {
54          // Empty.
55      }
56  
57      protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
58          if (null == node)
59              return false;
60          final HTMLDocumentl/HTMLDocument.html#HTMLDocument">HTMLDocument document = new HTMLDocument(node);
61          BNode adr = getBlankNodeFor(node);
62          out.writeTriple(adr, RDF.TYPE, vVCARD.Address);
63          final String extractorName = getDescription().getExtractorName();
64          for (String field : addressFields) {
65              HTMLDocument.TextField[] values = document
66                      .getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX + field);
67              for (HTMLDocument.TextField val : values) {
68                  if (!field.equals("geo")) {
69                      conditionallyAddStringProperty(val.source(), adr, vVCARD.getProperty(field), val.value());
70                  } else {
71                      String[] composed = val.value().split(";");
72                      for (int counter = 0; counter < composed.length; counter++) {
73                          conditionallyAddStringProperty(val.source(), adr, vVCARD.getProperty(geoFields[counter]),
74                                  composed[counter]);
75  
76                      }
77                  }
78              }
79          }
80          addGeoAsUrlResource(adr, document);
81          final TagSoupExtractionResult../org/apache/any23/extractor/TagSoupExtractionResult.html#TagSoupExtractionResult">TagSoupExtractionResult tser = (TagSoupExtractionResult) getCurrentExtractionResult();
82          tser.addResourceRoot(document.getPathToLocalRoot(), adr, this.getClass());
83          return true;
84      }
85  
86      private void addGeoAsUrlResource(Resource card, HTMLDocument document) throws ExtractionException {
87          HTMLDocument.TextField[] links = document.getPluralUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX + "geo");
88          for (HTMLDocument.TextField link : links) {
89              conditionallyAddResourceProperty(card, vVCARD.geo, getHTMLDocument().resolveIRI(link.value()));
90          }
91      }
92  
93      @Override
94      public ExtractorDescription getDescription() {
95          return HAdrExtractorFactory.getDescriptionInstance();
96      }
97  
98  }