/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17  
18  package org.apache.any23.extractor.html.microformats2;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.extractor.ExtractionResult;
22  import org.apache.any23.extractor.ExtractorDescription;
23  import org.apache.any23.extractor.TagSoupExtractionResult;
24  import org.apache.any23.extractor.html.microformats2.annotations.Includes;
25  import org.apache.any23.vocab.VCard;
26  import org.eclipse.rdf4j.model.BNode;
27  import org.eclipse.rdf4j.model.Resource;
28  import org.eclipse.rdf4j.model.vocabulary.RDF;
29  import org.w3c.dom.Node;
30  import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor;
31  import org.apache.any23.extractor.html.HTMLDocument;
32  
33  /**
34   * Extractor for the <a href="http://microformats.org/wiki/h-adr">h-adr</a>
35   * microformat.
36   *
37   * @author Nisala Nirmana
38   */
39  @Includes( extractors = HGeoExtractor.class )
40  public class HAdrExtractor extends EntityBasedMicroformatExtractor {
41  
42      private static final VCard vVCARD = VCard.getInstance();
43  
44      private static final String[] addressFields = {
45              "street-address",
46              "extended-address",
47              "locality",
48              "region",
49              "postal-code",
50              "country-name",
51              "geo"
52      };
53  
54      private static final String[] geoFields = {
55              "latitude",
56              "longitude",
57              "altitude"
58      };
59  
60      protected String getBaseClassName() {
61          return Microformats2Prefixes.CLASS_PREFIX+"adr";
62      }
63  
64      @Override
65      protected void resetExtractor() {
66          // Empty.
67      }
68  
69      protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
70          if (null == node) return false;
71          final HTMLDocumentl/HTMLDocument.html#HTMLDocument">HTMLDocument document = new HTMLDocument(node);
72          BNode adr = getBlankNodeFor(node);
73          out.writeTriple(adr, RDF.TYPE, vVCARD.Address);
74          final String extractorName = getDescription().getExtractorName();
75          for (String field : addressFields) {
76              HTMLDocument.TextField[] values = document.getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX+field);
77              for (HTMLDocument.TextField val : values) {
78                 if(!field.equals("geo")) {
79                          conditionallyAddStringProperty(
80                                  val.source(),
81                                  adr, vVCARD.getProperty(field), val.value()
82                          );
83                 }else {
84                     String[] composed = val.value().split(";");
85                     for(int counter=0;counter<composed.length;counter++){
86                         conditionallyAddStringProperty(
87                                 val.source(),
88                                 adr, vVCARD.getProperty(geoFields[counter]), composed[counter]
89                         );
90  
91                     }
92                 }
93              }
94          }
95          addGeoAsUrlResource(adr,document);
96          final TagSoupExtractionResult../org/apache/any23/extractor/TagSoupExtractionResult.html#TagSoupExtractionResult">TagSoupExtractionResult tser = (TagSoupExtractionResult) getCurrentExtractionResult();
97          tser.addResourceRoot( document.getPathToLocalRoot(), adr, this.getClass());
98          return true;
99      }
100 
101     private void addGeoAsUrlResource(Resource card,HTMLDocument document) throws ExtractionException {
102         HTMLDocument.TextField[] links = document.getPluralUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX+"geo");
103         for (HTMLDocument.TextField link : links) {
104             conditionallyAddResourceProperty(card, vVCARD.geo, getHTMLDocument().resolveIRI(link.value()));
105         }
106     }
107 
108     @Override
109     public ExtractorDescription getDescription() {
110         return HAdrExtractorFactory.getDescriptionInstance();
111     }
112 
113 }