View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionResult;
21  import org.apache.any23.extractor.ExtractorDescription;
22  import org.apache.any23.extractor.TagSoupExtractionResult;
23  import org.apache.any23.vocab.VCard;
24  import org.eclipse.rdf4j.model.BNode;
25  import org.eclipse.rdf4j.model.vocabulary.RDF;
26  import org.w3c.dom.Node;
27  
28  /**
29   * Extractor for the <a href="http://microformats.org/wiki/adr">adr</a>
30   * microformat.
31   *
32   * @author Gabriele Renzi
33   */
34  public class AdrExtractor extends EntityBasedMicroformatExtractor {
35  
36      private static final VCard vVCARD = VCard.getInstance();
37  
38      private static final String[] addressFields = {
39              "post-office-box",
40              "extended-address",
41              "street-address",
42              "locality",
43              "region",
44              "country-name",
45              "postal-code"
46      };
47  
48      protected String getBaseClassName() {
49          return "adr";
50      }
51  
52      @Override
53      protected void resetExtractor() {
54          // Empty.
55      }
56  
57      protected boolean extractEntity(Node node, ExtractionResult out) {
58          if (null == node) return false;
59          //try lat & lon
60          final HTMLDocumentTMLDocument.html#HTMLDocument">HTMLDocument document = new HTMLDocument(node);
61          BNode adr = getBlankNodeFor(node);
62          out.writeTriple(adr, RDF.TYPE, vVCARD.Address);
63          final String extractorName = getDescription().getExtractorName();
64          for (String field : addressFields) {
65              HTMLDocument.TextField[] values = document.getPluralTextField(field);
66              for (HTMLDocument.TextField val : values) {
67                  conditionallyAddStringProperty(
68                          val.source(),
69                          adr, vVCARD.getProperty(field), val.value()
70                  );
71              }
72          }
73          HTMLDocument.TextField[] types = document.getPluralTextField("type");
74          for (HTMLDocument.TextField val : types) {
75              conditionallyAddStringProperty(
76                      val.source(),
77                      adr, vVCARD.addressType, val.value()
78              );
79          }
80  
81          final TagSoupExtractionResultorg/apache/any23/extractor/TagSoupExtractionResult.html#TagSoupExtractionResult">TagSoupExtractionResult tser = (TagSoupExtractionResult) getCurrentExtractionResult();
82          tser.addResourceRoot( document.getPathToLocalRoot(), adr, this.getClass() );
83  
84          return true;
85      }
86  
87      @Override
88      public ExtractorDescription getDescription() {
89          return AdrExtractorFactory.getDescriptionInstance();
90      }
91  
92  }