View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionResult;
21  import org.apache.any23.extractor.ExtractorDescription;
22  import org.apache.any23.extractor.ExtractorFactory;
23  import org.apache.any23.extractor.SimpleExtractorFactory;
24  import org.apache.any23.extractor.TagSoupExtractionResult;
25  import org.apache.any23.rdf.PopularPrefixes;
26  import org.apache.any23.vocab.VCARD;
27  import org.openrdf.model.BNode;
28  import org.openrdf.model.vocabulary.RDF;
29  import org.w3c.dom.Node;
30  
31  import java.util.Arrays;
32  
33  /**
34   * Extractor for the <a href="http://microformats.org/wiki/adr">adr</a>
35   * microformat.
36   *
37   * @author Gabriele Renzi
38   */
39  public class AdrExtractor extends EntityBasedMicroformatExtractor {
40  
41      private static final VCARD vVCARD = VCARD.getInstance();
42  
43      private static final String[] addressFields = {
44              "post-office-box",
45              "extended-address",
46              "street-address",
47              "locality",
48              "region",
49              "country-name",
50              "postal-code"
51      };
52  
53      protected String getBaseClassName() {
54          return "adr";
55      }
56  
57      @Override
58      protected void resetExtractor() {
59          // Empty.
60      }
61  
62      protected boolean extractEntity(Node node, ExtractionResult out) {
63          if (null == node) return false;
64          //try lat & lon
65          final HTMLDocument document = new HTMLDocument(node);
66          BNode adr = getBlankNodeFor(node);
67          out.writeTriple(adr, RDF.TYPE, vVCARD.Address);
68          final String extractorName = getDescription().getExtractorName();
69          for (String field : addressFields) {
70              HTMLDocument.TextField[] values = document.getPluralTextField(field);
71              for (HTMLDocument.TextField val : values) {
72                  conditionallyAddStringProperty(
73                          val.source(),
74                          adr, vVCARD.getProperty(field), val.value()
75                  );
76              }
77          }
78          HTMLDocument.TextField[] types = document.getPluralTextField("type");
79          for (HTMLDocument.TextField val : types) {
80              conditionallyAddStringProperty(
81                      val.source(),
82                      adr, vVCARD.addressType, val.value()
83              );
84          }
85  
86          final TagSoupExtractionResult tser = (TagSoupExtractionResult) getCurrentExtractionResult();
87          tser.addResourceRoot( document.getPathToLocalRoot(), adr, this.getClass() );
88  
89          return true;
90      }
91  
92      public ExtractorDescription getDescription() {
93          return factory;
94      }
95  
96      public final static ExtractorFactory<AdrExtractor> factory =
97              SimpleExtractorFactory.create(
98                      "html-mf-adr",
99                      PopularPrefixes.createSubset("rdf", "vcard"),
100                     Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
101                     "example-mf-adr.html",
102                     AdrExtractor.class
103             );
104 }