View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.extractor.ExtractionResult;
22  import org.apache.any23.extractor.ExtractorDescription;
23  import org.apache.any23.extractor.ExtractorFactory;
24  import org.apache.any23.extractor.SimpleExtractorFactory;
25  import org.apache.any23.extractor.TagSoupExtractionResult;
26  import org.apache.any23.rdf.PopularPrefixes;
27  import org.apache.any23.vocab.DCTERMS;
28  import org.apache.any23.vocab.REVIEW;
29  import org.apache.any23.vocab.VCARD;
30  import org.openrdf.model.BNode;
31  import org.openrdf.model.Resource;
32  import org.openrdf.model.vocabulary.RDF;
33  import org.w3c.dom.Node;
34  
35  import java.util.Arrays;
36  import java.util.List;
37  
38  import static org.apache.any23.extractor.html.HTMLDocument.TextField;
39  
40  /**
41   * Extractor for the <a href="http://microformats.org/wiki/hreview">hReview</a>
42   * microformat.
43   *
44   * @author Gabriele Renzi
45   */
46  public class HReviewExtractor extends EntityBasedMicroformatExtractor {
47  
48      private static final REVIEW  vREVIEW  = REVIEW.getInstance();
49      private static final VCARD   vVCARD   = VCARD.getInstance();
50      private static final DCTERMS vDCTERMS = DCTERMS.getInstance();
51  
52      public final static ExtractorFactory<HReviewExtractor> factory =
53              SimpleExtractorFactory.create(
54                      "html-mf-hreview",
55                      PopularPrefixes.createSubset("rdf", "vcard", "rev"),
56                      Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
57                      "example-mf-hreview.html",
58                      HReviewExtractor.class
59              );
60  
61      public ExtractorDescription getDescription() {
62          return factory;
63      }
64  
65      protected String getBaseClassName() {
66          return "hreview";
67      }
68  
69      @Override
70      protected void resetExtractor() {
71          // Empty.
72      }
73  
74      protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
75          BNode rev = getBlankNodeFor(node);
76          out.writeTriple(rev, RDF.TYPE, vREVIEW.Review);
77          final HTMLDocument fragment = new HTMLDocument(node);
78          addRating(fragment, rev);
79          addSummary(fragment, rev);
80          addTime(fragment, rev);
81          addType(fragment, rev);
82          addDescription(fragment, rev);
83          addItem(fragment, rev);
84          addReviewer(fragment, rev);
85  
86          final TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
87          tser.addResourceRoot(
88                  DomUtils.getXPathListForNode(node),
89                  rev,
90                  this.getClass()
91          );
92  
93          return true;
94      }
95  
96      private void addType(HTMLDocument doc, Resource rev) {
97          TextField value = doc.getSingularTextField("type");
98          conditionallyAddStringProperty(
99                  value.source(),
100                 rev, vREVIEW.type, value.value()
101         );
102     }
103 
104     private void addReviewer(HTMLDocument doc, Resource rev) {
105         List<Node> nodes = doc.findAllByClassName("reviewer");
106         if (nodes.size() > 0) {
107             Node node0 = nodes.get(0);
108             addBNodeProperty(
109                     node0,
110                     rev, vREVIEW.reviewer, getBlankNodeFor(node0)
111             );
112         }
113     }
114 
115     private void addItem(HTMLDocument root, BNode rev) throws ExtractionException {
116         List<Node> nodes = root.findAllByClassName("item");
117         for (Node node : nodes) {
118             Resource item = findDummy(new HTMLDocument(node));
119             addBNodeProperty(
120                     node,
121                     item, vREVIEW.hasReview, rev
122             );
123         }
124     }
125 
126     private Resource findDummy(HTMLDocument item) throws ExtractionException {
127         Resource blank = getBlankNodeFor(item.getDocument());
128         TextField val = item.getSingularTextField("fn");
129         conditionallyAddStringProperty(
130                 val.source(),
131                 blank, vVCARD.fn, val.value()
132         );
133         final TextField url = item.getSingularUrlField("url");
134         conditionallyAddResourceProperty(blank, vVCARD.url, getHTMLDocument().resolveURI(url.value()));
135         TextField pics[] = item.getPluralUrlField("photo");
136         for (TextField pic : pics) {
137             addURIProperty(blank, vVCARD.photo, getHTMLDocument().resolveURI(pic.value()));
138         }
139         return blank;
140     }
141 
142     private void addRating(HTMLDocument doc, Resource rev) {
143         HTMLDocument.TextField value = doc.getSingularTextField("rating");
144         conditionallyAddStringProperty(
145                 value.source(), rev, vREVIEW.rating, value.value()
146         );
147     }
148 
149     private void addSummary(HTMLDocument doc, Resource rev) {
150         TextField value = doc.getSingularTextField("summary");
151         conditionallyAddStringProperty(
152                 value.source(),
153                 rev, vREVIEW.title, value.value()
154         );
155     }
156 
157     private void addTime(HTMLDocument doc, Resource rev) {
158         TextField value = doc.getSingularTextField("dtreviewed");
159         conditionallyAddStringProperty(
160                 value.source(),
161                 rev, vDCTERMS.date, value.value()
162         );
163     }
164 
165     private void addDescription(HTMLDocument doc, Resource rev) {
166         TextField value = doc.getSingularTextField("description");
167         conditionallyAddStringProperty(
168                 value.source(),
169                 rev, vREVIEW.text, value.value()
170         );
171     }
172 
173 }