View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.extractor.ExtractionResult;
22  import org.apache.any23.extractor.ExtractorDescription;
23  import org.apache.any23.extractor.TagSoupExtractionResult;
24  import org.apache.any23.vocab.DCTerms;
25  import org.apache.any23.vocab.Review;
26  import org.apache.any23.vocab.VCard;
27  import org.eclipse.rdf4j.model.BNode;
28  import org.eclipse.rdf4j.model.Resource;
29  import org.eclipse.rdf4j.model.vocabulary.RDF;
30  import org.w3c.dom.Node;
31  
32  import java.util.List;
33  
34  import static org.apache.any23.extractor.html.HTMLDocument.TextField;
35  
36  /**
37   * Extractor for the <a href="http://microformats.org/wiki/hreview">hReview</a>
38   * microformat.
39   *
40   * @author Gabriele Renzi
41   */
42  public class HReviewExtractor extends EntityBasedMicroformatExtractor {
43  
44      private static final Review  vREVIEW  = Review.getInstance();
45      private static final VCard   vVCARD   = VCard.getInstance();
46      private static final DCTerms vDCTERMS = DCTerms.getInstance();
47  
48      @Override
49      public ExtractorDescription getDescription() {
50          return HReviewExtractorFactory.getDescriptionInstance();
51      }
52  
53      @Override
54      protected String getBaseClassName() {
55          return "hreview";
56      }
57  
58      @Override
59      protected void resetExtractor() {
60          // Empty.
61      }
62  
63      @Override
64      protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
65          BNode rev = getBlankNodeFor(node);
66          out.writeTriple(rev, RDF.TYPE, vREVIEW.Review);
67          final HTMLDocumentTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(node);
68          addRating(fragment, rev);
69          addSummary(fragment, rev);
70          addTime(fragment, rev);
71          addType(fragment, rev);
72          addDescription(fragment, rev);
73          addItem(fragment, rev);
74          addReviewer(fragment, rev);
75  
76          final TagSoupExtractionResultorg/apache/any23/extractor/TagSoupExtractionResult.html#TagSoupExtractionResult">TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
77          tser.addResourceRoot(
78                  DomUtils.getXPathListForNode(node),
79                  rev,
80                  this.getClass()
81          );
82  
83          return true;
84      }
85  
86      private void addType(HTMLDocument doc, Resource rev) {
87          TextField value = doc.getSingularTextField("type");
88          conditionallyAddStringProperty(
89                  value.source(),
90                  rev, vREVIEW.type, value.value()
91          );
92      }
93  
94      private void addReviewer(HTMLDocument doc, Resource rev) {
95          List<Node> nodes = doc.findAllByClassName("reviewer");
96          if (nodes.size() > 0) {
97              Node node0 = nodes.get(0);
98              addBNodeProperty(
99                      node0,
100                     rev, vREVIEW.reviewer, getBlankNodeFor(node0)
101             );
102         }
103     }
104 
105     private void addItem(HTMLDocument root, BNode rev) throws ExtractionException {
106         List<Node> nodes = root.findAllByClassName("item");
107         for (Node node : nodes) {
108             Resource item = findDummy(new HTMLDocument(node));
109             addBNodeProperty(
110                     node,
111                     item, vREVIEW.hasReview, rev
112             );
113         }
114     }
115 
116     private Resource findDummy(HTMLDocument item) throws ExtractionException {
117         Resource blank = getBlankNodeFor(item.getDocument());
118         TextField val = item.getSingularTextField("fn");
119         conditionallyAddStringProperty(
120                 val.source(),
121                 blank, vVCARD.fn, val.value()
122         );
123         final TextField url = item.getSingularUrlField("url");
124         conditionallyAddResourceProperty(blank, vVCARD.url, getHTMLDocument().resolveIRI(url.value()));
125         TextField pics[] = item.getPluralUrlField("photo");
126         for (TextField pic : pics) {
127             addIRIProperty(blank, vVCARD.photo, getHTMLDocument().resolveIRI(pic.value()));
128         }
129         return blank;
130     }
131 
132     private void addRating(HTMLDocument doc, Resource rev) {
133         HTMLDocument.TextField value = doc.getSingularTextField("rating");
134         conditionallyAddStringProperty(
135                 value.source(), rev, vREVIEW.rating, value.value()
136         );
137     }
138 
139     private void addSummary(HTMLDocument doc, Resource rev) {
140         TextField value = doc.getSingularTextField("summary");
141         conditionallyAddStringProperty(
142                 value.source(),
143                 rev, vREVIEW.title, value.value()
144         );
145     }
146 
147     private void addTime(HTMLDocument doc, Resource rev) {
148         TextField value = doc.getSingularTextField("dtreviewed");
149         conditionallyAddStringProperty(
150                 value.source(),
151                 rev, vDCTERMS.date, value.value()
152         );
153     }
154 
155     private void addDescription(HTMLDocument doc, Resource rev) {
156         TextField value = doc.getSingularTextField("description");
157         conditionallyAddStringProperty(
158                 value.source(),
159                 rev, vREVIEW.text, value.value()
160         );
161     }
162 
163 }