View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.extractor.ExtractionResult;
22  import org.apache.any23.extractor.ExtractorDescription;
23  import org.apache.any23.extractor.TagSoupExtractionResult;
24  import org.apache.any23.vocab.DCTerms;
25  import org.apache.any23.vocab.Review;
26  import org.apache.any23.vocab.VCard;
27  import org.eclipse.rdf4j.model.BNode;
28  import org.eclipse.rdf4j.model.Resource;
29  import org.eclipse.rdf4j.model.vocabulary.RDF;
30  import org.w3c.dom.Node;
31  
32  import java.util.List;
33  
34  import static org.apache.any23.extractor.html.HTMLDocument.TextField;
35  
36  /**
37   * Extractor for the <a href="http://microformats.org/wiki/hreview">hReview</a> microformat.
38   *
39   * @author Gabriele Renzi
40   */
41  public class HReviewExtractor extends EntityBasedMicroformatExtractor {
42  
43      private static final Review vREVIEW = Review.getInstance();
44      private static final VCard vVCARD = VCard.getInstance();
45      private static final DCTerms vDCTERMS = DCTerms.getInstance();
46  
47      @Override
48      public ExtractorDescription getDescription() {
49          return HReviewExtractorFactory.getDescriptionInstance();
50      }
51  
52      @Override
53      protected String getBaseClassName() {
54          return "hreview";
55      }
56  
57      @Override
58      protected void resetExtractor() {
59          // Empty.
60      }
61  
62      @Override
63      protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
64          BNode rev = getBlankNodeFor(node);
65          out.writeTriple(rev, RDF.TYPE, vREVIEW.Review);
66          final HTMLDocumentTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(node);
67          addRating(fragment, rev);
68          addSummary(fragment, rev);
69          addTime(fragment, rev);
70          addType(fragment, rev);
71          addDescription(fragment, rev);
72          addItem(fragment, rev);
73          addReviewer(fragment, rev);
74  
75          final TagSoupExtractionResultorg/apache/any23/extractor/TagSoupExtractionResult.html#TagSoupExtractionResult">TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
76          tser.addResourceRoot(DomUtils.getXPathListForNode(node), rev, this.getClass());
77  
78          return true;
79      }
80  
81      private void addType(HTMLDocument doc, Resource rev) {
82          TextField value = doc.getSingularTextField("type");
83          conditionallyAddStringProperty(value.source(), rev, vREVIEW.type, value.value());
84      }
85  
86      private void addReviewer(HTMLDocument doc, Resource rev) {
87          List<Node> nodes = doc.findAllByClassName("reviewer");
88          if (nodes.size() > 0) {
89              Node node0 = nodes.get(0);
90              addBNodeProperty(node0, rev, vREVIEW.reviewer, getBlankNodeFor(node0));
91          }
92      }
93  
94      private void addItem(HTMLDocument root, BNode rev) throws ExtractionException {
95          List<Node> nodes = root.findAllByClassName("item");
96          for (Node node : nodes) {
97              Resource item = findDummy(new HTMLDocument(node));
98              addBNodeProperty(node, item, vREVIEW.hasReview, rev);
99          }
100     }
101 
102     private Resource findDummy(HTMLDocument item) throws ExtractionException {
103         Resource blank = getBlankNodeFor(item.getDocument());
104         TextField val = item.getSingularTextField("fn");
105         conditionallyAddStringProperty(val.source(), blank, vVCARD.fn, val.value());
106         final TextField url = item.getSingularUrlField("url");
107         conditionallyAddResourceProperty(blank, vVCARD.url, getHTMLDocument().resolveIRI(url.value()));
108         TextField pics[] = item.getPluralUrlField("photo");
109         for (TextField pic : pics) {
110             addIRIProperty(blank, vVCARD.photo, getHTMLDocument().resolveIRI(pic.value()));
111         }
112         return blank;
113     }
114 
115     private void addRating(HTMLDocument doc, Resource rev) {
116         HTMLDocument.TextField value = doc.getSingularTextField("rating");
117         conditionallyAddStringProperty(value.source(), rev, vREVIEW.rating, value.value());
118     }
119 
120     private void addSummary(HTMLDocument doc, Resource rev) {
121         TextField value = doc.getSingularTextField("summary");
122         conditionallyAddStringProperty(value.source(), rev, vREVIEW.title, value.value());
123     }
124 
125     private void addTime(HTMLDocument doc, Resource rev) {
126         TextField value = doc.getSingularTextField("dtreviewed");
127         conditionallyAddStringProperty(value.source(), rev, vDCTERMS.date, value.value());
128     }
129 
130     private void addDescription(HTMLDocument doc, Resource rev) {
131         TextField value = doc.getSingularTextField("description");
132         conditionallyAddStringProperty(value.source(), rev, vREVIEW.text, value.value());
133     }
134 
135 }