/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17  
18  package org.apache.any23.extractor.html.microformats2;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.extractor.ExtractionResult;
22  import org.apache.any23.extractor.ExtractorDescription;
23  import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor;
24  import org.apache.any23.extractor.html.HTMLDocument;
25  import org.apache.any23.vocab.HProduct;
26  import org.eclipse.rdf4j.model.BNode;
27  import org.eclipse.rdf4j.model.Resource;
28  import org.eclipse.rdf4j.model.IRI;
29  import org.eclipse.rdf4j.model.vocabulary.RDF;
30  import org.w3c.dom.Node;
31  
32  import java.util.List;
33  
34  /**
35   * Extractor for the <a href="http://microformats.org/wiki/h-product">h-product</a> microformat.
36   *
37   * @author Nisala Nirmana
38   */
39  public class HProductExtractor extends EntityBasedMicroformatExtractor {
40  
41      private static final HProduct vProduct = HProduct.getInstance();
42  
43      private static final String[] productFields = { "name", "photo", "brand", "category", "description", "url",
44              "identifier", "review", // toDo
45              "price" };
46  
47      @Override
48      public ExtractorDescription getDescription() {
49          return HProductExtractorFactory.getDescriptionInstance();
50      }
51  
52      @Override
53      protected String getBaseClassName() {
54          return Microformats2Prefixes.CLASS_PREFIX + "product";
55      }
56  
57      @Override
58      protected void resetExtractor() {
59          // Empty.
60      }
61  
62      @Override
63      protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
64          final BNode product = getBlankNodeFor(node);
65          conditionallyAddResourceProperty(product, RDF.TYPE, vProduct.product);
66          final HTMLDocumentl/HTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(node);
67          addName(fragment, product);
68          addPhoto(fragment, product);
69          addCategories(fragment, product);
70          addDescription(fragment, product);
71          addURLs(fragment, product);
72          addIdentifiers(fragment, product);
73          addPrice(fragment, product);
74          addBrand(fragment, product);
75          return true;
76      }
77  
78      private void mapFieldWithProperty(HTMLDocument fragment, BNode product, String fieldClass, IRI property) {
79          HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass);
80          conditionallyAddStringProperty(title.source(), product, property, title.value());
81      }
82  
83      private void addName(HTMLDocument fragment, BNode product) {
84          mapFieldWithProperty(fragment, product, Microformats2Prefixes.PROPERTY_PREFIX + productFields[0],
85                  vProduct.name);
86      }
87  
88      private void addPhoto(HTMLDocument fragment, BNode product) throws ExtractionException {
89          final HTMLDocument.TextField[] photos = fragment
90                  .getPluralUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX + productFields[1]);
91          for (HTMLDocument.TextField photo : photos) {
92              addIRIProperty(product, vProduct.photo, fragment.resolveIRI(photo.value()));
93          }
94      }
95  
96      private void addCategories(HTMLDocument fragment, BNode product) {
97          final HTMLDocument.TextField[] categories = fragment
98                  .getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX + productFields[3]);
99          for (HTMLDocument.TextField category : categories) {
100             conditionallyAddStringProperty(category.source(), product, vProduct.category, category.value());
101         }
102     }
103 
104     private void addDescription(HTMLDocument fragment, BNode product) {
105         mapFieldWithProperty(fragment, product, Microformats2Prefixes.EMBEDDED_PROPERTY_PREFIX + productFields[4],
106                 vProduct.description);
107     }
108 
109     private void addURLs(HTMLDocument fragment, BNode product) throws ExtractionException {
110         final HTMLDocument.TextField[] urls = fragment
111                 .getPluralUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX + productFields[5]);
112         for (HTMLDocument.TextField url : urls) {
113             addIRIProperty(product, vProduct.url, fragment.resolveIRI(url.value()));
114         }
115     }
116 
117     private void addIdentifiers(HTMLDocument fragment, BNode product) throws ExtractionException {
118         final HTMLDocument.TextField[] identifiers = fragment
119                 .getPluralUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX + productFields[6]);
120         for (HTMLDocument.TextField identifier : identifiers) {
121             addIRIProperty(product, vProduct.identifier, fragment.resolveIRI(identifier.value()));
122         }
123     }
124 
125     private void addPrice(HTMLDocument fragment, BNode product) {
126         final HTMLDocument.TextField price = fragment
127                 .getSingularTextField(Microformats2Prefixes.PROPERTY_PREFIX + productFields[8]);
128         if (price.source() == null)
129             return;
130         Node attribute = price.source().getAttributes().getNamedItem("value");
131         if (attribute == null) {
132             conditionallyAddStringProperty(price.source(), product, vProduct.price, price.value());
133         } else {
134             conditionallyAddStringProperty(price.source(), product, vProduct.price, attribute.getNodeValue());
135         }
136     }
137 
138     private void addBrand(HTMLDocument doc, Resource product) throws ExtractionException {
139         List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + productFields[2]
140                 + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "card");
141         if (nodes.isEmpty())
142             return;
143         HCardExtractorFactoryormats2/HCardExtractorFactory.html#HCardExtractorFactory">HCardExtractorFactory factory = new HCardExtractorFactory();
144         HCardExtractor extractor = factory.createExtractor();
145         for (Node node : nodes) {
146             BNode brand = valueFactory.createBNode();
147             addIRIProperty(brand, RDF.TYPE, vProduct.brand);
148             extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), brand, getCurrentExtractionResult());
149         }
150     }
151 }