View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html.microformats2;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.extractor.ExtractionResult;
22  import org.apache.any23.extractor.ExtractorDescription;
23  import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor;
24  import org.apache.any23.extractor.html.HTMLDocument;
25  import org.apache.any23.vocab.HProduct;
26  import org.eclipse.rdf4j.model.BNode;
27  import org.eclipse.rdf4j.model.Resource;
28  import org.eclipse.rdf4j.model.IRI;
29  import org.eclipse.rdf4j.model.vocabulary.RDF;
30  import org.w3c.dom.Node;
31  
32  import java.util.List;
33  
34  /**
35   * Extractor for the <a href="http://microformats.org/wiki/h-product">h-product</a>
36   * microformat.
37   *
38   * @author Nisala Nirmana
39   */
40  public class HProductExtractor extends EntityBasedMicroformatExtractor {
41  
42      private static final HProduct vProduct = HProduct.getInstance();
43  
44      private static final String[] productFields = {
45              "name",
46              "photo",
47              "brand",
48              "category",
49              "description",
50              "url",
51              "identifier",
52              "review", //toDo
53              "price"
54      };
55  
56      @Override
57      public ExtractorDescription getDescription() {
58          return HProductExtractorFactory.getDescriptionInstance();
59      }
60  
61      @Override
62      protected String getBaseClassName() {
63          return Microformats2Prefixes.CLASS_PREFIX+"product";
64      }
65  
66      @Override
67      protected void resetExtractor() {
68          // Empty.
69      }
70  
71      @Override
72      protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
73          final BNode product = getBlankNodeFor(node);
74          conditionallyAddResourceProperty(product, RDF.TYPE, vProduct.product);
75          final HTMLDocumentl/HTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(node);
76          addName(fragment, product);
77          addPhoto(fragment, product);
78          addCategories(fragment, product);
79          addDescription(fragment, product);
80          addURLs(fragment, product);
81          addIdentifiers(fragment, product);
82          addPrice(fragment, product);
83          addBrand(fragment,product);
84          return true;
85      }
86  
87      private void mapFieldWithProperty(HTMLDocument fragment, BNode product, String fieldClass,
88                                        IRI property) {
89          HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass);
90          conditionallyAddStringProperty(
91                  title.source(), product, property, title.value()
92          );
93      }
94  
95      private void addName(HTMLDocument fragment, BNode product) {
96          mapFieldWithProperty(fragment, product, Microformats2Prefixes.PROPERTY_PREFIX +
97                  productFields[0], vProduct.name);
98      }
99  
100     private void addPhoto(HTMLDocument fragment, BNode product) throws ExtractionException {
101         final HTMLDocument.TextField[] photos = fragment.getPluralUrlField
102                 (Microformats2Prefixes.URL_PROPERTY_PREFIX + productFields[1]);
103         for(HTMLDocument.TextField photo : photos) {
104             addIRIProperty(product, vProduct.photo, fragment.resolveIRI(photo.value()));
105         }
106     }
107 
108     private void addCategories(HTMLDocument fragment, BNode product) {
109         final HTMLDocument.TextField[] categories = fragment.getPluralTextField
110                 (Microformats2Prefixes.PROPERTY_PREFIX + productFields[3]);
111         for(HTMLDocument.TextField category : categories) {
112             conditionallyAddStringProperty(
113                     category.source(), product, vProduct.category, category.value()
114             );
115         }
116     }
117 
118     private void addDescription(HTMLDocument fragment, BNode product) {
119         mapFieldWithProperty(fragment, product, Microformats2Prefixes.EMBEDDED_PROPERTY_PREFIX +
120                 productFields[4], vProduct.description);
121     }
122 
123     private void addURLs(HTMLDocument fragment, BNode product) throws ExtractionException {
124         final HTMLDocument.TextField[] urls = fragment.getPluralUrlField
125                 (Microformats2Prefixes.URL_PROPERTY_PREFIX + productFields[5]);
126         for(HTMLDocument.TextField url : urls) {
127             addIRIProperty(product, vProduct.url, fragment.resolveIRI(url.value()));
128         }
129     }
130 
131     private void addIdentifiers(HTMLDocument fragment, BNode product) throws ExtractionException {
132         final HTMLDocument.TextField[] identifiers = fragment.getPluralUrlField
133                 (Microformats2Prefixes.URL_PROPERTY_PREFIX + productFields[6]);
134         for(HTMLDocument.TextField identifier :identifiers) {
135             addIRIProperty(product, vProduct.identifier, fragment.resolveIRI(identifier.value()));
136         }
137     }
138 
139     private void addPrice(HTMLDocument fragment, BNode product) {
140         final HTMLDocument.TextField price = fragment.getSingularTextField(
141                 Microformats2Prefixes.PROPERTY_PREFIX + productFields[8]);
142         if(price.source()==null)
143             return;
144         Node attribute = price.source().getAttributes().getNamedItem("value");
145         if (attribute == null) {
146             conditionallyAddStringProperty(
147                     price.source(),
148                     product, vProduct.price, price.value()
149             );
150         } else {
151             conditionallyAddStringProperty(
152                     price.source(),
153                     product, vProduct.price, attribute.getNodeValue()
154             );
155         }
156     }
157 
158     private void addBrand(HTMLDocument doc, Resource product) throws ExtractionException {
159         List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + productFields[2] +
160                 Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "card");
161         if (nodes.isEmpty())
162             return;
163         HCardExtractorFactoryormats2/HCardExtractorFactory.html#HCardExtractorFactory">HCardExtractorFactory factory = new HCardExtractorFactory();
164         HCardExtractor extractor = factory.createExtractor();
165         for (Node node : nodes) {
166             BNode brand = valueFactory.createBNode();
167             addIRIProperty(brand, RDF.TYPE, vProduct.brand);
168             extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), brand,
169                     getCurrentExtractionResult());
170         }
171     }
172 }