View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.extractor.ExtractionResult;
22  import org.apache.any23.extractor.ExtractorDescription;
23  import org.apache.any23.vocab.HRecipe;
24  import org.eclipse.rdf4j.model.BNode;
25  import org.eclipse.rdf4j.model.IRI;
26  import org.eclipse.rdf4j.model.vocabulary.RDF;
27  import org.w3c.dom.Node;
28  
29  /**
30   * Extractor for the <a href="http://microformats.org/wiki/hrecipe">hRecipe</a>
31   * microformat.
32   *
33   * @author Michele Mostarda (mostarda@fbk.eu)
34   */
35  public class HRecipeExtractor extends EntityBasedMicroformatExtractor {
36  
37      private static final HRecipe vHRECIPE = HRecipe.getInstance();
38  
39      @Override
40      public ExtractorDescription getDescription() {
41          return HRecipeExtractorFactory.getDescriptionInstance();
42      }
43  
44      @Override
45      protected String getBaseClassName() {
46          return "hrecipe";
47      }
48  
49      @Override
50      protected void resetExtractor() {
51          // Empty.
52      }
53  
54      @Override
55      protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
56          final BNode recipe = getBlankNodeFor(node);
57          conditionallyAddResourceProperty(recipe, RDF.TYPE, vHRECIPE.Recipe);
58          final HTMLDocumentTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(node);
59          addFN(fragment, recipe);
60          addIngredients(fragment, recipe);
61          addYield(fragment, recipe);
62          addInstructions(fragment, recipe);
63          addDurations(fragment, recipe);
64          addPhoto(fragment, recipe);
65          addSummary(fragment, recipe);
66          addAuthors(fragment, recipe);
67          addPublished(fragment, recipe);
68          addNutritions(fragment, recipe);
69          addTags(fragment, recipe);
70          return true;
71      }
72  
73      /**
74       * Maps a field text with a property.
75       *
76       * @param fragment
77       * @param recipe
78       * @param fieldClass
79       * @param property
80       */
81      private void mapFieldWithProperty(HTMLDocument fragment, BNode recipe, String fieldClass, IRI property) {
82          HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass);
83          conditionallyAddStringProperty(
84                  title.source(), recipe, property, title.value()
85          );
86      }
87  
88      /**
89       * Adds the <code>fn</code> triple.
90       *
91       * @param fragment
92       * @param recipe
93       */
94      private void addFN(HTMLDocument fragment, BNode recipe) {
95          mapFieldWithProperty(fragment, recipe, "fn", vHRECIPE.fn);
96      }
97  
98      /**
99       * Adds the <code>ingredient</code> triples.
100      *
101      * @param fragment
102      * @param ingredient
103      * @return
104      */
105     private BNode addIngredient(HTMLDocument fragment,  HTMLDocument.TextField ingredient) {
106         final BNode ingredientBnode = getBlankNodeFor(ingredient.source());
107         addIRIProperty(ingredientBnode, RDF.TYPE, vHRECIPE.Ingredient);
108         conditionallyAddStringProperty(
109                 ingredient.source(),
110                 ingredientBnode,
111                 vHRECIPE.ingredientName,
112                 HTMLDocument.readNodeContent(ingredient.source(), true)
113         );
114         mapFieldWithProperty(fragment, ingredientBnode, "value", vHRECIPE.ingredientQuantity);
115         mapFieldWithProperty(fragment, ingredientBnode, "type" , vHRECIPE.ingredientQuantityType);
116         return ingredientBnode;
117     }
118 
119     /**
120      * Adds the <code>ingredients</code>list triples.
121      *
122      * @param fragment
123      * @param recipe
124      * @return
125      */
126     private void addIngredients(HTMLDocument fragment, BNode recipe) {
127         final HTMLDocument.TextField[] ingredients = fragment.getPluralTextField("ingredient");
128         for(HTMLDocument.TextField ingredient : ingredients) {
129             addBNodeProperty(recipe, vHRECIPE.ingredient, addIngredient(fragment, ingredient));
130         }
131     }
132 
133     /**
134      * Adds the <code>instruction</code> triples.
135      *
136      * @param fragment
137      * @param recipe
138      */
139     private void addInstructions(HTMLDocument fragment, BNode recipe) {
140         mapFieldWithProperty(fragment, recipe, "instructions", vHRECIPE.instructions);
141 
142     }
143 
144     /**
145      * Adds the <code>yield</code> triples.
146      *
147      * @param fragment
148      * @param recipe
149      */
150     private void addYield(HTMLDocument fragment, BNode recipe) {
151         mapFieldWithProperty(fragment, recipe, "yield", vHRECIPE.yield);
152     }
153 
154     /**
155      * Adds the <code>duration</code> triples.
156      *
157      * @param fragment
158      * @param duration
159      * @return
160      */
161     //TODO: USE http://microformats.org/wiki/value-class-pattern to read correct date format.
162     private BNode addDuration(HTMLDocument fragment, HTMLDocument.TextField duration) {
163         final BNode durationBnode = getBlankNodeFor(duration.source());
164         addIRIProperty(durationBnode, RDF.TYPE, vHRECIPE.Duration);
165         conditionallyAddStringProperty(
166                 duration.source(),
167                 durationBnode, vHRECIPE.durationTime, duration.value()
168         );
169         mapFieldWithProperty(fragment, durationBnode, "value-title", vHRECIPE.durationTitle);
170         return durationBnode;
171     }
172 
173     /**
174      * Adds the <code>yield</code> triples.
175      *
176      * @param fragment
177      * @param recipe
178      */
179     private void addDurations(HTMLDocument fragment, BNode recipe) {
180       final HTMLDocument.TextField[] durations = fragment.getPluralTextField("duration");
181         for(HTMLDocument.TextField duration : durations) {
182             addBNodeProperty(recipe, vHRECIPE.duration, addDuration(fragment, duration));
183         }
184     }
185 
186     /**
187      * Adds the <code>photo</code> triples.
188      *
189      * @param fragment
190      * @param recipe
191      * @throws ExtractionException
192      */
193     private void addPhoto(HTMLDocument fragment, BNode recipe) throws ExtractionException {
194         final HTMLDocument.TextField[] photos = fragment.getPluralUrlField("photo");
195         for(HTMLDocument.TextField photo : photos) {
196             addIRIProperty(recipe, vHRECIPE.photo, fragment.resolveIRI(photo.value()));
197         }
198     }
199 
200     /**
201      * Adds the <code>summary</code> triples.
202      *
203      * @param fragment
204      * @param recipe
205      */
206     private void addSummary(HTMLDocument fragment, BNode recipe) {
207         mapFieldWithProperty(fragment, recipe, "summary", vHRECIPE.summary);
208     }
209 
210     /**
211      * Adds the <code>authors</code> triples.
212      *
213      * @param fragment
214      * @param recipe
215      */
216     private void addAuthors(HTMLDocument fragment, BNode recipe) {
217         final HTMLDocument.TextField[] authors = fragment.getPluralTextField("author");
218          for(HTMLDocument.TextField author : authors) {
219              conditionallyAddStringProperty(
220                     author.source(),
221                     recipe, vHRECIPE.author, author.value()
222               );
223         }
224     }
225 
226     /**
227      * Adds the <code>published</code> triples.
228      *
229      * @param fragment
230      * @param recipe
231      */
232     //TODO: USE http://microformats.org/wiki/value-class-pattern to read correct date format.
233     private void addPublished(HTMLDocument fragment, BNode recipe) {
234         mapFieldWithProperty(fragment, recipe, "published", vHRECIPE.published);
235     }
236 
237     /**
238      * Adds the <code>nutrition</code> triples.
239      *
240      * @param fragment
241      * @param nutrition
242      * @return
243      */
244     private BNode addNutrition(HTMLDocument fragment, HTMLDocument.TextField nutrition) {
245         final BNode nutritionBnode = getBlankNodeFor(nutrition.source());
246         addIRIProperty(nutritionBnode, RDF.TYPE, vHRECIPE.Nutrition);
247         conditionallyAddStringProperty(
248                 nutrition.source(),
249                 nutritionBnode, vHRECIPE.nutritionValue, nutrition.value()
250         );
251         mapFieldWithProperty(fragment, nutritionBnode, "value", vHRECIPE.nutritionValue);
252         mapFieldWithProperty(fragment, nutritionBnode, "type" , vHRECIPE.nutritionValueType);
253         return nutritionBnode;
254     }
255 
256     /**
257      * Adds the <code>nutritions</code> triples.
258      *
259      * @param fragment
260      * @param recipe
261      */
262     private void addNutritions(HTMLDocument fragment, BNode recipe) {
263         HTMLDocument.TextField[] nutritions = fragment.getPluralTextField("nutrition");
264         for (HTMLDocument.TextField nutrition : nutritions) {
265             addBNodeProperty(recipe, vHRECIPE.nutrition, addNutrition(fragment, nutrition));
266         }
267     }
268 
269     /**
270      * Adds the <code>tags</code> triples.
271      *
272      * @param fragment
273      * @param recipe
274      */
275     private void addTags(HTMLDocument fragment, BNode recipe) {
276         HTMLDocument.TextField[] tags = fragment.extractRelTagNodes();
277         for(HTMLDocument.TextField tag : tags) {
278             conditionallyAddStringProperty(
279                     tag.source(),
280                     recipe, vHRECIPE.tag, tag.value()
281               );
282         }
283     }
284 
285 }