View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.extractor.ExtractionResult;
22  import org.apache.any23.extractor.ExtractorDescription;
23  import org.apache.any23.vocab.HRecipe;
24  import org.eclipse.rdf4j.model.BNode;
25  import org.eclipse.rdf4j.model.IRI;
26  import org.eclipse.rdf4j.model.vocabulary.RDF;
27  import org.w3c.dom.Node;
28  
29  /**
30   * Extractor for the <a href="http://microformats.org/wiki/hrecipe">hRecipe</a> microformat.
31   *
32   * @author Michele Mostarda (mostarda@fbk.eu)
33   */
34  public class HRecipeExtractor extends EntityBasedMicroformatExtractor {
35  
36      private static final HRecipe vHRECIPE = HRecipe.getInstance();
37  
38      @Override
39      public ExtractorDescription getDescription() {
40          return HRecipeExtractorFactory.getDescriptionInstance();
41      }
42  
43      @Override
44      protected String getBaseClassName() {
45          return "hrecipe";
46      }
47  
48      @Override
49      protected void resetExtractor() {
50          // Empty.
51      }
52  
53      @Override
54      protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
55          final BNode recipe = getBlankNodeFor(node);
56          conditionallyAddResourceProperty(recipe, RDF.TYPE, vHRECIPE.Recipe);
57          final HTMLDocumentTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(node);
58          addFN(fragment, recipe);
59          addIngredients(fragment, recipe);
60          addYield(fragment, recipe);
61          addInstructions(fragment, recipe);
62          addDurations(fragment, recipe);
63          addPhoto(fragment, recipe);
64          addSummary(fragment, recipe);
65          addAuthors(fragment, recipe);
66          addPublished(fragment, recipe);
67          addNutritions(fragment, recipe);
68          addTags(fragment, recipe);
69          return true;
70      }
71  
72      /**
73       * Maps a field text with a property.
74       *
75       * @param fragment
76       * @param recipe
77       * @param fieldClass
78       * @param property
79       */
80      private void mapFieldWithProperty(HTMLDocument fragment, BNode recipe, String fieldClass, IRI property) {
81          HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass);
82          conditionallyAddStringProperty(title.source(), recipe, property, title.value());
83      }
84  
85      /**
86       * Adds the <code>fn</code> triple.
87       *
88       * @param fragment
89       * @param recipe
90       */
91      private void addFN(HTMLDocument fragment, BNode recipe) {
92          mapFieldWithProperty(fragment, recipe, "fn", vHRECIPE.fn);
93      }
94  
95      /**
96       * Adds the <code>ingredient</code> triples.
97       *
98       * @param fragment
99       * @param ingredient
100      * 
101      * @return
102      */
103     private BNode addIngredient(HTMLDocument fragment, HTMLDocument.TextField ingredient) {
104         final BNode ingredientBnode = getBlankNodeFor(ingredient.source());
105         addIRIProperty(ingredientBnode, RDF.TYPE, vHRECIPE.Ingredient);
106         conditionallyAddStringProperty(ingredient.source(), ingredientBnode, vHRECIPE.ingredientName,
107                 HTMLDocument.readNodeContent(ingredient.source(), true));
108         mapFieldWithProperty(fragment, ingredientBnode, "value", vHRECIPE.ingredientQuantity);
109         mapFieldWithProperty(fragment, ingredientBnode, "type", vHRECIPE.ingredientQuantityType);
110         return ingredientBnode;
111     }
112 
113     /**
114      * Adds the <code>ingredients</code>list triples.
115      *
116      * @param fragment
117      * @param recipe
118      * 
119      * @return
120      */
121     private void addIngredients(HTMLDocument fragment, BNode recipe) {
122         final HTMLDocument.TextField[] ingredients = fragment.getPluralTextField("ingredient");
123         for (HTMLDocument.TextField ingredient : ingredients) {
124             addBNodeProperty(recipe, vHRECIPE.ingredient, addIngredient(fragment, ingredient));
125         }
126     }
127 
128     /**
129      * Adds the <code>instruction</code> triples.
130      *
131      * @param fragment
132      * @param recipe
133      */
134     private void addInstructions(HTMLDocument fragment, BNode recipe) {
135         mapFieldWithProperty(fragment, recipe, "instructions", vHRECIPE.instructions);
136 
137     }
138 
139     /**
140      * Adds the <code>yield</code> triples.
141      *
142      * @param fragment
143      * @param recipe
144      */
145     private void addYield(HTMLDocument fragment, BNode recipe) {
146         mapFieldWithProperty(fragment, recipe, "yield", vHRECIPE.yield);
147     }
148 
149     /**
150      * Adds the <code>duration</code> triples.
151      *
152      * @param fragment
153      * @param duration
154      * 
155      * @return
156      */
157     // TODO: USE http://microformats.org/wiki/value-class-pattern to read correct date format.
158     private BNode addDuration(HTMLDocument fragment, HTMLDocument.TextField duration) {
159         final BNode durationBnode = getBlankNodeFor(duration.source());
160         addIRIProperty(durationBnode, RDF.TYPE, vHRECIPE.Duration);
161         conditionallyAddStringProperty(duration.source(), durationBnode, vHRECIPE.durationTime, duration.value());
162         mapFieldWithProperty(fragment, durationBnode, "value-title", vHRECIPE.durationTitle);
163         return durationBnode;
164     }
165 
166     /**
167      * Adds the <code>yield</code> triples.
168      *
169      * @param fragment
170      * @param recipe
171      */
172     private void addDurations(HTMLDocument fragment, BNode recipe) {
173         final HTMLDocument.TextField[] durations = fragment.getPluralTextField("duration");
174         for (HTMLDocument.TextField duration : durations) {
175             addBNodeProperty(recipe, vHRECIPE.duration, addDuration(fragment, duration));
176         }
177     }
178 
179     /**
180      * Adds the <code>photo</code> triples.
181      *
182      * @param fragment
183      * @param recipe
184      * 
185      * @throws ExtractionException
186      */
187     private void addPhoto(HTMLDocument fragment, BNode recipe) throws ExtractionException {
188         final HTMLDocument.TextField[] photos = fragment.getPluralUrlField("photo");
189         for (HTMLDocument.TextField photo : photos) {
190             addIRIProperty(recipe, vHRECIPE.photo, fragment.resolveIRI(photo.value()));
191         }
192     }
193 
194     /**
195      * Adds the <code>summary</code> triples.
196      *
197      * @param fragment
198      * @param recipe
199      */
200     private void addSummary(HTMLDocument fragment, BNode recipe) {
201         mapFieldWithProperty(fragment, recipe, "summary", vHRECIPE.summary);
202     }
203 
204     /**
205      * Adds the <code>authors</code> triples.
206      *
207      * @param fragment
208      * @param recipe
209      */
210     private void addAuthors(HTMLDocument fragment, BNode recipe) {
211         final HTMLDocument.TextField[] authors = fragment.getPluralTextField("author");
212         for (HTMLDocument.TextField author : authors) {
213             conditionallyAddStringProperty(author.source(), recipe, vHRECIPE.author, author.value());
214         }
215     }
216 
217     /**
218      * Adds the <code>published</code> triples.
219      *
220      * @param fragment
221      * @param recipe
222      */
223     // TODO: USE http://microformats.org/wiki/value-class-pattern to read correct date format.
224     private void addPublished(HTMLDocument fragment, BNode recipe) {
225         mapFieldWithProperty(fragment, recipe, "published", vHRECIPE.published);
226     }
227 
228     /**
229      * Adds the <code>nutrition</code> triples.
230      *
231      * @param fragment
232      * @param nutrition
233      * 
234      * @return
235      */
236     private BNode addNutrition(HTMLDocument fragment, HTMLDocument.TextField nutrition) {
237         final BNode nutritionBnode = getBlankNodeFor(nutrition.source());
238         addIRIProperty(nutritionBnode, RDF.TYPE, vHRECIPE.Nutrition);
239         conditionallyAddStringProperty(nutrition.source(), nutritionBnode, vHRECIPE.nutritionValue, nutrition.value());
240         mapFieldWithProperty(fragment, nutritionBnode, "value", vHRECIPE.nutritionValue);
241         mapFieldWithProperty(fragment, nutritionBnode, "type", vHRECIPE.nutritionValueType);
242         return nutritionBnode;
243     }
244 
245     /**
246      * Adds the <code>nutritions</code> triples.
247      *
248      * @param fragment
249      * @param recipe
250      */
251     private void addNutritions(HTMLDocument fragment, BNode recipe) {
252         HTMLDocument.TextField[] nutritions = fragment.getPluralTextField("nutrition");
253         for (HTMLDocument.TextField nutrition : nutritions) {
254             addBNodeProperty(recipe, vHRECIPE.nutrition, addNutrition(fragment, nutrition));
255         }
256     }
257 
258     /**
259      * Adds the <code>tags</code> triples.
260      *
261      * @param fragment
262      * @param recipe
263      */
264     private void addTags(HTMLDocument fragment, BNode recipe) {
265         HTMLDocument.TextField[] tags = fragment.extractRelTagNodes();
266         for (HTMLDocument.TextField tag : tags) {
267             conditionallyAddStringProperty(tag.source(), recipe, vHRECIPE.tag, tag.value());
268         }
269     }
270 
271 }