View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.extractor.ExtractionResult;
22  import org.apache.any23.extractor.ExtractorDescription;
23  import org.apache.any23.extractor.ExtractorFactory;
24  import org.apache.any23.extractor.SimpleExtractorFactory;
25  import org.apache.any23.rdf.PopularPrefixes;
26  import org.apache.any23.vocab.HRECIPE;
27  import org.openrdf.model.BNode;
28  import org.openrdf.model.URI;
29  import org.openrdf.model.vocabulary.RDF;
30  import org.w3c.dom.Node;
31  
32  import java.util.Arrays;
33  
34  /**
35   * Extractor for the <a href="http://microformats.org/wiki/hrecipe">hRecipe</a>
36   * microformat.
37   *
38   * @author Michele Mostarda (mostarda@fbk.eu)
39   */
40  public class HRecipeExtractor extends EntityBasedMicroformatExtractor {
41  
42      private static final HRECIPE vHRECIPE = HRECIPE.getInstance();
43  
44      public final static ExtractorFactory<HRecipeExtractor> factory =
45              SimpleExtractorFactory.create(
46                      "html-mf-hrecipe",
47                      PopularPrefixes.createSubset("rdf", "hrecipe"),
48                      Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
49                      "example-mf-hrecipe.html",
50                      HRecipeExtractor.class
51              );
52  
53  
54      @Override
55      public ExtractorDescription getDescription() {
56          return factory;
57      }
58  
59      @Override
60      protected String getBaseClassName() {
61          return "hrecipe";
62      }
63  
64      @Override
65      protected void resetExtractor() {
66          // Empty.
67      }
68  
69      @Override
70      protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
71          final BNode recipe = getBlankNodeFor(node);
72          conditionallyAddResourceProperty(recipe, RDF.TYPE, vHRECIPE.Recipe);
73          final HTMLDocument fragment = new HTMLDocument(node);
74          addFN(fragment, recipe);
75          addIngredients(fragment, recipe);
76          addYield(fragment, recipe);
77          addInstructions(fragment, recipe);
78          addDurations(fragment, recipe);
79          addPhoto(fragment, recipe);
80          addSummary(fragment, recipe);
81          addAuthors(fragment, recipe);
82          addPublished(fragment, recipe);
83          addNutritions(fragment, recipe);
84          addTags(fragment, recipe);
85          return true;
86      }
87  
88      /**
89       * Maps a field text with a property.
90       *
91       * @param fragment
92       * @param recipe
93       * @param fieldClass
94       * @param property
95       */
96      private void mapFieldWithProperty(HTMLDocument fragment, BNode recipe, String fieldClass, URI property) {
97          HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass);
98          conditionallyAddStringProperty(
99                  title.source(), recipe, property, title.value()
100         );
101     }
102 
103     /**
104      * Adds the <code>fn</code> triple.
105      *
106      * @param fragment
107      * @param recipe
108      */
109     private void addFN(HTMLDocument fragment, BNode recipe) {
110         mapFieldWithProperty(fragment, recipe, "fn", vHRECIPE.fn);
111     }
112 
113     /**
114      * Adds the <code>ingredient</code> triples.
115      *
116      * @param fragment
117      * @param ingredient
118      * @return
119      */
120     private BNode addIngredient(HTMLDocument fragment,  HTMLDocument.TextField ingredient) {
121         final BNode ingredientBnode = getBlankNodeFor(ingredient.source());
122         addURIProperty(ingredientBnode, RDF.TYPE, vHRECIPE.Ingredient);
123         conditionallyAddStringProperty(
124                 ingredient.source(),
125                 ingredientBnode,
126                 vHRECIPE.ingredientName,
127                 HTMLDocument.readNodeContent(ingredient.source(), true)
128         );
129         mapFieldWithProperty(fragment, ingredientBnode, "value", vHRECIPE.ingredientQuantity);
130         mapFieldWithProperty(fragment, ingredientBnode, "type" , vHRECIPE.ingredientQuantityType);
131         return ingredientBnode;
132     }
133 
134     /**
135      * Adds the <code>ingredients</code>list triples.
136      *
137      * @param fragment
138      * @param recipe
139      * @return
140      */
141     private void addIngredients(HTMLDocument fragment, BNode recipe) {
142         final HTMLDocument.TextField[] ingredients = fragment.getPluralTextField("ingredient");
143         for(HTMLDocument.TextField ingredient : ingredients) {
144             addBNodeProperty(recipe, vHRECIPE.ingredient, addIngredient(fragment, ingredient));
145         }
146     }
147 
148     /**
149      * Adds the <code>instruction</code> triples.
150      *
151      * @param fragment
152      * @param recipe
153      */
154     private void addInstructions(HTMLDocument fragment, BNode recipe) {
155         mapFieldWithProperty(fragment, recipe, "instructions", vHRECIPE.instructions);
156 
157     }
158 
159     /**
160      * Adds the <code>yield</code> triples.
161      *
162      * @param fragment
163      * @param recipe
164      */
165     private void addYield(HTMLDocument fragment, BNode recipe) {
166         mapFieldWithProperty(fragment, recipe, "yield", vHRECIPE.yield);
167     }
168 
169     /**
170      * Adds the <code>duration</code> triples.
171      *
172      * @param fragment
173      * @param duration
174      * @return
175      */
176     //TODO: USE http://microformats.org/wiki/value-class-pattern to read correct date format.
177     private BNode addDuration(HTMLDocument fragment, HTMLDocument.TextField duration) {
178         final BNode durationBnode = getBlankNodeFor(duration.source());
179         addURIProperty(durationBnode, RDF.TYPE, vHRECIPE.Duration);
180         conditionallyAddStringProperty(
181                 duration.source(),
182                 durationBnode, vHRECIPE.durationTime, duration.value()
183         );
184         mapFieldWithProperty(fragment, durationBnode, "value-title", vHRECIPE.durationTitle);
185         return durationBnode;
186     }
187 
188     /**
189      * Adds the <code>yield</code> triples.
190      *
191      * @param fragment
192      * @param recipe
193      */
194     private void addDurations(HTMLDocument fragment, BNode recipe) {
195       final HTMLDocument.TextField[] durations = fragment.getPluralTextField("duration");
196         for(HTMLDocument.TextField duration : durations) {
197             addBNodeProperty(recipe, vHRECIPE.duration, addDuration(fragment, duration));
198         }
199     }
200 
201     /**
202      * Adds the <code>photo</code> triples.
203      *
204      * @param fragment
205      * @param recipe
206      * @throws ExtractionException
207      */
208     private void addPhoto(HTMLDocument fragment, BNode recipe) throws ExtractionException {
209         final HTMLDocument.TextField[] photos = fragment.getPluralUrlField("photo");
210         for(HTMLDocument.TextField photo : photos) {
211             addURIProperty(recipe, vHRECIPE.photo, fragment.resolveURI(photo.value()));
212         }
213     }
214 
215     /**
216      * Adds the <code>summary</code> triples.
217      *
218      * @param fragment
219      * @param recipe
220      */
221     private void addSummary(HTMLDocument fragment, BNode recipe) {
222         mapFieldWithProperty(fragment, recipe, "summary", vHRECIPE.summary);
223     }
224 
225     /**
226      * Adds the <code>authors</code> triples.
227      *
228      * @param fragment
229      * @param recipe
230      */
231     private void addAuthors(HTMLDocument fragment, BNode recipe) {
232         final HTMLDocument.TextField[] authors = fragment.getPluralTextField("author");
233          for(HTMLDocument.TextField author : authors) {
234              conditionallyAddStringProperty(
235                     author.source(),
236                     recipe, vHRECIPE.author, author.value()
237               );
238         }
239     }
240 
241     /**
242      * Adds the <code>published</code> triples.
243      *
244      * @param fragment
245      * @param recipe
246      */
247     //TODO: USE http://microformats.org/wiki/value-class-pattern to read correct date format.
248     private void addPublished(HTMLDocument fragment, BNode recipe) {
249         mapFieldWithProperty(fragment, recipe, "published", vHRECIPE.published);
250     }
251 
252     /**
253      * Adds the <code>nutrition</code> triples.
254      *
255      * @param fragment
256      * @param nutrition
257      * @return
258      */
259     private BNode addNutrition(HTMLDocument fragment, HTMLDocument.TextField nutrition) {
260         final BNode nutritionBnode = getBlankNodeFor(nutrition.source());
261         addURIProperty(nutritionBnode, RDF.TYPE, vHRECIPE.Nutrition);
262         conditionallyAddStringProperty(
263                 nutrition.source(),
264                 nutritionBnode, vHRECIPE.nutritionValue, nutrition.value()
265         );
266         mapFieldWithProperty(fragment, nutritionBnode, "value", vHRECIPE.nutritionValue);
267         mapFieldWithProperty(fragment, nutritionBnode, "type" , vHRECIPE.nutritionValueType);
268         return nutritionBnode;
269     }
270 
271     /**
272      * Adds the <code>nutritions</code> triples.
273      *
274      * @param fragment
275      * @param recipe
276      */
277     private void addNutritions(HTMLDocument fragment, BNode recipe) {
278         HTMLDocument.TextField[] nutritions = fragment.getPluralTextField("nutrition");
279         for (HTMLDocument.TextField nutrition : nutritions) {
280             addBNodeProperty(recipe, vHRECIPE.nutrition, addNutrition(fragment, nutrition));
281         }
282     }
283 
284     /**
285      * Adds the <code>tags</code> triples.
286      *
287      * @param fragment
288      * @param recipe
289      */
290     private void addTags(HTMLDocument fragment, BNode recipe) {
291         HTMLDocument.TextField[] tags = fragment.extractRelTagNodes();
292         for(HTMLDocument.TextField tag : tags) {
293             conditionallyAddStringProperty(
294                     tag.source(),
295                     recipe, vHRECIPE.tag, tag.value()
296               );
297         }
298     }
299 
300 }