View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html.microformats2;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.extractor.ExtractionResult;
22  import org.apache.any23.extractor.ExtractorDescription;
23  import org.apache.any23.vocab.HRecipe;
24  import org.eclipse.rdf4j.model.BNode;
25  import org.eclipse.rdf4j.model.IRI;
26  import org.eclipse.rdf4j.model.vocabulary.RDF;
27  import org.w3c.dom.Node;
28  import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor;
29  import org.apache.any23.extractor.html.HTMLDocument;
30  
31  /**
32   * Extractor for the <a href="http://microformats.org/wiki/hrecipe">hRecipe</a> microformat.
33   *
34   * @author Nisala Nirmana
35   */
36  public class HRecipeExtractor extends EntityBasedMicroformatExtractor {
37  
38      private static final HRecipe vHRECIPE = HRecipe.getInstance();
39  
40      private static final String[] recipeFields = { "name", "ingredient", "yield", "instructions", "duration", "photo",
41              "summary", "author", "published", "nutrition" };
42  
43      @Override
44      public ExtractorDescription getDescription() {
45          return HRecipeExtractorFactory.getDescriptionInstance();
46      }
47  
48      @Override
49      protected String getBaseClassName() {
50          return Microformats2Prefixes.CLASS_PREFIX + "recipe";
51      }
52  
53      @Override
54      protected void resetExtractor() {
55          // Empty.
56      }
57  
58      @Override
59      protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
60          final BNode recipe = getBlankNodeFor(node);
61          conditionallyAddResourceProperty(recipe, RDF.TYPE, vHRECIPE.Recipe);
62          final HTMLDocumentl/HTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(node);
63          addName(fragment, recipe);
64          addIngredients(fragment, recipe);
65          addYield(fragment, recipe);
66          addInstructions(fragment, recipe);
67          addDurations(fragment, recipe);
68          addPhoto(fragment, recipe);
69          addSummary(fragment, recipe);
70          addAuthors(fragment, recipe);
71          addPublished(fragment, recipe);
72          addNutritions(fragment, recipe);
73          return true;
74      }
75  
76      private void mapFieldWithProperty(HTMLDocument fragment, BNode recipe, String fieldClass, IRI property) {
77          HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass);
78          conditionallyAddStringProperty(title.source(), recipe, property, title.value());
79      }
80  
81      private void addName(HTMLDocument fragment, BNode recipe) {
82          mapFieldWithProperty(fragment, recipe, Microformats2Prefixes.PROPERTY_PREFIX + recipeFields[0], vHRECIPE.fn);
83      }
84  
85      private void addIngredients(HTMLDocument fragment, BNode recipe) {
86          final HTMLDocument.TextField[] ingredients = fragment
87                  .getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX + recipeFields[1]);
88          for (HTMLDocument.TextField ingredient : ingredients) {
89              conditionallyAddStringProperty(ingredient.source(), recipe, vHRECIPE.ingredient, ingredient.value());
90          }
91      }
92  
93      private void addInstructions(HTMLDocument fragment, BNode recipe) {
94          mapFieldWithProperty(fragment, recipe, Microformats2Prefixes.EMBEDDED_PROPERTY_PREFIX + recipeFields[2],
95                  vHRECIPE.instructions);
96      }
97  
98      private void addYield(HTMLDocument fragment, BNode recipe) {
99          mapFieldWithProperty(fragment, recipe, Microformats2Prefixes.PROPERTY_PREFIX + recipeFields[3], vHRECIPE.yield);
100     }
101 
102     private void addDurations(HTMLDocument fragment, BNode recipe) {
103         final HTMLDocument.TextField[] durations = fragment
104                 .getPluralTextField(Microformats2Prefixes.TIME_PROPERTY_PREFIX + recipeFields[4]);
105         for (HTMLDocument.TextField duration : durations) {
106             Node attribute = duration.source().getAttributes().getNamedItem("datetime");
107             if (attribute == null) {
108                 conditionallyAddStringProperty(duration.source(), recipe, vHRECIPE.duration, duration.value());
109             } else {
110                 conditionallyAddStringProperty(duration.source(), recipe, vHRECIPE.duration, attribute.getNodeValue());
111 
112             }
113 
114         }
115     }
116 
117     private void addPhoto(HTMLDocument fragment, BNode recipe) throws ExtractionException {
118         final HTMLDocument.TextField[] photos = fragment
119                 .getPluralUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX + recipeFields[5]);
120         for (HTMLDocument.TextField photo : photos) {
121             addIRIProperty(recipe, vHRECIPE.photo, fragment.resolveIRI(photo.value()));
122         }
123     }
124 
125     private void addSummary(HTMLDocument fragment, BNode recipe) {
126         mapFieldWithProperty(fragment, recipe, Microformats2Prefixes.PROPERTY_PREFIX + recipeFields[6],
127                 vHRECIPE.summary);
128     }
129 
130     private void addAuthors(HTMLDocument fragment, BNode recipe) {
131         final HTMLDocument.TextField[] authors = fragment
132                 .getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX + recipeFields[7]);
133         for (HTMLDocument.TextField author : authors) {
134             conditionallyAddStringProperty(author.source(), recipe, vHRECIPE.author, author.value());
135         }
136     }
137 
138     private void addPublished(HTMLDocument fragment, BNode recipe) {
139         final HTMLDocument.TextField[] durations = fragment
140                 .getPluralTextField(Microformats2Prefixes.TIME_PROPERTY_PREFIX + recipeFields[8]);
141         for (HTMLDocument.TextField duration : durations) {
142             Node attribute = duration.source().getAttributes().getNamedItem("datetime");
143             if (attribute == null) {
144                 conditionallyAddStringProperty(duration.source(), recipe, vHRECIPE.published, duration.value());
145             } else {
146                 conditionallyAddStringProperty(duration.source(), recipe, vHRECIPE.published, attribute.getNodeValue());
147             }
148         }
149     }
150 
151     private void addNutritions(HTMLDocument fragment, BNode recipe) {
152         final HTMLDocument.TextField[] nutritions = fragment
153                 .getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX + recipeFields[9]);
154         for (HTMLDocument.TextField nutrition : nutritions) {
155             conditionallyAddStringProperty(nutrition.source(), recipe, vHRECIPE.nutrition, nutrition.value());
156         }
157     }
158 }