View Javadoc
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17  
18  
19  package org.apache.any23.extractor.html.microformats2;
20  
21  import org.apache.any23.extractor.ExtractionException;
22  import org.apache.any23.extractor.ExtractionResult;
23  import org.apache.any23.extractor.ExtractorDescription;
24  import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor;
25  import org.apache.any23.extractor.html.HTMLDocument;
26  import org.apache.any23.vocab.HEntry;
27  import org.apache.any23.vocab.VCard;
28  import org.eclipse.rdf4j.model.BNode;
29  import org.eclipse.rdf4j.model.IRI;
30  import org.eclipse.rdf4j.model.vocabulary.RDF;
31  import org.w3c.dom.Node;
32  import org.eclipse.rdf4j.model.Resource;
33  
34  import java.util.List;
35  
36  /**
37   * Extractor for the <a href="http://microformats.org/wiki/h-entry">h-entry</a>
38   * microformat.
39   *
40   * @author Nisala Nirmana
41   */
42  public class HEntryExtractor extends EntityBasedMicroformatExtractor {
43  
44      private static final HEntry vEntry = HEntry.getInstance();
45      private static final VCard vVCARD = VCard.getInstance();
46  
47      private static final String[] entryFields = {
48              "name",
49              "summary",
50              "content",
51              "published",
52              "updated",
53              "category",
54              "url",
55              "uid",
56              "syndication",
57              "in-reply-to",
58              "author",
59              "location",
60  
61      };
62  
63      private static final String[] geoFields = {
64              "latitude",
65              "longitude",
66              "altitude"
67      };
68  
69      @Override
70      public ExtractorDescription getDescription() {
71          return HEntryExtractorFactory.getDescriptionInstance();
72      }
73  
74      @Override
75      protected String getBaseClassName() {
76          return Microformats2Prefixes.CLASS_PREFIX+"entry";
77      }
78  
79      @Override
80      protected void resetExtractor() {
81          // Empty.
82      }
83  
84      @Override
85      protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
86          final BNode entry = getBlankNodeFor(node);
87          conditionallyAddResourceProperty(entry, RDF.TYPE, vEntry.Entry);
88          final HTMLDocumentl/HTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(node);
89          addName(fragment, entry);
90          addSummary(fragment, entry);
91          addContent(fragment, entry);
92          addPublished(fragment, entry);
93          addUpdated(fragment, entry);
94          addCategories(fragment, entry);
95          addURLs(fragment, entry);
96          addUID(fragment, entry);
97          addSyndications(fragment, entry);
98          addInReplyTo(fragment, entry);
99          addLocations(fragment, entry);
100         addAuthors(fragment, entry);
101         return true;
102     }
103 
104     private void addAuthors(HTMLDocument doc, Resource entry) throws ExtractionException {
105         List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + entryFields[10] +
106                 Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "card");
107         if (nodes.isEmpty())
108             return;
109         HCardExtractorFactoryormats2/HCardExtractorFactory.html#HCardExtractorFactory">HCardExtractorFactory factory = new HCardExtractorFactory();
110         HCardExtractor extractor = factory.createExtractor();
111         for (Node node : nodes) {
112             BNode author = valueFactory.createBNode();
113             addIRIProperty(author, RDF.TYPE, vEntry.author);
114             extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), author,
115                     getCurrentExtractionResult());
116         }
117     }
118 
119     private void mapFieldWithProperty(HTMLDocument fragment, BNode entry, String fieldClass,
120                                       IRI property) {
121         HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass);
122         conditionallyAddStringProperty(
123                 title.source(), entry, property, title.value()
124         );
125     }
126 
127     private void addName(HTMLDocument fragment, BNode entry) {
128         mapFieldWithProperty(fragment, entry, Microformats2Prefixes.PROPERTY_PREFIX +
129                 entryFields[0], vEntry.name);
130     }
131 
132     private void addSummary(HTMLDocument fragment, BNode entry) {
133         mapFieldWithProperty(fragment, entry, Microformats2Prefixes.PROPERTY_PREFIX + entryFields[1],
134                 vEntry.summary);
135     }
136 
137     private void addContent(HTMLDocument fragment, BNode entry) {
138         mapFieldWithProperty(fragment, entry, Microformats2Prefixes.EMBEDDED_PROPERTY_PREFIX + entryFields[2],
139                 vEntry.content);
140     }
141 
142     private void addPublished(HTMLDocument fragment, BNode entry) {
143         final HTMLDocument.TextField[] durations = fragment.getPluralTextField(
144                 Microformats2Prefixes.TIME_PROPERTY_PREFIX + entryFields[3]);
145         for(HTMLDocument.TextField duration : durations) {
146             Node attribute=duration.source().getAttributes().getNamedItem("datetime");
147             if (attribute==null){
148                 conditionallyAddStringProperty(
149                         duration.source(),
150                         entry, vEntry.published, duration.value()
151                 );
152             }else{
153                 conditionallyAddStringProperty(
154                         duration.source(),
155                         entry, vEntry.published, attribute.getNodeValue()
156                 );
157             }
158         }
159     }
160 
161     private void addUpdated(HTMLDocument fragment, BNode entry) {
162         final HTMLDocument.TextField[] durations = fragment.getPluralTextField(
163                 Microformats2Prefixes.TIME_PROPERTY_PREFIX + entryFields[4]);
164         for(HTMLDocument.TextField duration : durations) {
165             Node attribute=duration.source().getAttributes().getNamedItem("datetime");
166             if (attribute==null){
167                 conditionallyAddStringProperty(
168                         duration.source(),
169                         entry, vEntry.updated, duration.value()
170                 );
171             }else{
172                 conditionallyAddStringProperty(
173                         duration.source(),
174                         entry, vEntry.updated, attribute.getNodeValue()
175                 );
176             }
177         }
178     }
179 
180     private void addCategories(HTMLDocument fragment, BNode entry) {
181         final HTMLDocument.TextField[] categories = fragment.getPluralTextField
182                 (Microformats2Prefixes.PROPERTY_PREFIX + entryFields[5]);
183         for (HTMLDocument.TextField category : categories) {
184             conditionallyAddStringProperty(
185                     category.source(), entry, vEntry.category, category.value()
186             );
187         }
188     }
189 
190     private void addURLs(HTMLDocument fragment, BNode entry) throws ExtractionException {
191         final HTMLDocument.TextField[] urls = fragment.getPluralUrlField
192                 (Microformats2Prefixes.URL_PROPERTY_PREFIX + entryFields[6]);
193         for(HTMLDocument.TextField url : urls) {
194             addIRIProperty(entry, vEntry.url, fragment.resolveIRI(url.value()));
195         }
196     }
197 
198     private void addUID(HTMLDocument fragment, BNode entry) throws ExtractionException {
199         final HTMLDocument.TextField uid = fragment.getSingularTextField
200                 (Microformats2Prefixes.URL_PROPERTY_PREFIX + entryFields[7]);
201         if(uid.source()==null)
202             return;
203         addIRIProperty(entry, vEntry.uid, fragment.resolveIRI(uid.value()));
204     }
205 
206     private void addSyndications(HTMLDocument fragment, BNode entry) throws ExtractionException {
207         final HTMLDocument.TextField[] syndications = fragment.getPluralUrlField
208                 (Microformats2Prefixes.URL_PROPERTY_PREFIX + entryFields[8]);
209         for(HTMLDocument.TextField syndication : syndications) {
210             addIRIProperty(entry, vEntry.syndication, fragment.resolveIRI(syndication.value()));
211         }
212     }
213 
214     private void addInReplyTo(HTMLDocument fragment, BNode entry) throws ExtractionException {
215         final HTMLDocument.TextField inReplyTo = fragment.getSingularTextField
216                 (Microformats2Prefixes.URL_PROPERTY_PREFIX + entryFields[9]);
217         if(inReplyTo.source()==null)
218             return;
219         addIRIProperty(entry, vEntry.in_reply_to, fragment.resolveIRI(inReplyTo.value()));
220     }
221 
222     private void addLocations(HTMLDocument doc, Resource entry) throws ExtractionException {
223         List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + entryFields[11] +
224                 Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "geo");
225         if (nodes.isEmpty())
226             return;
227         for (Node node : nodes) {
228             BNode location = valueFactory.createBNode();
229             addIRIProperty(location, RDF.TYPE, vEntry.location);
230             HTMLDocumentl/HTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(node);
231             for (String field : geoFields) {
232                 HTMLDocument.TextField[] values = fragment.getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX+field);
233                 for (HTMLDocument.TextField val : values) {
234                     Node attribute=val.source().getAttributes().getNamedItem("title");
235                     if (attribute==null){
236                         conditionallyAddStringProperty(
237                                 val.source(),
238                                 location, vVCARD.getProperty(field), val.value()
239                         );
240                     }else{
241                         conditionallyAddStringProperty(
242                                 val.source(),
243                                 location, vVCARD.getProperty(field), attribute.getNodeValue()
244                         );
245                     }
246                 }
247             }
248         }
249     }
250 }