View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html.microformats2;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.extractor.ExtractionResult;
22  import org.apache.any23.extractor.ExtractorDescription;
23  import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor;
24  import org.apache.any23.extractor.html.HTMLDocument;
25  import org.apache.any23.vocab.HEntry;
26  import org.apache.any23.vocab.VCard;
27  import org.eclipse.rdf4j.model.BNode;
28  import org.eclipse.rdf4j.model.IRI;
29  import org.eclipse.rdf4j.model.vocabulary.RDF;
30  import org.w3c.dom.Node;
31  import org.eclipse.rdf4j.model.Resource;
32  
33  import java.util.List;
34  
35  /**
36   * Extractor for the <a href="http://microformats.org/wiki/h-entry">h-entry</a> microformat.
37   *
38   * @author Nisala Nirmana
39   */
40  public class HEntryExtractor extends EntityBasedMicroformatExtractor {
41  
42      private static final HEntry vEntry = HEntry.getInstance();
43      private static final VCard vVCARD = VCard.getInstance();
44  
45      private static final String[] entryFields = { "name", "summary", "content", "published", "updated", "category",
46              "url", "uid", "syndication", "in-reply-to", "author", "location",
47  
48      };
49  
50      private static final String[] geoFields = { "latitude", "longitude", "altitude" };
51  
52      @Override
53      public ExtractorDescription getDescription() {
54          return HEntryExtractorFactory.getDescriptionInstance();
55      }
56  
57      @Override
58      protected String getBaseClassName() {
59          return Microformats2Prefixes.CLASS_PREFIX + "entry";
60      }
61  
62      @Override
63      protected void resetExtractor() {
64          // Empty.
65      }
66  
67      @Override
68      protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
69          final BNode entry = getBlankNodeFor(node);
70          conditionallyAddResourceProperty(entry, RDF.TYPE, vEntry.Entry);
71          final HTMLDocumentl/HTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(node);
72          addName(fragment, entry);
73          addSummary(fragment, entry);
74          addContent(fragment, entry);
75          addPublished(fragment, entry);
76          addUpdated(fragment, entry);
77          addCategories(fragment, entry);
78          addURLs(fragment, entry);
79          addUID(fragment, entry);
80          addSyndications(fragment, entry);
81          addInReplyTo(fragment, entry);
82          addLocations(fragment, entry);
83          addAuthors(fragment, entry);
84          return true;
85      }
86  
87      private void addAuthors(HTMLDocument doc, Resource entry) throws ExtractionException {
88          List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + entryFields[10]
89                  + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "card");
90          if (nodes.isEmpty())
91              return;
92          HCardExtractorFactoryormats2/HCardExtractorFactory.html#HCardExtractorFactory">HCardExtractorFactory factory = new HCardExtractorFactory();
93          HCardExtractor extractor = factory.createExtractor();
94          for (Node node : nodes) {
95              BNode author = valueFactory.createBNode();
96              addIRIProperty(author, RDF.TYPE, vEntry.author);
97              extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), author, getCurrentExtractionResult());
98          }
99      }
100 
101     private void mapFieldWithProperty(HTMLDocument fragment, BNode entry, String fieldClass, IRI property) {
102         HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass);
103         conditionallyAddStringProperty(title.source(), entry, property, title.value());
104     }
105 
106     private void addName(HTMLDocument fragment, BNode entry) {
107         mapFieldWithProperty(fragment, entry, Microformats2Prefixes.PROPERTY_PREFIX + entryFields[0], vEntry.name);
108     }
109 
110     private void addSummary(HTMLDocument fragment, BNode entry) {
111         mapFieldWithProperty(fragment, entry, Microformats2Prefixes.PROPERTY_PREFIX + entryFields[1], vEntry.summary);
112     }
113 
114     private void addContent(HTMLDocument fragment, BNode entry) {
115         mapFieldWithProperty(fragment, entry, Microformats2Prefixes.EMBEDDED_PROPERTY_PREFIX + entryFields[2],
116                 vEntry.content);
117     }
118 
119     private void addPublished(HTMLDocument fragment, BNode entry) {
120         final HTMLDocument.TextField[] durations = fragment
121                 .getPluralTextField(Microformats2Prefixes.TIME_PROPERTY_PREFIX + entryFields[3]);
122         for (HTMLDocument.TextField duration : durations) {
123             Node attribute = duration.source().getAttributes().getNamedItem("datetime");
124             if (attribute == null) {
125                 conditionallyAddStringProperty(duration.source(), entry, vEntry.published, duration.value());
126             } else {
127                 conditionallyAddStringProperty(duration.source(), entry, vEntry.published, attribute.getNodeValue());
128             }
129         }
130     }
131 
132     private void addUpdated(HTMLDocument fragment, BNode entry) {
133         final HTMLDocument.TextField[] durations = fragment
134                 .getPluralTextField(Microformats2Prefixes.TIME_PROPERTY_PREFIX + entryFields[4]);
135         for (HTMLDocument.TextField duration : durations) {
136             Node attribute = duration.source().getAttributes().getNamedItem("datetime");
137             if (attribute == null) {
138                 conditionallyAddStringProperty(duration.source(), entry, vEntry.updated, duration.value());
139             } else {
140                 conditionallyAddStringProperty(duration.source(), entry, vEntry.updated, attribute.getNodeValue());
141             }
142         }
143     }
144 
145     private void addCategories(HTMLDocument fragment, BNode entry) {
146         final HTMLDocument.TextField[] categories = fragment
147                 .getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX + entryFields[5]);
148         for (HTMLDocument.TextField category : categories) {
149             conditionallyAddStringProperty(category.source(), entry, vEntry.category, category.value());
150         }
151     }
152 
153     private void addURLs(HTMLDocument fragment, BNode entry) throws ExtractionException {
154         final HTMLDocument.TextField[] urls = fragment
155                 .getPluralUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX + entryFields[6]);
156         for (HTMLDocument.TextField url : urls) {
157             addIRIProperty(entry, vEntry.url, fragment.resolveIRI(url.value()));
158         }
159     }
160 
161     private void addUID(HTMLDocument fragment, BNode entry) throws ExtractionException {
162         final HTMLDocument.TextField uid = fragment
163                 .getSingularTextField(Microformats2Prefixes.URL_PROPERTY_PREFIX + entryFields[7]);
164         if (uid.source() == null)
165             return;
166         addIRIProperty(entry, vEntry.uid, fragment.resolveIRI(uid.value()));
167     }
168 
169     private void addSyndications(HTMLDocument fragment, BNode entry) throws ExtractionException {
170         final HTMLDocument.TextField[] syndications = fragment
171                 .getPluralUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX + entryFields[8]);
172         for (HTMLDocument.TextField syndication : syndications) {
173             addIRIProperty(entry, vEntry.syndication, fragment.resolveIRI(syndication.value()));
174         }
175     }
176 
177     private void addInReplyTo(HTMLDocument fragment, BNode entry) throws ExtractionException {
178         final HTMLDocument.TextField inReplyTo = fragment
179                 .getSingularTextField(Microformats2Prefixes.URL_PROPERTY_PREFIX + entryFields[9]);
180         if (inReplyTo.source() == null)
181             return;
182         addIRIProperty(entry, vEntry.in_reply_to, fragment.resolveIRI(inReplyTo.value()));
183     }
184 
185     private void addLocations(HTMLDocument doc, Resource entry) throws ExtractionException {
186         List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + entryFields[11]
187                 + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "geo");
188         if (nodes.isEmpty())
189             return;
190         for (Node node : nodes) {
191             BNode location = valueFactory.createBNode();
192             addIRIProperty(location, RDF.TYPE, vEntry.location);
193             HTMLDocumentl/HTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(node);
194             for (String field : geoFields) {
195                 HTMLDocument.TextField[] values = fragment
196                         .getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX + field);
197                 for (HTMLDocument.TextField val : values) {
198                     Node attribute = val.source().getAttributes().getNamedItem("title");
199                     if (attribute == null) {
200                         conditionallyAddStringProperty(val.source(), location, vVCARD.getProperty(field), val.value());
201                     } else {
202                         conditionallyAddStringProperty(val.source(), location, vVCARD.getProperty(field),
203                                 attribute.getNodeValue());
204                     }
205                 }
206             }
207         }
208     }
209 }