/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.any23.extractor.html;

import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.extractor.ExtractionResult;
import org.apache.any23.extractor.ExtractorDescription;
import org.apache.any23.extractor.TagSoupExtractionResult;
import org.apache.any23.vocab.FOAF;
import org.apache.any23.vocab.HListing;
import org.eclipse.rdf4j.model.BNode;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import org.w3c.dom.Node;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import static org.apache.any23.extractor.html.HTMLDocument.TextField;

40  /**
41   * Extractor for the <a href="http://microformats.org/wiki/hlisting">hListing</a>
42   * microformat.
43   *
44   * @author Gabriele Renzi
45   */
46  public class HListingExtractor extends EntityBasedMicroformatExtractor {
47  
48      private static final HListing hLISTING = HListing.getInstance();
49      private static final FOAF foaf     = FOAF.getInstance();
50  
51      private static final Set<String> ActionClasses = new HashSet<String>() {
52          {
53              add("sell"    );
54              add("rent"    );
55              add("trade"   );
56              add("meet"    );
57              add("announce");
58              add("offer"   );
59              add("wanted"  );
60              add("event"   );
61              add("service" );
62          }
63      };
64  
65      private static final List<String> validClassesForAddress = Arrays.asList(
66              "post-office-box",
67              "extended-address",
68              "street-address",
69              "locality",
70              "region",
71              "postal-code",
72              "country-name"
73      );
74  
75      private HTMLDocument fragment;
76  
77      @Override
78      public ExtractorDescription getDescription() {
79          return HListingExtractorFactory.getDescriptionInstance();
80      }
81  
82      protected String getBaseClassName() {
83          return "hlisting";
84      }
85  
86      @Override
87      protected void resetExtractor() {
88          // Empty.
89      }
90  
91      @Override
92      protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
93          this.fragment = new HTMLDocument(node);
94          BNode listing = getBlankNodeFor(node);
95          out.writeTriple(listing, RDF.TYPE, hLISTING.Listing);
96  
97          for (String action : findActions(fragment)) {
98              out.writeTriple(listing, hLISTING.action, hLISTING.getClass(action));
99          }
100         out.writeTriple(listing, hLISTING.lister, addLister() );
101         addItem(listing);
102         addDateTimes(listing);
103         addPrice(listing);
104         addDescription(listing);
105         addSummary(listing);
106         addPermalink(listing);
107 
108         final TagSoupExtractionResultorg/apache/any23/extractor/TagSoupExtractionResult.html#TagSoupExtractionResult">TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
109         tser.addResourceRoot(
110                 DomUtils.getXPathListForNode(node),
111                 listing,
112                 this.getClass()
113         );
114 
115         return true;
116     }
117 
118     private void addItem(Resource listing) throws ExtractionException {
119         Node node = fragment.findMicroformattedObjectNode("*", "item");
120         if (null == node) return;
121         BNode blankItem = valueFactory.createBNode();
122         addBNodeProperty(
123                 node,
124                 listing, hLISTING.item, blankItem
125         );
126         addIRIProperty(blankItem, RDF.TYPE, hLISTING.Item);
127 
128         HTMLDocumentml/HTMLDocument.html#HTMLDocument">HTMLDocument item = new HTMLDocument(node);
129 
130         addItemName(item, blankItem);
131         addItemUrl(item, blankItem);
132         // the format is specified with photo into item, but kelkoo has it into the top level
133         addItemPhoto(fragment, blankItem);
134         addItemAddresses(fragment, blankItem);
135     }
136 
137     private void addItemAddresses(HTMLDocument doc, Resource blankItem) {
138         final String extractorName = getDescription().getExtractorName();
139         for (Node node : doc.findAll(".//*[contains(@class,'adr')]//*[@class]")) {
140             String[] klasses = node.getAttributes().getNamedItem("class").getNodeValue().split("\\s+");
141             for (String klass : klasses)
142                 if (validClassesForAddress.contains(klass)) {
143                     String value = node.getNodeValue();
144                     // do not use conditionallyAdd, it won't work cause of evaluation rules
145                     if (!(null == value || "".equals(value))) {
146                         IRI property = hLISTING.getPropertyCamelCase(klass);
147                         conditionallyAddLiteralProperty(
148                                 node,
149                                 blankItem, property, valueFactory.createLiteral(value)
150                         );
151                     }
152                 }
153         }
154     }
155 
156     private void addPermalink(Resource listing) {
157         String link = fragment.find(".//A[contains(@rel,'self') and contains(@rel,'bookmark')]/@href");
158         conditionallyAddStringProperty(
159                 fragment.getDocument(),
160                 listing, hLISTING.permalink, link
161         );
162     }
163 
164     private void addPrice(Resource listing) {
165         TextField price = fragment.getSingularTextField("price");
166         conditionallyAddStringProperty(
167                 price.source(),
168                 listing, hLISTING.price, price.value()
169         );
170     }
171 
172     private void addDescription(Resource listing) {
173         TextField description = fragment.getSingularTextField("description");
174         conditionallyAddStringProperty(
175                 description.source(),
176                 listing, hLISTING.description, description.value()
177         );
178     }
179 
180     private void addSummary(Resource listing) {
181         TextField summary = fragment.getSingularTextField("summary");
182         conditionallyAddStringProperty(
183                 summary.source(),
184                 listing, hLISTING.summary, summary.value()
185         );
186     }
187 
188     private void addDateTimes(Resource listing) {
189         TextField listed = fragment.getSingularTextField("dtlisted");
190         conditionallyAddStringProperty(
191                 listed.source(),
192                 listing, hLISTING.dtlisted, listed.value()
193         );
194         HTMLDocument.TextField expired = fragment.getSingularTextField("dtexpired");
195         conditionallyAddStringProperty(
196                 expired.source(),
197                 listing, hLISTING.dtexpired, expired.value()
198         );
199     }
200 
201     private Resource addLister() throws ExtractionException {
202         Resource blankLister = valueFactory.createBNode();
203         addIRIProperty(blankLister, RDF.TYPE, hLISTING.Lister);
204         Node node = fragment.findMicroformattedObjectNode("*", "lister");
205         if (null == node)
206             return blankLister;
207         HTMLDocumentLDocument.html#HTMLDocument">HTMLDocument listerNode = new HTMLDocument(node);
208         addListerFn(listerNode, blankLister);
209         addListerOrg(listerNode, blankLister);
210         addListerEmail(listerNode, blankLister);
211         addListerUrl(listerNode, blankLister);
212         addListerTel(listerNode, blankLister);
213         addListerLogo(listerNode, blankLister);
214         return blankLister;
215     }
216 
217     private void addListerTel(HTMLDocument doc, Resource blankLister) {
218         HTMLDocument.TextField tel = doc.getSingularTextField("tel");
219         conditionallyAddStringProperty(
220                 tel.source(),
221                 blankLister, hLISTING.tel, tel.value()
222         );
223     }
224 
225     private void addListerUrl(HTMLDocument doc, Resource blankLister) throws ExtractionException {
226         TextField url = doc.getSingularUrlField("url");
227         conditionallyAddResourceProperty(blankLister, hLISTING.listerUrl, getHTMLDocument().resolveIRI(url.value()));
228     }
229 
230     private void addListerEmail(HTMLDocument doc, Resource blankLister) {
231         TextField email = doc.getSingularUrlField("email");
232         conditionallyAddResourceProperty(blankLister, foaf.mbox, fixLink(email.value(), "mailto"));
233     }
234 
235     private void addListerFn(HTMLDocument doc, Resource blankLister) {
236         TextField fn = doc.getSingularTextField("fn");
237         conditionallyAddStringProperty(
238                 fn.source(),
239                 blankLister, hLISTING.listerName, fn.value()
240         );
241     }
242 
243     private void addListerLogo(HTMLDocument doc, Resource blankLister) throws ExtractionException {
244         TextField logo = doc.getSingularUrlField("logo");
245         conditionallyAddResourceProperty(blankLister, hLISTING.listerLogo, getHTMLDocument().resolveIRI(logo.value()));
246     }
247 
248     private void addListerOrg(HTMLDocument doc, Resource blankLister) {
249         TextField org = doc.getSingularTextField("org");
250         conditionallyAddStringProperty(
251                 org.source(),
252                 blankLister, hLISTING.listerOrg, org.value()
253         );
254     }
255 
256     private void addItemName(HTMLDocument item, Resource blankItem) {
257         HTMLDocument.TextField fn = item.getSingularTextField("fn");
258         conditionallyAddStringProperty(
259                 fn.source(),
260                 blankItem, hLISTING.itemName, fn.value()
261         );
262     }
263 
264     private void addItemUrl(HTMLDocument item, Resource blankItem) throws ExtractionException {
265         TextField url = item.getSingularUrlField("url");
266         conditionallyAddResourceProperty(blankItem, hLISTING.itemUrl, getHTMLDocument().resolveIRI(url.value()));
267     }
268 
269     private void addItemPhoto(HTMLDocument doc, Resource blankLister) throws ExtractionException {
270         // as per spec
271         String url = doc.findMicroformattedValue("*", "item", "A", "photo", "@href");
272         conditionallyAddResourceProperty(blankLister, hLISTING.itemPhoto, getHTMLDocument().resolveIRI(url));
273         url = doc.findMicroformattedValue("*", "item", "IMG", "photo", "@src");
274         conditionallyAddResourceProperty(blankLister, hLISTING.itemPhoto, getHTMLDocument().resolveIRI(url));
275         // as per kelkoo. Remember that contains(foo,'') is true in xpath
276         url = doc.findMicroformattedValue("*", "photo", "IMG", "", "@src");
277         conditionallyAddResourceProperty(blankLister, hLISTING.itemPhoto, getHTMLDocument().resolveIRI(url));
278     }
279 
280     private List<String> findActions(HTMLDocument doc) {
281         List<String> actions = new ArrayList<String>(0);
282         // first check if values are inlined
283         String[] classes = doc.readAttribute("class").split("\\s+");
284         for (String klass : classes) {
285             if (ActionClasses.contains(klass))
286                 actions.add(klass);
287         }
288 
289         for (Node action : doc.findAll("./*[@class]/@class")) {
290             for (String substring : action.getNodeValue().split("\\s+")) {
291                 if (ActionClasses.contains(substring))
292                     actions.add(substring);
293             }
294         }
295         return actions;
296     }
297 
298 }