View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.extractor.ExtractionResult;
22  import org.apache.any23.extractor.ExtractorDescription;
23  import org.apache.any23.extractor.TagSoupExtractionResult;
24  import org.apache.any23.vocab.FOAF;
25  import org.apache.any23.vocab.HListing;
26  import org.eclipse.rdf4j.model.BNode;
27  import org.eclipse.rdf4j.model.Resource;
28  import org.eclipse.rdf4j.model.IRI;
29  import org.eclipse.rdf4j.model.vocabulary.RDF;
30  import org.w3c.dom.Node;
31  
32  import java.util.ArrayList;
33  import java.util.Arrays;
34  import java.util.HashSet;
35  import java.util.List;
36  import java.util.Set;
37  
38  import static org.apache.any23.extractor.html.HTMLDocument.TextField;
39  
40  /**
41   * Extractor for the <a href="http://microformats.org/wiki/hlisting">hListing</a> microformat.
42   *
43   * @author Gabriele Renzi
44   */
45  public class HListingExtractor extends EntityBasedMicroformatExtractor {
46  
47      private static final HListing hLISTING = HListing.getInstance();
48      private static final FOAF foaf = FOAF.getInstance();
49  
50      private static final Set<String> ActionClasses = new HashSet<String>() {
51          {
52              add("sell");
53              add("rent");
54              add("trade");
55              add("meet");
56              add("announce");
57              add("offer");
58              add("wanted");
59              add("event");
60              add("service");
61          }
62      };
63  
64      private static final List<String> validClassesForAddress = Arrays.asList("post-office-box", "extended-address",
65              "street-address", "locality", "region", "postal-code", "country-name");
66  
67      private HTMLDocument fragment;
68  
69      @Override
70      public ExtractorDescription getDescription() {
71          return HListingExtractorFactory.getDescriptionInstance();
72      }
73  
74      protected String getBaseClassName() {
75          return "hlisting";
76      }
77  
78      @Override
79      protected void resetExtractor() {
80          // Empty.
81      }
82  
83      @Override
84      protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
85          this.fragment = new HTMLDocument(node);
86          BNode listing = getBlankNodeFor(node);
87          out.writeTriple(listing, RDF.TYPE, hLISTING.Listing);
88  
89          for (String action : findActions(fragment)) {
90              out.writeTriple(listing, hLISTING.action, hLISTING.getClass(action));
91          }
92          out.writeTriple(listing, hLISTING.lister, addLister());
93          addItem(listing);
94          addDateTimes(listing);
95          addPrice(listing);
96          addDescription(listing);
97          addSummary(listing);
98          addPermalink(listing);
99  
100         final TagSoupExtractionResultorg/apache/any23/extractor/TagSoupExtractionResult.html#TagSoupExtractionResult">TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
101         tser.addResourceRoot(DomUtils.getXPathListForNode(node), listing, this.getClass());
102 
103         return true;
104     }
105 
106     private void addItem(Resource listing) throws ExtractionException {
107         Node node = fragment.findMicroformattedObjectNode("*", "item");
108         if (null == node)
109             return;
110         BNode blankItem = valueFactory.createBNode();
111         addBNodeProperty(node, listing, hLISTING.item, blankItem);
112         addIRIProperty(blankItem, RDF.TYPE, hLISTING.Item);
113 
114         HTMLDocumentml/HTMLDocument.html#HTMLDocument">HTMLDocument item = new HTMLDocument(node);
115 
116         addItemName(item, blankItem);
117         addItemUrl(item, blankItem);
118         // the format is specified with photo into item, but kelkoo has it into the top level
119         addItemPhoto(fragment, blankItem);
120         addItemAddresses(fragment, blankItem);
121     }
122 
123     private void addItemAddresses(HTMLDocument doc, Resource blankItem) {
124         final String extractorName = getDescription().getExtractorName();
125         for (Node node : doc.findAll(".//*[contains(@class,'adr')]//*[@class]")) {
126             String[] klasses = node.getAttributes().getNamedItem("class").getNodeValue().split("\\s+");
127             for (String klass : klasses)
128                 if (validClassesForAddress.contains(klass)) {
129                     String value = node.getNodeValue();
130                     // do not use conditionallyAdd, it won't work cause of evaluation rules
131                     if (!(null == value || "".equals(value))) {
132                         IRI property = hLISTING.getPropertyCamelCase(klass);
133                         conditionallyAddLiteralProperty(node, blankItem, property, valueFactory.createLiteral(value));
134                     }
135                 }
136         }
137     }
138 
139     private void addPermalink(Resource listing) {
140         String link = fragment.find(".//A[contains(@rel,'self') and contains(@rel,'bookmark')]/@href");
141         conditionallyAddStringProperty(fragment.getDocument(), listing, hLISTING.permalink, link);
142     }
143 
144     private void addPrice(Resource listing) {
145         TextField price = fragment.getSingularTextField("price");
146         conditionallyAddStringProperty(price.source(), listing, hLISTING.price, price.value());
147     }
148 
149     private void addDescription(Resource listing) {
150         TextField description = fragment.getSingularTextField("description");
151         conditionallyAddStringProperty(description.source(), listing, hLISTING.description, description.value());
152     }
153 
154     private void addSummary(Resource listing) {
155         TextField summary = fragment.getSingularTextField("summary");
156         conditionallyAddStringProperty(summary.source(), listing, hLISTING.summary, summary.value());
157     }
158 
159     private void addDateTimes(Resource listing) {
160         TextField listed = fragment.getSingularTextField("dtlisted");
161         conditionallyAddStringProperty(listed.source(), listing, hLISTING.dtlisted, listed.value());
162         HTMLDocument.TextField expired = fragment.getSingularTextField("dtexpired");
163         conditionallyAddStringProperty(expired.source(), listing, hLISTING.dtexpired, expired.value());
164     }
165 
166     private Resource addLister() throws ExtractionException {
167         Resource blankLister = valueFactory.createBNode();
168         addIRIProperty(blankLister, RDF.TYPE, hLISTING.Lister);
169         Node node = fragment.findMicroformattedObjectNode("*", "lister");
170         if (null == node)
171             return blankLister;
172         HTMLDocumentLDocument.html#HTMLDocument">HTMLDocument listerNode = new HTMLDocument(node);
173         addListerFn(listerNode, blankLister);
174         addListerOrg(listerNode, blankLister);
175         addListerEmail(listerNode, blankLister);
176         addListerUrl(listerNode, blankLister);
177         addListerTel(listerNode, blankLister);
178         addListerLogo(listerNode, blankLister);
179         return blankLister;
180     }
181 
182     private void addListerTel(HTMLDocument doc, Resource blankLister) {
183         HTMLDocument.TextField tel = doc.getSingularTextField("tel");
184         conditionallyAddStringProperty(tel.source(), blankLister, hLISTING.tel, tel.value());
185     }
186 
187     private void addListerUrl(HTMLDocument doc, Resource blankLister) throws ExtractionException {
188         TextField url = doc.getSingularUrlField("url");
189         conditionallyAddResourceProperty(blankLister, hLISTING.listerUrl, getHTMLDocument().resolveIRI(url.value()));
190     }
191 
192     private void addListerEmail(HTMLDocument doc, Resource blankLister) {
193         TextField email = doc.getSingularUrlField("email");
194         conditionallyAddResourceProperty(blankLister, foaf.mbox, fixLink(email.value(), "mailto"));
195     }
196 
197     private void addListerFn(HTMLDocument doc, Resource blankLister) {
198         TextField fn = doc.getSingularTextField("fn");
199         conditionallyAddStringProperty(fn.source(), blankLister, hLISTING.listerName, fn.value());
200     }
201 
202     private void addListerLogo(HTMLDocument doc, Resource blankLister) throws ExtractionException {
203         TextField logo = doc.getSingularUrlField("logo");
204         conditionallyAddResourceProperty(blankLister, hLISTING.listerLogo, getHTMLDocument().resolveIRI(logo.value()));
205     }
206 
207     private void addListerOrg(HTMLDocument doc, Resource blankLister) {
208         TextField org = doc.getSingularTextField("org");
209         conditionallyAddStringProperty(org.source(), blankLister, hLISTING.listerOrg, org.value());
210     }
211 
212     private void addItemName(HTMLDocument item, Resource blankItem) {
213         HTMLDocument.TextField fn = item.getSingularTextField("fn");
214         conditionallyAddStringProperty(fn.source(), blankItem, hLISTING.itemName, fn.value());
215     }
216 
217     private void addItemUrl(HTMLDocument item, Resource blankItem) throws ExtractionException {
218         TextField url = item.getSingularUrlField("url");
219         conditionallyAddResourceProperty(blankItem, hLISTING.itemUrl, getHTMLDocument().resolveIRI(url.value()));
220     }
221 
222     private void addItemPhoto(HTMLDocument doc, Resource blankLister) throws ExtractionException {
223         // as per spec
224         String url = doc.findMicroformattedValue("*", "item", "A", "photo", "@href");
225         conditionallyAddResourceProperty(blankLister, hLISTING.itemPhoto, getHTMLDocument().resolveIRI(url));
226         url = doc.findMicroformattedValue("*", "item", "IMG", "photo", "@src");
227         conditionallyAddResourceProperty(blankLister, hLISTING.itemPhoto, getHTMLDocument().resolveIRI(url));
228         // as per kelkoo. Remember that contains(foo,'') is true in xpath
229         url = doc.findMicroformattedValue("*", "photo", "IMG", "", "@src");
230         conditionallyAddResourceProperty(blankLister, hLISTING.itemPhoto, getHTMLDocument().resolveIRI(url));
231     }
232 
233     private List<String> findActions(HTMLDocument doc) {
234         List<String> actions = new ArrayList<String>(0);
235         // first check if values are inlined
236         String[] classes = doc.readAttribute("class").split("\\s+");
237         for (String klass : classes) {
238             if (ActionClasses.contains(klass))
239                 actions.add(klass);
240         }
241 
242         for (Node action : doc.findAll("./*[@class]/@class")) {
243             for (String substring : action.getNodeValue().split("\\s+")) {
244                 if (ActionClasses.contains(substring))
245                     actions.add(substring);
246             }
247         }
248         return actions;
249     }
250 
251 }