This project has retired. For details please refer to its Attic page.
RDFMergerTest xref
View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionContext;
21  import org.apache.any23.extractor.ExtractionException;
22  import org.apache.any23.extractor.ExtractionParameters;
23  import org.apache.any23.extractor.ExtractionResultImpl;
24  import org.apache.any23.extractor.ExtractorFactory;
25  import org.apache.any23.rdf.RDFUtils;
26  import org.apache.any23.vocab.DCTerms;
27  import org.apache.any23.vocab.FOAF;
28  import org.apache.any23.vocab.Review;
29  import org.apache.any23.vocab.VCard;
30  import org.apache.any23.writer.RepositoryWriter;
31  import org.junit.Test;
32  import org.eclipse.rdf4j.model.Resource;
33  import org.eclipse.rdf4j.model.Statement;
34  import org.eclipse.rdf4j.model.Value;
35  import org.eclipse.rdf4j.model.vocabulary.OWL;
36  import org.eclipse.rdf4j.model.vocabulary.RDF;
37  import org.eclipse.rdf4j.repository.RepositoryException;
38  import org.eclipse.rdf4j.repository.RepositoryResult;
39  import org.w3c.dom.Document;
40  
41  import java.io.BufferedInputStream;
42  import java.io.IOException;
43  import java.io.InputStream;
44  import java.util.HashMap;
45  import java.util.Map;
46  
47  /**
48   * Reference Test class for various mixed extractors.
49   *
50   * @author Davide Palmisano (dpalmisano@gmail.com)
51   *
52   * @see GeoExtractor
53   * @see AdrExtractor
54   * @see HCardExtractor
55   * @see HReviewExtractor
56   */
57  public class RDFMergerTest extends AbstractExtractorTestCase {
58  
59      private static final DCTerms vDCTERMS = DCTerms.getInstance();
60      private static final FOAF vFOAF = FOAF.getInstance();
61      private static final Review vREVIEW = Review.getInstance();
62      private static final VCard vVCARD = VCard.getInstance();
63  
64      @Override
65      protected ExtractorFactory<?> getExtractorFactory() {
66          return null;
67      }
68  
69      @Test
70      public void testNoMicroformats() throws Exception, ExtractionException, IOException {
71          extract("/html/html-without-uf.html");
72          assertModelEmpty();
73      }
74  
75      @Test
76      public void test01XFNFoaf() throws Exception {
77          assertExtract("/html/mixed/01-xfn-foaf.html", false);
78          assertModelNotEmpty();
79          assertStatementsSize(RDF.TYPE, vVCARD.VCard, 1);
80          Resource vcard = findExactlyOneBlankSubject(RDF.TYPE, vVCARD.VCard);
81          RepositoryResult<Statement> statements = getStatements(null, vFOAF.topic, vcard);
82  
83          try {
84              while (statements.hasNext()) {
85                  Statement statement = statements.next();
86                  Resource person = statement.getSubject();
87                  Resource blank = findExactlyOneBlankSubject(OWL.SAMEAS, person);
88                  assertContains(blank, RDF.TYPE, vFOAF.Person);
89  
90              }
91  
92          } finally {
93              statements.close();
94          }
95      }
96  
97      @Test
98      public void testAbbrTitleEverything() throws ExtractionException, IOException, RepositoryException {
99          extractHCardAndRelated("/microformats/hcard/23-abbr-title-everything.html");
100 
101         assertContains(vVCARD.fn, "John Doe");
102         assertContains(vVCARD.nickname, "JJ");
103 
104         assertContains(vVCARD.given_name, "Jonathan");
105         assertContains(vVCARD.additional_name, "John");
106         assertContains(vVCARD.family_name, "Doe-Smith");
107         assertContains(vVCARD.honorific_suffix, "Medical Doctor");
108         assertContains(vVCARD.title, "President");
109         assertContains(vVCARD.role, "Chief");
110         assertContains(vVCARD.tz, "-0700");
111         assertContains(vVCARD.bday, "2006-04-04");
112         assertContains(vVCARD.tel, RDFUtils.iri("tel:415.555.1234"));
113         assertContains(vVCARD.uid, "abcdefghijklmnopqrstuvwxyz");
114         assertContains(vVCARD.class_, "public");
115         assertContains(vVCARD.note, "this is a note");
116         assertContains(vVCARD.organization_name, "Intellicorp");
117         assertContains(vVCARD.organization_unit, "Intelligence");
118         assertContains(RDF.TYPE, vVCARD.Location);
119         assertContains(vVCARD.geo, (Resource) null);
120         assertContains(vVCARD.latitude, "37.77");
121         assertContains(vVCARD.longitude, "-122.41");
122         assertContains(vVCARD.post_office_box, "Box 1234");
123         assertContains(vVCARD.extended_address, "Suite 100");
124         assertContains(vVCARD.street_address, "123 Fake Street");
125         assertContains(vVCARD.locality, "San Francisco");
126         assertContains(vVCARD.region, "California");
127         assertContains(vVCARD.postal_code, "12345-6789");
128         assertContains(vVCARD.country_name, "United States of America");
129         assertContains(vVCARD.addressType, "work");
130     }
131 
132     @Test
133     public void testAdr() throws Exception {
134         extractHRevAndRelated("/microformats/hcard/22-adr.html");
135 
136         assertStatementsSize(RDF.TYPE, vVCARD.Address, 4);
137 
138         Map<String, String[]> addresses = new HashMap<String, String[]>(4);
139         addresses.put("1233 Main St.",
140                 new String[] { "United States of America", "Beverly Hills", "90210", "California" });
141         addresses.put("1232 Main St.",
142                 new String[] { "United States of America", "Beverly Hills", "90210", "California" });
143         addresses.put("1234 Main St.",
144                 new String[] { "United States of America", "Beverly Hills", "90210", "California" });
145         addresses.put("1231 Main St.",
146                 new String[] { "United States of America", "Beverly Hills", "90210", "California" });
147         addresses.put("Suite 100", new String[] { "United States of America", "Beverly Hills", "90210", "California" });
148 
149         RepositoryResult<Statement> statements = getStatements(null, RDF.TYPE, vVCARD.Address);
150 
151         try {
152             while (statements.hasNext()) {
153                 Resource adr = statements.next().getSubject();
154                 RepositoryResult<Statement> innerStatements = getStatements(adr, vVCARD.street_address, null);
155                 try {
156                     while (innerStatements.hasNext()) {
157                         Value innerValue = innerStatements.next().getObject();
158                         assertContains(adr, vVCARD.country_name, addresses.get(innerValue.stringValue())[0]);
159                         assertContains(adr, vVCARD.locality, addresses.get(innerValue.stringValue())[1]);
160                         assertContains(adr, vVCARD.postal_code, addresses.get(innerValue.stringValue())[2]);
161                         assertContains(adr, vVCARD.region, addresses.get(innerValue.stringValue())[3]);
162                     }
163 
164                 } finally {
165                     innerStatements.close();
166                 }
167             }
168 
169         } finally {
170             statements.close();
171         }
172 
173         assertContains(vVCARD.post_office_box, "PO Box 1234");
174         assertContains(vVCARD.addressType, "home");
175     }
176 
177     @Test
178     public void testGeoAbbr() throws ExtractionException, IOException, RepositoryException {
179         extractHCardAndRelated("/microformats/hcard/25-geo-abbr.html");
180         assertModelNotEmpty();
181         assertContains(vVCARD.fn, "Paradise");
182         assertContains(RDF.TYPE, vVCARD.Organization);
183         assertContains(vVCARD.organization_name, "Paradise");
184         assertContains(RDF.TYPE, vVCARD.Location);
185         assertContains(vVCARD.geo, (Resource) null);
186         assertContains(vVCARD.latitude, "30.267991");
187         assertContains(vVCARD.longitude, "-97.739568");
188     }
189 
190     @Test
191     public void testAncestors() throws ExtractionException, IOException, RepositoryException {
192         extractHCardAndRelated("/microformats/hcard/26-ancestors.html");
193         assertModelNotEmpty();
194 
195         assertContains(vVCARD.fn, "John Doe");
196         assertNotContains(null, vVCARD.fn, "Mister Jonathan John Doe-Smith Medical Doctor");
197         assertContains(vVCARD.nickname, "JJ");
198         assertContains(RDF.TYPE, vVCARD.Address);
199         assertContains(vVCARD.tz, "-0700");
200         assertContains(vVCARD.title, "President");
201         assertContains(vVCARD.role, "Chief");
202         assertContains(vVCARD.organization_name, "Intellicorp");
203         assertContains(vVCARD.organization_unit, "Intelligence");
204 
205         assertContains(vVCARD.tel, RDFUtils.iri("tel:415.555.1234"));
206         assertContains(vVCARD.uid, "abcdefghijklmnopqrstuvwxyz");
207         assertContains(vVCARD.note, "this is a note");
208         assertContains(vVCARD.class_, "public");
209 
210         assertContains(RDF.TYPE, vVCARD.Location);
211         assertContains(vVCARD.geo, (Resource) null);
212         assertContains(null, vVCARD.latitude, "37.77");
213         assertContains(null, vVCARD.longitude, "-122.41");
214 
215         assertContains(RDF.TYPE, vVCARD.Name);
216         assertContains(vVCARD.additional_name, "John");
217         assertContains(vVCARD.given_name, "Jonathan");
218         assertContains(vVCARD.family_name, "Doe-Smith");
219         assertContains(vVCARD.honorific_prefix, "Mister");
220         assertContains(vVCARD.honorific_suffix, "Medical Doctor");
221 
222         assertContains(vVCARD.post_office_box, "Box 1234");
223         assertContains(vVCARD.extended_address, "Suite 100");
224         assertContains(vVCARD.street_address, "123 Fake Street");
225         assertContains(vVCARD.locality, "San Francisco");
226         assertContains(vVCARD.region, "California");
227         assertContains(vVCARD.postal_code, "12345-6789");
228         assertContains(vVCARD.country_name, "United States of America");
229         assertContains(vVCARD.addressType, "work");
230     }
231 
232     @Test
233     public void testSingleton() throws Exception {
234         extractHCardAndRelated("/microformats/hcard/37-singleton.html");
235         assertModelNotEmpty();
236         assertStatementsSize(vVCARD.fn, (Value) null, 1);
237         assertContains(vVCARD.fn, "john doe 1");
238         assertStatementsSize(RDF.TYPE, vVCARD.Name, 1);
239         assertStatementsSize(vVCARD.given_name, (Value) null, 1);
240         assertContains(vVCARD.given_name, "john");
241         assertStatementsSize(vVCARD.family_name, (Value) null, 1);
242         assertContains(vVCARD.family_name, "doe");
243         assertStatementsSize(vVCARD.sort_string, (Value) null, 1);
244         assertContains(vVCARD.sort_string, "d");
245         assertStatementsSize(vVCARD.bday, (Value) null, 1);
246         assertContains(vVCARD.bday, "20060707");
247         assertStatementsSize(vVCARD.rev, (Value) null, 1);
248         assertContains(vVCARD.rev, "20060707");
249         assertStatementsSize(vVCARD.class_, (Value) null, 1);
250         assertContains(vVCARD.class_, "public");
251         assertStatementsSize(vVCARD.tz, (Value) null, 1);
252         assertContains(vVCARD.tz, "+0600");
253         // 2 uf, one of them outside the card
254         assertStatementsSize(RDF.TYPE, vVCARD.Location, 2);
255         // one is actually used
256         assertStatementsSize(vVCARD.geo, (Value) null, 2);
257         assertContains(vVCARD.latitude, "123.45");
258         assertContains(vVCARD.longitude, "67.89");
259         assertStatementsSize(vVCARD.uid, (Value) null, 1);
260         assertContains(vVCARD.uid, "unique-id-1");
261     }
262 
263     @Test
264     public void test01Basic() throws Exception {
265         extractHRevAndRelated("/microformats/hreview/01-spec.html");
266         assertModelNotEmpty();
267 
268         assertStatementsSize(RDF.TYPE, vREVIEW.Review, 1);
269         // reviewer, item
270         assertStatementsSize(RDF.TYPE, vVCARD.VCard, 2);
271         // there is one address in the item vcard
272         assertStatementsSize(RDF.TYPE, vVCARD.Address, 1);
273 
274         RepositoryResult<Statement> reviews = getStatements(null, RDF.TYPE, vREVIEW.Review);
275 
276         try {
277             while (reviews.hasNext()) {
278                 Resource review = reviews.next().getSubject();
279                 assertContains(review, vREVIEW.rating, "5");
280                 assertContains(review, vREVIEW.title, "Crepes on Cole is awesome");
281                 assertContains(review, vDCTERMS.date, "20050418T2300-0700");
282                 assertContains(vREVIEW.text,
283                         "Crepes on Cole is one of the best little \n" + "      creperies in San Francisco.\n      "
284                                 + "Excellent food and service. Plenty of tables in a variety of sizes\n"
285                                 + "      for parties large and small.  " + "Window seating makes for excellent\n      "
286                                 + "people watching to/from the N-Judah which stops right outside.\n"
287                                 + "      I've had many fun social gatherings here, as well as gotten\n"
288                                 + "      plenty of work done thanks to neighborhood WiFi.");
289 
290                 assertContains(null, vREVIEW.hasReview, review);
291             }
292         } finally {
293             reviews.close();
294         }
295 
296         // generic checks that vcards are correct, improve
297         assertContains(vVCARD.fn, "Crepes on Cole");
298         assertContains(vVCARD.fn, "Tantek");
299         assertContains(vVCARD.locality, "San Francisco");
300         assertContains(vVCARD.organization_name, "Crepes on Cole");
301 
302     }
303 
304     @Test
305     public void test02RatedTags() throws Exception {
306         extractHRevAndRelated("/microformats/hreview/02-spec-2.html");
307 
308         assertStatementsSize(vREVIEW.reviewer, (Value) null, 1);
309         assertStatementsSize(vREVIEW.hasReview, (Value) null, 1);
310         assertModelNotEmpty();
311         assertStatementsSize(RDF.TYPE, vREVIEW.Review, 1);
312         // reviewer, item
313         assertStatementsSize(RDF.TYPE, vVCARD.VCard, 2);
314         // there is one address in the item vcard
315         assertStatementsSize(RDF.TYPE, vVCARD.Address, 1);
316 
317         RepositoryResult<Statement> reviews = getStatements(null, RDF.TYPE, vREVIEW.Review);
318 
319         try {
320             while (reviews.hasNext()) {
321                 Resource review = reviews.next().getSubject();
322                 assertContains(review, vREVIEW.rating, "18");
323                 assertContains(review, vREVIEW.title, "Cafe Borrone");
324                 assertContains(review, vDCTERMS.date, "20050428T2130-0700");
325                 assertContains(null, vREVIEW.hasReview, review);
326                 assertContains(vREVIEW.type, "business");
327             }
328 
329         } finally {
330             reviews.close();
331         }
332 
333         // generic checks that vcards are correct, improve
334         assertContains(vVCARD.fn, "Cafe Borrone");
335         assertContains(vVCARD.fn, "anonymous");
336         assertContains(vVCARD.organization_name, "Cafe Borrone");
337 
338     }
339 
340     @Test
341     public void test03NoHcardForItem() throws Exception {
342         extractHRevAndRelated("/microformats/hreview/03-spec-3.html");
343 
344         assertModelNotEmpty();
345         assertStatementsSize(RDF.TYPE, vREVIEW.Review, 1);
346         assertStatementsSize(RDF.TYPE, vVCARD.VCard, 1);
347 
348         RepositoryResult<Statement> reviews = getStatements(null, RDF.TYPE, vREVIEW.Review);
349 
350         try {
351             while (reviews.hasNext()) {
352                 Resource review = reviews.next().getSubject();
353                 assertContains(review, vREVIEW.rating, "5");
354                 assertNotContains(vREVIEW.title, null);
355                 assertContains(review, vDCTERMS.date, "200502");
356 
357                 assertContains(vREVIEW.text,
358                         "\"The people thought they were just being rewarded for "
359                                 + "treating others\n       as they like to be treated, for "
360                                 + "obeying stop signs and curing diseases,\n       for mailing "
361                                 + "letters with the address of the sender... Don't wake me,\n "
362                                 + "      I plan on sleeping in...\"\n     \n     \"Nothing Better\""
363                                 + " is a great track on this album, too...");
364 
365                 RepositoryResult<Statement> whatHasAReview = getStatements(null, vREVIEW.hasReview, review);
366 
367                 try {
368                     while (whatHasAReview.hasNext()) {
369                         Resource subject = whatHasAReview.next().getSubject();
370                         assertContains(subject, vVCARD.fn, "The Postal Service: Give Up");
371                         assertContains(subject, vVCARD.url,
372                                 RDFUtils.iri("http://www.amazon.com/exec/obidos/ASIN/B000089CJI/"));
373                         assertContains(subject, vVCARD.photo,
374                                 RDFUtils.iri("http://images.amazon.com/images/P/B000089CJI.01._SCTHUMBZZZ_.jpg"));
375                     }
376 
377                 } finally {
378                     whatHasAReview.close();
379                 }
380 
381             }
382 
383         } finally {
384             reviews.close();
385         }
386 
387         assertContains(vVCARD.fn, "Adam Rifkin");
388         assertContains(vVCARD.url, RDFUtils.iri("http://ifindkarma.com/blog/"));
389     }
390 
391     @Override
392     protected void extract(String filename) throws ExtractionException, IOException {
393 
394         InputStream input = new BufferedInputStream(this.getClass().getResourceAsStream(filename));
395 
396         Document document = new TagSoupParser(input, baseIRI.stringValue()).getDOM();
397         HCardExtractor hCardExtractor = new HCardExtractorFactory().createExtractor();
398         ExtractionContext hcExtractionContext = new ExtractionContext(
399                 hCardExtractor.getDescription().getExtractorName(), baseIRI);
400         hCardExtractor.run(ExtractionParameters.newDefault(), hcExtractionContext, document,
401                 new ExtractionResultImpl(hcExtractionContext, hCardExtractor, new RepositoryWriter(getConnection())));
402         XFNExtractor xfnExtractor = new XFNExtractorFactory().createExtractor();
403         ExtractionContext xfnExtractionContext = new ExtractionContext(xfnExtractor.getDescription().getExtractorName(),
404                 baseIRI);
405         xfnExtractor.run(ExtractionParameters.newDefault(), xfnExtractionContext, document,
406                 new ExtractionResultImpl(xfnExtractionContext, hCardExtractor, new RepositoryWriter(getConnection())));
407     }
408 
409     private void extractHCardAndRelated(String filename) throws IOException, ExtractionException {
410 
411         InputStream input = new BufferedInputStream(this.getClass().getResourceAsStream(filename));
412 
413         Document document = new TagSoupParser(input, baseIRI.stringValue()).getDOM();
414         HCardExtractor hCardExtractor = new HCardExtractorFactory().createExtractor();
415         ExtractionContext hCardExtractionContext = new ExtractionContext(
416                 hCardExtractor.getDescription().getExtractorName(), baseIRI);
417         hCardExtractor.run(ExtractionParameters.newDefault(), hCardExtractionContext, document,
418                 new ExtractionResultImpl(hCardExtractionContext, hCardExtractor,
419                         new RepositoryWriter(getConnection())));
420 
421         GeoExtractor geoExtractor = new GeoExtractorFactory().createExtractor();
422         ExtractionContext geoExtractionContext = new ExtractionContext(geoExtractor.getDescription().getExtractorName(),
423                 baseIRI);
424         geoExtractor.run(ExtractionParameters.newDefault(), geoExtractionContext, document,
425                 new ExtractionResultImpl(geoExtractionContext, geoExtractor, new RepositoryWriter(getConnection())));
426 
427         AdrExtractor adrExtractor = new AdrExtractorFactory().createExtractor();
428         ExtractionContext adrExtractionContext = new ExtractionContext(adrExtractor.getDescription().getExtractorName(),
429                 baseIRI);
430         adrExtractor.run(ExtractionParameters.newDefault(), adrExtractionContext, document,
431                 new ExtractionResultImpl(adrExtractionContext, adrExtractor, new RepositoryWriter(getConnection())));
432 
433     }
434 
435     private void extractHRevAndRelated(String filename) throws ExtractionException, IOException {
436         extractHCardAndRelated(filename);
437         InputStream input = new BufferedInputStream(this.getClass().getResourceAsStream(filename));
438         Document document = new TagSoupParser(input, baseIRI.stringValue()).getDOM();
439         HReviewExtractor hReviewExtractor = new HReviewExtractorFactory().createExtractor();
440         ExtractionContext hreviewExtractionContext = new ExtractionContext(
441                 hReviewExtractor.getDescription().getExtractorName(), baseIRI);
442         hReviewExtractor.run(ExtractionParameters.newDefault(), hreviewExtractionContext, document,
443                 new ExtractionResultImpl(hreviewExtractionContext, hReviewExtractor,
444                         new RepositoryWriter(getConnection())));
445     }
446 
447 }