This project has been retired. For details, please refer to its
Attic page.
RDFMergerTest xref
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.extractor.ExtractionContext;
21 import org.apache.any23.extractor.ExtractionException;
22 import org.apache.any23.extractor.ExtractionParameters;
23 import org.apache.any23.extractor.ExtractionResultImpl;
24 import org.apache.any23.extractor.ExtractorFactory;
25 import org.apache.any23.rdf.RDFUtils;
26 import org.apache.any23.vocab.DCTerms;
27 import org.apache.any23.vocab.FOAF;
28 import org.apache.any23.vocab.Review;
29 import org.apache.any23.vocab.VCard;
30 import org.apache.any23.writer.RepositoryWriter;
31 import org.junit.Test;
32 import org.eclipse.rdf4j.model.Resource;
33 import org.eclipse.rdf4j.model.Statement;
34 import org.eclipse.rdf4j.model.Value;
35 import org.eclipse.rdf4j.model.vocabulary.OWL;
36 import org.eclipse.rdf4j.model.vocabulary.RDF;
37 import org.eclipse.rdf4j.repository.RepositoryException;
38 import org.eclipse.rdf4j.repository.RepositoryResult;
39 import org.w3c.dom.Document;
40
41 import java.io.BufferedInputStream;
42 import java.io.IOException;
43 import java.io.InputStream;
44 import java.util.HashMap;
45 import java.util.Map;
46
47
48
49
50
51
52
53
54
55
56
57 public class RDFMergerTest extends AbstractExtractorTestCase {
58
59 private static final DCTerms vDCTERMS = DCTerms.getInstance();
60 private static final FOAF vFOAF = FOAF.getInstance();
61 private static final Review vREVIEW = Review.getInstance();
62 private static final VCard vVCARD = VCard.getInstance();
63
64 @Override
65 protected ExtractorFactory<?> getExtractorFactory() {
66 return null;
67 }
68
69 @Test
70 public void testNoMicroformats() throws Exception, ExtractionException, IOException {
71 extract("/html/html-without-uf.html");
72 assertModelEmpty();
73 }
74
75 @Test
76 public void test01XFNFoaf() throws Exception {
77 assertExtract("/html/mixed/01-xfn-foaf.html", false);
78 assertModelNotEmpty();
79 assertStatementsSize(RDF.TYPE, vVCARD.VCard, 1);
80 Resource vcard = findExactlyOneBlankSubject(RDF.TYPE, vVCARD.VCard);
81 RepositoryResult<Statement> statements = getStatements(null, vFOAF.topic, vcard);
82
83 try {
84 while (statements.hasNext()) {
85 Statement statement = statements.next();
86 Resource person = statement.getSubject();
87 Resource blank = findExactlyOneBlankSubject(OWL.SAMEAS, person);
88 assertContains(blank, RDF.TYPE, vFOAF.Person);
89
90 }
91
92 } finally {
93 statements.close();
94 }
95 }
96
97 @Test
98 public void testAbbrTitleEverything() throws ExtractionException, IOException, RepositoryException {
99 extractHCardAndRelated("/microformats/hcard/23-abbr-title-everything.html");
100
101 assertContains(vVCARD.fn, "John Doe");
102 assertContains(vVCARD.nickname, "JJ");
103
104 assertContains(vVCARD.given_name, "Jonathan");
105 assertContains(vVCARD.additional_name, "John");
106 assertContains(vVCARD.family_name, "Doe-Smith");
107 assertContains(vVCARD.honorific_suffix, "Medical Doctor");
108 assertContains(vVCARD.title, "President");
109 assertContains(vVCARD.role, "Chief");
110 assertContains(vVCARD.tz, "-0700");
111 assertContains(vVCARD.bday, "2006-04-04");
112 assertContains(vVCARD.tel, RDFUtils.iri("tel:415.555.1234"));
113 assertContains(vVCARD.uid, "abcdefghijklmnopqrstuvwxyz");
114 assertContains(vVCARD.class_, "public");
115 assertContains(vVCARD.note, "this is a note");
116 assertContains(vVCARD.organization_name, "Intellicorp");
117 assertContains(vVCARD.organization_unit, "Intelligence");
118 assertContains(RDF.TYPE, vVCARD.Location);
119 assertContains(vVCARD.geo, (Resource) null);
120 assertContains(vVCARD.latitude, "37.77");
121 assertContains(vVCARD.longitude, "-122.41");
122 assertContains(vVCARD.post_office_box, "Box 1234");
123 assertContains(vVCARD.extended_address, "Suite 100");
124 assertContains(vVCARD.street_address, "123 Fake Street");
125 assertContains(vVCARD.locality, "San Francisco");
126 assertContains(vVCARD.region, "California");
127 assertContains(vVCARD.postal_code, "12345-6789");
128 assertContains(vVCARD.country_name, "United States of America");
129 assertContains(vVCARD.addressType, "work");
130 }
131
132 @Test
133 public void testAdr() throws Exception {
134 extractHRevAndRelated("/microformats/hcard/22-adr.html");
135
136 assertStatementsSize(RDF.TYPE, vVCARD.Address, 4);
137
138 Map<String, String[]> addresses = new HashMap<String, String[]>(4);
139 addresses.put("1233 Main St.",
140 new String[] { "United States of America", "Beverly Hills", "90210", "California" });
141 addresses.put("1232 Main St.",
142 new String[] { "United States of America", "Beverly Hills", "90210", "California" });
143 addresses.put("1234 Main St.",
144 new String[] { "United States of America", "Beverly Hills", "90210", "California" });
145 addresses.put("1231 Main St.",
146 new String[] { "United States of America", "Beverly Hills", "90210", "California" });
147 addresses.put("Suite 100", new String[] { "United States of America", "Beverly Hills", "90210", "California" });
148
149 RepositoryResult<Statement> statements = getStatements(null, RDF.TYPE, vVCARD.Address);
150
151 try {
152 while (statements.hasNext()) {
153 Resource adr = statements.next().getSubject();
154 RepositoryResult<Statement> innerStatements = getStatements(adr, vVCARD.street_address, null);
155 try {
156 while (innerStatements.hasNext()) {
157 Value innerValue = innerStatements.next().getObject();
158 assertContains(adr, vVCARD.country_name, addresses.get(innerValue.stringValue())[0]);
159 assertContains(adr, vVCARD.locality, addresses.get(innerValue.stringValue())[1]);
160 assertContains(adr, vVCARD.postal_code, addresses.get(innerValue.stringValue())[2]);
161 assertContains(adr, vVCARD.region, addresses.get(innerValue.stringValue())[3]);
162 }
163
164 } finally {
165 innerStatements.close();
166 }
167 }
168
169 } finally {
170 statements.close();
171 }
172
173 assertContains(vVCARD.post_office_box, "PO Box 1234");
174 assertContains(vVCARD.addressType, "home");
175 }
176
177 @Test
178 public void testGeoAbbr() throws ExtractionException, IOException, RepositoryException {
179 extractHCardAndRelated("/microformats/hcard/25-geo-abbr.html");
180 assertModelNotEmpty();
181 assertContains(vVCARD.fn, "Paradise");
182 assertContains(RDF.TYPE, vVCARD.Organization);
183 assertContains(vVCARD.organization_name, "Paradise");
184 assertContains(RDF.TYPE, vVCARD.Location);
185 assertContains(vVCARD.geo, (Resource) null);
186 assertContains(vVCARD.latitude, "30.267991");
187 assertContains(vVCARD.longitude, "-97.739568");
188 }
189
190 @Test
191 public void testAncestors() throws ExtractionException, IOException, RepositoryException {
192 extractHCardAndRelated("/microformats/hcard/26-ancestors.html");
193 assertModelNotEmpty();
194
195 assertContains(vVCARD.fn, "John Doe");
196 assertNotContains(null, vVCARD.fn, "Mister Jonathan John Doe-Smith Medical Doctor");
197 assertContains(vVCARD.nickname, "JJ");
198 assertContains(RDF.TYPE, vVCARD.Address);
199 assertContains(vVCARD.tz, "-0700");
200 assertContains(vVCARD.title, "President");
201 assertContains(vVCARD.role, "Chief");
202 assertContains(vVCARD.organization_name, "Intellicorp");
203 assertContains(vVCARD.organization_unit, "Intelligence");
204
205 assertContains(vVCARD.tel, RDFUtils.iri("tel:415.555.1234"));
206 assertContains(vVCARD.uid, "abcdefghijklmnopqrstuvwxyz");
207 assertContains(vVCARD.note, "this is a note");
208 assertContains(vVCARD.class_, "public");
209
210 assertContains(RDF.TYPE, vVCARD.Location);
211 assertContains(vVCARD.geo, (Resource) null);
212 assertContains(null, vVCARD.latitude, "37.77");
213 assertContains(null, vVCARD.longitude, "-122.41");
214
215 assertContains(RDF.TYPE, vVCARD.Name);
216 assertContains(vVCARD.additional_name, "John");
217 assertContains(vVCARD.given_name, "Jonathan");
218 assertContains(vVCARD.family_name, "Doe-Smith");
219 assertContains(vVCARD.honorific_prefix, "Mister");
220 assertContains(vVCARD.honorific_suffix, "Medical Doctor");
221
222 assertContains(vVCARD.post_office_box, "Box 1234");
223 assertContains(vVCARD.extended_address, "Suite 100");
224 assertContains(vVCARD.street_address, "123 Fake Street");
225 assertContains(vVCARD.locality, "San Francisco");
226 assertContains(vVCARD.region, "California");
227 assertContains(vVCARD.postal_code, "12345-6789");
228 assertContains(vVCARD.country_name, "United States of America");
229 assertContains(vVCARD.addressType, "work");
230 }
231
232 @Test
233 public void testSingleton() throws Exception {
234 extractHCardAndRelated("/microformats/hcard/37-singleton.html");
235 assertModelNotEmpty();
236 assertStatementsSize(vVCARD.fn, (Value) null, 1);
237 assertContains(vVCARD.fn, "john doe 1");
238 assertStatementsSize(RDF.TYPE, vVCARD.Name, 1);
239 assertStatementsSize(vVCARD.given_name, (Value) null, 1);
240 assertContains(vVCARD.given_name, "john");
241 assertStatementsSize(vVCARD.family_name, (Value) null, 1);
242 assertContains(vVCARD.family_name, "doe");
243 assertStatementsSize(vVCARD.sort_string, (Value) null, 1);
244 assertContains(vVCARD.sort_string, "d");
245 assertStatementsSize(vVCARD.bday, (Value) null, 1);
246 assertContains(vVCARD.bday, "20060707");
247 assertStatementsSize(vVCARD.rev, (Value) null, 1);
248 assertContains(vVCARD.rev, "20060707");
249 assertStatementsSize(vVCARD.class_, (Value) null, 1);
250 assertContains(vVCARD.class_, "public");
251 assertStatementsSize(vVCARD.tz, (Value) null, 1);
252 assertContains(vVCARD.tz, "+0600");
253
254 assertStatementsSize(RDF.TYPE, vVCARD.Location, 2);
255
256 assertStatementsSize(vVCARD.geo, (Value) null, 2);
257 assertContains(vVCARD.latitude, "123.45");
258 assertContains(vVCARD.longitude, "67.89");
259 assertStatementsSize(vVCARD.uid, (Value) null, 1);
260 assertContains(vVCARD.uid, "unique-id-1");
261 }
262
263 @Test
264 public void test01Basic() throws Exception {
265 extractHRevAndRelated("/microformats/hreview/01-spec.html");
266 assertModelNotEmpty();
267
268 assertStatementsSize(RDF.TYPE, vREVIEW.Review, 1);
269
270 assertStatementsSize(RDF.TYPE, vVCARD.VCard, 2);
271
272 assertStatementsSize(RDF.TYPE, vVCARD.Address, 1);
273
274 RepositoryResult<Statement> reviews = getStatements(null, RDF.TYPE, vREVIEW.Review);
275
276 try {
277 while (reviews.hasNext()) {
278 Resource review = reviews.next().getSubject();
279 assertContains(review, vREVIEW.rating, "5");
280 assertContains(review, vREVIEW.title, "Crepes on Cole is awesome");
281 assertContains(review, vDCTERMS.date, "20050418T2300-0700");
282 assertContains(vREVIEW.text,
283 "Crepes on Cole is one of the best little \n" + " creperies in San Francisco.\n "
284 + "Excellent food and service. Plenty of tables in a variety of sizes\n"
285 + " for parties large and small. " + "Window seating makes for excellent\n "
286 + "people watching to/from the N-Judah which stops right outside.\n"
287 + " I've had many fun social gatherings here, as well as gotten\n"
288 + " plenty of work done thanks to neighborhood WiFi.");
289
290 assertContains(null, vREVIEW.hasReview, review);
291 }
292 } finally {
293 reviews.close();
294 }
295
296
297 assertContains(vVCARD.fn, "Crepes on Cole");
298 assertContains(vVCARD.fn, "Tantek");
299 assertContains(vVCARD.locality, "San Francisco");
300 assertContains(vVCARD.organization_name, "Crepes on Cole");
301
302 }
303
304 @Test
305 public void test02RatedTags() throws Exception {
306 extractHRevAndRelated("/microformats/hreview/02-spec-2.html");
307
308 assertStatementsSize(vREVIEW.reviewer, (Value) null, 1);
309 assertStatementsSize(vREVIEW.hasReview, (Value) null, 1);
310 assertModelNotEmpty();
311 assertStatementsSize(RDF.TYPE, vREVIEW.Review, 1);
312
313 assertStatementsSize(RDF.TYPE, vVCARD.VCard, 2);
314
315 assertStatementsSize(RDF.TYPE, vVCARD.Address, 1);
316
317 RepositoryResult<Statement> reviews = getStatements(null, RDF.TYPE, vREVIEW.Review);
318
319 try {
320 while (reviews.hasNext()) {
321 Resource review = reviews.next().getSubject();
322 assertContains(review, vREVIEW.rating, "18");
323 assertContains(review, vREVIEW.title, "Cafe Borrone");
324 assertContains(review, vDCTERMS.date, "20050428T2130-0700");
325 assertContains(null, vREVIEW.hasReview, review);
326 assertContains(vREVIEW.type, "business");
327 }
328
329 } finally {
330 reviews.close();
331 }
332
333
334 assertContains(vVCARD.fn, "Cafe Borrone");
335 assertContains(vVCARD.fn, "anonymous");
336 assertContains(vVCARD.organization_name, "Cafe Borrone");
337
338 }
339
340 @Test
341 public void test03NoHcardForItem() throws Exception {
342 extractHRevAndRelated("/microformats/hreview/03-spec-3.html");
343
344 assertModelNotEmpty();
345 assertStatementsSize(RDF.TYPE, vREVIEW.Review, 1);
346 assertStatementsSize(RDF.TYPE, vVCARD.VCard, 1);
347
348 RepositoryResult<Statement> reviews = getStatements(null, RDF.TYPE, vREVIEW.Review);
349
350 try {
351 while (reviews.hasNext()) {
352 Resource review = reviews.next().getSubject();
353 assertContains(review, vREVIEW.rating, "5");
354 assertNotContains(vREVIEW.title, null);
355 assertContains(review, vDCTERMS.date, "200502");
356
357 assertContains(vREVIEW.text,
358 "\"The people thought they were just being rewarded for "
359 + "treating others\n as they like to be treated, for "
360 + "obeying stop signs and curing diseases,\n for mailing "
361 + "letters with the address of the sender... Don't wake me,\n "
362 + " I plan on sleeping in...\"\n \n \"Nothing Better\""
363 + " is a great track on this album, too...");
364
365 RepositoryResult<Statement> whatHasAReview = getStatements(null, vREVIEW.hasReview, review);
366
367 try {
368 while (whatHasAReview.hasNext()) {
369 Resource subject = whatHasAReview.next().getSubject();
370 assertContains(subject, vVCARD.fn, "The Postal Service: Give Up");
371 assertContains(subject, vVCARD.url,
372 RDFUtils.iri("http://www.amazon.com/exec/obidos/ASIN/B000089CJI/"));
373 assertContains(subject, vVCARD.photo,
374 RDFUtils.iri("http://images.amazon.com/images/P/B000089CJI.01._SCTHUMBZZZ_.jpg"));
375 }
376
377 } finally {
378 whatHasAReview.close();
379 }
380
381 }
382
383 } finally {
384 reviews.close();
385 }
386
387 assertContains(vVCARD.fn, "Adam Rifkin");
388 assertContains(vVCARD.url, RDFUtils.iri("http://ifindkarma.com/blog/"));
389 }
390
391 @Override
392 protected void extract(String filename) throws ExtractionException, IOException {
393
394 InputStream input = new BufferedInputStream(this.getClass().getResourceAsStream(filename));
395
396 Document document = new TagSoupParser(input, baseIRI.stringValue()).getDOM();
397 HCardExtractor hCardExtractor = new HCardExtractorFactory().createExtractor();
398 ExtractionContext hcExtractionContext = new ExtractionContext(
399 hCardExtractor.getDescription().getExtractorName(), baseIRI);
400 hCardExtractor.run(ExtractionParameters.newDefault(), hcExtractionContext, document,
401 new ExtractionResultImpl(hcExtractionContext, hCardExtractor, new RepositoryWriter(getConnection())));
402 XFNExtractor xfnExtractor = new XFNExtractorFactory().createExtractor();
403 ExtractionContext xfnExtractionContext = new ExtractionContext(xfnExtractor.getDescription().getExtractorName(),
404 baseIRI);
405 xfnExtractor.run(ExtractionParameters.newDefault(), xfnExtractionContext, document,
406 new ExtractionResultImpl(xfnExtractionContext, hCardExtractor, new RepositoryWriter(getConnection())));
407 }
408
409 private void extractHCardAndRelated(String filename) throws IOException, ExtractionException {
410
411 InputStream input = new BufferedInputStream(this.getClass().getResourceAsStream(filename));
412
413 Document document = new TagSoupParser(input, baseIRI.stringValue()).getDOM();
414 HCardExtractor hCardExtractor = new HCardExtractorFactory().createExtractor();
415 ExtractionContext hCardExtractionContext = new ExtractionContext(
416 hCardExtractor.getDescription().getExtractorName(), baseIRI);
417 hCardExtractor.run(ExtractionParameters.newDefault(), hCardExtractionContext, document,
418 new ExtractionResultImpl(hCardExtractionContext, hCardExtractor,
419 new RepositoryWriter(getConnection())));
420
421 GeoExtractor geoExtractor = new GeoExtractorFactory().createExtractor();
422 ExtractionContext geoExtractionContext = new ExtractionContext(geoExtractor.getDescription().getExtractorName(),
423 baseIRI);
424 geoExtractor.run(ExtractionParameters.newDefault(), geoExtractionContext, document,
425 new ExtractionResultImpl(geoExtractionContext, geoExtractor, new RepositoryWriter(getConnection())));
426
427 AdrExtractor adrExtractor = new AdrExtractorFactory().createExtractor();
428 ExtractionContext adrExtractionContext = new ExtractionContext(adrExtractor.getDescription().getExtractorName(),
429 baseIRI);
430 adrExtractor.run(ExtractionParameters.newDefault(), adrExtractionContext, document,
431 new ExtractionResultImpl(adrExtractionContext, adrExtractor, new RepositoryWriter(getConnection())));
432
433 }
434
435 private void extractHRevAndRelated(String filename) throws ExtractionException, IOException {
436 extractHCardAndRelated(filename);
437 InputStream input = new BufferedInputStream(this.getClass().getResourceAsStream(filename));
438 Document document = new TagSoupParser(input, baseIRI.stringValue()).getDOM();
439 HReviewExtractor hReviewExtractor = new HReviewExtractorFactory().createExtractor();
440 ExtractionContext hreviewExtractionContext = new ExtractionContext(
441 hReviewExtractor.getDescription().getExtractorName(), baseIRI);
442 hReviewExtractor.run(ExtractionParameters.newDefault(), hreviewExtractionContext, document,
443 new ExtractionResultImpl(hreviewExtractionContext, hReviewExtractor,
444 new RepositoryWriter(getConnection())));
445 }
446
447 }