This project has retired. For details please refer to its
Attic page.
MicrodataExtractorTest xref
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.microdata;
19
20 import org.apache.any23.Any23;
21 import org.apache.any23.Any23OnlineTestBase;
22 import org.apache.any23.configuration.DefaultConfiguration;
23 import org.apache.any23.configuration.ModifiableConfiguration;
24 import org.apache.any23.extractor.ExtractionException;
25 import org.apache.any23.extractor.ExtractorFactory;
26 import org.apache.any23.extractor.IssueReport;
27 import org.apache.any23.extractor.html.AbstractExtractorTestCase;
28 import org.apache.any23.extractor.rdf.TurtleExtractorFactory;
29 import org.apache.any23.rdf.RDFUtils;
30 import org.apache.any23.source.DocumentSource;
31 import org.apache.any23.source.HTTPDocumentSource;
32 import org.apache.any23.writer.TripleWriterHandler;
33 import org.eclipse.rdf4j.model.IRI;
34 import org.eclipse.rdf4j.model.Model;
35 import org.eclipse.rdf4j.model.Value;
36 import org.eclipse.rdf4j.model.Literal;
37 import org.eclipse.rdf4j.model.Resource;
38 import org.eclipse.rdf4j.model.impl.TreeModel;
39 import org.eclipse.rdf4j.model.util.Models;
40 import org.eclipse.rdf4j.model.vocabulary.RDF;
41 import org.eclipse.rdf4j.model.vocabulary.RDFS;
42 import org.slf4j.Logger;
43 import org.slf4j.LoggerFactory;
44 import org.junit.Assert;
45 import org.junit.Test;
46 import org.eclipse.rdf4j.model.BNode;
47 import org.eclipse.rdf4j.model.Statement;
48 import org.eclipse.rdf4j.repository.RepositoryException;
49 import org.eclipse.rdf4j.rio.RDFFormat;
50 import org.eclipse.rdf4j.rio.RDFHandler;
51 import org.eclipse.rdf4j.rio.RDFHandlerException;
52 import org.eclipse.rdf4j.rio.RDFParseException;
53 import org.eclipse.rdf4j.rio.RDFParser;
54 import org.eclipse.rdf4j.rio.Rio;
55
56 import java.io.File;
57 import java.io.FileReader;
58 import java.io.IOException;
59 import java.nio.charset.StandardCharsets;
60 import java.util.ArrayDeque;
61 import java.util.ArrayList;
62 import java.util.Arrays;
63 import java.util.Collections;
64 import java.util.HashMap;
65 import java.util.List;
66 import java.util.Map;
67 import java.util.TreeMap;
68 import java.util.concurrent.atomic.AtomicInteger;
69
70
71
72
73
74
75 public class MicrodataExtractorTest extends AbstractExtractorTestCase {
76
77 private static final Logger logger = LoggerFactory.getLogger(MicrodataExtractorTest.class);
78
79 @Override
80 protected ExtractorFactory<?> getExtractorFactory() {
81 return new MicrodataExtractorFactory();
82 }
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98 @Test
99 public void testSchemaOrgNestedProps()
100 throws RepositoryException, RDFHandlerException, IOException, RDFParseException, ExtractionException {
101 extractAndVerifyAgainstNQuads("microdata-nested.html", "microdata-nested-expected.nquads");
102 logger.debug(dumpModelToNQuads());
103 }
104
105 @Test
106 public void testUnusedItemprop() {
107
108 assertExtract("/microdata/unused-itemprop.html");
109 assertContains(null, RDF.TYPE, RDFUtils.iri("http://schema.org/Offer"));
110 }
111
112 @Test
113 public void testExample2() {
114
115 assertExtract("/microdata/example2.html");
116 assertContains(null, RDF.TYPE, RDFUtils.iri("http://microformats.org/profile/hcard"));
117 assertContains(null, RDFUtils.iri("http://microformats.org/profile/hcard#given-name"), (Value) null);
118 assertContains(null, RDFUtils.iri("http://microformats.org/profile/hcard#n"), (Value) null);
119 }
120
121 @Test
122 public void testExample5() {
123
124 assertExtract("/microdata/example5.html");
125 assertContains(null, RDF.TYPE, RDFUtils.iri("http://schema.org/Person"));
126 assertContains(null, RDF.TYPE, RDFUtils.iri("http://xmlns.com/foaf/0.1/Person"));
127 assertContains(null, RDFUtils.iri("http://schema.org/additionalType"),
128 RDFUtils.iri("http://xmlns.com/foaf/0.1/Person"));
129 assertContains(null, RDFUtils.iri("http://schema.org/email"), RDFUtils.iri("mailto:mail@gmail.com"));
130 assertContains(null, RDFUtils.iri("http://xmlns.com/foaf/0.1/mbox"), RDFUtils.iri("mailto:mail@gmail.com"));
131 }
132
133 private static final List<String> ignoredOnlineTestNames = Arrays.asList("Test 0073",
134
135 "Test 0074"
136 );
137
138 private static Any23 createRunner(String extractorName) {
139 ModifiableConfiguration config = DefaultConfiguration.copy();
140 config.setProperty("any23.microdata.strict", DefaultConfiguration.FLAG_PROPERTY_ON);
141 Any23 runner = new Any23(config, extractorName);
142 runner.setHTTPUserAgent("apache-any23-test-user-agent");
143 return runner;
144 }
145
146 @Test
147 public void runOnlineTests() throws Exception {
148
149 Any23OnlineTestBase.assumeOnlineAllowed();
150
151 Any23 ttlRunner = createRunner(TurtleExtractorFactory.NAME);
152 DocumentSource source = new HTTPDocumentSource(ttlRunner.getHTTPClient(),
153 "https://w3c.github.io/microdata-rdf/tests/manifest.ttl");
154 HashMap<Resource, HashMap<IRI, ArrayDeque<Value>>> map = new HashMap<>(256);
155 ttlRunner.extract(source, new TripleWriterHandler() {
156 public void writeTriple(Resource s, IRI p, Value o, Resource g) {
157 map.computeIfAbsent(s, k -> new HashMap<>()).computeIfAbsent(p, k -> new ArrayDeque<>()).add(o);
158 }
159
160 public void writeNamespace(String prefix, String uri) {
161 }
162
163 public void close() {
164 }
165 });
166
167 Assert.assertFalse(map.isEmpty());
168
169 final IRI actionPred = RDFUtils.iri("http://www.w3.org/2001/sw/DataAccess/tests/test-manifest#action");
170 final IRI resultPred = RDFUtils.iri("http://www.w3.org/2001/sw/DataAccess/tests/test-manifest#result");
171 final IRI namePred = RDFUtils.iri("http://www.w3.org/2001/sw/DataAccess/tests/test-manifest#name");
172
173 AtomicInteger passedTests = new AtomicInteger();
174 AtomicInteger ignoredTests = new AtomicInteger();
175 Map<String, String> failedTests = Collections.synchronizedMap(new TreeMap<>());
176
177 map.values().parallelStream().forEach(item -> {
178 ArrayDeque<Value> types = item.get(RDF.TYPE);
179 if (types == null) {
180 return;
181 }
182 boolean positive;
183 label: {
184 for (Value type : types) {
185 if (type.stringValue().startsWith("http://www.w3.org/ns/rdftest#TestMicrodataNegative")) {
186 positive = false;
187 break label;
188 } else if (type.stringValue().startsWith("http://www.w3.org/ns/rdftest#TestMicrodata")) {
189 positive = true;
190 break label;
191 }
192 }
193 return;
194 }
195 IRI action = (IRI) item.get(actionPred).pop();
196 IRI result = (IRI) (item.containsKey(resultPred) ? item.get(resultPred).pop() : null);
197 String name = ((Literal) item.get(namePred).pop()).getLabel();
198 if (ignoredOnlineTestNames.contains(name)) {
199 ignoredTests.incrementAndGet();
200 return;
201 }
202 try {
203 name += ": " + ((Literal) item.get(RDFS.COMMENT).pop()).getLabel();
204 TreeModel actual = new TreeModel();
205 createRunner(MicrodataExtractorFactory.NAME).extract(action.stringValue(), new TripleWriterHandler() {
206 public void writeTriple(Resource s, IRI p, Value o, Resource g) {
207 if (MicrodataExtractor.MICRODATA_ITEM.equals(p))
208 return;
209 actual.add(s, p, o);
210 }
211
212 public void writeNamespace(String prefix, String uri) {
213 }
214
215 public void close() {
216 }
217 });
218
219 TreeModel expected = new TreeModel();
220 if (result != null) {
221 createRunner(TurtleExtractorFactory.NAME).extract(result.stringValue(), new TripleWriterHandler() {
222 public void writeTriple(Resource s, IRI p, Value o, Resource g) {
223
224
225 if (o instanceof IRI
226 && o.stringValue().equals("http://w3c.github.io/author/jd_salinger.html")) {
227 o = RDFUtils.iri("https://w3c.github.io/author/jd_salinger.html");
228 }
229
230 expected.add(s, p, o);
231 }
232
233 public void writeNamespace(String prefix, String uri) {
234 }
235
236 public void close() {
237 }
238 });
239 }
240
241 boolean testPassed = positive == Models.isomorphic(expected, actual);
242 if (testPassed) {
243 passedTests.incrementAndGet();
244 } else {
245 StringBuilder error = new StringBuilder("\n" + name + "\n");
246 error.append(action).append(positive ? " ==> " : " =/=> ").append(result).append("\n");
247
248 HashMap<Value, String> m = new HashMap<>();
249 AtomicInteger i = new AtomicInteger();
250 int match = 0;
251 for (Statement st : expected) {
252 Resource s = st.getSubject();
253 Value o = st.getObject();
254
255 if (actual.stream().noneMatch(t -> st.getPredicate().equals(t.getPredicate())
256 && (s instanceof BNode ? t.getSubject() instanceof BNode : s.equals(t.getSubject()))
257 && (o instanceof BNode ? t.getObject() instanceof BNode : o.equals(t.getObject())))) {
258 if (positive) {
259 Object sstr = s instanceof BNode ? m.computeIfAbsent(s, k -> "_:" + i.getAndIncrement())
260 : s;
261 Object ostr = o instanceof BNode ? m.computeIfAbsent(o, k -> "_:" + i.getAndIncrement())
262 : o;
263 error.append("EXPECT: ").append(sstr).append(" ").append(st.getPredicate()).append(" ")
264 .append(ostr).append("\n");
265 }
266 } else {
267 match++;
268 }
269 }
270 error.append("...").append(match).append(" statements in common...\n");
271
272 for (Statement st : actual) {
273 Resource s = st.getSubject();
274 Value o = st.getObject();
275
276 if (expected.stream().noneMatch(t -> st.getPredicate().equals(t.getPredicate())
277 && (s instanceof BNode ? t.getSubject() instanceof BNode : s.equals(t.getSubject()))
278 && (o instanceof BNode ? t.getObject() instanceof BNode : o.equals(t.getObject())))) {
279 if (positive) {
280 Object sstr = s instanceof BNode ? m.computeIfAbsent(s, k -> "_:" + i.getAndIncrement())
281 : s;
282 Object ostr = o instanceof BNode ? m.computeIfAbsent(o, k -> "_:" + i.getAndIncrement())
283 : o;
284 error.append("ACTUAL: ").append(sstr).append(" ").append(st.getPredicate()).append(" ")
285 .append(ostr).append("\n");
286 }
287 }
288 }
289
290 failedTests.put(name, error.toString());
291 }
292 } catch (Exception e) {
293 failedTests.put(name, "\n" + e.toString() + "\n");
294 }
295 });
296
297 if (logger.isDebugEnabled()) {
298 logger.debug("passed=" + passedTests.get() + "; ignored=" + ignoredTests.get());
299 }
300
301 if (!failedTests.isEmpty()) {
302 Assert.fail(failedTests.size() + " failures out of " + (failedTests.size() + passedTests.get())
303 + " total tests\n" + String.join("\n", failedTests.keySet()) + "\n\n"
304 + String.join("\n", failedTests.values()));
305 }
306 }
307
308 @Test
309 public void testMicrodataBasic() {
310 assertExtract("/microdata/microdata-basic.html");
311 assertModelNotEmpty();
312 assertStatementsSize(null, null, null, 40);
313 assertStatementsSize(RDFUtils.iri("urn:isbn:0-330-34032-8"), null, null, 4);
314 }
315
316 @Test
317 public void testMicrodataMissingScheme() {
318 assertExtract("/microdata/microdata-missing-scheme.html");
319 assertModelNotEmpty();
320 assertContains(null, RDF.TYPE, RDFUtils.iri("http://schema.org/Answer"));
321 }
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337 @Test
338 public void testMicrodataGoogleRichSnippet()
339 throws RDFHandlerException, RepositoryException, IOException, RDFParseException {
340 extractAndVerifyAgainstNQuads("microdata-richsnippet.html", "microdata-richsnippet-expected.nquads");
341 logger.debug(dumpHumanReadableTriples());
342 }
343
344
345
346
347
348
349
350
351
352
353
354
355
356 @Test
357 public void testExample5221() throws RDFHandlerException, RepositoryException, IOException, RDFParseException {
358 extractAndVerifyAgainstNQuads("5.2.1-non-normative-example-1.html",
359 "5.2.1-non-normative-example-1-expected.nquads");
360 logger.debug(dumpHumanReadableTriples());
361 }
362
363
364
365
366
367
368
369
370
371
372
373
374
375 @Test
376 public void testExample5222() throws RDFHandlerException, RepositoryException, IOException, RDFParseException {
377 extractAndVerifyAgainstNQuads("5.2.1-non-normative-example-2.html",
378 "5.2.1-non-normative-example-2-expected.nquads");
379 logger.debug(dumpHumanReadableTriples());
380 }
381
382
383
384
385
386
387
388
389
390
391
392
393
394 @Test
395 public void testExampleSchemaOrg1()
396 throws RDFHandlerException, RepositoryException, IOException, RDFParseException {
397 extractAndVerifyAgainstNQuads("schemaorg-example-1.html", "schemaorg-example-1-expected.nquads");
398 logger.debug(dumpHumanReadableTriples());
399 }
400
401
402
403
404
405
406
407
408
409
410
411
412
413 @Test
414 public void testExampleSchemaOrg2()
415 throws RDFHandlerException, RepositoryException, IOException, RDFParseException {
416 extractAndVerifyAgainstNQuads("schemaorg-example-2.html", "schemaorg-example-2-expected.nquads");
417 logger.debug(dumpHumanReadableTriples());
418 }
419
420 @Test
421 public void testMicrodataNestedUrlResolving() throws IOException {
422 IRI oldBaseIRI = baseIRI;
423 try {
424 baseIRI = RDFUtils.iri("https://ruben.verborgh.org/tmp/schemaorg-test.html");
425 extractAndVerifyAgainstNQuads("microdata-nested-url-resolving.html",
426 "microdata-nested-url-resolving-expected.nquads");
427 } finally {
428 baseIRI = oldBaseIRI;
429 }
430 }
431
432 @Test
433 public void testTel() {
434 assertExtract("/microdata/tel-test.html");
435 assertModelNotEmpty();
436 assertContains(RDFUtils.iri("http://schema.org/telephone"), RDFUtils.iri("tel:(909)%20484-2020"));
437 }
438
439 @Test
440 public void testBadTypes() throws IOException {
441 extractAndVerifyAgainstNQuads("microdata-bad-types.html", "microdata-bad-types-expected.nquads");
442 }
443
444 @Test
445 public void testBadPropertyNames() throws IOException {
446 extractAndVerifyAgainstNQuads("microdata-bad-properties.html", "microdata-bad-properties-expected.nquads",
447 false);
448 assertIssue(IssueReport.IssueLevel.ERROR,
449 ".*invalid property name ''.*\"path\" : \"/HTML\\[1\\]/BODY\\[1\\]/DIV\\[1\\]/DIV\\[2\\]/DIV\\[1\\]\".*");
450 }
451
452 private void extractAndVerifyAgainstNQuads(String actual, String expected)
453 throws RepositoryException, RDFHandlerException, IOException, RDFParseException {
454 extractAndVerifyAgainstNQuads(actual, expected, true);
455 }
456
457 private void extractAndVerifyAgainstNQuads(String actual, String expected, boolean assertNoIssues)
458 throws RepositoryException, RDFHandlerException, IOException, RDFParseException {
459 assertExtract("/microdata/" + actual, assertNoIssues);
460 assertModelNotEmpty();
461 logger.debug(dumpModelToNQuads());
462 List<Statement> expectedStatements = loadResultStatement("/microdata/" + expected);
463 int actualStmtSize = getStatementsSize(null, null, null);
464 Assert.assertEquals(expectedStatements.size(), actualStmtSize);
465 for (Statement statement : expectedStatements) {
466 assertContains(statement.getSubject() instanceof BNode ? null : statement.getSubject(),
467 statement.getPredicate(), statement.getObject() instanceof BNode ? null : statement.getObject());
468 }
469 Model expectedModel = new TreeModel();
470 for (Statement s : expectedStatements) {
471 expectedModel.add(s.getSubject(), s.getPredicate(), s.getObject());
472 }
473
474 Model actualModel = new TreeModel();
475 conn.export(new RDFHandler() {
476 @Override
477 public void startRDF() throws RDFHandlerException {
478 }
479
480 @Override
481 public void endRDF() throws RDFHandlerException {
482 }
483
484 @Override
485 public void handleNamespace(String s, String s1) throws RDFHandlerException {
486 }
487
488 @Override
489 public void handleStatement(Statement statement) throws RDFHandlerException {
490 actualModel.add(statement.getSubject(), statement.getPredicate(), statement.getObject());
491 }
492
493 @Override
494 public void handleComment(String s) throws RDFHandlerException {
495 }
496 });
497
498 Assert.assertTrue("Models are not isomorphic", Models.isomorphic(expectedModel, actualModel));
499 }
500
501 private List<Statement> loadResultStatement(String resultFilePath)
502 throws RDFHandlerException, IOException, RDFParseException {
503 RDFParser nQuadsParser = Rio.createParser(RDFFormat.NQUADS);
504 TestRDFHandler rdfHandler = new TestRDFHandler();
505 nQuadsParser.setRDFHandler(rdfHandler);
506 File file = copyResourceToTempFile(resultFilePath);
507 nQuadsParser.parse(new FileReader(file, StandardCharsets.UTF_8), baseIRI.toString());
508 return rdfHandler.getStatements();
509 }
510
511 public static class TestRDFHandler implements RDFHandler {
512
513 private final List<Statement> statements = new ArrayList<Statement>();
514
515 protected List<Statement> getStatements() {
516 return statements;
517 }
518
519 public void startRDF() throws RDFHandlerException {
520 }
521
522 public void endRDF() throws RDFHandlerException {
523 }
524
525 public void handleNamespace(String s, String s1) throws RDFHandlerException {
526 throw new UnsupportedOperationException();
527 }
528
529 public void handleStatement(Statement statement) throws RDFHandlerException {
530 statements.add(statement);
531 }
532
533 public void handleComment(String s) throws RDFHandlerException {
534 throw new UnsupportedOperationException();
535 }
536 }
537
538 }