This project has retired. For details please refer to its
Attic page.
Any23Test xref
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23;
19
20 import org.apache.any23.extractor.ExtractorGroup;
21 import org.apache.any23.extractor.rdf.NTriplesExtractorFactory;
22 import org.apache.http.conn.ConnectTimeoutException;
23 import org.junit.Assert;
24 import org.apache.any23.configuration.Configuration;
25 import org.apache.any23.configuration.DefaultConfiguration;
26 import org.apache.any23.configuration.ModifiableConfiguration;
27 import org.apache.any23.extractor.ExtractionException;
28 import org.apache.any23.extractor.ExtractionParameters;
29 import org.apache.any23.extractor.Extractor;
30 import org.apache.any23.extractor.microdata.MicrodataExtractor;
31 import org.apache.any23.filter.IgnoreAccidentalRDFa;
32 import org.apache.any23.filter.IgnoreTitlesOfEmptyDocuments;
33 import org.apache.any23.http.DefaultHTTPClient;
34 import org.apache.any23.http.DefaultHTTPClientConfiguration;
35 import org.apache.any23.http.HTTPClient;
36 import org.apache.any23.http.HTTPClientConfiguration;
37 import org.apache.any23.source.DocumentSource;
38 import org.apache.any23.source.HTTPDocumentSource;
39 import org.apache.any23.source.StringDocumentSource;
40 import org.apache.any23.util.FileUtils;
41 import org.apache.any23.util.StreamUtils;
42 import org.apache.any23.util.StringUtils;
43 import org.apache.any23.vocab.DCTerms;
44 import org.apache.any23.writer.CompositeTripleHandler;
45 import org.apache.any23.writer.CountingTripleHandler;
46 import org.apache.any23.writer.NTriplesWriter;
47 import org.apache.any23.writer.RDFXMLWriter;
48 import org.apache.any23.writer.ReportingTripleHandler;
49 import org.apache.any23.writer.RepositoryWriter;
50 import org.apache.any23.writer.TripleHandler;
51 import org.apache.any23.writer.TripleHandlerException;
52 import org.apache.commons.io.IOUtils;
53 import org.junit.AssumptionViolatedException;
54 import org.junit.Test;
55 import org.eclipse.rdf4j.model.Statement;
56 import org.eclipse.rdf4j.repository.Repository;
57 import org.eclipse.rdf4j.repository.RepositoryConnection;
58 import org.eclipse.rdf4j.repository.RepositoryResult;
59 import org.eclipse.rdf4j.repository.sail.SailRepository;
60 import org.eclipse.rdf4j.rio.RDFParseException;
61 import org.eclipse.rdf4j.sail.memory.MemoryStore;
62 import org.slf4j.Logger;
63 import org.slf4j.LoggerFactory;
64
65 import java.io.ByteArrayOutputStream;
66 import java.io.IOException;
67 import java.net.URISyntaxException;
68 import java.nio.charset.StandardCharsets;
69 import java.util.Collections;
70 import java.util.List;
71 import java.util.Locale;
72
73 import static org.apache.any23.extractor.ExtractionParameters.ValidationMode;
74
75
76
77
78
79
80
81 @SuppressWarnings("unchecked")
82 public class Any23Test extends Any23OnlineTestBase {
83
84 private static final DCTerms vDCTERMS = DCTerms.getInstance();
85
86 private static final String PAGE_URL = "http://bob.com";
87
88 private static final Logger logger = LoggerFactory.getLogger(Any23Test.class);
89
90 @Test
91 public void testTTLDetection() throws Exception {
92 assertDetection("<a> <b> <c> .", "rdf-turtle");
93 }
94
95 @Test
96 public void testN3Detection1() throws Exception {
97 assertDetection("<Bob><brothers>(<Jim><Mark>).", "rdf-turtle");
98 }
99
100 @Test
101 public void testN3Detection2() throws Exception {
102 assertDetection("<http://example.org/path> <http://foo.com> <http://example.org/Document/foo#> .", "rdf-nt");
103 }
104
105 @Test
106 public void testHTMLBruteForceDetection() throws Exception {
107 assertDetection("<html><body><div class=\"vcard fn\">Joe</div></body></html>");
108 }
109
110
111
112
113
114
115
116
117 @Test
118 public void testExplicitEncoding() throws Exception {
119 assertEncodingDetection("UTF-8", "/html/encoding-test.html", "Knud M\u00F6ller");
120 }
121
122
123
124
125
126
127
128
129 @Test
130 public void testImplicitEncoding() throws Exception {
131 assertEncodingDetection(null,
132 "/html/encoding-test.html", "Knud M\u00F6ller");
133 }
134
135 @Test
136 public void testRDFXMLDetectionAndExtraction() throws Exception {
137 String rdfXML = "<?xml version='1.0'?> " + "<rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#' "
138 + "xmlns:dc='http://purl.org/dc/elements/1.1/'>"
139 + "<rdf:Description rdf:about='http://www.example.com'>" + "<dc:title>x</dc:title>"
140 + "</rdf:Description>" + "</rdf:RDF>";
141 assertDetectionAndExtraction(rdfXML);
142 }
143
144 @Test
145 public void testNTriplesDetectionAndExtraction() throws Exception {
146 String n3 = "<http://www.example.com> <http://purl.org/dc/elements/1.1/title> \"n3 . appo\" .";
147 assertDetectionAndExtraction(n3);
148 }
149
150 @Test
151 public void testNturtleDetectionAndExtraction() throws Exception {
152 String nTurtle = "@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .\n"
153 + "@prefix dc: <http://purl.org/dc/elements/1.1/> .\n"
154 + "@prefix ex: <http://example.org/stuff/1.0/> .\n" + "\n"
155 + "<http://www.w3.org/TR/rdf-syntax-grammar>\n"
156 + " dc:title \"RDF/XML Syntax Specification (Revised)\" ;\n" + " ex:editor [\n"
157 + " ex:fullname \"Dave Beckett\";\n" + " ex:homePage <http://purl.org/net/dajobe/>\n" + " ] .";
158 assertDetectionAndExtraction(nTurtle);
159 }
160
161
162
163
164
165
166
167 @Test
168 public void testDemoCodeSnippet1() throws Exception {
169 Any23 runner = new Any23();
170 final String content = "@prefix foo: <http://example.org/ns#> . "
171 + "@prefix : <http://other.example.org/ns#> ." + "foo:bar foo: : . "
172 + ":bar : foo:bar . ";
173
174 DocumentSource source = new StringDocumentSource(content, "http://host.com/service");
175 ByteArrayOutputStream out = new ByteArrayOutputStream();
176 TripleHandler handler = new NTriplesWriter(out);
177 try {
178 runner.extract(source, handler);
179 } finally {
180 handler.close();
181 }
182 String nt = out.toString("UTF-8");
183
184
185
186
187
188 logger.debug("nt: " + nt);
189 Assert.assertTrue(nt.length() > 0);
190 }
191
192
193
194
195
196
197
198 @Test
199 public void testDemoCodeSnippet2() throws Exception {
200 assumeOnlineAllowed();
201
202 Any23 runner = new Any23();
203 runner.setHTTPUserAgent("apache-any23-test-user-agent");
204 HTTPClient httpClient = runner.getHTTPClient();
205 DocumentSource source = new HTTPDocumentSource(httpClient, "http://dbpedia.org/resource/Trento");
206 ByteArrayOutputStream out = new ByteArrayOutputStream();
207 TripleHandler handler = new NTriplesWriter(out);
208 try {
209 runner.extract(source, handler);
210 } finally {
211 handler.close();
212 }
213 String n3 = out.toString("UTF-8");
214
215
216
217
218
219
220
221
222 logger.debug("n3: " + n3);
223 Assert.assertTrue(n3.length() > 0);
224
225 Assert.assertTrue(n3.contains(
226 "<http://dbpedia.org/resource/Trento> <http://dbpedia.org/property/mayor> \"Franco Ianeselli, elected 2020\"@en ."));
227 }
228
229
230
231
232
233
234
235
236
237
238
239
240
241 @Test
242 public void testProgrammaticExtraction() throws ExtractionException, IOException, URISyntaxException {
243 Any23 any23 = new Any23();
244 any23.setHTTPUserAgent("Any23-Servlet");
245 any23.setHTTPClient(new DefaultHTTPClient() {
246 @Override
247 protected int getConnectionTimeout() {
248 return 5000;
249 }
250
251 @Override
252 protected int getSoTimeout() {
253 return 2000;
254 }
255 });
256 ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
257 TripleHandler handler = new NTriplesWriter(byteArrayOutputStream);
258 TripleHandler rdfWriter = new IgnoreAccidentalRDFa(handler);
259 ReportingTripleHandler reporting = new ReportingTripleHandler(rdfWriter);
260
261 DocumentSource source = getDocumentSourceFromResource("/html/rdfa/ansa_2010-02-26_12645863.html",
262 "http://host.com/service");
263
264 Assert.assertTrue(any23.extract(source, reporting).hasMatchingExtractors());
265 try {
266 handler.close();
267 } catch (TripleHandlerException e) {
268 Assert.fail(e.getMessage());
269 }
270
271 final String bufferContent = byteArrayOutputStream.toString(StandardCharsets.UTF_8);
272 logger.debug(bufferContent);
273 Assert.assertSame("Unexpected number of triples.", 18, StringUtils.countNL(bufferContent));
274
275 }
276
277
278
279
280
281
282
283
284
285
286
287
288 @Test
289 public void testGZippedContent() throws IOException, URISyntaxException, ExtractionException {
290 assumeOnlineAllowed();
291 final Any23 runner = new Any23();
292 runner.setHTTPUserAgent("apache-any23-test-user-agent");
293 DocumentSource source = new HTTPDocumentSource(runner.getHTTPClient(), "https://dev.w3.org/html5/rdfa/");
294 ByteArrayOutputStream out = new ByteArrayOutputStream();
295 TripleHandler handler = new NTriplesWriter(out);
296 try {
297 runner.extract(source, handler);
298 } catch (ConnectTimeoutException e) {
299
300 logger.error("Connection to " + source.getDocumentIRI() + " timed out; skipping test", e);
301 throw new AssumptionViolatedException(e.getMessage());
302 }
303 String n3 = out.toString(StandardCharsets.UTF_8);
304 logger.debug("N3 " + n3);
305 Assert.assertTrue(n3.length() > 0);
306 }
307
308 @Test
309 public void testExtractionParameters() throws IOException, ExtractionException, TripleHandlerException {
310
311
312
313
314 final int EXPECTED_TRIPLES = 12;
315 Any23 runner = new Any23();
316 DocumentSource source = getDocumentSourceFromResource("/org/apache/any23/validator/missing-og-namespace.html",
317 "http://www.test.com");
318
319 ByteArrayOutputStream baos = new ByteArrayOutputStream();
320
321 CountingTripleHandler cth1 = new CountingTripleHandler();
322 NTriplesWriter ctw1 = new NTriplesWriter(baos);
323 CompositeTripleHandler compositeTH1 = new CompositeTripleHandler();
324 compositeTH1.addChild(cth1);
325 compositeTH1.addChild(ctw1);
326 try {
327 runner.extract(new ExtractionParameters(DefaultConfiguration.singleton(), ValidationMode.NONE), source,
328 compositeTH1);
329 } finally {
330 compositeTH1.close();
331 }
332 logger.debug(baos.toString(StandardCharsets.UTF_8));
333 Assert.assertEquals("Unexpected number of triples.", EXPECTED_TRIPLES, cth1.getCount());
334 }
335
336 @Test
337 public void testExtractionParametersWithNestingDisabled()
338 throws IOException, ExtractionException, TripleHandlerException {
339 final int EXPECTED_TRIPLES = 20;
340 Any23 runner = new Any23();
341 DocumentSource source = getDocumentSourceFromResource("/microformats/nested-microformats-a1.html",
342 "http://www.test.com");
343
344 ByteArrayOutputStream baos = new ByteArrayOutputStream();
345
346 CountingTripleHandler cth1 = new CountingTripleHandler();
347 RDFXMLWriter ctw1 = new RDFXMLWriter(baos);
348 CompositeTripleHandler compositeTH1 = new CompositeTripleHandler();
349 compositeTH1.addChild(cth1);
350 compositeTH1.addChild(ctw1);
351 runner.extract(new ExtractionParameters(DefaultConfiguration.singleton(), ValidationMode.NONE, true), source,
352 compositeTH1);
353 compositeTH1.close();
354 logger.debug("Out1: " + baos.toString(StandardCharsets.UTF_8));
355 Assert.assertEquals("Unexpected number of triples.", EXPECTED_TRIPLES + 3, cth1.getCount());
356
357 baos.reset();
358 CountingTripleHandler cth2 = new CountingTripleHandler();
359 NTriplesWriter ctw2 = new NTriplesWriter(baos);
360 CompositeTripleHandler compositeTH2 = new CompositeTripleHandler();
361 compositeTH2.addChild(cth2);
362 compositeTH2.addChild(ctw2);
363 runner.extract(
364 new ExtractionParameters(DefaultConfiguration.singleton(), ValidationMode.VALIDATE_AND_FIX, false),
365 source, compositeTH2);
366 compositeTH2.close();
367 logger.debug("Out2: " + baos.toString(StandardCharsets.UTF_8));
368 Assert.assertEquals("Unexpected number of triples.", EXPECTED_TRIPLES, cth2.getCount());
369 }
370
371 @Test
372 public void testExceptionPropagation() throws IOException {
373 Any23 any23 = new Any23();
374 DocumentSource source = getDocumentSourceFromResource("/application/turtle/geolinkeddata.ttl",
375 "http://www.test.com");
376 CountingTripleHandler cth1 = new CountingTripleHandler();
377 try {
378 any23.extract(source, cth1);
379 } catch (ExtractionException e) {
380 Assert.assertTrue(e.getCause() instanceof RDFParseException);
381 }
382
383 }
384
385
386
387
388
389
390
391
392
393 @Test
394 public void testXMLMimeTypeManagement() throws IOException, ExtractionException {
395 final String documentIRI = "http://www.test.com/resource.xml";
396 final String contentType = "application/xml";
397 final String in = StreamUtils.asString(this.getClass().getResourceAsStream("any23-xml-mimetype.xml"));
398 final DocumentSource doc = new StringDocumentSource(in, documentIRI, contentType);
399 final Any23 any23 = new Any23();
400 final CountingTripleHandler cth = new CountingTripleHandler(false);
401 final ReportingTripleHandler rth = new ReportingTripleHandler(cth);
402 final ExtractionReport report = any23.extract(doc, rth);
403 Assert.assertFalse(report.hasMatchingExtractors());
404 Assert.assertEquals(0, cth.getCount());
405 }
406
407
408
409
410
411
412
413
414
415 @Test
416 public void testXMLMimeTypeManagementViaURL() throws IOException, ExtractionException {
417 assumeOnlineAllowed();
418 final Any23 any23 = new Any23();
419 any23.setHTTPUserAgent("apache-any23-test-user-agent");
420 HTTPClient client = any23.getHTTPClient();
421 HTTPClientConfiguration configuration = new DefaultHTTPClientConfiguration("application/xml");
422 client.init(configuration);
423 final CountingTripleHandler cth = new CountingTripleHandler(false);
424 final ReportingTripleHandler rth = new ReportingTripleHandler(cth);
425 final ExtractionReport report = any23.extract("http://www.legislation.gov.uk/ukpga/2015/17/section/4/data.xml",
426 rth);
427 Assert.assertFalse(report.hasMatchingExtractors());
428 Assert.assertEquals(0, cth.getCount());
429 }
430
431 @Test
432 public void testBlankNodesViaURL() throws IOException, ExtractionException {
433 assumeOnlineAllowed();
434 final Any23 any23 = new Any23();
435 any23.setHTTPUserAgent("apache-any23-test-user-agent");
436 final CountingTripleHandler cth = new CountingTripleHandler(false);
437 final ReportingTripleHandler rth = new ReportingTripleHandler(cth);
438 final ExtractionReport report = any23.extract("https://www.w3.org/", rth);
439 Assert.assertTrue(report.hasMatchingExtractors());
440 }
441
442 @Test
443 public void testMicrodataSupport() throws Exception {
444 final String htmlWithMicrodata = IOUtils
445 .toString(getClass().getResourceAsStream("/microdata/microdata-basic.html"), StandardCharsets.UTF_8);
446 assertExtractorActivation(htmlWithMicrodata, MicrodataExtractor.class);
447 }
448
449 @Test
450 public void testAbstractMethodErrorIssue186_1() throws IOException, ExtractionException {
451 final Any23 runner = new Any23();
452 final String content = FileUtils.readResourceContent("/html/rdfa/rdfa-issue186-1.xhtml");
453 final DocumentSource source = new StringDocumentSource(content, "http://base.com");
454 final ByteArrayOutputStream out = new ByteArrayOutputStream();
455 final TripleHandler handler = new NTriplesWriter(out);
456 runner.extract(source, handler);
457 String n3 = out.toString("UTF-8");
458 logger.debug(n3);
459 }
460
461 @Test
462 public void testAbstractMethodErrorIssue186_2() throws IOException, ExtractionException {
463 final Any23 runner = new Any23();
464 final String content = FileUtils.readResourceContent("/html/rdfa/rdfa-issue186-2.xhtml");
465 final DocumentSource source = new StringDocumentSource(content, "http://richard.cyganiak.de/");
466 final ByteArrayOutputStream out = new ByteArrayOutputStream();
467 final TripleHandler handler = new NTriplesWriter(out);
468 runner.extract(source, handler);
469 final String n3 = out.toString("UTF-8");
470 logger.debug(n3);
471 }
472
473 @Test
474 public void testModifiableConfiguration_issue183() throws Exception {
475 final ModifiableConfiguration modifiableConf = DefaultConfiguration.copy();
476 modifiableConf.setProperty("any23.extraction.metadata.timesize", "off");
477 final Any23 any23 = new Any23(modifiableConf);
478
479 final String content = FileUtils.readResourceContent("/rdf/rdf-issue183.ttl");
480 final DocumentSource source = new StringDocumentSource(content, "http://base.com");
481 final ByteArrayOutputStream out = new ByteArrayOutputStream();
482 final TripleHandler handler = new NTriplesWriter(out);
483 any23.extract(source, handler);
484 handler.close();
485 final String n3 = out.toString("UTF-8");
486
487 logger.debug(n3);
488 Assert.assertFalse("Should not contain triple with http://vocab.sindice.net/date",
489 n3.contains("http://vocab.sindice.net/date"));
490 Assert.assertFalse("Should not contain triple with http://vocab.sindice.net/size",
491 n3.contains("http://vocab.sindice.net/size"));
492 }
493
494 @Test
495 public void testIssue415InvalidNTriples() throws Exception {
496 NTriplesExtractorFactory factory = new NTriplesExtractorFactory();
497 Any23 runner = new Any23(new ExtractorGroup(Collections.singleton(factory)));
498
499 ExtractionReport report = runner.extract(IOUtils.resourceToString("/rdf/issue415.txt", StandardCharsets.UTF_8),
500 "http://humanstxt.org/humans.txt", new CompositeTripleHandler());
501 Assert.assertEquals("text/plain", report.getDetectedMimeType());
502 Assert.assertEquals(0, report.getExtractorIssues(factory.getExtractorName()).size());
503 Assert.assertEquals(0, report.getMatchingExtractors().size());
504 }
505
506 @Test
507 public void testIssue415ValidNTriples() throws Exception {
508 NTriplesExtractorFactory factory = new NTriplesExtractorFactory();
509 Any23 runner = new Any23(new ExtractorGroup(Collections.singleton(factory)));
510
511 CountingTripleHandler handler = new CountingTripleHandler();
512 ExtractionReport report = runner.extract(
513 IOUtils.resourceToString("/rdf/issue415-valid.txt", StandardCharsets.UTF_8),
514 "http://humanstxt.org/humans.txt", handler);
515 Assert.assertEquals("application/n-triples", report.getDetectedMimeType());
516 Assert.assertEquals(0, report.getExtractorIssues(factory.getExtractorName()).size());
517 Assert.assertEquals(1, report.getMatchingExtractors().size());
518 Assert.assertEquals(1, handler.getCount());
519 }
520
521
522
523
524
525
526
527
528
529
530
531
532 private ExtractionReport detectAndExtract(String in) throws Exception {
533 Any23 any23 = new Any23();
534 Configuration conf = DefaultConfiguration.copy();
535 ByteArrayOutputStream out = new ByteArrayOutputStream();
536 ReportingTripleHandler outputHandler = new ReportingTripleHandler(
537 new IgnoreAccidentalRDFa(new IgnoreTitlesOfEmptyDocuments(new NTriplesWriter(out))));
538 return any23.extract(new ExtractionParameters(conf, ValidationMode.VALIDATE_AND_FIX, null, null),
539 new StringDocumentSource(in, "http://host.com/path"), outputHandler, "UTF-8");
540 }
541
542
543
544
545
546
547
548
549
550
551 private void assertDetectionAndExtraction(String in) throws Exception {
552 final ExtractionReport extractionReport = detectAndExtract(in);
553 Assert.assertTrue("Detection and extraction failed, no matching extractors.",
554 extractionReport.hasMatchingExtractors());
555 }
556
557
558
559
560
561
562
563
564
565
566
567 private void assertExtractorActivation(String in,
568 @SuppressWarnings("rawtypes") Class<? extends Extractor>... expectedExtractors) throws Exception {
569 final ExtractionReport extractionReport = detectAndExtract(in);
570 for (@SuppressWarnings("rawtypes")
571 Class<? extends Extractor> expectedExtractorClass : expectedExtractors) {
572 Assert.assertTrue(
573 String.format(Locale.ROOT, "Detection and extraction failed, expected extractor [%s] not found.",
574 expectedExtractorClass),
575 containsClass(extractionReport.getMatchingExtractors(), expectedExtractorClass));
576 }
577 }
578
579
580
581
582
583
584
585
586
587
588
589 private void assertEncodingDetection(String encoding, String input, String expectedContent) throws Exception {
590 DocumentSource fileDocumentSource = getDocumentSourceFromResource(input);
591 Any23 any23;
592 RepositoryConnection conn = null;
593 RepositoryWriter repositoryWriter = null;
594
595 any23 = new Any23();
596 Repository store = new SailRepository(new MemoryStore());
597 store.init();
598 try {
599 conn = store.getConnection();
600 repositoryWriter = new RepositoryWriter(conn);
601 Assert.assertTrue(any23.extract(fileDocumentSource, repositoryWriter, encoding).hasMatchingExtractors());
602
603 RepositoryResult<Statement> statements = conn.getStatements(null, vDCTERMS.title, null, false);
604 try {
605 while (statements.hasNext()) {
606 Statement statement = statements.next();
607 printStatement(statement);
608 Assert.assertTrue(statement.getObject().stringValue().contains(expectedContent));
609 }
610 } finally {
611 statements.close();
612 }
613 } finally {
614 if (conn != null) {
615 conn.close();
616 }
617 if (repositoryWriter != null) {
618 repositoryWriter.close();
619 }
620 }
621 fileDocumentSource = null;
622 any23 = null;
623 }
624
625
626
627
628
629
630
631
632
633 private void assertDetection(String content, String... parsers) throws Exception {
634 ByteArrayOutputStream out = new ByteArrayOutputStream();
635 Any23 runner = new Any23(parsers.length == 0 ? null : parsers);
636 if (parsers.length != 0) {
637 runner.setMIMETypeDetector(null);
638
639 }
640 final NTriplesWriter tripleHandler = new NTriplesWriter(out);
641 runner.extract(new StringDocumentSource(content, PAGE_URL), tripleHandler);
642 tripleHandler.close();
643 String result = out.toString("us-ascii");
644 Assert.assertNotNull(result);
645 Assert.assertTrue(result.length() > 10);
646 }
647
648 private void printStatement(Statement statement) {
649 logger.debug(String.format(Locale.ROOT, "%s\t%s\t%s", statement.getSubject(), statement.getPredicate(),
650 statement.getObject()));
651 }
652
653 private boolean containsClass(List<?> list, Class<?> clazz) {
654 for (Object o : list) {
655 if (o.getClass().equals(clazz)) {
656 return true;
657 }
658 }
659 return false;
660 }
661
662 }