This project has retired. For details please refer to its
Attic page.
AbstractExtractorTestCase xref
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.AbstractAny23TestBase;
21 import org.apache.any23.extractor.IssueReport;
22 import org.apache.any23.extractor.IssueReport.Issue;
23 import org.apache.any23.extractor.IssueReport.IssueLevel;
24 import org.apache.any23.extractor.ExtractionException;
25 import org.apache.any23.extractor.ExtractorFactory;
26 import org.apache.any23.extractor.SingleDocumentExtraction;
27 import org.apache.any23.extractor.SingleDocumentExtractionReport;
28 import org.apache.any23.rdf.RDFUtils;
29 import org.apache.any23.vocab.SINDICE;
30 import org.apache.any23.writer.RepositoryWriter;
31 import org.junit.After;
32 import org.junit.Assert;
33 import org.junit.Before;
34 import org.eclipse.rdf4j.common.iteration.Iterations;
35 import org.eclipse.rdf4j.model.BNode;
36 import org.eclipse.rdf4j.model.Literal;
37 import org.eclipse.rdf4j.model.Resource;
38 import org.eclipse.rdf4j.model.Statement;
39 import org.eclipse.rdf4j.model.IRI;
40 import org.eclipse.rdf4j.model.Value;
41 import org.eclipse.rdf4j.repository.RepositoryConnection;
42 import org.eclipse.rdf4j.repository.RepositoryException;
43 import org.eclipse.rdf4j.repository.RepositoryResult;
44 import org.eclipse.rdf4j.repository.sail.SailRepository;
45 import org.eclipse.rdf4j.rio.RDFFormat;
46 import org.eclipse.rdf4j.rio.RDFHandlerException;
47 import org.eclipse.rdf4j.rio.RDFParseException;
48 import org.eclipse.rdf4j.rio.Rio;
49 import org.eclipse.rdf4j.sail.Sail;
50 import org.eclipse.rdf4j.sail.memory.MemoryStore;
51 import org.slf4j.Logger;
52 import org.slf4j.LoggerFactory;
53
54 import java.io.ByteArrayOutputStream;
55 import java.io.IOException;
56 import java.io.PrintStream;
57 import java.io.StringWriter;
58 import java.lang.invoke.MethodHandles;
59 import java.nio.charset.StandardCharsets;
60 import java.util.ArrayList;
61 import java.util.Collection;
62 import java.util.Collections;
63 import java.util.List;
64 import java.util.Locale;
65 import java.util.Map;
66
67
68
69
70 public abstract class AbstractExtractorTestCase extends AbstractAny23TestBase {
71
72 private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
73
74
75
76
77
78 protected static IRI baseIRI = RDFUtils.iri("http://bob.example.com/");
79
80
81
82
83 protected RepositoryConnection conn;
84
85
86
87
88 private SingleDocumentExtractionReport report;
89
90 private Sail store;
91
92 private SailRepository repository;
93
94
95
96
97 public AbstractExtractorTestCase() {
98 super();
99 }
100
101
102
103
104 protected abstract ExtractorFactory<?> getExtractorFactory();
105
106
107
108
109
110
111
112 @Before
113 public void setUp() throws Exception {
114 super.setUp();
115 store = new MemoryStore();
116 repository = new SailRepository(store);
117 repository.init();
118 conn = repository.getConnection();
119 }
120
121
122
123
124
125
126
127
128 @After
129 public void tearDown() throws RepositoryException {
130 try {
131 conn.close();
132 } finally {
133 repository.shutDown();
134 }
135 conn = null;
136 report = null;
137 store = null;
138 repository = null;
139 }
140
141
142
143
144 protected RepositoryConnection getConnection() {
145 return conn;
146 }
147
148
149
150
151 protected SingleDocumentExtractionReport getReport() {
152 return report;
153 }
154
155
156
157
158
159
160
161
162
163 protected Collection<IssueReport.Issue> getIssues(String extractorName) {
164 for (Map.Entry<String, Collection<IssueReport.Issue>> issueEntry : report.getExtractorToIssues().entrySet()) {
165 if (issueEntry.getKey().equals(extractorName)) {
166 return issueEntry.getValue();
167 }
168 }
169 return Collections.emptyList();
170 }
171
172
173
174
175
176
177 protected Collection<IssueReport.Issue> getIssues() {
178 return getIssues(getExtractorFactory().getExtractorName());
179 }
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195 protected void extract(String resource) throws ExtractionException, IOException {
196 SingleDocumentExtraction ex = new SingleDocumentExtraction(
197 new HTMLFixture(copyResourceToTempFile(resource)).getOpener(baseIRI.toString()), getExtractorFactory(),
198 new RepositoryWriter(conn));
199 ex.setMIMETypeDetector(null);
200 report = ex.run();
201 }
202
203
204
205
206
207
208
209
210
211 protected void assertExtract(String resource, boolean assertNoIssues) {
212 try {
213 extract(resource);
214 if (assertNoIssues)
215 assertNoIssues();
216 } catch (ExtractionException ex) {
217 throw new RuntimeException(ex);
218 } catch (IOException ex) {
219 throw new RuntimeException(ex);
220 }
221 }
222
223
224
225
226
227
228
229
230 protected void assertExtract(String resource) {
231 assertExtract(resource, true);
232 }
233
234
235
236
237
238
239
240
241
242
243
244
245
246 protected void assertContains(IRI p, Resource o) throws RepositoryException {
247 assertContains(null, p, o);
248 }
249
250
251
252
253
254
255
256
257
258
259
260
261
262 protected void assertContains(IRI p, String o) throws RepositoryException {
263 assertContains(null, p, RDFUtils.literal(o));
264 }
265
266
267
268
269
270
271
272
273
274
275
276
277
278 protected void assertNotContains(IRI p, Resource o) throws RepositoryException {
279 assertNotContains(null, p, o);
280 }
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296 protected void assertContains(Resource s, IRI p, Value o) throws RepositoryException {
297 Assert.assertTrue(
298 getFailedExtractionMessage() + String.format(Locale.ROOT, "Cannot find triple (%s %s %s)", s, p, o),
299 conn.hasStatement(s, p, o, false));
300 }
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316 protected void assertNotContains(Resource s, IRI p, String o) throws RepositoryException {
317 Assert.assertFalse(getFailedExtractionMessage(), conn.hasStatement(s, p, RDFUtils.literal(o), false));
318 }
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334 protected void assertNotContains(Resource s, IRI p, Resource o) throws RepositoryException {
335 Assert.assertFalse(getFailedExtractionMessage(), conn.hasStatement(s, p, o, false));
336 }
337
338
339
340
341
342
343
344
345 protected void assertModelNotEmpty() throws RepositoryException {
346 Assert.assertFalse("The model is expected to not be empty." + getFailedExtractionMessage(), conn.isEmpty());
347 }
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363 protected void assertNotContains(Resource s, IRI p, Literal o) throws RepositoryException {
364 Assert.assertFalse(getFailedExtractionMessage(), conn.hasStatement(s, p, o, false));
365 }
366
367
368
369
370
371
372
373
374 protected void assertModelEmpty() throws RepositoryException {
375 Assert.assertTrue(getFailedExtractionMessage(), conn.isEmpty());
376 }
377
378
379
380
381 protected void assertNoIssues() {
382 for (Map.Entry<String, Collection<IssueReport.Issue>> entry : report.getExtractorToIssues().entrySet()) {
383 if (entry.getValue().size() > 0) {
384 log.debug("Unexpected issue for extractor " + entry.getKey() + " : " + entry.getValue());
385 }
386 for (Issue nextIssue : entry.getValue()) {
387 if (nextIssue.getLevel() == IssueLevel.ERROR || nextIssue.getLevel() == IssueLevel.FATAL) {
388 Assert.fail("Unexpected issue for extractor " + entry.getKey() + " : " + entry.getValue());
389 }
390 }
391 }
392 }
393
394
395
396
397
398
399
400
401
402 protected void assertIssue(IssueReport.IssueLevel level, String issueRegex) {
403 final Collection<IssueReport.Issue> issues = getIssues(getExtractorFactory().getExtractorName());
404 boolean found = false;
405 for (IssueReport.Issue issue : issues) {
406 if (issue.getLevel() == level && issue.getMessage().matches(issueRegex)) {
407 found = true;
408 break;
409 }
410 }
411 Assert.assertTrue(String.format(Locale.ROOT, "Cannot find issue with level %s matching expression '%s'", level,
412 issueRegex), found);
413 }
414
415
416
417
418
419
420
421
422
423
424
425 public void assertContainsModel(Statement[] statements) throws RepositoryException {
426 for (Statement statement : statements) {
427 assertContains(statement);
428 }
429 }
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447 public void assertContainsModel(String modelResource)
448 throws RDFHandlerException, IOException, RDFParseException, RepositoryException {
449 getConnection().remove(null, SINDICE.getInstance().date, (Value) null, (Resource) null);
450 getConnection().remove(null, SINDICE.getInstance().size, (Value) null, (Resource) null);
451 assertContainsModel(RDFUtils.parseRDF(modelResource));
452 }
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470 protected void assertStatementsSize(Resource s, IRI p, Value o, int expected)
471 throws RDFHandlerException, RepositoryException {
472 int statementsSize = getStatementsSize(s, p, o);
473 if (statementsSize != expected) {
474 final ByteArrayOutputStream baos = new ByteArrayOutputStream();
475 PrintStream ps = new PrintStream(baos, true, StandardCharsets.UTF_8);
476 getConnection().exportStatements(s, p, o, true, Rio.createWriter(RDFFormat.NQUADS, ps));
477 }
478
479 Assert.assertEquals("Unexpected number of matching statements.", expected, statementsSize);
480 }
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496 protected void assertStatementsSize(IRI p, Value o, int expected) throws RDFHandlerException, RepositoryException {
497 assertStatementsSize(null, p, o, expected);
498 }
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514 protected void assertStatementsSize(IRI p, String o, int expected) throws RDFHandlerException, RepositoryException {
515 assertStatementsSize(p, o == null ? null : RDFUtils.literal(o), expected);
516 }
517
518
519
520
521
522
523
524
525
526
527
528
529
530 protected void assertNotFound(Resource s, IRI p) throws RepositoryException {
531 RepositoryResult<Statement> statements = conn.getStatements(s, p, null, true);
532 try {
533 Assert.assertFalse("Expected no statements.", statements.hasNext());
534 } finally {
535 statements.close();
536 }
537 }
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553 protected Resource findExactlyOneBlankSubject(IRI p, Value o) throws RepositoryException {
554 RepositoryResult<Statement> it = conn.getStatements(null, p, o, false);
555 try {
556 Assert.assertTrue(getFailedExtractionMessage(), it.hasNext());
557 Statement stmt = it.next();
558 Resource result = stmt.getSubject();
559 Assert.assertTrue(getFailedExtractionMessage(), result instanceof BNode);
560 Assert.assertFalse(getFailedExtractionMessage(), it.hasNext());
561 return result;
562 } finally {
563 it.close();
564 }
565 }
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581 protected Value findExactlyOneObject(Resource s, IRI p) throws RepositoryException {
582 RepositoryResult<Statement> it = conn.getStatements(s, p, null, false);
583 try {
584 Assert.assertTrue(getFailedExtractionMessage(), it.hasNext());
585 return it.next().getObject();
586 } finally {
587 it.close();
588 }
589 }
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605 protected List<Resource> findSubjects(IRI p, Value o) throws RepositoryException {
606 RepositoryResult<Statement> it = conn.getStatements(null, p, o, false);
607 List<Resource> subjects = new ArrayList<Resource>();
608 try {
609 Statement statement;
610 while (it.hasNext()) {
611 statement = it.next();
612 subjects.add(statement.getSubject());
613 }
614 } finally {
615 it.close();
616 }
617 return subjects;
618 }
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634 protected List<Value> findObjects(Resource s, IRI p) throws RepositoryException {
635 RepositoryResult<Statement> it = conn.getStatements(s, p, null, false);
636 List<Value> objects = new ArrayList<Value>();
637 try {
638 Statement statement;
639 while (it.hasNext()) {
640 statement = it.next();
641 objects.add(statement.getObject());
642 }
643 } finally {
644 it.close();
645 }
646 return objects;
647 }
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662 protected Value findObject(Resource s, IRI p) throws RepositoryException {
663 RepositoryResult<Statement> statements = conn.getStatements(s, p, null, true);
664 try {
665 Assert.assertTrue("Expected at least a statement.", statements.hasNext());
666 return (statements.next().getObject());
667 } finally {
668 statements.close();
669 }
670 }
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686 protected Resource findObjectAsResource(Resource s, IRI p) throws RepositoryException {
687 final Value v = findObject(s, p);
688 try {
689 return (Resource) v;
690 } catch (ClassCastException cce) {
691 Assert.fail("Expected resource object, found: " + v.getClass().getSimpleName());
692 throw new IllegalStateException();
693 }
694 }
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710 protected String findObjectAsLiteral(Resource s, IRI p) throws RepositoryException {
711 return findObject(s, p).stringValue();
712 }
713
714
715
716
717
718
719
720
721
722
723 protected String dumpModelToTurtle() throws RepositoryException {
724 StringWriter w = new StringWriter();
725 try {
726 conn.export(Rio.createWriter(RDFFormat.TURTLE, w));
727 return w.toString();
728 } catch (RDFHandlerException ex) {
729 throw new RuntimeException(ex);
730 }
731 }
732
733
734
735
736
737
738
739
740
741
742 protected String dumpModelToNQuads() throws RepositoryException {
743 StringWriter w = new StringWriter();
744 try {
745 conn.export(Rio.createWriter(RDFFormat.NQUADS, w));
746 return w.toString();
747 } catch (RDFHandlerException ex) {
748 throw new RuntimeException(ex);
749 }
750 }
751
752
753
754
755
756
757
758
759
760
761 protected String dumpModelToRDFXML() throws RepositoryException {
762 StringWriter w = new StringWriter();
763 try {
764 conn.export(Rio.createWriter(RDFFormat.RDFXML, w));
765 return w.toString();
766 } catch (RDFHandlerException ex) {
767 throw new RuntimeException(ex);
768 }
769 }
770
771
772
773
774
775
776
777
778
779
780 protected List<Statement> dumpAsListOfStatements() throws RepositoryException {
781 return Iterations.asList(conn.getStatements(null, null, null, false));
782 }
783
784
785
786
787
788
789
790
791 protected String dumpHumanReadableTriples() throws RepositoryException {
792 StringBuilder sb = new StringBuilder();
793 RepositoryResult<Statement> result = conn.getStatements(null, null, null, false);
794 while (result.hasNext()) {
795 Statement statement = result.next();
796 sb.append(String.format(Locale.ROOT, "%s %s %s %s\n", statement.getSubject(), statement.getPredicate(),
797 statement.getObject(), statement.getContext()));
798
799 }
800 return sb.toString();
801 }
802
803
804
805
806
807
808
809
810
811
812
813
814
815 protected void assertContains(Statement statement) throws RepositoryException {
816 Assert.assertTrue("Cannot find statement " + statement + " in model.",
817 conn.hasStatement(statement.getSubject() instanceof BNode ? null : statement.getSubject(),
818 statement.getPredicate(), statement.getObject() instanceof BNode ? null : statement.getObject(),
819 false));
820 }
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836 protected void assertContains(Resource s, IRI p, String l) throws RepositoryException {
837 assertContains(s, p, RDFUtils.literal(l));
838 }
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856 protected void assertContains(Resource s, IRI p, String l, String lang) throws RepositoryException {
857 assertContains(s, p, RDFUtils.literal(l, lang));
858 }
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876 protected RepositoryResult<Statement> getStatements(Resource s, IRI p, Value o) throws RepositoryException {
877 return conn.getStatements(s, p, o, false);
878 }
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896 protected int getStatementsSize(Resource s, IRI p, Value o) throws RepositoryException {
897 RepositoryResult<Statement> result = getStatements(s, p, o);
898 int count = 0;
899 try {
900 while (result.hasNext()) {
901 result.next();
902 count++;
903 }
904 } finally {
905 result.close();
906 }
907 return count;
908 }
909
910 private String getFailedExtractionMessage() throws RepositoryException {
911 return "Assertion failed! Extracted triples:\n" + dumpModelToNQuads();
912 }
913
914 }