1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.rdfa;
19
20 import org.apache.any23.configuration.DefaultConfiguration;
21 import org.apache.any23.extractor.ExtractionContext;
22 import org.apache.any23.extractor.ExtractionException;
23 import org.apache.any23.extractor.ExtractionParameters;
24 import org.apache.any23.extractor.ExtractionResult;
25 import org.apache.any23.extractor.ExtractorDescription;
26 import org.apache.any23.extractor.ExtractorFactory;
27 import org.apache.any23.extractor.SimpleExtractorFactory;
28 import org.apache.any23.extractor.rdf.RDFParserFactory;
29 import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
30 import org.openrdf.rio.RDFHandlerException;
31 import org.openrdf.rio.RDFParseException;
32 import org.openrdf.rio.RDFParser;
33 import org.w3c.dom.Document;
34
35 import java.io.IOException;
36 import java.io.InputStream;
37 import java.io.StringReader;
38 import java.io.StringWriter;
39 import java.util.Arrays;
40
41
42
43
44
45
46
47
48
49
50 public class RDFaExtractor implements TagSoupDOMExtractor {
51
52 public final static String NAME = "html-rdfa";
53
54 public final static String xsltFilename =
55 DefaultConfiguration.singleton().getPropertyOrFail("any23.rdfa.extractor.xslt");
56
57 private static XSLTStylesheet xslt = null;
58
59 public final static ExtractorFactory<RDFaExtractor> factory =
60 SimpleExtractorFactory.create(
61 NAME,
62 null,
63 Arrays.asList("text/html;q=0.3", "application/xhtml+xml;q=0.3"),
64 null,
65 RDFaExtractor.class
66 );
67
68
69
70
71
72
73
74 public static synchronized XSLTStylesheet getXSLT() {
75
76
77 if (xslt == null) {
78 InputStream in = RDFaExtractor.class.getResourceAsStream(xsltFilename);
79 if (in == null) {
80 throw new RuntimeException("Couldn't load '" + xsltFilename +
81 "', maybe the file is not bundled in the jar?");
82 }
83 xslt = new XSLTStylesheet(in);
84 }
85 return xslt;
86 }
87
88 private boolean verifyDataType;
89
90 private boolean stopAtFirstError;
91
92
93
94
95
96
97
98
99
100 public RDFaExtractor(boolean verifyDataType, boolean stopAtFirstError) {
101 this.verifyDataType = verifyDataType;
102 this.stopAtFirstError = stopAtFirstError;
103 }
104
105
106
107
108 public RDFaExtractor() {
109 this(false, false);
110 }
111
112 public boolean isVerifyDataType() {
113 return verifyDataType;
114 }
115
116 public void setVerifyDataType(boolean verifyDataType) {
117 this.verifyDataType = verifyDataType;
118 }
119
120 public boolean isStopAtFirstError() {
121 return stopAtFirstError;
122 }
123
124 public void setStopAtFirstError(boolean stopAtFirstError) {
125 this.stopAtFirstError = stopAtFirstError;
126 }
127
128 public void run(
129 ExtractionParameters extractionParameters,
130 ExtractionContext extractionContext,
131 Document in,
132 ExtractionResult out
133 ) throws IOException, ExtractionException {
134
135 StringWriter buffer = new StringWriter();
136 try {
137 getXSLT().applyTo(in, buffer);
138 } catch (XSLTStylesheetException xslte) {
139 throw new ExtractionException("An error occurred during the XSLT application.", xslte);
140 }
141
142 try {
143 RDFParser parser
144 = RDFParserFactory.getInstance().getRDFXMLParser(
145 verifyDataType, stopAtFirstError, extractionContext, out
146 );
147 parser.parse(
148 new StringReader(buffer.getBuffer().toString()),
149 extractionContext.getDocumentURI().stringValue()
150 );
151 } catch (RDFHandlerException ex) {
152 throw new IllegalStateException(
153 "Should not happen, RDFHandlerAdapter does not throw RDFHandlerException", ex
154 );
155 } catch (RDFParseException ex) {
156 throw new ExtractionException(
157 "Invalid RDF/XML produced by RDFa transform.", ex, out
158 );
159 }
160 }
161
162 private String getDocType(Document in) {
163 return in.getDoctype().getPublicId();
164 }
165
166
167
168
169 public ExtractorDescription getDescription() {
170 return factory;
171 }
172
173 }