1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
import org.apache.any23.extractor.ExtractionContext;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.extractor.ExtractionParameters;
import org.apache.any23.extractor.ExtractionResult;
import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
import org.apache.any23.extractor.ExtractorDescription;
import org.apache.any23.extractor.IssueReport;
import org.apache.any23.extractor.TagSoupExtractionResult;
import org.apache.any23.extractor.html.annotations.Includes;
import org.apache.any23.rdf.Any23ValueFactoryWrapper;
import org.openrdf.model.BNode;
import org.openrdf.model.Literal;
import org.openrdf.model.Resource;
import org.openrdf.model.URI;
import org.openrdf.model.impl.ValueFactoryImpl;
import org.w3c.dom.Document;
import org.w3c.dom.Node;

import java.io.IOException;
import java.util.Locale;
39
40
41
42
43
44 public abstract class MicroformatExtractor implements TagSoupDOMExtractor {
45
46 public static final String BEGIN_SCRIPT = "<script>";
47 public static final String END_SCRIPT = "</script>";
48
49 private HTMLDocument htmlDocument;
50
51 private ExtractionContext context;
52
53 private URI documentURI;
54
55 private ExtractionResult out;
56
57 protected final Any23ValueFactoryWrapper valueFactory =
58 new Any23ValueFactoryWrapper(ValueFactoryImpl.getInstance());
59
60
61
62
63
64
65 public abstract ExtractorDescription getDescription();
66
67
68
69
70
71
72
73
74 protected abstract boolean extract() throws ExtractionException;
75
76 public HTMLDocument getHTMLDocument() {
77 return htmlDocument;
78 }
79
80 public ExtractionContext getExtractionContext() {
81 return context;
82 }
83
84 public URI getDocumentURI() {
85 return documentURI;
86 }
87
88 public final void run(
89 ExtractionParameters extractionParameters,
90 ExtractionContext extractionContext,
91 Document in,
92 ExtractionResult out
93 ) throws IOException, ExtractionException {
94 this.htmlDocument = new HTMLDocument(in);
95 this.context = extractionContext;
96 this.documentURI = extractionContext.getDocumentURI();
97 this.out = out;
98 valueFactory.setIssueReport(out);
99 try {
100 extract();
101 } finally {
102 valueFactory.setIssueReport(null);
103 }
104 }
105
106
107
108
109
110
111
112 protected ExtractionResult getCurrentExtractionResult() {
113 return out;
114 }
115
116 protected ExtractionResult openSubResult(ExtractionContext context) {
117 return out.openSubResult(context);
118 }
119
120
121
122
123
124
125
126
127
128
129
130 protected boolean conditionallyAddStringProperty(
131 Node n,
132 Resource subject, URI p, String value
133 ) {
134 if (value == null) return false;
135 value = value.trim();
136 return
137 value.length() > 0
138 &&
139 conditionallyAddLiteralProperty(
140 n,
141 subject, p, valueFactory.createLiteral(value)
142 );
143 }
144
145
146
147
148
149
150
151
152
153
154 protected boolean conditionallyAddLiteralProperty(
155 Node n,
156 Resource subject,
157 URI property,
158 Literal literal
159 ) {
160 final String literalStr = literal.stringValue();
161 if( containsScriptBlock(literalStr) ) {
162 out.notifyIssue(
163 IssueReport.IssueLevel.Warning,
164 String.format("Detected script in literal: [%s]", literalStr)
165 , -1
166 , -1
167 );
168 return false;
169 }
170 out.writeTriple(subject, property, literal);
171 TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
172 tser.addPropertyPath(this.getClass(), subject, property, null, DomUtils.getXPathListForNode(n) );
173 return true;
174 }
175
176
177
178
179
180
181
182
183 protected boolean conditionallyAddResourceProperty(Resource subject, URI property, URI uri) {
184 if (uri == null) return false;
185 out.writeTriple(subject, property, uri);
186 return true;
187 }
188
189
190
191
192
193
194
195
196
197 protected void addBNodeProperty(Node n, Resource subject, URI property, BNode bnode) {
198 out.writeTriple(subject, property, bnode);
199 TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
200 tser.addPropertyPath(this.getClass(), subject, property, bnode, DomUtils.getXPathListForNode(n) );
201 }
202
203
204
205
206
207
208
209
210 protected void addBNodeProperty( Resource subject, URI property, BNode bnode) {
211 out.writeTriple(subject, property, bnode);
212 }
213
214
215
216
217
218
219
220
221 protected void addURIProperty(Resource subject, URI property, URI object) {
222 out.writeTriple(subject, property, object);
223 }
224
225 protected URI fixLink(String link) {
226 return valueFactory.fixLink(link, null);
227 }
228
229 protected URI fixLink(String link, String defaultSchema) {
230 return valueFactory.fixLink(link, defaultSchema);
231 }
232
233 private boolean containsScriptBlock(String in) {
234 final String inLowerCase = in.toLowerCase();
235 final int beginBlock = inLowerCase.indexOf(BEGIN_SCRIPT);
236 if(beginBlock == -1) {
237 return false;
238 }
239 return inLowerCase.indexOf(END_SCRIPT, beginBlock + BEGIN_SCRIPT.length()) != -1;
240 }
241
242
243
244
245
246
247
248
249
250
251 public static boolean includes(
252 Class<? extends MicroformatExtractor>including,
253 Class<? extends MicroformatExtractor> included) {
254 Includes includes = including.getAnnotation(Includes.class);
255 if (includes != null) {
256 Class<? extends MicroformatExtractor>[] extractors = includes.extractors();
257 if (extractors != null && extractors.length > 0) {
258 for (Class<? extends MicroformatExtractor> extractor : extractors) {
259 if (extractor.equals(included)) {
260 return true;
261 }
262 }
263 }
264 }
265 return false;
266 }
267
268 }