1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.extractor.ExtractionException;
21 import org.apache.any23.rdf.Any23ValueFactoryWrapper;
22 import org.apache.any23.rdf.RDFUtils;
23 import org.openrdf.model.URI;
24 import org.openrdf.model.impl.ValueFactoryImpl;
25 import org.slf4j.Logger;
26 import org.slf4j.LoggerFactory;
27 import org.w3c.dom.NamedNodeMap;
28 import org.w3c.dom.Node;
29 import org.w3c.dom.NodeList;
30 import org.w3c.dom.Text;
31
32 import javax.xml.xpath.XPath;
33 import javax.xml.xpath.XPathConstants;
34 import javax.xml.xpath.XPathExpressionException;
35 import javax.xml.xpath.XPathFactory;
36 import java.net.URISyntaxException;
37 import java.util.ArrayList;
38 import java.util.List;
39
40
41
42
43
44
45
46
47 public class HTMLDocument {
48
49 private final static XPath xPathEngine = XPathFactory.newInstance().newXPath();
50 private final static Logger log = LoggerFactory.getLogger(HTMLDocument.class);
51
52 private Node document;
53 private java.net.URI baseURI;
54
55 private final Any23ValueFactoryWrapper valueFactory =
56 new Any23ValueFactoryWrapper(ValueFactoryImpl.getInstance());
57
58
59
60
61
62
63
64 public static TextField readTextField(Node node) {
65 TextField result;
66 final String name = node.getNodeName();
67 final NamedNodeMap attributes = node.getAttributes();
68
69 if (attributes == null ) {
70 return new TextField( node.getTextContent(), node);
71 }
72
73 List<Node> values = DomUtils.findAllByClassName(node, "value");
74 if (!values.isEmpty()) {
75 String val = "";
76 for (Node n : values)
77 val += n.getTextContent();
78 return new TextField( val.trim(), node);
79 }
80 if ("ABBR".equals(name) && (null != attributes.getNamedItem("title"))) {
81 result = new TextField(attributes.getNamedItem("title").getNodeValue(), node);
82 } else if ("A".equals(name)) {
83 if (DomUtils.hasAttribute(node, "rel", "tag")) {
84 String href = extractRelTag(attributes);
85 result = new TextField(href, node);
86 } else
87 result = new TextField(node.getTextContent(), node);
88 } else if ("IMG".equals(name) || "AREA".equals(name)) {
89 result = new TextField(attributes.getNamedItem("alt").getNodeValue(), node);
90 } else {
91 result = new TextField(node.getTextContent(), node);
92 }
93 return result;
94 }
95
96
97
98
99
100
101
102 public static void readUrlField(List<TextField> res, Node node) {
103 String name = node.getNodeName();
104 NamedNodeMap attributes = node.getAttributes();
105 if (null == attributes) {
106 res.add( new TextField(node.getTextContent(), node) );
107 return;
108 }
109 if ("A".equals(name) || "AREA".equals(name)) {
110 Node n = attributes.getNamedItem("href");
111 res.add( new TextField(n.getNodeValue(), n) );
112 } else if ("ABBR".equals(name)) {
113 Node n = attributes.getNamedItem("title");
114 res.add( new TextField(n.getNodeValue(), n) );
115 } else if ("IMG".equals(name)) {
116 Node n = attributes.getNamedItem("src");
117 res.add( new TextField(n.getNodeValue(), n) );
118 } else if ("OBJECT".equals(name)) {
119 Node n = attributes.getNamedItem("data");
120 res.add( new TextField(n.getNodeValue(), n) );
121 } else {
122 res.add( new TextField(node.getTextContent().trim(), node) );
123 }
124 }
125
126
127
128
129
130
131
132
133 public static String extractRelTag(String hrefAttributeContent) {
134 String[] all = hrefAttributeContent.split("[#?]");
135
136 String path = all[0];
137 int pathLenghtMin1 = path.length() - 1;
138 if( '/' == path.charAt(pathLenghtMin1) ) {
139 path = path.substring(0, pathLenghtMin1);
140 }
141 return path;
142 }
143
144
145
146
147
148
149
150
151 public static String extractRelTag(NamedNodeMap attributes) {
152 return extractRelTag(attributes.getNamedItem("href").getNodeValue());
153 }
154
155
156
157
158
159
160
161
162
163
164 public static String readNodeContent(Node node, boolean prettify) {
165 final String content = node.getTextContent();
166 return prettify ? content.trim().replaceAll("\\n", " ").replaceAll(" +", " ") : content;
167 }
168
169
170
171
172
173
/**
 * Wraps the given DOM node.
 *
 * @param document the node to wrap.
 * @throws IllegalArgumentException if {@code document} is {@code null}.
 */
public HTMLDocument(Node document) {
    if (document == null) {
        throw new IllegalArgumentException("node cannot be null when constructing an HTMLDocument");
    }
    this.document = document;
}
179
180
181
182
183
184 public URI resolveURI(String uri) throws ExtractionException {
185 return valueFactory.resolveURI(uri, getBaseURI());
186 }
187
188 public String find(String xpath) {
189 return DomUtils.find(getDocument(), xpath);
190 }
191
192 public Node findNodeById(String id) {
193 return DomUtils.findNodeById(getDocument(), id);
194 }
195
196 public List<Node> findAll(String xpath) {
197 return DomUtils.findAll(getDocument(), xpath);
198 }
199
200 public String findMicroformattedValue(
201 String objectTag,
202 String object,
203 String fieldTag,
204 String field,
205 String key
206 ) {
207 Node node = findMicroformattedObjectNode(objectTag, object);
208 if (null == node)
209 return "";
210
211 if (DomUtils.hasClassName(node, field))
212 return node.getTextContent();
213
214
215 try {
216 String xpath = ".//" + fieldTag + "[contains(@class, '" + field + "')]/" + key;
217 String value = (String) xPathEngine.evaluate(xpath, node, XPathConstants.STRING);
218 if (null == value) {
219 return "";
220 }
221 return value;
222 } catch (XPathExpressionException ex) {
223 throw new RuntimeException("Should not happen, XPath expression is built locally", ex);
224 }
225
226 }
227
228 public Node getDocument() {
229 return document;
230 }
231
232
233
234
235
236
237
238
239 public TextField getSingularTextField(String className) {
240 TextField[] res = getPluralTextField(className);
241 if (res.length == 0)
242 return new TextField("", null);
243 return res[0];
244 }
245
246
247
248
249
250
251
252 public TextField[] getPluralTextField(String className) {
253 List<TextField> res = new ArrayList<TextField>();
254 List<Node> nodes = DomUtils.findAllByClassName(getDocument(), className);
255 for (Node node : nodes) {
256 res.add( readTextField(node) );
257 }
258 return res.toArray( new TextField[res.size()] );
259 }
260
261
262
263
264
265
266
267
268 public TextField getSingularUrlField(String className) {
269 TextField[] res = getPluralUrlField(className);
270 if (res.length < 1)
271 return new TextField("", null);
272 return res[0];
273 }
274
275
276
277
278
279
280
281 public TextField[] getPluralUrlField(String className) {
282 List<TextField> res = new ArrayList<TextField>();
283 List<Node> nodes = DomUtils.findAllByClassName(getDocument(), className);
284 for (Node node : nodes)
285 readUrlField(res, node);
286 return res.toArray( new TextField[res.size()] );
287 }
288
289 public Node findMicroformattedObjectNode(String objectTag, String name) {
290 List<Node> nodes = DomUtils.findAllByTagAndClassName(getDocument(), objectTag, name);
291 if (nodes.isEmpty())
292 return null;
293 return nodes.get(0);
294 }
295
296
297
298
299
300
301
302
303 public String readAttribute(String attribute) {
304 return DomUtils.readAttribute(getDocument(), attribute);
305 }
306
307
308
309
310
311
312
313 public List<Node> findAllByClassName(String clazz) {
314 return DomUtils.findAllByClassName(getDocument(), clazz);
315 }
316
317
318
319
320
321
322
323 public String getText() {
324 NodeList children = getDocument().getChildNodes();
325 if(children.getLength() == 1 && children.item(0) instanceof Text) {
326 return children.item(0).getTextContent();
327 }
328 return null;
329 }
330
331
332
333
334
335
336 public String getDefaultLanguage() {
337 final String xpathLanguageSelector = "/HTML";
338 Node html;
339 try {
340 html = (Node) xPathEngine.evaluate(xpathLanguageSelector, document, XPathConstants.NODE);
341 } catch (XPathExpressionException xpeee) {
342 throw new IllegalStateException();
343 }
344 if (html == null) {
345 return null;
346 }
347 Node langAttribute = html.getAttributes().getNamedItem("xml:lang");
348 return langAttribute == null ? null : langAttribute.getTextContent();
349 }
350
351
352
353
354
355
356 public String[] getPathToLocalRoot() {
357 return DomUtils.getXPathListForNode(document);
358 }
359
360
361
362
363
364
365 public TextField[] extractRelTagNodes() {
366 final List<Node> relTagNodes = DomUtils.findAllByAttributeName(getDocument(), "rel");
367 final List<TextField> result = new ArrayList<TextField>();
368 for(Node relTagNode : relTagNodes) {
369 readUrlField(result, relTagNode);
370 }
371 return result.toArray( new TextField[result.size()] );
372 }
373
374 private java.net.URI getBaseURI() throws ExtractionException {
375 if (baseURI == null) {
376 try {
377 if (document.getBaseURI() == null) {
378 log.warn("document.getBaseURI() is null, this should not happen");
379 }
380 baseURI = new java.net.URI(RDFUtils.fixAbsoluteURI(document.getBaseURI()));
381 } catch (IllegalArgumentException ex) {
382 throw new ExtractionException("Error in base URI: " + document.getBaseURI(), ex);
383 } catch (URISyntaxException ex) {
384 throw new ExtractionException("Error in base URI: " + document.getBaseURI(), ex);
385 }
386 }
387 return baseURI;
388 }
389
390
391
392
393
394 public static class TextField {
395 private String value;
396 private Node source;
397
398 public TextField(String value, Node source) {
399 this.value = value;
400 this.source = source;
401 }
402
403 public String value() {
404 return value;
405 }
406
407 public Node source() {
408 return source;
409 }
410 }
411
412 }