1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.extractor.IssueReport;
21 import org.apache.any23.extractor.ExtractionContext;
22 import org.apache.any23.extractor.ExtractionException;
23 import org.apache.any23.extractor.ExtractionParameters;
24 import org.apache.any23.extractor.ExtractionResult;
25 import org.apache.any23.extractor.Extractor;
26 import org.apache.any23.extractor.ExtractorDescription;
27 import org.apache.any23.extractor.ExtractorFactory;
28 import org.apache.any23.extractor.SimpleExtractorFactory;
29 import org.apache.any23.extractor.rdf.RDFParserFactory;
30 import org.apache.any23.rdf.PopularPrefixes;
31 import org.openrdf.model.URI;
32 import org.openrdf.rio.RDFParseException;
33 import org.openrdf.rio.RDFParser;
34 import org.openrdf.rio.turtle.TurtleParser;
35 import org.w3c.dom.Document;
36 import org.w3c.dom.Node;
37
38 import java.io.IOException;
39 import java.io.StringReader;
40 import java.util.Arrays;
41 import java.util.List;
42
43
44
45
46
47
48
49
50
51 public class TurtleHTMLExtractor implements Extractor.TagSoupDOMExtractor {
52
53 public final static String NAME = "html-script-turtle";
54
55 public final static ExtractorFactory<TurtleHTMLExtractor> factory =
56 SimpleExtractorFactory.create(
57 NAME,
58 PopularPrefixes.get(),
59 Arrays.asList("text/html;q=0.02", "application/xhtml+xml;q=0.02"),
60 "example-script-turtle.html",
61 TurtleHTMLExtractor.class
62 );
63
64 private RDFParser turtleParser;
65
66 public void run(
67 ExtractionParameters extractionParameters,
68 ExtractionContext extractionContext,
69 Document in,
70 ExtractionResult out
71 ) throws IOException, ExtractionException {
72 List<Node> scriptNodes;
73 HTMLDocument htmlDocument = new HTMLDocument(in);
74 final URI documentURI = extractionContext.getDocumentURI();
75
76 scriptNodes = htmlDocument.findAll(".//SCRIPT[contains(@type,'text/turtle')]");
77 processScriptNodes(documentURI, extractionContext, out, scriptNodes);
78
79 scriptNodes = htmlDocument.findAll(".//SCRIPT[contains(@type,'text/n3')]");
80 processScriptNodes(documentURI, extractionContext, out, scriptNodes);
81
82 scriptNodes = htmlDocument.findAll(".//SCRIPT[contains(@type,'text/plain')]");
83 processScriptNodes(documentURI, extractionContext,out, scriptNodes);
84 }
85
86 public ExtractorDescription getDescription() {
87 return factory;
88 }
89
90
91
92
93
94
95
96
97 private void processScriptNodes(URI documentURI, ExtractionContext ec, ExtractionResult er, List<Node> ns) {
98 if(ns.size() > 0 && turtleParser == null) {
99 turtleParser = RDFParserFactory.getInstance().getTurtleParserInstance(true, false, ec, er);
100 }
101 for(Node n : ns) {
102 processScriptNode(turtleParser, documentURI, n, er);
103 }
104 }
105
106
107
108
109
110
111
112
113
114 private void processScriptNode(RDFParser turtleParser, URI documentURI, Node n, ExtractionResult er) {
115 final Node idAttribute = n.getAttributes().getNamedItem("id");
116 final String graphName =
117 documentURI.stringValue() +
118 ( idAttribute == null ? "" : "#" + idAttribute.getTextContent() );
119 try {
120 turtleParser.parse( new StringReader(n.getTextContent()), graphName );
121 } catch (RDFParseException rdfpe) {
122 er.notifyIssue(
123 IssueReport.IssueLevel.Error,
124 String.format(
125 "An error occurred while parsing turtle content within script node: %s",
126 Arrays.toString(DomUtils.getXPathListForNode(n))
127 ),
128 rdfpe.getLineNumber(), rdfpe.getColumnNumber()
129 );
130 } catch (Exception e) {
131 er.notifyIssue(IssueReport.IssueLevel.Error, "An error occurred while processing RDF data.", -1, -1);
132 }
133 }
134
135 }