View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.IssueReport;
21  import org.apache.any23.extractor.ExtractionContext;
22  import org.apache.any23.extractor.ExtractionException;
23  import org.apache.any23.extractor.ExtractionParameters;
24  import org.apache.any23.extractor.ExtractionResult;
25  import org.apache.any23.extractor.Extractor;
26  import org.apache.any23.extractor.ExtractorDescription;
27  import org.apache.any23.extractor.ExtractorFactory;
28  import org.apache.any23.extractor.SimpleExtractorFactory;
29  import org.apache.any23.extractor.rdf.RDFParserFactory;
30  import org.apache.any23.rdf.PopularPrefixes;
31  import org.openrdf.model.URI;
32  import org.openrdf.rio.RDFParseException;
33  import org.openrdf.rio.RDFParser;
34  import org.openrdf.rio.turtle.TurtleParser;
35  import org.w3c.dom.Document;
36  import org.w3c.dom.Node;
37  
38  import java.io.IOException;
39  import java.io.StringReader;
40  import java.util.Arrays;
41  import java.util.List;
42  
43  /**
44   * Extractor for <i>Turtle/N3</i> format embedded within <i>HTML</i>
45   * <i>script</i> tags.
46   *
47   * See specification draft <a href="http://esw.w3.org/N3inHTML">here</a>. 
48   *
49   * @author Michele Mostarda (mostarda@fbk.eu)
50   */
51  public class TurtleHTMLExtractor implements Extractor.TagSoupDOMExtractor {
52  
53      public final static String NAME = "html-script-turtle";
54  
55      public final static ExtractorFactory<TurtleHTMLExtractor> factory =
56              SimpleExtractorFactory.create(
57                      NAME,
58                      PopularPrefixes.get(),
59                      Arrays.asList("text/html;q=0.02", "application/xhtml+xml;q=0.02"),
60                      "example-script-turtle.html",
61                      TurtleHTMLExtractor.class
62              );
63  
64      private RDFParser turtleParser;
65  
66      public void run(
67              ExtractionParameters extractionParameters,
68              ExtractionContext extractionContext,
69              Document in,
70              ExtractionResult out
71      ) throws IOException, ExtractionException {
72          List<Node> scriptNodes;
73          HTMLDocument htmlDocument = new HTMLDocument(in);
74          final URI documentURI = extractionContext.getDocumentURI();
75  
76          scriptNodes = htmlDocument.findAll(".//SCRIPT[contains(@type,'text/turtle')]");
77          processScriptNodes(documentURI, extractionContext, out, scriptNodes);
78  
79          scriptNodes = htmlDocument.findAll(".//SCRIPT[contains(@type,'text/n3')]");
80          processScriptNodes(documentURI, extractionContext, out, scriptNodes);
81  
82          scriptNodes = htmlDocument.findAll(".//SCRIPT[contains(@type,'text/plain')]");
83          processScriptNodes(documentURI, extractionContext,out, scriptNodes);
84      }
85  
86      public ExtractorDescription getDescription() {
87          return factory;
88      }
89  
90      /**
91       * Processes a list of <i>html script</i> nodes retrieving the N3 / Turtle content.
92       *
93       * @param documentURI the URI of the original HTML document.
94       * @param er the extraction result used to store triples.
95       * @param ns the list of script nodes.
96       */
97      private void processScriptNodes(URI documentURI, ExtractionContext ec, ExtractionResult er, List<Node> ns) {
98          if(ns.size() > 0 && turtleParser == null) {
99              turtleParser = RDFParserFactory.getInstance().getTurtleParserInstance(true, false, ec, er);
100         }
101         for(Node n : ns) {
102             processScriptNode(turtleParser, documentURI, n, er);
103         }
104     }
105 
106     /**
107      * Processes a single <i>html script</i> node.
108      *
109      * @param turtleParser the parser used to digest node content.
110      * @param documentURI the URI of the original HTML document.
111      * @param n the script node.
112      * @param er the extraction result used to store triples.
113      */
114     private void processScriptNode(RDFParser turtleParser, URI documentURI, Node n, ExtractionResult er) {
115         final Node idAttribute = n.getAttributes().getNamedItem("id");
116         final String graphName =
117                 documentURI.stringValue() +
118                 ( idAttribute == null ? "" : "#" +   idAttribute.getTextContent() ); 
119         try {
120             turtleParser.parse( new StringReader(n.getTextContent()), graphName );
121         } catch (RDFParseException rdfpe) {
122             er.notifyIssue(
123                     IssueReport.IssueLevel.Error,
124                     String.format(
125                             "An error occurred while parsing turtle content within script node: %s",
126                             Arrays.toString(DomUtils.getXPathListForNode(n))
127                     ),
128                     rdfpe.getLineNumber(), rdfpe.getColumnNumber()
129             );
130         } catch (Exception e) {
131             er.notifyIssue(IssueReport.IssueLevel.Error, "An error occurred while processing RDF data.", -1, -1);
132         }
133     }
134 
135 }