/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.any23.extractor.html;

import org.apache.any23.extractor.IssueReport;
import org.apache.any23.extractor.ExtractionContext;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.extractor.ExtractionParameters;
import org.apache.any23.extractor.ExtractionResult;
import org.apache.any23.extractor.Extractor;
import org.apache.any23.extractor.ExtractorDescription;
import org.apache.any23.extractor.rdf.RDFParserFactory;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.rio.RDFParseException;
import org.eclipse.rdf4j.rio.RDFParser;
import org.w3c.dom.Document;
import org.w3c.dom.Node;

import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;
import java.util.List;

39  /**
40   * Extractor for <i>Turtle/N3</i> format embedded within <i>HTML</i>
41   * <i>script</i> tags.
42   *
43   * See specification draft <a href="http://esw.w3.org/N3inHTML">here</a>. 
44   *
45   * @author Michele Mostarda (mostarda@fbk.eu)
46   */
47  public class TurtleHTMLExtractor implements Extractor.TagSoupDOMExtractor {
48  
49      private RDFParser turtleParser;
50  
51      @Override
52      public void run(
53              ExtractionParameters extractionParameters,
54              ExtractionContext extractionContext,
55              Document in,
56              ExtractionResult out
57      ) throws IOException, ExtractionException {
58          List<Node> scriptNodes;
59          HTMLDocumentocument.html#HTMLDocument">HTMLDocument htmlDocument = new HTMLDocument(in);
60          final IRI documentIRI = extractionContext.getDocumentIRI();
61  
62          scriptNodes = htmlDocument.findAll(".//SCRIPT[contains(@type,'text/turtle')]");
63          processScriptNodes(documentIRI, extractionContext, out, scriptNodes);
64  
65          scriptNodes = htmlDocument.findAll(".//SCRIPT[contains(@type,'text/n3')]");
66          processScriptNodes(documentIRI, extractionContext, out, scriptNodes);
67  
68          scriptNodes = htmlDocument.findAll(".//SCRIPT[contains(@type,'text/plain')]");
69          processScriptNodes(documentIRI, extractionContext,out, scriptNodes);
70      }
71  
72      @Override
73      public ExtractorDescription getDescription() {
74          return TurtleHTMLExtractorFactory.getDescriptionInstance();
75      }
76  
77      /**
78       * Processes a list of <i>html script</i> nodes retrieving the N3 / Turtle content.
79       *
80       * @param documentIRI the IRI of the original HTML document.
81       * @param er the extraction result used to store triples.
82       * @param ns the list of script nodes.
83       */
84      private void processScriptNodes(IRI documentIRI, ExtractionContext ec, ExtractionResult er, List<Node> ns) {
85          if(ns.size() > 0 && turtleParser == null) {
86              turtleParser = RDFParserFactory.getInstance().getTurtleParserInstance(true, false, ec, er);
87          }
88          for(Node n : ns) {
89              processScriptNode(turtleParser, documentIRI, n, er);
90          }
91      }
92  
93      /**
94       * Processes a single <i>html script</i> node.
95       *
96       * @param turtleParser the parser used to digest node content.
97       * @param documentIRI the IRI of the original HTML document.
98       * @param n the script node.
99       * @param er the extraction result used to store triples.
100      */
101     private void processScriptNode(RDFParser turtleParser, IRI documentIRI, Node n, ExtractionResult er) {
102         final Node idAttribute = n.getAttributes().getNamedItem("id");
103         final String graphName =
104                 documentIRI.stringValue() +
105                 ( idAttribute == null ? "" : "#" +   idAttribute.getTextContent() ); 
106         try {
107             turtleParser.parse( new StringReader(n.getTextContent()), graphName );
108         } catch (RDFParseException rdfpe) {
109             er.notifyIssue(
110                     IssueReport.IssueLevel.ERROR,
111                     String.format(
112                             "An error occurred while parsing turtle content within script node: %s",
113                             Arrays.toString(DomUtils.getXPathListForNode(n))
114                     ),
115                     rdfpe.getLineNumber(), rdfpe.getColumnNumber()
116             );
117         } catch (Exception e) {
118             er.notifyIssue(IssueReport.IssueLevel.ERROR, "An error occurred while processing RDF data.", -1, -1);
119         }
120     }
121 
122 }