View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.IssueReport;
21  import org.apache.any23.extractor.ExtractionContext;
22  import org.apache.any23.extractor.ExtractionException;
23  import org.apache.any23.extractor.ExtractionParameters;
24  import org.apache.any23.extractor.ExtractionResult;
25  import org.apache.any23.extractor.Extractor;
26  import org.apache.any23.extractor.ExtractorDescription;
27  import org.apache.any23.extractor.rdf.RDFParserFactory;
28  import org.eclipse.rdf4j.model.IRI;
29  import org.eclipse.rdf4j.rio.RDFParseException;
30  import org.eclipse.rdf4j.rio.RDFParser;
31  import org.w3c.dom.Document;
32  import org.w3c.dom.Node;
33  
34  import java.io.IOException;
35  import java.io.StringReader;
36  import java.util.Arrays;
37  import java.util.List;
38  import java.util.Locale;
39  
40  /**
41   * Extractor for <i>Turtle/N3</i> format embedded within <i>HTML</i> <i>script</i> tags.
42   *
43   * See specification draft <a href="http://esw.w3.org/N3inHTML">here</a>.
44   *
45   * @author Michele Mostarda (mostarda@fbk.eu)
46   */
47  public class TurtleHTMLExtractor implements Extractor.TagSoupDOMExtractor {
48  
49      private RDFParser turtleParser;
50  
51      @Override
52      public void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, Document in,
53              ExtractionResult out) throws IOException, ExtractionException {
54          List<Node> scriptNodes;
55          HTMLDocumentocument.html#HTMLDocument">HTMLDocument htmlDocument = new HTMLDocument(in);
56          final IRI documentIRI = extractionContext.getDocumentIRI();
57  
58          scriptNodes = htmlDocument.findAll(".//SCRIPT[contains(@type,'text/turtle')]");
59          processScriptNodes(documentIRI, extractionContext, out, scriptNodes);
60  
61          scriptNodes = htmlDocument.findAll(".//SCRIPT[contains(@type,'text/n3')]");
62          processScriptNodes(documentIRI, extractionContext, out, scriptNodes);
63  
64          scriptNodes = htmlDocument.findAll(".//SCRIPT[contains(@type,'text/plain')]");
65          processScriptNodes(documentIRI, extractionContext, out, scriptNodes);
66      }
67  
68      @Override
69      public ExtractorDescription getDescription() {
70          return TurtleHTMLExtractorFactory.getDescriptionInstance();
71      }
72  
73      /**
74       * Processes a list of <i>html script</i> nodes retrieving the N3 / Turtle content.
75       *
76       * @param documentIRI
77       *            the IRI of the original HTML document.
78       * @param er
79       *            the extraction result used to store triples.
80       * @param ns
81       *            the list of script nodes.
82       */
83      private void processScriptNodes(IRI documentIRI, ExtractionContext ec, ExtractionResult er, List<Node> ns) {
84          if (ns.size() > 0 && turtleParser == null) {
85              turtleParser = RDFParserFactory.getInstance().getTurtleParserInstance(true, false, ec, er);
86          }
87          for (Node n : ns) {
88              processScriptNode(turtleParser, documentIRI, n, er);
89          }
90      }
91  
92      /**
93       * Processes a single <i>html script</i> node.
94       *
95       * @param turtleParser
96       *            the parser used to digest node content.
97       * @param documentIRI
98       *            the IRI of the original HTML document.
99       * @param n
100      *            the script node.
101      * @param er
102      *            the extraction result used to store triples.
103      */
104     private void processScriptNode(RDFParser turtleParser, IRI documentIRI, Node n, ExtractionResult er) {
105         final Node idAttribute = n.getAttributes().getNamedItem("id");
106         final String graphName = documentIRI.stringValue()
107                 + (idAttribute == null ? "" : "#" + idAttribute.getTextContent());
108         try {
109             turtleParser.parse(new StringReader(n.getTextContent()), graphName);
110         } catch (RDFParseException rdfpe) {
111             er.notifyIssue(IssueReport.IssueLevel.ERROR,
112                     String.format(Locale.ROOT, "An error occurred while parsing turtle content within script node: %s",
113                             Arrays.toString(DomUtils.getXPathListForNode(n))),
114                     rdfpe.getLineNumber(), rdfpe.getColumnNumber());
115         } catch (Exception e) {
116             er.notifyIssue(IssueReport.IssueLevel.ERROR, "An error occurred while processing RDF data.", -1, -1);
117         }
118     }
119 
120 }