View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.rdfa;
19  
20  import org.apache.any23.extractor.ExtractionContext;
21  import org.apache.any23.extractor.ExtractionException;
22  import org.apache.any23.extractor.ExtractionParameters;
23  import org.apache.any23.extractor.ExtractionResult;
24  import org.apache.any23.extractor.IssueReport;
25  import org.apache.any23.extractor.rdf.BaseRDFExtractor;
26  import org.apache.any23.rdf.Any23ValueFactoryWrapper;
27  import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
28  import org.eclipse.rdf4j.rio.helpers.RDFaParserSettings;
29  import org.jsoup.Jsoup;
30  import org.jsoup.nodes.Document;
31  import org.jsoup.parser.ParseSettings;
32  import org.jsoup.parser.Parser;
33  import org.semarglproject.rdf.rdfa.RdfaParser;
34  import org.semarglproject.rdf4j.rdf.rdfa.SemarglParserSettings;
35  import org.semarglproject.sink.XmlSink;
36  import org.semarglproject.source.StreamProcessor;
37  
38  import java.io.IOException;
39  import java.io.InputStream;
40  import java.io.PrintWriter;
41  import java.io.StringWriter;
42  
43  /**
44   * @author Hans Brende (hansbrende@apache.org)
45   */
46  abstract class BaseRDFaExtractor extends BaseRDFExtractor {
47  
48      private final short version;
49  
50      BaseRDFaExtractor(short version) {
51          super(false, false);
52          this.version = version;
53      }
54  
55      @Override
56      public void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, InputStream in,
57              ExtractionResult extractionResult) throws IOException, ExtractionException {
58  
59          SemarglSinkSemarglSink.html#SemarglSink">SemarglSink rdfaSink = new SemarglSink(extractionResult, new Any23ValueFactoryWrapper(
60                  SimpleValueFactory.getInstance(), extractionResult, extractionContext.getDefaultLanguage()));
61  
62          XmlSink xmlSink = RdfaParser.connect(rdfaSink);
63          xmlSink.setProperty(StreamProcessor.PROCESSOR_GRAPH_HANDLER_PROPERTY, rdfaSink);
64          xmlSink.setProperty(RdfaParser.RDFA_VERSION_PROPERTY, version);
65          xmlSink.setProperty(RdfaParser.ENABLE_VOCAB_EXPANSION,
66                  RDFaParserSettings.VOCAB_EXPANSION_ENABLED.getDefaultValue());
67          xmlSink.setProperty(RdfaParser.ENABLE_PROCESSOR_GRAPH,
68                  SemarglParserSettings.PROCESSOR_GRAPH_ENABLED.getDefaultValue());
69  
70          String baseUri = extractionContext.getDocumentIRI().stringValue();
71          xmlSink.setBaseUri(baseUri);
72          Document doc = Jsoup.parse(in, null, baseUri, Parser.htmlParser().settings(ParseSettings.preserveCase));
73          try {
74              xmlSink.startDocument();
75              doc.traverse(new JsoupScanner(xmlSink));
76              xmlSink.endDocument();
77          } catch (Exception e) {
78              extractionResult.notifyIssue(IssueReport.IssueLevel.FATAL, toString(e), -1, -1);
79          }
80      }
81  
82      @SuppressWarnings("Duplicates")
83      private static String toString(Throwable th) {
84          StringWriter writer = new StringWriter();
85          try (PrintWriter pw = new PrintWriter(writer)) {
86              th.printStackTrace(pw);
87          }
88          String string = writer.toString();
89          if (string.length() > 1024) {
90              return string.substring(0, 1021) + "...";
91          }
92          return string;
93      }
94  
95  }