View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.rdf;
19  
20  import com.fasterxml.jackson.core.JsonLocation;
21  import com.fasterxml.jackson.core.JsonProcessingException;
22  import org.apache.any23.extractor.ExtractionContext;
23  import org.apache.any23.extractor.ExtractionException;
24  import org.apache.any23.extractor.ExtractionParameters;
25  import org.apache.any23.extractor.ExtractionResult;
26  import org.apache.any23.extractor.Extractor;
27  import org.apache.any23.extractor.IssueReport;
28  import org.apache.any23.extractor.html.JsoupUtils;
29  import org.eclipse.rdf4j.common.net.ParsedIRI;
30  import org.eclipse.rdf4j.rio.RDFFormat;
31  import org.eclipse.rdf4j.rio.RDFParser;
32  import org.jsoup.nodes.Attribute;
33  import org.jsoup.nodes.Comment;
34  import org.jsoup.nodes.DataNode;
35  import org.jsoup.nodes.Document;
36  import org.jsoup.nodes.DocumentType;
37  import org.jsoup.nodes.Element;
38  import org.jsoup.nodes.Entities;
39  import org.jsoup.nodes.Node;
40  import org.jsoup.select.NodeFilter;
41  import org.jsoup.select.NodeTraversor;
42  
43  import java.io.ByteArrayInputStream;
44  import java.io.IOException;
45  import java.io.InputStream;
46  import java.io.PrintWriter;
47  import java.io.StringWriter;
48  import java.nio.charset.Charset;
49  import java.nio.charset.StandardCharsets;
50  import java.util.HashSet;
51  import java.util.Iterator;
52  import java.util.regex.Pattern;
53  
54  /**
55   * Base class for a generic <i>RDF</i>
56   * {@link org.apache.any23.extractor.Extractor.ContentExtractor}.
57   *
58   * @author Michele Mostarda (mostarda@fbk.eu)
59   * @author Hans Brende (hansbrende@apache.org)
60   */
61  public abstract class BaseRDFExtractor implements Extractor.ContentExtractor {
62  
63      private boolean verifyDataType;
64      private boolean stopAtFirstError;
65  
66      public BaseRDFExtractor() {
67          this(false, false);
68      }
69  
70      /**
71       * Constructor, allows to specify the validation and error handling policies.
72       *
73       * @param verifyDataType if <code>true</code> the data types will be verified,
74       *         if <code>false</code> will be ignored.
75       * @param stopAtFirstError if <code>true</code> the parser will stop at first parsing error,
76       *        if <code>false</code> will ignore non blocking errors.
77       */
78      public BaseRDFExtractor(boolean verifyDataType, boolean stopAtFirstError) {
79          this.verifyDataType = verifyDataType;
80          this.stopAtFirstError = stopAtFirstError;
81      }
82  
83      protected abstract RDFParser getParser(
84              ExtractionContext extractionContext,
85              ExtractionResult extractionResult
86      );
87  
88      public boolean isVerifyDataType() {
89          return verifyDataType;
90      }
91  
92      public void setVerifyDataType(boolean verifyDataType) {
93          this.verifyDataType = verifyDataType;
94      }
95  
96      public boolean isStopAtFirstError() {
97          return stopAtFirstError;
98      }
99  
100     @Override
101     public void setStopAtFirstError(boolean b) {
102         stopAtFirstError = b;
103     }
104 
105     private static final Pattern invalidXMLCharacters = Pattern.compile(
106             "[^\u0009\r\n\u0020-\uD7FF\uE000-\uFFFD\ud800\udc00-\udbff\udfff]");
107 
108     @Override
109     public void run(
110             ExtractionParameters extractionParameters,
111             ExtractionContext extractionContext,
112             InputStream in,
113             ExtractionResult extractionResult
114     ) throws IOException, ExtractionException {
115         try {
116             final RDFParser parser = getParser(extractionContext, extractionResult);
117 
118             RDFFormat format = parser.getRDFFormat();
119             String iri = extractionContext.getDocumentIRI().stringValue();
120 
121             if (format.hasFileExtension("xhtml") || format.hasMIMEType("application/xhtml+xml")) {
122                 Charset charset = format.getCharset();
123                 if (charset == null) {
124                     charset = StandardCharsets.UTF_8;
125                 }
126                 Document doc = JsoupUtils.parse(in, iri, null);
127                 doc.outputSettings()
128                         .prettyPrint(false)
129                         .syntax(Document.OutputSettings.Syntax.xml)
130                         .escapeMode(Entities.EscapeMode.xhtml)
131                         .charset(charset);
132                 // Delete scripts, comments, and doctypes
133                 // See https://issues.apache.org/jira/browse/ANY23-317
134                 // and https://issues.apache.org/jira/browse/ANY23-340
135                 NodeTraversor.filter(new NodeFilter() {
136                     final HashSet<String> tmpAttributeKeys = new HashSet<>();
137 
138                     @Override
139                     public FilterResult head(Node node, int depth) {
140                         if (node instanceof Element) {
141                             HashSet<String> attributeKeys = tmpAttributeKeys;
142                             for (Iterator<Attribute> it = node.attributes().iterator(); it.hasNext(); ) {
143                                 // fix for ANY23-350: valid xml attribute names are ^[a-zA-Z_:][-a-zA-Z0-9_:.]
144                                 Attribute attr = it.next();
145                                 String oldKey = attr.getKey();
146                                 String newKey = oldKey.replaceAll("[^-a-zA-Z0-9_:.]", "");
147 
148                                 // fix for ANY23-347: strip non-reserved xml namespaces
149                                 // See https://www.w3.org/TR/xml-names/#sec-namespaces
150                                 // "All other prefixes beginning with the three-letter sequence x, m, l,
151                                 // in any case combination, are reserved. This means that:
152                                 //   * users SHOULD NOT use them except as defined by later specifications
153                                 //   * processors MUST NOT treat them as fatal errors."
154                                 int prefixlen = newKey.lastIndexOf(':') + 1;
155                                 String prefix = newKey.substring(0, prefixlen).toLowerCase();
156                                 newKey = (prefix.startsWith("xml") ? prefix : "") + newKey.substring(prefixlen);
157 
158                                 if (newKey.matches("[a-zA-Z_:][-a-zA-Z0-9_:.]*")
159                                         //the namespace name for "xmlns" MUST NOT be declared
160                                         //the namespace name for "xml" need not be declared
161                                         && !newKey.startsWith("xmlns:xml")
162                                         // fix for ANY23-380: disallow duplicate attribute keys
163                                         && attributeKeys.add(newKey)) {
164                                     //avoid indexOf() operation if possible
165                                     if (!newKey.equals(oldKey)) {
166                                         attr.setKey(newKey);
167                                     }
168                                 } else {
169                                     it.remove();
170                                 }
171                             }
172                             attributeKeys.clear();
173 
174                             String tagName = ((Element)node).tagName().replaceAll("[^-a-zA-Z0-9_:.]", "");
175                             tagName = tagName.substring(tagName.lastIndexOf(':') + 1);
176                             ((Element)node).tagName(tagName.matches("[a-zA-Z_:][-a-zA-Z0-9_:.]*") ? tagName : "div");
177 
178                             // fix for ANY23-389
179                             resolve_base:
180                             if ("base".equalsIgnoreCase(tagName) && node.hasAttr("href")) {
181                                 String href = node.attr("href");
182                                 String absHref;
183                                 try {
184                                     ParsedIRI parsedHref = ParsedIRI.create(href.trim());
185                                     if (parsedHref.isAbsolute()) {
186                                         absHref = parsedHref.toString();
187                                     } else {
188                                         parsedHref = ParsedIRI.create(iri.trim()).resolve(parsedHref);
189                                         if (parsedHref.isAbsolute()) {
190                                             absHref = parsedHref.toString();
191                                         } else {
192                                             // shouldn't happen unless document IRI wasn't absolute
193                                             // ignore and let underlying RDFa parser report the issue
194                                             break resolve_base;
195                                         }
196                                     }
197                                 } catch (RuntimeException e) {
198                                     // can't parse href as a relative or absolute IRI:
199                                     // ignore and let underlying RDFa parser report the issue
200                                     break resolve_base;
201                                 }
202                                 if (!absHref.equals(href)) {
203                                     node.attr("href", absHref);
204                                 }
205                             }
206 
207                             return FilterResult.CONTINUE;
208                         }
209                         return node instanceof DataNode || node instanceof Comment || node instanceof DocumentType
210                                 ? FilterResult.REMOVE : FilterResult.CONTINUE;
211                     }
212                     @Override
213                     public FilterResult tail(Node node, int depth) {
214                         return FilterResult.CONTINUE;
215                     }
216                 }, doc);
217 
218                 // fix for ANY23-379: remove invalid xml characters from document
219                 String finalOutput = invalidXMLCharacters.matcher(doc.toString()).replaceAll("");
220 
221                 in = new ByteArrayInputStream(finalOutput.getBytes(charset));
222             } else if (format.hasFileExtension("jsonld") || format.hasMIMEType("application/ld+json")) {
223                 in = new JsonCleaningInputStream(in);
224             }
225 
226             parser.parse(in, iri);
227         } catch (Exception ex) {
228             // ANY23-420: jsonld-java can sometimes throw IllegalArgumentException,
229             // so don't limit catch block to RDFParseExceptions
230 
231             Throwable cause = ex.getCause();
232             if (cause instanceof JsonProcessingException) {
233                 JsonProcessingException err = (JsonProcessingException)cause;
234                 JsonLocation loc = err.getLocation();
235                 if (loc == null) {
236                     extractionResult.notifyIssue(IssueReport.IssueLevel.FATAL, err.getOriginalMessage(), -1L, -1L);
237                 } else {
238                     extractionResult.notifyIssue(IssueReport.IssueLevel.FATAL, err.getOriginalMessage(), loc.getLineNr(), loc.getColumnNr());
239                 }
240             } else {
241                 extractionResult.notifyIssue(IssueReport.IssueLevel.FATAL, toString(ex), -1, -1);
242             }
243         }
244     }
245 
246     private static String toString(Throwable th) {
247         StringWriter writer = new StringWriter();
248         try (PrintWriter pw = new PrintWriter(writer)) {
249             th.printStackTrace(pw);
250         }
251         String string = writer.toString();
252         if (string.length() > 1024) {
253             return string.substring(0, 1021) + "...";
254         }
255         return string;
256     }
257 
258 }