View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.rdfa;
19  
20  import org.apache.any23.configuration.DefaultConfiguration;
21  import org.apache.any23.extractor.ExtractionContext;
22  import org.apache.any23.extractor.ExtractionException;
23  import org.apache.any23.extractor.ExtractionParameters;
24  import org.apache.any23.extractor.ExtractionResult;
25  import org.apache.any23.extractor.ExtractorDescription;
26  import org.apache.any23.extractor.ExtractorFactory;
27  import org.apache.any23.extractor.SimpleExtractorFactory;
28  import org.apache.any23.extractor.rdf.RDFParserFactory;
29  import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
30  import org.openrdf.rio.RDFHandlerException;
31  import org.openrdf.rio.RDFParseException;
32  import org.openrdf.rio.RDFParser;
33  import org.w3c.dom.Document;
34  
35  import java.io.IOException;
36  import java.io.InputStream;
37  import java.io.StringReader;
38  import java.io.StringWriter;
39  import java.util.Arrays;
40  
41  /**
42   * Extractor for RDFa in HTML, based on Fabien Gadon's XSLT transform, found
43   * <a href="http://ns.inria.fr/grddl/rdfa/">here</a>. It works by first
44   * parsing the HTML using a tagsoup parser, then applies the XSLT to the
45   * DOM tree, then parses the resulting RDF/XML.
46   *
47   * @author Gabriele Renzi
48   * @author Richard Cyganiak (richard@cyganiak.de)
49   */
50  public class RDFaExtractor implements TagSoupDOMExtractor {
51  
52      public final static String NAME = "html-rdfa";
53  
54      public final static String xsltFilename =
55              DefaultConfiguration.singleton().getPropertyOrFail("any23.rdfa.extractor.xslt");
56  
57      private static XSLTStylesheet xslt = null;
58  
59      public final static ExtractorFactory<RDFaExtractor> factory =
60          SimpleExtractorFactory.create(
61                  NAME,
62                  null,
63                  Arrays.asList("text/html;q=0.3", "application/xhtml+xml;q=0.3"),
64                  null,
65                  RDFaExtractor.class
66          );
67  
68      /**
69       * Returns a {@link XSLTStylesheet} able to distill RDFa from
70       * HTML pages.
71       *
72       * @return returns a not <code>null</code> XSLT instance.
73       */
74      public static synchronized XSLTStylesheet getXSLT() {
75          // Lazily initialized static instance, so we don't parse
76          // the XSLT unless really necessary, and only once
77          if (xslt == null) {
78              InputStream in = RDFaExtractor.class.getResourceAsStream(xsltFilename);
79              if (in == null) {
80                  throw new RuntimeException("Couldn't load '" + xsltFilename +
81                          "', maybe the file is not bundled in the jar?");
82              }
83              xslt = new XSLTStylesheet(in);
84          }
85          return xslt;
86      }
87  
88      private boolean verifyDataType;
89  
90      private boolean stopAtFirstError;
91  
92      /**
93       * Constructor, allows to specify the validation and error handling policies.
94       *
95       * @param verifyDataType if <code>true</code> the data types will be verified,
96       *         if <code>false</code> will be ignored.
97       * @param stopAtFirstError if <code>true</code> the parser will stop at first parsing error,
98       *        if <code>false</code> will ignore non blocking errors.
99       */
100     public RDFaExtractor(boolean verifyDataType, boolean stopAtFirstError) {
101         this.verifyDataType   = verifyDataType;
102         this.stopAtFirstError = stopAtFirstError;
103     }
104 
105     /**
106      * Default constructor, with no verification of data types and not stop at first error.
107      */    
108     public RDFaExtractor() {
109         this(false, false);
110     }
111 
112     public boolean isVerifyDataType() {
113         return verifyDataType;
114     }
115 
116     public void setVerifyDataType(boolean verifyDataType) {
117         this.verifyDataType = verifyDataType;
118     }
119 
120     public boolean isStopAtFirstError() {
121         return stopAtFirstError;
122     }
123 
124     public void setStopAtFirstError(boolean stopAtFirstError) {
125         this.stopAtFirstError = stopAtFirstError;
126     }
127 
128     public void run(
129             ExtractionParameters extractionParameters,
130             ExtractionContext extractionContext,
131             Document in,
132             ExtractionResult out
133     ) throws IOException, ExtractionException {
134 
135         StringWriter buffer = new StringWriter();
136         try {
137             getXSLT().applyTo(in, buffer);
138         } catch (XSLTStylesheetException xslte) {
139             throw new ExtractionException("An error occurred during the XSLT application.", xslte);
140         }
141 
142         try {
143             RDFParser parser
144                     = RDFParserFactory.getInstance().getRDFXMLParser(
145                         verifyDataType, stopAtFirstError, extractionContext, out
146                     );
147             parser.parse(
148                     new StringReader(buffer.getBuffer().toString()),
149                     extractionContext.getDocumentURI().stringValue()
150             );
151         } catch (RDFHandlerException ex) {
152             throw new IllegalStateException(
153                     "Should not happen, RDFHandlerAdapter does not throw RDFHandlerException", ex
154             );
155         } catch (RDFParseException ex) {
156             throw new ExtractionException(
157                     "Invalid RDF/XML produced by RDFa transform.", ex, out
158             );
159         }
160     }
161 
162     private String getDocType(Document in) {
163         return in.getDoctype().getPublicId();
164     }
165 
166     /**
167      * @return the {@link org.apache.any23.extractor.ExtractorDescription} of this extractor
168      */
169     public ExtractorDescription getDescription() {
170         return factory;
171     }
172 
173 }