View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionContext;
21  import org.apache.any23.extractor.ExtractionException;
22  import org.apache.any23.extractor.ExtractionParameters;
23  import org.apache.any23.extractor.ExtractionResult;
24  import org.apache.any23.extractor.ExtractorDescription;
25  import org.apache.any23.extractor.IssueReport;
26  import org.apache.any23.extractor.TagSoupExtractionResult;
27  import org.apache.any23.extractor.html.annotations.Includes;
28  import org.apache.any23.rdf.Any23ValueFactoryWrapper;
29  import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
30  import org.openrdf.model.BNode;
31  import org.openrdf.model.Literal;
32  import org.openrdf.model.Resource;
33  import org.openrdf.model.URI;
34  import org.openrdf.model.impl.ValueFactoryImpl;
35  import org.w3c.dom.Document;
36  import org.w3c.dom.Node;
37  
38  import java.io.IOException;
39  
40  /**
41   * The abstract base class for any
42   * <a href="microformats.org/">Microformat specification</a> extractor.
43   */
44  public abstract class MicroformatExtractor implements TagSoupDOMExtractor {
45  
46      public static final String BEGIN_SCRIPT = "<script>";
47      public static final String END_SCRIPT   = "</script>";
48  
49      private HTMLDocument htmlDocument;
50  
51      private ExtractionContext context;
52  
53      private URI documentURI;
54  
55      private ExtractionResult out;
56  
57      protected final Any23ValueFactoryWrapper valueFactory =
58              new Any23ValueFactoryWrapper(ValueFactoryImpl.getInstance());
59  
60      /**
61       * Returns the description of this extractor.
62       *
63       * @return a human readable description.
64       */
65      public abstract ExtractorDescription getDescription();
66  
67      /**
68       * Performs the extraction of the data and writes them to the model.
69       * The nodes generated in the model can have any name or implicit label
70       * but if possible they </i>SHOULD</i> have names (either URIs or AnonId) that
71       * are uniquely derivable from their position in the DOM tree, so that
72       * multiple extractors can merge information.
73       */
74      protected abstract boolean extract() throws ExtractionException;
75  
76      public HTMLDocument getHTMLDocument() {
77          return htmlDocument;
78      }
79  
80      public ExtractionContext getExtractionContext() {
81          return context;
82      }
83  
84      public URI getDocumentURI() {
85          return documentURI;
86      }
87  
88      public final void run(
89              ExtractionParameters extractionParameters,
90              ExtractionContext extractionContext,
91              Document in,
92              ExtractionResult out
93      ) throws IOException, ExtractionException {
94          this.htmlDocument = new HTMLDocument(in);
95          this.context      = extractionContext;
96          this.documentURI  = extractionContext.getDocumentURI();
97          this.out          = out;
98          valueFactory.setIssueReport(out);
99          try {
100             extract();
101         } finally {
102             valueFactory.setIssueReport(null);
103         }
104     }
105 
106     /**
107      * Returns the {@link org.apache.any23.extractor.ExtractionResult} associated
108      * to the extraction session.
109      *
110      * @return a valid extraction result.
111      */
112     protected ExtractionResult getCurrentExtractionResult() {
113         return out;
114     }
115 
116     protected ExtractionResult openSubResult(ExtractionContext context) {
117         return out.openSubResult(context);
118     }
119 
120     /**
121      * Helper method that adds a literal property to a subject only if the value of the property
122      * is a valid string.
123      *
124      * @param n the <i>HTML</i> node from which the property value has been extracted.
125      * @param subject the property subject.
126      * @param p the property URI.
127      * @param value the property value.
128      * @return returns <code>true</code> if the value has been accepted and added, <code>false</code> otherwise.
129      */
130     protected boolean conditionallyAddStringProperty(
131             Node n,
132             Resource subject, URI p, String value
133     ) {
134         if (value == null) return false;
135         value = value.trim();
136         return
137                 value.length() > 0 
138                         &&
139                 conditionallyAddLiteralProperty(
140                         n,
141                         subject, p, valueFactory.createLiteral(value)
142                 );
143     }
144 
145     /**
146      * Helper method that adds a literal property to a node.
147      *
148      * @param n the <i>HTML</i> node from which the property value has been extracted.
149      * @param subject subject the property subject.
150      * @param property the property URI.
151      * @param literal value the property value.
152      * @return returns <code>true</code> if the literal has been accepted and added, <code>false</code> otherwise.
153      */
154     protected boolean conditionallyAddLiteralProperty(
155             Node n,
156             Resource subject,
157             URI property,
158             Literal literal
159     ) {
160         final String literalStr = literal.stringValue();
161         if( containsScriptBlock(literalStr) ) {
162             out.notifyIssue(
163                     IssueReport.IssueLevel.Warning,
164                     String.format("Detected script in literal: [%s]", literalStr)
165                     , -1
166                     , -1
167             );
168             return false;
169         }
170         out.writeTriple(subject, property, literal);
171         TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
172         tser.addPropertyPath(this.getClass(), subject, property, null, DomUtils.getXPathListForNode(n) );
173         return true;
174     }
175 
176     /**
177      * Helper method that adds a URI property to a node.
178      * @param subject the property subject.
179      * @param property the property URI.
180      * @param uri the property object.
181      * @return <code>true</code> if the the resource has been added, <code>false</code> otherwise. 
182      */
183     protected boolean conditionallyAddResourceProperty(Resource subject, URI property, URI uri) {
184         if (uri == null) return false;
185         out.writeTriple(subject, property, uri);
186         return true;
187     }
188 
189     /**
190      * Helper method that adds a BNode property to a node.
191      *
192      * @param n the <i>HTML</i> node used for extracting such property.
193      * @param subject the property subject.
194      * @param property the property URI.
195      * @param bnode the property value.
196      */
197     protected void addBNodeProperty(Node n, Resource subject, URI property, BNode bnode) {
198         out.writeTriple(subject, property, bnode);
199         TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
200         tser.addPropertyPath(this.getClass(), subject, property, bnode, DomUtils.getXPathListForNode(n) );
201     }
202 
203     /**
204      * Helper method that adds a BNode property to a node.
205      *
206      * @param subject the property subject.
207      * @param property the property URI.
208      * @param bnode the property value.
209      */
210     protected void addBNodeProperty( Resource subject, URI property, BNode bnode) {
211         out.writeTriple(subject, property, bnode);
212     }
213 
214     /**
215      * Helper method that adds a URI property to a node.
216      *
217      * @param subject
218      * @param property
219      * @param object
220      */
221     protected void addURIProperty(Resource subject, URI property, URI object) {
222         out.writeTriple(subject, property, object);    
223     }
224 
225     protected URI fixLink(String link) {
226         return valueFactory.fixLink(link, null);
227     }
228 
229     protected URI fixLink(String link, String defaultSchema) {
230         return valueFactory.fixLink(link, defaultSchema);
231     }
232 
233     private boolean containsScriptBlock(String in) {
234         final String inLowerCase = in.toLowerCase();
235         final int beginBlock = inLowerCase.indexOf(BEGIN_SCRIPT);
236         if(beginBlock == -1) {
237             return false;
238         }
239         return inLowerCase.indexOf(END_SCRIPT, beginBlock + BEGIN_SCRIPT.length()) != -1;
240     }
241 
242         /**
243      * This method checks if there is a native nesting relationship between two
244      * {@link MicroformatExtractor}.
245      *
246      * @see org.apache.any23.extractor.html.annotations.Includes
247      * @param including the including {@link MicroformatExtractor}
248      * @param included the included {@link MicroformatExtractor}
249      * @return <code>true</code> if there is a declared nesting relationship
250      */
251     public static boolean includes(
252             Class<? extends MicroformatExtractor>including,
253             Class<? extends MicroformatExtractor> included) {
254         Includes includes = including.getAnnotation(Includes.class);
255         if (includes != null) {
256             Class<? extends MicroformatExtractor>[] extractors = includes.extractors();
257             if (extractors != null && extractors.length > 0) {
258                 for (Class<? extends MicroformatExtractor> extractor : extractors) {
259                     if (extractor.equals(included)) {
260                         return true;
261                     }
262                 }
263             }
264         }
265         return false;
266     }
267 
268 }