View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionContext;
21  import org.apache.any23.extractor.ExtractionException;
22  import org.apache.any23.extractor.ExtractionParameters;
23  import org.apache.any23.extractor.ExtractionResult;
24  import org.apache.any23.extractor.ExtractorDescription;
25  import org.apache.any23.extractor.IssueReport;
26  import org.apache.any23.extractor.TagSoupExtractionResult;
27  import org.apache.any23.extractor.html.annotations.Includes;
28  import org.apache.any23.rdf.Any23ValueFactoryWrapper;
29  import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
30  import org.eclipse.rdf4j.model.BNode;
31  import org.eclipse.rdf4j.model.Literal;
32  import org.eclipse.rdf4j.model.Resource;
33  import org.eclipse.rdf4j.model.IRI;
34  import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
35  import org.w3c.dom.Document;
36  import org.w3c.dom.Node;
37  
38  import java.io.IOException;
39  import java.util.Locale;
40  
41  /**
42   * The abstract base class for any <a href="microformats.org/">Microformat specification</a> extractor.
43   */
44  public abstract class MicroformatExtractor implements TagSoupDOMExtractor {
45  
46      public static final String BEGIN_SCRIPT = "<script>";
47      public static final String END_SCRIPT = "</script>";
48  
49      private HTMLDocument htmlDocument;
50  
51      private ExtractionContext context;
52  
53      private IRI documentIRI;
54  
55      private ExtractionResult out;
56  
57      protected final Any23ValueFactoryWrapperl#Any23ValueFactoryWrapper">Any23ValueFactoryWrapper valueFactory = new Any23ValueFactoryWrapper(
58              SimpleValueFactory.getInstance());
59  
60      /**
61       * Returns the description of this extractor.
62       *
63       * @return a human readable description.
64       */
65      public abstract ExtractorDescription getDescription();
66  
67      /**
68       * Performs the extraction of the data and writes them to the model. The nodes generated in the model can have any
69       * name or implicit label but if possible they <i>SHOULD</i> have names (either URIs or AnonId) that are uniquely
70       * derivable from their position in the DOM tree, so that multiple extractors can merge information.
71       * 
72       * @return true if extraction is successful
73       * 
74       * @throws ExtractionException
75       *             if there is an error during extraction
76       */
77      protected abstract boolean extract() throws ExtractionException;
78  
79      public HTMLDocument getHTMLDocument() {
80          return htmlDocument;
81      }
82  
83      public ExtractionContext getExtractionContext() {
84          return context;
85      }
86  
87      public IRI getDocumentIRI() {
88          return documentIRI;
89      }
90  
91      public final void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, Document in,
92              ExtractionResult out) throws IOException, ExtractionException {
93          this.htmlDocument = new HTMLDocument(in);
94          this.context = extractionContext;
95          this.documentIRI = extractionContext.getDocumentIRI();
96          this.out = out;
97          valueFactory.setIssueReport(out);
98          try {
99              extract();
100         } finally {
101             valueFactory.setIssueReport(null);
102         }
103     }
104 
105     /**
106      * Returns the {@link org.apache.any23.extractor.ExtractionResult} associated to the extraction session.
107      *
108      * @return a valid extraction result.
109      */
110     protected ExtractionResult getCurrentExtractionResult() {
111         return out;
112     }
113 
114     protected void setCurrentExtractionResult(ExtractionResult out) {
115         this.out = out;
116     }
117 
118     protected ExtractionResult openSubResult(ExtractionContext context) {
119         return out.openSubResult(context);
120     }
121 
122     /**
123      * Helper method that adds a literal property to a subject only if the value of the property is a valid string.
124      *
125      * @param n
126      *            the <i>HTML</i> node from which the property value has been extracted.
127      * @param subject
128      *            the property subject.
129      * @param p
130      *            the property IRI.
131      * @param value
132      *            the property value.
133      * 
134      * @return returns <code>true</code> if the value has been accepted and added, <code>false</code> otherwise.
135      */
136     protected boolean conditionallyAddStringProperty(Node n, Resource subject, IRI p, String value) {
137         if (value == null)
138             return false;
139         value = value.trim();
140         return value.length() > 0 && conditionallyAddLiteralProperty(n, subject, p, valueFactory.createLiteral(value));
141     }
142 
143     /**
144      * Helper method that adds a literal property to a node.
145      *
146      * @param n
147      *            the <i>HTML</i> node from which the property value has been extracted.
148      * @param subject
149      *            subject the property subject.
150      * @param property
151      *            the property IRI.
152      * @param literal
153      *            value the property value.
154      * 
155      * @return returns <code>true</code> if the literal has been accepted and added, <code>false</code> otherwise.
156      */
157     protected boolean conditionallyAddLiteralProperty(Node n, Resource subject, IRI property, Literal literal) {
158         final String literalStr = literal.stringValue();
159         if (containsScriptBlock(literalStr)) {
160             out.notifyIssue(IssueReport.IssueLevel.WARNING,
161                     String.format(Locale.ROOT, "Detected script in literal: [%s]", literalStr), -1, -1);
162             return false;
163         }
164         out.writeTriple(subject, property, literal);
165         TagSoupExtractionResultorg/apache/any23/extractor/TagSoupExtractionResult.html#TagSoupExtractionResult">TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
166         tser.addPropertyPath(this.getClass(), subject, property, null, DomUtils.getXPathListForNode(n));
167         return true;
168     }
169 
170     /**
171      * Helper method that adds a IRI property to a node.
172      * 
173      * @param subject
174      *            the property subject.
175      * @param property
176      *            the property IRI.
177      * @param uri
178      *            the property object.
179      * 
180      * @return <code>true</code> if the the resource has been added, <code>false</code> otherwise.
181      */
182     protected boolean conditionallyAddResourceProperty(Resource subject, IRI property, IRI uri) {
183         if (uri == null)
184             return false;
185         out.writeTriple(subject, property, uri);
186         return true;
187     }
188 
189     /**
190      * Helper method that adds a BNode property to a node.
191      *
192      * @param n
193      *            the <i>HTML</i> node used for extracting such property.
194      * @param subject
195      *            the property subject.
196      * @param property
197      *            the property IRI.
198      * @param bnode
199      *            the property value.
200      */
201     protected void addBNodeProperty(Node n, Resource subject, IRI property, BNode bnode) {
202         out.writeTriple(subject, property, bnode);
203         TagSoupExtractionResultorg/apache/any23/extractor/TagSoupExtractionResult.html#TagSoupExtractionResult">TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
204         tser.addPropertyPath(this.getClass(), subject, property, bnode, DomUtils.getXPathListForNode(n));
205     }
206 
207     /**
208      * Helper method that adds a BNode property to a node.
209      *
210      * @param subject
211      *            the property subject.
212      * @param property
213      *            the property IRI.
214      * @param bnode
215      *            the property value.
216      */
217     protected void addBNodeProperty(Resource subject, IRI property, BNode bnode) {
218         out.writeTriple(subject, property, bnode);
219     }
220 
221     /**
222      * Helper method that adds a IRI property to a node.
223      *
224      * @param subject
225      *            subject to add
226      * @param property
227      *            predicate to add
228      * @param object
229      *            object to add
230      */
231     protected void addIRIProperty(Resource subject, IRI property, IRI object) {
232         out.writeTriple(subject, property, object);
233     }
234 
235     protected IRI fixLink(String link) {
236         return valueFactory.fixLink(link, null);
237     }
238 
239     protected IRI fixLink(String link, String defaultSchema) {
240         return valueFactory.fixLink(link, defaultSchema);
241     }
242 
243     private boolean containsScriptBlock(String in) {
244         final String inLowerCase = in.toLowerCase(Locale.ROOT);
245         final int beginBlock = inLowerCase.indexOf(BEGIN_SCRIPT);
246         if (beginBlock == -1) {
247             return false;
248         }
249         return inLowerCase.indexOf(END_SCRIPT, beginBlock + BEGIN_SCRIPT.length()) != -1;
250     }
251 
252     /**
253      * This method checks if there is a native nesting relationship between two {@link MicroformatExtractor}.
254      *
255      * @see org.apache.any23.extractor.html.annotations.Includes
256      * 
257      * @param including
258      *            the including {@link MicroformatExtractor}
259      * @param included
260      *            the included {@link MicroformatExtractor}
261      * 
262      * @return <code>true</code> if there is a declared nesting relationship
263      */
264     public static boolean includes(Class<? extends MicroformatExtractor> including,
265             Class<? extends MicroformatExtractor> included) {
266         Includes includes = including.getAnnotation(Includes.class);
267         if (includes != null) {
268             Class<? extends MicroformatExtractor>[] extractors = includes.extractors();
269             if (extractors != null && extractors.length > 0) {
270                 for (Class<? extends MicroformatExtractor> extractor : extractors) {
271                     if (extractor.equals(included)) {
272                         return true;
273                     }
274                 }
275             }
276         }
277         return false;
278     }
279 
280 }