View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionContext;
21  import org.apache.any23.extractor.ExtractionException;
22  import org.apache.any23.extractor.ExtractionParameters;
23  import org.apache.any23.extractor.ExtractionResult;
24  import org.apache.any23.extractor.ExtractorDescription;
25  import org.apache.any23.extractor.IssueReport;
26  import org.apache.any23.extractor.TagSoupExtractionResult;
27  import org.apache.any23.extractor.html.annotations.Includes;
28  import org.apache.any23.rdf.Any23ValueFactoryWrapper;
29  import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
30  import org.eclipse.rdf4j.model.BNode;
31  import org.eclipse.rdf4j.model.Literal;
32  import org.eclipse.rdf4j.model.Resource;
33  import org.eclipse.rdf4j.model.IRI;
34  import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
35  import org.w3c.dom.Document;
36  import org.w3c.dom.Node;
37  
38  import java.io.IOException;
39  
40  /**
41   * The abstract base class for any
42   * <a href="microformats.org/">Microformat specification</a> extractor.
43   */
44  public abstract class MicroformatExtractor implements TagSoupDOMExtractor {
45  
46      public static final String BEGIN_SCRIPT = "<script>";
47      public static final String END_SCRIPT   = "</script>";
48  
49      private HTMLDocument htmlDocument;
50  
51      private ExtractionContext context;
52  
53      private IRI documentIRI;
54  
55      private ExtractionResult out;
56  
57      protected final Any23ValueFactoryWrapper valueFactory =
58              new Any23ValueFactoryWrapper(SimpleValueFactory.getInstance());
59  
60      /**
61       * Returns the description of this extractor.
62       *
63       * @return a human readable description.
64       */
65      public abstract ExtractorDescription getDescription();
66  
67      /**
68       * Performs the extraction of the data and writes them to the model.
69       * The nodes generated in the model can have any name or implicit label
70       * but if possible they <i>SHOULD</i> have names (either URIs or AnonId) that
71       * are uniquely derivable from their position in the DOM tree, so that
72       * multiple extractors can merge information.
73       * @return true if extraction is successful
74       * @throws ExtractionException if there is an error during extraction
75       */
76      protected abstract boolean extract() throws ExtractionException;
77  
78      public HTMLDocument getHTMLDocument() {
79          return htmlDocument;
80      }
81  
82      public ExtractionContext getExtractionContext() {
83          return context;
84      }
85  
86      public IRI getDocumentIRI() {
87          return documentIRI;
88      }
89  
90      public final void run(
91              ExtractionParameters extractionParameters,
92              ExtractionContext extractionContext,
93              Document in,
94              ExtractionResult out
95      ) throws IOException, ExtractionException {
96          this.htmlDocument = new HTMLDocument(in);
97          this.context      = extractionContext;
98          this.documentIRI  = extractionContext.getDocumentIRI();
99          this.out          = out;
100         valueFactory.setIssueReport(out);
101         try {
102             extract();
103         } finally {
104             valueFactory.setIssueReport(null);
105         }
106     }
107 
108     /**
109      * Returns the {@link org.apache.any23.extractor.ExtractionResult} associated
110      * to the extraction session.
111      *
112      * @return a valid extraction result.
113      */
114     protected ExtractionResult getCurrentExtractionResult() {
115         return out;
116     }
117 
118     protected void setCurrentExtractionResult(ExtractionResult out) {
119         this.out = out;
120     }
121 
122     protected ExtractionResult openSubResult(ExtractionContext context) {
123         return out.openSubResult(context);
124     }
125 
126     /**
127      * Helper method that adds a literal property to a subject only if the value of the property
128      * is a valid string.
129      *
130      * @param n the <i>HTML</i> node from which the property value has been extracted.
131      * @param subject the property subject.
132      * @param p the property IRI.
133      * @param value the property value.
134      * @return returns <code>true</code> if the value has been accepted and added, <code>false</code> otherwise.
135      */
136     protected boolean conditionallyAddStringProperty(
137             Node n,
138             Resource subject, IRI p, String value
139     ) {
140         if (value == null) return false;
141         value = value.trim();
142         return
143                 value.length() > 0 
144                         &&
145                 conditionallyAddLiteralProperty(
146                         n,
147                         subject, p, valueFactory.createLiteral(value)
148                 );
149     }
150 
151     /**
152      * Helper method that adds a literal property to a node.
153      *
154      * @param n the <i>HTML</i> node from which the property value has been extracted.
155      * @param subject subject the property subject.
156      * @param property the property IRI.
157      * @param literal value the property value.
158      * @return returns <code>true</code> if the literal has been accepted and added, <code>false</code> otherwise.
159      */
160     protected boolean conditionallyAddLiteralProperty(
161             Node n,
162             Resource subject,
163             IRI property,
164             Literal literal
165     ) {
166         final String literalStr = literal.stringValue();
167         if( containsScriptBlock(literalStr) ) {
168             out.notifyIssue(
169                     IssueReport.IssueLevel.WARNING,
170                     String.format("Detected script in literal: [%s]", literalStr)
171                     , -1
172                     , -1
173             );
174             return false;
175         }
176         out.writeTriple(subject, property, literal);
177         TagSoupExtractionResultorg/apache/any23/extractor/TagSoupExtractionResult.html#TagSoupExtractionResult">TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
178         tser.addPropertyPath(this.getClass(), subject, property, null, DomUtils.getXPathListForNode(n) );
179         return true;
180     }
181 
182     /**
183      * Helper method that adds a IRI property to a node.
184      * @param subject the property subject.
185      * @param property the property IRI.
186      * @param uri the property object.
187      * @return <code>true</code> if the the resource has been added, <code>false</code> otherwise. 
188      */
189     protected boolean conditionallyAddResourceProperty(Resource subject, IRI property, IRI uri) {
190         if (uri == null) return false;
191         out.writeTriple(subject, property, uri);
192         return true;
193     }
194 
195     /**
196      * Helper method that adds a BNode property to a node.
197      *
198      * @param n the <i>HTML</i> node used for extracting such property.
199      * @param subject the property subject.
200      * @param property the property IRI.
201      * @param bnode the property value.
202      */
203     protected void addBNodeProperty(Node n, Resource subject, IRI property, BNode bnode) {
204         out.writeTriple(subject, property, bnode);
205         TagSoupExtractionResultorg/apache/any23/extractor/TagSoupExtractionResult.html#TagSoupExtractionResult">TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
206         tser.addPropertyPath(this.getClass(), subject, property, bnode, DomUtils.getXPathListForNode(n) );
207     }
208 
209     /**
210      * Helper method that adds a BNode property to a node.
211      *
212      * @param subject the property subject.
213      * @param property the property IRI.
214      * @param bnode the property value.
215      */
216     protected void addBNodeProperty( Resource subject, IRI property, BNode bnode) {
217         out.writeTriple(subject, property, bnode);
218     }
219 
220     /**
221      * Helper method that adds a IRI property to a node.
222      *
223      * @param subject subject to add
224      * @param property predicate to add
225      * @param object object to add
226      */
227     protected void addIRIProperty(Resource subject, IRI property, IRI object) {
228         out.writeTriple(subject, property, object);    
229     }
230 
231     protected IRI fixLink(String link) {
232         return valueFactory.fixLink(link, null);
233     }
234 
235     protected IRI fixLink(String link, String defaultSchema) {
236         return valueFactory.fixLink(link, defaultSchema);
237     }
238 
239     private boolean containsScriptBlock(String in) {
240         final String inLowerCase = in.toLowerCase();
241         final int beginBlock = inLowerCase.indexOf(BEGIN_SCRIPT);
242         if(beginBlock == -1) {
243             return false;
244         }
245         return inLowerCase.indexOf(END_SCRIPT, beginBlock + BEGIN_SCRIPT.length()) != -1;
246     }
247 
248         /**
249      * This method checks if there is a native nesting relationship between two
250      * {@link MicroformatExtractor}.
251      *
252      * @see org.apache.any23.extractor.html.annotations.Includes
253      * @param including the including {@link MicroformatExtractor}
254      * @param included the included {@link MicroformatExtractor}
255      * @return <code>true</code> if there is a declared nesting relationship
256      */
257     public static boolean includes(
258             Class<? extends MicroformatExtractor>including,
259             Class<? extends MicroformatExtractor> included) {
260         Includes includes = including.getAnnotation(Includes.class);
261         if (includes != null) {
262             Class<? extends MicroformatExtractor>[] extractors = includes.extractors();
263             if (extractors != null && extractors.length > 0) {
264                 for (Class<? extends MicroformatExtractor> extractor : extractors) {
265                     if (extractor.equals(included)) {
266                         return true;
267                     }
268                 }
269             }
270         }
271         return false;
272     }
273 
274 
275 }