View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.filter;
19  
20  import org.apache.any23.extractor.ExtractionContext;
21  import org.apache.any23.extractor.rdfa.RDFaExtractorFactory;
22  import org.apache.any23.vocab.XHTML;
23  import org.apache.any23.writer.TripleHandler;
24  import org.apache.any23.writer.TripleHandlerException;
25  import org.eclipse.rdf4j.model.Resource;
26  import org.eclipse.rdf4j.model.IRI;
27  import org.eclipse.rdf4j.model.Value;
28  
29  /**
30   * A {@link TripleHandler} that suppresses output of the RDFa
31   * parser if the document only contains "accidental" RDFa,
32   * like stylesheet links and other non-RDFa uses of HTML's
33   *
34   * @author Richard Cyganiak (richard@cyganiak.de)
35   */
36  public class IgnoreAccidentalRDFa implements TripleHandler {
37  
38      private static final XHTML vXHTML = XHTML.getInstance();
39  
40      private final ExtractionContextBlocker blocker;
41  
42      private final boolean alwaysSuppressCSSTriples;
43  
44      /**
45       * Constructor.
46       *
47       * @param wrapped the decorated triple handler.
48       * @param alwaysSuppressCSSTriples if <code>true</code> the <i>CSS</i> triples will be
49       *        always suppressed even if the document is not empty.
50       *        If <code>false</code> then the <i>CSS</i> triples will be suppressed only if
51       *        document is empty.
52       */
53      public IgnoreAccidentalRDFa(TripleHandler wrapped, boolean alwaysSuppressCSSTriples) {
54          this.blocker = new ExtractionContextBlocker(wrapped);
55          this.alwaysSuppressCSSTriples = alwaysSuppressCSSTriples;
56      }
57      public IgnoreAccidentalRDFa(TripleHandler wrapped) {
58          this(wrapped, false);
59      }
60  
61      public void startDocument(IRI documentIRI) throws TripleHandlerException {
62          blocker.startDocument(documentIRI);
63      }
64  
65      public void openContext(ExtractionContext context) throws TripleHandlerException {
66          blocker.openContext(context);
67          if (isRDFaContext(context)) {
68              blocker.blockContext(context);
69          }
70      }
71  
72      public void receiveTriple(Resource s, IRI p, Value o, IRI g, ExtractionContext context)
73      throws TripleHandlerException {
74          // Suppress stylesheet triples.
75          if(alwaysSuppressCSSTriples && p.stringValue().equals(vXHTML.stylesheet.stringValue())) {
76              return;
77          }
78          if (isRDFaContext(context)) {
79              blocker.unblockContext(context);
80          }
81          blocker.receiveTriple(s, p, o, g, context);
82      }
83  
84      public void receiveNamespace(String prefix, String uri, ExtractionContext context)
85      throws TripleHandlerException {
86          blocker.receiveNamespace(prefix, uri, context);
87      }
88  
89      public void closeContext(ExtractionContext context) {
90          blocker.closeContext(context);
91      }
92  
93      public void close() throws TripleHandlerException {
94          blocker.close();
95      }
96  
97      private boolean isRDFaContext(ExtractionContext context) {
98          return context.getExtractorName().equals(RDFaExtractorFactory.NAME);
99      }
100 
101     public void endDocument(IRI documentIRI) throws TripleHandlerException {
102         blocker.endDocument(documentIRI);
103     }
104 
105     public void setContentLength(long contentLength) {
106         //Ignore.
107     }
108 }