View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import java.io.IOException;
21  import java.io.InputStream;
22  
/**
 * Extension of {@link java.io.InputStream} that detects every self-closing
 * (inline) <i>span</i> start tag:
 * <pre>
 * &lt;span/&gt;
 * </pre>
 * and replaces it with an explicit open / close tag pair:
 * <pre>
 * &lt;span&gt;&lt;/span&gt;
 * </pre>
 * The tag name is matched case-insensitively and attributes are preserved;
 * the generated closing tag is always the lower-case {@code </span>}.
 * A tag is considered self-closing only when a {@code /} that is outside any
 * quoted attribute value is immediately followed by {@code >}; any other
 * {@code /} is passed through unchanged.
 *
 * <p>The stream is processed byte by byte, so the markup is expected to use
 * an ASCII-compatible encoding. Instances keep mutable parsing state without
 * any synchronization.</p>
 *
 * @author Michele Mostarda (mostarda@fbk.eu)
 */
public class SpanCloserInputStream extends InputStream {

    private static final String TRAILING_SEQUENCE_OPEN  = "<span";
    private static final char   TRAILING_SEQUENCE_CLOSE = '>';
    private static final String CLOSE_SEQUENCE          = "</span>";

    /** Marker meaning "no character stored"; distinct from the -1 end-of-stream value. */
    private static final int NONE = -2;

    private final InputStream wrapped;

    /** Number of leading characters of {@link #TRAILING_SEQUENCE_OPEN} matched so far. */
    private int trailingSequenceOpenMatch = 0;
    /** Index of the next character of {@link #CLOSE_SEQUENCE} to emit. */
    private int closeSequenceIndex = 0;
    /** {@code true} right after {@code <span} was matched, until the next character is inspected. */
    private boolean awaitingNameEnd = false;
    /** {@code true} while reading the attributes of a span start tag. */
    private boolean insideOpenTag = false;
    /** {@code true} while the synthetic {@code </span>} is being returned. */
    private boolean emittingClose = false;
    /** Quote character that opened the current attribute value, or {@link #NONE}. */
    private int quoteChar = NONE;
    /** Single look-ahead character read from the wrapped stream but not yet returned, or {@link #NONE}. */
    private int pushedBack = NONE;

    /**
     * Creates a stream that rewrites the self-closing span tags read from the given stream.
     *
     * @param is the stream providing the original markup.
     * @throws NullPointerException if <code>is</code> is <code>null</code>.
     */
    public SpanCloserInputStream(InputStream is) {
        if (is == null) {
            throw new NullPointerException("is");
        }
        wrapped = is;
    }

    /**
     * Returns the next byte of the rewritten markup.
     *
     * @return the next byte, or <code>-1</code> when the wrapped stream is exhausted.
     * @throws IOException if the wrapped stream fails.
     */
    @Override
    public int read() throws IOException {
        if (emittingClose) {
            final int ret = CLOSE_SEQUENCE.charAt(closeSequenceIndex);
            closeSequenceIndex++;
            if (closeSequenceIndex >= CLOSE_SEQUENCE.length()) {
                resetDetector();
            }
            return ret;
        }

        final int c = nextChar();

        if (insideOpenTag) {
            return readInsideTag(c);
        }

        if (awaitingNameEnd) {
            awaitingNameEnd = false;
            if (isTagNameTerminator(c)) {
                insideOpenTag = true;
                return readInsideTag(c);
            }
            // The name goes on (e.g. <spanner>): not a span tag, keep scanning as plain text.
        }

        if (checkOpenTrailingSequence(c)) {
            awaitingNameEnd = true;
        }
        return c;
    }

    /**
     * Closes the wrapped stream.
     *
     * @throws IOException if the wrapped stream fails to close.
     */
    @Override
    public void close() throws IOException {
        wrapped.close();
    }

    /**
     * Returns the pushed-back look-ahead character if there is one,
     * otherwise reads the next byte from the wrapped stream.
     */
    private int nextChar() throws IOException {
        if (pushedBack != NONE) {
            final int c = pushedBack;
            pushedBack = NONE;
            return c;
        }
        return wrapped.read();
    }

    /**
     * Handles a character read while inside a span start tag.
     *
     * @param c the character just read (may be <code>-1</code>).
     * @return the character to hand to the caller.
     */
    private int readInsideTag(int c) throws IOException {
        if (c == -1) {
            // Truncated tag: nothing to close.
            resetDetector();
            return c;
        }
        if (quoteChar != NONE) {
            // Inside a quoted attribute value everything is literal until the matching quote.
            if (c == quoteChar) {
                quoteChar = NONE;
            }
            return c;
        }
        if (c == '"' || c == '\'') {
            quoteChar = c;
            return c;
        }
        if (c == TRAILING_SEQUENCE_CLOSE) {
            // Regular <span ...>: the document provides its own closing tag.
            resetDetector();
            return c;
        }
        if (c == '/') {
            // The pushback slot is empty here because c was obtained through nextChar().
            final int next = wrapped.read();
            if (next == TRAILING_SEQUENCE_CLOSE) {
                // "/>" found: drop the slash, return '>' and schedule "</span>".
                insideOpenTag = false;
                emittingClose = true;
                closeSequenceIndex = 0;
                return next;
            }
            // Stray slash (e.g. unquoted value a/b): keep it and re-process the look-ahead.
            pushedBack = next;
        }
        return c;
    }

    /**
     * Tells whether the given character may legally follow the tag name
     * of a start tag: white space, <code>/</code> or <code>&gt;</code>.
     */
    private static boolean isTagNameTerminator(int c) {
        return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f'
                || c == '/' || c == TRAILING_SEQUENCE_CLOSE;
    }

    /**
     * Feeds one character to the case-insensitive <code>&lt;span</code> matcher.
     *
     * @param c the character just read (may be <code>-1</code>).
     * @return <code>true</code> when <code>c</code> completes the sequence.
     */
    private boolean checkOpenTrailingSequence(int c) {
        if (c < 0) {
            trailingSequenceOpenMatch = 0;
            return false;
        }
        final int lower = Character.toLowerCase(c);
        if (TRAILING_SEQUENCE_OPEN.charAt(trailingSequenceOpenMatch) != lower) {
            // The failing character may itself start a new match (e.g. the second '<' of "<<span").
            trailingSequenceOpenMatch = TRAILING_SEQUENCE_OPEN.charAt(0) == lower ? 1 : 0;
            return false;
        }
        trailingSequenceOpenMatch++;
        if (trailingSequenceOpenMatch == TRAILING_SEQUENCE_OPEN.length()) {
            trailingSequenceOpenMatch = 0;
            return true;
        }
        return false;
    }

    /**
     * Brings the detector back to plain text scanning.
     * The look-ahead character, if any, is intentionally kept.
     */
    private void resetDetector() {
        trailingSequenceOpenMatch = 0;
        closeSequenceIndex = 0;
        awaitingNameEnd = false;
        insideOpenTag = false;
        emittingClose = false;
        quoteChar = NONE;
    }

}