View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import java.io.IOException;
21  import java.io.InputStream;
22  
/**
 * Extension of {@link java.io.InputStream} meant to detect and replace any occurrence of inline <i>span</i>:
 *
 * <pre>
 * &lt;span/&gt;
 * </pre>
 *
 * with an open close tag sequence:
 *
 * <pre>
 * &lt;span&gt;&lt;/span&gt;
 * </pre>
 *
 * <p>
 * The stream is scanned one byte at a time and every byte is interpreted as an ASCII-compatible character. The tag
 * name is matched case-insensitively and must be followed by whitespace, <code>/</code> or <code>&gt;</code>, so that
 * longer element names merely starting with <i>span</i> are left alone. Attributes are preserved verbatim; a
 * <code>/</code> or <code>&gt;</code> appearing inside a single- or double-quoted attribute value is never taken as
 * tag syntax, and a <code>/</code> is only removed when it is immediately followed by <code>&gt;</code>. The inserted
 * closing tag is always the lower-case sequence <code>&lt;/span&gt;</code>.
 * </p>
 *
 * <p>
 * Instances keep mutable parsing state and are not safe for concurrent use.
 * </p>
 *
 * @author Michele Mostarda (mostarda@fbk.eu)
 */
public class SpanCloserInputStream extends InputStream {

    private static final String TRAILING_SEQUENCE_OPEN = "<span";
    private static final char TRAILING_SEQUENCE_CLOSE = '>';
    private static final String CLOSE_SEQUENCE = "</span>";

    /** Value returned by {@link InputStream#read()} at end of stream. */
    private static final int EOF = -1;
    /** Marker for "no pushed-back byte"; must differ from {@link #EOF}, which is a legal pushed-back value. */
    private static final int NO_PENDING = -2;
    /** Marker for "not inside a quoted attribute value". */
    private static final int NO_QUOTE = 0;

    private final InputStream wrapped;

    /** Number of leading characters of {@link #TRAILING_SEQUENCE_OPEN} matched so far. */
    private int trailingSequenceOpenMatch = 0;
    /** Index of the next character of {@link #CLOSE_SEQUENCE} to emit. */
    private int closeSequenceIndex = 0;
    /** {@code true} while the scanner is between {@code <span} and the {@code >} ending that start tag. */
    private boolean insideSpanTag = false;
    /** {@code true} for the single byte following {@code <span}, which decides whether the name really is "span". */
    private boolean tagNameBoundaryPending = false;
    /** {@code true} while {@link #CLOSE_SEQUENCE} is being injected after an inline span. */
    private boolean emittingCloseSequence = false;
    /** The quote character that opened the current attribute value, or {@link #NO_QUOTE}. */
    private int quoteChar = NO_QUOTE;
    /** One byte of look-ahead read after a {@code /} that has to be replayed, or {@link #NO_PENDING}. */
    private int pending = NO_PENDING;

    /**
     * Wraps the given stream.
     *
     * @param is
     *            the stream to filter; must not be {@code null}.
     *
     * @throws NullPointerException
     *             if {@code is} is {@code null}.
     */
    public SpanCloserInputStream(InputStream is) {
        if (is == null) {
            throw new NullPointerException("is");
        }
        wrapped = is;
    }

    /**
     * Returns the next byte of the filtered stream.
     *
     * @return the next byte, or {@code -1} when the wrapped stream is exhausted.
     *
     * @throws IOException
     *             if reading from the wrapped stream fails.
     */
    @Override
    public int read() throws IOException {
        // An inline span has just been closed with '>': feed out "</span>" before touching the source again.
        if (emittingCloseSequence) {
            final int ret = CLOSE_SEQUENCE.charAt(closeSequenceIndex);
            closeSequenceIndex++;
            if (closeSequenceIndex >= CLOSE_SEQUENCE.length()) {
                resetDetector();
            }
            return ret;
        }

        final int c = nextSourceByte();
        if (c == EOF) {
            resetDetector();
            return EOF;
        }

        // Outside a span start tag the only job is to look for "<span"; quotes in text are irrelevant here.
        if (!insideSpanTag) {
            if (checkOpenTrailingSequence(c)) {
                insideSpanTag = true;
                tagNameBoundaryPending = true;
            }
            return c;
        }

        // First byte after "<span": anything but a name terminator means a longer element name (e.g. <spanner>).
        if (tagNameBoundaryPending) {
            tagNameBoundaryPending = false;
            if (!isTagNameTerminator(c)) {
                resetDetector();
                // The byte may itself start a new "<span" (it can only ever match the leading '<').
                checkOpenTrailingSequence(c);
                return c;
            }
        }

        // Inside a quoted attribute value everything is literal until the matching quote.
        if (quoteChar != NO_QUOTE) {
            if (c == quoteChar) {
                quoteChar = NO_QUOTE;
            }
            return c;
        }

        if (c == '"' || c == '\'') {
            quoteChar = c;
            return c;
        }

        if (c == TRAILING_SEQUENCE_CLOSE) {
            // Regular (non inline) start tag: nothing to inject.
            resetDetector();
            return c;
        }

        if (c == '/') {
            // The push-back slot is necessarily empty here because c was just taken from it or from the source.
            final int next = wrapped.read();
            if (next == TRAILING_SEQUENCE_CLOSE) {
                // "/>" found: drop the slash, emit '>' now and "</span>" on the following calls.
                emittingCloseSequence = true;
                return TRAILING_SEQUENCE_CLOSE;
            }
            // A lone slash (e.g. in an unquoted URL) is ordinary content; replay the look-ahead byte next time.
            pending = next;
            return c;
        }

        return c;
    }

    /**
     * Closes this stream and the wrapped one.
     *
     * @throws IOException
     *             if closing the wrapped stream fails.
     */
    @Override
    public void close() throws IOException {
        wrapped.close();
    }

    /**
     * Returns the pushed-back byte if there is one, otherwise reads from the wrapped stream.
     */
    private int nextSourceByte() throws IOException {
        if (pending != NO_PENDING) {
            final int replayed = pending;
            pending = NO_PENDING;
            return replayed;
        }
        return wrapped.read();
    }

    /**
     * Advances the case-insensitive match of {@link #TRAILING_SEQUENCE_OPEN} with the given byte.
     *
     * @param c
     *            the byte just read.
     *
     * @return {@code true} when {@code c} completes the sequence.
     */
    private boolean checkOpenTrailingSequence(int c) {
        final int lower = Character.toLowerCase(c);
        if (TRAILING_SEQUENCE_OPEN.charAt(trailingSequenceOpenMatch) == lower) {
            trailingSequenceOpenMatch++;
            if (trailingSequenceOpenMatch == TRAILING_SEQUENCE_OPEN.length()) {
                trailingSequenceOpenMatch = 0;
                return true;
            }
        } else {
            // Restart the match, letting the current byte count as a new first character ("<<span").
            trailingSequenceOpenMatch = TRAILING_SEQUENCE_OPEN.charAt(0) == lower ? 1 : 0;
        }
        return false;
    }

    /**
     * Tells whether the byte following {@code <span} ends the tag name.
     */
    private static boolean isTagNameTerminator(int c) {
        return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == '/'
                || c == TRAILING_SEQUENCE_CLOSE;
    }

    /**
     * Returns the scanner to its initial "looking for {@code <span}" state. The push-back slot is left untouched.
     */
    private void resetDetector() {
        trailingSequenceOpenMatch = 0;
        closeSequenceIndex = 0;
        insideSpanTag = false;
        tagNameBoundaryPending = false;
        emittingCloseSequence = false;
        quoteChar = NO_QUOTE;
    }

}