View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.rdf;
19  
20  import java.io.IOException;
21  import java.io.InputStream;
22  import java.io.PushbackInputStream;
23  import java.util.Arrays;
24  
/**
 * This class uses several strategies to fix common JSON syntax errors, including:
 * <ol>
 *     <li>Remove CDATA markers</li>
 *     <li>Remove YAML and C-style comments</li>
 *     <li>Allow single-quoted strings (double quotes inside them are escaped,
 *         and the non-JSON escape <code>\'</code> is reduced to a plain apostrophe)</li>
 *     <li>Ignore duplicated commas between elements of objects and arrays</li>
 *     <li>Remove trailing commas from objects and arrays</li>
 *     <li>Insert omitted commas after objects and arrays</li>
 *     <li>Ignore all unicode whitespace characters (assumes UTF-8 encoding)</li>
 *     <li>Treat semi-colons as commas</li>
 * </ol>
 *
 * <p>The stream works byte-by-byte on top of a {@link PushbackInputStream} and keeps
 * unsynchronized mutable parsing state, so a single instance must not be shared
 * between threads without external synchronization.</p>
 *
 * @author Hans Brende (hansbrende@apache.org)
 */
class JsonCleaningInputStream extends InputStream {

    // Values of currentState. 0 means "plain JSON"; any other value that is not one
    // of the two comment states is the quote character that opened the current string.
    private static final int EOL_COMMENT = 1;
    private static final int MULTILINE_COMMENT = 2;

    // Values of needsComma. 0 means no comma is pending. NEEDS_COMMA means a comma is
    // pending and no line break was seen since. A positive value means a comma is
    // pending, a line break was seen, and the value is the number of bytes of
    // BLANK_PUSHBACK (the newline plus following blanks) to replay after the comma.
    private static final int NEEDS_COMMA = -1;
    private static final int NEEDS_COMMA_AND_NEWLINE = 1;

    /** True when the previous byte inside a string was an unescaped backslash. */
    private boolean inEscape;
    /** True between a {@code <![CDATA[} marker and its closing {@code ]]>}. */
    private boolean inCDATA;
    /** Pending-comma state; see {@link #NEEDS_COMMA} and {@link #NEEDS_COMMA_AND_NEWLINE}. */
    private int needsComma;
    /** 0, {@link #EOL_COMMENT}, {@link #MULTILINE_COMMENT}, or the opening quote character. */
    private int currentState;

    private static final int MAX_BLANK_PUSHBACK = 128;
    // A newline followed by spaces; a prefix of it is pushed back after an inserted comma.
    private static final byte[] BLANK_PUSHBACK = new byte[MAX_BLANK_PUSHBACK];

    static {
        Arrays.fill(BLANK_PUSHBACK, (byte) ' ');
        BLANK_PUSHBACK[0] = '\n';
    }

    private final PushbackInputStream in;

    /**
     * Wraps the given stream.
     *
     * @param in the raw (possibly malformed) JSON byte stream
     */
    JsonCleaningInputStream(InputStream in) {
        // 256 bytes leaves room for the largest blank push-back (128 bytes), the
        // re-queued current byte and the multi-byte look-ahead sequences.
        this.in = new PushbackInputStream(in, 256);
    }

    /**
     * Pushes {@code c} back onto the stream unless it is the end-of-stream marker.
     */
    private static void unread(PushbackInputStream in, int c) throws IOException {
        if (c != -1) {
            in.unread(c);
        }
    }

    /**
     * Consumes the byte sequence {@code next} if, and only if, it is exactly what
     * comes next on the stream.
     *
     * @return {@code true} if the whole sequence matched and was consumed;
     *         {@code false} if it did not match, in which case every byte that
     *         was read is pushed back so the stream is left unchanged
     */
    private static boolean isNextOrUnread(PushbackInputStream in, int... next) throws IOException {
        int matched = -1; // index of the last byte of 'next' that matched
        for (int expected : next) {
            int c = in.read();
            if (c != expected) {
                unread(in, c);
                // push the matched prefix back in reverse order
                while (matched >= 0) {
                    in.unread(next[matched--]);
                }
                return false;
            }
            matched++;
        }
        return true;
    }

    /**
     * Closes the wrapped stream, releasing the resources held by it.
     *
     * @throws IOException if the wrapped stream fails to close
     */
    @Override
    public void close() throws IOException {
        in.close();
    }

    /**
     * Returns the next byte of the cleaned JSON.
     *
     * @return the next cleaned byte, or {@code -1} at the end of the stream
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public int read() throws IOException {
        PushbackInputStream in = this.in;

        for (;;) {
            int c = in.read();

            if (c == -1) {
                // a still-pending comma is dropped here, which removes trailing commas
                return c;
            }

            // CDATA markers are stripped in every state (also inside comments and strings)
            if (inCDATA) {
                if (c == ']' && isNextOrUnread(in, ']', '>')) {
                    inCDATA = false;
                    continue;
                }
            } else {
                if (c == '<' && isNextOrUnread(in, '!', '[', 'C', 'D', 'A', 'T', 'A', '[')) {
                    inCDATA = true;
                    continue;
                }
            }

            int ctx = currentState;
            switch (ctx) {
                case 0:
                    break;
                case EOL_COMMENT:
                    if (c == '\r' || c == '\n') {
                        //end single-line comment
                        currentState = 0;
                        if (needsComma != 0) {
                            needsComma = NEEDS_COMMA_AND_NEWLINE;
                            continue;
                        }
                        return c;
                    }
                    continue;
                case MULTILINE_COMMENT:
                    if (c == '\r' || c == '\n') {
                        // line breaks inside the comment are kept (or remembered
                        // for replay after a pending comma)
                        if (needsComma != 0) {
                            needsComma = NEEDS_COMMA_AND_NEWLINE;
                            continue;
                        }
                        return c;
                    } else if (c == '*' && isNextOrUnread(in, '/')) {
                        //end multiline comment
                        currentState = 0;
                    }
                    continue;
                default:
                    //we're in a quote; ctx is the character that opened it
                    if (inEscape) {
                        //end escape
                        inEscape = false;
                    } else if (c == '\\') {
                        if (isNextOrUnread(in, '\'')) {
                            // \' is not a legal JSON escape: emit the bare apostrophe
                            return '\'';
                        }
                        //begin escape
                        inEscape = true;
                    } else if (c == ctx) {
                        //end quote; always emitted as a double quote
                        currentState = 0;
                        return '"';
                    } else if (c == '"') {
                        // Only reachable inside a single-quoted string. The string is
                        // re-delimited with double quotes, so this one must be escaped:
                        // emit the backslash now and replay the quote as an escaped byte.
                        in.unread(c);
                        inEscape = true;
                        return '\\';
                    }
                    return c;
            }

            //we're not in a quote or comment

            whitespace: {
                switch (c) {
                    case '#':
                        currentState = EOL_COMMENT;
                        continue;
                    case '/': {
                        int next = in.read();
                        if (next == '/') {
                            currentState = EOL_COMMENT;
                            continue;
                        } else if (next == '*') {
                            currentState = MULTILINE_COMMENT;
                            continue;
                        }
                        unread(in, next);
                        break;
                    }
                    case ',':
                    case ';':
                        //don't write out comma yet!
                        needsComma = NEEDS_COMMA;
                        continue;
                    case '}':
                    case ']':
                        // Only thing that can follow '}' or ']' is:
                        // '}' or ']' or ',' or EOF
                        needsComma = NEEDS_COMMA;
                        return c;
                    case '\r':
                    case '\n':
                        if (needsComma != 0) {
                            needsComma = NEEDS_COMMA_AND_NEWLINE;
                            continue;
                        }
                        return c;
                    // UTF-8 whitespace detection
                    case 0x09:
                    case 0x0b:
                    case 0x0c:
                    case 0x1c:
                    case 0x1d:
                    case 0x1e:
                    case 0x1f:
                    case 0x20:
                        break whitespace;
                    case 0xc2:
                        if (isNextOrUnread(in, 0xa0)) {
                            break whitespace;
                        }
                        break;
                    case 0xe1:
                        if (isNextOrUnread(in, 0x9a, 0x80)
                                || isNextOrUnread(in, 0xa0, 0x8e)) {
                            break whitespace;
                        }
                        break;
                    case 0xe2: {
                        int c1 = in.read();
                        if (c1 == 0x80) {
                            int c2 = in.read();
                            //space separators
                            if (c2 >= 0x80 && c2 <= 0x8a || c2 == 0xaf
                                    //line and paragraph separators
                                    || c2 == 0xa8 || c2 == 0xa9) {
                                break whitespace;
                            }
                            unread(in, c2);
                            in.unread(0x80);
                        } else if (c1 == 0x81) {
                            int c2 = in.read();
                            if (c2 == 0x9f) {
                                break whitespace;
                            }
                            unread(in, c2);
                            in.unread(0x81);
                        } else {
                            unread(in, c1);
                        }
                        break;
                    }
                    case 0xe3:
                        if (isNextOrUnread(in, 0x80, 0x80)) {
                            break whitespace;
                        }
                        break;
                    default:
                        break;
                }

                //here: character is not whitespace

                int nc = needsComma;
                if (nc != 0) {
                    // Emit the pending comma now; the current byte and the blanks that
                    // separated it from the comma are queued to be read again afterwards.
                    in.unread(c);
                    if (nc == NEEDS_COMMA) {
                        in.unread(' ');
                    } else {
                        in.unread(BLANK_PUSHBACK, 0, nc);
                    }
                    needsComma = 0;
                    return ',';
                } else if (c == '"' || c == '\'') {
                    // remember which quote character opened the string
                    currentState = c;
                    return '"';
                }
                return c;
            } //end whitespace

            //here: character is whitespace

            int nc = needsComma;
            if (nc != 0) {
                // swallow the blank; after a line break, count it (up to the cap)
                // so it can be replayed after the comma
                if (nc != NEEDS_COMMA && nc != MAX_BLANK_PUSHBACK) {
                    needsComma = nc + 1;
                }
                continue;
            }

            return ' ';

        }

    }
}