This project has retired. For details please refer to its Attic page.
MicrodataParserTest xref
View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.microdata;
19  
20  import java.io.ByteArrayInputStream;
21  import java.io.ByteArrayOutputStream;
22  import java.io.IOException;
23  import java.io.InputStream;
24  import java.io.PrintStream;
25  import java.nio.charset.StandardCharsets;
26  import java.text.ParseException;
27  import java.time.LocalDate;
28  import java.util.ArrayList;
29  import java.util.Calendar;
30  import java.util.Date;
31  import java.util.GregorianCalendar;
32  import java.util.List;
33  import java.util.Locale;
34  import java.util.Properties;
35  import java.util.TimeZone;
36  import java.util.concurrent.CountDownLatch;
37  import java.util.concurrent.atomic.AtomicBoolean;
38  
39  import org.apache.any23.extractor.html.TagSoupParser;
40  import org.apache.any23.util.StreamUtils;
41  import org.apache.commons.io.IOUtils;
42  import org.junit.Assert;
43  import org.junit.Ignore;
44  import org.junit.Test;
45  import org.slf4j.Logger;
46  import org.slf4j.LoggerFactory;
47  import org.w3c.dom.Document;
48  
49  import static org.junit.Assert.assertFalse;
50  
51  /**
52   * Test case for {@link MicrodataParser}.
53   *
54   * @author Michele Mostarda (mostarda@fbk.eu)
55   */
56  public class MicrodataParserTest {
57  
58      private static final Logger logger = LoggerFactory.getLogger(MicrodataParserTest.class);
59  
60      @Test
61      public void testBasicFeatures() throws IOException {
62          extractItemsAndVerifyJSONSerialization("microdata-basic", "microdata-basic-expected");
63      }
64  
65      @Test
66      public void testNestedMicrodata() throws IOException {
67          extractItemsAndVerifyJSONSerialization("microdata-nested", "microdata-nested-expected");
68      }
69  
70      @Test
71      public void testAdvancedItemrefManagement() throws IOException {
72          extractItemsAndVerifyJSONSerialization("microdata-itemref", "microdata-itemref-expected");
73      }
74  
75      @Test
76      public void testMicrodataJSONSerialization() throws IOException {
77          final Document document = getMicrodataDom("microdata-nested");
78          final ByteArrayOutputStream baos = new ByteArrayOutputStream();
79          final PrintStream ps = new PrintStream(baos, true, StandardCharsets.UTF_8);
80          MicrodataParser.getMicrodataAsJSON(document, ps);
81          ps.flush();
82          final String expected = StreamUtils
83                  .asString(this.getClass().getResourceAsStream("/microdata/microdata-json-serialization.json"));
84  
85          Assert.assertEquals("Unexpected serialization for Microdata file.", expected,
86                  baos.toString(StandardCharsets.UTF_8));
87      }
88  
89      @Test
90      @Ignore
91      public void testGetContentAsDate() throws IOException, ParseException {
92          final ItemScope target = extractItems("microdata-basic").getDetectedItemScopes()[4];
93          final GregorianCalendar gregorianCalendar = new GregorianCalendar(TimeZone.getDefault(), Locale.ROOT);
94          gregorianCalendar.set(2009, GregorianCalendar.MAY, 10); // 2009-05-10
95          gregorianCalendar.set(Calendar.HOUR, 0);
96          gregorianCalendar.set(Calendar.MINUTE, 0);
97          gregorianCalendar.set(Calendar.SECOND, 0);
98          Assert.assertEquals(gregorianCalendar.getTime().toString(),
99                  target.getProperties().get("birthday").get(0).getValue().getAsDate().toString());
100     }
101 
102     @Test
103     @Ignore
104     public void testGetDateConcurrent() throws Exception {
105         GregorianCalendar gc = new GregorianCalendar(TimeZone.getDefault(), Locale.ROOT);
106         gc.set(2009, GregorianCalendar.MAY, 10); // 2009-05-10
107         gc.set(Calendar.HOUR, 0);
108         gc.set(Calendar.MINUTE, 0);
109         gc.set(Calendar.SECOND, 0);
110         final Date expectedDate = gc.getTime();
111         final byte[] content = IOUtils.toByteArray(getClass().getResourceAsStream("/microdata/microdata-basic.html"));
112         final int threadCount = 10;
113         final int attemptCount = 100;
114         final List<Thread> threads = new ArrayList<Thread>();
115         final CountDownLatch beforeLatch = new CountDownLatch(1);
116         final CountDownLatch afterLatch = new CountDownLatch(threadCount);
117         final AtomicBoolean foundFailure = new AtomicBoolean(false);
118         for (int i = 0; i < threadCount; i++) {
119             threads.add(new Thread("Test-thread-" + i) {
120                 @Override
121                 public void run() {
122                     try {
123                         beforeLatch.await();
124                         int counter = 0;
125                         while (counter++ < attemptCount && !foundFailure.get()) {
126                             final Document document = getDom(content);
127                             final MicrodataParserReport report = MicrodataParser.getMicrodata(document);
128                             final ItemScope target = report.getDetectedItemScopes()[4];
129                             Date actualDate = target.getProperties().get("birthday").get(0).getValue().getAsDate();
130                             if (!expectedDate.toString().equals(actualDate.toString())) {
131                                 foundFailure.set(true);
132                             }
133                         }
134                     } catch (Exception ex) {
135                         logger.error(ex.getMessage());
136                         foundFailure.set(true);
137                     } finally {
138                         afterLatch.countDown();
139                     }
140                 }
141             });
142         }
143         for (Thread thread : threads) {
144             thread.start();
145         }
146         // Let threads start computation
147         beforeLatch.countDown();
148         // Wait for all threads to complete
149         afterLatch.await();
150         assertFalse(foundFailure.get());
151     }
152 
153     /**
154      * Test the main use case of {@link MicrodataParser#deferProperties(String...)}
155      *
156      * @throws IOException
157      *             if there is an error processing the input data
158      * @throws MicrodataParserException
159      *             if there is an error within the {@link org.apache.any23.extractor.microdata.MicrodataParser}
160      */
161     @Test
162     public void testDeferProperties() throws IOException, MicrodataParserException {
163         final Document document = getMicrodataDom("microdata-itemref");
164         final MicrodataParser parser = new MicrodataParser(document);
165         final ItemProp[] deferred = parser.deferProperties("ip5", "ip4", "ip3", "unexisting");
166         Assert.assertEquals(3, deferred.length);
167     }
168 
169     /**
170      * Tests the loop detection in {@link MicrodataParser#deferProperties(String...)}.
171      *
172      * @throws IOException
173      *             if there is an error processing the input data
174      * @throws MicrodataParserException
175      *             if there is an error within the {@link org.apache.any23.extractor.microdata.MicrodataParser}
176      */
177     @Test(expected = MicrodataParserException.class)
178     public void testDeferPropertiesLoopDetection1() throws IOException, MicrodataParserException {
179         final Document document = getMicrodataDom("microdata-itemref");
180         final MicrodataParser parser = new MicrodataParser(document);
181         parser.setErrorMode(MicrodataParser.ErrorMode.STOP_AT_FIRST_ERROR);
182         parser.deferProperties("loop0");
183     }
184 
185     /**
186      * Tests the deep loop detection in {@link MicrodataParser#deferProperties(String...)}.
187      *
188      * @throws IOException
189      *             if there is an error processing the input data
190      * @throws MicrodataParserException
191      *             if there is an error within the {@link org.apache.any23.extractor.microdata.MicrodataParser}
192      */
193     @Test(expected = MicrodataParserException.class)
194     public void testDeferPropertiesLoopDetection2() throws IOException, MicrodataParserException {
195         final Document document = getMicrodataDom("microdata-itemref");
196         final MicrodataParser parser = new MicrodataParser(document);
197         parser.setErrorMode(MicrodataParser.ErrorMode.STOP_AT_FIRST_ERROR);
198         parser.deferProperties("loop2");
199     }
200 
201     /**
202      * Tests that the loop detection works property even with multiple calls of
203      * {@link MicrodataParser#deferProperties(String...)} over the same item props.
204      *
205      * @throws java.io.IOException
206      *             if there is an error processing the input data
207      * @throws MicrodataParserException
208      *             if there is an error within the {@link org.apache.any23.extractor.microdata.MicrodataParser}
209      */
210     @Test
211     public void testDeferPropertiesStateManagement() throws IOException, MicrodataParserException {
212         final Document document = getMicrodataDom("microdata-itemref");
213         final MicrodataParser parser = new MicrodataParser(document);
214         String ip1 = "ip1";
215         Assert.assertEquals(1, parser.deferProperties(ip1).length);
216         Assert.assertEquals(1, parser.deferProperties(ip1).length);
217         Assert.assertEquals(1, parser.deferProperties(ip1).length);
218     }
219 
220     private Document getDom(String document) throws IOException {
221         final InputStream is = this.getClass().getResourceAsStream(document);
222         try {
223             final TagSoupParser tagSoupParser = new TagSoupParser(is, "http://test-document");
224             return tagSoupParser.getDOM();
225         } finally {
226             is.close();
227         }
228     }
229 
230     private Document getDom(byte[] document) throws IOException {
231         final InputStream is = new ByteArrayInputStream(document);
232         try {
233             final TagSoupParser tagSoupParser = new TagSoupParser(is, "http://test-document");
234             return tagSoupParser.getDOM();
235         } finally {
236             is.close();
237         }
238     }
239 
240     private Document getMicrodataDom(String htmlFile) throws IOException {
241         return getDom("/microdata/" + htmlFile + ".html");
242     }
243 
244     private MicrodataParserReport extractItems(String htmlFile) throws IOException {
245         final Document document = getMicrodataDom(htmlFile);
246         return MicrodataParser.getMicrodata(document);
247     }
248 
249     private void extractItemsAndVerifyJSONSerialization(String htmlFile, String expectedResult) throws IOException {
250         final MicrodataParserReport report = extractItems(htmlFile);
251         final ItemScope[] items = report.getDetectedItemScopes();
252         final MicrodataParserException[] errors = report.getErrors();
253 
254         logger.debug("begin itemScopes");
255         for (ItemScope item : items) {
256             logger.debug(item.toJSON());
257         }
258         logger.debug("end itemScopes");
259         logger.debug("begin errors");
260         for (MicrodataParserException error : errors) {
261             logger.debug(error.toJSON());
262         }
263         logger.debug("end errors");
264 
265         final Properties resultContent = new Properties();
266         resultContent.load(this.getClass().getResourceAsStream("/microdata/" + expectedResult + ".properties"));
267 
268         final int expectedResults = getExpectedResultCount(resultContent);
269         final int expectedErrors = getExpectedErrorsCount(resultContent);
270         Assert.assertEquals("Unexpected number of detect items.", expectedResults, items.length);
271         Assert.assertEquals("Unexpected number of errors.", expectedErrors, errors.length);
272 
273         for (int i = 0; i < items.length; i++) {
274             Assert.assertEquals("Error while comparing result [" + i + "]", resultContent.getProperty("result" + i),
275                     items[i].toJSON());
276         }
277 
278         for (int i = 0; i < errors.length; i++) {
279             // Jsoup doesn't support element locations
280             Assert.assertEquals("Error while comparing error [" + i + "]",
281                     resultContent.getProperty("error" + i).replaceAll("_row\" : -?\\d+", "_row\" : -1")
282                             .replaceAll("_col\" : -?\\d+", "_col\" : -1"),
283                     errors[i].toJSON().replaceAll("_row\" : -?\\d+", "_row\" : -1").replaceAll("_col\" : -?\\d+",
284                             "_col\" : -1"));
285         }
286     }
287 
288     private int countKeysWithPrefix(Properties properties, String prefix) {
289         int count = 0;
290         for (Object key : properties.keySet()) {
291             if (key.toString().indexOf(prefix) == 0)
292                 count++;
293         }
294         return count;
295     }
296 
297     private int getExpectedResultCount(Properties properties) {
298         return countKeysWithPrefix(properties, "result");
299     }
300 
301     private int getExpectedErrorsCount(Properties properties) {
302         return countKeysWithPrefix(properties, "error");
303     }
304 
305 }