This project has retired. For details please refer to its
Attic page.
MicrodataParserTest xref
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.microdata;
19
20 import java.io.ByteArrayInputStream;
21 import java.io.ByteArrayOutputStream;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.io.PrintStream;
25 import java.nio.charset.StandardCharsets;
26 import java.text.ParseException;
27 import java.time.LocalDate;
28 import java.util.ArrayList;
29 import java.util.Calendar;
30 import java.util.Date;
31 import java.util.GregorianCalendar;
32 import java.util.List;
33 import java.util.Locale;
34 import java.util.Properties;
35 import java.util.TimeZone;
36 import java.util.concurrent.CountDownLatch;
37 import java.util.concurrent.atomic.AtomicBoolean;
38
39 import org.apache.any23.extractor.html.TagSoupParser;
40 import org.apache.any23.util.StreamUtils;
41 import org.apache.commons.io.IOUtils;
42 import org.junit.Assert;
43 import org.junit.Ignore;
44 import org.junit.Test;
45 import org.slf4j.Logger;
46 import org.slf4j.LoggerFactory;
47 import org.w3c.dom.Document;
48
49 import static org.junit.Assert.assertFalse;
50
51
52
53
54
55
56 public class MicrodataParserTest {
57
58 private static final Logger logger = LoggerFactory.getLogger(MicrodataParserTest.class);
59
60 @Test
61 public void testBasicFeatures() throws IOException {
62 extractItemsAndVerifyJSONSerialization("microdata-basic", "microdata-basic-expected");
63 }
64
65 @Test
66 public void testNestedMicrodata() throws IOException {
67 extractItemsAndVerifyJSONSerialization("microdata-nested", "microdata-nested-expected");
68 }
69
70 @Test
71 public void testAdvancedItemrefManagement() throws IOException {
72 extractItemsAndVerifyJSONSerialization("microdata-itemref", "microdata-itemref-expected");
73 }
74
75 @Test
76 public void testMicrodataJSONSerialization() throws IOException {
77 final Document document = getMicrodataDom("microdata-nested");
78 final ByteArrayOutputStream baos = new ByteArrayOutputStream();
79 final PrintStream ps = new PrintStream(baos, true, StandardCharsets.UTF_8);
80 MicrodataParser.getMicrodataAsJSON(document, ps);
81 ps.flush();
82 final String expected = StreamUtils
83 .asString(this.getClass().getResourceAsStream("/microdata/microdata-json-serialization.json"));
84
85 Assert.assertEquals("Unexpected serialization for Microdata file.", expected,
86 baos.toString(StandardCharsets.UTF_8));
87 }
88
89 @Test
90 @Ignore
91 public void testGetContentAsDate() throws IOException, ParseException {
92 final ItemScope target = extractItems("microdata-basic").getDetectedItemScopes()[4];
93 final GregorianCalendar gregorianCalendar = new GregorianCalendar(TimeZone.getDefault(), Locale.ROOT);
94 gregorianCalendar.set(2009, GregorianCalendar.MAY, 10);
95 gregorianCalendar.set(Calendar.HOUR, 0);
96 gregorianCalendar.set(Calendar.MINUTE, 0);
97 gregorianCalendar.set(Calendar.SECOND, 0);
98 Assert.assertEquals(gregorianCalendar.getTime().toString(),
99 target.getProperties().get("birthday").get(0).getValue().getAsDate().toString());
100 }
101
102 @Test
103 @Ignore
104 public void testGetDateConcurrent() throws Exception {
105 GregorianCalendar gc = new GregorianCalendar(TimeZone.getDefault(), Locale.ROOT);
106 gc.set(2009, GregorianCalendar.MAY, 10);
107 gc.set(Calendar.HOUR, 0);
108 gc.set(Calendar.MINUTE, 0);
109 gc.set(Calendar.SECOND, 0);
110 final Date expectedDate = gc.getTime();
111 final byte[] content = IOUtils.toByteArray(getClass().getResourceAsStream("/microdata/microdata-basic.html"));
112 final int threadCount = 10;
113 final int attemptCount = 100;
114 final List<Thread> threads = new ArrayList<Thread>();
115 final CountDownLatch beforeLatch = new CountDownLatch(1);
116 final CountDownLatch afterLatch = new CountDownLatch(threadCount);
117 final AtomicBoolean foundFailure = new AtomicBoolean(false);
118 for (int i = 0; i < threadCount; i++) {
119 threads.add(new Thread("Test-thread-" + i) {
120 @Override
121 public void run() {
122 try {
123 beforeLatch.await();
124 int counter = 0;
125 while (counter++ < attemptCount && !foundFailure.get()) {
126 final Document document = getDom(content);
127 final MicrodataParserReport report = MicrodataParser.getMicrodata(document);
128 final ItemScope target = report.getDetectedItemScopes()[4];
129 Date actualDate = target.getProperties().get("birthday").get(0).getValue().getAsDate();
130 if (!expectedDate.toString().equals(actualDate.toString())) {
131 foundFailure.set(true);
132 }
133 }
134 } catch (Exception ex) {
135 logger.error(ex.getMessage());
136 foundFailure.set(true);
137 } finally {
138 afterLatch.countDown();
139 }
140 }
141 });
142 }
143 for (Thread thread : threads) {
144 thread.start();
145 }
146
147 beforeLatch.countDown();
148
149 afterLatch.await();
150 assertFalse(foundFailure.get());
151 }
152
153
154
155
156
157
158
159
160
161 @Test
162 public void testDeferProperties() throws IOException, MicrodataParserException {
163 final Document document = getMicrodataDom("microdata-itemref");
164 final MicrodataParser parser = new MicrodataParser(document);
165 final ItemProp[] deferred = parser.deferProperties("ip5", "ip4", "ip3", "unexisting");
166 Assert.assertEquals(3, deferred.length);
167 }
168
169
170
171
172
173
174
175
176
177 @Test(expected = MicrodataParserException.class)
178 public void testDeferPropertiesLoopDetection1() throws IOException, MicrodataParserException {
179 final Document document = getMicrodataDom("microdata-itemref");
180 final MicrodataParser parser = new MicrodataParser(document);
181 parser.setErrorMode(MicrodataParser.ErrorMode.STOP_AT_FIRST_ERROR);
182 parser.deferProperties("loop0");
183 }
184
185
186
187
188
189
190
191
192
193 @Test(expected = MicrodataParserException.class)
194 public void testDeferPropertiesLoopDetection2() throws IOException, MicrodataParserException {
195 final Document document = getMicrodataDom("microdata-itemref");
196 final MicrodataParser parser = new MicrodataParser(document);
197 parser.setErrorMode(MicrodataParser.ErrorMode.STOP_AT_FIRST_ERROR);
198 parser.deferProperties("loop2");
199 }
200
201
202
203
204
205
206
207
208
209
210 @Test
211 public void testDeferPropertiesStateManagement() throws IOException, MicrodataParserException {
212 final Document document = getMicrodataDom("microdata-itemref");
213 final MicrodataParser parser = new MicrodataParser(document);
214 String ip1 = "ip1";
215 Assert.assertEquals(1, parser.deferProperties(ip1).length);
216 Assert.assertEquals(1, parser.deferProperties(ip1).length);
217 Assert.assertEquals(1, parser.deferProperties(ip1).length);
218 }
219
220 private Document getDom(String document) throws IOException {
221 final InputStream is = this.getClass().getResourceAsStream(document);
222 try {
223 final TagSoupParser tagSoupParser = new TagSoupParser(is, "http://test-document");
224 return tagSoupParser.getDOM();
225 } finally {
226 is.close();
227 }
228 }
229
230 private Document getDom(byte[] document) throws IOException {
231 final InputStream is = new ByteArrayInputStream(document);
232 try {
233 final TagSoupParser tagSoupParser = new TagSoupParser(is, "http://test-document");
234 return tagSoupParser.getDOM();
235 } finally {
236 is.close();
237 }
238 }
239
240 private Document getMicrodataDom(String htmlFile) throws IOException {
241 return getDom("/microdata/" + htmlFile + ".html");
242 }
243
244 private MicrodataParserReport extractItems(String htmlFile) throws IOException {
245 final Document document = getMicrodataDom(htmlFile);
246 return MicrodataParser.getMicrodata(document);
247 }
248
249 private void extractItemsAndVerifyJSONSerialization(String htmlFile, String expectedResult) throws IOException {
250 final MicrodataParserReport report = extractItems(htmlFile);
251 final ItemScope[] items = report.getDetectedItemScopes();
252 final MicrodataParserException[] errors = report.getErrors();
253
254 logger.debug("begin itemScopes");
255 for (ItemScope item : items) {
256 logger.debug(item.toJSON());
257 }
258 logger.debug("end itemScopes");
259 logger.debug("begin errors");
260 for (MicrodataParserException error : errors) {
261 logger.debug(error.toJSON());
262 }
263 logger.debug("end errors");
264
265 final Properties resultContent = new Properties();
266 resultContent.load(this.getClass().getResourceAsStream("/microdata/" + expectedResult + ".properties"));
267
268 final int expectedResults = getExpectedResultCount(resultContent);
269 final int expectedErrors = getExpectedErrorsCount(resultContent);
270 Assert.assertEquals("Unexpected number of detect items.", expectedResults, items.length);
271 Assert.assertEquals("Unexpected number of errors.", expectedErrors, errors.length);
272
273 for (int i = 0; i < items.length; i++) {
274 Assert.assertEquals("Error while comparing result [" + i + "]", resultContent.getProperty("result" + i),
275 items[i].toJSON());
276 }
277
278 for (int i = 0; i < errors.length; i++) {
279
280 Assert.assertEquals("Error while comparing error [" + i + "]",
281 resultContent.getProperty("error" + i).replaceAll("_row\" : -?\\d+", "_row\" : -1")
282 .replaceAll("_col\" : -?\\d+", "_col\" : -1"),
283 errors[i].toJSON().replaceAll("_row\" : -?\\d+", "_row\" : -1").replaceAll("_col\" : -?\\d+",
284 "_col\" : -1"));
285 }
286 }
287
288 private int countKeysWithPrefix(Properties properties, String prefix) {
289 int count = 0;
290 for (Object key : properties.keySet()) {
291 if (key.toString().indexOf(prefix) == 0)
292 count++;
293 }
294 return count;
295 }
296
297 private int getExpectedResultCount(Properties properties) {
298 return countKeysWithPrefix(properties, "result");
299 }
300
301 private int getExpectedErrorsCount(Properties properties) {
302 return countKeysWithPrefix(properties, "error");
303 }
304
305 }