1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.source.DocumentSource;
21 import org.apache.any23.source.FileDocumentSource;
22 import org.junit.Assert;
23 import org.w3c.dom.Node;
24
25 import java.io.File;
26 import java.io.FileInputStream;
27 import java.io.IOException;
28
29 /**
30 * This class is a wrapper around an HTML document providing a simply facade.
31 */
32 public class HTMLFixture {
33
34 private final File file;
35
36 public HTMLFixture(File file) {
37 Assert.assertNotNull("Test resource file was null", file);
38 Assert.assertTrue("Test resource file does not exist", file.exists());
39 this.file = file;
40 }
41
42 private File getFile() {
43 return file;
44 }
45
46 public DocumentSource getOpener(String baseIRI) {
47 return new FileDocumentSource(getFile(), baseIRI);
48 }
49
50 /**
51 * @return the DOM root {@link org.w3c.dom.Node} of the whole document.
52 */
53 public Node getDOM() {
54 try {
55 return new TagSoupParser(new FileInputStream(getFile()), "http://example.org/").getDOM();
56 } catch (IOException ex) {
57 throw new RuntimeException(ex);
58 }
59 }
60
61 /**
62 * @return an {@link HTMLDocument} object of the whole HTML document.
63 */
64 public HTMLDocument getHTMLDocument() {
65 return new HTMLDocument(getDOM());
66 }
67 }