View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionContext;
21  import org.apache.any23.extractor.ExtractionException;
22  import org.apache.any23.extractor.ExtractionParameters;
23  import org.apache.any23.extractor.ExtractionResult;
24  import org.apache.any23.extractor.Extractor;
25  import org.apache.any23.extractor.ExtractorDescription;
26  import org.apache.any23.extractor.rdf.JSONLDExtractor;
27  import org.apache.any23.extractor.rdf.JSONLDExtractorFactory;
28  import org.apache.any23.rdf.RDFUtils;
29  import org.apache.any23.vocab.SINDICE;
30  import org.apache.commons.io.IOUtils;
31  import org.eclipse.rdf4j.model.IRI;
32  import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
33  import org.w3c.dom.Document;
34  import org.w3c.dom.NamedNodeMap;
35  import org.w3c.dom.Node;
36  
37  import java.io.IOException;
38  import java.nio.charset.StandardCharsets;
39  import java.util.HashMap;
40  import java.util.HashSet;
41  import java.util.List;
42  import java.util.Map;
43  import java.util.Set;
44  
45  /**
46   * This extractor represents the HTML script tags used to embed blocks of data in documents. This way, JSON-LD content
47   * can be easily embedded in HTML by placing it in a script element with the type attribute set to application/ld+json
48   * according the <a href="http://www.w3.org/TR/json-ld/#embedding-json-ld-in-html-documents" >JSON-LD specification</a>.
49   *
50   */
51  public class EmbeddedJSONLDExtractor implements Extractor.TagSoupDOMExtractor {
52  
53      private static final SINDICE vSINDICE = SINDICE.getInstance();
54  
55      private IRI profile;
56  
57      private Map<String, IRI> prefixes = new HashMap<>();
58  
59      private String documentLang;
60  
61      private JSONLDExtractor extractor;
62  
63      /**
64       * {@inheritDoc}
65       */
66      @Override
67      public void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, Document in,
68              ExtractionResult out) throws IOException, ExtractionException {
69          profile = extractProfile(in);
70          documentLang = getDocumentLanguage(in);
71          extractLinkDefinedPrefixes(in);
72  
73          String baseProfile = vSINDICE.NS;
74          if (profile != null) {
75              baseProfile = profile.toString();
76          }
77  
78          extractionContext.getDocumentIRI();
79          Set<JSONLDScript> jsonldScripts = extractJSONLDScript(in, baseProfile, extractionParameters, extractionContext,
80                  out);
81          for (JSONLDScript jsonldScript : jsonldScripts) {
82              // String lang = documentLang;
83              // if (jsonldScript.getLang() != null) {
84              // lang = jsonldScript.getLang();
85              // }
86              // out.writeTriple(documentIRI, jsonldScript.getName(),
87              // SimpleValueFactory.getInstance().createLiteral(jsonldScript.getContent(), lang));
88          }
89      }
90  
91      /**
92       * Returns the {@link Document} language if declared, <code>null</code> otherwise.
93       *
94       * @param in
95       *            a instance of {@link Document}.
96       * 
97       * @return the language declared, could be <code>null</code>.
98       */
99      private String getDocumentLanguage(Document in) {
100         String lang = DomUtils.find(in, "string(/HTML/@lang)");
101         if ("".equals(lang)) {
102             return null;
103         }
104         return lang;
105     }
106 
107     private IRI extractProfile(Document in) {
108         String profile = DomUtils.find(in, "string(/HTML/@profile)");
109         if ("".equals(profile)) {
110             return null;
111         }
112         return SimpleValueFactory.getInstance().createIRI(profile);
113     }
114 
115     /**
116      * It extracts prefixes defined in the <i>LINK</i> meta tags.
117      *
118      * @param in
119      */
120     private void extractLinkDefinedPrefixes(Document in) {
121         List<Node> linkNodes = DomUtils.findAll(in, "/HTML/HEAD/LINK");
122         for (Node linkNode : linkNodes) {
123             NamedNodeMap attributes = linkNode.getAttributes();
124             Node relNode = attributes.getNamedItem("rel");
125             String rel = relNode == null ? null : relNode.getTextContent();
126             Node hrefNode = attributes.getNamedItem("href");
127             String href = hrefNode == null ? null : hrefNode.getTextContent();
128             if (rel != null && href != null && RDFUtils.isAbsoluteIRI(href)) {
129                 prefixes.put(rel, SimpleValueFactory.getInstance().createIRI(href));
130             }
131         }
132     }
133 
134     private Set<JSONLDScript> extractJSONLDScript(Document in, String baseProfile,
135             ExtractionParameters extractionParameters, ExtractionContext extractionContext, ExtractionResult out)
136             throws IOException, ExtractionException {
137         List<Node> scriptNodes = DomUtils.findAll(in, "//SCRIPT");
138         Set<JSONLDScript> result = new HashSet<>();
139         extractor = new JSONLDExtractorFactory().createExtractor();
140         for (Node jsonldNode : scriptNodes) {
141             NamedNodeMap attributes = jsonldNode.getAttributes();
142             for (int i = 0; i < attributes.getLength(); i++) {
143                 if ("application/ld+json".equalsIgnoreCase(attributes.item(i).getTextContent())) {
144                     extractor.run(extractionParameters, extractionContext,
145                             IOUtils.toInputStream(jsonldNode.getTextContent(), StandardCharsets.UTF_8), out);
146                 }
147             }
148             Node nameAttribute = attributes.getNamedItem("name");
149             Node contentAttribute = attributes.getNamedItem("content");
150             if (nameAttribute == null || contentAttribute == null) {
151                 continue;
152             }
153             String name = nameAttribute.getTextContent();
154             String content = contentAttribute.getTextContent();
155             String xpath = DomUtils.getXPathForNode(jsonldNode);
156             IRI nameAsIRI = getPrefixIfExists(name);
157             if (nameAsIRI == null) {
158                 nameAsIRI = SimpleValueFactory.getInstance().createIRI(baseProfile + name);
159             }
160             JSONLDScript jsonldScript = new JSONLDScript(xpath, nameAsIRI, content);
161             result.add(jsonldScript);
162         }
163         return result;
164     }
165 
166     private IRI getPrefixIfExists(String name) {
167         String[] split = name.split("\\.");
168         if (split.length == 2 && prefixes.containsKey(split[0])) {
169             return SimpleValueFactory.getInstance().createIRI(prefixes.get(split[0]) + split[1]);
170         }
171         return null;
172     }
173 
174     @Override
175     public ExtractorDescription getDescription() {
176         return EmbeddedJSONLDExtractorFactory.getDescriptionInstance();
177     }
178 
179     private static class JSONLDScript {
180 
181         private String xpath;
182 
183         public JSONLDScript(String xpath, IRI name, String content) {
184             this.xpath = xpath;
185         }
186 
187         @Override
188         public boolean equals(Object o) {
189             if (this == o) {
190                 return true;
191             }
192             if (o == null) {
193                 return false;
194             }
195             if (!(o instanceof JSONLDScript)) {
196                 return false;
197             }
198 
199             JSONLDScript meta = (JSONLDScript) o;
200 
201             if (xpath != null ? !xpath.equals(meta.xpath) : meta.xpath != null) {
202                 return false;
203             }
204 
205             return true;
206         }
207 
208         @Override
209         public int hashCode() {
210             return xpath != null ? xpath.hashCode() : 0;
211         }
212     }
213 
214 }