View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionContext;
21  import org.apache.any23.extractor.ExtractionException;
22  import org.apache.any23.extractor.ExtractionParameters;
23  import org.apache.any23.extractor.ExtractionResult;
24  import org.apache.any23.extractor.Extractor;
25  import org.apache.any23.extractor.ExtractorDescription;
26  import org.apache.any23.extractor.rdf.JSONLDExtractor;
27  import org.apache.any23.extractor.rdf.JSONLDExtractorFactory;
28  import org.apache.any23.rdf.RDFUtils;
29  import org.apache.any23.vocab.SINDICE;
30  import org.apache.commons.io.IOUtils;
31  import org.eclipse.rdf4j.model.IRI;
32  import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
33  import org.w3c.dom.Document;
34  import org.w3c.dom.NamedNodeMap;
35  import org.w3c.dom.Node;
36  
37  import java.io.IOException;
38  import java.nio.charset.StandardCharsets;
39  import java.util.HashMap;
40  import java.util.HashSet;
41  import java.util.List;
42  import java.util.Map;
43  import java.util.Set;
44  
45  /**
46   * This extractor represents the HTML script tags used to embed blocks of data
47   * in documents. This way, JSON-LD content can be easily embedded in HTML by
48   * placing it in a script element with the type attribute set to
49   * application/ld+json according the <a
50   * href="http://www.w3.org/TR/json-ld/#embedding-json-ld-in-html-documents"
51   * >JSON-LD specification</a>.
52   *
53   */
54  public class EmbeddedJSONLDExtractor implements Extractor.TagSoupDOMExtractor {
55  
56    private static final SINDICE vSINDICE = SINDICE.getInstance();
57  
58    private IRI profile;
59  
60    private Map<String, IRI> prefixes = new HashMap<>();
61  
62    private String documentLang;
63  
64    private JSONLDExtractor extractor;
65  
66    /**
67     * {@inheritDoc}
68     */
69    @Override
70    public void run(ExtractionParameters extractionParameters,
71            ExtractionContext extractionContext, Document in,
72            ExtractionResult out) throws IOException, ExtractionException {
73      profile = extractProfile(in);
74      documentLang = getDocumentLanguage(in);
75      extractLinkDefinedPrefixes(in);
76  
77      String baseProfile = vSINDICE.NS;
78      if (profile != null) {
79        baseProfile = profile.toString();
80      }
81  
82      extractionContext.getDocumentIRI();
83      Set<JSONLDScript> jsonldScripts = extractJSONLDScript(in, baseProfile,
84              extractionParameters, extractionContext, out);
85      for (JSONLDScript jsonldScript : jsonldScripts) {
86        //String lang = documentLang;
87        //if (jsonldScript.getLang() != null) {
88        //	lang = jsonldScript.getLang();
89        //}
90        //out.writeTriple(documentIRI, jsonldScript.getName(),
91        //		SimpleValueFactory.getInstance().createLiteral(jsonldScript.getContent(), lang));
92      }
93    }
94  
95    /**
96     * Returns the {@link Document} language if declared, <code>null</code>
97     * otherwise.
98     *
99     * @param in
100    *            a instance of {@link Document}.
101    * @return the language declared, could be <code>null</code>.
102    */
103   private String getDocumentLanguage(Document in) {
104     String lang = DomUtils.find(in, "string(/HTML/@lang)");
105     if ("".equals(lang)) {
106       return null;
107     }
108     return lang;
109   }
110 
111   private IRI extractProfile(Document in) {
112     String profile = DomUtils.find(in, "string(/HTML/@profile)");
113     if ("".equals(profile)) {
114       return null;
115     }
116     return SimpleValueFactory.getInstance().createIRI(profile);
117   }
118 
119   /**
120    * It extracts prefixes defined in the <i>LINK</i> meta tags.
121    *
122    * @param in
123    */
124   private void extractLinkDefinedPrefixes(Document in) {
125     List<Node> linkNodes = DomUtils.findAll(in, "/HTML/HEAD/LINK");
126     for (Node linkNode : linkNodes) {
127       NamedNodeMap attributes = linkNode.getAttributes();
128       Node relNode = attributes.getNamedItem("rel");
129       String rel = relNode == null ? null : relNode.getTextContent();
130       Node hrefNode = attributes.getNamedItem("href");
131       String href = hrefNode == null ? null : hrefNode.getTextContent();
132       if (rel != null && href != null && RDFUtils.isAbsoluteIRI(href)) {
133         prefixes.put(rel, SimpleValueFactory.getInstance().createIRI(href));
134       }
135     }
136   }
137 
138   private Set<JSONLDScript> extractJSONLDScript(Document in,
139           String baseProfile, ExtractionParameters extractionParameters,
140           ExtractionContext extractionContext, ExtractionResult out)
141                   throws IOException, ExtractionException {
142     List<Node> scriptNodes = DomUtils.findAll(in, "//SCRIPT");
143     Set<JSONLDScript> result = new HashSet<>();
144     extractor = new JSONLDExtractorFactory().createExtractor();
145     for (Node jsonldNode : scriptNodes) {
146       NamedNodeMap attributes = jsonldNode.getAttributes();
147       for (int i = 0; i < attributes.getLength(); i++) {
148         if ("application/ld+json".equalsIgnoreCase(attributes.item(i).getTextContent())) {
149           extractor.run(extractionParameters, extractionContext,
150                   IOUtils.toInputStream(jsonldNode.getTextContent(), StandardCharsets.UTF_8), out);
151         }
152       }
153       Node nameAttribute = attributes.getNamedItem("name");
154       Node contentAttribute = attributes.getNamedItem("content");
155       if (nameAttribute == null || contentAttribute == null) {
156         continue;
157       }
158       String name = nameAttribute.getTextContent();
159       String content = contentAttribute.getTextContent();
160       String xpath = DomUtils.getXPathForNode(jsonldNode);
161       IRI nameAsIRI = getPrefixIfExists(name);
162       if (nameAsIRI == null) {
163         nameAsIRI = SimpleValueFactory.getInstance().createIRI(baseProfile + name);
164       }
165       JSONLDScript jsonldScript = new JSONLDScript(xpath, nameAsIRI,
166               content);
167       result.add(jsonldScript);
168     }
169     return result;
170   }
171 
172   private IRI getPrefixIfExists(String name) {
173     String[] split = name.split("\\.");
174     if (split.length == 2 && prefixes.containsKey(split[0])) {
175       return SimpleValueFactory.getInstance().createIRI(prefixes.get(split[0]) + split[1]);
176     }
177     return null;
178   }
179 
180   @Override
181   public ExtractorDescription getDescription() {
182     return EmbeddedJSONLDExtractorFactory.getDescriptionInstance();
183   }
184 
185   private class JSONLDScript {
186 
187     private String xpath;
188 
189     public JSONLDScript(String xpath, IRI name, String content) {
190       this.xpath = xpath;
191     }
192 
193     @Override
194     public boolean equals(Object o) {
195       if (this == o) {
196         return true;
197       }
198       if (o == null) {
199         return false;
200       }
201       if (!(o instanceof JSONLDScript)) {
202         return false;
203       }
204 
205       JSONLDScript meta = (JSONLDScript) o;
206 
207       if (xpath != null ? !xpath.equals(meta.xpath) : meta.xpath != null) {
208         return false;
209       }
210 
211       return true;
212     }
213 
214     @Override
215     public int hashCode() {
216       return xpath != null ? xpath.hashCode() : 0;
217     }
218   }
219 
220 }