View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.any23.extractor.yaml;
18  
19  import java.io.IOException;
20  import java.io.InputStream;
21  import org.apache.any23.extractor.ExtractionContext;
22  import org.apache.any23.extractor.ExtractionException;
23  import org.apache.any23.extractor.ExtractionParameters;
24  import org.apache.any23.extractor.ExtractionResult;
25  import org.apache.any23.extractor.Extractor;
26  import org.apache.any23.extractor.ExtractorDescription;
27  import org.apache.any23.rdf.RDFUtils;
28  import org.apache.any23.vocab.YAML;
29  import org.eclipse.rdf4j.model.Resource;
30  import org.eclipse.rdf4j.model.IRI;
31  import org.eclipse.rdf4j.model.vocabulary.RDF;
32  import org.eclipse.rdf4j.model.vocabulary.RDFS;
33  import org.slf4j.Logger;
34  import org.slf4j.LoggerFactory;
35  import org.yaml.snakeyaml.Yaml;
36  
37  /**
38   * @author Jacek Grzebyta (grzebyta.dev [at] gmail.com)
39   */
40  public class YAMLExtractor implements Extractor.ContentExtractor {
41  
42      private final Logger log = LoggerFactory.getLogger(getClass());
43  
44      private static final Yaml yml = new Yaml();
45  
46      private static final YAML vocab = YAML.getInstance();
47  
48      private final ElementsProcessor ep = ElementsProcessor.getInstance();
49  
50      private Resource documentRoot;
51  
52      @Override
53      public void setStopAtFirstError(boolean f) {
54      }
55  
56      @Override
57      public void run(ExtractionParameters extractionParameters, ExtractionContext context, InputStream in,
58              ExtractionResult out)
59              throws IOException, ExtractionException {
60  
61          IRI documentIRI = context.getDocumentIRI();
62          documentRoot = RDFUtils.iri(documentIRI.toString() + "root");
63  
64          log.debug("Processing: {}", documentIRI.toString());
65          out.writeNamespace(vocab.PREFIX, vocab.NS);
66          out.writeNamespace(RDF.PREFIX, RDF.NAMESPACE);
67          out.writeNamespace(RDFS.PREFIX, RDFS.NAMESPACE);
68  
69          out.writeTriple(documentRoot, RDF.TYPE, vocab.root);
70          Iterable<Object> docIterate = yml.loadAll(in);
71  
72          // Iterate over page(s)
73          for (Object p : docIterate) {
74              Resource pageNode = RDFUtils.makeIRI("document", documentIRI, true);
75              out.writeTriple(documentRoot, vocab.contains, pageNode);
76              out.writeTriple(pageNode, RDF.TYPE, vocab.document);
77              ElementsProcessor.ModelHolder rootNode = ep.asModel(documentIRI, p, pageNode);
78              
79              if (rootNode == null) {
80                  continue;
81              }
82              
83              if (!rootNode.getRoot().equals(pageNode)) {
84                  out.writeTriple(pageNode, vocab.contains, rootNode.getRoot());
85              }
86              
87              log.debug("Subgraph root node: {}", rootNode.getRoot().stringValue());
88              
89              rootNode.getModel().forEach((s) ->{
90                  out.writeTriple(s.getSubject(), s.getPredicate(), s.getObject());
91              });
92              
93          }
94  
95      }
96  
97      @Override
98      public ExtractorDescription getDescription() {
99          return YAMLExtractorFactory.getDescriptionInstance();
100     }
101 
102 }