This project has retired. For details please refer to its Attic page.
PeopleExtractor xref
View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.any23.cli.flows;
18  
19  import org.apache.any23.extractor.ExtractionContext;
20  import org.apache.any23.vocab.CSV;
21  import org.apache.any23.writer.CompositeTripleHandler;
22  import org.apache.any23.writer.TripleHandler;
23  import org.apache.any23.writer.TripleHandlerException;
24  import org.apache.commons.codec.digest.DigestUtils;
25  import org.eclipse.rdf4j.model.IRI;
26  import org.eclipse.rdf4j.model.Literal;
27  import org.eclipse.rdf4j.model.Model;
28  import org.eclipse.rdf4j.model.Resource;
29  import org.eclipse.rdf4j.model.Statement;
30  import org.eclipse.rdf4j.model.Value;
31  import org.eclipse.rdf4j.model.ValueFactory;
32  import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
33  import org.eclipse.rdf4j.model.impl.TreeModel;
34  import org.eclipse.rdf4j.model.util.Models;
35  import org.eclipse.rdf4j.model.vocabulary.RDF;
36  import org.eclipse.rdf4j.model.vocabulary.XMLSchema;
37  import org.slf4j.Logger;
38  import org.slf4j.LoggerFactory;
39  
40  import java.lang.invoke.MethodHandles;
41  import java.util.Collections;
42  import java.util.Set;
43  import java.util.stream.Collectors;
44  
45  /**
46   * Proof of concept for ANY23-396 example.
47   */
48  public class PeopleExtractor extends CompositeTripleHandler {
49  
50      private Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
51  
52      private static final CSV csv = CSV.getInstance();
53      private static final ValueFactory vf = SimpleValueFactory.getInstance();
54      public static final String RAW_NS = "urn:dataser:raw/";
55      private static final IRI RAW_FIRST_NAME = vf.createIRI(RAW_NS, "FirstName");
56      private static final IRI RAW_LAST_NAME = vf.createIRI(RAW_NS, "LastName");
57  
58      private static final String NAMESPACE = "http://supercustom.net/ontology/";
59      private static final IRI PERSON = vf.createIRI(NAMESPACE, "Person");
60      private static final IRI FULL_NAME = vf.createIRI(NAMESPACE, "fullName");
61      private static final IRI HASH = vf.createIRI(NAMESPACE, "hash");
62  
63      public static Model createPerson(String fullName) {
64          IRI s = vf.createIRI("http://rdf.supercustom.net/data/", DigestUtils.sha1Hex(fullName));
65          Model model = new TreeModel();
66          model.add(s, RDF.TYPE, PERSON);
67          model.add(s, FULL_NAME, vf.createLiteral(fullName));
68          model.add(s, HASH, vf.createLiteral(s.getLocalName(), XMLSchema.HEXBINARY));
69          return model;
70      };
71  
72      private final Model csvModel = new TreeModel();
73  
74      public PeopleExtractor(TripleHandler delegate) {
75          super(Collections.singletonList(delegate));
76      }
77  
78      @Override
79      public void receiveTriple(Resource s, IRI p, Value o, IRI g, ExtractionContext context)
80              throws TripleHandlerException {
81          if ("csv".equals(context.getExtractorName())) {
82              csvModel.add(s, p, o, vf.createIRI(context.getUniqueID()));
83          } else {
84              super.receiveTriple(s, p, o, g, context);
85          }
86      }
87  
88      @Override
89      public void closeContext(ExtractionContext context) throws TripleHandlerException {
90          Set<Resource> subjects = csvModel.filter(null, RDF.TYPE, csv.rowType).stream().map(Statement::getSubject)
91                  .collect(Collectors.toSet());
92  
93          log.debug("List of rows: {}", subjects);
94  
95          for (Resource rowId : subjects) {
96              String firstName = Models.objectLiteral(csvModel.filter(rowId, RAW_FIRST_NAME, null)).map(Literal::getLabel)
97                      .orElse("");
98  
99              String lastName = Models.objectLiteral(csvModel.filter(rowId, RAW_LAST_NAME, null)).map(Literal::getLabel)
100                     .orElse("");
101 
102             String fullName = firstName + " " + lastName;
103 
104             for (Statement s : createPerson(fullName)) {
105                 super.receiveTriple(s.getSubject(), s.getPredicate(), s.getObject(), null, context);
106             }
107         }
108 
109         csvModel.clear();
110 
111         super.closeContext(context);
112     }
113 
114 }