This project has retired. For details please refer to its
Attic page.
PeopleExtractor xref
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 package org.apache.any23.cli.flows;
18
19 import org.apache.any23.extractor.ExtractionContext;
20 import org.apache.any23.vocab.CSV;
21 import org.apache.any23.writer.CompositeTripleHandler;
22 import org.apache.any23.writer.TripleHandler;
23 import org.apache.any23.writer.TripleHandlerException;
24 import org.apache.commons.codec.digest.DigestUtils;
25 import org.eclipse.rdf4j.model.IRI;
26 import org.eclipse.rdf4j.model.Literal;
27 import org.eclipse.rdf4j.model.Model;
28 import org.eclipse.rdf4j.model.Resource;
29 import org.eclipse.rdf4j.model.Statement;
30 import org.eclipse.rdf4j.model.Value;
31 import org.eclipse.rdf4j.model.ValueFactory;
32 import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
33 import org.eclipse.rdf4j.model.impl.TreeModel;
34 import org.eclipse.rdf4j.model.util.Models;
35 import org.eclipse.rdf4j.model.vocabulary.RDF;
36 import org.eclipse.rdf4j.model.vocabulary.XMLSchema;
37 import org.slf4j.Logger;
38 import org.slf4j.LoggerFactory;
39
40 import java.lang.invoke.MethodHandles;
41 import java.util.Collections;
42 import java.util.Set;
43 import java.util.stream.Collectors;
44
45
46
47
48 public class PeopleExtractor extends CompositeTripleHandler {
49
50 private Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
51
52 private static final CSV csv = CSV.getInstance();
53 private static final ValueFactory vf = SimpleValueFactory.getInstance();
54 public static final String RAW_NS = "urn:dataser:raw/";
55 private static final IRI RAW_FIRST_NAME = vf.createIRI(RAW_NS, "FirstName");
56 private static final IRI RAW_LAST_NAME = vf.createIRI(RAW_NS, "LastName");
57
58 private static final String NAMESPACE = "http://supercustom.net/ontology/";
59 private static final IRI PERSON = vf.createIRI(NAMESPACE, "Person");
60 private static final IRI FULL_NAME = vf.createIRI(NAMESPACE, "fullName");
61 private static final IRI HASH = vf.createIRI(NAMESPACE, "hash");
62
63 public static Model createPerson(String fullName) {
64 IRI s = vf.createIRI("http://rdf.supercustom.net/data/", DigestUtils.sha1Hex(fullName));
65 Model model = new TreeModel();
66 model.add(s, RDF.TYPE, PERSON);
67 model.add(s, FULL_NAME, vf.createLiteral(fullName));
68 model.add(s, HASH, vf.createLiteral(s.getLocalName(), XMLSchema.HEXBINARY));
69 return model;
70 };
71
72 private final Model csvModel = new TreeModel();
73
74 public PeopleExtractor(TripleHandler delegate) {
75 super(Collections.singletonList(delegate));
76 }
77
78 @Override
79 public void receiveTriple(Resource s, IRI p, Value o, IRI g, ExtractionContext context)
80 throws TripleHandlerException {
81 if ("csv".equals(context.getExtractorName())) {
82 csvModel.add(s, p, o, vf.createIRI(context.getUniqueID()));
83 } else {
84 super.receiveTriple(s, p, o, g, context);
85 }
86 }
87
88 @Override
89 public void closeContext(ExtractionContext context) throws TripleHandlerException {
90 Set<Resource> subjects = csvModel.filter(null, RDF.TYPE, csv.rowType).stream().map(Statement::getSubject)
91 .collect(Collectors.toSet());
92
93 log.debug("List of rows: {}", subjects);
94
95 for (Resource rowId : subjects) {
96 String firstName = Models.objectLiteral(csvModel.filter(rowId, RAW_FIRST_NAME, null)).map(Literal::getLabel)
97 .orElse("");
98
99 String lastName = Models.objectLiteral(csvModel.filter(rowId, RAW_LAST_NAME, null)).map(Literal::getLabel)
100 .orElse("");
101
102 String fullName = firstName + " " + lastName;
103
104 for (Statement s : createPerson(fullName)) {
105 super.receiveTriple(s.getSubject(), s.getPredicate(), s.getObject(), null, context);
106 }
107 }
108
109 csvModel.clear();
110
111 super.closeContext(context);
112 }
113
114 }