|
1
|
|
|
package br.ufrj.ppgi.greco.kettle; |
|
2
|
|
|
|
|
3
|
|
|
import java.io.File; |
|
4
|
|
|
import java.io.IOException; |
|
5
|
|
|
|
|
6
|
|
|
import javax.xml.parsers.DocumentBuilder; |
|
7
|
|
|
import javax.xml.parsers.DocumentBuilderFactory; |
|
8
|
|
|
import javax.xml.parsers.ParserConfigurationException; |
|
9
|
|
|
|
|
10
|
|
|
import org.pentaho.di.core.exception.KettleException; |
|
11
|
|
|
import org.pentaho.di.core.row.RowDataUtil; |
|
12
|
|
|
import org.pentaho.di.core.row.RowMeta; |
|
13
|
|
|
import org.pentaho.di.core.row.RowMetaInterface; |
|
14
|
|
|
import org.pentaho.di.trans.Trans; |
|
15
|
|
|
import org.pentaho.di.trans.TransMeta; |
|
16
|
|
|
import org.pentaho.di.trans.step.BaseStep; |
|
17
|
|
|
import org.pentaho.di.trans.step.StepDataInterface; |
|
18
|
|
|
import org.pentaho.di.trans.step.StepInterface; |
|
19
|
|
|
import org.pentaho.di.trans.step.StepMeta; |
|
20
|
|
|
import org.pentaho.di.trans.step.StepMetaInterface; |
|
21
|
|
|
import org.w3c.dom.Document; |
|
22
|
|
|
import org.w3c.dom.Element; |
|
23
|
|
|
import org.w3c.dom.Node; |
|
24
|
|
|
import org.w3c.dom.NodeList; |
|
25
|
|
|
import org.xml.sax.SAXException; |
|
26
|
|
|
|
|
27
|
|
|
/** |
|
28
|
|
|
* Step Annotator. |
|
29
|
|
|
* <p /> |
|
30
|
|
|
* Gera sentenças RDF no formato N-Triple |
|
31
|
|
|
* |
|
32
|
|
|
* |
|
33
|
|
|
* @author Camila Carvalho Ferreira |
|
34
|
|
|
* |
|
35
|
|
|
*/ |
|
36
|
|
|
public class AnnotatorStep extends BaseStep implements StepInterface { |
|
37
|
|
|
// Constantes |
|
38
|
|
|
public static final String LITERAL_OBJECT_TRIPLE_FORMAT = "<%s> <%s> \"%s\"."; |
|
39
|
|
|
public static final String URI_OBJECT_TRIPLE_FORMAT = "<%s> <%s> <%s> ."; |
|
40
|
|
|
public static final String RDF_TYPE_URI = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"; |
|
41
|
|
|
|
|
42
|
|
|
public AnnotatorStep(StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta, |
|
43
|
|
|
Trans trans) { |
|
44
|
|
|
super(stepMeta, stepDataInterface, copyNr, transMeta, trans); |
|
45
|
|
|
} |
|
46
|
|
|
|
|
47
|
|
|
@Override |
|
48
|
|
|
public boolean init(StepMetaInterface smi, StepDataInterface sdi) { |
|
49
|
|
|
if (super.init(smi, sdi)) { |
|
50
|
|
|
return true; |
|
51
|
|
|
} else |
|
52
|
|
|
return false; |
|
53
|
|
|
} |
|
54
|
|
|
|
|
55
|
|
|
@Override |
|
56
|
|
|
public void dispose(StepMetaInterface smi, StepDataInterface sdi) { |
|
57
|
|
|
super.dispose(smi, sdi); |
|
58
|
|
|
} |
|
59
|
|
|
|
|
60
|
|
|
/** |
|
61
|
|
|
* Metodo chamado para cada linha que entra no step |
|
62
|
|
|
*/ |
|
63
|
|
|
public boolean processRow(StepMetaInterface smi, StepDataInterface sdi) throws KettleException { |
|
64
|
|
|
AnnotatorStepMeta meta = (AnnotatorStepMeta) smi; |
|
65
|
|
|
AnnotatorStepData data = (AnnotatorStepData) sdi; |
|
66
|
|
|
|
|
67
|
|
|
// Obtem linha do fluxo de entrada e termina caso nao haja mais entrada |
|
68
|
|
|
|
|
69
|
|
|
Object[] row = getRow(); |
|
70
|
|
|
|
|
71
|
|
|
if (row == null) { // Nao ha mais linhas de dados |
|
72
|
|
|
setOutputDone(); |
|
73
|
|
|
return false; |
|
74
|
|
|
} |
|
75
|
|
|
|
|
76
|
|
|
// Executa apenas uma vez. Variavel first definida na superclasse com |
|
77
|
|
|
// valor true |
|
78
|
|
|
if (first) { |
|
79
|
|
|
first = false; |
|
80
|
|
|
|
|
81
|
|
|
// Obtem todas as colunas ateh o step anterior. |
|
82
|
|
|
// Chamar apenas apos chamar getRow() |
|
83
|
|
|
RowMetaInterface rowMeta = getInputRowMeta(); |
|
84
|
|
|
data.outputRowMeta = meta.getInnerKeepInputFields() ? rowMeta.clone() : new RowMeta(); |
|
85
|
|
|
|
|
86
|
|
|
// Adiciona os metadados do step atual |
|
87
|
|
|
meta.getFields(data.outputRowMeta, getStepname(), null, null, this); |
|
88
|
|
|
} |
|
89
|
|
|
|
|
90
|
|
|
String outputNTriple; |
|
91
|
|
|
|
|
92
|
|
|
// Logica do step |
|
93
|
|
|
// Leitura de campos Input |
|
94
|
|
|
String inputSubject = getInputRowMeta().getString(row, meta.getInputSubject(), ""); |
|
95
|
|
|
String inputPredicate = getInputRowMeta().getString(row, meta.getInputPredicate(), ""); |
|
96
|
|
|
String inputObject = getInputRowMeta().getString(row, meta.getInputObject(), ""); |
|
97
|
|
|
String outputSubject = inputSubject; |
|
98
|
|
|
String outputPredicate = inputPredicate; |
|
99
|
|
|
String outputObject = inputObject; |
|
100
|
|
|
|
|
101
|
|
|
try { |
|
102
|
|
|
// abre arquivo xml |
|
103
|
|
|
DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance(); |
|
104
|
|
|
DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder(); |
|
105
|
|
|
Document doc = docBuilder.parse(new File(meta.getBrowseFilename())); |
|
106
|
|
|
NodeList listOfMaps = doc.getElementsByTagName("map"); |
|
107
|
|
|
int totalMaps = listOfMaps.getLength(); |
|
108
|
|
|
// procura em cada node map as regras de anota |
|
109
|
|
|
for (int i = 0; i < totalMaps; i++) { |
|
110
|
|
|
Node fromMapNode = listOfMaps.item(i); |
|
111
|
|
|
if (fromMapNode.getNodeType() == Node.ELEMENT_NODE) { |
|
112
|
|
|
Element fromMapElement = (Element) fromMapNode; |
|
113
|
|
|
NodeList fromList = fromMapElement.getElementsByTagName("from"); |
|
114
|
|
|
Element fromElement = (Element) fromList.item(0); |
|
115
|
|
|
NodeList textFList = fromElement.getChildNodes(); |
|
116
|
|
|
NodeList toList = fromMapElement.getElementsByTagName("to"); |
|
117
|
|
|
Element toElement = (Element) toList.item(0); |
|
118
|
|
|
NodeList textTList = toElement.getChildNodes(); |
|
119
|
|
|
if (((Node) textFList.item(0)).getNodeValue().trim().contains(inputSubject)) { |
|
120
|
|
|
outputSubject = ((Node) textTList.item(0)).getNodeValue().trim(); |
|
121
|
|
|
} |
|
122
|
|
|
if (((Node) textFList.item(0)).getNodeValue().trim().contains(inputPredicate)) { |
|
123
|
|
|
outputPredicate = ((Node) textTList.item(0)).getNodeValue().trim(); |
|
124
|
|
|
} |
|
125
|
|
|
if (((Node) textFList.item(0)).getNodeValue().trim().contains(inputObject)) { |
|
126
|
|
|
outputObject = ((Node) textTList.item(0)).getNodeValue().trim(); |
|
127
|
|
|
} |
|
128
|
|
|
} |
|
129
|
|
|
} |
|
130
|
|
|
|
|
131
|
|
|
} catch (ParserConfigurationException e) { |
|
132
|
|
|
// TODO Auto-generated catch block |
|
133
|
|
|
e.printStackTrace(); |
|
134
|
|
|
} catch (SAXException e) { |
|
135
|
|
|
// TODO Auto-generated catch block |
|
136
|
|
|
e.printStackTrace(); |
|
137
|
|
|
} catch (IOException e) { |
|
138
|
|
|
// TODO Auto-generated catch block |
|
139
|
|
|
e.printStackTrace(); |
|
140
|
|
|
} |
|
141
|
|
|
|
|
142
|
|
|
if (inputPredicate.equals(RDF_TYPE_URI)) { |
|
143
|
|
|
outputNTriple = String.format(URI_OBJECT_TRIPLE_FORMAT, outputSubject, outputPredicate, outputObject); |
|
144
|
|
|
} else { |
|
145
|
|
|
|
|
146
|
|
|
outputNTriple = String.format(LITERAL_OBJECT_TRIPLE_FORMAT, outputSubject, outputPredicate, outputObject); |
|
147
|
|
|
} |
|
148
|
|
|
|
|
149
|
|
|
// Set output row |
|
150
|
|
|
Object[] outputRow = meta.getInnerKeepInputFields() ? row : new Object[0]; |
|
151
|
|
|
|
|
152
|
|
|
outputRow = RowDataUtil.addValueData(outputRow, outputRow.length, outputNTriple); |
|
153
|
|
|
|
|
154
|
|
|
putRow(data.outputRowMeta, outputRow); |
|
155
|
|
|
|
|
156
|
|
|
return true; |
|
157
|
|
|
} |
|
158
|
|
|
} |
|
159
|
|
|
|