1
|
|
|
package br.ufrj.ppgi.greco.kettle; |
2
|
|
|
|
3
|
|
|
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import org.pentaho.di.core.exception.KettleException;
import org.pentaho.di.core.row.RowDataUtil;
import org.pentaho.di.core.row.RowMeta;
import org.pentaho.di.core.row.RowMetaInterface;
import org.pentaho.di.trans.Trans;
import org.pentaho.di.trans.TransMeta;
import org.pentaho.di.trans.step.BaseStep;
import org.pentaho.di.trans.step.StepDataInterface;
import org.pentaho.di.trans.step.StepInterface;
import org.pentaho.di.trans.step.StepMeta;
import org.pentaho.di.trans.step.StepMetaInterface;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
26
|
|
|
|
27
|
|
|
/** |
28
|
|
|
* Step Annotator. |
29
|
|
|
* <p /> |
30
|
|
|
* Gera sentenças RDF no formato N-Triple |
31
|
|
|
* |
32
|
|
|
* |
33
|
|
|
* @author Camila Carvalho Ferreira |
34
|
|
|
* |
35
|
|
|
*/ |
36
|
|
|
public class AnnotatorStep extends BaseStep implements StepInterface { |
37
|
|
|
// Constantes |
38
|
|
|
public static final String LITERAL_OBJECT_TRIPLE_FORMAT = "<%s> <%s> \"%s\"."; |
39
|
|
|
public static final String URI_OBJECT_TRIPLE_FORMAT = "<%s> <%s> <%s> ."; |
40
|
|
|
public static final String RDF_TYPE_URI = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"; |
41
|
|
|
|
42
|
|
|
public AnnotatorStep(StepMeta stepMeta, StepDataInterface stepDataInterface, int copyNr, TransMeta transMeta, |
43
|
|
|
Trans trans) { |
44
|
|
|
super(stepMeta, stepDataInterface, copyNr, transMeta, trans); |
45
|
|
|
} |
46
|
|
|
|
47
|
|
|
@Override |
48
|
|
|
public boolean init(StepMetaInterface smi, StepDataInterface sdi) { |
49
|
|
|
if (super.init(smi, sdi)) { |
50
|
|
|
return true; |
51
|
|
|
} else |
52
|
|
|
return false; |
53
|
|
|
} |
54
|
|
|
|
55
|
|
|
@Override |
56
|
|
|
public void dispose(StepMetaInterface smi, StepDataInterface sdi) { |
57
|
|
|
super.dispose(smi, sdi); |
58
|
|
|
} |
59
|
|
|
|
60
|
|
|
/** |
61
|
|
|
* Metodo chamado para cada linha que entra no step |
62
|
|
|
*/ |
63
|
|
|
public boolean processRow(StepMetaInterface smi, StepDataInterface sdi) throws KettleException { |
64
|
|
|
AnnotatorStepMeta meta = (AnnotatorStepMeta) smi; |
65
|
|
|
AnnotatorStepData data = (AnnotatorStepData) sdi; |
66
|
|
|
|
67
|
|
|
// Obtem linha do fluxo de entrada e termina caso nao haja mais entrada |
68
|
|
|
|
69
|
|
|
Object[] row = getRow(); |
70
|
|
|
|
71
|
|
|
if (row == null) { // Nao ha mais linhas de dados |
72
|
|
|
setOutputDone(); |
73
|
|
|
return false; |
74
|
|
|
} |
75
|
|
|
|
76
|
|
|
// Executa apenas uma vez. Variavel first definida na superclasse com |
77
|
|
|
// valor true |
78
|
|
|
if (first) { |
79
|
|
|
first = false; |
80
|
|
|
|
81
|
|
|
// Obtem todas as colunas ateh o step anterior. |
82
|
|
|
// Chamar apenas apos chamar getRow() |
83
|
|
|
RowMetaInterface rowMeta = getInputRowMeta(); |
84
|
|
|
data.outputRowMeta = meta.getInnerKeepInputFields() ? rowMeta.clone() : new RowMeta(); |
85
|
|
|
|
86
|
|
|
// Adiciona os metadados do step atual |
87
|
|
|
meta.getFields(data.outputRowMeta, getStepname(), null, null, this); |
88
|
|
|
} |
89
|
|
|
|
90
|
|
|
String outputNTriple; |
91
|
|
|
|
92
|
|
|
// Logica do step |
93
|
|
|
// Leitura de campos Input |
94
|
|
|
String inputSubject = getInputRowMeta().getString(row, meta.getInputSubject(), ""); |
95
|
|
|
String inputPredicate = getInputRowMeta().getString(row, meta.getInputPredicate(), ""); |
96
|
|
|
String inputObject = getInputRowMeta().getString(row, meta.getInputObject(), ""); |
97
|
|
|
String outputSubject = inputSubject; |
98
|
|
|
String outputPredicate = inputPredicate; |
99
|
|
|
String outputObject = inputObject; |
100
|
|
|
|
101
|
|
|
try { |
102
|
|
|
// abre arquivo xml |
103
|
|
|
DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance(); |
104
|
|
|
DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder(); |
105
|
|
|
Document doc = docBuilder.parse(new File(meta.getBrowseFilename())); |
106
|
|
|
NodeList listOfMaps = doc.getElementsByTagName("map"); |
107
|
|
|
int totalMaps = listOfMaps.getLength(); |
108
|
|
|
// procura em cada node map as regras de anota |
109
|
|
|
for (int i = 0; i < totalMaps; i++) { |
110
|
|
|
Node fromMapNode = listOfMaps.item(i); |
111
|
|
|
if (fromMapNode.getNodeType() == Node.ELEMENT_NODE) { |
112
|
|
|
Element fromMapElement = (Element) fromMapNode; |
113
|
|
|
NodeList fromList = fromMapElement.getElementsByTagName("from"); |
114
|
|
|
Element fromElement = (Element) fromList.item(0); |
115
|
|
|
NodeList textFList = fromElement.getChildNodes(); |
116
|
|
|
NodeList toList = fromMapElement.getElementsByTagName("to"); |
117
|
|
|
Element toElement = (Element) toList.item(0); |
118
|
|
|
NodeList textTList = toElement.getChildNodes(); |
119
|
|
|
if (((Node) textFList.item(0)).getNodeValue().trim().contains(inputSubject)) { |
120
|
|
|
outputSubject = ((Node) textTList.item(0)).getNodeValue().trim(); |
121
|
|
|
} |
122
|
|
|
if (((Node) textFList.item(0)).getNodeValue().trim().contains(inputPredicate)) { |
123
|
|
|
outputPredicate = ((Node) textTList.item(0)).getNodeValue().trim(); |
124
|
|
|
} |
125
|
|
|
if (((Node) textFList.item(0)).getNodeValue().trim().contains(inputObject)) { |
126
|
|
|
outputObject = ((Node) textTList.item(0)).getNodeValue().trim(); |
127
|
|
|
} |
128
|
|
|
} |
129
|
|
|
} |
130
|
|
|
|
131
|
|
|
} catch (ParserConfigurationException e) { |
132
|
|
|
// TODO Auto-generated catch block |
133
|
|
|
e.printStackTrace(); |
134
|
|
|
} catch (SAXException e) { |
135
|
|
|
// TODO Auto-generated catch block |
136
|
|
|
e.printStackTrace(); |
137
|
|
|
} catch (IOException e) { |
138
|
|
|
// TODO Auto-generated catch block |
139
|
|
|
e.printStackTrace(); |
140
|
|
|
} |
141
|
|
|
|
142
|
|
|
if (inputPredicate.equals(RDF_TYPE_URI)) { |
143
|
|
|
outputNTriple = String.format(URI_OBJECT_TRIPLE_FORMAT, outputSubject, outputPredicate, outputObject); |
144
|
|
|
} else { |
145
|
|
|
|
146
|
|
|
outputNTriple = String.format(LITERAL_OBJECT_TRIPLE_FORMAT, outputSubject, outputPredicate, outputObject); |
147
|
|
|
} |
148
|
|
|
|
149
|
|
|
// Set output row |
150
|
|
|
Object[] outputRow = meta.getInnerKeepInputFields() ? row : new Object[0]; |
151
|
|
|
|
152
|
|
|
outputRow = RowDataUtil.addValueData(outputRow, outputRow.length, outputNTriple); |
153
|
|
|
|
154
|
|
|
putRow(data.outputRowMeta, outputRow); |
155
|
|
|
|
156
|
|
|
return true; |
157
|
|
|
} |
158
|
|
|
} |
159
|
|
|
|