1
|
|
|
<?php |
2
|
|
|
/* |
3
|
|
|
homepage: http://arc.semsol.org/ |
4
|
|
|
license: http://arc.semsol.org/license |
5
|
|
|
|
6
|
|
|
class: ARC2 RDFa Extractor |
7
|
|
|
author: Benjamin Nowack |
8
|
|
|
version: 2009-05-29 (Fix: CURIEs support DOTs now) |
9
|
|
|
*/ |
10
|
|
|
|
11
|
|
|
ARC2::inc('RDFExtractor'); |
12
|
|
|
|
13
|
|
|
class ARC2_RdfaExtractor extends ARC2_RDFExtractor { |
14
|
|
|
|
15
|
|
|
/**
 * Creates the extractor by handing both arguments straight to the parent constructor.
 *
 * The signature used to read `$a = ''` in front of the required by-reference
 * $caller. A default in that position can never take effect (callers must supply
 * $caller, hence $a as well) and PHP 8 reports it as deprecated, so it is dropped.
 *
 * @param mixed $a      passed on unchanged to the parent constructor
 * @param mixed $caller received by reference and passed on to the parent constructor
 */
function __construct($a, &$caller) {
  parent::__construct($a, $caller);
}
18
|
|
|
|
19
|
|
|
/**
 * PHP 4-style constructor kept for backward compatibility; it only delegates to
 * __construct().
 *
 * As in __construct(), the unusable `$a = ''` default in front of the required
 * by-reference $caller has been removed (deprecated in PHP 8).
 *
 * @param mixed $a      passed on unchanged to __construct()
 * @param mixed $caller received by reference and passed on to __construct()
 */
function ARC2_RdfaExtractor($a, &$caller) {
  $this->__construct($a, $caller);
}
22
|
|
|
|
23
|
|
|
/* Initialisation hook: this class adds no set-up of its own and defers to the parent. */
function __init()
{
  parent::__init();
}
26
|
|
|
|
27
|
|
|
/* */ |
28
|
|
|
|
29
|
|
|
/**
 * Entry point of the extraction: builds the initial evaluation context and starts
 * processNode() on the root node.
 *
 * Returns 0 when the caller has no 'rdfa' entry in detected_formats; otherwise
 * there is no explicit return value.
 */
function extractRDF() {
  $rdfa_detected = isset($this->caller->detected_formats['rdfa']);
  if (!$rdfa_detected) {
    return 0;
  }
  $root = $this->getRootNode();
  /* the document base is used as is; an xml:base on the root node is not consulted */
  $doc_base = $this->getDocBase();
  $initial_ct = array(
    'base' => $doc_base,
    'p_s' => $doc_base,    /* parent subject starts out as the document base */
    'p_o' => '',
    'ns' => array(),
    'inco_ts' => array(),  /* no incomplete triples yet */
    'lang' => '',
  );
  $this->processNode($root, $initial_ct, 0);
}
45
|
|
|
|
46
|
|
|
/* */ |
47
|
|
|
|
48
|
|
|
/**
 * Picks the node processing starts from: the first entry of $this->nodes whose
 * 'tag' equals 'html', or $this->nodes[0] when there is none.
 */
function getRootNode() {
  foreach ($this->nodes as $candidate) {
    if ($candidate['tag'] != 'html') {
      continue;
    }
    return $candidate;
  }
  return $this->nodes[0];
}
56
|
|
|
|
57
|
|
|
/* */ |
58
|
|
|
|
59
|
|
|
/**
 * Runs the RDFa processing steps on a single node and recurses into its sub-nodes.
 *
 * Triples are emitted through addT(). Incomplete triples announced by the parent
 * context ($ct['inco_ts']) are completed against this node's subject.
 *
 * Change from the earlier version: the head/body test used the unanchored pattern
 * /(head|body)/i, which also matched thead, tbody, header etc. and reset their
 * subject to the document base. The pattern is now anchored to the element name.
 *
 * @param array $n     node; 'tag' and the attribute map 'a' are read here
 * @param array $ct    evaluation context handed down by the parent
 *                     (keys read: base, p_s, p_o, ns, inco_ts, lang, optionally prev_s)
 * @param int   $level nesting depth; only passed on (+1) to the recursive calls
 * @return int|null    null for cdata/comment nodes; 1 when this node or one of its
 *                     descendants produced triples or the node has a non-bnode
 *                     subject; 0 otherwise
 */
function processNode($n, $ct, $level) {
  if ($n['tag'] == 'cdata' || $n['tag'] == 'comment') return null; /* patch by tobyink */
  $rdf_type = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type';
  $xml_literal = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral';
  $ts_added = 0;

  /* step 1: local context */
  $lct = array();
  $lct['prev_s'] = $this->v('prev_s', $this->v('p_s', '', $ct), $ct);
  $lct['recurse'] = 1;
  $lct['skip'] = 0;
  $lct['new_s'] = '';
  $lct['cur_o_res'] = '';
  $lct['inco_ts'] = array();
  $lct['base'] = $ct['base']; /* xml:base on the node is not consulted */

  /* step 2: namespace declarations of this node extend/override the inherited ones */
  $lct['ns'] = array_merge($ct['ns'], $this->v('xmlns', array(), $n['a']));

  /* step 3: language */
  $lct['lang'] = $this->v('xml:lang', $ct['lang'], $n['a']);

  /* steps 4 + 5: new subject (and, with @rel/@rev, the current object resource) */
  $rel_uris = $this->getAttributeURIs($n, $ct, $lct, 'rel');
  $rev_uris = $this->getAttributeURIs($n, $ct, $lct, 'rev');
  $has_rel_rev = !empty($rel_uris) || !empty($rev_uris);
  /* without @rel/@rev, @resource and @href may set the subject too (step 4);
     with them they are reserved for the object (step 5) */
  $s_attrs = $has_rel_rev ? array('about', 'src') : array('about', 'src', 'resource', 'href');
  foreach ($s_attrs as $attr) {
    if (!isset($n['a'][$attr])) continue;
    $res = $this->xURI($n['a'][$attr], $lct['base'], $lct['ns'], '', $lct);
    if ($res[0]) {
      $lct['new_s'] = $res[0];
      break;
    }
  }
  if (!$lct['new_s']) {
    /* exactly <head>/<body>; an optional "prefix:" is tolerated in case tag names keep one */
    if (preg_match('/^(?:[^:]+:)?(?:head|body)$/i', $n['tag'])) {
      $lct['new_s'] = $lct['base'];
    }
    elseif ($this->getAttributeURIs($n, $ct, $lct, 'typeof')) {
      $lct['new_s'] = $this->createBnodeID();
    }
    elseif ($ct['p_o']) {
      $lct['new_s'] = $ct['p_o'];
      /* only the step-4 path (no @rel/@rev) may mark the element as skipped */
      if (!$has_rel_rev && !isset($n['a']['property'])) $lct['skip'] = 1; /* patch by masaka */
    }
  }
  if ($has_rel_rev) {
    foreach (array('resource', 'href') as $attr) {
      if (!isset($n['a'][$attr])) continue;
      $res = $this->xURI($n['a'][$attr], $lct['base'], $lct['ns'], '', $lct);
      if ($res[0]) {
        $lct['cur_o_res'] = $res[0];
        break;
      }
    }
  }
  /* the subject is final from here on; ids starting with "_:" are blank nodes */
  $new_s_type = preg_match('/^_:/', $lct['new_s']) ? 'bnode' : 'uri';

  if ($lct['new_s']) {
    /* step 6: rdf:type triples from @typeof */
    foreach ($this->getAttributeURIs($n, $ct, $lct, 'typeof') as $uri) {
      $this->addT(array(
        's' => $lct['new_s'],
        's_type' => $new_s_type,
        'p' => $rdf_type,
        'o' => $uri,
        'o_type' => 'uri',
        'o_lang' => '',
        'o_datatype' => '',
      ));
      $ts_added = 1;
    }
    /* step 7: @rel/@rev triples against an explicit object resource */
    if ($lct['cur_o_res']) {
      $o_res_type = preg_match('/^_:/', $lct['cur_o_res']) ? 'bnode' : 'uri';
      foreach ($rel_uris as $uri) {
        $this->addT(array(
          's' => $lct['new_s'],
          's_type' => $new_s_type,
          'p' => $uri,
          'o' => $lct['cur_o_res'],
          'o_type' => $o_res_type,
          'o_lang' => '',
          'o_datatype' => '',
        ));
        $ts_added = 1;
      }
      foreach ($rev_uris as $uri) {
        $this->addT(array(
          's' => $lct['cur_o_res'],
          's_type' => $o_res_type,
          'p' => $uri,
          'o' => $lct['new_s'],
          'o_type' => $new_s_type,
          'o_lang' => '',
          'o_datatype' => '',
        ));
        $ts_added = 1;
      }
    }
  }

  /* step 8: @rel/@rev without an object resource -> incomplete triples for the children */
  if (!$lct['cur_o_res'] && $has_rel_rev) {
    $lct['cur_o_res'] = $this->createBnodeID();
    foreach ($rel_uris as $uri) {
      $lct['inco_ts'][] = array('p' => $uri, 'dir' => 'fwd');
    }
    foreach ($rev_uris as $uri) {
      $lct['inco_ts'][] = array('p' => $uri, 'dir' => 'rev');
    }
  }

  /* step 10: literal triples from @property */
  if (!$lct['skip'] && $lct['new_s']) {
    foreach ($this->getAttributeURIs($n, $ct, $lct, 'property') as $uri) {
      $lct['cur_o_lit'] = $this->getCurrentObjectLiteral($n, $lct, $ct);
      $this->addT(array(
        's' => $lct['new_s'],
        's_type' => $new_s_type,
        'p' => $uri,
        'o' => $lct['cur_o_lit']['value'],
        'o_type' => 'literal',
        'o_lang' => $lct['cur_o_lit']['lang'],
        'o_datatype' => $lct['cur_o_lit']['datatype'],
      ));
      $ts_added = 1;
      /* the markup of an XML literal is the value itself, so it is not walked again */
      if ($lct['cur_o_lit']['datatype'] == $xml_literal) {
        $lct['recurse'] = 0;
      }
    }
  }

  /* step 11 (10): recurse into the sub-nodes */
  $complete_triples = 0;
  if ($lct['recurse']) {
    if ($lct['skip']) {
      /* a skipped element passes the parent context through, updating only base/lang/ns */
      $new_ct = array_merge($ct, array('base' => $lct['base'], 'lang' => $lct['lang'], 'ns' => $lct['ns']));
    }
    else {
      $inherited_s = $lct['new_s'] ? $lct['new_s'] : $ct['p_s'];
      $new_ct = array(
        'base' => $lct['base'],
        'p_s' => $inherited_s,
        'p_o' => $lct['cur_o_res'] ? $lct['cur_o_res'] : $inherited_s,
        'ns' => $lct['ns'],
        'inco_ts' => $lct['inco_ts'],
        'lang' => $lct['lang']
      );
    }
    foreach ($this->getSubNodes($n) as $sub_node) {
      if ($this->processNode($sub_node, $new_ct, $level + 1)) {
        $complete_triples = 1;
      }
    }
  }

  /* step 12 (11): complete the parent's incomplete triples against this node's subject */
  $has_result = $ts_added || $complete_triples || ($lct['new_s'] && $new_s_type == 'uri');
  if ($has_result) {
    $p_s_type = preg_match('/^_:/', $ct['p_s']) ? 'bnode' : 'uri';
    foreach ($ct['inco_ts'] as $inco_t) {
      if ($inco_t['dir'] == 'fwd') {
        $this->addT(array(
          's' => $ct['p_s'],
          's_type' => $p_s_type,
          'p' => $inco_t['p'],
          'o' => $lct['new_s'],
          'o_type' => $new_s_type,
          'o_lang' => '',
          'o_datatype' => '',
        ));
      }
      elseif ($inco_t['dir'] == 'rev') {
        $this->addT(array(
          's' => $lct['new_s'],
          's_type' => $new_s_type,
          'p' => $inco_t['p'],
          'o' => $ct['p_s'],
          'o_type' => $p_s_type,
          'o_lang' => '',
          'o_datatype' => '',
        ));
      }
    }
  }

  /* step 13 (12): result flag for the parent */
  return $has_result ? 1 : 0;
}
266
|
|
|
|
267
|
|
|
/* */ |
268
|
|
|
|
269
|
|
|
/**
 * Resolves the whitespace-separated entries of attribute $attr on node $n to URIs.
 *
 * The value used to be split with explode(' ', ...), so entries separated by a
 * tab or a line break were treated as one token and all but the first were lost.
 * It is now split on any run of whitespace.
 *
 * @param array  $n    node; its attribute map 'a' is read
 * @param array  $ct   parent context (not used here, kept for the call signature)
 * @param array  $lct  local context; 'base' and 'ns' are used for resolution and the
 *                     whole array is passed on to xURI()
 * @param string $attr attribute name, e.g. 'rel', 'rev', 'typeof', 'property'
 * @return array       list of the URIs xURI() could resolve (possibly empty)
 */
function getAttributeURIs($n, $ct, $lct, $attr) {
  $val = $this->v($attr, '', $n['a']);
  if (!$val) return array();
  $r = array();
  foreach (preg_split('/\s+/', $val, -1, PREG_SPLIT_NO_EMPTY) as $token) {
    if (!$token) continue; /* keeps the earlier behaviour of ignoring a bare "0" entry */
    $res = $this->xURI($token, $lct['base'], $lct['ns'], $attr, $lct);
    if ($res[0]) {
      $r[] = $res[0];
    }
  }
  return $r;
}
280
|
|
|
|
281
|
|
|
/* */ |
282
|
|
|
|
283
|
|
|
/**
 * Builds the literal object (value, lang, datatype) for a @property on node $n.
 *
 * Value selection, first match wins:
 *  1. @content, entity-decoded when html_entity_decode() is available
 *  2. the plain content, when it equals the markup content
 *  3. the markup content, when it contains neither "<" nor ">"
 *  4. the plain content, when @datatype is present and is not rdf:XMLLiteral
 *  5. otherwise the markup content with declarations injected, typed rdf:XMLLiteral
 *
 * @param array $n   node; its attribute map 'a' is read
 * @param array $lct local context ('base', 'ns', 'lang' are read; also passed to xURI())
 * @param array $ct  parent context (not used here)
 * @return array     array('value' => ..., 'lang' => ..., 'datatype' => ...)
 */
function getCurrentObjectLiteral($n, $lct, $ct) {
  $xml_literal = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral';
  $markup = $this->getContent($n);
  $text = $this->getPlainContent($n, 0, 0);
  $can_decode = function_exists('html_entity_decode');
  if ($can_decode) {
    $text = html_entity_decode($text, ENT_QUOTES);
  }
  /* @datatype is resolved like any other URI/CURIE; an empty value stays empty */
  $dt = $this->v('datatype', '', $n['a']);
  $resolved = $this->xURI($dt, $lct['base'], $lct['ns'], '', $lct);
  if ($dt) {
    $dt = $resolved[0];
  }
  $literal = array('value' => '', 'lang' => $lct['lang'], 'datatype' => $dt);
  $has_dt_attr = isset($n['a']['datatype']);

  if (isset($n['a']['content'])) {
    $literal['value'] = $can_decode ? html_entity_decode($n['a']['content'], ENT_QUOTES) : $n['a']['content'];
    return $literal;
  }
  if ($markup == $text) {
    $literal['value'] = $text;
    return $literal;
  }
  if (!preg_match('/[\<\>]/', $markup)) {
    $literal['value'] = $markup;
    return $literal;
  }
  if ($has_dt_attr && ($dt != $xml_literal)) {
    $literal['value'] = $text;
    return $literal;
  }
  /* remaining case: no @datatype at all, or an explicit rdf:XMLLiteral */
  $literal['value'] = $this->injectXMLDeclarations($markup, $lct['ns'], $lct['lang']);
  $literal['datatype'] = $xml_literal;
  return $literal;
}
314
|
|
|
|
315
|
|
|
/**
 * Adds namespace and language declarations to the markup of an XML literal.
 *
 * - every start tag with an unprefixed alphanumeric name gets the XHTML default
 *   namespace and, when $lang is set, an xml:lang attribute
 * - for each entry of $ns, the first "<prefix:name" start tag gets the matching
 *   xmlns:prefix declaration (plus xml:lang)
 * - where a tag then carries two xml:lang or two xmlns= attributes, the earlier
 *   one in the tag is removed again
 *
 * Change from the earlier version: the namespace URI and the language used to be
 * concatenated into preg_replace() replacement strings, where "$1" or "\1" inside
 * them was expanded as a back-reference, and the prefix was put into the pattern
 * unquoted. The declaration is now spliced in literally and the prefix is quoted.
 *
 * @param string $val  markup to decorate
 * @param array  $ns   prefix => namespace URI map; empty prefixes are ignored
 * @param string $lang language code, or '' for none
 * @return string      decorated markup
 */
function injectXMLDeclarations($val, $ns, $lang) { /* @@todo proper node rebuilding */
  $lang_code = $lang ? ' xml:lang="' . $lang . '"' : '';
  /* "\" and "$" start back-references in a preg replacement string; escape them */
  $lang_repl = strtr($lang_code, array('\\' => '\\\\', '$' => '\\$'));
  /* default namespace (+ lang) on unprefixed start tags */
  $val = preg_replace('/<([a-z0-9]+)([\>\s])/is', '<\\1 xmlns="http://www.w3.org/1999/xhtml"' . $lang_repl . '\\2', $val);
  /* prefixed start tags: declare the prefix on its first occurrence */
  foreach ($ns as $prefix => $uri) {
    if (!$prefix) continue;
    $pos = strpos($val, '<' . $prefix . ':');
    if ($pos === false) continue;
    $tail = substr($val, $pos);
    /* the tag needs at least one name character after "prefix:" */
    if (!preg_match('/^<' . preg_quote($prefix, '/') . ':[^>\s]+/', $tail, $m)) continue;
    $decl = ' xmlns:' . $prefix . '="' . $uri . '"' . $lang_code;
    $val = substr($val, 0, $pos) . $m[0] . $decl . substr($tail, strlen($m[0]));
  }
  /* remove accidentally added xml:lang and xmlns= */
  $val = preg_replace('/(\<[^\>]*)( xml\:lang[^\s\>]+)([^\>]*)(xml\:lang[^\s\>]+)/s', '\\1\\3\\4', $val);
  $val = preg_replace('/(\<[^\>]*)( xmlns=[^\s\>]+)([^\>]*)(xmlns=[^\s\>]+)/s', '\\1\\3\\4', $val);
  return $val;
}
329
|
|
|
|
330
|
|
|
/* */ |
331
|
|
|
|
332
|
|
|
/**
 * Resolves an attribute value to a URI or blank node id.
 *
 * Tried in this order: blank-node CURIE, safe CURIE ("[...]"), plain CURIE, then
 * (for @rel/@rev only) the reserved XHTML link keywords, and finally resolution
 * of the value against $base via calcURI().
 *
 * Change from the earlier version: "first" and "top" were missing from the
 * reserved keyword list although RDFa 1.0 lists them for @rel/@rev, so such
 * links were dropped; both are now recognised.
 *
 * @param string $v         attribute value
 * @param string $base      base URI used for the final fallback
 * @param array  $ns        prefix => namespace URI map
 * @param string $attr_type attribute name; 'rel'/'rev' enable the keyword handling
 * @param mixed  $lct       local context, passed on to xSafeCURIE()
 * @return array            array(result, rest); result is 0 for an unprefixed,
 *                          non-reserved @rel/@rev token, which callers skip
 */
function xURI($v, $base, $ns, $attr_type = '', $lct = '') {
  $res = $this->xBlankCURIE($v, $base, $ns);
  if ($res[0]) {
    return array($res[0], $res[1]);
  }
  $res = $this->xSafeCURIE($v, $base, $ns, $lct);
  if ($res[0]) {
    return array($res[0], $res[1]);
  }
  $res = $this->xCURIE($v, $base, $ns);
  if ($res[0]) {
    return array($res[0], $res[1]);
  }
  if (preg_match('/^(rel|rev)$/', $attr_type)) {
    /* reserved link types map into the XHTML vocabulary namespace */
    if (preg_match('/^\s*(alternate|appendix|bookmark|cite|chapter|contents|copyright|first|glossary|help|icon|index|last|license|meta|next|p3pv1|prev|role|section|stylesheet|subsection|start|top|up)(\s|$)/is', $v, $m)) {
      return array('http://www.w3.org/1999/xhtml/vocab#' . strtolower($m[1]), preg_replace('/^\s*' . $m[1]. '/is', '', $v));
    }
    /* any other bare token in @rel/@rev is not a link type and yields no URI */
    if (preg_match('/^[a-z0-9\.]+$/i', $v)) {
      return array(0, $v);
    }
  }
  return array($this->calcURI($v, $base), '');
}
350
|
|
|
|
351
|
|
|
/**
 * Recognises blank-node CURIEs.
 *
 * "[_:]" yields one shared blank node id, created on first use and cached in
 * $this->empty_bnode. "_:name" (optionally bracketed) yields that id.
 * Anything else yields array(0, $v).
 *
 * $base and $ns are not used here.
 */
function xBlankCURIE($v, $base, $ns) {
  if ($this->x('\[\_\:\]', $v)) {
    if (!isset($this->empty_bnode)) {
      $this->empty_bnode = $this->createBnodeID();
    }
    return array($this->empty_bnode, '');
  }
  $m = $this->x('\[?(\_\:[a-z0-9\_\-]+)\]?', $v);
  if (!$m) {
    return array(0, $v);
  }
  return array($m[1], '');
}
361
|
|
|
|
362
|
|
|
/**
 * Recognises safe CURIEs, i.e. values written in square brackets.
 *
 * "[]" yields $lct['prev_s'] when a local context is given, else $base.
 * "[:ref]" yields the XHTML vocabulary namespace plus ref.
 * "[prefix:ref]" yields the namespace registered for prefix in $ns plus ref.
 * Anything else, including an unknown prefix, yields array(0, $v).
 */
function xSafeCURIE($v, $base, $ns, $lct = '') {
  /* empty */
  $m = $this->x('\[\]', $v);
  if ($m) {
    $subject = $lct ? $lct['prev_s'] : $base; /* should be current subject value */
    return array($subject, $m[1] ? $m[1] : '');
  }
  $m = $this->x('\[([^\:]*)\:([^\]]*)\]', $v);
  if ($m) {
    $prefix = $m[1];
    if (!$prefix) {
      return array('http://www.w3.org/1999/xhtml/vocab#' . $m[2], '');
    }
    if (isset($ns[$prefix])) {
      return array($ns[$prefix] . $m[2], '');
    }
  }
  return array(0, $v);
}
376
|
|
|
|
377
|
|
|
/**
 * Recognises plain (unbracketed) CURIEs of the form "prefix:reference".
 *
 * An empty prefix maps into the XHTML vocabulary namespace; a prefix found in $ns
 * maps into that namespace. Anything else yields array(0, $v).
 *
 * $base is not used here.
 */
function xCURIE($v, $base, $ns) {
  $m = $this->x('([a-z0-9\-\_]*)\:([^\s]+)', $v);
  if ($m) {
    $prefix = $m[1];
    if (!$prefix) {
      return array('http://www.w3.org/1999/xhtml/vocab#' . $m[2], '');
    }
    if (isset($ns[$prefix])) {
      return array($ns[$prefix] . $m[2], '');
    }
  }
  return array(0, $v);
}
386
|
|
|
|
387
|
|
|
/* */ |
388
|
|
|
|
389
|
|
|
} |
390
|
|
|
|
This check flags implicit conversions of arrays to boolean values in a condition. In PHP an empty array is equal (but not identical) to false, which is not always apparent to the reader.
Consider making the check explicit by using empty(...) or !empty(...) instead.