1
|
|
|
<?php |
2
|
|
|
/* |
3
|
|
|
* Project: MagpieRSS: a simple RSS integration tool |
4
|
|
|
* File: rss_fetch.inc, a simple functional interface |
5
|
|
|
to fetching and parsing RSS files, via the |
6
|
|
|
function fetch_rss() |
7
|
|
|
* Author: Kellan Elliott-McCrea <[email protected]> |
8
|
|
|
* License: GPL |
9
|
|
|
* |
10
|
|
|
* The latest version of MagpieRSS can be obtained from: |
11
|
|
|
* http://magpierss.sourceforge.net |
12
|
|
|
* |
13
|
|
|
* For questions, help, comments, discussion, etc., please join the |
14
|
|
|
* Magpie mailing list: |
15
|
|
|
* magpierss-general@lists.sourceforge.net |
16
|
|
|
* |
17
|
|
|
*/ |
18
|
|
|
|
19
|
|
|
// Setup MAGPIE_DIR for use on hosts that don't include |
20
|
|
|
// the current path in include_path. |
21
|
|
|
// with thanks to rajiv and smarty |
22
|
|
|
// Short alias for the platform directory separator; a host script may
// pre-define DIR_SEP, in which case its value is kept.
defined('DIR_SEP') or define('DIR_SEP', DIRECTORY_SEPARATOR);

// Directory that holds this file, with a trailing separator, so the includes
// below work on hosts whose include_path lacks the current directory.
defined('MAGPIE_DIR') or define('MAGPIE_DIR', dirname(__FILE__) . DIR_SEP);

// Cache directory below the MODX base path. MODX_BASE_PATH is not defined in
// this file and must already exist when this line runs.
defined('MAGPIE_CACHE_DIR') or define('MAGPIE_CACHE_DIR', MODX_BASE_PATH . 'assets/cache/rss');

// Parser and cache implementations that fetch_rss() relies on.
require_once( MAGPIE_DIR . 'rss_parse.inc' );
require_once( MAGPIE_DIR . 'rss_cache.inc' );

// for including 3rd party libraries
define('MAGPIE_EXTLIB', MAGPIE_DIR . 'extlib' . DIR_SEP);
require_once( MAGPIE_EXTLIB . 'Snoopy.class.inc');
40
|
|
|
|
41
|
|
|
|
42
|
|
|
/* |
43
|
|
|
* CONSTANTS - redefine these in your script to change the |
44
|
|
|
* behaviour of fetch_rss(). Currently, most options affect the cache |
45
|
|
|
* |
46
|
|
|
* MAGPIE_CACHE_ON - Should Magpie cache parsed RSS objects? |
47
|
|
|
* For me a built in cache was essential to creating a "PHP-like" |
48
|
|
|
* feel to Magpie, see rss_cache.inc for rationale |
49
|
|
|
* |
50
|
|
|
* |
51
|
|
|
* MAGPIE_CACHE_DIR - Where should Magpie cache parsed RSS objects? |
52
|
|
|
* This should be a location that the webserver can write to. If this |
53
|
|
|
* directory does not already exist Magpie will try to be smart and create |
54
|
|
|
* it. This will often fail for permissions reasons. |
55
|
|
|
* |
56
|
|
|
* |
57
|
|
|
* MAGPIE_CACHE_AGE - How long to store cached RSS objects? In seconds. |
58
|
|
|
* |
59
|
|
|
* |
60
|
|
|
* MAGPIE_CACHE_FRESH_ONLY - If remote fetch fails, throw error |
61
|
|
|
* instead of returning stale object? |
62
|
|
|
* |
63
|
|
|
* MAGPIE_DEBUG - Display debugging notices? |
64
|
|
|
* |
65
|
|
|
*/ |
66
|
|
|
|
67
|
|
|
|
68
|
|
|
/*=======================================================================*\ |
69
|
|
|
Function: fetch_rss: |
70
|
|
|
Purpose: return RSS object for the given url |
71
|
|
|
maintain the cache |
72
|
|
|
Input: url of RSS file |
73
|
|
|
Output: parsed RSS object (see rss_parse.inc) |
74
|
|
|
|
75
|
|
|
NOTES ON CACHEING: |
76
|
|
|
If caching is on (MAGPIE_CACHE_ON) fetch_rss will first check the cache. |
77
|
|
|
|
78
|
|
|
NOTES ON RETRIEVING REMOTE FILES: |
79
|
|
|
If conditional gets are on (MAGPIE_CONDITIONAL_GET_ON) fetch_rss will |
80
|
|
|
return a cached object, and touch the cache object upon receiving a |
81
|
|
|
304. |
82
|
|
|
|
83
|
|
|
NOTES ON FAILED REQUESTS: |
84
|
|
|
If there is an HTTP error while fetching an RSS object, the cached |
85
|
|
|
version will be returned, if it exists (and if MAGPIE_CACHE_FRESH_ONLY is off) |
86
|
|
|
\*=======================================================================*/ |
87
|
|
|
|
88
|
|
|
// Library version string; init() embeds it in the default User-Agent.
define('MAGPIE_VERSION', '0.72');

// Most recent error message recorded by error(); read it via magpie_error().
$MAGPIE_ERROR = "";
91
|
|
|
|
92
|
|
|
function fetch_rss ($url) {
    // Input:  $url - address of the RSS file to fetch.
    // Output: the parsed RSS object, or false when nothing could be returned
    //         (the reason is then available through magpie_error()).

    // Give every MAGPIE_* constant read below a value.
    init();

    if ( !isset($url) ) {
        error("fetch_rss called without a url");
        return false;
    }

    // Cache disabled: every call goes to the remote server.
    if ( !MAGPIE_CACHE_ON ) {
        $resp = _fetch_remote_file( $url );
        if ( is_success( $resp->status ) ) {
            return _response_to_rss( $resp );
        }

        error("Failed to fetch $url and cache is off");
        return false;
    }

    // Cache enabled. Flow:
    // 1. check cache
    // 2. if there is a hit, make sure its fresh
    // 3. if cached obj fails freshness check, fetch remote
    // 4. if remote fails, return stale object, or error
    $cache = new RSSCache( MAGPIE_CACHE_DIR, MAGPIE_CACHE_AGE );

    if ( MAGPIE_DEBUG and $cache->ERROR ) {
        debug($cache->ERROR, E_USER_WARNING);
    }

    $cache_status    = 0;        // response of check_cache
    $request_headers = array();  // HTTP headers to send with fetch
    $rss             = 0;        // parsed RSS object
    $errormsg        = 0;        // errors, if any

    // store parsed XML by desired output encoding
    // as character munging happens at parse time
    $cache_key = $url . MAGPIE_OUTPUT_ENCODING;

    if ( !$cache->ERROR ) {
        // return cache HIT, MISS, or STALE
        $cache_status = $cache->check_cache( $cache_key );
    }

    // Strict comparison on purpose: with the initial integer 0, a loose
    // 0 == 'HIT' is true on PHP < 8 and would treat a broken cache as a hit.
    if ( $cache_status === 'HIT' ) {
        $rss = $cache->get( $cache_key );
        if ( isset($rss) and $rss ) {
            // should be cache age
            $rss->from_cache = 1;
            if ( MAGPIE_DEBUG > 1 ) {
                debug("MagpieRSS: Cache HIT", E_USER_NOTICE);
            }
            return $rss;
        }
    }

    // Otherwise attempt a conditional GET using the validators stored with
    // the stale copy, so the server can answer 304 instead of resending.
    if ( $cache_status === 'STALE' ) {
        $rss = $cache->get( $cache_key );
        if ( $rss and $rss->etag and $rss->last_modified ) {
            $request_headers['If-None-Match'] = $rss->etag;
            // The request header that pairs with Last-Modified is
            // If-Modified-Since; "If-Last-Modified" is not an HTTP header.
            $request_headers['If-Modified-Since'] = $rss->last_modified;
        }
    }

    $resp = _fetch_remote_file( $url, $request_headers );

    if ( isset($resp) and $resp ) {
        if ( $resp->status == '304' ) {
            // we have the most current copy
            if ( MAGPIE_DEBUG > 1 ) {
                debug("Got 304 for $url");
            }
            // reset cache on 304 (at minutillo insistent prodding)
            $cache->set($cache_key, $rss);
            return $rss;
        }
        elseif ( is_success( $resp->status ) ) {
            $rss = _response_to_rss( $resp );
            if ( $rss ) {
                if ( MAGPIE_DEBUG > 1 ) {
                    debug("Fetch successful");
                }
                // add object to cache
                $cache->set( $cache_key, $rss );
                return $rss;
            }
        }
        else {
            $errormsg = "Failed to fetch $url ";
            if ( $resp->status == '-100' ) {
                $errormsg .= "(Request timed out after " . MAGPIE_FETCH_TIME_OUT . " seconds)";
            }
            elseif ( $resp->error ) {
                # compensate for Snoopy's annoying habit of tacking
                # on '\n'
                $http_error = substr($resp->error, 0, -2);
                $errormsg .= "(HTTP Error: $http_error)";
            }
            else {
                $errormsg .= "(HTTP Response: " . $resp->response_code .')';
            }
        }
    }
    else {
        $errormsg = "Unable to retrieve RSS file for unknown reasons.";
    }

    // The fetch failed. Fall back to the stale cached object unless the
    // caller asked for fresh data only (MAGPIE_CACHE_FRESH_ONLY).
    if ( $rss and !MAGPIE_CACHE_FRESH_ONLY ) {
        if ( MAGPIE_DEBUG ) {
            debug("Returning STALE object for $url");
        }
        return $rss;
    }

    // else we totally failed
    error( $errormsg );

    return false;
} // end fetch_rss()
226
|
|
|
|
227
|
|
|
/*=======================================================================*\ |
228
|
|
|
Function: error |
229
|
|
|
Purpose: set MAGPIE_ERROR, and trigger error |
230
|
|
|
\*=======================================================================*/ |
231
|
|
|
|
232
|
|
|
function error ($errormsg, $lvl=E_USER_WARNING) {
    // Records the message in $MAGPIE_ERROR and raises it as a PHP error of
    // level $lvl. An empty message is ignored and leaves $MAGPIE_ERROR alone.
    global $MAGPIE_ERROR;

    // append PHP's error message if track_errors enabled
    // NOTE(review): $php_errormsg would have to be set inside this function's
    // own scope for this branch to run - confirm it is still reachable.
    if ( isset($php_errormsg) ) {
        $errormsg = $errormsg . " ($php_errormsg)";
    }

    if ( !$errormsg ) {
        return;
    }

    $MAGPIE_ERROR = "MagpieRSS: $errormsg";
    trigger_error( $MAGPIE_ERROR, $lvl );
}
245
|
|
|
|
246
|
|
|
function debug ($debugmsg, $lvl=E_USER_NOTICE) {
    // Raises a debug message as a PHP error of level $lvl, tagged so it can
    // be told apart from real MagpieRSS errors.
    $tagged = "MagpieRSS [debug] " . $debugmsg;
    trigger_error($tagged, $lvl);
}
249
|
|
|
|
250
|
|
|
/*=======================================================================*\ |
251
|
|
|
Function: magpie_error |
252
|
|
|
Purpose: accessor for the magpie error variable |
253
|
|
|
\*=======================================================================*/ |
254
|
|
|
function magpie_error ($errormsg="") {
    // Accessor for $MAGPIE_ERROR: a non-empty argument replaces the stored
    // message first; the stored message is returned either way.
    global $MAGPIE_ERROR;

    if ( !empty($errormsg) ) {
        $MAGPIE_ERROR = $errormsg;
    }

    return $MAGPIE_ERROR;
}
263
|
|
|
|
264
|
|
|
/*=======================================================================*\ |
265
|
|
|
Function: _fetch_remote_file |
266
|
|
|
Purpose: retrieve an arbitrary remote file |
267
|
|
|
Input: url of the remote file |
268
|
|
|
headers to send along with the request (optional) |
269
|
|
|
Output: an HTTP response object (see Snoopy.class.inc) |
270
|
|
|
\*=======================================================================*/ |
271
|
|
|
function _fetch_remote_file ($url, $headers = "" ) {
    // Fetches $url with a Snoopy HTTP client configured from the MAGPIE_*
    // constants and returns that client; callers read the outcome from it.
    $http = new Snoopy();
    $http->agent        = MAGPIE_USER_AGENT;
    $http->read_timeout = MAGPIE_FETCH_TIME_OUT;
    $http->use_gzip     = MAGPIE_USE_GZIP;

    // Extra request headers are forwarded only when given as an array; the
    // "" default therefore means "send none".
    if ( is_array($headers) ) {
        $http->rawheaders = $headers;
    }

    // PHP diagnostics raised by the transfer are suppressed here.
    @$http->fetch($url);

    return $http;
}
285
|
|
|
|
286
|
|
|
/*=======================================================================*\ |
287
|
|
|
Function: _response_to_rss |
288
|
|
|
Purpose: parse an HTTP response object into an RSS object |
289
|
|
|
Input: an HTTP response object (see Snoopy) |
290
|
|
|
Output: parsed RSS object (see rss_parse) |
291
|
|
|
\*=======================================================================*/ |
292
|
|
|
function _response_to_rss ($resp) {
    // Input:  $resp - HTTP response object; ->results holds the body and
    //         ->headers the raw response header lines.
    // Output: the parsed RSS object with ->etag / ->last_modified copied from
    //         the response headers when present, or false on a parse failure
    //         (the failure is reported through error()).
    $rss = new MagpieRSS( $resp->results, MAGPIE_OUTPUT_ENCODING, MAGPIE_INPUT_ENCODING, MAGPIE_DETECT_ENCODING );

    // Parse failed: build the error message and bail out.
    if ( !$rss or $rss->ERROR ) {
        $errormsg = "Failed to parse RSS file.";

        if ( $rss ) {
            $errormsg .= " (" . $rss->ERROR . ")";
        }
        error($errormsg);

        return false;
    }

    // Find ETag and Last-Modified so a later request can be conditional.
    $header_lines = is_array($resp->headers) ? $resp->headers : array();
    foreach ( $header_lines as $h ) {
        // Lines without a colon (e.g. the status line) carry no field/value
        // pair; skipping them also avoids the "Undefined offset: 1" notice.
        if ( strpos($h, ':') === false ) {
            continue;
        }

        list($field, $val) = explode(':', $h, 2);

        // Field names are case-insensitive in HTTP and the space after the
        // colon is optional, so normalise before comparing. The value is
        // trimmed so no stray whitespace or line terminator is replayed
        // into the next request's headers.
        $field = strtolower(trim($field));
        $val   = trim($val);

        if ( $field === 'etag' ) {
            $rss->etag = $val;
        }
        elseif ( $field === 'last-modified' ) {
            $rss->last_modified = $val;
        }
    }

    return $rss;
}
331
|
|
|
|
332
|
|
|
/*=======================================================================*\ |
333
|
|
|
Function: init |
334
|
|
|
Purpose: setup constants with default values |
335
|
|
|
check for user overrides |
336
|
|
|
\*=======================================================================*/ |
337
|
|
|
function init () {
    // Defines every MAGPIE_* constant the host script has not already
    // defined. Runs its body once per request; later calls return at once.
    if ( defined('MAGPIE_INITALIZED') ) {
        return;
    }
    define('MAGPIE_INITALIZED', true);

    // Constants whose default does not depend on anything else.
    $defaults = array(
        'MAGPIE_CACHE_ON'         => true,
        'MAGPIE_CACHE_DIR'        => './cache',
        'MAGPIE_CACHE_AGE'        => 60*60,   // one hour
        'MAGPIE_CACHE_FRESH_ONLY' => false,
        'MAGPIE_INPUT_ENCODING'   => null,
        'MAGPIE_DETECT_ENCODING'  => true,
        'MAGPIE_DEBUG'            => 0,
        'MAGPIE_FETCH_TIME_OUT'   => 5,       // 5 second timeout
        'MAGPIE_USE_GZIP'         => true,    // use gzip encoding to fetch rss files if supported?
    );
    foreach ( $defaults as $constant => $value ) {
        if ( !defined($constant) ) {
            define($constant, $value);
        }
    }

    // Output encoding follows the MODX manager charset; when that global is
    // empty it is set to ISO-8859-1 (the global itself is updated as well).
    if ( !defined('MAGPIE_OUTPUT_ENCODING') ) {
        global $modx_manager_charset;
        if ( empty($modx_manager_charset) ) {
            $modx_manager_charset = 'ISO-8859-1';
        }
        define('MAGPIE_OUTPUT_ENCODING', $modx_manager_charset);
    }

    // The default User-Agent advertises the version and whether caching is on.
    if ( !defined('MAGPIE_USER_AGENT') ) {
        $closing = MAGPIE_CACHE_ON ? ')' : '; No cache)';
        define('MAGPIE_USER_AGENT', 'MagpieRSS/'. MAGPIE_VERSION . ' (+http://magpierss.sf.net' . $closing);
    }
}
401
|
|
|
|
402
|
|
|
// NOTE: the following code should really be in Snoopy, or at least |
403
|
|
|
// somewhere other than rss_fetch! |
404
|
|
|
|
405
|
|
|
/*=======================================================================*\ |
406
|
|
|
HTTP STATUS CODE PREDICATES |
407
|
|
|
These functions attempt to classify an HTTP status code |
408
|
|
|
based on RFC 2616 and RFC 2518. |
409
|
|
|
|
410
|
|
|
All of them take an HTTP status code as input, and return true or false |
411
|
|
|
|
412
|
|
|
All this code is adapted from LWP's HTTP::Status. |
413
|
|
|
\*=======================================================================*/ |
414
|
|
|
|
415
|
|
|
|
416
|
|
|
/*=======================================================================*\ |
417
|
|
|
Function: is_info |
418
|
|
|
Purpose: return true if Informational status code |
419
|
|
|
\*=======================================================================*/ |
420
|
|
|
function is_info ($sc) {
    // Informational status codes: 100 up to, but not including, 200.
    return (100 <= $sc) && (200 > $sc);
}
423
|
|
|
|
424
|
|
|
/*=======================================================================*\ |
425
|
|
|
Function: is_success |
426
|
|
|
Purpose: return true if Successful status code |
427
|
|
|
\*=======================================================================*/ |
428
|
|
|
function is_success ($sc) {
    // Successful status codes: 200 up to, but not including, 300.
    return (200 <= $sc) && (300 > $sc);
}
431
|
|
|
|
432
|
|
|
/*=======================================================================*\ |
433
|
|
|
Function: is_redirect |
434
|
|
|
Purpose: return true if Redirection status code |
435
|
|
|
\*=======================================================================*/ |
436
|
|
|
function is_redirect ($sc) {
    // Redirection status codes: 300 up to, but not including, 400.
    return (300 <= $sc) && (400 > $sc);
}
439
|
|
|
|
440
|
|
|
/*=======================================================================*\ |
441
|
|
|
Function: is_error |
442
|
|
|
Purpose: return true if Error status code |
443
|
|
|
\*=======================================================================*/ |
444
|
|
|
function is_error ($sc) {
    // Error status codes, client or server: 400 up to, but not including, 600.
    return (400 <= $sc) && (600 > $sc);
}
447
|
|
|
|
448
|
|
|
/*=======================================================================*\ |
449
|
|
|
Function: is_client_error |
450
|
|
|
Purpose: return true if Error status code, and its a client error |
451
|
|
|
\*=======================================================================*/ |
452
|
|
|
function is_client_error ($sc) {
    // Client error status codes: 400 up to, but not including, 500.
    return (400 <= $sc) && (500 > $sc);
}
455
|
|
|
|
456
|
|
|
/*=======================================================================*\ |
457
|
|
|
Function: is_server_error |
458
|
|
|
Purpose: return true if Error status code, and its a server error |
459
|
|
|
\*=======================================================================*/ |
460
|
|
|
function is_server_error ($sc) {
    // Server error status codes: 500 up to, but not including, 600.
    return (500 <= $sc) && (600 > $sc);
}
463
|
|
|
|
Our type inference engine is quite powerful, but sometimes the code does not provide enough clues to go by. In these cases we request you to add a
@return
annotation as described here.