Completed
Push — 3.7 ( 81b2d8...ef0909 )
by
unknown
09:42
created

SimpleLexer   A

Complexity

Total Complexity 36

Size/Duplication

Total Lines 264
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 2

Importance

Changes 0
Metric Value
dl 0
loc 264
rs 9.52
c 0
b 0
f 0
wmc 36
lcom 1
cbo 2

13 Methods

Rating   Name   Duplication   Size   Complexity  
A __construct() 0 7 1
A addPattern() 0 9 3
A addEntryPattern() 0 9 3
A addExitPattern() 0 9 3
A addSpecialPattern() 0 9 3
A mapHandler() 0 3 1
B parse() 0 23 7
B _dispatchTokens() 0 23 7
A _isModeEnd() 0 3 1
A _isSpecialMode() 0 3 1
A _decodeSpecial() 0 3 1
A _invokeParser() 0 7 3
A _reduce() 0 9 2
1
<?php
2
/**
3
 *  base include file for SimpleTest
4
 *  @package    SimpleTest
5
 *  @subpackage MockObjects
6
 *  @version    $Id: parser.php 1723 2008-04-08 00:34:10Z lastcraft $
7
 */
8
9
/**#@+
10
 * Lexer mode stack constants
11
 */
12
foreach (array('LEXER_ENTER', 'LEXER_MATCHED', 'LEXER_UNMATCHED',
               'LEXER_EXIT', 'LEXER_SPECIAL') as $i => $constant) {
    // Leave alone any constant that has already been defined elsewhere.
    if (defined($constant)) {
        continue;
    }
    // Values are 1-based, following the order of the list above.
    define($constant, $i + 1);
}
19
/**#@-*/
20
21
/**
22
 *    Compounded regular expression. Any of
23
 *    the contained patterns could match and
24
 *    when one does, its label is returned.
25
 *    @package SimpleTest
26
 *    @subpackage WebTester
27
 */
28
class ParallelRegex {
    var $_patterns;
    var $_labels;
    var $_regex;
    var $_case;

    /**
     *    Constructor. Starts with no patterns.
     *    @param boolean $case    True for case sensitive, false
     *                            for insensitive.
     *    @access public
     */
    function __construct($case) {
        $this->_case = $case;
        $this->_patterns = array();
        $this->_labels = array();
        $this->_regex = null;
    }

    /**
     *    Adds a pattern with an optional label. Any previously
     *    compiled compound regex is discarded so that it is
     *    rebuilt on the next match.
     *    @param string $pattern      Perl style regex, but ( and )
     *                                lose the usual meaning.
     *    @param string $label        Label of regex to be returned
     *                                on a match.
     *    @access public
     */
    function addPattern($pattern, $label = true) {
        $count = count($this->_patterns);
        $this->_patterns[$count] = $pattern;
        $this->_labels[$count] = $label;
        $this->_regex = null;
    }

    /**
     *    Attempts to match all patterns at once against
     *    a string.
     *    @param string $subject      String to match against.
     *    @param string $match        First matched portion of
     *                                subject. Set to an empty
     *                                string when nothing matches.
     *    @return boolean/string      False if nothing matched,
     *                                otherwise the label of the
     *                                pattern that matched (true
     *                                for unlabelled patterns).
     *    @access public
     */
    function match($subject, &$match) {
        if (count($this->_patterns) == 0) {
            // Keep the output parameter defined on every failure path.
            $match = '';
            return false;
        }
        if (! preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
            $match = '';
            return false;
        }
        $match = $matches[0];
        // Sub-pattern N belongs to label N - 1. Compare against the
        // empty string rather than testing truthiness so that a
        // matched token such as "0" still selects its label.
        for ($i = 1, $count = count($matches); $i < $count; $i++) {
            if ($matches[$i] !== '') {
                return $this->_labels[$i - 1];
            }
        }
        return true;
    }

    /**
     *    Compounds the patterns into a single
     *    regular expression separated with the
     *    "or" operator. Caches the regex.
     *    Will automatically escape (, ) and / tokens.
     *    The stored patterns are left untouched, so the
     *    regex can be rebuilt safely after more patterns
     *    have been added.
     *    @return string       Complete regex with delimiters
     *                         and flags.
     *    @access private
     */
    function _getCompoundedRegex() {
        if ($this->_regex === null) {
            // Build the escaped alternatives in a local list; writing
            // them back into $this->_patterns would escape them a
            // second time on the next rebuild.
            $alternatives = array();
            foreach ($this->_patterns as $pattern) {
                $alternatives[] = '(' . str_replace(
                        array('/', '(', ')'),
                        array('\/', '\(', '\)'),
                        $pattern) . ')';
            }
            $this->_regex = "/" . implode("|", $alternatives) . "/" . $this->_getPerlMatchingFlags();
        }
        return $this->_regex;
    }

    /**
     *    Accessor for perl regex mode flags to use.
     *    @return string       Perl regex flags.
     *    @access private
     */
    function _getPerlMatchingFlags() {
        return ($this->_case ? "msS" : "msSi");
    }
}
118
119
/**
120
 *    States for a stack machine.
121
 *    @package SimpleTest
122
 *    @subpackage WebTester
123
 */
124
class SimpleStateStack {
    public $_stack;

    /**
     *    Creates the stack with a single starting state.
     *    @param string $start        Starting state name.
     *    @access public
     */
    public function __construct($start) {
        $this->_stack = array($start);
    }

    /**
     *    Reads the state on top of the stack without
     *    removing it.
     *    @return string       State.
     *    @access public
     */
    public function getCurrent() {
        $top = count($this->_stack) - 1;
        return $this->_stack[$top];
    }

    /**
     *    Pushes a state, making it the current one.
     *    @param string $state        New state.
     *    @access public
     */
    public function enter($state) {
        $this->_stack[] = $state;
    }

    /**
     *    Pops the current state so that the previous one
     *    becomes current again. The last remaining state
     *    is never removed.
     *    @return boolean    False if only one state is left
     *                       and nothing was popped.
     *    @access public
     */
    public function leave() {
        $can_leave = (count($this->_stack) != 1);
        if ($can_leave) {
            array_pop($this->_stack);
        }
        return $can_leave;
    }
}
170
171
/**
172
 *    Accepts text and breaks it into tokens.
173
 *    Some optimisation to make sure the
174
 *    content is only scanned by the PHP regex
175
 *    parser once. Lexer modes must not start
176
 *    with leading underscores.
177
 *    @package SimpleTest
178
 *    @subpackage WebTester
179
 */
180
class SimpleLexer {
    var $_regexes;
    var $_parser;
    var $_mode;
    var $_mode_handlers;
    var $_case;

    /**
     *    Sets up the lexer in case insensitive matching
     *    by default.
     *    @param SimpleSaxParser $parser  Handling strategy by
     *                                    reference.
     *    @param string $start            Starting handler.
     *    @param boolean $case            True for case sensitive.
     *    @access public
     */
    function __construct(&$parser, $start = "accept", $case = false) {
        $this->_case = $case;
        $this->_regexes = array();
        $this->_parser = &$parser;
        $this->_mode = new SimpleStateStack($start);
        $this->_mode_handlers = array($start => $start);
    }

    /**
     *    Adds a token search pattern for a particular
     *    parsing mode. The pattern does not change the
     *    current mode.
     *    @param string $pattern      Perl style regex, but ( and )
     *                                lose the usual meaning.
     *    @param string $mode         Should only apply this
     *                                pattern when dealing with
     *                                this type of input.
     *    @access public
     */
    function addPattern($pattern, $mode = "accept") {
        $this->_registerPattern($pattern, $mode, true, $mode);
    }

    /**
     *    Adds a pattern that will enter a new parsing
     *    mode. Useful for entering parenthesis, strings,
     *    tags, etc.
     *    @param string $pattern      Perl style regex, but ( and )
     *                                lose the usual meaning.
     *    @param string $mode         Should only apply this
     *                                pattern when dealing with
     *                                this type of input.
     *    @param string $new_mode     Change parsing to this new
     *                                nested mode.
     *    @access public
     */
    function addEntryPattern($pattern, $mode, $new_mode) {
        $this->_registerPattern($pattern, $mode, $new_mode, $new_mode);
    }

    /**
     *    Adds a pattern that will exit the current mode
     *    and re-enter the previous one.
     *    @param string $pattern      Perl style regex, but ( and )
     *                                lose the usual meaning.
     *    @param string $mode         Mode to leave.
     *    @access public
     */
    function addExitPattern($pattern, $mode) {
        $this->_registerPattern($pattern, $mode, "__exit", $mode);
    }

    /**
     *    Adds a pattern that has a special mode. Acts as an entry
     *    and exit pattern in one go, effectively calling a special
     *    parser handler for this token only.
     *    @param string $pattern      Perl style regex, but ( and )
     *                                lose the usual meaning.
     *    @param string $mode         Should only apply this
     *                                pattern when dealing with
     *                                this type of input.
     *    @param string $special      Use this mode for this one token.
     *    @access public
     */
    function addSpecialPattern($pattern, $mode, $special) {
        // The leading underscore marks the label as a one-token mode;
        // see _isSpecialMode() and _decodeSpecial().
        $this->_registerPattern($pattern, $mode, "_$special", $special);
    }

    /**
     *    Shared implementation of the add*Pattern() methods.
     *    Creates the regex set for the mode on first use, adds
     *    the labelled pattern and makes sure the given mode has
     *    a default handler of the same name.
     *    @param string $pattern          Pattern to add.
     *    @param string $mode             Mode the pattern applies in.
     *    @param string/boolean $label    Label returned on a match.
     *    @param string $handled_mode     Mode that needs a default
     *                                    handler entry.
     *    @access private
     */
    function _registerPattern($pattern, $mode, $label, $handled_mode) {
        if (! isset($this->_regexes[$mode])) {
            $this->_regexes[$mode] = new ParallelRegex($this->_case);
        }
        $this->_regexes[$mode]->addPattern($pattern, $label);
        // An explicit mapHandler() call made earlier must not be overwritten.
        if (! isset($this->_mode_handlers[$handled_mode])) {
            $this->_mode_handlers[$handled_mode] = $handled_mode;
        }
    }

    /**
     *    Adds a mapping from a mode to another handler.
     *    @param string $mode        Mode to be remapped.
     *    @param string $handler     New target handler.
     *    @access public
     */
    function mapHandler($mode, $handler) {
        $this->_mode_handlers[$mode] = $handler;
    }

    /**
     *    Splits the page text into tokens. Will fail
     *    if the handlers report an error or if no
     *    content is consumed. If successful then each
     *    unparsed and parsed token invokes a call to the
     *    held listener.
     *    @param string $raw        Raw HTML text.
     *    @return boolean           True on success, else false.
     *    @access public
     */
    function parse($raw) {
        if (! isset($this->_parser)) {
            return false;
        }
        $length = strlen($raw);
        while (is_array($parsed = $this->_reduce($raw))) {
            list($raw, $unmatched, $matched, $mode) = $parsed;
            if (! $this->_dispatchTokens($unmatched, $matched, $mode)) {
                return false;
            }
            if ($raw === '') {
                return true;
            }
            // A match that consumed nothing would loop forever.
            if (strlen($raw) == $length) {
                return false;
            }
            $length = strlen($raw);
        }
        if (! $parsed) {
            return false;
        }
        // Whatever is left matched no pattern; hand it over as plain text.
        return $this->_invokeParser($raw, LEXER_UNMATCHED);
    }

    /**
     *    Sends the matched token and any leading unmatched
     *    text to the parser changing the lexer to a new
     *    mode if one is listed.
     *    @param string $unmatched    Unmatched leading portion.
     *    @param string $matched      Actual token match.
     *    @param string $mode         Mode after match. A boolean
     *                                mode causes no change.
     *    @return boolean             False if there was any error
     *                                from the parser.
     *    @access private
     */
    function _dispatchTokens($unmatched, $matched, $mode = false) {
        if (! $this->_invokeParser($unmatched, LEXER_UNMATCHED)) {
            return false;
        }
        if (is_bool($mode)) {
            return $this->_invokeParser($matched, LEXER_MATCHED);
        }
        // "__exit" also starts with an underscore, so the exit test
        // has to come before the special mode test.
        if ($this->_isModeEnd($mode)) {
            if (! $this->_invokeParser($matched, LEXER_EXIT)) {
                return false;
            }
            return $this->_mode->leave();
        }
        if ($this->_isSpecialMode($mode)) {
            $this->_mode->enter($this->_decodeSpecial($mode));
            if (! $this->_invokeParser($matched, LEXER_SPECIAL)) {
                return false;
            }
            return $this->_mode->leave();
        }
        $this->_mode->enter($mode);
        return $this->_invokeParser($matched, LEXER_ENTER);
    }

    /**
     *    Tests to see if the new mode is actually to leave
     *    the current mode and pop an item from the matching
     *    mode stack.
     *    @param string $mode    Mode to test.
     *    @return boolean        True if this is the exit mode.
     *    @access private
     */
    function _isModeEnd($mode) {
        return ($mode === "__exit");
    }

    /**
     *    Test to see if the mode is one where this mode
     *    is entered for this token only and automatically
     *    leaves immediately afterwards.
     *    @param string $mode    Mode to test.
     *    @return boolean        True if the mode name starts with
     *                           the underscore marker.
     *    @access private
     */
    function _isSpecialMode($mode) {
        return (strncmp($mode, "_", 1) == 0);
    }

    /**
     *    Strips the magic underscore marking single token
     *    modes.
     *    @param string $mode    Mode to decode.
     *    @return string         Underlying mode name.
     *    @access private
     */
    function _decodeSpecial($mode) {
        return substr($mode, 1);
    }

    /**
     *    Calls the parser method named after the current
     *    mode. Empty content will be ignored. The lexer
     *    has a parser handler for each mode in the lexer.
     *    @param string $content        Text parsed.
     *    @param integer $is_match      Lexer event passed through
     *                                  to the handler; callers in
     *                                  this class pass one of the
     *                                  LEXER_* constants.
     *    @return boolean               True for empty content,
     *                                  otherwise whatever the
     *                                  handler returns.
     *    @access private
     */
    function _invokeParser($content, $is_match) {
        if (($content === '') || ($content === false)) {
            return true;
        }
        $handler = $this->_mode_handlers[$this->_mode->getCurrent()];
        return $this->_parser->$handler($content, $is_match);
    }

    /**
     *    Tries to match a chunk of text and if successful
     *    removes the recognised chunk and any leading
     *    unparsed data.
     *    @param string $raw         The subject to parse. This is the
     *                               content that will be eaten.
     *    @return array/boolean      Four item list of the remaining
     *                               content, the unparsed leading
     *                               content, the recognised token
     *                               and finally the action the
     *                               parser is to take. True if there
     *                               is no match. This implementation
     *                               never returns false, although
     *                               parse() treats false as an error.
     *    @access private
     */
    function _reduce($raw) {
        $current = $this->_mode->getCurrent();
        // A mode that never had a pattern registered (for example the
        // start mode of a lexer without patterns) has no regex set.
        // Report "no match" instead of calling a method on null.
        if (! isset($this->_regexes[$current])) {
            return true;
        }
        if ($action = $this->_regexes[$current]->match($raw, $match)) {
            $unparsed_character_count = strpos($raw, $match);
            $unparsed = substr($raw, 0, $unparsed_character_count);
            $raw = substr($raw, $unparsed_character_count + strlen($match));
            return array($raw, $unparsed, $match, $action);
        }
        return true;
    }
}
444
445
/**
446
 *    Breaks HTML into SAX events.
447
 *    @package SimpleTest
448
 *    @subpackage WebTester
449
 */
450
class SimpleHtmlLexer extends SimpleLexer {

    /**
     *    Builds a lexer that starts in 'text' mode with the
     *    parent's default (case insensitive) matching and
     *    registers all of the HTML patterns.
     *    @param SimpleSaxParser $parser  Handling strategy by
     *                                    reference.
     *    @access public
     */
    public function __construct(&$parser) {
        parent::__construct($parser, 'text');
        $this->mapHandler('text', 'acceptTextToken');
        $this->_addSkipping();
        $tags = $this->_getParsedTags();
        foreach ($tags as $tag) {
            $this->_addTag($tag);
        }
        $this->_addInTagTokens();
    }

    /**
     *    Names of the tags the lexer registers patterns for.
     *    @return array        List of searched for tags.
     *    @access private
     */
    public function _getParsedTags() {
        return array(
                'a', 'base', 'title', 'form', 'input', 'button',
                'textarea', 'select', 'option', 'frameset', 'frame',
                'label');
    }

    /**
     *    Registers the sections whose content is handed to
     *    the 'ignore' handler: styles, scripts and comments.
     *    @access private
     */
    public function _addSkipping() {
        // mode => array(opening pattern, closing pattern)
        $sections = array(
                'css' => array('<style', '</style>'),
                'js' => array('<script', '</script>'),
                'comment' => array('<!--', '-->'));
        foreach ($sections as $mode => $delimiters) {
            list($open, $close) = $delimiters;
            $this->mapHandler($mode, 'ignore');
            $this->addEntryPattern($open, 'text', $mode);
            $this->addExitPattern($close, $mode);
        }
    }

    /**
     *    Registers the closing and opening patterns for
     *    one tag name.
     *    @param string $tag          Name of tag to scan for.
     *    @access private
     */
    public function _addTag($tag) {
        $closing = '</' . $tag . '>';
        $opening = '<' . $tag;
        $this->addSpecialPattern($closing, 'text', 'acceptEndToken');
        $this->addEntryPattern($opening, 'text', 'tag');
    }

    /**
     *    Registers the patterns used while inside a tag:
     *    whitespace, attribute values and the tag end.
     *    @access private
     */
    public function _addInTagTokens() {
        $this->mapHandler('tag', 'acceptStartToken');
        $this->addSpecialPattern('\s+', 'tag', 'ignore');
        $this->_addAttributeTokens();
        // '/>' is registered before '>' exactly as before.
        foreach (array('/>', '>') as $closer) {
            $this->addExitPattern($closer, 'tag');
        }
    }

    /**
     *    Registers attribute value patterns for double
     *    quoted, single quoted and unquoted values.
     *    @access private
     */
    public function _addAttributeTokens() {
        // mode => quote character delimiting the value
        $quoted = array('dq_attribute' => '"', 'sq_attribute' => "'");
        foreach ($quoted as $mode => $quote) {
            $this->mapHandler($mode, 'acceptAttributeToken');
            $this->addEntryPattern('=\s*' . $quote, 'tag', $mode);
            // A backslash-escaped quote does not end the value.
            $this->addPattern("\\\\" . $quote, $mode);
            $this->addExitPattern($quote, $mode);
        }
        $this->mapHandler('uq_attribute', 'acceptAttributeToken');
        $this->addSpecialPattern('=\s*[^>\s]*', 'tag', 'uq_attribute');
    }
}
537
538
/**
539
 *    Converts HTML tokens into selected SAX events.
540
 *    @package SimpleTest
541
 *    @subpackage WebTester
542
 */
543
class SimpleHtmlSaxParser {
    var $_lexer;
    var $_listener;
    var $_tag;
    var $_attributes;
    var $_current_attribute;

    /**
     *    Sets the listener and builds the lexer that will
     *    call back into this parser.
     *    @param SimpleSaxListener $listener    SAX event handler.
     *    @access public
     */
    function __construct(&$listener) {
        $this->_listener = &$listener;
        $this->_lexer = &$this->createLexer($this);
        $this->_tag = '';
        $this->_attributes = array();
        $this->_current_attribute = '';
    }

    /**
     *    Runs the content through the lexer which
     *    should call back to the acceptors.
     *    @param string $raw      Page text to parse.
     *    @return boolean         False if parse error.
     *    @access public
     */
    function parse($raw) {
        return $this->_lexer->parse($raw);
    }

    /**
     *    Sets up the matching lexer. Starts in 'text' mode.
     *    @param SimpleSaxParser $parser    Event generator, usually $self.
     *    @return SimpleLexer               Lexer suitable for this parser.
     *    @access public
     */
    function &createLexer(&$parser) {
        $lexer = new SimpleHtmlLexer($parser);
        return $lexer;
    }

    /**
     *    Accepts a token from the tag mode. If the
     *    starting element completes then the element
     *    is dispatched and the current attributes
     *    set back to empty. The element or attribute
     *    name is converted to lower case.
     *    @param string $token     Incoming characters.
     *    @param integer $event    Lexer event type.
     *    @return boolean          False if parse error.
     *    @access public
     */
    function acceptStartToken($token, $event) {
        if ($event == LEXER_ENTER) {
            // Drop the first character of the entry token to get the tag name.
            $this->_tag = strtolower(substr($token, 1));
            return true;
        }
        if ($event == LEXER_EXIT) {
            $success = $this->_listener->startElement(
                    $this->_tag,
                    $this->_attributes);
            $this->_tag = '';
            $this->_attributes = array();
            return $success;
        }
        // Any other token seen in tag mode is taken as an attribute
        // name, unless it is a bare "=".
        if ($token != '=') {
            $this->_current_attribute = strtolower(SimpleHtmlSaxParser::decodeHtml($token));
            $this->_attributes[$this->_current_attribute] = '';
        }
        return true;
    }

    /**
     *    Accepts a token from the end tag mode.
     *    The element name is converted to lower case.
     *    @param string $token     Incoming characters.
     *    @param integer $event    Lexer event type.
     *    @return boolean          False if parse error.
     *    @access public
     */
    function acceptEndToken($token, $event) {
        if (! preg_match('/<\/(.*)>/', $token, $matches)) {
            return false;
        }
        return $this->_listener->endElement(strtolower($matches[1]));
    }

    /**
     *    Part of the tag data. Appends the decoded value to
     *    the attribute that was named most recently.
     *    @param string $token     Incoming characters.
     *    @param integer $event    Lexer event type.
     *    @return boolean          Always true.
     *    @access public
     */
    function acceptAttributeToken($token, $event) {
        // Compare against the empty string so that an attribute
        // literally named "0" is not mistaken for "no attribute".
        if ($this->_current_attribute !== '') {
            if ($event == LEXER_UNMATCHED) {
                $this->_attributes[$this->_current_attribute] .=
                        SimpleHtmlSaxParser::decodeHtml($token);
            }
            if ($event == LEXER_SPECIAL) {
                // Unquoted values arrive with their leading "=" attached.
                $this->_attributes[$this->_current_attribute] .=
                        preg_replace('/^=\s*/' , '', SimpleHtmlSaxParser::decodeHtml($token));
            }
        }
        return true;
    }

    /**
     *    A character entity. The token is accepted and
     *    otherwise ignored.
     *    @param string $token    Incoming characters.
     *    @param integer $event   Lexer event type.
     *    @return boolean         Always true.
     *    @access public
     */
    function acceptEntityToken($token, $event) {
        // An implicit null return would be read as a parse error by the lexer.
        return true;
    }

    /**
     *    Character data between tags regarded as
     *    important.
     *    @param string $token     Incoming characters.
     *    @param integer $event    Lexer event type.
     *    @return boolean          False if parse error.
     *    @access public
     */
    function acceptTextToken($token, $event) {
        return $this->_listener->addContent($token);
    }

    /**
     *    Incoming data to be ignored.
     *    @param string $token     Incoming characters.
     *    @param integer $event    Lexer event type.
     *    @return boolean          Always true.
     *    @access public
     */
    function ignore($token, $event) {
        return true;
    }

    /**
     *    Decodes any HTML entities, including both kinds
     *    of quote.
     *    @param string $html    Incoming HTML.
     *    @return string         Outgoing plain text.
     *    @access public
     *    @static
     */
    static function decodeHtml($html) {
        return html_entity_decode($html, ENT_QUOTES);
    }

    /**
     *    Turns HTML into browser visible text. Comments and
     *    scripts are removed, images are converted to their
     *    alt text and all other tags are suppressed.
     *    Entities are converted to their visible representation
     *    and runs of whitespace collapse to a single space.
     *    @param string $html        HTML to convert.
     *    @return string             Plain text.
     *    @access public
     *    @static
     */
    static function normalise($html) {
        // The "s" flag lets "." cross line breaks and "i" accepts upper
        // case tags, so multi-line and <SCRIPT> blocks are removed too.
        $text = preg_replace('|<!--.*?-->|si', '', $html);
        $text = preg_replace('|<script[^>]*>.*?</script>|si', '', $text);
        $text = preg_replace('|<img[^>]*alt\s*=\s*"([^"]*)"[^>]*>|', ' \1 ', $text);
        $text = preg_replace('|<img[^>]*alt\s*=\s*\'([^\']*)\'[^>]*>|', ' \1 ', $text);
        $text = preg_replace('|<img[^>]*alt\s*=\s*([a-zA-Z_]+)[^>]*>|', ' \1 ', $text);
        $text = preg_replace('|<[^>]*>|', '', $text);
        $text = SimpleHtmlSaxParser::decodeHtml($text);
        $text = preg_replace('|\s+|', ' ', $text);
        // NOTE(review): trimming the single byte \xA0 presumably assumes a
        // single-byte decoding of &nbsp; - confirm against the charset that
        // html_entity_decode() uses on the target PHP version.
        return trim(trim($text), "\xA0");        // TODO: The \xA0 is a &nbsp;. Add a test for this.
    }
}
718
719
/**
720
 *    SAX event handler.
721
 *    @package SimpleTest
722
 *    @subpackage WebTester
723
 *    @abstract
724
 */
725
class SimpleSaxListener {

    /**
     *    Does nothing; the base listener holds no state.
     *    @access public
     */
    public function __construct() {
    }

    /**
     *    Start of element event. This base version is an
     *    empty stub and returns nothing (null).
     *    @param string $name        Element name.
     *    @param hash $attributes    Name value pairs.
     *                               Attributes without content
     *                               are marked as true.
     *    @return boolean            False on parse error.
     *    @access public
     */
    public function startElement($name, $attributes) {
    }

    /**
     *    End of element event. This base version is an
     *    empty stub and returns nothing (null).
     *    @param string $name        Element name.
     *    @return boolean            False on parse error.
     *    @access public
     */
    public function endElement($name) {
    }

    /**
     *    Unparsed, but relevant data. This base version is
     *    an empty stub and returns nothing (null).
     *    @param string $text        May include unparsed tags.
     *    @return boolean            False on parse error.
     *    @access public
     */
    public function addContent($text) {
    }
}
764
765