|
1
|
|
|
<?php |
|
2
|
|
|
/** |
|
3
|
|
|
* @file |
|
4
|
|
|
* |
|
5
|
|
|
* QueryPath functions. |
|
6
|
|
|
* |
|
7
|
|
|
* This file holds the QueryPath functions, qp() and htmlqp(). |
|
8
|
|
|
* |
|
9
|
|
|
* Usage: |
|
10
|
|
|
* |
|
11
|
|
|
* @code |
|
12
|
|
|
* <?php |
|
13
|
|
|
* require 'qp.php'; |
|
14
|
|
|
* |
|
15
|
|
|
* qp($xml)->find('foo')->count(); |
|
16
|
|
|
* ?> |
|
17
|
|
|
* @endcode |
|
18
|
|
|
*/ |
|
19
|
|
|
|
|
20
|
|
|
use QueryPath\QueryPath; |
|
21
|
|
|
|
|
22
|
|
|
/** @addtogroup querypath_core Core API |
|
23
|
|
|
* Core classes and functions for QueryPath. |
|
24
|
|
|
* |
|
25
|
|
|
* These are the classes, objects, and functions that developers who use QueryPath |
|
26
|
|
|
* are likely to use. The qp() and htmlqp() functions are the best place to start, |
|
27
|
|
|
* while most of the frequently used methods are part of the QueryPath object. |
|
28
|
|
|
*/ |
|
29
|
|
|
|
|
30
|
|
|
/** @addtogroup querypath_util Utilities |
|
31
|
|
|
* Utility classes for QueryPath. |
|
32
|
|
|
* |
|
33
|
|
|
* These classes add important, but less-often used features to QueryPath. Some of |
|
34
|
|
|
* these are used transparently (QueryPathIterator). Others you can use directly in your |
|
35
|
|
|
* code (QueryPathEntities). |
|
36
|
|
|
*/ |
|
37
|
|
|
|
|
38
|
|
|
/** @namespace QueryPath |
|
39
|
|
|
* The core classes that compose QueryPath. |
|
40
|
|
|
* |
|
41
|
|
|
* The QueryPath classes contain the bulk of the QueryPath code. If you are |
|
42
|
|
|
* interested in working with just the CSS engine, you may want to look at CssEventHandler, |
|
43
|
|
|
* which can be used without the rest of QueryPath. If you are interested in looking |
|
44
|
|
|
* carefully at QueryPath's implementation details, then the QueryPath class is where you |
|
45
|
|
|
* should begin. If you are interested in writing extensions, then you may want to look at |
|
46
|
|
|
* QueryPathExtension, and also at some of the simple extensions, such as QPXML. |
|
47
|
|
|
*/ |
|
48
|
|
|
|
|
49
|
|
|
/** |
|
50
|
|
|
* Build a new Query Path. |
|
51
|
|
|
* This builds a new Query Path object. The new object can be used for |
|
52
|
|
|
* reading, searching, and modifying a document. |
|
53
|
|
|
* |
|
54
|
|
|
* While it is permissible to directly create new instances of a QueryPath |
|
55
|
|
|
* implementation, it is not advised. Instead, you should use this function |
|
56
|
|
|
* as a factory. |
|
57
|
|
|
* |
|
58
|
|
|
* Example: |
|
59
|
|
|
* |
|
60
|
|
|
* @code |
|
61
|
|
|
* <?php |
|
62
|
|
|
* qp(); // New empty QueryPath |
|
63
|
|
|
* qp('path/to/file.xml'); // From a file |
|
64
|
|
|
* qp('<html><head></head><body></body></html>'); // From HTML or XML |
|
65
|
|
|
* qp(QueryPath::XHTML_STUB); // From a basic HTML document. |
|
66
|
|
|
* qp(QueryPath::XHTML_STUB, 'title'); // Create one from a basic HTML doc and position it at the title element. |
|
67
|
|
|
* |
|
68
|
|
|
* // Most of the time, methods are chained directly off of this call. |
|
69
|
|
|
* qp(QueryPath::XHTML_STUB, 'body')->append('<h1>Title</h1>')->addClass('body-class'); |
|
70
|
|
|
* ?> |
|
71
|
|
|
* @endcode |
|
72
|
|
|
* |
|
73
|
|
|
* This function is used internally by QueryPath. Anything that modifies the |
|
74
|
|
|
* behavior of this function may also modify the behavior of common QueryPath |
|
75
|
|
|
* methods. |
|
76
|
|
|
* |
|
77
|
|
|
* <b>Types of documents that QueryPath can support</b> |
|
78
|
|
|
* |
|
79
|
|
|
* qp() can take any of these as its first argument: |
|
80
|
|
|
* |
|
81
|
|
|
* - A string of XML or HTML (See {@link XHTML_STUB}) |
|
82
|
|
|
* - A path on the file system or a URL |
|
83
|
|
|
* - A {@link DOMDocument} object |
|
84
|
|
|
* - A {@link SimpleXMLElement} object. |
|
85
|
|
|
* - A {@link DOMNode} object. |
|
86
|
|
|
* - An array of {@link DOMNode} objects (generally {@link DOMElement} nodes). |
|
87
|
|
|
* - Another {@link QueryPath} object. |
|
88
|
|
|
* |
|
89
|
|
|
* Keep in mind that most features of QueryPath operate on elements. Other |
|
90
|
|
|
* sorts of DOMNodes might not work with all features. |
|
91
|
|
|
* |
|
92
|
|
|
* <b>Supported Options</b> |
|
93
|
|
|
* - context: A stream context object. This is used to pass context info |
|
94
|
|
|
* to the underlying file IO subsystem. |
|
95
|
|
|
* - encoding: A valid character encoding, such as 'utf-8' or 'ISO-8859-1'. |
|
96
|
|
|
* The default is system-dependent, typically UTF-8. Note that this is |
|
97
|
|
|
* only used when creating new documents, not when reading existing content. |
|
98
|
|
|
* (See convert_to_encoding below.) |
|
99
|
|
|
* - parser_flags: An OR-combined set of parser flags. The flags supported |
|
100
|
|
|
* by the DOMDocument PHP class are all supported here. |
|
101
|
|
|
* - omit_xml_declaration: Boolean. If this is TRUE, then certain output |
|
102
|
|
|
* methods (like {@link QueryPath::xml()}) will omit the XML declaration |
|
103
|
|
|
* from the beginning of a document. |
|
104
|
|
|
* - format_output: Boolean. If this is set to TRUE, QueryPath will format |
|
105
|
|
|
* the HTML or XML output to make it more readable. If this is set to |
|
106
|
|
|
* FALSE, QueryPath will minimize whitespace to keep the document smaller |
|
107
|
|
|
* but harder to read. |
|
108
|
|
|
* - replace_entities: Boolean. If this is TRUE, then any of the insertion |
|
109
|
|
|
* functions (before(), append(), etc.) will replace named entities with |
|
110
|
|
|
* their decimal equivalent, and will replace un-escaped ampersands with |
|
111
|
|
|
* a numeric entity equivalent. |
|
112
|
|
|
* - ignore_parser_warnings: Boolean. If this is TRUE, then E_WARNING messages |
|
113
|
|
|
* generated by the XML parser will not cause QueryPath to throw an exception. |
|
114
|
|
|
* This is useful when parsing |
|
115
|
|
|
* badly mangled HTML, or when failure to find files should not result in |
|
116
|
|
|
* an exception. By default, this is FALSE -- that is, parsing warnings and |
|
117
|
|
|
* IO warnings throw exceptions. |
|
118
|
|
|
* - convert_to_encoding: Use the MB library to convert the document to the |
|
119
|
|
|
* named encoding before parsing. This is useful for old HTML (set it to |
|
120
|
|
|
* iso-8859-1 for best results). If this is not supplied, no character set |
|
121
|
|
|
* conversion will be performed. See {@link mb_convert_encoding()}. |
|
122
|
|
|
* (QueryPath 1.3 and later) |
|
123
|
|
|
* - convert_from_encoding: If 'convert_to_encoding' is set, this option can be |
|
124
|
|
|
* used to explicitly define what character set the source document is using. |
|
125
|
|
|
* By default, QueryPath will allow the MB library to guess the encoding. |
|
126
|
|
|
* (QueryPath 1.3 and later) |
|
127
|
|
|
* - strip_low_ascii: If this is set to TRUE then markup will have all low ASCII |
|
128
|
|
|
* characters (<32) stripped out before parsing. This is good in cases where |
|
129
|
|
|
* icky HTML has (illegal) low characters in the document. |
|
130
|
|
|
* - use_parser: If 'xml', Parse the document as XML. If 'html', parse the |
|
131
|
|
|
* document as HTML. Note that the XML parser is very strict, while the |
|
132
|
|
|
* HTML parser is more lenient, but does enforce some of the DTD/Schema. |
|
133
|
|
|
* <i>By default, QueryPath autodetects the type.</i> |
|
134
|
|
|
* - escape_xhtml_js_css_sections: XHTML needs script and css sections to be |
|
135
|
|
|
* escaped. Yet older readers do not handle CDATA sections, and comments do not |
|
136
|
|
|
* work properly (for numerous reasons). By default, QueryPath's *XHTML methods |
|
137
|
|
|
* will wrap a script body with a CDATA declaration inside of C-style comments. |
|
138
|
|
|
* If you want to change this, you can set this option with one of the |
|
139
|
|
|
* JS_CSS_ESCAPE_* constants, or you can write your own. |
|
140
|
|
|
* - QueryPath_class: (ADVANCED) Use this to set the actual classname that |
|
141
|
|
|
* {@link qp()} loads as a QueryPath instance. It is assumed that the |
|
142
|
|
|
* class is either {@link QueryPath} or a subclass thereof. See the test |
|
143
|
|
|
* cases for an example. |
|
144
|
|
|
* |
|
145
|
|
|
* @ingroup querypath_core |
|
146
|
|
|
* @param mixed $document |
|
147
|
|
|
* A document in one of the forms listed above. |
|
148
|
|
|
* @param string $string |
|
149
|
|
|
* A CSS 3 selector. |
|
150
|
|
|
* @param array $options |
|
151
|
|
|
* An associative array of options. Currently supported options are listed above. |
|
152
|
|
|
* @return \QueryPath\DOMQuery |
|
153
|
|
|
* Or possibly another QueryPath-like object if you overrode QueryPath_class. |
|
154
|
|
|
*/ |
|
155
|
|
|
function qp($document = NULL, $string = NULL, array $options = [])
{
    // Delegate to the QueryPath factory; $string is the CSS 3 selector.
    $query = QueryPath::with($document, $string, $options);

    return $query;
}
|
159
|
|
|
|
|
160
|
|
|
/** |
|
161
|
|
|
* A special-purpose version of {@link qp()} designed specifically for HTML. |
|
162
|
|
|
* |
|
163
|
|
|
* XHTML (if valid) can be easily parsed by {@link qp()} with no problems. However, |
|
164
|
|
|
* because of the way that libxml handles HTML, there are several common steps that |
|
165
|
|
|
* need to be taken to reliably parse non-XML HTML documents. This function is |
|
166
|
|
|
* a convenience tool for configuring QueryPath to parse HTML. |
|
167
|
|
|
* |
|
168
|
|
|
* The following options are automatically set unless overridden: |
|
169
|
|
|
* - ignore_parser_warnings: TRUE |
|
170
|
|
|
* - convert_to_encoding: ISO-8859-1 (the best for the HTML parser). |
|
171
|
|
|
* - convert_from_encoding: auto (autodetect encoding) |
|
172
|
|
|
* - use_parser: html |
|
173
|
|
|
* |
|
174
|
|
|
* Parser warning messages are also suppressed, so if the parser emits a warning, |
|
175
|
|
|
* the application will not be notified. This is equivalent to |
|
176
|
|
|
* calling qp() with PHP's error-suppression operator: @code @qp() @endcode. |
|
177
|
|
|
* |
|
178
|
|
|
* Warning: Character set conversions will only work if the Multi-Byte (mb) library |
|
179
|
|
|
* is installed and enabled. This is usually enabled, but not always. |
|
180
|
|
|
* |
|
181
|
|
|
* @ingroup querypath_core |
|
182
|
|
|
* @see qp() |
|
183
|
|
|
* @param mixed $document |
|
|
|
|
|
|
184
|
|
|
* @param string|null $selector |
|
|
|
|
|
|
185
|
|
|
* @param array $options |
|
186
|
|
|
* @return mixed|\QueryPath\DOMQuery |
|
187
|
|
|
*/ |
|
188
|
|
|
function htmlqp($document = NULL, $selector = NULL, $options = [])
{
    // All HTML-specific configuration is left to QueryPath::withHTML().
    return QueryPath::withHTML(
        $document,
        $selector,
        $options
    );
}
|
193
|
|
|
|
|
194
|
|
|
/** |
|
195
|
|
|
* Parse HTML5 documents. |
|
196
|
|
|
* |
|
197
|
|
|
* This uses HTML5-PHP to parse the document. In actuality, this parser does |
|
198
|
|
|
* a fine job with pre-HTML5 documents in most cases, though really old HTML |
|
199
|
|
|
* (like 2.0) may have some substantial quirks. |
|
200
|
|
|
* |
|
201
|
|
|
* <b>Supported Options</b> |
|
202
|
|
|
* Any options supported by HTML5-PHP are allowed here. Additionally, the |
|
203
|
|
|
* following options have meaning to QueryPath. |
|
204
|
|
|
* - QueryPath_class |
|
205
|
|
|
* |
|
206
|
|
|
* |
|
207
|
|
|
* @param mixed $document |
|
|
|
|
|
|
208
|
|
|
* @param string $selector |
|
209
|
|
|
* A CSS3 selector. |
|
210
|
|
|
* |
|
211
|
|
|
* @param array $options |
|
212
|
|
|
* An associative array of options, which is passed on into HTML5-PHP. Note |
|
213
|
|
|
* that the standard QueryPath options may be ignored for this function, |
|
214
|
|
|
* since it uses a different parser. |
|
215
|
|
|
* |
|
216
|
|
|
* @return QueryPath |
|
217
|
|
|
*/ |
|
218
|
|
|
function html5qp($document = NULL, $selector = NULL, array $options = [])
{
    // Hand the arguments, unchanged, to the HTML5 entry point on QueryPath.
    $html5Query = QueryPath::withHTML5($document, $selector, $options);

    return $html5Query;
}
|
222
|
|
|
|