1
|
|
|
<?php |
2
|
|
|
/** |
3
|
|
|
* @file |
4
|
|
|
* |
5
|
|
|
* QueryPath functions. |
6
|
|
|
* |
7
|
|
|
* This file holds the QueryPath functions, qp() and htmlqp(). |
8
|
|
|
* |
9
|
|
|
* Usage: |
10
|
|
|
* |
11
|
|
|
* @code |
12
|
|
|
* <?php |
13
|
|
|
* require 'qp.php'; |
14
|
|
|
* |
15
|
|
|
* qp($xml)->find('foo')->count(); |
16
|
|
|
* ?> |
17
|
|
|
* @endcode |
18
|
|
|
*/ |
19
|
|
|
|
20
|
|
|
use QueryPath\QueryPath; |
21
|
|
|
|
22
|
|
|
/** @addtogroup querypath_core Core API |
23
|
|
|
* Core classes and functions for QueryPath. |
24
|
|
|
* |
25
|
|
|
* These are the classes, objects, and functions that developers who use QueryPath |
26
|
|
|
* are likely to use. The qp() and htmlqp() functions are the best place to start, |
27
|
|
|
* while most of the frequently used methods are part of the QueryPath object. |
28
|
|
|
*/ |
29
|
|
|
|
30
|
|
|
/** @addtogroup querypath_util Utilities |
31
|
|
|
* Utility classes for QueryPath. |
32
|
|
|
* |
33
|
|
|
* These classes add important, but less-often used features to QueryPath. Some of |
34
|
|
|
* these are used transparently (QueryPathIterator). Others you can use directly in your |
35
|
|
|
* code (QueryPathEntities). |
36
|
|
|
*/ |
37
|
|
|
|
38
|
|
|
/** @namespace QueryPath |
39
|
|
|
* The core classes that compose QueryPath. |
40
|
|
|
* |
41
|
|
|
* The QueryPath classes contain the brunt of the QueryPath code. If you are |
42
|
|
|
* interested in working with just the CSS engine, you may want to look at CssEventHandler, |
43
|
|
|
* which can be used without the rest of QueryPath. If you are interested in looking |
44
|
|
|
* carefully at QueryPath's implementation details, then the QueryPath class is where you |
45
|
|
|
* should begin. If you are interested in writing extensions, then you may want to look at |
46
|
|
|
* QueryPathExtension, and also at some of the simple extensions, such as QPXML. |
47
|
|
|
*/ |
48
|
|
|
|
49
|
|
|
/** |
50
|
|
|
* Build a new Query Path. |
51
|
|
|
* This builds a new Query Path object. The new object can be used for |
52
|
|
|
* reading, searching, and modifying a document. |
53
|
|
|
* |
54
|
|
|
* While it is permissible to directly create new instances of a QueryPath |
55
|
|
|
* implementation, it is not advised. Instead, you should use this function |
56
|
|
|
* as a factory. |
57
|
|
|
* |
58
|
|
|
* Example: |
59
|
|
|
* |
60
|
|
|
* @code |
61
|
|
|
* <?php |
62
|
|
|
* qp(); // New empty QueryPath |
63
|
|
|
* qp('path/to/file.xml'); // From a file |
64
|
|
|
* qp('<html><head></head><body></body></html>'); // From HTML or XML |
65
|
|
|
* qp(QueryPath::XHTML_STUB); // From a basic HTML document. |
66
|
|
|
* qp(QueryPath::XHTML_STUB, 'title'); // Create one from a basic HTML doc and position it at the title element. |
67
|
|
|
* |
68
|
|
|
* // Most of the time, methods are chained directly off of this call. |
69
|
|
|
* qp(QueryPath::XHTML_STUB, 'body')->append('<h1>Title</h1>')->addClass('body-class'); |
70
|
|
|
* ?> |
71
|
|
|
* @endcode |
72
|
|
|
* |
73
|
|
|
* This function is used internally by QueryPath. Anything that modifies the |
74
|
|
|
* behavior of this function may also modify the behavior of common QueryPath |
75
|
|
|
* methods. |
76
|
|
|
* |
77
|
|
|
* <b>Types of documents that QueryPath can support</b> |
78
|
|
|
* |
79
|
|
|
* qp() can take any of these as its first argument: |
80
|
|
|
* |
81
|
|
|
* - A string of XML or HTML (See {@link XHTML_STUB}) |
82
|
|
|
* - A path on the file system or a URL |
83
|
|
|
* - A {@link DOMDocument} object |
84
|
|
|
* - A {@link SimpleXMLElement} object. |
85
|
|
|
* - A {@link DOMNode} object. |
86
|
|
|
* - An array of {@link DOMNode} objects (generally {@link DOMElement} nodes). |
87
|
|
|
* - Another {@link QueryPath} object. |
88
|
|
|
* |
89
|
|
|
* Keep in mind that most features of QueryPath operate on elements. Other |
90
|
|
|
* sorts of DOMNodes might not work with all features. |
91
|
|
|
* |
92
|
|
|
* <b>Supported Options</b> |
93
|
|
|
* - context: A stream context object. This is used to pass context info |
94
|
|
|
* to the underlying file IO subsystem. |
95
|
|
|
* - encoding: A valid character encoding, such as 'utf-8' or 'ISO-8859-1'. |
96
|
|
|
* The default is system-dependent, typically UTF-8. Note that this is |
97
|
|
|
* only used when creating new documents, not when reading existing content. |
98
|
|
|
* (See convert_to_encoding below.) |
99
|
|
|
* - parser_flags: An OR-combined set of parser flags. The flags supported |
100
|
|
|
* by the DOMDocument PHP class are all supported here. |
101
|
|
|
* - omit_xml_declaration: Boolean. If this is TRUE, then certain output |
102
|
|
|
* methods (like {@link QueryPath::xml()}) will omit the XML declaration |
103
|
|
|
* from the beginning of a document. |
104
|
|
|
* - format_output: Boolean. If this is set to TRUE, QueryPath will format |
105
|
|
|
* the HTML or XML output to make it more readable. If this is set to |
106
|
|
|
* FALSE, QueryPath will minimize whitespace to keep the document smaller |
107
|
|
|
* but harder to read. |
108
|
|
|
* - replace_entities: Boolean. If this is TRUE, then any of the insertion |
109
|
|
|
* functions (before(), append(), etc.) will replace named entities with |
110
|
|
|
* their decimal equivalent, and will replace un-escaped ampersands with |
111
|
|
|
* a numeric entity equivalent. |
112
|
|
|
* - ignore_parser_warnings: Boolean. If this is TRUE, then E_WARNING messages |
113
|
|
|
* generated by the XML parser will not cause QueryPath to throw an exception. |
114
|
|
|
* This is useful when parsing |
115
|
|
|
* badly mangled HTML, or when failure to find files should not result in |
116
|
|
|
* an exception. By default, this is FALSE -- that is, parsing warnings and |
117
|
|
|
* IO warnings throw exceptions. |
118
|
|
|
* - convert_to_encoding: Use the MB library to convert the document to the |
119
|
|
|
* named encoding before parsing. This is useful for old HTML (set it to |
120
|
|
|
* iso-8859-1 for best results). If this is not supplied, no character set |
121
|
|
|
* conversion will be performed. See {@link mb_convert_encoding()}. |
122
|
|
|
* (QueryPath 1.3 and later) |
123
|
|
|
* - convert_from_encoding: If 'convert_to_encoding' is set, this option can be |
124
|
|
|
* used to explicitly define what character set the source document is using. |
125
|
|
|
* By default, QueryPath will allow the MB library to guess the encoding. |
126
|
|
|
* (QueryPath 1.3 and later) |
127
|
|
|
* - strip_low_ascii: If this is set to TRUE then markup will have all low ASCII |
128
|
|
|
* characters (<32) stripped out before parsing. This is good in cases where |
129
|
|
|
* icky HTML has (illegal) low characters in the document. |
130
|
|
|
* - use_parser: If 'xml', Parse the document as XML. If 'html', parse the |
131
|
|
|
* document as HTML. Note that the XML parser is very strict, while the |
132
|
|
|
* HTML parser is more lenient, but does enforce some of the DTD/Schema. |
133
|
|
|
* <i>By default, QueryPath autodetects the type.</i> |
134
|
|
|
* - escape_xhtml_js_css_sections: XHTML needs script and css sections to be |
135
|
|
|
* escaped. Yet older readers do not handle CDATA sections, and comments do not |
136
|
|
|
* work properly (for numerous reasons). By default, QueryPath's *XHTML methods |
137
|
|
|
* will wrap a script body with a CDATA declaration inside of C-style comments. |
138
|
|
|
* If you want to change this, you can set this option with one of the |
139
|
|
|
* JS_CSS_ESCAPE_* constants, or you can write your own. |
140
|
|
|
* - QueryPath_class: (ADVANCED) Use this to set the actual classname that |
141
|
|
|
* {@link qp()} loads as a QueryPath instance. It is assumed that the |
142
|
|
|
* class is either {@link QueryPath} or a subclass thereof. See the test |
143
|
|
|
* cases for an example. |
144
|
|
|
* |
145
|
|
|
* @ingroup querypath_core |
146
|
|
|
* @param mixed $document |
147
|
|
|
* A document in one of the forms listed above. |
148
|
|
|
* @param string $string |
149
|
|
|
* A CSS 3 selector. |
150
|
|
|
* @param array $options |
151
|
|
|
* An associative array of options. Currently supported options are listed above. |
152
|
|
|
* @return \QueryPath\DOMQuery |
153
|
|
|
* Or possibly another QueryPath-like object if you overrode QueryPath_class. |
154
|
|
|
*/ |
155
|
|
|
function qp($document = null, $string = null, array $options = [])
{
    // qp() is a procedural shorthand: every argument is forwarded untouched
    // to the static factory, and whatever it builds is handed back as-is.
    $query = QueryPath::with($document, $string, $options);

    return $query;
}
159
|
|
|
|
160
|
|
|
/** |
161
|
|
|
* A special-purpose version of {@link qp()} designed specifically for HTML. |
162
|
|
|
* |
163
|
|
|
* XHTML (if valid) can be easily parsed by {@link qp()} with no problems. However, |
164
|
|
|
* because of the way that libxml handles HTML, there are several common steps that |
165
|
|
|
* need to be taken to reliably parse non-XML HTML documents. This function is |
166
|
|
|
* a convenience tool for configuring QueryPath to parse HTML. |
167
|
|
|
* |
168
|
|
|
* The following options are automatically set unless overridden: |
169
|
|
|
* - ignore_parser_warnings: TRUE |
170
|
|
|
* - convert_to_encoding: ISO-8859-1 (the best for the HTML parser). |
171
|
|
|
* - convert_from_encoding: auto (autodetect encoding) |
172
|
|
|
* - use_parser: html |
173
|
|
|
* |
174
|
|
|
* Parser warning messages are also suppressed, so if the parser emits a warning, |
175
|
|
|
* the application will not be notified. This is equivalent to |
176
|
|
|
* calling @code @qp() @endcode (that is, qp() prefixed with PHP's error-suppression operator). |
177
|
|
|
* |
178
|
|
|
* Warning: Character set conversions will only work if the Multi-Byte (mb) library |
179
|
|
|
* is installed and enabled. This is usually enabled, but not always. |
180
|
|
|
* |
181
|
|
|
* @ingroup querypath_core |
182
|
|
|
* @see qp() |
183
|
|
|
* @param mixed $document |
|
|
|
|
184
|
|
|
* @param string $selector |
|
|
|
|
185
|
|
|
* @param array $options |
186
|
|
|
* @return mixed|\QueryPath\DOMQuery |
187
|
|
|
*/ |
188
|
|
|
function htmlqp($document = null, $selector = null, array $options = [])
{
    // $options is declared as array so this signature agrees with the
    // documented "@param array $options" and with qp() and html5qp(); a
    // non-array value is now rejected here instead of being passed along.
    //
    // All arguments are forwarded unchanged to QueryPath::withHTML().
    return QueryPath::withHTML($document, $selector, $options);
}
193
|
|
|
|
194
|
|
|
/** |
195
|
|
|
* Parse HTML5 documents. |
196
|
|
|
* |
197
|
|
|
* This uses HTML5-PHP to parse the document. In actuality, this parser does |
198
|
|
|
* a fine job with pre-HTML5 documents in most cases, though really old HTML |
199
|
|
|
* (like 2.0) may have some substantial quirks. |
200
|
|
|
* |
201
|
|
|
* <b>Supported Options</b> |
202
|
|
|
* Any options supported by HTML5-PHP are allowed here. Additionally, the |
203
|
|
|
* following options have meaning to QueryPath. |
204
|
|
|
* - QueryPath_class |
205
|
|
|
* |
206
|
|
|
* |
207
|
|
|
* @param mixed $document |
|
|
|
|
208
|
|
|
* @param string $selector |
209
|
|
|
* A CSS3 selector. |
210
|
|
|
* |
211
|
|
|
* @param array $options |
212
|
|
|
* An associative array of options, which is passed on into HTML5-PHP. Note |
213
|
|
|
* that the standard QueryPath options may be ignored for this function, |
214
|
|
|
* since it uses a different parser. |
215
|
|
|
* |
216
|
|
|
* @return QueryPath |
217
|
|
|
*/ |
218
|
|
|
function html5qp($document = null, $selector = null, array $options = [])
{
    // Pass all three arguments, unchanged, to the HTML5 factory method and
    // return its result directly.
    $query = QueryPath::withHTML5($document, $selector, $options);

    return $query;
}
222
|
|
|
|