|
1
|
|
|
<?php |
|
2
|
|
|
/** |
|
3
|
|
|
* Class with methods to extract metadata from a post/page about videos, images, links, mentions embedded |
|
4
|
|
|
* in or attached to the post/page. |
|
5
|
|
|
* |
|
6
|
|
|
* @todo Additionally, have some filters on number of items in each field |
|
|
|
|
|
|
7
|
|
|
*/ |
|
8
|
|
|
class Jetpack_Media_Meta_Extractor {

	// Bit flags selecting what to extract. Combine them with "|" and pass the
	// result as $what_to_extract.
	const ALL        = 255;
	const LINKS      = 1;
	const MENTIONS   = 2;
	const IMAGES     = 4;
	const SHORTCODES = 8; // Presence/count of every shortcode; IDs only for the keeper shortcodes below
	const EMBEDS     = 16;
	const HASHTAGS   = 32;

	// For these, we try to extract some data from the shortcode, rather than just recording its presence (which we do for all).
	// The ID is looked up through a function jetpack_shortcode_get_{shortcode}_id( $atts ) or, failing that,
	// a static method {Shortcode}Shortcode::get_{shortcode}_id( $atts ).
	private static $KEEPER_SHORTCODES = array(
		'youtube',
		'vimeo',
		'hulu',
		'ted',
		'wpvideo',
		'audio',
	);

	/**
	 * Gets the specified media and meta info from the given post.
	 * NOTE: If you have the post's HTML content already and don't need image data, use extract_from_content() instead.
	 *
	 * The post title and content are concatenated and, when longer than 100000 bytes, truncated
	 * before anything other than images is extracted.
	 *
	 * @param $blog_id The ID of the blog
	 * @param $post_id The ID of the post
	 * @param $what_to_extract (int) A mask of things to extract, e.g. Jetpack_Media_Meta_Extractor::IMAGES | Jetpack_Media_Meta_Extractor::MENTIONS
	 * @returns array a structure containing metadata about the embedded things, or empty array if nothing found
	 *          or if the post could not be loaded
	 */
	public static function extract( $blog_id, $post_id, $what_to_extract = self::ALL ) {

		// multisite?
		$can_switch = function_exists( 'switch_to_blog' );
		if ( $can_switch ) {
			switch_to_blog( $blog_id );
		}

		$post = get_post( $post_id );

		// get_post() yields no object for unknown IDs; bail out (after undoing the
		// blog switch) instead of dereferencing it below.
		if ( ! is_object( $post ) ) {
			if ( $can_switch ) {
				restore_current_blog();
			}
			return array();
		}

		$content = $post->post_title . "\n\n" . $post->post_content;

		// Prevent running extraction on really huge amounts of content.
		if ( strlen( $content ) > 100000 ) { // about 20k English words
			if ( function_exists( 'mb_strcut' ) ) {
				// Cut on a character boundary: a split multi-byte sequence would make
				// the /u regexes in extract_from_content() fail on invalid UTF-8.
				$content = mb_strcut( $content, 0, 100000, 'UTF-8' );
			} else {
				$content = substr( $content, 0, 100000 );
			}
		}

		$extracted = array();

		// Get images first, we need the full post for that (and the blog still switched).
		if ( self::IMAGES & $what_to_extract ) {
			$extracted = self::get_image_fields( $post );

			// Turn off images so extract_from_content() below does not redo them.
			$what_to_extract &= ~self::IMAGES;
		}

		if ( $can_switch ) {
			restore_current_blog();
		}

		// All of the other things besides images can be extracted from just the content.
		return self::extract_from_content( $content, $what_to_extract, $extracted );
	}

	/**
	 * Gets the specified meta info from the given post content.
	 * NOTE: If you want IMAGES, call extract( $blog_id, $post_id, ...) which will give you more/better image extraction.
	 *
	 * NOTE(review): when IMAGES is requested here, the image extractor is handed the
	 * tag-stripped text, so it presumably cannot see <img> tags - confirm against
	 * Jetpack_PostImages::from_html() before relying on images from this method.
	 *
	 * Keys that may be added to the result: 'image', 'mention', 'hashtag', 'shortcode',
	 * 'shortcode_types', 'link', 'embed', plus 'has' which maps each found kind to its count.
	 *
	 * @param $content The HTML post_content of a post
	 * @param $what_to_extract (int) A mask of things to extract, e.g. Jetpack_Media_Meta_Extractor::IMAGES | Jetpack_Media_Meta_Extractor::MENTIONS
	 * @param $already_extracted (array) Previously extracted things, e.g. images from extract(), which can be used for x-referencing here
	 * @returns array a structure containing metadata about the embedded things, or empty array if nothing found
	 */
	public static function extract_from_content( $content, $what_to_extract = self::ALL, $already_extracted = array() ) {
		$stripped_content = self::get_stripped_content( $content );

		// Maybe start with some previously extracted things (e.g. images from extract()).
		$extracted = $already_extracted;

		// Embedded media objects will have already been converted to shortcodes by pre_kses hooks on save.

		// ----------------------------------- IMAGES ------------------------------

		if ( self::IMAGES & $what_to_extract ) {
			$images = self::extract_images_from_content( $stripped_content, array() );
			if ( ! empty( $images ) ) {
				// array_merge() alone would replace a pre-existing 'has' map wholesale,
				// losing counts the caller handed in; merge the two maps instead.
				$previous_has = ( isset( $extracted['has'] ) && is_array( $extracted['has'] ) ) ? $extracted['has'] : array();
				$extracted    = array_merge( $extracted, $images );
				if ( isset( $images['has'] ) && is_array( $images['has'] ) ) {
					$extracted['has'] = array_merge( $previous_has, $images['has'] );
				}
			}
		}

		// ----------------------------------- MENTIONS ------------------------------

		if ( self::MENTIONS & $what_to_extract ) {
			if ( preg_match_all( '/(^|\s)@(\w+)/u', $stripped_content, $matches ) ) {
				$mentions                    = self::normalize_names( $matches[2] );
				$extracted['mention']        = array( 'name' => $mentions );
				$extracted['has']['mention'] = count( $mentions );
			}
		}

		// ----------------------------------- HASHTAGS ------------------------------
		/** Some hosts may not compile with --enable-unicode-properties and kick a warning:
		 *   Warning: preg_match_all() [function.preg-match-all]: Compilation failed: support for \P, \p, and \X has not been compiled
		 * Therefore, we only run this code block on wpcom, not in Jetpack.
		 */
		if ( ( defined( 'IS_WPCOM' ) && IS_WPCOM ) && ( self::HASHTAGS & $what_to_extract ) ) {
			// This regex does not exactly match Twitter's;
			// if there are problems/complaints we should implement this:
			// https://github.com/twitter/twitter-text/blob/master/java/src/com/twitter/Regex.java
			if ( preg_match_all( '/(?:^|\s)#(\w*\p{L}+\w*)/u', $stripped_content, $matches ) ) {
				$hashtags                    = self::normalize_names( $matches[1] );
				$extracted['hashtag']        = array( 'name' => $hashtags );
				$extracted['has']['hashtag'] = count( $hashtags );
			}
		}

		// ----------------------------------- SHORTCODES ------------------------------

		// Always look for shortcodes.
		// If we don't want them, we'll just remove them, so we don't grab them as links below.
		$shortcode_pattern = '/' . get_shortcode_regex() . '/s';
		if ( preg_match_all( $shortcode_pattern, $content, $matches ) ) {

			if ( self::SHORTCODES & $what_to_extract ) {
				// $matches[2] holds the shortcode tags, $matches[3] their raw attribute strings.
				$extracted = self::add_shortcode_fields( $extracted, $matches[2], $matches[3] );
			}

			// Remove the shortcodes from our copy of $content, so we don't count links in them as links below.
			$content = preg_replace( $shortcode_pattern, ' ', $content );
		}

		// ----------------------------------- LINKS ------------------------------

		if ( self::LINKS & $what_to_extract ) {
			$links      = self::get_link_fields( $content, self::get_known_image_urls( $extracted ) );
			$link_count = count( $links );
			if ( $link_count ) {
				$extracted['link']        = $links;
				$extracted['has']['link'] = $link_count;
			}
		}

		// ----------------------------------- EMBEDS ------------------------------

		// Embeds are just individual links on their own line.
		if ( self::EMBEDS & $what_to_extract ) {
			$embeds = self::get_embed_urls( $content );
			if ( ! empty( $embeds ) ) {
				$extracted['has']['embed'] = count( $embeds );
				$extracted['embed']        = array( 'url' => $embeds );
			}
		}

		return $extracted;
	}

	/**
	 * Lower-cases a list of matched names and removes duplicates.
	 *
	 * Lower-casing happens BEFORE de-duplication so that "@Bob" and "@bob" collapse
	 * into a single entry and are counted once.
	 *
	 * @param array $names Raw names captured by a regex
	 * @return array Re-indexed list of unique, lower-cased names in first-seen order
	 */
	private static function normalize_names( $names ) {
		// array_unique() retains the keys, hence array_values().
		return array_values( array_unique( array_map( 'strtolower', $names ) ) );
	}

	/**
	 * Adds shortcode counts, types and (for keeper shortcodes) object IDs to $extracted.
	 *
	 * @param array $extracted  The structure built so far
	 * @param array $shortcodes Shortcode tags, one entry per occurrence
	 * @param array $raw_attrs  Raw attribute strings, keyed like $shortcodes
	 * @return array $extracted with 'has'['shortcode'], 'shortcode' and 'shortcode_types' filled in
	 */
	private static function add_shortcode_fields( $extracted, $shortcodes, $raw_attrs ) {
		$shortcode_total_count = 0;
		$shortcode_type_counts = array();
		$shortcode_types       = array();
		$shortcode_details     = array();

		foreach ( $shortcodes as $key => $shortcode ) {
			// Elasticsearch (and probably other things) doesn't deal well with some chars as key names.
			$shortcode_name = preg_replace( '/[.,*"\'\/\\\\#+ ]/', '_', $shortcode );

			$attr = shortcode_parse_atts( $raw_attrs[ $key ] );

			$shortcode_total_count++;

			// Store (uniquely) presence of all shortcodes regardless of whether it's a keeper (for those, get ID below).
			if ( ! isset( $shortcode_type_counts[ $shortcode_name ] ) ) {
				$shortcode_type_counts[ $shortcode_name ] = 0;
				$shortcode_types[]                        = $shortcode_name;
			}
			$shortcode_type_counts[ $shortcode_name ]++;

			// For keeper shortcodes, also store the id/url of the object (e.g. youtube video, TED talk, etc.)
			if ( ! in_array( $shortcode, self::$KEEPER_SHORTCODES, true ) ) {
				continue;
			}

			// We'll try to get the salient ID from the function jetpack_shortcode_get_xyz_id().
			// If the shortcode is a class, we'll call XyzShortcode::get_xyz_id().
			$id                      = null;
			$shortcode_get_id_func   = "jetpack_shortcode_get_{$shortcode}_id";
			$shortcode_class_name    = ucfirst( $shortcode ) . 'Shortcode';
			$shortcode_get_id_method = "get_{$shortcode}_id";
			if ( function_exists( $shortcode_get_id_func ) ) {
				$id = call_user_func( $shortcode_get_id_func, $attr );
			} elseif ( method_exists( $shortcode_class_name, $shortcode_get_id_method ) ) {
				$id = call_user_func( array( $shortcode_class_name, $shortcode_get_id_method ), $attr );
			}

			// Record each ID once per shortcode type.
			if ( ! empty( $id )
				&& ( ! isset( $shortcode_details[ $shortcode_name ] ) || ! in_array( $id, $shortcode_details[ $shortcode_name ] ) ) ) {
				$shortcode_details[ $shortcode_name ][] = $id;
			}
		}

		if ( $shortcode_total_count > 0 ) {
			// Add the shortcode info to the $extracted array.
			$extracted['has']['shortcode'] = $shortcode_total_count;
			$extracted['shortcode']        = array();
			foreach ( $shortcode_type_counts as $type => $count ) {
				$extracted['shortcode'][ $type ] = array( 'count' => $count );
			}
			if ( ! empty( $shortcode_types ) ) {
				$extracted['shortcode_types'] = $shortcode_types;
			}
			foreach ( $shortcode_details as $type => $ids ) {
				$extracted['shortcode'][ $type ]['id'] = $ids;
			}
		}

		return $extracted;
	}

	/**
	 * Collects the image URLs already present in an extracted structure.
	 *
	 * Understands both the list shape produced by build_image_struct()
	 * ( 'image' => array( array( 'url' => ... ), ... ) ) and the flat shape
	 * ( 'image' => array( 'url' => ... ) ) that the link filter used to look for.
	 *
	 * @param array $extracted The structure built so far
	 * @return array Map of image URL => true, for cheap isset() lookups
	 */
	private static function get_known_image_urls( $extracted ) {
		$known = array();

		if ( empty( $extracted['image'] ) || ! is_array( $extracted['image'] ) ) {
			return $known;
		}

		if ( isset( $extracted['image']['url'] ) ) {
			foreach ( (array) $extracted['image']['url'] as $image_url ) {
				if ( is_string( $image_url ) ) {
					$known[ $image_url ] = true;
				}
			}
		}

		foreach ( $extracted['image'] as $image ) {
			if ( is_array( $image ) && isset( $image['url'] ) && is_string( $image['url'] ) ) {
				$known[ $image['url'] ] = true;
			}
		}

		return $known;
	}

	/**
	 * Finds http(s) links in $content, skipping unparsable URLs, oversized URLs and known images.
	 *
	 * @todo Get the text inside the links?
	 * @todo Check unique before adding
	 *
	 * @param string $content          Content with shortcodes already removed
	 * @param array  $known_image_urls Map of image URL => true; matching links are skipped
	 * @return array List of array( 'url', 'host_reversed', 'host' ) entries; 'url' has no protocol
	 */
	private static function get_link_fields( $content, $known_image_urls ) {
		// To hold the extracted stuff we find.
		$links = array();

		// Grab any links, whether in <a href="..." or not, but subtract those from shortcodes and images
		// (we treat embed links as just another link).
		if ( ! preg_match_all( '#(?:^|\s|"|\')(https?://([^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/))))#', $content, $matches ) ) {
			return $links;
		}

		foreach ( $matches[1] as $link_raw ) {
			// Remove large (and likely invalid) links.
			if ( 4096 < strlen( $link_raw ) ) {
				continue;
			}

			// parse_url() returns false on seriously malformed URLs and may omit the host.
			$url = parse_url( $link_raw );
			if ( ! is_array( $url ) || ! isset( $url['scheme'], $url['host'] ) ) {
				continue;
			}

			// Data URI links.
			if ( 'data' === $url['scheme'] ) {
				continue;
			}

			// Build a simple form of the URL so we can compare it to ones we found in IMAGES and exclude those.
			$simple_url = $url['scheme'] . '://' . $url['host'] . ( ! empty( $url['path'] ) ? $url['path'] : '' );
			if ( isset( $known_image_urls[ $simple_url ] ) ) {
				continue;
			}

			// Limit of 2: keep everything after the FIRST '://' even when the URL
			// carries another one, e.g. in a query argument.
			$link_parts         = explode( '://', $link_raw, 2 );
			$link_all_but_proto = isset( $link_parts[1] ) ? $link_parts[1] : '';

			// Build a reversed hostname, e.g. "www.example.com" => "com.example.www".
			$host_reversed = implode( '.', array_reverse( explode( '.', $url['host'] ) ) );

			$links[] = array(
				'url'           => $link_all_but_proto,
				'host_reversed' => $host_reversed,
				'host'          => $url['host'],
			);
		}

		return $links;
	}

	/**
	 * Finds links standing alone on a line that match a registered oEmbed provider.
	 *
	 * @todo Check unique before adding
	 * @todo Try to get ID's for the ones we care about (shortcode_keepers)
	 *
	 * @param string $content Content with shortcodes already removed
	 * @return array List of embed URLs without their protocol
	 */
	private static function get_embed_urls( $content ) {
		$embeds = array();

		if ( ! function_exists( '_wp_oembed_get_object' ) ) {
			include_once( ABSPATH . WPINC . '/class-oembed.php' );
		}
		if ( ! function_exists( '_wp_oembed_get_object' ) ) {
			// oEmbed support could not be loaded; nothing can be recognised as an embed.
			return $embeds;
		}

		// Get an oembed object.
		$oembed = _wp_oembed_get_object();

		// Grab any links on their own lines that may be embeds.
		if ( ! preg_match_all( '|^\s*(https?://[^\s"]+)\s*$|im', $content, $matches ) ) {
			return $embeds;
		}

		// Compile the provider patterns once instead of once per link.
		$provider_patterns = array();
		foreach ( $oembed->providers as $matchmask => $data ) {
			// $data is array( provider URL, is-regex flag ).
			// Turn the asterisk-type provider URLs into regex.
			if ( empty( $data[1] ) ) {
				$matchmask = '#' . str_replace( '___wildcard___', '(.+)', preg_quote( str_replace( '*', '___wildcard___', $matchmask ), '#' ) ) . '#i';
				$matchmask = preg_replace( '|^#http\\\://|', '#https?\://', $matchmask );
			}
			$provider_patterns[] = $matchmask;
		}

		foreach ( $matches[1] as $link_raw ) {
			$link_parts         = explode( '://', $link_raw, 2 );
			$link_all_but_proto = isset( $link_parts[1] ) ? $link_parts[1] : '';

			// Check whether this "link" is really an embed.
			foreach ( $provider_patterns as $pattern ) {
				if ( preg_match( $pattern, $link_raw ) ) {
					$embeds[] = $link_all_but_proto;
					break;
				}
			}
		}

		return $embeds;
	}

	/**
	 * Collects images attached to or embedded in a post.
	 *
	 * Sources, in order: featured image, slideshow, gallery, then the post content HTML.
	 * Uses Jetpack Post Images.
	 *
	 * @param $post A post object
	 * @param $args (array) Optional args, see defaults list for details
	 * @returns array Image structure as produced by build_image_struct(), or empty array if no images
	 */
	private static function get_image_fields( $post, $args = array() ) {

		$defaults = array(
			'width'  => 200, // Required minimum width (if possible to determine)
			'height' => 200, // Required minimum height (if possible to determine)
		);

		$args = wp_parse_args( $args, $defaults );

		$image_list     = array();
		$image_booleans = array( 'gallery' => 0 );

		$from_featured_image = Jetpack_PostImages::from_thumbnail( $post->ID, $args['width'], $args['height'] );
		$image_list          = self::append_image_srcs( $image_list, $from_featured_image );

		$from_slideshow = Jetpack_PostImages::from_slideshow( $post->ID, $args['width'], $args['height'] );
		$image_list     = self::append_image_srcs( $image_list, $from_slideshow );

		$from_gallery = Jetpack_PostImages::from_gallery( $post->ID );
		if ( ! empty( $from_gallery ) ) {
			$image_list = self::append_image_srcs( $image_list, $from_gallery );
			$image_booleans['gallery']++; // @todo This count isn't correct, will only ever count 1
		}

		// @todo Can we check width/height of these efficiently?  Could maybe use query args at least, before we strip them out
		$image_list = self::get_images_from_html( $post->post_content, $image_list );

		// Hand the gallery flag over so it ends up in 'has' instead of being dropped.
		return self::build_image_struct( $image_list, $image_booleans );
	}

	/**
	 * Appends the 'src' of every image record to a list of image URLs.
	 *
	 * @param array $image_list Image URLs collected so far
	 * @param mixed $images     Image records carrying a 'src' field; anything empty is ignored
	 * @return array The extended list
	 */
	private static function append_image_srcs( $image_list, $images ) {
		if ( empty( $images ) ) {
			return $image_list;
		}
		return array_merge( $image_list, wp_list_pluck( $images, 'src' ) );
	}

	/**
	 * Extracts images from HTML content and wraps them in the image structure.
	 *
	 * @param string $content    Some markup, possibly containing image tags
	 * @param array  $image_list Image URLs already known, used for de-duplication
	 * @return array Image structure as produced by build_image_struct(), or empty array if no images
	 */
	public static function extract_images_from_content( $content, $image_list ) {
		$image_list = self::get_images_from_html( $content, $image_list );
		return self::build_image_struct( $image_list );
	}

	/**
	 * Wraps a flat list of image URLs in the structure used by the extractor.
	 *
	 * @param array $image_list     Image URLs; duplicates are dropped
	 * @param array $image_booleans Optional extra entries for the 'has' map (e.g. 'gallery' => 1)
	 * @return array array( 'image' => list of array( 'url' => ... ), 'has' => counts incl. 'image' ),
	 *               or empty array when $image_list is empty
	 */
	public static function build_image_struct( $image_list, $image_booleans = array() ) {
		if ( empty( $image_list ) ) {
			return array();
		}

		$retval = array( 'image' => array() );
		foreach ( array_unique( $image_list ) as $img ) {
			$retval['image'][] = array( 'url' => $img );
		}

		if ( ! is_array( $image_booleans ) ) {
			$image_booleans = array();
		}
		$image_booleans['image'] = count( $retval['image'] );
		$retval['has']           = $image_booleans;

		return $retval;
	}

	/**
	 * Extracts image URLs from markup, without their query strings.
	 *
	 * @param string $html Some markup, possibly containing image tags
	 * @param array $images_already_extracted (just an array of image URLs without query strings, no special structure), used for de-duplication
	 * @return array Image URLs extracted from the HTML, stripped of query params and de-duped
	 */
	public static function get_images_from_html( $html, $images_already_extracted ) {
		$image_list = $images_already_extracted;

		$from_html = Jetpack_PostImages::from_html( $html );
		if ( empty( $from_html ) ) {
			return $image_list;
		}

		foreach ( wp_list_pluck( $from_html, 'src' ) as $image_url ) {
			$src          = parse_url( $image_url );
			$query_offset = strpos( $image_url, '?' );

			if ( $src && isset( $src['scheme'], $src['host'], $src['path'] ) ) {
				// Rebuild the URL without the query string.
				$queryless = $src['scheme'] . '://' . $src['host'] . $src['path'];
			} elseif ( $query_offset ) {
				// If parse_url() didn't work, strip off the query string the old fashioned way.
				$queryless = substr( $image_url, 0, $query_offset );
			} else {
				// Failing that, there was no query string to strip.
				$queryless = $image_url;
			}

			// Discard URLs that are longer than 4KB, these are likely data URIs or malformed HTML.
			if ( 4096 < strlen( $queryless ) ) {
				continue;
			}

			if ( ! in_array( $queryless, $image_list, true ) ) {
				$image_list[] = $queryless;
			}
		}

		return $image_list;
	}

	/**
	 * Reduces HTML content to plain text for mention/hashtag matching.
	 *
	 * @param string $content HTML content
	 * @return string Content with tags removed, entities decoded and shortcodes stripped
	 */
	private static function get_stripped_content( $content ) {
		$clean_content = strip_tags( $content );
		// Flags and charset are pinned: the defaults differ between PHP versions, and
		// the /u regexes applied to this text need valid UTF-8.
		$clean_content = html_entity_decode( $clean_content, ENT_COMPAT, 'UTF-8' );
		// Completely strip shortcodes and any content they enclose.
		$clean_content = strip_shortcodes( $clean_content );
		return $clean_content;
	}
}
|
437
|
|
|
|
This check looks for `TODO` comments that have been left in the code.
`TODO`s show that something is left unfinished and should be attended to.