Completed
Push — fix/normalize-www-in-site-url-... ( e67e76 )
by
unknown
13:13 queued 02:59
created

Jetpack_Media_Meta_Extractor::extract()   B

Complexity

Conditions 5
Paths 16

Size

Total Lines 32
Code Lines 16

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 5
eloc 16
nc 16
nop 3
dl 0
loc 32
rs 8.439
c 0
b 0
f 0
1
<?php
2
/**
3
 * Class with methods to extract metadata from a post/page about videos, images, links, mentions embedded
4
 * in or attached to the post/page.
5
 *
6
 * @todo Additionally, have some filters on number of items in each field
0 ignored issues
show
Coding Style introduced by
Comment refers to a TODO task

This check looks for TODO comments that have been left in the code.

``TODO``s show that something is left unfinished and should be attended to.

Loading history...
7
 */
8
class Jetpack_Media_Meta_Extractor {
9
10
	// Some consts for what to extract
11
	const ALL = 255;
12
	const LINKS = 1;
13
	const MENTIONS = 2;
14
	const IMAGES = 4;
15
	const SHORTCODES = 8; // Only the keeper shortcodes below
16
	const EMBEDS = 16;
17
	const HASHTAGS = 32;
18
19
	// For these, we try to extract some data from the shortcode, rather than just recording its presence (which we do for all)
20
	// There should be a function jetpack_shortcode_get_{shortcode}_id( $atts ) or a static method {Shortcode}Shortcode::get_{shortcode}_id( $atts ) for these.
21
	private static $KEEPER_SHORTCODES = array(
22
		'youtube',
23
		'vimeo',
24
		'hulu',
25
		'ted',
26
		'wpvideo',
27
	);
28
29
	/**
	 * Gets the specified media and meta info from the given post.
	 * NOTE: If you have the post's HTML content already and don't need image data, use extract_from_content() instead.
	 *
	 * The post title and post content are joined and capped at 100000 bytes before extraction.
	 *
	 * @param int $blog_id The ID of the blog
	 * @param int $post_id The ID of the post
	 * @param int $what_to_extract A mask of things to extract, e.g. Jetpack_Media_Meta_Extractor::IMAGES | Jetpack_Media_Meta_Extractor::MENTIONS
	 * @return array A structure containing metadata about the embedded things, or an empty array if nothing was found or the post does not exist
	 */
	public static function extract( $blog_id, $post_id, $what_to_extract = self::ALL ) {

		// multisite? Only switch (and later restore) when switch_to_blog() is defined.
		$can_switch_blog = function_exists( 'switch_to_blog' );
		if ( $can_switch_blog ) {
			switch_to_blog( $blog_id );
		}

		$post = get_post( $post_id );

		// No such post: undo the blog switch and bail out instead of reading properties of a non-object.
		if ( empty( $post ) ) {
			if ( $can_switch_blog ) {
				restore_current_blog();
			}
			return array();
		}

		$content = $post->post_title . "\n\n" . $post->post_content;

		// Prevent running extraction on really huge amounts of content (about 20k English words).
		if ( strlen( $content ) > 100000 ) {
			// mb_strcut() cuts by bytes but never in the middle of a multibyte character; a split
			// character would leave invalid UTF-8 behind and make the /u regexes used later fail.
			if ( function_exists( 'mb_strcut' ) ) {
				$content = mb_strcut( $content, 0, 100000, 'UTF-8' );
			} else {
				$content = substr( $content, 0, 100000 );
			}
		}

		$extracted = array();

		// Get images first, we need the full post for that
		if ( self::IMAGES & $what_to_extract ) {
			$extracted = self::get_image_fields( $post );

			// Clear the IMAGES bit so extract_from_content() below does not look for images again
			$what_to_extract = $what_to_extract & ~self::IMAGES;
		}

		if ( $can_switch_blog ) {
			restore_current_blog();
		}

		// All of the other things besides images can be extracted from just the content
		return self::extract_from_content( $content, $what_to_extract, $extracted );
	}
70
71
	/**
	 * Gets the specified meta info from the given post content.
	 *
	 * NOTE: If you want IMAGES, prefer extract( $blog_id, $post_id, ... ), which additionally looks at the
	 * post's featured image, slideshows and galleries. When IMAGES is requested here, images are only
	 * looked up in the tag-stripped copy of $content.
	 *
	 * HASHTAGS are only extracted when the IS_WPCOM constant is defined and truthy.
	 *
	 * @param string $content The HTML post_content of a post
	 * @param int $what_to_extract A mask of things to extract, e.g. Jetpack_Media_Meta_Extractor::IMAGES | Jetpack_Media_Meta_Extractor::MENTIONS
	 * @param array $already_extracted Previously extracted things, e.g. images from extract(), which can be used for x-referencing here
	 * @return array A structure containing metadata about the embedded things, or an empty array if nothing was found
	 */
	public static function extract_from_content( $content, $what_to_extract = self::ALL, $already_extracted = array() ) {
		$stripped_content = self::get_stripped_content( $content );

		// Maybe start with some previously extracted things (e.g. images from extract())
		$extracted = $already_extracted;

		// Embedded media objects will have already been converted to shortcodes by pre_kses hooks on save.

		if ( self::IMAGES & $what_to_extract ) {
			$images    = self::extract_images_from_content( $stripped_content, array() );
			$extracted = array_merge( $extracted, $images );
		}

		// ----------------------------------- MENTIONS ------------------------------

		if ( self::MENTIONS & $what_to_extract ) {
			if ( preg_match_all( '/(^|\s)@(\w+)/u', $stripped_content, $matches ) ) {
				$mentions = self::get_unique_lowercased( $matches[2] );
				$extracted['mention'] = array( 'name' => $mentions );
				if ( ! isset( $extracted['has'] ) ) {
					$extracted['has'] = array();
				}
				$extracted['has']['mention'] = count( $mentions );
			}
		}

		// ----------------------------------- HASHTAGS ------------------------------
		/** Some hosts may not compile with --enable-unicode-properties and kick a warning:
		  *   Warning: preg_match_all() [function.preg-match-all]: Compilation failed: support for \P, \p, and \X has not been compiled
		  * Therefore, we only run this code block on wpcom, not in Jetpack.
		 */
		if ( ( defined( 'IS_WPCOM' ) && IS_WPCOM ) && ( self::HASHTAGS & $what_to_extract ) ) {
			//This regex does not exactly match Twitter's
			// if there are problems/complaints we should implement this:
			//   https://github.com/twitter/twitter-text/blob/master/java/src/com/twitter/Regex.java
			if ( preg_match_all( '/(?:^|\s)#(\w*\p{L}+\w*)/u', $stripped_content, $matches ) ) {
				$hashtags = self::get_unique_lowercased( $matches[1] );
				$extracted['hashtag'] = array( 'name' => $hashtags );
				if ( ! isset( $extracted['has'] ) ) {
					$extracted['has'] = array();
				}
				$extracted['has']['hashtag'] = count( $hashtags );
			}
		}

		// ----------------------------------- SHORTCODES ------------------------------

		// Always look for shortcodes.
		// If we don't want them, we'll just remove them, so we don't grab them as links below
		$shortcode_pattern = '/' . get_shortcode_regex() . '/s';
		if ( preg_match_all( $shortcode_pattern, $content, $matches ) ) {

			if ( self::SHORTCODES & $what_to_extract ) {

				$shortcode_total_count = 0;
				$shortcode_type_counts = array();
				$shortcode_types       = array();
				$shortcode_details     = array();

				foreach ( $matches[2] as $key => $shortcode ) {
					//Elasticsearch (and probably other things) doesn't deal well with some chars as key names
					$shortcode_name = preg_replace( '/[.,*"\'\/\\\\#+ ]/', '_', $shortcode );

					$attr = shortcode_parse_atts( $matches[3][ $key ] );

					$shortcode_total_count++;
					if ( ! isset( $shortcode_type_counts[ $shortcode_name ] ) ) {
						$shortcode_type_counts[ $shortcode_name ] = 0;
					}
					$shortcode_type_counts[ $shortcode_name ]++;

					// Store (uniquely) presence of all shortcode regardless of whether it's a keeper (for those, get ID below)
					// @todo Store number of occurrences?
					if ( ! in_array( $shortcode_name, $shortcode_types, true ) ) {
						$shortcode_types[] = $shortcode_name;
					}

					// For keeper shortcodes, also store the id/url of the object (e.g. youtube video, TED talk, etc.)
					if ( in_array( $shortcode, self::$KEEPER_SHORTCODES, true ) ) {
						// Start clean so an ID from the previous shortcode is never reused
						$id = null;

						// We'll try to get the salient ID from the function jetpack_shortcode_get_xyz_id()
						// If the shortcode is a class, we'll call XyzShortcode::get_xyz_id()
						$shortcode_get_id_func   = "jetpack_shortcode_get_{$shortcode}_id";
						$shortcode_class_name    = ucfirst( $shortcode ) . 'Shortcode';
						$shortcode_get_id_method = "get_{$shortcode}_id";
						if ( function_exists( $shortcode_get_id_func ) ) {
							$id = call_user_func( $shortcode_get_id_func, $attr );
						} else if ( method_exists( $shortcode_class_name, $shortcode_get_id_method ) ) {
							$id = call_user_func( array( $shortcode_class_name, $shortcode_get_id_method ), $attr );
						}

						// Record each non-empty ID once per shortcode type
						if ( ! empty( $id )
							&& ( ! isset( $shortcode_details[ $shortcode_name ] ) || ! in_array( $id, $shortcode_details[ $shortcode_name ] ) ) ) {
							$shortcode_details[ $shortcode_name ][] = $id;
						}
					}
				}

				if ( $shortcode_total_count > 0 ) {
					// Add the shortcode info to the $extracted array
					if ( ! isset( $extracted['has'] ) ) {
						$extracted['has'] = array();
					}
					$extracted['has']['shortcode'] = $shortcode_total_count;
					$extracted['shortcode']        = array();
					foreach ( $shortcode_type_counts as $type => $count ) {
						$extracted['shortcode'][ $type ] = array( 'count' => $count );
					}
					if ( ! empty( $shortcode_types ) ) {
						$extracted['shortcode_types'] = $shortcode_types;
					}
					foreach ( $shortcode_details as $type => $ids ) {
						$extracted['shortcode'][ $type ]['id'] = $ids;
					}
				}
			}

			// Remove the shortcodes from our copy of $content, so we don't count links in them as links below.
			$content = preg_replace( $shortcode_pattern, ' ', $content );
		}

		// ----------------------------------- LINKS ------------------------------

		if ( self::LINKS & $what_to_extract ) {

			// To hold the extracted stuff we find
			$links = array();

			// URLs of images that were already extracted; links pointing at them are not reported again.
			// Handles both a list of array( 'url' => ... ) entries and a single 'url' key.
			$image_urls = array();
			if ( isset( $extracted['image'] ) && is_array( $extracted['image'] ) ) {
				if ( isset( $extracted['image']['url'] ) ) {
					$image_urls = (array) $extracted['image']['url'];
				}
				foreach ( $extracted['image'] as $image ) {
					if ( is_array( $image ) && isset( $image['url'] ) && is_string( $image['url'] ) ) {
						$image_urls[] = $image['url'];
					}
				}
			}

			// @todo Get the text inside the links?

			// Grab any links, whether in <a href="..." or not, but subtract those from shortcodes and images
			// (we treat embed links as just another link)
			if ( preg_match_all( '#(?:^|\s|"|\')(https?://([^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/))))#', $content, $matches ) ) {

				foreach ( $matches[1] as $link_raw ) {
					// Remove large (and likely invalid) links
					if ( 4096 < strlen( $link_raw ) ) {
						continue;
					}

					// Skip anything parse_url() cannot split into at least a scheme and a host
					$url = parse_url( $link_raw );
					if ( ! is_array( $url ) || ! isset( $url['scheme'], $url['host'] ) ) {
						continue;
					}

					// Build a simple form of the URL so we can compare it to the image URLs and exclude those
					$simple_url = $url['scheme'] . '://' . $url['host'] . ( ! empty( $url['path'] ) ? $url['path'] : '' );
					if ( in_array( $simple_url, $image_urls, true ) ) {
						continue;
					}

					// Limit of 2: a later '://' (e.g. a URL passed in the query string) stays part of the link
					$link_parts         = explode( '://', $link_raw, 2 );
					$link_all_but_proto = $link_parts[1];

					// @todo Check unique before adding
					$links[] = array(
						'url'           => $link_all_but_proto,
						// Hostname with its dot-separated labels in reverse order
						'host_reversed' => implode( '.', array_reverse( explode( '.', $url['host'] ) ) ),
						'host'          => $url['host'],
					);
				}
			}

			$link_count = count( $links );
			if ( $link_count ) {
				$extracted['link'] = $links;
				if ( ! isset( $extracted['has'] ) ) {
					$extracted['has'] = array();
				}
				$extracted['has']['link'] = $link_count;
			}
		}

		// ----------------------------------- EMBEDS ------------------------------

		//Embeds are just individual links on their own line
		if ( self::EMBEDS & $what_to_extract ) {

			if ( ! function_exists( '_wp_oembed_get_object' ) ) {
				include( ABSPATH . WPINC . '/class-oembed.php' );
			}

			// get an oembed object
			$oembed = _wp_oembed_get_object();

			// Grab any links on their own lines that may be embeds
			if ( preg_match_all( '|^\s*(https?://[^\s"]+)\s*$|im', $content, $matches ) ) {

				// To hold the extracted stuff we find
				$embeds = array();

				foreach ( $matches[1] as $link_raw ) {
					// Limit of 2: a later '://' inside the URL stays part of the link
					$link_parts         = explode( '://', $link_raw, 2 );
					$link_all_but_proto = $link_parts[1];

					// Check whether this "link" is really an embed.
					foreach ( $oembed->providers as $matchmask => $data ) {
						// $data[1] tells whether $matchmask is already a regex

						// Turn the asterisk-type provider URLs into regex
						if ( empty( $data[1] ) ) {
							$matchmask = '#' . str_replace( '___wildcard___', '(.+)', preg_quote( str_replace( '*', '___wildcard___', $matchmask ), '#' ) ) . '#i';
							$matchmask = preg_replace( '|^#http\\\://|', '#https?\://', $matchmask );
						}

						if ( preg_match( $matchmask, $link_raw ) ) {
							$embeds[] = $link_all_but_proto; // @todo Check unique before adding

							// @todo Try to get ID's for the ones we care about (shortcode_keepers)
							break;
						}
					}
				}

				if ( ! empty( $embeds ) ) {
					if ( ! isset( $extracted['has'] ) ) {
						$extracted['has'] = array();
					}
					$extracted['has']['embed'] = count( $embeds );
					$extracted['embed']        = array( 'url' => $embeds );
				}
			}
		}

		return $extracted;
	}

	/**
	 * Lowercases a list of names and removes duplicates.
	 *
	 * Lowercasing happens before de-duplication so that names differing only in case are kept once.
	 *
	 * @param array $names Strings, e.g. one capture group of preg_match_all()
	 * @return array Unique lowercased names, re-indexed from 0
	 */
	private static function get_unique_lowercased( $names ) {
		// array_unique() retains the keys, hence array_values()
		return array_values( array_unique( array_map( 'strtolower', $names ) ) );
	}
325
326
	/**
	 * Collects image URLs for a post from its featured image, slideshows, galleries and inline HTML.
	 *
	 * Uses Jetpack Post Images
	 *
	 * @param object $post A post object; its ID and post_content are read
	 * @param array $args Optional args: 'width' and 'height', the required minimum dimensions (default 200 each),
	 *                    passed to the featured-image and slideshow lookups
	 * @return array The structure built by build_image_struct(), with a 'gallery' entry added under 'has',
	 *               or an empty array when no image was found
	 */
	private static function get_image_fields( $post, $args = array() ) {

		$defaults = array(
			'width'  => 200, // Required minimum width (if possible to determine)
			'height' => 200, // Required minimum height (if possible to determine)
		);

		$args = wp_parse_args( $args, $defaults );

		$image_list  = array();
		$has_gallery = 0;

		$from_featured_image = Jetpack_PostImages::from_thumbnail( $post->ID, $args['width'], $args['height'] );
		if ( ! empty( $from_featured_image ) ) {
			$image_list = array_merge( $image_list, wp_list_pluck( $from_featured_image, 'src' ) );
		}

		$from_slideshow = Jetpack_PostImages::from_slideshow( $post->ID, $args['width'], $args['height'] );
		if ( ! empty( $from_slideshow ) ) {
			$image_list = array_merge( $image_list, wp_list_pluck( $from_slideshow, 'src' ) );
		}

		$from_gallery = Jetpack_PostImages::from_gallery( $post->ID );
		if ( ! empty( $from_gallery ) ) {
			$image_list  = array_merge( $image_list, wp_list_pluck( $from_gallery, 'src' ) );
			$has_gallery = 1; // @todo This is a presence flag, not a count of galleries
		}

		// @todo Can we check width/height of these efficiently?  Could maybe use query args at least, before we strip them out
		$image_list = self::get_images_from_html( $post->post_content, $image_list );

		$image_struct = self::build_image_struct( $image_list );

		// Report the gallery flag alongside the image count instead of discarding it
		if ( ! empty( $image_struct ) ) {
			if ( ! isset( $image_struct['has'] ) ) {
				$image_struct['has'] = array();
			}
			$image_struct['has']['gallery'] = $has_gallery;
		}

		return $image_struct;
	}
370
371
	/**
	 * Finds the images in a piece of markup and returns them as an image structure.
	 *
	 * @param string $content Markup to search for images
	 * @param array $image_list Image URLs that are already known; newly found ones are added to these
	 * @return array The result of build_image_struct() for the combined URL list
	 */
	public static function extract_images_from_content( $content, $image_list ) {
		return self::build_image_struct(
			self::get_images_from_html( $content, $image_list )
		);
	}
375
376
	/**
	 * Wraps a flat list of image URLs into the structure used for extracted images.
	 *
	 * @param array $image_list Image URLs; duplicates are dropped
	 * @return array Empty array for an empty list, otherwise
	 *               array( 'image' => array( array( 'url' => ... ), ... ), 'has' => array( 'image' => <count> ) )
	 */
	public static function build_image_struct( $image_list ) {
		if ( empty( $image_list ) ) {
			return array();
		}

		$images = array();
		foreach ( array_unique( $image_list ) as $image_url ) {
			$images[] = array( 'url' => $image_url );
		}

		return array(
			'image' => $images,
			'has'   => array( 'image' => count( $images ) ),
		);
	}
391
392
	/**
	 * Extracts image URLs from markup, strips their query strings and adds the new ones to a list.
	 *
	 * @param string $html Some markup, possibly containing image tags
	 * @param array $images_already_extracted (just an array of image URLs without query strings, no special structure), used for de-duplication
	 * @return array $images_already_extracted plus the image URLs found in the HTML, stripped of query params and de-duped
	 */
	public static function get_images_from_html( $html, $images_already_extracted ) {
		$image_list = $images_already_extracted;
		$from_html  = Jetpack_PostImages::from_html( $html );
		if ( empty( $from_html ) ) {
			return $image_list;
		}

		foreach ( wp_list_pluck( $from_html, 'src' ) as $image_url ) {
			$src         = parse_url( $image_url );
			$query_start = strpos( $image_url, '?' );

			if ( $src && isset( $src['scheme'], $src['host'], $src['path'] ) ) {
				// Rebuild the URL without the query string
				$queryless = $src['scheme'] . '://' . $src['host'] . $src['path'];
			} elseif ( $query_start ) {
				// If parse_url() didn't work, strip off the query string the old fashioned way
				$queryless = substr( $image_url, 0, $query_start );
			} else {
				// Failing that, there was no query string to strip
				$queryless = $image_url;
			}

			// Discard URLs that are longer than 4KB, these are likely data URIs or malformed HTML.
			if ( 4096 < strlen( $queryless ) ) {
				continue;
			}

			// Strict comparison so numeric-looking strings are not treated as equal by value
			if ( ! in_array( $queryless, $image_list, true ) ) {
				$image_list[] = $queryless;
			}
		}

		return $image_list;
	}
427
428
	/**
	 * Reduces post content to plain text.
	 *
	 * @param string $content Post content, possibly containing markup, entities and shortcodes
	 * @return string The content after strip_tags(), html_entity_decode() and strip_shortcodes(), applied in that order
	 */
	private static function get_stripped_content( $content ) {
		// Tags go first, then entities are decoded, and finally shortcodes
		// are stripped completely, including any content they enclose.
		return strip_shortcodes( html_entity_decode( strip_tags( $content ) ) );
	}
435
}
436