Issues in class.media-extractor.php - New Issues - Inspection of "Normalize URLs containing www subdomain" - Automattic/jetpack - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — fix/normalize-www-in-site-url-... ( e67e76 )

unknown

created 2016-07-12 18:52 UTC

_inc/lib/class.media-extractor.php (5 issues)

Labels

Coding Style 5

Severity

Informational 5

Upgrade to new PHP Analysis Engine

These results are based on our legacy PHP analysis, consider migrating to our new PHP analysis engine instead. Learn more

<?php
/**
 * Class with methods to extract metadata from a post/page about videos, images, links, mentions embedded
 * in or attached to the post/page.
 *
 * @todo Additionally, have some filters on number of items in each field
 */
class Jetpack_Media_Meta_Extractor {

	// Bit flags for what to extract; combine them with | to build a $what_to_extract mask
	const ALL = 255;
	const LINKS = 1;
	const MENTIONS = 2;
	const IMAGES = 4;
	const SHORTCODES = 8; // Presence and counts for every shortcode; IDs only for the keeper shortcodes below
	const EMBEDS = 16;
	const HASHTAGS = 32;

	// For these, we try to extract some data from the shortcode, rather than just recording its presence (which we do for all)
	// There should be a function jetpack_shortcode_get_{shortcode}_id( $atts ) or a static method {Shortcode}Shortcode::get_{shortcode}_id( $atts ) for these.
	private static $KEEPER_SHORTCODES = array(
		'youtube',
		'vimeo',
		'hulu',
		'ted',
		'wpvideo',
	);

	/**
	 * Gets the specified media and meta info from the given post.
	 * NOTE: If you have the post's HTML content already and don't need image data, use extract_from_content() instead.
	 *
	 * On multisite (i.e. when switch_to_blog() exists) the post is looked up on $blog_id and the
	 * current blog is restored before returning.
	 *
	 * @param int $blog_id The ID of the blog
	 * @param int $post_id The ID of the post
	 * @param int $what_to_extract A mask of things to extract, e.g. Jetpack_Media_Meta_Extractor::IMAGES | Jetpack_Media_Meta_Extractor::MENTIONS
	 * @return array A structure containing metadata about the embedded things, or an empty array if the post
	 *               could not be loaded or nothing was found
	 */
	static public function extract( $blog_id, $post_id, $what_to_extract = self::ALL ) {

		// multisite?
		$can_switch = function_exists( 'switch_to_blog' );
		if ( $can_switch ) {
			switch_to_blog( $blog_id );
		}

		$post = get_post( $post_id );

		// Nothing to extract from a post that could not be loaded; still undo the blog switch.
		if ( ! is_object( $post ) ) {
			if ( $can_switch ) {
				restore_current_blog();
			}
			return array();
		}

		$content = $post->post_title . "\n\n" . $post->post_content;

		// Prevent running extraction on really huge amounts of content (about 20k English words).
		// NOTE(review): this is a byte-wise cut, so it may split a multibyte character at the boundary.
		if ( strlen( $content ) > 100000 ) {
			$content = substr( $content, 0, 100000 );
		}

		$extracted = array();

		// Get images first, we need the full post for that
		if ( self::IMAGES & $what_to_extract ) {
			$extracted = self::get_image_fields( $post );

			// Clear the IMAGES bit so extract_from_content() below does not look for images again
			$what_to_extract = $what_to_extract & ~self::IMAGES;
		}

		if ( $can_switch ) {
			restore_current_blog();
		}

		// All of the other things besides images can be extracted from just the content
		return self::extract_from_content( $content, $what_to_extract, $extracted );
	}

	/**
	 * Gets the specified meta info from the given post content.
	 *
	 * NOTE: If you want IMAGES, call extract( $blog_id, $post_id, ...) which will give you more/better image extraction.
	 * When IMAGES is requested here, only the tag-stripped copy of $content is handed to
	 * extract_images_from_content().
	 *
	 * HASHTAGS are only extracted when the IS_WPCOM constant is defined and truthy.
	 * Shortcodes are always located and removed from the working copy of $content so that URLs inside
	 * them are not reported as links or embeds; they are only reported when SHORTCODES is requested.
	 *
	 * @param string $content The HTML post_content of a post
	 * @param int $what_to_extract A mask of things to extract, e.g. Jetpack_Media_Meta_Extractor::IMAGES | Jetpack_Media_Meta_Extractor::MENTIONS
	 * @param array $already_extracted Previously extracted things, e.g. images from extract(), which can be used for x-referencing here
	 * @return array $already_extracted augmented with metadata about the embedded things that were found
	 *               (so an empty array if nothing was passed in and nothing was found)
	 */
	static public function extract_from_content( $content, $what_to_extract = self::ALL, $already_extracted = array() ) {
		$stripped_content = self::get_stripped_content( $content );

		// Maybe start with some previously extracted things (e.g. images from extract())
		$extracted = $already_extracted;

		// Embedded media objects will have already been converted to shortcodes by pre_kses hooks on save.

		if ( self::IMAGES & $what_to_extract ) {
			$images = self::extract_images_from_content( $stripped_content, array() );
			$extracted = array_merge( $extracted, $images );
		}

		// ----------------------------------- MENTIONS ------------------------------

		if ( self::MENTIONS & $what_to_extract ) {
			if ( preg_match_all( '/(^|\s)@(\w+)/u', $stripped_content, $matches ) ) {
				// Lowercase BEFORE de-duplicating so that "@Bob" and "@bob" are recorded (and counted) once.
				// array_values() is needed because array_unique() retains the keys!
				$mentions = array_values( array_unique( array_map( 'strtolower', $matches[2] ) ) );
				$extracted['mention'] = array( 'name' => $mentions );
				if ( ! isset( $extracted['has'] ) ) {
					$extracted['has'] = array();
				}
				$extracted['has']['mention'] = count( $mentions );
			}
		}

		// ----------------------------------- HASHTAGS ------------------------------
		/** Some hosts may not compile with --enable-unicode-properties and kick a warning:
		  *   Warning: preg_match_all() [function.preg-match-all]: Compilation failed: support for \P, \p, and \X has not been compiled
		  * Therefore, we only run this code block on wpcom, not in Jetpack.
		 */
		if ( ( defined( 'IS_WPCOM' ) && IS_WPCOM ) && ( self::HASHTAGS & $what_to_extract ) ) {
			//This regex does not exactly match Twitter's
			// if there are problems/complaints we should implement this:
			//   https://github.com/twitter/twitter-text/blob/master/java/src/com/twitter/Regex.java
			if ( preg_match_all( '/(?:^|\s)#(\w*\p{L}+\w*)/u', $stripped_content, $matches ) ) {
				// Same ordering as for mentions: lowercase first, then de-duplicate and re-index.
				$hashtags = array_values( array_unique( array_map( 'strtolower', $matches[1] ) ) );
				$extracted['hashtag'] = array( 'name' => $hashtags );
				if ( ! isset( $extracted['has'] ) ) {
					$extracted['has'] = array();
				}
				$extracted['has']['hashtag'] = count( $hashtags );
			}
		}

		// ----------------------------------- SHORTCODES ------------------------------

		// Always look for shortcodes.
		// If we don't want them, we'll just remove them, so we don't grab them as links below
		$shortcode_pattern = '/' . get_shortcode_regex() . '/s';
		if ( preg_match_all( $shortcode_pattern, $content, $matches ) ) {

			$shortcode_total_count = 0;
			$shortcode_type_counts = array();
			$shortcode_types = array();
			$shortcode_details = array();

			if ( self::SHORTCODES & $what_to_extract ) {

				// $matches[2] holds the shortcode names, $matches[3] the raw attribute strings
				foreach ( $matches[2] as $key => $shortcode ) {
					//Elasticsearch (and probably other things) doesn't deal well with some chars as key names
					$shortcode_name = preg_replace( '/[.,*"\'\/\\\\#+ ]/', '_', $shortcode );

					$attr = shortcode_parse_atts( $matches[3][ $key ] );

					$shortcode_total_count++;
					if ( ! isset( $shortcode_type_counts[$shortcode_name] ) ) {
						$shortcode_type_counts[$shortcode_name] = 0;
					}
					$shortcode_type_counts[$shortcode_name]++;

					// Store (uniquely) presence of all shortcode regardless of whether it's a keeper (for those, get ID below)
					// @todo Store number of occurrences?
					if ( ! in_array( $shortcode_name, $shortcode_types, true ) ) {
						$shortcode_types[] = $shortcode_name;
					}

					// For keeper shortcodes, also store the id/url of the object (e.g. youtube video, TED talk, etc.)
					if ( in_array( $shortcode, self::$KEEPER_SHORTCODES, true ) ) {
						unset( $id ); // Clear shortcode ID data left from the last shortcode
						// We'll try to get the salient ID from the function jetpack_shortcode_get_xyz_id()
						// If the shortcode is a class, we'll call XyzShortcode::get_xyz_id()
						$shortcode_get_id_func = "jetpack_shortcode_get_{$shortcode}_id";
						$shortcode_class_name = ucfirst( $shortcode ) . 'Shortcode';
						$shortcode_get_id_method = "get_{$shortcode}_id";
						if ( function_exists( $shortcode_get_id_func ) ) {
							$id = call_user_func( $shortcode_get_id_func, $attr );
						} else if ( method_exists( $shortcode_class_name, $shortcode_get_id_method ) ) {
							$id = call_user_func( array( $shortcode_class_name, $shortcode_get_id_method ), $attr );
						}
						// Record each ID only once per shortcode type
						if ( ! empty( $id )
							&& ( ! isset( $shortcode_details[$shortcode_name] ) || ! in_array( $id, $shortcode_details[$shortcode_name] ) ) ) {
							$shortcode_details[$shortcode_name][] = $id;
						}
					}
				}

				if ( $shortcode_total_count > 0 ) {
					// Add the shortcode info to the $extracted array
					if ( ! isset( $extracted['has'] ) ) {
						$extracted['has'] = array();
					}
					$extracted['has']['shortcode'] = $shortcode_total_count;
					$extracted['shortcode'] = array();
					foreach ( $shortcode_type_counts as $type => $count ) {
						$extracted['shortcode'][$type] = array( 'count' => $count );
					}
					if ( ! empty( $shortcode_types ) ) {
						$extracted['shortcode_types'] = $shortcode_types;
					}
					foreach ( $shortcode_details as $type => $id ) {
						$extracted['shortcode'][$type]['id'] = $id;
					}
				}
			}

			// Remove the shortcodes from our copy of $content, so we don't count links in them as links below.
			$content = preg_replace( $shortcode_pattern, ' ', $content );

		}

		// ----------------------------------- LINKS ------------------------------

		if ( self::LINKS & $what_to_extract ) {

			// To hold the extracted stuff we find
			$links = array();

			// URLs of images that were already extracted, so they are not reported again as links.
			// build_image_struct() stores images as a list of array( 'url' => ... ) entries; the flat
			// array( 'url' => ... ) shape is still honored for callers that pass it in.
			$known_image_urls = array();
			if ( isset( $extracted['image'] ) && is_array( $extracted['image'] ) ) {
				if ( isset( $extracted['image']['url'] ) ) {
					$known_image_urls = (array) $extracted['image']['url'];
				}
				foreach ( $extracted['image'] as $image ) {
					if ( is_array( $image ) && isset( $image['url'] ) && is_string( $image['url'] ) ) {
						$known_image_urls[] = $image['url'];
					}
				}
			}

			// @todo Get the text inside the links?

			// Grab any links, whether in <a href="..." or not, but subtract those from shortcodes and images
			// (we treat embed links as just another link)
			if ( preg_match_all( '#(?:^|\s|"|\')(https?://([^\s()<>]+(?:\([\w\d]+\)|([^[:punct:]\s]|/))))#', $content, $matches ) ) {

				foreach ( $matches[1] as $link_raw ) {
					// Remove large (and likely invalid) links
					if ( 4096 < strlen( $link_raw ) ) {
						continue;
					}

					$url = parse_url( $link_raw );

					// parse_url() returns false for seriously malformed URLs and may omit the host;
					// both pieces are required below, so skip anything that lacks them.
					if ( ! isset( $url['scheme'], $url['host'] ) ) {
						continue;
					}

					// Data URI links
					if ( 'data' === $url['scheme'] ) {
						continue;
					}

					// Build a simple form of the URL so we can compare it to ones we found in IMAGES or SHORTCODES and exclude those
					$simple_url = $url['scheme'] . '://' . $url['host'] . ( ! empty( $url['path'] ) ? $url['path'] : '' );
					if ( in_array( $simple_url, $known_image_urls, true ) ) {
						continue;
					}

					// Limit the split to two parts so a "://" later in the URL (e.g. inside a query arg) stays intact
					list( , $link_all_but_proto ) = explode( '://', $link_raw, 2 );

					// Build a reversed hostname (e.g. "www.example.com" => "com.example.www")
					$host_reversed = implode( '.', array_reverse( explode( '.', $url['host'] ) ) );

					// @todo Check unique before adding
					$links[] = array(
						'url' => $link_all_but_proto,
						'host_reversed' => $host_reversed,
						'host' => $url['host'],
					);
				}

			}

			$link_count = count( $links );
			if ( $link_count ) {
				$extracted[ 'link' ] = $links;
				if ( ! isset( $extracted['has'] ) ) {
					$extracted['has'] = array();
				}
				$extracted['has']['link'] = $link_count;
			}
		}

		// ----------------------------------- EMBEDS ------------------------------

		//Embeds are just individual links on their own line
		if ( self::EMBEDS & $what_to_extract ) {

			if ( ! function_exists( '_wp_oembed_get_object' ) ) {
				include( ABSPATH . WPINC . '/class-oembed.php' );
			}

			// get an oembed object
			$oembed = _wp_oembed_get_object();

			// Grab any links on their own lines that may be embeds
			if ( preg_match_all( '|^\s*(https?://[^\s"]+)\s*$|im', $content, $matches ) ) {

				// To hold the extracted stuff we find
				$embeds = array();

				foreach ( $matches[1] as $link_raw ) {
					// Limit the split to two parts so a "://" later in the URL stays intact
					list( , $link_all_but_proto ) = explode( '://', $link_raw, 2 );

					// Check whether this "link" is really an embed.
					foreach ( $oembed->providers as $matchmask => $data ) {
						// $data is array( provider URL, is-regex flag ); only the flag matters here
						$is_regex = ! empty( $data[1] );

						// Turn the asterisk-type provider URLs into regex
						if ( ! $is_regex ) {
							$matchmask = '#' . str_replace( '___wildcard___', '(.+)', preg_quote( str_replace( '*', '___wildcard___', $matchmask ), '#' ) ) . '#i';
							$matchmask = preg_replace( '|^#http\\\://|', '#https?\://', $matchmask );
						}

						if ( preg_match( $matchmask, $link_raw ) ) {
							$embeds[] = $link_all_but_proto; // @todo Check unique before adding

							// @todo Try to get ID's for the ones we care about (shortcode_keepers)
							break;
						}
					}
				}

				if ( ! empty( $embeds ) ) {
					if ( ! isset( $extracted['has'] ) ) {
						$extracted['has'] = array();
					}
					$extracted['has']['embed'] = count( $embeds );
					$extracted['embed'] = array( 'url' => $embeds );
				}
			}
		}

		return $extracted;
	}

	/**
	 * Collects the images belonging to a post and returns them in the extractor's result structure.
	 *
	 * Images are gathered, in this order, from the post's thumbnail, slideshow and gallery
	 * (via Jetpack_PostImages) and finally from the image tags in the post content.
	 *
	 * @param object $post A post object; its ID and post_content are read
	 * @param array $args Optional args, see defaults list for details
	 * @return array The result of build_image_struct() for the collected image URLs
	 *
	 * Uses Jetpack Post Images
	 */
	private static function get_image_fields( $post, $args = array() ) {

		$defaults = array(
			'width'               => 200, // Required minimum width (if possible to determine)
			'height'              => 200, // Required minimum height (if possible to determine)
		);

		$args = wp_parse_args( $args, $defaults );

		// Every source yields a list of image arrays; only their 'src' entries are of interest here.
		$image_sources = array(
			Jetpack_PostImages::from_thumbnail( $post->ID, $args['width'], $args['height'] ),
			Jetpack_PostImages::from_slideshow( $post->ID, $args['width'], $args['height'] ),
			Jetpack_PostImages::from_gallery( $post->ID ),
		);

		$image_list = array();
		foreach ( $image_sources as $found_images ) {
			if ( ! empty( $found_images ) ) {
				$image_list = array_merge( $image_list, wp_list_pluck( $found_images, 'src' ) );
			}
		}

		// @todo Can we check width/height of these efficiently?  Could maybe use query args at least, before we strip them out
		$image_list = self::get_images_from_html( $post->post_content, $image_list );

		// @todo Report how many galleries were found; build_image_struct() currently only counts images
		return self::build_image_struct( $image_list );
	}

	/**
	 * Extracts images from markup and returns them in the extractor's result structure.
	 *
	 * @param string $content Markup to scan; passed to get_images_from_html()
	 * @param array $image_list Image URLs that are already known; passed to get_images_from_html()
	 * @return array The result of build_image_struct() for the combined list of image URLs
	 */
	public static function extract_images_from_content( $content, $image_list ) {
		return self::build_image_struct(
			self::get_images_from_html( $content, $image_list )
		);
	}

	/**
	 * Wraps a flat list of image URLs in the extractor's result structure.
	 *
	 * @param array $image_list Image URLs; duplicates are dropped
	 * @return array An empty array when $image_list is empty, otherwise
	 *               array( 'image' => list of array( 'url' => ... ), 'has' => array( 'image' => count ) )
	 */
	public static function build_image_struct( $image_list ) {
		// Guard clause: no images, no structure
		if ( empty( $image_list ) ) {
			return array();
		}

		// One array( 'url' => ... ) entry per distinct URL, re-indexed from 0
		$entries = array();
		foreach ( array_unique( $image_list ) as $image_url ) {
			$entries[] = array( 'url' => $image_url );
		}

		return array(
			'image' => $entries,
			'has'   => array( 'image' => count( $entries ) ),
		);
	}

	/**
	 * Collects the image URLs found in a piece of markup, minus their query strings.
	 *
	 * @param string $html Some markup, possibly containing image tags
	 * @param array $images_already_extracted (just an array of image URLs without query strings, no special structure), used for de-duplication
	 * @return array $images_already_extracted followed by any new image URLs from the HTML, stripped of query params
	 */
	public static function get_images_from_html( $html, $images_already_extracted ) {
		$collected = $images_already_extracted;
		$found = Jetpack_PostImages::from_html( $html );

		// Nothing in the markup: hand back what we were given
		if ( empty( $found ) ) {
			return $collected;
		}

		foreach ( wp_list_pluck( $found, 'src' ) as $candidate ) {
			$parts = parse_url( $candidate );
			$query_at = strpos( $candidate, '?' );

			if ( $parts && isset( $parts['scheme'], $parts['host'], $parts['path'] ) ) {
				// Rebuild the URL from its pieces, leaving the query string out
				$bare = $parts['scheme'] . '://' . $parts['host'] . $parts['path'];
			} elseif ( $query_at ) {
				// parse_url() was no help, so cut at the first '?' instead
				$bare = substr( $candidate, 0, $query_at );
			} else {
				// No query string to remove
				$bare = $candidate;
			}

			// URLs longer than 4KB are likely data URIs or malformed HTML; drop them
			if ( strlen( $bare ) > 4096 ) {
				continue;
			}

			// Already known, nothing to add
			if ( in_array( $bare, $collected ) ) {
				continue;
			}

			$collected[] = $bare;
		}

		return $collected;
	}

	/**
	 * Reduces post content to the plain text that the mention and hashtag patterns are run against.
	 *
	 * @param string $content Post content, possibly containing markup and shortcodes
	 * @return string The content after strip_tags(), html_entity_decode() and strip_shortcodes(), applied in that order
	 */
	private static function get_stripped_content( $content ) {
		// Innermost call runs first: drop tags, decode entities, then
		// completely strip shortcodes and any content they enclose
		return strip_shortcodes( html_entity_decode( strip_tags( $content ) ) );
	}
}


1		<?php
2		/**
3		* Class with methods to extract metadata from a post/page about videos, images, links, mentions embedded
4		* in or attached to the post/page.
5		*
6		* @todo Additionally, have some filters on number of items in each field
7		*/
8		class Jetpack_Media_Meta_Extractor {
9
10		// Some consts for what to extract
11		const ALL = 255;
12		const LINKS = 1;
13		const MENTIONS = 2;
14		const IMAGES = 4;
15		const SHORTCODES = 8; // Only the keeper shortcodes below
16		const EMBEDS = 16;
17		const HASHTAGS = 32;
18
19		// For these, we try to extract some data from the shortcode, rather than just recording its presence (which we do for all)
20		// There should be a function get_{shortcode}_id( $atts ) or static method SomethingShortcode::get_{shortcode}_id( $atts ) for these.
21		private static $KEEPER_SHORTCODES = array(
22		'youtube',
23		'vimeo',
24		'hulu',
25		'ted',
26		'wpvideo',
27		);
28
29		/**
30		* Gets the specified media and meta info from the given post.
31		* NOTE: If you have the post's HTML content already and don't need image data, use extract_from_content() instead.
32		*
33		* @param $blog_id The ID of the blog
34		* @param $post_id The ID of the post
35		* @param $what_to_extract (int) A mask of things to extract, e.g. Jetpack_Media_Meta_Extractor::IMAGES \| Jetpack_Media_Meta_Extractor::MENTIONS
36		* @returns a structure containing metadata about the embedded things, or empty array if nothing found, or WP_Error on error
37		*/
38		static public function extract( $blog_id, $post_id, $what_to_extract = self::ALL ) {
39
40		// multisite?
41		if ( function_exists( 'switch_to_blog') )
42		switch_to_blog( $blog_id );
43
44		$post = get_post( $post_id );
45		$content = $post->post_title . "\n\n" . $post->post_content;
46		$char_cnt = strlen( $content );
47
48		//prevent running extraction on really huge amounts of content
49		if ( $char_cnt > 100000 ) //about 20k English words
50		$content = substr( $content, 0, 100000 );
51
52		$extracted = array();
53
54		// Get images first, we need the full post for that
55		if ( self::IMAGES & $what_to_extract ) {
56		$extracted = self::get_image_fields( $post );
57
58		// Turn off images so we can safely call extract_from_content() below
59		$what_to_extract = $what_to_extract - self::IMAGES;
		0 ignored issues – show Coding Style introduced 2016-07-12 10:37 UTC by Report Bug Copy Issue Report Show Similar Issues like this Consider using a different name than the parameter `$what_to_extract`. This often makes code more readable. Loading history...
60		}
61
62		if ( function_exists( 'switch_to_blog') )
63		restore_current_blog();
64
65		// All of the other things besides images can be extracted from just the content
66		$extracted = self::extract_from_content( $content, $what_to_extract, $extracted );
67
68		return $extracted;
69		}
70
71		/**
72		* Gets the specified meta info from the given post content.
73		* NOTE: If you want IMAGES, call extract( $blog_id, $post_id, ...) which will give you more/better image extraction
74		* This method will give you an error if you ask for IMAGES.
75		*
76		* @param $content The HTML post_content of a post
77		* @param $what_to_extract (int) A mask of things to extract, e.g. Jetpack_Media_Meta_Extractor::IMAGES \| Jetpack_Media_Meta_Extractor::MENTIONS
78		* @param $already_extracted (array) Previously extracted things, e.g. images from extract(), which can be used for x-referencing here
79		* @returns a structure containing metadata about the embedded things, or empty array if nothing found, or WP_Error on error
80		*/
81		static public function extract_from_content( $content, $what_to_extract = self::ALL, $already_extracted = array() ) {
82		$stripped_content = self::get_stripped_content( $content );
83
84		// Maybe start with some previously extracted things (e.g. images from extract()
85		$extracted = $already_extracted;
86
87		// Embedded media objects will have already been converted to shortcodes by pre_kses hooks on save.
88
89		if ( self::IMAGES & $what_to_extract ) {
90		$images = Jetpack_Media_Meta_Extractor::extract_images_from_content( $stripped_content, array() );
91		$extracted = array_merge( $extracted, $images );
92		}
93
94		// ----------------------------------- MENTIONS ------------------------------
95
96	View Code Duplication	if ( self::MENTIONS & $what_to_extract ) {
97		if ( preg_match_all( '/(^\|\s)@(\w+)/u', $stripped_content, $matches ) ) {
98		$mentions = array_values( array_unique( $matches[2] ) ); //array_unique() retains the keys!
99		$mentions = array_map( 'strtolower', $mentions );
100		$extracted['mention'] = array( 'name' => $mentions );
101		if ( !isset( $extracted['has'] ) )
102		$extracted['has'] = array();
103		$extracted['has']['mention'] = count( $mentions );
104		}
105		}
106
107		// ----------------------------------- HASHTAGS ------------------------------
108		/** Some hosts may not compile with --enable-unicode-properties and kick a warning:
109		* Warning: preg_match_all() [function.preg-match-all]: Compilation failed: support for \P, \p, and \X has not been compiled
110		* Therefore, we only run this code block on wpcom, not in Jetpack.
111		*/
112		if ( ( defined( 'IS_WPCOM' ) && IS_WPCOM ) && ( self::HASHTAGS & $what_to_extract ) ) {
113		//This regex does not exactly match Twitter's
114		// if there are problems/complaints we should implement this:
115		// https://github.com/twitter/twitter-text/blob/master/java/src/com/twitter/Regex.java
116	View Code Duplication	if ( preg_match_all( '/(?:^\|\s)#(\w\p{L}+\w)/u', $stripped_content, $matches ) ) {
117		$hashtags = array_values( array_unique( $matches[1] ) ); //array_unique() retains the keys!
118		$hashtags = array_map( 'strtolower', $hashtags );
119		$extracted['hashtag'] = array( 'name' => $hashtags );
120		if ( !isset( $extracted['has'] ) )
121		$extracted['has'] = array();
122		$extracted['has']['hashtag'] = count( $hashtags );
123		}
124		}
125
126		// ----------------------------------- SHORTCODES ------------------------------
127
128		// Always look for shortcodes.
129		// If we don't want them, we'll just remove them, so we don't grab them as links below
130		$shortcode_pattern = '/' . get_shortcode_regex() . '/s';
131		if ( preg_match_all( $shortcode_pattern, $content, $matches ) ) {
132
133		$shortcode_total_count = 0;
134		$shortcode_type_counts = array();
135		$shortcode_types = array();
136		$shortcode_details = array();
137
138		if ( self::SHORTCODES & $what_to_extract ) {
139
140		foreach( $matches[2] as $key => $shortcode ) {
141		//Elasticsearch (and probably other things) doesn't deal well with some chars as key names
142		$shortcode_name = preg_replace( '/[.,*"\'\/\\\\#+ ]/', '_', $shortcode );
143
144		$attr = shortcode_parse_atts( $matches[3][ $key ] );
145
146		$shortcode_total_count++;
147		if ( ! isset( $shortcode_type_counts[$shortcode_name] ) )
148		$shortcode_type_counts[$shortcode_name] = 0;
149		$shortcode_type_counts[$shortcode_name]++;
150
151		// Store (uniquely) presence of all shortcode regardless of whether it's a keeper (for those, get ID below)
152		// @todo Store number of occurrences?
153		if ( ! in_array( $shortcode_name, $shortcode_types ) )
154		$shortcode_types[] = $shortcode_name;
155
156		// For keeper shortcodes, also store the id/url of the object (e.g. youtube video, TED talk, etc.)
157		if ( in_array( $shortcode, self::$KEEPER_SHORTCODES ) ) {
158		unset( $id ); // Clear shortcode ID data left from the last shortcode
159		// We'll try to get the salient ID from the function jetpack_shortcode_get_xyz_id()
160		// If the shortcode is a class, we'll call XyzShortcode::get_xyz_id()
161		$shortcode_get_id_func = "jetpack_shortcode_get_{$shortcode}_id";
162		$shortcode_class_name = ucfirst( $shortcode ) . 'Shortcode';
163		$shortcode_get_id_method = "get_{$shortcode}_id";
164		if ( function_exists( $shortcode_get_id_func ) ) {
165		$id = call_user_func( $shortcode_get_id_func, $attr );
166		} else if ( method_exists( $shortcode_class_name, $shortcode_get_id_method ) ) {
167		$id = call_user_func( array( $shortcode_class_name, $shortcode_get_id_method ), $attr );
168		}
169		if ( ! empty( $id )
170		&& ( ! isset( $shortcode_details[$shortcode_name] ) \|\| ! in_array( $id, $shortcode_details[$shortcode_name] ) ) )
171		$shortcode_details[$shortcode_name][] = $id;
172		}
173		}
174
175		if ( $shortcode_total_count > 0 ) {
176		// Add the shortcode info to the $extracted array
177		if ( !isset( $extracted['has'] ) )
178		$extracted['has'] = array();
179		$extracted['has']['shortcode'] = $shortcode_total_count;
180		$extracted['shortcode'] = array();
181		foreach ( $shortcode_type_counts as $type => $count )
182		$extracted['shortcode'][$type] = array( 'count' => $count );
183		if ( ! empty( $shortcode_types ) )
184		$extracted['shortcode_types'] = $shortcode_types;
185		foreach ( $shortcode_details as $type => $id )
186		$extracted['shortcode'][$type]['id'] = $id;
187		}
188		}
189
190		// Remove the shortcodes form our copy of $content, so we don't count links in them as links below.
191		$content = preg_replace( $shortcode_pattern, ' ', $content );
		0 ignored issues – show Coding Style introduced 2016-07-12 10:37 UTC by Report Bug Copy Issue Report Show Similar Issues like this Consider using a different name than the parameter `$content`. This often makes code more readable. Loading history...
192		}
193
194		// ----------------------------------- LINKS ------------------------------
195
196		if ( self::LINKS & $what_to_extract ) {
197
198		// To hold the extracted stuff we find
199		$links = array();
200
201		// @todo Get the text inside the links?
202
203		// Grab any links, whether in <a href="..." or not, but subtract those from shortcodes and images
204		// (we treat embed links as just another link)
205		if ( preg_match_all( '#(?:^\|\s\|"\|\')(https?://([^\s()<>]+(?:\([\w\d]+\)\|([^[:punct:]\s]\|/))))#', $content, $matches ) ) {
206
207		foreach ( $matches[1] as $link_raw ) {
208		$url = parse_url( $link_raw );
209
210		// Data URI links
211		if ( isset( $url['scheme'] ) && 'data' === $url['scheme'] )
212		continue;
213
214		// Remove large (and likely invalid) links
215		if ( 4096 < strlen( $link_raw ) )
216		continue;
217
218		// Build a simple form of the URL so we can compare it to ones we found in IMAGES or SHORTCODES and exclude those
219		$simple_url = $url['scheme'] . '://' . $url['host'] . ( ! empty( $url['path'] ) ? $url['path'] : '' );
220		if ( isset( $extracted['image']['url'] ) ) {
221		if ( in_array( $simple_url, (array) $extracted['image']['url'] ) )
222		continue;
223		}
224
225		list( $proto, $link_all_but_proto ) = explode( '://', $link_raw );
226
227		// Build a reversed hostname
228		$host_parts = array_reverse( explode( '.', $url['host'] ) );
229		$host_reversed = '';
230		foreach ( $host_parts as $part ) {
231		$host_reversed .= ( ! empty( $host_reversed ) ? '.' : '' ) . $part;
232		}
233
234		$link_analyzed = '';
235		if ( !empty( $url['path'] ) ) {
236		// The whole path (no query args or fragments)
237		$path = substr( $url['path'], 1 ); // strip the leading '/'
238		$link_analyzed .= ( ! empty( $link_analyzed ) ? ' ' : '' ) . $path;
239
240		// The path split by /
241		$path_split = explode( '/', $path );
242		if ( count( $path_split ) > 1 ) {
243		$link_analyzed .= ' ' . implode( ' ', $path_split );
244		}
245
246		// The fragment
247		if ( ! empty( $url['fragment'] ) )
248		$link_analyzed .= ( ! empty( $link_analyzed ) ? ' ' : '' ) . $url['fragment'];
249		}
250
251		// @todo Check unique before adding
252		$links[] = array(
253		'url' => $link_all_but_proto,
254		'host_reversed' => $host_reversed,
255		'host' => $url['host'],
256		);
257		}
258
259		}
260
261		$link_count = count( $links );
262		if ( $link_count ) {
263		$extracted[ 'link' ] = $links;
264		if ( !isset( $extracted['has'] ) )
265		$extracted['has'] = array();
266		$extracted['has']['link'] = $link_count;
267		}
268		}
269
270		// ----------------------------------- EMBEDS ------------------------------
271
272		//Embeds are just individual links on their own line
273		if ( self::EMBEDS & $what_to_extract ) {
274
275		if ( !function_exists( '_wp_oembed_get_object' ) )
276		include( ABSPATH . WPINC . '/class-oembed.php' );
277
278		// get an oembed object
279		$oembed = _wp_oembed_get_object();
280
281		// Grab any links on their own lines that may be embeds
282		if ( preg_match_all( '\|^\s(https?://[^\s"]+)\s$\|im', $content, $matches ) ) {
283
284		// To hold the extracted stuff we find
285		$embeds = array();
286
287		foreach ( $matches[1] as $link_raw ) {
288		$url = parse_url( $link_raw );
289
290		list( $proto, $link_all_but_proto ) = explode( '://', $link_raw );
291
292		// Check whether this "link" is really an embed.
293		foreach ( $oembed->providers as $matchmask => $data ) {
294		list( $providerurl, $regex ) = $data;
295
296		// Turn the asterisk-type provider URLs into regex
297		if ( !$regex ) {
298		$matchmask = '#' . str_replace( '___wildcard___', '(.+)', preg_quote( str_replace( '*', '___wildcard___', $matchmask ), '#' ) ) . '#i';
299		$matchmask = preg_replace( '\|^#http\\\://\|', '#https?\://', $matchmask );
300		}
301
302		if ( preg_match( $matchmask, $link_raw ) ) {
303		$provider = str_replace( '{format}', 'json', $providerurl ); // JSON is easier to deal with than XML
304		$embeds[] = $link_all_but_proto; // @todo Check unique before adding
305
306		// @todo Try to get ID's for the ones we care about (shortcode_keepers)
307		break;
308		}
309		}
310		}
311
312		if ( ! empty( $embeds ) ) {
313		if ( !isset( $extracted['has'] ) )
314		$extracted['has'] = array();
315		$extracted['has']['embed'] = count( $embeds );
316		$extracted['embed'] = array( 'url' => array() );
317		foreach ( $embeds as $e )
318		$extracted['embed']['url'][] = $e;
319		}
320		}
321		}
322
323		return $extracted;
324		}
325
/**
 * Builds the image metadata for a post from every image source we know about:
 * the featured image, slideshows, galleries and the images found in the post content.
 *
 * Uses Jetpack Post Images
 *
 * @param $post A post object; its ID and post_content are read
 * @param $args (array) Optional args: 'width' and 'height' are the required minimum dimensions
 *              (200 each by default), passed to the featured image and slideshow lookups
 * @returns array The structure produced by build_image_struct(), or an empty array if no images were found
 */
private static function get_image_fields( $post, $args = array() ) {
	$defaults = array(
		'width'  => 200, // Required minimum width (if possible to determine)
		'height' => 200, // Required minimum height (if possible to determine)
	);
	$settings = wp_parse_args( $args, $defaults );

	// Extra counters that are reported under 'has' next to the image count.
	$image_booleans = array( 'gallery' => 0 );

	// Listed in the order in which their URLs are merged into the result.
	$sources = array(
		'featured'  => Jetpack_PostImages::from_thumbnail( $post->ID, $settings['width'], $settings['height'] ),
		'slideshow' => Jetpack_PostImages::from_slideshow( $post->ID, $settings['width'], $settings['height'] ),
		'gallery'   => Jetpack_PostImages::from_gallery( $post->ID ),
	);

	$image_list = array();
	foreach ( $sources as $source => $images ) {
		if ( empty( $images ) ) {
			continue;
		}

		$image_list = array_merge( $image_list, wp_list_pluck( $images, 'src' ) );

		if ( 'gallery' === $source ) {
			$image_booleans['gallery']++; // @todo This count isn't correct, will only ever count 1
		}
	}

	// @todo Can we check width/height of these efficiently? Could maybe use query args at least, before we strip them out
	$image_list = self::get_images_from_html( $post->post_content, $image_list );

	// Hand the flags over explicitly; they used to be computed here and then silently dropped.
	return self::build_image_struct( $image_list, $image_booleans );
}

/**
 * Extracts image metadata from markup alone, for callers that have content but no post object.
 *
 * @param $content (string) Markup to search for images
 * @param $image_list (array) Image URLs that are already known, used for de-duplication
 * @returns array The structure produced by build_image_struct(), or an empty array if there are no images
 */
public static function extract_images_from_content( $content, $image_list ) {
	$all_images = self::get_images_from_html( $content, $image_list );

	return self::build_image_struct( $all_images );
}

/**
 * Turns a flat list of image URLs into the image metadata structure.
 *
 * @param $image_list (array) Image URLs; duplicates are dropped
 * @param $image_booleans (array) Optional extra counters to report under 'has' (e.g. 'gallery').
 *                        Its 'image' key is always overwritten with the number of unique images.
 * @returns array array( 'image' => array( array( 'url' => ... ), ... ), 'has' => array( ..., 'image' => (int) ) ),
 *                or an empty array when $image_list is empty
 */
public static function build_image_struct( $image_list, $image_booleans = array() ) {
	if ( empty( $image_list ) ) {
		return array();
	}

	$retval = array( 'image' => array() );
	foreach ( array_unique( $image_list ) as $img ) {
		$retval['image'][] = array( 'url' => $img );
	}

	// 'has' is never empty at this point because the image count is always recorded.
	$image_booleans['image'] = count( $retval['image'] );
	$retval['has']           = $image_booleans;

	return $retval;
}
391
/**
 * Collects the image URLs found in a chunk of markup, strips their query strings
 * and appends the ones that are not already known.
 *
 * @param string $html Some markup, possibly containing image tags
 * @param array $images_already_extracted (just an array of image URLs without query strings, no special structure), used for de-duplication
 * @return array $images_already_extracted followed by the new image URLs extracted from the HTML, stripped of query params and de-duped
 */
public static function get_images_from_html( $html, $images_already_extracted ) {
	$image_list = $images_already_extracted;

	$from_html = Jetpack_PostImages::from_html( $html );
	if ( empty( $from_html ) ) {
		return $image_list;
	}

	foreach ( wp_list_pluck( $from_html, 'src' ) as $image_url ) {
		$src = parse_url( $image_url );

		if ( $src && isset( $src['scheme'], $src['host'], $src['path'] ) ) {
			// Rebuild the URL without the query string. An explicit port has to be
			// kept, otherwise the result points at a different server.
			$queryless = $src['scheme'] . '://' . $src['host'];
			if ( isset( $src['port'] ) ) {
				$queryless .= ':' . $src['port'];
			}
			$queryless .= $src['path'];
		} else {
			$query_start = strpos( $image_url, '?' );
			if ( $query_start ) {
				// If parse_url() didn't work, strip off the query string the old fashioned way
				$queryless = substr( $image_url, 0, $query_start );
			} else {
				// No '?' found (or it is the very first character): keep the URL as it is
				$queryless = $image_url;
			}
		}

		// Discard URLs that are longer than 4KB, these are likely data URIs or malformed HTML.
		if ( 4096 < strlen( $queryless ) ) {
			continue;
		}

		// Strict comparison so that only identical strings are treated as duplicates.
		if ( ! in_array( $queryless, $image_list, true ) ) {
			$image_list[] = $queryless;
		}
	}

	return $image_list;
}
427
/**
 * Reduces content to text: tags are removed first, then HTML entities are
 * decoded, and finally shortcodes are stripped.
 *
 * @param $content (string) The content to clean up
 * @returns The cleaned-up content
 */
private static function get_stripped_content( $content ) {
	$decoded_text = html_entity_decode( strip_tags( $content ) );

	// Completely strip shortcodes and any content they enclose
	return strip_shortcodes( $decoded_text );
}
435		}
436

Automattic / jetpack

Push — fix/normalize-www-in-site-url-... ( e67e76 )

_inc/lib/class.media-extractor.php (5 issues)

Labels

Severity

Introduced By

Upgrade to new PHP Analysis Engine

Duplication Side-by-Side

Filter issues like