Parser::getHttpClient() - Code Metrics - Inspection of "MediaEmbed: disabled YouTube test because they see..." - s9e/TextFormatter - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 00d88c...61baac )

by Josh

created 2017-09-22 01:21 UTC

Parser::getHttpClient() A

↳ Parent: Parser

Complexity

Conditions	2
Paths	2

Size

Total Lines	10
Code Lines	5

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	0
CRAP Score	6

Importance

Changes

Metric	Value
dl	0
loc	10
ccs	0
cts	7
cp	0
rs	9.4285
c	0
b	0
f	0
cc	2
eloc	5
nc	2
nop	0
crap	6

<?php

/**
* @package   s9e\TextFormatter
* @copyright Copyright (c) 2010-2017 The s9e Authors
* @license   http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\TextFormatter\Plugins\MediaEmbed;

use s9e\TextFormatter\Parser as TagStack;
use s9e\TextFormatter\Parser\Tag;
use s9e\TextFormatter\Plugins\ParserBase;
use s9e\TextFormatter\Utils\Http;

class Parser extends ParserBase
{
	/**
	* @var \s9e\TextFormatter\Utils\Http\Client Client used to perform HTTP requests. Created on
	*      first use by getHttpClient() and then reused by every subsequent call
	*/
	protected static $client;

	/**
	* {@inheritdoc}
	*/
	public function parse($text, array $matches)
	{
		foreach ($matches as $m)
		{
			// Each match holds the matched URL at [0][0] and its position in the text at [0][1]
			$url = $m[0][0];
			$pos = $m[0][1];
			$len = strlen($url);

			// Give that tag priority over other tags such as Autolink's
			$tag = $this->parser->addSelfClosingTag($this->config['tagName'], $pos, $len, -10);
			$tag->setAttribute('url', $url);
		}
	}

	/**
	* Filter a MEDIA tag
	*
	* This will always invalidate the original tag, and possibly replace it with the tag that
	* corresponds to the media site. A "site" attribute takes precedence over a "url" attribute
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance, so that we can add the new tag to the stack
	* @param  array    $sites    Map of [host => siteId]
	* @return bool               Unconditionally FALSE
	*/
	public static function filterTag(Tag $tag, TagStack $tagStack, array $sites)
	{
		if ($tag->hasAttribute('site'))
		{
			self::addTagFromMediaId($tag, $tagStack, $sites);
		}
		elseif ($tag->hasAttribute('url'))
		{
			self::addTagFromMediaUrl($tag, $tagStack, $sites);
		}

		return false;
	}

	/**
	* Test whether a given tag has at least one non-default attribute
	*
	* @param  Tag  $tag The original tag
	* @return bool      Whether the tag contains an attribute not named "url"
	*/
	public static function hasNonDefaultAttribute(Tag $tag)
	{
		foreach ($tag->getAttributes() as $attrName => $void)
		{
			if ($attrName !== 'url')
			{
				return true;
			}
		}

		return false;
	}

	/**
	* Scrape the content of an URL to extract some data
	*
	* Nothing is scraped unless the tag has a "url" attribute that looks like an HTTP(S) URL. Every
	* entry of the scrape config is then applied in order
	*
	* @param  Tag         $tag          Source tag
	* @param  array       $scrapeConfig Array of scrape directives
	* @param  string|null $cacheDir     Path to the cache directory
	* @return bool                      Unconditionally TRUE
	*/
	public static function scrape(Tag $tag, array $scrapeConfig, $cacheDir = null)
	{
		if ($tag->hasAttribute('url'))
		{
			// Ensure that the URL actually looks like a URL if we want to use it to scrape
			$url = $tag->getAttribute('url');
			if (preg_match('#^https?://[^<>"\'\\s]+$#Di', $url))
			{
				// Lowercase the first five bytes, which hold the scheme ("http:" or "https"), so
				// that case-sensitive "match" regexps see a normalized scheme
				$url = strtolower(substr($url, 0, 5)) . substr($url, 5);
				foreach ($scrapeConfig as $scrape)
				{
					self::scrapeEntry($url, $tag, $scrape, $cacheDir);
				}
			}
		}

		return true;
	}

	//==============================================================================================
	// Internals
	//==============================================================================================

	/**
	* Add a site tag
	*
	* The new tag pair is named after the uppercased site ID. It spans from the start of the
	* original tag to the end of its end tag, or to the end of the original tag itself if it has no
	* end tag
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance, so that we can add the new tag to the stack
	* @param  string   $siteId   Site ID
	* @return void
	*/
	protected static function addSiteTag(Tag $tag, TagStack $tagStack, $siteId)
	{
		$endTag = $tag->getEndTag() ?: $tag;

		// Compute the boundaries of our new tag
		$lpos = $tag->getPos();
		$rpos = $endTag->getPos() + $endTag->getLen();

		// Create a new tag and copy this tag's attributes and priority
		$siteTag = $tagStack->addTagPair(strtoupper($siteId), $lpos, 0, $rpos, 0, $tag->getSortPriority());
		$siteTag->setAttributes($tag->getAttributes());
	}

	/**
	* Add a media site tag based on the attributes of a MEDIA tag
	*
	* The "site" attribute is lowercased and must be strictly equal to one of the values of $sites,
	* otherwise no tag is added
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance
	* @param  array    $sites    Map of [host => siteId]
	* @return void
	*/
	protected static function addTagFromMediaId(Tag $tag, TagStack $tagStack, array $sites)
	{
		$siteId = strtolower($tag->getAttribute('site'));
		if (in_array($siteId, $sites, true))
		{
			self::addSiteTag($tag, $tagStack, $siteId);
		}
	}

	/**
	* Add a media site tag based on the url attribute of a MEDIA tag
	*
	* The URL's scheme (looked up as "scheme:") is tried first, then its host. No tag is added if
	* neither resolves to a site ID
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance
	* @param  array    $sites    Map of [host => siteId]
	* @return void
	*/
	protected static function addTagFromMediaUrl(Tag $tag, TagStack $tagStack, array $sites)
	{
		// Start from a known value so that the final test never reads an undefined variable
		$siteId = false;

		// Capture the scheme and (if applicable) host of the URL
		$p = parse_url($tag->getAttribute('url'));
		if (isset($p['scheme']) && isset($sites[$p['scheme'] . ':']))
		{
			$siteId = $sites[$p['scheme'] . ':'];
		}
		elseif (isset($p['host']))
		{
			$siteId = self::findSiteIdByHost($p['host'], $sites);
		}

		if (!empty($siteId))
		{
			self::addSiteTag($tag, $tagStack, $siteId);
		}
	}

	/**
	* Match a given host to a site ID
	*
	* The host is first tried exactly as given. If that fails and the host contains uppercase
	* characters, the lookup is repeated with its lowercase form because hostnames are
	* case-insensitive. Any host that was matched as given resolves to the same site ID as before
	*
	* @param  string      $host  Host
	* @param  array       $sites Map of [host => siteId]
	* @return string|bool        Site ID or FALSE
	*/
	protected static function findSiteIdByHost($host, array $sites)
	{
		// array_unique() drops the second candidate if the host is already lowercase
		foreach (array_unique([$host, strtolower($host)]) as $candidate)
		{
			// Start with the full host then pop domain labels off the start until we get a match
			do
			{
				if (isset($sites[$candidate]))
				{
					return $sites[$candidate];
				}

				$pos = strpos($candidate, '.');
				if ($pos === false)
				{
					break;
				}

				$candidate = substr($candidate, 1 + $pos);
			}
			while ($candidate > '');
		}

		return false;
	}

	/**
	* Return a cached instance of the HTTP client
	*
	* The client is created on first call and stored in self::$client. Its timeout is reset to 10
	* on every call, including when the client already existed
	*
	* @return \s9e\TextFormatter\Utils\Http\Client
	*/
	protected static function getHttpClient()
	{
		if (!isset(self::$client))
		{
			self::$client = Http::getClient();
		}
		self::$client->timeout = 10;

		return self::$client;
	}

	/**
	* Replace {@var} tokens in given URL
	*
	* Tokens that have no corresponding entry in $vars are replaced with an empty string. Values are
	* inserted as-is, without any URL encoding
	*
	* @param  string   $url  Original URL
	* @param  string[] $vars Replacements
	* @return string         Modified URL
	*/
	protected static function replaceTokens($url, array $vars)
	{
		return preg_replace_callback(
			'#\\{@(\\w+)\\}#',
			function ($m) use ($vars)
			{
				return (isset($vars[$m[1]])) ? $vars[$m[1]] : '';
			},
			$url
		);
	}

	/**
	* Apply one scrape directive to given tag
	*
	* A directive is an array of [matchRegexps, extractRegexps, attrNames] with an optional fourth
	* element that holds the URL to scrape, in which {@var} tokens are replaced. The directive is
	* skipped if the tag already has every attribute listed in attrNames, or if the URL does not
	* match any of the "match" regexps
	*
	* @param  string      $url      Original URL
	* @param  Tag         $tag      Source tag
	* @param  array       $scrape   Array of scrape directives
	* @param  string|null $cacheDir Path to the cache directory
	* @return void
	*/
	protected static function scrapeEntry($url, Tag $tag, array $scrape, $cacheDir)
	{
		list($matchRegexps, $extractRegexps, $attrNames) = $scrape;

		if (!self::tagIsMissingAnyAttribute($tag, $attrNames))
		{
			return;
		}

		// Test whether this URL matches any regexp. Every regexp is tested, and captures from an
		// earlier regexp take precedence over captures of the same name from a later one
		$vars    = [];
		$matched = false;
		foreach ((array) $matchRegexps as $matchRegexp)
		{
			if (preg_match($matchRegexp, $url, $m))
			{
				$vars   += $m;
				$matched = true;
			}
		}
		if (!$matched)
		{
			return;
		}

		// Add the tag's attributes to the named captures from the "match" regexp. Captures take
		// precedence over attributes of the same name
		$vars += $tag->getAttributes();

		$scrapeUrl = (isset($scrape[3])) ? self::replaceTokens($scrape[3], $vars) : $url;
		self::scrapeUrl($scrapeUrl, $tag, (array) $extractRegexps, $cacheDir);
	}

	/**
	* Scrape a URL to help fill a tag's attributes
	*
	* Named captures from each matching regexp are copied to the tag, without overwriting any
	* attribute the tag already has
	*
	* @param  string      $url      URL to scrape
	* @param  Tag         $tag      Tag to fill
	* @param  string[]    $regexps  Regexps used to extract content from the page
	* @param  string|null $cacheDir Path to the cache directory
	* @return void
	*/
	protected static function scrapeUrl($url, Tag $tag, array $regexps, $cacheDir)
	{
		$content = self::wget($url, $cacheDir);

		// Execute the extract regexps and fill any missing attribute
		foreach ($regexps as $regexp)
		{
			if (preg_match($regexp, $content, $m))
			{
				foreach ($m as $k => $v)
				{
					// Numeric keys are positional captures, only named captures map to attributes
					if (!is_numeric($k) && !$tag->hasAttribute($k))
					{
						$tag->setAttribute($k, $v);
					}
				}
			}
		}
	}

	/**
	* Test whether a tag is missing any of given attributes
	*
	* @param  Tag      $tag       Tag to inspect
	* @param  string[] $attrNames Names of the attributes to look for
	* @return bool                TRUE if at least one of the attributes is not set on the tag
	*/
	protected static function tagIsMissingAnyAttribute(Tag $tag, array $attrNames)
	{
		foreach ($attrNames as $attrName)
		{
			if (!$tag->hasAttribute($attrName))
			{
				return true;
			}
		}

		return false;
	}

	/**
	* Retrieve external content (possibly from the cache)
	*
	* If the cache directory exists, the external content will be saved into it. Cached content is
	* never pruned. The cache key is derived from the URL stripped of its fragment. If a cache
	* entry exists but cannot be read, the content is retrieved from the source instead
	*
	* @param  string      $url      URL
	* @param  string|null $cacheDir Path to the cache directory
	* @return string|bool           External content, or whatever the HTTP client returned if the
	*                               request failed
	*/
	protected static function wget($url, $cacheDir = null)
	{
		$prefix = '';

		// Remove the fragment, it is not part of the resource's location
		$url = preg_replace('(#.*)s', '', $url);

		// Return the content from the cache if applicable
		if (isset($cacheDir) && file_exists($cacheDir))
		{
			$cacheFile = $cacheDir . '/http.' . crc32($url);
			if (extension_loaded('zlib'))
			{
				$prefix     = 'compress.zlib://';
				$cacheFile .= '.gz';
			}
			if (file_exists($cacheFile))
			{
				// file_get_contents() returns FALSE on failure. In that case we fall through to
				// the HTTP request rather than treat the failed read as the page's content
				$cached = file_get_contents($prefix . $cacheFile);
				if ($cached !== false)
				{
					return $cached;
				}
			}
		}

		// Retrieve the external content from the source. Errors are deliberately silenced, scraping
		// is done on a best-effort basis
		$content = @self::getHttpClient()->get($url, ['User-Agent: PHP (not Mozilla)']);

		// Save to the cache if applicable
		if (isset($cacheFile) && !empty($content))
		{
			file_put_contents($prefix . $cacheFile, $content);
		}

		return $content;
	}
}

1		<?php
2
3		/**
4		* @package s9e\TextFormatter
5		* @copyright Copyright (c) 2010-2017 The s9e Authors
6		* @license http://www.opensource.org/licenses/mit-license.php The MIT License
7		*/
8		namespace s9e\TextFormatter\Plugins\MediaEmbed;
9
10		use s9e\TextFormatter\Parser as TagStack;
11		use s9e\TextFormatter\Parser\Tag;
12		use s9e\TextFormatter\Plugins\ParserBase;
13		use s9e\TextFormatter\Utils\Http;
14
15		class Parser extends ParserBase
16		{
17		/**
18		* @var \s9e\TextFormatter\Utils\Http\Client Client used to perform HTTP request
19		*/
20		protected static $client;
21
22		/**
23		* {@inheritdoc}
24		*/
25	386	public function parse($text, array $matches)
26		{
27	386	foreach ($matches as $m)
28		{
29	386	$url = $m[0][0];
30	386	$pos = $m[0][1];
31	386	$len = strlen($url);
32
33		// Give that tag priority over other tags such as Autolink's
34	386	$tag = $this->parser->addSelfClosingTag($this->config['tagName'], $pos, $len, -10);
35	386	$tag->setAttribute('url', $url);
36	386	}
37	386	}
38
39		/**
40		* Filter a MEDIA tag
41		*
42		* This will always invalidate the original tag, and possibly replace it with the tag that
43		* corresponds to the media site
44		*
45		* @param Tag $tag The original tag
46		* @param TagStack $tagStack Parser instance, so that we can add the new tag to the stack
47		* @param array $sites Map of [host => siteId]
48		* @return bool Unconditionally FALSE
49		*/
50	387	public static function filterTag(Tag $tag, TagStack $tagStack, array $sites)
51		{
52	387	if ($tag->hasAttribute('site'))
53	387	{
54	5	self::addTagFromMediaId($tag, $tagStack, $sites);
55	5	}
56	383	elseif ($tag->hasAttribute('url'))
57		{
58	383	self::addTagFromMediaUrl($tag, $tagStack, $sites);
59	383	}
60
61	387	return false;
62		}
63
64		/**
65		* Test whether a given tag has at least one non-default attribute
66		*
67		* @param Tag $tag The original tag
68		* @return bool Whether the tag contains an attribute not named "url"
69		*/
70	64	public static function hasNonDefaultAttribute(Tag $tag)
71		{
72	64	foreach ($tag->getAttributes() as $attrName => $void)
73		{
74	64	if ($attrName !== 'url')
75	64	{
76	59	return true;
77		}
78	60	}
79
80	6	return false;
81		}
82
83		/**
84		* Scrape the content of an URL to extract some data
85		*
86		* @param Tag $tag Source tag
87		* @param array $scrapeConfig Array of scrape directives
88		* @param string $cacheDir Path to the cache directory
89		* @return bool Unconditionally TRUE
90		*/
91	150	public static function scrape(Tag $tag, array $scrapeConfig, $cacheDir = null)
92		{
93	150	if ($tag->hasAttribute('url'))
94	150	{
95		// Ensure that the URL actually looks like a URL if we want to use it to scrape
96	149	$url = $tag->getAttribute('url');
97	149	if (preg_match('#^https?://[^<>"\'\\s]+$#Di', $url))
98	149	{
99	143	$url = strtolower(substr($url, 0, 5)) . substr($url, 5);
100	143	foreach ($scrapeConfig as $scrape)
101		{
102	143	self::scrapeEntry($url, $tag, $scrape, $cacheDir);
103	143	}
104	143	}
105	149	}
106
107	150	return true;
108		}
109
110		//==============================================================================================
111		// Internals
112		//==============================================================================================
113
114		/**
115		* Add a site tag
116		*
117		* @param Tag $tag The original tag
118		* @param TagStack $tagStack Parser instance, so that we can add the new tag to the stack
119		* @param string $siteId Site ID
120		* @return void
121		*/
122	385	protected static function addSiteTag(Tag $tag, TagStack $tagStack, $siteId)
123		{
124	385	$endTag = $tag->getEndTag() ?: $tag;
125
126		// Compute the boundaries of our new tag
127	385	$lpos = $tag->getPos();
128	385	$rpos = $endTag->getPos() + $endTag->getLen();
129
130		// Create a new tag and copy this tag's attributes and priority
131	385	$tagStack->addTagPair(strtoupper($siteId), $lpos, 0, $rpos, 0, $tag->getSortPriority())->setAttributes($tag->getAttributes());
132	385	}
133
134		/**
135		* Add a media site tag based on the attributes of a MEDIA tag
136		*
137		* @param Tag $tag The original tag
138		* @param TagStack $tagStack Parser instance
139		* @param array $sites Map of [host => siteId]
140		* @return void
141		*/
142	5	protected static function addTagFromMediaId(Tag $tag, TagStack $tagStack, array $sites)
143		{
144	5	$siteId = strtolower($tag->getAttribute('site'));
145	5	if (in_array($siteId, $sites, true))
146	5	{
147	4	self::addSiteTag($tag, $tagStack, $siteId);
148	4	}
149	5	}
150
151		/**
152		* Add a media site tag based on the url attribute of a MEDIA tag
153		*
154		* @param Tag $tag The original tag
155		* @param TagStack $tagStack Parser instance
156		* @param array $sites Map of [host => siteId]
157		* @return void
158		*/
159	383	protected static function addTagFromMediaUrl(Tag $tag, TagStack $tagStack, array $sites)
160		{
161		// Capture the scheme and (if applicable) host of the URL
162	383	$p = parse_url($tag->getAttribute('url'));
163	383	if (isset($p['scheme']) && isset($sites[$p['scheme'] . ':']))
164	383	{
165	4	$siteId = $sites[$p['scheme'] . ':'];
166	4	}
167	379	elseif (isset($p['host']))
168		{
169	379	$siteId = self::findSiteIdByHost($p['host'], $sites);
170	379	}
171
172	383	if (!empty($siteId))
173	383	{
174	381	self::addSiteTag($tag, $tagStack, $siteId);
175	381	}
176	383	}
177
178		/**
179		* Match a given host to a site ID
180		*
181		* @param string $host Host
182		* @param array $sites Map of [host => siteId]
183		* @return string\|bool Site ID or FALSE
184		*/
185	379	protected static function findSiteIdByHost($host, array $sites)
186		{
187		// Start with the full host then pop domain labels off the start until we get a match
188		do
189		{
190	379	if (isset($sites[$host]))
191	379	{
192	377	return $sites[$host];
193		}
194
195	253	$pos = strpos($host, '.');
196	253	if ($pos === false)
197	253	{
198	4	break;
199		}
200
201	253	$host = substr($host, 1 + $pos);
202		}
203	253	while ($host > '');
204
205	4	return false;
206		}
207
208		/**
209		* Return a cached instance of the HTTP client
210		*
211		* @return \s9e\TextFormatter\Utils\Http\Client
212		*/
213		protected static function getHttpClient()
214		{
215		if (!isset(self::$client))
216		{
217		self::$client = Http::getClient();
218		}
219		self::$client->timeout = 10;
220
221		return self::$client;
222		}
223
224		/**
225		* Replace {@var} tokens in given URL
226		*
227		* @param string $url Original URL
228		* @param string[] $vars Replacements
229		* @return string Modified URL
230		*/
231	19	protected static function replaceTokens($url, array $vars)
232		{
233	19	return preg_replace_callback(
234	19	'#\\{@(\\w+)\\}#',
235	19	function ($m) use ($vars)
236		{
237	19	return (isset($vars[$m[1]])) ? $vars[$m[1]] : '';
238	19	},
239		$url
240	19	);
241		}
242
243		/**
244		* Scrape the content of an URL to extract some data
245		*
246		* @param string $url Original URL
247		* @param Tag $tag Source tag
248		* @param array $scrape Array of scrape directives
249		* @param string $cacheDir Path to the cache directory
250		* @return void
251		*/
252	143	protected static function scrapeEntry($url, Tag $tag, array $scrape, $cacheDir)
253		{
254	143	list($matchRegexps, $extractRegexps, $attrNames) = $scrape;
255
256	143	if (!self::tagIsMissingAnyAttribute($tag, $attrNames))
257	143	{
258	60	return;
259		}
260
261		// Test whether this URL matches any regexp
262	90	$vars = [];
263	90	$matched = false;
264	90	foreach ((array) $matchRegexps as $matchRegexp)
265		{
266	90	if (preg_match($matchRegexp, $url, $m))
267	90	{
268	68	$vars += $m;
269	68	$matched = true;
270	68	}
271	90	}
272	90	if (!$matched)
273	90	{
274	29	return;
275		}
276
277		// Add the tag's attributes to the named captures from the "match" regexp
278	68	$vars += $tag->getAttributes();
279
280	68	$scrapeUrl = (isset($scrape[3])) ? self::replaceTokens($scrape[3], $vars) : $url;
281	68	self::scrapeUrl($scrapeUrl, $tag, (array) $extractRegexps, $cacheDir);
282	68	}
283
284		/**
285		* Scrape a URL to help fill a tag's attributes
286		*
287		* @param string $url URL to scrape
288		* @param Tag $tag Tag to fill
289		* @param string[] $regexps Regexps used to extract content from the page
290		* @param string\|null $cacheDir Path to the cache directory
291		* @return void
292		*/
293	68	protected static function scrapeUrl($url, Tag $tag, array $regexps, $cacheDir)
294		{
295	68	$content = self::wget($url, $cacheDir);
296
297		// Execute the extract regexps and fill any missing attribute
298	68	foreach ($regexps as $regexp)
299		{
300	68	if (preg_match($regexp, $content, $m))
301	68	{
302	64	foreach ($m as $k => $v)
303		{
304	64	if (!is_numeric($k) && !$tag->hasAttribute($k))
305	64	{
306	64	$tag->setAttribute($k, $v);
307	64	}
308	64	}
309	64	}
310	68	}
311	68	}
312
313		/**
314		* Test whether a tag is missing any of given attributes
315		*
316		* @param Tag $tag
317		* @param string[] $attrNames
318		* @return bool
319		*/
320	143	protected static function tagIsMissingAnyAttribute(Tag $tag, array $attrNames)
321		{
322	143	foreach ($attrNames as $attrName)
323		{
324	143	if (!$tag->hasAttribute($attrName))
325	143	{
326	90	return true;
327		}
328	62	}
329
330	60	return false;
331		}
332
333		/**
334		* Retrieve external content (possibly from the cache)
335		*
336		* If the cache directory exists, the external content will be saved into it. Cached content is
337		* never pruned
338		*
339		* @param string $url URL
340		* @param string $cacheDir Path to the cache directory
341		* @return string External content
342		*/
343	68	protected static function wget($url, $cacheDir = null)
344		{
345	68	$prefix = '';
346	68	$url = preg_replace('(#.*)s', '', $url);
347
348		// Return the content from the cache if applicable
349	68	if (isset($cacheDir) && file_exists($cacheDir))
350	68	{
351	68	$cacheFile = $cacheDir . '/http.' . crc32($url);
352	68	if (extension_loaded('zlib'))
353	68	{
354	68	$prefix = 'compress.zlib://';
355	68	$cacheFile .= '.gz';
356	68	}
357	68	if (file_exists($cacheFile))
358	68	{
359	68	return file_get_contents($prefix . $cacheFile);
360		}
361		}
362
363		// Retrieve the external content from the source
364		$content = @self::getHttpClient()->get($url, ['User-Agent: PHP (not Mozilla)']);
365
366		// Save to the cache if applicable
367		if (isset($cacheFile) && !empty($content))
368		{
369		file_put_contents($prefix . $cacheFile, $content);
370		}
371
372		return $content;
373		}
374		}

s9e / TextFormatter

Push — master ( 00d88c...61baac )

Parser::getHttpClient() A

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like