Parser::wget() - Code Metrics - Inspection of "Added caching HTTP client" - s9e/TextFormatter - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 83cb8e...7577e4 )

by Josh

created 2018-01-01 05:55 UTC

Parser::wget() A

↳ Parent: Parser

Complexity

Conditions	1
Paths	1

Size

Total Lines	6
Code Lines	3

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	3
CRAP Score	1

Importance

Changes

Metric	Value
dl	0
loc	6
ccs	3
cts	3
cp	1
rs	9.4285
c	0
b	0
f	0
cc	1
eloc	3
nc	1
nop	2
crap	1

<?php

/**
* @package   s9e\TextFormatter
* @copyright Copyright (c) 2010-2018 The s9e Authors
* @license   http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\TextFormatter\Plugins\MediaEmbed;

use s9e\TextFormatter\Parser as TagStack;
use s9e\TextFormatter\Parser\Tag;
use s9e\TextFormatter\Plugins\ParserBase;
use s9e\TextFormatter\Utils\Http;

class Parser extends ParserBase
{
	/**
	* @var \s9e\TextFormatter\Utils\Http\Client Client used to perform HTTP request
	*/
	protected static $client;

	/**
	* @var string|null Cache dir that was passed when self::$client was created
	*/
	protected static $clientCacheDir;

	/**
	* {@inheritdoc}
	*
	* Adds one self-closing tag per match, named after the configured tagName, spanning the
	* matched URL and carrying it in its "url" attribute
	*
	* @param  string $text    Text being parsed (not used by this method)
	* @param  array  $matches Matches where $m[0] is a [matched URL, position] pair
	* @return void
	*/
	public function parse($text, array $matches)
	{
		foreach ($matches as $m)
		{
			$url = $m[0][0];
			$pos = $m[0][1];
			$len = strlen($url);

			// Give that tag priority over other tags such as Autolink's
			$tag = $this->parser->addSelfClosingTag($this->config['tagName'], $pos, $len, -10);
			$tag->setAttribute('url', $url);
		}
	}

	/**
	* Filter a MEDIA tag
	*
	* This will always invalidate the original tag, and possibly replace it with the tag that
	* corresponds to the media site. The "site" attribute takes precedence over the "url"
	* attribute; the latter is only consulted if the former is absent
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance, so that we can add the new tag to the stack
	* @param  array    $sites    Map of [host => siteId]
	* @return void
	*/
	public static function filterTag(Tag $tag, TagStack $tagStack, array $sites)
	{
		$tag->invalidate();
		if ($tag->hasAttribute('site'))
		{
			self::addTagFromMediaId($tag, $tagStack, $sites);
		}
		elseif ($tag->hasAttribute('url'))
		{
			self::addTagFromMediaUrl($tag, $tagStack, $sites);
		}
	}

	/**
	* Invalidate given tag if it doesn't have at least one non-default attribute
	*
	* Despite its name, this method does not return a value. The only attribute considered
	* "default" is "url": a tag with no attributes, or with nothing but "url", is invalidated
	*
	* @param  Tag  $tag The original tag
	* @return void
	*/
	public static function hasNonDefaultAttribute(Tag $tag)
	{
		foreach ($tag->getAttributes() as $attrName => $void)
		{
			if ($attrName !== 'url')
			{
				// Found an attribute other than "url", the tag is left untouched
				return;
			}
		}

		$tag->invalidate();
	}

	/**
	* Scrape the content of an URL to extract some data
	*
	* Nothing is scraped unless the tag has a "url" attribute that looks like a http or https
	* URL. Every entry of $scrapeConfig is then tried in order
	*
	* @param  Tag         $tag          Source tag
	* @param  array       $scrapeConfig Array of scrape directives
	* @param  string|null $cacheDir     Path to the cache directory
	* @return bool                      Unconditionally TRUE
	*/
	public static function scrape(Tag $tag, array $scrapeConfig, $cacheDir = null)
	{
		if ($tag->hasAttribute('url'))
		{
			// Ensure that the URL actually looks like a URL if we want to use it to scrape
			$url = $tag->getAttribute('url');
			if (preg_match('#^https?://[^<>"\'\\s]+$#Di', $url))
			{
				// The scheme was matched case-insensitively, normalize its first 5 bytes
				// ("http:" or "https") to lowercase and leave the rest of the URL as-is
				$url = strtolower(substr($url, 0, 5)) . substr($url, 5);
				foreach ($scrapeConfig as $scrape)
				{
					self::scrapeEntry($url, $tag, $scrape, $cacheDir);
				}
			}
		}

		return true;
	}

	//==============================================================================================
	// Internals
	//==============================================================================================

	/**
	* Add a site tag
	*
	* If the original tag is paired with an end tag, the new pair reuses the position and
	* length of both. Otherwise, a pair of zero-length tags is created around the text that
	* the original tag spans
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance, so that we can add the new tag to the stack
	* @param  string   $siteId   Site ID
	* @return void
	*/
	protected static function addSiteTag(Tag $tag, TagStack $tagStack, $siteId)
	{
		$endTag = $tag->getEndTag();
		if ($endTag)
		{
			$startPos = $tag->getPos();
			$startLen = $tag->getLen();
			$endPos   = $endTag->getPos();
			$endLen   = $endTag->getLen();
		}
		else
		{
			$startPos = $tag->getPos();
			$startLen = 0;
			$endPos   = $tag->getPos() + $tag->getLen();
			$endLen   = 0;
		}

		// Create a new tag and copy this tag's attributes and priority
		$tagStack->addTagPair(strtoupper($siteId), $startPos, $startLen, $endPos, $endLen, $tag->getSortPriority())->setAttributes($tag->getAttributes());
	}

	/**
	* Add a media site tag based on the attributes of a MEDIA tag
	*
	* The lowercased value of the "site" attribute must be one of the values of $sites,
	* otherwise no tag is added
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance
	* @param  array    $sites    Map of [host => siteId]
	* @return void
	*/
	protected static function addTagFromMediaId(Tag $tag, TagStack $tagStack, array $sites)
	{
		$siteId = strtolower($tag->getAttribute('site'));
		if (in_array($siteId, $sites, true))
		{
			self::addSiteTag($tag, $tagStack, $siteId);
		}
	}

	/**
	* Add a media site tag based on the url attribute of a MEDIA tag
	*
	* No tag is added if the URL has no recognizable host or if the host does not map to a
	* site ID
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance
	* @param  array    $sites    Map of [host => siteId]
	* @return void
	*/
	protected static function addTagFromMediaUrl(Tag $tag, TagStack $tagStack, array $sites)
	{
		// Explicit default so that the check below never reads an undefined variable
		$siteId = false;

		// Capture the host of the URL, skipping over the optional userinfo part
		if (preg_match('(^\\w+://(?:[^@/]*@)?([^/]+))', $tag->getAttribute('url'), $m))
		{
			$siteId = self::findSiteIdByHost($m[1], $sites);
		}

		if (!empty($siteId))
		{
			self::addSiteTag($tag, $tagStack, $siteId);
		}
	}

	/**
	* Match a given host to a site ID
	*
	* The lookup uses the host exactly as given, e.g. "www.example.org" is tried first, then
	* "example.org", then "org"
	*
	* @param  string      $host  Host
	* @param  array       $sites Map of [host => siteId]
	* @return string|bool        Site ID or FALSE
	*/
	protected static function findSiteIdByHost($host, array $sites)
	{
		// Start with the full host then pop domain labels off the start until we get a match
		do
		{
			if (isset($sites[$host]))
			{
				return $sites[$host];
			}

			$pos = strpos($host, '.');
			if ($pos === false)
			{
				// No label left to remove
				break;
			}

			$host = substr($host, 1 + $pos);
		}
		while ($host > '');

		return false;
	}

	/**
	* Return a cached instance of the HTTP client
	*
	* The client is reused for as long as the same $cacheDir is requested. If $cacheDir
	* differs from the one the current client was created with, a new client is created so
	* that a client created without a cache dir is never reused for a call that expects
	* caching, and vice versa
	*
	* @param  string|null $cacheDir Path to the cache directory, or NULL for no caching
	* @return \s9e\TextFormatter\Utils\Http\Client
	*/
	protected static function getHttpClient($cacheDir)
	{
		if (!isset(self::$client) || self::$clientCacheDir !== $cacheDir)
		{
			self::$client         = (isset($cacheDir)) ? Http::getCachingClient($cacheDir) : Http::getClient();
			self::$clientCacheDir = $cacheDir;
		}

		return self::$client;
	}

	/**
	* Replace {@var} tokens in given URL
	*
	* Tokens that have no corresponding entry in $vars are replaced with an empty string
	*
	* @param  string   $url  Original URL
	* @param  string[] $vars Replacements
	* @return string         Modified URL
	*/
	protected static function replaceTokens($url, array $vars)
	{
		return preg_replace_callback(
			'#\\{@(\\w+)\\}#',
			function ($m) use ($vars)
			{
				return (isset($vars[$m[1]])) ? $vars[$m[1]] : '';
			},
			$url
		);
	}

	/**
	* Scrape the content of an URL to extract some data
	*
	* A scrape entry is a list of [match regexps, extract regexps, attribute names] with an
	* optional fourth element used as a URL template. The entry is skipped if the tag already
	* has all of the listed attributes or if the URL matches none of the "match" regexps
	*
	* @param  string      $url      Original URL
	* @param  Tag         $tag      Source tag
	* @param  array       $scrape   Array of scrape directives
	* @param  string|null $cacheDir Path to the cache directory
	* @return void
	*/
	protected static function scrapeEntry($url, Tag $tag, array $scrape, $cacheDir)
	{
		list($matchRegexps, $extractRegexps, $attrNames) = $scrape;
		if (!self::tagIsMissingAnyAttribute($tag, $attrNames))
		{
			return;
		}

		// Test whether this URL matches any regexp. Captures from earlier regexps take
		// precedence over captures from later ones because += does not overwrite keys
		$vars    = [];
		$matched = false;
		foreach ((array) $matchRegexps as $matchRegexp)
		{
			if (preg_match($matchRegexp, $url, $m))
			{
				$vars   += $m;
				$matched = true;
			}
		}
		if (!$matched)
		{
			return;
		}

		// Add the tag's attributes to the named captures from the "match" regexp
		$vars += $tag->getAttributes();

		// Use the URL template if there is one, otherwise scrape the original URL
		$scrapeUrl = (isset($scrape[3])) ? self::replaceTokens($scrape[3], $vars) : $url;
		self::scrapeUrl($scrapeUrl, $tag, (array) $extractRegexps, $cacheDir);
	}

	/**
	* Scrape a URL to help fill a tag's attributes
	*
	* Only named captures are used, and attributes that are already set are never overwritten
	*
	* @param  string      $url      URL to scrape
	* @param  Tag         $tag      Tag to fill
	* @param  string[]    $regexps  Regexps used to extract content from the page
	* @param  string|null $cacheDir Path to the cache directory
	* @return void
	*/
	protected static function scrapeUrl($url, Tag $tag, array $regexps, $cacheDir)
	{
		$content = self::wget($url, $cacheDir);

		// Execute the extract regexps and fill any missing attribute
		foreach ($regexps as $regexp)
		{
			if (preg_match($regexp, $content, $m))
			{
				foreach ($m as $k => $v)
				{
					// Numeric keys are positional captures, only named captures map to attributes
					if (!is_numeric($k) && !$tag->hasAttribute($k))
					{
						$tag->setAttribute($k, $v);
					}
				}
			}
		}
	}

	/**
	* Test whether a tag is missing any of given attributes
	*
	* @param  Tag      $tag
	* @param  string[] $attrNames
	* @return bool               TRUE if at least one attribute is missing, FALSE otherwise
	*                            (including when $attrNames is empty)
	*/
	protected static function tagIsMissingAnyAttribute(Tag $tag, array $attrNames)
	{
		foreach ($attrNames as $attrName)
		{
			if (!$tag->hasAttribute($attrName))
			{
				return true;
			}
		}

		return false;
	}

	/**
	* Retrieve external content (possibly from the cache)
	*
	* The fragment part of the URL (everything from the first "#") is removed before the
	* request is made. If a cache directory is given, the request goes through the caching
	* client returned by Http::getCachingClient()
	*
	* @param  string      $url      URL
	* @param  string|null $cacheDir Path to the cache directory
	* @return string                External content, as returned by the client's get() method
	*/
	protected static function wget($url, $cacheDir = null)
	{
		$url = preg_replace('(#.*)s', '', $url);

		// Scraping is best-effort: errors raised while retrieving the content are silenced
		return @self::getHttpClient($cacheDir)->get($url, ['User-Agent: PHP (not Mozilla)']);
	}
}

1		<?php
2
3		/**
4		* @package s9e\TextFormatter
5		* @copyright Copyright (c) 2010-2018 The s9e Authors
6		* @license http://www.opensource.org/licenses/mit-license.php The MIT License
7		*/
8		namespace s9e\TextFormatter\Plugins\MediaEmbed;
9
10		use s9e\TextFormatter\Parser as TagStack;
11		use s9e\TextFormatter\Parser\Tag;
12		use s9e\TextFormatter\Plugins\ParserBase;
13		use s9e\TextFormatter\Utils\Http;
14
15		class Parser extends ParserBase
16		{
17		/**
18		* @var \s9e\TextFormatter\Utils\Http\Client Client used to perform HTTP request
19		*/
20		protected static $client;
21
22		/**
23		* {@inheritdoc}
24		*/
25	405	public function parse($text, array $matches)
26		{
27	405	foreach ($matches as $m)
28		{
29	405	$url = $m[0][0];
30	405	$pos = $m[0][1];
31	405	$len = strlen($url);
32
33		// Give that tag priority over other tags such as Autolink's
34	405	$tag = $this->parser->addSelfClosingTag($this->config['tagName'], $pos, $len, -10);
35	405	$tag->setAttribute('url', $url);
36		}
37	405	}
38
39		/**
40		* Filter a MEDIA tag
41		*
42		* This will always invalidate the original tag, and possibly replace it with the tag that
43		* corresponds to the media site
44		*
45		* @param Tag $tag The original tag
46		* @param TagStack $tagStack Parser instance, so that we can add the new tag to the stack
47		* @param array $sites Map of [host => siteId]
48		* @return void
49		*/
50	405	public static function filterTag(Tag $tag, TagStack $tagStack, array $sites)
51		{
52	405	$tag->invalidate();
53	405	if ($tag->hasAttribute('site'))
54		{
55	5	self::addTagFromMediaId($tag, $tagStack, $sites);
56		}
57	401	elseif ($tag->hasAttribute('url'))
58		{
59	401	self::addTagFromMediaUrl($tag, $tagStack, $sites);
60		}
61	405	}
62
63		/**
64		* Invalidate given tag if it doesn't have at least one non-default attribute
65		*
66		* @param Tag $tag The original tag
67		* @return void
68		*/
69	60	public static function hasNonDefaultAttribute(Tag $tag)
70		{
71	60	foreach ($tag->getAttributes() as $attrName => $void)
72		{
73	60	if ($attrName !== 'url')
74		{
75	60	return;
76		}
77		}
78
79	6	$tag->invalidate();
80	6	}
81
82		/**
83		* Scrape the content of an URL to extract some data
84		*
85		* @param Tag $tag Source tag
86		* @param array $scrapeConfig Array of scrape directives
87		* @param string $cacheDir Path to the cache directory
88		* @return bool Unconditionally TRUE
89		*/
90	410	public static function scrape(Tag $tag, array $scrapeConfig, $cacheDir = null)
91		{
92	410	if ($tag->hasAttribute('url'))
93		{
94		// Ensure that the URL actually looks like a URL if we want to use it to scrape
95	409	$url = $tag->getAttribute('url');
96	409	if (preg_match('#^https?://[^<>"\'\\s]+$#Di', $url))
97		{
98	404	$url = strtolower(substr($url, 0, 5)) . substr($url, 5);
99	404	foreach ($scrapeConfig as $scrape)
100		{
101	141	self::scrapeEntry($url, $tag, $scrape, $cacheDir);
102		}
103		}
104		}
105
106	410	return true;
107		}
108
109		//==============================================================================================
110		// Internals
111		//==============================================================================================
112
113		/**
114		* Add a site tag
115		*
116		* @param Tag $tag The original tag
117		* @param TagStack $tagStack Parser instance, so that we can add the new tag to the stack
118		* @param string $siteId Site ID
119		* @return void
120		*/
121	403	protected static function addSiteTag(Tag $tag, TagStack $tagStack, $siteId)
122		{
123	403	$endTag = $tag->getEndTag();
124	403	if ($endTag)
125		{
126	12	$startPos = $tag->getPos();
127	12	$startLen = $tag->getLen();
128	12	$endPos = $endTag->getPos();
129	12	$endLen = $endTag->getLen();
130		}
131		else
132		{
133	399	$startPos = $tag->getPos();
134	399	$startLen = 0;
135	399	$endPos = $tag->getPos() + $tag->getLen();
136	399	$endLen = 0;
137		}
138
139		// Create a new tag and copy this tag's attributes and priority
140	403	$tagStack->addTagPair(strtoupper($siteId), $startPos, $startLen, $endPos, $endLen, $tag->getSortPriority())->setAttributes($tag->getAttributes());
141	403	}
142
143		/**
144		* Add a media site tag based on the attributes of a MEDIA tag
145		*
146		* @param Tag $tag The original tag
147		* @param TagStack $tagStack Parser instance
148		* @param array $sites Map of [host => siteId]
149		* @return void
150		*/
151	5	protected static function addTagFromMediaId(Tag $tag, TagStack $tagStack, array $sites)
152		{
153	5	$siteId = strtolower($tag->getAttribute('site'));
154	5	if (in_array($siteId, $sites, true))
155		{
156	4	self::addSiteTag($tag, $tagStack, $siteId);
157		}
158	5	}
159
160		/**
161		* Add a media site tag based on the url attribute of a MEDIA tag
162		*
163		* @param Tag $tag The original tag
164		* @param TagStack $tagStack Parser instance
165		* @param array $sites Map of [host => siteId]
166		* @return void
167		*/
168	401	protected static function addTagFromMediaUrl(Tag $tag, TagStack $tagStack, array $sites)
169		{
170		// Capture the host of the URL
171	401	if (preg_match('(^\\w+://(?:[^@/]*@)?([^/]+))', $tag->getAttribute('url'), $m))
172		{
173	401	$siteId = self::findSiteIdByHost($m[1], $sites);
174		}
175
176	401	if (!empty($siteId))
177		{
178	399	self::addSiteTag($tag, $tagStack, $siteId);
179		}
180	401	}
181
182		/**
183		* Match a given host to a site ID
184		*
185		* @param string $host Host
186		* @param array $sites Map of [host => siteId]
187		* @return string|bool Site ID or FALSE
188		*/
189	401	protected static function findSiteIdByHost($host, array $sites)
190		{
191		// Start with the full host then pop domain labels off the start until we get a match
192		do
193		{
194	401	if (isset($sites[$host]))
195		{
196	399	return $sites[$host];
197		}
198
199	262	$pos = strpos($host, '.');
200	262	if ($pos === false)
201		{
202	4	break;
203		}
204
205	262	$host = substr($host, 1 + $pos);
206		}
207	262	while ($host > '');
208
209	4	return false;
210		}
211
212		/**
213		* Return a cached instance of the HTTP client
214		*
215		* @param string|null $cacheDir
216		* @return \s9e\TextFormatter\Utils\Http\Client
217	2	*/
218		protected static function getHttpClient($cacheDir)
219	2	{
220		if (!isset(self::$client))
221	1	{
222		self::$client = (isset($cacheDir)) ? Http::getCachingClient($cacheDir) : Http::getClient();
223	2	}
224
225	2	return self::$client;
226		}
227
228		/**
229		* Replace {@var} tokens in given URL
230		*
231		* @param string $url Original URL
232		* @param string[] $vars Replacements
233		* @return string Modified URL
234		*/
235	18	protected static function replaceTokens($url, array $vars)
236		{
237	18	return preg_replace_callback(
238	18	'#\\{@(\\w+)\\}#',
239	18	function ($m) use ($vars)
240		{
241	18	return (isset($vars[$m[1]])) ? $vars[$m[1]] : '';
242	18	},
243	18	$url
244		);
245		}
246
247		/**
248		* Scrape the content of an URL to extract some data
249		*
250		* @param string $url Original URL
251		* @param Tag $tag Source tag
252		* @param array $scrape Array of scrape directives
253		* @param string $cacheDir Path to the cache directory
254		* @return void
255		*/
256	141	protected static function scrapeEntry($url, Tag $tag, array $scrape, $cacheDir)
257		{
258	141	list($matchRegexps, $extractRegexps, $attrNames) = $scrape;
259	141	if (!self::tagIsMissingAnyAttribute($tag, $attrNames))
260		{
261	74	return;
262		}
263
264		// Test whether this URL matches any regexp
265	74	$vars = [];
266	74	$matched = false;
267	74	foreach ((array) $matchRegexps as $matchRegexp)
268		{
269	74	if (preg_match($matchRegexp, $url, $m))
270		{
271	61	$vars += $m;
272	74	$matched = true;
273		}
274		}
275	74	if (!$matched)
276		{
277	20	return;
278		}
279
280		// Add the tag's attributes to the named captures from the "match" regexp
281	61	$vars += $tag->getAttributes();
282
283	61	$scrapeUrl = (isset($scrape[3])) ? self::replaceTokens($scrape[3], $vars) : $url;
284	61	self::scrapeUrl($scrapeUrl, $tag, (array) $extractRegexps, $cacheDir);
285	61	}
286
287		/**
288		* Scrape a URL to help fill a tag's attributes
289		*
290		* @param string $url URL to scrape
291		* @param Tag $tag Tag to fill
292		* @param string[] $regexps Regexps used to extract content from the page
293		* @param string|null $cacheDir Path to the cache directory
294		* @return void
295		*/
296	61	protected static function scrapeUrl($url, Tag $tag, array $regexps, $cacheDir)
297		{
298	61	$content = self::wget($url, $cacheDir);
299
300		// Execute the extract regexps and fill any missing attribute
301	61	foreach ($regexps as $regexp)
302		{
303	61	if (preg_match($regexp, $content, $m))
304		{
305	59	foreach ($m as $k => $v)
306		{
307	59	if (!is_numeric($k) && !$tag->hasAttribute($k))
308		{
309	61	$tag->setAttribute($k, $v);
310		}
311		}
312		}
313		}
314	61	}
315
316		/**
317		* Test whether a tag is missing any of given attributes
318		*
319		* @param Tag $tag
320		* @param string[] $attrNames
321		* @return bool
322		*/
323	141	protected static function tagIsMissingAnyAttribute(Tag $tag, array $attrNames)
324		{
325	141	foreach ($attrNames as $attrName)
326		{
327	141	if (!$tag->hasAttribute($attrName))
328		{
329	141	return true;
330		}
331		}
332
333	74	return false;
334		}
335
336		/**
337		* Retrieve external content (possibly from the cache)
338		*
339		* If the cache directory exists, the external content will be saved into it. Cached content is
340		* never pruned
341		*
342		* @param string $url URL
343		* @param string $cacheDir Path to the cache directory
344		* @return string External content
345		*/
346	61	protected static function wget($url, $cacheDir = null)
347		{
348	61	$url = preg_replace('(#.*)s', '', $url);
349	61
350		return @self::getHttpClient($cacheDir)->get($url, ['User-Agent: PHP (not Mozilla)']);
351		}
352		}

s9e / TextFormatter

Push — master ( 83cb8e...7577e4 )

Parser::wget() A

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like