Parser::replaceTokens() - Code Metrics - Inspection of "MediaEmbed: added support for HTTP client used for..." - s9e/TextFormatter - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 644404...5feeae )

by Josh

created 2016-01-14 09:04 UTC

Parser::replaceTokens() A

↳ Parent: Parser

Complexity

Conditions	2
Paths	1

Size

Total Lines	11
Code Lines	6

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	7
CRAP Score	2

Metric	Value
dl	0
loc	11
ccs	7
cts	7
cp	1
rs	9.4286
cc	2
eloc	6
nc	1
nop	2
crap	2

<?php

/**
* @package   s9e\TextFormatter
* @copyright Copyright (c) 2010-2016 The s9e Authors
* @license   http://www.opensource.org/licenses/mit-license.php The MIT License
*/
namespace s9e\TextFormatter\Plugins\MediaEmbed;

use s9e\TextFormatter\Utils\Http;
use s9e\TextFormatter\Parser as TagStack;
use s9e\TextFormatter\Parser\Tag;
use s9e\TextFormatter\Plugins\ParserBase;

class Parser extends ParserBase
{
	/**
	* @var \s9e\TextFormatter\Utils\Http\Client Client used to perform HTTP request
	*/
	protected static $client;

	/**
	* {@inheritdoc}
	*
	* Adds one self-closing MEDIA tag per match, covering the matched text and carrying it as
	* the tag's "url" attribute
	*/
	public function parse($text, array $matches)
	{
		foreach ($matches as $match)
		{
			// The first element of each match holds the matched text and its position
			list($matchedUrl, $offset) = $match[0];

			$mediaTag = $this->parser->addSelfClosingTag('MEDIA', $offset, strlen($matchedUrl));
			$mediaTag->setAttribute('url', $matchedUrl);

			// Give that tag priority over other tags such as Autolink's
			$mediaTag->setSortPriority(-10);
		}
	}

	/**
	* Filter a MEDIA tag
	*
	* The original tag is always invalidated. If a matching media site is found, a new tag that
	* corresponds to that site is added to the stack to replace it. The "media" attribute, if
	* present, is used in preference to the "url" attribute
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance, so that we can add the new tag to the stack
	* @param  array    $sites    Map of [host => siteId]
	* @return bool               Unconditionally FALSE
	*/
	public static function filterTag(Tag $tag, TagStack $tagStack, array $sites)
	{
		// An explicit media attribute wins, the URL is not even looked at in that case
		if ($tag->hasAttribute('media'))
		{
			self::addTagFromMediaId($tag, $tagStack, $sites);

			return false;
		}

		if ($tag->hasAttribute('url'))
		{
			self::addTagFromMediaUrl($tag, $tagStack, $sites);
		}

		return false;
	}

	/**
	* Test whether a given tag has at least one non-default attribute
	*
	* The only attribute considered to be a default attribute is "url"
	*
	* @param  Tag  $tag The original tag
	* @return bool      Whether the tag contains an attribute not named "url"
	*/
	public static function hasNonDefaultAttribute(Tag $tag)
	{
		// Work on a copy of the attributes, discard the default one and see if anything is left
		$attributes = $tag->getAttributes();
		unset($attributes['url']);

		return !empty($attributes);
	}

	/**
	* Scrape the content of a URL to extract some data
	*
	* Nothing is scraped if the tag has no "url" attribute or if its value does not look like an
	* HTTP(S) URL. In both cases the tag is left as is
	*
	* @param  Tag    $tag          Source tag
	* @param  array  $scrapeConfig Array of scrape directives
	* @param  string $cacheDir     Path to the cache directory
	* @return bool                 Unconditionally TRUE
	*/
	public static function scrape(Tag $tag, array $scrapeConfig, $cacheDir = null)
	{
		if ($tag->hasAttribute('url'))
		{
			$url = $tag->getAttribute('url');

			// Only scrape if the URL actually looks like a URL. A bad URL means we don't scrape,
			// but it doesn't necessarily invalidate the tag
			if (preg_match('#^https?://[^<>"\'\\s]+$#D', $url))
			{
				foreach ($scrapeConfig as $entry)
				{
					self::scrapeEntry($url, $tag, $entry, $cacheDir);
				}
			}
		}

		return true;
	}

	//==============================================================================================
	// Internals
	//==============================================================================================

	/**
	* Add a site tag
	*
	* The new tag is self-closing, named after the uppercased site ID, and spans from the start
	* of the original tag to the end of its end tag (or the end of the original tag itself if it
	* has no end tag)
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance, so that we can add the new tag to the stack
	* @param  string   $siteId   Site ID
	* @return void
	*/
	protected static function addSiteTag(Tag $tag, TagStack $tagStack, $siteId)
	{
		// Use the tag itself as the right boundary if it is not paired with an end tag
		$lastTag = $tag->getEndTag();
		if (!$lastTag)
		{
			$lastTag = $tag;
		}

		// Compute the position and length of the new tag
		$startPos = $tag->getPos();
		$length   = $lastTag->getPos() + $lastTag->getLen() - $startPos;

		// Create the new tag then carry over the original's attributes and priority
		$siteTag = $tagStack->addSelfClosingTag(strtoupper($siteId), $startPos, $length);
		$siteTag->setAttributes($tag->getAttributes());
		$siteTag->setSortPriority($tag->getSortPriority());
	}

	/**
	* Add a media site tag based on the attributes of a MEDIA tag
	*
	* The lowercased value of the "media" attribute is used as the site ID. No tag is added if
	* that site ID is not one of the values found in $sites
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance
	* @param  array    $sites    Map of [host => siteId]
	* @return void
	*/
	protected static function addTagFromMediaId(Tag $tag, TagStack $tagStack, array $sites)
	{
		$siteId = strtolower($tag->getAttribute('media'));

		// Ignore site IDs that do not appear in the map
		if (!in_array($siteId, $sites, true))
		{
			return;
		}

		self::addSiteTag($tag, $tagStack, $siteId);
	}

	/**
	* Add a media site tag based on the url attribute of a MEDIA tag
	*
	* The URL's scheme is looked up first (as "scheme:"), then its host. Both are lowercased
	* before the lookup because schemes and hostnames are case-insensitive, so a URL such as
	* "HTTP://WWW.Example.COM/" resolves to the same site as its lowercase form
	*
	* NOTE(review): this assumes the keys of $sites are lowercase -- confirm against whatever
	* generates the map
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance
	* @param  array    $sites    Map of [host => siteId]
	* @return void
	*/
	protected static function addTagFromMediaUrl(Tag $tag, TagStack $tagStack, array $sites)
	{
		$siteId = false;

		// Capture the scheme and (if applicable) host of the URL. parse_url() returns FALSE on
		// seriously malformed URLs, in which case there is nothing to look up
		$p = parse_url($tag->getAttribute('url'));
		if (is_array($p))
		{
			$schemeKey = (isset($p['scheme'])) ? strtolower($p['scheme']) . ':' : '';
			if ($schemeKey !== '' && isset($sites[$schemeKey]))
			{
				// Sites identified by their scheme alone take precedence over the host
				$siteId = $sites[$schemeKey];
			}
			elseif (isset($p['host']))
			{
				$siteId = self::findSiteIdByHost(strtolower($p['host']), $sites);
			}
		}

		if (!empty($siteId))
		{
			self::addSiteTag($tag, $tagStack, $siteId);
		}
	}

	/**
	* Match a given host to a site ID
	*
	* The full host is tried first. If it is not a key of $sites, its leftmost label is removed
	* and the remainder is tried, and so on until a match is found or no label is left to remove
	*
	* @param  string      $host  Host
	* @param  array       $sites Map of [host => siteId]
	* @return string|bool        Site ID or FALSE
	*/
	protected static function findSiteIdByHost($host, array $sites)
	{
		while (true)
		{
			if (isset($sites[$host]))
			{
				return $sites[$host];
			}

			// Without a dot there is no label left to pop off
			$dotPos = strpos($host, '.');
			if ($dotPos === false)
			{
				return false;
			}

			// Drop the leftmost label and its dot, stop if nothing remains
			$host = substr($host, $dotPos + 1);
			if (!($host > ''))
			{
				return false;
			}
		}
	}

	/**
	* Return a cached instance of the HTTP client
	*
	* The client is obtained from Http::getClient() the first time it is needed and stored
	* statically. Its timeout is set to 10 on every call (presumably seconds -- confirm against
	* the client's documentation)
	*
	* @return \s9e\TextFormatter\Utils\Http\Client
	*/
	protected static function getHttpClient()
	{
		if (self::$client === null)
		{
			self::$client = Http::getClient();
		}

		$client          = self::$client;
		$client->timeout = 10;

		return $client;
	}

	/**
	* Replace {@var} tokens in given URL
	*
	* Each token is replaced with the value stored in $vars under the token's name. Tokens that
	* have no corresponding value are removed
	*
	* @param  string   $url  Original URL
	* @param  string[] $vars Replacements
	* @return string         Modified URL
	*/
	protected static function replaceTokens($url, array $vars)
	{
		$replaceToken = function ($m) use ($vars)
		{
			$varName = $m[1];
			if (isset($vars[$varName]))
			{
				return $vars[$varName];
			}

			// Unknown variables are replaced with an empty string
			return '';
		};

		return preg_replace_callback('#\\{@(\\w+)\\}#', $replaceToken, $url);
	}

	/**
	* Process one scrape directive for given URL
	*
	* The directive is a list of [match regexps, extract regexps, attribute names] with an
	* optional fourth element, a URL template. It is skipped if the tag already has all of the
	* named attributes or if the URL does not match any of the "match" regexps. Otherwise, the
	* URL to scrape is either the template with its {@var} tokens replaced, or the original URL
	*
	* @param  string $url      Original URL
	* @param  Tag    $tag      Source tag
	* @param  array  $scrape   Array of scrape directives
	* @param  string $cacheDir Path to the cache directory
	* @return void
	*/
	protected static function scrapeEntry($url, Tag $tag, array $scrape, $cacheDir)
	{
		list($matchRegexps, $extractRegexps, $attrNames) = $scrape;

		// No need to scrape if the tag already has every attribute named by this entry
		if (!self::tagIsMissingAnyAttribute($tag, $attrNames))
		{
			return;
		}

		// Collect the captures of every "match" regexp that matches this URL. The union operator
		// preserves existing keys, so captures from earlier regexps are not overwritten
		$vars = [];
		foreach ((array) $matchRegexps as $regexp)
		{
			if (preg_match($regexp, $url, $captures))
			{
				$vars += $captures;
			}
		}

		// A successful match always captures at least index 0, so no captures means no match
		if (empty($vars))
		{
			return;
		}

		// Add the tag's attributes, without overwriting the captures from the "match" regexps
		$vars += $tag->getAttributes();

		if (isset($scrape[3]))
		{
			$scrapeUrl = self::replaceTokens($scrape[3], $vars);
		}
		else
		{
			$scrapeUrl = $url;
		}

		self::scrapeUrl($scrapeUrl, $tag, (array) $extractRegexps, $cacheDir);
	}

	/**
	* Scrape a URL to help fill a tag's attributes
	*
	* The content of the URL is retrieved once, then every regexp is executed against it. Each
	* named capture is stored as an attribute of the same name, unless the tag already has it
	*
	* @param  string      $url      URL to scrape
	* @param  Tag         $tag      Tag to fill
	* @param  string[]    $regexps  Regexps used to extract content from the page
	* @param  string|null $cacheDir Path to the cache directory
	* @return void
	*/
	protected static function scrapeUrl($url, Tag $tag, array $regexps, $cacheDir)
	{
		$content = self::wget($url, $cacheDir);

		foreach ($regexps as $regexp)
		{
			if (!preg_match($regexp, $content, $captures))
			{
				continue;
			}

			foreach ($captures as $attrName => $attrValue)
			{
				// Skip numbered captures as well as attributes that are already set
				if (is_numeric($attrName) || $tag->hasAttribute($attrName))
				{
					continue;
				}

				$tag->setAttribute($attrName, $attrValue);
			}
		}
	}

	/**
	* Test whether a tag is missing any of given attributes
	*
	* @param  Tag      $tag       Tag to inspect
	* @param  string[] $attrNames Names of the attributes to look for
	* @return bool                TRUE if at least one of the attributes is not set on the tag,
	*                             FALSE otherwise (including when $attrNames is empty)
	*/
	protected static function tagIsMissingAnyAttribute(Tag $tag, array $attrNames)
	{
		$isMissing = false;
		foreach ($attrNames as $name)
		{
			if (!$tag->hasAttribute($name))
			{
				// One missing attribute is enough, no need to check the rest
				$isMissing = true;
				break;
			}
		}

		return $isMissing;
	}

	/**
	* Retrieve external content (possibly from the cache)
	*
	* If the cache directory exists, the external content is read from it if available, and
	* saved into it otherwise. Cache files are named after the CRC32 of the URL and are
	* compressed if the zlib extension is loaded. Cached content is never pruned
	*
	* @param  string $url      URL
	* @param  string $cacheDir Path to the cache directory
	* @return string           External content
	*/
	protected static function wget($url, $cacheDir = null)
	{
		$cacheFile = null;
		$wrapper   = '';

		// Return the content from the cache if applicable
		if ($cacheDir !== null && file_exists($cacheDir))
		{
			$cacheFile = $cacheDir . '/http.' . crc32($url);

			// Read and write through the zlib stream wrapper if it is available
			if (extension_loaded('zlib'))
			{
				$wrapper    = 'compress.zlib://';
				$cacheFile .= '.gz';
			}

			if (file_exists($cacheFile))
			{
				return file_get_contents($wrapper . $cacheFile);
			}
		}

		// Retrieve the external content from the source
		$content = @self::getHttpClient()->get($url, ['User-Agent: PHP (not Mozilla)']);

		// Save to the cache if applicable. Empty content is not cached
		if ($cacheFile !== null && !empty($content))
		{
			file_put_contents($wrapper . $cacheFile, $content);
		}

		return $content;
	}
}

1		<?php
2
3		/**
4		* @package s9e\TextFormatter
5		* @copyright Copyright (c) 2010-2016 The s9e Authors
6		* @license http://www.opensource.org/licenses/mit-license.php The MIT License
7		*/
8		namespace s9e\TextFormatter\Plugins\MediaEmbed;
9
10		use s9e\TextFormatter\Utils\Http;
11		use s9e\TextFormatter\Parser as TagStack;
12		use s9e\TextFormatter\Parser\Tag;
13		use s9e\TextFormatter\Plugins\ParserBase;
14
15		class Parser extends ParserBase
16		{
17		/**
18		* @var \s9e\TextFormatter\Utils\Http\Client Client used to perform HTTP request
19		*/
20		protected static $client;
21
22		/**
23		* {@inheritdoc}
24		*/
25	331	public function parse($text, array $matches)
26		{
27	331	foreach ($matches as $m)
28		{
29	331	$url = $m[0][0];
30	331	$pos = $m[0][1];
31	331	$len = strlen($url);
32
33	331	$tag = $this->parser->addSelfClosingTag('MEDIA', $pos, $len);
34	331	$tag->setAttribute('url', $url);
35
36		// Give that tag priority over other tags such as Autolink's
37	331	$tag->setSortPriority(-10);
38	331	}
39	331	}
40
41		/**
42		* Filter a MEDIA tag
43		*
44		* This will always invalidate the original tag, and possibly replace it with the tag that
45		* corresponds to the media site
46		*
47		* @param Tag $tag The original tag
48		* @param TagStack $tagStack Parser instance, so that we can add the new tag to the stack
49		* @param array $sites Map of [host => siteId]
50		* @return bool Unconditionally FALSE
51		*/
52	325	public static function filterTag(Tag $tag, TagStack $tagStack, array $sites)
53		{
54	325	if ($tag->hasAttribute('media'))
55	325	{
56	6	self::addTagFromMediaId($tag, $tagStack, $sites);
57	6	}
58	319	elseif ($tag->hasAttribute('url'))
59		{
60	319	self::addTagFromMediaUrl($tag, $tagStack, $sites);
61	319	}
62
63	325	return false;
64		}
65
66		/**
67		* Test whether a given tag has at least one non-default attribute
68		*
69		* @param Tag $tag The original tag
70		* @return bool Whether the tag contains an attribute not named "url"
71		*/
72	52	public static function hasNonDefaultAttribute(Tag $tag)
73		{
74	52	foreach ($tag->getAttributes() as $attrName => $void)
75		{
76	52	if ($attrName !== 'url')
77	52	{
78	48	return true;
79		}
80	47	}
81
82	4	return false;
83		}
84
85		/**
86		* Scrape the content of an URL to extract some data
87		*
88		* @param Tag $tag Source tag
89		* @param array $scrapeConfig Array of scrape directives
90		* @param string $cacheDir Path to the cache directory
91		* @return bool Unconditionally TRUE
92		*/
93	122	public static function scrape(Tag $tag, array $scrapeConfig, $cacheDir = null)
94		{
95	122	if (!$tag->hasAttribute('url'))
96	122	{
97	1	return true;
98		}
99
100	121	$url = $tag->getAttribute('url');
101
102		// Ensure that the URL actually looks like a URL
103	121	if (!preg_match('#^https?://[^<>"\'\\s]+$#D', $url))
104	121	{
105		// A bad URL means we don't scrape, but it doesn't necessarily invalidate the tag
106	3	return true;
107		}
108
109	118	foreach ($scrapeConfig as $scrape)
110		{
111	118	self::scrapeEntry($url, $tag, $scrape, $cacheDir);
112	118	}
113
114	118	return true;
115		}
116
117		//==============================================================================================
118		// Internals
119		//==============================================================================================
120
121		/**
122		* Add a site tag
123		*
124		* @param Tag $tag The original tag
125		* @param TagStack $tagStack Parser instance, so that we can add the new tag to the stack
126		* @param string $siteId Site ID
127		* @return void
128		*/
129	323	protected static function addSiteTag(Tag $tag, TagStack $tagStack, $siteId)
130		{
131	323	$endTag = $tag->getEndTag() ?: $tag;
132
133		// Compute the boundaries of our new tag
134	323	$lpos = $tag->getPos();
135	323	$rpos = $endTag->getPos() + $endTag->getLen();
136
137		// Create a new tag and copy this tag's attributes and priority
138	323	$newTag = $tagStack->addSelfClosingTag(strtoupper($siteId), $lpos, $rpos - $lpos);
139	323	$newTag->setAttributes($tag->getAttributes());
140	323	$newTag->setSortPriority($tag->getSortPriority());
141	323	}
142
143		/**
144		* Add a media site tag based on the attributes of a MEDIA tag
145		*
146		* @param Tag $tag The original tag
147		* @param TagStack $tagStack Parser instance
148		* @param array $sites Map of [host => siteId]
149		* @return void
150		*/
151	6	protected static function addTagFromMediaId(Tag $tag, TagStack $tagStack, array $sites)
152		{
153	6	$siteId = strtolower($tag->getAttribute('media'));
154	6	if (in_array($siteId, $sites, true))
155	6	{
156	5	self::addSiteTag($tag, $tagStack, $siteId);
157	5	}
158	6	}
159
160		/**
161		* Add a media site tag based on the url attribute of a MEDIA tag
162		*
163		* @param Tag $tag The original tag
164		* @param TagStack $tagStack Parser instance
165		* @param array $sites Map of [host => siteId]
166		* @return void
167		*/
168	319	protected static function addTagFromMediaUrl(Tag $tag, TagStack $tagStack, array $sites)
169		{
170		// Capture the scheme and (if applicable) host of the URL
171	319	$p = parse_url($tag->getAttribute('url'));
172	319	if (isset($p['scheme']) && isset($sites[$p['scheme'] . ':']))
173	319	{
174	2	$siteId = $sites[$p['scheme'] . ':'];
175	2	}
176	317	elseif (isset($p['host']))
177		{
178	317	$siteId = self::findSiteIdByHost($p['host'], $sites);
179	317	}
180
181	319	if (!empty($siteId))
182	319	{
183	318	self::addSiteTag($tag, $tagStack, $siteId);
184	318	}
185	319	}
186
187		/**
188		* Match a given host to a site ID
189		*
190		* @param string $host Host
191		* @param array $sites Map of [host => siteId]
192		* @return string\|bool Site ID or FALSE
193		*/
194	317	protected static function findSiteIdByHost($host, array $sites)
195		{
196		// Start with the full host then pop domain labels off the start until we get a match
197		do
198		{
199	317	if (isset($sites[$host]))
200	317	{
201	316	return $sites[$host];
202		}
203
204	210	$pos = strpos($host, '.');
205	210	if ($pos === false)
206	210	{
207	3	break;
208		}
209
210	210	$host = substr($host, 1 + $pos);
211		}
212	210	while ($host > '');
213
214	3	return false;
215		}
216
217		/**
218		* Return a cached instance of the HTTP client
219		*
220		* @return \s9e\TextFormatter\Utils\Http\Client
221		*/
222	1	protected static function getHttpClient()
223		{
224	1	if (!isset(self::$client))
225	1	{
226	1	self::$client = Http::getClient();
227	1	}
228	1	self::$client->timeout = 10;
229
230	1	return self::$client;
231		}
232
233		/**
234		* Replace {@var} tokens in given URL
235		*
236		* @param string $url Original URL
237		* @param string[] $vars Replacements
238		* @return string Modified URL
239		*/
240	19	protected static function replaceTokens($url, array $vars)
241		{
242	19	return preg_replace_callback(
243	19	'#\\{@(\\w+)\\}#',
244	19	function ($m) use ($vars)
245		{
246	19	return (isset($vars[$m[1]])) ? $vars[$m[1]] : '';
247	19	},
248		$url
249	19	);
250		}
251
252		/**
253		* Scrape the content of an URL to extract some data
254		*
255		* @param string $url Original URL
256		* @param Tag $tag Source tag
257		* @param array $scrape Array of scrape directives
258		* @param string $cacheDir Path to the cache directory
259		* @return void
260		*/
261	118	protected static function scrapeEntry($url, Tag $tag, array $scrape, $cacheDir)
262		{
263	118	list($matchRegexps, $extractRegexps, $attrNames) = $scrape;
264
265	118	if (!self::tagIsMissingAnyAttribute($tag, $attrNames))
266	118	{
267	28	return;
268		}
269
270		// Test whether this URL matches any regexp
271	91	$vars = [];
272	91	$matched = false;
273	91	foreach ((array) $matchRegexps as $matchRegexp)
274		{
275	91	if (preg_match($matchRegexp, $url, $m))
276	91	{
277	74	$vars += $m;
278	74	$matched = true;
279	74	}
280	91	}
281	91	if (!$matched)
282	91	{
283	33	return;
284		}
285
286		// Add the tag's attributes to the named captures from the "match" regexp
287	74	$vars += $tag->getAttributes();
288
289	74	$scrapeUrl = (isset($scrape[3])) ? self::replaceTokens($scrape[3], $vars) : $url;
290	74	self::scrapeUrl($scrapeUrl, $tag, (array) $extractRegexps, $cacheDir);
291	74	}
292
293		/**
294		* Scrape a URL to help fill a tag's attributes
295		*
296		* @param string $url URL to scrape
297		* @param Tag $tag Tag to fill
298		* @param string[] $regexps Regexps used to extract content from the page
299		* @param string\|null $cacheDir Path to the cache directory
300		* @return void
301		*/
302	74	protected static function scrapeUrl($url, Tag $tag, array $regexps, $cacheDir)
303		{
304	74	$content = self::wget($url, $cacheDir);
305
306		// Execute the extract regexps and fill any missing attribute
307	74	foreach ($regexps as $regexp)
308		{
309	74	if (preg_match($regexp, $content, $m))
310	74	{
311	72	foreach ($m as $k => $v)
312		{
313	72	if (!is_numeric($k) && !$tag->hasAttribute($k))
314	72	{
315	72	$tag->setAttribute($k, $v);
316	72	}
317	72	}
318	72	}
319	74	}
320	74	}
321
322		/**
323		* Test whether a tag is missing any of given attributes
324		*
325		* @param Tag $tag
326		* @param string[] $attrNames
327		* @return bool
328		*/
329	118	protected static function tagIsMissingAnyAttribute(Tag $tag, array $attrNames)
330		{
331	118	foreach ($attrNames as $attrName)
332		{
333	118	if (!$tag->hasAttribute($attrName))
334	118	{
335	91	return true;
336		}
337	30	}
338
339	28	return false;
340		}
341
342		/**
343		* Retrieve external content (possibly from the cache)
344		*
345		* If the cache directory exists, the external content will be saved into it. Cached content is
346		* never pruned
347		*
348		* @param string $url URL
349		* @param string $cacheDir Path to the cache directory
350		* @return string External content
351		*/
352	74	protected static function wget($url, $cacheDir = null)
353		{
354	74	$prefix = '';
355
356		// Return the content from the cache if applicable
357	74	if (isset($cacheDir) && file_exists($cacheDir))
358	74	{
359	74	$cacheFile = $cacheDir . '/http.' . crc32($url);
360	74	if (extension_loaded('zlib'))
361	74	{
362	74	$prefix = 'compress.zlib://';
363	74	$cacheFile .= '.gz';
364	74	}
365	74	if (file_exists($cacheFile))
366	74	{
367	73	return file_get_contents($prefix . $cacheFile);
368		}
369	1	}
370
371		// Retrieve the external content from the source
372	1	$content = @self::getHttpClient()->get($url, ['User-Agent: PHP (not Mozilla)']);
373
374		// Save to the cache if applicable
375	1	if (isset($cacheFile) && !empty($content))
376	1	{
377	1	file_put_contents($prefix . $cacheFile, $content);
378	1	}
379
380	1	return $content;
381		}
382		}

s9e / TextFormatter

Push — master ( 644404...5feeae )

Parser::replaceTokens() A

Complexity

Size

Duplication

Code Coverage

Duplication Side-by-Side

Filter issues like