Completed
Branch TemplateInspector (41b71f)
by Josh
12:43
created

Parser::wget()   C

Complexity

Conditions 7
Paths 8

Size

Total Lines 31
Code Lines 14

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 13
CRAP Score 7.0178

Importance

Changes 0
Metric Value
dl 0
loc 31
rs 6.7272
c 0
b 0
f 0
ccs 13
cts 14
cp 0.9286
cc 7
eloc 14
nc 8
nop 2
crap 7.0178
1
<?php
2
3
/**
4
* @package   s9e\TextFormatter
5
* @copyright Copyright (c) 2010-2017 The s9e Authors
6
* @license   http://www.opensource.org/licenses/mit-license.php The MIT License
7
*/
8
namespace s9e\TextFormatter\Plugins\MediaEmbed;
9
10
use s9e\TextFormatter\Parser as TagStack;
11
use s9e\TextFormatter\Parser\Tag;
12
use s9e\TextFormatter\Plugins\ParserBase;
13
use s9e\TextFormatter\Utils\Http;
14
15
class Parser extends ParserBase
16
{
17
	/**
18
	* @var \s9e\TextFormatter\Utils\Http\Client Client used to perform HTTP requests
19
	*/
20
	protected static $client;
21
22
	/**
	* {@inheritdoc}
	*
	* Adds one self-closing tag per match, spanning the matched text, and stores that text in the
	* tag's "url" attribute
	*/
	public function parse($text, array $matches)
	{
		foreach ($matches as $match)
		{
			$url = $match[0][0];
			$pos = $match[0][1];

			// The priority of -10 gives this tag priority over other tags such as Autolink's
			$this->parser
			     ->addSelfClosingTag($this->config['tagName'], $pos, strlen($url), -10)
			     ->setAttribute('url', $url);
		}
	}
38
39
	/**
	* Filter a MEDIA tag
	*
	* The original tag is always invalidated. If it carries a "site" attribute, or failing that a
	* "url" attribute, an attempt is made to add the tag of the corresponding media site in its place
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance, so that we can add the new tag to the stack
	* @param  array    $sites    Map of [host => siteId]
	* @return bool               Unconditionally FALSE
	*/
	public static function filterTag(Tag $tag, TagStack $tagStack, array $sites)
	{
		// An explicit site attribute is tried first, the URL is only used in its absence
		if ($tag->hasAttribute('site'))
		{
			self::addTagFromMediaId($tag, $tagStack, $sites);

			return false;
		}

		if ($tag->hasAttribute('url'))
		{
			self::addTagFromMediaUrl($tag, $tagStack, $sites);
		}

		return false;
	}
63
64
	/**
	* Test whether a given tag has at least one non-default attribute
	*
	* The only attribute considered to be a default attribute is "url"
	*
	* @param  Tag  $tag The original tag
	* @return bool      Whether the tag contains an attribute not named "url"
	*/
	public static function hasNonDefaultAttribute(Tag $tag)
	{
		// Discard the default attribute from a copy of the attributes and see whether anything's left
		$attributes = $tag->getAttributes();
		unset($attributes['url']);

		return !empty($attributes);
	}
82
83
	/**
	* Scrape the content of an URL to extract some data
	*
	* Nothing is scraped unless the tag has a "url" attribute that looks like an HTTP or HTTPS URL.
	* Otherwise, every entry of the scrape config is applied to the tag in order
	*
	* @param  Tag    $tag          Source tag
	* @param  array  $scrapeConfig Array of scrape directives
	* @param  string $cacheDir     Path to the cache directory
	* @return bool                 Unconditionally TRUE
	*/
	public static function scrape(Tag $tag, array $scrapeConfig, $cacheDir = null)
	{
		if (!$tag->hasAttribute('url'))
		{
			return true;
		}

		// Ensure that the URL actually looks like a URL if we want to use it to scrape
		$url = $tag->getAttribute('url');
		if (!preg_match('#^https?://[^<>"\'\\s]+$#Di', $url))
		{
			return true;
		}

		// The scheme was matched case-insensitively, lowercase the first five characters of the URL
		$url = strtolower(substr($url, 0, 5)) . substr($url, 5);
		foreach ($scrapeConfig as $scrape)
		{
			self::scrapeEntry($url, $tag, $scrape, $cacheDir);
		}

		return true;
	}
109
110
	//==============================================================================================
111
	// Internals
112
	//==============================================================================================
113
114
	/**
	* Add a site tag
	*
	* The new tag pair is named after the uppercased site ID and receives the original tag's
	* attributes and sort priority
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance, so that we can add the new tag to the stack
	* @param  string   $siteId   Site ID
	* @return void
	*/
	protected static function addSiteTag(Tag $tag, TagStack $tagStack, $siteId)
	{
		$endTag   = $tag->getEndTag();
		$startPos = $tag->getPos();
		if (!$endTag)
		{
			// No end tag: use zero-width tags placed at either end of the original tag's span
			$startLen = 0;
			$endPos   = $startPos + $tag->getLen();
			$endLen   = 0;
		}
		else
		{
			// Reuse the exact spans of the original start tag and its end tag
			$startLen = $tag->getLen();
			$endPos   = $endTag->getPos();
			$endLen   = $endTag->getLen();
		}

		// Create a new tag and copy this tag's attributes and priority
		$siteTag = $tagStack->addTagPair(
			strtoupper($siteId),
			$startPos,
			$startLen,
			$endPos,
			$endLen,
			$tag->getSortPriority()
		);
		$siteTag->setAttributes($tag->getAttributes());
	}
143
144
	/**
	* Add a media site tag based on the attributes of a MEDIA tag
	*
	* The lowercased value of the "site" attribute is used as site ID. No tag is added unless that
	* ID is one of the values found in $sites
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance
	* @param  array    $sites    Map of [host => siteId]
	* @return void
	*/
	protected static function addTagFromMediaId(Tag $tag, TagStack $tagStack, array $sites)
	{
		$siteId = strtolower($tag->getAttribute('site'));
		if (!in_array($siteId, $sites, true))
		{
			// Unknown site
			return;
		}

		self::addSiteTag($tag, $tagStack, $siteId);
	}
160
161
	/**
	* Add a media site tag based on the url attribute of a MEDIA tag
	*
	* The site is looked up using the URL's scheme followed by a colon first, then using the URL's
	* host. No tag is added if neither lookup produces a site ID
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance
	* @param  array    $sites    Map of [host => siteId]
	* @return void
	*/
	protected static function addTagFromMediaUrl(Tag $tag, TagStack $tagStack, array $sites)
	{
		$siteId = false;

		// Capture the scheme and (if applicable) host of the URL
		$parts = parse_url($tag->getAttribute('url'));
		if (isset($parts['scheme'], $sites[$parts['scheme'] . ':']))
		{
			$siteId = $sites[$parts['scheme'] . ':'];
		}
		elseif (isset($parts['host']))
		{
			$siteId = self::findSiteIdByHost($parts['host'], $sites);
		}

		if (empty($siteId))
		{
			return;
		}

		self::addSiteTag($tag, $tagStack, $siteId);
	}
187
188
	/**
	* Match a given host to a site ID
	*
	* The full host is tried first. As long as there is no match, the leftmost domain label is
	* removed and the remainder is tried, until there is nothing left to remove
	*
	* @param  string      $host  Host
	* @param  array       $sites Map of [host => siteId]
	* @return string|bool        Site ID or FALSE
	*/
	protected static function findSiteIdByHost($host, array $sites)
	{
		while (!isset($sites[$host]))
		{
			$dotPos = strpos($host, '.');
			if ($dotPos === false)
			{
				// This was the last label
				return false;
			}

			// Pop the leftmost label and stop if there's nothing after its dot
			$host = substr($host, 1 + $dotPos);
			if (!($host > ''))
			{
				return false;
			}
		}

		return $sites[$host];
	}
217
218
	/**
	* Return a cached instance of the HTTP client
	*
	* The client is created on first use and stored in self::$client. Its timeout is set to 10 on
	* every call
	*
	* @return \s9e\TextFormatter\Utils\Http\Client
	*/
	protected static function getHttpClient()
	{
		if (self::$client === null)
		{
			self::$client = Http::getClient();
		}

		$client = self::$client;
		$client->timeout = 10;

		return $client;
	}
233
234
	/**
	* Replace {@var} tokens in given URL
	*
	* Each token is replaced with the value stored in $vars under the token's name. Tokens that
	* have no corresponding value are replaced with an empty string
	*
	* @param  string   $url  Original URL
	* @param  string[] $vars Replacements
	* @return string         Modified URL
	*/
	protected static function replaceTokens($url, array $vars)
	{
		$replaceToken = function ($match) use ($vars)
		{
			$varName = $match[1];
			if (isset($vars[$varName]))
			{
				return $vars[$varName];
			}

			return '';
		};

		return preg_replace_callback('#\\{@(\\w+)\\}#', $replaceToken, $url);
	}
252
253
	/**
	* Apply one entry of the scrape config to a tag
	*
	* The entry is a list made of the "match" regexp(s), the "extract" regexp(s), the names of the
	* attributes it provides and optionally the URL to scrape, which may contain {@var} tokens.
	* The entry is skipped if the tag already has all of the attributes, or if the original URL does
	* not match any of the "match" regexps
	*
	* @param  string $url      Original URL
	* @param  Tag    $tag      Source tag
	* @param  array  $scrape   Array of scrape directives
	* @param  string $cacheDir Path to the cache directory
	* @return void
	*/
	protected static function scrapeEntry($url, Tag $tag, array $scrape, $cacheDir)
	{
		list($matchRegexps, $extractRegexps, $attrNames) = $scrape;

		// There's nothing to scrape if the tag already has every attribute
		if (!self::tagIsMissingAnyAttribute($tag, $attrNames))
		{
			return;
		}

		// Run every "match" regexp against the URL, the first value captured for a name is kept
		$vars     = [];
		$hasMatch = false;
		foreach ((array) $matchRegexps as $matchRegexp)
		{
			if (!preg_match($matchRegexp, $url, $captures))
			{
				continue;
			}

			$hasMatch = true;
			$vars    += $captures;
		}
		if (!$hasMatch)
		{
			return;
		}

		// Complete the captures with the tag's attributes without overwriting any capture
		$vars += $tag->getAttributes();

		// Scrape the URL built from the entry's own URL if there's one, the original URL otherwise
		if (isset($scrape[3]))
		{
			$scrapeUrl = self::replaceTokens($scrape[3], $vars);
		}
		else
		{
			$scrapeUrl = $url;
		}

		self::scrapeUrl($scrapeUrl, $tag, (array) $extractRegexps, $cacheDir);
	}
293
294
	/**
	* Scrape a URL to help fill a tag's attributes
	*
	* The content of the URL is retrieved once, then matched against every regexp. The captures
	* that have a non-numeric name are saved as attributes, unless the tag already has an attribute
	* of the same name
	*
	* @param  string      $url      URL to scrape
	* @param  Tag         $tag      Tag to fill
	* @param  string[]    $regexps  Regexps used to extract content from the page
	* @param  string|null $cacheDir Path to the cache directory
	* @return void
	*/
	protected static function scrapeUrl($url, Tag $tag, array $regexps, $cacheDir)
	{
		$content = self::wget($url, $cacheDir);

		foreach ($regexps as $regexp)
		{
			if (!preg_match($regexp, $content, $captures))
			{
				continue;
			}

			foreach ($captures as $attrName => $attrValue)
			{
				// Ignore numbered captures and never overwrite an existing attribute
				if (is_numeric($attrName) || $tag->hasAttribute($attrName))
				{
					continue;
				}

				$tag->setAttribute($attrName, $attrValue);
			}
		}
	}
322
323
	/**
	* Test whether a tag is missing any of given attributes
	*
	* @param  Tag      $tag       Tag to inspect
	* @param  string[] $attrNames Names of the attributes to look for
	* @return bool                TRUE if at least one of the attributes is not set on the tag,
	*                             FALSE otherwise, including if no names are given
	*/
	protected static function tagIsMissingAnyAttribute(Tag $tag, array $attrNames)
	{
		$isMissing = false;
		foreach ($attrNames as $attrName)
		{
			if ($tag->hasAttribute($attrName))
			{
				continue;
			}

			// One missing attribute is enough, there's no need to test the rest
			$isMissing = true;
			break;
		}

		return $isMissing;
	}
342
343
	/**
	* Retrieve external content (possibly from the cache)
	*
	* If the cache directory exists, the external content is read from it when available, and
	* saved into it after being retrieved otherwise. Cached content is never pruned. A cache file
	* that exists but cannot be read is ignored: the content is retrieved from the source again and
	* the cache file is rewritten if the retrieval produces any content
	*
	* @param  string $url      URL
	* @param  string $cacheDir Path to the cache directory
	* @return string           External content, as read from the cache or returned by the HTTP client
	*/
	protected static function wget($url, $cacheDir = null)
	{
		$prefix = '';

		// Remove the fragment, if any, before the URL is used as a cache key and requested
		$url = preg_replace('(#.*)s', '', $url);

		// Return the content from the cache if applicable
		if (isset($cacheDir) && file_exists($cacheDir))
		{
			$cacheFile = $cacheDir . '/http.' . crc32($url);
			if (extension_loaded('zlib'))
			{
				// Cache files are transparently compressed if zlib is available
				$prefix     = 'compress.zlib://';
				$cacheFile .= '.gz';
			}
			if (file_exists($cacheFile))
			{
				// file_get_contents() returns FALSE on failure, in which case the cache file is
				// treated as a miss rather than handing FALSE over to the caller
				$cachedContent = file_get_contents($prefix . $cacheFile);
				if ($cachedContent !== false)
				{
					return $cachedContent;
				}
			}
		}

		// Retrieve the external content from the source
		$content = @self::getHttpClient()->get($url, ['User-Agent: PHP (not Mozilla)']);

		// Save to the cache if applicable. $cacheFile is only set if the cache directory exists
		if (isset($cacheFile) && !empty($content))
		{
			file_put_contents($prefix . $cacheFile, $content);
		}

		return $content;
	}
384
}