Completed
Push — master ( ff6da6...21a9fc )
by Josh
17:18
created

Parser::scrapeUrl()   B

Complexity

Conditions 6
Paths 5

Size

Total Lines 19
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 8
CRAP Score 6

Importance

Changes 0
Metric Value
dl 0
loc 19
ccs 8
cts 8
cp 1
rs 8.8571
c 0
b 0
f 0
cc 6
eloc 7
nc 5
nop 4
crap 6
1
<?php
2
3
/**
4
* @package   s9e\TextFormatter
5
* @copyright Copyright (c) 2010-2017 The s9e Authors
6
* @license   http://www.opensource.org/licenses/mit-license.php The MIT License
7
*/
8
namespace s9e\TextFormatter\Plugins\MediaEmbed;
9
10
use s9e\TextFormatter\Parser as TagStack;
11
use s9e\TextFormatter\Parser\Tag;
12
use s9e\TextFormatter\Plugins\ParserBase;
13
use s9e\TextFormatter\Utils\Http;
14
15
class Parser extends ParserBase
{
	/**
	* @var \s9e\TextFormatter\Utils\Http\Client Client used to perform HTTP requests, created on
	*      first use by getHttpClient() and shared by every call afterwards
	*/
	protected static $client;

	/**
	* {@inheritdoc}
	*
	* Adds one self-closing tag per match. Each tag spans the matched URL and carries that URL in
	* its "url" attribute
	*/
	public function parse($text, array $matches)
	{
		foreach ($matches as $m)
		{
			$url = $m[0][0];
			$pos = $m[0][1];
			$len = strlen($url);

			// Give that tag priority over other tags such as Autolink's
			$tag = $this->parser->addSelfClosingTag($this->config['tagName'], $pos, $len, -10);
			$tag->setAttribute('url', $url);
		}
	}

	/**
	* Filter a MEDIA tag
	*
	* This will always invalidate the original tag, and possibly replace it with the tag that
	* corresponds to the media site. The "site" attribute takes precedence over the "url"
	* attribute when both are present
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance, so that we can add the new tag to the stack
	* @param  array    $sites    Map of [host => siteId]
	* @return void
	*/
	public static function filterTag(Tag $tag, TagStack $tagStack, array $sites)
	{
		$tag->invalidate();
		if ($tag->hasAttribute('site'))
		{
			self::addTagFromMediaId($tag, $tagStack, $sites);
		}
		elseif ($tag->hasAttribute('url'))
		{
			self::addTagFromMediaUrl($tag, $tagStack, $sites);
		}
	}

	/**
	* Invalidate given tag if it doesn't have at least one non-default attribute
	*
	* "url" is the only attribute considered to be a default attribute. A tag with no attributes
	* at all is invalidated too
	*
	* @param  Tag  $tag The original tag
	* @return void
	*/
	public static function hasNonDefaultAttribute(Tag $tag)
	{
		foreach ($tag->getAttributes() as $attrName => $void)
		{
			if ($attrName !== 'url')
			{
				// Found an attribute other than "url", the tag is kept as-is
				return;
			}
		}

		$tag->invalidate();
	}

	/**
	* Scrape the content of an URL to extract some data
	*
	* Nothing is scraped unless the tag has a "url" attribute that looks like an HTTP(S) URL.
	* Each entry of $scrapeConfig is processed in order by scrapeEntry()
	*
	* @param  Tag         $tag          Source tag
	* @param  array       $scrapeConfig Array of scrape directives
	* @param  string|null $cacheDir     Path to the cache directory, or NULL to disable the cache
	* @return bool                      Unconditionally TRUE
	*/
	public static function scrape(Tag $tag, array $scrapeConfig, $cacheDir = null)
	{
		if ($tag->hasAttribute('url'))
		{
			// Ensure that the URL actually looks like a URL if we want to use it to scrape
			$url = $tag->getAttribute('url');
			if (preg_match('#^https?://[^<>"\'\\s]+$#Di', $url))
			{
				// Normalize the scheme to lowercase, the rest of the URL is left untouched
				$url = strtolower(substr($url, 0, 5)) . substr($url, 5);
				foreach ($scrapeConfig as $scrape)
				{
					self::scrapeEntry($url, $tag, $scrape, $cacheDir);
				}
			}
		}

		return true;
	}

	//==============================================================================================
	// Internals
	//==============================================================================================

	/**
	* Add a site tag
	*
	* If the original tag is paired with an end tag, the new pair reuses the positions and lengths
	* of both. Otherwise the new pair is made of two zero-length tags placed right before and
	* right after the text covered by the original tag
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance, so that we can add the new tag to the stack
	* @param  string   $siteId   Site ID, which is uppercased to form the tag's name
	* @return void
	*/
	protected static function addSiteTag(Tag $tag, TagStack $tagStack, $siteId)
	{
		$endTag = $tag->getEndTag();
		if ($endTag)
		{
			$startPos = $tag->getPos();
			$startLen = $tag->getLen();
			$endPos   = $endTag->getPos();
			$endLen   = $endTag->getLen();
		}
		else
		{
			$startPos = $tag->getPos();
			$startLen = 0;
			$endPos   = $tag->getPos() + $tag->getLen();
			$endLen   = 0;
		}

		// Create a new tag and copy this tag's attributes and priority
		$tagStack->addTagPair(strtoupper($siteId), $startPos, $startLen, $endPos, $endLen, $tag->getSortPriority())->setAttributes($tag->getAttributes());
	}

	/**
	* Add a media site tag based on the attributes of a MEDIA tag
	*
	* The lowercased value of the "site" attribute must be one of the values of $sites for a
	* tag to be added
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance
	* @param  array    $sites    Map of [host => siteId]
	* @return void
	*/
	protected static function addTagFromMediaId(Tag $tag, TagStack $tagStack, array $sites)
	{
		$siteId = strtolower($tag->getAttribute('site'));
		if (in_array($siteId, $sites, true))
		{
			self::addSiteTag($tag, $tagStack, $siteId);
		}
	}

	/**
	* Add a media site tag based on the url attribute of a MEDIA tag
	*
	* The URL's scheme (as "scheme:") is looked up in $sites first, then its host. Both are
	* matched case-insensitively, consistently with scrape() which accepts upper-case schemes
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance
	* @param  array    $sites    Map of [host => siteId]
	* @return void
	*/
	protected static function addTagFromMediaUrl(Tag $tag, TagStack $tagStack, array $sites)
	{
		// Capture the scheme and (if applicable) host of the URL
		$p      = parse_url($tag->getAttribute('url'));
		$siteId = false;

		// parse_url() preserves the original case, so the scheme is lowercased before the lookup
		// NOTE(review): assumes the keys of $sites are lowercase -- confirm against the configurator
		$schemeKey = (isset($p['scheme'])) ? strtolower($p['scheme']) . ':' : null;
		if (isset($schemeKey) && isset($sites[$schemeKey]))
		{
			$siteId = $sites[$schemeKey];
		}
		elseif (isset($p['host']))
		{
			$siteId = self::findSiteIdByHost($p['host'], $sites);
		}

		if (!empty($siteId))
		{
			self::addSiteTag($tag, $tagStack, $siteId);
		}
	}

	/**
	* Match a given host to a site ID
	*
	* Host names are case-insensitive, so the host is lowercased before being matched
	*
	* @param  string      $host  Host
	* @param  array       $sites Map of [host => siteId]
	* @return string|bool        Site ID or FALSE
	*/
	protected static function findSiteIdByHost($host, array $sites)
	{
		// NOTE(review): assumes the keys of $sites are lowercase -- confirm against the configurator
		$host = strtolower($host);

		// Start with the full host then pop domain labels off the start until we get a match
		do
		{
			if (isset($sites[$host]))
			{
				return $sites[$host];
			}

			$pos = strpos($host, '.');
			if ($pos === false)
			{
				break;
			}

			$host = substr($host, 1 + $pos);
		}
		while ($host > '');

		return false;
	}

	/**
	* Return a cached instance of the HTTP client
	*
	* The client is created once via Http::getClient(). Its timeout is reset to 10 on every call
	*
	* @return \s9e\TextFormatter\Utils\Http\Client
	*/
	protected static function getHttpClient()
	{
		if (!isset(self::$client))
		{
			self::$client = Http::getClient();
		}
		self::$client->timeout = 10;

		return self::$client;
	}

	/**
	* Replace {@var} tokens in given URL
	*
	* Tokens that have no corresponding entry in $vars are replaced with an empty string
	*
	* @param  string   $url  Original URL
	* @param  string[] $vars Replacements
	* @return string         Modified URL
	*/
	protected static function replaceTokens($url, array $vars)
	{
		return preg_replace_callback(
			'#\\{@(\\w+)\\}#',
			function ($m) use ($vars)
			{
				return (isset($vars[$m[1]])) ? $vars[$m[1]] : '';
			},
			$url
		);
	}

	/**
	* Process one scrape directive for given URL
	*
	* The directive is skipped if the tag already has all of the attributes it could provide, or
	* if the URL matches none of its "match" regexps. Otherwise the page to scrape is either the
	* directive's own URL template (4th element, with its tokens replaced) or the original URL
	*
	* @param  string      $url      Original URL
	* @param  Tag         $tag      Source tag
	* @param  array       $scrape   Scrape directive: [match regexps, extract regexps, attribute
	*                               names, optional URL template]
	* @param  string|null $cacheDir Path to the cache directory
	* @return void
	*/
	protected static function scrapeEntry($url, Tag $tag, array $scrape, $cacheDir)
	{
		list($matchRegexps, $extractRegexps, $attrNames) = $scrape;
		if (!self::tagIsMissingAnyAttribute($tag, $attrNames))
		{
			return;
		}

		// Test whether this URL matches any regexp
		$vars    = [];
		$matched = false;
		foreach ((array) $matchRegexps as $matchRegexp)
		{
			if (preg_match($matchRegexp, $url, $m))
			{
				// Captures from earlier regexps take precedence over later ones
				$vars   += $m;
				$matched = true;
			}
		}
		if (!$matched)
		{
			return;
		}

		// Add the tag's attributes to the named captures from the "match" regexp
		$vars += $tag->getAttributes();

		$scrapeUrl = (isset($scrape[3])) ? self::replaceTokens($scrape[3], $vars) : $url;
		self::scrapeUrl($scrapeUrl, $tag, (array) $extractRegexps, $cacheDir);
	}

	/**
	* Scrape a URL to help fill a tag's attributes
	*
	* Every named capture from a matching regexp becomes an attribute, unless the tag already has
	* an attribute of the same name
	*
	* @param  string      $url      URL to scrape
	* @param  Tag         $tag      Tag to fill
	* @param  string[]    $regexps  Regexps used to extract content from the page
	* @param  string|null $cacheDir Path to the cache directory
	* @return void
	*/
	protected static function scrapeUrl($url, Tag $tag, array $regexps, $cacheDir)
	{
		$content = self::wget($url, $cacheDir);

		// Execute the extract regexps and fill any missing attribute
		foreach ($regexps as $regexp)
		{
			if (preg_match($regexp, $content, $m))
			{
				foreach ($m as $k => $v)
				{
					// Numeric keys are positional captures, only named captures are used
					if (!is_numeric($k) && !$tag->hasAttribute($k))
					{
						$tag->setAttribute($k, $v);
					}
				}
			}
		}
	}

	/**
	* Test whether a tag is missing any of given attributes
	*
	* @param  Tag      $tag
	* @param  string[] $attrNames
	* @return bool               TRUE if at least one attribute is missing, FALSE otherwise
	*                            (including when $attrNames is empty)
	*/
	protected static function tagIsMissingAnyAttribute(Tag $tag, array $attrNames)
	{
		foreach ($attrNames as $attrName)
		{
			if (!$tag->hasAttribute($attrName))
			{
				return true;
			}
		}

		return false;
	}

	/**
	* Retrieve external content (possibly from the cache)
	*
	* If the cache directory exists, the external content will be saved into it. Cached content is
	* never pruned. The cache file is named "http." followed by the CRC32 of the URL (without its
	* fragment) and is gzip-compressed if the zlib extension is loaded. Empty content is not
	* cached
	*
	* @param  string      $url      URL
	* @param  string|null $cacheDir Path to the cache directory
	* @return string|bool           External content, as returned by the cache or the HTTP client
	*                               (presumably FALSE on failure -- confirm against Http\Client)
	*/
	protected static function wget($url, $cacheDir = null)
	{
		$prefix = '';

		// Remove the fragment, it is not part of the request
		$url    = preg_replace('(#.*)s', '', $url);

		// Return the content from the cache if applicable
		if (isset($cacheDir) && file_exists($cacheDir))
		{
			$cacheFile = $cacheDir . '/http.' . crc32($url);
			if (extension_loaded('zlib'))
			{
				$prefix     = 'compress.zlib://';
				$cacheFile .= '.gz';
			}
			if (file_exists($cacheFile))
			{
				return file_get_contents($prefix . $cacheFile);
			}
		}

		// Retrieve the external content from the source
		$content = @self::getHttpClient()->get($url, ['User-Agent: PHP (not Mozilla)']);

		// Save to the cache if applicable
		if (isset($cacheFile) && !empty($content))
		{
			file_put_contents($prefix . $cacheFile, $content);
		}

		return $content;
	}
}