Completed
Push — master ( 06920f...7e343b )
by Josh
18:41
created

Parser::scrapeUrl()   B

Complexity

Conditions 6
Paths 5

Size

Total Lines 19
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 8
CRAP Score 6

Importance

Changes 0
Metric Value
dl 0
loc 19
ccs 8
cts 8
cp 1
rs 8.8571
c 0
b 0
f 0
cc 6
eloc 7
nc 5
nop 4
crap 6
1
<?php
2
3
/**
4
* @package   s9e\TextFormatter
5
* @copyright Copyright (c) 2010-2017 The s9e Authors
6
* @license   http://www.opensource.org/licenses/mit-license.php The MIT License
7
*/
8
namespace s9e\TextFormatter\Plugins\MediaEmbed;
9
10
use s9e\TextFormatter\Parser as TagStack;
11
use s9e\TextFormatter\Parser\Tag;
12
use s9e\TextFormatter\Plugins\ParserBase;
13
use s9e\TextFormatter\Utils\Http;
14
15
class Parser extends ParserBase
{
	/**
	* @var \s9e\TextFormatter\Utils\Http\Client Client used to perform HTTP request
	*/
	protected static $client;

	/**
	* {@inheritdoc}
	*
	* Creates one self-closing tag per match, spanning the matched text, and stores the matched
	* text in the tag's "url" attribute
	*/
	public function parse($text, array $matches)
	{
		foreach ($matches as $m)
		{
			// NOTE(review): assumes each capture is a [text, offset] pair -- confirm against caller
			$url = $m[0][0];
			$pos = $m[0][1];
			$len = strlen($url);

			// Give that tag priority over other tags such as Autolink's
			$tag = $this->parser->addSelfClosingTag($this->config['tagName'], $pos, $len, -10);
			$tag->setAttribute('url', $url);
		}
	}

	/**
	* Filter a MEDIA tag
	*
	* This will always invalidate the original tag, and possibly replace it with the tag that
	* corresponds to the media site. The "site" attribute takes precedence over the "url"
	* attribute when both are present
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance, so that we can add the new tag to the stack
	* @param  array    $sites    Map of [host => siteId]
	* @return bool               Unconditionally FALSE
	*/
	public static function filterTag(Tag $tag, TagStack $tagStack, array $sites)
	{
		if ($tag->hasAttribute('site'))
		{
			self::addTagFromMediaId($tag, $tagStack, $sites);
		}
		elseif ($tag->hasAttribute('url'))
		{
			self::addTagFromMediaUrl($tag, $tagStack, $sites);
		}

		return false;
	}

	/**
	* Test whether a given tag has at least one non-default attribute
	*
	* @param  Tag  $tag The original tag
	* @return bool      Whether the tag contains an attribute not named "url"
	*/
	public static function hasNonDefaultAttribute(Tag $tag)
	{
		foreach ($tag->getAttributes() as $attrName => $void)
		{
			if ($attrName !== 'url')
			{
				return true;
			}
		}

		return false;
	}

	/**
	* Scrape the content of an URL to extract some data
	*
	* Nothing is scraped unless the tag has a "url" attribute that looks like an absolute HTTP(S)
	* URL. Every entry of $scrapeConfig is then applied in order
	*
	* @param  Tag    $tag          Source tag
	* @param  array  $scrapeConfig Array of scrape directives
	* @param  string $cacheDir     Path to the cache directory
	* @return bool                 Unconditionally TRUE
	*/
	public static function scrape(Tag $tag, array $scrapeConfig, $cacheDir = null)
	{
		if ($tag->hasAttribute('url'))
		{
			// Ensure that the URL actually looks like a URL if we want to use it to scrape
			$url = $tag->getAttribute('url');
			if (preg_match('#^https?://[^<>"\'\\s]+$#Di', $url))
			{
				// Lowercase the first five bytes ("http:" or "https") so that the scheme is
				// always in lowercase, and leave the rest of the URL untouched
				$url = strtolower(substr($url, 0, 5)) . substr($url, 5);
				foreach ($scrapeConfig as $scrape)
				{
					self::scrapeEntry($url, $tag, $scrape, $cacheDir);
				}
			}
		}

		return true;
	}

	//==============================================================================================
	// Internals
	//==============================================================================================

	/**
	* Add a site tag
	*
	* If the original tag is paired with an end tag, the new pair of tags covers the same spans.
	* Otherwise, a pair of zero-width tags is created around the text consumed by the original tag
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance, so that we can add the new tag to the stack
	* @param  string   $siteId   Site ID
	* @return void
	*/
	protected static function addSiteTag(Tag $tag, TagStack $tagStack, $siteId)
	{
		$startPos = $tag->getPos();
		$endTag   = $tag->getEndTag();
		if ($endTag)
		{
			$startLen = $tag->getLen();
			$endPos   = $endTag->getPos();
			$endLen   = $endTag->getLen();
		}
		else
		{
			$startLen = 0;
			$endPos   = $startPos + $tag->getLen();
			$endLen   = 0;
		}

		// Create a new tag and copy this tag's attributes and priority
		$tagStack->addTagPair(strtoupper($siteId), $startPos, $startLen, $endPos, $endLen, $tag->getSortPriority())->setAttributes($tag->getAttributes());
	}

	/**
	* Add a media site tag based on the attributes of a MEDIA tag
	*
	* The value of the "site" attribute is lowercased and must be one of the site IDs found in
	* $sites, otherwise no tag is added
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance
	* @param  array    $sites    Map of [host => siteId]
	* @return void
	*/
	protected static function addTagFromMediaId(Tag $tag, TagStack $tagStack, array $sites)
	{
		$siteId = strtolower($tag->getAttribute('site'));
		if (in_array($siteId, $sites, true))
		{
			self::addSiteTag($tag, $tagStack, $siteId);
		}
	}

	/**
	* Add a media site tag based on the url attribute of a MEDIA tag
	*
	* The URL's scheme (followed by a colon) is looked up in $sites first, then its host. Both
	* are lowercased before the lookup because schemes and host names are case-insensitive
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance
	* @param  array    $sites    Map of [host => siteId]
	* @return void
	*/
	protected static function addTagFromMediaUrl(Tag $tag, TagStack $tagStack, array $sites)
	{
		// Capture the scheme and (if applicable) host of the URL
		$p = parse_url($tag->getAttribute('url'));
		if (!is_array($p))
		{
			// parse_url() returns FALSE on seriously malformed URLs
			return;
		}

		$siteId = false;
		$scheme = (isset($p['scheme'])) ? strtolower($p['scheme']) . ':' : '';
		if ($scheme !== '' && isset($sites[$scheme]))
		{
			$siteId = $sites[$scheme];
		}
		elseif (isset($p['host']))
		{
			$siteId = self::findSiteIdByHost($p['host'], $sites);
		}

		if (!empty($siteId))
		{
			self::addSiteTag($tag, $tagStack, $siteId);
		}
	}

	/**
	* Match a given host to a site ID
	*
	* The host is matched case-insensitively. NOTE(review): this assumes that the keys of $sites
	* are stored in lowercase -- confirm against the code that generates the map
	*
	* @param  string      $host  Host
	* @param  array       $sites Map of [host => siteId]
	* @return string|bool        Site ID or FALSE
	*/
	protected static function findSiteIdByHost($host, array $sites)
	{
		// Host names are case-insensitive but array keys are not
		$host = strtolower($host);

		// Start with the full host then pop domain labels off the start until we get a match
		do
		{
			if (isset($sites[$host]))
			{
				return $sites[$host];
			}

			$pos = strpos($host, '.');
			if ($pos === false)
			{
				break;
			}

			$host = substr($host, 1 + $pos);
		}
		while ($host > '');

		return false;
	}

	/**
	* Return a cached instance of the HTTP client
	*
	* The client is created on first use and its timeout is reset to 10 on every call
	*
	* @return \s9e\TextFormatter\Utils\Http\Client
	*/
	protected static function getHttpClient()
	{
		if (!isset(self::$client))
		{
			self::$client = Http::getClient();
		}
		self::$client->timeout = 10;

		return self::$client;
	}

	/**
	* Replace {@var} tokens in given URL
	*
	* Tokens that have no corresponding entry in $vars are replaced with an empty string
	*
	* @param  string   $url  Original URL
	* @param  string[] $vars Replacements
	* @return string         Modified URL
	*/
	protected static function replaceTokens($url, array $vars)
	{
		return preg_replace_callback(
			'#\\{@(\\w+)\\}#',
			function ($m) use ($vars)
			{
				return (isset($vars[$m[1]])) ? $vars[$m[1]] : '';
			},
			$url
		);
	}

	/**
	* Scrape the content of an URL to extract some data
	*
	* The entry is skipped if the tag already has all of the entry's attributes, or if the URL
	* does not match any of the entry's "match" regexps
	*
	* @param  string $url      Original URL
	* @param  Tag    $tag      Source tag
	* @param  array  $scrape   Array of scrape directives: [match regexps, extract regexps,
	*                          attribute names] and optionally the URL to scrape as 4th element
	* @param  string $cacheDir Path to the cache directory
	* @return void
	*/
	protected static function scrapeEntry($url, Tag $tag, array $scrape, $cacheDir)
	{
		list($matchRegexps, $extractRegexps, $attrNames) = $scrape;
		if (!self::tagIsMissingAnyAttribute($tag, $attrNames))
		{
			return;
		}

		// Test whether this URL matches any regexp. Captures from every matching regexp are
		// collected, the first regexp to set a given key wins
		$vars    = [];
		$matched = false;
		foreach ((array) $matchRegexps as $matchRegexp)
		{
			if (preg_match($matchRegexp, $url, $m))
			{
				$vars   += $m;
				$matched = true;
			}
		}
		if (!$matched)
		{
			return;
		}

		// Add the tag's attributes to the named captures from the "match" regexp
		$vars += $tag->getAttributes();

		$scrapeUrl = (isset($scrape[3])) ? self::replaceTokens($scrape[3], $vars) : $url;
		self::scrapeUrl($scrapeUrl, $tag, (array) $extractRegexps, $cacheDir);
	}

	/**
	* Scrape a URL to help fill a tag's attributes
	*
	* Only named captures are used, and attributes that already exist are never overwritten
	*
	* @param  string      $url      URL to scrape
	* @param  Tag         $tag      Tag to fill
	* @param  string[]    $regexps  Regexps used to extract content from the page
	* @param  string|null $cacheDir Path to the cache directory
	* @return void
	*/
	protected static function scrapeUrl($url, Tag $tag, array $regexps, $cacheDir)
	{
		$content = self::wget($url, $cacheDir);
		if (!is_string($content))
		{
			// The content could not be retrieved, e.g. file_get_contents() returns FALSE if the
			// cache file cannot be read. There is nothing to extract from
			return;
		}

		// Execute the extract regexps and fill any missing attribute
		foreach ($regexps as $regexp)
		{
			if (preg_match($regexp, $content, $m))
			{
				foreach ($m as $k => $v)
				{
					if (!is_numeric($k) && !$tag->hasAttribute($k))
					{
						$tag->setAttribute($k, $v);
					}
				}
			}
		}
	}

	/**
	* Test whether a tag is missing any of given attributes
	*
	* @param  Tag      $tag
	* @param  string[] $attrNames
	* @return bool               TRUE if at least one attribute is missing, FALSE otherwise
	*                            (including when $attrNames is empty)
	*/
	protected static function tagIsMissingAnyAttribute(Tag $tag, array $attrNames)
	{
		foreach ($attrNames as $attrName)
		{
			if (!$tag->hasAttribute($attrName))
			{
				return true;
			}
		}

		return false;
	}

	/**
	* Retrieve external content (possibly from the cache)
	*
	* If the cache directory exists, the external content will be saved into it. Cached content is
	* never pruned. The URL's fragment is removed before the URL is used
	*
	* @param  string $url      URL
	* @param  string $cacheDir Path to the cache directory
	* @return string           External content
	*/
	protected static function wget($url, $cacheDir = null)
	{
		$prefix = '';
		$url    = preg_replace('(#.*)s', '', $url);

		// Return the content from the cache if applicable
		if (isset($cacheDir) && file_exists($cacheDir))
		{
			// NOTE(review): crc32() is a 32-bit checksum, two different URLs may share the same
			// cache file. The naming scheme is kept as-is so that existing caches remain valid
			$cacheFile = $cacheDir . '/http.' . crc32($url);
			if (extension_loaded('zlib'))
			{
				$prefix     = 'compress.zlib://';
				$cacheFile .= '.gz';
			}
			if (file_exists($cacheFile))
			{
				return file_get_contents($prefix . $cacheFile);
			}
		}

		// Retrieve the external content from the source. Errors are deliberately silenced,
		// scraping is done on a best-effort basis
		$content = @self::getHttpClient()->get($url, ['User-Agent: PHP (not Mozilla)']);

		// Save to the cache if applicable
		if (isset($cacheFile) && !empty($content))
		{
			file_put_contents($prefix . $cacheFile, $content);
		}

		return $content;
	}
}