Completed
Push — master ( 0e549e...7e9c0f )
by Josh
20:53
created

Parser::parse()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 13
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 9
CRAP Score 2

Importance

Changes 0
Metric Value
cc 2
eloc 7
nc 2
nop 2
dl 0
loc 13
ccs 9
cts 9
cp 1
crap 2
rs 9.4285
c 0
b 0
f 0
1
<?php
2
3
/**
4
* @package   s9e\TextFormatter
5
* @copyright Copyright (c) 2010-2016 The s9e Authors
6
* @license   http://www.opensource.org/licenses/mit-license.php The MIT License
7
*/
8
namespace s9e\TextFormatter\Plugins\MediaEmbed;
9
10
use s9e\TextFormatter\Utils\Http;
11
use s9e\TextFormatter\Parser as TagStack;
12
use s9e\TextFormatter\Parser\Tag;
13
use s9e\TextFormatter\Plugins\ParserBase;
14
15
class Parser extends ParserBase
16
{
17
	/**
18
	* @var \s9e\TextFormatter\Utils\Http\Client Client used to perform HTTP request
19
	*/
20
	protected static $client;
21
22
	/**
	* Create a MEDIA tag for every match handed to this plugin
	*
	* Each match is read as [[matched text, offset], ...]: the matched text becomes the tag's
	* "url" attribute while its offset and byte length delimit the tag
	*
	* {@inheritdoc}
	*/
	public function parse($text, array $matches)
	{
		foreach ($matches as $match)
		{
			list($url, $pos) = $match[0];

			// The -10 sort priority gives this tag priority over other tags such as Autolink's
			$mediaTag = $this->parser->addSelfClosingTag('MEDIA', $pos, strlen($url), -10);
			$mediaTag->setAttribute('url', $url);
		}
	}
38
39
	/**
	* Filter a MEDIA tag
	*
	* A tag with a "media" attribute is resolved through that site ID. Otherwise, a tag with a
	* "url" attribute is resolved through its URL. In every case FALSE is returned, so the
	* original tag is always invalidated and at most replaced by the tag of a media site
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance, so that we can add the new tag to the stack
	* @param  array    $sites    Map of [host => siteId]
	* @return bool               Unconditionally FALSE
	*/
	public static function filterTag(Tag $tag, TagStack $tagStack, array $sites)
	{
		// The "media" attribute takes precedence over the "url" attribute
		if ($tag->hasAttribute('media'))
		{
			self::addTagFromMediaId($tag, $tagStack, $sites);

			return false;
		}

		if ($tag->hasAttribute('url'))
		{
			self::addTagFromMediaUrl($tag, $tagStack, $sites);
		}

		return false;
	}
63
64
	/**
	* Test whether a given tag has at least one attribute other than "url"
	*
	* @param  Tag  $tag The original tag
	* @return bool      Whether the tag contains an attribute not named "url"
	*/
	public static function hasNonDefaultAttribute(Tag $tag)
	{
		// Whatever remains once the "url" key is removed is a non-default attribute
		$extraAttributes = array_diff_key($tag->getAttributes(), ['url' => true]);

		return count($extraAttributes) > 0;
	}
82
83
	/**
	* Run every scrape directive against the URL of a tag
	*
	* Nothing is scraped if the tag has no "url" attribute or if its value does not look like an
	* http or https URL
	*
	* @param  Tag    $tag          Source tag
	* @param  array  $scrapeConfig Array of scrape directives
	* @param  string $cacheDir     Path to the cache directory
	* @return bool                 Unconditionally TRUE
	*/
	public static function scrape(Tag $tag, array $scrapeConfig, $cacheDir = null)
	{
		if (!$tag->hasAttribute('url'))
		{
			return true;
		}

		// Only scrape values that actually look like a URL
		$url = $tag->getAttribute('url');
		if (!preg_match('#^https?://[^<>"\'\\s]+$#D', $url))
		{
			return true;
		}

		foreach ($scrapeConfig as $scrapeEntry)
		{
			self::scrapeEntry($url, $tag, $scrapeEntry, $cacheDir);
		}

		return true;
	}
108
109
	//==============================================================================================
110
	// Internals
111
	//==============================================================================================
112
113
	/**
	* Add a pair of site tags spanning the original tag
	*
	* The new pair is named after the uppercased site ID, uses zero-length start and end tags and
	* receives the original tag's sort priority and attributes
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance, so that we can add the new tag to the stack
	* @param  string   $siteId   Site ID
	* @return void
	*/
	protected static function addSiteTag(Tag $tag, TagStack $tagStack, $siteId)
	{
		// Fall back to the tag itself if it has no end tag
		$endTag = $tag->getEndTag();
		if (!$endTag)
		{
			$endTag = $tag;
		}

		// The new tag starts where the original starts and ends where its end tag ends
		$startPos = $tag->getPos();
		$endPos   = $endTag->getPos() + $endTag->getLen();

		$tagName = strtoupper($siteId);
		$siteTag = $tagStack->addTagPair($tagName, $startPos, 0, $endPos, 0, $tag->getSortPriority());
		$siteTag->setAttributes($tag->getAttributes());
	}
132
133
	/**
	* Add a media site tag based on the "media" attribute of a MEDIA tag
	*
	* The attribute's value is lowercased and must strictly match one of the values of $sites,
	* otherwise no tag is added
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance
	* @param  array    $sites    Map of [host => siteId]
	* @return void
	*/
	protected static function addTagFromMediaId(Tag $tag, TagStack $tagStack, array $sites)
	{
		$siteId = strtolower($tag->getAttribute('media'));
		if (!in_array($siteId, $sites, true))
		{
			// Unknown site ID
			return;
		}

		self::addSiteTag($tag, $tagStack, $siteId);
	}
149
150
	/**
	* Add a media site tag based on the "url" attribute of a MEDIA tag
	*
	* The URL's scheme followed by a colon is looked up in $sites first. If that fails, the URL's
	* host is matched against $sites. No tag is added if neither produces a site ID
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance
	* @param  array    $sites    Map of [host => siteId]
	* @return void
	*/
	protected static function addTagFromMediaUrl(Tag $tag, TagStack $tagStack, array $sites)
	{
		$urlParts  = parse_url($tag->getAttribute('url'));
		$schemeKey = (isset($urlParts['scheme'])) ? $urlParts['scheme'] . ':' : null;

		if (isset($schemeKey) && isset($sites[$schemeKey]))
		{
			// The scheme alone identifies the site
			$siteId = $sites[$schemeKey];
		}
		elseif (isset($urlParts['host']))
		{
			$siteId = self::findSiteIdByHost($urlParts['host'], $sites);
		}
		else
		{
			$siteId = false;
		}

		if (empty($siteId))
		{
			return;
		}

		self::addSiteTag($tag, $tagStack, $siteId);
	}
176
177
	/**
	* Match a given host to a site ID
	*
	* The full host is tested first, then its leftmost label is removed and the remainder is
	* tested, and so on until a match is found or there is nothing left to remove
	*
	* @param  string      $host  Host
	* @param  array       $sites Map of [host => siteId]
	* @return string|bool        Site ID or FALSE
	*/
	protected static function findSiteIdByHost($host, array $sites)
	{
		while (true)
		{
			if (isset($sites[$host]))
			{
				return $sites[$host];
			}

			$dotPos = strpos($host, '.');
			if ($dotPos === false)
			{
				// No more labels to pop off
				return false;
			}

			// Drop everything up to and including the first dot
			$host = substr($host, $dotPos + 1);
			if (!($host > ''))
			{
				return false;
			}
		}
	}
206
207
	/**
	* Return a cached instance of the HTTP client
	*
	* The client is created on first use and stored in self::$client. Its timeout is set to 10
	* on every call
	*
	* @return \s9e\TextFormatter\Utils\Http\Client
	*/
	protected static function getHttpClient()
	{
		if (self::$client === null)
		{
			self::$client = Http::getClient();
		}

		$client = self::$client;
		$client->timeout = 10;

		return $client;
	}
222
223
	/**
	* Replace {@var} tokens in given URL
	*
	* Each token is replaced with the value stored in $vars under the token's name. Tokens with
	* no corresponding value are replaced with an empty string
	*
	* @param  string   $url  Original URL
	* @param  string[] $vars Replacements
	* @return string         Modified URL
	*/
	protected static function replaceTokens($url, array $vars)
	{
		$replaceToken = function ($match) use ($vars)
		{
			$varName = $match[1];
			if (isset($vars[$varName]))
			{
				return $vars[$varName];
			}

			return '';
		};

		return preg_replace_callback('#\\{@(\\w+)\\}#', $replaceToken, $url);
	}
241
242
	/**
	* Execute one scrape directive against a URL
	*
	* The directive is read as [match regexps, extract regexps, attribute names] with an optional
	* fourth element holding a URL template. Nothing is scraped if the tag already has all of the
	* listed attributes, or if the URL matches none of the "match" regexps
	*
	* @param  string $url      Original URL
	* @param  Tag    $tag      Source tag
	* @param  array  $scrape   Array of scrape directives
	* @param  string $cacheDir Path to the cache directory
	* @return void
	*/
	protected static function scrapeEntry($url, Tag $tag, array $scrape, $cacheDir)
	{
		list($matchRegexps, $extractRegexps, $attrNames) = $scrape;

		// Scraping is only useful if at least one attribute is still missing
		if (!self::tagIsMissingAnyAttribute($tag, $attrNames))
		{
			return;
		}

		// Collect the captures of every "match" regexp that matches this URL. Captures from
		// earlier regexps are preserved over those from later ones
		$vars     = [];
		$hasMatch = false;
		foreach ((array) $matchRegexps as $matchRegexp)
		{
			if (!preg_match($matchRegexp, $url, $captures))
			{
				continue;
			}

			$vars    += $captures;
			$hasMatch = true;
		}
		if (!$hasMatch)
		{
			return;
		}

		// The tag's attributes fill in for any name not already captured
		$vars += $tag->getAttributes();

		// Use the URL template if there is one, otherwise scrape the original URL
		if (isset($scrape[3]))
		{
			$scrapeUrl = self::replaceTokens($scrape[3], $vars);
		}
		else
		{
			$scrapeUrl = $url;
		}

		self::scrapeUrl($scrapeUrl, $tag, (array) $extractRegexps, $cacheDir);
	}
282
283
	/**
	* Scrape a URL to help fill a tag's attributes
	*
	* The content of the URL is tested against every regexp. Each non-numeric capture of a
	* matching regexp is saved as an attribute, unless the tag already has an attribute of
	* that name
	*
	* @param  string      $url      URL to scrape
	* @param  Tag         $tag      Tag to fill
	* @param  string[]    $regexps  Regexps used to extract content from the page
	* @param  string|null $cacheDir Path to the cache directory
	* @return void
	*/
	protected static function scrapeUrl($url, Tag $tag, array $regexps, $cacheDir)
	{
		$content = self::wget($url, $cacheDir);

		foreach ($regexps as $regexp)
		{
			if (!preg_match($regexp, $content, $captures))
			{
				continue;
			}

			foreach ($captures as $attrName => $attrValue)
			{
				// Skip numeric captures and never overwrite an existing attribute
				if (is_numeric($attrName) || $tag->hasAttribute($attrName))
				{
					continue;
				}

				$tag->setAttribute($attrName, $attrValue);
			}
		}
	}
311
312
	/**
	* Test whether a tag is missing any of given attributes
	*
	* @param  Tag      $tag       Tag to inspect
	* @param  string[] $attrNames Names of the attributes to look for
	* @return bool                TRUE if at least one attribute is missing, FALSE otherwise
	*/
	protected static function tagIsMissingAnyAttribute(Tag $tag, array $attrNames)
	{
		$isMissing = false;
		foreach ($attrNames as $name)
		{
			if ($tag->hasAttribute($name))
			{
				continue;
			}

			// One missing attribute is enough, no need to test the rest
			$isMissing = true;
			break;
		}

		return $isMissing;
	}
331
332
	/**
	* Retrieve external content (possibly from the cache)
	*
	* The fragment of the URL is removed before use. If the cache directory exists, the content
	* is read from it when available and saved into it otherwise, in a file named after the
	* CRC32 of the URL and gzip-compressed if the zlib extension is loaded. Cached content is
	* never pruned and empty content is never cached
	*
	* @param  string $url      URL
	* @param  string $cacheDir Path to the cache directory
	* @return string           External content
	*/
	protected static function wget($url, $cacheDir = null)
	{
		// Remove the fragment, if any
		$url = preg_replace('(#.*)s', '', $url);

		// Stays NULL if caching is not applicable
		$cacheFile = null;
		$prefix    = '';

		if ($cacheDir !== null && file_exists($cacheDir))
		{
			$useZlib   = extension_loaded('zlib');
			$prefix    = ($useZlib) ? 'compress.zlib://' : '';
			$cacheFile = $cacheDir . '/http.' . crc32($url) . (($useZlib) ? '.gz' : '');

			// Serve the cached copy if there is one
			if (file_exists($cacheFile))
			{
				return file_get_contents($prefix . $cacheFile);
			}
		}

		// Retrieve the external content from the source
		$headers = ['User-Agent: PHP (not Mozilla)'];
		$content = @self::getHttpClient()->get($url, $headers);

		// Save to the cache if applicable
		if ($cacheFile !== null && !empty($content))
		{
			file_put_contents($prefix . $cacheFile, $content);
		}

		return $content;
	}
373
}