Completed
Push — master ( 644404...5feeae )
by Josh
16:44 queued 01:12
created

Parser::replaceTokens()   A

Complexity

Conditions 2
Paths 1

Size

Total Lines 11
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 7
CRAP Score 2
Metric Value
dl 0
loc 11
ccs 7
cts 7
cp 1
rs 9.4286
cc 2
eloc 6
nc 1
nop 2
crap 2
1
<?php
2
3
/**
4
* @package   s9e\TextFormatter
5
* @copyright Copyright (c) 2010-2016 The s9e Authors
6
* @license   http://www.opensource.org/licenses/mit-license.php The MIT License
7
*/
8
namespace s9e\TextFormatter\Plugins\MediaEmbed;
9
10
use s9e\TextFormatter\Utils\Http;
11
use s9e\TextFormatter\Parser as TagStack;
12
use s9e\TextFormatter\Parser\Tag;
13
use s9e\TextFormatter\Plugins\ParserBase;
14
15
class Parser extends ParserBase
16
{
17
	/**
18
	* @var \s9e\TextFormatter\Utils\Http\Client Client used to perform HTTP requests
19
	*/
20
	protected static $client;
21
22
	/**
	* Create a self-closing MEDIA tag for every match
	*
	* Each tag covers the whole matched URL and stores it in its "url" attribute
	*
	* @param  string $text    Original text (not used by this method)
	* @param  array  $matches Matches, each of which holds a [URL, position] pair at index 0
	* @return void
	*/
	public function parse($text, array $matches)
	{
		foreach ($matches as $match)
		{
			// Index 0 is the full match: the URL at [0] and its position in the text at [1]
			$capture = $match[0];

			$mediaTag = $this->parser->addSelfClosingTag('MEDIA', $capture[1], strlen($capture[0]));
			$mediaTag->setAttribute('url', $capture[0]);

			// Give that tag priority over other tags such as Autolink's
			$mediaTag->setSortPriority(-10);
		}
	}
40
41
	/**
	* Filter a MEDIA tag
	*
	* The original tag is always invalidated. If it has a "media" attribute, that attribute is used
	* to look for the corresponding media site; otherwise its "url" attribute is used, if present.
	* The site-specific tag, if any, is added to the stack by the helper methods
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance, so that we can add the new tag to the stack
	* @param  array    $sites    Map of [host => siteId]
	* @return bool               Unconditionally FALSE
	*/
	public static function filterTag(Tag $tag, TagStack $tagStack, array $sites)
	{
		// An explicit site ID takes precedence over the URL
		if ($tag->hasAttribute('media'))
		{
			self::addTagFromMediaId($tag, $tagStack, $sites);

			return false;
		}

		if ($tag->hasAttribute('url'))
		{
			self::addTagFromMediaUrl($tag, $tagStack, $sites);
		}

		return false;
	}
65
66
	/**
	* Test whether a given tag has at least one non-default attribute
	*
	* The only attribute considered to be a default attribute is "url"
	*
	* @param  Tag  $tag The original tag
	* @return bool      Whether the tag contains an attribute not named "url"
	*/
	public static function hasNonDefaultAttribute(Tag $tag)
	{
		// Discard "url" from a copy of the attributes; anything left is a non-default attribute
		$attributes = $tag->getAttributes();
		unset($attributes['url']);

		return !empty($attributes);
	}
84
85
	/**
	* Scrape the content of a URL to extract some data
	*
	* Every entry of the scrape config is applied to the tag's "url" attribute, provided that the
	* attribute exists and looks like an http or https URL. The tag is never invalidated here
	*
	* @param  Tag         $tag          Source tag
	* @param  array       $scrapeConfig Array of scrape directives
	* @param  string|null $cacheDir     Path to the cache directory
	* @return bool                      Unconditionally TRUE
	*/
	public static function scrape(Tag $tag, array $scrapeConfig, $cacheDir = null)
	{
		if ($tag->hasAttribute('url'))
		{
			$url = $tag->getAttribute('url');

			// Ensure that the URL actually looks like a URL. A bad URL means we don't scrape, but
			// it doesn't necessarily invalidate the tag
			if (preg_match('#^https?://[^<>"\'\\s]+$#D', $url))
			{
				foreach ($scrapeConfig as $entry)
				{
					self::scrapeEntry($url, $tag, $entry, $cacheDir);
				}
			}
		}

		return true;
	}
116
117
	//==============================================================================================
118
	// Internals
119
	//==============================================================================================
120
121
	/**
	* Add a site tag
	*
	* The new tag is a self-closing tag named after the uppercased site ID. It starts where the
	* original tag starts and ends where its end tag ends, or where the original tag itself ends
	* if it has no end tag. The original tag's attributes and sort priority are copied over
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance, so that we can add the new tag to the stack
	* @param  string   $siteId   Site ID
	* @return void
	*/
	protected static function addSiteTag(Tag $tag, TagStack $tagStack, $siteId)
	{
		// Use the tag itself as the right boundary if it's not paired with an end tag
		$lastTag = $tag->getEndTag();
		if (!$lastTag)
		{
			$lastTag = $tag;
		}

		// Compute the boundaries of our new tag
		$start  = $tag->getPos();
		$length = $lastTag->getPos() + $lastTag->getLen() - $start;

		// Create a new tag and copy this tag's attributes and priority
		$siteTag = $tagStack->addSelfClosingTag(strtoupper($siteId), $start, $length);
		$siteTag->setAttributes($tag->getAttributes());
		$siteTag->setSortPriority($tag->getSortPriority());
	}
142
143
	/**
	* Add a media site tag based on the attributes of a MEDIA tag
	*
	* The lowercased value of the "media" attribute is used as the site ID. Nothing is added unless
	* that ID is one of the values of $sites
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance
	* @param  array    $sites    Map of [host => siteId]
	* @return void
	*/
	protected static function addTagFromMediaId(Tag $tag, TagStack $tagStack, array $sites)
	{
		$requestedId = strtolower($tag->getAttribute('media'));
		if (!in_array($requestedId, $sites, true))
		{
			// Not a known site ID
			return;
		}

		self::addSiteTag($tag, $tagStack, $requestedId);
	}
159
160
	/**
	* Add a media site tag based on the url attribute of a MEDIA tag
	*
	* The URL's scheme followed by a colon is looked up in $sites first. If there is no such entry,
	* the site ID is searched using the URL's host. Nothing is added if no site ID is found
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance
	* @param  array    $sites    Map of [host => siteId]
	* @return void
	*/
	protected static function addTagFromMediaUrl(Tag $tag, TagStack $tagStack, array $sites)
	{
		$siteId = false;

		// Capture the scheme and (if applicable) host of the URL
		$parts     = parse_url($tag->getAttribute('url'));
		$schemeKey = (isset($parts['scheme'])) ? $parts['scheme'] . ':' : null;

		if (isset($schemeKey, $sites[$schemeKey]))
		{
			$siteId = $sites[$schemeKey];
		}
		elseif (isset($parts['host']))
		{
			$siteId = self::findSiteIdByHost($parts['host'], $sites);
		}

		if ($siteId)
		{
			self::addSiteTag($tag, $tagStack, $siteId);
		}
	}
186
187
	/**
	* Match a given host to a site ID
	*
	* The full host is tested first, then domain labels are removed from the start one at a time
	* until a key of $sites matches, e.g. "www.example.org" then "example.org" then "org".
	*
	* Host names are case-insensitive. If the host as given does not match anything, the search is
	* repeated with its lowercased form so that "WWW.Example.ORG" matches an "example.org" entry.
	* The host as given is always tried first, which means that exact matches keep precedence
	*
	* @param  string      $host  Host
	* @param  array       $sites Map of [host => siteId]
	* @return string|bool        Site ID or FALSE
	*/
	protected static function findSiteIdByHost($host, array $sites)
	{
		// Only search a second time if lowercasing the host actually changes it
		$candidates = [$host];
		$lowercased = strtolower($host);
		if ($lowercased !== $host)
		{
			$candidates[] = $lowercased;
		}

		foreach ($candidates as $candidate)
		{
			// Start with the full host then pop domain labels off the start until we get a match
			do
			{
				if (isset($sites[$candidate]))
				{
					return $sites[$candidate];
				}

				$pos = strpos($candidate, '.');
				if ($pos === false)
				{
					// That was the last label
					break;
				}

				$candidate = substr($candidate, 1 + $pos);
			}
			while ($candidate > '');
		}

		return false;
	}
216
217
	/**
	* Return a cached instance of the HTTP client
	*
	* The client is created on first use and stored in self::$client. Its timeout is set to 10
	* (presumably seconds, to be confirmed against the client's documentation) on every call
	*
	* @return \s9e\TextFormatter\Utils\Http\Client
	*/
	protected static function getHttpClient()
	{
		$client = self::$client;
		if (!isset($client))
		{
			// First call: create the client and keep it for subsequent calls
			$client       = Http::getClient();
			self::$client = $client;
		}

		$client->timeout = 10;

		return $client;
	}
232
233
	/**
	* Replace {@var} tokens in given URL
	*
	* Each token is replaced with the value stored in $vars under the token's name. Tokens that
	* have no corresponding value are removed
	*
	* @param  string   $url  Original URL
	* @param  string[] $vars Replacements
	* @return string         Modified URL
	*/
	protected static function replaceTokens($url, array $vars)
	{
		$replaceToken = function ($match) use ($vars)
		{
			// The first capture is the name of the token, without its {@ } delimiters
			$name = $match[1];
			if (isset($vars[$name]))
			{
				return $vars[$name];
			}

			return '';
		};

		return preg_replace_callback('#\\{@(\\w+)\\}#', $replaceToken, $url);
	}
251
252
	/**
	* Apply one entry of the scrape config to a tag
	*
	* An entry is an array that holds, in order: the "match" regexp(s), the "extract" regexp(s),
	* the names of the attributes and, optionally, the URL to scrape with {@var} tokens. Nothing
	* happens if the tag already has all of the named attributes, or if the original URL does not
	* match any of the "match" regexps
	*
	* @param  string      $url      Original URL
	* @param  Tag         $tag      Source tag
	* @param  array       $scrape   Array of scrape directives
	* @param  string|null $cacheDir Path to the cache directory
	* @return void
	*/
	protected static function scrapeEntry($url, Tag $tag, array $scrape, $cacheDir)
	{
		$matchRegexps   = (array) $scrape[0];
		$extractRegexps = (array) $scrape[1];
		$attrNames      = $scrape[2];

		if (!self::tagIsMissingAnyAttribute($tag, $attrNames))
		{
			return;
		}

		// Test whether this URL matches any regexp, and collect the captures of those that do.
		// If the same key is captured more than once, the first value is kept
		$captures = [];
		$hasMatch = false;
		foreach ($matchRegexps as $matchRegexp)
		{
			if (preg_match($matchRegexp, $url, $m))
			{
				$captures += $m;
				$hasMatch  = true;
			}
		}
		if (!$hasMatch)
		{
			return;
		}

		// Add the tag's attributes to the captures from the "match" regexps, without overwriting
		$vars = $captures + $tag->getAttributes();

		// Scrape the entry's own URL if it has one, otherwise scrape the original URL
		if (isset($scrape[3]))
		{
			$scrapeUrl = self::replaceTokens($scrape[3], $vars);
		}
		else
		{
			$scrapeUrl = $url;
		}

		self::scrapeUrl($scrapeUrl, $tag, $extractRegexps, $cacheDir);
	}
292
293
	/**
	* Scrape a URL to help fill a tag's attributes
	*
	* The content of the URL is tested against every regexp. Named captures are saved as
	* attributes of the tag, except for those that would replace an existing attribute
	*
	* @param  string      $url      URL to scrape
	* @param  Tag         $tag      Tag to fill
	* @param  string[]    $regexps  Regexps used to extract content from the page
	* @param  string|null $cacheDir Path to the cache directory
	* @return void
	*/
	protected static function scrapeUrl($url, Tag $tag, array $regexps, $cacheDir)
	{
		$content = self::wget($url, $cacheDir);

		// Execute the extract regexps and fill any missing attribute
		foreach ($regexps as $regexp)
		{
			if (!preg_match($regexp, $content, $captures))
			{
				continue;
			}

			foreach ($captures as $name => $value)
			{
				// Ignore numeric keys and never overwrite an attribute
				if (is_numeric($name) || $tag->hasAttribute($name))
				{
					continue;
				}

				$tag->setAttribute($name, $value);
			}
		}
	}
321
322
	/**
	* Test whether a tag is missing any of given attributes
	*
	* @param  Tag      $tag       Tag to test
	* @param  string[] $attrNames Names of the attributes to look for
	* @return bool                TRUE if at least one attribute is missing, FALSE if the tag has
	*                             them all or if the list of names is empty
	*/
	protected static function tagIsMissingAnyAttribute(Tag $tag, array $attrNames)
	{
		foreach ($attrNames as $name)
		{
			if ($tag->hasAttribute($name))
			{
				continue;
			}

			// One missing attribute is enough
			return true;
		}

		return false;
	}
341
342
	/**
	* Retrieve external content (possibly from the cache)
	*
	* If the cache directory exists, the external content will be saved into it. Cached content is
	* never pruned. The name of the cache file is based on the CRC32 of the URL. If the zlib
	* extension is loaded, the file gets a ".gz" suffix and is accessed via the compress.zlib://
	* stream wrapper
	*
	* @param  string      $url      URL
	* @param  string|null $cacheDir Path to the cache directory
	* @return string                External content, or whatever the HTTP client returns if the
	*                               request fails
	*/
	protected static function wget($url, $cacheDir = null)
	{
		// Path used to read from and write to the cache, left NULL if the cache is not in use
		$cachePath = null;

		// Return the content from the cache if applicable
		if (isset($cacheDir) && file_exists($cacheDir))
		{
			$cacheFile = $cacheDir . '/http.' . crc32($url);
			$wrapper   = '';
			if (extension_loaded('zlib'))
			{
				$wrapper    = 'compress.zlib://';
				$cacheFile .= '.gz';
			}
			$cachePath = $wrapper . $cacheFile;

			// The file's existence is tested using its plain path, without the stream wrapper
			if (file_exists($cacheFile))
			{
				return file_get_contents($cachePath);
			}
		}

		// Retrieve the external content from the source, with errors silenced
		$content = @self::getHttpClient()->get($url, ['User-Agent: PHP (not Mozilla)']);

		// Save to the cache if applicable. Empty content is never cached
		if ($cachePath !== null && !empty($content))
		{
			file_put_contents($cachePath, $content);
		}

		return $content;
	}
382
}