Completed
Push — master ( 00d88c...61baac )
by Josh
30:59
created

Parser::getHttpClient()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 10
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 6

Importance

Changes 0
Metric Value
dl 0
loc 10
ccs 0
cts 7
cp 0
rs 9.4285
c 0
b 0
f 0
cc 2
eloc 5
nc 2
nop 0
crap 6
1
<?php
2
3
/**
4
* @package   s9e\TextFormatter
5
* @copyright Copyright (c) 2010-2017 The s9e Authors
6
* @license   http://www.opensource.org/licenses/mit-license.php The MIT License
7
*/
8
namespace s9e\TextFormatter\Plugins\MediaEmbed;
9
10
use s9e\TextFormatter\Parser as TagStack;
11
use s9e\TextFormatter\Parser\Tag;
12
use s9e\TextFormatter\Plugins\ParserBase;
13
use s9e\TextFormatter\Utils\Http;
14
15
class Parser extends ParserBase
16
{
17
	/**
18
	* @var \s9e\TextFormatter\Utils\Http\Client Client used to perform HTTP request
19
	*/
20
	protected static $client;
21
22
	/**
	* {@inheritdoc}
	*
	* Adds one self-closing tag per match. The tag covers the matched URL and carries it in its
	* "url" attribute
	*/
	public function parse($text, array $matches)
	{
		foreach ($matches as $match)
		{
			// Each match holds the matched URL and its position in the text
			list($matchedUrl, $offset) = $match[0];

			// The -10 sort priority gives this tag priority over other tags such as Autolink's
			$this->parser
				->addSelfClosingTag($this->config['tagName'], $offset, strlen($matchedUrl), -10)
				->setAttribute('url', $matchedUrl);
		}
	}
38
39
	/**
	* Filter a MEDIA tag
	*
	* The original tag is always invalidated. If its "site" attribute, or failing that its "url"
	* attribute, can be resolved to a media site, a tag for that site is added to the stack
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance, so that we can add the new tag to the stack
	* @param  array    $sites    Map of [host => siteId]
	* @return bool               Unconditionally FALSE
	*/
	public static function filterTag(Tag $tag, TagStack $tagStack, array $sites)
	{
		// A "site" attribute is used in preference to the "url" attribute
		if ($tag->hasAttribute('site'))
		{
			self::addTagFromMediaId($tag, $tagStack, $sites);

			return false;
		}

		if ($tag->hasAttribute('url'))
		{
			self::addTagFromMediaUrl($tag, $tagStack, $sites);
		}

		return false;
	}
63
64
	/**
	* Test whether a given tag has at least one non-default attribute
	*
	* The only attribute considered to be a default attribute is "url"
	*
	* @param  Tag  $tag The original tag
	* @return bool      Whether the tag contains an attribute not named "url"
	*/
	public static function hasNonDefaultAttribute(Tag $tag)
	{
		// Whatever remains once "url" is removed is a non-default attribute
		$otherAttributes = array_diff_key($tag->getAttributes(), ['url' => true]);

		return !empty($otherAttributes);
	}
82
83
	/**
	* Scrape the content of a URL to extract some data
	*
	* Nothing is scraped unless the tag has a "url" attribute that looks like an HTTP or HTTPS URL.
	* Every entry of the scrape config is then applied to that URL in order
	*
	* @param  Tag    $tag          Source tag
	* @param  array  $scrapeConfig Array of scrape directives
	* @param  string $cacheDir     Path to the cache directory
	* @return bool                 Unconditionally TRUE
	*/
	public static function scrape(Tag $tag, array $scrapeConfig, $cacheDir = null)
	{
		if (!$tag->hasAttribute('url'))
		{
			return true;
		}

		// Ensure that the URL actually looks like a URL if we want to use it to scrape
		$url = $tag->getAttribute('url');
		if (!preg_match('#^https?://[^<>"\'\\s]+$#Di', $url))
		{
			return true;
		}

		// Lowercase the first five bytes, which hold the scheme, and keep the rest as-is
		$url = strtolower(substr($url, 0, 5)) . substr($url, 5);
		foreach ($scrapeConfig as $scrape)
		{
			self::scrapeEntry($url, $tag, $scrape, $cacheDir);
		}

		return true;
	}
109
110
	//==============================================================================================
111
	// Internals
112
	//==============================================================================================
113
114
	/**
	* Add a site tag
	*
	* The new tag pair is named after the uppercased site ID. It spans from the start of the
	* original tag to the end of its end tag, or to the end of the original tag itself if it has
	* no end tag, and it receives the original tag's sort priority and attributes
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance, so that we can add the new tag to the stack
	* @param  string   $siteId   Site ID
	* @return void
	*/
	protected static function addSiteTag(Tag $tag, TagStack $tagStack, $siteId)
	{
		// Without an end tag, the original tag marks both boundaries
		$lastTag = $tag->getEndTag();
		if (!$lastTag)
		{
			$lastTag = $tag;
		}

		// Compute the boundaries of our new tag
		$startPos = $tag->getPos();
		$endPos   = $lastTag->getPos() + $lastTag->getLen();

		// Create a new tag and copy this tag's attributes and priority
		$siteTag = $tagStack->addTagPair(strtoupper($siteId), $startPos, 0, $endPos, 0, $tag->getSortPriority());
		$siteTag->setAttributes($tag->getAttributes());
	}
133
134
	/**
	* Add a media site tag based on the attributes of a MEDIA tag
	*
	* The lowercased value of the "site" attribute is used as the site ID. No tag is added unless
	* that ID is one of the values of $sites
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance
	* @param  array    $sites    Map of [host => siteId]
	* @return void
	*/
	protected static function addTagFromMediaId(Tag $tag, TagStack $tagStack, array $sites)
	{
		$siteId = strtolower($tag->getAttribute('site'));

		// Ignore site IDs that do not appear in the map
		if (!in_array($siteId, $sites, true))
		{
			return;
		}

		self::addSiteTag($tag, $tagStack, $siteId);
	}
150
151
	/**
	* Add a media site tag based on the url attribute of a MEDIA tag
	*
	* The URL's scheme followed by a colon is looked up in $sites first. If that fails, the
	* URL's host is matched against $sites. No tag is added if neither lookup yields a site ID
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance
	* @param  array    $sites    Map of [host => siteId]
	* @return void
	*/
	protected static function addTagFromMediaUrl(Tag $tag, TagStack $tagStack, array $sites)
	{
		$siteId = false;

		// Capture the scheme and (if applicable) host of the URL
		$parts     = parse_url($tag->getAttribute('url'));
		$schemeKey = (isset($parts['scheme'])) ? $parts['scheme'] . ':' : null;

		if (isset($schemeKey, $sites[$schemeKey]))
		{
			$siteId = $sites[$schemeKey];
		}
		elseif (isset($parts['host']))
		{
			$siteId = self::findSiteIdByHost($parts['host'], $sites);
		}

		if (!empty($siteId))
		{
			self::addSiteTag($tag, $tagStack, $siteId);
		}
	}
177
178
	/**
	* Match a given host to a site ID
	*
	* The full host is tried first, then domain labels are removed from its start one at a time
	* until one of the keys of $sites matches. Host names are case-insensitive, so if the host as
	* given yields no match and it differs from its lowercased form, the same search is repeated
	* on the lowercased host. A host that matches as given always returns that match
	*
	* @param  string      $host  Host
	* @param  array       $sites Map of [host => siteId]
	* @return string|bool        Site ID or FALSE
	*/
	protected static function findSiteIdByHost($host, array $sites)
	{
		// The host as given always goes first so that existing matches are unaffected
		$candidates = [$host];
		$lowercased = strtolower($host);
		if ($lowercased !== $host)
		{
			$candidates[] = $lowercased;
		}

		foreach ($candidates as $candidate)
		{
			// Start with the full host then pop domain labels off the start until we get a match
			do
			{
				if (isset($sites[$candidate]))
				{
					return $sites[$candidate];
				}

				$pos = strpos($candidate, '.');
				if ($pos === false)
				{
					// No label left to remove, move on to the next candidate if there is one
					break;
				}

				$candidate = substr($candidate, 1 + $pos);
			}
			while ($candidate > '');
		}

		return false;
	}
207
208
	/**
	* Return a cached instance of the HTTP client
	*
	* The client is created on first use and stored in self::$client. Its timeout is set to 10
	* on every call, not only when the client is created
	*
	* @return \s9e\TextFormatter\Utils\Http\Client
	*/
	protected static function getHttpClient()
	{
		if (self::$client === null)
		{
			self::$client = Http::getClient();
		}

		$client          = self::$client;
		$client->timeout = 10;

		return $client;
	}
223
224
	/**
	* Replace {@var} tokens in given URL
	*
	* Each token is replaced with the value stored in $vars under the token's name. A token whose
	* name is not set in $vars is replaced with an empty string
	*
	* @param  string   $url  Original URL
	* @param  string[] $vars Replacements
	* @return string         Modified URL
	*/
	protected static function replaceTokens($url, array $vars)
	{
		$replaceToken = function ($match) use ($vars)
		{
			// The first capture holds the token's name, without the {@ } delimiters
			$name = $match[1];
			if (!isset($vars[$name]))
			{
				return '';
			}

			return $vars[$name];
		};

		return preg_replace_callback('#\\{@(\\w+)\\}#', $replaceToken, $url);
	}
242
243
	/**
	* Apply one entry of the scrape config to a URL
	*
	* The entry is skipped if the tag already has all of the entry's attributes, or if the URL
	* does not match any of the entry's "match" regexps. Otherwise the content of the URL, or of
	* the entry's own URL template if it has one, is scraped to fill the tag's attributes
	*
	* @param  string $url      Original URL
	* @param  Tag    $tag      Source tag
	* @param  array  $scrape   Array of scrape directives
	* @param  string $cacheDir Path to the cache directory
	* @return void
	*/
	protected static function scrapeEntry($url, Tag $tag, array $scrape, $cacheDir)
	{
		list($matchRegexps, $extractRegexps, $attrNames) = $scrape;

		if (!self::tagIsMissingAnyAttribute($tag, $attrNames))
		{
			return;
		}

		// Test whether this URL matches any regexp. Captures from earlier regexps are kept over
		// captures of the same name from later regexps
		$vars       = [];
		$matchCount = 0;
		foreach ((array) $matchRegexps as $matchRegexp)
		{
			if (!preg_match($matchRegexp, $url, $m))
			{
				continue;
			}

			$vars += $m;
			++$matchCount;
		}
		if ($matchCount === 0)
		{
			return;
		}

		// Add the tag's attributes to the named captures from the "match" regexp. Captures are
		// kept over attributes of the same name
		$vars += $tag->getAttributes();

		// The fourth element of the entry, if set, is a URL template that replaces the original URL
		if (isset($scrape[3]))
		{
			$scrapeUrl = self::replaceTokens($scrape[3], $vars);
		}
		else
		{
			$scrapeUrl = $url;
		}

		self::scrapeUrl($scrapeUrl, $tag, (array) $extractRegexps, $cacheDir);
	}
283
284
	/**
	* Scrape a URL to help fill a tag's attributes
	*
	* The content of the URL is retrieved once, then every regexp is run against it. Named
	* captures are copied to the tag as attributes, without overwriting existing attributes
	*
	* @param  string      $url      URL to scrape
	* @param  Tag         $tag      Tag to fill
	* @param  string[]    $regexps  Regexps used to extract content from the page
	* @param  string|null $cacheDir Path to the cache directory
	* @return void
	*/
	protected static function scrapeUrl($url, Tag $tag, array $regexps, $cacheDir)
	{
		$content = self::wget($url, $cacheDir);

		// Execute the extract regexps and fill any missing attribute
		foreach ($regexps as $regexp)
		{
			if (!preg_match($regexp, $content, $captures))
			{
				continue;
			}

			foreach ($captures as $name => $value)
			{
				// Skip numbered captures as well as attributes that are already set
				if (is_numeric($name) || $tag->hasAttribute($name))
				{
					continue;
				}

				$tag->setAttribute($name, $value);
			}
		}
	}
312
313
	/**
	* Test whether a tag is missing any of given attributes
	*
	* @param  Tag      $tag       Tag to inspect
	* @param  string[] $attrNames Names of the attributes to look for
	* @return bool                TRUE if at least one attribute is missing, FALSE otherwise,
	*                             including when the list of names is empty
	*/
	protected static function tagIsMissingAnyAttribute(Tag $tag, array $attrNames)
	{
		$isMissing = false;
		foreach ($attrNames as $attrName)
		{
			if (!$tag->hasAttribute($attrName))
			{
				// One missing attribute is enough, no need to test the others
				$isMissing = true;
				break;
			}
		}

		return $isMissing;
	}
332
333
	/**
	* Retrieve external content (possibly from the cache)
	*
	* If the cache directory exists, the external content will be saved into it. Cached content is
	* never pruned. Cache files are named after the CRC32 of the URL and are stored compressed if
	* the zlib extension is loaded
	*
	* @param  string $url      URL
	* @param  string $cacheDir Path to the cache directory
	* @return string           External content
	*/
	protected static function wget($url, $cacheDir = null)
	{
		// Remove the fragment from the URL
		$url = preg_replace('(#.*)s', '', $url);

		$prefix   = '';
		$useCache = (isset($cacheDir) && file_exists($cacheDir));

		// Return the content from the cache if applicable
		if ($useCache)
		{
			$cacheFile = $cacheDir . '/http.' . crc32($url);
			if (extension_loaded('zlib'))
			{
				// Read and write the cache file through the zlib stream wrapper
				$prefix     = 'compress.zlib://';
				$cacheFile .= '.gz';
			}
			if (file_exists($cacheFile))
			{
				return file_get_contents($prefix . $cacheFile);
			}
		}

		// Retrieve the external content from the source
		$content = @self::getHttpClient()->get($url, ['User-Agent: PHP (not Mozilla)']);

		// Save to the cache if applicable. Empty content is never cached
		if ($useCache && !empty($content))
		{
			file_put_contents($prefix . $cacheFile, $content);
		}

		return $content;
	}
374
}