Completed
Push — master ( 83cb8e...7577e4 )
by Josh
30:26
created

Parser::parse()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 13
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 8
CRAP Score 2

Importance

Changes 0
Metric Value
dl 0
loc 13
ccs 8
cts 8
cp 1
rs 9.4285
c 0
b 0
f 0
cc 2
eloc 7
nc 2
nop 2
crap 2
1
<?php
2
3
/**
4
* @package   s9e\TextFormatter
5
* @copyright Copyright (c) 2010-2018 The s9e Authors
6
* @license   http://www.opensource.org/licenses/mit-license.php The MIT License
7
*/
8
namespace s9e\TextFormatter\Plugins\MediaEmbed;
9
10
use s9e\TextFormatter\Parser as TagStack;
11
use s9e\TextFormatter\Parser\Tag;
12
use s9e\TextFormatter\Plugins\ParserBase;
13
use s9e\TextFormatter\Utils\Http;
14
15
class Parser extends ParserBase
16
{
17
	/**
18
	* @var \s9e\TextFormatter\Utils\Http\Client Client used to perform HTTP requests
19
	*/
20
	protected static $client;
21
22
	/**
	* {@inheritdoc}
	*
	* Adds one self-closing tag per match. Each match is read as [[text, offset], ...]: the tag
	* spans the matched text and stores it in its "url" attribute
	*/
	public function parse($text, array $matches)
	{
		foreach ($matches as $match)
		{
			list($url, $pos) = $match[0];

			// A priority of -10 gives that tag priority over other tags such as Autolink's
			$this->parser
				->addSelfClosingTag($this->config['tagName'], $pos, strlen($url), -10)
				->setAttribute('url', $url);
		}
	}
38
39
	/**
	* Filter a MEDIA tag
	*
	* The original tag is invalidated unconditionally. A "site" attribute takes precedence over
	* a "url" attribute when deciding how to look for a replacement tag; if the tag has neither,
	* nothing else happens
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance, so that we can add the new tag to the stack
	* @param  array    $sites    Map of [host => siteId]
	* @return void
	*/
	public static function filterTag(Tag $tag, TagStack $tagStack, array $sites)
	{
		$tag->invalidate();

		if ($tag->hasAttribute('site'))
		{
			self::addTagFromMediaId($tag, $tagStack, $sites);

			return;
		}

		if ($tag->hasAttribute('url'))
		{
			self::addTagFromMediaUrl($tag, $tagStack, $sites);
		}
	}
62
63
	/**
	* Invalidate given tag unless it has at least one attribute other than "url"
	*
	* Despite its name, this method returns nothing: its only effect is to invalidate the tag
	* when "url" is its sole attribute or when it has no attributes at all
	*
	* @param  Tag  $tag The original tag
	* @return void
	*/
	public static function hasNonDefaultAttribute(Tag $tag)
	{
		$onlyDefaults = true;
		foreach ($tag->getAttributes() as $attrName => $unused)
		{
			if ($attrName !== 'url')
			{
				$onlyDefaults = false;
				break;
			}
		}

		if ($onlyDefaults)
		{
			$tag->invalidate();
		}
	}
81
82
	/**
	* Scrape the content of an URL to extract some data
	*
	* Nothing is scraped unless the tag has a "url" attribute that looks like an absolute
	* http/https URL. Every entry of the scrape config is then applied in order
	*
	* @param  Tag    $tag          Source tag
	* @param  array  $scrapeConfig Array of scrape directives
	* @param  string $cacheDir     Path to the cache directory
	* @return bool                 Unconditionally TRUE
	*/
	public static function scrape(Tag $tag, array $scrapeConfig, $cacheDir = null)
	{
		if (!$tag->hasAttribute('url'))
		{
			return true;
		}

		// Ensure that the URL actually looks like a URL if we want to use it to scrape
		$url = $tag->getAttribute('url');
		if (!preg_match('#^https?://[^<>"\'\\s]+$#Di', $url))
		{
			return true;
		}

		// The regexp above is case-insensitive, so lowercase the first five bytes ("http:" or
		// "https") and leave the rest of the URL untouched
		$prefix = strtolower(substr($url, 0, 5));
		$url    = $prefix . substr($url, 5);

		foreach ($scrapeConfig as $entry)
		{
			self::scrapeEntry($url, $tag, $entry, $cacheDir);
		}

		return true;
	}
108
109
	//==============================================================================================
110
	// Internals
111
	//==============================================================================================
112
113
	/**
	* Add a site tag
	*
	* The new tag pair is named after the uppercased site ID. If the original tag is paired with
	* an end tag, the new pair reuses the position and length of both tags. Otherwise, a
	* zero-length start tag is placed at the start of the original tag and a zero-length end tag
	* right after it
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance, so that we can add the new tag to the stack
	* @param  string   $siteId   Site ID
	* @return void
	*/
	protected static function addSiteTag(Tag $tag, TagStack $tagStack, $siteId)
	{
		// Both layouts start where the original tag starts
		$startPos = $tag->getPos();

		$endTag = $tag->getEndTag();
		if ($endTag)
		{
			$startLen = $tag->getLen();
			$endPos   = $endTag->getPos();
			$endLen   = $endTag->getLen();
		}
		else
		{
			$startLen = 0;
			$endPos   = $startPos + $tag->getLen();
			$endLen   = 0;
		}

		// Create a new tag and copy this tag's attributes and priority
		$tagName = strtoupper($siteId);
		$siteTag = $tagStack->addTagPair($tagName, $startPos, $startLen, $endPos, $endLen, $tag->getSortPriority());
		$siteTag->setAttributes($tag->getAttributes());
	}
142
143
	/**
	* Add a media site tag based on the attributes of a MEDIA tag
	*
	* The lowercased value of the "site" attribute is used as the site ID. It is only accepted
	* if it strictly matches one of the values of $sites
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance
	* @param  array    $sites    Map of [host => siteId]
	* @return void
	*/
	protected static function addTagFromMediaId(Tag $tag, TagStack $tagStack, array $sites)
	{
		$siteId = strtolower($tag->getAttribute('site'));
		if (!in_array($siteId, $sites, true))
		{
			return;
		}

		self::addSiteTag($tag, $tagStack, $siteId);
	}
159
160
	/**
	* Add a media site tag based on the url attribute of a MEDIA tag
	*
	* Nothing is added if no host can be captured from the URL, or if the lookup does not
	* produce a non-empty site ID
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance
	* @param  array    $sites    Map of [host => siteId]
	* @return void
	*/
	protected static function addTagFromMediaUrl(Tag $tag, TagStack $tagStack, array $sites)
	{
		// Capture the host of the URL, skipping over the scheme and any "user@" part
		if (!preg_match('(^\\w+://(?:[^@/]*@)?([^/]+))', $tag->getAttribute('url'), $m))
		{
			return;
		}

		$siteId = self::findSiteIdByHost($m[1], $sites);
		if (empty($siteId))
		{
			return;
		}

		self::addSiteTag($tag, $tagStack, $siteId);
	}
181
182
	/**
	* Match a given host to a site ID
	*
	* The full host is tried first, then each parent domain in turn. For instance,
	* "www.example.org" is looked up as "www.example.org", then "example.org", then "org"
	*
	* @param  string      $host  Host
	* @param  array       $sites Map of [host => siteId]
	* @return string|bool        Site ID or FALSE
	*/
	protected static function findSiteIdByHost($host, array $sites)
	{
		while (!isset($sites[$host]))
		{
			// No more labels to pop off the start of the host
			$dotPos = strpos($host, '.');
			if ($dotPos === false)
			{
				return false;
			}

			// Drop the leftmost label and stop if nothing is left after it
			$host = substr($host, 1 + $dotPos);
			if (!($host > ''))
			{
				return false;
			}
		}

		return $sites[$host];
	}
211
212
	/**
213
	* Return a cached instance of the HTTP client
214
	*
215
	* @param  string|null $cacheDir
216
	* @return \s9e\TextFormatter\Utils\Http\Client
217 2
	*/
218
	protected static function getHttpClient($cacheDir)
219 2
	{
220
		if (!isset(self::$client))
221 1
		{
222
			self::$client = (isset($cacheDir)) ? Http::getCachingClient($cacheDir) : Http::getClient();
223 2
		}
224
225 2
		return self::$client;
226
	}
227
228
	/**
	* Replace {@var} tokens in given URL
	*
	* Each token is replaced with the value of the same name in $vars. Tokens that have no
	* corresponding value are replaced with an empty string
	*
	* @param  string   $url  Original URL
	* @param  string[] $vars Replacements
	* @return string         Modified URL
	*/
	protected static function replaceTokens($url, array $vars)
	{
		$expandToken = function ($token) use ($vars)
		{
			$name = $token[1];
			if (isset($vars[$name]))
			{
				return $vars[$name];
			}

			return '';
		};

		return preg_replace_callback('#\\{@(\\w+)\\}#', $expandToken, $url);
	}
246
247
	/**
	* Apply one scrape directive to a tag
	*
	* The directive is read as [match regexps, extract regexps, attribute names] with an
	* optional URL template as fourth element. It is skipped if the tag already has all of the
	* listed attributes, or if none of the match regexps applies to the URL
	*
	* @param  string $url      Original URL
	* @param  Tag    $tag      Source tag
	* @param  array  $scrape   Array of scrape directives
	* @param  string $cacheDir Path to the cache directory
	* @return void
	*/
	protected static function scrapeEntry($url, Tag $tag, array $scrape, $cacheDir)
	{
		list($matchRegexps, $extractRegexps, $attrNames) = $scrape;

		// Nothing to do if the tag already has every attribute listed by this entry
		if (!self::tagIsMissingAnyAttribute($tag, $attrNames))
		{
			return;
		}

		// Collect the captures of every "match" regexp that applies to this URL. Captures from
		// earlier regexps are not overwritten by later ones
		$vars     = [];
		$hasMatch = false;
		foreach ((array) $matchRegexps as $regexp)
		{
			if (!preg_match($regexp, $url, $captures))
			{
				continue;
			}

			$vars    += $captures;
			$hasMatch = true;
		}
		if (!$hasMatch)
		{
			return;
		}

		// Add the tag's attributes to the captures, without overwriting any of them
		$vars += $tag->getAttributes();

		// Scrape the templated URL if there is one, the original URL otherwise
		if (isset($scrape[3]))
		{
			$scrapeUrl = self::replaceTokens($scrape[3], $vars);
		}
		else
		{
			$scrapeUrl = $url;
		}

		self::scrapeUrl($scrapeUrl, $tag, (array) $extractRegexps, $cacheDir);
	}
286
287
	/**
	* Scrape a URL to help fill a tag's attributes
	*
	* The content is retrieved once, then every regexp is run against it. Each non-numeric
	* capture becomes an attribute of the same name, unless the tag already has that attribute
	*
	* @param  string      $url      URL to scrape
	* @param  Tag         $tag      Tag to fill
	* @param  string[]    $regexps  Regexps used to extract content from the page
	* @param  string|null $cacheDir Path to the cache directory
	* @return void
	*/
	protected static function scrapeUrl($url, Tag $tag, array $regexps, $cacheDir)
	{
		$content = self::wget($url, $cacheDir);

		foreach ($regexps as $regexp)
		{
			if (!preg_match($regexp, $content, $captures))
			{
				continue;
			}

			foreach ($captures as $name => $value)
			{
				// Ignore numbered captures and never overwrite an existing attribute
				if (is_numeric($name) || $tag->hasAttribute($name))
				{
					continue;
				}

				$tag->setAttribute($name, $value);
			}
		}
	}
315
316
	/**
	* Test whether a tag is missing any of given attributes
	*
	* @param  Tag      $tag
	* @param  string[] $attrNames
	* @return bool               TRUE if at least one attribute is missing, FALSE otherwise,
	*                            including when no attribute names are given
	*/
	protected static function tagIsMissingAnyAttribute(Tag $tag, array $attrNames)
	{
		foreach ($attrNames as $name)
		{
			if ($tag->hasAttribute($name))
			{
				continue;
			}

			return true;
		}

		return false;
	}
335
336
	/**
	* Retrieve external content through the shared HTTP client
	*
	* The fragment, if any, is removed from the URL before it is requested. Errors raised while
	* obtaining the client or performing the request are suppressed. Whether the content is
	* cached depends on the client returned by getHttpClient()
	*
	* @param  string $url      URL
	* @param  string $cacheDir Path to the cache directory
	* @return string           External content, as returned by the client
	*/
	protected static function wget($url, $cacheDir = null)
	{
		$requestUrl = preg_replace('(#.*)s', '', $url);
		$headers    = ['User-Agent: PHP (not Mozilla)'];

		// The error control operator covers the whole call chain
		return @self::getHttpClient($cacheDir)->get($requestUrl, $headers);
	}
352
}