Completed
Push — master ( 5921a6...87237b )
by Josh
17:10 queued 10:38
created

src/Plugins/MediaEmbed/Parser.php (1 issue)

Labels
Severity

Upgrade to new PHP Analysis Engine

These results are based on our legacy PHP analysis. Consider migrating to our new PHP analysis engine instead. Learn more

1
<?php
2
3
/**
4
* @package   s9e\TextFormatter
5
* @copyright Copyright (c) 2010-2017 The s9e Authors
6
* @license   http://www.opensource.org/licenses/mit-license.php The MIT License
7
*/
8
namespace s9e\TextFormatter\Plugins\MediaEmbed;
9
10
use s9e\TextFormatter\Parser as TagStack;
11
use s9e\TextFormatter\Parser\Tag;
12
use s9e\TextFormatter\Plugins\ParserBase;
13
use s9e\TextFormatter\Utils\Http;
14
15
class Parser extends ParserBase
16
{
17
	/**
18
	* @var \s9e\TextFormatter\Utils\Http\Client Client used to perform HTTP requests
19
	*/
20
	protected static $client;
21
22
	/**
	* Add one self-closing tag per matched URL
	*
	* {@inheritdoc}
	*
	* @param  string $text    Original text (not used by this method)
	* @param  array  $matches Matches; for each one, element 0 is read as a [url, position] pair
	* @return void
	*/
	public function parse($text, array $matches)
	{
		foreach ($matches as $match)
		{
			list($url, $pos) = $match[0];

			// The -10 priority gives this tag precedence over other tags such as Autolink's
			$this->parser
			     ->addSelfClosingTag($this->config['tagName'], $pos, strlen($url), -10)
			     ->setAttribute('url', $url);
		}
	}
38
39
	/**
	* Filter a MEDIA tag
	*
	* The original tag is invalidated unconditionally. If it carries a "site" attribute, a
	* replacement is attempted from that site ID; otherwise, if it carries a "url" attribute, a
	* replacement is attempted from the URL's host
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance, so that we can add the new tag to the stack
	* @param  array    $sites    Map of [host => siteId]
	* @return void
	*/
	public static function filterTag(Tag $tag, TagStack $tagStack, array $sites)
	{
		// The MEDIA tag itself is never kept, whatever happens next
		$tag->invalidate();

		// An explicit site takes precedence over the URL
		if ($tag->hasAttribute('site'))
		{
			self::addTagFromMediaId($tag, $tagStack, $sites);

			return;
		}

		if ($tag->hasAttribute('url'))
		{
			self::addTagFromMediaUrl($tag, $tagStack, $sites);
		}
	}
62
63
	/**
	* Invalidate given tag unless it has at least one attribute other than "url"
	*
	* A tag with no attributes at all is invalidated too
	*
	* @param  Tag  $tag The original tag
	* @return void
	*/
	public static function hasNonDefaultAttribute(Tag $tag)
	{
		// Discard the default "url" attribute and see whether anything is left
		$nonDefault = array_diff_key($tag->getAttributes(), ['url' => true]);
		if (empty($nonDefault))
		{
			$tag->invalidate();
		}
	}
81
82
	/**
	* Scrape the content of a URL to extract some data
	*
	* Nothing is scraped unless the tag has a "url" attribute that looks like an HTTP(S) URL
	*
	* @param  Tag    $tag          Source tag
	* @param  array  $scrapeConfig Array of scrape directives
	* @param  string $cacheDir     Path to the cache directory
	* @return bool                 Unconditionally TRUE
	*/
	public static function scrape(Tag $tag, array $scrapeConfig, $cacheDir = null)
	{
		if (!$tag->hasAttribute('url'))
		{
			return true;
		}

		// Ensure that the URL actually looks like a URL if we want to use it to scrape
		$url = $tag->getAttribute('url');
		if (!preg_match('#^https?://[^<>"\'\\s]+$#Di', $url))
		{
			return true;
		}

		// Lowercase the first five bytes (the start of the scheme), leave the rest untouched
		$head = strtolower(substr($url, 0, 5));
		$url  = $head . substr($url, 5);

		foreach ($scrapeConfig as $entry)
		{
			self::scrapeEntry($url, $tag, $entry, $cacheDir);
		}

		return true;
	}
108
109
	//==============================================================================================
110
	// Internals
111
	//==============================================================================================
112
113
	/**
	* Add a site tag
	*
	* The new tag pair is named after the uppercased site ID and receives the original tag's
	* sort priority and attributes
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance, so that we can add the new tag to the stack
	* @param  string   $siteId   Site ID
	* @return void
	*/
	protected static function addSiteTag(Tag $tag, TagStack $tagStack, $siteId)
	{
		$endTag   = $tag->getEndTag();
		$startPos = $tag->getPos();
		if ($endTag)
		{
			// Paired tag: reuse the boundaries of the original start and end tags
			$startLen = $tag->getLen();
			$endPos   = $endTag->getPos();
			$endLen   = $endTag->getLen();
		}
		else
		{
			// No end tag: use zero-length boundaries placed around the original tag's span
			$startLen = 0;
			$endPos   = $startPos + $tag->getLen();
			$endLen   = 0;
		}

		// Create a new tag and copy this tag's attributes and priority
		$siteTag = $tagStack->addTagPair(
			strtoupper($siteId),
			$startPos,
			$startLen,
			$endPos,
			$endLen,
			$tag->getSortPriority()
		);
		$siteTag->setAttributes($tag->getAttributes());
	}
142
143
	/**
	* Add a media site tag based on the "site" attribute of a MEDIA tag
	*
	* The attribute's value is lowercased and must strictly match one of the values in $sites,
	* otherwise nothing is added
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance
	* @param  array    $sites    Map of [host => siteId]
	* @return void
	*/
	protected static function addTagFromMediaId(Tag $tag, TagStack $tagStack, array $sites)
	{
		$siteId = strtolower($tag->getAttribute('site'));
		if (!in_array($siteId, $sites, true))
		{
			return;
		}

		self::addSiteTag($tag, $tagStack, $siteId);
	}
159
160
	/**
	* Add a media site tag based on the url attribute of a MEDIA tag
	*
	* Nothing is added if no host can be captured from the URL or if the host does not map to a
	* site ID
	*
	* @param  Tag      $tag      The original tag
	* @param  TagStack $tagStack Parser instance
	* @param  array    $sites    Map of [host => siteId]
	* @return void
	*/
	protected static function addTagFromMediaUrl(Tag $tag, TagStack $tagStack, array $sites)
	{
		$siteId = false;

		// Capture the host of the URL
		if (preg_match('(^\\w+://(?:[^@/]*@)?([^/]+))', $tag->getAttribute('url'), $m))
		{
			$siteId = self::findSiteIdByHost($m[1], $sites);
		}

		// Static analysis note: findSiteIdByHost() can return a boolean (FALSE) whereas
		// addSiteTag() only expects a string. The empty() check below is what keeps FALSE (and
		// any other empty value) from ever reaching addSiteTag()
		if (empty($siteId))
		{
			return;
		}

		self::addSiteTag($tag, $tagStack, $siteId);
	}
181
182
	/**
	* Match a given host to a site ID
	*
	* The full host is tried first. On a miss, the leftmost domain label is removed and the
	* remainder is tried, until a match is found or there is nothing left to remove
	*
	* @param  string      $host  Host
	* @param  array       $sites Map of [host => siteId]
	* @return string|bool        Site ID or FALSE
	*/
	protected static function findSiteIdByHost($host, array $sites)
	{
		while (true)
		{
			if (isset($sites[$host]))
			{
				return $sites[$host];
			}

			// Without a dot there is no label left to pop off
			$dotPos = strpos($host, '.');
			if ($dotPos === false)
			{
				return false;
			}

			// Drop everything up to and including the first dot
			$host = substr($host, 1 + $dotPos);
			if (!($host > ''))
			{
				return false;
			}
		}
	}
211
212
	/**
	* Return a cached instance of the HTTP client
	*
	* The client is created on first use and stored in self::$client. Its timeout is set to 10
	* on every call
	*
	* @return \s9e\TextFormatter\Utils\Http\Client
	*/
	protected static function getHttpClient()
	{
		if (self::$client === null)
		{
			self::$client = Http::getClient();
		}

		$client          = self::$client;
		$client->timeout = 10;

		return $client;
	}
227
228
	/**
	* Replace {@var} tokens in given URL
	*
	* A token whose name has no set value in $vars is replaced with an empty string
	*
	* @param  string   $url  Original URL
	* @param  string[] $vars Replacements
	* @return string         Modified URL
	*/
	protected static function replaceTokens($url, array $vars)
	{
		$replaceToken = function ($match) use ($vars)
		{
			$varName = $match[1];
			if (isset($vars[$varName]))
			{
				return $vars[$varName];
			}

			return '';
		};

		return preg_replace_callback('#\\{@(\\w+)\\}#', $replaceToken, $url);
	}
246
247
	/**
	* Process one scrape directive for given URL
	*
	* Does nothing if the tag already has every attribute the directive can provide, or if the
	* URL does not match any of the directive's "match" regexps
	*
	* @param  string $url      Original URL
	* @param  Tag    $tag      Source tag
	* @param  array  $scrape   Scrape directive: [match regexps, extract regexps, attribute
	*                          names, optional URL template containing {@var} tokens]
	* @param  string $cacheDir Path to the cache directory
	* @return void
	*/
	protected static function scrapeEntry($url, Tag $tag, array $scrape, $cacheDir)
	{
		list($matchRegexps, $extractRegexps, $attrNames) = $scrape;
		if (!self::tagIsMissingAnyAttribute($tag, $attrNames))
		{
			return;
		}

		// Test whether this URL matches any regexp, collecting captures along the way. Captures
		// from an earlier regexp are never overwritten by a later one
		$captures = [];
		$hasMatch = false;
		foreach ((array) $matchRegexps as $regexp)
		{
			if (!preg_match($regexp, $url, $m))
			{
				continue;
			}

			$captures += $m;
			$hasMatch  = true;
		}
		if (!$hasMatch)
		{
			return;
		}

		// Add the tag's attributes to the named captures from the "match" regexp
		$captures += $tag->getAttributes();

		// Use the directive's URL template if there is one, otherwise scrape the original URL
		if (isset($scrape[3]))
		{
			$scrapeUrl = self::replaceTokens($scrape[3], $captures);
		}
		else
		{
			$scrapeUrl = $url;
		}

		self::scrapeUrl($scrapeUrl, $tag, (array) $extractRegexps, $cacheDir);
	}
286
287
	/**
	* Scrape a URL to help fill a tag's attributes
	*
	* Only named (non-numeric) captures are used, and attributes the tag already has are never
	* overwritten
	*
	* @param  string      $url      URL to scrape
	* @param  Tag         $tag      Tag to fill
	* @param  string[]    $regexps  Regexps used to extract content from the page
	* @param  string|null $cacheDir Path to the cache directory
	* @return void
	*/
	protected static function scrapeUrl($url, Tag $tag, array $regexps, $cacheDir)
	{
		$content = self::wget($url, $cacheDir);

		// Execute the extract regexps and fill any missing attribute
		foreach ($regexps as $regexp)
		{
			if (!preg_match($regexp, $content, $captures))
			{
				continue;
			}

			foreach ($captures as $name => $value)
			{
				if (is_numeric($name) || $tag->hasAttribute($name))
				{
					continue;
				}

				$tag->setAttribute($name, $value);
			}
		}
	}
315
316
	/**
	* Test whether a tag is missing any of given attributes
	*
	* @param  Tag      $tag       Tag to inspect
	* @param  string[] $attrNames Names of the attributes to look for
	* @return bool                TRUE if at least one attribute is missing, FALSE otherwise
	*                             (including when $attrNames is empty)
	*/
	protected static function tagIsMissingAnyAttribute(Tag $tag, array $attrNames)
	{
		$isMissing = false;
		foreach ($attrNames as $attrName)
		{
			if (!$tag->hasAttribute($attrName))
			{
				// One missing attribute is enough, no need to look any further
				$isMissing = true;
				break;
			}
		}

		return $isMissing;
	}
335
336
	/**
	* Retrieve external content (possibly from the cache)
	*
	* If the cache directory exists, non-empty external content will be saved into it. Cached
	* content is never pruned. Cache files are named after the CRC32 of the URL and are read and
	* written through the zlib stream wrapper if the zlib extension is loaded
	*
	* @param  string $url      URL
	* @param  string $cacheDir Path to the cache directory
	* @return string           External content
	*/
	protected static function wget($url, $cacheDir = null)
	{
		// Remove the fragment from the URL
		$url = preg_replace('(#.*)s', '', $url);

		$cacheFile = null;
		$prefix    = '';

		// Return the content from the cache if applicable
		if (isset($cacheDir) && file_exists($cacheDir))
		{
			$hasZlib   = extension_loaded('zlib');
			$prefix    = ($hasZlib) ? 'compress.zlib://' : '';
			$cacheFile = $cacheDir . '/http.' . crc32($url) . (($hasZlib) ? '.gz' : '');

			if (file_exists($cacheFile))
			{
				return file_get_contents($prefix . $cacheFile);
			}
		}

		// Retrieve the external content from the source
		$content = @self::getHttpClient()->get($url, ['User-Agent: PHP (not Mozilla)']);

		// Save to the cache if applicable
		if ($cacheFile !== null && !empty($content))
		{
			file_put_contents($prefix . $cacheFile, $content);
		}

		return $content;
	}
377
}