1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
/** |
4
|
|
|
* @package s9e\TextFormatter |
5
|
|
|
* @copyright Copyright (c) 2010-2018 The s9e Authors |
6
|
|
|
* @license http://www.opensource.org/licenses/mit-license.php The MIT License |
7
|
|
|
*/ |
8
|
|
|
namespace s9e\TextFormatter\Plugins\MediaEmbed; |
9
|
|
|
|
10
|
|
|
use s9e\TextFormatter\Parser as TagStack; |
11
|
|
|
use s9e\TextFormatter\Parser\Tag; |
12
|
|
|
use s9e\TextFormatter\Plugins\ParserBase; |
13
|
|
|
use s9e\TextFormatter\Utils\Http; |
14
|
|
|
|
15
|
|
|
class Parser extends ParserBase
{
	/**
	* @var \s9e\TextFormatter\Utils\Http\Client Client used to perform HTTP requests
	*/
	protected static $client;

	/**
	* @var string|null Cache dir that the current value of self::$client was created for
	*/
	protected static $clientCacheDir;

	/**
	* {@inheritdoc}
	*/
	public function parse($text, array $matches)
	{
		// The tag name is the same for every match, read it once
		$tagName = $this->config['tagName'];
		foreach ($matches as $m)
		{
			$url = $m[0][0];
			$pos = $m[0][1];
			$len = strlen($url);

			// Give that tag priority over other tags such as Autolink's
			$this->parser->addSelfClosingTag($tagName, $pos, $len, -10)->setAttribute('url', $url);
		}
	}

	/**
	* Filter a MEDIA tag
	*
	* This will always invalidate the original tag, and possibly replace it with the tag that
	* corresponds to the media site
	*
	* @param  Tag         $tag      The original tag
	* @param  TagStack    $tagStack Parser instance, so that we can add the new tag to the stack
	* @param  array       $hosts    Map of [hostname => siteId]
	* @param  array       $sites    Map of [siteId => siteConfig]
	* @param  string|null $cacheDir Path to the cache directory
	* @return void
	*/
	public static function filterTag(Tag $tag, TagStack $tagStack, array $hosts, array $sites, $cacheDir)
	{
		// Always invalidate this tag
		$tag->invalidate();

		if (!$tag->hasAttribute('url'))
		{
			return;
		}

		$url    = $tag->getAttribute('url');
		$siteId = self::getSiteIdFromUrl($url, $hosts);
		if (!isset($sites[$siteId]))
		{
			// Unknown host, or a host mapped to a site that has no config
			return;
		}

		// Only create the replacement tag if the URL yielded at least one attribute
		$attributes = self::getAttributes($url, $sites[$siteId], $cacheDir);
		if (!empty($attributes))
		{
			self::createTag(strtoupper($siteId), $tagStack, $tag)->setAttributes($attributes);
		}
	}

	/**
	* Add named captures from a set of regular expressions to a set of attributes
	*
	* Every regexp is tried. Empty captures and captures mapped to an empty name are ignored, and
	* a later regexp overwrites a value captured under the same name by an earlier one.
	*
	* @param  array   &$attributes Associative array of strings
	* @param  string  $string      Text to match
	* @param  array[] $regexps     List of [regexp, map] pairs
	* @return bool                 Whether any regexp matched
	*/
	protected static function addNamedCaptures(array &$attributes, $string, array $regexps)
	{
		$matched = false;
		foreach ($regexps as list($regexp, $map))
		{
			if (!preg_match($regexp, $string, $m))
			{
				// No match (or PCRE error): there is nothing to capture from this regexp
				continue;
			}

			$matched = true;
			foreach ($map as $i => $name)
			{
				if ($name !== '' && isset($m[$i]) && $m[$i] !== '')
				{
					$attributes[$name] = $m[$i];
				}
			}
		}

		return $matched;
	}

	/**
	* Create a tag for a media embed
	*
	* If the reference tag is paired with an end tag, the new pair covers the same start and end
	* tags. Otherwise the new pair is made of two zero-width tags placed at the start and at the
	* end of the reference tag.
	*
	* @param  string   $tagName  Tag's name
	* @param  TagStack $tagStack
	* @param  Tag      $tag      Reference tag
	* @return Tag                New tag
	*/
	protected static function createTag($tagName, TagStack $tagStack, Tag $tag)
	{
		$startPos = $tag->getPos();
		$endTag   = $tag->getEndTag();
		if ($endTag)
		{
			$startLen = $tag->getLen();
			$endPos   = $endTag->getPos();
			$endLen   = $endTag->getLen();
		}
		else
		{
			$startLen = 0;
			$endPos   = $startPos + $tag->getLen();
			$endLen   = 0;
		}

		return $tagStack->addTagPair($tagName, $startPos, $startLen, $endPos, $endLen, $tag->getSortPriority());
	}

	/**
	* Return a set of attributes for given URL based on a site's config
	*
	* Attributes are first extracted from the URL itself using the regexps in $config[0], then
	* each scraping config in $config[1] is given a chance to add more.
	*
	* @param  string      $url      Original URL
	* @param  array       $config   Site config
	* @param  string|null $cacheDir Path to the cache directory
	* @return array                 Associative array of attributes
	*/
	protected static function getAttributes($url, array $config, $cacheDir)
	{
		$attributes = [];
		self::addNamedCaptures($attributes, $url, $config[0]);
		foreach ($config[1] as $scrapeConfig)
		{
			self::scrape($attributes, $url, $scrapeConfig, $cacheDir);
		}

		return $attributes;
	}

	/**
	* Return a cached instance of the HTTP client
	*
	* The client is reused for as long as it is requested with the same cache dir. It is
	* recreated whenever the cache dir changes so that a client created without a cache (or with
	* a different one) is never returned for a request that specifies another cache dir.
	*
	* @param  string|null $cacheDir
	* @return \s9e\TextFormatter\Utils\Http\Client
	*/
	protected static function getHttpClient($cacheDir)
	{
		if (!isset(self::$client) || self::$clientCacheDir !== $cacheDir)
		{
			self::$client         = (isset($cacheDir)) ? Http::getCachingClient($cacheDir) : Http::getClient();
			self::$clientCacheDir = $cacheDir;
		}

		return self::$client;
	}

	/**
	* Return the siteId that corresponds to given URL
	*
	* The lowercased hostname is looked up first, then each of its parent domains in turn, by
	* removing the leftmost label until nothing is left.
	*
	* @param  string $url   Original URL
	* @param  array  $hosts Map of [hostname => siteId]
	* @return string        URL's siteId, or an empty string
	*/
	protected static function getSiteIdFromUrl($url, array $hosts)
	{
		$host = (preg_match('(^https?://([^/]+))', strtolower($url), $m)) ? $m[1] : '';
		while ($host !== '')
		{
			if (isset($hosts[$host]))
			{
				return $hosts[$host];
			}

			// Remove everything up to and including the first dot, or the whole string if it
			// contains none. This always shortens the host, which guarantees that the loop
			// terminates regardless of what bytes the host contains
			$dotPos = strpos($host, '.');
			$host   = ($dotPos === false) ? '' : (string) substr($host, $dotPos + 1);
		}

		return '';
	}

	/**
	* Interpolate {@vars} in given string
	*
	* Tokens that have no corresponding entry in $vars are replaced with an empty string.
	*
	* @param  string $str  Original string
	* @param  array  $vars Associative array
	* @return string       Interpolated string
	*/
	protected static function interpolateVars($str, array $vars)
	{
		return preg_replace_callback(
			'(\\{@(\\w+)\\})',
			function ($m) use ($vars)
			{
				return (isset($vars[$m[1]])) ? $vars[$m[1]] : '';
			},
			$str
		);
	}

	/**
	* Scrape values and add them to current attributes
	*
	* Nothing is retrieved unless the original URL matches one of the regexps in
	* $config['match']. If $config['url'] is set, it replaces the original URL after its {@vars}
	* have been interpolated using the captures from the match, then the current attributes.
	*
	* @param  array       &$attributes Attributes
	* @param  string      $url         Original URL
	* @param  array       $config      Scraping config
	* @param  string|null $cacheDir    Path to the cache directory
	* @return void
	*/
	protected static function scrape(array &$attributes, $url, array $config, $cacheDir)
	{
		$vars = [];
		if (!self::addNamedCaptures($vars, $url, $config['match']))
		{
			return;
		}

		if (isset($config['url']))
		{
			// Captures from the match take precedence over existing attributes
			$url = self::interpolateVars($config['url'], $vars + $attributes);
		}

		// Only http(s) URLs are retrieved, and the fragment is not part of the request
		if (preg_match('(^https?://[^#]+)i', $url, $m))
		{
			self::addNamedCaptures($attributes, self::wget($m[0], $cacheDir), $config['extract']);
		}
	}

	/**
	* Retrieve external content
	*
	* Errors raised while retrieving the content are deliberately silenced: scraping is done on a
	* best-effort basis.
	*
	* @param  string      $url      URL
	* @param  string|null $cacheDir Path to the cache directory
	* @return string                External content
	*/
	protected static function wget($url, $cacheDir)
	{
		return @self::getHttpClient($cacheDir)->get($url, ['User-Agent: PHP (not Mozilla)']);
	}
}