1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Sugarcrm\UpgradeSpec\Purifier; |
4
|
|
|
|
5
|
|
|
/**
 * Post-processes an HTML string according to a set of boolean options.
 *
 * Supported options (all default to false):
 *  - "absolute_urls":     rewrite relative href values against the base URL;
 *  - "no_tag_duplicates": turn <strong> into <b> and merge adjacent <b> runs;
 *  - "pre_to_code":       turn <pre> into <code> and normalise its line breaks.
 */
class Html implements PurifierInterface
{
    /**
     * URL that relative links are resolved against.
     *
     * @var string
     */
    private $baseUrl;

    /**
     * Option flags merged over the defaults.
     *
     * @var array
     */
    private $options;

    /**
     * Html constructor.
     *
     * @param string $baseUrl base URL used when "absolute_urls" is enabled
     * @param array  $options option flags, merged over the all-false defaults
     *
     * @throws \InvalidArgumentException when "absolute_urls" is enabled without a base URL
     */
    public function __construct($baseUrl = '', array $options = [])
    {
        $this->baseUrl = $baseUrl;

        $this->options = array_merge([
            'absolute_urls' => false,
            'no_tag_duplicates' => false,
            'pre_to_code' => false,
        ], $options);

        // validateOptions() reads $this->baseUrl itself and declares no parameters.
        $this->validateOptions();
    }

    /**
     * Purifies html.
     *
     * The enabled transformations are applied in a fixed order:
     * links first, then tag duplicates, then pre -> code.
     *
     * @param string $html
     *
     * @return string
     */
    public function purify($html)
    {
        if ($this->options['absolute_urls']) {
            $html = $this->convertLinks($html);
        }

        if ($this->options['no_tag_duplicates']) {
            $html = $this->removeTagDuplicates($html);
        }

        if ($this->options['pre_to_code']) {
            $html = $this->convertCode($html);
        }

        return $html;
    }

    /**
     * Validates options.
     *
     * @throws \InvalidArgumentException when "absolute_urls" is enabled and the base URL is empty
     */
    private function validateOptions()
    {
        if (empty($this->baseUrl) && $this->options['absolute_urls']) {
            throw new \InvalidArgumentException('"absolute_urls" requires not empty base url');
        }
    }

    /**
     * Converts all relative links (@href) to absolute ones.
     *
     * Protocol-relative links get the base scheme, links that already carry
     * a scheme are left alone, and "#..." / "?..." links are appended to the
     * base URL verbatim. Everything else is resolved against the directory
     * of the base URL (or against the host root for links starting with "/").
     *
     * @param string $content
     *
     * @return mixed result of preg_replace_callback()
     */
    private function convertLinks($content)
    {
        // value of a quoted href attribute
        $pattern = '/(?<=href=("|\'))[^"\']+(?=("|\'))/';

        $base = $this->baseUrl;
        $scheme = parse_url($base, PHP_URL_SCHEME);
        $host = parse_url($base, PHP_URL_HOST);
        $port = parse_url($base, PHP_URL_PORT);

        // keep an explicit port of the base URL in the generated links
        $authority = $port ? $host . ':' . $port : $host;

        // Directory part of the base path (last, non-directory element removed).
        // It does not depend on the link, so it is computed once; the cast
        // covers a base URL without any path, for which parse_url() yields null.
        $directory = preg_replace('#/[^/]*$#', '', (string) parse_url($base, PHP_URL_PATH));

        return preg_replace_callback($pattern, function ($matches) use ($base, $scheme, $authority, $directory) {
            $hrefValue = $matches[0];

            // protocol-relative URL: only the scheme is missing
            if (mb_strpos($hrefValue, '//') === 0) {
                return $scheme . ':' . $hrefValue;
            }

            // return if already absolute URL
            if (parse_url($hrefValue, PHP_URL_SCHEME) != '') {
                return $hrefValue;
            }

            // queries and anchors
            if ($hrefValue[0] == '#' || $hrefValue[0] == '?') {
                return $base . $hrefValue;
            }

            // a link pointing to the root ignores the base directory
            $path = $hrefValue[0] == '/' ? '' : $directory;

            // dirty absolute URL with '//' and '/./' replaced by '/'
            $abs = str_replace(['//', '/./'], '/', $authority . $path . '/' . $hrefValue);

            // Collapse '/foo/../' into '/'. One pass only resolves one level of
            // '..' per spot, so repeat until nothing is left to collapse
            // (every replacement shortens the string, hence the loop ends).
            do {
                $abs = preg_replace('/\/[^\/]+\/\.\.\//', '/', $abs, -1, $collapsed);
            } while ($collapsed > 0);

            // absolute URL is ready
            return $scheme . '://' . $abs;
        }, $content);
    }

    /**
     * Removes duplicated tags.
     *
     * @param string $content
     *
     * @return mixed result of preg_replace()
     */
    private function removeTagDuplicates($content)
    {
        // strong -> b
        $content = str_replace(['<strong>', '</strong>'], ['<b>', '</b>'], $content);

        // unite runs of opening tags, then runs of closing tags
        $content = preg_replace('/(<b>\s*)+/', '<b>', $content);
        $content = preg_replace('/(<\/b>\s*)+/', '</b> ', $content);

        // cleanup: a closing tag directly followed by an opening one becomes a space
        return preg_replace('/(<\/b>\s*<b>)+/', ' ', $content);
    }

    /**
     * Converts "pre" to "code".
     *
     * Inside every resulting <code> element all line breaks (raw newlines and
     * <br> variants) become PHP_EOL. Elements that are multiline or contain one
     * of the keywords "function", "class", "array" are wrapped in "<br />".
     *
     * @param string $content
     *
     * @return mixed result of preg_replace_callback()
     */
    private function convertCode($content)
    {
        $content = str_replace(['<pre>', '<pre ', '</pre>'], ['<code>', '<code ', '</code>'], $content);

        return preg_replace_callback('/<code(.*?)>(.*?)<\/code>/s', function ($matches) {
            // normalise every kind of line break to PHP_EOL
            $code = str_replace(["\r\n", "\r", "\n"], '<br />', $matches[0]);
            $code = str_replace(['<br></br>', '<br>', '<br/>', '<br />'], PHP_EOL, $code);

            // if multiline or real code snippet
            // NOTE(review): the check runs on the whole match, tag included,
            // so a "class" attribute on the tag also counts as a keyword hit.
            $isSnippet = false !== mb_strpos($code, PHP_EOL);
            foreach (['function', 'class', 'array'] as $keyword) {
                if ($isSnippet) {
                    break;
                }
                $isSnippet = false !== mb_strpos($code, $keyword);
            }

            return $isSnippet ? '<br />' . $code . '<br />' : $code;
        }, $content);
    }
}
184
|
|
|
|
This check compares calls to functions or methods with their respective definitions. If the call has more arguments than are defined, it raises an issue.
If a function is defined several times with a different number of parameters, the check may pick up the wrong definition and report false positives. One codebase where this has been known to happen is WordPress.
In this case you can add the `@ignore` PhpDoc annotation to the duplicate definition and it will be ignored.