|
1
|
|
|
<?php |
|
2
|
|
|
|
|
3
|
|
|
declare(strict_types = 1); |
|
4
|
|
|
|
|
5
|
|
|
namespace App\Model; |
|
6
|
|
|
|
|
7
|
|
|
use App\Repository\BlameRepository; |
|
8
|
|
|
|
|
9
|
|
|
/**
 * A Blame will search the given page for the given text and return the relevant revisions and authors.
 *
 * Typical usage is to construct the object, call prepareData(), and then read the results
 * through getMatches() or getEdits().
 */
class Blame extends Authorship
{
    /** @var string Text to search for. */
    protected string $query;

    /**
     * @var array|null Matches, keyed by revision ID, each with keys 'edit' <Edit> and 'tokens' <string[]>.
     *   Null until prepareData() has successfully populated it.
     */
    protected ?array $matches = null;

    /** @var Edit|null Target revision that is being blamed. Null until resolved by getAsOf(). */
    protected ?Edit $asOf = null;

    /**
     * Blame constructor.
     * @param BlameRepository $repository
     * @param Page $page The page to process.
     * @param string $query Text to search for.
     * @param string|null $target Either a revision ID or date in YYYY-MM-DD format. Null to use latest revision.
     */
    public function __construct(
        BlameRepository $repository,
        Page $page,
        string $query,
        ?string $target = null
    ) {
        parent::__construct($repository, $page, $target);
        $this->query = $query;
    }

    /**
     * Get the search query, exactly as it was passed to the constructor.
     * @return string
     */
    public function getQuery(): string
    {
        return $this->query;
    }

    /**
     * Matches, keyed by revision ID, each with keys 'edit' <Edit> and 'tokens' <string[]>.
     * @return array|null Null if prepareData() has not (successfully) run yet.
     */
    public function getMatches(): ?array
    {
        return $this->matches;
    }

    /**
     * Get all the matches as Edits.
     * @return Edit[]|null Null if prepareData() has not (successfully) run yet.
     */
    public function getEdits(): ?array
    {
        // array_column() does not accept null, so honour the nullable return type explicitly.
        if (null === $this->matches) {
            return null;
        }

        return array_column($this->matches, 'edit');
    }

    /**
     * Strip out whitespace, since it is not accounted for in the WikiWho API, and lowercase the result.
     * @return string Empty string if the query consists only of whitespace.
     */
    public function getTokenizedQuery(): string
    {
        // preg_replace() returns null on a PCRE error; cast so strtolower() always receives a string.
        return strtolower((string)preg_replace('/\s+/', '', $this->query));
    }

    /**
     * Get the first "token" of the search query. A "token" in this case is a word or group of syntax,
     * roughly correlating to the token structure returned by the WikiWho API.
     * @return string Lowercased first whitespace-delimited chunk of the query, or an empty string
     *   if the query contains nothing but whitespace.
     */
    public function getFirstQueryToken(): string
    {
        // PREG_SPLIT_NO_EMPTY skips leading whitespace, which would otherwise yield an empty first token.
        $chunks = preg_split('/\s+/', $this->query, -1, PREG_SPLIT_NO_EMPTY) ?: [];

        return strtolower($chunks[0] ?? '');
    }

    /**
     * Get the target revision that is being blamed.
     * @return Edit|null Null when no target was given, or when the repository returned no Edit for it.
     */
    public function getAsOf(): ?Edit
    {
        if (isset($this->asOf)) {
            return $this->asOf;
        }

        // NOTE(review): $this->target, $this->page and $this->repository are not declared in this class;
        // presumably they are set by the parent constructor -- confirm against Authorship.
        $this->asOf = $this->target
            ? $this->repository->getEditFromRevId($this->page, $this->target)
            : null;

        return $this->asOf;
    }

    /**
     * Get authorship attribution from the WikiWho API.
     *
     * Populates $this->matches. Does nothing if the matches were already computed, and leaves
     * $this->matches as null if no revision data could be retrieved.
     * @see https://www.f-squared.org/wikiwho/
     */
    public function prepareData(): void
    {
        if (isset($this->matches)) {
            return;
        }

        // Fetch revision data; a null result is treated as "nothing to search".
        // NOTE(review): getRevisionData() is not defined in this class (presumably inherited);
        // the meaning of the `true` argument cannot be seen from here -- confirm.
        $revisionData = $this->getRevisionData(true);
        if (null === $revisionData) {
            return;
        }

        $matches = $this->searchTokens($revisionData['tokens']);

        // We want the results grouped by revision ID.
        $this->matches = [];
        // Revision IDs for which no Edit could be loaded, so that we only ask the repository once per ID.
        $unresolved = [];
        foreach ($matches as $match) {
            $revId = $match['id'];

            if (isset($this->matches[$revId])) {
                $this->matches[$revId]['tokens'][] = $match['token'];
                continue;
            }

            if (isset($unresolved[$revId])) {
                continue;
            }

            $edit = $this->repository->getEditFromRevId($this->page, $revId);
            if ($edit) {
                $this->matches[$revId] = [
                    'edit' => $edit,
                    'tokens' => [$match['token']],
                ];
            } else {
                $unresolved[$revId] = true;
            }
        }
    }

    /**
     * Find matches of search query in the given list of tokens.
     * @param array $tokens Each token is read as an array with the keys 'str', 'o_rev_id' and 'editor'.
     * @return array Matching tokens in reverse document order, each with keys 'id', 'editor' and 'token'.
     *   Empty if the query contains nothing but whitespace.
     */
    private function searchTokens(array $tokens): array
    {
        $matchData = [];
        $matchDataSoFar = [];
        $matchSoFar = '';
        $firstQueryToken = $this->getFirstQueryToken();
        $tokenizedQuery = $this->getTokenizedQuery();

        // An empty needle is found in every string by strpos() as of PHP 8 (and triggers a warning
        // before that), so an empty query must not be searched for at all.
        if ('' === $tokenizedQuery) {
            return [];
        }

        foreach ($tokens as $token) {
            // The previous matches plus the new token. This is basically a candidate for what may become $matchSoFar.
            $newMatchSoFar = $matchSoFar.$token['str'];

            // We first check if the first token of the query matches, because we want to allow for partial matches
            // (e.g. for query "barbaz", the tokens ["foobar","baz"] should match).
            if (false !== strpos($newMatchSoFar, $firstQueryToken)) {
                // If the full query is in the new match, use it, otherwise use just the first token. This is because
                // the full match may exist across multiple tokens, but the first match is only a partial match.
                $newMatchSoFar = false !== strpos($newMatchSoFar, $tokenizedQuery)
                    ? $newMatchSoFar
                    : $firstQueryToken;
            }

            // Keep track of tokens that match. To allow partial matches,
            // we check the query against $newMatchSoFar and vice versa.
            if (false !== strpos($tokenizedQuery, $newMatchSoFar) ||
                false !== strpos($newMatchSoFar, $tokenizedQuery)
            ) {
                $matchSoFar = $newMatchSoFar;
                $matchDataSoFar[] = [
                    'id' => $token['o_rev_id'],
                    'editor' => $token['editor'],
                    'token' => $token['str'],
                ];
            } elseif ('' !== $matchSoFar) {
                // We hit a token that isn't in the query string, so start over.
                // A strict comparison is used because empty() also treats the string "0" as empty,
                // which would leave a stale partial match of "0" in place.
                $matchDataSoFar = [];
                $matchSoFar = '';
            }

            // A full match was found, so merge $matchDataSoFar into $matchData,
            // and start over to see if there are more matches in the article.
            if (false !== strpos($matchSoFar, $tokenizedQuery)) {
                $matchData = array_merge($matchData, $matchDataSoFar);
                $matchDataSoFar = [];
                $matchSoFar = '';
            }
        }

        // Full matches usually come last, but are the most relevant.
        return array_reverse($matchData);
    }
}
|
196
|
|
|
|