Completed
Push — master ( 3560ba...257e65 )
by Angus
03:23
created

Base_Site_Model::cleanTitleDataDOM()   A

Complexity

Conditions 1
Paths 1

Size

Total Lines 3
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 0
CRAP Score 2

Importance

Changes 0
Metric Value
cc 1
eloc 2
nc 1
nop 1
dl 0
loc 3
ccs 0
cts 0
cp 0
crap 2
rs 10
c 0
b 0
f 0
1
<?php declare(strict_types=1); defined('BASEPATH') OR exit('No direct script access allowed');
2
3
/**
 * Lazy-loading registry for site models.
 *
 * Accessing an undefined property whose name is a loaded (or autoloadable) class
 * deriving from Base_Site_Model instantiates that class, stores it on this object
 * and returns it. Any other property name is resolved on the CodeIgniter
 * controller returned by get_instance().
 */
class Tracker_Sites_Model extends CI_Model {
	public function __construct() {
		parent::__construct();
	}

	/**
	 * Resolves an undefined property.
	 *
	 * @param string $name Property being read.
	 *
	 * @return mixed The (newly created) site model when $name is an instantiable
	 *               Base_Site_Model subclass, otherwise get_instance()->{$name}.
	 */
	public function __get($name) {
		//TODO: Is this a good idea? There wasn't a good consensus on if this is good practice or not..
		//      It's probably a minor speed reduction, but that isn't much of an issue.
		//      An alternate solution would simply have a function which generates a PHP file with code to load each model. Similar to: https://github.com/shish/shimmie2/blob/834bc740a4eeef751f546979e6400fd089db64f8/core/util.inc.php#L1422
		if($this->isSiteModel($name)) {
			$this->loadSite($name);
			return $this->{$name};
		}

		return get_instance()->{$name};
	}

	/**
	 * Tells whether $name is a class we can instantiate as a site model.
	 *
	 * is_subclass_of() is used rather than comparing get_parent_class() so that models
	 * which extend Base_Site_Model through an intermediate class are recognised too.
	 * Abstract classes are rejected, as `new` on them would be a fatal error.
	 */
	private function isSiteModel(string $name) : bool {
		if(!class_exists($name) || !is_subclass_of($name, 'Base_Site_Model')) {
			return FALSE;
		}

		return (new ReflectionClass($name))->isInstantiable();
	}

	/**
	 * Instantiates $siteName and caches it as a property, so later reads bypass __get().
	 */
	private function loadSite(string $siteName) {
		$this->{$siteName} = new $siteName();
	}
}
24
25
abstract class Base_Site_Model extends CI_Model {
26
	/** @var string Name of the concrete site class; assigned from get_class($this) in the constructor and used as a prefix in log messages. */
	public $site          = '';
27
	/** @var string PCRE pattern (delimiters included) that isValidTitleURL() matches title URLs against. Empty here; presumably set by each site model - TODO confirm. */
	public $titleFormat   = '';
28
	/** @var string PCRE pattern (delimiters included) that isValidChapter() matches chapter strings against. Empty here; presumably set by each site model - TODO confirm. */
	public $chapterFormat = '';
29
30
	/**
	 * Runs the CI_Model constructor, asks the CodeIgniter loader for the database
	 * and records the concrete class name in $site.
	 */
	public function __construct() {
		parent::__construct();

		$this->load->database();

		//Late static binding yields the same name get_class($this) would.
		$this->site = static::class;
	}
37
38
	/**
	 * Must be implemented by each site model.
	 * NOTE(review): presumably expands the site-specific $title_url into the full URL of the title page - confirm against implementations.
	 *
	 * @param string $title_url
	 *
	 * @return string
	 */
	abstract public function getFullTitleURL(string $title_url) : string;
39
40
	/**
	 * Must be implemented by each site model.
	 * NOTE(review): the shape of the returned array is not visible from this class - confirm against implementations.
	 *
	 * @param string $title_url
	 * @param string $chapter
	 *
	 * @return array
	 */
	abstract public function getChapterData(string $title_url, string $chapter) : array;
41
42
	//TODO: When ci-phpunit-test supports PHP Parser 3.x, add " : ?array"
43
	/**
	 * Must be implemented by each site model. No return type is declared yet; the TODO
	 * preceding this declaration says it is meant to become "?array".
	 *
	 * @param string $title_url
	 * @param bool   $firstGet  NOTE(review): meaning not visible from this class - confirm against implementations.
	 *
	 * @return array|null
	 */
	abstract public function getTitleData(string $title_url, bool $firstGet = FALSE);
44
45
	/**
	 * Checks $title_url against this site's $titleFormat pattern.
	 * A non-matching URL is logged at 'error' level.
	 *
	 * @param string $title_url
	 *
	 * @return bool TRUE when the pattern matches, FALSE otherwise.
	 */
	final public function isValidTitleURL(string $title_url) : bool {
		if(preg_match($this->titleFormat, $title_url)) {
			return TRUE;
		}

		log_message('error', "Invalid Title URL ({$this->site}): {$title_url}");
		return FALSE;
	}
50
	/**
	 * Checks $chapter against this site's $chapterFormat pattern.
	 * A non-matching chapter is logged at 'error' level.
	 *
	 * @param string $chapter
	 *
	 * @return bool TRUE when the pattern matches, FALSE otherwise.
	 */
	final public function isValidChapter(string $chapter) : bool {
		if(preg_match($this->chapterFormat, $chapter)) {
			return TRUE;
		}

		log_message('error', "Invalid Chapter ({$this->site}): {$chapter}");
		return FALSE;
	}
55
56
	/**
	 * Fetches $url with cURL and splits the response into headers, status code and body.
	 *
	 * @param string $url             URL to request.
	 * @param string $cookie_string   Sent as the Cookie header when non-empty.
	 * @param string $cookiejar_path  File to read cookies from when non-empty.
	 * @param bool   $follow_redirect Whether cURL should follow Location headers.
	 * @param bool   $isPost          Send a POST request instead of a GET.
	 * @param array  $postFields      Fields for the POST body (URL-encoded with http_build_query()).
	 *
	 * @return array|false ['headers' => parsed headers, 'status_code' => HTTP status, 'body' => string],
	 *                     or FALSE when the transfer failed (the cURL error is logged).
	 */
	final protected function get_content(string $url, string $cookie_string = "", string $cookiejar_path = "", bool $follow_redirect = FALSE, bool $isPost = FALSE, array $postFields = []) {
		$ch = curl_init();
		if($ch === FALSE) {
			log_message('error', "curl failed to initialise");
			return FALSE;
		}

		curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
		curl_setopt($ch, CURLOPT_ENCODING , "gzip");
		//curl_setopt($ch, CURLOPT_VERBOSE, 1);
		curl_setopt($ch, CURLOPT_HEADER, 1); //Headers are returned in front of the body and split off below.

		if($follow_redirect)        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);

		if(!empty($cookie_string))  curl_setopt($ch, CURLOPT_COOKIE, $cookie_string);
		if(!empty($cookiejar_path)) curl_setopt($ch, CURLOPT_COOKIEFILE, $cookiejar_path);

		//Some sites check the useragent for stuff, use a pre-defined user-agent to avoid stuff.
		curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2824.0 Safari/537.36');

		//TODO: Check in a while if this being enabled still causes issues
		//curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE); //FIXME: This isn't safe, but it allows us to grab SSL URLs

		curl_setopt($ch, CURLOPT_URL, $url);

		if($isPost) {
			//CURLOPT_POST is a boolean switch, not a field count: passing count($postFields) would turn it off for an empty field list.
			curl_setopt($ch, CURLOPT_POST, TRUE);
			curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($postFields));
		}

		$response = curl_exec($ch);
		if($response === FALSE) {
			log_message('error', "curl failed with error: ".curl_errno($ch)." | ".curl_error($ch));
			curl_close($ch); //Release the handle on the failure path as well.

			//FIXME: We don't always account for FALSE return
			return FALSE;
		}

		$status_code = curl_getinfo($ch, CURLINFO_HTTP_CODE);
		$header_size = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
		curl_close($ch);

		return [
			'headers'     => http_parse_headers(substr($response, 0, $header_size)),
			'status_code' => $status_code,
			'body'        => substr($response, $header_size)
		];
	}
100
101
	/**
	 * Validates a get_content() response and extracts the title, latest-update and chapter nodes with XPath.
	 *
	 * Every failure is logged at 'error' level (prefixed with the site name and title URL) and yields FALSE.
	 *
	 * @param array|false $content             Result of get_content(): an array with 'status_code' and 'body' keys, or a non-array when the request failed.
	 * @param string      $title_url           Only used in log messages.
	 * @param string      $node_title_string   XPath that must match exactly one node in the document.
	 * @param string      $node_row_string     XPath that must match exactly one node in the document.
	 * @param string      $node_latest_string  XPath evaluated relative to the row node; must match exactly one node.
	 * @param string      $node_chapter_string XPath evaluated relative to the row node; must match exactly one node. Pass '' to use the row node itself.
	 * @param string      $failure_string      When non-empty and present in the body, the page is treated as a failure.
	 *
	 * @return DOMNode[]|false ['nodes_title' => ..., 'nodes_latest' => ..., 'nodes_chapter' => ...] or FALSE on any failure.
	 */
	final protected function parseTitleDataDOM(
		$content, string $title_url,
		string $node_title_string, string $node_row_string,
		string $node_latest_string, string $node_chapter_string,
		string $failure_string = "") {

		if(!is_array($content)) {
			log_message('error', "{$this->site} : {$title_url} | Failed to grab URL (See above curl error)");
			return FALSE;
		}

		//Only the status code and body are needed here; the headers are deliberately not extracted.
		$status_code = $content['status_code'] ?? 0;
		$data        = $content['body'] ?? '';

		if(!($status_code >= 200 && $status_code < 300)) {
			log_message('error', "{$this->site} : {$title_url} | Bad Status Code ({$status_code})");
			return FALSE;
		}
		if(empty($data)) {
			log_message('error', "{$this->site} : {$title_url} | Data is empty? (Status code: {$status_code})");
			return FALSE;
		}
		if($failure_string !== "" && strpos($data, $failure_string) !== FALSE) {
			log_message('error', "{$this->site} : {$title_url} | Failure string matched");
			return FALSE;
		}

		$data = $this->cleanTitleDataDOM($data); //This allows us to clean the DOM prior to parsing. It's faster to grab the only part we need THEN parse it.

		$dom = new DOMDocument();
		//Real-world HTML is rarely valid: collect parser warnings internally, then discard them and
		//restore whatever error mode was active before (instead of unconditionally switching it off).
		$previousErrorMode = libxml_use_internal_errors(TRUE);
		$dom->loadHTML('<?xml encoding="utf-8" ?>' . $data);
		libxml_clear_errors();
		libxml_use_internal_errors($previousErrorMode);

		$xpath       = new DOMXPath($dom);
		$nodes_title = $xpath->query($node_title_string);
		$nodes_row   = $xpath->query($node_row_string);
		if($nodes_title === FALSE || $nodes_row === FALSE) {
			//DOMXPath::query() returns FALSE for a malformed expression.
			log_message('error', "{$this->site} : {$title_url} | Invalid XPath expression (TITLE or ROW)");
			return FALSE;
		}
		if(!($nodes_title->length === 1 && $nodes_row->length === 1)) {
			log_message('error', "{$this->site} : {$title_url} | Invalid amount of nodes (TITLE: {$nodes_title->length} | ROW: {$nodes_row->length})");
			return FALSE;
		}

		$firstRow      = $nodes_row->item(0);
		$nodes_latest  = $xpath->query($node_latest_string, $firstRow);
		$nodes_chapter = ($node_chapter_string !== '') ? $xpath->query($node_chapter_string, $firstRow) : $nodes_row;
		if($nodes_latest === FALSE || $nodes_chapter === FALSE) {
			log_message('error', "{$this->site} : {$title_url} | Invalid XPath expression (LATEST or CHAPTER)");
			return FALSE;
		}
		if(!($nodes_latest->length === 1 && $nodes_chapter->length === 1)) {
			log_message('error', "{$this->site} : {$title_url} | Invalid amount of nodes (LATEST: {$nodes_latest->length} | CHAPTER: {$nodes_chapter->length})");
			return FALSE;
		}

		return [
			'nodes_title'   => $nodes_title->item(0),
			'nodes_latest'  => $nodes_latest->item(0),
			'nodes_chapter' => $nodes_chapter->item(0)
		];
	}
167
168
	/**
	 * Hook called by parseTitleDataDOM() on the response body before it is loaded into a DOMDocument.
	 * This base implementation hands the markup back untouched.
	 *
	 * @param string $data Raw response body.
	 *
	 * @return string The markup to parse.
	 */
	public function cleanTitleDataDOM(string $data) : string {
		$markup = $data;

		return $markup;
	}
171
172
	/**
	 * Fetches and parses the title page of a FoOlSlide-based site.
	 *
	 * This has its own function due to FoOlSlide being used a lot by fan translation sites,
	 * and the code being pretty much the same across all of them.
	 *
	 * @param string $fullURL   URL of the title page; requested as a POST with adult=true.
	 * @param string $title_url Passed to parseTitleDataDOM() and used in log messages.
	 *
	 * @return array|null ['title', 'latest_chapter', 'last_updated' (Y-m-d H:i:s)], or NULL when the
	 *                    request, the DOM parsing or the date parsing failed.
	 */
	final public function parseFoolSlide(string $fullURL, string $title_url) {
		$content = $this->get_content($fullURL, "", "", FALSE, TRUE, ['adult' => 'true']);
		if(!$content) {
			return NULL;
		}

		//Reduce the page to its <article> element before the DOM is built.
		$content['body'] = preg_replace('/^[\S\s]*(<article[\S\s]*)<\/article>[\S\s]*$/', '$1', $content['body']);

		$data = $this->parseTitleDataDOM(
			$content,
			$title_url,
			"//div[@class='large comic']/h1[@class='title']",
			"(//div[@class='list']/div[@class='group']/div[@class='title' and text() = 'Chapters']/following-sibling::div[@class='element'][1] | //div[@class='list']/div[@class='element'][1] | //div[@class='list']/div[@class='group'][1]/div[@class='element'][1])[1]",
			"div[@class='meta_r']",
			"div[@class='title']/a"
		);
		if(!$data) {
			return NULL;
		}

		//The date is whatever follows the first comma of the meta text. Guard against a missing
		//comma or an unparsable date instead of handing FALSE to date(), which is a TypeError under strict_types.
		$metaParts = explode(',', (string) $data['nodes_latest']->nodeValue);
		$timestamp = isset($metaParts[1]) ? strtotime(str_replace('.', '', $metaParts[1])) : FALSE;
		if($timestamp === FALSE) {
			log_message('error', "{$this->site} : {$title_url} | Unable to parse last updated date");
			return NULL;
		}

		$link = (string) $data['nodes_chapter']->getAttribute('href');

		return [
			'title'          => trim($data['nodes_title']->textContent),
			'latest_chapter' => preg_replace('/.*\/read\/.*?\/(.*?)\/$/', '$1', $link),
			'last_updated'   => date("Y-m-d H:i:s", $timestamp)
		];
	}
199
200
	/**
	 * Runs the site's handleCustomFollow() with a callback that evaluates the follow response.
	 *
	 * The callback receives ($content, $id, $successCallback). The follow counts as successful when
	 * $content is an array whose 'status_code' is exactly 200 and, if a callable $successCallback was
	 * supplied, that callback returns a truthy value for the response body. Every other outcome is logged.
	 *
	 * @param string $data  Forwarded to handleCustomFollow().
	 * @param array  $extra Forwarded to handleCustomFollow().
	 *
	 * @return array ['followed' => 'Y'] on success, otherwise an empty array.
	 */
	final public function doCustomFollow(string $data = "", array $extra = []) : array {
		$titleData = [];

		$onResponse = function($content, $id, closure $successCallback = NULL) use(&$titleData) {
			if(!is_array($content)) {
				log_message('error', "doCustomFollow failed (Failed request) for {$id}");
				return;
			}
			if(!array_key_exists('status_code', $content)) {
				log_message('error', "doCustomFollow failed (Missing status code?) for {$id}");
				return;
			}

			$statusCode = $content['status_code'];
			if($statusCode !== 200) {
				log_message('error', "doCustomFollow failed (Invalid status code ({$statusCode})) for {$id}");
				return;
			}

			//Without a callable success callback a 200 alone is enough.
			if(is_callable($successCallback) && !$successCallback($content['body'])) {
				log_message('error', "doCustomFollow failed (Invalid response?) for {$id}");
				return;
			}

			$titleData['followed'] = 'Y';

			log_message('info', "doCustomFollow succeeded for {$id}");
		};

		$this->handleCustomFollow($onResponse, $data, $extra);

		return $titleData;
	}
227
	/**
	 * Called by doCustomFollow() with a callback that expects ($content, $id, $successCallback).
	 * This base implementation does nothing, so doCustomFollow() returns an empty array unless overridden.
	 * NOTE(review): presumably overridden by sites that support following - confirm against implementations.
	 */
	public function handleCustomFollow(callable $callback, string $data = "", array $extra = []) {}
228
	/**
	 * No-op in this base class.
	 * NOTE(review): presumably a hook for site-specific update logic - confirm against implementations.
	 */
	public function doCustomUpdate() {}
229
	/**
	 * No-op in this base class (returns nothing).
	 * NOTE(review): presumably overridden to compare two chapter strings, possibly via doCustomCheckCompare() - confirm against implementations.
	 */
	public function doCustomCheck(string $oldChapter, string $newChapter) {}
230
	/**
	 * Decides whether $newChapterSegments represents a newer chapter than $oldChapterSegments.
	 *
	 * Segments are strings with a one-character prefix, e.g. ['v2', 'c15'] or just ['c15'];
	 * a lone segment is treated as a chapter and gets volume 'v0' prepended. The volume
	 * placeholders TBD, TBA, NA and LMT count as volume 0.
	 *
	 * The result is TRUE when either
	 *  - the new chapter number is higher than the old one, or
	 *  - the new volume is higher and the chapter number did not go down, or
	 *  - the new volume is higher and both chapter numbers are below 10 (sites which restart
	 *    chapter numbering on every volume).
	 *
	 * @param array $oldChapterSegments
	 * @param array $newChapterSegments
	 *
	 * @return bool FALSE as well when the segment counts differ or either list is empty.
	 */
	final public function doCustomCheckCompare(array $oldChapterSegments, array $newChapterSegments) : bool {
		//FIXME: Make this more generic when we have more site support for it. MangaFox and Batoto have similar chapter formats.

		//Nothing to compare; this also keeps NULL away from substr(), which is a TypeError under strict_types.
		if(empty($oldChapterSegments) || empty($newChapterSegments)) {
			return FALSE;
		}

		//Make sure we have a volume element
		if(count($oldChapterSegments) === 1) array_unshift($oldChapterSegments, 'v0');
		if(count($newChapterSegments) === 1) array_unshift($newChapterSegments, 'v0');

		$segmentCount = count($newChapterSegments);
		if($segmentCount !== count($oldChapterSegments)) {
			return FALSE;
		}

		//Strips the one-character prefix and converts the remainder to a number.
		$toNumber = static function($segment) : float {
			return floatval(substr((string) $segment, 1));
		};
		//Forcing volume to 0 as TBD might not be the latest (although it can be, but that is covered by other checks)
		$toVolume = static function($segment) use($toNumber) : float {
			$volume = (string) substr((string) $segment, 1);
			return in_array($volume, ['TBD', 'TBA', 'NA', 'LMT'], TRUE) ? 0.0 : $toNumber($segment);
		};

		//Only a two-segment list carries a volume; for any other count the first segment is read as the chapter.
		$oldVolume = 0.0;
		$newVolume = 0.0;
		if($segmentCount === 2) {
			$oldVolume = $toVolume(array_shift($oldChapterSegments));
			$newVolume = $toVolume(array_shift($newChapterSegments));
		}
		$oldChapter = $toNumber(array_shift($oldChapterSegments));
		$newChapter = $toNumber(array_shift($newChapterSegments));

		//NOTE: We only need to check against the new chapter here, as that is what is used for confirming update.
		$chapterAdvanced = $newChapter > $oldChapter;
		$volumeAdvanced  = $newVolume > $oldVolume;

		//This is pretty much just to match a one-off case where the site doesn't properly increment chapter numbers across volumes, and instead does something like: v1/c1..v1/c5, v2/c1..v1/c5 (and so on).
		$numberingRestarted = $oldChapter < 10 && $newChapter < 10;

		return $chapterAdvanced || ($volumeAdvanced && ($numberingRestarted || $newChapter >= $oldChapter));
	}
282
}
283