1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace Colligator\Scrapers; |
4
|
|
|
|
5
|
|
|
use Symfony\Component\DomCrawler\Crawler; |
6
|
|
|
|
7
|
|
|
/**
 * Scraper for pages whose URL contains "bibsys.no".
 *
 * Reads the direct children of the `#accordion` element, pairs known section
 * headings with the text that follows them, and returns the most preferred
 * non-empty section as the result.
 */
class BsScraper extends Scraper implements ScraperInterface
{
    /**
     * Tell whether this scraper is responsible for the given URL.
     *
     * @param string $url URL to test.
     *
     * @return bool True when "bibsys.no" occurs anywhere in the URL.
     */
    public function recognizes($url)
    {
        // strpos() returns the match offset, which is 0 (falsy) for a match at
        // the very start of the string, so compare against false explicitly.
        return strpos($url, 'bibsys.no') !== false;
    }

    /**
     * Pair each known section heading with the text element that follows it.
     *
     * @param string[] $texts Text of each element, in document order.
     *
     * @return array<string, string> All known headings, in order of preference,
     *                               mapped to their text ('' when not found).
     */
    public function getSections($texts)
    {
        // Section list ordered by preference
        $sections = [
            'Beskrivelse fra forlaget (lang)'  => '',
            'Publisher\'s description (full)'  => '',
            'Beskrivelse fra Forlagssentralen' => '',
            'Beskrivelse fra forlaget (kort)'  => '',
            'Publisher\'s description (brief)' => '',
            'Innholdsfortegnelse'              => '',
        ];

        // Heading seen on the previous iteration whose body is still pending.
        $next = '';
        foreach ($texts as $t) {
            if ($next !== '') {
                // The element right after a heading is taken as its content.
                $sections[$next] = $t;
                $next = '';
            }
            if (isset($sections[$t])) {
                // It's a section heading
                $next = $t;
            }
        }

        return $sections;
    }

    /**
     * Pick the text (and its source, if any) of the most preferred section.
     *
     * A section is split on the first "©": the part before it is the text and
     * the part after it is the source. The first section that has both a
     * non-empty text and a "©" wins outright. If no section has a "©", the
     * first non-empty section is returned with an empty source.
     *
     * @param array<string, string> $sections Sections ordered by preference.
     *
     * @return string[] Two-element list: [text, source]; both '' if nothing found.
     */
    public function getFirstNonEmpty($sections)
    {
        $fallback = '';
        foreach ($sections as $section) {
            $parts = explode('©', $section);
            if (empty($parts[0])) {
                continue;
            }
            if (count($parts) > 1) {
                // Text with an attributed source: best possible match, stop here.
                return [$parts[0], $parts[1]];
            }
            if ($fallback === '') {
                // Remember only the first (most preferred) source-less text,
                // so later, less preferred sections cannot overwrite it.
                $fallback = $parts[0];
            }
        }

        return [$fallback, ''];
    }

    /**
     * Extract the preferred description from a crawled page.
     *
     * @param Crawler $crawler Crawler positioned on the page to scrape.
     *
     * @return mixed Whatever returnResult() produces for the chosen text and
     *               source (returnResult() is not defined in this class;
     *               presumably inherited from Scraper).
     */
    public function scrape(Crawler $crawler)
    {
        // Collect the text of every direct child of #accordion, in document order.
        $texts = $crawler->filter('#accordion > *')->each(function (Crawler $node) {
            return $node->text();
        });

        $sections = $this->getSections($texts);
        list($text, $source) = $this->getFirstNonEmpty($sections);

        return $this->returnResult($text, $source);
    }
}
72
|
|
|
|