Search   A
last analyzed

Complexity

Total Complexity 29

Size/Duplication

Total Lines 174
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 2

Test Coverage

Coverage 93.33%

Importance

Changes 3
Bugs 0 Features 0
Metric Value
wmc 29
c 3
b 0
f 0
lcom 1
cbo 2
dl 0
loc 174
ccs 98
cts 105
cp 0.9333
rs 10

10 Methods

Rating   Name   Duplication   Size   Complexity  
B getSerps() 0 28 4
B makeRequest() 0 27 3
B parseResults() 0 23 4
A parseLink() 0 9 3
B gCurl() 0 38 6
A getReference() 0 6 2
A getDomainFilter() 0 6 2
A getNextSerp() 0 6 2
A guardNoCaptcha() 0 7 2
A isAGoogleWebmasterLink() 0 4 1
1
<?php
2
namespace SEOstats\Services\Google;
3
4
/**
5
 * SEOstats extension for Google data.
6
 *
7
 * @package    SEOstats
8
 * @author     Stephan Schmitz <[email protected]>
9
 * @copyright  Copyright (c) 2010 - present Stephan Schmitz
10
 * @license    http://eyecatchup.mit-license.org/  MIT License
11
 * @updated    2013/12/17
12
 */
13
14
use SEOstats\Common\SEOstatsException as E;
15
use SEOstats\SEOstats as SEOstats;
16
use SEOstats\Config as Config;
17
use SEOstats\Helper as Helper;
18
19
class Search extends SEOstats
20
{
21
22
    /**
     * Returns the results of a Google search, requesting result pages one
     * after another until a page reports no follow-up page or the page
     * needed to cover $maxResults has been fetched.
     *
     * @param     string       $query       String, containing the search query (URL-encoded here).
     * @param     int          $maxResults  Maximum number of results to cover; pages are
     *                                      requested in steps of 10 results.
     * @param     string|bool  $domain      Optional domain. When set, only results whose URL
     *                                      matches the domain filter are collected.
     * @return    array        Collected results as returned by the ArrayHandle helper.
     *                         Without $domain each entry has the keys 'url' and 'headline'
     *                         and is stored under its result position; with $domain each
     *                         entry has the keys 'position', 'url' and 'headline'.
     */
    public static function getSerps($query, $maxResults=100, $domain=false)
    {
        $q = rawurlencode($query);
        // Zero-based index of the last page needed to cover $maxResults.
        $maxPage = (int) ceil(($maxResults / 10) - 1);
        $result = new Helper\ArrayHandle();
        $domainRexExp = static::getDomainFilter($domain);
        $delay = 0;

        for ($page = 0; ; $page++) {
            $haveNextPage = static::makeRequest($page, $q, $result, $domainRexExp);

            // Stop before sleeping: there is nothing left to request.
            if (!$haveNextPage || $page === $maxPage) {
                break;
            }

            // Wait a little longer before every additional request.
            $delay += 200000;
            usleep($delay);
        }

        return $result->toArray();
    }
57
58 12
    /**
     * Fetches a single result page and feeds its result headings to parseResults().
     *
     * @param     int          $start         Zero-based index of the result page.
     * @param     string       $query         URL-encoded search query.
     * @param     object       $result        Result collector handed through to parseResults().
     * @param     string|bool  $domainRexExp  Domain filter regex, or false for no filtering.
     * @return    bool         True if the page had results and contains a 'Next' link,
     *                         false otherwise (including a failed request).
     */
    protected static function makeRequest ($start, $query, $result, $domainRexExp)
    {
        $ref = static::getReference($start, $query);
        $nextSerp = static::getNextSerp($start, $query);

        $response = static::gCurl($nextSerp, $ref);
        if (!is_string($response)) {
            // gCurl() returns false on failure; there is no page to parse.
            return false;
        }

        // NOTE(review): utf8_decode() is deprecated as of PHP 8.2; kept here
        // to preserve the existing conversion behaviour.
        $curledSerp = utf8_decode($response);

        static::guardNoCaptcha($curledSerp);

        $matches = array();
        preg_match_all('#<h3 class="?r"?>(.*?)</h3>#', $curledSerp, $matches);

        if (empty($matches[1])) {
            // No result heading (<h3 class="r">) on the current page
            return false;
        }

        // $start * 10 is the number of results on the preceding pages.
        static::parseResults($matches, $domainRexExp, $start * 10, $result);

        // A 'Next' link (id="pnnext") signals that another page can be requested.
        return (bool) preg_match('#id="?pnnext"?#', $curledSerp);
    }
85
86 12
    /**
     * Builds the reference value that makeRequest() passes to gCurl() as $ref.
     *
     * @param     int     $start  Zero-based index of the result page.
     * @param     string  $query  URL-encoded search query.
     * @return    string  'ncr' for the first page, otherwise a search path
     *                    carrying the query and the page offset.
     */
    protected static function getReference ($start, $query)
    {
        if ($start == 0) {
            return 'ncr';
        }

        return sprintf('search?q=%s&hl=en&prmd=imvns&start=%s0&sa=N', $query, $start);
    }
92
93 12
    /**
     * Builds the case-insensitive regular expression used to keep only
     * results whose URL host part contains the given domain.
     *
     * The domain is escaped with preg_quote() so that characters such as '.'
     * are matched literally and a '#' cannot terminate the pattern.
     *
     * @param     string|bool  $domain  Domain to filter for, or a falsy value for no filter.
     * @return    string|bool  The regex pattern, or false when no domain was given.
     */
    protected static function getDomainFilter ($domain)
    {
        if (!$domain) {
            return false;
        }

        return '#^(https?://)?[^/]*' . preg_quote($domain, '#') . '#i';
    }
99
100 12
    /**
     * Builds the request path of a result page.
     *
     * @param     int     $start  Zero-based index of the result page.
     * @param     string  $query  URL-encoded search query.
     * @return    string  Search path; pages after the first one carry a
     *                    'start' offset made of the page index followed by '0'.
     */
    protected static function getNextSerp ($start, $query)
    {
        if ($start == 0) {
            return sprintf('search?q=%s&filter=0', $query);
        }

        return sprintf('search?q=%s&filter=0&start=%s0', $query, $start);
    }
106
107 12
    /**
     * Aborts the search when the response refers to Google's support
     * answer 86640 (matched as "answer=86640" or "answer/86640").
     *
     * Raises an exception instead of printing and calling exit(), so that
     * the calling application decides how to react.
     *
     * @param     string  $response  Body of the fetched result page.
     * @throws    E       If the response references answer 86640.
     */
    protected static function guardNoCaptcha ($response)
    {
        if (preg_match("#answer[=|/]86640#i", $response)) {
            throw new E('Please read: https://support.google.com/websearch/answer/86640');
        }
    }
114
115 6
    /**
     * Parses the result headings of one page and adds them to $result.
     *
     * Headings for which parseLink() returns false (no usable link, or a
     * Google webmaster link) are skipped and do not count as a position.
     *
     * @param     array        $matches       preg_match_all() matches; index 1 holds the heading contents.
     * @param     string|bool  $domainRexExp  Domain filter regex, or false for no filtering.
     * @param     int          $start         Number of results on the preceding pages.
     * @param     object       $result        Collector providing setElement() and push().
     */
    protected static function parseResults ($matches, $domainRexExp, $start, $result)
    {
        $c = 0;

        foreach ($matches[1] as $link) {
            $match = static::parseLink($link);
            if (!$match) {
                // Not a result link; reading $match[1] here would yield null entries.
                continue;
            }

            $c++;
            $resCnt = $start + $c;
            $url = $match[1];
            $headline = trim(strip_tags($match[2]));

            if (! $domainRexExp) {
                // Unfiltered: store every result under its position.
                $result->setElement($resCnt, array(
                    'url' => $url,
                    'headline' => $headline
                ));
            } elseif (preg_match($domainRexExp, $url)) {
                // Filtered: keep only matching URLs and record their position.
                $result->push(array(
                    'position' => $resCnt,
                    'url' => $url,
                    'headline' => $headline
                ));
            }
        }
    }
138
139 6
    /**
     * Extracts the first anchor from a result heading.
     *
     * @param     string      $link  HTML content of a result heading.
     * @return    array|bool  The preg_match() groups (index 1: href value,
     *                        index 2: anchor content), or false if no anchor
     *                        was found or it is a Google webmaster link.
     */
    protected static function parseLink($link)
    {
        $found = preg_match('#<a\s+[^>]*href=[\'"]?([^\'" ]+)[\'"]?[^>]*>(.*?)</a>#', $link, $match);

        if (!$found) {
            return false;
        }

        if (self::isAGoogleWebmasterLink($match[1])) {
            return false;
        }

        return $match;
    }
148
149 6
    /**
     * Tells whether a URL points to the webmasters section of www.google.com
     * (optionally below an "intl/<...>/" prefix).
     *
     * The dots of the host name are escaped so that they only match a literal '.'.
     *
     * @param     string    $url  URL to check.
     * @return    int|bool  Result of preg_match(): 1 on match, 0 otherwise, false on error.
     */
    protected static function isAGoogleWebmasterLink($url)
    {
        return preg_match('#^https?://www\.google\.com/(?:intl/.+/)?webmasters#', $url);
    }
153
154 2
    /**
     * Requests a path from the configured Google host via cURL.
     *
     * @param     string       $path       Path (and query string) appended to the Google base URL.
     * @param     string       $ref        Referer value; the base URL is used when empty.
     * @param     int|bool     $useCookie  When equal to 1, cookies are stored in and read
     *                                     from 'cookie.txt' next to this file.
     * @return    string|bool  Response body on HTTP status 200, false otherwise.
     */
    protected static function gCurl($path, $ref, $useCookie = Config\DefaultSettings::ALLOW_GOOGLE_COOKIES)
    {
        $host = 'www.google.' . Config\DefaultSettings::GOOGLE_TLD;
        $baseUrl = 'https://' . $host . '/';
        $referer = $ref == '' ? $baseUrl : $ref;

        // Prefer the user agent of the current request, if there is one.
        $ua = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36";
        if (isset($_SERVER["HTTP_USER_AGENT"]) && 0 < strlen($_SERVER["HTTP_USER_AGENT"])) {
            $ua = $_SERVER["HTTP_USER_AGENT"];
        }

        $header = array(
            'Host: ' . $host,
            'Connection: keep-alive',
            'Cache-Control: max-age=0',
            'User-Agent: ' . $ua,
            'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Referer: ' . $referer,
            'Accept-Language: ' . Config\DefaultSettings::HTTP_HEADER_ACCEPT_LANGUAGE,
            'Accept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7'
        );

        $ch = curl_init($baseUrl . $path);
        if (false === $ch) {
            return false;
        }

        // Verify the TLS certificate and host name; disabling the check would
        // allow a man-in-the-middle to tamper with the response.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_HTTPHEADER, $header);
        curl_setopt($ch, CURLOPT_USERAGENT, $ua);
        if ($useCookie == 1) {
            $cookieFile = __DIR__ . '/cookie.txt';
            curl_setopt($ch, CURLOPT_COOKIEJAR, $cookieFile);
            curl_setopt($ch, CURLOPT_COOKIEFILE, $cookieFile);
        }
        curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 30);

        $result = curl_exec($ch);
        $info = curl_getinfo($ch);
        curl_close($ch);

        // A failed transfer or any status other than 200 counts as failure.
        if (false === $result || $info['http_code'] != 200) {
            return false;
        }

        return $result;
    }
192
}
193