Completed
Push — master ( 922be9...285e65 )
by Nick
41s queued 31s
created

Wikipedia   A

Complexity

Total Complexity 24

Size/Duplication

Total Lines 182
Duplicated Lines 0 %

Test Coverage

Coverage 57.6%

Importance

Changes 0
Metric Value
eloc 93
dl 0
loc 182
ccs 53
cts 92
cp 0.576
rs 10
c 0
b 0
f 0
wmc 24

2 Methods

Rating   Name   Duplication   Size   Complexity  
C antiTagInTag() 0 59 14
C wikipedize() 0 109 10
1
<?php
2
3
namespace MySociety\TheyWorkForYou\Utility;
4
5
/**
6
 * Utility functions related to Wikipedia.
7
 *
8
 * Based on wikiproxy.php v0.5 04-10-2004 stefan (wholivesat) whitelabel.org
9
 * scripts/wikipedia-update updates the database of titles each week.
10
 *
11
 * The bits of code I didn't borrow from elsewhere (and I've credited where) are
12
 * licenced under the GPL. Do with it what you will, but this is my first php
13
 * and my first code for 7 years, so I'd appreciate feedback and suggestions via
14
 * comments on my blog:
15
 * http://www.whitelabel.org/archives/002248.html
16
 * (especially regex optimisations for lines 64 and 65 - ideally a way of making
17
 * it NOT match if we're within an IMG tag, because then I could drop the
18
 * antiTagInTag stuff)
19
 */
20
21
class Wikipedia
{
    /**
     * Wrap phrases in the text that are known Wikipedia titles in links.
     *
     * Candidate phrases (runs of capitalised words and acronyms) are pulled
     * out of the text, filtered against the `titles` table (minus anything in
     * `titles_ignored`), and the survivors are linked to en.wikipedia.org.
     * Longer titles are handled first, a title contained in an already
     * handled phrase is skipped, and at most one occurrence per paragraph is
     * linked. Text already inside an <a>...</a> element is left alone.
     *
     * @param string|array $source A single string, or an array of paragraphs.
     *                             Arrays are joined with '|||' for matching
     *                             and split on '|||' again afterwards.
     *
     * @return string|array The linked text, in the same shape as the input.
     */
    public static function wikipedize($source)
    {
        global $format_date_months;
        $months = join('|', array_slice($format_date_months, 1));

        $was_array = is_array($source);
        if ($was_array) {
            $source = join('|||', $source);
        }

        $titles = self::knownTitles(self::candidatePhrases($source, $months));

        # Sort into order, largest first. The comparator must return an
        # integer; returning a boolean is deprecated and sorts unreliably.
        usort($titles, function ($a, $b) {
            return strlen($b) - strlen($a);
        });

        $source = explode('|||', $source);
        $matched = array();

        foreach ($titles as $wikistring) {
            $phrase = str_replace('_', ' ', $wikistring);

            # See if already matched a string this one is contained within
            foreach ($matched as $got) {
                if (strpos($got, $phrase) !== false) {
                    continue 2;
                }
            }

            # Build the pattern from the readable phrase, not the URL-encoded
            # title, so apostrophes still match the text; quote it so regex
            # metacharacters are taken literally. Spaces become [ ] because
            # the pattern below runs in extended (x) mode.
            $phrase_re = str_replace(' ', '[ ]', preg_quote($phrase));
            $wikistring = str_replace("'", "%27", $wikistring);

            twfy_debug("WIKIPEDIA", "Matched '$phrase'");

            # 1 means only replace one match for phrase per paragraph
            $linked = preg_replace('{
            \b(' . $phrase_re . ')\b # Match the phrase itself
            (?!                      # Match as long as the following does *not* apply:
                (?:                  #   Match, possessively, as many strings of:
                 [^<]+               #     non-"<" characters,
                 |                   #     or
                 <(?!/?a\b)          #     a "<" as long as it is not followed by "a"
                )*+                  #     as a word on its own (ie. "<a " or "</a>")
                </a>                 #   Match a "</a>"
            )                        # ie. match as long as we do not find a </a> and have not found a <a>
            }x', "<a href=\"https://en.wikipedia.org/wiki/$wikistring\">\\1</a>", $source, 1);

            # preg_replace() returns null on a PCRE error; keep the text we
            # already have rather than losing it.
            if ($linked !== null) {
                $source = $linked;
            }
            $matched[] = $phrase;
        }

        if (!$was_array) {
            $source = join('|||', $source);
        }

        return $source;
    }

    /**
     * Extract candidate proper-noun phrases and acronyms from the text.
     *
     * @param string $text   Text to scan.
     * @param string $months Month names joined with '|', used to discard
     *                       phrases that are just "<month> <number>".
     *
     * @return array Unique phrases with spaces replaced by underscores.
     *               Keys are not sequential.
     */
    private static function candidatePhrases($text, $months)
    {
        # Set up various variables
        $capsword = "[A-Z][a-zA-Z'0-9,-]*"; # not starting with number, as catches too much
        $fillerwords = "of|and|in|on|under|the|for";
        $middlewordsre = "(?:\s*(?:$capsword|$fillerwords))*";
        $startwordre = "(?:$capsword)"; # and, of etc. can't appear at ends
        $endwordre = "(?:$capsword|[0-9]+)"; # We might want to catch e.g. a year at the end
        $notfiller = "(?:(?!Of|And|In|On|Under|The|For)$capsword)";

        # Match either "Two Endwords" or "Endword and Some Middle Words"
        $phrasewithfiller = "$startwordre$middlewordsre\s*$endwordre";
        $greedyproperre = "/\b$phrasewithfiller\b/ms";

        # And do a match ignoring things starting with filler words
        $greedynofillerstartre = "/\b$notfiller$middlewordsre\s*$endwordre\b/ms";

        # Match without filler words (so if you have a phrase like
        # "Amnesty International and Human Rights Watch" you also get both parts
        # separately "Amnesty International" and "Human Rights Watch")
        $frugalproperre = "/\b$startwordre(?:\s*$endwordre)+\b/ms";

        # And do a greedy without the first word of a sentence
        $greedynotfirst = "/(?:\.|\?|!)\s+\S+\s+($phrasewithfiller)\b/ms";

        # And one for proper nouns in the possessive
        $greedypossessive = "/\b($phrasewithfiller)'s\b/ms";

        preg_match_all($greedyproperre, $text, $greedy);
        preg_match_all($frugalproperre, $text, $frugal);
        preg_match_all($greedynotfirst, $text, $notfirst);
        preg_match_all($greedypossessive, $text, $possessive);
        preg_match_all($greedynofillerstartre, $text, $nofillerstart);

        # Acronyms (two or more capital letters)
        preg_match_all("/\b[A-Z]{2,}/ms", $text, $acronyms);

        # We don't want no steenking duplicates
        $phrases = array_unique(array_merge(
            $greedy[0],
            $frugal[0],
            $notfirst[1],
            $possessive[1],
            $nofillerstart[0],
            $acronyms[0]
        ));

        foreach ($phrases as $i => $phrase) {
            # Ignore months: drop "<month> <number>" entirely so it is never
            # looked up as a title.
            if (preg_match("#^($months)\s+\d+$#", $phrase)) {
                unset($phrases[$i]);
                continue;
            }
            $phrases[$i] = str_replace(' ', '_', trim($phrase));
        }

        return $phrases;
    }

    /**
     * Whittle candidate phrases down to those that are real titles.
     *
     * Looks the phrases up in the `titles` table and excludes any title also
     * present in `titles_ignored`.
     *
     * @param array $phrases Candidate phrases in underscore form.
     *
     * @return array List of matching titles as stored in the database.
     */
    private static function knownTitles(array $phrases)
    {
        // Assemble the phrases into a parameter array
        $params = array();
        foreach ($phrases as $i => $phrase) {
            $params[':phrase' . $i] = $phrase;
        }

        // An empty IN () list is not valid SQL, so skip the query entirely.
        if (!$params) {
            return array();
        }

        $db = new \ParlDB;
        $q = $db->query("SELECT titles.title FROM titles LEFT JOIN titles_ignored ON titles.title=titles_ignored.title WHERE titles.title IN (" . join(',', array_keys($params)) . ") AND titles_ignored.title IS NULL", $params);

        $titles = array();
        foreach ($q as $row) {
            $titles[] = $row['title'];
        }

        return $titles;
    }

    /**
     * Anti Tag-In-Tag
     *
     * Finds each tag in the content, treating any "<...>" nested inside it
     * as part of that tag, and rewrites the tag's inner text through
     * format_to_output() when that function exists, or strip_tags()
     * otherwise. Content with a "<" that is never closed is returned
     * unchanged.
     *
     * Credit: isaac schlueter (lifted from http://uk2.php.net/strip-tags)
     *
     * @todo Remove this, it seems to be redundant.
     *
     * @param string $content Markup to clean.
     * @param string $format  Passed through to format_to_output() if it exists.
     *
     * @return string The content with each tag's inner text rewritten.
     */
    public static function antiTagInTag($content = '', $format = 'htmlhead')
    {
        $length = strlen($content);
        $hasFormatter = function_exists('format_to_output');
        $tags = array();
        $newtags = array();
        $tagend = -1;

        for (
            $tagstart = strpos($content, '<', $tagend + 1);
            $tagstart !== false && $tagstart < $length;
            $tagstart = strpos($content, '<', $tagend)
        ) {
            // got the start of a tag.  Now find the proper end!
            $walker = $tagstart + 1;

            // A "<" as the very last character can never be closed. Without
            // this guard the scan below would not run, $tagend would land
            // back on this same "<" and the outer loop would never finish.
            if ($walker >= $length) {
                return $content;
            }

            $open = 1;
            while ($open != 0 && $walker < $length) {
                $nextopen = strpos($content, '<', $walker);
                $nextclose = strpos($content, '>', $walker);

                if ($nextclose === false) {
                    // ERROR! Open waka without close waka!
                    return $content;
                }

                if ($nextopen === false || $nextopen > $nextclose) {
                    // No more opens, but there was a close; or, a close
                    // happens before the next open.
                    $open--;
                    $walker = $nextclose + 1;
                } else {
                    // an open before the next close
                    $open++;
                    $walker = $nextopen + 1;
                }
            }

            // $walker is one past the last ">" consumed; the tag's inner
            // text sits strictly between the opening "<" and that ">".
            $tagend = $walker - 1;
            $tag = substr($content, $tagstart + 1, $tagend - $tagstart - 1);

            $tags[] = '<' . $tag . '>';
            $newtags[] = '<' . ($hasFormatter ? format_to_output($tag, $format) : strip_tags($tag)) . '>';
        }

        if ($tags) {
            $content = str_replace($tags, $newtags, $content);
        }

        return $content;
    }
}
207