Completed
Push — master ( 96d573...f9f049 )
by Ehsan
07:54
created

GreekStemmer   C

Complexity

Total Complexity 63

Size/Duplication

Total Lines 343
Duplicated Lines 0 %

Coupling/Cohesion

Components 1
Dependencies 1

Importance

Changes 0
Metric Value
dl 0
loc 343
rs 5.8893
c 0
b 0
f 0
wmc 63
lcom 1
cbo 1

1 Method

Rating   Name   Duplication   Size   Complexity  
F stem() 0 294 63

How to fix   Complexity   

Complex Class

Complex classes like GreekStemmer often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes. You can also have a look at the cohesion graph to spot any unconnected or weakly connected components.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

While breaking up the class, it is a good idea to analyze how other classes use GreekStemmer, and based on these observations, apply Extract Interface, too.

1
<?php
2
3
namespace NlpTools\Stemmers;
4
5
/**
 * This stemmer is an implementation of the stemmer described by G. Ntais
 * in his Master Thesis.
 * http://people.dsv.su.se/~hercules/papers/Ntais_greek_stemmer_thesis_final.pdf
 *
 * It was first ported to php by P. Kyriakakis.
 * This stemmer expects lower case characters and not upper case.
 *
 * NOTE(review): every suffix table below spells a word-final sigma as "σ"
 * (never "ς"), so callers presumably have to normalise it beforehand — confirm.
 */
class GreekStemmer extends Stemmer
{
    /**
     * Step 1 lookup table: irregular suffix => replacement appended to the
     * part of the word that precedes the suffix.
     */
    protected static $step1list = array(
        "φαγια"=>"φα",
        "φαγιου"=>"φα",
        "φαγιων"=>"φα",
        "σκαγια"=>"σκα",
        "σκαγιου"=>"σκα",
        "σκαγιων"=>"σκα",
        "ολογιου"=>"ολο",
        "ολογια"=>"ολο",
        "ολογιων"=>"ολο",
        "σογιου"=>"σο",
        "σογια"=>"σο",
        "σογιων"=>"σο",
        "τατογια"=>"τατο",
        "τατογιου"=>"τατο",
        "τατογιων"=>"τατο",
        "κρεασ"=>"κρε",
        "κρεατοσ"=>"κρε",
        "κρεατα"=>"κρε",
        "κρεατων"=>"κρε",
        "περασ"=>"περ",
        "περατοσ"=>"περ",
        "περατα"=>"περ",
        "περατων"=>"περ",
        "τερασ"=>"τερ",
        "τερατοσ"=>"τερ",
        "τερατα"=>"τερ",
        "τερατων"=>"τερ",
        "φωσ"=>"φω",
        "φωτοσ"=>"φω",
        "φωτα"=>"φω",
        "φωτων"=>"φω",
        "καθεστωσ"=>"καθεστ",
        "καθεστωτοσ"=>"καθεστ",
        "καθεστωτα"=>"καθεστ",
        "καθεστωτων"=>"καθεστ",
        "γεγονοσ"=>"γεγον",
        "γεγονοτοσ"=>"γεγον",
        "γεγονοτα"=>"γεγον",
        "γεγονοτων"=>"γεγον"
    );

    /** Matches any key of $step1list at the end of the word; group 1 is the prefix, group 2 the key. */
    protected static $step1regexp="/(.*)(φαγια|φαγιου|φαγιων|σκαγια|σκαγιου|σκαγιων|ολογιου|ολογια|ολογιων|σογιου|σογια|σογιων|τατογια|τατογιου|τατογιων|κρεασ|κρεατοσ|κρεατα|κρεατων|περασ|περατοσ|περατα|περατων|τερασ|τερατοσ|τερατα|τερατων|φωσ|φωτοσ|φωτα|φωτων|καθεστωσ|καθεστωτοσ|καθεστωτα|καθεστωτων|γεγονοσ|γεγονοτοσ|γεγονοτα|γεγονοτων)$/u";

    /** Character class of the vowels, including υ. */
    protected static $v = "[αεηιουω]";

    /** Same character class as $v but without υ. */
    protected static $v2 = "[αεηιοω]";

    /**
     * Reduce a word to its stem by applying the suffix-stripping steps in order.
     *
     * Words shorter than four characters are returned untouched. Each step
     * strips a suffix from the end of the current word and, for the stems
     * listed in that step's exception pattern, appends a short ending back.
     *
     * @param string $w The word to stem.
     *
     * @return string The stemmed word.
     */
    public function stem($w)
    {
        // Stays true only while none of the steps 1, 2d, 3, 4 or 5 has fired;
        // the generic suffix removal of step 6 runs only in that case.
        // Steps 2a-2c deliberately leave the flag untouched.
        $test1 = true;

        if (mb_strlen($w, "utf-8") < 4) {
            return $w;
        }

        $endsInVowel = "/".self::$v."$/u";
        $endsInVowel2 = "/".self::$v2."$/u";

        // step 1: irregular suffixes replaced through the lookup table
        if (preg_match(self::$step1regexp, $w, $fp)) {
            $w = $fp[1].self::$step1list[$fp[2]];
            $test1 = false;
        }

        // step 2
        if (preg_match("/^(.+?)(αδεσ|αδων)$/u", $w, $fp)) { // step 2a
            $w = $fp[1];
            // here the list is inverted: "αδ" is restored unless the stem ends in one of these
            if (!preg_match("/(οκ|μαμ|μαν|μπαμπ|πατερ|γιαγι|νταντ|κυρ|θει|πεθερ)$/u", $w)) {
                $w .= "αδ";
            }
        } elseif (preg_match("/^(.+?)(εδεσ|εδων)$/u", $w, $fp)) { // step 2b
            $w = $fp[1];
            if (preg_match("/(οπ|ιπ|εμπ|υπ|γηπ|δαπ|κρασπ|μιλ)$/u", $w)) {
                $w .= "εδ";
            }
        } elseif (preg_match("/^(.+?)(ουδεσ|ουδων)$/u", $w, $fp)) { // step 2c
            $w = $fp[1];
            if (preg_match("/(αρκ|καλιακ|πεταλ|λιχ|πλεξ|σκ|σ|φλ|φρ|βελ|λουλ|χν|σπ|τραγ|φε)$/u", $w)) {
                $w .= "ουδ";
            }
        } elseif (preg_match("/^(.+?)(εωσ|εων)$/u", $w, $fp)) { // step 2d
            $w = $fp[1];
            $test1 = false;
            if (preg_match("/^(θ|δ|ελ|γαλ|ν|π|ιδ|παρ)$/u", $w)) {
                $w .= "ε";
            }
        }

        // step 3
        if (preg_match("/^(.+?)(ια|ιου|ιων)$/u", $w, $fp)) {
            $w = $fp[1];
            $test1 = false;
            if (preg_match($endsInVowel, $w)) {
                $w .= "ι";
            }
        }

        // step 4
        if (preg_match("/^(.+?)(ικα|ικο|ικου|ικων)$/u", $w, $fp)) {
            $w = $fp[1];
            $test1 = false;
            $exept5 = "/^(αλ|αδ|ενδ|αμαν|αμμοχαλ|ηθ|ανηθ|αντιδ|φυσ|βρωμ|γερ|εξωδ|καλπ|καλλιν|καταδ|μουλ|μπαν|μπαγιατ|μπολ|μποσ|νιτ|ξικ|συνομηλ|πετσ|πιτσ|πικαντ|πλιατσ|ποστελν|πρωτοδ|σερτ|συναδ|τσαμ|υποδ|φιλον|φυλοδ|χασ)$/u";
            if (preg_match($endsInVowel, $w) || preg_match($exept5, $w)) {
                $w .= "ικ";
            }
        }

        // step 5: verb endings. Only the first matching branch fires, and
        // each longer suffix is tested before the shorter suffix it ends with.
        if ($w === "αγαμε") {
            return "αγαμ";
        }

        if (preg_match("/^(.+?)(αγαμε|ησαμε|ουσαμε|ηκαμε|ηθηκαμε)$/u", $w, $fp)) { // step 5a
            $w = $fp[1];
            $test1 = false;
        } elseif (preg_match("/^(.+?)(αμε)$/u", $w, $fp)) {
            $w = $fp[1];
            $test1 = false;
            if (preg_match("/^(αναπ|αποθ|αποκ|αποστ|βουβ|ξεθ|ουλ|πεθ|πικρ|ποτ|σιχ|χ)$/u", $w)) {
                $w .= "αμ";
            }
        } elseif (preg_match("/^(.+?)(αγανε|ησανε|ουσανε|ιοντανε|ιοτανε|ιουντανε|οντανε|οτανε|ουντανε|ηκανε|ηθηκανε)$/u", $w, $fp)) { // step 5b
            $w = $fp[1];
            $test1 = false;
            if (preg_match("/^(τρ|τσ)$/u", $w)) {
                $w .= "αγαν";
            }
        } elseif (preg_match("/^(.+?)(ανε)$/u", $w, $fp)) {
            $w = $fp[1];
            $test1 = false;
            $exept7 = "/^(βετερ|βουλκ|βραχμ|γ|δραδουμ|θ|καλπουζ|καστελ|κορμορ|λαοπλ|μωαμεθ|μ|μουσουλμ|ν|ουλ|π|πελεκ|πλ|πολισ|πορτολ|σαρακατσ|σουλτ|τσαρλατ|ορφ|τσιγγ|τσοπ|φωτοστεφ|χ|ψυχοπλ|αγ|ορφ|γαλ|γερ|δεκ|διπλ|αμερικαν|ουρ|πιθ|πουριτ|σ|ζωντ|ικ|καστ|κοπ|λιχ|λουθηρ|μαιντ|μελ|σιγ|σπ|στεγ|τραγ|τσαγ|φ|ερ|αδαπ|αθιγγ|αμηχ|ανικ|ανοργ|απηγ|απιθ|ατσιγγ|βασ|βασκ|βαθυγαλ|βιομηχ|βραχυκ|διατ|διαφ|ενοργ|θυσ|καπνοβιομηχ|καταγαλ|κλιβ|κοιλαρφ|λιβ|μεγλοβιομηχ|μικροβιομηχ|νταβ|ξηροκλιβ|ολιγοδαμ|ολογαλ|πενταρφ|περηφ|περιτρ|πλατ|πολυδαπ|πολυμηχ|στεφ|ταβ|τετ|υπερηφ|υποκοπ|χαμηλοδαπ|ψηλοταβ)$/u";
            if (preg_match($endsInVowel2, $w) || preg_match($exept7, $w)) {
                $w .= "αν";
            }
        } elseif (preg_match("/^(.+?)(ησετε)$/u", $w, $fp)) { // step 5c
            $w = $fp[1];
            $test1 = false;
        } elseif (preg_match("/^(.+?)(ετε)$/u", $w, $fp)) {
            $w = $fp[1];
            $test1 = false;

            // The vowel test must use the delimited, end-anchored pattern.
            // A bare "[αεηιοω]" is parsed by PCRE with "[" and "]" as the
            // delimiters, i.e. as the literal string "αεηιοω", so the
            // "stem ends in a vowel" rule would practically never fire.
            $exept8 = "/(οδ|αιρ|φορ|ταθ|διαθ|σχ|ενδ|ευρ|τιθ|υπερθ|ραθ|ενθ|ροθ|σθ|πυρ|αιν|συνδ|συν|συνθ|χωρ|πον|βρ|καθ|ευθ|εκθ|νετ|ρον|αρκ|βαρ|βολ|ωφελ)$/u";
            $exept9 = "/^(αβαρ|βεν|εναρ|αβρ|αδ|αθ|αν|απλ|βαρον|ντρ|σκ|κοπ|μπορ|νιφ|παγ|παρακαλ|σερπ|σκελ|συρφ|τοκ|υ|δ|εμ|θαρρ|θ)$/u";
            if (preg_match($endsInVowel2, $w)
                || preg_match($exept8, $w)
                || preg_match($exept9, $w)) {
                $w .= "ετ";
            }
        } elseif (preg_match("/^(.+?)(οντασ|ωντασ)$/u", $w, $fp)) { // step 5d
            $w = $fp[1];
            $test1 = false;

            // Two independent checks (not if/elseif); the second one sees
            // the result of the first.
            if (preg_match("/^(αρχ)$/u", $w)) {
                $w .= "οντ";
            }
            if (preg_match("/(κρε)$/u", $w)) {
                $w .= "ωντ";
            }
        } elseif (preg_match("/^(.+?)(ομαστε|ιομαστε)$/u", $w, $fp)) { // step 5e
            $w = $fp[1];
            $test1 = false;
            if (preg_match("/^(ον)$/u", $w)) {
                $w .= "ομαστ";
            }
        } elseif (preg_match("/^(.+?)(ιεστε)$/u", $w, $fp)) { // step 5f
            $w = $fp[1];
            $test1 = false;
            if (preg_match("/^(π|απ|συμπ|ασυμπ|ακαταπ|αμεταμφ)$/u", $w)) {
                $w .= "ιεστ";
            }
        } elseif (preg_match("/^(.+?)(εστε)$/u", $w, $fp)) {
            $w = $fp[1];
            $test1 = false;
            if (preg_match("/^(αλ|αρ|εκτελ|ζ|μ|ξ|παρακαλ|αρ|προ|νισ)$/u", $w)) {
                $w .= "εστ";
            }
        } elseif (preg_match("/^(.+?)(ηθηκα|ηθηκεσ|ηθηκε)$/u", $w, $fp)) { // step 5g
            $w = $fp[1];
            $test1 = false;
        } elseif (preg_match("/^(.+?)(ηκα|ηκεσ|ηκε)$/u", $w, $fp)) {
            $w = $fp[1];
            $test1 = false;

            $exept13 = "/(σκωλ|σκουλ|ναρθ|σφ|οθ|πιθ)$/u";
            // The stem is never empty here (the suffix pattern captures at
            // least one character), so no empty alternative is needed.
            $exept14 = "/^(διαθ|θ|παρακαταθ|προσθ|συνθ)$/u";
            if (preg_match($exept13, $w) || preg_match($exept14, $w)) {
                $w .= "ηκ";
            }
        } elseif (preg_match("/^(.+?)(ουσα|ουσεσ|ουσε)$/u", $w, $fp)) { // step 5h
            $w = $fp[1];
            $test1 = false;

            $exept15 = "/^(φαρμακ|χαδ|αγκ|αναρρ|βρομ|εκλιπ|λαμπιδ|λεχ|μ|πατ|ρ|λ|μεδ|μεσαζ|υποτειν|αμ|αιθ|ανηκ|δεσποζ|ενδιαφερ|δε|δευτερευ|καθαρευ|πλε|τσα)$/u";
            $exept16 = "/(ποδαρ|βλεπ|πανταχ|φρυδ|μαντιλ|μαλλ|κυματ|λαχ|ληγ|φαγ|ομ|πρωτ)$/u";
            if (preg_match($exept15, $w) || preg_match($exept16, $w)) {
                $w .= "ουσ";
            }
        } elseif (preg_match("/^(.+?)(αγα|αγεσ|αγε)$/u", $w, $fp)) { // step 5i
            $w = $fp[1];
            $test1 = false;

            $exept17 = "/^(ψοφ|ναυλοχ)$/u";
            $exept20 = "/(κολλ)$/u";
            $exept18 = "/^(αβαστ|πολυφ|αδηφ|παμφ|ρ|ασπ|αφ|αμαλ|αμαλλι|ανυστ|απερ|ασπαρ|αχαρ|δερβεν|δροσοπ|ξεφ|νεοπ|νομοτ|ολοπ|ομοτ|προστ|προσωποπ|συμπ|συντ|τ|υποτ|χαρ|αειπ|αιμοστ|ανυπ|αποτ|αρτιπ|διατ|εν|επιτ|κροκαλοπ|σιδηροπ|λ|ναυ|ουλαμ|ουρ|π|τρ|μ)$/u";
            $exept19 = "/(οφ|πελ|χορτ|λλ|σφ|ρπ|φρ|πρ|λοχ|σμην)$/u";

            // "αγ" is restored for the listed stems, except the ones
            // explicitly vetoed by $exept17 / $exept20.
            if ((preg_match($exept18, $w) || preg_match($exept19, $w))
                && !(preg_match($exept17, $w) || preg_match($exept20, $w))) {
                $w .= "αγ";
            }
        } elseif (preg_match("/^(.+?)(ησε|ησου|ησα)$/u", $w, $fp)) { // step 5j
            $w = $fp[1];
            $test1 = false;
            if (preg_match("/^(ν|χερσον|δωδεκαν|ερημον|μεγαλον|επταν)$/u", $w)) {
                $w .= "ησ";
            }
        } elseif (preg_match("/^(.+?)(ηστε)$/u", $w, $fp)) { // step 5k
            $w = $fp[1];
            $test1 = false;
            if (preg_match("/^(ασβ|σβ|αχρ|χρ|απλ|αειμν|δυσχρ|ευχρ|κοινοχρ|παλιμψ)$/u", $w)) {
                $w .= "ηστ";
            }
        } elseif (preg_match("/^(.+?)(ουνε|ησουνε|ηθουνε)$/u", $w, $fp)) { // step 5l
            $w = $fp[1];
            $test1 = false;
            if (preg_match("/^(ν|ρ|σπι|στραβομουτσ|κακομουτσ|εξων)$/u", $w)) {
                $w .= "ουν";
            }
        } elseif (preg_match("/^(.+?)(ουμε|ησουμε|ηθουμε)$/u", $w, $fp)) { // step 5l
            $w = $fp[1];
            $test1 = false;
            if (preg_match("/^(παρασουσ|φ|χ|ωριοπλ|αζ|αλλοσουσ|ασουσ)$/u", $w)) {
                $w .= "ουμ";
            }
        }

        // step 6
        if (preg_match("/^(.+?)(ματα|ματων|ματοσ)$/u", $w, $fp)) {
            $w = $fp[1]."μα";
        }
        // generic endings are stripped only if no earlier flagged step fired
        $genericSuffix = "/^(.+?)(α|αγατε|αγαν|αει|αμαι|αν|ασ|ασαι|αται|αω|ε|ει|εισ|ειτε|εσαι|εσ|εται|ι|ιεμαι|ιεμαστε|ιεται|ιεσαι|ιεσαστε|ιομασταν|ιομουν|ιομουνα|ιονταν|ιοντουσαν|ιοσασταν|ιοσαστε|ιοσουν|ιοσουνα|ιοταν|ιουμα|ιουμαστε|ιουνται|ιουνταν|η|ηδεσ|ηδων|ηθει|ηθεισ|ηθειτε|ηθηκατε|ηθηκαν|ηθουν|ηθω|ηκατε|ηκαν|ησ|ησαν|ησατε|ησει|ησεσ|ησουν|ησω|ο|οι|ομαι|ομασταν|ομουν|ομουνα|ονται|ονταν|οντουσαν|οσ|οσασταν|οσαστε|οσουν|οσουνα|οταν|ου|ουμαι|ουμαστε|ουν|ουνται|ουνταν|ουσ|ουσαν|ουσατε|υ|υσ|ω|ων)$/u";
        if ($test1 && preg_match($genericSuffix, $w, $fp)) {
            $w = $fp[1];
        }

        // step 7: comparative / superlative endings
        if (preg_match("/^(.+?)(εστερ|εστατ|οτερ|οτατ|υτερ|υτατ|ωτερ|ωτατ)$/u", $w, $fp)) {
            $w = $fp[1];
        }

        return $w;
    }

}
356