1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace DaveChild\TextStatistics; |
4
|
|
|
|
5
|
|
|
/**
 * Rule-based English syllable counter.
 *
 * A word is first looked up in a table of known exceptions. Otherwise its
 * known prefixes/suffixes are stripped and counted, the remaining vowel
 * groups are counted, and the total is corrected by two lists of regular
 * expressions (patterns that over-count and patterns that under-count).
 */
class Syllables
{
    // Specific common exceptions that don't follow the rule set below are
    // handled individually: array of problem words (with word as key,
    // syllable count as value).
    // Common reasons we need to override some words:
    //   - Trailing 'e' is pronounced
    //   - Portmanteaus
    public static $arrProblemWords = array(
        'abalone' => 4,
        'abare' => 3,
        'abed' => 2,
        'abruzzese' => 4,
        'abbruzzese' => 4,
        'aborigine' => 5,
        'acreage' => 3,
        'adame' => 3,
        'adieu' => 2,
        'adobe' => 3,
        'anemone' => 4,
        'apache' => 3,
        'aphrodite' => 4,
        'apostrophe' => 4,
        'ariadne' => 4,
        'cafe' => 2,
        'calliope' => 4,
        'catastrophe' => 4,
        'chile' => 2,
        'chloe' => 2,
        'circe' => 2,
        'coyote' => 3,
        'epitome' => 4,
        'forever' => 3,
        'gethsemane' => 4,
        'guacamole' => 4,
        'hyperbole' => 4,
        'jesse' => 2,
        'jukebox' => 2,
        'karate' => 3,
        'machete' => 3,
        'maybe' => 2,
        'people' => 2,
        'recipe' => 3,
        'sesame' => 3,
        'shoreline' => 2,
        'simile' => 3,
        'syncope' => 3,
        'tamale' => 3,
        'yosemite' => 4,
        'daphne' => 2,
        'eurydice' => 4,
        'euterpe' => 3,
        'hermione' => 4,
        'penelope' => 4,
        'persephone' => 4,
        'phoebe' => 2,
        'zoe' => 2,
    );

    // These syllables would be counted as two but should be one.
    // Each entry is a regex body; syllableCount() wraps it in backtick
    // delimiters before matching.
    public static $arrSubSyllables = array(
        'cia(l|$)', // glacial, acacia
        'tia',
        'cius',
        'cious',
        '[^aeiou]giu',
        '[aeiouy][^aeiouy]ion',
        'iou',
        'sia$',
        'eous$',
        '[oa]gue$',
        '.[^aeiuoycgltdb]{2,}ed$',
        '.ely$',
        // Retired rules, kept for reference:
        // '[cg]h?ed?$',
        // 'rved?$',
        // '[aeiouy][dt]es?$',
        // '^[dr]e[aeiou][^aeiou]+$', // Sorts out deal, deign etc
        // '[aeiouy]rse$', // Purse, hearse
        '^jua',
        // 'nne[ds]?$', // canadienne
        'uai', // acquainted
        'eau', // champeau
        // 'pagne[ds]?$', // champagne
        // '[aeiouy][^aeiuoytdbcgrnzs]h?e[rsd]?$',

        // The following detects words ending with a soft e ending. Don't
        // mess with it unless you absolutely have to! The following
        // is a list of words you can use to test a new version of
        // this rule (add 'r', 's' and 'd' where possible to test
        // fully):
        //   - absolve
        //   - acquiesce
        //   - audience
        //   - ache
        //   - acquire
        //   - brunelle
        //   - byrne
        //   - canadienne
        //   - coughed
        //   - curved
        //   - champagne
        //   - designate
        //   - force
        //   - lace
        //   - late
        //   - lathe
        //   - make
        //   - relayed
        //   - scrounge
        //   - side
        //   - sideline
        //   - some
        //   - wide
        //   - taste
        '[aeiouy](b|c|ch|d|dg|f|g|gh|gn|k|l|ll|lv|m|mm|n|nc|ng|nn|p|r|rc|rn|rs|rv|s|sc|sk|sl|squ|ss|st|t|th|v|y|z)e$',

        // For soft e endings with a "d". Test words:
        //   - crunched
        //   - forced
        //   - hated
        //   - sided
        //   - sidelined
        //   - unexploded
        //   - unexplored
        //   - scrounged
        //   - squelched
        '[aeiouy](b|c|ch|dg|f|g|gh|gn|k|l|lch|ll|lv|m|mm|n|nc|ng|nch|nn|p|r|rc|rn|rs|rv|s|sc|sk|sl|squ|ss|th|v|y|z)ed$',

        // For soft e endings with a "s". Test words:
        //   - absences
        //   - accomplices
        //   - acknowledges
        //   - advantages
        //   - byrnes
        //   - crunches
        //   - forces
        //   - scrounges
        //   - squelches
        '[aeiouy](b|ch|d|f|gh|gn|k|l|lch|ll|lv|m|mm|n|nch|nn|p|r|rn|rs|rv|s|sc|sk|sl|squ|ss|st|t|th|v|y)es$',
        '^busi$',
    );

    // These syllables would be counted as one but should be two.
    // Same format as $arrSubSyllables (regex bodies without delimiters).
    public static $arrAddSyllables = array(
        '([^s]|^)ia',
        'riet',
        'dien', // audience
        'iu',
        'io',
        'eo($|[b-df-hj-np-tv-z])',
        'ii',
        '[ou]a$',
        '[aeiouym]bl$',
        '[aeiou]{3}',
        '[aeiou]y[aeiou]',
        '^mc',
        'ism$',
        'asm$',
        'thm$',
        '([^aeiouy])\1l$',
        '[^l]lien',
        '^coa[dglx].',
        '[^gq]ua[^auieo]',
        'dnt$',
        'uity$',
        '[^aeiouy]ie(r|st|t)$',
        'eings?$',
        '[aeiouy]sh?e[rsd]$',
        'iell',
        'dea$',
        'real', // real, cereal
        '[^aeiou]y[ae]', // bryan, byerley
        'gean$', // aegean
        'uen', // influence, affluence
    );

    // Single syllable prefixes and suffixes (complete, delimited patterns
    // passed straight to preg_replace()).
    public static $arrAffix = array(
        '`^un`',
        '`^fore`',
        '`^ware`',
        '`^none?`',
        '`^out`',
        '`^post`',
        '`^sub`',
        '`^pre`',
        '`^pro`',
        '`^dis`',
        '`^side`',
        '`ly$`',
        '`less$`',
        '`some$`',
        '`ful$`',
        '`ers?$`',
        '`ness$`',
        '`cians?$`',
        '`ments?$`',
        '`ettes?$`',
        '`villes?$`',
        '`ships?$`',
        '`sides?$`',
        '`ports?$`',
        '`shires?$`',
        '`tion(ed)?$`',
    );

    // Double syllable prefixes and suffixes
    public static $arrDoubleAffix = array(
        '`^above`',
        '`^ant[ie]`',
        '`^counter`',
        '`^hyper`',
        '`^afore`',
        '`^agri`',
        '`^in[ft]ra`',
        '`^inter`',
        '`^over`',
        '`^semi`',
        '`^ultra`',
        '`^under`',
        '`^extra`',
        '`^dia`',
        '`^micro`',
        '`^mega`',
        '`^kilo`',
        '`^pico`',
        '`^nano`',
        '`^macro`',
        '`berry$`',
        '`woman$`',
        '`women$`',
    );

    // Triple syllable prefixes and suffixes
    public static $arrTripleAffix = array(
        '`ology$`',
        '`ologist$`',
        '`onomy$`',
        '`onomist$`',
    );

    /**
     * Returns the number of syllables in the word.
     * Based in part on Greg Fast's Perl module Lingua::EN::Syllables
     *
     * Returns 0 when Text::letterCount() reports no letters in the trimmed
     * word; otherwise the result is always at least 1.
     *
     * @param   string  $strWord      Word to be measured
     * @param   string  $strEncoding  Encoding of text
     * @return  int
     */
    public static function syllableCount($strWord, $strEncoding = '')
    {
        $strWord = trim($strWord);

        // Check we have some letters
        if (Text::letterCount($strWord, $strEncoding) == 0) {
            return 0;
        }

        // Should be no non-alpha characters and lower case
        $strWord = preg_replace('`[^A-Za-z]`', '', $strWord);
        $strWord = Text::lowerCase($strWord, $strEncoding);

        // Known exceptions short-circuit the rule engine entirely.
        if (isset(self::$arrProblemWords[$strWord])) {
            return self::$arrProblemWords[$strWord];
        }

        // Try the singular form against the exception table as well.
        $singularWord = Pluralise::getSingular($strWord);
        if ($singularWord !== $strWord && isset(self::$arrProblemWords[$singularWord])) {
            return self::$arrProblemWords[$singularWord];
        }

        // Remove prefixes and suffixes and count how many were taken
        $strWord = preg_replace(self::$arrAffix, '', $strWord, -1, $intAffixCount);
        $strWord = preg_replace(self::$arrDoubleAffix, '', $strWord, -1, $intDoubleAffixCount);
        $strWord = preg_replace(self::$arrTripleAffix, '', $strWord, -1, $intTripleAffixCount);

        // Every run of vowels left in the stem counts as one syllable.
        $arrWordParts = preg_split('`[^aeiouy]+`', $strWord, -1, PREG_SPLIT_NO_EMPTY);

        $intSyllableCount = count($arrWordParts)
            + $intAffixCount
            + (2 * $intDoubleAffixCount)
            + (3 * $intTripleAffixCount);

        // Some syllables do not follow normal rules - correct for them.
        // Each rule contributes at most once, however often it matches.
        $intSyllableCount -= self::countMatchingPatterns(self::$arrSubSyllables, $strWord);
        $intSyllableCount += self::countMatchingPatterns(self::$arrAddSyllables, $strWord);

        // A word that contains letters has at least one syllable. This also
        // guards against the subtract rules pushing the total below zero.
        return max(1, $intSyllableCount);
    }

    /**
     * Returns total syllable count for text.
     *
     * The text is split on single spaces and every resulting token is
     * measured with syllableCount().
     *
     * @param   string  $strText      Text to be measured
     * @param   string  $strEncoding  Encoding of text
     * @return  int
     */
    public static function totalSyllables($strText, $strEncoding = '')
    {
        return self::sumSyllables(explode(' ', $strText), $strEncoding);
    }

    /**
     * Returns average syllables per word for text.
     *
     * The divisor is the word count reported by Text::wordCount(); the
     * division itself is delegated to Maths::bcCalc().
     *
     * @param   string  $strText      Text to be measured
     * @param   string  $strEncoding  Encoding of text
     * @return  int|float
     */
    public static function averageSyllablesPerWord($strText, $strEncoding = '')
    {
        $intWordCount = Text::wordCount($strText, $strEncoding);
        $intSyllableCount = self::sumSyllables(self::leadingWords($strText, $intWordCount), $strEncoding);

        return Maths::bcCalc($intSyllableCount, '/', $intWordCount);
    }

    /**
     * Returns the number of words with three or more syllables
     *
     * @param   string  $strText              Text to be measured
     * @param   bool    $blnCountProperNouns  Boolean - should proper nouns be included in words count
     * @param   string  $strEncoding          Encoding of text
     * @return  int
     */
    public static function wordsWithThreeSyllables($strText, $blnCountProperNouns = true, $strEncoding = '')
    {
        $intLongWordCount = 0;
        $intWordCount = Text::wordCount($strText, $strEncoding);

        foreach (self::leadingWords($strText, $intWordCount) as $strWord) {
            if (self::syllableCount($strWord, $strEncoding) > 2) {
                if ($blnCountProperNouns) {
                    $intLongWordCount++;
                } else {
                    // A word counts as a proper noun when upper-casing its
                    // first character leaves that character unchanged.
                    $strFirstLetter = Text::substring($strWord, 0, 1, $strEncoding);
                    if ($strFirstLetter !== Text::upperCase($strFirstLetter, $strEncoding)) {
                        // First letter is lower case. Count it.
                        $intLongWordCount++;
                    }
                }
            }
        }

        return $intLongWordCount;
    }

    /**
     * Returns the percentage of words with three or more syllables
     *
     * @param   string  $strText              Text to be measured
     * @param   bool    $blnCountProperNouns  Boolean - should proper nouns be included in words count
     * @param   string  $strEncoding          Encoding of text
     * @return  int|float
     */
    public static function percentageWordsWithThreeSyllables($strText, $blnCountProperNouns = true, $strEncoding = '')
    {
        $intWordCount = Text::wordCount($strText, $strEncoding);
        $intLongWordCount = self::wordsWithThreeSyllables($strText, $blnCountProperNouns, $strEncoding);

        return Maths::bcCalc(Maths::bcCalc($intLongWordCount, '/', $intWordCount), '*', 100);
    }

    /**
     * Counts how many of the given regex bodies match the word.
     *
     * @param   array   $arrPatterns  Regex bodies without delimiters
     * @param   string  $strWord      Word to test
     * @return  int     Number of patterns that matched at least once
     */
    private static function countMatchingPatterns(array $arrPatterns, $strWord)
    {
        $intMatches = 0;
        foreach ($arrPatterns as $strPattern) {
            // preg_match() yields 1 on a match and 0 otherwise.
            $intMatches += preg_match('`' . $strPattern . '`', $strWord);
        }

        return $intMatches;
    }

    /**
     * Adds up syllableCount() over a list of words.
     *
     * @param   array   $arrWords     Words to be measured
     * @param   string  $strEncoding  Encoding of text
     * @return  int
     */
    private static function sumSyllables(array $arrWords, $strEncoding)
    {
        $intSyllableCount = 0;
        foreach ($arrWords as $strWord) {
            $intSyllableCount += self::syllableCount($strWord, $strEncoding);
        }

        return $intSyllableCount;
    }

    /**
     * Splits the text on single spaces and returns at most $intWordCount
     * leading tokens.
     *
     * The limit is additionally capped at the number of tokens that exist,
     * so a word count larger than the token list never reads an undefined
     * offset. NOTE(review): whether Text::wordCount() can disagree with a
     * plain space split is not visible from this file - confirm.
     *
     * @param   string  $strText       Text to split
     * @param   int     $intWordCount  Maximum number of tokens to return
     * @return  array
     */
    private static function leadingWords($strText, $intWordCount)
    {
        $arrWords = explode(' ', $strText);
        $intLimit = min($intWordCount, count($arrWords));

        $arrLeading = array();
        for ($i = 0; $i < $intLimit; $i++) {
            $arrLeading[] = $arrWords[$i];
        }

        return $arrLeading;
    }
}
411
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.