1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
namespace DaveChild\TextStatistics; |
4
|
|
|
|
5
|
|
|
/**
 * Static text helpers: normalises raw text for analysis and provides
 * multibyte-aware (when the mbstring extension is available) case, substring,
 * length and counting utilities.
 */
class Text
{
    /**
     * @var boolean|null $blnMbstring Efficiency: Is the MB String extension
     *                                loaded? Null until first checked.
     */
    protected static $blnMbstring = null;

    /**
     * @var array $clean Efficiency: Store strings once processed, keyed by
     *                   the SHA-1 hash of the raw input text.
     */
    protected static $clean = array();

    /**
     * Reports whether the mbstring extension is loaded. The answer is looked
     * up once and then remembered in self::$blnMbstring.
     * @return boolean
     */
    protected static function hasMbstring()
    {
        if (is_null(self::$blnMbstring)) {
            self::$blnMbstring = extension_loaded('mbstring');
        }

        return self::$blnMbstring;
    }

    /**
     * Converts UTF-8 text to ISO-8859-1. Uses mb_convert_encoding when the
     * mbstring extension is loaded, because utf8_decode is deprecated as of
     * PHP 8.2; falls back to utf8_decode otherwise.
     * @param  string $strText UTF-8 text
     * @return string
     */
    protected static function decodeUtf8($strText)
    {
        if (self::hasMbstring()) {
            return mb_convert_encoding($strText, 'ISO-8859-1', 'UTF-8');
        }

        return utf8_decode($strText);
    }

    /**
     * Trims, removes line breaks, multiple spaces and generally cleans text
     * before processing. HTML block-level tags and blank lines are treated
     * as sentence terminators, all terminators are unified to ".", and the
     * first character after each terminator is lower-cased. Results are
     * cached per distinct input for the lifetime of the process.
     * @param  string|boolean $strText Text to be transformed
     * @return string Cleaned text; an empty string when a boolean is given
     */
    public static function cleanText($strText)
    {
        // Check for boolean before processing as string
        if (is_bool($strText)) {
            return '';
        }

        // Check to see if we already processed this text. If we did, don't
        // re-process it.
        $key = sha1($strText);
        if (isset(self::$clean[$key])) {
            return self::$clean[$key];
        }

        // Plain ASCII stand-ins for: left/right single quote, left/right
        // double quote, en dash, em dash, ellipsis (in that order).
        $arrPlain = array("'", "'", '"', '"', '-', '--', '...');

        // Curly quotes etc as UTF-8 byte sequences. These characters have no
        // ISO-8859-1 equivalent, so they must be replaced BEFORE decoding;
        // afterwards they would already have been turned into "?" and then
        // be miscounted as sentence terminators further down.
        $strText = str_replace(
            array(
                "\xe2\x80\x98",
                "\xe2\x80\x99",
                "\xe2\x80\x9c",
                "\xe2\x80\x9d",
                "\xe2\x80\x93",
                "\xe2\x80\x94",
                "\xe2\x80\xa6"
            ),
            $arrPlain,
            $strText
        );

        $strText = self::decodeUtf8($strText);

        // The same punctuation as single bytes (145-148, 150, 151, 133),
        // which can only be matched after decoding.
        $strText = str_replace(
            array(
                chr(145),
                chr(146),
                chr(147),
                chr(148),
                chr(150),
                chr(151),
                chr(133)
            ),
            $arrPlain,
            $strText
        );

        // Replace periods within numbers (the period becomes a "0" so that it
        // is not mistaken for a sentence terminator). The pattern requires a
        // non-digit on both sides of the number.
        $strText = preg_replace('`([^0-9][0-9]+)\.([0-9]+[^0-9])`mis', '${1}0$2', $strText);

        // Handle HTML. Treat block level elements as sentence terminators and
        // remove all other tags.
        $strText = preg_replace('`<script(.*?)>(.*?)</script>`is', '', $strText);
        $strText = preg_replace('`\</?(address|blockquote|center|dir|div|dl|dd|dt|fieldset|form|h1|h2|h3|h4|h5|h6|menu|noscript|ol|p|pre|table|ul|li)[^>]*>`is', '.', $strText);
        $strText = html_entity_decode($strText);
        $strText = strip_tags($strText);

        // Assume blank lines (i.e., paragraph breaks) end sentences (useful
        // for titles in plain text documents) and replace remaining new
        // lines with spaces
        $strText = preg_replace('`(\r\n|\n\r)`is', "\n", $strText);
        $strText = preg_replace('`(\r|\n){2,}`is', ".\n\n", $strText);
        $strText = preg_replace('`[ ]*(\n|\r\n|\r)[ ]*`', ' ', $strText);

        // Replace commas, hyphens, quotes etc (count as spaces)
        $strText = preg_replace('`[",:;()/\`-]`', ' ', $strText);

        // Unify terminators and spaces
        $strText = trim($strText, '. ') . '.'; // Add final terminator.
        $strText = preg_replace('`[\.!?]`', '.', $strText); // Unify terminators
        $strText = preg_replace('`([\.\s]*\.[\.\s]*)`mis', '. ', $strText); // Merge terminators separated by whitespace.
        $strText = preg_replace('`[ ]+`', ' ', $strText); // Remove multiple spaces
        $strText = preg_replace('`([\.])[\. ]+`', '$1', $strText); // Check for duplicated terminators
        $strText = trim(preg_replace('`[ ]*([\.])`', '$1 ', $strText)); // Pad sentence terminators

        // Lower case all words following terminators (for gunning fog score)
        $strText = preg_replace_callback('`\. [^\. ]`', function($matches) { return strtolower($matches[0]); }, $strText);

        $strText = trim($strText);

        // Cache it and return
        self::$clean[$key] = $strText;
        return $strText;
    }

    /**
     * Converts string to lower case. Uses mb_strtolower when the mbstring
     * extension is loaded, regular strtolower otherwise.
     * @param  string $strText     Text to be transformed
     * @param  string $strEncoding Encoding of text; empty means the mbstring
     *                             default encoding
     * @return string
     */
    public static function lowerCase($strText, $strEncoding = '')
    {
        if (!self::hasMbstring()) {
            return strtolower($strText);
        }

        // Loose comparison on purpose: null is treated like '' (no encoding).
        if ($strEncoding == '') {
            return mb_strtolower($strText);
        }

        return mb_strtolower($strText, $strEncoding);
    }

    /**
     * Converts string to upper case. Uses mb_strtoupper when the mbstring
     * extension is loaded, regular strtoupper otherwise.
     * @param  string $strText     Text to be transformed
     * @param  string $strEncoding Encoding of text; empty means the mbstring
     *                             default encoding
     * @return string
     */
    public static function upperCase($strText, $strEncoding = '')
    {
        if (!self::hasMbstring()) {
            return strtoupper($strText);
        }

        // Loose comparison on purpose: null is treated like '' (no encoding).
        if ($strEncoding == '') {
            return mb_strtoupper($strText);
        }

        return mb_strtoupper($strText, $strEncoding);
    }

    /**
     * Gets portion of string. Uses mb_substr when the mbstring extension is
     * loaded, regular substr otherwise.
     * @param  string $strText     Text to be cut up
     * @param  int    $intStart    Start character
     * @param  int    $intLength   Length
     * @param  string $strEncoding Encoding of text; empty means the mbstring
     *                             default encoding
     * @return string
     */
    public static function substring($strText, $intStart, $intLength, $strEncoding = '')
    {
        if (!self::hasMbstring()) {
            return substr($strText, $intStart, $intLength);
        }

        // Loose comparison on purpose: null is treated like '' (no encoding).
        if ($strEncoding == '') {
            return mb_substr($strText, $intStart, $intLength);
        }

        return mb_substr($strText, $intStart, $intLength, $strEncoding);
    }

    /**
     * Gives string length. Uses mb_strlen when the mbstring extension is
     * loaded, regular strlen otherwise.
     * @param  string $strText     Text to be measured
     * @param  string $strEncoding Encoding of text; empty means the mbstring
     *                             default encoding
     * @return int
     */
    public static function textLength($strText, $strEncoding = '')
    {
        if (!self::hasMbstring()) {
            return strlen($strText);
        }

        // Loose comparison on purpose: null is treated like '' (no encoding).
        if ($strEncoding == '') {
            return mb_strlen($strText);
        }

        return mb_strlen($strText, $strEncoding);
    }

    /**
     * Alias for textLength, as "letterCount", "wordCount" etc also used
     * @param  string $strText     Text to be measured
     * @param  string $strEncoding Encoding of text
     * @return int
     */
    public static function characterCount($strText, $strEncoding = '')
    {
        return self::textLength($strText, $strEncoding);
    }

    /**
     * Gives letter count (ignores everything except A-Z and a-z). The text is
     * run through cleanText first, then measured with textLength.
     * @param  string $strText     Text to be measured
     * @param  string $strEncoding Encoding of text
     * @return int 0 for empty or whitespace-only text
     */
    public static function letterCount($strText, $strEncoding = '')
    {
        if (strlen(trim($strText)) == 0) {
            return 0;
        }

        $strText = self::cleanText($strText); // To clear out newlines etc
        $strText = preg_replace('`[^A-Za-z]+`', '', $strText);

        // textLength already picks mb_strlen or strlen as appropriate, so no
        // separate mbstring handling is needed here.
        return self::textLength($strText, $strEncoding);
    }

    /**
     * Returns word count for text, computed as the number of single spaces
     * left after collapsing whitespace runs, plus one.
     * @param  string $strText     Text to be measured
     * @param  string $strEncoding Encoding of text
     * @return int 0 for empty or whitespace-only text
     */
    public static function wordCount($strText, $strEncoding = '')
    {
        if (strlen(trim($strText)) == 0) {
            return 0;
        }

        // Collapse whitespace runs, then trim so that leading or trailing
        // whitespace is not counted as an extra word boundary.
        $strNormalised = trim(preg_replace('`\s+`', ' ', $strText));

        // Will be tripped by em dashes with spaces either side, among other similar characters
        $strSpaces = preg_replace('`[^ ]`', '', $strNormalised);

        return 1 + self::textLength($strSpaces, $strEncoding); // Space count + 1 is word count
    }

    /**
     * Returns sentence count for text, computed as the number of ".", "!"
     * and "?" characters, with a minimum of one for non-empty text.
     * @param  string $strText     Text to be measured
     * @param  string $strEncoding Encoding of text
     * @return int 0 for empty or whitespace-only text
     */
    public static function sentenceCount($strText, $strEncoding = '')
    {
        if (strlen(trim($strText)) == 0) {
            return 0;
        }

        // Will be tripped up by "Mr." or "U.K.". Not a major concern at this point.
        $strTerminators = preg_replace('`[^\.!?]`', '', $strText);

        return max(1, self::textLength($strTerminators, $strEncoding));
    }

    /**
     * Returns average words per sentence for text.
     * @param  string $strText     Text to be measured
     * @param  string $strEncoding Encoding of text
     * @return int|float
     */
    public static function averageWordsPerSentence($strText, $strEncoding = '')
    {
        $intSentenceCount = self::sentenceCount($strText, $strEncoding);
        $intWordCount = self::wordCount($strText, $strEncoding);

        // NOTE(review): both counts are 0 for empty or whitespace-only text;
        // how Maths::bcCalc handles a zero divisor is not visible here - confirm.
        return Maths::bcCalc($intWordCount, '/', $intSentenceCount);
    }
}
329
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.