Passed
Push — master ( 0b7aa3...1e9d3c )
by Chris
08:25
created

Sentiment::setDataFolder()   A

Complexity

Conditions 4
Paths 6

Size

Total Lines 17
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 4
eloc 9
c 1
b 0
f 0
nc 6
nop 2
dl 0
loc 17
rs 9.9666
1
<?php
2
namespace PHPInsight;
3
4
/*
5
  phpInsight is a Naive Bayes classifier to calculate sentiment. The program
6
  uses a database of words categorised as positive, negative or neutral
7
8
  Copyright (C) 2012  James Hennessey
9
  Class modifications and improvements by Ismayil Khayredinov ([email protected])
10
11
  This program is free software: you can redistribute it and/or modify
12
  it under the terms of the GNU General Public License as published by
13
  the Free Software Foundation, either version 3 of the License, or
14
  (at your option) any later version.
15
16
  This program is distributed in the hope that it will be useful,
17
  but WITHOUT ANY WARRANTY; without even the implied warranty of
18
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19
  GNU General Public License for more details.
20
21
  You should have received a copy of the GNU General Public License
22
  along with this program.  If not, see <http://www.gnu.org/licenses/>
23
24
 */
25
26
class Sentiment {
27
28
	/**
29
	 * Location of the dictionary files
30
	 * @var str 
31
	 */
32
	private $dataFolder = '';
33
34
	/**
35
	 * List of tokens to ignore
36
	 * @var array 
37
	 */
38
	private $ignoreList = array();
39
40
	/**
41
	 * List of words with negative prefixes, e.g. isn't, aren't
42
	 * @var array
43
	 */
44
	private $negPrefixList = array();
45
46
	/**
47
	 * Storage of cached dictionaries
48
	 * @var array 
49
	 */
50
	private $dictionary = array();
51
52
	/**
53
	 * Min length of a token for it to be taken into consideration
54
	 * @var int
55
	 */
56
	private $minTokenLength = 1;
57
58
	/**
59
	 * Max length of a token for it to be taken into consideration
60
	 * @var int
61
	 */
62
	private $maxTokenLength = 15;
63
64
	/**
65
	 * Classification of opinions
66
	 * @var array
67
	 */
68
	private $classes = array('pos', 'neg', 'neu');
69
70
	/**
71
	 * Token score per class
72
	 * @var array 
73
	 */
74
	private $classTokCounts = array(
75
		'pos' => 0,
76
		'neg' => 0,
77
		'neu' => 0
78
	);
79
80
	/**
81
	 * Analyzed text score per class
82
	 * @var array
83
	 */
84
	private $classDocCounts = array(
85
		'pos' => 0,
86
		'neg' => 0,
87
		'neu' => 0
88
	);
89
90
	/**
91
	 * Number of tokens in a text
92
	 * @var int 
93
	 */
94
	private $tokCount = 0;
95
96
	/**
97
	 * Number of analyzed texts
98
	 * @var int
99
	 */
100
	private $docCount = 0;
101
102
	/**
103
	 * Implication that the analyzed text has 1/3 chance of being in either of the 3 categories
104
	 * @var array
105
	 */
106
	private $prior = array(
107
		'pos' => 0.333,
108
		'neg' => 0.333,
109
		'neu' => 0.334,
110
	);
111
112
	/**
	 * Build a classifier and prime it with its word lists.
	 *
	 * The data folder is resolved by setDataFolder(); asking it to load the
	 * defaults as well caches the dictionaries and fetches the ignore and
	 * negative prefix lists.
	 *
	 * @param string|bool $dataFolder base folder of the data files, false to use the default folder
	 */
	public function __construct($dataFolder = false) {

		//resolve the base folder, then load and cache dictionaries, ignore and prefix lists
		$this->setDataFolder($dataFolder, true);
	}
125
126
	/**
	 * Get the relative score of a text for each class
	 *
	 * @param string $sentence Text to analyze
	 * @return array Map of class name to its share of the total score,
	 *               rounded to 3 decimals and sorted highest first
	 */
	public function score($sentence) {

		//Fall back to empty lists if the word lists could not be loaded as arrays
		$negPrefixList = is_array($this->negPrefixList) ? $this->negPrefixList : array();
		$ignoreList = is_array($this->ignoreList) ? $this->ignoreList : array();

		//Remove the white space after each negative prefix so it is glued to the next word.
		//The match is case-insensitive because the text is only lowercased later, while tokenising
		foreach ($negPrefixList as $negPrefix) {
			//An empty prefix would strip every space from the text
			if ($negPrefix === '') {
				continue;
			}
			$sentence = str_ireplace($negPrefix . ' ', $negPrefix, $sentence);
		}

		//Tokenise the document once and keep only the tokens that count:
		//not too short, not too long and not in the ignore list
		$tokens = array();
		foreach ($this->_getTokens($sentence) as $token) {
			$length = strlen($token);
			if ($length > $this->minTokenLength && $length < $this->maxTokenLength && !in_array($token, $ignoreList, true)) {
				$tokens[] = $token;
			}
		}

		//Every class starts with a score of 1
		$scores = array_fill_keys($this->classes, 1);

		//Dividing by a power of two is exact, so rescaling keeps the ratios between classes intact
		$rescaleAbove = pow(2, 512);

		foreach ($tokens as $token) {
			foreach ($this->classes as $class) {
				//Occurrences of the token in the dictionary of this class, 0 if unknown
				$count = isset($this->dictionary[$token][$class]) ? $this->dictionary[$token][$class] : 0;
				$scores[$class] *= ($count + 1);
			}

			//Long texts would otherwise overflow to INF and produce NAN shares
			if (max($scores) > $rescaleAbove) {
				foreach ($this->classes as $class) {
					$scores[$class] /= $rescaleAbove;
				}
			}
		}

		//Score for a class is its prior probability multiplied by its token score
		$total_score = 0;
		foreach ($this->classes as $class) {
			$scores[$class] = $this->prior[$class] * $scores[$class];
			$total_score += $scores[$class];
		}

		//Make the scores relative to the total
		foreach ($this->classes as $class) {
			$scores[$class] = round($scores[$class] / $total_score, 3);
		}

		//Sort array in reverse order, best class first
		arsort($scores);

		return $scores;
	}
195
196
	/**
	 * Classify a text as the class with the highest score
	 *
	 * @param string $sentence Text to analyze
	 * @return string pos|neu|neg
	 */
	public function categorise($sentence) {

		$ranked = $this->score($sentence);

		//score() returns the classes sorted best first, so the first key is the classification
		reset($ranked);

		return key($ranked);
	}
211
212
	/**
	 * Load and cache the dictionary of one class
	 *
	 * Reads the serialized word list stored in "<dataFolder>data.<class>.php",
	 * adds every word to the cached dictionary and updates the token and
	 * document counters.
	 *
	 * @param string $class Class whose word list is loaded, e.g. pos, neg or neu
	 * @return boolean true when the list was loaded, false when the file is
	 *                 missing or does not hold a serialized array
	 */
	public function setDictionary($class) {
		/**
		 *  For some people this file extention causes some problems!
		 */
		$fn = "{$this->dataFolder}data.{$class}.php";

		if (!file_exists($fn)) {
			echo 'File does not exist: ' . $fn;
			return false;
		}

		//NOTE(review): unserialize() is only safe on trusted, locally generated data files
		$words = unserialize(file_get_contents($fn));

		if (!is_array($words)) {
			echo 'File does not contain a word list: ' . $fn;
			return false;
		}

		//Loop through all of the entries
		foreach ($words as $word) {

			$this->docCount++;
			$this->classDocCounts[$class]++;

			//Trim word
			$word = trim($word);

			//A word listed more than once is still only stored once per class
			if (!isset($this->dictionary[$word][$class])) {
				$this->dictionary[$word][$class] = 1;
			}

			$this->classTokCounts[$class]++;
			$this->tokCount++;
		}

		return true;
	}
253
254
	/**
	 * Set the base folder for loading data models
	 *
	 * The stored folder always ends with a slash because file names are
	 * appended to it directly. When the given folder does not exist an error
	 * is printed and the current folder is kept.
	 *
	 * @param string|bool $dataFolder base folder, false (or empty) to use the "data" folder next to this file
	 * @param bool $loadDefaults true - load everything by default | false - just change the directory
	 */
	public function setDataFolder($dataFolder = false, $loadDefaults = false){
		//if $dataFolder not provided, load default, else set the provided one
		if($dataFolder === false || $dataFolder === null || $dataFolder === ''){
			$this->dataFolder = __DIR__ . '/data/';
		}
		elseif(is_string($dataFolder) && is_dir($dataFolder)){
			//normalise the trailing separator so "dir" and "dir/" behave the same
			$this->dataFolder = rtrim($dataFolder, '/\\') . '/';
		}
		else{
			echo 'Error: could not find the directory - '.$dataFolder;
		}

		//load default directories, ignore and prefix lists
		if($loadDefaults !== false){
			$this->loadDefaults();
		}
	}
278
279
	/**
	 * Load and cache dictionaries, get ignore and prefix lists
	 *
	 * Problems are reported with echo. A list that could not be loaded as an
	 * array is replaced by an empty array so later lookups keep working.
	 */
	private function loadDefaults(){
		// Load and cache dictionaries
		foreach ($this->classes as $class) {
			if (!$this->setDictionary($class)) {
				echo "Error: Dictionary for class '$class' could not be loaded";
			}
		}

		if (empty($this->dictionary)) {
			echo 'Error: Dictionaries not set';
		}

		//Get the list of tokens to ignore
		$this->ignoreList = $this->getList('ign');

		//Anything but an array means the list could not be loaded
		if (!is_array($this->ignoreList)) {
			echo 'Error: Ignore List not set';
			$this->ignoreList = array();
		}

		//Get the list of negative prefixes
		$this->negPrefixList = $this->getList('prefix');

		//Anything but an array means the list could not be loaded
		if (!is_array($this->negPrefixList)) {
			echo 'Error: Negative Prefix List not set';
			$this->negPrefixList = array();
		}
	}
307
308
	/**
	 * Break text into tokens
	 *
	 * The text is stripped of accents, lowercased and freed from punctuation,
	 * then split on any run of spaces, tabs or line breaks.
	 *
	 * @param string $string	String being broken up
	 * @return array An array of non-empty tokens
	 */
	private function _getTokens($string) {

		//Clean the string so it is free from accents
		$string = $this->_cleanString($string);

		//Make all text lowercase as the database of words is in lowercase
		$string = strtolower($string);
		$string = preg_replace('/[[:punct:]]+/', '', $string);

		//Split on every kind of line ending, tabs and repeated spaces, dropping empty pieces
		$tokens = preg_split('/[ \t\r\n]+/', $string, -1, PREG_SPLIT_NO_EMPTY);

		//preg_split() returns false on failure
		return is_array($tokens) ? $tokens : array();
	}
332
333
	/**
	 * Load an additional word list
	 *
	 * Reads the serialized list stored in "<dataFolder>data.<type>.php" and
	 * returns its entries with backslash escapes resolved and surrounding
	 * white space removed.
	 *
	 * @param string $type List to load, e.g. ign or prefix
	 * @return array The words of the list, an empty array when the file is
	 *               missing or does not hold a serialized array
	 */
	public function getList($type) {

		$fn = "{$this->dataFolder}data.{$type}.php";

		if (!file_exists($fn)) {
			echo 'File does not exist: ' . $fn;
			return array();
		}

		//NOTE(review): unserialize() is only safe on trusted, locally generated data files
		$words = unserialize(file_get_contents($fn));

		if (!is_array($words)) {
			echo 'File does not contain a word list: ' . $fn;
			return array();
		}

		$wordList = array();

		foreach ($words as $word) {
			//remove any slashes, then trim the word
			$wordList[] = trim(stripcslashes($word));
		}

		return $wordList;
	}
365
366
	/**
	 * Function to clean a string so all characters with accents are turned into ASCII characters. EG: à = a
	 *
	 * Well-formed UTF-8 input has its two-byte accented letters replaced;
	 * any other input is treated as single-byte Latin-1. The result is lowercased.
	 *
	 * @param string $string
	 * @return string
	 */
	private function _cleanString($string) {

		//Latin-1 code points of the accented letters, in the same order as $plain
		$codes = array(
			/* A */ 192, 193, 194, 195, 196, 197,
			/* a */ 224, 225, 226, 227, 228, 229,
			/* O */ 210, 211, 212, 213, 214, 216,
			/* o */ 242, 243, 244, 245, 246, 248,
			/* E */ 200, 201, 202, 203,
			/* e */ 232, 233, 234, 235,
			/* Cc */ 199, 231,
			/* I */ 204, 205, 206, 207,
			/* i */ 236, 237, 238, 239,
			/* U */ 217, 218, 219, 220,
			/* u */ 249, 250, 251, 252,
			/* yNn */ 255, 209, 241
		);
		$plain = 'AAAAAAaaaaaaOOOOOOooooooEEEEeeeeCcIIIIiiiiUUUUuuuuyNn';

		//A pattern with the u modifier only matches when the subject is well-formed UTF-8
		$isUtf8 = preg_match('//u', $string) === 1;

		$map = array();
		foreach ($codes as $i => $code) {
			//U+0080..U+00FF are encoded in UTF-8 as the two bytes 110000xx 10xxxxxx
			$accented = $isUtf8 ? chr(0xC0 | ($code >> 6)) . chr(0x80 | ($code & 0x3F)) : chr($code);
			$map[$accented] = $plain[$i];
		}

		return strtolower(strtr($string, $map));
	}
390
391
	/**
	 * Rebuilds the data.* files of every class from the dictionary sources
	 *
	 * For each class the file "dictionaries/source.<class>.php" next to this
	 * file is included and the variable named after the class is serialized
	 * into "<dataFolder>data.<class>.php". A class whose source is missing or
	 * does not define that variable as an array is reported and skipped, its
	 * existing data file is left untouched.
	 */
	public function reloadDictionaries(){

		$dictionaries = __DIR__ . '/dictionaries/';

		foreach($this->classes as $class){
			$dict = "{$dictionaries}source.{$class}.php";
			$fn = "{$this->dataFolder}data.{$class}.php";

			if (!is_file($dict)) {
				echo 'File does not exist: ' . $dict;
				continue;
			}

			//Plain require: require_once would skip the file on a second call
			//and leave the variable undefined
			require $dict;

			if (!isset($$class) || !is_array($$class)) {
				echo "Error: Dictionary source for class '$class' does not define \$$class";
				continue;
			}

			//Only replace the old data file once the new content is known to be usable
			if (file_exists($fn)) {
				unlink($fn);
			}
			file_put_contents($fn, serialize($$class));
		}
	}
420
421
}
422
423
?>
0 ignored issues
show
Best Practice introduced by
It is not recommended to use PHP's closing tag ?> in files other than templates.

Using a closing tag in PHP files that only contain PHP code is not recommended as you might accidentally add whitespace after the closing tag which would then be output by PHP. This can cause severe problems, for example headers cannot be sent anymore.

A simple precaution is to leave off the closing tag as it is not required, and it also has no negative effects whatsoever.

Loading history...
424