@@ -78,32 +78,32 @@ discard block |
||
| 78 | 78 | if (!is_string($a) || !is_string($b)) { |
| 79 | 79 | return false; |
| 80 | 80 | } |
| 81 | - $filter = function ($v) { |
|
| 81 | + $filter = function($v) { |
|
| 82 | 82 | return !(ctype_space($v)); |
| 83 | 83 | }; |
| 84 | 84 | self::filter($a, $b, $filter, true); |
| 85 | 85 | return self::waorDiff($a, $b, count($a), count($b)); |
| 86 | 86 | } |
| 87 | 87 | |
/**
 * Replaces $a and $b (both passed by reference) with their filtered parts.
 *
 * Each input is first turned into a sequence by self::strtolower() when
 * $insensitive is truthy, or by self::split() otherwise. That sequence is
 * handed to self::getParts(), and the resulting parts are reduced with
 * array_filter() using $filter, so the original keys are preserved.
 *
 * @param mixed    $a             First input; overwritten with its filtered parts.
 * @param mixed    $b             Second input; overwritten with its filtered parts.
 * @param callable $filter        Predicate given to array_filter(); parts for which it returns false are dropped.
 * @param bool     $insensitive   Chooses self::strtolower() (true) or self::split() (false) as the first step.
 * @param bool     $captureLength Forwarded unchanged as the third argument of self::getParts().
 *
 * @return void
 */
private static function filter(&$a, &$b, $filter, $insensitive = true, $captureLength = false)
{
    $reduce = function ($text) use ($filter, $insensitive, $captureLength) {
        $sequence = $insensitive ? self::strtolower($text) : self::split($text);
        // getParts() reports its part counter through the by-reference second argument.
        $kept = array_filter(self::getParts($sequence, $partCount, $captureLength), $filter);

        // When exactly one part was counted, the filtered result goes through
        // self::strtolower() once more. This happens in both modes.
        // NOTE(review): confirm this is intended for the case-sensitive path too.
        return $partCount === 1 ? self::strtolower($kept) : $kept;
    };

    $a = $reduce($a);
    $b = $reduce($b);
}
@@ -116,7 +116,7 @@ discard block |
||
| 116 | 116 | |
| 117 | 117 | public static function punctuationChangesOccured($a, $b, $insensitive = true, $considerSpace = true) |
| 118 | 118 | { |
| 119 | - $filter = function ($v) use ($considerSpace) { |
|
| 119 | + $filter = function($v) use ($considerSpace) { |
|
| 120 | 120 | return $considerSpace ? !(ctype_space($v) || ctype_punct($v)) : !ctype_punct($v); |
| 121 | 121 | }; |
| 122 | 122 | if (!is_string($a) || !is_string($b)) { |
@@ -132,7 +132,7 @@ discard block |
||
| 132 | 132 | if (!is_string($a) || !is_string($b)) { |
| 133 | 133 | return false; |
| 134 | 134 | } |
| 135 | - $filter = function ($v) { |
|
| 135 | + $filter = function($v) { |
|
| 136 | 136 | return !(ctype_space($v[0]) || ctype_punct($v[0])); |
| 137 | 137 | }; |
| 138 | 138 | |
@@ -143,7 +143,7 @@ discard block |
||
| 143 | 143 | private static function aoeStemming($a, $b) |
| 144 | 144 | { |
| 145 | 145 | foreach ($a as $index=>$word) { |
| 146 | - if (!self::haveSameRoot($word[0], $b[$index][0]) || ($a[$index][1]>2 && $b[$index][1]>2)) { |
|
| 146 | + if (!self::haveSameRoot($word[0], $b[$index][0]) || ($a[$index][1] > 2 && $b[$index][1] > 2)) { |
|
| 147 | 147 | return false; |
| 148 | 148 | } |
| 149 | 149 | } |
@@ -36,7 +36,7 @@ |
||
| 36 | 36 | |
| 37 | 37 | public static function wordsReorderOccured($a, $b, $considerPunctuation = true) |
| 38 | 38 | { |
| 39 | - $filter = function ($v) use ($considerPunctuation) { |
|
| 39 | + $filter = function($v) use ($considerPunctuation) { |
|
| 40 | 40 | return $considerPunctuation ? !(ctype_space($v) || ctype_punct($v)) : !ctype_space($v); |
| 41 | 41 | }; |
| 42 | 42 | return self::similarText($a, $b, 2, true, $check, true) && is_array($check) && self::wro_filter($check, $filter) ?true :false; |
@@ -14,17 +14,17 @@ discard block |
||
| 14 | 14 | namespace EZAMA{ |
| 15 | 15 | class distance extends complexCommonTextSimilarities |
| 16 | 16 | { |
/**
 * Entry point for the Jaro / Jaro-Winkler computation on two strings.
 *
 * The most recent call is memoised in function-local statics. The memo key
 * includes $round as well as both strings, so asking for the same pair with
 * a different precision recomputes instead of returning a value rounded
 * with the previous precision.
 *
 * @param mixed $a     First string.
 * @param mixed $b     Second string.
 * @param int   $round Precision forwarded to self::getJWDistance().
 *
 * @return mixed false when either argument is not a string; otherwise the
 *               value produced by self::getJWDistance(), or on a memo hit the
 *               $distance array that call filled in by reference.
 */
public static function jaroWinkler($a, $b, $round = 2)
{
    if (!is_string($a) || !is_string($b)) {
        return false;
    }
    static $distance = array();
    static $previous = array();

    // The precision is part of the key: the stored values are already rounded.
    $key = array($a, $b, $round);
    if ($key === $previous) {
        return $distance;
    }
    $previous = $key;

    // $distance is handed over by reference so the helper can populate the memo.
    return self::getJWDistance($a, $b, $distance, $round);
}
| 30 | 30 | |
@@ -33,15 +33,15 @@ discard block |
||
| 33 | 33 | private static function getJWDistance(&$a, &$b, &$distance, $round) |
| 34 | 34 | { |
| 35 | 35 | extract(self::prepareJaroWinkler($a, $b)); |
| 36 | - for ($i=0,$min=min(count($a), count($b)),$t=0;$i<$min;$i++) { |
|
| 37 | - if ($a[$i]!==$b[$i]) { |
|
| 36 | + for ($i = 0, $min = min(count($a), count($b)), $t = 0; $i < $min; $i++) { |
|
| 37 | + if ($a[$i] !== $b[$i]) { |
|
| 38 | 38 | $t++; |
| 39 | 39 | } |
| 40 | 40 | } |
| 41 | - $t/=2; |
|
| 42 | - $distance['jaro']=1/3*($corresponding/$ca+$corresponding/$cb+($corresponding-$t)/$corresponding); |
|
| 43 | - $distance['jaro-winkler']=$distance['jaro']+(min($longCommonSubstr, 4)*0.1*(1-$distance['jaro'])); |
|
| 44 | - $distance=array_map(function ($v) use ($round) { |
|
| 41 | + $t /= 2; |
|
| 42 | + $distance['jaro'] = 1 / 3 * ($corresponding / $ca + $corresponding / $cb + ($corresponding - $t) / $corresponding); |
|
| 43 | + $distance['jaro-winkler'] = $distance['jaro'] + (min($longCommonSubstr, 4) * 0.1 * (1 - $distance['jaro'])); |
|
| 44 | + $distance = array_map(function($v) use ($round) { |
|
| 45 | 45 | return round($v, $round); |
| 46 | 46 | }, $distance); |
| 47 | 47 | |
@@ -50,15 +50,15 @@ discard block |
||
| 50 | 50 | |
/**
 * Builds the working data set used by the Jaro-Winkler computation.
 *
 * Both arguments are taken by reference and are overwritten with the result
 * of self::split(). The returned array has the keys 'a', 'b',
 * 'corresponding', 'longCommonSubstr', 'ca' and 'cb'; 'a' and 'b' are
 * sorted by key and then re-indexed from zero.
 *
 * @param mixed $a First input; replaced by its split form.
 * @param mixed $b Second input; replaced by its split form.
 *
 * @return array The prepared data set described above.
 */
private static function prepareJaroWinkler(&$a, &$b)
{
    $a = self::split($a);
    $b = self::split($b);
    $lengthA = count($a);
    $lengthB = count($b);

    $state = array(
        'a' => array(),
        'b' => array(),
        'corresponding' => 0,
        'longCommonSubstr' => 0,
        'ca' => $lengthA,
        'cb' => $lengthB,
    );

    // Largest index gap passed to jwMatches(): half the longer length, minus one.
    $window = max($lengthA, $lengthB) / 2 - 1;
    // Presumably jwMatches() fills $state by reference; its signature is not shown here.
    self::jwMatches($a, $b, $state, $window);

    foreach (array('a', 'b') as $side) {
        ksort($state[$side]);
        $state[$side] = array_values($state[$side]);
    }

    return $state;
}
| 64 | 64 | |
@@ -66,12 +66,12 @@ discard block |
||
| 66 | 66 | { |
| 67 | 67 | foreach ($a as $ind=>$chr) { |
| 68 | 68 | foreach ($b as $index=>$char) { |
| 69 | - if ($chr===$char&&(abs($index-$ind)<=$Δ)) { |
|
| 70 | - if ($ind!==$index) { |
|
| 71 | - $transpositions['a'][$ind]=$chr; |
|
| 72 | - $transpositions['b'][$index]=$char; |
|
| 69 | + if ($chr === $char && (abs($index - $ind) <= $Δ)) { |
|
| 70 | + if ($ind !== $index) { |
|
| 71 | + $transpositions['a'][$ind] = $chr; |
|
| 72 | + $transpositions['b'][$index] = $char; |
|
| 73 | 73 | } else { |
| 74 | - if ($ind-1<=$transpositions['longCommonSubstr']) { |
|
| 74 | + if ($ind - 1 <= $transpositions['longCommonSubstr']) { |
|
| 75 | 75 | $transpositions['longCommonSubstr']++; |
| 76 | 76 | } |
| 77 | 77 | } |
@@ -84,55 +84,55 @@ discard block |
||
| 84 | 84 | |
/**
 * Hamming distance: the number of positions at which two equally long
 * strings differ.
 *
 * Length is measured on the arrays returned by self::split(), the same
 * units the comparison itself runs on, rather than with the byte-based
 * strlen(). Inputs with equal character counts but different byte lengths
 * are therefore accepted, and inputs with equal byte lengths but different
 * character counts are rejected instead of yielding an order-dependent count.
 *
 * The most recent successful computation is memoised in function-local statics.
 *
 * @param mixed $a First string.
 * @param mixed $b Second string.
 *
 * @return int|false The distance, or false when an argument is not a string,
 *                   cannot be split, or the two lengths differ.
 */
public static function hamming($a, $b)
{
    if (!is_string($a) || !is_string($b)) {
        return false;
    }
    static $distance = 0;
    static $previous = array();
    if (array($a, $b) === $previous) {
        return $distance;
    }

    $partsA = self::split($a);
    $partsB = self::split($b);
    // split() relies on preg_split(), which returns false on failure
    // (for example malformed UTF-8 under the /u modifier).
    if (!is_array($partsA) || !is_array($partsB) || count($partsA) !== count($partsB)) {
        return false;
    }

    // Only successful results are memoised, so a memo hit is always a valid distance.
    $previous = array($a, $b);
    $distance = count(array_diff_assoc($partsA, $partsB));
    return $distance;
}
| 101 | 101 | |
/**
 * Dice-style similarity coefficient computed on two-character grams.
 *
 * Both strings are cut into grams with self::split($str, 2). The result is
 * twice the number of grams kept by array_intersect(), divided by the sum
 * of both adjusted lengths (gram count * 2 minus self::getEndStrLen()),
 * rounded to $round decimals.
 *
 * The most recent computation is memoised in function-local statics; the
 * memo key includes $round because the stored value is already rounded.
 *
 * @param mixed $a     First string.
 * @param mixed $b     Second string.
 * @param int   $round Number of decimals passed to round().
 *
 * @return float|false false when an argument is not a string, 0.0 when
 *                     either string is the empty string, 1.0 when both are
 *                     identical, otherwise the rounded coefficient.
 */
public static function dice($a, $b, $round = 2)
{
    if (!is_string($a) || !is_string($b)) {
        return false;
    }
    // Compare against '' explicitly: empty('0') is true, which made the
    // non-empty string '0' look empty and turned dice('0', '0') into 0.0.
    if ($a === '' || $b === '') {
        return 0.0;
    }
    if ($a === $b) {
        return 1.0;
    }

    static $distance = 0;
    static $previous = array();
    $key = array($a, $b, $round);
    if ($key === $previous) {
        return $distance;
    }
    $previous = $key;

    $a = self::split($a, 2);
    $b = self::split($b, 2);
    $caGrams = count($a);
    $cbGrams = count($b);
    $ca = $caGrams * 2 - self::getEndStrLen($a);
    $cb = $cbGrams * 2 - self::getEndStrLen($b);

    // array_intersect() keeps values of its first argument, so the longer gram list goes first.
    $shared = $caGrams > $cbGrams ? array_intersect($a, $b) : array_intersect($b, $a);
    $distance = round(2 * count($shared) / ($ca + $cb), $round);
    return $distance;
}
| 127 | 127 | |
| 128 | 128 | private static function getEndStrLen($a) |
| 129 | 129 | { |
| 130 | 130 | if (function_exists('array_key_last')) { |
| 131 | - $end=array_key_last($a); |
|
| 132 | - $end=(isset($end[1]))?0:1; |
|
| 131 | + $end = array_key_last($a); |
|
| 132 | + $end = (isset($end[1])) ? 0 : 1; |
|
| 133 | 133 | } else { |
| 134 | - $end=end($a); |
|
| 135 | - $end=(isset($end[1]))?0:1; |
|
| 134 | + $end = end($a); |
|
| 135 | + $end = (isset($end[1])) ? 0 : 1; |
|
| 136 | 136 | reset($a); |
| 137 | 137 | } |
| 138 | 138 | return $end; |
@@ -140,61 +140,61 @@ discard block |
||
| 140 | 140 | |
/**
 * Levenshtein edit distance between two strings.
 *
 * Both strings are broken up with self::split(), and
 * self::BuildLevenshteinCostMatrix() then updates the cost row by reference.
 * The distance is the last cell of that row. The most recent call is
 * memoised in function-local statics.
 *
 * @param mixed $a First string.
 * @param mixed $b Second string.
 *
 * @return mixed false when either argument is not a string, otherwise the
 *               value found in the final cell of the cost row.
 */
public static function levenshtein($a, $b)
{
    static $distance = 0;
    static $previous = array();

    if (!(is_string($a) && is_string($b))) {
        return false;
    }

    $pair = array($a, $b);
    if ($pair === $previous) {
        return $distance;
    }
    $previous = $pair;

    $source = self::split($a);
    $target = self::split($b);
    $targetLength = count($target);
    // Starting row 0..n; the helper rewrites it through the reference.
    $row = range(0, $targetLength);
    self::BuildLevenshteinCostMatrix($source, $target, count($source), $targetLength, $row);

    $distance = $row[$targetLength];

    return $distance;
}
| 163 | 163 | |
| 164 | 164 | |
/**
 * Edit distance computed with the Damerau flag of the cost-matrix helper
 * switched on.
 *
 * Works like the plain variant: split both strings with self::split(), let
 * self::BuildLevenshteinCostMatrix() update the cost row by reference (its
 * sixth argument is true here), then read the last cell of the row. The
 * most recent call is memoised in function-local statics.
 *
 * @param mixed $a First string.
 * @param mixed $b Second string.
 *
 * @return mixed false when either argument is not a string, otherwise the
 *               value found in the final cell of the cost row.
 */
public static function levenshteinDamerau($a, $b)
{
    static $distance = 0;
    static $previous = array();

    if (!(is_string($a) && is_string($b))) {
        return false;
    }

    $pair = array($a, $b);
    if ($pair === $previous) {
        return $distance;
    }
    $previous = $pair;

    $source = self::split($a);
    $target = self::split($b);
    $sourceLength = count($source);
    $targetLength = count($target);
    $row = range(0, $targetLength);
    self::BuildLevenshteinCostMatrix($source, $target, $sourceLength, $targetLength, $row, true);

    $distance = $row[$targetLength];

    return $distance;
}
| 186 | 186 | |
| 187 | - private static function BuildLevenshteinCostMatrix($a, $b, $ca, $cb, &$dis, $damerau=false) |
|
| 187 | + private static function BuildLevenshteinCostMatrix($a, $b, $ca, $cb, &$dis, $damerau = false) |
|
| 188 | 188 | { |
| 189 | - $dis_new=array(); |
|
| 190 | - for ($x=1;$x<=$ca;$x++) { |
|
| 191 | - $dis_new[0]=$x; |
|
| 192 | - for ($y=1;$y<=$cb;$y++) { |
|
| 193 | - $c = ($a[$x-1] == $b[$y-1])?0:1; |
|
| 194 | - $dis_new[$y] = min($dis[$y]+1, $dis_new[$y-1]+1, $dis[$y-1]+$c); |
|
| 189 | + $dis_new = array(); |
|
| 190 | + for ($x = 1; $x <= $ca; $x++) { |
|
| 191 | + $dis_new[0] = $x; |
|
| 192 | + for ($y = 1; $y <= $cb; $y++) { |
|
| 193 | + $c = ($a[$x - 1] == $b[$y - 1]) ? 0 : 1; |
|
| 194 | + $dis_new[$y] = min($dis[$y] + 1, $dis_new[$y - 1] + 1, $dis[$y - 1] + $c); |
|
| 195 | 195 | if ($damerau) { |
| 196 | - if ($x > 1 && $y > 1 && $a[$x-1] == $b[$y-2] && $a[$x-2] == $b[$y-1]) { |
|
| 197 | - $dis_new[$y]= min($dis_new[$y-1], $dis[$y-3] + $c) ; |
|
| 196 | + if ($x > 1 && $y > 1 && $a[$x - 1] == $b[$y - 2] && $a[$x - 2] == $b[$y - 1]) { |
|
| 197 | + $dis_new[$y] = min($dis_new[$y - 1], $dis[$y - 3] + $c); |
|
| 198 | 198 | } |
| 199 | 199 | } |
| 200 | 200 | } |
@@ -76,13 +76,13 @@ discard block |
||
| 76 | 76 | return $stats; |
| 77 | 77 | } |
| 78 | 78 | |
| 79 | - protected static function getParts($b, &$c = 0, $lengthCapture=false) |
|
| 79 | + protected static function getParts($b, &$c = 0, $lengthCapture = false) |
|
| 80 | 80 | { |
| 81 | 81 | $parts = array(); |
| 82 | 82 | $tmp = ''; |
| 83 | 83 | $c = 0; |
| 84 | - $length=0; |
|
| 85 | - $lengthCapture=(bool)$lengthCapture; |
|
| 84 | + $length = 0; |
|
| 85 | + $lengthCapture = (bool) $lengthCapture; |
|
| 86 | 86 | if ($lengthCapture) { |
| 87 | 87 | self::capturePartsWithLength($b, $length, $tmp, $c, $parts); |
| 88 | 88 | } else { |
@@ -114,17 +114,17 @@ discard block |
||
| 114 | 114 | foreach ($b as $k=>$v) { |
| 115 | 115 | $length++; |
| 116 | 116 | if (ctype_space($v) || ctype_punct($v)) { |
| 117 | - $parts[] =array($tmp,$length-1); |
|
| 118 | - $parts[] = array($v,1); |
|
| 117 | + $parts[] = array($tmp, $length - 1); |
|
| 118 | + $parts[] = array($v, 1); |
|
| 119 | 119 | $c += 2; |
| 120 | 120 | $tmp = ''; |
| 121 | - $length=0; |
|
| 121 | + $length = 0; |
|
| 122 | 122 | continue; |
| 123 | 123 | } |
| 124 | 124 | $tmp .= $v; |
| 125 | 125 | } |
| 126 | 126 | if (!empty($tmp)) { |
| 127 | - $parts[] = array($tmp,$length); |
|
| 127 | + $parts[] = array($tmp, $length); |
|
| 128 | 128 | $c++; |
| 129 | 129 | } |
| 130 | 130 | } |
@@ -146,7 +146,7 @@ discard block |
||
| 146 | 146 | if (is_array($split)) { |
| 147 | 147 | return |
| 148 | 148 | array_map( |
| 149 | - function ($val) { |
|
| 149 | + function($val) { |
|
| 150 | 150 | if (self::is_ascii($val)) { |
| 151 | 151 | return strtolower($val); |
| 152 | 152 | } |
@@ -161,21 +161,21 @@ discard block |
||
| 161 | 161 | } |
| 162 | 162 | } |
| 163 | 163 | |
/**
 * Splits a string into an array of UTF-8 characters, or into chunks of
 * $grams characters.
 *
 * $grams is honoured only when it is an integer between 1 and strlen($str);
 * any other value falls back to one-character splitting. In chunk mode a
 * shorter remainder, if any, is kept as the final element. The result of
 * the most recent call is memoised in function-local statics and reused
 * when the same string and effective gram size are requested again.
 *
 * @param mixed     $str   Text to split.
 * @param int|false $grams Chunk size in characters, or false for single characters.
 *
 * @return array|false An empty array when $str is not a string, otherwise
 *                     whatever preg_split() returned for the request.
 */
protected static function split($str, $grams = false)
{
    static $split = [];
    static $old = '';
    static $oldGrams = 1;

    if (!is_string($str)) {
        return array();
    }

    // Normalise the request: anything but a usable integer size means "no grams".
    if (!(is_int($grams) && $grams >= 1 && $grams <= strlen($str))) {
        $grams = false;
    }

    if ($old === $str && $oldGrams === $grams) {
        return $split;
    }

    $old = $str;
    $oldGrams = $grams;
    if ($grams === false) {
        $split = preg_split('//u', $str, -1, PREG_SPLIT_NO_EMPTY);
    } else {
        // The captured delimiter is the chunk itself; the empty gaps between chunks are dropped.
        $split = preg_split('/(.{'.$grams.'})/su', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
    }

    return $split;
}
@@ -95,7 +95,7 @@ discard block |
||
| 95 | 95 | } |
| 96 | 96 | |
| 97 | 97 | |
/**
 * Procedural shortcut that forwards its arguments to Distance::dice().
 *
 * @param mixed $a     First value to compare.
 * @param mixed $b     Second value to compare.
 * @param int   $round Precision forwarded unchanged.
 *
 * @return mixed Whatever Distance::dice() returns.
 */
function dice($a, $b, $round = 2)
{
    $coefficient = Distance::dice($a, $b, $round);

    return $coefficient;
}
@@ -107,7 +107,7 @@ discard block |
||
| 107 | 107 | } |
| 108 | 108 | |
| 109 | 109 | |
/**
 * Procedural shortcut that forwards its arguments to Distance::jaroWinkler().
 *
 * @param mixed $a     First value to compare.
 * @param mixed $b     Second value to compare.
 * @param int   $round Precision forwarded unchanged.
 *
 * @return mixed Whatever Distance::jaroWinkler() returns.
 */
function jaroWinkler($a, $b, $round = 2)
{
    $result = Distance::jaroWinkler($a, $b, $round);

    return $result;
}