Issues (81)

src/ub.strings.similarity.js (4 issues)

Labels
Severity
1
/** global: UB */
2
3
/** global: N1D */

/**
 * Equality and similarity helpers that are handed to `UB.registerFuncs` together
 * with `String.prototype`. Every method reads the string it operates on from
 * `this` and calls its siblings as string methods (`this.isEqual(...)`,
 * `str1.similarityScore(...)`), so the methods only work once they are reachable
 * from strings.
 *
 * Kept as a top-level `var` so the binding stays visible exactly as before.
 *
 * Comparisons that involve `this` intentionally use loose `==` / `!=`: if this
 * file is loaded as a sloppy-mode script, `this` is a boxed String object and a
 * strict comparison against a primitive string would always be false.
 */
var stringFuncs = {

	// equality testing

	/**
	 * Case-sensitive equality.
	 * @param {string} str2 - string to compare with.
	 * @returns {boolean} result of `isEqual(str2)`.
	 */
	equals: function(str2){
		return this.isEqual(str2);
	},

	/**
	 * Case-insensitive equality.
	 * @param {string} str2 - string to compare with.
	 * @returns {boolean} result of `isEqual(str2, false)`.
	 */
	equalsCI: function(str2){
		return this.isEqual(str2, false);
	},

	/**
	 * Compares this string with `str2`, optionally ignoring case.
	 *
	 * The case-insensitive path compares char codes one by one, folding each code
	 * through `UB.UTF_upperToLower` when it is `<= UB.UTF_casingTablesMax`.
	 * It deliberately avoids calling `toUpperCase()` on both strings; the original
	 * author noted that doing so "causes thousands of HOC in a big loop".
	 *
	 * @param {string} str2 - string to compare with; `null`/`undefined` is allowed.
	 * @param {boolean} [caseSensitive=true] - compare with plain `==` when true.
	 * @param {boolean} [str1IsLower=false] - skip the table lookup for this string.
	 * @param {boolean} [str2IsLower=false] - skip the table lookup for `str2`.
	 * @returns {boolean} true when the strings are considered equal.
	 */
	isEqual: function(str2, caseSensitive = true, str1IsLower = false, str2IsLower = false){
		const str1 = this;

		if (caseSensitive) {
			return (str1 == str2);
		}

		// quick checks
		if (str2 == null) {
			return (str1 == str2);
		}
		if (str1.length != str2.length) {
			return false;
		}

		// init casing tables
		if (UB.UTF_lowerToUpper == null){
			UB.initCasing();
		}
		const foldMax = UB.UTF_casingTablesMax;
		const toLower = UB.UTF_upperToLower;

		// very fast CI comparison
		for (let i = 0, len = str1.length; i < len; i++){
			let code1 = str1.charCodeAt(i);
			let code2 = str2.charCodeAt(i);

			// codes above foldMax are compared unchanged
			if (!str1IsLower && code1 <= foldMax) {
				code1 = toLower[code1];
			}
			if (!str2IsLower && code2 <= foldMax) {
				code2 = toLower[code2];
			}

			if (code1 != code2) {
				return false;
			}
		}
		return true;
	},

	/**
	 * Tests whether this string equals at least one entry of a list.
	 * `null`/`undefined` entries are skipped; a `null`/`undefined` list yields false.
	 *
	 * @param {Array<string>} str2 - candidates to compare against.
	 * @param {boolean} [caseSensitive=true] - compare with plain `==` when true.
	 * @param {boolean} [str1IsLower=false] - forwarded to `isEqual` (case-insensitive path only).
	 * @param {boolean} [str2IsLower=false] - forwarded to `isEqual` (case-insensitive path only).
	 * @returns {boolean} true on the first matching entry, otherwise false.
	 */
	isEqualAny: function(str2, caseSensitive = true, str1IsLower = false, str2IsLower = false){
		const str1 = this;

		if (str2 == null) {
			return false;
		}

		for (let s = 0, sl = str2.length; s < sl; s++){
			const candidate = str2[s];
			if (candidate == null) {
				continue;
			}

			const matched = caseSensitive
				? (str1 == candidate)
				: str1.isEqual(candidate, false, str1IsLower, str2IsLower);
			if (matched) {
				return true;
			}
		}

		return false;
	},

	/**
	 * Negation of the equality test.
	 *
	 * @param {string} str2 - string to compare with.
	 * @param {boolean} [caseSensitive=true] - compare with plain `!=` when true.
	 * @returns {boolean} true when the strings are considered different.
	 */
	isNotEqual: function(str2, caseSensitive = true){
		const str1 = this;

		// null on either side cannot go through isEqual, so it is settled here
		if (caseSensitive || str1 == null || str2 == null) {
			return (str1 != str2);
		}

		// very fast CI comparison
		return !str1.isEqual(str2, false);
	},

	// similarity testing

	/**
	 * Similarity as a percentage derived from the Levenshtein distance:
	 * `(1 - distance / longerLength) * 100`.
	 *
	 * @param {string} target - string to compare with.
	 * @returns {number} 100 for identical strings (including two empty strings),
	 *   0 when every position of the longer string needs an edit.
	 */
	similarityLevenshtein: function(target){
		const source = this;
		const distance = source.levenshteinDistance(target);
		const maxLen = Math.max(source.length, target.length);
		if (maxLen === 0) {
			return 100;
		}
		return (1 - distance/maxLen) * 100;
	},

	/**
	 * Levenshtein distance: the number of deletions, insertions, or substitutions
	 * required to transform this string into `target`.
	 *
	 * Only the previous and the current row of the distance table are kept.
	 *
	 * @param {string} target - string to transform this string into.
	 * @returns {number} the edit distance.
	 */
	levenshteinDistance: function(target){
		const source = this;
		const n = source.length;
		const m = target.length;

		if (n === 0) { return m; }
		if (m === 0) { return n; }

		// row 0: turning an empty prefix into the first j chars costs j insertions
		let previousRow = [];
		for (let j = 0; j <= m; j++) { previousRow[j] = j; }

		for (let i = 1; i <= n; i++) {
			const sourceChar = source.charAt(i - 1);
			// column 0: dropping the first i chars costs i deletions
			const currentRow = [i];

			for (let j = 1; j <= m; j++) {
				const cost = (sourceChar === target.charAt(j - 1)) ? 0 : 1;
				currentRow[j] = Math.min(
					previousRow[j] + 1,        // deletion
					currentRow[j - 1] + 1,     // insertion
					previousRow[j - 1] + cost  // substitution (or match)
				);
			}
			previousRow = currentRow;
		}
		return previousRow[m];
	},

	/**
	 * Similarity score based on character pairs; the original author notes it is
	 * better for unequal length strings, returning ~0.9 for "jhonny" and "jonny"
	 * and 0.3 for "ABCD" and "DBCD".
	 *
	 * Returns 1 when the (normalised) strings are equal; otherwise returns whatever
	 * `N1D.MatchingScore` yields for the two `splitCharPairs()` results.
	 * NOTE(review): `removeSpaces`, `splitCharPairs` and `N1D` are defined elsewhere.
	 *
	 * @param {string} str2 - string to compare with.
	 * @param {boolean} [caseSensitive=false] - when false both strings are upper-cased first.
	 * @param {boolean} [spaceSensitive=true] - when false `removeSpaces()` is applied to both strings first.
	 * @returns {number} the similarity score.
	 */
	similarityScore: function(str2, caseSensitive = false, spaceSensitive = true){
		let str1 = this;

		if (!caseSensitive){
			str1 = str1.toUpperCase();
			str2 = str2.toUpperCase();
		}

		if (!spaceSensitive){
			str1 = str1.removeSpaces();
			str2 = str2.removeSpaces();
		}

		if (str1 == str2) {
			return 1;
		}

		return N1D.MatchingScore(str1.splitCharPairs(), str2.splitCharPairs());
	},

	/**
	 * Position-by-position similarity: the fraction of indexes at which both
	 * strings hold the same character (0.75 for "ABCD" and "DBCD").
	 *
	 * Strings of different length are not processed here; they are delegated to
	 * `similarityScore`. Two empty strings score 1, matching what `similarityScore`
	 * returns for equal strings (the plain ratio would be 0/0).
	 *
	 * @param {string} str2 - string to compare with.
	 * @param {boolean} [caseSensitive=false] - when false both strings are upper-cased first.
	 * @returns {number} the similarity score.
	 */
	similarityScoreChars: function(str2, caseSensitive = false){
		let str1 = this;

		// don't process unequal len strings
		if (str1.length != str2.length) {
			return str1.similarityScore(str2, caseSensitive);
		}

		// equal lengths, so both strings are empty here
		if (str1.length === 0) {
			return 1;
		}

		if (!caseSensitive){
			str1 = str1.toUpperCase();
			str2 = str2.toUpperCase();
		}

		// just counts matching chars
		return str1.countMatchingChars(str2) / str1.length;
	},

	/**
	 * Passes the `splitCharPairs()` results of both strings to `N1D.MatchingSlots`
	 * and returns its result.
	 * NOTE(review): `splitCharPairs` and `N1D` are defined elsewhere.
	 *
	 * @param {string} str2 - string to compare with.
	 * @param {boolean} [caseSensitive=false] - when false both strings are upper-cased first.
	 * @returns {*} whatever `N1D.MatchingSlots` returns (presumably a count).
	 */
	countMatchingCharPairs: function(str2, caseSensitive = false){
		let str1 = this;

		if (!caseSensitive){
			str1 = str1.toUpperCase();
			str2 = str2.toUpperCase();
		}

		return N1D.MatchingSlots(str1.splitCharPairs(), str2.splitCharPairs());
	},

	/**
	 * Counts the indexes (over the length of this string) at which this string and
	 * `str2` hold the same character. Case is not normalised here.
	 *
	 * @param {string} str2 - string to compare with.
	 * @returns {number} number of matching positions.
	 */
	countMatchingChars: function(str2){
		const str1 = this;

		let count = 0;
		for (let i = 0, len = str1.length; i < len; i++){
			if (str1.charAt(i) === str2.charAt(i)) {
				count++;
			}
		}
		return count;
	},

	/**
	 * Per the original author: finds the starting point of the 2nd instance of any
	 * repeated substring .. "zjohn ajohn" will return 7.
	 *
	 * Every window of `similarChars` characters is scored (via
	 * `similarityScoreChars`) against each non-overlapping window further right.
	 * NOTE(review): picking the best pair relies on `UB.newArray`,
	 * `N1D.MaxIndexInRange` and `N1D.MinInRange`, which are defined elsewhere, so
	 * the result when nothing reaches `minSimilarity` cannot be stated from here.
	 *
	 * @param {boolean} caseSensitive - forwarded to `similarityScoreChars`; also controls the final char comparison.
	 * @param {number} [similarChars=4] - window size in characters.
	 * @param {number} [minSimilarity=0.7] - lower bound passed to `N1D.MaxIndexInRange`.
	 * @returns {number} index inside this string (see the note above).
	 */
	indexOfBestMatchRepeated: function(caseSensitive, similarChars = 4, minSimilarity = 0.7){
		const text = this;
		const windowCount = text.length - (similarChars - 1);

		// per char set
		const sims = [];
		for (let c = 0; c < windowCount; c++){
			const chars = text.slice(c, c + similarChars);

			// per every other char set ahead of this
			const sims2 = UB.newArray(0, windowCount);
			for (let c2 = c + similarChars; c2 < windowCount; c2++){
				const chars2 = text.slice(c2, c2 + similarChars);

				// calc similarity
				sims2[c2] = chars.similarityScoreChars(chars2, caseSensitive);
			}

			// store best similarity match if above wanted similarity
			sims[c] = N1D.MaxIndexInRange(sims2, minSimilarity, 1);
		}

		// find char nearest to left
		const leftCharIndex = N1D.MinInRange(sims, 0, windowCount, true);
		const leftCharIndex2 = sims[leftCharIndex];

		// now find first exactly matching char after it
		for (let left = leftCharIndex, right = leftCharIndex2, end = text.length; right < end; left++, right++) {
			let leftChar = text.charAt(left);
			let rightChar = text.charAt(right);
			if (!caseSensitive) {
				leftChar = leftChar.toUpperCase();
				rightChar = rightChar.toUpperCase();
			}
			if (leftChar === rightChar) {
				return right;
			}
		}
		return leftCharIndex2;
	},

	/**
	 * Threshold test on top of `similarityScore`.
	 *
	 * @param {string} str2 - string to compare with.
	 * @param {boolean} [caseSensitive=false] - forwarded to `similarityScore`.
	 * @param {number} [threshold=0.8] - minimum score that counts as similar.
	 * @returns {boolean} true when the score is `>= threshold`.
	 */
	isSimilar: function(str2, caseSensitive = false, threshold = 0.8){
		const str1 = this;
		return (str1.similarityScore(str2, caseSensitive) >= threshold);
	},

	// non-function terminator entry kept from the original table
	none:null
};
261
262
// register funcs
263
// Hands the method table to UB.registerFuncs together with String.prototype.
// The methods in stringFuncs read the string from `this` and call one another as
// string methods (e.g. `this.isEqual(...)`), so they rely on this call having run.
// NOTE(review): UB.registerFuncs is defined elsewhere - presumably it copies each
// entry onto the target; confirm how it treats the non-function `none: null` entry.
UB.registerFuncs(String.prototype, stringFuncs);
264