Conditions | 142 |
Total Lines | 657 |
Code Lines | 382 |
Lines | 0 |
Ratio | 0 % |
Tests | 264 |
CRAP Score | 142 |
Changes | 0 |
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
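Commonly applied refactorings include Extract Method.
If many parameters/temporary variables are present, consider Replace Temp with Query, Introduce Parameter Object, or Preserve Whole Object.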
Complex methods like abydos.phonetic._phonet.Phonet.encode() often do a lot of different things. To break such a method (and the class that owns it) down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
1 | # Copyright 2014-2020 by Christopher C. Little. |
||
def encode(self, word: str) -> str:
    """Return the phonet code for a word.

    The word is NFKC-normalized, upper-cased through ``self._upper_trans``
    and then rewritten character by character using the rule table chosen
    by ``self._lang`` (``self._rules_no_lang`` for ``'none'``, otherwise
    ``self._rules_german``). The rule table is a flat sequence of triples:
    search pattern, replacement for mode 1, replacement for mode 2.

    Parameters
    ----------
    word : str
        The word to transform

    Returns
    -------
    str
        The phonet value

    Examples
    --------
    >>> pe = Phonet()
    >>> pe.encode('Christopher')
    'KRISTOFA'
    >>> pe.encode('Niall')
    'NIAL'
    >>> pe.encode('Smith')
    'SMIT'
    >>> pe.encode('Schmidt')
    'SHMIT'

    >>> pe2 = Phonet(mode=2)
    >>> pe2.encode('Christopher')
    'KRIZTUFA'
    >>> pe2.encode('Niall')
    'NIAL'
    >>> pe2.encode('Smith')
    'ZNIT'
    >>> pe2.encode('Schmidt')
    'ZNIT'

    >>> pe_none = Phonet(lang='none')
    >>> pe_none.encode('Christopher')
    'CHRISTOPHER'
    >>> pe_none.encode('Niall')
    'NIAL'
    >>> pe_none.encode('Smith')
    'SMITH'
    >>> pe_none.encode('Schmidt')
    'SCHMIDT'


    .. versionadded:: 0.1.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """
    # First rule index per leading character (-1 = no rule).
    phonet_hash = Counter()  # type: TCounter[str]
    # 0 = not a letter (Counter default), 1 = umlaut/special, 2..27 = A..Z.
    alpha_pos = Counter()  # type: TCounter[str]

    # First / last rule index per (letter - 2, alpha_pos of next char);
    # column 0 collects rules that apply regardless of the next char.
    phonet_hash_1 = Counter()  # type: TCounter[Tuple[int, int]]
    phonet_hash_2 = Counter()  # type: TCounter[Tuple[int, int]]

    def _initialize_phonet(lang: str) -> None:
        """Initialize phonet variables.

        Fills ``alpha_pos``, ``phonet_hash``, ``phonet_hash_1`` and
        ``phonet_hash_2`` from the rule table selected by ``lang``.

        Parameters
        ----------
        lang : str
            Language to use for rules


        .. versionadded:: 0.1.0

        """
        if lang == 'none':
            _phonet_rules = self._rules_no_lang
        else:
            _phonet_rules = self._rules_german

        phonet_hash[''] = -1

        # German and international umlauts
        for ch in {
            'À',
            'Á',
            'Â',
            'Ã',
            'Ä',
            'Å',
            'Æ',
            'Ç',
            'È',
            'É',
            'Ê',
            'Ë',
            'Ì',
            'Í',
            'Î',
            'Ï',
            'Ð',
            'Ñ',
            'Ò',
            'Ó',
            'Ô',
            'Õ',
            'Ö',
            'Ø',
            'Ù',
            'Ú',
            'Û',
            'Ü',
            'Ý',
            'Þ',
            'ß',
            'Œ',
            'Š',
            'Ÿ',
        }:
            alpha_pos[ch] = 1
            phonet_hash[ch] = -1

        # "normal" letters ('A'-'Z')
        for i, ch in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ'):
            alpha_pos[ch] = i + 2
            phonet_hash[ch] = -1

        for i in range(26):
            for j in range(28):
                phonet_hash_1[i, j] = -1
                phonet_hash_2[i, j] = -1

        # for each phonetic rule (only every third entry is a pattern)
        for i, rule in enumerate(_phonet_rules):
            if rule and i % 3 == 0:
                # calculate first hash value
                ch = rule[0]

                if phonet_hash[ch] < 0 and (
                    cast(str, _phonet_rules[i + 1])
                    or cast(str, _phonet_rules[i + 2])
                ):
                    phonet_hash[ch] = i

                # calculate second hash values
                if ch and alpha_pos[ch] >= 2:
                    k = alpha_pos[ch]

                    j = k - 2
                    rule = rule[1:]

                    # Reduce the pattern to the set of possible second
                    # characters: a blank, the letters of a "(...)" group,
                    # or the single next character.
                    if not rule:
                        rule = ' '
                    elif rule[0] == '(':
                        rule = rule[1:]
                    else:
                        rule = rule[0]

                    while rule and (rule[0] != ')'):
                        k = alpha_pos[rule[0]]

                        if k > 0:
                            # add hash value for this letter
                            if phonet_hash_1[j, k] < 0:
                                phonet_hash_1[j, k] = i
                                phonet_hash_2[j, k] = i

                            if phonet_hash_2[j, k] >= (i - 30):
                                phonet_hash_2[j, k] = i
                            else:
                                k = -1

                        if k <= 0:
                            # add hash value for all letters
                            if phonet_hash_1[j, 0] < 0:
                                phonet_hash_1[j, 0] = i

                            phonet_hash_2[j, 0] = i

                        rule = rule[1:]

    def _phonet(term: str, mode: int, lang: str) -> str:
        """Return the phonet coded form of a term.

        Parameters
        ----------
        term : str
            Term to transform
        mode : int
            The phonet variant to employ (1 or 2)
        lang : str
            ``de`` (default) for German, ``none`` for no language

        Returns
        -------
        str
            The phonet value

        .. versionadded:: 0.1.0

        """
        if lang == 'none':
            _phonet_rules = self._rules_no_lang
        else:
            _phonet_rules = self._rules_german

        char0 = ''
        dest = term

        if not term:
            return ''

        term_length = len(term)

        # convert input string to upper-case
        src = term.translate(self._upper_trans)

        # check "src"
        i = 0  # read position in src
        j = 0  # write position in dest
        zeta = 0  # non-zero right after a '<' rule rewrote src in place

        while i < len(src):
            char = src[i]

            pos = alpha_pos[char]

            if pos >= 2:
                xpos = pos - 2

                if i + 1 == len(src):
                    pos = alpha_pos['']
                else:
                    pos = alpha_pos[src[i + 1]]

                start1 = phonet_hash_1[xpos, pos]
                start2 = phonet_hash_1[xpos, 0]
                end1 = phonet_hash_2[xpos, pos]
                end2 = phonet_hash_2[xpos, 0]

                # preserve rule priorities
                if (start2 >= 0) and ((start1 < 0) or (start2 < start1)):
                    pos = start1
                    start1 = start2
                    start2 = pos
                    pos = end1
                    end1 = end2
                    end2 = pos

                if (end1 >= start2) and (start2 >= 0):
                    if end2 > end1:
                        end1 = end2

                    start2 = -1
                    end2 = -1
            else:
                pos = phonet_hash[char]
                start1 = pos
                # NOTE(review): 10000 acts as an "no upper bound" sentinel;
                # presumably larger than any rule table - confirm.
                end1 = 10000
                start2 = -1
                end2 = -1

            pos = start1
            zeta0 = 0

            if pos >= 0:
                # check rules for this char
                while (_phonet_rules[pos] is None) or (
                    cast(str, _phonet_rules[pos])[0] == char
                ):
                    if pos > end1:
                        if start2 > 0:
                            pos = start2
                            start1 = start2
                            start2 = -1
                            end1 = end2
                            end2 = -1
                            continue

                        break

                    if (_phonet_rules[pos] is None) or (
                        _phonet_rules[pos + mode] is None
                    ):
                        # no conversion rule available
                        pos += 3
                        continue

                    # check whole string
                    matches = 1  # number of matching letters
                    priority = 5  # default priority
                    rule = cast(str, _phonet_rules[pos])[1:]

                    # Match literal pattern characters only; stop at the
                    # first digit or rule metacharacter even if the input
                    # happens to contain the same character.
                    while (
                        rule
                        and (len(src) > (i + matches))
                        and (src[i + matches] == rule[0])
                        and not rule[0].isdigit()
                        and (rule[0] not in '(-<^$')
                    ):
                        matches += 1
                        rule = rule[1:]

                    if rule and (rule[0] == '('):
                        # check an array of letters
                        if (
                            (len(src) > (i + matches))
                            and src[i + matches].isalpha()
                            and (src[i + matches] in rule[1:])
                        ):
                            matches += 1

                            while rule and rule[0] != ')':
                                rule = rule[1:]

                            # skip the closing ')' (no-op on empty rule)
                            rule = rule[1:]

                    if rule:
                        priority0 = ord(rule[0])
                    else:
                        priority0 = 0

                    matches0 = matches

                    while rule and rule[0] == '-' and matches > 1:
                        matches -= 1
                        rule = rule[1:]

                    if rule and rule[0] == '<':
                        rule = rule[1:]

                    if rule and rule[0].isdigit():
                        # read priority
                        priority = int(rule[0])
                        rule = rule[1:]

                    if rule and rule[0:2] == '^^':
                        rule = rule[1:]

                    # Character right after the full match ('' at the end
                    # of src); a boundary is anything but a letter or '.'.
                    following = src[i + matches0 : i + matches0 + 1]
                    at_boundary = (
                        not following.isalpha() and following != '.'
                    )

                    if (
                        not rule
                        or (
                            (rule[0] == '^')
                            and ((i == 0) or not src[i - 1].isalpha())
                            and ((rule[1:2] != '$') or at_boundary)
                        )
                        or (
                            (rule[0] == '$')
                            and (i > 0)
                            and src[i - 1].isalpha()
                            and at_boundary
                        )
                    ):
                        # look for continuation, if:
                        # matches > 1 and NO '-' in first string
                        pos0 = -1

                        start3 = 0
                        start4 = 0
                        end3 = 0
                        end4 = 0

                        if (
                            (matches > 1)
                            and src[i + matches : i + matches + 1]
                            and (priority0 != ord('-'))
                        ):
                            char0 = src[i + matches - 1]
                            pos0 = alpha_pos[char0]

                            if pos0 >= 2 and src[i + matches]:
                                xpos = pos0 - 2
                                pos0 = alpha_pos[src[i + matches]]
                                start3 = phonet_hash_1[xpos, pos0]
                                start4 = phonet_hash_1[xpos, 0]
                                end3 = phonet_hash_2[xpos, pos0]
                                end4 = phonet_hash_2[xpos, 0]

                                # preserve rule priorities
                                if (start4 >= 0) and (
                                    (start3 < 0) or (start4 < start3)
                                ):
                                    pos0 = start3
                                    start3 = start4
                                    start4 = pos0
                                    pos0 = end3
                                    end3 = end4
                                    end4 = pos0

                                if (end3 >= start4) and (start4 >= 0):
                                    if end4 > end3:
                                        end3 = end4

                                    start4 = -1
                                    end4 = -1
                            else:
                                pos0 = phonet_hash[char0]
                                start3 = pos0
                                end3 = 10000
                                start4 = -1
                                end4 = -1

                            pos0 = start3

                            # check continuation rules for src[i+matches]
                            if pos0 >= 0:
                                while (_phonet_rules[pos0] is None) or (
                                    cast(str, _phonet_rules[pos0])[0] == char0
                                ):
                                    if pos0 > end3:
                                        if start4 > 0:
                                            pos0 = start4
                                            start3 = start4
                                            start4 = -1
                                            end3 = end4
                                            end4 = -1
                                            continue

                                        priority0 = -1

                                        # important
                                        break

                                    if (_phonet_rules[pos0] is None) or (
                                        _phonet_rules[pos0 + mode] is None
                                    ):
                                        # no conversion rule available
                                        pos0 += 3
                                        continue

                                    # check whole string
                                    matches0 = matches
                                    priority0 = 5
                                    rule = cast(str, _phonet_rules[pos0])[1:]

                                    # Same literal-only matching as above.
                                    while (
                                        rule
                                        and (
                                            src[i + matches0 : i + matches0 + 1]
                                            == rule[0]
                                        )
                                        and not rule[0].isdigit()
                                        and (rule[0] not in '(-<^$')
                                    ):
                                        matches0 += 1
                                        rule = rule[1:]

                                    if rule and rule[0] == '(':
                                        # check an array of letters
                                        if src[
                                            i + matches0 : i + matches0 + 1
                                        ].isalpha() and (
                                            src[i + matches0] in rule[1:]
                                        ):
                                            matches0 += 1

                                            while rule and rule[0] != ')':
                                                rule = rule[1:]

                                            # skip the closing ')'
                                            rule = rule[1:]

                                    while rule and rule[0] == '-':
                                        # "matches0" is NOT decremented
                                        # because of
                                        # "if (matches0 == matches)"
                                        rule = rule[1:]

                                    if rule and rule[0] == '<':
                                        rule = rule[1:]

                                    if rule and rule[0].isdigit():
                                        priority0 = int(rule[0])
                                        rule = rule[1:]

                                    following0 = src[
                                        i + matches0 : i + matches0 + 1
                                    ]

                                    if (
                                        not rule
                                        # rule == '^' is not possible here
                                        or (
                                            (rule[0] == '$')
                                            and not following0.isalpha()
                                            and (following0 != '.')
                                        )
                                    ):
                                        if matches0 == matches:
                                            # this is only a partial string
                                            pos0 += 3
                                            continue

                                        if priority0 < priority:
                                            # priority is too low
                                            pos0 += 3
                                            continue

                                        # continuation rule found
                                        break

                                    pos0 += 3

                                # end of "while": a continuation rule with
                                # sufficient priority wins, so skip the
                                # current rule
                                if (priority0 >= priority) and (
                                    (_phonet_rules[pos0] is not None)
                                    and (
                                        cast(str, _phonet_rules[pos0])[0]
                                        == char0
                                    )
                                ):

                                    pos += 3
                                    continue

                        # replace string
                        if _phonet_rules[pos] and (
                            '<' in cast(str, _phonet_rules[pos])[1:]
                        ):
                            priority0 = 1
                        else:
                            priority0 = 0

                        rule = cast(str, _phonet_rules[pos + mode])

                        if (priority0 == 1) and (zeta == 0):
                            # rule with '<' is applied
                            if (
                                (j > 0)
                                and rule
                                and (
                                    (dest[j - 1] == char)
                                    or (dest[j - 1] == rule[0])
                                )
                            ):
                                j -= 1

                            zeta0 = 1
                            zeta += 1
                            matches0 = 0

                            # Overwrite src in place with the replacement.
                            # The slice (not an index) stops cleanly at the
                            # end of src when the replacement is longer
                            # than the remaining input.
                            while rule and src[i + matches0 : i + matches0 + 1]:
                                src = (
                                    src[0 : i + matches0]
                                    + rule[0]
                                    + src[i + matches0 + 1 :]
                                )
                                matches0 += 1
                                rule = rule[1:]

                            if matches0 < matches:
                                src = (
                                    src[0 : i + matches0]
                                    + src[i + matches :]
                                )

                            # src may now end at i; slicing yields '' then
                            char = src[i : i + 1]
                        else:
                            i = i + matches - 1
                            zeta = 0

                            # emit all but the last replacement character
                            while len(rule) > 1:
                                if (j == 0) or (dest[j - 1] != rule[0]):
                                    dest = (
                                        dest[0:j]
                                        + rule[0]
                                        + dest[min(len(dest), j + 1) :]
                                    )
                                    j += 1

                                rule = rule[1:]

                            # new "current char"
                            if not rule:
                                rule = ''
                                char = ''
                            else:
                                char = rule[0]

                            if (
                                _phonet_rules[pos]
                                and '^^'
                                in cast(str, _phonet_rules[pos])[1:]
                            ):
                                if char:
                                    dest = (
                                        dest[0:j]
                                        + char
                                        + dest[min(len(dest), j + 1) :]
                                    )
                                    j += 1

                                # restart on the remainder of src
                                src = src[i + 1 :]
                                i = 0
                                zeta0 = 1

                        break

                    pos += 3

                    if pos > end1 and start2 > 0:
                        pos = start2
                        start1 = start2
                        end1 = end2
                        start2 = -1
                        end2 = -1

            if zeta0 == 0:
                if char and ((j == 0) or (dest[j - 1] != char)):
                    # delete multiple letters only
                    dest = (
                        dest[0:j] + char + dest[min(j + 1, term_length) :]
                    )
                    j += 1

                i += 1
                zeta = 0

        dest = dest[0:j]

        return dest

    _initialize_phonet(self._lang)

    word = unicode_normalize('NFKC', word)
    return _phonet(word, self._mode, self._lang)
1740 | |||
1746 |