Conditions | 142 |
Total Lines | 648 |
Code Lines | 381 |
Lines | 0 |
Ratio | 0 % |
Tests | 269 |
CRAP Score | 142 |
Changes | 0 |
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include:
If many parameters/temporary variables are present:
Complex classes like abydos.phonetic._phonet.Phonet.encode() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
1 | # -*- coding: utf-8 -*- |
||
def encode(self, word, mode=1, lang='de'):
    """Return the phonet code for a word.

    The rule table for the requested language is indexed first (see
    ``_initialize_phonet``) and the word is then rewritten rule by rule
    (see ``_phonet``).

    Parameters
    ----------
    word : str
        The word to transform
    mode : int
        The phonet variant to employ (1 or 2)
    lang : str
        ``de`` (default) for German, ``none`` for no language

    Returns
    -------
    str
        The phonet value

    Examples
    --------
    >>> pe = Phonet()
    >>> pe.encode('Christopher')
    'KRISTOFA'
    >>> pe.encode('Niall')
    'NIAL'
    >>> pe.encode('Smith')
    'SMIT'
    >>> pe.encode('Schmidt')
    'SHMIT'

    >>> pe.encode('Christopher', mode=2)
    'KRIZTUFA'
    >>> pe.encode('Niall', mode=2)
    'NIAL'
    >>> pe.encode('Smith', mode=2)
    'ZNIT'
    >>> pe.encode('Schmidt', mode=2)
    'ZNIT'

    >>> pe.encode('Christopher', lang='none')
    'CHRISTOPHER'
    >>> pe.encode('Niall', lang='none')
    'NIAL'
    >>> pe.encode('Smith', lang='none')
    'SMITH'
    >>> pe.encode('Schmidt', lang='none')
    'SCHMIDT'

    """
    # Lookup tables filled by _initialize_phonet and read by _phonet.
    # Counter is used so that unknown keys read as 0 without being stored.
    phonet_hash = Counter()  # first char -> index of its first rule
    alpha_pos = Counter()  # char -> 1 (umlaut), 2..27 ('A'..'Z'), else 0

    # (letter index, alpha_pos of the following char) -> first / last index
    # of the rules for that pair; column 0 means "any following char"
    phonet_hash_1 = Counter()
    phonet_hash_2 = Counter()

    # characters that have a control meaning inside a rule's search string
    rule_specials = '(-<^$'

    def _initialize_phonet(lang):
        """Initialize phonet variables.

        Parameters
        ----------
        lang : str
            Language to use for rules

        """
        if lang == 'none':
            _phonet_rules = self._rules_no_lang
        else:
            _phonet_rules = self._rules_german

        phonet_hash[''] = -1

        # German and international umlauts
        for j in (
            'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë',
            'Ì', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ø',
            'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß', 'Œ', 'Š', 'Ÿ',
        ):
            alpha_pos[j] = 1
            phonet_hash[j] = -1

        # "normal" letters ('A'-'Z')
        for i, j in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ'):
            alpha_pos[j] = i + 2
            phonet_hash[j] = -1

        for i in range(26):
            for j in range(28):
                phonet_hash_1[i, j] = -1
                phonet_hash_2[i, j] = -1

        # for each phonetic rule; the table is a flat sequence of triples
        # (search string, mode-1 replacement, mode-2 replacement)
        for i, rule in enumerate(_phonet_rules):
            if rule and i % 3 == 0:
                # calculate first hash value
                k = rule[0]

                if phonet_hash[k] < 0 and (
                    _phonet_rules[i + 1] or _phonet_rules[i + 2]
                ):
                    phonet_hash[k] = i

                # calculate second hash values
                if k and alpha_pos[k] >= 2:
                    k = alpha_pos[k]

                    j = k - 2
                    rule = rule[1:]

                    if not rule:
                        rule = ' '
                    elif rule[0] == '(':
                        rule = rule[1:]
                    else:
                        rule = rule[0]

                    while rule and (rule[0] != ')'):
                        k = alpha_pos[rule[0]]

                        if k > 0:
                            # add hash value for this letter
                            if phonet_hash_1[j, k] < 0:
                                phonet_hash_1[j, k] = i
                                phonet_hash_2[j, k] = i

                            if phonet_hash_2[j, k] >= (i - 30):
                                phonet_hash_2[j, k] = i
                            else:
                                k = -1

                        if k <= 0:
                            # add hash value for all letters
                            if phonet_hash_1[j, 0] < 0:
                                phonet_hash_1[j, 0] = i

                            phonet_hash_2[j, 0] = i

                        rule = rule[1:]

    def _phonet(term, mode, lang):
        """Return the phonet coded form of a term.

        Parameters
        ----------
        term : str
            Term to transform
        mode : int
            The phonet variant to employ (1 or 2)
        lang : str
            ``de`` (default) for German, ``none`` for no language

        Returns
        -------
        str
            The phonet value

        """
        if lang == 'none':
            _phonet_rules = self._rules_no_lang
        else:
            _phonet_rules = self._rules_german

        char0 = ''
        dest = term

        if not term:
            return ''

        term_length = len(term)

        # convert input string to upper-case
        src = term.translate(self._upper_trans)

        # check "src"
        i = 0  # read position in src
        j = 0  # write position in dest
        zeta = 0  # non-zero right after a '<' rule rewrote src in place

        while i < len(src):
            char = src[i]

            pos = alpha_pos[char]

            if pos >= 2:
                xpos = pos - 2

                if i + 1 == len(src):
                    pos = alpha_pos['']
                else:
                    pos = alpha_pos[src[i + 1]]

                start1 = phonet_hash_1[xpos, pos]
                start2 = phonet_hash_1[xpos, 0]
                end1 = phonet_hash_2[xpos, pos]
                end2 = phonet_hash_2[xpos, 0]

                # preserve rule priorities
                if (start2 >= 0) and ((start1 < 0) or (start2 < start1)):
                    start1, start2 = start2, start1
                    end1, end2 = end2, end1

                if (end1 >= start2) and (start2 >= 0):
                    if end2 > end1:
                        end1 = end2

                    start2 = -1
                    end2 = -1
            else:
                pos = phonet_hash[char]
                start1 = pos
                end1 = 10000
                start2 = -1
                end2 = -1

            pos = start1
            zeta0 = 0

            if pos >= 0:
                # check rules for this char
                while (_phonet_rules[pos] is None) or (
                    _phonet_rules[pos][0] == char
                ):
                    if pos > end1:
                        if start2 > 0:
                            pos = start2
                            start1 = start2
                            start2 = -1
                            end1 = end2
                            end2 = -1
                            continue

                        break

                    if (_phonet_rules[pos] is None) or (
                        _phonet_rules[pos + mode] is None
                    ):
                        # no conversion rule available
                        pos += 3
                        continue

                    # check whole string
                    matches = 1  # number of matching letters
                    priority = 5  # default priority
                    rule = _phonet_rules[pos]
                    rule = rule[1:]

                    # The control-character test applies to the current
                    # rule character (rule[0]), not to the whole remaining
                    # rule: a '-', '<', '^', '$' or '(' in the rule is never
                    # matched literally against the input.
                    while (
                        rule
                        and (len(src) > (i + matches))
                        and (src[i + matches] == rule[0])
                        and not rule[0].isdigit()
                        and (rule[0] not in rule_specials)
                    ):
                        matches += 1
                        rule = rule[1:]

                    if rule and (rule[0] == '('):
                        # check an array of letters
                        if (
                            (len(src) > (i + matches))
                            and src[i + matches].isalpha()
                            and (src[i + matches] in rule[1:])
                        ):
                            matches += 1

                            while rule and rule[0] != ')':
                                rule = rule[1:]

                            # if rule[0] == ')':
                            rule = rule[1:]

                    if rule:
                        priority0 = ord(rule[0])
                    else:
                        priority0 = 0

                    matches0 = matches

                    while rule and rule[0] == '-' and matches > 1:
                        matches -= 1
                        rule = rule[1:]

                    if rule and rule[0] == '<':
                        rule = rule[1:]

                    if rule and rule[0].isdigit():
                        # read priority
                        priority = int(rule[0])
                        rule = rule[1:]

                    if rule and rule[0:2] == '^^':
                        rule = rule[1:]

                    # character right after the matched text ('' at the end)
                    following = src[i + matches0 : i + matches0 + 1]
                    at_word_end = (not following.isalpha()) and (
                        following != '.'
                    )

                    if (
                        not rule
                        or (
                            (rule[0] == '^')
                            and ((i == 0) or not src[i - 1].isalpha())
                            and ((rule[1:2] != '$') or at_word_end)
                        )
                        or (
                            (rule[0] == '$')
                            and (i > 0)
                            and src[i - 1].isalpha()
                            and at_word_end
                        )
                    ):
                        # look for continuation, if:
                        # matches > 1 and NO '-' in first string
                        pos0 = -1

                        start3 = 0
                        start4 = 0
                        end3 = 0
                        end4 = 0

                        if (
                            (matches > 1)
                            and src[i + matches : i + matches + 1]
                            and (priority0 != ord('-'))
                        ):
                            char0 = src[i + matches - 1]
                            pos0 = alpha_pos[char0]

                            if pos0 >= 2 and src[i + matches]:
                                xpos = pos0 - 2
                                pos0 = alpha_pos[src[i + matches]]
                                start3 = phonet_hash_1[xpos, pos0]
                                start4 = phonet_hash_1[xpos, 0]
                                end3 = phonet_hash_2[xpos, pos0]
                                end4 = phonet_hash_2[xpos, 0]

                                # preserve rule priorities
                                if (start4 >= 0) and (
                                    (start3 < 0) or (start4 < start3)
                                ):
                                    start3, start4 = start4, start3
                                    end3, end4 = end4, end3

                                if (end3 >= start4) and (start4 >= 0):
                                    if end4 > end3:
                                        end3 = end4

                                    start4 = -1
                                    end4 = -1
                            else:
                                pos0 = phonet_hash[char0]
                                start3 = pos0
                                end3 = 10000
                                start4 = -1
                                end4 = -1

                            pos0 = start3

                        # check continuation rules for src[i+matches]
                        if pos0 >= 0:
                            while (_phonet_rules[pos0] is None) or (
                                _phonet_rules[pos0][0] == char0
                            ):
                                if pos0 > end3:
                                    if start4 > 0:
                                        pos0 = start4
                                        start3 = start4
                                        start4 = -1
                                        end3 = end4
                                        end4 = -1
                                        continue

                                    priority0 = -1

                                    # important
                                    break

                                if (_phonet_rules[pos0] is None) or (
                                    _phonet_rules[pos0 + mode] is None
                                ):
                                    # no conversion rule available
                                    pos0 += 3
                                    continue

                                # check whole string
                                matches0 = matches
                                priority0 = 5
                                rule = _phonet_rules[pos0]
                                rule = rule[1:]

                                # same stop condition as the primary
                                # matching loop above: digits and control
                                # characters end the literal match
                                while (
                                    rule
                                    and (
                                        src[i + matches0 : i + matches0 + 1]
                                        == rule[0]
                                    )
                                    and not rule[0].isdigit()
                                    and (rule[0] not in rule_specials)
                                ):
                                    matches0 += 1
                                    rule = rule[1:]

                                if rule and rule[0] == '(':
                                    # check an array of letters
                                    if src[
                                        i + matches0 : i + matches0 + 1
                                    ].isalpha() and (
                                        src[i + matches0] in rule[1:]
                                    ):
                                        matches0 += 1

                                        while rule and rule[0] != ')':
                                            rule = rule[1:]

                                        # if rule[0] == ')':
                                        rule = rule[1:]

                                while rule and rule[0] == '-':
                                    # "matches0" is NOT decremented
                                    # because of
                                    # "if (matches0 == matches)"
                                    rule = rule[1:]

                                if rule and rule[0] == '<':
                                    rule = rule[1:]

                                if rule and rule[0].isdigit():
                                    priority0 = int(rule[0])
                                    rule = rule[1:]

                                following0 = src[
                                    i + matches0 : i + matches0 + 1
                                ]

                                if (
                                    not rule
                                    or
                                    # rule == '^' is not possible here
                                    (
                                        (rule[0] == '$')
                                        and not following0.isalpha()
                                        and (following0 != '.')
                                    )
                                ):
                                    if matches0 == matches:
                                        # this is only a partial string
                                        pos0 += 3
                                        continue

                                    if priority0 < priority:
                                        # priority is too low
                                        pos0 += 3
                                        continue

                                    # continuation rule found
                                    break

                                pos0 += 3

                            # end of "while"
                            if (priority0 >= priority) and (
                                (_phonet_rules[pos0] is not None)
                                and (_phonet_rules[pos0][0] == char0)
                            ):
                                # a continuation rule takes precedence;
                                # skip the current rule
                                pos += 3
                                continue

                        # replace string
                        if _phonet_rules[pos] and (
                            '<' in _phonet_rules[pos][1:]
                        ):
                            priority0 = 1
                        else:
                            priority0 = 0

                        rule = _phonet_rules[pos + mode]

                        if (priority0 == 1) and (zeta == 0):
                            # rule with '<' is applied
                            if (
                                (j > 0)
                                and rule
                                and (
                                    (dest[j - 1] == char)
                                    or (dest[j - 1] == rule[0])
                                )
                            ):
                                j -= 1

                            zeta0 = 1
                            zeta += 1
                            matches0 = 0

                            # Overwrite src in place with the replacement.
                            # A slice is used so that reaching the end of
                            # src stops the loop instead of raising
                            # IndexError.
                            while rule and src[i + matches0 : i + matches0 + 1]:
                                src = (
                                    src[0 : i + matches0]
                                    + rule[0]
                                    + src[i + matches0 + 1 :]
                                )
                                matches0 += 1
                                rule = rule[1:]

                            if matches0 < matches:
                                src = (
                                    src[0 : i + matches0]
                                    + src[i + matches :]
                                )

                            # '' when the rewrite consumed the tail of src
                            char = src[i : i + 1]
                        else:
                            i = i + matches - 1
                            zeta = 0

                            while len(rule) > 1:
                                if (j == 0) or (dest[j - 1] != rule[0]):
                                    dest = (
                                        dest[0:j]
                                        + rule[0]
                                        + dest[min(len(dest), j + 1) :]
                                    )
                                    j += 1

                                rule = rule[1:]

                            # new "current char"
                            if not rule:
                                rule = ''
                                char = ''
                            else:
                                char = rule[0]

                            if (
                                _phonet_rules[pos]
                                and '^^' in _phonet_rules[pos][1:]
                            ):
                                if char:
                                    dest = (
                                        dest[0:j]
                                        + char
                                        + dest[min(len(dest), j + 1) :]
                                    )
                                    j += 1

                                src = src[i + 1 :]
                                i = 0
                                zeta0 = 1

                        break

                    pos += 3

                    if pos > end1 and start2 > 0:
                        pos = start2
                        start1 = start2
                        end1 = end2
                        start2 = -1
                        end2 = -1

            if zeta0 == 0:
                if char and ((j == 0) or (dest[j - 1] != char)):
                    # delete multiple letters only
                    dest = (
                        dest[0:j] + char + dest[min(j + 1, term_length) :]
                    )
                    j += 1

                i += 1
                zeta = 0

        dest = dest[0:j]

        return dest

    _initialize_phonet(lang)

    word = unicode_normalize('NFKC', text_type(word))
    return _phonet(word, mode, lang)
|
1723 | |||
1781 |