@@ 17-39 (lines=23) @@ | ||
14 | ||
15 | ||
16 | # from http://stackoverflow.com/questions/1197981/convert-html-entities |
|
def asciify2(s):
    """Decode numeric HTML entities in *s* and strip known named ones.

    Two passes are made over the text, in this order:

    1. Decimal character references such as ``&#233;`` are replaced by the
       character with that code point.  References whose number is not a
       valid code point are left in the text unchanged.
    2. Named references such as ``&nbsp;`` whose name appears in
       ``html.entities.name2codepoint`` are removed.  ``&amp;`` is the one
       exception: it is decoded to a literal ``&``.  Unknown names are left
       unchanged.

    Args:
        s: Text that may contain HTML character references.

    Returns:
        The text with the substitutions above applied.
    """

    def _decode_numeric(match):
        # chr() raises ValueError above 0x10FFFF and OverflowError for values
        # that do not fit in a C int; int() raises ValueError for absurdly
        # long digit runs.  In every such case keep the original text.
        try:
            return chr(int(match.group(1)))
        except (ValueError, OverflowError):
            return match.group(0)

    def _strip_named(match):
        name = match.group(1)
        # "&amp;" must survive as a plain ampersand instead of being dropped
        # with the other named entities.
        if name == "amp":
            return "&"
        if name in html.entities.name2codepoint:
            return ""
        return match.group(0)

    # Raw strings: "\d" / "\w" in a normal literal are invalid escapes.
    s = re.sub(r"&#(\d+);", _decode_numeric, s)
    return re.sub(r"&(\w+);", _strip_named, s)
|
40 | ||
41 | ||
42 | def opensoup(url): |
@@ 16-38 (lines=23) @@ | ||
13 | ||
14 | ||
15 | # from http://stackoverflow.com/questions/1197981/convert-html-entities |
|
def asciify2(s):
    """Decode numeric HTML entities in *s* and strip known named ones.

    Two passes are made over the text, in this order:

    1. Decimal character references such as ``&#233;`` are replaced by the
       character with that code point.  References whose number is not a
       valid code point are left in the text unchanged.
    2. Named references such as ``&nbsp;`` whose name appears in
       ``html.entities.name2codepoint`` are removed.  ``&amp;`` is the one
       exception: it is decoded to a literal ``&``.  Unknown names are left
       unchanged.

    Args:
        s: Text that may contain HTML character references.

    Returns:
        The text with the substitutions above applied.
    """

    def _decode_numeric(match):
        # chr() raises ValueError above 0x10FFFF and OverflowError for values
        # that do not fit in a C int; int() raises ValueError for absurdly
        # long digit runs.  In every such case keep the original text.
        try:
            return chr(int(match.group(1)))
        except (ValueError, OverflowError):
            return match.group(0)

    def _strip_named(match):
        name = match.group(1)
        # "&amp;" must survive as a plain ampersand instead of being dropped
        # with the other named entities.
        if name == "amp":
            return "&"
        if name in html.entities.name2codepoint:
            return ""
        return match.group(0)

    # Raw strings: "\d" / "\w" in a normal literal are invalid escapes.
    s = re.sub(r"&#(\d+);", _decode_numeric, s)
    return re.sub(r"&(\w+);", _strip_named, s)
|
39 | ||
40 | ||
41 | def opensoup(url): |