#!/usr/bin/env python

# Parse HTML tables from a given URL and output CSV.

# Note: to install a missing Python module foo, run "pip install foo"
# (easy_install is the older, now-deprecated alternative).

from bs4 import BeautifulSoup
import urllib.request, urllib.error, urllib.parse
import html.entities
import re
import sys
import unicodedata


# from http://stackoverflow.com/questions/1197981/convert-html-entities
def asciify2(s):
    # First pass: decode numeric character references such as "&#233;".
    matches = re.findall(r"&#\d+;", s)
    if len(matches) > 0:
        hits = set(matches)
        for hit in hits:
            name = hit[2:-1]
            try:
                entnum = int(name)
                s = s.replace(hit, chr(entnum))
            except ValueError:
                pass

    # Second pass: decode named entities such as "&eacute;", saving
    # "&amp;" for last so freshly decoded ampersands are not re-expanded.
    matches = re.findall(r"&\w+;", s)
    hits = set(matches)
    amp = "&amp;"
    if amp in hits:
        hits.remove(amp)
    for hit in hits:
        name = hit[1:-1]
        if name in html.entities.name2codepoint:
            s = s.replace(hit, chr(html.entities.name2codepoint[name]))
    s = s.replace(amp, "&")
    return s
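
# A quick sanity check (hypothetical input; relies only on the decoding above):
#
#   asciify2("caf&#233; &amp; cr&egrave;me")  ->  'café & crème'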


def opensoup(url):
    request = urllib.request.Request(url)
    # Some sites refuse urllib's default User-Agent, so pretend to be a browser.
    request.add_header("User-Agent", "Mozilla/5.0")
    # To mimic a real browser's user-agent string more exactly, if necessary:
    # Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14)
    # Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14
    pagefile = urllib.request.urlopen(request)
    soup = BeautifulSoup(pagefile, "html.parser")
    pagefile.close()
    return soup
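
# For example (hypothetical URL; any page containing <table> elements works):
#
#   soup = opensoup("https://example.com/tables.html")
#   tables = soup.find_all("table")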


def asciify(s):
    # Decompose accented characters, drop the non-ASCII bytes, and decode
    # back to str (without the decode this would return bytes on Python 3).
    return unicodedata.normalize('NFKD', s).encode('ascii', 'ignore').decode('ascii')
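
# e.g. asciify('déjà vu') -> 'deja vu'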


# Remove extra whitespace, including stripping leading and trailing whitespace.
def condense(s):
    # re.sub's fourth positional argument is "count", not "flags", so the
    # original re.DOTALL passed there silently capped the number of
    # substitutions; \s already matches newlines, so no flag is needed.
    s = re.sub(r"\s+", " ", s)
    return s.strip()
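
# e.g. condense("  a\n\t b  ") -> 'a b'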


# This gets rid of tags and condenses whitespace.
def striptags(s):
    # Drop invisible <span style="display:none..."> elements entirely, turn
    # non-breaking spaces (&#160;) into real spaces, then replace every
    # remaining tag with a space and condense the result.
    s = re.sub(r'<span\s+style\s*=\s*"display:none[^"]*"[^>]*>[^<]*</span>', "", s)
    s = re.sub(r"&#160;", " ", s)
    return condense(re.sub(r"<[^>]*>", " ", s))
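
# e.g. striptags('<td>1&#160;<b>Temp</b></td>') -> '1 Temp'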


if len(sys.argv) == 1:  # called with no arguments
    print("Usage:", sys.argv[0], "url [n]")
    print("  (where n indicates which html table to parse)")
    sys.exit(1)


def getUrlArgs(parseUrl):
    # Extract the two table numbers embedded in the page's file name.
    return re.search(r'grib2_table4-2-(\d+)-(\d+)\.shtml', parseUrl).groups()
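
# e.g. getUrlArgs(".../grib2_table4-2-0-1.shtml") -> ('0', '1')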


def run(url):
    soup = opensoup(url)
    tables = soup.find_all("table")
    for table in tables:
        ct = 0
        for r in table.find_all('tr'):
            rl = []
            for c in r.find_all(re.compile('td|th')):
                # decode_contents() returns str; bs3's renderContents()
                # would hand striptags bytes under Python 3.
                rl.append(striptags(c.decode_contents()))
            if ct > 0:  # skip the header row
                # Prefix each row with the zero-padded table numbers taken
                # from the URL, plus the zero-padded parameter number.
                rl[0] = getUrlArgs(url)[0].zfill(3) + " " + \
                    getUrlArgs(url)[1].zfill(3) + " " + rl[0].zfill(3) + " 000"
                if len(rl) > 1:
                    if "Reserved" in rl[1]:
                        rl[0] = '!' + rl[0]
                    if "See Table" in rl[2] or "Code table" in rl[2]:
                        rl[2] = "cat"
                    # Pad the name, units, and abbreviation columns to
                    # fixed widths.
                    rl[1] = rl[1][:32].ljust(32)
                    rl[2] = rl[2].ljust(20)
                    rl[3] = rl[3].ljust(12) + " 0 -9999.00"
            if ct:
                print(" ".join(rl))
            ct += 1
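
# Each printed line is a fixed-width record; for a hypothetical row
# ("0", "Temperature", "K", "TMP") from grib2_table4-2-0-0.shtml it
# would look roughly like:
#
#   000 000 000 000 Temperature ... K ... TMP          0 -9999.00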


if __name__ == '__main__':
    run(sys.argv[1])