#!/usr/bin/env python
# Parse html tables from a given URL and output CSV.
# Note: to install a missing python module foo, run "pip install foo"
# (easy_install was the older way of doing this and is now deprecated).

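# Example invocation (illustrative; the host is hypothetical, but the filename
# must match the pattern expected by getUrlArgs() below):
#   python <this script> http://example.com/grib2_table4-2-0-1.shtml
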
from bs4 import BeautifulSoup  # bs4 is the Python 3 package for BeautifulSoup
import urllib.request, urllib.error, urllib.parse
import html.entities
import re
import sys
import unicodedata


# from http://stackoverflow.com/questions/1197981/convert-html-entities
def asciify2(s):
    # First pass: numeric character references such as "&#233;".
    matches = re.findall(r"&#\d+;", s)
    if len(matches) > 0:
        hits = set(matches)
        for hit in hits:
            name = hit[2:-1]
            try:
                entnum = int(name)
                s = s.replace(hit, chr(entnum))
            except ValueError:
                pass

    # Second pass: named entities such as "&eacute;". "&amp;" is set aside
    # and replaced last so it cannot create new entities mid-pass.
    matches = re.findall(r"&\w+;", s)
    hits = set(matches)
    amp = "&amp;"
    if amp in hits:
        hits.remove(amp)
    for hit in hits:
        name = hit[1:-1]
        if name in html.entities.name2codepoint:
            s = s.replace(hit, chr(html.entities.name2codepoint[name]))
    s = s.replace(amp, "&")
    return s

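# Example for asciify2() (illustrative):
#   asciify2("caf&#233; &amp; tea")  ->  "café & tea"
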
def opensoup(url):
    request = urllib.request.Request(url)
    request.add_header("User-Agent", "Mozilla/5.0")
    # To mimic a real browser's user-agent string more exactly, if necessary:
    # Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14)
    # Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14
    pagefile = urllib.request.urlopen(request)
    # Name the parser explicitly; bs4 warns when none is given.
    soup = BeautifulSoup(pagefile, "html.parser")
    pagefile.close()
    return soup

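# Example for opensoup() (illustrative; hypothetical URL):
#   soup = opensoup("http://example.com/")  # fetch the page, return a parsed tree
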
def asciify(s):
    # NFKD-normalize, drop anything that will not encode as ASCII, and decode
    # back so the caller gets text (str) rather than bytes.
    return unicodedata.normalize('NFKD', s).encode('ascii', 'ignore').decode('ascii')


# remove extra whitespace, including stripping leading and trailing whitespace.
def condense(s):
    # Collapse every run of whitespace (spaces, tabs, newlines) to one space.
    s = re.sub(r"\s+", " ", s)
    return s.strip()

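# Example for condense() (illustrative): condense("  a \n\t b  ")  ->  "a b"
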
# this gets rid of tags and condenses whitespace
def striptags(s):
    # Drop content hidden with display:none, turn non-breaking spaces into
    # ordinary ones, then replace remaining tags with spaces and condense.
    s = re.sub(r'<span\s+style\s*=\s*"display:none[^"]*"[^>]*>[^<]*</span>', "", s)
    s = re.sub(r"&#160;", " ", s)
    return condense(re.sub(r"<[^>]*>", " ", s))

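# Example for striptags() (illustrative):
#   striptags("<td><b>Temperature</b>&#160;(K)</td>")  ->  "Temperature (K)"
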
if len(sys.argv) == 1:  # called with no arguments
    print("Usage:", sys.argv[0], "url [n]")
    print(" (where n indicates which html table to parse)")
    # NOTE: the optional n argument is not yet used by run() below,
    # which parses every table on the page.
    sys.exit(1)


def getUrlArgs(parseUrl):
    # Pull the two table numbers out of a filename like
    # "grib2_table4-2-0-1.shtml".
    return re.search(r'grib2_table4-2-(\d+)-(\d+)\.shtml', parseUrl).groups()

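# Example for getUrlArgs() (illustrative):
#   getUrlArgs("http://example.com/grib2_table4-2-0-1.shtml")  ->  ('0', '1')
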
def run(url):
    soup = opensoup(url)
    tables = soup.findAll("table")
    for table in tables:
        ct = 0
        for r in table.findAll('tr'):
            rl = []
            for c in r.findAll(re.compile('td|th')):
                # decode_contents() gives the cell's inner HTML as str
                rl.append(striptags(c.decode_contents()))
            if ct > 0:
                # prefix each data row with the two table numbers from the URL
                rl[0] = getUrlArgs(url)[0].zfill(3) + " " + \
                        getUrlArgs(url)[1].zfill(3) + " " + rl[0].zfill(3) + " 000"
            if len(rl) > 1:
                if "Reserved" in rl[1]:
                    rl[0] = '!' + rl[0]
                if "See Table" in rl[2] or "Code table" in rl[2]:
                    rl[2] = "cat"
                rl[1] = rl[1][:32].ljust(32)
                rl[2] = rl[2].ljust(20)
                rl[3] = rl[3].ljust(12) + " 0 -9999.00"
                if ct:  # the header row (ct == 0) is never printed
                    print(" ".join(rl))
            ct += 1


if __name__ == '__main__':
    run(sys.argv[1])