#!/usr/bin/env python
# Parse html tables from a given URL and output CSV.
# Note: To install a missing python module foo, do "pip install foo"
# (the older "easy_install foo" route is deprecated).

from bs4 import BeautifulSoup  # BeautifulSoup 4; the old Python 2-only "from BeautifulSoup import ..." form does not work with the Python 3 code below
import scrape  # local companion module; provides scrape.run(), used at the bottom
import urllib.request, urllib.error, urllib.parse
import html.entities
import re
import sys
import unicodedata
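# If bs4 itself is missing, "pip install beautifulsoup4" supplies the module
# imported above.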

# from http://stackoverflow.com/questions/1197981/convert-html-entities
def asciify2(s):
    # Convert numeric character references (e.g. "&#65;") to the
    # corresponding characters.
    matches = re.findall(r"&#\d+;", s)
    if len(matches) > 0:
        hits = set(matches)
        for hit in hits:
            name = hit[2:-1]
            try:
                entnum = int(name)
                s = s.replace(hit, chr(entnum))
            except ValueError:
                pass

    # Drop named entities (e.g. "&nbsp;") entirely, except "&amp;", which is
    # converted back to a plain "&" at the end.
    matches = re.findall(r"&\w+;", s)
    hits = set(matches)
    amp = "&amp;"
    if amp in hits:
        hits.remove(amp)
    for hit in hits:
        name = hit[1:-1]
        if name in html.entities.name2codepoint:
            s = s.replace(hit, "")
    s = s.replace(amp, "&")
    return s
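# e.g. asciify2("A&#66;C &amp; D&nbsp;E") returns "ABC & DE": the numeric
# reference becomes "B", "&nbsp;" is dropped, and "&amp;" becomes "&".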

def opensoup(url):
    request = urllib.request.Request(url)
    request.add_header("User-Agent", "Mozilla/5.0")
    # To mimic a real browser's user-agent string more exactly, if necessary:
    # Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14)
    # Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14
    pagefile = urllib.request.urlopen(request)
    # Name the parser explicitly; bs4 warns when none is specified.
    soup = BeautifulSoup(pagefile, "html.parser")
    pagefile.close()
    return soup
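# e.g. soup = opensoup("http://www.example.com/some_table_page.html")
# (hypothetical URL; any page reachable over HTTP works)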

def asciify(s):
    # NFKD-decompose, then drop anything that cannot be encoded as ASCII.
    # Note that .encode() returns a bytes object under Python 3.
    return unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')
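# e.g. asciify("café") returns b'cafe'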

# remove extra whitespace, including stripping leading and trailing whitespace.
def condense(s):
    # Collapse every run of whitespace (spaces, tabs, newlines) to one space.
    s = re.sub(r"\s+", " ", s)
    return s.strip()
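# e.g. condense("  a\n  b\tc ") returns "a b c"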

def stripurl(s):
    # Remove spans hidden with style="display:none", turn the "&#160;"
    # non-breaking space into a regular space, then strip all remaining
    # tags and condense the whitespace.
    s = re.sub(r"<span\s+style\s*=\s*\"display:none[^\"]*\"[^>]*>[^<]*</span>", "", s)
    s = re.sub(r"&#160;", " ", s)
    return condense(re.sub(r"<[^>]*>", " ", s))
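# e.g. stripurl('<a href="x.shtml">Temperature</a>&#160;(K)') returns "Temperature (K)"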

# Remove hidden spans and non-breaking spaces, then condense whitespace.
# Unlike stripurl, this leaves ordinary tags such as <a href=...> in place,
# which the main loop below relies on when it looks for "href" in a cell.
def striptags(s):
    s = re.sub(r"<span\s+style\s*=\s*\"display:none[^\"]*\"[^>]*>[^<]*</span>", "", s)
    s = re.sub(r"&#160;", " ", s)
    return condense(s)
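# e.g. striptags('<a href="grib2_table4-2-0-1.shtml">TMP</a>&#160;')
#      returns '<a href="grib2_table4-2-0-1.shtml">TMP</a>'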

def getUrlArgs(parseUrl):
    # Extract the two numeric table identifiers from a grib2 table filename;
    # raises AttributeError if the pattern is not present.
    return re.search(r'grib2_table4-2-(\d+)-(\d+)\.shtml', parseUrl).groups()
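# e.g. getUrlArgs('<a href="grib2_table4-2-0-1.shtml">') returns ('0', '1')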

if len(sys.argv) == 1:
    print("Usage:", sys.argv[0], "url [n]")
    print("  (where n indicates which html table to parse)")
    sys.exit(1)

url = sys.argv[1]
soup = opensoup(url)
tables = soup.find_all("table")

# Walk every row of every table (the optional [n] argument is not used here).
for table in tables:
    for r in table.find_all('tr'):
        rl = []
        for c in r.find_all(re.compile('td|th')):
            # decode_contents() gives the cell markup as a str, so the regexes
            # in striptags() can work on it directly.
            rl.append(striptags(c.decode_contents()))
        if len(rl) > 1 and "href" in rl[1]:
            print('! ' + stripurl(rl[1]))
            scrapeUrl = 'http://www.nco.ncep.noaa.gov/pmb/docs/grib2/grib2_table4-2-' + \
                getUrlArgs(rl[1])[0] + "-" + getUrlArgs(rl[1])[1] + '.shtml'
            scrape.run(scrapeUrl)
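# For example, a cell whose markup contains href="grib2_table4-2-0-1.shtml"
# produces scrapeUrl = "http://www.nco.ncep.noaa.gov/pmb/docs/grib2/grib2_table4-2-0-1.shtml",
# which is then handed to scrape.run().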