#!/usr/bin/env python3
# Parse HTML tables from a given URL and output CSV.
# Note: to install a missing Python module foo, run "pip install foo"
# (or, on older setups, "easy_install foo"; you may need to run
# "easy_install pip" first to get pip itself).

from bs4 import BeautifulSoup
import scrape   # local helper module; scrape.run(url) is called below
import urllib.request, urllib.error, urllib.parse
import html.entities
import re
import sys
import unicodedata


# from http://stackoverflow.com/questions/1197981/convert-html-entities
def asciify2(s):
    # convert numeric character references like &#233; to the corresponding character
    matches = re.findall(r"&#\d+;", s)
    if len(matches) > 0:
        hits = set(matches)
        for hit in hits:
            name = hit[2:-1]
            try:
                entnum = int(name)
                s = s.replace(hit, chr(entnum))
            except ValueError:
                pass

    # convert named entities (except &amp;, which is handled last)
    matches = re.findall(r"&\w+;", s)
    hits = set(matches)
    amp = "&amp;"
    if amp in hits:
        hits.remove(amp)
    for hit in hits:
        name = hit[1:-1]
        if name in html.entities.name2codepoint:
            s = s.replace(hit, chr(html.entities.name2codepoint[name]))
    s = s.replace(amp, "&")
    return s
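# Example (illustrative input, not from the original script):
#   asciify2("50 &#176;C &amp; 60% RH")  ->  "50 °C & 60% RH"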


def opensoup(url):
    request = urllib.request.Request(url)
    request.add_header("User-Agent", "Mozilla/5.0")
    # To mimic a real browser's user-agent string more exactly, if necessary:
    # Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14)
    # Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14
    pagefile = urllib.request.urlopen(request)
    soup = BeautifulSoup(pagefile, "html.parser")
    pagefile.close()
    return soup
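# Example (hypothetical URL, shown only to illustrate the call):
#   soup = opensoup("http://www.nco.ncep.noaa.gov/pmb/docs/grib2/grib2_table4-2.shtml")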


def asciify(s):
    # NFKD-normalize, then drop any characters that cannot be encoded as ASCII
    return unicodedata.normalize('NFKD', s).encode('ascii', 'ignore').decode('ascii')
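# Example:
#   asciify("café au lait")  ->  "cafe au lait"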


# remove extra whitespace, including stripping leading and trailing whitespace.
def condense(s):
    # collapse every run of whitespace (including newlines) to a single space
    s = re.sub(r"\s+", " ", s)
    return s.strip()
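# Example:
#   condense("  Temperature \n  (K) ")  ->  "Temperature (K)"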


# remove hidden "display:none" spans, &#160; entities, and all remaining tags,
# then condense whitespace
def stripurl(s):
    s = re.sub(r'<span\s+style\s*=\s*"display:none[^"]*"[^>]*>[^<]*</span>', "", s)
    s = re.sub(r"&#160;", " ", s)
    return condense(re.sub(r"<[^>]*>", " ", s))
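# Example:
#   stripurl('<a href="grib2_table4-2-0-1.shtml">Moisture</a>&#160;')  ->  "Moisture"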


# remove hidden "display:none" spans and &#160; entities, then condense
# whitespace; unlike stripurl(), this leaves other tags in place
def striptags(s):
    s = re.sub(r'<span\s+style\s*=\s*"display:none[^"]*"[^>]*>[^<]*</span>', "", s)
    s = re.sub(r"&#160;", " ", s)
    return condense(s)
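# Example (note that ordinary tags are kept):
#   striptags("0&#160;1  <br/> 2")  ->  "0 1 <br/> 2"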


# extract the two numbers X and Y from a grib2_table4-2-X-Y.shtml link
def getUrlArgs(parseUrl):
    return re.search(r'grib2_table4-2-(\d+)-(\d+)\.shtml', parseUrl).groups()
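# Example:
#   getUrlArgs("/pmb/docs/grib2/grib2_table4-2-0-1.shtml")  ->  ("0", "1")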


if len(sys.argv) == 1:
    print("Usage:", sys.argv[0], "url [n]")
    print("  (where n indicates which html table to parse)")
    sys.exit(1)
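# Example invocation (the script name and page URL are illustrative only):
#   python3 parse_tables.py http://www.nco.ncep.noaa.gov/pmb/docs/grib2/grib2_table4-2.shtml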

url = sys.argv[1]
soup = opensoup(url)
tables = soup.find_all("table")

for table in tables:
    for r in table.find_all('tr'):
        rl = []
        for c in r.find_all(re.compile('td|th')):
            # decode_contents() gives the cell's inner HTML as a string
            rl.append(striptags(c.decode_contents()))
        if len(rl) > 1 and "href" in rl[1]:
            print('! ' + stripurl(rl[1]))
            scrapeUrl = 'http://www.nco.ncep.noaa.gov/pmb/docs/grib2/grib2_table4-2-' + \
                getUrlArgs(rl[1])[0] + "-" + getUrlArgs(rl[1])[1] + '.shtml'
            scrape.run(scrapeUrl)