#!/usr/bin/env python
# Parse HTML tables from a given URL and output CSV.
# Note: to install a missing Python module foo, run "pip install foo".
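#
# Example invocation (the script name and URL are placeholders, not from the
# original source); the page given should contain tables whose cells link to
# NCEP grib2_table4-2-X-Y.shtml pages:
#
#   python tablescrape.py "http://www.example.com/grib2_tables.html"
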
# BeautifulSoup 4; the old Python 2-only "BeautifulSoup" package does not work here.
from bs4 import BeautifulSoup
import scrape
import urllib.request
import urllib.error
import urllib.parse
import html.entities
import re
import sys
import unicodedata


# from http://stackoverflow.com/questions/1197981/convert-html-entities
def asciify2(s):
    # First pass: decode numeric character references such as "&#233;".
    matches = re.findall(r"&#\d+;", s)
    if len(matches) > 0:
        hits = set(matches)
        for hit in hits:
            name = hit[2:-1]
            try:
                entnum = int(name)
                s = s.replace(hit, chr(entnum))
            except ValueError:
                pass

    # Second pass: decode named entities such as "&eacute;", saving "&amp;"
    # for last so that escaped entities are not decoded twice.
    matches = re.findall(r"&\w+;", s)
    hits = set(matches)
    amp = "&amp;"
    if amp in hits:
        hits.remove(amp)
    for hit in hits:
        name = hit[1:-1]
        if name in html.entities.name2codepoint:
            s = s.replace(hit, chr(html.entities.name2codepoint[name]))
    s = s.replace(amp, "&")
    return s
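
# A quick illustration of asciify2 (the sample string is made up, not taken
# from the original source):
#   asciify2("R&#233;sum&eacute; &amp; caf&eacute;")  ->  "Résumé & café"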


def opensoup(url):
    request = urllib.request.Request(url)
    request.add_header("User-Agent", "Mozilla/5.0")
    # To mimic a real browser's user-agent string more exactly, if necessary:
    # Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14)
    # Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14
    pagefile = urllib.request.urlopen(request)
    soup = BeautifulSoup(pagefile, "html.parser")
    pagefile.close()
    return soup


def asciify(s):
    # NFKD-decompose, drop anything non-ASCII, and return a plain str.
    return unicodedata.normalize('NFKD', s).encode('ascii', 'ignore').decode('ascii')
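
# For example (made-up input): asciify("café") returns "cafe"; the combining
# accent left behind by NFKD normalization is dropped by the ASCII encode.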


# remove extra whitespace, including stripping leading and trailing whitespace.
def condense(s):
    # Note: re.sub's fourth positional argument is "count", so flags such as
    # re.DOTALL must be passed by keyword.
    s = re.sub(r"\s+", " ", s, flags=re.DOTALL)
    return s.strip()
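
# For example (made-up input): condense("  foo \n\n bar ") returns "foo bar".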


def stripurl(s):
    # Drop invisible "display:none" spans and non-breaking spaces, then strip
    # all remaining tags and condense whitespace.
    s = re.sub(r'<span\s+style\s*=\s*"display:none[^"]*"[^>]*>[^<]*</span>', "", s)
    s = re.sub(r"&#160;", " ", s)
    return condense(re.sub(r"<[^>]*>", " ", s))
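
# For example (made-up markup):
#   stripurl('<a href="x.shtml">Temp</a>&#160;(K)') returns 'Temp (K)'.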


# This removes hidden spans and non-breaking spaces and condenses whitespace,
# but it leaves other tags in place, which lets the main loop below look for
# "href" in a cell's markup.
def striptags(s):
    s = re.sub(r'<span\s+style\s*=\s*"display:none[^"]*"[^>]*>[^<]*</span>', "", s)
    s = re.sub(r"&#160;", " ", s)
    return condense(s)
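
# For example (made-up markup):
#   striptags('<a href="x.shtml">Temp</a>&#160;(K)')
# returns '<a href="x.shtml">Temp</a> (K)', with the link markup left intact.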


def getUrlArgs(parseUrl):
    # Pull the two numeric indices out of a grib2_table4-2-X-Y.shtml link.
    return re.search(r'grib2_table4-2-(\d+)-(\d+)\.shtml', parseUrl).groups()
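
# For example: getUrlArgs("grib2_table4-2-0-1.shtml") returns ('0', '1').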


if len(sys.argv) == 1:
    print("Usage: ", sys.argv[0], " url [n]")
    print(" (where n indicates which html table to parse)")
    sys.exit(1)

url = sys.argv[1]
soup = opensoup(url)
tables = soup.find_all("table")

# Walk every row of every table; when a cell links to one of the
# grib2_table4-2-X-Y.shtml pages, print its cleaned-up text and hand the
# constructed URL to scrape.run() for further processing.
for table in tables:
    for r in table.find_all('tr'):
        rl = []
        for c in r.find_all(re.compile('td|th')):
            rl.append(striptags(c.decode_contents()))
        if len(rl) > 1 and "href" in rl[1]:
            print('! ' + stripurl(rl[1]))
            args = getUrlArgs(rl[1])
            scrapeUrl = 'http://www.nco.ncep.noaa.gov/pmb/docs/grib2/grib2_table4-2-' + \
                args[0] + "-" + args[1] + '.shtml'
            scrape.run(scrapeUrl)