Issues (227)

awips/gempak/ncepGribTables.py (1 issue)

#!/usr/bin/env python
# Parse html tables from a given URL and output CSV.
# Note: to install a missing Python module foo, run "pip install foo".

from bs4 import BeautifulSoup
import scrape
import urllib.request, urllib.error, urllib.parse
import html.entities
import re
import sys
import unicodedata

# from http://stackoverflow.com/questions/1197981/convert-html-entities
# [checker: this code seems to be duplicated in your project]
def asciify2(s):
    # Replace numeric entities ("&#176;") with the corresponding character.
    matches = re.findall(r"&#\d+;", s)
    if len(matches) > 0:
        hits = set(matches)
        for hit in hits:
            name = hit[2:-1]
            try:
                entnum = int(name)
                s = s.replace(hit, chr(entnum))
            except ValueError:
                pass

    # Replace named entities ("&deg;"), handling "&amp;" last so that
    # entities it may have been escaping are not expanded twice.
    matches = re.findall(r"&\w+;", s)
    hits = set(matches)
    amp = "&amp;"
    if amp in hits:
        hits.remove(amp)
    for hit in hits:
        name = hit[1:-1]
        if name in html.entities.name2codepoint:
            s = s.replace(hit, chr(html.entities.name2codepoint[name]))
    s = s.replace(amp, "&")
    return s

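One way to clear the duplication warning above is to keep a single copy of the helper in a small shared module that both scripts import; the module name htmlutil below is hypothetical. On Python 3 the standard library's html.unescape() already performs this conversion, so the shared copy can simply delegate to it:

# htmlutil.py (hypothetical shared module; each script then does
# "from htmlutil import asciify2" instead of carrying its own copy)
import html

def asciify2(s):
    # html.unescape expands both numeric ("&#176;") and named ("&deg;")
    # entities, which is essentially what the hand-rolled loops above do.
    return html.unescape(s)
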
def opensoup(url):
    # Fetch the page, presenting a browser-like User-Agent, and parse it.
    request = urllib.request.Request(url)
    request.add_header("User-Agent", "Mozilla/5.0")
    # To mimic a real browser's user-agent string more exactly, if necessary:
    #   Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14)
    #   Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14
    pagefile = urllib.request.urlopen(request)
    soup = BeautifulSoup(pagefile, "html.parser")
    pagefile.close()
    return soup


# Transliterate s to plain ASCII, silently dropping anything unmappable.
# (decode back to str; under Python 3, encode() alone would return bytes.)
def asciify(s):
    return unicodedata.normalize('NFKD', s).encode('ascii', 'ignore').decode('ascii')

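For reference, asciify is a different kind of cleanup from asciify2: it transliterates accented characters rather than expanding entities. A quick check:

print(asciify('Café'))   # -> Cafe (NFKD splits off the accent, which is dropped)
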
# Remove extra whitespace, including stripping leading and trailing whitespace.
def condense(s):
    # flags must be passed by keyword; the fourth positional argument
    # of re.sub is count, not flags.
    s = re.sub(r"\s+", " ", s, flags=re.DOTALL)
    return s.strip()


# This gets rid of all tags (including hidden spans) and condenses whitespace.
def stripurl(s):
    s = re.sub(r"\<span\s+style\s*\=\s*\"display\:none[^\"]*\"[^\>]*\>[^\<]*\<\/span\>", "", s)
    s = re.sub(r"\&\#160\;", " ", s)
    return condense(re.sub(r"\<[^\>]*\>", " ", s))


# This drops hidden spans, replaces non-breaking spaces, and condenses
# whitespace, but leaves other tags (such as links) in place.
def striptags(s):
    s = re.sub(r"\<span\s+style\s*\=\s*\"display\:none[^\"]*\"[^\>]*\>[^\<]*\<\/span\>", "", s)
    s = re.sub(r"\&\#160\;", " ", s)
    return condense(s)


# Extract the two numeric arguments (category, parameter) from a GRIB2
# table 4.2 sub-table URL.
def getUrlArgs(parseUrl):
    return re.search(r'grib2_table4-2-(\d+)-(\d+)\.shtml', parseUrl).groups()

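A quick illustration of how these helpers combine on a table cell such as the ones the main loop below extracts; the sample string is made up but matches the pattern getUrlArgs() expects:

cell = '<a href="grib2_table4-2-0-1.shtml">Moisture</a>&#160;'
print(stripurl(cell))    # -> Moisture
print(getUrlArgs(cell))  # -> ('0', '1')
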
# Print usage and exit if no URL was given on the command line.
if len(sys.argv) == 1:
    print("Usage: ", sys.argv[0], " url [n]")
    print("  (where n indicates which html table to parse)")
    sys.exit(1)

url = sys.argv[1]
soup = opensoup(url)
tables = soup.findAll("table")

# Walk every row of every table; when the second cell links to a GRIB2
# parameter table, print its name and hand that table's URL to scrape.run().
for table in tables:
    for r in table.findAll('tr'):
        rl = []
        for c in r.findAll(re.compile('td|th')):
            # decode_contents() returns str; renderContents() returns bytes
            # under bs4, which the regex helpers cannot accept.
            rl.append(striptags(c.decode_contents()))
        if len(rl) > 1 and "href" in rl[1]:
            print('! ' + stripurl(rl[1]))
            scrapeUrl = 'http://www.nco.ncep.noaa.gov/pmb/docs/grib2/grib2_table4-2-' + \
                        getUrlArgs(rl[1])[0] + "-" + getUrlArgs(rl[1])[1] + '.shtml'
            scrape.run(scrapeUrl)
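
To run the script, pass the URL of the page holding the master table, e.g. NCEP's GRIB2 table 4.2 index (the exact URL is an assumption about the intended input):

python ncepGribTables.py http://www.nco.ncep.noaa.gov/pmb/docs/grib2/grib2_table4-2.shtml

Each matching row prints a '! <name>' line, and scrape.run() is invoked on the corresponding sub-table URL.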