Issues (227)

awips/gempak/scrape.py (2 issues)

1
#!/usr/bin/env python
2
# Parse html tables from a given URL and output CSV.
3
# Note: To install a missing python module foo do "easy_install foo"
4
#   (or the new way is "pip install foo" but you might have to do 
5
#    "easy_install pip" first)
6
7
from BeautifulSoup import BeautifulSoup
8
import urllib.request, urllib.error, urllib.parse
9
import html.entities
10
import re
11
import sys
12
import unicodedata
13
14
15
# from http://stackoverflow.com/questions/1197981/convert-html-entities
16 View Code Duplication
def asciify2(s):
    """Resolve HTML character references in *s*.

    Numeric references ("&#NNN;") are replaced by chr(NNN).  Named
    references ("&name;") known to html.entities are removed, and
    "&amp;" is handled last so that the "&" it produces cannot be
    re-read as the start of another entity.
    (Adapted from http://stackoverflow.com/questions/1197981/convert-html-entities)
    """
    # Numeric character references, e.g. "&#160;".
    for hit in set(re.findall(r"&#\d+;", s)):
        try:
            s = s.replace(hit, chr(int(hit[2:-1])))
        except ValueError:
            # Malformed number: leave the reference in place.
            pass

    # Named character references, e.g. "&copy;".
    hits = set(re.findall(r"&\w+;", s))
    # BUGFIX: the original set amp = "&", so "&amp;" was never removed
    # from hits and the final replace was a no-op; "&amp;" must be the
    # literal entity and must be unescaped last.
    amp = "&amp;"
    hits.discard(amp)
    for hit in hits:
        name = hit[1:-1]
        if name in html.entities.name2codepoint:
            # NOTE(review): recognized named entities are deleted rather
            # than converted -- looks deliberate (keeps output ASCII-ish)
            # but the SO original converted them; confirm intent.
            s = s.replace(hit, "")
    return s.replace(amp, "&")
39
40
41
def opensoup(url):
    """Download *url* and return its content parsed by BeautifulSoup."""
    req = urllib.request.Request(url)
    # Some servers reject Python's default User-Agent, so masquerade as
    # a browser.  A more complete browser string, if ever needed:
    #   Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14)
    #   Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14
    req.add_header("User-Agent", "Mozilla/5.0")
    page = urllib.request.urlopen(req)
    parsed = BeautifulSoup(page)
    page.close()
    return parsed
51
52
53
def asciify(s):
    """Drop accents/diacritics from *s* and encode it as ASCII.

    NFKD decomposition splits accented characters into base character
    plus combining mark; the 'ignore' encode then discards anything
    outside ASCII.  Note the result is a bytes object, not str.
    """
    decomposed = unicodedata.normalize('NFKD', s)
    return decomposed.encode('ascii', 'ignore')
55
56
57
# remove extra whitespace, including stripping leading and trailing whitespace.
58
def condense(s):
    """Collapse each run of whitespace in *s* to a single space and
    strip leading/trailing whitespace.

    BUGFIX: the original passed re.DOTALL as re.sub's 4th positional
    argument, which is `count` (== 16), so only the first 16 whitespace
    runs were collapsed.  No flag is needed: \\s already matches
    newlines.
    """
    return re.sub(r"\s+", " ", s).strip()
61
62
63
# this gets rid of tags and condenses whitespace
64
def striptags(s):
    """Strip HTML markup from *s* and condense the result.

    Removes <span style="display:none"...>...</span> blocks entirely
    (hidden content), turns &#160; (non-breaking space) into a plain
    space, replaces every remaining tag with a space, then collapses
    whitespace via condense().
    """
    hidden_span = r"\<span\s+style\s*\=\s*\"display\:none[^\"]*\"[^\>]*\>[^\<]*\<\/span\>"
    s = re.sub(hidden_span, "", s)
    s = re.sub(r"\&\#160\;", " ", s)
    untagged = re.sub(r"\<[^\>]*\>", " ", s)
    return condense(untagged)
68
69
70
# Command-line sanity check: a URL argument is required.
# BUGFIX: use sys.exit() rather than the site-module helper exit(),
# which is intended for interactive use and may be absent.
if len(sys.argv) == 1:  # called with no arguments
    print("Usage: ", sys.argv[0], " url [n]")
    print("  (where n indicates which html table to parse)")
    sys.exit(1)
74
75
76
def getUrlArgs(parseUrl):
    """Extract the two digit groups (discipline, category) from an NCEP
    GRIB2 table URL of the form ...grib2_table4-2-D-C.shtml.

    Returns a tuple of two strings.  Raises AttributeError when the URL
    does not match (re.search returns None), as the original did.
    BUGFIX: raw string for the regex and an escaped literal '.' before
    "shtml" (previously it matched any character).
    """
    return re.search(r'grib2_table4-2-(\d+)-(\d+)\.shtml', parseUrl).groups()
78
79
80
def run(url):
    """Print GEMPAK-style table lines for the data rows of every HTML
    table found at *url*.

    For each non-header row (ct > 0) the first cell is rewritten to
    "DDD CCC NNN 000" using the discipline/category digits parsed from
    the URL (see getUrlArgs).  Rows whose second cell contains
    "Reserved" are prefixed with '!'; a third cell referencing another
    table becomes "cat"; cells are padded to fixed widths.

    BUGFIX: in the original, the row-processing blocks ('if ct > 0',
    'if len(rl) > 1', 'ct += 1') sat OUTSIDE the row loop, so only the
    last row of each table was ever processed and rl was undefined for
    tables with no rows (the "rl does not seem to be defined for all
    execution paths" issue).  They belong inside the loop.
    """
    soup = opensoup(url)
    for table in soup.findAll("table"):
        ct = 0
        for r in table.findAll('tr'):
            rl = [striptags(c.renderContents())
                  for c in r.findAll(re.compile('td|th'))]
            if ct > 0:
                args = getUrlArgs(url)
                rl[0] = " ".join([args[0].zfill(3), args[1].zfill(3),
                                  rl[0].zfill(3), "000"])
            if len(rl) > 1:
                if "Reserved" in rl[1]:
                    rl[0] = '!' + rl[0]
                if "See Table" in rl[2] or "Code table" in rl[2]:
                    rl[2] = "cat"
                rl[1] = rl[1][:32].ljust(32)
                rl[2] = rl[2].ljust(20)
                rl[3] = rl[3].ljust(12) + "     0  -9999.00"
                if ct:
                    print(" ".join(rl))
            ct += 1
103
104
105
if __name__ == '__main__':
    # First command-line argument is the URL to scrape (presence was
    # already checked by the usage block above).
    run(sys.argv[1])
107