#!/usr/bin/env python

# Parse HTML tables from a given URL and output CSV.

# Note: to install a missing Python module foo, run "pip install foo"
# (easy_install is the older, now-deprecated alternative).

from bs4 import BeautifulSoup
import urllib.request, urllib.error, urllib.parse
import html.entities
import re
import sys
import unicodedata


# from http://stackoverflow.com/questions/1197981/convert-html-entities
def asciify2(s):
    # First pass: decode numeric character references such as "&#233;".
    matches = re.findall(r"&#\d+;", s)
    if len(matches) > 0:
        hits = set(matches)
        for hit in hits:
            name = hit[2:-1]
            try:
                entnum = int(name)
                s = s.replace(hit, chr(entnum))
            except ValueError:
                pass

    # Second pass: decode named entities such as "&eacute;", saving
    # "&amp;" for last so freshly decoded ampersands are not re-expanded.
    matches = re.findall(r"&\w+;", s)
    hits = set(matches)
    amp = "&amp;"
    if amp in hits:
        hits.remove(amp)
    for hit in hits:
        name = hit[1:-1]
        if name in html.entities.name2codepoint:
            s = s.replace(hit, chr(html.entities.name2codepoint[name]))
    s = s.replace(amp, "&")
    return s
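
# A quick sanity check (hypothetical input; relies only on the decoding above):
#
#   asciify2("caf&#233; &amp; cr&egrave;me")  ->  'café & crème'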


def opensoup(url):
    request = urllib.request.Request(url)
    # Some sites refuse urllib's default User-Agent, so pretend to be a browser.
    request.add_header("User-Agent", "Mozilla/5.0")
    # To mimic a real browser's user-agent string more exactly, if necessary:
    # Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14)
    # Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14
    pagefile = urllib.request.urlopen(request)
    soup = BeautifulSoup(pagefile, "html.parser")
    pagefile.close()
    return soup
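
# For example (hypothetical URL; any page containing <table> elements works):
#
#   soup = opensoup("https://example.com/tables.html")
#   tables = soup.find_all("table")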


def asciify(s):
    # Decompose accented characters, drop the non-ASCII bytes, and decode
    # back to str (without the decode this would return bytes on Python 3).
    return unicodedata.normalize('NFKD', s).encode('ascii', 'ignore').decode('ascii')
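
# e.g. asciify('déjà vu') -> 'deja vu'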


# Remove extra whitespace, including stripping leading and trailing whitespace.
def condense(s):
    # re.sub's fourth positional argument is "count", not "flags", so the
    # original re.DOTALL passed there silently capped the number of
    # substitutions; \s already matches newlines, so no flag is needed.
    s = re.sub(r"\s+", " ", s)
    return s.strip()
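
# e.g. condense("  a\n\t b  ") -> 'a b'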


# This gets rid of tags and condenses whitespace.
def striptags(s):
    # Drop invisible <span style="display:none..."> elements entirely, turn
    # non-breaking spaces (&#160;) into real spaces, then replace every
    # remaining tag with a space and condense the result.
    s = re.sub(r'<span\s+style\s*=\s*"display:none[^"]*"[^>]*>[^<]*</span>', "", s)
    s = re.sub(r"&#160;", " ", s)
    return condense(re.sub(r"<[^>]*>", " ", s))
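
# e.g. striptags('<td>1&#160;<b>Temp</b></td>') -> '1 Temp'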


if len(sys.argv) == 1:  # called with no arguments
    print("Usage:", sys.argv[0], "url [n]")
    print("  (where n indicates which html table to parse)")
    sys.exit(1)


def getUrlArgs(parseUrl):
    # Extract the two table numbers embedded in the page's file name.
    return re.search(r'grib2_table4-2-(\d+)-(\d+)\.shtml', parseUrl).groups()
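
# e.g. getUrlArgs(".../grib2_table4-2-0-1.shtml") -> ('0', '1')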


def run(url):
    soup = opensoup(url)
    tables = soup.find_all("table")
    for table in tables:
        ct = 0
        for r in table.find_all('tr'):
            rl = []
            for c in r.find_all(re.compile('td|th')):
                # decode_contents() returns str; bs3's renderContents()
                # would hand striptags bytes under Python 3.
                rl.append(striptags(c.decode_contents()))
            if ct > 0:  # skip the header row
                # Prefix each row with the zero-padded table numbers taken
                # from the URL, plus the zero-padded parameter number.
                rl[0] = getUrlArgs(url)[0].zfill(3) + " " + \
                    getUrlArgs(url)[1].zfill(3) + " " + rl[0].zfill(3) + " 000"
                if len(rl) > 1:
                    if "Reserved" in rl[1]:
                        rl[0] = '!' + rl[0]
                    if "See Table" in rl[2] or "Code table" in rl[2]:
                        rl[2] = "cat"
                    # Pad the name, units, and abbreviation columns to
                    # fixed widths.
                    rl[1] = rl[1][:32].ljust(32)
                    rl[2] = rl[2].ljust(20)
                    rl[3] = rl[3].ljust(12) + " 0 -9999.00"
            if ct:
                print(" ".join(rl))
            ct += 1
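
# Each printed line is a fixed-width record; for a hypothetical row
# ("0", "Temperature", "K", "TMP") from grib2_table4-2-0-0.shtml it
# would look roughly like:
#
#   000 000 000 000 Temperature ... K ... TMP          0 -9999.00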


if __name__ == '__main__':
    run(sys.argv[1])