e2edutch.download.get_parser() - Code Metrics - Filter-Bubble/e2e-Dutch - Measure and Improve Code Quality continuously with Scrutinizer

e2edutch.download.get_parser() A
last analyzed 2021-08-05 09:00 UTC

↳ Parent: e2edutch.download

Complexity

Conditions

Size

Total Lines	5
Code Lines	5

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	5
dl	0
loc	5
rs	10
c	0
b	0
f	0
cc	1
nop	0

import requests

import gzip

import zipfile

import argparse

import logging

from tqdm import tqdm
from pathlib import Path

from e2edutch import util

# Module-scoped logger: records carry this module's name and still propagate
# to the root handlers installed by logging.basicConfig() in main().
logger = logging.getLogger(__name__)


def download_file(url, path):
    """
    Download a URL into a file as specified by `path`.

    The payload is streamed to a sibling ``<name>.part`` file and moved onto
    `path` only after the whole body has been received, so an interrupted
    download never leaves a truncated file at `path`.

    Args:
        url: address to fetch with an HTTP GET request.
        path: destination file (``str`` or ``pathlib.Path``); an existing
            file is overwritten.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    # This function is adapted from stanza
    # https://github.com/stanfordnlp/stanza/blob/f0338f891a03e242c7e11e440dec6e191d54ab77/stanza/resources/common.py#L103
    path = Path(path)
    tmp_path = path.with_name(path.name + '.part')
    default_chunk_size = 131072
    desc = 'Downloading ' + url

    # Using the response as a context manager releases the connection
    # even when the transfer fails half-way.
    with requests.get(url, stream=True) as r:
        # Do not store an HTML error page as if it were the requested file.
        r.raise_for_status()

        # The header may be absent (e.g. chunked transfer encoding); tqdm
        # accepts total=None and then shows progress without a total.
        content_length = r.headers.get('content-length')
        file_size = int(content_length) if content_length is not None else None

        with open(tmp_path, 'wb') as f, \
                tqdm(total=file_size, unit='B', unit_scale=True,
                     desc=desc) as pbar:
            for chunk in r.iter_content(chunk_size=default_chunk_size):
                if chunk:
                    f.write(chunk)
                    pbar.update(len(chunk))

    # Only a complete download becomes visible under the final name.
    tmp_path.replace(path)


def download_data(config=None):
    """
    Download the resources used by e2e-Dutch into ``config['datapath']``.

    Three resources are fetched, each one skipped when its target already
    exists in the data directory:

    * the word vectors, stored as ``fasttext.300.vec`` without the first
      line of the downloaded file;
    * the e2e model, unzipped and moved to ``<datapath>/final``;
    * the character vocabulary ``char_vocab.dutch.txt``.

    Args:
        config: mapping with a ``'datapath'`` entry naming the target
            directory. ``None`` (the default) is treated as an empty mapping.

    Raises:
        KeyError: if `config` has no ``'datapath'`` entry.
    """
    # Build a fresh dict per call instead of sharing one mutable default.
    if config is None:
        config = {}

    # Create the data directory if it doesn't exist yet
    data_dir = Path(config['datapath'])
    logger.info('Downloading to %s', data_dir)

    data_dir.mkdir(parents=True, exist_ok=True)

    # Download word vectors
    logger.info('Download word vectors')
    url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nl.300.vec.gz"
    fname = data_dir / 'fasttext.300.vec'
    fname_gz = data_dir / 'fasttext.300.vec.gz'
    if not fname.exists():
        download_file(url, fname_gz)
        with gzip.open(fname_gz, 'rb') as fin, open(fname, 'wb') as fout:
            # We need to remove the first line; the remainder is streamed
            # line by line so the decompressed file never sits in memory.
            fin.readline()
            fout.writelines(fin)
        # Remove gz file
        fname_gz.unlink()
    else:
        logger.info('Word vectors file already exists')

    # Download e2e dutch model
    logger.info('Download e2e model')
    url = "https://surfdrive.surf.nl/files/index.php/s/UnZMyDrBEFunmQZ/download"
    fname_zip = data_dir / 'model.zip'
    log_dir_name = data_dir / 'final'
    model_file = log_dir_name / 'model.max.ckpt.index'
    # A zip left over from an earlier run is reused rather than re-downloaded.
    if not fname_zip.exists() and not model_file.exists():
        download_file(url, fname_zip)
    if not model_file.exists():
        with zipfile.ZipFile(fname_zip, 'r') as zfile:
            zfile.extractall(data_dir)
        # The extracted tree is expected at logs/final; move it up one level
        # and drop the then-empty logs directory.
        (data_dir / 'logs' / 'final').rename(log_dir_name)
        (data_dir / 'logs').rmdir()
    else:
        logger.info('E2e model file already exists')

    # Download char_dict
    logger.info('Download char dict')
    url = "https://github.com/Filter-Bubble/e2e-Dutch/raw/v0.2.0/data/char_vocab.dutch.txt"
    fname = data_dir / 'char_vocab.dutch.txt'
    if not fname.exists():
        download_file(url, fname)
    else:
        logger.info('Char dict file already exists')


def get_parser():
    """
    Build the command-line parser for the download script.

    Returns:
        argparse.ArgumentParser accepting ``-d/--datapath`` (defaults to
        ``None``) and the boolean flag ``-v/--verbose``.
    """
    # (flags, keyword options) for every supported command-line argument
    option_specs = (
        (('-d', '--datapath'), {'default': None}),
        (('-v', '--verbose'), {'action': 'store_true'}),
    )

    cli = argparse.ArgumentParser()
    for flags, options in option_specs:
        cli.add_argument(*flags, **options)
    return cli


def main():
    """
    Command-line entry point: parse the arguments and download the data.

    With ``--verbose`` logging is configured at INFO level. The target
    directory comes from ``--datapath`` when given; otherwise the config
    returned by ``util.initialize_from_env(model_name='final')`` is used.
    """
    args = get_parser().parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.INFO)

    # To do: argparse for config file
    if args.datapath is not None:
        config = {'datapath': args.datapath}
    else:
        config = util.initialize_from_env(model_name='final')

    download_data(config)


# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


1			import requests
			0 ignored issues – show introduced 2021-01-27 15:28 UTC by Report Bug Copy Issue Report Missing module docstring Loading history...
2			import gzip
			0 ignored issues – show introduced 2021-01-07 13:31 UTC by Report Bug Copy Issue Report standard import "import gzip" should be placed before "import requests" Loading history...
3			import zipfile
			0 ignored issues – show introduced 2021-01-27 15:28 UTC by Report Bug Copy Issue Report standard import "import zipfile" should be placed before "import requests" Loading history...
4			import argparse
			0 ignored issues – show introduced 2021-01-27 15:28 UTC by Report Bug Copy Issue Report standard import "import argparse" should be placed before "import requests" Loading history...
5			import logging
			0 ignored issues – show introduced 2021-01-27 15:28 UTC by Report Bug Copy Issue Report standard import "import logging" should be placed before "import requests" Loading history...
6			from tqdm import tqdm
7			from pathlib import Path
			0 ignored issues – show introduced 2021-01-08 13:12 UTC by Report Bug Copy Issue Report standard import "from pathlib import Path" should be placed before "import requests" Loading history...
8			from e2edutch import util
9
10			logger = logging.getLogger()
11
12
13			def download_file(url, path):
14			"""
15			Download a URL into a file as specified by `path`.
16			"""
17			# This function is copied from stanza
18			# https://github.com/stanfordnlp/stanza/blob/f0338f891a03e242c7e11e440dec6e191d54ab77/stanza/resources/common.py#L103
19			r = requests.get(url, stream=True)
			0 ignored issues – show Coding Style Naming introduced 2021-01-07 13:31 UTC by Report Bug Copy Issue Report Variable name "r" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]{2,}|_[^\\WA-Z]*|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
20			with open(path, 'wb') as f:
			0 ignored issues – show Coding Style Naming introduced 2021-01-07 13:31 UTC by Report Bug Copy Issue Report Variable name "f" doesn't conform to snake_case naming style ('([^\\W\\dA-Z][^\\WA-Z]{2,}|_[^\\WA-Z]*|__[^\\WA-Z\\d_][^\\WA-Z]+__)$' pattern) This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
21			file_size = int(r.headers.get('content-length'))
22			default_chunk_size = 131072
23			desc = 'Downloading ' + url
24			with tqdm(total=file_size, unit='B', unit_scale=True,
25			desc=desc) as pbar:
26			for chunk in r.iter_content(chunk_size=default_chunk_size):
27			if chunk:
28			f.write(chunk)
29			f.flush()
30			pbar.update(len(chunk))
31
32
33			def download_data(config={}):
			0 ignored issues – show Bug Best Practice introduced 2021-01-25 09:19 UTC by Report Bug Copy Issue Report The default value `{}` might cause unintended side-effects. Objects as default values are only created once in Python and not on each invocation of the function. If the default object is modified, this modification is carried over to the next invocation of the method. # Bad: # If array_param is modified inside the function, the next invocation will # receive the modified object. def some_function(array_param=[]): # ... # Better: Create an array on each invocation def some_function(array_param=None): array_param = array_param or [] # ... Loading history... introduced 2021-01-25 09:19 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
34			# Create the data directory if it doesn't exist yet
35			data_dir = Path(config['datapath'])
36			logger.info('Downloading to {}'.format(data_dir))
			0 ignored issues – show introduced 2021-01-27 15:28 UTC by Report Bug Copy Issue Report Use lazy % formatting in logging functions Loading history...
37			data_dir.mkdir(parents=True, exist_ok=True)
38
39			# Download word vectors
40			logger.info('Download word vectors')
41			url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nl.300.vec.gz"
42			fname = data_dir / 'fasttext.300.vec'
43			fname_gz = data_dir / 'fasttext.300.vec.gz'
44			if not fname.exists():
45			download_file(url, fname_gz)
46			with gzip.open(fname_gz, 'rb') as fin:
47			with open(fname, 'wb') as fout:
48			# We need to remove the first line
49			for i, line in enumerate(fin.readlines()):
50			if i > 0:
51			fout.write(line)
52			# Remove gz file
53			fname_gz.unlink()
54			else:
55			logger.info('Word vectors file already exists')
56
57			# Download e2e dutch model_
58			logger.info('Download e2e model')
59			url = "https://surfdrive.surf.nl/files/index.php/s/UnZMyDrBEFunmQZ/download"
60			fname_zip = data_dir / 'model.zip'
61			log_dir_name = data_dir / 'final'
62			model_file = log_dir_name / 'model.max.ckpt.index'
63			if not fname_zip.exists() and not model_file.exists():
64			download_file(url, fname_zip)
65			if not model_file.exists():
66			with zipfile.ZipFile(fname_zip, 'r') as zfile:
67			zfile.extractall(data_dir)
68			Path(data_dir / 'logs' / 'final').rename(log_dir_name)
69			Path(data_dir, 'logs').rmdir()
70			else:
71			logger.info('E2e model file already exists')
72
73			# Download char_dict
74			logger.info('Download char dict')
75			url = "https://github.com/Filter-Bubble/e2e-Dutch/raw/v0.2.0/data/char_vocab.dutch.txt"
76			fname = data_dir / 'char_vocab.dutch.txt'
77			if not fname.exists():
78			download_file(url, fname)
79			else:
80			logger.info('Char dict file already exists')
81
82
83			def get_parser():
			0 ignored issues – show introduced 2021-01-27 15:28 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
84			parser = argparse.ArgumentParser()
85			parser.add_argument('-d', '--datapath', default=None)
86			parser.add_argument('-v', '--verbose', action='store_true')
87			return parser
88
89
90			def main():
			0 ignored issues – show introduced 2021-01-27 15:28 UTC by Report Bug Copy Issue Report Missing function or method docstring Loading history...
91			parser = get_parser()
92			args = parser.parse_args()
93			if args.verbose:
94			# logger.setLevel(logging.INFO)
95			logging.basicConfig(level=logging.INFO)
96			# To do: argparse for config file
97			if args.datapath is None:
98			config = util.initialize_from_env(model_name='final')
99			else:
100			config = {'datapath': args.datapath}
101			download_data(config)
102
103
104			if __name__ == "__main__":
105			main()
106

Filter-Bubble / e2e-Dutch

e2edutch.download.get_parser() A last analyzed 2021-08-05 09:00 UTC

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like

e2edutch.download.get_parser() A
last analyzed 2021-08-05 09:00 UTC