Completed
Push — master ( fc9889...278673 )
by Dafne van
18s queued 16s
created

e2edutch.download.main()   A

Complexity

Conditions 1

Size

Total Lines 3
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 2
dl 0
loc 3
rs 10
c 0
b 0
f 0
cc 1
nop 0
1
import os
0 ignored issues
show
introduced by
Missing module docstring
Loading history...
2
import requests
3
import gzip
0 ignored issues
show
introduced by
standard import "import gzip" should be placed before "import requests"
Loading history...
4
import zipfile
0 ignored issues
show
introduced by
standard import "import zipfile" should be placed before "import requests"
Loading history...
5
from tqdm import tqdm
6
from e2edutch import util
7
8
9
def download_file(url, path):
    """
    Download a URL into a file as specified by `path`.

    The response body is streamed to disk in fixed-size chunks while a tqdm
    progress bar reports how many bytes have been written so far.

    Args:
        url: Address of the resource to fetch.
        path: Destination file name; it is opened in binary write mode, so
            an existing file at that location is overwritten.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
            In that case `path` is not created or modified.
    """
    # This function is copied from stanza
    # https://github.com/stanfordnlp/stanza/blob/f0338f891a03e242c7e11e440dec6e191d54ab77/stanza/resources/common.py#L103
    chunk_size = 131072
    # Using the response as a context manager releases the connection even
    # when writing to disk fails half-way through.
    with requests.get(url, stream=True) as response:
        # Check the status before opening the output file, so an error page
        # is never stored under `path` and mistaken for the real payload.
        response.raise_for_status()

        # Content-Length is optional (e.g. chunked transfer encoding).
        # tqdm accepts total=None and then simply shows no expected total.
        content_length = response.headers.get('content-length')
        file_size = None if content_length is None else int(content_length)

        desc = 'Downloading ' + url
        with open(path, 'wb') as out_file:
            with tqdm(total=file_size, unit='B', unit_scale=True,
                      desc=desc) as pbar:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    # Skip empty chunks so nothing is written for them.
                    if chunk:
                        out_file.write(chunk)
                        pbar.update(len(chunk))
27
28
29
def download_data(config=None):
    """
    Download the data files used by e2e-Dutch into the data directory.

    Three resources are fetched, each only when its target is not already
    present on disk:

    * the word vectors `cc.nl.300.vec.gz`, stored decompressed and without
      their first line as `fasttext.300.vec`;
    * the model archive `model.zip`, which is extracted and whose
      `logs/final` directory is moved to `final` (the archive itself is
      left in place);
    * the character vocabulary `char_vocab.dutch.txt`.

    Args:
        config: Configuration passed on to `util.get_data_dir` to locate
            the data directory. `None` (the default) is treated as an
            empty dict.
    """
    # A fresh dict per call: a `{}` default would be shared between calls.
    if config is None:
        config = {}
    data_dir = util.get_data_dir(config)

    # Download word vectors
    url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nl.300.vec.gz"
    fname = os.path.join(data_dir, 'fasttext.300.vec')
    fname_gz = fname + '.gz'
    if not os.path.exists(fname):
        download_file(url, fname_gz)
        with gzip.open(fname_gz, 'rb') as fin, open(fname, 'wb') as fout:
            # We need to remove the first line
            # (presumably the header of the .vec file -- TODO confirm).
            next(fin, None)
            # Copy line by line instead of readlines(), so the whole
            # decompressed file never has to fit in memory.
            for line in fin:
                fout.write(line)
        os.remove(fname_gz)

    # Download e2e dutch model
    url = "https://surfdrive.surf.nl/files/index.php/s/UnZMyDrBEFunmQZ/download"
    fname_zip = os.path.join(data_dir, 'model.zip')
    log_dir_name = os.path.join(data_dir, 'final')
    if not os.path.exists(log_dir_name):
        # Reuse an archive left over from an earlier run instead of
        # downloading it again.
        if not os.path.exists(fname_zip):
            download_file(url, fname_zip)
        with zipfile.ZipFile(fname_zip, 'r') as zfile:
            zfile.extractall(data_dir)
        # Move `logs/final` up one level and drop the now-empty `logs`
        # directory; os.rmdir fails if anything else is still in there.
        os.rename(os.path.join(data_dir, 'logs', 'final'), log_dir_name)
        os.rmdir(os.path.join(data_dir, 'logs'))

    # Download char_dict
    url = "https://github.com/Filter-Bubble/e2e-Dutch/raw/v0.2.0/data/char_vocab.dutch.txt"
    fname = os.path.join(data_dir, 'char_vocab.dutch.txt')
    if not os.path.exists(fname):
        download_file(url, fname)
63
64
65
def main():
    """
    Command-line entry point: fetch all data files.

    Calls `download_data` without arguments, so its default configuration
    is used.
    """
    # To do: argparse for config file
    download_data()
68
69
70
if __name__ == "__main__":
    # Start downloading only when run as a script, not when imported.
    main()
72