1
|
|
|
import os |
|
|
|
|
2
|
|
|
import requests |
3
|
|
|
import gzip |
|
|
|
|
4
|
|
|
import zipfile |
|
|
|
|
5
|
|
|
from tqdm import tqdm |
6
|
|
|
from e2edutch import util |
7
|
|
|
|
8
|
|
|
|
9
|
|
|
def download_file(url, path):
    """
    Download a URL into a file as specified by `path`.

    The response body is streamed to disk in chunks while a progress bar is
    displayed. If the transfer fails part-way, the incomplete file is removed
    so that it cannot later be mistaken for a finished download.

    Args:
        url: address to fetch.
        path: destination file; it is overwritten if it already exists.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    # This function is adapted from stanza
    # https://github.com/stanfordnlp/stanza/blob/f0338f891a03e242c7e11e440dec6e191d54ab77/stanza/resources/common.py#L103
    default_chunk_size = 131072
    desc = 'Downloading ' + url
    # Using the response as a context manager releases the connection even
    # when writing to disk fails.
    with requests.get(url, stream=True) as r:
        # Fail loudly on 4xx/5xx instead of saving the error page as data.
        r.raise_for_status()
        # Content-Length may be missing (e.g. chunked transfer encoding);
        # tqdm accepts total=None and then shows progress without a total.
        content_length = r.headers.get('content-length')
        file_size = None if content_length is None else int(content_length)
        try:
            with open(path, 'wb') as f:
                with tqdm(total=file_size, unit='B', unit_scale=True,
                          desc=desc) as pbar:
                    for chunk in r.iter_content(chunk_size=default_chunk_size):
                        # Skip empty keep-alive chunks.
                        if chunk:
                            f.write(chunk)
                            pbar.update(len(chunk))
        except BaseException:
            # Do not leave a truncated file behind: callers decide whether to
            # download based on the file's existence.
            if os.path.exists(path):
                os.remove(path)
            raise
27
|
|
|
|
28
|
|
|
|
29
|
|
|
def download_data(config=None):
    """
    Download the data files used by e2e-Dutch into the data directory.

    Each resource is only fetched when it is not present yet:

    * the Dutch fastText word vectors, stored without their first line as
      `fasttext.300.vec`;
    * the e2e-Dutch model archive, unpacked so that the model ends up in
      the `final` directory;
    * the character vocabulary `char_vocab.dutch.txt`.

    Args:
        config: configuration handed to `util.get_data_dir` to locate the
            data directory. When omitted, an empty dict is used.
    """
    if config is None:
        # Create a fresh dict per call rather than sharing one mutable
        # default object between all calls.
        config = {}
    data_dir = util.get_data_dir(config)

    # Download word vectors
    url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.nl.300.vec.gz"
    fname = os.path.join(data_dir, 'fasttext.300.vec')
    fname_gz = fname+'.gz'
    if not os.path.exists(fname):
        download_file(url, fname_gz)
        # Decompress into a temporary name and rename afterwards, so an
        # interrupted run never leaves a truncated file under `fname` that
        # the existence check above would accept as complete.
        fname_tmp = fname + '.tmp'
        with gzip.open(fname_gz, 'rb') as fin:
            with open(fname_tmp, 'wb') as fout:
                # We need to remove the first line
                fin.readline()
                # Stream line by line; the decompressed file is far too
                # large to hold in memory at once.
                for line in fin:
                    fout.write(line)
        os.replace(fname_tmp, fname)
        os.remove(fname_gz)

    # Download e2e dutch model
    url = "https://surfdrive.surf.nl/files/index.php/s/UnZMyDrBEFunmQZ/download"
    fname_zip = os.path.join(data_dir, 'model.zip')
    log_dir_name = os.path.join(data_dir, 'final')
    if not os.path.exists(fname_zip) and not os.path.exists(log_dir_name):
        download_file(url, fname_zip)
    if not os.path.exists(log_dir_name):
        with zipfile.ZipFile(fname_zip, 'r') as zfile:
            zfile.extractall(data_dir)
        # The archive unpacks to `logs/final`; move the model directory up
        # one level and drop the then-empty `logs` directory.
        os.rename(os.path.join(data_dir, 'logs', 'final'), log_dir_name)
        os.rmdir(os.path.join(data_dir, 'logs'))

    # Download char_dict
    url = "https://github.com/Filter-Bubble/e2e-Dutch/raw/v0.2.0/data/char_vocab.dutch.txt"
    fname = os.path.join(data_dir, 'char_vocab.dutch.txt')
    if not os.path.exists(fname):
        download_file(url, fname)
63
|
|
|
|
64
|
|
|
|
65
|
|
|
def main():
    """Command-line entry point: download all data with default settings."""
    # TODO: accept a config file via argparse and pass it on.
    download_data()
68
|
|
|
|
69
|
|
|
|
70
|
|
|
# Only start downloading when run as a script, not when imported.
if __name__ == "__main__":
    main()
72
|
|
|
|