Conditions | 17 |
Total Lines | 126 |
Code Lines | 71 |
Lines | 0 |
Ratio | 0 % |
Changes | 0 |
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include:
If many parameters/temporary variables are present:
Complex classes like ocrd.resolver.Resolver.download_to_directory() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
Methods with many parameters are not only hard to understand, but their parameters also often become inconsistent when you need more, or different data.
There are several approaches to avoid long parameter lists:
1 | from tempfile import mkdtemp |
||
28 | def download_to_directory(self, directory, url, basename=None, if_exists='skip', subdir=None, retries=None, timeout=None): |
||
29 | """ |
||
30 | Download a URL ``url`` to a local file in ``directory``. |
||
31 | |||
32 | If ``url`` looks like a file path, check whether that exists. |
||
33 | If it does exist and is within ``directory` already, return early. |
||
34 | If it does exist but is outside of ``directory``. copy it. |
||
35 | If ``url` does not appear to be a file path, try downloading via HTTP, retrying ``retries`` times with timeout ``timeout`` between calls. |
||
36 | |||
37 | If ``basename`` is not given but ``subdir`` is, set ``basename`` to the last path segment of ``url``. |
||
38 | |||
39 | If the target file already exists within ``directory``, behavior depends on ``if_exists``: |
||
40 | - ``skip`` (default): do nothing and return early. Note that this |
||
41 | - ``overwrite``: overwrite the existing file |
||
42 | - ``raise``: raise a ``FileExistsError`` |
||
43 | |||
44 | Args: |
||
45 | directory (string): Directory to download files to |
||
46 | url (string): URL to download from |
||
47 | |||
48 | Keyword Args: |
||
49 | basename (string, None): basename part of the filename on disk. Defaults to last path segment of ``url`` if unset. |
||
50 | if_exists (string, "skip"): What to do if target file already exists. |
||
51 | One of ``skip`` (default), ``overwrite`` or ``raise`` |
||
52 | subdir (string, None): Subdirectory to create within the directory. Think ``mets:fileGrp[@USE]``. |
||
53 | retries (int, None): Number of retries to attempt on network failure. |
||
54 | timeout (tuple, None): Timeout in seconds for establishing a connection and reading next chunk of data. |
||
55 | |||
56 | Returns: |
||
57 | Local filename string, *relative* to directory |
||
58 | """ |
||
59 | log = getLogger('ocrd.resolver.download_to_directory') # pylint: disable=redefined-outer-name |
||
60 | log.debug("directory=|%s| url=|%s| basename=|%s| if_exists=|%s| subdir=|%s|", directory, url, basename, if_exists, subdir) |
||
61 | |||
62 | if not url: |
||
63 | raise ValueError(f"'url' must be a non-empty string, not '{url}'") # actually Path also ok |
||
64 | if not directory: |
||
65 | raise ValueError(f"'directory' must be a non-empty string, not '{url}'") # actually Path would also work |
||
66 | |||
67 | url = str(url) |
||
68 | directory = Path(directory) |
||
69 | directory.mkdir(parents=True, exist_ok=True) |
||
70 | |||
71 | subdir_path = Path(subdir if subdir else '') |
||
72 | basename_path = Path(basename if basename else nth_url_segment(url)) |
||
73 | ret = Path(subdir_path, basename_path) |
||
74 | dst_path = Path(directory, ret) |
||
75 | |||
76 | # log.info("\n\tdst_path='%s \n\turl=%s", dst_path, url) |
||
77 | # print(f'>>> url={url}') |
||
78 | # print(f'>>> directory={directory}') |
||
79 | # print(f'>>> subdir_path={subdir_path}') |
||
80 | # print(f'>>> basename_path={basename_path}') |
||
81 | # print(f'>>> dst_path={dst_path}') |
||
82 | # print(f'>>> ret={ret}') |
||
83 | |||
84 | src_path = None |
||
85 | if is_local_filename(url): |
||
86 | try: |
||
87 | src_path = Path(get_local_filename(url)).resolve() |
||
88 | except FileNotFoundError as e: |
||
89 | log.error("Failed to resolve URL locally: %s --> '%s' which does not exist" % (url, src_path)) |
||
90 | raise e |
||
91 | if not src_path.exists(): |
||
92 | raise FileNotFoundError(f"File path passed as 'url' to download_to_directory does not exist: '{url}") |
||
93 | if src_path == dst_path: |
||
94 | log.debug("Stop early, src_path and dst_path are the same: '%s' (url: '%s')" % (src_path, url)) |
||
95 | return str(ret) |
||
96 | |||
97 | # Respect 'if_exists' kwarg |
||
98 | if dst_path.exists(): |
||
99 | if if_exists == 'skip': |
||
100 | log.debug(f"File already exists but if_exists == {if_exists}, skipping.") |
||
101 | return str(ret) |
||
102 | elif if_exists == 'raise': |
||
103 | raise FileExistsError(f"File already exists and if_exists == '{if_exists}': {dst_path}") |
||
104 | else: |
||
105 | log.debug(f"File already exists but if_exists == {if_exists}, overwriting.") |
||
106 | |||
107 | # Create dst_path parent dir |
||
108 | dst_path.parent.mkdir(parents=True, exist_ok=True) |
||
109 | |||
110 | # Copy files or download remote assets |
||
111 | if src_path: |
||
112 | # src_path set, so it is a file source, we can copy directly |
||
113 | log.debug("Copying file '%s' to '%s'", src_path, dst_path) |
||
114 | dst_path.write_bytes(src_path.read_bytes()) |
||
115 | else: |
||
116 | # src_path not set, it's an http URL, try to download |
||
117 | log.debug("Downloading URL '%s' to '%s'", url, dst_path) |
||
118 | if not retries and config.is_set('OCRD_DOWNLOAD_RETRIES'): |
||
119 | retries = config.OCRD_DOWNLOAD_RETRIES |
||
120 | if timeout is None and config.is_set('OCRD_DOWNLOAD_TIMEOUT'): |
||
121 | timeout = config.OCRD_DOWNLOAD_TIMEOUT |
||
122 | session = requests.Session() |
||
123 | retries = Retry(total=retries or 0, |
||
124 | status_forcelist=[ |
||
125 | # probably too wide (only transient failures): |
||
126 | 408, # Request Timeout |
||
127 | 409, # Conflict |
||
128 | 412, # Precondition Failed |
||
129 | 417, # Expectation Failed |
||
130 | 423, # Locked |
||
131 | 424, # Fail |
||
132 | 425, # Too Early |
||
133 | 426, # Upgrade Required |
||
134 | 428, # Precondition Required |
||
135 | 429, # Too Many Requests |
||
136 | 440, # Login Timeout |
||
137 | 500, # Internal Server Error |
||
138 | 503, # Service Unavailable |
||
139 | 504, # Gateway Timeout |
||
140 | 509, # Bandwidth Limit Exceeded |
||
141 | 529, # Site Overloaded |
||
142 | 598, # Proxy Read Timeout |
||
143 | 599, # Proxy Connect Timeout |
||
144 | ]) |
||
145 | adapter = HTTPAdapter(max_retries=retries) |
||
146 | session.mount('http://', adapter) |
||
147 | session.mount('https://', adapter) |
||
148 | response = session.get(url, timeout=timeout) |
||
149 | response.raise_for_status() |
||
150 | contents = handle_oai_response(response) |
||
151 | dst_path.write_bytes(contents) |
||
152 | |||
153 | return str(ret) |
||
154 | |||
316 |