PathUtils.sanitize_node()   F
last analyzed

Complexity

Conditions 24

Size

Total Lines 90
Code Lines 48

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 48
dl 0
loc 90
rs 0
c 0
b 0
f 0
cc 24
nop 7

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

Complexity

Complex code units like pocketutils.tools.path_tools.PathUtils.sanitize_node() often do a lot of different things. To break the enclosing class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# SPDX-FileCopyrightText: Copyright 2020-2023, Contributors to pocketutils
2
# SPDX-PackageHomePage: https://github.com/dmyersturnbull/pocketutils
3
# SPDX-License-Identifier: Apache-2.0
4
"""
5
6
"""
7
8
import logging
9
import os
10
import re
11
import sys
12
from collections.abc import Callable, Sequence
13
from copy import copy
14
from dataclasses import dataclass
15
from pathlib import Path, PurePath
16
from typing import Any, Self
17
18
from pocketutils import ValueIllegalError
19
20
__all__ = ["PathUtils", "PathTools"]
21
22
logger = logging.getLogger("pocketutils")
23
24
# Characters that sanitize_node() replaces with "_": reserved punctuation, both
# path separators, code points 128-160, and code points 0-31.
_bad_chars = {
    *'<>:"|?*\\/',
    *map(chr, range(128, 128 + 33)),
    *map(chr, range(32)),  # includes "\t" (code point 9)
}
38
39
# note that we can't call WindowsPath.is_reserved because WindowsPath can't be instantiated on non-Windows platforms
40
# also, these appear to be different from the ones defined there
41
42
# don't handle Long UNC paths
43
# also cannot be blank or whitespace
44
# the $ suffixed ones are for FAT
45
# no CLOCK$, even with an ext
46
# also no SCREEN$
47
# Node names (compared upper-cased) that sanitize_node() wraps in underscores.
_bad_strs = {
    "CON",
    "PRN",
    "AUX",
    "NUL",
    *(f"COM{port}" for port in range(1, 10)),
    *(f"LPT{port}" for port in range(1, 10)),
}
# Superset used when the caller asks for FAT compatibility.
_bad_strs_fat = _bad_strs | {"$IDLE$", "CONFIG$", "KEYBD$", "SCREEN$", "CLOCK$", "LST"}
72
73
74
@dataclass(slots=True, frozen=True)
75
class PathUtils:
76
    def is_path_like(self: Self, value: Any) -> bool:
77
        return isinstance(value, str | PurePath | os.PathLike)
78
79
    def up_dir(self: Self, n: int, *parts) -> Path:
80
        """
81
        Get an absolute path `n` parents from `os.getcwd()`.
82
        Does not sanitize.
83
84
        Ex: In dir '/home/john/dir_a/dir_b':
85
            updir(2, 'dir1', 'dir2')  # returns Path('/home/john/dir1/dir2')
86
        """
87
        base = Path(os.getcwd())
88
        for _ in range(n):
89
            base = base.parent
90
        for part in parts:
91
            base = base / part
92
        return base.resolve()
93
94
    def guess_trash(self: Self) -> Path:
95
        """
96
        Chooses a reasonable path for trash based on the OS.
97
        This is not reliable.
98
        For a more sophisticated solution, see https://github.com/hsoft/send2trash
99
        However, even that can fail.
100
        """
101
        plat = sys.platform.lower()
102
        if "darwin" in plat:
103
            return Path.home() / ".Trash"
104
        elif "win" in plat:
105
            return Path(Path.home().root) / "$Recycle.Bin"
106
        else:
107
            return Path.home() / ".trash"
108
109
    def sanitize_path(
110
        self: Self,
111
        path: PurePath | str,
112
        *,
113
        is_file: bool | None = None,
114
        fat: bool = False,
115
        trim: bool = False,
116
        warn: bool | Callable[[str], Any] = True,
117
    ) -> Path:
118
        r"""
119
        Sanitizes a path for major OSes and filesystems.
120
        Also see sanitize_path_nodes and sanitize_path_node.
121
        Mostly platform-independent.
122
123
        The idea is to sanitize for both Windows and Posix, regardless of the platform in use.
124
        The sanitization should be as uniform as possible for both platforms.
125
        This works for at least Windows+NTFS.
126
        Tilde substitution for long filenames in Windows is unsupported.
127
128
        A corner case is drive letters in Linux:
129
        "C:\\Users\\john" is converted to '/C:/users/john' if os.name=='posix'
130
        """
131
        w = {True: logger.warning, False: lambda _: None}.get(warn, warn)
132
        path = str(path)
133
        if path.startswith("\\\\?"):
134
            msg = f"Long UNC Windows paths (\\\\? prefix) are not supported (path '{path}')"
135
            raise ValueIllegalError(msg, value=str(path))
136
        bits = str(path).strip().replace("\\", "/").split("/")
137
        new_nodes = list(self.sanitize_nodes(bits, is_file=is_file, fat=fat, trim=trim))
138
        # unfortunately POSIX turns Path('C:\', '5') into C:\/5
139
        # this isn't an ideal way to fix it, but it works
140
        pat = re.compile(r"^([A-Z]:)\\?$")
141
        if os.name == "posix" and len(new_nodes) > 0 and pat.fullmatch(new_nodes[0]):
142
            new_nodes[0] = new_nodes[0].rstrip("\\")
143
            new_nodes.insert(0, "/")
144
        new_path = Path(*new_nodes)
145
        if new_path != path:
146
            w(f"Sanitized filename {path} → {new_path}")
147
        return Path(new_path)
148
149
    def sanitize_nodes(
150
        self: Self,
151
        bits: Sequence[PurePath | str],
152
        *,
153
        is_file: bool | None = None,
154
        fat: bool = False,
155
        trim: bool = False,
156
    ) -> Sequence[str]:
157
        fixed_bits = [
158
            bit + os.sep
159
            if i == 0 and bit.strip() in ["", ".", ".."]
160
            else self.sanitize_node(
161
                bit,
162
                is_file=(False if i < len(bits) - 1 else is_file),
163
                trim=trim,
164
                fat=fat,
165
                is_root_or_drive=(None if i == 0 else False),
166
            )
167
            for i, bit in enumerate(bits)
168
            if bit.strip() not in ["", "."]
169
            or i == 0  # ignore // (empty) just like Path does (but fail on sanitize_path_node(' '))
170
        ]
171
        return [bit for i, bit in enumerate(fixed_bits) if i == 0 or bit not in ["", "."]]
172
173
    def sanitize_node(
174
        self: Self,
175
        bit: PurePath | str,
176
        *,
177
        is_file: bool | None = None,
178
        is_root_or_drive: bool | None = None,
179
        fat: bool = False,
180
        trim: bool = False,
181
    ) -> str:
182
        r"""
183
        Sanitizes a path node such that it will be fine for major OSes and filesystems.
184
        For example:
185
            - 'plums;and/or;apples' becomes 'plums_and_or_apples' (escaped ; and /)
186
            - 'null.txt' becomes '_null_.txt' ('null' is forbidden in Windows)
187
            - 'abc  ' becomes 'abc' (no trailing spaces)
188
189
        The behavior is platform-independent -- os, sys, and pathlib are not used.
190
        For ex, calling sanitize_path_node(r'C:\') returns r'C:\' on both Windows and Linux
191
        If you want to sanitize a whole path, see sanitize_path instead.
192
193
        Args:
194
            bit: The node
195
            is_file: False for directories, True otherwise, None if unknown
196
            is_root_or_drive: True if known to be the root ('/') or a drive ('C:\'), None if unknown
197
            fat: Also make compatible with FAT filesystems
198
            trim: Truncate to 254 chars (otherwise fails)
199
        """
200
        # since is_file and is_root_or_drive are both Optional[bool], let's be explicit and use 'is' for clarity
201
        if is_file is True and is_root_or_drive is True:
202
            msg = "is_file and is_root_or_drive are both true"
203
            raise ValueIllegalError(msg)
204
        if is_file is True and is_root_or_drive is None:
205
            is_root_or_drive = False
206
        if is_root_or_drive is True and is_file is None:
207
            is_file = False
208
        source_bit = copy(str(bit))
209
        bit = str(bit).strip()
210
        # first, catch root or drive as long as is_root_or_drive is not false
211
        # if is_root_or_drive is True (which is a weird call), then fail if it's not
212
        # otherwise, it's not a root or drive letter, so keep going
213
        if is_root_or_drive is not False:
214
            # \ is allowed in Windows
215
            if bit in ["/", "\\"]:
216
                return bit
217
            m = re.compile(r"^([A-Z]:)(?:\\)?$").fullmatch(bit)
218
            # this is interesting
219
            # for bit=='C:' and is_root_or_drive=None,
220
            # it could be either a drive letter
221
            # or a file path that should be corrected to 'C_'
222
            # I guess here we're going with a drive letter
223
            if m is not None:
224
                # we need C:\ and not C: because:
225
                # Path('C:\\', '5').is_absolute() is True
226
                # but Path('C:', '5').is_absolute() is False
227
                # unfortunately, doing Path('C:\\', '5') on Linux gives 'C:\\/5'
228
                # I can't handle that here, but sanitize_path() will account for it
229
                return m.group(1) + "\\"
230
            if is_root_or_drive is True:
231
                msg = f"Node '{bit}' is not the root or a drive letter"
232
                raise ValueIllegalError(msg, value=bit)
233
        # just dots is invalid
234
        if set(bit.replace(" ", "")) == "." and bit not in ["..", "."]:
235
            bit = "_" + bit + "_"
236
            # raise IllegalPathError(f"Node '{source_bit}' is invalid")
237
        for q in _bad_chars:
238
            bit = bit.replace(q, "_")
239
        bad_strs = _bad_strs_fat if fat else _bad_strs
240
        if bit.upper() in bad_strs:
241
            # arbitrary decision
242
            bit = "_" + bit + "_"
243
        else:
244
            stub, ext = os.path.splitext(bit)
245
            if stub.upper() in bad_strs:
246
                bit = "_" + stub + "_" + ext
247
        if bit.strip() == "":
248
            bit = "_" + bit + "_"
249
            # raise IllegalPathError(f"Node '{source_bit}' is empty or contains only whitespace")
250
        # "." cannot end a node
251
        bit = bit.rstrip()
252
        if is_file is not True and (bit == "." or bit == ".."):
253
            return bit
254
        # never allow '.' or ' ' to end a filename
255
        bit = bit.rstrip(". ")
256
        # do this after
257
        if len(bit) > 254 and trim:
258
            bit = bit[:254]
259
        elif len(bit) > 254:
260
            msg = f"Node '{source_bit}' has more than 254 characters"
261
            raise ValueIllegalError(msg, value=bit)
262
        return bit
263
264
265
PathTools = PathUtils()
266