|
1
|
|
|
import os |
|
2
|
|
|
import platform |
|
3
|
|
|
import shutil |
|
4
|
|
|
from datetime import datetime |
|
5
|
|
|
from functools import reduce |
|
6
|
|
|
from hashlib import md5 |
|
7
|
|
|
from math import inf |
|
8
|
|
|
from multiprocessing import cpu_count |
|
9
|
|
|
from multiprocessing.pool import Pool |
|
10
|
|
|
from operator import itemgetter |
|
11
|
|
|
from pathlib import Path |
|
12
|
|
|
|
|
13
|
|
|
from looptools import Timer |
|
14
|
|
|
|
|
15
|
|
|
from dirutility.walk.filter import PathFilters |
|
16
|
|
|
from dirutility.walk.multiprocess import Sprinter |
|
17
|
|
|
from dirutility.walk.sequential import Crawler |
|
18
|
|
|
|
|
19
|
|
|
|
|
20
|
|
View Code Duplication |
class Printer:
    """Optional console printer driven by two independent on/off flags."""

    def __init__(self, console_output, console_stream):
        """
        Store the flags that decide whether messages are printed.

        :param console_output: Bool, enables regular (non-stream) messages
        :param console_stream: Bool, enables stream messages
        """
        self.console_output = console_output
        self.console_stream = console_stream

    def printer(self, message, stream=False):
        """
        Print a tab-prefixed message when the matching flag is enabled.

        :param message: String to print
        :param stream: Bool, when true the console_stream flag is consulted instead of console_output
        """
        # Exactly one flag governs each call; pick it up front.
        enabled = self.console_stream if stream else self.console_output
        if enabled:
            print('\t' + message)
|
34
|
|
|
|
|
35
|
|
|
|
|
36
|
|
|
def pool_process(func, iterable, process_name='Pool processing', cpus=cpu_count()):
    """
    Apply a function to each element in an iterable and return a result list.

    The work is distributed over a multiprocessing Pool and the whole operation is wrapped in a
    Timer context labelled with the process name and the function.

    :param func: A function that returns a value
    :param iterable: A list or set of elements to be passed to the func as the singular parameter
    :param process_name: Name of the process, for printing purposes only
    :param cpus: Number of CPUs
    :return: Result list
    """
    with Timer('\t{0} ({1}) completed in'.format(process_name, str(func))):
        pool = Pool(cpus)
        try:
            vals = pool.map(func, iterable)
        finally:
            # Always release the workers, even when func raises inside map();
            # close() stops new submissions and join() waits for the workers to exit.
            pool.close()
            pool.join()
    return vals
|
51
|
|
|
|
|
52
|
|
|
|
|
53
|
|
|
def md5_hash(file_path, chunk_size=1 << 20):
    """
    Open a file path and hash the contents.

    The file is read in fixed-size chunks so that large files are never loaded into memory whole.

    :param file_path: Path of the file to hash
    :param chunk_size: Number of bytes read per iteration (1 MiB by default)
    :return: Hexadecimal MD5 digest string
    """
    digest = md5()
    with open(file_path, 'rb') as fp:
        # iter() with a sentinel keeps reading until read() returns the empty bytes object (EOF).
        for chunk in iter(lambda: fp.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()
|
57
|
|
|
|
|
58
|
|
|
|
|
59
|
|
|
def md5_tuple(file_path):
    """Pair a file path with the MD5 hex digest of its contents as (file_path, hash)."""
    digest = md5_hash(file_path)
    return file_path, digest
|
62
|
|
|
|
|
63
|
|
|
|
|
64
|
|
|
def pool_hash(path_list):
    """Hash every path in path_list through pool_process, yielding (file_path, hash) results."""
    return pool_process(func=md5_tuple, iterable=path_list, process_name='MD5 hashing')
|
67
|
|
|
|
|
68
|
|
|
|
|
69
|
|
|
def remover(file_path):
    """
    Delete a file or directory path only if it exists.

    Symbolic links are removed themselves; whatever they point at is left untouched. This also
    covers dangling links and links to directories, which shutil.rmtree refuses to handle.

    :param file_path: Path of the file, directory or symbolic link to delete
    :return: True when something was removed, False when the path does not exist
    """
    if os.path.islink(file_path):
        try:
            os.unlink(file_path)
        except (IsADirectoryError, PermissionError):
            # Some platforms require rmdir() to remove a link that points at a directory.
            os.rmdir(file_path)
        return True
    if os.path.isfile(file_path):
        os.remove(file_path)
        return True
    if os.path.isdir(file_path):
        shutil.rmtree(file_path)
        return True
    return False
|
79
|
|
|
|
|
80
|
|
|
|
|
81
|
|
View Code Duplication |
def creation_date(path_to_file, return_datetime=True):
    """
    Look up when a file was created.

    On Windows the value of os.path.getctime is used. Elsewhere the stat result's st_birthtime is
    used when the attribute exists; otherwise the last-modified time (st_mtime) stands in for it.

    See http://stackoverflow.com/a/39501288/1709587 for explanation.

    :param path_to_file: File path
    :param return_datetime: Bool, when true the timestamp is converted to a datetime object
    :return: Creation date, as a datetime or as the raw timestamp
    """
    if platform.system() == 'Windows':
        timestamp = os.path.getctime(path_to_file)
    else:
        info = os.stat(path_to_file)
        # Platforms without a birth time simply lack the attribute (probably Linux),
        # so settle for when the content was last modified.
        timestamp = info.st_birthtime if hasattr(info, 'st_birthtime') else info.st_mtime

    return datetime.fromtimestamp(timestamp) if return_datetime else timestamp
|
109
|
|
|
|
|
110
|
|
|
|
|
111
|
|
|
def creation_date_tuple(file_path):
    """Pair a file path with its creation date as (file_path, creation_date)."""
    created = creation_date(file_path)
    return file_path, created
|
114
|
|
|
|
|
115
|
|
|
|
|
116
|
|
|
def pool_creation_date(path_list):
    """Resolve creation dates for every path in path_list through pool_process."""
    return pool_process(func=creation_date_tuple, iterable=path_list, process_name='File creation dates')
|
119
|
|
|
|
|
120
|
|
|
|
|
121
|
|
View Code Duplication |
class DirPaths:
    """Generate lists of file and/or folder paths found within one or more root directories."""

    def __init__(self,
                 directory,
                 full_paths=False,
                 topdown=True,
                 to_include=None,
                 to_exclude=None,
                 min_level=0,
                 max_level=inf,
                 filters=None,
                 non_empty_folders=False,
                 parallelize=False,
                 pool_size=cpu_count(),
                 console_output=False,
                 console_stream=False,
                 hash_files=False):
        """
        This class generates a list of either files and or folders within a root directory.

        The walk method generates a directory list of files by walking the file tree top down or bottom up. The
        files and folders method generate a list of files or folders in the top level of the tree.

        :param directory: Starting directory file path, or an iterable of starting directory paths
        :param full_paths: Bool, when true full paths are concatenated to file paths list
        :param topdown: Bool, when true walk method walks tree from the top down. When false tree is walked bottom up
        :param to_include: None by default. List of filters acceptable to find within file path string return
        :param to_exclude: None by default. List of filters NOT acceptable to return
        :param min_level: 0 by default. Minimum directory level to save paths from
        :param max_level: Infinity by default. Maximum directory level to save paths from
        :param filters: None by default. Forwarded unchanged to PathFilters
        :param non_empty_folders: Bool, forwarded unchanged to PathFilters
        :param parallelize: Bool, when true pool processing is enabled within walk method
        :param pool_size: Number of CPUs for pool processing, default is number of processors
        :param console_output: Bool, when true console output is printed
        :param console_stream: Bool, when true loops print live results
        :param hash_files: Bool, when true walk() method return a dictionary file_paths and hashes
        """
        self.timer = Timer()
        self.full_paths = full_paths
        self.topdown = topdown

        # Exclude .DS_Store by default, set to_exclude to False to include .DS_Store
        to_exclude = ['.DS_Store'] if to_exclude is None else to_exclude
        if any((to_include, to_exclude, filters)) or min_level != 0 or max_level != inf:
            self.filters = PathFilters(to_include, to_exclude, min_level, max_level, filters, non_empty_folders)
        else:
            self.filters = False

        self.console_output = console_output
        self.console_stream = console_stream
        self._hash_files = hash_files

        self._printer = Printer(console_output, console_stream).printer
        self._printer('DIRPATHS')

        # pool_size is only meaningful (and only stored) when parallelization is enabled
        if parallelize:
            self.pool_size = pool_size
        self.parallelize = parallelize
        self.filepaths = []

        # Always store the roots as a list of strings, whether one or many were given
        self.directory = self._normalize_directories(directory)

    def __iter__(self):
        # Iterate over a copy so callers cannot disturb the stored list while looping
        return iter(list(self.filepaths))

    def __str__(self):
        return str(self.filepaths)

    def __len__(self):
        return len(self.filepaths)

    @staticmethod
    def _normalize_directories(directory):
        """
        Convert the directory argument into a list of path strings.

        str() succeeds on any object, so a list of roots has to be recognised by type rather than
        by waiting for an exception; otherwise the whole list is stringified into one bogus path.

        :param directory: A single path (str, bytes or os.PathLike) or an iterable of paths
        :return: List of path strings
        """
        if isinstance(directory, (str, bytes, os.PathLike)):
            return [str(directory)]
        try:
            return [str(entry) for entry in directory]
        except TypeError:
            # Not iterable: treat it as a single value
            return [str(directory)]

    def _get_filepaths(self):
        """Report how many paths were collected and return them, hashed when hash_files was requested."""
        self._printer(str(len(self)) + " file paths have been parsed in " + str(self.timer.end))
        if self._hash_files:
            return pool_hash(self.filepaths)
        return self.filepaths

    def _collect_top_level(self, predicate):
        """Append the non-hidden top-level entries of every root that satisfy predicate(full_path)."""
        for directory in self.directory:
            for name in os.listdir(directory):
                full_path = os.path.join(directory, name)
                if predicate(full_path) and not name.startswith('.'):
                    self.filepaths.append(full_path)
        return self._get_filepaths()

    def creation_dates(self, sort=True):
        """
        Return a list of (file_path, creation_date) tuples created from list of walked paths.

        :param sort: Bool, sorts file_paths on created_date from newest to oldest.
        :return: List of (file_path, created_date) tuples.
        """
        pcd = pool_creation_date(self.filepaths)
        if sort:
            pcd.sort(key=itemgetter(1), reverse=True)
        return pcd

    def walk(self):
        """
        Default file path retrieval function.
        sprinter() - Generates file path list using pool processing and Queues
        crawler() - Generates file path list using os.walk() in sequence
        """
        if self.parallelize:
            self.filepaths = Sprinter(self.directory, self.filters, self.full_paths, self.pool_size,
                                      self._printer).sprinter()
        else:
            self.filepaths = Crawler(self.directory, self.filters, self.full_paths, self.topdown,
                                     self._printer).crawler()
        return self._get_filepaths()

    def files(self):
        """Return list of files in root directory"""
        self._printer('\tFiles Walk')
        return self._collect_top_level(os.path.isfile)

    def folders(self):
        """Return list of folders in root directory"""
        return self._collect_top_level(os.path.isdir)
|
252
|
|
|
|
|
253
|
|
|
|
|
254
|
|
View Code Duplication |
class DirTree:
    """Nested-dictionary view of a directory: folders map to dicts, files map to None."""

    def __init__(self, root, branches=None):
        """
        Generate a tree dictionary of the contents of a root directory.

        :param root: Starting directory
        :param branches: Optional per-level filters. Entry i is indexed with 'folders' and, when
            truthy, is read as a mapping with optional 'exclude' / 'include' collections of folder
            names for depth i (the root folder itself is depth 0).
            NOTE(review): the 'exclude' / 'include' key names are inferred from the variable
            names in _filter; confirm against callers.
        """
        self.tree_dict = {}
        self.directory = Path(root)
        # Offset of the root folder's own name inside every path yielded by os.walk
        self.start = str(self.directory).rfind(os.sep) + 1
        self.branches = branches
        self.get()

    def __iter__(self):
        return iter(self.tree_dict.items())

    def __str__(self):
        return str(self.tree_dict)

    @property
    def dict(self):
        return self.tree_dict

    def _filter(self, folders, folder_or_file):
        """
        Decide whether a path passes the per-level branch filters.

        :param folders: Folder names of the path, starting with the root folder's name
        :param folder_or_file: Key used to select the filter set inside each branch entry
        :return: False when any level is excluded or missing from a non-empty include list
        """
        # zip() stops at the shorter sequence, so levels deeper than the configured
        # branches are simply left unfiltered instead of raising IndexError.
        for folder, branch in zip(folders, self.branches):
            filters = branch[folder_or_file]
            if not filters:
                continue
            exclude = filters.get('exclude')
            include = filters.get('include')

            if exclude and folder in exclude:
                return False
            if include and folder not in include:
                return False
        return True

    def get(self):
        """
        Generate path, dirs, files tuple for each path in directory. Executes filters if branches are not None

        :return: The tree dictionary
        """
        for path, dirs, files in os.walk(str(self.directory)):
            folders = path[self.start:].split(os.sep)
            if self.branches and not self._filter(folders, 'folders'):
                # Every descendant repeats the rejected level, so it would be rejected too:
                # prune the walk rather than visiting the subtree.
                dirs[:] = []
                continue
            # Walk down the already-built tree to the dict of the parent folder
            parent = reduce(dict.get, folders[:-1], self.tree_dict)
            parent[folders[-1]] = dict.fromkeys(files)
        return self.tree_dict
|
308
|
|
|
|
|
309
|
|
|
|
|
310
|
|
View Code Duplication |
def gui():
    """Collect options through WalkGUI, walk the chosen directory and optionally export the paths."""
    from dirutility.gui import WalkGUI
    window = WalkGUI()
    params = window.parsing()
    parse = params['parse']

    walker = DirPaths(
        parse['directory'],
        console_stream=parse['console_stream'],
        parallelize=parse['parallelize'],
        max_level=parse['max_level'],
        non_empty_folders=parse['non_empty_folders'],
    )
    paths = walker.walk()

    if params['save']:
        # Export helpers are imported lazily, only when saving was requested
        from databasetools import CSVExport, DictTools
        save = params['save']
        if save['csv']:
            CSVExport(list(paths),
                      cols=['files'],
                      file_path=save['directory'],
                      file_name=os.path.basename(parse['directory']))
        if save['json']:
            exporter = DictTools(save['directory'], os.path.basename(parse['directory']))
            exporter.save(list(paths))
    print('Done!')
|
333
|
|
|
|
|
334
|
|
|
|
|
335
|
|
|
# Launch the GUI workflow only when this module is executed as a script.
if __name__ == "__main__":
    gui()
|
337
|
|
|
|