Passed
Push — main ( 3e91f8...c39039 )
by Alexander
01:43
created

MailboxCleanerIMAP.does_msg_exist()   B

Complexity

Conditions 7

Size

Total Lines 24
Code Lines 21

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 7
eloc 21
nop 2
dl 0
loc 24
rs 7.9759
c 0
b 0
f 0
1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3
4
"""
5
Module to download and to detach/strip/remove attachments
6
from e-mails on IMAP servers.
7
"""
8
9
from __future__ import print_function
10
11
import re
12
import email
13
import email.mime.text
14
import email.utils
15
import email.parser
16
import imaplib
17
import logging
18
import socket
19
import time
20
import typing
21
import collections
22
import os.path
23
import pickle
24
25
from src.mailbox_message import MailboxCleanerMessage
26
27
imaplib._MAXLINE = 10000000  # pylint: disable=protected-access
28
29
30
__author__ = "Alexander Willner"
31
__copyright__ = "Copyright 2020, Alexander Willner"
32
__credits__ = ["github.com/guido4000",
33
               "github.com/halteproblem", "github.com/jamesridgway"]
34
__license__ = "MIT"
35
__version__ = "1.0.4"
36
__maintainer__ = "Alexander Willner"
37
__email__ = "[email protected]"
38
__status__ = "Development"
39
40
41
class MailboxCleanerIMAP():
42
    """
43
    Download and detach/strip/remove attachments from e-mails
44
    on IMAP servers.
45
    """
46
47
    # Number of retries to get messages
48
    __RETRIES = 2
49
50
    # IMAP folders to ignore
51
    __IGNORE_PREFIX = ('Contacts', 'Calendar', '"Calendar',
52
                       'Trash', '"Deleted', 'Tasks',
53
                       '"[Gmail]"')
54
55
    def __init__(self, args, imap=None):
        """Store the arguments and prepare message helper, cache and state.

        ``args`` is read for ``target`` and ``server`` (used to build the
        cache file path) and is also passed to ``MailboxCleanerMessage``.
        ``imap`` is an optional, already created connection object; when it
        is ``None`` a connection is created later in ``login()``.
        """

        self.args = args
        self.message = MailboxCleanerMessage(args)

        # In-memory cache plus the file it is persisted to
        self.cache = collections.OrderedDict()
        cache_name = '_cache-' + args.server + '.pkl'
        self.cache_file = os.path.join(self.args.target, cache_name)

        # Connection handle and stop flag (checked in process_folders)
        self.imap: imaplib.IMAP4_SSL = imap
        self.stopped: bool = False
65
66
    def cleanup(self):
67
        """Cleanup after error."""
68
69
        self._save_cache()
70
71
    def login(self):
72
        """Log into the IMAP server."""
73
74
        try:
75
            if self.imap is None:
76
                self.imap = imaplib.IMAP4_SSL(self.args.server)
77
            self.imap.login(self.args.user, self.args.password)
78
            self.imap.socket().setsockopt(socket.IPPROTO_TCP,
79
                                          socket.TCP_NODELAY, 1)
80
            self._load_cache()
81
        except socket.gaierror as error:
82
            raise SystemExit('Login failed (wrong server?): %s' %
83
                             error) from error
84
        except imaplib.IMAP4.error as error:
85
            raise SystemExit('Login failed (wrong password?): %s' %
86
                             error) from error
87
88
    def logout(self):
89
        """Log out of the IMAP server."""
90
91
        try:
92
            self.imap.close()
93
            logging.warning('Connection\t: Closed')
94
        except (AttributeError, imaplib.IMAP4.error):
95
            pass
96
97
        try:
98
            self.imap.logout()
99
            logging.warning('Connection\t: Logged Out')
100
        except (AttributeError, imaplib.IMAP4.error):
101
            pass
102
103
        self.imap = None
104
105
    def does_msg_exist(self, msg) -> bool:
106
        """Check if message is already on the server."""
107
108
        msg_uid = self.message.get_uid(msg)
109
        for _attempt in range(2):  # don't select folder in every single check
110
            try:
111
                status, data = self.imap.uid(
112
                    'SEARCH', None,
113
                    '(UNDELETED HEADER Message-ID "%s")' % msg_uid)
114
                if data is not None and\
115
                        len(data[0]) > 0 and\
116
                        self.args.upload is not None:
117
                    logging.warning('    Duplicate\t: %s', status)
118
                    self.cache[msg_uid] = self.message.get_subject(msg_uid)
119
                    return True
120
            except imaplib.IMAP4.error as error:
121
                status, error = self.imap.select(
122
                    self.args.folder, readonly=self.args.read_only)
123
                if status != "OK":
124
                    raise imaplib.IMAP4.error(
125
                        'Could not select folder: %s' % error) from error
126
            break
127
128
        return False
129
130
    def process_directory(self):
131
        """Iterate over mails from a local directory for upload."""
132
        self.message.process_directory(self.upload, cache=self.cache)
133
134
    def process_folders(self):  # noqa: C901
135
        """Iterate over mails in configured folders."""
136
137
        folders = self.get_folders()
138
        self.stopped = False
139
140
        # Iterate over each folder
141
        for i, folder in enumerate(folders, start=1):
142
143
            # For threaded environments
144
            if self.stopped:
145
                break
146
147
            # Get all mails in this folder
148
            if hasattr(self.args, 'logger'):
149
                self.args.logger.log_progress_folders(i, len(folders), folder)
150
            logging.info('Progress\t: %s / %s (folders)', i, len(folders))
151
            logging.warning('Folder\t\t: %s (started)', folder)
152
            msg_uids = self.get_msgs_from_folder(folder)
153
154
            # Iterate over each email
155
            for j, msg_uid in enumerate(msg_uids, start=1):
156
157
                # For threaded environments
158
                if self.stopped:
159
                    break
160
161
                # Skip if already in cache
162
                logging.info('Progress\t: %s / %s (mail uid: %s)',
163
                             j, len(msg_uids), msg_uid.decode())
164
                if msg_uid in self.cache:
165
                    logging.info('  Subject\t: %s (cached)',
166
                                 self.cache[msg_uid])
167
                    if hasattr(self.args, 'logger'):
168
                        self.args.logger.log_progress_mails(
169
                            j, len(msg_uids), self.cache[msg_uid])
170
                    continue
171
172
                # Get the actual email
173
                try:
174
                    msg, msg_flags = self.get_msg(msg_uid)
175
                except imaplib.IMAP4.error as error:
176
                    try:
177
                        logging.info(
178
                            '  Error\t\t: Message %s (%s). Logging in again.',
179
                            msg_uid, error)
180
                        self.logout()
181
                        self.login()
182
                        self.imap.select(folder, readonly=self.args.read_only)
183
                        msg, msg_flags = self.get_msg(msg_uid)
184
                    except imaplib.IMAP4.error:
185
                        logging.info('  Error\t: Message %s skipped', msg_uid)
186
                        continue
187
188
                subject = self.message.get_subject(msg)
189
                logging.info('  Subject\t: %s', subject)
190
                if hasattr(self.args, 'logger'):
191
                    self.args.logger.log_progress_mails(
192
                        j, len(msg_uids), subject)
193
194
                # Download and detach attachments from email
195
                modified = self.message.download_and_detach_attachments(msg)
196
197
                # Upload new email
198
                if modified:
199
                    self.replace_msg(msg, msg_flags, msg_uid, folder)
200
201
                self.cache[msg_uid] = subject
202
                if j % 10 == 0:
203
                    self._save_cache()
204
205
            logging.warning('Folder\t\t: %s (completed)', folder)
206
207
    def replace_msg(self, msg, msg_flags, msg_uid, folder):
208
        """Upload new message and remove the old one."""
209
210
        # Only upload in non-readonly mode
211
        if self.args.read_only:
212
            logging.debug('    Replacing\t: skipped (read-only)')
213
            return
214
215
        # Upload new message
216
        status, data = self.upload(msg, folder, msg_flags)
217
218
        # Delete old message
219
        if status == 'OK' and self.args.read_only is False:
220
            result = self.imap.select(folder, readonly=self.args.read_only)
221
            assert result[0] == 'OK'
222
            result = self.imap.uid('STORE', msg_uid, '+FLAGS', '\\Deleted')
223
            logging.debug('    Deleting\t: %s', result)
224
            # GMail needs special treatment
225
            try:
226
                self.imap.uid('STORE', msg_uid, '+X-GM-LABELS', '\\Trash')
227
            except imaplib.IMAP4.error:
228
                pass
229
            # Sometimes expunge just fails with an EOF socket error
230
            try:
231
                self.imap.expunge()
232
                logging.debug('    Comment\t: Expunged')
233
            except imaplib.IMAP4.abort:
234
                pass
235
        else:
236
            logging.warning('    Result\t: %s (%s)', status, data)
237
238
    def upload(self, msg, folder, msg_flags='\\Seen'):
239
        """Upload message to server."""
240
241
        # Knowing what's going on
242
        try:
243
            msg_date = self.convert_date(msg.get('date'))
244
            msg_subject = self.message.get_subject(msg)
245
            msg_uid = self.message.get_uid(msg)
246
        except TypeError as error:
247
            status = "Error"
248
            data = error
249
            logging.warning('    Error\t: %s, %s', status, data)
250
            return status, data
251
252
        if self.args.read_only:
253
            logging.warning('    Uploading\t: skipped (read-only)')
254
            return ('Read Only', '')
255
256
        # Check cache
257
        msg_uid = self.message.get_uid(msg)
258
        if msg_uid in self.cache:
259
            logging.warning('    Cache\t: OK')
260
            return ('Cached', '')
261
262
        # Check for duplicates
263
        if self.does_msg_exist(msg) is True:
264
            self.cache[msg_uid] = msg_subject
265
            return ('Duplicate', '')
266
267
        logging.debug('    Uploading\t: %s / %s', msg_date, msg_flags)
268
269
        try:
270
            status, data = self.imap.append(
271
                folder, msg_flags, msg_date, msg.as_string().encode())
272
            if status == "OK":
273
                logging.warning('    Success\t: %s', status)
274
                self.cache[msg_uid] = msg_subject
275
            else:
276
                logging.warning('    Error\t: %s, %s (in %s)',
277
                                status, data, folder)
278
        except imaplib.IMAP4.abort as error:
279
            status = "Error"
280
            data = error
281
            self.logout()
282
            self.login()
283
284
        return status, data
285
286
    def get_msg(self, uid):
287
        """Fetch an email from the IMAP server."""
288
289
        # Sometimes IMAP servers might return empty bodies, so try again
290
        for _ in range(self.__RETRIES):
291
            try:
292
                result, data = self.imap.uid('fetch', uid,
293
                                             '(UID BODY.PEEK[] FLAGS)')
294
                if data is None or data[0] is None:
295
                    logging.warning('  Error\t: '
296
                                    'Could not get a message body. '
297
                                    'Retrying in a few seconds...')
298
                    time.sleep(2)
299
                    raise imaplib.IMAP4.error('Could not get a message body')
300
301
                body = data[0][1]
302
                logging.debug('  Result (Size)\t: %s (%d KB)',
303
                              result, len(body) / 1024)
304
305
                msg = self.get_msg_from_struct(data)
306
                msg_flags = self.get_flags_from_struct(data)
307
308
                logging.debug('  Flags\t\t: %s', msg_flags)
309
310
                return (msg, msg_flags)
311
            except imaplib.IMAP4.error:
312
                continue
313
            break
314
        else:
315
            raise imaplib.IMAP4.error('Could not get a message subject')
316
317
    def get_msgs_from_folder(self, folder):
318
        """Get all emails from a folder on the IMAP server."""
319
320
        # Safety net: enable read-only if requested
321
        self.imap.select(folder, readonly=self.args.read_only)
322
323
        # Extract email UIDs
324
        result_mails, data_mails = self.imap.uid('search', None, "ALL")
325
        msg_uids = data_mails[0].split()
326
        logging.warning('Mails (#)\t: %s (%s)',
327
                        result_mails, len(msg_uids))
328
329
        return msg_uids
330
331
    def get_folders(self) -> typing.List[str]:
332
        """Get the folders from the IMAP server to iterate through."""
333
334
        res, folder_list = self.imap.list()
335
        logging.warning('Folders (#)\t: %s (%s)', res, len(folder_list))
336
337
        folders = [re.split('"."|"/"', item.decode())[-1].strip()
338
                   for item in folder_list]
339
340
        if not self.args.all:
341
            if self.args.folder.lower() not in map(str.lower, folders):
342
                raise imaplib.IMAP4.error(
343
                    'IMAP folder %s does not exist. Existing folders: %s'
344
                    % (self.args.folder, folders))
345
            folders = [self.args.folder]
346
        else:
347
            folders[:] = [item for item in folders
348
                          if not item.startswith(self.__IGNORE_PREFIX)]
349
            folders[:] = [item for item in folders
350
                          if not item.startswith(self.__IGNORE_PREFIX)]
351
352
        return folders
353
354
    @staticmethod
355
    def convert_date(date):
356
        """Convert dates to copy old date to new message."""
357
358
        pz_time = email.utils.parsedate_tz(date)
359
        stamp = email.utils.mktime_tz(pz_time)
360
        date = imaplib.Time2Internaldate(stamp)
361
        return date
362
363
    @staticmethod
364
    def get_msg_from_struct(data) -> str:
365
        """Convert message to a string."""
366
367
        try:
368
            raw_email = (data[0][1]).decode('utf-8')
369
        except ValueError:
370
            try:
371
                raw_email = (data[0][1]).decode('iso-8859-1')
372
            except ValueError:
373
                raw_email = (data[0][1]).decode('utf-8', 'backslashreplace')
374
375
        return email.message_from_string(raw_email)
376
377
    @staticmethod
378
    def get_flags_from_struct(data):
379
        """Get flags to copy old flags to new message."""
380
381
        flags = imaplib.ParseFlags(data[1])
382
        flags = b" ".join(flags) if flags != () else b""
383
        flags = flags.decode("utf-8")
384
        flags = flags.replace("\\Recent", "")  # read-only attribute
385
        return flags.strip()
386
387
    def _load_cache(self):
388
        """Load cache of processed mail UIDs with their subjects."""
389
390
        # Create new cache if needed
391
        if not os.path.exists(self.cache_file) or\
392
           self.args.reset_cache:
393
            self._save_cache()
394
395
        with open(self.cache_file, 'rb') as filepointer:
396
            self.cache = pickle.load(filepointer)
397
398
    def _save_cache(self):
399
        """Save cache of processed mail UIDs with their subjects."""
400
401
        if not os.path.exists(os.path.dirname(self.cache_file)):
402
            os.mkdir(os.path.dirname(self.cache_file))
403
            print("Cache folder created")
404
        with open(self.cache_file, 'wb+') as filepointer:
405
            pickle.dump(self.cache, filepointer, pickle.HIGHEST_PROTOCOL)
406