Completed
Push — main ( 6f109e...210312 )
by Alexander
04:04
created

MailboxCleanerIMAP.process_directory()   A

Complexity

Conditions 1

Size

Total Lines 3
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 2
nop 1
dl 0
loc 3
rs 10
c 0
b 0
f 0
1
#!/usr/bin/env python3
2
# -*- coding: utf-8 -*-
3
4
"""
5
Module to download and to detach/strip/remove attachments
6
from e-mails on IMAP servers.
7
"""
8
9
from __future__ import print_function
10
11
import email
12
import email.mime.text
13
import email.utils
14
import email.parser
15
import imaplib
16
import logging
17
import socket
18
import time
19
import typing
20
import collections
21
import os.path
22
import pickle
23
24
from src.mailbox_message import MailboxCleanerMessage
25
26
# Raise imaplib's internal per-line size limit to 10,000,000 bytes.
# NOTE(review): presumably needed so that large server responses are not
# rejected by imaplib's default limit -- confirm against the servers in use.
imaplib._MAXLINE = 10000000  # pylint: disable=protected-access
27
28
29
# Module metadata (authorship, license, version and maintenance status).
__author__ = "Alexander Willner"
30
__copyright__ = "Copyright 2020, Alexander Willner"
31
__credits__ = ["github.com/guido4000",
32
               "github.com/halteproblem", "github.com/jamesridgway"]
33
__license__ = "MIT"
34
__version__ = "1.0.0"
35
__maintainer__ = "Alexander Willner"
36
__email__ = "[email protected]"
37
__status__ = "Development"
38
39
40
class MailboxCleanerIMAP():
    """
    Download and detach/strip/remove attachments from e-mails
    on IMAP servers.

    The class wraps an ``imaplib`` connection, iterates over folders and
    messages, delegates the attachment handling to ``MailboxCleanerMessage``
    and replaces modified messages on the server. A pickled dictionary
    (``<server>_cache.pkl``) remembers which messages were already handled.
    """

    # Number of retries to get messages
    __RETRIES = 2

    # IMAP folders to ignore
    __IGNORE_PREFIX = ('Contacts', 'Calendar', '"Calendar',
                       'Trash', '"Deleted', 'Tasks',
                       '"[Gmail]"')

    def __init__(self, args, imap=None):
        """Initialize class.

        Args:
            args: Parsed options; this class reads ``server``, ``user``,
                ``password``, ``folder``, ``all``, ``read_only``, ``upload``
                and ``reset_cache`` from it.
            imap: Optional, already created IMAP connection. When omitted,
                ``login()`` opens an ``imaplib.IMAP4_SSL`` connection.
        """

        self.args = args
        self.message = MailboxCleanerMessage(args)
        self.cache = collections.OrderedDict()
        self.cache_file = args.server + '_cache.pkl'
        self.imap: typing.Optional[imaplib.IMAP4_SSL] = imap

    def cleanup(self):
        """Cleanup after error (persists the cache to disk)."""

        self._save_cache()

    def login(self):
        """Log into the IMAP server and load the cache.

        Raises:
            SystemExit: If the server name cannot be resolved or the
                server rejects the login.
        """

        # Keep the try body limited to the network calls so that cache
        # file problems are not reported as login failures.
        try:
            if self.imap is None:
                self.imap = imaplib.IMAP4_SSL(self.args.server)
            self.imap.login(self.args.user, self.args.password)
        except socket.gaierror as error:
            raise SystemExit('Login failed (wrong server?): %s' %
                             error) from error
        except imaplib.IMAP4.error as error:
            raise SystemExit('Login failed (wrong password?): %s' %
                             error) from error

        self._load_cache()

    def logout(self):
        """Log out of the IMAP server.

        Both steps are best effort: a missing connection or a server
        side error while closing is deliberately ignored.
        """

        try:
            self.imap.close()
            logging.warning('Connection\t: Closed')
        except (AttributeError, imaplib.IMAP4.error):
            pass

        try:
            self.imap.logout()
            logging.warning('Connection\t: Logged Out')
        except (AttributeError, imaplib.IMAP4.error):
            pass

    def does_msg_exist(self, msg, folder=None) -> bool:
        """Check if message is already on the server.

        Args:
            msg: Message to look up by its Message-ID.
            folder: Folder to search; defaults to ``args.folder``.

        Returns:
            True if an undeleted message with the same Message-ID was
            found and an upload was requested (``args.upload`` is set).
            In that case the message is also added to the cache.
        """

        target = self.args.folder if folder is None else folder
        msg_uid = self.message.get_uid(msg)
        self.imap.select(target, readonly=self.args.read_only)
        status, data = self.imap.uid('SEARCH', None,
                                     '(HEADER Message-ID "%s") UNDELETED'
                                     % msg_uid)

        # An empty result is reported as [b''] or [None] by imaplib
        found = bool(data) and bool(data[0])
        if found and self.args.upload is not None:
            logging.warning('    Duplicate\t: %s', status)
            self.cache[msg_uid] = self.message.get_subject(msg)
            return True

        return False

    def process_directory(self):
        """Iterate over mails from a local directory for upload."""
        self.message.process_directory(self.upload)

    def process_folders(self):
        """Iterate over mails in configured folders.

        Every message that is not yet cached is fetched, its attachments
        are downloaded/detached and, if the message was modified, the
        stripped version replaces the original on the server.
        """

        folders = self.get_folders()

        # Iterate over each folder
        for i, folder in enumerate(folders, start=1):

            # Get all mails in this folder
            logging.info('Progress\t: %s / %s (folders)', i, len(folders))
            logging.warning('Folder\t\t: %s (started)', folder)
            msg_uids = self.get_msgs_from_folder(folder)

            # Iterate over each email
            for j, msg_uid in enumerate(msg_uids, start=1):

                # Skip if already in cache
                # NOTE(review): the cache is keyed by the bare IMAP UID
                # here. UIDs are only unique within one folder, so with
                # several folders a UID cached for one folder also skips
                # the same UID in another one -- confirm and consider
                # keying by (folder, uid).
                logging.info('Progress\t: %s / %s (mail uid: %s)',
                             j, len(msg_uids), msg_uid.decode())
                if msg_uid in self.cache:
                    logging.info('  Subject\t: %s (cached)',
                                 self.cache[msg_uid])
                    continue

                # Get the actual email
                try:
                    msg, msg_flags = self.get_msg(msg_uid)
                except imaplib.IMAP4.error:
                    logging.info('  Error\t: Message %s skipped', msg_uid)
                    continue
                subject = self.message.get_subject(msg)
                logging.info('  Subject\t: %s', subject)

                # Download and detach attachments from email
                modified = self.message.download_and_detach_attachments(msg)

                # Upload new email
                if modified:
                    self.replace_msg(msg, msg_flags, msg_uid, folder)

                self.cache[msg_uid] = subject

            logging.warning('Folder\t\t: %s (completed)', folder)

    def replace_msg(self, msg, msg_flags, msg_uid, folder):
        """Upload new message and remove the old one.

        Args:
            msg: The modified message to upload.
            msg_flags: IMAP flags to set on the uploaded message.
            msg_uid: IMAP UID of the original message in ``folder``.
            folder: Folder that holds the original message; the new
                message is uploaded to the same folder.

        Raises:
            imaplib.IMAP4.error: If ``folder`` cannot be selected before
                the original message is deleted.
        """

        # Only upload in non-readonly mode
        if self.args.read_only:
            logging.debug('    Replacing\t: skipped (read-only)')
            return

        # Upload new message
        status, data = self.upload(msg, msg_flags, folder)
        if status != 'OK':
            logging.warning('    Result\t: %s (%s)', status, data)
            return

        # Delete old message. The folder must be selected successfully,
        # otherwise the UID below would refer to another mailbox. This is
        # an explicit check because an assert vanishes under "python -O".
        result = self.imap.select(folder, readonly=self.args.read_only)
        if result[0] != 'OK':
            raise imaplib.IMAP4.error(
                'Could not select folder %s: %s' % (folder, result))
        result = self.imap.uid('STORE', msg_uid, '+FLAGS', '\\Deleted')
        logging.debug('    Deleting\t: %s', result)

        # GMail needs special treatment
        try:
            self.imap.uid('STORE', msg_uid, '+X-GM-LABELS', '\\Trash')
        except imaplib.IMAP4.error:
            pass

        # Sometimes expunge just fails with an EOF socket error
        try:
            self.imap.expunge()
            logging.debug('    Comment\t: Expunged')
        except imaplib.IMAP4.abort:
            pass

    def upload(self, msg, msg_flags='\\Seen', folder=None):
        """Upload message to server.

        Args:
            msg: Message to append.
            msg_flags: IMAP flags for the new message.
            folder: Destination folder; defaults to ``args.folder``.

        Returns:
            A ``(status, data)`` tuple. ``status`` is the server status of
            the APPEND command or one of ``'Read Only'``, ``'Cached'`` and
            ``'Duplicate'`` when nothing was uploaded.
        """

        target = self.args.folder if folder is None else folder

        # Knowing what's going on
        msg_date = self.convert_date(msg.get('date'))
        msg_subject = self.message.get_subject(msg)
        msg_uid = self.message.get_uid(msg)
        if self.args.read_only:
            logging.warning('    Uploading\t: skipped (read-only)')
            return ('Read Only', '')

        logging.debug('    Uploading\t: %s / %s', msg_date, msg_flags)

        # Check cache
        if msg_uid in self.cache:
            logging.warning('    Cache\t: OK')
            return ('Cached', '')

        # Check for duplicates
        if self.does_msg_exist(msg, target):
            self.cache[msg_uid] = msg_subject
            return ('Duplicate', '')

        status, data = self.imap.append(
            target, msg_flags, msg_date, msg.as_string().encode())
        if status == "OK":
            logging.warning('    Success\t: %s', status)
            self.cache[msg_uid] = msg_subject
        else:
            logging.warning('    Return\t\t: %s, %s', status, data)

        return status, data

    def get_msg(self, uid):
        """Fetch an email from the IMAP server.

        Args:
            uid: IMAP UID of the message in the selected folder.

        Returns:
            A ``(message, flags)`` tuple with the parsed message and its
            flags as a space separated string.

        Raises:
            imaplib.IMAP4.error: If no message body could be fetched
                within the configured number of attempts.
        """

        last_error = None

        # Sometimes IMAP servers might return empty bodies, so try again
        for attempt in range(1, self.__RETRIES + 1):
            try:
                result, data = self.imap.uid('fetch', uid,
                                             '(UID BODY.PEEK[] FLAGS)')
            except imaplib.IMAP4.error as error:
                last_error = error
                continue

            # A usable response starts with a (header, body) tuple
            if not data or not isinstance(data[0], tuple):
                logging.warning('  Error\t: '
                                'Could not get a message body. '
                                'Retrying in a few seconds...')
                # No point in waiting once the last attempt has failed
                if attempt < self.__RETRIES:
                    time.sleep(2)
                continue

            body = data[0][1]
            logging.debug('  Result (Size)\t: %s (%d KB)',
                          result, len(body) / 1024)

            msg = self.get_msg_from_struct(data)
            msg_flags = self.get_flags_from_struct(data)

            logging.debug('  Flags\t\t: %s', msg_flags)

            return (msg, msg_flags)

        raise imaplib.IMAP4.error(
            'Could not get a message body') from last_error

    def get_msgs_from_folder(self, folder):
        """Get all emails from a folder on the IMAP server.

        Args:
            folder: Name of the folder to select and search.

        Returns:
            List of message UIDs (bytes); empty if the folder has no mails.
        """

        # Safety net: enable read-only if requested
        self.imap.select(folder, readonly=self.args.read_only)

        # Extract email UIDs
        result_mails, data_mails = self.imap.uid('search', None, "ALL")
        if data_mails and data_mails[0]:
            msg_uids = data_mails[0].split()
        else:
            msg_uids = []
        logging.warning('Mails (#)\t: %s (%s)',
                        result_mails, len(msg_uids))

        return msg_uids

    def get_folders(self) -> typing.List[str]:
        """Get the folders from the IMAP server to iterate through.

        Returns:
            ``[args.folder]`` unless ``args.all`` is set; otherwise all
            folders of the server except those starting with an ignored
            prefix.

        Raises:
            imaplib.IMAP4.error: If ``args.folder`` does not exist on the
                server (compared case-insensitively).
        """

        res, folder_list = self.imap.list()
        logging.warning('Folders (#)\t: %s (%s)', res, len(folder_list))

        # imaplib reports "no folders" as [None]
        folders = [self._parse_folder_name(item)
                   for item in folder_list if item is not None]

        if not self.args.all:
            if self.args.folder.lower() not in map(str.lower, folders):
                raise imaplib.IMAP4.error(
                    'IMAP folder %s does not exist. Existing folders: %s'
                    % (self.args.folder, folders))
            return [self.args.folder]

        return [item for item in folders
                if not item.startswith(self.__IGNORE_PREFIX)]

    @staticmethod
    def _parse_folder_name(item) -> str:
        """Extract the folder name from one line of a LIST response.

        The line has the form ``(flags) "delimiter" name``. Any quoted
        hierarchy delimiter (or ``NIL``) is accepted; quotes around the
        name are kept as sent by the server.
        """

        import re  # local import: only needed for LIST parsing

        line = item.decode()
        match = re.match(r'\(.*?\) (?:"(?:\\.|[^"\\])*"|NIL) (.*)$', line)
        if match:
            return match.group(1).strip()

        # Unknown layout: fall back to splitting at a "/" delimiter
        return line.split('"/"')[-1].strip()

    @staticmethod
    def convert_date(date):
        """Convert dates to copy old date to new message.

        Args:
            date: Value of the Date header (RFC 2822 string) or None.

        Returns:
            The date as IMAP INTERNALDATE string. If the header is missing
            or cannot be parsed, the current time is used instead.
        """

        pz_time = email.utils.parsedate_tz(date) if date else None
        try:
            stamp = email.utils.mktime_tz(pz_time)
        except (TypeError, ValueError, OverflowError):
            # Missing or invalid date header
            stamp = time.time()
        return imaplib.Time2Internaldate(stamp)

    @staticmethod
    def get_msg_from_struct(data) -> 'email.message.Message':
        """Parse the body of a FETCH response into a message object.

        Args:
            data: FETCH response whose first item is a (header, body)
                tuple with the raw message as bytes.

        Returns:
            The message parsed by ``email.message_from_string``.
        """

        raw_bytes = data[0][1]
        try:
            raw_email = raw_bytes.decode('utf-8')
        except UnicodeDecodeError:
            # ISO-8859-1 maps every byte value, so this cannot fail
            raw_email = raw_bytes.decode('iso-8859-1')

        return email.message_from_string(raw_email)

    @staticmethod
    def get_flags_from_struct(data):
        """Get flags to copy old flags to new message.

        Servers may send the FLAGS item before or after the message body,
        so both the header of the (header, body) tuple and the remaining
        parts are inspected; the body itself is never searched.

        Args:
            data: FETCH response as returned by imaplib.

        Returns:
            Space separated flags without ``\\Recent``; an empty string
            if the response contains no flags.
        """

        flags = ()
        for part in data:
            header = part[0] if isinstance(part, tuple) else part
            if not isinstance(header, bytes):
                continue
            flags = imaplib.ParseFlags(header)
            if flags:
                break

        # \Recent is a read-only attribute and must not be set by clients
        kept = [flag for flag in flags if flag.lower() != b'\\recent']
        return b' '.join(kept).decode('utf-8')

    def _load_cache(self):
        """Load cache of processed mail UIDs with their subjects.

        A missing cache file is created. With ``args.reset_cache`` the
        cache is emptied and written back. An unreadable (e.g. truncated)
        cache file is replaced by an empty cache instead of aborting.
        """

        if self.args.reset_cache:
            self.cache = collections.OrderedDict()

        # Create new cache if needed
        if self.args.reset_cache or not os.path.exists(self.cache_file):
            self._save_cache()
            return

        # NOTE(security): pickle executes code embedded in the file. The
        # cache file must only ever be written by this tool; consider a
        # data-only format such as JSON.
        try:
            with open(self.cache_file, 'rb') as filepointer:
                self.cache = pickle.load(filepointer)
        except (pickle.UnpicklingError, EOFError) as error:
            logging.warning('Cache\t\t: %s unreadable (%s), starting fresh',
                            self.cache_file, error)
            self.cache = collections.OrderedDict()

    def _save_cache(self):
        """Save cache of processed mail UIDs with their subjects."""

        with open(self.cache_file, 'wb') as filepointer:
            pickle.dump(self.cache, filepointer, pickle.HIGHEST_PROTOCOL)
346