SubmissionFile.attachment_md5()   F
last analyzed

Complexity

Conditions 14

Size

Total Lines 55

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 0
loc 55
rs 3.4604
cc 14

2 Methods

Rating   Name   Duplication   Size   Complexity  
A SubmissionFile.md5_add_file() 0 8 3
A SubmissionFile.md5_add_text() 0 10 2

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method: moving a coherent fragment of a long method into a new, well-named method.

Complexity

Complex classes like SubmissionFile.attachment_md5() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
from django.db import models
2
from django.utils import timezone
3
from django.core.urlresolvers import reverse
4
5
from django.conf import settings
6
7
import zipfile
8
import tarfile
9
import unicodedata
10
import os
11
import hashlib
12
13
import logging
14
# Module-level logger used by the code in this file; it is registered
# under the fixed name 'OpenSubmit' rather than the module's own name.
logger = logging.getLogger('OpenSubmit')
15
16
17
def upload_path(instance, filename):
    '''
        Build the relative storage path for an uploaded file.

        Spaces in the user-provided file name are replaced by underscores,
        the name is NFKD-normalized and lower-cased, and the result is
        placed below a directory named after the current date (ISO format).
        The "instance" argument is accepted but not used.
    '''
    sanitized_name = unicodedata.normalize(
        'NFKD', filename.replace(" ", "_")).lower()
    date_directory = str(timezone.now().date().isoformat())
    return os.path.join(date_directory, sanitized_name)
25
26
27
class ValidSubmissionFileManager(models.Manager):
    '''
        Model manager used by SubmissionFile.

        Its queryset contains only files that have not been replaced,
        that belong to at least one submission, and whose submissions
        are not in the withdrawn state.
    '''

    def get_queryset(self):
        # Imported inside the method instead of at module level
        # (presumably to avoid a circular import - confirm).
        from .submission import Submission

        every_file = super(ValidSubmissionFileManager, self).get_queryset()
        current_files = every_file.filter(replaced_by=None)
        not_withdrawn = current_files.exclude(
            submissions__state=Submission.WITHDRAWN)
        return not_withdrawn.exclude(submissions=None)
36
37
38
class SubmissionFile(models.Model):
    '''
        A file attachment for a student submission. File attachments may be replaced
        by the student, but we keep the original version for some NSA-style data gathering.
        The "fetched" field defines the time stamp when the file was fetched for
        checking by some executor. On result retrieval, this timestamp is emptied
        again, which allows to find 'stuck' executor jobs on the server side.
        The "md5" field keeps a checksum of the file upload, for duplicate detection.
    '''

    attachment = models.FileField(
        upload_to=upload_path, verbose_name="File upload")
    original_filename = models.CharField(max_length=255, default='student.upload')
    fetched = models.DateTimeField(editable=False, null=True)
    replaced_by = models.ForeignKey(
        'SubmissionFile', null=True, blank=True, editable=False)
    md5 = models.CharField(max_length=36, null=True,
                           blank=True, editable=False)

    class Meta:
        app_label = 'opensubmit'

    def __str__(self):
        return self.attachment.name

    def attachment_md5(self):
        '''
            Calculate the checksum of the file upload and return it as
            hexadecimal MD5 string.

            For non-archive files (e.g. PDFs), the MD5 of the file content is
            computed chunk-wise.

            ZIP and TAR archives are opened and one MD5 is generated per
            contained file that is smaller than MAX_MD5_FILE_SIZE bytes:
            - Blanks, newlines and tabs are removed before hashing.
            - For MD5, ordering is important, so the final checksum is computed
              on the sorted list of the collected hashes.

            If reading the archive fails, the hashes collected so far are
            dropped and the checksum of the raw file is used instead, as
            announced in the warning that gets logged.
        '''
        MAX_MD5_FILE_SIZE = 10000
        md5_set = []

        def md5_add_text(text):
            # Hash one archive member after stripping blanks, newlines and tabs.
            try:
                text = str(text, errors='ignore')
                text = text.replace(' ', '').replace(
                    '\n', '').replace('\t', '')
                hexvalues = hashlib.md5(text.encode('utf-8')).hexdigest()
                md5_set.append(hexvalues)
            except Exception as e:
                # Best effort: a member that cannot be converted is skipped.
                logger.debug(
                    "Skipping archive member in MD5 computation: " + str(e))

        def md5_add_file(f):
            # Hash the complete content of a file object offering chunks().
            try:
                md5 = hashlib.md5()
                for chunk in f.chunks():
                    md5.update(chunk)
                md5_set.append(md5.hexdigest())
            except Exception as e:
                # Best effort: an unreadable file contributes no hash.
                logger.debug(
                    "Could not compute MD5 of file content: " + str(e))

        try:
            path = self.attachment.path
            if zipfile.is_zipfile(path):
                # The context manager closes the archive handle again.
                with zipfile.ZipFile(path, 'r') as zf:
                    for zipinfo in zf.infolist():
                        if zipinfo.file_size < MAX_MD5_FILE_SIZE:
                            md5_add_text(zf.read(zipinfo))
            elif tarfile.is_tarfile(path):
                with tarfile.open(path, 'r') as tf:
                    for tarinfo in tf.getmembers():
                        if tarinfo.isfile() and tarinfo.size < MAX_MD5_FILE_SIZE:
                            md5_add_text(tf.extractfile(tarinfo).read())
            else:
                md5_add_file(self.attachment)
        except Exception as e:
            logger.warning(
                "Exception on archive MD5 computation, using file checksum: " + str(e))
            # Drop partial archive hashes, otherwise all broken archives
            # with the same readable prefix would look like duplicates.
            del md5_set[:]
            md5_add_file(self.attachment)

        result = hashlib.md5(
            ''.join(sorted(md5_set)).encode('utf-8')).hexdigest()
        return result

    def basename(self):
        '''
            Return the part of the attachment name after the last slash.
        '''
        return self.attachment.name[self.attachment.name.rfind('/') + 1:]

    def get_absolute_url(self):
        '''
            Return the download URL for this file, built from the primary key
            of the first related submission.
        '''
        # To realize access protection for student files,
        # we implement our own download method here.
        # This implies that the Apache media serving (MEDIA_URL) is disabled.
        assert(len(self.submissions.all()) > 0)
        return reverse('submission_attachment_file', args=(self.submissions.all()[0].pk,))

    def get_preview_url(self):
        '''
            Return the preview URL built from the first related submission,
            or None if the file has no submission.
        '''
        if self.submissions.all():
            return reverse('preview', args=(self.submissions.all()[0].pk,))
        else:
            return None

    def absolute_path(self):
        '''
            Return MEDIA_ROOT and the attachment name, joined by a slash.
        '''
        return settings.MEDIA_ROOT + "/" + self.attachment.name

    def is_executed(self):
        '''
            Tell if the "fetched" time stamp is currently set.
        '''
        return self.fetched is not None

    def is_archive(self):
        '''
            Determines if the attachment is an archive (ZIP or TAR).
            Any error while checking the file is treated as "no archive".
        '''
        try:
            if zipfile.is_zipfile(self.attachment.path) or tarfile.is_tarfile(self.attachment.path):
                return True
        except Exception:
            pass
        return False

    def previews(self):
        '''
            Return preview on archive file / single file content as list of
            dictionaries with the keys 'name', 'is_code' and 'preview'.
            In order to avoid browser and web server trashing by the students,
            there is a size limit for the single files shown from archives.
            A file that is not an archive is read completely.
        '''
        MAX_PREVIEW_SIZE = 1000000

        def sanitize(raw):
            # Decode as UTF-8 and silently drop undecodable bytes.
            return raw.decode('utf-8', 'ignore')

        def is_code(fname):
            # str.endswith() accepts a tuple of alternatives.
            code_endings = ('.c', '.cpp', 'Makefile',
                            '.java', '.py', '.rb', '.js')
            return fname.endswith(code_endings)

        result = []
        if zipfile.is_zipfile(self.attachment.path):
            # The context manager closes the archive handle again.
            with zipfile.ZipFile(self.attachment.path, 'r') as zf:
                for zipinfo in zf.infolist():
                    if zipinfo.file_size < MAX_PREVIEW_SIZE:
                        result.append({'name': zipinfo.filename, 'is_code': is_code(
                            zipinfo.filename), 'preview': sanitize(zf.read(zipinfo))})
                    else:
                        result.append(
                            {'name': zipinfo.filename, 'is_code': False, 'preview': '(maximum size exceeded)'})
        elif tarfile.is_tarfile(self.attachment.path):
            with tarfile.open(self.attachment.path, 'r') as tf:
                for tarinfo in tf.getmembers():
                    if tarinfo.isfile():
                        if tarinfo.size < MAX_PREVIEW_SIZE:
                            result.append({'name': tarinfo.name, 'is_code': is_code(
                                tarinfo.name), 'preview': sanitize(tf.extractfile(tarinfo).read())})
                        else:
                            result.append(
                                {'name': tarinfo.name, 'is_code': False, 'preview': '(maximum size exceeded)'})
        else:
            # single file
            with open(self.attachment.path, 'rb') as f:
                fname = f.name[f.name.rfind(os.sep) + 1:]
                result = [{'name': fname, 'is_code': is_code(
                    fname), 'preview': sanitize(f.read())}, ]
        return result

    def test_result_dict(self):
        '''
            Create a compact data structure representation of all result
            types for this file.

            Returns a dictionary where the keys are the 'kind' values of the
            related test results, and the values are dicts holding the
            'result' value of that entry. If several entries share a kind,
            the last one wins.
        '''
        list_of_dicts = list(self.test_results.all().values())
        return {entry['kind']: {'result': entry['result']} for entry in list_of_dicts}

    # Declaration order is kept: the unfiltered manager comes first.
    objects = models.Manager()
    valid_ones = ValidSubmissionFileManager()
212