SubmissionFile.attachment_md5() - Code Metrics - troeger/opensubmit - Measure and Improve Code Quality continuously with Scrutinizer

SubmissionFile.attachment_md5() F
last analyzed 2018-05-24 09:45 UTC

↳ Parent: SubmissionFile

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
c	0
b	0
f	0
dl	0
loc	55
rs	3.4604
cc	14

2 Methods

Rating	Name	Duplication	Size	Complexity
A	SubmissionFile.md5_add_file()	0	8	3
A	SubmissionFile.md5_add_text()	0	10	2

How to fix Long Method Complexity

from django.db import models
from django.utils import timezone
from django.core.urlresolvers import reverse

from django.conf import settings

import zipfile
import tarfile
import unicodedata
import os
import hashlib

import logging
logger = logging.getLogger('OpenSubmit')


def upload_path(instance, filename):
    '''
        Build the relative storage path for an uploaded file.

        Blanks in the user-provided file name are replaced by underscores, the
        name is NFKD-normalized and lower-cased. The result is placed below a
        directory named after the current date in ISO format.
        The "instance" argument is not used.
    '''
    date_folder = timezone.now().date().isoformat()
    without_blanks = filename.replace(" ", "_")
    sanitized = unicodedata.normalize('NFKD', without_blanks).lower()
    return os.path.join(date_folder, sanitized)


class ValidSubmissionFileManager(models.Manager):
    '''
        Model manager used by SubmissionFile that hides outdated entries.

        The returned query set only contains files that have not been replaced,
        that belong to at least one submission, and that are not related to a
        submission in the WITHDRAWN state.
    '''

    def get_queryset(self):
        # Imported inside the method; presumably this avoids a circular
        # import between the model modules.
        from .submission import Submission

        files = super(ValidSubmissionFileManager, self).get_queryset()
        not_replaced = files.filter(replaced_by=None)
        not_withdrawn = not_replaced.exclude(
            submissions__state=Submission.WITHDRAWN)
        return not_withdrawn.exclude(submissions=None)


class SubmissionFile(models.Model):
    '''
        A file attachment for a student submission. File attachments may be replaced
        by the student, but the original version is kept ("replaced_by" then points
        to the newer file).
        The "fetched" field defines the time stamp when the file was fetched for
        checking by some executor. On result retrieval, this timestamp is emptied
        again, which allows to find stuck executor jobs on the server side.
        The "md5" field keeps a checksum of the file upload, for duplicate detection.
    '''

    attachment = models.FileField(
        upload_to=upload_path, verbose_name="File upload")
    original_filename = models.CharField(max_length=255, default='student.upload')
    fetched = models.DateTimeField(editable=False, null=True)
    replaced_by = models.ForeignKey(
        'SubmissionFile', null=True, blank=True, editable=False)
    md5 = models.CharField(max_length=36, null=True,
                           blank=True, editable=False)

    class Meta:
        app_label = 'opensubmit'

    def __str__(self):
        return self.attachment.name

    def attachment_md5(self):
        '''
            Calculate the checksum of the file upload.
            For non-archive files (e.g. PDFs), the MD5 of the file itself is used.

            ZIP and TAR archives are unpacked and the MD5 is generated from the
            sanitized content of their members. This is done with some smartness:
            - Members of MAX_MD5_FILE_SIZE bytes or more are skipped.
            - Blanks, newlines and tabs are removed before hashing.
            - For MD5, ordering is important, so we compute it on the sorted list of
              file hashes.

            If the archive cannot be processed, the checksum of the raw upload is
            used instead, so that broken archives do not all end up with the
            same (empty input) checksum.

            Returns the hex digest of the MD5 over the concatenated, sorted
            per-file hex digests.
        '''
        MAX_MD5_FILE_SIZE = 10000
        md5_set = []

        def md5_add_text(text):
            # Hash the whitespace-stripped text of one archive member.
            try:
                text = str(text, errors='ignore')
                text = text.replace(' ', '').replace(
                    '\n', '').replace('\t', '')
                hexvalues = hashlib.md5(text.encode('utf-8')).hexdigest()
                md5_set.append(hexvalues)
            except Exception:
                # not unicode decodable, this member does not contribute
                pass

        def md5_add_file(f):
            # Hash the raw content of the upload, chunk by chunk.
            try:
                md5 = hashlib.md5()
                for chunk in f.chunks():
                    md5.update(chunk)
                md5_set.append(md5.hexdigest())
            except Exception as e:
                # Best effort: the checksum must never break the caller.
                logger.warning("Exception on file MD5 computation: " + str(e))

        try:
            if zipfile.is_zipfile(self.attachment.path):
                # Context managers make sure the archive handle is closed.
                with zipfile.ZipFile(self.attachment.path, 'r') as zf:
                    for zipinfo in zf.infolist():
                        if zipinfo.file_size < MAX_MD5_FILE_SIZE:
                            md5_add_text(zf.read(zipinfo))
            elif tarfile.is_tarfile(self.attachment.path):
                with tarfile.open(self.attachment.path, 'r') as tf:
                    for tarinfo in tf.getmembers():
                        if tarinfo.isfile() and tarinfo.size < MAX_MD5_FILE_SIZE:
                            md5_add_text(tf.extractfile(tarinfo).read())
            else:
                md5_add_file(self.attachment)
        except Exception as e:
            logger.warning(
                "Exception on archive MD5 computation, using file checksum: " + str(e))
            # Drop hashes of members read before the failure, so that the
            # result only depends on the raw file content.
            del md5_set[:]
            md5_add_file(self.attachment)

        result = hashlib.md5(
            ''.join(sorted(md5_set)).encode('utf-8')).hexdigest()
        return result

    def basename(self):
        '''
            Return the attachment name without its leading directory part.
        '''
        return self.attachment.name[self.attachment.name.rfind('/') + 1:]

    def get_absolute_url(self):
        '''
            Return the download URL for this file, based on the first related
            submission. At least one related submission must exist.
        '''
        # To realize access protection for student files,
        # we implement our own download method here.
        # This implies that the Apache media serving (MEDIA_URL) is disabled.
        submissions = self.submissions.all()
        # len() evaluates the query set once, the index access below
        # is then served from the cached result.
        assert(len(submissions) > 0)
        return reverse('submission_attachment_file', args=(submissions[0].pk,))

    def get_preview_url(self):
        '''
            Return the preview URL based on the first related submission,
            or None if the file does not belong to any submission.
        '''
        submissions = self.submissions.all()
        if submissions:
            return reverse('preview', args=(submissions[0].pk,))
        else:
            return None

    def absolute_path(self):
        '''
            Return the attachment name prefixed with settings.MEDIA_ROOT.
        '''
        return settings.MEDIA_ROOT + "/" + self.attachment.name

    def is_executed(self):
        '''
            Tell if the "fetched" time stamp is currently set.
        '''
        return self.fetched is not None

    def is_archive(self):
        '''
            Determines if the attachment is an archive (ZIP or TAR).
            Any error while checking the file is treated as "not an archive".
        '''
        try:
            if zipfile.is_zipfile(self.attachment.path) or tarfile.is_tarfile(self.attachment.path):
                return True
        except Exception:
            pass
        return False

    def previews(self):
        '''
            Return previews of the archive members / the single file content
            as list of dictionaries with the keys 'name', 'is_code' and 'preview'.
            In order to avoid browser and web server trashing by the students,
            there is a size limit for the single files shown. Files of
            MAX_PREVIEW_SIZE bytes or more get a placeholder text as preview.
            The limit is applied to archive members as well as to uploads
            that are not an archive.
        '''
        MAX_PREVIEW_SIZE = 1000000
        code_endings = ('.c', '.cpp', 'Makefile',
                        '.java', '.py', '.rb', '.js')

        def sanitize(content):
            # Undecodable bytes are dropped instead of raising an error.
            return content.decode('utf-8', 'ignore')

        def is_code(fname):
            # str.endswith() accepts a tuple of alternatives.
            return fname.endswith(code_endings)

        def preview_entry(name, content):
            return {'name': name, 'is_code': is_code(name),
                    'preview': sanitize(content)}

        def oversized_entry(name):
            return {'name': name, 'is_code': False,
                    'preview': '(maximum size exceeded)'}

        path = self.attachment.path
        result = []
        if zipfile.is_zipfile(path):
            # Context managers make sure the archive handle is closed.
            with zipfile.ZipFile(path, 'r') as zf:
                for zipinfo in zf.infolist():
                    if zipinfo.file_size < MAX_PREVIEW_SIZE:
                        result.append(preview_entry(
                            zipinfo.filename, zf.read(zipinfo)))
                    else:
                        result.append(oversized_entry(zipinfo.filename))
        elif tarfile.is_tarfile(path):
            with tarfile.open(path, 'r') as tf:
                for tarinfo in tf.getmembers():
                    if tarinfo.isfile():
                        if tarinfo.size < MAX_PREVIEW_SIZE:
                            result.append(preview_entry(
                                tarinfo.name, tf.extractfile(tarinfo).read()))
                        else:
                            result.append(oversized_entry(tarinfo.name))
        else:
            # single file
            fname = path[path.rfind(os.sep) + 1:]
            if os.path.getsize(path) < MAX_PREVIEW_SIZE:
                with open(path, 'rb') as f:
                    result.append(preview_entry(fname, f.read()))
            else:
                # Do not load huge uploads into memory just for a preview.
                result.append(oversized_entry(fname))
        return result

    def test_result_dict(self):
        '''
            Create a compact data structure representation of all result
            types for this file.

            Returns a dictionary where the keys are the result kinds, and
            the values are dicts holding the 'result' value of that entry.
        '''
        list_of_dicts = list(self.test_results.all().values())
        return {entry['kind']: {'result': entry['result']} for entry in list_of_dicts}

    # "objects" is declared first, so it stays the first manager of the model.
    objects = models.Manager()
    valid_ones = ValidSubmissionFileManager()


1			from django.db import models
2			from django.utils import timezone
3			from django.core.urlresolvers import reverse
4
5			from django.conf import settings
6
7			import zipfile
8			import tarfile
9			import unicodedata
10			import os
11			import hashlib
12
13			import logging
14			logger = logging.getLogger('OpenSubmit')
15
16
17			def upload_path(instance, filename):
18			'''
19			Sanitize the user-provided file name, add timestamp for uniqness.
20			'''
21
22			filename = filename.replace(" ", "_")
23			filename = unicodedata.normalize('NFKD', filename).lower()
24			return os.path.join(str(timezone.now().date().isoformat()), filename)
25
26
27			class ValidSubmissionFileManager(models.Manager):
28			'''
29			A model manager used by SubmissionFile. It returns only submission files
30			that were not replaced, for submission that were not withdrawn.
31			'''
32
33			def get_queryset(self):
34			from .submission import Submission
35			return super(ValidSubmissionFileManager, self).get_queryset().filter(replaced_by=None).exclude(submissions__state=Submission.WITHDRAWN).exclude(submissions=None)
36
37
38			class SubmissionFile(models.Model):
39			'''
40			A file attachment for a student submission. File attachments may be replaced
41			by the student, but we keep the original version for some NSA-style data gathering.
42			The "fetched" field defines the time stamp when the file was fetched for
43			checking by some executor. On result retrieval, this timestamp is emptied
44			again, which allows to find 'stucked' executor jobs on the server side.
45			The "md5" field keeps a checksum of the file upload, for duplicate detection.
46			'''
47
48			attachment = models.FileField(
49			upload_to=upload_path, verbose_name="File upload")
50			original_filename = models.CharField(max_length=255, default='student.upload')
51			fetched = models.DateTimeField(editable=False, null=True)
52			replaced_by = models.ForeignKey(
53			'SubmissionFile', null=True, blank=True, editable=False)
54			md5 = models.CharField(max_length=36, null=True,
55			blank=True, editable=False)
56
57			class Meta:
58			app_label = 'opensubmit'
59
60			def __str__(self):
61			return self.attachment.name
62
63			def attachment_md5(self):
64			'''
65			Calculate the checksum of the file upload.
66			For binary files (e.g. PDFs), the MD5 of the file itself is used.
67
68			Archives are unpacked and the MD5 is generated from the sanitized textfiles
69			in the archive. This is done with some smartness:
70			- Whitespace and tabs are removed before comparison.
71			- For MD5, ordering is important, so we compute it on the sorted list of
72			file hashes.
73			'''
74			MAX_MD5_FILE_SIZE = 10000
75			md5_set = []
76
77			def md5_add_text(text):
78			try:
79			text = str(text, errors='ignore')
80			text = text.replace(' ', '').replace(
81			'\n', '').replace('\t', '')
82			hexvalues = hashlib.md5(text.encode('utf-8')).hexdigest()
83			md5_set.append(hexvalues)
84			except Exception as e:
85			# not unicode decodable
86			pass
87
88			def md5_add_file(f):
89			try:
90			md5 = hashlib.md5()
91			for chunk in f.chunks():
92			md5.update(chunk)
93			md5_set.append(md5.hexdigest())
94			except Exception:
95			pass
96
97			try:
98			if zipfile.is_zipfile(self.attachment.path):
99			zf = zipfile.ZipFile(self.attachment.path, 'r')
100			for zipinfo in zf.infolist():
101			if zipinfo.file_size < MAX_MD5_FILE_SIZE:
102			md5_add_text(zf.read(zipinfo))
103			elif tarfile.is_tarfile(self.attachment.path):
104			tf = tarfile.open(self.attachment.path, 'r')
105			for tarinfo in tf.getmembers():
106			if tarinfo.isfile():
107			if tarinfo.size < MAX_MD5_FILE_SIZE:
108			md5_add_text(tf.extractfile(tarinfo).read())
109			else:
110			md5_add_file(self.attachment)
111			except Exception as e:
112			logger.warning(
113			"Exception on archive MD5 computation, using file checksum: " + str(e))
114
115			result = hashlib.md5(
116			''.join(sorted(md5_set)).encode('utf-8')).hexdigest()
117			return result
118
119			def basename(self):
120			return self.attachment.name[self.attachment.name.rfind('/') + 1:]
121
122			def get_absolute_url(self):
123			# To realize access protection for student files,
124			# we implement our own download method here.
125			# This implies that the Apache media serving (MEDIA_URL) is disabled.
126			assert(len(self.submissions.all()) > 0)
127			return reverse('submission_attachment_file', args=(self.submissions.all()[0].pk,))
128
129			def get_preview_url(self):
130			if self.submissions.all():
131			return reverse('preview', args=(self.submissions.all()[0].pk,))
132			else:
133			return None
134
135			def absolute_path(self):
136			return settings.MEDIA_ROOT + "/" + self.attachment.name
137
138			def is_executed(self):
139			return self.fetched is not None
140
141			def is_archive(self):
142			'''
143			Determines if the attachment is an archive.
144			'''
145			try:
146			if zipfile.is_zipfile(self.attachment.path) or tarfile.is_tarfile(self.attachment.path):
147			return True
148			except Exception:
149			pass
150			return False
151
152			def previews(self):
153			'''
154			Return preview on archive file / single file content as dictionary.
155			In order to avoid browser and web server trashing by the students,
156			there is a size limit for the single files shown.
157			'''
158			MAX_PREVIEW_SIZE = 1000000
159
160			def sanitize(bytes):
161			return bytes.decode('utf-8', 'ignore')
162
163			def is_code(fname):
164			code_endings = ['.c', '.cpp', 'Makefile',
165			'.java', '.py', '.rb', '.js']
166			for ending in code_endings:
167			if fname.endswith(ending):
168			return True
169			return False
170
171			result = []
172			if zipfile.is_zipfile(self.attachment.path):
173			zf = zipfile.ZipFile(self.attachment.path, 'r')
174			for zipinfo in zf.infolist():
175			if zipinfo.file_size < MAX_PREVIEW_SIZE:
176			result.append({'name': zipinfo.filename, 'is_code': is_code(
177			zipinfo.filename), 'preview': sanitize(zf.read(zipinfo))})
178			else:
179			result.append(
180			{'name': zipinfo.filename, 'is_code': False, 'preview': '(maximum size exceeded)'})
181			elif tarfile.is_tarfile(self.attachment.path):
182			tf = tarfile.open(self.attachment.path, 'r')
183			for tarinfo in tf.getmembers():
184			if tarinfo.isfile():
185			if tarinfo.size < MAX_PREVIEW_SIZE:
186			result.append({'name': tarinfo.name, 'is_code': is_code(
187			tarinfo.name), 'preview': sanitize(tf.extractfile(tarinfo).read())})
188			else:
189			result.append(
190			{'name': tarinfo.name, 'is_code': False, 'preview': '(maximum size exceeded)'})
191			else:
192			# single file
193			f = open(self.attachment.path, 'rb')
194			fname = f.name[f.name.rfind(os.sep) + 1:]
195			result = [{'name': fname, 'is_code': is_code(
196			fname), 'preview': sanitize(f.read())}, ]
197			return result
198
199			def test_result_dict(self):
200			'''
201			Create a compact data structure representation of all result
202			types for this file.
203
204			Returns a dictionary where the keys are the result types, and
205			the values are dicts of all the other result information.
206			'''
207			list_of_dicts = list(self.test_results.all().values())
208			return {entry['kind']: {'result': entry['result']} for entry in list_of_dicts}
209
210			objects = models.Manager()
211			valid_ones = ValidSubmissionFileManager()
212

troeger / opensubmit

SubmissionFile.attachment_md5() F last analyzed 2018-05-24 09:45 UTC

Complexity

Size

Duplication

Importance

2 Methods

How to fix Long Method Complexity

Long Method

Complexity

Duplication Side-by-Side

Filter issues like

SubmissionFile.attachment_md5() F
last analyzed 2018-05-24 09:45 UTC