Passed
Push — develop ( 1f6ef3...42aeec )
by Koen
01:25
created

atramhasis.scripts.dump_rdf.main()   F

Complexity

Conditions 17

Size

Total Lines 134
Code Lines 104

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 104
dl 0
loc 134
rs 1.2599
c 0
b 0
f 0
cc 17
nop 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

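Commonly applied refactorings include Extract Method and, when many parameters or temporary variables are involved, Replace Temp with Query, Introduce Parameter Object, and Preserve Whole Object.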

Complexity

Complex code units like atramhasis.scripts.dump_rdf.main() often do a lot of different things. To break such a unit down, we need to identify a cohesive component within it. A common approach to find such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
import logging
2
import optparse
3
import os
4
import sys
5
import textwrap
6
import time
7
8
import transaction
9
from pyramid.paster import bootstrap
10
from pyramid.paster import setup_logging
11
from rdflib import RDF
12
from rdflib import SKOS
13
from rdflib.term import URIRef
14
from skosprovider_rdf import utils
15
16
from atramhasis.data.datamanagers import CountsManager
17
from atramhasis.data.models import ConceptschemeCounts
18
from atramhasis.errors import SkosRegistryNotFoundException
19
20
# Module-level logger, named after this module; main() reports its progress
# and errors through it.
log = logging.getLogger(__name__)
21
22
23
def _build_parser():
    """Create the command line parser for the dump script.

    :returns: An :class:`optparse.OptionParser` accepting ``-l/--location``
        and ``-r/--rdf2hdt`` next to the positional ``config_uri``.
    """
    description = """\
    Dump all conceptschemes to files. Will serialise as Turtle and RDF/XML format.
    """
    usage = "usage: %prog config_uri"
    parser = optparse.OptionParser(
        usage=usage,
        description=textwrap.dedent(description)
    )
    parser.add_option(
        '-l', '--location', dest='dump_location', type='string',
        help='Specify where to dump the conceptschemes. If not specified, this \
        is set to the atramhasis.dump_location from your ini file.'
    )
    parser.add_option(
        '-r', '--rdf2hdt', dest='rdf2hdt', type='string', default=False,
        help='Specify where the rdf2hdt command can be found. If not specified, this \
        is set to atramhasis.rdf2hdt from your ini file.'
    )
    return parser


def _resolve_dump_location(options, settings, config_uri):
    """Return the dump directory from the options, the settings or the ini dir.

    The command line option wins; otherwise ``atramhasis.dump_location`` from
    the settings is used, falling back to the directory holding the ini file.
    """
    if options.dump_location is not None:
        return options.dump_location
    return settings.get(
        'atramhasis.dump_location',
        os.path.abspath(os.path.dirname(config_uri))
    )


def _dump_provider(provider, dump_location):
    """Serialise one provider to Turtle and RDF/XML and gather its counts.

    :returns: A ``(filename_ttl, counts)`` tuple, where ``counts`` is a dict
        holding the keyword arguments for a ``ConceptschemeCounts`` row.
    """
    start_time = time.time()
    pid = provider.get_metadata()['id']
    filename = os.path.join(dump_location, '%s-full' % pid)
    filename_ttl = '%s.ttl' % filename
    filename_rdf = '%s.rdf' % filename

    log.info('Generating dump for %s', pid)
    graph = utils.rdf_dumper(provider)
    triples = len(graph)
    log.info('Number of triples in Graph: %d', triples)
    csuri = URIRef(provider.concept_scheme.uri)
    cs_triples = len(list(graph.predicate_objects(csuri)))
    log.info('Number of triples in Conceptscheme: %d', cs_triples)
    count_concepts = len(list(graph.subjects(RDF.type, SKOS.Concept)))
    count_collections = len(list(graph.subjects(RDF.type, SKOS.Collection)))
    try:
        avg_concept_triples = ((triples - cs_triples) /
                               (count_concepts + count_collections))
    except ZeroDivisionError:
        # A scheme without concepts or collections has no meaningful average.
        avg_concept_triples = 0
    log.info('Average number of triples per concept: %d', avg_concept_triples)

    log.info(f'Dumping {pid} to Turtle: {filename_ttl}')
    graph.serialize(destination=filename_ttl, format='turtle')
    log.info(f'Dumping {pid} to RDFxml: {filename_rdf}')
    graph.serialize(destination=filename_rdf, format='pretty-xml')
    # Drop the graph before the next provider is loaded.
    del graph
    log.info(f'--- {(time.time() - start_time)} seconds ---')

    counts = {
        'conceptscheme_id': pid,
        'triples': triples,
        'conceptscheme_triples': cs_triples,
        'avg_concept_triples': avg_concept_triples
    }
    return filename_ttl, counts


def _convert_to_hdt(rdf2hdt, files, dump_location):
    """Run the ``rdf2hdt`` command on every Turtle dump in ``files``.

    When the Turtle conversion fails, the RDF/XML dump with the same base
    name is tried instead. Failures are logged and never raised.
    """
    from subprocess import check_call, CalledProcessError
    parsing_error = False
    for f in files:
        log.info(f'Converting {f} to hdt')
        # Only swap the trailing extension; str.replace would also rewrite a
        # '.ttl' occurring in the directory or in the provider id.
        base = os.path.splitext(f)[0]
        hdtf = base + '.hdt'
        try:
            check_call([rdf2hdt, '-f', 'turtle', f, hdtf])
        except (FileNotFoundError, CalledProcessError) as e:
            # Turtle failed, let's try rdfxml
            parsing_error = True
            log.warning(f'rdf2hdt for file {f} failed with error {e}. Trying rdfxml...')
            rdff = base + '.rdf'
            try:
                check_call([rdf2hdt, '-f', 'rdfxml', rdff, hdtf])
            except (FileNotFoundError, CalledProcessError) as e:
                # rdfxml failed
                log.error(f'rdfxml for file {f} failed with error {e}')
    if parsing_error:
        log.error('Error during rdf2hdt conversion. Check logs for more information.')
    else:
        log.info(f'All hdt files dumped to {dump_location}')


def _save_counts(request, counts):
    """Store one ``ConceptschemeCounts`` row per entry of ``counts``.

    All rows are saved inside a single ``transaction.manager`` block.
    """
    with transaction.manager:
        dbsession = request.registry.dbmaker()
        manager = CountsManager(dbsession)
        for c in counts:
            manager.save(ConceptschemeCounts(
                conceptscheme_id=c['conceptscheme_id'],
                triples=c['triples'],
                conceptscheme_triples=c['conceptscheme_triples'],
                avg_concept_triples=c['avg_concept_triples']
            ))


def _run_dump(env, config_uri, options):
    """Dump every non-external provider, convert to HDT and save the counts.

    :returns: ``2`` when the dump location is not writable, else ``None``.
    :raises SkosRegistryNotFoundException: If the request has no usable
        ``skos_registry``.
    """
    settings = env['registry'].settings

    dump_location = _resolve_dump_location(options, settings, config_uri)
    if not os.access(dump_location, os.W_OK | os.X_OK):
        log.error(f'Dump location "{dump_location}" is not writable.')
        return 2

    rdf2hdt = options.rdf2hdt
    if not rdf2hdt:
        rdf2hdt = settings.get('atramhasis.rdf2hdt', False)

    request = env['request']
    skos_registry = getattr(request, 'skos_registry', None)
    if skos_registry is None:
        raise SkosRegistryNotFoundException()  # pragma: no cover

    counts = []
    files = []
    for p in skos_registry.get_providers():
        # Providers tagged with the 'external' subject are not dumped.
        if 'external' in p.get_metadata()['subject']:
            continue
        filename_ttl, provider_counts = _dump_provider(p, dump_location)
        files.append(filename_ttl)
        counts.append(provider_counts)

    log.info('All files dumped to %s', dump_location)

    if rdf2hdt:
        _convert_to_hdt(rdf2hdt, files, dump_location)

    _save_counts(request, counts)
    return None


def main():
    """Dump all conceptschemes to Turtle and RDF/XML files.

    Reads the ini file given as first command line argument, writes a
    ``<id>-full.ttl`` and ``<id>-full.rdf`` file per non-external provider,
    optionally converts the dumps to HDT and saves the triple counts.

    :returns: ``2`` when no argument was given or the dump location is not
        writable, ``None`` otherwise.
    :raises SkosRegistryNotFoundException: If the bootstrapped request has no
        usable ``skos_registry``.
    """
    options, args = _build_parser().parse_args(sys.argv[1:])

    if not args:
        log.error('You must provide at least one argument.')
        return 2

    config_uri = args[0]

    env = bootstrap(config_uri)
    try:
        setup_logging(config_uri)
        return _run_dump(env, config_uri, options)
    finally:
        # Always release the bootstrapped environment, also on the early
        # return for an unwritable location and when an exception escapes.
        env['closer']()
157