Passed
Pull Request — master (#1184)
by Konstantin, created 03:15

tests.model.mets_bench_extreme — Rating: A

Complexity

Total Complexity 34

Size/Duplication

Total Lines 278
Duplicated Lines 7.91 %

Importance

Changes 0
Metric  Value
wmc     34
eloc    194
dl      22
loc     278
rs      9.68
c       0
b       0
f       0

27 Functions

Rating  Name                                  Duplication  Size  Complexity
A       test_b500()                           0            6     1
A       benchmark_find_files_all()            0            2     1
A       test_b2000_c()                        0            6     1
A       test_s2000()                          0            6     1
A       test_s500()                           0            6     1
A       benchmark_find_files_fileid()         0            5     1
A       test_s1000_c()                        0            6     1
A       test_b2000()                          0            6     1
A       test_s5000()                          0            6     1
A       test_b1000_c()                        0            6     1
A       test_s50()                            0            6     1
B       _build_mets()                         22           22    8
A       benchmark_find_files_physical_page()  0            5     1
A       assert_len()                          0            3     1
A       test_b50()                            0            6     1
A       test_b50_c()                          0            6     1
A       test_s500_c()                         0            6     1
A       test_s5000_c()                        0            6     1
A       test_b5000()                          0            6     1
A       test_b1000()                          0            6     1
A       test_s2000_c()                        0            6     1
A       benchmark_find_files()                0            4     1
A       test_s1000()                          0            6     1
A       benchmark_find_files_filegrp()        0            5     1
A       test_b500_c()                         0            6     1
A       test_b5000_c()                        0            6     1
A       test_s50_c()                          0            6     1

How to fix: Duplicated Code

Duplicate code is one of the most pungent code smells. A common rule of thumb is to restructure code once it is duplicated in three or more places.

A typical remedy is to move the duplicated block into a single shared helper that every caller imports, or to fold repetitive test scaffolding into one parametrized test, so only one copy has to be maintained.
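For illustration only, here is a minimal sketch of how the near-identical per-page-count build/search benchmarks in the listing below could be folded into two parametrized tests. It assumes the same pytest-benchmark fixture and reuses _build_mets and benchmark_find_files from the file under review; PAGE_COUNTS, _mets_store, test_build and test_search are names invented for this sketch, and the original group= labels are dropped.

from pytest import mark

PAGE_COUNTS = [50, 500, 1000, 2000, 5000]
_mets_store = {}  # hypothetical shared store replacing the per-size module globals

@mark.parametrize("cached", [False, True], ids=["plain", "cached"])
@mark.parametrize("pages", PAGE_COUNTS)
@mark.benchmark(max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_build(benchmark, pages, cached):
    # benchmark(fn, *args, **kwargs) times fn and returns the function's return value
    _mets_store[(pages, cached)] = benchmark(_build_mets, pages, force=True, cache_flag=cached)

@mark.parametrize("cached", [False, True], ids=["plain", "cached"])
@mark.parametrize("pages", PAGE_COUNTS)
@mark.benchmark(max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_search(benchmark, pages, cached):
    # relies on definition order: the matching test_build variant has already run;
    # unlike the original del pattern, all built METS objects stay in memory
    benchmark(benchmark_find_files, pages, _mets_store[(pages, cached)])

Separating cached from uncached numbers in the report can then be left to pytest-benchmark's --benchmark-group-by option rather than hard-coding four group labels per page count. The full source under review follows.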

# -*- coding: utf-8 -*-

from contextlib import contextmanager
from time import time

from pytest import main, fixture, mark

from ocrd import Resolver
from ocrd_utils import MIME_TO_EXT, getLogger
from ocrd_models import OcrdMets

logger = getLogger('ocrd.benchmark.mets')

GRPS_REG = ['SEG-REG', 'SEG-REPAIR', 'SEG-REG-DESKEW', 'SEG-REG-DESKEW-CLIP', 'SEG-LINE', 'SEG-REPAIR-LINE', 'SEG-LINE-RESEG-DEWARP']
GRPS_IMG = ['FULL', 'PRESENTATION', 'BIN', 'CROP', 'BIN2', 'BIN-DENOISE', 'BIN-DENOISE-DESKEW', 'OCR']

REGIONS_PER_PAGE = 2
LINES_PER_REGION = 2
FILES_PER_PAGE = len(GRPS_IMG) * LINES_PER_REGION + len(GRPS_REG) * REGIONS_PER_PAGE
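# With these constants, each generated page gets FILES_PER_PAGE = 8 * 2 + 7 * 2 = 30 files,
# so the largest benchmark below (5000 pages) builds a METS with 150,000 files.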

# Caching is disabled by default
def _build_mets(number_of_pages, force=False, cache_flag=False):
    mets = OcrdMets.empty_mets(cache_flag=cache_flag)
    mets._number_of_pages = number_of_pages

    for n in ['%04d' % (n + 1) for n in range(number_of_pages)]:
        _add_file = lambda n, fileGrp, mimetype, ID=None: mets.add_file(
            fileGrp,
            mimetype=mimetype,
            pageId='PHYS_%s' % n,
            ID=ID if ID else '%s_%s_%s' % (fileGrp, n, MIME_TO_EXT.get(mimetype)[1:].upper()),
            url='%s/%s%s' % (fileGrp, ID if ID else '%s_%s_%s' % (fileGrp, n, MIME_TO_EXT.get(mimetype)[1:].upper()), MIME_TO_EXT.get(mimetype))
        )
        for grp in GRPS_IMG:
            # LINES_PER_REGION = 2
            _add_file(n, grp, 'image/tiff')
            _add_file(n, grp, 'application/vnd.prima.page+xml')
        for grp in GRPS_REG:
            # REGIONS_PER_PAGE = 2
            for region_n in range(REGIONS_PER_PAGE):
                _add_file(n, grp, 'image/png', '%s_%s_region%s' % (grp, n, region_n))

    return mets

def assert_len(expected_len, mets, kwargs):
    test_list = mets.find_all_files(**kwargs)
    assert expected_len == len(test_list)

def benchmark_find_files(number_of_pages, mets):
    benchmark_find_files_filegrp(number_of_pages, mets)
    benchmark_find_files_fileid(number_of_pages, mets)
    benchmark_find_files_physical_page(number_of_pages, mets)
    # This is not really useful to measure.
    # We iterate all files in both cached and non-cached in the same routine
    # When no specific search parameters are provided
    # benchmark_find_files_all(number_of_pages, mets)

def benchmark_find_files_filegrp(number_of_pages, mets):
    # Best case - first fileGrp
    assert_len((number_of_pages * REGIONS_PER_PAGE), mets, dict(fileGrp='SEG-REG'))
    # Worst case - does not exist
    assert_len(0, mets, dict(fileGrp='SEG-REG-NOTEXIST'))

def benchmark_find_files_fileid(number_of_pages, mets):
    # Best case - first file ID
    assert_len(1, mets, dict(ID='FULL_0001_TIF', fileGrp='FULL'))
    # Worst case - does not exist
    assert_len(0, mets, dict(ID='FULL_0001_TIF-NOTEXISTS', fileGrp='FULL-NOTEXIST'))

def benchmark_find_files_physical_page(number_of_pages, mets):
    # Best case - first physical page
    assert_len(FILES_PER_PAGE, mets, dict(pageId='PHYS_0001'))
    # Worst case - does not exist
    assert_len(0, mets, dict(pageId='PHYS_0001-NOTEXISTS'))

# Get all files, i.e., pass an empty search parameter -> dict()
def benchmark_find_files_all(number_of_pages, mets):
    assert_len((number_of_pages * FILES_PER_PAGE), mets, dict())


# ---- BENCHMARKING for 50-500-1000-2000-5000 pages ---- #

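# Each page count below follows the same pattern: a 'build' benchmark assigns the freshly
# built METS to a module-level global, and the matching 'search' benchmark reuses that
# global, so the search tests rely on running after their build counterpart.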
# ----- 50 pages -> build, search, build (cached), search (cached) ----- #
mets_50 = None
@mark.benchmark(group="build", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_b50(benchmark):
    @benchmark
    def result():
        global mets_50
        mets_50 = _build_mets(50, force=True)

@mark.benchmark(group="search", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_s50(benchmark):
    @benchmark
    def ret():
        global mets_50
        benchmark_find_files(50, mets_50)
del mets_50

mets_c_50 = None
@mark.benchmark(group="build_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_b50_c(benchmark):
    @benchmark
    def result():
        global mets_c_50
        mets_c_50 = _build_mets(50, force=True, cache_flag=True)

@mark.benchmark(group="search_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_s50_c(benchmark):
    @benchmark
    def ret():
        global mets_c_50
        benchmark_find_files(50, mets_c_50)
del mets_c_50
# ----------------------------------------------------------------------- #


# ----- 500 pages -> build, search, build (cached), search (cached) ----- #
mets_500 = None
@mark.benchmark(group="build", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_b500(benchmark):
    @benchmark
    def result():
        global mets_500
        mets_500 = _build_mets(500, force=True)

@mark.benchmark(group="search", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_s500(benchmark):
    @benchmark
    def ret():
        global mets_500
        benchmark_find_files(500, mets_500)
del mets_500

mets_c_500 = None
@mark.benchmark(group="build_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_b500_c(benchmark):
    @benchmark
    def result():
        global mets_c_500
        mets_c_500 = _build_mets(500, force=True, cache_flag=True)

@mark.benchmark(group="search_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_s500_c(benchmark):
    @benchmark
    def ret():
        global mets_c_500
        benchmark_find_files(500, mets_c_500)
del mets_c_500

# ----------------------------------------------------------------------- #


# ----- 1000 pages -> build, search, build (cached), search (cached) ----- #
mets_1000 = None
@mark.benchmark(group="build", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_b1000(benchmark):
    @benchmark
    def result():
        global mets_1000
        mets_1000 = _build_mets(1000, force=True)

@mark.benchmark(group="search", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_s1000(benchmark):
    @benchmark
    def ret():
        global mets_1000
        benchmark_find_files(1000, mets_1000)
del mets_1000

mets_c_1000 = None
@mark.benchmark(group="build_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_b1000_c(benchmark):
    @benchmark
    def result():
        global mets_c_1000
        mets_c_1000 = _build_mets(1000, force=True, cache_flag=True)

@mark.benchmark(group="search_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_s1000_c(benchmark):
    @benchmark
    def ret():
        global mets_c_1000
        benchmark_find_files(1000, mets_c_1000)
del mets_c_1000

# ------------------------------------------------------------------------ #


# ----- 2000 pages -> build, search, build (cached), search (cached) ----- #
mets_2000 = None
@mark.benchmark(group="build", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_b2000(benchmark):
    @benchmark
    def result():
        global mets_2000
        mets_2000 = _build_mets(2000, force=True)

@mark.benchmark(group="search", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_s2000(benchmark):
    @benchmark
    def ret():
        global mets_2000
        benchmark_find_files(2000, mets_2000)
del mets_2000

mets_c_2000 = None
@mark.benchmark(group="build_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_b2000_c(benchmark):
    @benchmark
    def result():
        global mets_c_2000
        mets_c_2000 = _build_mets(2000, force=True, cache_flag=True)

@mark.benchmark(group="search_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_s2000_c(benchmark):
    @benchmark
    def ret():
        global mets_c_2000
        benchmark_find_files(2000, mets_c_2000)
del mets_c_2000

# ------------------------------------------------------------------------ #


# ----- 5000 pages -> build, search, build (cached), search (cached) ----- #
mets_5000 = None
@mark.benchmark(group="build", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_b5000(benchmark):
    @benchmark
    def result():
        global mets_5000
        mets_5000 = _build_mets(5000, force=True)

@mark.benchmark(group="search", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_s5000(benchmark):
    @benchmark
    def ret():
        global mets_5000
        benchmark_find_files(5000, mets_5000)
del mets_5000

mets_c_5000 = None
@mark.benchmark(group="build_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_b5000_c(benchmark):
    @benchmark
    def result():
        global mets_c_5000
        mets_c_5000 = _build_mets(5000, force=True, cache_flag=True)

@mark.benchmark(group="search_cached", max_time=0.1, min_rounds=1, disable_gc=False, warmup=False)
def test_s5000_c(benchmark):
    @benchmark
    def ret():
        global mets_c_5000
        benchmark_find_files(5000, mets_c_5000)
del mets_c_5000

# ------------------------------------------------------------------------ #

if __name__ == '__main__':
    args = ['']
    # args.append('--benchmark-max-time=10')
    # args.append('--benchmark-min-time=0.1')
    # args.append('--benchmark-warmup=False')
    # args.append('--benchmark-disable-gc')
    args.append('--benchmark-verbose')
    args.append('--benchmark-min-rounds=1')
    args.append('--tb=short')
    main(args)