Passed
Branch master (ea367e)
by Osma
02:12
created

test_docdir_require_keyfile()   B

Complexity

Conditions 6

Size

Total Lines 14

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 6
c 1
b 0
f 0
dl 0
loc 14
rs 8
1
"""Unit tests for corpus functionality in Annif"""
2
3
import annif.corpus
4
5
6
def test_subjectset_uris():
    """A SubjectSet parsed from ``<uri>\\tlabel`` lines should expose URIs."""
    # Each line has the form "<URI><TAB>label"; the continuation line is
    # indented on purpose, exactly as in the fixture this test has always used.
    data = """<http://example.org/dummy>\tdummy
    <http://example.org/another>\tanother
    """

    sset = annif.corpus.SubjectSet(data)
    assert sset.has_uris()
    assert len(sset.subject_uris) == 2
    # The expected URIs are given without the surrounding angle brackets.
    assert "http://example.org/dummy" in sset.subject_uris
    assert "http://example.org/another" in sset.subject_uris
16
17
18
def test_subjectset_labels():
    """A SubjectSet parsed from label-only lines should expose labels, not URIs."""
    # One label per line, no URI column; the second line carries leading
    # whitespace, and the assertions below expect the bare label "another".
    data = """dummy
    another
    """

    sset = annif.corpus.SubjectSet(data)
    assert not sset.has_uris()
    assert len(sset.subject_labels) == 2
    assert "dummy" in sset.subject_labels
    assert "another" in sset.subject_labels
28
29
30
def test_docdir_key(tmpdir):
    """DocumentDirectory should pair each .txt file with its .key file.

    Three documents are created; only the first two have a matching ``.key``
    file. The test expects three (text path, subject path) pairs, with
    ``None`` as the subject path for the document that has no key file.
    """
    tmpdir.join('doc1.txt').write('doc1')
    tmpdir.join('doc1.key').write('key1')
    tmpdir.join('doc2.txt').write('doc2')
    tmpdir.join('doc2.key').write('key2')
    tmpdir.join('doc3.txt').write('doc3')

    docdir = annif.corpus.DocumentDirectory(str(tmpdir))
    # sorted() accepts any iterable, so no intermediate list is needed.
    # The text paths are distinct, so the pairs order by document name and
    # the None in the last pair is never compared.
    files = sorted(docdir)
    assert len(files) == 3
    assert files[0][0] == str(tmpdir.join('doc1.txt'))
    assert files[0][1] == str(tmpdir.join('doc1.key'))
    assert files[1][0] == str(tmpdir.join('doc2.txt'))
    assert files[1][1] == str(tmpdir.join('doc2.key'))
    assert files[2][0] == str(tmpdir.join('doc3.txt'))
    assert files[2][1] is None
46
47
48
def test_docdir_tsv(tmpdir):
    """DocumentDirectory should pair each .txt file with its .tsv file.

    Three documents are created; only the first two have a matching ``.tsv``
    file. The test expects three (text path, subject path) pairs, with
    ``None`` as the subject path for the document that has no subject file.
    """
    tmpdir.join('doc1.txt').write('doc1')
    tmpdir.join('doc1.tsv').write('<http://example.org/key1>\tkey1')
    tmpdir.join('doc2.txt').write('doc2')
    tmpdir.join('doc2.tsv').write('<http://example.org/key2>\tkey2')
    tmpdir.join('doc3.txt').write('doc3')

    docdir = annif.corpus.DocumentDirectory(str(tmpdir))
    # sorted() accepts any iterable, so no intermediate list is needed.
    # The text paths are distinct, so the pairs order by document name and
    # the None in the last pair is never compared.
    files = sorted(docdir)
    assert len(files) == 3
    assert files[0][0] == str(tmpdir.join('doc1.txt'))
    assert files[0][1] == str(tmpdir.join('doc1.tsv'))
    assert files[1][0] == str(tmpdir.join('doc2.txt'))
    assert files[1][1] == str(tmpdir.join('doc2.tsv'))
    assert files[2][0] == str(tmpdir.join('doc3.txt'))
    assert files[2][1] is None
64
65
66
def test_docdir_key_require_subjects(tmpdir):
    """With require_subjects=True, documents lacking a .key file are skipped.

    Three documents are created but only two have a ``.key`` file; the test
    expects exactly those two (text path, subject path) pairs.
    """
    # NOTE(review): these .key fixtures contain "<URI><TAB>label" content,
    # whereas test_docdir_key writes plain labels into its .key files --
    # presumably a copy-paste swap with the .tsv variant. Only paths are
    # asserted below, so the content is left unchanged; confirm and align.
    tmpdir.join('doc1.txt').write('doc1')
    tmpdir.join('doc1.key').write('<http://example.org/key1>\tkey1')
    tmpdir.join('doc2.txt').write('doc2')
    tmpdir.join('doc2.key').write('<http://example.org/key2>\tkey2')
    tmpdir.join('doc3.txt').write('doc3')

    docdir = annif.corpus.DocumentDirectory(str(tmpdir), require_subjects=True)
    # sorted() accepts any iterable, so no intermediate list is needed.
    files = sorted(docdir)
    assert len(files) == 2
    assert files[0][0] == str(tmpdir.join('doc1.txt'))
    assert files[0][1] == str(tmpdir.join('doc1.key'))
    assert files[1][0] == str(tmpdir.join('doc2.txt'))
    assert files[1][1] == str(tmpdir.join('doc2.key'))
80
81
82
def test_docdir_tsv_require_subjects(tmpdir):
    """With require_subjects=True, documents lacking a .tsv file are skipped.

    Three documents are created but only two have a ``.tsv`` file; the test
    expects exactly those two (text path, subject path) pairs.
    """
    # NOTE(review): these .tsv fixtures contain plain labels, whereas
    # test_docdir_tsv writes "<URI><TAB>label" content into its .tsv files --
    # presumably a copy-paste swap with the .key variant. Only paths are
    # asserted below, so the content is left unchanged; confirm and align.
    tmpdir.join('doc1.txt').write('doc1')
    tmpdir.join('doc1.tsv').write('key1')
    tmpdir.join('doc2.txt').write('doc2')
    tmpdir.join('doc2.tsv').write('key2')
    tmpdir.join('doc3.txt').write('doc3')

    docdir = annif.corpus.DocumentDirectory(str(tmpdir), require_subjects=True)
    # sorted() accepts any iterable, so no intermediate list is needed.
    files = sorted(docdir)
    assert len(files) == 2
    assert files[0][0] == str(tmpdir.join('doc1.txt'))
    assert files[0][1] == str(tmpdir.join('doc1.tsv'))
    assert files[1][0] == str(tmpdir.join('doc2.txt'))
    assert files[1][1] == str(tmpdir.join('doc2.tsv'))
96
97
98
def test_subjdir(tmpdir):
    """SubjectDirectory should yield one subject per file in the directory.

    Each fixture file starts with a "URI label" line followed by free text.
    The test expects every yielded subject to expose ``uri``, ``label`` and
    ``text`` attributes matching the file it came from.
    """
    # The continuation lines of each fixture are indented on purpose,
    # exactly as in the original fixture text.
    tmpdir.join('subj1.txt').write("""http://example.org/subj1 subject one
        first subject
        this is the first thing we know about""")
    tmpdir.join('subj2.txt').write("""http://example.org/subj2 subject two
        second subject
        this is the second thing we know about""")
    tmpdir.join('subj3.txt').write("""http://example.org/subj3 subject three
        third subject
        this is the third thing we know about""")

    subjdir = annif.corpus.SubjectDirectory(str(tmpdir))
    # Sort by URI so the assertions do not depend on iteration order;
    # sorted() accepts any iterable, so no intermediate list is needed.
    subjects = sorted(subjdir, key=lambda subj: subj.uri)
    assert len(subjects) == 3
    assert subjects[0].uri == 'http://example.org/subj1'
    assert subjects[0].label == 'subject one'
    assert 'first' in subjects[0].text
    assert subjects[1].uri == 'http://example.org/subj2'
    assert subjects[1].label == 'subject two'
    assert 'second' in subjects[1].text
    assert subjects[2].uri == 'http://example.org/subj3'
    assert subjects[2].label == 'subject three'
    assert 'third' in subjects[2].text
121