Completed
Pull Request — master (#1098)
by Mischa
01:43
created

coalib.bearlib.languages.documentation.extract_documentation()   B

Complexity

Conditions 1

Size

Total Lines 27

Duplication

Lines 0
Ratio 0 %
Metric Value
cc 1
dl 0
loc 27
rs 8.8571
1
import re
2
3
from coalib.bearlib.languages.documentation.DocstyleDefinition import (
4
    DocstyleDefinition)
5
from coalib.bearlib.languages.documentation.DocumentationComment import (
6
    DocumentationComment)
7
from coalib.results.TextRange import TextRange
8
9
10
# TODO: Implement Match also for `split` and `search_for`? File an issue.
11
# TODO: Document the currently existing docstyles on the coala side?
12
# TODO: Add ''' ''' as markers for Python 2/3 inside the doc-definition files.
13
14
15
def extract_documentation_with_docstyle(content, docstyle_definition):
    """
    Extracts all documentation texts inside the given source-code-string.

    Each marker set in ``docstyle_definition.markers`` is indexed as a
    sequence of three strings: ``marker_set[0]`` is the begin marker,
    ``marker_set[1]`` the each-line marker and ``marker_set[2]`` the end
    marker.

    :param content:             The source-code-string where to extract
                                documentation from or an iterable with strings
                                where each string is a single line (including
                                ending whitespaces like `\\n`).
    :param docstyle_definition: The DocstyleDefinition that identifies the
                                documentation comments. Only its `markers`
                                attribute is read here; the object itself is
                                passed on to each DocumentationComment.
    :return:                    An iterator yielding a DocumentationComment
                                for each documentation text found in the
                                content, in order of appearance.
    """
    if isinstance(content, str):
        content = content.splitlines(keepends=True)
    else:
        content = list(content)

    # Used to break out of outer loops via exception raise.
    class BreakOut(Exception):
        pass

    # Prepare marker-tuple dict that maps a begin pattern to the corresponding
    # marker_set(s). This makes it faster to retrieve a marker-set from a
    # begin sequence we initially want to search for in source code. Then
    # the possible found documentation match is processed further with the
    # rest markers.
    begin_sequence_dict = {}
    for marker_set in docstyle_definition.markers:
        begin_sequence_dict.setdefault(marker_set[0], []).append(marker_set)

    if not begin_sequence_dict:
        # Without any marker set there is nothing to look for. An empty
        # alternation would compile to a regex matching the empty string
        # everywhere, and the lookup of that empty match would raise KeyError.
        return

    # Using regexes to perform a variable match is faster than finding each
    # substring with `str.find()` choosing the lowest match.
    begin_regex = re.compile("|".join(
        re.escape(marker_set[0])
        for marker_set in docstyle_definition.markers))

    line = 0
    line_pos = 0
    while line < len(content):
        begin_match = begin_regex.search(content[line], line_pos)

        if begin_match:
            begin_match_line = line
            # Prevents infinite loop when the start marker matches but not the
            # complete documentation comment.
            line_pos = begin_match.end()

            # begin_sequence_dict[begin_match.group()] returns the marker set
            # the begin sequence from before matched.
            for marker_set in begin_sequence_dict[begin_match.group()]:
                try:
                    # If the each-line marker and the end marker do equal,
                    # search for the each-line marker until it runs out.
                    if marker_set[1] == marker_set[2]:
                        docstring = content[line][begin_match.end():]

                        # Now the each-line marker is no requirement for a
                        # docstring any more, just extract as long as there are
                        # each-line markers. Checking the bound first keeps a
                        # begin marker on the very last line from indexing
                        # past the end of content.
                        line2 = line + 1
                        while line2 < len(content):
                            stripped_content = content[line2].lstrip()
                            if not stripped_content.startswith(marker_set[1]):
                                break

                            docstring += stripped_content[len(marker_set[1]):]
                            line2 += 1

                        # The comment ends at the end of the last line that
                        # still belonged to it.
                        line = line2 - 1
                        line_pos = len(content[line])
                    else:
                        end_marker_pos = content[line].find(marker_set[2],
                                                            begin_match.end())

                        if end_marker_pos == -1:
                            docstring = content[line][begin_match.end():]

                            line2 = line + 1
                            if line2 >= len(content):
                                # End of content reached, so there's no
                                # closing marker and that's a mismatch.
                                raise BreakOut

                            end_marker_pos = content[line2].find(marker_set[2])

                            while end_marker_pos == -1:
                                if marker_set[1] == "":
                                    # When no each-line marker is set (i.e. for
                                    # Python docstrings), then align the
                                    # comment to the start-marker.
                                    stripped_content = (
                                        content[line2][begin_match.start():])
                                else:
                                    # Check whether we violate the each-line
                                    # marker "rule".
                                    current_each_line_marker = (content[line2]
                                        [begin_match.start():
                                         begin_match.start()
                                             + len(marker_set[1])])
                                    if (current_each_line_marker !=
                                            marker_set[1]):
                                        # Effectively a 'continue' for the
                                        # outer for-loop.
                                        raise BreakOut

                                    stripped_content = (
                                        content[line2][begin_match.start()
                                                       + len(marker_set[1]):])

                                # TODO Test also other C style doccomments

                                docstring += stripped_content
                                line2 += 1

                                if line2 >= len(content):
                                    # End of content reached, so there's no
                                    # closing marker and that's a mismatch.
                                    raise BreakOut

                                end_marker_pos = content[line2].find(
                                    marker_set[2])

                            docstring += (content[line2]
                                [begin_match.start():end_marker_pos])
                            # Only advance the scan position once the whole
                            # comment matched, so a failed marker set leaves
                            # `line` untouched for the next candidate.
                            line = line2
                        else:
                            docstring = (content[line]
                                [begin_match.end():end_marker_pos])

                        line_pos = end_marker_pos + len(marker_set[2])

                    # TextRange positions are 1-based, the local counters are
                    # 0-based.
                    rng = TextRange.from_values(begin_match_line + 1,
                                                begin_match.start() + 1,
                                                line + 1,
                                                line_pos + 1)

                    yield DocumentationComment(docstring,
                                               docstyle_definition,
                                               marker_set,
                                               rng)

                    break

                except BreakOut:
                    # Continues the marker_set loop.
                    pass

        else:
            line += 1
            line_pos = 0
173
174
175
def extract_documentation(content, language, docstyle):
    """
    Extracts all documentation texts inside the given source-code-string,
    using the coala docstyle definition file that belongs to the given
    language and docstyle.

    The docstyle definition is loaded via `DocstyleDefinition.load()` and the
    actual extraction is delegated to `extract_documentation_with_docstyle()`.

    The documentation texts are sorted by their order appearing in `content`.

    For more information about how documentation comments are identified and
    extracted, see DocstyleDefinition.doctypes enumeration.

    :param content:            The source-code-string where to extract
                               documentation from.
    :param language:           The programming language used.
    :param docstyle:           The documentation style/tool used
                               (i.e. doxygen).
    :raises FileNotFoundError: Raised when the docstyle definition file was not
                               found. This is a compatibility exception from
                               `coalib.misc.Compatability` module.
    :raises KeyError:          Raised when the given language is not defined in
                               given docstyle.
    :raises ValueError:        Raised when a docstyle definition setting has an
                               invalid format.
    :return:                   An iterator returning each DocumentationComment
                               found in the content.
    """
    return extract_documentation_with_docstyle(
        content,
        DocstyleDefinition.load(language, docstyle))
202