Passed
Push — master ( ea7c4a...d61422 )
by Matěj
01:18 queued 12s
created

generate_pcidss_json.harvest_ids_descriptions()   B

Complexity

Conditions 8

Size

Total Lines 68
Code Lines 33

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 8
eloc 33
nop 2
dl 0
loc 68
rs 7.2213
c 0
b 0
f 0

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method: move a cohesive part of the method body into a new, well-named method.

1
#!/usr/bin/env python2
2
3
# Copyright 2016 Red Hat Inc., Durham, North Carolina.
4
#
5
# This library is free software; you can redistribute it and/or
6
# modify it under the terms of the GNU Lesser General Public
7
# License as published by the Free Software Foundation; either
8
# version 2 of the License, or (at your option) any later version.
9
#
10
# This library is distributed in the hope that it will be useful,
11
# but WITHOUT ANY WARRANTY; without even the implied warranty of
12
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
# Lesser General Public License for more details.
14
#
15
# You should have received a copy of the GNU Lesser General Public
16
# License along with this library; if not, write to the Free Software
17
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
#
19
# Authors:
20
#      Martin Preisler <[email protected]>
21
22
import subprocess
23
import logging
24
import re
25
import json
26
import codecs
27
28
import ssg.xml
29
30
31
# Name of the PCI-DSS PDF that main() passes to pdftohtml.
FILENAME = "PCI_DSS_v3-1.pdf"
32
# Name of the file main() writes the reconstructed requirement tree to.
JSON_FILENAME = "PCI_DSS.json"
33
34
35
def autocorrect_pci_id(id_):
    """Return `id_` with known PCI-DSS PDF typos corrected.

    IDs that have no known correction are returned unchanged.
    """
    # misspelled ID -> the spelling it should have had
    known_corrections = {
        "8.2.3a": "8.2.3.a",
    }
    return known_corrections.get(id_, id_)
40
41
42
def is_applicable_to_os(id_):
    """Return False for requirement IDs in categories 1, 9 and 12.

    Every other ID is reported as applicable (True).
    """
    # Category "11." is supposedly not applicable on OS level but we do have
    # rules referencing it, therefore it is deliberately not listed here for
    # the time being.
    excluded_prefixes = ("1.", "9.", "12.")
    return not id_.startswith(excluded_prefixes)
56
57
58
def harvest_ids_descriptions(page, id_map):
    """Harvest PCI-DSS requirement IDs and description excerpts from a page.

    Looks at every direct ``text`` child of `page`. A text element is taken
    to describe a requirement when its first child is a ``b`` element whose
    text looks like a PCI-DSS requirement ID. The text following that ``b``
    element (its tail) is used as the description excerpt.

    Args:
        page: XML element of one page; its ``number`` attribute is only used
            for logging.
        id_map: dict mapping requirement ID to description excerpt. It is
            updated in place; an ID that is already present is never
            overwritten.
    """
    logging.debug("Harvesting page %s", page.get("number", "unknown"))

    # It is my understanding that this will match all valid PCI-DSS IDs.
    # The pattern does not depend on the text element, so build it once
    # instead of once per element.
    id_regex = re.compile(
        # number followed by a dot
        r"^[1-9][0-9]*\."
        # second section, number plus optional letter
        r"([1-9][0-9]*[a-z]?"
        # third section only if second section is present, number plus
        # optional letter
        r"(\.[1-9][0-9]*[a-z]?)?)"
        # sometimes there is a suffix with just a letter, preceded by a dot
        r"?(\.[a-z])?$"
    )

    for text in page.findall("./text"):
        # every text element describing a PCI DSS requirement will have
        # several properties we will exploit here

        # 1) some elements present
        if len(text) == 0:
            continue

        # 2) first element is b
        first_child = text[0]
        if first_child.tag != "b":
            continue

        # 3) the first element is b and contains a PCI-DSS requirement ID.
        # An element without direct text (empty, or starting with nested
        # markup) has .text set to None and cannot carry an ID.
        if first_child.text is None:
            continue

        # PCI-DSS PDF contains ID mistakes, let's fix the known ones
        id_candidate = autocorrect_pci_id(first_child.text.strip())

        if id_regex.match(id_candidate) is None:
            continue

        # now we are reasonably sure the text element describes a req ID
        logging.debug("This text describes req of ID '%s'.", id_candidate)

        if not is_applicable_to_os(id_candidate):
            logging.debug(
                "Req ID '%s' is not applicable on OS level.", id_candidate
            )
            continue

        # TODO: Would be great to get the entire description but that's very
        # complex to achieve
        description_excerpt = first_child.tail

        if description_excerpt is None:
            continue

        description_excerpt = description_excerpt.strip()

        if id_candidate not in id_map:
            logging.debug(
                "Assigning '%s' as description excerpt for ID '%s'.",
                description_excerpt, id_candidate
            )
            id_map[id_candidate] = description_excerpt

        else:
            # It is normal to encounter this. The second encounters are
            # rationale guidances, the first encounter are descriptions
            logging.debug(
                "Not assigning '%s' as description excerpt for ID '%s'. This "
                "ID is already in the map!", description_excerpt, id_candidate
            )
127
128
129
def sort_pci_subtree(subtree):
    """Return the sibling nodes of `subtree` sorted by their last ID segment.

    Each item is a sequence whose first element is the requirement ID. The
    part of the ID after the last dot is the sort key. A leading number in
    that part is compared numerically, so "6.5.2" sorts before "6.5.10";
    anything after the number (e.g. the "a" in "2a") breaks ties. Segments
    without a leading number, such as the "a" in "8.2.3.a", sort after all
    numbered segments and alphabetically among themselves.

    Returns a new list; `subtree` itself is not modified.
    """
    def last_segment_key(item):
        # rpartition copes with IDs that contain no dot at all
        segment = item[0].rpartition(".")[2]
        remainder = segment.lstrip("0123456789")
        digits = segment[:len(segment) - len(remainder)]
        if not digits:
            # purely non-numeric segment: goes after the numbered ones
            return (1, 0, segment)
        return (0, int(digits), remainder)

    return sorted(subtree, key=last_segment_key)
131
132
133
def handle_id(id_, desc_, id_map, handled_ids):
    """Build the tree node for `id_`, recursing into its direct children.

    Args:
        id_: requirement ID the node is built for.
        desc_: description stored in the node.
        id_map: dict of all known IDs and their descriptions.
        handled_ids: list of IDs already placed in the tree; `id_` is
            appended to it, as is every descendant that gets handled.

    Returns:
        A tuple ``(id_, desc_, children)`` where ``children`` are the nodes
        of the direct children, ordered by sort_pci_subtree().
    """
    handled_ids.append(id_)

    # children of "1.2" start with "1.2.", children of "1." with "1."
    prefix = id_ if id_.endswith(".") else id_ + "."
    prefix_length = len(prefix)

    child_nodes = []
    for candidate_id, candidate_desc in id_map.items():
        # a direct child is not handled yet, shares our prefix and has no
        # further dot after it (otherwise it is a deeper descendant)
        is_direct_child = (
            candidate_id not in handled_ids
            and candidate_id.startswith(prefix)
            and "." not in candidate_id[prefix_length:]
        )
        if is_direct_child:
            child_nodes.append(
                handle_id(candidate_id, candidate_desc, id_map, handled_ids)
            )

    return (id_, desc_, sort_pci_subtree(child_nodes))
158
159
160
def main():
    """Convert the PCI-DSS PDF into a JSON tree of requirements.

    Runs ``pdftohtml`` on FILENAME, harvests requirement IDs and description
    excerpts from every page of the resulting XML, rebuilds the ID hierarchy
    and dumps it as JSON into JSON_FILENAME. IDs that could not be placed in
    the hierarchy are reported with a warning.
    """
    logging.basicConfig(level=logging.DEBUG,
                        format='%(levelname)s:%(message)s')

    pdftohtml_command = ["pdftohtml", "-xml", "-i", "-stdout", FILENAME]
    xml_string = subprocess.check_output(pdftohtml_command, shell=False)
    document = ssg.xml.ElementTree.fromstring(xml_string)

    id_map = {}
    for page in document.findall("./page"):
        harvest_ids_descriptions(page, id_map)

    handled_ids = []
    top_level_nodes = []
    top_level_id = re.compile("^[1-9][0-9]*\\.$")

    # start with top level IDs; for every one of them handle all direct
    # children
    for id_, desc in id_map.items():
        if top_level_id.match(id_) is not None:
            handled_ids.append(id_)
            top_level_nodes.append(handle_id(id_, desc, id_map, handled_ids))

    # top level IDs have different sorting rules: order by the leading number
    top_level_nodes.sort(key=lambda node: int(node[0].split(".", 1)[0]))

    for id_ in id_map:
        if id_ not in handled_ids:
            logging.warning(
                "id '%s' wasn't handled during PCI tree reconstruction!", id_
            )

    with codecs.open(JSON_FILENAME, "w", encoding="utf-8") as json_file:
        json.dump(top_level_nodes, json_file)
200
201
202
# Call main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
203
    main()
204