1
|
|
|
#!/usr/bin/env python2 |
2
|
|
|
|
3
|
|
|
# Copyright 2016 Red Hat Inc., Durham, North Carolina. |
4
|
|
|
# |
5
|
|
|
# This library is free software; you can redistribute it and/or |
6
|
|
|
# modify it under the terms of the GNU Lesser General Public |
7
|
|
|
# License as published by the Free Software Foundation; either |
8
|
|
|
# version 2 of the License, or (at your option) any later version. |
9
|
|
|
# |
10
|
|
|
# This library is distributed in the hope that it will be useful, |
11
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
12
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13
|
|
|
# Lesser General Public License for more details. |
14
|
|
|
# |
15
|
|
|
# You should have received a copy of the GNU Lesser General Public |
16
|
|
|
# License along with this library; if not, write to the Free Software |
17
|
|
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
18
|
|
|
# |
19
|
|
|
# Authors: |
20
|
|
|
# Martin Preisler <[email protected]> |
21
|
|
|
|
22
|
|
|
import subprocess |
23
|
|
|
import logging |
24
|
|
|
import re |
25
|
|
|
import json |
26
|
|
|
import codecs |
27
|
|
|
|
28
|
|
|
import ssg.xml |
29
|
|
|
|
30
|
|
|
|
31
|
|
|
# Input PDF of the PCI DSS standard; main() hands it to pdftohtml.
FILENAME = "PCI_DSS_v3-1.pdf"
# Output file into which main() dumps the reconstructed ID tree as JSON.
JSON_FILENAME = "PCI_DSS.json"
33
|
|
|
|
34
|
|
|
|
35
|
|
|
def autocorrect_pci_id(id_):
    """Return the corrected form of a known-bad PCI-DSS requirement ID.

    The only correction applied is "8.2.3a" -> "8.2.3.a"; every other value
    is returned unchanged.
    """
    return "8.2.3.a" if id_ == "8.2.3a" else id_
40
|
|
|
|
41
|
|
|
|
42
|
|
|
def is_applicable_to_os(id_):
    """Tell whether the requirement with the given ID is kept.

    Returns False for IDs that start with "1.", "9." or "12." and True for
    every other ID.
    """
    # Category "11." is supposedly not applicable on OS level either, but we
    # do have rules referencing it, so it is deliberately left out of this
    # tuple and stays in our Benchmarks for the time being.
    excluded_prefixes = ("1.", "9.", "12.")
    return not id_.startswith(excluded_prefixes)
56
|
|
|
|
57
|
|
|
|
58
|
|
|
# It is my understanding that this will match all valid PCI-DSS IDs.
# Compiled once at import time instead of being rebuilt for every element.
_PCI_ID_RE = re.compile(
    # number followed by a dot
    r"^[1-9][0-9]*\."
    # optional second section, number plus optional letter
    r"([1-9][0-9]*[a-z]?"
    # third section only if second section is present, number plus
    # optional letter
    r"(\.[1-9][0-9]*[a-z]?)?)?"
    # sometimes there is a suffix with just a letter, preceded by a dot
    r"(\.[a-z])?$"
)


def harvest_ids_descriptions(page, id_map):
    """Collect PCI-DSS requirement IDs and description excerpts from a page.

    Walks the direct ``text`` children of ``page``. A text element counts as
    a requirement when its first child is a ``b`` element whose (stripped,
    autocorrected) text looks like a PCI-DSS ID. The text following that
    ``b`` element (its tail) is stored as the description excerpt.

    :param page: XML element of one page; its ``number`` attribute is only
        used for logging.
    :param id_map: dict mapping requirement ID to description excerpt. It is
        updated in place; an ID already present is never overwritten.
    :returns: None
    """
    logging.debug("Harvesting page %s", page.get("number", "unknown"))

    for text in page.findall("./text"):
        # every text element describing a PCI DSS requirement will have
        # several properties we will exploit here

        # 1) some elements present
        if len(text) == 0:
            continue

        # 2) first element is b
        leading = text[0]
        if leading.tag != "b":
            continue

        # 3) the first element is b and contains a PCI-DSS requirement ID.
        # A <b> that only wraps other markup (e.g. <b><i>..</i></b>) has no
        # text of its own; skip it instead of crashing on None.strip().
        if leading.text is None:
            continue
        id_candidate = leading.text.strip()

        # PCI-DSS PDF contains ID mistakes, let's fix the known ones
        id_candidate = autocorrect_pci_id(id_candidate)

        if _PCI_ID_RE.match(id_candidate) is None:
            continue

        # now we are reasonably sure the text element describes a req ID
        logging.debug("This text describes req of ID '%s'.", id_candidate)

        if not is_applicable_to_os(id_candidate):
            logging.debug(
                "Req ID '%s' is not applicable on OS level.", id_candidate
            )
            continue

        # TODO: Would be great to get the entire description but that's very
        # complex to achieve
        description_excerpt = leading.tail
        if description_excerpt is None:
            continue

        description_excerpt = description_excerpt.strip()

        if id_candidate not in id_map:
            logging.debug(
                "Assigning '%s' as description excerpt for ID '%s'.",
                description_excerpt, id_candidate
            )
            id_map[id_candidate] = description_excerpt
        else:
            # It is normal to encounter this. The second encounters are
            # rationale guidances, the first encounter are descriptions
            logging.debug(
                "Not assigning '%s' as description excerpt for ID '%s'. This "
                "ID is already in the map!", description_excerpt, id_candidate
            )
127
|
|
|
|
128
|
|
|
|
129
|
|
|
def _pci_suffix_sort_key(item):
    """Build a natural-order sort key from the last component of item[0]."""
    suffix = item[0].rpartition(".")[2]
    letters = suffix.lstrip("0123456789")
    digits = suffix[:len(suffix) - len(letters)]
    if digits:
        # numeric components compare as integers, so "2" sorts before "10"
        return (0, int(digits), letters)
    # letter-only components ("a", "b") come after all numeric ones, as
    # they did with plain string comparison
    return (1, 0, letters)


def sort_pci_subtree(subtree):
    """Return the sibling nodes of a PCI-DSS subtree in natural ID order.

    Each item is a sequence whose first element is the requirement ID. Only
    the part after the last dot is compared: its leading number is compared
    numerically (a plain string comparison would put "6.5.10" before
    "6.5.2"), then any trailing letters alphabetically. An ID without a dot
    is compared as a whole.

    :param subtree: iterable of nodes such as ``(id, description, children)``
    :returns: new sorted list; the input is not modified
    """
    return sorted(subtree, key=_pci_suffix_sort_key)
131
|
|
|
|
132
|
|
|
|
133
|
|
|
def handle_id(id_, desc_, id_map, handled_ids):
    """Build the ``(id, description, children)`` subtree rooted at ``id_``.

    Marks ``id_`` as handled, then recursively does the same for every
    not-yet-handled entry of ``id_map`` that is a direct child of ``id_``,
    i.e. whose ID is ``id_`` plus exactly one more dot-separated component.

    :param id_: requirement ID of the subtree root
    :param desc_: description stored alongside ``id_`` in the result
    :param id_map: dict mapping every known ID to its description
    :param handled_ids: list of already processed IDs, appended to in place
    :returns: tuple ``(id_, desc_, children)`` where ``children`` is the
        result of ``sort_pci_subtree`` applied to the child subtrees
    """
    handled_ids.append(id_)

    # Top-level IDs already end with a dot ("8."), nested ones do not ("8.1")
    child_prefix = id_ if id_.endswith(".") else id_ + "."
    prefix_length = len(child_prefix)

    direct_children = []
    for candidate_id, candidate_desc in id_map.items():
        is_direct_child = (
            candidate_id not in handled_ids
            and candidate_id.startswith(child_prefix)
            # a dot in the remainder would mean a deeper descendant
            and "." not in candidate_id[prefix_length:]
        )
        if is_direct_child:
            direct_children.append(
                handle_id(candidate_id, candidate_desc, id_map, handled_ids)
            )

    return (id_, desc_, sort_pci_subtree(direct_children))
158
|
|
|
|
159
|
|
|
|
160
|
|
|
def main():
    """Convert the PCI DSS PDF into a JSON tree of requirement IDs.

    Runs ``pdftohtml`` on FILENAME, harvests requirement IDs and description
    excerpts from every page of its XML output, rebuilds the ID hierarchy
    and writes it to JSON_FILENAME. IDs that could not be placed in the
    hierarchy are reported with a warning.
    """
    logging.basicConfig(
        level=logging.DEBUG, format='%(levelname)s:%(message)s'
    )

    pdftohtml_command = ["pdftohtml", "-xml", "-i", "-stdout", FILENAME]
    xml_string = subprocess.check_output(pdftohtml_command, shell=False)
    document = ssg.xml.ElementTree.fromstring(xml_string)

    id_map = {}
    for page in document.findall("./page"):
        harvest_ids_descriptions(page, id_map)

    handled_ids = []
    id_tree = []

    # start with top level IDs: a number followed by a single dot
    top_level_id = re.compile("^[1-9][0-9]*\\.$")
    for id_, desc in id_map.items():
        if top_level_id.match(id_) is not None:
            handled_ids.append(id_)
            # for every top level ID, handle all direct children
            id_tree.append(handle_id(id_, desc, id_map, handled_ids))

    # top level IDs have different sorting rules: by their leading number
    id_tree.sort(key=lambda item: int(item[0].split(".", 1)[0]))

    for id_ in id_map:
        if id_ not in handled_ids:
            logging.warning(
                "id '%s' wasn't handled during PCI tree reconstruction!", id_
            )

    with codecs.open(JSON_FILENAME, "w", encoding="utf-8") as f:
        json.dump(id_tree, f)
200
|
|
|
|
201
|
|
|
|
202
|
|
|
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
204
|
|
|
|