|
1
|
|
|
#!/usr/bin/env python2 |
|
2
|
|
|
|
|
3
|
|
|
# Copyright 2016 Red Hat Inc., Durham, North Carolina. |
|
4
|
|
|
# |
|
5
|
|
|
# This library is free software; you can redistribute it and/or |
|
6
|
|
|
# modify it under the terms of the GNU Lesser General Public |
|
7
|
|
|
# License as published by the Free Software Foundation; either |
|
8
|
|
|
# version 2 of the License, or (at your option) any later version. |
|
9
|
|
|
# |
|
10
|
|
|
# This library is distributed in the hope that it will be useful, |
|
11
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
12
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
13
|
|
|
# Lesser General Public License for more details. |
|
14
|
|
|
# |
|
15
|
|
|
# You should have received a copy of the GNU Lesser General Public |
|
16
|
|
|
# License along with this library; if not, write to the Free Software |
|
17
|
|
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
18
|
|
|
# |
|
19
|
|
|
# Authors: |
|
20
|
|
|
# Martin Preisler <[email protected]> |
|
21
|
|
|
|
|
22
|
|
|
import subprocess |
|
23
|
|
|
import logging |
|
24
|
|
|
import re |
|
25
|
|
|
import json |
|
26
|
|
|
import codecs |
|
27
|
|
|
|
|
28
|
|
|
import ssg.xml |
|
29
|
|
|
|
|
30
|
|
|
|
|
31
|
|
|
# PDF file handed to pdftohtml by main() as the conversion input.
FILENAME = "PCI_DSS_v3-1.pdf"
# File main() writes the reconstructed requirement tree to, as JSON.
JSON_FILENAME = "PCI_DSS.json"
|
33
|
|
|
|
|
34
|
|
|
|
|
35
|
|
|
def autocorrect_pci_id(id_):
    """Return the corrected form of a PCI-DSS requirement ID.

    The only known mistake handled is "8.2.3a", which lacks the dot in
    front of its letter suffix. Every other value is returned unchanged.
    """
    return "8.2.3.a" if id_ == "8.2.3a" else id_
|
40
|
|
|
|
|
41
|
|
|
|
|
42
|
|
|
def is_applicable_to_os(id_):
    """Tell whether a PCI-DSS requirement ID is kept as OS-level content.

    IDs belonging to categories 1, 9 and 12 are rejected, everything else
    is accepted.
    """
    # Category "11." is supposedly not applicable on OS level either, but we
    # do have rules referencing it, therefore it stays in our Benchmarks for
    # the time being and is deliberately missing from this tuple.
    excluded_prefixes = ("1.", "9.", "12.")
    return not id_.startswith(excluded_prefixes)
|
56
|
|
|
|
|
57
|
|
|
|
|
58
|
|
|
def harvest_ids_descriptions(page, id_map):
    """Collect PCI-DSS requirement IDs and description excerpts from a page.

    Walks the direct ``text`` children of ``page``. For each one whose first
    child is a ``b`` element holding something shaped like a PCI-DSS
    requirement ID, the text that follows the ``b`` element is stored in
    ``id_map`` under that ID.

    Args:
        page: XML element of a single page; its ``number`` attribute is only
            used for logging.
        id_map: dict mapping requirement ID to description excerpt. It is
            updated in place; IDs already present are never overwritten.
    """
    logging.debug("Harvesting page %s", page.get("number", "unknown"))

    # It is my understanding that this will match all valid PCI-DSS IDs.
    # Compiled once here instead of being rebuilt for every text element.
    id_regex = re.compile(
        # number followed by a dot
        "^[1-9][0-9]*\\."
        # second section, number plus optional letter
        "([1-9][0-9]*[a-z]?"
        # third section only if second section is present, number plus
        # optional letter
        "(\\.[1-9][0-9]*[a-z]?)?)"
        # sometimes there is a suffix with just a letter, preceded by a dot
        "?(\\.[a-z])?$"
    )

    for text in page.findall("./text"):
        # every text element describing a PCI DSS requirement will have
        # several properties we will exploit here

        # 1) some elements present
        if len(text) == 0:
            continue

        # 2) first element is b
        leading = text[0]
        if leading.tag != "b":
            continue

        # 3) the first element is b and contains a PCI-DSS requirement ID.
        # ElementTree reports .text as None when the element has no direct
        # text (e.g. <b><i>...</i></b>); such an element cannot carry an ID.
        if leading.text is None:
            continue

        # PCI-DSS PDF contains ID mistakes, let's fix the known ones
        id_candidate = autocorrect_pci_id(leading.text.strip())

        if id_regex.match(id_candidate) is None:
            continue

        # now we are reasonably sure the text element describes a req ID
        logging.debug("This text describes req of ID '%s'.", id_candidate)

        if not is_applicable_to_os(id_candidate):
            logging.debug(
                "Req ID '%s' is not applicable on OS level.", id_candidate
            )
            continue

        # TODO: Would be great to get the entire description but that's very
        # complex to achieve
        description_excerpt = leading.tail
        if description_excerpt is None:
            continue

        description_excerpt = description_excerpt.strip()

        if id_candidate not in id_map:
            logging.debug(
                "Assigning '%s' as description excerpt for ID '%s'.",
                description_excerpt, id_candidate
            )
            id_map[id_candidate] = description_excerpt
        else:
            # It is normal to encounter this. The second encounters are
            # rationale guidances, the first encounter are descriptions
            logging.debug(
                "Not assigning '%s' as description excerpt for ID '%s'. This "
                "ID is already in the map!", description_excerpt, id_candidate
            )
|
127
|
|
|
|
|
128
|
|
|
|
|
129
|
|
|
def sort_pci_subtree(subtree):
    """Return sibling nodes of a PCI-DSS subtree in natural order.

    Args:
        subtree: iterable of nodes whose first item is the requirement ID,
            for example ``("6.5.10", description, children)``. Every ID
            must contain a dot.

    Returns:
        A new list ordered by the last dot-separated component of each ID.
        The numeric part is compared as an integer, so "6.5.2" precedes
        "6.5.10" (a plain string comparison would put "10" first). A letter
        after the number ("2.2a") orders entries sharing that number, and
        letter-only components ("8.2.3.a") come after the numbered ones.

    Raises:
        IndexError: if an ID contains no dot at all.
    """

    def last_component_key(item):
        component = item[0].rsplit(".", 1)[1]
        digits = re.match("[0-9]*", component).group(0)
        # A uniform tuple keeps the comparison well defined on both
        # Python 2 and 3: numbered components first (by value), then the
        # letter-only ones, ties broken by the trailing letter.
        return (digits == "", int(digits or 0), component[len(digits):])

    return sorted(subtree, key=last_component_key)
|
131
|
|
|
|
|
132
|
|
|
|
|
133
|
|
|
def handle_id(id_, desc_, id_map, handled_ids):
    """Build the tree node for one requirement ID, recursing into children.

    Records ``id_`` in ``handled_ids``, then scans ``id_map`` for IDs that
    are not handled yet and extend ``id_`` by exactly one dot-separated
    component; each of those becomes a child node built the same way.

    Args:
        id_: requirement ID of the node being built.
        desc_: description stored in the node.
        id_map: dict mapping every known requirement ID to its description.
        handled_ids: list of IDs already placed in the tree; appended to.

    Returns:
        Tuple ``(id_, desc_, children)`` where ``children`` is the list of
        child nodes in the order produced by ``sort_pci_subtree``.
    """
    handled_ids.append(id_)

    # IDs that already end with a dot are used as the prefix verbatim.
    prefix = id_ if id_.endswith(".") else id_ + "."
    prefix_len = len(prefix)

    subtree = []
    for candidate, candidate_desc in id_map.items():
        is_direct_child = (
            candidate not in handled_ids
            and candidate.startswith(prefix)
            # a dot in the remainder means a grandchild or deeper
            and "." not in candidate[prefix_len:]
        )
        if is_direct_child:
            subtree.append(
                handle_id(candidate, candidate_desc, id_map, handled_ids)
            )

    return (id_, desc_, sort_pci_subtree(subtree))
|
158
|
|
|
|
|
159
|
|
|
|
|
160
|
|
|
def main():
    """Convert the PCI-DSS PDF into a JSON tree of requirements.

    Runs ``pdftohtml`` on FILENAME, harvests requirement IDs and description
    excerpts from every page, rebuilds the requirement hierarchy and dumps
    it to JSON_FILENAME. IDs that could not be placed in the hierarchy are
    reported with a warning.
    """
    logging.basicConfig(
        level=logging.DEBUG, format='%(levelname)s:%(message)s'
    )

    pdftohtml_cmd = ["pdftohtml", "-xml", "-i", "-stdout", FILENAME]
    pdf_xml = subprocess.check_output(pdftohtml_cmd, shell=False)
    root = ssg.xml.ElementTree.fromstring(pdf_xml)

    id_map = {}
    for page in root.findall("./page"):
        harvest_ids_descriptions(page, id_map)

    handled_ids = []
    id_tree = []
    top_level_re = re.compile("^[1-9][0-9]*\\.$")

    # start with top level IDs
    for id_, desc in id_map.items():
        if top_level_re.match(id_) is not None:
            handled_ids.append(id_)
            # for every top level ID, handle all direct children
            id_tree.append(handle_id(id_, desc, id_map, handled_ids))

    # top level IDs have different sorting rules
    id_tree.sort(key=lambda node: int(node[0].split(".", 1)[0]))

    for id_ in id_map:
        if id_ not in handled_ids:
            logging.warning(
                "id '%s' wasn't handled during PCI tree reconstruction!", id_
            )

    with codecs.open(JSON_FILENAME, "w", encoding="utf-8") as json_file:
        json.dump(id_tree, json_file)
|
200
|
|
|
|
|
201
|
|
|
|
|
202
|
|
|
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
204
|
|
|
|