generate_pcidss_json.harvest_ids_descriptions() - Code Metrics - Inspection of "Merge pull request #6005 from yuumasato/test_updat..." - ComplianceAsCode/content - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( ea7c4a...d61422 )

by Matěj

created 2020-08-17 11:55 UTC

generate_pcidss_json.harvest_ids_descriptions() B

↳ Parent: generate_pcidss_json

Complexity

Conditions

Size

Total Lines	68
Code Lines	33

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	8
eloc	33
nop	2
dl	0
loc	68
rs	7.2213
c	0
b	0
f	0

How to fix Long Method

#!/usr/bin/env python2

# Copyright 2016 Red Hat Inc., Durham, North Carolina.
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Authors:
#      Martin Preisler <[email protected]>

import subprocess
import logging
import re
import json
import codecs

import ssg.xml


# Input PDF handed to the external "pdftohtml" tool in main().
FILENAME = "PCI_DSS_v3-1.pdf"
# Output file that main() writes the reconstructed requirement tree to.
JSON_FILENAME = "PCI_DSS.json"


def autocorrect_pci_id(id_):
    """Return *id_* with known typos of the PCI-DSS PDF corrected.

    Only one mistake is currently known: "8.2.3a" is missing the dot before
    its letter suffix. Every other ID is returned unchanged.
    """
    return "8.2.3.a" if id_ == "8.2.3a" else id_


def is_applicable_to_os(id_):
    """Tell whether the requirement *id_* is kept for OS level content.

    Requirements of categories "1.", "9." and "12." are filtered out, all
    other IDs are considered applicable.
    """
    # Category "11." is supposedly not applicable on OS level but we do have
    # rules referencing it, therefore it is deliberately not listed here and
    # stays in our Benchmarks for the time being.
    not_applicable_prefixes = ("1.", "9.", "12.")

    return not id_.startswith(not_applicable_prefixes)


# It is my understanding that this will match all valid PCI-DSS IDs.
# Compiled once at import time instead of being rebuilt for every text element.
_PCI_ID_PATTERN = re.compile(
    # number followed by a dot
    r"^[1-9][0-9]*\."
    # second section, number plus optional letter
    r"([1-9][0-9]*[a-z]?"
    # third section only if second section is present, number plus
    # optional letter
    r"(\.[1-9][0-9]*[a-z]?)?)"
    # sometimes there is a suffix with just a letter, preceded by a dot
    r"?(\.[a-z])?$"
)


def harvest_ids_descriptions(page, id_map):
    """Collect PCI-DSS requirement IDs and description excerpts from a page.

    Every direct "text" child of *page* whose first child element is a "b"
    element holding a valid, OS-applicable requirement ID contributes one
    entry to *id_map*.

    :param page: element whose "./text" children are scanned; its "number"
        attribute is only used for logging
    :param id_map: dict mapping requirement ID to description excerpt; it is
        updated in place and only the first excerpt seen for an ID is kept
    :returns: None
    """
    logging.debug("Harvesting page %s", page.get("number", "unknown"))

    for text in page.findall("./text"):
        # every text element describing a PCI DSS requirement will have
        # several properties we will exploit here

        # 1) some elements present
        if len(text) == 0:
            continue

        # 2) first element is b
        first_child = text[0]
        if first_child.tag != "b":
            continue

        # 3) the first element is b and contains a PCI-DSS requirement ID.
        # A "b" element without direct text (e.g. one that only wraps another
        # element) cannot hold an ID; skip it instead of crashing on None.
        raw_id = first_child.text
        if raw_id is None:
            continue

        # PCI-DSS PDF contains ID mistakes, let's fix the known ones
        id_candidate = autocorrect_pci_id(raw_id.strip())

        if _PCI_ID_PATTERN.match(id_candidate) is None:
            continue

        # now we are reasonably sure the text element describes a req ID
        logging.debug("This text describes req of ID '%s'.", id_candidate)

        if not is_applicable_to_os(id_candidate):
            logging.debug(
                "Req ID '%s' is not applicable on OS level.", id_candidate
            )
            continue

        # TODO: Would be great to get the entire description but that's very
        # complex to achieve
        description_excerpt = first_child.tail

        if description_excerpt is None:
            continue

        description_excerpt = description_excerpt.strip()

        if id_candidate not in id_map:
            logging.debug(
                "Assigning '%s' as description excerpt for ID '%s'.",
                description_excerpt, id_candidate
            )
            id_map[id_candidate] = description_excerpt

        else:
            # It is normal to encounter this. The second encounters are
            # rationale guidances, the first encounter are descriptions
            logging.debug(
                "Not assigning '%s' as description excerpt for ID '%s'. This "
                "ID is already in the map!", description_excerpt, id_candidate
            )


def _pci_suffix_sort_key(item):
    """Build a sort key from the last dot-separated component of item[0].

    Components starting with digits are ordered by their numeric value first
    and by the remaining characters second ("2" < "2a" < "10"). Components
    without leading digits (e.g. the letter suffix "a") sort after all
    numeric ones, ordered as plain strings.
    """
    # [-1] instead of [1] so that an ID without any dot does not raise
    suffix = item[0].rsplit(".", 1)[-1]
    digit_count = len(suffix) - len(suffix.lstrip("0123456789"))

    if digit_count == 0:
        return (1, 0, suffix)

    return (0, int(suffix[:digit_count]), suffix[digit_count:])


def sort_pci_subtree(subtree):
    """Return the sibling nodes of *subtree* as a new, sorted list.

    Each node is a sequence whose first item is the requirement ID. Siblings
    are ordered by the last component of that ID; numeric components compare
    as numbers so that e.g. "6.5.10" comes after "6.5.2" rather than before
    it, which a plain string comparison would produce.
    """
    return sorted(subtree, key=_pci_suffix_sort_key)


def handle_id(id_, desc_, id_map, handled_ids):
    """Recursively build the tree node for requirement *id_*.

    :param id_: requirement ID the node is built for
    :param desc_: description stored in the node
    :param id_map: dict of all known IDs and their descriptions
    :param handled_ids: list of IDs already placed in the tree; *id_* and
        every descendant found here are appended to it
    :returns: tuple (id_, desc_, sorted list of child nodes)
    """
    handled_ids.append(id_)

    # direct children share the parent's ID followed by a dot
    prefix = id_ if id_.endswith(".") else id_ + "."

    subtrees = []
    for candidate_id, candidate_desc in id_map.items():
        if candidate_id in handled_ids or not candidate_id.startswith(prefix):
            continue

        # a further dot after the prefix means a deeper descendant,
        # not a direct child
        if "." in candidate_id[len(prefix):]:
            continue

        subtrees.append(
            handle_id(candidate_id, candidate_desc, id_map, handled_ids)
        )

    return (id_, desc_, sort_pci_subtree(subtrees))


def main():
    """Convert the PCI-DSS PDF into a JSON tree of requirements.

    Runs "pdftohtml" on FILENAME, harvests requirement IDs with their
    description excerpts from every page, rebuilds the ID hierarchy and
    dumps it to JSON_FILENAME.
    """
    logging.basicConfig(level=logging.DEBUG,
                        format='%(levelname)s:%(message)s')

    pdftohtml_cmd = ["pdftohtml", "-xml", "-i", "-stdout", FILENAME]
    xml_string = subprocess.check_output(pdftohtml_cmd, shell=False)

    id_map = {}
    document = ssg.xml.ElementTree.fromstring(xml_string)
    for page in document.findall("./page"):
        harvest_ids_descriptions(page, id_map)

    handled_ids = []
    id_tree = []

    # start with top level IDs, i.e. a single number followed by a dot
    for id_, desc in id_map.items():
        if re.match("^[1-9][0-9]*\\.$", id_) is not None:
            handled_ids.append(id_)
            # for every top level ID, handle all direct children
            id_tree.append(handle_id(id_, desc, id_map, handled_ids))

    # top level IDs have different sorting rules: by their integer value
    id_tree.sort(key=lambda item: int(item[0].split(".", 1)[0]))

    # report everything that did not end up anywhere in the tree
    for id_ in id_map.keys():
        if id_ not in handled_ids:
            logging.warning(
                "id '%s' wasn't handled during PCI tree reconstruction!", id_
            )

    with codecs.open(JSON_FILENAME, "w", encoding="utf-8") as f:
        json.dump(id_tree, f)


# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


1			#!/usr/bin/env python2
2
3			# Copyright 2016 Red Hat Inc., Durham, North Carolina.
4			#
5			# This library is free software; you can redistribute it and/or
6			# modify it under the terms of the GNU Lesser General Public
7			# License as published by the Free Software Foundation; either
8			# version 2 of the License, or (at your option) any later version.
9			#
10			# This library is distributed in the hope that it will be useful,
11			# but WITHOUT ANY WARRANTY; without even the implied warranty of
12			# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13			# Lesser General Public License for more details.
14			#
15			# You should have received a copy of the GNU Lesser General Public
16			# License along with this library; if not, write to the Free Software
17			# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18			#
19			# Authors:
20			# Martin Preisler <[email protected]>
21
22			import subprocess
23			import logging
24			import re
25			import json
26			import codecs
27
28			import ssg.xml
29
30
31			FILENAME = "PCI_DSS_v3-1.pdf"
32			JSON_FILENAME = "PCI_DSS.json"
33
34
35			def autocorrect_pci_id(id_):
36			if id_ == "8.2.3a":
37			return "8.2.3.a"
38
39			return id_
40
41
42			def is_applicable_to_os(id_):
43			if id_.startswith("1."):
44			return False
45			elif id_.startswith("9."):
46			return False
47			# Category "11." is supposedly not applicable on OS level but we do have
48			# rules referencing it, therefore let's keep it in our Benchmarks for the
49			# time being
50			#elif id_.startswith("11."):
51			# return False
52			elif id_.startswith("12."):
53			return False
54
55			return True
56
57
58			def harvest_ids_descriptions(page, id_map):
59			logging.debug("Harvesting page %s", page.get("number", "unknown"))
60
61			for text in page.findall("./text"):
62			# every text element describing a PCI DSS requirement will have
63			# several properties we will exploit here
64
65			# 1) some elements present
66			if len(text) == 0:
67			continue
68
69			# 2) first element is b
70			if text[0].tag != "b":
71			continue
72
73			# 3) the first element is b and contains a PCI-DSS requirement ID
74			id_candidate = text[0].text.strip()
75
76			# PCI-DSS PDF contains ID mistakes, let's fix the known ones
77			id_candidate = autocorrect_pci_id(id_candidate)
78
79			# It is my understanding that this will match all valid PCI-DSS IDs
80			id_pattern = ""
81
82			# number followed by a dot
83			id_pattern += "^[1-9][0-9]*\\."
84			# second section, number plus optional letter
85			id_pattern += "([1-9][0-9]*[a-z]?"
86			# third section only if second section is present, number plus
87			# optional letter
88			id_pattern += "(\\.[1-9][0-9]*[a-z]?)?)"
89			# sometimes there is a suffix with just a letter, preceded by a dot
90			id_pattern += "?(\\.[a-z])?$"
91
92			if re.match(id_pattern, id_candidate) is None:
93			continue
94
95			# now we are reasonably sure the text element describes a req ID
96			logging.debug("This text describes req of ID '%s'.", id_candidate)
97
98			if not is_applicable_to_os(id_candidate):
99			logging.debug(
100			"Req ID '%s' is not applicable on OS level.", id_candidate
101			)
102			continue
103
104			# TODO: Would be great to get the entire description but that's very
105			# complex to achieve
106			description_excerpt = text[0].tail
107
108			if description_excerpt is None:
109			continue
110
111			description_excerpt = description_excerpt.strip()
112
113			if id_candidate not in id_map:
114			logging.debug(
115			"Assigning '%s' as description excerpt for ID '%s'.",
116			description_excerpt, id_candidate
117			)
118			id_map[id_candidate] = description_excerpt
119
120			else:
121			# It is normal to encounter this. The second encounters are
122			# rationale guidances, the first encounter are descriptions
123			logging.debug(
124			"Not assigning '%s' as description excerpt for ID '%s'. This "
125			"ID is already in the map!", description_excerpt, id_candidate
126			)
127
128
129			def sort_pci_subtree(subtree):
130			return sorted(subtree, key=lambda item: item[0].rsplit(".", 1)[1])
131
132
133			def handle_id(id_, desc_, id_map, handled_ids):
134			handled_ids.append(id_)
135
136			full_prefix = id_
137			if not full_prefix.endswith("."):
138			full_prefix += "."
139
140			children = []
141			for child_id, child_desc in id_map.items():
142			if child_id in handled_ids:
143			continue
144
145			if not child_id.startswith(full_prefix):
146			continue
147
148			id_suffix = child_id[len(full_prefix):]
149
150			if "." in id_suffix:
151			# not a direct child
152			continue
153
154			# it passed all our requirements, it must be a direct child
155			children.append(handle_id(child_id, child_desc, id_map, handled_ids))
156
157			return (id_, desc_, sort_pci_subtree(children))
158
159
160			def main():
161			logging.basicConfig(format='%(levelname)s:%(message)s',
162			level=logging.DEBUG)
163
164			xml_string = subprocess.check_output(
165			["pdftohtml", "-xml", "-i", "-stdout", FILENAME],
166			shell=False
167			)
168
169			tree = ssg.xml.ElementTree.fromstring(xml_string)
170			id_map = {}
171
172			for page in tree.findall("./page"):
173			harvest_ids_descriptions(page, id_map)
174
175			handled_ids = []
176			id_tree = []
177
178			# start with top level IDs
179			for id_, desc in id_map.items():
180			if re.match("^[1-9][0-9]*\\.$", id_) is None:
181			continue
182
183			handled_ids.append(id_)
184			# for every top level ID, handle all direct children
185			id_tree.append(handle_id(id_, desc, id_map, handled_ids))
186
187			# top level IDs have different sorting rules
188			id_tree = sorted(id_tree, key=lambda item: int(item[0].split(".", 1)[0]))
189
190			for id_ in id_map.keys():
191			if id_ in handled_ids:
192			continue
193
194			logging.warning(
195			"id '%s' wasn't handled during PCI tree reconstruction!", id_
196			)
197
198			with codecs.open(JSON_FILENAME, "w", encoding="utf-8") as f:
199			json.dump(id_tree, f)
200
201
202			if __name__ == "__main__":
203			main()
204

ComplianceAsCode / content

Push — master ( ea7c4a...d61422 )

generate_pcidss_json.harvest_ids_descriptions() B

Complexity

Size

Duplication

Importance

How to fix Long Method

Long Method

Duplication Side-by-Side

Filter issues like