Passed
Pull Request — master (#178)
by Matěj
01:03
created

org_fedora_oscap.content_handling   A

Complexity

Total Complexity 30

Size/Duplication

Total Lines 179
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 86
dl 0
loc 179
rs 10
c 0
b 0
f 0
wmc 30

5 Methods

Rating   Name   Duplication   Size   Complexity  
A ParseHTMLContent.handle_data() 0 2 1
A ParseHTMLContent.handle_endtag() 0 5 3
A ParseHTMLContent.handle_starttag() 0 7 4
A ParseHTMLContent.__init__() 0 3 1
A ParseHTMLContent.get_content() 0 2 1

4 Functions

Rating   Name   Duplication   Size   Complexity  
A parse_HTML_from_content() 0 13 1
A identify_files() 0 3 1
B get_doc_type() 0 21 6
D explore_content_files() 0 38 12
1
#
2
# Copyright (C) 2013  Red Hat, Inc.
3
#
4
# This copyrighted material is made available to anyone wishing to use,
5
# modify, copy, or redistribute it subject to the terms and conditions of
6
# the GNU General Public License v.2, or (at your option) any later version.
7
# This program is distributed in the hope that it will be useful, but WITHOUT
8
# ANY WARRANTY expressed or implied, including the implied warranties of
9
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General
10
# Public License for more details.  You should have received a copy of the
11
# GNU General Public License along with this program; if not, write to the
12
# Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
13
# 02110-1301, USA.  Any Red Hat trademarks that are incorporated in the
14
# source code or documentation are not subject to the GNU General Public
15
# License and may only be used or replicated with the express permission of
16
# Red Hat, Inc.
17
#
18
# Red Hat Author(s): Vratislav Podzimek <[email protected]>
19
#
20
21
"""
22
Module with various classes for SCAP content processing and retrieving data
23
from it.
24
25
"""
26
27
import os.path
28
29
from collections import namedtuple
30
import multiprocessing
31
32
from pyanaconda.core.util import execReadlines
33
try:
34
    from html.parser import HTMLParser
35
except ImportError:
36
    from HTMLParser import HTMLParser
37
38
import logging
39
log = logging.getLogger("anaconda")
40
41
42
# Human-readable document type names keyed by a short symbolic identifier.
# The values are the same strings explore_content_files() compares the
# result of get_doc_type() against.
CONTENT_TYPES = {
    "DATASTREAM": "Source Data Stream",
    "XCCDF_CHECKLIST": "XCCDF Checklist",
    "OVAL": "OVAL Definitions",
    "CPE_DICT": "CPE Dictionary",
    "TAILORING": "XCCDF Tailoring",
}
49
50
51
class ContentHandlingError(Exception):
    """Base exception for errors related to SCAP content handling."""
55
56
57
class ContentCheckError(ContentHandlingError):
    """Exception for errors related to content (integrity, ...) checking."""
63
64
65
class ParseHTMLContent(HTMLParser):
    """Parser that collects text from content and maps some tags to newlines.

    The text gathered so far is kept in ``self.content``; list, list item
    and line break tags (in the ``html:`` prefixed form) add newlines to it.
    """

    # Tags whose opening adds a newline to the collected text.
    _NEWLINE_ON_START = ("html:ul", "html:li", "html:br")
    # Tags whose closing adds a newline to the collected text.
    _NEWLINE_ON_END = ("html:ul", "html:li")

    def __init__(self):
        super().__init__()
        self.content = ""

    def handle_starttag(self, tag, attrs):
        """Add a newline for an opening list, list item or line break tag."""
        if tag in self._NEWLINE_ON_START:
            self.content = self.content + "\n"

    def handle_endtag(self, tag):
        """Add a newline for a closing list or list item tag."""
        if tag in self._NEWLINE_ON_END:
            self.content = self.content + "\n"

    def handle_data(self, data):
        """Append the text data with surrounding whitespace removed."""
        stripped = data.strip()
        self.content = self.content + stripped

    def get_content(self):
        """Return the text collected so far."""
        return self.content
91
92
93
def parse_HTML_from_content(content):
    """This is a very simple HTML to text parser.

    HTML tags will be removed while trying to maintain readability
    of content.

    :param content: content whose HTML tags will be parsed
    :return: content without HTML tags
    """

    parser = ParseHTMLContent()
    parser.feed(content)
    # feed() may keep trailing text buffered while it waits for more input
    # (e.g. when the text ends with an "&" that could be the start of a
    # character reference, as in "R&D"). close() tells the parser there is
    # no more input, so the remaining text is processed instead of dropped.
    parser.close()
    return parser.get_content()
106
107
108
# namedtuple class holding the content files found: the XCCDF file, the CPE
# dictionary and the tailoring file
# pylint: disable-msg=C0103
ContentFiles = namedtuple("ContentFiles", "xccdf cpe tailoring")
111
112
113
def identify_files(fpaths):
    """Map each of the given file paths to its document type.

    :param fpaths: an iterable of file paths to examine
    :return: dictionary with the file paths as keys and the results of
        get_doc_type() for them as values
    """
    doc_types = {}
    for fpath in fpaths:
        doc_types[fpath] = get_doc_type(fpath)
    return doc_types
116
117
118
def get_doc_type(file_path):
    """Find out the document type of a file from the 'oscap info' output.

    The output of 'oscap info <file_path>' is searched for the first line
    starting with "Document type:" and the stripped text after the colon is
    used as the document type. The result is logged.

    :param file_path: path of the file to examine
    :return: the document type found or "unknown" if none was found
    """
    marker = "Document type:"
    doc_type = "unknown"
    try:
        for output_line in execReadlines("oscap", ["info", file_path]):
            if not output_line.startswith(marker):
                continue
            # everything after the first colon is the type itself
            doc_type = output_line.partition(":")[2].strip()
            break
    except (OSError, UnicodeDecodeError):
        # OSError: 'oscap info' exited with a non-zero exit code.
        # UnicodeDecodeError: 'oscap info' supplied weird output, which
        # happens when it tries to explain why it can't examine e.g. a JPG.
        # In both cases the document type stays unknown.
        pass
    except Exception as e:
        log.warning(f"OSCAP addon: Unexpected error when looking at {file_path}: {str(e)}")
    log.info("OSCAP addon: Identified {file_path} as {content_type}"
             .format(file_path=file_path, content_type=doc_type))
    return doc_type
139
140
141
def explore_content_files(fpaths):
    """
    Function for finding content files in a list of file paths.

    NOTE: simply picks the first usable content file of a particular type
    and just prefers data streams over standalone benchmarks.

    :param fpaths: a list of file paths to search for content files in
    :type fpaths: [str]
    :return: ContentFiles instance containing the file names of the XCCDF file,
        CPE dictionary and tailoring file or "" in place of those items
        if not found
    :rtype: ContentFiles

    """
    xccdf_file = ""
    cpe_file = ""
    tailoring_file = ""
    found_ds = False

    for fpath in fpaths:
        doc_type = get_doc_type(fpath)
        if not doc_type:
            continue

        # The document type names come from CONTENT_TYPES so that they are
        # defined in a single place in this module.
        if doc_type == CONTENT_TYPES["DATASTREAM"]:
            # prefer DS over standalone XCCDF: a data stream is taken unless
            # a data stream has already been picked, so it also replaces
            # a standalone checklist found earlier
            if not (xccdf_file and found_ds):
                xccdf_file = fpath
                found_ds = True
        elif doc_type == CONTENT_TYPES["XCCDF_CHECKLIST"]:
            if not xccdf_file:
                xccdf_file = fpath
        elif doc_type == CONTENT_TYPES["CPE_DICT"]:
            if not cpe_file:
                cpe_file = fpath
        elif doc_type == CONTENT_TYPES["TAILORING"]:
            if not tailoring_file:
                tailoring_file = fpath

    # TODO: raise exception if no xccdf_file is found?
    return ContentFiles(xccdf_file, cpe_file, tailoring_file)
179