1 | <?php |
||
2 | |||
3 | declare(strict_types=1); |
||
4 | |||
5 | namespace Beccha\OfxParser\Service; |
||
6 | |||
7 | use Beccha\OfxParser\Exception\FileNotFoundException; |
||
8 | use Beccha\OfxParser\Exception\OfxTagNotFoundException; |
||
9 | use Beccha\OfxParser\Exception\XmlContentNotFoundException; |
||
10 | use SimpleXMLElement; |
||
11 | |||
/**
 * Converts an OFX file written as SGML (unclosed tags, optional plain-text
 * header) into a SimpleXMLElement. Content that is already well-formed XML
 * is parsed as-is.
 */
class SgmlToXml
{
    /**
     * Read the given file and return its content as XML.
     *
     * @param string $sgmlFilePath Path of the OFX file to read.
     *
     * @throws FileNotFoundException       When the file is missing, unreadable or empty.
     * @throws OfxTagNotFoundException     When non-XML content holds no <OFX> tag.
     * @throws XmlContentNotFoundException When the repaired content is still not well-formed XML.
     */
    public function parse(string $sgmlFilePath): SimpleXMLElement
    {
        $fileContent = $this->loadFile($sgmlFilePath);

        // Content that already parses as XML is returned directly, so it is
        // parsed only once and needs no SGML repair.
        $xml = $this->tryLoadXml($fileContent);
        if ($xml !== null) {
            return $xml;
        }

        $contentWithoutSgmlHeader = $this->removeSgmlHeader($fileContent);

        // Fix unclosed tags
        $sgmlLines = $this->sgmlContentToArrayOfLines($contentWithoutSgmlHeader);
        $tagsWithoutContent = $this->listTagsWithoutContent($sgmlLines);
        $tagsToClose = $this->filterOutTagsThatHaveAClosingCounterpart($sgmlLines, $tagsWithoutContent);
        $tagsFixedForXml = $this->closeTags($sgmlLines, $tagsToClose);
        $xmlFileContent = $this->buildXmlFileContent($tagsFixedForXml);

        return $this->getXmlContent($xmlFileContent);
    }

    /**
     * Load the file and convert its content to UTF-8.
     *
     * The source encoding is guessed by mb_detect_encoding() using mbstring's
     * default detection order.
     *
     * @throws FileNotFoundException When the path is not a readable regular file,
     *                               the file is empty, or its content cannot be
     *                               converted to UTF-8.
     */
    private function loadFile(string $sgmlFilePath): string
    {
        // Checking first keeps file_get_contents() from emitting warnings on
        // missing paths, directories or unreadable files.
        if (!is_file($sgmlFilePath) || !is_readable($sgmlFilePath)) {
            throw new FileNotFoundException($sgmlFilePath);
        }

        $fileContent = file_get_contents($sgmlFilePath);
        if ($fileContent === false || $fileContent === '') {
            throw new FileNotFoundException($sgmlFilePath);
        }

        $detectedEncoding = mb_detect_encoding($fileContent);
        if ($detectedEncoding === false) {
            throw new FileNotFoundException($sgmlFilePath);
        }

        // mb_convert_encoding() may return false; never let that leak through
        // the string return type.
        $utf8Content = mb_convert_encoding($fileContent, 'UTF-8', $detectedEncoding);
        if (!is_string($utf8Content)) {
            throw new FileNotFoundException($sgmlFilePath);
        }

        return $utf8Content;
    }

    /**
     * Remove the SGML header, i.e. everything located before the <OFX> tag.
     *
     * @throws OfxTagNotFoundException When the content holds no <OFX> tag.
     */
    private function removeSgmlHeader(string $sgmlFileContent): string
    {
        // stripos() is case-insensitive on its own. Searching the original
        // string keeps the byte offset valid for substr(), and the strict
        // comparison accepts a tag located at offset 0.
        $sgmlStart = stripos($sgmlFileContent, '<OFX>');
        if ($sgmlStart === false) {
            throw new OfxTagNotFoundException('OFX');
        }

        return trim(substr($sgmlFileContent, $sgmlStart));
    }

    /**
     * Split the content into trimmed lines (LF, CRLF and lone CR are all
     * treated as line breaks).
     *
     * @return array<string>
     */
    private function sgmlContentToArrayOfLines(string $sgmlFileContent): array
    {
        $lines = preg_split('/\r\n|\r|\n/', $sgmlFileContent);
        if ($lines === false) {
            $lines = explode("\n", $sgmlFileContent);
        }

        return array_map('trim', $lines);
    }

    /**
     * List the lines made of a single opening tag without any content.
     *
     * @param array<string> $linesFromSgml
     * @return array<string>
     */
    private function listTagsWithoutContent(array $linesFromSgml): array
    {
        $tagsWithoutContent = [];
        foreach ($linesFromSgml as $line) {
            $trimmedLine = trim($line);
            // The dot is accepted so that vendor tags such as <INTU.BID> are recognised
            if (preg_match('/^<[a-z0-9._\-]*>$/i', $trimmedLine) === 1) {
                $tagsWithoutContent[] = $trimmedLine;
            }
        }

        return $tagsWithoutContent;
    }

    /**
     * Keep only the tags whose closing counterpart does not appear on a line
     * of its own.
     *
     * @param array<string> $linesFromSgml
     * @param array<string> $tagsWithoutContent
     * @return array<string>
     */
    private function filterOutTagsThatHaveAClosingCounterpart(array $linesFromSgml, array $tagsWithoutContent): array
    {
        // Flipped once so that each membership test is a key lookup
        $existingLines = array_flip($linesFromSgml);

        $tagsToClose = [];
        foreach ($tagsWithoutContent as $tag) {
            $closingTag = str_replace('<', '</', $tag);
            if (!isset($existingLines[$closingTag])) {
                $tagsToClose[] = $tag;
            }
        }

        return $tagsToClose;
    }

    /**
     * Close the tags that SGML leaves open so that the lines form valid XML.
     *
     * Two kinds of lines are rewritten:
     *  - lines equal to one of $emptyTagsToClose get their closing tag appended;
     *  - lines made of an opening tag followed by data get a closing tag
     *    appended (and bare ampersands in the data escaped), unless that
     *    closing tag already exists on a line of its own or at the end of
     *    the same line.
     * Every other line is returned unchanged, under the same key.
     *
     * @param array<string> $linesFromSgml
     * @param array<string> $emptyTagsToClose
     * @return array<string>
     */
    public function closeTags(array $linesFromSgml, array $emptyTagsToClose): array
    {
        // Flipped once so that each membership test is a key lookup
        $existingLines = array_flip($linesFromSgml);
        $emptyTags = array_flip($emptyTagsToClose);
        $tagWithDataPattern = '/^(<[a-z0-9._\-]*>)(.+)$/i';

        $updatedLines = [];

        foreach ($linesFromSgml as $id => $tag) {
            $updatedLines[$id] = $tag;

            // Close empty tags that have no closing counterpart and no data
            if (isset($emptyTags[$tag])) {
                $updatedLines[$id] = $tag . str_replace('<', '</', $tag);
            }

            // Close tags that have no closing counterpart but have data
            if (preg_match($tagWithDataPattern, $tag, $openingTag) !== 1) {
                continue;
            }

            $closingTag = str_replace('<', '</', $openingTag[1]);

            // Only close the tag if no closing tag is found in the file...
            if (isset($existingLines[$closingTag])) {
                continue;
            }

            // ...nor at the end of the line itself; closing it again would
            // produce a stray closing tag.
            if (substr($openingTag[2], -strlen($closingTag)) === $closingTag) {
                continue;
            }

            $cleanedUpContent = $this->cleanUpContent($openingTag[2]);
            $updatedLines[$id] = $openingTag[1] . $cleanedUpContent . $closingTag;
        }

        return $updatedLines;
    }

    /**
     * Escape the ampersands of a tag's data so that it is valid XML text.
     *
     * Ampersands that already start one of the predefined XML entities
     * (&amp; &lt; &gt; &quot; &apos;) or a numeric character reference are
     * left untouched to avoid double escaping.
     */
    private function cleanUpContent(string $content): string
    {
        $escaped = preg_replace(
            '/&(?!(?:amp|lt|gt|quot|apos|#[0-9]+|#x[0-9a-fA-F]+);)/',
            '&amp;',
            $content
        );

        // preg_replace() returns null on a PCRE error; fall back to the raw content
        return $escaped === null ? $content : $escaped;
    }

    /**
     * Prepend the XML declaration and join the lines into a single document.
     *
     * @param array<string> $linesFromSgml
     */
    private function buildXmlFileContent(array $linesFromSgml): string
    {
        array_unshift($linesFromSgml, '<?xml version="1.0" encoding="UTF-8"?>');

        return implode("\n", $linesFromSgml);
    }

    /**
     * Parse the content as XML.
     *
     * @throws XmlContentNotFoundException When the content is not well-formed XML.
     */
    private function getXmlContent(string $fileContent): SimpleXMLElement
    {
        $xmlContent = $this->tryLoadXml($fileContent);
        if ($xmlContent === null) {
            throw new XmlContentNotFoundException();
        }

        return $xmlContent;
    }

    /**
     * Parse the content as XML without emitting warnings.
     *
     * libxml's error handling mode is switched to "internal" only for the
     * duration of the call and restored afterwards; the errors collected
     * meanwhile are discarded.
     *
     * @return SimpleXMLElement|null The parsed document, or null when the
     *                               content is not well-formed XML.
     */
    private function tryLoadXml(string $content)
    {
        $previousMode = libxml_use_internal_errors(true);

        try {
            $xml = simplexml_load_string($content);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousMode);
        }

        // Strict comparison: a SimpleXMLElement built from an empty element
        // is falsy although the document was parsed successfully.
        return $xml === false ? null : $xml;
    }
}
||
174 |