1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
/** |
4
|
|
|
* (c) Kitodo. Key to digital objects e.V. <[email protected]> |
5
|
|
|
* |
6
|
|
|
* This file is part of the Kitodo and TYPO3 projects. |
7
|
|
|
* |
8
|
|
|
* @license GNU General Public License version 3 or later. |
9
|
|
|
* For the full copyright and license information, please read the |
10
|
|
|
* LICENSE.txt file that was distributed with this source code. |
11
|
|
|
*/ |
12
|
|
|
|
13
|
|
|
namespace Kitodo\Dlf\Format; |
14
|
|
|
|
15
|
|
|
/** |
16
|
|
|
* Fulltext ALTO format class for the 'dlf' extension |
17
|
|
|
* |
18
|
|
|
* ** This currently supports ALTO 2.x / 3.x / 4.x ** |
19
|
|
|
* |
20
|
|
|
* @package TYPO3 |
21
|
|
|
* @subpackage dlf |
22
|
|
|
* |
23
|
|
|
* @access public |
24
|
|
|
*/ |
25
|
|
|
class Alto implements \Kitodo\Dlf\Common\FulltextInterface
{
    /**
     * This extracts the fulltext data from ALTO XML
     *
     * The text is built from the String elements found below
     * Layout/Page/PrintSpace, joined by single spaces. A word that is
     * hyphenated across two String elements is emitted once, using the
     * SUBS_CONTENT attribute of its first part.
     *
     * @access public
     *
     * @param \SimpleXMLElement $xml The XML to extract the raw text from
     *
     * @return string The raw unformatted fulltext, or an empty string if no String elements were found
     */
    public function getRawText(\SimpleXMLElement $xml): string
    {
        // register ALTO namespace depending on document
        $this->registerAltoNamespace($xml);

        // Get all (presumed) words of the text.
        $strings = $xml->xpath('./alto:Layout/alto:Page/alto:PrintSpace//alto:TextBlock/alto:TextLine/alto:String');
        if (empty($strings)) {
            return '';
        }

        $words = [];
        // The result list does not change while iterating, so count it only once.
        $total = count($strings);
        for ($i = 0; $i < $total; $i++) {
            $attributes = $strings[$i]->attributes();
            $fullWord = (string) $attributes['SUBS_CONTENT'];

            if ((string) $attributes['SUBS_TYPE'] === 'HypPart1' && $fullWord !== '') {
                // First half of a hyphenated word: SUBS_CONTENT carries the complete word.
                $words[] = $fullWord;
                // Skip the following element only if it really is the second half,
                // so that an unrelated word is never swallowed.
                if (
                    isset($strings[$i + 1])
                    && (string) $strings[$i + 1]->attributes()['SUBS_TYPE'] === 'HypPart2'
                ) {
                    $i++;
                }
                continue;
            }

            // Every other element (including those with a different SUBS_TYPE)
            // contributes its CONTENT instead of being dropped.
            $words[] = (string) $attributes['CONTENT'];
        }

        return implode(' ', $words);
    }

    /**
     * This extracts the fulltext data from ALTO XML and returns it in MiniOCR format
     *
     * Each TextBlock becomes a <b> element, each of its TextLine children
     * an <l> element and each String child of a line a <w> element whose
     * "x" attribute holds the word coordinates.
     *
     * @access public
     *
     * @param \SimpleXMLElement $xml The XML to extract the raw text from
     *
     * @return string The unformatted fulltext in MiniOCR format, or an empty string if there are no text blocks
     */
    public function getTextAsMiniOcr(\SimpleXMLElement $xml): string
    {
        // register ALTO namespace depending on document
        $this->registerAltoNamespace($xml);

        // get all text blocks
        $blocks = $xml->xpath('./alto:Layout/alto:Page/alto:PrintSpace//alto:TextBlock');

        if (empty($blocks)) {
            return '';
        }

        $miniOcr = new \SimpleXMLElement("<ocr></ocr>");

        foreach ($blocks as $block) {
            $newBlock = $miniOcr->addChild('b');
            foreach ($block->children() as $lineTag => $line) {
                // Only text lines are converted; other block children are ignored.
                if ($lineTag !== 'TextLine') {
                    continue;
                }
                $newLine = $newBlock->addChild('l');
                foreach ($line->children() as $wordTag => $word) {
                    // Only String elements carry words; other line children are ignored.
                    if ($wordTag !== 'String') {
                        continue;
                    }
                    $attributes = $word->attributes();
                    $newWord = $newLine->addChild('w', $this->getWord($attributes));
                    $newWord->addAttribute('x', $this->getCoordinates($attributes));
                }
            }
        }

        $miniOcrXml = $miniOcr->asXML();
        // Fall back to an empty string if serialization did not yield a string.
        return \is_string($miniOcrXml) ? $miniOcrXml : '';
    }

    /**
     * This extracts and parses the word from attribute
     *
     * Without SUBS_CONTENT the escaped CONTENT followed by a space is returned.
     * With SUBS_CONTENT, the first hyphenation part ('HypPart1') yields the
     * escaped SUBS_CONTENT and any other element yields a single space.
     *
     * @access private
     *
     * @param \SimpleXMLElement $attributes The attributes of the String element to extract the word from
     *
     * @return string The parsed word extracted from attribute
     */
    private function getWord(\SimpleXMLElement $attributes): string
    {
        if (empty($attributes['SUBS_CONTENT'])) {
            return htmlspecialchars((string) $attributes['CONTENT']) . ' ';
        }

        if ((string) $attributes['SUBS_TYPE'] === 'HypPart1') {
            return htmlspecialchars((string) $attributes['SUBS_CONTENT']);
        }

        return ' ';
    }

    /**
     * This extracts and parses the word coordinates from attributes
     *
     * @access private
     *
     * @param \SimpleXMLElement $attributes The attributes of the String element to extract the word coordinates from
     *
     * @return string The HPOS, VPOS, WIDTH and HEIGHT attribute values, in this order, separated by single spaces
     */
    private function getCoordinates(\SimpleXMLElement $attributes): string
    {
        $parts = [
            (string) $attributes['HPOS'],
            (string) $attributes['VPOS'],
            (string) $attributes['WIDTH'],
            (string) $attributes['HEIGHT'],
        ];

        return implode(' ', $parts);
    }

    /**
     * This registers the necessary ALTO namespace for the current ALTO-XML
     *
     * The first of the ALTO v2, v3 and v4 namespace URIs that is declared
     * by the document is registered under the XPath prefix 'alto'. If none
     * of them is declared, nothing is registered.
     *
     * @access private
     *
     * @param \SimpleXMLElement $xml The XML to register the namespace for
     *
     * @return void
     */
    private function registerAltoNamespace(\SimpleXMLElement $xml)
    {
        $declared = $xml->getDocNamespaces();
        if (!is_array($declared)) {
            return;
        }

        // Checked in this order; the first declared version wins.
        $supported = [
            'http://www.loc.gov/standards/alto/ns-v2#',
            'http://www.loc.gov/standards/alto/ns-v3#',
            'http://www.loc.gov/standards/alto/ns-v4#',
        ];

        foreach ($supported as $uri) {
            if (in_array($uri, $declared, true)) {
                $xml->registerXPathNamespace('alto', $uri);
                return;
            }
        }
    }
}
163
|
|
|
|
If the size of the collection does not change during the iteration, it is generally a good practice to compute it beforehand, and not on each iteration: