Passed
Pull Request — master (#86)
by
unknown
02:39
created

Alto::getTextAsMiniOcr()   B

Complexity

Conditions 8
Paths 13

Size

Total Lines 34
Code Lines 19

Duplication

Lines 0
Ratio 0 %

Importance

Changes 3
Bugs 0 Features 0
Metric Value
eloc 19
c 3
b 0
f 0
dl 0
loc 34
rs 8.4444
cc 8
nc 13
nop 1
1
<?php
2
3
/**
4
 * (c) Kitodo. Key to digital objects e.V. <contact@kitodo.org>
5
 *
6
 * This file is part of the Kitodo and TYPO3 projects.
7
 *
8
 * @license GNU General Public License version 3 or later.
9
 * For the full copyright and license information, please read the
10
 * LICENSE.txt file that was distributed with this source code.
11
 */
12
13
namespace Kitodo\Dlf\Format;
14
15
/**
16
 * Fulltext ALTO format class for the 'dlf' extension
17
 *
18
 * ** This currently supports only ALTO 2.x **
19
 *
20
 * @author Sebastian Meyer <[email protected]>
21
 * @package TYPO3
22
 * @subpackage dlf
23
 * @access public
24
 */
25
class Alto implements \Kitodo\Dlf\Common\FulltextInterface
{
    /**
     * Namespace URI of ALTO 2.x, the only ALTO version handled by this class
     *
     * @var string
     */
    const ALTO_NAMESPACE = 'http://www.loc.gov/standards/alto/ns-v2#';

    /**
     * This extracts the fulltext data from ALTO XML
     *
     * The CONTENT attributes of all String elements found below the page's
     * PrintSpace are joined with a single space.
     *
     * @access public
     *
     * @param \SimpleXMLElement $xml: The XML to extract the raw text from
     *
     * @return string The raw unformatted fulltext (empty if no words were found)
     */
    public function getRawText(\SimpleXMLElement $xml)
    {
        $xml->registerXPathNamespace('alto', self::ALTO_NAMESPACE);
        // Get all (presumed) words of the text.
        $words = $xml->xpath('./alto:Layout/alto:Page/alto:PrintSpace//alto:TextBlock/alto:TextLine/alto:String/@CONTENT');
        if (empty($words)) {
            return '';
        }
        return implode(' ', $words);
    }

    /**
     * This extracts the fulltext data from ALTO XML and returns it in MiniOCR format
     *
     * Every TextBlock becomes a <b> element, every TextLine inside it an <l>
     * element and every String inside a line a <w> element carrying the word
     * as text and its coordinates in the "x" attribute.
     *
     * @access public
     *
     * @param \SimpleXMLElement $xml: The XML to extract the raw text from
     *
     * @return string The unformatted fulltext in MiniOCR format (empty if no text block was found)
     */
    public function getTextAsMiniOcr(\SimpleXMLElement $xml)
    {
        $xml->registerXPathNamespace('alto', self::ALTO_NAMESPACE);

        // get all text blocks
        $blocks = $xml->xpath('./alto:Layout/alto:Page/alto:PrintSpace//alto:TextBlock');
        if (empty($blocks)) {
            return '';
        }

        $miniOcr = new \SimpleXMLElement('<ocr></ocr>');

        foreach ($blocks as $block) {
            $newBlock = $miniOcr->addChild('b');
            // Select the children by namespace URI: children() without an
            // argument skips prefixed elements such as <alto:TextLine>, although
            // the XPath query above matches their parent blocks.
            foreach ($block->children(self::ALTO_NAMESPACE) as $lineName => $line) {
                if ($lineName !== 'TextLine') {
                    continue;
                }
                $newLine = $newBlock->addChild('l');
                foreach ($line->children(self::ALTO_NAMESPACE) as $wordName => $word) {
                    if ($wordName !== 'String') {
                        continue;
                    }
                    // The attributes read below (CONTENT, HPOS, ...) are unqualified,
                    // so they are fetched without a namespace.
                    $attributes = $word->attributes();
                    $newWord = $newLine->addChild('w', $this->getWord($attributes));
                    $newWord->addAttribute('x', $this->getCoordinates($attributes));
                }
            }
        }

        // asXML() may return false on failure; callers always get a string.
        $miniOcrXml = $miniOcr->asXML();
        return \is_string($miniOcrXml) ? $miniOcrXml : '';
    }

    /**
     * This extracts and parses the word from attribute
     *
     * @access private
     *
     * @param \SimpleXMLElement $attributes: The XML to extract the word
     *
     * @return string The parsed word extracted from attribute, followed by a single space
     */
    private function getWord($attributes)
    {
        // The value is passed to SimpleXMLElement::addChild(), which does not
        // escape a raw "&" itself, hence the escaping here.
        return htmlspecialchars((string) $attributes['CONTENT']) . ' ';
    }

    /**
     * This extracts and parses the word coordinates from attributes
     *
     * @access private
     *
     * @param \SimpleXMLElement $attributes: The XML to extract the word coordinates
     *
     * @return string The values of HPOS, VPOS, WIDTH and HEIGHT separated by single spaces
     */
    private function getCoordinates($attributes)
    {
        return implode(' ', [
            (string) $attributes['HPOS'],
            (string) $attributes['VPOS'],
            (string) $attributes['WIDTH'],
            (string) $attributes['HEIGHT'],
        ]);
    }
}
121