AbstractXliffParser::extractTagContent() - Code Metrics - matecat/xliff-parser - Measure and Improve Code Quality continuously with Scrutinizer

AbstractXliffParser::extractTagContent() A
last analyzed 2024-10-03 10:20 UTC

↳ Parent: AbstractXliffParser

Complexity

Conditions	3
Paths	2

Size

Total Lines	11
Code Lines	6

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	3
eloc	6
c	0
b	0
f	0
nc	2
nop	2
dl	0
loc	11
rs	10

<?php

namespace Matecat\XliffParser\XliffParser;

use DOMDocument;
use DOMElement;
use DOMNode;
use Exception;
use Matecat\EmojiParser\Emoji;
use Matecat\XliffParser\Constants\Placeholder;
use Matecat\XliffParser\Utils\Strings;
use OverflowException;
use Psr\Log\LoggerInterface;

abstract class AbstractXliffParser {

    const MAX_GROUP_RECURSION_LEVEL = 50;

    /**
     * @var LoggerInterface|null
     */
    protected ?LoggerInterface $logger;

    /**
     * @var string|null
     */
    protected ?string $xliffProprietary;

    /**
     * @var int
     */
    protected $xliffVersion;

    /**
     * AbstractXliffParser constructor.
     *
     * Only stores the three values on the instance; nothing is validated or parsed here.
     *
     * @param int                  $xliffVersion     XLIFF version; getTuTagName() distinguishes 1 from everything else
     * @param string|null          $xliffProprietary proprietary flavour of the file (this class only tests it against 'trados'), or null
     * @param LoggerInterface|null $logger           optional logger, or null
     */
    public function __construct( int $xliffVersion, ?string $xliffProprietary = null, ?LoggerInterface $logger = null ) {
        // The explicit "?" on $logger replaces the implicit nullability that a bare
        // "= null" default used to grant, which newer PHP versions report as deprecated.
        $this->xliffVersion     = $xliffVersion;
        $this->xliffProprietary = $xliffProprietary;
        $this->logger           = $logger;
    }

    /**
     * Name of the element that holds a single translation unit.
     *
     * @return string 'trans-unit' when the XLIFF version is 1, 'unit' for any other version
     */
    protected function getTuTagName(): string {
        if ( $this->xliffVersion === 1 ) {
            return 'trans-unit';
        }

        return 'unit';
    }

    /**
     * Parse an XLIFF document into an array.
     *
     * Implemented by the concrete parsers; the layout of the returned array
     * is decided by them and cannot be told from this class.
     *
     * @param DOMDocument $dom    the XLIFF document to parse
     * @param array|null  $output presumably the array the results are accumulated into
     *                            (defaults to an empty array) - confirm against the implementations
     *
     * @return array
     */
    abstract public function parse( DOMDocument $dom, ?array $output = [] ): array;

    /**
     * Walk a node and hand every translation unit found in it to extractTransUnit().
     *
     * Non-element nodes are ignored. A translation-unit element (see getTuTagName())
     * is extracted directly. For a <group> element its direct <context-group>
     * children are appended to the context groups received from the caller, then its
     * direct children are visited in document order: translation units are extracted
     * and nested groups are processed recursively. Every other element is ignored.
     *
     * @param DOMNode     $childNode                          node to inspect
     * @param array       $transUnitIdArrayForUniquenessCheck forwarded by reference to extractTransUnit()
     * @param DOMDocument $dom                                forwarded to extractTransUnit()
     * @param array       $output                             forwarded by reference to extractTransUnit()
     * @param int         $i                                  forwarded by reference to extractTransUnit()
     * @param int         $j                                  forwarded by reference to extractTransUnit()
     * @param array|null  $contextGroups                      context-group nodes collected from the enclosing groups
     * @param int|null    $recursionLevel                     number of enclosing groups already entered
     *
     * @throws OverflowException when a group that is MAX_GROUP_RECURSION_LEVEL levels deep contains a further group
     */
    protected function extractTuFromNode( DOMNode $childNode, array &$transUnitIdArrayForUniquenessCheck, DOMDocument $dom, array &$output, int &$i, int &$j, ?array $contextGroups = [], ?int $recursionLevel = 0 ) {

        // only elements can be groups or translation units
        if ( $childNode->nodeType !== XML_ELEMENT_NODE ) {
            return;
        }

        $tuTagName = $this->getTuTagName();

        if ( $childNode->nodeName !== 'group' ) {
            if ( $childNode->nodeName === $tuTagName ) {
                $this->extractTransUnit( $childNode, $transUnitIdArrayForUniquenessCheck, $dom, $output, $i, $j, $contextGroups );
            }

            return;
        }

        // context-groups declared directly on this group are added to the inherited ones
        // before any of its children is processed
        foreach ( $childNode->childNodes as $candidate ) {
            if ( $candidate->nodeName === 'context-group' ) {
                $contextGroups[] = $candidate;
            }
        }

        // depth of the group being processed, used to cap the recursion
        $recursionLevel++;

        foreach ( $childNode->childNodes as $nested ) {
            $nestedName = $nested->nodeName;

            if ( $nestedName === 'group' ) {
                if ( $recursionLevel >= self::MAX_GROUP_RECURSION_LEVEL ) {
                    throw new OverflowException( "Maximum tag group nesting level of '" . self::MAX_GROUP_RECURSION_LEVEL . "' reached, aborting!" );
                }

                $this->extractTuFromNode( $nested, $transUnitIdArrayForUniquenessCheck, $dom, $output, $i, $j, $contextGroups, $recursionLevel );
                continue;
            }

            if ( $nestedName === $tuTagName ) {
                $this->extractTransUnit( $nested, $transUnitIdArrayForUniquenessCheck, $dom, $output, $i, $j, $contextGroups );
            }
        }
    }

    /**
     * Extract a single translation unit and populate the 'trans-units' array.
     *
     * Implemented by the concrete parsers. extractTuFromNode() calls it once for
     * every translation-unit element it meets, forwarding its own by-reference
     * arguments unchanged.
     *
     * @param DOMElement  $transUnit                          the translation-unit element
     * @param array       $transUnitIdArrayForUniquenessCheck by reference; presumably the ids seen so far - confirm against the implementations
     * @param DOMDocument $dom                                the document being parsed
     * @param array       $output                             by reference; the structure being populated
     * @param int         $i                                  by reference; counter owned by the caller
     * @param int         $j                                  by reference; counter owned by the caller
     * @param array|null  $contextGroups                      context-group nodes collected from the enclosing groups
     *
     * @return mixed
     */
    abstract protected function extractTransUnit( DOMElement $transUnit, array &$transUnitIdArrayForUniquenessCheck, DOMDocument $dom, array &$output, int &$i, int &$j, ?array $contextGroups = [] );

    /**
     * Bundle the serialized content and the attributes of a node.
     *
     * @param DOMDocument $dom  document used to serialize the children of the node
     * @param DOMNode     $node node to read
     *
     * @return array 'raw-content' holds the result of extractTagContent(),
     *               'attr' the result of extractTagAttributes()
     */
    protected function extractContent( DOMDocument $dom, DOMNode $node ): array {
        $rawContent = $this->extractTagContent( $dom, $node );
        $attributes = $this->extractTagAttributes( $node );

        return [
                'raw-content' => $rawContent,
                'attr'        => $attributes,
        ];
    }

    /**
     * Collect the attributes of a node into a name => value map.
     *
     * Ex:
     * <p align="center" style="font-size: 12px;">some text</p>
     *
     * yields [ 'align' => 'center', 'style' => 'font-size: 12px;' ]
     *
     * @param DOMNode $element
     *
     * @return array attribute values keyed by attribute name; empty when the node has no attributes
     */
    protected function extractTagAttributes( DOMNode $element ): array {
        if ( !$element->hasAttributes() ) {
            return [];
        }

        $valuesByName = [];
        foreach ( $element->attributes as $attribute ) {
            $valuesByName[ $attribute->nodeName ] = $attribute->nodeValue;
        }

        return $valuesByName;
    }

    /**
     * Serialize the children of a node into a single string.
     *
     * Each child is dumped with DOMDocument::saveXML(), passed through
     * Strings::fixNonWellFormedXml() and then Emoji::toEntity(), and the results
     * are concatenated in document order. Every occurrence of
     * Placeholder::EMPTY_TAG_PLACEHOLDER is finally removed from the result.
     *
     * @param DOMDocument $dom     document used to serialize the children
     * @param DOMNode     $element node whose children are serialized
     *
     * @return string the processed inner content; an empty string when the node has no children
     */
    protected function extractTagContent( DOMDocument $dom, DOMNode $element ): string {
        $pieces = [];

        if ( $element->hasChildNodes() ) {
            foreach ( $element->childNodes as $child ) {
                $childXml = $dom->saveXML( $child );
                $pieces[] = Emoji::toEntity( Strings::fixNonWellFormedXml( $childXml ) );
            }
        }

        return str_replace( Placeholder::EMPTY_TAG_PLACEHOLDER, '', implode( '', $pieces ) );
    }

    /**
     * Used to extract <seg-source> and <seg-target>
     *
     * The serialized content of the node is split on every opening <mrk> tag and
     * one entry is produced per marker, in document order.
     *
     * @param DOMDocument $dom
     * @param DOMElement  $childNode
     *
     * @return array list of arrays with the keys:
     *               'mid'           - value of the mid attribute, or the zero-based position of the marker when none is found
     *               'ext-prec-tags' - whatever precedes the first <mrk> (first entry only, '' for the others)
     *               'raw-content'   - content of the marker up to its first </mrk>
     *               'ext-succ-tags' - what lies between that </mrk> and the next one (or the end), '' when there is nothing
     */
    protected function extractContentWithMarksAndExtTags( DOMDocument $dom, DOMElement $childNode ): array {
        // example of the serialized content:
        // <g id="1"><mrk mid="0" mtype="seg">An English string with g tags</mrk></g>
        $serialized = $this->extractTagContent( $dom, $childNode );

        // $chunks[ 0 ] is whatever precedes the first <mrk>; every later entry is
        // one marker whose leading '<mrk' and the whitespace after it were consumed by the split
        $chunks   = preg_split( '#<mrk\s#si', $serialized, -1 );
        $segments = [];

        for ( $position = 0; isset( $chunks[ $position + 1 ] ); $position++ ) {
            $chunk = $chunks[ $position + 1 ];

            preg_match( '|mid\s?=\s?["\'](.*?)["\']|si', $chunk, $midMatch );

            // in a Trados file the whitespace following </mrk> is meaningful, so it is
            // appended to the content; when several runs are found the last one wins
            $trailingSpaces = '';
            if ( $this->xliffProprietary === 'trados' ) {
                preg_match_all( '/<\/mrk>[\s]+/iu', $chunk, $closingRuns );

                if ( !empty( $closingRuns[ 0 ] ) ) {
                    $trailingSpaces = str_replace( '</mrk>', '', end( $closingRuns[ 0 ] ) );
                }
            }

            // re-build the mrk tag after the split
            $rebuiltMark = trim( '<mrk ' . $chunk );

            // drop the opening tag, which leaves something like 'Test </mrk> </g>'
            $withoutOpeningTag = preg_replace( '#^<mrk\s[^>]+>(.*)#', '$1', $rebuiltMark );
            $aroundClosingTag  = preg_split( '#</mrk>#si', $withoutOpeningTag );

            $segments[] = [
                    'mid'           => $midMatch[ 1 ] ?? $position,
                    'ext-prec-tags' => $position === 0 ? $chunks[ 0 ] : "",
                    'raw-content'   => isset( $aroundClosingTag[ 0 ] ) ? $aroundClosingTag[ 0 ] . $trailingSpaces : '',
                    'ext-succ-tags' => $aroundClosingTag[ 1 ] ?? '',
            ];
        }

        return $segments;
    }

    /**
     * Build the dataRef map: raw content indexed by the 'id' attribute.
     *
     * Entries without an 'id' attribute are skipped; when an id occurs more
     * than once the last entry wins.
     *
     * @param array $originalData list of arrays carrying 'attr' and 'raw-content' keys
     *
     * @return array raw content keyed by id
     */
    protected function getDataRefMap( array $originalData ): array {
        $contentById = [];

        foreach ( $originalData as $entry ) {
            $id = $entry[ 'attr' ][ 'id' ] ?? null;

            if ( $id === null ) {
                continue;
            }

            $contentById[ $id ] = $entry[ 'raw-content' ];
        }

        return $contentById;
    }

    /**
     * Tell whether a string contains at least one opening <mrk> tag, i.e. '<mrk'
     * followed by a whitespace character (matched case-insensitively).
     *
     * @param $raw
     *
     * @return bool
     */
    protected function stringContainsMarks( $raw ): bool {
        return preg_match( '#<mrk\s#si', $raw ) === 1;
    }

    /**
     * Wrap a note value as either JSON or raw content.
     *
     * Double escaped entities are converted first. Example:
     *
     * &amp;#39;  ---> &#39;
     * &amp;amp;  ---> &amp;
     * &amp;apos; ---> &apos;
     *
     * @param           $noteValue
     * @param bool|null $escapeStrings only honoured when the value is double escaped; otherwise escaping is forced
     *
     * @return array [ 'json' => ... ] when the value is JSON, [ 'raw-content' => ... ] otherwise
     * @throws Exception
     */
    protected function JSONOrRawContentArray( $noteValue, ?bool $escapeStrings = true ): array {
        $isDoubleEscaped = Strings::isADoubleEscapedEntity( $noteValue );

        if ( !$isDoubleEscaped ) {
            // for non escaped entities $escapeStrings is always true for security reasons
            $escapeStrings = true;
        } else {
            $noteValue = Strings::htmlspecialchars_decode( $noteValue, true );
        }

        return Strings::isJSON( $noteValue )
                ? [ 'json' => Strings::cleanCDATA( $noteValue ) ]
                : [ 'raw-content' => Strings::fixNonWellFormedXml( $noteValue, $escapeStrings ) ];
    }
}


1			<?php
2
3			namespace Matecat\XliffParser\XliffParser;
4
5			use DOMDocument;
6			use DOMElement;
7			use DOMNode;
8			use Exception;
9			use Matecat\EmojiParser\Emoji;
10			use Matecat\XliffParser\Constants\Placeholder;
11			use Matecat\XliffParser\Utils\Strings;
12			use OverflowException;
13			use Psr\Log\LoggerInterface;
14
15			abstract class AbstractXliffParser {
16
17			const MAX_GROUP_RECURSION_LEVEL = 50;
18
19			/**
20			* @var LoggerInterface\|null
21			*/
22			protected ?LoggerInterface $logger;
23
24			/**
25			* @var string\|null
26			*/
27			protected ?string $xliffProprietary;
28
29			/**
30			* @var int
31			*/
32			protected $xliffVersion;
33
34			/**
35			* AbstractXliffParser constructor.
36			*
37			* @param int $xliffVersion
38			* @param string\|null $xliffProprietary
39			* @param LoggerInterface\|null $logger
40			*/
41			public function __construct( int $xliffVersion, ?string $xliffProprietary = null, LoggerInterface $logger = null ) {
42			$this->xliffVersion = $xliffVersion;
43			$this->logger = $logger;
44			$this->xliffProprietary = $xliffProprietary;
45			}
46
47			/**
48			* @return string
49			*/
50			protected function getTuTagName(): string {
51			return ( $this->xliffVersion === 1 ) ? 'trans-unit' : 'unit';
52			}
53
54			/**
55			* @param DOMDocument $dom
56			* @param array\|null $output
57			*
58			* @return array
59			*/
60			abstract public function parse( DOMDocument $dom, ?array $output = [] ): array;
61
62			/**
63			* Extract trans-unit content from the current node
64			*
65			* @param DOMElement $childNode
66			* @param array $transUnitIdArrayForUniquenessCheck
67			* @param DOMDocument $dom
68			* @param array $output
69			* @param int $i
70			* @param int $j
71			* @param array\|null $contextGroups
72			* @param int\|null $recursionLevel
73			*/
74			protected function extractTuFromNode( DOMNode $childNode, array &$transUnitIdArrayForUniquenessCheck, DOMDocument $dom, array &$output, int &$i, int &$j, ?array $contextGroups = [], ?int $recursionLevel = 0 ) {
75
76			if ( $childNode->nodeType != XML_ELEMENT_NODE ) {
77			return;
78			}
79
80			if ( $childNode->nodeName === 'group' ) {
81
82			// add nested context-groups
83			foreach ( $childNode->childNodes as $nestedChildNode ) {
84			if ( $nestedChildNode->nodeName === 'context-group' ) {
85			$contextGroups[] = $nestedChildNode;
86			}
87			}
88
89			// avoid infinite recursion
90			$recursionLevel++;
91
92			foreach ( $childNode->childNodes as $nestedChildNode ) {
93
94			// nested groups
95			if ( $nestedChildNode->nodeName === 'group' ) {
96
97			if ( $recursionLevel < self::MAX_GROUP_RECURSION_LEVEL ) {
98			$this->extractTuFromNode( $nestedChildNode, $transUnitIdArrayForUniquenessCheck, $dom, $output, $i, $j, $contextGroups, $recursionLevel );
99			} else {
100			throw new OverflowException( "Maximum tag group nesting level of '" . self::MAX_GROUP_RECURSION_LEVEL . "' reached, aborting!" );
101			}
102
103			} elseif ( $nestedChildNode->nodeName === $this->getTuTagName() ) {
104			$this->extractTransUnit( $nestedChildNode, $transUnitIdArrayForUniquenessCheck, $dom, $output, $i, $j, $contextGroups );
105			}
106			}
107			} elseif ( $childNode->nodeName === $this->getTuTagName() ) {
108			$this->extractTransUnit( $childNode, $transUnitIdArrayForUniquenessCheck, $dom, $output, $i, $j, $contextGroups );
109			}
110			}
111
112			/**
113			* Extract and populate 'trans-units' array
114			*
115			* @param DOMElement $transUnit
116			* @param array $transUnitIdArrayForUniquenessCheck
117			* @param DOMDocument $dom
118			* @param array $output
119			* @param int $i
120			* @param int $j
121			* @param array\|null $contextGroups
122			*
123			* @return mixed
124			*/
125			abstract protected function extractTransUnit( DOMElement $transUnit, array &$transUnitIdArrayForUniquenessCheck, DomDocument $dom, array &$output, int &$i, int &$j, ?array $contextGroups = [] );
126
127			/**
128			* @param DOMDocument $dom
129			* @param DOMElement $node
130			*
131			* @return array
132			*/
133			protected function extractContent( DOMDocument $dom, DOMNode $node ): array {
134			return [
135			'raw-content' => $this->extractTagContent( $dom, $node ),
136			'attr' => $this->extractTagAttributes( $node )
137			];
138			}
139
140			/**
141			* Extract attributes if they are present
142			*
143			* Ex:
144			* <p align=center style="font-size: 12px;">some text</p>
145			*
146			* $attr->nodeName == 'align' :: $attr->nodeValue == 'center'
147			* $attr->nodeName == 'style' :: $attr->nodeValue == 'font-size: 12px;'
148			*
149			* @param DOMNode $element
150			*
151			* @return array
152			*/
153			protected function extractTagAttributes( DOMNode $element ): array {
154			$tagAttributes = [];
155
156			if ( $element->hasAttributes() ) {
157			foreach ( $element->attributes as $attr ) {
158			$tagAttributes[ $attr->nodeName ] = $attr->nodeValue;
159			}
160			}
161
162			return $tagAttributes;
163			}
164
165			/**
166			* Extract tag content from DOMDocument node
167			*
168			* @param DOMDocument $dom
169			* @param DOMNode $element
170			*
171			* @return string
172			*/
173			protected function extractTagContent( DOMDocument $dom, DOMNode $element ): string {
174			$childNodes = $element->hasChildNodes();
175			$extractedContent = '';
176
177			if ( !empty( $childNodes ) ) {
178			foreach ( $element->childNodes as $node ) {
179			$extractedContent .= Emoji::toEntity( Strings::fixNonWellFormedXml( $dom->saveXML( $node ) ) );
180			}
181			}
182
183			return str_replace( Placeholder::EMPTY_TAG_PLACEHOLDER, '', $extractedContent );
184			}
185
186			/**
187			* Used to extract <seg-source> and <seg-target>
188			*
189			* @param DOMDocument $dom
190			* @param DOMElement $childNode
191			*
192			* @return array
193			*/
194			protected function extractContentWithMarksAndExtTags( DOMDocument $dom, DOMElement $childNode ): array {
195			$source = [];
196
197			// example:
198			// <g id="1"><mrk mid="0" mtype="seg">An English string with g tags</mrk></g>
199			$raw = $this->extractTagContent( $dom, $childNode );
200
201			$markers = preg_split( '#<mrk\s#si', $raw, -1 );
202
203			$mi = 0;
204			while ( isset( $markers[ $mi + 1 ] ) ) {
205			unset( $mid );
206
207			preg_match( '|mid\s?=\s?["\'](.*?)["\']|si', $markers[ $mi + 1 ], $mid );
208
209			// if it's a Trados file the trailing spaces after </mrk> are meaningful
210			// so we add them to
211			$trailingSpaces = '';
212			if ( $this->xliffProprietary === 'trados' ) {
213			preg_match_all( '/<\/mrk>[\s]+/iu', $markers[ $mi + 1 ], $trailingSpacesMatches );
214
215			if ( isset( $trailingSpacesMatches[ 0 ] ) && count( $trailingSpacesMatches[ 0 ] ) > 0 ) {
216			foreach ( $trailingSpacesMatches[ 0 ] as $match ) {
217			$trailingSpaces = str_replace( '</mrk>', '', $match );
218			}
219			}
220			}
221
222			//re-build the mrk tag after the split
223			$originalMark = trim( '<mrk ' . $markers[ $mi + 1 ] );
224
225			$mark_string = preg_replace( '#^<mrk\s[^>]+>(.*)#', '$1', $originalMark ); // at this point we have: ---> 'Test </mrk> </g>>'
226			$mark_content = preg_split( '#</mrk>#si', $mark_string );
227
228			$sourceArray = [
229			'mid' => ( isset( $mid[ 1 ] ) ) ? $mid[ 1 ] : $mi,
230			'ext-prec-tags' => ( $mi == 0 ? $markers[ 0 ] : "" ),
231			'raw-content' => ( isset( $mark_content[ 0 ] ) ) ? $mark_content[ 0 ] . $trailingSpaces : '',
232			'ext-succ-tags' => ( isset( $mark_content[ 1 ] ) ) ? $mark_content[ 1 ] : '',
233			];
234
235			$source[] = $sourceArray;
236
237			$mi++;
238			}
239
240			return $source;
241			}
242
243			/**
244			* @param array $originalData
245			*
246			* @return array
247			*/
248			protected function getDataRefMap( array $originalData ): array {
249			// dataRef map
250			$dataRefMap = [];
251			foreach ( $originalData as $datum ) {
252			if ( isset( $datum[ 'attr' ][ 'id' ] ) ) {
253			$dataRefMap[ $datum[ 'attr' ][ 'id' ] ] = $datum[ 'raw-content' ];
254			}
255			}
256
257			return $dataRefMap;
258			}
259
260			/**
261			* @param $raw
262			*
263			* @return bool
264			*/
265			protected function stringContainsMarks( $raw ): bool {
266			$markers = preg_split( '#<mrk\s#si', $raw, -1 );
267
268			return isset( $markers[ 1 ] );
269			}
270
271			/**
272			* @param $noteValue
273			* @param bool $escapeStrings
274			*
275			* @return array
276			* @throws Exception
277			*/
278			protected function JSONOrRawContentArray( $noteValue, ?bool $escapeStrings = true ): array {
279			//
280			// convert double escaped entites
281			//
282			// Example:
283			//
284			// &amp;#39; ---> &#39;
285			// &amp;amp; ---> &amp;
286			// &amp;apos ---> &apos;
287			//
288			if ( Strings::isADoubleEscapedEntity( $noteValue ) ) {
289			$noteValue = Strings::htmlspecialchars_decode( $noteValue, true );
290			} else {
291			// for non escaped entities $escapeStrings is always true for security reasons
292			$escapeStrings = true;
293			}
294
295			if ( Strings::isJSON( $noteValue ) ) {
296			return [ 'json' => Strings::cleanCDATA( $noteValue ) ];
297			}
298
299			return [ 'raw-content' => Strings::fixNonWellFormedXml( $noteValue, $escapeStrings ) ];
300			}
301			}
302

matecat / xliff-parser

AbstractXliffParser::extractTagContent() A last analyzed 2024-10-03 10:20 UTC

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like

AbstractXliffParser::extractTagContent() A
last analyzed 2024-10-03 10:20 UTC