Issues in Strings.php (master) - Issues in master - matecat/xliff-parser - Measure and Improve Code Quality continuously with Scrutinizer

Issues (16)

src/Utils/Strings.php (1 issue)

Labels

Unused Code 1

Severity

Minor 1

<?php

namespace Matecat\XliffParser\Utils;

use Exception;
use Matecat\XliffParser\Constants\XliffTags;
use Matecat\XliffParser\Exception\NotValidJSONException;
use SimpleXMLElement;

class Strings {
    /** Lazily built regex matching any XLIFF inline tag; stays null until first needed. */
    private static ?string $find_xliff_tags_reg = null;
    /** Matches a double-escaped entity such as "&amp;#39;" or "&amp;lt;". */
    private static string  $htmlEntityRegex     = '/&amp;[#a-zA-Z0-9]{1,20};/u';

    /**
     * Wraps the given fragment in a throw-away root element, parses it with LIBXML_NOCDATA
     * (CDATA sections are merged into plain text nodes) and returns the root's text content.
     *
     * @param string $testString XML fragment, possibly containing CDATA sections
     *
     * @return string
     * @throws Exception when the wrapped fragment is not well-formed XML
     */
    public static function cleanCDATA( string $testString ): string {
        $cleanXMLContent = new SimpleXMLElement( '<rootNoteNode>' . $testString . '</rootNoteNode>', LIBXML_NOCDATA );

        return $cleanXMLContent->__toString();
    }

    /**
     * Tells whether the string (after CDATA unwrapping and trimming) is a JSON object or array.
     *
     * JSON primitives (numbers, quoted strings, true/false/null) are deliberately rejected,
     * and so is any input that cannot be parsed as an XML fragment by cleanCDATA().
     *
     * @param string $string
     *
     * @return bool
     */
    public static function isJSON( string $string ): bool {
        if ( is_numeric( $string ) ) {
            return false;
        }

        try {
            $string = self::cleanCDATA( $string );
        } catch ( Exception $e ) {
            return false;
        }

        $string = trim( $string );
        if ( $string === '' ) {
            return false;
        }

        // String representation in json is "quoted", but we want to accept only objects or arrays:
        // anything not starting with "{" or "[" is a primitive (or garbage) and is not accepted.
        if ( $string[ 0 ] !== '{' && $string[ 0 ] !== '[' ) {
            return false;
        }

        // The decoded value is irrelevant here, only the parser outcome matters.
        json_decode( $string );

        return json_last_error() === JSON_ERROR_NONE;
    }

    /**
     * Decodes a JSON document into an associative array.
     *
     * @param string $string
     *
     * @return array the decoded array, or an empty array when the input is invalid JSON
     *               or decodes to a non-array value
     */
    public static function jsonToArray( string $string ): array {
        $decodedJSON = json_decode( $string, true );

        return is_array( $decodedJSON ) ? $decodedJSON : [];
    }

    /**
     * This function exists because many developers started adding html tags directly into the XLIFF source since:
     * 1) XLIFF tag remapping is too complex for them
     * 2) Trados does not lock tags within the <source> that are expressed as &lt;b&gt; but is tolerant to html tags in <source>
     *
     * In short, people typed:
     * <source>The <b>red</b> house</source> or worse <source>5 > 3</source>
     * instead of
     * <source>The <g id="1">red</g> house.</source> and <source>5 &gt; 3</source>
     *
     * This function will do the following:
     * <g id="1">Hello</g>, 4 > 3        -> <g id="1">Hello</g>, 4 &gt; 3
     * <g id="1">Hello</g>, 4 > 3 &gt; 2 -> <g id="1">Hello</g>, 4 &gt; 3 &gt; 2
     *
     * @param string $content
     * @param bool   $escapeStrings when falsy nothing is escaped and $content is returned as is
     *
     * @return string
     */
    public static function fixNonWellFormedXml( string $content, ?bool $escapeStrings = true ): string {
        // Without escaping, swapping tags for placeholders and back is an identity operation.
        if ( !$escapeStrings ) {
            return $content;
        }

        if ( self::$find_xliff_tags_reg === null ) {
            // Convert the list of tags in a regexp list, for example "g|x|bx|ex"
            $xliff_tags_reg_list = implode( '|', XliffTags::$tags );
            // Regexp to find all the XLIFF tags:
            //   </?               -> matches the tag start, for both opening and
            //                        closure tags (see the optional slash)
            //   ($xliff_tags_reg) -> matches one of the XLIFF tags in the list above
            //   (\s[^>]*)?        -> matches attributes and so on; ensures there's a
            //                        space after the tag, to not confuse for example a
            //                        "g" tag with a "gblabla"; [^>]* matches anything,
            //                        including additional spaces; the entire block is
            //                        optional, to allow tags with no spaces or attrs
            //   /? >              -> matches tag end, with optional slash for
            //                        self-closing ones
            // If you are wondering about spaces inside tags, look at this:
            // http://www.w3.org/TR/REC-xml/#sec-starttags
            // It says that there cannot be any space between the '<' and the tag name,
            // between '</' and the tag name, or inside '/>'. But you can add white
            // space after the tag name, though.
            self::$find_xliff_tags_reg = "#</?($xliff_tags_reg_list)(\\s[^>]*)?/?>#si";
        }

        // Find all the XLIFF tags; with no match (or a regex failure) there is nothing to protect.
        if ( !preg_match_all( self::$find_xliff_tags_reg, $content, $matches ) ) {
            return htmlspecialchars( $content, ENT_NOQUOTES, 'UTF-8', false );
        }

        // Prepare one placeholder per distinct tag
        $tags_placeholders = [];
        foreach ( array_values( array_unique( $matches[ 0 ] ) ) as $i => $tag ) {
            $tags_placeholders[ $tag ] = "#@!XLIFF-TAG-$i!@#";
        }

        // Replace all XLIFF tags with placeholders that will not be escaped.
        // strtr() substitutes in a single pass (longest key first), so a tag whose text
        // contains another matched tag cannot be partially rewritten by an earlier replacement.
        $content = strtr( $content, $tags_placeholders );

        // Escape the string with the remaining non-XLIFF tags
        $content = htmlspecialchars( $content, ENT_NOQUOTES, 'UTF-8', false );

        // Put again in place the original XLIFF tags replacing placeholders
        return strtr( $content, array_flip( $tags_placeholders ) );
    }

    /**
     * Strips characters that are not allowed (or not welcome) in XML: the ASCII control
     * characters other than TAB, LF and CR, the DEL character, and the hexadecimal
     * character references pointing to them (e.g. "&#x0B;", "&#xb;", "&#x1F;").
     *
     * @param string|null $string
     *
     * @return string the cleaned string; an empty string for null input or when the input
     *                cannot be processed (e.g. it is not valid UTF-8)
     */
    public static function removeDangerousChars( $string ): string {
        // clean invalid xml entities ( characters with ascii < 32 and different from 0A, 0D and 09 ),
        // accepting upper/lower case hex digits and any number of leading zeros
        $regexpEntity = '/&#x0*([0-8BCEFbcef]|1[\dA-Fa-f]|7[Ff]);/u';

        // remove binary chars in some xliff files
        $regexpAscii = '/[\x{00}-\x{08}\x{0B}\x{0C}\x{0E}-\x{1F}\x{7F}]/u';

        $string = preg_replace( $regexpAscii, '', $string ?? '' );
        $string = preg_replace( $regexpEntity, '', $string ?? '' );

        // preg_replace() yields null on failure: never hand null to the caller
        return $string ?? '';
    }


    /**
     * Decodes HTML special characters, leaving quotes untouched (ENT_NOQUOTES).
     *
     * With $onlyEscapedEntities set to false the whole string is decoded; with any other
     * value only double-escaped entities (e.g. "&amp;#39;", "&amp;lt;") are decoded by one level.
     *
     * @param string $string
     * @param ?bool  $onlyEscapedEntities
     *
     * @return string
     */
    public static function htmlspecialchars_decode( string $string, ?bool $onlyEscapedEntities = false ): string {
        if ( false === $onlyEscapedEntities ) {
            return htmlspecialchars_decode( $string, ENT_NOQUOTES );
        }

        $decoded = preg_replace_callback(
                self::$htmlEntityRegex,
                static function ( array $match ): string {
                    return htmlspecialchars_decode( $match[ 0 ], ENT_NOQUOTES );
                },
                $string
        );

        // null means the regex engine failed (e.g. invalid UTF-8): keep the input untouched
        return $decoded ?? $string;
    }

    /**
     * Checks if a string contains a double encoded entity.
     *
     * Example:
     *
     * &amp;#39; ---> true
     * &#39;     ---> false
     *
     * @param string $str
     *
     * @return bool
     */
    public static function isADoubleEscapedEntity( string $str ): bool {
        return preg_match( self::$htmlEntityRegex, $str ) === 1;
    }

    /**
     * Checks that the value is a lower-case, version 4 UUID and nothing else.
     *
     * @param string $uuid
     *
     * @return bool
     */
    public static function isAValidUuid( $uuid ) {
        // \A and \z anchor to the real string boundaries: "$" would also accept a trailing newline
        return preg_match( '/\A[\da-f]{8}-[\da-f]{4}-4[\da-f]{3}-[89ab][\da-f]{3}-[\da-f]{12}\z/', $uuid ?? '' ) === 1;
    }

    /**
     * Splits the subject by the given pattern, discarding empty pieces.
     *
     * @param $pattern
     * @param $subject
     *
     * @return array|false|string[]
     */
    public static function preg_split( $pattern, $subject ) {
        return preg_split( $pattern, $subject, -1, PREG_SPLIT_NO_EMPTY );
    }

    /**
     * Counts the space characters (U+0020 only) at the end of the segment.
     *
     * @param string $segment
     *
     * @return int
     */
    public static function getTheNumberOfTrailingSpaces( $segment ): int {
        $segment = (string)$segment;

        // The trimmed characters are single-byte spaces, so the byte difference is exact
        // even when the segment contains malformed multibyte sequences.
        return strlen( $segment ) - strlen( rtrim( $segment, ' ' ) );
    }

}


1			<?php
2
3			namespace Matecat\XliffParser\Utils;
4
5			use Exception;
6			use Matecat\XliffParser\Constants\XliffTags;
7			use Matecat\XliffParser\Exception\NotValidJSONException;
8			use SimpleXMLElement;
9
10			class Strings {
11			private static ?string $find_xliff_tags_reg = null;
12			private static string $htmlEntityRegex = '/&[#a-zA-Z0-9]{1,20};/u';
13
14			/**
15			* @param string $testString
16			*
17			* @return string
18			* @throws Exception
19			*/
20			public static function cleanCDATA( string $testString ): string {
21			$cleanXMLContent = new SimpleXMLElement( '<rootNoteNode>' . $testString . '</rootNoteNode>', LIBXML_NOCDATA );
22
23			return $cleanXMLContent->__toString();
24			}
25
26			/**
27			* @param string $string
28			*
29			* @return bool
30			*/
31			public static function isJSON( string $string ): bool {
32			if ( is_numeric( $string ) ) {
33			return false;
34			}
35
36			try {
37			$string = Strings::cleanCDATA( $string );
38			} catch ( Exception $e ) {
39			return false;
40			}
41
42			$string = trim( $string );
43			if ( empty( $string ) ) {
44			return false;
45			}
46
47			// String representation in json is "quoted", but we want to accept only object or arrays.
48			// exclude strings and numbers and other primitive types
49			if ( in_array( $string [ 0 ], [ "{", "[" ] ) ) {
50			json_decode( $string );
51
52			return empty( self::getLastJsonError()[ 0 ] );
53			} else {
54			return false; // Not accepted: string or primitive types.
55			}
56
57			}
58
59			/**
60			* @param string $string
61			*
62			* @return array
63			*/
64			public static function jsonToArray( string $string ): array {
65			$decodedJSON = json_decode( $string, true );
66
67			return ( is_array( $decodedJSON ) ) ? $decodedJSON : [];
68			}
69
70			/**
71			* @return void
72			* @throws NotValidJSONException
73			*/
74			private static function raiseLastJsonException() {
			0 ignored issues – show Unused Code introduced 2023-01-31 16:14 UTC by Report Bug Copy Issue Report Show Similar Issues like this The method `raiseLastJsonException()` is not used, and could be removed. This check looks for private methods that have been defined, but are not used inside the class. Loading history...
75
76			[ $msg, $error ] = self::getLastJsonError();
77
78			if ( $error != JSON_ERROR_NONE ) {
79			throw new NotValidJSONException( $msg, $error );
80			}
81
82			}
83
84			/**
85			* @return array
86			*/
87			private static function getLastJsonError(): array {
88
89			if ( function_exists( "json_last_error" ) ) {
90
91			$error = json_last_error();
92
93			switch ( $error ) {
94			case JSON_ERROR_NONE:
95			$msg = null; # - No errors
96			break;
97			case JSON_ERROR_DEPTH:
98			$msg = ' - Maximum stack depth exceeded';
99			break;
100			case JSON_ERROR_STATE_MISMATCH:
101			$msg = ' - Underflow or the modes mismatch';
102			break;
103			case JSON_ERROR_CTRL_CHAR:
104			$msg = ' - Unexpected control character found';
105			break;
106			case JSON_ERROR_SYNTAX:
107			$msg = ' - Syntax error, malformed JSON';
108			break;
109			case JSON_ERROR_UTF8:
110			$msg = ' - Malformed UTF-8 characters, possibly incorrectly encoded';
111			break;
112			default:
113			$msg = ' - Unknown error';
114			break;
115			}
116
117			return [ $msg, $error ];
118			}
119
120			return [ null, JSON_ERROR_NONE ];
121
122			}
123
124			/**
125			* This function exists because many developers started adding html tags directly into the XLIFF source since:
126			* 1) XLIFF tag remapping is too complex for them
127			* 2) Trados does not lock Tags within the <source> that are expressed as >b< but is tolerant to html tags in <source>
128			*
129			* in short people typed:
130			* <source>The <b>red</d> house</source> or worst <source>5 > 3</source>
131			* instead of
132			* <source>The <g id="1">red</g> house.</source> and <source>5 > 3</source>
133			*
134			* This function will do the following
135			* <g id="1">Hello</g>, 4 > 3 -> <g id="1">Hello</g>, 4 > 3
136			* <g id="1">Hello</g>, 4 > 3 > -> <g id="1">Hello</g>, 4 > 3 > 2
137			*
138			* @param string $content
139			* @param bool $escapeStrings
140			*
141			* @return string
142			*/
143			public static function fixNonWellFormedXml( string $content, ?bool $escapeStrings = true ): string {
144			if ( self::$find_xliff_tags_reg === null ) {
145			// Convert the list of tags in a regexp list, for example "g\|x\|bx\|ex"
146			$xliffTags = XliffTags::$tags;
147			$xliff_tags_reg_list = implode( '\|', $xliffTags );
148			// Regexp to find all the XLIFF tags:
149			// </? -> matches the tag start, for both opening and
150			// closure tags (see the optional slash)
151			// ($xliff_tags_reg) -> matches one of the XLIFF tags in the list above
152			// (\s[^>]*)? -> matches attributes and so on; ensures there's a
153			// space after the tag, to not confuse for example a
154			// "g" tag with a "gblabla"; [^>]* matches anything,
155			// including additional spaces; the entire block is
156			// optional, to allow tags with no spaces or attrs
157			// /? > -> matches tag end, with optional slash for
158			// self-closing ones
159			// If you are wondering about spaces inside tags, look at this:
160			// http://www.w3.org/TR/REC-xml/#sec-starttags
161			// It says that there cannot be any space between the '<' and the tag name,
162			// between '</' and the tag name, or inside '/>'. But you can add white
163			// space after the tag name, though.
164			self::$find_xliff_tags_reg = "#</?($xliff_tags_reg_list)(\\s[^>]*)?/?>#si";
165			}
166
167			// Find all the XLIFF tags
168			preg_match_all( self::$find_xliff_tags_reg, $content, $matches );
169			$tags = (array)$matches[ 0 ];
170
171			// Prepare placeholders
172			$tags_placeholders = [];
173			$tagsNum = count( $tags );
174			for ( $i = 0; $i < $tagsNum; $i++ ) {
175			$tag = $tags[ $i ];
176			$tags_placeholders[ $tag ] = "#@!XLIFF-TAG-$i!@#";
177			}
178
179			// Replace all XLIFF tags with placeholders that will not be escaped
180			foreach ( $tags_placeholders as $tag => $placeholder ) {
181			$content = str_replace( $tag, $placeholder, $content );
182			}
183
184			// Escape the string with the remaining non-XLIFF tags
185			if ( $escapeStrings ) {
186			$content = htmlspecialchars( $content, ENT_NOQUOTES, 'UTF-8', false );
187			}
188
189			// Put again in place the original XLIFF tags replacing placeholders
190			foreach ( $tags_placeholders as $tag => $placeholder ) {
191			$content = str_replace( $placeholder, $tag, $content );
192			}
193
194			return $content;
195			}
196
197			/**
198			* @param $string
199			*
200			* @return string
201			*/
202			public static function removeDangerousChars( $string ): string {
203			// clean invalid xml entities ( characters with ascii < 32 and different from 0A, 0D and 09
204			$regexpEntity = '/&#x(0[0-8BCEF]\|1[\dA-F]\|7F);/u';
205
206			// remove binary chars in some xliff files
207			$regexpAscii = '/[\x{00}-\x{08}\x{0B}\x{0C}\x{0E}-\x{1F}\x{7F}]/u';
208
209			$string = preg_replace( $regexpAscii, '', $string ?? '' );
210			$string = preg_replace( $regexpEntity, '', $string ?? '' );
211
212			return !empty( $string ) \|\| strlen( $string ) > 0 ? $string : "";
213			}
214
215
216			/**
217			* @param string $string
218			* @param ?bool $onlyEscapedEntities
219			*
220			* @return string
221			*/
222			public static function htmlspecialchars_decode( string $string, ?bool $onlyEscapedEntities = false ): string {
223			if ( false === $onlyEscapedEntities ) {
224			return htmlspecialchars_decode( $string, ENT_NOQUOTES );
225			}
226
227			return preg_replace_callback( self::$htmlEntityRegex,
228			function ( $match ) {
229			return self::htmlspecialchars_decode( $match[ 0 ] );
230			}, $string );
231			}
232
233			/**
234			* Checks if a string is a double encoded entity.
235			*
236			* Example:
237			*
238			* &#39; ---> true
239			* ' ---> false
240			*
241			* @param string $str
242			*
243			* @return bool
244			*/
245			public static function isADoubleEscapedEntity( string $str ): bool {
246			return preg_match( self::$htmlEntityRegex, $str ) != 0;
247			}
248
249			/**
250			* @param string $uuid
251			*
252			* @return bool
253			*/
254			public static function isAValidUuid( $uuid ) {
255			return preg_match( '/^[\da-f]{8}-[\da-f]{4}-4[\da-f]{3}-[89ab][\da-f]{3}-[\da-f]{12}$/', $uuid ) === 1;
256			}
257
258			/**
259			* @param $pattern
260			* @param $subject
261			*
262			* @return array\|false\|string[]
263			*/
264			public static function preg_split( $pattern, $subject ) {
265			return preg_split( $pattern, $subject, -1, PREG_SPLIT_NO_EMPTY );
266			}
267
268			/**
269			* @param string $segment
270			*
271			* @return int
272			*/
273			public static function getTheNumberOfTrailingSpaces( $segment ): int {
274			return mb_strlen( $segment ) - mb_strlen( rtrim( $segment, ' ' ) );
275			}
276
277			}
278

matecat / xliff-parser

Issues (16)

src/Utils/Strings.php (1 issue)

Labels

Severity

Introduced By

Duplication Side-by-Side

Filter issues like