matecat /
xliff-parser
| 1 | <?php |
||
| 2 | |||
| 3 | namespace Matecat\XliffParser\Utils; |
||
| 4 | |||
| 5 | use Exception; |
||
| 6 | use Matecat\XliffParser\Constants\XliffTags; |
||
| 7 | use Matecat\XliffParser\Exception\NotValidJSONException; |
||
| 8 | use SimpleXMLElement; |
||
| 9 | |||
/**
 * Static string helpers used while parsing XLIFF content: CDATA stripping,
 * JSON detection, escaping of non-XLIFF markup, removal of control characters
 * and a few small validation/measurement utilities.
 */
class Strings {

    /**
     * Lazily built regular expression matching every tag listed in XliffTags::$tags.
     * Stays null until fixNonWellFormedXml() runs for the first time.
     */
    private static ?string $find_xliff_tags_reg = null;

    /**
     * Matches one entity-like token: "&", 1 to 20 characters among "#", letters and digits, then ";".
     */
    private static string $htmlEntityRegex = '/&[#a-zA-Z0-9]{1,20};/u';

    /**
     * Returns the text content of $testString once it has been parsed as XML with
     * CDATA sections merged into plain text (LIBXML_NOCDATA).
     *
     * The string is wrapped in a synthetic <rootNoteNode> element so that a bare
     * text fragment is a parsable document.
     *
     * @param string $testString
     *
     * @return string
     * @throws Exception when the wrapped string cannot be parsed as XML
     */
    public static function cleanCDATA( string $testString ): string {
        $wrapped = '<rootNoteNode>' . $testString . '</rootNoteNode>';
        $node    = new SimpleXMLElement( $wrapped, LIBXML_NOCDATA );

        return $node->__toString();
    }

    /**
     * Tells whether $string holds a JSON object or a JSON array.
     *
     * Numeric strings, strings that cannot be parsed by cleanCDATA(), empty strings
     * and JSON primitives (quoted strings, numbers, booleans, null) are all rejected.
     *
     * @param string $string
     *
     * @return bool
     */
    public static function isJSON( string $string ): bool {
        if ( is_numeric( $string ) ) {
            return false;
        }

        try {
            $candidate = self::cleanCDATA( $string );
        } catch ( Exception $e ) {
            // Not parsable as XML text: by contract this is simply "not JSON".
            return false;
        }

        $candidate = trim( $candidate );
        if ( $candidate === '' ) {
            return false;
        }

        // Only objects and arrays are accepted, so the first character decides
        // whether decoding is attempted at all.
        if ( !in_array( $candidate[ 0 ], [ "{", "[" ], true ) ) {
            return false;
        }

        json_decode( $candidate );

        // The first slot of the error pair is the message; it is null when decoding succeeded.
        return empty( self::getLastJsonError()[ 0 ] );
    }

    /**
     * Decodes a JSON string into an associative array.
     *
     * @param string $string
     *
     * @return array the decoded array, or an empty array when the input does not decode to an array
     */
    public static function jsonToArray( string $string ): array {
        $decodedJSON = json_decode( $string, true );

        if ( !is_array( $decodedJSON ) ) {
            return [];
        }

        return $decodedJSON;
    }

    /**
     * Throws when the most recent JSON operation reported an error.
     *
     * NOTE(review): no method of this class calls this helper at the moment.
     *
     * @return void
     * @throws NotValidJSONException
     */
    private static function raiseLastJsonException(): void {
        [ $msg, $error ] = self::getLastJsonError();

        if ( $error !== JSON_ERROR_NONE ) {
            throw new NotValidJSONException( $msg, $error );
        }
    }

    /**
     * Describes the outcome of the most recent JSON operation.
     *
     * @return array a two element list: [ message or null, json_last_error() code ]
     */
    private static function getLastJsonError(): array {
        if ( !function_exists( "json_last_error" ) ) {
            return [ null, JSON_ERROR_NONE ];
        }

        $error = json_last_error();
        if ( $error === JSON_ERROR_NONE ) {
            return [ null, $error ];
        }

        $messages = [
                JSON_ERROR_DEPTH          => ' - Maximum stack depth exceeded',
                JSON_ERROR_STATE_MISMATCH => ' - Underflow or the modes mismatch',
                JSON_ERROR_CTRL_CHAR      => ' - Unexpected control character found',
                JSON_ERROR_SYNTAX         => ' - Syntax error, malformed JSON',
                JSON_ERROR_UTF8           => ' - Malformed UTF-8 characters, possibly incorrectly encoded',
        ];

        return [ $messages[ $error ] ?? ' - Unknown error', $error ];
    }

    /**
     * Escapes everything in $content that is not one of the tags listed in XliffTags::$tags.
     *
     * Some XLIFF producers put raw HTML tags, or bare "<", ">" and "&" characters,
     * directly inside <source>. This method keeps the known XLIFF tags untouched and
     * passes the rest through htmlspecialchars() with ENT_NOQUOTES and without double
     * encoding, so quotes and already existing entities are left as they are.
     *
     * Example, assuming "g" is listed in XliffTags::$tags (verify against that class):
     *   <g id="1">Hello</g>, 4 > 3      becomes   <g id="1">Hello</g>, 4 &gt; 3
     *   <g id="1">Hello</g>, 4 &gt; 3   is returned unchanged
     *
     * @param string $content
     * @param ?bool  $escapeStrings when falsy (false or null) nothing is escaped
     *
     * @return string
     */
    public static function fixNonWellFormedXml( string $content, ?bool $escapeStrings = true ): string {
        if ( self::$find_xliff_tags_reg === null ) {
            // Turn the tag list into a regexp alternation, for example "g|x|bx|ex".
            $xliff_tags_reg_list = implode( '|', XliffTags::$tags );

            // </?            tag start, opening or closing
            // (tag list)     one of the known XLIFF tag names
            // (\s[^>]*)?     optional attributes; the mandatory leading white space
            //                keeps "g" from matching "gblabla"
            // /?>            tag end, optionally self-closing
            // XML allows white space after the tag name but not between "<" or "</"
            // and the name, see http://www.w3.org/TR/REC-xml/#sec-starttags
            self::$find_xliff_tags_reg = "#</?($xliff_tags_reg_list)(\\s[^>]*)?/?>#si";
        }

        // Collect every XLIFF tag found in the content.
        preg_match_all( self::$find_xliff_tags_reg, $content, $matches );
        $tags = (array)( $matches[ 0 ] ?? [] );

        // One placeholder per distinct tag text; a repeated tag keeps the index of its last occurrence.
        $tags_placeholders = [];
        foreach ( $tags as $i => $tag ) {
            $tags_placeholders[ $tag ] = "#@!XLIFF-TAG-$i!@#";
        }

        $tagList         = array_keys( $tags_placeholders );
        $placeholderList = array_values( $tags_placeholders );

        // Hide the XLIFF tags behind placeholders that escaping will not alter.
        $content = str_replace( $tagList, $placeholderList, $content );

        // Escape whatever markup is left.
        if ( $escapeStrings ) {
            $content = htmlspecialchars( $content, ENT_NOQUOTES, 'UTF-8', false );
        }

        // Put the original XLIFF tags back.
        return str_replace( $placeholderList, $tagList, $content );
    }

    /**
     * Removes control characters, and the hexadecimal character references that
     * point to them, from a string.
     *
     * The affected code points are 0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F and 0x7F; tab,
     * line feed and carriage return are kept. References are matched with either
     * upper-case or lower-case hex digits (for example both &#x0B; and &#x0b;).
     *
     * @param $string
     *
     * @return string the cleaned string; an empty string when the input is null or
     *                when a replacement fails (preg_replace() returned null)
     */
    public static function removeDangerousChars( $string ): string {
        // Character references to the forbidden code points, in the two-digit form.
        $regexpEntity = '/&#x(0[0-8BCEFbcef]|1[0-9A-Fa-f]|7[Ff]);/u';

        // Raw binary characters found in some xliff files.
        $regexpAscii = '/[\x{00}-\x{08}\x{0B}\x{0C}\x{0E}-\x{1F}\x{7F}]/u';

        $string = preg_replace( $regexpAscii, '', $string ?? '' );
        $string = preg_replace( $regexpEntity, '', $string ?? '' );

        return $string ?? '';
    }

    /**
     * Decodes HTML special characters, leaving quote entities untouched (ENT_NOQUOTES).
     *
     * @param string $string
     * @param ?bool  $onlyEscapedEntities when exactly false the whole string is decoded in
     *                                    one pass; any other value decodes only the
     *                                    substrings matching the entity pattern of this class
     *
     * @return string the decoded string; the input is returned unchanged when the
     *                pattern based replacement fails
     */
    public static function htmlspecialchars_decode( string $string, ?bool $onlyEscapedEntities = false ): string {
        if ( false === $onlyEscapedEntities ) {
            return htmlspecialchars_decode( $string, ENT_NOQUOTES );
        }

        $decoded = preg_replace_callback(
                self::$htmlEntityRegex,
                static function ( $match ) {
                    return htmlspecialchars_decode( $match[ 0 ], ENT_NOQUOTES );
                },
                $string
        );

        // preg_replace_callback() yields null on failure, which would violate the
        // declared string return type.
        return $decoded ?? $string;
    }

    /**
     * Tells whether the string contains at least one token matching the entity
     * pattern of this class.
     *
     * Example:
     *
     * &#39; ---> true
     * '     ---> false
     *
     * @param string $str
     *
     * @return bool
     */
    public static function isADoubleEscapedEntity( string $str ): bool {
        return preg_match( self::$htmlEntityRegex, $str ) === 1;
    }

    /**
     * Checks that the value is a lower-case version 4 UUID: version digit "4" and a
     * variant digit among 8, 9, a, b.
     *
     * The D modifier makes "$" match only at the very end of the subject, so a
     * value followed by a line break is rejected.
     *
     * @param string $uuid
     *
     * @return bool false for any non-string value
     */
    public static function isAValidUuid( $uuid ): bool {
        if ( !is_string( $uuid ) ) {
            return false;
        }

        return preg_match( '/^[\da-f]{8}-[\da-f]{4}-4[\da-f]{3}-[89ab][\da-f]{3}-[\da-f]{12}$/D', $uuid ) === 1;
    }

    /**
     * Splits $subject by $pattern and drops the empty pieces (PREG_SPLIT_NO_EMPTY).
     *
     * @param $pattern
     * @param $subject
     *
     * @return array|false|string[]
     */
    public static function preg_split( $pattern, $subject ) {
        return preg_split( $pattern, $subject, -1, PREG_SPLIT_NO_EMPTY );
    }

    /**
     * Counts the space characters (U+0020 only) at the end of the segment.
     *
     * @param string $segment
     *
     * @return int
     */
    public static function getTheNumberOfTrailingSpaces( $segment ): int {
        $withoutTrailingSpaces = rtrim( $segment, ' ' );

        return mb_strlen( $segment ) - mb_strlen( $withoutTrailingSpaces );
    }

}
||
| 278 |
This check looks for private methods that have been defined, but are not used inside the class.