Hyphenator::hyphenate()   A
last analyzed

Complexity

Conditions 3
Paths 2

Size

Total Lines 12

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 12
rs 9.8666
c 0
b 0
f 0
cc 3
nc 2
nop 1
1
<?php
2
/**
3
 * Copyright (c) 2008-2011 Andreas Heigl<[email protected]>
4
 *
5
 * Permission is hereby granted, free of charge, to any person obtaining a copy
6
 * of this software and associated documentation files (the "Software"), to deal
7
 * in the Software without restriction, including without limitation the rights
8
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
 * copies of the Software, and to permit persons to whom the Software is
10
 * furnished to do so, subject to the following conditions:
11
 *
12
 * The above copyright notice and this permission notice shall be included in
13
 * all copies or substantial portions of the Software.
14
 *
15
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
 * THE SOFTWARE.
22
 *
23
 * @category  Hyphenator
24
 * @package   Org\Heigl\Hyphenator
25
 * @author    Andreas Heigl <[email protected]>
26
 * @copyright 2008-2011 Andreas Heigl<[email protected]>
27
 * @license   http://www.opensource.org/licenses/mit-license.php MIT-License
28
 * @version   2.0.1
29
 * @link      http://github.com/heiglandreas/Hyphenator
30
 * @since     02.11.2011
31
 */
32
33
namespace Org\Heigl\Hyphenator;
34
35
use Org\Heigl\Hyphenator\Dictionary\Dictionary;
36
use Org\Heigl\Hyphenator\Dictionary\DictionaryRegistry;
37
use Org\Heigl\Hyphenator\Exception\PathNotDirException;
38
use Org\Heigl\Hyphenator\Exception\PathNotFoundException;
39
use Org\Heigl\Hyphenator\Filter\Filter;
40
use Org\Heigl\Hyphenator\Filter\FilterRegistry;
41
use Org\Heigl\Hyphenator\Tokenizer\Token;
42
use Org\Heigl\Hyphenator\Tokenizer\Tokenizer;
43
use Org\Heigl\Hyphenator\Tokenizer\TokenizerRegistry;
44
use Org\Heigl\Hyphenator\Tokenizer\TokenRegistry;
45
use Org\Heigl\Hyphenator\Tokenizer\WordToken;
46
47
/**
48
 * This class implements word-hyphenation
49
 *
50
 * Word-hyphenation is implemented on the basis of the algorithms developed by
51
 * Franklin Mark Liang for LaTeX as described in his dissertation at the department
52
 * of computer science at Stanford University.
53
 *
54
 * The idea to this package came from Mathias Nater <[email protected]> who
55
 * implemented this word-hyphenation-algorithm for javascript.
56
 *
57
 * After Implementing that algorithm for the first Hyphenator-Version I stumbled
58
 * over the information provided by LÁSZLÓ NÉMETH from OpenOffice.org.
59
 *
60
 * That brought me to change three things for the next Version of the
61
 * Hyphenator.
62
 * <ol>
63
 * <li>Use the Dictionary files from OpenOffice.org instead of the ones directly
64
 * from Tex because the OOo-Files are already stripped of the unnecessary
65
 * information</li>
66
 * <li>Add the possibility to use non-standard hyphenations</li>
67
 * <li>Add the possibility to add better word-tokenising</li>
68
 * </ol>
69
 *
70
 * Beside those changes there are some other changes between the first and the
71
 * second version of the Hyphenator.
72
 *
73
 * So Version 2 of the Hyphenator<ul>
74
 * <li>requires PHP5.3 as it uses namespaces.</li>
75
 * <li>aims to 100% Code-Coverage via Unit-Tests</li>
76
 * <li>removes some unnecessary options</li>
77
 * <li>is completely rewritten from scratch</li>
78
 * </ul>
79
 *
80
 * So here is the smallest example for the usage of the class:
81
 * <code>
82
 * &lt;?php
83
 * use \Org\Heigl\Hyphenator as h;
84
 * // Create a hyphenator-instance based on a given config-file
85
 * $hyphenator = h\Hyphenator::factory('/path/to/the/config/file.properties');
86
 *
87
 * // And hyphenate a given string
88
 * $hyphenatedText = $hyphenator->hyphenate($string);
89
 * </code>
90
 * Registering the autoloader is essential before the first call to the
91
 * Hyphenator
92
 * <code language="php">
93
 * &lt;?php
94
 * require_once '/path/to/Org/Heigl/Hyphenator/Hyphenator.php';
95
 * spl_autoload_register('\Org\Heigl\Hyphenator\Hyphenator::__autoload');
96
 * </code>
97
 * Of course the Hyphenator can be adapted to most requirements via an
98
 * Options-Object. And the tokenisation in this small example uses the simple
99
 * WhiteSpace-Tokenizer. Other more complex Tokenizers are available.
100
 *
101
 * Examples for those can be found at http://github.com/heiglandreas/Hyphenator
102
 *
103
 * @category  Org_Heigl
104
 * @package   Org_Heigl_Hyphenator
105
 * @author    Andreas Heigl <[email protected]>
106
 * @copyright 2008-2011 Andreas Heigl
107
 * @license   http://www.opensource.org/licenses/mit-license.php MIT-License
108
 * @version   2.0.1
109
 * @link      http://code.google.com/p/hyphenator
110
 * @link      http://www.tug.org/docs/liang/liang-thesis.pdf
111
 * @link      http://hunspell.sourceforge.net/tb87nemeth.pdf
112
 * @link      http://github.com/heiglandreas/Hyphenator
113
 * @since     04.11.2011
114
 */
115
final class Hyphenator
116
{
117
118
    /**
119
     * The highest possible hyphenation quality
120
     *
121
     * @const int QUALITY_HIGHEST
122
     */
123
    const QUALITY_HIGHEST = 9;
124
125
    /**
126
     * A high hyphenation quality
127
     *
128
     * @const int QUALITY_HIGH
129
     */
130
    const QUALITY_HIGH    = 7;
131
132
    /**
133
     * A medium hyphenation quality
134
     *
135
     * @const int QUALITY_NORMAL
136
     */
137
    const QUALITY_NORMAL  = 5;
138
139
    /**
140
     * A low hyphenation quality
141
     *
142
     * @const int QUALITY_LOW
143
     */
144
    const QUALITY_LOW     = 3;
145
146
    /**
147
     * The lowest possible hyphenation quality
148
     *
149
     * @const int QUALITY_LOWEST
150
     */
151
    const QUALITY_LOWEST  = 1;
152
153
    /**
154
     * Storage for the Home-path.
155
     *
156
     * The hyphenation-files are searched in different places.
157
     * <ol><li>Location given via the constant HYPHENATOR_HOME</li>
158
     * <li>Location set via \Org\Heigl\Hyphenator\Hyphenator::setDefaultHome()</li>
159
     * <li>Location set via \Org\Heigl\Hyphenator\Hyphenator::setHome()</li>
160
     * <li>The 'share'-Folder inside the Hyphenator-Package</li>
161
     * </ol>
162
     *
163
     * The configuration-object can also be obtained using the
164
     * \Org\Heigl\Hyphenator::getConfig()-Method and can then be adapted
165
     * according to one's needs.
166
     *
167
     * @var string $homePath
168
     */
169
    private $homePath = '';
170
171
    /**
172
     * Storage of the default Home-Path.
173
     *
174
     * @var string $defaultHomePath
175
     */
176
    private static $defaultHomePath = '';
177
178
    /**
179
     * Storage for the Options-Object.
180
     *
181
     * @var Options $options
182
     */
183
    private $options;
184
185
    /**
186
     * Storage for the Dictionaries.
187
     *
188
     * @var DictionaryRegistry $dicts
189
     */
190
    private $dicts;
191
192
    /**
193
     * Storage for the Filters.
194
     *
195
     * @var FilterRegistry $filters
196
     */
197
    private $filters;
198
199
    /**
200
     * Storage for the tokenizers.
201
     *
202
     * @var TokenizerRegistry $tokenizers
203
     */
204
    private $tokenizers;
205
206
    /**
     * Replace the options used by this hyphenator.
     *
     * After storing the options the tokenizer registry is cleaned up and every
     * tokenizer listed in the new options is registered through addTokenizer().
     *
     * @param Options $options The options to set
     *
     * @return Hyphenator This instance, to allow method chaining
     */
    public function setOptions(Options $options)
    {
        $this->options = $options;

        // NOTE(review): cleanup() presumably empties the registry so that only
        // the tokenizers of the new options remain - confirm in TokenizerRegistry.
        $this->tokenizers->cleanup();

        $configured = $options->getTokenizers();
        foreach ($configured as $entry) {
            $this->addTokenizer($entry);
        }

        return $this;
    }
223
224
    /**
     * Return the options currently assigned to this hyphenator.
     *
     * @return Options The options stored by the last call to setOptions()
     */
    public function getOptions()
    {
        return $this->options;
    }
233
234
    /**
     * Register a dictionary with this hyphenator.
     *
     * A value that is not already a Dictionary instance is handed to
     * Dictionary::factory() after the dictionary file location has been set to
     * the "files/dictionaries" folder below this instance's home path.
     *
     * @param Dictionary|string $dictionary A dictionary instance or a value
     * accepted by Dictionary::factory() (getDictionaries() passes the default
     * locale of the options)
     *
     * @return Hyphenator This instance, to allow method chaining
     */
    public function addDictionary($dictionary)
    {
        $instance = $dictionary;

        if (! $instance instanceof Dictionary) {
            $dictionaryFolder = $this->getHomePath() . '/files/dictionaries';
            Dictionary::setFileLocation($dictionaryFolder);
            $instance = Dictionary::factory($dictionary);
        }

        $this->dicts->add($instance);

        return $this;
    }
252
253
    /**
     * Register a filter with this hyphenator.
     *
     * A value that is not a Filter instance is treated as a short name: its
     * first character is upper-cased and it is wrapped into the class name
     * \Org\Heigl\Hyphenator\Filter\<Name>Filter, which is then instantiated.
     * The filter receives the current options before it is added.
     *
     * @param Filter|string $filter A filter instance or the short name of a
     * filter class
     *
     * @link http://hunspell.sourceforge.net/tb87nemeth.pdf
     * @return Hyphenator This instance, to allow method chaining
     */
    public function addFilter($filter)
    {
        $instance = $filter;

        if (! $instance instanceof Filter) {
            $className = '\\Org\\Heigl\\Hyphenator\\Filter\\' . ucfirst($filter) . 'Filter';
            /** @var Filter $instance */
            $instance = new $className();
        }

        $currentOptions = $this->getOptions();
        $instance->setOptions($currentOptions);
        $this->filters->add($instance);

        return $this;
    }
274
275
    /**
     * Register a tokenizer with this hyphenator.
     *
     * A value that is not a Tokenizer instance is treated as a short name: its
     * first character is upper-cased and it is wrapped into the class name
     * \Org\Heigl\Hyphenator\Tokenizer\<Name>Tokenizer, which is then
     * instantiated.
     *
     * @param Tokenizer|string $tokenizer A tokenizer instance or the short
     * name of a tokenizer class
     *
     * @return Hyphenator This instance, to allow method chaining
     */
    public function addTokenizer($tokenizer)
    {
        $instance = $tokenizer;

        if (! $instance instanceof Tokenizer) {
            $className = '\\Org\\Heigl\Hyphenator\\Tokenizer\\' . ucfirst($tokenizer) . 'Tokenizer';
            /** @var Tokenizer $instance */
            $instance = new $className();
        }

        $this->tokenizers->add($instance);

        return $this;
    }
293
294
    /**
     * Return the tokenizer registry.
     *
     * If the registry reports a count of zero, the tokenizers listed in the
     * options are registered first.
     *
     * @return TokenizerRegistry
     */
    public function getTokenizers()
    {
        if (0 != $this->tokenizers->count()) {
            return $this->tokenizers;
        }

        // Nothing registered yet: fall back to what the options list.
        $configured = $this->getOptions()->getTokenizers();
        foreach ($configured as $entry) {
            $this->addTokenizer($entry);
        }

        return $this->tokenizers;
    }
309
310
    /**
     * Return the dictionary registry.
     *
     * If the registry reports a count of zero, a dictionary for the default
     * locale of the options is added first.
     *
     * @return DictionaryRegistry
     */
    public function getDictionaries()
    {
        if (0 != $this->dicts->count()) {
            return $this->dicts;
        }

        // Nothing registered yet: fall back to the default locale.
        $defaultLocale = $this->getOptions()->getDefaultLocale();
        $this->addDictionary($defaultLocale);

        return $this->dicts;
    }
323
324
    /**
     * Return the filter registry.
     *
     * If the registry reports a count of zero, the filters listed in the
     * options are registered first.
     *
     * @return FilterRegistry
     */
    public function getFilters()
    {
        if (0 != $this->filters->count()) {
            return $this->filters;
        }

        // Nothing registered yet: fall back to what the options list.
        $configured = $this->getOptions()->getFilters();
        foreach ($configured as $entry) {
            $this->addFilter($entry);
        }

        return $this->filters;
    }
339
340
    /**
     * Create a hyphenator with fresh registries and load its options.
     *
     * The options are created by Options::factory() from the file
     * "Hyphenator.properties" inside the path returned by getHomePath().
     */
    public function __construct()
    {
        $this->dicts      = new DictionaryRegistry();
        $this->filters    = new FilterRegistry();
        $this->tokenizers = new TokenizerRegistry();

        // The registries have to exist before setOptions() runs, because
        // setOptions() works on the tokenizer registry.
        $propertiesFile = $this->getHomePath() . DIRECTORY_SEPARATOR . 'Hyphenator.properties';
        $loadedOptions  = Options::factory($propertiesFile);
        $this->setOptions($loadedOptions);
    }
349
350
    /**
     * Hyphenate the given string.
     *
     * The string is tokenized by the tokenizer registry, hyphenation patterns
     * are attached to the tokens via getHyphenationPattern() and the tokens are
     * then passed through the registered filters. How the text is split
     * therefore depends on the registered tokenizers.
     *
     * When exactly one token remains and exactly one filter is registered, the
     * hyphenated content of that single token is returned directly. In every
     * other case the filter registry concatenates the tokens.
     *
     * @param string $string The string to hyphenate
     *
     * @return string|array<array-key, mixed> The hyphenated string
     */
    public function hyphenate($string)
    {
        $registry = $this->tokenizers->tokenize($string);
        $registry = $this->getHyphenationPattern($registry);
        $registry = $this->filter($registry);

        // The filter count is only looked at when there is a single token.
        $isSingleToken = 1 === count($registry);
        $isSingleCase  = $isSingleToken && 1 === $this->getFilters()->count();

        if (! $isSingleCase) {
            return $this->getFilters()->concatenate($registry);
        }

        $registry->rewind();

        return $registry->current()->getHyphenatedContent();
    }
379
380
    /**
     * Attach hyphenation patterns to the word tokens of a registry.
     *
     * Tokens that are not WordToken instances and word tokens shorter than the
     * minimum word length of the options are left untouched. All other tokens
     * are passed to getPatternForToken().
     *
     * @param TokenRegistry $registry The tokens to process
     *
     * @return TokenRegistry The registry that was passed in
     */
    public function getHyphenationPattern(TokenRegistry $registry)
    {
        $shortestAllowed = $this->getOptions()->getMinWordLength();

        foreach ($registry as $entry) {
            // length() is only called on word tokens thanks to short-circuiting.
            $isLongEnoughWord = $entry instanceof WordToken
                && ! ($shortestAllowed > $entry->length());

            if ($isLongEnoughWord) {
                $this->getPatternForToken($entry);
            }
        }

        return $registry;
    }
405
406
    /**
     * Run the given tokens through the registered filters.
     *
     * @param TokenRegistry $registry The tokens to filter
     *
     * @return TokenRegistry The result of the filter registry's filter() call
     */
    public function filter(TokenRegistry $registry)
    {
        $filters = $this->getFilters();

        return $filters->filter($registry);
    }
418
419
    /**
     * Add the hyphenation patterns for a single word token.
     *
     * The patterns are requested from the dictionary registry for the value
     * returned by the token's get() method and added to the token.
     *
     * @param WordToken $token The token to hyphenate
     *
     * @return Token The token that was passed in
     */
    public function getPatternForToken(WordToken $token)
    {
        $dictionaries = $this->getDictionaries();
        $patterns     = $dictionaries->getHyphenationPatterns($token->get());
        $token->addPattern($patterns);

        return $token;
    }
432
433
    /**
     * Set the default home path shared by all instances.
     *
     * The path is stored in the form returned by realpath().
     *
     * @param string $homePath The default Hyphenator home path
     *
     * @throws PathNotFoundException When the path does not exist
     * @throws PathNotDirException   When the path is not a directory
     * @return void
     */
    public static function setDefaultHomePath($homePath)
    {
        $exists = file_exists($homePath);
        if (! $exists) {
            throw new PathNotFoundException($homePath . ' does not exist');
        }

        $isDirectory = is_dir($homePath);
        if (! $isDirectory) {
            throw new PathNotDirException($homePath . ' is not a directory');
        }

        self::$defaultHomePath = realpath($homePath);
    }
453
454
    /**
     * Determine the default home path.
     *
     * The first existing directory of the following candidates is used:
     * <ol><li>the path stored via setDefaultHomePath()</li>
     * <li>the constant HYPHENATOR_HOME (returned through realpath())</li>
     * <li>the environment variable HYPHENATOR_HOME (returned as given)</li>
     * </ol>
     * If none of them applies, the "share" folder next to this file is
     * returned without checking that it exists.
     *
     * @return string
     */
    public static function getDefaultHomePath()
    {
        if (is_dir(self::$defaultHomePath)) {
            return self::$defaultHomePath;
        }

        $hasConstant = defined('HYPHENATOR_HOME');
        if ($hasConstant && is_dir(HYPHENATOR_HOME)) {
            return realpath(HYPHENATOR_HOME);
        }

        $envHome = getenv('HYPHENATOR_HOME');
        if ($envHome && is_dir($envHome)) {
            return $envHome;
        }

        return __DIR__ . '/share';
    }
475
476
    /**
     * Set the home path of this instance.
     *
     * The path is stored in the form returned by realpath().
     *
     * @param string $homePath This instance's home path
     *
     * @throws PathNotFoundException When the path does not exist
     * @throws PathNotDirException   When the path is not a directory
     * @return Hyphenator This instance, to allow method chaining
     */
    public function setHomePath($homePath)
    {
        $exists = file_exists($homePath);
        if (! $exists) {
            throw new PathNotFoundException($homePath . ' does not exist');
        }

        $isDirectory = is_dir($homePath);
        if (! $isDirectory) {
            throw new PathNotDirException($homePath . ' is not a directory');
        }

        $this->homePath = realpath($homePath);

        return $this;
    }
498
499
    /**
     * Return the home path of this instance.
     *
     * When the stored instance path is not a directory (it is an empty string
     * until setHomePath() has been called) the result of
     * \Org\Heigl\Hyphenator\Hyphenator::getDefaultHomePath() is returned
     * instead.
     *
     * @return string
     */
    public function getHomePath()
    {
        return is_dir($this->homePath)
            ? $this->homePath
            : self::getDefaultHomePath();
    }
516
517
    /**
     * Create a new Hyphenator, optionally with a home path and a locale.
     *
     * A path that is given and exists is passed to setHomePath(); a path that
     * does not exist is silently ignored. A given locale becomes the default
     * locale of the instance's options.
     *
     * NOTE(review): the constructor has already loaded the options from the
     * home path that was in effect before setHomePath() is called here, and
     * they are not reloaded afterwards - confirm that this is intended.
     *
     * @param string|null $path   An existing directory to use as home path
     * @param string|null $locale The locale to be used
     *
     * @throws PathNotDirException When $path exists but is not a directory
     * (raised by setHomePath())
     * @return Hyphenator
     */
    public static function factory($path = null, $locale = null)
    {
        $instance = new self();

        $hasUsablePath = null !== $path && file_exists($path);
        if ($hasUsablePath) {
            $instance->setHomePath($path);
        }

        if (null !== $locale) {
            $options = $instance->getOptions();
            $options->setDefaultLocale($locale);
        }

        return $instance;
    }
541
542
    /**
     * Autoload classes of the Org\Heigl\Hyphenator namespace.
     *
     * The namespace prefix is stripped from the class name, the remaining
     * namespace separators are turned into directory separators and the
     * resulting "<name>.php" file is included relative to the folder of this
     * file.
     *
     * @param string $className the name of the class to load
     *
     * @return bool true when a matching file was included, false when the
     * class does not belong to this package or no file could be included
     */
    public static function __autoload($className)
    {
        // The trailing separator is part of the prefix on purpose: without it
        // a sibling namespace such as "Org\Heigl\HyphenatorX" would pass the
        // check and be mapped onto files of this package.
        $prefix       = 'Org\\Heigl\\Hyphenator\\';
        $prefixLength = strlen($prefix);
        if (0 !== strncmp($className, $prefix, $prefixLength)) {
            return false;
        }

        $relativeName = substr($className, $prefixLength);
        if (false === $relativeName || '' === $relativeName) {
            return false;
        }

        $file     = str_replace('\\', '/', $relativeName) . '.php';
        $fileName = __DIR__ . DIRECTORY_SEPARATOR . $file;
        if (! is_file($fileName)) {
            return false;
        }

        // Warnings stay suppressed as before; a failed include is reported
        // through the return value instead.
        if (! @include_once $fileName) {
            return false;
        }

        return true;
    }
566
567
    /**
     * Register this package's autoloader with the SPL autoload stack.
     *
     * @return void
     */
    public static function registerAutoload()
    {
        $loader = [self::class, '__autoload'];
        spl_autoload_register($loader);
    }
576
}
577
578
/*
 * Requirement check executed when this file is loaded: without the
 * mbstring-extension an exception is thrown, otherwise the internal
 * multibyte encoding is switched to UTF-8.
 */
if (false === extension_loaded('mbstring')) {
    throw new \Exception('\Org\Heigl\Hyphenator requires the mbstring-extension to be loaded');
}
mb_internal_encoding('UTF-8');
585