Passed
Push — master ( 474905...ba9227 )
by Dev
01:17
created

ExtractExpressions::cleanExpr()   A

Complexity

Conditions 4
Paths 6

Size

Total Lines 19
Code Lines 11

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 4
eloc 11
nc 6
nop 2
dl 0
loc 19
rs 9.9
c 0
b 0
f 0
1
<?php
2
3
namespace rOpenDev\ExtractExpression;
4
5
// curl: automatic set referrer
6
// "pomme de terre" (French for "potato") does not work
7
8
/**
 * Extracts weighted word expressions (n-grams) from a text.
 *
 * The text is cleaned once at construction time through the CleanText helpers;
 * extract() then walks every sentence, builds candidate expressions starting at
 * each word, and counts the ones that survive stop-word cleaning.
 */
class ExtractExpressions
{
    /**
     * When true, addText() filters the text through CleanText::keepOnlySentence()
     * and extract() splits every paragraph with CleanText::getSentences().
     *
     * NOTE(review): addText() runs from the constructor, so the value it sees is
     * the class default unless a subclass overrides it; setting this property on
     * an existing instance only affects extract().
     *
     * @var bool
     */
    public $onlyInSentence = false;

    /**
     * Upper bound used by extract() for the number of words in a candidate.
     *
     * NOTE(review): the bound is exclusive (candidates have 1 to
     * expressionMaxWords - 1 words) — confirm this is intended.
     *
     * @var int
     */
    public $expressionMaxWords = 5;

    /**
     * Once an expression's weighted count exceeds this value, the sentences it
     * appears in are recorded in the trail. A value of 0 or less disables it.
     *
     * @var int
     */
    public $keepTrail = 5;

    /** @var string cleaned text, set by addText() */
    protected $text;

    /** @var array<string, int> weighted count per expression */
    protected $expressions = [];

    /** @var int running count of words kept by extract() */
    protected $wordNumber = 0;

    /** @var array<string, string[]> sentences recorded per expression */
    protected $trail = [];

    /** @var bool whether the results currently held come from a completed extraction */
    private $extracted = false;

    public function __construct(string $text)
    {
        $this->addText($text);
    }

    /**
     * Cleans the raw text (HTML tags, encoding, dates) and stores it.
     *
     * @return $this
     */
    protected function addText(string $text)
    {
        $text = CleanText::stripHtmlTags($text);
        $text = CleanText::fixEncoding($text);
        $text = CleanText::removeDate($text);

        if ($this->onlyInSentence) {
            $text = CleanText::keepOnlySentence($text);
        }

        $this->text = $text;

        return $this;
    }

    /**
     * @return int the word count accumulated by extract()
     */
    public function getWordNumber()
    {
        return $this->wordNumber;
    }

    /**
     * Adds $value (which may be negative) to the word counter.
     *
     * @param int $value
     */
    protected function incrementWordNumber($value)
    {
        $this->wordNumber = $this->getWordNumber() + $value;
    }

    /**
     * Converts each expression count into a percentage of the word count,
     * rounded to two decimals.
     *
     * @return array<string, float>
     */
    public function getExpressionsByDensity()
    {
        $total = $this->getWordNumber();

        // No counted word: a density cannot be computed, so report 0 instead
        // of dividing by zero.
        if (!$total) {
            return array_fill_keys(array_keys($this->expressions), 0.0);
        }

        $densities = [];
        foreach ($this->expressions as $expression => $count) {
            $densities[$expression] = round(($count / $total) * 10000) / 100;
        }

        return $densities;
    }

    /**
     * Runs the extraction over the stored text and returns the expressions
     * sorted by descending weighted count.
     *
     * Every call starts from empty accumulators, so calling it again yields the
     * same result instead of adding to the previous one.
     *
     * @return array<string, int>
     */
    public function extract()
    {
        $this->expressions = [];
        $this->trail = [];
        $this->wordNumber = 0;

        if ($this->onlyInSentence) {
            $sentences = [];
            foreach (explode(chr(10), $this->text) as $paragraph) {
                $sentences = array_merge($sentences, CleanText::getSentences($paragraph));
            }
        } else {
            $sentences = explode(chr(10), trim($this->text));
        }

        foreach ($sentences as $sentence) {
            $sentence = CleanText::removePunctuation($sentence);
            $words = explode(' ', trim(strtolower($sentence)));

            foreach ($words as $key => $word) {
                for ($wordNumber = 1; $wordNumber < $this->expressionMaxWords; ++$wordNumber) {
                    // Candidate made of up to $wordNumber consecutive words starting at $key.
                    $expression = implode(' ', array_slice($words, $key, $wordNumber));
                    $expression = $this->cleanExpr($expression, $wordNumber);

                    $rejected = empty($expression)
                        // Cleaning shortened the candidate: skip it to avoid over-weighting.
                        || (substr_count($expression, ' ') + 1) !== $wordNumber
                        // Skip results made only of numbers or symbols.
                        || !preg_match('/[a-z]/', $expression);

                    if ($rejected) {
                        if (1 === $wordNumber) {
                            // Cancels the +1 applied after this loop, so a rejected
                            // single word is not counted.
                            $this->incrementWordNumber(-1);
                        }

                        continue;
                    }

                    // Weight: number of words left once stop words are removed.
                    $plus = 1 + substr_count(CleanText::removeStopWords($expression), ' ');
                    $this->expressions[$expression] = ($this->expressions[$expression] ?? 0) + $plus;

                    if ($this->keepTrail > 0 && $this->expressions[$expression] > $this->keepTrail) {
                        $this->trail[$expression][] = $sentence;
                    }
                }

                $this->incrementWordNumber(1);
            }
        }

        arsort($this->expressions);
        $this->extracted = true;

        return $this->expressions;
    }

    /**
     * Cleans a candidate expression.
     *
     * Candidates of one or two words go through CleanText::removeStopWords().
     * Longer ones go through CleanText::removeStopWordsAtExtremity() twice and,
     * if only one word is left, through CleanText::removeStopWords() as well.
     * Whitespace is then collapsed, and an expression that cannot be encoded
     * (invalid byte sequence) is replaced by an empty string.
     *
     * @param string $expression
     * @param int    $wordNumber number of words the candidate was built with
     *
     * @return string
     */
    protected function cleanExpr($expression, $wordNumber)
    {
        if ($wordNumber <= 2) {
            $expression = trim(CleanText::removeStopWords(' '.$expression.' '));
        } else {
            $expression = CleanText::removeStopWordsAtExtremity($expression);
            $expression = CleanText::removeStopWordsAtExtremity($expression);
            if (false === strpos($expression, ' ')) {
                $expression = trim(CleanText::removeStopWords(' '.$expression.' '));
            }
        }

        // Last clean: collapse whitespace runs.
        $expression = trim((string) preg_replace('/\s+/', ' ', $expression));

        // htmlentities() returns '' for invalid byte sequences only when
        // ENT_SUBSTITUTE / ENT_IGNORE are not set. Since PHP 8.1 the default flags
        // include ENT_SUBSTITUTE, so the flags are passed explicitly.
        if ('' === htmlentities($expression, ENT_COMPAT | ENT_HTML401)) {
            $expression = '';
        }

        return $expression;
    }

    /**
     * Returns the extracted expressions, running the extraction first if it has
     * not been done yet.
     *
     * @param int|null $number keep only the first $number expressions; null or 0 returns all
     *
     * @return array<string, int>
     */
    public function getExpressions(?int $number = null)
    {
        $this->ensureExtracted();

        return !$number ? $this->expressions : array_slice($this->expressions, 0, $number);
    }

    /**
     * Returns the sentences recorded for an expression, running the extraction
     * first if it has not been done yet.
     *
     * @return string[] empty when nothing was recorded for the expression
     */
    public function getTrail(string $expression)
    {
        $this->ensureExtracted();

        return $this->trail[$expression] ?? [];
    }

    /**
     * @return array<string, string[]> every recorded trail, indexed by expression
     */
    public function getTrails()
    {
        return $this->trail;
    }

    /**
     * Runs extract() once; later calls are no-ops.
     */
    private function ensureExtracted()
    {
        if ($this->extracted) {
            return;
        }

        $this->extract();
        // Also set here so an overriding extract() that skips the flag is still run only once.
        $this->extracted = true;
    }
}
167