|
1
|
|
|
<?php |
|
2
|
|
|
namespace NirjharLo\Cgss\Lib\Analysis\Lib; |
|
3
|
|
|
|
|
4
|
|
|
if ( ! defined( 'ABSPATH' ) ) exit; |
|
5
|
|
|
|
|
6
|
|
|
|
|
7
|
|
|
use \DomXPath; |
|
8
|
|
|
|
|
9
|
|
|
/**
 * Extracts the visible text of an HTML document through XPath and derives a
 * word list plus text-to-HTML size figures from it.
 *
 * Usage: assign $dom and $body_size, call execute() once, then read the
 * results through words(), ratio(), text() and size().
 */
class Text {

	/**
	 * Document to analyse; handed straight to the DomXPath constructor.
	 * Note that xpath() removes every <script> and <style> node from it.
	 *
	 * @var \DOMDocument
	 */
	public $dom;

	/**
	 * Size of the complete HTML, used as the divisor of the text-to-HTML ratio.
	 * Presumably in bytes, because it is compared with the byte length of the
	 * text - confirm against the caller.
	 *
	 * @var int|float
	 */
	public $body_size;

	/**
	 * Result of xpath(): an array with the keys 'content', 'ratio' and 'size'.
	 * Null until execute() has run. Declared explicitly because dynamically
	 * created properties are deprecated since PHP 8.2.
	 *
	 * @var array|null
	 */
	public $xpath;

	/**
	 * Whitespace-normalised text produced by text(). Null until execute() has run.
	 *
	 * @var string|null
	 */
	public $text;

	/**
	 * Runs the XPath extraction once and caches its results, so that the
	 * accessors below do not have to repeat the work.
	 *
	 * @return void
	 */
	public function execute() {

		$this->xpath = $this->xpath();
		$this->text  = $this->text();
	}

	/**
	 * Splits the cached text into individual words.
	 *
	 * @return array List of non-empty words, in document order.
	 */
	public function words() {

		// Punctuation and symbols act as word separators.
		$separators = array( '.', ',', ':', '\'', '"', ')', '(', ']', '[', '}', '{', ';', '+', '-', '_', '*', '&', '^', '%', '$', '#', '@', '!', '~', '?', '>', '<', '/', '\\', '|' );
		$text       = str_replace( $separators, ' ', (string) $this->text );

		// Drop control bytes (below 0x20). This is what the deprecated
		// FILTER_SANITIZE_STRING + FILTER_FLAG_STRIP_LOW call effectively did here,
		// since '<', '>', '&' and both quote characters are already gone.
		$pure_text = (string) preg_replace( '/[\x00-\x1F]/', '', $text );

		$words = array();
		foreach ( explode( ' ', $pure_text ) as $word ) {
			// Compare with '' instead of testing truthiness, otherwise the word "0" is lost.
			if ( '' !== $word ) {
				$words[] = $word;
			}
		}
		return $words;
	}

	/**
	 * Text-to-HTML ratio as a percentage, rounded to one decimal place.
	 *
	 * @return float|null Null when execute() has not been called yet.
	 */
	public function ratio() {

		return isset( $this->xpath['ratio'] ) ? $this->xpath['ratio'] : null;
	}

	/**
	 * Text content found by xpath(), with whitespace runs collapsed.
	 *
	 * @return string Empty string when execute() has not been called yet.
	 */
	public function text() {

		$content = isset( $this->xpath['content'] ) ? $this->xpath['content'] : '';

		// 1. Empty lines (and the whitespace around them) become one space.
		$content = preg_replace( '/\s*$^\s*/m', ' ', $content );
		// 2. Runs of spaces and tabs become one space.
		$content = preg_replace( '/[ \t]+/', ' ', $content );
		// 3. Remaining newlines, with adjacent spaces, become one space.
		return preg_replace( '/[ \n]+/', ' ', $content );
	}

	/**
	 * Size of the extracted text in kilobytes (bytes / 1024), rounded to one
	 * decimal place.
	 *
	 * @return float|null Null when execute() has not been called yet.
	 */
	public function size() {

		return isset( $this->xpath['size'] ) ? $this->xpath['size'] : null;
	}

	/**
	 * Builds an XPath object for $dom, strips scripts and styles, and measures
	 * the text of the <body> element.
	 *
	 * Side effect: the <script> and <style> nodes are removed from $dom itself.
	 *
	 * @return array {
	 *     @type string $content Lower-cased, trimmed text of the body ('' when there is no body).
	 *     @type float  $ratio   Text bytes as a percentage of $body_size (0.0 when $body_size is not positive).
	 *     @type float  $size    Text size in kilobytes.
	 * }
	 */
	public function xpath() {

		$xpath = new DomXPath( $this->dom );

		// Scripts and styles carry no readable text; remove them before reading the body.
		foreach ( array( '//script', '//style' ) as $expression ) {
			foreach ( $xpath->query( $expression ) as $node ) {
				if ( $node->parentNode ) {
					$node->parentNode->removeChild( $node );
				}
			}
		}

		// Select the body unconditionally: '//body[text()]' only matches a body that
		// has a direct text-node child, which misses minified markup such as
		// <body><div>...</div></body>.
		$body = $xpath->query( '//body' )->item( 0 );

		if ( $body ) {
			// DOM strings are UTF-8; mb_strtolower() also lower-cases non-ASCII letters.
			$only_text = mb_strtolower( trim( $body->nodeValue ), 'UTF-8' );
		} else {
			$only_text = '';
		}

		// Byte length of the text.
		$text_size = mb_strlen( $only_text, '8bit' );

		// Guard the division: an unset or zero body size would otherwise raise
		// DivisionByZeroError on PHP 8.
		$html_size = $this->body_size;
		if ( is_numeric( $html_size ) && $html_size > 0 ) {
			$ht_ratio = round( ( $text_size / $html_size ) * 100, 1 );
		} else {
			$ht_ratio = 0.0;
		}

		return array(
			'content' => $only_text,
			'ratio'   => $ht_ratio,
			'size'    => round( ( $text_size / 1024 ), 1 ),
		);
	}
}
|
109
|
|
|
?> |
|
|
|
|
|
|
110
|
|
|
|