WikiTextStructure::extractHeadingBeforeFirstHeading() - Code Metrics - Inspection of "Daily Inspection: Merge "Add DEFAULTSORT to search..." - wikimedia/mediawiki - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Branch — master (dc3656)

unknown

created 2016-09-01 17:35 UTC

extractHeadingBeforeFirstHeading() B

↳ Parent: WikiTextStructure

Complexity

Conditions	4
Paths	4

Size

Total Lines	28
Code Lines	15

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	4
eloc	15
nc	4
nop	1
dl	0
loc	28
rs	8.5806
c	0
b	0
f	0

<?php

use HtmlFormatter\HtmlFormatter;
.
|-- OtherDir
|   |-- Bar.php
|   `-- Foo.php
`-- SomeDir
    `-- Foo.php
use MediaWiki\Logger\LoggerFactory;

/**
 * Class allowing to explore structure of parsed wikitext.
 *
 * Wraps a ParserOutput and exposes the pieces of it that are interesting for search:
 * the headings, the opening text (text before the first heading), the main text,
 * the auxiliary text and the defaultsort property. The text parts are extracted
 * lazily on first access and then kept on the instance.
 */
class WikiTextStructure {
	/**
	 * @var string|null Text before the first heading, null when there is none
	 */
	private $openingText;
	/**
	 * @var string|null Main text; null until extractWikitextParts() has run
	 */
	private $allText;
	/**
	 * @var string[] Text of the elements matched by $auxiliaryElementSelectors
	 */
	private $auxText = [];
	/**
	 * @var ParserOutput
	 */
	private $parserOutput;

	/**
	 * @var string[] selectors to elements that are excluded entirely from search
	 */
	private $excludedElementSelectors = [
		'audio', 'video',       // "it looks like you don't have javascript enabled..."
		                        // do not need to index
		'sup.reference',        // The [1] for references
		'.mw-cite-backlink',    // The ↑ next to references in the references section
		'h1', 'h2', 'h3',       // Headings are already indexed in their own field.
		'h5', 'h6', 'h4',
		'.autocollapse',        // Collapsed fields are hidden by default so we don't want them
								// showing up.
	];

	/**
	 * @var string[] selectors to elements that are considered auxiliary to article text for search
	 */
	private $auxiliaryElementSelectors = [
		'.thumbcaption',        // Thumbnail captions aren't really part of the text proper
		'table',                // Neither are tables
		'.rellink',             // Common style for "See also:".
		'.dablink',             // Common style for calling out helpful links at the top
								// of the article.
		'.searchaux',           // New class users can use to mark stuff as auxiliary to searches.
	];

	/**
	 * WikiTextStructure constructor.
	 * @param ParserOutput $parserOutput
	 */
	public function __construct( ParserOutput $parserOutput ) {
		$this->parserOutput = $parserOutput;
	}

	/**
	 * Get headings on the page.
	 *
	 * First strip out things that look like references.  We can't use HTML filtering because
	 * the references come back as <sup> tags without a class.  To keep from breaking stuff like
	 *  ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
	 * we don't remove the whole <sup> tag.  We also don't want to strip the <sup> tag and remove
	 * everything that looks like [2] because there might be a legitimate heading such as
	 * "Word [2] Foo".  So we only strip things that look like <sup> tags wrapping a
	 * reference.  And since the data looks like:
	 *      Reference in heading <sup>&#91;1&#93;</sup><sup>&#91;2&#93;</sup>
	 * we can not really use HtmlFormatter as we have no suitable selector.
	 *
	 * @return string[] Plain-text headings, in page order, without the ignored ones
	 */
	public function headings() {
		$headings = [];
		$ignoredHeadings = $this->getIgnoredHeadings();
		foreach ( $this->parserOutput->getSections() as $heading ) {
			$heading = $heading[ 'line' ];

			// Some wikis wrap the brackets in a span:
			// http://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
			$heading = preg_replace( '/<\/?span>/', '', $heading );
			// Normalize [] so the following regexp would work.
			$heading = preg_replace( [ '/&#91;/', '/&#93;/' ], [ '[', ']' ], $heading );
			$heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/is', '', $heading );

			// Strip tags from the heading or else we'll display them (escaped) in search results
			$heading = trim( Sanitizer::stripAllTags( $heading ) );

			// Note that we don't take the level of the heading into account - all headings are equal.
			// Except the ones we ignore. The comparison is strict so that numeric-looking
			// headings (e.g. "1.0" vs. an ignored "1") are not treated as equal.
			if ( !in_array( $heading, $ignoredHeadings, true ) ) {
				$headings[] = $heading;
			}
		}
		return $headings;
	}

	/**
	 * Parse a message content into an array. This function is generally used to
	 * parse settings stored as i18n messages (see search-ignored-headings).
	 *
	 * Everything from a "#" to the end of its line is treated as a comment, surrounding
	 * whitespace is trimmed and empty lines are dropped. The keys of the returned array
	 * are the original line indexes, so they are not necessarily consecutive.
	 *
	 * @param string $message
	 * @return string[]
	 */
	public static function parseSettingsInMessage( $message ) {
		$lines = explode( "\n", $message );
		$lines = preg_replace( '/#.*$/', '', $lines ); // Remove comments
		$lines = array_map( 'trim', $lines );          // Remove extra spaces
		// Remove empty lines. Compare against '' explicitly: a bare array_filter()
		// would also throw away a line that consists of "0".
		$lines = array_filter( $lines, function ( $line ) {
			return $line !== '';
		} );
		return $lines;
	}

	/**
	 * Get list of heading to ignore.
	 *
	 * The list is read from the 'search-ignored-headings' message (falling back to
	 * 'cirrussearch-ignored-headings') once and then cached in a function-level static,
	 * so it is shared by all instances for the rest of the request.
	 *
	 * @return string[]
	 */
	private function getIgnoredHeadings() {
		static $ignoredHeadings = null;
		if ( $ignoredHeadings === null ) {
			$ignoredHeadings = [];
			$source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
			if ( $source->isBlank() ) {
				// Try old version too, just in case
				$source = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage();
			}
			if ( !$source->isDisabled() ) {
				// After parsing we just have headings!
				$ignoredHeadings = self::parseSettingsInMessage( $source->plain() );
			}
		}
		return $ignoredHeadings;
	}

	/**
	 * Extract parts of the text - opening, main and auxiliary.
	 *
	 * Does nothing when the parts were already extracted ($allText is set).
	 */
	private function extractWikitextParts() {
		if ( $this->allText !== null ) {
			return;
		}
		$this->parserOutput->setEditSectionTokens( false );
		$this->parserOutput->setTOCEnabled( false );
		$text = $this->parserOutput->getText();
		if ( strlen( $text ) === 0 ) {
			$this->allText = "";
			// empty text - nothing to seek here
			return;
		}

		// Must run on the untouched HTML, before the <br> spacing below is added.
		$this->openingText = $this->extractHeadingBeforeFirstHeading( $text );

		// Add extra spacing around break tags so text crammed together like<br>this
		// doesn't make one word.
		$text = str_replace( '<br', "\n<br", $text );

		$formatter = new HtmlFormatter( $text );

		// Strip elements from the page that we never want in the search text.
		$formatter->remove( $this->excludedElementSelectors );
		$formatter->filterContent();

		// Strip elements from the page that are auxiliary text.  These will still be
		// searched but matches will be ranked lower and non-auxiliary matches will be
		// preferred in highlighting.
		$formatter->remove( $this->auxiliaryElementSelectors );
		$auxiliaryElements = $formatter->filterContent();
		$this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
		foreach ( $auxiliaryElements as $auxiliaryElement ) {
			$this->auxText[] =
				trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
		}
	}

	/**
	 * Get text before first heading.
	 *
	 * @param string $text HTML of the page
	 * @return string|null Plain text found before the first <h1>..<h6> tag, or null when
	 *  the page has no heading or no text remains in front of it after filtering
	 */
	private function extractHeadingBeforeFirstHeading( $text ) {
		$matches = [];
		if ( !preg_match( '/<h[123456]>/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
			// There isn't a first heading so we interpret this as the article
			// being entirely without heading.
			return null;
		}
		// $matches[0][1] is the byte offset of the first heading tag.
		$text = substr( $text, 0, $matches[ 0 ][ 1 ] );
		// Explicit comparison instead of !$text so that a text of "0" is not
		// mistaken for an empty one; the cast covers a false result from substr().
		if ( (string)$text === '' ) {
			// There isn't any text before the first heading so we declare there isn't
			// a first heading.
			return null;
		}

		$formatter = new HtmlFormatter( $text );
		$formatter->remove( $this->excludedElementSelectors );
		$formatter->remove( $this->auxiliaryElementSelectors );
		$formatter->filterContent();
		$text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );

		if ( $text === '' ) {
			// There isn't any text after filtering before the first heading so we declare
			// that there isn't a first heading.
			return null;
		}

		return $text;
	}

	/**
	 * Get opening text
	 * @return string|null Text before the first heading, null when there is none
	 */
	public function getOpeningText() {
		$this->extractWikitextParts();
		return $this->openingText;
	}

	/**
	 * Get main text
	 * @return string
	 */
	public function getMainText() {
		$this->extractWikitextParts();
		return $this->allText;
	}

	/**
	 * Get auxiliary text
	 * @return string[]
	 */
	public function getAuxiliaryText() {
		$this->extractWikitextParts();
		return $this->auxText;
	}

	/**
	 * Get the defaultsort property
	 * @return string|null
	 */
	public function getDefaultSort() {
		return $this->parserOutput->getProperty( 'defaultsort' );
	}
}


1			<?php
2
3			use HtmlFormatter\HtmlFormatter;
			0 ignored issues – show Bug introduced 2016-08-04 17:50 UTC by Report Bug Copy Issue Report This use statement conflicts with another class in this namespace, `HtmlFormatter`. Let’s assume that you have a directory layout like this: . \|-- OtherDir \| \|-- Bar.php \| `-- Foo.php `-- SomeDir `-- Foo.php and let’s assume the following content of `Bar.php`: // Bar.php namespace OtherDir; use SomeDir\Foo; // This now conflicts with the class OtherDir\Foo If both files `OtherDir/Foo.php` and `SomeDir/Foo.php` are loaded in the same runtime, you will see a PHP error such as the following: PHP Fatal error: Cannot use SomeDir\Foo as Foo because the name is already in use in OtherDir/Foo.php However, as `OtherDir/Foo.php` does not necessarily have to be loaded and the error is only triggered if it is loaded before `OtherDir/Bar.php`, this problem might go unnoticed for a while. In order to prevent this error from surfacing, you must import the namespace with a different alias: // Bar.php namespace OtherDir; use SomeDir\Foo as SomeDirFoo; // There is no conflict anymore. Loading history...
4			use MediaWiki\Logger\LoggerFactory;
5
6			/**
7			* Class allowing to explore structure of parsed wikitext.
8			*/
9			class WikiTextStructure {
10			/**
11			* @var string
12			*/
13			private $openingText;
14			/**
15			* @var string
16			*/
17			private $allText;
18			/**
19			* @var string[]
20			*/
21			private $auxText = [];
22			/**
23			* @var ParserOutput
24			*/
25			private $parserOutput;
26
27			/**
28			* @var string[] selectors to elements that are excluded entirely from search
29			*/
30			private $excludedElementSelectors = [
31			'audio', 'video', // "it looks like you don't have javascript enabled..."
32			// do not need to index
33			'sup.reference', // The [1] for references
34			'.mw-cite-backlink', // The ↑ next to references in the references section
35			'h1', 'h2', 'h3', // Headings are already indexed in their own field.
36			'h5', 'h6', 'h4',
37			'.autocollapse', // Collapsed fields are hidden by default so we don't want them
38			// showing up.
39			];
40
41			/**
42			* @var string[] selectors to elements that are considered auxiliary to article text for search
43			*/
44			private $auxiliaryElementSelectors = [
45			'.thumbcaption', // Thumbnail captions aren't really part of the text proper
46			'table', // Neither are tables
47			'.rellink', // Common style for "See also:".
48			'.dablink', // Common style for calling out helpful links at the top
49			// of the article.
50			'.searchaux', // New class users can use to mark stuff as auxiliary to searches.
51			];
52
53			/**
54			* WikiTextStructure constructor.
55			* @param ParserOutput $parserOutput
56			*/
57			public function __construct( ParserOutput $parserOutput ) {
58			$this->parserOutput = $parserOutput;
59			}
60
61			/**
62			* Get headings on the page.
63			* @return string[]
64			* First strip out things that look like references. We can't use HTML filtering because
65			* the references come back as <sup> tags without a class. To keep from breaking stuff like
66			* ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
67			* we don't remove the whole <sup> tag. We also don't want to strip the <sup> tag and remove
68			* everything that looks like [2] because, I dunno, maybe there is a band named Word [2] Foo
69			* or something. Whatever. So we only strip things that look like <sup> tags wrapping a
70			* reference. And since the data looks like:
71			* Reference in heading <sup>[1]</sup><sup>[2]</sup>
72			* we can not really use HtmlFormatter as we have no suitable selector.
73			*/
74			public function headings() {
75			$headings = [];
76			$ignoredHeadings = $this->getIgnoredHeadings();
77			foreach ( $this->parserOutput->getSections() as $heading ) {
78			$heading = $heading[ 'line' ];
79
80			// Some wikis wrap the brackets in a span:
81			// http://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
82			$heading = preg_replace( '/<\/?span>/', '', $heading );
83			// Normalize [] so the following regexp would work.
84			$heading = preg_replace( [ '/[/', '/]/' ], [ '[', ']' ], $heading );
85			$heading = preg_replace( '/<sup>\s\[\s\d+\s\]\s<\/sup>/is', '', $heading );
86
87			// Strip tags from the heading or else we'll display them (escaped) in search results
88			$heading = trim( Sanitizer::stripAllTags( $heading ) );
89
90			// Note that we don't take the level of the heading into account - all headings are equal.
91			// Except the ones we ignore.
92			if ( !in_array( $heading, $ignoredHeadings ) ) {
93			$headings[] = $heading;
94			}
95			}
96			return $headings;
97			}
98
99			/**
100			* Parse a message content into an array. This function is generally used to
101			* parse settings stored as i18n messages (see search-ignored-headings).
102			*
103			* @param string $message
104			* @return string[]
105			*/
106			public static function parseSettingsInMessage( $message ) {
107			$lines = explode( "\n", $message );
108			$lines = preg_replace( '/#.*$/', '', $lines ); // Remove comments
109			$lines = array_map( 'trim', $lines ); // Remove extra spaces
110			$lines = array_filter( $lines ); // Remove empty lines
111			return $lines;
112			}
113
114			/**
115			* Get list of heading to ignore.
116			* @return string[]
117			*/
118			private function getIgnoredHeadings() {
119			static $ignoredHeadings = null;
120			if ( $ignoredHeadings === null ) {
121			$ignoredHeadings = [];
122			$source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
123			if ( $source->isBlank() ) {
124			// Try old version too, just in case
125			$source = wfMessage( 'cirrussearch-ignored-headings' )->inContentLanguage();
126			}
127			if ( !$source->isDisabled() ) {
128			$lines = self::parseSettingsInMessage( $source->plain() );
129			$ignoredHeadings = $lines; // Now we just have headings!
130			}
131			}
132			return $ignoredHeadings;
133			}
134
135			/**
136			* Extract parts of the text - opening, main and auxiliary.
137			*/
138			private function extractWikitextParts() {
139			if ( !is_null( $this->allText ) ) {
140			return;
141			}
142			$this->parserOutput->setEditSectionTokens( false );
143			$this->parserOutput->setTOCEnabled( false );
144			$text = $this->parserOutput->getText();
145			if ( strlen( $text ) == 0 ) {
146			$this->allText = "";
147			// empty text - nothing to seek here
148			return;
149			}
150			$opening = null;
			0 ignored issues – show Unused Code introduced 2016-08-04 17:50 UTC by Report Bug Copy Issue Report `$opening` is not used, you could remove the assignment. This check looks for variable assignments that are either overwritten by other assignments or where the variable is not used subsequently. $myVar = 'Value'; $higher = false; if (rand(1, 6) > 3) { $higher = true; } else { $higher = false; } Both the `$myVar` assignment in line 1 and the `$higher` assignment in line 2 are dead. The first because `$myVar` is never used and the second because `$higher` is always overwritten for every possible time line. Loading history...
151
152			$this->openingText = $this->extractHeadingBeforeFirstHeading( $text );
153
154			// Add extra spacing around break tags so text crammed together like<br>this
155			// doesn't make one word.
156			$text = str_replace( '<br', "\n<br", $text );
157
158			$formatter = new HtmlFormatter( $text );
159
160			// Strip elements from the page that we never want in the search text.
161			$formatter->remove( $this->excludedElementSelectors );
162			$formatter->filterContent();
163
164			// Strip elements from the page that are auxiliary text. These will still be
165			// searched but matches will be ranked lower and non-auxiliary matches will be
166			// preferred in highlighting.
167			$formatter->remove( $this->auxiliaryElementSelectors );
168			$auxiliaryElements = $formatter->filterContent();
169			$this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
170			foreach ( $auxiliaryElements as $auxiliaryElement ) {
171			$this->auxText[] =
172			trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
173			}
174			}
175
176			/**
177			* Get text before first heading.
178			* @param string $text
179			* @return string\|null
180			*/
181			private function extractHeadingBeforeFirstHeading( $text ) {
182			$matches = [];
183			if ( !preg_match( '/<h[123456]>/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
184			// There isn't a first heading so we interpret this as the article
185			// being entirely without heading.
186			return null;
187			}
188			$text = substr( $text, 0, $matches[ 0 ][ 1 ] );
189			if ( !$text ) {
190			// There isn't any text before the first heading so we declare there isn't
191			// a first heading.
192			return null;
193			}
194
195			$formatter = new HtmlFormatter( $text );
196			$formatter->remove( $this->excludedElementSelectors );
197			$formatter->remove( $this->auxiliaryElementSelectors );
198			$formatter->filterContent();
199			$text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
200
201			if ( !$text ) {
202			// There isn't any text after filtering before the first heading so we declare
203			// that there isn't a first heading.
204			return null;
205			}
206
207			return $text;
208			}
209
210			/**
211			* Get opening text
212			* @return string
213			*/
214			public function getOpeningText() {
215			$this->extractWikitextParts();
216			return $this->openingText;
217			}
218
219			/**
220			* Get main text
221			* @return string
222			*/
223			public function getMainText() {
224			$this->extractWikitextParts();
225			return $this->allText;
226			}
227
228			/**
229			* Get auxiliary text
230			* @return string[]
231			*/
232			public function getAuxiliaryText() {
233			$this->extractWikitextParts();
234			return $this->auxText;
235			}
236
237			/**
238			* Get the defaultsort property
239			* @return string\|null
240			*/
241			public function getDefaultSort() {
242			return $this->parserOutput->getProperty( 'defaultsort' );
243			}
244			}
245

wikimedia / mediawiki

Branch — master (dc3656)

extractHeadingBeforeFirstHeading() B

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like