wiki.techinc.nl/tests/phpunit/includes/search/SearchUpdateTest.php
Brion Vibber f79d1d3ffb * (bug 32712) Fix for search indexing of pages with certain unicode chars following URL
A regex in SearchUpdate was built for ancient pure ISO 8859-1 and looked for \xa0-\xff bytes -- this caused the regex to cut off partway through if there was a char containing a byte in the \x80-\x9f range.
Fixed regex to pass \x80-\xff instead.

Added a test case to SearchUpdateTest that checks for this case: the example text is run through the update squash algorithm, then through preg_replace with the /u modifier so that it is treated as UTF-8, to check whether it breaks.
2011-11-30 00:36:34 +00:00

90 lines
1.8 KiB
PHP

<?php
class MockSearch extends SearchEngine {
	/** Page id passed to the most recent update() call. */
	public static $id;

	/** Title passed to the most recent update() call. */
	public static $title;

	/** Text passed to the most recent update() call. */
	public static $text;

	/**
	 * Accepts the argument and ignores it; the parent constructor is
	 * not invoked.
	 *
	 * @param $db mixed Unused.
	 */
	public function __construct( $db ) {
		// Intentionally empty.
	}

	/**
	 * Record the arguments in the static fields so that a test can
	 * inspect them afterwards.
	 *
	 * @param $id mixed Stored in self::$id
	 * @param $title mixed Stored in self::$title
	 * @param $text mixed Stored in self::$text
	 */
	public function update( $id, $title, $text ) {
		self::$text = $text;
		self::$title = $title;
		self::$id = $id;
	}
}
/**
 * Tests for the text that SearchUpdate passes on to the search engine.
 *
 * $wgSearchType is pointed at MockSearch for the duration of each test, and
 * the tests then read back what MockSearch::update() recorded.
 *
 * @group Search
 */
class SearchUpdateTest extends MediaWikiTestCase {
	/** Value of $wgSearchType saved by setUp() and put back by tearDown(). */
	public static $searchType;

	/**
	 * Run a SearchUpdate and return what MockSearch recorded.
	 *
	 * @param $text string Text handed to SearchUpdate
	 * @param $title mixed Title handed to SearchUpdate
	 * @param $id int Page id handed to SearchUpdate
	 * @return array ( recorded title, recorded text ); both entries are null
	 *   when MockSearch::update() was not called during this update
	 */
	public function update( $text, $title = 'Test', $id = 1 ) {
		// Drop whatever an earlier call recorded. MockSearch keeps its data in
		// static fields, so without this an update that never reaches the mock
		// would silently hand back the previous call's values.
		MockSearch::$id = null;
		MockSearch::$title = null;
		MockSearch::$text = null;

		$u = new SearchUpdate( $id, $title, $text );
		$u->doUpdate();
		return array( MockSearch::$title, MockSearch::$text );
	}

	/**
	 * Run a SearchUpdate for $text and return the recorded text, trimmed.
	 *
	 * @param $text string
	 * @return string Empty string when nothing was recorded
	 */
	public function updateText( $text ) {
		list( , $resultText ) = $this->update( $text );
		// Abstract from some implementation details (surrounding whitespace).
		// The cast keeps trim() well-defined when nothing was recorded.
		$resultText = trim( (string)$resultText );
		return $resultText;
	}

	/**
	 * Save the configured search type and switch to MockSearch.
	 */
	public function setUp() {
		parent::setUp();

		global $wgSearchType;
		self::$searchType = $wgSearchType;
		$wgSearchType = 'MockSearch';
	}

	/**
	 * Put back the search type saved by setUp().
	 */
	public function tearDown() {
		global $wgSearchType;
		$wgSearchType = self::$searchType;

		parent::tearDown();
	}

	/**
	 * Checks the recorded text for several HTML-ish inputs.
	 */
	public function testUpdateText() {
		$this->assertEquals(
			'test',
			$this->updateText( '<div>TeSt</div>' ),
			'HTML stripped, text lowercased'
		);

		$this->assertEquals(
			'foo bar boz quux',
			$this->updateText( <<<EOT
<table style="color:red; font-size:100px">
<tr class="scary"><td><div>foo</div></td><tr>bar</td></tr>
<tr><td>boz</td><tr>quux</td></tr>
</table>
EOT
			), 'Stripping HTML tables' );

		$this->assertEquals(
			'a b',
			$this->updateText( 'a > b' ),
			'Handle unclosed tags'
		);

		// 10000-byte input that opens a "tag" and never closes it.
		$text = str_pad( "foo <barbarbar \n", 10000, 'x' );

		$this->assertNotEquals(
			'',
			$this->updateText( $text ),
			'Bug 18609'
		);
	}

	/**
	 * Bug 32712: a URL followed directly by a multi-byte character must not
	 * leave the recorded text as invalid UTF-8.
	 */
	public function testBug32712() {
		$text = "text „http://example.com“ text";
		$result = $this->updateText( $text );

		// The pattern itself is a no-op; the /u modifier makes PCRE validate
		// the subject as UTF-8, and preg_replace() returns null if it is not.
		$processed = preg_replace( '/Q/u', 'Q', $result );

		$this->assertNotNull(
			$processed,
			'Link surrounded by unicode quotes should not fail UTF-8 validation'
		);
		$this->assertNotSame(
			'',
			$processed,
			'Link surrounded by unicode quotes should not produce empty text'
		);
	}
}