Fix Chinese diff segmentation
segmentForDiff() is supposed to allow character-level diffing of Chinese text, by adding spaces and then removing them after the diff is complete. But when I tested it for I2d0a6996b02d37a3, unsegmentForDiff() failed to remove the spaces, since there was an <ins> tag between the space and the Chinese character. So instead, use formfeed characters to separate the Chinese characters, and strip them unconditionally instead of relying on them being next to Chinese characters. Add test. Change-Id: I230d8261bbda34ad313785a1f7c31d4db7bf989b
This commit is contained in:
parent
36b92d45db
commit
5d26080d62
3 changed files with 35 additions and 5 deletions
|
|
@ -92,14 +92,15 @@ class TextSlotDiffRenderer extends SlotDiffRenderer {
|
|||
* Convenience helper to use getTextDiff without an instance.
|
||||
* @param string $oldText
|
||||
* @param string $newText
|
||||
* @param array $options
|
||||
* @return string
|
||||
*/
|
||||
public static function diff( $oldText, $newText ) {
|
||||
public static function diff( $oldText, $newText, $options = [] ) {
|
||||
/** @var TextSlotDiffRenderer $slotDiffRenderer */
|
||||
$slotDiffRenderer = MediaWikiServices::getInstance()
|
||||
->getContentHandlerFactory()
|
||||
->getContentHandler( CONTENT_MODEL_TEXT )
|
||||
->getSlotDiffRenderer( RequestContext::getMain() );
|
||||
->getSlotDiffRenderer( RequestContext::getMain(), $options );
|
||||
'@phan-var TextSlotDiffRenderer $slotDiffRenderer';
|
||||
return $slotDiffRenderer->getTextDiff( $oldText, $newText );
|
||||
}
|
||||
|
|
|
|||
|
|
@ -29,13 +29,22 @@
|
|||
*/
|
||||
class LanguageZh extends LanguageZh_hans {
|
||||
/**
|
||||
* this should give much better diff info
|
||||
* Add a formfeed character before each non-ASCII character, so that
|
||||
* "word-level" diffs will effectively operate on a character level. The FF
|
||||
* characters are stripped out by unsegmentForDiff().
|
||||
*
|
||||
* We use FF because it is the least used character that is matched by
|
||||
* PCRE's \s class.
|
||||
*
|
||||
* In the unlikely event that an FF character appears in the input, it will
|
||||
* be displayed in the diff as a replacement character.
|
||||
*
|
||||
* @param string $text
|
||||
* @return string
|
||||
*/
|
||||
public function segmentForDiff( $text ) {
|
||||
return preg_replace( '/[\xc0-\xff][\x80-\xbf]*/', ' $0', $text );
|
||||
$text = str_replace( "\x0c", "\u{FFFD}", $text );
|
||||
return preg_replace( '/[\xc0-\xff][\x80-\xbf]*/', "\x0c$0", $text );
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -43,7 +52,7 @@ class LanguageZh extends LanguageZh_hans {
|
|||
* @return string
|
||||
*/
|
||||
public function unsegmentForDiff( $text ) {
|
||||
return preg_replace( '/ ([\xc0-\xff][\x80-\xbf]*)/', '$1', $text );
|
||||
return str_replace( "\x0c", '', $text );
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
20
tests/phpunit/includes/languages/LanguageZhTest.php
Normal file
20
tests/phpunit/includes/languages/LanguageZhTest.php
Normal file
|
|
@ -0,0 +1,20 @@
|
|||
<?php
|
||||
|
||||
use MediaWiki\MainConfigNames;
|
||||
|
||||
/**
|
||||
* @covers LanguageZh
|
||||
*/
|
||||
class LanguageZhTest extends LanguageClassesTestCase {
|
||||
public function testSegmentForDiff() {
|
||||
$this->overrideConfigValue( MainConfigNames::DiffEngine, 'php' );
|
||||
$lhs = '维基';
|
||||
$rhs = '维基百科';
|
||||
$diff = TextSlotDiffRenderer::diff( $lhs, $rhs, [ 'contentLanguage' => 'zh' ] );
|
||||
// Check that only the second part is highlighted, and word segmentation markers are not present
|
||||
$this->assertStringContainsString(
|
||||
'<div>维基<ins class="diffchange diffchange-inline">百科</ins></div>',
|
||||
$diff
|
||||
);
|
||||
}
|
||||
}
|
||||
Loading…
Reference in a new issue