Fix Chinese diff segmentation

segmentForDiff() is supposed to allow character-level diffing of Chinese
text, by adding spaces and then removing them after the diff is
complete. But when I tested it for I2d0a6996b02d37a3, unsegmentForDiff()
failed to remove the spaces, since there was an <ins> tag between the space
and the Chinese character.

So instead, use formfeed characters to separate the Chinese characters,
and strip them unconditionally instead of relying on them being next to
Chinese characters.

Add test.

Change-Id: I230d8261bbda34ad313785a1f7c31d4db7bf989b
This commit is contained in:
Tim Starling 2023-07-06 10:50:04 +10:00
parent 36b92d45db
commit 5d26080d62
3 changed files with 35 additions and 5 deletions

View file

@@ -92,14 +92,15 @@ class TextSlotDiffRenderer extends SlotDiffRenderer {
* Convenience helper to use getTextDiff without an instance.
* @param string $oldText
* @param string $newText
+* @param array $options
* @return string
*/
-public static function diff( $oldText, $newText ) {
+public static function diff( $oldText, $newText, $options = [] ) {
/** @var TextSlotDiffRenderer $slotDiffRenderer */
$slotDiffRenderer = MediaWikiServices::getInstance()
->getContentHandlerFactory()
->getContentHandler( CONTENT_MODEL_TEXT )
-->getSlotDiffRenderer( RequestContext::getMain() );
+->getSlotDiffRenderer( RequestContext::getMain(), $options );
'@phan-var TextSlotDiffRenderer $slotDiffRenderer';
return $slotDiffRenderer->getTextDiff( $oldText, $newText );
}

View file

@@ -29,13 +29,22 @@
*/
class LanguageZh extends LanguageZh_hans {
/**
-* this should give much better diff info
+* Add a formfeed character between each non-ASCII character, so that
+* "word-level" diffs will effectively operate on a character level. The FF
+* characters are stripped out by unsegmentForDiff().
+*
+* We use FF because it is the least used character that is matched by
+* PCRE's \s class.
+*
+* In the unlikely event that an FF character appears in the input, it will
+* be displayed in the diff as a replacement character.
*
* @param string $text
* @return string
*/
public function segmentForDiff( $text ) {
-return preg_replace( '/[\xc0-\xff][\x80-\xbf]*/', ' $0', $text );
+$text = str_replace( "\x0c", "\u{FFFD}", $text );
+return preg_replace( '/[\xc0-\xff][\x80-\xbf]*/', "\x0c$0", $text );
}
/**
@@ -43,7 +52,7 @@ class LanguageZh extends LanguageZh_hans {
* @return string
*/
public function unsegmentForDiff( $text ) {
-return preg_replace( '/ ([\xc0-\xff][\x80-\xbf]*)/', '$1', $text );
+return str_replace( "\x0c", '', $text );
}
/**

View file

@@ -0,0 +1,20 @@
<?php
use MediaWiki\MainConfigNames;
/**
 * Tests for the Chinese language class.
 *
 * @covers LanguageZh
 */
class LanguageZhTest extends LanguageClassesTestCase {
	/**
	 * Diffs a two-character Chinese string against the same string with two
	 * more characters appended, using 'zh' as the content language, and
	 * expects only the appended characters to be wrapped in the inline
	 * <ins> highlight.
	 *
	 * Because the expected fragment is matched as one contiguous substring,
	 * any segmentation marker left between those characters would make the
	 * assertion fail.
	 */
	public function testSegmentForDiff() {
		// Select the 'php' diff engine through configuration for this test.
		$this->overrideConfigValue( MainConfigNames::DiffEngine, 'php' );

		$expectedFragment = '<div>维基<ins class="diffchange diffchange-inline">百科</ins></div>';

		$renderedDiff = TextSlotDiffRenderer::diff(
			'维基',
			'维基百科',
			[ 'contentLanguage' => 'zh' ]
		);

		$this->assertStringContainsString( $expectedFragment, $renderedDiff );
	}
}