diff --git a/includes/diff/TextSlotDiffRenderer.php b/includes/diff/TextSlotDiffRenderer.php index c254c08f189..22ca94ce912 100644 --- a/includes/diff/TextSlotDiffRenderer.php +++ b/includes/diff/TextSlotDiffRenderer.php @@ -92,14 +92,15 @@ class TextSlotDiffRenderer extends SlotDiffRenderer { * Convenience helper to use getTextDiff without an instance. * @param string $oldText * @param string $newText + * @param array $options * @return string */ - public static function diff( $oldText, $newText ) { + public static function diff( $oldText, $newText, $options = [] ) { /** @var TextSlotDiffRenderer $slotDiffRenderer */ $slotDiffRenderer = MediaWikiServices::getInstance() ->getContentHandlerFactory() ->getContentHandler( CONTENT_MODEL_TEXT ) - ->getSlotDiffRenderer( RequestContext::getMain() ); + ->getSlotDiffRenderer( RequestContext::getMain(), $options ); '@phan-var TextSlotDiffRenderer $slotDiffRenderer'; return $slotDiffRenderer->getTextDiff( $oldText, $newText ); } diff --git a/includes/languages/LanguageZh.php b/includes/languages/LanguageZh.php index 4e23ca3934b..11aa6be4448 100644 --- a/includes/languages/LanguageZh.php +++ b/includes/languages/LanguageZh.php @@ -29,13 +29,22 @@ */ class LanguageZh extends LanguageZh_hans { /** - * this should give much better diff info + * Add a formfeed character between each non-ASCII character, so that + * "word-level" diffs will effectively operate on a character level. The FF + * characters are stripped out by unsegmentForDiff(). + * + * We use FF because it is the least used character that is matched by + * PCRE's \s class. + * + * In the unlikely event that an FF character appears in the input, it will + * be displayed in the diff as a replacement character. * * @param string $text * @return string */ public function segmentForDiff( $text ) { - return preg_replace( '/[\xc0-\xff][\x80-\xbf]*/', ' $0', $text ); + $text = str_replace( "\x0c", "\u{FFFD}", $text ); + return preg_replace( '/[\xc0-\xff][\x80-\xbf]*/', "\x0c$0", $text ); } /** @@ -43,7 +52,7 @@ class LanguageZh extends LanguageZh_hans { * @return string */ public function unsegmentForDiff( $text ) { - return preg_replace( '/ ([\xc0-\xff][\x80-\xbf]*)/', '$1', $text ); + return str_replace( "\x0c", '', $text ); } /** diff --git a/tests/phpunit/includes/languages/LanguageZhTest.php b/tests/phpunit/includes/languages/LanguageZhTest.php new file mode 100644 index 00000000000..815890d82d9 --- /dev/null +++ b/tests/phpunit/includes/languages/LanguageZhTest.php @@ -0,0 +1,20 @@ +overrideConfigValue( MainConfigNames::DiffEngine, 'php' ); + $lhs = '维基'; + $rhs = '维基百科'; + $diff = TextSlotDiffRenderer::diff( $lhs, $rhs, [ 'contentLanguage' => 'zh' ] ); + // Check that only the second part is highlighted, and word segmentation markers are not present + $this->assertStringContainsString( + '
维基百科
', + $diff + ); + } +}