Apply exportTransform in backupTextPass.inc

This is intended to help us fix 72348: Wikidata uses exportTransform to 
convert old revision content to the new canonical JSON model. This patch
will make this work with the multi-pass dumping process employed to 
generate dumps for dumps.wikimedia.org.

Bug: 72361
Change-Id: Ie9046d1968efc40a02a0812a536f5ef7176af7d7
This commit is contained in:
daniel 2014-10-23 13:49:59 +02:00 committed by Addshore
parent 50dfcc138f
commit e596cac514
4 changed files with 128 additions and 30 deletions

View file

@ -354,6 +354,8 @@ class TextPassDumper extends BackupDumper {
$this->lastName = "";
$this->thisPage = 0;
$this->thisRev = 0;
$this->thisRevModel = null;
$this->thisRevFormat = null;
$parser = xml_parser_create( "UTF-8" );
xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false );
@ -421,8 +423,34 @@ class TextPassDumper extends BackupDumper {
return true;
}
/**
* Applies applicable export transformations to $text.
*
* @param string $text
* @param string $model
* @param string|null $format
*
* @return string
*/
private function exportTransform( $text, $model, $format = null ) {
try {
$handler = ContentHandler::getForModelID( $model );
$text = $handler->exportTransform( $text, $format );
}
catch ( MWException $ex ) {
$this->progress(
"Unable to apply export transformation for content model '$model': " .
$ex->getMessage()
);
}
return $text;
}
/**
* Tries to get the revision text for a revision id.
* Export transformations are applied if the content model can is given or can be
* determined from the database.
*
* Upon errors, retries (Up to $this->maxFailures tries each call).
* If still no good revision get could be found even after this retrying, "" is returned.
@ -431,11 +459,14 @@ class TextPassDumper extends BackupDumper {
* is thrown.
*
* @param string $id The revision id to get the text for
* @param string|bool|null $model The content model used to determine applicable export transformations.
* If $model is null, it will be determined from the database.
* @param string|null $format The content format used when applying export transformations.
*
* @return string The revision text for $id, or ""
* @throws MWException
* @return string The revision text for $id, or ""
*/
function getText( $id ) {
function getText( $id, $model = null, $format = null ) {
global $wgContentHandlerUseDB;
$prefetchNotTried = true; // Whether or not we already tried to get the text via prefetch.
@ -453,6 +484,24 @@ class TextPassDumper extends BackupDumper {
$oldConsecutiveFailedTextRetrievals = $consecutiveFailedTextRetrievals;
$consecutiveFailedTextRetrievals = 0;
if ( $model === null && $wgContentHandlerUseDB ) {
$row = $this->db->selectRow(
'revision',
array( 'rev_content_model', 'rev_content_format' ),
array( 'rev_id' => $this->thisRev ),
__METHOD__
);
if ( $row ) {
$model = $row->rev_content_model;
$format = $row->rev_content_format;
}
}
if ( $model === null || $model === '' ) {
$model = false;
}
while ( $failures < $this->maxFailures ) {
// As soon as we found a good text for the $id, we will return immediately.
@ -469,9 +518,19 @@ class TextPassDumper extends BackupDumper {
$tryIsPrefetch = true;
$text = $this->prefetch->prefetch( intval( $this->thisPage ),
intval( $this->thisRev ) );
if ( $text === null ) {
$text = false;
}
if ( is_string( $text ) && $model !== false ) {
// Apply export transformation to text coming from an old dump.
// The purpose of this transformation is to convert up from legacy
// formats, which may still be used in the older dump that is used
// for pre-fetching. Applying the transformation again should not
// interfere with content that is already in the correct form.
$text = $this->exportTransform( $text, $model, $format );
}
}
if ( $text === false ) {
@ -483,6 +542,12 @@ class TextPassDumper extends BackupDumper {
$text = $this->getTextDb( $id );
}
if ( $text !== false && $model !== false ) {
// Apply export transformation to text coming from the database.
// Prefetched text should already have transformations applied.
$text = $this->exportTransform( $text, $model, $format );
}
// No more checks for texts from DB for now.
// If we received something that is not false,
// We treat it as good text, regardless of whether it actually is or is not
@ -504,21 +569,8 @@ class TextPassDumper extends BackupDumper {
throw new MWException( "No database available" );
}
$revLength = strlen( $text );
if ( $wgContentHandlerUseDB ) {
$row = $this->db->selectRow(
'revision',
array( 'rev_len', 'rev_content_model' ),
array( 'rev_id' => $revID ),
__METHOD__
);
if ( $row ) {
// only check the length for the wikitext content handler,
// it's a wasted (and failed) check otherwise
if ( $row->rev_content_model == CONTENT_MODEL_WIKITEXT ) {
$revLength = $row->rev_len;
}
}
if ( $model !== CONTENT_MODEL_WIKITEXT ) {
$revLength = strlen( $text );
} else {
$revLength = $this->db->selectField( 'revision', 'rev_len', array( 'rev_id' => $revID ) );
}
@ -757,7 +809,14 @@ class TextPassDumper extends BackupDumper {
}
if ( $name == "text" && isset( $attribs['id'] ) ) {
$text = $this->getText( $attribs['id'] );
$id = $attribs['id'];
$model = trim( $this->thisRevModel );
$format = trim( $this->thisRevFormat );
$model = $model === '' ? null : $model;
$format = $format === '' ? null : $format;
$text = $this->getText( $id, $model, $format );
$this->openElement = array( $name, array( 'xml:space' => 'preserve' ) );
if ( strlen( $text ) > 0 ) {
$this->characterData( $parser, $text );
@ -780,6 +839,8 @@ class TextPassDumper extends BackupDumper {
$this->egress->writeRevision( null, $this->buffer );
$this->buffer = "";
$this->thisRev = "";
$this->thisRevModel = null;
$this->thisRevFormat = null;
} elseif ( $name == 'page' ) {
if ( !$this->firstPageWritten ) {
$this->firstPageWritten = trim( $this->thisPage );
@ -834,6 +895,13 @@ class TextPassDumper extends BackupDumper {
$this->thisPage .= $data;
}
}
elseif ( $this->lastName == "model" ) {
$this->thisRevModel .= $data;
}
elseif ( $this->lastName == "format" ) {
$this->thisRevFormat .= $data;
}
// have to skip the newline left over from closepagetag line of
// end of checkpoint files. nasty hack!!
if ( $this->checkpointJustWritten ) {

View file

@ -32,7 +32,8 @@ require_once __DIR__ . '/Maintenance.php';
class FetchText extends Maintenance {
public function __construct() {
parent::__construct();
$this->mDescription = "Fetch the revision text from an old_id";
$this->mDescription = "Fetch the raw revision blob from an old_id.";
$this->mDescription .= "\nNOTE: Export transformations are NOT applied. This is left to backupTextPass.php";
}
/**

View file

@ -30,13 +30,15 @@ abstract class DumpTestCase extends MediaWikiLangTestCase {
*
* @param Page $page Page to add the revision to
* @param string $text Revisions text
* @param string $summary Revisions summare
* @return array
* @param string $summary Revisions summary
* @param string $model The model ID (defaults to wikitext)
*
* @throws MWException
* @return array
*/
protected function addRevision( Page $page, $text, $summary ) {
protected function addRevision( Page $page, $text, $summary, $model = CONTENT_MODEL_WIKITEXT ) {
$status = $page->doEditContent(
ContentHandler::makeContent( $text, $page->getTitle() ),
ContentHandler::makeContent( $text, $page->getTitle(), $model ),
$summary
);

View file

@ -27,6 +27,10 @@ class TextPassDumperTest extends DumpTestCase {
$this->tablesUsed[] = 'revision';
$this->tablesUsed[] = 'text';
$this->mergeMwGlobalArrayValue( 'wgContentHandlers', array(
"BackupTextPassTestModel" => "BackupTextPassTestModelHandler"
) );
$ns = $this->getDefaultWikitextNS();
try {
@ -61,7 +65,8 @@ class TextPassDumperTest extends DumpTestCase {
$this->pageId3 = $page->getId();
$page->doDeleteArticle( "Testing ;)" );
// Page from non-default namespace
// Page from non-default namespace and model.
// ExportTransform applies.
if ( $ns === NS_TALK ) {
// @todo work around this.
@ -73,7 +78,8 @@ class TextPassDumperTest extends DumpTestCase {
$page = WikiPage::factory( $title );
list( $this->revId4_1, $this->textId4_1 ) = $this->addRevision( $page,
"Talk about BackupDumperTestP1 Text1",
"Talk BackupDumperTestP1 Summary1" );
"Talk BackupDumperTestP1 Summary1",
"BackupTextPassTestModel" );
$this->pageId4 = $page->getId();
} catch ( Exception $e ) {
// We'd love to pass $e directly. However, ... see
@ -141,7 +147,10 @@ class TextPassDumperTest extends DumpTestCase {
$this->assertPageStart( $this->pageId4, NS_TALK, "Talk:BackupDumperTestP1" );
$this->assertRevision( $this->revId4_1, "Talk BackupDumperTestP1 Summary1",
$this->textId4_1, false, "nktofwzd0tl192k3zfepmlzxoax1lpe",
"Talk about BackupDumperTestP1 Text1" );
"TALK ABOUT BACKUPDUMPERTESTP1 TEXT1",
false,
"BackupTextPassTestModel",
"text/plain" );
$this->assertPageEnd();
$this->assertDumpEnd();
@ -209,7 +218,10 @@ class TextPassDumperTest extends DumpTestCase {
$this->assertPageStart( $this->pageId4, NS_TALK, "Talk:BackupDumperTestP1" );
$this->assertRevision( $this->revId4_1, "Talk BackupDumperTestP1 Summary1",
$this->textId4_1, false, "nktofwzd0tl192k3zfepmlzxoax1lpe",
"Talk about BackupDumperTestP1 Text1" );
"TALK ABOUT BACKUPDUMPERTESTP1 TEXT1",
false,
"BackupTextPassTestModel",
"text/plain" );
$this->assertPageEnd();
$this->assertDumpEnd();
@ -362,7 +374,10 @@ class TextPassDumperTest extends DumpTestCase {
$this->assertRevision( $this->revId4_1 + $i * self::$numOfRevs,
"Talk BackupDumperTestP1 Summary1",
$this->textId4_1, false, "nktofwzd0tl192k3zfepmlzxoax1lpe",
"Talk about BackupDumperTestP1 Text1" );
"TALK ABOUT BACKUPDUMPERTESTP1 TEXT1",
false,
"BackupTextPassTestModel",
"text/plain" );
$this->assertPageEnd();
$lookingForPage = 1;
@ -566,8 +581,8 @@ class TextPassDumperTest extends DumpTestCase {
<ip>127.0.0.1</ip>
</contributor>
<comment>Talk BackupDumperTestP1 Summary1</comment>
<model>wikitext</model>
<format>text/x-wiki</format>
<model>BackupTextPassTestModel</model>
<format>text/plain</format>
<text id="' . $this->textId4_1 . '" bytes="35" />
<sha1>nktofwzd0tl192k3zfepmlzxoax1lpe</sha1>
</revision>
@ -582,3 +597,15 @@ class TextPassDumperTest extends DumpTestCase {
return $fname;
}
}
class BackupTextPassTestModelHandler extends TextContentHandler {
public function __construct() {
parent::__construct( 'BackupTextPassTestModel' );
}
public function exportTransform( $text, $format = null ) {
return strtoupper( $text );
}
}