Apply exportTransform in backupTextPass.inc
This is intended to help us fix 72348: Wikidata uses exportTransform to convert old revision content to the new canonical JSON model. This patch will make this work with the multi-pass dumping process employed to generate dumps for dumps.wikimedia.org. Bug: 72361 Change-Id: Ie9046d1968efc40a02a0812a536f5ef7176af7d7
This commit is contained in:
parent
50dfcc138f
commit
e596cac514
4 changed files with 128 additions and 30 deletions
|
|
@ -354,6 +354,8 @@ class TextPassDumper extends BackupDumper {
|
|||
$this->lastName = "";
|
||||
$this->thisPage = 0;
|
||||
$this->thisRev = 0;
|
||||
$this->thisRevModel = null;
|
||||
$this->thisRevFormat = null;
|
||||
|
||||
$parser = xml_parser_create( "UTF-8" );
|
||||
xml_parser_set_option( $parser, XML_OPTION_CASE_FOLDING, false );
|
||||
|
|
@ -421,8 +423,34 @@ class TextPassDumper extends BackupDumper {
|
|||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Applies applicable export transformations to $text.
|
||||
*
|
||||
* @param string $text
|
||||
* @param string $model
|
||||
* @param string|null $format
|
||||
*
|
||||
* @return string
|
||||
*/
|
||||
private function exportTransform( $text, $model, $format = null ) {
	// Look up the handler registered for $model and let it rewrite the
	// serialized text. If the lookup or the transformation raises an
	// MWException, the failure is reported via progress() and the text is
	// handed back exactly as it was received.
	try {
		return ContentHandler::getForModelID( $model )->exportTransform( $text, $format );
	} catch ( MWException $ex ) {
		$reason = $ex->getMessage();
		$this->progress(
			"Unable to apply export transformation for content model '$model': " . $reason
		);

		return $text;
	}
}
|
||||
|
||||
/**
|
||||
* Tries to get the revision text for a revision id.
|
||||
* Export transformations are applied if the content model is given or can be
|
||||
* determined from the database.
|
||||
*
|
||||
* Upon errors, retries (Up to $this->maxFailures tries each call).
|
||||
* If no good revision text could be found even after retrying, "" is returned.
|
||||
|
|
@ -431,11 +459,14 @@ class TextPassDumper extends BackupDumper {
|
|||
* is thrown.
|
||||
*
|
||||
* @param string $id The revision id to get the text for
|
||||
* @param string|bool|null $model The content model used to determine applicable export transformations.
|
||||
* If $model is null, it will be determined from the database.
|
||||
* @param string|null $format The content format used when applying export transformations.
|
||||
*
|
||||
* @return string The revision text for $id, or ""
|
||||
* @throws MWException
|
||||
* @return string The revision text for $id, or ""
|
||||
*/
|
||||
function getText( $id ) {
|
||||
function getText( $id, $model = null, $format = null ) {
|
||||
global $wgContentHandlerUseDB;
|
||||
|
||||
$prefetchNotTried = true; // Whether or not we already tried to get the text via prefetch.
|
||||
|
|
@ -453,6 +484,24 @@ class TextPassDumper extends BackupDumper {
|
|||
$oldConsecutiveFailedTextRetrievals = $consecutiveFailedTextRetrievals;
|
||||
$consecutiveFailedTextRetrievals = 0;
|
||||
|
||||
if ( $model === null && $wgContentHandlerUseDB ) {
|
||||
$row = $this->db->selectRow(
|
||||
'revision',
|
||||
array( 'rev_content_model', 'rev_content_format' ),
|
||||
array( 'rev_id' => $this->thisRev ),
|
||||
__METHOD__
|
||||
);
|
||||
|
||||
if ( $row ) {
|
||||
$model = $row->rev_content_model;
|
||||
$format = $row->rev_content_format;
|
||||
}
|
||||
}
|
||||
|
||||
if ( $model === null || $model === '' ) {
|
||||
$model = false;
|
||||
}
|
||||
|
||||
while ( $failures < $this->maxFailures ) {
|
||||
|
||||
// As soon as we found a good text for the $id, we will return immediately.
|
||||
|
|
@ -469,9 +518,19 @@ class TextPassDumper extends BackupDumper {
|
|||
$tryIsPrefetch = true;
|
||||
$text = $this->prefetch->prefetch( intval( $this->thisPage ),
|
||||
intval( $this->thisRev ) );
|
||||
|
||||
if ( $text === null ) {
|
||||
$text = false;
|
||||
}
|
||||
|
||||
if ( is_string( $text ) && $model !== false ) {
|
||||
// Apply export transformation to text coming from an old dump.
|
||||
// The purpose of this transformation is to convert up from legacy
|
||||
// formats, which may still be used in the older dump that is used
|
||||
// for pre-fetching. Applying the transformation again should not
|
||||
// interfere with content that is already in the correct form.
|
||||
$text = $this->exportTransform( $text, $model, $format );
|
||||
}
|
||||
}
|
||||
|
||||
if ( $text === false ) {
|
||||
|
|
@ -483,6 +542,12 @@ class TextPassDumper extends BackupDumper {
|
|||
$text = $this->getTextDb( $id );
|
||||
}
|
||||
|
||||
if ( $text !== false && $model !== false ) {
|
||||
// Apply export transformation to text coming from the database.
|
||||
// Prefetched text should already have transformations applied.
|
||||
$text = $this->exportTransform( $text, $model, $format );
|
||||
}
|
||||
|
||||
// No more checks for texts from DB for now.
|
||||
// If we received something that is not false,
|
||||
// We treat it as good text, regardless of whether it actually is or is not
|
||||
|
|
@ -504,21 +569,8 @@ class TextPassDumper extends BackupDumper {
|
|||
throw new MWException( "No database available" );
|
||||
}
|
||||
|
||||
$revLength = strlen( $text );
|
||||
if ( $wgContentHandlerUseDB ) {
|
||||
$row = $this->db->selectRow(
|
||||
'revision',
|
||||
array( 'rev_len', 'rev_content_model' ),
|
||||
array( 'rev_id' => $revID ),
|
||||
__METHOD__
|
||||
);
|
||||
if ( $row ) {
|
||||
// only check the length for the wikitext content handler,
|
||||
// it's a wasted (and failed) check otherwise
|
||||
if ( $row->rev_content_model == CONTENT_MODEL_WIKITEXT ) {
|
||||
$revLength = $row->rev_len;
|
||||
}
|
||||
}
|
||||
if ( $model !== CONTENT_MODEL_WIKITEXT ) {
|
||||
$revLength = strlen( $text );
|
||||
} else {
|
||||
$revLength = $this->db->selectField( 'revision', 'rev_len', array( 'rev_id' => $revID ) );
|
||||
}
|
||||
|
|
@ -757,7 +809,14 @@ class TextPassDumper extends BackupDumper {
|
|||
}
|
||||
|
||||
if ( $name == "text" && isset( $attribs['id'] ) ) {
|
||||
$text = $this->getText( $attribs['id'] );
|
||||
$id = $attribs['id'];
|
||||
$model = trim( $this->thisRevModel );
|
||||
$format = trim( $this->thisRevFormat );
|
||||
|
||||
$model = $model === '' ? null : $model;
|
||||
$format = $format === '' ? null : $format;
|
||||
|
||||
$text = $this->getText( $id, $model, $format );
|
||||
$this->openElement = array( $name, array( 'xml:space' => 'preserve' ) );
|
||||
if ( strlen( $text ) > 0 ) {
|
||||
$this->characterData( $parser, $text );
|
||||
|
|
@ -780,6 +839,8 @@ class TextPassDumper extends BackupDumper {
|
|||
$this->egress->writeRevision( null, $this->buffer );
|
||||
$this->buffer = "";
|
||||
$this->thisRev = "";
|
||||
$this->thisRevModel = null;
|
||||
$this->thisRevFormat = null;
|
||||
} elseif ( $name == 'page' ) {
|
||||
if ( !$this->firstPageWritten ) {
|
||||
$this->firstPageWritten = trim( $this->thisPage );
|
||||
|
|
@ -834,6 +895,13 @@ class TextPassDumper extends BackupDumper {
|
|||
$this->thisPage .= $data;
|
||||
}
|
||||
}
|
||||
elseif ( $this->lastName == "model" ) {
|
||||
$this->thisRevModel .= $data;
|
||||
}
|
||||
elseif ( $this->lastName == "format" ) {
|
||||
$this->thisRevFormat .= $data;
|
||||
}
|
||||
|
||||
// have to skip the newline left over from closepagetag line of
|
||||
// end of checkpoint files. nasty hack!!
|
||||
if ( $this->checkpointJustWritten ) {
|
||||
|
|
|
|||
|
|
@ -32,7 +32,8 @@ require_once __DIR__ . '/Maintenance.php';
|
|||
class FetchText extends Maintenance {
|
||||
public function __construct() {
|
||||
parent::__construct();
|
||||
$this->mDescription = "Fetch the revision text from an old_id";
|
||||
$this->mDescription = "Fetch the raw revision blob from an old_id.";
|
||||
$this->mDescription .= "\nNOTE: Export transformations are NOT applied. This is left to backupTextPass.php";
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -30,13 +30,15 @@ abstract class DumpTestCase extends MediaWikiLangTestCase {
|
|||
*
|
||||
* @param Page $page Page to add the revision to
|
||||
* @param string $text Revisions text
|
||||
* @param string $summary Revisions summare
|
||||
* @return array
|
||||
* @param string $summary Revisions summary
|
||||
* @param string $model The model ID (defaults to wikitext)
|
||||
*
|
||||
* @throws MWException
|
||||
* @return array
|
||||
*/
|
||||
protected function addRevision( Page $page, $text, $summary ) {
|
||||
protected function addRevision( Page $page, $text, $summary, $model = CONTENT_MODEL_WIKITEXT ) {
|
||||
$status = $page->doEditContent(
|
||||
ContentHandler::makeContent( $text, $page->getTitle() ),
|
||||
ContentHandler::makeContent( $text, $page->getTitle(), $model ),
|
||||
$summary
|
||||
);
|
||||
|
||||
|
|
|
|||
|
|
@ -27,6 +27,10 @@ class TextPassDumperTest extends DumpTestCase {
|
|||
$this->tablesUsed[] = 'revision';
|
||||
$this->tablesUsed[] = 'text';
|
||||
|
||||
$this->mergeMwGlobalArrayValue( 'wgContentHandlers', array(
|
||||
"BackupTextPassTestModel" => "BackupTextPassTestModelHandler"
|
||||
) );
|
||||
|
||||
$ns = $this->getDefaultWikitextNS();
|
||||
|
||||
try {
|
||||
|
|
@ -61,7 +65,8 @@ class TextPassDumperTest extends DumpTestCase {
|
|||
$this->pageId3 = $page->getId();
|
||||
$page->doDeleteArticle( "Testing ;)" );
|
||||
|
||||
// Page from non-default namespace
|
||||
// Page from non-default namespace and model.
|
||||
// ExportTransform applies.
|
||||
|
||||
if ( $ns === NS_TALK ) {
|
||||
// @todo work around this.
|
||||
|
|
@ -73,7 +78,8 @@ class TextPassDumperTest extends DumpTestCase {
|
|||
$page = WikiPage::factory( $title );
|
||||
list( $this->revId4_1, $this->textId4_1 ) = $this->addRevision( $page,
|
||||
"Talk about BackupDumperTestP1 Text1",
|
||||
"Talk BackupDumperTestP1 Summary1" );
|
||||
"Talk BackupDumperTestP1 Summary1",
|
||||
"BackupTextPassTestModel" );
|
||||
$this->pageId4 = $page->getId();
|
||||
} catch ( Exception $e ) {
|
||||
// We'd love to pass $e directly. However, ... see
|
||||
|
|
@ -141,7 +147,10 @@ class TextPassDumperTest extends DumpTestCase {
|
|||
$this->assertPageStart( $this->pageId4, NS_TALK, "Talk:BackupDumperTestP1" );
|
||||
$this->assertRevision( $this->revId4_1, "Talk BackupDumperTestP1 Summary1",
|
||||
$this->textId4_1, false, "nktofwzd0tl192k3zfepmlzxoax1lpe",
|
||||
"Talk about BackupDumperTestP1 Text1" );
|
||||
"TALK ABOUT BACKUPDUMPERTESTP1 TEXT1",
|
||||
false,
|
||||
"BackupTextPassTestModel",
|
||||
"text/plain" );
|
||||
$this->assertPageEnd();
|
||||
|
||||
$this->assertDumpEnd();
|
||||
|
|
@ -209,7 +218,10 @@ class TextPassDumperTest extends DumpTestCase {
|
|||
$this->assertPageStart( $this->pageId4, NS_TALK, "Talk:BackupDumperTestP1" );
|
||||
$this->assertRevision( $this->revId4_1, "Talk BackupDumperTestP1 Summary1",
|
||||
$this->textId4_1, false, "nktofwzd0tl192k3zfepmlzxoax1lpe",
|
||||
"Talk about BackupDumperTestP1 Text1" );
|
||||
"TALK ABOUT BACKUPDUMPERTESTP1 TEXT1",
|
||||
false,
|
||||
"BackupTextPassTestModel",
|
||||
"text/plain" );
|
||||
$this->assertPageEnd();
|
||||
|
||||
$this->assertDumpEnd();
|
||||
|
|
@ -362,7 +374,10 @@ class TextPassDumperTest extends DumpTestCase {
|
|||
$this->assertRevision( $this->revId4_1 + $i * self::$numOfRevs,
|
||||
"Talk BackupDumperTestP1 Summary1",
|
||||
$this->textId4_1, false, "nktofwzd0tl192k3zfepmlzxoax1lpe",
|
||||
"Talk about BackupDumperTestP1 Text1" );
|
||||
"TALK ABOUT BACKUPDUMPERTESTP1 TEXT1",
|
||||
false,
|
||||
"BackupTextPassTestModel",
|
||||
"text/plain" );
|
||||
$this->assertPageEnd();
|
||||
|
||||
$lookingForPage = 1;
|
||||
|
|
@ -566,8 +581,8 @@ class TextPassDumperTest extends DumpTestCase {
|
|||
<ip>127.0.0.1</ip>
|
||||
</contributor>
|
||||
<comment>Talk BackupDumperTestP1 Summary1</comment>
|
||||
<model>wikitext</model>
|
||||
<format>text/x-wiki</format>
|
||||
<model>BackupTextPassTestModel</model>
|
||||
<format>text/plain</format>
|
||||
<text id="' . $this->textId4_1 . '" bytes="35" />
|
||||
<sha1>nktofwzd0tl192k3zfepmlzxoax1lpe</sha1>
|
||||
</revision>
|
||||
|
|
@ -582,3 +597,15 @@ class TextPassDumperTest extends DumpTestCase {
|
|||
return $fname;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Content handler used only by the text-pass dump tests.
 *
 * It registers itself under the model ID 'BackupTextPassTestModel' and its
 * export transformation upper-cases the text, so a test can tell from the
 * dump output whether the transformation was applied.
 */
class BackupTextPassTestModelHandler extends TextContentHandler {

	/**
	 * Passes the test model ID on to the parent handler constructor.
	 */
	public function __construct() {
		parent::__construct( 'BackupTextPassTestModel' );
	}

	/**
	 * Returns $text converted to upper case.
	 *
	 * @param string $text Serialized content blob
	 * @param string|null $format Accepted for signature compatibility; not used here
	 *
	 * @return string
	 */
	public function exportTransform( $text, $format = null ) {
		$upperCased = strtoupper( $text );

		return $upperCased;
	}
}
|
||||
Loading…
Reference in a new issue