Merge "Manual and automatic image metadata reserialization"

This commit is contained in:
jenkins-bot 2021-06-29 13:12:35 +00:00 committed by Gerrit Code Review
commit 1ffb4ae93a
9 changed files with 333 additions and 41 deletions

View file

@ -646,6 +646,12 @@ $wgImgAuthUrlPathMap = [];
* If the media handler opts in, large metadata items will be split into a
* separate blob in the database if the item is larger than this threshold.
* Default: 1000
* - updateCompatibleMetadata
* If true, image metadata will be upgraded by reloading it from the file,
* if the handler indicates that it is out of date. Default: false.
* - reserializeMetadata
* If true, image metadata will be automatically rewritten to the database
* if its serialization format is out of date. Default: false
*
* These settings describe a foreign MediaWiki installation. They are optional, and will be ignored
* for local repositories:

View file

@ -329,7 +329,9 @@ if ( !$wgLocalFileRepo ) {
'thumbScriptUrl' => $wgThumbnailScriptPath,
'transformVia404' => !$wgGenerateThumbnailOnParse,
'deletedDir' => $wgDeletedDirectory,
'deletedHashLevels' => $wgHashedUploadDirectory ? 3 : 0
'deletedHashLevels' => $wgHashedUploadDirectory ? 3 : 0,
'updateCompatibleMetadata' => $wgUpdateCompatibleMetadata,
'reserializeMetadata' => $wgUpdateCompatibleMetadata,
];
}

View file

@ -69,6 +69,12 @@ class LocalRepo extends FileRepo {
/** @var int|null */
protected $splitMetadataThreshold = 1000;
/** @var bool */
protected $updateCompatibleMetadata = false;
/** @var bool */
protected $reserializeMetadata = false;
public function __construct( array $info = null ) {
parent::__construct( $info );
@ -89,7 +95,9 @@ class LocalRepo extends FileRepo {
[
'useJsonMetadata',
'useSplitMetadata',
'splitMetadataThreshold'
'splitMetadataThreshold',
'updateCompatibleMetadata',
'reserializeMetadata',
] as $option
) {
if ( isset( $info[$option] ) ) {
@ -667,6 +675,14 @@ class LocalRepo extends FileRepo {
return $this->splitMetadataThreshold;
}
public function isMetadataUpdateEnabled() {
return $this->updateCompatibleMetadata;
}
public function isMetadataReserializeEnabled() {
return $this->reserializeMetadata;
}
/**
* Get a BlobStore for storing and retrieving large metadata, or null if
* that can't be done.

View file

@ -422,6 +422,16 @@ class RepoGroup {
}
}
/**
* Create a local repo with the specified option overrides.
*
* @param array $info
* @return FileRepo
*/
public function newCustomLocalRepo( $info = [] ) {
return $this->newRepo( $info + $this->localInfo );
}
/**
* Create a repo class based on an info structure
* @param array $info

View file

@ -65,6 +65,18 @@ class LocalFile extends File {
private const CACHE_FIELD_MAX_LEN = 1000;
/** @var string Metadata serialization: empty string. This is a compact non-legacy format. */
private const MDS_EMPTY = 'empty';
/** @var string Metadata serialization: some other string */
private const MDS_LEGACY = 'legacy';
/** @var string Metadata serialization: PHP serialize() */
private const MDS_PHP = 'php';
/** @var string Metadata serialization: JSON */
private const MDS_JSON = 'json';
/** @var bool Does the file exist on disk? (loadFromXxx) */
protected $fileExists;
@ -89,6 +101,14 @@ class LocalFile extends File {
/** @var array Unserialized metadata */
protected $metadataArray = [];
/**
* One of the MDS_* constants, giving the format of the metadata as stored
* in the DB, or null if the data was not loaded from the DB.
*
* @var string|null
*/
protected $metadataSerializationFormat;
/** @var string[] Map of metadata item name to blob address */
protected $metadataBlobs = [];
@ -722,15 +742,15 @@ class LocalFile extends File {
/**
* Upgrade a row if it needs it
* @internal
*/
protected function maybeUpgradeRow() {
global $wgUpdateCompatibleMetadata;
public function maybeUpgradeRow() {
if ( wfReadOnly() || $this->upgrading ) {
return;
}
$upgrade = false;
$reserialize = false;
if ( $this->media_type === null || $this->mime == 'image/svg' ) {
$upgrade = true;
} else {
@ -739,19 +759,34 @@ class LocalFile extends File {
$validity = $handler->isFileMetadataValid( $this );
if ( $validity === MediaHandler::METADATA_BAD ) {
$upgrade = true;
} elseif ( $validity === MediaHandler::METADATA_COMPATIBLE ) {
$upgrade = $wgUpdateCompatibleMetadata;
} elseif ( $validity === MediaHandler::METADATA_COMPATIBLE
&& $this->repo->isMetadataUpdateEnabled()
) {
$upgrade = true;
} elseif ( $this->repo->isJsonMetadataEnabled()
&& $this->repo->isMetadataReserializeEnabled()
) {
if ( $this->repo->isSplitMetadataEnabled() && $this->isMetadataOversize() ) {
$reserialize = true;
} elseif ( $this->metadataSerializationFormat !== self::MDS_EMPTY &&
$this->metadataSerializationFormat !== self::MDS_JSON ) {
$reserialize = true;
}
}
}
}
if ( $upgrade ) {
if ( $upgrade || $reserialize ) {
$this->upgrading = true;
// Defer updates unless in auto-commit CLI mode
DeferredUpdates::addCallableUpdate( function () {
DeferredUpdates::addCallableUpdate( function () use ( $upgrade ) {
$this->upgrading = false; // avoid duplicate updates
try {
$this->upgradeRow();
if ( $upgrade ) {
$this->upgradeRow();
} else {
$this->reserializeMetadata();
}
} catch ( LocalFileLockError $e ) {
// let the other process handle it (or do it next time)
}
@ -815,6 +850,27 @@ class LocalFile extends File {
$this->upgraded = true; // avoid rework/retries
}
/**
* Write the metadata back to the database with the current serialization
* format.
*/
protected function reserializeMetadata() {
if ( wfReadOnly() ) {
return;
}
$dbw = $this->repo->getMasterDB();
$dbw->update(
'image',
[ 'img_metadata' => $this->getMetadataForDb( $dbw ) ],
[
'img_name' => $this->name,
'img_timestamp' => $dbw->timestamp( $this->timestamp ),
],
__METHOD__
);
$this->upgraded = true;
}
/**
* Set properties in this object to be equal to those given in the
* associative array $info. Only cacheable fields can be set.
@ -1188,6 +1244,26 @@ class LocalFile extends File {
return $s;
}
/**
* Determine whether the loaded metadata may be a candidate for splitting, by measuring its
* serialized size. Helper for maybeUpgradeRow().
*
* @return bool
*/
private function isMetadataOversize() {
if ( !$this->repo->isSplitMetadataEnabled() ) {
return false;
}
$threshold = $this->repo->getSplitMetadataThreshold();
$directItems = array_diff_key( $this->metadataArray, $this->metadataBlobs );
foreach ( $directItems as $value ) {
if ( strlen( $this->jsonEncode( $value ) ) > $threshold ) {
return true;
}
}
return false;
}
/**
* Unserialize a metadata blob which came from the database and store it
* in $this.
@ -1214,6 +1290,7 @@ class LocalFile extends File {
$this->unloadedMetadataBlobs = [];
$metadataString = (string)$metadataString;
if ( $metadataString === '' ) {
$this->metadataSerializationFormat = self::MDS_EMPTY;
return;
}
if ( $metadataString[0] === '{' ) {
@ -1221,7 +1298,9 @@ class LocalFile extends File {
if ( !$envelope ) {
// Legacy error encoding
$this->metadataArray = [ '_error' => $metadataString ];
$this->metadataSerializationFormat = self::MDS_LEGACY;
} else {
$this->metadataSerializationFormat = self::MDS_JSON;
if ( isset( $envelope['data'] ) ) {
$this->metadataArray = $envelope['data'];
}
@ -1235,6 +1314,9 @@ class LocalFile extends File {
if ( !is_array( $data ) ) {
// Legacy error encoding
$data = [ '_error' => $metadataString ];
$this->metadataSerializationFormat = self::MDS_LEGACY;
} else {
$this->metadataSerializationFormat = self::MDS_PHP;
}
$this->metadataArray = $data;
}

View file

@ -377,6 +377,12 @@ class OldLocalFile extends LocalFile {
);
}
protected function reserializeMetadata() {
// TODO: implement this and make it possible to hit it from refreshImageMetadata.php
// It can be hit from action=purge but that's not very useful if the
// goal is to reserialize the whole oldimage table.
}
/**
* @param int $field One of DELETED_* bitfield constants for file or
* revision rows

View file

@ -69,11 +69,6 @@ class ImageBuilder extends Maintenance {
public function __construct() {
parent::__construct();
global $wgUpdateCompatibleMetadata;
// make sure to update old, but compatible img_metadata fields.
$wgUpdateCompatibleMetadata = true;
$this->addDescription( 'Script to update image metadata records' );
$this->addOption( 'missing', 'Check for files without associated database record' );
@ -100,7 +95,11 @@ class ImageBuilder extends Maintenance {
*/
private function getRepo() {
if ( $this->repo === null ) {
$this->repo = MediaWikiServices::getInstance()->getRepoGroup()->getLocalRepo();
$this->repo = MediaWikiServices::getInstance()->getRepoGroup()
->newCustomLocalRepo( [
// make sure to update old, but compatible img_metadata fields.
'updateCompatibleMetadata' => true
] );
}
return $this->repo;

View file

@ -62,6 +62,14 @@ class RefreshImageMetadata extends Maintenance {
'broken-only',
'Only fix really broken records, leave old but still compatible records alone.'
);
$this->addOption(
'convert-to-json',
'Fix records with an out of date serialization format.'
);
$this->addOption(
'split',
'Enable splitting out large metadata items to the text table. Implies --convert-to-json.'
);
$this->addOption(
'verbose',
'Output extra information about each upgraded/non-upgraded file.',
@ -100,7 +108,8 @@ class RefreshImageMetadata extends Maintenance {
$brokenOnly = $this->hasOption( 'broken-only' );
$verbose = $this->hasOption( 'verbose' );
$start = $this->getOption( 'start', false );
$this->setupParameters( $force, $brokenOnly );
$split = $this->hasOption( 'split' );
$reserialize = $this->hasOption( 'convert-to-json' );
$upgraded = 0;
$leftAlone = 0;
@ -112,7 +121,7 @@ class RefreshImageMetadata extends Maintenance {
$this->fatalError( "Batch size is too low...", 12 );
}
$repo = MediaWikiServices::getInstance()->getRepoGroup()->getLocalRepo();
$repo = $this->newLocalRepo( $force, $brokenOnly, $reserialize, $split );
$conds = $this->getConditions( $dbw );
// For the WHERE img_name > 'foo' condition that comes after doing a batch
@ -149,6 +158,7 @@ class RefreshImageMetadata extends Maintenance {
try {
// LocalFile will upgrade immediately here if obsolete
$file = $repo->newFileFromRow( $row );
$file->maybeUpgradeRow();
if ( $file->getUpgraded() ) {
// File was upgraded.
$upgraded++;
@ -221,19 +231,34 @@ class RefreshImageMetadata extends Maintenance {
/**
* @param bool $force
* @param bool $brokenOnly
* @param bool $reserialize
* @param bool $split
*
* @return LocalRepo
*/
private function setupParameters( $force, $brokenOnly ) {
global $wgUpdateCompatibleMetadata;
if ( $brokenOnly ) {
$wgUpdateCompatibleMetadata = false;
} else {
$wgUpdateCompatibleMetadata = true;
}
private function newLocalRepo( $force, $brokenOnly, $reserialize, $split ): LocalRepo {
if ( $brokenOnly && $force ) {
$this->fatalError( 'Cannot use --broken-only and --force together. ', 2 );
}
$reserialize = $reserialize || $split;
if ( $brokenOnly && $reserialize ) {
$this->fatalError( 'Cannot use --broken-only with --convert-to-json or --split. ',
2 );
}
$overrides = [
'updateCompatibleMetadata' => !$brokenOnly,
];
if ( $reserialize ) {
$overrides['reserializeMetadata'] = true;
$overrides['useJsonMetadata'] = true;
}
if ( $split ) {
$overrides['useSplitMetadata'] = true;
}
return MediaWikiServices::getInstance()->getRepoGroup()
->newCustomLocalRepo( $overrides );
}
}

View file

@ -635,7 +635,7 @@ class LocalFileTest extends MediaWikiIntegrationTestCase {
$repo = $services->getRepoGroup()->getLocalRepo();
$file = $repo->findFile( $title );
$this->assertFileProperties( $file, $expectedProps );
$this->assertFileProperties( $expectedProps, $file );
$this->assertSame( 'truecolour', $file->getMetadataItem( 'colorType' ) );
$this->assertSame(
[ 'loopCount' => 1, 'bitDepth' => 16 ],
@ -650,7 +650,7 @@ class LocalFileTest extends MediaWikiIntegrationTestCase {
[ 'img_name' => 'Random-11m.png' ], __METHOD__ );
$file = LocalFile::newFromTitle( $title, $repo );
$this->assertFileProperties( $file, $expectedProps );
$this->assertFileProperties( $expectedProps, $file );
$this->assertSame( 'truecolour', $file->getMetadataItem( 'colorType' ) );
$this->assertSame(
[ 'loopCount' => 1, 'bitDepth' => 16 ],
@ -666,7 +666,7 @@ class LocalFileTest extends MediaWikiIntegrationTestCase {
$this->assertSame( false, $file->exists() );
}
private function assertFileProperties( $file, $expectedProps ) {
private function assertFileProperties( $expectedProps, $file ) {
// Compare metadata without ordering
if ( isset( $expectedProps['metadata'] ) ) {
$this->assertArrayEquals( $expectedProps['metadata'], $file->getMetadataArray() );
@ -803,6 +803,17 @@ class LocalFileTest extends MediaWikiIntegrationTestCase {
return ArrayUtils::cartesianProduct( $files, $configurations );
}
private function getMockPdfHandler() {
return new class extends ImageHandler {
public function doTransform( $image, $dstPath, $dstUrl, $params, $flags = 0 ) {
}
public function useSplitMetadata() {
return true;
}
};
}
/**
* Test recordUpload3() and confirm that file properties are reflected back
* after loading the new file from the DB.
@ -828,15 +839,7 @@ class LocalFileTest extends MediaWikiIntegrationTestCase {
$file = new LocalFile( $title, $repo );
if ( $props['mime'] === 'application/pdf' ) {
$mockPdfHandler = new class extends ImageHandler {
public function doTransform( $image, $dstPath, $dstUrl, $params, $flags = 0 ) {
}
public function useSplitMetadata() {
return true;
}
};
TestingAccessWrapper::newFromObject( $file )->handler = $mockPdfHandler;
TestingAccessWrapper::newFromObject( $file )->handler = $this->getMockPdfHandler();
}
$status = $file->recordUpload3(
@ -848,10 +851,10 @@ class LocalFileTest extends MediaWikiIntegrationTestCase {
);
$this->assertSame( [], $status->getErrors() );
// Check properties of the same object immediately after upload
$this->assertFileProperties( $file, $props );
$this->assertFileProperties( $props, $file );
// Check round-trip through the DB
$file = new LocalFile( $title, $repo );
$this->assertFileProperties( $file, $props );
$this->assertFileProperties( $props, $file );
}
/**
@ -891,4 +894,147 @@ class LocalFileTest extends MediaWikiIntegrationTestCase {
);
$this->assertSame( [], $status->getErrors() );
}
public function provideReserializeMetadata() {
return [
[
'',
''
],
[
'a:1:{s:4:"test";i:1;}',
'{"data":{"test":1}}'
],
[
serialize( [ 'test' => str_repeat( 'x', 100 ) ] ),
'{"data":[],"blobs":{"test":"tt:%d"}}'
]
];
}
/**
* Test reserializeMetadata() via maybeUpgradeRow()
*
* @covers LocalFile::maybeUpgradeRow
* @covers LocalFile::reserializeMetadata
* @dataProvider provideReserializeMetadata
*/
public function testReserializeMetadata( $input, $expected ) {
$dbw = wfGetDB( DB_PRIMARY );
$services = MediaWikiServices::getInstance();
$norm = $services->getActorNormalization();
$user = $this->getTestSysop()->getUserIdentity();
$actorId = $norm->acquireActorId( $user, $dbw );
$comment = $services->getCommentStore()->createComment( $dbw, 'comment' );
$dbw->insert(
'image',
[
'img_name' => 'Test.pdf',
'img_size' => 1,
'img_width' => 1,
'img_height' => 1,
'img_metadata' => $input,
'img_bits' => 0,
'img_media_type' => 'OFFICE',
'img_major_mime' => 'application',
'img_minor_mime' => 'pdf',
'img_description_id' => $comment->id,
'img_actor' => $actorId,
'img_timestamp' => $dbw->timestamp( '20201105235242' ),
'img_sha1' => 'hhhh',
]
);
$repo = new LocalRepo( [
'class' => LocalRepo::class,
'name' => 'test',
'useJsonMetadata' => true,
'useSplitMetadata' => true,
'splitMetadataThreshold' => 50,
'updateCompatibleMetadata' => true,
'reserializeMetadata' => true,
'backend' => new FSFileBackend( [
'name' => 'test-backend',
'wikiId' => WikiMap::getCurrentWikiId(),
'basePath' => '/nonexistent'
] )
] );
$title = Title::newFromText( 'File:Test.pdf' );
$file = new LocalFile( $title, $repo );
TestingAccessWrapper::newFromObject( $file )->handler = $this->getMockPdfHandler();
$file->load();
$file->maybeUpgradeRow();
$metadata = $dbw->selectField( 'image', 'img_metadata',
[ 'img_name' => 'Test.pdf' ], __METHOD__ );
$this->assertStringMatchesFormat( $expected, $metadata );
}
/**
* Test upgradeRow() via maybeUpgradeRow()
*
* @covers LocalFile::maybeUpgradeRow
* @covers LocalFile::upgradeRow
*/
public function testUpgradeRow() {
$repo = new LocalRepo( [
'class' => LocalRepo::class,
'name' => 'test',
'updateCompatibleMetadata' => true,
'useJsonMetadata' => true,
'hashLevels' => 0,
'backend' => new FSFileBackend( [
'name' => 'test-backend',
'wikiId' => WikiMap::getCurrentWikiId(),
'containerPaths' => [ 'test-public' => __DIR__ . '/../../../data/media' ]
] )
] );
$dbw = wfGetDB( DB_PRIMARY );
$services = MediaWikiServices::getInstance();
$norm = $services->getActorNormalization();
$user = $this->getTestSysop()->getUserIdentity();
$actorId = $norm->acquireActorId( $user, $dbw );
$comment = $services->getCommentStore()->createComment( $dbw, 'comment' );
$dbw->insert(
'image',
[
'img_name' => 'Png-native-test.png',
'img_size' => 1,
'img_width' => 1,
'img_height' => 1,
'img_metadata' => 'a:1:{s:8:"metadata";a:1:{s:15:"_MW_PNG_VERSION";i:0;}}',
'img_bits' => 0,
'img_media_type' => 'OFFICE',
'img_major_mime' => 'image',
'img_minor_mime' => 'png',
'img_description_id' => $comment->id,
'img_actor' => $actorId,
'img_timestamp' => $dbw->timestamp( '20201105235242' ),
'img_sha1' => 'hhhh',
]
);
$title = Title::newFromText( 'File:Png-native-test.png' );
$file = new LocalFile( $title, $repo );
$file->load();
$file->maybeUpgradeRow();
$metadata = $dbw->selectField( 'image', 'img_metadata',
[ 'img_name' => 'Png-native-test.png' ] );
// Just confirm that it looks like JSON with real metadata
$this->assertStringStartsWith( '{"data":{"frameCount":0,', $metadata );
$file = new LocalFile( $title, $repo );
$this->assertFileProperties(
[
'size' => 4665,
'width' => 420,
'height' => 300,
'sha1' => '3n69qtiaif1swp3kyfueqjtmw2u4c2b',
'bits' => 8,
'media_type' => 'BITMAP',
],
$file );
}
}