2008-10-17 09:11:43 +00:00
|
|
|
<?php
|
2010-12-16 19:15:12 +00:00
|
|
|
/**
|
2012-09-20 20:49:55 +00:00
|
|
|
* Test revision text compression and decompression.
|
|
|
|
|
*
|
2010-12-16 19:15:12 +00:00
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
|
|
|
* (at your option) any later version.
|
|
|
|
|
*
|
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
|
*
|
|
|
|
|
* You should have received a copy of the GNU General Public License along
|
|
|
|
|
* with this program; if not, write to the Free Software Foundation, Inc.,
|
|
|
|
|
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
|
|
|
|
* http://www.gnu.org/copyleft/gpl.html
|
|
|
|
|
*
|
|
|
|
|
* @file
|
2012-09-20 20:49:55 +00:00
|
|
|
* @ingroup Maintenance ExternalStorage
|
2010-12-16 19:15:12 +00:00
|
|
|
*/
|
2008-10-17 09:11:43 +00:00
|
|
|
|
2016-02-17 09:09:32 +00:00
|
|
|
// Command-line options that take a value. Declared before commandLine.inc is
// loaded; presumably that include reads this list while parsing arguments
// into $options and $args -- confirm against commandLine.inc.
$optionsWithArgs = [ 'start', 'limit', 'type' ];
require __DIR__ . '/../commandLine.inc';

// The page title is the only mandatory argument.
if ( !isset( $args[0] ) ) {
	echo "Usage: php testCompression.php [--type=<type>] [--start=<start-date>] " .
		"[--limit=<num-revs>] <page-title>\n";
	exit( 1 );
}

// English language object, used only to format dates and sizes in the output.
$lang = Language::factory( 'en' );

// Title::newFromText() yields null for a title it cannot parse; bail out here
// rather than failing later when the title's namespace and DB key are read.
$title = Title::newFromText( $args[0] );
if ( !$title ) {
	echo "Invalid page title: {$args[0]}\n";
	exit( 1 );
}

// --start: only revisions newer than this moment are tested.
if ( isset( $options['start'] ) ) {
	// strtotime() returns false when it cannot understand the date string;
	// reject that explicitly instead of passing false on as a timestamp.
	$startUnix = strtotime( $options['start'] );
	if ( $startUnix === false ) {
		echo "Invalid start date: {$options['start']}\n";
		exit( 1 );
	}
	$start = wfTimestamp( TS_MW, $startUnix );
	echo "Starting from " . $lang->timeanddate( $start ) . "\n";
} else {
	// No start given: begin at the Unix epoch (14-digit MediaWiki format).
	$start = '19700101000000';
}

// --limit: with an explicit limit exactly that many rows are requested and all
// of them are added; without one, up to 1000 rows are requested and adding
// stops as soon as the blob reports it is no longer "happy".
if ( isset( $options['limit'] ) ) {
	$limit = $options['limit'];
	$untilHappy = false;
} else {
	$limit = 1000;
	$untilHappy = true;
}

// --type: name of the blob class to exercise. It is instantiated by name
// later on, so make sure it resolves to a real class before going further.
$type = $options['type'] ?? ConcatenatedGzipHistoryBlob::class;
if ( !class_exists( $type ) ) {
	echo "Unknown blob type: $type\n";
	exit( 1 );
}
|
2008-10-17 09:11:43 +00:00
|
|
|
|
2016-09-05 19:55:19 +00:00
|
|
|
// This is a plain procedural script, so there is no $this to call getDB() on
// (doing so is a fatal error). Obtain the replica connection through the
// global helper instead.
$dbr = wfGetDB( DB_REPLICA );

// Tables, fields and join conditions needed to build Revision objects,
// including the page and text data.
$revQuery = Revision::getQueryInfo( [ 'page', 'text' ] );

// Fetch at most $limit revisions of the requested page whose timestamp is
// later than $start.
$res = $dbr->select(
	$revQuery['tables'],
	$revQuery['fields'],
	[
		'page_namespace' => $title->getNamespace(),
		'page_title' => $title->getDBkey(),
		'rev_timestamp > ' . $dbr->addQuotes( $dbr->timestamp( $start ) ),
	],
	__FILE__,
	[ 'LIMIT' => $limit ],
	$revQuery['joins']
);
|
|
|
|
|
|
|
|
|
|
// Start with an empty blob of the requested class; each revision's serialized
// data is appended to it below.
$blob = new $type;
$keys = [];
$hashes = [];
$uncompressedSize = 0;

// Time everything from the first added revision through serialization.
$startedAt = microtime( true );
foreach ( $res as $row ) {
	$rev = new Revision( $row );
	$content = $rev->getSerializedData();
	$revId = $row->rev_id;

	// Remember the MD5 of the original data and the key the blob handed back,
	// both indexed by revision ID, so the round trip can be verified later.
	$hashes[$revId] = md5( $content );
	$keys[$revId] = $blob->addItem( $content );
	$uncompressedSize += strlen( $content );

	// With an explicit --limit every fetched row is added unconditionally.
	if ( !$untilHappy ) {
		continue;
	}
	// Otherwise stop as soon as the blob says it is no longer happy.
	if ( !$blob->isHappy() ) {
		break;
	}
}

$serialized = serialize( $blob );
$t = microtime( true ) - $startedAt;

# Debugging aid: print_r( $blob->mDiffMap );

// Report: blob class, number of revisions stored, ratio of raw to serialized
// size, human-readable raw size, and serialized size in bytes.
$compressedBytes = strlen( $serialized );
printf(
	"%s\nCompression ratio for %d revisions: %5.2f, %s -> %d\n",
	$type,
	count( $hashes ),
	$uncompressedSize / $compressedBytes,
	$lang->formatSize( $uncompressedSize ),
	$compressedBytes
);
printf( "Compression time: %5.2f ms\n", $t * 1000 );
|
|
|
|
|
|
|
|
|
|
// Time the reverse direction: unserialize the blob and pull every stored item
// back out, checking each one against the MD5 recorded when it was added.
$t = -microtime( true );
$blob = unserialize( $serialized );
foreach ( $keys as $id => $key ) {
	$text = $blob->getItem( $key );
	// Compare digests strictly. With a loose != PHP treats two numeric-looking
	// strings (e.g. hex digests of the form "0e" followed only by digits) as
	// numbers, so two different hashes could compare equal and a corrupted
	// item would go unreported.
	if ( md5( $text ) !== $hashes[$id] ) {
		echo "Content hash mismatch for rev_id $id\n";
		# Debugging aid: var_dump( $text );
	}
}
$t += microtime( true );
printf( "Decompression time: %5.2f ms\n", $t * 1000 );
|