2010-07-23 20:58:11 +00:00
|
|
|
<?php
|
|
|
|
|
/**
|
2012-09-03 18:10:09 +00:00
|
|
|
* Find all rows in the categorylinks table whose collation is out-of-date
|
|
|
|
|
* (cl_collation != $wgCategoryCollation) and repopulate cl_sortkey
|
2010-12-16 19:15:12 +00:00
|
|
|
* using the page title and cl_sortkey_prefix.
|
|
|
|
|
*
|
|
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
|
|
|
* (at your option) any later version.
|
|
|
|
|
*
|
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
|
*
|
|
|
|
|
* You should have received a copy of the GNU General Public License along
|
|
|
|
|
* with this program; if not, write to the Free Software Foundation, Inc.,
|
|
|
|
|
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
|
|
|
|
* http://www.gnu.org/copyleft/gpl.html
|
|
|
|
|
*
|
Reconcept cl_raw_sortkey as cl_sortkey_prefix
In response to feedback by Phillipe Verdy on bug 164. Now if a bunch of
pages have [[Category:Foo| ]], they'll sort amongst themselves according
to page name, instead of in basically random order as it is currently.
This also makes storage more elegant and intuitive: instead of giving
NULL a magic meaning when there's no custom sortkey specified, we just
store an empty string, since there's no prefix.
This means {{defaultsort:}} really now means {{defaultsortprefix:}},
which is slightly confusing, and a lot of code is now slightly
misleading or poorly named. But it should all work fine.
Also, while I was at it, I made updateCollation.php work as a transition
script, so you can apply the SQL patch and then run updateCollation.php
and things will work. However, with the new schema it's not trivial to
reverse this -- you'd have to recover the raw sort keys with some PHP.
Conversion goes at about a thousand rows a second for me, and seems to
be CPU-bound. Could probably be optimized.
I also adjusted the transition script so it will fix rows with collation
versions *greater* than the current one, as well as less. Thus if some
site wants to use their own collation, they can call it 137 or
something, and if they later want to switch back to MediaWiki stock
collation 7, it will work.
Also fixed a silly bug in updateCollation.php where it would say "1000
done" if it did nothing, and changed $res->numRows() >= self::BATCH_SIZE
to == so people don't wonder how it could be bigger (since it can't, I
hope).
2010-07-26 19:27:13 +00:00
|
|
|
* @file
|
2010-07-23 20:58:11 +00:00
|
|
|
* @ingroup Maintenance
|
|
|
|
|
* @author Aryeh Gregor (Simetrical)
|
|
|
|
|
*/
|
|
|
|
|
|
2013-05-17 00:16:59 +00:00
|
|
|
require_once __DIR__ . '/Maintenance.php';
|
2010-07-23 20:58:11 +00:00
|
|
|
|
2018-08-05 08:36:32 +00:00
|
|
|
use MediaWiki\MediaWikiServices;
|
2017-03-30 20:46:06 +00:00
|
|
|
use Wikimedia\Rdbms\IDatabase;
|
|
|
|
|
|
2012-09-03 18:10:09 +00:00
|
|
|
/**
|
|
|
|
|
* Maintenance script that will find all rows in the categorylinks table
|
|
|
|
|
* whose collation is out-of-date.
|
|
|
|
|
*
|
|
|
|
|
* @ingroup Maintenance
|
|
|
|
|
*/
|
2010-07-23 20:58:11 +00:00
|
|
|
class UpdateCollation extends Maintenance {
|
2020-05-09 23:22:50 +00:00
|
|
|
private const BATCH_SIZE = 100; // Number of rows to process in one batch
|
2010-07-23 20:58:11 +00:00
|
|
|
|
2016-02-17 09:09:32 +00:00
|
|
|
public $sizeHistogram = [];
|
2012-07-11 03:39:39 +00:00
|
|
|
|
2010-07-23 20:58:11 +00:00
|
|
|
/**
 * Register the script description and the command-line options it accepts.
 *
 * Options registered here: force (short form "f"), previous-collation and
 * target-collation (both take a value), dry-run and verbose-stats.
 */
public function __construct() {
	parent::__construct();

	$this->addDescription( <<<TEXT
This script will find all rows in the categorylinks table whose collation is
out-of-date (cl_collation is not the same as \$wgCategoryCollation) and
repopulate cl_sortkey using the page title and cl_sortkey_prefix. If all
collations are up-to-date, it will do nothing.
TEXT
	);

	$this->addOption( 'force', 'Run on all rows, even if the collation is ' .
		'supposed to be up-to-date.', false, false, 'f' );
	$this->addOption( 'previous-collation', 'Set the previous value of ' .
		'$wgCategoryCollation here to speed up this script, especially if your ' .
		'categorylinks table is large. This will only update rows with that ' .
		'collation, though, so it may miss out-of-date rows with a different, ' .
		'even older collation.', false, true );
	$this->addOption( 'target-collation', 'Set this to the new collation type to ' .
		'use instead of $wgCategoryCollation. Usually you should not use this, ' .
		'you should just update $wgCategoryCollation in LocalSettings.php.',
		false, true );
	$this->addOption( 'dry-run', 'Don\'t actually change the collations, just ' .
		'compile statistics.' );
	$this->addOption( 'verbose-stats', 'Show more statistics.' );
}
|
2010-12-04 03:20:14 +00:00
|
|
|
|
2010-07-23 20:58:11 +00:00
|
|
|
/**
 * Recompute the sort key of categorylinks rows, one batch at a time.
 *
 * Unless --force is given, only rows whose cl_collation differs from the
 * target collation (or equals --previous-collation, when given) are
 * selected. For each selected row the sort key prefix is determined, a new
 * sort key is generated with the target collation, and cl_sortkey,
 * cl_sortkey_prefix, cl_collation and cl_type are written back. With
 * --dry-run nothing is written; only rows whose sort key would change are
 * counted. With --verbose-stats a histogram of the new sort key lengths is
 * printed at the end.
 */
public function execute() {
	$dbw = $this->getDB( DB_PRIMARY );
	$dbr = $this->getDB( DB_REPLICA );
	$force = $this->getOption( 'force' );
	$dryRun = $this->getOption( 'dry-run' );
	$verboseStats = $this->getOption( 'verbose-stats' );
	if ( $this->hasOption( 'target-collation' ) ) {
		$collationName = $this->getOption( 'target-collation' );
	} else {
		$collationName = $this->getConfig()->get( 'CategoryCollation' );
	}
	$services = MediaWikiServices::getInstance();
	$collation = $services->getCollationFactory()->makeCollation( $collationName );

	// Collation sanity check: in some cases the constructor will work,
	// but this will raise an exception, breaking all category pages
	$collation->getFirstLetter( 'MediaWiki' );

	// Locally at least, (my local is a rather old version of mysql)
	// mysql seems to filesort if there is both an equality
	// (but not for an inequality) condition on cl_collation in the
	// WHERE and it is also the first item in the ORDER BY.
	if ( $this->hasOption( 'previous-collation' ) ) {
		$orderBy = 'cl_to, cl_type, cl_from';
	} else {
		$orderBy = 'cl_collation, cl_to, cl_type, cl_from';
	}
	$options = [
		'LIMIT' => self::BATCH_SIZE,
		'ORDER BY' => $orderBy,
		'STRAIGHT_JOIN' // per T58041
	];

	$collationConds = [];
	if ( !$force ) {
		if ( $this->hasOption( 'previous-collation' ) ) {
			$collationConds['cl_collation'] = $this->getOption( 'previous-collation' );
		} else {
			$collationConds = [ 0 =>
				'cl_collation != ' . $dbw->addQuotes( $collationName )
			];
		}

		$count = $dbr->estimateRowCount(
			'categorylinks',
			'*',
			$collationConds,
			__METHOD__
		);
		// Improve estimate if feasible
		if ( $count < 1000000 ) {
			$count = $dbr->selectField(
				'categorylinks',
				'COUNT(*)',
				$collationConds,
				__METHOD__
			);
		}
		if ( $count == 0 ) {
			$this->output( "Collations up-to-date.\n" );

			return;
		}
		if ( $dryRun ) {
			$this->output( "$count rows would be updated.\n" );
		} else {
			$this->output( "Fixing collation for $count rows.\n" );
		}
		$services->getDBLoadBalancerFactory()->waitForReplication();
	}

	// cl_type must be selected as a number for proper paging because
	// enums suck. The expression depends only on the database type, so
	// it is built once rather than on every batch.
	if ( $dbw->getType() === 'mysql' ) {
		$clType = 'cl_type+0 AS "cl_type_numeric"';
	} else {
		$clType = 'cl_type';
	}
	$namespaceInfo = $services->getNamespaceInfo();

	$count = 0;
	$batchConds = [];
	do {
		$this->output( "Selecting next " . self::BATCH_SIZE . " rows..." );

		$res = $dbw->select(
			[ 'categorylinks', 'page' ],
			[ 'cl_from', 'cl_to', 'cl_sortkey_prefix', 'cl_collation',
				'cl_sortkey', $clType,
				'page_namespace', 'page_title'
			],
			array_merge( $collationConds, $batchConds, [ 'cl_from = page_id' ] ),
			__METHOD__,
			$options
		);
		$this->output( " processing..." );

		if ( !$dryRun ) {
			$this->beginTransaction( $dbw, __METHOD__ );
		}
		$lastRow = null;
		foreach ( $res as $row ) {
			$lastRow = $row;
			$title = Title::newFromRow( $row );
			if ( !$row->cl_collation ) {
				// This is an old-style row, so the sortkey needs to be
				// converted.
				if ( $row->cl_sortkey == $title->getText()
					|| $row->cl_sortkey == $title->getPrefixedText()
				) {
					$prefix = '';
				} else {
					// Custom sortkey, use it as a prefix
					$prefix = $row->cl_sortkey;
				}
			} else {
				$prefix = $row->cl_sortkey_prefix;
			}
			// cl_type will be wrong for lots of pages if cl_collation is 0,
			// so let's update it while we're here.
			$type = $namespaceInfo->getCategoryLinkType( $title->getNamespace() );
			$newSortKey = $collation->getSortKey(
				$title->getCategorySortkey( $prefix ) );
			if ( $verboseStats ) {
				$this->updateSortKeySizeHistogram( $newSortKey );
			}

			if ( $dryRun ) {
				// Add 1 to the count if the sortkey was changed. (Note that this doesn't count changes in
				// other fields, if any, those usually only happen when upgrading old MediaWikis.)
				if ( $row->cl_sortkey !== $newSortKey ) {
					$count++;
				}
			} else {
				$dbw->update(
					'categorylinks',
					[
						'cl_sortkey' => $newSortKey,
						'cl_sortkey_prefix' => $prefix,
						'cl_collation' => $collationName,
						'cl_type' => $type,
						'cl_timestamp = cl_timestamp',
					],
					[ 'cl_from' => $row->cl_from, 'cl_to' => $row->cl_to ],
					__METHOD__
				);
				$count++;
			}
		}
		// Only the last row of the batch determines where the next batch
		// starts, so the paging condition is built once per batch. An
		// empty batch leaves the previous condition untouched.
		if ( $lastRow ) {
			$batchConds = [ $this->getBatchCondition( $lastRow, $dbw ) ];
		}
		if ( !$dryRun ) {
			$this->commitTransaction( $dbw, __METHOD__ );
		}

		if ( $dryRun ) {
			$this->output( "$count rows would be updated so far.\n" );
		} else {
			$this->output( "$count done.\n" );
		}
	} while ( $res->numRows() == self::BATCH_SIZE );

	if ( !$dryRun ) {
		$this->output( "$count rows processed\n" );
	}

	if ( $verboseStats ) {
		$this->output( "\n" );
		$this->showSortKeySizeHistogram();
	}
}
|
|
|
|
|
|
2013-03-12 00:26:12 +00:00
|
|
|
/**
 * Return an SQL expression selecting rows which sort above the given row.
 *
 * The ordering assumed is cl_to, cl_type, cl_from when the
 * previous-collation option is set, and cl_collation, cl_to, cl_type,
 * cl_from otherwise. The result has the form
 * "a > x OR (a = x AND b > y) OR (a = x AND b = y AND c > z) ...".
 *
 * @param stdClass $row Row to page past. On mysql it must carry a
 *   cl_type_numeric field; every other ordering field is read by name.
 * @param IDatabase $dbw
 * @return string
 */
private function getBatchCondition( $row, $dbw ) {
	if ( $this->hasOption( 'previous-collation' ) ) {
		$fields = [ 'cl_to', 'cl_type', 'cl_from' ];
	} else {
		$fields = [ 'cl_collation', 'cl_to', 'cl_type', 'cl_from' ];
	}
	$isMysql = $dbw->getType() === 'mysql';

	// One alternative per field: all earlier fields equal, this one greater.
	$alternatives = [];
	$equalities = [];
	foreach ( $fields as $field ) {
		if ( $isMysql && $field === 'cl_type' ) {
			// Range conditions with enums are weird in mysql
			// This must be a numeric literal, or it won't work.
			$encValue = intval( $row->cl_type_numeric );
		} else {
			$encValue = $dbw->addQuotes( $row->$field );
		}
		$inequality = "$field > $encValue";
		if ( $equalities ) {
			$alternatives[] = '(' . implode( ' AND ', $equalities ) . " AND $inequality)";
		} else {
			$alternatives[] = $inequality;
		}
		$equalities[] = "$field = $encValue";
	}

	return implode( ' OR ', $alternatives );
}
|
|
|
|
|
|
2019-10-11 19:07:32 +00:00
|
|
|
/**
 * Count one sort key in the size histogram, bucketed by its byte length.
 *
 * @param string $key Sort key whose strlen() selects the bucket
 */
private function updateSortKeySizeHistogram( $key ) {
	$length = strlen( $key );
	// A length seen for the first time starts from zero.
	$this->sizeHistogram[$length] = ( $this->sizeHistogram[$length] ?? 0 ) + 1;
}
|
|
|
|
|
|
2019-10-11 19:07:32 +00:00
|
|
|
/**
 * Print the sort key size histogram collected in $this->sizeHistogram.
 *
 * Outputs the raw per-length counts as a comma-separated list, followed by
 * a 20-bin bar chart whose longest bar is 60 asterisks. Prints nothing when
 * no sizes were recorded or when the largest recorded length is 0.
 */
private function showSortKeySizeHistogram() {
	if ( !$this->sizeHistogram ) {
		return;
	}
	$maxLength = max( array_keys( $this->sizeHistogram ) );
	if ( $maxLength == 0 ) {
		return;
	}
	$numBins = 20;
	$coarseHistogram = array_fill( 0, $numBins, 0 );
	$coarseBoundaries = [];
	$boundary = 0;
	// Exclusive upper bound of every bin except the last one.
	for ( $i = 0; $i < $numBins - 1; $i++ ) {
		$boundary += $maxLength / $numBins;
		$coarseBoundaries[$i] = round( $boundary );
	}
	// The last bin takes everything up to and including $maxLength.
	$coarseBoundaries[$numBins - 1] = $maxLength + 1;
	$raw = '';
	for ( $i = 0; $i <= $maxLength; $i++ ) {
		if ( $raw !== '' ) {
			$raw .= ', ';
		}
		$val = $this->sizeHistogram[$i] ?? 0;
		// Find the first bin whose upper bound lies above this length.
		for ( $coarseIndex = 0; $coarseIndex < $numBins - 1; $coarseIndex++ ) {
			if ( $coarseBoundaries[$coarseIndex] > $i ) {
				$coarseHistogram[$coarseIndex] += $val;
				break;
			}
		}
		// The loop ran to completion without a match: use the last bin.
		if ( $coarseIndex == $numBins - 1 ) {
			$coarseHistogram[$coarseIndex] += $val;
		}
		$raw .= $val;
	}

	$this->output( "Sort key size histogram\nRaw data: $raw\n\n" );

	$maxBinVal = max( $coarseHistogram );
	$scale = 60 / $maxBinVal;
	$prevBoundary = 0;
	for ( $coarseIndex = 0; $coarseIndex < $numBins; $coarseIndex++ ) {
		$val = $coarseHistogram[$coarseIndex] ?? 0;
		$boundary = $coarseBoundaries[$coarseIndex];
		$this->output( sprintf( "%-10s %-10d |%s\n",
			$prevBoundary . '-' . ( $boundary - 1 ) . ': ',
			$val,
			// $scale is a float; truncate explicitly because passing a
			// fractional float as an int argument is deprecated in PHP 8.1.
			str_repeat( '*', (int)( $scale * $val ) ) ) );
		$prevBoundary = $boundary;
	}
}
|
|
|
|
|
}
|
|
|
|
|
|
2018-01-13 00:02:09 +00:00
|
|
|
// Name the maintenance class to run, then include the bootstrap file named
// by the RUN_MAINTENANCE_IF_MAIN constant.
$maintClass = UpdateCollation::class;
require_once RUN_MAINTENANCE_IF_MAIN;
|