Only store currently-existing categories in the categories table
A "currently-existing category" is defined as a category that either contains any pages or has a description page. Thus: * Category::initialize() now schedules an update to insert a row if the title exits but the row is missing. * Category::refreshCounts() now removes the row if the title doesn't exist and the category is empty. * WikiPage::onArticleCreate() loads the Category object, to trigger bullet #1. * WikiPage::updateCategoryCounts() refreshes the counts if it results in the row showing 0 pages, to trigger bullet #2. * LinksDeletionUpdate refreshes the counts if the row shows 0 pages, to trigger bullet #2. A maintenance script is provided to update the category table for this new definition. Bug: T28411 Bug: T50824 Change-Id: I0f0adf124c181ae5d3c7c95b3b5fb275a725794c
This commit is contained in:
parent
99b84272c2
commit
47e2ef34b0
8 changed files with 287 additions and 21 deletions
|
|
@ -245,6 +245,7 @@ $wgAutoloadLocalClasses = [
|
|||
'ClassCollector' => __DIR__ . '/includes/utils/AutoloadGenerator.php',
|
||||
'CleanupAncientTables' => __DIR__ . '/maintenance/cleanupAncientTables.php',
|
||||
'CleanupBlocks' => __DIR__ . '/maintenance/cleanupBlocks.php',
|
||||
'CleanupEmptyCategories' => __DIR__ . '/maintenance/cleanupEmptyCategories.php',
|
||||
'CleanupPreferences' => __DIR__ . '/maintenance/cleanupPreferences.php',
|
||||
'CleanupRemovedModules' => __DIR__ . '/maintenance/cleanupRemovedModules.php',
|
||||
'CleanupSpam' => __DIR__ . '/maintenance/cleanupSpam.php',
|
||||
|
|
|
|||
|
|
@ -79,6 +79,11 @@ class Category {
|
|||
$this->mSubcats = 0;
|
||||
$this->mFiles = 0;
|
||||
|
||||
# If the title exists, call refreshCounts to add a row for it.
|
||||
if ( $this->mTitle->exists() ) {
|
||||
DeferredUpdates::addCallableUpdate( [ $this, 'refreshCounts' ] );
|
||||
}
|
||||
|
||||
return true;
|
||||
} else {
|
||||
return false; # Fail
|
||||
|
|
@ -331,21 +336,35 @@ class Category {
|
|||
[ 'LOCK IN SHARE MODE' ]
|
||||
);
|
||||
|
||||
$shouldExist = $result->pages > 0 || $this->getTitle()->exists();
|
||||
|
||||
if ( $this->mID ) {
|
||||
# The category row already exists, so do a plain UPDATE instead
|
||||
# of INSERT...ON DUPLICATE KEY UPDATE to avoid creating a gap
|
||||
# in the cat_id sequence. The row may or may not be "affected".
|
||||
$dbw->update(
|
||||
'category',
|
||||
[
|
||||
'cat_pages' => $result->pages,
|
||||
'cat_subcats' => $result->subcats,
|
||||
'cat_files' => $result->files
|
||||
],
|
||||
[ 'cat_title' => $this->mName ],
|
||||
__METHOD__
|
||||
);
|
||||
} else {
|
||||
if ( $shouldExist ) {
|
||||
# The category row already exists, so do a plain UPDATE instead
|
||||
# of INSERT...ON DUPLICATE KEY UPDATE to avoid creating a gap
|
||||
# in the cat_id sequence. The row may or may not be "affected".
|
||||
$dbw->update(
|
||||
'category',
|
||||
[
|
||||
'cat_pages' => $result->pages,
|
||||
'cat_subcats' => $result->subcats,
|
||||
'cat_files' => $result->files
|
||||
],
|
||||
[ 'cat_title' => $this->mName ],
|
||||
__METHOD__
|
||||
);
|
||||
} else {
|
||||
# The category is empty and has no description page, delete it
|
||||
$dbw->delete(
|
||||
'category',
|
||||
[ 'cat_title' => $this->mName ],
|
||||
__METHOD__
|
||||
);
|
||||
$this->mID = false;
|
||||
}
|
||||
} elseif ( $shouldExist ) {
|
||||
# The category row doesn't exist but should, so create it. Use
|
||||
# upsert in case of races.
|
||||
$dbw->upsert(
|
||||
'category',
|
||||
[
|
||||
|
|
@ -362,6 +381,8 @@ class Category {
|
|||
],
|
||||
__METHOD__
|
||||
);
|
||||
// @todo: Should we update $this->mID here? Or not since Category
|
||||
// objects tend to be short lived enough to not matter?
|
||||
}
|
||||
|
||||
$dbw->endAtomic( __METHOD__ );
|
||||
|
|
|
|||
|
|
@ -61,6 +61,8 @@ class LinksDeletionUpdate extends SqlDataUpdate implements EnqueueableDataUpdate
|
|||
// This handles the case when updates have to batched into several COMMITs.
|
||||
$scopedLock = LinksUpdate::acquirePageLock( $this->mDb, $id );
|
||||
|
||||
$title = $this->page->getTitle();
|
||||
|
||||
// Delete restrictions for it
|
||||
$this->mDb->delete( 'page_restrictions', [ 'pr_page' => $id ], __METHOD__ );
|
||||
|
||||
|
|
@ -80,6 +82,20 @@ class LinksDeletionUpdate extends SqlDataUpdate implements EnqueueableDataUpdate
|
|||
}
|
||||
}
|
||||
|
||||
// Refresh the category table entry if it seems to have no pages. Check
|
||||
// master for the most up-to-date cat_pages count.
|
||||
if ( $title->getNamespace() === NS_CATEGORY ) {
|
||||
$row = $this->mDb->selectRow(
|
||||
'category',
|
||||
[ 'cat_id', 'cat_title', 'cat_pages', 'cat_subcats', 'cat_files' ],
|
||||
[ 'cat_title' => $title->getDBkey(), 'cat_pages <= 0' ],
|
||||
__METHOD__
|
||||
);
|
||||
if ( $row ) {
|
||||
$cat = Category::newFromRow( $row, $title )->refreshCounts();
|
||||
}
|
||||
}
|
||||
|
||||
// If using cascading deletes, we can skip some explicit deletes
|
||||
if ( !$this->mDb->cascadingDeletes() ) {
|
||||
// Delete outgoing links
|
||||
|
|
@ -132,7 +148,6 @@ class LinksDeletionUpdate extends SqlDataUpdate implements EnqueueableDataUpdate
|
|||
|
||||
// If using cleanup triggers, we can skip some manual deletes
|
||||
if ( !$this->mDb->cleanupTriggers() ) {
|
||||
$title = $this->page->getTitle();
|
||||
// Find recentchanges entries to clean up...
|
||||
$rcIdsForTitle = $this->mDb->selectFieldValues(
|
||||
'recentchanges',
|
||||
|
|
|
|||
|
|
@ -75,6 +75,7 @@ abstract class DatabaseUpdater {
|
|||
PopulateFilearchiveSha1::class,
|
||||
PopulateBacklinkNamespace::class,
|
||||
FixDefaultJsonContentPages::class,
|
||||
CleanupEmptyCategories::class,
|
||||
];
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -3279,6 +3279,14 @@ class WikiPage implements Page, IDBAccessObject {
|
|||
$title->touchLinks();
|
||||
$title->purgeSquid();
|
||||
$title->deleteTitleProtection();
|
||||
|
||||
if ( $title->getNamespace() == NS_CATEGORY ) {
|
||||
// Load the Category object, which will schedule a job to create
|
||||
// the category table row if necessary. Checking a slave is ok
|
||||
// here, in the worst case it'll run an unnecessary recount job on
|
||||
// a category that probably doesn't have many members.
|
||||
Category::newFromTitle( $title )->getID();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -3525,6 +3533,22 @@ class WikiPage implements Page, IDBAccessObject {
|
|||
$cat = Category::newFromName( $catName );
|
||||
Hooks::run( 'CategoryAfterPageRemoved', [ $cat, $this, $id ] );
|
||||
}
|
||||
|
||||
// Refresh counts on categories that should be empty now, to
|
||||
// trigger possible deletion. Check master for the most
|
||||
// up-to-date cat_pages.
|
||||
if ( count( $deleted ) ) {
|
||||
$rows = $dbw->select(
|
||||
'category',
|
||||
[ 'cat_id', 'cat_title', 'cat_pages', 'cat_subcats', 'cat_files' ],
|
||||
[ 'cat_title' => $deleted, 'cat_pages <= 0' ],
|
||||
$method
|
||||
);
|
||||
foreach ( $rows as $row ) {
|
||||
$cat = Category::newFromRow( $row );
|
||||
$cat->refreshCounts();
|
||||
}
|
||||
}
|
||||
}
|
||||
);
|
||||
}
|
||||
|
|
|
|||
204
maintenance/cleanupEmptyCategories.php
Normal file
204
maintenance/cleanupEmptyCategories.php
Normal file
|
|
@ -0,0 +1,204 @@
|
|||
<?php
|
||||
/**
|
||||
* Clean up empty categories in the category table.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along
|
||||
* with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||||
* http://www.gnu.org/copyleft/gpl.html
|
||||
*
|
||||
* @file
|
||||
* @ingroup Maintenance
|
||||
*/
|
||||
|
||||
require_once __DIR__ . '/Maintenance.php';
|
||||
|
||||
/**
|
||||
* Maintenance script to clean up empty categories in the category table.
|
||||
*
|
||||
* @ingroup Maintenance
|
||||
* @since 1.28
|
||||
*/
|
||||
class CleanupEmptyCategories extends LoggedUpdateMaintenance {
|
||||
|
||||
public function __construct() {
|
||||
parent::__construct();
|
||||
$this->addDescription(
|
||||
<<<TEXT
|
||||
This script will clean up the category table by removing entries for empty
|
||||
categories without a description page and adding entries for empty categories
|
||||
with a description page. It will print out progress indicators every batch. The
|
||||
script is perfectly safe to run on large, live wikis, and running it multiple
|
||||
times is harmless. You may want to use the throttling options if it's causing
|
||||
too much load; they will not affect correctness.
|
||||
|
||||
If the script is stopped and later resumed, you can use the --mode and --begin
|
||||
options with the last printed progress indicator to pick up where you left off.
|
||||
|
||||
When the script has finished, it will make a note of this in the database, and
|
||||
will not run again without the --force option.
|
||||
TEXT
|
||||
);
|
||||
|
||||
$this->addOption(
|
||||
'mode',
|
||||
'"add" empty categories with description pages, "remove" empty categories '
|
||||
. 'without description pages, or "both"',
|
||||
false,
|
||||
true
|
||||
);
|
||||
$this->addOption(
|
||||
'begin',
|
||||
'Only do categories whose names are alphabetically after the provided name',
|
||||
false,
|
||||
true
|
||||
);
|
||||
$this->addOption(
|
||||
'throttle',
|
||||
'Wait this many milliseconds after each batch. Default: 0',
|
||||
false,
|
||||
true
|
||||
);
|
||||
}
|
||||
|
||||
protected function getUpdateKey() {
|
||||
return 'cleanup empty categories';
|
||||
}
|
||||
|
||||
protected function doDBUpdates() {
|
||||
$mode = $this->getOption( 'mode', 'both' );
|
||||
$begin = $this->getOption( 'begin', '' );
|
||||
$throttle = $this->getOption( 'throttle', 0 );
|
||||
|
||||
if ( !in_array( $mode, [ 'add', 'remove', 'both' ] ) ) {
|
||||
$this->output( "--mode must be 'add', 'remove', or 'both'.\n" );
|
||||
return false;
|
||||
}
|
||||
|
||||
$dbw = $this->getDB( DB_MASTER );
|
||||
|
||||
$throttle = intval( $throttle );
|
||||
|
||||
if ( $mode === 'add' || $mode === 'both' ) {
|
||||
if ( $begin !== '' ) {
|
||||
$where = [ 'page_title > ' . $dbw->addQuotes( $begin ) ];
|
||||
} else {
|
||||
$where = [];
|
||||
}
|
||||
|
||||
$this->output( "Adding empty categories with description pages...\n" );
|
||||
while ( true ) {
|
||||
# Find which category to update
|
||||
$rows = $dbw->select(
|
||||
[ 'page', 'category' ],
|
||||
'page_title',
|
||||
array_merge( $where, [
|
||||
'page_namespace' => NS_CATEGORY,
|
||||
'cat_title' => null,
|
||||
] ),
|
||||
__METHOD__,
|
||||
[
|
||||
'ORDER BY' => 'page_title',
|
||||
'LIMIT' => $this->mBatchSize,
|
||||
],
|
||||
[
|
||||
'category' => [ 'LEFT JOIN', 'page_title = cat_title' ],
|
||||
]
|
||||
);
|
||||
if ( !$rows || $rows->numRows() <= 0 ) {
|
||||
# Done, hopefully.
|
||||
break;
|
||||
}
|
||||
|
||||
foreach ( $rows as $row ) {
|
||||
$name = $row->page_title;
|
||||
$where = [ 'page_title > ' . $dbw->addQuotes( $name ) ];
|
||||
|
||||
# Use the row to update the category count
|
||||
$cat = Category::newFromName( $name );
|
||||
if ( !is_object( $cat ) ) {
|
||||
$this->output( "The category named $name is not valid?!\n" );
|
||||
} else {
|
||||
$cat->refreshCounts();
|
||||
}
|
||||
}
|
||||
$this->output( "--mode=$mode --begin=$name\n" );
|
||||
|
||||
wfWaitForSlaves();
|
||||
usleep( $throttle * 1000 );
|
||||
}
|
||||
|
||||
$begin = '';
|
||||
}
|
||||
|
||||
if ( $mode === 'remove' || $mode === 'both' ) {
|
||||
if ( $begin !== '' ) {
|
||||
$where = [ 'cat_title > ' . $dbw->addQuotes( $begin ) ];
|
||||
} else {
|
||||
$where = [];
|
||||
}
|
||||
$i = 0;
|
||||
|
||||
$this->output( "Removing empty categories without description pages...\n" );
|
||||
while ( true ) {
|
||||
# Find which category to update
|
||||
$rows = $dbw->select(
|
||||
[ 'category', 'page' ],
|
||||
'cat_title',
|
||||
array_merge( $where, [
|
||||
'page_title' => null,
|
||||
'cat_pages' => 0,
|
||||
] ),
|
||||
__METHOD__,
|
||||
[
|
||||
'ORDER BY' => 'cat_title',
|
||||
'LIMIT' => $this->mBatchSize,
|
||||
],
|
||||
[
|
||||
'page' => [ 'LEFT JOIN', [
|
||||
'page_namespace' => NS_CATEGORY, 'page_title = cat_title'
|
||||
] ],
|
||||
]
|
||||
);
|
||||
if ( !$rows || $rows->numRows() <= 0 ) {
|
||||
# Done, hopefully.
|
||||
break;
|
||||
}
|
||||
foreach ( $rows as $row ) {
|
||||
$name = $row->cat_title;
|
||||
$where = [ 'cat_title > ' . $dbw->addQuotes( $name ) ];
|
||||
|
||||
# Use the row to update the category count
|
||||
$cat = Category::newFromName( $name );
|
||||
if ( !is_object( $cat ) ) {
|
||||
$this->output( "The category named $name is not valid?!\n" );
|
||||
} else {
|
||||
$cat->refreshCounts();
|
||||
}
|
||||
}
|
||||
|
||||
$this->output( "--mode=remove --begin=$name\n" );
|
||||
|
||||
wfWaitForSlaves();
|
||||
usleep( $throttle * 1000 );
|
||||
}
|
||||
}
|
||||
|
||||
$this->output( "Category cleanup complete.\n" );
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
$maintClass = 'CleanupEmptyCategories';
|
||||
require_once RUN_MAINTENANCE_IF_MAIN;
|
||||
|
|
@ -336,9 +336,9 @@ CREATE INDEX /*i*/cl_timestamp ON /*_*/categorylinks (cl_to,cl_timestamp);
|
|||
CREATE INDEX /*i*/cl_collation_ext ON /*_*/categorylinks (cl_collation, cl_to, cl_type, cl_from);
|
||||
|
||||
--
|
||||
-- Track all existing categories. Something is a category if 1) it has an en-
|
||||
-- try somewhere in categorylinks, or 2) it once did. Categories might not
|
||||
-- have corresponding pages, so they need to be tracked separately.
|
||||
-- Track all existing categories. Something is a category if 1) it has an entry
|
||||
-- somewhere in categorylinks, or 2) it has a description page. Categories
|
||||
-- might not have corresponding pages, so they need to be tracked separately.
|
||||
--
|
||||
CREATE TABLE /*_*/category (
|
||||
-- Primary key
|
||||
|
|
|
|||
|
|
@ -623,9 +623,9 @@ CREATE INDEX /*i*/cl_timestamp ON /*_*/categorylinks (cl_to,cl_timestamp);
|
|||
CREATE INDEX /*i*/cl_collation_ext ON /*_*/categorylinks (cl_collation, cl_to, cl_type, cl_from);
|
||||
|
||||
--
|
||||
-- Track all existing categories. Something is a category if 1) it has an en-
|
||||
-- try somewhere in categorylinks, or 2) it once did. Categories might not
|
||||
-- have corresponding pages, so they need to be tracked separately.
|
||||
-- Track all existing categories. Something is a category if 1) it has an entry
|
||||
-- somewhere in categorylinks, or 2) it has a description page. Categories
|
||||
-- might not have corresponding pages, so they need to be tracked separately.
|
||||
--
|
||||
CREATE TABLE /*_*/category (
|
||||
-- Primary key
|
||||
|
|
|
|||
Loading…
Reference in a new issue