rdbms: Stop going read-only if all replicas are lagged

This basically can't happen as due to semi-sync, master goes read-only
if half of replicas can't keep up.

So the result is that when heartbeat shows high values when DBAs are
moving replicas around (incorrectly), MediaWiki over-reacts and makes
the whole wiki read-only.

Bug: T314975
Change-Id: I53b261dfed0199000f1443d8e04a710a3b49f2c0
This commit is contained in:
Amir Sarabadani 2023-01-17 07:04:06 +01:00 committed by Krinkle
parent 3ee932ef5f
commit f59e4ad95e

View file

@ -525,7 +525,7 @@ class LoadBalancer implements ILoadBalancerForOwner {
$this->getLoadMonitor()->scaleLoads( $loads );
// Pick a server, accounting for weight, load, lag, and session consistency
[ $i, $laggedReplicaMode ] = $this->pickReaderIndex( $loads );
$i = $this->pickReaderIndex( $loads );
if ( $i === false ) {
// Connection attempts failed
return false;
@ -537,6 +537,8 @@ class LoadBalancer implements ILoadBalancerForOwner {
if ( !$this->awaitSessionPrimaryPos( $i ) ) {
// Data will be outdated compared to what was expected
$laggedReplicaMode = true;
} else {
$laggedReplicaMode = false;
}
// Keep using this server for DB_REPLICA handles for this group
@ -583,53 +585,41 @@ class LoadBalancer implements ILoadBalancerForOwner {
* This will leave the server connection open within the pool for reuse
*
* @param array $loads List of server weights
* @return array (reader index, lagged replica mode) or (false, false) on failure
* @return int|false reader index or false
*/
private function pickReaderIndex( array $loads ) {
if ( $loads === [] ) {
throw new InvalidArgumentException( "Server configuration array is empty" );
}
/** @var int|false $i Index of selected server */
$i = false;
$laggedReplicaMode = false;
// Quickly look through the available servers for a server that meets criteria...
$currentLoads = $loads;
$i = false;
while ( count( $currentLoads ) ) {
if ( $laggedReplicaMode ) {
$i = ArrayUtils::pickRandom( $currentLoads );
if ( $this->waitForPos && $this->waitForPos->asOfTime() ) {
$this->logger->debug( __METHOD__ . ": session has replication position" );
// "chronologyCallback" sets "waitForPos" for session consistency.
// This triggers doWait() after connect, so it's especially good to
// avoid lagged servers so as to avoid excessive delay in that method.
$ago = microtime( true ) - $this->waitForPos->asOfTime();
// Aim for <= 1 second of waiting (being too picky can backfire)
$i = $this->getRandomNonLagged( $currentLoads, $ago + 1 );
} else {
$i = false;
if ( $this->waitForPos && $this->waitForPos->asOfTime() ) {
$this->logger->debug( __METHOD__ . ": session has replication position" );
// "chronologyCallback" sets "waitForPos" for session consistency.
// This triggers doWait() after connect, so it's especially good to
// avoid lagged servers so as to avoid excessive delay in that method.
$ago = microtime( true ) - $this->waitForPos->asOfTime();
// Aim for <= 1 second of waiting (being too picky can backfire)
$i = $this->getRandomNonLagged( $currentLoads, $ago + 1 );
}
if ( $i === false ) {
// Any server with less lag than it's 'max lag' param is preferable
$i = $this->getRandomNonLagged( $currentLoads );
}
if ( $i === false && count( $currentLoads ) ) {
// All replica DBs lagged. Switch to read-only mode
$this->logger->error( __METHOD__ . ": excessive replication lag" );
$i = ArrayUtils::pickRandom( $currentLoads );
$laggedReplicaMode = true;
}
// Any server with less lag than it's 'max lag' param is preferable
$i = $this->getRandomNonLagged( $currentLoads );
}
if ( $i === false && count( $currentLoads ) ) {
// All replica DBs lagged, just pick anything.
$i = ArrayUtils::pickRandom( $currentLoads );
}
if ( $i === false ) {
// pickRandom() returned false.
// This is permanent and means the configuration or the load monitor
// This is permanent and means the configuration or LoadMonitor
// wants us to return false.
$this->logger->debug( __METHOD__ . ": no suitable server found" );
return [ false, false ];
return false;
}
$serverName = $this->getServerName( $i );
@ -642,7 +632,6 @@ class LoadBalancer implements ILoadBalancerForOwner {
if ( !$conn ) {
$this->logger->warning( __METHOD__ . ": failed connecting to $serverName" );
unset( $currentLoads[$i] ); // avoid this server next iteration
$i = false;
continue;
}
@ -655,7 +644,7 @@ class LoadBalancer implements ILoadBalancerForOwner {
$this->logger->error( __METHOD__ . ": all servers down" );
}
return [ $i, $laggedReplicaMode ];
return $i;
}
public function waitFor( $pos ) {