Make getLagFromPtHeartbeat() always use the LB cluster master entry

Before, it just used the immediate master entry, which could be another
slave. In that case, it may not even exists at all.

Bug: T119648
Change-Id: Iea970b81ad2c9855aafcccf0bb0662fc0b3a8d4d
This commit is contained in:
Aaron Schulz 2015-12-03 16:37:56 -08:00
parent 90058e5ce7
commit 9c0b55d926
5 changed files with 164 additions and 23 deletions

View file

@ -148,6 +148,9 @@ abstract class DatabaseBase implements IDatabase {
*/
private $mTrxWriteDuration = 0.0;
/** @var IDatabase|null Lazy handle to the master DB this server replicates from */
private $lazyMasterHandle;
/**
* @since 1.21
* @var resource File handle for upgrade
@ -328,6 +331,25 @@ abstract class DatabaseBase implements IDatabase {
}
}
/**
* Set a lazy-connecting DB handle to the master DB (for replication status purposes)
*
* @param IDatabase $conn
* @since 1.27
*/
public function setLazyMasterHandle( IDatabase $conn ) {
$this->lazyMasterHandle = $conn;
}
/**
* @return IDatabase|null
* @see setLazyMasterHandle()
* @since 1.27
*/
public function getLazyMasterHandle() {
return $this->lazyMasterHandle;
}
/**
* @return TransactionProfiler
*/

View file

@ -621,13 +621,20 @@ abstract class DatabaseMysqlBase extends Database {
abstract protected function mysqlPing();
function getLag() {
if ( $this->lagDetectionMethod === 'pt-heartbeat' ) {
if ( $this->getLagDetectionMethod() === 'pt-heartbeat' ) {
return $this->getLagFromPtHeartbeat();
} else {
return $this->getLagFromSlaveStatus();
}
}
/**
* @return string
*/
protected function getLagDetectionMethod() {
return $this->lagDetectionMethod;
}
/**
* @return bool|int
*/
@ -645,35 +652,82 @@ abstract class DatabaseMysqlBase extends Database {
* @return bool|float
*/
protected function getLagFromPtHeartbeat() {
$key = wfMemcKey( 'mysql', 'master-server-id', $this->getServer() );
$masterId = intval( $this->srvCache->get( $key ) );
if ( !$masterId ) {
$res = $this->query( 'SHOW SLAVE STATUS', __METHOD__ );
$row = $res ? $res->fetchObject() : false;
if ( $row && strval( $row->Master_Server_Id ) !== '' ) {
$masterId = intval( $row->Master_Server_Id );
$this->srvCache->set( $key, $masterId, 30 );
}
$masterInfo = $this->getMasterServerInfo();
if ( !$masterInfo ) {
return false; // could not get master server ID
}
if ( !$masterId ) {
return false;
}
list( $time, $nowUnix ) = $this->getHeartbeatData( $masterInfo['serverId'] );
if ( $time !== null ) {
// @time is in ISO format like "2015-09-25T16:48:10.000510"
$dateTime = new DateTime( $time, new DateTimeZone( 'UTC' ) );
$timeUnix = (int)$dateTime->format( 'U' ) + $dateTime->format( 'u' ) / 1e6;
$res = $this->query(
"SELECT TIMESTAMPDIFF(MICROSECOND,ts,UTC_TIMESTAMP(6)) AS Lag " .
"FROM heartbeat.heartbeat WHERE server_id = $masterId"
);
$row = $res ? $res->fetchObject() : false;
if ( $row ) {
return max( floatval( $row->Lag ) / 1e6, 0.0 );
return max( $nowUnix - $timeUnix, 0.0 );
}
return false;
}
protected function getMasterServerInfo() {
$cache = $this->srvCache;
$key = $cache->makeGlobalKey(
'mysql',
'master-info',
// Using one key for all cluster slaves is preferable
$this->getLBInfo( 'clusterMasterHost' ) ?: $this->getServer()
);
$that = $this;
return $cache->getWithSetCallback(
$key,
$cache::TTL_INDEFINITE,
function () use ( $that, $cache, $key ) {
// Get and leave a lock key in place for a short period
if ( !$cache->lock( $key, 0, 10 ) ) {
return false; // avoid master connection spike slams
}
$conn = $that->getLazyMasterHandle();
if ( !$conn ) {
return false; // something is misconfigured
}
// Connect to and query the master; catch errors to avoid outages
try {
$res = $conn->query( 'SELECT @@server_id AS id', __METHOD__ );
$row = $res ? $res->fetchObject() : false;
$id = $row ? (int)$row->id : 0;
} catch ( DBError $e ) {
$id = 0;
}
// Cache the ID if it was retrieved
return $id ? array( 'serverId' => $id, 'asOf' => time() ) : false;
}
);
}
/**
* @param string $masterId Server ID
* @return array (heartbeat `ts` column value or null, UNIX timestamp)
* @see https://www.percona.com/doc/percona-toolkit/2.1/pt-heartbeat.html
*/
protected function getHeartbeatData( $masterId ) {
// Get the status row for this master; use the oldest for sanity in case the master
// has entries listed under different server IDs (which should really not happen).
// Note: this would use "MAX(TIMESTAMPDIFF(MICROSECOND,ts,UTC_TIMESTAMP(6)))" but the
// percision field is not supported in MySQL <= 5.5.
$res = $this->query(
"SELECT ts FROM heartbeat.heartbeat WHERE server_id=" . intval( $masterId )
);
$row = $res ? $res->fetchObject() : false;
return array( $row ? $row->ts : null, microtime( true ) );
}
public function getApproximateLagStatus() {
if ( $this->lagDetectionMethod === 'pt-heartbeat' ) {
if ( $this->getLagDetectionMethod() === 'pt-heartbeat' ) {
// Disable caching since this is fast enough and we don't wan't
// to be *too* pessimistic by having both the cache TTL and the
// pt-heartbeat interval count as lag in getSessionLagStatus()

View file

@ -816,11 +816,14 @@ class LoadBalancer {
$server['dbname'] = $dbNameOverride;
}
// Let the handle know what the cluster master is (e.g. "db1052")
$masterName = $this->getServerName( 0 );
$server['clusterMasterHost'] = $masterName;
// Log when many connection are made on requests
if ( ++$this->connsOpened >= self::CONN_HELD_WARN_THRESHOLD ) {
$masterAddr = $this->getServerName( 0 );
wfDebugLog( 'DBPerformance', __METHOD__ . ": " .
"{$this->connsOpened}+ connections made (master=$masterAddr)\n" .
"{$this->connsOpened}+ connections made (master=$masterName)\n" .
wfBacktrace( true ) );
}
@ -834,6 +837,9 @@ class LoadBalancer {
}
$db->setLBInfo( $server );
$db->setLazyMasterHandle(
$this->getLazyConnectionRef( DB_MASTER, array(), $db->getWikiID() )
);
return $db;
}

View file

@ -256,4 +256,59 @@ class DatabaseMysqlBaseTest extends MediaWikiTestCase {
$this->assertTrue( $pos2->hasReached( $pos1 ) );
$this->assertFalse( $pos1->hasReached( $pos2 ) );
}
/**
* @dataProvider provideLagAmounts
*/
function testPtHeartbeat( $lag ) {
$db = $this->getMockBuilder( 'DatabaseMysql' )
->disableOriginalConstructor()
->setMethods( array(
'getLagDetectionMethod', 'getHeartbeatData', 'getMasterServerInfo' ) )
->getMock();
$db->expects( $this->any() )
->method( 'getLagDetectionMethod' )
->will( $this->returnValue( 'pt-heartbeat' ) );
$db->expects( $this->any() )
->method( 'getMasterServerInfo' )
->will( $this->returnValue( array( 'serverId' => 172, 'asOf' => time() ) ) );
// Fake the current time.
list( $nowSecFrac, $nowSec ) = explode( ' ', microtime() );
$now = (float)$nowSec + (float)$nowSecFrac;
// Fake the heartbeat time.
// Work arounds for weak DataTime microseconds support.
$ptTime = $now - $lag;
$ptSec = (int)$ptTime;
$ptSecFrac = ( $ptTime - $ptSec );
$ptDateTime = new DateTime( "@$ptSec" );
$ptTimeISO = $ptDateTime->format( 'Y-m-d\TH:i:s' );
$ptTimeISO .= ltrim( number_format( $ptSecFrac, 6 ), '0' );
$db->expects( $this->any() )
->method( 'getHeartbeatData' )
->with( 172 )
->will( $this->returnValue( array( $ptTimeISO, $now ) ) );
$db->setLBInfo( 'clusterMasterHost', 'db1052' );
$lagEst = $db->getLag();
$this->assertGreaterThan( $lag - .010, $lagEst, "Correct heatbeat lag" );
$this->assertLessThan( $lag + .010, $lagEst, "Correct heatbeat lag" );
}
function provideLagAmounts() {
return array(
array( 0 ),
array( 0.3 ),
array( 6.5 ),
array( 10.1 ),
array( 200.2 ),
array( 400.7 ),
array( 600.22 ),
array( 1000.77 ),
);
}
}

View file

@ -103,9 +103,13 @@ class LBFactoryTest extends MediaWikiTestCase {
$dbw = $lb->getConnection( DB_MASTER );
$this->assertTrue( $dbw->getLBInfo( 'master' ), 'master shows as master' );
$this->assertEquals(
$wgDBserver, $dbw->getLBInfo( 'clusterMasterHost' ), 'cluster master set' );
$dbr = $lb->getConnection( DB_SLAVE );
$this->assertTrue( $dbr->getLBInfo( 'slave' ), 'slave shows as slave' );
$this->assertEquals(
$wgDBserver, $dbr->getLBInfo( 'clusterMasterHost' ), 'cluster master set' );
$factory->shutdown();
$lb->closeAll();