From 9ae97460a1a84b4a9cc32811f45399dcde4ce9b9 Mon Sep 17 00:00:00 2001 From: Aaron Schulz Date: Mon, 23 Jan 2023 20:26:12 -0800 Subject: [PATCH] rdbms: improve the moving average method in LoadMonitor Make the moving average ratio dynamic and generally much higher. It scales up based on how far in the past the last time the server was polled. This makes it better handle a wide range of traffic (e.g. large vs small sites or web vs CLI mode) and converge faster on recognizing a down server. Bug: T265386 Change-Id: I155b42bb93a512b17e244b58d5c4cd55fa478886 --- .../libs/rdbms/loadmonitor/LoadMonitor.php | 72 +++++++++---------- 1 file changed, 34 insertions(+), 38 deletions(-) diff --git a/includes/libs/rdbms/loadmonitor/LoadMonitor.php b/includes/libs/rdbms/loadmonitor/LoadMonitor.php index 55b145b4d96..26da96ac12e 100644 --- a/includes/libs/rdbms/loadmonitor/LoadMonitor.php +++ b/includes/libs/rdbms/loadmonitor/LoadMonitor.php @@ -50,7 +50,7 @@ class LoadMonitor implements ILoadMonitor { /** @var StatsdDataFactoryInterface */ protected $statsd; - /** @var float Moving average ratio (e.g. 0.1 for 10% weight to new weight) */ + /** @var float Maximum new gauge coefficient for moving averages */ private $movingAveRatio; /** @var int Amount of replication lag in seconds before warnings are logged */ private $lagWarnThreshold; @@ -74,9 +74,10 @@ class LoadMonitor implements ILoadMonitor { * @param ILoadBalancer $lb * @param BagOStuff $srvCache * @param WANObjectCache $wCache - * @param array $options - * - movingAveRatio: moving average constant for server weight updates based on lag - * - lagWarnThreshold: how many seconds of lag trigger warnings + * @param array $options Additional parameters include: + * - movingAveRatio: maximum new gauge coefficient for moving averages + * when the new gauge is 1 second newer than the prior one [default: .54] + * - lagWarnThreshold: how many seconds of lag trigger warnings [default: 10] */ public function __construct( ILoadBalancer $lb, BagOStuff $srvCache, WANObjectCache $wCache, array $options = [] @@ -87,7 +88,7 @@ class LoadMonitor implements ILoadMonitor { $this->logger = new NullLogger(); $this->statsd = new NullStatsdDataFactory(); - $this->movingAveRatio = $options['movingAveRatio'] ?? 0.1; + $this->movingAveRatio = (float)( $options['movingAveRatio'] ?? 0.54 ); $this->lagWarnThreshold = $options['lagWarnThreshold'] ?? LoadBalancer::MAX_LAG_DEFAULT; } @@ -208,6 +209,7 @@ class LoadMonitor implements ILoadMonitor { return $this->getPlaceholderServerStates( $serverIndexes ); } + $priorAsOf = $priorStates['timestamp'] ?? 0; $priorScales = $priorStates ? $priorStates['weightScales'] : []; $cluster = $this->lb->getClusterName(); @@ -235,9 +237,10 @@ class LoadMonitor implements ILoadMonitor { // Get new weight scale using a moving average of the naïve and prior values $lastScale = $priorScales[$i] ?? 1.0; $naiveScale = $this->getWeightScale( $i, $conn ?: null ); - $newScale = $this->getNewScaleViaMovingAve( + $newScale = $this->movingAverage( $lastScale, $naiveScale, + max( $this->getCurrentTime() - $priorAsOf, 0.0 ), $this->movingAveRatio ); // Scale from 0% to 100% of nominal weight @@ -319,40 +322,33 @@ class LoadMonitor implements ILoadMonitor { } /** - * Get the moving average weight scale given a naive and the last iteration value + * Update a moving average for a gauge, accounting for the time delay since the last gauge * - * One case of particular note is if a server totally cannot have its state queried. - * Ideally, the scale should be able to drop from 1.0 to a miniscule amount (say 0.001) - * fairly quickly. To get the time to reach 0.001, some calculations can be done: - * - * SCALE = $naiveScale * $movAveRatio + $lastScale * (1 - $movAveRatio) - * SCALE = 0 * $movAveRatio + $lastScale * (1 - $movAveRatio) - * SCALE = $lastScale * (1 - $movAveRatio) - * - * Given a starting weight scale of 1.0: - * 1.0 * (1 - $movAveRatio)^(# iterations) = 0.001 - * ceil( log<1 - $movAveRatio>(0.001) ) = (# iterations) - * t = (# iterations) * (POLL_PERIOD + SHARED_CACHE_TTL) - * t = (# iterations) * (1e3 * POLL_PERIOD_MS + SHARED_CACHE_TTL) - * - * If $movAveRatio is 0.5, then: - * t = ceil( log<0.5>(0.01) ) * 1.5 = 7 * 1.5 = 10.5 seconds [for 1% scale] - * t = ceil( log<0.5>(0.001) ) * 1.5 = 10 * 1.5 = 15 seconds [for 0.1% scale] - * - * If $movAveRatio is 0.8, then: - * t = ceil( log<0.2>(0.01) ) * 1.5 = 3 * 1.5 = 4.5 seconds [for 1% scale] - * t = ceil( log<0.2>(0.001) ) * 1.5 = 5 * 1.5 = 7.5 seconds [for 0.1% scale] - * - * Use of connection failure rate can greatly speed this process up - * - * @param float $lastScale Current moving average of scaling factors - * @param float $naiveScale New scaling factor - * @param float $movAveRatio Weight given to the new value - * @return float - * @since 1.35 + * @param float|int|null $priorValue Prior moving average of value or null + * @param float|int|null $gaugeValue Newly gauged value or null + * @param float $delay Seconds between the new gauge and the prior one + * @param float $movAveRatio New gauge weight when it is 1 second newer than the prior one + * @return float|false New moving average of value */ - protected function getNewScaleViaMovingAve( $lastScale, $naiveScale, $movAveRatio ) { - return $movAveRatio * $naiveScale + ( 1 - $movAveRatio ) * $lastScale; + public function movingAverage( + $priorValue, + $gaugeValue, + float $delay, + float $movAveRatio + ) { + if ( $gaugeValue === null ) { + return $priorValue; + } elseif ( $priorValue === null ) { + return $gaugeValue; + } + + // Apply more weight to the newer gauge the more outdated the prior gauge is. + // The rate of state updates generally depends on the amount of site traffic. + // Smaller will get less frequent updates, but the gauges still still converge + // within reasonable time bounds so that unreachable DB servers are avoided. + $delayAwareRatio = 1 - pow( 1 - $movAveRatio, $delay ); + + return max( $delayAwareRatio * $gaugeValue + ( 1 - $delayAwareRatio ) * $priorValue, 0.0 ); } /**