TempUser: ScrambleMapping

Add a mapping function which converts sequential integers to a sequence
in which most of the digits change each time, but without significantly
increasing the length.

Change-Id: I1011a96894fbf0b92d20a96149e68014f53e3c3c
This commit is contained in:
Tim Starling 2022-04-05 13:45:12 +10:00 committed by Aaron Schulz
parent dbfc16a68b
commit a124d003cf
5 changed files with 145 additions and 8 deletions

View file

@ -4648,10 +4648,13 @@ config-schema:
allocating IDs, at the expense of making the IDs be non-monotonic.
- serialMapping: (array) Configuration for mapping integer indexes to strings
to substitute into genPattern.
- type: (string) May be "plain-numeric" to use ASCII decimal numbers,
"localized-numeric" to use numbers localized using a specific
language, or "filtered-radix" to use numbers in an arbitrary base
between 2 and 36, with an optional list of "bad" IDs to skip over.
- type: (string) May be
- "plain-numeric" to use ASCII decimal numbers
- "localized-numeric" to use numbers localized using a specific language
- "filtered-radix" to use numbers in an arbitrary base between 2 and 36,
with an optional list of "bad" IDs to skip over.
- "scramble" to use ASCII decimal numbers that are short but
non-consecutive.
- language: (string) With "localized-numeric", the language code
- radix: (int) With "filtered-radix", the base
- badIndexes: (array) With "filtered-radix", an array with the bad unmapped

View file

@ -7079,10 +7079,13 @@ class MainConfigSchema {
* allocating IDs, at the expense of making the IDs be non-monotonic.
* - serialMapping: (array) Configuration for mapping integer indexes to strings
* to substitute into genPattern.
* - type: (string) May be "plain-numeric" to use ASCII decimal numbers,
* "localized-numeric" to use numbers localized using a specific
* language, or "filtered-radix" to use numbers in an arbitrary base
* between 2 and 36, with an optional list of "bad" IDs to skip over.
* - type: (string) May be
* - "plain-numeric" to use ASCII decimal numbers
* - "localized-numeric" to use numbers localized using a specific language
* - "filtered-radix" to use numbers in an arbitrary base between 2 and 36,
* with an optional list of "bad" IDs to skip over.
* - "scramble" to use ASCII decimal numbers that are short but
* non-consecutive.
* - language: (string) With "localized-numeric", the language code
* - radix: (int) With "filtered-radix", the base
* - badIndexes: (array) With "filtered-radix", an array with the bad unmapped

View file

@ -0,0 +1,97 @@
<?php
namespace MediaWiki\User\TempUser;
/**
 * A mapping which converts sequential input into an output sequence that looks
 * pseudo-random, while keeping the base-10 length of the output approximately
 * the same as that of the input number.
 *
 * Take a sequence generated by multiplying the previous element of the
 * sequence by a fixed number "g", then applying the modulus "p":
 *
 *   X(0) = 1
 *   X(i) = ( g X(i-1) ) mod p
 *
 * If g is a primitive root modulo p, then this sequence will cover all values
 * from 1 to p-1 before it repeats. X(i) is a modular exponential function
 * (g^i mod p) and algorithms are available to calculate it efficiently.
 *
 * Loosely speaking, we choose a sequence based on the number of digits N in the
 * input, with the period being approximately 10^N, so that the number of digits
 * in the output will be approximately the same.
 *
 * More precisely, after offsetting the subsequent sequences to avoid colliding
 * with the previous sequences, the period ends up being about 0.9 * 10^N
 *
 * The modulo p is always a prime number because that makes the maths easier.
 * We use a value for g close to p/sqrt(3) since that seems to stir the digits
 * better than the largest or smallest primitive root.
 *
 * @internal
 */
class ScrambleMapping implements SerialMapping {
	/**
	 * Appropriately sized prime moduli, each paired with the largest primitive
	 * root that does not exceed p/sqrt(3). Every entry is [ g, p ].
	 * Generated with this GP/PARI script:
	 *   s=0; \
	 *   for(q = 2, 10, \
	 *     p=precprime(10^q - s); \
	 *     s = s + p; \
	 *     forstep(i = floor(p/sqrt(3)), 1, -1, \
	 *       if(znorder(Mod(i, p)) == p-1, \
	 *         print("[ ", i, ", ", p, " ],"); \
	 *         break )))
	 */
	private const GENERATORS = [
		[ 56, 97 ],
		[ 511, 887 ],
		[ 5203, 9013 ],
		[ 51947, 90001 ],
		[ 519612, 900001 ],
		[ 5196144, 8999993 ],
		[ 51961523, 89999999 ],
		[ 519615218, 899999963 ],
		[ 5196152444, 9000000043 ],
	];

	/** @var int Subtracted from the index before mapping and added back to the result */
	private $offset;

	/** @var bool Whether the gmp extension is loaded */
	private $hasGmp;

	/** @var bool Whether the bcmath extension is loaded */
	private $hasBcm;

	/**
	 * @param array $config Mapping configuration. The optional "offset" key
	 *   (default 0) shifts both the accepted indexes and the produced values.
	 * @throws \MWException If neither the gmp nor the bcmath extension is loaded
	 */
	public function __construct( $config ) {
		$this->offset = $config['offset'] ?? 0;
		$this->hasGmp = extension_loaded( 'gmp' );
		$this->hasBcm = extension_loaded( 'bcmath' );
		if ( !$this->hasGmp && !$this->hasBcm ) {
			throw new \MWException( __CLASS__ . ' requires the bcmath or gmp extension' );
		}
	}

	/**
	 * Map a sequential index to its scrambled decimal serial ID.
	 *
	 * Zero and negative indexes are returned unchanged. Positive indexes are
	 * assigned to the first generator whose range still has room for them;
	 * each generator [ g, p ] contributes p - 1 distinct values placed directly
	 * after the values of the previous generators.
	 *
	 * @param int $index
	 * @return string
	 * @throws \MWException If the index lies below the configured offset, or
	 *   beyond the range covered by the last generator
	 */
	public function getSerialIdForIndex( int $index ): string {
		if ( $index <= 0 ) {
			return (string)$index;
		}
		$offset = $this->offset;
		// Without this guard the exponent passed to powmod() would be negative,
		// which is outside the intended domain of gmp_powm() and bcpowmod().
		if ( $index < $offset ) {
			throw new \MWException(
				__METHOD__ . ": The index $index is smaller than the configured offset $offset"
			);
		}
		// NOTE(review): with a non-zero configured offset, $index === $offset gives
		// an exponent of 0 and therefore the value $offset + 1, which is also what
		// $index === $offset + 96 produces (g^(p-1) mod p is 1). Confirm whether
		// callers can ever pass $index === $offset.
		foreach ( self::GENERATORS as [ $g, $p ] ) {
			if ( $index - $offset < $p ) {
				return (string)( $offset + $this->powmod( $g, $index - $offset, $p ) );
			}
			// Skip past the p - 1 values consumed by this generator
			$offset += $p - 1;
		}
		throw new \MWException( __METHOD__ . ": The index $index is too large" );
	}

	/**
	 * Compute ( $num ^ $exponent ) mod $modulus, preferring gmp over bcmath.
	 *
	 * @param int $num Base
	 * @param int $exponent Non-negative exponent
	 * @param int $modulus
	 * @return int
	 * @throws \MWException If neither extension is available
	 */
	private function powmod( $num, $exponent, $modulus ) {
		if ( $this->hasGmp ) {
			return \gmp_intval( \gmp_powm( $num, $exponent, $modulus ) );
		} elseif ( $this->hasBcm ) {
			return (int)\bcpowmod( (string)$num, (string)$exponent, (string)$modulus );
		} else {
			// Not reachable through the constructor, which already rejects this case
			throw new \MWException( __CLASS__ . ' requires the bcmath or gmp extension' );
		}
	}
}

View file

@ -69,6 +69,9 @@ class TempUserCreator implements TempUserConfig {
],
'filtered-radix' => [
'class' => FilteredRadixSerialMapping::class,
],
'scramble' => [
'class' => ScrambleMapping::class,
]
];

View file

@ -0,0 +1,31 @@
<?php
namespace MediaWiki\Tests\User\TempUser;
use MediaWiki\User\TempUser\ScrambleMapping;
use PHPUnit\Framework\TestCase;
/**
 * @covers \MediaWiki\User\TempUser\ScrambleMapping
 */
class ScrambleMappingTest extends TestCase {
	/**
	 * Map a run of consecutive indexes and check that no serial ID is
	 * produced more than once.
	 */
	public function testMap() {
		// Verified offline up to 1e8; a small count keeps CI fast
		$count = 200;
		$mapping = new ScrambleMapping( [] );
		// One bit per candidate output value, with room for outputs that are
		// one decimal digit longer than the inputs
		$seen = str_repeat( "\0", intdiv( $count * 10, 8 ) );
		$collisions = 0;
		foreach ( range( 0, $count - 1 ) as $index ) {
			$serial = (int)$mapping->getSerialIdForIndex( $index );
			// Locate the byte and the bit within it that represent this value
			$bytePos = $serial >> 3;
			$mask = 1 << ( $serial & 7 );
			$current = ord( $seen[$bytePos] );
			if ( $current & $mask ) {
				$collisions++;
			}
			$seen[$bytePos] = chr( $current | $mask );
		}
		$this->assertSame( 0, $collisions, 'duplicate detected' );
	}
}