2022-05-26 12:57:29 +00:00
|
|
|
<?php
|
|
|
|
|
/**
|
|
|
|
|
* Copyright (C) 2011-2020 Wikimedia Foundation and others.
|
|
|
|
|
*
|
|
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
|
* it under the terms of the GNU General Public License as published by
|
|
|
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
|
|
|
* (at your option) any later version.
|
|
|
|
|
*
|
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
|
*
|
|
|
|
|
* You should have received a copy of the GNU General Public License along
|
|
|
|
|
* with this program; if not, write to the Free Software Foundation, Inc.,
|
|
|
|
|
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
namespace MediaWiki\Rest\Handler;
|
|
|
|
|
|
|
|
|
|
use Composer\Semver\Semver;
|
|
|
|
|
use ExtensionRegistry;
|
2022-06-14 18:26:34 +00:00
|
|
|
use InvalidArgumentException;
|
Use Bcp47Code when interfacing with Parsoid
It is very easy for developers and maintainers to mix up "internal
MediaWiki language codes" and "BCP-47 language codes"; the latter are
standards-compliant and used in web protocols like HTTP, HTML, and
SVG; but much of WMF production is very dependent on historical codes
used by MediaWiki which in some cases predate the IANA standardized
name for the language in question.
Phan and other static checking tools aren't much help distinguishing
BCP-47 from internal codes when both are represented with the PHP
string type, so the wikimedia/bcp-47-code package introduced a very
lightweight wrapper type in order to uniquely identify BCP-47 codes.
Language implements Bcp47Code, and LanguageFactory::getLanguage() is
an easy way to convert (or downcast) between Bcp47Code and Language
objects.
This patch updates the Parsoid integration code and the associated
REST handlers to use Bcp47Code in APIs so that the standalone Parsoid
library does not need to know anything about MediaWiki-internal codes.
The principle has been, first, to try to convert a string to a
Bcp47Code as soon as possible and as close to the original input as
possible, so it is easy to see *why* a given string is a BCP-47 code
(usually, because it is coming from HTTP/HTML/etc) and we're not stuck
deep inside some method trying to figure out where a string we're
given is coming from and therefore what sort of string code it might
be. Second, we've added explicit compatibility code to accept
MediaWiki internal codes and convert them to Bcp47Code for backward
compatibility with existing clients, using the @internal
LanguageCode::normalizeNonstandardCodeAndWarn() method. The intention
is to gradually remove these backward compatibility thunks and replace
them with HTTP 400 errors or wfDeprecated messages in order to
identify and repair callers who are incorrectly using
non-standard-compliant language codes in web standards
(HTTP/HTML/SVG/etc).
Finally, maintaining a code as a Bcp47Code and not immediately
converting to Language helps us delay or even avoid full loading of a
Language object in some cases, which is another reason to occasionally
push Bcp47Code (instead of Language) down the call stack.
Bug: T327379
Depends-On: I830867d58f8962d6a57be16ce3735e8384f9ac1c
Change-Id: I982e0df706a633b05dcc02b5220b737c19adc401
2022-11-04 17:29:23 +00:00
|
|
|
use LanguageCode;
|
2022-05-26 12:57:29 +00:00
|
|
|
use Liuggio\StatsdClient\Factory\StatsdDataFactoryInterface;
|
|
|
|
|
use LogicException;
|
2022-12-05 18:39:37 +00:00
|
|
|
use MediaWiki\Linker\LinkTarget;
|
2022-05-26 12:57:29 +00:00
|
|
|
use MediaWiki\Logger\LoggerFactory;
|
2022-08-17 19:27:48 +00:00
|
|
|
use MediaWiki\MainConfigNames;
|
2022-05-26 12:57:29 +00:00
|
|
|
use MediaWiki\MediaWikiServices;
|
2022-12-05 18:39:37 +00:00
|
|
|
use MediaWiki\Page\ExistingPageRecord;
|
2022-08-26 20:07:48 +00:00
|
|
|
use MediaWiki\Page\PageIdentity;
|
2022-12-04 21:35:20 +00:00
|
|
|
use MediaWiki\Page\ProperPageIdentity;
|
2022-05-26 12:57:29 +00:00
|
|
|
use MediaWiki\Rest\Handler;
|
2023-01-13 21:30:21 +00:00
|
|
|
use MediaWiki\Rest\Handler\Helper\HtmlInputTransformHelper;
|
|
|
|
|
use MediaWiki\Rest\Handler\Helper\HtmlOutputRendererHelper;
|
|
|
|
|
use MediaWiki\Rest\Handler\Helper\ParsoidFormatHelper;
|
2022-05-26 12:57:29 +00:00
|
|
|
use MediaWiki\Rest\HttpException;
|
2022-06-14 18:26:34 +00:00
|
|
|
use MediaWiki\Rest\LocalizedHttpException;
|
2022-05-26 12:57:29 +00:00
|
|
|
use MediaWiki\Rest\Response;
|
|
|
|
|
use MediaWiki\Rest\ResponseException;
|
2023-06-02 20:29:53 +00:00
|
|
|
use MediaWiki\Revision\BadRevisionException;
|
2022-04-21 10:54:28 +00:00
|
|
|
use MediaWiki\Revision\MutableRevisionRecord;
|
2022-05-26 12:57:29 +00:00
|
|
|
use MediaWiki\Revision\RevisionAccessException;
|
2022-04-21 10:54:28 +00:00
|
|
|
use MediaWiki\Revision\SlotRecord;
|
2023-06-02 20:29:53 +00:00
|
|
|
use MediaWiki\Revision\SuppressedDataException;
|
2023-03-01 20:33:26 +00:00
|
|
|
use MediaWiki\Title\Title;
|
2023-02-23 20:44:38 +00:00
|
|
|
use MediaWiki\WikiMap\WikiMap;
|
2022-06-14 18:26:34 +00:00
|
|
|
use MobileContext;
|
2022-05-26 12:57:29 +00:00
|
|
|
use RequestContext;
|
Use Bcp47Code when interfacing with Parsoid
It is very easy for developers and maintainers to mix up "internal
MediaWiki language codes" and "BCP-47 language codes"; the latter are
standards-compliant and used in web protocols like HTTP, HTML, and
SVG; but much of WMF production is very dependent on historical codes
used by MediaWiki which in some cases predate the IANA standardized
name for the language in question.
Phan and other static checking tools aren't much help distinguishing
BCP-47 from internal codes when both are represented with the PHP
string type, so the wikimedia/bcp-47-code package introduced a very
lightweight wrapper type in order to uniquely identify BCP-47 codes.
Language implements Bcp47Code, and LanguageFactory::getLanguage() is
an easy way to convert (or downcast) between Bcp47Code and Language
objects.
This patch updates the Parsoid integration code and the associated
REST handlers to use Bcp47Code in APIs so that the standalone Parsoid
library does not need to know anything about MediaWiki-internal codes.
The principle has been, first, to try to convert a string to a
Bcp47Code as soon as possible and as close to the original input as
possible, so it is easy to see *why* a given string is a BCP-47 code
(usually, because it is coming from HTTP/HTML/etc) and we're not stuck
deep inside some method trying to figure out where a string we're
given is coming from and therefore what sort of string code it might
be. Second, we've added explicit compatibility code to accept
MediaWiki internal codes and convert them to Bcp47Code for backward
compatibility with existing clients, using the @internal
LanguageCode::normalizeNonstandardCodeAndWarn() method. The intention
is to gradually remove these backward compatibility thunks and replace
them with HTTP 400 errors or wfDeprecated messages in order to
identify and repair callers who are incorrectly using
non-standard-compliant language codes in web standards
(HTTP/HTML/SVG/etc).
Finally, maintaining a code as a Bcp47Code and not immediately
converting to Language helps us delay or even avoid full loading of a
Language object in some cases, which is another reason to occasionally
push Bcp47Code (instead of Language) down the call stack.
Bug: T327379
Depends-On: I830867d58f8962d6a57be16ce3735e8384f9ac1c
Change-Id: I982e0df706a633b05dcc02b5220b737c19adc401
2022-11-04 17:29:23 +00:00
|
|
|
use Wikimedia\Bcp47Code\Bcp47Code;
|
2022-05-26 12:57:29 +00:00
|
|
|
use Wikimedia\Http\HttpAcceptParser;
|
2022-06-14 18:26:34 +00:00
|
|
|
use Wikimedia\Message\DataMessageValue;
|
2022-05-26 12:57:29 +00:00
|
|
|
use Wikimedia\Parsoid\Config\DataAccess;
|
|
|
|
|
use Wikimedia\Parsoid\Config\PageConfig;
|
|
|
|
|
use Wikimedia\Parsoid\Config\PageConfigFactory;
|
|
|
|
|
use Wikimedia\Parsoid\Config\SiteConfig;
|
|
|
|
|
use Wikimedia\Parsoid\Core\ClientError;
|
|
|
|
|
use Wikimedia\Parsoid\Core\PageBundle;
|
|
|
|
|
use Wikimedia\Parsoid\Core\ResourceLimitExceededException;
|
2022-06-30 14:21:30 +00:00
|
|
|
use Wikimedia\Parsoid\DOM\Document;
|
2022-05-26 12:57:29 +00:00
|
|
|
use Wikimedia\Parsoid\Parsoid;
|
|
|
|
|
use Wikimedia\Parsoid\Utils\ContentUtils;
|
|
|
|
|
use Wikimedia\Parsoid\Utils\DOMCompat;
|
|
|
|
|
use Wikimedia\Parsoid\Utils\DOMUtils;
|
|
|
|
|
use Wikimedia\Parsoid\Utils\Timing;
|
2022-04-21 10:54:28 +00:00
|
|
|
use WikitextContent;
|
2022-05-26 12:57:29 +00:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Base class for Parsoid handlers.
|
|
|
|
|
*/
|
|
|
|
|
abstract class ParsoidHandler extends Handler {
|
2022-12-05 18:39:37 +00:00
|
|
|
|
2022-05-26 12:57:29 +00:00
|
|
|
// TODO logging, timeouts(?), CORS
|
|
|
|
|
// TODO content negotiation (routes.js routes.acceptable)
|
|
|
|
|
// TODO handle MaxConcurrentCallsError (pool counter?)
|
|
|
|
|
|
|
|
|
|
/** @var array Parsoid-specific settings array from $config */
|
|
|
|
|
private $parsoidSettings;
|
|
|
|
|
|
|
|
|
|
/** @var SiteConfig */
|
|
|
|
|
protected $siteConfig;
|
|
|
|
|
|
|
|
|
|
/** @var PageConfigFactory */
|
|
|
|
|
protected $pageConfigFactory;
|
|
|
|
|
|
|
|
|
|
/** @var DataAccess */
|
|
|
|
|
protected $dataAccess;
|
|
|
|
|
|
|
|
|
|
/** @var ExtensionRegistry */
|
|
|
|
|
protected $extensionRegistry;
|
|
|
|
|
|
|
|
|
|
/** @var ?StatsdDataFactoryInterface A statistics aggregator */
|
|
|
|
|
protected $metrics;
|
|
|
|
|
|
|
|
|
|
/** @var array */
|
|
|
|
|
private $requestAttributes;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @return static
|
|
|
|
|
*/
|
|
|
|
|
public static function factory(): ParsoidHandler {
|
|
|
|
|
$services = MediaWikiServices::getInstance();
|
|
|
|
|
// @phan-suppress-next-line PhanTypeInstantiateAbstractStatic
|
|
|
|
|
return new static(
|
2022-08-17 19:27:48 +00:00
|
|
|
$services->getMainConfig()->get( MainConfigNames::ParsoidSettings ),
|
2022-05-26 12:57:29 +00:00
|
|
|
$services->getParsoidSiteConfig(),
|
|
|
|
|
$services->getParsoidPageConfigFactory(),
|
|
|
|
|
$services->getParsoidDataAccess()
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @param array $parsoidSettings
|
|
|
|
|
* @param SiteConfig $siteConfig
|
|
|
|
|
* @param PageConfigFactory $pageConfigFactory
|
|
|
|
|
* @param DataAccess $dataAccess
|
|
|
|
|
*/
|
|
|
|
|
public function __construct(
|
|
|
|
|
array $parsoidSettings,
|
|
|
|
|
SiteConfig $siteConfig,
|
|
|
|
|
PageConfigFactory $pageConfigFactory,
|
|
|
|
|
DataAccess $dataAccess
|
|
|
|
|
) {
|
|
|
|
|
$this->parsoidSettings = $parsoidSettings;
|
|
|
|
|
$this->siteConfig = $siteConfig;
|
|
|
|
|
$this->pageConfigFactory = $pageConfigFactory;
|
|
|
|
|
$this->dataAccess = $dataAccess;
|
|
|
|
|
$this->extensionRegistry = ExtensionRegistry::getInstance();
|
|
|
|
|
$this->metrics = $siteConfig->metrics();
|
|
|
|
|
}
|
|
|
|
|
|
2022-06-14 18:26:34 +00:00
|
|
|
/**
|
|
|
|
|
* Verify that the {domain} path parameter matches the actual domain.
|
|
|
|
|
* @todo Remove this when we no longer need to support the {domain}
|
|
|
|
|
* parameter with backwards compatibility with the parsoid
|
|
|
|
|
* extension.
|
|
|
|
|
* @param string $domain Domain name parameter to validate
|
|
|
|
|
*/
|
|
|
|
|
protected function assertDomainIsCorrect( $domain ): void {
|
|
|
|
|
// We are cutting some corners here (IDN, non-ASCII casing)
|
|
|
|
|
// since domain name support is provisional.
|
|
|
|
|
// TODO use a proper validator instead
|
2022-08-17 19:27:48 +00:00
|
|
|
$server = \RequestContext::getMain()->getConfig()->get( MainConfigNames::Server );
|
2022-06-14 18:26:34 +00:00
|
|
|
$expectedDomain = wfParseUrl( $server )['host'] ?? null;
|
|
|
|
|
if ( !$expectedDomain ) {
|
|
|
|
|
throw new LogicException( 'Cannot parse $wgServer' );
|
|
|
|
|
}
|
|
|
|
|
if ( strcasecmp( $expectedDomain, $domain ) === 0 ) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// TODO: This should really go away! It's only acceptable because
|
|
|
|
|
// this entire method is going to be removed once we no longer
|
|
|
|
|
// need the parsoid extension endpoints with the {domain} parameter.
|
|
|
|
|
if ( $this->extensionRegistry->isLoaded( 'MobileFrontend' ) ) {
|
|
|
|
|
// @phan-suppress-next-line PhanUndeclaredClassMethod
|
|
|
|
|
$mobileServer = MobileContext::singleton()->getMobileUrl( $server );
|
|
|
|
|
$expectedMobileDomain = wfParseUrl( $mobileServer )['host'] ?? null;
|
|
|
|
|
if ( $expectedMobileDomain && strcasecmp( $expectedMobileDomain, $domain ) === 0 ) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$msg = new DataMessageValue(
|
|
|
|
|
'mwparsoid-invalid-domain',
|
|
|
|
|
[],
|
|
|
|
|
'invalid-domain',
|
|
|
|
|
[ 'expected' => $expectedDomain, 'actual' => $domain, ]
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
throw new LocalizedHttpException( $msg, 400, [
|
|
|
|
|
'error' => 'parameter-validation-failed',
|
|
|
|
|
'name' => 'domain',
|
|
|
|
|
'value' => $domain,
|
|
|
|
|
'failureCode' => $msg->getCode(),
|
|
|
|
|
'failureData' => $msg->getData(),
|
|
|
|
|
] );
|
|
|
|
|
}
|
|
|
|
|
|
2022-05-26 12:57:29 +00:00
|
|
|
/**
|
|
|
|
|
* Get the parsed body by content-type
|
|
|
|
|
*
|
|
|
|
|
* @return array
|
|
|
|
|
*/
|
|
|
|
|
protected function getParsedBody(): array {
|
|
|
|
|
$request = $this->getRequest();
|
2022-06-10 21:11:58 +00:00
|
|
|
[ $contentType ] = explode( ';', $request->getHeader( 'Content-Type' )[0] ?? '', 2 );
|
2022-05-26 12:57:29 +00:00
|
|
|
switch ( $contentType ) {
|
|
|
|
|
case 'application/x-www-form-urlencoded':
|
|
|
|
|
case 'multipart/form-data':
|
|
|
|
|
return $request->getPostParams();
|
|
|
|
|
case 'application/json':
|
|
|
|
|
$json = json_decode( $request->getBody()->getContents(), true );
|
|
|
|
|
if ( !is_array( $json ) ) {
|
|
|
|
|
throw new HttpException( 'Payload does not JSON decode to an array.', 400 );
|
|
|
|
|
}
|
|
|
|
|
return $json;
|
|
|
|
|
default:
|
|
|
|
|
throw new HttpException( 'Unsupported Media Type', 415 );
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Rough equivalent of req.local from Parsoid-JS.
|
|
|
|
|
* FIXME most of these should be replaced with more native ways of handling the request.
|
|
|
|
|
* @return array
|
|
|
|
|
*/
|
|
|
|
|
protected function &getRequestAttributes(): array {
|
|
|
|
|
if ( $this->requestAttributes ) {
|
|
|
|
|
return $this->requestAttributes;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$request = $this->getRequest();
|
|
|
|
|
$body = ( $request->getMethod() === 'POST' ) ? $this->getParsedBody() : [];
|
|
|
|
|
$opts = array_merge( $body, array_intersect_key( $request->getPathParams(),
|
|
|
|
|
[ 'from' => true, 'format' => true ] ) );
|
|
|
|
|
'@phan-var array<string,array|bool|string> $opts'; // @var array<string,array|bool|string> $opts
|
Use Bcp47Code when interfacing with Parsoid
It is very easy for developers and maintainers to mix up "internal
MediaWiki language codes" and "BCP-47 language codes"; the latter are
standards-compliant and used in web protocols like HTTP, HTML, and
SVG; but much of WMF production is very dependent on historical codes
used by MediaWiki which in some cases predate the IANA standardized
name for the language in question.
Phan and other static checking tools aren't much help distinguishing
BCP-47 from internal codes when both are represented with the PHP
string type, so the wikimedia/bcp-47-code package introduced a very
lightweight wrapper type in order to uniquely identify BCP-47 codes.
Language implements Bcp47Code, and LanguageFactory::getLanguage() is
an easy way to convert (or downcast) between Bcp47Code and Language
objects.
This patch updates the Parsoid integration code and the associated
REST handlers to use Bcp47Code in APIs so that the standalone Parsoid
library does not need to know anything about MediaWiki-internal codes.
The principle has been, first, to try to convert a string to a
Bcp47Code as soon as possible and as close to the original input as
possible, so it is easy to see *why* a given string is a BCP-47 code
(usually, because it is coming from HTTP/HTML/etc) and we're not stuck
deep inside some method trying to figure out where a string we're
given is coming from and therefore what sort of string code it might
be. Second, we've added explicit compatibility code to accept
MediaWiki internal codes and convert them to Bcp47Code for backward
compatibility with existing clients, using the @internal
LanguageCode::normalizeNonstandardCodeAndWarn() method. The intention
is to gradually remove these backward compatibility thunks and replace
them with HTTP 400 errors or wfDeprecated messages in order to
identify and repair callers who are incorrectly using
non-standard-compliant language codes in web standards
(HTTP/HTML/SVG/etc).
Finally, maintaining a code as a Bcp47Code and not immediately
converting to Language helps us delay or even avoid full loading of a
Language object in some cases, which is another reason to occasionally
push Bcp47Code (instead of Language) down the call stack.
Bug: T327379
Depends-On: I830867d58f8962d6a57be16ce3735e8384f9ac1c
Change-Id: I982e0df706a633b05dcc02b5220b737c19adc401
2022-11-04 17:29:23 +00:00
|
|
|
$contentLanguage = $request->getHeaderLine( 'Content-Language' ) ?: null;
|
|
|
|
|
if ( $contentLanguage ) {
|
|
|
|
|
$contentLanguage = LanguageCode::normalizeNonstandardCodeAndWarn(
|
|
|
|
|
$contentLanguage
|
|
|
|
|
);
|
|
|
|
|
}
|
2022-05-26 12:57:29 +00:00
|
|
|
$attribs = [
|
|
|
|
|
'titleMissing' => empty( $request->getPathParams()['title'] ),
|
|
|
|
|
'pageName' => $request->getPathParam( 'title' ) ?? '',
|
|
|
|
|
'oldid' => $request->getPathParam( 'revision' ),
|
|
|
|
|
// "body_only" flag to return just the body (instead of the entire HTML doc)
|
|
|
|
|
// We would like to deprecate use of this flag: T181657
|
|
|
|
|
'body_only' => $request->getQueryParams()['body_only'] ?? $body['body_only'] ?? null,
|
|
|
|
|
'errorEnc' => ParsoidFormatHelper::ERROR_ENCODING[$opts['format']] ?? 'plain',
|
|
|
|
|
'iwp' => WikiMap::getCurrentWikiId(), // PORT-FIXME verify
|
|
|
|
|
'offsetType' => $body['offsetType']
|
|
|
|
|
?? $request->getQueryParams()['offsetType']
|
|
|
|
|
// Lint requests should return UCS2 offsets by default
|
|
|
|
|
?? ( $opts['format'] === ParsoidFormatHelper::FORMAT_LINT ? 'ucs2' : 'byte' ),
|
Use Bcp47Code when interfacing with Parsoid
It is very easy for developers and maintainers to mix up "internal
MediaWiki language codes" and "BCP-47 language codes"; the latter are
standards-compliant and used in web protocols like HTTP, HTML, and
SVG; but much of WMF production is very dependent on historical codes
used by MediaWiki which in some cases predate the IANA standardized
name for the language in question.
Phan and other static checking tools aren't much help distinguishing
BCP-47 from internal codes when both are represented with the PHP
string type, so the wikimedia/bcp-47-code package introduced a very
lightweight wrapper type in order to uniquely identify BCP-47 codes.
Language implements Bcp47Code, and LanguageFactory::getLanguage() is
an easy way to convert (or downcast) between Bcp47Code and Language
objects.
This patch updates the Parsoid integration code and the associated
REST handlers to use Bcp47Code in APIs so that the standalone Parsoid
library does not need to know anything about MediaWiki-internal codes.
The principle has been, first, to try to convert a string to a
Bcp47Code as soon as possible and as close to the original input as
possible, so it is easy to see *why* a given string is a BCP-47 code
(usually, because it is coming from HTTP/HTML/etc) and we're not stuck
deep inside some method trying to figure out where a string we're
given is coming from and therefore what sort of string code it might
be. Second, we've added explicit compatibility code to accept
MediaWiki internal codes and convert them to Bcp47Code for backward
compatibility with existing clients, using the @internal
LanguageCode::normalizeNonstandardCodeAndWarn() method. The intention
is to gradually remove these backward compatibility thunks and replace
them with HTTP 400 errors or wfDeprecated messages in order to
identify and repair callers who are incorrectly using
non-standard-compliant language codes in web standards
(HTTP/HTML/SVG/etc).
Finally, maintaining a code as a Bcp47Code and not immediately
converting to Language helps us delay or even avoid full loading of a
Language object in some cases, which is another reason to occasionally
push Bcp47Code (instead of Language) down the call stack.
Bug: T327379
Depends-On: I830867d58f8962d6a57be16ce3735e8384f9ac1c
Change-Id: I982e0df706a633b05dcc02b5220b737c19adc401
2022-11-04 17:29:23 +00:00
|
|
|
'pagelanguage' => $contentLanguage,
|
2022-05-26 12:57:29 +00:00
|
|
|
];
|
|
|
|
|
|
2022-11-15 22:57:38 +00:00
|
|
|
// For use in getHtmlOutputRendererHelper
|
|
|
|
|
$opts['stash'] = $request->getQueryParams()['stash'] ?? false;
|
2022-09-09 09:32:17 +00:00
|
|
|
|
2022-05-26 12:57:29 +00:00
|
|
|
if ( $request->getMethod() === 'POST' ) {
|
|
|
|
|
if ( isset( $opts['original']['revid'] ) ) {
|
|
|
|
|
$attribs['oldid'] = $opts['original']['revid'];
|
|
|
|
|
}
|
|
|
|
|
if ( isset( $opts['original']['title'] ) ) {
|
|
|
|
|
$attribs['titleMissing'] = false;
|
|
|
|
|
$attribs['pageName'] = $opts['original']['title'];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if ( $attribs['oldid'] !== null ) {
|
|
|
|
|
if ( $attribs['oldid'] === '' ) {
|
|
|
|
|
$attribs['oldid'] = null;
|
|
|
|
|
} else {
|
|
|
|
|
$attribs['oldid'] = (int)$attribs['oldid'];
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
Use Bcp47Code when interfacing with Parsoid
It is very easy for developers and maintainers to mix up "internal
MediaWiki language codes" and "BCP-47 language codes"; the latter are
standards-compliant and used in web protocols like HTTP, HTML, and
SVG; but much of WMF production is very dependent on historical codes
used by MediaWiki which in some cases predate the IANA standardized
name for the language in question.
Phan and other static checking tools aren't much help distinguishing
BCP-47 from internal codes when both are represented with the PHP
string type, so the wikimedia/bcp-47-code package introduced a very
lightweight wrapper type in order to uniquely identify BCP-47 codes.
Language implements Bcp47Code, and LanguageFactory::getLanguage() is
an easy way to convert (or downcast) between Bcp47Code and Language
objects.
This patch updates the Parsoid integration code and the associated
REST handlers to use Bcp47Code in APIs so that the standalone Parsoid
library does not need to know anything about MediaWiki-internal codes.
The principle has been, first, to try to convert a string to a
Bcp47Code as soon as possible and as close to the original input as
possible, so it is easy to see *why* a given string is a BCP-47 code
(usually, because it is coming from HTTP/HTML/etc) and we're not stuck
deep inside some method trying to figure out where a string we're
given is coming from and therefore what sort of string code it might
be. Second, we've added explicit compatibility code to accept
MediaWiki internal codes and convert them to Bcp47Code for backward
compatibility with existing clients, using the @internal
LanguageCode::normalizeNonstandardCodeAndWarn() method. The intention
is to gradually remove these backward compatibility thunks and replace
them with HTTP 400 errors or wfDeprecated messages in order to
identify and repair callers who are incorrectly using
non-standard-compliant language codes in web standards
(HTTP/HTML/SVG/etc).
Finally, maintaining a code as a Bcp47Code and not immediately
converting to Language helps us delay or even avoid full loading of a
Language object in some cases, which is another reason to occasionally
push Bcp47Code (instead of Language) down the call stack.
Bug: T327379
Depends-On: I830867d58f8962d6a57be16ce3735e8384f9ac1c
Change-Id: I982e0df706a633b05dcc02b5220b737c19adc401
2022-11-04 17:29:23 +00:00
|
|
|
$acceptLanguage = $request->getHeaderLine( 'Accept-Language' ) ?: null;
|
|
|
|
|
if ( $acceptLanguage ) {
|
|
|
|
|
$acceptLanguage = LanguageCode::normalizeNonstandardCodeAndWarn(
|
|
|
|
|
$acceptLanguage
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
2022-05-26 12:57:29 +00:00
|
|
|
$attribs['envOptions'] = [
|
|
|
|
|
// We use `prefix` but ought to use `domain` (T206764)
|
|
|
|
|
'prefix' => $attribs['iwp'],
|
2022-06-10 21:11:58 +00:00
|
|
|
// For the legacy "domain" path parameter used by the endpoints exposed
|
2022-06-14 18:26:34 +00:00
|
|
|
// by the parsoid extension. Will be null for core endpoints.
|
2022-06-10 21:11:58 +00:00
|
|
|
'domain' => $request->getPathParam( 'domain' ),
|
2022-05-26 12:57:29 +00:00
|
|
|
'pageName' => $attribs['pageName'],
|
|
|
|
|
'offsetType' => $attribs['offsetType'],
|
|
|
|
|
'cookie' => $request->getHeaderLine( 'Cookie' ),
|
|
|
|
|
'reqId' => $request->getHeaderLine( 'X-Request-Id' ),
|
|
|
|
|
'userAgent' => $request->getHeaderLine( 'User-Agent' ),
|
Use Bcp47Code when interfacing with Parsoid
It is very easy for developers and maintainers to mix up "internal
MediaWiki language codes" and "BCP-47 language codes"; the latter are
standards-compliant and used in web protocols like HTTP, HTML, and
SVG; but much of WMF production is very dependent on historical codes
used by MediaWiki which in some cases predate the IANA standardized
name for the language in question.
Phan and other static checking tools aren't much help distinguishing
BCP-47 from internal codes when both are represented with the PHP
string type, so the wikimedia/bcp-47-code package introduced a very
lightweight wrapper type in order to uniquely identify BCP-47 codes.
Language implements Bcp47Code, and LanguageFactory::getLanguage() is
an easy way to convert (or downcast) between Bcp47Code and Language
objects.
This patch updates the Parsoid integration code and the associated
REST handlers to use Bcp47Code in APIs so that the standalone Parsoid
library does not need to know anything about MediaWiki-internal codes.
The principle has been, first, to try to convert a string to a
Bcp47Code as soon as possible and as close to the original input as
possible, so it is easy to see *why* a given string is a BCP-47 code
(usually, because it is coming from HTTP/HTML/etc) and we're not stuck
deep inside some method trying to figure out where a string we're
given is coming from and therefore what sort of string code it might
be. Second, we've added explicit compatibility code to accept
MediaWiki internal codes and convert them to Bcp47Code for backward
compatibility with existing clients, using the @internal
LanguageCode::normalizeNonstandardCodeAndWarn() method. The intention
is to gradually remove these backward compatibility thunks and replace
them with HTTP 400 errors or wfDeprecated messages in order to
identify and repair callers who are incorrectly using
non-standard-compliant language codes in web standards
(HTTP/HTML/SVG/etc).
Finally, maintaining a code as a Bcp47Code and not immediately
converting to Language helps us delay or even avoid full loading of a
Language object in some cases, which is another reason to occasionally
push Bcp47Code (instead of Language) down the call stack.
Bug: T327379
Depends-On: I830867d58f8962d6a57be16ce3735e8384f9ac1c
Change-Id: I982e0df706a633b05dcc02b5220b737c19adc401
2022-11-04 17:29:23 +00:00
|
|
|
'htmlVariantLanguage' => $acceptLanguage,
|
2022-05-26 12:57:29 +00:00
|
|
|
// Semver::satisfies checks below expect a valid outputContentVersion value.
|
|
|
|
|
// Better to set it here instead of adding the default value at every check.
|
|
|
|
|
'outputContentVersion' => Parsoid::defaultHTMLVersion(),
|
|
|
|
|
];
|
Use Bcp47Code when interfacing with Parsoid
It is very easy for developers and maintainers to mix up "internal
MediaWiki language codes" and "BCP-47 language codes"; the latter are
standards-compliant and used in web protocols like HTTP, HTML, and
SVG; but much of WMF production is very dependent on historical codes
used by MediaWiki which in some cases predate the IANA standardized
name for the language in question.
Phan and other static checking tools aren't much help distinguishing
BCP-47 from internal codes when both are represented with the PHP
string type, so the wikimedia/bcp-47-code package introduced a very
lightweight wrapper type in order to uniquely identify BCP-47 codes.
Language implements Bcp47Code, and LanguageFactory::getLanguage() is
an easy way to convert (or downcast) between Bcp47Code and Language
objects.
This patch updates the Parsoid integration code and the associated
REST handlers to use Bcp47Code in APIs so that the standalone Parsoid
library does not need to know anything about MediaWiki-internal codes.
The principle has been, first, to try to convert a string to a
Bcp47Code as soon as possible and as close to the original input as
possible, so it is easy to see *why* a given string is a BCP-47 code
(usually, because it is coming from HTTP/HTML/etc) and we're not stuck
deep inside some method trying to figure out where a string we're
given is coming from and therefore what sort of string code it might
be. Second, we've added explicit compatibility code to accept
MediaWiki internal codes and convert them to Bcp47Code for backward
compatibility with existing clients, using the @internal
LanguageCode::normalizeNonstandardCodeAndWarn() method. The intention
is to gradually remove these backward compatibility thunks and replace
them with HTTP 400 errors or wfDeprecated messages in order to
identify and repair callers who are incorrectly using
non-standard-compliant language codes in web standards
(HTTP/HTML/SVG/etc).
Finally, maintaining a code as a Bcp47Code and not immediately
converting to Language helps us delay or even avoid full loading of a
Language object in some cases, which is another reason to occasionally
push Bcp47Code (instead of Language) down the call stack.
Bug: T327379
Depends-On: I830867d58f8962d6a57be16ce3735e8384f9ac1c
Change-Id: I982e0df706a633b05dcc02b5220b737c19adc401
2022-11-04 17:29:23 +00:00
|
|
|
|
|
|
|
|
# Convert language codes in $opts['updates']['variant'] if present
|
|
|
|
|
$sourceVariant = $opts['updates']['variant']['source'] ?? null;
|
|
|
|
|
if ( $sourceVariant ) {
|
|
|
|
|
$sourceVariant = LanguageCode::normalizeNonstandardCodeAndWarn(
|
|
|
|
|
$sourceVariant
|
|
|
|
|
);
|
|
|
|
|
$opts['updates']['variant']['source'] = $sourceVariant;
|
|
|
|
|
}
|
|
|
|
|
$targetVariant = $opts['updates']['variant']['target'] ?? null;
|
|
|
|
|
if ( $targetVariant ) {
|
|
|
|
|
$targetVariant = LanguageCode::normalizeNonstandardCodeAndWarn(
|
|
|
|
|
$targetVariant
|
|
|
|
|
);
|
|
|
|
|
$opts['updates']['variant']['target'] = $targetVariant;
|
|
|
|
|
}
|
|
|
|
|
if ( isset( $opts['wikitext']['headers']['content-language'] ) ) {
|
|
|
|
|
$contentLanguage = $opts['wikitext']['headers']['content-language'];
|
|
|
|
|
$contentLanguage = LanguageCode::normalizeNonstandardCodeAndWarn(
|
|
|
|
|
$contentLanguage
|
|
|
|
|
);
|
|
|
|
|
$opts['wikitext']['headers']['content-language'] = $contentLanguage;
|
|
|
|
|
}
|
|
|
|
|
if ( isset( $opts['original']['wikitext']['headers']['content-language'] ) ) {
|
|
|
|
|
$contentLanguage = $opts['original']['wikitext']['headers']['content-language'];
|
|
|
|
|
$contentLanguage = LanguageCode::normalizeNonstandardCodeAndWarn(
|
|
|
|
|
$contentLanguage
|
|
|
|
|
);
|
|
|
|
|
$opts['original']['wikitext']['headers']['content-language'] = $contentLanguage;
|
|
|
|
|
}
|
|
|
|
|
|
2022-05-26 12:57:29 +00:00
|
|
|
$attribs['opts'] = $opts;
|
|
|
|
|
|
2022-06-14 18:26:34 +00:00
|
|
|
// TODO: Remove assertDomainIsCorrect() once we no longer need to support the {domain}
|
|
|
|
|
// parameter for the endpoints exposed by the parsoid extension.
|
|
|
|
|
if ( empty( $this->parsoidSettings['debugApi'] ) && $attribs['envOptions']['domain'] !== null ) {
|
|
|
|
|
$this->assertDomainIsCorrect( $attribs['envOptions']['domain'] );
|
|
|
|
|
}
|
|
|
|
|
|
2022-05-26 12:57:29 +00:00
|
|
|
$this->requestAttributes = $attribs;
|
|
|
|
|
return $this->requestAttributes;
|
|
|
|
|
}
|
|
|
|
|
|
2022-12-05 18:39:37 +00:00
|
|
|
/**
|
|
|
|
|
* @param array $attribs
|
|
|
|
|
* @param ?string $source
|
2022-12-09 17:07:38 +00:00
|
|
|
* @param PageConfig $page
|
2022-12-09 04:49:41 +00:00
|
|
|
* @param ?int $revId
|
2022-12-05 18:39:37 +00:00
|
|
|
*
|
|
|
|
|
* @return HtmlOutputRendererHelper
|
|
|
|
|
*/
|
|
|
|
|
private function getHtmlOutputRendererHelper(
|
|
|
|
|
array $attribs,
|
|
|
|
|
?string $source,
|
2022-12-09 17:07:38 +00:00
|
|
|
PageConfig $page,
|
2022-12-09 04:49:41 +00:00
|
|
|
?int $revId
|
2022-12-05 18:39:37 +00:00
|
|
|
): HtmlOutputRendererHelper {
|
|
|
|
|
$services = MediaWikiServices::getInstance();
|
|
|
|
|
|
2022-12-09 17:07:38 +00:00
|
|
|
// TODO: This method (and wt2html) should take a PageIdentity + revId,
|
|
|
|
|
// to reduce the usage of PageConfig in MW core.
|
|
|
|
|
$page = $this->getPageConfigToIdentity( $page );
|
2022-12-05 18:39:37 +00:00
|
|
|
|
|
|
|
|
$helper = new HtmlOutputRendererHelper(
|
|
|
|
|
$services->getParsoidOutputStash(),
|
|
|
|
|
$services->getStatsdDataFactory(),
|
|
|
|
|
$services->getParsoidOutputAccess(),
|
|
|
|
|
$services->getHtmlTransformFactory(),
|
|
|
|
|
$services->getContentHandlerFactory(),
|
|
|
|
|
$services->getLanguageFactory()
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
$user = RequestContext::getMain()->getUser();
|
|
|
|
|
|
|
|
|
|
$params = [];
|
2022-12-09 04:49:41 +00:00
|
|
|
$helper->init( $page, $params, $user, $revId );
|
2022-12-05 18:39:37 +00:00
|
|
|
|
|
|
|
|
// XXX: should default to the page's content model?
|
|
|
|
|
$model = $attribs['opts']['contentmodel']
|
|
|
|
|
?? ( $attribs['envOptions']['contentmodel'] ?? CONTENT_MODEL_WIKITEXT );
|
|
|
|
|
|
|
|
|
|
if ( $source !== null ) {
|
|
|
|
|
$helper->setContentSource( $source, $model );
|
|
|
|
|
}
|
|
|
|
|
|
2022-11-15 22:57:38 +00:00
|
|
|
if ( isset( $attribs['opts']['stash'] ) ) {
|
|
|
|
|
$helper->setStashingEnabled( $attribs['opts']['stash'] );
|
|
|
|
|
}
|
|
|
|
|
|
2022-12-02 13:52:35 +00:00
|
|
|
if ( isset( $attribs['envOptions']['outputContentVersion'] ) ) {
|
2022-12-05 18:39:37 +00:00
|
|
|
$helper->setOutputProfileVersion( $attribs['envOptions']['outputContentVersion'] );
|
|
|
|
|
}
|
|
|
|
|
|
2022-12-02 13:52:35 +00:00
|
|
|
if ( isset( $attribs['envOptions']['offsetType'] ) ) {
|
|
|
|
|
$helper->setOffsetType( $attribs['envOptions']['offsetType'] );
|
|
|
|
|
}
|
|
|
|
|
|
2022-12-05 18:39:37 +00:00
|
|
|
if ( isset( $attribs['pagelanguage'] ) ) {
|
|
|
|
|
$helper->setPageLanguage( $attribs['pagelanguage'] );
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if ( isset( $attribs['envOptions']['htmlVariantLanguage'] ) ) {
|
|
|
|
|
$helper->setVariantConversionLanguage( $attribs['envOptions']['htmlVariantLanguage'] );
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return $helper;
|
|
|
|
|
}
|
|
|
|
|
|
2022-08-30 19:19:53 +00:00
|
|
|
/**
|
|
|
|
|
* @param array $attribs
|
|
|
|
|
* @param string $html
|
|
|
|
|
* @param PageConfig|PageIdentity $page
|
|
|
|
|
*
|
|
|
|
|
* @return HtmlInputTransformHelper
|
|
|
|
|
*/
|
|
|
|
|
protected function getHtmlInputTransformHelper(
|
2022-08-04 21:14:47 +00:00
|
|
|
array $attribs,
|
|
|
|
|
string $html,
|
2022-08-30 19:19:53 +00:00
|
|
|
$page
|
|
|
|
|
): HtmlInputTransformHelper {
|
|
|
|
|
$services = MediaWikiServices::getInstance();
|
2022-08-02 14:51:10 +00:00
|
|
|
|
2022-08-30 19:19:53 +00:00
|
|
|
// Support PageConfig for backwards compatibility.
|
|
|
|
|
// We should leave it to lower level code to create it.
|
|
|
|
|
if ( $page instanceof PageConfig ) {
|
2022-12-04 21:35:20 +00:00
|
|
|
$page = $this->getPageConfigToIdentity( $page );
|
2022-07-20 18:19:59 +00:00
|
|
|
}
|
|
|
|
|
|
2022-11-18 16:55:18 +00:00
|
|
|
$helper = $services->getPageRestHelperFactory()->newHtmlInputTransformHelper(
|
2022-08-30 19:19:53 +00:00
|
|
|
$attribs['envOptions']
|
|
|
|
|
);
|
2022-07-26 19:41:03 +00:00
|
|
|
|
2022-11-18 16:55:18 +00:00
|
|
|
$metrics = $this->siteConfig->metrics();
|
|
|
|
|
|
|
|
|
|
if ( $metrics ) {
|
|
|
|
|
$helper->setMetrics( $metrics );
|
|
|
|
|
}
|
|
|
|
|
|
2022-08-30 19:19:53 +00:00
|
|
|
$parameters = $attribs['opts'] + $attribs;
|
|
|
|
|
$body = $attribs['opts'];
|
2022-08-26 20:07:48 +00:00
|
|
|
|
2022-08-30 19:19:53 +00:00
|
|
|
$body['html'] = $html;
|
2022-07-26 19:41:03 +00:00
|
|
|
|
2022-08-30 19:19:53 +00:00
|
|
|
$helper->init( $page, $body, $parameters );
|
2022-07-20 18:19:59 +00:00
|
|
|
|
2022-08-30 19:19:53 +00:00
|
|
|
return $helper;
|
2022-07-20 18:19:59 +00:00
|
|
|
}
|
|
|
|
|
|
2022-05-26 12:57:29 +00:00
|
|
|
/**
|
|
|
|
|
* FIXME: Combine with ParsoidFormatHelper::parseContentTypeHeader
|
|
|
|
|
*/
|
|
|
|
|
private const NEW_SPEC =
|
|
|
|
|
'#^https://www.mediawiki.org/wiki/Specs/(HTML|pagebundle)/(\d+\.\d+\.\d+)$#D';
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* This method checks if we support the requested content formats
|
|
|
|
|
* As a side-effect, it updates $attribs to set outputContentVersion
|
|
|
|
|
* that Parsoid should generate based on request headers.
|
|
|
|
|
*
|
|
|
|
|
* @param array &$attribs Request attributes from getRequestAttributes()
|
|
|
|
|
* @return bool
|
|
|
|
|
*/
|
|
|
|
|
protected function acceptable( array &$attribs ): bool {
|
|
|
|
|
$request = $this->getRequest();
|
|
|
|
|
$format = $attribs['opts']['format'];
|
|
|
|
|
|
|
|
|
|
if ( $format === ParsoidFormatHelper::FORMAT_WIKITEXT ) {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$acceptHeader = $request->getHeader( 'Accept' );
|
|
|
|
|
if ( !$acceptHeader ) {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$parser = new HttpAcceptParser();
|
|
|
|
|
$acceptableTypes = $parser->parseAccept( $acceptHeader[0] ); // FIXME: Multiple headers valid?
|
|
|
|
|
if ( !$acceptableTypes ) {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// `acceptableTypes` is already sorted by quality.
|
|
|
|
|
foreach ( $acceptableTypes as $t ) {
|
|
|
|
|
$type = "{$t['type']}/{$t['subtype']}";
|
|
|
|
|
$profile = $t['params']['profile'] ?? null;
|
|
|
|
|
if (
|
|
|
|
|
( $format === ParsoidFormatHelper::FORMAT_HTML && $type === 'text/html' ) ||
|
|
|
|
|
( $format === ParsoidFormatHelper::FORMAT_PAGEBUNDLE && $type === 'application/json' )
|
|
|
|
|
) {
|
|
|
|
|
if ( $profile ) {
|
|
|
|
|
preg_match( self::NEW_SPEC, $profile, $matches );
|
|
|
|
|
if ( $matches && strtolower( $matches[1] ) === $format ) {
|
|
|
|
|
$contentVersion = Parsoid::resolveContentVersion( $matches[2] );
|
|
|
|
|
if ( $contentVersion ) {
|
|
|
|
|
// $attribs mutated here!
|
|
|
|
|
$attribs['envOptions']['outputContentVersion'] = $contentVersion;
|
|
|
|
|
return true;
|
|
|
|
|
} else {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
} elseif (
|
|
|
|
|
( $type === '*/*' ) ||
|
|
|
|
|
( $format === ParsoidFormatHelper::FORMAT_HTML && $type === 'text/*' )
|
|
|
|
|
) {
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* @param string $title The page to be transformed
|
|
|
|
|
* @param ?int $revision The revision to be transformed
|
|
|
|
|
* @param ?string $wikitextOverride
|
|
|
|
|
* Custom wikitext to use instead of the real content of the page.
|
Use Bcp47Code when interfacing with Parsoid
It is very easy for developers and maintainers to mix up "internal
MediaWiki language codes" and "BCP-47 language codes"; the latter are
standards-compliant and used in web protocols like HTTP, HTML, and
SVG; but much of WMF production is very dependent on historical codes
used by MediaWiki which in some cases predate the IANA standardized
name for the language in question.
Phan and other static checking tools aren't much help distinguishing
BCP-47 from internal codes when both are represented with the PHP
string type, so the wikimedia/bcp-47-code package introduced a very
lightweight wrapper type in order to uniquely identify BCP-47 codes.
Language implements Bcp47Code, and LanguageFactory::getLanguage() is
an easy way to convert (or downcast) between Bcp47Code and Language
objects.
This patch updates the Parsoid integration code and the associated
REST handlers to use Bcp47Code in APIs so that the standalone Parsoid
library does not need to know anything about MediaWiki-internal codes.
The principle has been, first, to try to convert a string to a
Bcp47Code as soon as possible and as close to the original input as
possible, so it is easy to see *why* a given string is a BCP-47 code
(usually, because it is coming from HTTP/HTML/etc) and we're not stuck
deep inside some method trying to figure out where a string we're
given is coming from and therefore what sort of string code it might
be. Second, we've added explicit compatibility code to accept
MediaWiki internal codes and convert them to Bcp47Code for backward
compatibility with existing clients, using the @internal
LanguageCode::normalizeNonstandardCodeAndWarn() method. The intention
is to gradually remove these backward compatibility thunks and replace
them with HTTP 400 errors or wfDeprecated messages in order to
identify and repair callers who are incorrectly using
non-standard-compliant language codes in web standards
(HTTP/HTML/SVG/etc).
Finally, maintaining a code as a Bcp47Code and not immediately
converting to Language helps us delay or even avoid full loading of a
Language object in some cases, which is another reason to occasionally
push Bcp47Code (instead of Language) down the call stack.
Bug: T327379
Depends-On: I830867d58f8962d6a57be16ce3735e8384f9ac1c
Change-Id: I982e0df706a633b05dcc02b5220b737c19adc401
2022-11-04 17:29:23 +00:00
|
|
|
* @param ?Bcp47Code $pagelanguageOverride
|
2022-05-26 12:57:29 +00:00
|
|
|
* @return PageConfig
|
|
|
|
|
*/
|
|
|
|
|
protected function createPageConfig(
|
|
|
|
|
string $title, ?int $revision, ?string $wikitextOverride = null,
|
Use Bcp47Code when interfacing with Parsoid
It is very easy for developers and maintainers to mix up "internal
MediaWiki language codes" and "BCP-47 language codes"; the latter are
standards-compliant and used in web protocols like HTTP, HTML, and
SVG; but much of WMF production is very dependent on historical codes
used by MediaWiki which in some cases predate the IANA standardized
name for the language in question.
Phan and other static checking tools aren't much help distinguishing
BCP-47 from internal codes when both are represented with the PHP
string type, so the wikimedia/bcp-47-code package introduced a very
lightweight wrapper type in order to uniquely identify BCP-47 codes.
Language implements Bcp47Code, and LanguageFactory::getLanguage() is
an easy way to convert (or downcast) between Bcp47Code and Language
objects.
This patch updates the Parsoid integration code and the associated
REST handlers to use Bcp47Code in APIs so that the standalone Parsoid
library does not need to know anything about MediaWiki-internal codes.
The principle has been, first, to try to convert a string to a
Bcp47Code as soon as possible and as close to the original input as
possible, so it is easy to see *why* a given string is a BCP-47 code
(usually, because it is coming from HTTP/HTML/etc) and we're not stuck
deep inside some method trying to figure out where a string we're
given is coming from and therefore what sort of string code it might
be. Second, we've added explicit compatibility code to accept
MediaWiki internal codes and convert them to Bcp47Code for backward
compatibility with existing clients, using the @internal
LanguageCode::normalizeNonstandardCodeAndWarn() method. The intention
is to gradually remove these backward compatibility thunks and replace
them with HTTP 400 errors or wfDeprecated messages in order to
identify and repair callers who are incorrectly using
non-standard-compliant language codes in web standards
(HTTP/HTML/SVG/etc).
Finally, maintaining a code as a Bcp47Code and not immediately
converting to Language helps us delay or even avoid full loading of a
Language object in some cases, which is another reason to occasionally
push Bcp47Code (instead of Language) down the call stack.
Bug: T327379
Depends-On: I830867d58f8962d6a57be16ce3735e8384f9ac1c
Change-Id: I982e0df706a633b05dcc02b5220b737c19adc401
2022-11-04 17:29:23 +00:00
|
|
|
?Bcp47Code $pagelanguageOverride = null
|
2022-05-26 12:57:29 +00:00
|
|
|
): PageConfig {
|
|
|
|
|
$title = $title ? Title::newFromText( $title ) : Title::newMainPage();
|
|
|
|
|
if ( !$title ) {
|
|
|
|
|
// TODO use proper validation
|
|
|
|
|
throw new LogicException( 'Title not found!' );
|
|
|
|
|
}
|
|
|
|
|
$user = RequestContext::getMain()->getUser();
|
2022-04-21 10:54:28 +00:00
|
|
|
|
|
|
|
|
if ( $wikitextOverride === null ) {
|
|
|
|
|
$revisionRecord = null;
|
|
|
|
|
} else {
|
|
|
|
|
// Create a mutable revision record point to the same revision
|
|
|
|
|
// and set to the desired wikitext.
|
|
|
|
|
$revisionRecord = new MutableRevisionRecord( $title );
|
|
|
|
|
if ( $revision !== null ) {
|
|
|
|
|
$revisionRecord->setId( $revision );
|
|
|
|
|
}
|
|
|
|
|
$revisionRecord->setSlot(
|
|
|
|
|
SlotRecord::newUnsaved(
|
|
|
|
|
SlotRecord::MAIN,
|
|
|
|
|
new WikitextContent( $wikitextOverride )
|
|
|
|
|
)
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
2022-05-26 12:57:29 +00:00
|
|
|
// Note: Parsoid by design isn't supposed to use the user
|
|
|
|
|
// context right now, and all user state is expected to be
|
|
|
|
|
// introduced as a post-parse transform. So although we pass a
|
|
|
|
|
// User here, it only currently affects the output in obscure
|
|
|
|
|
// corner cases; see PageConfigFactory::create() for more.
|
|
|
|
|
// @phan-suppress-next-line PhanUndeclaredMethod method defined in subtype
|
|
|
|
|
return $this->pageConfigFactory->create(
|
2022-04-21 10:54:28 +00:00
|
|
|
$title, $user, $revisionRecord ?? $revision, null, $pagelanguageOverride,
|
2022-05-26 12:57:29 +00:00
|
|
|
$this->parsoidSettings
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Redirect to another Parsoid URL (e.g. canonization)
|
2022-06-10 21:11:58 +00:00
|
|
|
*
|
2022-07-04 13:20:59 +00:00
|
|
|
* @stable to override
|
|
|
|
|
*
|
2022-05-26 12:57:29 +00:00
|
|
|
* @param string $path Target URL
|
2022-06-10 21:11:58 +00:00
|
|
|
* @param array $pathParams Path parameters to inject into path
|
2022-05-26 12:57:29 +00:00
|
|
|
* @param array $queryParams Query parameters
|
2022-06-10 21:11:58 +00:00
|
|
|
*
|
2022-05-26 12:57:29 +00:00
|
|
|
* @return Response
|
|
|
|
|
*/
|
|
|
|
|
protected function createRedirectResponse(
|
2022-06-10 21:11:58 +00:00
|
|
|
string $path, array $pathParams = [], array $queryParams = []
|
2022-05-26 12:57:29 +00:00
|
|
|
): Response {
|
2022-07-04 13:20:59 +00:00
|
|
|
// FIXME this should not be necessary in the REST entry point
|
2022-05-26 12:57:29 +00:00
|
|
|
unset( $queryParams['title'] );
|
|
|
|
|
|
2022-07-04 13:20:59 +00:00
|
|
|
$url = $this->getRedirectRouteUrl( $path, $pathParams, $queryParams );
|
|
|
|
|
|
2022-05-26 12:57:29 +00:00
|
|
|
if ( $this->getRequest()->getMethod() === 'POST' ) {
|
2022-07-04 13:20:59 +00:00
|
|
|
// 307 response
|
|
|
|
|
$response = $this->getResponseFactory()->createTemporaryRedirect( $url );
|
2022-05-26 12:57:29 +00:00
|
|
|
} else {
|
2022-07-04 13:20:59 +00:00
|
|
|
// 302 response
|
|
|
|
|
$response = $this->getResponseFactory()->createLegacyTemporaryRedirect( $url );
|
2022-05-26 12:57:29 +00:00
|
|
|
}
|
|
|
|
|
$response->setHeader( 'Cache-Control', 'private,no-cache,s-maxage=0' );
|
|
|
|
|
return $response;
|
|
|
|
|
}
|
|
|
|
|
|
2022-07-04 13:20:59 +00:00
|
|
|
/**
|
|
|
|
|
* Returns the target URL for a redirect to the given path and parameters.
|
|
|
|
|
* This exists so subclasses can override it to control whether the redirect
|
|
|
|
|
* should go to a private or to a public URL.
|
|
|
|
|
*
|
|
|
|
|
* @see Router::getRouteUrl()
|
|
|
|
|
*
|
|
|
|
|
* @stable to override
|
|
|
|
|
*
|
|
|
|
|
* @param string $path
|
|
|
|
|
* @param array $pathParams
|
|
|
|
|
* @param array $queryParams
|
|
|
|
|
*
|
|
|
|
|
* @return string
|
|
|
|
|
*/
|
|
|
|
|
protected function getRedirectRouteUrl(
|
|
|
|
|
string $path, array $pathParams = [], array $queryParams = []
|
|
|
|
|
) {
|
|
|
|
|
return $this->getRouter()->getRouteUrl( $path, $pathParams, $queryParams );
|
|
|
|
|
}
|
|
|
|
|
|
2022-05-26 12:57:29 +00:00
|
|
|
/**
|
|
|
|
|
* Try to create a PageConfig object. If we get an exception (because content
|
|
|
|
|
* may be missing or inaccessible), throw an appropriate HTTP response object
|
|
|
|
|
* for callers to handle.
|
|
|
|
|
*
|
|
|
|
|
* @param array $attribs
|
|
|
|
|
* @param ?string $wikitext
|
|
|
|
|
* @param bool $html2WtMode
|
|
|
|
|
* @return PageConfig
|
|
|
|
|
* @throws HttpException
|
|
|
|
|
*/
|
|
|
|
|
protected function tryToCreatePageConfig(
|
|
|
|
|
array $attribs, ?string $wikitext = null, bool $html2WtMode = false
|
|
|
|
|
): PageConfig {
|
|
|
|
|
$oldid = $attribs['oldid'];
|
|
|
|
|
|
|
|
|
|
try {
|
|
|
|
|
$pageConfig = $this->createPageConfig(
|
|
|
|
|
$attribs['pageName'], $oldid, $wikitext,
|
|
|
|
|
$attribs['pagelanguage']
|
|
|
|
|
);
|
|
|
|
|
} catch ( RevisionAccessException $exception ) {
|
|
|
|
|
throw new HttpException( 'The specified revision is deleted or suppressed.', 404 );
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$hasOldId = ( $attribs['oldid'] !== null );
|
2023-06-02 20:29:53 +00:00
|
|
|
if ( !$html2WtMode || $hasOldId ) {
|
|
|
|
|
$pageContent = $pageConfig->getRevisionContent();
|
|
|
|
|
if ( $pageContent === null ) {
|
|
|
|
|
// T234549
|
|
|
|
|
throw new HttpException(
|
|
|
|
|
'The specified revision does not exist.', 404
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
try {
|
|
|
|
|
// FIXME: Parsoid is littered with the main slot assumption
|
|
|
|
|
$pageContent->getContent( SlotRecord::MAIN );
|
|
|
|
|
} catch ( BadRevisionException $e ) {
|
|
|
|
|
throw new HttpException(
|
|
|
|
|
'The text of this revision is missing or corrupted.', 410
|
|
|
|
|
);
|
|
|
|
|
} catch ( SuppressedDataException $e ) {
|
|
|
|
|
throw new HttpException(
|
|
|
|
|
'Access to the content has been suppressed for this audience.', 403
|
|
|
|
|
);
|
|
|
|
|
} catch ( RevisionAccessException $e ) {
|
|
|
|
|
throw new HttpException(
|
|
|
|
|
'Revision access exception.', 404
|
|
|
|
|
);
|
|
|
|
|
}
|
2022-05-26 12:57:29 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if ( !$html2WtMode && $wikitext === null && !$hasOldId ) {
|
|
|
|
|
// Redirect to the latest revid
|
|
|
|
|
throw new ResponseException(
|
|
|
|
|
$this->createRedirectToOldidResponse( $pageConfig, $attribs )
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// All good!
|
|
|
|
|
return $pageConfig;
|
|
|
|
|
}
|
|
|
|
|
|
2022-08-26 20:07:48 +00:00
|
|
|
/**
|
|
|
|
|
* Try to create a PageIdentity object.
|
|
|
|
|
* If no page is specified in the request, this will return the wiki's main page.
|
|
|
|
|
* If an invalid page is requested, this throws an appropriate HTTPException.
|
|
|
|
|
*
|
|
|
|
|
* @param array $attribs
|
|
|
|
|
* @return PageIdentity
|
|
|
|
|
* @throws HttpException
|
|
|
|
|
*/
|
|
|
|
|
protected function tryToCreatePageIdentity( array $attribs ): PageIdentity {
|
|
|
|
|
if ( !isset( $attribs['pageName'] ) || $attribs['pageName'] === '' ) {
|
|
|
|
|
return Title::newMainPage();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// XXX: Should be injected, but the Parsoid extension relies on the
|
|
|
|
|
// constructor signature. Also, ParsoidHandler should go away soon anyway.
|
|
|
|
|
$pageStore = MediaWikiServices::getInstance()->getPageStore();
|
|
|
|
|
|
|
|
|
|
$page = $pageStore->getPageByText( $attribs['pageName'] );
|
|
|
|
|
|
|
|
|
|
if ( !$page ) {
|
|
|
|
|
throw new HttpException( 'Bad page name: ' . $attribs['pageName'], 400 );
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return $page;
|
|
|
|
|
}
|
|
|
|
|
|
2022-06-10 21:11:58 +00:00
|
|
|
/**
|
2022-06-14 18:26:34 +00:00
|
|
|
* Get the path for the transform endpoint. May be overwritten to override the path.
|
|
|
|
|
*
|
|
|
|
|
* This is done in the parsoid extension, for backwards compatibility
|
|
|
|
|
* with the old endpoint URLs.
|
|
|
|
|
*
|
2022-07-04 13:20:59 +00:00
|
|
|
* @stable to override
|
|
|
|
|
*
|
2022-06-14 18:26:34 +00:00
|
|
|
* @param string $format The format the endpoint is expected to return.
|
2022-06-10 21:11:58 +00:00
|
|
|
*
|
|
|
|
|
* @return string
|
|
|
|
|
*/
|
2022-06-14 18:26:34 +00:00
|
|
|
protected function getTransformEndpoint( string $format = ParsoidFormatHelper::FORMAT_HTML ): string {
|
|
|
|
|
return '/coredev/v0/transform/{from}/to/{format}/{title}/{revision}';
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Get the path for the page content endpoint. May be overwritten to override the path.
|
|
|
|
|
*
|
|
|
|
|
* This is done in the parsoid extension, for backwards compatibility
|
|
|
|
|
* with the old endpoint URLs.
|
|
|
|
|
*
|
2022-07-04 13:20:59 +00:00
|
|
|
* @stable to override
|
|
|
|
|
*
|
2022-06-14 18:26:34 +00:00
|
|
|
* @param string $format The format the endpoint is expected to return.
|
|
|
|
|
*
|
|
|
|
|
* @return string
|
|
|
|
|
*/
|
|
|
|
|
protected function getPageContentEndpoint( string $format = ParsoidFormatHelper::FORMAT_HTML ): string {
|
|
|
|
|
if ( $format !== ParsoidFormatHelper::FORMAT_HTML ) {
|
|
|
|
|
throw new InvalidArgumentException( 'Unsupported page content format: ' . $format );
|
|
|
|
|
}
|
|
|
|
|
return '/v1/page/{title}/html';
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Get the path for the page content endpoint. May be overwritten to override the path.
|
|
|
|
|
*
|
|
|
|
|
* This is done in the parsoid extension, for backwards compatibility
|
|
|
|
|
* with the old endpoint URLs.
|
|
|
|
|
*
|
2022-07-04 13:20:59 +00:00
|
|
|
* @stable to override
|
|
|
|
|
*
|
2022-06-14 18:26:34 +00:00
|
|
|
* @param string $format The format the endpoint is expected to return.
|
|
|
|
|
*
|
|
|
|
|
* @return string
|
|
|
|
|
*/
|
|
|
|
|
protected function getRevisionContentEndpoint( string $format = ParsoidFormatHelper::FORMAT_HTML ): string {
|
|
|
|
|
if ( $format !== ParsoidFormatHelper::FORMAT_HTML ) {
|
|
|
|
|
throw new InvalidArgumentException( 'Unsupported revision content format: ' . $format );
|
2022-06-10 21:11:58 +00:00
|
|
|
}
|
2022-06-14 18:26:34 +00:00
|
|
|
return '/v1/revision/{revision}/html';
|
2022-06-10 21:11:58 +00:00
|
|
|
}
|
|
|
|
|
|
2022-12-05 18:39:37 +00:00
|
|
|
/**
|
|
|
|
|
* @param LinkTarget $redirectTarget
|
|
|
|
|
* @param string $domain
|
|
|
|
|
* @param string $format
|
|
|
|
|
*
|
|
|
|
|
* @throws ResponseException
|
|
|
|
|
*/
|
|
|
|
|
private function followWikiRedirect( $redirectTarget, $domain, $format ): void {
|
|
|
|
|
$pageStore = MediaWikiServices::getInstance()->getPageStore();
|
|
|
|
|
$titleFormatter = MediaWikiServices::getInstance()->getTitleFormatter();
|
|
|
|
|
$redirectTarget = $pageStore->getPageForLink( $redirectTarget );
|
|
|
|
|
|
|
|
|
|
if ( $redirectTarget instanceof ExistingPageRecord ) {
|
|
|
|
|
$pathParams = [
|
|
|
|
|
'domain' => $domain,
|
|
|
|
|
'format' => $format,
|
|
|
|
|
'title' => $titleFormatter->getPrefixedDBkey( $redirectTarget ),
|
|
|
|
|
'revision' => $redirectTarget->getLatest()
|
|
|
|
|
];
|
|
|
|
|
|
|
|
|
|
// NOTE: Core doesn't have REST endpoints that return raw wikitext,
|
|
|
|
|
// so the below will fail unless the methods are overwritten.
|
|
|
|
|
if ( $redirectTarget->exists() ) {
|
|
|
|
|
$redirectPath = $this->getRevisionContentEndpoint( ParsoidFormatHelper::FORMAT_WIKITEXT );
|
|
|
|
|
} else {
|
|
|
|
|
$redirectPath = $this->getPageContentEndpoint( ParsoidFormatHelper::FORMAT_WIKITEXT );
|
|
|
|
|
}
|
|
|
|
|
throw new ResponseException(
|
|
|
|
|
$this->createRedirectResponse(
|
|
|
|
|
$redirectPath,
|
|
|
|
|
$pathParams,
|
|
|
|
|
$this->getRequest()->getQueryParams()
|
|
|
|
|
)
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-05-26 12:57:29 +00:00
|
|
|
/**
|
|
|
|
|
* Expand the current URL with the latest revision number and redirect there.
|
|
|
|
|
*
|
|
|
|
|
* @param PageConfig $pageConfig
|
|
|
|
|
* @param array $attribs Request attributes from getRequestAttributes()
|
|
|
|
|
* @return Response
|
|
|
|
|
*/
|
|
|
|
|
protected function createRedirectToOldidResponse(
|
|
|
|
|
PageConfig $pageConfig, array $attribs
|
|
|
|
|
): Response {
|
|
|
|
|
$format = $this->getRequest()->getPathParam( 'format' );
|
|
|
|
|
$target = $pageConfig->getTitle();
|
|
|
|
|
$revid = $pageConfig->getRevisionId();
|
|
|
|
|
|
|
|
|
|
if ( $revid === null ) {
|
|
|
|
|
throw new LogicException( 'Expected page to have a revision id.' );
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$this->metrics->increment( 'redirectToOldid.' . $format );
|
|
|
|
|
|
2022-06-10 21:11:58 +00:00
|
|
|
$pathParams = [
|
|
|
|
|
'domain' => $attribs['envOptions']['domain'],
|
|
|
|
|
'format' => $format,
|
|
|
|
|
'title' => $target,
|
|
|
|
|
'revision' => $revid
|
|
|
|
|
];
|
|
|
|
|
|
2022-05-26 12:57:29 +00:00
|
|
|
if ( $this->getRequest()->getMethod() === 'POST' ) {
|
2022-06-10 21:11:58 +00:00
|
|
|
$pathParams['from'] = $this->getRequest()->getPathParam( 'from' );
|
2022-06-14 18:26:34 +00:00
|
|
|
$newPath = $this->getTransformEndpoint( $format );
|
2022-05-26 12:57:29 +00:00
|
|
|
} else {
|
2022-06-14 18:26:34 +00:00
|
|
|
$newPath = $this->getRevisionContentEndpoint( $format );
|
|
|
|
|
|
2022-05-26 12:57:29 +00:00
|
|
|
}
|
2022-06-10 21:11:58 +00:00
|
|
|
return $this->createRedirectResponse( $newPath, $pathParams, $this->getRequest()->getQueryParams() );
|
2022-05-26 12:57:29 +00:00
|
|
|
}
|
|
|
|
|
|
2022-12-05 18:39:37 +00:00
|
|
|
public function wtLint( PageConfig $pageConfig, array $attribs, ?string $wikitext = null ) {
|
|
|
|
|
$envOptions = $attribs['envOptions'];
|
|
|
|
|
|
|
|
|
|
try {
|
|
|
|
|
$parsoid = $this->newParsoid();
|
|
|
|
|
return $parsoid->wikitext2lint( $pageConfig, $envOptions );
|
|
|
|
|
} catch ( ClientError $e ) {
|
|
|
|
|
throw new HttpException( $e->getMessage(), 400 );
|
|
|
|
|
} catch ( ResourceLimitExceededException $e ) {
|
|
|
|
|
throw new HttpException( $e->getMessage(), 413 );
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private function allowParserCacheWrite() {
|
|
|
|
|
$config = RequestContext::getMain()->getConfig();
|
|
|
|
|
|
|
|
|
|
// HACK: remove before the release of MW 1.40 / early 2023.
|
|
|
|
|
if ( $config->has( 'TemporaryParsoidHandlerParserCacheWriteRatio' ) ) {
|
|
|
|
|
// We need to be careful about ramping up the cache writes,
|
|
|
|
|
// so we don't run out of disk space.
|
|
|
|
|
return wfRandom() < $config->get( 'TemporaryParsoidHandlerParserCacheWriteRatio' );
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2022-05-26 12:57:29 +00:00
|
|
|
/**
|
|
|
|
|
* Wikitext -> HTML helper.
|
|
|
|
|
* Spec'd in https://phabricator.wikimedia.org/T75955 and the API tests.
|
|
|
|
|
*
|
|
|
|
|
* @param PageConfig $pageConfig
|
|
|
|
|
* @param array $attribs Request attributes from getRequestAttributes()
|
|
|
|
|
* @param ?string $wikitext Wikitext to transform (or null to use the
|
|
|
|
|
* page specified in the request attributes).
|
2022-12-05 18:39:37 +00:00
|
|
|
*
|
2022-05-26 12:57:29 +00:00
|
|
|
* @return Response
|
|
|
|
|
*/
|
|
|
|
|
protected function wt2html(
|
|
|
|
|
PageConfig $pageConfig, array $attribs, ?string $wikitext = null
|
|
|
|
|
) {
|
|
|
|
|
$request = $this->getRequest();
|
|
|
|
|
$opts = $attribs['opts'];
|
|
|
|
|
$format = $opts['format'];
|
|
|
|
|
$oldid = $attribs['oldid'];
|
2022-11-15 22:57:38 +00:00
|
|
|
$stash = $opts['stash'] ?? false;
|
2022-05-26 12:57:29 +00:00
|
|
|
|
2022-12-05 18:39:37 +00:00
|
|
|
if ( $format === ParsoidFormatHelper::FORMAT_LINT ) {
|
|
|
|
|
$lints = $this->wtLint( $pageConfig, $attribs, $wikitext );
|
2023-05-23 00:47:03 +00:00
|
|
|
|
|
|
|
|
// FIXME: Once we improve handling of "hidden" lint categories in Parsoid
|
|
|
|
|
// to not generate these by default, we can get rid of this hack here.
|
|
|
|
|
// Till then, filter out 'large-tables' lints.
|
|
|
|
|
$lintRemoved = false;
|
|
|
|
|
foreach ( $lints as $index => $lint ) {
|
|
|
|
|
if ( $lint['type'] === 'large-tables' ) {
|
|
|
|
|
unset( $lints[$index] );
|
|
|
|
|
$lintRemoved = true;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
// Resequence array indexes of unset elements which otherwise are json formatted
|
|
|
|
|
// with unwanted index keys
|
|
|
|
|
if ( $lintRemoved ) {
|
|
|
|
|
$lints = array_values( $lints );
|
|
|
|
|
}
|
|
|
|
|
|
2022-12-05 18:39:37 +00:00
|
|
|
$response = $this->getResponseFactory()->createJson( $lints );
|
|
|
|
|
return $response;
|
|
|
|
|
}
|
2022-05-26 12:57:29 +00:00
|
|
|
|
|
|
|
|
// Performance Timing options
|
|
|
|
|
// init refers to time elapsed before parsing begins
|
|
|
|
|
$metrics = $this->metrics;
|
|
|
|
|
$timing = Timing::start( $metrics );
|
|
|
|
|
|
2022-12-05 18:39:37 +00:00
|
|
|
$helper = $this->getHtmlOutputRendererHelper(
|
|
|
|
|
$attribs,
|
|
|
|
|
$wikitext,
|
2022-12-09 04:49:41 +00:00
|
|
|
$pageConfig,
|
|
|
|
|
$pageConfig->getRevisionId()
|
2022-12-05 18:39:37 +00:00
|
|
|
);
|
2022-05-26 12:57:29 +00:00
|
|
|
|
2022-12-05 18:39:37 +00:00
|
|
|
if ( !$this->allowParserCacheWrite() ) {
|
|
|
|
|
// NOTE: In theory, we want to always write to the parser cache. However,
|
|
|
|
|
// the ParserCache takes a lot of disk space, and we need to have fine grained control
|
|
|
|
|
// over when we write to it, so we can avoid running out of disc space.
|
|
|
|
|
$helper->setUseParserCache( true, false );
|
|
|
|
|
}
|
2022-12-01 21:08:10 +00:00
|
|
|
|
2022-05-26 12:57:29 +00:00
|
|
|
if (
|
|
|
|
|
!empty( $this->parsoidSettings['devAPI'] ) &&
|
|
|
|
|
( $request->getQueryParams()['follow_redirects'] ?? false )
|
|
|
|
|
) {
|
2022-12-04 21:35:20 +00:00
|
|
|
$page = $this->getPageConfigToIdentity( $pageConfig );
|
|
|
|
|
$redirectLookup = MediaWikiServices::getInstance()->getRedirectLookup();
|
|
|
|
|
$redirectTarget = $redirectLookup->getRedirectTarget( $page );
|
2022-05-26 12:57:29 +00:00
|
|
|
if ( $redirectTarget ) {
|
2022-12-05 18:39:37 +00:00
|
|
|
$this->followWikiRedirect(
|
|
|
|
|
$redirectTarget,
|
|
|
|
|
$attribs['envOptions']['domain'],
|
|
|
|
|
$format
|
2022-05-26 12:57:29 +00:00
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-12-05 18:39:37 +00:00
|
|
|
$needsPageBundle = ( $format === ParsoidFormatHelper::FORMAT_PAGEBUNDLE );
|
|
|
|
|
|
|
|
|
|
if ( Semver::satisfies( $attribs['envOptions']['outputContentVersion'],
|
|
|
|
|
'!=' . Parsoid::defaultHTMLVersion() ) ) {
|
|
|
|
|
$metrics->increment( 'wt2html.parse.version.notdefault' );
|
|
|
|
|
}
|
2022-05-26 12:57:29 +00:00
|
|
|
|
|
|
|
|
if ( $attribs['body_only'] ) {
|
2022-12-05 18:39:37 +00:00
|
|
|
$helper->setFlavor( 'fragment' );
|
|
|
|
|
} elseif ( !$needsPageBundle ) {
|
|
|
|
|
// Inline data-parsoid. This will happen when no special params are set.
|
|
|
|
|
$helper->setFlavor( 'edit' );
|
2022-05-26 12:57:29 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if ( $wikitext === null && $oldid !== null ) {
|
|
|
|
|
$mstr = 'pageWithOldid';
|
|
|
|
|
} else {
|
|
|
|
|
$mstr = 'wt';
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$timing->end( "wt2html.$mstr.init" );
|
|
|
|
|
$metrics->timing(
|
|
|
|
|
"wt2html.$mstr.size.input",
|
|
|
|
|
strlen( $pageConfig->getPageMainContent() )
|
|
|
|
|
);
|
|
|
|
|
$parseTiming = Timing::start( $metrics );
|
|
|
|
|
|
2022-12-05 18:39:37 +00:00
|
|
|
if ( $needsPageBundle ) {
|
|
|
|
|
$pb = $helper->getPageBundle();
|
|
|
|
|
|
|
|
|
|
$response = $this->getResponseFactory()->createJson( $pb->responseData() );
|
|
|
|
|
$helper->putHeaders( $response, false );
|
|
|
|
|
|
|
|
|
|
ParsoidFormatHelper::setContentType(
|
|
|
|
|
$response,
|
|
|
|
|
ParsoidFormatHelper::FORMAT_PAGEBUNDLE,
|
|
|
|
|
$pb->version
|
|
|
|
|
);
|
2022-05-26 12:57:29 +00:00
|
|
|
} else {
|
2022-12-05 18:39:37 +00:00
|
|
|
$out = $helper->getHtml();
|
2022-11-15 22:57:38 +00:00
|
|
|
|
2022-12-05 18:39:37 +00:00
|
|
|
$response = $this->getResponseFactory()->create();
|
|
|
|
|
$response->getBody()->write( $out->getRawText() );
|
2022-12-01 21:08:10 +00:00
|
|
|
|
2022-12-05 18:39:37 +00:00
|
|
|
$helper->putHeaders( $response, true );
|
|
|
|
|
|
2022-11-15 22:57:38 +00:00
|
|
|
// Emit an ETag only if stashing is enabled. It's not reliably useful otherwise.
|
|
|
|
|
if ( $stash ) {
|
|
|
|
|
$eTag = $helper->getETag();
|
|
|
|
|
if ( $eTag ) {
|
|
|
|
|
$response->setHeader( 'ETag', $eTag );
|
|
|
|
|
}
|
2022-12-05 18:39:37 +00:00
|
|
|
}
|
|
|
|
|
}
|
2022-12-01 21:08:10 +00:00
|
|
|
|
2022-12-05 18:39:37 +00:00
|
|
|
// XXX: For pagebundle requests, this can be somewhat inflated
|
|
|
|
|
// because of pagebundle json-encoding overheads
|
|
|
|
|
$outSize = $response->getBody()->getSize();
|
|
|
|
|
$parseTime = $parseTiming->end( "wt2html.$mstr.parse" );
|
|
|
|
|
$timing->end( 'wt2html.total' );
|
|
|
|
|
$metrics->timing( "wt2html.$mstr.size.output", $outSize );
|
|
|
|
|
|
|
|
|
|
// Ignore slow parse metrics for non-oldid parses
|
|
|
|
|
if ( $mstr === 'pageWithOldid' ) {
|
|
|
|
|
if ( $parseTime > 3000 ) {
|
|
|
|
|
LoggerFactory::getInstance( 'slow-parsoid' )
|
|
|
|
|
->info( 'Parsing {title} was slow, took {time} seconds', [
|
|
|
|
|
'time' => number_format( $parseTime / 1000, 2 ),
|
|
|
|
|
'title' => $pageConfig->getTitle(),
|
|
|
|
|
] );
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if ( $parseTime > 10 && $outSize > 100 ) {
|
|
|
|
|
// * Don't bother with this metric for really small parse times
|
|
|
|
|
// p99 for initialization time is ~7ms according to grafana.
|
|
|
|
|
// So, 10ms ensures that startup overheads don't skew the metrics
|
|
|
|
|
// * For body_only=false requests, <head> section isn't generated
|
|
|
|
|
// and if the output is small, per-request overheads can skew
|
|
|
|
|
// the timePerKB metrics.
|
|
|
|
|
|
|
|
|
|
// NOTE: This is slightly misleading since there are fixed costs
|
|
|
|
|
// for generating output like the <head> section and should be factored in,
|
|
|
|
|
// but this is good enough for now as a useful first degree of approxmation.
|
|
|
|
|
$timePerKB = $parseTime * 1024 / $outSize;
|
|
|
|
|
$metrics->timing( 'wt2html.timePerKB', $timePerKB );
|
|
|
|
|
|
|
|
|
|
if ( $timePerKB > 500 ) {
|
|
|
|
|
// At 100ms/KB, even a 100KB page which isn't that large will take 10s.
|
|
|
|
|
// So, we probably want to shoot for a threshold under 100ms.
|
|
|
|
|
// But, let's start with 500ms+ outliers first and see what we uncover.
|
2022-05-26 12:57:29 +00:00
|
|
|
LoggerFactory::getInstance( 'slow-parsoid' )
|
2022-12-05 18:39:37 +00:00
|
|
|
->info( 'Parsing {title} was slow, timePerKB took {timePerKB} ms, total: {time} seconds', [
|
2022-05-26 12:57:29 +00:00
|
|
|
'time' => number_format( $parseTime / 1000, 2 ),
|
2022-12-05 18:39:37 +00:00
|
|
|
'timePerKB' => number_format( $timePerKB, 1 ),
|
2022-05-26 12:57:29 +00:00
|
|
|
'title' => $pageConfig->getTitle(),
|
|
|
|
|
] );
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if ( $wikitext !== null ) {
|
|
|
|
|
// Don't cache requests when wt is set in case somebody uses
|
|
|
|
|
// GET for wikitext parsing
|
2022-12-05 18:39:37 +00:00
|
|
|
// XXX: can we just refuse to do wikitext parsing in a GET request?
|
2022-05-26 12:57:29 +00:00
|
|
|
$response->setHeader( 'Cache-Control', 'private,no-cache,s-maxage=0' );
|
|
|
|
|
} elseif ( $oldid !== null ) {
|
2022-12-05 18:39:37 +00:00
|
|
|
// XXX: can this go away? Parsoid's PageContent class doesn't expose supressed revision content.
|
2022-05-26 12:57:29 +00:00
|
|
|
if ( $request->getHeaderLine( 'Cookie' ) ||
|
|
|
|
|
$request->getHeaderLine( 'Authorization' ) ) {
|
|
|
|
|
// Don't cache requests with a session.
|
|
|
|
|
$response->setHeader( 'Cache-Control', 'private,no-cache,s-maxage=0' );
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return $response;
|
|
|
|
|
}
|
|
|
|
|
|
2022-06-30 14:21:30 +00:00
|
|
|
protected function newParsoid(): Parsoid {
|
|
|
|
|
return new Parsoid( $this->siteConfig, $this->dataAccess );
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
protected function parseHTML( string $html, bool $validateXMLNames = false ): Document {
|
2022-07-28 16:24:07 +00:00
|
|
|
return DOMUtils::parseHTML( $html, $validateXMLNames );
|
2022-06-30 14:21:30 +00:00
|
|
|
}
|
|
|
|
|
|
2022-05-26 12:57:29 +00:00
|
|
|
/**
|
2022-08-26 20:07:48 +00:00
|
|
|
* @param PageConfig|PageIdentity $page
|
2022-07-18 10:10:59 +00:00
|
|
|
* @param array $attribs Attributes gotten from requests
|
|
|
|
|
* @param string $html Original HTML
|
|
|
|
|
*
|
2022-05-26 12:57:29 +00:00
|
|
|
* @return Response
|
2022-07-18 10:10:59 +00:00
|
|
|
* @throws HttpException
|
2022-05-26 12:57:29 +00:00
|
|
|
*/
|
|
|
|
|
protected function html2wt(
|
2022-08-26 20:07:48 +00:00
|
|
|
$page, array $attribs, string $html
|
2022-05-26 12:57:29 +00:00
|
|
|
) {
|
2022-08-26 20:07:48 +00:00
|
|
|
if ( $page instanceof PageConfig ) {
|
|
|
|
|
// TODO: Deprecate passing a PageConfig.
|
2022-10-16 21:44:53 +00:00
|
|
|
// Ideally, callers would use HtmlToContentTransform directly.
|
2022-08-30 19:19:53 +00:00
|
|
|
// XXX: This is slow, and we already have the parsed title and ID inside the PageConfig...
|
2022-08-26 20:07:48 +00:00
|
|
|
$page = Title::newFromTextThrow( $page->getTitle() );
|
|
|
|
|
}
|
|
|
|
|
|
2022-07-18 10:10:59 +00:00
|
|
|
try {
|
2022-08-30 19:19:53 +00:00
|
|
|
$transform = $this->getHtmlInputTransformHelper( $attribs, $html, $page );
|
2022-05-26 12:57:29 +00:00
|
|
|
|
2022-07-18 10:10:59 +00:00
|
|
|
$response = $this->getResponseFactory()->create();
|
2022-08-30 19:19:53 +00:00
|
|
|
$transform->putContent( $response );
|
2022-05-26 12:57:29 +00:00
|
|
|
|
2022-07-18 10:10:59 +00:00
|
|
|
return $response;
|
2022-05-26 12:57:29 +00:00
|
|
|
} catch ( ClientError $e ) {
|
|
|
|
|
throw new HttpException( $e->getMessage(), 400 );
|
|
|
|
|
}
|
2022-07-18 10:10:59 +00:00
|
|
|
}
|
|
|
|
|
|
2022-05-26 12:57:29 +00:00
|
|
|
/**
|
|
|
|
|
* Pagebundle -> pagebundle helper.
|
|
|
|
|
*
|
|
|
|
|
* @param array<string,array|string> $attribs
|
|
|
|
|
* @return Response
|
|
|
|
|
* @throws HttpException
|
|
|
|
|
*/
|
|
|
|
|
protected function pb2pb( array $attribs ) {
|
|
|
|
|
$opts = $attribs['opts'];
|
|
|
|
|
|
|
|
|
|
$revision = $opts['previous'] ?? $opts['original'] ?? null;
|
|
|
|
|
if ( !isset( $revision['html'] ) ) {
|
|
|
|
|
throw new HttpException(
|
|
|
|
|
'Missing revision html.', 400
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$vOriginal = ParsoidFormatHelper::parseContentTypeHeader(
|
|
|
|
|
$revision['html']['headers']['content-type'] ?? '' );
|
|
|
|
|
if ( $vOriginal === null ) {
|
|
|
|
|
throw new HttpException(
|
|
|
|
|
'Content-type of revision html is missing.', 400
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
$attribs['envOptions']['inputContentVersion'] = $vOriginal;
|
|
|
|
|
'@phan-var array<string,array|string> $attribs'; // @var array<string,array|string> $attribs
|
|
|
|
|
|
|
|
|
|
$this->metrics->increment(
|
|
|
|
|
'pb2pb.original.version.' . $attribs['envOptions']['inputContentVersion']
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
if ( !empty( $opts['updates'] ) ) {
|
|
|
|
|
// FIXME: Handling missing revisions uniformly for all update types
|
|
|
|
|
// is not probably the right thing to do but probably okay for now.
|
|
|
|
|
// This might need revisiting as we add newer types.
|
|
|
|
|
$pageConfig = $this->tryToCreatePageConfig( $attribs, null, true );
|
|
|
|
|
// If we're only updating parts of the original version, it should
|
|
|
|
|
// satisfy the requested content version, since we'll be returning
|
|
|
|
|
// that same one.
|
|
|
|
|
// FIXME: Since this endpoint applies the acceptable middleware,
|
|
|
|
|
// `getOutputContentVersion` is not what's been passed in, but what
|
|
|
|
|
// can be produced. Maybe that should be selectively applied so
|
|
|
|
|
// that we can update older versions where it makes sense?
|
|
|
|
|
// Uncommenting below implies that we can only update the latest
|
|
|
|
|
// version, since carrot semantics is applied in both directions.
|
|
|
|
|
// if ( !Semver::satisfies(
|
|
|
|
|
// $attribs['envOptions']['inputContentVersion'],
|
|
|
|
|
// "^{$attribs['envOptions']['outputContentVersion']}"
|
|
|
|
|
// ) ) {
|
|
|
|
|
// throw new HttpException(
|
|
|
|
|
// 'We do not know how to do this conversion.', 415
|
|
|
|
|
// );
|
|
|
|
|
// }
|
|
|
|
|
if ( !empty( $opts['updates']['redlinks'] ) ) {
|
|
|
|
|
// Q(arlolra): Should redlinks be more complex than a bool?
|
|
|
|
|
// See gwicke's proposal at T114413#2240381
|
|
|
|
|
return $this->updateRedLinks( $pageConfig, $attribs, $revision );
|
|
|
|
|
} elseif ( isset( $opts['updates']['variant'] ) ) {
|
|
|
|
|
return $this->languageConversion( $pageConfig, $attribs, $revision );
|
|
|
|
|
} else {
|
|
|
|
|
throw new HttpException(
|
|
|
|
|
'Unknown transformation.', 400
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// TODO(arlolra): subbu has some sage advice in T114413#2365456 that
|
|
|
|
|
// we should probably be more explicit about the pb2pb conversion
|
|
|
|
|
// requested rather than this increasingly complex fallback logic.
|
|
|
|
|
$downgrade = Parsoid::findDowngrade(
|
|
|
|
|
$attribs['envOptions']['inputContentVersion'],
|
|
|
|
|
$attribs['envOptions']['outputContentVersion']
|
|
|
|
|
);
|
|
|
|
|
if ( $downgrade ) {
|
|
|
|
|
$pb = new PageBundle(
|
|
|
|
|
$revision['html']['body'],
|
|
|
|
|
$revision['data-parsoid']['body'] ?? null,
|
|
|
|
|
$revision['data-mw']['body'] ?? null
|
|
|
|
|
);
|
|
|
|
|
$this->validatePb( $pb, $attribs['envOptions']['inputContentVersion'] );
|
|
|
|
|
Parsoid::downgrade( $downgrade, $pb );
|
|
|
|
|
|
|
|
|
|
if ( !empty( $attribs['body_only'] ) ) {
|
2022-06-30 14:21:30 +00:00
|
|
|
$doc = $this->parseHTML( $pb->html );
|
2022-05-26 12:57:29 +00:00
|
|
|
$body = DOMCompat::getBody( $doc );
|
|
|
|
|
$pb->html = ContentUtils::toXML( $body, [
|
|
|
|
|
'innerXML' => true,
|
|
|
|
|
] );
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
$response = $this->getResponseFactory()->createJson( $pb->responseData() );
|
|
|
|
|
ParsoidFormatHelper::setContentType(
|
|
|
|
|
$response, ParsoidFormatHelper::FORMAT_PAGEBUNDLE, $pb->version
|
|
|
|
|
);
|
|
|
|
|
return $response;
|
|
|
|
|
// Ensure we only reuse from semantically similar content versions.
|
|
|
|
|
} elseif ( Semver::satisfies( $attribs['envOptions']['outputContentVersion'],
|
|
|
|
|
'^' . $attribs['envOptions']['inputContentVersion'] ) ) {
|
|
|
|
|
$pageConfig = $this->tryToCreatePageConfig( $attribs );
|
|
|
|
|
return $this->wt2html( $pageConfig, $attribs );
|
|
|
|
|
} else {
|
|
|
|
|
throw new HttpException(
|
|
|
|
|
'We do not know how to do this conversion.', 415
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Update red links on a document.
|
|
|
|
|
*
|
|
|
|
|
* @param PageConfig $pageConfig
|
|
|
|
|
* @param array $attribs
|
|
|
|
|
* @param array $revision
|
|
|
|
|
* @return Response
|
|
|
|
|
*/
|
|
|
|
|
protected function updateRedLinks(
|
|
|
|
|
PageConfig $pageConfig, array $attribs, array $revision
|
|
|
|
|
) {
|
2022-06-30 14:21:30 +00:00
|
|
|
$parsoid = $this->newParsoid();
|
2022-05-26 12:57:29 +00:00
|
|
|
|
|
|
|
|
$pb = new PageBundle(
|
|
|
|
|
$revision['html']['body'],
|
|
|
|
|
$revision['data-parsoid']['body'] ?? null,
|
|
|
|
|
$revision['data-mw']['body'] ?? null,
|
|
|
|
|
$attribs['envOptions']['inputContentVersion'],
|
|
|
|
|
$revision['html']['headers'] ?? null,
|
|
|
|
|
$revision['contentmodel'] ?? null
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
$out = $parsoid->pb2pb(
|
|
|
|
|
$pageConfig, 'redlinks', $pb, []
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
$this->validatePb( $out, $attribs['envOptions']['inputContentVersion'] );
|
|
|
|
|
|
|
|
|
|
$response = $this->getResponseFactory()->createJson( $out->responseData() );
|
|
|
|
|
ParsoidFormatHelper::setContentType(
|
|
|
|
|
$response, ParsoidFormatHelper::FORMAT_PAGEBUNDLE, $out->version
|
|
|
|
|
);
|
|
|
|
|
return $response;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Do variant conversion on a document.
|
|
|
|
|
*
|
|
|
|
|
* @param PageConfig $pageConfig
|
|
|
|
|
* @param array $attribs
|
|
|
|
|
* @param array $revision
|
|
|
|
|
* @return Response
|
|
|
|
|
* @throws HttpException
|
|
|
|
|
*/
|
|
|
|
|
protected function languageConversion(
|
|
|
|
|
PageConfig $pageConfig, array $attribs, array $revision
|
|
|
|
|
) {
|
|
|
|
|
$opts = $attribs['opts'];
|
|
|
|
|
$target = $opts['updates']['variant']['target'] ??
|
|
|
|
|
$attribs['envOptions']['htmlVariantLanguage'];
|
2022-10-04 11:54:37 +00:00
|
|
|
$source = $opts['updates']['variant']['source'] ?? null;
|
2022-05-26 12:57:29 +00:00
|
|
|
|
|
|
|
|
if ( !$target ) {
|
|
|
|
|
throw new HttpException(
|
|
|
|
|
'Target variant is required.', 400
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
2022-10-04 11:54:37 +00:00
|
|
|
$pageIdentity = $this->tryToCreatePageIdentity( $attribs );
|
2022-05-26 12:57:29 +00:00
|
|
|
|
|
|
|
|
$pb = new PageBundle(
|
|
|
|
|
$revision['html']['body'],
|
|
|
|
|
$revision['data-parsoid']['body'] ?? null,
|
|
|
|
|
$revision['data-mw']['body'] ?? null,
|
|
|
|
|
$attribs['envOptions']['inputContentVersion'],
|
|
|
|
|
$revision['html']['headers'] ?? null,
|
|
|
|
|
$revision['contentmodel'] ?? null
|
|
|
|
|
);
|
2022-10-04 11:54:37 +00:00
|
|
|
|
Use Bcp47Code when interfacing with Parsoid
It is very easy for developers and maintainers to mix up "internal
MediaWiki language codes" and "BCP-47 language codes"; the latter are
standards-compliant and used in web protocols like HTTP, HTML, and
SVG; but much of WMF production is very dependent on historical codes
used by MediaWiki which in some cases predate the IANA standardized
name for the language in question.
Phan and other static checking tools aren't much help distinguishing
BCP-47 from internal codes when both are represented with the PHP
string type, so the wikimedia/bcp-47-code package introduced a very
lightweight wrapper type in order to uniquely identify BCP-47 codes.
Language implements Bcp47Code, and LanguageFactory::getLanguage() is
an easy way to convert (or downcast) between Bcp47Code and Language
objects.
This patch updates the Parsoid integration code and the associated
REST handlers to use Bcp47Code in APIs so that the standalone Parsoid
library does not need to know anything about MediaWiki-internal codes.
The principle has been, first, to try to convert a string to a
Bcp47Code as soon as possible and as close to the original input as
possible, so it is easy to see *why* a given string is a BCP-47 code
(usually, because it is coming from HTTP/HTML/etc) and we're not stuck
deep inside some method trying to figure out where a string we're
given is coming from and therefore what sort of string code it might
be. Second, we've added explicit compatibility code to accept
MediaWiki internal codes and convert them to Bcp47Code for backward
compatibility with existing clients, using the @internal
LanguageCode::normalizeNonstandardCodeAndWarn() method. The intention
is to gradually remove these backward compatibility thunks and replace
them with HTTP 400 errors or wfDeprecated messages in order to
identify and repair callers who are incorrectly using
non-standard-compliant language codes in web standards
(HTTP/HTML/SVG/etc).
Finally, maintaining a code as a Bcp47Code and not immediately
converting to Language helps us delay or even avoid full loading of a
Language object in some cases, which is another reason to occasionally
push Bcp47Code (instead of Language) down the call stack.
Bug: T327379
Depends-On: I830867d58f8962d6a57be16ce3735e8384f9ac1c
Change-Id: I982e0df706a633b05dcc02b5220b737c19adc401
2022-11-04 17:29:23 +00:00
|
|
|
// XXX: DI should inject HtmlTransformFactory
|
2022-10-04 11:54:37 +00:00
|
|
|
$languageVariantConverter = MediaWikiServices::getInstance()
|
2022-10-16 21:44:53 +00:00
|
|
|
->getHtmlTransformFactory()
|
2022-10-04 11:54:37 +00:00
|
|
|
->getLanguageVariantConverter( $pageIdentity );
|
|
|
|
|
$languageVariantConverter->setPageConfig( $pageConfig );
|
2022-10-07 10:38:50 +00:00
|
|
|
$httpContentLanguage = $attribs['pagelanguage' ] ?? null;
|
|
|
|
|
if ( $httpContentLanguage ) {
|
2022-10-04 13:34:05 +00:00
|
|
|
$languageVariantConverter->setPageLanguageOverride( $httpContentLanguage );
|
2022-10-07 10:38:50 +00:00
|
|
|
}
|
|
|
|
|
|
2022-10-04 11:54:37 +00:00
|
|
|
try {
|
|
|
|
|
$out = $languageVariantConverter->convertPageBundleVariant( $pb, $target, $source );
|
|
|
|
|
} catch ( InvalidArgumentException $e ) {
|
|
|
|
|
throw new HttpException(
|
|
|
|
|
'Unsupported language conversion',
|
|
|
|
|
400,
|
|
|
|
|
[ 'reason' => $e->getMessage() ]
|
|
|
|
|
);
|
|
|
|
|
}
|
2022-05-26 12:57:29 +00:00
|
|
|
|
|
|
|
|
$response = $this->getResponseFactory()->createJson( $out->responseData() );
|
|
|
|
|
ParsoidFormatHelper::setContentType(
|
|
|
|
|
$response, ParsoidFormatHelper::FORMAT_PAGEBUNDLE, $out->version
|
|
|
|
|
);
|
|
|
|
|
return $response;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/** @inheritDoc */
|
|
|
|
|
abstract public function execute(): Response;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Validate a PageBundle against the given contentVersion, and throw
|
|
|
|
|
* an HttpException if it does not match.
|
|
|
|
|
* @param PageBundle $pb
|
|
|
|
|
* @param string $contentVersion
|
|
|
|
|
* @throws HttpException
|
|
|
|
|
*/
|
|
|
|
|
private function validatePb( PageBundle $pb, string $contentVersion ): void {
|
|
|
|
|
$errorMessage = '';
|
|
|
|
|
if ( !$pb->validate( $contentVersion, $errorMessage ) ) {
|
|
|
|
|
throw new HttpException( $errorMessage, 400 );
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-12-04 21:35:20 +00:00
|
|
|
/**
|
|
|
|
|
* @param PageConfig $page
|
|
|
|
|
*
|
|
|
|
|
* @return ProperPageIdentity
|
|
|
|
|
* @throws HttpException
|
|
|
|
|
*/
|
|
|
|
|
private function getPageConfigToIdentity( PageConfig $page ): ProperPageIdentity {
|
|
|
|
|
$services = MediaWikiServices::getInstance();
|
|
|
|
|
|
|
|
|
|
$title = $page->getTitle();
|
|
|
|
|
$page = $services->getPageStore()->getPageByText( $title );
|
|
|
|
|
|
|
|
|
|
if ( !$page ) {
|
|
|
|
|
throw new HttpException(
|
|
|
|
|
"Bad title: $title",
|
|
|
|
|
400
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return $page;
|
|
|
|
|
}
|
|
|
|
|
|
2022-05-26 12:57:29 +00:00
|
|
|
}
|