* Added externallinks table, to track links to arbitrary URLs
* Convert unnecessary URL escape codes in external links to their equivalent character before doing anything with them. This prevents certain kinds of spam filter evasion. (Parser.php only)
This commit is contained in:
parent
f88b6fd3ec
commit
eb53cc0856
9 changed files with 207 additions and 5 deletions
|
|
@ -64,6 +64,7 @@ Database:
|
|||
namespace are changed
|
||||
* Respect database prefix in dumpHTML.inc
|
||||
* Removed read-only check from Database::query()
|
||||
* Added externallinks table, to track links to arbitrary URLs
|
||||
|
||||
Documentation:
|
||||
* (bug 3306) Document $wgLocalTZoffset
|
||||
|
|
@ -251,6 +252,9 @@ Parser:
|
|||
* Fix XML validity checks in parser tests on PHP 5.1
|
||||
* (bug 4377) "[" is not valid in URLs
|
||||
* (bug 4453) fix for __TOC__ dollar-number breakage
|
||||
* Convert unnecessary URL escape codes in external links to their equivalent
|
||||
character before doing anything with them. This prevents certain kinds of
|
||||
spam filter evasion.
|
||||
|
||||
Upload:
|
||||
* (bug 2527) Always set destination filename when new file is selected
|
||||
|
|
|
|||
|
|
@ -1808,4 +1808,34 @@ function wfBaseName( $path ) {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Make a URL index, appropriate for the el_index field of externallinks.
 *
 * The index is the URL with any username and password removed, and with the
 * labels of the hostname reversed and converted to lower case, followed by
 * an extra dot. For example
 *   http://user:password@sub.example.com/page.html
 * becomes
 *   http://com.example.sub./page.html
 *
 * @param string $url The URL to index
 * @return mixed The index string, or false if the URL cannot be parsed, is
 *   not an http URL, or has no host component
 */
function wfMakeUrlIndex( $url ) {
	wfSuppressWarnings();
	$bits = parse_url( $url );
	wfRestoreWarnings();

	// parse_url() leaves out the keys of components that are absent from the
	// URL (e.g. a relative URL has no 'scheme', "http:foo" has no 'host'),
	// so make sure they exist before reading them.
	if ( !$bits || !isset( $bits['scheme'] ) || !isset( $bits['host'] ) ) {
		return false;
	}
	if ( $bits['scheme'] !== 'http' ) {
		return false;
	}

	// Reverse the labels in the hostname, convert to lower case
	$reversedHost = strtolower( implode( '.', array_reverse( explode( '.', $bits['host'] ) ) ) );
	// Add an extra dot to the end, unless the reversal already left one there
	if ( substr( $reversedHost, -1 ) !== '.' ) {
		$reversedHost .= '.';
	}

	// Reconstruct the pseudo-URL
	$index = "http://$reversedHost";
	// Leave out user and password. Add the port, path, query and fragment
	if ( isset( $bits['port'] ) ) {
		$index .= ':' . $bits['port'];
	}
	if ( isset( $bits['path'] ) ) {
		$index .= $bits['path'];
	} else {
		// No path given: index it as the root path
		$index .= '/';
	}
	if ( isset( $bits['query'] ) ) {
		$index .= '?' . $bits['query'];
	}
	if ( isset( $bits['fragment'] ) ) {
		$index .= '#' . $bits['fragment'];
	}
	return $index;
}
|
||||
|
||||
?>
|
||||
|
|
|
|||
|
|
@ -19,6 +19,7 @@ class LinksUpdate {
|
|||
$mLinks, # Map of title strings to IDs for the links in the document
|
||||
$mImages, # DB keys of the images used, in the array key only
|
||||
$mTemplates, # Map of title strings to IDs for the template references, including broken ones
|
||||
$mExternals, # URLs of external links, array key only
|
||||
$mCategories, # Map of category names to sort keys
|
||||
$mDb, # Database connection reference
|
||||
$mOptions; # SELECT options to be used (array)
|
||||
|
|
@ -52,6 +53,7 @@ class LinksUpdate {
|
|||
$this->mLinks =& $this->mParserOutput->getLinks();
|
||||
$this->mImages =& $this->mParserOutput->getImages();
|
||||
$this->mTemplates =& $this->mParserOutput->getTemplates();
|
||||
$this->mExternals =& $this->mParserOutput->getExternalLinks();
|
||||
$this->mCategories =& $this->mParserOutput->getCategories();
|
||||
|
||||
}
|
||||
|
|
@ -87,6 +89,11 @@ class LinksUpdate {
|
|||
$this->incrTableUpdate( 'imagelinks', 'il', $this->getImageDeletions( $existing ),
|
||||
$this->getImageInsertions( $existing ) );
|
||||
|
||||
# External links
|
||||
$existing = $this->getExistingExternals();
|
||||
$this->incrTableUpdate( 'externallinks', 'el', $this->getExternalDeletions( $existing ),
|
||||
$this->getExternalInsertions( $existing ) );
|
||||
|
||||
# Category links
|
||||
$existing = $this->getExistingCategories();
|
||||
$this->incrTableUpdate( 'categorylinks', 'cl', $this->getCategoryDeletions( $existing ),
|
||||
|
|
@ -117,6 +124,7 @@ class LinksUpdate {
|
|||
$this->dumbTableUpdate( 'imagelinks', $this->getImageInsertions(), 'il_from' );
|
||||
$this->dumbTableUpdate( 'categorylinks', $this->getCategoryInsertions(), 'cl_from' );
|
||||
$this->dumbTableUpdate( 'templatelinks', $this->getTemplateInsertions(), 'tl_from' );
|
||||
$this->dumbTableUpdate( 'externallinks', $this->getExternalInsertions(), 'el_from' );
|
||||
|
||||
# Update the cache of all the category pages
|
||||
$this->invalidateCategories( $categoryUpdates );
|
||||
|
|
@ -238,7 +246,7 @@ class LinksUpdate {
|
|||
function getImageInsertions( $existing = array() ) {
|
||||
$arr = array();
|
||||
$diffs = array_diff_key( $this->mImages, $existing );
|
||||
foreach( $diffs as $iname => $val ) {
|
||||
foreach( $diffs as $iname => $dummy ) {
|
||||
$arr[] = array(
|
||||
'il_from' => $this->mId,
|
||||
'il_to' => $iname
|
||||
|
|
@ -247,6 +255,23 @@ class LinksUpdate {
|
|||
return $arr;
|
||||
}
|
||||
|
||||
/**
 * Build the rows to insert into externallinks: one row for every URL key of
 * $this->mExternals that is not also a key of $existing.
 * @param array $existing Existing external links, URLs in the keys
 * @return array List of rows with el_from, el_to and el_index fields
 * @access private
 */
function getExternalInsertions( $existing = array() ) {
	$rows = array();
	$newUrls = array_keys( array_diff_key( $this->mExternals, $existing ) );
	foreach ( $newUrls as $url ) {
		$row = array();
		$row['el_from'] = $this->mId;
		$row['el_to'] = $url;
		$row['el_index'] = wfMakeUrlIndex( $url );
		$rows[] = $row;
	}
	return $rows;
}
|
||||
|
||||
/**
|
||||
* Get an array of category insertions
|
||||
* @param array $existing Array mapping existing category names to sort keys. If both
|
||||
|
|
@ -309,6 +334,15 @@ class LinksUpdate {
|
|||
return array_diff_key( $existing, $this->mImages );
|
||||
}
|
||||
|
||||
/**
 * Work out which existing external links should be deleted: the entries of
 * $existing whose key is not a key of $this->mExternals.
 * @param array $existing Existing external links, URLs in the keys
 * @return array The subset of $existing to delete
 * @access private
 */
function getExternalDeletions( $existing ) {
	$obsolete = array_diff_key( $existing, $this->mExternals );
	return $obsolete;
}
|
||||
|
||||
/**
|
||||
* Given an array of existing categories, returns those categories which are not in $this
|
||||
* and thus should be deleted.
|
||||
|
|
@ -333,6 +367,7 @@ class LinksUpdate {
|
|||
}
|
||||
$arr[$row->pl_namespace][$row->pl_title] = 1;
|
||||
}
|
||||
$this->mDb->freeResult( $res );
|
||||
return $arr;
|
||||
}
|
||||
|
||||
|
|
@ -351,6 +386,7 @@ class LinksUpdate {
|
|||
}
|
||||
$arr[$row->tl_namespace][$row->tl_title] = 1;
|
||||
}
|
||||
$this->mDb->freeResult( $res );
|
||||
return $arr;
|
||||
}
|
||||
|
||||
|
|
@ -366,6 +402,23 @@ class LinksUpdate {
|
|||
while ( $row = $this->mDb->fetchObject( $res ) ) {
|
||||
$arr[$row->il_to] = 1;
|
||||
}
|
||||
$this->mDb->freeResult( $res );
|
||||
return $arr;
|
||||
}
|
||||
|
||||
/**
 * Fetch the external links currently recorded for this page.
 * Selects el_to from externallinks where el_from is $this->mId.
 * @return array Map with the URLs in the keys and 1 as every value
 * @access private
 */
function getExistingExternals() {
	$fname = 'LinksUpdate::getExistingExternals';
	$fields = array( 'el_to' );
	$conds = array( 'el_from' => $this->mId );
	$res = $this->mDb->select( 'externallinks', $fields, $conds, $fname, $this->mOptions );

	$urls = array();
	for ( $row = $this->mDb->fetchObject( $res ); $row; $row = $this->mDb->fetchObject( $res ) ) {
		$urls[$row->el_to] = 1;
	}
	$this->mDb->freeResult( $res );
	return $urls;
}
|
||||
|
||||
|
|
@ -381,6 +434,7 @@ class LinksUpdate {
|
|||
while ( $row = $this->mDb->fetchObject( $res ) ) {
|
||||
$arr[$row->cl_to] = $row->cl_sortkey;
|
||||
}
|
||||
$this->mDb->freeResult( $res );
|
||||
return $arr;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1121,19 +1121,23 @@ class Parser
|
|||
|
||||
# Replace &amp; from obsolete syntax with &.
|
||||
# All HTML entities will be escaped by makeExternalLink()
|
||||
# or maybeMakeExternalImage()
|
||||
$url = str_replace( '&', '&', $url );
|
||||
# Replace unnecessary URL escape codes with the referenced character
|
||||
# This prevents spammers from hiding links from the filters
|
||||
$url = Parser::replaceUnusualEscapes( $url );
|
||||
|
||||
# Process the trail (i.e. everything after this link up until start of the next link),
|
||||
# replacing any non-bracketed links
|
||||
$trail = $this->replaceFreeExternalLinks( $trail );
|
||||
|
||||
|
||||
# Use the encoded URL
|
||||
# This means that users can paste URLs directly into the text
|
||||
# Funny characters like ö aren't valid in URLs anyway
|
||||
# This was changed in August 2004
|
||||
$s .= $sk->makeExternalLink( $url, $text, false, $linktype ) . $dtrail . $trail;
|
||||
|
||||
# Register link in the output object
|
||||
$this->mOutput->addExternalLink( $url );
|
||||
}
|
||||
|
||||
wfProfileOut( $fname );
|
||||
|
|
@ -1189,12 +1193,16 @@ class Parser
|
|||
# All HTML entities will be escaped by makeExternalLink()
|
||||
# or maybeMakeExternalImage()
|
||||
$url = str_replace( '&', '&', $url );
|
||||
# Replace unnecessary URL escape codes with their equivalent characters
|
||||
$url = Parser::replaceUnusualEscapes( $url );
|
||||
|
||||
# Is this an external image?
|
||||
$text = $this->maybeMakeExternalImage( $url );
|
||||
if ( $text === false ) {
|
||||
# Not an image, make a link
|
||||
$text = $sk->makeExternalLink( $url, $wgContLang->markNoConversion($url), true, 'free' );
|
||||
# Register it in the output object
|
||||
$this->mOutput->addExternalLink( $url );
|
||||
}
|
||||
$s .= $text . $trail;
|
||||
} else {
|
||||
|
|
@ -1205,6 +1213,36 @@ class Parser
|
|||
return $s;
|
||||
}
|
||||
|
||||
/**
 * Replace unusual URL escape codes with their equivalent characters.
 * Every %XX sequence (two hex digits) in the URL is handed to
 * Parser::replaceUnusualEscapesCallback(), which decides whether to decode it.
 * @param string $url
 * @return string
 * @static
 */
function replaceUnusualEscapes( $url ) {
	$escapeCode = '/%[0-9A-Fa-f]{2}/';
	$callback = array( 'Parser', 'replaceUnusualEscapesCallback' );
	return preg_replace_callback( $escapeCode, $callback, $url );
}
|
||||
|
||||
/**
 * Callback function used in replaceUnusualEscapes().
 * Replaces an unusual URL escape code with its equivalent character.
 *
 * An escape code is decoded only if it stands for a printable ASCII character
 * (33 to 126) that is neither unsafe nor reserved in RFC 1738. Everything else
 * is returned unchanged, because decoding it would change the meaning of the
 * URL: "%25" would become a bare "%" (so "%2541" would turn into "%41"),
 * "%26" and "%3D" would add query separators, and "%2B" would become "+",
 * which stands for a space in query strings.
 *
 * @param array $matches Regex match; $matches[0] is the %XX escape code
 * @return string The decoded character, or the original escape code
 * @static
 * @access private
 */
function replaceUnusualEscapesCallback( $matches ) {
	$char = urldecode( $matches[0] );
	$ord = ord( $char );
	// Is it an unsafe or HTTP reserved character according to RFC 1738?
	// Unsafe: <>"#%{}|\^~[]` -- reserved: ;/?:@=& -- plus "+" (see above).
	if ( $ord > 32 && $ord < 127 && strpos( '<>"#%{}|\^~[]`;/?:@=&+', $char ) === false ) {
		// No, shouldn't be escaped
		return $char;
	} else {
		// Yes, leave it escaped
		return $matches[0];
	}
}
|
||||
|
||||
/**
|
||||
* make an image if it's allowed, either through the global
|
||||
* option or through the exception
|
||||
|
|
@ -3742,7 +3780,8 @@ class ParserOutput
|
|||
$mTitleText, # title text of the chosen language variant
|
||||
$mLinks, # 2-D map of NS/DBK to ID for the links in the document. ID=zero for broken.
|
||||
$mTemplates, # 2-D map of NS/DBK to ID for the template references. ID=zero for broken.
|
||||
$mImages; # DB keys of the images used, in the array key only
|
||||
$mImages, # DB keys of the images used, in the array key only
|
||||
$mExternalLinks; # External link URLs, in the key only
|
||||
|
||||
function ParserOutput( $text = '', $languageLinks = array(), $categoryLinks = array(),
|
||||
$containsOldMagic = false, $titletext = '' )
|
||||
|
|
@ -3757,6 +3796,7 @@ class ParserOutput
|
|||
$this->mLinks = array();
|
||||
$this->mTemplates = array();
|
||||
$this->mImages = array();
|
||||
$this->mExternalLinks = array();
|
||||
}
|
||||
|
||||
# Returns the value of $this->mText.
function getText() { return $this->mText; }
|
||||
|
|
@ -3768,6 +3808,7 @@ class ParserOutput
|
|||
# The four accessors below return by reference (note the & before the name).
# Returns a reference to $this->mLinks.
function &getLinks() { return $this->mLinks; }
|
||||
# Returns a reference to $this->mTemplates.
function &getTemplates() { return $this->mTemplates; }
|
||||
# Returns a reference to $this->mImages.
function &getImages() { return $this->mImages; }
|
||||
# Returns a reference to $this->mExternalLinks (external link URLs, in the key only).
function &getExternalLinks() { return $this->mExternalLinks; }
|
||||
|
||||
# Returns the value of $this->mContainsOldMagic.
function containsOldMagic() { return $this->mContainsOldMagic; }
|
||||
# Passes $this->mText and $text to wfSetVar() and returns its result.
# NOTE(review): presumably wfSetVar() stores the new value and returns the old one -- confirm.
function setText( $text ) { return wfSetVar( $this->mText, $text ); }
|
||||
|
|
@ -3780,6 +3821,7 @@ class ParserOutput
|
|||
# Records category $c in $this->mCategories with $sort as its value.
function addCategory( $c, $sort ) { $this->mCategories[$c] = $sort; }
|
||||
# Records image $name as a key of $this->mImages (the value is always 1).
function addImage( $name ) { $this->mImages[$name] = 1; }
|
||||
# Appends $t to the $this->mLanguageLinks list.
function addLanguageLink( $t ) { $this->mLanguageLinks[] = $t; }
|
||||
# Records $url as a key of $this->mExternalLinks (the value is always 1),
# so adding the same URL more than once keeps a single entry.
function addExternalLink( $url ) { $this->mExternalLinks[$url] = 1; }
|
||||
|
||||
function addLink( $title, $id ) {
|
||||
$ns = $title->getNamespace();
|
||||
|
|
|
|||
13
maintenance/archives/patch-externallinks.sql
Normal file
13
maintenance/archives/patch-externallinks.sql
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
--
|
||||
-- Track links to external URLs
|
||||
--
|
||||
CREATE TABLE /*$wgDBprefix*/externallinks (
  -- page_id of the referring page
  el_from int(8) unsigned NOT NULL default '0',

  -- The URL.
  -- No DEFAULT clause: MySQL BLOB columns cannot have a default value.
  el_to blob NOT NULL,

  -- In the case of HTTP URLs, this is the URL with any username or password
  -- removed, and with the labels in the hostname reversed and converted to
  -- lower case. An extra dot is added to allow for matching of either
  -- example.com or *.example.com in a single scan.
  -- Example:
  --      http://user:password@sub.example.com/page.html
  --   becomes
  --      http://com.example.sub./page.html
  -- No DEFAULT clause: MySQL BLOB columns cannot have a default value.
  el_index blob NOT NULL,

  KEY (el_from, el_to(40)),
  KEY (el_to(60), el_from),
  KEY (el_index(60))
) TYPE=InnoDB;
|
||||
|
||||
|
|
@ -462,6 +462,34 @@ CREATE TABLE /*$wgDBprefix*/categorylinks (
|
|||
|
||||
) TYPE=InnoDB, DEFAULT CHARSET=utf8;
|
||||
|
||||
--
|
||||
-- Track links to external URLs
|
||||
--
|
||||
CREATE TABLE /*$wgDBprefix*/externallinks (
  -- page_id of the referring page
  el_from int(8) unsigned NOT NULL default '0',

  -- The URL.
  -- No DEFAULT clause: MySQL BLOB columns cannot have a default value.
  el_to blob NOT NULL,

  -- In the case of HTTP URLs, this is the URL with any username or password
  -- removed, and with the labels in the hostname reversed and converted to
  -- lower case. An extra dot is added to allow for matching of either
  -- example.com or *.example.com in a single scan.
  -- Example:
  --      http://user:password@sub.example.com/page.html
  --   becomes
  --      http://com.example.sub./page.html
  -- which allows for fast searching for all pages under example.com with the
  -- clause:
  --      WHERE el_index LIKE 'http://com.example.%'
  -- No DEFAULT clause: MySQL BLOB columns cannot have a default value.
  el_index blob NOT NULL,

  KEY (el_from, el_to(40)),
  KEY (el_to(60), el_from),
  KEY (el_index(60))
) TYPE=InnoDB, DEFAULT CHARSET=utf8;
|
||||
|
||||
--
|
||||
-- Contains a single row with some aggregate info
|
||||
-- on the state of the site.
|
||||
|
|
|
|||
|
|
@ -69,7 +69,7 @@ function refreshLinks( $start, $newOnly = false, $maxLag = false, $end = 0 ) {
|
|||
}
|
||||
|
||||
function fixLinksFromArticle( $id ) {
|
||||
global $wgTitle, $wgArticle, $wgOut, $wgParser, $wgLinkCache;
|
||||
global $wgTitle, $wgArticle, $wgOut, $wgParser;
|
||||
|
||||
$wgTitle = Title::newFromID( $id );
|
||||
$dbw =& wfGetDB( DB_MASTER );
|
||||
|
|
@ -105,6 +105,8 @@ function deleteLinksFromNonexistent( $maxLag = 0 ) {
|
|||
'pagelinks' => 'pl_from',
|
||||
'imagelinks' => 'il_from',
|
||||
'categorylinks' => 'cl_from',
|
||||
'templatelinks' => 'tl_from',
|
||||
'externallinks' => 'el_from',
|
||||
);
|
||||
|
||||
$page = $dbw->tableName( 'page' );
|
||||
|
|
|
|||
|
|
@ -449,6 +449,34 @@ CREATE TABLE /*$wgDBprefix*/categorylinks (
|
|||
|
||||
) TYPE=InnoDB;
|
||||
|
||||
--
|
||||
-- Track links to external URLs
|
||||
--
|
||||
CREATE TABLE /*$wgDBprefix*/externallinks (
  -- page_id of the referring page
  el_from int(8) unsigned NOT NULL default '0',

  -- The URL.
  -- No DEFAULT clause: MySQL BLOB columns cannot have a default value.
  el_to blob NOT NULL,

  -- In the case of HTTP URLs, this is the URL with any username or password
  -- removed, and with the labels in the hostname reversed and converted to
  -- lower case. An extra dot is added to allow for matching of either
  -- example.com or *.example.com in a single scan.
  -- Example:
  --      http://user:password@sub.example.com/page.html
  --   becomes
  --      http://com.example.sub./page.html
  -- which allows for fast searching for all pages under example.com with the
  -- clause:
  --      WHERE el_index LIKE 'http://com.example.%'
  -- No DEFAULT clause: MySQL BLOB columns cannot have a default value.
  el_index blob NOT NULL,

  KEY (el_from, el_to(40)),
  KEY (el_to(60), el_from),
  KEY (el_index(60))
) TYPE=InnoDB;
|
||||
|
||||
--
|
||||
-- Contains a single row with some aggregate info
|
||||
-- on the state of the site.
|
||||
|
|
|
|||
|
|
@ -26,6 +26,7 @@ $wgNewTables = array(
|
|||
array( 'user_newtalk', 'patch-usernewtalk2.sql' ),
|
||||
array( 'transcache', 'patch-transcache.sql' ),
|
||||
array( 'trackbacks', 'patch-trackbacks.sql' ),
|
||||
array( 'externallinks', 'patch-externallinks.sql' ),
|
||||
);
|
||||
|
||||
$wgNewFields = array(
|
||||
|
|
|
|||
Loading…
Reference in a new issue