* Added externallinks table, to track links to arbitrary URLs

* Convert unnecessary URL escape codes in external links to their equivalent
  character before doing anything with them. This prevents certain kinds of
  spam filter evasion. (Parser.php only)
This commit is contained in:
Tim Starling 2006-01-26 13:29:14 +00:00
parent f88b6fd3ec
commit eb53cc0856
9 changed files with 207 additions and 5 deletions

View file

@ -64,6 +64,7 @@ Database:
namespace are changed
* Respect database prefix in dumpHTML.inc
* Removed read-only check from Database::query()
* Added externallinks table, to track links to arbitrary URLs
Documentation:
* (bug 3306) Document $wgLocalTZoffset
@ -251,6 +252,9 @@ Parser:
* Fix XML validity checks in parser tests on PHP 5.1
* (bug 4377) "[" is not valid in URLs
* (bug 4453) fix for __TOC__ dollar-number breakage
* Convert unnecessary URL escape codes in external links to their equivalent
character before doing anything with them. This prevents certain kinds of
spam filter evasion.
Upload:
* (bug 2527) Always set destination filename when new file is selected

View file

@ -1808,4 +1808,34 @@ function wfBaseName( $path ) {
}
}
/**
 * Make a URL index, appropriate for the el_index field of externallinks.
 *
 * The index is the URL with any user name and password removed, and with the
 * labels of the host name reversed, converted to lower case and followed by
 * an extra dot. For example
 *   http://user:password@sub.example.com/page.html
 * becomes
 *   http://com.example.sub./page.html
 *
 * @param string $url URL to build the index for
 * @return mixed The index string, or false if $url cannot be parsed, is not
 *   an http URL, or has no host component
 */
function wfMakeUrlIndex( $url ) {
	wfSuppressWarnings();
	$bits = parse_url( $url );
	wfRestoreWarnings();

	// parse_url() leaves out the keys of components that are absent from the
	// URL, so both 'scheme' and 'host' have to be checked before being read.
	// Without a host there is nothing to reverse, so no index can be built.
	if ( !$bits
		|| !isset( $bits['scheme'] )
		|| $bits['scheme'] !== 'http'
		|| !isset( $bits['host'] )
	) {
		return false;
	}

	// Reverse the labels in the hostname, convert to lower case
	$labels = array_reverse( explode( '.', $bits['host'] ) );
	$reversedHost = strtolower( implode( '.', $labels ) );

	// Add an extra dot to the end
	if ( substr( $reversedHost, -1 ) !== '.' ) {
		$reversedHost .= '.';
	}

	// Reconstruct the pseudo-URL
	$index = "http://$reversedHost";

	// Leave out user and password. Add the port, path, query and fragment
	if ( isset( $bits['port'] ) ) {
		$index .= ':' . $bits['port'];
	}
	// A URL without a path is indexed as if it pointed at the root
	$index .= isset( $bits['path'] ) ? $bits['path'] : '/';
	if ( isset( $bits['query'] ) ) {
		$index .= '?' . $bits['query'];
	}
	if ( isset( $bits['fragment'] ) ) {
		$index .= '#' . $bits['fragment'];
	}
	return $index;
}
?>

View file

@ -19,6 +19,7 @@ class LinksUpdate {
$mLinks, # Map of title strings to IDs for the links in the document
$mImages, # DB keys of the images used, in the array key only
$mTemplates, # Map of title strings to IDs for the template references, including broken ones
$mExternals, # URLs of external links, array key only
$mCategories, # Map of category names to sort keys
$mDb, # Database connection reference
$mOptions; # SELECT options to be used (array)
@ -52,6 +53,7 @@ class LinksUpdate {
$this->mLinks =& $this->mParserOutput->getLinks();
$this->mImages =& $this->mParserOutput->getImages();
$this->mTemplates =& $this->mParserOutput->getTemplates();
$this->mExternals =& $this->mParserOutput->getExternalLinks();
$this->mCategories =& $this->mParserOutput->getCategories();
}
@ -87,6 +89,11 @@ class LinksUpdate {
$this->incrTableUpdate( 'imagelinks', 'il', $this->getImageDeletions( $existing ),
$this->getImageInsertions( $existing ) );
# External links
$existing = $this->getExistingExternals();
$this->incrTableUpdate( 'externallinks', 'el', $this->getExternalDeletions( $existing ),
$this->getExternalInsertions( $existing ) );
# Category links
$existing = $this->getExistingCategories();
$this->incrTableUpdate( 'categorylinks', 'cl', $this->getCategoryDeletions( $existing ),
@ -117,6 +124,7 @@ class LinksUpdate {
$this->dumbTableUpdate( 'imagelinks', $this->getImageInsertions(), 'il_from' );
$this->dumbTableUpdate( 'categorylinks', $this->getCategoryInsertions(), 'cl_from' );
$this->dumbTableUpdate( 'templatelinks', $this->getTemplateInsertions(), 'tl_from' );
$this->dumbTableUpdate( 'externallinks', $this->getExternalInsertions(), 'el_from' );
# Update the cache of all the category pages
$this->invalidateCategories( $categoryUpdates );
@ -238,7 +246,7 @@ class LinksUpdate {
function getImageInsertions( $existing = array() ) {
$arr = array();
$diffs = array_diff_key( $this->mImages, $existing );
foreach( $diffs as $iname => $val ) {
foreach( $diffs as $iname => $dummy ) {
$arr[] = array(
'il_from' => $this->mId,
'il_to' => $iname
@ -247,6 +255,23 @@ class LinksUpdate {
return $arr;
}
/**
 * Build the externallinks rows that need to be inserted for this page.
 * URLs that appear as keys of $existing are left out.
 *
 * @param array $existing URLs to skip, in the array keys
 * @return array List of rows, each with el_from, el_to and el_index
 * @access private
 */
function getExternalInsertions( $existing = array() ) {
	$rows = array();
	// Only the keys matter: the URL is the key, the value is a placeholder
	$newUrls = array_keys( array_diff_key( $this->mExternals, $existing ) );
	foreach ( $newUrls as $url ) {
		$row = array();
		$row['el_from'] = $this->mId;
		$row['el_to'] = $url;
		$row['el_index'] = wfMakeUrlIndex( $url );
		$rows[] = $row;
	}
	return $rows;
}
/**
* Get an array of category insertions
* @param array $existing Array mapping existing category names to sort keys. If both
@ -309,6 +334,15 @@ class LinksUpdate {
return array_diff_key( $existing, $this->mImages );
}
/**
 * Work out which of the existing external links are no longer used.
 *
 * @param array $existing Existing external links, URLs in the keys
 * @return array The entries of $existing whose key is not a key of
 *   $this->mExternals, i.e. the links that should be deleted
 * @access private
 */
function getExternalDeletions( $existing ) {
	$obsolete = array_diff_key( $existing, $this->mExternals );
	return $obsolete;
}
/**
* Given an array of existing categories, returns those categories which are not in $this
* and thus should be deleted.
@ -333,6 +367,7 @@ class LinksUpdate {
}
$arr[$row->pl_namespace][$row->pl_title] = 1;
}
$this->mDb->freeResult( $res );
return $arr;
}
@ -351,6 +386,7 @@ class LinksUpdate {
}
$arr[$row->tl_namespace][$row->tl_title] = 1;
}
$this->mDb->freeResult( $res );
return $arr;
}
@ -366,6 +402,23 @@ class LinksUpdate {
while ( $row = $this->mDb->fetchObject( $res ) ) {
$arr[$row->il_to] = 1;
}
$this->mDb->freeResult( $res );
return $arr;
}
/**
 * Fetch the external links currently recorded for this page.
 *
 * @return array Map with the stored el_to URLs in the keys; every value is 1
 * @access private
 */
function getExistingExternals() {
	$urls = array();
	$result = $this->mDb->select(
		'externallinks',
		array( 'el_to' ),
		array( 'el_from' => $this->mId ),
		'LinksUpdate::getExistingExternals',
		$this->mOptions
	);
	while ( $link = $this->mDb->fetchObject( $result ) ) {
		$urls[$link->el_to] = 1;
	}
	$this->mDb->freeResult( $result );
	return $urls;
}
@ -381,6 +434,7 @@ class LinksUpdate {
while ( $row = $this->mDb->fetchObject( $res ) ) {
$arr[$row->cl_to] = $row->cl_sortkey;
}
$this->mDb->freeResult( $res );
return $arr;
}
}

View file

@ -1121,19 +1121,23 @@ class Parser
# Replace &amp; from obsolete syntax with &.
# All HTML entities will be escaped by makeExternalLink()
# or maybeMakeExternalImage()
$url = str_replace( '&', '&', $url );
# Replace unnecessary URL escape codes with the referenced character
# This prevents spammers from hiding links from the filters
$url = Parser::replaceUnusualEscapes( $url );
# Process the trail (i.e. everything after this link up until start of the next link),
# replacing any non-bracketed links
$trail = $this->replaceFreeExternalLinks( $trail );
# Use the encoded URL
# This means that users can paste URLs directly into the text
# Funny characters like ö aren't valid in URLs anyway
# This was changed in August 2004
$s .= $sk->makeExternalLink( $url, $text, false, $linktype ) . $dtrail . $trail;
# Register link in the output object
$this->mOutput->addExternalLink( $url );
}
wfProfileOut( $fname );
@ -1189,12 +1193,16 @@ class Parser
# All HTML entities will be escaped by makeExternalLink()
# or maybeMakeExternalImage()
$url = str_replace( '&', '&', $url );
# Replace unnecessary URL escape codes with their equivalent characters
$url = Parser::replaceUnusualEscapes( $url );
# Is this an external image?
$text = $this->maybeMakeExternalImage( $url );
if ( $text === false ) {
# Not an image, make a link
$text = $sk->makeExternalLink( $url, $wgContLang->markNoConversion($url), true, 'free' );
# Register it in the output object
$this->mOutput->addExternalLink( $url );
}
$s .= $text . $trail;
} else {
@ -1205,6 +1213,36 @@ class Parser
return $s;
}
/**
 * Replace unusual URL escape codes with their equivalent characters.
 * Every %XX sequence in the URL is handed to replaceUnusualEscapesCallback(),
 * and whatever that returns is substituted for the sequence.
 *
 * @param string $url
 * @return string
 * @static
 */
function replaceUnusualEscapes( $url ) {
	$escapePattern = '/%[0-9A-Fa-f]{2}/';
	$callback = array( 'Parser', 'replaceUnusualEscapesCallback' );
	return preg_replace_callback( $escapePattern, $callback, $url );
}
/**
 * Callback function used in replaceUnusualEscapes().
 * Replaces an unusual URL escape code with its equivalent character.
 *
 * An escape code is decoded only when it stands for a printable ASCII
 * character that RFC 1738 lists as neither unsafe nor reserved. Everything
 * else is returned exactly as it was matched. In particular "%25" must stay
 * escaped: decoding it would produce a bare "%" that changes the meaning of
 * whatever follows it, and the reserved characters ":", "@", "=" and "&"
 * must stay escaped because decoding them alters the structure of the URL.
 *
 * @param array $matches Match array from preg_replace_callback(); element 0
 *   is the %XX escape sequence
 * @return string The decoded character, or the original escape sequence
 * @static
 * @access private
 */
function replaceUnusualEscapesCallback( $matches ) {
	$char = urldecode( $matches[0] );
	$ord = ord( $char );
	// Is it an unsafe or HTTP reserved character according to RFC 1738?
	// Space, control characters and non-ASCII bytes fail the range test.
	if ( $ord > 32 && $ord < 127 && strpos( '<>"#%{}|\^~[]`;/?:@=&', $char ) === false ) {
		// No, shouldn't be escaped
		return $char;
	}
	// Yes, leave it escaped
	return $matches[0];
}
/**
* make an image if it's allowed, either through the global
* option or through the exception
@ -3742,7 +3780,8 @@ class ParserOutput
$mTitleText, # title text of the chosen language variant
$mLinks, # 2-D map of NS/DBK to ID for the links in the document. ID=zero for broken.
$mTemplates, # 2-D map of NS/DBK to ID for the template references. ID=zero for broken.
$mImages; # DB keys of the images used, in the array key only
$mImages, # DB keys of the images used, in the array key only
$mExternalLinks; # External link URLs, in the key only
function ParserOutput( $text = '', $languageLinks = array(), $categoryLinks = array(),
$containsOldMagic = false, $titletext = '' )
@ -3757,6 +3796,7 @@ class ParserOutput
$this->mLinks = array();
$this->mTemplates = array();
$this->mImages = array();
$this->mExternalLinks = array();
}
function getText() { return $this->mText; }
@ -3768,6 +3808,7 @@ class ParserOutput
function &getLinks() { return $this->mLinks; }
function &getTemplates() { return $this->mTemplates; }
function &getImages() { return $this->mImages; }
function &getExternalLinks() { return $this->mExternalLinks; }
function containsOldMagic() { return $this->mContainsOldMagic; }
function setText( $text ) { return wfSetVar( $this->mText, $text ); }
@ -3780,6 +3821,7 @@ class ParserOutput
function addCategory( $c, $sort ) { $this->mCategories[$c] = $sort; }
function addImage( $name ) { $this->mImages[$name] = 1; }
function addLanguageLink( $t ) { $this->mLanguageLinks[] = $t; }
function addExternalLink( $url ) { $this->mExternalLinks[$url] = 1; }
function addLink( $title, $id ) {
$ns = $title->getNamespace();

View file

@ -0,0 +1,13 @@
--
-- Track links to external URLs
--
CREATE TABLE /*$wgDBprefix*/externallinks (
  -- page_id of the referring page
  el_from int(8) unsigned NOT NULL default '0',

  -- The URL.
  -- No DEFAULT clause: MySQL does not allow one on BLOB columns.
  el_to blob NOT NULL,

  -- In the case of HTTP URLs, this is the URL with any username or password
  -- removed, and with the labels in the hostname reversed and converted to
  -- lower case. An extra dot is added to allow for matching of either
  -- example.com or *.example.com in a single scan.
  -- Example:
  --      http://user:password@sub.example.com/page.html
  --   becomes
  --      http://com.example.sub./page.html
  -- which allows for fast searching for all pages under example.com with the
  -- clause:
  --      WHERE el_index LIKE 'http://com.example.%'
  -- No DEFAULT clause: MySQL does not allow one on BLOB columns.
  el_index blob NOT NULL,

  KEY (el_from, el_to(40)),
  KEY (el_to(60), el_from),
  KEY (el_index(60))
) TYPE=InnoDB;

View file

@ -462,6 +462,34 @@ CREATE TABLE /*$wgDBprefix*/categorylinks (
) TYPE=InnoDB, DEFAULT CHARSET=utf8;
--
-- Track links to external URLs
--
CREATE TABLE /*$wgDBprefix*/externallinks (
  -- page_id of the referring page
  el_from int(8) unsigned NOT NULL default '0',

  -- The URL.
  -- No DEFAULT clause: MySQL does not allow one on BLOB columns.
  el_to blob NOT NULL,

  -- In the case of HTTP URLs, this is the URL with any username or password
  -- removed, and with the labels in the hostname reversed and converted to
  -- lower case. An extra dot is added to allow for matching of either
  -- example.com or *.example.com in a single scan.
  -- Example:
  --      http://user:password@sub.example.com/page.html
  --   becomes
  --      http://com.example.sub./page.html
  -- which allows for fast searching for all pages under example.com with the
  -- clause:
  --      WHERE el_index LIKE 'http://com.example.%'
  -- No DEFAULT clause: MySQL does not allow one on BLOB columns.
  el_index blob NOT NULL,

  KEY (el_from, el_to(40)),
  KEY (el_to(60), el_from),
  KEY (el_index(60))
) TYPE=InnoDB, DEFAULT CHARSET=utf8;
--
-- Contains a single row with some aggregate info
-- on the state of the site.

View file

@ -69,7 +69,7 @@ function refreshLinks( $start, $newOnly = false, $maxLag = false, $end = 0 ) {
}
function fixLinksFromArticle( $id ) {
global $wgTitle, $wgArticle, $wgOut, $wgParser, $wgLinkCache;
global $wgTitle, $wgArticle, $wgOut, $wgParser;
$wgTitle = Title::newFromID( $id );
$dbw =& wfGetDB( DB_MASTER );
@ -105,6 +105,8 @@ function deleteLinksFromNonexistent( $maxLag = 0 ) {
'pagelinks' => 'pl_from',
'imagelinks' => 'il_from',
'categorylinks' => 'cl_from',
'templatelinks' => 'tl_from',
'externallinks' => 'el_from',
);
$page = $dbw->tableName( 'page' );

View file

@ -449,6 +449,34 @@ CREATE TABLE /*$wgDBprefix*/categorylinks (
) TYPE=InnoDB;
--
-- Track links to external URLs
--
CREATE TABLE /*$wgDBprefix*/externallinks (
  -- page_id of the referring page
  el_from int(8) unsigned NOT NULL default '0',

  -- The URL.
  -- No DEFAULT clause: MySQL does not allow one on BLOB columns.
  el_to blob NOT NULL,

  -- In the case of HTTP URLs, this is the URL with any username or password
  -- removed, and with the labels in the hostname reversed and converted to
  -- lower case. An extra dot is added to allow for matching of either
  -- example.com or *.example.com in a single scan.
  -- Example:
  --      http://user:password@sub.example.com/page.html
  --   becomes
  --      http://com.example.sub./page.html
  -- which allows for fast searching for all pages under example.com with the
  -- clause:
  --      WHERE el_index LIKE 'http://com.example.%'
  -- No DEFAULT clause: MySQL does not allow one on BLOB columns.
  el_index blob NOT NULL,

  KEY (el_from, el_to(40)),
  KEY (el_to(60), el_from),
  KEY (el_index(60))
) TYPE=InnoDB;
--
-- Contains a single row with some aggregate info
-- on the state of the site.

View file

@ -26,6 +26,7 @@ $wgNewTables = array(
array( 'user_newtalk', 'patch-usernewtalk2.sql' ),
array( 'transcache', 'patch-transcache.sql' ),
array( 'trackbacks', 'patch-trackbacks.sql' ),
array( 'externallinks', 'patch-externallinks.sql' ),
);
$wgNewFields = array(