wiki.techinc.nl/maintenance/archives/importUseModWiki.php

486 lines
13 KiB
PHP

<?php
/**
* @deprecated
* @package MediaWiki
* @subpackage MaintenanceArchive
*/
/** */
# Guard: this archived script refuses to run. Everything below the exit()
# call is kept for reference only.
# Each notice ends with a newline so the two messages do not run together
# on the terminal.
print "This script is obsolete!\n";
print "It is retained in the source here in case some of its
code might be useful for ad-hoc conversion tasks, but it is
not maintained and probably won't even work as is.\n";
exit();
/*
Import data from a UseModWiki into a PediaWiki wiki
2003-02-09 Brion VIBBER <brion@pobox.com>
Based loosely on Magnus's code from 2001-2002
Pass one: collect data on links & title case, users
Pass two: spit out SQL for
Separately, be sure to run the link & index rebuilding scripts!
*/
/* globals
*/
# Root of the UseModWiki data directory to read from.
$wgRootDirectory = "/home/brion/vikio/wiki-ca/lib-http/db/wiki";
$wgFieldSeparator = "\xb3"; # Some wikis may use different char
# Nested record separators: the base separator followed by a level digit.
$FS = $wgFieldSeparator ;
$FS1 = $FS."1" ;
$FS2 = $FS."2" ;
$FS3 = $FS."3" ;
# Images to import
# Group 1 captures the whole URL, group 2 the file name; the pattern is
# embedded between "/" delimiters by fixImagelinks() and fixMedialinks().
$imageimport = '(http:\/\/(?:www\.|meta\.|)wikipedia\.(?:com|org)\/upload\/(?:[a-z]\/[a-z][0-9]\/)?(.*\.(?:gif|jpg|jpeg|png)))';
# Number of *seconds to add* to timestamp to get UTC/GMT
#$wgTimezoneCorrection = 0; # GMT
$wgTimezoneCorrection = 8*3600; # PST - California
# Other options...
$historyonly = false; # Don't add converted revisions to cur table; just get old histories
$lasthistoryonly = false; # Only add the _original_ form of the _current_ revision
/* Vary by language */
$namespaces = array( 0 => "", 1 => "Talk:", 2 => "User:", 3 => "User_talk:", 4
=> "Wikipedia:", 5 => "Wikipedia_talk:", 6 => "Image:", 7 => "Image_talk:" );
$talkending = "Talk";
$mediatext = "Media";
$conversionscript = "Conversion script";
$conversioncomment = "Automatic conversion";
# "$1" is a placeholder that doRenamePage() replaces with the target title.
$redirectcomment = "Automatic conversion, moved to \$1";
$conversiontime = gmdate( "YmdHis" ); # Conversions will be marked with this timestamp
# Stats and caches
$oldtitles = array();
$usercache = array();
$titlecache = array();
$linkcache = array();
/**
 * Some oversimplified test types.
 *
 * Minimal stand-in for a page title: a namespace index plus the title text.
 *
 * @deprecated
 * @package MediaWiki
 * @subpackage MaintenanceArchive
 */
class Title {
	/** Title text, without a namespace prefix. */
	public $title;
	/** Namespace index (transformTitle() passes 0 or 1). */
	public $namespace;

	/**
	 * Build a Title from its two parts.
	 *
	 * Declared static because transformTitle() invokes it as
	 * Title::fromData(); calling a non-static method that way is an error
	 * on current PHP versions. Calling it on an instance still works.
	 *
	 * @param mixed $namespace Namespace index to store
	 * @param mixed $title Title text to store
	 * @return Title Newly created object
	 */
	public static function fromData( $namespace, $title ) {
		$x = new Title;
		$x->namespace = $namespace;
		$x->title = $title;
		return $x;
	}
}
# See tests in importTests.php
# $testingonly is not assigned anywhere in this file (presumably the test
# harness sets it before including the script). empty() treats an unset
# variable as "not testing" without raising an undefined-variable notice.
if( empty( $testingonly ) ) {
	firstPass();
	secondPass();
}
# ------------------------------------------------------------------------------
/* First pass:
Information please!
*/
/**
 * Scan every page bucket below "$wgRootDirectory/page": one directory per
 * capital letter A-Z, followed by the catch-all "other" directory.
 */
function firstPass()
{
	global $wgRootDirectory, $oldtitles;
	$buckets = array_merge( range( 'A', 'Z' ), array( 'other' ) );
	foreach( $buckets as $bucket ) {
		$bucketDir = "$wgRootDirectory/page/$bucket";
		firstPassDirectory( $bucketDir );
	}
}
/**
 * Recursively scan one directory of the page store.
 *
 * Sub-directories are descended into. Every "<name>.db" file is registered
 * in $titlecache under <name> and has its outgoing links counted. Any other
 * file is reported on standard output as an SQL comment and skipped.
 *
 * @param string $dir Directory to scan
 */
function firstPassDirectory( $dir )
{
	global $titlecache;
	if( !is_dir( $dir ) ) {
		echo "-- Directory '$dir' doesn't exist. Skipping.\n";
		return;
	}
	$mydir = opendir( $dir );
	if( $mydir === false ) {
		echo "-- Directory '$dir' can't be opened. Skipping.\n";
		return;
	}
	# Compare against false explicitly: an entry named "0" is falsy and
	# would otherwise end the loop early.
	while( false !== ( $entry = readdir( $mydir ) ) ) {
		if( $entry === '.' || $entry === '..' ) {
			continue;
		}
		if( is_dir( "$dir/$entry" ) ) {
			firstPassDirectory( "$dir/$entry" );
		} elseif( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
			# The file name minus ".db" is the old page title.
			$title = $m[1];
			$titlecache[$title] = transformTitle( $title );
			countLinksFrom( $title );
		} else {
			echo "-- File '$entry' doesn't seem to contain an article. Skipping.\n";
		}
	}
	closedir( $mydir );
}
/* Second pass:
make the dang SQL
*/
/**
 * Print the generated SQL: one importUser() result per key of $usercache,
 * then one importPage() result per key of $titlecache, then a closing
 * "Done" marker.
 */
function secondPass()
{
	global $titlecache, $usercache, $redirects;
	# Only the keys are used; the cached values are ignored here.
	foreach( $usercache as $legacyName => $unusedUser ) {
		$userSql = importUser( $legacyName );
		echo $userSql;
	}
	foreach( $titlecache as $legacyTitle => $unusedTitle ) {
		$pageSql = importPage( $legacyTitle );
		echo $pageSql;
	}
	echo "\n-- Done!\n";
}
# ------------------------------------------------------------------------------
/* fetch_ functions
Grab a given item from the database
*/
/**
 * Load the stored record for one user.
 *
 * @param mixed $uid User identifier; used as the file name
 * @return array|false Key/value pairs from the record, or false when the
 *   file does not exist
 */
function fetchUser( $uid )
{
	global $FS, $FS1, $FS2, $FS3, $wgRootDirectory;
	# NOTE(review): this reads from the same "/pages/" directory as
	# fetchPage(); user records presumably live elsewhere - confirm the
	# path before relying on this function.
	$fname = $wgRootDirectory . "/pages/" . $uid;
	if( !file_exists( $fname ) ) return false;
	# splitHash() needs the separator as its first argument. $FS1 is used
	# as the top-level separator, as in fetchKeptPages() - TODO confirm
	# against real user files.
	$data = splitHash( $FS1, implode( "", file( $fname ) ) );
	# enough?
	return $data;
}
/**
 * Load the current revision of a page.
 *
 * The file is a nested key/value record: the page record contains a
 * "text_default" section, which in turn contains a "data" record holding
 * the text fields.
 *
 * @param string $title Old page title; used as the file name
 * @return array|false Array with the keys text, summary, minor, ts,
 *   username and host (null when absent), or false when the file does not
 *   exist
 */
function fetchPage( $title )
{
	global $FS, $FS1, $FS2, $FS3, $wgRootDirectory;
	# NOTE(review): firstPass() scans "$wgRootDirectory/page/<letter>/" for
	# "<title>.db" files, while this reads "/pages/<title>" - the two do
	# not agree; confirm which layout is right.
	$fname = $wgRootDirectory . "/pages/" . $title;
	if( !file_exists( $fname ) ) return false;
	# splitHash() needs the separator as its first argument; the nesting
	# order $FS1 > $FS2 > $FS3 matches fetchKeptPages().
	$page = splitHash( $FS1, implode( "", file( $fname ) ) );
	$section = splitHash( $FS2, isset( $page["text_default"] ) ? $page["text_default"] : "" );
	$text = splitHash( $FS3, isset( $section["data"] ) ? $section["data"] : "" );
	return array(
		"text" => isset( $text["text"] ) ? $text["text"] : null,
		"summary" => isset( $text["summary"] ) ? $text["summary"] : null,
		"minor" => isset( $text["minor"] ) ? $text["minor"] : null,
		"ts" => isset( $section["ts"] ) ? $section["ts"] : null,
		"username" => isset( $section["username"] ) ? $section["username"] : null,
		"host" => isset( $section["host"] ) ? $section["host"] : null,
	);
}
/**
 * Load the kept (historical) revisions of a page from its ".kp" file.
 *
 * Revisions lacking text, a minor flag or a positive timestamp are reported
 * on standard output as an SQL comment and left out.
 *
 * @param string $title Old page title; "<title>.kp" is read from "/keep/"
 * @return array List of revisions, each an array with the keys text,
 *   summary, minor, ts, username and host; empty when the file is missing
 */
function fetchKeptPages( $title )
{
	# $FS1 must be imported too: it is the separator between revisions.
	global $FS, $FS1, $FS2, $FS3, $wgRootDirectory, $wgTimezoneCorrection;
	$fname = $wgRootDirectory . "/keep/" . $title . ".kp";
	if( !file_exists( $fname ) ) return array();
	$keptlist = explode( $FS1, implode( "", file( $fname ) ) );
	array_shift( $keptlist ); # Drop the junk at beginning of file
	$revisions = array();
	foreach( $keptlist as $rev ) {
		$section = splitHash( $FS2, $rev );
		$text = splitHash( $FS3, isset( $section["data"] ) ? $section["data"] : "" );
		$hasText = !empty( $text["text"] );
		$hasMinor = isset( $text["minor"] ) && $text["minor"] != "";
		$hasTime = isset( $section["ts"] ) && is_numeric( $section["ts"] ) && $section["ts"] > 0;
		if( $hasText && $hasMinor && $hasTime ) {
			$revisions[] = array(
				"text" => $text["text"],
				"summary" => isset( $text["summary"] ) ? $text["summary"] : null,
				"minor" => $text["minor"],
				"ts" => $section["ts"],
				"username" => isset( $section["username"] ) ? $section["username"] : null,
				"host" => isset( $section["host"] ) ? $section["host"] : null,
			);
		} else {
			echo "-- skipped a bad old revision\n";
		}
	}
	return $revisions;
}
/**
 * Turn a flat "key SEP value SEP key SEP value ..." string into an
 * associative array.
 *
 * @param string $sep Separator between consecutive items
 * @param string $str Serialized record
 * @return array Key => value map; a trailing key with no value is dropped
 */
function splitHash ( $sep , $str ) {
	$pieces = explode( $sep, $str );
	$total = count( $pieces );
	$hash = array();
	# Walk the list two items at a time: the key, then its value.
	for( $k = 0; $k + 1 < $total; $k += 2 ) {
		$hash[$pieces[$k]] = $pieces[$k + 1];
	}
	return $hash;
}
/* import_ functions
Take a fetched item and produce SQL
*/
/* importUser
$uid is the UseMod user id number.
The new ones will be assigned arbitrarily and are for internal use only.
THIS IS DELAYED SINCE PUBLIC DUMPS DONT INCLUDE USER DIR

Currently a stub: the function returns an empty string straight away, so
no user SQL is produced. Everything after that first return is unreachable
and kept only as a sketch of the intended conversion.
*/
function importUser( $uid )
{
global $last_uid, $user_list, $wgTimestampCorrection;
# Disabled on purpose (see the header comment); nothing below runs.
return "";
# NOTE(review): the unreachable code below would not work as written:
#  - fetchUser() returns an array (or false), yet ->username and ->password
#    use object syntax;
#  - md5hash() is not defined in this file;
#  - $wgTimestampCorrection is not assigned in this file; the configured
#    global is named $wgTimezoneCorrection;
#  - $user_list is imported but never used.
$stuff = fetchUser( $uid );
$last_uid++;
$name = wfStrencode( $stuff->username );
$hash = md5hash( $stuff->password ); # Doable?
$tzoffset = $stuff['tzoffset'] - ($wgTimestampCorrection / 3600); # -8 to 0; +9 to +1
$hideminor = ($stuff['rcall'] ? 0 : 1);
# One "key=value" line per option, each terminated by a newline.
$options = "cols={$stuff['editcols']}
rows={$stuff['editrows']}
rcdays={$stuff['rcdays']}
timecorrection={$tzoffset}
hideminor={$hideminor}
";
$sql = "INSERT
INTO user (user_id,user_name,user_password,user_options)
VALUES ({$last_uid},'{$name}','{$hash}','{$options}');\n";
return $sql;
}
/**
 * Resolve the author of a revision to a (user id, SQL-escaped name) pair.
 *
 * @param string $name Account name recorded on the revision; may be empty
 * @param string $host Host recorded on the revision; used as the display
 *   name when $name is empty
 * @return array Two items: the id stored in $usercache under $name (0 when
 *   unknown or anonymous), and the escaped name or host
 */
function checkUserCache( $name, $host )
{
	global $usercache;
	if( $name ) {
		# The cache is indexed by name, so test the key. in_array() would
		# search the stored values instead.
		if( isset( $usercache[$name] ) ) {
			$userid = $usercache[$name];
		} else {
			# If we haven't imported user accounts
			$userid = 0;
		}
		$username = wfStrencode( $name );
	} else {
		$userid = 0;
		$username = wfStrencode( $host );
	}
	return array( $userid, $username );
}
/**
 * Produce the SQL that imports one page.
 *
 * Unless $historyonly is set, the rewritten current text goes into "cur",
 * attributed to the conversion script. The original current revision, and
 * unless $lasthistoryonly is set every kept revision, go into "old" in a
 * single multi-row INSERT.
 *
 * @param string $title Old page title
 * @return string SQL text, or an SQL comment when the page cannot be read
 */
function importPage( $title )
{
	global $wgTimezoneCorrection, $titlecache, $usercache;
	global $conversionscript, $conversioncomment, $conversiontime;
	global $historyonly, $lasthistoryonly;

	# fetchPage() returns an array, or false for a missing file.
	$page = fetchPage( $title );
	if( !is_array( $page ) ) {
		return "\n-- skipped a page that could not be read\n";
	}
	$pagetext = (string)$page['text'];
	$newtext = wfStrencode( rewritePage( $title, $pagetext ) );

	$t = renamePage( $title );
	if( !is_object( $t ) ) {
		# Fall back to the plain transformed title if renamePage() gives
		# nothing usable back.
		$t = transformTitle( $title );
	}
	$newtitle = wfStrencode( $t->title );
	$namespace = (int)$t->namespace;

	# Current revision:
	$text = wfStrencode( $pagetext );
	$comment = wfStrencode( (string)$page['summary'] );
	$minor = ( $page['minor'] ? 1 : 0 );
	list( $userid, $username ) = checkUserCache( $page['username'], $page['host'] );
	# The revision time is stored under "ts".
	$timestamp = wfUnix2Timestamp( (int)$page['ts'] + $wgTimezoneCorrection );
	$redirect = ( preg_match( '/^#REDIRECT/', $pagetext ) ? 1 : 0 );

	$sql = "\n";
	if( !$historyonly ) {
		$sql .= "INSERT\n" .
			"INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,cur_is_redirect,cur_minor_edit)\n" .
			"VALUES ($namespace,'$newtitle','$newtext','$conversioncomment',0,'$conversionscript','$conversiontime',$redirect,$minor);\n";
	}

	$finalrow = "($namespace,'$newtitle','$text','$comment',$userid,'$username','$timestamp',$minor)";

	# History
	$rows = array();
	if( !$lasthistoryonly ) {
		foreach( fetchKeptPages( $title ) as $rev ) {
			$revtext = wfStrencode( (string)$rev['text'] );
			$revcomment = wfStrencode( (string)$rev['summary'] );
			$revminor = ( $rev['minor'] ? 1 : 0 );
			list( $revuserid, $revusername ) = checkUserCache( $rev['username'], $rev['host'] );
			$revtimestamp = wfUnix2Timestamp( (int)$rev['ts'] + $wgTimezoneCorrection );
			# Eight values, matching the eight columns listed below
			# ("old" has no is_redirect column in this statement).
			$rows[] = "($namespace,'$newtitle','$revtext','$revcomment',$revuserid,'$revusername','$revtimestamp',$revminor)";
		}
	}
	# Kept revisions first, the current revision last.
	$rows[] = $finalrow;

	$sql .= "INSERT\n" .
		"INTO old (old_namespace,old_title,old_text,old_comment,old_user,old_user_text,old_timestamp,old_minor_edit)\n" .
		"VALUES\n\t\t" . implode( ",\n\t\t", $rows ) . ";\n";
	return $sql;
}
# Count up basic links
/**
 * Record every simple [[link]] found in a page's current text.
 *
 * Text inside <nowiki> sections is ignored. Each link target is passed to
 * countLinkTo() with its first letter capitalised.
 *
 * @param string $title Old page title to scan
 */
function countLinksFrom( $title )
{
	# fetchPage() returns an array, or false when the page file is missing.
	$page = fetchPage( $title );
	if( !is_array( $page ) || !isset( $page['text'] ) ) {
		return;
	}
	$text = (string)preg_replace(
		'/<nowiki>.*<\/nowiki>/sDU',
		'',
		$page['text'] );
	# Only the matches are needed, so collect them rather than running a
	# replacement with the removed /e modifier.
	$found = preg_match_all(
		'/\[\[\s*([0-9a-zA-Z_ \x80-\xff]+)\s*(?:\|\s*([^]]+))?\s*\]\]/',
		$text,
		$matches );
	if( $found ) {
		foreach( $matches[1] as $target ) {
			countLinkTo( ucfirst( $target ) );
		}
	}
}
/**
 * Count one link to a page.
 *
 * $linkcache maps each link target to a map of normalised spelling =>
 * number of times that spelling was seen.
 *
 * @param string $title Link target as written (first letter capitalised)
 */
function countLinkTo( $title )
{
	global $linkcache;
	$t = transformTitle( $title );
	$linkform = FreeToNormal( $t->title );
	# Create missing levels first, so nothing is read from an undefined
	# index and count() is never applied to null.
	if( !isset( $linkcache[$title] ) || !is_array( $linkcache[$title] ) ) {
		$linkcache[$title] = array();
	}
	if( empty( $linkcache[$title][$linkform] ) ) {
		$linkcache[$title][$linkform] = 1;
	} else {
		$linkcache[$title][$linkform]++;
	}
}
# Preferentially change case
/**
 * Work out the Title for a page and trigger redirect generation when a
 * different spelling is linked to more often.
 *
 * @param string $title Old page title
 * @return Title Result of transformTitle() for $title
 */
function renamePage( $title )
{
	global $linkcache;
	$t = transformTitle( $title );
	# We want to use the most frequently linked-to form as the title
	$maxcount = 0 ; $maxform = $t->title ;
	# Pages nobody links to have no cache entry at all.
	$forms = ( isset( $linkcache[$title] ) && is_array( $linkcache[$title] ) )
		? $linkcache[$title]
		: array();
	foreach( $forms as $linkform => $count ) {
		if( $count > $maxcount ) {
			$maxcount = $count ;
			$maxform = $linkform ;
		}
	}
	if( $maxform != $t->title ) {
		# NOTE(review): the SQL returned by doRenamePage() is discarded
		# here, and the returned Title keeps its original spelling -
		# confirm whether either should change.
		doRenamePage( $t, $maxform );
	}
	# importPage() reads ->title and ->namespace from the result.
	return $t;
}
/**
 * Build the SQL that creates redirects from the less common spellings of a
 * page title to the preferred one.
 *
 * @param Title|string $title Page being renamed; a Title object (as passed
 *   by renamePage()) or a plain title string
 * @param string $maxform Preferred spelling that the redirects point to
 * @return string One multi-row INSERT into "cur", or "" when there is
 *   nothing to redirect
 */
function doRenamePage( $title, $maxform )
{
	global $linkcache, $redirectcomment, $conversionscript, $conversiontime;
	# An object cannot be used as an array key, so take the title text and
	# the namespace out of it.
	if( is_object( $title ) ) {
		$cachekey = $title->title;
		$namespace = (int)$title->namespace;
	} else {
		$cachekey = $title;
		$namespace = 0;
	}
	if( !isset( $linkcache[$cachekey] ) || !is_array( $linkcache[$cachekey] ) ) {
		return "";
	}
	$comment = wfStrencode( str_replace( '$1', $maxform, $redirectcomment ) );
	# NOTE(review): the original statement supplied no value for cur_text;
	# a "#REDIRECT [[target]]" body is assumed here, matching the
	# /^#REDIRECT/ test in importPage() - confirm.
	$redirtext = wfStrencode( "#REDIRECT [[" . $maxform . "]]" );
	$redirsql = array();
	foreach( $linkcache[$cachekey] as $linkform => $count ) {
		if( (string)$linkform !== (string)$maxform ) {
			$redirtitle = wfStrencode( $linkform );
			$redirsql[] = "($namespace,'$redirtitle','$redirtext','$comment',0,'$conversionscript','$conversiontime',1,1)";
		}
	}
	if( !$redirsql ) {
		# An INSERT with an empty VALUES list would be invalid SQL.
		return "";
	}
	$sql = "INSERT INTO cur (cur_namespace,cur_title,cur_text,cur_comment,cur_user,cur_user_text,cur_timestamp,cur_is_redirect,cur_minor_edit)\n" .
		"VALUES ";
	$sql .= implode( ",\n\t", $redirsql ) . ";\n";
	return $sql;
}
# Account for syntax changes
/**
 * Convert a page's wikitext to the new syntax.
 *
 * The talk-page link is removed first. The text is then cut at <nowiki>
 * sections: the parts outside them go through rewritePageBits(), the
 * sections themselves are kept verbatim.
 *
 * @param string $title Title of the page the text belongs to
 * @param string $text Wikitext to convert
 * @return string Converted wikitext
 */
function rewritePage( $title, $text )
{
	# ...
	$text = removeTalkLink( $text );
	# With PREG_SPLIT_DELIM_CAPTURE the result alternates between text
	# outside nowiki (even indexes) and the nowiki sections (odd indexes).
	# An unclosed <nowiki> protects everything up to the end of the text.
	$parts = preg_split( '/(<nowiki>.*?(?:<\/nowiki>|$))/sD', $text, -1,
		PREG_SPLIT_DELIM_CAPTURE );
	if( $parts === false ) {
		return $text;
	}
	foreach( $parts as $i => $part ) {
		if( $i % 2 == 0 ) {
			$parts[$i] = rewritePageBits( $title, $part );
		}
	}
	return implode( '', $parts );
}
/**
 * Apply the individual syntax conversions to one piece of wikitext.
 *
 * @param string $title Title of the page the text belongs to
 * @param string $text Piece of wikitext
 * @return string Converted piece
 */
function rewritePageBits( $title, $text ) {
	# fixSubpages() is declared as ( $text, &$title ): text first.
	$text = fixSubpages( $text, $title );
	$text = fixMedialinks( $text );
	$text = fixImagelinks( $text );
	return $text;
}
/**
 * Strip every "/<talk ending>" link from the text: optionally wrapped in
 * [[ ]], together with the newlines before it and the whitespace after it.
 * The match is case-insensitive. The argument itself is left unchanged.
 *
 * @param string $text Wikitext (taken by reference)
 * @return string Text with the talk links removed
 */
function removeTalkLink( &$text ) {
	global $talkending;
	# Square brackets act as the pattern delimiters here.
	$pattern = "[\\n*(?:\[\[)?/{$talkending}(?:\]\])?\\s*]sDi";
	$stripped = preg_replace( $pattern, '', $text );
	return $stripped;
}
/**
 * Turn relative subpage links into full links under the given page.
 *
 *  - " /Sub"        becomes " [[Title/Sub|/Sub]]"
 *  - "[[/sub]]"     becomes "[[Title/Sub|/sub]]"
 *  - "[[/sub|txt]]" becomes "[[Title/Sub|txt]]"
 *
 * Callbacks replace the /e modifier, which PHP 7 removed and which also
 * spliced the page title into evaluated code.
 *
 * @param string $text Wikitext to convert
 * @param string $title Title of the containing page (by reference; not
 *   modified)
 * @return string Converted wikitext
 */
function fixSubpages( $text, &$title ) {
	$base = $title;
	# Bare "/Subpage" words at the start of the text or after whitespace.
	$text = preg_replace_callback( "<(^|\s)/([A-Z\xc0-\xdf].*?)\b>",
		function ( $m ) use ( $base ) {
			return $m[1] . "[[" . $base . "/" . $m[2] . "|/" . $m[2] . "]]";
		},
		$text );
	# Bracketed links without a label: capitalise the target and keep the
	# original spelling as the label.
	$text = preg_replace_callback( "<\[\[/([^|]*?)\]\]>",
		function ( $m ) use ( $base ) {
			return "[[" . $base . "/" . ucfirst( $m[1] . "|/" . $m[1] . "]]" );
		},
		$text );
	# Remaining bracketed links, i.e. those that already carry a label.
	$text = preg_replace_callback( "<\[\[/(.*?)\]\]>",
		function ( $m ) use ( $base ) {
			return "[[" . $base . "/" . ucfirst( $m[1] . "]]" );
		},
		$text );
	return $text;
}
/**
 * Replace bare image URLs matching $imageimport with links in the image
 * namespace ($namespaces[6]).
 *
 * A callback replaces the /e modifier, which PHP 7 removed.
 *
 * @param string $text Wikitext (by reference; not modified)
 * @return string Converted wikitext
 */
function fixImagelinks( &$text ) {
	global $imageimport, $namespaces;
	$prefix = $namespaces[6];
	# Match 1 is the whole URL, match 2 the file name.
	return preg_replace_callback( "/$imageimport/",
		function ( $m ) use ( $prefix ) {
			return "[[" . $prefix . fetchMediaFile( $m[1], $m[2] ) . "]]";
		},
		$text );
}
/**
 * Replace bracketed image URLs with media links:
 * "[url]" becomes "[[Media:File]]" and "[url label]" becomes
 * "[[Media:File|label]]", where "Media" comes from $mediatext.
 *
 * Callbacks replace the /e modifier, which PHP 7 removed.
 *
 * @param string $text Wikitext (by reference; not modified)
 * @return string Converted wikitext
 */
function fixMedialinks( &$text ) {
	global $imageimport, $mediatext;
	$prefix = $mediatext;
	# Match 1 is the whole URL, match 2 the file name, match 3 the label.
	$converted = preg_replace_callback( "/\[$imageimport\]/",
		function ( $m ) use ( $prefix ) {
			return "[[" . $prefix . ":" . fetchMediaFile( $m[1], $m[2] ) . "]]";
		},
		$text );
	return preg_replace_callback( "/\[$imageimport (.+?)\]/",
		function ( $m ) use ( $prefix ) {
			return "[[" . $prefix . ":" . fetchMediaFile( $m[1], $m[2] ) . "|" . $m[3] . "]]";
		},
		$converted );
}
/**
 * Give the local name for a media file.
 *
 * Nothing is copied yet (see the FIXME): the URL is ignored and the result
 * is simply the file name with its first letter capitalised.
 *
 * @param string $url Source URL of the file (currently unused)
 * @param string $filename File name taken from the URL
 * @return string $filename with an upper-case first character
 */
function fetchMediaFile( $url, $filename )
{
	# Copy an image file into local upload space
	# FIXME
	$localName = ucfirst( $filename );
	return $localName;
}
# Simple move of talk pages, etc
/**
 * Split an old title into namespace and title text.
 *
 * A title of the form "<page>/<talk ending>" (case-insensitive, with an
 * optional space or underscore on either side of the slash) goes to
 * namespace 1 under "<page>". Anything else stays in namespace 0.
 *
 * @param string $title Old title
 * @param bool $dorename Unused
 * @return Title
 */
function transformTitle( $title, $dorename = false )
{
	global $talkending;
	if( preg_match( "/^(.+)[ _]?\\/[ _]?($talkending)/i", $title, $m ) ) {
		$thetitle = $m[1];
		# The greedy (.+) always swallows the optional separator before
		# the slash, so drop one trailing space or underscore here.
		if( strlen( $thetitle ) > 1 ) {
			$thetitle = preg_replace( '/[ _]$/D', '', $thetitle );
		}
		$namespace = 1;
	} else {
		$thetitle = $title;
		$namespace = 0;
	}
	# Call the factory on an instance: this works whether fromData() is
	# declared static or not, whereas Title::fromData() fails on current
	# PHP when the method is non-static.
	$factory = new Title;
	return $factory->fromData( $namespace, $thetitle );
}
# Translated out of old usemod wiki...
/**
 * Normalise a free-form link into canonical title form.
 *
 * Spaces become underscores and the first character is capitalised. Runs
 * of underscores are collapsed, and underscores at either end or next to a
 * slash are removed. With $FreeUpper, every lower-case ASCII letter that
 * follows one of - _ . , ( ) / is capitalised as well.
 *
 * @param string $id Link text
 * @param bool $FreeUpper Whether to capitalise after separators
 * @return string Normalised title
 */
function FreeToNormal ( $id , $FreeUpper = true ) {
	$id = str_replace( " ", "_", $id );
	$id = ucfirst( $id );
	if( strpos( $id, '_' ) !== false ) { # Quick check for any space/underscores
		$id = preg_replace( '/__+/', "_", $id );
		$id = preg_replace( '/^_/', "", $id );
		$id = preg_replace( '/_$/', "", $id );
		#if ($UseSubpage) {
		$id = preg_replace( '|_/|', "/", $id );
		$id = preg_replace( '|/_|', "/", $id );
		#}
	}
	if( $FreeUpper ) {
		# Note that letters after ' are *not* capitalized
		# A callback replaces the /e modifier, which PHP 7 removed.
		$id = preg_replace_callback( '|([-_.,\(\)/])([a-z])|',
			function ( $m ) {
				return $m[1] . strtoupper( $m[2] );
			},
			$id );
	}
	return $id;
}
# Whee!
/**
 * Character-set conversion hook for imported text. No conversion is done:
 * the input is handed back unchanged.
 *
 * @param string $text Text to recode
 * @return string The same text
 */
function recodeInput( $text )
{
	$recoded = $text;
	return $recoded;
}
/**
 * Format a Unix timestamp as a 14-digit "YYYYMMDDHHMMSS" string in UTC.
 *
 * @param int $unixtime Seconds since the Unix epoch
 * @return string Formatted timestamp
 */
function wfUnix2Timestamp( $unixtime ) {
	# Format the argument. The previous code passed an undefined variable,
	# so the current time was formatted instead.
	return gmdate( "YmdHis", (int)$unixtime );
}
/**
 * Convert a 14-digit "YYYYMMDDHHMMSS" UTC string to a Unix timestamp.
 *
 * @param string $ts Timestamp string
 * @return int Seconds since the Unix epoch
 */
function wfTimestamp2Unix( $ts )
{
	# Cut the fixed-width fields out of the string, left to right.
	$year = (int)substr( $ts, 0, 4 );
	$month = (int)substr( $ts, 4, 2 );
	$day = (int)substr( $ts, 6, 2 );
	$hour = (int)substr( $ts, 8, 2 );
	$minute = (int)substr( $ts, 10, 2 );
	$second = (int)substr( $ts, 12, 2 );
	return gmmktime( $hour, $minute, $second, $month, $day, $year );
}
?>