wiki.techinc.nl/maintenance/importUseModWiki.php

<?php
/**
 * Import data from a UseModWiki into a MediaWiki wiki
 * 2003-02-09 Brion VIBBER <brion@pobox.com>
 * Based loosely on Magnus's code from 2001-2002
 *
 * Updated limited version to get something working temporarily
 * 2003-10-09
 * Be sure to run the link & index rebuilding scripts!
 *
 * Some more munging for charsets etc
 * 2003-11-28
 *
 * Partial fix for pages starting with lowercase letters (??)
 * and CamelCase and /Subpage link conversion
 * 2004-11-17
 *
 * Rewrite output to create Special:Export format for import
 * instead of raw SQL. Should be 'future-proof' against future
 * schema changes.
 * 2005-03-14
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @todo document
 * @file
 * @ingroup Maintenance
 */

require_once( "Maintenance.php" );

/**
 * Maintenance script that reads the page database of a UseModWiki
 * installation and writes a Special:Export-style XML dump to standard
 * output, ready to be fed to a MediaWiki importer.
 */
class ImportUseModWiki extends Maintenance {

	/**
	 * Character encoding of the UseMod data, as understood by iconv()
	 * @var String
	 */
	private $encoding = '';

	/**
	 * Root directory of the UseMod data (contains page/ and keep/)
	 * @var String
	 */
	private $rootDirectory = '';

	/**
	 * Field separators, from the outermost (FS1) to the innermost (FS3)
	 * @var String
	 */
	private $FS1 = '';
	private $FS2 = '';
	private $FS3 = '';

	/**
	 * Map of user name => user id. Nothing in this script fills it, so
	 * every contributor currently resolves to user id 0.
	 * @var Array
	 */
	private $usercache = array();

	/**
	 * Queue of text spans protected from link munging, see mungeFormat()
	 * @var Array
	 */
	private $nowiki = array();

	public function __construct() {
		parent::__construct();
		$this->mDescription = "Import pages from UseMod wikis";
		$this->addOption( 'encoding', 'Encoding of the imported text, default CP1252', false, true );
		/**
		 * If UseModWiki's New File System is used:
		 * $NewFS  = 1;  # 1 = new multibyte $FS,  0 = old $FS
		 * Use "\xb3";  for the Old File System
		 * Changed with UTF-8 UseModWiki
		 * http://www.usemod.com/cgi-bin/wiki.pl?SupportForUtf8
		 * http://www.usemod.com/cgi-bin/wiki.pl?WikiBugs/NewFieldSeparatorWronglyTreated
		 * http://www.meatballwiki.org/wiki/WikiEngine#Q_amp_A
		 */
		$this->addOption( 'separator', 'Field separator to use, default \x1E\xFF\xFE\x1E', false, true );
		$this->addArg( 'path', 'Path to your UseMod wiki' );
	}

	/**
	 * Entry point: reads the options, then walks page/A .. page/Z and
	 * page/other below the given path and prints the XML dump.
	 */
	public function execute() {
		$this->rootDirectory = $this->getArg();
		$this->encoding = $this->getOption( 'encoding', 'CP1252' );
		$sep = $this->getOption( 'separator', "\x1E\xFF\xFE\x1E" );
		$this->FS1 = "{$sep}1";
		$this->FS2 = "{$sep}2";
		$this->FS3 = "{$sep}3";

		echo <<<XML
<?xml version="1.0" encoding="UTF-8" ?>
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.1/"
		   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
		   xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.1/
							   http://www.mediawiki.org/xml/export-0.1.xsd"
		   version="0.1"
		   xml:lang="en">
<!-- generated by importUseModWiki.php -->

XML;
		$letters = array(
			'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
			'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
			'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'other' );
		foreach ( $letters as $letter ) {
			$dir = "{$this->rootDirectory}/page/$letter";
			if ( is_dir( $dir ) ) {
				$this->importPageDirectory( $dir );
			}
		}
		echo <<<XML
</mediawiki>

XML;
	}

	/**
	 * Import every *.db file found in a directory, descending into
	 * subdirectories (which hold subpages).
	 *
	 * @param String $dir Directory to scan
	 * @param String $prefix Title prefix for the pages in $dir, either
	 *  empty or ending in a slash
	 */
	private function importPageDirectory( $dir, $prefix = "" ) {
		echo "\n<!-- Checking page directory " . $this->xmlCommentSafe( $dir ) . " -->\n";
		$mydir = opendir( $dir );
		if ( $mydir === false ) {
			echo "<!-- Could not open directory '" . $this->xmlCommentSafe( $dir ) . "'. Skipping. -->\n";
			return;
		}
		# Compare against false explicitly: an entry named "0" is falsy
		# and would otherwise end the loop early.
		while ( ( $entry = readdir( $mydir ) ) !== false ) {
			$m = array();
			if ( preg_match( '/^(.+)\.db$/', $entry, $m ) ) {
				echo $this->importPage( $prefix . $m[1] );
			} elseif ( is_dir( "$dir/$entry" ) ) {
				if ( $entry != '.' && $entry != '..' ) {
					# Keep the outer prefix so nested directories map to
					# the same relative path that fetchPage() will open.
					$this->importPageDirectory( "$dir/$entry", "$prefix$entry/" );
				}
			} else {
				echo "<!-- File '" . $this->xmlCommentSafe( $entry ) . "' doesn't seem to contain an article. Skipping. -->\n";
			}
		}
		closedir( $mydir );
	}

	/**
	 * Path of a page relative to the page/ or keep/ directory, without
	 * extension: pages are bucketed by the upper-cased first letter of
	 * their title, anything not starting with A-Z goes to "other".
	 *
	 * @param String $title
	 * @return String
	 */
	private function useModFilename( $title ) {
		$c = substr( $title, 0, 1 );
		if ( preg_match( '/[A-Z]/i', $c ) ) {
			return strtoupper( $c ) . "/$title";
		}
		return "other/$title";
	}

	/**
	 * Load the current revision of a page. Terminates the script if the
	 * page file does not exist.
	 *
	 * @param String $title
	 * @return Object with text, summary, minor, ts, username and host
	 */
	private function fetchPage( $title ) {
		$fname = $this->rootDirectory . "/page/" . $this->useModFilename( $title ) . ".db";
		if ( !file_exists( $fname ) ) {
			echo "Couldn't open file '$fname' for page '$title'.\n";
			die( -1 );
		}

		$page = $this->splitHash( $this->FS1, file_get_contents( $fname ) );
		$section = $this->splitHash( $this->FS2, $this->field( $page, "text_default" ) );
		return $this->revisionFromSection( $section );
	}

	/**
	 * Load the old revisions of a page from its keep file, if there is one.
	 * Revisions without text, minor flag or a positive timestamp are
	 * skipped with an XML comment.
	 *
	 * @param String $title
	 * @return Array of revision objects, see fetchPage()
	 */
	private function fetchKeptPages( $title ) {
		$fname = $this->rootDirectory . "/keep/" . $this->useModFilename( $title ) . ".kp";
		if ( !file_exists( $fname ) ) {
			return array();
		}

		$keptlist = explode( $this->FS1, file_get_contents( $fname ) );
		array_shift( $keptlist ); # Drop the junk at beginning of file

		$revisions = array();
		foreach ( $keptlist as $rev ) {
			$revision = $this->revisionFromSection( $this->splitHash( $this->FS2, $rev ) );
			# The (int) cast treats a non-numeric timestamp as 0 instead of
			# raising an arithmetic error.
			if ( $revision->text && $revision->minor != "" && ( (int)$revision->ts > 0 ) ) {
				$revisions[] = $revision;
			} else {
				echo "<!-- skipped a bad old revision -->\n";
			}
		}
		return $revisions;
	}

	/**
	 * Build a revision object from a decoded section hash. Missing fields
	 * become empty strings rather than triggering undefined-index
	 * warnings, which would otherwise end up inside the XML output.
	 *
	 * @param Array $section Result of splitHash() on a section
	 * @return Object with text, summary, minor, ts, username and host
	 */
	private function revisionFromSection( $section ) {
		$text = $this->splitHash( $this->FS3, $this->field( $section, "data" ) );
		return $this->array2object( array(
			"text"     => $this->field( $text, "text" ),
			"summary"  => $this->field( $text, "summary" ),
			"minor"    => $this->field( $text, "minor" ),
			"ts"       => $this->field( $section, "ts" ),
			"username" => $this->field( $section, "username" ),
			"host"     => $this->field( $section, "host" ),
		) );
	}

	/**
	 * Read a key from a hash, falling back to an empty string.
	 *
	 * @param Array $hash
	 * @param String $key
	 * @return String
	 */
	private function field( $hash, $key ) {
		return isset( $hash[$key] ) ? $hash[$key] : '';
	}

	/**
	 * Decode a "key SEP value SEP key SEP value ..." string into an
	 * associative array. A trailing key without a value is dropped.
	 *
	 * @param String $sep Separator
	 * @param String $str Encoded hash
	 * @return Array
	 */
	private function splitHash( $sep, $str ) {
		$parts = explode( $sep, $str );
		$count = count( $parts );
		$ret = array();
		for ( $i = 0; $i + 1 < $count; $i += 2 ) {
			$ret[$parts[$i]] = $parts[$i + 1];
		}
		return $ret;
	}

	/**
	 * Resolve the contributor of a revision.
	 *
	 * @param String $name UseMod user name, may be empty
	 * @param String $host Host of the editor, used when there is no name
	 * @return Array ( user id, display name )
	 */
	private function checkUserCache( $name, $host ) {
		if ( $name ) {
			# The cache is keyed by name, so test the key; in_array()
			# would search the values instead.
			if ( isset( $this->usercache[$name] ) ) {
				$userid = $this->usercache[$name];
			} else {
				# If we haven't imported user accounts
				$userid = 0;
			}
			$username = str_replace( '_', ' ', $name );
		} else {
			$userid = 0;
			$username = $host;
		}
		return array( $userid, $username );
	}

	/**
	 * Build the <page> element for one page, including its kept history.
	 *
	 * @param String $title Title as used on disk (underscores, source encoding)
	 * @return String XML fragment
	 */
	private function importPage( $title ) {
		echo "\n<!-- Importing page " . $this->xmlCommentSafe( $title ) . " -->\n";
		$page = $this->fetchPage( $title );

		$newtitle = $this->xmlsafe( str_replace( '_', ' ', $this->recodeText( $title ) ) );

		$munged = $this->mungeFormat( $page->text );
		if ( $munged != $page->text ) {
			/**
			 * Save a *new* revision with the conversion, and put the
			 * previous last version into the history.
			 */
			$next = $this->array2object( array(
				'text'     => $munged,
				'minor'    => 1,
				'username' => 'Conversion script',
				'host'     => '127.0.0.1',
				'ts'       => time(),
				'summary'  => 'link fix',
				) );
			$revisions = array( $page, $next );
		} else {
			/**
			 * Current revision:
			 */
			$revisions = array( $page );
		}
		$xml = <<<XML
		<page>
			<title>$newtitle</title>

XML;

		# History
		$revisions = array_merge( $revisions, $this->fetchKeptPages( $title ) );
		if ( count( $revisions ) == 0 ) {
			return NULL; // Was "$sql", which does not appear to be defined.
		}

		foreach ( $revisions as $rev ) {
			$text      = $this->xmlsafe( $this->recodeText( $rev->text ) );
			$minor     = ( $rev->minor ? '<minor/>' : '' );
			list( /* $userid */ , $username ) = $this->checkUserCache( $rev->username, $rev->host );
			$username  = $this->xmlsafe( $this->recodeText( $username ) );
			$timestamp = $this->xmlsafe( $this->timestamp2ISO8601( $rev->ts ) );
			$comment   = $this->xmlsafe( $this->recodeText( $rev->summary ) );

			$xml .= <<<XML
			<revision>
				<timestamp>$timestamp</timestamp>
				<contributor><username>$username</username></contributor>
				$minor
				<comment>$comment</comment>
				<text>$text</text>
			</revision>

XML;
		}
		$xml .= "</page>\n\n";
		return $xml;
	}

	/**
	 * Normalize line endings, convert from the source encoding to UTF-8
	 * and expand numeric character references.
	 *
	 * @param String $string
	 * @return String
	 */
	private function recodeText( $string ) {
		# For currently latin-1 wikis
		$string = str_replace( "\r\n", "\n", $string );
		$converted = @iconv( $this->encoding, "UTF-8", $string );
		if ( $converted !== false ) {
			$string = $converted;
		}
		# else: keep the unconverted bytes instead of silently replacing
		# the whole text with an empty string; callers pass the result
		# through xmlsafe().
		$string = $this->mungeToUtf8( $string ); # Any old &#1234; stuff
		return $string;
	}

	/**
	 * Replace decimal (&#1234;) and hexadecimal (&#x4d2;) character
	 * references with the UTF-8 encoding of the code point.
	 *
	 * @param String $string
	 * @return String
	 */
	private function mungeToUtf8( $string ) {
		$string = preg_replace_callback( '/&#([0-9]+);/',
			array( $this, 'decimalEntityToUtf8' ), $string );
		$string = preg_replace_callback( '/&#x([0-9a-f]+);/i',
			array( $this, 'hexEntityToUtf8' ), $string );
		# Should also do named entities here
		return $string;
	}

	/**
	 * preg_replace_callback() handler for decimal character references.
	 *
	 * @param Array $matches
	 * @return String
	 */
	public function decimalEntityToUtf8( $matches ) {
		return $this->entityToUtf8( $matches[0], $matches[1], 10 );
	}

	/**
	 * preg_replace_callback() handler for hexadecimal character references.
	 *
	 * @param Array $matches
	 * @return String
	 */
	public function hexEntityToUtf8( $matches ) {
		return $this->entityToUtf8( $matches[0], $matches[1], 16 );
	}

	/**
	 * Convert the digits of a character reference to UTF-8.
	 *
	 * @param String $entity The complete reference, returned unchanged
	 *  when the number is not a Unicode code point
	 * @param String $digits The digits of the reference
	 * @param int $base 10 or 16
	 * @return String
	 */
	private function entityToUtf8( $entity, $digits, $base ) {
		# Leading zeros carry no meaning; strip them so the length check
		# below only sees significant digits.
		$digits = ltrim( $digits, '0' );
		# U+10FFFF is 6 hex / 7 decimal digits; anything longer cannot be
		# valid and might overflow the integer conversion.
		if ( strlen( $digits ) > 7 ) {
			return $entity;
		}
		$codepoint = intval( $digits, $base );
		if ( $codepoint > 0x10FFFF ) {
			return $entity;
		}
		return wfUtf8Sequence( $codepoint );
	}

	/**
	 * Format a Unix timestamp as an ISO 8601 UTC time.
	 *
	 * @param int|String $ts Seconds since the epoch; a non-numeric
	 *  value is treated as 0
	 * @return String
	 */
	private function timestamp2ISO8601( $ts ) {
		# 2003-08-05T18:30:02Z
		return gmdate( 'Y-m-d\TH:i:s\Z', (int)$ts );
	}

	/**
	 * The page may contain old data which has not been properly normalized.
	 * Invalid UTF-8 sequences or forbidden control characters will make our
	 * XML output invalid, so be sure to strip them out.
	 * @param String $string Text to clean up
	 * @return String
	 */
	private function xmlsafe( $string ) {
		$string = UtfNormal::cleanUp( $string );
		$string = htmlspecialchars( $string );
		return $string;
	}

	/**
	 * Make a string safe for use inside an XML comment: recode and escape
	 * it, then break up any "--" sequence.
	 *
	 * @param String $text
	 * @return String
	 */
	private function xmlCommentSafe( $text ) {
		return str_replace( '--', '\\-\\-', $this->xmlsafe( $this->recodeText( $text ) ) );
	}

	/**
	 * Turn an associative array into an object with one property per key.
	 *
	 * @param Array $arr
	 * @return Object
	 */
	private function array2object( $arr ) {
		$o = new stdClass;
		foreach ( $arr as $x => $y ) {
			$o->$x = $y;
		}
		return $o;
	}

	/**
	 * Make CamelCase and /Talk links work
	 *
	 * Spans that must not be touched (<nowiki> sections, URLs, existing
	 * [[links]]) are swapped for a placeholder first and put back, in
	 * order, once the CamelCase words have been wrapped in [[ ]].
	 *
	 * @param String $text
	 * @return String
	 */
	private function mungeFormat( $text ) {
		$this->nowiki = array();
		$staged = preg_replace_callback(
			'/(<nowiki>.*?<\\/nowiki>|(?:http|https|ftp):\\S+|\[\[[^]\\n]+]])/s',
			array( $this, 'nowikiPlaceholder' ), $text );

		# This is probably not  100% correct, I'm just
		# glancing at the UseModWiki code.
		$upper   = "[A-Z]";
		$lower   = "[a-z_0-9]";
		$any     = "[A-Za-z_0-9]";
		$camel   = "(?:$upper+$lower+$upper+$any*)";
		$subpage = "(?:\\/$any+)";
		$substart = "(?:\\/$upper$any*)";

		$munged = preg_replace( "/(?!\\[\\[)($camel$subpage*|$substart$subpage*)\\b(?!\\]\\]|>)/",
			'[[$1]]', $staged );

		# Needs the callback variant: each placeholder is replaced by the
		# next queued span, which plain preg_replace() cannot do.
		$final = preg_replace_callback( '/' . preg_quote( $this->placeholder(), '/' ) . '/s',
			array( $this, 'nowikiShift' ), $munged );
		return $final;
	}

	/**
	 * Marker that stands in for a protected span. Note that the string is
	 * single-quoted, so it contains the literal characters \xff rather
	 * than the byte 0xFF.
	 *
	 * @param mixed $x Unused
	 * @return String
	 */
	private function placeholder( $x = null ) {
		return '\xffplaceholder\xff';
	}

	/**
	 * preg_replace_callback() handler: queue the matched span and return
	 * the placeholder.
	 *
	 * @param Array $matches
	 * @return String
	 */
	public function nowikiPlaceholder( $matches ) {
		$this->nowiki[] = $matches[1];
		return $this->placeholder();
	}

	/**
	 * preg_replace_callback() handler: return the oldest queued span.
	 *
	 * @return String
	 */
	public function nowikiShift() {
		return array_shift( $this->nowiki );
	}
}

/**
 * Encode a Unicode code point as a UTF-8 byte sequence.
 *
 * Surrogate code points (U+D800..U+DFFF) are not filtered here and are
 * encoded like any other three-byte value.
 *
 * @param int $codepoint Code point, 0 .. 0x10FFFF
 * @return String The UTF-8 bytes, or the decimal character reference
 *  "&#N;" when $codepoint is outside the Unicode range
 */
function wfUtf8Sequence( $codepoint ) {
	if ( $codepoint < 0 ) {
		# Not a code point at all; hand it back as a reference
		return "&#$codepoint;";
	}
	if ( $codepoint < 0x80 ) {
		return chr( $codepoint );
	}
	if ( $codepoint < 0x800 ) {
		return	chr( $codepoint >>  6 & 0x3f | 0xc0 ) .
				chr( $codepoint       & 0x3f | 0x80 );
	}
	if ( $codepoint <  0x10000 ) {
		return	chr( $codepoint >> 12 & 0x0f | 0xe0 ) .
				chr( $codepoint >>  6 & 0x3f | 0x80 ) .
				chr( $codepoint       & 0x3f | 0x80 );
	}
	# Four-byte sequences cover everything up to and including U+10FFFF
	if ( $codepoint < 0x110000 ) {
		return	chr( $codepoint >> 18 & 0x07 | 0xf0 ) .
				chr( $codepoint >> 12 & 0x3f | 0x80 ) .
				chr( $codepoint >>  6 & 0x3f | 0x80 ) .
				chr( $codepoint       & 0x3f | 0x80 );
	}
	# Beyond the last Unicode code point
	return "&#$codepoint;";
}

# Name of the maintenance class defined in this file, followed by an include
# of whatever file the RUN_MAINTENANCE_IF_MAIN constant points to.
# NOTE(review): presumably that file instantiates $maintClass and calls
# execute() when this script is the entry point -- confirm in Maintenance.php.
$maintClass = 'ImportUseModWiki';
require_once( RUN_MAINTENANCE_IF_MAIN );